analysis.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567
  1. """
  2. 分析相关API路由
  3. """
  4. from fastapi import APIRouter, HTTPException, UploadFile, File
  5. from typing import Dict, Any, Optional, List
  6. from pydantic import BaseModel
  7. import logging
  8. import arxiv
  9. import os
  10. from core.config import get_config
  11. from core.llm_adapter import get_llm_adapter
  12. from utils.pdf_parser import pdf_parser
  13. logger = logging.getLogger(__name__)
  14. router = APIRouter()
  15. # 初始化 LLM 适配器(基于 HelloAgent)
  16. config = get_config()
  17. try:
  18. llm = get_llm_adapter() if config.llm.api_key else None
  19. except Exception as e:
  20. logger.warning(f"LLM 初始化失败: {str(e)}")
  21. llm = None
  22. # Pydantic模型
  23. class AnalysisRequest(BaseModel):
  24. paper_id: str
  25. user_id: Optional[str] = None
  26. analysis_type: str = "full" # full, quick, innovation_only
  27. class ComparisonRequest(BaseModel):
  28. paper_ids: List[str]
  29. user_id: Optional[str] = None
  30. comparison_aspects: List[str] = ["method", "results", "innovation"]
  31. class InnovationSearchRequest(BaseModel):
  32. query: str
  33. user_id: Optional[str] = None
  34. search_scope: str = "both" # l1, l2, both
  35. top_k: int = 10
  36. class PaperAnalysisRequest(BaseModel):
  37. paper_url: str
  38. analysis_type: str = "summary" # summary, innovation, comparison, comprehensive
  39. @router.post("/analyze", response_model=Dict[str, Any])
  40. async def analyze_paper(request: PaperAnalysisRequest):
  41. """分析论文 - 支持 ArXiv URL 和本地 PDF 文件"""
  42. try:
  43. if not llm:
  44. raise HTTPException(status_code=503, detail="AI 服务未配置,请设置 OPENAI_API_KEY")
  45. import re
  46. paper_url = request.paper_url.strip()
  47. # 检查是否是本地上传的 PDF 文件
  48. if paper_url.startswith('/uploads/') or paper_url.endswith('.pdf'):
  49. logger.info(f"检测到本地 PDF 文件: {paper_url}")
  50. # 构建完整的文件路径
  51. if paper_url.startswith('/uploads/'):
  52. # 假设上传的文件在 downloads 目录
  53. file_path = os.path.join('downloads', paper_url.replace('/uploads/', ''))
  54. else:
  55. file_path = paper_url
  56. # 检查文件是否存在
  57. if not os.path.exists(file_path):
  58. logger.warning(f"PDF 文件不存在: {file_path}")
  59. raise HTTPException(status_code=404, detail=f"PDF 文件不存在: {paper_url}")
  60. # 解析 PDF 文件
  61. logger.info(f"开始解析 PDF 文件: {file_path}")
  62. pdf_result = await pdf_parser.parse_pdf(file_path)
  63. if not pdf_result.get("success"):
  64. raise HTTPException(status_code=500, detail=pdf_result.get("error", "PDF 解析失败"))
  65. # 使用解析出的内容进行 AI 分析
  66. title = pdf_result.get("title", "未知标题")
  67. authors = pdf_result.get("authors", ["未知作者"])
  68. abstract = pdf_result.get("abstract", "")
  69. full_text = pdf_result.get("full_text", "")
  70. # 限制文本长度以避免超出 token 限制
  71. text_for_analysis = full_text[:8000] if len(full_text) > 8000 else full_text
  72. # 根据分析类型生成提示词
  73. prompts = {
  74. "summary": f"""请对以下论文进行摘要分析:
  75. 标题:{title}
  76. 作者:{', '.join(authors)}
  77. 摘要:{abstract}
  78. 论文内容(前8000字符):
  79. {text_for_analysis}
  80. 请提供:
  81. 1. 研究背景和动机
  82. 2. 主要方法
  83. 3. 核心贡献
  84. 4. 实验结果
  85. 5. 研究意义
  86. 请用中文回答,保持专业和简洁。""",
  87. "innovation": f"""请分析以下论文的创新点:
  88. 标题:{title}
  89. 摘要:{abstract}
  90. 论文内容:
  91. {text_for_analysis}
  92. 请详细分析:
  93. 1. 技术创新点
  94. 2. 方法论创新
  95. 3. 理论贡献
  96. 4. 与现有工作的区别
  97. 5. 潜在应用价值
  98. 请用中文回答。""",
  99. "comparison": f"""请对以下论文进行对比分析:
  100. 标题:{title}
  101. 摘要:{abstract}
  102. 论文内容:
  103. {text_for_analysis}
  104. 请分析:
  105. 1. 与传统方法的对比
  106. 2. 优势和劣势
  107. 3. 适用场景
  108. 4. 性能提升
  109. 5. 局限性
  110. 请用中文回答。""",
  111. "comprehensive": f"""请对以下论文进行全面综合分析:
  112. 标题:{title}
  113. 作者:{', '.join(authors)}
  114. 摘要:{abstract}
  115. 论文内容:
  116. {text_for_analysis}
  117. 请提供全面的分析,包括:
  118. 1. 研究背景和意义
  119. 2. 技术方法详解
  120. 3. 创新点分析
  121. 4. 实验验证
  122. 5. 优缺点评价
  123. 6. 未来研究方向
  124. 7. 实际应用价值
  125. 请用中文回答,保持专业和深度。"""
  126. }
  127. prompt = prompts.get(request.analysis_type, prompts["summary"])
  128. # 调用 LLM 进行分析
  129. logger.info(f"开始 AI 分析,类型: {request.analysis_type}")
  130. response = await llm.ainvoke(prompt)
  131. analysis_content = response.content if hasattr(response, 'content') else str(response)
  132. return {
  133. "success": True,
  134. "paper_info": {
  135. "id": "local_pdf",
  136. "title": title,
  137. "authors": authors,
  138. "published_date": "N/A",
  139. "url": paper_url,
  140. "categories": ["本地文件"],
  141. "page_count": pdf_result.get("page_count", 0),
  142. "word_count": pdf_result.get("word_count", 0)
  143. },
  144. "analysis_type": request.analysis_type,
  145. "analysis": analysis_content,
  146. "abstract": abstract
  147. }
  148. # ArXiv 论文处理
  149. arxiv_patterns = [
  150. r'arxiv\.org/abs/(\d+\.\d+)',
  151. r'arxiv\.org/pdf/(\d+\.\d+)',
  152. r'arXiv:(\d+\.\d+)',
  153. r'\[(\d+\.\d+)v?\d*\]',
  154. r'^(\d{4}\.\d{4,5})v?\d*$'
  155. ]
  156. paper_id = None
  157. for pattern in arxiv_patterns:
  158. match = re.search(pattern, paper_url, re.IGNORECASE)
  159. if match:
  160. paper_id = match.group(1)
  161. break
  162. if not paper_id:
  163. raise HTTPException(
  164. status_code=400,
  165. detail=f"无效的输入。支持的格式:\n" +
  166. "- ArXiv URL: https://arxiv.org/abs/2511.16672\n" +
  167. "- ArXiv ID: 2511.16672\n" +
  168. "- 本地 PDF: 上传后自动填充"
  169. )
  170. logger.info(f"正在分析 ArXiv 论文: {paper_id}")
  171. # 获取论文信息
  172. search = arxiv.Search(id_list=[paper_id])
  173. paper = next(search.results(), None)
  174. if not paper:
  175. raise HTTPException(status_code=404, detail=f"未找到 ArXiv 论文: {paper_id}")
  176. # 根据分析类型生成提示词
  177. prompts = {
  178. "summary": f"""请对以下论文进行摘要分析:
  179. 标题:{paper.title}
  180. 作者:{', '.join([a.name for a in paper.authors])}
  181. 摘要:{paper.summary}
  182. 请提供:
  183. 1. 研究背景和动机
  184. 2. 主要方法
  185. 3. 核心贡献
  186. 4. 实验结果
  187. 5. 研究意义
  188. 请用中文回答,保持专业和简洁。""",
  189. "innovation": f"""请分析以下论文的创新点:
  190. 标题:{paper.title}
  191. 摘要:{paper.summary}
  192. 请详细分析:
  193. 1. 技术创新点
  194. 2. 方法论创新
  195. 3. 理论贡献
  196. 4. 与现有工作的区别
  197. 5. 潜在应用价值
  198. 请用中文回答。""",
  199. "comparison": f"""请对以下论文进行对比分析:
  200. 标题:{paper.title}
  201. 摘要:{paper.summary}
  202. 请分析:
  203. 1. 与传统方法的对比
  204. 2. 优势和劣势
  205. 3. 适用场景
  206. 4. 性能提升
  207. 5. 局限性
  208. 请用中文回答。""",
  209. "comprehensive": f"""请对以下论文进行全面综合分析:
  210. 标题:{paper.title}
  211. 作者:{', '.join([a.name for a in paper.authors])}
  212. 摘要:{paper.summary}
  213. 分类:{', '.join(paper.categories)}
  214. 请提供全面的分析,包括:
  215. 1. 研究背景和意义
  216. 2. 技术方法详解
  217. 3. 创新点分析
  218. 4. 实验验证
  219. 5. 优缺点评价
  220. 6. 未来研究方向
  221. 7. 实际应用价值
  222. 请用中文回答,保持专业和深度。"""
  223. }
  224. prompt = prompts.get(request.analysis_type, prompts["summary"])
  225. # 调用 LLM 进行分析
  226. response = await llm.ainvoke(prompt)
  227. analysis_content = response.content if hasattr(response, 'content') else str(response)
  228. return {
  229. "success": True,
  230. "paper_info": {
  231. "id": paper_id,
  232. "title": paper.title,
  233. "authors": [a.name for a in paper.authors],
  234. "published_date": paper.published.strftime("%Y-%m-%d"),
  235. "url": paper.entry_id,
  236. "categories": paper.categories
  237. },
  238. "analysis_type": request.analysis_type,
  239. "analysis": analysis_content,
  240. "abstract": paper.summary
  241. }
  242. except HTTPException:
  243. raise
  244. except Exception as e:
  245. logger.error(f"论文分析失败: {str(e)}")
  246. raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}")
  247. @router.post("/compare", response_model=Dict[str, Any])
  248. async def compare_papers(request: ComparisonRequest):
  249. """对比多篇论文"""
  250. try:
  251. # 这里需要实现论文对比逻辑
  252. # 暂时返回模拟结果
  253. comparison_result = {
  254. "paper_ids": request.paper_ids,
  255. "comparison_aspects": request.comparison_aspects,
  256. "similarities": ["相似点1", "相似点2"],
  257. "differences": ["差异点1", "差异点2"],
  258. "innovation_gaps": ["创新空白1", "创新空白2"],
  259. "recommendations": ["建议1", "建议2"]
  260. }
  261. return {
  262. "success": True,
  263. "result": comparison_result
  264. }
  265. except Exception as e:
  266. logger.error(f"论文对比失败: {str(e)}")
  267. raise HTTPException(status_code=500, detail=str(e))
  268. @router.post("/innovation/search", response_model=Dict[str, Any])
  269. async def search_innovation_opportunities(request: InnovationSearchRequest):
  270. """搜索创新机会"""
  271. try:
  272. # 这里需要实现创新机会搜索逻辑
  273. # 暂时返回模拟结果
  274. innovation_results = {
  275. "query": request.query,
  276. "opportunities": [
  277. {
  278. "title": "创新机会1",
  279. "description": "基于当前研究的创新方向",
  280. "related_papers": ["paper1", "paper2"],
  281. "confidence": 0.85
  282. },
  283. {
  284. "title": "创新机会2",
  285. "description": "另一个潜在的研究方向",
  286. "related_papers": ["paper3", "paper4"],
  287. "confidence": 0.72
  288. }
  289. ],
  290. "research_gaps": ["研究空白1", "研究空白2"],
  291. "future_directions": ["未来方向1", "未来方向2"]
  292. }
  293. return {
  294. "success": True,
  295. "result": innovation_results
  296. }
  297. except Exception as e:
  298. logger.error(f"创新机会搜索失败: {str(e)}")
  299. raise HTTPException(status_code=500, detail=str(e))
  300. @router.get("/paper/{paper_id}/summary")
  301. async def get_paper_summary(paper_id: str, user_id: Optional[str] = None):
  302. """获取论文摘要"""
  303. try:
  304. # 这里需要实现论文摘要生成逻辑
  305. # 暂时返回模拟结果
  306. summary = {
  307. "paper_id": paper_id,
  308. "summary": "这是一篇关于...的论文,主要贡献包括...",
  309. "key_contributions": ["贡献1", "贡献2", "贡献3"],
  310. "methodology": "论文采用的方法是...",
  311. "results": "实验结果表明...",
  312. "limitations": "研究的局限性包括...",
  313. "future_work": "未来工作方向..."
  314. }
  315. return {
  316. "success": True,
  317. "summary": summary
  318. }
  319. except Exception as e:
  320. logger.error(f"获取论文摘要失败: {str(e)}")
  321. raise HTTPException(status_code=500, detail=str(e))
  322. @router.get("/paper/{paper_id}/innovations")
  323. async def get_paper_innovations(paper_id: str, user_id: Optional[str] = None):
  324. """获取论文创新点"""
  325. try:
  326. # 这里需要实现创新点提取逻辑
  327. # 暂时返回模拟结果
  328. innovations = {
  329. "paper_id": paper_id,
  330. "innovations": [
  331. {
  332. "aspect": "方法创新",
  333. "description": "提出了新的方法...",
  334. "novelty": "high",
  335. "impact": "significant"
  336. },
  337. {
  338. "aspect": "理论创新",
  339. "description": "在理论上有所突破...",
  340. "novelty": "medium",
  341. "impact": "moderate"
  342. }
  343. ],
  344. "comparison_with_prior_work": "与之前的工作相比...",
  345. "potential_applications": ["应用1", "应用2"]
  346. }
  347. return {
  348. "success": True,
  349. "innovations": innovations
  350. }
  351. except Exception as e:
  352. logger.error(f"获取论文创新点失败: {str(e)}")
  353. raise HTTPException(status_code=500, detail=str(e))
  354. @router.get("/user/{user_id}/insights")
  355. async def get_user_insights(user_id: str):
  356. """获取用户研究洞察"""
  357. try:
  358. # 这里需要实现用户研究洞察分析
  359. # 暂时返回模拟结果
  360. insights = {
  361. "user_id": user_id,
  362. "research_interests": ["兴趣1", "兴趣2"],
  363. "reading_patterns": {
  364. "papers_read": 50,
  365. "favorite_topics": ["主题1", "主题2"],
  366. "reading_frequency": "daily"
  367. },
  368. "knowledge_gaps": ["知识空白1", "知识空白2"],
  369. "research_suggestions": [
  370. {
  371. "topic": "建议研究方向1",
  372. "reason": "基于您的阅读历史...",
  373. "related_papers": ["paper1", "paper2"]
  374. }
  375. ],
  376. "skill_assessment": {
  377. "technical_skills": ["技能1", "技能2"],
  378. "writing_skills": ["写作技能1", "写作技能2"],
  379. "improvement_areas": ["改进领域1", "改进领域2"]
  380. }
  381. }
  382. return {
  383. "success": True,
  384. "insights": insights
  385. }
  386. except Exception as e:
  387. logger.error(f"获取用户研究洞察失败: {str(e)}")
  388. raise HTTPException(status_code=500, detail=str(e))
  389. @router.post("/batch", response_model=Dict[str, Any])
  390. async def batch_analyze_papers(paper_ids: List[str], user_id: Optional[str] = None):
  391. """批量分析论文"""
  392. try:
  393. results = []
  394. for paper_id in paper_ids:
  395. try:
  396. # 提交论文分析任务
  397. task_id = await agent_controller.submit_task(
  398. TaskType.PAPER_ANALYSIS,
  399. {
  400. "paper_id": paper_id,
  401. "user_id": user_id,
  402. "analysis_type": "quick" # 批量分析使用快速模式
  403. }
  404. )
  405. # 执行任务
  406. result = await agent_controller.execute_task(task_id)
  407. results.append({
  408. "paper_id": paper_id,
  409. "task_id": task_id,
  410. "success": True,
  411. "result": result
  412. })
  413. except Exception as e:
  414. results.append({
  415. "paper_id": paper_id,
  416. "success": False,
  417. "error": str(e)
  418. })
  419. return {
  420. "success": True,
  421. "total_papers": len(paper_ids),
  422. "successful_analyses": sum(1 for r in results if r["success"]),
  423. "results": results
  424. }
  425. except Exception as e:
  426. logger.error(f"批量分析论文失败: {str(e)}")
  427. raise HTTPException(status_code=500, detail=str(e))
  428. @router.post("/upload-pdf", response_model=Dict[str, Any])
  429. async def upload_pdf_for_analysis(file: UploadFile = File(...)):
  430. """
  431. 上传 PDF 文件并解析
  432. 返回文件信息和解析结果
  433. """
  434. try:
  435. # 检查文件类型
  436. if not file.filename.endswith('.pdf'):
  437. raise HTTPException(status_code=400, detail="只支持 PDF 文件")
  438. # 读取文件内容
  439. logger.info(f"接收到 PDF 文件: {file.filename}")
  440. pdf_bytes = await file.read()
  441. # 解析 PDF
  442. pdf_result = await pdf_parser.parse_pdf_from_bytes(pdf_bytes, file.filename)
  443. if not pdf_result.get("success"):
  444. raise HTTPException(status_code=500, detail=pdf_result.get("error", "PDF 解析失败"))
  445. # 保存文件到 downloads 目录
  446. os.makedirs("downloads", exist_ok=True)
  447. file_path = os.path.join("downloads", file.filename)
  448. with open(file_path, "wb") as f:
  449. f.write(pdf_bytes)
  450. logger.info(f"PDF 文件已保存: {file_path}")
  451. return {
  452. "success": True,
  453. "filename": file.filename,
  454. "file_path": f"/uploads/{file.filename}",
  455. "title": pdf_result.get("title", "未知标题"),
  456. "authors": pdf_result.get("authors", ["未知作者"]),
  457. "abstract": pdf_result.get("abstract", "")[:500], # 限制摘要长度
  458. "page_count": pdf_result.get("page_count", 0),
  459. "word_count": pdf_result.get("word_count", 0),
  460. "message": "PDF 文件上传并解析成功,可以使用返回的 file_path 进行分析"
  461. }
  462. except HTTPException:
  463. raise
  464. except Exception as e:
  465. logger.error(f"PDF 上传失败: {str(e)}")
  466. raise HTTPException(status_code=500, detail=f"上传失败: {str(e)}")