citations.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330
"""
Citation validation API routes.
"""
  4. from fastapi import APIRouter, HTTPException
  5. from typing import Dict, Any, Optional
  6. from pydantic import BaseModel
  7. import logging
  8. import httpx
  9. import re
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Router on which the endpoints defined in this file are registered.
router = APIRouter()
# Pydantic models
class CitationValidationRequest(BaseModel):
    """Request body accepted by the ``/validate`` route."""

    # Raw citation text; the handler searches it for an arXiv ID or a DOI.
    citation: str
    # Requested output style. Values the handler does not recognise fall
    # back to BibTeX.
    format: str = "bibtex"  # bibtex, apa, ieee, mla
class CitationGenerateRequest(BaseModel):
    """Request body accepted by the ``/generate`` route.

    Every bibliographic field is optional; the handler substitutes
    placeholder text for a missing title, authors, year or journal.
    """

    # DOI of the work; echoed back in the response metadata.
    doi: Optional[str] = None
    title: Optional[str] = None
    # Authors as one pre-formatted string (not a list).
    authors: Optional[str] = None
    year: Optional[int] = None
    journal: Optional[str] = None
    # Requested output style: bibtex, apa, ieee, mla.
    format: str = "bibtex"
  23. @router.post("/validate", response_model=Dict[str, Any])
  24. async def validate_citation(request: CitationValidationRequest):
  25. """校验引用格式 - 支持 ArXiv、DOI 和 AI 辅助验证"""
  26. try:
  27. logger.info(f"校验引用: {request.citation[:100]}...")
  28. metadata = None
  29. verified = False
  30. doi = None
  31. # 1. 尝试识别 ArXiv URL 或 ID
  32. arxiv_pattern = r'(?:arxiv\.org/abs/|arXiv:)(\d+\.\d+)'
  33. arxiv_match = re.search(arxiv_pattern, request.citation, re.IGNORECASE)
  34. if arxiv_match:
  35. arxiv_id = arxiv_match.group(1)
  36. logger.info(f"找到 ArXiv ID: {arxiv_id}")
  37. try:
  38. import arxiv
  39. search = arxiv.Search(id_list=[arxiv_id])
  40. paper = next(search.results(), None)
  41. if paper:
  42. metadata = {
  43. 'title': paper.title,
  44. 'authors': [author.name for author in paper.authors],
  45. 'year': paper.published.year,
  46. 'journal': 'arXiv preprint',
  47. 'arxiv_id': arxiv_id,
  48. 'url': paper.entry_id
  49. }
  50. verified = True
  51. logger.info(f"ArXiv 论文信息获取成功: {metadata['title'][:50]}...")
  52. logger.info(f"作者数量: {len(metadata['authors'])}")
  53. else:
  54. logger.warning(f"未找到 ArXiv ID: {arxiv_id}")
  55. except Exception as e:
  56. logger.error(f"ArXiv 查询失败: {str(e)}", exc_info=True)
  57. # 2. 尝试从引用中提取 DOI
  58. if not verified:
  59. doi_pattern = r'10\.\d{4,9}/[-._;()/:A-Z0-9]+'
  60. doi_match = re.search(doi_pattern, request.citation, re.IGNORECASE)
  61. if doi_match:
  62. doi = doi_match.group(0)
  63. logger.info(f"找到 DOI: {doi}")
  64. # 使用 Crossref API 验证 DOI
  65. async with httpx.AsyncClient() as client:
  66. try:
  67. response = await client.get(
  68. f"https://api.crossref.org/works/{doi}",
  69. timeout=10.0
  70. )
  71. if response.status_code == 200:
  72. data = response.json()
  73. msg = data.get('message', {})
  74. metadata = {
  75. 'title': msg.get('title', [''])[0],
  76. 'authors': [f"{a.get('given', '')} {a.get('family', '')}" for a in msg.get('author', [])],
  77. 'year': msg.get('published', {}).get('date-parts', [[None]])[0][0],
  78. 'journal': msg.get('container-title', [''])[0],
  79. 'volume': msg.get('volume', ''),
  80. 'issue': msg.get('issue', ''),
  81. 'pages': msg.get('page', ''),
  82. 'doi': doi
  83. }
  84. verified = True
  85. logger.info("DOI 验证成功")
  86. except Exception as e:
  87. logger.warning(f"DOI 验证失败: {str(e)}")
  88. # 3. 如果仍未验证,尝试使用 AI 解析引用信息
  89. if not verified:
  90. logger.info("尝试使用 AI 解析引用信息...")
  91. try:
  92. from core.config import get_config
  93. from core.llm_adapter import get_llm_adapter
  94. config = get_config()
  95. if config.llm.api_key:
  96. llm = get_llm_adapter()
  97. prompt = f"""请从以下引用信息中提取关键元数据,并以 JSON 格式返回。
  98. 引用信息:
  99. {request.citation}
  100. 请提取以下信息(如果有的话):
  101. - title: 论文标题
  102. - authors: 作者列表(字符串数组,例如 ["Zhang San", "Li Si"])
  103. - year: 发表年份(数字)
  104. - journal: 期刊或会议名称
  105. - volume: 卷号
  106. - issue: 期号
  107. - pages: 页码
  108. - doi: DOI(如果有)
  109. - arxiv_id: ArXiv ID(如果有)
  110. 只返回纯 JSON 格式,不要任何其他文字说明。如果某个字段不存在,请省略该字段。
  111. 示例输出:
  112. {{"title": "论文标题", "authors": ["作者1", "作者2"], "year": 2024, "journal": "期刊名"}}"""
  113. response = await llm.ainvoke(prompt)
  114. ai_result = response.content if hasattr(response, 'content') else str(response)
  115. # 尝试解析 AI 返回的 JSON
  116. import json
  117. # 提取 JSON 部分(支持代码块格式)
  118. json_match = re.search(r'```(?:json)?\s*(\{[\s\S]*?\})\s*```', ai_result)
  119. if json_match:
  120. metadata = json.loads(json_match.group(1))
  121. verified = True
  122. logger.info("AI 解析成功(代码块格式)")
  123. else:
  124. json_match = re.search(r'\{[\s\S]*\}', ai_result)
  125. if json_match:
  126. metadata = json.loads(json_match.group(0))
  127. verified = True
  128. logger.info("AI 解析成功")
  129. except Exception as e:
  130. logger.warning(f"AI 解析失败: {str(e)}")
  131. # 生成标准格式的引用
  132. if metadata and verified:
  133. title = metadata.get('title', 'Unknown Title')
  134. authors = metadata.get('authors', []) if isinstance(metadata.get('authors'), list) else [metadata.get('authors', 'Unknown Author')]
  135. year = metadata.get('year', 'n.d.')
  136. journal = metadata.get('journal', 'Unknown Journal')
  137. volume = metadata.get('volume', '')
  138. issue = metadata.get('issue', '')
  139. pages = metadata.get('pages', '')
  140. doi = metadata.get('doi', doi)
  141. arxiv_id = metadata.get('arxiv_id', '')
  142. # 处理作者列表
  143. if isinstance(authors, list):
  144. if len(authors) > 3:
  145. author_str = ', '.join(authors[:3]) + ' et al.'
  146. else:
  147. author_str = ', '.join(authors)
  148. else:
  149. author_str = str(authors)
  150. # 生成不同格式的引用
  151. # BibTeX 格式
  152. bibtex_parts = [
  153. f"@article{{key{year},",
  154. f" title={{{title}}},",
  155. f" author={{{author_str}}},",
  156. f" journal={{{journal}}},",
  157. f" year={{{year}}}"
  158. ]
  159. if volume:
  160. bibtex_parts.append(f" volume={{{volume}}}")
  161. if issue:
  162. bibtex_parts.append(f" number={{{issue}}}")
  163. if pages:
  164. bibtex_parts.append(f" pages={{{pages}}}")
  165. if arxiv_id:
  166. bibtex_parts.append(f" eprint={{{arxiv_id}}}")
  167. bibtex_parts.append(f" archivePrefix={{arXiv}}")
  168. if doi:
  169. bibtex_parts.append(f" doi={{{doi}}}")
  170. bibtex_citation = ',\n'.join(bibtex_parts) + '\n}'
  171. # APA 格式
  172. vol_str = f', {volume}' if volume else ''
  173. issue_str = f'({issue})' if issue else ''
  174. pages_str = f', {pages}' if pages else ''
  175. if arxiv_id:
  176. apa_citation = f"{author_str} ({year}). {title}. *{journal}*{vol_str}{issue_str}{pages_str}. arXiv:{arxiv_id}"
  177. elif doi:
  178. apa_citation = f"{author_str} ({year}). {title}. *{journal}*{vol_str}{issue_str}{pages_str}. https://doi.org/{doi}"
  179. else:
  180. apa_citation = f"{author_str} ({year}). {title}. *{journal}*{vol_str}{issue_str}{pages_str}."
  181. # IEEE 格式
  182. vol_ieee = f', vol. {volume}' if volume else ''
  183. issue_ieee = f', no. {issue}' if issue else ''
  184. pages_ieee = f', pp. {pages}' if pages else ''
  185. if arxiv_id:
  186. ieee_citation = f'[1] {author_str}, "{title}," *{journal}*{vol_ieee}{issue_ieee}{pages_ieee}, {year}, arXiv:{arxiv_id}.'
  187. elif doi:
  188. ieee_citation = f'[1] {author_str}, "{title}," *{journal}*{vol_ieee}{issue_ieee}{pages_ieee}, {year}, doi: {doi}.'
  189. else:
  190. ieee_citation = f'[1] {author_str}, "{title}," *{journal}*{vol_ieee}{issue_ieee}{pages_ieee}, {year}.'
  191. vol_mla = f', vol. {volume}' if volume else ''
  192. issue_mla = f', no. {issue}' if issue else ''
  193. pages_mla = f', pp. {pages}' if pages else ''
  194. mla_citation = f'{author_str}. "{title}." *{journal}*{vol_mla}{issue_mla}, {year}{pages_mla}.'
  195. citations = {
  196. "bibtex": bibtex_citation,
  197. "apa": apa_citation,
  198. "ieee": ieee_citation,
  199. "mla": mla_citation
  200. }
  201. formatted_citation = citations.get(request.format, citations["bibtex"])
  202. else:
  203. # 如果无法验证,返回原始引用和警告
  204. formatted_citation = request.citation
  205. verified = False
  206. result = {
  207. "success": True,
  208. "original_citation": request.citation,
  209. "formatted_citation": formatted_citation,
  210. "format": request.format,
  211. "verified": verified,
  212. "metadata": metadata if verified else None,
  213. "warnings": [] if verified else ["无法自动验证引用,已返回原始格式。建议提供包含 DOI 的引用信息以获得更准确的结果。"]
  214. }
  215. logger.info(f"返回结果 - verified: {verified}, metadata: {metadata is not None}")
  216. if metadata:
  217. logger.info(f"Metadata keys: {list(metadata.keys())}")
  218. return result
  219. except Exception as e:
  220. logger.error(f"引用校验失败: {str(e)}")
  221. raise HTTPException(status_code=500, detail=f"校验失败: {str(e)}")
  222. @router.post("/generate", response_model=Dict[str, Any])
  223. async def generate_citation(request: CitationGenerateRequest):
  224. """生成引用格式"""
  225. try:
  226. # 模拟引用生成
  227. newline = "\n"
  228. quote = '"'
  229. citation_formats = {
  230. "bibtex": f"@article{{{request.authors or 'author2024'},{newline} title={{{request.title or 'Title'}}},{newline} author={{{request.authors or 'Author'}}},{newline} journal={{{request.journal or 'Journal'}}},{newline} year={{{request.year or 2024}}}{newline}}}",
  231. "apa": f"{request.authors or 'Author'} ({request.year or 2024}). {request.title or 'Title'}. *{request.journal or 'Journal'}*.",
  232. "ieee": f"[1] {request.authors or 'Author'}, {quote}{request.title or 'Title'},{quote} *{request.journal or 'Journal'}*, {request.year or 2024}.",
  233. "mla": f"{request.authors or 'Author'}. {quote}{request.title or 'Title'}.{quote} *{request.journal or 'Journal'}*, {request.year or 2024}."
  234. }
  235. citation = citation_formats.get(request.format, citation_formats["bibtex"])
  236. return {
  237. "success": True,
  238. "citation": citation,
  239. "format": request.format,
  240. "metadata": {
  241. "title": request.title,
  242. "authors": request.authors,
  243. "year": request.year,
  244. "journal": request.journal,
  245. "doi": request.doi
  246. },
  247. "timestamp": "2024-01-15T10:30:00Z"
  248. }
  249. except Exception as e:
  250. logger.error(f"引用生成失败: {str(e)}")
  251. raise HTTPException(status_code=500, detail=f"生成失败: {str(e)}")
  252. @router.get("/formats", response_model=Dict[str, Any])
  253. async def get_citation_formats():
  254. """获取支持的引用格式"""
  255. try:
  256. formats = {
  257. "bibtex": {
  258. "name": "BibTeX",
  259. "description": "常用于LaTeX文档的引用格式",
  260. "example": "@article{key, title={Title}, author={Author}, year={2024}}"
  261. },
  262. "apa": {
  263. "name": "APA",
  264. "description": "美国心理学会格式,常用于社会科学",
  265. "example": "Author, A. (2024). Title. *Journal*, 1(1), 1-10."
  266. },
  267. "ieee": {
  268. "name": "IEEE",
  269. "description": "电气电子工程师学会格式,常用于工程技术",
  270. "example": "[1] A. Author, \"Title,\" *Journal*, vol. 1, no. 1, pp. 1-10, 2024."
  271. },
  272. "mla": {
  273. "name": "MLA",
  274. "description": "现代语言学会格式,常用于人文学科",
  275. "example": "Author. \"Title.\" *Journal*, vol. 1, no. 1, 2024, pp. 1-10."
  276. }
  277. }
  278. return {
  279. "success": True,
  280. "formats": formats,
  281. "total": len(formats)
  282. }
  283. except Exception as e:
  284. logger.error(f"获取引用格式失败: {str(e)}")
  285. raise HTTPException(status_code=500, detail=f"获取失败: {str(e)}")