validator.py 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610
"""
InnoCore AI Validator Agent.

Generates citations in several formats and verifies paper metadata online.
"""
  5. import asyncio
  6. import aiohttp
  7. import re
  8. import json
  9. from typing import Dict, List, Optional, Any
  10. from datetime import datetime
  11. import hashlib
  12. from agents.base import BaseAgent
  13. from core.database import db_manager
  14. from core.exceptions import AgentException, ExternalAPIException
  15. class ValidatorAgent(BaseAgent):
  16. """校验官智能体"""
  17. def __init__(self, llm=None):
  18. super().__init__("Validator", llm)
  19. # API配置
  20. self.crossref_base_url = "https://api.crossref.org/works"
  21. self.google_scholar_url = "https://serpapi.com/search"
  22. # 添加工具
  23. self.add_tool("generate_bibtex", self._generate_bibtex, "生成BibTeX引用")
  24. self.add_tool("generate_apa", self._generate_apa, "生成APA格式引用")
  25. self.add_tool("generate_ieee", self._generate_ieee, "生成IEEE格式引用")
  26. self.add_tool("verify_metadata", self._verify_metadata, "校验元数据")
  27. self.add_tool("crossref_lookup", self._crossref_lookup, "CrossRef查询")
  28. self.add_tool("scholar_lookup", self._scholar_lookup, "Google Scholar查询")
  29. async def run(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
  30. """执行引用校验任务"""
  31. await self.validate_input(input_data)
  32. self.set_state("running")
  33. try:
  34. paper_info = input_data["paper_info"]
  35. formats = input_data.get("formats", ["bibtex", "apa", "ieee"])
  36. verify_external = input_data.get("verify_external", True)
  37. # 1. 生成多种格式的引用
  38. citations = await self._generate_citations(paper_info, formats)
  39. # 2. 外部校验元数据
  40. verification_result = {}
  41. if verify_external:
  42. verification_result = await self._verify_paper_metadata(paper_info)
  43. # 3. 合并和更新引用信息
  44. final_citations = await self._merge_citation_data(
  45. citations,
  46. verification_result,
  47. paper_info
  48. )
  49. # 4. 缓存结果
  50. await self._cache_citation_results(final_citations)
  51. self.set_state("completed")
  52. return {
  53. "status": "success",
  54. "paper_info": paper_info,
  55. "citations": final_citations,
  56. "verification": verification_result,
  57. "formats_generated": list(citations.keys()),
  58. "verification_status": verification_result.get("status", "unknown"),
  59. "timestamp": datetime.now().isoformat()
  60. }
  61. except Exception as e:
  62. self.set_state("error")
  63. raise AgentException(f"Validator Agent执行失败: {str(e)}")
  64. def get_required_fields(self) -> List[str]:
  65. """获取必需的输入字段"""
  66. return ["paper_info"]
  67. async def _generate_citations(self, paper_info: Dict, formats: List[str]) -> Dict[str, Any]:
  68. """生成多种格式的引用"""
  69. citations = {}
  70. for format_type in formats:
  71. try:
  72. if format_type.lower() == "bibtex":
  73. citations["bibtex"] = await self._generate_bibtex_citation(paper_info)
  74. elif format_type.lower() == "apa":
  75. citations["apa"] = await self._generate_apa_citation(paper_info)
  76. elif format_type.lower() == "ieee":
  77. citations["ieee"] = await self._generate_ieee_citation(paper_info)
  78. else:
  79. self._add_to_history(f"不支持的引用格式: {format_type}")
  80. except Exception as e:
  81. self._add_to_history(f"生成{format_type}格式失败: {str(e)}")
  82. citations[format_type] = f"生成失败: {str(e)}"
  83. return citations
  84. async def _generate_bibtex_citation(self, paper_info: Dict) -> str:
  85. """生成BibTeX格式引用"""
  86. # 生成引用键
  87. first_author = paper_info.get("authors", [""])[0]
  88. if isinstance(first_author, str):
  89. last_name = first_author.split()[-1].lower()
  90. else:
  91. last_name = "unknown"
  92. year = paper_info.get("year", datetime.now().year)
  93. title_words = paper_info.get("title", "").split()[:3]
  94. title_key = "".join([w.lower() for w in title_words if w.isalpha()])
  95. citation_key = f"{last_name}{year}{title_key}"
  96. # 构建BibTeX条目
  97. entry_type = self._determine_entry_type(paper_info)
  98. bibtex = f"@{entry_type}{{{citation_key},\n"
  99. # 添加作者
  100. authors = paper_info.get("authors", [])
  101. if authors:
  102. bibtex += f" author = {{{self._format_bibtex_authors(authors)}}},\n"
  103. # 添加标题
  104. title = paper_info.get("title", "")
  105. if title:
  106. bibtex += f" title = {{{title}}},\n"
  107. # 添加期刊/会议信息
  108. if entry_type == "article":
  109. journal = paper_info.get("journal", "")
  110. if journal:
  111. bibtex += f" journal = {{{journal}}},\n"
  112. volume = paper_info.get("volume", "")
  113. if volume:
  114. bibtex += f" volume = {{{volume}}},\n"
  115. number = paper_info.get("number", "")
  116. if number:
  117. bibtex += f" number = {{{number}}},\n"
  118. pages = paper_info.get("pages", "")
  119. if pages:
  120. bibtex += f" pages = {{{pages}}},\n"
  121. elif entry_type == "inproceedings":
  122. booktitle = paper_info.get("booktitle", "")
  123. if booktitle:
  124. bibtex += f" booktitle = {{{booktitle}}},\n"
  125. pages = paper_info.get("pages", "")
  126. if pages:
  127. bibtex += f" pages = {{{pages}}},\n"
  128. # 添加年份
  129. if year:
  130. bibtex += f" year = {{{year}}},\n"
  131. # 添加DOI
  132. doi = paper_info.get("doi", "")
  133. if doi:
  134. bibtex += f" doi = {{{doi}}},\n"
  135. # 添加URL
  136. url = paper_info.get("url", "")
  137. if url:
  138. bibtex += f" url = {{{url}}},\n"
  139. # 移除最后的逗号并关闭
  140. bibtex = bibtex.rstrip(",\n") + "\n}"
  141. return bibtex
  142. async def _generate_apa_citation(self, paper_info: Dict) -> str:
  143. """生成APA格式引用"""
  144. authors = paper_info.get("authors", [])
  145. year = paper_info.get("year", "")
  146. title = paper_info.get("title", "")
  147. # 格式化作者
  148. if len(authors) == 0:
  149. author_text = ""
  150. elif len(authors) == 1:
  151. author_text = authors[0]
  152. elif len(authors) == 2:
  153. author_text = f"{authors[0]} & {authors[1]}"
  154. elif len(authors) <= 7:
  155. author_text = ", ".join(authors[:-1]) + f", & {authors[-1]}"
  156. else:
  157. author_text = ", ".join(authors[:6]) + f", ... {authors[-1]}"
  158. # 构建APA引用
  159. if year:
  160. apa_citation = f"{author_text} ({year}). {title}."
  161. else:
  162. apa_citation = f"{author_text}. {title}."
  163. # 添加期刊信息
  164. journal = paper_info.get("journal", "")
  165. volume = paper_info.get("volume", "")
  166. number = paper_info.get("number", "")
  167. pages = paper_info.get("pages", "")
  168. if journal:
  169. if volume and number:
  170. apa_citation += f" *{journal}*, *{volume}({number})*"
  171. elif volume:
  172. apa_citation += f" *{journal}*, *{volume}*"
  173. else:
  174. apa_citation += f" *{journal}*"
  175. if pages:
  176. apa_citation += f", {pages}."
  177. else:
  178. apa_citation += "."
  179. # 添加DOI
  180. doi = paper_info.get("doi", "")
  181. if doi:
  182. apa_citation += f" https://doi.org/{doi}"
  183. return apa_citation
  184. async def _generate_ieee_citation(self, paper_info: Dict) -> str:
  185. """生成IEEE格式引用"""
  186. authors = paper_info.get("authors", [])
  187. year = paper_info.get("year", "")
  188. title = paper_info.get("title", "")
  189. # 格式化作者(IEEE使用首字母缩写)
  190. ieee_authors = []
  191. for author in authors[:3]: # IEEE通常只列出前3个作者
  192. if isinstance(author, str):
  193. parts = author.split()
  194. if len(parts) >= 2:
  195. last_name = parts[-1]
  196. initials = " ".join([p[0] + "." for p in parts[:-1]])
  197. ieee_authors.append(f"{initials} {last_name}")
  198. else:
  199. ieee_authors.append(author)
  200. if len(authors) > 3:
  201. ieee_authors.append("et al.")
  202. author_text = ", ".join(ieee_authors)
  203. # 构建IEEE引用
  204. if title:
  205. ieee_citation = f'"{title},"'
  206. else:
  207. ieee_citation = ""
  208. # 添加期刊信息
  209. journal = paper_info.get("journal", "")
  210. volume = paper_info.get("volume", "")
  211. number = paper_info.get("number", "")
  212. pages = paper_info.get("pages", "")
  213. if journal:
  214. if volume and number:
  215. ieee_citation += f" *{journal}*, vol. {volume}, no. {number}"
  216. elif volume:
  217. ieee_citation += f" *{journal}*, vol. {volume}"
  218. else:
  219. ieee_citation += f" *{journal}*"
  220. if pages:
  221. ieee_citation += f", pp. {pages}"
  222. # 添加年份和月份
  223. if year:
  224. month = paper_info.get("month", "")
  225. if month:
  226. ieee_citation += f", {month}. {year}."
  227. else:
  228. ieee_citation += f", {year}."
  229. # 添加DOI
  230. doi = paper_info.get("doi", "")
  231. if doi:
  232. ieee_citation += f" doi: {doi}"
  233. return ieee_citation
  234. def _determine_entry_type(self, paper_info: Dict) -> str:
  235. """确定BibTeX条目类型"""
  236. if paper_info.get("journal"):
  237. return "article"
  238. elif paper_info.get("booktitle"):
  239. return "inproceedings"
  240. elif paper_info.get("publisher"):
  241. return "book"
  242. else:
  243. return "misc"
  244. def _format_bibtex_authors(self, authors: List[str]) -> str:
  245. """格式化BibTeX作者"""
  246. formatted_authors = []
  247. for author in authors:
  248. if isinstance(author, str):
  249. # 将 "First Last" 转换为 "Last, First"
  250. parts = author.split()
  251. if len(parts) >= 2:
  252. formatted_authors.append(f"{parts[-1]}, {' '.join(parts[:-1])}")
  253. else:
  254. formatted_authors.append(author)
  255. else:
  256. formatted_authors.append(str(author))
  257. return " and ".join(formatted_authors)
  258. async def _verify_paper_metadata(self, paper_info: Dict) -> Dict[str, Any]:
  259. """校验论文元数据"""
  260. verification_result = {
  261. "status": "pending",
  262. "crossref_verified": False,
  263. "scholar_verified": False,
  264. "discrepancies": [],
  265. "suggested_corrections": {},
  266. "verification_timestamp": datetime.now().isoformat()
  267. }
  268. doi = paper_info.get("doi", "")
  269. title = paper_info.get("title", "")
  270. try:
  271. # 1. CrossRef校验
  272. if doi:
  273. crossref_data = await self._crossref_lookup_by_doi(doi)
  274. if crossref_data:
  275. verification_result["crossref_verified"] = True
  276. discrepancies = self._compare_metadata(paper_info, crossref_data)
  277. if discrepancies:
  278. verification_result["discrepancies"].extend(discrepancies)
  279. verification_result["suggested_corrections"].update(
  280. self._generate_corrections(discrepancies)
  281. )
  282. # 2. Google Scholar校验
  283. if title:
  284. scholar_data = await self._scholar_lookup_by_title(title)
  285. if scholar_data:
  286. verification_result["scholar_verified"] = True
  287. discrepancies = self._compare_metadata(paper_info, scholar_data)
  288. if discrepancies:
  289. verification_result["discrepancies"].extend(discrepancies)
  290. verification_result["suggested_corrections"].update(
  291. self._generate_corrections(discrepancies)
  292. )
  293. # 确定最终状态
  294. if verification_result["crossref_verified"] or verification_result["scholar_verified"]:
  295. if not verification_result["discrepancies"]:
  296. verification_result["status"] = "verified"
  297. else:
  298. verification_result["status"] = "discrepancies_found"
  299. else:
  300. verification_result["status"] = "unverified"
  301. except Exception as e:
  302. verification_result["status"] = "error"
  303. verification_result["error"] = str(e)
  304. self._add_to_history(f"元数据校验失败: {str(e)}")
  305. return verification_result
  306. async def _crossref_lookup_by_doi(self, doi: str) -> Optional[Dict]:
  307. """通过DOI查询CrossRef"""
  308. try:
  309. url = f"{self.crossref_base_url}/{doi}"
  310. async with aiohttp.ClientSession() as session:
  311. async with session.get(url) as response:
  312. if response.status == 200:
  313. data = await response.json()
  314. return self._parse_crossref_data(data)
  315. else:
  316. self._add_to_history(f"CrossRef查询失败,状态码: {response.status}")
  317. return None
  318. except Exception as e:
  319. self._add_to_history(f"CrossRef查询异常: {str(e)}")
  320. return None
  321. async def _scholar_lookup_by_title(self, title: str) -> Optional[Dict]:
  322. """通过标题查询Google Scholar"""
  323. try:
  324. config = self.config.external_apis
  325. if not config.serpapi_key:
  326. self._add_to_history("SerpApi key缺失,跳过Google Scholar查询")
  327. return None
  328. params = {
  329. "engine": "google_scholar",
  330. "q": title,
  331. "api_key": config.serpapi_key
  332. }
  333. async with aiohttp.ClientSession() as session:
  334. async with session.get(self.google_scholar_url, params=params) as response:
  335. if response.status == 200:
  336. data = await response.json()
  337. return self._parse_scholar_data(data)
  338. else:
  339. self._add_to_history(f"Google Scholar查询失败,状态码: {response.status}")
  340. return None
  341. except Exception as e:
  342. self._add_to_history(f"Google Scholar查询异常: {str(e)}")
  343. return None
  344. def _parse_crossref_data(self, data: Dict) -> Dict:
  345. """解析CrossRef数据"""
  346. message = data.get("message", {})
  347. return {
  348. "title": " ".join(message.get("title", [])),
  349. "authors": [f"{author.get('given', '')} {author.get('family', '')}"
  350. for author in message.get("author", [])],
  351. "year": message.get("published-print", {}).get("date-parts", [[""]])[0][0][:4],
  352. "journal": message.get("short-container-title", [""])[0],
  353. "volume": message.get("volume", ""),
  354. "issue": message.get("issue", ""),
  355. "page": message.get("page", ""),
  356. "doi": message.get("DOI", ""),
  357. "source": "crossref"
  358. }
  359. def _parse_scholar_data(self, data: Dict) -> Dict:
  360. """解析Google Scholar数据"""
  361. organic_results = data.get("organic_results", [])
  362. if not organic_results:
  363. return {}
  364. first_result = organic_results[0]
  365. # 提取年份
  366. publication_info = first_result.get("publication_info", {})
  367. year = ""
  368. if "summary" in publication_info:
  369. year_match = re.search(r'\b(19|20)\d{2}\b', publication_info["summary"])
  370. if year_match:
  371. year = year_match.group()
  372. return {
  373. "title": first_result.get("title", ""),
  374. "authors": first_result.get("publication_info", {}).get("authors", []),
  375. "year": year,
  376. "journal": publication_info.get("summary", "").split(",")[0] if publication_info.get("summary") else "",
  377. "source": "google_scholar"
  378. }
  379. def _compare_metadata(self, original: Dict, reference: Dict) -> List[Dict]:
  380. """比较元数据差异"""
  381. discrepancies = []
  382. # 比较标题
  383. orig_title = original.get("title", "").lower().strip()
  384. ref_title = reference.get("title", "").lower().strip()
  385. if orig_title and ref_title and orig_title != ref_title:
  386. discrepancies.append({
  387. "field": "title",
  388. "original": original.get("title", ""),
  389. "reference": reference.get("title", ""),
  390. "similarity": self._calculate_similarity(orig_title, ref_title)
  391. })
  392. # 比较作者
  393. orig_authors = set([author.lower() for author in original.get("authors", [])])
  394. ref_authors = set([author.lower() for author in reference.get("authors", [])])
  395. if orig_authors and ref_authors and orig_authors != ref_authors:
  396. discrepancies.append({
  397. "field": "authors",
  398. "original": original.get("authors", []),
  399. "reference": reference.get("authors", []),
  400. "missing_in_original": list(ref_authors - orig_authors),
  401. "extra_in_original": list(orig_authors - ref_authors)
  402. })
  403. # 比较年份
  404. orig_year = str(original.get("year", ""))
  405. ref_year = str(reference.get("year", ""))
  406. if orig_year and ref_year and orig_year != ref_year:
  407. discrepancies.append({
  408. "field": "year",
  409. "original": orig_year,
  410. "reference": ref_year
  411. })
  412. return discrepancies
  413. def _calculate_similarity(self, text1: str, text2: str) -> float:
  414. """计算文本相似度"""
  415. if not text1 or not text2:
  416. return 0.0
  417. words1 = set(text1.split())
  418. words2 = set(text2.split())
  419. intersection = words1.intersection(words2)
  420. union = words1.union(words2)
  421. return len(intersection) / len(union) if union else 0.0
  422. def _generate_corrections(self, discrepancies: List[Dict]) -> Dict:
  423. """生成修正建议"""
  424. corrections = {}
  425. for discrepancy in discrepancies:
  426. field = discrepancy["field"]
  427. if field == "title" and discrepancy.get("similarity", 0) > 0.8:
  428. corrections[field] = discrepancy["reference"]
  429. elif field == "year":
  430. corrections[field] = discrepancy["reference"]
  431. elif field == "authors":
  432. # 对于作者,建议使用参考数据的完整列表
  433. corrections[field] = discrepancy["reference"]
  434. return corrections
  435. async def _merge_citation_data(self, citations: Dict, verification: Dict, paper_info: Dict) -> Dict[str, Any]:
  436. """合并引用数据"""
  437. final_citations = {}
  438. for format_type, citation_text in citations.items():
  439. if isinstance(citation_text, str) and not citation_text.startswith("生成失败"):
  440. # 添加校验状态标记
  441. verification_status = verification.get("status", "unknown")
  442. if verification_status == "verified":
  443. citation_text += " % [Verified]"
  444. elif verification_status == "discrepancies_found":
  445. citation_text += " % [Discrepancies Found]"
  446. else:
  447. citation_text += " % [Unverified]"
  448. final_citations[format_type] = citation_text
  449. else:
  450. final_citations[format_type] = citation_text
  451. # 添加元数据
  452. final_citations["metadata"] = {
  453. "original_info": paper_info,
  454. "verification": verification,
  455. "generated_formats": list(citations.keys()),
  456. "generation_timestamp": datetime.now().isoformat()
  457. }
  458. return final_citations
  459. async def _cache_citation_results(self, citations: Dict):
  460. """缓存引用结果"""
  461. try:
  462. metadata = citations.get("metadata", {})
  463. original_info = metadata.get("original_info", {})
  464. doi = original_info.get("doi", "")
  465. if doi:
  466. # 缓存BibTeX格式
  467. bibtex = citations.get("bibtex", "")
  468. if bibtex and not bibtex.startswith("生成失败"):
  469. verification = metadata.get("verification", {})
  470. is_verified = verification.get("status") == "verified"
  471. await db_manager.cache_reference(
  472. doi=doi,
  473. bibtex=bibtex,
  474. is_verified=is_verified
  475. )
  476. self._add_to_history(f"引用已缓存: {doi}")
  477. except Exception as e:
  478. self._add_to_history(f"缓存引用失败: {str(e)}")
    # Tool methods
  480. async def _generate_bibtex(self, paper_info: Dict) -> str:
  481. """生成BibTeX工具"""
  482. return await self._generate_bibtex_citation(paper_info)
  483. async def _generate_apa(self, paper_info: Dict) -> str:
  484. """生成APA工具"""
  485. return await self._generate_apa_citation(paper_info)
  486. async def _generate_ieee(self, paper_info: Dict) -> str:
  487. """生成IEEE工具"""
  488. return await self._generate_ieee_citation(paper_info)
  489. async def _verify_metadata(self, paper_info: Dict) -> Dict:
  490. """校验元数据工具"""
  491. return await self._verify_paper_metadata(paper_info)
  492. async def _crossref_lookup(self, identifier: str) -> Dict:
  493. """CrossRef查询工具"""
  494. if identifier.startswith("10."): # DOI
  495. return await self._crossref_lookup_by_doi(identifier)
  496. else:
  497. return {"error": "请提供有效的DOI"}
  498. async def _scholar_lookup(self, title: str) -> Dict:
  499. """Google Scholar查询工具"""
  500. return await self._scholar_lookup_by_title(title)