# pdf_parser.py
"""
PDF 解析工具
支持从 PDF 文件中提取文本、标题、作者等信息
"""
import io
import logging
import re
from typing import Any, Dict, List, Optional
  8. logger = logging.getLogger(__name__)
  9. class PDFParser:
  10. """PDF 解析器"""
  11. def __init__(self):
  12. """初始化 PDF 解析器"""
  13. self.supported_formats = ['.pdf']
  14. async def parse_pdf(self, file_path: str) -> Dict[str, Any]:
  15. """
  16. 解析 PDF 文件
  17. Args:
  18. file_path: PDF 文件路径
  19. Returns:
  20. 包含解析结果的字典
  21. """
  22. try:
  23. import pdfplumber
  24. logger.info(f"开始解析 PDF: {file_path}")
  25. with pdfplumber.open(file_path) as pdf:
  26. # 提取所有文本
  27. full_text = ""
  28. for page in pdf.pages:
  29. text = page.extract_text()
  30. if text:
  31. full_text += text + "\n"
  32. if not full_text.strip():
  33. logger.warning("PDF 文件为空或无法提取文本")
  34. return {
  35. "success": False,
  36. "error": "无法从 PDF 中提取文本"
  37. }
  38. # 提取元数据
  39. metadata = pdf.metadata or {}
  40. # 尝试从文本中提取标题(通常在第一页的前几行)
  41. title = self._extract_title(full_text, metadata)
  42. # 尝试提取作者
  43. authors = self._extract_authors(full_text, metadata)
  44. # 尝试提取摘要
  45. abstract = self._extract_abstract(full_text)
  46. # 统计信息
  47. page_count = len(pdf.pages)
  48. word_count = len(full_text.split())
  49. result = {
  50. "success": True,
  51. "title": title,
  52. "authors": authors,
  53. "abstract": abstract,
  54. "full_text": full_text,
  55. "page_count": page_count,
  56. "word_count": word_count,
  57. "metadata": {
  58. "creator": metadata.get("/Creator", ""),
  59. "producer": metadata.get("/Producer", ""),
  60. "subject": metadata.get("/Subject", ""),
  61. "keywords": metadata.get("/Keywords", "")
  62. }
  63. }
  64. logger.info(f"PDF 解析成功: {page_count} 页, {word_count} 词")
  65. return result
  66. except ImportError:
  67. logger.error("pdfplumber 未安装")
  68. return {
  69. "success": False,
  70. "error": "PDF 解析库未安装,请运行: pip install pdfplumber"
  71. }
  72. except Exception as e:
  73. logger.error(f"PDF 解析失败: {str(e)}")
  74. return {
  75. "success": False,
  76. "error": f"PDF 解析失败: {str(e)}"
  77. }
  78. def _extract_title(self, text: str, metadata: Dict) -> str:
  79. """从文本或元数据中提取标题"""
  80. # 首先尝试从元数据获取
  81. if metadata.get("/Title"):
  82. return metadata["/Title"]
  83. # 从文本前几行提取(通常标题在最前面且字体较大)
  84. lines = text.split('\n')
  85. for i, line in enumerate(lines[:10]): # 只检查前10行
  86. line = line.strip()
  87. # 标题通常较长且不包含特殊字符
  88. if len(line) > 10 and len(line) < 200 and not line.startswith(('http', 'www', '@')):
  89. # 排除一些常见的非标题行
  90. if not any(keyword in line.lower() for keyword in ['abstract', 'introduction', 'page', 'arxiv']):
  91. return line
  92. return "未知标题"
  93. def _extract_authors(self, text: str, metadata: Dict) -> list:
  94. """从文本或元数据中提取作者"""
  95. authors = []
  96. # 首先尝试从元数据获取
  97. if metadata.get("/Author"):
  98. author_str = metadata["/Author"]
  99. authors = [a.strip() for a in re.split(r'[,;]', author_str) if a.strip()]
  100. if authors:
  101. return authors
  102. # 从文本中提取(通常在标题后面)
  103. lines = text.split('\n')
  104. for i, line in enumerate(lines[:20]): # 检查前20行
  105. line = line.strip()
  106. # 查找包含作者信息的行(通常包含邮箱或机构)
  107. if '@' in line or 'university' in line.lower() or 'institute' in line.lower():
  108. # 尝试提取前面几行作为作者名
  109. for j in range(max(0, i-3), i):
  110. potential_author = lines[j].strip()
  111. if potential_author and len(potential_author) < 100:
  112. # 简单的名字模式匹配
  113. if re.match(r'^[A-Z][a-z]+\s+[A-Z][a-z]+', potential_author):
  114. authors.append(potential_author)
  115. return authors if authors else ["未知作者"]
  116. def _extract_abstract(self, text: str) -> str:
  117. """从文本中提取摘要"""
  118. # 查找 Abstract 关键词
  119. abstract_patterns = [
  120. r'Abstract\s*[:\-]?\s*(.*?)(?=\n\n|\nIntroduction|\n1\.|\nKeywords)',
  121. r'ABSTRACT\s*[:\-]?\s*(.*?)(?=\n\n|\nINTRODUCTION|\n1\.|\nKEYWORDS)',
  122. r'摘要\s*[:\-]?\s*(.*?)(?=\n\n|关键词|引言|1\.)',
  123. ]
  124. for pattern in abstract_patterns:
  125. match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
  126. if match:
  127. abstract = match.group(1).strip()
  128. # 限制摘要长度
  129. if len(abstract) > 50 and len(abstract) < 2000:
  130. return abstract[:1000] # 最多返回1000字符
  131. # 如果没找到,返回前500个字符作为摘要
  132. return text[:500].strip() + "..."
  133. async def parse_pdf_from_bytes(self, pdf_bytes: bytes, filename: str = "document.pdf") -> Dict[str, Any]:
  134. """
  135. 从字节流解析 PDF
  136. Args:
  137. pdf_bytes: PDF 文件的字节内容
  138. filename: 文件名(用于日志)
  139. Returns:
  140. 包含解析结果的字典
  141. """
  142. try:
  143. import pdfplumber
  144. import io
  145. logger.info(f"开始解析 PDF 字节流: {filename}")
  146. with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
  147. # 提取所有文本
  148. full_text = ""
  149. for page in pdf.pages:
  150. text = page.extract_text()
  151. if text:
  152. full_text += text + "\n"
  153. if not full_text.strip():
  154. return {
  155. "success": False,
  156. "error": "无法从 PDF 中提取文本"
  157. }
  158. # 提取元数据
  159. metadata = pdf.metadata or {}
  160. # 提取信息
  161. title = self._extract_title(full_text, metadata)
  162. authors = self._extract_authors(full_text, metadata)
  163. abstract = self._extract_abstract(full_text)
  164. result = {
  165. "success": True,
  166. "title": title,
  167. "authors": authors,
  168. "abstract": abstract,
  169. "full_text": full_text,
  170. "page_count": len(pdf.pages),
  171. "word_count": len(full_text.split()),
  172. "metadata": {
  173. "creator": metadata.get("/Creator", ""),
  174. "producer": metadata.get("/Producer", ""),
  175. "subject": metadata.get("/Subject", ""),
  176. "keywords": metadata.get("/Keywords", "")
  177. }
  178. }
  179. logger.info(f"PDF 字节流解析成功")
  180. return result
  181. except Exception as e:
  182. logger.error(f"PDF 字节流解析失败: {str(e)}")
  183. return {
  184. "success": False,
  185. "error": f"PDF 解析失败: {str(e)}"
  186. }
  187. # 全局 PDF 解析器实例
  188. pdf_parser = PDFParser()