| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229 |
- """
- PDF 解析工具
- 支持从 PDF 文件中提取文本、标题、作者等信息
- """
- import logging
- from typing import Dict, Any, Optional
- import re
- logger = logging.getLogger(__name__)
- class PDFParser:
- """PDF 解析器"""
-
- def __init__(self):
- """初始化 PDF 解析器"""
- self.supported_formats = ['.pdf']
-
- async def parse_pdf(self, file_path: str) -> Dict[str, Any]:
- """
- 解析 PDF 文件
-
- Args:
- file_path: PDF 文件路径
-
- Returns:
- 包含解析结果的字典
- """
- try:
- import pdfplumber
-
- logger.info(f"开始解析 PDF: {file_path}")
-
- with pdfplumber.open(file_path) as pdf:
- # 提取所有文本
- full_text = ""
- for page in pdf.pages:
- text = page.extract_text()
- if text:
- full_text += text + "\n"
-
- if not full_text.strip():
- logger.warning("PDF 文件为空或无法提取文本")
- return {
- "success": False,
- "error": "无法从 PDF 中提取文本"
- }
-
- # 提取元数据
- metadata = pdf.metadata or {}
-
- # 尝试从文本中提取标题(通常在第一页的前几行)
- title = self._extract_title(full_text, metadata)
-
- # 尝试提取作者
- authors = self._extract_authors(full_text, metadata)
-
- # 尝试提取摘要
- abstract = self._extract_abstract(full_text)
-
- # 统计信息
- page_count = len(pdf.pages)
- word_count = len(full_text.split())
-
- result = {
- "success": True,
- "title": title,
- "authors": authors,
- "abstract": abstract,
- "full_text": full_text,
- "page_count": page_count,
- "word_count": word_count,
- "metadata": {
- "creator": metadata.get("/Creator", ""),
- "producer": metadata.get("/Producer", ""),
- "subject": metadata.get("/Subject", ""),
- "keywords": metadata.get("/Keywords", "")
- }
- }
-
- logger.info(f"PDF 解析成功: {page_count} 页, {word_count} 词")
- return result
-
- except ImportError:
- logger.error("pdfplumber 未安装")
- return {
- "success": False,
- "error": "PDF 解析库未安装,请运行: pip install pdfplumber"
- }
- except Exception as e:
- logger.error(f"PDF 解析失败: {str(e)}")
- return {
- "success": False,
- "error": f"PDF 解析失败: {str(e)}"
- }
-
- def _extract_title(self, text: str, metadata: Dict) -> str:
- """从文本或元数据中提取标题"""
- # 首先尝试从元数据获取
- if metadata.get("/Title"):
- return metadata["/Title"]
-
- # 从文本前几行提取(通常标题在最前面且字体较大)
- lines = text.split('\n')
- for i, line in enumerate(lines[:10]): # 只检查前10行
- line = line.strip()
- # 标题通常较长且不包含特殊字符
- if len(line) > 10 and len(line) < 200 and not line.startswith(('http', 'www', '@')):
- # 排除一些常见的非标题行
- if not any(keyword in line.lower() for keyword in ['abstract', 'introduction', 'page', 'arxiv']):
- return line
-
- return "未知标题"
-
- def _extract_authors(self, text: str, metadata: Dict) -> list:
- """从文本或元数据中提取作者"""
- authors = []
-
- # 首先尝试从元数据获取
- if metadata.get("/Author"):
- author_str = metadata["/Author"]
- authors = [a.strip() for a in re.split(r'[,;]', author_str) if a.strip()]
- if authors:
- return authors
-
- # 从文本中提取(通常在标题后面)
- lines = text.split('\n')
- for i, line in enumerate(lines[:20]): # 检查前20行
- line = line.strip()
- # 查找包含作者信息的行(通常包含邮箱或机构)
- if '@' in line or 'university' in line.lower() or 'institute' in line.lower():
- # 尝试提取前面几行作为作者名
- for j in range(max(0, i-3), i):
- potential_author = lines[j].strip()
- if potential_author and len(potential_author) < 100:
- # 简单的名字模式匹配
- if re.match(r'^[A-Z][a-z]+\s+[A-Z][a-z]+', potential_author):
- authors.append(potential_author)
-
- return authors if authors else ["未知作者"]
-
- def _extract_abstract(self, text: str) -> str:
- """从文本中提取摘要"""
- # 查找 Abstract 关键词
- abstract_patterns = [
- r'Abstract\s*[:\-]?\s*(.*?)(?=\n\n|\nIntroduction|\n1\.|\nKeywords)',
- r'ABSTRACT\s*[:\-]?\s*(.*?)(?=\n\n|\nINTRODUCTION|\n1\.|\nKEYWORDS)',
- r'摘要\s*[:\-]?\s*(.*?)(?=\n\n|关键词|引言|1\.)',
- ]
-
- for pattern in abstract_patterns:
- match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
- if match:
- abstract = match.group(1).strip()
- # 限制摘要长度
- if len(abstract) > 50 and len(abstract) < 2000:
- return abstract[:1000] # 最多返回1000字符
-
- # 如果没找到,返回前500个字符作为摘要
- return text[:500].strip() + "..."
-
- async def parse_pdf_from_bytes(self, pdf_bytes: bytes, filename: str = "document.pdf") -> Dict[str, Any]:
- """
- 从字节流解析 PDF
-
- Args:
- pdf_bytes: PDF 文件的字节内容
- filename: 文件名(用于日志)
-
- Returns:
- 包含解析结果的字典
- """
- try:
- import pdfplumber
- import io
-
- logger.info(f"开始解析 PDF 字节流: {filename}")
-
- with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
- # 提取所有文本
- full_text = ""
- for page in pdf.pages:
- text = page.extract_text()
- if text:
- full_text += text + "\n"
-
- if not full_text.strip():
- return {
- "success": False,
- "error": "无法从 PDF 中提取文本"
- }
-
- # 提取元数据
- metadata = pdf.metadata or {}
-
- # 提取信息
- title = self._extract_title(full_text, metadata)
- authors = self._extract_authors(full_text, metadata)
- abstract = self._extract_abstract(full_text)
-
- result = {
- "success": True,
- "title": title,
- "authors": authors,
- "abstract": abstract,
- "full_text": full_text,
- "page_count": len(pdf.pages),
- "word_count": len(full_text.split()),
- "metadata": {
- "creator": metadata.get("/Creator", ""),
- "producer": metadata.get("/Producer", ""),
- "subject": metadata.get("/Subject", ""),
- "keywords": metadata.get("/Keywords", "")
- }
- }
-
- logger.info(f"PDF 字节流解析成功")
- return result
-
- except Exception as e:
- logger.error(f"PDF 字节流解析失败: {str(e)}")
- return {
- "success": False,
- "error": f"PDF 解析失败: {str(e)}"
- }
- # 全局 PDF 解析器实例
- pdf_parser = PDFParser()
|