# text_processor.py
  1. """
  2. InnoCore AI 文本处理工具
  3. """
  4. import re
  5. from typing import List, Dict, Optional, Any, Tuple
  6. import string
  7. from collections import Counter
  8. import asyncio
  9. class TextProcessor:
  10. """文本处理器"""
  11. def __init__(self):
  12. self.stop_words = self._load_stop_words()
  13. self.punctuation = string.punctuation
  14. def _load_stop_words(self) -> set:
  15. """加载停用词"""
  16. # 简化的停用词列表
  17. return {
  18. 'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of',
  19. 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has',
  20. 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may',
  21. 'might', 'must', 'can', 'this', 'that', 'these', 'those', 'i', 'you',
  22. 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them',
  23. 'my', 'your', 'his', 'her', 'its', 'our', 'their', 'mine', 'yours',
  24. 'hers', 'ours', 'theirs', 'what', 'which', 'who', 'whom', 'whose',
  25. 'where', 'when', 'why', 'how', 'all', 'each', 'every', 'both', 'few',
  26. 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only',
  27. 'own', 'same', 'so', 'than', 'too', 'very', 'just', 'now', 'also'
  28. }
  29. def clean_text(self, text: str) -> str:
  30. """清理文本"""
  31. if not text:
  32. return ""
  33. # 移除多余的空白字符
  34. text = re.sub(r'\s+', ' ', text)
  35. # 移除特殊字符(保留基本标点)
  36. text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\{\}\"\'\/\\]', ' ', text)
  37. # 移除多余的空格
  38. text = re.sub(r'\s+', ' ', text).strip()
  39. return text
  40. def tokenize(self, text: str) -> List[str]:
  41. """分词"""
  42. if not text:
  43. return []
  44. # 转换为小写并分词
  45. words = text.lower().split()
  46. # 移除标点符号
  47. words = [word.strip(self.punctuation) for word in words]
  48. # 过滤空字符串
  49. words = [word for word in words if word]
  50. return words
  51. def remove_stop_words(self, words: List[str]) -> List[str]:
  52. """移除停用词"""
  53. return [word for word in words if word not in self.stop_words]
  54. def extract_sentences(self, text: str) -> List[str]:
  55. """提取句子"""
  56. if not text:
  57. return []
  58. # 使用正则表达式分割句子
  59. sentences = re.split(r'[.!?]+', text)
  60. # 清理和过滤
  61. sentences = [s.strip() for s in sentences if s.strip()]
  62. return sentences
  63. def extract_paragraphs(self, text: str) -> List[str]:
  64. """提取段落"""
  65. if not text:
  66. return []
  67. # 按双换行分割段落
  68. paragraphs = re.split(r'\n\s*\n', text)
  69. # 清理和过滤
  70. paragraphs = [p.strip() for p in paragraphs if p.strip()]
  71. return paragraphs
  72. def calculate_readability(self, text: str) -> Dict[str, float]:
  73. """计算文本可读性指标"""
  74. if not text:
  75. return {"flesch_score": 0.0, "avg_sentence_length": 0.0, "avg_word_length": 0.0}
  76. sentences = self.extract_sentences(text)
  77. words = self.tokenize(text)
  78. if not sentences or not words:
  79. return {"flesch_score": 0.0, "avg_sentence_length": 0.0, "avg_word_length": 0.0}
  80. # 平均句子长度
  81. avg_sentence_length = len(words) / len(sentences)
  82. # 平均词长
  83. avg_word_length = sum(len(word) for word in words) / len(words)
  84. # 简化的Flesch Reading Ease分数
  85. flesch_score = 206.835 - (1.015 * avg_sentence_length) - (84.6 * avg_word_length)
  86. return {
  87. "flesch_score": max(0, min(100, flesch_score)),
  88. "avg_sentence_length": avg_sentence_length,
  89. "avg_word_length": avg_word_length
  90. }
  91. def extract_key_phrases(self, text: str, max_phrases: int = 10) -> List[str]:
  92. """提取关键短语"""
  93. if not text:
  94. return []
  95. # 简化的关键短语提取
  96. words = self.tokenize(text)
  97. words = self.remove_stop_words(words)
  98. # 寻找常见的学术短语模式
  99. phrase_patterns = [
  100. r'\b\w+\s+\w+\b', # 两词短语
  101. r'\b\w+\s+\w+\s+\w+\b', # 三词短语
  102. ]
  103. phrases = []
  104. for pattern in phrase_patterns:
  105. matches = re.findall(pattern, text.lower())
  106. phrases.extend(matches)
  107. # 计算短语频率
  108. phrase_freq = Counter(phrases)
  109. # 过滤和排序
  110. filtered_phrases = [
  111. phrase for phrase, freq in phrase_freq.items()
  112. if freq > 1 and len(phrase.split()) >= 2
  113. ]
  114. filtered_phrases.sort(key=lambda x: phrase_freq[x], reverse=True)
  115. return filtered_phrases[:max_phrases]
  116. def detect_language(self, text: str) -> str:
  117. """检测语言(简化实现)"""
  118. if not text:
  119. return "unknown"
  120. # 简单的语言检测基于常见词汇
  121. chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
  122. english_chars = len(re.findall(r'[a-zA-Z]', text))
  123. total_chars = chinese_chars + english_chars
  124. if total_chars == 0:
  125. return "unknown"
  126. chinese_ratio = chinese_chars / total_chars
  127. if chinese_ratio > 0.3:
  128. return "chinese"
  129. elif english_chars > 0:
  130. return "english"
  131. else:
  132. return "unknown"
  133. def extract_citations(self, text: str) -> List[Dict[str, Any]]:
  134. """提取引用"""
  135. citations = []
  136. # 数字引用模式 [1], [2-3]
  137. numeric_pattern = r'\[(\d+(?:-\d+)?)\]'
  138. numeric_matches = re.finditer(numeric_pattern, text)
  139. for match in numeric_matches:
  140. citations.append({
  141. "type": "numeric",
  142. "text": match.group(0),
  143. "reference": match.group(1),
  144. "position": match.start()
  145. })
  146. # 作者年份引用 (Smith, 2020)
  147. author_year_pattern = r'\(([A-Za-z]+(?:\s+et\s+al\.)?,\s*\d{4})\)'
  148. author_year_matches = re.finditer(author_year_pattern, text)
  149. for match in author_year_matches:
  150. citations.append({
  151. "type": "author_year",
  152. "text": match.group(0),
  153. "reference": match.group(1),
  154. "position": match.start()
  155. })
  156. return citations
  157. def extract_numbers_and_units(self, text: str) -> List[Dict[str, Any]]:
  158. """提取数字和单位"""
  159. patterns = [
  160. r'(\d+(?:\.\d+)?)\s*([a-zA-Z%]+)', # 数字 + 单位
  161. r'(\d+(?:,\d{3})*(?:\.\d+)?)', # 带逗号的数字
  162. ]
  163. results = []
  164. for pattern in patterns:
  165. matches = re.finditer(pattern, text)
  166. for match in matches:
  167. results.append({
  168. "text": match.group(0),
  169. "number": match.group(1),
  170. "unit": match.group(2) if len(match.groups()) > 1 else "",
  171. "position": match.start()
  172. })
  173. return results
  174. def extract_acronyms(self, text: str) -> Dict[str, str]:
  175. """提取缩写词"""
  176. acronyms = {}
  177. # 查找全称(缩写)模式
  178. acronym_pattern = r'([A-Za-z\s]+)\s*\(([A-Z]{2,})\)'
  179. matches = re.finditer(acronym_pattern, text)
  180. for match in matches:
  181. full_name = match.group(1).strip()
  182. acronym = match.group(2)
  183. # 验证缩写是否来自全称的首字母
  184. initials = ''.join([word[0].upper() for word in full_name.split() if word])
  185. if acronym.startswith(initials):
  186. acronyms[acronym] = full_name
  187. return acronyms
  188. def summarize_text(self, text: str, max_sentences: int = 3) -> str:
  189. """文本摘要(简化实现)"""
  190. if not text:
  191. return ""
  192. sentences = self.extract_sentences(text)
  193. if len(sentences) <= max_sentences:
  194. return " ".join(sentences)
  195. # 简单的摘要算法:选择包含关键词最多的句子
  196. words = self.tokenize(text)
  197. words = self.remove_stop_words(words)
  198. word_freq = Counter(words)
  199. sentence_scores = []
  200. for sentence in sentences:
  201. sentence_words = self.tokenize(sentence)
  202. sentence_words = self.remove_stop_words(sentence_words)
  203. score = sum(word_freq.get(word, 0) for word in sentence_words)
  204. sentence_scores.append((sentence, score))
  205. # 选择得分最高的句子
  206. sentence_scores.sort(key=lambda x: x[1], reverse=True)
  207. top_sentences = [sentence for sentence, score in sentence_scores[:max_sentences]]
  208. # 按原文顺序排列
  209. summary_sentences = []
  210. for sentence in sentences:
  211. if sentence in top_sentences:
  212. summary_sentences.append(sentence)
  213. return " ".join(summary_sentences)
  214. def extract_entities(self, text: str) -> Dict[str, List[str]]:
  215. """实体提取(简化实现)"""
  216. entities = {
  217. "persons": [],
  218. "organizations": [],
  219. "locations": [],
  220. "dates": [],
  221. "numbers": []
  222. }
  223. # 人名模式(简化)
  224. person_pattern = r'\b([A-Z][a-z]+\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b'
  225. person_matches = re.findall(person_pattern, text)
  226. entities["persons"] = list(set(person_matches))
  227. # 组织模式(简化)
  228. org_patterns = [
  229. r'\b([A-Z][a-z]+\s+(?:University|Institute|Laboratory|Company|Corp|Inc|Ltd))\b',
  230. r'\b((?:[A-Z]+\s*){2,})\b'
  231. ]
  232. for pattern in org_patterns:
  233. matches = re.findall(pattern, text)
  234. entities["organizations"].extend(matches)
  235. entities["organizations"] = list(set(entities["organizations"]))
  236. # 日期模式
  237. date_patterns = [
  238. r'\b(\d{4})\b',
  239. r'\b(\d{1,2}/\d{1,2}/\d{4})\b',
  240. r'\b((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4})\b'
  241. ]
  242. for pattern in date_patterns:
  243. matches = re.findall(pattern, text)
  244. entities["dates"].extend(matches)
  245. entities["dates"] = list(set(entities["dates"]))
  246. # 数字模式
  247. number_pattern = r'\b(\d+(?:\.\d+)?)\b'
  248. number_matches = re.findall(number_pattern, text)
  249. entities["numbers"] = list(set(number_matches))
  250. return entities
  251. def calculate_text_similarity(self, text1: str, text2: str) -> float:
  252. """计算文本相似度(基于词汇重叠)"""
  253. if not text1 or not text2:
  254. return 0.0
  255. words1 = set(self.tokenize(text1))
  256. words2 = set(self.tokenize(text2))
  257. if not words1 or not words2:
  258. return 0.0
  259. intersection = words1.intersection(words2)
  260. union = words1.union(words2)
  261. return len(intersection) / len(union)
  262. async def process_batch(self, texts: List[str], operations: List[str]) -> List[Dict[str, Any]]:
  263. """批量处理文本"""
  264. results = []
  265. for text in texts:
  266. result = {"text": text}
  267. for operation in operations:
  268. if operation == "clean":
  269. result["cleaned"] = self.clean_text(text)
  270. elif operation == "tokenize":
  271. result["tokens"] = self.tokenize(text)
  272. elif operation == "sentences":
  273. result["sentences"] = self.extract_sentences(text)
  274. elif operation == "paragraphs":
  275. result["paragraphs"] = self.extract_paragraphs(text)
  276. elif operation == "readability":
  277. result["readability"] = self.calculate_readability(text)
  278. elif operation == "key_phrases":
  279. result["key_phrases"] = self.extract_key_phrases(text)
  280. elif operation == "language":
  281. result["language"] = self.detect_language(text)
  282. elif operation == "citations":
  283. result["citations"] = self.extract_citations(text)
  284. elif operation == "entities":
  285. result["entities"] = self.extract_entities(text)
  286. elif operation == "summary":
  287. result["summary"] = self.summarize_text(text)
  288. results.append(result)
  289. return results