moki
/
hello-agents
огледало од https://github.com/datawhalechina/hello-agents.git


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371
							"""
InnoCore AI text processing utilities
"""

import re
from typing import List, Dict, Optional, Any, Tuple
import string
from collections import Counter
import asyncio

class TextProcessor:
    """Text processor: cleaning, tokenization and simple rule-based extraction helpers."""
    
    def __init__(self):
        self.stop_words = self._load_stop_words()
        self.punctuation = string.punctuation
    
    def _load_stop_words(self) -> set:
        """加载停用词"""
        # 简化的停用词列表
        return {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of',
            'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has',
            'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may',
            'might', 'must', 'can', 'this', 'that', 'these', 'those', 'i', 'you',
            'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them',
            'my', 'your', 'his', 'her', 'its', 'our', 'their', 'mine', 'yours',
            'hers', 'ours', 'theirs', 'what', 'which', 'who', 'whom', 'whose',
            'where', 'when', 'why', 'how', 'all', 'each', 'every', 'both', 'few',
            'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only',
            'own', 'same', 'so', 'than', 'too', 'very', 'just', 'now', 'also'
        }
    
    def clean_text(self, text: str) -> str:
        """清理文本"""
        if not text:
            return ""
        
        # 移除多余的空白字符
        text = re.sub(r'\s+', ' ', text)
        
        # 移除特殊字符（保留基本标点）
        text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\{\}\"\'\/\\]', ' ', text)
        
        # 移除多余的空格
        text = re.sub(r'\s+', ' ', text).strip()
        
        return text
    
    def tokenize(self, text: str) -> List[str]:
        """分词"""
        if not text:
            return []
        
        # 转换为小写并分词
        words = text.lower().split()
        
        # 移除标点符号
        words = [word.strip(self.punctuation) for word in words]
        
        # 过滤空字符串
        words = [word for word in words if word]
        
        return words
    
    def remove_stop_words(self, words: List[str]) -> List[str]:
        """移除停用词"""
        return [word for word in words if word not in self.stop_words]
    
    def extract_sentences(self, text: str) -> List[str]:
        """提取句子"""
        if not text:
            return []
        
        # 使用正则表达式分割句子
        sentences = re.split(r'[.!?]+', text)
        
        # 清理和过滤
        sentences = [s.strip() for s in sentences if s.strip()]
        
        return sentences
    
    def extract_paragraphs(self, text: str) -> List[str]:
        """提取段落"""
        if not text:
            return []
        
        # 按双换行分割段落
        paragraphs = re.split(r'\n\s*\n', text)
        
        # 清理和过滤
        paragraphs = [p.strip() for p in paragraphs if p.strip()]
        
        return paragraphs
    
    def calculate_readability(self, text: str) -> Dict[str, float]:
        """计算文本可读性指标"""
        if not text:
            return {"flesch_score": 0.0, "avg_sentence_length": 0.0, "avg_word_length": 0.0}
        
        sentences = self.extract_sentences(text)
        words = self.tokenize(text)
        
        if not sentences or not words:
            return {"flesch_score": 0.0, "avg_sentence_length": 0.0, "avg_word_length": 0.0}
        
        # 平均句子长度
        avg_sentence_length = len(words) / len(sentences)
        
        # 平均词长
        avg_word_length = sum(len(word) for word in words) / len(words)
        
        # 简化的Flesch Reading Ease分数
        flesch_score = 206.835 - (1.015 * avg_sentence_length) - (84.6 * avg_word_length)
        
        return {
            "flesch_score": max(0, min(100, flesch_score)),
            "avg_sentence_length": avg_sentence_length,
            "avg_word_length": avg_word_length
        }
    
    def extract_key_phrases(self, text: str, max_phrases: int = 10) -> List[str]:
        """提取关键短语"""
        if not text:
            return []
        
        # 简化的关键短语提取
        words = self.tokenize(text)
        words = self.remove_stop_words(words)
        
        # 寻找常见的学术短语模式
        phrase_patterns = [
            r'\b\w+\s+\w+\b',  # 两词短语
            r'\b\w+\s+\w+\s+\w+\b',  # 三词短语
        ]
        
        phrases = []
        for pattern in phrase_patterns:
            matches = re.findall(pattern, text.lower())
            phrases.extend(matches)
        
        # 计算短语频率
        phrase_freq = Counter(phrases)
        
        # 过滤和排序
        filtered_phrases = [
            phrase for phrase, freq in phrase_freq.items()
            if freq > 1 and len(phrase.split()) >= 2
        ]
        
        filtered_phrases.sort(key=lambda x: phrase_freq[x], reverse=True)
        
        return filtered_phrases[:max_phrases]
    
    def detect_language(self, text: str) -> str:
        """检测语言（简化实现）"""
        if not text:
            return "unknown"
        
        # 简单的语言检测基于常见词汇
        chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
        english_chars = len(re.findall(r'[a-zA-Z]', text))
        
        total_chars = chinese_chars + english_chars
        
        if total_chars == 0:
            return "unknown"
        
        chinese_ratio = chinese_chars / total_chars
        
        if chinese_ratio > 0.3:
            return "chinese"
        elif english_chars > 0:
            return "english"
        else:
            return "unknown"
    
    def extract_citations(self, text: str) -> List[Dict[str, Any]]:
        """提取引用"""
        citations = []
        
        # 数字引用模式 [1], [2-3]
        numeric_pattern = r'\[(\d+(?:-\d+)?)\]'
        numeric_matches = re.finditer(numeric_pattern, text)
        for match in numeric_matches:
            citations.append({
                "type": "numeric",
                "text": match.group(0),
                "reference": match.group(1),
                "position": match.start()
            })
        
        # 作者年份引用 (Smith, 2020)
        author_year_pattern = r'\(([A-Za-z]+(?:\s+et\s+al\.)?,\s*\d{4})\)'
        author_year_matches = re.finditer(author_year_pattern, text)
        for match in author_year_matches:
            citations.append({
                "type": "author_year",
                "text": match.group(0),
                "reference": match.group(1),
                "position": match.start()
            })
        
        return citations
    
    def extract_numbers_and_units(self, text: str) -> List[Dict[str, Any]]:
        """提取数字和单位"""
        patterns = [
            r'(\d+(?:\.\d+)?)\s*([a-zA-Z%]+)',  # 数字 + 单位
            r'(\d+(?:,\d{3})*(?:\.\d+)?)',  # 带逗号的数字
        ]
        
        results = []
        for pattern in patterns:
            matches = re.finditer(pattern, text)
            for match in matches:
                results.append({
                    "text": match.group(0),
                    "number": match.group(1),
                    "unit": match.group(2) if len(match.groups()) > 1 else "",
                    "position": match.start()
                })
        
        return results
    
    def extract_acronyms(self, text: str) -> Dict[str, str]:
        """提取缩写词"""
        acronyms = {}
        
        # 查找全称(缩写)模式
        acronym_pattern = r'([A-Za-z\s]+)\s*\(([A-Z]{2,})\)'
        matches = re.finditer(acronym_pattern, text)
        
        for match in matches:
            full_name = match.group(1).strip()
            acronym = match.group(2)
            
            # 验证缩写是否来自全称的首字母
            initials = ''.join([word[0].upper() for word in full_name.split() if word])
            
            if acronym.startswith(initials):
                acronyms[acronym] = full_name
        
        return acronyms
    
    def summarize_text(self, text: str, max_sentences: int = 3) -> str:
        """文本摘要（简化实现）"""
        if not text:
            return ""
        
        sentences = self.extract_sentences(text)
        
        if len(sentences) <= max_sentences:
            return " ".join(sentences)
        
        # 简单的摘要算法：选择包含关键词最多的句子
        words = self.tokenize(text)
        words = self.remove_stop_words(words)
        word_freq = Counter(words)
        
        sentence_scores = []
        for sentence in sentences:
            sentence_words = self.tokenize(sentence)
            sentence_words = self.remove_stop_words(sentence_words)
            
            score = sum(word_freq.get(word, 0) for word in sentence_words)
            sentence_scores.append((sentence, score))
        
        # 选择得分最高的句子
        sentence_scores.sort(key=lambda x: x[1], reverse=True)
        top_sentences = [sentence for sentence, score in sentence_scores[:max_sentences]]
        
        # 按原文顺序排列
        summary_sentences = []
        for sentence in sentences:
            if sentence in top_sentences:
                summary_sentences.append(sentence)
        
        return " ".join(summary_sentences)
    
    def extract_entities(self, text: str) -> Dict[str, List[str]]:
        """实体提取（简化实现）"""
        entities = {
            "persons": [],
            "organizations": [],
            "locations": [],
            "dates": [],
            "numbers": []
        }
        
        # 人名模式（简化）
        person_pattern = r'\b([A-Z][a-z]+\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b'
        person_matches = re.findall(person_pattern, text)
        entities["persons"] = list(set(person_matches))
        
        # 组织模式（简化）
        org_patterns = [
            r'\b([A-Z][a-z]+\s+(?:University|Institute|Laboratory|Company|Corp|Inc|Ltd))\b',
            r'\b((?:[A-Z]+\s*){2,})\b'
        ]
        for pattern in org_patterns:
            matches = re.findall(pattern, text)
            entities["organizations"].extend(matches)
        entities["organizations"] = list(set(entities["organizations"]))
        
        # 日期模式
        date_patterns = [
            r'\b(\d{4})\b',
            r'\b(\d{1,2}/\d{1,2}/\d{4})\b',
            r'\b((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4})\b'
        ]
        for pattern in date_patterns:
            matches = re.findall(pattern, text)
            entities["dates"].extend(matches)
        entities["dates"] = list(set(entities["dates"]))
        
        # 数字模式
        number_pattern = r'\b(\d+(?:\.\d+)?)\b'
        number_matches = re.findall(number_pattern, text)
        entities["numbers"] = list(set(number_matches))
        
        return entities
    
    def calculate_text_similarity(self, text1: str, text2: str) -> float:
        """计算文本相似度（基于词汇重叠）"""
        if not text1 or not text2:
            return 0.0
        
        words1 = set(self.tokenize(text1))
        words2 = set(self.tokenize(text2))
        
        if not words1 or not words2:
            return 0.0
        
        intersection = words1.intersection(words2)
        union = words1.union(words2)
        
        return len(intersection) / len(union)
    
    async def process_batch(self, texts: List[str], operations: List[str]) -> List[Dict[str, Any]]:
        """批量处理文本"""
        results = []
        
        for text in texts:
            result = {"text": text}
            
            for operation in operations:
                if operation == "clean":
                    result["cleaned"] = self.clean_text(text)
                elif operation == "tokenize":
                    result["tokens"] = self.tokenize(text)
                elif operation == "sentences":
                    result["sentences"] = self.extract_sentences(text)
                elif operation == "paragraphs":
                    result["paragraphs"] = self.extract_paragraphs(text)
                elif operation == "readability":
                    result["readability"] = self.calculate_readability(text)
                elif operation == "key_phrases":
                    result["key_phrases"] = self.extract_key_phrases(text)
                elif operation == "language":
                    result["language"] = self.detect_language(text)
                elif operation == "citations":
                    result["citations"] = self.extract_citations(text)
                elif operation == "entities":
                    result["entities"] = self.extract_entities(text)
                elif operation == "summary":
                    result["summary"] = self.summarize_text(text)
            
            results.append(result)
        
        return results