# Source: moki/hello-agents
# Mirrored from https://github.com/datawhalechina/hello-agents.git
# processors/add_knowledge.py
"""知识添加处理器 - 使用 LLM 分析、分类并保存知识"""

import json
import os
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional
from hello_agents import HelloAgentsLLM
from core.file_manager import FileManager
from core.summary_manager import SummaryManager


class AddKnowledgeProcessor:
    """
    Processor that adds a piece of knowledge to a domain.

    Responsibilities:
    - Identify the input type (text / file / URL)
    - Analyze the content with an LLM
    - Categorize and tag the content
    - Extract key concepts
    - Generate a file name
    - Save the note into the knowledge directory
    - Update knowledge_summary.md
    """

    def __init__(self, llm: HelloAgentsLLM, file_manager: FileManager):
        """
        Store the collaborators used by the processor.

        Args:
            llm: HelloAgentsLLM instance used to analyze content.
            file_manager: FileManager instance used to persist notes; it is
                also handed to the SummaryManager created here.
        """
        # The three attributes are independent of each other.
        self.summary_manager = SummaryManager(file_manager)
        self.file_manager = file_manager
        self.llm = llm

    def _identify_input_type(self, input_data: str) -> str:
        """
        识别输入类型

        Args:
            input_data: 用户输入

        Returns:
            输入类型（text/file/url）
        """
        # 检查 URL
        if input_data.startswith("http://") or input_data.startswith("https://"):
            return "url"

        # 检查文件路径
        if (
            input_data.startswith("~")
            or input_data.startswith("/")
            or input_data.startswith("./")
        ):
            return "file"

        # 默认为文本
        return "text"

    def _read_file(self, file_path: str) -> str:
        """
        读取文件内容

        Args:
            file_path: 文件路径

        Returns:
            文件内容
        """
        # 处理 ~ 路径
        if file_path.startswith("~"):
            file_path = os.path.expanduser(file_path)

        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()

    def _analyze_content(self, content: str, domain: str) -> Dict[str, any]:
        """
        使用 LLM 分析内容

        Args:
            content: 知识内容
            domain: 领域名称

        Returns:
            分析结果字典，包含：
            - category: 分类
            - tags: 标签列表
            - key_concepts: 关键概念列表
            - summary: 摘要
        """
        user_prompt = f"""请分析以下知识内容并提取关键信息：

【领域】
{domain}

【知识内容】
{content[:2000]}

请提供以下信息（JSON格式）：
{{
  "category": "分类（如：算法、概念、工具、实践等）",
  "tags": ["标签1", "标签2", "标签3"],
  "key_concepts": ["核心概念1", "核心概念2", "核心概念3"],
  "summary": "一句话摘要（50字以内）"
}}
"""

        messages = [
            {
                "role": "system",
                "content": "你是一个知识管理专家，擅长分析学习内容并提取关键信息、分类和标签。",
            },
            {"role": "user", "content": user_prompt},
        ]

        try:
            response = self.llm.invoke(messages)

            # 尝试解析 JSON（简化实现：使用规则提取）
            return self._extract_metadata_from_text(response)
        except Exception:
            # 降级：使用规则分析
            return {
                "category": self._classify_content(content, domain),
                "tags": self._extract_tags_from_content(content),
                "key_concepts": self._extract_concepts_from_content(content),
                "summary": content[:100] + "..." if len(content) > 100 else content,
                "domain": domain,  # 添加 domain 字段
            }

    def _extract_metadata_from_text(self, text: str) -> Dict[str, any]:
        """
        从文本中提取元数据（简化版）

        Args:
            text: LLM 响应文本

        Returns:
            元数据字典
        """
        # 简化实现：基于规则提取
        lines = text.strip().split("\n")

        category = "通用"
        tags = []
        key_concepts = []
        summary = ""

        for line in lines:
            line = line.strip()
            if "分类" in line or "category" in line.lower():
                category = line.split("：")[-1].split(":")[-1].strip()
            elif "标签" in line or "tags" in line.lower():
                tags = [
                    tag.strip(" \"'[]{}")
                    for tag in line.split("：")[-1].split(":")[-1].split(",")
                ]
            elif "概念" in line or "concepts" in line.lower():
                key_concepts = [
                    c.strip(" \"'[]{}")
                    for c in line.split("：")[-1].split(":")[-1].split(",")
                ]
            elif "摘要" in line or "summary" in line.lower():
                summary = line.split("：")[-1].split(":")[-1].strip()

        return {
            "category": category if category else "通用",
            "tags": [t for t in tags if t],
            "key_concepts": [c for c in key_concepts if c],
            "summary": summary if summary else "知识笔记",
            "domain": domain,  # 添加 domain 字段
        }

    def _extract_tags_from_content(self, content: str) -> List[str]:
        """
        从内容中提取标签（基于关键词）

        Args:
            content: 内容文本

        Returns:
            标签列表
        """
        # 常见技术关键词
        keywords = [
            "算法",
            "数据结构",
            "机器学习",
            "深度学习",
            "Python",
            "JavaScript",
            "TypeScript",
            "Java",
            "框架",
            "库",
            "工具",
            "API",
            "前端",
            "后端",
            "全栈",
            "数据库",
            "理论",
            "实践",
            "教程",
            "示例",
        ]

        found = []
        content_lower = content.lower()
        for keyword in keywords:
            if keyword.lower() in content_lower:
                found.append(keyword)

        return found[:5]  # 最多5个标签

    def _extract_concepts_from_content(self, content: str) -> List[str]:
        """
        从内容中提取关键概念

        Args:
            content: 内容文本

        Returns:
            关键概念列表
        """
        # 提取以 # 开头的标题作为概念
        concepts = []
        for line in content.split("\n"):
            line = line.strip()
            if line.startswith("#"):
                # 去掉 # 符号和空格
                concept = line.lstrip("#").strip()
                if concept and len(concept) < 50:  # 限制长度
                    concepts.append(concept)

        return concepts[:5]  # 最多5个概念

    def _generate_filename(self, title: str, category: str = "") -> str:
        """
        生成文件名

        Args:
            title: 标题
            category: 分类（可选）

        Returns:
            文件名（带扩展名）
        """
        # 提取第一句话作为文件名
        if len(title) > 50:
            title = title[:50]

        # 清理特殊字符
        title = title.replace(" ", "-")
        title = "".join(c for c in title if c.isalnum() or c in "-_")

        # 添加时间戳
        timestamp = datetime.now().strftime("%Y%m%d-%H%M")

        if category:
            base_name = f"{timestamp}-{category}-{title}"
        else:
            base_name = f"{timestamp}-{title}"

        return f"{base_name}.md"  # 添加 .md 扩展名

    def _save_knowledge(
        self, domain: str, content: str, metadata: Dict[str, any]
    ) -> Path:
        """
        保存知识笔记

        Args:
            domain: 领域名称
            content: 知识内容
            metadata: 元数据

        Returns:
            保存的文件路径
        """
        # 生成文件名（_generate_filename 已包含 .md 扩展名）
        title = content.split("\n")[0].lstrip("#").strip()
        filename = self._generate_filename(title, metadata.get("category", ""))

        # 添加元数据到内容
        full_content = f"""# {title}

> **分类**: {metadata.get('category', '通用')}
> **标签**: {', '.join(metadata.get('tags', []))}
> **添加时间**: {datetime.now().strftime('%Y-%m-%d %H:%M')}

---

{content}

## 关键概念
{chr(10).join(f"- {c}" for c in metadata.get('key_concepts', []))}

## 摘要
{metadata.get('summary', '无')}
"""

        # 保存文件
        self.file_manager.save_knowledge(domain, filename, full_content)

        # 返回完整路径
        return self.file_manager.BASE_DIR / domain / "knowledge" / filename

    def _classify_content(self, content: str, domain: str) -> str:
        """
        分类内容

        Args:
            content: 内容
            domain: 领域

        Returns:
            分类名称
        """
        # 基于规则的简单分类
        content_lower = content.lower()

        if any(
            word in content_lower for word in ["算法", "algorithm", "方法", "method"]
        ):
            return "算法"
        elif any(
            word in content_lower for word in ["概念", "concept", "原理", "principle"]
        ):
            return "概念"
        elif any(
            word in content_lower
            for word in ["工具", "tool", "框架", "framework", "库", "library"]
        ):
            return "工具"
        elif any(
            word in content_lower
            for word in ["实践", "practice", "案例", "case", "项目", "project"]
        ):
            return "实践"
        elif any(
            word in content_lower for word in ["教程", "tutorial", "指南", "guide"]
        ):
            return "教程"
        else:
            return "通用"

    def add(self, domain: str, input_data: str, input_type: str = None) -> str:
        """
        添加知识

        Args:
            domain: 领域名称
            input_data: 输入数据（文本/文件路径/URL）
            input_type: 输入类型（可选，自动识别）

        Returns:
            执行结果
        """
        # 识别输入类型
        if not input_type:
            input_type = self._identify_input_type(input_data)

        # 获取内容
        if input_type == "text":
            content = input_data
        elif input_type == "file":
            try:
                content = self._read_file(input_data)
            except Exception as e:
                return f"❌ 读取文件失败：{e}"
        elif input_type == "url":
            # 简化实现：提示用户复制内容
            content = f"# URL 知识\n\n来源：{input_data}\n\n请手动添加内容..."
        else:
            return f"❌ 未知的输入类型：{input_type}"

        # 分析内容
        metadata = self._analyze_content(content, domain)

        # 保存知识
        try:
            file_path = self._save_knowledge(domain, content, metadata)

            # 更新摘要
            self.summary_manager.update_knowledge_summary(domain, file_path.name)

            return f"""✅ 知识已添加

📁 保存位置: {domain}/knowledge/{file_path.name}
📊 分类: {metadata.get('category', '通用')}
🏷️  标签: {', '.join(metadata.get('tags', []))}
"""

        except Exception as e:
            return f"❌ 添加知识失败：{e}"