| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402 |
- # processors/add_knowledge.py
- """知识添加处理器 - 使用 LLM 分析、分类并保存知识"""
import json
import logging
import os
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional

from hello_agents import HelloAgentsLLM

from core.file_manager import FileManager
from core.summary_manager import SummaryManager
class AddKnowledgeProcessor:
    """Analyze, classify and persist knowledge notes for a learning domain.

    ``add`` is the public entry point. It:

    - resolves the input (inline text, a local file path, or a URL placeholder),
    - asks the LLM for a category, tags, key concepts and a summary,
    - falls back to keyword rules when the LLM call or its parsing fails,
    - renders a Markdown note and stores it through ``file_manager``,
    - registers the new file through ``summary_manager``.
    """

    # Prefixes used to recognise URLs and local file paths in raw user input.
    _URL_PREFIXES = ("http://", "https://")
    _PATH_PREFIXES = ("~", "/", "./")

    # Characters trimmed from values scraped out of JSON-like text lines.
    _STRIP_CHARS = " \t\"'[]{},"

    _DEFAULT_CATEGORY = "通用"
    _DEFAULT_SUMMARY = "知识笔记"

    # Metadata field -> substrings that identify its label in an LLM reply.
    # Order matters: the first matching field wins for a given line.
    _FIELD_MARKERS = (
        ("category", ("分类", "category")),
        ("tags", ("标签", "tags")),
        ("key_concepts", ("概念", "concepts")),
        ("summary", ("摘要", "summary")),
    )

    # Keywords that become tags when they occur in the content.
    _TAG_KEYWORDS = (
        "算法",
        "数据结构",
        "机器学习",
        "深度学习",
        "Python",
        "JavaScript",
        "TypeScript",
        "Java",
        "框架",
        "库",
        "工具",
        "API",
        "前端",
        "后端",
        "全栈",
        "数据库",
        "理论",
        "实践",
        "教程",
        "示例",
    )

    # Category -> trigger words, checked in order; the first hit wins.
    _CATEGORY_RULES = (
        ("算法", ("算法", "algorithm", "方法", "method")),
        ("概念", ("概念", "concept", "原理", "principle")),
        ("工具", ("工具", "tool", "框架", "framework", "库", "library")),
        ("实践", ("实践", "practice", "案例", "case", "项目", "project")),
        ("教程", ("教程", "tutorial", "指南", "guide")),
    )

    def __init__(self, llm: "HelloAgentsLLM", file_manager: "FileManager"):
        """Store the collaborators and build the summary manager.

        Args:
            llm: LLM client; only its ``invoke(messages)`` method is used here.
            file_manager: Storage backend; ``save_knowledge`` and ``BASE_DIR``
                are used here.
        """
        self.llm = llm
        self.file_manager = file_manager
        self.summary_manager = SummaryManager(file_manager)

    def _identify_input_type(self, input_data: str) -> str:
        """Classify raw user input as ``"url"``, ``"file"`` or ``"text"``.

        Args:
            input_data: Raw user input.

        Returns:
            ``"url"`` for http(s) links, ``"file"`` for single-line inputs that
            start with ``~``, ``/`` or ``./``, otherwise ``"text"``.
        """
        if input_data.startswith(self._URL_PREFIXES):
            return "url"
        # A path never spans several lines; multi-line input that merely
        # starts with "/" or "~" is ordinary text, not a file reference.
        looks_like_path = input_data.startswith(self._PATH_PREFIXES)
        if looks_like_path and "\n" not in input_data.strip():
            return "file"
        return "text"

    def _read_file(self, file_path: str) -> str:
        """Read a UTF-8 text file, expanding a leading ``~``.

        Args:
            file_path: Path to the file; may start with ``~``.

        Returns:
            The decoded file content.

        Raises:
            OSError: If the file cannot be opened or read.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        expanded = os.path.expanduser(file_path)
        return Path(expanded).read_text(encoding="utf-8")

    def _analyze_content(self, content: str, domain: str) -> Dict[str, Any]:
        """Ask the LLM for metadata, falling back to rules on any failure.

        Args:
            content: Knowledge text; only the first 2000 characters are sent.
            domain: Domain name, included in the prompt and in the result.

        Returns:
            A dict with ``category``, ``tags``, ``key_concepts``, ``summary``
            and ``domain``.
        """
        user_prompt = (
            "请分析以下知识内容并提取关键信息:\n"
            "【领域】\n"
            f"{domain}\n"
            "【知识内容】\n"
            f"{content[:2000]}\n"
            "请提供以下信息(JSON格式):\n"
            "{\n"
            '"category": "分类(如:算法、概念、工具、实践等)",\n'
            '"tags": ["标签1", "标签2", "标签3"],\n'
            '"key_concepts": ["核心概念1", "核心概念2", "核心概念3"],\n'
            '"summary": "一句话摘要(50字以内)"\n'
            "}\n"
        )
        messages = [
            {
                "role": "system",
                "content": "你是一个知识管理专家,擅长分析学习内容并提取关键信息、分类和标签。",
            },
            {"role": "user", "content": user_prompt},
        ]
        try:
            response = self.llm.invoke(messages)
            return self._extract_metadata_from_text(response, domain)
        except Exception:
            # Best effort by design: an LLM outage or an unusable reply must
            # not block saving the note, but the failure should be visible.
            logging.getLogger(__name__).warning(
                "LLM analysis failed; falling back to rule-based analysis",
                exc_info=True,
            )
            return {
                "category": self._classify_content(content, domain),
                "tags": self._extract_tags_from_content(content),
                "key_concepts": self._extract_concepts_from_content(content),
                "summary": content[:100] + "..." if len(content) > 100 else content,
                "domain": domain,
            }

    @staticmethod
    def _parse_json_object(text: str) -> Optional[Dict[str, Any]]:
        """Return the outermost ``{...}`` in *text* decoded as a dict, or None."""
        start = text.find("{")
        end = text.rfind("}")
        if start == -1 or end <= start:
            return None
        try:
            decoded = json.loads(text[start : end + 1])
        except ValueError:  # json.JSONDecodeError is a ValueError
            return None
        return decoded if isinstance(decoded, dict) else None

    @classmethod
    def _parse_labelled_lines(cls, text: str) -> Dict[str, str]:
        """Collect ``label: value`` lines whose label names a metadata field."""
        found: Dict[str, str] = {}
        for raw_line in text.splitlines():
            # Accept the full-width colon (U+FF1A) as well as the ASCII one.
            label, sep, value = raw_line.replace("\uff1a", ":").partition(":")
            if not sep:
                continue
            label = label.lower()
            for field, markers in cls._FIELD_MARKERS:
                if any(marker in label for marker in markers):
                    found[field] = value.strip(cls._STRIP_CHARS)
                    break
        return found

    @classmethod
    def _as_str_list(cls, value: Any) -> List[str]:
        """Normalise a JSON list or a comma-separated string to clean strings."""
        if isinstance(value, str):
            # Split on both the ASCII and the full-width comma (U+FF0C).
            parts = value.replace("\uff0c", ",").split(",")
            items = [part.strip(cls._STRIP_CHARS) for part in parts]
        elif isinstance(value, (list, tuple)):
            items = [str(item).strip() for item in value]
        else:
            items = []
        return [item for item in items if item]

    def _extract_metadata_from_text(
        self, text: str, domain: str = ""
    ) -> Dict[str, Any]:
        """Extract note metadata from an LLM reply.

        The reply is parsed as JSON when it contains a JSON object with at
        least one expected key; otherwise ``label: value`` lines are scanned.

        Args:
            text: LLM response text.
            domain: Domain name copied into the result.

        Returns:
            A dict with ``category``, ``tags``, ``key_concepts``, ``summary``
            and ``domain``. Missing category/summary fall back to defaults.
        """
        fields = self._parse_json_object(text)
        if not fields or not any(name in fields for name, _ in self._FIELD_MARKERS):
            fields = self._parse_labelled_lines(text)

        category = str(fields.get("category") or "").strip()
        summary = str(fields.get("summary") or "").strip()
        return {
            "category": category or self._DEFAULT_CATEGORY,
            "tags": self._as_str_list(fields.get("tags")),
            "key_concepts": self._as_str_list(fields.get("key_concepts")),
            "summary": summary or self._DEFAULT_SUMMARY,
            "domain": domain,
        }

    def _extract_tags_from_content(self, content: str) -> List[str]:
        """Return up to five known keywords found in *content*.

        Matching is a case-insensitive substring test, in keyword-list order.

        Args:
            content: Note text.

        Returns:
            The matching keywords, at most five.
        """
        haystack = content.lower()
        matches = [kw for kw in self._TAG_KEYWORDS if kw.lower() in haystack]
        return matches[:5]

    def _extract_concepts_from_content(self, content: str) -> List[str]:
        """Return up to five short ``#``-prefixed headings as key concepts.

        Args:
            content: Note text.

        Returns:
            Heading texts (without the leading ``#`` run) that are non-empty
            and shorter than 50 characters, at most five.
        """
        concepts = []
        for raw_line in content.split("\n"):
            line = raw_line.strip()
            if not line.startswith("#"):
                continue
            heading = line.lstrip("#").strip()
            if heading and len(heading) < 50:
                concepts.append(heading)
        return concepts[:5]

    @staticmethod
    def _slugify(text: str) -> str:
        """Turn spaces into ``-`` and drop everything but alnum, ``-``, ``_``."""
        dashed = text.replace(" ", "-")
        return "".join(ch for ch in dashed if ch.isalnum() or ch in "-_")

    def _generate_filename(self, title: str, category: str = "") -> str:
        """Build a timestamped, filesystem-safe Markdown filename.

        Args:
            title: Note title; truncated to 50 characters before sanitising.
            category: Optional category inserted between timestamp and title.

        Returns:
            ``<YYYYmmdd-HHMM>[-<category>]-<title>.md``. The category is
            sanitised like the title so it cannot inject path separators;
            an empty sanitised title becomes ``untitled``.
        """
        safe_title = self._slugify(title[:50]) or "untitled"
        safe_category = self._slugify(category) if category else ""
        timestamp = datetime.now().strftime("%Y%m%d-%H%M")
        parts = [timestamp, safe_category, safe_title]
        return "-".join(part for part in parts if part) + ".md"

    def _save_knowledge(
        self, domain: str, content: str, metadata: Dict[str, Any]
    ) -> Path:
        """Render the note as Markdown and store it via the file manager.

        Args:
            domain: Domain name passed to ``file_manager.save_knowledge``.
            content: Note text; its first line (minus leading ``#``) is the title.
            metadata: Dict as produced by ``_analyze_content``.

        Returns:
            ``file_manager.BASE_DIR / domain / "knowledge" / filename``.
        """
        title = content.split("\n")[0].lstrip("#").strip()
        filename = self._generate_filename(title, metadata.get("category", ""))
        concept_lines = "\n".join(
            f"- {concept}" for concept in metadata.get("key_concepts", [])
        )
        full_content = "\n".join(
            [
                f"# {title}",
                f"> **分类**: {metadata.get('category', '通用')}",
                f"> **标签**: {', '.join(metadata.get('tags', []))}",
                f"> **添加时间**: {datetime.now().strftime('%Y-%m-%d %H:%M')}",
                "---",
                content,
                "## 关键概念",
                concept_lines,
                "## 摘要",
                f"{metadata.get('summary', '无')}",
                "",  # keeps the trailing newline of the rendered note
            ]
        )
        self.file_manager.save_knowledge(domain, filename, full_content)
        return self.file_manager.BASE_DIR / domain / "knowledge" / filename

    def _classify_content(self, content: str, domain: str) -> str:
        """Pick a category by case-insensitive trigger-word lookup.

        Args:
            content: Note text.
            domain: Domain name (currently not used by the rules).

        Returns:
            The first category whose trigger word occurs in *content*,
            or ``"通用"`` when none matches.
        """
        haystack = content.lower()
        for category, triggers in self._CATEGORY_RULES:
            if any(trigger in haystack for trigger in triggers):
                return category
        return self._DEFAULT_CATEGORY

    def add(
        self, domain: str, input_data: str, input_type: Optional[str] = None
    ) -> str:
        """Add one knowledge note to *domain*.

        Args:
            domain: Domain name.
            input_data: Inline text, a file path, or a URL.
            input_type: ``"text"``, ``"file"`` or ``"url"``; detected from
                *input_data* when omitted.

        Returns:
            A user-facing status message: a success summary, or a message
            starting with ``❌`` describing why nothing was saved.
        """
        # Refuse blank input instead of storing an empty, untitled note.
        if not input_data or not input_data.strip():
            return "❌ 输入内容为空"

        if not input_type:
            input_type = self._identify_input_type(input_data)

        if input_type == "text":
            content = input_data
        elif input_type == "file":
            try:
                content = self._read_file(input_data)
            except (OSError, ValueError) as e:  # ValueError covers decode errors
                return f"❌ 读取文件失败:{e}"
            if not content.strip():
                return f"❌ 文件内容为空:{input_data}"
        elif input_type == "url":
            # Fetching is not implemented: store a placeholder for manual editing.
            content = f"# URL 知识\n\n来源:{input_data}\n\n请手动添加内容..."
        else:
            return f"❌ 未知的输入类型:{input_type}"

        metadata = self._analyze_content(content, domain)

        try:
            file_path = self._save_knowledge(domain, content, metadata)
            self.summary_manager.update_knowledge_summary(domain, file_path.name)
        except Exception as e:
            # Boundary with project storage code: report instead of crashing.
            return f"❌ 添加知识失败:{e}"

        return (
            "✅ 知识已添加\n"
            f"📁 保存位置: {domain}/knowledge/{file_path.name}\n"
            f"📊 分类: {metadata.get('category', '通用')}\n"
            f"🏷️ 标签: {', '.join(metadata.get('tags', []))}\n"
        )
|