# processors/add_knowledge.py
"""Knowledge-adding processor: uses an LLM to analyze, classify, and save knowledge notes."""
import json
import logging
import os
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional

from hello_agents import HelloAgentsLLM

from core.file_manager import FileManager
from core.summary_manager import SummaryManager
  11. class AddKnowledgeProcessor:
  12. """
  13. 知识添加处理器
  14. 功能:
  15. - 识别输入类型(文本/文件/URL)
  16. - 使用 LLM 分析内容
  17. - 智能分类和打标签
  18. - 提取关键概念
  19. - 生成文件名
  20. - 保存到 knowledge 目录
  21. - 更新 knowledge_summary.md
  22. """
  23. def __init__(self, llm: HelloAgentsLLM, file_manager: FileManager):
  24. """
  25. 初始化 AddKnowledgeProcessor
  26. Args:
  27. llm: HelloAgentsLLM 实例
  28. file_manager: FileManager 实例
  29. """
  30. self.llm = llm
  31. self.file_manager = file_manager
  32. self.summary_manager = SummaryManager(file_manager)
  33. def _identify_input_type(self, input_data: str) -> str:
  34. """
  35. 识别输入类型
  36. Args:
  37. input_data: 用户输入
  38. Returns:
  39. 输入类型(text/file/url)
  40. """
  41. # 检查 URL
  42. if input_data.startswith("http://") or input_data.startswith("https://"):
  43. return "url"
  44. # 检查文件路径
  45. if (
  46. input_data.startswith("~")
  47. or input_data.startswith("/")
  48. or input_data.startswith("./")
  49. ):
  50. return "file"
  51. # 默认为文本
  52. return "text"
  53. def _read_file(self, file_path: str) -> str:
  54. """
  55. 读取文件内容
  56. Args:
  57. file_path: 文件路径
  58. Returns:
  59. 文件内容
  60. """
  61. # 处理 ~ 路径
  62. if file_path.startswith("~"):
  63. file_path = os.path.expanduser(file_path)
  64. with open(file_path, "r", encoding="utf-8") as f:
  65. return f.read()
  66. def _analyze_content(self, content: str, domain: str) -> Dict[str, any]:
  67. """
  68. 使用 LLM 分析内容
  69. Args:
  70. content: 知识内容
  71. domain: 领域名称
  72. Returns:
  73. 分析结果字典,包含:
  74. - category: 分类
  75. - tags: 标签列表
  76. - key_concepts: 关键概念列表
  77. - summary: 摘要
  78. """
  79. user_prompt = f"""请分析以下知识内容并提取关键信息:
  80. 【领域】
  81. {domain}
  82. 【知识内容】
  83. {content[:2000]}
  84. 请提供以下信息(JSON格式):
  85. {{
  86. "category": "分类(如:算法、概念、工具、实践等)",
  87. "tags": ["标签1", "标签2", "标签3"],
  88. "key_concepts": ["核心概念1", "核心概念2", "核心概念3"],
  89. "summary": "一句话摘要(50字以内)"
  90. }}
  91. """
  92. messages = [
  93. {
  94. "role": "system",
  95. "content": "你是一个知识管理专家,擅长分析学习内容并提取关键信息、分类和标签。",
  96. },
  97. {"role": "user", "content": user_prompt},
  98. ]
  99. try:
  100. response = self.llm.invoke(messages)
  101. # 尝试解析 JSON(简化实现:使用规则提取)
  102. return self._extract_metadata_from_text(response)
  103. except Exception:
  104. # 降级:使用规则分析
  105. return {
  106. "category": self._classify_content(content, domain),
  107. "tags": self._extract_tags_from_content(content),
  108. "key_concepts": self._extract_concepts_from_content(content),
  109. "summary": content[:100] + "..." if len(content) > 100 else content,
  110. "domain": domain, # 添加 domain 字段
  111. }
  112. def _extract_metadata_from_text(self, text: str) -> Dict[str, any]:
  113. """
  114. 从文本中提取元数据(简化版)
  115. Args:
  116. text: LLM 响应文本
  117. Returns:
  118. 元数据字典
  119. """
  120. # 简化实现:基于规则提取
  121. lines = text.strip().split("\n")
  122. category = "通用"
  123. tags = []
  124. key_concepts = []
  125. summary = ""
  126. for line in lines:
  127. line = line.strip()
  128. if "分类" in line or "category" in line.lower():
  129. category = line.split(":")[-1].split(":")[-1].strip()
  130. elif "标签" in line or "tags" in line.lower():
  131. tags = [
  132. tag.strip(" \"'[]{}")
  133. for tag in line.split(":")[-1].split(":")[-1].split(",")
  134. ]
  135. elif "概念" in line or "concepts" in line.lower():
  136. key_concepts = [
  137. c.strip(" \"'[]{}")
  138. for c in line.split(":")[-1].split(":")[-1].split(",")
  139. ]
  140. elif "摘要" in line or "summary" in line.lower():
  141. summary = line.split(":")[-1].split(":")[-1].strip()
  142. return {
  143. "category": category if category else "通用",
  144. "tags": [t for t in tags if t],
  145. "key_concepts": [c for c in key_concepts if c],
  146. "summary": summary if summary else "知识笔记",
  147. "domain": domain, # 添加 domain 字段
  148. }
  149. def _extract_tags_from_content(self, content: str) -> List[str]:
  150. """
  151. 从内容中提取标签(基于关键词)
  152. Args:
  153. content: 内容文本
  154. Returns:
  155. 标签列表
  156. """
  157. # 常见技术关键词
  158. keywords = [
  159. "算法",
  160. "数据结构",
  161. "机器学习",
  162. "深度学习",
  163. "Python",
  164. "JavaScript",
  165. "TypeScript",
  166. "Java",
  167. "框架",
  168. "库",
  169. "工具",
  170. "API",
  171. "前端",
  172. "后端",
  173. "全栈",
  174. "数据库",
  175. "理论",
  176. "实践",
  177. "教程",
  178. "示例",
  179. ]
  180. found = []
  181. content_lower = content.lower()
  182. for keyword in keywords:
  183. if keyword.lower() in content_lower:
  184. found.append(keyword)
  185. return found[:5] # 最多5个标签
  186. def _extract_concepts_from_content(self, content: str) -> List[str]:
  187. """
  188. 从内容中提取关键概念
  189. Args:
  190. content: 内容文本
  191. Returns:
  192. 关键概念列表
  193. """
  194. # 提取以 # 开头的标题作为概念
  195. concepts = []
  196. for line in content.split("\n"):
  197. line = line.strip()
  198. if line.startswith("#"):
  199. # 去掉 # 符号和空格
  200. concept = line.lstrip("#").strip()
  201. if concept and len(concept) < 50: # 限制长度
  202. concepts.append(concept)
  203. return concepts[:5] # 最多5个概念
  204. def _generate_filename(self, title: str, category: str = "") -> str:
  205. """
  206. 生成文件名
  207. Args:
  208. title: 标题
  209. category: 分类(可选)
  210. Returns:
  211. 文件名(带扩展名)
  212. """
  213. # 提取第一句话作为文件名
  214. if len(title) > 50:
  215. title = title[:50]
  216. # 清理特殊字符
  217. title = title.replace(" ", "-")
  218. title = "".join(c for c in title if c.isalnum() or c in "-_")
  219. # 添加时间戳
  220. timestamp = datetime.now().strftime("%Y%m%d-%H%M")
  221. if category:
  222. base_name = f"{timestamp}-{category}-{title}"
  223. else:
  224. base_name = f"{timestamp}-{title}"
  225. return f"{base_name}.md" # 添加 .md 扩展名
  226. def _save_knowledge(
  227. self, domain: str, content: str, metadata: Dict[str, any]
  228. ) -> Path:
  229. """
  230. 保存知识笔记
  231. Args:
  232. domain: 领域名称
  233. content: 知识内容
  234. metadata: 元数据
  235. Returns:
  236. 保存的文件路径
  237. """
  238. # 生成文件名(_generate_filename 已包含 .md 扩展名)
  239. title = content.split("\n")[0].lstrip("#").strip()
  240. filename = self._generate_filename(title, metadata.get("category", ""))
  241. # 添加元数据到内容
  242. full_content = f"""# {title}
  243. > **分类**: {metadata.get('category', '通用')}
  244. > **标签**: {', '.join(metadata.get('tags', []))}
  245. > **添加时间**: {datetime.now().strftime('%Y-%m-%d %H:%M')}
  246. ---
  247. {content}
  248. ## 关键概念
  249. {chr(10).join(f"- {c}" for c in metadata.get('key_concepts', []))}
  250. ## 摘要
  251. {metadata.get('summary', '无')}
  252. """
  253. # 保存文件
  254. self.file_manager.save_knowledge(domain, filename, full_content)
  255. # 返回完整路径
  256. return self.file_manager.BASE_DIR / domain / "knowledge" / filename
  257. def _classify_content(self, content: str, domain: str) -> str:
  258. """
  259. 分类内容
  260. Args:
  261. content: 内容
  262. domain: 领域
  263. Returns:
  264. 分类名称
  265. """
  266. # 基于规则的简单分类
  267. content_lower = content.lower()
  268. if any(
  269. word in content_lower for word in ["算法", "algorithm", "方法", "method"]
  270. ):
  271. return "算法"
  272. elif any(
  273. word in content_lower for word in ["概念", "concept", "原理", "principle"]
  274. ):
  275. return "概念"
  276. elif any(
  277. word in content_lower
  278. for word in ["工具", "tool", "框架", "framework", "库", "library"]
  279. ):
  280. return "工具"
  281. elif any(
  282. word in content_lower
  283. for word in ["实践", "practice", "案例", "case", "项目", "project"]
  284. ):
  285. return "实践"
  286. elif any(
  287. word in content_lower for word in ["教程", "tutorial", "指南", "guide"]
  288. ):
  289. return "教程"
  290. else:
  291. return "通用"
  292. def add(self, domain: str, input_data: str, input_type: str = None) -> str:
  293. """
  294. 添加知识
  295. Args:
  296. domain: 领域名称
  297. input_data: 输入数据(文本/文件路径/URL)
  298. input_type: 输入类型(可选,自动识别)
  299. Returns:
  300. 执行结果
  301. """
  302. # 识别输入类型
  303. if not input_type:
  304. input_type = self._identify_input_type(input_data)
  305. # 获取内容
  306. if input_type == "text":
  307. content = input_data
  308. elif input_type == "file":
  309. try:
  310. content = self._read_file(input_data)
  311. except Exception as e:
  312. return f"❌ 读取文件失败:{e}"
  313. elif input_type == "url":
  314. # 简化实现:提示用户复制内容
  315. content = f"# URL 知识\n\n来源:{input_data}\n\n请手动添加内容..."
  316. else:
  317. return f"❌ 未知的输入类型:{input_type}"
  318. # 分析内容
  319. metadata = self._analyze_content(content, domain)
  320. # 保存知识
  321. try:
  322. file_path = self._save_knowledge(domain, content, metadata)
  323. # 更新摘要
  324. self.summary_manager.update_knowledge_summary(domain, file_path.name)
  325. return f"""✅ 知识已添加
  326. 📁 保存位置: {domain}/knowledge/{file_path.name}
  327. 📊 分类: {metadata.get('category', '通用')}
  328. 🏷️ 标签: {', '.join(metadata.get('tags', []))}
  329. """
  330. except Exception as e:
  331. return f"❌ 添加知识失败:{e}"