extract_dimensions.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382
  1. """
  2. 维度提取模块 - 使用LLM从报告中提取维度
  3. """
  4. import sys
  5. import json
  6. import os
  7. from pathlib import Path
  8. from datetime import datetime
  9. from typing import List, Dict, Optional
  10. # 设置控制台编码为UTF-8(Windows)
  11. # 注意:只在作为主脚本运行时重定向,避免在被导入时冲突
  12. if sys.platform == 'win32' and __name__ == "__main__":
  13. import io
  14. if not isinstance(sys.stdout, io.TextIOWrapper):
  15. sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
  16. if not isinstance(sys.stderr, io.TextIOWrapper):
  17. sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
  18. # 加载 .env 文件(如果存在)
  19. try:
  20. from dotenv import load_dotenv
  21. load_dotenv()
  22. except ImportError:
  23. pass
  24. # 导入LLM
  25. try:
  26. from hello_agents.core.llm import HelloAgentsLLM
  27. LLM_AVAILABLE = True
  28. except ImportError:
  29. LLM_AVAILABLE = False
  30. print("⚠️ 警告: hello_agents 模块未安装,无法使用LLM提取维度")
  31. def init_llm():
  32. """初始化LLM"""
  33. if not LLM_AVAILABLE:
  34. return None
  35. # 从环境变量读取LLM配置
  36. llm_model = (
  37. os.getenv("LLM_MODEL") or
  38. os.getenv("LLM_MODEL_ID") or
  39. "qwen-plus"
  40. )
  41. llm_api_key = (
  42. os.getenv("LLM_API_KEY") or
  43. os.getenv("MODELSCOPE_API_KEY") or
  44. os.getenv("MODELSCOPE_API_TOKEN")
  45. )
  46. llm_base_url = (
  47. os.getenv("LLM_BASE_URL") or
  48. "https://api-inference.modelscope.cn/v1/"
  49. )
  50. llm_provider = os.getenv("LLM_PROVIDER", "modelscope")
  51. if not llm_api_key:
  52. print("⚠️ 警告: 未找到LLM API Key")
  53. return None
  54. try:
  55. llm = HelloAgentsLLM(
  56. model=llm_model,
  57. api_key=llm_api_key,
  58. base_url=llm_base_url,
  59. provider=llm_provider
  60. )
  61. return llm
  62. except Exception as e:
  63. print(f"⚠️ 初始化LLM失败: {e}")
  64. return None
  65. def extract_json_from_text(text: str) -> Optional[Dict]:
  66. """从文本中提取JSON内容"""
  67. import re
  68. # 尝试直接解析
  69. try:
  70. return json.loads(text.strip())
  71. except json.JSONDecodeError:
  72. pass
  73. # 尝试提取JSON代码块
  74. json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
  75. if json_match:
  76. try:
  77. return json.loads(json_match.group(1))
  78. except json.JSONDecodeError:
  79. pass
  80. # 尝试提取第一个完整的JSON对象
  81. json_match = re.search(r'\{.*\}', text, re.DOTALL)
  82. if json_match:
  83. try:
  84. return json.loads(json_match.group(0))
  85. except json.JSONDecodeError:
  86. pass
  87. return None
  88. def extract_dimensions_from_text(text: str, llm, existing_themes: List[str] = None) -> Dict:
  89. """从报告文本中提取维度
  90. Args:
  91. text: 报告文本内容
  92. llm: LLM实例
  93. existing_themes: 现有的themes列表,用于参考抽象级别
  94. """
  95. if not llm:
  96. return {"dimensions": [], "confidence": 0.0, "error": "LLM未初始化"}
  97. themes_hint = ""
  98. if existing_themes:
  99. themes_hint = f"\n参考现有themes的风格(这些是用户已经定义的兴趣主题):{existing_themes}\n提取的维度应该与这些themes在抽象级别上保持一致。"
  100. prompt = f"""请从以下用户报告中提取3-8个维度(dimensions)。维度应该是用户关注的**高级别的主题、领域或兴趣点**,而不是简单的名词拆分。
  101. 报告内容:
  102. {text}
  103. {themes_hint}
  104. **提取原则**:
  105. 1. **保持概念完整性**:如果报告中提到"信息信号系统"这样的完整概念,应该提取为"信息信号系统"或"系统",而不要拆成"信息"、"信号"、"系统"三个词
  106. 2. **提取主题级别**:维度应该是主题级别的概念(如"AI"、"健康"、"工作"),而不是具体细节(如"更新"、"今天"、"高兴")
  107. 3. **过滤无关词**:
  108. - 过滤掉动作词(如:更新、创建、删除)
  109. - 过滤掉时间词(如:今天、昨天、本周)
  110. - 过滤掉情绪词(如:高兴、难过),除非情绪本身是报告的主题
  111. - 过滤掉过于通用的词(如:事情、内容、问题)
  112. 4. **理解语义上下文**:理解整个句子的含义,提取其背后关注的主题
  113. 5. **抽象层次**:维度应该是足够抽象的主题,可以作为YouTube搜索关键词或兴趣标签使用
  114. **示例**:
  115. - 报告:"今天很高兴,我们的信息信号系统再次迎来了更新"
  116. - ❌ 错误提取:["信息", "信号", "系统", "更新", "今天"]
  117. - ✅ 正确提取:["信息信号系统"] 或 ["系统"] 或 ["技术系统"]
  118. 请以JSON格式返回维度列表:
  119. {{
  120. "dimensions": ["维度1", "维度2", "维度3"],
  121. "confidence": 0.85,
  122. "reasoning": "简要说明提取理由"
  123. }}
  124. 要求:
  125. - 维度数量:3-8个(根据报告内容的重要性决定)
  126. - 维度格式:简洁的主题词(2-8个字),保持概念的完整性
  127. - confidence:提取的置信度(0-1之间)
  128. - reasoning:简要说明为什么提取这些维度
  129. 请直接返回JSON,不要包含其他文字。"""
  130. try:
  131. messages = [
  132. {"role": "system", "content": "你是一个专业的文本分析助手,擅长从文本中提取高级别的主题和兴趣维度。你会理解语义上下文,保持概念的完整性,不会简单地进行分词。"},
  133. {"role": "user", "content": prompt}
  134. ]
  135. response = llm.invoke(messages)
  136. # 提取JSON
  137. result = extract_json_from_text(response)
  138. if result and "dimensions" in result:
  139. return {
  140. "dimensions": result["dimensions"],
  141. "confidence": result.get("confidence", 0.8),
  142. "reasoning": result.get("reasoning", "")
  143. }
  144. else:
  145. print(f"⚠️ LLM返回格式不正确: {response[:200]}")
  146. return {"dimensions": [], "confidence": 0.0, "error": "格式解析失败"}
  147. except Exception as e:
  148. print(f"⚠️ 提取维度失败: {e}")
  149. return {"dimensions": [], "confidence": 0.0, "error": str(e)}
  150. def extract_dimensions_from_report(report_file: Path, llm, existing_themes: List[str] = None) -> Optional[Dict]:
  151. """从Markdown文件中提取维度
  152. Args:
  153. report_file: 报告文件路径
  154. llm: LLM实例
  155. existing_themes: 现有的themes列表,用于参考抽象级别
  156. """
  157. if not report_file.exists():
  158. print(f"❌ 报告文件不存在: {report_file}")
  159. return None
  160. try:
  161. with open(report_file, 'r', encoding='utf-8') as f:
  162. content = f.read()
  163. # 移除Markdown标题(如果存在)
  164. lines = content.split('\n')
  165. # 跳过开头的#标题行
  166. content_lines = []
  167. for line in lines:
  168. if line.strip().startswith('#') and not content_lines:
  169. continue
  170. content_lines.append(line)
  171. text = '\n'.join(content_lines).strip()
  172. if not text:
  173. print(f"⚠️ 报告内容为空: {report_file}")
  174. return None
  175. # 提取维度(传入existing_themes)
  176. result = extract_dimensions_from_text(text, llm, existing_themes=existing_themes)
  177. # 添加报告信息
  178. result["report_file"] = str(report_file)
  179. result["report_date"] = report_file.stem
  180. result["extraction_date"] = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
  181. return result
  182. except Exception as e:
  183. print(f"❌ 读取报告失败 {report_file}: {e}")
  184. return None
  185. def save_extraction_result(base_dir: Path, result: Dict, report_type: str):
  186. """保存提取结果"""
  187. dimensions_dir = base_dir / "archive" / "dimensions"
  188. dimensions_dir.mkdir(parents=True, exist_ok=True)
  189. # 根据报告日期生成文件名
  190. report_date = result.get("report_date", datetime.now().strftime("%Y-%m-%d"))
  191. output_file = dimensions_dir / f"{report_date}_{report_type}_dimensions.json"
  192. try:
  193. with open(output_file, 'w', encoding='utf-8') as f:
  194. json.dump(result, f, indent=2, ensure_ascii=False)
  195. print(f"✅ 维度提取结果已保存: {output_file}")
  196. return output_file
  197. except Exception as e:
  198. print(f"❌ 保存失败: {e}")
  199. return None
  200. def batch_extract_dimensions(base_dir: Path, report_type: str = None, llm=None, existing_themes: List[str] = None) -> List[Dict]:
  201. """批量提取维度
  202. Args:
  203. base_dir: 基础目录路径
  204. report_type: 报告类型(daily/weekly/monthly),None表示处理所有类型
  205. llm: LLM实例
  206. existing_themes: 现有的themes列表,如果为None则自动从themes.yaml加载
  207. """
  208. if not llm:
  209. llm = init_llm()
  210. if not llm:
  211. print("❌ LLM未初始化,无法提取维度")
  212. return []
  213. # 如果没有传入existing_themes,尝试从themes.yaml加载
  214. if existing_themes is None:
  215. try:
  216. # 避免循环导入,直接在这里读取yaml
  217. import yaml
  218. themes_file = base_dir / "themes.yaml"
  219. if themes_file.exists():
  220. with open(themes_file, 'r', encoding='utf-8') as f:
  221. data = yaml.safe_load(f)
  222. if data and isinstance(data, dict):
  223. existing_themes = data.get('themes', [])
  224. else:
  225. existing_themes = []
  226. if existing_themes:
  227. print(f"📌 已加载 {len(existing_themes)} 个现有themes作为参考: {existing_themes}")
  228. except Exception as e:
  229. print(f"⚠️ 加载themes.yaml失败,将不参考现有themes: {e}")
  230. existing_themes = []
  231. reports_dir = base_dir / "archive" / "reports"
  232. results = []
  233. # 确定要处理的报告类型
  234. types_to_process = [report_type] if report_type else ["daily", "weekly", "monthly"]
  235. for rtype in types_to_process:
  236. type_dir = reports_dir / rtype
  237. if not type_dir.exists():
  238. continue
  239. print(f"\n📂 处理{rtype}报告...")
  240. report_files = sorted(type_dir.glob("*.md"))
  241. for report_file in report_files:
  242. print(f" 处理: {report_file.name}")
  243. result = extract_dimensions_from_report(report_file, llm, existing_themes=existing_themes)
  244. if result and result.get("dimensions"):
  245. # 添加报告类型
  246. result["report_type"] = rtype
  247. # 保存提取结果
  248. save_extraction_result(base_dir, result, rtype)
  249. results.append(result)
  250. print(f" ✅ 提取到 {len(result['dimensions'])} 个维度: {', '.join(result['dimensions'][:5])}")
  251. # 如果有reasoning,也显示出来(用于调试)
  252. if result.get("reasoning"):
  253. print(f" 推理: {result['reasoning'][:100]}...")
  254. else:
  255. print(f" ⚠️ 未提取到维度")
  256. return results
  257. def load_extraction_results(base_dir: Path) -> List[Dict]:
  258. """加载所有提取结果"""
  259. dimensions_dir = base_dir / "archive" / "dimensions"
  260. if not dimensions_dir.exists():
  261. return []
  262. results = []
  263. for json_file in dimensions_dir.glob("*_dimensions.json"):
  264. try:
  265. with open(json_file, 'r', encoding='utf-8') as f:
  266. result = json.load(f)
  267. results.append(result)
  268. except Exception as e:
  269. print(f"⚠️ 读取提取结果失败 {json_file.name}: {e}")
  270. return results
  271. if __name__ == "__main__":
  272. # 命令行工具
  273. import argparse
  274. parser = argparse.ArgumentParser(description="从报告中提取维度")
  275. parser.add_argument("--report-type", choices=["daily", "weekly", "monthly"],
  276. help="指定报告类型(不指定则处理所有类型)")
  277. parser.add_argument("--report-file", type=str,
  278. help="指定单个报告文件路径")
  279. parser.add_argument("--base-dir", type=str,
  280. help="基础目录路径(默认为脚本所在目录)")
  281. args = parser.parse_args()
  282. base_dir = Path(args.base_dir) if args.base_dir else Path(__file__).parent
  283. llm = init_llm()
  284. if not llm:
  285. print("❌ 无法初始化LLM,退出")
  286. sys.exit(1)
  287. # 加载existing_themes(如果存在)
  288. existing_themes = None
  289. try:
  290. import yaml
  291. themes_file = base_dir / "themes.yaml"
  292. if themes_file.exists():
  293. with open(themes_file, 'r', encoding='utf-8') as f:
  294. data = yaml.safe_load(f)
  295. if data and isinstance(data, dict):
  296. existing_themes = data.get('themes', [])
  297. except Exception:
  298. pass
  299. if args.report_file:
  300. # 处理单个文件
  301. report_file = Path(args.report_file)
  302. result = extract_dimensions_from_report(report_file, llm, existing_themes=existing_themes)
  303. if result:
  304. report_type = result.get("report_type", "daily")
  305. save_extraction_result(base_dir, result, report_type)
  306. print(f"\n提取的维度: {result.get('dimensions', [])}")
  307. else:
  308. # 批量处理
  309. results = batch_extract_dimensions(base_dir, args.report_type, llm, existing_themes=existing_themes)
  310. print(f"\n✅ 共处理 {len(results)} 个报告")