# search_youtube_mcp_videos.py
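"""
YouTube video search script: search by theme, score the results, build a daily report.

Reads the theme list from themes.yaml and searches YouTube once per theme,
then merges, scores and ranks the results to produce the daily report.
"""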
  6. import sys
  7. import os
  8. import json
  9. import argparse
  10. import re
  11. from pathlib import Path
  12. from datetime import datetime, timedelta, timezone
  13. # 设置控制台编码为UTF-8(Windows)
  14. if sys.platform == 'win32':
  15. import io
  16. sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
  17. sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
  18. try:
  19. import httpx
  20. except ImportError:
  21. print("❌ 错误: 需要安装 httpx 库")
  22. print("💡 运行: pip install httpx")
  23. sys.exit(1)
  24. try:
  25. import yaml
  26. except ImportError:
  27. print("❌ 错误: 需要安装 PyYAML 库")
  28. print("💡 运行: pip install pyyaml")
  29. sys.exit(1)
  30. # 加载 .env 文件(如果存在)
  31. try:
  32. from dotenv import load_dotenv
  33. load_dotenv()
  34. except ImportError:
  35. pass # dotenv 可选,如果未安装则跳过
  36. # 可选:导入 LLM 相关模块(仅用于 research 模式)
  37. try:
  38. from hello_agents.core.llm import HelloAgentsLLM
  39. LLM_AVAILABLE = True
  40. except ImportError:
  41. LLM_AVAILABLE = False
  42. # 配置常量
  43. DAYS_WINDOW = int(os.getenv("DAYS_WINDOW", "14")) # 时间窗口:默认14天
  44. def load_youtube_api_key():
  45. """从环境变量或配置文件中加载 YouTube API Key"""
  46. # 首先尝试环境变量
  47. api_key = os.getenv("YOUTUBE_API_KEY")
  48. if api_key:
  49. return api_key
  50. # 尝试从配置文件中读取
  51. config_file = Path(__file__).parent / "config"
  52. if config_file.exists():
  53. try:
  54. with open(config_file, 'r', encoding='utf-8') as f:
  55. for line in f:
  56. line = line.strip()
  57. if line.startswith("YOUTUBE_API_KEY=") and not line.startswith("#"):
  58. api_key = line.split("=", 1)[1].strip()
  59. if api_key:
  60. return api_key
  61. except Exception as e:
  62. print(f"⚠️ 读取配置文件失败: {e}")
  63. return None
  64. def load_themes():
  65. """从 themes.yaml 读取主题列表"""
  66. themes_file = Path(__file__).parent / "themes.yaml"
  67. if not themes_file.exists():
  68. print(f"❌ 错误: 找不到 themes.yaml 文件: {themes_file}")
  69. return []
  70. try:
  71. with open(themes_file, 'r', encoding='utf-8') as f:
  72. data = yaml.safe_load(f)
  73. if data is None:
  74. print(f"❌ 错误: themes.yaml 文件为空或格式错误")
  75. return []
  76. themes = data.get('themes', [])
  77. if not themes:
  78. print(f"⚠️ 警告: themes.yaml 中未找到主题列表")
  79. return []
  80. print(f"✅ 加载了 {len(themes)} 个主题: {', '.join(themes)}")
  81. return themes
  82. except Exception as e:
  83. print(f"❌ 读取 themes.yaml 失败: {e}")
  84. import traceback
  85. traceback.print_exc()
  86. return []
  87. def load_whitelist_channels():
  88. """从 channels.yaml 读取白名单频道"""
  89. channels_file = Path(__file__).parent / "channels.yaml"
  90. if not channels_file.exists():
  91. print(f"⚠️ 警告: 找不到 channels.yaml 文件: {channels_file}")
  92. return []
  93. try:
  94. with open(channels_file, 'r', encoding='utf-8') as f:
  95. data = yaml.safe_load(f)
  96. if data is None:
  97. print(f"⚠️ 警告: channels.yaml 文件为空或格式错误")
  98. return []
  99. channels = data.get('whitelist_channels', [])
  100. print(f"✅ 加载了 {len(channels)} 个白名单频道")
  101. return channels
  102. except Exception as e:
  103. print(f"⚠️ 读取 channels.yaml 失败: {e}")
  104. return []
  105. def search_youtube_videos(query: str, max_results: int = 10, api_key: str = None):
  106. """搜索 YouTube 视频"""
  107. if not api_key:
  108. api_key = load_youtube_api_key()
  109. if not api_key:
  110. print("❌ 错误: 未找到 YouTube API Key")
  111. print("💡 请设置环境变量 YOUTUBE_API_KEY 或在 config 文件中配置")
  112. return None
  113. try:
  114. url = "https://www.googleapis.com/youtube/v3/search"
  115. params = {
  116. "key": api_key,
  117. "q": query,
  118. "part": "snippet",
  119. "type": "video",
  120. "maxResults": min(max_results, 50), # API limit
  121. "order": "relevance"
  122. }
  123. response = httpx.get(url, params=params, timeout=10.0)
  124. response.raise_for_status()
  125. data = response.json()
  126. if "items" not in data or not data["items"]:
  127. return []
  128. videos = []
  129. for item in data["items"]:
  130. video_info = {
  131. "video_id": item["id"]["videoId"],
  132. "title": item["snippet"]["title"],
  133. "description": item["snippet"]["description"],
  134. "channel_title": item["snippet"]["channelTitle"],
  135. "channel_id": item["snippet"]["channelId"],
  136. "published_at": item["snippet"]["publishedAt"],
  137. "thumbnail": item["snippet"]["thumbnails"].get("medium", {}).get("url", ""),
  138. "url": f"https://www.youtube.com/watch?v={item['id']['videoId']}",
  139. "query": query # 记录搜索关键词
  140. }
  141. videos.append(video_info)
  142. return videos
  143. except httpx.HTTPStatusError as e:
  144. if e.response.status_code == 403:
  145. print(f"❌ 错误: API 密钥无效或配额已用完 (查询: {query})")
  146. else:
  147. print(f"❌ HTTP 错误: {e.response.status_code} (查询: {query})")
  148. return None
  149. except Exception as e:
  150. print(f"❌ 搜索失败 (查询: {query}): {str(e)}")
  151. return None
  152. def parse_published_time(published_at_str: str):
  153. """解析发布时间字符串为 datetime 对象"""
  154. try:
  155. # YouTube API 返回 ISO 8601 格式: 2024-01-01T12:00:00Z
  156. dt = datetime.fromisoformat(published_at_str.replace('Z', '+00:00'))
  157. return dt
  158. except Exception as e:
  159. print(f"⚠️ 解析发布时间失败: {published_at_str}, 错误: {e}")
  160. return None
  161. def is_within_time_window(published_at_str: str, days_window: int = DAYS_WINDOW):
  162. """检查视频是否在时间窗口内(默认14天)"""
  163. published_time = parse_published_time(published_at_str)
  164. if not published_time:
  165. return False
  166. now = datetime.now(timezone.utc)
  167. time_diff = now - published_time
  168. return time_diff <= timedelta(days=days_window)
  169. def calculate_time_score(published_at_str: str):
  170. """计算时间评分:24小时内 +3,48小时内 +2"""
  171. published_time = parse_published_time(published_at_str)
  172. if not published_time:
  173. return 0
  174. now = datetime.now(timezone.utc)
  175. time_diff = now - published_time
  176. if time_diff <= timedelta(hours=24):
  177. return 3
  178. elif time_diff <= timedelta(hours=48):
  179. return 2
  180. else:
  181. return 0
  182. def count_theme_keywords(text: str, themes: list):
  183. """计算文本中命中的主题关键词数量(不区分大小写)"""
  184. if not text:
  185. return 0
  186. text_lower = text.lower()
  187. count = 0
  188. for theme in themes:
  189. if theme.lower() in text_lower:
  190. count += 1
  191. return count
  192. def score_video(video: dict, themes: list, whitelist_channels: list):
  193. """为视频计算评分"""
  194. score = 0
  195. # 1. 白名单频道评分 +10
  196. if video['channel_title'] in whitelist_channels:
  197. score += 10
  198. # 2. 标题或描述中每命中1个主题关键词 +5
  199. title_matches = count_theme_keywords(video['title'], themes)
  200. desc_matches = count_theme_keywords(video['description'], themes)
  201. keyword_score = (title_matches + desc_matches) * 5
  202. score += keyword_score
  203. # 3. 发布时间评分
  204. time_score = calculate_time_score(video['published_at'])
  205. score += time_score
  206. return score
  207. def merge_and_deduplicate_videos(all_videos: list):
  208. """合并视频列表并按 videoId 去重"""
  209. video_dict = {}
  210. for video in all_videos:
  211. video_id = video['video_id']
  212. if video_id not in video_dict:
  213. video_dict[video_id] = video
  214. else:
  215. # 如果已存在,合并查询关键词
  216. existing_queries = video_dict[video_id].get('queries', [])
  217. if isinstance(existing_queries, str):
  218. existing_queries = [existing_queries]
  219. if video['query'] not in existing_queries:
  220. existing_queries.append(video['query'])
  221. video_dict[video_id]['queries'] = existing_queries
  222. return list(video_dict.values())
  223. def generate_action(videos: list):
  224. """生成 action 字段:从 Top1 生成1条可执行动作(≤15min)"""
  225. if not videos:
  226. return "暂无推荐视频"
  227. # 只使用 Top1
  228. top1 = videos[0]
  229. action = f"观看《{top1['title']}》({top1['channel_title']}),预计≤15分钟"
  230. return action
  231. def has_clickbait_words(title: str):
  232. """检查标题中是否包含标题党词汇"""
  233. clickbait_words = ['INSANE', 'HYPE', 'SHOCKING', 'UNBELIEVABLE', 'MIND-BLOWING',
  234. 'AMAZING', 'INCREDIBLE', 'YOU WON\'T BELIEVE', 'THIS WILL BLOW YOUR MIND']
  235. title_upper = title.upper()
  236. for word in clickbait_words:
  237. if word in title_upper:
  238. return True
  239. return False
  240. def is_older_than_days(published_at_str: str, days: int = 30):
  241. """检查视频是否超过指定天数"""
  242. published_time = parse_published_time(published_at_str)
  243. if not published_time:
  244. return False
  245. now = datetime.now(timezone.utc)
  246. time_diff = now - published_time
  247. return time_diff > timedelta(days=days)
  248. def generate_risk(videos: list, themes: list):
  249. """生成 risk 字段:偏差检测"""
  250. if not videos:
  251. return "无风险"
  252. # 只检查 Top3
  253. top3 = videos[:3]
  254. warnings = []
  255. # 检查是否有超过30天的视频
  256. old_videos = []
  257. for video in top3:
  258. if is_older_than_days(video['published_at'], days=30):
  259. old_videos.append(video['title'])
  260. if old_videos:
  261. warnings.append(f"Top3中存在超过30天的视频: {', '.join(old_videos[:2])}")
  262. # 检查是否有标题党词汇
  263. clickbait_videos = []
  264. for video in top3:
  265. if has_clickbait_words(video['title']):
  266. clickbait_videos.append(video['title'])
  267. if clickbait_videos:
  268. warnings.append(f"检测到标题党词汇: {', '.join(clickbait_videos[:2])}")
  269. # 如果有警告,返回警告;否则返回正面评价
  270. if warnings:
  271. return "; ".join(warnings)
  272. else:
  273. return "今日信号较新且较可信"
  274. def init_research_llm():
  275. """初始化用于研究模式的 LLM(使用通义千问/ModelScope配置)"""
  276. if not LLM_AVAILABLE:
  277. print("⚠️ 警告: hello_agents 模块未安装,无法使用研究模式")
  278. return None
  279. # 从环境变量读取 LLM 配置(优先级顺序,与 chapter9 保持一致)
  280. # 优先使用 ModelScope 配置(通义千问)
  281. llm_model = (
  282. os.getenv("LLM_MODEL") or
  283. os.getenv("LLM_MODEL_ID") or
  284. "Qwen/Qwen2.5-7B-Instruct" # 默认通义千问模型
  285. )
  286. llm_api_key = (
  287. os.getenv("LLM_API_KEY") or # 优先使用 LLM_API_KEY(阿里云通义千问)
  288. os.getenv("MODELSCOPE_API_KEY") or
  289. os.getenv("MODELSCOPE_API_TOKEN")
  290. )
  291. llm_base_url = (
  292. os.getenv("LLM_BASE_URL") or
  293. "https://api-inference.modelscope.cn/v1/" # ModelScope 默认地址
  294. )
  295. llm_provider = os.getenv("LLM_PROVIDER", "modelscope")
  296. if not llm_api_key:
  297. print("⚠️ 警告: 未找到 LLM API Key,研究模式需要配置 LLM")
  298. print("💡 请设置环境变量(推荐在 .env 文件中配置):")
  299. print(" MODELSCOPE_API_KEY=your-modelscope-token-here")
  300. print(" LLM_MODEL=Qwen/Qwen2.5-7B-Instruct")
  301. print(" LLM_BASE_URL=https://api-inference.modelscope.cn/v1/")
  302. print(" LLM_PROVIDER=modelscope")
  303. return None
  304. try:
  305. llm = HelloAgentsLLM(
  306. model=llm_model,
  307. api_key=llm_api_key,
  308. base_url=llm_base_url,
  309. provider=llm_provider
  310. )
  311. print(f"✅ LLM 初始化成功: {llm_model} ({llm_provider})")
  312. return llm
  313. except Exception as e:
  314. print(f"⚠️ 初始化 LLM 失败: {e}")
  315. return None
  316. def prepare_sources_data(top3_videos: list):
  317. """从 Top3 视频中提取 sources 数据"""
  318. sources = []
  319. for video in top3_videos:
  320. sources.append({
  321. "title": video['title'],
  322. "channel": video['channel_title'],
  323. "url": video['url'],
  324. "published_at": video['published_at'],
  325. "score": video['score']
  326. })
  327. return sources
  328. def extract_json_from_text(text: str):
  329. """从文本中提取 JSON 内容(处理 LLM 可能返回的格式化文本)"""
  330. # 尝试直接解析
  331. try:
  332. return json.loads(text.strip())
  333. except json.JSONDecodeError:
  334. pass
  335. # 尝试提取 JSON 代码块
  336. json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
  337. if json_match:
  338. try:
  339. return json.loads(json_match.group(1))
  340. except json.JSONDecodeError:
  341. pass
  342. # 尝试提取第一个完整的 JSON 对象
  343. json_match = re.search(r'\{.*\}', text, re.DOTALL)
  344. if json_match:
  345. try:
  346. return json.loads(json_match.group(0))
  347. except json.JSONDecodeError:
  348. pass
  349. return None
  350. def generate_research_report(top3_videos: list, themes: list, llm):
  351. """使用 LLM 生成研究报告"""
  352. if not top3_videos:
  353. return None
  354. # 构建视频信息文本
  355. videos_info = []
  356. for i, video in enumerate(top3_videos, 1):
  357. videos_info.append(
  358. f"{i}. 标题: {video['title']}\n"
  359. f" 频道: {video['channel_title']}\n"
  360. f" 发布时间: {video['published_at']}\n"
  361. f" 评分: {video['score']}分\n"
  362. f" 链接: {video['url']}"
  363. )
  364. videos_text = "\n\n".join(videos_info)
  365. themes_text = ", ".join(themes)
  366. # 构建 prompt
  367. prompt = f"""基于以下 Top3 YouTube 视频信息,生成一份结构化研究报告。
  368. 视频信息:
  369. {videos_text}
  370. 搜索主题:{themes_text}
  371. 请以 JSON 格式返回以下内容:
  372. 1. question: 一个核心问题,概括这些视频的共同关注点
  373. 2. key_findings: 3条发现,每条1句话,基于标题/频道/发布时间推断,使用"可能/倾向"等措辞
  374. 3. why_it_matters_to_me: 为什么这些信息对我重要(个性化解释)
  375. 4. next_steps: 1-3条行动建议,每条≤15分钟
  376. 请严格按照以下 JSON 格式返回(不要包含其他文字):
  377. {{
  378. "question": "核心问题",
  379. "key_findings": [
  380. "发现1(使用可能/倾向等措辞)",
  381. "发现2(使用可能/倾向等措辞)",
  382. "发现3(使用可能/倾向等措辞)"
  383. ],
  384. "why_it_matters_to_me": "个性化解释",
  385. "next_steps": [
  386. "行动建议1(≤15分钟)",
  387. "行动建议2(≤15分钟)",
  388. "行动建议3(≤15分钟)"
  389. ]
  390. }}"""
  391. messages = [
  392. {"role": "system", "content": "你是一位专业的研究分析师,擅长从视频信息中提取关键洞察并给出可执行的行动建议。请始终以 JSON 格式返回结果。"},
  393. {"role": "user", "content": prompt}
  394. ]
  395. try:
  396. print("\n🔬 正在使用 LLM 生成研究报告...")
  397. response = llm.invoke(messages)
  398. if not response:
  399. print("⚠️ LLM 返回空响应")
  400. return None
  401. # 提取 JSON
  402. research_data = extract_json_from_text(response)
  403. if not research_data:
  404. print(f"⚠️ 无法解析 LLM 响应为 JSON,原始响应: {response[:200]}...")
  405. return None
  406. # 验证必需字段
  407. required_fields = ["question", "key_findings", "why_it_matters_to_me", "next_steps"]
  408. missing_fields = [field for field in required_fields if field not in research_data]
  409. if missing_fields:
  410. print(f"⚠️ LLM 响应缺少必需字段: {', '.join(missing_fields)}")
  411. return None
  412. # 确保 key_findings 是列表且有3条
  413. if not isinstance(research_data.get("key_findings"), list):
  414. research_data["key_findings"] = []
  415. if len(research_data["key_findings"]) != 3:
  416. # 如果不足3条,填充或截断
  417. while len(research_data["key_findings"]) < 3:
  418. research_data["key_findings"].append("暂无发现")
  419. research_data["key_findings"] = research_data["key_findings"][:3]
  420. # 确保 next_steps 是列表,最多3条
  421. if not isinstance(research_data.get("next_steps"), list):
  422. research_data["next_steps"] = []
  423. research_data["next_steps"] = research_data["next_steps"][:3]
  424. print("✅ 研究报告生成成功")
  425. return research_data
  426. except Exception as e:
  427. print(f"⚠️ 生成研究报告时出错: {e}")
  428. import traceback
  429. traceback.print_exc()
  430. return None
  431. def main():
  432. """主函数"""
  433. # 解析命令行参数
  434. parser = argparse.ArgumentParser(description="YouTube 视频搜索 - 多主题智能搜索与日报生成")
  435. parser.add_argument(
  436. "--mode",
  437. type=str,
  438. choices=["daily_signal", "research"],
  439. default="research",
  440. help="运行模式: research (默认,生成日报+研究报告) 或 daily_signal (仅生成日报)"
  441. )
  442. args = parser.parse_args()
  443. mode = args.mode
  444. print("=" * 70)
  445. print("YouTube 视频搜索 - 多主题智能搜索与日报生成")
  446. if mode == "research":
  447. print("运行模式: 研究模式 (将生成日报 + 研究报告)")
  448. else:
  449. print("运行模式: 日报模式 (仅生成日报)")
  450. print("=" * 70)
  451. # 1. 加载配置
  452. themes = load_themes()
  453. if not themes:
  454. print("❌ 无法加载主题列表,退出")
  455. return
  456. whitelist_channels = load_whitelist_channels()
  457. api_key = load_youtube_api_key()
  458. if not api_key:
  459. print("❌ 无法加载 API Key,退出")
  460. return
  461. # 2. 对每个主题搜索
  462. print(f"\n🔍 开始搜索 {len(themes)} 个主题...")
  463. all_videos = []
  464. for theme in themes:
  465. print(f" 搜索主题: {theme}")
  466. videos = search_youtube_videos(theme, max_results=10, api_key=api_key)
  467. if videos:
  468. all_videos.extend(videos)
  469. print(f" ✅ 找到 {len(videos)} 个视频")
  470. else:
  471. print(f" ⚠️ 未找到视频或搜索失败")
  472. if not all_videos:
  473. print("❌ 未找到任何视频,退出")
  474. return
  475. print(f"\n📊 合并前共找到 {len(all_videos)} 个视频")
  476. # 3. 合并去重
  477. unique_videos = merge_and_deduplicate_videos(all_videos)
  478. print(f"📊 去重后剩余 {len(unique_videos)} 个唯一视频")
  479. # 4. 时间窗口过滤:只考虑最近 DAYS_WINDOW 天的视频
  480. print(f"\n⏰ 应用时间窗口过滤({DAYS_WINDOW}天)...")
  481. filtered_videos = [v for v in unique_videos if is_within_time_window(v['published_at'], DAYS_WINDOW)]
  482. excluded_count = len(unique_videos) - len(filtered_videos)
  483. if excluded_count > 0:
  484. print(f" ⚠️ 过滤掉 {excluded_count} 个超过 {DAYS_WINDOW} 天的视频")
  485. print(f" ✅ 剩余 {len(filtered_videos)} 个视频参与排序")
  486. if not filtered_videos:
  487. print(f"❌ 时间窗口内({DAYS_WINDOW}天)未找到任何视频,退出")
  488. return
  489. # 5. 评分
  490. print(f"\n⭐ 开始评分...")
  491. for video in filtered_videos:
  492. score = score_video(video, themes, whitelist_channels)
  493. video['score'] = score
  494. video['scoring_details'] = {
  495. 'whitelist_bonus': 10 if video['channel_title'] in whitelist_channels else 0,
  496. 'keyword_matches': count_theme_keywords(video['title'], themes) + count_theme_keywords(video['description'], themes),
  497. 'time_bonus': calculate_time_score(video['published_at'])
  498. }
  499. # 6. 排序并取 Top 3
  500. sorted_videos = sorted(filtered_videos, key=lambda x: x['score'], reverse=True)
  501. top3_videos = sorted_videos[:3]
  502. print(f"\n🏆 Top 3 视频:")
  503. for i, video in enumerate(top3_videos, 1):
  504. print(f" {i}. [{video['score']}分] {video['title']}")
  505. print(f" 频道: {video['channel_title']}")
  506. print(f" 链接: {video['url']}")
  507. # 7. 生成日期字符串
  508. today = datetime.now().strftime("%Y-%m-%d")
  509. # 8. 创建输出目录
  510. base_dir = Path(__file__).parent
  511. raw_dir = base_dir / "raw" / "youtube"
  512. archive_dir = base_dir / "archive" / "youtube"
  513. raw_dir.mkdir(parents=True, exist_ok=True)
  514. archive_dir.mkdir(parents=True, exist_ok=True)
  515. # 9. 保存原始数据
  516. raw_file = raw_dir / f"{today}_raw.json"
  517. raw_data = {
  518. "date": today,
  519. "themes_used": themes,
  520. "whitelist_channels": whitelist_channels,
  521. "days_window": DAYS_WINDOW,
  522. "total_videos_found": len(all_videos),
  523. "unique_videos": len(unique_videos),
  524. "filtered_videos_count": len(filtered_videos),
  525. "all_videos": sorted_videos # 保存过滤后的视频,按评分排序
  526. }
  527. try:
  528. with open(raw_file, 'w', encoding='utf-8') as f:
  529. json.dump(raw_data, f, indent=2, ensure_ascii=False)
  530. print(f"\n💾 原始数据已保存到: {raw_file}")
  531. except Exception as e:
  532. print(f"❌ 保存原始数据失败: {e}")
  533. return
  534. # 10. 生成并保存日报
  535. action = generate_action(top3_videos)
  536. risk = generate_risk(sorted_videos, themes)
  537. daily_report = {
  538. "date": today,
  539. "themes_used": themes,
  540. "dimensions": [], # 新增:用户可选的维度标签(如:["健康", "情绪", "工作"]),向后兼容
  541. "top3": [
  542. {
  543. "title": video['title'],
  544. "channel": video['channel_title'],
  545. "url": video['url'],
  546. "score": video['score'],
  547. "published_at": video['published_at'],
  548. "scoring_details": video['scoring_details']
  549. }
  550. for video in top3_videos
  551. ],
  552. "action": action,
  553. "risk": risk
  554. }
  555. archive_file = archive_dir / f"{today}.json"
  556. try:
  557. with open(archive_file, 'w', encoding='utf-8') as f:
  558. json.dump(daily_report, f, indent=2, ensure_ascii=False)
  559. print(f"💾 日报信号已保存到: {archive_file}")
  560. except Exception as e:
  561. print(f"❌ 保存日报信号失败: {e}")
  562. return
  563. # 11. 如果模式是 research,生成研究报告
  564. if mode == "research":
  565. llm = init_research_llm()
  566. if llm:
  567. try:
  568. research_report = generate_research_report(top3_videos, themes, llm)
  569. if research_report:
  570. # 添加 sources 字段
  571. research_report["sources"] = prepare_sources_data(top3_videos)
  572. research_report["date"] = today
  573. research_report["themes_used"] = themes
  574. # 保存研究报告
  575. research_file = archive_dir / f"{today}_research.json"
  576. with open(research_file, 'w', encoding='utf-8') as f:
  577. json.dump(research_report, f, indent=2, ensure_ascii=False)
  578. print(f"\n💾 研究报告已保存到: {research_file}")
  579. # 显示研究报告摘要
  580. print("\n" + "=" * 70)
  581. print("🔬 研究报告摘要")
  582. print("=" * 70)
  583. print(f"核心问题: {research_report.get('question', 'N/A')}")
  584. print(f"\n关键发现:")
  585. for i, finding in enumerate(research_report.get('key_findings', []), 1):
  586. print(f" {i}. {finding}")
  587. print(f"\n为什么重要: {research_report.get('why_it_matters_to_me', 'N/A')}")
  588. print(f"\n下一步行动:")
  589. for i, step in enumerate(research_report.get('next_steps', []), 1):
  590. print(f" {i}. {step}")
  591. print("=" * 70)
  592. else:
  593. print("⚠️ 研究报告生成失败,已跳过")
  594. except Exception as e:
  595. print(f"⚠️ 生成研究报告时出错: {e}")
  596. import traceback
  597. traceback.print_exc()
  598. else:
  599. print("⚠️ 未配置 LLM,跳过研究模式")
  600. # 12. 显示日报摘要
  601. print("\n" + "=" * 70)
  602. print("📄 日报摘要")
  603. print("=" * 70)
  604. print(f"日期: {daily_report['date']}")
  605. print(f"主题: {', '.join(daily_report['themes_used'])}")
  606. print(f"\n推荐行动 (Action):")
  607. print(f" {daily_report['action']}")
  608. print(f"\n风险评估 (Risk):")
  609. print(f" {daily_report['risk']}")
  610. print("=" * 70)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()