run_complete_evaluation.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314
  1. """
  2. 完整评估流程
  3. 运行完整的数据生成和评估流程:
  4. 1. 生成AIME题目
  5. 2. LLM Judge评估
  6. 3. Win Rate评估
  7. 4. 生成综合报告
  8. 运行方法:
  9. python data_generation/run_complete_evaluation.py 30 3.0
  10. 参数:
  11. - 30: 生成题目数量
  12. - 3.0: 每次生成之间的延迟(秒)
  13. 说明:
  14. - 使用AIME 2025年真题作为参考
  15. - 数据集来源:math-ai/aime25(JSONL格式)
  16. """
  17. import json
  18. import os
  19. import sys
  20. from datetime import datetime
  21. from aime_generator import AIMEGenerator
  22. from hello_agents import SimpleAgent, HelloAgentsLLM
  23. from hello_agents.tools import LLMJudgeTool, WinRateTool
  24. def run_complete_evaluation(
  25. num_problems: int = 30,
  26. delay_seconds: float = 3.0
  27. ):
  28. """
  29. 运行完整评估流程
  30. Args:
  31. num_problems: 生成题目数量
  32. delay_seconds: 每次生成之间的延迟(秒),避免API速率限制
  33. """
  34. print("\n" + "="*80)
  35. print("🚀 AIME数据生成与评估完整流程")
  36. print("="*80)
  37. print(f"\n配置信息:")
  38. print(f" - 生成题目数量: {num_problems}")
  39. print(f" - API延迟: {delay_seconds}秒/题")
  40. print(f" - 生成参考数据: TianHongZXY/aime-1983-2025(900+道题)")
  41. print(f" - 评估参考: AIME 2025真题")
  42. # ========== 步骤1: 生成AIME题目 ==========
  43. print("\n" + "="*80)
  44. print("📝 步骤1: 生成AIME题目")
  45. print("="*80)
  46. generator = AIMEGenerator(delay_seconds=delay_seconds)
  47. generated_data_path = generator.generate_and_save(
  48. num_problems=num_problems,
  49. output_dir="data_generation/generated_data"
  50. )
  51. print(f"\n✅ 步骤1完成!生成数据保存在: {generated_data_path}")
  52. # ========== 步骤2: 评估 ==========
  53. # 创建评估结果目录
  54. timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
  55. evaluation_dir = f"data_generation/evaluation_results/{timestamp}"
  56. os.makedirs(evaluation_dir, exist_ok=True)
  57. os.makedirs(os.path.join(evaluation_dir, "llm_judge"), exist_ok=True)
  58. os.makedirs(os.path.join(evaluation_dir, "win_rate"), exist_ok=True)
  59. # 创建LLM
  60. llm = HelloAgentsLLM()
  61. # ========== 步骤2.1: LLM Judge评估 ==========
  62. print(f"\n🎯 步骤2.1: LLM Judge评估 (vs AIME 2025)")
  63. llm_judge_result = None
  64. try:
  65. llm_judge_tool = LLMJudgeTool(llm=llm)
  66. llm_judge_result_json = llm_judge_tool.run({
  67. "generated_data_path": generated_data_path,
  68. "reference_year": 2025,
  69. "max_samples": num_problems,
  70. "output_dir": os.path.join(evaluation_dir, "llm_judge"),
  71. "judge_model": "gpt-4o"
  72. })
  73. llm_judge_result = json.loads(llm_judge_result_json)
  74. print(f"\n✅ LLM Judge评估完成!")
  75. print(f" 平均总分: {llm_judge_result['metrics']['average_total_score']:.2f}/5.0")
  76. print(f" 通过率: {llm_judge_result['metrics']['pass_rate']:.2%}")
  77. except Exception as e:
  78. print(f"\n❌ LLM Judge评估失败: {e}")
  79. import traceback
  80. traceback.print_exc()
  81. # ========== 步骤2.2: Win Rate评估 ==========
  82. print(f"\n🏆 步骤2.2: Win Rate评估 (vs AIME 2025)")
  83. win_rate_result = None
  84. try:
  85. win_rate_tool = WinRateTool(llm=llm)
  86. win_rate_result_json = win_rate_tool.run({
  87. "generated_data_path": generated_data_path,
  88. "reference_year": 2025,
  89. "num_comparisons": min(num_problems, 20), # 最多20次对比
  90. "output_dir": os.path.join(evaluation_dir, "win_rate"),
  91. "judge_model": "gpt-4o"
  92. })
  93. win_rate_result = json.loads(win_rate_result_json)
  94. print(f"\n✅ Win Rate评估完成!")
  95. print(f" Win Rate: {win_rate_result['metrics']['win_rate']:.2%}")
  96. except Exception as e:
  97. print(f"\n❌ Win Rate评估失败: {e}")
  98. import traceback
  99. traceback.print_exc()
  100. # ========== 步骤3: 生成综合报告 ==========
  101. comprehensive_report_path = None
  102. if llm_judge_result or win_rate_result:
  103. print("\n" + "="*80)
  104. print("📊 步骤3: 生成综合报告")
  105. print("="*80)
  106. comprehensive_report_path = os.path.join(evaluation_dir, "comprehensive_report.md")
  107. # 生成综合报告
  108. report = generate_comprehensive_report(
  109. generated_data_path,
  110. llm_judge_result,
  111. win_rate_result
  112. )
  113. with open(comprehensive_report_path, 'w', encoding='utf-8') as f:
  114. f.write(report)
  115. print(f"\n✅ 综合报告已保存: {comprehensive_report_path}")
  116. # ========== 完成 ==========
  117. print("\n" + "="*80)
  118. print("🎉 完整评估流程完成!")
  119. print("="*80)
  120. print(f"\n📁 输出文件:")
  121. print(f" - 生成数据: {generated_data_path}")
  122. print(f" - 评估结果目录: {evaluation_dir}")
  123. if llm_judge_result:
  124. print(f" - LLM Judge报告: {llm_judge_result.get('report_file', 'N/A')}")
  125. if win_rate_result:
  126. print(f" - Win Rate报告: {win_rate_result.get('report_file', 'N/A')}")
  127. if comprehensive_report_path:
  128. print(f" - 综合报告: {comprehensive_report_path}")
  129. print(f"\n💡 下一步:")
  130. if comprehensive_report_path:
  131. print(f" 1. 查看综合报告: {comprehensive_report_path}")
  132. print(f" 2. 运行人工验证: python data_generation/human_verification_ui.py {generated_data_path}")
  133. return {
  134. "generated_data_path": generated_data_path,
  135. "llm_judge_result": llm_judge_result,
  136. "win_rate_result": win_rate_result,
  137. "comprehensive_report_path": comprehensive_report_path
  138. }
  139. def generate_comprehensive_report(
  140. generated_data_path: str,
  141. llm_judge_result: dict,
  142. win_rate_result: dict
  143. ) -> str:
  144. """生成综合评估报告"""
  145. # 加载生成数据
  146. with open(generated_data_path, 'r', encoding='utf-8') as f:
  147. generated_data = json.load(f)
  148. report = f"""# AIME数据生成与评估综合报告
  149. ## 1. 基本信息
  150. - **生成时间**: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
  151. - **生成题目数量**: {len(generated_data)}
  152. - **参考AIME年份**: 2025
  153. - **生成数据路径**: {generated_data_path}
  154. ## 2. 数据生成统计
  155. ### 主题分布
  156. """
  157. # 统计主题分布
  158. topic_counts = {}
  159. for item in generated_data:
  160. topic = item.get('topic', 'Unknown')
  161. topic_counts[topic] = topic_counts.get(topic, 0) + 1
  162. report += "| 主题 | 数量 | 占比 |\n"
  163. report += "|------|------|------|\n"
  164. for topic, count in sorted(topic_counts.items(), key=lambda x: x[1], reverse=True):
  165. percentage = count / len(generated_data) * 100
  166. report += f"| {topic} | {count} | {percentage:.1f}% |\n"
  167. # LLM Judge结果
  168. if llm_judge_result:
  169. report += "\n## 3. LLM Judge评估结果\n\n"
  170. report += f"""**总体评分**:
  171. - 平均总分: {llm_judge_result['metrics']['average_total_score']:.2f}/5.0
  172. - 通过率: {llm_judge_result['metrics']['pass_rate']:.2%}
  173. - 优秀率: {llm_judge_result['metrics']['excellent_rate']:.2%}
  174. **各维度评分**:
  175. | 维度 | 平均分 |
  176. |------|--------|
  177. | 正确性 | {llm_judge_result['metrics']['dimension_averages']['correctness']:.2f}/5.0 |
  178. | 清晰度 | {llm_judge_result['metrics']['dimension_averages']['clarity']:.2f}/5.0 |
  179. | 难度匹配 | {llm_judge_result['metrics']['dimension_averages']['difficulty_match']:.2f}/5.0 |
  180. | 完整性 | {llm_judge_result['metrics']['dimension_averages']['completeness']:.2f}/5.0 |
  181. """
  182. # Win Rate结果
  183. if win_rate_result:
  184. report += "\n## 4. Win Rate评估结果\n\n"
  185. report += f"""**胜率统计**:
  186. - Win Rate: {win_rate_result['metrics']['win_rate']:.2%}
  187. - Loss Rate: {win_rate_result['metrics']['loss_rate']:.2%}
  188. - Tie Rate: {win_rate_result['metrics']['tie_rate']:.2%}
  189. **对比次数**:
  190. - 总对比次数: {win_rate_result['metrics']['total_comparisons']} 次
  191. - 胜出次数: {win_rate_result['metrics']['wins']} 次
  192. - 失败次数: {win_rate_result['metrics']['losses']} 次
  193. - 平局次数: {win_rate_result['metrics']['ties']} 次
  194. """
  195. # 综合结论
  196. report += "\n## 5. 综合结论\n\n"
  197. if llm_judge_result and win_rate_result:
  198. overall_avg_score = llm_judge_result['metrics']['average_total_score']
  199. overall_win_rate = win_rate_result['metrics']['win_rate']
  200. if overall_avg_score >= 4.5 and overall_win_rate >= 0.48:
  201. report += "✅ **结论**: 生成数据质量**优秀**,达到或超过AIME真题水平。\n"
  202. elif overall_avg_score >= 4.0 and overall_win_rate >= 0.45:
  203. report += "✅ **结论**: 生成数据质量**良好**,接近AIME真题水平。\n"
  204. else:
  205. report += "⚠️ **结论**: 生成数据质量**需要改进**,与AIME真题仍有差距。\n"
  206. report += f"\n**整体指标**:\n"
  207. report += f"- LLM Judge得分: {overall_avg_score:.2f}/5.0\n"
  208. report += f"- Win Rate: {overall_win_rate:.2%}\n"
  209. # 改进建议
  210. report += "\n## 6. 改进建议\n\n"
  211. if llm_judge_result:
  212. avg_score = llm_judge_result['metrics']['average_total_score']
  213. if avg_score >= 4.5:
  214. report += "- ✅ 继续保持当前的生成策略\n"
  215. report += "- ✅ 可以考虑增加生成数量\n"
  216. elif avg_score >= 4.0:
  217. report += "- 🔄 优化题目生成的提示词\n"
  218. report += "- 🔄 增加质量过滤步骤\n"
  219. else:
  220. report += "- ⚠️ 需要重新设计生成提示词\n"
  221. report += "- ⚠️ 考虑使用更强的生成模型\n"
  222. report += "- ⚠️ 增加人工审核环节\n"
  223. # 下一步行动
  224. report += "\n## 7. 下一步行动\n\n"
  225. report += "1. **人工验证**: 运行人工验证界面,对生成的题目进行人工审核\n"
  226. report += f" ```bash\n python data_generation/human_verification_ui.py {generated_data_path}\n ```\n\n"
  227. report += "2. **质量筛选**: 根据评估结果筛选高质量题目\n\n"
  228. report += "3. **迭代优化**: 根据评估反馈优化生成策略\n"
  229. report += f"\n---\n\n*报告生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*\n"
  230. return report
  231. def main():
  232. if len(sys.argv) < 2:
  233. print("用法: python run_complete_evaluation.py <num_problems> [delay_seconds]")
  234. print("\n说明:")
  235. print(" - 使用AIME 2025年真题作为参考")
  236. print(" - 数据集来源: math-ai/aime25(JSONL格式)")
  237. print("\n示例:")
  238. print("python run_complete_evaluation.py 30 3.0")
  239. sys.exit(1)
  240. # 解析命令行参数
  241. num_problems = int(sys.argv[1])
  242. delay_seconds = float(sys.argv[2]) if len(sys.argv) > 2 else 3.0
  243. # 运行完整评估
  244. run_complete_evaluation(
  245. num_problems=num_problems,
  246. delay_seconds=delay_seconds
  247. )
# Run the CLI entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()