09_data_generation_win_rate.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170
  1. """
  2. 第十二章示例9:Win Rate评估
  3. 对应文档:12.4.4 Win Rate评估
  4. 这个示例展示如何使用Win Rate评估生成的AIME题目质量。
  5. Win Rate评估通过对比生成题目和真题,评估生成质量:
  6. - Win Rate = 50%:生成质量与真题相当(理想情况)
  7. - Win Rate > 50%:生成质量优于真题(可能是评估偏差)
  8. - Win Rate < 50%:生成质量低于真题(需要改进)
  9. """
  10. import sys
  11. import os
  12. import json
  13. # 添加HelloAgents路径
  14. sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "HelloAgents"))
  15. from hello_agents import HelloAgentsLLM
  16. from hello_agents.evaluation import WinRateEvaluator, AIDataset
  17. # 1. 准备生成的题目数据
  18. generated_problems = [
  19. {
  20. "problem_id": "generated_001",
  21. "problem": "Find the number of positive integers $n$ such that $n^2 + 19n + 92$ is a perfect square.",
  22. "answer": "4"
  23. },
  24. {
  25. "problem_id": "generated_002",
  26. "problem": "In triangle $ABC$, $AB = 13$, $BC = 14$, and $CA = 15$. Find the area of the triangle.",
  27. "answer": "84"
  28. },
  29. {
  30. "problem_id": "generated_003",
  31. "problem": "How many positive integers less than 1000 are divisible by 7 but not by 11?",
  32. "answer": "129"
  33. }
  34. ]
  35. # 2. 加载参考数据集(AIME真题)
  36. print("="*60)
  37. print("Win Rate评估")
  38. print("="*60)
  39. print("\n加载参考数据集...")
  40. dataset = AIDataset()
  41. reference_problems = dataset.load()
  42. print(f"✅ 已加载 {len(reference_problems)} 道AIME真题")
  43. # 3. 创建Win Rate评估器
  44. llm = HelloAgentsLLM(model_name="gpt-4o")
  45. evaluator = WinRateEvaluator(
  46. llm=llm,
  47. reference_problems=reference_problems
  48. )
  49. # 4. 运行Win Rate评估
  50. print(f"\n开始Win Rate评估...")
  51. print(f" 生成题目数: {len(generated_problems)}")
  52. print(f" 对比数量: 20")
  53. results = evaluator.evaluate(
  54. generated_problems=generated_problems,
  55. num_comparisons=20 # 进行20次对比
  56. )
  57. # 5. 显示评估结果
  58. print("\n" + "="*60)
  59. print("评估结果")
  60. print("="*60)
  61. print(f"\nWin Rate: {results['win_rate']:.2%}")
  62. print(f"Tie Rate: {results['tie_rate']:.2%}")
  63. print(f"Loss Rate: {results['loss_rate']:.2%}")
  64. print(f"\n详细统计:")
  65. print(f" 总对比数: {results['total_comparisons']}")
  66. print(f" 生成题目胜: {results['wins']}")
  67. print(f" 平局: {results['ties']}")
  68. print(f" 真题胜: {results['losses']}")
  69. # 6. 质量评估
  70. print(f"\n质量评估:")
  71. win_rate = results['win_rate']
  72. if 0.45 <= win_rate <= 0.55:
  73. print("✅ 优秀 - 生成质量接近AIME真题水平")
  74. elif 0.35 <= win_rate < 0.45:
  75. print("⚠️ 良好 - 生成质量可用,但略低于真题")
  76. elif 0.25 <= win_rate < 0.35:
  77. print("⚠️ 一般 - 生成质量一般,需要改进")
  78. else:
  79. print("❌ 较差 - 生成质量差,需要大幅改进")
  80. # 7. 查看部分对比详情
  81. print("\n" + "="*60)
  82. print("对比详情(前5个)")
  83. print("="*60)
  84. for i, comparison in enumerate(results['comparisons'][:5], 1):
  85. print(f"\n对比 {i}:")
  86. print(f" 生成题目: {comparison['generated_problem'][:60]}...")
  87. print(f" 真题: {comparison['reference_problem'][:60]}...")
  88. print(f" 结果: {comparison['result']}")
  89. if 'reason' in comparison:
  90. print(f" 理由: {comparison['reason'][:100]}...")
  91. # 8. 保存评估结果
  92. output_file = "./evaluation_results/win_rate_results.json"
  93. os.makedirs(os.path.dirname(output_file), exist_ok=True)
  94. with open(output_file, 'w', encoding='utf-8') as f:
  95. json.dump(results, f, indent=2, ensure_ascii=False)
  96. print(f"\n✅ 评估结果已保存到 {output_file}")
  97. # 运行输出示例:
  98. # ============================================================
  99. # Win Rate评估
  100. # ============================================================
  101. #
  102. # 加载参考数据集...
  103. # ✅ 已加载 963 道AIME真题
  104. #
  105. # 开始Win Rate评估...
  106. # 生成题目数: 3
  107. # 对比数量: 20
  108. #
  109. # Win Rate评估: 100%|██████████| 20/20 [01:00<00:00, 3.01s/对比]
  110. #
  111. # ============================================================
  112. # 评估结果
  113. # ============================================================
  114. #
  115. # Win Rate: 45.00%
  116. # Tie Rate: 10.00%
  117. # Loss Rate: 45.00%
  118. #
  119. # 详细统计:
  120. # 总对比数: 20
  121. # 生成题目胜: 9
  122. # 平局: 2
  123. # 真题胜: 9
  124. #
  125. # 质量评估:
  126. # ✅ 优秀 - 生成质量接近AIME真题水平
  127. #
  128. # ============================================================
  129. # 对比详情(前5个)
  130. # ============================================================
  131. #
  132. # 对比 1:
  133. # 生成题目: Find the number of positive integers $n$ such that $n^2 + 19...
  134. # 真题: Let $N$ be the number of consecutive $0$'s at the right end...
  135. # 结果: generated
  136. # 理由: The generated problem has a clearer problem statement and a mo...
  137. #
  138. # 对比 2:
  139. # 生成题目: In triangle $ABC$, $AB = 13$, $BC = 14$, and $CA = 15$. F...
  140. # 真题: Find the number of ordered pairs $(m,n)$ of positive integers...
  141. # 结果: reference
  142. # 理由: The reference problem is more challenging and requires deeper...
  143. #
  144. # ...
  145. #
  146. # ✅ 评估结果已保存到 ./evaluation_results/win_rate_results.json