# 08_data_generation_llm_judge.py
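"""
Chapter 12, Example 8: LLM Judge evaluation

Corresponding document section: 12.4.3 LLM Judge evaluation

This example shows how to use LLM Judge to evaluate the quality of generated
AIME problems.

LLM Judge scores each problem on four dimensions:
1. Correctness: whether the problem and its answer are correct
2. Clarity: whether the problem statement is clear
3. Difficulty Match: whether the difficulty matches AIME level
4. Completeness: whether the problem is complete
"""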
  11. import sys
  12. import os
  13. import json
  14. # 添加HelloAgents路径
  15. sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "HelloAgents"))
  16. from hello_agents import HelloAgentsLLM
  17. from hello_agents.evaluation import LLMJudge
  18. # 1. 准备生成的题目数据
  19. generated_problems = [
  20. {
  21. "problem_id": "generated_001",
  22. "problem": "Find the number of positive integers $n$ such that $n^2 + 19n + 92$ is a perfect square.",
  23. "answer": "4",
  24. "solution": "Let $n^2 + 19n + 92 = m^2$ for some positive integer $m$..."
  25. },
  26. {
  27. "problem_id": "generated_002",
  28. "problem": "In triangle $ABC$, $AB = 13$, $BC = 14$, and $CA = 15$. Find the area of the triangle.",
  29. "answer": "84",
  30. "solution": "Using Heron's formula, $s = (13+14+15)/2 = 21$..."
  31. }
  32. ]
  33. # 2. 创建LLM Judge评估器
  34. llm = HelloAgentsLLM(model_name="gpt-4o")
  35. judge = LLMJudge(llm=llm)
  36. # 3. 评估每道题目
  37. print("="*60)
  38. print("LLM Judge评估")
  39. print("="*60)
  40. all_scores = []
  41. for i, problem in enumerate(generated_problems, 1):
  42. print(f"\n评估题目 {i}/{len(generated_problems)}")
  43. print(f"题目ID: {problem['problem_id']}")
  44. # 评估单道题目
  45. result = judge.evaluate_single(problem)
  46. # 显示评估结果
  47. print(f"\n评估结果:")
  48. print(f" 正确性: {result['correctness']}/5")
  49. print(f" 清晰度: {result['clarity']}/5")
  50. print(f" 难度匹配: {result['difficulty_match']}/5")
  51. print(f" 完整性: {result['completeness']}/5")
  52. print(f" 平均分: {result['average_score']:.2f}/5")
  53. print(f"\n评语:")
  54. print(f" {result['feedback']}")
  55. all_scores.append(result)
  56. # 4. 计算总体统计
  57. print("\n" + "="*60)
  58. print("总体统计")
  59. print("="*60)
  60. avg_correctness = sum(s['correctness'] for s in all_scores) / len(all_scores)
  61. avg_clarity = sum(s['clarity'] for s in all_scores) / len(all_scores)
  62. avg_difficulty = sum(s['difficulty_match'] for s in all_scores) / len(all_scores)
  63. avg_completeness = sum(s['completeness'] for s in all_scores) / len(all_scores)
  64. avg_overall = sum(s['average_score'] for s in all_scores) / len(all_scores)
  65. print(f"\n平均分:")
  66. print(f" 正确性: {avg_correctness:.2f}/5")
  67. print(f" 清晰度: {avg_clarity:.2f}/5")
  68. print(f" 难度匹配: {avg_difficulty:.2f}/5")
  69. print(f" 完整性: {avg_completeness:.2f}/5")
  70. print(f" 总体平均: {avg_overall:.2f}/5")
  71. # 5. 质量评估
  72. print(f"\n质量评估:")
  73. if avg_overall >= 4.0:
  74. print("✅ 优秀 - 题目质量很高,可以直接使用")
  75. elif avg_overall >= 3.0:
  76. print("⚠️ 良好 - 题目质量可用,建议人工审核")
  77. elif avg_overall >= 2.0:
  78. print("⚠️ 一般 - 题目质量一般,需要大幅改进")
  79. else:
  80. print("❌ 较差 - 题目质量差,需要重新生成")
  81. # 6. 保存评估结果
  82. output_file = "./evaluation_results/llm_judge_results.json"
  83. os.makedirs(os.path.dirname(output_file), exist_ok=True)
  84. with open(output_file, 'w', encoding='utf-8') as f:
  85. json.dump({
  86. 'problems': generated_problems,
  87. 'scores': all_scores,
  88. 'statistics': {
  89. 'avg_correctness': avg_correctness,
  90. 'avg_clarity': avg_clarity,
  91. 'avg_difficulty': avg_difficulty,
  92. 'avg_completeness': avg_completeness,
  93. 'avg_overall': avg_overall
  94. }
  95. }, f, indent=2, ensure_ascii=False)
  96. print(f"\n✅ 评估结果已保存到 {output_file}")
# Sample run output:
# ============================================================
# LLM Judge评估
# ============================================================
#
# 评估题目 1/2
# 题目ID: generated_001
#
# 评估结果:
# 正确性: 5/5
# 清晰度: 4/5
# 难度匹配: 5/5
# 完整性: 5/5
# 平均分: 4.75/5
#
# 评语:
# This is an excellent AIME-level problem. The problem is well-posed,
# the solution is correct, and the difficulty is appropriate.
#
# 评估题目 2/2
# 题目ID: generated_002
#
# 评估结果:
# 正确性: 5/5
# 清晰度: 5/5
# 难度匹配: 3/5
# 完整性: 5/5
# 平均分: 4.50/5
#
# 评语:
# The problem is correct and clear, but the difficulty is slightly
# below AIME level. Consider adding more complexity.
#
# ============================================================
# 总体统计
# ============================================================
#
# 平均分:
# 正确性: 5.00/5
# 清晰度: 4.50/5
# 难度匹配: 4.00/5
# 完整性: 5.00/5
# 总体平均: 4.62/5
#
# 质量评估:
# ✅ 优秀 - 题目质量很高,可以直接使用
#
# ✅ 评估结果已保存到 ./evaluation_results/llm_judge_results.json