# 06_gaia_best_practices.py
"""
第十二章示例6:GAIA评估最佳实践
对应文档:12.3.9 GAIA评估最佳实践
这个示例展示了GAIA评估的最佳实践,包括:
1. 分级评估
2. 小样本快速测试
3. 结果解读
"""
  9. import os
  10. from hello_agents import SimpleAgent, HelloAgentsLLM
  11. from hello_agents.tools import GAIAEvaluationTool
  12. # GAIA官方系统提示词
  13. GAIA_SYSTEM_PROMPT = """You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
  14. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
  15. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
  16. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
  17. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string."""
  18. # 创建智能体
  19. llm = HelloAgentsLLM()
  20. agent = SimpleAgent(
  21. name="TestAgent",
  22. llm=llm,
  23. system_prompt=GAIA_SYSTEM_PROMPT
  24. )
  25. # 创建评估工具
  26. gaia_tool = GAIAEvaluationTool()
  27. # ============================================================
  28. # 最佳实践1:分级评估
  29. # ============================================================
  30. print("="*60)
  31. print("最佳实践1:分级评估")
  32. print("="*60)
  33. # 第一步:评估Level 1(简单任务)
  34. print("\n第一步:评估Level 1(简单任务)")
  35. results_l1 = gaia_tool.run(agent, level=1, max_samples=10)
  36. print(f"Level 1精确匹配率: {results_l1['exact_match_rate']:.2%}")
  37. # 第二步:如果Level 1表现良好,评估Level 2
  38. if results_l1['exact_match_rate'] > 0.6:
  39. print("\n第二步:评估Level 2(中等任务)")
  40. results_l2 = gaia_tool.run(agent, level=2, max_samples=10)
  41. print(f"Level 2精确匹配率: {results_l2['exact_match_rate']:.2%}")
  42. # 第三步:如果Level 2表现良好,评估Level 3
  43. if results_l2['exact_match_rate'] > 0.4:
  44. print("\n第三步:评估Level 3(困难任务)")
  45. results_l3 = gaia_tool.run(agent, level=3, max_samples=10)
  46. print(f"Level 3精确匹配率: {results_l3['exact_match_rate']:.2%}")
  47. else:
  48. print("\n⚠️ Level 2表现不佳,建议先优化后再评估Level 3")
  49. else:
  50. print("\n⚠️ Level 1表现不佳,建议先优化后再评估更高级别")
  51. # ============================================================
  52. # 最佳实践2:小样本快速测试
  53. # ============================================================
  54. print("\n" + "="*60)
  55. print("最佳实践2:小样本快速测试")
  56. print("="*60)
  57. # 快速测试(每个级别2个样本)
  58. for level in [1, 2, 3]:
  59. print(f"\n快速测试 Level {level}:")
  60. results = gaia_tool.run(agent, level=level, max_samples=2)
  61. print(f" 精确匹配率: {results['exact_match_rate']:.2%}")
  62. # ============================================================
  63. # 最佳实践3:结果解读
  64. # ============================================================
  65. print("\n" + "="*60)
  66. print("最佳实践3:结果解读")
  67. print("="*60)
  68. def interpret_results(level, exact_match_rate):
  69. """解读评估结果"""
  70. print(f"\nLevel {level} 结果解读:")
  71. print(f"精确匹配率: {exact_match_rate:.2%}")
  72. if level == 1:
  73. if exact_match_rate >= 0.6:
  74. print("✅ 优秀 - 基础能力扎实")
  75. elif exact_match_rate >= 0.4:
  76. print("⚠️ 良好 - 基础能力可用")
  77. else:
  78. print("❌ 较差 - 需要改进")
  79. print("建议:")
  80. print(" - 检查系统提示词是否包含GAIA官方格式要求")
  81. print(" - 检查答案提取逻辑是否正确")
  82. print(" - 检查LLM模型是否足够强大")
  83. elif level == 2:
  84. if exact_match_rate >= 0.4:
  85. print("✅ 优秀 - 中等任务能力强")
  86. elif exact_match_rate >= 0.2:
  87. print("⚠️ 良好 - 中等任务能力可用")
  88. else:
  89. print("❌ 较差 - 需要改进")
  90. print("建议:")
  91. print(" - 增强多步推理能力")
  92. print(" - 增加工具使用能力")
  93. print(" - 优化推理链的构建")
  94. elif level == 3:
  95. if exact_match_rate >= 0.2:
  96. print("✅ 优秀 - 复杂任务能力强")
  97. elif exact_match_rate >= 0.1:
  98. print("⚠️ 良好 - 复杂任务能力可用")
  99. else:
  100. print("❌ 较差 - 需要改进")
  101. print("建议:")
  102. print(" - 增强复杂推理能力")
  103. print(" - 增加长上下文处理能力")
  104. print(" - 优化工具链的组合使用")
  105. # 解读结果
  106. if 'results_l1' in locals():
  107. interpret_results(1, results_l1['exact_match_rate'])
  108. if 'results_l2' in locals():
  109. interpret_results(2, results_l2['exact_match_rate'])
  110. if 'results_l3' in locals():
  111. interpret_results(3, results_l3['exact_match_rate'])
  112. # ============================================================
  113. # 难度递进分析
  114. # ============================================================
  115. print("\n" + "="*60)
  116. print("难度递进分析")
  117. print("="*60)
  118. if 'results_l1' in locals() and 'results_l2' in locals():
  119. if results_l1['exact_match_rate'] > results_l2['exact_match_rate']:
  120. print("✅ 正常递进:Level 1 > Level 2")
  121. else:
  122. print("⚠️ 异常情况:Level 2 >= Level 1(可能是数据集偏差或智能体特性)")
  123. if 'results_l2' in locals() and 'results_l3' in locals():
  124. if results_l2['exact_match_rate'] > results_l3['exact_match_rate']:
  125. print("✅ 正常递进:Level 2 > Level 3")
  126. else:
  127. print("⚠️ 异常情况:Level 3 >= Level 2(可能是数据集偏差或智能体特性)")