"""
第十二章示例6:GAIA评估最佳实践
对应文档:12.3.9 GAIA评估最佳实践
这个示例展示了GAIA评估的最佳实践,包括:
1. 分级评估
2. 小样本快速测试
3. 结果解读
"""
- import os
- from hello_agents import SimpleAgent, HelloAgentsLLM
- from hello_agents.tools import GAIAEvaluationTool
# Official GAIA system prompt.
# Assembled from one string per prompt line so the line breaks are explicit
# and no stray list/diff markers can leak into the text sent to the model.
GAIA_SYSTEM_PROMPT = "\n".join((
    "You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].",
    "YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.",
    "If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.",
    "If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.",
    "If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.",
))
# Build the agent under test: an LLM client constructed with no arguments,
# wrapped in a SimpleAgent that is steered by the GAIA system prompt.
llm = HelloAgentsLLM()
agent = SimpleAgent(system_prompt=GAIA_SYSTEM_PROMPT, llm=llm, name="TestAgent")

# Evaluation tool; the sections below call its run() with the agent, a GAIA
# level and a sample cap.
gaia_tool = GAIAEvaluationTool()
# ============================================================
# Best practice 1: graded evaluation
# ============================================================
print("="*60)
print("最佳实践1:分级评估")
print("="*60)

# Minimum exact-match rate required before moving on to the next level.
# These are the same values interpret_results() uses as the "excellent"
# cut-offs for Level 1 and Level 2, and they are compared inclusively (>=)
# exactly as they are there. With 10 samples a score of exactly 6/10 or 4/10
# is a realistic outcome; a strict `>` would call it a poor result here while
# interpret_results() rates the very same number as excellent.
LEVEL1_PASS_RATE = 0.6
LEVEL2_PASS_RATE = 0.4

# Step 1: evaluate Level 1 (simple tasks).
print("\n第一步:评估Level 1(简单任务)")
results_l1 = gaia_tool.run(agent, level=1, max_samples=10)
print(f"Level 1精确匹配率: {results_l1['exact_match_rate']:.2%}")

# NOTE: results_l2 / results_l3 are intentionally left undefined when their
# level is skipped; the interpretation code further down checks for their
# presence before using them.

# Step 2: only if Level 1 went well, evaluate Level 2 (medium tasks).
if results_l1['exact_match_rate'] >= LEVEL1_PASS_RATE:
    print("\n第二步:评估Level 2(中等任务)")
    results_l2 = gaia_tool.run(agent, level=2, max_samples=10)
    print(f"Level 2精确匹配率: {results_l2['exact_match_rate']:.2%}")

    # Step 3: only if Level 2 went well, evaluate Level 3 (hard tasks).
    if results_l2['exact_match_rate'] >= LEVEL2_PASS_RATE:
        print("\n第三步:评估Level 3(困难任务)")
        results_l3 = gaia_tool.run(agent, level=3, max_samples=10)
        print(f"Level 3精确匹配率: {results_l3['exact_match_rate']:.2%}")
    else:
        print("\n⚠️ Level 2表现不佳,建议先优化后再评估Level 3")
else:
    print("\n⚠️ Level 1表现不佳,建议先优化后再评估更高级别")
# ============================================================
# Best practice 2: quick small-sample test
# ============================================================
print(f"\n{'=' * 60}")
print("最佳实践2:小样本快速测试")
print("=" * 60)

# Smoke test: run every level with only 2 samples each.
for level in (1, 2, 3):
    print(f"\n快速测试 Level {level}:")
    results = gaia_tool.run(agent, max_samples=2, level=level)
    quick_rate = results['exact_match_rate']
    print(f" 精确匹配率: {quick_rate:.2%}")
# ============================================================
# Best practice 3: interpreting results
# ============================================================
# One print call emitting the same three banner lines (blank line, rule,
# title, rule) that separate this section from the previous one.
print("\n" + "=" * 60, "最佳实践3:结果解读", "=" * 60, sep="\n")
def interpret_results(level: int, exact_match_rate: float) -> None:
    """Print a human-readable reading of one GAIA level's exact-match rate.

    The rate is judged against per-level cut-offs (both inclusive):

        Level 1: excellent >= 0.6, good >= 0.4
        Level 2: excellent >= 0.4, good >= 0.2
        Level 3: excellent >= 0.2, good >= 0.1

    A rate below the "good" cut-off is reported as poor and followed by a
    short list of improvement suggestions for that level.

    Args:
        level: GAIA difficulty level (1, 2 or 3). For any other value only
            the header and an "unsupported level" notice are printed.
        exact_match_rate: Exact-match rate as a ratio; it is printed as a
            percentage with two decimals.

    Returns:
        None. Everything is written to stdout.
    """
    print(f"\nLevel {level} 结果解读:")
    print(f"精确匹配率: {exact_match_rate:.2%}")

    # level -> (excellent cut-off, excellent verdict,
    #           good cut-off, good verdict,
    #           suggestions shown with a poor verdict)
    rubric = {
        1: (
            0.6, "✅ 优秀 - 基础能力扎实",
            0.4, "⚠️ 良好 - 基础能力可用",
            (
                "检查系统提示词是否包含GAIA官方格式要求",
                "检查答案提取逻辑是否正确",
                "检查LLM模型是否足够强大",
            ),
        ),
        2: (
            0.4, "✅ 优秀 - 中等任务能力强",
            0.2, "⚠️ 良好 - 中等任务能力可用",
            (
                "增强多步推理能力",
                "增加工具使用能力",
                "优化推理链的构建",
            ),
        ),
        3: (
            0.2, "✅ 优秀 - 复杂任务能力强",
            0.1, "⚠️ 良好 - 复杂任务能力可用",
            (
                "增强复杂推理能力",
                "增加长上下文处理能力",
                "优化工具链的组合使用",
            ),
        ),
    }

    entry = rubric.get(level)
    if entry is None:
        # Previously an unknown level printed the header and then nothing,
        # which looked like a truncated report; say so explicitly instead.
        print(f"⚠️ 未知级别: {level}(仅支持 Level 1-3)")
        return

    excellent_min, excellent_msg, good_min, good_msg, suggestions = entry
    if exact_match_rate >= excellent_min:
        print(excellent_msg)
    elif exact_match_rate >= good_min:
        print(good_msg)
    else:
        # NOTE(review): suggestions are shown only with the poor verdict,
        # which is the natural reading of the original; confirm if the
        # intent was to show them for every verdict.
        print("❌ 较差 - 需要改进")
        print("建议:")
        for tip in suggestions:
            print(f" - {tip}")
# Interpret whichever levels were actually evaluated. A level that was
# skipped never had its results_lN variable assigned, so presence in the
# current namespace is the test for "was this level run".
evaluated = locals()
for lvl in (1, 2, 3):
    result_name = f"results_l{lvl}"
    if result_name in evaluated:
        interpret_results(lvl, evaluated[result_name]['exact_match_rate'])

# ============================================================
# Difficulty progression analysis
# ============================================================
print(f"\n{'=' * 60}")
print("难度递进分析")
print("=" * 60)

# Compare each pair of adjacent levels that were both evaluated: the easier
# level is expected to score strictly higher than the harder one.
for easier, harder in ((1, 2), (2, 3)):
    easier_name = f"results_l{easier}"
    harder_name = f"results_l{harder}"
    if easier_name not in evaluated or harder_name not in evaluated:
        continue
    easier_rate = evaluated[easier_name]['exact_match_rate']
    harder_rate = evaluated[harder_name]['exact_match_rate']
    if easier_rate > harder_rate:
        print(f"✅ 正常递进:Level {easier} > Level {harder}")
    else:
        print(f"⚠️ 异常情况:Level {harder} >= Level {easier}(可能是数据集偏差或智能体特性)")
|