# moki/hello-agents — mirror of https://github.com/datawhalechina/hello-agents.git
"""
第十二章示例6：GAIA评估最佳实践

对应文档：12.3.9 GAIA评估最佳实践

这个示例展示了GAIA评估的最佳实践，包括：
1. 分级评估
2. 小样本快速测试
3. 结果解读
"""

import os
from hello_agents import SimpleAgent, HelloAgentsLLM
from hello_agents.tools import GAIAEvaluationTool

# Official GAIA system prompt. It tells the model to report its reasoning and
# then finish with a "FINAL ANSWER: ..." line, followed by the formatting rules
# for numbers, strings and comma separated lists. The prompt is five lines
# joined with "\n" and has no trailing newline.
GAIA_SYSTEM_PROMPT = "\n".join((
    "You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].",
    "YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.",
    "If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.",
    "If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.",
    "If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.",
))

# Build the agent under evaluation: an LLM client constructed with no
# arguments, wrapped in a SimpleAgent that carries the GAIA system prompt.
llm = HelloAgentsLLM()
agent = SimpleAgent(name="TestAgent", llm=llm, system_prompt=GAIA_SYSTEM_PROMPT)

# Evaluation tool; every section below calls its run(agent, level=..., max_samples=...).
gaia_tool = GAIAEvaluationTool()

# ============================================================
# Best practice 1: tiered evaluation
# ============================================================
print("="*60)
print("最佳实践1：分级评估")
print("="*60)

# Minimum exact-match rate required to move on to the next level.
# The gates are inclusive (>=) so they agree with interpret_results() further
# down, which rates Level 1 >= 60% and Level 2 >= 40% as "excellent". With 10
# samples a score of exactly 6/10 or 4/10 is reachable; a strict ">" would
# report that score as a poor result here and as an excellent one later.
LEVEL1_PASS_RATE = 0.6
LEVEL2_PASS_RATE = 0.4

# Step 1: evaluate Level 1 (easy tasks).
print("\n第一步：评估Level 1（简单任务）")
results_l1 = gaia_tool.run(agent, level=1, max_samples=10)
print(f"Level 1精确匹配率: {results_l1['exact_match_rate']:.2%}")

# Step 2: only run Level 2 when Level 1 cleared its gate.
# NOTE: results_l2 / results_l3 are deliberately left unassigned when a level
# is skipped; the later sections check whether those names exist.
if results_l1['exact_match_rate'] >= LEVEL1_PASS_RATE:
    print("\n第二步：评估Level 2（中等任务）")
    results_l2 = gaia_tool.run(agent, level=2, max_samples=10)
    print(f"Level 2精确匹配率: {results_l2['exact_match_rate']:.2%}")

    # Step 3: only run Level 3 when Level 2 cleared its gate.
    if results_l2['exact_match_rate'] >= LEVEL2_PASS_RATE:
        print("\n第三步：评估Level 3（困难任务）")
        results_l3 = gaia_tool.run(agent, level=3, max_samples=10)
        print(f"Level 3精确匹配率: {results_l3['exact_match_rate']:.2%}")
    else:
        print("\n⚠️ Level 2表现不佳，建议先优化后再评估Level 3")
else:
    print("\n⚠️ Level 1表现不佳，建议先优化后再评估更高级别")

# ============================================================
# Best practice 2: quick small-sample test
# ============================================================
print("", "=" * 60, sep="\n")
print("最佳实践2：小样本快速测试")
print("=" * 60)

# Smoke-test each of the three levels with two samples apiece.
for level in range(1, 4):
    print(f"\n快速测试 Level {level}:")
    results = gaia_tool.run(agent, level=level, max_samples=2)
    print(f"  精确匹配率: {results['exact_match_rate']:.2%}")

# ============================================================
# Best practice 3: interpreting the results
# ============================================================
print("", "=" * 60, sep="\n")
print("最佳实践3：结果解读", "=" * 60, sep="\n")

# Rubric rows, one per GAIA level:
# (level, "excellent" cut-off, "good" cut-off, excellent message, good message,
#  improvement tips printed when the rate falls below the "good" cut-off).
_LEVEL_RUBRIC = (
    (
        1, 0.6, 0.4,
        "✅ 优秀 - 基础能力扎实",
        "⚠️ 良好 - 基础能力可用",
        (
            "  - 检查系统提示词是否包含GAIA官方格式要求",
            "  - 检查答案提取逻辑是否正确",
            "  - 检查LLM模型是否足够强大",
        ),
    ),
    (
        2, 0.4, 0.2,
        "✅ 优秀 - 中等任务能力强",
        "⚠️ 良好 - 中等任务能力可用",
        (
            "  - 增强多步推理能力",
            "  - 增加工具使用能力",
            "  - 优化推理链的构建",
        ),
    ),
    (
        3, 0.2, 0.1,
        "✅ 优秀 - 复杂任务能力强",
        "⚠️ 良好 - 复杂任务能力可用",
        (
            "  - 增强复杂推理能力",
            "  - 增加长上下文处理能力",
            "  - 优化工具链的组合使用",
        ),
    ),
)


def interpret_results(level, exact_match_rate):
    """Print a verdict for one level's exact-match rate.

    A two-line header (level and rate formatted as a percentage) is always
    printed. For levels 1, 2 and 3 it is followed by a rating chosen by
    inclusive cut-offs: "excellent", "good", or "poor" plus a short list of
    improvement suggestions. Any other level prints the header only.

    Args:
        level: Difficulty level the rate belongs to.
        exact_match_rate: Exact-match rate as a fraction (0.6 prints as 60.00%).

    Returns:
        None. All output goes to stdout.
    """
    print(f"\nLevel {level} 结果解读:")
    print(f"精确匹配率: {exact_match_rate:.2%}")

    for rubric_level, excellent_min, good_min, excellent_msg, good_msg, tips in _LEVEL_RUBRIC:
        if level == rubric_level:
            if exact_match_rate >= excellent_min:
                print(excellent_msg)
            elif exact_match_rate >= good_min:
                print(good_msg)
            else:
                print("❌ 较差 - 需要改进")
                print("建议:")
                for tip in tips:
                    print(tip)
            # Only the first matching row applies.
            break

# Interpret whichever tiered-evaluation results exist. results_l2 and
# results_l3 are only assigned when the preceding level passed its gate, so
# each name is checked for existence before it is read.
for _level_no, _result_name in enumerate(("results_l1", "results_l2", "results_l3"), start=1):
    if _result_name in locals():
        interpret_results(_level_no, locals()[_result_name]['exact_match_rate'])

# ============================================================
# Difficulty progression analysis
# ============================================================
print("", "=" * 60, sep="\n")
print("难度递进分析")
print("=" * 60)

# Each row: (easier level's result name, harder level's result name,
# message when the easier level scored strictly higher, message otherwise).
_PROGRESSION_CHECKS = (
    (
        "results_l1", "results_l2",
        "✅ 正常递进：Level 1 > Level 2",
        "⚠️ 异常情况：Level 2 >= Level 1（可能是数据集偏差或智能体特性）",
    ),
    (
        "results_l2", "results_l3",
        "✅ 正常递进：Level 2 > Level 3",
        "⚠️ 异常情况：Level 3 >= Level 2（可能是数据集偏差或智能体特性）",
    ),
)

for _easier, _harder, _normal_msg, _anomaly_msg in _PROGRESSION_CHECKS:
    _scope = locals()
    # A pair is compared only when both levels were actually evaluated.
    if _easier not in _scope or _harder not in _scope:
        continue
    _rate_drops = _scope[_easier]['exact_match_rate'] > _scope[_harder]['exact_match_rate']
    print(_normal_msg if _rate_drops else _anomaly_msg)