"""
Chapter 12, Example 5: GAIA quick start.

Companion to documentation section 12.3.5, "Implementing GAIA evaluation in
HelloAgents" - approach 1.

This is the simplest way to run a GAIA evaluation: a single call to the
evaluation tool performs the whole run.

Important:
1. GAIA is a gated dataset; request access on HuggingFace first.
2. The HF_TOKEN environment variable must be set.
3. The official GAIA system prompt must be used.
"""

import os

from hello_agents import SimpleAgent, HelloAgentsLLM
from hello_agents.tools import GAIAEvaluationTool

# Official GAIA system prompt (must be used as-is; do not reword or reflow it).
GAIA_SYSTEM_PROMPT = """You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string."""

# 1. Set the HuggingFace token (if it is not already set in the environment).
# os.environ["HF_TOKEN"] = "your_huggingface_token_here"

# 2. Create the agent (the official GAIA system prompt is mandatory).
llm = HelloAgentsLLM()
agent = SimpleAgent(
    name="TestAgent",
    llm=llm,
    system_prompt=GAIA_SYSTEM_PROMPT,  # must be the official prompt
)

# 3. Create the GAIA evaluation tool.
gaia_tool = GAIAEvaluationTool()

# 4. Run the evaluation.
results = gaia_tool.run(
    agent=agent,
    level=1,  # evaluation level (1 = easy, 2 = medium, 3 = hard)
    max_samples=2,  # number of samples to evaluate (0 means all)
    export_results=True,  # export results in the official GAIA format
    generate_report=True,  # generate a detailed report
)

# 5. Show the results. The messages below are user-facing runtime output and
#    are intentionally left in their original language.
print("\n评估结果:")
print(f"精确匹配率: {results['exact_match_rate']:.2%}")
print(f"部分匹配率: {results['partial_match_rate']:.2%}")
print(f"正确数: {results['correct_samples']}/{results['total_samples']}")

# Sample run output (quoted verbatim from the program, hence not translated):
# ============================================================
# GAIA一键评估
# ============================================================
#
# 配置:
# 智能体: TestAgent
# 级别: Level 1
# 样本数: 2
#
# ✅ GAIA数据集加载完成
# 数据源: gaia-benchmark/GAIA
# 分割: validation
# 级别: 1
# 样本数: 2
#
# 评估进度: 100%|██████████| 2/2 [00:10<00:00, 5.23s/样本]
#
# ✅ 评估完成
# 总样本数: 2
# 正确样本数: 2
# 精确匹配率: 100.00%
# 部分匹配率: 100.00%
#
# ✅ 结果已导出到 ./evaluation_results/gaia_submission.json
# ✅ 报告已生成到 ./evaluation_results/gaia_report.md
#
# 评估结果:
# 精确匹配率: 100.00%
# 部分匹配率: 100.00%
# 正确数: 2/2