# 05_gaia_quick_start.py

"""
Chapter 12, Example 5: GAIA quick start.

Corresponding documentation: 12.3.5 "Implementing GAIA evaluation in
HelloAgents" - Approach 1.

This is the simplest way to run a GAIA evaluation: the whole evaluation is
done in a single line of code.

Important:
1. GAIA is a gated dataset; request access on HuggingFace first.
2. The HF_TOKEN environment variable must be set.
3. The official GAIA system prompt must be used.
"""
import os

from hello_agents import SimpleAgent, HelloAgentsLLM
from hello_agents.tools import GAIAEvaluationTool
  13. # GAIA官方系统提示词(必须使用)
  14. GAIA_SYSTEM_PROMPT = """You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
  15. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
  16. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
  17. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
  18. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string."""
  19. # 1. 设置HuggingFace Token(如果还没设置)
  20. # os.environ["HF_TOKEN"] = "your_huggingface_token_here"
  21. # 2. 创建智能体(必须使用GAIA官方系统提示词)
  22. llm = HelloAgentsLLM()
  23. agent = SimpleAgent(
  24. name="TestAgent",
  25. llm=llm,
  26. system_prompt=GAIA_SYSTEM_PROMPT # 必须使用官方提示词
  27. )
  28. # 3. 创建GAIA评估工具
  29. gaia_tool = GAIAEvaluationTool()
  30. # 4. 运行评估
  31. results = gaia_tool.run(
  32. agent=agent,
  33. level=1, # 评估级别(1=简单,2=中等,3=困难)
  34. max_samples=2, # 评估样本数(0表示全部)
  35. export_results=True, # 导出结果到GAIA官方格式
  36. generate_report=True # 生成详细报告
  37. )
  38. # 5. 查看结果
  39. print(f"\n评估结果:")
  40. print(f"精确匹配率: {results['exact_match_rate']:.2%}")
  41. print(f"部分匹配率: {results['partial_match_rate']:.2%}")
  42. print(f"正确数: {results['correct_samples']}/{results['total_samples']}")
# Sample output (program output reproduced verbatim):
# ============================================================
# GAIA一键评估
# ============================================================
#
# 配置:
# 智能体: TestAgent
# 级别: Level 1
# 样本数: 2
#
# ✅ GAIA数据集加载完成
# 数据源: gaia-benchmark/GAIA
# 分割: validation
# 级别: 1
# 样本数: 2
#
# 评估进度: 100%|██████████| 2/2 [00:10<00:00, 5.23s/样本]
#
# ✅ 评估完成
# 总样本数: 2
# 正确样本数: 2
# 精确匹配率: 100.00%
# 部分匹配率: 100.00%
#
# ✅ 结果已导出到 ./evaluation_results/gaia_submission.json
# ✅ 报告已生成到 ./evaluation_results/gaia_report.md
#
# 评估结果:
# 精确匹配率: 100.00%
# 部分匹配率: 100.00%
# 正确数: 2/2