1
0

04_run_bfcl_evaluation.py 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293
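"""
Chapter 12: one-step BFCL evaluation script.

This script runs the complete BFCL evaluation workflow:
1. Automatically check the BFCL data (prints clone instructions when it is missing)
2. Run the HelloAgents evaluation
3. Export the results in BFCL format
4. Invoke the official BFCL evaluation tool
5. Display the evaluation results

Usage:
    python examples/04_run_bfcl_evaluation.py

Optional arguments:
    --category: evaluation category (default: simple_python)
    --samples: number of samples (default: 5; 0 means all samples)
    --model-name: model name (default: Qwen/Qwen3-8B; must be a model
                  supported by BFCL, run 'bfcl models' to list them)
"""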
  16. import sys
  17. import subprocess
  18. from pathlib import Path
  19. import argparse
  20. import json
# Add the project path: the directory two levels above this file is put at
# the front of sys.path (presumably so the hello_agents imports below resolve
# from the source tree -- confirm against the repository layout).
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))
  24. from hello_agents import SimpleAgent, HelloAgentsLLM
  25. from hello_agents.evaluation import BFCLDataset, BFCLEvaluator
# System prompt for function calling.
# Passed as `system_prompt` to the SimpleAgent built in run_evaluation().
# The prompt text (kept in Chinese, it is runtime data) instructs the model to
# answer with a bare JSON array of {"name": ..., "arguments": {...}} objects,
# or an empty array when no function call is needed.
FUNCTION_CALLING_SYSTEM_PROMPT = """你是一个专业的函数调用助手。
你的任务是:根据用户的问题和提供的函数定义,生成正确的函数调用。
输出格式要求:
1. 必须是纯JSON格式,不要添加任何解释文字
2. 使用JSON数组格式:[{"name": "函数名", "arguments": {"参数名": "参数值"}}]
3. 如果需要调用多个函数,在数组中添加多个对象
4. 如果不需要调用函数,返回空数组:[]
示例:
用户问题:查询北京的天气
可用函数:get_weather(city: str)
正确输出:[{"name": "get_weather", "arguments": {"city": "北京"}}]
注意:
- 只输出JSON,不要添加"好的"、"我来帮你"等额外文字
- 参数值必须与函数定义的类型匹配
- 参数名必须与函数定义完全一致
"""
  43. def check_bfcl_data(bfcl_data_dir: Path) -> bool:
  44. """检查BFCL数据是否存在"""
  45. if not bfcl_data_dir.exists():
  46. print(f"\n❌ BFCL数据目录不存在: {bfcl_data_dir}")
  47. print(f"\n请先克隆BFCL仓库:")
  48. print(f" git clone --depth 1 https://github.com/ShishirPatil/gorilla.git temp_gorilla")
  49. return False
  50. return True
  51. def run_evaluation(category: str, max_samples: int, model_name: str) -> dict:
  52. """运行HelloAgents评估"""
  53. print("\n" + "="*60)
  54. print("步骤1: 运行HelloAgents评估")
  55. print("="*60)
  56. # BFCL数据目录
  57. bfcl_data_dir = project_root / "temp_gorilla" / "berkeley-function-call-leaderboard" / "bfcl_eval" / "data"
  58. # 检查数据
  59. if not check_bfcl_data(bfcl_data_dir):
  60. return None
  61. # 加载数据集
  62. print(f"\n📚 加载BFCL数据集...")
  63. dataset = BFCLDataset(bfcl_data_dir=str(bfcl_data_dir), category=category)
  64. # 创建智能体
  65. print(f"\n🤖 创建智能体...")
  66. llm = HelloAgentsLLM()
  67. agent = SimpleAgent(
  68. name=model_name,
  69. llm=llm,
  70. system_prompt=FUNCTION_CALLING_SYSTEM_PROMPT,
  71. enable_tool_calling=False
  72. )
  73. print(f" 智能体: {model_name}")
  74. print(f" LLM: {llm.provider}")
  75. # 创建评估器
  76. evaluator = BFCLEvaluator(dataset=dataset, category=category)
  77. # 运行评估(传递max_samples参数)
  78. print(f"\n🔄 开始评估...")
  79. if max_samples > 0:
  80. print(f" 样本数量: {max_samples}")
  81. results = evaluator.evaluate(agent, max_samples=max_samples)
  82. else:
  83. print(f" 样本数量: 全部")
  84. results = evaluator.evaluate(agent, max_samples=None)
  85. # 显示结果
  86. print(f"\n📊 评估结果:")
  87. print(f" 准确率: {results['overall_accuracy']:.2%}")
  88. print(f" 正确数: {results['correct_samples']}/{results['total_samples']}")
  89. return results
  90. def export_bfcl_format(results: dict, category: str, model_name: str) -> Path:
  91. """导出BFCL格式结果"""
  92. print("\n" + "="*60)
  93. print("步骤2: 导出BFCL格式结果")
  94. print("="*60)
  95. # 输出目录
  96. output_dir = project_root / "evaluation_results" / "bfcl_official"
  97. output_dir.mkdir(parents=True, exist_ok=True)
  98. # 输出文件
  99. output_file = output_dir / f"BFCL_v4_{category}_result.json"
  100. # 创建评估器(用于导出)
  101. bfcl_data_dir = project_root / "temp_gorilla" / "berkeley-function-call-leaderboard" / "bfcl_eval" / "data"
  102. dataset = BFCLDataset(bfcl_data_dir=str(bfcl_data_dir), category=category)
  103. evaluator = BFCLEvaluator(dataset=dataset, category=category)
  104. # 导出
  105. evaluator.export_to_bfcl_format(results, output_file)
  106. return output_file
  107. def copy_to_bfcl_result_dir(source_file: Path, model_name: str, category: str) -> Path:
  108. """复制结果文件到BFCL结果目录"""
  109. print("\n" + "="*60)
  110. print("步骤3: 准备BFCL官方评估")
  111. print("="*60)
  112. # BFCL结果目录
  113. # 注意:BFCL会将模型名中的"/"替换为"_"
  114. safe_model_name = model_name.replace("/", "_")
  115. result_dir = project_root / "result" / safe_model_name
  116. result_dir.mkdir(parents=True, exist_ok=True)
  117. # 目标文件
  118. target_file = result_dir / f"BFCL_v4_{category}_result.json"
  119. # 复制文件
  120. import shutil
  121. shutil.copy(source_file, target_file)
  122. print(f"\n✅ 结果文件已复制到:")
  123. print(f" {target_file}")
  124. return target_file
  125. def run_bfcl_official_eval(model_name: str, category: str) -> bool:
  126. """运行BFCL官方评估"""
  127. print("\n" + "="*60)
  128. print("步骤4: 运行BFCL官方评估")
  129. print("="*60)
  130. try:
  131. # 设置环境变量
  132. import os
  133. os.environ['PYTHONUTF8'] = '1'
  134. # 运行BFCL评估
  135. cmd = [
  136. "bfcl", "evaluate",
  137. "--model", model_name,
  138. "--test-category", category,
  139. "--partial-eval"
  140. ]
  141. print(f"\n🔄 运行命令: {' '.join(cmd)}")
  142. result = subprocess.run(
  143. cmd,
  144. cwd=str(project_root),
  145. capture_output=True,
  146. text=True,
  147. encoding='utf-8'
  148. )
  149. # 显示输出
  150. if result.stdout:
  151. print(result.stdout)
  152. if result.returncode != 0:
  153. print(f"\n❌ BFCL评估失败:")
  154. if result.stderr:
  155. print(result.stderr)
  156. return False
  157. return True
  158. except FileNotFoundError:
  159. print("\n❌ 未找到bfcl命令")
  160. print(" 请先安装: pip install bfcl-eval")
  161. return False
  162. except Exception as e:
  163. print(f"\n❌ 运行BFCL评估时出错: {e}")
  164. return False
  165. def show_results(model_name: str, category: str):
  166. """展示评估结果"""
  167. print("\n" + "="*60)
  168. print("步骤5: 展示评估结果")
  169. print("="*60)
  170. # CSV文件
  171. csv_file = project_root / "score" / "data_non_live.csv"
  172. if csv_file.exists():
  173. print(f"\n📊 评估结果汇总:")
  174. with open(csv_file, 'r', encoding='utf-8') as f:
  175. content = f.read()
  176. print(content)
  177. else:
  178. print(f"\n⚠️ 未找到评估结果文件: {csv_file}")
  179. # 详细评分文件
  180. safe_model_name = model_name.replace("/", "_")
  181. score_file = project_root / "score" / safe_model_name / "non_live" / f"BFCL_v4_{category}_score.json"
  182. if score_file.exists():
  183. print(f"\n📝 详细评分文件:")
  184. print(f" {score_file}")
  185. # 读取并显示准确率
  186. with open(score_file, 'r', encoding='utf-8') as f:
  187. first_line = f.readline()
  188. summary = json.loads(first_line)
  189. print(f"\n🎯 最终结果:")
  190. print(f" 准确率: {summary['accuracy']:.2%}")
  191. print(f" 正确数: {summary['correct_count']}/{summary['total_count']}")
  192. def main():
  193. """主函数"""
  194. parser = argparse.ArgumentParser(description="BFCL一键评估脚本")
  195. parser.add_argument("--category", default="simple_python", help="评估类别")
  196. parser.add_argument("--samples", type=int, default=5, help="样本数量(0表示全部)")
  197. parser.add_argument("--model-name", default="Qwen/Qwen3-8B",
  198. help="模型名称(必须是BFCL支持的模型,运行'bfcl models'查看)")
  199. args = parser.parse_args()
  200. print("="*60)
  201. print("BFCL一键评估脚本")
  202. print("="*60)
  203. print(f"\n配置:")
  204. print(f" 评估类别: {args.category}")
  205. print(f" 样本数量: {args.samples if args.samples > 0 else '全部'}")
  206. print(f" 模型名称: {args.model_name}")
  207. # 步骤1: 运行评估
  208. results = run_evaluation(args.category, args.samples, args.model_name)
  209. if not results:
  210. return
  211. # 步骤2: 导出BFCL格式
  212. output_file = export_bfcl_format(results, args.category, args.model_name)
  213. # 步骤3: 复制到BFCL结果目录
  214. copy_to_bfcl_result_dir(output_file, args.model_name, args.category)
  215. # 步骤4: 运行BFCL官方评估
  216. if not run_bfcl_official_eval(args.model_name, args.category):
  217. print("\n⚠️ BFCL官方评估失败,但HelloAgents评估已完成")
  218. return
  219. # 步骤5: 展示结果
  220. show_results(args.model_name, args.category)
  221. print("\n" + "="*60)
  222. print("✅ 评估完成!")
  223. print("="*60)
# Run the evaluation pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()