02_reward_functions.py 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335
  1. """
  2. 示例2: 奖励函数设计和使用
  3. 演示如何使用RLTrainingTool创建和测试奖励函数
  4. """
  5. import sys
  6. from pathlib import Path
  7. import json
  8. # 添加项目路径
  9. project_root = Path(__file__).parent.parent / "HelloAgents"
  10. sys.path.insert(0, str(project_root))
  11. from hello_agents.tools import RLTrainingTool
  12. # ============================================================================
  13. # 示例1: 创建准确性奖励函数
  14. # ============================================================================
  15. def create_accuracy_reward():
  16. """
  17. 创建准确性奖励函数
  18. 奖励规则:
  19. - 答案正确: 1.0
  20. - 答案错误: 0.0
  21. """
  22. tool = RLTrainingTool()
  23. config = {
  24. "action": "create_reward",
  25. "reward_type": "accuracy"
  26. }
  27. print("创建准确性奖励函数...")
  28. result = tool.run(config)
  29. result_dict = json.loads(result)
  30. print(f"✅ 奖励函数类型: {result_dict['reward_type']}")
  31. print(f"📋 描述: {result_dict['description']}")
  32. return result_dict
  33. # ============================================================================
  34. # 示例2: 创建长度惩罚奖励函数
  35. # ============================================================================
  36. def create_length_penalty_reward():
  37. """
  38. 创建长度惩罚奖励函数
  39. 奖励规则:
  40. - 基础奖励 (准确性)
  41. - 减去长度惩罚 (鼓励简洁)
  42. """
  43. tool = RLTrainingTool()
  44. config = {
  45. "action": "create_reward",
  46. "reward_type": "length_penalty",
  47. "penalty_weight": 0.001, # 每个token惩罚0.001
  48. "max_length": 512
  49. }
  50. print("创建长度惩罚奖励函数...")
  51. result = tool.run(config)
  52. result_dict = json.loads(result)
  53. print(f"✅ 奖励函数类型: {result_dict['reward_type']}")
  54. print(f"📋 惩罚权重: {result_dict.get('penalty_weight', 0.001)}")
  55. print(f"📋 最大长度: {result_dict.get('max_length', 512)}")
  56. return result_dict
  57. # ============================================================================
  58. # 示例3: 创建步骤奖励函数
  59. # ============================================================================
  60. def create_step_reward():
  61. """
  62. 创建步骤奖励函数
  63. 奖励规则:
  64. - 基础奖励 (准确性)
  65. - 加上步骤奖励 (鼓励详细推理)
  66. """
  67. tool = RLTrainingTool()
  68. config = {
  69. "action": "create_reward",
  70. "reward_type": "step",
  71. "step_bonus": 0.1, # 每个步骤额外奖励0.1
  72. "max_steps": 10
  73. }
  74. print("创建步骤奖励函数...")
  75. result = tool.run(config)
  76. result_dict = json.loads(result)
  77. print(f"✅ 奖励函数类型: {result_dict['reward_type']}")
  78. print(f"📋 步骤奖励: {result_dict.get('step_bonus', 0.1)}")
  79. print(f"📋 最大步骤: {result_dict.get('max_steps', 10)}")
  80. return result_dict
  81. # ============================================================================
  82. # 示例4: 测试奖励函数
  83. # ============================================================================
  84. def test_reward_function():
  85. """
  86. 测试奖励函数的计算
  87. 使用MathRewardFunction直接测试
  88. """
  89. from hello_agents.rl import MathRewardFunction
  90. reward_fn = MathRewardFunction(tolerance=1e-4)
  91. # 测试样本
  92. test_cases = [
  93. {
  94. "completion": "Let me calculate: 2+2=4. Final Answer: 4",
  95. "ground_truth": "4",
  96. "expected": 1.0
  97. },
  98. {
  99. "completion": "I think 2+2=5. Final Answer: 5",
  100. "ground_truth": "4",
  101. "expected": 0.0
  102. },
  103. {
  104. "completion": "The answer is 4",
  105. "ground_truth": "4",
  106. "expected": 1.0
  107. },
  108. {
  109. "completion": "2+2 equals four. #### 4",
  110. "ground_truth": "4",
  111. "expected": 1.0
  112. }
  113. ]
  114. print("测试奖励函数:")
  115. print("-" * 80)
  116. for i, case in enumerate(test_cases, 1):
  117. # 计算奖励
  118. rewards = reward_fn(
  119. completions=[case["completion"]],
  120. ground_truth=[case["ground_truth"]]
  121. )
  122. reward = rewards[0]
  123. print(f"\n测试 {i}:")
  124. print(f" 生成: {case['completion'][:50]}...")
  125. print(f" 真值: {case['ground_truth']}")
  126. print(f" 奖励: {reward:.2f} (期望: {case['expected']:.2f})")
  127. print(f" {'✅ 正确' if abs(reward - case['expected']) < 0.01 else '❌ 错误'}")
  128. return test_cases
  129. # ============================================================================
  130. # 示例5: 答案提取测试
  131. # ============================================================================
  132. def test_answer_extraction():
  133. """
  134. 测试答案提取功能
  135. """
  136. from hello_agents.rl import MathRewardFunction
  137. reward_fn = MathRewardFunction()
  138. test_texts = [
  139. "Final Answer: 42",
  140. "The answer is 3.14",
  141. "#### 100",
  142. "So the result is 2.5",
  143. "Let me think... the answer should be 7",
  144. "42"
  145. ]
  146. print("答案提取测试:")
  147. print("-" * 80)
  148. for text in test_texts:
  149. answer = reward_fn.extract_answer(text)
  150. print(f"\n文本: {text}")
  151. print(f"提取: {answer if answer else '(未找到)'}")
  152. return test_texts
  153. # ============================================================================
  154. # 示例6: 答案比较测试
  155. # ============================================================================
  156. def test_answer_comparison():
  157. """
  158. 测试答案比较功能
  159. """
  160. from hello_agents.rl import MathRewardFunction
  161. reward_fn = MathRewardFunction(tolerance=0.01)
  162. test_pairs = [
  163. ("42", "42", True),
  164. ("3.14", "3.14159", False), # 超出容差
  165. ("3.14", "3.141", True), # 在容差内
  166. ("100", "100.0", True),
  167. ("2.5", "3.0", False),
  168. ("7", "7.00", True)
  169. ]
  170. print("答案比较测试:")
  171. print("-" * 80)
  172. for pred, truth, expected in test_pairs:
  173. is_correct = reward_fn.compare_answers(pred, truth)
  174. print(f"\n预测: {pred}, 真值: {truth}")
  175. print(f"结果: {'正确' if is_correct else '错误'} (期望: {'正确' if expected else '错误'})")
  176. print(f"{'✅ 通过' if is_correct == expected else '❌ 失败'}")
  177. return test_pairs
  178. # ============================================================================
  179. # 示例7: 不同奖励函数的对比
  180. # ============================================================================
  181. def compare_reward_functions():
  182. """
  183. 对比不同奖励函数的效果
  184. """
  185. from hello_agents.rl import (
  186. create_accuracy_reward,
  187. create_length_penalty_reward,
  188. create_step_reward
  189. )
  190. # 创建不同的奖励函数
  191. accuracy_fn = create_accuracy_reward()
  192. base_fn = create_accuracy_reward() # 基础奖励函数
  193. length_fn = create_length_penalty_reward(base_fn, penalty_weight=0.001)
  194. step_fn = create_step_reward(base_fn, step_bonus=0.1)
  195. # 测试样本
  196. test_cases = [
  197. {
  198. "completion": "4",
  199. "ground_truth": "4",
  200. "desc": "简洁正确答案"
  201. },
  202. {
  203. "completion": "Step 1: 2+2=4\nFinal Answer: 4",
  204. "ground_truth": "4",
  205. "desc": "带步骤的正确答案"
  206. },
  207. {
  208. "completion": "Let me think... " * 20 + "Final Answer: 4",
  209. "ground_truth": "4",
  210. "desc": "冗长的正确答案"
  211. }
  212. ]
  213. print("奖励函数对比:")
  214. print("=" * 80)
  215. for i, case in enumerate(test_cases, 1):
  216. print(f"\n测试 {i}: {case['desc']}")
  217. print(f"长度: {len(case['completion'])} 字符")
  218. # 计算不同奖励
  219. acc_reward = accuracy_fn([case["completion"]], ground_truth=[case["ground_truth"]])[0]
  220. len_reward = length_fn([case["completion"]], ground_truth=[case["ground_truth"]])[0]
  221. step_reward = step_fn([case["completion"]], ground_truth=[case["ground_truth"]])[0]
  222. print(f" 准确性奖励: {acc_reward:.4f}")
  223. print(f" 长度惩罚奖励: {len_reward:.4f}")
  224. print(f" 步骤奖励: {step_reward:.4f}")
  225. print("\n结论:")
  226. print(" - 准确性奖励: 只关注答案正确性")
  227. print(" - 长度惩罚: 鼓励简洁答案")
  228. print(" - 步骤奖励: 鼓励详细推理")
  229. return test_cases
  230. # ============================================================================
  231. # 主函数
  232. # ============================================================================
  233. if __name__ == "__main__":
  234. print("="*80)
  235. print("示例1: 创建准确性奖励函数")
  236. print("="*80)
  237. create_accuracy_reward()
  238. print("\n" + "="*80)
  239. print("示例2: 创建长度惩罚奖励函数")
  240. print("="*80)
  241. create_length_penalty_reward()
  242. print("\n" + "="*80)
  243. print("示例3: 创建步骤奖励函数")
  244. print("="*80)
  245. create_step_reward()
  246. print("\n" + "="*80)
  247. print("示例4: 测试奖励函数")
  248. print("="*80)
  249. test_reward_function()
  250. print("\n" + "="*80)
  251. print("示例5: 答案提取测试")
  252. print("="*80)
  253. test_answer_extraction()
  254. print("\n" + "="*80)
  255. print("示例6: 答案比较测试")
  256. print("="*80)
  257. test_answer_comparison()
  258. print("\n" + "="*80)
  259. print("示例7: 不同奖励函数的对比")
  260. print("="*80)
  261. compare_reward_functions()