| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335 |
- """
- 示例2: 奖励函数设计和使用
- 演示如何使用RLTrainingTool创建和测试奖励函数
- """
- import sys
- from pathlib import Path
- import json
- # 添加项目路径
- project_root = Path(__file__).parent.parent / "HelloAgents"
- sys.path.insert(0, str(project_root))
- from hello_agents.tools import RLTrainingTool
- # ============================================================================
- # 示例1: 创建准确性奖励函数
- # ============================================================================
def create_accuracy_reward():
    """Request an accuracy reward function from ``RLTrainingTool``.

    Sends a ``create_reward`` action with ``reward_type="accuracy"``,
    decodes the JSON string the tool returns, and prints the reported
    reward type and description.

    Intended reward rule (as stated by the example's author; the scoring
    itself is implemented by the tool, not in this function):
        - correct answer: 1.0
        - wrong answer: 0.0

    Returns:
        The tool's response decoded into a Python object.

    Raises:
        KeyError: if the response lacks ``reward_type`` or ``description``.
    """
    trainer_tool = RLTrainingTool()
    request = dict(action="create_reward", reward_type="accuracy")

    print("创建准确性奖励函数...")
    # The tool answers with a JSON string; decode it before reading fields.
    response = json.loads(trainer_tool.run(request))

    reward_type = response["reward_type"]
    print(f"✅ 奖励函数类型: {reward_type}")
    description = response["description"]
    print(f"📋 描述: {description}")

    return response
- # ============================================================================
- # 示例2: 创建长度惩罚奖励函数
- # ============================================================================
def create_length_penalty_reward():
    """Request a length-penalty reward function from ``RLTrainingTool``.

    Sends a ``create_reward`` action with ``reward_type="length_penalty"``,
    a ``penalty_weight`` of 0.001 and a ``max_length`` of 512, decodes the
    JSON string the tool returns, and prints the reported settings.

    Intended reward rule (as stated by the example's author; the scoring
    itself is implemented by the tool, not in this function):
        - base reward (accuracy)
        - minus a length penalty, to encourage concise answers

    Returns:
        The tool's response decoded into a Python object.

    Raises:
        KeyError: if the response lacks ``reward_type``.
    """
    trainer_tool = RLTrainingTool()
    request = dict(
        action="create_reward",
        reward_type="length_penalty",
        penalty_weight=0.001,  # penalty of 0.001 per token
        max_length=512,
    )

    print("创建长度惩罚奖励函数...")
    # The tool answers with a JSON string; decode it before reading fields.
    response = json.loads(trainer_tool.run(request))

    reward_type = response["reward_type"]
    print(f"✅ 奖励函数类型: {reward_type}")
    # Fall back to the requested values when the tool does not echo them.
    weight = response.get("penalty_weight", 0.001)
    print(f"📋 惩罚权重: {weight}")
    length_cap = response.get("max_length", 512)
    print(f"📋 最大长度: {length_cap}")

    return response
- # ============================================================================
- # 示例3: 创建步骤奖励函数
- # ============================================================================
def create_step_reward():
    """Request a step-bonus reward function from ``RLTrainingTool``.

    Sends a ``create_reward`` action with ``reward_type="step"``, a
    ``step_bonus`` of 0.1 and ``max_steps`` of 10, decodes the JSON string
    the tool returns, and prints the reported settings.

    Intended reward rule (as stated by the example's author; the scoring
    itself is implemented by the tool, not in this function):
        - base reward (accuracy)
        - plus a per-step bonus, to encourage detailed reasoning

    Returns:
        The tool's response decoded into a Python object.

    Raises:
        KeyError: if the response lacks ``reward_type``.
    """
    trainer_tool = RLTrainingTool()
    request = dict(
        action="create_reward",
        reward_type="step",
        step_bonus=0.1,  # extra 0.1 reward for each step
        max_steps=10,
    )

    print("创建步骤奖励函数...")
    # The tool answers with a JSON string; decode it before reading fields.
    response = json.loads(trainer_tool.run(request))

    reward_type = response["reward_type"]
    print(f"✅ 奖励函数类型: {reward_type}")
    # Fall back to the requested values when the tool does not echo them.
    bonus = response.get("step_bonus", 0.1)
    print(f"📋 步骤奖励: {bonus}")
    step_cap = response.get("max_steps", 10)
    print(f"📋 最大步骤: {step_cap}")

    return response
- # ============================================================================
- # 示例4: 测试奖励函数
- # ============================================================================
def test_reward_function():
    """Score a few hand-written completions with ``MathRewardFunction``.

    Builds a ``MathRewardFunction`` with ``tolerance=1e-4``, calls it once
    per sample with single-element ``completions`` / ``ground_truth`` lists,
    and prints the first reward next to the expected value together with a
    pass/fail mark (pass means the two differ by less than 0.01).

    Returns:
        The list of sample dicts (``completion``, ``ground_truth``,
        ``expected``) that were evaluated.
    """
    from hello_agents.rl import MathRewardFunction

    scorer = MathRewardFunction(tolerance=1e-4)

    # Hand-written samples with the reward each one is expected to get.
    samples = [
        {
            "completion": "Let me calculate: 2+2=4. Final Answer: 4",
            "ground_truth": "4",
            "expected": 1.0,
        },
        {
            "completion": "I think 2+2=5. Final Answer: 5",
            "ground_truth": "4",
            "expected": 0.0,
        },
        {
            "completion": "The answer is 4",
            "ground_truth": "4",
            "expected": 1.0,
        },
        {
            "completion": "2+2 equals four. #### 4",
            "ground_truth": "4",
            "expected": 1.0,
        },
    ]

    print("测试奖励函数:")
    print("-" * 80)

    for number, sample in enumerate(samples, start=1):
        completion = sample["completion"]
        truth = sample["ground_truth"]
        expected = sample["expected"]

        # The reward function works on batches; feed a batch of one and
        # take the single resulting score.
        score = scorer(completions=[completion], ground_truth=[truth])[0]

        print(f"\n测试 {number}:")
        # Only the first 50 characters are shown; the ellipsis is always added.
        preview = completion[:50]
        print(f" 生成: {preview}...")
        print(f" 真值: {truth}")
        print(f" 奖励: {score:.2f} (期望: {expected:.2f})")
        if abs(score - expected) < 0.01:
            verdict = '✅ 正确'
        else:
            verdict = '❌ 错误'
        print(f" {verdict}")

    return samples
- # ============================================================================
- # 示例5: 答案提取测试
- # ============================================================================
def test_answer_extraction():
    """Show what ``MathRewardFunction.extract_answer`` returns for sample texts.

    Builds a default ``MathRewardFunction``, runs ``extract_answer`` on each
    sample string, and prints the text alongside the extracted value. Any
    falsy extraction result is displayed as a "not found" placeholder.

    Returns:
        The list of sample strings that were passed to the extractor.
    """
    from hello_agents.rl import MathRewardFunction

    extractor = MathRewardFunction()

    # A mix of answer formats: explicit markers, prose, and a bare number.
    samples = [
        "Final Answer: 42",
        "The answer is 3.14",
        "#### 100",
        "So the result is 2.5",
        "Let me think... the answer should be 7",
        "42",
    ]

    print("答案提取测试:")
    print("-" * 80)

    for sample in samples:
        extracted = extractor.extract_answer(sample)
        print(f"\n文本: {sample}")
        # Substitute the placeholder whenever the result is falsy.
        shown = extracted or '(未找到)'
        print(f"提取: {shown}")

    return samples
- # ============================================================================
- # 示例6: 答案比较测试
- # ============================================================================
def test_answer_comparison():
    """Exercise ``MathRewardFunction.compare_answers`` on sample pairs.

    Builds a ``MathRewardFunction`` with ``tolerance=0.01``, compares each
    (prediction, ground truth) pair, and prints the outcome, the expected
    outcome, and whether the two agree.

    Returns:
        The list of ``(prediction, ground_truth, expected)`` tuples used.
    """
    from hello_agents.rl import MathRewardFunction

    comparer = MathRewardFunction(tolerance=0.01)

    cases = [
        ("42", "42", True),
        # Labelled "outside tolerance" by the original author.
        # NOTE(review): |3.14 - 3.14159| is about 0.0016, which is below
        # tolerance=0.01 if the tolerance is absolute -- confirm this
        # expected value against compare_answers.
        ("3.14", "3.14159", False),
        ("3.14", "3.141", True),  # labelled "within tolerance"
        ("100", "100.0", True),
        ("2.5", "3.0", False),
        ("7", "7.00", True),
    ]

    print("答案比较测试:")
    print("-" * 80)

    for predicted, truth, expected in cases:
        outcome = comparer.compare_answers(predicted, truth)
        print(f"\n预测: {predicted}, 真值: {truth}")

        if outcome:
            outcome_label = '正确'
        else:
            outcome_label = '错误'
        if expected:
            expected_label = '正确'
        else:
            expected_label = '错误'
        print(f"结果: {outcome_label} (期望: {expected_label})")

        if outcome == expected:
            status = '✅ 通过'
        else:
            status = '❌ 失败'
        print(f"{status}")

    return cases
- # ============================================================================
- # 示例7: 不同奖励函数的对比
- # ============================================================================
def compare_reward_functions():
    """Print the scores three reward functions give to the same completions.

    Builds an accuracy reward, plus a length-penalty reward
    (``penalty_weight=0.001``) and a step reward (``step_bonus=0.1``) that
    both wrap a second accuracy reward, using the factory functions from
    ``hello_agents.rl``. Each sample is scored by all three and the results
    are printed side by side, followed by a short summary.

    Returns:
        The list of sample dicts (``completion``, ``ground_truth``,
        ``desc``) that were scored.
    """
    # Aliased on import: the factories in hello_agents.rl share their names
    # with the module-level demo functions in this file.
    from hello_agents.rl import (
        create_accuracy_reward as build_accuracy_reward,
        create_length_penalty_reward as build_length_penalty_reward,
        create_step_reward as build_step_reward,
    )

    accuracy_fn = build_accuracy_reward()
    base_fn = build_accuracy_reward()  # base reward wrapped by the two below
    length_fn = build_length_penalty_reward(base_fn, penalty_weight=0.001)
    step_fn = build_step_reward(base_fn, step_bonus=0.1)

    # Three correct answers that differ only in verbosity and structure.
    samples = [
        {
            "completion": "4",
            "ground_truth": "4",
            "desc": "简洁正确答案",
        },
        {
            "completion": "Step 1: 2+2=4\nFinal Answer: 4",
            "ground_truth": "4",
            "desc": "带步骤的正确答案",
        },
        {
            "completion": "Let me think... " * 20 + "Final Answer: 4",
            "ground_truth": "4",
            "desc": "冗长的正确答案",
        },
    ]

    print("奖励函数对比:")
    print("=" * 80)

    for number, sample in enumerate(samples, start=1):
        text = sample["completion"]
        truth = sample["ground_truth"]

        print(f"\n测试 {number}: {sample['desc']}")
        print(f"长度: {len(text)} 字符")

        # Score a batch of one with each reward function, in a fixed order,
        # before printing any of the results.
        accuracy_score, length_score, step_score = (
            reward_fn([text], ground_truth=[truth])[0]
            for reward_fn in (accuracy_fn, length_fn, step_fn)
        )

        print(f" 准确性奖励: {accuracy_score:.4f}")
        print(f" 长度惩罚奖励: {length_score:.4f}")
        print(f" 步骤奖励: {step_score:.4f}")

    print("\n结论:")
    print(" - 准确性奖励: 只关注答案正确性")
    print(" - 长度惩罚: 鼓励简洁答案")
    print(" - 步骤奖励: 鼓励详细推理")

    return samples
- # ============================================================================
- # 主函数
- # ============================================================================
if __name__ == "__main__":
    # Script entry point: run every demo in order, each under a titled banner.
    demos = (
        ("示例1: 创建准确性奖励函数", create_accuracy_reward),
        ("示例2: 创建长度惩罚奖励函数", create_length_penalty_reward),
        ("示例3: 创建步骤奖励函数", create_step_reward),
        ("示例4: 测试奖励函数", test_reward_function),
        ("示例5: 答案提取测试", test_answer_extraction),
        ("示例6: 答案比较测试", test_answer_comparison),
        ("示例7: 不同奖励函数的对比", compare_reward_functions),
    )
    rule = "=" * 80

    for position, (title, run_demo) in enumerate(demos):
        # Every banner except the first is preceded by a blank line.
        print(rule if position == 0 else "\n" + rule)
        print(title)
        print(rule)
        run_demo()
|