moki
/
hello-agents
kopia lustrzana https://github.com/datawhalechina/hello-agents.git


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335
							"""
示例2: 奖励函数设计和使用
演示如何使用RLTrainingTool创建和测试奖励函数
"""

import sys
from pathlib import Path
import json

# Add the project path: "HelloAgents" is looked up two directory levels above
# this file and placed at the front of sys.path (index 0), so it is searched
# before every other entry. Presumably this is what makes the `hello_agents`
# import below resolve to the local checkout -- verify against the repo layout.
project_root = Path(__file__).parent.parent / "HelloAgents"
sys.path.insert(0, str(project_root))

from hello_agents.tools import RLTrainingTool


# ============================================================================
# 示例1: 创建准确性奖励函数
# ============================================================================

def create_accuracy_reward():
    """Request an accuracy reward from ``RLTrainingTool`` and print a summary.

    A ``create_reward`` request with ``reward_type="accuracy"`` is passed to
    ``RLTrainingTool.run``; the returned text is decoded as JSON and its
    ``reward_type`` and ``description`` entries are printed.

    According to the example's author, the intended scoring rule is 1.0 for a
    correct answer and 0.0 for a wrong one (the scoring itself is implemented
    by the tool, not here).

    Returns:
        dict: The decoded JSON response from the tool.

    Raises:
        KeyError: If the response lacks ``reward_type`` or ``description``.
    """
    trainer = RLTrainingTool()
    request = dict(action="create_reward", reward_type="accuracy")

    print("创建准确性奖励函数...")
    response = json.loads(trainer.run(request))

    # Keys are read one at a time, right before each line is printed, so a
    # missing key surfaces only after the preceding line has been shown.
    kind = response["reward_type"]
    print(f"✅ 奖励函数类型: {kind}")
    summary = response["description"]
    print(f"📋 描述: {summary}")

    return response


# ============================================================================
# 示例2: 创建长度惩罚奖励函数
# ============================================================================

def create_length_penalty_reward():
    """Request a length-penalty reward from ``RLTrainingTool`` and print it.

    The request asks for ``reward_type="length_penalty"`` with a
    ``penalty_weight`` of 0.001 and a ``max_length`` of 512. The author's notes
    describe the intended rule as a base (accuracy) reward minus a length
    penalty that encourages concise output, with the weight applied per token;
    the actual computation lives in the tool.

    Returns:
        dict: The decoded JSON response from the tool.

    Raises:
        KeyError: If the response lacks ``reward_type``.
    """
    trainer = RLTrainingTool()
    request = dict(
        action="create_reward",
        reward_type="length_penalty",
        penalty_weight=0.001,  # penalty per token, per the original note
        max_length=512,
    )

    print("创建长度惩罚奖励函数...")
    response = json.loads(trainer.run(request))

    kind = response["reward_type"]
    print(f"✅ 奖励函数类型: {kind}")
    # Fall back to the requested values when the tool does not echo them back.
    weight = response.get("penalty_weight", 0.001)
    print(f"📋 惩罚权重: {weight}")
    limit = response.get("max_length", 512)
    print(f"📋 最大长度: {limit}")

    return response


# ============================================================================
# 示例3: 创建步骤奖励函数
# ============================================================================

def create_step_reward():
    """Request a step-bonus reward from ``RLTrainingTool`` and print it.

    The request asks for ``reward_type="step"`` with a ``step_bonus`` of 0.1
    and ``max_steps`` of 10. The author's notes describe the intended rule as
    a base (accuracy) reward plus a bonus per reasoning step, which encourages
    detailed reasoning; the actual computation lives in the tool.

    Returns:
        dict: The decoded JSON response from the tool.

    Raises:
        KeyError: If the response lacks ``reward_type``.
    """
    trainer = RLTrainingTool()
    request = dict(
        action="create_reward",
        reward_type="step",
        step_bonus=0.1,  # extra reward per step, per the original note
        max_steps=10,
    )

    print("创建步骤奖励函数...")
    response = json.loads(trainer.run(request))

    kind = response["reward_type"]
    print(f"✅ 奖励函数类型: {kind}")
    # Fall back to the requested values when the tool does not echo them back.
    bonus = response.get("step_bonus", 0.1)
    print(f"📋 步骤奖励: {bonus}")
    step_cap = response.get("max_steps", 10)
    print(f"📋 最大步骤: {step_cap}")

    return response


# ============================================================================
# 示例4: 测试奖励函数
# ============================================================================

def test_reward_function():
    """Score a few fixed completions with ``MathRewardFunction`` and report.

    A ``MathRewardFunction`` built with ``tolerance=1e-4`` is called once per
    sample with single-element ``completions`` / ``ground_truth`` lists. For
    each sample the (truncated) completion, the ground truth, the reward and
    the expected reward are printed, followed by a pass/fail marker that
    passes when the reward is within 0.01 of the expectation.

    Returns:
        list[dict]: The samples, each with ``completion``, ``ground_truth``
        and ``expected`` keys.
    """
    from hello_agents.rl import MathRewardFunction

    scorer = MathRewardFunction(tolerance=1e-4)

    # (completion, ground truth, expected reward)
    samples = (
        ("Let me calculate: 2+2=4. Final Answer: 4", "4", 1.0),
        ("I think 2+2=5. Final Answer: 5", "4", 0.0),
        ("The answer is 4", "4", 1.0),
        ("2+2 equals four. #### 4", "4", 1.0),
    )
    test_cases = [
        {"completion": text, "ground_truth": truth, "expected": target}
        for text, truth, target in samples
    ]

    print("测试奖励函数:")
    print("-" * 80)

    for number, sample in enumerate(test_cases, start=1):
        completion = sample["completion"]
        truth = sample["ground_truth"]
        target = sample["expected"]

        # The reward function works on batches; send a batch of one.
        reward = scorer(completions=[completion], ground_truth=[truth])[0]

        print(f"\n测试 {number}:")
        print(f"  生成: {completion[:50]}...")
        print(f"  真值: {truth}")
        print(f"  奖励: {reward:.2f} (期望: {target:.2f})")
        if abs(reward - target) < 0.01:
            verdict = "✅ 正确"
        else:
            verdict = "❌ 错误"
        print(f"  {verdict}")

    return test_cases


# ============================================================================
# 示例5: 答案提取测试
# ============================================================================

def test_answer_extraction():
    """Print what ``MathRewardFunction.extract_answer`` pulls from sample texts.

    Each sample is passed to ``extract_answer`` on a default-constructed
    ``MathRewardFunction``. The text and the extracted value are printed; a
    placeholder is shown instead whenever the extracted value is falsy.

    Returns:
        list[str]: The sample texts that were processed.
    """
    from hello_agents.rl import MathRewardFunction

    extractor = MathRewardFunction()

    test_texts = [
        "Final Answer: 42",
        "The answer is 3.14",
        "#### 100",
        "So the result is 2.5",
        "Let me think... the answer should be 7",
        "42",
    ]

    print("答案提取测试:")
    print("-" * 80)

    for sample in test_texts:
        extracted = extractor.extract_answer(sample)
        print(f"\n文本: {sample}")
        # Any falsy result is reported with the "not found" placeholder.
        shown = extracted or '(未找到)'
        print(f"提取: {shown}")

    return test_texts


# ============================================================================
# 示例6: 答案比较测试
# ============================================================================

def test_answer_comparison():
    """Check ``MathRewardFunction.compare_answers`` against a table of pairs.

    A ``MathRewardFunction`` is built with ``tolerance=0.01`` and every
    ``(prediction, truth, expected)`` row is passed to ``compare_answers``.
    The outcome, the expectation and a pass/fail marker are printed per row.

    The expectations assume that ``compare_answers`` treats ``tolerance`` as a
    bound on the numeric difference between the two values -- confirm against
    the ``hello_agents.rl`` implementation. Under that reading "3.14" vs
    "3.14159" (difference 0.00159) lies *inside* a 0.01 tolerance, so that row
    expects a match; "3.14" vs "3.2" (difference 0.06, about 1.9% relative)
    is the near-miss row that lies outside it.

    Returns:
        list[tuple[str, str, bool]]: The ``(prediction, truth, expected)``
        rows that were evaluated.
    """
    from hello_agents.rl import MathRewardFunction

    reward_fn = MathRewardFunction(tolerance=0.01)

    test_pairs = [
        ("42", "42", True),
        ("3.14", "3.14159", True),   # diff 0.00159 -> within tolerance
        ("3.14", "3.141", True),     # diff 0.001   -> within tolerance
        ("3.14", "3.2", False),      # diff 0.06    -> beyond tolerance
        ("100", "100.0", True),
        ("2.5", "3.0", False),
        ("7", "7.00", True),
    ]

    print("答案比较测试:")
    print("-" * 80)

    for pred, truth, expected in test_pairs:
        is_correct = reward_fn.compare_answers(pred, truth)
        print(f"\n预测: {pred}, 真值: {truth}")
        print(f"结果: {'正确' if is_correct else '错误'} (期望: {'正确' if expected else '错误'})")
        print(f"{'✅ 通过' if is_correct == expected else '❌ 失败'}")

    return test_pairs


# ============================================================================
# 示例7: 不同奖励函数的对比
# ============================================================================

def compare_reward_functions():
    """Print accuracy, length-penalty and step rewards side by side.

    The three factory functions are imported from ``hello_agents.rl`` inside
    this function, so within this scope those names refer to the library
    factories. Two accuracy rewards are created: one is scored directly, the
    other serves as the base for the length-penalty
    (``penalty_weight=0.001``) and step (``step_bonus=0.1``) rewards.

    For every sample all three rewards are computed first and then printed
    to four decimal places, followed by a short fixed conclusion.

    Returns:
        list[dict]: The samples, each with ``completion``, ``ground_truth``
        and ``desc`` keys.
    """
    from hello_agents.rl import (
        create_accuracy_reward,
        create_length_penalty_reward,
        create_step_reward
    )

    # Build the reward callables; the second accuracy reward is the shared base.
    accuracy_fn = create_accuracy_reward()
    base_fn = create_accuracy_reward()
    length_fn = create_length_penalty_reward(base_fn, penalty_weight=0.001)
    step_fn = create_step_reward(base_fn, step_bonus=0.1)
    scorers = (accuracy_fn, length_fn, step_fn)
    labels = ("准确性奖励", "长度惩罚奖励", "步骤奖励")

    # (completion, ground truth, description)
    samples = (
        ("4", "4", "简洁正确答案"),
        ("Step 1: 2+2=4\nFinal Answer: 4", "4", "带步骤的正确答案"),
        ("Let me think... " * 20 + "Final Answer: 4", "4", "冗长的正确答案"),
    )
    test_cases = [
        {"completion": text, "ground_truth": truth, "desc": note}
        for text, truth, note in samples
    ]

    print("奖励函数对比:")
    print("=" * 80)

    for number, sample in enumerate(test_cases, start=1):
        text = sample["completion"]
        truth = sample["ground_truth"]

        print(f"\n测试 {number}: {sample['desc']}")
        print(f"长度: {len(text)} 字符")

        # Score with every reward before printing any of the results.
        scores = [score_fn([text], ground_truth=[truth])[0] for score_fn in scorers]

        for label, score in zip(labels, scores):
            print(f"  {label}: {score:.4f}")

    for line in (
        "\n结论:",
        "  - 准确性奖励: 只关注答案正确性",
        "  - 长度惩罚: 鼓励简洁答案",
        "  - 步骤奖励: 鼓励详细推理",
    ):
        print(line)

    return test_cases


# ============================================================================
# 主函数
# ============================================================================

if __name__ == "__main__":
    # Script entry point: run every example in order, each under a banner of
    # the form <rule> / <title> / <rule>. Every banner except the first is
    # preceded by a blank line.
    rule = "=" * 80
    examples = (
        ("示例1: 创建准确性奖励函数", create_accuracy_reward),
        ("示例2: 创建长度惩罚奖励函数", create_length_penalty_reward),
        ("示例3: 创建步骤奖励函数", create_step_reward),
        ("示例4: 测试奖励函数", test_reward_function),
        ("示例5: 答案提取测试", test_answer_extraction),
        ("示例6: 答案比较测试", test_answer_comparison),
        ("示例7: 不同奖励函数的对比", compare_reward_functions),
    )

    for position, (title, run_example) in enumerate(examples):
        print(rule if position == 0 else "\n" + rule)
        print(title)
        print(rule)
        run_example()