| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104 |
- """
- 分布式训练示例
- 本脚本演示如何使用Accelerate进行分布式训练。
- 训练代码本身无需修改,只需通过accelerate launch启动即可。
- 使用方法:
- 1. 单GPU训练:
- python 07_distributed_training.py
- 2. 多GPU DDP训练:
- accelerate launch --config_file accelerate_configs/multi_gpu_ddp.yaml 07_distributed_training.py
- 3. DeepSpeed ZeRO-2训练:
- accelerate launch --config_file accelerate_configs/deepspeed_zero2.yaml 07_distributed_training.py
- 4. DeepSpeed ZeRO-3训练:
- accelerate launch --config_file accelerate_configs/deepspeed_zero3.yaml 07_distributed_training.py
- """
- import sys
- import os
- # 添加HelloAgents到路径
- sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "HelloAgents"))
- from hello_agents.tools import RLTrainingTool
- import json
def main():
    """Run the GRPO training demo in single-process or distributed mode.

    Reads ``WORLD_SIZE``, ``RANK`` and ``LOCAL_RANK`` from the environment to
    detect whether the script was started by a distributed launcher, builds
    the training config, hands it to ``RLTrainingTool.run`` and prints a
    summary. The config and result summaries are printed only by the process
    whose global rank is 0.
    """
    banner = "=" * 80

    print(banner)
    print("分布式训练示例")
    print(banner)

    # Detect the distributed environment from the launcher's variables.
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    # LOCAL_RANK restarts at 0 on every node, so on multi-node runs it would
    # mark one process per node as "main". Prefer the global RANK and fall
    # back to LOCAL_RANK when the launcher does not export it.
    global_rank = int(os.environ.get("RANK", local_rank))
    is_main_process = global_rank == 0

    if world_size > 1:
        print("\n🚀 分布式训练模式")
        print(f" - 总进程数: {world_size}")
        print(f" - 当前进程: {local_rank}")
        print(f" - 分布式后端: {os.environ.get('ACCELERATE_DISTRIBUTED_TYPE', 'MULTI_GPU')}")
    else:
        print("\n💻 单GPU训练模式")

    print(banner)

    # Create the training tool.
    rl_tool = RLTrainingTool()

    # Training configuration.
    # NOTE: batch_size is the per-GPU batch size;
    # total batch size = batch_size x num_gpus x gradient_accumulation_steps.
    config = {
        "action": "train",
        "algorithm": "grpo",
        "model_name": "Qwen/Qwen3-0.6B",
        "output_dir": "./models/grpo_distributed",
        "max_samples": 200,  # use 200 samples
        "num_epochs": 2,
        "batch_size": 2,  # per-GPU batch size
        "use_lora": True,
        "use_wandb": False,
        "use_tensorboard": True,
    }

    # Print the configuration from the main process only.
    if is_main_process:
        print("\n训练配置:")
        print(f" - 模型: {config['model_name']}")
        print(f" - 样本数: {config['max_samples']}")
        print(f" - Epoch数: {config['num_epochs']}")
        print(f" - 每GPU batch size: {config['batch_size']}")
        if world_size > 1:
            # Include gradient accumulation so the printed total matches the
            # formula above; defaults to 1 when the key is not configured.
            grad_accum_steps = config.get("gradient_accumulation_steps", 1)
            total_batch = config["batch_size"] * world_size * grad_accum_steps
            print(f" - 总batch size: {total_batch}")
        print(banner)

    # Start training. Every process makes this call; the script itself does
    # no distributed setup beyond reading the environment variables above.
    result = rl_tool.run(config)

    # Print the result from the main process only.
    if is_main_process:
        # The tool's return value is parsed as a JSON document. Its exact
        # schema is not visible in this file, so read the fields defensively
        # rather than raising KeyError after a completed training run.
        result_data = json.loads(result)
        print("\n" + banner)
        print("训练完成!")
        print(banner)
        print(f"状态: {result_data.get('status', 'unknown')}")
        print(f"模型路径: {result_data.get('output_dir', config['output_dir'])}")
        print(banner)

        # Performance hint (0.85 is a rough per-GPU scaling-efficiency guess).
        if world_size > 1:
            print("\n💡 性能提示:")
            print(f" 使用了 {world_size} 个GPU进行训练")
            print(f" 理论加速比: ~{world_size * 0.85:.1f}x")
            print(" (实际加速比取决于通信开销和数据加载)")


# Run the demo only when executed as a script (directly or through
# `accelerate launch`), not when imported.
if __name__ == "__main__":
    main()
|