08_distributed_training.py 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104
  1. """
  2. 分布式训练示例
  3. 本脚本演示如何使用Accelerate进行分布式训练。
  4. 训练代码本身无需修改,只需通过accelerate launch启动即可。
  5. 使用方法:
  6. 1. 单GPU训练:
  7. python 08_distributed_training.py
  8. 2. 多GPU DDP训练:
  9. accelerate launch --config_file accelerate_configs/multi_gpu_ddp.yaml 08_distributed_training.py
  10. 3. DeepSpeed ZeRO-2训练:
  11. accelerate launch --config_file accelerate_configs/deepspeed_zero2.yaml 08_distributed_training.py
  12. 4. DeepSpeed ZeRO-3训练:
  13. accelerate launch --config_file accelerate_configs/deepspeed_zero3.yaml 08_distributed_training.py
  14. """
  15. import sys
  16. import os
  17. # 添加HelloAgents到路径
  18. sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "HelloAgents"))
  19. from hello_agents.tools import RLTrainingTool
  20. import json
  21. def main():
  22. print("="*80)
  23. print("分布式训练示例")
  24. print("="*80)
  25. # 检测分布式环境
  26. world_size = int(os.environ.get("WORLD_SIZE", 1))
  27. local_rank = int(os.environ.get("LOCAL_RANK", 0))
  28. if world_size > 1:
  29. print(f"\n🚀 分布式训练模式")
  30. print(f" - 总进程数: {world_size}")
  31. print(f" - 当前进程: {local_rank}")
  32. print(f" - 分布式后端: {os.environ.get('ACCELERATE_DISTRIBUTED_TYPE', 'MULTI_GPU')}")
  33. else:
  34. print(f"\n💻 单GPU训练模式")
  35. print("="*80)
  36. # 创建训练工具
  37. rl_tool = RLTrainingTool()
  38. # 训练配置
  39. # 注意: batch_size是每个GPU的batch size
  40. # 总batch size = batch_size × num_gpus × gradient_accumulation_steps
  41. config = {
  42. "action": "train",
  43. "algorithm": "grpo",
  44. "model_name": "Qwen/Qwen3-0.6B",
  45. "output_dir": "./models/grpo_distributed",
  46. "max_samples": 200, # 使用200个样本
  47. "num_epochs": 2,
  48. "batch_size": 2, # 每个GPU的batch size
  49. "use_lora": True,
  50. "use_wandb": False,
  51. "use_tensorboard": True,
  52. }
  53. # 只在主进程打印配置
  54. if local_rank == 0:
  55. print("\n训练配置:")
  56. print(f" - 模型: {config['model_name']}")
  57. print(f" - 样本数: {config['max_samples']}")
  58. print(f" - Epoch数: {config['num_epochs']}")
  59. print(f" - 每GPU batch size: {config['batch_size']}")
  60. if world_size > 1:
  61. total_batch = config['batch_size'] * world_size
  62. print(f" - 总batch size: {total_batch}")
  63. print("="*80)
  64. # 开始训练
  65. # 训练代码完全不需要修改!
  66. # Accelerate会自动处理分布式训练的所有细节
  67. result = rl_tool.run(config)
  68. # 只在主进程打印结果
  69. if local_rank == 0:
  70. result_data = json.loads(result)
  71. print("\n" + "="*80)
  72. print("训练完成!")
  73. print("="*80)
  74. print(f"状态: {result_data['status']}")
  75. print(f"模型路径: {result_data['output_dir']}")
  76. print("="*80)
  77. # 打印性能提示
  78. if world_size > 1:
  79. print(f"\n💡 性能提示:")
  80. print(f" 使用了 {world_size} 个GPU进行训练")
  81. print(f" 理论加速比: ~{world_size * 0.85:.1f}x")
  82. print(f" (实际加速比取决于通信开销和数据加载)")
  83. if __name__ == "__main__":
  84. main()