35 Commits

Author SHA1 Message Date
  imyzx10 b53e5ab271 ignore .pycharm dependence 1 week ago
  wangy12 56bed85710 Update class diagram and sequence diagram 3 weeks ago
  wangy12 fc25164a01 Update 'README.md' 3 weeks ago
  wangy12 28709c7a9b Update 'README.md' 3 weeks ago
  qinsh 34632e54ee add interface for comms hook 3 weeks ago
  wangy12 22d313ef81 Update sequence diagram and class diagram 4 weeks ago
  wangy12 2596596668 Update 'examples/training_scalability_nvidia_4_nodes/README.md' 4 weeks ago
  wangy12 0051d417e5 Update 'examples/training_scalability_nvidia_4_nodes/README.md' 4 weeks ago
  wangy12 74dfcfc927 Update 'docker/nvidia/Dockerfile' 1 month ago
  wangy12 5bb9f0a53f Update 'examples/training_scalability_nvidia_4_nodes/README.md' 1 month ago
  wangy12 114582c1d1 Update 'examples/training_scalability_nvidia_4_nodes/README.md' 1 month ago
  wangy12 b392925483 Update 'examples/training_scalability_nvidia_4_nodes/tp1_node3.sh' 1 month ago
  wangy12 a929e6a20d Update 'examples/training_scalability_nvidia_4_nodes/tp1_node2.sh' 1 month ago
  wangy12 5ae752a3e1 Update 'examples/training_scalability_nvidia_4_nodes/tp1_node1.sh' 1 month ago
  wangy12 7c55e55864 Update 'examples/training_scalability_nvidia_4_nodes/tp1_node0.sh' 1 month ago
  wangy12 53b9f701d6 Update 'examples/training_scalability_nvidia_4_nodes/tp1_crdp2_node3.sh' 1 month ago
  wangy12 81b8d864ec Update 'examples/training_scalability_nvidia_4_nodes/tp1_crdp2_node2.sh' 1 month ago
  wangy12 42afbb1195 Update 'examples/training_scalability_nvidia_4_nodes/tp1_crdp2_node1.sh' 1 month ago
  wangy12 eaa178c6d1 Update 'examples/training_scalability_nvidia_4_nodes/tp1_crdp2_node0.sh' 1 month ago
  imyzx10 8734791fc3 “crdp port modify for aisynergy crdp2 training!” 1 month ago
  imyzx10 3e6076c64e “rename .sh name, for aisynergy crdp2 training!” 1 month ago
  imyzx10 25c8e0919f “rename .sh name, for aisynergy crdp2 training!” 1 month ago
  imyzx10 afe43805c0 “rename log filename..” 1 month ago
  imyzx10 ce8704ca6c “merge aisyn2 4 shells to 1 shell, update config_aisyn2_xxx.yaml” 1 month ago
  imyzx10 818f37f69c “merge tp1-4 shells to 1 shell, update ascend-docker readme.md” 1 month ago
  qinsh a83146bbda rough draft v0.6 1 month ago
  qinsh 1258125b98 rough draft v0.5 1 month ago
  qinsh 7c65917679 rough draft v0.4 1 month ago
  imyzx10 629a76eed7 “update dockerfile for 910B, with README.md update” 1 month ago
  qinsh 59f29a10cb add nvidia Dockerfile 1 month ago
  qinsh 8ad37b25f7 --amend 1 month ago
  qinsh 98b946f71f add nvidia Dockerfile 1 month ago
  qinsh 683aa3a9b6 add nvidia dockerfile 1 month ago
  wangy12 65229d7a74 Update 'docker/README.md' 1 month ago
  wangy12 00e01b6e7f Update 'docker/Dockerfile_nvidia' 1 month ago
100 changed files with 4333 additions and 122 deletions
  1. .gitignore (+6, -0)
  2. README.md (+17, -10)
  3. aisynergy2/aisynergy2.py (+10, -12)
  4. aisynergy2/comms/all_reduce_hook.py (+11, -0)
  5. aisynergy2/comms/global_process_group_manager.py (+15, -5)
  6. aisynergy2/config/config.py (+1, -1)
  7. aisynergy2/distributed/cross_region_data_parallel.py (+17, -13)
  8. aisynergy2/distributed/cross_region_params_manager.py (+13, -2)
  9. aisynergy2/optimizer/cross_region_optimizer.py (+11, -1)
  10. aisynergy2/training/strategy.py (+10, -2)
  11. aisynergy2_pretrain_with_megatron.py (+1, -1)
  12. aisynergy2_pretrain_with_mindspeed.py (+26, -0)
  13. docker/Dockerfile_nvidia (+0, -73)
  14. docker/README.md (+0, -2)
  15. docker/ascend/Dockerfile (+8, -0)
  16. docker/ascend/README.md (+21, -0)
  17. docker/nvidia/Dockerfile (+23, -0)
  18. docker/nvidia/README.md (+17, -0)
  19. examples/training_scalability_ascend_4_nodes/README.md (+18, -0)
  20. examples/training_scalability_ascend_4_nodes/aisyn2_crossDP2_innerDP2_tp1_pp1_globalRank0.sh (+116, -0)
  21. examples/training_scalability_ascend_4_nodes/aisyn2_crossDP2_innerDP2_tp1_pp1_globalRank1.sh (+116, -0)
  22. examples/training_scalability_ascend_4_nodes/config_aisyn2_crossDP2_innerDP2_tp1_pp1_globalRank0_node0.yaml (+25, -0)
  23. examples/training_scalability_ascend_4_nodes/config_aisyn2_crossDP2_innerDP2_tp1_pp1_globalRank0_node1.yaml (+25, -0)
  24. examples/training_scalability_ascend_4_nodes/config_aisyn2_crossDP2_innerDP2_tp1_pp1_globalRank1_node0.yaml (+25, -0)
  25. examples/training_scalability_ascend_4_nodes/config_aisyn2_crossDP2_innerDP2_tp1_pp1_globalRank1_node1.yaml (+25, -0)
  26. examples/training_scalability_ascend_4_nodes/dp4_tp1_pp1_nodeX.sh (+115, -0)
  27. examples/training_scalability_nvidia_4_nodes/README.md (+26, -0)
  28. examples/training_scalability_nvidia_4_nodes/config_tp10_crdp2_node0.yaml (+25, -0)
  29. examples/training_scalability_nvidia_4_nodes/config_tp10_crdp2_node1.yaml (+25, -0)
  30. examples/training_scalability_nvidia_4_nodes/config_tp10_crdp2_node2.yaml (+25, -0)
  31. examples/training_scalability_nvidia_4_nodes/config_tp10_crdp2_node3.yaml (+25, -0)
  32. examples/training_scalability_nvidia_4_nodes/config_tp11_crdp2_node0.yaml (+25, -0)
  33. examples/training_scalability_nvidia_4_nodes/config_tp11_crdp2_node1.yaml (+25, -0)
  34. examples/training_scalability_nvidia_4_nodes/config_tp11_crdp2_node2.yaml (+25, -0)
  35. examples/training_scalability_nvidia_4_nodes/config_tp11_crdp2_node3.yaml (+25, -0)
  36. examples/training_scalability_nvidia_4_nodes/config_tp12_crdp2_node0.yaml (+25, -0)
  37. examples/training_scalability_nvidia_4_nodes/config_tp12_crdp2_node1.yaml (+25, -0)
  38. examples/training_scalability_nvidia_4_nodes/config_tp12_crdp2_node2.yaml (+25, -0)
  39. examples/training_scalability_nvidia_4_nodes/config_tp12_crdp2_node3.yaml (+25, -0)
  40. examples/training_scalability_nvidia_4_nodes/config_tp13_crdp2_node0.yaml (+25, -0)
  41. examples/training_scalability_nvidia_4_nodes/config_tp13_crdp2_node1.yaml (+25, -0)
  42. examples/training_scalability_nvidia_4_nodes/config_tp13_crdp2_node2.yaml (+25, -0)
  43. examples/training_scalability_nvidia_4_nodes/config_tp13_crdp2_node3.yaml (+25, -0)
  44. examples/training_scalability_nvidia_4_nodes/config_tp1_crdp2_node0.yaml (+25, -0)
  45. examples/training_scalability_nvidia_4_nodes/config_tp1_crdp2_node1.yaml (+25, -0)
  46. examples/training_scalability_nvidia_4_nodes/config_tp1_crdp2_node2.yaml (+25, -0)
  47. examples/training_scalability_nvidia_4_nodes/config_tp1_crdp2_node3.yaml (+25, -0)
  48. examples/training_scalability_nvidia_4_nodes/config_tp2_crdp2_node0.yaml (+25, -0)
  49. examples/training_scalability_nvidia_4_nodes/config_tp2_crdp2_node1.yaml (+25, -0)
  50. examples/training_scalability_nvidia_4_nodes/config_tp2_crdp2_node2.yaml (+25, -0)
  51. examples/training_scalability_nvidia_4_nodes/config_tp2_crdp2_node3.yaml (+25, -0)
  52. examples/training_scalability_nvidia_4_nodes/config_tp3_crdp2_node0.yaml (+25, -0)
  53. examples/training_scalability_nvidia_4_nodes/config_tp3_crdp2_node1.yaml (+25, -0)
  54. examples/training_scalability_nvidia_4_nodes/config_tp3_crdp2_node2.yaml (+25, -0)
  55. examples/training_scalability_nvidia_4_nodes/config_tp3_crdp2_node3.yaml (+25, -0)
  56. examples/training_scalability_nvidia_4_nodes/config_tp4_crdp2_node0.yaml (+25, -0)
  57. examples/training_scalability_nvidia_4_nodes/config_tp4_crdp2_node1.yaml (+25, -0)
  58. examples/training_scalability_nvidia_4_nodes/config_tp4_crdp2_node2.yaml (+25, -0)
  59. examples/training_scalability_nvidia_4_nodes/config_tp4_crdp2_node3.yaml (+25, -0)
  60. examples/training_scalability_nvidia_4_nodes/config_tp5_crdp2_node0.yaml (+25, -0)
  61. examples/training_scalability_nvidia_4_nodes/config_tp5_crdp2_node1.yaml (+25, -0)
  62. examples/training_scalability_nvidia_4_nodes/config_tp5_crdp2_node2.yaml (+25, -0)
  63. examples/training_scalability_nvidia_4_nodes/config_tp5_crdp2_node3.yaml (+25, -0)
  64. examples/training_scalability_nvidia_4_nodes/config_tp6_crdp2_node0.yaml (+25, -0)
  65. examples/training_scalability_nvidia_4_nodes/config_tp6_crdp2_node1.yaml (+25, -0)
  66. examples/training_scalability_nvidia_4_nodes/config_tp6_crdp2_node2.yaml (+25, -0)
  67. examples/training_scalability_nvidia_4_nodes/config_tp6_crdp2_node3.yaml (+25, -0)
  68. examples/training_scalability_nvidia_4_nodes/config_tp7_crdp2_node0.yaml (+25, -0)
  69. examples/training_scalability_nvidia_4_nodes/config_tp7_crdp2_node1.yaml (+25, -0)
  70. examples/training_scalability_nvidia_4_nodes/config_tp7_crdp2_node2.yaml (+25, -0)
  71. examples/training_scalability_nvidia_4_nodes/config_tp7_crdp2_node3.yaml (+25, -0)
  72. examples/training_scalability_nvidia_4_nodes/config_tp8_crdp2_node0.yaml (+25, -0)
  73. examples/training_scalability_nvidia_4_nodes/config_tp8_crdp2_node1.yaml (+25, -0)
  74. examples/training_scalability_nvidia_4_nodes/config_tp8_crdp2_node2.yaml (+25, -0)
  75. examples/training_scalability_nvidia_4_nodes/config_tp8_crdp2_node3.yaml (+25, -0)
  76. examples/training_scalability_nvidia_4_nodes/config_tp9_crdp2_node0.yaml (+25, -0)
  77. examples/training_scalability_nvidia_4_nodes/config_tp9_crdp2_node1.yaml (+25, -0)
  78. examples/training_scalability_nvidia_4_nodes/config_tp9_crdp2_node2.yaml (+25, -0)
  79. examples/training_scalability_nvidia_4_nodes/config_tp9_crdp2_node3.yaml (+25, -0)
  80. examples/training_scalability_nvidia_4_nodes/tp10_crdp2_node0.sh (+111, -0)
  81. examples/training_scalability_nvidia_4_nodes/tp10_crdp2_node1.sh (+111, -0)
  82. examples/training_scalability_nvidia_4_nodes/tp10_crdp2_node2.sh (+111, -0)
  83. examples/training_scalability_nvidia_4_nodes/tp10_crdp2_node3.sh (+111, -0)
  84. examples/training_scalability_nvidia_4_nodes/tp11_crdp2_node0.sh (+111, -0)
  85. examples/training_scalability_nvidia_4_nodes/tp11_crdp2_node1.sh (+111, -0)
  86. examples/training_scalability_nvidia_4_nodes/tp11_crdp2_node2.sh (+111, -0)
  87. examples/training_scalability_nvidia_4_nodes/tp11_crdp2_node3.sh (+111, -0)
  88. examples/training_scalability_nvidia_4_nodes/tp12_crdp2_node0.sh (+111, -0)
  89. examples/training_scalability_nvidia_4_nodes/tp12_crdp2_node1.sh (+111, -0)
  90. examples/training_scalability_nvidia_4_nodes/tp12_crdp2_node2.sh (+111, -0)
  91. examples/training_scalability_nvidia_4_nodes/tp12_crdp2_node3.sh (+111, -0)
  92. examples/training_scalability_nvidia_4_nodes/tp13_crdp2_node0.sh (+111, -0)
  93. examples/training_scalability_nvidia_4_nodes/tp13_crdp2_node1.sh (+111, -0)
  94. examples/training_scalability_nvidia_4_nodes/tp13_crdp2_node2.sh (+111, -0)
  95. examples/training_scalability_nvidia_4_nodes/tp13_crdp2_node3.sh (+111, -0)
  96. examples/training_scalability_nvidia_4_nodes/tp13_node0.sh (+113, -0)
  97. examples/training_scalability_nvidia_4_nodes/tp13_node1.sh (+113, -0)
  98. examples/training_scalability_nvidia_4_nodes/tp1_crdp2_node0.sh (+111, -0)
  99. examples/training_scalability_nvidia_4_nodes/tp1_crdp2_node1.sh (+111, -0)
  100. examples/training_scalability_nvidia_4_nodes/tp1_crdp2_node2.sh (+111, -0)

.gitignore (+6, -0)

@@ -0,0 +1,6 @@
__pycache__
.vscode

#ascend
kernel_meta
fusion_result.json

README.md (+17, -10)

@@ -3,16 +3,23 @@
## Code Directory Structure

```
├── aisynergy2 //aisynergy2 core code
│   ├── comms.py //communication code (ElasticDeviceMesh)
│   └── diloco.py //two-level model wrapper (Diloco)
├── third_party //adapted third-party single-cluster parallel frameworks
│ └── Megatron-LM //adapted Megatron framework
├── docker //container image scripts
├── docs //documentation
├── images //image files
├── platform //platform deployment code
└── aisynergy2_pretrain_with_megatron.py //main entry point for aisynergy2 pretraining on Megatron
├── aisynergy2.py //aisynergy2 core code
├── comms //cross-region communication infrastructure
| ├── all_reduce_hook.py //defines the custom communication hook
| └── global_process_group_manager.py //creates and manages the cross-region global process group
├── config //configuration management
| ├── config.py //defines the config dataclasses and loading logic
| └── config.yaml //YAML config file for flexible user tuning
├── distributed //distributed parameter and gradient management
| ├── cross_region_data_parallel.py //cross-region data-parallel wrapper; manages cross-region sync of model params and grads
| └── cross_region_params_manager.py //contiguous-memory (bucketed) buffer management for params and grads
├── optimizer //cross-region optimizer wrapper
| └── cross_region_optimizer.py //lets the Megatron optimizer operate on cross-region params transparently, keeping training logic unchanged
├── training //training strategy control
| └── strategy.py //injected into Megatron's train_step via hooks, fully non-intrusive
└── utils //utilities and argument parsing
└── arguments.py //extends the Megatron CLI args with AISynergy2-specific config
```




aisynergy2/aisynergy2.py (+10, -12)

@@ -1,12 +1,13 @@
from typing import Optional
import torch
import os

from .utils.arguments import parse_args
from .config.config import load_config_from_yaml, AISynergy2Config
from .comms.global_process_group_manager import GlobalProcessGroupManager
from .training.strategy import CrossRegionStrategy
from .distributed.cross_region_data_parallel import CrossRegionDataParallel
from .optimizer.cross_region_optimizer import CrossRegionOptimizer
from .optimizer.cross_region_optimizer import CrossRegionOptimizer, build_cross_region_optimizer



@@ -14,7 +15,7 @@ class AISynergy2:
    def __init__(self, config: AISynergy2Config):
        self.config = config
        self.global_process_group_manager = GlobalProcessGroupManager(self.config.global_process_group_manager_config)
        self.cross_region_strategy: Optional[CrossRegionStrategy] = CrossRegionStrategy(self.config.global_train_config.cross_region_strategy_config, self)
        self.cross_region_strategy: Optional[CrossRegionStrategy] = CrossRegionStrategy(self, self.config.global_train_config.cross_region_strategy_config)
        self.cross_region_data_parallel: Optional[CrossRegionDataParallel] = None
        self.cross_region_opimizer: Optional[CrossRegionOptimizer] = None

@@ -27,17 +28,14 @@ class AISynergy2:
            origin_model,
            self.global_process_group_manager,
        )
        self.cross_region_data_parallel.bradcast_cross_region_params()
        torch.cuda.set_device(int(os.getenv("LOCAL_RANK")))
        origin_model.cuda(torch.cuda.current_device())
        self.cross_region_data_parallel.broadcast_cross_region_params()
        optimizer_config = self.config.global_train_config.cross_region_optimizer_config
        assert optimizer_config.optimizer == "sgd"
        optimizer = torch.optim.SGD(
            origin_model.parameters(),
            lr=optimizer_config.lr,
            weight_decay=optimizer_config.weight_decay,
            momentum=optimizer_config.sgd_momentum,
        )
        self.cross_region_opimizer = CrossRegionOptimizer(optimizer)

        self.cross_region_opimizer = build_cross_region_optimizer(origin_model, optimizer_config)
        return origin_model
@@ -45,7 +43,7 @@ class AISynergy2:

    def wrap_training_script(self, origin_training_script):
        assert hasattr(origin_training_script, "train_step")
        origin_train_step = getattr(origin_training_script.train_step)
        origin_train_step = getattr(origin_training_script, "train_step")
        wrapped_train_step = self.wrap_train_step(origin_train_step)
        setattr(origin_training_script, "train_step", wrapped_train_step)


aisynergy2/comms/all_reduce_hook.py (+11, -0)

@@ -0,0 +1,11 @@
import torch
import torch.distributed as dist

def all_reduce_hook(state, bucket, global_pg) -> torch.futures.Future[torch.Tensor]:
    tensor = bucket.buffer()
    tensor.div_(global_pg.size())
    return (
        dist.all_reduce(tensor, dist.ReduceOp.SUM, global_pg, async_op=True)
        .get_future()
        .then(lambda fut: fut.value()[0])
    )
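
For reference, a hedged single-process sketch of how this hook is driven. `MockBucket` and the gloo setup are illustrative assumptions; in the real flow the bucket is a `_ParamAndGradBucket` and `global_pg` comes from `GlobalProcessGroupManager.get_gobal_pg()`:

```
# Hypothetical demo (not repo code); assumes a backend whose Work objects
# support get_future(), which holds for gloo and nccl in recent PyTorch.
import os
import torch
import torch.distributed as dist
from aisynergy2.comms.all_reduce_hook import all_reduce_hook

class MockBucket:
    # Minimal stand-in exposing the buffer() accessor the hook expects.
    def __init__(self, grad_data):
        self.grad_data = grad_data
    def buffer(self):
        return self.grad_data

if __name__ == "__main__":
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("gloo", rank=0, world_size=1)
    bucket = MockBucket(torch.ones(8))
    fut = all_reduce_hook(None, bucket, dist.group.WORLD)
    print(fut.wait())  # buffer divided by world size, then summed across ranks
    dist.destroy_process_group()
```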

aisynergy2/comms/global_process_group_manager.py (+15, -5)

@@ -29,13 +29,23 @@ class GlobalProcessGroupManager:
        prefix_store = dist.PrefixStore("aisynergy2", self.global_store)
        if self.config.global_backend == "gloo":
            self.global_pg = dist.ProcessGroupGloo(
                prefix_store, self.config.global_rank, self.config.global_world_size, self.config.global_comm_timeout
                prefix_store, self.config.global_rank, self.config.global_world_size, timedelta(seconds=self.config.global_comm_timeout)
            )
        elif self.config.global_backend == "nccl":
            pg_options = dist.ProcessGroupNCCL.Options()
            pg_options._timeout = timedelta(seconds=self.config.global_comm_timeout)
            self.global_pg = dist.ProcessGroupNCCL(
                prefix_store, self.config.global_rank, self.config.global_world_size, pg_options
            )
        else:
            assert self.config.global_backend == "hccl"
            import torch_npu
            pg_options = torch_npu._C._distributed_c10d.ProcessGroupHCCL.Options()
            pg_options._timeout = timedelta(seconds=self.config.global_comm_timeout)
            self.global_pg = torch_npu._C._distributed_c10d.ProcessGroupHCCL(
                prefix_store, self.config.global_rank, self.config.global_world_size, pg_options
            )
        assert self.global_pg is not None

    def all_reduce(self, tensor: torch.Tensor, async_op: bool):
        tensor.div_(self.config.global_world_size)
        return dist.all_reduce(tensor, dist.ReduceOp.SUM, self.global_pg, async_op)

    def get_gobal_pg(self):
        return self.global_pg
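
A hedged, single-process sketch of the store plumbing this constructor relies on (host, port, rank, and world size below are placeholders, not the shipped config): a `TCPStore` rendezvous wrapped in a `PrefixStore`, handed to the backend-specific process-group constructor along with the rank, world size, and timeout:

```
# Standalone sketch of the gloo branch above (placeholder values).
from datetime import timedelta
import torch.distributed as dist

store = dist.TCPStore("127.0.0.1", 12123, 1, True)    # world_size=1, this rank is master
prefix_store = dist.PrefixStore("aisynergy2", store)  # namespaces all keys
global_pg = dist.ProcessGroupGloo(prefix_store, 0, 1, timedelta(seconds=60))
print(global_pg.size())  # -> 1
```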

aisynergy2/config/config.py (+1, -1)

@@ -25,7 +25,7 @@ class CrossRegionOptimizerConfig(BaseModel):

class CrossRegionStrategyConfig(BaseModel):
    inner_steps: int
    async_op: False
    async_op: bool = False

class GlobalTrainConfig(BaseModel):
    cross_region_data_parallel_config: CrossRegionDataParallelConfig
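
The one-line fix above matters: in a pydantic `BaseModel`, `async_op: False` uses the literal value `False` as the field's type annotation, which is not a valid type and breaks the model, whereas `async_op: bool = False` declares a typed field with a default. A minimal sketch, assuming pydantic:

```
from pydantic import BaseModel

class CrossRegionStrategyConfig(BaseModel):
    inner_steps: int
    async_op: bool = False  # typed field with a default; "async_op: False" is rejected

cfg = CrossRegionStrategyConfig(inner_steps=5)
assert cfg.async_op is False
```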


aisynergy2/distributed/cross_region_data_parallel.py (+17, -13)

@@ -3,6 +3,7 @@ import torch.distributed

from ..config.config import CrossRegionDataParallelConfig
from ..comms.global_process_group_manager import GlobalProcessGroupManager
from ..comms.all_reduce_hook import all_reduce_hook
from .cross_region_params_manager import _ParamAndGradBuffer

DTYPE_MAP = {
@@ -22,7 +23,9 @@ class CrossRegionDataParallel:
        self.module = module
        self.global_process_group_manager = global_process_group_manager
        self.comms_hook = self.defualt_comms_hook
        self.comms_hook_state = None
        self.comms_hook = None
        self._init_default_comms_hook()
        self.buffer = None
@@ -40,18 +43,17 @@ class CrossRegionDataParallel:
            bucket_size=self.config.bucket_size
        )

    def defualt_comms_hook(self):
        self.comms_hook = self.global_process_group_manager.all_reduce
    def _init_default_comms_hook(self):
        self.register_comms_hook(None, all_reduce_hook)

    def _make_comms_hook(self, comms_hook):
        global_pg = self.global_process_group_manager.get_gobal_pg()
        def wrapper(tensor, async_op):
            return comms_hook(tensor, global_pg, async_op)
    def _make_comms_hook(self, hook):
        def wrapper(bucket):
            return hook(self.comms_hook_state, bucket, self.global_process_group_manager.get_gobal_pg())
        return wrapper

    def register_comms_hook(self, comms_hook):
        comms_hook = self._make_comms_hook(comms_hook)
        self.comms_hook = comms_hook
    def register_comms_hook(self, state, hook):
        self.comms_hook_state = state
        self.comms_hook = self._make_comms_hook(hook)

    def start_grad_sync(self, async_op: bool = False):
        for bucket in self.buffer.get_buckets():
@@ -61,13 +63,15 @@ class CrossRegionDataParallel:
        for bucket in self.buffer.get_buckets():
            bucket.finish_grad_sync()

    def bradcast_cross_region_params(self):
    def broadcast_cross_region_params(self):
        global_pg = self.global_process_group_manager.get_gobal_pg()
        opts = torch.distributed.BarrierOptions()
        opts = torch.distributed.BroadcastOptions()
        opts.rootRank = 0
        opts.rootTensor = 0
        opts.asyncOp = False
        for param in self.module.parameters():
            global_pg.broadcast([param.data], opts)
            global_pg.broadcast([param.data], opts).wait()

    def caculate_cross_region_grads(self):
        self.buffer.caculate_grads()
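
With the new `register_comms_hook(state, hook)` interface, any callable with the `(state, bucket, global_pg)` signature can replace the default all-reduce. A hedged sketch of a hypothetical fp16-compressing hook (not repo code); the hook must return a future-like object, and the averaged result must land back in the bucket's buffer:

```
# Hypothetical compressed-communication hook; crdp is assumed to be an
# existing CrossRegionDataParallel instance.
import torch
import torch.distributed as dist

def fp16_compress_hook(state, bucket, global_pg):
    buf = bucket.buffer()
    # Compress to fp16 and pre-divide, mirroring the default hook's averaging.
    compressed = buf.to(torch.float16).div_(global_pg.size())
    work = dist.all_reduce(compressed, dist.ReduceOp.SUM, global_pg, async_op=True)

    def decompress(fut):
        buf.copy_(fut.value()[0].to(buf.dtype))  # write the average back in full precision
        return buf

    return work.get_future().then(decompress)

# crdp.register_comms_hook(state=None, hook=fp16_compress_hook)
```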


aisynergy2/distributed/cross_region_params_manager.py (+13, -2)

@@ -34,13 +34,24 @@ class _ParamAndGradBucket:
        self.bucket_id = bucket_id
        self.grad_sync_handle = None

    def start_grad_sync(self, comms_hook, async_op):
    def start_grad_sync(self, comms_hook, async_op: bool = False):
        assert self.grad_sync_handle is None
        self.grad_sync_handle = comms_hook(self.grad_data, async_op)
        cm = comms_hook(self)
        if async_op:
            self.grad_sync_handle = cm
        else:
            cm.wait()
            self.grad_sync_handle = None

    def finish_grad_sync(self):
        self.grad_sync_handle.wait()
        self.grad_sync_handle = None

    def buffer(self):
        return self.grad_data

    def index(self):
        return self.bucket_id

class _ParamAndGradBuffer:



aisynergy2/optimizer/cross_region_optimizer.py (+11, -1)

@@ -4,6 +4,8 @@ class CrossRegionOptimizer:
        self.optimizer = optimizer
        self.model_params_groups = []
        self.cross_region_params_groups = []

    def init_param_groups(self):
        for param_group in self.optimizer.param_groups:
            model_params_this_group = []
            cross_region_params_this_group = []
@@ -36,4 +38,12 @@ class CrossRegionOptimizer:
        # no-op due to grads sharing with CRDP (cross_region_data_parallel) buffer
        pass

def build_cross_region_optimizer(origin_model, optimizer_config):
    assert optimizer_config.optimizer == "sgd"
    optimizer = torch.optim.SGD(
        origin_model.parameters(),
        lr=optimizer_config.lr,
        weight_decay=optimizer_config.weight_decay,
        momentum=optimizer_config.sgd_momentum,
    )
    return CrossRegionOptimizer(optimizer)
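
A hedged usage sketch of the new factory with a toy model; `SimpleNamespace` stands in for the real `cross_region_optimizer_config` object and carries only the fields the factory reads:

```
import torch
from types import SimpleNamespace
from aisynergy2.optimizer.cross_region_optimizer import build_cross_region_optimizer

model = torch.nn.Linear(8, 8)  # toy stand-in for the wrapped Megatron model
opt_cfg = SimpleNamespace(optimizer="sgd", lr=0.01,
                          weight_decay=0.003, sgd_momentum=0.9)
cr_opt = build_cross_region_optimizer(model, opt_cfg)
cr_opt.init_param_groups()  # populate the model / cross-region param groups
```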

aisynergy2/training/strategy.py (+10, -2)

@@ -1,8 +1,8 @@
from ..config.config import CrossRegionStrategyConfig
from ..aisynergy2 import AISynergy2
import time

class CrossRegionStrategy:
    def __init__(self, config: CrossRegionStrategyConfig, aisynergy2: AISynergy2):
    def __init__(self, aisynergy2, config: CrossRegionStrategyConfig):
        self.config = config
        self.aisynergy2 = aisynergy2
        self.local_step: int = 0
@@ -10,14 +10,22 @@ class CrossRegionStrategy:
    def train_step_pre_hook(self):
        if self.local_step == 0:
            self.aisynergy2.cross_region_data_parallel.allocate_buffer_for_params()
            self.aisynergy2.cross_region_opimizer.init_param_groups()

    def train_step_post_hook(self):
        self.local_step += 1
        if self.local_step % self.config.inner_steps == 0:
            # TODO inaccurate, make a sync timer
            time1 = time.time()
            self.aisynergy2.cross_region_data_parallel.caculate_cross_region_grads()
            time2 = time.time()
            assert not self.config.async_op
            self.aisynergy2.cross_region_data_parallel.start_grad_sync(self.config.async_op)
            time3 = time.time()
            self.aisynergy2.cross_region_opimizer.step()
            time4 = time.time()
            self.aisynergy2.cross_region_data_parallel.zero_grad_buffer()
            time5 = time.time()
            print("**iteration {} cross-region update | Time:{}->{}->{}->{}->{}".format(self.local_step, time1, time2, time3, time4, time5))

aisynergy2_pretrain_with_megatron.py (+1, -1)

@@ -1,7 +1,7 @@
from aisynergy2 import build_aisynergy2_and_wrap_megatron

import megatron.training.training as megatron_training_script
import third_party.MegatronLM.pretrain_gpt as local_train_script
import third_party.MegatronLM.megatron.training.training as megatron_training_script


if __name__ == "__main__":


aisynergy2_pretrain_with_mindspeed.py (+26, -0)

@@ -0,0 +1,26 @@
import torch
import mindspeed.megatron_adaptor
from aisynergy2 import build_aisynergy2_and_wrap_megatron

import megatron.training.training as megatron_training_script
import third_party.MegatronLM.pretrain_gpt as local_train_script


if __name__ == "__main__":
    # aisynergy2 wrapper
    aisynergy2, wrapped_model_provider = build_aisynergy2_and_wrap_megatron(
        local_train_script.model_provider,
        megatron_training_script,
    )

    # Temporary for transition to core datasets
    local_train_script.train_valid_test_datasets_provider.is_distributed = True

    local_train_script.pretrain(
        local_train_script.train_valid_test_datasets_provider,
        wrapped_model_provider,
        local_train_script.ModelType.encoder_or_decoder,
        local_train_script.forward_step,
        args_defaults={'tokenizer_type': 'GPT2BPETokenizer'},
    )

docker/Dockerfile_nvidia (+0, -73)

@@ -1,73 +0,0 @@
# Use the official CUDA 12.4 image (with cuDNN)
FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive

# Install nvcc and the CUDA Toolkit (development tools)
#RUN apt-get update && \
#    apt-get install -y --no-install-recommends cuda-toolkit-12-4 && \
#    rm -rf /var/lib/apt/lists/*

# Verify (optional)
RUN which nvcc && nvcc --version

# Install Python 3.10
RUN apt-get update && apt-get install -y \
    python3.10 \
    python3.10-dev \
    python3-pip \
    python3-setuptools \
    python3-venv \
    curl \
    git \
    ca-certificates \
    && rm -rf /var/lib/apt/lists/*

# Set the default Python
RUN ln -sf /usr/bin/python3.10 /usr/bin/python3 \
    && ln -sf /usr/bin/python3.10 /usr/bin/python \
    && ln -sf /usr/bin/pip3 /usr/bin/pip

# Upgrade pip
RUN pip install --upgrade pip -i https://pypi.tuna.tsinghua.edu.cn/simple

# ⚠️ Install PyTorch 2.6.0 (nightly preview for CUDA 12.4)
RUN pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124 -i https://pypi.tuna.tsinghua.edu.cn/simple

# Verify that PyTorch is installed correctly
RUN python -c "import torch; print('PyTorch version:', torch.__version__); print('CUDA available:', torch.cuda.is_available())"


# ✅ Fix: verify the version and GPU (as a single line or with proper continuations)
RUN python3 -c 'import torch; \
print("\n✅ Installed PyTorch version:", torch.__version__); \
print("✅ Built with CUDA:", torch.version.cuda); \
print("✅ cuDNN enabled:", torch.backends.cudnn.enabled); \
print("✅ CUDA available:", torch.cuda.is_available())'

# Install git (if the base image lacks it)
RUN apt-get install -y git && rm -rf /var/lib/apt/lists/*

# Clone and install Megatron-LM core_v0.12.1
RUN git clone --branch core_v0.12.1 https://github.com/NVIDIA/Megatron-LM.git /opt/Megatron-LM \
    && cd /opt/Megatron-LM \
    && pip install -e . -i https://pypi.tuna.tsinghua.edu.cn/simple

# Verify the installation
RUN python -c "import megatron; print('✅ Megatron-LM core_v0.12.1 is ready!')"

# Install pybind11 (used to compile Megatron's dataset acceleration modules)
RUN pip install pybind11 -i https://pypi.tuna.tsinghua.edu.cn/simple

# Optional: install the build toolchain (usually present already; added to be safe)
RUN apt-get install -y build-essential && rm -rf /var/lib/apt/lists/*


# Install Apex (optional, for advanced fused ops). If /apex is already downloaded, run the COPY directly; otherwise use the git clone below instead.
COPY ./apex /tmp/apex
#RUN git clone https://github.com/NVIDIA/apex.git /tmp/apex \
RUN cd /tmp/apex \
    && APEX_CPP_EXT=1 APEX_CUDA_EXT=1 pip install -v --no-build-isolation . \
    && rm -rf /tmp/apex

WORKDIR /workspace

docker/README.md (+0, -2)

@@ -1,2 +0,0 @@
Usage command for the Dockerfile_nvidia file (just substitute your image name below):
sudo docker build -t $(your-image-name) .

docker/ascend/Dockerfile (+8, -0)

@@ -0,0 +1,8 @@
FROM quay.io/ascend/cann:8.2.rc1.alpha002-910b-ubuntu22.04-py3.10
WORKDIR /workspace
RUN apt-get update && apt-get install -y git vim python3-dev net-tools
RUN pip3 install torch-npu==2.6.0.post3 pydantic -i https://repo.huaweicloud.com/repository/pypi/simple

RUN pip3 install pybind11 nltk pyarrow pandas ninja wheel "numpy<=1.26.0" six regex decorator attrs psutil pyyaml protobuf einops scipy sentencepiece pytest pytest-mock "transformers>=4.43.2" gpytorch pandas scikit-learn SQLAlchemy -i https://repo.huaweicloud.com/repository/pypi/simple

ENTRYPOINT ["/bin/bash"]

docker/ascend/README.md (+21, -0)

@@ -0,0 +1,21 @@
## How to use Dockerfile_ascend910B
## Version notes
```
python==3.10
torch==2.6.0
torch_npu==2.6.0.post3
transformers==4.57.1
CANN 8.2.RC1
ubuntu 22.04
```

## Build command
```
docker build -t $YOUR_IMAGE -f Dockerfile_ascend910B .
```

## Run command
```
docker run -it --privileged=true --net=host --rm --shm-size 64G --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi -v /usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/common -v /usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/driver -v /etc/ascend_install.info:/etc/ascend_install.info -v /etc/vnpu.cfg:/etc/vnpu.cfg -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info -v /usr/local/openmpi/:/usr/local/openmpi/ -v /path/to/aisynergy2:/workspace/aisynergy2 -v /path/to/dataset:/workspace/dataset -v /path/to/checkpoints:/workspace/checkpoints -e PYTHONPATH="/workspace/aisynergy2/third_party/MegatronLM:/workspace/aisynergy2/third_party/MindSpeed:${PYTHONPATH}" image_name
```


docker/nvidia/Dockerfile (+23, -0)

@@ -0,0 +1,23 @@
# NVIDIA NGC PyTorch base image (CUDA and cuDNN included)
FROM nvcr.io/nvidia/pytorch:24.08-py3

ENV DEBIAN_FRONTEND=noninteractive

# Set environment variables
ENV TZ=UTC
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONPATH="/workspace/AISynergy2/third_party/MegatronLM:${PYTHONPATH}"

# Set the time zone
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

WORKDIR /workspace

# ✅ Key: set a default PYTHONPATH (optional; overriding it at docker run time is recommended)
# ENV PYTHONPATH="/workspace/megatron:${PYTHONPATH}"

# ✅ Final note: mount the Megatron source when using this image
CMD ["bash"]

# Run command
#sudo docker run -e TZ=UTC --privileged --net=host --gpus all -it --rm --shm-size 64G -v /userpath/AISynergy2:/workspace/AISynergy2 -v /userpath/dataset:/workspace/dataset -v /userpath/checkpoints:/workspace/checkpoints -e PYTHONPATH="/workspace/AISynergy2/third_party/MegatronLM:${PYTHONPATH}" image_name:tag

docker/nvidia/README.md (+17, -0)

@@ -0,0 +1,17 @@
## Version notes
```
python 3.10
pytorch 2.6.0
cuda 12.2.2
ubuntu 22.04
```

## Build command
```
docker build -t aisynergy2_nvidia:v1 .
```

## Run command
```
docker run --privileged --net=host --gpus all -it --rm --shm-size 64G -v /path/to/aisynergy2:/workspace/aisynergy2 -v /path/to/dataset:/workspace/dataset aisynergy2_nvidia:v1
```

examples/training_scalability_ascend_4_nodes/README.md (+18, -0)

@@ -0,0 +1,18 @@
## Container launch command
```
docker run --privileged --net=host --gpus all -it --rm --shm-size 64G --device=/dev/davinci_manager --device=/dev/devmm_svm --device=/dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi -v /usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/common -v /usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/driver -v /etc/ascend_install.info:/etc/ascend_install.info -v /etc/vnpu.cfg:/etc/vnpu.cfg -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info -v /usr/local/openmpi/:/usr/local/openmpi/ -v /path/to/aisynergy2:/workspace/aisynergy2 -v /path/to/dataset:/workspace/dataset -v /path/to/checkpoints:/workspace/checkpoints -e PYTHONPATH="/workspace/aisynergy2/third_party/MegatronLM:/workspace/aisynergy2/third_party/MindSpeed:${PYTHONPATH}" image_name
```

## Run inside the container
```
cd /workspace/aisynergy2
# example: 4 nodes with 1 card each, simulating DP=4, TP=1, PP=1 training
# 1.1 run on node 1
bash examples/training_scalability_ascend_4_nodes/dp4_tp1_pp1_nodeX.sh 0
# 1.2 run on node 2
bash examples/training_scalability_ascend_4_nodes/dp4_tp1_pp1_nodeX.sh 1
# 1.3 run on node 3
bash examples/training_scalability_ascend_4_nodes/dp4_tp1_pp1_nodeX.sh 2
# 1.4 run on node 4
bash examples/training_scalability_ascend_4_nodes/dp4_tp1_pp1_nodeX.sh 3
```

examples/training_scalability_ascend_4_nodes/aisyn2_crossDP2_innerDP2_tp1_pp1_globalRank0.sh (+116, -0)

@@ -0,0 +1,116 @@
#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1

# export ASCEND_SLOG_PRINT_TO_STDOUT=1
# export ASCEND_GLOBAL_LOG_LEVEL=1

#export ASCEND_RT_VISIBLE_DEVICES=0

export GLOO_SOCKET_IFNAME=bond_virt

TENSOR_PARALLEL=1

GPUS_PER_NODE=1
# Change for multinode config
MASTER_ADDR=192.168.190.71
MASTER_PORT=6662
NUM_NODES=2
NODE_RANK=$1
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))

SH_FILE=$(basename $0)
TASK_VERSION=${SH_FILE%.*}
echo $TASK_VERSION

CHECKPOINT_PATH=/workspace/checkpoints/checkpoints/${TASK_VERSION}_node${NODE_RANK}
if [ ! -d $CHECKPOINT_PATH ]
then
mkdir -p $CHECKPOINT_PATH
else
echo "文件夹已经存在"
fi

TENSORBOARD_PATH=/workspace/checkpoints/tensorboard/${TASK_VERSION}_node${NODE_RANK}
if [ ! -d $TENSORBOARD_PATH ]
then
mkdir -p $TENSORBOARD_PATH
else
echo "文件夹已经存在"
fi

VOCAB_FILE=/workspace/dataset/vocab.json
MERGE_FILE=/workspace/dataset/merges.txt
DATA_PATH=/workspace/dataset/alpaca_text_document

DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)


GPT_MODEL_ARGS=(
--tensor-model-parallel-size $TENSOR_PARALLEL
--num-layers 16
--hidden-size 2048
--num-attention-heads 16
)

TRAINING_ARGS=(
--transformer-impl local
--disable-bias-linear
--no-masked-softmax-fusion
--attention-backend flash
--use-flash-attn
--pipeline-model-parallel-size 1
--distributed-backend nccl
# --use-distributed-optimizer
--overlap-grad-reduce
# --overlap-param-gather
--seq-length 4096
--max-position-embeddings 4096
--micro-batch-size 4
--global-batch-size 64
--train-iters 500000
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--lr 6.0e-5
--lr-decay-style cosine
--min-lr 6.0e-6
--lr-warmup-fraction .001
--lr-decay-iters 430000
--fp16
)


DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--merge-file $MERGE_FILE
--split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
--log-throughput
--log-interval 10
--save-interval 10000
--eval-interval 1000
--eval-iters 10
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
# --tensorboard-dir $TENSORBOARD_PATH
)


torchrun ${DISTRIBUTED_ARGS[@]} aisynergy2_pretrain_with_mindspeed.py --aisynergy2-yaml-config-path examples/training_scalability_ascend_4_nodes/config_${TASK_VERSION}_node${NODE_RANK}.yaml \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
2>&1 | tee $CHECKPOINT_PATH/log_aisyn2_nr$NODE_RANK.log

examples/training_scalability_ascend_4_nodes/aisyn2_crossDP2_innerDP2_tp1_pp1_globalRank1.sh (+116, -0)

@@ -0,0 +1,116 @@
#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1

# export ASCEND_SLOG_PRINT_TO_STDOUT=1
# export ASCEND_GLOBAL_LOG_LEVEL=1

#export ASCEND_RT_VISIBLE_DEVICES=0

export GLOO_SOCKET_IFNAME=bond_virt

TENSOR_PARALLEL=1

GPUS_PER_NODE=1
# Change for multinode config
MASTER_ADDR=192.168.190.75
MASTER_PORT=6662
NUM_NODES=2
NODE_RANK=$1
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))

SH_FILE=$(basename $0)
TASK_VERSION=${SH_FILE%.*}
echo $TASK_VERSION

CHECKPOINT_PATH=/workspace/checkpoints/checkpoints/${TASK_VERSION}_node${NODE_RANK}
if [ ! -d $CHECKPOINT_PATH ]
then
mkdir -p $CHECKPOINT_PATH
else
echo "文件夹已经存在"
fi

TENSORBOARD_PATH=/workspace/checkpoints/tensorboard/${TASK_VERSION}_node${NODE_RANK}
if [ ! -d $TENSORBOARD_PATH ]
then
mkdir -p $TENSORBOARD_PATH
else
echo "文件夹已经存在"
fi

VOCAB_FILE=/workspace/dataset/vocab.json
MERGE_FILE=/workspace/dataset/merges.txt
DATA_PATH=/workspace/dataset/alpaca_text_document

DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)


GPT_MODEL_ARGS=(
--tensor-model-parallel-size $TENSOR_PARALLEL
--num-layers 16
--hidden-size 2048
--num-attention-heads 16
)

TRAINING_ARGS=(
--transformer-impl local
--disable-bias-linear
--no-masked-softmax-fusion
--attention-backend flash
--use-flash-attn
--pipeline-model-parallel-size 1
--distributed-backend nccl
# --use-distributed-optimizer
--overlap-grad-reduce
# --overlap-param-gather
--seq-length 4096
--max-position-embeddings 4096
--micro-batch-size 4
--global-batch-size 64
--train-iters 500000
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--lr 6.0e-5
--lr-decay-style cosine
--min-lr 6.0e-6
--lr-warmup-fraction .001
--lr-decay-iters 430000
--fp16
)


DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--merge-file $MERGE_FILE
--split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
--log-throughput
--log-interval 10
--save-interval 10000
--eval-interval 1000
--eval-iters 10
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
# --tensorboard-dir $TENSORBOARD_PATH
)


torchrun ${DISTRIBUTED_ARGS[@]} aisynergy2_pretrain_with_mindspeed.py --aisynergy2-yaml-config-path examples/training_scalability_ascend_4_nodes/config_${TASK_VERSION}_node${NODE_RANK}.yaml \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
2>&1 | tee $CHECKPOINT_PATH/log_aisyn2_nr$NODE_RANK.log

examples/training_scalability_ascend_4_nodes/config_aisyn2_crossDP2_innerDP2_tp1_pp1_globalRank0_node0.yaml (+25, -0)

@@ -0,0 +1,25 @@
global_process_group_manager_config:
  global_master_addr: "192.168.190.71"
  global_master_port: 12123
  global_world_size: 2
  global_rank: 0
  global_backend: "hccl"
  global_comm_timeout: 60

global_train_config:
  cross_region_data_parallel_config:
    bucket_size: 40000000
    cross_region_param_dtype: "fp32"
    cross_region_grad_dtype: "fp32"

  cross_region_optimizer_config:
    optimizer: "sgd"
    lr: 0.01
    weight_decay: 0.003
    sgd_momentum: 0.9
  cross_region_strategy_config:
    inner_steps: 5
    async_op: false


examples/training_scalability_ascend_4_nodes/config_aisyn2_crossDP2_innerDP2_tp1_pp1_globalRank0_node1.yaml (+25, -0)

@@ -0,0 +1,25 @@
global_process_group_manager_config:
  global_master_addr: "192.168.190.74"
  global_master_port: 12123
  global_world_size: 2
  global_rank: 0
  global_backend: "hccl"
  global_comm_timeout: 60

global_train_config:
  cross_region_data_parallel_config:
    bucket_size: 40000000
    cross_region_param_dtype: "fp32"
    cross_region_grad_dtype: "fp32"

  cross_region_optimizer_config:
    optimizer: "sgd"
    lr: 0.01
    weight_decay: 0.003
    sgd_momentum: 0.9
  cross_region_strategy_config:
    inner_steps: 5
    async_op: false


examples/training_scalability_ascend_4_nodes/config_aisyn2_crossDP2_innerDP2_tp1_pp1_globalRank1_node0.yaml (+25, -0)

@@ -0,0 +1,25 @@
global_process_group_manager_config:
  global_master_addr: "192.168.190.71"
  global_master_port: 12123
  global_world_size: 2
  global_rank: 1
  global_backend: "hccl"
  global_comm_timeout: 60

global_train_config:
  cross_region_data_parallel_config:
    bucket_size: 40000000
    cross_region_param_dtype: "fp32"
    cross_region_grad_dtype: "fp32"

  cross_region_optimizer_config:
    optimizer: "sgd"
    lr: 0.01
    weight_decay: 0.003
    sgd_momentum: 0.9
  cross_region_strategy_config:
    inner_steps: 5
    async_op: false


examples/training_scalability_ascend_4_nodes/config_aisyn2_crossDP2_innerDP2_tp1_pp1_globalRank1_node1.yaml (+25, -0)

@@ -0,0 +1,25 @@
global_process_group_manager_config:
  global_master_addr: "192.168.190.74"
  global_master_port: 12123
  global_world_size: 2
  global_rank: 1
  global_backend: "hccl"
  global_comm_timeout: 60

global_train_config:
  cross_region_data_parallel_config:
    bucket_size: 40000000
    cross_region_param_dtype: "fp32"
    cross_region_grad_dtype: "fp32"

  cross_region_optimizer_config:
    optimizer: "sgd"
    lr: 0.01
    weight_decay: 0.003
    sgd_momentum: 0.9
  cross_region_strategy_config:
    inner_steps: 5
    async_op: false


examples/training_scalability_ascend_4_nodes/dp4_tp1_pp1_nodeX.sh (+115, -0)

@@ -0,0 +1,115 @@
#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1

# export ASCEND_SLOG_PRINT_TO_STDOUT=1
# export ASCEND_GLOBAL_LOG_LEVEL=1
#export ASCEND_RT_VISIBLE_DEVICES=0

export GLOO_SOCKET_IFNAME=bond_virt

TENSOR_PARALLEL=1

GPUS_PER_NODE=1
# Change for multinode config
MASTER_ADDR=192.168.190.77 # $YOUR_MASTER_ADDR
MASTER_PORT=6661 # $YOUR_MASTER_PORT
NUM_NODES=4
NODE_RANK=$1
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))

SH_FILE=$(basename $0)
TASK_VERSION=${SH_FILE%.*}_$NODE_RANK
echo $TASK_VERSION

CHECKPOINT_PATH=/workspace/checkpoints/checkpoints/${TASK_VERSION}_node${NODE_RANK}
if [ ! -d $CHECKPOINT_PATH ]
then
mkdir -p $CHECKPOINT_PATH
else
echo "文件夹已经存在"
fi

TENSORBOARD_PATH=/workspace/checkpoints/tensorboard/${TASK_VERSION}_node${NODE_RANK}
if [ ! -d $TENSORBOARD_PATH ]
then
mkdir -p $TENSORBOARD_PATH
else
echo "文件夹已经存在"
fi

VOCAB_FILE=/workspace/dataset/vocab.json
MERGE_FILE=/workspace/dataset/merges.txt
DATA_PATH=/workspace/dataset/alpaca_text_document

DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)


GPT_MODEL_ARGS=(
--tensor-model-parallel-size $TENSOR_PARALLEL
--num-layers 20
--hidden-size 2048
--num-attention-heads 16
)

TRAINING_ARGS=(
--transformer-impl local
--disable-bias-linear
--no-masked-softmax-fusion
--attention-backend flash
--use-flash-attn
--pipeline-model-parallel-size 1
--distributed-backend nccl
# --use-distributed-optimizer
--overlap-grad-reduce
# --overlap-param-gather
--seq-length 4096
--max-position-embeddings 4096
--micro-batch-size 4
--global-batch-size 128
--train-iters 500000
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--lr 6.0e-5
--lr-decay-style cosine
--min-lr 6.0e-6
--lr-warmup-fraction .001
--lr-decay-iters 430000
--fp16
)


DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--merge-file $MERGE_FILE
--split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
--log-throughput
--log-interval 10
--save-interval 10000
--eval-interval 1000
--eval-iters 10
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
# --tensorboard-dir $TENSORBOARD_PATH
)


torchrun ${DISTRIBUTED_ARGS[@]} pretrain_with_mindspeed.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
2>&1 | tee $CHECKPOINT_PATH/log_nr$NODE_RANK.log

examples/training_scalability_nvidia_4_nodes/README.md (+26, -0)

@@ -0,0 +1,26 @@
## Container launch command
```
docker run --privileged --net=host --gpus all -it --rm --shm-size 64G -v /path/to/aisynergy2:/workspace/aisynergy2 -v /path/to/dataset:/workspace/dataset -v /path/to/checkpoints:/workspace/checkpoints -e PYTHONPATH="/workspace/aisynergy2/third_party/MegatronLM:${PYTHONPATH}" image_name
```

## Run inside the container
```
cd aisynergy2
bash examples/path/to/xx.sh
```

## Example:
```
docker run -e TZ=UTC --privileged --net=host --gpus all -it --rm --shm-size 64G -v /path/to/AISynergy2:/workspace/AISynergy2 -v /path/to/dataset:/workspace/dataset -v /path/to/checkpoints:/workspace/checkpoints -e PYTHONPATH="/workspace/AISynergy2/third_party/MegatronLM:${PYTHONPATH}" nvcr.io/nvidia/pytorch:24.08-py3

cd /workspace/AISynergy2
bash examples/training_scalability_nvidia_4_nodes/tp4_crdp2_node0.sh
bash examples/training_scalability_nvidia_4_nodes/tp4_node0.sh
```

## Dataset URL:
```
https://openi.pcl.ac.cn/explore/datasets/qinsh/aisynergy2_test_data

Download all the files from the URL above and place them under /path/to/dataset
```

examples/training_scalability_nvidia_4_nodes/config_tp10_crdp2_node0.yaml (+25, -0)

@@ -0,0 +1,25 @@
global_process_group_manager_config:
  global_master_addr: "192.168.242.8"
  global_master_port: 12123
  global_world_size: 2
  global_rank: 0
  global_backend: "nccl"
  global_comm_timeout: 60

global_train_config:
  cross_region_data_parallel_config:
    bucket_size: 40000000
    cross_region_param_dtype: "fp32"
    cross_region_grad_dtype: "fp32"

  cross_region_optimizer_config:
    optimizer: "sgd"
    lr: 0.01
    weight_decay: 0.003
    sgd_momentum: 0.9
  cross_region_strategy_config:
    inner_steps: 50
    async_op: false


examples/training_scalability_nvidia_4_nodes/config_tp10_crdp2_node1.yaml (+25, -0)

@@ -0,0 +1,25 @@
global_process_group_manager_config:
  global_master_addr: "192.168.242.14"
  global_master_port: 12123
  global_world_size: 2
  global_rank: 0
  global_backend: "nccl"
  global_comm_timeout: 60

global_train_config:
  cross_region_data_parallel_config:
    bucket_size: 40000000
    cross_region_param_dtype: "fp32"
    cross_region_grad_dtype: "fp32"

  cross_region_optimizer_config:
    optimizer: "sgd"
    lr: 0.01
    weight_decay: 0.003
    sgd_momentum: 0.9
  cross_region_strategy_config:
    inner_steps: 50
    async_op: false


examples/training_scalability_nvidia_4_nodes/config_tp10_crdp2_node2.yaml (+25, -0)

@@ -0,0 +1,25 @@
global_process_group_manager_config:
  global_master_addr: "192.168.242.8"
  global_master_port: 12123
  global_world_size: 2
  global_rank: 1
  global_backend: "nccl"
  global_comm_timeout: 60

global_train_config:
  cross_region_data_parallel_config:
    bucket_size: 40000000
    cross_region_param_dtype: "fp32"
    cross_region_grad_dtype: "fp32"

  cross_region_optimizer_config:
    optimizer: "sgd"
    lr: 0.01
    weight_decay: 0.003
    sgd_momentum: 0.9
  cross_region_strategy_config:
    inner_steps: 50
    async_op: false


examples/training_scalability_nvidia_4_nodes/config_tp10_crdp2_node3.yaml (+25, -0)

@@ -0,0 +1,25 @@
global_process_group_manager_config:
  global_master_addr: "192.168.242.14"
  global_master_port: 12123
  global_world_size: 2
  global_rank: 1
  global_backend: "nccl"
  global_comm_timeout: 60

global_train_config:
  cross_region_data_parallel_config:
    bucket_size: 40000000
    cross_region_param_dtype: "fp32"
    cross_region_grad_dtype: "fp32"

  cross_region_optimizer_config:
    optimizer: "sgd"
    lr: 0.01
    weight_decay: 0.003
    sgd_momentum: 0.9
  cross_region_strategy_config:
    inner_steps: 50
    async_op: false


examples/training_scalability_nvidia_4_nodes/config_tp11_crdp2_node0.yaml (+25, -0)

@@ -0,0 +1,25 @@
global_process_group_manager_config:
  global_master_addr: "192.168.242.8"
  global_master_port: 12123
  global_world_size: 2
  global_rank: 0
  global_backend: "nccl"
  global_comm_timeout: 300

global_train_config:
  cross_region_data_parallel_config:
    bucket_size: 40000000
    cross_region_param_dtype: "fp32"
    cross_region_grad_dtype: "fp32"

  cross_region_optimizer_config:
    optimizer: "sgd"
    lr: 0.01
    weight_decay: 0.003
    sgd_momentum: 0.9
  cross_region_strategy_config:
    inner_steps: 50
    async_op: false


examples/training_scalability_nvidia_4_nodes/config_tp11_crdp2_node1.yaml (+25, -0)

@@ -0,0 +1,25 @@
global_process_group_manager_config:
  global_master_addr: "192.168.242.14"
  global_master_port: 12123
  global_world_size: 2
  global_rank: 0
  global_backend: "nccl"
  global_comm_timeout: 300

global_train_config:
  cross_region_data_parallel_config:
    bucket_size: 40000000
    cross_region_param_dtype: "fp32"
    cross_region_grad_dtype: "fp32"

  cross_region_optimizer_config:
    optimizer: "sgd"
    lr: 0.01
    weight_decay: 0.003
    sgd_momentum: 0.9
  cross_region_strategy_config:
    inner_steps: 50
    async_op: false


examples/training_scalability_nvidia_4_nodes/config_tp11_crdp2_node2.yaml (+25, -0)

@@ -0,0 +1,25 @@
global_process_group_manager_config:
  global_master_addr: "192.168.242.8"
  global_master_port: 12123
  global_world_size: 2
  global_rank: 1
  global_backend: "nccl"
  global_comm_timeout: 300

global_train_config:
  cross_region_data_parallel_config:
    bucket_size: 40000000
    cross_region_param_dtype: "fp32"
    cross_region_grad_dtype: "fp32"

  cross_region_optimizer_config:
    optimizer: "sgd"
    lr: 0.01
    weight_decay: 0.003
    sgd_momentum: 0.9
  cross_region_strategy_config:
    inner_steps: 50
    async_op: false


examples/training_scalability_nvidia_4_nodes/config_tp11_crdp2_node3.yaml (+25, -0)

@@ -0,0 +1,25 @@
global_process_group_manager_config:
  global_master_addr: "192.168.242.14"
  global_master_port: 12123
  global_world_size: 2
  global_rank: 1
  global_backend: "nccl"
  global_comm_timeout: 300

global_train_config:
  cross_region_data_parallel_config:
    bucket_size: 40000000
    cross_region_param_dtype: "fp32"
    cross_region_grad_dtype: "fp32"

  cross_region_optimizer_config:
    optimizer: "sgd"
    lr: 0.01
    weight_decay: 0.003
    sgd_momentum: 0.9
  cross_region_strategy_config:
    inner_steps: 50
    async_op: false


examples/training_scalability_nvidia_4_nodes/config_tp12_crdp2_node0.yaml (+25, -0)

@@ -0,0 +1,25 @@
global_process_group_manager_config:
  global_master_addr: "192.168.242.8"
  global_master_port: 12123
  global_world_size: 2
  global_rank: 0
  global_backend: "nccl"
  global_comm_timeout: 60

global_train_config:
  cross_region_data_parallel_config:
    bucket_size: 40000000
    cross_region_param_dtype: "fp32"
    cross_region_grad_dtype: "fp32"

  cross_region_optimizer_config:
    optimizer: "sgd"
    lr: 0.01
    weight_decay: 0.003
    sgd_momentum: 0.9
  cross_region_strategy_config:
    inner_steps: 50
    async_op: false


examples/training_scalability_nvidia_4_nodes/config_tp12_crdp2_node1.yaml (+25, -0)

@@ -0,0 +1,25 @@
global_process_group_manager_config:
  global_master_addr: "192.168.242.14"
  global_master_port: 12123
  global_world_size: 2
  global_rank: 0
  global_backend: "nccl"
  global_comm_timeout: 60

global_train_config:
  cross_region_data_parallel_config:
    bucket_size: 40000000
    cross_region_param_dtype: "fp32"
    cross_region_grad_dtype: "fp32"

  cross_region_optimizer_config:
    optimizer: "sgd"
    lr: 0.01
    weight_decay: 0.003
    sgd_momentum: 0.9
  cross_region_strategy_config:
    inner_steps: 50
    async_op: false


examples/training_scalability_nvidia_4_nodes/config_tp12_crdp2_node2.yaml (+25, -0)

@@ -0,0 +1,25 @@
global_process_group_manager_config:
  global_master_addr: "192.168.242.8"
  global_master_port: 12123
  global_world_size: 2
  global_rank: 1
  global_backend: "nccl"
  global_comm_timeout: 60

global_train_config:
  cross_region_data_parallel_config:
    bucket_size: 40000000
    cross_region_param_dtype: "fp32"
    cross_region_grad_dtype: "fp32"

  cross_region_optimizer_config:
    optimizer: "sgd"
    lr: 0.01
    weight_decay: 0.003
    sgd_momentum: 0.9
  cross_region_strategy_config:
    inner_steps: 50
    async_op: false


examples/training_scalability_nvidia_4_nodes/config_tp12_crdp2_node3.yaml (+25, -0)

@@ -0,0 +1,25 @@
global_process_group_manager_config:
  global_master_addr: "192.168.242.14"
  global_master_port: 12123
  global_world_size: 2
  global_rank: 1
  global_backend: "nccl"
  global_comm_timeout: 60

global_train_config:
  cross_region_data_parallel_config:
    bucket_size: 40000000
    cross_region_param_dtype: "fp32"
    cross_region_grad_dtype: "fp32"

  cross_region_optimizer_config:
    optimizer: "sgd"
    lr: 0.01
    weight_decay: 0.003
    sgd_momentum: 0.9
  cross_region_strategy_config:
    inner_steps: 50
    async_op: false


examples/training_scalability_nvidia_4_nodes/config_tp13_crdp2_node0.yaml (+25, -0)

@@ -0,0 +1,25 @@
global_process_group_manager_config:
  global_master_addr: "192.168.242.8"
  global_master_port: 12123
  global_world_size: 2
  global_rank: 0
  global_backend: "nccl"
  global_comm_timeout: 60

global_train_config:
  cross_region_data_parallel_config:
    bucket_size: 40000000
    cross_region_param_dtype: "fp32"
    cross_region_grad_dtype: "fp32"

  cross_region_optimizer_config:
    optimizer: "sgd"
    lr: 0.01
    weight_decay: 0.003
    sgd_momentum: 0.9
  cross_region_strategy_config:
    inner_steps: 50
    async_op: false


examples/training_scalability_nvidia_4_nodes/config_tp13_crdp2_node1.yaml (+25, -0)

@@ -0,0 +1,25 @@
global_process_group_manager_config:
  global_master_addr: "192.168.242.14"
  global_master_port: 12123
  global_world_size: 2
  global_rank: 0
  global_backend: "nccl"
  global_comm_timeout: 60

global_train_config:
  cross_region_data_parallel_config:
    bucket_size: 40000000
    cross_region_param_dtype: "fp32"
    cross_region_grad_dtype: "fp32"

  cross_region_optimizer_config:
    optimizer: "sgd"
    lr: 0.01
    weight_decay: 0.003
    sgd_momentum: 0.9
  cross_region_strategy_config:
    inner_steps: 50
    async_op: false


examples/training_scalability_nvidia_4_nodes/config_tp13_crdp2_node2.yaml (+25, -0)

@@ -0,0 +1,25 @@
global_process_group_manager_config:
  global_master_addr: "192.168.242.8"
  global_master_port: 12123
  global_world_size: 2
  global_rank: 1
  global_backend: "nccl"
  global_comm_timeout: 60

global_train_config:
  cross_region_data_parallel_config:
    bucket_size: 40000000
    cross_region_param_dtype: "fp32"
    cross_region_grad_dtype: "fp32"

  cross_region_optimizer_config:
    optimizer: "sgd"
    lr: 0.01
    weight_decay: 0.003
    sgd_momentum: 0.9
  cross_region_strategy_config:
    inner_steps: 50
    async_op: false


examples/training_scalability_nvidia_4_nodes/config_tp13_crdp2_node3.yaml (+25, -0)

@@ -0,0 +1,25 @@
global_process_group_manager_config:
  global_master_addr: "192.168.242.14"
  global_master_port: 12123
  global_world_size: 2
  global_rank: 1
  global_backend: "nccl"
  global_comm_timeout: 60

global_train_config:
  cross_region_data_parallel_config:
    bucket_size: 40000000
    cross_region_param_dtype: "fp32"
    cross_region_grad_dtype: "fp32"

  cross_region_optimizer_config:
    optimizer: "sgd"
    lr: 0.01
    weight_decay: 0.003
    sgd_momentum: 0.9
  cross_region_strategy_config:
    inner_steps: 50
    async_op: false


examples/training_scalability_nvidia_4_nodes/config_tp1_crdp2_node0.yaml (+25, -0)

@@ -0,0 +1,25 @@
global_process_group_manager_config:
  global_master_addr: "192.168.242.8"
  global_master_port: 12123
  global_world_size: 2
  global_rank: 0
  global_backend: "nccl"
  global_comm_timeout: 60

global_train_config:
  cross_region_data_parallel_config:
    bucket_size: 40000000
    cross_region_param_dtype: "fp32"
    cross_region_grad_dtype: "fp32"

  cross_region_optimizer_config:
    optimizer: "sgd"
    lr: 0.01
    weight_decay: 0.003
    sgd_momentum: 0.9
  cross_region_strategy_config:
    inner_steps: 5
    async_op: false


examples/training_scalability_nvidia_4_nodes/config_tp1_crdp2_node1.yaml (+25, -0)

@@ -0,0 +1,25 @@
global_process_group_manager_config:
  global_master_addr: "192.168.242.14"
  global_master_port: 12123
  global_world_size: 2
  global_rank: 0
  global_backend: "nccl"
  global_comm_timeout: 60

global_train_config:
  cross_region_data_parallel_config:
    bucket_size: 40000000
    cross_region_param_dtype: "fp32"
    cross_region_grad_dtype: "fp32"

  cross_region_optimizer_config:
    optimizer: "sgd"
    lr: 0.01
    weight_decay: 0.003
    sgd_momentum: 0.9
  cross_region_strategy_config:
    inner_steps: 5
    async_op: false


examples/training_scalability_nvidia_4_nodes/config_tp1_crdp2_node2.yaml (+25, -0)

@@ -0,0 +1,25 @@
global_process_group_manager_config:
  global_master_addr: "192.168.242.8"
  global_master_port: 12123
  global_world_size: 2
  global_rank: 1
  global_backend: "nccl"
  global_comm_timeout: 60

global_train_config:
  cross_region_data_parallel_config:
    bucket_size: 40000000
    cross_region_param_dtype: "fp32"
    cross_region_grad_dtype: "fp32"

  cross_region_optimizer_config:
    optimizer: "sgd"
    lr: 0.01
    weight_decay: 0.003
    sgd_momentum: 0.9
  cross_region_strategy_config:
    inner_steps: 5
    async_op: false


examples/training_scalability_nvidia_4_nodes/config_tp1_crdp2_node3.yaml (+25, -0)

@@ -0,0 +1,25 @@
global_process_group_manager_config:
  global_master_addr: "192.168.242.14"
  global_master_port: 12123
  global_world_size: 2
  global_rank: 1
  global_backend: "nccl"
  global_comm_timeout: 60

global_train_config:
  cross_region_data_parallel_config:
    bucket_size: 40000000
    cross_region_param_dtype: "fp32"
    cross_region_grad_dtype: "fp32"

  cross_region_optimizer_config:
    optimizer: "sgd"
    lr: 0.01
    weight_decay: 0.003
    sgd_momentum: 0.9
  cross_region_strategy_config:
    inner_steps: 5
    async_op: false


examples/training_scalability_nvidia_4_nodes/config_tp2_crdp2_node0.yaml (+25, -0)

@@ -0,0 +1,25 @@
global_process_group_manager_config:
  global_master_addr: "192.168.242.8"
  global_master_port: 12123
  global_world_size: 2
  global_rank: 0
  global_backend: "nccl"
  global_comm_timeout: 60

global_train_config:
  cross_region_data_parallel_config:
    bucket_size: 40000000
    cross_region_param_dtype: "fp32"
    cross_region_grad_dtype: "fp32"

  cross_region_optimizer_config:
    optimizer: "sgd"
    lr: 0.01
    weight_decay: 0.003
    sgd_momentum: 0.9
  cross_region_strategy_config:
    inner_steps: 1
    async_op: false


examples/training_scalability_nvidia_4_nodes/config_tp2_crdp2_node1.yaml (+25, -0)

@@ -0,0 +1,25 @@
global_process_group_manager_config:
  global_master_addr: "192.168.242.14"
  global_master_port: 12123
  global_world_size: 2
  global_rank: 0
  global_backend: "nccl"
  global_comm_timeout: 60

global_train_config:
  cross_region_data_parallel_config:
    bucket_size: 40000000
    cross_region_param_dtype: "fp32"
    cross_region_grad_dtype: "fp32"

  cross_region_optimizer_config:
    optimizer: "sgd"
    lr: 0.01
    weight_decay: 0.003
    sgd_momentum: 0.9
  cross_region_strategy_config:
    inner_steps: 1
    async_op: false


examples/training_scalability_nvidia_4_nodes/config_tp2_crdp2_node2.yaml (+25, -0)

@@ -0,0 +1,25 @@
global_process_group_manager_config:
  global_master_addr: "192.168.242.8"
  global_master_port: 12123
  global_world_size: 2
  global_rank: 1
  global_backend: "nccl"
  global_comm_timeout: 60

global_train_config:
  cross_region_data_parallel_config:
    bucket_size: 40000000
    cross_region_param_dtype: "fp32"
    cross_region_grad_dtype: "fp32"

  cross_region_optimizer_config:
    optimizer: "sgd"
    lr: 0.01
    weight_decay: 0.003
    sgd_momentum: 0.9
  cross_region_strategy_config:
    inner_steps: 1
    async_op: false


examples/training_scalability_nvidia_4_nodes/config_tp2_crdp2_node3.yaml (+25, -0)

@@ -0,0 +1,25 @@
global_process_group_manager_config:
  global_master_addr: "192.168.242.14"
  global_master_port: 12123
  global_world_size: 2
  global_rank: 1
  global_backend: "nccl"
  global_comm_timeout: 60

global_train_config:
  cross_region_data_parallel_config:
    bucket_size: 40000000
    cross_region_param_dtype: "fp32"
    cross_region_grad_dtype: "fp32"

  cross_region_optimizer_config:
    optimizer: "sgd"
    lr: 0.01
    weight_decay: 0.003
    sgd_momentum: 0.9
  cross_region_strategy_config:
    inner_steps: 1
    async_op: false


examples/training_scalability_nvidia_4_nodes/config_tp3_crdp2_node0.yaml (+25, -0)

@@ -0,0 +1,25 @@
global_process_group_manager_config:
  global_master_addr: "192.168.242.8"
  global_master_port: 12123
  global_world_size: 2
  global_rank: 0
  global_backend: "nccl"
  global_comm_timeout: 60

global_train_config:
  cross_region_data_parallel_config:
    bucket_size: 40000000
    cross_region_param_dtype: "fp32"
    cross_region_grad_dtype: "fp32"

  cross_region_optimizer_config:
    optimizer: "sgd"
    lr: 0.01
    weight_decay: 0.003
    sgd_momentum: 0.9
  cross_region_strategy_config:
    inner_steps: 1
    async_op: false


examples/training_scalability_nvidia_4_nodes/config_tp3_crdp2_node1.yaml (+25, -0)

@@ -0,0 +1,25 @@
global_process_group_manager_config:
  global_master_addr: "192.168.242.14"
  global_master_port: 12123
  global_world_size: 2
  global_rank: 0
  global_backend: "nccl"
  global_comm_timeout: 60

global_train_config:
  cross_region_data_parallel_config:
    bucket_size: 40000000
    cross_region_param_dtype: "fp32"
    cross_region_grad_dtype: "fp32"

  cross_region_optimizer_config:
    optimizer: "sgd"
    lr: 0.01
    weight_decay: 0.003
    sgd_momentum: 0.9
  cross_region_strategy_config:
    inner_steps: 1
    async_op: false


+ 25
- 0
examples/training_scalability_nvidia_4_nodes/config_tp3_crdp2_node2.yaml

@@ -0,0 +1,25 @@
global_process_group_manager_config:
global_master_addr: "192.168.242.8"
global_master_port: 12123
global_world_size: 2
global_rank: 1
global_backend: "nccl"
global_comm_timeout: 60

global_train_config:
cross_region_data_parallel_config:
bucket_size: 40000000
cross_region_param_dtype: "fp32"
cross_region_grad_dtype: "fp32"

cross_region_optimizer_config:
optimizer: "sgd"
lr: 0.01
weight_decay: 0.003
sgd_momentum: 0.9
cross_region_strategy_config:
inner_steps: 1
async_op: false


+ 25
- 0
examples/training_scalability_nvidia_4_nodes/config_tp3_crdp2_node3.yaml

@@ -0,0 +1,25 @@
global_process_group_manager_config:
global_master_addr: "192.168.242.14"
global_master_port: 12123
global_world_size: 2
global_rank: 1
global_backend: "nccl"
global_comm_timeout: 60

global_train_config:
cross_region_data_parallel_config:
bucket_size: 40000000
cross_region_param_dtype: "fp32"
cross_region_grad_dtype: "fp32"

cross_region_optimizer_config:
optimizer: "sgd"
lr: 0.01
weight_decay: 0.003
sgd_momentum: 0.9
cross_region_strategy_config:
inner_steps: 1
async_op: false


+ 25
- 0
examples/training_scalability_nvidia_4_nodes/config_tp4_crdp2_node0.yaml

@@ -0,0 +1,25 @@
global_process_group_manager_config:
global_master_addr: "192.168.242.8"
global_master_port: 12123
global_world_size: 2
global_rank: 0
global_backend: "nccl"
global_comm_timeout: 60

global_train_config:
cross_region_data_parallel_config:
bucket_size: 40000000
cross_region_param_dtype: "fp32"
cross_region_grad_dtype: "fp32"

cross_region_optimizer_config:
optimizer: "sgd"
lr: 0.01
weight_decay: 0.003
sgd_momentum: 0.9
cross_region_strategy_config:
inner_steps: 1
async_op: false


+ 25
- 0
examples/training_scalability_nvidia_4_nodes/config_tp4_crdp2_node1.yaml

@@ -0,0 +1,25 @@
global_process_group_manager_config:
global_master_addr: "192.168.242.14"
global_master_port: 12123
global_world_size: 2
global_rank: 0
global_backend: "nccl"
global_comm_timeout: 60

global_train_config:
cross_region_data_parallel_config:
bucket_size: 40000000
cross_region_param_dtype: "fp32"
cross_region_grad_dtype: "fp32"

cross_region_optimizer_config:
optimizer: "sgd"
lr: 0.01
weight_decay: 0.003
sgd_momentum: 0.9
cross_region_strategy_config:
inner_steps: 1
async_op: false


+ 25
- 0
examples/training_scalability_nvidia_4_nodes/config_tp4_crdp2_node2.yaml

@@ -0,0 +1,25 @@
global_process_group_manager_config:
global_master_addr: "192.168.242.8"
global_master_port: 12123
global_world_size: 2
global_rank: 1
global_backend: "nccl"
global_comm_timeout: 60

global_train_config:
cross_region_data_parallel_config:
bucket_size: 40000000
cross_region_param_dtype: "fp32"
cross_region_grad_dtype: "fp32"

cross_region_optimizer_config:
optimizer: "sgd"
lr: 0.01
weight_decay: 0.003
sgd_momentum: 0.9
cross_region_strategy_config:
inner_steps: 1
async_op: false


+ 25
- 0
examples/training_scalability_nvidia_4_nodes/config_tp4_crdp2_node3.yaml

@@ -0,0 +1,25 @@
global_process_group_manager_config:
global_master_addr: "192.168.242.14"
global_master_port: 12123
global_world_size: 2
global_rank: 1
global_backend: "nccl"
global_comm_timeout: 60

global_train_config:
cross_region_data_parallel_config:
bucket_size: 40000000
cross_region_param_dtype: "fp32"
cross_region_grad_dtype: "fp32"

cross_region_optimizer_config:
optimizer: "sgd"
lr: 0.01
weight_decay: 0.003
sgd_momentum: 0.9
cross_region_strategy_config:
inner_steps: 1
async_op: false


+ 25
- 0
examples/training_scalability_nvidia_4_nodes/config_tp5_crdp2_node0.yaml

@@ -0,0 +1,25 @@
global_process_group_manager_config:
global_master_addr: "192.168.242.8"
global_master_port: 12123
global_world_size: 2
global_rank: 0
global_backend: "nccl"
global_comm_timeout: 60

global_train_config:
cross_region_data_parallel_config:
bucket_size: 40000000
cross_region_param_dtype: "fp32"
cross_region_grad_dtype: "fp32"

cross_region_optimizer_config:
optimizer: "sgd"
lr: 0.01
weight_decay: 0.003
sgd_momentum: 0.9
cross_region_strategy_config:
inner_steps: 5
async_op: false


+ 25
- 0
examples/training_scalability_nvidia_4_nodes/config_tp5_crdp2_node1.yaml

@@ -0,0 +1,25 @@
global_process_group_manager_config:
global_master_addr: "192.168.242.14"
global_master_port: 12123
global_world_size: 2
global_rank: 0
global_backend: "nccl"
global_comm_timeout: 60

global_train_config:
cross_region_data_parallel_config:
bucket_size: 40000000
cross_region_param_dtype: "fp32"
cross_region_grad_dtype: "fp32"

cross_region_optimizer_config:
optimizer: "sgd"
lr: 0.01
weight_decay: 0.003
sgd_momentum: 0.9
cross_region_strategy_config:
inner_steps: 5
async_op: false


+ 25
- 0
examples/training_scalability_nvidia_4_nodes/config_tp5_crdp2_node2.yaml

@@ -0,0 +1,25 @@
global_process_group_manager_config:
global_master_addr: "192.168.242.8"
global_master_port: 12123
global_world_size: 2
global_rank: 1
global_backend: "nccl"
global_comm_timeout: 60

global_train_config:
cross_region_data_parallel_config:
bucket_size: 40000000
cross_region_param_dtype: "fp32"
cross_region_grad_dtype: "fp32"

cross_region_optimizer_config:
optimizer: "sgd"
lr: 0.01
weight_decay: 0.003
sgd_momentum: 0.9
cross_region_strategy_config:
inner_steps: 5
async_op: false


+ 25
- 0
examples/training_scalability_nvidia_4_nodes/config_tp5_crdp2_node3.yaml

@@ -0,0 +1,25 @@
global_process_group_manager_config:
global_master_addr: "192.168.242.14"
global_master_port: 12123
global_world_size: 2
global_rank: 1
global_backend: "nccl"
global_comm_timeout: 60

global_train_config:
cross_region_data_parallel_config:
bucket_size: 40000000
cross_region_param_dtype: "fp32"
cross_region_grad_dtype: "fp32"

cross_region_optimizer_config:
optimizer: "sgd"
lr: 0.01
weight_decay: 0.003
sgd_momentum: 0.9
cross_region_strategy_config:
inner_steps: 5
async_op: false


+ 25
- 0
examples/training_scalability_nvidia_4_nodes/config_tp6_crdp2_node0.yaml

@@ -0,0 +1,25 @@
global_process_group_manager_config:
global_master_addr: "192.168.242.8"
global_master_port: 12123
global_world_size: 2
global_rank: 0
global_backend: "nccl"
global_comm_timeout: 60

global_train_config:
cross_region_data_parallel_config:
bucket_size: 40000000
cross_region_param_dtype: "fp32"
cross_region_grad_dtype: "fp32"

cross_region_optimizer_config:
optimizer: "sgd"
lr: 0.01
weight_decay: 0.003
sgd_momentum: 0.9
cross_region_strategy_config:
inner_steps: 5
async_op: false


+ 25
- 0
examples/training_scalability_nvidia_4_nodes/config_tp6_crdp2_node1.yaml

@@ -0,0 +1,25 @@
global_process_group_manager_config:
global_master_addr: "192.168.242.14"
global_master_port: 12123
global_world_size: 2
global_rank: 0
global_backend: "nccl"
global_comm_timeout: 60

global_train_config:
cross_region_data_parallel_config:
bucket_size: 40000000
cross_region_param_dtype: "fp32"
cross_region_grad_dtype: "fp32"

cross_region_optimizer_config:
optimizer: "sgd"
lr: 0.01
weight_decay: 0.003
sgd_momentum: 0.9
cross_region_strategy_config:
inner_steps: 5
async_op: false


+ 25
- 0
examples/training_scalability_nvidia_4_nodes/config_tp6_crdp2_node2.yaml

@@ -0,0 +1,25 @@
global_process_group_manager_config:
global_master_addr: "192.168.242.8"
global_master_port: 12123
global_world_size: 2
global_rank: 1
global_backend: "nccl"
global_comm_timeout: 60

global_train_config:
cross_region_data_parallel_config:
bucket_size: 40000000
cross_region_param_dtype: "fp32"
cross_region_grad_dtype: "fp32"

cross_region_optimizer_config:
optimizer: "sgd"
lr: 0.01
weight_decay: 0.003
sgd_momentum: 0.9
cross_region_strategy_config:
inner_steps: 5
async_op: false


+ 25
- 0
examples/training_scalability_nvidia_4_nodes/config_tp6_crdp2_node3.yaml

@@ -0,0 +1,25 @@
global_process_group_manager_config:
global_master_addr: "192.168.242.14"
global_master_port: 12123
global_world_size: 2
global_rank: 1
global_backend: "nccl"
global_comm_timeout: 60

global_train_config:
cross_region_data_parallel_config:
bucket_size: 40000000
cross_region_param_dtype: "fp32"
cross_region_grad_dtype: "fp32"

cross_region_optimizer_config:
optimizer: "sgd"
lr: 0.01
weight_decay: 0.003
sgd_momentum: 0.9
cross_region_strategy_config:
inner_steps: 5
async_op: false


+ 25
- 0
examples/training_scalability_nvidia_4_nodes/config_tp7_crdp2_node0.yaml

@@ -0,0 +1,25 @@
global_process_group_manager_config:
global_master_addr: "192.168.242.8"
global_master_port: 12123
global_world_size: 2
global_rank: 0
global_backend: "nccl"
global_comm_timeout: 60

global_train_config:
cross_region_data_parallel_config:
bucket_size: 40000000
cross_region_param_dtype: "fp32"
cross_region_grad_dtype: "fp32"

cross_region_optimizer_config:
optimizer: "sgd"
lr: 0.01
weight_decay: 0.003
sgd_momentum: 0.9
cross_region_strategy_config:
inner_steps: 5
async_op: false


+ 25
- 0
examples/training_scalability_nvidia_4_nodes/config_tp7_crdp2_node1.yaml

@@ -0,0 +1,25 @@
global_process_group_manager_config:
global_master_addr: "192.168.242.14"
global_master_port: 12123
global_world_size: 2
global_rank: 0
global_backend: "nccl"
global_comm_timeout: 60

global_train_config:
cross_region_data_parallel_config:
bucket_size: 40000000
cross_region_param_dtype: "fp32"
cross_region_grad_dtype: "fp32"

cross_region_optimizer_config:
optimizer: "sgd"
lr: 0.01
weight_decay: 0.003
sgd_momentum: 0.9
cross_region_strategy_config:
inner_steps: 5
async_op: false


+ 25
- 0
examples/training_scalability_nvidia_4_nodes/config_tp7_crdp2_node2.yaml

@@ -0,0 +1,25 @@
global_process_group_manager_config:
global_master_addr: "192.168.242.8"
global_master_port: 12123
global_world_size: 2
global_rank: 1
global_backend: "nccl"
global_comm_timeout: 60

global_train_config:
cross_region_data_parallel_config:
bucket_size: 40000000
cross_region_param_dtype: "fp32"
cross_region_grad_dtype: "fp32"

cross_region_optimizer_config:
optimizer: "sgd"
lr: 0.01
weight_decay: 0.003
sgd_momentum: 0.9
cross_region_strategy_config:
inner_steps: 5
async_op: false


+ 25
- 0
examples/training_scalability_nvidia_4_nodes/config_tp7_crdp2_node3.yaml

@@ -0,0 +1,25 @@
global_process_group_manager_config:
global_master_addr: "192.168.242.14"
global_master_port: 12123
global_world_size: 2
global_rank: 1
global_backend: "nccl"
global_comm_timeout: 60

global_train_config:
cross_region_data_parallel_config:
bucket_size: 40000000
cross_region_param_dtype: "fp32"
cross_region_grad_dtype: "fp32"

cross_region_optimizer_config:
optimizer: "sgd"
lr: 0.01
weight_decay: 0.003
sgd_momentum: 0.9
cross_region_strategy_config:
inner_steps: 5
async_op: false


+ 25
- 0
examples/training_scalability_nvidia_4_nodes/config_tp8_crdp2_node0.yaml

@@ -0,0 +1,25 @@
global_process_group_manager_config:
global_master_addr: "192.168.242.8"
global_master_port: 12123
global_world_size: 2
global_rank: 0
global_backend: "nccl"
global_comm_timeout: 60

global_train_config:
cross_region_data_parallel_config:
bucket_size: 40000000
cross_region_param_dtype: "fp32"
cross_region_grad_dtype: "fp32"

cross_region_optimizer_config:
optimizer: "sgd"
lr: 0.01
weight_decay: 0.003
sgd_momentum: 0.9
cross_region_strategy_config:
inner_steps: 5
async_op: false


+ 25
- 0
examples/training_scalability_nvidia_4_nodes/config_tp8_crdp2_node1.yaml

@@ -0,0 +1,25 @@
global_process_group_manager_config:
global_master_addr: "192.168.242.14"
global_master_port: 12123
global_world_size: 2
global_rank: 0
global_backend: "nccl"
global_comm_timeout: 60

global_train_config:
cross_region_data_parallel_config:
bucket_size: 40000000
cross_region_param_dtype: "fp32"
cross_region_grad_dtype: "fp32"

cross_region_optimizer_config:
optimizer: "sgd"
lr: 0.01
weight_decay: 0.003
sgd_momentum: 0.9
cross_region_strategy_config:
inner_steps: 5
async_op: false


+ 25
- 0
examples/training_scalability_nvidia_4_nodes/config_tp8_crdp2_node2.yaml

@@ -0,0 +1,25 @@
global_process_group_manager_config:
global_master_addr: "192.168.242.8"
global_master_port: 12123
global_world_size: 2
global_rank: 1
global_backend: "nccl"
global_comm_timeout: 60

global_train_config:
cross_region_data_parallel_config:
bucket_size: 40000000
cross_region_param_dtype: "fp32"
cross_region_grad_dtype: "fp32"

cross_region_optimizer_config:
optimizer: "sgd"
lr: 0.01
weight_decay: 0.003
sgd_momentum: 0.9
cross_region_strategy_config:
inner_steps: 5
async_op: false


+ 25
- 0
examples/training_scalability_nvidia_4_nodes/config_tp8_crdp2_node3.yaml

@@ -0,0 +1,25 @@
global_process_group_manager_config:
global_master_addr: "192.168.242.14"
global_master_port: 12123
global_world_size: 2
global_rank: 1
global_backend: "nccl"
global_comm_timeout: 60

global_train_config:
cross_region_data_parallel_config:
bucket_size: 40000000
cross_region_param_dtype: "fp32"
cross_region_grad_dtype: "fp32"

cross_region_optimizer_config:
optimizer: "sgd"
lr: 0.01
weight_decay: 0.003
sgd_momentum: 0.9
cross_region_strategy_config:
inner_steps: 5
async_op: false


+ 25
- 0
examples/training_scalability_nvidia_4_nodes/config_tp9_crdp2_node0.yaml

@@ -0,0 +1,25 @@
global_process_group_manager_config:
global_master_addr: "192.168.242.8"
global_master_port: 12123
global_world_size: 2
global_rank: 0
global_backend: "nccl"
global_comm_timeout: 300

global_train_config:
cross_region_data_parallel_config:
bucket_size: 40000000
cross_region_param_dtype: "fp32"
cross_region_grad_dtype: "fp32"

cross_region_optimizer_config:
optimizer: "sgd"
lr: 0.01
weight_decay: 0.003
sgd_momentum: 0.9
cross_region_strategy_config:
inner_steps: 50
async_op: false


+ 25
- 0
examples/training_scalability_nvidia_4_nodes/config_tp9_crdp2_node1.yaml

@@ -0,0 +1,25 @@
global_process_group_manager_config:
global_master_addr: "192.168.242.14"
global_master_port: 12123
global_world_size: 2
global_rank: 0
global_backend: "nccl"
global_comm_timeout: 300

global_train_config:
cross_region_data_parallel_config:
bucket_size: 40000000
cross_region_param_dtype: "fp32"
cross_region_grad_dtype: "fp32"

cross_region_optimizer_config:
optimizer: "sgd"
lr: 0.01
weight_decay: 0.003
sgd_momentum: 0.9
cross_region_strategy_config:
inner_steps: 50
async_op: false


+ 25
- 0
examples/training_scalability_nvidia_4_nodes/config_tp9_crdp2_node2.yaml

@@ -0,0 +1,25 @@
global_process_group_manager_config:
global_master_addr: "192.168.242.8"
global_master_port: 12123
global_world_size: 2
global_rank: 1
global_backend: "nccl"
global_comm_timeout: 300

global_train_config:
cross_region_data_parallel_config:
bucket_size: 40000000
cross_region_param_dtype: "fp32"
cross_region_grad_dtype: "fp32"

cross_region_optimizer_config:
optimizer: "sgd"
lr: 0.01
weight_decay: 0.003
sgd_momentum: 0.9
cross_region_strategy_config:
inner_steps: 50
async_op: false


+ 25
- 0
examples/training_scalability_nvidia_4_nodes/config_tp9_crdp2_node3.yaml

@@ -0,0 +1,25 @@
global_process_group_manager_config:
global_master_addr: "192.168.242.14"
global_master_port: 12123
global_world_size: 2
global_rank: 1
global_backend: "nccl"
global_comm_timeout: 300

global_train_config:
cross_region_data_parallel_config:
bucket_size: 40000000
cross_region_param_dtype: "fp32"
cross_region_grad_dtype: "fp32"

cross_region_optimizer_config:
optimizer: "sgd"
lr: 0.01
weight_decay: 0.003
sgd_momentum: 0.9
cross_region_strategy_config:
inner_steps: 50
async_op: false
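
Relative to the tp2-tp8 configs above, the tp9 variants raise inner_steps from 1 or 5 to 50 and widen global_comm_timeout from 60 to 300 seconds: with 50 local steps between cross-region synchronizations, the global group waits much longer between collectives, so the timeout grows with it. Below is a minimal sketch of how an inner-step counter can gate a cross-region all-reduce; it is hypothetical, and the real logic lives in aisynergy2/training/strategy.py and aisynergy2/distributed/cross_region_data_parallel.py and may differ.

import torch
import torch.distributed as dist

class CrossRegionSync:
    """Illustrative sketch: average parameters across the global
    (cross-region) group once every inner_steps local steps."""

    def __init__(self, params, inner_steps, group=None):
        self.params = [p for p in params if p.requires_grad]
        self.inner_steps = inner_steps
        self.group = group  # the cross-region process group
        self._step = 0

    @torch.no_grad()
    def maybe_sync(self):
        self._step += 1
        if self._step % self.inner_steps != 0:
            return  # purely local step: no cross-region traffic
        world = dist.get_world_size(self.group)
        for p in self.params:
            dist.all_reduce(p.data, op=dist.ReduceOp.SUM, group=self.group)
            p.data.div_(world)  # turn the sum into an average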


+ 111
- 0
examples/training_scalability_nvidia_4_nodes/tp10_crdp2_node0.sh

@@ -0,0 +1,111 @@
#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1

# export NCCL_DEBUG=INFO

export CUDA_VISIBLE_DEVICES=0,1
export GLOO_SOCKET_IFNAME=enp104s0f0

TENSOR_PARALLEL=2

GPUS_PER_NODE=2
# Change for multinode config
MASTER_ADDR=192.168.242.8
MASTER_PORT=6000
NUM_NODES=2
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))

SH_FILE=$(basename $0)
TASK_VERSION=${SH_FILE%.*}
echo $TASK_VERSION

CHECKPOINT_PATH=/workspace/checkpoints/checkpoints/${TASK_VERSION}
if [ ! -d $CHECKPOINT_PATH ]
then
mkdir -p $CHECKPOINT_PATH
else
echo "文件夹已经存在"
fi

TENSORBOARD_PATH=/workspace/checkpoints/tensorboard/${TASK_VERSION}
if [ ! -d $TENSORBOARD_PATH ]
then
mkdir -p $TENSORBOARD_PATH
else
echo "文件夹已经存在"
fi

VOCAB_FILE=/workspace/dataset/vocab.json
MERGE_FILE=/workspace/dataset/merges.txt
DATA_PATH=/workspace/dataset/alpaca_text_document

DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)


GPT_MODEL_ARGS=(
--tensor-model-parallel-size $TENSOR_PARALLEL
--num-layers 39
--hidden-size 2048
--num-attention-heads 16
)

TRAINING_ARGS=(
--pipeline-model-parallel-size 1
--distributed-backend nccl
# --use-distributed-optimizer
--overlap-grad-reduce
# --overlap-param-gather
--attention-backend flash
--use-flash-attn
--seq-length 4096
--max-position-embeddings 4096
--micro-batch-size 4
--global-batch-size 64
--train-iters 500000
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--lr 6.0e-5
--lr-decay-style cosine
--min-lr 6.0e-6
--lr-warmup-fraction .001
--lr-decay-iters 430000
--bf16
)


DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--merge-file $MERGE_FILE
--split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
--log-throughput
--log-interval 10
--save-interval 10000
--eval-interval 1000
--eval-iters 10
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
# --tensorboard-dir $TENSORBOARD_PATH
)


torchrun ${DISTRIBUTED_ARGS[@]} aisynergy2_pretrain_with_megatron.py --aisynergy2-yaml-config-path examples/training_scalability_nvidia_4_nodes/config_${TASK_VERSION}.yaml \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
2>&1 | tee $CHECKPOINT_PATH/log.log
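
Each launch script derives TASK_VERSION from its own filename (SH_FILE with the extension stripped), so tp10_crdp2_node0.sh automatically loads config_tp10_crdp2_node0.yaml and writes checkpoints and logs under a matching directory. The parallel layout is worth a quick sanity check: one torchrun job spans node0+node1 (master 192.168.242.8) and another spans node2+node3 (master 192.168.242.15), and within a job Megatron's data-parallel size is the world size divided by tensor times pipeline parallelism. A small arithmetic sketch (the formula is the standard Megatron layout assumption, not code from this repository):

def data_parallel_size(gpus_per_node, num_nodes, tp, pp=1):
    world = gpus_per_node * num_nodes
    assert world % (tp * pp) == 0, "world size must divide by tp * pp"
    return world // (tp * pp)

# tp10_crdp2_*: 2 GPUs/node, 2 nodes per torchrun job, TP=2, PP=1.
print(data_parallel_size(gpus_per_node=2, num_nodes=2, tp=2))  # -> 2
# crdp2 then couples the two jobs into a cross-region group of size 2.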

+ 111
- 0
examples/training_scalability_nvidia_4_nodes/tp10_crdp2_node1.sh

@@ -0,0 +1,111 @@
#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1

# export NCCL_DEBUG=INFO

export CUDA_VISIBLE_DEVICES=0,1
export GLOO_SOCKET_IFNAME=enp104s0f0

TENSOR_PARALLEL=2

GPUS_PER_NODE=2
# Change for multinode config
MASTER_ADDR=192.168.242.8
MASTER_PORT=6000
NUM_NODES=2
NODE_RANK=1
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))

SH_FILE=$(basename $0)
TASK_VERSION=${SH_FILE%.*}
echo $TASK_VERSION

CHECKPOINT_PATH=/workspace/checkpoints/checkpoints/${TASK_VERSION}
if [ ! -d $CHECKPOINT_PATH ]
then
mkdir -p $CHECKPOINT_PATH
else
echo "文件夹已经存在"
fi

TENSORBOARD_PATH=/workspace/checkpoints/tensorboard/${TASK_VERSION}
if [ ! -d $TENSORBOARD_PATH ]
then
mkdir -p $TENSORBOARD_PATH
else
echo "文件夹已经存在"
fi

VOCAB_FILE=/workspace/dataset/vocab.json
MERGE_FILE=/workspace/dataset/merges.txt
DATA_PATH=/workspace/dataset/alpaca_text_document

DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)


GPT_MODEL_ARGS=(
--tensor-model-parallel-size $TENSOR_PARALLEL
--num-layers 39
--hidden-size 2048
--num-attention-heads 16
)

TRAINING_ARGS=(
--pipeline-model-parallel-size 1
--distributed-backend nccl
# --use-distributed-optimizer
--overlap-grad-reduce
# --overlap-param-gather
--attention-backend flash
--use-flash-attn
--seq-length 4096
--max-position-embeddings 4096
--micro-batch-size 4
--global-batch-size 64
--train-iters 500000
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--lr 6.0e-5
--lr-decay-style cosine
--min-lr 6.0e-6
--lr-warmup-fraction .001
--lr-decay-iters 430000
--bf16
)


DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--merge-file $MERGE_FILE
--split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
--log-throughput
--log-interval 10
--save-interval 10000
--eval-interval 1000
--eval-iters 10
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
# --tensorboard-dir $TENSORBOARD_PATH
)


torchrun ${DISTRIBUTED_ARGS[@]} aisynergy2_pretrain_with_megatron.py --aisynergy2-yaml-config-path examples/training_scalability_nvidia_4_nodes/config_${TASK_VERSION}.yaml \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
2>&1 | tee $CHECKPOINT_PATH/log.log

+ 111
- 0
examples/training_scalability_nvidia_4_nodes/tp10_crdp2_node2.sh

@@ -0,0 +1,111 @@
#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1

# export NCCL_DEBUG=INFO

export CUDA_VISIBLE_DEVICES=0,1
export GLOO_SOCKET_IFNAME=eno1

TENSOR_PARALLEL=2

GPUS_PER_NODE=2
# Change for multinode config
MASTER_ADDR=192.168.242.15
MASTER_PORT=6000
NUM_NODES=2
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))

SH_FILE=$(basename $0)
TASK_VERSION=${SH_FILE%.*}
echo $TASK_VERSION

CHECKPOINT_PATH=/workspace/checkpoints/checkpoints/${TASK_VERSION}
if [ ! -d $CHECKPOINT_PATH ]
then
mkdir -p $CHECKPOINT_PATH
else
echo "文件夹已经存在"
fi

TENSORBOARD_PATH=/workspace/checkpoints/tensorboard/${TASK_VERSION}
if [ ! -d $TENSORBOARD_PATH ]
then
mkdir -p $TENSORBOARD_PATH
else
echo "文件夹已经存在"
fi

VOCAB_FILE=/workspace/dataset/vocab.json
MERGE_FILE=/workspace/dataset/merges.txt
DATA_PATH=/workspace/dataset/alpaca_text_document

DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)


GPT_MODEL_ARGS=(
--tensor-model-parallel-size $TENSOR_PARALLEL
--num-layers 39
--hidden-size 2048
--num-attention-heads 16
)

TRAINING_ARGS=(
--pipeline-model-parallel-size 1
--distributed-backend nccl
# --use-distributed-optimizer
--overlap-grad-reduce
# --overlap-param-gather
--attention-backend flash
--use-flash-attn
--seq-length 4096
--max-position-embeddings 4096
--micro-batch-size 4
--global-batch-size 64
--train-iters 500000
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--lr 6.0e-5
--lr-decay-style cosine
--min-lr 6.0e-6
--lr-warmup-fraction .001
--lr-decay-iters 430000
--bf16
)


DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--merge-file $MERGE_FILE
--split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
--log-throughput
--log-interval 10
--save-interval 10000
--eval-interval 1000
--eval-iters 10
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
# --tensorboard-dir $TENSORBOARD_PATH
)


torchrun ${DISTRIBUTED_ARGS[@]} aisynergy2_pretrain_with_megatron.py --aisynergy2-yaml-config-path examples/training_scalability_nvidia_4_nodes/config_${TASK_VERSION}.yaml \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
2>&1 | tee $CHECKPOINT_PATH/log.log

+ 111
- 0
examples/training_scalability_nvidia_4_nodes/tp10_crdp2_node3.sh

@@ -0,0 +1,111 @@
#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1

# export NCCL_DEBUG=INFO

export CUDA_VISIBLE_DEVICES=0,1
export GLOO_SOCKET_IFNAME=eno1

TENSOR_PARALLEL=2

GPUS_PER_NODE=2
# Change for multinode config
MASTER_ADDR=192.168.242.15
MASTER_PORT=6000
NUM_NODES=2
NODE_RANK=1
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))

SH_FILE=$(basename $0)
TASK_VERSION=${SH_FILE%.*}
echo $TASK_VERSION

CHECKPOINT_PATH=/workspace/checkpoints/checkpoints/${TASK_VERSION}
if [ ! -d $CHECKPOINT_PATH ]
then
mkdir -p $CHECKPOINT_PATH
else
echo "文件夹已经存在"
fi

TENSORBOARD_PATH=/workspace/checkpoints/tensorboard/${TASK_VERSION}
if [ ! -d $TENSORBOARD_PATH ]
then
mkdir -p $TENSORBOARD_PATH
else
echo "文件夹已经存在"
fi

VOCAB_FILE=/workspace/dataset/vocab.json
MERGE_FILE=/workspace/dataset/merges.txt
DATA_PATH=/workspace/dataset/alpaca_text_document

DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)


GPT_MODEL_ARGS=(
--tensor-model-parallel-size $TENSOR_PARALLEL
--num-layers 39
--hidden-size 2048
--num-attention-heads 16
)

TRAINING_ARGS=(
--pipeline-model-parallel-size 1
--distributed-backend nccl
# --use-distributed-optimizer
--overlap-grad-reduce
# --overlap-param-gather
--attention-backend flash
--use-flash-attn
--seq-length 4096
--max-position-embeddings 4096
--micro-batch-size 4
--global-batch-size 64
--train-iters 500000
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--lr 6.0e-5
--lr-decay-style cosine
--min-lr 6.0e-6
--lr-warmup-fraction .001
--lr-decay-iters 430000
--bf16
)


DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--merge-file $MERGE_FILE
--split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
--log-throughput
--log-interval 10
--save-interval 10000
--eval-interval 1000
--eval-iters 10
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
# --tensorboard-dir $TENSORBOARD_PATH
)


torchrun ${DISTRIBUTED_ARGS[@]} aisynergy2_pretrain_with_megatron.py --aisynergy2-yaml-config-path examples/training_scalability_nvidia_4_nodes/config_${TASK_VERSION}.yaml \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
2>&1 | tee $CHECKPOINT_PATH/log.log

+ 111
- 0
examples/training_scalability_nvidia_4_nodes/tp11_crdp2_node0.sh

@@ -0,0 +1,111 @@
#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1

# export NCCL_DEBUG=INFO

export CUDA_VISIBLE_DEVICES=0,1,2,3
export GLOO_SOCKET_IFNAME=enp104s0f0

TENSOR_PARALLEL=4

GPUS_PER_NODE=4
# Change for multinode config
MASTER_ADDR=192.168.242.8
MASTER_PORT=6000
NUM_NODES=2
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))

SH_FILE=$(basename $0)
TASK_VERSION=${SH_FILE%.*}
echo $TASK_VERSION

CHECKPOINT_PATH=/workspace/checkpoints/checkpoints/${TASK_VERSION}
if [ ! -d $CHECKPOINT_PATH ]
then
mkdir -p $CHECKPOINT_PATH
else
echo "文件夹已经存在"
fi

TENSORBOARD_PATH=/workspace/checkpoints/tensorboard/${TASK_VERSION}
if [ ! -d $TENSORBOARD_PATH ]
then
mkdir -p $TENSORBOARD_PATH
else
echo "文件夹已经存在"
fi

VOCAB_FILE=/workspace/dataset/vocab.json
MERGE_FILE=/workspace/dataset/merges.txt
DATA_PATH=/workspace/dataset/alpaca_text_document

DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)


GPT_MODEL_ARGS=(
--tensor-model-parallel-size $TENSOR_PARALLEL
--num-layers 36
--hidden-size 3072
--num-attention-heads 24
)

TRAINING_ARGS=(
--pipeline-model-parallel-size 1
--distributed-backend nccl
# --use-distributed-optimizer
--overlap-grad-reduce
# --overlap-param-gather
--attention-backend flash
--use-flash-attn
--seq-length 4096
--max-position-embeddings 4096
--micro-batch-size 4
--global-batch-size 64
--train-iters 500000
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--lr 6.0e-5
--lr-decay-style cosine
--min-lr 6.0e-6
--lr-warmup-fraction .001
--lr-decay-iters 430000
--bf16
)


DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--merge-file $MERGE_FILE
--split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
--log-throughput
--log-interval 10
--save-interval 10000
--eval-interval 1000
--eval-iters 10
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
# --tensorboard-dir $TENSORBOARD_PATH
)


torchrun ${DISTRIBUTED_ARGS[@]} aisynergy2_pretrain_with_megatron.py --aisynergy2-yaml-config-path examples/training_scalability_nvidia_4_nodes/config_${TASK_VERSION}.yaml \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
2>&1 | tee $CHECKPOINT_PATH/log.log

+ 111
- 0
examples/training_scalability_nvidia_4_nodes/tp11_crdp2_node1.sh

@@ -0,0 +1,111 @@
#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1

# export NCCL_DEBUG=INFO

export CUDA_VISIBLE_DEVICES=0,1,2,3
export GLOO_SOCKET_IFNAME=enp104s0f0

TENSOR_PARALLEL=4

GPUS_PER_NODE=4
# Change for multinode config
MASTER_ADDR=192.168.242.8
MASTER_PORT=6000
NUM_NODES=2
NODE_RANK=1
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))

SH_FILE=$(basename $0)
TASK_VERSION=${SH_FILE%.*}
echo $TASK_VERSION

CHECKPOINT_PATH=/workspace/checkpoints/checkpoints/${TASK_VERSION}
if [ ! -d $CHECKPOINT_PATH ]
then
mkdir -p $CHECKPOINT_PATH
else
echo "文件夹已经存在"
fi

TENSORBOARD_PATH=/workspace/checkpoints/tensorboard/${TASK_VERSION}
if [ ! -d $TENSORBOARD_PATH ]
then
mkdir -p $TENSORBOARD_PATH
else
echo "文件夹已经存在"
fi

VOCAB_FILE=/workspace/dataset/vocab.json
MERGE_FILE=/workspace/dataset/merges.txt
DATA_PATH=/workspace/dataset/alpaca_text_document

DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)


GPT_MODEL_ARGS=(
--tensor-model-parallel-size $TENSOR_PARALLEL
--num-layers 36
--hidden-size 3072
--num-attention-heads 24
)

TRAINING_ARGS=(
--pipeline-model-parallel-size 1
--distributed-backend nccl
# --use-distributed-optimizer
--overlap-grad-reduce
# --overlap-param-gather
--attention-backend flash
--use-flash-attn
--seq-length 4096
--max-position-embeddings 4096
--micro-batch-size 4
--global-batch-size 64
--train-iters 500000
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--lr 6.0e-5
--lr-decay-style cosine
--min-lr 6.0e-6
--lr-warmup-fraction .001
--lr-decay-iters 430000
--bf16
)


DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--merge-file $MERGE_FILE
--split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
--log-throughput
--log-interval 10
--save-interval 10000
--eval-interval 1000
--eval-iters 10
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
# --tensorboard-dir $TENSORBOARD_PATH
)


torchrun ${DISTRIBUTED_ARGS[@]} aisynergy2_pretrain_with_megatron.py --aisynergy2-yaml-config-path examples/training_scalability_nvidia_4_nodes/config_${TASK_VERSION}.yaml \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
2>&1 | tee $CHECKPOINT_PATH/log.log

+ 111
- 0
examples/training_scalability_nvidia_4_nodes/tp11_crdp2_node2.sh

@@ -0,0 +1,111 @@
#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1

# export NCCL_DEBUG=INFO

export CUDA_VISIBLE_DEVICES=4,5,6,7
export GLOO_SOCKET_IFNAME=eno1

TENSOR_PARALLEL=4

GPUS_PER_NODE=4
# Change for multinode config
MASTER_ADDR=192.168.242.15
MASTER_PORT=6000
NUM_NODES=2
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))

SH_FILE=$(basename $0)
TASK_VERSION=${SH_FILE%.*}
echo $TASK_VERSION

CHECKPOINT_PATH=/workspace/checkpoints/checkpoints/${TASK_VERSION}
if [ ! -d $CHECKPOINT_PATH ]
then
mkdir -p $CHECKPOINT_PATH
else
echo "文件夹已经存在"
fi

TENSORBOARD_PATH=/workspace/checkpoints/tensorboard/${TASK_VERSION}
if [ ! -d $TENSORBOARD_PATH ]
then
mkdir -p $TENSORBOARD_PATH
else
echo "文件夹已经存在"
fi

VOCAB_FILE=/workspace/dataset/vocab.json
MERGE_FILE=/workspace/dataset/merges.txt
DATA_PATH=/workspace/dataset/alpaca_text_document

DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)


GPT_MODEL_ARGS=(
--tensor-model-parallel-size $TENSOR_PARALLEL
--num-layers 36
--hidden-size 3072
--num-attention-heads 24
)

TRAINING_ARGS=(
--pipeline-model-parallel-size 1
--distributed-backend nccl
# --use-distributed-optimizer
--overlap-grad-reduce
# --overlap-param-gather
--attention-backend flash
--use-flash-attn
--seq-length 4096
--max-position-embeddings 4096
--micro-batch-size 4
--global-batch-size 64
--train-iters 500000
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--lr 6.0e-5
--lr-decay-style cosine
--min-lr 6.0e-6
--lr-warmup-fraction .001
--lr-decay-iters 430000
--bf16
)


DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--merge-file $MERGE_FILE
--split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
--log-throughput
--log-interval 10
--save-interval 10000
--eval-interval 1000
--eval-iters 10
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
# --tensorboard-dir $TENSORBOARD_PATH
)


torchrun ${DISTRIBUTED_ARGS[@]} aisynergy2_pretrain_with_megatron.py --aisynergy2-yaml-config-path examples/training_scalability_nvidia_4_nodes/config_${TASK_VERSION}.yaml \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
2>&1 | tee $CHECKPOINT_PATH/log.log

+ 111
- 0
examples/training_scalability_nvidia_4_nodes/tp11_crdp2_node3.sh

@@ -0,0 +1,111 @@
#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1

# export NCCL_DEBUG=INFO

export CUDA_VISIBLE_DEVICES=4,5,6,7
export GLOO_SOCKET_IFNAME=eno1

TENSOR_PARALLEL=4

GPUS_PER_NODE=4
# Change for multinode config
MASTER_ADDR=192.168.242.15
MASTER_PORT=6000
NUM_NODES=2
NODE_RANK=1
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))

SH_FILE=$(basename $0)
TASK_VERSION=${SH_FILE%.*}
echo $TASK_VERSION

CHECKPOINT_PATH=/workspace/checkpoints/checkpoints/${TASK_VERSION}
if [ ! -d $CHECKPOINT_PATH ]
then
mkdir -p $CHECKPOINT_PATH
else
echo "文件夹已经存在"
fi

TENSORBOARD_PATH=/workspace/checkpoints/tensorboard/${TASK_VERSION}
if [ ! -d $TENSORBOARD_PATH ]
then
mkdir -p $TENSORBOARD_PATH
else
echo "文件夹已经存在"
fi

VOCAB_FILE=/workspace/dataset/vocab.json
MERGE_FILE=/workspace/dataset/merges.txt
DATA_PATH=/workspace/dataset/alpaca_text_document

DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)


GPT_MODEL_ARGS=(
--tensor-model-parallel-size $TENSOR_PARALLEL
--num-layers 36
--hidden-size 3072
--num-attention-heads 24
)

TRAINING_ARGS=(
--pipeline-model-parallel-size 1
--distributed-backend nccl
# --use-distributed-optimizer
--overlap-grad-reduce
# --overlap-param-gather
--attention-backend flash
--use-flash-attn
--seq-length 4096
--max-position-embeddings 4096
--micro-batch-size 4
--global-batch-size 64
--train-iters 500000
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--lr 6.0e-5
--lr-decay-style cosine
--min-lr 6.0e-6
--lr-warmup-fraction .001
--lr-decay-iters 430000
--bf16
)


DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--merge-file $MERGE_FILE
--split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
--log-throughput
--log-interval 10
--save-interval 10000
--eval-interval 1000
--eval-iters 10
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
# --tensorboard-dir $TENSORBOARD_PATH
)


torchrun ${DISTRIBUTED_ARGS[@]} aisynergy2_pretrain_with_megatron.py --aisynergy2-yaml-config-path examples/training_scalability_nvidia_4_nodes/config_${TASK_VERSION}.yaml \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
2>&1 | tee $CHECKPOINT_PATH/log.log

+ 111
- 0
examples/training_scalability_nvidia_4_nodes/tp12_crdp2_node0.sh

@@ -0,0 +1,111 @@
#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1

# export NCCL_DEBUG=INFO

export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export GLOO_SOCKET_IFNAME=enp104s0f0

TENSOR_PARALLEL=8

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=192.168.242.8
MASTER_PORT=6000
NUM_NODES=2
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))

SH_FILE=$(basename $0)
TASK_VERSION=${SH_FILE%.*}
echo $TASK_VERSION

CHECKPOINT_PATH=/workspace/checkpoints/checkpoints/${TASK_VERSION}
if [ ! -d $CHECKPOINT_PATH ]
then
mkdir -p $CHECKPOINT_PATH
else
echo "文件夹已经存在"
fi

TENSORBOARD_PATH=/workspace/checkpoints/tensorboard/${TASK_VERSION}
if [ ! -d $TENSORBOARD_PATH ]
then
mkdir -p $TENSORBOARD_PATH
else
echo "文件夹已经存在"
fi

VOCAB_FILE=/workspace/dataset/vocab.json
MERGE_FILE=/workspace/dataset/merges.txt
DATA_PATH=/workspace/dataset/alpaca_text_document

DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)


GPT_MODEL_ARGS=(
--tensor-model-parallel-size $TENSOR_PARALLEL
--num-layers 24
--hidden-size 4096
--num-attention-heads 32
)

TRAINING_ARGS=(
--pipeline-model-parallel-size 1
--distributed-backend nccl
# --use-distributed-optimizer
--overlap-grad-reduce
# --overlap-param-gather
--attention-backend flash
--use-flash-attn
--seq-length 4096
--max-position-embeddings 4096
--micro-batch-size 4
--global-batch-size 64
--train-iters 500000
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--lr 6.0e-5
--lr-decay-style cosine
--min-lr 6.0e-6
--lr-warmup-fraction .001
--lr-decay-iters 430000
--bf16
)


DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--merge-file $MERGE_FILE
--split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
--log-throughput
--log-interval 10
--save-interval 10000
--eval-interval 1000
--eval-iters 10
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
# --tensorboard-dir $TENSORBOARD_PATH
)


torchrun ${DISTRIBUTED_ARGS[@]} aisynergy2_pretrain_with_megatron.py --aisynergy2-yaml-config-path examples/training_scalability_nvidia_4_nodes/config_${TASK_VERSION}.yaml \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
2>&1 | tee $CHECKPOINT_PATH/log.log

+ 111
- 0
examples/training_scalability_nvidia_4_nodes/tp12_crdp2_node1.sh

@@ -0,0 +1,111 @@
#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1

# export NCCL_DEBUG=INFO

export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export GLOO_SOCKET_IFNAME=enp104s0f0

TENSOR_PARALLEL=8

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=192.168.242.8
MASTER_PORT=6000
NUM_NODES=2
NODE_RANK=1
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))

SH_FILE=$(basename $0)
TASK_VERSION=${SH_FILE%.*}
echo $TASK_VERSION

CHECKPOINT_PATH=/workspace/checkpoints/checkpoints/${TASK_VERSION}
if [ ! -d $CHECKPOINT_PATH ]
then
mkdir -p $CHECKPOINT_PATH
else
echo "文件夹已经存在"
fi

TENSORBOARD_PATH=/workspace/checkpoints/tensorboard/${TASK_VERSION}
if [ ! -d $TENSORBOARD_PATH ]
then
mkdir -p $TENSORBOARD_PATH
else
echo "文件夹已经存在"
fi

VOCAB_FILE=/workspace/dataset/vocab.json
MERGE_FILE=/workspace/dataset/merges.txt
DATA_PATH=/workspace/dataset/alpaca_text_document

DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)


GPT_MODEL_ARGS=(
--tensor-model-parallel-size $TENSOR_PARALLEL
--num-layers 24
--hidden-size 4096
--num-attention-heads 32
)

TRAINING_ARGS=(
--pipeline-model-parallel-size 1
--distributed-backend nccl
# --use-distributed-optimizer
--overlap-grad-reduce
# --overlap-param-gather
--attention-backend flash
--use-flash-attn
--seq-length 4096
--max-position-embeddings 4096
--micro-batch-size 4
--global-batch-size 64
--train-iters 500000
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--lr 6.0e-5
--lr-decay-style cosine
--min-lr 6.0e-6
--lr-warmup-fraction .001
--lr-decay-iters 430000
--bf16
)


DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--merge-file $MERGE_FILE
--split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
--log-throughput
--log-interval 10
--save-interval 10000
--eval-interval 1000
--eval-iters 10
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
# --tensorboard-dir $TENSORBOARD_PATH
)


torchrun ${DISTRIBUTED_ARGS[@]} aisynergy2_pretrain_with_megatron.py --aisynergy2-yaml-config-path examples/training_scalability_nvidia_4_nodes/config_${TASK_VERSION}.yaml \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
2>&1 | tee $CHECKPOINT_PATH/log.log

+ 111
- 0
examples/training_scalability_nvidia_4_nodes/tp12_crdp2_node2.sh

@@ -0,0 +1,111 @@
#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1

# export NCCL_DEBUG=INFO

export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export GLOO_SOCKET_IFNAME=eno1

TENSOR_PARALLEL=8

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=192.168.242.15
MASTER_PORT=6000
NUM_NODES=2
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))

SH_FILE=$(basename $0)
TASK_VERSION=${SH_FILE%.*}
echo $TASK_VERSION

CHECKPOINT_PATH=/workspace/checkpoints/checkpoints/${TASK_VERSION}
if [ ! -d $CHECKPOINT_PATH ]
then
mkdir -p $CHECKPOINT_PATH
else
echo "文件夹已经存在"
fi

TENSORBOARD_PATH=/workspace/checkpoints/tensorboard/${TASK_VERSION}
if [ ! -d $TENSORBOARD_PATH ]
then
mkdir -p $TENSORBOARD_PATH
else
echo "文件夹已经存在"
fi

VOCAB_FILE=/workspace/dataset/vocab.json
MERGE_FILE=/workspace/dataset/merges.txt
DATA_PATH=/workspace/dataset/alpaca_text_document

DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)


GPT_MODEL_ARGS=(
--tensor-model-parallel-size $TENSOR_PARALLEL
--num-layers 24
--hidden-size 4096
--num-attention-heads 32
)

TRAINING_ARGS=(
--pipeline-model-parallel-size 1
--distributed-backend nccl
# --use-distributed-optimizer
--overlap-grad-reduce
# --overlap-param-gather
--attention-backend flash
--use-flash-attn
--seq-length 4096
--max-position-embeddings 4096
--micro-batch-size 4
--global-batch-size 64
--train-iters 500000
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--lr 6.0e-5
--lr-decay-style cosine
--min-lr 6.0e-6
--lr-warmup-fraction .001
--lr-decay-iters 430000
--bf16
)


DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--merge-file $MERGE_FILE
--split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
--log-throughput
--log-interval 10
--save-interval 10000
--eval-interval 1000
--eval-iters 10
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
# --tensorboard-dir $TENSORBOARD_PATH
)


torchrun ${DISTRIBUTED_ARGS[@]} aisynergy2_pretrain_with_megatron.py --aisynergy2-yaml-config-path examples/training_scalability_nvidia_4_nodes/config_${TASK_VERSION}.yaml \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
2>&1 | tee $CHECKPOINT_PATH/log.log

+ 111
- 0
examples/training_scalability_nvidia_4_nodes/tp12_crdp2_node3.sh

@@ -0,0 +1,111 @@
#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1

# export NCCL_DEBUG=INFO

export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export GLOO_SOCKET_IFNAME=eno1

TENSOR_PARALLEL=8

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=192.168.242.15
MASTER_PORT=6000
NUM_NODES=2
NODE_RANK=1
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))

SH_FILE=$(basename $0)
TASK_VERSION=${SH_FILE%.*}
echo $TASK_VERSION

CHECKPOINT_PATH=/workspace/checkpoints/checkpoints/${TASK_VERSION}
if [ ! -d $CHECKPOINT_PATH ]
then
mkdir -p $CHECKPOINT_PATH
else
echo "文件夹已经存在"
fi

TENSORBOARD_PATH=/workspace/checkpoints/tensorboard/${TASK_VERSION}
if [ ! -d $TENSORBOARD_PATH ]
then
mkdir -p $TENSORBOARD_PATH
else
echo "文件夹已经存在"
fi

VOCAB_FILE=/workspace/dataset/vocab.json
MERGE_FILE=/workspace/dataset/merges.txt
DATA_PATH=/workspace/dataset/alpaca_text_document

DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)


GPT_MODEL_ARGS=(
--tensor-model-parallel-size $TENSOR_PARALLEL
--num-layers 24
--hidden-size 4096
--num-attention-heads 32
)

TRAINING_ARGS=(
--pipeline-model-parallel-size 1
--distributed-backend nccl
# --use-distributed-optimizer
--overlap-grad-reduce
# --overlap-param-gather
--attention-backend flash
--use-flash-attn
--seq-length 4096
--max-position-embeddings 4096
--micro-batch-size 4
--global-batch-size 64
--train-iters 500000
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--lr 6.0e-5
--lr-decay-style cosine
--min-lr 6.0e-6
--lr-warmup-fraction .001
--lr-decay-iters 430000
--bf16
)


DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--merge-file $MERGE_FILE
--split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
--log-throughput
--log-interval 10
--save-interval 10000
--eval-interval 1000
--eval-iters 10
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
# --tensorboard-dir $TENSORBOARD_PATH
)


torchrun ${DISTRIBUTED_ARGS[@]} aisynergy2_pretrain_with_megatron.py --aisynergy2-yaml-config-path examples/training_scalability_nvidia_4_nodes/config_${TASK_VERSION}.yaml \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
2>&1 | tee $CHECKPOINT_PATH/log.log

+ 111
- 0
examples/training_scalability_nvidia_4_nodes/tp13_crdp2_node0.sh

@@ -0,0 +1,111 @@
#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1

# export NCCL_DEBUG=INFO

export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export GLOO_SOCKET_IFNAME=enp104s0f0

TENSOR_PARALLEL=8

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=192.168.242.8
MASTER_PORT=6000
NUM_NODES=2
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))

SH_FILE=$(basename $0)
TASK_VERSION=${SH_FILE%.*}
echo $TASK_VERSION

CHECKPOINT_PATH=/workspace/checkpoints/checkpoints/${TASK_VERSION}
if [ ! -d $CHECKPOINT_PATH ]
then
mkdir -p $CHECKPOINT_PATH
else
echo "文件夹已经存在"
fi

TENSORBOARD_PATH=/workspace/checkpoints/tensorboard/${TASK_VERSION}
if [ ! -d $TENSORBOARD_PATH ]
then
mkdir -p $TENSORBOARD_PATH
else
echo "文件夹已经存在"
fi

VOCAB_FILE=/workspace/dataset/vocab.json
MERGE_FILE=/workspace/dataset/merges.txt
DATA_PATH=/workspace/dataset/alpaca_text_document

DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)


GPT_MODEL_ARGS=(
--tensor-model-parallel-size $TENSOR_PARALLEL
--num-layers 48
--hidden-size 56
--num-attention-heads 32
)

TRAINING_ARGS=(
--pipeline-model-parallel-size 1
--distributed-backend nccl
# --use-distributed-optimizer
--overlap-grad-reduce
# --overlap-param-gather
--attention-backend flash
--use-flash-attn
--seq-length 4096
--max-position-embeddings 4096
--micro-batch-size 1
--global-batch-size 16
--train-iters 500000
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--lr 6.0e-5
--lr-decay-style cosine
--min-lr 6.0e-6
--lr-warmup-fraction .001
--lr-decay-iters 430000
--bf16
)


DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--merge-file $MERGE_FILE
--split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
--log-throughput
--log-interval 10
--save-interval 10000
--eval-interval 1000
--eval-iters 10
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
# --tensorboard-dir $TENSORBOARD_PATH
)


torchrun ${DISTRIBUTED_ARGS[@]} aisynergy2_pretrain_with_megatron.py --aisynergy2-yaml-config-path examples/training_scalability_nvidia_4_nodes/config_${TASK_VERSION}.yaml \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
2>&1 | tee $CHECKPOINT_PATH/log.log

+ 111
- 0
examples/training_scalability_nvidia_4_nodes/tp13_crdp2_node1.sh

@@ -0,0 +1,111 @@
#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1

# export NCCL_DEBUG=INFO

export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export GLOO_SOCKET_IFNAME=enp104s0f0

TENSOR_PARALLEL=8

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=192.168.242.8
MASTER_PORT=6000
NUM_NODES=2
NODE_RANK=1
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))

SH_FILE=$(basename $0)
TASK_VERSION=${SH_FILE%.*}
echo $TASK_VERSION

CHECKPOINT_PATH=/workspace/checkpoints/checkpoints/${TASK_VERSION}
if [ ! -d $CHECKPOINT_PATH ]
then
mkdir -p $CHECKPOINT_PATH
else
echo "文件夹已经存在"
fi

TENSORBOARD_PATH=/workspace/checkpoints/tensorboard/${TASK_VERSION}
if [ ! -d $TENSORBOARD_PATH ]
then
mkdir -p $TENSORBOARD_PATH
else
echo "文件夹已经存在"
fi

VOCAB_FILE=/workspace/dataset/vocab.json
MERGE_FILE=/workspace/dataset/merges.txt
DATA_PATH=/workspace/dataset/alpaca_text_document

DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)


GPT_MODEL_ARGS=(
--tensor-model-parallel-size $TENSOR_PARALLEL
--num-layers 48
--hidden-size 56
--num-attention-heads 32
)

TRAINING_ARGS=(
--pipeline-model-parallel-size 1
--distributed-backend nccl
# --use-distributed-optimizer
--overlap-grad-reduce
# --overlap-param-gather
--attention-backend flash
--use-flash-attn
--seq-length 4096
--max-position-embeddings 4096
--micro-batch-size 1
--global-batch-size 16
--train-iters 500000
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--lr 6.0e-5
--lr-decay-style cosine
--min-lr 6.0e-6
--lr-warmup-fraction .001
--lr-decay-iters 430000
--bf16
)


DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--merge-file $MERGE_FILE
--split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
--log-throughput
--log-interval 10
--save-interval 10000
--eval-interval 1000
--eval-iters 10
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
# --tensorboard-dir $TENSORBOARD_PATH
)


torchrun ${DISTRIBUTED_ARGS[@]} aisynergy2_pretrain_with_megatron.py --aisynergy2-yaml-config-path examples/training_scalability_nvidia_4_nodes/config_${TASK_VERSION}.yaml \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
2>&1 | tee $CHECKPOINT_PATH/log.log

+ 111
- 0
examples/training_scalability_nvidia_4_nodes/tp13_crdp2_node2.sh

@@ -0,0 +1,111 @@
#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1

# export NCCL_DEBUG=INFO

export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export GLOO_SOCKET_IFNAME=eno1

TENSOR_PARALLEL=8

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=192.168.242.15
MASTER_PORT=6000
NUM_NODES=2
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))

SH_FILE=$(basename $0)
TASK_VERSION=${SH_FILE%.*}
echo $TASK_VERSION

CHECKPOINT_PATH=/workspace/checkpoints/checkpoints/${TASK_VERSION}
if [ ! -d $CHECKPOINT_PATH ]
then
mkdir -p $CHECKPOINT_PATH
else
echo "文件夹已经存在"
fi

TENSORBOARD_PATH=/workspace/checkpoints/tensorboard/${TASK_VERSION}
if [ ! -d $TENSORBOARD_PATH ]
then
mkdir -p $TENSORBOARD_PATH
else
echo "文件夹已经存在"
fi

VOCAB_FILE=/workspace/dataset/vocab.json
MERGE_FILE=/workspace/dataset/merges.txt
DATA_PATH=/workspace/dataset/alpaca_text_document

DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)


GPT_MODEL_ARGS=(
--tensor-model-parallel-size $TENSOR_PARALLEL
--num-layers 48
--hidden-size 56
--num-attention-heads 32
)

TRAINING_ARGS=(
--pipeline-model-parallel-size 1
--distributed-backend nccl
# --use-distributed-optimizer
--overlap-grad-reduce
# --overlap-param-gather
--attention-backend flash
--use-flash-attn
--seq-length 4096
--max-position-embeddings 4096
--micro-batch-size 1
--global-batch-size 16
--train-iters 500000
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--lr 6.0e-5
--lr-decay-style cosine
--min-lr 6.0e-6
--lr-warmup-fraction .001
--lr-decay-iters 430000
--bf16
)


DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--merge-file $MERGE_FILE
--split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
--log-throughput
--log-interval 10
--save-interval 10000
--eval-interval 1000
--eval-iters 10
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
# --tensorboard-dir $TENSORBOARD_PATH
)


torchrun ${DISTRIBUTED_ARGS[@]} aisynergy2_pretrain_with_megatron.py --aisynergy2-yaml-config-path examples/training_scalability_nvidia_4_nodes/config_${TASK_VERSION}.yaml \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
2>&1 | tee $CHECKPOINT_PATH/log.log

+ 111
- 0
examples/training_scalability_nvidia_4_nodes/tp13_crdp2_node3.sh

@@ -0,0 +1,111 @@
#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1

# export NCCL_DEBUG=INFO

export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export GLOO_SOCKET_IFNAME=eno1

TENSOR_PARALLEL=8

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=192.168.242.15
MASTER_PORT=6000
NUM_NODES=2
NODE_RANK=1
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))

SH_FILE=$(basename $0)
TASK_VERSION=${SH_FILE%.*}
echo $TASK_VERSION

CHECKPOINT_PATH=/workspace/checkpoints/checkpoints/${TASK_VERSION}
if [ ! -d $CHECKPOINT_PATH ]
then
mkdir -p $CHECKPOINT_PATH
else
echo "文件夹已经存在"
fi

TENSORBOARD_PATH=/workspace/checkpoints/tensorboard/${TASK_VERSION}
if [ ! -d $TENSORBOARD_PATH ]
then
mkdir -p $TENSORBOARD_PATH
else
echo "文件夹已经存在"
fi

VOCAB_FILE=/workspace/dataset/vocab.json
MERGE_FILE=/workspace/dataset/merges.txt
DATA_PATH=/workspace/dataset/alpaca_text_document

DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)


GPT_MODEL_ARGS=(
--tensor-model-parallel-size $TENSOR_PARALLEL
--num-layers 48
--hidden-size 56
--num-attention-heads 32
)

TRAINING_ARGS=(
--pipeline-model-parallel-size 1
--distributed-backend nccl
# --use-distributed-optimizer
--overlap-grad-reduce
# --overlap-param-gather
--attention-backend flash
--use-flash-attn
--seq-length 4096
--max-position-embeddings 4096
--micro-batch-size 1
--global-batch-size 16
--train-iters 500000
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--lr 6.0e-5
--lr-decay-style cosine
--min-lr 6.0e-6
--lr-warmup-fraction .001
--lr-decay-iters 430000
--bf16
)


DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--merge-file $MERGE_FILE
--split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
--log-throughput
--log-interval 10
--save-interval 10000
--eval-interval 1000
--eval-iters 10
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
# --tensorboard-dir $TENSORBOARD_PATH
)


torchrun ${DISTRIBUTED_ARGS[@]} aisynergy2_pretrain_with_megatron.py --aisynergy2-yaml-config-path examples/training_scalability_nvidia_4_nodes/config_${TASK_VERSION}.yaml \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
2>&1 | tee $CHECKPOINT_PATH/log.log

+ 113
- 0
examples/training_scalability_nvidia_4_nodes/tp13_node0.sh

@@ -0,0 +1,113 @@
#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1

# export NCCL_DEBUG=INFO

export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export GLOO_SOCKET_IFNAME=enp104s0f0

TENSOR_PARALLEL=8

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=192.168.242.8
MASTER_PORT=6000
NUM_NODES=2
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))

SH_FILE=$(basename $0)
TASK_VERSION=${SH_FILE%.*}
echo $TASK_VERSION

CHECKPOINT_PATH=/workspace/checkpoints/checkpoints/${TASK_VERSION}
if [ ! -d $CHECKPOINT_PATH ]
then
mkdir -p $CHECKPOINT_PATH
else
echo "文件夹已经存在"
fi

TENSORBOARD_PATH=/workspace/checkpoints/tensorboard/${TASK_VERSION}
if [ ! -d $TENSORBOARD_PATH ]
then
mkdir -p $TENSORBOARD_PATH
else
echo "文件夹已经存在"
fi

VOCAB_FILE=/workspace/dataset/vocab.json
MERGE_FILE=/workspace/dataset/merges.txt
DATA_PATH=/workspace/dataset/alpaca_text_document

DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)


GPT_MODEL_ARGS=(
--tensor-model-parallel-size $TENSOR_PARALLEL
--num-layers 48
--hidden-size 7168
--num-attention-heads 56
)

TRAINING_ARGS=(
--pipeline-model-parallel-size 1
--distributed-backend nccl
# --use-distributed-optimizer
#--tp-comm-overlap
#--sequence-parallel
--use-flash-attn
--attention-backend flash
--overlap-grad-reduce
# --overlap-param-gather
--seq-length 4096
--max-position-embeddings 4096
--micro-batch-size 4
--global-batch-size 128
--train-iters 500000
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--lr 6.0e-5
--lr-decay-style cosine
--min-lr 6.0e-6
--lr-warmup-fraction .001
--lr-decay-iters 430000
--bf16
)


DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--merge-file $MERGE_FILE
--split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
--log-throughput
--log-interval 10
--save-interval 10000
--eval-interval 1000
--eval-iters 10
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
# --tensorboard-dir $TENSORBOARD_PATH
)


torchrun ${DISTRIBUTED_ARGS[@]} third_party/MegatronLM/pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
2>&1 | tee $CHECKPOINT_PATH/log.log
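
Unlike the *_crdp2 scripts above, tp13_node0.sh and tp13_node1.sh launch third_party/MegatronLM/pretrain_gpt.py directly, with no --aisynergy2-yaml-config-path, so this pair serves as the plain single-job Megatron baseline for the tp13 model. Note that the two node scripts specify different micro/global batch sizes (4/128 here versus 1/32 in tp13_node1.sh below); all ranks of one torchrun job normally need matching batch-size arguments, so this looks like an unintended mismatch.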

+ 113
- 0
examples/training_scalability_nvidia_4_nodes/tp13_node1.sh View File

@@ -0,0 +1,113 @@
#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1

# export NCCL_DEBUG=INFO

export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export GLOO_SOCKET_IFNAME=enp104s0f0

TENSOR_PARALLEL=8

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=192.168.242.8
MASTER_PORT=6000
NUM_NODES=2
NODE_RANK=1
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))

SH_FILE=$(basename $0)
TASK_VERSION=${SH_FILE%.*}
echo $TASK_VERSION

CHECKPOINT_PATH=/workspace/checkpoints/checkpoints/${TASK_VERSION}
if [ ! -d $CHECKPOINT_PATH ]
then
mkdir -p $CHECKPOINT_PATH
else
echo "文件夹已经存在"
fi

TENSORBOARD_PATH=/workspace/checkpoints/tensorboard/${TASK_VERSION}
if [ ! -d $TENSORBOARD_PATH ]
then
mkdir -p $TENSORBOARD_PATH
else
echo "文件夹已经存在"
fi

VOCAB_FILE=/workspace/dataset/vocab.json
MERGE_FILE=/workspace/dataset/merges.txt
DATA_PATH=/workspace/dataset/alpaca_text_document

DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)


GPT_MODEL_ARGS=(
--tensor-model-parallel-size $TENSOR_PARALLEL
--num-layers 48
--hidden-size 7168
--num-attention-heads 56
)

TRAINING_ARGS=(
--pipeline-model-parallel-size 1
--distributed-backend nccl
# --use-distributed-optimizer
#--tp-comm-overlap
#--sequence-parallel
--use-flash-attn
--attention-backend flash
--overlap-grad-reduce
# --overlap-param-gather
--seq-length 4096
--max-position-embeddings 4096
--micro-batch-size 1
--global-batch-size 32
--train-iters 500000
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--lr 6.0e-5
--lr-decay-style cosine
--min-lr 6.0e-6
--lr-warmup-fraction .001
--lr-decay-iters 430000
--bf16
)


DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--merge-file $MERGE_FILE
--split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
--log-throughput
--log-interval 10
--save-interval 10000
--eval-interval 1000
--eval-iters 10
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
# --tensorboard-dir $TENSORBOARD_PATH
)


torchrun "${DISTRIBUTED_ARGS[@]}" third_party/MegatronLM/pretrain_gpt.py \
"${GPT_MODEL_ARGS[@]}" \
"${TRAINING_ARGS[@]}" \
"${DATA_ARGS[@]}" \
"${EVAL_AND_LOGGING_ARGS[@]}" \
2>&1 | tee "$CHECKPOINT_PATH/log.log"
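
All ranks of one torchrun job must be launched with identical Megatron arguments; between the per-node scripts only NODE_RANK (and, where hosts differ, the NIC name) should vary. Note that the batch settings here (micro-batch 1, global batch 32) differ from the node-0 script above (micro-batch 4, global batch 128); mismatched batch arguments across ranks will at best hang the joint job, so one of the two values is presumably stale. A quick pre-launch check, assuming the node-0 counterpart is named tp13_node0.sh:

# Expect only NODE_RANK (and possibly GLOO_SOCKET_IFNAME) to differ;
# any other divergence, e.g. batch sizes, will break the joint job.
diff tp13_node0.sh tp13_node1.sh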

+111 -0   examples/training_scalability_nvidia_4_nodes/tp1_crdp2_node0.sh

@@ -0,0 +1,111 @@
#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1

# export NCCL_DEBUG=INFO

export CUDA_VISIBLE_DEVICES=0
export GLOO_SOCKET_IFNAME=enp104s0f0

TENSOR_PARALLEL=1

GPUS_PER_NODE=1
# Change for multinode config
MASTER_ADDR=192.168.242.8
MASTER_PORT=6000
NUM_NODES=2
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))

SH_FILE=$(basename "$0")
TASK_VERSION=${SH_FILE%.*}
echo "$TASK_VERSION"

CHECKPOINT_PATH=/workspace/checkpoints/checkpoints/${TASK_VERSION}
if [ ! -d "$CHECKPOINT_PATH" ]
then
mkdir -p "$CHECKPOINT_PATH"
else
echo "Directory already exists"
fi

TENSORBOARD_PATH=/workspace/checkpoints/tensorboard/${TASK_VERSION}
if [ ! -d "$TENSORBOARD_PATH" ]
then
mkdir -p "$TENSORBOARD_PATH"
else
echo "Directory already exists"
fi

VOCAB_FILE=/workspace/dataset/vocab.json
MERGE_FILE=/workspace/dataset/merges.txt
DATA_PATH=/workspace/dataset/alpaca_text_document

DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)


GPT_MODEL_ARGS=(
--tensor-model-parallel-size $TENSOR_PARALLEL
--num-layers 22
--hidden-size 2048
--num-attention-heads 16
)

TRAINING_ARGS=(
--pipeline-model-parallel-size 1
--distributed-backend nccl
# --use-distributed-optimizer
--overlap-grad-reduce
# --overlap-param-gather
--attention-backend flash
--use-flash-attn
--seq-length 4096
--max-position-embeddings 4096
--micro-batch-size 4
--global-batch-size 64
--train-iters 500000
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--lr 6.0e-5
--lr-decay-style cosine
--min-lr 6.0e-6
--lr-warmup-fraction .001
--lr-decay-iters 430000
--bf16
)


DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--merge-file $MERGE_FILE
--split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
--log-throughput
--log-interval 10
--save-interval 10000
--eval-interval 1000
--eval-iters 10
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
# --tensorboard-dir $TENSORBOARD_PATH
)


torchrun "${DISTRIBUTED_ARGS[@]}" aisynergy2_pretrain_with_megatron.py --aisynergy2-yaml-config-path "examples/training_scalability_nvidia_4_nodes/config_${TASK_VERSION}.yaml" \
"${GPT_MODEL_ARGS[@]}" \
"${TRAINING_ARGS[@]}" \
"${DATA_ARGS[@]}" \
"${EVAL_AND_LOGGING_ARGS[@]}" \
2>&1 | tee "$CHECKPOINT_PATH/log.log"
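
The YAML path passed to --aisynergy2-yaml-config-path is derived from the launcher's own filename, so each script expects a matching config_<script-name>.yaml beside it. The parameter expansion used above works like this:

SH_FILE=$(basename "$0")            # e.g. tp1_crdp2_node0.sh
TASK_VERSION=${SH_FILE%.*}          # strip the extension -> tp1_crdp2_node0
echo "config_${TASK_VERSION}.yaml"  # -> config_tp1_crdp2_node0.yaml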

+111 -0   examples/training_scalability_nvidia_4_nodes/tp1_crdp2_node1.sh

@@ -0,0 +1,111 @@
#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1

# export NCCL_DEBUG=INFO

export CUDA_VISIBLE_DEVICES=0
export GLOO_SOCKET_IFNAME=enp104s0f0

TENSOR_PARALLEL=1

GPUS_PER_NODE=1
# Change for multinode config
MASTER_ADDR=192.168.242.8
MASTER_PORT=6000
NUM_NODES=2
NODE_RANK=1
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))

SH_FILE=$(basename "$0")
TASK_VERSION=${SH_FILE%.*}
echo "$TASK_VERSION"

CHECKPOINT_PATH=/workspace/checkpoints/checkpoints/${TASK_VERSION}
if [ ! -d "$CHECKPOINT_PATH" ]
then
mkdir -p "$CHECKPOINT_PATH"
else
echo "Directory already exists"
fi

TENSORBOARD_PATH=/workspace/checkpoints/tensorboard/${TASK_VERSION}
if [ ! -d "$TENSORBOARD_PATH" ]
then
mkdir -p "$TENSORBOARD_PATH"
else
echo "Directory already exists"
fi

VOCAB_FILE=/workspace/dataset/vocab.json
MERGE_FILE=/workspace/dataset/merges.txt
DATA_PATH=/workspace/dataset/alpaca_text_document

DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)


GPT_MODEL_ARGS=(
--tensor-model-parallel-size $TENSOR_PARALLEL
--num-layers 22
--hidden-size 2048
--num-attention-heads 16
)

TRAINING_ARGS=(
--pipeline-model-parallel-size 1
--distributed-backend nccl
# --use-distributed-optimizer
--overlap-grad-reduce
# --overlap-param-gather
--attention-backend flash
--use-flash-attn
--seq-length 4096
--max-position-embeddings 4096
--micro-batch-size 4
--global-batch-size 64
--train-iters 500000
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--lr 6.0e-5
--lr-decay-style cosine
--min-lr 6.0e-6
--lr-warmup-fraction .001
--lr-decay-iters 430000
--bf16
)


DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--merge-file $MERGE_FILE
--split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
--log-throughput
--log-interval 10
--save-interval 10000
--eval-interval 1000
--eval-iters 10
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
# --tensorboard-dir $TENSORBOARD_PATH
)


torchrun "${DISTRIBUTED_ARGS[@]}" aisynergy2_pretrain_with_megatron.py --aisynergy2-yaml-config-path "examples/training_scalability_nvidia_4_nodes/config_${TASK_VERSION}.yaml" \
"${GPT_MODEL_ARGS[@]}" \
"${TRAINING_ARGS[@]}" \
"${DATA_ARGS[@]}" \
"${EVAL_AND_LOGGING_ARGS[@]}" \
2>&1 | tee "$CHECKPOINT_PATH/log.log"
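
Note that the WORLD_SIZE computed in these scripts is never passed to torchrun: torchrun derives the world size from --nnodes and --nproc_per_node and exports it to every worker itself, so the assignment here is informational only. A worker can confirm its view of the job, for example:

# torchrun sets RANK, LOCAL_RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT
# in each worker's environment.
python -c 'import os; print(os.environ["RANK"], os.environ["WORLD_SIZE"])'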

+111 -0   examples/training_scalability_nvidia_4_nodes/tp1_crdp2_node2.sh

@@ -0,0 +1,111 @@
#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1

# export NCCL_DEBUG=INFO

export CUDA_VISIBLE_DEVICES=0
export GLOO_SOCKET_IFNAME=eno1

TENSOR_PARALLEL=1

GPUS_PER_NODE=1
# Change for multinode config
MASTER_ADDR=192.168.242.15
MASTER_PORT=6000
NUM_NODES=2
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))

SH_FILE=$(basename "$0")
TASK_VERSION=${SH_FILE%.*}
echo "$TASK_VERSION"

CHECKPOINT_PATH=/workspace/checkpoints/checkpoints/${TASK_VERSION}
if [ ! -d "$CHECKPOINT_PATH" ]
then
mkdir -p "$CHECKPOINT_PATH"
else
echo "Directory already exists"
fi

TENSORBOARD_PATH=/workspace/checkpoints/tensorboard/${TASK_VERSION}
if [ ! -d "$TENSORBOARD_PATH" ]
then
mkdir -p "$TENSORBOARD_PATH"
else
echo "Directory already exists"
fi

VOCAB_FILE=/workspace/dataset/vocab.json
MERGE_FILE=/workspace/dataset/merges.txt
DATA_PATH=/workspace/dataset/alpaca_text_document

DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)


GPT_MODEL_ARGS=(
--tensor-model-parallel-size $TENSOR_PARALLEL
--num-layers 22
--hidden-size 2048
--num-attention-heads 16
)

TRAINING_ARGS=(
--pipeline-model-parallel-size 1
--distributed-backend nccl
# --use-distributed-optimizer
--overlap-grad-reduce
# --overlap-param-gather
--attention-backend flash
--use-flash-attn
--seq-length 4096
--max-position-embeddings 4096
--micro-batch-size 4
--global-batch-size 64
--train-iters 500000
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--lr 6.0e-5
--lr-decay-style cosine
--min-lr 6.0e-6
--lr-warmup-fraction .001
--lr-decay-iters 430000
--bf16
)


DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--merge-file $MERGE_FILE
--split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
--log-throughput
--log-interval 10
--save-interval 10000
--eval-interval 1000
--eval-iters 10
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
# --tensorboard-dir $TENSORBOARD_PATH
)


torchrun "${DISTRIBUTED_ARGS[@]}" aisynergy2_pretrain_with_megatron.py --aisynergy2-yaml-config-path "examples/training_scalability_nvidia_4_nodes/config_${TASK_VERSION}.yaml" \
"${GPT_MODEL_ARGS[@]}" \
"${TRAINING_ARGS[@]}" \
"${DATA_ARGS[@]}" \
"${EVAL_AND_LOGGING_ARGS[@]}" \
2>&1 | tee "$CHECKPOINT_PATH/log.log"
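
The crdp2 launchers form two separate two-rank torchrun jobs: node0/node1 rendezvous at 192.168.242.8 over enp104s0f0, while node2 (and, presumably, its node3 counterpart, which is not shown in this diff) rendezvous at 192.168.242.15 over eno1; the cross-region data-parallel (crdp) layer then presumably bridges the two groups via the YAML config. GLOO_SOCKET_IFNAME must name a real interface on each host; to list candidates:

# Show the IPv4-configured interfaces on this host to pick the right
# value for GLOO_SOCKET_IFNAME:
ip -o -4 addr show | awk '{print $2, $4}'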

Some files were not shown because too many files changed in this diff
