[v1] support training with fsdp2 (#9773)

Co-authored-by: frozenleaves <frozen@Mac.local>
Co-authored-by: Yaowei Zheng <hiyouga@buaa.edu.cn>
This commit is contained in:
浮梦
2026-01-25 19:41:58 +08:00
committed by GitHub
parent 641bfdd482
commit f9f11dcb97
15 changed files with 801 additions and 33 deletions

View File

@@ -180,6 +180,16 @@ def operate_tensorlike(fn: Callable[[...], Tensor], data: TensorLike, **kwargs)
return result.tolist()
def get_process_group_backend() -> str:
    """Return the collective-communication backend name for ``init_process_group``.

    Selects the backend matching the currently active accelerator:
    ``"hccl"`` for Ascend NPU, ``"nccl"`` for CUDA GPUs, and ``"gloo"``
    (CPU-capable) for everything else.

    Returns:
        str: backend identifier accepted by ``torch.distributed.init_process_group``.
    """
    # Hoist the accelerator lookup — the original called get_current_accelerator() once per branch.
    device_type = get_current_accelerator().type
    if device_type == DeviceType.NPU:
        return "hccl"
    elif device_type == DeviceType.CUDA:
        return "nccl"
    else:
        # Fallback for CPU / unsupported accelerators; gloo works without GPUs.
        return "gloo"
def all_gather(tensor: Tensor, group: Optional[ProcessGroup] = None) -> Tensor:
"""Gathers the tensor from all ranks and stacks them at the first dim."""
world_size = get_world_size()

View File

@@ -145,7 +145,7 @@ class DistributedInterface:
timeout = config.get("timeout", 18000)
if self._is_distributed:
init_process_group(timeout=timedelta(seconds=timeout))
init_process_group(timeout=timedelta(seconds=timeout), backend=helper.get_process_group_backend())
self.model_device_mesh = init_device_mesh(
device_type=self.current_device.type,
mesh_shape=self.strategy.model_mesh_shape,