[v1] support training with fsdp2 (#9773)

Co-authored-by: frozenleaves <frozen@Mac.local> Co-authored-by: Yaowei Zheng <hiyouga@buaa.edu.cn>
2026-05-05 15:48:54 +08:00 · 2026-01-25 19:41:58 +08:00
parent 641bfdd482
commit f9f11dcb97
15 changed files with 801 additions and 33 deletions
--- a/src/llamafactory/v1/accelerator/helper.py
+++ b/src/llamafactory/v1/accelerator/helper.py
@@ -180,6 +180,16 @@ def operate_tensorlike(fn: Callable[[...], Tensor], data: TensorLike, **kwargs)
        return result.tolist()


+def get_process_group_backend() -> str:
+    """Return the backend name used to initialize the process group."""
+    # Map the accelerator type to a backend name; anything else falls back to gloo.
+    if get_current_accelerator().type == DeviceType.NPU:
+        return "hccl"
+    if get_current_accelerator().type == DeviceType.CUDA:
+        return "nccl"
+    return "gloo"
+
+
 def all_gather(tensor: Tensor, group: Optional[ProcessGroup] = None) -> Tensor:
    """Gathers the tensor from all ranks and stacks them at the first dim."""
    world_size = get_world_size()