[v1] add dp & mp mesh (#9611)

2026-07-31 21:26:09 +08:00 · 2025-12-13 01:44:28 +08:00
parent 203069e11c
commit 110d21713e
3 changed files with 158 additions and 55 deletions
--- a/src/llamafactory/v1/accelerator/helper.py
+++ b/src/llamafactory/v1/accelerator/helper.py
@@ -60,16 +60,16 @@ def get_rank() -> int:
    return int(os.getenv("RANK", "0"))


-def get_local_rank() -> int:
-    """Get local rank."""
-    return int(os.getenv("LOCAL_RANK", "0"))
-
-
 def get_world_size() -> int:
    """Get world size."""
    return int(os.getenv("WORLD_SIZE", "1"))


+def get_local_rank() -> int:
+    """Get local rank."""
+    return int(os.getenv("LOCAL_RANK", "0"))
+
+
 def get_local_world_size() -> int:
    """Get local world size."""
    return int(os.getenv("LOCAL_WORLD_SIZE", "1"))
@@ -79,7 +79,7 @@ def get_local_world_size() -> int:
 def get_current_accelerator(check_available: bool = True) -> torch.device:
    """Get current accelerator.

-    Note: this api requires torch>=2.7.0, 2.6 or lower will get an AttributeError or RuntimeError
+    Note: this API requires torch>=2.7.0, otherwise it will raise an AttributeError or RuntimeError
    """
    if not hasattr(torch, "accelerator"):
        raise RuntimeError("torch.accelerator is not available, please upgrade torch to 2.7.0 or higher.")
@@ -123,7 +123,7 @@ def all_reduce(data: TensorLike, op: ReduceOp = ReduceOp.MEAN, group: Optional["
    is_tensor = isinstance(data, torch.Tensor)

    if is_ndarray:
-        data = torch.from_numpy(data)
+        data = torch.from_numpy(data).to(device=device, dtype=torch.float)
    elif not is_tensor:
        data = torch.tensor(data, dtype=torch.float, device=device)

@@ -140,7 +140,7 @@ def all_reduce(data: TensorLike, op: ReduceOp = ReduceOp.MEAN, group: Optional["
    if is_tensor:
        return data
    elif is_ndarray:
-        return data.numpy()
+        return data.cpu().numpy()
    elif data.numel() == 1:
        return data.item()
    else:
--- a/src/llamafactory/v1/accelerator/interface.py
+++ b/src/llamafactory/v1/accelerator/interface.py
@@ -1,4 +1,7 @@
-# Copyright 2025 the LlamaFactory team.
+# Copyright 2025 Bytedance Ltd. and the LlamaFactory team.
+#
+# This code is inspired by Bytedance's VeOmni library.
+# https://github.com/ByteDance-Seed/VeOmni/blob/v0.1.4/veomni/distributed/parallel_state.py
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,41 +16,91 @@
 # limitations under the License.

 from dataclasses import dataclass
-from typing import Any, Optional
+from enum import Enum
+from typing import TYPE_CHECKING, Any, Optional

 from torch.distributed.device_mesh import DeviceMesh, init_device_mesh

-from ..utils.types import TensorLike
-from .helper import ReduceOp, all_reduce, get_current_accelerator, get_rank, get_world_size, is_distributed
+from ..utils.types import Tensor, TensorLike
+from .helper import (
+    ReduceOp,
+    all_gather,
+    all_reduce,
+    get_current_accelerator,
+    get_local_rank,
+    get_local_world_size,
+    get_rank,
+    get_world_size,
+    is_distributed,
+)
+
+
+if TYPE_CHECKING:
+    from torch.distributed import ProcessGroup
+
+
+class Dim(str, Enum):
+    """Dimension names."""
+
+    MP_REPLICATE = "mp_replicate"
+    MP_SHARD = "mp_shard"
+    DP = "dp"
+    CP = "cp"


@dataclass
 class DistributedStrategy:
    """Distributed strategy."""

+    mp_replicate_size: int = 1
+    """Model parallel replicate size, default to 1."""
+    mp_shard_size: Optional[int] = None
+    """Model parallel shard size, default to world_size // mp_replicate_size."""
    dp_size: Optional[int] = None
-    tp_size: int = 1
+    """Data parallel size, default to world_size // cp_size."""
+    cp_size: int = 1
+    """Context parallel size, default to 1."""

    def __post_init__(self) -> None:
+        if not is_distributed():
+            self.mp_shard_size = 1
+        elif self.mp_shard_size is None:
+            self.mp_shard_size = get_world_size() // self.mp_replicate_size
+        elif self.mp_replicate_size * self.mp_shard_size != get_world_size():
+            raise ValueError(
+                f"mp_replicate_size * mp_shard_size must equal to world_size, "
+                f"got {self.mp_replicate_size} * {self.mp_shard_size} != {get_world_size()}."
+            )
+
        if not is_distributed():
            self.dp_size = 1
        elif self.dp_size is None:
-            self.dp_size = get_world_size() // self.tp_size
-        elif self.dp_size * self.tp_size != get_world_size():
+            self.dp_size = get_world_size() // self.cp_size
+        elif self.dp_size * self.cp_size != get_world_size():
            raise ValueError(
-                f"dp_size * tp_size must equal to world_size, "
-                f"got {self.dp_size} * {self.tp_size} != {get_world_size()}."
+                f"dp_size * cp_size must equal to world_size, "
+                f"got {self.dp_size} * {self.cp_size} != {get_world_size()}."
            )

    @property
-    def mesh_shape(self) -> tuple[int, int]:
-        """Mesh shape."""
-        return (self.dp_size, self.tp_size)
+    def model_mesh_shape(self) -> tuple[int, int]:
+        """Model parallel mesh shape."""
+        return (self.mp_replicate_size, self.mp_shard_size)

    @property
-    def mesh_dim_names(self) -> tuple[str, str]:
-        """Mesh dimension names."""
-        return ("dp", "tp")
+    def model_mesh_dim_names(self) -> tuple[str, str]:
+        """Model parallel mesh dimension names."""
+        return (Dim.MP_REPLICATE.value, Dim.MP_SHARD.value)
+
+    @property
+    def data_mesh_shape(self) -> tuple[int, int]:
+        """Data parallel mesh shape."""
+        return (self.dp_size, self.cp_size)
+
+    @property
+    def data_mesh_dim_names(self) -> tuple[str, str]:
+        """Data parallel mesh dimension names."""
+        return (Dim.DP.value, Dim.CP.value)


 class DistributedInterface:
@@ -55,15 +108,18 @@ class DistributedInterface:

    _instance: Optional["DistributedInterface"] = None
    _initialized: bool = False
+    _is_distributed = is_distributed()
+    _rank = get_rank()
+    _world_size = get_world_size()
+    _local_rank = get_local_rank()
+    _local_world_size = get_local_world_size()

-    is_distributed = is_distributed()
-    """Check if distributed environment is available."""
-    rank = get_rank()
-    """Global rank."""
-    world_size = get_world_size()
-    """Global world size."""
-    device_mesh: Optional[DeviceMesh] = None
-    """Device mesh."""
+    strategy: Optional[DistributedStrategy] = None
+    """Distributed strategy."""
+    model_device_mesh: Optional[DeviceMesh] = None
+    """Model parallel device mesh."""
+    data_device_mesh: Optional[DeviceMesh] = None
+    """Data parallel device mesh."""
    current_accelerator = get_current_accelerator()
    """Current accelerator."""

@@ -79,44 +135,89 @@ class DistributedInterface:
            return

        self.strategy = strategy
-        if self.is_distributed:
-            self.device_mesh = init_device_mesh(
+        if self._is_distributed:
+            self.model_device_mesh = init_device_mesh(
                device_type=self.current_accelerator.type,
-                mesh_shape=strategy.mesh_shape,
-                mesh_dim_names=strategy.mesh_dim_names,
+                mesh_shape=strategy.model_mesh_shape,
+                mesh_dim_names=strategy.model_mesh_dim_names,
+            )
+            self.data_device_mesh = init_device_mesh(
+                device_type=self.current_accelerator.type,
+                mesh_shape=strategy.data_mesh_shape,
+                mesh_dim_names=strategy.data_mesh_dim_names,
            )
        else:
-            self.device_mesh = None
+            self.model_device_mesh = None
+            self.data_device_mesh = None

        self._initialized = True

    def __str__(self) -> str:
        return (
-            f"DistributedInterface(strategy={self.strategy}), is_distributed={self.is_distributed}, "
-            f"rank={self.rank}, world_size={self.world_size}, "
-            f"device_mesh={self.device_mesh}, current_accelerator={self.current_accelerator}"
+            f"DistributedInterface(strategy={self.strategy}), is_distributed={self._is_distributed}, "
+            f"current_accelerator={self.current_accelerator}, rank={self._rank}, world_size={self._world_size}, "
+            f"model_device_mesh={self.model_device_mesh}, data_device_mesh={self.data_device_mesh}"
        )

-    def dp_rank(self) -> int:
-        """Data parallel rank."""
-        if self.device_mesh is None:
+    @classmethod
+    def get_device_mesh(cls, dim: Optional[Dim] = None) -> Optional[DeviceMesh]:
+        """Get device mesh for specified dimension."""
+        if dim is None:
+            raise ValueError("dim must be specified.")
+        elif cls.model_device_mesh is None:
+            return None
+        elif dim in cls.strategy.data_mesh_dim_names:
+            return cls.data_device_mesh[dim.value]
+        else:
+            return cls.model_device_mesh[dim.value]
+
+    @classmethod
+    def get_group(cls, dim: Optional[Dim] = None) -> Optional["ProcessGroup"]:
+        """Get process group for specified dimension."""
+        if cls.model_device_mesh is None or dim is None:
+            return None
+        else:
+            return cls.get_device_mesh(dim).get_group()
+
+    @classmethod
+    def get_rank(cls, dim: Optional[Dim] = None) -> int:
+        """Get parallel rank for specified dimension, or the global rank (known without a mesh) if dim is None."""
+        if dim is None:
+            return cls._rank
+        elif cls.model_device_mesh is None:
            return 0
+        else:
+            return cls.get_device_mesh(dim).get_local_rank()

-        return self.device_mesh["dp"].get_rank()
-
-    def dp_size(self) -> int:
-        """Data parallel size."""
-        if self.device_mesh is None:
+    @classmethod
+    def get_world_size(cls, dim: Optional[Dim] = None) -> int:
+        """Get parallel size for specified dimension, or the global world size (known without a mesh) if dim is None."""
+        if dim is None:
+            return cls._world_size
+        elif cls.model_device_mesh is None:
            return 1
+        else:
+            return cls.get_device_mesh(dim).size()

-        return self.device_mesh["dp"].size()
+    @classmethod
+    def get_local_rank(cls) -> int:
+        """Get parallel local rank."""
+        return cls._local_rank

-    def all_reduce_over_dp(self, data: TensorLike, op: ReduceOp = ReduceOp.MEAN) -> TensorLike:
-        """All reduce tensor."""
-        if self.device_mesh is None:
-            return data
+    @classmethod
+    def get_local_world_size(cls) -> int:
+        """Get parallel local world size."""
+        return cls._local_world_size

-        return all_reduce(data, op, self.device_mesh["dp"].get_group())
+    @classmethod
+    def all_gather(cls, data: Tensor, dim: Optional[Dim] = Dim.DP) -> Tensor:
+        """Gather tensor across specified parallel group."""
+        return all_gather(data, cls.get_group(dim)) if cls.model_device_mesh is not None else data
+
+    @classmethod
+    def all_reduce(cls, data: TensorLike, op: ReduceOp = ReduceOp.MEAN, dim: Optional[Dim] = Dim.DP) -> TensorLike:
+        """Reduce tensor across specified parallel group."""
+        return all_reduce(data, op, cls.get_group(dim)) if cls.model_device_mesh is not None else data


 if __name__ == "__main__":
--- a/tests_v1/accelerator/test_interface.py
+++ b/tests_v1/accelerator/test_interface.py
@@ -20,5 +20,7 @@ from llamafactory.v1.accelerator.interface import DistributedInterface, Distribu

 def test_distributed_interface():
    DistributedInterface(DistributedStrategy())
-    assert DistributedInterface.rank == int(os.getenv("RANK", "0"))
-    assert DistributedInterface.world_size == int(os.getenv("WORLD_SIZE", "1"))
+    assert DistributedInterface.get_rank() == int(os.getenv("RANK", "0"))
+    assert DistributedInterface.get_world_size() == int(os.getenv("WORLD_SIZE", "1"))
+    assert DistributedInterface.get_local_rank() == int(os.getenv("LOCAL_RANK", "0"))
+    assert DistributedInterface.get_local_world_size() == int(os.getenv("LOCAL_WORLD_SIZE", "1"))