From 1f338deb875dce97f5d25e6f05e954b8a70fb8ef Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Mon, 28 Apr 2025 18:11:09 +0800 Subject: [PATCH] [model] fix dsv3 leaf node (#7879) --- src/llamafactory/data/mm_plugin.py | 4 +++- src/llamafactory/model/model_utils/moe.py | 15 +++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index 8b5d23d9..b653f57b 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -1671,7 +1671,9 @@ class Qwen2OmniPlugin(Qwen2VLPlugin): if num_video_tokens >= len(videos): raise ValueError(f"`len(videos)` is less than the number of {VIDEO_PLACEHOLDER} tokens.") - video_seqlen = video_grid_thw[num_video_tokens].prod() // merge_length if self.expand_mm_tokens else 1 + video_seqlen = ( + video_grid_thw[num_video_tokens].prod() // merge_length if self.expand_mm_tokens else 1 + ) content = content.replace( VIDEO_PLACEHOLDER, f"<|vision_bos|>{self.video_token * video_seqlen}<|vision_eos|>", 1 ) diff --git a/src/llamafactory/model/model_utils/moe.py b/src/llamafactory/model/model_utils/moe.py index ec6e1e38..51f289f4 100644 --- a/src/llamafactory/model/model_utils/moe.py +++ b/src/llamafactory/model/model_utils/moe.py @@ -12,21 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Union -import torch from transformers.integrations import is_deepspeed_zero3_enabled from ...extras.misc import check_version if TYPE_CHECKING: + from torch import nn from transformers import PretrainedConfig, PreTrainedModel from ...hparams import ModelArguments -def _set_z3_leaf_modules(model: "PreTrainedModel", leaf_modules: list["torch.nn.Module"]) -> None: +def _set_z3_leaf_modules(model: "PreTrainedModel", leaf_modules: list[Union["nn.Module", str]]) -> None: check_version("deepspeed>=0.13.0") from deepspeed.utils import set_z3_leaf_modules # type: ignore @@ -44,10 +44,13 @@ def add_z3_leaf_module(model: "PreTrainedModel") -> None: _set_z3_leaf_modules(model, [DbrxFFN]) - if model_type == "deepseek_v3": - from transformers.models.deepseek_v3.modeling_deepseek_v3 import DeepseekV3MoE + if model_type == "deepseek_v2": + # deepseek v2 uses custom code + _set_z3_leaf_modules(model, ["DeepseekV2MoE"]) - _set_z3_leaf_modules(model, [DeepseekV3MoE]) + if model_type == "deepseek_v3" or model_type == "kimi_vl": + # deepseek v3 and kimi vl use custom code + _set_z3_leaf_modules(model, ["DeepseekV3MoE"]) if model_type == "granitemoe": from transformers.models.granitemoe.modeling_granitemoe import GraniteMoeMoE