From 48974783da7acc45dc417409b3d2afb366e2d44e Mon Sep 17 00:00:00 2001
From: Jiayi Mao <77279783+MJy1023@users.noreply.github.com>
Date: Mon, 13 Oct 2025 13:13:31 +0800
Subject: [PATCH] [model]: add ernie4_5_moe support for DeepSpeed Zero3 training (#9262)

---
 src/llamafactory/model/model_utils/moe.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/llamafactory/model/model_utils/moe.py b/src/llamafactory/model/model_utils/moe.py
index 1a27e34b..0a541415 100644
--- a/src/llamafactory/model/model_utils/moe.py
+++ b/src/llamafactory/model/model_utils/moe.py
@@ -55,6 +55,11 @@ def add_z3_leaf_module(model: "PreTrainedModel") -> None:
         # deepseek v3 and kimi vl use custom code
         _set_z3_leaf_modules(model, ["DeepseekV3MoE"])
 
+    if model_type == "ernie4_5_moe":
+        from transformers.models.ernie4_5_moe.modeling_ernie4_5_moe import Ernie4_5_MoeSparseMoeBlock
+
+        _set_z3_leaf_modules(model, [Ernie4_5_MoeSparseMoeBlock])
+
     if model_type == "granitemoe":
         from transformers.models.granitemoe.modeling_granitemoe import GraniteMoeMoE
 
@@ -130,6 +135,7 @@ def configure_moe(config: "PretrainedConfig", model_args: "ModelArguments", is_t
 
     if model_type in [
         "dbrx",
+        "ernie4_5_moe",
         "granitemoe",
         "jamba",
         "jetmoe",
@@ -148,7 +154,7 @@ def configure_moe(config: "PretrainedConfig", model_args: "ModelArguments", is_t
     ]:
         setattr(text_config, "output_router_logits", True)
 
-    if model_type in ["granitemoe", "jamba", "llama4", "mixtral", "olmoe", "phimoe", "qwen2_moe", "qwen3_moe"]:
+    if model_type in ["ernie4_5_moe", "granitemoe", "jamba", "llama4", "mixtral", "olmoe", "phimoe", "qwen2_moe", "qwen3_moe"]:
         setattr(config, "router_aux_loss_coef", model_args.moe_aux_loss_coef)
     elif text_config and getattr(text_config, "model_type", None) in ["qwen3_moe"]:
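
Note: _set_z3_leaf_modules used above is LLaMA-Factory's thin wrapper around DeepSpeed's
leaf-module registration. A minimal standalone sketch of what the new ernie4_5_moe branch
effectively does, assuming a DeepSpeed release that exposes deepspeed.utils.set_z3_leaf_modules
(roughly >= 0.13); the helper name below is hypothetical and only for illustration:

    # Mark the Ernie 4.5 sparse-MoE block as a ZeRO-3 "leaf" module so DeepSpeed gathers
    # all of its expert parameters in one step instead of hooking each expert separately;
    # per-expert gathering can stall when ranks route tokens to different experts.
    from deepspeed.utils import set_z3_leaf_modules
    from transformers.models.ernie4_5_moe.modeling_ernie4_5_moe import Ernie4_5_MoeSparseMoeBlock

    def mark_ernie_moe_as_z3_leaf(model):
        # 'model' is an Ernie 4.5 MoE model already loaded via transformers.
        set_z3_leaf_modules(model, [Ernie4_5_MoeSparseMoeBlock])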