diff --git a/scripts/megatron_merge.py b/scripts/megatron_merge.py index 4d9d932cd..e8e9e12a3 100644 --- a/scripts/megatron_merge.py +++ b/scripts/megatron_merge.py @@ -71,6 +71,7 @@ def convert( pipeline_model_parallel_size: int = 1, expert_model_parallel_size: int = 1, virtual_pipeline_model_parallel_size: int | None = None, + moe_grouped_gemm: bool | None = None, ): """Convert checkpoint between MCA and HuggingFace formats. @@ -84,6 +85,10 @@ def convert( pipeline_model_parallel_size: Pipeline model parallel size expert_model_parallel_size: Expert model parallel size virtual_pipeline_model_parallel_size: Virtual pipeline model parallel size + moe_grouped_gemm: Use grouped gemm for MoE experts. When enabled, expert + weights are stored in a flattened format (linear_fc1.weight0, weight1, ...) + rather than per-expert format (local_experts.0.linear_fc1.weight, ...). + Must match the format used when saving the checkpoint. """ if bf16 and fp16: raise ValueError("bf16 and fp16 cannot be both True.") @@ -97,8 +102,9 @@ def convert( pipeline_model_parallel_size=pipeline_model_parallel_size, expert_model_parallel_size=expert_model_parallel_size, virtual_pipeline_model_parallel_size=virtual_pipeline_model_parallel_size, + moe_grouped_gemm=moe_grouped_gemm, + transformer_impl="transformer_engine", # hard code here since we default using te for training ) - convert_checkpoint_to_mca( checkpoint_path, output_path, diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index 7e23e3162..0d2cae65b 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -69,6 +69,8 @@ MCA_SUPPORTED_MODELS = { "qwen3", "qwen3_moe", "qwen3_next", + "qwen3_5", + "qwen3_5_moe", } METHODS = ["full", "freeze", "lora", "oft"] diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index 308eecade..ab47a088a 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ 
-470,7 +470,7 @@ def get_train_args(args: dict[str, Any] | list[str] | None = None) -> _TRAIN_CLS training_args.resume_from_checkpoint is None and training_args.do_train and os.path.isdir(training_args.output_dir) - and not training_args.overwrite_output_dir + and not getattr(training_args, "overwrite_output_dir", False) # for mca training args and transformers >= 5.0 and can_resume_from_checkpoint ): last_checkpoint = get_last_checkpoint(training_args.output_dir) diff --git a/src/llamafactory/train/callbacks.py b/src/llamafactory/train/callbacks.py index ac574ae7c..77507c848 100644 --- a/src/llamafactory/train/callbacks.py +++ b/src/llamafactory/train/callbacks.py @@ -228,7 +228,7 @@ class LogCallback(TrainerCallback): if ( args.should_save and os.path.exists(os.path.join(args.output_dir, TRAINER_LOG)) - and args.overwrite_output_dir + and getattr(args, "overwrite_output_dir", False) ): logger.warning_rank0_once("Previous trainer log in this folder will be deleted.") os.remove(os.path.join(args.output_dir, TRAINER_LOG)) diff --git a/src/llamafactory/train/mca/workflow.py b/src/llamafactory/train/mca/workflow.py index 142e86526..812ae5830 100644 --- a/src/llamafactory/train/mca/workflow.py +++ b/src/llamafactory/train/mca/workflow.py @@ -13,6 +13,8 @@ # limitations under the License. 
import functools +import json +import os from collections.abc import Sequence from copy import deepcopy from typing import TYPE_CHECKING, Any, Optional @@ -77,20 +79,25 @@ def _data_collator_wrapper(data_collator: Any): def _check_model_support(model_args: "ModelArguments"): from transformers import AutoConfig as HfAutoConfig + if os.path.exists(os.path.join(model_args.model_name_or_path, "mca_config.json")): # load from mcore ckpt + with open(os.path.join(model_args.model_name_or_path, "mca_config.json")) as f: mca_config = json.load(f) + model_type = mca_config.get("hf_model_type", None) + else: + config = HfAutoConfig.from_pretrained( + model_args.model_name_or_path, trust_remote_code=model_args.trust_remote_code + ) + model_type = config.model_type - config = HfAutoConfig.from_pretrained( - model_args.model_name_or_path, trust_remote_code=model_args.trust_remote_code - ) - if config.model_type not in MCA_SUPPORTED_MODELS: + if model_type not in MCA_SUPPORTED_MODELS: raise ValueError( - f"Model {config.model_type} is not supported by mcore_adapter." + f"Model {model_type} is not supported by mcore_adapter. " "You can try to upgrade mcore_adapter to the latest version for more supported models." ) def _freeze_model_parameters(model: Any, finetuning_args: "FinetuningArguments"): """Freeze model parameters for qwen_vl series models based on finetuning arguments.""" - if getattr(model.config, "hf_model_type", None) not in ["qwen2_vl", "qwen2_5_vl", "qwen3_vl", "qwen3_vl_moe"]: + if getattr(model.config, "hf_model_type", None) not in ["qwen2_vl", "qwen2_5_vl", "qwen3_vl", "qwen3_vl_moe", "qwen3_5", "qwen3_5_moe"]: return params_to_freeze = []