Mirror of https://github.com/hiyouga/LLaMA-Factory.git (synced 2026-03-12 23:16:04 +08:00)
[mca] support qwen3.5 (#10265)
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
@@ -71,6 +71,7 @@ def convert(
    pipeline_model_parallel_size: int = 1,
    expert_model_parallel_size: int = 1,
    virtual_pipeline_model_parallel_size: int | None = None,
    moe_grouped_gemm: bool | None = None,
):
    """Convert checkpoint between MCA and HuggingFace formats.

@@ -84,6 +85,10 @@ def convert(
        pipeline_model_parallel_size: Pipeline model parallel size
        expert_model_parallel_size: Expert model parallel size
        virtual_pipeline_model_parallel_size: Virtual pipeline model parallel size
        moe_grouped_gemm: Use grouped gemm for MoE experts. When enabled, expert
            weights are stored in a flattened format (linear_fc1.weight0, weight1, ...)
            rather than per-expert format (local_experts.0.linear_fc1.weight, ...).
            Must match the format used when saving the checkpoint.
    """
    if bf16 and fp16:
        raise ValueError("bf16 and fp16 cannot be both True.")

@@ -97,8 +102,9 @@ def convert(
        pipeline_model_parallel_size=pipeline_model_parallel_size,
        expert_model_parallel_size=expert_model_parallel_size,
        virtual_pipeline_model_parallel_size=virtual_pipeline_model_parallel_size,
        moe_grouped_gemm=moe_grouped_gemm,
        transformer_impl="transformer_engine",  # hard code here since we default using te for training
    )

    convert_checkpoint_to_mca(
        checkpoint_path,
        output_path,
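For readers unfamiliar with the two MoE weight layouts named in the moe_grouped_gemm docstring above, the following minimal sketch shows how per-expert keys relate to the flattened grouped-GEMM names. This is illustrative only, not LLaMA-Factory's actual conversion code, and the "decoder.layers..." prefixes are made-up examples.

import re

def to_grouped_gemm_keys(state_dict_keys):
    # Rename per-expert keys (local_experts.<i>.linear_fc1.weight)
    # to the flattened grouped-GEMM layout (linear_fc1.weight<i>).
    pattern = re.compile(r"local_experts\.(\d+)\.(linear_fc[12])\.weight")
    return [pattern.sub(r"\2.weight\1", key) for key in state_dict_keys]

per_expert_keys = [
    "decoder.layers.0.mlp.experts.local_experts.0.linear_fc1.weight",
    "decoder.layers.0.mlp.experts.local_experts.1.linear_fc1.weight",
]
print(to_grouped_gemm_keys(per_expert_keys))
# ['decoder.layers.0.mlp.experts.linear_fc1.weight0',
#  'decoder.layers.0.mlp.experts.linear_fc1.weight1']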
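For context, a hypothetical invocation of convert could look like the sketch below. It assumes checkpoint_path, output_path, and bf16 are also parameters of convert (they are referenced in the function body, but their exact position in the signature lies outside these hunks); all paths and parallel sizes are placeholders.

convert(
    checkpoint_path="path/to/input_checkpoint",
    output_path="path/to/output_checkpoint",
    bf16=True,
    pipeline_model_parallel_size=1,
    expert_model_parallel_size=4,
    virtual_pipeline_model_parallel_size=None,
    moe_grouped_gemm=True,  # must match the format used when the MCA checkpoint was saved
)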