Mirror of https://github.com/hiyouga/LLaMA-Factory.git
Synced 2025-12-28 09:40:34 +08:00

Compare commits: cff4483392 ... 4fd94141a4 (2 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 4fd94141a4 | |
| | 22d6ac29d5 | |
```diff
@@ -291,8 +291,8 @@ Read technical notes:
 | [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma/gemma2 |
 | [Gemma 3/Gemma 3n](https://huggingface.co/google) | 270M/1B/4B/6B/8B/12B/27B | gemma3/gemma3n |
 | [GLM-4/GLM-4-0414/GLM-Z1](https://huggingface.co/zai-org) | 9B/32B | glm4/glmz1 |
-| [GLM-4.1V/GLM-4.6V](https://huggingface.co/zai-org) | 9B/106B | glm4v |
-| [GLM-4.5/GLM-4.5V](https://huggingface.co/zai-org) | 106B/355B | glm4_moe/glm4v_moe |
+| [GLM-4.1V](https://huggingface.co/zai-org) | 9B | glm4v |
+| [GLM-4.5/GLM-4.5(6)V](https://huggingface.co/zai-org) | 9B/106B/355B | glm4_moe/glm4_5v |
 | [GPT-2](https://huggingface.co/openai-community) | 0.1B/0.4B/0.8B/1.5B | - |
 | [GPT-OSS](https://huggingface.co/openai) | 20B/120B | gpt |
 | [Granite 3.0-3.3](https://huggingface.co/ibm-granite) | 1B/2B/3B/8B | granite3 |
```
```diff
@@ -315,7 +315,7 @@ Read technical notes:
 | [MiMo](https://huggingface.co/XiaomiMiMo) | 7B | mimo |
 | [MiniCPM 1-4.1](https://huggingface.co/openbmb) | 0.5B/1B/2B/4B/8B | cpm/cpm3/cpm4 |
 | [MiniCPM-o-2.6/MiniCPM-V-2.6](https://huggingface.co/openbmb) | 8B | minicpm_o/minicpm_v |
-| [Ministral/Mistral-Nemo](https://huggingface.co/mistralai) | 8B/12B | ministral |
+| [Ministral(3)/Mistral-Nemo](https://huggingface.co/mistralai) | 3B/8B/12B/14B | ministral/ministral3 |
 | [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral |
 | [Mistral Small](https://huggingface.co/mistralai) | 24B | mistral_small |
 | [OLMo](https://huggingface.co/allenai) | 1B/7B | - |
```
```diff
@@ -293,8 +293,8 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc
 | [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma/gemma2 |
 | [Gemma 3/Gemma 3n](https://huggingface.co/google) | 270M/1B/4B/6B/8B/12B/27B | gemma3/gemma3n |
 | [GLM-4/GLM-4-0414/GLM-Z1](https://huggingface.co/zai-org) | 9B/32B | glm4/glmz1 |
-| [GLM-4.1V/GLM-4.6V](https://huggingface.co/zai-org) | 9B/106B | glm4v |
-| [GLM-4.5/GLM-4.5V](https://huggingface.co/zai-org) | 106B/355B | glm4_moe/glm4v_moe |
+| [GLM-4.1V](https://huggingface.co/zai-org) | 9B | glm4v |
+| [GLM-4.5/GLM-4.5(6)V](https://huggingface.co/zai-org) | 9B/106B/355B | glm4_moe/glm4_5v |
 | [GPT-2](https://huggingface.co/openai-community) | 0.1B/0.4B/0.8B/1.5B | - |
 | [GPT-OSS](https://huggingface.co/openai) | 20B/120B | gpt |
 | [Granite 3.0-3.3](https://huggingface.co/ibm-granite) | 1B/2B/3B/8B | granite3 |
```
```diff
@@ -317,7 +317,7 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc
 | [MiMo](https://huggingface.co/XiaomiMiMo) | 7B | mimo |
 | [MiniCPM 1-4.1](https://huggingface.co/openbmb) | 0.5B/1B/2B/4B/8B | cpm/cpm3/cpm4 |
 | [MiniCPM-o-2.6/MiniCPM-V-2.6](https://huggingface.co/openbmb) | 8B | minicpm_o/minicpm_v |
-| [Ministral/Mistral-Nemo](https://huggingface.co/mistralai) | 8B/12B | ministral |
+| [Ministral(3)/Mistral-Nemo](https://huggingface.co/mistralai) | 3B/8B/12B/14B | ministral/ministral3 |
 | [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral |
 | [Mistral Small](https://huggingface.co/mistralai) | 24B | mistral_small |
 | [OLMo](https://huggingface.co/allenai) | 1B/7B | - |
```
```diff
@@ -1128,7 +1128,7 @@ register_template(
 
 # copied from glm4 template
 register_template(
-    name="glm4v_moe",
+    name="glm4_5v",
     format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]),
     format_assistant=StringFormatter(slots=["\n{{content}}"]),
     format_system=StringFormatter(slots=["<|system|>\n{{content}}"]),
```
```diff
@@ -1687,6 +1687,19 @@ register_template(
 )
 
 
+register_template(
+    name="ministral3",
+    format_user=StringFormatter(slots=["[INST]{{content}}[/INST]"]),
+    format_system=StringFormatter(slots=["{{content}}\n\n"]),
+    format_function=FunctionFormatter(slots=["[TOOL_CALLS]{{content}}", {"eos_token"}], tool_format="mistral"),
+    format_observation=StringFormatter(slots=["""[TOOL_RESULTS]{"content": {{content}}}[/TOOL_RESULTS]"""]),
+    format_tools=ToolFormatter(tool_format="mistral"),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    template_class=Llama2Template,
+    mm_plugin=get_mm_plugin(name="pixtral", image_token="[IMG]"),
+)
+
+
 register_template(
     name="olmo",
     format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>\n"]),
```
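Since the new template delegates turn packing to `Llama2Template`, a sanity sketch of the prompt shape it should produce may help. The renderer below is a hypothetical stand-in that only mirrors the slot strings registered above (it is not LLaMA-Factory's encoder), and it assumes Llama2Template's usual behavior of folding the system text into the first user turn and a Mistral-family `<s>` bos token.

```python
# Hypothetical renderer mirroring the ministral3 slots above.
BOS = "<s>"  # assumed bos_token

def render_ministral3(system: str, turns: list[tuple[str, str]]) -> str:
    """Compose (user, assistant) turns the way the registered slots suggest."""
    prompt = BOS  # format_prefix: [{"bos_token"}]
    for i, (user, assistant) in enumerate(turns):
        if i == 0 and system:
            user = f"{system}\n\n{user}"  # format_system: "{{content}}\n\n"
        prompt += f"[INST]{user}[/INST]"  # format_user
        prompt += assistant
    return prompt

print(render_ministral3("You are helpful.", [("Hi!", "Hello!")]))
# -> <s>[INST]You are helpful.\n\nHi![/INST]Hello!
```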
```diff
@@ -141,6 +141,7 @@ class QuantizationMethod(str, Enum):
     EETQ = "eetq"
     HQQ = "hqq"
     MXFP4 = "mxfp4"
+    FP8 = "fp8"
 
 
 class RopeScaling(str, Enum):
```
```diff
@@ -969,14 +970,6 @@ register_model_group(
             DownloadSource.DEFAULT: "zai-org/GLM-4.1V-9B-Thinking",
             DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4.1V-9B-Thinking",
         },
-        "GLM-4.6V": {
-            DownloadSource.DEFAULT: "zai-org/GLM-4.6V",
-            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4.6V",
-        },
-        "GLM-4.6V-Flash": {
-            DownloadSource.DEFAULT: "zai-org/GLM-4.6V-Flash",
-            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4.6V-Flash",
-        },
     },
     template="glm4v",
     multimodal=True,
```
```diff
@@ -1011,9 +1004,17 @@ register_model_group(
         "GLM-4.5V-Air-Thinking": {
             DownloadSource.DEFAULT: "zai-org/GLM-4.5V",
             DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4.5V",
-        }
+        },
+        "GLM-4.6V": {
+            DownloadSource.DEFAULT: "zai-org/GLM-4.6V",
+            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4.6V",
+        },
+        "GLM-4.6V-Flash": {
+            DownloadSource.DEFAULT: "zai-org/GLM-4.6V-Flash",
+            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4.6V-Flash",
+        },
     },
-    template="glm4v_moe",
+    template="glm4_5v",
     multimodal=True,
 )
 
```
```diff
@@ -1977,6 +1978,25 @@ register_model_group(
     template="mistral",
 )
 
 
+register_model_group(
+    models={
+        "Ministral-3-3B-Instruct-2512": {
+            DownloadSource.DEFAULT: "mistralai/Ministral-3-3B-Instruct-2512",
+            DownloadSource.MODELSCOPE: "mistralai/Ministral-3-3B-Instruct-2512",
+        },
+        "Ministral-3-8B-Instruct-2512": {
+            DownloadSource.DEFAULT: "mistralai/Ministral-3-8B-Instruct-2512",
+            DownloadSource.MODELSCOPE: "mistralai/Ministral-3-8B-Instruct-2512",
+        },
+        "Ministral-3-14B-Instruct-2512": {
+            DownloadSource.DEFAULT: "mistralai/Ministral-3-14B-Instruct-2512",
+            DownloadSource.MODELSCOPE: "mistralai/Ministral-3-14B-Instruct-2512",
+        },
+    },
+    template="ministral3",
+    multimodal=True,
+)
+
 register_model_group(
     models={
```
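As a usage note, registration makes the new checkpoints addressable by display name. A hypothetical lookup, assuming the constants module exposes the upstream `SUPPORTED_MODELS` mapping that `register_model_group` populates:

```python
# Hypothetical lookup; assumes SUPPORTED_MODELS maps a display name to a
# {DownloadSource: repo_id} dict, as register_model_group populates upstream.
from llamafactory.extras.constants import SUPPORTED_MODELS, DownloadSource

paths = SUPPORTED_MODELS["Ministral-3-8B-Instruct-2512"]
print(paths[DownloadSource.DEFAULT])  # mistralai/Ministral-3-8B-Instruct-2512
```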
```diff
@@ -22,7 +22,7 @@ from typing import TYPE_CHECKING, Any
 
 import torch
 from datasets import load_dataset
-from transformers import BitsAndBytesConfig, EetqConfig, GPTQConfig, HqqConfig
+from transformers import BitsAndBytesConfig, EetqConfig, FineGrainedFP8Config, GPTQConfig, HqqConfig
 from transformers.integrations import is_deepspeed_zero3_enabled
 from transformers.modeling_utils import is_fsdp_enabled
 
```
```diff
@@ -83,6 +83,7 @@ def configure_quantization(
     config: "PretrainedConfig",
     tokenizer: "PreTrainedTokenizer",
     model_args: "ModelArguments",
+    is_trainable: bool,
     init_kwargs: dict[str, Any],
 ) -> None:
     r"""Priority: PTQ-quantized (train/infer) > AutoGPTQ (export) > On-the-fly quantization (train/infer)."""
```
```diff
@@ -109,6 +110,10 @@ def configure_quantization(
             check_version("aqlm>=1.1.0", mandatory=True)
             quantization_config["bits"] = 2
 
+        if quant_method == QuantizationMethod.FP8 and is_trainable:
+            quant_config = FineGrainedFP8Config(dequantize=True)
+            init_kwargs["quantization_config"] = quant_config
+
         quant_bits = quantization_config.get("bits", "?")
         logger.info_rank0(f"Loading {quant_bits}-bit {quant_method.upper()}-quantized model.")
 
```
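The intent of the new branch, as far as the diff shows: when a checkpoint was post-training-quantized to FP8 and the model is being loaded for training, passing `FineGrainedFP8Config(dequantize=True)` asks Transformers to expand the FP8 weights back to higher precision at load time so gradients can flow through ordinary linear layers. A minimal standalone sketch of the same load path, with a placeholder checkpoint name:

```python
from transformers import AutoModelForCausalLM, FineGrainedFP8Config

# Sketch: load an FP8 post-training-quantized checkpoint for fine-tuning.
# dequantize=True (the flag used in the diff) restores high-precision weights,
# trading memory for trainability; inference-only loads would omit it.
model = AutoModelForCausalLM.from_pretrained(
    "org/fp8-quantized-model",  # placeholder repo id
    quantization_config=FineGrainedFP8Config(dequantize=True),
)
```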
```diff
@@ -301,6 +301,7 @@ _register_composite_model(
 
 _register_composite_model(
     model_type="mistral3",
+    projector_key="model.multi_modal_projector",
 )
 
 
```
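Presumably the `projector_key` tells the composite-model logic where the multimodal projector lives in the mistral3 module tree, so its parameters can be matched by prefix (for example when deciding what to freeze or train). A toy sketch of prefix matching under that assumption; the model below is a stand-in, not the real architecture:

```python
import torch.nn as nn

# Stand-in module tree; only the parameter naming matters for this sketch.
model = nn.ModuleDict({"model": nn.ModuleDict({"multi_modal_projector": nn.Linear(16, 16)})})

projector_key = "model.multi_modal_projector"  # key registered in the diff above
for name, param in model.named_parameters():
    # e.g. train only the projector, freeze everything else
    param.requires_grad = name.startswith(projector_key)
```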
```diff
@@ -115,7 +115,7 @@ def patch_config(
     configure_attn_implementation(config, model_args)
     configure_rope(config, model_args)
     configure_longlora(config, model_args, is_trainable)
-    configure_quantization(config, tokenizer, model_args, init_kwargs)
+    configure_quantization(config, tokenizer, model_args, is_trainable, init_kwargs)
     configure_moe(config, model_args, is_trainable)
     configure_visual_model(config)
     configure_packing(model_args, is_trainable)
```
```diff
@@ -78,7 +78,7 @@ def run_sft(
     gen_kwargs = generating_args.to_dict(obey_generation_config=True)
 
     # Compatible with Transformers v4 and Transformers v5
-    if is_transformers_version_greater_than("5.0.0RC0"):
+    if is_transformers_version_greater_than("4.58.0"):
         extra_ids = getattr(tokenizer, "additional_special_tokens_ids", None)
         if not isinstance(extra_ids, list):
             extra_special_tokens = getattr(tokenizer, "_extra_special_tokens", [])
```
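Lowering the gate from `5.0.0RC0` to `4.58.0` makes the v5-style fallback for special-token ids engage on late 4.x releases as well. A sketch of the helper's presumed semantics; the real implementation lives in LLaMA-Factory's extras, and this stand-in just compares installed versions:

```python
# Stand-in for is_transformers_version_greater_than; assumes ">= version"
# semantics, matching how the call site above reads.
from packaging import version
import transformers

def is_transformers_version_greater_than(target: str) -> bool:
    return version.parse(transformers.__version__) >= version.parse(target)

print(is_transformers_version_greater_than("4.58.0"))
```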