add pixtral template

Former-commit-id: 86f5a9be548ef02ce334bba35a529c70e8b3ad7f
Kingsley 2024-09-26 12:11:58 +08:00
parent 944ae8780c
commit c436d6ea0b
4 changed files with 61 additions and 0 deletions

View File

@@ -323,6 +323,12 @@ class PaliGemmaPlugin(BasePlugin):
        mm_inputs["token_type_ids"] = _get_paligemma_token_type_ids(imglens, seqlens, processor)
        return mm_inputs


class PixtralPlugin(BasePlugin):
    # TODO: preprocess images according to the Pixtral implementation on Hugging Face.
    @override
    def _preprocess_image(self, image: "ImageObject", **kwargs) -> "ImageObject":
        # Placeholder until Pixtral-specific preprocessing is implemented:
        # delegate to the base plugin so the annotated return type holds.
        return super()._preprocess_image(image, **kwargs)


class Qwen2vlPlugin(BasePlugin):
    @override

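The _preprocess_image stub above currently falls through to the base plugin. For orientation, a minimal sketch of what Pixtral-specific preprocessing could look like, assuming Hugging Face's behavior of capping the longest side at the configured image size and rounding each side up to a multiple of the 16-pixel patch size; the constants and helper name are illustrative, not from this repository:

import math

from PIL import Image
from PIL.Image import Image as ImageObject

# Assumed values, mirroring the Pixtral vision config defaults used later in
# this commit (image_size=1024, patch_size=16).
MAX_IMAGE_SIZE = 1024
PATCH_SIZE = 16


def preprocess_pixtral_image(image: "ImageObject") -> "ImageObject":
    """Cap the longest side at MAX_IMAGE_SIZE and align both sides to PATCH_SIZE."""
    width, height = image.size
    scale = min(1.0, MAX_IMAGE_SIZE / max(width, height))
    new_width = max(PATCH_SIZE, math.ceil(width * scale / PATCH_SIZE) * PATCH_SIZE)
    new_height = max(PATCH_SIZE, math.ceil(height * scale / PATCH_SIZE) * PATCH_SIZE)
    if image.mode != "RGB":
        image = image.convert("RGB")
    return image.resize((new_width, new_height), resample=Image.Resampling.LANCZOS)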
View File

@@ -821,6 +821,13 @@ _register_template(
    replace_eos=True,
)


_register_template(
    name="pixtral",
    format_user=StringFormatter(slots=["[INST] {{content}} [/INST]"]),
    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
    mm_plugin=get_mm_plugin(name="pixtral", image_token="[IMG]"),
)


_register_template(
    name="qwen",

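For reference, a toy re-implementation (not LLaMA-Factory's actual formatter machinery) of the prompt this template produces for one user turn, assuming the {"bos_token"} slot resolves to the tokenizer's BOS token, which is "<s>" for Mistral-family models:

def render_pixtral_turn(content: str, bos_token: str = "<s>") -> str:
    prefix = bos_token                        # EmptyFormatter(slots=[{"bos_token"}])
    user_turn = f"[INST] {content} [/INST]"   # StringFormatter user slot
    return prefix + user_turn


print(render_pixtral_turn("[IMG] What is in this picture?"))
# -> <s>[INST] [IMG] What is in this picture? [/INST]

The registered mm_plugin is then responsible for handling each [IMG] placeholder during multimodal preprocessing.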
View File

@@ -894,6 +894,16 @@ register_model_group(
    template="mistral",
)


register_model_group(
    models={
        "Pixtral-12B-2409": {
            DownloadSource.DEFAULT: "mistral-community/pixtral-12b",
            DownloadSource.MODELSCOPE: "AI-ModelScope/pixtral-12b",
        }
    },
    template="pixtral",  # use the "pixtral" template added in this commit so its mm_plugin takes effect
)


register_model_group(
    models={

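As a consumption sketch (assumed names, not the repository's actual resolver), an entry like the one above maps a model identifier to a concrete hub path depending on the selected download source:

from enum import Enum


class DownloadSource(str, Enum):
    DEFAULT = "hf"      # Hugging Face Hub
    MODELSCOPE = "ms"   # ModelScope Hub


# Illustrative copy of the mapping registered above.
SUPPORTED_MODELS = {
    "Pixtral-12B-2409": {
        DownloadSource.DEFAULT: "mistral-community/pixtral-12b",
        DownloadSource.MODELSCOPE: "AI-ModelScope/pixtral-12b",
    },
}


def resolve_model_path(name: str, use_modelscope: bool = False) -> str:
    sources = SUPPORTED_MODELS[name]
    source = DownloadSource.MODELSCOPE if use_modelscope else DownloadSource.DEFAULT
    return sources.get(source, sources[DownloadSource.DEFAULT])


assert resolve_model_path("Pixtral-12B-2409") == "mistral-community/pixtral-12b"
assert resolve_model_path("Pixtral-12B-2409", use_modelscope=True) == "AI-ModelScope/pixtral-12b"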
View File

@@ -119,6 +119,44 @@ def load_config(model_args: "ModelArguments") -> "PretrainedConfig":
    Loads model config.
    """
    init_kwargs = _get_init_kwargs(model_args)
    if "pixtral" in model_args.model_name_or_path.lower():
        # Stopgap: AutoConfig cannot resolve the "pixtral" model type yet, so
        # build a vision config locally from the released Pixtral-12B
        # hyperparameters. Note that this returns the defaults below instead
        # of reading the checkpoint's config.json.
        from transformers import PretrainedConfig

        class PixtralVisionConfig(PretrainedConfig):
            model_type = "pixtral"

            def __init__(
                self,
                hidden_size=1024,
                intermediate_size=4096,
                num_hidden_layers=24,
                num_attention_heads=16,
                num_channels=3,
                image_size=1024,
                patch_size=16,
                hidden_act="gelu",
                attention_dropout=0.0,
                rope_theta=10000.0,
                tie_word_embeddings=False,
                **kwargs,
            ):
                super().__init__(**kwargs)
                self.hidden_size = hidden_size
                self.intermediate_size = intermediate_size
                self.num_hidden_layers = num_hidden_layers
                self.num_attention_heads = num_attention_heads
                self.num_channels = num_channels
                self.patch_size = patch_size
                self.image_size = image_size
                self.attention_dropout = attention_dropout
                self.hidden_act = hidden_act
                self.rope_theta = rope_theta
                self.tie_word_embeddings = tie_word_embeddings
                self.head_dim = hidden_size // num_attention_heads

        return PixtralVisionConfig()

    return AutoConfig.from_pretrained(model_args.model_name_or_path, **init_kwargs)
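
Once a transformers release registers the pixtral model type natively, the branch above can be dropped in favor of a plain AutoConfig call; a hedged sketch (the vision_config attribute reflects the checkpoint's LLaVA-style wrapper and is an assumption here):

from transformers import AutoConfig

config = AutoConfig.from_pretrained("mistral-community/pixtral-12b")
# Multimodal wrappers nest the vision tower settings in a sub-config;
# fall back to the top-level config if none is present.
vision_config = getattr(config, "vision_config", config)
print(vision_config.hidden_size, vision_config.patch_size)  # expected: 1024 16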