From f00f4ae9b6c944220a66538ac54c7d2a18d6474d Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Tue, 10 Sep 2024 12:31:53 +0800
Subject: [PATCH 01/33] support llava-next(video)

Former-commit-id: 31259e7e0caa9ff6449b4abcee0554e211167178
---
 README.md                                    |  58 +++---
 README_zh.md                                 |   2 +
 requirements.txt                             |   1 +
 setup.py                                     |   1 +
 src/llamafactory/data/mm_plugin.py           | 178 ++++++++++++++++++++
 src/llamafactory/data/template.py            |  43 +++++
 src/llamafactory/extras/constants.py         |  71 ++++++++
 src/llamafactory/model/loader.py             |  12 +-
 src/llamafactory/model/model_utils/misc.py   |   2 +-
 src/llamafactory/model/model_utils/visual.py |   4 +-
 tests/data/test_mm_plugin.py                 |  55 +++++-
 11 files changed, 394 insertions(+), 33 deletions(-)

diff --git a/README.md b/README.md
index 8bc99730..963ca26a 100644
--- a/README.md
+++ b/README.md
@@ -160,34 +160,36 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 
 ## Supported Models
 
-| Model                                                             | Model size                       | Template  |
-| ----------------------------------------------------------------- | -------------------------------- | --------- |
-| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 |
-| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - |
-| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 |
-| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere |
-| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek |
-| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon |
-| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma |
-| [GLM-4](https://huggingface.co/THUDM) | 9B | glm4 |
-| [InternLM2/InternLM2.5](https://huggingface.co/internlm) | 7B/20B | intern2 |
-| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - |
-| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 |
-| [Llama 3/Llama 3.1](https://huggingface.co/meta-llama) | 8B/70B | llama3 |
-| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava |
-| [MiniCPM](https://huggingface.co/openbmb) | 1B/2B/4B | cpm/cpm3 |
-| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral |
-| [OLMo](https://huggingface.co/allenai) | 1B/7B | - |
-| [PaliGemma](https://huggingface.co/google) | 3B | paligemma |
-| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - |
-| [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi |
-| [Qwen/Qwen1.5/Qwen2 (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/4B/7B/14B/32B/72B/110B | qwen |
-| [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B | qwen2_vl |
-| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - |
-| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse |
-| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi |
-| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl |
-| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan |
+| Model                                                             | Model size                       | Template         |
+|-------------------------------------------------------------------| -------------------------------- |------------------|
+| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 |
+| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - |
+| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 |
+| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere |
+| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek |
+| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon |
+| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma |
+| [GLM-4](https://huggingface.co/THUDM) | 9B | glm4 |
+| [InternLM2/InternLM2.5](https://huggingface.co/internlm) | 7B/20B | intern2 |
+| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - |
+| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 |
+| [Llama 3/Llama 3.1](https://huggingface.co/meta-llama) | 8B/70B | llama3 |
+| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava |
+| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/13B | llava_next |
+| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/13B | llava_next_video |
+| [MiniCPM](https://huggingface.co/openbmb) | 1B/2B/4B | cpm/cpm3 |
+| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral |
+| [OLMo](https://huggingface.co/allenai) | 1B/7B | - |
+| [PaliGemma](https://huggingface.co/google) | 3B | paligemma |
+| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - |
+| [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi |
+| [Qwen/Qwen1.5/Qwen2 (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/4B/7B/14B/32B/72B/110B | qwen |
+| [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B | qwen2_vl |
+| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - |
+| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse |
+| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi |
+| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl |
+| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan |
 
 > [!NOTE]
 > For the "base" models, the `template` argument can be chosen from `default`, `alpaca`, `vicuna` etc. But make sure to use the **corresponding template** for the "instruct/chat" models.
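As a quick illustration of what the two new table entries mean in practice, here is a minimal usage sketch (not part of the patch) that exercises the `llava_next_video` plugin registered further down in this change. It assumes LLaMA-Factory is installed from this branch, that `get_mm_plugin` keeps the signature used in `src/llamafactory/data/template.py` below, and that `<image>`/`<video>` are the placeholder strings behind `IMAGE_PLACEHOLDER`/`VIDEO_PLACEHOLDER`:

```python
# Minimal sketch, not part of the patch: exercise the llava_next_video plugin
# added in src/llamafactory/data/mm_plugin.py below. Assumes this branch is
# installed and that "<image>"/"<video>" match IMAGE_PLACEHOLDER/VIDEO_PLACEHOLDER.
from llamafactory.data.mm_plugin import get_mm_plugin

plugin = get_mm_plugin(name="llava_next_video", image_token="<image>", video_token="<video>")

messages = [
    {"role": "user", "content": "<video>What happens in this clip?"},
    {"role": "assistant", "content": "A cat chases a laser pointer."},
]

# process_messages only counts and validates the placeholders at this stage, so the
# (hypothetical) video file is never opened; a ValueError is raised on a count mismatch.
processed = plugin.process_messages(messages, images=[], videos=["demo.mp4"], processor=None)
print(processed)
```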
diff --git a/README_zh.md b/README_zh.md
index e80a2104..251b1f87 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -176,6 +176,8 @@ https://github.com/user-attachments/assets/e6ce34b0-52d5-4f3e-a830-592106c4c272
 | [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 |
 | [Llama 3/Llama 3.1](https://huggingface.co/meta-llama) | 8B/70B | llama3 |
 | [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava |
+| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/13B | llava_next |
+| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/13B | llava_next_video |
 | [MiniCPM](https://huggingface.co/openbmb) | 1B/2B/4B | cpm/cpm3 |
 | [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral |
 | [OLMo](https://huggingface.co/allenai) | 1B/7B | - |
diff --git a/requirements.txt b/requirements.txt
index 54d58bb3..1c1b4c55 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,3 +19,4 @@ fire
 packaging
 pyyaml
 numpy<2.0.0
+av
\ No newline at end of file
diff --git a/setup.py b/setup.py
index a80cb81b..5e969e51 100644
--- a/setup.py
+++ b/setup.py
@@ -61,6 +61,7 @@ extra_require = {
     "qwen": ["transformers_stream_generator"],
     "modelscope": ["modelscope"],
     "dev": ["ruff", "pytest"],
+    "av": ["av>=13.0.0"],
 }
 
 
diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py
index c109d26e..22c49468 100644
--- a/src/llamafactory/data/mm_plugin.py
+++ b/src/llamafactory/data/mm_plugin.py
@@ -209,6 +209,50 @@ class BasePlugin:
         return {}
 
 
+class Idefics2Plugin(BasePlugin):
+    @override
+    def process_messages(
+        self,
+        messages: Sequence[Dict[str, str]],
+        images: Sequence["ImageInput"],
+        videos: Sequence["VideoInput"],
+        processor: Optional["ProcessorMixin"],
+    ) -> List[Dict[str, str]]:
+        self._validate_input(images, videos)
+        num_image_tokens = 0
+        messages = deepcopy(messages)
+        fake_image_token = processor.fake_image_token.content
+        image_str = f"{fake_image_token}{self.image_token * processor.image_seq_len}{fake_image_token}"
+        image_str = image_str * 5  # assumes image splitting is enabled: 4 crops + the original image = 5 sub-images
+
+        for message in messages:
+            content = message["content"]
+            while IMAGE_PLACEHOLDER in content:
+                num_image_tokens += 1
+                content = content.replace(IMAGE_PLACEHOLDER, "{{image}}", 1)
+            content = content.replace("{{image}}", image_str)
+            content = content.replace(f"{fake_image_token}{fake_image_token}", f"{fake_image_token}")
+            message["content"] = content
+
+        if len(images) != num_image_tokens:
+            raise ValueError("The number of images does not match the number of {} tokens".format(IMAGE_PLACEHOLDER))
+
+        return messages
+
+    @override
+    def get_mm_inputs(
+        self,
+        images: Sequence["ImageInput"],
+        videos: Sequence["VideoInput"],
+        imglens: Sequence[int],
+        vidlens: Sequence[int],
+        seqlens: Sequence[int],
+        processor: Optional["ProcessorMixin"],
+    ) -> Dict[str, Union[List[int], "torch.Tensor"]]:
+        self._validate_input(images, videos)
+        return _get_mm_inputs(images, videos, processor)
+
+
 class LlavaPlugin(BasePlugin):
     @override
     def process_messages(
@@ -249,6 +293,92 @@ class LlavaPlugin(BasePlugin):
         return _get_mm_inputs(images, videos, processor)
 
 
+class LlavaNextPlugin(BasePlugin):
+    @override
+    def process_messages(
+        self,
+        messages: Sequence[Dict[str, str]],
+        images: Sequence["ImageInput"],
+        videos: Sequence["VideoInput"],
+        processor: Optional["ProcessorMixin"],
+    ) -> List[Dict[str, str]]:
+        self._validate_input(images, videos)
+        num_image_tokens = 0
+        messages = deepcopy(messages)
+        for message in messages:
+            content = message["content"]
+            while IMAGE_PLACEHOLDER in content:
+                num_image_tokens += 1
+                content = content.replace(IMAGE_PLACEHOLDER, "{{image}}", 1)
+
+        if len(images) != num_image_tokens:
+            raise ValueError("The number of images does not match the number of {} tokens".format(IMAGE_PLACEHOLDER))
+
+        return messages
+
+    @override
+    def get_mm_inputs(
+        self,
+        images: Sequence["ImageInput"],
+        videos: Sequence["VideoInput"],
+        imglens: Sequence[int],
+        vidlens: Sequence[int],
+        seqlens: Sequence[int],
+        processor: Optional["ProcessorMixin"],
+    ) -> Dict[str, Union[List[int], "torch.Tensor"]]:
+        self._validate_input(images, videos)
+        return _get_mm_inputs(images, videos, processor)
+
+
+class LlavaNextVideoPlugin(BasePlugin):
+    @override
+    def process_messages(
+        self,
+        messages: Sequence[Dict[str, str]],
+        images: Sequence["ImageInput"],
+        videos: Sequence["VideoInput"],
+        processor: Optional["ProcessorMixin"],
+    ) -> List[Dict[str, str]]:
+        self._validate_input(images, videos)
+        num_image_tokens = 0
+        num_video_tokens = 0
+        messages = deepcopy(messages)
+        for message in messages:
+            content = message["content"]
+            while IMAGE_PLACEHOLDER in content:
+                num_image_tokens += 1
+                content = content.replace(IMAGE_PLACEHOLDER, "{{image}}", 1)
+            while VIDEO_PLACEHOLDER in content:
+                num_video_tokens += 1
+                content = content.replace(VIDEO_PLACEHOLDER, "{{video}}", 1)
+
+        if len(images) != num_image_tokens:
+            raise ValueError("The number of images does not match the number of {} tokens".format(IMAGE_PLACEHOLDER))
+
+        if len(videos) != num_video_tokens:
+            raise ValueError("The number of videos does not match the number of {} tokens".format(VIDEO_PLACEHOLDER))
+
+        return messages
+
+    @override
+    def get_mm_inputs(
+        self,
+        images: Sequence["ImageInput"],
+        videos: Sequence["VideoInput"],
+        imglens: Sequence[int],
+        vidlens: Sequence[int],
+        seqlens: Sequence[int],
+        processor: Optional["ProcessorMixin"],
+    ) -> Dict[str, Union[List[int], "torch.Tensor"]]:
+        self._validate_input(images, videos)
+        video_processor = getattr(processor, "video_processor")
+        res = _get_mm_inputs(images, [], processor)
+        if len(videos) != 0:
+            videos = _regularize_videos(videos, processor)
+            video_res = video_processor(videos, return_tensors="pt")
+            res.update(video_res)
+        return res
+
 class PaliGemmaPlugin(BasePlugin):
     @override
     def process_messages(
@@ -380,11 +510,59 @@ class Qwen2vlPlugin(BasePlugin):
         return _get_mm_inputs(images, videos, processor)
 
 
+class VideoLlavaPlugin(BasePlugin):
+    @override
+    def process_messages(
+        self,
+        messages: Sequence[Dict[str, str]],
+        images: Sequence["ImageInput"],
+        videos: Sequence["VideoInput"],
+        processor: Optional["ProcessorMixin"],
+    ) -> List[Dict[str, str]]:
+        self._validate_input(images, videos)
+        num_image_tokens = 0
+        num_video_tokens = 0
+        messages = deepcopy(messages)
+        for message in messages:
+            content = message["content"]
+            while IMAGE_PLACEHOLDER in content:
+                num_image_tokens += 1
+                content = content.replace(IMAGE_PLACEHOLDER, "{{image}}", 1)
+            while VIDEO_PLACEHOLDER in content:
+                num_video_tokens += 1
+                content = content.replace(VIDEO_PLACEHOLDER, "{{video}}", 1)
+
+        if len(images) != num_image_tokens:
+            raise ValueError("The number of images does not match the number of {} tokens".format(IMAGE_PLACEHOLDER))
+
+        if len(videos) != num_video_tokens:
+            raise ValueError("The number of videos does not match the number of {} tokens".format(VIDEO_PLACEHOLDER))
+
+        return messages
+
+    @override
+    def get_mm_inputs(
+        self,
+        images: Sequence["ImageInput"],
+        videos: Sequence["VideoInput"],
+        imglens: Sequence[int],
+        vidlens: Sequence[int],
+        seqlens: Sequence[int],
+        processor: Optional["ProcessorMixin"],
+    ) -> Dict[str, Union[List[int], "torch.Tensor"]]:
+        self._validate_input(images, videos)
+        return _get_mm_inputs(images, videos, processor)
+
+
 PLUGINS = {
     "base": BasePlugin,
+    "idefics2": Idefics2Plugin,
     "llava": LlavaPlugin,
+    "llava_next": LlavaNextPlugin,
+    "llava_next_video": LlavaNextVideoPlugin,
     "paligemma": PaliGemmaPlugin,
     "qwen2_vl": Qwen2vlPlugin,
+    "video_llava": VideoLlavaPlugin,
 }
 
 
diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py
index ff5e32d2..7bf164e6 100644
--- a/src/llamafactory/data/template.py
+++ b/src/llamafactory/data/template.py
@@ -680,6 +680,16 @@ _register_template(
 )
 
 
+_register_template(
+    name="idefics2",
+    format_user=StringFormatter(slots=["User:{{content}}\nAssistant:"]),
+    format_separator=EmptyFormatter(slots=["\n"]),
+    stop_words=["<end_of_utterance>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin(name="idefics2", image_token="<image>"),
+)
+
+
 _register_template(
     name="intern",
     format_user=StringFormatter(slots=["<|User|>:{{content}}\n<|Bot|>:"]),
@@ -753,6 +763,28 @@ _register_template(
 )
 
 
+_register_template(
+    name="llava_next",
+    format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT:"]),
+    default_system=(
+        "A chat between a curious user and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the user's questions."
+    ),
+    mm_plugin=get_mm_plugin(name="llava_next", image_token="<image>"),
+)
+
+
+_register_template(
+    name="llava_next_video",
+    format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT:"]),
+    default_system=(
+        "A chat between a curious user and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the user's questions."
+    ),
+    mm_plugin=get_mm_plugin(name="llava_next_video", image_token="<image>", video_token="