From a2452d0b1c2cab9481054bdeeab3bcf945cbc588 Mon Sep 17 00:00:00 2001
From: Kingsley
Date: Sun, 29 Sep 2024 00:00:23 +0800
Subject: [PATCH] Tiny fix

Former-commit-id: 8f13a3627d06a6f0a9b4e35443a415958d9ad1c9
---
 README.md                                    | 2 +-
 README_zh.md                                 | 2 +-
 src/llamafactory/data/mm_plugin.py           | 3 +--
 src/llamafactory/extras/constants.py         | 6 +++---
 src/llamafactory/model/model_utils/visual.py | 2 --
 5 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index cf37565b..899ad0c4 100644
--- a/README.md
+++ b/README.md
@@ -183,7 +183,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 | [PaliGemma](https://huggingface.co/google) | 3B | paligemma |
 | [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - |
 | [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi |
-| [Pixtral](https://huggingface.co/mistralai/Pixtral-12B-2409) | 12B | pixtral |
+| [Pixtral](https://huggingface.co/mistral-community/pixtral-12b) | 12B | pixtral |
 | [Qwen (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen |
 | [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B/72B | qwen2_vl |
 | [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - |
diff --git a/README_zh.md b/README_zh.md
index 4b3b53de..e7335b72 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -184,7 +184,7 @@ https://github.com/user-attachments/assets/e6ce34b0-52d5-4f3e-a830-592106c4c272
 | [PaliGemma](https://huggingface.co/google) | 3B | paligemma |
 | [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - |
 | [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi |
-| [Pixtral](https://huggingface.co/mistralai/Pixtral-12B-2409) | 12B | pixtral |
+| [Pixtral](https://huggingface.co/mistral-community/pixtral-12b) | 12B | pixtral |
 | [Qwen (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen |
 | [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B/72B | qwen2_vl |
 | [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - |
diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py
index 2b85c2c5..8688a8be 100644
--- a/src/llamafactory/data/mm_plugin.py
+++ b/src/llamafactory/data/mm_plugin.py
@@ -24,7 +24,6 @@ if TYPE_CHECKING:
     from av.stream import Stream
     from transformers import PreTrainedTokenizer, ProcessorMixin
     from transformers.image_processing_utils import BaseImageProcessor
-    from transformers.processing_utils import _validate_images_text_input_order, ProcessingKwargs
 
     class EncodedImage(TypedDict):
         path: Optional[str]
@@ -392,7 +391,7 @@ class PixtralPlugin(BasePlugin):
         mm_inputs = self._get_mm_inputs(images, videos, processor)
         if mm_inputs.get("image_sizes"):
             mm_inputs.pop("image_sizes")
-        
+
         if isinstance(mm_inputs.get("pixel_values"), list) and len(mm_inputs.get("pixel_values")[0]) >= 2:
             raise ValueError("Now it only supports batchsize=1 on per gpu due to `List[tensor]` can not pack into BachEncoding")
diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py
index 3de1c7a2..32af244b 100644
--- a/src/llamafactory/extras/constants.py
+++ b/src/llamafactory/extras/constants.py
@@ -1059,9 +1059,9 @@ register_model_group(
 
 register_model_group(
     models={
-        "Pixtral-12B-2409": {
-            DownloadSource.DEFAULT: "mistralai/Pixtral-12B-2409",
-            DownloadSource.MODELSCOPE: "LLM-Research/Pixtral-12B-2409",
+        "Pixtral-12B": {
+            DownloadSource.DEFAULT: "mistral-community/pixtral-12b",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/pixtral-12b",
         }
     },
     template="mistral",
diff --git a/src/llamafactory/model/model_utils/visual.py b/src/llamafactory/model/model_utils/visual.py
index 107590bd..8aad4d87 100644
--- a/src/llamafactory/model/model_utils/visual.py
+++ b/src/llamafactory/model/model_utils/visual.py
@@ -96,7 +96,6 @@ def autocast_projector_dtype(model: "PreTrainedModel", model_args: "ModelArgumen
         mm_projector: "torch.nn.Module" = getattr(model, "multi_modal_projector")
     elif model_type == "qwen2_vl":
         mm_projector: "torch.nn.Module" = getattr(getattr(model, "visual"), "merger")
-        # TODO check it
     elif model_type == "pixtral":
         mm_projector: "torch.nn.Module" = getattr(model, "vision_language_adapte")
     else:
@@ -128,7 +127,6 @@ def get_forbidden_modules(config: "PretrainedConfig", finetuning_args: "Finetuni
     if model_type in ["llava", "paligemma", "pixtral"]:
        if finetuning_args.freeze_vision_tower:
             forbidden_modules.add("vision_tower")
-            #TODO check it
             forbidden_modules.add("vision_encoder")
 
     if finetuning_args.train_mm_proj_only:
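
Note on the relocated checkpoint (editor's sketch, not part of the patch): the `mistral-community/pixtral-12b` repo hosts the Transformers-format conversion of the weights, which is presumably why both `DownloadSource` entries were switched away from `mistralai/Pixtral-12B-2409`. Below is a minimal loading sketch, assuming transformers >= 4.45 (where Pixtral support landed) and the `LlavaForConditionalGeneration` wrapper that the community repo's config declares; the prompt string follows the `mistral` template named in the patch, and `example.jpg` is a hypothetical local file.

    # Hedged sketch: load Pixtral from the new DEFAULT source in constants.py.
    from PIL import Image
    from transformers import AutoProcessor, LlavaForConditionalGeneration

    model_id = "mistral-community/pixtral-12b"  # new DownloadSource.DEFAULT
    processor = AutoProcessor.from_pretrained(model_id)
    model = LlavaForConditionalGeneration.from_pretrained(model_id, device_map="auto")

    # Pixtral keeps images at native resolution, so with mixed-size images the
    # processor can return `pixel_values` as a list of differently shaped
    # tensors; that is the `List[tensor]` case the PixtralPlugin check above
    # refuses to batch.
    prompt = "<s>[INST]Describe the image.\n[IMG][/INST]"
    inputs = processor(text=prompt, images=[Image.open("example.jpg")], return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=64)
    print(processor.decode(output[0], skip_special_tokens=True))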