From c436d6ea0b6cd51d1b6a81894c8f11d4ea05b366 Mon Sep 17 00:00:00 2001 From: Kingsley Date: Thu, 26 Sep 2024 12:11:58 +0800 Subject: [PATCH 01/34] add pixtral template Former-commit-id: 86f5a9be548ef02ce334bba35a529c70e8b3ad7f --- src/llamafactory/data/mm_plugin.py | 6 +++++ src/llamafactory/data/template.py | 7 +++++ src/llamafactory/extras/constants.py | 10 ++++++++ src/llamafactory/model/loader.py | 38 ++++++++++++++++++++++++++++ 4 files changed, 61 insertions(+) diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index e22e2760..ea0f2185 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -323,6 +323,12 @@ class PaliGemmaPlugin(BasePlugin): mm_inputs["token_type_ids"] = _get_paligemma_token_type_ids(imglens, seqlens, processor) return mm_inputs +class PixtralPlugin(BasePlugin): + #TODO preprocess according to Pixtral hf + from transformers import LlavaForConditionalGeneration + @override + def _preprocess_image(self, image: "ImageObject", **kwargs) -> "ImageObject": + pass class Qwen2vlPlugin(BasePlugin): @override diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index 54da4757..9b844d88 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -821,6 +821,13 @@ _register_template( replace_eos=True, ) +_register_template( + name="pixtral", + format_user=StringFormatter(slots=["[INST] {{content}} [/INST]"]), + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), + mm_plugin=get_mm_plugin(name="pixtral", image_token="[IMG]") +) + _register_template( name="qwen", diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index 8d8d4424..e88f0da7 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -894,6 +894,16 @@ register_model_group( template="mistral", ) +register_model_group( + models={ + "Pixtral-12B-2409": { + DownloadSource.DEFAULT: "mistral-community/pixtral-12b", + DownloadSource.MODELSCOPE: "AI-ModelScope/pixtral-12b", + } + }, + template="mistral" +) + register_model_group( models={ diff --git a/src/llamafactory/model/loader.py b/src/llamafactory/model/loader.py index 030ce90f..bc4e101c 100644 --- a/src/llamafactory/model/loader.py +++ b/src/llamafactory/model/loader.py @@ -119,6 +119,44 @@ def load_config(model_args: "ModelArguments") -> "PretrainedConfig": Loads model config. 
""" init_kwargs = _get_init_kwargs(model_args) + if "pixtral" in model_args.model_name_or_path: + from transformers import PretrainedConfig + + class PixtralVisionConfig(PretrainedConfig): + model_type = "pixtral" + + def __init__( + self, + hidden_size=1024, + intermediate_size=4096, + num_hidden_layers=24, + num_attention_heads=16, + num_channels=3, + image_size=1024, + patch_size=16, + hidden_act="gelu", + attention_dropout=0.0, + rope_theta=10000.0, + tie_word_embeddings=False, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.attention_dropout = attention_dropout + self.hidden_act = hidden_act + self.rope_theta = rope_theta + self.tie_word_embeddings = tie_word_embeddings + self.head_dim = hidden_size // num_attention_heads + + return PixtralVisionConfig() + return AutoConfig.from_pretrained(model_args.model_name_or_path, **init_kwargs) From b76116bb6cbdb05f3203a3363d1e04037d701dc3 Mon Sep 17 00:00:00 2001 From: Kingsley Date: Thu, 26 Sep 2024 17:14:51 +0800 Subject: [PATCH 02/34] add pixtral template Former-commit-id: 7b3336dd97e06a11ec52433ef36980aefdbb45ba --- src/llamafactory/data/mm_plugin.py | 64 ++++++++++++++++++++++++++++-- src/llamafactory/model/loader.py | 37 ----------------- 2 files changed, 60 insertions(+), 41 deletions(-) diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index ea0f2185..0e59ec0b 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -24,6 +24,7 @@ if TYPE_CHECKING: from av.stream import Stream from transformers import PreTrainedTokenizer, ProcessorMixin from transformers.image_processing_utils import BaseImageProcessor + from transformers.processing_utils import _validate_images_text_input_order, ProcessingKwargs class EncodedImage(TypedDict): path: Optional[str] @@ -324,11 +325,65 @@ class PaliGemmaPlugin(BasePlugin): return mm_inputs class PixtralPlugin(BasePlugin): - #TODO preprocess according to Pixtral hf - from transformers import LlavaForConditionalGeneration @override - def _preprocess_image(self, image: "ImageObject", **kwargs) -> "ImageObject": - pass + def process_messages( + self, + messages: Sequence[Dict[str, str]], + images: Sequence["ImageInput"], + videos: Sequence["VideoInput"], + processor: Optional["ProcessorMixin"], + ) -> List[Dict[str, str]]: + patch_size = processor.patch_size + image_token = processor.image_token + image_break_token = processor.image_break_token + image_end_token = processor.image_end_token + + self._validate_input(images, videos) + num_image_tokens = 0 + image_input_sizes = self._get_mm_inputs(images, videos, processor)["image_sizes"] + messages = deepcopy(messages) + print(image_input_sizes[0], messages) + for message in messages: + content = message["content"] + img_id = 0 + while IMAGE_PLACEHOLDER in content: + # only support one image for one time? 
+ image_size = image_input_sizes[0][0] + height, width = image_size + num_height_tokens = height // patch_size + num_width_tokens = width // patch_size + replace_tokens = [ + [image_token] * num_width_tokens + [image_break_token] + ] * num_height_tokens + # Flatten list + replace_tokens = [item for sublist in replace_tokens for item in sublist] + replace_tokens[-1] = image_end_token + replace_str = "".join(replace_tokens) + content.replace(IMAGE_PLACEHOLDER, replace_str, 1) + + img_id += 1 + num_image_tokens += 1 + + message["content"] = content + + if len(images) != num_image_tokens: + raise ValueError("The number of images does not match the number of {} tokens".format(IMAGE_PLACEHOLDER)) + + return messages + + @override + def get_mm_inputs( + self, + images: Sequence["ImageInput"], + videos: Sequence["VideoInput"], + imglens: Sequence[int], + vidlens: Sequence[int], + seqlens: Sequence[int], + processor: Optional["ProcessorMixin"], + ) -> Dict[str, Union[List[int], "torch.Tensor"]]: + + self._validate_input(images, videos) + return self._get_mm_inputs(images, videos, processor) class Qwen2vlPlugin(BasePlugin): @override @@ -428,6 +483,7 @@ PLUGINS = { "llava": LlavaPlugin, "paligemma": PaliGemmaPlugin, "qwen2_vl": Qwen2vlPlugin, + "pixtral": PixtralPlugin, } diff --git a/src/llamafactory/model/loader.py b/src/llamafactory/model/loader.py index bc4e101c..96d61645 100644 --- a/src/llamafactory/model/loader.py +++ b/src/llamafactory/model/loader.py @@ -119,43 +119,6 @@ def load_config(model_args: "ModelArguments") -> "PretrainedConfig": Loads model config. """ init_kwargs = _get_init_kwargs(model_args) - if "pixtral" in model_args.model_name_or_path: - from transformers import PretrainedConfig - - class PixtralVisionConfig(PretrainedConfig): - model_type = "pixtral" - - def __init__( - self, - hidden_size=1024, - intermediate_size=4096, - num_hidden_layers=24, - num_attention_heads=16, - num_channels=3, - image_size=1024, - patch_size=16, - hidden_act="gelu", - attention_dropout=0.0, - rope_theta=10000.0, - tie_word_embeddings=False, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size - self.attention_dropout = attention_dropout - self.hidden_act = hidden_act - self.rope_theta = rope_theta - self.tie_word_embeddings = tie_word_embeddings - self.head_dim = hidden_size // num_attention_heads - - return PixtralVisionConfig() return AutoConfig.from_pretrained(model_args.model_name_or_path, **init_kwargs) From 66e473d5191e18ebe2df336f7effb7f448afa04e Mon Sep 17 00:00:00 2001 From: Kingsley Date: Sat, 28 Sep 2024 02:14:06 +0800 Subject: [PATCH 03/34] remove some unnecessary if conditions Former-commit-id: de06e2678e2168586614242f65939c5772e78774 --- src/llamafactory/chat/hf_engine.py | 2 +- src/llamafactory/data/mm_plugin.py | 36 +++++++++++++++++++++++----- src/llamafactory/extras/constants.py | 22 +++++++++-------- 3 files changed, 43 insertions(+), 17 deletions(-) diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py index 2b1d9fe5..68416fdf 100644 --- a/src/llamafactory/chat/hf_engine.py +++ b/src/llamafactory/chat/hf_engine.py @@ -168,7 +168,7 @@ class HuggingfaceEngine(BaseEngine): for key, value in mm_inputs.items(): value = value if isinstance(value, torch.Tensor) else torch.tensor(value) 
gen_kwargs[key] = value.to(model.device) - + return gen_kwargs, prompt_length @staticmethod diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index 0e59ec0b..6716527c 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -325,6 +325,14 @@ class PaliGemmaPlugin(BasePlugin): return mm_inputs class PixtralPlugin(BasePlugin): + # @override + # def _preprocess_image(self, image: "ImageObject", **kwargs) -> "ImageObject": + # image = super()._preprocess_image(image, **kwargs) + # UP_SIZE = (512,512) + # image = image.resize(UP_SIZE, resample=Image.NEAREST) + + # return image + @override def process_messages( self, @@ -340,15 +348,22 @@ class PixtralPlugin(BasePlugin): self._validate_input(images, videos) num_image_tokens = 0 - image_input_sizes = self._get_mm_inputs(images, videos, processor)["image_sizes"] + img_kwargs = self._get_mm_inputs(images, videos, processor) + image_input_sizes = None + + if img_kwargs.get("pixel_values") is not None: + image_input_sizes = img_kwargs["image_sizes"] + messages = deepcopy(messages) - print(image_input_sizes[0], messages) for message in messages: content = message["content"] img_id = 0 while IMAGE_PLACEHOLDER in content: - # only support one image for one time? - image_size = image_input_sizes[0][0] + + if image_input_sizes is None: + raise ValueError("The number of images does not match the number of {} tokens".format(IMAGE_PLACEHOLDER)) + + image_size = image_input_sizes[0][img_id] height, width = image_size num_height_tokens = height // patch_size num_width_tokens = width // patch_size @@ -359,7 +374,7 @@ class PixtralPlugin(BasePlugin): replace_tokens = [item for sublist in replace_tokens for item in sublist] replace_tokens[-1] = image_end_token replace_str = "".join(replace_tokens) - content.replace(IMAGE_PLACEHOLDER, replace_str, 1) + content = content.replace(IMAGE_PLACEHOLDER, replace_str, 1) img_id += 1 num_image_tokens += 1 @@ -383,7 +398,16 @@ class PixtralPlugin(BasePlugin): ) -> Dict[str, Union[List[int], "torch.Tensor"]]: self._validate_input(images, videos) - return self._get_mm_inputs(images, videos, processor) + mm_inputs = self._get_mm_inputs(images, videos, processor) + if mm_inputs.get('image_sizes'): + del mm_inputs['image_sizes'] + # TODO fix this type error + # if isinstance(mm_inputs.get("pixel_values"), list): #List[List[torch.tensor]] -> [B C W H] + # recommend for batch==1 for one gpu or it will rise the error of BatchEncoding. 
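+        # NOTE: the processor returns pixel_values as List[List[torch.Tensor]], indexed by
+        # sample and then by image; the line below keeps only the first image of the first
+        # sample and restores a batch dimension with unsqueeze(0), hence the batch-size-1
+        # caveat noted above.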
+ mm_inputs["pixel_values"] = mm_inputs.get("pixel_values")[0][0].unsqueeze(0) + # mm_inputs["pixel_values"] = mm_inputs.get("pixel_values") + + return mm_inputs class Qwen2vlPlugin(BasePlugin): @override diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index ef075cf9..e3f6a99d 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -917,16 +917,6 @@ register_model_group( template="mistral", ) -register_model_group( - models={ - "Pixtral-12B-2409": { - DownloadSource.DEFAULT: "mistral-community/pixtral-12b", - DownloadSource.MODELSCOPE: "AI-ModelScope/pixtral-12b", - } - }, - template="mistral" -) - register_model_group( models={ @@ -1067,6 +1057,18 @@ register_model_group( ) +register_model_group( + models={ + "Pixtral-12B-2409": { + DownloadSource.DEFAULT: "mistral-community/pixtral-12b", + DownloadSource.MODELSCOPE: "AI-ModelScope/pixtral-12b", + } + }, + template="mistral", + vision=True +) + + register_model_group( models={ "Qwen-1.8B": { From fd79cf8551418954f199ec266f4385f8b1e5f894 Mon Sep 17 00:00:00 2001 From: Kingsley Date: Sat, 28 Sep 2024 22:50:53 +0800 Subject: [PATCH 04/34] tiny fix Former-commit-id: 3d3cc6705d4575f7f20bf4da2b7dab60b337006b --- README.md | 1 + README_zh.md | 1 + src/llamafactory/data/mm_plugin.py | 21 ++++++-------------- src/llamafactory/extras/constants.py | 4 ++-- src/llamafactory/model/model_utils/visual.py | 13 ++++++++++-- 5 files changed, 21 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 92bbcc88..cf37565b 100644 --- a/README.md +++ b/README.md @@ -183,6 +183,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ | [PaliGemma](https://huggingface.co/google) | 3B | paligemma | | [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | | [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | +| [Pixtral](https://huggingface.co/mistralai/Pixtral-12B-2409) | 12B | pixtral | | [Qwen (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | | [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B/72B | qwen2_vl | | [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | diff --git a/README_zh.md b/README_zh.md index 0b02f35f..4b3b53de 100644 --- a/README_zh.md +++ b/README_zh.md @@ -184,6 +184,7 @@ https://github.com/user-attachments/assets/e6ce34b0-52d5-4f3e-a830-592106c4c272 | [PaliGemma](https://huggingface.co/google) | 3B | paligemma | | [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | | [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | +| [Pixtral](https://huggingface.co/mistralai/Pixtral-12B-2409) | 12B | pixtral | | [Qwen (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | | [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B/72B | qwen2_vl | | [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index 6716527c..2b85c2c5 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -325,14 +325,6 @@ class PaliGemmaPlugin(BasePlugin): return mm_inputs class PixtralPlugin(BasePlugin): - # @override - # def _preprocess_image(self, image: "ImageObject", **kwargs) -> "ImageObject": - # image = super()._preprocess_image(image, **kwargs) - # UP_SIZE = (512,512) - # image = image.resize(UP_SIZE, resample=Image.NEAREST) - - # return image - @override def process_messages( 
self, @@ -396,16 +388,15 @@ class PixtralPlugin(BasePlugin): seqlens: Sequence[int], processor: Optional["ProcessorMixin"], ) -> Dict[str, Union[List[int], "torch.Tensor"]]: - self._validate_input(images, videos) mm_inputs = self._get_mm_inputs(images, videos, processor) - if mm_inputs.get('image_sizes'): - del mm_inputs['image_sizes'] - # TODO fix this type error - # if isinstance(mm_inputs.get("pixel_values"), list): #List[List[torch.tensor]] -> [B C W H] - # recommend for batch==1 for one gpu or it will rise the error of BatchEncoding. + if mm_inputs.get("image_sizes"): + mm_inputs.pop("image_sizes") + + if isinstance(mm_inputs.get("pixel_values"), list) and len(mm_inputs.get("pixel_values")[0]) >= 2: + raise ValueError("Now it only supports batchsize=1 on per gpu due to `List[tensor]` can not pack into BachEncoding") + mm_inputs["pixel_values"] = mm_inputs.get("pixel_values")[0][0].unsqueeze(0) - # mm_inputs["pixel_values"] = mm_inputs.get("pixel_values") return mm_inputs diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index e3f6a99d..3de1c7a2 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -1060,8 +1060,8 @@ register_model_group( register_model_group( models={ "Pixtral-12B-2409": { - DownloadSource.DEFAULT: "mistral-community/pixtral-12b", - DownloadSource.MODELSCOPE: "AI-ModelScope/pixtral-12b", + DownloadSource.DEFAULT: "mistralai/Pixtral-12B-2409", + DownloadSource.MODELSCOPE: "LLM-Research/Pixtral-12B-2409", } }, template="mistral", diff --git a/src/llamafactory/model/model_utils/visual.py b/src/llamafactory/model/model_utils/visual.py index 23f880a6..107590bd 100644 --- a/src/llamafactory/model/model_utils/visual.py +++ b/src/llamafactory/model/model_utils/visual.py @@ -96,6 +96,9 @@ def autocast_projector_dtype(model: "PreTrainedModel", model_args: "ModelArgumen mm_projector: "torch.nn.Module" = getattr(model, "multi_modal_projector") elif model_type == "qwen2_vl": mm_projector: "torch.nn.Module" = getattr(getattr(model, "visual"), "merger") + # TODO check it + elif model_type == "pixtral": + mm_projector: "torch.nn.Module" = getattr(model, "vision_language_adapte") else: return @@ -122,9 +125,11 @@ def get_forbidden_modules(config: "PretrainedConfig", finetuning_args: "Finetuni """ model_type = getattr(config, "model_type", None) forbidden_modules = set() - if model_type in ["llava", "paligemma"]: + if model_type in ["llava", "paligemma", "pixtral"]: if finetuning_args.freeze_vision_tower: forbidden_modules.add("vision_tower") + #TODO check it + forbidden_modules.add("vision_encoder") if finetuning_args.train_mm_proj_only: forbidden_modules.add("language_model") @@ -150,7 +155,7 @@ def get_image_seqlen(config: "PretrainedConfig") -> int: image_seqlen += 1 elif model_type == "paligemma": image_seqlen = config.vision_config.num_image_tokens - elif model_type == "qwen2_vl": # variable length + elif model_type in ["qwen2_vl", "pixtral"]: # variable length image_seqlen = -1 return image_seqlen @@ -168,10 +173,14 @@ def patch_target_modules( return "^(?!.*vision_tower).*(?:{}).*".format("|".join(target_modules)) elif model_type == "qwen2_vl": return "^(?!.*visual).*(?:{}).*".format("|".join(target_modules)) + elif model_type == "pixtral": + return "^(?!.*vision_encoder).*(?:{}).*".format("|".join(target_modules)) else: return target_modules else: if model_type == "qwen2_vl": return "^(?!.*patch_embed).*(?:{}).*".format("|".join(target_modules)) + elif model_type == "pixtral": + return 
"^(?!.*patch_conv).*(?:{}).*".format("|".join(target_modules)) else: return target_modules From a2452d0b1c2cab9481054bdeeab3bcf945cbc588 Mon Sep 17 00:00:00 2001 From: Kingsley Date: Sun, 29 Sep 2024 00:00:23 +0800 Subject: [PATCH 05/34] Tiny fix Former-commit-id: 8f13a3627d06a6f0a9b4e35443a415958d9ad1c9 --- README.md | 2 +- README_zh.md | 2 +- src/llamafactory/data/mm_plugin.py | 3 +-- src/llamafactory/extras/constants.py | 6 +++--- src/llamafactory/model/model_utils/visual.py | 2 -- 5 files changed, 6 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index cf37565b..899ad0c4 100644 --- a/README.md +++ b/README.md @@ -183,7 +183,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ | [PaliGemma](https://huggingface.co/google) | 3B | paligemma | | [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | | [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | -| [Pixtral](https://huggingface.co/mistralai/Pixtral-12B-2409) | 12B | pixtral | +| [Pixtral](https://huggingface.co/mistral-community/pixtral-12b) | 12B | pixtral | | [Qwen (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | | [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B/72B | qwen2_vl | | [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | diff --git a/README_zh.md b/README_zh.md index 4b3b53de..e7335b72 100644 --- a/README_zh.md +++ b/README_zh.md @@ -184,7 +184,7 @@ https://github.com/user-attachments/assets/e6ce34b0-52d5-4f3e-a830-592106c4c272 | [PaliGemma](https://huggingface.co/google) | 3B | paligemma | | [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | | [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | -| [Pixtral](https://huggingface.co/mistralai/Pixtral-12B-2409) | 12B | pixtral | +| [Pixtral](https://huggingface.co/mistral-community/pixtral-12b) | 12B | pixtral | | [Qwen (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | | [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B/72B | qwen2_vl | | [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index 2b85c2c5..8688a8be 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -24,7 +24,6 @@ if TYPE_CHECKING: from av.stream import Stream from transformers import PreTrainedTokenizer, ProcessorMixin from transformers.image_processing_utils import BaseImageProcessor - from transformers.processing_utils import _validate_images_text_input_order, ProcessingKwargs class EncodedImage(TypedDict): path: Optional[str] @@ -392,7 +391,7 @@ class PixtralPlugin(BasePlugin): mm_inputs = self._get_mm_inputs(images, videos, processor) if mm_inputs.get("image_sizes"): mm_inputs.pop("image_sizes") - + if isinstance(mm_inputs.get("pixel_values"), list) and len(mm_inputs.get("pixel_values")[0]) >= 2: raise ValueError("Now it only supports batchsize=1 on per gpu due to `List[tensor]` can not pack into BachEncoding") diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index 3de1c7a2..32af244b 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -1059,9 +1059,9 @@ register_model_group( register_model_group( models={ - "Pixtral-12B-2409": { - DownloadSource.DEFAULT: "mistralai/Pixtral-12B-2409", - DownloadSource.MODELSCOPE: "LLM-Research/Pixtral-12B-2409", + "Pixtral-12B": { + 
DownloadSource.DEFAULT: "mistral-community/pixtral-12b", + DownloadSource.MODELSCOPE: "AI-ModelScope/pixtral-12b", } }, template="mistral", diff --git a/src/llamafactory/model/model_utils/visual.py b/src/llamafactory/model/model_utils/visual.py index 107590bd..8aad4d87 100644 --- a/src/llamafactory/model/model_utils/visual.py +++ b/src/llamafactory/model/model_utils/visual.py @@ -96,7 +96,6 @@ def autocast_projector_dtype(model: "PreTrainedModel", model_args: "ModelArgumen mm_projector: "torch.nn.Module" = getattr(model, "multi_modal_projector") elif model_type == "qwen2_vl": mm_projector: "torch.nn.Module" = getattr(getattr(model, "visual"), "merger") - # TODO check it elif model_type == "pixtral": mm_projector: "torch.nn.Module" = getattr(model, "vision_language_adapte") else: @@ -128,7 +127,6 @@ def get_forbidden_modules(config: "PretrainedConfig", finetuning_args: "Finetuni if model_type in ["llava", "paligemma", "pixtral"]: if finetuning_args.freeze_vision_tower: forbidden_modules.add("vision_tower") - #TODO check it forbidden_modules.add("vision_encoder") if finetuning_args.train_mm_proj_only: From 94ce8f561f3bc664c81c1445f7cc33a7a4a59914 Mon Sep 17 00:00:00 2001 From: Kingsley Date: Mon, 30 Sep 2024 19:58:34 +0800 Subject: [PATCH 06/34] fix some errors due to inconsistency of model cards Former-commit-id: 2166b9bc6ba35760ff85b63620af9fa0213a4c78 --- README.md | 62 ++++++++++---------- README_zh.md | 62 ++++++++++---------- src/llamafactory/chat/hf_engine.py | 2 +- src/llamafactory/extras/constants.py | 4 +- src/llamafactory/model/model_utils/visual.py | 5 +- 5 files changed, 66 insertions(+), 69 deletions(-) diff --git a/README.md b/README.md index 3f0642c5..f6d0edf2 100644 --- a/README.md +++ b/README.md @@ -162,37 +162,37 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ ## Supported Models -| Model | Model size | Template | -| ----------------------------------------------------------------- | -------------------------------- | ---------------- | -| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | -| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | -| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | -| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | -| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | -| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | -| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | -| [GLM-4](https://huggingface.co/THUDM) | 9B | glm4 | -| [InternLM2/InternLM2.5](https://huggingface.co/internlm) | 7B/20B | intern2 | -| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | -| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | -| [Llama 3-3.2](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 | -| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava | -| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next | -| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video | -| [MiniCPM](https://huggingface.co/openbmb) | 1B/2B/4B | cpm/cpm3 | -| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | -| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | -| [PaliGemma](https://huggingface.co/google) | 3B | paligemma | -| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | -| 
[Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | -| [Pixtral](https://huggingface.co/mistral-community/pixtral-12b) | 12B | pixtral | -| [Qwen (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | -| [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B/72B | qwen2_vl | -| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | -| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | -| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi | -| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | -| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | +| Model | Model size | Template | +|-------------------------------------------------------------| -------------------------------- | ---------------- | +| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | +| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | +| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | +| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | +| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | +| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | +| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | +| [GLM-4](https://huggingface.co/THUDM) | 9B | glm4 | +| [InternLM2/InternLM2.5](https://huggingface.co/internlm) | 7B/20B | intern2 | +| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | +| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | +| [Llama 3-3.2](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 | +| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava | +| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next | +| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video | +| [MiniCPM](https://huggingface.co/openbmb) | 1B/2B/4B | cpm/cpm3 | +| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | +| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | +| [PaliGemma](https://huggingface.co/google) | 3B | paligemma | +| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | +| [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | +| [Pixtral](https://huggingface.co/mistral-community) | 12B | pixtral | +| [Qwen (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | +| [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B/72B | qwen2_vl | +| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | +| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | +| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi | +| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | +| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | > [!NOTE] > For the "base" models, the `template` argument can be chosen from `default`, `alpaca`, `vicuna` etc. But make sure to use the **corresponding template** for the "instruct/chat" models. 
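As an aside, the [IMG] expansion implemented by the mm_plugin patches above can be
sanity-checked in isolation. A minimal sketch follows: the token names are the ones the
plugin uses, while patch_size and the processed height and width are hypothetical values
that would normally come from the Pixtral image processor.

    patch_size = 16                      # Pixtral's ViT patch edge, in pixels
    height, width = 64, 48               # hypothetical processed image size
    num_h, num_w = height // patch_size, width // patch_size
    rows = [["[IMG]"] * num_w + ["[IMG_BREAK]"] for _ in range(num_h)]
    tokens = [tok for row in rows for tok in row]  # flatten the rows
    tokens[-1] = "[IMG_END]"             # the last [IMG_BREAK] becomes the end marker
    replace_str = "".join(tokens)        # this string replaces one <image> placeholder

With these numbers the placeholder expands to four rows of three [IMG] tokens each, which
matches what process_messages builds before substituting it into the message content.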
diff --git a/README_zh.md b/README_zh.md index f66b0e02..fddc6d2c 100644 --- a/README_zh.md +++ b/README_zh.md @@ -163,37 +163,37 @@ https://github.com/user-attachments/assets/e6ce34b0-52d5-4f3e-a830-592106c4c272 ## 模型 -| 模型名 | 模型大小 | Template | -| ----------------------------------------------------------------- | -------------------------------- | ---------------- | -| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | -| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | -| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | -| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | -| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | -| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | -| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | -| [GLM-4](https://huggingface.co/THUDM) | 9B | glm4 | -| [InternLM2/InternLM2.5](https://huggingface.co/internlm) | 7B/20B | intern2 | -| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | -| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | -| [Llama 3-3.2](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 | -| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava | -| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next | -| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video | -| [MiniCPM](https://huggingface.co/openbmb) | 1B/2B/4B | cpm/cpm3 | -| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | -| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | -| [PaliGemma](https://huggingface.co/google) | 3B | paligemma | -| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | -| [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | -| [Pixtral](https://huggingface.co/mistral-community/pixtral-12b) | 12B | pixtral | -| [Qwen (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | -| [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B/72B | qwen2_vl | -| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | -| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | -| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi | -| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | -| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | +| 模型名 | 模型大小 | Template | +|-------------------------------------------------------------| -------------------------------- | ---------------- | +| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | +| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | +| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | +| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | +| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | +| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | +| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | +| [GLM-4](https://huggingface.co/THUDM) | 9B | glm4 | +| [InternLM2/InternLM2.5](https://huggingface.co/internlm) | 7B/20B | intern2 | +| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | +| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | +| [Llama 
3-3.2](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 | +| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava | +| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next | +| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video | +| [MiniCPM](https://huggingface.co/openbmb) | 1B/2B/4B | cpm/cpm3 | +| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | +| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | +| [PaliGemma](https://huggingface.co/google) | 3B | paligemma | +| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | +| [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | +| [Pixtral](https://huggingface.co/mistral-community) | 12B | pixtral | +| [Qwen (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | +| [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B/72B | qwen2_vl | +| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | +| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | +| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi | +| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | +| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | > [!NOTE] > 对于所有“基座”(Base)模型,`template` 参数可以是 `default`, `alpaca`, `vicuna` 等任意值。但“对话”(Instruct/Chat)模型请务必使用**对应的模板**。 diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py index 68416fdf..2b1d9fe5 100644 --- a/src/llamafactory/chat/hf_engine.py +++ b/src/llamafactory/chat/hf_engine.py @@ -168,7 +168,7 @@ class HuggingfaceEngine(BaseEngine): for key, value in mm_inputs.items(): value = value if isinstance(value, torch.Tensor) else torch.tensor(value) gen_kwargs[key] = value.to(model.device) - + return gen_kwargs, prompt_length @staticmethod diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index 17bb5aed..ed1aff63 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -1164,12 +1164,12 @@ register_model_group( register_model_group( models={ - "Pixtral-12B": { + "Pixtral-12B-Instruct": { DownloadSource.DEFAULT: "mistral-community/pixtral-12b", DownloadSource.MODELSCOPE: "AI-ModelScope/pixtral-12b", } }, - template="mistral", + template="pixtral", vision=True ) diff --git a/src/llamafactory/model/model_utils/visual.py b/src/llamafactory/model/model_utils/visual.py index a30be115..bc3ef676 100644 --- a/src/llamafactory/model/model_utils/visual.py +++ b/src/llamafactory/model/model_utils/visual.py @@ -92,12 +92,10 @@ def autocast_projector_dtype(model: "PreTrainedModel", model_args: "ModelArgumen if getattr(model, "quantization_method", None): model_type = getattr(model.config, "model_type", None) - if model_type in ["llava", "llava_next", "llava_next_video", "paligemma", "video_llava"]: + if model_type in ["llava", "llava_next", "llava_next_video", "paligemma", "pixtral", "video_llava"]: mm_projector: "torch.nn.Module" = getattr(model, "multi_modal_projector") elif model_type == "qwen2_vl": mm_projector: "torch.nn.Module" = getattr(getattr(model, "visual"), "merger") - elif model_type == "pixtral": - mm_projector: "torch.nn.Module" = getattr(model, "vision_language_adapte") else: return @@ -133,7 +131,6 @@ def get_forbidden_modules(config: "PretrainedConfig", finetuning_args: "Finetuni if model_type in ["llava", "llava_next", "llava_next_video", "paligemma", "pixtral", "video_llava"]: if 
finetuning_args.freeze_vision_tower: forbidden_modules.add("vision_tower") - forbidden_modules.add("vision_encoder") if finetuning_args.train_mm_proj_only: forbidden_modules.add("language_model") From ececd68f9a122a78917435cfd9a5a09a16b8649f Mon Sep 17 00:00:00 2001 From: Kingsley Date: Mon, 30 Sep 2024 20:04:47 +0800 Subject: [PATCH 07/34] register model fix Former-commit-id: 15d555c8c523ac2252d55614773073c57db0e025 --- src/llamafactory/extras/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index ed1aff63..4a8a6d25 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -1164,7 +1164,7 @@ register_model_group( register_model_group( models={ - "Pixtral-12B-Instruct": { + "Pixtral-12B-Chat": { DownloadSource.DEFAULT: "mistral-community/pixtral-12b", DownloadSource.MODELSCOPE: "AI-ModelScope/pixtral-12b", } From 6729ed2c7e4f16b613cfb3e1476d2ad83be3899e Mon Sep 17 00:00:00 2001 From: Kingsley Date: Mon, 30 Sep 2024 20:27:05 +0800 Subject: [PATCH 08/34] sync with former Former-commit-id: 9ddb84052e3cc72e21a92b8103caa179a35859c4 --- src/llamafactory/model/model_utils/visual.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/llamafactory/model/model_utils/visual.py b/src/llamafactory/model/model_utils/visual.py index bc3ef676..7007b5a2 100644 --- a/src/llamafactory/model/model_utils/visual.py +++ b/src/llamafactory/model/model_utils/visual.py @@ -186,12 +186,10 @@ def patch_target_modules( """ model_type = getattr(config, "model_type", None) if finetuning_args.freeze_vision_tower: - if model_type in ["llava", "llava_next", "llava_next_video", "paligemma", "video_llava"]: + if model_type in ["llava", "llava_next", "llava_next_video", "paligemma", "pixtral", "video_llava"]: return "^(?!.*vision_tower).*(?:{}).*".format("|".join(target_modules)) elif model_type == "qwen2_vl": return "^(?!.*visual).*(?:{}).*".format("|".join(target_modules)) - elif model_type == "pixtral": - return "^(?!.*vision_encoder).*(?:{}).*".format("|".join(target_modules)) else: return target_modules else: From dd2d1c3154112cc767df3f0ab3930905db2eaaeb Mon Sep 17 00:00:00 2001 From: Kingsley Date: Mon, 30 Sep 2024 23:36:16 +0800 Subject: [PATCH 09/34] unfactor md Former-commit-id: c668568bc73914ba071a4121c4fec1ee7f2ab76c --- README.md | 64 ++++++++++++++++++++++++++-------------------------- README_zh.md | 64 ++++++++++++++++++++++++++-------------------------- 2 files changed, 64 insertions(+), 64 deletions(-) diff --git a/README.md b/README.md index f6d0edf2..c02c455e 100644 --- a/README.md +++ b/README.md @@ -162,37 +162,37 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ ## Supported Models -| Model | Model size | Template | -|-------------------------------------------------------------| -------------------------------- | ---------------- | -| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | -| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | -| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | -| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | -| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | -| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | -| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | -| 
[GLM-4](https://huggingface.co/THUDM) | 9B | glm4 | -| [InternLM2/InternLM2.5](https://huggingface.co/internlm) | 7B/20B | intern2 | -| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | -| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | -| [Llama 3-3.2](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 | -| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava | -| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next | -| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video | -| [MiniCPM](https://huggingface.co/openbmb) | 1B/2B/4B | cpm/cpm3 | -| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | -| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | -| [PaliGemma](https://huggingface.co/google) | 3B | paligemma | -| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | -| [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | -| [Pixtral](https://huggingface.co/mistral-community) | 12B | pixtral | -| [Qwen (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | -| [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B/72B | qwen2_vl | -| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | -| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | -| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi | -| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | -| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | +| Model | Model size | Template | +| ----------------------------------------------------------------- | -------------------------------- | ---------------- | +| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | +| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | +| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | +| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | +| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | +| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | +| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | +| [GLM-4](https://huggingface.co/THUDM) | 9B | glm4 | +| [InternLM2/InternLM2.5](https://huggingface.co/internlm) | 7B/20B | intern2 | +| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | +| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | +| [Llama 3-3.2](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 | +| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava | +| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next | +| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video | +| [MiniCPM](https://huggingface.co/openbmb) | 1B/2B/4B | cpm/cpm3 | +| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | +| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | +| [PaliGemma](https://huggingface.co/google) | 3B | paligemma | +| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | +| [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | +| [Pixtral](https://huggingface.co/mistral-community) | 12B | pixtral | +| [Qwen (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | +| [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B/72B | 
qwen2_vl | +| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | +| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | +| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi | +| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | +| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | > [!NOTE] > For the "base" models, the `template` argument can be chosen from `default`, `alpaca`, `vicuna` etc. But make sure to use the **corresponding template** for the "instruct/chat" models. @@ -721,4 +721,4 @@ This repo benefits from [PEFT](https://github.com/huggingface/peft), [TRL](https ## Star History -![Star History Chart](https://api.star-history.com/svg?repos=hiyouga/LLaMA-Factory&type=Date) +![Star History Chart](https://api.star-history.com/svg?repos=hiyouga/LLaMA-Factory&type=Date) \ No newline at end of file diff --git a/README_zh.md b/README_zh.md index fddc6d2c..4dff0be7 100644 --- a/README_zh.md +++ b/README_zh.md @@ -163,37 +163,37 @@ https://github.com/user-attachments/assets/e6ce34b0-52d5-4f3e-a830-592106c4c272 ## 模型 -| 模型名 | 模型大小 | Template | -|-------------------------------------------------------------| -------------------------------- | ---------------- | -| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | -| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | -| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | -| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | -| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | -| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | -| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | -| [GLM-4](https://huggingface.co/THUDM) | 9B | glm4 | -| [InternLM2/InternLM2.5](https://huggingface.co/internlm) | 7B/20B | intern2 | -| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | -| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | -| [Llama 3-3.2](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 | -| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava | -| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next | -| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video | -| [MiniCPM](https://huggingface.co/openbmb) | 1B/2B/4B | cpm/cpm3 | -| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | -| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | -| [PaliGemma](https://huggingface.co/google) | 3B | paligemma | -| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | -| [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | -| [Pixtral](https://huggingface.co/mistral-community) | 12B | pixtral | -| [Qwen (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | -| [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B/72B | qwen2_vl | -| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | -| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | -| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi | -| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | -| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | +| 模型名 | 模型大小 | Template | +| ----------------------------------------------------------------- | -------------------------------- | ---------------- | +| 
[Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | +| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | +| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | +| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | +| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | +| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | +| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | +| [GLM-4](https://huggingface.co/THUDM) | 9B | glm4 | +| [InternLM2/InternLM2.5](https://huggingface.co/internlm) | 7B/20B | intern2 | +| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | +| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | +| [Llama 3-3.2](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 | +| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava | +| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next | +| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video | +| [MiniCPM](https://huggingface.co/openbmb) | 1B/2B/4B | cpm/cpm3 | +| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | +| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | +| [PaliGemma](https://huggingface.co/google) | 3B | paligemma | +| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | +| [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | +| [Pixtral](https://huggingface.co/mistral-community) | 12B | pixtral | +| [Qwen (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | +| [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B/72B | qwen2_vl | +| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | +| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | +| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi | +| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | +| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | > [!NOTE] > 对于所有“基座”(Base)模型,`template` 参数可以是 `default`, `alpaca`, `vicuna` 等任意值。但“对话”(Instruct/Chat)模型请务必使用**对应的模板**。 @@ -722,4 +722,4 @@ run_name: test_run # 可选 ## Star History -![Star History Chart](https://api.star-history.com/svg?repos=hiyouga/LLaMA-Factory&type=Date) +![Star History Chart](https://api.star-history.com/svg?repos=hiyouga/LLaMA-Factory&type=Date) \ No newline at end of file From 2c5f912e16baebc0f7caf4b31696f9620c293f5f Mon Sep 17 00:00:00 2001 From: Kingsley Date: Mon, 14 Oct 2024 16:55:59 +0800 Subject: [PATCH 10/34] remove bs condition Former-commit-id: 962b9730a7a2940a0d4e5c76d1fe41d0fef76547 --- src/llamafactory/data/mm_plugin.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index edca48a7..b9e7bc3b 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -477,7 +477,7 @@ class PixtralPlugin(BasePlugin): if image_input_sizes is None: raise ValueError("The number of images does not match the number of {} tokens".format(IMAGE_PLACEHOLDER)) - + image_size = image_input_sizes[0][img_id] height, width = image_size num_height_tokens = height // patch_size @@ -500,7 +500,7 @@ class PixtralPlugin(BasePlugin): raise ValueError("The number of images does not match the number of {} tokens".format(IMAGE_PLACEHOLDER)) return messages - + 
     @override
     def get_mm_inputs(
         self,
@@ -516,11 +516,6 @@ class PixtralPlugin(BasePlugin):
         if mm_inputs.get("image_sizes"):
             mm_inputs.pop("image_sizes")

-        if isinstance(mm_inputs.get("pixel_values"), list) and len(mm_inputs.get("pixel_values")[0]) >= 2:
-            raise ValueError("Now it only supports batchsize=1 on per gpu due to `List[tensor]` can not pack into BachEncoding")
-
-        mm_inputs["pixel_values"] = mm_inputs.get("pixel_values")[0][0].unsqueeze(0)
-
         return mm_inputs

 class Qwen2vlPlugin(BasePlugin):

From df722bf18e8ce0484054edb1cbbdee2eb33195f3 Mon Sep 17 00:00:00 2001
From: KUANGDD
Date: Mon, 14 Oct 2024 21:11:09 +0800
Subject: [PATCH 11/34] required transformers version

Former-commit-id: 9f44598b92e72cf8dd923eb229f4637ab9287948
---
 src/llamafactory/data/mm_plugin.py | 6 ++++++
 src/llamafactory/extras/misc.py    | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py
index b9e7bc3b..9d81848b 100644
--- a/src/llamafactory/data/mm_plugin.py
+++ b/src/llamafactory/data/mm_plugin.py
@@ -692,6 +692,12 @@ def get_mm_plugin(
     video_token: Optional[str] = None,
 ) -> "BasePlugin":
     plugin_class = PLUGINS.get(name, None)
+    if name == "pixtral":
+        from transformers.utils.versions import require_version
+        try:
+            require_version("transformers==4.46.0.dev0")
+        except Exception as e:
+            raise ImportError("PixtralPlugin requires transformers==4.46.0.dev0. Please install it first.")
     if plugin_class is None:
         raise ValueError("Multimodal plugin `{}` not found.".format(name))

diff --git a/src/llamafactory/extras/misc.py b/src/llamafactory/extras/misc.py
index fd78530a..47f2ebbe 100644
--- a/src/llamafactory/extras/misc.py
+++ b/src/llamafactory/extras/misc.py
@@ -79,7 +79,7 @@ def check_dependencies() -> None:
     if os.environ.get("DISABLE_VERSION_CHECK", "0").lower() in ["true", "1"]:
         logger.warning("Version checking has been disabled, may lead to unexpected behaviors.")
     else:
-        require_version("transformers>=4.41.2,<=4.45.2", "To fix: pip install transformers>=4.41.2,<=4.45.2")
+        require_version("transformers>=4.41.2", "To fix: pip install transformers>=4.41.2")
         require_version("datasets>=2.16.0,<=2.21.0", "To fix: pip install datasets>=2.16.0,<=2.21.0")
         require_version("accelerate>=0.30.1,<=0.34.2", "To fix: pip install accelerate>=0.30.1,<=0.34.2")
         require_version("peft>=0.11.1,<=0.12.0", "To fix: pip install peft>=0.11.1,<=0.12.0")

From 5e440a467dcb6e9ebbc1408418611046966be2b1 Mon Sep 17 00:00:00 2001
From: KUANGDD
Date: Tue, 15 Oct 2024 12:12:46 +0800
Subject: [PATCH 12/34] plugin test & check

Former-commit-id: 2df2be1c47aded0132b5cc86acd3926dca585bc1
---
 src/llamafactory/data/mm_plugin.py |  2 +-
 tests/data/test_mm_plugin.py       | 16 +++++++++++++++-
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py
index 9d81848b..5f128706 100644
--- a/src/llamafactory/data/mm_plugin.py
+++ b/src/llamafactory/data/mm_plugin.py
@@ -697,7 +697,7 @@ def get_mm_plugin(
         try:
             require_version("transformers==4.46.0.dev0")
         except Exception as e:
-            raise ImportError("PixtralPlugin requires transformers==4.46.0.dev0. Please install it first.")
+            raise ImportError("PixtralPlugin requires transformers>=4.46.0.dev0. Please install it first.")
     if plugin_class is None:
         raise ValueError("Multimodal plugin `{}` not found.".format(name))

diff --git a/tests/data/test_mm_plugin.py b/tests/data/test_mm_plugin.py
index 75541000..70b61444 100644
--- a/tests/data/test_mm_plugin.py
+++ b/tests/data/test_mm_plugin.py
@@ -178,7 +178,18 @@ def test_paligemma_plugin():
     check_inputs["expected_no_mm_inputs"] = {"token_type_ids": [[1] * 1024]}
     _check_plugin(**check_inputs)

-
+def test_pixtral_plugin():
+    tokenizer, processor = _load_tokenizer_module(model_name_or_path="mistral-community/pixtral-12b")
+    pixtral_plugin = get_mm_plugin(name="pixtral", image_token="[IMG]")
+    image_slice_heigt, image_slice_width = 2, 2
+    check_inputs = {"plugin": pixtral_plugin, "tokenizer": tokenizer, "processor": processor}
+    check_inputs["expected_mm_messages"] = [
+        {key: value.replace("<image>", "{}[IMG_BREAK]".format("[IMG]" * image_slice_width) * image_slice_heigt).rsplit("[IMG_BREAK]", 1)[0] + "[IMG_END]"
+        for key, value in message.items()} for message in MM_MESSAGES
+    ]
+    check_inputs["expected_mm_inputs"] = _get_mm_inputs(processor)
+    _check_plugin(**check_inputs)
+
 def test_qwen2_vl_plugin():
     tokenizer, processor = _load_tokenizer_module(model_name_or_path="Qwen/Qwen2-VL-7B-Instruct")
     qwen2_vl_plugin = get_mm_plugin(name="qwen2_vl", image_token="<|image_pad|>")
@@ -206,3 +217,6 @@ def test_video_llava_plugin():
     ]
     check_inputs["expected_mm_inputs"] = _get_mm_inputs(processor)
     _check_plugin(**check_inputs)
+
+if __name__ == "__main__":
+    test_pixtral_plugin()
\ No newline at end of file

From a3f37777c1628d7992ee1deb3a9418fb751ba724 Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Tue, 15 Oct 2024 13:30:41 +0800
Subject: [PATCH 13/34] fix some

Former-commit-id: 25641af04c98e902ff024c8fa7b4c2c36ed797de
---
 tests/data/test_mm_plugin.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/tests/data/test_mm_plugin.py b/tests/data/test_mm_plugin.py
index 70b61444..d3c3f021 100644
--- a/tests/data/test_mm_plugin.py
+++ b/tests/data/test_mm_plugin.py
@@ -178,18 +178,20 @@ def test_paligemma_plugin():
     check_inputs["expected_no_mm_inputs"] = {"token_type_ids": [[1] * 1024]}
     _check_plugin(**check_inputs)

+
 def test_pixtral_plugin():
     tokenizer, processor = _load_tokenizer_module(model_name_or_path="mistral-community/pixtral-12b")
     pixtral_plugin = get_mm_plugin(name="pixtral", image_token="[IMG]")
-    image_slice_heigt, image_slice_width = 2, 2
+    image_slice_height, image_slice_width = 2, 2
     check_inputs = {"plugin": pixtral_plugin, "tokenizer": tokenizer, "processor": processor}
     check_inputs["expected_mm_messages"] = [
-        {key: value.replace("<image>", "{}[IMG_BREAK]".format("[IMG]" * image_slice_width) * image_slice_heigt).rsplit("[IMG_BREAK]", 1)[0] + "[IMG_END]"
+        {key: value.replace("<image>", "{}[IMG_BREAK]".format("[IMG]" * image_slice_width) * image_slice_height).rsplit("[IMG_BREAK]", 1)[0] + "[IMG_END]"
         for key, value in message.items()} for message in MM_MESSAGES
     ]
     check_inputs["expected_mm_inputs"] = _get_mm_inputs(processor)
     _check_plugin(**check_inputs)

From 66ee9f04891ef3ac1f4f006b67102da32cc6ed7e Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Tue, 15 Oct 2024 13:53:33 +0800
Subject: [PATCH 14/34] tiny fix [skip ci]

Former-commit-id: 58b97197b8aea2820c1b7eb338753dba6b22f3d7
---
 src/llamafactory/data/mm_plugin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py
index 5f128706..15785844 100644
--- a/src/llamafactory/data/mm_plugin.py
+++ b/src/llamafactory/data/mm_plugin.py
@@ -680,8 +680,8 @@ PLUGINS = {
     "llava_next": LlavaNextPlugin,
     "llava_next_video": LlavaNextVideoPlugin,
     "paligemma": PaliGemmaPlugin,
-    "qwen2_vl": Qwen2vlPlugin,
     "pixtral": PixtralPlugin,
+    "qwen2_vl": Qwen2vlPlugin,
     "video_llava": VideoLlavaPlugin,
 }

From ae869639dd1fafef8d9c7c738f46c42ff4322ddb Mon Sep 17 00:00:00 2001
From: Kingsley
Date: Tue, 15 Oct 2024 17:09:24 +0800
Subject: [PATCH 15/34] add extra test for pixtral mm_input

Former-commit-id: 0fc949783dec2d038dc3d1bf52051c256b69ac20
---
 src/llamafactory/data/mm_plugin.py |  2 --
 tests/data/test_mm_plugin.py       | 42 ++++++++++++++++++++++++++++--
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py
index 5f128706..f3f6433c 100644
--- a/src/llamafactory/data/mm_plugin.py
+++ b/src/llamafactory/data/mm_plugin.py
@@ -513,8 +513,6 @@ class PixtralPlugin(BasePlugin):
     ) -> Dict[str, Union[List[int], "torch.Tensor"]]:
         self._validate_input(images, videos)
         mm_inputs = self._get_mm_inputs(images, videos, processor)
-        if mm_inputs.get("image_sizes"):
-            mm_inputs.pop("image_sizes")

         return mm_inputs

diff --git a/tests/data/test_mm_plugin.py b/tests/data/test_mm_plugin.py
index d3c3f021..da64ccc3 100644
--- a/tests/data/test_mm_plugin.py
+++ b/tests/data/test_mm_plugin.py
@@ -13,7 +13,7 @@
 # limitations under the License.
import os
-from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Tuple
+from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Tuple, Union
 
 import pytest
 import torch
@@ -68,12 +68,50 @@ def _get_mm_inputs(processor: "ProcessorMixin") -> Dict[str, "torch.Tensor"]:
     image_processor: "BaseImageProcessor" = getattr(processor, "image_processor")
     return image_processor(images=IMAGES, return_tensors="pt")
 
+def _is_nested_tensor_list(element):
+    if not isinstance(element, list):
+        return False
+
+    for item in element:
+        if isinstance(item, list):
+            if not _is_nested_tensor_list(item):
+                return False
+
+        elif not isinstance(item, torch.Tensor):
+            return False
+
+    return True
+
+
+def _equal_nested_tensor_list(a: List[List[torch.Tensor]], b: List[List[torch.Tensor]]) -> bool:
+    if type(a) != type(b):
+        return False
+
+    if isinstance(a, list) and isinstance(b, list):
+        if len(a) != len(b):
+            return False
+
+        for sub_a, sub_b in zip(a, b):
+            if isinstance(sub_a, list) and isinstance(sub_b, list):
+                if not _equal_nested_tensor_list(sub_a, sub_b):
+                    return False
+            elif isinstance(sub_a, torch.Tensor) and isinstance(sub_b, torch.Tensor):
+                if not torch.equal(sub_a, sub_b):
+                    return False
+            else:
+                return False
+
+        return True
+
+    return False
 
 def _is_close(batch_a: Dict[str, Any], batch_b: Dict[str, Any]) -> None:
     assert batch_a.keys() == batch_b.keys()
     for key in batch_a.keys():
         if isinstance(batch_a[key], torch.Tensor):
             assert torch.allclose(batch_a[key], batch_b[key], rtol=1e-4, atol=1e-5)
+        elif _is_nested_tensor_list(batch_a[key]) and _is_nested_tensor_list(batch_b[key]):
+            assert _equal_nested_tensor_list(batch_a[key], batch_b[key])
         else:
             assert batch_a[key] == batch_b[key]
 
@@ -185,7 +223,7 @@ def test_pixtral_plugin():
     image_slice_height, image_slice_width = 2, 2
     check_inputs = {"plugin": pixtral_plugin, "tokenizer": tokenizer, "processor": processor}
     check_inputs["expected_mm_messages"] = [
-        {key: value.replace("<image>", "{}[IMG_BREAK]".format("[IMG]" * image_slice_width) * image_slice_height).rsplit("[IMG_BREAK]", 1)[0] + "[IMG_END]"
+        {key: value.replace("<image>", ("{}[IMG_BREAK]".format("[IMG]" * image_slice_width) * image_slice_height).rsplit("[IMG_BREAK]", 1)[0] + "[IMG_END]")
         for key, value in message.items()} for message in MM_MESSAGES
     ]
     check_inputs["expected_mm_inputs"] = _get_mm_inputs(processor)

From 9c4941a1eab20a35f6e693aacec32b8f9245165a Mon Sep 17 00:00:00 2001
From: KUANGDD
Date: Wed, 16 Oct 2024 01:09:33 +0800
Subject: [PATCH 16/34] fix bug for webui infer

Former-commit-id: 7ea29bbfe03550ac59ff9cb01a4bc41c95ac3adf
---
 src/llamafactory/data/mm_plugin.py | 6 ++++++
 tests/data/test_mm_plugin.py       | 5 +++--
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py
index de7e362a..f67737f5 100644
--- a/src/llamafactory/data/mm_plugin.py
+++ b/src/llamafactory/data/mm_plugin.py
@@ -513,6 +513,12 @@ class PixtralPlugin(BasePlugin):
     ) -> Dict[str, Union[List[int], "torch.Tensor"]]:
         self._validate_input(images, videos)
         mm_inputs = self._get_mm_inputs(images, videos, processor)
+        # hack for hf engine
+        if mm_inputs.get("pixel_values") and len(mm_inputs.get("pixel_values")[0]) == 1:
+            mm_inputs["pixel_values"] = mm_inputs["pixel_values"][0][0].unsqueeze(0)
+
+        if mm_inputs.get("image_sizes"):
+            del mm_inputs["image_sizes"]
 
         return mm_inputs
 
diff --git a/tests/data/test_mm_plugin.py b/tests/data/test_mm_plugin.py
index da64ccc3..32b89f7f 100644
--- a/tests/data/test_mm_plugin.py
+++ b/tests/data/test_mm_plugin.py
@@
-110,8 +110,6 @@ def _is_close(batch_a: Dict[str, Any], batch_b: Dict[str, Any]) -> None: for key in batch_a.keys(): if isinstance(batch_a[key], torch.Tensor): assert torch.allclose(batch_a[key], batch_b[key], rtol=1e-4, atol=1e-5) - elif _is_nested_tensor_list(batch_a[key]) and _is_nested_tensor_list(batch_b[key]): - assert _equal_nested_tensor_list(batch_a[key], batch_b[key]) else: assert batch_a[key] == batch_b[key] @@ -227,6 +225,9 @@ def test_pixtral_plugin(): for key, value in message.items()} for message in MM_MESSAGES ] check_inputs["expected_mm_inputs"] = _get_mm_inputs(processor) + # TODO works needed for pixtral plugin test & hack hf engine input below for now + check_inputs["expected_mm_inputs"].pop("image_sizes") + check_inputs["expected_mm_inputs"]["pixel_values"] = check_inputs["expected_mm_inputs"]["pixel_values"][0][0].unsqueeze(0) _check_plugin(**check_inputs) From a24f94a36c40c3ce5d5fb07771b8ace9cec47dd8 Mon Sep 17 00:00:00 2001 From: KUANGDD Date: Wed, 16 Oct 2024 01:14:51 +0800 Subject: [PATCH 17/34] remove useless codes Former-commit-id: 9b2642a2b53d3392e95061ed0f2c8dc10580c9e8 --- tests/data/test_mm_plugin.py | 36 ------------------------------------ 1 file changed, 36 deletions(-) diff --git a/tests/data/test_mm_plugin.py b/tests/data/test_mm_plugin.py index 32b89f7f..b342e658 100644 --- a/tests/data/test_mm_plugin.py +++ b/tests/data/test_mm_plugin.py @@ -68,42 +68,6 @@ def _get_mm_inputs(processor: "ProcessorMixin") -> Dict[str, "torch.Tensor"]: image_processor: "BaseImageProcessor" = getattr(processor, "image_processor") return image_processor(images=IMAGES, return_tensors="pt") -def _is_nested_tensor_list(element): - if not isinstance(element, list): - return False - - for item in element: - if isinstance(item, list): - if not _is_nested_tensor_list(item): - return False - - elif not isinstance(item, torch.Tensor): - return False - - return True - - -def _equal_nested_tensor_list(a: List[List[torch.Tensor]], b: List[List[torch.Tensor]]) -> bool: - if type(a) != type(b): - return False - - if isinstance(a, list) and isinstance(b, list): - if len(a) != len(b): - return False - - for sub_a, sub_b in zip(a, b): - if isinstance(sub_a, list) and isinstance(sub_b, list): - if not _equal_nested_tensor_list(sub_a, sub_b): - return False - elif isinstance(sub_a, torch.Tensor) and isinstance(sub_b, torch.Tensor): - if not torch.equal(sub_a, sub_b): - return False - else: - return False - - return True - - return False def _is_close(batch_a: Dict[str, Any], batch_b: Dict[str, Any]) -> None: assert batch_a.keys() == batch_b.keys() From 16d4149c2507d52e5488716746cd48281067cb40 Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Wed, 16 Oct 2024 15:55:30 +0800 Subject: [PATCH 18/34] tiny fix [skip ci] Former-commit-id: 1724a58b478d7960ed07e864620249091b242e34 --- tests/data/test_mm_plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data/test_mm_plugin.py b/tests/data/test_mm_plugin.py index b342e658..cc9bf2a1 100644 --- a/tests/data/test_mm_plugin.py +++ b/tests/data/test_mm_plugin.py @@ -13,7 +13,7 @@ # limitations under the License. 
import os -from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Tuple import pytest import torch From 79433fb6a6745789a31bca5e656d52551d4a157b Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Thu, 17 Oct 2024 19:46:36 +0800 Subject: [PATCH 19/34] Update README.md Former-commit-id: 1fea87183561559f140f8de9b869e893ff8a3378 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cbb4df12..7ec1c87b 100644 --- a/README.md +++ b/README.md @@ -740,4 +740,4 @@ This repo benefits from [PEFT](https://github.com/huggingface/peft), [TRL](https ## Star History -![Star History Chart](https://api.star-history.com/svg?repos=hiyouga/LLaMA-Factory&type=Date) \ No newline at end of file +![Star History Chart](https://api.star-history.com/svg?repos=hiyouga/LLaMA-Factory&type=Date) From 6fbf77aa54f2a6a0b84cec457356fef6730035d7 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Thu, 17 Oct 2024 19:47:33 +0800 Subject: [PATCH 20/34] Update README_zh.md Former-commit-id: 110e4c548dac4a1838d069d312d9f27af90cb1e3 --- README_zh.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README_zh.md b/README_zh.md index 0d9f8fdf..a578afcb 100644 --- a/README_zh.md +++ b/README_zh.md @@ -741,4 +741,4 @@ run_name: test_run # 可选 ## Star History -![Star History Chart](https://api.star-history.com/svg?repos=hiyouga/LLaMA-Factory&type=Date) \ No newline at end of file +![Star History Chart](https://api.star-history.com/svg?repos=hiyouga/LLaMA-Factory&type=Date) From af50c0387979d01df1527250707ca699fd732b32 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Thu, 17 Oct 2024 19:48:12 +0800 Subject: [PATCH 21/34] Update loader.py Former-commit-id: 93b9067dfc44d3a8570c39831539a8aa3e27e2db --- src/llamafactory/model/loader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/llamafactory/model/loader.py b/src/llamafactory/model/loader.py index 957d5e4e..7613c092 100644 --- a/src/llamafactory/model/loader.py +++ b/src/llamafactory/model/loader.py @@ -116,7 +116,6 @@ def load_config(model_args: "ModelArguments") -> "PretrainedConfig": Loads model config. 
""" init_kwargs = _get_init_kwargs(model_args) - return AutoConfig.from_pretrained(model_args.model_name_or_path, **init_kwargs) From 8fb211ad0e8e090631f560e53ef13f5c4b82ef59 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Thu, 17 Oct 2024 19:48:51 +0800 Subject: [PATCH 22/34] Update misc.py Former-commit-id: 769fbb6349006effa261f40fc055f670fae3e98d --- src/llamafactory/extras/misc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llamafactory/extras/misc.py b/src/llamafactory/extras/misc.py index 47f2ebbe..74d3f595 100644 --- a/src/llamafactory/extras/misc.py +++ b/src/llamafactory/extras/misc.py @@ -79,7 +79,7 @@ def check_dependencies() -> None: if os.environ.get("DISABLE_VERSION_CHECK", "0").lower() in ["true", "1"]: logger.warning("Version checking has been disabled, may lead to unexpected behaviors.") else: - require_version("transformers>=4.41.2", "To fix: pip install transformers>=4.41.2,<=4.45.2") + require_version("transformers>=4.41.2,<=4.46.0", "To fix: pip install transformers>=4.41.2,<=4.46.0") require_version("datasets>=2.16.0,<=2.21.0", "To fix: pip install datasets>=2.16.0,<=2.21.0") require_version("accelerate>=0.30.1,<=0.34.2", "To fix: pip install accelerate>=0.30.1,<=0.34.2") require_version("peft>=0.11.1,<=0.12.0", "To fix: pip install peft>=0.11.1,<=0.12.0") From 62cbcb646ab91b050b058ca788da4ea1c3c578bc Mon Sep 17 00:00:00 2001 From: KUANGDD Date: Wed, 23 Oct 2024 15:24:07 +0800 Subject: [PATCH 23/34] modify style & little change Former-commit-id: 9d6143e36a12e0f295139d057aeb1843535435cf --- src/llamafactory/chat/hf_engine.py | 11 +++++++++- src/llamafactory/data/collator.py | 3 +++ src/llamafactory/data/mm_plugin.py | 30 +++++++++++++--------------- src/llamafactory/data/template.py | 2 +- src/llamafactory/extras/constants.py | 2 +- src/llamafactory/model/loader.py | 2 +- tests/data/test_mm_plugin.py | 20 ++++++++++++++----- 7 files changed, 45 insertions(+), 25 deletions(-) diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py index 2b1d9fe5..53fb666a 100644 --- a/src/llamafactory/chat/hf_engine.py +++ b/src/llamafactory/chat/hf_engine.py @@ -165,8 +165,17 @@ class HuggingfaceEngine(BaseEngine): ) mm_inputs = template.mm_plugin.get_mm_inputs(**mm_input_dict, seqlens=[prompt_length], processor=processor) + for key, value in mm_inputs.items(): - value = value if isinstance(value, torch.Tensor) else torch.tensor(value) + value = ( + value + if isinstance(value, torch.Tensor) + else ( + torch.stack(value) + if isinstance(value, list) and all(isinstance(v, torch.Tensor) for v in value) + else torch.tensor(value) + ) + ) gen_kwargs[key] = value.to(model.device) return gen_kwargs, prompt_length diff --git a/src/llamafactory/data/collator.py b/src/llamafactory/data/collator.py index 92d86cc7..e92d2ab3 100644 --- a/src/llamafactory/data/collator.py +++ b/src/llamafactory/data/collator.py @@ -99,6 +99,9 @@ class MultiModalDataCollatorForSeq2Seq(DataCollatorForSeq2Seq): features: Dict[str, "torch.Tensor"] = super().__call__(features) features.update(mm_inputs) + if features.get("pixel_values") is not None and isinstance(features["pixel_values"], list): + features = features.data + return features diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index f67737f5..a138c058 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -4,6 +4,7 @@ from io import BytesIO from typing import TYPE_CHECKING, Dict, List, Optional, Sequence, Tuple, TypedDict, Union 
import numpy as np +import torch from transformers.image_utils import get_image_size, to_numpy_array from typing_extensions import override @@ -447,6 +448,7 @@ class PaliGemmaPlugin(BasePlugin): mm_inputs["token_type_ids"] = _get_paligemma_token_type_ids(imglens, seqlens, processor) return mm_inputs + class PixtralPlugin(BasePlugin): @override def process_messages( @@ -466,32 +468,28 @@ class PixtralPlugin(BasePlugin): img_kwargs = self._get_mm_inputs(images, videos, processor) image_input_sizes = None - if img_kwargs.get("pixel_values") is not None: - image_input_sizes = img_kwargs["image_sizes"] + image_input_sizes = img_kwargs.get("image_sizes", None) messages = deepcopy(messages) for message in messages: content = message["content"] - img_id = 0 while IMAGE_PLACEHOLDER in content: - if image_input_sizes is None: - raise ValueError("The number of images does not match the number of {} tokens".format(IMAGE_PLACEHOLDER)) + raise ValueError( + "The number of images does not match the number of {} tokens".format(IMAGE_PLACEHOLDER) + ) - image_size = image_input_sizes[0][img_id] + image_size = image_input_sizes[0][num_image_tokens] height, width = image_size num_height_tokens = height // patch_size num_width_tokens = width // patch_size - replace_tokens = [ - [image_token] * num_width_tokens + [image_break_token] - ] * num_height_tokens + replace_tokens = [[image_token] * num_width_tokens + [image_break_token]] * num_height_tokens # Flatten list replace_tokens = [item for sublist in replace_tokens for item in sublist] replace_tokens[-1] = image_end_token replace_str = "".join(replace_tokens) content = content.replace(IMAGE_PLACEHOLDER, replace_str, 1) - img_id += 1 num_image_tokens += 1 message["content"] = content @@ -514,14 +512,13 @@ class PixtralPlugin(BasePlugin): self._validate_input(images, videos) mm_inputs = self._get_mm_inputs(images, videos, processor) # hack for hf engine - if mm_inputs.get("pixel_values") and len(mm_inputs.get("pixel_values")[0]) == 1: - mm_inputs["pixel_values"] = mm_inputs["pixel_values"][0][0].unsqueeze(0) - - if mm_inputs.get("image_sizes"): - del mm_inputs["image_sizes"] + if mm_inputs.get("pixel_values"): + mm_inputs["pixel_values"] = mm_inputs["pixel_values"][0] + mm_inputs.pop("image_sizes", None) return mm_inputs + class Qwen2vlPlugin(BasePlugin): @override def _preprocess_image(self, image: "ImageObject", **kwargs) -> "ImageObject": @@ -698,9 +695,10 @@ def get_mm_plugin( plugin_class = PLUGINS.get(name, None) if plugin_class == "PixtralPlugin": from transformers.utils.versions import require_version + try: require_version("transformers==4.46.0.dev0") - except Exception as e: + except Exception: raise ImportError("PixtralPlugin requires transformers>=4.46.0.dev0. 
Please install it first.") if plugin_class is None: raise ValueError("Multimodal plugin `{}` not found.".format(name)) diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index 28ad2295..a9618885 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -938,7 +938,7 @@ _register_template( name="pixtral", format_user=StringFormatter(slots=["[INST] {{content}} [/INST]"]), format_prefix=EmptyFormatter(slots=[{"bos_token"}]), - mm_plugin=get_mm_plugin(name="pixtral", image_token="[IMG]") + mm_plugin=get_mm_plugin(name="pixtral", image_token="[IMG]"), ) diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index cf5df20c..237afeec 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -1185,7 +1185,7 @@ register_model_group( } }, template="pixtral", - vision=True + vision=True, ) diff --git a/src/llamafactory/model/loader.py b/src/llamafactory/model/loader.py index 957d5e4e..299e6333 100644 --- a/src/llamafactory/model/loader.py +++ b/src/llamafactory/model/loader.py @@ -116,7 +116,7 @@ def load_config(model_args: "ModelArguments") -> "PretrainedConfig": Loads model config. """ init_kwargs = _get_init_kwargs(model_args) - + return AutoConfig.from_pretrained(model_args.model_name_or_path, **init_kwargs) diff --git a/tests/data/test_mm_plugin.py b/tests/data/test_mm_plugin.py index b342e658..66e9b57c 100644 --- a/tests/data/test_mm_plugin.py +++ b/tests/data/test_mm_plugin.py @@ -13,7 +13,7 @@ # limitations under the License. import os -from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Tuple import pytest import torch @@ -74,6 +74,10 @@ def _is_close(batch_a: Dict[str, Any], batch_b: Dict[str, Any]) -> None: for key in batch_a.keys(): if isinstance(batch_a[key], torch.Tensor): assert torch.allclose(batch_a[key], batch_b[key], rtol=1e-4, atol=1e-5) + elif isinstance(batch_a[key], list) and all(isinstance(item, torch.Tensor) for item in batch_a[key]): + assert len(batch_a[key]) == len(batch_b[key]) + for tensor_a, tensor_b in zip(batch_a[key], batch_b[key]): + assert torch.allclose(tensor_a, tensor_b, rtol=1e-4, atol=1e-5) else: assert batch_a[key] == batch_b[key] @@ -185,13 +189,19 @@ def test_pixtral_plugin(): image_slice_height, image_slice_width = 2, 2 check_inputs = {"plugin": pixtral_plugin, "tokenizer": tokenizer, "processor": processor} check_inputs["expected_mm_messages"] = [ - {key: value.replace("", ("{}[IMG_BREAK]".format("[IMG]" * image_slice_width) * image_slice_height).rsplit("[IMG_BREAK]", 1)[0] + "[IMG_END]") - for key, value in message.items()} for message in MM_MESSAGES + { + key: value.replace( + "", + ("{}[IMG_BREAK]".format("[IMG]" * image_slice_width) * image_slice_height).rsplit("[IMG_BREAK]", 1)[0] + + "[IMG_END]", + ) + for key, value in message.items() + } + for message in MM_MESSAGES ] check_inputs["expected_mm_inputs"] = _get_mm_inputs(processor) - # TODO works needed for pixtral plugin test & hack hf engine input below for now check_inputs["expected_mm_inputs"].pop("image_sizes") - check_inputs["expected_mm_inputs"]["pixel_values"] = check_inputs["expected_mm_inputs"]["pixel_values"][0][0].unsqueeze(0) + check_inputs["expected_mm_inputs"]["pixel_values"] = check_inputs["expected_mm_inputs"]["pixel_values"][0] _check_plugin(**check_inputs) From 006b708b57cf135a5bb3c9ba2a1f36858842f576 Mon Sep 17 00:00:00 2001 From: KUANGDD Date: Wed, 23 Oct 2024 
15:32:33 +0800 Subject: [PATCH 24/34] rm import torch Former-commit-id: a8571844569416b59d7e9c5fcc4f9a4809d8700c --- src/llamafactory/data/mm_plugin.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index a138c058..9249d4ab 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -4,7 +4,6 @@ from io import BytesIO from typing import TYPE_CHECKING, Dict, List, Optional, Sequence, Tuple, TypedDict, Union import numpy as np -import torch from transformers.image_utils import get_image_size, to_numpy_array from typing_extensions import override From b9c6fcfe98856ea1e4ff49bf53115360d3fd88a4 Mon Sep 17 00:00:00 2001 From: KUANGDD Date: Wed, 23 Oct 2024 15:38:11 +0800 Subject: [PATCH 25/34] rm useless code Former-commit-id: f5c1cdfaab939fb970393452081950a4b6e3604d --- src/llamafactory/data/mm_plugin.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index 9249d4ab..acdcca4e 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -692,13 +692,6 @@ def get_mm_plugin( video_token: Optional[str] = None, ) -> "BasePlugin": plugin_class = PLUGINS.get(name, None) - if plugin_class == "PixtralPlugin": - from transformers.utils.versions import require_version - - try: - require_version("transformers==4.46.0.dev0") - except Exception: - raise ImportError("PixtralPlugin requires transformers>=4.46.0.dev0. Please install it first.") if plugin_class is None: raise ValueError("Multimodal plugin `{}` not found.".format(name)) From 9d1f079ca5eff1e629bc8d047dbd765aaec96cdd Mon Sep 17 00:00:00 2001 From: KUANGDD Date: Wed, 23 Oct 2024 15:50:59 +0800 Subject: [PATCH 26/34] rm comment Former-commit-id: c1d17b0f013f40b48322cb13979fe3726dc124a6 --- src/llamafactory/data/mm_plugin.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index acdcca4e..a601562d 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -510,7 +510,6 @@ class PixtralPlugin(BasePlugin): ) -> Dict[str, Union[List[int], "torch.Tensor"]]: self._validate_input(images, videos) mm_inputs = self._get_mm_inputs(images, videos, processor) - # hack for hf engine if mm_inputs.get("pixel_values"): mm_inputs["pixel_values"] = mm_inputs["pixel_values"][0] From 2876b429bc17434c4cb9c8b0d4f6f34abdd49dd1 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 29 Oct 2024 21:57:28 +0800 Subject: [PATCH 27/34] Update README.md Former-commit-id: 1b57df074ab4deb29749086ccb10b459eebf5143 --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7ec1c87b..c98b8d25 100644 --- a/README.md +++ b/README.md @@ -190,7 +190,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ | [PaliGemma](https://huggingface.co/google) | 3B | paligemma | | [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | | [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | -| [Pixtral](https://huggingface.co/mistral-community) | 12B | pixtral | +| [Pixtral](https://huggingface.co/mistralai) | 12B | pixtral | | [Qwen (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | | [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B/72B | qwen2_vl | | [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | @@ -716,7 +716,7 @@ If 
you have a project that should be incorporated, please contact via email or c This repository is licensed under the [Apache-2.0 License](LICENSE). -Please follow the model licenses to use the corresponding model weights: [Baichuan 2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [GLM-4](https://huggingface.co/THUDM/glm-4-9b/blob/main/LICENSE) / [InternLM2](https://github.com/InternLM/InternLM#license) / [Llama](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [Llama 2 (LLaVA-1.5)](https://ai.meta.com/llama/license/) / [Llama 3](https://llama.meta.com/llama3/license/) / [MiniCPM](https://github.com/OpenBMB/MiniCPM/blob/main/MiniCPM%20Model%20License.md) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/Phi-2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder 2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yi-1.5](LICENSE) / [Yuan 2](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan) +Please follow the model licenses to use the corresponding model weights: [Baichuan 2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [GLM-4](https://huggingface.co/THUDM/glm-4-9b/blob/main/LICENSE) / [InternLM2](https://github.com/InternLM/InternLM#license) / [Llama](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [Llama 2 (LLaVA-1.5)](https://ai.meta.com/llama/license/) / [Llama 3](https://llama.meta.com/llama3/license/) / [MiniCPM](https://github.com/OpenBMB/MiniCPM/blob/main/MiniCPM%20Model%20License.md) / [Mistral/Mixtral/Pixtral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/Phi-2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder 2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yi-1.5](LICENSE) / [Yuan 2](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan) ## Citation From eca50b89a244a0159b04c6335ae758a70dc9d7de Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 29 Oct 2024 
21:58:03 +0800 Subject: [PATCH 28/34] Update README_zh.md Former-commit-id: 8fa20bf4272666e0ed9bcbfee8e4fe66801ef10c --- README_zh.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README_zh.md b/README_zh.md index a578afcb..b932973c 100644 --- a/README_zh.md +++ b/README_zh.md @@ -191,7 +191,7 @@ https://github.com/user-attachments/assets/e6ce34b0-52d5-4f3e-a830-592106c4c272 | [PaliGemma](https://huggingface.co/google) | 3B | paligemma | | [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | | [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | -| [Pixtral](https://huggingface.co/mistral-community) | 12B | pixtral | +| [Pixtral](https://huggingface.co/mistralai) | 12B | pixtral | | [Qwen (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | | [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B/72B | qwen2_vl | | [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | @@ -717,7 +717,7 @@ run_name: test_run # 可选 本仓库的代码依照 [Apache-2.0](LICENSE) 协议开源。 -使用模型权重时,请遵循对应的模型协议:[Baichuan 2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [GLM-4](https://huggingface.co/THUDM/glm-4-9b/blob/main/LICENSE) / [InternLM2](https://github.com/InternLM/InternLM#license) / [Llama](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [Llama 2 (LLaVA-1.5)](https://ai.meta.com/llama/license/) / [Llama 3](https://llama.meta.com/llama3/license/) / [MiniCPM](https://github.com/OpenBMB/MiniCPM/blob/main/MiniCPM%20Model%20License.md) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/Phi-2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder 2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yi-1.5](LICENSE) / [Yuan 2](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan) +使用模型权重时,请遵循对应的模型协议:[Baichuan 2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [GLM-4](https://huggingface.co/THUDM/glm-4-9b/blob/main/LICENSE) / [InternLM2](https://github.com/InternLM/InternLM#license) / [Llama](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [Llama 2 (LLaVA-1.5)](https://ai.meta.com/llama/license/) / [Llama 3](https://llama.meta.com/llama3/license/) / 
[MiniCPM](https://github.com/OpenBMB/MiniCPM/blob/main/MiniCPM%20Model%20License.md) / [Mistral/Mixtral/Pixtral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/Phi-2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder 2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yi-1.5](LICENSE) / [Yuan 2](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan) ## 引用 From 90cd3538decdd35d30bcd1c7131f2cc65ec90ad7 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 29 Oct 2024 22:00:59 +0800 Subject: [PATCH 29/34] Update hf_engine.py Former-commit-id: 6e212fdab5f48c955db250ecfc197b89f8856e4b --- src/llamafactory/chat/hf_engine.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py index 53fb666a..87d9c451 100644 --- a/src/llamafactory/chat/hf_engine.py +++ b/src/llamafactory/chat/hf_engine.py @@ -165,17 +165,12 @@ class HuggingfaceEngine(BaseEngine): ) mm_inputs = template.mm_plugin.get_mm_inputs(**mm_input_dict, seqlens=[prompt_length], processor=processor) - for key, value in mm_inputs.items(): - value = ( - value - if isinstance(value, torch.Tensor) - else ( - torch.stack(value) - if isinstance(value, list) and all(isinstance(v, torch.Tensor) for v in value) - else torch.tensor(value) - ) - ) + if isinstance(value, list) and all(isinstance(v, torch.Tensor for v in value)): # for pixtral inputs + value = torch.stack(value) # assume they have same sizes + elif not isinstance(value, torch.Tensor): + value = torch.tensor(value) + gen_kwargs[key] = value.to(model.device) return gen_kwargs, prompt_length From fb8f35558adf1556c02ea518f9a5c59e6060944e Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 29 Oct 2024 22:03:42 +0800 Subject: [PATCH 30/34] Update collator.py Former-commit-id: f745c4b28f532c7084d4b8522c972e735729ecee --- src/llamafactory/data/collator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llamafactory/data/collator.py b/src/llamafactory/data/collator.py index 42b4f565..8fa6f0dd 100644 --- a/src/llamafactory/data/collator.py +++ b/src/llamafactory/data/collator.py @@ -99,8 +99,8 @@ class MultiModalDataCollatorForSeq2Seq(DataCollatorForSeq2Seq): features: Dict[str, "torch.Tensor"] = super().__call__(features) features.update(mm_inputs) - if features.get("pixel_values") is not None and isinstance(features["pixel_values"], list): - features = features.data + if isinstance(features.get("pixel_values"), list): # for pixtral inputs + features = features.data # use default_collate() instead of BatchEncoding.to() return features From 2179b91acb96bb4ab420fae1556c6f8275286a3c Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 29 Oct 2024 22:10:29 +0800 Subject: [PATCH 31/34] Update visual.py Former-commit-id: 0baa7735f64cbef9bd90e1db485c120b4c1c88bd --- src/llamafactory/model/model_utils/visual.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llamafactory/model/model_utils/visual.py b/src/llamafactory/model/model_utils/visual.py index beec1884..bcd21841 100644 --- a/src/llamafactory/model/model_utils/visual.py +++ b/src/llamafactory/model/model_utils/visual.py @@ -113,6 
+113,7 @@ def configure_visual_model(config: "PretrainedConfig") -> None: "llava_next", "llava_next_video", "paligemma", + "pixtral", "video_llava", ]: # required for ds zero3 and valuehead models setattr(config, "hidden_size", getattr(config.text_config, "hidden_size", None)) From 54961946ac2217a40f566b7a48d3c43b247b6ede Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 29 Oct 2024 22:11:21 +0800 Subject: [PATCH 32/34] Update template.py Former-commit-id: c3239ba6b24481b092e86ce94a6337fba18c25c0 --- src/llamafactory/data/template.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index 2a11427b..d0da3b30 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -934,6 +934,7 @@ _register_template( replace_eos=True, ) + _register_template( name="pixtral", format_user=StringFormatter(slots=["[INST] {{content}} [/INST]"]), From 49054329d04786e4542c38d1608bec36215162e3 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 29 Oct 2024 22:16:22 +0800 Subject: [PATCH 33/34] Update mm_plugin.py Former-commit-id: 049c554aee25cf1e29bee88dfb21381b3a4a2947 --- src/llamafactory/data/mm_plugin.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index de348896..52c65cb7 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -457,19 +457,16 @@ class PixtralPlugin(BasePlugin): videos: Sequence["VideoInput"], processor: Optional["ProcessorMixin"], ) -> List[Dict[str, str]]: - patch_size = processor.patch_size - image_token = processor.image_token - image_break_token = processor.image_break_token - image_end_token = processor.image_end_token - self._validate_input(images, videos) + patch_size = getattr(processor, "patch_size") + image_token = getattr(processor, "image_token") + image_break_token = getattr(processor, "image_break_token") + image_end_token = getattr(processor, "image_end_token") + num_image_tokens = 0 - img_kwargs = self._get_mm_inputs(images, videos, processor) - image_input_sizes = None - - image_input_sizes = img_kwargs.get("image_sizes", None) - messages = deepcopy(messages) + mm_inputs = self._get_mm_inputs(images, videos, processor) + image_input_sizes = mm_inputs.get("image_sizes", None) for message in messages: content = message["content"] while IMAGE_PLACEHOLDER in content: @@ -483,12 +480,10 @@ class PixtralPlugin(BasePlugin): num_height_tokens = height // patch_size num_width_tokens = width // patch_size replace_tokens = [[image_token] * num_width_tokens + [image_break_token]] * num_height_tokens - # Flatten list - replace_tokens = [item for sublist in replace_tokens for item in sublist] + replace_tokens = [item for sublist in replace_tokens for item in sublist] # flatten list replace_tokens[-1] = image_end_token replace_str = "".join(replace_tokens) content = content.replace(IMAGE_PLACEHOLDER, replace_str, 1) - num_image_tokens += 1 message["content"] = content From 15786539d7fd4009b34efaeec18a684a1ed94a7f Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 29 Oct 2024 22:19:04 +0800 Subject: [PATCH 34/34] fix bug Former-commit-id: bb0afda8fbb7d3556c1742bccf0b33ee3bf0f0b6 --- src/llamafactory/chat/hf_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py index 87d9c451..909f8161 100644 --- a/src/llamafactory/chat/hf_engine.py +++ 
b/src/llamafactory/chat/hf_engine.py @@ -166,7 +166,7 @@ class HuggingfaceEngine(BaseEngine): mm_inputs = template.mm_plugin.get_mm_inputs(**mm_input_dict, seqlens=[prompt_length], processor=processor) for key, value in mm_inputs.items(): - if isinstance(value, list) and all(isinstance(v, torch.Tensor for v in value)): # for pixtral inputs + if isinstance(value, list) and all(isinstance(v, torch.Tensor) for v in value): # for pixtral inputs value = torch.stack(value) # assume they have same sizes elif not isinstance(value, torch.Tensor): value = torch.tensor(value)
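

For reference, the placeholder expansion that PixtralPlugin.process_messages performs in its final form ([PATCH 33/34]) can be reproduced as a standalone sketch. This is illustrative only and not part of the patch series: the helper name expand_image_placeholder is invented here, and the 32x32 image with 16-pixel patches is an assumed size chosen to reproduce the 2x2 grid used in test_pixtral_plugin.

# Minimal sketch (assumed helper, not from the patches) of the Pixtral rule:
# each row of patches becomes [IMG] tokens ending in [IMG_BREAK]; the final
# break is replaced by [IMG_END].
def expand_image_placeholder(height: int, width: int, patch_size: int = 16) -> str:
    num_height_tokens = height // patch_size  # rows of patches
    num_width_tokens = width // patch_size  # patches per row
    replace_tokens = [["[IMG]"] * num_width_tokens + ["[IMG_BREAK]"]] * num_height_tokens
    replace_tokens = [item for sublist in replace_tokens for item in sublist]  # flatten list
    replace_tokens[-1] = "[IMG_END]"
    return "".join(replace_tokens)

# A 32x32 image with 16px patches yields the 2x2 grid asserted by the unit test.
assert expand_image_placeholder(32, 32) == "[IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]"

This produces the same string the test builds via ("{}[IMG_BREAK]".format("[IMG]" * image_slice_width) * image_slice_height).rsplit("[IMG_BREAK]", 1)[0] + "[IMG_END]".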
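Likewise, the tensor normalization that [PATCH 29/34] introduces in hf_engine.py and [PATCH 34/34] fixes can be exercised in isolation. A minimal sketch under the same assumption the patch comments state (the listed tensors share one shape); normalize_mm_value is an invented name and the shapes are dummy data:

import torch

def normalize_mm_value(value):
    # Mirrors the post-fix branch order: stack a list of equal-sized tensors
    # (the pixtral pixel_values case), wrap plain values, pass tensors through.
    if isinstance(value, list) and all(isinstance(v, torch.Tensor) for v in value):
        return torch.stack(value)  # assumes identical sizes, as the patch notes
    if not isinstance(value, torch.Tensor):
        return torch.tensor(value)
    return value

pixel_values = [torch.zeros(3, 64, 64), torch.zeros(3, 64, 64)]  # dummy inputs
assert normalize_mm_value(pixel_values).shape == (2, 3, 64, 64)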