From 300feb3245a1a643902a55fa6932c1ce62aef825 Mon Sep 17 00:00:00 2001 From: Kingsley Date: Thu, 26 Sep 2024 12:11:58 +0800 Subject: [PATCH 01/34] add pixtral template Former-commit-id: e0bcaa6c6e902e29361438a6d215bbc2535b648f --- src/llamafactory/data/mm_plugin.py | 6 +++++ src/llamafactory/data/template.py | 7 +++++ src/llamafactory/extras/constants.py | 10 ++++++++ src/llamafactory/model/loader.py | 38 ++++++++++++++++++++++++++++ 4 files changed, 61 insertions(+) diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index e22e2760..ea0f2185 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -323,6 +323,12 @@ class PaliGemmaPlugin(BasePlugin): mm_inputs["token_type_ids"] = _get_paligemma_token_type_ids(imglens, seqlens, processor) return mm_inputs +class PixtralPlugin(BasePlugin): + #TODO preprocess according to Pixtral hf + from transformers import LlavaForConditionalGeneration + @override + def _preprocess_image(self, image: "ImageObject", **kwargs) -> "ImageObject": + pass class Qwen2vlPlugin(BasePlugin): @override diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index 54da4757..9b844d88 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -821,6 +821,13 @@ _register_template( replace_eos=True, ) +_register_template( + name="pixtral", + format_user=StringFormatter(slots=["[INST] {{content}} [/INST]"]), + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), + mm_plugin=get_mm_plugin(name="pixtral", image_token="[IMG]") +) + _register_template( name="qwen", diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index 8d8d4424..e88f0da7 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -894,6 +894,16 @@ register_model_group( template="mistral", ) +register_model_group( + models={ + "Pixtral-12B-2409": { + DownloadSource.DEFAULT: "mistral-community/pixtral-12b", + DownloadSource.MODELSCOPE: "AI-ModelScope/pixtral-12b", + } + }, + template="mistral" +) + register_model_group( models={ diff --git a/src/llamafactory/model/loader.py b/src/llamafactory/model/loader.py index 030ce90f..bc4e101c 100644 --- a/src/llamafactory/model/loader.py +++ b/src/llamafactory/model/loader.py @@ -119,6 +119,44 @@ def load_config(model_args: "ModelArguments") -> "PretrainedConfig": Loads model config. 
""" init_kwargs = _get_init_kwargs(model_args) + if "pixtral" in model_args.model_name_or_path: + from transformers import PretrainedConfig + + class PixtralVisionConfig(PretrainedConfig): + model_type = "pixtral" + + def __init__( + self, + hidden_size=1024, + intermediate_size=4096, + num_hidden_layers=24, + num_attention_heads=16, + num_channels=3, + image_size=1024, + patch_size=16, + hidden_act="gelu", + attention_dropout=0.0, + rope_theta=10000.0, + tie_word_embeddings=False, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.attention_dropout = attention_dropout + self.hidden_act = hidden_act + self.rope_theta = rope_theta + self.tie_word_embeddings = tie_word_embeddings + self.head_dim = hidden_size // num_attention_heads + + return PixtralVisionConfig() + return AutoConfig.from_pretrained(model_args.model_name_or_path, **init_kwargs) From 9390927875d640461f3450605eead71eb0403244 Mon Sep 17 00:00:00 2001 From: Kingsley Date: Thu, 26 Sep 2024 17:14:51 +0800 Subject: [PATCH 02/34] add pixtral template Former-commit-id: c7b4e47e0fda955272ccd6340b2047fd92acbfcf --- src/llamafactory/data/mm_plugin.py | 64 ++++++++++++++++++++++++++++-- src/llamafactory/model/loader.py | 37 ----------------- 2 files changed, 60 insertions(+), 41 deletions(-) diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index ea0f2185..0e59ec0b 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -24,6 +24,7 @@ if TYPE_CHECKING: from av.stream import Stream from transformers import PreTrainedTokenizer, ProcessorMixin from transformers.image_processing_utils import BaseImageProcessor + from transformers.processing_utils import _validate_images_text_input_order, ProcessingKwargs class EncodedImage(TypedDict): path: Optional[str] @@ -324,11 +325,65 @@ class PaliGemmaPlugin(BasePlugin): return mm_inputs class PixtralPlugin(BasePlugin): - #TODO preprocess according to Pixtral hf - from transformers import LlavaForConditionalGeneration @override - def _preprocess_image(self, image: "ImageObject", **kwargs) -> "ImageObject": - pass + def process_messages( + self, + messages: Sequence[Dict[str, str]], + images: Sequence["ImageInput"], + videos: Sequence["VideoInput"], + processor: Optional["ProcessorMixin"], + ) -> List[Dict[str, str]]: + patch_size = processor.patch_size + image_token = processor.image_token + image_break_token = processor.image_break_token + image_end_token = processor.image_end_token + + self._validate_input(images, videos) + num_image_tokens = 0 + image_input_sizes = self._get_mm_inputs(images, videos, processor)["image_sizes"] + messages = deepcopy(messages) + print(image_input_sizes[0], messages) + for message in messages: + content = message["content"] + img_id = 0 + while IMAGE_PLACEHOLDER in content: + # only support one image for one time? 
+ image_size = image_input_sizes[0][0] + height, width = image_size + num_height_tokens = height // patch_size + num_width_tokens = width // patch_size + replace_tokens = [ + [image_token] * num_width_tokens + [image_break_token] + ] * num_height_tokens + # Flatten list + replace_tokens = [item for sublist in replace_tokens for item in sublist] + replace_tokens[-1] = image_end_token + replace_str = "".join(replace_tokens) + content.replace(IMAGE_PLACEHOLDER, replace_str, 1) + + img_id += 1 + num_image_tokens += 1 + + message["content"] = content + + if len(images) != num_image_tokens: + raise ValueError("The number of images does not match the number of {} tokens".format(IMAGE_PLACEHOLDER)) + + return messages + + @override + def get_mm_inputs( + self, + images: Sequence["ImageInput"], + videos: Sequence["VideoInput"], + imglens: Sequence[int], + vidlens: Sequence[int], + seqlens: Sequence[int], + processor: Optional["ProcessorMixin"], + ) -> Dict[str, Union[List[int], "torch.Tensor"]]: + + self._validate_input(images, videos) + return self._get_mm_inputs(images, videos, processor) class Qwen2vlPlugin(BasePlugin): @override @@ -428,6 +483,7 @@ PLUGINS = { "llava": LlavaPlugin, "paligemma": PaliGemmaPlugin, "qwen2_vl": Qwen2vlPlugin, + "pixtral": PixtralPlugin, } diff --git a/src/llamafactory/model/loader.py b/src/llamafactory/model/loader.py index bc4e101c..96d61645 100644 --- a/src/llamafactory/model/loader.py +++ b/src/llamafactory/model/loader.py @@ -119,43 +119,6 @@ def load_config(model_args: "ModelArguments") -> "PretrainedConfig": Loads model config. """ init_kwargs = _get_init_kwargs(model_args) - if "pixtral" in model_args.model_name_or_path: - from transformers import PretrainedConfig - - class PixtralVisionConfig(PretrainedConfig): - model_type = "pixtral" - - def __init__( - self, - hidden_size=1024, - intermediate_size=4096, - num_hidden_layers=24, - num_attention_heads=16, - num_channels=3, - image_size=1024, - patch_size=16, - hidden_act="gelu", - attention_dropout=0.0, - rope_theta=10000.0, - tie_word_embeddings=False, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size - self.attention_dropout = attention_dropout - self.hidden_act = hidden_act - self.rope_theta = rope_theta - self.tie_word_embeddings = tie_word_embeddings - self.head_dim = hidden_size // num_attention_heads - - return PixtralVisionConfig() return AutoConfig.from_pretrained(model_args.model_name_or_path, **init_kwargs) From e4c57f54f8dda6c5158ac9b500e953f0702fd874 Mon Sep 17 00:00:00 2001 From: Kingsley Date: Sat, 28 Sep 2024 02:14:06 +0800 Subject: [PATCH 03/34] remove some unnecessary if conditions Former-commit-id: 482d3e5ff3338385da664475fee88c7dc623c993 --- src/llamafactory/chat/hf_engine.py | 2 +- src/llamafactory/data/mm_plugin.py | 36 +++++++++++++++++++++++----- src/llamafactory/extras/constants.py | 22 +++++++++-------- 3 files changed, 43 insertions(+), 17 deletions(-) diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py index 2b1d9fe5..68416fdf 100644 --- a/src/llamafactory/chat/hf_engine.py +++ b/src/llamafactory/chat/hf_engine.py @@ -168,7 +168,7 @@ class HuggingfaceEngine(BaseEngine): for key, value in mm_inputs.items(): value = value if isinstance(value, torch.Tensor) else torch.tensor(value) 
gen_kwargs[key] = value.to(model.device) - + return gen_kwargs, prompt_length @staticmethod diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index 0e59ec0b..6716527c 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -325,6 +325,14 @@ class PaliGemmaPlugin(BasePlugin): return mm_inputs class PixtralPlugin(BasePlugin): + # @override + # def _preprocess_image(self, image: "ImageObject", **kwargs) -> "ImageObject": + # image = super()._preprocess_image(image, **kwargs) + # UP_SIZE = (512,512) + # image = image.resize(UP_SIZE, resample=Image.NEAREST) + + # return image + @override def process_messages( self, @@ -340,15 +348,22 @@ class PixtralPlugin(BasePlugin): self._validate_input(images, videos) num_image_tokens = 0 - image_input_sizes = self._get_mm_inputs(images, videos, processor)["image_sizes"] + img_kwargs = self._get_mm_inputs(images, videos, processor) + image_input_sizes = None + + if img_kwargs.get("pixel_values") is not None: + image_input_sizes = img_kwargs["image_sizes"] + messages = deepcopy(messages) - print(image_input_sizes[0], messages) for message in messages: content = message["content"] img_id = 0 while IMAGE_PLACEHOLDER in content: - # only support one image for one time? - image_size = image_input_sizes[0][0] + + if image_input_sizes is None: + raise ValueError("The number of images does not match the number of {} tokens".format(IMAGE_PLACEHOLDER)) + + image_size = image_input_sizes[0][img_id] height, width = image_size num_height_tokens = height // patch_size num_width_tokens = width // patch_size @@ -359,7 +374,7 @@ class PixtralPlugin(BasePlugin): replace_tokens = [item for sublist in replace_tokens for item in sublist] replace_tokens[-1] = image_end_token replace_str = "".join(replace_tokens) - content.replace(IMAGE_PLACEHOLDER, replace_str, 1) + content = content.replace(IMAGE_PLACEHOLDER, replace_str, 1) img_id += 1 num_image_tokens += 1 @@ -383,7 +398,16 @@ class PixtralPlugin(BasePlugin): ) -> Dict[str, Union[List[int], "torch.Tensor"]]: self._validate_input(images, videos) - return self._get_mm_inputs(images, videos, processor) + mm_inputs = self._get_mm_inputs(images, videos, processor) + if mm_inputs.get('image_sizes'): + del mm_inputs['image_sizes'] + # TODO fix this type error + # if isinstance(mm_inputs.get("pixel_values"), list): #List[List[torch.tensor]] -> [B C W H] + # recommend for batch==1 for one gpu or it will rise the error of BatchEncoding. 
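+        # pixel_values arrives as List[List[Tensor]] (samples x images); keep
+        # the first image of the first sample and restore a leading batch dim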
+ mm_inputs["pixel_values"] = mm_inputs.get("pixel_values")[0][0].unsqueeze(0) + # mm_inputs["pixel_values"] = mm_inputs.get("pixel_values") + + return mm_inputs class Qwen2vlPlugin(BasePlugin): @override diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index ef075cf9..e3f6a99d 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -917,16 +917,6 @@ register_model_group( template="mistral", ) -register_model_group( - models={ - "Pixtral-12B-2409": { - DownloadSource.DEFAULT: "mistral-community/pixtral-12b", - DownloadSource.MODELSCOPE: "AI-ModelScope/pixtral-12b", - } - }, - template="mistral" -) - register_model_group( models={ @@ -1067,6 +1057,18 @@ register_model_group( ) +register_model_group( + models={ + "Pixtral-12B-2409": { + DownloadSource.DEFAULT: "mistral-community/pixtral-12b", + DownloadSource.MODELSCOPE: "AI-ModelScope/pixtral-12b", + } + }, + template="mistral", + vision=True +) + + register_model_group( models={ "Qwen-1.8B": { From bddb2646bd8363a77a8fd51c0b5f56ba16d25385 Mon Sep 17 00:00:00 2001 From: Kingsley Date: Sat, 28 Sep 2024 22:50:53 +0800 Subject: [PATCH 04/34] tiny fix Former-commit-id: 35bc71b2a68fd303798c35fe22ad29ceea87cf9b --- README.md | 1 + README_zh.md | 1 + src/llamafactory/data/mm_plugin.py | 21 ++++++-------------- src/llamafactory/extras/constants.py | 4 ++-- src/llamafactory/model/model_utils/visual.py | 13 ++++++++++-- 5 files changed, 21 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 92bbcc88..cf37565b 100644 --- a/README.md +++ b/README.md @@ -183,6 +183,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ | [PaliGemma](https://huggingface.co/google) | 3B | paligemma | | [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | | [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | +| [Pixtral](https://huggingface.co/mistralai/Pixtral-12B-2409) | 12B | pixtral | | [Qwen (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | | [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B/72B | qwen2_vl | | [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | diff --git a/README_zh.md b/README_zh.md index 0b02f35f..4b3b53de 100644 --- a/README_zh.md +++ b/README_zh.md @@ -184,6 +184,7 @@ https://github.com/user-attachments/assets/e6ce34b0-52d5-4f3e-a830-592106c4c272 | [PaliGemma](https://huggingface.co/google) | 3B | paligemma | | [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | | [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | +| [Pixtral](https://huggingface.co/mistralai/Pixtral-12B-2409) | 12B | pixtral | | [Qwen (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | | [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B/72B | qwen2_vl | | [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index 6716527c..2b85c2c5 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -325,14 +325,6 @@ class PaliGemmaPlugin(BasePlugin): return mm_inputs class PixtralPlugin(BasePlugin): - # @override - # def _preprocess_image(self, image: "ImageObject", **kwargs) -> "ImageObject": - # image = super()._preprocess_image(image, **kwargs) - # UP_SIZE = (512,512) - # image = image.resize(UP_SIZE, resample=Image.NEAREST) - - # return image - @override def process_messages( 
self, @@ -396,16 +388,15 @@ class PixtralPlugin(BasePlugin): seqlens: Sequence[int], processor: Optional["ProcessorMixin"], ) -> Dict[str, Union[List[int], "torch.Tensor"]]: - self._validate_input(images, videos) mm_inputs = self._get_mm_inputs(images, videos, processor) - if mm_inputs.get('image_sizes'): - del mm_inputs['image_sizes'] - # TODO fix this type error - # if isinstance(mm_inputs.get("pixel_values"), list): #List[List[torch.tensor]] -> [B C W H] - # recommend for batch==1 for one gpu or it will rise the error of BatchEncoding. + if mm_inputs.get("image_sizes"): + mm_inputs.pop("image_sizes") + + if isinstance(mm_inputs.get("pixel_values"), list) and len(mm_inputs.get("pixel_values")[0]) >= 2: + raise ValueError("Now it only supports batchsize=1 on per gpu due to `List[tensor]` can not pack into BachEncoding") + mm_inputs["pixel_values"] = mm_inputs.get("pixel_values")[0][0].unsqueeze(0) - # mm_inputs["pixel_values"] = mm_inputs.get("pixel_values") return mm_inputs diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index e3f6a99d..3de1c7a2 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -1060,8 +1060,8 @@ register_model_group( register_model_group( models={ "Pixtral-12B-2409": { - DownloadSource.DEFAULT: "mistral-community/pixtral-12b", - DownloadSource.MODELSCOPE: "AI-ModelScope/pixtral-12b", + DownloadSource.DEFAULT: "mistralai/Pixtral-12B-2409", + DownloadSource.MODELSCOPE: "LLM-Research/Pixtral-12B-2409", } }, template="mistral", diff --git a/src/llamafactory/model/model_utils/visual.py b/src/llamafactory/model/model_utils/visual.py index 23f880a6..107590bd 100644 --- a/src/llamafactory/model/model_utils/visual.py +++ b/src/llamafactory/model/model_utils/visual.py @@ -96,6 +96,9 @@ def autocast_projector_dtype(model: "PreTrainedModel", model_args: "ModelArgumen mm_projector: "torch.nn.Module" = getattr(model, "multi_modal_projector") elif model_type == "qwen2_vl": mm_projector: "torch.nn.Module" = getattr(getattr(model, "visual"), "merger") + # TODO check it + elif model_type == "pixtral": + mm_projector: "torch.nn.Module" = getattr(model, "vision_language_adapte") else: return @@ -122,9 +125,11 @@ def get_forbidden_modules(config: "PretrainedConfig", finetuning_args: "Finetuni """ model_type = getattr(config, "model_type", None) forbidden_modules = set() - if model_type in ["llava", "paligemma"]: + if model_type in ["llava", "paligemma", "pixtral"]: if finetuning_args.freeze_vision_tower: forbidden_modules.add("vision_tower") + #TODO check it + forbidden_modules.add("vision_encoder") if finetuning_args.train_mm_proj_only: forbidden_modules.add("language_model") @@ -150,7 +155,7 @@ def get_image_seqlen(config: "PretrainedConfig") -> int: image_seqlen += 1 elif model_type == "paligemma": image_seqlen = config.vision_config.num_image_tokens - elif model_type == "qwen2_vl": # variable length + elif model_type in ["qwen2_vl", "pixtral"]: # variable length image_seqlen = -1 return image_seqlen @@ -168,10 +173,14 @@ def patch_target_modules( return "^(?!.*vision_tower).*(?:{}).*".format("|".join(target_modules)) elif model_type == "qwen2_vl": return "^(?!.*visual).*(?:{}).*".format("|".join(target_modules)) + elif model_type == "pixtral": + return "^(?!.*vision_encoder).*(?:{}).*".format("|".join(target_modules)) else: return target_modules else: if model_type == "qwen2_vl": return "^(?!.*patch_embed).*(?:{}).*".format("|".join(target_modules)) + elif model_type == "pixtral": + return 
"^(?!.*patch_conv).*(?:{}).*".format("|".join(target_modules)) else: return target_modules From e641f1215a36f8f1e69ee6310d6798424c7a37da Mon Sep 17 00:00:00 2001 From: Kingsley Date: Sun, 29 Sep 2024 00:00:23 +0800 Subject: [PATCH 05/34] Tiny fix Former-commit-id: ae66e1a545f4cd209a57fd824f9bfb7e94436cba --- README.md | 2 +- README_zh.md | 2 +- src/llamafactory/data/mm_plugin.py | 3 +-- src/llamafactory/extras/constants.py | 6 +++--- src/llamafactory/model/model_utils/visual.py | 2 -- 5 files changed, 6 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index cf37565b..899ad0c4 100644 --- a/README.md +++ b/README.md @@ -183,7 +183,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ | [PaliGemma](https://huggingface.co/google) | 3B | paligemma | | [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | | [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | -| [Pixtral](https://huggingface.co/mistralai/Pixtral-12B-2409) | 12B | pixtral | +| [Pixtral](https://huggingface.co/mistral-community/pixtral-12b) | 12B | pixtral | | [Qwen (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | | [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B/72B | qwen2_vl | | [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | diff --git a/README_zh.md b/README_zh.md index 4b3b53de..e7335b72 100644 --- a/README_zh.md +++ b/README_zh.md @@ -184,7 +184,7 @@ https://github.com/user-attachments/assets/e6ce34b0-52d5-4f3e-a830-592106c4c272 | [PaliGemma](https://huggingface.co/google) | 3B | paligemma | | [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | | [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | -| [Pixtral](https://huggingface.co/mistralai/Pixtral-12B-2409) | 12B | pixtral | +| [Pixtral](https://huggingface.co/mistral-community/pixtral-12b) | 12B | pixtral | | [Qwen (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | | [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B/72B | qwen2_vl | | [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index 2b85c2c5..8688a8be 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -24,7 +24,6 @@ if TYPE_CHECKING: from av.stream import Stream from transformers import PreTrainedTokenizer, ProcessorMixin from transformers.image_processing_utils import BaseImageProcessor - from transformers.processing_utils import _validate_images_text_input_order, ProcessingKwargs class EncodedImage(TypedDict): path: Optional[str] @@ -392,7 +391,7 @@ class PixtralPlugin(BasePlugin): mm_inputs = self._get_mm_inputs(images, videos, processor) if mm_inputs.get("image_sizes"): mm_inputs.pop("image_sizes") - + if isinstance(mm_inputs.get("pixel_values"), list) and len(mm_inputs.get("pixel_values")[0]) >= 2: raise ValueError("Now it only supports batchsize=1 on per gpu due to `List[tensor]` can not pack into BachEncoding") diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index 3de1c7a2..32af244b 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -1059,9 +1059,9 @@ register_model_group( register_model_group( models={ - "Pixtral-12B-2409": { - DownloadSource.DEFAULT: "mistralai/Pixtral-12B-2409", - DownloadSource.MODELSCOPE: "LLM-Research/Pixtral-12B-2409", + "Pixtral-12B": { + 
DownloadSource.DEFAULT: "mistral-community/pixtral-12b", + DownloadSource.MODELSCOPE: "AI-ModelScope/pixtral-12b", } }, template="mistral", diff --git a/src/llamafactory/model/model_utils/visual.py b/src/llamafactory/model/model_utils/visual.py index 107590bd..8aad4d87 100644 --- a/src/llamafactory/model/model_utils/visual.py +++ b/src/llamafactory/model/model_utils/visual.py @@ -96,7 +96,6 @@ def autocast_projector_dtype(model: "PreTrainedModel", model_args: "ModelArgumen mm_projector: "torch.nn.Module" = getattr(model, "multi_modal_projector") elif model_type == "qwen2_vl": mm_projector: "torch.nn.Module" = getattr(getattr(model, "visual"), "merger") - # TODO check it elif model_type == "pixtral": mm_projector: "torch.nn.Module" = getattr(model, "vision_language_adapte") else: @@ -128,7 +127,6 @@ def get_forbidden_modules(config: "PretrainedConfig", finetuning_args: "Finetuni if model_type in ["llava", "paligemma", "pixtral"]: if finetuning_args.freeze_vision_tower: forbidden_modules.add("vision_tower") - #TODO check it forbidden_modules.add("vision_encoder") if finetuning_args.train_mm_proj_only: From 8fd84c375e15eaf8600b33ca4d26ea79715dea7c Mon Sep 17 00:00:00 2001 From: Kingsley Date: Mon, 30 Sep 2024 19:58:34 +0800 Subject: [PATCH 06/34] fix some errors due to inconsistency of model cards Former-commit-id: dd83265b9b8768eb8732f59ace128dfe4aac1c47 --- README.md | 62 ++++++++++---------- README_zh.md | 62 ++++++++++---------- src/llamafactory/chat/hf_engine.py | 2 +- src/llamafactory/extras/constants.py | 4 +- src/llamafactory/model/model_utils/visual.py | 5 +- 5 files changed, 66 insertions(+), 69 deletions(-) diff --git a/README.md b/README.md index 3f0642c5..f6d0edf2 100644 --- a/README.md +++ b/README.md @@ -162,37 +162,37 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ ## Supported Models -| Model | Model size | Template | -| ----------------------------------------------------------------- | -------------------------------- | ---------------- | -| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | -| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | -| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | -| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | -| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | -| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | -| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | -| [GLM-4](https://huggingface.co/THUDM) | 9B | glm4 | -| [InternLM2/InternLM2.5](https://huggingface.co/internlm) | 7B/20B | intern2 | -| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | -| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | -| [Llama 3-3.2](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 | -| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava | -| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next | -| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video | -| [MiniCPM](https://huggingface.co/openbmb) | 1B/2B/4B | cpm/cpm3 | -| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | -| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | -| [PaliGemma](https://huggingface.co/google) | 3B | paligemma | -| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | -| 
[Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | -| [Pixtral](https://huggingface.co/mistral-community/pixtral-12b) | 12B | pixtral | -| [Qwen (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | -| [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B/72B | qwen2_vl | -| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | -| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | -| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi | -| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | -| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | +| Model | Model size | Template | +|-------------------------------------------------------------| -------------------------------- | ---------------- | +| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | +| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | +| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | +| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | +| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | +| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | +| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | +| [GLM-4](https://huggingface.co/THUDM) | 9B | glm4 | +| [InternLM2/InternLM2.5](https://huggingface.co/internlm) | 7B/20B | intern2 | +| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | +| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | +| [Llama 3-3.2](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 | +| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava | +| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next | +| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video | +| [MiniCPM](https://huggingface.co/openbmb) | 1B/2B/4B | cpm/cpm3 | +| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | +| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | +| [PaliGemma](https://huggingface.co/google) | 3B | paligemma | +| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | +| [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | +| [Pixtral](https://huggingface.co/mistral-community) | 12B | pixtral | +| [Qwen (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | +| [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B/72B | qwen2_vl | +| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | +| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | +| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi | +| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | +| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | > [!NOTE] > For the "base" models, the `template` argument can be chosen from `default`, `alpaca`, `vicuna` etc. But make sure to use the **corresponding template** for the "instruct/chat" models. 
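As a point of reference for the `pixtral` template row above: the placeholder expansion that `PixtralPlugin.process_messages` builds up across patches 02-04 can be sketched in isolation. This is a minimal sketch, assuming the `[IMG]`, `[IMG_BREAK]`, and `[IMG_END]` special tokens and the `height // patch_size` arithmetic from the diffs above; `expand_image_placeholder` is an illustrative name, not a function defined in this PR.

```python
IMAGE_PLACEHOLDER = "<image>"  # placeholder used in LLaMA-Factory datasets


def expand_image_placeholder(content: str, height: int, width: int, patch_size: int = 16) -> str:
    """Replace one <image> placeholder with Pixtral's image-token grid."""
    num_height_tokens = height // patch_size  # token rows
    num_width_tokens = width // patch_size  # tokens per row
    rows = [["[IMG]"] * num_width_tokens + ["[IMG_BREAK]"] for _ in range(num_height_tokens)]
    tokens = [token for row in rows for token in row]  # flatten row-major
    tokens[-1] = "[IMG_END]"  # final row is closed by [IMG_END] instead of [IMG_BREAK]
    return content.replace(IMAGE_PLACEHOLDER, "".join(tokens), 1)


# A 32x48 image with 16-pixel patches becomes a 2x3 token grid:
# "Describe [IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG_END] please."
print(expand_image_placeholder("Describe <image> please.", height=32, width=48))
```

Emitting one `[IMG_BREAK]` per row is what lets Pixtral encode images of arbitrary size and aspect ratio, which is also why patch 04 makes `get_image_seqlen` return -1 (variable length) for the `pixtral` model type.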
diff --git a/README_zh.md b/README_zh.md index f66b0e02..fddc6d2c 100644 --- a/README_zh.md +++ b/README_zh.md @@ -163,37 +163,37 @@ https://github.com/user-attachments/assets/e6ce34b0-52d5-4f3e-a830-592106c4c272 ## 模型 -| 模型名 | 模型大小 | Template | -| ----------------------------------------------------------------- | -------------------------------- | ---------------- | -| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | -| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | -| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | -| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | -| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | -| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | -| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | -| [GLM-4](https://huggingface.co/THUDM) | 9B | glm4 | -| [InternLM2/InternLM2.5](https://huggingface.co/internlm) | 7B/20B | intern2 | -| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | -| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | -| [Llama 3-3.2](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 | -| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava | -| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next | -| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video | -| [MiniCPM](https://huggingface.co/openbmb) | 1B/2B/4B | cpm/cpm3 | -| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | -| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | -| [PaliGemma](https://huggingface.co/google) | 3B | paligemma | -| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | -| [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | -| [Pixtral](https://huggingface.co/mistral-community/pixtral-12b) | 12B | pixtral | -| [Qwen (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | -| [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B/72B | qwen2_vl | -| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | -| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | -| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi | -| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | -| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | +| 模型名 | 模型大小 | Template | +|-------------------------------------------------------------| -------------------------------- | ---------------- | +| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | +| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | +| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | +| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | +| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | +| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | +| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | +| [GLM-4](https://huggingface.co/THUDM) | 9B | glm4 | +| [InternLM2/InternLM2.5](https://huggingface.co/internlm) | 7B/20B | intern2 | +| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | +| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | +| [Llama 
3-3.2](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 | +| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava | +| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next | +| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video | +| [MiniCPM](https://huggingface.co/openbmb) | 1B/2B/4B | cpm/cpm3 | +| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | +| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | +| [PaliGemma](https://huggingface.co/google) | 3B | paligemma | +| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | +| [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | +| [Pixtral](https://huggingface.co/mistral-community) | 12B | pixtral | +| [Qwen (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | +| [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B/72B | qwen2_vl | +| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | +| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | +| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi | +| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | +| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | > [!NOTE] > 对于所有“基座”(Base)模型,`template` 参数可以是 `default`, `alpaca`, `vicuna` 等任意值。但“对话”(Instruct/Chat)模型请务必使用**对应的模板**。 diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py index 68416fdf..2b1d9fe5 100644 --- a/src/llamafactory/chat/hf_engine.py +++ b/src/llamafactory/chat/hf_engine.py @@ -168,7 +168,7 @@ class HuggingfaceEngine(BaseEngine): for key, value in mm_inputs.items(): value = value if isinstance(value, torch.Tensor) else torch.tensor(value) gen_kwargs[key] = value.to(model.device) - + return gen_kwargs, prompt_length @staticmethod diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index 17bb5aed..ed1aff63 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -1164,12 +1164,12 @@ register_model_group( register_model_group( models={ - "Pixtral-12B": { + "Pixtral-12B-Instruct": { DownloadSource.DEFAULT: "mistral-community/pixtral-12b", DownloadSource.MODELSCOPE: "AI-ModelScope/pixtral-12b", } }, - template="mistral", + template="pixtral", vision=True ) diff --git a/src/llamafactory/model/model_utils/visual.py b/src/llamafactory/model/model_utils/visual.py index a30be115..bc3ef676 100644 --- a/src/llamafactory/model/model_utils/visual.py +++ b/src/llamafactory/model/model_utils/visual.py @@ -92,12 +92,10 @@ def autocast_projector_dtype(model: "PreTrainedModel", model_args: "ModelArgumen if getattr(model, "quantization_method", None): model_type = getattr(model.config, "model_type", None) - if model_type in ["llava", "llava_next", "llava_next_video", "paligemma", "video_llava"]: + if model_type in ["llava", "llava_next", "llava_next_video", "paligemma", "pixtral", "video_llava"]: mm_projector: "torch.nn.Module" = getattr(model, "multi_modal_projector") elif model_type == "qwen2_vl": mm_projector: "torch.nn.Module" = getattr(getattr(model, "visual"), "merger") - elif model_type == "pixtral": - mm_projector: "torch.nn.Module" = getattr(model, "vision_language_adapte") else: return @@ -133,7 +131,6 @@ def get_forbidden_modules(config: "PretrainedConfig", finetuning_args: "Finetuni if model_type in ["llava", "llava_next", "llava_next_video", "paligemma", "pixtral", "video_llava"]: if 
finetuning_args.freeze_vision_tower: forbidden_modules.add("vision_tower") - forbidden_modules.add("vision_encoder") if finetuning_args.train_mm_proj_only: forbidden_modules.add("language_model") From 870bbabbc446499f8da70fa016c703213755c6cf Mon Sep 17 00:00:00 2001 From: Kingsley Date: Mon, 30 Sep 2024 20:04:47 +0800 Subject: [PATCH 07/34] register model fix Former-commit-id: 077d8e3c0344d944705254cc5a2cd06c9f5dc116 --- src/llamafactory/extras/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index ed1aff63..4a8a6d25 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -1164,7 +1164,7 @@ register_model_group( register_model_group( models={ - "Pixtral-12B-Instruct": { + "Pixtral-12B-Chat": { DownloadSource.DEFAULT: "mistral-community/pixtral-12b", DownloadSource.MODELSCOPE: "AI-ModelScope/pixtral-12b", } From cbc1dd0c884c3897a21ea20327fa0b437cbbc8b9 Mon Sep 17 00:00:00 2001 From: Kingsley Date: Mon, 30 Sep 2024 20:27:05 +0800 Subject: [PATCH 08/34] sync with former Former-commit-id: f8707e52586182144c4fb70c7c0de8bf7044ef5e --- src/llamafactory/model/model_utils/visual.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/llamafactory/model/model_utils/visual.py b/src/llamafactory/model/model_utils/visual.py index bc3ef676..7007b5a2 100644 --- a/src/llamafactory/model/model_utils/visual.py +++ b/src/llamafactory/model/model_utils/visual.py @@ -186,12 +186,10 @@ def patch_target_modules( """ model_type = getattr(config, "model_type", None) if finetuning_args.freeze_vision_tower: - if model_type in ["llava", "llava_next", "llava_next_video", "paligemma", "video_llava"]: + if model_type in ["llava", "llava_next", "llava_next_video", "paligemma", "pixtral", "video_llava"]: return "^(?!.*vision_tower).*(?:{}).*".format("|".join(target_modules)) elif model_type == "qwen2_vl": return "^(?!.*visual).*(?:{}).*".format("|".join(target_modules)) - elif model_type == "pixtral": - return "^(?!.*vision_encoder).*(?:{}).*".format("|".join(target_modules)) else: return target_modules else: From e9ac26db4c40278ee744b6a9e7f22093508e2aaf Mon Sep 17 00:00:00 2001 From: Kingsley Date: Mon, 30 Sep 2024 23:36:16 +0800 Subject: [PATCH 09/34] unfactor md Former-commit-id: 1a79d61f8d25a4c1127c2f393418e14ab9d2abd4 --- README.md | 64 ++++++++++++++++++++++++++-------------------------- README_zh.md | 64 ++++++++++++++++++++++++++-------------------------- 2 files changed, 64 insertions(+), 64 deletions(-) diff --git a/README.md b/README.md index f6d0edf2..c02c455e 100644 --- a/README.md +++ b/README.md @@ -162,37 +162,37 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ ## Supported Models -| Model | Model size | Template | -|-------------------------------------------------------------| -------------------------------- | ---------------- | -| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | -| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | -| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | -| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | -| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | -| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | -| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | -| 
[GLM-4](https://huggingface.co/THUDM) | 9B | glm4 | -| [InternLM2/InternLM2.5](https://huggingface.co/internlm) | 7B/20B | intern2 | -| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | -| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | -| [Llama 3-3.2](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 | -| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava | -| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next | -| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video | -| [MiniCPM](https://huggingface.co/openbmb) | 1B/2B/4B | cpm/cpm3 | -| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | -| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | -| [PaliGemma](https://huggingface.co/google) | 3B | paligemma | -| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | -| [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | -| [Pixtral](https://huggingface.co/mistral-community) | 12B | pixtral | -| [Qwen (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | -| [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B/72B | qwen2_vl | -| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | -| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | -| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi | -| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | -| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | +| Model | Model size | Template | +| ----------------------------------------------------------------- | -------------------------------- | ---------------- | +| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | +| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | +| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | +| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | +| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | +| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | +| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | +| [GLM-4](https://huggingface.co/THUDM) | 9B | glm4 | +| [InternLM2/InternLM2.5](https://huggingface.co/internlm) | 7B/20B | intern2 | +| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | +| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | +| [Llama 3-3.2](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 | +| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava | +| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next | +| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video | +| [MiniCPM](https://huggingface.co/openbmb) | 1B/2B/4B | cpm/cpm3 | +| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | +| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | +| [PaliGemma](https://huggingface.co/google) | 3B | paligemma | +| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | +| [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | +| [Pixtral](https://huggingface.co/mistral-community) | 12B | pixtral | +| [Qwen (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | +| [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B/72B | 
qwen2_vl | +| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | +| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | +| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi | +| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | +| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | > [!NOTE] > For the "base" models, the `template` argument can be chosen from `default`, `alpaca`, `vicuna` etc. But make sure to use the **corresponding template** for the "instruct/chat" models. @@ -721,4 +721,4 @@ This repo benefits from [PEFT](https://github.com/huggingface/peft), [TRL](https ## Star History -![Star History Chart](https://api.star-history.com/svg?repos=hiyouga/LLaMA-Factory&type=Date) +![Star History Chart](https://api.star-history.com/svg?repos=hiyouga/LLaMA-Factory&type=Date) \ No newline at end of file diff --git a/README_zh.md b/README_zh.md index fddc6d2c..4dff0be7 100644 --- a/README_zh.md +++ b/README_zh.md @@ -163,37 +163,37 @@ https://github.com/user-attachments/assets/e6ce34b0-52d5-4f3e-a830-592106c4c272 ## 模型 -| 模型名 | 模型大小 | Template | -|-------------------------------------------------------------| -------------------------------- | ---------------- | -| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | -| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | -| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | -| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | -| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | -| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | -| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | -| [GLM-4](https://huggingface.co/THUDM) | 9B | glm4 | -| [InternLM2/InternLM2.5](https://huggingface.co/internlm) | 7B/20B | intern2 | -| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | -| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | -| [Llama 3-3.2](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 | -| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava | -| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next | -| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video | -| [MiniCPM](https://huggingface.co/openbmb) | 1B/2B/4B | cpm/cpm3 | -| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | -| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | -| [PaliGemma](https://huggingface.co/google) | 3B | paligemma | -| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | -| [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | -| [Pixtral](https://huggingface.co/mistral-community) | 12B | pixtral | -| [Qwen (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | -| [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B/72B | qwen2_vl | -| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | -| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | -| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi | -| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | -| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | +| 模型名 | 模型大小 | Template | +| ----------------------------------------------------------------- | -------------------------------- | ---------------- | +| 
[Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | +| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | +| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | +| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | +| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | +| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | +| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | +| [GLM-4](https://huggingface.co/THUDM) | 9B | glm4 | +| [InternLM2/InternLM2.5](https://huggingface.co/internlm) | 7B/20B | intern2 | +| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | +| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | +| [Llama 3-3.2](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 | +| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava | +| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next | +| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video | +| [MiniCPM](https://huggingface.co/openbmb) | 1B/2B/4B | cpm/cpm3 | +| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | +| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | +| [PaliGemma](https://huggingface.co/google) | 3B | paligemma | +| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | +| [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | +| [Pixtral](https://huggingface.co/mistral-community) | 12B | pixtral | +| [Qwen (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | +| [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B/72B | qwen2_vl | +| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | +| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | +| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi | +| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | +| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | > [!NOTE] > 对于所有“基座”(Base)模型,`template` 参数可以是 `default`, `alpaca`, `vicuna` 等任意值。但“对话”(Instruct/Chat)模型请务必使用**对应的模板**。 @@ -722,4 +722,4 @@ run_name: test_run # 可选 ## Star History -![Star History Chart](https://api.star-history.com/svg?repos=hiyouga/LLaMA-Factory&type=Date) +![Star History Chart](https://api.star-history.com/svg?repos=hiyouga/LLaMA-Factory&type=Date) \ No newline at end of file From 9d929897cedee6d3ab44432b95b5fc8f1cc38637 Mon Sep 17 00:00:00 2001 From: Kingsley Date: Mon, 14 Oct 2024 16:55:59 +0800 Subject: [PATCH 10/34] remove bs condition Former-commit-id: bf3520178ab66058c62a9cf31b42f36a9d88ce20 --- src/llamafactory/data/mm_plugin.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index edca48a7..b9e7bc3b 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -477,7 +477,7 @@ class PixtralPlugin(BasePlugin): if image_input_sizes is None: raise ValueError("The number of images does not match the number of {} tokens".format(IMAGE_PLACEHOLDER)) - + image_size = image_input_sizes[0][img_id] height, width = image_size num_height_tokens = height // patch_size @@ -500,7 +500,7 @@ class PixtralPlugin(BasePlugin): raise ValueError("The number of images does not match the number of {} tokens".format(IMAGE_PLACEHOLDER)) return messages - + 
@override def get_mm_inputs( self, @@ -516,11 +516,6 @@ class PixtralPlugin(BasePlugin): if mm_inputs.get("image_sizes"): mm_inputs.pop("image_sizes") - if isinstance(mm_inputs.get("pixel_values"), list) and len(mm_inputs.get("pixel_values")[0]) >= 2: - raise ValueError("Now it only supports batchsize=1 on per gpu due to `List[tensor]` can not pack into BachEncoding") - - mm_inputs["pixel_values"] = mm_inputs.get("pixel_values")[0][0].unsqueeze(0) - return mm_inputs class Qwen2vlPlugin(BasePlugin): From 2b3b0473cde78446e8b94f4b414078a4fbf7e349 Mon Sep 17 00:00:00 2001 From: KUANGDD Date: Mon, 14 Oct 2024 21:11:09 +0800 Subject: [PATCH 11/34] required transformers version Former-commit-id: d9915db327a038c93b5e3421c90b1f218fb23f92 --- src/llamafactory/data/mm_plugin.py | 6 ++++++ src/llamafactory/extras/misc.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index b9e7bc3b..9d81848b 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -692,6 +692,12 @@ def get_mm_plugin( video_token: Optional[str] = None, ) -> "BasePlugin": plugin_class = PLUGINS.get(name, None) + if plugin_class == "PixtralPlugin": + from transformers.utils.versions import require_version + try: + require_version("transformers==4.46.0.dev0") + except Exception as e: + raise ImportError("PixtralPlugin requires transformers==4.46.0.dev0. Please install it first.") if plugin_class is None: raise ValueError("Multimodal plugin `{}` not found.".format(name)) diff --git a/src/llamafactory/extras/misc.py b/src/llamafactory/extras/misc.py index fd78530a..47f2ebbe 100644 --- a/src/llamafactory/extras/misc.py +++ b/src/llamafactory/extras/misc.py @@ -79,7 +79,7 @@ def check_dependencies() -> None: if os.environ.get("DISABLE_VERSION_CHECK", "0").lower() in ["true", "1"]: logger.warning("Version checking has been disabled, may lead to unexpected behaviors.") else: - require_version("transformers>=4.41.2,<=4.45.2", "To fix: pip install transformers>=4.41.2,<=4.45.2") + require_version("transformers>=4.41.2", "To fix: pip install transformers>=4.41.2,<=4.45.2") require_version("datasets>=2.16.0,<=2.21.0", "To fix: pip install datasets>=2.16.0,<=2.21.0") require_version("accelerate>=0.30.1,<=0.34.2", "To fix: pip install accelerate>=0.30.1,<=0.34.2") require_version("peft>=0.11.1,<=0.12.0", "To fix: pip install peft>=0.11.1,<=0.12.0") From 3693d7b571ff41cfd3dd5fa18aa873afc17d4826 Mon Sep 17 00:00:00 2001 From: KUANGDD Date: Tue, 15 Oct 2024 12:12:46 +0800 Subject: [PATCH 12/34] plugin test & check Former-commit-id: 76c7c8c5a729b8b43e3a31efc44f2c9c2678bf3d --- src/llamafactory/data/mm_plugin.py | 2 +- tests/data/test_mm_plugin.py | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index 9d81848b..5f128706 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -697,7 +697,7 @@ def get_mm_plugin( try: require_version("transformers==4.46.0.dev0") except Exception as e: - raise ImportError("PixtralPlugin requires transformers==4.46.0.dev0. Please install it first.") + raise ImportError("PixtralPlugin requires transformers>=4.46.0.dev0. 
Please install it first.") if plugin_class is None: raise ValueError("Multimodal plugin `{}` not found.".format(name)) diff --git a/tests/data/test_mm_plugin.py b/tests/data/test_mm_plugin.py index 75541000..70b61444 100644 --- a/tests/data/test_mm_plugin.py +++ b/tests/data/test_mm_plugin.py @@ -178,7 +178,18 @@ def test_paligemma_plugin(): check_inputs["expected_no_mm_inputs"] = {"token_type_ids": [[1] * 1024]} _check_plugin(**check_inputs) - +def test_pixtral_plugin(): + tokenizer, processor = _load_tokenizer_module(model_name_or_path="mistral-community/pixtral-12b") + pixtral_plugin = get_mm_plugin(name="pixtral", image_token="[IMG]") + image_slice_heigt, image_slice_width = 2, 2 + check_inputs = {"plugin": pixtral_plugin, "tokenizer": tokenizer, "processor": processor} + check_inputs["expected_mm_messages"] = [ + {key: value.replace("", "{}[IMG_BREAK]".format("[IMG]" * image_slice_width) * image_slice_heigt).rsplit("[IMG_BREAK]", 1)[0] + "[IMG_END]" + for key, value in message.items()} for message in MM_MESSAGES + ] + check_inputs["expected_mm_inputs"] = _get_mm_inputs(processor) + _check_plugin(**check_inputs) + def test_qwen2_vl_plugin(): tokenizer, processor = _load_tokenizer_module(model_name_or_path="Qwen/Qwen2-VL-7B-Instruct") qwen2_vl_plugin = get_mm_plugin(name="qwen2_vl", image_token="<|image_pad|>") @@ -206,3 +217,6 @@ def test_video_llava_plugin(): ] check_inputs["expected_mm_inputs"] = _get_mm_inputs(processor) _check_plugin(**check_inputs) + +if __name__ == "__main__": + test_pixtral_plugin() \ No newline at end of file From c3de160d1cb874eae5db5566d58d83b49ee62e15 Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Tue, 15 Oct 2024 13:30:41 +0800 Subject: [PATCH 13/34] fix some Former-commit-id: c9b644693996f96d234349823911fc267635acb9 --- tests/data/test_mm_plugin.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/data/test_mm_plugin.py b/tests/data/test_mm_plugin.py index 70b61444..d3c3f021 100644 --- a/tests/data/test_mm_plugin.py +++ b/tests/data/test_mm_plugin.py @@ -178,18 +178,20 @@ def test_paligemma_plugin(): check_inputs["expected_no_mm_inputs"] = {"token_type_ids": [[1] * 1024]} _check_plugin(**check_inputs) + def test_pixtral_plugin(): tokenizer, processor = _load_tokenizer_module(model_name_or_path="mistral-community/pixtral-12b") pixtral_plugin = get_mm_plugin(name="pixtral", image_token="[IMG]") - image_slice_heigt, image_slice_width = 2, 2 + image_slice_height, image_slice_width = 2, 2 check_inputs = {"plugin": pixtral_plugin, "tokenizer": tokenizer, "processor": processor} check_inputs["expected_mm_messages"] = [ - {key: value.replace("", "{}[IMG_BREAK]".format("[IMG]" * image_slice_width) * image_slice_heigt).rsplit("[IMG_BREAK]", 1)[0] + "[IMG_END]" + {key: value.replace("", "{}[IMG_BREAK]".format("[IMG]" * image_slice_width) * image_slice_height).rsplit("[IMG_BREAK]", 1)[0] + "[IMG_END]" for key, value in message.items()} for message in MM_MESSAGES ] check_inputs["expected_mm_inputs"] = _get_mm_inputs(processor) _check_plugin(**check_inputs) - + + def test_qwen2_vl_plugin(): tokenizer, processor = _load_tokenizer_module(model_name_or_path="Qwen/Qwen2-VL-7B-Instruct") qwen2_vl_plugin = get_mm_plugin(name="qwen2_vl", image_token="<|image_pad|>") @@ -217,6 +219,3 @@ def test_video_llava_plugin(): ] check_inputs["expected_mm_inputs"] = _get_mm_inputs(processor) _check_plugin(**check_inputs) - -if __name__ == "__main__": - test_pixtral_plugin() \ No newline at end of file From 
01defc27796e850db1f3dbce056b7fd62643b5ab Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Tue, 15 Oct 2024 13:53:33 +0800 Subject: [PATCH 14/34] tiny fix [skip ci] Former-commit-id: 95f968eec2628cb26b3c4f4d4e81a9536e23cc31 --- src/llamafactory/data/mm_plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index 5f128706..15785844 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -680,8 +680,8 @@ PLUGINS = { "llava_next": LlavaNextPlugin, "llava_next_video": LlavaNextVideoPlugin, "paligemma": PaliGemmaPlugin, - "qwen2_vl": Qwen2vlPlugin, "pixtral": PixtralPlugin, + "qwen2_vl": Qwen2vlPlugin, "video_llava": VideoLlavaPlugin, } From f463b3f038c70a933fba10b4a82abaabb31784dd Mon Sep 17 00:00:00 2001 From: Kingsley Date: Tue, 15 Oct 2024 17:09:24 +0800 Subject: [PATCH 15/34] add extra test for pixtral mm_input Former-commit-id: c706ec8a5dbd3c72ab15a709668624c0c7bbd8ce --- src/llamafactory/data/mm_plugin.py | 2 -- tests/data/test_mm_plugin.py | 42 ++++++++++++++++++++++++++++-- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index 5f128706..f3f6433c 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -513,8 +513,6 @@ class PixtralPlugin(BasePlugin): ) -> Dict[str, Union[List[int], "torch.Tensor"]]: self._validate_input(images, videos) mm_inputs = self._get_mm_inputs(images, videos, processor) - if mm_inputs.get("image_sizes"): - mm_inputs.pop("image_sizes") return mm_inputs diff --git a/tests/data/test_mm_plugin.py b/tests/data/test_mm_plugin.py index d3c3f021..da64ccc3 100644 --- a/tests/data/test_mm_plugin.py +++ b/tests/data/test_mm_plugin.py @@ -13,7 +13,7 @@ # limitations under the License. 
import os -from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Tuple +from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Tuple, Union import pytest import torch @@ -68,12 +68,50 @@ def _get_mm_inputs(processor: "ProcessorMixin") -> Dict[str, "torch.Tensor"]: image_processor: "BaseImageProcessor" = getattr(processor, "image_processor") return image_processor(images=IMAGES, return_tensors="pt") +def _is_nested_tensor_list(element): + if not isinstance(element, list): + return False + + for item in element: + if isinstance(item, list): + if not _is_nested_tensor_list(item): + return False + + elif not isinstance(item, torch.Tensor): + return False + + return True + + +def _equal_nested_tensor_list(a: List[List[torch.Tensor]], b: List[List[torch.Tensor]]) -> bool: + if type(a) != type(b): + return False + + if isinstance(a, list) and isinstance(b, list): + if len(a) != len(b): + return False + + for sub_a, sub_b in zip(a, b): + if isinstance(sub_a, list) and isinstance(sub_b, list): + if not _equal_nested_tensor_list(sub_a, sub_b): + return False + elif isinstance(sub_a, torch.Tensor) and isinstance(sub_b, torch.Tensor): + if not torch.equal(sub_a, sub_b): + return False + else: + return False + + return True + + return False def _is_close(batch_a: Dict[str, Any], batch_b: Dict[str, Any]) -> None: assert batch_a.keys() == batch_b.keys() for key in batch_a.keys(): if isinstance(batch_a[key], torch.Tensor): assert torch.allclose(batch_a[key], batch_b[key], rtol=1e-4, atol=1e-5) + elif _is_nested_tensor_list(batch_a[key]) and _is_nested_tensor_list(batch_b[key]): + assert _equal_nested_tensor_list(batch_a[key], batch_b[key]) else: assert batch_a[key] == batch_b[key] @@ -185,7 +223,7 @@ def test_pixtral_plugin(): image_slice_height, image_slice_width = 2, 2 check_inputs = {"plugin": pixtral_plugin, "tokenizer": tokenizer, "processor": processor} check_inputs["expected_mm_messages"] = [ - {key: value.replace("<image>", "{}[IMG_BREAK]".format("[IMG]" * image_slice_width) * image_slice_height).rsplit("[IMG_BREAK]", 1)[0] + "[IMG_END]" + {key: value.replace("<image>", ("{}[IMG_BREAK]".format("[IMG]" * image_slice_width) * image_slice_height).rsplit("[IMG_BREAK]", 1)[0] + "[IMG_END]") for key, value in message.items()} for message in MM_MESSAGES ] check_inputs["expected_mm_inputs"] = _get_mm_inputs(processor) From 4845a765350465f82fa48e5e7a01756bb56c23d5 Mon Sep 17 00:00:00 2001 From: KUANGDD Date: Wed, 16 Oct 2024 01:09:33 +0800 Subject: [PATCH 16/34] fix bug for webui infer Former-commit-id: 17768832908cc59ab64ed72522b2954c575ce21d --- src/llamafactory/data/mm_plugin.py | 6 ++++++ tests/data/test_mm_plugin.py | 5 +++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index de7e362a..f67737f5 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -513,6 +513,12 @@ class PixtralPlugin(BasePlugin): ) -> Dict[str, Union[List[int], "torch.Tensor"]]: self._validate_input(images, videos) mm_inputs = self._get_mm_inputs(images, videos, processor) + # hack for hf engine + if mm_inputs.get("pixel_values") and len(mm_inputs.get("pixel_values")[0]) == 1: + mm_inputs["pixel_values"] = mm_inputs["pixel_values"][0][0].unsqueeze(0) + + if mm_inputs.get("image_sizes"): + del mm_inputs["image_sizes"] return mm_inputs diff --git a/tests/data/test_mm_plugin.py b/tests/data/test_mm_plugin.py index da64ccc3..32b89f7f 100644 --- a/tests/data/test_mm_plugin.py +++ b/tests/data/test_mm_plugin.py @@ 
-110,8 +110,6 @@ def _is_close(batch_a: Dict[str, Any], batch_b: Dict[str, Any]) -> None: for key in batch_a.keys(): if isinstance(batch_a[key], torch.Tensor): assert torch.allclose(batch_a[key], batch_b[key], rtol=1e-4, atol=1e-5) - elif _is_nested_tensor_list(batch_a[key]) and _is_nested_tensor_list(batch_b[key]): - assert _equal_nested_tensor_list(batch_a[key], batch_b[key]) else: assert batch_a[key] == batch_b[key] @@ -227,6 +225,9 @@ def test_pixtral_plugin(): for key, value in message.items()} for message in MM_MESSAGES ] check_inputs["expected_mm_inputs"] = _get_mm_inputs(processor) + # TODO works needed for pixtral plugin test & hack hf engine input below for now + check_inputs["expected_mm_inputs"].pop("image_sizes") + check_inputs["expected_mm_inputs"]["pixel_values"] = check_inputs["expected_mm_inputs"]["pixel_values"][0][0].unsqueeze(0) _check_plugin(**check_inputs) From 7d135bbdb8010daff08f38255ac517fb6d1a78e7 Mon Sep 17 00:00:00 2001 From: KUANGDD Date: Wed, 16 Oct 2024 01:14:51 +0800 Subject: [PATCH 17/34] remove useless codes Former-commit-id: 01247fcdde215398ec67cbd6cf1bc6cfb512a9ba --- tests/data/test_mm_plugin.py | 36 ------------------------------------ 1 file changed, 36 deletions(-) diff --git a/tests/data/test_mm_plugin.py b/tests/data/test_mm_plugin.py index 32b89f7f..b342e658 100644 --- a/tests/data/test_mm_plugin.py +++ b/tests/data/test_mm_plugin.py @@ -68,42 +68,6 @@ def _get_mm_inputs(processor: "ProcessorMixin") -> Dict[str, "torch.Tensor"]: image_processor: "BaseImageProcessor" = getattr(processor, "image_processor") return image_processor(images=IMAGES, return_tensors="pt") -def _is_nested_tensor_list(element): - if not isinstance(element, list): - return False - - for item in element: - if isinstance(item, list): - if not _is_nested_tensor_list(item): - return False - - elif not isinstance(item, torch.Tensor): - return False - - return True - - -def _equal_nested_tensor_list(a: List[List[torch.Tensor]], b: List[List[torch.Tensor]]) -> bool: - if type(a) != type(b): - return False - - if isinstance(a, list) and isinstance(b, list): - if len(a) != len(b): - return False - - for sub_a, sub_b in zip(a, b): - if isinstance(sub_a, list) and isinstance(sub_b, list): - if not _equal_nested_tensor_list(sub_a, sub_b): - return False - elif isinstance(sub_a, torch.Tensor) and isinstance(sub_b, torch.Tensor): - if not torch.equal(sub_a, sub_b): - return False - else: - return False - - return True - - return False def _is_close(batch_a: Dict[str, Any], batch_b: Dict[str, Any]) -> None: assert batch_a.keys() == batch_b.keys() From cc097174ccaf66f9970860d0b70459b824ec55c7 Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Wed, 16 Oct 2024 15:55:30 +0800 Subject: [PATCH 18/34] tiny fix [skip ci] Former-commit-id: 937f69190e529fe7bf0fdf58d7bbb39017854c5e --- tests/data/test_mm_plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data/test_mm_plugin.py b/tests/data/test_mm_plugin.py index b342e658..cc9bf2a1 100644 --- a/tests/data/test_mm_plugin.py +++ b/tests/data/test_mm_plugin.py @@ -13,7 +13,7 @@ # limitations under the License. 
import os -from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Tuple import pytest import torch From bd85e370be1b31d4b989ae3671b531992662dd4e Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Thu, 17 Oct 2024 19:46:36 +0800 Subject: [PATCH 19/34] Update README.md Former-commit-id: f62b0682e476dd62a4a3ac5620f8fc244e8bf150 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cbb4df12..7ec1c87b 100644 --- a/README.md +++ b/README.md @@ -740,4 +740,4 @@ This repo benefits from [PEFT](https://github.com/huggingface/peft), [TRL](https ## Star History -![Star History Chart](https://api.star-history.com/svg?repos=hiyouga/LLaMA-Factory&type=Date) \ No newline at end of file +![Star History Chart](https://api.star-history.com/svg?repos=hiyouga/LLaMA-Factory&type=Date) From 66819fd3eef646fe856008c5f3661216a4c8397e Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Thu, 17 Oct 2024 19:47:33 +0800 Subject: [PATCH 20/34] Update README_zh.md Former-commit-id: a829d4a28fae77b08a6ea451479c71578b3b552f --- README_zh.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README_zh.md b/README_zh.md index 0d9f8fdf..a578afcb 100644 --- a/README_zh.md +++ b/README_zh.md @@ -741,4 +741,4 @@ run_name: test_run # 可选 ## Star History -![Star History Chart](https://api.star-history.com/svg?repos=hiyouga/LLaMA-Factory&type=Date) \ No newline at end of file +![Star History Chart](https://api.star-history.com/svg?repos=hiyouga/LLaMA-Factory&type=Date) From c6e131397764786626c1cc8557e330182f41afe0 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Thu, 17 Oct 2024 19:48:12 +0800 Subject: [PATCH 21/34] Update loader.py Former-commit-id: 3b229a27a108b840e6bed3c8684737f51ce9faf4 --- src/llamafactory/model/loader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/llamafactory/model/loader.py b/src/llamafactory/model/loader.py index 957d5e4e..7613c092 100644 --- a/src/llamafactory/model/loader.py +++ b/src/llamafactory/model/loader.py @@ -116,7 +116,6 @@ def load_config(model_args: "ModelArguments") -> "PretrainedConfig": Loads model config. 
""" init_kwargs = _get_init_kwargs(model_args) - return AutoConfig.from_pretrained(model_args.model_name_or_path, **init_kwargs) From 4807d8a4efda3b6f15badad50bb2a4143daaa2bd Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Thu, 17 Oct 2024 19:48:51 +0800 Subject: [PATCH 22/34] Update misc.py Former-commit-id: fe9a927f1ea8e44e0429b437e5feecf13e34e9aa --- src/llamafactory/extras/misc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llamafactory/extras/misc.py b/src/llamafactory/extras/misc.py index 47f2ebbe..74d3f595 100644 --- a/src/llamafactory/extras/misc.py +++ b/src/llamafactory/extras/misc.py @@ -79,7 +79,7 @@ def check_dependencies() -> None: if os.environ.get("DISABLE_VERSION_CHECK", "0").lower() in ["true", "1"]: logger.warning("Version checking has been disabled, may lead to unexpected behaviors.") else: - require_version("transformers>=4.41.2", "To fix: pip install transformers>=4.41.2,<=4.45.2") + require_version("transformers>=4.41.2,<=4.46.0", "To fix: pip install transformers>=4.41.2,<=4.46.0") require_version("datasets>=2.16.0,<=2.21.0", "To fix: pip install datasets>=2.16.0,<=2.21.0") require_version("accelerate>=0.30.1,<=0.34.2", "To fix: pip install accelerate>=0.30.1,<=0.34.2") require_version("peft>=0.11.1,<=0.12.0", "To fix: pip install peft>=0.11.1,<=0.12.0") From d0889012c202a0b7443dbe4ac539549da0e5f0d3 Mon Sep 17 00:00:00 2001 From: KUANGDD Date: Wed, 23 Oct 2024 15:24:07 +0800 Subject: [PATCH 23/34] modify style & little change Former-commit-id: c988477d14dc656450d5fec31895781b7f9f7dce --- src/llamafactory/chat/hf_engine.py | 11 +++++++++- src/llamafactory/data/collator.py | 3 +++ src/llamafactory/data/mm_plugin.py | 30 +++++++++++++--------------- src/llamafactory/data/template.py | 2 +- src/llamafactory/extras/constants.py | 2 +- src/llamafactory/model/loader.py | 2 +- tests/data/test_mm_plugin.py | 20 ++++++++++++++----- 7 files changed, 45 insertions(+), 25 deletions(-) diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py index 2b1d9fe5..53fb666a 100644 --- a/src/llamafactory/chat/hf_engine.py +++ b/src/llamafactory/chat/hf_engine.py @@ -165,8 +165,17 @@ class HuggingfaceEngine(BaseEngine): ) mm_inputs = template.mm_plugin.get_mm_inputs(**mm_input_dict, seqlens=[prompt_length], processor=processor) + for key, value in mm_inputs.items(): - value = value if isinstance(value, torch.Tensor) else torch.tensor(value) + value = ( + value + if isinstance(value, torch.Tensor) + else ( + torch.stack(value) + if isinstance(value, list) and all(isinstance(v, torch.Tensor) for v in value) + else torch.tensor(value) + ) + ) gen_kwargs[key] = value.to(model.device) return gen_kwargs, prompt_length diff --git a/src/llamafactory/data/collator.py b/src/llamafactory/data/collator.py index 92d86cc7..e92d2ab3 100644 --- a/src/llamafactory/data/collator.py +++ b/src/llamafactory/data/collator.py @@ -99,6 +99,9 @@ class MultiModalDataCollatorForSeq2Seq(DataCollatorForSeq2Seq): features: Dict[str, "torch.Tensor"] = super().__call__(features) features.update(mm_inputs) + if features.get("pixel_values") is not None and isinstance(features["pixel_values"], list): + features = features.data + return features diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index f67737f5..a138c058 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -4,6 +4,7 @@ from io import BytesIO from typing import TYPE_CHECKING, Dict, List, Optional, Sequence, Tuple, TypedDict, Union 
import numpy as np +import torch from transformers.image_utils import get_image_size, to_numpy_array from typing_extensions import override @@ -447,6 +448,7 @@ class PaliGemmaPlugin(BasePlugin): mm_inputs["token_type_ids"] = _get_paligemma_token_type_ids(imglens, seqlens, processor) return mm_inputs + class PixtralPlugin(BasePlugin): @override def process_messages( @@ -466,32 +468,28 @@ class PixtralPlugin(BasePlugin): img_kwargs = self._get_mm_inputs(images, videos, processor) image_input_sizes = None - if img_kwargs.get("pixel_values") is not None: - image_input_sizes = img_kwargs["image_sizes"] + image_input_sizes = img_kwargs.get("image_sizes", None) messages = deepcopy(messages) for message in messages: content = message["content"] - img_id = 0 while IMAGE_PLACEHOLDER in content: - if image_input_sizes is None: - raise ValueError("The number of images does not match the number of {} tokens".format(IMAGE_PLACEHOLDER)) + raise ValueError( + "The number of images does not match the number of {} tokens".format(IMAGE_PLACEHOLDER) + ) - image_size = image_input_sizes[0][img_id] + image_size = image_input_sizes[0][num_image_tokens] height, width = image_size num_height_tokens = height // patch_size num_width_tokens = width // patch_size - replace_tokens = [ - [image_token] * num_width_tokens + [image_break_token] - ] * num_height_tokens + replace_tokens = [[image_token] * num_width_tokens + [image_break_token]] * num_height_tokens # Flatten list replace_tokens = [item for sublist in replace_tokens for item in sublist] replace_tokens[-1] = image_end_token replace_str = "".join(replace_tokens) content = content.replace(IMAGE_PLACEHOLDER, replace_str, 1) - img_id += 1 num_image_tokens += 1 message["content"] = content @@ -514,14 +512,13 @@ class PixtralPlugin(BasePlugin): self._validate_input(images, videos) mm_inputs = self._get_mm_inputs(images, videos, processor) # hack for hf engine - if mm_inputs.get("pixel_values") and len(mm_inputs.get("pixel_values")[0]) == 1: - mm_inputs["pixel_values"] = mm_inputs["pixel_values"][0][0].unsqueeze(0) - - if mm_inputs.get("image_sizes"): - del mm_inputs["image_sizes"] + if mm_inputs.get("pixel_values"): + mm_inputs["pixel_values"] = mm_inputs["pixel_values"][0] + mm_inputs.pop("image_sizes", None) return mm_inputs + class Qwen2vlPlugin(BasePlugin): @override def _preprocess_image(self, image: "ImageObject", **kwargs) -> "ImageObject": @@ -698,9 +695,10 @@ def get_mm_plugin( plugin_class = PLUGINS.get(name, None) if plugin_class == "PixtralPlugin": from transformers.utils.versions import require_version + try: require_version("transformers==4.46.0.dev0") - except Exception as e: + except Exception: raise ImportError("PixtralPlugin requires transformers>=4.46.0.dev0. 
Please install it first.") if plugin_class is None: raise ValueError("Multimodal plugin `{}` not found.".format(name)) diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index 28ad2295..a9618885 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -938,7 +938,7 @@ _register_template( name="pixtral", format_user=StringFormatter(slots=["[INST] {{content}} [/INST]"]), format_prefix=EmptyFormatter(slots=[{"bos_token"}]), - mm_plugin=get_mm_plugin(name="pixtral", image_token="[IMG]") + mm_plugin=get_mm_plugin(name="pixtral", image_token="[IMG]"), ) diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index cf5df20c..237afeec 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -1185,7 +1185,7 @@ register_model_group( } }, template="pixtral", - vision=True + vision=True, ) diff --git a/src/llamafactory/model/loader.py b/src/llamafactory/model/loader.py index 957d5e4e..299e6333 100644 --- a/src/llamafactory/model/loader.py +++ b/src/llamafactory/model/loader.py @@ -116,7 +116,7 @@ def load_config(model_args: "ModelArguments") -> "PretrainedConfig": Loads model config. """ init_kwargs = _get_init_kwargs(model_args) - + return AutoConfig.from_pretrained(model_args.model_name_or_path, **init_kwargs) diff --git a/tests/data/test_mm_plugin.py b/tests/data/test_mm_plugin.py index b342e658..66e9b57c 100644 --- a/tests/data/test_mm_plugin.py +++ b/tests/data/test_mm_plugin.py @@ -13,7 +13,7 @@ # limitations under the License. import os -from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Tuple import pytest import torch @@ -74,6 +74,10 @@ def _is_close(batch_a: Dict[str, Any], batch_b: Dict[str, Any]) -> None: for key in batch_a.keys(): if isinstance(batch_a[key], torch.Tensor): assert torch.allclose(batch_a[key], batch_b[key], rtol=1e-4, atol=1e-5) + elif isinstance(batch_a[key], list) and all(isinstance(item, torch.Tensor) for item in batch_a[key]): + assert len(batch_a[key]) == len(batch_b[key]) + for tensor_a, tensor_b in zip(batch_a[key], batch_b[key]): + assert torch.allclose(tensor_a, tensor_b, rtol=1e-4, atol=1e-5) else: assert batch_a[key] == batch_b[key] @@ -185,13 +189,19 @@ def test_pixtral_plugin(): image_slice_height, image_slice_width = 2, 2 check_inputs = {"plugin": pixtral_plugin, "tokenizer": tokenizer, "processor": processor} check_inputs["expected_mm_messages"] = [ - {key: value.replace("", ("{}[IMG_BREAK]".format("[IMG]" * image_slice_width) * image_slice_height).rsplit("[IMG_BREAK]", 1)[0] + "[IMG_END]") - for key, value in message.items()} for message in MM_MESSAGES + { + key: value.replace( + "", + ("{}[IMG_BREAK]".format("[IMG]" * image_slice_width) * image_slice_height).rsplit("[IMG_BREAK]", 1)[0] + + "[IMG_END]", + ) + for key, value in message.items() + } + for message in MM_MESSAGES ] check_inputs["expected_mm_inputs"] = _get_mm_inputs(processor) - # TODO works needed for pixtral plugin test & hack hf engine input below for now check_inputs["expected_mm_inputs"].pop("image_sizes") - check_inputs["expected_mm_inputs"]["pixel_values"] = check_inputs["expected_mm_inputs"]["pixel_values"][0][0].unsqueeze(0) + check_inputs["expected_mm_inputs"]["pixel_values"] = check_inputs["expected_mm_inputs"]["pixel_values"][0] _check_plugin(**check_inputs) From 64ac6ca396b98527364cfeeaba8d290ebfd16489 Mon Sep 17 00:00:00 2001 From: KUANGDD Date: Wed, 23 Oct 2024 
15:32:33 +0800 Subject: [PATCH 24/34] rm import torch Former-commit-id: 561a0f8155afca20ac699e124320b0eaef6dac07 --- src/llamafactory/data/mm_plugin.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index a138c058..9249d4ab 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -4,7 +4,6 @@ from io import BytesIO from typing import TYPE_CHECKING, Dict, List, Optional, Sequence, Tuple, TypedDict, Union import numpy as np -import torch from transformers.image_utils import get_image_size, to_numpy_array from typing_extensions import override From 5440ebbae60d2c28e33da979c6f7b9027626a744 Mon Sep 17 00:00:00 2001 From: KUANGDD Date: Wed, 23 Oct 2024 15:38:11 +0800 Subject: [PATCH 25/34] rm useless code Former-commit-id: 2dc337a49a8646ce916981b2914718e7472b5946 --- src/llamafactory/data/mm_plugin.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index 9249d4ab..acdcca4e 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -692,13 +692,6 @@ def get_mm_plugin( video_token: Optional[str] = None, ) -> "BasePlugin": plugin_class = PLUGINS.get(name, None) - if plugin_class == "PixtralPlugin": - from transformers.utils.versions import require_version - - try: - require_version("transformers==4.46.0.dev0") - except Exception: - raise ImportError("PixtralPlugin requires transformers>=4.46.0.dev0. Please install it first.") if plugin_class is None: raise ValueError("Multimodal plugin `{}` not found.".format(name)) From 30d7f6a22e85eba5e8061af1778a0f63cad7900c Mon Sep 17 00:00:00 2001 From: KUANGDD Date: Wed, 23 Oct 2024 15:50:59 +0800 Subject: [PATCH 26/34] rm comment Former-commit-id: 80b58eaaec1996571d24b2dc2b73859cc28911a1 --- src/llamafactory/data/mm_plugin.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index acdcca4e..a601562d 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -510,7 +510,6 @@ class PixtralPlugin(BasePlugin): ) -> Dict[str, Union[List[int], "torch.Tensor"]]: self._validate_input(images, videos) mm_inputs = self._get_mm_inputs(images, videos, processor) - # hack for hf engine if mm_inputs.get("pixel_values"): mm_inputs["pixel_values"] = mm_inputs["pixel_values"][0] From 0a833968a0a01509cbd17edfaadf80cbd363ecf7 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 29 Oct 2024 21:57:28 +0800 Subject: [PATCH 27/34] Update README.md Former-commit-id: 65be32f6b12c2be80a12a4e903001820f64a0833 --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7ec1c87b..c98b8d25 100644 --- a/README.md +++ b/README.md @@ -190,7 +190,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ | [PaliGemma](https://huggingface.co/google) | 3B | paligemma | | [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | | [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | -| [Pixtral](https://huggingface.co/mistral-community) | 12B | pixtral | +| [Pixtral](https://huggingface.co/mistralai) | 12B | pixtral | | [Qwen (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | | [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B/72B | qwen2_vl | | [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | @@ -716,7 +716,7 @@ If 
you have a project that should be incorporated, please contact via email or c This repository is licensed under the [Apache-2.0 License](LICENSE). -Please follow the model licenses to use the corresponding model weights: [Baichuan 2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [GLM-4](https://huggingface.co/THUDM/glm-4-9b/blob/main/LICENSE) / [InternLM2](https://github.com/InternLM/InternLM#license) / [Llama](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [Llama 2 (LLaVA-1.5)](https://ai.meta.com/llama/license/) / [Llama 3](https://llama.meta.com/llama3/license/) / [MiniCPM](https://github.com/OpenBMB/MiniCPM/blob/main/MiniCPM%20Model%20License.md) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/Phi-2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder 2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yi-1.5](LICENSE) / [Yuan 2](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan) +Please follow the model licenses to use the corresponding model weights: [Baichuan 2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [GLM-4](https://huggingface.co/THUDM/glm-4-9b/blob/main/LICENSE) / [InternLM2](https://github.com/InternLM/InternLM#license) / [Llama](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [Llama 2 (LLaVA-1.5)](https://ai.meta.com/llama/license/) / [Llama 3](https://llama.meta.com/llama3/license/) / [MiniCPM](https://github.com/OpenBMB/MiniCPM/blob/main/MiniCPM%20Model%20License.md) / [Mistral/Mixtral/Pixtral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/Phi-2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder 2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yi-1.5](LICENSE) / [Yuan 2](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan) ## Citation From 1fdd0530228641646144b62f2ce037fe1cc3cede Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 29 Oct 2024 
21:58:03 +0800 Subject: [PATCH 28/34] Update README_zh.md Former-commit-id: e14535aa97062d0e57bbf1230c050f2c56a45556 --- README_zh.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README_zh.md b/README_zh.md index a578afcb..b932973c 100644 --- a/README_zh.md +++ b/README_zh.md @@ -191,7 +191,7 @@ https://github.com/user-attachments/assets/e6ce34b0-52d5-4f3e-a830-592106c4c272 | [PaliGemma](https://huggingface.co/google) | 3B | paligemma | | [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | | [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | -| [Pixtral](https://huggingface.co/mistral-community) | 12B | pixtral | +| [Pixtral](https://huggingface.co/mistralai) | 12B | pixtral | | [Qwen (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | | [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B/72B | qwen2_vl | | [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | @@ -717,7 +717,7 @@ run_name: test_run # 可选 本仓库的代码依照 [Apache-2.0](LICENSE) 协议开源。 -使用模型权重时,请遵循对应的模型协议:[Baichuan 2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [GLM-4](https://huggingface.co/THUDM/glm-4-9b/blob/main/LICENSE) / [InternLM2](https://github.com/InternLM/InternLM#license) / [Llama](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [Llama 2 (LLaVA-1.5)](https://ai.meta.com/llama/license/) / [Llama 3](https://llama.meta.com/llama3/license/) / [MiniCPM](https://github.com/OpenBMB/MiniCPM/blob/main/MiniCPM%20Model%20License.md) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/Phi-2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder 2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yi-1.5](LICENSE) / [Yuan 2](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan) +使用模型权重时,请遵循对应的模型协议:[Baichuan 2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [GLM-4](https://huggingface.co/THUDM/glm-4-9b/blob/main/LICENSE) / [InternLM2](https://github.com/InternLM/InternLM#license) / [Llama](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [Llama 2 (LLaVA-1.5)](https://ai.meta.com/llama/license/) / [Llama 3](https://llama.meta.com/llama3/license/) / 
[MiniCPM](https://github.com/OpenBMB/MiniCPM/blob/main/MiniCPM%20Model%20License.md) / [Mistral/Mixtral/Pixtral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/Phi-2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder 2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yi-1.5](LICENSE) / [Yuan 2](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan) ## 引用 From a9afffa2467cc561f01ec50b6cbf46e9897c0f45 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 29 Oct 2024 22:00:59 +0800 Subject: [PATCH 29/34] Update hf_engine.py Former-commit-id: 7412a8b95678ca6827a8c42c9f4d38115fede897 --- src/llamafactory/chat/hf_engine.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py index 53fb666a..87d9c451 100644 --- a/src/llamafactory/chat/hf_engine.py +++ b/src/llamafactory/chat/hf_engine.py @@ -165,17 +165,12 @@ class HuggingfaceEngine(BaseEngine): ) mm_inputs = template.mm_plugin.get_mm_inputs(**mm_input_dict, seqlens=[prompt_length], processor=processor) - for key, value in mm_inputs.items(): - value = ( - value - if isinstance(value, torch.Tensor) - else ( - torch.stack(value) - if isinstance(value, list) and all(isinstance(v, torch.Tensor) for v in value) - else torch.tensor(value) - ) - ) + if isinstance(value, list) and all(isinstance(v, torch.Tensor for v in value)): # for pixtral inputs + value = torch.stack(value) # assume they have same sizes + elif not isinstance(value, torch.Tensor): + value = torch.tensor(value) + gen_kwargs[key] = value.to(model.device) return gen_kwargs, prompt_length From 58fb24ce41f52daed1c5b1a3a3e1e8e6ae17d19b Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 29 Oct 2024 22:03:42 +0800 Subject: [PATCH 30/34] Update collator.py Former-commit-id: 941fa8a0d9c3a9106ad0af6e776db7e57f69548f --- src/llamafactory/data/collator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llamafactory/data/collator.py b/src/llamafactory/data/collator.py index 42b4f565..8fa6f0dd 100644 --- a/src/llamafactory/data/collator.py +++ b/src/llamafactory/data/collator.py @@ -99,8 +99,8 @@ class MultiModalDataCollatorForSeq2Seq(DataCollatorForSeq2Seq): features: Dict[str, "torch.Tensor"] = super().__call__(features) features.update(mm_inputs) - if features.get("pixel_values") is not None and isinstance(features["pixel_values"], list): - features = features.data + if isinstance(features.get("pixel_values"), list): # for pixtral inputs + features = features.data # use default_collate() instead of BatchEncoding.to() return features From 16a9a44849517a19c84b2cf28060a56509c33b56 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 29 Oct 2024 22:10:29 +0800 Subject: [PATCH 31/34] Update visual.py Former-commit-id: 6f1db7b9abfbdea1781452388d66df3e9f9a5dd9 --- src/llamafactory/model/model_utils/visual.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llamafactory/model/model_utils/visual.py b/src/llamafactory/model/model_utils/visual.py index beec1884..bcd21841 100644 --- a/src/llamafactory/model/model_utils/visual.py +++ b/src/llamafactory/model/model_utils/visual.py @@ -113,6 
+113,7 @@ def configure_visual_model(config: "PretrainedConfig") -> None: "llava_next", "llava_next_video", "paligemma", + "pixtral", "video_llava", ]: # required for ds zero3 and valuehead models setattr(config, "hidden_size", getattr(config.text_config, "hidden_size", None)) From 1047217f784e1b6f125bbb3be1e9c7fd7f2a9d06 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 29 Oct 2024 22:11:21 +0800 Subject: [PATCH 32/34] Update template.py Former-commit-id: 99a01547ca31adade1c48feae5796e06b73d387c --- src/llamafactory/data/template.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index 2a11427b..d0da3b30 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -934,6 +934,7 @@ _register_template( replace_eos=True, ) + _register_template( name="pixtral", format_user=StringFormatter(slots=["[INST] {{content}} [/INST]"]), From f6b06d0c6f279d2a0d8004acb0ec2ce478d94dd5 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 29 Oct 2024 22:16:22 +0800 Subject: [PATCH 33/34] Update mm_plugin.py Former-commit-id: 830315cb438e75b589017fd57f70d0a513780a53 --- src/llamafactory/data/mm_plugin.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index de348896..52c65cb7 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -457,19 +457,16 @@ class PixtralPlugin(BasePlugin): videos: Sequence["VideoInput"], processor: Optional["ProcessorMixin"], ) -> List[Dict[str, str]]: - patch_size = processor.patch_size - image_token = processor.image_token - image_break_token = processor.image_break_token - image_end_token = processor.image_end_token - self._validate_input(images, videos) + patch_size = getattr(processor, "patch_size") + image_token = getattr(processor, "image_token") + image_break_token = getattr(processor, "image_break_token") + image_end_token = getattr(processor, "image_end_token") + num_image_tokens = 0 - img_kwargs = self._get_mm_inputs(images, videos, processor) - image_input_sizes = None - - image_input_sizes = img_kwargs.get("image_sizes", None) - messages = deepcopy(messages) + mm_inputs = self._get_mm_inputs(images, videos, processor) + image_input_sizes = mm_inputs.get("image_sizes", None) for message in messages: content = message["content"] while IMAGE_PLACEHOLDER in content: @@ -483,12 +480,10 @@ class PixtralPlugin(BasePlugin): num_height_tokens = height // patch_size num_width_tokens = width // patch_size replace_tokens = [[image_token] * num_width_tokens + [image_break_token]] * num_height_tokens - # Flatten list - replace_tokens = [item for sublist in replace_tokens for item in sublist] + replace_tokens = [item for sublist in replace_tokens for item in sublist] # flatten list replace_tokens[-1] = image_end_token replace_str = "".join(replace_tokens) content = content.replace(IMAGE_PLACEHOLDER, replace_str, 1) - num_image_tokens += 1 message["content"] = content From f0181a41ff562b43bcfc18332e03c2aa42e7eeb9 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 29 Oct 2024 22:19:04 +0800 Subject: [PATCH 34/34] fix bug Former-commit-id: e69665746d9fcd17a92ace7d5d9c8de1fc0c29b7 --- src/llamafactory/chat/hf_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py index 87d9c451..909f8161 100644 --- a/src/llamafactory/chat/hf_engine.py +++ 
b/src/llamafactory/chat/hf_engine.py @@ -166,7 +166,7 @@ class HuggingfaceEngine(BaseEngine): mm_inputs = template.mm_plugin.get_mm_inputs(**mm_input_dict, seqlens=[prompt_length], processor=processor) for key, value in mm_inputs.items(): - if isinstance(value, list) and all(isinstance(v, torch.Tensor for v in value)): # for pixtral inputs + if isinstance(value, list) and all(isinstance(v, torch.Tensor) for v in value): # for pixtral inputs value = torch.stack(value) # assume they have same sizes elif not isinstance(value, torch.Tensor): value = torch.tensor(value)
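A closing note on this final one-character fix: in patch 29 the closing parenthesis of isinstance() landed after the generator, so the generator expression became the second argument of isinstance. CPython rejects that form outright ("SyntaxError: Generator expression must be parenthesized"), because an unparenthesized generator is only allowed as a call's sole argument. A minimal before-and-after sketch, with an illustrative value:

import torch

value = [torch.zeros(2, 3), torch.zeros(2, 3)]  # e.g. per-image pixel tensors

# Broken form (patch 29): the generator ends up inside isinstance() -> SyntaxError.
# all(isinstance(v, torch.Tensor for v in value))

# Fixed form (patch 34): isinstance() runs per element; all() consumes the booleans.
assert all(isinstance(v, torch.Tensor) for v in value)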