diff --git a/README.md b/README.md index 547c84c7..6dfcf186 100644 --- a/README.md +++ b/README.md @@ -390,7 +390,7 @@ huggingface-cli login | ------------ | ------- | --------- | | python | 3.9 | 3.10 | | torch | 1.13.1 | 2.4.0 | -| transformers | 4.41.2 | 4.45.2 | +| transformers | 4.41.2 | 4.49.0 | | datasets | 2.16.0 | 3.2.0 | | accelerate | 0.34.0 | 1.2.1 | | peft | 0.11.1 | 0.12.0 | @@ -399,9 +399,9 @@ huggingface-cli login | Optional | Minimum | Recommend | | ------------ | ------- | --------- | | CUDA | 11.6 | 12.2 | -| deepspeed | 0.10.0 | 0.14.0 | +| deepspeed | 0.10.0 | 0.16.2 | | bitsandbytes | 0.39.0 | 0.43.1 | -| vllm | 0.4.3 | 0.6.6 | +| vllm | 0.4.3 | 0.7.2 | | flash-attn | 2.3.0 | 2.7.2 | ### Hardware Requirement diff --git a/README_zh.md b/README_zh.md index c9064e90..06c58ffb 100644 --- a/README_zh.md +++ b/README_zh.md @@ -392,7 +392,7 @@ huggingface-cli login | ------------ | ------- | --------- | | python | 3.9 | 3.10 | | torch | 1.13.1 | 2.4.0 | -| transformers | 4.41.2 | 4.45.2 | +| transformers | 4.41.2 | 4.49.0 | | datasets | 2.16.0 | 3.2.0 | | accelerate | 0.34.0 | 1.2.1 | | peft | 0.11.1 | 0.12.0 | @@ -401,9 +401,9 @@ huggingface-cli login | 可选项 | 至少 | 推荐 | | ------------ | ------- | --------- | | CUDA | 11.6 | 12.2 | -| deepspeed | 0.10.0 | 0.14.0 | +| deepspeed | 0.10.0 | 0.16.2 | | bitsandbytes | 0.39.0 | 0.43.1 | -| vllm | 0.4.3 | 0.6.6 | +| vllm | 0.4.3 | 0.7.2 | | flash-attn | 2.3.0 | 2.7.2 | ### 硬件依赖 diff --git a/requirements.txt b/requirements.txt index af317707..03639758 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -transformers>=4.41.2,<=4.48.3,!=4.46.*,!=4.47.*,!=4.48.0,!=4.48.1,!=4.48.2;python_version<'3.10' -transformers>=4.41.2,<=4.48.3,!=4.46.*,!=4.47.*,!=4.48.0;python_version>='3.10' +transformers>=4.41.2,<=4.49.0,!=4.46.*,!=4.47.*,!=4.48.*;python_version<'3.10' +transformers>=4.41.2,<=4.49.0,!=4.46.*,!=4.47.*,!=4.48.0;python_version>='3.10' datasets>=2.16.0,<=3.2.0 accelerate>=0.34.0,<=1.2.1 peft>=0.11.1,<=0.12.0 diff --git a/src/llamafactory/__init__.py b/src/llamafactory/__init__.py index 966a32d4..2cfc5a38 100644 --- a/src/llamafactory/__init__.py +++ b/src/llamafactory/__init__.py @@ -20,7 +20,7 @@ Level: Dependency graph: main: - transformers>=4.41.2,<=4.48.3,!=4.46.*,!=4.47.*,!=4.48.0 + transformers>=4.41.2,<=4.49.0,!=4.46.*,!=4.47.*,!=4.48.0 datasets>=2.16.0,<=3.2.0 accelerate>=0.34.0,<=1.2.1 peft>=0.11.1,<=0.12.0 diff --git a/src/llamafactory/data/collator.py b/src/llamafactory/data/collator.py index 72953d7a..68d2978c 100644 --- a/src/llamafactory/data/collator.py +++ b/src/llamafactory/data/collator.py @@ -187,8 +187,6 @@ class MultiModalDataCollatorForSeq2Seq(DataCollatorForSeq2Seq): mm_inputs["cross_attention_mask"] = F.pad(cross_attention_mask, (0, 0, 0, 0, 0, seq_len - orig_len)) features.update(mm_inputs) - if isinstance(features.get("pixel_values"), list): # for pixtral inputs - features = features.data # use default_collate() instead of BatchEncoding.to() if "image_bound" in features: # for minicpmv inputs bsz, seq_length = features["input_ids"].shape diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index 8a0fdab1..2c7a81c2 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -380,10 +380,8 @@ class LlavaNextPlugin(BasePlugin): num_image_tokens = 0 messages = deepcopy(messages) mm_inputs = self._get_mm_inputs(images, videos, audios, processor) - if "image_sizes" in mm_inputs: - image_sizes = iter(mm_inputs["image_sizes"]) - if "pixel_values" in mm_inputs: + image_sizes = iter(mm_inputs["image_sizes"].tolist()) height, width = get_image_size(to_numpy_array(mm_inputs["pixel_values"][0][0])) for message in messages: @@ -439,7 +437,7 @@ class LlavaNextVideoPlugin(BasePlugin): messages = deepcopy(messages) mm_inputs = self._get_mm_inputs(images, videos, audios, processor) if "pixel_values" in mm_inputs: - image_sizes = iter(mm_inputs["image_sizes"]) + image_sizes = iter(mm_inputs["image_sizes"].tolist()) height, width = get_image_size(to_numpy_array(mm_inputs["pixel_values"][0][0])) for message in messages: content = message["content"] @@ -916,16 +914,14 @@ class PixtralPlugin(BasePlugin): num_image_tokens = 0 messages = deepcopy(messages) mm_inputs = self._get_mm_inputs(images, videos, audios, processor) - image_input_sizes = mm_inputs.get("image_sizes", None) + if "pixel_values" in mm_inputs: + image_sizes = iter(mm_inputs["image_sizes"].tolist()) + for message in messages: content = message["content"] while IMAGE_PLACEHOLDER in content: - if image_input_sizes is None: - raise ValueError("Cannot get image input sizes.") - if self.expand_mm_tokens: - image_size = image_input_sizes[0][num_image_tokens] - height, width = image_size + height, width = next(image_sizes) num_height_tokens = height // patch_size num_width_tokens = width // patch_size replace_tokens = [[image_token] * num_width_tokens + [image_break_token]] * num_height_tokens @@ -959,9 +955,6 @@ class PixtralPlugin(BasePlugin): ) -> Dict[str, Union[List[int], "torch.Tensor"]]: self._validate_input(images, videos, audios) mm_inputs = self._get_mm_inputs(images, videos, audios, processor) - if mm_inputs.get("pixel_values"): - mm_inputs["pixel_values"] = mm_inputs["pixel_values"][0] - mm_inputs.pop("image_sizes", None) return mm_inputs diff --git a/src/llamafactory/extras/misc.py b/src/llamafactory/extras/misc.py index 3f15dd04..f637d728 100644 --- a/src/llamafactory/extras/misc.py +++ b/src/llamafactory/extras/misc.py @@ -94,7 +94,7 @@ def check_dependencies() -> None: r""" Checks the version of the required packages. """ - check_version("transformers>=4.41.2,<=4.48.3,!=4.46.0,!=4.46.1,!=4.46.2,!=4.46.3,!=4.47.0,!=4.47.1,!=4.48.0") + check_version("transformers>=4.41.2,<=4.49.0,!=4.46.0,!=4.46.1,!=4.46.2,!=4.46.3,!=4.47.0,!=4.47.1,!=4.48.0") check_version("datasets>=2.16.0,<=3.2.0") check_version("accelerate>=0.34.0,<=1.2.1") check_version("peft>=0.11.1,<=0.12.0")