[model] add qwen2.5 vl models (#6779)

2025-12-17 04:10:36 +08:00 · 2025-01-31 03:00:29 +08:00
parent 15357cdad9
commit 999c7c8fe0
8 changed files with 77 additions and 30 deletions
--- a/src/llamafactory/data/collator.py
+++ b/src/llamafactory/data/collator.py
@@ -135,12 +135,16 @@ class MultiModalDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
        features: Dict[str, "torch.Tensor"] = super().__call__(features)

        if self.model is not None and hasattr(self.model, "get_rope_index"):  # for qwen2vl mrope
-            features["position_ids"], features["rope_deltas"] = self.model.get_rope_index(
-                input_ids=features["input_ids"],
-                image_grid_thw=mm_inputs.get("image_grid_thw", None),
-                video_grid_thw=mm_inputs.get("video_grid_thw", None),
-                attention_mask=features["attention_mask"],
-            )
+            rope_index_kwargs = {
+                "input_ids": features["input_ids"],
+                "image_grid_thw": mm_inputs.get("image_grid_thw"),
+                "video_grid_thw": mm_inputs.get("video_grid_thw"),
+                "attention_mask": features["attention_mask"],
+            }
+            if "second_per_grid_ts" in mm_inputs:
+                rope_index_kwargs["second_per_grid_ts"] = mm_inputs.get("second_per_grid_ts")
+
+            features["position_ids"], features["rope_deltas"] = self.model.get_rope_index(**rope_index_kwargs)

        if "cross_attention_mask" in mm_inputs:  # for mllama inputs when pad_to_multiple_of is enabled
            cross_attention_mask = mm_inputs.pop("cross_attention_mask")