mirror of
https://github.com/hiyouga/LLaMA-Factory.git
synced 2025-11-07 22:00:03 +08:00
Compare commits
No commits in common. "9afe70c6b99c793fad9c6b23a2661add16fbef42" and "fdafec43ca9a82789e5644d34c44969fd17c1abc" have entirely different histories.
9afe70c6b9
...
fdafec43ca
@ -194,7 +194,7 @@ class MultiModalDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
|
|||||||
elif "video_second_per_grid" in mm_inputs: # for qwen2.5 omni
|
elif "video_second_per_grid" in mm_inputs: # for qwen2.5 omni
|
||||||
rope_index_kwargs["second_per_grids"] = mm_inputs.get("video_second_per_grid")
|
rope_index_kwargs["second_per_grids"] = mm_inputs.get("video_second_per_grid")
|
||||||
|
|
||||||
if getattr(self.model.config, "model_type", None) in ["qwen2_5_omni_thinker", "qwen3_omni_moe_thinker"]: # for qwen2.5 omni
|
if getattr(self.model.config, "model_type", None) == "qwen2_5_omni_thinker": # for qwen2.5 omni
|
||||||
rope_index_kwargs["use_audio_in_video"] = getattr(self.processor, "use_audio_in_video", False)
|
rope_index_kwargs["use_audio_in_video"] = getattr(self.processor, "use_audio_in_video", False)
|
||||||
feature_attention_mask = mm_inputs.get("feature_attention_mask", None)
|
feature_attention_mask = mm_inputs.get("feature_attention_mask", None)
|
||||||
if feature_attention_mask is not None: # FIXME: need to get video image lengths
|
if feature_attention_mask is not None: # FIXME: need to get video image lengths
|
||||||
@ -211,7 +211,7 @@ class MultiModalDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
|
|||||||
if (
|
if (
|
||||||
self.model is not None
|
self.model is not None
|
||||||
and getattr(self.model.config, "model_type", None)
|
and getattr(self.model.config, "model_type", None)
|
||||||
in ["glm4v", "Keye", "qwen2_vl", "qwen2_5_vl", "qwen2_5_omni_thinker", "qwen3_omni_moe_thinker"]
|
in ["glm4v", "Keye", "qwen2_vl", "qwen2_5_vl", "qwen2_5_omni_thinker"]
|
||||||
and ("position_ids" not in features or features["position_ids"].dim() != 3)
|
and ("position_ids" not in features or features["position_ids"].dim() != 3)
|
||||||
):
|
):
|
||||||
raise ValueError(f"{self.model.config.model_type} requires 3D position ids for mrope.")
|
raise ValueError(f"{self.model.config.model_type} requires 3D position ids for mrope.")
|
||||||
|
|||||||
@ -350,32 +350,6 @@ _register_composite_model(
|
|||||||
lora_conflict_keys=["patch_embed"],
|
lora_conflict_keys=["patch_embed"],
|
||||||
)
|
)
|
||||||
|
|
||||||
_register_composite_model(
|
|
||||||
model_type="qwen3_vl",
|
|
||||||
projector_key="visual.merger",
|
|
||||||
vision_model_keys=["visual.patch_embed", "visual.blocks"],
|
|
||||||
language_model_keys=["language_model", "lm_head"],
|
|
||||||
lora_conflict_keys=["patch_embed"],
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
_register_composite_model(
|
|
||||||
model_type="qwen3_vl_moe",
|
|
||||||
projector_key="visual.merger",
|
|
||||||
vision_model_keys=["visual.patch_embed", "visual.blocks"],
|
|
||||||
language_model_keys=["language_model", "lm_head"],
|
|
||||||
lora_conflict_keys=["patch_embed"],
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
_register_composite_model(
|
|
||||||
model_type="qwen3_omni_moe_thinker",
|
|
||||||
projector_key="visual.merger",
|
|
||||||
vision_model_keys=["visual.patch_embed", "visual.blocks", "audio_tower"],
|
|
||||||
language_model_keys=["model", "lm_head"],
|
|
||||||
lora_conflict_keys=["patch_embed"],
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
_register_composite_model(
|
_register_composite_model(
|
||||||
model_type="video_llava",
|
model_type="video_llava",
|
||||||
|
|||||||
@ -332,14 +332,7 @@ def test_qwen2_omni_plugin():
|
|||||||
image_seqlen, audio_seqlen = 4, 2
|
image_seqlen, audio_seqlen = 4, 2
|
||||||
tokenizer_module = _load_tokenizer_module(model_name_or_path="Qwen/Qwen2.5-Omni-7B")
|
tokenizer_module = _load_tokenizer_module(model_name_or_path="Qwen/Qwen2.5-Omni-7B")
|
||||||
qwen2_omni_plugin = get_mm_plugin(
|
qwen2_omni_plugin = get_mm_plugin(
|
||||||
name="qwen2_omni",
|
name="qwen2_omni", audio_token="<|AUDIO|>", image_token="<|IMAGE|>", video_token="<|VIDEO|>"
|
||||||
audio_token="<|AUDIO|>",
|
|
||||||
image_token="<|IMAGE|>",
|
|
||||||
video_token="<|VIDEO|>",
|
|
||||||
vision_bos_token="<|vision_bos|>",
|
|
||||||
vision_eos_token="<|vision_eos|>",
|
|
||||||
audio_bos_token="<|audio_bos|>",
|
|
||||||
audio_eos_token="<|audio_eos|>",
|
|
||||||
)
|
)
|
||||||
check_inputs = {"plugin": qwen2_omni_plugin, **tokenizer_module}
|
check_inputs = {"plugin": qwen2_omni_plugin, **tokenizer_module}
|
||||||
check_inputs["expected_mm_messages"] = [
|
check_inputs["expected_mm_messages"] = [
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user