diff --git a/src/llamafactory/model/model_utils/visual.py b/src/llamafactory/model/model_utils/visual.py index cfbe6a22..4ca64569 100644 --- a/src/llamafactory/model/model_utils/visual.py +++ b/src/llamafactory/model/model_utils/visual.py @@ -355,7 +355,7 @@ _register_composite_model( _register_composite_model( model_type="qwen3_vl", projector_key="visual.merger", - vision_model_keys=["visual.patch_embed", "visual.blocks"], + vision_model_keys=["visual.patch_embed", "visual.blocks", "visual.deepstack_merger_list"], language_model_keys=["language_model", "lm_head"], lora_conflict_keys=["patch_embed"], ) @@ -364,7 +364,7 @@ _register_composite_model( _register_composite_model( model_type="qwen3_vl_moe", projector_key="visual.merger", - vision_model_keys=["visual.patch_embed", "visual.blocks"], + vision_model_keys=["visual.patch_embed", "visual.blocks", "visual.deepstack_merger_list"], language_model_keys=["language_model", "lm_head"], lora_conflict_keys=["patch_embed"], ) @@ -373,7 +373,7 @@ _register_composite_model( _register_composite_model( model_type="qwen3_omni_moe_thinker", projector_key="visual.merger", - vision_model_keys=["visual.patch_embed", "visual.blocks", "audio_tower"], + vision_model_keys=["visual.patch_embed", "visual.blocks", "visual.deepstack_merger_list", "audio_tower"], language_model_keys=["model", "lm_head"], lora_conflict_keys=["patch_embed"], )