Compare commits

..

No commits in common. "56f45e826f828e44fcdca6a1a5a854d4b71f6ec7" and "934b3084ee6c2aad0e9132699c4a58f1dbf55cd2" have entirely different histories.

4 changed files with 11 additions and 12 deletions

View File

@@ -110,10 +110,6 @@ def is_starlette_available():
def is_transformers_version_greater_than(content: str):
return _get_package_version("transformers") >= version.parse(content)
@lru_cache
def is_torch_version_greater_than(content: str):
return _get_package_version("torch") >= version.parse(content)
def is_uvicorn_available():
return _is_package_available("uvicorn")

View File

@@ -16,7 +16,6 @@ from typing import TYPE_CHECKING
from ...extras import logging
from ...extras.constants import AttentionFunction
from ...extras.packages import is_torch_version_greater_than
if TYPE_CHECKING:
@@ -52,14 +51,15 @@ def configure_attn_implementation(config: "PretrainedConfig", model_args: "Model
requested_attn_implementation = "eager"
elif model_args.flash_attn == AttentionFunction.SDPA:
if not is_torch_version_greater_than("2.1.1"):
from transformers.utils import is_torch_sdpa_available
if not is_torch_sdpa_available():
logger.warning_rank0("torch>=2.1.1 is required for SDPA attention.")
return
requested_attn_implementation = "sdpa"
elif model_args.flash_attn == AttentionFunction.FA2:
from transformers import is_torch_npu_available
if not (is_flash_attn_2_available() or is_torch_npu_available()):
if not is_flash_attn_2_available():
logger.warning_rank0("FlashAttention-2 is not installed.")
return

View File

@@ -355,7 +355,7 @@ _register_composite_model(
_register_composite_model(
model_type="qwen3_vl",
projector_key="visual.merger",
vision_model_keys=["visual.patch_embed", "visual.blocks", "visual.deepstack_merger_list"],
vision_model_keys=["visual.patch_embed", "visual.blocks"],
language_model_keys=["language_model", "lm_head"],
lora_conflict_keys=["patch_embed"],
)
@@ -364,7 +364,7 @@ _register_composite_model(
_register_composite_model(
model_type="qwen3_vl_moe",
projector_key="visual.merger",
vision_model_keys=["visual.patch_embed", "visual.blocks", "visual.deepstack_merger_list"],
vision_model_keys=["visual.patch_embed", "visual.blocks"],
language_model_keys=["language_model", "lm_head"],
lora_conflict_keys=["patch_embed"],
)
@@ -373,7 +373,7 @@ _register_composite_model(
_register_composite_model(
model_type="qwen3_omni_moe_thinker",
projector_key="visual.merger",
vision_model_keys=["visual.patch_embed", "visual.blocks", "visual.deepstack_merger_list", "audio_tower"],
vision_model_keys=["visual.patch_embed", "visual.blocks", "audio_tower"],
language_model_keys=["model", "lm_head"],
lora_conflict_keys=["patch_embed"],
)

View File

@@ -203,7 +203,7 @@ class CustomDPOTrainer(DPOTrainer):
bco_losses = self.bco_loss(
policy_chosen_logps, policy_rejected_logps, reference_chosen_logps, reference_rejected_logps
)
losses = (losses + bco_losses * self.bco_gemma) / (1.0 + self.bco_gemma) # re-weight W_p and W_q
losses += bco_losses * self.bco_gemma
return losses, chosen_rewards, rejected_rewards
@@ -284,6 +284,9 @@ class CustomDPOTrainer(DPOTrainer):
sft_loss = -policy_chosen_logps_avg
if self.ftx_gamma > 1e-6:
losses += self.ftx_gamma * sft_loss
if self.bco_gemma > 1e-6:
# re-weighting for MPO
losses /= self.ftx_gamma + self.bco_gemma + 1.0
prefix = "eval_" if train_eval == "eval" else ""
metrics[f"{prefix}rewards/chosen"] = chosen_rewards.mean().item()