fix: avoid redundant normalization in DPO's SFT loss calculation (#6722)

Former-commit-id: 971a8ccbdacf130763d40c7ef82a711b2fc1292f
2026-02-06 22:12:19 +08:00 · 2025-01-21 13:38:02 +08:00
parent db9b977e4f
commit a8fae3869d
1 changed file with 5 additions and 1 deletion
--- a/src/llamafactory/train/dpo/trainer.py
+++ b/src/llamafactory/train/dpo/trainer.py
@@ -204,7 +204,11 @@ class CustomDPOTrainer(DPOTrainer):
        chosen_logps, rejected_logps = all_logps.split(batch_size, dim=0)
        chosen_logits, rejected_logits = all_logits.split(batch_size, dim=0)
        chosen_length, _ = valid_length.split(batch_size, dim=0)
-        return chosen_logps, rejected_logps, chosen_logits, rejected_logits, chosen_logps / chosen_length
+
+        if self.loss_type in ["ipo", "orpo", "simpo"]:
+            return chosen_logps, rejected_logps, chosen_logits, rejected_logits, chosen_logps
+        else:
+            return chosen_logps, rejected_logps, chosen_logits, rejected_logits, chosen_logps / chosen_length
    @override
    def compute_reference_log_probs(