Mirror of https://github.com/hiyouga/LLaMA-Factory.git
@@ -34,7 +34,7 @@ class PairwiseTrainer(Trainer):
        Subclass and override to inject custom behavior.

        Note that the first element will be removed from the output tuple.
        See: https://github.com/huggingface/transformers/blob/v4.30.2/src/transformers/trainer.py#L3509
        """
        # Compute rewards
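Review note: the `# Compute rewards` step refers to a forward pass through a model with a value head, so the `values` tensor used below holds one scalar score per token rather than language-model logits. A minimal sketch of such a head, under that assumption (the `ToyValueHead` name and sizes are illustrative only, not from the repo):

import torch
import torch.nn as nn

class ToyValueHead(nn.Module):
    """Maps each hidden state to a scalar score, one per token."""
    def __init__(self, hidden_size: int):
        super().__init__()
        self.summary = nn.Linear(hidden_size, 1)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # hidden_states: (batch, seq_len, hidden_size) -> (batch, seq_len)
        return self.summary(hidden_states).squeeze(-1)

head = ToyValueHead(hidden_size=16)
values = head(torch.randn(4, 10, 16))  # shape (4, 10): one reward per token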
@@ -45,9 +45,6 @@ class PairwiseTrainer(Trainer):
        # Split the inputs and rewards into two parts, chosen and rejected
        batch_size = inputs["input_ids"].size(0) // 2
        chosen_input_ids, rejected_input_ids = inputs["input_ids"][:batch_size], inputs["input_ids"][batch_size:]
-       chosen_attn_mask, rejected_attn_mask = (
-           inputs["attention_mask"][:batch_size], inputs["attention_mask"][batch_size:]
-       )
        chosen_rewards, rejected_rewards = values[:batch_size], values[batch_size:]
        chosen_scores, rejected_scores = [], []

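Review note: the split above relies on the pairwise collator stacking all chosen examples before all rejected ones, so index i in the first half pairs with index i in the second half. A toy illustration with made-up token ids:

import torch

# 2 chosen sequences followed by their 2 rejected counterparts, seq_len = 4
input_ids = torch.tensor([
    [1, 2, 3, 0],   # chosen   #0
    [1, 5, 6, 7],   # chosen   #1
    [1, 2, 9, 0],   # rejected #0
    [1, 5, 8, 0],   # rejected #1
])
batch_size = input_ids.size(0) // 2
chosen_input_ids, rejected_input_ids = input_ids[:batch_size], input_ids[batch_size:]
assert chosen_input_ids.shape == rejected_input_ids.shape == (2, 4)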
@@ -55,8 +52,8 @@ class PairwiseTrainer(Trainer):
        # Inspired by: https://github.com/CarperAI/trlx/blob/main/examples/summarize_rlhf/reward_model/reward_model.py
        loss = 0
        for i in range(batch_size):
-           chosen_length = chosen_attn_mask[i].nonzero()[-1] + 1
-           rejected_length = rejected_attn_mask[i].nonzero()[-1] + 1
+           chosen_length = (chosen_input_ids[i] != self.tokenizer.pad_token_id).nonzero()[-1] + 1
+           rejected_length = (rejected_input_ids[i] != self.tokenizer.pad_token_id).nonzero()[-1] + 1
            check_divergence = (chosen_input_ids[i] != rejected_input_ids[i]).nonzero()

            if len(check_divergence) == 0:
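Review note: deriving lengths from `pad_token_id` instead of the attention mask keeps the logic correct even when the attention mask is not a plain 0/1 right-padding mask. A standalone sketch of what the two indices mean, using a pad id of 0 purely for illustration:

import torch

pad_token_id = 0  # illustrative; the real value comes from the tokenizer
chosen   = torch.tensor([1, 2, 3, 4, 0, 0])  # right-padded
rejected = torch.tensor([1, 2, 9, 0, 0, 0])

chosen_length = (chosen != pad_token_id).nonzero()[-1] + 1      # tensor([4])
rejected_length = (rejected != pad_token_id).nonzero()[-1] + 1  # tensor([3])

# First position where the two sequences differ, i.e. where the shared prefix ends.
check_divergence = (chosen != rejected).nonzero()
div_index = check_divergence[0]                  # tensor([2])
end_index = max(chosen_length, rejected_length)  # tensor([4])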
@@ -69,7 +66,7 @@ class PairwiseTrainer(Trainer):
            assert div_index > 0
            chosen_trunc_rewards = chosen_rewards[i, div_index:end_index]
            rejected_trunc_rewards = rejected_rewards[i, div_index:end_index]
-           if return_outputs: # use the score on the EOS token for inference
+           if return_outputs: # use the score on the last token except pad token for inference
                chosen_scores.append(chosen_rewards[i, chosen_length-1])
                rejected_scores.append(rejected_rewards[i, rejected_length-1])
            loss += -torch.nn.functional.logsigmoid(chosen_trunc_rewards - rejected_trunc_rewards).mean()
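Review note: the accumulated objective is a Bradley-Terry style pairwise ranking loss, `-logsigmoid(r_chosen - r_rejected)`, averaged over the token positions between the divergence point and the end of the longer response. A toy check with made-up reward values:

import torch

chosen_trunc_rewards = torch.tensor([0.8, 1.2, 0.5])
rejected_trunc_rewards = torch.tensor([0.1, -0.3, 0.2])

pair_loss = -torch.nn.functional.logsigmoid(
    chosen_trunc_rewards - rejected_trunc_rewards
).mean()
# Loss shrinks as the chosen rewards exceed the rejected ones at every position.
print(pair_loss)  # roughly 0.39 for these values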
@@ -95,7 +92,6 @@ class PairwiseTrainer(Trainer):
        output_prediction_file = os.path.join(self.args.output_dir, "generated_predictions.jsonl")
        logger.info(f"Saving prediction results to {output_prediction_file}")

        chosen_scores, rejected_scores = predict_results.predictions

        with open(output_prediction_file, "w", encoding="utf-8") as writer:
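Review note: the `with open(...)` block presumably goes on to dump one JSON object per evaluation pair. A minimal, self-contained sketch of such a loop; the field names, rounding, and toy scores are assumptions, not taken from this diff:

import json

# Toy stand-ins for predict_results.predictions
chosen_scores, rejected_scores = [1.37, 0.82], [-0.44, 0.19]

records = [
    json.dumps({"chosen": round(float(c), 2), "rejected": round(float(r), 2)})
    for c, r in zip(chosen_scores, rejected_scores)
]
with open("generated_predictions.jsonl", "w", encoding="utf-8") as writer:
    writer.write("\n".join(records))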
@@ -28,6 +28,7 @@ def run_rm(
    dataset = preprocess_dataset(dataset, tokenizer, data_args, training_args, stage="rm")
    data_collator = PairwiseDataCollatorWithPadding(tokenizer, pad_to_multiple_of=4)

    # Update arguments
    training_args_dict = training_args.to_dict()
    training_args_dict.update(dict(remove_unused_columns=False)) # important for pairwise dataset
    training_args = Seq2SeqTrainingArguments(**training_args_dict)
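Review note: `remove_unused_columns=False` matters because `Trainer` otherwise drops dataset columns that do not appear in the model's `forward` signature, which would strip the extra columns the pairwise collator consumes; the diff rebuilds the arguments from `to_dict()` rather than mutating them in place. A standalone sketch of that round-trip (the `output_dir` value is arbitrary; depending on the transformers version, token-valued fields come back masked from `to_dict()`, which is harmless here):

from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(output_dir="toy_rm_output")
assert training_args.remove_unused_columns is True  # the default

training_args_dict = training_args.to_dict()
training_args_dict.update(dict(remove_unused_columns=False))
training_args = Seq2SeqTrainingArguments(**training_args_dict)
assert training_args.remove_unused_columns is False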