From 000526908ae92cc1567130559f04a3436855e0cd Mon Sep 17 00:00:00 2001 From: Username_Full Date: Wed, 31 Dec 2025 20:54:27 +0800 Subject: [PATCH] [core deps] upgrade TRL to be between 0.18 and 0.24 (#9617) Co-authored-by: Yaowei Zheng --- .github/workflows/tests.yml | 8 +++--- pyproject.toml | 6 ++-- src/llamafactory/extras/misc.py | 4 +-- src/llamafactory/train/dpo/trainer.py | 38 +++++++++++++++---------- src/llamafactory/train/kto/trainer.py | 10 ++++++- src/llamafactory/train/ppo/trainer.py | 21 ++++++++++++-- src/llamafactory/train/trainer_utils.py | 2 +- 7 files changed, 60 insertions(+), 29 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 3def9eb89..a10d7d850 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -33,17 +33,17 @@ jobs: - "windows-latest" - "macos-latest" transformers: - - null + - "" include: # test backward compatibility - - python: "3.11" - os: "ubuntu-latest" - transformers: "4.49.0" - python: "3.11" os: "ubuntu-latest" transformers: "4.51.0" - python: "3.11" os: "ubuntu-latest" transformers: "4.53.0" + - python: "3.11" + os: "ubuntu-latest" + transformers: "4.55.0" runs-on: ${{ matrix.os }} diff --git a/pyproject.toml b/pyproject.toml index fe23272ae..a60f47606 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,12 +41,12 @@ dependencies = [ "torch>=2.4.0", "torchvision>=0.19.0", "torchaudio>=2.4.0", - "transformers>=4.49.0,<=4.56.2,!=4.52.0; python_version < '3.10'", - "transformers>=4.49.0,<=4.57.1,!=4.52.0,!=4.57.0; python_version >= '3.10'", + "transformers>=4.51.0,<=4.56.2,!=4.52.0; python_version < '3.10'", + "transformers>=4.51.0,<=4.57.1,!=4.52.0,!=4.57.0; python_version >= '3.10'", "datasets>=2.16.0,<=4.0.0", "accelerate>=1.3.0,<=1.11.0", "peft>=0.14.0,<=0.17.1", - "trl>=0.8.6,<=0.9.6", + "trl>=0.18.0,<=0.24.0", "torchdata>=0.10.0,<=0.11.0", # gui "gradio>=4.38.0,<=5.50.0", diff --git a/src/llamafactory/extras/misc.py b/src/llamafactory/extras/misc.py index ba35bafb2..36c140287 100644 --- a/src/llamafactory/extras/misc.py +++ b/src/llamafactory/extras/misc.py @@ -94,11 +94,11 @@ def check_version(requirement: str, mandatory: bool = False) -> None: def check_dependencies() -> None: r"""Check the version of the required packages.""" - check_version("transformers>=4.49.0,<=4.57.1") + check_version("transformers>=4.51.0,<=4.57.1") check_version("datasets>=2.16.0,<=4.0.0") check_version("accelerate>=1.3.0,<=1.11.0") check_version("peft>=0.14.0,<=0.17.1") - check_version("trl>=0.8.6,<=0.9.6") + check_version("trl>=0.18.0,<=0.24.0") def calculate_tps(dataset: list[dict[str, Any]], metrics: dict[str, float], stage: Literal["sft", "rm"]) -> float: diff --git a/src/llamafactory/train/dpo/trainer.py b/src/llamafactory/train/dpo/trainer.py index acc1c4863..7780e20ee 100644 --- a/src/llamafactory/train/dpo/trainer.py +++ b/src/llamafactory/train/dpo/trainer.py @@ -26,6 +26,7 @@ import torch.nn.functional as F from transformers import Trainer from trl import DPOTrainer from trl.trainer import disable_dropout_in_model +from trl.trainer.utils import prepare_deepspeed from typing_extensions import override from ...extras.constants import IGNORE_INDEX @@ -95,7 +96,7 @@ class CustomDPOTrainer(DPOTrainer): if not ( getattr(ref_model, "is_loaded_in_8bit", False) or getattr(ref_model, "is_loaded_in_4bit", False) ): # quantized models are already set on the correct device - self.ref_model = self._prepare_deepspeed(self.ref_model) + self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator) else: self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True) self.ref_model.eval() @@ -210,7 +211,7 @@ class CustomDPOTrainer(DPOTrainer): @override def concatenated_forward( self, model: "PreTrainedModel", batch: dict[str, "torch.Tensor"], is_ref_model: bool = False - ) -> tuple["torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor"]: + ) -> dict[str, "torch.Tensor"]: r"""Compute the sum log probabilities of the labels under given logits if loss_type is not IPO, ORPO or SimPO. Otherwise the average log probabilities. @@ -230,11 +231,18 @@ class CustomDPOTrainer(DPOTrainer): chosen_logps, rejected_logps = all_logps.split(batch_size, dim=0) chosen_logits, rejected_logits = all_logits.split(batch_size, dim=0) chosen_length, _ = valid_length.split(batch_size, dim=0) - if self.loss_type in ["ipo", "orpo", "simpo"]: - return chosen_logps, rejected_logps, chosen_logits, rejected_logits, chosen_logps + chosen_logps_avg = chosen_logps else: - return chosen_logps, rejected_logps, chosen_logits, rejected_logits, chosen_logps / chosen_length + chosen_logps_avg = chosen_logps / chosen_length + + return { + "chosen_logps": chosen_logps, + "rejected_logps": rejected_logps, + "chosen_logits": chosen_logits, + "rejected_logits": rejected_logits, + "chosen_logps_avg": chosen_logps_avg, + } @override def compute_reference_log_probs( @@ -252,9 +260,9 @@ class CustomDPOTrainer(DPOTrainer): ref_context = nullcontext() with torch.no_grad(), ref_context: - reference_chosen_logps, reference_rejected_logps, *_ = self.concatenated_forward( - ref_model, batch, is_ref_model=True - ) + ref_output = self.concatenated_forward(ref_model, batch, is_ref_model=True) + reference_chosen_logps = ref_output["chosen_logps"] + reference_rejected_logps = ref_output["rejected_logps"] return reference_chosen_logps, reference_rejected_logps @@ -267,13 +275,13 @@ class CustomDPOTrainer(DPOTrainer): ) -> tuple["torch.Tensor", dict[str, "torch.Tensor"]]: r"""Compute the DPO loss and other metrics for the given batch of inputs for train or test.""" metrics = {} - ( - policy_chosen_logps, - policy_rejected_logps, - policy_chosen_logits, - policy_rejected_logits, - policy_chosen_logps_avg, - ) = self.concatenated_forward(model, batch) + + model_output = self.concatenated_forward(model, batch) + policy_chosen_logps = model_output["chosen_logps"] + policy_rejected_logps = model_output["rejected_logps"] + policy_chosen_logits = model_output["chosen_logits"] + policy_rejected_logits = model_output["rejected_logits"] + policy_chosen_logps_avg = model_output["chosen_logps_avg"] reference_chosen_logps, reference_rejected_logps = self.compute_reference_log_probs(model, batch) losses, chosen_rewards, rejected_rewards = self.compute_preference_loss( diff --git a/src/llamafactory/train/kto/trainer.py b/src/llamafactory/train/kto/trainer.py index f67d0eced..eea92f3be 100644 --- a/src/llamafactory/train/kto/trainer.py +++ b/src/llamafactory/train/kto/trainer.py @@ -25,6 +25,7 @@ import torch from transformers import Trainer from trl import KTOTrainer from trl.trainer import disable_dropout_in_model +from trl.trainer.utils import prepare_deepspeed from typing_extensions import override from ...extras.constants import IGNORE_INDEX @@ -77,6 +78,13 @@ class CustomKTOTrainer(KTOTrainer): self.desirable_weight = finetuning_args.kto_chosen_weight self.undesirable_weight = finetuning_args.kto_rejected_weight self.ftx_gamma = finetuning_args.pref_ftx + # trl + # Not all losses require a KL calculation + self.calculate_KL = True + if hasattr(self, "loss_type") and self.loss_type in ["apo_zero_unpaired"]: + self.calculate_KL = False + else: + self.loss_type = "kto" Trainer.__init__(self, model=model, **kwargs) self.model_accepts_loss_kwargs = False # overwrite trainer's default behavior @@ -90,7 +98,7 @@ class CustomKTOTrainer(KTOTrainer): if not ( getattr(ref_model, "is_loaded_in_8bit", False) or getattr(ref_model, "is_loaded_in_4bit", False) ): # quantized models are already set on the correct device - self.ref_model = self._prepare_deepspeed(self.ref_model) + self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator) else: self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True) self.ref_model.eval() diff --git a/src/llamafactory/train/ppo/trainer.py b/src/llamafactory/train/ppo/trainer.py index 09d12a851..eaa74bb33 100644 --- a/src/llamafactory/train/ppo/trainer.py +++ b/src/llamafactory/train/ppo/trainer.py @@ -33,12 +33,12 @@ from transformers.trainer_pt_utils import remove_dummy_checkpoint from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR from transformers.utils import SAFE_WEIGHTS_NAME, WEIGHTS_NAME from trl import PPOConfig, PPOTrainer -from trl.core import PPODecorators, logprobs_from_logits +from trl import __version__ as trl_version from trl.models.utils import unwrap_model_for_generation from typing_extensions import override from ...extras import logging -from ...extras.misc import AverageMeter, count_parameters, get_current_device, get_logits_processor +from ...extras.misc import AverageMeter, count_parameters, get_current_device, get_logits_processor, torch_gc from ..callbacks import FixValueHeadModelCallback, SaveProcessorCallback from ..trainer_utils import create_custom_optimizer, create_custom_scheduler from .ppo_utils import dump_layernorm, get_rewards_from_server, replace_model, restore_layernorm @@ -83,6 +83,19 @@ class CustomPPOTrainer(PPOTrainer, Trainer): if eval_dataset is not None: raise NotImplementedError("PPOTrainer does not support eval dataset yet.") + # Check if TRL version is compatible (0.8.6 <= version <= 0.9.6) + try: + from transformers.utils.versions import require_version + + require_version( + "trl>=0.8.6,<=0.9.6", + "Incompatible TRL version detected. LLaMA-Factory ppo requires TRL version >=0.8.6,<=0.9.6. " + f"Found version {trl_version}. Please install the correct version with: `pip install trl>=0.8.6,<=0.9.6`\n" + "To fix: run `DISABLE_VERSION_CHECK=1 llamafactory-cli train example_ppo.yaml`\n", + ) + except ImportError as e: + raise e + backward_batch_size = training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps ppo_config = PPOConfig( model_name=model_args.model_name_or_path, @@ -406,7 +419,6 @@ class CustomPPOTrainer(PPOTrainer, Trainer): return rewards.float().detach() # use fp32 type @override - @PPODecorators.empty_device_cache() def batched_forward_pass( self, model: "AutoModelForCausalLMWithValueHead", @@ -420,6 +432,9 @@ class CustomPPOTrainer(PPOTrainer, Trainer): Subclass and override to inject custom behavior. """ + from trl.core import logprobs_from_logits + + torch_gc() bs = len(queries) fbs = self.config.mini_batch_size all_logprobs = [] diff --git a/src/llamafactory/train/trainer_utils.py b/src/llamafactory/train/trainer_utils.py index 60adb2ecc..ec291e447 100644 --- a/src/llamafactory/train/trainer_utils.py +++ b/src/llamafactory/train/trainer_utils.py @@ -108,7 +108,7 @@ def create_modelcard_and_push( elif training_args.push_to_hub: trainer.push_to_hub(**kwargs) else: - trainer.create_model_card(license="other", **kwargs) # prevent from connecting to hub + Trainer.create_model_card(trainer, license="other", **kwargs) # prevent from connecting to hub def create_ref_model(