Merge remote-tracking branch 'upstream/main'

Former-commit-id: ea1f3ba5e0
Jonery
2024-06-17 18:44:51 +08:00
184 changed files with 5411 additions and 1780 deletions


@@ -1,3 +1,17 @@
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .workflow import run_dpo


@@ -1,3 +1,22 @@
# Copyright 2024 HuggingFace Inc. and the LlamaFactory team.
#
# This code is inspired by HuggingFace's TRL library.
# https://github.com/huggingface/trl/blob/v0.8.0/trl/trainer/dpo_trainer.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import warnings
from collections import defaultdict
from contextlib import nullcontext
from types import MethodType
@@ -7,10 +26,10 @@ import torch
import torch.nn.functional as F
from transformers import Trainer
from trl import DPOTrainer
from trl.trainer.utils import disable_dropout_in_model
from trl.trainer import disable_dropout_in_model
from ...extras.constants import IGNORE_INDEX
from ..utils import create_custom_optimzer, create_custom_scheduler
from ..trainer_utils import convert_pissa_adapter, create_custom_optimzer, create_custom_scheduler, get_batch_logps
if TYPE_CHECKING:
@@ -61,6 +80,8 @@ class CustomDPOTrainer(DPOTrainer):
if not hasattr(self, "accelerator"):
raise AttributeError("Please update `transformers`.")
warnings.simplefilter("ignore") # remove gc warnings on ref model
if ref_model is not None:
if self.is_deepspeed_enabled:
if not (
@@ -69,6 +90,10 @@ class CustomDPOTrainer(DPOTrainer):
self.ref_model = self._prepare_deepspeed(self.ref_model)
else:
self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True)
self.ref_model.eval()
if finetuning_args.pissa_convert:
self.save_model(os.path.join(self.args.output_dir, "pissa_init"))
if finetuning_args.use_badam:
from badam import clip_grad_norm_for_sparse_tensor
@@ -88,22 +113,13 @@ class CustomDPOTrainer(DPOTrainer):
def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None:
super()._save(output_dir, state_dict)
output_dir = output_dir if output_dir is not None else self.args.output_dir
if self.finetuning_args.pissa_convert:
convert_pissa_adapter(output_dir, state_dict, self.accelerator, self.model, self.args)
if self.processor is not None:
output_dir = output_dir if output_dir is not None else self.args.output_dir
getattr(self.processor, "image_processor").save_pretrained(output_dir)
def sft_loss(self, batch: Dict[str, "torch.Tensor"], chosen_logits: "torch.FloatTensor") -> "torch.Tensor":
r"""
Computes supervised cross-entropy loss of given labels under the given logits.
Returns:
A tensor of shape (batch_size,) containing the cross-entropy loss of each sample.
"""
batch_size = batch["input_ids"].size(0) // 2
chosen_labels, _ = batch["labels"].split(batch_size, dim=0)
chosen_logps = self.get_batch_logps(chosen_logits, chosen_labels, average_log_prob=True)
return -chosen_logps
def odds_ratio_loss(self, chosen_logps: "torch.Tensor", rejected_logps: "torch.Tensor") -> "torch.Tensor":
r"""
Computes ORPO's odds ratio (OR) loss for batched log probabilities of the policy model.
@@ -155,9 +171,9 @@ class CustomDPOTrainer(DPOTrainer):
def concatenated_forward(
self, model: "PreTrainedModel", batch: Dict[str, "torch.Tensor"]
) -> Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor"]:
) -> Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor"]:
r"""
Computes the sum log probabilities of the labels under the given logits if loss_type != IPO.
Computes the sum log probabilities of the labels under the given logits if loss_type is not IPO, ORPO, or SimPO.
Otherwise the average log probabilities.
"""
@@ -166,20 +182,18 @@ class CustomDPOTrainer(DPOTrainer):
all_logits: "torch.Tensor" = model(**batch, return_dict=True, use_cache=False).logits.to(torch.float32)
all_logps = self.get_batch_logps(
logits=all_logits,
labels=batch["labels"],
average_log_prob=(self.loss_type in ["ipo", "orpo", "simpo"]),
is_encoder_decoder=self.is_encoder_decoder,
label_pad_token_id=self.label_pad_token_id,
)
all_logps, valid_length = get_batch_logps(logits=all_logits, labels=batch["labels"])
if self.loss_type in ["ipo", "orpo", "simpo"]:
all_logps = all_logps / valid_length
batch_size = batch["input_ids"].size(0) // 2
chosen_logps, rejected_logps = all_logps.split(batch_size, dim=0)
chosen_logits, rejected_logits = all_logits.split(batch_size, dim=0)
return chosen_logps, rejected_logps, chosen_logits, rejected_logits
chosen_length, _ = valid_length.split(batch_size, dim=0)
return chosen_logps, rejected_logps, chosen_logits, rejected_logits, chosen_logps / chosen_length
def compute_reference_log_probs(
self, batch: Dict[str, "torch.Tensor"]
self, model: "PreTrainedModel", batch: Dict[str, "torch.Tensor"]
) -> Tuple[Optional["torch.Tensor"], Optional["torch.Tensor"]]:
r"""
Computes log probabilities of the reference model.
@@ -188,19 +202,14 @@ class CustomDPOTrainer(DPOTrainer):
return None, None
if self.ref_model is None:
ref_model = self.model
ref_context = self.accelerator.unwrap_model(self.model).disable_adapter()
ref_model = model
ref_context = self.accelerator.unwrap_model(model).disable_adapter()
else:
ref_model = self.ref_model
ref_context = nullcontext()
with torch.no_grad(), ref_context:
(
reference_chosen_logps,
reference_rejected_logps,
_,
_,
) = self.concatenated_forward(ref_model, batch)
reference_chosen_logps, reference_rejected_logps, *_ = self.concatenated_forward(ref_model, batch)
return reference_chosen_logps, reference_rejected_logps
@@ -219,16 +228,17 @@ class CustomDPOTrainer(DPOTrainer):
policy_rejected_logps,
policy_chosen_logits,
policy_rejected_logits,
policy_chosen_logps_avg,
) = self.concatenated_forward(model, batch)
reference_chosen_logps, reference_rejected_logps = self.compute_reference_log_probs(batch)
reference_chosen_logps, reference_rejected_logps = self.compute_reference_log_probs(model, batch)
losses, chosen_rewards, rejected_rewards = self.compute_preference_loss(
policy_chosen_logps,
policy_rejected_logps,
reference_chosen_logps,
reference_rejected_logps,
)
sft_loss = self.sft_loss(batch, policy_chosen_logits) # compute chosen_logps with masks
sft_loss = -policy_chosen_logps_avg
if self.ftx_gamma > 1e-6:
losses += self.ftx_gamma * sft_loss
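
The change above drops the separate masked sft_loss helper: the average chosen log-probabilities already returned by concatenated_forward double as the supervised signal. A small sketch of how that auxiliary term folds into the preference loss, using dummy tensors and an illustrative ftx_gamma value:

import torch

def mix_ftx_loss(preference_losses, chosen_logps_avg, ftx_gamma):
    # The negated average log-probability of the chosen responses is the
    # per-sample cross-entropy (SFT) loss.
    sft_loss = -chosen_logps_avg
    if ftx_gamma > 1e-6:
        preference_losses = preference_losses + ftx_gamma * sft_loss
    return preference_losses

losses = mix_ftx_loss(torch.tensor([0.7, 0.5]), torch.tensor([-1.2, -0.9]), ftx_gamma=0.1)
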


@@ -1,4 +1,19 @@
# Inspired by: https://github.com/huggingface/trl/blob/main/examples/research_projects/stack_llama_2/scripts/dpo_llama2.py
# Copyright 2024 HuggingFace Inc. and the LlamaFactory team.
#
# This code is inspired by HuggingFace's TRL library.
# https://github.com/huggingface/trl/blob/v0.8.0/examples/scripts/dpo.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING, List, Optional
@@ -7,7 +22,7 @@ from ...extras.constants import IGNORE_INDEX
from ...extras.ploting import plot_loss
from ...hparams import ModelArguments
from ...model import load_model, load_tokenizer
from ..utils import create_modelcard_and_push, create_ref_model
from ..trainer_utils import create_modelcard_and_push, create_ref_model
from .trainer import CustomDPOTrainer


@@ -1,3 +1,17 @@
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .workflow import run_kto


@@ -1,18 +1,37 @@
# Copyright 2024 HuggingFace Inc. and the LlamaFactory team.
#
# This code is inspired by HuggingFace's TRL library.
# https://github.com/huggingface/trl/blob/v0.8.0/trl/trainer/kto_trainer.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
from collections import defaultdict
from contextlib import nullcontext
from types import MethodType
from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union
from typing import TYPE_CHECKING, Dict, Literal, Optional, Tuple, Union
import torch
from transformers import Trainer
from trl import KTOTrainer
from trl.trainer.utils import disable_dropout_in_model
from trl.trainer import disable_dropout_in_model
from ...extras.constants import IGNORE_INDEX
from ..utils import create_custom_optimzer, create_custom_scheduler
from ..trainer_utils import create_custom_optimzer, create_custom_scheduler, get_batch_logps
if TYPE_CHECKING:
import torch.utils.data
from transformers import PreTrainedModel, ProcessorMixin
from ...hparams import FinetuningArguments
@@ -59,6 +78,8 @@ class CustomKTOTrainer(KTOTrainer):
if not hasattr(self, "accelerator"):
raise AttributeError("Please update `transformers`.")
warnings.simplefilter("ignore") # remove gc warnings on ref model
if ref_model is not None:
if self.is_deepspeed_enabled:
if not (
@@ -67,6 +88,7 @@ class CustomKTOTrainer(KTOTrainer):
self.ref_model = self._prepare_deepspeed(self.ref_model)
else:
self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True)
self.ref_model.eval()
if finetuning_args.use_badam:
from badam import clip_grad_norm_for_sparse_tensor
@@ -84,73 +106,74 @@ class CustomKTOTrainer(KTOTrainer):
create_custom_scheduler(self.args, num_training_steps, optimizer)
return super().create_scheduler(num_training_steps, optimizer)
def _get_train_sampler(self) -> Optional["torch.utils.data.Sampler"]:
r"""
Replaces the sequential sampler created by trl's KTOTrainer with a random sampler.
"""
return Trainer._get_train_sampler(self)
def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None:
super()._save(output_dir, state_dict)
output_dir = output_dir if output_dir is not None else self.args.output_dir
if self.processor is not None:
output_dir = output_dir if output_dir is not None else self.args.output_dir
getattr(self.processor, "image_processor").save_pretrained(output_dir)
def sft_loss(self, chosen_logits: "torch.FloatTensor", chosen_labels: "torch.LongTensor") -> "torch.Tensor":
r"""
Computes supervised cross-entropy loss of given labels under the given logits.
Returns:
A tensor of shape (batch_size,) containing the cross-entropy loss of each sample.
"""
all_logps = self.get_batch_logps(chosen_logits, chosen_labels, average_log_prob=True)
return -all_logps
def forward(
self, model: "PreTrainedModel", batch: Dict[str, "torch.Tensor"]
) -> Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor"]:
with torch.no_grad():
kl_model_inputs = {"input_ids": batch["kl_input_ids"], "attention_mask": batch["kl_attention_mask"]}
if "pixel_values" in batch:
kl_model_inputs["pixel_values"] = batch["pixel_values"]
if "kl_token_type_ids" in batch:
kl_model_inputs["token_type_ids"] = batch["kl_token_type_ids"]
kl_logits = model(**kl_model_inputs, return_dict=True, use_cache=False).logits.to(torch.float32)
model_inputs = {"input_ids": batch["input_ids"], "attention_mask": batch["attention_mask"]}
self, model: "PreTrainedModel", batch: Dict[str, "torch.Tensor"], prefix: Literal["", "kl_"] = ""
) -> Tuple["torch.Tensor", "torch.Tensor"]:
r"""
Runs forward pass and computes the log probabilities.
"""
batch = {k: v.detach().clone() for k, v in batch.items()} # avoid error
model_inputs = {
"input_ids": batch["{}input_ids".format(prefix)],
"attention_mask": batch["{}attention_mask".format(prefix)],
}
if "pixel_values" in batch:
model_inputs["pixel_values"] = batch["pixel_values"]
if "token_type_ids" in batch:
model_inputs["token_type_ids"] = batch["token_type_ids"]
if "{}token_type_ids".format(prefix) in batch:
model_inputs["token_type_ids"] = batch["{}token_type_ids".format(prefix)]
target_logits = model(**model_inputs, return_dict=True, use_cache=False).logits.to(torch.float32)
logits = model(**model_inputs, return_dict=True, use_cache=False).logits.to(torch.float32)
target_logps = self.get_batch_logps(
logits=target_logits,
labels=batch["labels"],
average_log_prob=False,
is_encoder_decoder=self.is_encoder_decoder,
label_pad_token_id=self.label_pad_token_id,
)
logps, valid_length = get_batch_logps(logits=logits, labels=batch["{}labels".format(prefix)])
return logps, logps / valid_length
kl_logps = self.get_batch_logps(
logits=kl_logits,
labels=batch["kl_labels"],
average_log_prob=False,
is_encoder_decoder=self.is_encoder_decoder,
label_pad_token_id=self.label_pad_token_id,
)
def concatenated_forward(
self, model: "PreTrainedModel", batch: Dict[str, "torch.Tensor"]
) -> Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor"]:
target_logps, target_logps_avg = self.forward(model, batch)
with torch.no_grad():
kl_logps, _ = self.forward(model, batch, prefix="kl_")
if len(target_logps) != len(batch["kto_tags"]):
raise ValueError("Mismatched shape of inputs and labels.")
chosen_idx = [i for i in range(len(target_logps)) if batch["kto_tags"][i]]
rejected_idx = [i for i in range(len(target_logps)) if not batch["kto_tags"][i]]
chosen_logps = target_logps[batch["kto_tags"]]
rejected_logps = target_logps[~batch["kto_tags"]]
chosen_logps_avg = target_logps_avg[batch["kto_tags"]]
return chosen_logps, rejected_logps, kl_logps, chosen_logps_avg
chosen_logps = target_logps[chosen_idx, ...]
rejected_logps = target_logps[rejected_idx, ...]
def compute_reference_log_probs(
self, model: "PreTrainedModel", batch: Dict[str, "torch.Tensor"]
) -> Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor"]:
r"""
Computes log probabilities of the reference model.
"""
if self.ref_model is None:
ref_model = model
ref_context = self.accelerator.unwrap_model(model).disable_adapter()
else:
ref_model = self.ref_model
ref_context = nullcontext()
chosen_logits = target_logits[chosen_idx, ...]
rejected_logits = target_logits[rejected_idx, ...]
with torch.no_grad(), ref_context:
reference_chosen_logps, reference_rejected_logps, reference_kl_logps, _ = self.concatenated_forward(
ref_model, batch
)
return chosen_logps, rejected_logps, chosen_logits, rejected_logits, kl_logps
return reference_chosen_logps, reference_rejected_logps, reference_kl_logps
def get_batch_loss_metrics(
self,
@@ -161,31 +184,12 @@ class CustomKTOTrainer(KTOTrainer):
Computes the KTO loss and other metrics for the given batch of inputs for train or test.
"""
metrics = {}
(
policy_chosen_logps,
policy_rejected_logps,
policy_chosen_logits,
_,
policy_kl_logps,
) = self.forward(model, batch)
with torch.no_grad():
if self.ref_model is None:
ref_model = self.model
ref_context = self.accelerator.unwrap_model(self.model).disable_adapter()
else:
ref_model = self.ref_model
ref_context = nullcontext()
with ref_context:
(
reference_chosen_logps,
reference_rejected_logps,
_,
_,
reference_kl_logps,
) = self.forward(ref_model, batch)
policy_chosen_logps, policy_rejected_logps, policy_kl_logps, policy_chosen_logps_avg = (
self.concatenated_forward(model, batch)
)
reference_chosen_logps, reference_rejected_logps, reference_kl_logps = self.compute_reference_log_probs(
model, batch
)
losses, chosen_rewards, rejected_rewards, kl = self.kto_loss(
policy_chosen_logps,
policy_rejected_logps,
@@ -197,8 +201,8 @@ class CustomKTOTrainer(KTOTrainer):
losses = losses.nanmean()
if self.ftx_gamma > 1e-6 and len(policy_chosen_logps) > 0: # remember to rescale
sft_loss = self.sft_loss(policy_chosen_logits, batch["labels"][batch["kto_tags"]])
losses += self.ftx_gamma * sft_loss.nanmean() / len(policy_chosen_logits) * len(batch["labels"])
sft_loss = -policy_chosen_logps_avg
losses += self.ftx_gamma * sft_loss.nanmean() / len(policy_chosen_logps) * len(batch["labels"])
num_chosen = torch.Tensor([len(chosen_rewards)]).to(self.accelerator.device)
num_rejected = torch.Tensor([len(rejected_rewards)]).to(self.accelerator.device)
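
The rewrite above also replaces the Python index lists with boolean-mask indexing on kto_tags. A self-contained illustration with made-up values:

import torch

target_logps = torch.tensor([-3.1, -5.2, -2.4, -4.8])
target_logps_avg = torch.tensor([-0.31, -0.52, -0.24, -0.48])
kto_tags = torch.tensor([True, False, True, False])  # True marks desirable examples

chosen_logps = target_logps[kto_tags]          # log-probs of desirable examples
rejected_logps = target_logps[~kto_tags]       # log-probs of undesirable examples
chosen_logps_avg = target_logps_avg[kto_tags]  # reused later for the auxiliary SFT term
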


@@ -1,3 +1,20 @@
# Copyright 2024 HuggingFace Inc. and the LlamaFactory team.
#
# This code is inspired by HuggingFace's TRL library.
# https://github.com/huggingface/trl/blob/v0.8.0/examples/scripts/kto.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING, List, Optional
from ...data import KTODataCollatorWithPadding, get_dataset, split_dataset
@@ -5,7 +22,7 @@ from ...extras.constants import IGNORE_INDEX
from ...extras.ploting import plot_loss
from ...hparams import ModelArguments
from ...model import load_model, load_tokenizer
from ..utils import create_modelcard_and_push, create_ref_model
from ..trainer_utils import create_modelcard_and_push, create_ref_model
from .trainer import CustomKTOTrainer


@@ -1,3 +1,17 @@
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .workflow import run_ppo


@@ -1,3 +1,17 @@
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
from contextlib import nullcontext
from typing import TYPE_CHECKING, Dict, List, Literal, Optional
@@ -8,15 +22,19 @@ from transformers.integrations import is_deepspeed_zero3_enabled
from ...extras.packages import is_requests_available
if TYPE_CHECKING:
from transformers import PreTrainedModel
from trl import AutoModelForCausalLMWithValueHead
if is_requests_available():
import requests
if TYPE_CHECKING:
from transformers import PreTrainedModel
from trl import AutoModelForCausalLMWithValueHead
def get_rewards_from_server(server_url: str, messages: List[str]) -> List[torch.Tensor]:
r"""
Gets reward scores from the API server.
"""
headers = {"Content-Type": "application/json"}
payload = {"model": "model", "messages": messages}
response = requests.post(server_url, json=payload, headers=headers)
@@ -25,25 +43,33 @@ def get_rewards_from_server(server_url: str, messages: List[str]) -> List[torch.
def replace_model(model: "AutoModelForCausalLMWithValueHead", target: Literal["default", "reward"]) -> None:
r"""
Replaces the default/reward modules in the model. The model is already unwrapped.
"""
v_head_layer = model.v_head.summary
if is_deepspeed_zero3_enabled():
import deepspeed # type: ignore
params = [model.v_head.summary.weight, model.v_head.summary.bias]
params = [v_head_layer.weight, v_head_layer.bias]
context_maybe_zero3 = deepspeed.zero.GatheredParameters(params, modifier_rank=0)
else:
context_maybe_zero3 = nullcontext()
model.pretrained_model.set_adapter(target) # set the LoRA adapter to be active
with context_maybe_zero3:
if target == "reward": # save default head temporarily
setattr(model, "default_head_weight", model.v_head.summary.weight.data.detach().clone())
setattr(model, "default_head_bias", model.v_head.summary.bias.data.detach().clone())
setattr(model, "default_head_weight", v_head_layer.weight.data.detach().clone())
setattr(model, "default_head_bias", v_head_layer.bias.data.detach().clone())
model.pretrained_model.set_adapter(target) # set the LoRA adapter to be active
model.v_head.summary.weight.data = model.get_buffer("{}_head_weight".format(target)).detach().clone()
model.v_head.summary.bias.data = model.get_buffer("{}_head_bias".format(target)).detach().clone()
device = v_head_layer.weight.device
v_head_layer.weight.data = model.get_buffer("{}_head_weight".format(target)).detach().clone().to(device)
v_head_layer.bias.data = model.get_buffer("{}_head_bias".format(target)).detach().clone().to(device)
def dump_layernorm(model: "PreTrainedModel") -> Dict[str, torch.Tensor]:
r"""
Dumps the layernorm parameters in the model. The model is already unwrapped (and gathered).
"""
layer_norm_params = {}
for name, param in model.named_parameters():
if param.data.dtype == torch.float32:
@@ -54,6 +80,9 @@ def dump_layernorm(model: "PreTrainedModel") -> Dict[str, torch.Tensor]:
def restore_layernorm(model: "PreTrainedModel", layernorm_params: Optional[Dict[str, torch.Tensor]] = None) -> None:
r"""
Restores the layernorm parameters in the model. The model is already unwrapped (and gathered).
"""
for name, param in model.named_parameters():
if name in layernorm_params:
param.data = layernorm_params[name]
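
For reference, a rough sketch of the dump/restore round trip performed around generation when layernorm weights are kept in float32. The function names and the explicit target_dtype argument are illustrative stand-ins for the repo's dump_layernorm/restore_layernorm, which are only partially shown above:

from typing import Dict, Optional

import torch

def dump_float32_params(model: torch.nn.Module, target_dtype: torch.dtype = torch.float16) -> Dict[str, torch.Tensor]:
    # Keep copies of the float32 (upcast layernorm) weights and cast them down for generation.
    saved = {}
    for name, param in model.named_parameters():
        if param.data.dtype == torch.float32:
            saved[name] = param.data.detach().clone()
            param.data = param.data.to(target_dtype)
    return saved

def restore_float32_params(model: torch.nn.Module, saved: Optional[Dict[str, torch.Tensor]] = None) -> None:
    # Put the saved float32 weights back after generation.
    for name, param in model.named_parameters():
        if saved and name in saved:
            param.data = saved[name]
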


@@ -1,10 +1,29 @@
# Copyright 2024 HuggingFace Inc. and the LlamaFactory team.
#
# This code is inspired by HuggingFace's TRL library.
# https://github.com/huggingface/trl/blob/v0.8.0/trl/trainer/ppo_trainer.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import os
import sys
import warnings
from types import MethodType
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
import torch
from accelerate.utils import DistributedDataParallelKwargs
from tqdm import tqdm
from transformers import GenerationConfig, Trainer, TrainerControl, TrainerState
from transformers.optimization import get_scheduler
@@ -13,12 +32,13 @@ from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
from transformers.utils import SAFE_WEIGHTS_NAME, WEIGHTS_NAME
from trl import PPOConfig, PPOTrainer
from trl.core import PPODecorators, logprobs_from_logits
from trl.models.utils import unwrap_model_for_generation
from ...extras.callbacks import FixValueHeadModelCallback, LogCallback
from ...extras.logging import get_logger
from ...extras.misc import AverageMeter, count_parameters, get_current_device, get_logits_processor
from ..utils import create_custom_optimzer, create_custom_scheduler
from .utils import dump_layernorm, get_rewards_from_server, replace_model, restore_layernorm
from ..trainer_utils import create_custom_optimzer, create_custom_scheduler
from .ppo_utils import dump_layernorm, get_rewards_from_server, replace_model, restore_layernorm
if TYPE_CHECKING:
@@ -78,6 +98,13 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
project_kwargs={"logging_dir": training_args.logging_dir},
)
# Add deepspeed config
ppo_config.accelerator_kwargs["kwargs_handlers"] = [
DistributedDataParallelKwargs(find_unused_parameters=training_args.ddp_find_unused_parameters)
]
if training_args.deepspeed_plugin is not None:
ppo_config.accelerator_kwargs["deepspeed_plugin"] = training_args.deepspeed_plugin
# Create optimizer and scheduler
if training_args.max_steps > 0:
num_training_steps = training_args.max_steps
@@ -114,15 +141,20 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
self.state = TrainerState()
self.control = TrainerControl()
self.is_deepspeed_enabled = self.accelerator.distributed_type == "DEEPSPEED" and hasattr(
self.accelerator.state, "deepspeed_plugin"
)
self.is_deepspeed_enabled = getattr(self.accelerator.state, "deepspeed_plugin", None) is not None
self.is_fsdp_enabled = getattr(self.accelerator.state, "fsdp_plugin", None) is not None
self.log_callback, self.save_callback = callbacks[0], callbacks[1]
assert isinstance(self.log_callback, LogCallback) and isinstance(self.save_callback, FixValueHeadModelCallback)
if self.args.max_steps > 0:
logger.info("max_steps is given, it will override any value given in num_train_epochs")
unwrapped_model: "AutoModelForCausalLMWithValueHead" = self.accelerator.unwrap_model(self.model)
self.is_chatglm_model = getattr(unwrapped_model.config, "model_type", None) == "chatglm"
self.amp_context = torch.autocast(self.current_device.type, dtype=self.model_args.compute_dtype)
warnings.simplefilter("ignore") # remove gc warnings on ref model
if finetuning_args.reward_model_type == "full":
if self.is_deepspeed_enabled:
if not (
@@ -183,7 +215,6 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
logger.info(" Total training steps = {}".format(max_steps))
logger.info(" Number of trainable parameters = {}".format(count_parameters(self.model)[0]))
unwrapped_model: "AutoModelForCausalLMWithValueHead" = self.accelerator.unwrap_model(self.model)
dataiter = iter(self.dataloader)
loss_meter = AverageMeter()
reward_meter = AverageMeter()
@@ -196,29 +227,21 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
dataiter = iter(self.dataloader)
batch = next(dataiter)
# Cast to inference mode
unwrapped_model.gradient_checkpointing_disable()
unwrapped_model.config.use_cache = True
self.model.eval()
# Get inputs
self.model.eval()
self.tokenizer.padding_side = "right" # change padding side
queries, responses, rewards = [], [], []
for idx in range(0, self.config.batch_size, self.config.mini_batch_size):
mini_batch_queries, mini_batch_responses = self.get_inputs(
batch[idx : idx + self.config.mini_batch_size]
)
mini_batch_rewards = self.get_rewards(mini_batch_queries, mini_batch_responses, unwrapped_model)
mini_batch_rewards = self.get_rewards(mini_batch_queries, mini_batch_responses)
queries.extend(mini_batch_queries)
responses.extend(mini_batch_responses)
rewards.extend(mini_batch_rewards)
# Cast to training mode
unwrapped_model.gradient_checkpointing_enable()
unwrapped_model.config.use_cache = False
self.model.train()
# Run PPO step
self.model.train()
stats = self.step(queries, responses, rewards)
self.tokenizer.padding_side = "left" # restore padding side
loss_meter.update(float(stats["ppo/loss/total"]), n=len(rewards))
@@ -303,32 +326,26 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
)
return lr_scheduler
def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None:
super()._save(output_dir, state_dict)
if self.processor is not None:
output_dir = output_dir if output_dir is not None else self.args.output_dir
getattr(self.processor, "image_processor").save_pretrained(output_dir)
@torch.no_grad()
def get_inputs(self, batch: Dict[str, torch.Tensor]) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
def get_inputs(self, batch: Dict[str, "torch.Tensor"]) -> Tuple[List["torch.Tensor"], List["torch.Tensor"]]:
r"""
Generates the model's responses given queries.
"""
if self.model_args.upcast_layernorm:
layernorm_params = dump_layernorm(self.model)
if batch["input_ids"].size(0) == 1: # handle llama2 ppo with gradient accumulation > 1
start_index = (batch["input_ids"][0] != self.tokenizer.pad_token_id).nonzero()[0].item()
for k, v in batch.items():
batch[k] = v[:, start_index:]
unwrapped_model: "AutoModelForCausalLMWithValueHead" = self.accelerator.unwrap_model(self.model)
generate_output: torch.Tensor = unwrapped_model.generate(
generation_config=self.generation_config, logits_processor=get_logits_processor(), **batch
)
with unwrap_model_for_generation(self.model, self.accelerator) as unwrapped_model:
unwrapped_model = self.accelerator.unwrap_model(self.model) # issue in trl v0.8.6
if self.model_args.upcast_layernorm:
layernorm_params = dump_layernorm(unwrapped_model)
if self.model_args.upcast_layernorm:
restore_layernorm(self.model, layernorm_params)
generate_output: torch.Tensor = unwrapped_model.generate(
generation_config=self.generation_config, logits_processor=get_logits_processor(), **batch
)
if self.model_args.upcast_layernorm:
restore_layernorm(unwrapped_model, layernorm_params)
query = batch["input_ids"].detach().cpu()
response = generate_output[:, batch["input_ids"].size(-1) :].detach().cpu()
@@ -350,10 +367,9 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
@torch.no_grad()
def get_rewards(
self,
queries: List[torch.Tensor],
responses: List[torch.Tensor],
unwrapped_model: "AutoModelForCausalLMWithValueHead",
) -> List[torch.Tensor]:
queries: List["torch.Tensor"],
responses: List["torch.Tensor"],
) -> List["torch.Tensor"]:
r"""
Computes scores using the given reward model.
@@ -364,18 +380,22 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
messages = self.tokenizer.batch_decode(token_ids, skip_special_tokens=True)
return get_rewards_from_server(self.reward_model, messages)
batch = self.prepare_model_inputs(queries, responses)
unwrapped_model: "AutoModelForCausalLMWithValueHead" = self.accelerator.unwrap_model(self.model)
if self.finetuning_args.reward_model_type == "lora":
replace_model(unwrapped_model, target="reward")
reward_model = self.model
else:
reward_model = self.reward_model
batch = self.prepare_model_inputs(queries, responses)
with torch.cuda.amp.autocast(dtype=self.model_args.compute_dtype): # support bf16
with unwrap_model_for_generation(reward_model, self.accelerator), self.amp_context: # support bf16
_, _, values = reward_model(**batch, output_hidden_states=True, return_dict=True, use_cache=False)
if getattr(unwrapped_model.config, "model_type", None) == "chatglm": # assume same architecture
if self.finetuning_args.reward_model_type == "lora":
replace_model(unwrapped_model, target="default")
if self.is_chatglm_model: # assume same architecture
values = torch.transpose(values, 0, 1)
rewards = []
@@ -384,21 +404,18 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
end_index = end_indexes[-1].item() if len(end_indexes) else 0
rewards.append(values[i, end_index].float().detach().cpu()) # use fp32 type
if self.finetuning_args.reward_model_type == "lora":
replace_model(unwrapped_model, target="default")
return rewards
@PPODecorators.empty_device_cache()
def batched_forward_pass(
self,
model: "AutoModelForCausalLMWithValueHead",
queries: torch.Tensor,
responses: torch.Tensor,
model_inputs: dict,
queries: "torch.Tensor",
responses: "torch.Tensor",
model_inputs: Dict[str, Any],
return_logits: bool = False,
response_masks: Optional[torch.Tensor] = None,
):
response_masks: Optional["torch.Tensor"] = None,
) -> Tuple["torch.Tensor", Optional["torch.Tensor"], "torch.Tensor", "torch.Tensor"]:
r"""
Calculates model outputs in multiple batches.
@@ -420,11 +437,10 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
input_ids = input_kwargs["input_ids"]
attention_mask = input_kwargs["attention_mask"]
with torch.cuda.amp.autocast(dtype=self.model_args.compute_dtype): # support bf16
with self.amp_context: # support bf16
logits, _, values = model(**input_kwargs)
unwrapped_model: "AutoModelForCausalLMWithValueHead" = self.accelerator.unwrap_model(self.model)
if getattr(unwrapped_model.config, "model_type", None) == "chatglm":
if self.is_chatglm_model:
values = torch.transpose(values, 0, 1)
logprobs = logprobs_from_logits(logits[:, :-1, :], input_ids[:, 1:])
@@ -467,14 +483,28 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
Subclass and override to inject custom behavior.
"""
if self.args.should_save:
if output_dir is None:
output_dir = self.args.output_dir
if self.is_fsdp_enabled or self.is_deepspeed_enabled:
try:
self._save(output_dir, state_dict=self.accelerator.get_state_dict(self.model))
state_dict = self.accelerator.get_state_dict(self.model) # must be called at all ranks
if self.args.should_save:
self._save(output_dir, state_dict=state_dict)
except ValueError:
logger.warning(
" stage3_gather_16bit_weights_on_model_save=false. Saving the full checkpoint instead,"
" use zero_to_fp32.py to recover weights"
)
self._save(output_dir, state_dict={})
remove_dummy_checkpoint(True, output_dir, [WEIGHTS_NAME, SAFE_WEIGHTS_NAME])
if self.args.should_save:
self._save(output_dir, state_dict={})
# remove the dummy state_dict
remove_dummy_checkpoint(self.args.should_save, output_dir, [WEIGHTS_NAME, SAFE_WEIGHTS_NAME])
self.model.save_checkpoint(output_dir)
elif self.args.should_save:
self._save(output_dir)
if self.processor is not None and self.args.should_save:
output_dir = output_dir if output_dir is not None else self.args.output_dir
getattr(self.processor, "image_processor").save_pretrained(output_dir)


@@ -1,4 +1,19 @@
# Inspired by: https://github.com/lvwerra/trl/blob/main/examples/research_projects/stack_llama/scripts/rl_training.py
# Copyright 2024 HuggingFace Inc. and the LlamaFactory team.
#
# This code is inspired by HuggingFace's TRL library.
# https://github.com/huggingface/trl/blob/v0.8.0/examples/scripts/ppo.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING, List, Optional
@@ -9,7 +24,7 @@ from ...extras.callbacks import FixValueHeadModelCallback
from ...extras.misc import fix_valuehead_checkpoint
from ...extras.ploting import plot_loss
from ...model import load_model, load_tokenizer
from ..utils import create_ref_model, create_reward_model
from ..trainer_utils import create_ref_model, create_reward_model
from .trainer import CustomPPOTrainer
@@ -29,7 +44,7 @@ def run_ppo(
):
tokenizer_module = load_tokenizer(model_args)
tokenizer = tokenizer_module["tokenizer"]
dataset = get_dataset(model_args, data_args, training_args, stage="pt", **tokenizer_module)
dataset = get_dataset(model_args, data_args, training_args, stage="ppo", **tokenizer_module)
model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train, add_valuehead=True)
tokenizer.padding_side = "left" # use left-padding in generation while using right-padding in training


@@ -1,3 +1,17 @@
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .workflow import run_pt


@@ -1,10 +1,25 @@
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from types import MethodType
from typing import TYPE_CHECKING, Dict, Optional
from transformers import Trainer
from ...extras.logging import get_logger
from ..utils import create_custom_optimzer, create_custom_scheduler
from ..trainer_utils import convert_pissa_adapter, create_custom_optimzer, create_custom_scheduler
if TYPE_CHECKING:
@@ -28,6 +43,10 @@ class CustomTrainer(Trainer):
super().__init__(**kwargs)
self.finetuning_args = finetuning_args
self.processor = processor
if finetuning_args.pissa_convert:
self.save_model(os.path.join(self.args.output_dir, "pissa_init"))
if finetuning_args.use_badam:
from badam import clip_grad_norm_for_sparse_tensor
@@ -46,6 +65,9 @@ class CustomTrainer(Trainer):
def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None:
super()._save(output_dir, state_dict)
output_dir = output_dir if output_dir is not None else self.args.output_dir
if self.finetuning_args.pissa_convert:
convert_pissa_adapter(output_dir, state_dict, self.accelerator, self.model, self.args)
if self.processor is not None:
output_dir = output_dir if output_dir is not None else self.args.output_dir
getattr(self.processor, "image_processor").save_pretrained(output_dir)


@@ -1,4 +1,19 @@
# Inspired by: https://github.com/huggingface/transformers/blob/v4.34.1/examples/pytorch/language-modeling/run_clm.py
# Copyright 2024 HuggingFace Inc. and the LlamaFactory team.
#
# This code is inspired by HuggingFace's transformers library.
# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/language-modeling/run_clm.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import TYPE_CHECKING, List, Optional
@@ -8,7 +23,7 @@ from transformers import DataCollatorForLanguageModeling
from ...data import get_dataset, split_dataset
from ...extras.ploting import plot_loss
from ...model import load_model, load_tokenizer
from ..utils import create_modelcard_and_push
from ..trainer_utils import create_modelcard_and_push
from .trainer import CustomTrainer


@@ -1,3 +1,17 @@
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .workflow import run_rm


@@ -1,3 +1,17 @@
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict, Sequence, Tuple, Union
import numpy as np


@@ -1,3 +1,42 @@
# Copyright 2024 the LlamaFactory team.
#
# This code is inspired by CarperAI's trlx library.
# https://github.com/CarperAI/trlx/blob/v0.7.0/examples/summarize_rlhf/reward_model/reward_model.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# MIT License
#
# Copyright (c) 2022 CarperAI
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import json
import os
from types import MethodType
@@ -7,7 +46,7 @@ import torch
from transformers import Trainer
from ...extras.logging import get_logger
from ..utils import create_custom_optimzer, create_custom_scheduler
from ..trainer_utils import create_custom_optimzer, create_custom_scheduler
if TYPE_CHECKING:
@@ -50,8 +89,8 @@ class PairwiseTrainer(Trainer):
def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None:
super()._save(output_dir, state_dict)
output_dir = output_dir if output_dir is not None else self.args.output_dir
if self.processor is not None:
output_dir = output_dir if output_dir is not None else self.args.output_dir
getattr(self.processor, "image_processor").save_pretrained(output_dir)
def compute_loss(
@@ -79,7 +118,6 @@ class PairwiseTrainer(Trainer):
chosen_scores, rejected_scores = [], []
# Compute pairwise loss. Only backprop on the different tokens before padding
# Inspired by: https://github.com/CarperAI/trlx/blob/main/examples/summarize_rlhf/reward_model/reward_model.py
loss = 0
for i in range(batch_size):
chosen_length = (chosen_input_ids[i] != self.tokenizer.pad_token_id).nonzero()[-1] + 1


@@ -1,4 +1,41 @@
# Inspired by: https://github.com/CarperAI/trlx/blob/main/examples/summarize_rlhf/reward_model/train_reward_model_gptj.py
# Copyright 2024 the LlamaFactory team.
#
# This code is inspired by CarperAI's trlx library.
# https://github.com/CarperAI/trlx/blob/v0.7.0/examples/summarize_rlhf/reward_model/train_reward_model_gptj.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# MIT License
#
# Copyright (c) 2022 CarperAI
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from typing import TYPE_CHECKING, List, Optional
@@ -7,7 +44,7 @@ from ...extras.callbacks import FixValueHeadModelCallback
from ...extras.misc import fix_valuehead_checkpoint
from ...extras.ploting import plot_loss
from ...model import load_model, load_tokenizer
from ..utils import create_modelcard_and_push
from ..trainer_utils import create_modelcard_and_push
from .metric import compute_accuracy
from .trainer import PairwiseTrainer


@@ -1,3 +1,17 @@
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .workflow import run_sft


@@ -1,21 +1,43 @@
# Copyright 2024 HuggingFace Inc., THUDM, and the LlamaFactory team.
#
# This code is inspired by HuggingFace's transformers library and THUDM's ChatGLM implementation.
# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/summarization/run_summarization.py
# https://github.com/THUDM/ChatGLM-6B/blob/main/ptuning/main.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass
from typing import TYPE_CHECKING, Dict, Sequence, Tuple, Union
import numpy as np
from transformers.utils import is_jieba_available, is_nltk_available
from ...extras.constants import IGNORE_INDEX
from ...extras.packages import is_jieba_available, is_nltk_available, is_rouge_available
from ...extras.packages import is_rouge_available
if TYPE_CHECKING:
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers import PreTrainedTokenizer
if is_jieba_available():
import jieba # type: ignore
if is_nltk_available():
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
if is_rouge_available():
from rouge_chinese import Rouge


@@ -1,3 +1,20 @@
# Copyright 2024 HuggingFace Inc. and the LlamaFactory team.
#
# This code is inspired by HuggingFace's transformers library.
# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/trainer_seq2seq.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
from types import MethodType
@@ -9,10 +26,11 @@ from transformers import Seq2SeqTrainer
from ...extras.constants import IGNORE_INDEX
from ...extras.logging import get_logger
from ..utils import create_custom_optimzer, create_custom_scheduler
from ..trainer_utils import convert_pissa_adapter, create_custom_optimzer, create_custom_scheduler
if TYPE_CHECKING:
from torch.utils.data import Dataset
from transformers import ProcessorMixin
from transformers.trainer import PredictionOutput
@@ -33,6 +51,10 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
super().__init__(**kwargs)
self.finetuning_args = finetuning_args
self.processor = processor
if finetuning_args.pissa_convert:
self.save_model(os.path.join(self.args.output_dir, "pissa_init"))
if finetuning_args.use_badam:
from badam import clip_grad_norm_for_sparse_tensor
@@ -51,8 +73,11 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None:
super()._save(output_dir, state_dict)
output_dir = output_dir if output_dir is not None else self.args.output_dir
if self.finetuning_args.pissa_convert:
convert_pissa_adapter(output_dir, state_dict, self.accelerator, self.model, self.args)
if self.processor is not None:
output_dir = output_dir if output_dir is not None else self.args.output_dir
getattr(self.processor, "image_processor").save_pretrained(output_dir)
def training_step(self, *args, **kwargs):
@@ -109,7 +134,7 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
padded_tensor[:, -src_tensor.shape[-1] :] = src_tensor # adopt left-padding
return padded_tensor.contiguous() # in contiguous memory
def save_predictions(self, predict_results: "PredictionOutput") -> None:
def save_predictions(self, dataset: "Dataset", predict_results: "PredictionOutput") -> None:
r"""
Saves model predictions to `output_dir`.
@@ -135,6 +160,9 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
(preds[i][pad_len[0] :], preds[i][: pad_len[0]]), axis=-1
) # move pad token to last
decoded_inputs = self.tokenizer.batch_decode(
dataset["input_ids"], skip_special_tokens=True, clean_up_tokenization_spaces=False
)
decoded_labels = self.tokenizer.batch_decode(
labels, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
@@ -142,6 +170,6 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
with open(output_prediction_file, "w", encoding="utf-8") as writer:
res: List[str] = []
for label, pred in zip(decoded_labels, decoded_preds):
res.append(json.dumps({"label": label, "predict": pred}, ensure_ascii=False))
for text, label, pred in zip(decoded_inputs, decoded_labels, decoded_preds):
res.append(json.dumps({"prompt": text, "label": label, "predict": pred}, ensure_ascii=False))
writer.write("\n".join(res))
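
After this change each saved prediction line also carries the decoded prompt. A tiny illustration of the resulting JSONL line (the example strings are made up):

import json

line = json.dumps({"prompt": "What is 2 + 2?", "label": "4", "predict": "4"}, ensure_ascii=False)
# -> {"prompt": "What is 2 + 2?", "label": "4", "predict": "4"}
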


@@ -1,4 +1,19 @@
# Inspired by: https://github.com/huggingface/transformers/blob/v4.34.1/examples/pytorch/summarization/run_summarization.py
# Copyright 2024 HuggingFace Inc. and the LlamaFactory team.
#
# This code is inspired by HuggingFace's transformers library.
# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/summarization/run_summarization.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING, List, Optional
@@ -9,7 +24,7 @@ from ...extras.constants import IGNORE_INDEX
from ...extras.misc import get_logits_processor
from ...extras.ploting import plot_loss
from ...model import load_model, load_tokenizer
from ..utils import create_modelcard_and_push
from ..trainer_utils import create_modelcard_and_push
from .metric import ComputeMetrics
from .trainer import CustomSeq2SeqTrainer
@@ -93,7 +108,7 @@ def run_sft(
predict_results.metrics.pop("predict_loss", None)
trainer.log_metrics("predict", predict_results.metrics)
trainer.save_metrics("predict", predict_results.metrics)
trainer.save_predictions(predict_results)
trainer.save_predictions(dataset, predict_results)
# Create model card
create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args)


@@ -1,11 +1,33 @@
from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union
# Copyright 2024 HuggingFace Inc. and the LlamaFactory team.
#
# This code is inspired by the original GaLore implementation: https://github.com/jiaweizzhao/GaLore
# and the original LoRA+ implementation: https://github.com/nikhil-ghosh-berkeley/loraplus
# and the original BAdam implementation: https://github.com/Ledzy/BAdam
# and HuggingFace's TRL library: https://github.com/huggingface/trl
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union
import torch
from peft import PeftModel
from transformers import Trainer
from transformers.optimization import get_scheduler
from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
from transformers.trainer_pt_utils import get_parameter_names
from ..extras.constants import IGNORE_INDEX
from ..extras.logging import get_logger
from ..extras.packages import is_galore_available
from ..hparams import FinetuningArguments, ModelArguments
@@ -17,8 +39,8 @@ if is_galore_available():
if TYPE_CHECKING:
from transformers import Seq2SeqTrainingArguments
from transformers.modeling_utils import PreTrainedModel
from accelerate import Accelerator
from transformers import PreTrainedModel, Seq2SeqTrainingArguments
from trl import AutoModelForCausalLMWithValueHead
from ..hparams import DataArguments
@@ -81,15 +103,12 @@ def create_ref_model(
The valuehead parameter is randomly initialized since it is useless for PPO training.
"""
if finetuning_args.ref_model is not None:
ref_model_args_dict = model_args.to_dict()
ref_model_args_dict.update(
dict(
model_name_or_path=finetuning_args.ref_model,
adapter_name_or_path=finetuning_args.ref_model_adapters,
quantization_bit=finetuning_args.ref_model_quantization_bit,
)
ref_model_args = ModelArguments.copyfrom(
model_args,
model_name_or_path=finetuning_args.ref_model,
adapter_name_or_path=finetuning_args.ref_model_adapters,
quantization_bit=finetuning_args.ref_model_quantization_bit,
)
ref_model_args = ModelArguments(**ref_model_args_dict)
ref_finetuning_args = FinetuningArguments()
tokenizer = load_tokenizer(ref_model_args)["tokenizer"]
ref_model = load_model(
@@ -100,9 +119,11 @@ def create_ref_model(
if finetuning_args.finetuning_type == "lora":
ref_model = None
else:
tokenizer = load_tokenizer(model_args)["tokenizer"]
ref_model_args = ModelArguments.copyfrom(model_args)
ref_finetuning_args = FinetuningArguments()
tokenizer = load_tokenizer(ref_model_args)["tokenizer"]
ref_model = load_model(
tokenizer, model_args, finetuning_args, is_trainable=False, add_valuehead=add_valuehead
tokenizer, ref_model_args, ref_finetuning_args, is_trainable=False, add_valuehead=add_valuehead
)
logger.info("Created reference model from the model itself.")
@@ -137,15 +158,12 @@ def create_reward_model(
logger.info("Loaded adapter weights of reward model from {}".format(finetuning_args.reward_model))
return None
else:
reward_model_args_dict = model_args.to_dict()
reward_model_args_dict.update(
dict(
model_name_or_path=finetuning_args.reward_model,
adapter_name_or_path=finetuning_args.reward_model_adapters,
quantization_bit=finetuning_args.reward_model_quantization_bit,
)
reward_model_args = ModelArguments.copyfrom(
model_args,
model_name_or_path=finetuning_args.reward_model,
adapter_name_or_path=finetuning_args.reward_model_adapters,
quantization_bit=finetuning_args.reward_model_quantization_bit,
)
reward_model_args = ModelArguments(**reward_model_args_dict)
reward_finetuning_args = FinetuningArguments()
tokenizer = load_tokenizer(reward_model_args)["tokenizer"]
reward_model = load_model(
@@ -156,6 +174,50 @@ def create_reward_model(
return reward_model
def convert_pissa_adapter(
output_dir: str,
state_dict: Dict[str, "torch.Tensor"],
accelerator: "Accelerator",
model: "PreTrainedModel",
training_args: "Seq2SeqTrainingArguments",
) -> None:
r"""
Converts the PiSSA adapter to a LoRA adapter.
"""
pissa_init_dir = os.path.join(training_args.output_dir, "pissa_init")
pissa_backup_dir = os.path.join(output_dir, "pissa_backup")
if output_dir == pissa_init_dir:
logger.info("Initial PiSSA adatper will be saved at: {}.".format(pissa_init_dir))
unwrapped_model = accelerator.unwrap_model(model)
if isinstance(unwrapped_model, PeftModel):
init_lora_weights = getattr(unwrapped_model.peft_config["default"], "init_lora_weights")
setattr(unwrapped_model.peft_config["default"], "init_lora_weights", True)
unwrapped_model.save_pretrained(
output_dir,
state_dict=state_dict,
safe_serialization=training_args.save_safetensors,
)
setattr(unwrapped_model.peft_config["default"], "init_lora_weights", init_lora_weights)
elif output_dir == training_args.output_dir: # at the end of training
logger.info("Converted PiSSA adapter will be saved at: {}.".format(output_dir))
unwrapped_model = accelerator.unwrap_model(model)
if isinstance(unwrapped_model, PeftModel): # backup the pissa adapter for further use
unwrapped_model.save_pretrained(
pissa_backup_dir,
state_dict=state_dict,
safe_serialization=training_args.save_safetensors,
)
unwrapped_model.save_pretrained(
output_dir,
state_dict=state_dict,
safe_serialization=training_args.save_safetensors,
convert_pissa_to_lora=pissa_init_dir,
)
# TODO: PiSSA is unexpectedly applied to the model again
unwrapped_model.load_adapter(pissa_backup_dir, "default", is_trainable=True)
unwrapped_model.set_adapter("default")
def _get_decay_parameter_names(model: "PreTrainedModel") -> List[str]:
r"""
Returns a list of names of parameters with weight decay. (weights in non-layernorm layers)
@@ -386,6 +448,7 @@ def create_custom_scheduler(
optimizer=optimizer_dict[param],
num_warmup_steps=training_args.get_warmup_steps(num_training_steps),
num_training_steps=num_training_steps,
scheduler_specific_kwargs=training_args.lr_scheduler_kwargs,
)
def scheduler_hook(param: "torch.nn.Parameter"):
@@ -393,3 +456,24 @@ def create_custom_scheduler(
for param in optimizer_dict.keys():
param.register_post_accumulate_grad_hook(scheduler_hook)
def get_batch_logps(
logits: "torch.Tensor", labels: "torch.Tensor", label_pad_token_id: int = IGNORE_INDEX
) -> Tuple["torch.Tensor", "torch.Tensor"]:
r"""
Computes the log probabilities of the given labels under the given logits.
Returns:
logps: A tensor of shape (batch_size,) containing the sum of log probabilities.
valid_length: A tensor of shape (batch_size,) containing the number of non-masked tokens.
"""
if logits.shape[:-1] != labels.shape:
raise ValueError("Logits (batchsize x seqlen) and labels must have the same shape.")
labels = labels[:, 1:].clone()
logits = logits[:, :-1, :]
loss_mask = labels != label_pad_token_id
labels[labels == label_pad_token_id] = 0 # dummy token
per_token_logps = torch.gather(logits.log_softmax(-1), dim=2, index=labels.unsqueeze(2)).squeeze(2)
return (per_token_logps * loss_mask).sum(-1), loss_mask.sum(-1)
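
A minimal usage sketch of the helper above (shapes, vocabulary size and the masked prompt length are arbitrary; it assumes get_batch_logps is in scope and that IGNORE_INDEX is the usual -100 label pad id):

import torch

IGNORE_INDEX = -100  # assumed label pad id
batch_size, seq_len, vocab_size = 2, 6, 11
logits = torch.randn(batch_size, seq_len, vocab_size)
labels = torch.randint(0, vocab_size, (batch_size, seq_len))
labels[:, :2] = IGNORE_INDEX  # mask the prompt tokens

logps, valid_length = get_batch_logps(logits=logits, labels=labels)
avg_logps = logps / valid_length  # the quantity used by the IPO/ORPO/SimPO branches
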


@@ -1,3 +1,17 @@
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING, Any, Dict, List, Optional
import torch