From 0a94fab357f912ec2e63a598ce8841541cffda2b Mon Sep 17 00:00:00 2001
From: hiyouga
Date: Tue, 16 Apr 2024 17:44:48 +0800
Subject: [PATCH] support badam for all stages

Former-commit-id: e3d8fc75eb2cfc54efd35bfd9ad6c4ac5acc458c
---
 README.md                          | 27 ++++++++++++++-------------
 README_zh.md                       | 27 ++++++++++++++-------------
 examples/README.md                 |  4 +++-
 examples/README_zh.md              |  4 +++-
 src/llmtuner/train/dpo/trainer.py  |  6 ++++++
 src/llmtuner/train/orpo/trainer.py |  5 +++++
 src/llmtuner/train/ppo/trainer.py  |  6 ++++++
 src/llmtuner/train/pt/trainer.py   |  5 +++++
 src/llmtuner/train/rm/trainer.py   |  5 +++++
 9 files changed, 61 insertions(+), 28 deletions(-)

diff --git a/README.md b/README.md
index 2fc9ba88..276bc6a7 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,7 @@ Choose your path:
 - **Various models**: LLaMA, Mistral, Mixtral-MoE, Qwen, Yi, Gemma, Baichuan, ChatGLM, Phi, etc.
 - **Integrated methods**: (Continuous) pre-training, supervised fine-tuning, reward modeling, PPO, DPO and ORPO.
 - **Scalable resources**: 32-bit full-tuning, 16-bit freeze-tuning, 16-bit LoRA and 2/4/8-bit QLoRA via AQLM/AWQ/GPTQ/LLM.int8.
-- **Advanced algorithms**: GaLore, DoRA, LongLoRA, LLaMA Pro, LoRA+, LoftQ and Agent tuning.
+- **Advanced algorithms**: GaLore, BAdam, DoRA, LongLoRA, LLaMA Pro, LoRA+, LoftQ and Agent tuning.
 - **Practical tricks**: FlashAttention-2, Unsloth, RoPE scaling, NEFTune and rsLoRA.
 - **Experiment monitors**: LlamaBoard, TensorBoard, Wandb, MLflow, etc.
 - **Faster inference**: OpenAI-style API, Gradio UI and CLI with vLLM worker.
@@ -68,14 +68,16 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 
 ## Changelog
 
+[24/04/16] We supported **[BAdam](https://arxiv.org/abs/2404.02827)**. See `examples/extras/badam` for usage.
+
 [24/04/16] We supported **[unsloth](https://github.com/unslothai/unsloth)**'s long-sequence training (Llama-2-7B-56k within 24GB). It achieves **117%** speed and **50%** memory compared with FlashAttention-2, more benchmarks can be found in [this page](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison).
 
 [24/03/31] We supported **[ORPO](https://arxiv.org/abs/2403.07691)**. See `examples/lora_single_gpu` for usage.
 
-[24/03/21] Our paper "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" is available at arXiv!
-
 <details><summary>Full Changelog</summary>
 
+[24/03/21] Our paper "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" is available at arXiv!
+
 [24/03/20] We supported **FSDP+QLoRA** that fine-tunes a 70B model on 2x24GB GPUs. See `examples/extras/fsdp_qlora` for usage.
 
 [24/03/13] We supported **[LoRA+](https://arxiv.org/abs/2402.12354)**. See `examples/extras/loraplus` for usage.
@@ -278,16 +280,15 @@ huggingface-cli login
 
 \* *estimated*
 
-| Method | Bits | 7B | 13B | 30B | 70B | 8x7B |
-| ------ | ---- | ----- | ----- | ----- | ------ | ------ |
-| Full | AMP | 120GB | 240GB | 600GB | 1200GB | 900GB |
-| Full | 16 | 60GB | 120GB | 300GB | 600GB | 400GB |
-| GaLore | 16 | 16GB | 32GB | 64GB | 160GB | 120GB |
-| Freeze | 16 | 20GB | 40GB | 80GB | 200GB | 160GB |
-| LoRA | 16 | 16GB | 32GB | 64GB | 160GB | 120GB |
-| QLoRA | 8 | 10GB | 20GB | 40GB | 80GB | 60GB |
-| QLoRA | 4 | 6GB | 12GB | 24GB | 48GB | 30GB |
-| QLoRA | 2 | 4GB | 8GB | 16GB | 24GB | 18GB |
+| Method            | Bits | 7B    | 13B   | 30B   | 70B    | 8x7B   |
+| ----------------- | ---- | ----- | ----- | ----- | ------ | ------ |
+| Full              | AMP  | 120GB | 240GB | 600GB | 1200GB | 900GB  |
+| Full              | 16   | 60GB  | 120GB | 300GB | 600GB  | 400GB  |
+| Freeze            | 16   | 20GB  | 40GB  | 80GB  | 200GB  | 160GB  |
+| LoRA/GaLore/BAdam | 16   | 16GB  | 32GB  | 64GB  | 160GB  | 120GB  |
+| QLoRA             | 8    | 10GB  | 20GB  | 40GB  | 80GB   | 60GB   |
+| QLoRA             | 4    | 6GB   | 12GB  | 24GB  | 48GB   | 30GB   |
+| QLoRA             | 2    | 4GB   | 8GB   | 16GB  | 24GB   | 18GB   |
 
 ## Getting Started
 
diff --git a/README_zh.md b/README_zh.md
index 6564ad4f..4420d8bb 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -46,7 +46,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 - **多种模型**:LLaMA、Mistral、Mixtral-MoE、Qwen、Yi、Gemma、Baichuan、ChatGLM、Phi 等等。
 - **集成方法**:(增量)预训练、指令监督微调、奖励模型训练、PPO 训练、DPO 训练和 ORPO 训练。
 - **多种精度**:32 比特全参数微调、16 比特冻结微调、16 比特 LoRA 微调和基于 AQLM/AWQ/GPTQ/LLM.int8 的 2/4/8 比特 QLoRA 微调。
-- **先进算法**:GaLore、DoRA、LongLoRA、LLaMA Pro、LoRA+、LoftQ 和 Agent 微调。
+- **先进算法**:GaLore、BAdam、DoRA、LongLoRA、LLaMA Pro、LoRA+、LoftQ 和 Agent 微调。
 - **实用技巧**:FlashAttention-2、Unsloth、RoPE scaling、NEFTune 和 rsLoRA。
 - **实验监控**:LlamaBoard、TensorBoard、Wandb、MLflow 等等。
 - **极速推理**:基于 vLLM 的 OpenAI 风格 API、浏览器界面和命令行接口。
@@ -68,14 +68,16 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 
 ## 更新日志
 
+[24/04/16] 我们支持了 **[BAdam](https://arxiv.org/abs/2404.02827)**。详细用法请参照 `examples/extras/badam`。
+
 [24/04/16] 我们支持了 **[unsloth](https://github.com/unslothai/unsloth)** 的长序列训练(24GB 可训练 Llama-2-7B-56k)。该方法相比 FlashAttention-2 提供了 **117%** 的训练速度和 **50%** 的显存节约。更多数据请见[此页面](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison)。
 
 [24/03/31] 我们支持了 **[ORPO](https://arxiv.org/abs/2403.07691)**。详细用法请参照 `examples/lora_single_gpu`。
 
-[24/03/21] 我们的论文 "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" 可在 arXiv 上查看!
-
 <details><summary>展开日志</summary>
 
+[24/03/21] 我们的论文 "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" 可在 arXiv 上查看!
+
 [24/03/20] 我们支持了能在 2x24GB GPU 上微调 70B 模型的 **FSDP+QLoRA**。详细用法请参照 `examples/extras/fsdp_qlora`。
 
 [24/03/13] 我们支持了 **[LoRA+](https://arxiv.org/abs/2402.12354)**。详细用法请参照 `examples/extras/loraplus`。
@@ -278,16 +280,15 @@ huggingface-cli login
 
 \* *估算值*
 
-| 训练方法 | 精度 | 7B | 13B | 30B | 70B | 8x7B |
-| ------- | ---- | ----- | ----- | ----- | ------ | ------ |
-| 全参数 | AMP | 120GB | 240GB | 600GB | 1200GB | 900GB |
-| 全参数 | 16 | 60GB | 120GB | 300GB | 600GB | 400GB |
-| GaLore | 16 | 16GB | 32GB | 64GB | 160GB | 120GB |
-| 部分参数 | 16 | 20GB | 40GB | 80GB | 200GB | 160GB |
-| LoRA | 16 | 16GB | 32GB | 64GB | 160GB | 120GB |
-| QLoRA | 8 | 10GB | 20GB | 40GB | 80GB | 60GB |
-| QLoRA | 4 | 6GB | 12GB | 24GB | 48GB | 30GB |
-| QLoRA | 2 | 4GB | 8GB | 16GB | 24GB | 18GB |
+| 训练方法          | 精度 | 7B    | 13B   | 30B   | 70B    | 8x7B   |
+| ----------------- | ---- | ----- | ----- | ----- | ------ | ------ |
+| Full              | AMP  | 120GB | 240GB | 600GB | 1200GB | 900GB  |
+| Full              | 16   | 60GB  | 120GB | 300GB | 600GB  | 400GB  |
+| Freeze            | 16   | 20GB  | 40GB  | 80GB  | 200GB  | 160GB  |
+| LoRA/GaLore/BAdam | 16   | 16GB  | 32GB  | 64GB  | 160GB  | 120GB  |
+| QLoRA             | 8    | 10GB  | 20GB  | 40GB  | 80GB   | 60GB   |
+| QLoRA             | 4    | 6GB   | 12GB  | 24GB  | 48GB   | 30GB   |
+| QLoRA             | 2    | 4GB   | 8GB   | 16GB  | 24GB   | 18GB   |
 
 ## 如何使用
 
diff --git a/examples/README.md b/examples/README.md
index 4e771c2e..c0c0088e 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -3,7 +3,7 @@ We provide diverse examples about fine-tuning LLMs.
 ```
 examples/
 ├── lora_single_gpu/
-│   ├── pretrain.sh: Do pre-training using LoRA
+│   ├── pretrain.sh: Do continuous pre-training using LoRA
 │   ├── sft.sh: Do supervised fine-tuning using LoRA
 │   ├── reward.sh: Do reward modeling using LoRA
 │   ├── ppo.sh: Do PPO training using LoRA
@@ -34,6 +34,8 @@ examples/
 └── extras/
     ├── galore/
     │   └── sft.sh: Fine-tune model with GaLore
+    ├── badam/
+    │   └── sft.sh: Fine-tune model with BAdam
     ├── loraplus/
     │   └── sft.sh: Fine-tune model using LoRA+
     ├── llama_pro/
diff --git a/examples/README_zh.md b/examples/README_zh.md
index badda0fe..3f31ffc7 100644
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -3,7 +3,7 @@
 ```
 examples/
 ├── lora_single_gpu/
-│   ├── pretrain.sh: 基于 LoRA 进行预训练
+│   ├── pretrain.sh: 基于 LoRA 进行增量预训练
 │   ├── sft.sh: 基于 LoRA 进行指令监督微调
 │   ├── reward.sh: 基于 LoRA 进行奖励模型训练
 │   ├── ppo.sh: 基于 LoRA 进行 PPO 训练
@@ -34,6 +34,8 @@ examples/
 └── extras/
     ├── galore/
     │   └── sft.sh: 使用 GaLore 训练模型
+    ├── badam/
+    │   └── sft.sh: 使用 BAdam 训练模型
     ├── loraplus/
     │   └── sft.sh: 使用 LoRA+ 训练模型
     ├── llama_pro/
diff --git a/src/llmtuner/train/dpo/trainer.py b/src/llmtuner/train/dpo/trainer.py
index 0b316c62..35dcd8db 100644
--- a/src/llmtuner/train/dpo/trainer.py
+++ b/src/llmtuner/train/dpo/trainer.py
@@ -1,5 +1,6 @@
 from collections import defaultdict
 from contextlib import nullcontext
+from types import MethodType
 from typing import TYPE_CHECKING, Dict, Literal, Optional, Tuple, Union
 
 import torch
@@ -63,6 +64,11 @@ class CustomDPOTrainer(DPOTrainer):
             else:
                 self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True)
 
+        if finetuning_args.use_badam:
+            from badam import clip_grad_norm_for_sparse_tensor
+
+            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
+
     def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:
             self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args)
diff --git a/src/llmtuner/train/orpo/trainer.py b/src/llmtuner/train/orpo/trainer.py
index d84e0199..5e0d70d9 100644
--- a/src/llmtuner/train/orpo/trainer.py
+++ b/src/llmtuner/train/orpo/trainer.py
@@ -1,4 +1,5 @@
 from collections import defaultdict
+from types import MethodType
 from typing import TYPE_CHECKING, Dict, Literal, Optional, Tuple, Union
 
 import torch
@@ -44,6 +45,10 @@ class CustomORPOTrainer(DPOTrainer):
 
         self._stored_metrics = defaultdict(lambda: defaultdict(list))
         Trainer.__init__(self, model=model, **kwargs)
+        if finetuning_args.use_badam:
+            from badam import clip_grad_norm_for_sparse_tensor
+
+            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
 
     def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:
diff --git a/src/llmtuner/train/ppo/trainer.py b/src/llmtuner/train/ppo/trainer.py
index 020d54cf..ef769968 100644
--- a/src/llmtuner/train/ppo/trainer.py
+++ b/src/llmtuner/train/ppo/trainer.py
@@ -1,6 +1,7 @@
 import math
 import os
 import sys
+from types import MethodType
 from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
 
 import torch
@@ -124,6 +125,11 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
             else:
                 self.reward_model = self.accelerator.prepare_model(self.reward_model, evaluation_mode=True)
 
+        if finetuning_args.use_badam:
+            from badam import clip_grad_norm_for_sparse_tensor
+
+            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
+
     def ppo_train(self, resume_from_checkpoint: Optional[str] = None) -> None:
         r"""
         Implements training loop for the PPO stage, like _inner_training_loop() in Huggingface's Trainer.
diff --git a/src/llmtuner/train/pt/trainer.py b/src/llmtuner/train/pt/trainer.py
index af2848fb..969ebf04 100644
--- a/src/llmtuner/train/pt/trainer.py
+++ b/src/llmtuner/train/pt/trainer.py
@@ -1,3 +1,4 @@
+from types import MethodType
 from typing import TYPE_CHECKING, Optional
 
 from transformers import Trainer
@@ -23,6 +24,10 @@ class CustomTrainer(Trainer):
     def __init__(self, finetuning_args: "FinetuningArguments", **kwargs) -> None:
         super().__init__(**kwargs)
         self.finetuning_args = finetuning_args
+        if finetuning_args.use_badam:
+            from badam import clip_grad_norm_for_sparse_tensor
+
+            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
 
     def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:
diff --git a/src/llmtuner/train/rm/trainer.py b/src/llmtuner/train/rm/trainer.py
index 8d0f2763..0f5d88d3 100644
--- a/src/llmtuner/train/rm/trainer.py
+++ b/src/llmtuner/train/rm/trainer.py
@@ -1,5 +1,6 @@
 import json
 import os
+from types import MethodType
 from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 
 import torch
@@ -28,6 +29,10 @@ class PairwiseTrainer(Trainer):
         super().__init__(**kwargs)
         self.finetuning_args = finetuning_args
         self.can_return_loss = True  # override property to return eval_loss
+        if finetuning_args.use_badam:
+            from badam import clip_grad_norm_for_sparse_tensor
+
+            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
 
     def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:
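
Every trainer touched by this patch applies the same pattern when `finetuning_args.use_badam` is enabled: it imports `clip_grad_norm_for_sparse_tensor` from the `badam` package and rebinds it onto the trainer's `accelerate.Accelerator` instance with `types.MethodType`, so gradient clipping dispatches to a BAdam-aware routine instead of the stock `Accelerator.clip_grad_norm_` (presumably needed because BAdam updates one block of parameters at a time, so only part of the model carries gradients at any step). The sketch below isolates that rebinding pattern in a runnable form; it is not part of the patch, and `ToyAccelerator` plus the stand-in clipping function are hypothetical placeholders, not the real `accelerate` or `badam` code.

```python
from types import MethodType


class ToyAccelerator:
    """Hypothetical stand-in for accelerate.Accelerator, used only to show the rebinding."""

    def clip_grad_norm_(self, parameters, max_norm, norm_type=2):
        print("stock clip_grad_norm_")
        return 0.0


def clip_grad_norm_for_sparse_tensor(self, parameters, max_norm, norm_type=2):
    """Stand-in for badam's replacement; `self` is the accelerator it gets bound to."""
    print(f"sparse-aware clip_grad_norm_ on {type(self).__name__}")
    return 0.0


accelerator = ToyAccelerator()

# The line each trainer's __init__ adds when use_badam is on: binding the free
# function as a method of this *instance* shadows the class method for this
# accelerator object only.
accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, accelerator)

accelerator.clip_grad_norm_([], max_norm=1.0)  # prints "sparse-aware clip_grad_norm_ on ToyAccelerator"
```

Binding the replacement to the instance rather than monkey-patching the `Accelerator` class keeps the override scoped to the accelerator owned by that trainer, which appears to be why the patch repeats this `MethodType` assignment in each trainer's `__init__` instead of patching globally.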