From 0a94fab357f912ec2e63a598ce8841541cffda2b Mon Sep 17 00:00:00 2001
From: hiyouga
Date: Tue, 16 Apr 2024 17:44:48 +0800
Subject: [PATCH] support badam for all stages

Former-commit-id: e3d8fc75eb2cfc54efd35bfd9ad6c4ac5acc458c
---
 README.md                          | 27 ++++++++++++++-------------
 README_zh.md                       | 27 ++++++++++++++-------------
 examples/README.md                 |  4 +++-
 examples/README_zh.md              |  4 +++-
 src/llmtuner/train/dpo/trainer.py  |  6 ++++++
 src/llmtuner/train/orpo/trainer.py |  5 +++++
 src/llmtuner/train/ppo/trainer.py  |  6 ++++++
 src/llmtuner/train/pt/trainer.py   |  5 +++++
 src/llmtuner/train/rm/trainer.py   |  5 +++++
 9 files changed, 61 insertions(+), 28 deletions(-)

diff --git a/README.md b/README.md
index 2fc9ba88..276bc6a7 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,7 @@ Choose your path:
 - **Various models**: LLaMA, Mistral, Mixtral-MoE, Qwen, Yi, Gemma, Baichuan, ChatGLM, Phi, etc.
 - **Integrated methods**: (Continuous) pre-training, supervised fine-tuning, reward modeling, PPO, DPO and ORPO.
 - **Scalable resources**: 32-bit full-tuning, 16-bit freeze-tuning, 16-bit LoRA and 2/4/8-bit QLoRA via AQLM/AWQ/GPTQ/LLM.int8.
-- **Advanced algorithms**: GaLore, DoRA, LongLoRA, LLaMA Pro, LoRA+, LoftQ and Agent tuning.
+- **Advanced algorithms**: GaLore, BAdam, DoRA, LongLoRA, LLaMA Pro, LoRA+, LoftQ and Agent tuning.
 - **Practical tricks**: FlashAttention-2, Unsloth, RoPE scaling, NEFTune and rsLoRA.
 - **Experiment monitors**: LlamaBoard, TensorBoard, Wandb, MLflow, etc.
 - **Faster inference**: OpenAI-style API, Gradio UI and CLI with vLLM worker.
@@ -68,14 +68,16 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 
 ## Changelog
 
+[24/04/16] We supported **[BAdam](https://arxiv.org/abs/2404.02827)**. See `examples/extras/badam` for usage.
+
 [24/04/16] We supported **[unsloth](https://github.com/unslothai/unsloth)**'s long-sequence training (Llama-2-7B-56k within 24GB). It achieves **117%** speed and **50%** memory compared with FlashAttention-2, more benchmarks can be found in [this page](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison).
 
 [24/03/31] We supported **[ORPO](https://arxiv.org/abs/2403.07691)**. See `examples/lora_single_gpu` for usage.
 
-[24/03/21] Our paper "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" is available at arXiv!
-
 <details><summary>Full Changelog</summary>
 
+[24/03/21] Our paper "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" is available at arXiv!
+
 [24/03/20] We supported **FSDP+QLoRA** that fine-tunes a 70B model on 2x24GB GPUs. See `examples/extras/fsdp_qlora` for usage.
 
 [24/03/13] We supported **[LoRA+](https://arxiv.org/abs/2402.12354)**. See `examples/extras/loraplus` for usage.
@@ -278,16 +280,15 @@ huggingface-cli login
 
 \* *estimated*
 
-| Method | Bits | 7B | 13B | 30B | 70B | 8x7B |
-| ------ | ---- | ----- | ----- | ----- | ------ | ------ |
-| Full | AMP | 120GB | 240GB | 600GB | 1200GB | 900GB |
-| Full | 16 | 60GB | 120GB | 300GB | 600GB | 400GB |
-| GaLore | 16 | 16GB | 32GB | 64GB | 160GB | 120GB |
-| Freeze | 16 | 20GB | 40GB | 80GB | 200GB | 160GB |
-| LoRA | 16 | 16GB | 32GB | 64GB | 160GB | 120GB |
-| QLoRA | 8 | 10GB | 20GB | 40GB | 80GB | 60GB |
-| QLoRA | 4 | 6GB | 12GB | 24GB | 48GB | 30GB |
-| QLoRA | 2 | 4GB | 8GB | 16GB | 24GB | 18GB |
+| Method            | Bits | 7B    | 13B   | 30B   | 70B    | 8x7B   |
+| ----------------- | ---- | ----- | ----- | ----- | ------ | ------ |
+| Full              | AMP  | 120GB | 240GB | 600GB | 1200GB | 900GB  |
+| Full              | 16   | 60GB  | 120GB | 300GB | 600GB  | 400GB  |
+| Freeze            | 16   | 20GB  | 40GB  | 80GB  | 200GB  | 160GB  |
+| LoRA/GaLore/BAdam | 16   | 16GB  | 32GB  | 64GB  | 160GB  | 120GB  |
+| QLoRA             | 8    | 10GB  | 20GB  | 40GB  | 80GB   | 60GB   |
+| QLoRA             | 4    | 6GB   | 12GB  | 24GB  | 48GB   | 30GB   |
+| QLoRA             | 2    | 4GB   | 8GB   | 16GB  | 24GB   | 18GB   |
 
 ## Getting Started
 
diff --git a/README_zh.md b/README_zh.md
index 6564ad4f..4420d8bb 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -46,7 +46,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 - **多种模型**:LLaMA、Mistral、Mixtral-MoE、Qwen、Yi、Gemma、Baichuan、ChatGLM、Phi 等等。
 - **集成方法**:(增量)预训练、指令监督微调、奖励模型训练、PPO 训练、DPO 训练和 ORPO 训练。
 - **多种精度**:32 比特全参数微调、16 比特冻结微调、16 比特 LoRA 微调和基于 AQLM/AWQ/GPTQ/LLM.int8 的 2/4/8 比特 QLoRA 微调。
-- **先进算法**:GaLore、DoRA、LongLoRA、LLaMA Pro、LoRA+、LoftQ 和 Agent 微调。
+- **先进算法**:GaLore、BAdam、DoRA、LongLoRA、LLaMA Pro、LoRA+、LoftQ 和 Agent 微调。
 - **实用技巧**:FlashAttention-2、Unsloth、RoPE scaling、NEFTune 和 rsLoRA。
 - **实验监控**:LlamaBoard、TensorBoard、Wandb、MLflow 等等。
 - **极速推理**:基于 vLLM 的 OpenAI 风格 API、浏览器界面和命令行接口。
@@ -68,14 +68,16 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 
 ## 更新日志
 
+[24/04/16] 我们支持了 **[BAdam](https://arxiv.org/abs/2404.02827)**。详细用法请参照 `examples/extras/badam`。
+
 [24/04/16] 我们支持了 **[unsloth](https://github.com/unslothai/unsloth)** 的长序列训练(24GB 可训练 Llama-2-7B-56k)。该方法相比 FlashAttention-2 提供了 **117%** 的训练速度和 **50%** 的显存节约。更多数据请见[此页面](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison)。
 
 [24/03/31] 我们支持了 **[ORPO](https://arxiv.org/abs/2403.07691)**。详细用法请参照 `examples/lora_single_gpu`。
 
-[24/03/21] 我们的论文 "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" 可在 arXiv 上查看!
-
 <details><summary>展开日志</summary>
 
+[24/03/21] 我们的论文 "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" 可在 arXiv 上查看!
+
 [24/03/20] 我们支持了能在 2x24GB GPU 上微调 70B 模型的 **FSDP+QLoRA**。详细用法请参照 `examples/extras/fsdp_qlora`。
 
 [24/03/13] 我们支持了 **[LoRA+](https://arxiv.org/abs/2402.12354)**。详细用法请参照 `examples/extras/loraplus`。
@@ -278,16 +280,15 @@ huggingface-cli login
 
 \* *估算值*
 
-| 训练方法 | 精度 | 7B | 13B | 30B | 70B | 8x7B |
-| ------- | ---- | ----- | ----- | ----- | ------ | ------ |
-| 全参数 | AMP | 120GB | 240GB | 600GB | 1200GB | 900GB |
-| 全参数 | 16 | 60GB | 120GB | 300GB | 600GB | 400GB |
-| GaLore | 16 | 16GB | 32GB | 64GB | 160GB | 120GB |
-| 部分参数 | 16 | 20GB | 40GB | 80GB | 200GB | 160GB |
-| LoRA | 16 | 16GB | 32GB | 64GB | 160GB | 120GB |
-| QLoRA | 8 | 10GB | 20GB | 40GB | 80GB | 60GB |
-| QLoRA | 4 | 6GB | 12GB | 24GB | 48GB | 30GB |
-| QLoRA | 2 | 4GB | 8GB | 16GB | 24GB | 18GB |
+| 训练方法          | 精度 | 7B    | 13B   | 30B   | 70B    | 8x7B   |
+| ----------------- | ---- | ----- | ----- | ----- | ------ | ------ |
+| Full              | AMP  | 120GB | 240GB | 600GB | 1200GB | 900GB  |
+| Full              | 16   | 60GB  | 120GB | 300GB | 600GB  | 400GB  |
+| Freeze            | 16   | 20GB  | 40GB  | 80GB  | 200GB  | 160GB  |
+| LoRA/GaLore/BAdam | 16   | 16GB  | 32GB  | 64GB  | 160GB  | 120GB  |
+| QLoRA             | 8    | 10GB  | 20GB  | 40GB  | 80GB   | 60GB   |
+| QLoRA             | 4    | 6GB   | 12GB  | 24GB  | 48GB   | 30GB   |
+| QLoRA             | 2    | 4GB   | 8GB   | 16GB  | 24GB   | 18GB   |
 
 ## 如何使用
 
diff --git a/examples/README.md b/examples/README.md
index 4e771c2e..c0c0088e 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -3,7 +3,7 @@ We provide diverse examples about fine-tuning LLMs.
 ```
 examples/
 ├── lora_single_gpu/
-│   ├── pretrain.sh: Do pre-training using LoRA
+│   ├── pretrain.sh: Do continuous pre-training using LoRA
 │   ├── sft.sh: Do supervised fine-tuning using LoRA
 │   ├── reward.sh: Do reward modeling using LoRA
 │   ├── ppo.sh: Do PPO training using LoRA
@@ -34,6 +34,8 @@ examples/
 └── extras/
     ├── galore/
     │   └── sft.sh: Fine-tune model with GaLore
+    ├── badam/
+    │   └── sft.sh: Fine-tune model with BAdam
     ├── loraplus/
     │   └── sft.sh: Fine-tune model using LoRA+
     ├── llama_pro/
diff --git a/examples/README_zh.md b/examples/README_zh.md
index badda0fe..3f31ffc7 100644
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -3,7 +3,7 @@
 ```
 examples/
 ├── lora_single_gpu/
-│   ├── pretrain.sh: 基于 LoRA 进行预训练
+│   ├── pretrain.sh: 基于 LoRA 进行增量预训练
 │   ├── sft.sh: 基于 LoRA 进行指令监督微调
 │   ├── reward.sh: 基于 LoRA 进行奖励模型训练
 │   ├── ppo.sh: 基于 LoRA 进行 PPO 训练
@@ -34,6 +34,8 @@ examples/
 └── extras/
     ├── galore/
     │   └── sft.sh: 使用 GaLore 训练模型
+    ├── badam/
+    │   └── sft.sh: 使用 BAdam 训练模型
     ├── loraplus/
     │   └── sft.sh: 使用 LoRA+ 训练模型
     ├── llama_pro/
diff --git a/src/llmtuner/train/dpo/trainer.py b/src/llmtuner/train/dpo/trainer.py
index 0b316c62..35dcd8db 100644
--- a/src/llmtuner/train/dpo/trainer.py
+++ b/src/llmtuner/train/dpo/trainer.py
@@ -1,5 +1,6 @@
 from collections import defaultdict
 from contextlib import nullcontext
+from types import MethodType
 from typing import TYPE_CHECKING, Dict, Literal, Optional, Tuple, Union
 
 import torch
@@ -63,6 +64,11 @@ class CustomDPOTrainer(DPOTrainer):
             else:
                 self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True)
 
+        if finetuning_args.use_badam:
+            from badam import clip_grad_norm_for_sparse_tensor
+
+            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
+
     def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:
             self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args)
diff --git a/src/llmtuner/train/orpo/trainer.py b/src/llmtuner/train/orpo/trainer.py
index d84e0199..5e0d70d9 100644
--- a/src/llmtuner/train/orpo/trainer.py
+++ b/src/llmtuner/train/orpo/trainer.py
@@ -1,4 +1,5 @@
 from collections import defaultdict
+from types import MethodType
 from typing import TYPE_CHECKING, Dict, Literal, Optional, Tuple, Union
 
 import torch
@@ -44,6 +45,10 @@ class CustomORPOTrainer(DPOTrainer):
 
         self._stored_metrics = defaultdict(lambda: defaultdict(list))
         Trainer.__init__(self, model=model, **kwargs)
+        if finetuning_args.use_badam:
+            from badam import clip_grad_norm_for_sparse_tensor
+
+            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
 
     def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:
diff --git a/src/llmtuner/train/ppo/trainer.py b/src/llmtuner/train/ppo/trainer.py
index 020d54cf..ef769968 100644
--- a/src/llmtuner/train/ppo/trainer.py
+++ b/src/llmtuner/train/ppo/trainer.py
@@ -1,6 +1,7 @@
 import math
 import os
 import sys
+from types import MethodType
 from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
 
 import torch
@@ -124,6 +125,11 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
             else:
                 self.reward_model = self.accelerator.prepare_model(self.reward_model, evaluation_mode=True)
 
+        if finetuning_args.use_badam:
+            from badam import clip_grad_norm_for_sparse_tensor
+
+            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
+
     def ppo_train(self, resume_from_checkpoint: Optional[str] = None) -> None:
         r"""
         Implements training loop for the PPO stage, like _inner_training_loop() in Huggingface's Trainer.
diff --git a/src/llmtuner/train/pt/trainer.py b/src/llmtuner/train/pt/trainer.py
index af2848fb..969ebf04 100644
--- a/src/llmtuner/train/pt/trainer.py
+++ b/src/llmtuner/train/pt/trainer.py
@@ -1,3 +1,4 @@
+from types import MethodType
 from typing import TYPE_CHECKING, Optional
 
 from transformers import Trainer
@@ -23,6 +24,10 @@ class CustomTrainer(Trainer):
     def __init__(self, finetuning_args: "FinetuningArguments", **kwargs) -> None:
         super().__init__(**kwargs)
         self.finetuning_args = finetuning_args
+        if finetuning_args.use_badam:
+            from badam import clip_grad_norm_for_sparse_tensor
+
+            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
 
     def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:
diff --git a/src/llmtuner/train/rm/trainer.py b/src/llmtuner/train/rm/trainer.py
index 8d0f2763..0f5d88d3 100644
--- a/src/llmtuner/train/rm/trainer.py
+++ b/src/llmtuner/train/rm/trainer.py
@@ -1,5 +1,6 @@
 import json
 import os
+from types import MethodType
 from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 
 import torch
@@ -28,6 +29,10 @@ class PairwiseTrainer(Trainer):
         super().__init__(**kwargs)
         self.finetuning_args = finetuning_args
         self.can_return_loss = True  # override property to return eval_loss
+        if finetuning_args.use_badam:
+            from badam import clip_grad_norm_for_sparse_tensor
+
+            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
 
     def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:
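
Every trainer touched by this patch applies the same pattern when `finetuning_args.use_badam` is enabled: it imports `clip_grad_norm_for_sparse_tensor` from the `badam` package and rebinds it onto the trainer's `accelerate.Accelerator` instance with `types.MethodType`, so gradient clipping dispatches to a BAdam-aware routine instead of the stock `Accelerator.clip_grad_norm_` (presumably needed because BAdam updates one block of parameters at a time, so only part of the model carries gradients at any step). The sketch below isolates that rebinding pattern in a runnable form; it is not part of the patch, and `ToyAccelerator` plus the stand-in clipping function are hypothetical placeholders, not the real `accelerate` or `badam` code.

```python
from types import MethodType


class ToyAccelerator:
    """Hypothetical stand-in for accelerate.Accelerator, used only to show the rebinding."""

    def clip_grad_norm_(self, parameters, max_norm, norm_type=2):
        print("stock clip_grad_norm_")
        return 0.0


def clip_grad_norm_for_sparse_tensor(self, parameters, max_norm, norm_type=2):
    """Stand-in for badam's replacement; `self` is the accelerator it gets bound to."""
    print(f"sparse-aware clip_grad_norm_ on {type(self).__name__}")
    return 0.0


accelerator = ToyAccelerator()

# The line each trainer's __init__ adds when use_badam is on: binding the free
# function as a method of this *instance* shadows the class method for this
# accelerator object only.
accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, accelerator)

accelerator.clip_grad_norm_([], max_norm=1.0)  # prints "sparse-aware clip_grad_norm_ on ToyAccelerator"
```

Binding the replacement to the instance rather than monkey-patching the `Accelerator` class keeps the override scoped to the accelerator owned by that trainer, which appears to be why the patch repeats this `MethodType` assignment in each trainer's `__init__` instead of patching globally.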