From 5c00783697c5292da94eb03fa205ad34d4ea3bc9 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sat, 9 Mar 2024 03:58:18 +0800
Subject: [PATCH] update hardware requirements

Former-commit-id: 393c2de27ce0a2dee793092843ec0afa54f49a6d
---
 README.md                               | 22 ++++++++++++----------
 README_zh.md                            | 22 ++++++++++++----------
 src/llmtuner/hparams/finetuning_args.py | 10 +++++-----
 src/llmtuner/hparams/parser.py          | 10 ----------
 4 files changed, 29 insertions(+), 35 deletions(-)
diff --git a/README.md b/README.md
index 51c4c5e8..afd78d4c 100644
--- a/README.md
+++ b/README.md
@@ -48,8 +48,8 @@ Choose your path:
 - **Various models**: LLaMA, Mistral, Mixtral-MoE, Qwen, Yi, Gemma, Baichuan, ChatGLM, Phi, etc.
 - **Integrated methods**: (Continuous) pre-training, supervised fine-tuning, reward modeling, PPO and DPO.
 - **Scalable resources**: 32-bit full-tuning, 16-bit freeze-tuning, 16-bit LoRA and 2/4/8-bit QLoRA via AQLM/AWQ/GPTQ/LLM.int8.
-- **Advanced algorithms**: DoRA, LongLoRA, LLaMA Pro, LoftQ and Agent tuning.
-- **Practical tricks**: FlashAttention-2, Unsloth, RoPE scaling, NEFTune, rsLoRA and GaLore.
+- **Advanced algorithms**: GaLore, DoRA, LongLoRA, LLaMA Pro, LoftQ and Agent tuning.
+- **Practical tricks**: FlashAttention-2, Unsloth, RoPE scaling, NEFTune and rsLoRA.
 - **Experiment monitors**: LlamaBoard, TensorBoard, Wandb, MLflow, etc.
 - **Faster inference**: OpenAI-style API, Gradio UI and CLI with vLLM worker.
 
@@ -272,13 +272,15 @@ huggingface-cli login
 
 \* *estimated*
 
-| Method | Bits |   7B  |  13B  |  30B  |   65B  |   8x7B |
+| Method | Bits |   7B  |  13B  |  30B  |   70B  |   8x7B |
 | ------ | ---- | ----- | ----- | ----- | ------ | ------ |
-| Full   |  16  | 160GB | 320GB | 600GB | 1200GB |  900GB |
-| Freeze |  16  |  20GB |  40GB | 120GB |  240GB |  200GB |
-| LoRA   |  16  |  16GB |  32GB |  80GB |  160GB |  120GB |
-| QLoRA  |   8  |  10GB |  16GB |  40GB |   80GB |   80GB |
-| QLoRA  |   4  |   6GB |  12GB |  24GB |   48GB |   32GB |
+| Full   | AMP  | 120GB | 240GB | 600GB | 1200GB |  900GB |
+| Full   |  16  |  60GB | 120GB | 300GB |  600GB |  400GB |
+| Freeze |  16  |  20GB |  40GB |  80GB |  200GB |  160GB |
+| LoRA   |  16  |  16GB |  32GB |  64GB |  160GB |  120GB |
+| QLoRA  |   8  |  10GB |  20GB |  40GB |   80GB |   60GB |
+| QLoRA  |   4  |   6GB |  12GB |  24GB |   48GB |   30GB |
+| QLoRA  |   2  |   4GB |   8GB |  16GB |   24GB |   18GB |
 
 ## Getting Started
 
@@ -483,7 +485,7 @@ CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
 accelerate launch --config_file config.yaml src/train_bash.py # arguments (same as above)
 ```
 
-<details><summary>Example config for LoRA training</summary>
+<details><summary>Example config.yaml for LoRA training with Accelerate</summary>
 
 ```yaml
 compute_environment: LOCAL_MACHINE
@@ -517,7 +519,7 @@ deepspeed --num_gpus 8 src/train_bash.py \
     ... # arguments (same as above)
 ```
 
-<details><summary>Example config for full-parameter training with DeepSpeed ZeRO-2</summary>
+<details><summary>Example ds_config.json for full-parameter training with DeepSpeed ZeRO-2</summary>
 
 ```json
 {
diff --git a/README_zh.md b/README_zh.md
index a4971fa7..697f0239 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -48,8 +48,8 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 - **多种模型**：LLaMA、Mistral、Mixtral-MoE、Qwen、Yi、Gemma、Baichuan、ChatGLM、Phi 等等。
 - **集成方法**：（增量）预训练、指令监督微调、奖励模型训练、PPO 训练和 DPO 训练。
 - **多种精度**：32 比特全参数微调、16 比特冻结微调、16 比特 LoRA 微调和基于 AQLM/AWQ/GPTQ/LLM.int8 的 2/4/8 比特 QLoRA 微调。
-- **先进算法**：DoRA、LongLoRA、LLaMA Pro、LoftQ 和 Agent 微调。
-- **实用技巧**：FlashAttention-2、Unsloth、RoPE scaling、NEFTune、rsLoRA 和 GaLore。
+- **先进算法**：GaLore、DoRA、LongLoRA、LLaMA Pro、LoftQ 和 Agent 微调。
+- **实用技巧**：FlashAttention-2、Unsloth、RoPE scaling、NEFTune 和 rsLoRA。
 - **实验监控**：LlamaBoard、TensorBoard、Wandb、MLflow 等等。
 - **极速推理**：基于 vLLM 的 OpenAI 风格 API、浏览器界面和命令行接口。
 
@@ -272,13 +272,15 @@ huggingface-cli login
 
 \* *估算值*
 
-| 训练方法 | 精度 |   7B  |  13B  |  30B  |   65B  |   8x7B |
+| 训练方法 | 精度 |   7B  |  13B  |  30B  |   70B  |   8x7B |
 | ------- | ---- | ----- | ----- | ----- | ------ | ------ |
-| 全参数   |  16  | 160GB | 320GB | 600GB | 1200GB |  900GB |
-| 部分参数 |  16  |  20GB |  40GB | 120GB |  240GB |  200GB |
-| LoRA    |  16  |  16GB |  32GB |  80GB |  160GB |  120GB |
-| QLoRA   |   8  |  10GB |  16GB |  40GB |   80GB |   80GB |
-| QLoRA   |   4  |   6GB |  12GB |  24GB |   48GB |   32GB |
+| 全参数   | AMP  | 120GB | 240GB | 600GB | 1200GB |  900GB |
+| 全参数   |  16  |  60GB | 120GB | 300GB |  600GB |  400GB |
+| 部分参数 |  16  |  20GB |  40GB |  80GB |  200GB |  160GB |
+| LoRA    |  16  |  16GB |  32GB |  64GB |  160GB |  120GB |
+| QLoRA   |   8  |  10GB |  20GB |  40GB |   80GB |   60GB |
+| QLoRA   |   4  |   6GB |  12GB |  24GB |   48GB |   30GB |
+| QLoRA   |   2  |   4GB |   8GB |  16GB |   24GB |   18GB |
 
 ## 如何使用
 
@@ -482,7 +484,7 @@ CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
 accelerate launch --config_file config.yaml src/train_bash.py # 参数同上
 ```
 
-<details><summary>LoRA 训练的 Accelerate 配置示例</summary>
+<details><summary>使用 Accelerate 进行 LoRA 训练的 config.yaml 示例</summary>
 
 ```yaml
 compute_environment: LOCAL_MACHINE
@@ -516,7 +518,7 @@ deepspeed --num_gpus 8 src/train_bash.py \
     ... # 参数同上
 ```
 
-<details><summary>使用 DeepSpeed ZeRO-2 进行全参数训练的 DeepSpeed 配置示例</summary>
+<details><summary>使用 DeepSpeed ZeRO-2 进行全参数训练的 ds_config.json 示例</summary>
 
 ```json
 {
diff --git a/src/llmtuner/hparams/finetuning_args.py b/src/llmtuner/hparams/finetuning_args.py
index e2d35d63..1fb270ab 100644
--- a/src/llmtuner/hparams/finetuning_args.py
+++ b/src/llmtuner/hparams/finetuning_args.py
@@ -9,8 +9,8 @@ class FreezeArguments:
     Arguments pertaining to the freeze (partial-parameter) training.
     """
 
-    name_module_trainable: Optional[str] = field(
-        default=None,
+    name_module_trainable: str = field(
+        default="all",
         metadata={
             "help": """Name of trainable modules for partial-parameter (freeze) fine-tuning. \
                     Use commas to separate multiple modules. \
@@ -23,7 +23,7 @@ class FreezeArguments:
         },
     )
     num_layer_trainable: int = field(
-        default=3,
+        default=2,
         metadata={"help": "The number of trainable layers for partial-parameter (freeze) fine-tuning."},
     )
 
@@ -52,8 +52,8 @@ class LoraArguments:
         default=8,
         metadata={"help": "The intrinsic dimension for LoRA fine-tuning."},
     )
-    lora_target: Optional[str] = field(
-        default=None,
+    lora_target: str = field(
+        default="all",
         metadata={
             "help": """Name(s) of target modules to apply LoRA. \
                     Use commas to separate multiple modules. \
diff --git a/src/llmtuner/hparams/parser.py b/src/llmtuner/hparams/parser.py
index 1db14132..14cf9bfd 100644
--- a/src/llmtuner/hparams/parser.py
+++ b/src/llmtuner/hparams/parser.py
@@ -137,16 +137,6 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
     if training_args.do_train and training_args.predict_with_generate:
         raise ValueError("`predict_with_generate` cannot be set as True while training.")
 
-    if (
-        training_args.do_train
-        and finetuning_args.finetuning_type == "freeze"
-        and finetuning_args.name_module_trainable is None
-    ):
-        raise ValueError("Please specify `name_module_trainable` in Freeze training.")
-
-    if training_args.do_train and finetuning_args.finetuning_type == "lora" and finetuning_args.lora_target is None:
-        raise ValueError("Please specify `lora_target` in LoRA training.")
-
     if training_args.do_train and model_args.use_unsloth and not is_unsloth_available:
         raise ValueError("Unsloth was not installed: https://github.com/unslothai/unsloth")