From 845e750abd9918400a26f9518971bc5359f6fef8 Mon Sep 17 00:00:00 2001 From: hiyouga Date: Wed, 28 Feb 2024 23:19:25 +0800 Subject: [PATCH] add examples Former-commit-id: 804c1e7083e56b4a132d4d820ea9d8d50e5499e9 --- README.md | 25 ++++++++++++----- README_zh.md | 25 ++++++++++++----- examples/full_multi_gpu/ds_z2_config.json | 27 +++++++++++++++++++ examples/full_multi_gpu/ds_z3_config.json | 33 +++++++++++++++++++++++ examples/full_multi_gpu/sft.sh | 29 ++++++++++++++++++++ examples/lora_multi_gpu/config.yaml | 16 +++++++++++ examples/lora_multi_gpu/sft.sh | 30 +++++++++++++++++++++ examples/lora_single_gpu/dpo.sh | 33 +++++++++++++++++++++++ examples/lora_single_gpu/ppo.sh | 31 +++++++++++++++++++++ examples/lora_single_gpu/predict.sh | 18 +++++++++++++ examples/lora_single_gpu/pretrain.sh | 29 ++++++++++++++++++++ examples/lora_single_gpu/reward.sh | 31 +++++++++++++++++++++ examples/lora_single_gpu/sft.sh | 30 +++++++++++++++++++++ examples/qlora_single_gpu/aqlm.sh | 30 +++++++++++++++++++++ examples/qlora_single_gpu/awq.sh | 30 +++++++++++++++++++++ examples/qlora_single_gpu/bitsandbytes.sh | 31 +++++++++++++++++++++ examples/qlora_single_gpu/gptq.sh | 30 +++++++++++++++++++++ 17 files changed, 466 insertions(+), 12 deletions(-) create mode 100644 examples/full_multi_gpu/ds_z2_config.json create mode 100644 examples/full_multi_gpu/ds_z3_config.json create mode 100644 examples/full_multi_gpu/sft.sh create mode 100644 examples/lora_multi_gpu/config.yaml create mode 100644 examples/lora_multi_gpu/sft.sh create mode 100644 examples/lora_single_gpu/dpo.sh create mode 100644 examples/lora_single_gpu/ppo.sh create mode 100644 examples/lora_single_gpu/predict.sh create mode 100644 examples/lora_single_gpu/pretrain.sh create mode 100644 examples/lora_single_gpu/reward.sh create mode 100644 examples/lora_single_gpu/sft.sh create mode 100644 examples/qlora_single_gpu/aqlm.sh create mode 100644 examples/qlora_single_gpu/awq.sh create mode 100644 examples/qlora_single_gpu/bitsandbytes.sh create mode 100644 examples/qlora_single_gpu/gptq.sh diff --git a/README.md b/README.md index df64aa75..23f7a3ed 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/6ba60acc-e2e2-4bec-b846 - **Scalable resources**: 32-bit full-tuning, 16-bit freeze tuning, 16-bit LoRA tuning, 2/4/8-bit QLoRA with AQLM/AWQ/GPTQ/LLM.int8. - **Advanced algorithms**: DoRA, LongLoRA, LLaMA Pro, LoftQ, agent tuning. - **Intriguing tricks**: FlashAttention-2, Unsloth, RoPE scaling, NEFTune, rsLoRA. +- **Experiment monitors**: LlamaBoard, TensorBoard, Wandb, MLflow, etc. 
## Benchmark @@ -236,15 +237,27 @@ huggingface-cli login ## Requirement -- Python 3.8+ and PyTorch 1.13.1+ -- 🤗Transformers, Datasets, Accelerate, PEFT and TRL -- sentencepiece, protobuf and tiktoken -- jieba, rouge-chinese and nltk (used at evaluation and predict) -- gradio and matplotlib (used in web UI) -- uvicorn, fastapi and sse-starlette (used in API) +| Mandatory | Minimum | Recommend | +| ------------ | ------- | --------- | +| python | 3.8 | 3.10 | +| torch | 1.13.1 | 2.2.1 | +| transformers | 4.37.2 | 4.38.1 | +| datasets | 2.14.3 | 2.17.1 | +| accelerate | 0.27.2 | 0.27.2 | +| peft | 0.9.0 | 0.9.0 | +| trl | 0.7.11 | 0.7.11 | + +| Optional | Minimum | Recommend | +| ------------ | ------- | --------- | +| CUDA | 11.6 | 12.2 | +| deepspeed | 0.10.0 | 0.13.4 | +| bitsandbytes | 0.39.0 | 0.41.3 | +| flash-attn | 2.3.0 | 2.5.5 | ### Hardware Requirement +\* *estimated* + | Method | Bits | 7B | 13B | 30B | 65B | 8x7B | | ------ | ---- | ----- | ----- | ----- | ------ | ------ | | Full | 16 | 160GB | 320GB | 600GB | 1200GB | 900GB | diff --git a/README_zh.md b/README_zh.md index c8db1485..7235321a 100644 --- a/README_zh.md +++ b/README_zh.md @@ -45,6 +45,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/6ba60acc-e2e2-4bec-b846 - **多种精度**:32 比特全参数训练、16 比特部分参数训练、16比特 LoRA 训练、基于 AQLM/AWQ/GPTQ/LLM.int8 的 2/4/8 比特 LoRA 训练。 - **先进算法**: DoRA、LongLoRA、LLaMA Pro、LoftQ、agent tuning。 - **新鲜技巧**:FlashAttention-2、Unsloth、RoPE scaling、NEFTune、rsLoRA。 +- **实验监控**:LlamaBoard、TensorBoard、Wandb、MLflow 等等。 ## 性能指标 @@ -236,15 +237,27 @@ huggingface-cli login ## 软硬件依赖 -- Python 3.8+ 和 PyTorch 1.13.1+ -- 🤗Transformers, Datasets, Accelerate, PEFT 和 TRL -- sentencepiece, protobuf 和 tiktoken -- jieba, rouge-chinese 和 nltk (用于评估及预测) -- gradio 和 matplotlib (用于网页端交互) -- uvicorn, fastapi 和 sse-starlette (用于 API) +| 必需项 | 至少 | 推荐 | +| ------------ | ------- | --------- | +| python | 3.8 | 3.10 | +| torch | 1.13.1 | 2.2.1 | +| transformers | 4.37.2 | 4.38.1 | +| datasets | 2.14.3 | 2.17.1 | +| accelerate | 0.27.2 | 0.27.2 | +| peft | 0.9.0 | 0.9.0 | +| trl | 0.7.11 | 0.7.11 | + +| 可选项 | 至少 | 推荐 | +| ------------ | ------- | --------- | +| CUDA | 11.6 | 12.2 | +| deepspeed | 0.10.0 | 0.13.4 | +| bitsandbytes | 0.39.0 | 0.41.3 | +| flash-attn | 2.3.0 | 2.5.5 | ### 硬件依赖 +\* *估算值* + | 训练方法 | 精度 | 7B | 13B | 30B | 65B | 8x7B | | ------- | ---- | ----- | ----- | ----- | ------ | ------ | | 全参数 | 16 | 160GB | 320GB | 600GB | 1200GB | 900GB | diff --git a/examples/full_multi_gpu/ds_z2_config.json b/examples/full_multi_gpu/ds_z2_config.json new file mode 100644 index 00000000..3d42aa15 --- /dev/null +++ b/examples/full_multi_gpu/ds_z2_config.json @@ -0,0 +1,27 @@ +{ + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "zero_allow_untested_optimizer": true, + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "initial_scale_power": 16, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "allgather_bucket_size": 5e8, + "reduce_scatter": true, + "reduce_bucket_size": 5e8, + "overlap_comm": true, + "contiguous_gradients": true + } +} \ No newline at end of file diff --git a/examples/full_multi_gpu/ds_z3_config.json b/examples/full_multi_gpu/ds_z3_config.json new file mode 100644 index 00000000..9c5f55da --- /dev/null +++ b/examples/full_multi_gpu/ds_z3_config.json @@ -0,0 +1,33 @@ +{ + 
"train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "zero_allow_untested_optimizer": true, + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "initial_scale_power": 16, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu" + }, + "offload_param": { + "device": "cpu" + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_16bit_weights_on_model_save": true + } +} \ No newline at end of file diff --git a/examples/full_multi_gpu/sft.sh b/examples/full_multi_gpu/sft.sh new file mode 100644 index 00000000..e3ced5f0 --- /dev/null +++ b/examples/full_multi_gpu/sft.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +deepspeed --num_gpus 4 ../../src/train_bash.py \ + --deepspeed ds_z3_config.json \ + --stage sft \ + --do_train \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --dataset alpaca_gpt4_en \ + --dataset_dir ../../data \ + --template default \ + --finetuning_type full \ + --output_dir ../../saves/LLaMA2-7B/full/sft \ + --overwrite_cache \ + --overwrite_output_dir \ + --cutoff_len 1024 \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 2 \ + --lr_scheduler_type cosine \ + --logging_steps 10 \ + --save_steps 100 \ + --eval_steps 100 \ + --evaluation_strategy steps \ + --learning_rate 5e-5 \ + --num_train_epochs 3.0 \ + --max_samples 3000 \ + --val_size 0.1 \ + --plot_loss \ + --fp16 diff --git a/examples/lora_multi_gpu/config.yaml b/examples/lora_multi_gpu/config.yaml new file mode 100644 index 00000000..ddb5c910 --- /dev/null +++ b/examples/lora_multi_gpu/config.yaml @@ -0,0 +1,16 @@ +compute_environment: LOCAL_MACHINE +debug: false +distributed_type: MULTI_GPU +downcast_bf16: 'no' +gpu_ids: all +machine_rank: 0 +main_training_function: main +mixed_precision: fp16 +num_machines: 1 +num_processes: 4 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false diff --git a/examples/lora_multi_gpu/sft.sh b/examples/lora_multi_gpu/sft.sh new file mode 100644 index 00000000..525e4f67 --- /dev/null +++ b/examples/lora_multi_gpu/sft.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch --config_file config.yaml ../../src/train_bash.py \ + --stage sft \ + --do_train \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --dataset alpaca_gpt4_en,glaive_toolcall \ + --dataset_dir ../../data \ + --template default \ + --finetuning_type lora \ + --lora_target q_proj,v_proj \ + --output_dir ../../saves/LLaMA2-7B/lora/sft \ + --overwrite_cache \ + --overwrite_output_dir \ + --cutoff_len 1024 \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 2 \ + --lr_scheduler_type cosine \ + --logging_steps 10 \ + --save_steps 100 \ + --eval_steps 100 \ + --evaluation_strategy steps \ + --load_best_model_at_end \ + --learning_rate 5e-5 \ + --num_train_epochs 3.0 \ + --max_samples 3000 \ + --val_size 0.1 \ + --plot_loss \ + --fp16 diff --git a/examples/lora_single_gpu/dpo.sh b/examples/lora_single_gpu/dpo.sh new file mode 100644 index 00000000..8c2f68c9 --- /dev/null +++ b/examples/lora_single_gpu/dpo.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \ 
+ --stage dpo \ + --do_train \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \ + --create_new_adapter \ + --dataset comparison_gpt4_en \ + --dataset_dir ../../data \ + --template default \ + --finetuning_type lora \ + --lora_target q_proj,v_proj \ + --output_dir ../../saves/LLaMA2-7B/lora/dpo \ + --overwrite_cache \ + --overwrite_output_dir \ + --cutoff_len 1024 \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 8 \ + --lr_scheduler_type cosine \ + --logging_steps 10 \ + --save_steps 100 \ + --eval_steps 100 \ + --evaluation_strategy steps \ + --load_best_model_at_end \ + --learning_rate 1e-5 \ + --num_train_epochs 1.0 \ + --max_samples 1000 \ + --val_size 0.1 \ + --dpo_ftx 1.0 \ + --plot_loss \ + --fp16 diff --git a/examples/lora_single_gpu/ppo.sh b/examples/lora_single_gpu/ppo.sh new file mode 100644 index 00000000..4ec0cbfb --- /dev/null +++ b/examples/lora_single_gpu/ppo.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \ + --stage ppo \ + --do_train \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \ + --create_new_adapter \ + --dataset alpaca_gpt4_en \ + --dataset_dir ../../data \ + --template default \ + --finetuning_type lora \ + --lora_target q_proj,v_proj \ + --reward_model ../../saves/LLaMA2-7B/lora/reward \ + --output_dir ../../saves/LLaMA2-7B/lora/ppo \ + --overwrite_cache \ + --overwrite_output_dir \ + --cutoff_len 512 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 8 \ + --lr_scheduler_type cosine \ + --logging_steps 10 \ + --save_steps 100 \ + --learning_rate 1e-5 \ + --num_train_epochs 1.0 \ + --max_samples 1000 \ + --top_k 0 \ + --top_p 0.9 \ + --max_new_tokens 256 \ + --plot_loss \ + --fp16 diff --git a/examples/lora_single_gpu/predict.sh b/examples/lora_single_gpu/predict.sh new file mode 100644 index 00000000..1fb45396 --- /dev/null +++ b/examples/lora_single_gpu/predict.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \ + --stage sft \ + --do_predict \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft,../../saves/LLaMA2-7B/lora/dpo \ + --dataset alpaca_gpt4_en,glaive_toolcall \ + --dataset_dir ../../data \ + --template default \ + --finetuning_type lora \ + --output_dir ../../saves/LLaMA2-7B/lora/predict \ + --overwrite_cache \ + --overwrite_output_dir \ + --cutoff_len 1024 \ + --per_device_eval_batch_size 1 \ + --max_samples 20 \ + --predict_with_generate diff --git a/examples/lora_single_gpu/pretrain.sh b/examples/lora_single_gpu/pretrain.sh new file mode 100644 index 00000000..37adf51f --- /dev/null +++ b/examples/lora_single_gpu/pretrain.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \ + --stage pt \ + --do_train \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --dataset c4_demo \ + --dataset_dir ../../data \ + --finetuning_type lora \ + --lora_target q_proj,v_proj \ + --output_dir ../../saves/LLaMA2-7B/lora/pretrain \ + --overwrite_cache \ + --overwrite_output_dir \ + --cutoff_len 1024 \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 8 \ + --lr_scheduler_type cosine \ + --logging_steps 10 \ + --save_steps 100 \ + --eval_steps 100 \ + --evaluation_strategy steps \ + --load_best_model_at_end \ + --learning_rate 5e-5 \ + --num_train_epochs 
3.0 \ + --max_samples 10000 \ + --val_size 0.1 \ + --plot_loss \ + --fp16 diff --git a/examples/lora_single_gpu/reward.sh b/examples/lora_single_gpu/reward.sh new file mode 100644 index 00000000..7c19e9aa --- /dev/null +++ b/examples/lora_single_gpu/reward.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \ + --stage rm \ + --do_train \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \ + --create_new_adapter \ + --dataset comparison_gpt4_en \ + --dataset_dir ../../data \ + --template default \ + --finetuning_type lora \ + --lora_target q_proj,v_proj \ + --output_dir ../../saves/LLaMA2-7B/lora/reward \ + --overwrite_cache \ + --overwrite_output_dir \ + --cutoff_len 1024 \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 8 \ + --lr_scheduler_type cosine \ + --logging_steps 10 \ + --save_steps 100 \ + --eval_steps 100 \ + --evaluation_strategy steps \ + --learning_rate 1e-5 \ + --num_train_epochs 1.0 \ + --max_samples 5000 \ + --val_size 0.1 \ + --plot_loss \ + --fp16 diff --git a/examples/lora_single_gpu/sft.sh b/examples/lora_single_gpu/sft.sh new file mode 100644 index 00000000..41d7851a --- /dev/null +++ b/examples/lora_single_gpu/sft.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \ + --stage sft \ + --do_train \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --dataset alpaca_gpt4_en,glaive_toolcall \ + --dataset_dir ../../data \ + --template default \ + --finetuning_type lora \ + --lora_target q_proj,v_proj \ + --output_dir ../../saves/LLaMA2-7B/lora/sft \ + --overwrite_cache \ + --overwrite_output_dir \ + --cutoff_len 1024 \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 8 \ + --lr_scheduler_type cosine \ + --logging_steps 10 \ + --save_steps 100 \ + --eval_steps 100 \ + --evaluation_strategy steps \ + --load_best_model_at_end \ + --learning_rate 5e-5 \ + --num_train_epochs 3.0 \ + --max_samples 3000 \ + --val_size 0.1 \ + --plot_loss \ + --fp16 diff --git a/examples/qlora_single_gpu/aqlm.sh b/examples/qlora_single_gpu/aqlm.sh new file mode 100644 index 00000000..68eb4482 --- /dev/null +++ b/examples/qlora_single_gpu/aqlm.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \ + --stage sft \ + --do_train \ + --model_name_or_path BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf \ + --dataset alpaca_gpt4_en,glaive_toolcall \ + --dataset_dir ../../data \ + --template default \ + --finetuning_type lora \ + --lora_target q_proj,v_proj \ + --output_dir ../../saves/LLaMA2-7B/lora/sft \ + --overwrite_cache \ + --overwrite_output_dir \ + --cutoff_len 1024 \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 8 \ + --lr_scheduler_type cosine \ + --logging_steps 10 \ + --save_steps 100 \ + --eval_steps 100 \ + --evaluation_strategy steps \ + --load_best_model_at_end \ + --learning_rate 5e-5 \ + --num_train_epochs 3.0 \ + --max_samples 3000 \ + --val_size 0.1 \ + --plot_loss \ + --fp16 diff --git a/examples/qlora_single_gpu/awq.sh b/examples/qlora_single_gpu/awq.sh new file mode 100644 index 00000000..b0f1f46b --- /dev/null +++ b/examples/qlora_single_gpu/awq.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \ + --stage sft \ + --do_train \ + --model_name_or_path TheBloke/Llama-2-7B-AWQ \ + --dataset 
alpaca_gpt4_en,glaive_toolcall \ + --dataset_dir ../../data \ + --template default \ + --finetuning_type lora \ + --lora_target q_proj,v_proj \ + --output_dir ../../saves/LLaMA2-7B/lora/sft \ + --overwrite_cache \ + --overwrite_output_dir \ + --cutoff_len 1024 \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 8 \ + --lr_scheduler_type cosine \ + --logging_steps 10 \ + --save_steps 100 \ + --eval_steps 100 \ + --evaluation_strategy steps \ + --load_best_model_at_end \ + --learning_rate 5e-5 \ + --num_train_epochs 3.0 \ + --max_samples 3000 \ + --val_size 0.1 \ + --plot_loss \ + --fp16 diff --git a/examples/qlora_single_gpu/bitsandbytes.sh b/examples/qlora_single_gpu/bitsandbytes.sh new file mode 100644 index 00000000..84bbb426 --- /dev/null +++ b/examples/qlora_single_gpu/bitsandbytes.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \ + --stage sft \ + --do_train \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --dataset alpaca_gpt4_en,glaive_toolcall \ + --dataset_dir ../../data \ + --template default \ + --finetuning_type lora \ + --lora_target q_proj,v_proj \ + --output_dir ../../saves/LLaMA2-7B/lora/sft \ + --overwrite_cache \ + --overwrite_output_dir \ + --cutoff_len 1024 \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 8 \ + --lr_scheduler_type cosine \ + --logging_steps 10 \ + --save_steps 100 \ + --eval_steps 100 \ + --evaluation_strategy steps \ + --load_best_model_at_end \ + --learning_rate 5e-5 \ + --num_train_epochs 3.0 \ + --max_samples 3000 \ + --val_size 0.1 \ + --quantization_bit 4 \ + --plot_loss \ + --fp16 diff --git a/examples/qlora_single_gpu/gptq.sh b/examples/qlora_single_gpu/gptq.sh new file mode 100644 index 00000000..a971b09f --- /dev/null +++ b/examples/qlora_single_gpu/gptq.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \ + --stage sft \ + --do_train \ + --model_name_or_path TheBloke/Llama-2-7B-GPTQ \ + --dataset alpaca_gpt4_en,glaive_toolcall \ + --dataset_dir ../../data \ + --template default \ + --finetuning_type lora \ + --lora_target q_proj,v_proj \ + --output_dir ../../saves/LLaMA2-7B/lora/sft \ + --overwrite_cache \ + --overwrite_output_dir \ + --cutoff_len 1024 \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 8 \ + --lr_scheduler_type cosine \ + --logging_steps 10 \ + --save_steps 100 \ + --eval_steps 100 \ + --evaluation_strategy steps \ + --load_best_model_at_end \ + --learning_rate 5e-5 \ + --num_train_epochs 3.0 \ + --max_samples 3000 \ + --val_size 0.1 \ + --plot_loss \ + --fp16
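
Usage note (not part of the patch): the example scripts above locate the sources, data, and checkpoints through relative paths (`../../src`, `../../data`, `../../saves`), so each one is meant to be launched from inside its own `examples/...` directory, and the later single-GPU stages reuse the adapters saved by the earlier ones (`reward.sh`, `ppo.sh`, and `dpo.sh` load `../../saves/LLaMA2-7B/lora/sft`; `ppo.sh` additionally needs the reward model written by `reward.sh`; `predict.sh` stacks the SFT and DPO adapters). Below is a minimal sketch of one way to chain them, assuming the repository layout in this patch and, for the multi-GPU variants, four visible GPUs as configured in `config.yaml` and `deepspeed --num_gpus 4`; it is an illustration, not a command sequence shipped with the patch.

```bash
#!/bin/bash
# Sketch: run the new examples in a typical order, starting from the repo root.
# Each script is executed from its own directory so its ../../ paths resolve.

cd examples/lora_single_gpu

bash pretrain.sh   # optional continued pre-training on c4_demo (independent of the stages below)
bash sft.sh        # supervised fine-tuning on alpaca_gpt4_en + glaive_toolcall
bash reward.sh     # reward model on comparison_gpt4_en, reusing the SFT adapter
bash ppo.sh        # PPO using the reward model saved to ../../saves/LLaMA2-7B/lora/reward
bash dpo.sh        # alternatively, DPO directly on top of the SFT adapter
bash predict.sh    # batch prediction with the SFT + DPO adapters

# Multi-GPU variants (assumes 4 GPUs, matching config.yaml / --num_gpus 4):
cd ../lora_multi_gpu && bash sft.sh    # accelerate launch with config.yaml
cd ../full_multi_gpu && bash sft.sh    # deepspeed with ds_z3_config.json
```

The shared `../../saves/LLaMA2-7B/lora/*` output directories are what make this chaining work: each later stage finds the previous stage's adapter at a fixed path, so the scripts can be run one after another without editing any arguments.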