From dfff5119b40b9752836d59e7c1a89e37542302da Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Fri, 17 May 2024 01:02:00 +0800
Subject: [PATCH] update examples: use '###' for section header comments

Former-commit-id: ddec9e1b842d407790637e9b0b181f8b26926db9
---
 examples/extras/badam/llama3_lora_sft.yaml         | 12 ++++++------
 examples/extras/fsdp_qlora/llama3_lora_sft.yaml    | 14 +++++++-------
 examples/extras/galore/llama3_full_sft.yaml        | 12 ++++++------
 examples/extras/llama_pro/llama3_freeze_sft.yaml   | 12 ++++++------
 examples/extras/loraplus/llama3_lora_sft.yaml      | 12 ++++++------
 examples/extras/mod/llama3_full_sft.yaml           | 12 ++++++------
 examples/full_multi_gpu/llama3_full_predict.yaml   | 10 +++++-----
 examples/full_multi_gpu/llama3_full_sft.yaml       | 14 +++++++-------
 examples/lora_multi_gpu/llama3_lora_sft.yaml       | 14 +++++++-------
 examples/lora_multi_gpu/llama3_lora_sft_ds.yaml    | 14 +++++++-------
 examples/lora_multi_npu/llama3_lora_sft_ds.yaml    | 14 +++++++-------
 examples/lora_single_gpu/llama3_lora_dpo.yaml      | 12 ++++++------
 examples/lora_single_gpu/llama3_lora_eval.yaml     | 10 +++++-----
 examples/lora_single_gpu/llama3_lora_orpo.yaml     | 12 ++++++------
 examples/lora_single_gpu/llama3_lora_ppo.yaml      | 12 ++++++------
 examples/lora_single_gpu/llama3_lora_predict.yaml  | 10 +++++-----
 examples/lora_single_gpu/llama3_lora_pretrain.yaml | 12 ++++++------
 examples/lora_single_gpu/llama3_lora_reward.yaml   | 12 ++++++------
 examples/lora_single_gpu/llama3_lora_sft.yaml      | 12 ++++++------
 examples/lora_single_gpu/llama3_preprocess.yaml    |  8 ++++----
 examples/lora_single_gpu/llava1_5_lora_sft.yaml    | 12 ++++++------
 examples/merge_lora/llama3_gptq.yaml               |  4 ++--
 examples/merge_lora/llama3_lora_sft.yaml           |  6 +++---
 .../qlora_single_gpu/llama3_lora_sft_aqlm.yaml     | 12 ++++++------
 examples/qlora_single_gpu/llama3_lora_sft_awq.yaml | 12 ++++++------
 .../llama3_lora_sft_bitsandbytes.yaml              | 12 ++++++------
 .../qlora_single_gpu/llama3_lora_sft_gptq.yaml     | 12 ++++++------
 27 files changed, 155 insertions(+), 155 deletions(-)

diff --git a/examples/extras/badam/llama3_lora_sft.yaml b/examples/extras/badam/llama3_lora_sft.yaml
index 5e8994bc..c8c00431 100644
--- a/examples/extras/badam/llama3_lora_sft.yaml
+++ b/examples/extras/badam/llama3_lora_sft.yaml
@@ -1,7 +1,7 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: full
@@ -10,7 +10,7 @@ badam_switch_mode: descending
 badam_switch_interval: 50
 badam_verbose: 2
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -18,14 +18,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/full/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -34,7 +34,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 pure_bf16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/extras/fsdp_qlora/llama3_lora_sft.yaml b/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
index 1fd8f16a..9d3b1124 100644
--- a/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
+++ b/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
@@ -1,17 +1,17 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 quantization_bit: 4
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 
-# ddp
+### ddp
 ddp_timeout: 180000000
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -19,14 +19,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -35,7 +35,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/extras/galore/llama3_full_sft.yaml b/examples/extras/galore/llama3_full_sft.yaml
index 3bc074c5..7f5ce354 100644
--- a/examples/extras/galore/llama3_full_sft.yaml
+++ b/examples/extras/galore/llama3_full_sft.yaml
@@ -1,7 +1,7 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: full
@@ -11,7 +11,7 @@ galore_target: mlp,self_attn
 galore_rank: 128
 galore_scale: 2.0
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -19,14 +19,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/full/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 1
 learning_rate: 0.0001
@@ -35,7 +35,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 pure_bf16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/extras/llama_pro/llama3_freeze_sft.yaml b/examples/extras/llama_pro/llama3_freeze_sft.yaml
index 0ffcb5e8..fc9bc9d3 100644
--- a/examples/extras/llama_pro/llama3_freeze_sft.yaml
+++ b/examples/extras/llama_pro/llama3_freeze_sft.yaml
@@ -1,7 +1,7 @@
-# model
+### model
 model_name_or_path: models/llama3-8b-instruct-pro
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: freeze
@@ -9,7 +9,7 @@ freeze_trainable_layers: 8
 freeze_trainable_modules: all
 use_llama_pro: true
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -17,14 +17,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b-instruct-pro/freeze/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -33,7 +33,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/extras/loraplus/llama3_lora_sft.yaml b/examples/extras/loraplus/llama3_lora_sft.yaml
index 0956aa71..c0e582d9 100644
--- a/examples/extras/loraplus/llama3_lora_sft.yaml
+++ b/examples/extras/loraplus/llama3_lora_sft.yaml
@@ -1,14 +1,14 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 loraplus_lr_ratio: 16.0
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -16,14 +16,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -32,7 +32,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/extras/mod/llama3_full_sft.yaml b/examples/extras/mod/llama3_full_sft.yaml
index 5dc8c061..cfcd4f8a 100644
--- a/examples/extras/mod/llama3_full_sft.yaml
+++ b/examples/extras/mod/llama3_full_sft.yaml
@@ -1,13 +1,13 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: full
 mixture_of_depths: convert
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -15,14 +15,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b-mod/full/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 optim: paged_adamw_8bit
@@ -32,7 +32,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 pure_bf16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/full_multi_gpu/llama3_full_predict.yaml b/examples/full_multi_gpu/llama3_full_predict.yaml
index 5b9b680b..f037a20c 100644
--- a/examples/full_multi_gpu/llama3_full_predict.yaml
+++ b/examples/full_multi_gpu/llama3_full_predict.yaml
@@ -1,12 +1,12 @@
-# model
+### model
 model_name_or_path: saves/llama3-8b/full/sft
 
-# method
+### method
 stage: sft
 do_predict: true
 finetuning_type: full
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -14,10 +14,10 @@ max_samples: 50
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/full/predict
 overwrite_output_dir: true
 
-# eval
+### eval
 per_device_eval_batch_size: 1
 predict_with_generate: true
diff --git a/examples/full_multi_gpu/llama3_full_sft.yaml b/examples/full_multi_gpu/llama3_full_sft.yaml
index 2d8031f1..a08af5fe 100644
--- a/examples/full_multi_gpu/llama3_full_sft.yaml
+++ b/examples/full_multi_gpu/llama3_full_sft.yaml
@@ -1,16 +1,16 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: full
 
-# ddp
+### ddp
 ddp_timeout: 180000000
 deepspeed: examples/deepspeed/ds_z3_config.json
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -18,14 +18,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/full/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 2
 learning_rate: 0.0001
@@ -34,7 +34,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/lora_multi_gpu/llama3_lora_sft.yaml b/examples/lora_multi_gpu/llama3_lora_sft.yaml
index 6cc06f8a..ed39144f 100644
--- a/examples/lora_multi_gpu/llama3_lora_sft.yaml
+++ b/examples/lora_multi_gpu/llama3_lora_sft.yaml
@@ -1,16 +1,16 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 
-# ddp
+### ddp
 ddp_timeout: 180000000
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -18,14 +18,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 2
 learning_rate: 0.0001
@@ -34,7 +34,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml b/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
index 5a7348c1..1ce045c0 100644
--- a/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
+++ b/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
@@ -1,17 +1,17 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 
-# ddp
+### ddp
 ddp_timeout: 180000000
 deepspeed: examples/deepspeed/ds_z3_config.json
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -19,14 +19,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 2
 learning_rate: 0.0001
@@ -35,7 +35,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/lora_multi_npu/llama3_lora_sft_ds.yaml b/examples/lora_multi_npu/llama3_lora_sft_ds.yaml
index 2e9c0558..286ab503 100644
--- a/examples/lora_multi_npu/llama3_lora_sft_ds.yaml
+++ b/examples/lora_multi_npu/llama3_lora_sft_ds.yaml
@@ -1,17 +1,17 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 
-# ddp
+### ddp
 ddp_timeout: 180000000
 deepspeed: examples/deepspeed/ds_z0_config.json
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -19,14 +19,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 2
 learning_rate: 0.0001
@@ -35,7 +35,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/lora_single_gpu/llama3_lora_dpo.yaml b/examples/lora_single_gpu/llama3_lora_dpo.yaml
index 16c6d0c9..615e919f 100644
--- a/examples/lora_single_gpu/llama3_lora_dpo.yaml
+++ b/examples/lora_single_gpu/llama3_lora_dpo.yaml
@@ -1,14 +1,14 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 
-# method
+### method
 stage: dpo
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 dpo_ftx: 1.0
 
-# dataset
+### dataset
 dataset: orca_rlhf
 template: llama3
 cutoff_len: 1024
@@ -16,14 +16,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/dpo
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.00001
@@ -32,7 +32,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/lora_single_gpu/llama3_lora_eval.yaml b/examples/lora_single_gpu/llama3_lora_eval.yaml
index 5808a47a..6fcfd6ef 100644
--- a/examples/lora_single_gpu/llama3_lora_eval.yaml
+++ b/examples/lora_single_gpu/llama3_lora_eval.yaml
@@ -1,19 +1,19 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 adapter_name_or_path: saves/llama3-8b/lora/sft
 
-# method
+### method
 finetuning_type: lora
 
-# dataset
+### dataset
 task: mmlu
 split: test
 template: fewshot
 lang: en
 n_shot: 5
 
-# output
+### output
 save_dir: saves/llama3-8b/lora/eval
 
-# eval
+### eval
 batch_size: 4
diff --git a/examples/lora_single_gpu/llama3_lora_orpo.yaml b/examples/lora_single_gpu/llama3_lora_orpo.yaml
index bc42bdd4..6fed8735 100644
--- a/examples/lora_single_gpu/llama3_lora_orpo.yaml
+++ b/examples/lora_single_gpu/llama3_lora_orpo.yaml
@@ -1,13 +1,13 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 
-# method
+### method
 stage: orpo
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 
-# dataset
+### dataset
 dataset: orca_rlhf
 template: llama3
 cutoff_len: 1024
@@ -15,14 +15,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/orpo
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.00001
@@ -31,7 +31,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/lora_single_gpu/llama3_lora_ppo.yaml b/examples/lora_single_gpu/llama3_lora_ppo.yaml
index 8d78d20d..5cd2f18f 100644
--- a/examples/lora_single_gpu/llama3_lora_ppo.yaml
+++ b/examples/lora_single_gpu/llama3_lora_ppo.yaml
@@ -1,14 +1,14 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 reward_model: saves/llama3-8b/lora/reward
 
-# method
+### method
 stage: ppo
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -16,14 +16,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/ppo
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.00001
@@ -32,7 +32,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# generate
+### generate
 max_new_tokens: 512
 top_k: 0
 top_p: 0.9
diff --git a/examples/lora_single_gpu/llama3_lora_predict.yaml b/examples/lora_single_gpu/llama3_lora_predict.yaml
index 5a9de686..ba55219a 100644
--- a/examples/lora_single_gpu/llama3_lora_predict.yaml
+++ b/examples/lora_single_gpu/llama3_lora_predict.yaml
@@ -1,13 +1,13 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 adapter_name_or_path: saves/llama3-8b/lora/sft
 
-# method
+### method
 stage: sft
 do_predict: true
 finetuning_type: lora
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -15,10 +15,10 @@ max_samples: 50
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/predict
 overwrite_output_dir: true
 
-# eval
+### eval
 per_device_eval_batch_size: 1
 predict_with_generate: true
diff --git a/examples/lora_single_gpu/llama3_lora_pretrain.yaml b/examples/lora_single_gpu/llama3_lora_pretrain.yaml
index 48425b15..acb18ebf 100644
--- a/examples/lora_single_gpu/llama3_lora_pretrain.yaml
+++ b/examples/lora_single_gpu/llama3_lora_pretrain.yaml
@@ -1,27 +1,27 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 
-# method
+### method
 stage: pt
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 
-# dataset
+### dataset
 dataset: c4_demo
 cutoff_len: 1024
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -30,7 +30,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/lora_single_gpu/llama3_lora_reward.yaml b/examples/lora_single_gpu/llama3_lora_reward.yaml
index ecaf8d72..67baefd0 100644
--- a/examples/lora_single_gpu/llama3_lora_reward.yaml
+++ b/examples/lora_single_gpu/llama3_lora_reward.yaml
@@ -1,13 +1,13 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 
-# method
+### method
 stage: rm
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 
-# dataset
+### dataset
 dataset: orca_rlhf
 template: llama3
 cutoff_len: 1024
@@ -15,14 +15,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/reward
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.00001
@@ -31,7 +31,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/lora_single_gpu/llama3_lora_sft.yaml b/examples/lora_single_gpu/llama3_lora_sft.yaml
index 0e5e30b3..e7836fd1 100644
--- a/examples/lora_single_gpu/llama3_lora_sft.yaml
+++ b/examples/lora_single_gpu/llama3_lora_sft.yaml
@@ -1,13 +1,13 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -15,14 +15,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -31,7 +31,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/lora_single_gpu/llama3_preprocess.yaml b/examples/lora_single_gpu/llama3_preprocess.yaml
index 4c45c1cd..59090544 100644
--- a/examples/lora_single_gpu/llama3_preprocess.yaml
+++ b/examples/lora_single_gpu/llama3_preprocess.yaml
@@ -1,13 +1,13 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -16,6 +16,6 @@ overwrite_cache: true
 preprocessing_num_workers: 16
 tokenized_path: saves/llama3-8b/dataset/sft
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 overwrite_output_dir: true
diff --git a/examples/lora_single_gpu/llava1_5_lora_sft.yaml b/examples/lora_single_gpu/llava1_5_lora_sft.yaml
index 84d2a672..8e4226da 100644
--- a/examples/lora_single_gpu/llava1_5_lora_sft.yaml
+++ b/examples/lora_single_gpu/llava1_5_lora_sft.yaml
@@ -1,14 +1,14 @@
-# model
+### model
 model_name_or_path: llava-hf/llava-1.5-7b-hf
 visual_inputs: true
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 
-# dataset
+### dataset
 dataset: mllm_demo
 template: vicuna
 cutoff_len: 1024
@@ -16,14 +16,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llava1_5-7b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -32,7 +32,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/merge_lora/llama3_gptq.yaml b/examples/merge_lora/llama3_gptq.yaml
index eac12f90..70c96a6b 100644
--- a/examples/merge_lora/llama3_gptq.yaml
+++ b/examples/merge_lora/llama3_gptq.yaml
@@ -1,8 +1,8 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 template: llama3
 
-# export
+### export
 export_dir: models/llama3_gptq
 export_quantization_bit: 4
 export_quantization_dataset: data/c4_demo.json
diff --git a/examples/merge_lora/llama3_lora_sft.yaml b/examples/merge_lora/llama3_lora_sft.yaml
index de41d48b..1e017f69 100644
--- a/examples/merge_lora/llama3_lora_sft.yaml
+++ b/examples/merge_lora/llama3_lora_sft.yaml
@@ -1,12 +1,12 @@
-# Note: DO NOT use quantized model or quantization_bit when merging lora adapters
+### Note: DO NOT use a quantized model or quantization_bit when merging LoRA adapters
 
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 adapter_name_or_path: saves/llama3-8b/lora/sft
 template: llama3
 finetuning_type: lora
 
-# export
+### export
 export_dir: models/llama3_lora_sft
 export_size: 2
 export_device: cpu
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml b/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
index a1d5f95d..c8f2cff6 100644
--- a/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
+++ b/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
@@ -1,13 +1,13 @@
-# model
+### model
 model_name_or_path: ISTA-DASLab/Meta-Llama-3-8B-Instruct-AQLM-2Bit-1x16
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -15,14 +15,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -31,7 +31,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml b/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
index 8941d6b2..05cb2a3f 100644
--- a/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
+++ b/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
@@ -1,13 +1,13 @@
-# model
+### model
 model_name_or_path: TechxGenus/Meta-Llama-3-8B-Instruct-AWQ
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -15,14 +15,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -31,7 +31,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml b/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
index 885fcd83..d6da94d3 100644
--- a/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
+++ b/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
@@ -1,14 +1,14 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 quantization_bit: 4
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -16,14 +16,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -32,7 +32,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml b/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
index 87a404a0..f2ba7490 100644
--- a/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
+++ b/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
@@ -1,13 +1,13 @@
-# model
+### model
 model_name_or_path: TechxGenus/Meta-Llama-3-8B-Instruct-GPTQ
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -15,14 +15,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -31,7 +31,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps