From 845e750abd9918400a26f9518971bc5359f6fef8 Mon Sep 17 00:00:00 2001 From: hiyouga Date: Wed, 28 Feb 2024 23:19:25 +0800 Subject: [PATCH] add examples Former-commit-id: 804c1e7083e56b4a132d4d820ea9d8d50e5499e9 --- README.md | 25 ++++++++++++----- README_zh.md | 25 ++++++++++++----- examples/full_multi_gpu/ds_z2_config.json | 27 +++++++++++++++++++ examples/full_multi_gpu/ds_z3_config.json | 33 +++++++++++++++++++++++ examples/full_multi_gpu/sft.sh | 29 ++++++++++++++++++++ examples/lora_multi_gpu/config.yaml | 16 +++++++++++ examples/lora_multi_gpu/sft.sh | 30 +++++++++++++++++++++ examples/lora_single_gpu/dpo.sh | 33 +++++++++++++++++++++++ examples/lora_single_gpu/ppo.sh | 31 +++++++++++++++++++++ examples/lora_single_gpu/predict.sh | 18 +++++++++++++ examples/lora_single_gpu/pretrain.sh | 29 ++++++++++++++++++++ examples/lora_single_gpu/reward.sh | 31 +++++++++++++++++++++ examples/lora_single_gpu/sft.sh | 30 +++++++++++++++++++++ examples/qlora_single_gpu/aqlm.sh | 30 +++++++++++++++++++++ examples/qlora_single_gpu/awq.sh | 30 +++++++++++++++++++++ examples/qlora_single_gpu/bitsandbytes.sh | 31 +++++++++++++++++++++ examples/qlora_single_gpu/gptq.sh | 30 +++++++++++++++++++++ 17 files changed, 466 insertions(+), 12 deletions(-) create mode 100644 examples/full_multi_gpu/ds_z2_config.json create mode 100644 examples/full_multi_gpu/ds_z3_config.json create mode 100644 examples/full_multi_gpu/sft.sh create mode 100644 examples/lora_multi_gpu/config.yaml create mode 100644 examples/lora_multi_gpu/sft.sh create mode 100644 examples/lora_single_gpu/dpo.sh create mode 100644 examples/lora_single_gpu/ppo.sh create mode 100644 examples/lora_single_gpu/predict.sh create mode 100644 examples/lora_single_gpu/pretrain.sh create mode 100644 examples/lora_single_gpu/reward.sh create mode 100644 examples/lora_single_gpu/sft.sh create mode 100644 examples/qlora_single_gpu/aqlm.sh create mode 100644 examples/qlora_single_gpu/awq.sh create mode 100644 examples/qlora_single_gpu/bitsandbytes.sh create mode 100644 examples/qlora_single_gpu/gptq.sh diff --git a/README.md b/README.md index df64aa75..23f7a3ed 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/6ba60acc-e2e2-4bec-b846 - **Scalable resources**: 32-bit full-tuning, 16-bit freeze tuning, 16-bit LoRA tuning, 2/4/8-bit QLoRA with AQLM/AWQ/GPTQ/LLM.int8. - **Advanced algorithms**: DoRA, LongLoRA, LLaMA Pro, LoftQ, agent tuning. - **Intriguing tricks**: FlashAttention-2, Unsloth, RoPE scaling, NEFTune, rsLoRA. +- **Experiment monitors**: LlamaBoard, TensorBoard, Wandb, MLflow, etc. 
## Benchmark @@ -236,15 +237,27 @@ huggingface-cli login ## Requirement -- Python 3.8+ and PyTorch 1.13.1+ -- 🤗Transformers, Datasets, Accelerate, PEFT and TRL -- sentencepiece, protobuf and tiktoken -- jieba, rouge-chinese and nltk (used at evaluation and predict) -- gradio and matplotlib (used in web UI) -- uvicorn, fastapi and sse-starlette (used in API) +| Mandatory | Minimum | Recommend | +| ------------ | ------- | --------- | +| python | 3.8 | 3.10 | +| torch | 1.13.1 | 2.2.1 | +| transformers | 4.37.2 | 4.38.1 | +| datasets | 2.14.3 | 2.17.1 | +| accelerate | 0.27.2 | 0.27.2 | +| peft | 0.9.0 | 0.9.0 | +| trl | 0.7.11 | 0.7.11 | + +| Optional | Minimum | Recommend | +| ------------ | ------- | --------- | +| CUDA | 11.6 | 12.2 | +| deepspeed | 0.10.0 | 0.13.4 | +| bitsandbytes | 0.39.0 | 0.41.3 | +| flash-attn | 2.3.0 | 2.5.5 | ### Hardware Requirement +\* *estimated* + | Method | Bits | 7B | 13B | 30B | 65B | 8x7B | | ------ | ---- | ----- | ----- | ----- | ------ | ------ | | Full | 16 | 160GB | 320GB | 600GB | 1200GB | 900GB | diff --git a/README_zh.md b/README_zh.md index c8db1485..7235321a 100644 --- a/README_zh.md +++ b/README_zh.md @@ -45,6 +45,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/6ba60acc-e2e2-4bec-b846 - **多种精度**:32 比特全参数训练、16 比特部分参数训练、16比特 LoRA 训练、基于 AQLM/AWQ/GPTQ/LLM.int8 的 2/4/8 比特 LoRA 训练。 - **先进算法**: DoRA、LongLoRA、LLaMA Pro、LoftQ、agent tuning。 - **新鲜技巧**:FlashAttention-2、Unsloth、RoPE scaling、NEFTune、rsLoRA。 +- **实验监控**:LlamaBoard、TensorBoard、Wandb、MLflow 等等。 ## 性能指标 @@ -236,15 +237,27 @@ huggingface-cli login ## 软硬件依赖 -- Python 3.8+ 和 PyTorch 1.13.1+ -- 🤗Transformers, Datasets, Accelerate, PEFT 和 TRL -- sentencepiece, protobuf 和 tiktoken -- jieba, rouge-chinese 和 nltk (用于评估及预测) -- gradio 和 matplotlib (用于网页端交互) -- uvicorn, fastapi 和 sse-starlette (用于 API) +| 必需项 | 至少 | 推荐 | +| ------------ | ------- | --------- | +| python | 3.8 | 3.10 | +| torch | 1.13.1 | 2.2.1 | +| transformers | 4.37.2 | 4.38.1 | +| datasets | 2.14.3 | 2.17.1 | +| accelerate | 0.27.2 | 0.27.2 | +| peft | 0.9.0 | 0.9.0 | +| trl | 0.7.11 | 0.7.11 | + +| 可选项 | 至少 | 推荐 | +| ------------ | ------- | --------- | +| CUDA | 11.6 | 12.2 | +| deepspeed | 0.10.0 | 0.13.4 | +| bitsandbytes | 0.39.0 | 0.41.3 | +| flash-attn | 2.3.0 | 2.5.5 | ### 硬件依赖 +\* *估算值* + | 训练方法 | 精度 | 7B | 13B | 30B | 65B | 8x7B | | ------- | ---- | ----- | ----- | ----- | ------ | ------ | | 全参数 | 16 | 160GB | 320GB | 600GB | 1200GB | 900GB | diff --git a/examples/full_multi_gpu/ds_z2_config.json b/examples/full_multi_gpu/ds_z2_config.json new file mode 100644 index 00000000..3d42aa15 --- /dev/null +++ b/examples/full_multi_gpu/ds_z2_config.json @@ -0,0 +1,27 @@ +{ + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "zero_allow_untested_optimizer": true, + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "initial_scale_power": 16, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "allgather_bucket_size": 5e8, + "reduce_scatter": true, + "reduce_bucket_size": 5e8, + "overlap_comm": true, + "contiguous_gradients": true + } +} \ No newline at end of file diff --git a/examples/full_multi_gpu/ds_z3_config.json b/examples/full_multi_gpu/ds_z3_config.json new file mode 100644 index 00000000..9c5f55da --- /dev/null +++ b/examples/full_multi_gpu/ds_z3_config.json @@ -0,0 +1,33 @@ +{ + 
"train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "zero_allow_untested_optimizer": true, + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "initial_scale_power": 16, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu" + }, + "offload_param": { + "device": "cpu" + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_16bit_weights_on_model_save": true + } +} \ No newline at end of file diff --git a/examples/full_multi_gpu/sft.sh b/examples/full_multi_gpu/sft.sh new file mode 100644 index 00000000..e3ced5f0 --- /dev/null +++ b/examples/full_multi_gpu/sft.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +deepspeed --num_gpus 4 ../../src/train_bash.py \ + --deepspeed ds_z3_config.json \ + --stage sft \ + --do_train \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --dataset alpaca_gpt4_en \ + --dataset_dir ../../data \ + --template default \ + --finetuning_type full \ + --output_dir ../../saves/LLaMA2-7B/full/sft \ + --overwrite_cache \ + --overwrite_output_dir \ + --cutoff_len 1024 \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 2 \ + --lr_scheduler_type cosine \ + --logging_steps 10 \ + --save_steps 100 \ + --eval_steps 100 \ + --evaluation_strategy steps \ + --learning_rate 5e-5 \ + --num_train_epochs 3.0 \ + --max_samples 3000 \ + --val_size 0.1 \ + --plot_loss \ + --fp16 diff --git a/examples/lora_multi_gpu/config.yaml b/examples/lora_multi_gpu/config.yaml new file mode 100644 index 00000000..ddb5c910 --- /dev/null +++ b/examples/lora_multi_gpu/config.yaml @@ -0,0 +1,16 @@ +compute_environment: LOCAL_MACHINE +debug: false +distributed_type: MULTI_GPU +downcast_bf16: 'no' +gpu_ids: all +machine_rank: 0 +main_training_function: main +mixed_precision: fp16 +num_machines: 1 +num_processes: 4 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false diff --git a/examples/lora_multi_gpu/sft.sh b/examples/lora_multi_gpu/sft.sh new file mode 100644 index 00000000..525e4f67 --- /dev/null +++ b/examples/lora_multi_gpu/sft.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch --config_file config.yaml ../../src/train_bash.py \ + --stage sft \ + --do_train \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --dataset alpaca_gpt4_en,glaive_toolcall \ + --dataset_dir ../../data \ + --template default \ + --finetuning_type lora \ + --lora_target q_proj,v_proj \ + --output_dir ../../saves/LLaMA2-7B/lora/sft \ + --overwrite_cache \ + --overwrite_output_dir \ + --cutoff_len 1024 \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 2 \ + --lr_scheduler_type cosine \ + --logging_steps 10 \ + --save_steps 100 \ + --eval_steps 100 \ + --evaluation_strategy steps \ + --load_best_model_at_end \ + --learning_rate 5e-5 \ + --num_train_epochs 3.0 \ + --max_samples 3000 \ + --val_size 0.1 \ + --plot_loss \ + --fp16 diff --git a/examples/lora_single_gpu/dpo.sh b/examples/lora_single_gpu/dpo.sh new file mode 100644 index 00000000..8c2f68c9 --- /dev/null +++ b/examples/lora_single_gpu/dpo.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \ 
+ --stage dpo \ + --do_train \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \ + --create_new_adapter \ + --dataset comparison_gpt4_en \ + --dataset_dir ../../data \ + --template default \ + --finetuning_type lora \ + --lora_target q_proj,v_proj \ + --output_dir ../../saves/LLaMA2-7B/lora/dpo \ + --overwrite_cache \ + --overwrite_output_dir \ + --cutoff_len 1024 \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 8 \ + --lr_scheduler_type cosine \ + --logging_steps 10 \ + --save_steps 100 \ + --eval_steps 100 \ + --evaluation_strategy steps \ + --load_best_model_at_end \ + --learning_rate 1e-5 \ + --num_train_epochs 1.0 \ + --max_samples 1000 \ + --val_size 0.1 \ + --dpo_ftx 1.0 \ + --plot_loss \ + --fp16 diff --git a/examples/lora_single_gpu/ppo.sh b/examples/lora_single_gpu/ppo.sh new file mode 100644 index 00000000..4ec0cbfb --- /dev/null +++ b/examples/lora_single_gpu/ppo.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \ + --stage ppo \ + --do_train \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \ + --create_new_adapter \ + --dataset alpaca_gpt4_en \ + --dataset_dir ../../data \ + --template default \ + --finetuning_type lora \ + --lora_target q_proj,v_proj \ + --reward_model ../../saves/LLaMA2-7B/lora/reward \ + --output_dir ../../saves/LLaMA2-7B/lora/ppo \ + --overwrite_cache \ + --overwrite_output_dir \ + --cutoff_len 512 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 8 \ + --lr_scheduler_type cosine \ + --logging_steps 10 \ + --save_steps 100 \ + --learning_rate 1e-5 \ + --num_train_epochs 1.0 \ + --max_samples 1000 \ + --top_k 0 \ + --top_p 0.9 \ + --max_new_tokens 256 \ + --plot_loss \ + --fp16 diff --git a/examples/lora_single_gpu/predict.sh b/examples/lora_single_gpu/predict.sh new file mode 100644 index 00000000..1fb45396 --- /dev/null +++ b/examples/lora_single_gpu/predict.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \ + --stage sft \ + --do_predict \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft,../../saves/LLaMA2-7B/lora/dpo \ + --dataset alpaca_gpt4_en,glaive_toolcall \ + --dataset_dir ../../data \ + --template default \ + --finetuning_type lora \ + --output_dir ../../saves/LLaMA2-7B/lora/predict \ + --overwrite_cache \ + --overwrite_output_dir \ + --cutoff_len 1024 \ + --per_device_eval_batch_size 1 \ + --max_samples 20 \ + --predict_with_generate diff --git a/examples/lora_single_gpu/pretrain.sh b/examples/lora_single_gpu/pretrain.sh new file mode 100644 index 00000000..37adf51f --- /dev/null +++ b/examples/lora_single_gpu/pretrain.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \ + --stage pt \ + --do_train \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --dataset c4_demo \ + --dataset_dir ../../data \ + --finetuning_type lora \ + --lora_target q_proj,v_proj \ + --output_dir ../../saves/LLaMA2-7B/lora/pretrain \ + --overwrite_cache \ + --overwrite_output_dir \ + --cutoff_len 1024 \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 8 \ + --lr_scheduler_type cosine \ + --logging_steps 10 \ + --save_steps 100 \ + --eval_steps 100 \ + --evaluation_strategy steps \ + --load_best_model_at_end \ + --learning_rate 5e-5 \ + --num_train_epochs 
3.0 \ + --max_samples 10000 \ + --val_size 0.1 \ + --plot_loss \ + --fp16 diff --git a/examples/lora_single_gpu/reward.sh b/examples/lora_single_gpu/reward.sh new file mode 100644 index 00000000..7c19e9aa --- /dev/null +++ b/examples/lora_single_gpu/reward.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \ + --stage rm \ + --do_train \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \ + --create_new_adapter \ + --dataset comparison_gpt4_en \ + --dataset_dir ../../data \ + --template default \ + --finetuning_type lora \ + --lora_target q_proj,v_proj \ + --output_dir ../../saves/LLaMA2-7B/lora/reward \ + --overwrite_cache \ + --overwrite_output_dir \ + --cutoff_len 1024 \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 8 \ + --lr_scheduler_type cosine \ + --logging_steps 10 \ + --save_steps 100 \ + --eval_steps 100 \ + --evaluation_strategy steps \ + --learning_rate 1e-5 \ + --num_train_epochs 1.0 \ + --max_samples 5000 \ + --val_size 0.1 \ + --plot_loss \ + --fp16 diff --git a/examples/lora_single_gpu/sft.sh b/examples/lora_single_gpu/sft.sh new file mode 100644 index 00000000..41d7851a --- /dev/null +++ b/examples/lora_single_gpu/sft.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \ + --stage sft \ + --do_train \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --dataset alpaca_gpt4_en,glaive_toolcall \ + --dataset_dir ../../data \ + --template default \ + --finetuning_type lora \ + --lora_target q_proj,v_proj \ + --output_dir ../../saves/LLaMA2-7B/lora/sft \ + --overwrite_cache \ + --overwrite_output_dir \ + --cutoff_len 1024 \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 8 \ + --lr_scheduler_type cosine \ + --logging_steps 10 \ + --save_steps 100 \ + --eval_steps 100 \ + --evaluation_strategy steps \ + --load_best_model_at_end \ + --learning_rate 5e-5 \ + --num_train_epochs 3.0 \ + --max_samples 3000 \ + --val_size 0.1 \ + --plot_loss \ + --fp16 diff --git a/examples/qlora_single_gpu/aqlm.sh b/examples/qlora_single_gpu/aqlm.sh new file mode 100644 index 00000000..68eb4482 --- /dev/null +++ b/examples/qlora_single_gpu/aqlm.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \ + --stage sft \ + --do_train \ + --model_name_or_path BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf \ + --dataset alpaca_gpt4_en,glaive_toolcall \ + --dataset_dir ../../data \ + --template default \ + --finetuning_type lora \ + --lora_target q_proj,v_proj \ + --output_dir ../../saves/LLaMA2-7B/lora/sft \ + --overwrite_cache \ + --overwrite_output_dir \ + --cutoff_len 1024 \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 8 \ + --lr_scheduler_type cosine \ + --logging_steps 10 \ + --save_steps 100 \ + --eval_steps 100 \ + --evaluation_strategy steps \ + --load_best_model_at_end \ + --learning_rate 5e-5 \ + --num_train_epochs 3.0 \ + --max_samples 3000 \ + --val_size 0.1 \ + --plot_loss \ + --fp16 diff --git a/examples/qlora_single_gpu/awq.sh b/examples/qlora_single_gpu/awq.sh new file mode 100644 index 00000000..b0f1f46b --- /dev/null +++ b/examples/qlora_single_gpu/awq.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \ + --stage sft \ + --do_train \ + --model_name_or_path TheBloke/Llama-2-7B-AWQ \ + --dataset 
alpaca_gpt4_en,glaive_toolcall \ + --dataset_dir ../../data \ + --template default \ + --finetuning_type lora \ + --lora_target q_proj,v_proj \ + --output_dir ../../saves/LLaMA2-7B/lora/sft \ + --overwrite_cache \ + --overwrite_output_dir \ + --cutoff_len 1024 \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 8 \ + --lr_scheduler_type cosine \ + --logging_steps 10 \ + --save_steps 100 \ + --eval_steps 100 \ + --evaluation_strategy steps \ + --load_best_model_at_end \ + --learning_rate 5e-5 \ + --num_train_epochs 3.0 \ + --max_samples 3000 \ + --val_size 0.1 \ + --plot_loss \ + --fp16 diff --git a/examples/qlora_single_gpu/bitsandbytes.sh b/examples/qlora_single_gpu/bitsandbytes.sh new file mode 100644 index 00000000..84bbb426 --- /dev/null +++ b/examples/qlora_single_gpu/bitsandbytes.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \ + --stage sft \ + --do_train \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --dataset alpaca_gpt4_en,glaive_toolcall \ + --dataset_dir ../../data \ + --template default \ + --finetuning_type lora \ + --lora_target q_proj,v_proj \ + --output_dir ../../saves/LLaMA2-7B/lora/sft \ + --overwrite_cache \ + --overwrite_output_dir \ + --cutoff_len 1024 \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 8 \ + --lr_scheduler_type cosine \ + --logging_steps 10 \ + --save_steps 100 \ + --eval_steps 100 \ + --evaluation_strategy steps \ + --load_best_model_at_end \ + --learning_rate 5e-5 \ + --num_train_epochs 3.0 \ + --max_samples 3000 \ + --val_size 0.1 \ + --quantization_bit 4 \ + --plot_loss \ + --fp16 diff --git a/examples/qlora_single_gpu/gptq.sh b/examples/qlora_single_gpu/gptq.sh new file mode 100644 index 00000000..a971b09f --- /dev/null +++ b/examples/qlora_single_gpu/gptq.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \ + --stage sft \ + --do_train \ + --model_name_or_path TheBloke/Llama-2-7B-GPTQ \ + --dataset alpaca_gpt4_en,glaive_toolcall \ + --dataset_dir ../../data \ + --template default \ + --finetuning_type lora \ + --lora_target q_proj,v_proj \ + --output_dir ../../saves/LLaMA2-7B/lora/sft \ + --overwrite_cache \ + --overwrite_output_dir \ + --cutoff_len 1024 \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 8 \ + --lr_scheduler_type cosine \ + --logging_steps 10 \ + --save_steps 100 \ + --eval_steps 100 \ + --evaluation_strategy steps \ + --load_best_model_at_end \ + --learning_rate 5e-5 \ + --num_train_epochs 3.0 \ + --max_samples 3000 \ + --val_size 0.1 \ + --plot_loss \ + --fp16
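
Usage note (not part of the patch): the example scripts above locate the sources, data, and checkpoints through relative paths (`../../src`, `../../data`, `../../saves`), so each one is meant to be launched from inside its own `examples/...` directory, and the later single-GPU stages reuse the adapters saved by the earlier ones (`reward.sh`, `ppo.sh`, and `dpo.sh` load `../../saves/LLaMA2-7B/lora/sft`; `ppo.sh` additionally needs the reward model written by `reward.sh`; `predict.sh` stacks the SFT and DPO adapters). Below is a minimal sketch of one way to chain them, assuming the repository layout in this patch and, for the multi-GPU variants, four visible GPUs as configured in `config.yaml` and `deepspeed --num_gpus 4`; it is an illustration, not a command sequence shipped with the patch.

```bash
#!/bin/bash
# Sketch: run the new examples in a typical order, starting from the repo root.
# Each script is executed from its own directory so its ../../ paths resolve.

cd examples/lora_single_gpu

bash pretrain.sh   # optional continued pre-training on c4_demo (independent of the stages below)
bash sft.sh        # supervised fine-tuning on alpaca_gpt4_en + glaive_toolcall
bash reward.sh     # reward model on comparison_gpt4_en, reusing the SFT adapter
bash ppo.sh        # PPO using the reward model saved to ../../saves/LLaMA2-7B/lora/reward
bash dpo.sh        # alternatively, DPO directly on top of the SFT adapter
bash predict.sh    # batch prediction with the SFT + DPO adapters

# Multi-GPU variants (assumes 4 GPUs, matching config.yaml / --num_gpus 4):
cd ../lora_multi_gpu && bash sft.sh    # accelerate launch with config.yaml
cd ../full_multi_gpu && bash sft.sh    # deepspeed with ds_z3_config.json
```

The shared `../../saves/LLaMA2-7B/lora/*` output directories are what make this chaining work: each later stage finds the previous stage's adapter at a fixed path, so the scripts can be run one after another without editing any arguments.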