mirror of https://github.com/hiyouga/LLaMA-Factory.git
synced 2025-11-04 18:02:19 +08:00

Former-commit-id: 8608fa268cde5cddf8d0c6c2eb2cb5fa246c1831
parent 1a3764ab8f
commit 82d744716a
@@ -107,13 +107,13 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_l
 
 ### LoRA Fine-Tuning on Multiple GPUs
 
-#### Supervised Fine-Tuning with Accelerate on Single Node
+#### Supervised Fine-Tuning on Single Node
 
 ```bash
 CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml
 ```
 
-#### Supervised Fine-Tuning with Accelerate on Multiple Nodes
+#### Supervised Fine-Tuning on Multiple Nodes
 
 ```bash
 CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml
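For context on the multi-node command above: `NNODES`, `RANK`, `MASTER_ADDR`, and `MASTER_PORT` are the standard torch.distributed rendezvous settings, and the same command is repeated on the second node with `RANK=1`. A minimal sketch of the torchrun-style launch these variables presumably drive (the `src/train.py` entry point here is hypothetical; the CLI resolves its own):

```python
import os
import subprocess

# Sketch only: map the README's environment variables onto an equivalent
# torchrun invocation. "src/train.py" is a hypothetical entry point.
env = os.environ
subprocess.run(
    [
        "torchrun",
        f"--nnodes={env.get('NNODES', '1')}",
        f"--node_rank={env.get('RANK', '0')}",
        f"--master_addr={env.get('MASTER_ADDR', '127.0.0.1')}",
        f"--master_port={env.get('MASTER_PORT', '29500')}",
        "--nproc_per_node=4",  # matches CUDA_VISIBLE_DEVICES=0,1,2,3
        "src/train.py",  # hypothetical entry point
        "examples/lora_multi_gpu/llama3_lora_sft.yaml",
    ],
    check=True,
)
```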
@@ -136,13 +136,13 @@ ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_npu
 
 ### Full-Parameter Fine-Tuning on Multiple GPUs
 
-#### Supervised Fine-Tuning with Accelerate on Single Node
+#### Supervised Fine-Tuning on Single Node
 
 ```bash
 CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml
 ```
 
-#### Supervised Fine-Tuning with Accelerate on Multiple Nodes
+#### Supervised Fine-Tuning on Multiple Nodes
 
 ```bash
 CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml
@@ -107,13 +107,13 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_l
 
 ### 多 GPU LoRA 微调
 
-#### 使用 Accelerate 进行单节点训练
+#### 在单机上进行指令监督微调
 
 ```bash
 CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml
 ```
 
-#### 使用 Accelerate 进行多节点训练
+#### 在多机上进行指令监督微调
 
 ```bash
 CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml
@@ -128,7 +128,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_gpu/llam
 
 ### 多 NPU LoRA 微调
 
-#### 使用 DeepSpeed ZeRO-0 训练
+#### 使用 DeepSpeed ZeRO-0 进行指令监督微调
 
 ```bash
 ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_npu/llama3_lora_sft_ds.yaml
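The ZeRO-0 heading above refers to running DeepSpeed with `zero_optimization` stage 0, i.e. DeepSpeed's engine with plain data parallelism and no optimizer-state partitioning. A minimal sketch of such a config using standard DeepSpeed keys (the actual JSON shipped in the repo may differ; `"auto"` defers each value to the Hugging Face Trainer integration):

```python
# Minimal DeepSpeed ZeRO stage-0 config sketch (standard DeepSpeed keys;
# "auto" lets the Hugging Face integration fill in the value).
ds_config = {
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "zero_optimization": {"stage": 0},  # stage 0 = no partitioning
    "bf16": {"enabled": "auto"},
}
```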
@@ -136,13 +136,13 @@ ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_npu
 
 ### 多 GPU 全参数微调
 
-#### 使用 DeepSpeed 进行单节点训练
+#### 在单机上进行指令监督微调
 
 ```bash
 CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml
 ```
 
-#### 使用 DeepSpeed 进行多节点训练
+#### 在多机上进行指令监督微调
 
 ```bash
 CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml
@@ -28,10 +28,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 pure_bf16: true
 
 ### eval
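The `0.0001` → `1.0e-4` rewrites repeated across these config hunks are cosmetic but not arbitrary: assuming the configs are read with PyYAML's `safe_load`, the YAML 1.1 float resolver only recognizes exponent literals that contain both a decimal point and a signed exponent, so `1.0e-4` round-trips as a float while the shorter `1e-4` would silently load as a string:

```python
import yaml  # PyYAML

# PyYAML's YAML 1.1 float resolver needs a dot AND a signed exponent,
# so "1.0e-4" is the safe exponent spelling.
print(yaml.safe_load("lr: 0.0001"))  # {'lr': 0.0001} -> float
print(yaml.safe_load("lr: 1.0e-4"))  # {'lr': 0.0001} -> float
print(yaml.safe_load("lr: 1e-4"))    # {'lr': '1e-4'} -> str (gotcha)
```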
@@ -29,10 +29,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval
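The `warmup_steps: 0.1` → `warmup_ratio: 0.1` change is the substantive fix in these hunks: in Hugging Face `TrainingArguments`, `warmup_steps` is an integer step count, so a fractional value there is at best ignored and at worst a type error, while `warmup_ratio` is a fraction of the total training steps. Roughly how the Trainer resolves the two (a paraphrase of `TrainingArguments.get_warmup_steps`, not the exact source):

```python
import math

def resolve_warmup(warmup_steps: int, warmup_ratio: float, num_training_steps: int) -> int:
    # An explicit step count takes precedence; otherwise the ratio is
    # scaled by the total number of optimizer steps.
    return warmup_steps if warmup_steps > 0 else math.ceil(num_training_steps * warmup_ratio)

print(resolve_warmup(0, 0.1, 1000))  # 100 warmup steps over a 1000-step run
```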
@@ -29,10 +29,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 1
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 pure_bf16: true
 
 ### eval
@@ -27,10 +27,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval
@@ -26,10 +26,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval
@@ -26,10 +26,10 @@ overwrite_output_dir: true
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 optim: paged_adamw_8bit
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 pure_bf16: true
 
 ### eval
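One config in this batch also pins `optim: paged_adamw_8bit`, which is one of the optimizer names `transformers` accepts through the `optim` argument (backed by bitsandbytes' paged 8-bit AdamW). A minimal sketch of the corresponding `TrainingArguments`, with a placeholder output path:

```python
from transformers import TrainingArguments

# "paged_adamw_8bit" requires bitsandbytes; the YAML keys in the hunk
# above correspond to TrainingArguments fields of the same name.
args = TrainingArguments(
    output_dir="out",  # placeholder path
    optim="paged_adamw_8bit",
    learning_rate=1.0e-4,
    warmup_ratio=0.1,
)
```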
@@ -28,10 +28,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 2
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval
@@ -28,10 +28,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 2
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval
@@ -29,10 +29,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 2
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval
@@ -29,10 +29,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 2
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval
@@ -27,10 +27,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.000005
+learning_rate: 5.0e-6
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval
@@ -25,10 +25,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.000005
+learning_rate: 5.0e-6
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval
@@ -26,10 +26,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.00001
+learning_rate: 1.0e-5
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### generate
@@ -24,10 +24,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval
@@ -25,10 +25,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.00001
+learning_rate: 1.0e-5
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval
@@ -25,10 +25,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval
@@ -26,10 +26,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval
@@ -25,10 +25,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval
@@ -25,10 +25,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval
@@ -26,10 +26,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval
@@ -25,10 +25,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval
@@ -107,7 +107,7 @@ class ModelArguments:
     )
     vllm_maxlen: int = field(
         default=2048,
-        metadata={"help": "Maximum sequence length of the vLLM engine (including prompt and output)."},
+        metadata={"help": "Maximum sequence (prompt + response) length of the vLLM engine."},
     )
     vllm_gpu_util: float = field(
         default=0.9,
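The reworded help string clarifies that `vllm_maxlen` bounds the prompt and the generated response together. Assuming the value is forwarded to vLLM's `max_model_len` (and `vllm_gpu_util` to `gpu_memory_utilization`), the engine-side effect looks like:

```python
from vllm import LLM, SamplingParams

# Sketch under the assumption stated above: max_model_len caps
# prompt + response tokens together for every request.
llm = LLM(
    model="meta-llama/Meta-Llama-3-8B-Instruct",  # example model
    max_model_len=2048,
    gpu_memory_utilization=0.9,
)
outputs = llm.generate(["Hello!"], SamplingParams(max_tokens=128))
```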