Mirror of https://github.com/hiyouga/LLaMA-Factory.git (synced 2025-11-04 18:02:19 +08:00)

commit ba0da83031 (parent 0a82e15e7c)

    add npu examples

    Former-commit-id: 0f21e68e2dbd84c820d66d5c6d980004efc51d51
@@ -6,7 +6,7 @@ RANK=0
 MASTER_ADDR=192.168.0.1
 MASTER_PORT=29500
 
-CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
+CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun \
     --nproc_per_node $NPROC_PER_NODE \
     --nnodes $NNODES \
     --node_rank $RANK \
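
Note: torchrun is the console-script entry point that PyTorch installs for the
same launcher module invoked above, so this hunk (and the two below) change the
spelling of the command, not its behavior. A minimal sketch of the equivalence,
assuming only that a recent torch is installed:

    # torchrun resolves to torch.distributed.run:main, the same function that
    # "python -m torch.distributed.run" executes; both parse identical flags.
    from torch.distributed.run import main as torchrun_main

    # Left commented out so the sketch has no side effects; both CLI spellings
    # end up in this call:
    # torchrun_main(["--nproc_per_node", "4", "--standalone", "train.py"])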

@@ -1,9 +1,15 @@
 #!/bin/bash
 
+NPROC_PER_NODE=4
+NNODES=1
+RANK=0
+MASTER_ADDR=127.0.0.1
+MASTER_PORT=29500
+
-CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
+CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun \
     --nproc_per_node $NPROC_PER_NODE \
-    --nnodes 1 \
-    --standalone \
+    --nnodes $NNODES \
+    --node_rank $RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT \
     src/train.py examples/full_multi_gpu/llama3_full_sft.yaml

@@ -1,9 +1,15 @@
 #!/bin/bash
 
+NPROC_PER_NODE=4
+NNODES=1
+RANK=0
+MASTER_ADDR=127.0.0.1
+MASTER_PORT=29500
+
-CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
+CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun \
     --nproc_per_node $NPROC_PER_NODE \
-    --nnodes 1 \
-    --standalone \
+    --nnodes $NNODES \
+    --node_rank $RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT \
     src/train.py examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
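
Note: dropping --standalone in favor of explicit rendezvous variables keeps the
single-node defaults (127.0.0.1:29500) while letting the same script scale out;
only NNODES, RANK, and MASTER_ADDR need editing per node. A minimal sketch of
what the flags become inside each worker, assuming the standard env:// rendezvous
that torch.distributed uses:

    # torchrun exports these for every worker it spawns; the .get() defaults
    # make the sketch runnable outside torchrun as well.
    import os

    master_addr = os.environ.get("MASTER_ADDR", "127.0.0.1")  # --master_addr
    master_port = os.environ.get("MASTER_PORT", "29500")      # --master_port
    world_size = int(os.environ.get("WORLD_SIZE", "1"))  # nnodes * nproc_per_node
    rank = int(os.environ.get("RANK", "0"))              # global worker rank
    print(master_addr, master_port, world_size, rank)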

examples/lora_multi_npu/ds_zero0.sh (new file, +15 lines)
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+NPROC_PER_NODE=4
+NNODES=1
+RANK=0
+MASTER_ADDR=127.0.0.1
+MASTER_PORT=29500
+
+ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 torchrun \
+    --nproc_per_node $NPROC_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT \
+    src/train.py examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
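
Note: ASCEND_RT_VISIBLE_DEVICES plays the same device-masking role for Ascend
NPUs that CUDA_VISIBLE_DEVICES plays for CUDA GPUs. A minimal pre-flight sketch,
assuming the torch-npu plugin is installed (which is what provides torch.npu
and makes the transformers check below return True):

    import torch
    from transformers import is_torch_npu_available

    if is_torch_npu_available():
        # device_count() reflects ASCEND_RT_VISIBLE_DEVICES the way
        # torch.cuda.device_count() reflects CUDA_VISIBLE_DEVICES
        print("visible NPUs:", torch.npu.device_count())
    else:
        print("no Ascend NPU backend detected")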

examples/lora_multi_npu/llama3_lora_sft_ds.yaml (new file, +42 lines)
@@ -0,0 +1,42 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+# method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+
+# ddp
+ddp_timeout: 180000000
+deepspeed: examples/deepspeed/ds_z0_config.json
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 2
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_steps: 0.1
+fp16: true
+
+# eval
+val_size: 0.1
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
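
Note: combined with the 4-process launch in ds_zero0.sh, this config yields an
effective batch size of 8 per optimizer step. A quick check, with values copied
from the files above (nproc_per_node comes from the launch script, not the yaml):

    per_device_train_batch_size = 1
    gradient_accumulation_steps = 2
    nproc_per_node = 4  # NPROC_PER_NODE in ds_zero0.sh

    print(per_device_train_batch_size * gradient_accumulation_steps * nproc_per_node)  # 8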

@@ -1,9 +1,10 @@
+import os
 from types import MethodType
 from typing import TYPE_CHECKING, Any, Dict
 
 import torch
 from peft import PeftModel
-from transformers import PreTrainedModel, PreTrainedTokenizerBase
+from transformers import PreTrainedModel, PreTrainedTokenizerBase, is_torch_npu_available
 from transformers.integrations import is_deepspeed_zero3_enabled
 
 from ..extras.logging import get_logger

@@ -44,6 +45,10 @@ def patch_config(
     if model_args.compute_dtype is None:  # priority: bf16 > fp16 > fp32
         model_args.compute_dtype = infer_optim_dtype(model_dtype=getattr(config, "torch_dtype", None))
 
+    if is_torch_npu_available():
+        use_jit_compile = os.environ.get("JIT_COMPILE", "0").lower() in ["true", "1"]
+        torch.npu.set_compile_mode(jit_compile=use_jit_compile)
+
     configure_attn_implementation(config, model_args)
     configure_rope(config, model_args, is_trainable)
     configure_longlora(config, model_args, is_trainable)
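
Note: the JIT_COMPILE flag is parsed permissively; only "true" or "1"
(case-insensitive) enable JIT compilation on the NPU, and an unset variable
defaults to off. A minimal sketch of the same semantics:

    import os

    def jit_compile_requested() -> bool:
        # mirrors the check in patch_config: "1", "true", "TRUE", ... -> True;
        # unset, "0", "false", or any other value -> False
        return os.environ.get("JIT_COMPILE", "0").lower() in ["true", "1"]

    print(jit_compile_requested())  # False unless JIT_COMPILE is set to 1/true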

@@ -56,7 +61,7 @@ def patch_config(
         logger.info("Using KV cache for faster generation.")
 
     if getattr(config, "model_type", None) == "qwen":
-        setattr(config, "use_flash_attn", model_args.flash_attn)
+        setattr(config, "use_flash_attn", model_args.flash_attn == "fa2")
         for dtype_name, dtype in [("fp16", torch.float16), ("bf16", torch.bfloat16), ("fp32", torch.float32)]:
             setattr(config, dtype_name, model_args.compute_dtype == dtype)
 
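
Note: this hunk also fixes a latent bug. flash_attn is a string selector (the
hunks in this commit compare it against "sdpa" and "fa2"), so storing it
directly made use_flash_attn truthy for every non-empty backend name. A minimal
sketch; values other than "sdpa" and "fa2" would be assumptions about the
option set:

    flash_attn = "sdpa"

    use_flash_attn = flash_attn == "fa2"  # False: only FlashAttention-2 qualifies
    # the old code stored the raw string, which downstream consumers of the
    # qwen config would treat as truthy even when SDPA was selected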

@@ -22,7 +22,7 @@ def configure_attn_implementation(config: "PretrainedConfig", model_args: "Model
 
     elif model_args.flash_attn == "sdpa":
         if not is_sdpa_available():
-            logger.warning("Torch>=2.1.1 is required for SDPA attention.")
+            logger.warning("torch>=2.1.1 is required for SDPA attention.")
             return
 
         requested_attn_implementation = "sdpa"

@@ -52,4 +52,4 @@ def print_attn_implementation(config: "PretrainedConfig") -> None:
     elif attn_implementation == "sdpa":
         logger.info("Using torch SDPA for faster training and inference.")
     else:
-        logger.info("Using vanilla Attention implementation.")
+        logger.info("Using vanilla attention implementation.")

@@ -1,8 +1,3 @@
-import os
-
-import torch
-from transformers import is_torch_npu_available
-
 from llmtuner.train.tuner import run_exp
 
 
@@ -16,7 +11,4 @@ def _mp_fn(index):
 
 
 if __name__ == "__main__":
-    if is_torch_npu_available():
-        use_jit_compile = os.getenv('JIT_COMPILE', 'False').lower() in ['true', '1']
-        torch.npu.set_compile_mode(jit_compile=use_jit_compile)
     main()
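
Note: moving the NPU JIT setup out of the entry script and into patch_config
means every code path that loads a model picks it up, not just this one
launcher. A sketch of the entry script after this commit, reconstructed from
the hunks above (the bodies of main and _mp_fn are assumptions; the diff only
shows the run_exp import, the _mp_fn signature, and the final main() call):

    from llmtuner.train.tuner import run_exp


    def main():
        run_exp()


    def _mp_fn(index):
        # assumption: spawn-based launchers call this with a worker index
        main()


    if __name__ == "__main__":
        main()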