diff --git a/.env.local b/.env.local index f6c73f38..f5eeeb65 100644 --- a/.env.local +++ b/.env.local @@ -15,6 +15,7 @@ LLAMAFACTORY_VERBOSITY= USE_MODELSCOPE_HUB= USE_OPENMIND_HUB= USE_RAY= +USE_KT= RECORD_VRAM= OPTIM_TORCH= NPU_JIT_COMPILE= diff --git a/examples/inference/deepseek2_lora_sft_kt.yaml b/examples/inference/deepseek2_lora_sft_kt.yaml new file mode 100644 index 00000000..13afdefd --- /dev/null +++ b/examples/inference/deepseek2_lora_sft_kt.yaml @@ -0,0 +1,10 @@ +model_name_or_path: deepseek-ai/DeepSeek-V2-Lite +adapter_name_or_path: saves/Kllama_deepseekV2 +template: deepseek +infer_backend: ktransformers # choices: [huggingface, vllm, sglang, ktransformers] +trust_remote_code: true + +use_kt: true # use KTransformers as the backend for inference with the LoRA SFT adapter +kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml +cpu_infer: 32 +chunk_size: 8192 \ No newline at end of file diff --git a/examples/inference/deepseek3_kt.yaml b/examples/inference/deepseek3_kt.yaml new file mode 100644 index 00000000..22674e8e --- /dev/null +++ b/examples/inference/deepseek3_kt.yaml @@ -0,0 +1,9 @@ +model_name_or_path: opensourcerelease/DeepSeek-V3-bf16 +template: deepseek +infer_backend: ktransformers # choices: [huggingface, vllm, sglang, ktransformers] +trust_remote_code: true + +use_kt: true # use KTransformers as the inference backend +kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml +cpu_infer: 32 +chunk_size: 8192 \ No newline at end of file diff --git a/examples/inference/deepseek3_lora_sft_kt.yaml b/examples/inference/deepseek3_lora_sft_kt.yaml new file mode 100644 index 00000000..6b18cff0 --- /dev/null +++ b/examples/inference/deepseek3_lora_sft_kt.yaml @@ -0,0 +1,10 @@ +model_name_or_path: opensourcerelease/DeepSeek-V3-bf16 +adapter_name_or_path: saves/Kllama_deepseekV3 +template: deepseek +infer_backend: ktransformers # choices: [huggingface, vllm, sglang, ktransformers] +trust_remote_code: true + +use_kt: true # use KTransformers as the backend for inference with the LoRA SFT adapter +kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml +cpu_infer: 32 +chunk_size: 8192 \ No newline at end of file diff --git a/examples/inference/llama3.yaml b/examples/inference/llama3.yaml index 5d5381c8..9315e797 100644 --- a/examples/inference/llama3.yaml +++ b/examples/inference/llama3.yaml @@ -1,4 +1,4 @@ model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct template: llama3 -infer_backend: huggingface # choices: [huggingface, vllm, sglang] +infer_backend: huggingface # choices: [huggingface, vllm, sglang, ktransformers] trust_remote_code: true diff --git a/examples/inference/llama3_full_sft.yaml b/examples/inference/llama3_full_sft.yaml index 5d8acabe..64fc2489 100644 --- a/examples/inference/llama3_full_sft.yaml +++ b/examples/inference/llama3_full_sft.yaml @@ -1,4 +1,4 @@ model_name_or_path: saves/llama3-8b/full/sft template: llama3 -infer_backend: huggingface # choices: [huggingface, vllm, sglang] +infer_backend: huggingface # choices: [huggingface, vllm, sglang, ktransformers] trust_remote_code: true diff --git a/examples/inference/llama3_lora_sft.yaml b/examples/inference/llama3_lora_sft.yaml index 0f5e9f84..e7fd0425 100644 --- a/examples/inference/llama3_lora_sft.yaml +++ b/examples/inference/llama3_lora_sft.yaml @@ -1,5 +1,5 @@ model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct adapter_name_or_path: saves/llama3-8b/lora/sft template: llama3 -infer_backend: huggingface # choices: [huggingface, vllm, sglang]
+infer_backend: huggingface # choices: [huggingface, vllm, sglang, ktransformers] trust_remote_code: true diff --git a/examples/inference/qwen2_5vl.yaml b/examples/inference/qwen2_5vl.yaml index d8f88dc2..67b78d4f 100644 --- a/examples/inference/qwen2_5vl.yaml +++ b/examples/inference/qwen2_5vl.yaml @@ -1,4 +1,4 @@ model_name_or_path: Qwen/Qwen2.5-VL-7B-Instruct template: qwen2_vl -infer_backend: huggingface # choices: [huggingface, vllm, sglang] +infer_backend: huggingface # choices: [huggingface, vllm, sglang, ktransformers] trust_remote_code: true diff --git a/examples/kt_optimize_rules/DeepSeek-V2-Chat-sft-amx.yaml b/examples/kt_optimize_rules/DeepSeek-V2-Chat-sft-amx.yaml new file mode 100644 index 00000000..37ca02ed --- /dev/null +++ b/examples/kt_optimize_rules/DeepSeek-V2-Chat-sft-amx.yaml @@ -0,0 +1,69 @@ +- match: + class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding + replace: + class: ktransformers.operators.RoPE.YarnRotaryEmbedding + kwargs: + generate_device: "cuda" + prefill_device: "cuda" +- match: + name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression + class: torch.nn.Linear # only match modules matching name and class simultaneously + replace: + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types + kwargs: + generate_device: "cuda" + prefill_device: "cuda" + generate_op: "KLinearTorch" + prefill_op: "KLinearTorch" + +- match: + name: "^lm_head" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda" + prefill_device: "cuda" + generate_op: "KLinearTorch" + prefill_op: "KLinearTorch" + +- match: + name: "^model\\.layers\\..*\\.mlp$" + class: ktransformers.models.modeling_deepseek.DeepseekV2MoE + replace: + class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function + kwargs: + generate_device: "cuda" + prefill_device: "cuda" +- match: + name: "^model\\.layers\\..*\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism + kwargs: + prefill_device: "cuda" + prefill_op: "KExpertsTorch" + generate_device: "cpu" + generate_op: "KSFTExpertsCPU" + out_device: "cuda" + backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default) + recursive: False # don't recursively inject submodules of this module +- match: + name: "^model\\.layers\\..*\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation + kwargs: + generate_device: "cuda" + prefill_device: "cuda" +- match: + name: "^model$" + replace: + class: "ktransformers.operators.models.KDeepseekV2Model" + kwargs: + per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill +- match: + name: "^model.embed_tokens" + replace: + class: "default" + kwargs: + generate_device: "cpu" + prefill_device: "cpu" \ No newline at end of file diff --git a/examples/kt_optimize_rules/DeepSeek-V2-Chat.yaml b/examples/kt_optimize_rules/DeepSeek-V2-Chat.yaml new file mode 100644 index 00000000..7f3e44ea --- /dev/null +++ b/examples/kt_optimize_rules/DeepSeek-V2-Chat.yaml @@ -0,0 +1,68 @@ +- match: + class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding + replace: + class: ktransformers.operators.RoPE.YarnRotaryEmbedding + kwargs: + generate_device: "cuda" + prefill_device: "cuda" +- match: + name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression + class: 
torch.nn.Linear # only match modules matching name and class simultaneously + replace: + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types + kwargs: + generate_device: "cuda" + prefill_device: "cuda" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" + +- match: + name: "^lm_head" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda" + prefill_device: "cuda" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" + +- match: + name: "^model\\.layers\\..*\\.mlp$" + class: ktransformers.models.modeling_deepseek.DeepseekV2MoE + replace: + class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function + kwargs: + generate_device: "cuda" + prefill_device: "cuda" +- match: + name: "^model\\.layers\\..*\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism + kwargs: + prefill_device: "cuda" + prefill_op: "KExpertsTorch" + generate_device: "cpu" + generate_op: "KExpertsCPU" + out_device: "cuda" + recursive: False # don't recursively inject submodules of this module +- match: + name: "^model\\.layers\\..*\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation + kwargs: + generate_device: "cuda" + prefill_device: "cuda" +- match: + name: "^model$" + replace: + class: "ktransformers.operators.models.KDeepseekV2Model" + kwargs: + per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill +- match: + name: "^model.embed_tokens" + replace: + class: "default" + kwargs: + generate_device: "cpu" + prefill_device: "cpu" \ No newline at end of file diff --git a/examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx-multi-gpu.yaml b/examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx-multi-gpu.yaml new file mode 100644 index 00000000..54f97e50 --- /dev/null +++ b/examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx-multi-gpu.yaml @@ -0,0 +1,139 @@ +- match: + name: "^model.embed_tokens" + replace: + class: "default" + kwargs: + generate_device: "cpu" + prefill_device: "cpu" + +- match: + name: "^model\\.layers\\.(0|[1-9])\\." + class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding + replace: + class: ktransformers.operators.RoPE.YarnRotaryEmbedding + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" +- match: + name: "^model\\.layers\\.([12][0-9])\\." 
+ class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding + replace: + class: ktransformers.operators.RoPE.YarnRotaryEmbedding + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + +- match: + name: "^model\\.layers\\.(0|[1-9])\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression + class: torch.nn.Linear # only match modules matching name and class simultaneously + replace: + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" + generate_op: "KLinearTorch" + prefill_op: "KLinearTorch" + +- match: + name: "^model\\.layers\\.([12][0-9])\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression + class: torch.nn.Linear # only match modules matching name and class simultaneously + replace: + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + generate_op: "KLinearTorch" + prefill_op: "KLinearTorch" + +- match: + name: "^model\\.layers\\.(0|[1-9])\\.mlp$" + class: ktransformers.models.modeling_deepseek.DeepseekV2MoE + replace: + class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" +- match: + name: "^model\\.layers\\.([12][0-9])\\.mlp$" + class: ktransformers.models.modeling_deepseek.DeepseekV2MoE + replace: + class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + +- match: + name: "^model\\.layers\\.(0|[1-9])\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism + kwargs: + prefill_device: "cuda:0" + prefill_op: "KExpertsTorch" + generate_device: "cpu" + generate_op: "KSFTExpertsCPU" + out_device: "cuda:0" + backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default) + recursive: False # don't recursively inject submodules of this module + +- match: + name: "^model\\.layers\\.([12][0-9])\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism + kwargs: + prefill_device: "cuda:1" + prefill_op: "KExpertsTorch" + generate_device: "cpu" + generate_op: "KSFTExpertsCPU" + out_device: "cuda:1" + backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default) + recursive: False # don't recursively inject submodules of this module + +- match: + name: "^model\\.layers\\.(0|[1-9])\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" +- match: + name: "^model\\.layers\\.([12][0-9])\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" +- match: + name: "^model$" + replace: + class: "ktransformers.operators.models.KDeepseekV2Model" + kwargs: + per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill + transfer_map: + 10: "cuda:1" + +- match: + name: "^model\\.layers\\.(0|[1-9])\\." 
+ replace: + class: "default" + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" + +- match: + name: "^lm_head" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + generate_op: "KLinearTorch" + prefill_op: "KLinearTorch" + +- match: + name: "(^model\\.layers\\.([12][0-9])\\.)|(model.norm)" + replace: + class: "default" + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" diff --git a/examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml b/examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml new file mode 100644 index 00000000..7e6e340d --- /dev/null +++ b/examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml @@ -0,0 +1,69 @@ +- match: + class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding + replace: + class: ktransformers.operators.RoPE.YarnRotaryEmbedding + kwargs: + generate_device: "cuda" + prefill_device: "cuda" +- match: + name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression + class: torch.nn.Linear # only match modules matching name and class simultaneously + replace: + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types + kwargs: + generate_device: "cuda" + prefill_device: "cuda" + generate_op: "KLinearTorch" + prefill_op: "KLinearTorch" + +- match: + name: "^lm_head" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda" + prefill_device: "cuda" + generate_op: "KLinearTorch" + prefill_op: "KLinearTorch" + +- match: + name: "^model\\.layers\\..*\\.mlp$" + class: ktransformers.models.modeling_deepseek.DeepseekV2MoE + replace: + class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function + kwargs: + generate_device: "cuda" + prefill_device: "cuda" +- match: + name: "^model\\.layers\\..*\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism + kwargs: + prefill_device: "cpu" + prefill_op: "KExpertsTorch" + generate_device: "cpu" + generate_op: "KSFTExpertsCPU" + out_device: "cuda" + backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default) + recursive: False # don't recursively inject submodules of this module +- match: + name: "^model\\.layers\\..*\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation + kwargs: + generate_device: "cuda" + prefill_device: "cuda" +- match: + name: "^model$" + replace: + class: "ktransformers.operators.models.KDeepseekV2Model" + kwargs: + per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill +- match: + name: "^model.embed_tokens" + replace: + class: "default" + kwargs: + generate_device: "cpu" + prefill_device: "cpu" \ No newline at end of file diff --git a/examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft.yaml b/examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft.yaml new file mode 100644 index 00000000..eebc24b1 --- /dev/null +++ b/examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft.yaml @@ -0,0 +1,68 @@ +- match: + class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding + replace: + class: ktransformers.operators.RoPE.YarnRotaryEmbedding + kwargs: + generate_device: "cuda" + prefill_device: "cuda" +- match: + name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression + class: 
torch.nn.Linear # only match modules matching name and class simultaneously + replace: + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types + kwargs: + generate_device: "cuda" + prefill_device: "cuda" + generate_op: "KLinearTorch" + prefill_op: "KLinearTorch" + +- match: + name: "^lm_head" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda" + prefill_device: "cuda" + generate_op: "KLinearTorch" + prefill_op: "KLinearTorch" + +- match: + name: "^model\\.layers\\..*\\.mlp$" + class: ktransformers.models.modeling_deepseek.DeepseekV2MoE + replace: + class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function + kwargs: + generate_device: "cuda" + prefill_device: "cuda" +- match: + name: "^model\\.layers\\..*\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism + kwargs: + prefill_device: "cpu" + prefill_op: "KExpertsTorch" + generate_device: "cpu" + generate_op: "KSFTExpertsCPU" + out_device: "cuda" + recursive: False # don't recursively inject submodules of this module +- match: + name: "^model\\.layers\\..*\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation + kwargs: + generate_device: "cuda" + prefill_device: "cuda" +- match: + name: "^model$" + replace: + class: "ktransformers.operators.models.KDeepseekV2Model" + kwargs: + per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill +- match: + name: "^model.embed_tokens" + replace: + class: "default" + kwargs: + generate_device: "cpu" + prefill_device: "cpu" \ No newline at end of file diff --git a/examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat.yaml b/examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat.yaml new file mode 100644 index 00000000..7f3e44ea --- /dev/null +++ b/examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat.yaml @@ -0,0 +1,68 @@ +- match: + class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding + replace: + class: ktransformers.operators.RoPE.YarnRotaryEmbedding + kwargs: + generate_device: "cuda" + prefill_device: "cuda" +- match: + name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression + class: torch.nn.Linear # only match modules matching name and class simultaneously + replace: + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types + kwargs: + generate_device: "cuda" + prefill_device: "cuda" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" + +- match: + name: "^lm_head" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda" + prefill_device: "cuda" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" + +- match: + name: "^model\\.layers\\..*\\.mlp$" + class: ktransformers.models.modeling_deepseek.DeepseekV2MoE + replace: + class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function + kwargs: + generate_device: "cuda" + prefill_device: "cuda" +- match: + name: "^model\\.layers\\..*\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism + kwargs: + prefill_device: "cuda" + prefill_op: "KExpertsTorch" + generate_device: "cpu" + generate_op: "KExpertsCPU" + out_device: "cuda" + recursive: False # don't 
recursively inject submodules of this module +- match: + name: "^model\\.layers\\..*\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation + kwargs: + generate_device: "cuda" + prefill_device: "cuda" +- match: + name: "^model$" + replace: + class: "ktransformers.operators.models.KDeepseekV2Model" + kwargs: + per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill +- match: + name: "^model.embed_tokens" + replace: + class: "default" + kwargs: + generate_device: "cpu" + prefill_device: "cpu" \ No newline at end of file diff --git a/examples/kt_optimize_rules/DeepSeek-V3-Chat-amx.yaml b/examples/kt_optimize_rules/DeepSeek-V3-Chat-amx.yaml new file mode 100644 index 00000000..724e1a47 --- /dev/null +++ b/examples/kt_optimize_rules/DeepSeek-V3-Chat-amx.yaml @@ -0,0 +1,77 @@ +- match: + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding + replace: + class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3 + kwargs: + generate_device: "cuda" + prefill_device: "cuda" + +- match: + name: "^lm_head$" # regular expression + class: torch.nn.Linear # only match modules matching name and class simultaneously + replace: + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types + kwargs: + generate_device: "cuda" + prefill_device: "cuda" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" + +- match: + name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression + class: torch.nn.Linear # only match modules matching name and class simultaneously + replace: + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types + kwargs: + generate_device: "cuda" + prefill_device: "cuda" + generate_op: "KLinearMarlin" + prefill_op: "KLinearTorch" +- match: + name: "^model\\.layers\\..*\\.mlp$" + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE + replace: + class: ktransformers.operators.experts.KDeepseekV3MoE # mlp module with custom forward function + kwargs: + generate_device: "cuda" + prefill_device: "cuda" +- match: + class: ktransformers.models.modeling_deepseek_v3.MoEGate + replace: + class: ktransformers.operators.gate.KMoEGate + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" +- match: + name: "^model\\.layers\\..*\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism + kwargs: + prefill_device: "cuda" + prefill_op: "KExpertsTorch" + generate_device: "cpu" + generate_op: "KExpertsCPU" + out_device: "cuda" + backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default) + recursive: False # don't recursively inject submodules of this module +- match: + name: "^model\\.layers\\..*\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation + kwargs: + generate_device: "cuda" + prefill_device: "cuda" + absorb_for_prefill: False # change this to True to enable long context(prefill may slower). 
+- match: + name: "^model$" + replace: + class: "ktransformers.operators.models.KDeepseekV2Model" + kwargs: + per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill +- match: + name: "^model.embed_tokens" + replace: + class: "default" + kwargs: + generate_device: "cpu" + prefill_device: "cpu" \ No newline at end of file diff --git a/examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu-4.yaml b/examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu-4.yaml new file mode 100644 index 00000000..4eda68c8 --- /dev/null +++ b/examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu-4.yaml @@ -0,0 +1,392 @@ +- match: + name: "^model.embed_tokens" + replace: + class: "default" + kwargs: + generate_device: "cpu" + prefill_device: "cpu" + +# === Rotary Embedding Replacement === + +# GPU 0: layers 0–14 +- match: + name: "^model\\.layers\\.([0-9]|1[0-4])\\." + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding + replace: + class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3 + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" + +# GPU 1: layers 15–29 +- match: + name: "^model\\.layers\\.(1[5-9]|2[0-9])\\." + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding + replace: + class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3 + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + +# GPU 2: layers 30–44 +- match: + name: "^model\\.layers\\.(3[0-9]|4[0-4])\\." + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding + replace: + class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3 + kwargs: + generate_device: "cuda:2" + prefill_device: "cuda:2" + +# GPU 3: layers 45–60 +- match: + name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\." 
+ class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding + replace: + class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3 + kwargs: + generate_device: "cuda:3" + prefill_device: "cuda:3" + +# === Linear Layers Replacement (excluding self_attn.kv_b_proj) === + +# GPU 0: layers 0–14 +- match: + name: "^model\\.layers\\.([0-9]|1[0-4])\\.(?!self_attn\\.kv_b_proj).*$" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" + generate_op: "KLinearTorch" + prefill_op: "KLinearTorch" + +# GPU 1: layers 15–29 +- match: + name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.(?!self_attn\\.kv_b_proj).*$" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + generate_op: "KLinearTorch" + prefill_op: "KLinearTorch" + +# GPU 2: layers 30–44 +- match: + name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.(?!self_attn\\.kv_b_proj).*$" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda:2" + prefill_device: "cuda:2" + generate_op: "KLinearTorch" + prefill_op: "KLinearTorch" + +# GPU 3: layers 45–60 +- match: + name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.(?!self_attn\\.kv_b_proj).*$" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda:3" + prefill_device: "cuda:3" + generate_op: "KLinearTorch" + prefill_op: "KLinearTorch" + +# === MLP (MoE) Replacement === + +# GPU 0: layers 0–14 +- match: + name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp$" + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE + replace: + class: ktransformers.operators.experts.KDeepseekV3MoE + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" + +# GPU 1: layers 15–29 +- match: + name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp$" + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE + replace: + class: ktransformers.operators.experts.KDeepseekV3MoE + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + +# GPU 2: layers 30–44 +- match: + name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp$" + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE + replace: + class: ktransformers.operators.experts.KDeepseekV3MoE + kwargs: + generate_device: "cuda:2" + prefill_device: "cuda:2" + +# GPU 3: layers 45–60 +- match: + name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp$" + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE + replace: + class: ktransformers.operators.experts.KDeepseekV3MoE + kwargs: + generate_device: "cuda:3" + prefill_device: "cuda:3" + +# === MLP Gate Replacement === + +# GPU 0: layers 0–14 +- match: + name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp\\.gate$" + class: ktransformers.models.modeling_deepseek_v3.MoEGate + replace: + class: ktransformers.operators.gate.KMoEGate + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" + +# GPU 1: layers 15–29 +- match: + name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp\\.gate$" + class: ktransformers.models.modeling_deepseek_v3.MoEGate + replace: + class: ktransformers.operators.gate.KMoEGate + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + +# GPU 2: layers 30–44 +- match: + name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp\\.gate$" + class: ktransformers.models.modeling_deepseek_v3.MoEGate + replace: + class: 
ktransformers.operators.gate.KMoEGate + kwargs: + generate_device: "cuda:2" + prefill_device: "cuda:2" + +# GPU 3: layers 45–60 +- match: + name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp\\.gate$" + class: ktransformers.models.modeling_deepseek_v3.MoEGate + replace: + class: ktransformers.operators.gate.KMoEGate + kwargs: + generate_device: "cuda:3" + prefill_device: "cuda:3" + +# === MLP Experts Replacement === +# replace with marlin expert. Open and modify layer-num as needed. +# Each layer of malin experts takes about 6GB of GPU memory. +# !!!Do remember 'close' cuda graph if you are using marlin expert.!!! +# !!!KExpertsTorch is untested, we don't have enough VRAM.!!! + +# GPU 0: layers 3–4 +# - match: +# name: "^model\\.layers\\.([3-4])\\.mlp\\.experts$" +# replace: +# class: ktransformers.operators.experts.KTransformersExperts +# kwargs: +# generate_device: "cuda:0" +# generate_op: "KExpertsMarlin" +# recursive: False + +# # GPU 1: layers 15–17 +# - match: +# name: "^model\\.layers\\.(1[5-7])\\.mlp\\.experts$" +# replace: +# class: ktransformers.operators.experts.KTransformersExperts +# kwargs: +# generate_device: "cuda:1" +# generate_op: "KExpertsMarlin" +# recursive: False + +# # GPU 2: layers 30–32 +# - match: +# name: "^model\\.layers\\.(3[0-2])\\.mlp\\.experts$" +# replace: +# class: ktransformers.operators.experts.KTransformersExperts +# kwargs: +# generate_device: "cuda:2" +# generate_op: "KExpertsMarlin" +# recursive: False + +# # GPU 3: layers 45–46 +# - match: +# name: "^model\\.layers\\.(4[5-6])\\.mlp\\.experts$" +# replace: +# class: ktransformers.operators.experts.KTransformersExperts +# kwargs: +# generate_device: "cuda:3" +# generate_op: "KExpertsMarlin" +# recursive: False + + +# === MLP Experts Replacement === + +# GPU 0: layers 0–14 +- match: + name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts + kwargs: + prefill_device: "cuda:0" + prefill_op: "KExpertsTorch" + generate_device: "cpu" + generate_op: "KSFTExpertsCPU" + out_device: "cuda:0" + backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default) + recursive: False + +# GPU 1: layers 15–29 +- match: + name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts + kwargs: + prefill_device: "cuda:1" + prefill_op: "KExpertsTorch" + generate_device: "cpu" + generate_op: "KSFTExpertsCPU" + out_device: "cuda:1" + backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default) + recursive: False + +# GPU 2: layers 30–44 +- match: + name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts + kwargs: + prefill_device: "cuda:2" + prefill_op: "KExpertsTorch" + generate_device: "cpu" + generate_op: "KSFTExpertsCPU" + out_device: "cuda:2" + backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default) + recursive: False + +# GPU 3: layers 45–60 +- match: + name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts + kwargs: + prefill_device: "cuda:3" + prefill_op: "KExpertsTorch" + generate_device: "cpu" + generate_op: "KSFTExpertsCPU" + out_device: "cuda:3" + backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default) + recursive: False + +# === Self-Attention Replacement === + +# GPU 0: layers 0–14 +- match: + name: "^model\\.layers\\.([0-9]|1[0-4])\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention 
+ kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" + absorb_for_prefill: False + +# GPU 1: layers 15–29 +- match: + name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + absorb_for_prefill: False + +# GPU 2: layers 30–44 +- match: + name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention + kwargs: + generate_device: "cuda:2" + prefill_device: "cuda:2" + absorb_for_prefill: False + +# GPU 3: layers 45–60 +- match: + name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention + kwargs: + generate_device: "cuda:3" + prefill_device: "cuda:3" + absorb_for_prefill: False + +# === Overall Model Replacement with Transfer Map === + +- match: + name: "^model$" + replace: + class: "ktransformers.operators.models.KDeepseekV2Model" + kwargs: + per_layer_prefill_intput_threshold: 0 # 0 means close layer‐wise prefill + transfer_map: + 15: "cuda:1" # Layers 15+ on GPU 1 + 30: "cuda:2" # Layers 30+ on GPU 2 + 45: "cuda:3" # Layers 45+ on GPU 3 + +# === Default Catch-All for Other Modules === + +# GPU 0: layers 0–14 +- match: + name: "^model\\.layers\\.([0-9]|1[0-4])\\." + replace: + class: "default" + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" + +# GPU 1: layers 15–29 +- match: + name: "^model\\.layers\\.(1[5-9]|2[0-9])\\." + replace: + class: "default" + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + +# GPU 2: layers 30–44 +- match: + name: "^model\\.layers\\.(3[0-9]|4[0-4])\\." + replace: + class: "default" + kwargs: + generate_device: "cuda:2" + prefill_device: "cuda:2" + +- match: + name: "^lm_head" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda:3" + prefill_device: "cuda:3" + generate_op: "KLinearTorch" + prefill_op: "KLinearTorch" + +# For final modules (model.norm), ensure they are on GPU 3 (as in your original config) +- match: + name: "(^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.)|(^model\\.norm)" + replace: + class: "default" + kwargs: + generate_device: "cuda:3" + prefill_device: "cuda:3" diff --git a/examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml b/examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml new file mode 100644 index 00000000..8b8c204b --- /dev/null +++ b/examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml @@ -0,0 +1,156 @@ +- match: + name: "^model.embed_tokens" + replace: + class: "default" + kwargs: + generate_device: "cpu" + prefill_device: "cpu" + +- match: + name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\." + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding + replace: + class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3 + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" +- match: + name: "^model\\.layers\\.([3456][0-9])\\." 
+ class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding + replace: + class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3 + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + +- match: + name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.(?!self_attn\\.kv_b_proj).*$" # regular expression + class: torch.nn.Linear # only match modules matching name and class simultaneously + replace: + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" + generate_op: "KLinearTorch" + prefill_op: "KLinearTorch" + +- match: + name: "^model\\.layers\\.([3456][0-9])\\.(?!self_attn\\.kv_b_proj).*$" # regular expression + class: torch.nn.Linear # only match modules matching name and class simultaneously + replace: + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + generate_op: "KLinearTorch" + prefill_op: "KLinearTorch" + +- match: + name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp$" + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE + replace: + class: ktransformers.operators.experts.KDeepseekV3MoE # mlp module with custom forward function + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" +- match: + name: "^model\\.layers\\.([3456][0-9])\\.mlp$" + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE + replace: + class: ktransformers.operators.experts.KDeepseekV3MoE # mlp module with custom forward function + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + +- match: + name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.gate$" + class: ktransformers.models.modeling_deepseek_v3.MoEGate + replace: + class: ktransformers.operators.gate.KMoEGate + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" +- match: + name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.gate$" + class: ktransformers.models.modeling_deepseek_v3.MoEGate + replace: + class: ktransformers.operators.gate.KMoEGate # mlp module with custom forward function + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + +- match: + name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism + kwargs: + prefill_device: "cuda:0" + prefill_op: "KExpertsTorch" + generate_device: "cpu" + generate_op: "KSFTExpertsCPU" + out_device: "cuda:0" + backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default) + recursive: False # don't recursively inject submodules of this module + +- match: + name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism + kwargs: + prefill_device: "cuda:1" + prefill_op: "KExpertsTorch" + generate_device: "cpu" + generate_op: "KSFTExpertsCPU" + out_device: "cuda:1" + backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default) + recursive: False # don't recursively inject submodules of this module + +- match: + name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" +- match: + name: "^model\\.layers\\.([3456][0-9])\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention # 
optimized MLA implementation + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" +- match: + name: "^model$" + replace: + class: "ktransformers.operators.models.KDeepseekV2Model" + kwargs: + per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill + transfer_map: + 30: "cuda:1" + +- match: + name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\." + replace: + class: "default" + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" + +- match: + name: "^lm_head" + class: torch.nn.Linear + replace: + class: ktransformers.operators.linear.KTransformersLinear + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" + generate_op: "KLinearTorch" + prefill_op: "KLinearTorch" + +- match: + name: "(^model\\.layers\\.([3456][0-9])\\.)|(model.norm)" + replace: + class: "default" + kwargs: + generate_device: "cuda:1" + prefill_device: "cuda:1" diff --git a/examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx.yaml b/examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx.yaml new file mode 100644 index 00000000..177d4c4e --- /dev/null +++ b/examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx.yaml @@ -0,0 +1,77 @@ +- match: + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding + replace: + class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3 + kwargs: + generate_device: "cuda" + prefill_device: "cuda" + +- match: + name: "^lm_head$" # regular expression + class: torch.nn.Linear # only match modules matching name and class simultaneously + replace: + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types + kwargs: + generate_device: "cuda" + prefill_device: "cuda" + generate_op: "KLinearTorch" + prefill_op: "KLinearTorch" + +- match: + name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression + class: torch.nn.Linear # only match modules matching name and class simultaneously + replace: + class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types + kwargs: + generate_device: "cuda" + prefill_device: "cuda" + generate_op: "KLinearTorch" + prefill_op: "KLinearTorch" +- match: + name: "^model\\.layers\\..*\\.mlp$" + class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE + replace: + class: ktransformers.operators.experts.KDeepseekV3MoE # mlp module with custom forward function + kwargs: + generate_device: "cuda" + prefill_device: "cuda" +- match: + class: ktransformers.models.modeling_deepseek_v3.MoEGate + replace: + class: ktransformers.operators.gate.KMoEGate + kwargs: + generate_device: "cuda:0" + prefill_device: "cuda:0" +- match: + name: "^model\\.layers\\..*\\.mlp\\.experts$" + replace: + class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism + kwargs: + prefill_device: "cuda" + prefill_op: "KExpertsTorch" + generate_device: "cpu" + generate_op: "KSFTExpertsCPU" + out_device: "cuda" + backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default) + recursive: False # don't recursively inject submodules of this module +- match: + name: "^model\\.layers\\..*\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation + kwargs: + generate_device: "cuda" + prefill_device: "cuda" + absorb_for_prefill: False # change this to True to enable long context(prefill may slower). 
+- match: + name: "^model$" + replace: + class: "ktransformers.operators.models.KDeepseekV2Model" + kwargs: + per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill +- match: + name: "^model.embed_tokens" + replace: + class: "default" + kwargs: + generate_device: "cpu" + prefill_device: "cpu" \ No newline at end of file diff --git a/examples/train_lora/deepseek2_lora_sft_kt.yaml b/examples/train_lora/deepseek2_lora_sft_kt.yaml new file mode 100644 index 00000000..eacb521e --- /dev/null +++ b/examples/train_lora/deepseek2_lora_sft_kt.yaml @@ -0,0 +1,52 @@ +### model +model_name_or_path: deepseek-ai/DeepSeek-V2-Lite +trust_remote_code: true + +### method +stage: sft +do_train: true +finetuning_type: lora +lora_rank: 8 +lora_target: all + +### dataset +dataset: identity +template: deepseek +cutoff_len: 2048 +max_samples: 100000 +overwrite_cache: true +preprocessing_num_workers: 16 +dataloader_num_workers: 4 + +### output +output_dir: saves/Kllama_deepseekV2 +logging_steps: 10 +save_steps: 500 +plot_loss: true +overwrite_output_dir: true +save_only_model: false +report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow] + +### train +per_device_train_batch_size: 1 +gradient_accumulation_steps: 8 +learning_rate: 1.0e-4 +num_train_epochs: 3.0 +lr_scheduler_type: cosine +warmup_ratio: 0.1 +bf16: true +ddp_timeout: 180000000 +resume_from_checkpoint: null + +### ktransformers +use_kt: true # use KTransformers as LoRA sft backend +kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml +cpu_infer: 32 +chunk_size: 8192 + +### eval +# eval_dataset: alpaca_en_demo +# val_size: 0.1 +# per_device_eval_batch_size: 1 +# eval_strategy: steps +# eval_steps: 500 diff --git a/examples/train_lora/deepseek3_lora_sft_kt.yaml b/examples/train_lora/deepseek3_lora_sft_kt.yaml new file mode 100644 index 00000000..02d77661 --- /dev/null +++ b/examples/train_lora/deepseek3_lora_sft_kt.yaml @@ -0,0 +1,52 @@ +### model +model_name_or_path: opensourcerelease/DeepSeek-V3-bf16 +trust_remote_code: true + +### method +stage: sft +do_train: true +finetuning_type: lora +lora_rank: 8 +lora_target: all + +### dataset +dataset: identity +template: deepseek +cutoff_len: 2048 +max_samples: 100000 +overwrite_cache: true +preprocessing_num_workers: 16 +dataloader_num_workers: 4 + +### output +output_dir: saves/Kllama_deepseekV3 +logging_steps: 10 +save_steps: 500 +plot_loss: true +overwrite_output_dir: true +save_only_model: false +report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow] + +### train +per_device_train_batch_size: 1 +gradient_accumulation_steps: 8 +learning_rate: 1.0e-4 +num_train_epochs: 3.0 +lr_scheduler_type: cosine +warmup_ratio: 0.1 +bf16: true +ddp_timeout: 180000000 +resume_from_checkpoint: null + +### ktransformers +use_kt: true # use KTransformers as LoRA sft backend +kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml +cpu_infer: 32 +chunk_size: 8192 + +### eval +# eval_dataset: alpaca_en_demo +# val_size: 0.1 +# per_device_eval_batch_size: 1 +# eval_strategy: steps +# eval_steps: 500 diff --git a/src/llamafactory/chat/chat_model.py b/src/llamafactory/chat/chat_model.py index 9ffd8647..cb612f88 100644 --- a/src/llamafactory/chat/chat_model.py +++ b/src/llamafactory/chat/chat_model.py @@ -71,6 +71,16 @@ class ChatModel: "SGLang not install, you may need to run `pip install sglang[all]`\n" "or try to use HuggingFace backend: --infer_backend huggingface" ) from e + elif model_args.infer_backend == EngineName.KT: + 
try: + from .kt_engine import KTransformersEngine + + self.engine: BaseEngine = KTransformersEngine(model_args, data_args, finetuning_args, generating_args) + except ImportError as e: + raise ImportError( + "KTransformers not install, you may need to run `pip install ktransformers`\n" + "or try to use HuggingFace backend: --infer_backend huggingface" + ) from e else: raise NotImplementedError(f"Unknown backend: {model_args.infer_backend}") diff --git a/src/llamafactory/chat/kt_engine.py b/src/llamafactory/chat/kt_engine.py new file mode 100644 index 00000000..7bbfcc80 --- /dev/null +++ b/src/llamafactory/chat/kt_engine.py @@ -0,0 +1,270 @@ +# Copyright 2025 the KVCache.AI team, Approaching AI, and the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import os +import platform +from collections.abc import AsyncGenerator +from threading import Thread +from typing import TYPE_CHECKING, Any, Optional + +import torch +from typing_extensions import override + +from ..data import get_template_and_fix_tokenizer +from ..extras import logging +from ..extras.constants import EngineName +from ..model import load_model, load_tokenizer +from .base_engine import BaseEngine, Response + + +if TYPE_CHECKING: + from transformers import PreTrainedTokenizer + from trl import PreTrainedModelWrapper + + from ..data.mm_plugin import AudioInput, ImageInput, VideoInput + from ..hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments + +from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled +from ktransformers.server.config.config import Config +from ktransformers.util.utils import ( + get_compute_capability, + prefill_and_generate_capture, +) +from ktransformers.util.vendors import GPUVendor, device_manager + + +logger = logging.get_logger(__name__) + + +class KTransformersEngine(BaseEngine): + def __init__( + self, + model_args: "ModelArguments", + data_args: "DataArguments", + finetuning_args: "FinetuningArguments", + generating_args: "GeneratingArguments", + ) -> None: + self.name = EngineName.KT + self.can_generate = finetuning_args.stage == "sft" + + tok_mod = load_tokenizer(model_args) + self.tokenizer = tok_mod["tokenizer"] + self.tokenizer.padding_side = "left" if self.can_generate else "right" + self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args) + + self.model = load_model( + self.tokenizer, model_args, finetuning_args, + is_trainable=False, add_valuehead=(not self.can_generate) + ) + + self.generating_args = generating_args.to_dict() + self.max_new_tokens = model_args.kt_maxlen + self.use_cuda_graph = model_args.kt_use_cuda_graph + self.mode = model_args.kt_mode + self.force_think = model_args.kt_force_think + self.chunk_size = model_args.chunk_size + + try: + asyncio.get_event_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + self.semaphore = asyncio.Semaphore(int(os.getenv("MAX_CONCURRENT", "1"))) + + @staticmethod + @torch.inference_mode() + def 
_get_scores( + model: "PreTrainedModelWrapper", + tokenizer: "PreTrainedTokenizer", + batch_input: list[str], + input_kwargs: Optional[dict[str, Any]] = {}, + ) -> list[float]: + max_length: Optional[int] = input_kwargs.pop("max_length", None) + device = getattr(model.pretrained_model, "device", "cuda") + inputs = tokenizer( + batch_input, + padding=True, + truncation=True, + max_length=max_length or getattr(model.config, "max_position_embeddings", 1024), + return_tensors="pt", + add_special_tokens=False, + ).to(device) + values: torch.Tensor = model(**inputs, return_dict=True, use_cache=False)[-1] + scores = values.gather(dim=-1, index=(inputs["attention_mask"].sum(dim=-1, keepdim=True) - 1)) + return scores + + async def _generate( + self, + messages: list[dict[str, str]], + system: Optional[str] = None, + tools: Optional[str] = None, + **input_kwargs, + ) -> AsyncGenerator[str, None]: + paired = messages + [{"role": "assistant", "content": ""}] + prompt_ids, _ = self.template.encode_oneturn(self.tokenizer, paired, system, tools) + prompt_len = len(prompt_ids) + + max_length: Optional[int] = input_kwargs.pop("max_length", None) + max_new_tokens: Optional[int] = input_kwargs.pop("max_new_tokens", None) + + if "max_new_tokens" in self.generating_args: + max_tokens = int(self.generating_args["max_new_tokens"]) + elif "max_length" in self.generating_args: + gl = int(self.generating_args["max_length"]) + max_tokens = gl - prompt_len if gl > prompt_len else 1 + else: + max_tokens = self.max_new_tokens or 256 + + if max_length is not None: + max_tokens = max(max_length - prompt_len, 1) + if max_new_tokens is not None: + max_tokens = int(max_new_tokens) + max_tokens = max(1, int(max_tokens)) + + if self.mode == "long_context": + max_len_cfg = Config().long_context_config["max_seq_len"] + need = prompt_len + max_tokens + assert max_len_cfg > need, f"please set max_seq_len > {need} in ~/.ktransformers/config.yaml" + + device = next(self.model.parameters()).device + input_tensor = torch.tensor([prompt_ids], dtype=torch.long, device=device) + if self.force_think: + think = torch.tensor( + [self.tokenizer.encode("\n", add_special_tokens=False)], + dtype=torch.long, device=device + ) + input_tensor = torch.cat([input_tensor, think], dim=1) + + use_flashinfer = ( + platform.system() != "Windows" + and getattr(self.model.config, "architectures", [""])[0] in {"DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"} + and flashinfer_enabled + and get_compute_capability() >= 8 + and device_manager.gpu_vendor == GPUVendor.NVIDIA + ) + + def make_gen(): + if use_flashinfer: + return prefill_and_generate_capture( + self.model, self.tokenizer, input_tensor, max_tokens, self.use_cuda_graph, + mode=self.mode, force_think=self.force_think, chunk_size=self.chunk_size, + use_flashinfer_mla=True, + num_heads=self.model.config.num_attention_heads, + head_dim_ckv=getattr(self.model.config, "kv_lora_rank", 0), + head_dim_kpe=getattr(self.model.config, "qk_rope_head_dim", 0), + q_head_dim=getattr(self.model.config, "qk_rope_head_dim", 0) + getattr(self.model.config, "qk_nope_head_dim", 0), + echo_stream=False, + ) + else: + return prefill_and_generate_capture( + self.model, self.tokenizer, input_tensor, max_tokens, self.use_cuda_graph, + mode=self.mode, force_think=self.force_think, chunk_size=self.chunk_size, + echo_stream=False, + ) + + loop = asyncio.get_running_loop() + q: asyncio.Queue[Optional[str]] = asyncio.Queue() + + def producer(): + try: + gen = make_gen() + if hasattr(gen, "__aiter__"): + async def drain_async(): 
+ async for t in gen: + loop.call_soon_threadsafe(q.put_nowait, t if isinstance(t, str) else str(t)) + asyncio.run(drain_async()) + elif hasattr(gen, "__iter__"): + for t in gen: + loop.call_soon_threadsafe(q.put_nowait, t if isinstance(t, str) else str(t)) + else: + loop.call_soon_threadsafe(q.put_nowait, gen if isinstance(gen, str) else str(gen)) + finally: + loop.call_soon_threadsafe(q.put_nowait, None) + + Thread(target=producer, daemon=True).start() + + while True: + item = await q.get() + if item is None: + break + yield item + + @override + async def chat( + self, + messages: list[dict[str, str]], + system: Optional[str] = None, + tools: Optional[str] = None, + images: Optional[list["ImageInput"]] = None, + videos: Optional[list["VideoInput"]] = None, + audios: Optional[list["AudioInput"]] = None, + **input_kwargs, + ) -> list["Response"]: + if not self.can_generate: + raise ValueError("The current model does not support `chat`.") + async with self.semaphore: + produced = "" + final_text = "" + async for t in self._generate(messages, system, tools, **input_kwargs): + delta = t + produced = produced + delta + if delta: + final_text += delta + + prompt_ids, _ = self.template.encode_oneturn( + self.tokenizer, messages + [{"role": "assistant", "content": ""}], system, tools + ) + return [ + Response( + response_text=final_text, + response_length=len(self.tokenizer.encode(final_text, add_special_tokens=False)), + prompt_length=len(prompt_ids), + finish_reason="stop", + ) + ] + + @override + async def stream_chat( + self, + messages: list[dict[str, str]], + system: Optional[str] = None, + tools: Optional[str] = None, + images: Optional[list["ImageInput"]] = None, + videos: Optional[list["VideoInput"]] = None, + audios: Optional[list["AudioInput"]] = None, + **input_kwargs, + ) -> AsyncGenerator[str, None]: + if not self.can_generate: + raise ValueError("The current model does not support `stream_chat`.") + async with self.semaphore: + produced = "" + async for t in self._generate(messages, system, tools, **input_kwargs): + delta = t[len(produced):] if t.startswith(produced) else t + produced = t + if delta: + yield delta + + @override + async def get_scores( + self, + batch_input: list[str], + **input_kwargs, + ) -> list[float]: + if self.can_generate: + raise ValueError("Cannot get scores using an auto-regressive model.") + args = (self.model, self.tokenizer, batch_input, input_kwargs) + async with self.semaphore: + return await asyncio.to_thread(self._get_scores, *args) diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index 83166589..c1137ac2 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -120,6 +120,7 @@ class EngineName(str, Enum): HF = "huggingface" VLLM = "vllm" SGLANG = "sglang" + KT = "ktransformers" class DownloadSource(str, Enum): diff --git a/src/llamafactory/extras/misc.py b/src/llamafactory/extras/misc.py index 4f1d40a7..7b9277ab 100644 --- a/src/llamafactory/extras/misc.py +++ b/src/llamafactory/extras/misc.py @@ -312,6 +312,8 @@ def use_openmind() -> bool: def use_ray() -> bool: return is_env_enabled("USE_RAY") +def use_kt() -> bool: + return is_env_enabled("USE_KT") def find_available_port() -> int: r"""Find an available port on the local machine.""" diff --git a/src/llamafactory/extras/packages.py b/src/llamafactory/extras/packages.py index 21265c8e..904207ae 100644 --- a/src/llamafactory/extras/packages.py +++ b/src/llamafactory/extras/packages.py @@ -82,6 +82,10 @@ def 
is_ray_available(): return _is_package_available("ray") +def is_kt_available(): + return _is_package_available("ktransformers") + + def is_requests_available(): return _is_package_available("requests") diff --git a/src/llamafactory/hparams/finetuning_args.py b/src/llamafactory/hparams/finetuning_args.py index ef690d7b..a47edff8 100644 --- a/src/llamafactory/hparams/finetuning_args.py +++ b/src/llamafactory/hparams/finetuning_args.py @@ -439,7 +439,6 @@ class SwanLabArguments: metadata={"help": "The Lark(飞书) secret for SwanLab."}, ) - @dataclass class FinetuningArguments( SwanLabArguments, diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py index 45068abd..dfb01773 100644 --- a/src/llamafactory/hparams/model_args.py +++ b/src/llamafactory/hparams/model_args.py @@ -1,4 +1,4 @@ -# Copyright 2025 HuggingFace Inc. and the LlamaFactory team. +# Copyright 2025 HuggingFace Inc., the KVCache.AI team, Approaching AI, and the LlamaFactory team. # # This code is inspired by the HuggingFace's transformers library. # https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/language-modeling/run_clm.py @@ -475,9 +475,51 @@ class SGLangArguments: self.sglang_config = _convert_str_dict(json.loads(self.sglang_config)) +@dataclass +class KTransformersArguments: + r"""Arguments pertaining to KTransformers (KT) training and inference.""" + + use_kt: bool = field( + default=False, + metadata={"help": "Whether to use KTransformers optimizations for LoRA training."}, + ) + kt_optimize_rule: Optional[str] = field( + default=None, + metadata={"help": "Path to the KTransformers optimize rule; see https://github.com/kvcache-ai/ktransformers/."}, + ) + cpu_infer: Optional[int] = field( + default=32, + metadata={"help": "Number of CPU cores used for computation."}, + ) + chunk_size: Optional[int] = field( + default=8192, + metadata={"help": "Chunk size used for CPU compute in KTransformers."}, + ) + mode: Optional[str] = field( + default="normal", + metadata={"help": "Normal or long_context mode for Llama models."}, + ) + + kt_maxlen: int = field( + default=4096, + metadata={"help": "Maximum sequence (prompt + response) length of the KT engine."}, + ) + kt_use_cuda_graph: bool = field( + default=True, + metadata={"help": "Whether to use CUDA graphs for the KT engine."}, + ) + kt_mode: str = field( + default="normal", + metadata={"help": "Normal or long_context mode for the KT engine."}, + ) + kt_force_think: bool = field( + default=False, + metadata={"help": "Whether to enable force-think for the KT engine."}, + ) + @dataclass class ModelArguments( - SGLangArguments, VllmArguments, ExportArguments, ProcessorArguments, QuantizationArguments, BaseModelArguments + SGLangArguments, VllmArguments, KTransformersArguments, ExportArguments, ProcessorArguments, QuantizationArguments, BaseModelArguments ): r"""Arguments pertaining to which model/config/tokenizer we are going to fine-tune or infer.
diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index e830d0cc..8f204e80 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -156,6 +156,9 @@ def _check_extra_dependencies( finetuning_args: "FinetuningArguments", training_args: Optional["TrainingArguments"] = None, ) -> None: + if model_args.use_kt: + check_version("ktransformers", mandatory=True) + if model_args.use_unsloth: check_version("unsloth", mandatory=True) @@ -282,13 +285,16 @@ def get_train_args(args: Optional[Union[dict[str, Any], list[str]]] = None) -> _ if model_args.shift_attn: raise ValueError("PPO training is incompatible with S^2-Attn.") + if finetuning_args.reward_model_type == "lora" and model_args.use_kt: + raise ValueError("KTransformers does not support lora reward model.") + if finetuning_args.reward_model_type == "lora" and model_args.use_unsloth: raise ValueError("Unsloth does not support lora reward model.") if training_args.report_to and training_args.report_to[0] not in ["wandb", "tensorboard"]: raise ValueError("PPO only accepts wandb or tensorboard logger.") - if training_args.parallel_mode == ParallelMode.NOT_DISTRIBUTED: + if not model_args.use_kt and training_args.parallel_mode == ParallelMode.NOT_DISTRIBUTED: raise ValueError("Please launch distributed training with `llamafactory-cli` or `torchrun`.") if training_args.deepspeed and training_args.parallel_mode != ParallelMode.DISTRIBUTED: @@ -350,6 +356,9 @@ def get_train_args(args: Optional[Union[dict[str, Any], list[str]]] = None) -> _ if model_args.use_unsloth and is_deepspeed_zero3_enabled(): raise ValueError("Unsloth is incompatible with DeepSpeed ZeRO-3.") + if model_args.use_kt and is_deepspeed_zero3_enabled(): + raise ValueError("KTransformers is incompatible with DeepSpeed ZeRO-3.") + if data_args.neat_packing and is_transformers_version_greater_than("4.53.0"): raise ValueError("Neat packing is incompatible with transformers>=4.53.0.") diff --git a/src/llamafactory/hparams/training_args.py b/src/llamafactory/hparams/training_args.py index 46b40a2d..4ab6a9cb 100644 --- a/src/llamafactory/hparams/training_args.py +++ b/src/llamafactory/hparams/training_args.py @@ -90,7 +90,6 @@ class RayArguments: elif self.ray_storage_filesystem == "gs" or self.ray_storage_filesystem == "gcs": self.ray_storage_filesystem = fs.GcsFileSystem() - @dataclass class TrainingArguments(RayArguments, BaseTrainingArguments): r"""Arguments pertaining to the trainer.""" diff --git a/src/llamafactory/launcher.py b/src/llamafactory/launcher.py index 99f2ea3e..91c3fe10 100644 --- a/src/llamafactory/launcher.py +++ b/src/llamafactory/launcher.py @@ -38,7 +38,7 @@ USAGE = ( def launch(): from .extras import logging from .extras.env import VERSION, print_env - from .extras.misc import find_available_port, get_device_count, is_env_enabled, use_ray + from .extras.misc import find_available_port, get_device_count, is_env_enabled, use_kt, use_ray logger = logging.get_logger(__name__) WELCOME = ( @@ -57,7 +57,7 @@ def launch(): if is_env_enabled("USE_MCA"): # force use torchrun os.environ["FORCE_TORCHRUN"] = "1" - if command == "train" and (is_env_enabled("FORCE_TORCHRUN") or (get_device_count() > 1 and not use_ray())): + if command == "train" and (is_env_enabled("FORCE_TORCHRUN") or (get_device_count() > 1 and not use_ray() and not use_kt())): # launch distributed training nnodes = os.getenv("NNODES", "1") node_rank = os.getenv("NODE_RANK", "0") diff --git a/src/llamafactory/model/adapter.py 
b/src/llamafactory/model/adapter.py index d9522d39..3eac99b4 100644 --- a/src/llamafactory/model/adapter.py +++ b/src/llamafactory/model/adapter.py @@ -20,6 +20,8 @@ from peft import LoraConfig, LoraModel, OFTConfig, PeftModel, TaskType, get_peft from transformers.integrations import is_deepspeed_zero3_enabled from ..extras import logging +from ..extras.constants import EngineName +from .model_utils.ktransformers import get_kt_peft_model, load_kt_peft_model from .model_utils.misc import find_all_linear_modules, find_expanded_modules from .model_utils.quantization import QuantizationMethod from .model_utils.unsloth import get_unsloth_peft_model, load_unsloth_peft_model @@ -164,6 +166,10 @@ def _setup_lora_tuning( assert len(model_args.adapter_name_or_path) == 1, "Cannot use multiple adapters in DeepSpeed ZeRO-3." is_mergeable = False + if model_args.use_kt: + assert len(model_args.adapter_name_or_path) == 1, "KTransformers currently accepts only a single adapter. For more features, please contact us." + is_mergeable = False + if model_args.use_unsloth: assert len(model_args.adapter_name_or_path) == 1, "Unsloth model only accepts a single adapter." is_mergeable = False @@ -182,6 +188,10 @@ def _setup_lora_tuning( "token": model_args.hf_hub_token, } + if model_args.use_kt: + if model_args.infer_backend != EngineName.KT: + raise ValueError("Adapters fine-tuned with KTransformers must be inferred with the ktransformers backend.") + for adapter in adapter_to_merge: model: LoraModel = PeftModel.from_pretrained(model, adapter, **init_kwargs) model = model.merge_and_unload() @@ -190,7 +200,9 @@ def _setup_lora_tuning( logger.info_rank0(f"Merged {len(adapter_to_merge)} adapter(s).") if adapter_to_resume is not None: # resume lora training - if model_args.use_unsloth: + if model_args.use_kt: + model = load_kt_peft_model(model_args, model) + elif model_args.use_unsloth: model = load_unsloth_peft_model(config, model_args, finetuning_args, is_trainable=is_trainable) else: model = PeftModel.from_pretrained(model, adapter_to_resume, is_trainable=is_trainable, **init_kwargs) @@ -203,6 +215,16 @@ def _setup_lora_tuning( else: target_modules = finetuning_args.lora_target + if model_args.use_kt: + new_list = [] + for m in target_modules: + if m in ("down_proj", "up_proj", "gate_proj"): + new_list.extend([f"mlp.{m}", f"shared_experts.{m}"]) + elif m not in ("generate_linear", "orig_module", "prefill_linear"): + new_list.append(m) + + target_modules[:] = new_list + if finetuning_args.use_llama_pro: target_modules = find_expanded_modules(model, target_modules, finetuning_args.freeze_trainable_layers) @@ -245,7 +267,21 @@ def _setup_lora_tuning( "modules_to_save": finetuning_args.additional_target, } - if model_args.use_unsloth: + if model_args.use_kt: + if finetuning_args.finetuning_type == "oft": + raise ValueError("KTransformers is currently not supported for OFT.") + if finetuning_args.finetuning_type == "lora": + peft_config = LoraConfig( + task_type=TaskType.CAUSAL_LM, + inference_mode=False, + **peft_kwargs, + ) + else: + raise ValueError("KTransformers is currently only supported for LoRA.") + + model = get_kt_peft_model(model, peft_config) + logger.info_rank0(f"KTransformers PEFT model: {model}") + elif model_args.use_unsloth: if finetuning_args.finetuning_type == "oft": raise ValueError("Unsloth is currently not supported for OFT.") diff --git a/src/llamafactory/model/loader.py b/src/llamafactory/model/loader.py index 37dffcbe..7142211d 100644 --- a/src/llamafactory/model/loader.py +++
b/src/llamafactory/model/loader.py @@ -31,6 +31,7 @@ from trl import AutoModelForCausalLMWithValueHead from ..extras import logging from ..extras.misc import count_parameters, skip_check_imports, try_download_model_from_other_hub from .adapter import init_adapter +from .model_utils.ktransformers import load_kt_pretrained_model from .model_utils.liger_kernel import apply_liger_kernel from .model_utils.misc import register_autoclass from .model_utils.mod import convert_pretrained_model_to_mod, load_mod_pretrained_model @@ -143,7 +144,11 @@ def load_model( model = None lazy_load = False - if model_args.use_unsloth: + if model_args.use_kt: + from ktransformers.sft.monkey_patch_torch_module import install_patch + install_patch() + model = load_kt_pretrained_model(config, model_args) + elif model_args.use_unsloth: if model_args.adapter_name_or_path is not None: lazy_load = True elif is_trainable: diff --git a/src/llamafactory/model/model_utils/ktransformers.py b/src/llamafactory/model/model_utils/ktransformers.py new file mode 100644 index 00000000..59adc51f --- /dev/null +++ b/src/llamafactory/model/model_utils/ktransformers.py @@ -0,0 +1,159 @@ +# Copyright 2025 the KVCache.AI team, Approaching AI, and the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import importlib.util as _u +from typing import TYPE_CHECKING, Any, Optional + +import torch + +from ...extras import logging +from ...extras.misc import get_current_device + + +if TYPE_CHECKING: + from ...hparams import FinetuningArguments, ModelArguments + +from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, PreTrainedModel + + +KT_AVAILABLE = _u.find_spec("ktransformers") is not None +if KT_AVAILABLE: + from ktransformers.models.modeling_deepseek import DeepseekV2ForCausalLM + from ktransformers.models.modeling_deepseek_v3 import DeepseekV3ForCausalLM + from ktransformers.models.modeling_llama import LlamaForCausalLM + from ktransformers.models.modeling_mixtral import MixtralForCausalLM + from ktransformers.models.modeling_qwen2_moe import Qwen2MoeForCausalLM + from ktransformers.optimize.optimize import optimize_and_load_gguf + from ktransformers.server.config.config import Config + from ktransformers.sft.lora import inject_lora_layer + from ktransformers.util.custom_loader import GGUFLoader, SafeTensorLoader + from ktransformers.util.globals import GLOBAL_CONFIG + from ktransformers.util.utils import load_weights + +logger = logging.get_logger(__name__) + +def _get_kt_kwargs( + config: "PretrainedConfig", + model_name_or_path: str, + model_args: "ModelArguments", + finetuning_args: "FinetuningArguments", +) -> dict[str, Any]: + return { + "model_name": model_name_or_path, + "max_seq_length": model_args.model_max_length or 4096, + "dtype": model_args.compute_dtype, + "load_in_4bit": model_args.quantization_bit == 4, + "token": model_args.hf_hub_token, + "full_finetuning": finetuning_args.finetuning_type == "full", + "device_map": {"": get_current_device()}, + "rope_scaling": getattr(config, "rope_scaling", None), + "fix_tokenizer": False, + "trust_remote_code": model_args.trust_remote_code, + "use_gradient_checkpointing": "ktransformers", + } + + +def load_kt_pretrained_model( + config: "PretrainedConfig", model_args: "ModelArguments" +) -> Optional["PreTrainedModel"]: + r"""Optionally load pretrained model with KTransformers. Used in training.""" + custom_models = { + "DeepseekV2ForCausalLM": DeepseekV2ForCausalLM, + "DeepseekV3ForCausalLM": DeepseekV3ForCausalLM, + "Qwen2MoeForCausalLM": Qwen2MoeForCausalLM, + "LlamaForCausalLM": LlamaForCausalLM, + "MixtralForCausalLM": MixtralForCausalLM, + } + Config().cpu_infer = model_args.cpu_infer + Config().chunk_size = model_args.chunk_size + config = AutoConfig.from_pretrained(model_args.model_name_or_path, trust_remote_code=model_args.trust_remote_code) + + if model_args.mode == "long_context": + assert config.architectures[0] == "LlamaForCausalLM", "Only LlamaForCausalLM supports long_context mode." + torch.set_default_dtype(torch.float16) + else: + torch.set_default_dtype(config.torch_dtype) + + with torch.device("meta"): + if config.architectures[0] in custom_models: + logger.info_rank0("Using the custom modeling class provided by KTransformers.") + if ( + "Qwen2Moe" in config.architectures[0] + ): # Qwen2Moe must use flash_attention_2 to avoid overflow.
+ config._attn_implementation = "flash_attention_2" + if "Llama" in config.architectures[0]: + config._attn_implementation = "eager" + if "Mixtral" in config.architectures[0]: + config._attn_implementation = "flash_attention_2" + model = custom_models[config.architectures[0]](config) + else: + attn_implementation = "flash_attention_2" + model = AutoModelForCausalLM.from_config( + config, trust_remote_code=True, attn_implementation=attn_implementation + ) + + optimize_config_path = model_args.kt_optimize_rule + gguf_path = model_args.model_name_or_path + + assert optimize_config_path is not None, "optimize_config_path must be provided (path to YAML rules file)." + assert gguf_path is not None, "gguf_path must be provided (path to a folder or .gguf file)." + + GLOBAL_CONFIG._config["mod"] = "infer" + optimize_and_load_gguf(model, optimize_config_path, gguf_path, config) + + return model + + +def get_kt_peft_model( + model: "PreTrainedModel", peft_kwargs: dict[str, Any] +) -> "PreTrainedModel": + r"""Get the peft model for the pretrained model with KTransformers. Used in training.""" + from ktransformers.sft.peft_utils.mapping import get_peft_model + + return get_peft_model(model, peft_kwargs) + + +def load_kt_peft_model( + model_args: "ModelArguments", model: "PreTrainedModel", +) -> "PreTrainedModel": + r"""Load peft model with KTransformers. Used in both training and inference.""" + load_adapter_name_or_path = model_args.adapter_name_or_path[0] + if load_adapter_name_or_path.endswith(".gguf"): + inject_lora_layer(model, load_adapter_name_or_path) + adapter_gguf_loader = GGUFLoader(load_adapter_name_or_path) + load_weights(model, adapter_gguf_loader, adapter_gguf=True) + model.train() + else: + inject_lora_layer(model, load_adapter_name_or_path) + + adapter_loader = SafeTensorLoader(load_adapter_name_or_path) + device = next(model.parameters()).device + for key in adapter_loader.tensor_file_map.keys(): + model_key = key.replace("base_model.model.", "") + model_key = model_key.replace(".weight", ".default.weight") + model_key = model_key.replace(".default.default.weight", ".default.weight") + + try: + tensor = adapter_loader.load_tensor(key, device=device) + param = model.get_parameter(model_key) + param.data.copy_(tensor.data) + logger.info_rank0(f"Loaded adapter weight: {key} -> {model_key}") + except AttributeError: + logger.warning(f"Skipping {key}: not a model parameter.") + except KeyError: + logger.warning(f"Key not found in model: {model_key} (original: {key}).") + + return model diff --git a/src/llamafactory/train/ksft/__init__.py b/src/llamafactory/train/ksft/__init__.py new file mode 100644 index 00000000..12c53f62 --- /dev/null +++ b/src/llamafactory/train/ksft/__init__.py @@ -0,0 +1,18 @@ +# Copyright 2025 the KVCache.AI team, Approaching AI, and the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +from .workflow import run_sft + + +__all__ = ["run_sft"] diff --git a/src/llamafactory/train/ksft/workflow.py b/src/llamafactory/train/ksft/workflow.py new file mode 100644 index 00000000..049a1991 --- /dev/null +++ b/src/llamafactory/train/ksft/workflow.py @@ -0,0 +1,110 @@ +# Copyright 2025 the KVCache.AI team, Approaching AI, and the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING, Optional + +from ...data import SFTDataCollatorWith4DAttentionMask, get_dataset, get_template_and_fix_tokenizer +from ...extras.constants import IGNORE_INDEX +from ...extras.logging import get_logger +from ...extras.misc import calculate_tps +from ...extras.ploting import plot_loss +from ...model import load_model, load_tokenizer +from ..trainer_utils import create_modelcard_and_push + + +if TYPE_CHECKING: + from transformers import Seq2SeqTrainingArguments, TrainerCallback + + from ...hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments + + +logger = get_logger(__name__) + + +def run_sft( + model_args: "ModelArguments", + data_args: "DataArguments", + training_args: "Seq2SeqTrainingArguments", + finetuning_args: "FinetuningArguments", + generating_args: "GeneratingArguments", + callbacks: Optional[list["TrainerCallback"]] = None, +): + tokenizer_module = load_tokenizer(model_args) + tokenizer = tokenizer_module["tokenizer"] + template = get_template_and_fix_tokenizer(tokenizer, data_args) + dataset_module = get_dataset(template, model_args, data_args, training_args, stage="sft", **tokenizer_module) + model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train) + + from ktransformers.util.globals import GLOBAL_CONFIG + GLOBAL_CONFIG._config["mod"] = "sft" + + if getattr(model, "is_quantized", False) and not training_args.do_train: + setattr(model, "_hf_peft_config_loaded", True) # hack here: make model compatible with prediction + + data_collator = SFTDataCollatorWith4DAttentionMask( + template=template, + model=model if not training_args.predict_with_generate else None, + pad_to_multiple_of=8 if training_args.do_train else None, # for shift short attention + label_pad_token_id=IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id, + block_diag_attn=model_args.block_diag_attn, + attn_implementation=getattr(model.config, "_attn_implementation", None), + compute_dtype=model_args.compute_dtype, + **tokenizer_module, + ) + + # Metric utils + metric_module = {} + if training_args.predict_with_generate: + raise NotImplementedError("`predict_with_generate` is not supported in KTransformers SFT yet. if you do need it, please open an issue.") + elif finetuning_args.compute_accuracy: + raise NotImplementedError("`compute_accuracy` is not supported in KTransformers SFT yet. 
if you do need it, please open an issue.") + + # Initialize our Trainer + from ktransformers.sft.lora import KTrainer + trainer = KTrainer( + model=model, + args=training_args, + tokenizer=tokenizer_module, + data_collator=data_collator, + callbacks=callbacks, + **dataset_module, + **metric_module, + ) + + # Training + if training_args.do_train: + model.config.use_cache = False + train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) + trainer.save_model() + if finetuning_args.include_effective_tokens_per_second: + train_result.metrics["effective_tokens_per_sec"] = calculate_tps( + dataset_module["train_dataset"], train_result.metrics, stage="sft" + ) + + trainer.log_metrics("train", train_result.metrics) + trainer.save_metrics("train", train_result.metrics) + trainer.save_state() + if trainer.is_world_process_zero() and finetuning_args.plot_loss: + keys = ["loss"] + if isinstance(dataset_module.get("eval_dataset"), dict): + keys += sum( + [[f"eval_{key}_loss", f"eval_{key}_accuracy"] for key in dataset_module["eval_dataset"].keys()], [] + ) + else: + keys += ["eval_loss", "eval_accuracy"] + + plot_loss(training_args.output_dir, keys=keys) + + # Create model card + create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args) diff --git a/src/llamafactory/train/trainer_utils.py b/src/llamafactory/train/trainer_utils.py index 80a46397..66bff5f1 100644 --- a/src/llamafactory/train/trainer_utils.py +++ b/src/llamafactory/train/trainer_utils.py @@ -100,6 +100,9 @@ def create_modelcard_and_push( if model_args.use_unsloth: kwargs["tags"] = kwargs["tags"] + ["unsloth"] + if model_args.use_kt: + kwargs["tags"] = kwargs["tags"] + ["ktransformers"] + if not training_args.do_train: pass elif training_args.push_to_hub: diff --git a/src/llamafactory/train/tuner.py b/src/llamafactory/train/tuner.py index f8b84107..47fce654 100644 --- a/src/llamafactory/train/tuner.py +++ b/src/llamafactory/train/tuner.py @@ -1,4 +1,4 @@ -# Copyright 2025 the LlamaFactory team. +# Copyright 2025 the KVCache.AI team, Approaching AI, and the LlamaFactory team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -24,7 +24,7 @@ from ..data import get_template_and_fix_tokenizer from ..extras import logging from ..extras.constants import V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME from ..extras.misc import infer_optim_dtype -from ..extras.packages import is_mcore_adapter_available, is_ray_available +from ..extras.packages import is_kt_available, is_mcore_adapter_available, is_ray_available from ..hparams import get_infer_args, get_ray_args, get_train_args, read_args from ..model import load_model, load_tokenizer from .callbacks import LogCallback, PissaConvertCallback, ReporterCallback @@ -85,6 +85,12 @@ def _training_function(config: dict[str, Any]) -> None: elif finetuning_args.stage == "pt": run_pt(model_args, data_args, training_args, finetuning_args, callbacks) elif finetuning_args.stage == "sft": + if model_args.use_kt: + if not is_kt_available(): + raise ImportError("KTransformers is not installed. 
Please install it with `pip install ktransformers`.") + from .ksft.workflow import run_sft as run_sft_kt + run_sft_kt(model_args, data_args, training_args, finetuning_args, generating_args, callbacks) + else: + run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks) elif finetuning_args.stage == "rm": run_rm(model_args, data_args, training_args, finetuning_args, callbacks)