diff --git a/examples/ktransformers/accelerate/fsdp2_kt_bf16.yaml b/examples/ktransformers/accelerate/fsdp2_kt_bf16.yaml
new file mode 100644
index 000000000..d5548289a
--- /dev/null
+++ b/examples/ktransformers/accelerate/fsdp2_kt_bf16.yaml
@@ -0,0 +1,25 @@
+compute_environment: LOCAL_MACHINE
+distributed_type: FSDP
+fsdp_config:
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_offload_params: false
+  fsdp_reshard_after_forward: true
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_version: 2
+mixed_precision: bf16
+num_machines: 1
+num_processes: 4 # Adjust based on your GPU count; 4 is suitable for 4 GPUs
+rdzv_backend: static
+same_network: true
+use_cpu: false
+
+kt_config:
+  enabled: true
+  kt_backend: AMXBF16 # Use with original BF16 expert weights.
+  kt_num_threads: 96
+  kt_tp_enabled: true
+  kt_threadpool_count: 2
+  kt_max_cache_depth: 2
+  kt_share_backward_bb: true
+  lora_rank: 8
diff --git a/examples/ktransformers/accelerate/fsdp2_kt_int4.yaml b/examples/ktransformers/accelerate/fsdp2_kt_int4.yaml
new file mode 100644
index 000000000..ec20e76c6
--- /dev/null
+++ b/examples/ktransformers/accelerate/fsdp2_kt_int4.yaml
@@ -0,0 +1,25 @@
+compute_environment: LOCAL_MACHINE
+distributed_type: FSDP
+fsdp_config:
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_offload_params: false
+  fsdp_reshard_after_forward: true
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_version: 2
+mixed_precision: bf16
+num_machines: 1
+num_processes: 4 # Adjust based on your GPU count; 4 is suitable for 4 GPUs
+rdzv_backend: static
+same_network: true
+use_cpu: false
+
+kt_config:
+  enabled: true
+  kt_backend: AMXINT4 # Use with online-converted INT4 expert weights
+  kt_num_threads: 96
+  kt_tp_enabled: true
+  kt_threadpool_count: 2
+  kt_max_cache_depth: 2
+  kt_share_backward_bb: true
+  lora_rank: 8
diff --git a/examples/ktransformers/accelerate/fsdp2_kt_int8.yaml b/examples/ktransformers/accelerate/fsdp2_kt_int8.yaml
new file mode 100644
index 000000000..733f2c39a
--- /dev/null
+++ b/examples/ktransformers/accelerate/fsdp2_kt_int8.yaml
@@ -0,0 +1,25 @@
+compute_environment: LOCAL_MACHINE
+distributed_type: FSDP
+fsdp_config:
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_offload_params: false
+  fsdp_reshard_after_forward: true
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_version: 2
+mixed_precision: bf16
+num_machines: 1
+num_processes: 4 # Adjust based on your GPU count; 4 is suitable for 4 GPUs
+rdzv_backend: static
+same_network: true
+use_cpu: false
+
+kt_config:
+  enabled: true
+  kt_backend: AMXINT8 # Use with online-converted INT8 expert weights
+  kt_num_threads: 96
+  kt_tp_enabled: true
+  kt_threadpool_count: 2
+  kt_max_cache_depth: 2
+  kt_share_backward_bb: true
+  lora_rank: 8
diff --git a/examples/ktransformers/accelerate/fsdp2_kt_int8_1gpu.yaml b/examples/ktransformers/accelerate/fsdp2_kt_int8_1gpu.yaml
new file mode 100644
index 000000000..a76ceae3e
--- /dev/null
+++ b/examples/ktransformers/accelerate/fsdp2_kt_int8_1gpu.yaml
@@ -0,0 +1,25 @@
+compute_environment: LOCAL_MACHINE
+distributed_type: FSDP
+fsdp_config:
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_offload_params: false
+  fsdp_reshard_after_forward: true
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_version: 2
+mixed_precision: bf16
+num_machines: 1
+num_processes: 1 # Adjust based on your GPU count; 1 is suitable for 1 GPU
+rdzv_backend: static
+same_network: true
+use_cpu: false
+
+kt_config:
+  enabled: true
+  kt_backend: AMXINT8 # Use with online-converted INT8 expert weights
+  kt_num_threads: 96
+  kt_tp_enabled: true
+  kt_threadpool_count: 2
+  kt_max_cache_depth: 2
+  kt_share_backward_bb: true
+  lora_rank: 8
diff --git a/examples/ktransformers/accelerate/fsdp2_kt_int8_8gpu.yaml b/examples/ktransformers/accelerate/fsdp2_kt_int8_8gpu.yaml
new file mode 100644
index 000000000..7ba10cd48
--- /dev/null
+++ b/examples/ktransformers/accelerate/fsdp2_kt_int8_8gpu.yaml
@@ -0,0 +1,25 @@
+compute_environment: LOCAL_MACHINE
+distributed_type: FSDP
+fsdp_config:
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_offload_params: false
+  fsdp_reshard_after_forward: true
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_version: 2
+mixed_precision: bf16
+num_machines: 1
+num_processes: 8 # Adjust based on your GPU count; 8 is suitable for 8 GPUs
+rdzv_backend: static
+same_network: true
+use_cpu: false
+
+kt_config:
+  enabled: true
+  kt_backend: AMXINT8 # Use with online-converted INT8 expert weights
+  kt_num_threads: 96
+  kt_tp_enabled: true
+  kt_threadpool_count: 2
+  kt_max_cache_depth: 2
+  kt_share_backward_bb: true
+  lora_rank: 8
diff --git a/examples/ktransformers/infer_lora/deepseek2_lora_sft_kt.yaml b/examples/ktransformers/infer_lora/deepseek2_lora_sft_kt.yaml
deleted file mode 100644
index cd10c83f1..000000000
--- a/examples/ktransformers/infer_lora/deepseek2_lora_sft_kt.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-model_name_or_path: deepseek-ai/DeepSeek-V2-Lite
-adapter_name_or_path: saves/Kllama_deepseekV2
-template: deepseek
-infer_backend: ktransformers # choices: [huggingface, vllm, sglang, ktransformers]
-trust_remote_code: true
-
-use_kt: true # use KTransformers as LoRA sft backend to inference
-kt_optimize_rule: examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml
-cpu_infer: 32
-chunk_size: 8192
diff --git a/examples/ktransformers/infer_lora/deepseek3_kt.yaml b/examples/ktransformers/infer_lora/deepseek3_kt.yaml
deleted file mode 100644
index 6534d4078..000000000
--- a/examples/ktransformers/infer_lora/deepseek3_kt.yaml
+++ /dev/null
@@ -1,9 +0,0 @@
-model_name_or_path: opensourcerelease/DeepSeek-V3-bf16
-template: deepseek3
-infer_backend: ktransformers # choices: [huggingface, vllm, sglang, ktransformers]
-trust_remote_code: true
-
-use_kt: true # use KTransformers as LoRA sft backend to inference
-kt_optimize_rule: examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
-cpu_infer: 32
-chunk_size: 8192
diff --git a/examples/ktransformers/infer_lora/deepseek3_lora_sft_kt.yaml b/examples/ktransformers/infer_lora/deepseek3_lora_sft_kt.yaml
deleted file mode 100644
index a7171eb4a..000000000
--- a/examples/ktransformers/infer_lora/deepseek3_lora_sft_kt.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-model_name_or_path: opensourcerelease/DeepSeek-V3-bf16
-adapter_name_or_path: saves/Kllama_deepseekV3
-template: deepseek3
-infer_backend: ktransformers # choices: [huggingface, vllm, sglang, ktransformers]
-trust_remote_code: true
-
-use_kt: true # use KTransformers as LoRA sft backend to inference
-kt_optimize_rule: examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
-cpu_infer: 32
-chunk_size: 8192
diff --git a/examples/ktransformers/infer_lora/qwen3moe_lora_sft_kt.yaml b/examples/ktransformers/infer_lora/qwen3moe_lora_sft_kt.yaml
deleted file mode 100644
index c71321c33..000000000
---
a/examples/ktransformers/infer_lora/qwen3moe_lora_sft_kt.yaml +++ /dev/null @@ -1,10 +0,0 @@ -model_name_or_path: Qwen/Qwen3-235B-A22B-Instruct-2507 -adapter_name_or_path: saves/Kllama_Qwen3MoE_235bA22b -template: qwen3_nothink -infer_backend: ktransformers # choices: [huggingface, vllm, sglang, ktransformers] -trust_remote_code: true - -use_kt: true # use KTransformers as LoRA sft backend to inference -kt_optimize_rule: examples/ktransformers/kt_optimize_rules/Qwen3Moe-sft-amx.yaml -cpu_infer: 32 -chunk_size: 8192 diff --git a/examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Chat-sft-amx.yaml b/examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Chat-sft-amx.yaml deleted file mode 100644 index 626e76748..000000000 --- a/examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Chat-sft-amx.yaml +++ /dev/null @@ -1,69 +0,0 @@ -- match: - class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding - replace: - class: ktransformers.operators.RoPE.YarnRotaryEmbedding - kwargs: - generate_device: "cuda" - prefill_device: "cuda" -- match: - name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression - class: torch.nn.Linear # only match modules matching name and class simultaneously - replace: - class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types - kwargs: - generate_device: "cuda" - prefill_device: "cuda" - generate_op: "KLinearTorch" - prefill_op: "KLinearTorch" - -- match: - name: "^lm_head" - class: torch.nn.Linear - replace: - class: ktransformers.operators.linear.KTransformersLinear - kwargs: - generate_device: "cuda" - prefill_device: "cuda" - generate_op: "KLinearTorch" - prefill_op: "KLinearTorch" - -- match: - name: "^model\\.layers\\..*\\.mlp$" - class: ktransformers.models.modeling_deepseek.DeepseekV2MoE - replace: - class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function - kwargs: - generate_device: "cuda" - prefill_device: "cuda" -- match: - name: "^model\\.layers\\..*\\.mlp\\.experts$" - replace: - class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism - kwargs: - prefill_device: "cuda" - prefill_op: "KExpertsTorch" - generate_device: "cpu" - generate_op: "KSFTExpertsCPU" - out_device: "cuda" - backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default) - recursive: False # don't recursively inject submodules of this module -- match: - name: "^model\\.layers\\..*\\.self_attn$" - replace: - class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation - kwargs: - generate_device: "cuda" - prefill_device: "cuda" -- match: - name: "^model$" - replace: - class: "ktransformers.operators.models.KDeepseekV2Model" - kwargs: - per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill -- match: - name: "^model.embed_tokens" - replace: - class: "default" - kwargs: - generate_device: "cpu" - prefill_device: "cpu" diff --git a/examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Chat.yaml b/examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Chat.yaml deleted file mode 100644 index e9bbe675e..000000000 --- a/examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Chat.yaml +++ /dev/null @@ -1,68 +0,0 @@ -- match: - class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding - replace: - class: ktransformers.operators.RoPE.YarnRotaryEmbedding - kwargs: - generate_device: "cuda" - prefill_device: "cuda" -- match: - name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" 
# regular expression - class: torch.nn.Linear # only match modules matching name and class simultaneously - replace: - class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types - kwargs: - generate_device: "cuda" - prefill_device: "cuda" - generate_op: "KLinearMarlin" - prefill_op: "KLinearTorch" - -- match: - name: "^lm_head" - class: torch.nn.Linear - replace: - class: ktransformers.operators.linear.KTransformersLinear - kwargs: - generate_device: "cuda" - prefill_device: "cuda" - generate_op: "KLinearMarlin" - prefill_op: "KLinearTorch" - -- match: - name: "^model\\.layers\\..*\\.mlp$" - class: ktransformers.models.modeling_deepseek.DeepseekV2MoE - replace: - class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function - kwargs: - generate_device: "cuda" - prefill_device: "cuda" -- match: - name: "^model\\.layers\\..*\\.mlp\\.experts$" - replace: - class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism - kwargs: - prefill_device: "cuda" - prefill_op: "KExpertsTorch" - generate_device: "cpu" - generate_op: "KExpertsCPU" - out_device: "cuda" - recursive: False # don't recursively inject submodules of this module -- match: - name: "^model\\.layers\\..*\\.self_attn$" - replace: - class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation - kwargs: - generate_device: "cuda" - prefill_device: "cuda" -- match: - name: "^model$" - replace: - class: "ktransformers.operators.models.KDeepseekV2Model" - kwargs: - per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill -- match: - name: "^model.embed_tokens" - replace: - class: "default" - kwargs: - generate_device: "cpu" - prefill_device: "cpu" diff --git a/examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx-multi-gpu.yaml b/examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx-multi-gpu.yaml deleted file mode 100644 index f77a540e1..000000000 --- a/examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx-multi-gpu.yaml +++ /dev/null @@ -1,139 +0,0 @@ -- match: - name: "^model.embed_tokens" - replace: - class: "default" - kwargs: - generate_device: "cpu" - prefill_device: "cpu" - -- match: - name: "^model\\.layers\\.(0|[1-9])\\." - class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding - replace: - class: ktransformers.operators.RoPE.YarnRotaryEmbedding - kwargs: - generate_device: "cuda:0" - prefill_device: "cuda:0" -- match: - name: "^model\\.layers\\.([12][0-9])\\." 
- class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding - replace: - class: ktransformers.operators.RoPE.YarnRotaryEmbedding - kwargs: - generate_device: "cuda:1" - prefill_device: "cuda:1" - -- match: - name: "^model\\.layers\\.(0|[1-9])\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression - class: torch.nn.Linear # only match modules matching name and class simultaneously - replace: - class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types - kwargs: - generate_device: "cuda:0" - prefill_device: "cuda:0" - generate_op: "KLinearTorch" - prefill_op: "KLinearTorch" - -- match: - name: "^model\\.layers\\.([12][0-9])\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression - class: torch.nn.Linear # only match modules matching name and class simultaneously - replace: - class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types - kwargs: - generate_device: "cuda:1" - prefill_device: "cuda:1" - generate_op: "KLinearTorch" - prefill_op: "KLinearTorch" - -- match: - name: "^model\\.layers\\.(0|[1-9])\\.mlp$" - class: ktransformers.models.modeling_deepseek.DeepseekV2MoE - replace: - class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function - kwargs: - generate_device: "cuda:0" - prefill_device: "cuda:0" -- match: - name: "^model\\.layers\\.([12][0-9])\\.mlp$" - class: ktransformers.models.modeling_deepseek.DeepseekV2MoE - replace: - class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function - kwargs: - generate_device: "cuda:1" - prefill_device: "cuda:1" - -- match: - name: "^model\\.layers\\.(0|[1-9])\\.mlp\\.experts$" - replace: - class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism - kwargs: - prefill_device: "cuda:0" - prefill_op: "KExpertsTorch" - generate_device: "cpu" - generate_op: "KSFTExpertsCPU" - out_device: "cuda:0" - backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default) - recursive: False # don't recursively inject submodules of this module - -- match: - name: "^model\\.layers\\.([12][0-9])\\.mlp\\.experts$" - replace: - class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism - kwargs: - prefill_device: "cuda:1" - prefill_op: "KExpertsTorch" - generate_device: "cpu" - generate_op: "KSFTExpertsCPU" - out_device: "cuda:1" - backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default) - recursive: False # don't recursively inject submodules of this module - -- match: - name: "^model\\.layers\\.(0|[1-9])\\.self_attn$" - replace: - class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation - kwargs: - generate_device: "cuda:0" - prefill_device: "cuda:0" -- match: - name: "^model\\.layers\\.([12][0-9])\\.self_attn$" - replace: - class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation - kwargs: - generate_device: "cuda:1" - prefill_device: "cuda:1" -- match: - name: "^model$" - replace: - class: "ktransformers.operators.models.KDeepseekV2Model" - kwargs: - per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill - transfer_map: - 10: "cuda:1" - -- match: - name: "^model\\.layers\\.(0|[1-9])\\." 
- replace: - class: "default" - kwargs: - generate_device: "cuda:0" - prefill_device: "cuda:0" - -- match: - name: "^lm_head" - class: torch.nn.Linear - replace: - class: ktransformers.operators.linear.KTransformersLinear - kwargs: - generate_device: "cuda:1" - prefill_device: "cuda:1" - generate_op: "KLinearTorch" - prefill_op: "KLinearTorch" - -- match: - name: "(^model\\.layers\\.([12][0-9])\\.)|(model.norm)" - replace: - class: "default" - kwargs: - generate_device: "cuda:1" - prefill_device: "cuda:1" diff --git a/examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml b/examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml deleted file mode 100644 index 42343fd55..000000000 --- a/examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml +++ /dev/null @@ -1,69 +0,0 @@ -- match: - class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding - replace: - class: ktransformers.operators.RoPE.YarnRotaryEmbedding - kwargs: - generate_device: "cuda" - prefill_device: "cuda" -- match: - name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression - class: torch.nn.Linear # only match modules matching name and class simultaneously - replace: - class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types - kwargs: - generate_device: "cuda" - prefill_device: "cuda" - generate_op: "KLinearTorch" - prefill_op: "KLinearTorch" - -- match: - name: "^lm_head" - class: torch.nn.Linear - replace: - class: ktransformers.operators.linear.KTransformersLinear - kwargs: - generate_device: "cuda" - prefill_device: "cuda" - generate_op: "KLinearTorch" - prefill_op: "KLinearTorch" - -- match: - name: "^model\\.layers\\..*\\.mlp$" - class: ktransformers.models.modeling_deepseek.DeepseekV2MoE - replace: - class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function - kwargs: - generate_device: "cuda" - prefill_device: "cuda" -- match: - name: "^model\\.layers\\..*\\.mlp\\.experts$" - replace: - class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism - kwargs: - prefill_device: "cpu" - prefill_op: "KExpertsTorch" - generate_device: "cpu" - generate_op: "KSFTExpertsCPU" - out_device: "cuda" - backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default) - recursive: False # don't recursively inject submodules of this module -- match: - name: "^model\\.layers\\..*\\.self_attn$" - replace: - class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation - kwargs: - generate_device: "cuda" - prefill_device: "cuda" -- match: - name: "^model$" - replace: - class: "ktransformers.operators.models.KDeepseekV2Model" - kwargs: - per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill -- match: - name: "^model.embed_tokens" - replace: - class: "default" - kwargs: - generate_device: "cpu" - prefill_device: "cpu" diff --git a/examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft.yaml b/examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft.yaml deleted file mode 100644 index 135bb7654..000000000 --- a/examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft.yaml +++ /dev/null @@ -1,68 +0,0 @@ -- match: - class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding - replace: - class: ktransformers.operators.RoPE.YarnRotaryEmbedding - kwargs: - generate_device: "cuda" - prefill_device: "cuda" -- match: - name: 
"^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression - class: torch.nn.Linear # only match modules matching name and class simultaneously - replace: - class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types - kwargs: - generate_device: "cuda" - prefill_device: "cuda" - generate_op: "KLinearTorch" - prefill_op: "KLinearTorch" - -- match: - name: "^lm_head" - class: torch.nn.Linear - replace: - class: ktransformers.operators.linear.KTransformersLinear - kwargs: - generate_device: "cuda" - prefill_device: "cuda" - generate_op: "KLinearTorch" - prefill_op: "KLinearTorch" - -- match: - name: "^model\\.layers\\..*\\.mlp$" - class: ktransformers.models.modeling_deepseek.DeepseekV2MoE - replace: - class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function - kwargs: - generate_device: "cuda" - prefill_device: "cuda" -- match: - name: "^model\\.layers\\..*\\.mlp\\.experts$" - replace: - class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism - kwargs: - prefill_device: "cpu" - prefill_op: "KExpertsTorch" - generate_device: "cpu" - generate_op: "KSFTExpertsCPU" - out_device: "cuda" - recursive: False # don't recursively inject submodules of this module -- match: - name: "^model\\.layers\\..*\\.self_attn$" - replace: - class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation - kwargs: - generate_device: "cuda" - prefill_device: "cuda" -- match: - name: "^model$" - replace: - class: "ktransformers.operators.models.KDeepseekV2Model" - kwargs: - per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill -- match: - name: "^model.embed_tokens" - replace: - class: "default" - kwargs: - generate_device: "cpu" - prefill_device: "cpu" diff --git a/examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat.yaml b/examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat.yaml deleted file mode 100644 index e9bbe675e..000000000 --- a/examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat.yaml +++ /dev/null @@ -1,68 +0,0 @@ -- match: - class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding - replace: - class: ktransformers.operators.RoPE.YarnRotaryEmbedding - kwargs: - generate_device: "cuda" - prefill_device: "cuda" -- match: - name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression - class: torch.nn.Linear # only match modules matching name and class simultaneously - replace: - class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types - kwargs: - generate_device: "cuda" - prefill_device: "cuda" - generate_op: "KLinearMarlin" - prefill_op: "KLinearTorch" - -- match: - name: "^lm_head" - class: torch.nn.Linear - replace: - class: ktransformers.operators.linear.KTransformersLinear - kwargs: - generate_device: "cuda" - prefill_device: "cuda" - generate_op: "KLinearMarlin" - prefill_op: "KLinearTorch" - -- match: - name: "^model\\.layers\\..*\\.mlp$" - class: ktransformers.models.modeling_deepseek.DeepseekV2MoE - replace: - class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function - kwargs: - generate_device: "cuda" - prefill_device: "cuda" -- match: - name: "^model\\.layers\\..*\\.mlp\\.experts$" - replace: - class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism - kwargs: - prefill_device: "cuda" - prefill_op: "KExpertsTorch" - 
generate_device: "cpu" - generate_op: "KExpertsCPU" - out_device: "cuda" - recursive: False # don't recursively inject submodules of this module -- match: - name: "^model\\.layers\\..*\\.self_attn$" - replace: - class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation - kwargs: - generate_device: "cuda" - prefill_device: "cuda" -- match: - name: "^model$" - replace: - class: "ktransformers.operators.models.KDeepseekV2Model" - kwargs: - per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill -- match: - name: "^model.embed_tokens" - replace: - class: "default" - kwargs: - generate_device: "cpu" - prefill_device: "cpu" diff --git a/examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-amx.yaml b/examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-amx.yaml deleted file mode 100644 index 0ebe79e1e..000000000 --- a/examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-amx.yaml +++ /dev/null @@ -1,77 +0,0 @@ -- match: - class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding - replace: - class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3 - kwargs: - generate_device: "cuda" - prefill_device: "cuda" - -- match: - name: "^lm_head$" # regular expression - class: torch.nn.Linear # only match modules matching name and class simultaneously - replace: - class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types - kwargs: - generate_device: "cuda" - prefill_device: "cuda" - generate_op: "KLinearMarlin" - prefill_op: "KLinearTorch" - -- match: - name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression - class: torch.nn.Linear # only match modules matching name and class simultaneously - replace: - class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types - kwargs: - generate_device: "cuda" - prefill_device: "cuda" - generate_op: "KLinearMarlin" - prefill_op: "KLinearTorch" -- match: - name: "^model\\.layers\\..*\\.mlp$" - class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE - replace: - class: ktransformers.operators.experts.KDeepseekV3MoE # mlp module with custom forward function - kwargs: - generate_device: "cuda" - prefill_device: "cuda" -- match: - class: ktransformers.models.modeling_deepseek_v3.MoEGate - replace: - class: ktransformers.operators.gate.KMoEGate - kwargs: - generate_device: "cuda:0" - prefill_device: "cuda:0" -- match: - name: "^model\\.layers\\..*\\.mlp\\.experts$" - replace: - class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism - kwargs: - prefill_device: "cuda" - prefill_op: "KExpertsTorch" - generate_device: "cpu" - generate_op: "KExpertsCPU" - out_device: "cuda" - backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default) - recursive: False # don't recursively inject submodules of this module -- match: - name: "^model\\.layers\\..*\\.self_attn$" - replace: - class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation - kwargs: - generate_device: "cuda" - prefill_device: "cuda" - absorb_for_prefill: False # change this to True to enable long context(prefill may slower). 
-- match: - name: "^model$" - replace: - class: "ktransformers.operators.models.KDeepseekV2Model" - kwargs: - per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill -- match: - name: "^model.embed_tokens" - replace: - class: "default" - kwargs: - generate_device: "cpu" - prefill_device: "cpu" diff --git a/examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu-4.yaml b/examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu-4.yaml deleted file mode 100644 index 4eda68c87..000000000 --- a/examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu-4.yaml +++ /dev/null @@ -1,392 +0,0 @@ -- match: - name: "^model.embed_tokens" - replace: - class: "default" - kwargs: - generate_device: "cpu" - prefill_device: "cpu" - -# === Rotary Embedding Replacement === - -# GPU 0: layers 0–14 -- match: - name: "^model\\.layers\\.([0-9]|1[0-4])\\." - class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding - replace: - class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3 - kwargs: - generate_device: "cuda:0" - prefill_device: "cuda:0" - -# GPU 1: layers 15–29 -- match: - name: "^model\\.layers\\.(1[5-9]|2[0-9])\\." - class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding - replace: - class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3 - kwargs: - generate_device: "cuda:1" - prefill_device: "cuda:1" - -# GPU 2: layers 30–44 -- match: - name: "^model\\.layers\\.(3[0-9]|4[0-4])\\." - class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding - replace: - class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3 - kwargs: - generate_device: "cuda:2" - prefill_device: "cuda:2" - -# GPU 3: layers 45–60 -- match: - name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\." 
- class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding - replace: - class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3 - kwargs: - generate_device: "cuda:3" - prefill_device: "cuda:3" - -# === Linear Layers Replacement (excluding self_attn.kv_b_proj) === - -# GPU 0: layers 0–14 -- match: - name: "^model\\.layers\\.([0-9]|1[0-4])\\.(?!self_attn\\.kv_b_proj).*$" - class: torch.nn.Linear - replace: - class: ktransformers.operators.linear.KTransformersLinear - kwargs: - generate_device: "cuda:0" - prefill_device: "cuda:0" - generate_op: "KLinearTorch" - prefill_op: "KLinearTorch" - -# GPU 1: layers 15–29 -- match: - name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.(?!self_attn\\.kv_b_proj).*$" - class: torch.nn.Linear - replace: - class: ktransformers.operators.linear.KTransformersLinear - kwargs: - generate_device: "cuda:1" - prefill_device: "cuda:1" - generate_op: "KLinearTorch" - prefill_op: "KLinearTorch" - -# GPU 2: layers 30–44 -- match: - name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.(?!self_attn\\.kv_b_proj).*$" - class: torch.nn.Linear - replace: - class: ktransformers.operators.linear.KTransformersLinear - kwargs: - generate_device: "cuda:2" - prefill_device: "cuda:2" - generate_op: "KLinearTorch" - prefill_op: "KLinearTorch" - -# GPU 3: layers 45–60 -- match: - name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.(?!self_attn\\.kv_b_proj).*$" - class: torch.nn.Linear - replace: - class: ktransformers.operators.linear.KTransformersLinear - kwargs: - generate_device: "cuda:3" - prefill_device: "cuda:3" - generate_op: "KLinearTorch" - prefill_op: "KLinearTorch" - -# === MLP (MoE) Replacement === - -# GPU 0: layers 0–14 -- match: - name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp$" - class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE - replace: - class: ktransformers.operators.experts.KDeepseekV3MoE - kwargs: - generate_device: "cuda:0" - prefill_device: "cuda:0" - -# GPU 1: layers 15–29 -- match: - name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp$" - class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE - replace: - class: ktransformers.operators.experts.KDeepseekV3MoE - kwargs: - generate_device: "cuda:1" - prefill_device: "cuda:1" - -# GPU 2: layers 30–44 -- match: - name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp$" - class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE - replace: - class: ktransformers.operators.experts.KDeepseekV3MoE - kwargs: - generate_device: "cuda:2" - prefill_device: "cuda:2" - -# GPU 3: layers 45–60 -- match: - name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp$" - class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE - replace: - class: ktransformers.operators.experts.KDeepseekV3MoE - kwargs: - generate_device: "cuda:3" - prefill_device: "cuda:3" - -# === MLP Gate Replacement === - -# GPU 0: layers 0–14 -- match: - name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp\\.gate$" - class: ktransformers.models.modeling_deepseek_v3.MoEGate - replace: - class: ktransformers.operators.gate.KMoEGate - kwargs: - generate_device: "cuda:0" - prefill_device: "cuda:0" - -# GPU 1: layers 15–29 -- match: - name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp\\.gate$" - class: ktransformers.models.modeling_deepseek_v3.MoEGate - replace: - class: ktransformers.operators.gate.KMoEGate - kwargs: - generate_device: "cuda:1" - prefill_device: "cuda:1" - -# GPU 2: layers 30–44 -- match: - name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp\\.gate$" - class: ktransformers.models.modeling_deepseek_v3.MoEGate - replace: - class: 
ktransformers.operators.gate.KMoEGate - kwargs: - generate_device: "cuda:2" - prefill_device: "cuda:2" - -# GPU 3: layers 45–60 -- match: - name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp\\.gate$" - class: ktransformers.models.modeling_deepseek_v3.MoEGate - replace: - class: ktransformers.operators.gate.KMoEGate - kwargs: - generate_device: "cuda:3" - prefill_device: "cuda:3" - -# === MLP Experts Replacement === -# replace with marlin expert. Open and modify layer-num as needed. -# Each layer of malin experts takes about 6GB of GPU memory. -# !!!Do remember 'close' cuda graph if you are using marlin expert.!!! -# !!!KExpertsTorch is untested, we don't have enough VRAM.!!! - -# GPU 0: layers 3–4 -# - match: -# name: "^model\\.layers\\.([3-4])\\.mlp\\.experts$" -# replace: -# class: ktransformers.operators.experts.KTransformersExperts -# kwargs: -# generate_device: "cuda:0" -# generate_op: "KExpertsMarlin" -# recursive: False - -# # GPU 1: layers 15–17 -# - match: -# name: "^model\\.layers\\.(1[5-7])\\.mlp\\.experts$" -# replace: -# class: ktransformers.operators.experts.KTransformersExperts -# kwargs: -# generate_device: "cuda:1" -# generate_op: "KExpertsMarlin" -# recursive: False - -# # GPU 2: layers 30–32 -# - match: -# name: "^model\\.layers\\.(3[0-2])\\.mlp\\.experts$" -# replace: -# class: ktransformers.operators.experts.KTransformersExperts -# kwargs: -# generate_device: "cuda:2" -# generate_op: "KExpertsMarlin" -# recursive: False - -# # GPU 3: layers 45–46 -# - match: -# name: "^model\\.layers\\.(4[5-6])\\.mlp\\.experts$" -# replace: -# class: ktransformers.operators.experts.KTransformersExperts -# kwargs: -# generate_device: "cuda:3" -# generate_op: "KExpertsMarlin" -# recursive: False - - -# === MLP Experts Replacement === - -# GPU 0: layers 0–14 -- match: - name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp\\.experts$" - replace: - class: ktransformers.operators.experts.KTransformersExperts - kwargs: - prefill_device: "cuda:0" - prefill_op: "KExpertsTorch" - generate_device: "cpu" - generate_op: "KSFTExpertsCPU" - out_device: "cuda:0" - backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default) - recursive: False - -# GPU 1: layers 15–29 -- match: - name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp\\.experts$" - replace: - class: ktransformers.operators.experts.KTransformersExperts - kwargs: - prefill_device: "cuda:1" - prefill_op: "KExpertsTorch" - generate_device: "cpu" - generate_op: "KSFTExpertsCPU" - out_device: "cuda:1" - backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default) - recursive: False - -# GPU 2: layers 30–44 -- match: - name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp\\.experts$" - replace: - class: ktransformers.operators.experts.KTransformersExperts - kwargs: - prefill_device: "cuda:2" - prefill_op: "KExpertsTorch" - generate_device: "cpu" - generate_op: "KSFTExpertsCPU" - out_device: "cuda:2" - backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default) - recursive: False - -# GPU 3: layers 45–60 -- match: - name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp\\.experts$" - replace: - class: ktransformers.operators.experts.KTransformersExperts - kwargs: - prefill_device: "cuda:3" - prefill_op: "KExpertsTorch" - generate_device: "cpu" - generate_op: "KSFTExpertsCPU" - out_device: "cuda:3" - backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default) - recursive: False - -# === Self-Attention Replacement === - -# GPU 0: layers 0–14 -- match: - name: "^model\\.layers\\.([0-9]|1[0-4])\\.self_attn$" - replace: - class: ktransformers.operators.attention.KDeepseekV2Attention 
- kwargs: - generate_device: "cuda:0" - prefill_device: "cuda:0" - absorb_for_prefill: False - -# GPU 1: layers 15–29 -- match: - name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.self_attn$" - replace: - class: ktransformers.operators.attention.KDeepseekV2Attention - kwargs: - generate_device: "cuda:1" - prefill_device: "cuda:1" - absorb_for_prefill: False - -# GPU 2: layers 30–44 -- match: - name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.self_attn$" - replace: - class: ktransformers.operators.attention.KDeepseekV2Attention - kwargs: - generate_device: "cuda:2" - prefill_device: "cuda:2" - absorb_for_prefill: False - -# GPU 3: layers 45–60 -- match: - name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.self_attn$" - replace: - class: ktransformers.operators.attention.KDeepseekV2Attention - kwargs: - generate_device: "cuda:3" - prefill_device: "cuda:3" - absorb_for_prefill: False - -# === Overall Model Replacement with Transfer Map === - -- match: - name: "^model$" - replace: - class: "ktransformers.operators.models.KDeepseekV2Model" - kwargs: - per_layer_prefill_intput_threshold: 0 # 0 means close layer‐wise prefill - transfer_map: - 15: "cuda:1" # Layers 15+ on GPU 1 - 30: "cuda:2" # Layers 30+ on GPU 2 - 45: "cuda:3" # Layers 45+ on GPU 3 - -# === Default Catch-All for Other Modules === - -# GPU 0: layers 0–14 -- match: - name: "^model\\.layers\\.([0-9]|1[0-4])\\." - replace: - class: "default" - kwargs: - generate_device: "cuda:0" - prefill_device: "cuda:0" - -# GPU 1: layers 15–29 -- match: - name: "^model\\.layers\\.(1[5-9]|2[0-9])\\." - replace: - class: "default" - kwargs: - generate_device: "cuda:1" - prefill_device: "cuda:1" - -# GPU 2: layers 30–44 -- match: - name: "^model\\.layers\\.(3[0-9]|4[0-4])\\." - replace: - class: "default" - kwargs: - generate_device: "cuda:2" - prefill_device: "cuda:2" - -- match: - name: "^lm_head" - class: torch.nn.Linear - replace: - class: ktransformers.operators.linear.KTransformersLinear - kwargs: - generate_device: "cuda:3" - prefill_device: "cuda:3" - generate_op: "KLinearTorch" - prefill_op: "KLinearTorch" - -# For final modules (model.norm), ensure they are on GPU 3 (as in your original config) -- match: - name: "(^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.)|(^model\\.norm)" - replace: - class: "default" - kwargs: - generate_device: "cuda:3" - prefill_device: "cuda:3" diff --git a/examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml b/examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml deleted file mode 100644 index e5b0979a1..000000000 --- a/examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml +++ /dev/null @@ -1,156 +0,0 @@ -- match: - name: "^model.embed_tokens" - replace: - class: "default" - kwargs: - generate_device: "cpu" - prefill_device: "cpu" - -- match: - name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\." - class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding - replace: - class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3 - kwargs: - generate_device: "cuda:0" - prefill_device: "cuda:0" -- match: - name: "^model\\.layers\\.([3456][0-9])\\." 
- class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding - replace: - class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3 - kwargs: - generate_device: "cuda:1" - prefill_device: "cuda:1" - -- match: - name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.(?!self_attn\\.kv_b_proj).*$" # regular expression - class: torch.nn.Linear # only match modules matching name and class simultaneously - replace: - class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types - kwargs: - generate_device: "cuda:0" - prefill_device: "cuda:0" - generate_op: "KLinearTorch" - prefill_op: "KLinearTorch" - -- match: - name: "^model\\.layers\\.([3456][0-9])\\.(?!self_attn\\.kv_b_proj).*$" # regular expression - class: torch.nn.Linear # only match modules matching name and class simultaneously - replace: - class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types - kwargs: - generate_device: "cuda:1" - prefill_device: "cuda:1" - generate_op: "KLinearTorch" - prefill_op: "KLinearTorch" - -- match: - name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp$" - class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE - replace: - class: ktransformers.operators.experts.KDeepseekV3MoE # mlp module with custom forward function - kwargs: - generate_device: "cuda:0" - prefill_device: "cuda:0" -- match: - name: "^model\\.layers\\.([3456][0-9])\\.mlp$" - class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE - replace: - class: ktransformers.operators.experts.KDeepseekV3MoE # mlp module with custom forward function - kwargs: - generate_device: "cuda:1" - prefill_device: "cuda:1" - -- match: - name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.gate$" - class: ktransformers.models.modeling_deepseek_v3.MoEGate - replace: - class: ktransformers.operators.gate.KMoEGate - kwargs: - generate_device: "cuda:0" - prefill_device: "cuda:0" -- match: - name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.gate$" - class: ktransformers.models.modeling_deepseek_v3.MoEGate - replace: - class: ktransformers.operators.gate.KMoEGate # mlp module with custom forward function - kwargs: - generate_device: "cuda:1" - prefill_device: "cuda:1" - -- match: - name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$" - replace: - class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism - kwargs: - prefill_device: "cuda:0" - prefill_op: "KExpertsTorch" - generate_device: "cpu" - generate_op: "KSFTExpertsCPU" - out_device: "cuda:0" - backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default) - recursive: False # don't recursively inject submodules of this module - -- match: - name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.experts$" - replace: - class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism - kwargs: - prefill_device: "cuda:1" - prefill_op: "KExpertsTorch" - generate_device: "cpu" - generate_op: "KSFTExpertsCPU" - out_device: "cuda:1" - backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default) - recursive: False # don't recursively inject submodules of this module - -- match: - name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.self_attn$" - replace: - class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation - kwargs: - generate_device: "cuda:0" - prefill_device: "cuda:0" -- match: - name: "^model\\.layers\\.([3456][0-9])\\.self_attn$" - replace: - class: ktransformers.operators.attention.KDeepseekV2Attention # 
optimized MLA implementation - kwargs: - generate_device: "cuda:1" - prefill_device: "cuda:1" -- match: - name: "^model$" - replace: - class: "ktransformers.operators.models.KDeepseekV2Model" - kwargs: - per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill - transfer_map: - 30: "cuda:1" - -- match: - name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\." - replace: - class: "default" - kwargs: - generate_device: "cuda:0" - prefill_device: "cuda:0" - -- match: - name: "^lm_head" - class: torch.nn.Linear - replace: - class: ktransformers.operators.linear.KTransformersLinear - kwargs: - generate_device: "cuda:1" - prefill_device: "cuda:1" - generate_op: "KLinearTorch" - prefill_op: "KLinearTorch" - -- match: - name: "(^model\\.layers\\.([3456][0-9])\\.)|(model.norm)" - replace: - class: "default" - kwargs: - generate_device: "cuda:1" - prefill_device: "cuda:1" diff --git a/examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx.yaml b/examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx.yaml deleted file mode 100644 index 57746f646..000000000 --- a/examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx.yaml +++ /dev/null @@ -1,77 +0,0 @@ -- match: - class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding - replace: - class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3 - kwargs: - generate_device: "cuda" - prefill_device: "cuda" - -- match: - name: "^lm_head$" # regular expression - class: torch.nn.Linear # only match modules matching name and class simultaneously - replace: - class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types - kwargs: - generate_device: "cuda" - prefill_device: "cuda" - generate_op: "KLinearTorch" - prefill_op: "KLinearTorch" - -- match: - name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression - class: torch.nn.Linear # only match modules matching name and class simultaneously - replace: - class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types - kwargs: - generate_device: "cuda" - prefill_device: "cuda" - generate_op: "KLinearTorch" - prefill_op: "KLinearTorch" -- match: - name: "^model\\.layers\\..*\\.mlp$" - class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE - replace: - class: ktransformers.operators.experts.KDeepseekV3MoE # mlp module with custom forward function - kwargs: - generate_device: "cuda" - prefill_device: "cuda" -- match: - class: ktransformers.models.modeling_deepseek_v3.MoEGate - replace: - class: ktransformers.operators.gate.KMoEGate - kwargs: - generate_device: "cuda:0" - prefill_device: "cuda:0" -- match: - name: "^model\\.layers\\..*\\.mlp\\.experts$" - replace: - class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism - kwargs: - prefill_device: "cuda" - prefill_op: "KExpertsTorch" - generate_device: "cpu" - generate_op: "KSFTExpertsCPU" - out_device: "cuda" - backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default) - recursive: False # don't recursively inject submodules of this module -- match: - name: "^model\\.layers\\..*\\.self_attn$" - replace: - class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation - kwargs: - generate_device: "cuda" - prefill_device: "cuda" - absorb_for_prefill: False # change this to True to enable long context(prefill may slower). 
-- match: - name: "^model$" - replace: - class: "ktransformers.operators.models.KDeepseekV2Model" - kwargs: - per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill -- match: - name: "^model.embed_tokens" - replace: - class: "default" - kwargs: - generate_device: "cpu" - prefill_device: "cpu" diff --git a/examples/ktransformers/kt_optimize_rules/Qwen3Moe-sft-amx.yaml b/examples/ktransformers/kt_optimize_rules/Qwen3Moe-sft-amx.yaml deleted file mode 100644 index 3fd78d13f..000000000 --- a/examples/ktransformers/kt_optimize_rules/Qwen3Moe-sft-amx.yaml +++ /dev/null @@ -1,80 +0,0 @@ -- match: - class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding - replace: - class: ktransformers.operators.RoPE.RotaryEmbedding - kwargs: - generate_device: "cuda" - prefill_device: "cuda" - -- match: - name: "^lm_head$" # regular expression - class: torch.nn.Linear # only match modules matching name and class simultaneously - replace: - class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types - kwargs: - generate_device: "cuda" - prefill_device: "cuda" - generate_op: "KLinearTorch" - prefill_op: "KLinearTorch" - -# - match: -# name: "^model\\.layers\\..*$" # regular expression -# class: torch.nn.Linear # only match modules matching name and class simultaneously -# replace: -# class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types -# kwargs: -# generate_device: "cuda" -# prefill_device: "cuda" -# generate_op: "KLinearTorch" -# prefill_op: "KLinearTorch" -- match: - name: "^model\\.layers\\.(?!.*mlp\\.shared_expert_gate).*$" # regular expression - class: torch.nn.Linear # only match modules matching name and class simultaneously - replace: - class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types - kwargs: - generate_device: "cuda" - prefill_device: "cuda" - generate_op: "KLinearTorch" - prefill_op: "KLinearTorch" -- match: - name: "^model\\.layers\\..*\\.mlp$" - replace: - class: ktransformers.operators.experts.KQwen3MoeSparseMoeBlock # mlp module with custom forward function - kwargs: - generate_device: "cuda" - prefill_device: "cuda" - -- match: - name: "^model\\.layers\\..*\\.mlp\\.experts$" - replace: - class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism - kwargs: - prefill_device: "cuda" - prefill_op: "KExpertsTorch" - generate_device: "cpu" - generate_op: "KSFTExpertsCPU" - out_device: "cuda" - backend: "AMXInt8" # or "AMXBF16" or "AMXInt8" - recursive: False # don't recursively inject submodules of this module -- match: - name: "^model\\.layers\\..*\\.self_attn$" - replace: - class: ktransformers.operators.attention.KQwen3MoeAttention # optimized MLA implementation - kwargs: - generate_device: "cuda" - prefill_device: "cuda" -- match: - name: "^model.embed_tokens" - replace: - class: "default" - kwargs: - generate_device: "cpu" - prefill_device: "cpu" - -- match: - name: "^model$" - replace: - class: "ktransformers.operators.models.KQwen3MoeModel" - kwargs: - per_layer_prefill_intput_threshold: 0 diff --git a/examples/ktransformers/train_lora/deepseek2_lora_sft_kt.yaml b/examples/ktransformers/train_lora/deepseek_v2_lora_sft_kt.yaml similarity index 69% rename from examples/ktransformers/train_lora/deepseek2_lora_sft_kt.yaml rename to examples/ktransformers/train_lora/deepseek_v2_lora_sft_kt.yaml index 0d4da3dd3..7fe3f60d7 100644 --- 
a/examples/ktransformers/train_lora/deepseek2_lora_sft_kt.yaml +++ b/examples/ktransformers/train_lora/deepseek_v2_lora_sft_kt.yaml @@ -19,7 +19,7 @@ preprocessing_num_workers: 16 dataloader_num_workers: 4 ### output -output_dir: saves/Kllama_deepseekV2 +output_dir: saves/KT_FT_deepseekV2 logging_steps: 10 save_steps: 500 plot_loss: true @@ -39,14 +39,7 @@ ddp_timeout: 180000000 resume_from_checkpoint: null ### ktransformers -use_kt: true # use KTransformers as LoRA sft backend -kt_optimize_rule: examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml -cpu_infer: 32 -chunk_size: 8192 - -### eval -# eval_dataset: alpaca_en_demo -# val_size: 0.1 -# per_device_eval_batch_size: 1 -# eval_strategy: steps -# eval_steps: 500 +use_kt: true +# Pair with fsdp2_kt_bf16.yaml for original BF16 checkpoints. +# For pre-converted expert weights, uncomment kt_weight_path and use fsdp2_kt_int8.yaml or fsdp2_kt_int4.yaml. +# kt_weight_path: /path/to/DeepSeek-V2-Lite-AMXINT8 diff --git a/examples/ktransformers/train_lora/deepseek3_lora_sft_kt.yaml b/examples/ktransformers/train_lora/deepseek_v3_lora_sft_kt.yaml similarity index 64% rename from examples/ktransformers/train_lora/deepseek3_lora_sft_kt.yaml rename to examples/ktransformers/train_lora/deepseek_v3_lora_sft_kt.yaml index 3ee95aaa5..c3282391b 100644 --- a/examples/ktransformers/train_lora/deepseek3_lora_sft_kt.yaml +++ b/examples/ktransformers/train_lora/deepseek_v3_lora_sft_kt.yaml @@ -1,5 +1,5 @@ ### model -model_name_or_path: opensourcerelease/DeepSeek-V3-bf16 +model_name_or_path: deepseek-ai/DeepSeek-V3-0324-BF16 # need to convert to BF16 checkpoint first trust_remote_code: true ### method @@ -19,7 +19,7 @@ preprocessing_num_workers: 16 dataloader_num_workers: 4 ### output -output_dir: saves/Kllama_deepseekV3 +output_dir: saves/KT_FT_deepseekV3 logging_steps: 10 save_steps: 500 plot_loss: true @@ -39,14 +39,7 @@ ddp_timeout: 180000000 resume_from_checkpoint: null ### ktransformers -use_kt: true # use KTransformers as LoRA sft backend -kt_optimize_rule: examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml -cpu_infer: 32 -chunk_size: 8192 - -### eval -# eval_dataset: alpaca_en_demo -# val_size: 0.1 -# per_device_eval_batch_size: 1 -# eval_strategy: steps -# eval_steps: 500 +use_kt: true +# Pair with fsdp2_kt_bf16.yaml for original BF16 checkpoints. +# For pre-converted expert weights, uncomment kt_weight_path and use fsdp2_kt_int8.yaml or fsdp2_kt_int4.yaml. 
+# kt_weight_path: /path/to/DeepSeek-V3-AMXINT8 diff --git a/examples/ktransformers/train_lora/qwen3_5moe_lora_sft_kt.yaml b/examples/ktransformers/train_lora/qwen3_5moe_lora_sft_kt.yaml new file mode 100644 index 000000000..40f75442c --- /dev/null +++ b/examples/ktransformers/train_lora/qwen3_5moe_lora_sft_kt.yaml @@ -0,0 +1,46 @@ +### model +model_name_or_path: Qwen/Qwen3.5-397B-A17B +trust_remote_code: true + +### method +stage: sft +do_train: true +finetuning_type: lora +lora_rank: 8 +lora_target: all + +### dataset +dataset: identity, alpaca_en_demo +template: qwen3_5 +cutoff_len: 2048 +max_samples: 100000 +overwrite_cache: true +preprocessing_num_workers: 16 +dataloader_num_workers: 4 + +### output +output_dir: saves/KT_FT_qwen35Moe +logging_steps: 10 +save_steps: 500 +plot_loss: true +overwrite_output_dir: true +save_only_model: false +report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow] + +### train +per_device_train_batch_size: 1 +gradient_accumulation_steps: 8 +learning_rate: 1.0e-4 +num_train_epochs: 3.0 +lr_scheduler_type: cosine +warmup_ratio: 0.1 +bf16: true +ddp_timeout: 180000000 +resume_from_checkpoint: null + +### ktransformers +use_kt: true +# For original BF16 checkpoints, start with examples/ktransformers/accelerate/fsdp2_kt_bf16.yaml. +# For pre-converted expert weights, uncomment kt_weight_path and use fsdp2_kt_int8.yaml or fsdp2_kt_int4.yaml. +# Pair the 397B path with fsdp2_kt_int8.yaml, tune cutoff_len to prepared weights and GPU memory. +# kt_weight_path: /path/to/Qwen3.5-MoE-AMXINT8 diff --git a/examples/ktransformers/train_lora/qwen3moe_lora_sft_kt.yaml b/examples/ktransformers/train_lora/qwen3moe_lora_sft_kt.yaml index ee4e6e95d..f0633e565 100644 --- a/examples/ktransformers/train_lora/qwen3moe_lora_sft_kt.yaml +++ b/examples/ktransformers/train_lora/qwen3moe_lora_sft_kt.yaml @@ -11,7 +11,7 @@ lora_target: all ### dataset dataset: identity, alpaca_en_demo -template: qwen3_nothink +template: qwen3 cutoff_len: 2048 max_samples: 100000 overwrite_cache: true @@ -19,9 +19,9 @@ preprocessing_num_workers: 16 dataloader_num_workers: 4 ### output -output_dir: saves/Kllama_Qwen3MoE_235bA22b +output_dir: saves/KT_FT_qwen3Moe logging_steps: 10 -save_steps: 200 +save_steps: 500 plot_loss: true overwrite_output_dir: true save_only_model: false @@ -31,7 +31,7 @@ report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow] per_device_train_batch_size: 1 gradient_accumulation_steps: 8 learning_rate: 1.0e-4 -num_train_epochs: 3 +num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 bf16: true @@ -39,14 +39,7 @@ ddp_timeout: 180000000 resume_from_checkpoint: null ### ktransformers -use_kt: true # use KTransformers as LoRA sft backend -kt_optimize_rule: examples/ktransformers/kt_optimize_rules/Qwen3Moe-sft-amx.yaml -cpu_infer: 32 -chunk_size: 8192 - -### eval -# eval_dataset: alpaca_en_demo -# val_size: 0.1 -# per_device_eval_batch_size: 1 -# eval_strategy: steps -# eval_steps: 500 +use_kt: true +# Pair with examples/ktransformers/accelerate/fsdp2_kt_bf16.yaml for original BF16 checkpoints. +# For pre-converted expert weights, uncomment kt_weight_path and use fsdp2_kt_int8.yaml or fsdp2_kt_int4.yaml. 
+# kt_weight_path: /path/to/Qwen3-235B-A22B-Instruct-2507-AMXINT8 diff --git a/requirements/ktransformers.txt b/requirements/ktransformers.txt new file mode 100644 index 000000000..7f67b35b6 --- /dev/null +++ b/requirements/ktransformers.txt @@ -0,0 +1 @@ +ktransformers[sft] diff --git a/src/llamafactory/chat/chat_model.py b/src/llamafactory/chat/chat_model.py index cb612f88d..9ffd8647c 100644 --- a/src/llamafactory/chat/chat_model.py +++ b/src/llamafactory/chat/chat_model.py @@ -71,16 +71,6 @@ class ChatModel: "SGLang not install, you may need to run `pip install sglang[all]`\n" "or try to use HuggingFace backend: --infer_backend huggingface" ) from e - elif model_args.infer_backend == EngineName.KT: - try: - from .kt_engine import KTransformersEngine - - self.engine: BaseEngine = KTransformersEngine(model_args, data_args, finetuning_args, generating_args) - except ImportError as e: - raise ImportError( - "KTransformers not install, you may need to run `pip install ktransformers`\n" - "or try to use HuggingFace backend: --infer_backend huggingface" - ) from e else: raise NotImplementedError(f"Unknown backend: {model_args.infer_backend}") diff --git a/src/llamafactory/chat/kt_engine.py b/src/llamafactory/chat/kt_engine.py deleted file mode 100644 index 3bf3f4bb2..000000000 --- a/src/llamafactory/chat/kt_engine.py +++ /dev/null @@ -1,284 +0,0 @@ -# Copyright 2025 the KVCache.AI team, Approaching AI, and the LlamaFactory team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import asyncio -import os -import platform -from collections.abc import AsyncGenerator -from threading import Thread -from typing import TYPE_CHECKING, Any, Optional - -import torch -from typing_extensions import override - -from ..data import get_template_and_fix_tokenizer -from ..extras import logging -from ..extras.constants import EngineName -from ..model import load_model, load_tokenizer -from .base_engine import BaseEngine, Response - - -if TYPE_CHECKING: - from transformers import PreTrainedTokenizer - from trl import PreTrainedModelWrapper - - from ..data.mm_plugin import AudioInput, ImageInput, VideoInput - from ..hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments - -from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled -from ktransformers.server.config.config import Config -from ktransformers.util.utils import ( - get_compute_capability, - prefill_and_generate_capture, -) -from ktransformers.util.vendors import GPUVendor, device_manager - - -logger = logging.get_logger(__name__) - - -class KTransformersEngine(BaseEngine): - def __init__( - self, - model_args: "ModelArguments", - data_args: "DataArguments", - finetuning_args: "FinetuningArguments", - generating_args: "GeneratingArguments", - ) -> None: - self.name = EngineName.KT - self.can_generate = finetuning_args.stage == "sft" - - tok_mod = load_tokenizer(model_args) - self.tokenizer = tok_mod["tokenizer"] - self.tokenizer.padding_side = "left" if self.can_generate else "right" - self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args) - - self.model = load_model( - self.tokenizer, model_args, finetuning_args, is_trainable=False, add_valuehead=(not self.can_generate) - ) - - self.generating_args = generating_args.to_dict() - self.max_new_tokens = model_args.kt_maxlen - self.use_cuda_graph = model_args.kt_use_cuda_graph - self.mode = model_args.kt_mode - self.force_think = model_args.kt_force_think - self.chunk_size = model_args.chunk_size - - try: - asyncio.get_event_loop() - except RuntimeError: - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - - self.semaphore = asyncio.Semaphore(int(os.getenv("MAX_CONCURRENT", "1"))) - - @staticmethod - @torch.inference_mode() - def _get_scores( - model: "PreTrainedModelWrapper", - tokenizer: "PreTrainedTokenizer", - batch_input: list[str], - input_kwargs: Optional[dict[str, Any]] = {}, - ) -> list[float]: - max_length: Optional[int] = input_kwargs.pop("max_length", None) - device = getattr(model.pretrained_model, "device", "cuda") - inputs = tokenizer( - batch_input, - padding=True, - truncation=True, - max_length=max_length or getattr(model.config, "max_position_embeddings", 1024), - return_tensors="pt", - add_special_tokens=False, - ).to(device) - values: torch.Tensor = model(**inputs, return_dict=True, use_cache=False)[-1] - scores = values.gather(dim=-1, index=(inputs["attention_mask"].sum(dim=-1, keepdim=True) - 1)) - return scores - - async def _generate( - self, - messages: list[dict[str, str]], - system: Optional[str] = None, - tools: Optional[str] = None, - **input_kwargs, - ) -> AsyncGenerator[str, None]: - paired = messages + [{"role": "assistant", "content": ""}] - prompt_ids, _ = self.template.encode_oneturn(self.tokenizer, paired, system, tools) - prompt_len = len(prompt_ids) - - max_length: Optional[int] = input_kwargs.pop("max_length", None) - max_new_tokens: Optional[int] = input_kwargs.pop("max_new_tokens", None) - - if "max_new_tokens" in self.generating_args: - max_tokens = 
int(self.generating_args["max_new_tokens"]) - elif "max_length" in self.generating_args: - gl = int(self.generating_args["max_length"]) - max_tokens = gl - prompt_len if gl > prompt_len else 1 - else: - max_tokens = self.max_new_tokens or 256 - - if max_length is not None: - max_tokens = max(max_length - prompt_len, 1) - if max_new_tokens is not None: - max_tokens = int(max_new_tokens) - max_tokens = max(1, int(max_tokens)) - - if self.mode == "long_context": - max_len_cfg = Config().long_context_config["max_seq_len"] - need = prompt_len + max_tokens - assert max_len_cfg > need, f"please set max_seq_len > {need} in ~/.ktransformers/config.yaml" - - device = next(self.model.parameters()).device - input_tensor = torch.tensor([prompt_ids], dtype=torch.long, device=device) - if self.force_think: - think = torch.tensor( - [self.tokenizer.encode("\n", add_special_tokens=False)], dtype=torch.long, device=device - ) - input_tensor = torch.cat([input_tensor, think], dim=1) - - use_flashinfer = ( - platform.system() != "Windows" - and getattr(self.model.config, "architectures", [""])[0] - in {"DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"} - and flashinfer_enabled - and get_compute_capability() >= 8 - and device_manager.gpu_vendor == GPUVendor.NVIDIA - ) - - def make_gen(): - if use_flashinfer: - return prefill_and_generate_capture( - self.model, - self.tokenizer, - input_tensor, - max_tokens, - self.use_cuda_graph, - mode=self.mode, - force_think=self.force_think, - chunk_size=self.chunk_size, - use_flashinfer_mla=True, - num_heads=self.model.config.num_attention_heads, - head_dim_ckv=getattr(self.model.config, "kv_lora_rank", 0), - head_dim_kpe=getattr(self.model.config, "qk_rope_head_dim", 0), - q_head_dim=getattr(self.model.config, "qk_rope_head_dim", 0) - + getattr(self.model.config, "qk_nope_head_dim", 0), - echo_stream=False, - ) - else: - return prefill_and_generate_capture( - self.model, - self.tokenizer, - input_tensor, - max_tokens, - self.use_cuda_graph, - mode=self.mode, - force_think=self.force_think, - chunk_size=self.chunk_size, - echo_stream=False, - ) - - loop = asyncio.get_running_loop() - q: asyncio.Queue[Optional[str]] = asyncio.Queue() - - def producer(): - try: - gen = make_gen() - if hasattr(gen, "__aiter__"): - - async def drain_async(): - async for t in gen: - loop.call_soon_threadsafe(q.put_nowait, t if isinstance(t, str) else str(t)) - - asyncio.run(drain_async()) - elif hasattr(gen, "__iter__"): - for t in gen: - loop.call_soon_threadsafe(q.put_nowait, t if isinstance(t, str) else str(t)) - else: - loop.call_soon_threadsafe(q.put_nowait, gen if isinstance(gen, str) else str(gen)) - finally: - loop.call_soon_threadsafe(q.put_nowait, None) - - Thread(target=producer, daemon=True).start() - - while True: - item = await q.get() - if item is None: - break - yield item - - @override - async def chat( - self, - messages: list[dict[str, str]], - system: Optional[str] = None, - tools: Optional[str] = None, - images: Optional[list["ImageInput"]] = None, - videos: Optional[list["VideoInput"]] = None, - audios: Optional[list["AudioInput"]] = None, - **input_kwargs, - ) -> list["Response"]: - if not self.can_generate: - raise ValueError("The current model does not support `chat`.") - async with self.semaphore: - produced = "" - final_text = "" - async for t in self._generate(messages, system, tools, **input_kwargs): - delta = t - produced = produced + delta - if delta: - final_text += delta - - prompt_ids, _ = self.template.encode_oneturn( - self.tokenizer, messages + [{"role": 
"assistant", "content": ""}], system, tools - ) - return [ - Response( - response_text=final_text, - response_length=len(self.tokenizer.encode(final_text, add_special_tokens=False)), - prompt_length=len(prompt_ids), - finish_reason="stop", - ) - ] - - @override - async def stream_chat( - self, - messages: list[dict[str, str]], - system: Optional[str] = None, - tools: Optional[str] = None, - images: Optional[list["ImageInput"]] = None, - videos: Optional[list["VideoInput"]] = None, - audios: Optional[list["AudioInput"]] = None, - **input_kwargs, - ) -> AsyncGenerator[str, None]: - if not self.can_generate: - raise ValueError("The current model does not support `stream_chat`.") - async with self.semaphore: - produced = "" - async for t in self._generate(messages, system, tools, **input_kwargs): - delta = t[len(produced) :] if t.startswith(produced) else t - produced = t - if delta: - yield delta - - @override - async def get_scores( - self, - batch_input: list[str], - **input_kwargs, - ) -> list[float]: - if self.can_generate: - raise ValueError("Cannot get scores using an auto-regressive model.") - args = (self.model, self.tokenizer, batch_input, input_kwargs) - async with self.semaphore: - return await asyncio.to_thread(self._get_scores, *args) diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index c90fcfc8b..684fd2b47 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -139,7 +139,6 @@ class EngineName(StrEnum): HF = "huggingface" VLLM = "vllm" SGLANG = "sglang" - KT = "ktransformers" class DownloadSource(StrEnum): diff --git a/src/llamafactory/extras/misc.py b/src/llamafactory/extras/misc.py index 17f70f53e..e128ca418 100644 --- a/src/llamafactory/extras/misc.py +++ b/src/llamafactory/extras/misc.py @@ -96,8 +96,8 @@ def check_dependencies() -> None: r"""Check the version of the required packages.""" check_version("transformers>=4.55.0,<=5.6.0") check_version("datasets>=2.16.0,<=4.0.0") - check_version("accelerate>=1.3.0,<=1.11.0") - check_version("peft>=0.18.0,<=0.18.1") + check_version("accelerate>=1.3.0,<=1.15.0") + check_version("peft>=0.18.0,<=0.20.0") check_version("trl>=0.18.0,<=0.24.0") diff --git a/src/llamafactory/extras/packages.py b/src/llamafactory/extras/packages.py index 853b9eacc..a228ac0fb 100644 --- a/src/llamafactory/extras/packages.py +++ b/src/llamafactory/extras/packages.py @@ -88,7 +88,7 @@ def is_ray_available(): def is_kt_available(): - return _is_package_available("ktransformers") + return _is_package_available("kt_kernel") def is_requests_available(): diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py index 2bfaa2734..4212a46df 100644 --- a/src/llamafactory/hparams/model_args.py +++ b/src/llamafactory/hparams/model_args.py @@ -16,6 +16,7 @@ # limitations under the License. import json +import os from dataclasses import asdict, dataclass, field, fields from typing import Any, Literal, Self @@ -460,47 +461,81 @@ class SGLangArguments: @dataclass class KTransformersArguments: - r"""Arguments pertaining to the KT training.""" + r"""Arguments pertaining to KTransformers AMX MoE SFT training. + + These fields are normalized into the transformers/accelerate KT config before training starts. 
+ """ use_kt: bool = field( default=False, - metadata={"help": "Whether To Use KTransformers Optimizations For LoRA Training."}, + metadata={"help": "Whether to use KTransformers AMX MoE backend for SFT training."}, ) - kt_optimize_rule: str | None = field( + kt_weight_path: str | None = field( default=None, - metadata={ - "help": "Path To The KTransformers Optimize Rule; See https://github.com/kvcache-ai/ktransformers/." - }, + metadata={"help": "Path to pre-quantized INT8 expert weights (.kt files)."}, ) - cpu_infer: int | None = field( - default=32, - metadata={"help": "Number Of CPU Cores Used For Computation."}, + kt_expert_checkpoint_path: str | None = field( + default=None, + metadata={"help": "Path to expert checkpoint (safetensors) for online conversion."}, ) - chunk_size: int | None = field( - default=8192, - metadata={"help": "Chunk Size Used For CPU Compute In KTransformers."}, + kt_use_lora_experts: bool | None = field( + default=None, + metadata={"help": "Whether to use GPU-side LoRA Experts."}, ) - mode: str | None = field( - default="normal", - metadata={"help": "Normal Or Long_Context For Llama Models."}, + kt_lora_expert_num: int | None = field( + default=None, + metadata={"help": "Number of GPU-side LoRA Experts."}, + ) + kt_lora_expert_intermediate_size: int | None = field( + default=None, + metadata={"help": "Intermediate size for GPU-side LoRA Experts."}, ) - kt_maxlen: int = field( - default=4096, - metadata={"help": "Maximum Sequence (Prompt + Response) Length Of The KT Engine."}, - ) - kt_use_cuda_graph: bool = field( - default=True, - metadata={"help": "Whether To Use CUDA Graphs For The KT Engine."}, - ) - kt_mode: str = field( - default="normal", - metadata={"help": "Normal Or Long_Context Mode For The KT Engine."}, - ) - kt_force_think: bool = field( - default=False, - metadata={"help": "Force-Think Toggle For The KT Engine."}, - ) + def get_kt_config_dict(self, finetuning_args: Any, model_max_length: int | None) -> dict[str, Any]: + r"""Build KT config values from LLaMA-Factory model and LoRA arguments.""" + kt_config = { + "kt_lora_rank": getattr(finetuning_args, "lora_rank", None), + "kt_lora_alpha": getattr(finetuning_args, "lora_alpha", None), + "kt_weight_path": self.kt_weight_path, + "kt_expert_checkpoint_path": self.kt_expert_checkpoint_path, + "kt_model_max_length": model_max_length, + "kt_use_lora_experts": self.kt_use_lora_experts, + "kt_lora_expert_num": self.kt_lora_expert_num, + "kt_lora_expert_intermediate_size": self.kt_lora_expert_intermediate_size, + } + return {key: value for key, value in kt_config.items() if value is not None} + + def apply_kt_config(self, finetuning_args: Any, training_args: Any, model_max_length: int | None) -> None: + r"""Apply LLaMA-Factory KT args to transformers/accelerate KT integration points.""" + if not self.use_kt: + return + + kt_config = self.get_kt_config_dict(finetuning_args, model_max_length) + env_mapping = { + "kt_weight_path": "ACCELERATE_KT_WEIGHT_PATH", + "kt_expert_checkpoint_path": "ACCELERATE_KT_EXPERT_CHECKPOINT_PATH", + "kt_model_max_length": "ACCELERATE_KT_MODEL_MAX_LENGTH", + "kt_lora_rank": "ACCELERATE_KT_LORA_RANK", + "kt_lora_alpha": "ACCELERATE_KT_LORA_ALPHA", + "kt_use_lora_experts": "ACCELERATE_KT_USE_LORA_EXPERTS", + "kt_lora_expert_num": "ACCELERATE_KT_LORA_EXPERT_NUM", + "kt_lora_expert_intermediate_size": "ACCELERATE_KT_LORA_EXPERT_INTERMEDIATE_SIZE", + } + for key, env_key in env_mapping.items(): + value = kt_config.get(key) + if value is not None: + os.environ[env_key] = str(value) + 
+ hf_kt = getattr(training_args, "hf_kt_config", None) + if hf_kt is None or not hasattr(hf_kt, "_kt_config") or not isinstance(hf_kt._kt_config, dict): + return + + hf_kt._kt_config.update(kt_config) + gc_enabled = getattr(training_args, "gradient_checkpointing", False) or not getattr( + self, "disable_gradient_checkpointing", True + ) + if gc_enabled: + hf_kt._kt_config.setdefault("kt_share_cache_pool", True) @dataclass diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index 4da66b5f3..f8bfe4868 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -186,13 +186,16 @@ def _verify_model_args( raise ValueError("Quantized model only accepts a single adapter. Merge them first.") + def _check_extra_dependencies( model_args: "ModelArguments", finetuning_args: "FinetuningArguments", training_args: Optional["TrainingArguments"] = None, ) -> None: if model_args.use_kt: - check_version("ktransformers", mandatory=True) + check_version("kt-kernel", mandatory=True) + check_version("transformers-kt", mandatory=True) + check_version("accelerate-kt", mandatory=True) if model_args.use_unsloth: check_version("unsloth", mandatory=True) @@ -510,6 +513,9 @@ def get_train_args(args: dict[str, Any] | list[str] | None = None) -> _TRAIN_CLS ) transformers.set_seed(training_args.seed) + if model_args.use_kt: + model_args.apply_kt_config(finetuning_args, training_args, model_args.model_max_length) + return model_args, data_args, training_args, finetuning_args, generating_args diff --git a/src/llamafactory/model/adapter.py b/src/llamafactory/model/adapter.py index a5850703d..70efb6acb 100644 --- a/src/llamafactory/model/adapter.py +++ b/src/llamafactory/model/adapter.py @@ -21,7 +21,6 @@ from transformers.integrations import is_deepspeed_zero3_enabled from ..extras import logging from ..extras.constants import EngineName -from .model_utils.ktransformers import get_kt_peft_model, load_kt_peft_model from .model_utils.misc import find_all_linear_modules, find_expanded_modules from .model_utils.quantization import QuantizationMethod from .model_utils.unsloth import get_unsloth_peft_model, load_unsloth_peft_model @@ -188,12 +187,6 @@ def _setup_lora_tuning( "token": model_args.hf_hub_token, } - if model_args.use_kt: - if model_args.infer_backend != EngineName.KT: - raise ValueError( - "We should use ktransformers as backend to infer the adapter fine-tuned by ktransformers." 
- ) - for adapter in adapter_to_merge: model: LoraModel = PeftModel.from_pretrained(model, adapter, **init_kwargs) model = model.merge_and_unload() @@ -202,9 +195,7 @@ def _setup_lora_tuning( logger.info_rank0(f"Merged {len(adapter_to_merge)} adapter(s).") if adapter_to_resume is not None: # resume lora training - if model_args.use_kt: - model = load_kt_peft_model(model_args, model) - elif model_args.use_unsloth: + if model_args.use_unsloth: model = load_unsloth_peft_model(config, model_args, finetuning_args, is_trainable=is_trainable) else: model = PeftModel.from_pretrained(model, adapter_to_resume, is_trainable=is_trainable, **init_kwargs) @@ -217,16 +208,6 @@ def _setup_lora_tuning( else: target_modules = finetuning_args.lora_target - if model_args.use_kt: - new_list = [] - for m in target_modules: - if m in ("down_proj", "up_proj", "gate_proj"): - new_list.extend([f"mlp.{m}", f"shared_experts.{m}"]) - elif m not in ("generate_linear", "orig_module", "prefill_linear"): - new_list.append(m) - - target_modules[:] = new_list - if finetuning_args.use_llama_pro: target_modules = find_expanded_modules(model, target_modules, finetuning_args.freeze_trainable_layers) @@ -270,19 +251,11 @@ def _setup_lora_tuning( } if model_args.use_kt: - if finetuning_args.finetuning_type == "oft": - raise ValueError("KTransformers is currently not supported for OFT.") - if finetuning_args.finetuning_type == "lora": - peft_config = LoraConfig( - task_type=TaskType.CAUSAL_LM, - inference_mode=False, - **peft_kwargs, - ) - else: - raise ValueError("KTransformers is currently only supported for LoRA.") + if finetuning_args.finetuning_type != "lora": + raise ValueError("KTransformers only supports LoRA finetuning.") - model = get_kt_peft_model(model, peft_config) - print(f"KT_model:{model}") + peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, **peft_kwargs) + model = get_peft_model(model, peft_config) elif model_args.use_unsloth: if finetuning_args.finetuning_type == "oft": raise ValueError("Unsloth is currently not supported for OFT.") diff --git a/src/llamafactory/model/loader.py b/src/llamafactory/model/loader.py index d838d2176..7a209ee11 100644 --- a/src/llamafactory/model/loader.py +++ b/src/llamafactory/model/loader.py @@ -31,7 +31,6 @@ from ..extras import logging from ..extras.misc import count_parameters, skip_check_imports, try_download_model_from_other_hub from ..extras.packages import is_torch_version_greater_than from .adapter import init_adapter -from .model_utils.ktransformers import load_kt_pretrained_model from .model_utils.liger_kernel import apply_liger_kernel from .model_utils.misc import register_autoclass from .model_utils.mod import convert_pretrained_model_to_mod, load_mod_pretrained_model @@ -144,12 +143,7 @@ def load_model( model = None lazy_load = False - if model_args.use_kt: - from ktransformers.sft.monkey_patch_torch_module import install_patch - - install_patch() - model = load_kt_pretrained_model(config, model_args) - elif model_args.use_unsloth: + if model_args.use_unsloth: if model_args.adapter_name_or_path is not None: lazy_load = True elif is_trainable: diff --git a/src/llamafactory/model/model_utils/ktransformers.py b/src/llamafactory/model/model_utils/ktransformers.py deleted file mode 100644 index 26c413cab..000000000 --- a/src/llamafactory/model/model_utils/ktransformers.py +++ /dev/null @@ -1,154 +0,0 @@ -# Copyright 2025 the KVCache.AI team, Approaching AI, and the LlamaFactory team. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import importlib.util as _u -from typing import TYPE_CHECKING, Any - -import torch - -from ...extras import logging -from ...extras.misc import get_current_device - - -if TYPE_CHECKING: - from ...hparams import FinetuningArguments, ModelArguments - -from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, PreTrainedModel - - -KT_AVAILABLE = _u.find_spec("ktransformers") is not None -if KT_AVAILABLE: - from ktransformers.models.modeling_deepseek import DeepseekV2ForCausalLM - from ktransformers.models.modeling_deepseek_v3 import DeepseekV3ForCausalLM - from ktransformers.models.modeling_llama import LlamaForCausalLM - from ktransformers.models.modeling_mixtral import MixtralForCausalLM - from ktransformers.models.modeling_qwen2_moe import Qwen2MoeForCausalLM - from ktransformers.models.modeling_qwen3_moe import Qwen3MoeForCausalLM - from ktransformers.optimize.optimize import optimize_and_load_gguf - from ktransformers.server.config.config import Config - from ktransformers.sft.lora import inject_lora_layer - from ktransformers.util.custom_loader import GGUFLoader, SafeTensorLoader - from ktransformers.util.globals import GLOBAL_CONFIG - from ktransformers.util.utils import load_weights - -logger = logging.get_logger(__name__) - - -def _get_kt_kwargs( - config: "PretrainedConfig", - model_name_or_path: str, - model_args: "ModelArguments", - finetuning_args: "FinetuningArguments", -) -> dict[str, Any]: - return { - "model_name": model_name_or_path, - "max_seq_length": model_args.model_max_length or 4096, - "dtype": model_args.compute_dtype, - "load_in_4bit": model_args.quantization_bit == 4, - "token": model_args.hf_hub_token, - "full_finetuning": finetuning_args.finetuning_type == "full", - "device_map": {"": get_current_device()}, - "rope_scaling": getattr(config, "rope_scaling", None), - "fix_tokenizer": False, - "trust_remote_code": model_args.trust_remote_code, - "use_gradient_checkpointing": "ktransformers", - } - - -def load_kt_pretrained_model(config: "PretrainedConfig", model_args: "ModelArguments") -> "PreTrainedModel": - r"""Optionally load pretrained model with KTransformers. 
Used in training.""" - custom_models = { - "DeepseekV2ForCausalLM": DeepseekV2ForCausalLM, - "DeepseekV3ForCausalLM": DeepseekV3ForCausalLM, - "Qwen2MoeForCausalLM": Qwen2MoeForCausalLM, - "Qwen3MoeForCausalLM": Qwen3MoeForCausalLM, - "LlamaForCausalLM": LlamaForCausalLM, - "MixtralForCausalLM": MixtralForCausalLM, - } - Config().cpu_infer = model_args.cpu_infer - Config().chunk_size = model_args.chunk_size - config = AutoConfig.from_pretrained(model_args.model_name_or_path, trust_remote_code=model_args.trust_remote_code) - - if model_args.mode == "long_context": - assert config.architectures[0] == "LlamaForCausalLM", "only LlamaForCausalLM support long_context mode" - torch.set_default_dtype(torch.float16) - else: - torch.set_default_dtype(config.torch_dtype) - - with torch.device("meta"): - if config.architectures[0] in custom_models: - print("using custom modeling_xxx.py.") - if "Qwen2Moe" in config.architectures[0]: # Qwen2Moe must use flash_attention_2 to avoid overflow. - config._attn_implementation = "flash_attention_2" - if "Llama" in config.architectures[0]: - config._attn_implementation = "eager" - if "Mixtral" in config.architectures[0]: - config._attn_implementation = "flash_attention_2" - model = custom_models[config.architectures[0]](config) - else: - attn_implementation = "flash_attention_2" - model = AutoModelForCausalLM.from_config( - config, trust_remote_code=True, attn_implementation=attn_implementation - ) - - optimize_config_path = model_args.kt_optimize_rule - gguf_path = model_args.model_name_or_path - - assert optimize_config_path is not None, "optimize_config_path must be provided (path to YAML rules file)." - assert gguf_path is not None, "gguf_path must be provided (path to a folder or .gguf file)." - - GLOBAL_CONFIG._config["mod"] = "infer" - optimize_and_load_gguf(model, optimize_config_path, gguf_path, config) - - return model - - -def get_kt_peft_model(model: "PreTrainedModel", peft_kwargs: dict[str, Any]) -> "PreTrainedModel": - r"""Get the peft model for the pretrained model with KTransformers. Used in training.""" - from ktransformers.sft.peft_utils.mapping import get_peft_model - - return get_peft_model(model, peft_kwargs) - - -def load_kt_peft_model(model_args: "ModelArguments", model: "PreTrainedModel") -> "PreTrainedModel": - r"""Load peft model with KTransformers. 
Used in both training and inference.""" - load_adapter_name_or_path = model_args.adapter_name_or_path[0] - if load_adapter_name_or_path.endswith(".gguf"): - inject_lora_layer(model, load_adapter_name_or_path) - adapter_gguf_loader = GGUFLoader(load_adapter_name_or_path) - load_weights(model, adapter_gguf_loader, adapter_gguf=True) - model.train() - else: - inject_lora_layer(model, load_adapter_name_or_path) - - adapter_loader = SafeTensorLoader(load_adapter_name_or_path) - device = next(model.parameters()).device - for key in adapter_loader.tensor_file_map.keys(): - try: - tensor = adapter_loader.load_tensor(key, device=device) - - model_key = key.replace("base_model.model.", "") - model_key = model_key.replace(".weight", ".default.weight") - model_key = model_key.replace(".default.default.weight", ".default.weight") - - param = model.get_parameter(model_key) - param.data.copy_(tensor.data) - - print(f"Loaded adapter weight: {key} -> {model_key}") - except AttributeError: - print(f"Skipping {key}: not a model parameter") - except KeyError: - print(f"Key not found in model: {model_key} (original: {key})") - - return model diff --git a/src/llamafactory/train/dpo/ktrainer.py b/src/llamafactory/train/dpo/ktrainer.py deleted file mode 100644 index 0da2c6851..000000000 --- a/src/llamafactory/train/dpo/ktrainer.py +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright 2025 HuggingFace Inc. and the LlamaFactory team. -# -# This code is inspired by the HuggingFace's TRL library. -# https://github.com/huggingface/trl/blob/v0.8.0/trl/trainer/dpo_trainer.py -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import TYPE_CHECKING - -import torch -from ktransformers.sft.lora import KTrainer # type: ignore -from typing_extensions import override - -from ..trainer_utils import get_batch_logps, nested_detach -from .trainer import CustomDPOTrainer - - -if TYPE_CHECKING: - from transformers import PreTrainedModel - - -class KDPOTrainer(KTrainer, CustomDPOTrainer): - @override - def concatenated_forward( - self, model: "PreTrainedModel", batch: dict[str, "torch.Tensor"], is_ref_model: bool = False - ) -> tuple["torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor"]: - r"""Compute the sum log probabilities of the labels under given logits if loss_type is not IPO, ORPO or SimPO. - - Otherwise the average log probabilities. 
- """ - if self.finetuning_args.use_ref_model: - batch = nested_detach(batch, clone=True) # avoid error - - labels = batch.pop("labels") # dpo do not need compute loss in forward - all_logits: torch.Tensor = model(**batch, return_dict=True, use_cache=False).logits.to(torch.float32) - all_logits = all_logits.to("cpu") - labels = labels.to(all_logits.device) - all_logps, valid_length = get_batch_logps( - logits=all_logits, labels=labels, ld_alpha=(self.ld_alpha if not is_ref_model else None) - ) - if self.loss_type in ["ipo", "orpo", "simpo"]: - all_logps = all_logps / valid_length - - batch_size = batch["input_ids"].size(0) // 2 - chosen_logps, rejected_logps = all_logps.split(batch_size, dim=0) - chosen_logits, rejected_logits = all_logits.split(batch_size, dim=0) - chosen_length, _ = valid_length.split(batch_size, dim=0) - - if self.loss_type in ["ipo", "orpo", "simpo"]: - return chosen_logps, rejected_logps, chosen_logits, rejected_logits, chosen_logps - else: - return chosen_logps, rejected_logps, chosen_logits, rejected_logits, chosen_logps / chosen_length diff --git a/src/llamafactory/train/dpo/workflow.py b/src/llamafactory/train/dpo/workflow.py index 83ad38dfa..209449365 100644 --- a/src/llamafactory/train/dpo/workflow.py +++ b/src/llamafactory/train/dpo/workflow.py @@ -62,15 +62,7 @@ def run_dpo( else: ref_model = None - if model_args.use_kt: - from ktransformers.util.globals import GLOBAL_CONFIG # type: ignore - - from .ktrainer import KDPOTrainer as CustomDPOTrainer - - GLOBAL_CONFIG._config["mod"] = "sft" - - else: - from .trainer import CustomDPOTrainer + from .trainer import CustomDPOTrainer # Initialize our Trainer trainer = CustomDPOTrainer( diff --git a/src/llamafactory/train/sft/workflow.py b/src/llamafactory/train/sft/workflow.py index 3561ecb30..b50f53ffd 100644 --- a/src/llamafactory/train/sft/workflow.py +++ b/src/llamafactory/train/sft/workflow.py @@ -103,37 +103,18 @@ def run_sft( gen_kwargs["pad_token_id"] = tokenizer.pad_token_id # Initialize our Trainer - if model_args.use_kt: - from ktransformers.sft.lora import KTrainer # type: ignore - from ktransformers.util.globals import GLOBAL_CONFIG # type: ignore - - GLOBAL_CONFIG._config["mod"] = "sft" - - trainer = KTrainer( - model=model, - args=training_args, - tokenizer=tokenizer_module, - data_collator=data_collator, - callbacks=callbacks, - **dataset_module, - **metric_module, - ) - trainer.model_accepts_loss_kwargs = False - model.config.use_cache = False - - else: - trainer = CustomSeq2SeqTrainer( - model=model, - args=training_args, - finetuning_args=finetuning_args, - data_collator=data_collator, - callbacks=callbacks, - gen_kwargs=gen_kwargs, - ref_model=ref_model, - **dataset_module, - **tokenizer_module, - **metric_module, - ) + trainer = CustomSeq2SeqTrainer( + model=model, + args=training_args, + finetuning_args=finetuning_args, + data_collator=data_collator, + callbacks=callbacks, + gen_kwargs=gen_kwargs, + ref_model=ref_model, + **dataset_module, + **tokenizer_module, + **metric_module, + ) # Training if training_args.do_train: diff --git a/src/llamafactory/train/trainer_utils.py b/src/llamafactory/train/trainer_utils.py index 0d89e8a31..a0e898e39 100644 --- a/src/llamafactory/train/trainer_utils.py +++ b/src/llamafactory/train/trainer_utils.py @@ -103,7 +103,7 @@ def create_modelcard_and_push( kwargs["tags"] = kwargs["tags"] + ["unsloth"] if model_args.use_kt: - kwargs["tags"] = kwargs["tags"] + ["ktransformers"] + kwargs["tags"] = kwargs["tags"] + ["kt-kernel"] if not training_args.do_train: pass