[refactor] Add KTransformers AMX MoE SFT support via Accelerate (#10430)

Co-authored-by: mrhaoxx <mr.haoxx@gmail.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-06-21 06:38:54 +08:00 · 2026-05-01 01:47:58 +08:00
parent 6b08b948c9
commit 887ee2b121
39 changed files with 287 additions and 1968 deletions
--- a/examples/ktransformers/accelerate/fsdp2_kt_bf16.yaml
+++ b/examples/ktransformers/accelerate/fsdp2_kt_bf16.yaml
@@ -0,0 +1,25 @@
+compute_environment: LOCAL_MACHINE
+distributed_type: FSDP
+fsdp_config:
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_offload_params: false
+  fsdp_reshard_after_forward: true
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_version: 2  # Matches the fsdp2_ prefix in this file's name
+mixed_precision: bf16
+num_machines: 1
+num_processes: 4  # Adjust based on your GPU count; 4 is suitable for 4 GPUs
+rdzv_backend: static
+same_network: true
+use_cpu: false
+
+kt_config:  # NOTE(review): presumably needs an Accelerate build that accepts this key; confirm
+  enabled: true
+  kt_backend: AMXBF16  # Use with original BF16 expert weights
+  kt_num_threads: 96  # NOTE(review): presumably CPU threads; tune to the host core count
+  kt_tp_enabled: true
+  kt_threadpool_count: 2  # NOTE(review): presumably one pool per NUMA node; confirm
+  kt_max_cache_depth: 2
+  kt_share_backward_bb: true
+  lora_rank: 8  # NOTE(review): presumably must match the LoRA rank used for training; confirm
--- a/examples/ktransformers/accelerate/fsdp2_kt_int4.yaml
+++ b/examples/ktransformers/accelerate/fsdp2_kt_int4.yaml
@@ -0,0 +1,25 @@
+compute_environment: LOCAL_MACHINE
+distributed_type: FSDP
+fsdp_config:
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_offload_params: false
+  fsdp_reshard_after_forward: true
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_version: 2  # Matches the fsdp2_ prefix in this file's name
+mixed_precision: bf16
+num_machines: 1
+num_processes: 4  # Adjust based on your GPU count; 4 is suitable for 4 GPUs
+rdzv_backend: static
+same_network: true
+use_cpu: false
+
+kt_config:  # NOTE(review): presumably needs an Accelerate build that accepts this key; confirm
+  enabled: true
+  kt_backend: AMXINT4  # Use with online-converted INT4 expert weights
+  kt_num_threads: 96  # NOTE(review): presumably CPU threads; tune to the host core count
+  kt_tp_enabled: true
+  kt_threadpool_count: 2  # NOTE(review): presumably one pool per NUMA node; confirm
+  kt_max_cache_depth: 2
+  kt_share_backward_bb: true
+  lora_rank: 8  # NOTE(review): presumably must match the LoRA rank used for training; confirm
--- a/examples/ktransformers/accelerate/fsdp2_kt_int8.yaml
+++ b/examples/ktransformers/accelerate/fsdp2_kt_int8.yaml
@@ -0,0 +1,25 @@
+compute_environment: LOCAL_MACHINE
+distributed_type: FSDP
+fsdp_config:
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_offload_params: false
+  fsdp_reshard_after_forward: true
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_version: 2  # Matches the fsdp2_ prefix in this file's name
+mixed_precision: bf16
+num_machines: 1
+num_processes: 4  # Adjust based on your GPU count; 4 is suitable for 4 GPUs
+rdzv_backend: static
+same_network: true
+use_cpu: false
+
+kt_config:  # NOTE(review): presumably needs an Accelerate build that accepts this key; confirm
+  enabled: true
+  kt_backend: AMXINT8  # Use with online-converted INT8 expert weights
+  kt_num_threads: 96  # NOTE(review): presumably CPU threads; tune to the host core count
+  kt_tp_enabled: true
+  kt_threadpool_count: 2  # NOTE(review): presumably one pool per NUMA node; confirm
+  kt_max_cache_depth: 2
+  kt_share_backward_bb: true
+  lora_rank: 8  # NOTE(review): presumably must match the LoRA rank used for training; confirm
--- a/examples/ktransformers/accelerate/fsdp2_kt_int8_1gpu.yaml
+++ b/examples/ktransformers/accelerate/fsdp2_kt_int8_1gpu.yaml
@@ -0,0 +1,25 @@
+compute_environment: LOCAL_MACHINE
+distributed_type: FSDP
+fsdp_config:
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_offload_params: false
+  fsdp_reshard_after_forward: true
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_version: 2  # Matches the fsdp2_ prefix in this file's name
+mixed_precision: bf16
+num_machines: 1
+num_processes: 1  # Adjust based on your GPU count; 1 is suitable for 1 GPU
+rdzv_backend: static
+same_network: true
+use_cpu: false
+
+kt_config:  # NOTE(review): presumably needs an Accelerate build that accepts this key; confirm
+  enabled: true
+  kt_backend: AMXINT8  # Use with online-converted INT8 expert weights
+  kt_num_threads: 96  # NOTE(review): presumably CPU threads; tune to the host core count
+  kt_tp_enabled: true
+  kt_threadpool_count: 2  # NOTE(review): presumably one pool per NUMA node; confirm
+  kt_max_cache_depth: 2
+  kt_share_backward_bb: true
+  lora_rank: 8  # NOTE(review): presumably must match the LoRA rank used for training; confirm
--- a/examples/ktransformers/accelerate/fsdp2_kt_int8_8gpu.yaml
+++ b/examples/ktransformers/accelerate/fsdp2_kt_int8_8gpu.yaml
@@ -0,0 +1,25 @@
+compute_environment: LOCAL_MACHINE
+distributed_type: FSDP
+fsdp_config:
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_offload_params: false
+  fsdp_reshard_after_forward: true
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_version: 2  # Matches the fsdp2_ prefix in this file's name
+mixed_precision: bf16
+num_machines: 1
+num_processes: 8  # Adjust based on your GPU count; 8 is suitable for 8 GPUs
+rdzv_backend: static
+same_network: true
+use_cpu: false
+
+kt_config:  # NOTE(review): presumably needs an Accelerate build that accepts this key; confirm
+  enabled: true
+  kt_backend: AMXINT8  # Use with online-converted INT8 expert weights
+  kt_num_threads: 96  # NOTE(review): presumably CPU threads; tune to the host core count
+  kt_tp_enabled: true
+  kt_threadpool_count: 2  # NOTE(review): presumably one pool per NUMA node; confirm
+  kt_max_cache_depth: 2
+  kt_share_backward_bb: true
+  lora_rank: 8  # NOTE(review): presumably must match the LoRA rank used for training; confirm