---
# Launch configuration: FSDP distributed settings plus a `kt_config` section.
# One key per line in block style, so every setting is parsed as its own
# mapping entry and the inline comments apply only to the key they follow.

compute_environment: LOCAL_MACHINE
distributed_type: FSDP

# FSDP-specific options.
fsdp_config:
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_cpu_ram_efficient_loading: true
  fsdp_offload_params: false
  fsdp_reshard_after_forward: true
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_version: 2

mixed_precision: bf16
num_machines: 1
num_processes: 4  # Adjust based on your GPU count; 4 is suitable for 4 GPUs
rdzv_backend: static
same_network: true
use_cpu: false

# KT backend options.
kt_config:
  enabled: true
  kt_backend: AMXBF16  # Use with original BF16 expert weights.
  kt_num_threads: 96
  kt_tp_enabled: true
  kt_threadpool_count: 2
  kt_max_cache_depth: 2
  kt_share_backward_bb: true
  # NOTE(review): assumed to belong to kt_config (it is the last key of the
  # section) -- confirm against the consumer's schema.
  lora_rank: 8