---
# Launch configuration for single-machine FSDP training in bf16.
# NOTE(review): key names match a Hugging Face Accelerate config file
# (as consumed by `accelerate launch --config_file ...`) - confirm.

compute_environment: LOCAL_MACHINE
debug: false
distributed_type: FSDP
# Quoted on purpose: a bare `no` would be parsed as a boolean by YAML 1.1
# loaders, while this value is written as the string 'no'.
downcast_bf16: 'no'

# Options specific to the FSDP backend selected by `distributed_type`.
fsdp_config:
  fsdp_version: 2
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  # Comma-separated layer class names, with no spaces between entries.
  fsdp_transformer_layer_cls_to_wrap: Qwen3_5DecoderLayer,Qwen3_5VisionBlock
  fsdp_cpu_ram_efficient_loading: true
  fsdp_offload_params: false
  fsdp_reshard_after_forward: true
  fsdp_state_dict_type: FULL_STATE_DICT

machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
# Change to match your NPU count (e.g., 8 for A2, 16 for A3)
num_processes: 8
rdzv_backend: static
same_network: true
use_cpu: false