#!/bin/bash
#
# Launch full-parameter SFT on a single node with multiple GPUs.
#
# Starts `src/train.py` through `python -m torch.distributed.run`, passing the
# config file `examples/full_multi_gpu/llama3_full_sft.yaml`.
#
# Both paths are relative, so the script must be invoked from the directory
# that contains `src/` and `examples/` (presumably the repository root --
# confirm against the project layout).

set -euo pipefail

# Number of worker processes to start on this node. Keep this in sync with the
# number of device ids listed in CUDA_VISIBLE_DEVICES below.
NPROC_PER_NODE=4

# CUDA_VISIBLE_DEVICES is set only for this one command, so it does not leak
# into the rest of the calling environment.
CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
  --nproc_per_node "${NPROC_PER_NODE}" \
  --nnodes 1 \
  --standalone \
  src/train.py examples/full_multi_gpu/llama3_full_sft.yaml