[v1] support resume training from checkpoint (#10280)

Co-authored-by: frozenleaves <frozen@Mac.local>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
浮梦
2026-04-20 20:28:08 +08:00
committed by GitHub
parent c5aecaf31d
commit c4bbac49b2
9 changed files with 577 additions and 10 deletions

View File

@@ -85,6 +85,28 @@ class TrainingArguments:
default=42,
metadata={"help": "Random seed that will be set at the beginning of training."},
)
resume_from_checkpoint: str | None = field(
default=None,
metadata={"help": "Path to a checkpoint directory to resume training from, or 'auto' to find the latest."},
)
save_steps: int | None = field(
default=None,
metadata={"help": "Save a training checkpoint every N global steps."},
)
save_epochs: float | None = field(
default=None,
metadata={"help": "Save a training checkpoint every N epochs."},
)
# Selects the on-disk format of intermediate checkpoints. Off by default, i.e. the
# distributed format is used; the help text warns that the HuggingFace format
# doubles memory usage.
save_ckpt_as_hf: bool = field(
    metadata={
        # Adjacent literals are joined at compile time; the resulting help string
        # is identical to the single-literal form.
        "help": (
            "Save intermediate checkpoints in HuggingFace format instead of distributed format. "
            "Warning: doubles memory usage."
        )
    },
    default=False,
)
save_total_limit: int | None = field(
default=None,
metadata={"help": "Maximum number of checkpoints to keep. Oldest checkpoints are deleted."},
)
logging_steps: int = field(
default=1,
metadata={"help": "Log metrics every N optimizer steps."},