[v1] support resume training from checkpoint (#10280)

Co-authored-by: frozenleaves <frozen@Mac.local>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-04-27 18:29:08 +08:00 · 2026-04-20 20:28:08 +08:00
parent c5aecaf31d
commit c4bbac49b2
9 changed files with 577 additions and 10 deletions
--- a/src/llamafactory/v1/config/training_args.py
+++ b/src/llamafactory/v1/config/training_args.py
@@ -85,6 +85,28 @@ class TrainingArguments:
        default=42,
        metadata={"help": "Random seed that will be set at the beginning of training."},
    )
+    resume_from_checkpoint: str | None = field(
+        default=None,
+        metadata={"help": "Path to a checkpoint directory to resume training from, or 'auto' to find the latest."},
+    )
+    save_steps: int | None = field(
+        default=None,
+        metadata={"help": "Save a training checkpoint every N global steps."},
+    )
+    save_epochs: float | None = field(
+        default=None,
+        metadata={"help": "Save a training checkpoint every N epochs."},
+    )
+    save_ckpt_as_hf: bool = field(
+        default=False,
+        metadata={
+            "help": "Save intermediate checkpoints in HuggingFace format instead of distributed format. Warning: doubles memory usage."
+        },
+    )
+    save_total_limit: int | None = field(
+        default=None,
+        metadata={"help": "Maximum number of checkpoints to keep. Oldest checkpoints are deleted."},
+    )
    logging_steps: int = field(
        default=1,
        metadata={"help": "Log metrics every N optimizer steps."},