From 6e58115f98ad9aa2fccd9a90a960c58bee724bf0 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 5 Mar 2025 23:32:54 +0800
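Subject: [PATCH] [trainer] update config (#7174)

- examples: add `dataloader_num_workers: 4`, `save_only_model: false`
  and `resume_from_checkpoint: null` to the train_full/train_lora YAML
  configs; add a commented-out `eval_dataset` entry to the ds3 and ray
  examples and `ray_storage_path: ./saves` to the ray example.
- mm_plugin: rename `Qwen2vlPlugin` to `Qwen2VLPlugin` (class and
  `PLUGINS` entry) and move the token-counter initialisation and
  `deepcopy(messages)` to the top of the method.
- tuner: at the end of `_training_function`, destroy the
  torch.distributed process group if it is initialized, logging a
  warning if that fails.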

Former-commit-id: b4b89b4ff3bc03aa388569e253d62580755a77a5
---
 examples/train_full/llama3_full_sft.yaml      |  3 +++
 examples/train_full/qwen2vl_full_sft.yaml     |  3 +++
 examples/train_lora/llama3_lora_dpo.yaml      |  3 +++
 examples/train_lora/llama3_lora_pretrain.yaml |  3 +++
 examples/train_lora/llama3_lora_reward.yaml   |  3 +++
 examples/train_lora/llama3_lora_sft.yaml      |  3 +++
 examples/train_lora/llama3_lora_sft_ds3.yaml  |  4 ++++
 examples/train_lora/llama3_lora_sft_ray.yaml  |  5 +++++
 examples/train_lora/llava1_5_lora_sft.yaml    |  3 +++
 examples/train_lora/qwen2vl_lora_dpo.yaml     |  3 +++
 examples/train_lora/qwen2vl_lora_sft.yaml     |  3 +++
 src/llamafactory/data/mm_plugin.py            | 13 ++++++-------
 src/llamafactory/train/tuner.py               |  7 +++++++
 13 files changed, 49 insertions(+), 7 deletions(-)

diff --git a/examples/train_full/llama3_full_sft.yaml b/examples/train_full/llama3_full_sft.yaml
index fa914838..19d6df42 100644
--- a/examples/train_full/llama3_full_sft.yaml
+++ b/examples/train_full/llama3_full_sft.yaml
@@ -15,6 +15,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4
 
 ### output
 output_dir: saves/llama3-8b/full/sft
@@ -22,6 +23,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
 
 ### train
 per_device_train_batch_size: 1
@@ -32,6 +34,7 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null
 
 ### eval
 # eval_dataset: alpaca_en_demo
diff --git a/examples/train_full/qwen2vl_full_sft.yaml b/examples/train_full/qwen2vl_full_sft.yaml
index bdf28fe9..559bca48 100644
--- a/examples/train_full/qwen2vl_full_sft.yaml
+++ b/examples/train_full/qwen2vl_full_sft.yaml
@@ -20,6 +20,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4
 
 ### output
 output_dir: saves/qwen2_vl-7b/full/sft
@@ -27,6 +28,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
 
 ### train
 per_device_train_batch_size: 1
@@ -37,6 +39,7 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null
 
 ### eval
 # val_size: 0.1
diff --git a/examples/train_lora/llama3_lora_dpo.yaml b/examples/train_lora/llama3_lora_dpo.yaml
index 102b79ab..1b890ab2 100644
--- a/examples/train_lora/llama3_lora_dpo.yaml
+++ b/examples/train_lora/llama3_lora_dpo.yaml
@@ -18,6 +18,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4
 
 ### output
 output_dir: saves/llama3-8b/lora/dpo
@@ -25,6 +26,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
 
 ### train
 per_device_train_batch_size: 1
@@ -35,6 +37,7 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null
 
 ### eval
 # eval_dataset: dpo_en_demo
diff --git a/examples/train_lora/llama3_lora_pretrain.yaml b/examples/train_lora/llama3_lora_pretrain.yaml
index aa0b5df8..82e0d58a 100644
--- a/examples/train_lora/llama3_lora_pretrain.yaml
+++ b/examples/train_lora/llama3_lora_pretrain.yaml
@@ -15,6 +15,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4
 
 ### output
 output_dir: saves/llama3-8b/lora/pretrain
@@ -22,6 +23,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
 
 ### train
 per_device_train_batch_size: 1
@@ -32,6 +34,7 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null
 
 ### eval
 # eval_dataset: c4_demo
diff --git a/examples/train_lora/llama3_lora_reward.yaml b/examples/train_lora/llama3_lora_reward.yaml
index c27029bf..e71a99b8 100644
--- a/examples/train_lora/llama3_lora_reward.yaml
+++ b/examples/train_lora/llama3_lora_reward.yaml
@@ -16,6 +16,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4
 
 ### output
 output_dir: saves/llama3-8b/lora/reward
@@ -23,6 +24,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
 
 ### train
 per_device_train_batch_size: 1
@@ -33,6 +35,7 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null
 
 ### eval
 # eval_dataset: dpo_en_demo
diff --git a/examples/train_lora/llama3_lora_sft.yaml b/examples/train_lora/llama3_lora_sft.yaml
index 6a4bb636..fe889208 100644
--- a/examples/train_lora/llama3_lora_sft.yaml
+++ b/examples/train_lora/llama3_lora_sft.yaml
@@ -16,6 +16,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4
 
 ### output
 output_dir: saves/llama3-8b/lora/sft
@@ -23,6 +24,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
 
 ### train
 per_device_train_batch_size: 1
@@ -33,6 +35,7 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null
 
 ### eval
 # eval_dataset: alpaca_en_demo
diff --git a/examples/train_lora/llama3_lora_sft_ds3.yaml b/examples/train_lora/llama3_lora_sft_ds3.yaml
index 7bf91ec3..b35f5466 100644
--- a/examples/train_lora/llama3_lora_sft_ds3.yaml
+++ b/examples/train_lora/llama3_lora_sft_ds3.yaml
@@ -17,6 +17,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4
 
 ### output
 output_dir: saves/llama3-8b/lora/sft
@@ -24,6 +25,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
 
 ### train
 per_device_train_batch_size: 1
@@ -34,8 +36,10 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null
 
 ### eval
+# eval_dataset: alpaca_en_demo
 # val_size: 0.1
 # per_device_eval_batch_size: 1
 # eval_strategy: steps
diff --git a/examples/train_lora/llama3_lora_sft_ray.yaml b/examples/train_lora/llama3_lora_sft_ray.yaml
index 58856a16..d30e986b 100644
--- a/examples/train_lora/llama3_lora_sft_ray.yaml
+++ b/examples/train_lora/llama3_lora_sft_ray.yaml
@@ -17,6 +17,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4
 
 ### output
 output_dir: tmp_dir
@@ -24,9 +25,11 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
 
 ### ray
 ray_run_name: llama3_8b_sft_lora
+ray_storage_path: ./saves
 ray_num_workers: 4  # number of GPUs to use
 resources_per_worker:
   GPU: 1
@@ -41,8 +44,10 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null
 
 ### eval
+# eval_dataset: alpaca_en_demo
 # val_size: 0.1
 # per_device_eval_batch_size: 1
 # eval_strategy: steps
diff --git a/examples/train_lora/llava1_5_lora_sft.yaml b/examples/train_lora/llava1_5_lora_sft.yaml
index 24d09d91..116c2a42 100644
--- a/examples/train_lora/llava1_5_lora_sft.yaml
+++ b/examples/train_lora/llava1_5_lora_sft.yaml
@@ -16,6 +16,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4
 
 ### output
 output_dir: saves/llava1_5-7b/lora/sft
@@ -23,6 +24,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
 
 ### train
 per_device_train_batch_size: 1
@@ -33,6 +35,7 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null
 
 ### eval
 # val_size: 0.1
diff --git a/examples/train_lora/qwen2vl_lora_dpo.yaml b/examples/train_lora/qwen2vl_lora_dpo.yaml
index 6fed819e..148c4ec2 100644
--- a/examples/train_lora/qwen2vl_lora_dpo.yaml
+++ b/examples/train_lora/qwen2vl_lora_dpo.yaml
@@ -20,6 +20,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4
 
 ### output
 output_dir: saves/qwen2_vl-7b/lora/dpo
@@ -27,6 +28,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
 
 ### train
 per_device_train_batch_size: 1
@@ -37,6 +39,7 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null
 
 ### eval
 # val_size: 0.1
diff --git a/examples/train_lora/qwen2vl_lora_sft.yaml b/examples/train_lora/qwen2vl_lora_sft.yaml
index e2c11520..c57b78e4 100644
--- a/examples/train_lora/qwen2vl_lora_sft.yaml
+++ b/examples/train_lora/qwen2vl_lora_sft.yaml
@@ -18,6 +18,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4
 
 ### output
 output_dir: saves/qwen2_vl-7b/lora/sft
@@ -25,6 +26,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
 
 ### train
 per_device_train_batch_size: 1
@@ -35,6 +37,7 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null
 
 ### eval
 # val_size: 0.1
diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py
index 5597d73b..e074d021 100644
--- a/src/llamafactory/data/mm_plugin.py
+++ b/src/llamafactory/data/mm_plugin.py
@@ -521,9 +521,7 @@ class MiniCPMVPlugin(BasePlugin):
         processor: Optional["ProcessorMixin"],
     ) -> List[Dict[str, str]]:
         self._validate_input(processor, images, videos, audios)
-        num_image_tokens = 0
-        num_video_tokens = 0
-        num_audio_tokens = 0
+        num_image_tokens, num_video_tokens, num_audio_tokens = 0, 0, 0
         messages = deepcopy(messages)
         image_processor: "BaseImageProcessor" = getattr(processor, "image_processor")
         mm_inputs = {}
@@ -1038,7 +1036,7 @@ class Qwen2AudioPlugin(BasePlugin):
 
 
 @dataclass
-class Qwen2vlPlugin(BasePlugin):
+class Qwen2VLPlugin(BasePlugin):
     @override
     def _preprocess_image(self, image: "ImageObject", **kwargs) -> "ImageObject":
         image = super()._preprocess_image(image, **kwargs)
@@ -1124,7 +1122,10 @@ class Qwen2vlPlugin(BasePlugin):
         processor: Optional["ProcessorMixin"],
     ) -> List[Dict[str, str]]:
         self._validate_input(processor, images, videos, audios)
+        num_image_tokens, num_video_tokens = 0, 0
+        messages = deepcopy(messages)
         image_processor: "BaseImageProcessor" = getattr(processor, "image_processor")
+
         merge_length: int = getattr(image_processor, "merge_size") ** 2
         if self.expand_mm_tokens:
             mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
@@ -1134,8 +1135,6 @@ class Qwen2vlPlugin(BasePlugin):
             image_grid_thw = [None] * len(images)
             video_grid_thw = [None] * len(videos)
 
-        num_image_tokens, num_video_tokens = 0, 0
-        messages = deepcopy(messages)
         for message in messages:
             content = message["content"]
             while IMAGE_PLACEHOLDER in content:
@@ -1273,7 +1272,7 @@ PLUGINS = {
     "paligemma": PaliGemmaPlugin,
     "pixtral": PixtralPlugin,
     "qwen2_audio": Qwen2AudioPlugin,
-    "qwen2_vl": Qwen2vlPlugin,
+    "qwen2_vl": Qwen2VLPlugin,
     "video_llava": VideoLlavaPlugin,
 }
 
diff --git a/src/llamafactory/train/tuner.py b/src/llamafactory/train/tuner.py
index 73800694..767d0cda 100644
--- a/src/llamafactory/train/tuner.py
+++ b/src/llamafactory/train/tuner.py
@@ -17,6 +17,7 @@ import shutil
 from typing import TYPE_CHECKING, Any, Dict, List, Optional
 
 import torch
+import torch.distributed as dist
 from transformers import PreTrainedModel
 
 from ..data import get_template_and_fix_tokenizer
@@ -76,6 +77,12 @@ def _training_function(config: Dict[str, Any]) -> None:
     else:
         raise ValueError(f"Unknown task: {finetuning_args.stage}.")
 
+    try:
+        if dist.is_initialized():
+            dist.destroy_process_group()
+    except Exception as e:
+        logger.warning(f"Failed to destroy process group: {e}.")
+
 
 def run_exp(args: Optional[Dict[str, Any]] = None, callbacks: Optional[List["TrainerCallback"]] = None) -> None:
     args = read_args(args)