From 6e58115f98ad9aa2fccd9a90a960c58bee724bf0 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Wed, 5 Mar 2025 23:32:54 +0800 Subject: [PATCH] [trainer] update config (#7174) Former-commit-id: b4b89b4ff3bc03aa388569e253d62580755a77a5 --- examples/train_full/llama3_full_sft.yaml | 3 +++ examples/train_full/qwen2vl_full_sft.yaml | 3 +++ examples/train_lora/llama3_lora_dpo.yaml | 3 +++ examples/train_lora/llama3_lora_pretrain.yaml | 3 +++ examples/train_lora/llama3_lora_reward.yaml | 3 +++ examples/train_lora/llama3_lora_sft.yaml | 3 +++ examples/train_lora/llama3_lora_sft_ds3.yaml | 4 ++++ examples/train_lora/llama3_lora_sft_ray.yaml | 5 +++++ examples/train_lora/llava1_5_lora_sft.yaml | 3 +++ examples/train_lora/qwen2vl_lora_dpo.yaml | 3 +++ examples/train_lora/qwen2vl_lora_sft.yaml | 3 +++ src/llamafactory/data/mm_plugin.py | 13 ++++++------- src/llamafactory/train/tuner.py | 7 +++++++ 13 files changed, 49 insertions(+), 7 deletions(-) diff --git a/examples/train_full/llama3_full_sft.yaml b/examples/train_full/llama3_full_sft.yaml index fa914838..19d6df42 100644 --- a/examples/train_full/llama3_full_sft.yaml +++ b/examples/train_full/llama3_full_sft.yaml @@ -15,6 +15,7 @@ cutoff_len: 2048 max_samples: 1000 overwrite_cache: true preprocessing_num_workers: 16 +dataloader_num_workers: 4 ### output output_dir: saves/llama3-8b/full/sft @@ -22,6 +23,7 @@ logging_steps: 10 save_steps: 500 plot_loss: true overwrite_output_dir: true +save_only_model: false ### train per_device_train_batch_size: 1 @@ -32,6 +34,7 @@ lr_scheduler_type: cosine warmup_ratio: 0.1 bf16: true ddp_timeout: 180000000 +resume_from_checkpoint: null ### eval # eval_dataset: alpaca_en_demo diff --git a/examples/train_full/qwen2vl_full_sft.yaml b/examples/train_full/qwen2vl_full_sft.yaml index bdf28fe9..559bca48 100644 --- a/examples/train_full/qwen2vl_full_sft.yaml +++ b/examples/train_full/qwen2vl_full_sft.yaml @@ -20,6 +20,7 @@ cutoff_len: 2048 max_samples: 1000 overwrite_cache: true preprocessing_num_workers: 16 +dataloader_num_workers: 4 ### output output_dir: saves/qwen2_vl-7b/full/sft @@ -27,6 +28,7 @@ logging_steps: 10 save_steps: 500 plot_loss: true overwrite_output_dir: true +save_only_model: false ### train per_device_train_batch_size: 1 @@ -37,6 +39,7 @@ lr_scheduler_type: cosine warmup_ratio: 0.1 bf16: true ddp_timeout: 180000000 +resume_from_checkpoint: null ### eval # val_size: 0.1 diff --git a/examples/train_lora/llama3_lora_dpo.yaml b/examples/train_lora/llama3_lora_dpo.yaml index 102b79ab..1b890ab2 100644 --- a/examples/train_lora/llama3_lora_dpo.yaml +++ b/examples/train_lora/llama3_lora_dpo.yaml @@ -18,6 +18,7 @@ cutoff_len: 2048 max_samples: 1000 overwrite_cache: true preprocessing_num_workers: 16 +dataloader_num_workers: 4 ### output output_dir: saves/llama3-8b/lora/dpo @@ -25,6 +26,7 @@ logging_steps: 10 save_steps: 500 plot_loss: true overwrite_output_dir: true +save_only_model: false ### train per_device_train_batch_size: 1 @@ -35,6 +37,7 @@ lr_scheduler_type: cosine warmup_ratio: 0.1 bf16: true ddp_timeout: 180000000 +resume_from_checkpoint: null ### eval # eval_dataset: dpo_en_demo diff --git a/examples/train_lora/llama3_lora_pretrain.yaml b/examples/train_lora/llama3_lora_pretrain.yaml index aa0b5df8..82e0d58a 100644 --- a/examples/train_lora/llama3_lora_pretrain.yaml +++ b/examples/train_lora/llama3_lora_pretrain.yaml @@ -15,6 +15,7 @@ cutoff_len: 2048 max_samples: 1000 overwrite_cache: true preprocessing_num_workers: 16 +dataloader_num_workers: 4 ### output output_dir: saves/llama3-8b/lora/pretrain @@ -22,6 +23,7 @@ logging_steps: 10 save_steps: 500 plot_loss: true overwrite_output_dir: true +save_only_model: false ### train per_device_train_batch_size: 1 @@ -32,6 +34,7 @@ lr_scheduler_type: cosine warmup_ratio: 0.1 bf16: true ddp_timeout: 180000000 +resume_from_checkpoint: null ### eval # eval_dataset: c4_demo diff --git a/examples/train_lora/llama3_lora_reward.yaml b/examples/train_lora/llama3_lora_reward.yaml index c27029bf..e71a99b8 100644 --- a/examples/train_lora/llama3_lora_reward.yaml +++ b/examples/train_lora/llama3_lora_reward.yaml @@ -16,6 +16,7 @@ cutoff_len: 2048 max_samples: 1000 overwrite_cache: true preprocessing_num_workers: 16 +dataloader_num_workers: 4 ### output output_dir: saves/llama3-8b/lora/reward @@ -23,6 +24,7 @@ logging_steps: 10 save_steps: 500 plot_loss: true overwrite_output_dir: true +save_only_model: false ### train per_device_train_batch_size: 1 @@ -33,6 +35,7 @@ lr_scheduler_type: cosine warmup_ratio: 0.1 bf16: true ddp_timeout: 180000000 +resume_from_checkpoint: null ### eval # eval_dataset: dpo_en_demo diff --git a/examples/train_lora/llama3_lora_sft.yaml b/examples/train_lora/llama3_lora_sft.yaml index 6a4bb636..fe889208 100644 --- a/examples/train_lora/llama3_lora_sft.yaml +++ b/examples/train_lora/llama3_lora_sft.yaml @@ -16,6 +16,7 @@ cutoff_len: 2048 max_samples: 1000 overwrite_cache: true preprocessing_num_workers: 16 +dataloader_num_workers: 4 ### output output_dir: saves/llama3-8b/lora/sft @@ -23,6 +24,7 @@ logging_steps: 10 save_steps: 500 plot_loss: true overwrite_output_dir: true +save_only_model: false ### train per_device_train_batch_size: 1 @@ -33,6 +35,7 @@ lr_scheduler_type: cosine warmup_ratio: 0.1 bf16: true ddp_timeout: 180000000 +resume_from_checkpoint: null ### eval # eval_dataset: alpaca_en_demo diff --git a/examples/train_lora/llama3_lora_sft_ds3.yaml b/examples/train_lora/llama3_lora_sft_ds3.yaml index 7bf91ec3..b35f5466 100644 --- a/examples/train_lora/llama3_lora_sft_ds3.yaml +++ b/examples/train_lora/llama3_lora_sft_ds3.yaml @@ -17,6 +17,7 @@ cutoff_len: 2048 max_samples: 1000 overwrite_cache: true preprocessing_num_workers: 16 +dataloader_num_workers: 4 ### output output_dir: saves/llama3-8b/lora/sft @@ -24,6 +25,7 @@ logging_steps: 10 save_steps: 500 plot_loss: true overwrite_output_dir: true +save_only_model: false ### train per_device_train_batch_size: 1 @@ -34,8 +36,10 @@ lr_scheduler_type: cosine warmup_ratio: 0.1 bf16: true ddp_timeout: 180000000 +resume_from_checkpoint: null ### eval +# eval_dataset: alpaca_en_demo # val_size: 0.1 # per_device_eval_batch_size: 1 # eval_strategy: steps diff --git a/examples/train_lora/llama3_lora_sft_ray.yaml b/examples/train_lora/llama3_lora_sft_ray.yaml index 58856a16..d30e986b 100644 --- a/examples/train_lora/llama3_lora_sft_ray.yaml +++ b/examples/train_lora/llama3_lora_sft_ray.yaml @@ -17,6 +17,7 @@ cutoff_len: 2048 max_samples: 1000 overwrite_cache: true preprocessing_num_workers: 16 +dataloader_num_workers: 4 ### output output_dir: tmp_dir @@ -24,9 +25,11 @@ logging_steps: 10 save_steps: 500 plot_loss: true overwrite_output_dir: true +save_only_model: false ### ray ray_run_name: llama3_8b_sft_lora +ray_storage_path: ./saves ray_num_workers: 4 # number of GPUs to use resources_per_worker: GPU: 1 @@ -41,8 +44,10 @@ lr_scheduler_type: cosine warmup_ratio: 0.1 bf16: true ddp_timeout: 180000000 +resume_from_checkpoint: null ### eval +# eval_dataset: alpaca_en_demo # val_size: 0.1 # per_device_eval_batch_size: 1 # eval_strategy: steps diff --git a/examples/train_lora/llava1_5_lora_sft.yaml b/examples/train_lora/llava1_5_lora_sft.yaml index 24d09d91..116c2a42 100644 --- a/examples/train_lora/llava1_5_lora_sft.yaml +++ b/examples/train_lora/llava1_5_lora_sft.yaml @@ -16,6 +16,7 @@ cutoff_len: 2048 max_samples: 1000 overwrite_cache: true preprocessing_num_workers: 16 +dataloader_num_workers: 4 ### output output_dir: saves/llava1_5-7b/lora/sft @@ -23,6 +24,7 @@ logging_steps: 10 save_steps: 500 plot_loss: true overwrite_output_dir: true +save_only_model: false ### train per_device_train_batch_size: 1 @@ -33,6 +35,7 @@ lr_scheduler_type: cosine warmup_ratio: 0.1 bf16: true ddp_timeout: 180000000 +resume_from_checkpoint: null ### eval # val_size: 0.1 diff --git a/examples/train_lora/qwen2vl_lora_dpo.yaml b/examples/train_lora/qwen2vl_lora_dpo.yaml index 6fed819e..148c4ec2 100644 --- a/examples/train_lora/qwen2vl_lora_dpo.yaml +++ b/examples/train_lora/qwen2vl_lora_dpo.yaml @@ -20,6 +20,7 @@ cutoff_len: 2048 max_samples: 1000 overwrite_cache: true preprocessing_num_workers: 16 +dataloader_num_workers: 4 ### output output_dir: saves/qwen2_vl-7b/lora/dpo @@ -27,6 +28,7 @@ logging_steps: 10 save_steps: 500 plot_loss: true overwrite_output_dir: true +save_only_model: false ### train per_device_train_batch_size: 1 @@ -37,6 +39,7 @@ lr_scheduler_type: cosine warmup_ratio: 0.1 bf16: true ddp_timeout: 180000000 +resume_from_checkpoint: null ### eval # val_size: 0.1 diff --git a/examples/train_lora/qwen2vl_lora_sft.yaml b/examples/train_lora/qwen2vl_lora_sft.yaml index e2c11520..c57b78e4 100644 --- a/examples/train_lora/qwen2vl_lora_sft.yaml +++ b/examples/train_lora/qwen2vl_lora_sft.yaml @@ -18,6 +18,7 @@ cutoff_len: 2048 max_samples: 1000 overwrite_cache: true preprocessing_num_workers: 16 +dataloader_num_workers: 4 ### output output_dir: saves/qwen2_vl-7b/lora/sft @@ -25,6 +26,7 @@ logging_steps: 10 save_steps: 500 plot_loss: true overwrite_output_dir: true +save_only_model: false ### train per_device_train_batch_size: 1 @@ -35,6 +37,7 @@ lr_scheduler_type: cosine warmup_ratio: 0.1 bf16: true ddp_timeout: 180000000 +resume_from_checkpoint: null ### eval # val_size: 0.1 diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index 5597d73b..e074d021 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -521,9 +521,7 @@ class MiniCPMVPlugin(BasePlugin): processor: Optional["ProcessorMixin"], ) -> List[Dict[str, str]]: self._validate_input(processor, images, videos, audios) - num_image_tokens = 0 - num_video_tokens = 0 - num_audio_tokens = 0 + num_image_tokens, num_video_tokens, num_audio_tokens = 0, 0, 0 messages = deepcopy(messages) image_processor: "BaseImageProcessor" = getattr(processor, "image_processor") mm_inputs = {} @@ -1038,7 +1036,7 @@ class Qwen2AudioPlugin(BasePlugin): @dataclass -class Qwen2vlPlugin(BasePlugin): +class Qwen2VLPlugin(BasePlugin): @override def _preprocess_image(self, image: "ImageObject", **kwargs) -> "ImageObject": image = super()._preprocess_image(image, **kwargs) @@ -1124,7 +1122,10 @@ class Qwen2vlPlugin(BasePlugin): processor: Optional["ProcessorMixin"], ) -> List[Dict[str, str]]: self._validate_input(processor, images, videos, audios) + num_image_tokens, num_video_tokens = 0, 0 + messages = deepcopy(messages) image_processor: "BaseImageProcessor" = getattr(processor, "image_processor") + merge_length: int = getattr(image_processor, "merge_size") ** 2 if self.expand_mm_tokens: mm_inputs = self._get_mm_inputs(images, videos, audios, processor) @@ -1134,8 +1135,6 @@ class Qwen2vlPlugin(BasePlugin): image_grid_thw = [None] * len(images) video_grid_thw = [None] * len(videos) - num_image_tokens, num_video_tokens = 0, 0 - messages = deepcopy(messages) for message in messages: content = message["content"] while IMAGE_PLACEHOLDER in content: @@ -1273,7 +1272,7 @@ PLUGINS = { "paligemma": PaliGemmaPlugin, "pixtral": PixtralPlugin, "qwen2_audio": Qwen2AudioPlugin, - "qwen2_vl": Qwen2vlPlugin, + "qwen2_vl": Qwen2VLPlugin, "video_llava": VideoLlavaPlugin, } diff --git a/src/llamafactory/train/tuner.py b/src/llamafactory/train/tuner.py index 73800694..767d0cda 100644 --- a/src/llamafactory/train/tuner.py +++ b/src/llamafactory/train/tuner.py @@ -17,6 +17,7 @@ import shutil from typing import TYPE_CHECKING, Any, Dict, List, Optional import torch +import torch.distributed as dist from transformers import PreTrainedModel from ..data import get_template_and_fix_tokenizer @@ -76,6 +77,12 @@ def _training_function(config: Dict[str, Any]) -> None: else: raise ValueError(f"Unknown task: {finetuning_args.stage}.") + try: + if dist.is_initialized(): + dist.destroy_process_group() + except Exception as e: + logger.warning(f"Failed to destroy process group: {e}.") + def run_exp(args: Optional[Dict[str, Any]] = None, callbacks: Optional[List["TrainerCallback"]] = None) -> None: args = read_args(args)