[trainer] update config (#7174)

Former-commit-id: b4b89b4ff3bc03aa388569e253d62580755a77a5
hoshi-hiyouga authored 2025-03-05 23:32:54 +08:00, committed by GitHub
parent 8dddffa340
commit 6e58115f98
13 changed files with 49 additions and 7 deletions

View File

@@ -15,6 +15,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4

 ### output
 output_dir: saves/llama3-8b/full/sft
@@ -22,6 +23,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false

 ### train
 per_device_train_batch_size: 1
@@ -32,6 +34,7 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null

 ### eval
 # eval_dataset: alpaca_en_demo
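The three keys added across these example configs (dataloader_num_workers, save_only_model, resume_from_checkpoint) correspond to Hugging Face trainer options of the same names. A minimal sketch of that mapping, assuming transformers >= 4.36 for save_only_model; model, dataset, and trainer construction are omitted:

# Sketch only: which Hugging Face training arguments the new YAML keys feed into.
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="saves/llama3-8b/full/sft",
    per_device_train_batch_size=1,
    bf16=True,
    dataloader_num_workers=4,  # subprocesses used to load data batches
    save_only_model=False,     # also keep optimizer/scheduler state in checkpoints
)

# resume_from_checkpoint is forwarded to Trainer.train(); `null` in the YAML means
# training starts from scratch instead of resuming from a saved checkpoint:
# trainer.train(resume_from_checkpoint=None)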

View File

@@ -20,6 +20,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4

 ### output
 output_dir: saves/qwen2_vl-7b/full/sft
@@ -27,6 +28,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false

 ### train
 per_device_train_batch_size: 1
@@ -37,6 +39,7 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null

 ### eval
 # val_size: 0.1

View File

@@ -18,6 +18,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4

 ### output
 output_dir: saves/llama3-8b/lora/dpo
@@ -25,6 +26,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false

 ### train
 per_device_train_batch_size: 1
@@ -35,6 +37,7 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null

 ### eval
 # eval_dataset: dpo_en_demo

View File

@@ -15,6 +15,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4

 ### output
 output_dir: saves/llama3-8b/lora/pretrain
@@ -22,6 +23,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false

 ### train
 per_device_train_batch_size: 1
@@ -32,6 +34,7 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null

 ### eval
 # eval_dataset: c4_demo

View File

@@ -16,6 +16,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4

 ### output
 output_dir: saves/llama3-8b/lora/reward
@@ -23,6 +24,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false

 ### train
 per_device_train_batch_size: 1
@@ -33,6 +35,7 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null

 ### eval
 # eval_dataset: dpo_en_demo

View File

@@ -16,6 +16,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4

 ### output
 output_dir: saves/llama3-8b/lora/sft
@@ -23,6 +24,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false

 ### train
 per_device_train_batch_size: 1
@@ -33,6 +35,7 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null

 ### eval
 # eval_dataset: alpaca_en_demo

View File

@@ -17,6 +17,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4

 ### output
 output_dir: saves/llama3-8b/lora/sft
@@ -24,6 +25,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false

 ### train
 per_device_train_batch_size: 1
@@ -34,8 +36,10 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null

 ### eval
+# eval_dataset: alpaca_en_demo
 # val_size: 0.1
 # per_device_eval_batch_size: 1
 # eval_strategy: steps

View File

@@ -17,6 +17,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4

 ### output
 output_dir: tmp_dir
@@ -24,9 +25,11 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false

 ### ray
 ray_run_name: llama3_8b_sft_lora
+ray_storage_path: ./saves
 ray_num_workers: 4 # number of GPUs to use
 resources_per_worker:
   GPU: 1
@@ -41,8 +44,10 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null

 ### eval
+# eval_dataset: alpaca_en_demo
 # val_size: 0.1
 # per_device_eval_batch_size: 1
 # eval_strategy: steps
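For reference, the ray_* keys above typically map onto Ray Train's scaling and run configuration. A hedged sketch of that mapping (the project's actual wiring is not part of this diff, and train_loop is a placeholder):

# Sketch only: how ray_run_name, ray_storage_path, ray_num_workers and
# resources_per_worker commonly map onto Ray Train objects.
import os

from ray.train import RunConfig, ScalingConfig
from ray.train.torch import TorchTrainer

def train_loop(config):
    pass  # placeholder for the per-worker training function

trainer = TorchTrainer(
    train_loop,
    scaling_config=ScalingConfig(
        num_workers=4,                    # ray_num_workers
        use_gpu=True,
        resources_per_worker={"GPU": 1},  # resources_per_worker
    ),
    run_config=RunConfig(
        name="llama3_8b_sft_lora",                # ray_run_name
        storage_path=os.path.abspath("./saves"),  # ray_storage_path (Ray expects an absolute path)
    ),
)
# trainer.fit()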

View File

@@ -16,6 +16,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4

 ### output
 output_dir: saves/llava1_5-7b/lora/sft
@@ -23,6 +24,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false

 ### train
 per_device_train_batch_size: 1
@@ -33,6 +35,7 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null

 ### eval
 # val_size: 0.1

View File

@@ -20,6 +20,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4

 ### output
 output_dir: saves/qwen2_vl-7b/lora/dpo
@@ -27,6 +28,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false

 ### train
 per_device_train_batch_size: 1
@@ -37,6 +39,7 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null

 ### eval
 # val_size: 0.1

View File

@@ -18,6 +18,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4

 ### output
 output_dir: saves/qwen2_vl-7b/lora/sft
@@ -25,6 +26,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false

 ### train
 per_device_train_batch_size: 1
@@ -35,6 +37,7 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null

 ### eval
 # val_size: 0.1

View File

@@ -521,9 +521,7 @@ class MiniCPMVPlugin(BasePlugin):
         processor: Optional["ProcessorMixin"],
     ) -> List[Dict[str, str]]:
         self._validate_input(processor, images, videos, audios)
-        num_image_tokens = 0
-        num_video_tokens = 0
-        num_audio_tokens = 0
+        num_image_tokens, num_video_tokens, num_audio_tokens = 0, 0, 0
         messages = deepcopy(messages)
         image_processor: "BaseImageProcessor" = getattr(processor, "image_processor")
         mm_inputs = {}
@@ -1038,7 +1036,7 @@ class Qwen2AudioPlugin(BasePlugin):
 @dataclass
-class Qwen2vlPlugin(BasePlugin):
+class Qwen2VLPlugin(BasePlugin):
     @override
     def _preprocess_image(self, image: "ImageObject", **kwargs) -> "ImageObject":
         image = super()._preprocess_image(image, **kwargs)
@@ -1124,7 +1122,10 @@ class Qwen2vlPlugin(BasePlugin):
         processor: Optional["ProcessorMixin"],
     ) -> List[Dict[str, str]]:
         self._validate_input(processor, images, videos, audios)
+        num_image_tokens, num_video_tokens = 0, 0
+        messages = deepcopy(messages)
         image_processor: "BaseImageProcessor" = getattr(processor, "image_processor")
         merge_length: int = getattr(image_processor, "merge_size") ** 2
         if self.expand_mm_tokens:
             mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
@@ -1134,8 +1135,6 @@ class Qwen2vlPlugin(BasePlugin):
             image_grid_thw = [None] * len(images)
             video_grid_thw = [None] * len(videos)
-        num_image_tokens, num_video_tokens = 0, 0
-        messages = deepcopy(messages)
         for message in messages:
             content = message["content"]
             while IMAGE_PLACEHOLDER in content:
@@ -1273,7 +1272,7 @@ PLUGINS = {
     "paligemma": PaliGemmaPlugin,
     "pixtral": PixtralPlugin,
     "qwen2_audio": Qwen2AudioPlugin,
-    "qwen2_vl": Qwen2vlPlugin,
+    "qwen2_vl": Qwen2VLPlugin,
     "video_llava": VideoLlavaPlugin,
 }
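Note that call sites resolve plugins through the PLUGINS registry rather than by class name, so the Qwen2vlPlugin -> Qwen2VLPlugin rename is transparent to them. A small illustration, assuming the module path llamafactory.data.mm_plugin:

# Illustration only: lookups keyed by "qwen2_vl" are unaffected by the class rename.
from llamafactory.data.mm_plugin import PLUGINS

plugin_cls = PLUGINS["qwen2_vl"]  # same registry key as before
assert plugin_cls.__name__ == "Qwen2VLPlugin"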

View File

@@ -17,6 +17,7 @@ import shutil
 from typing import TYPE_CHECKING, Any, Dict, List, Optional

 import torch
+import torch.distributed as dist
 from transformers import PreTrainedModel

 from ..data import get_template_and_fix_tokenizer
@@ -76,6 +77,12 @@ def _training_function(config: Dict[str, Any]) -> None:
     else:
         raise ValueError(f"Unknown task: {finetuning_args.stage}.")

+    try:
+        if dist.is_initialized():
+            dist.destroy_process_group()
+    except Exception as e:
+        logger.warning(f"Failed to destroy process group: {e}.")
+

 def run_exp(args: Optional[Dict[str, Any]] = None, callbacks: Optional[List["TrainerCallback"]] = None) -> None:
     args = read_args(args)
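The cleanup added to _training_function tears down the default torch.distributed process group once a run finishes, so a completed run leaves no initialized group behind in the hosting process. A self-contained sketch of the same pattern, using the standard logging module in place of the project's logger:

# Sketch of the cleanup pattern added above: destroy the torch.distributed
# process group after training, tolerating failures instead of crashing.
import logging

import torch.distributed as dist

logger = logging.getLogger(__name__)

def shutdown_process_group() -> None:
    try:
        if dist.is_initialized():
            dist.destroy_process_group()
    except Exception as e:
        logger.warning(f"Failed to destroy process group: {e}.")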