[trainer] update config (#7174)

Former-commit-id: b4b89b4ff3bc03aa388569e253d62580755a77a5
Parent: 8dddffa340
Commit: 6e58115f98
@@ -15,6 +15,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4
 
 ### output
 output_dir: saves/llama3-8b/full/sft
@@ -22,6 +23,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
 
 ### train
 per_device_train_batch_size: 1
@@ -32,6 +34,7 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null
 
 ### eval
 # eval_dataset: alpaca_en_demo
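Every config file touched by this commit gains the same three trainer keys. A minimal sketch of what they control, assuming the YAML keys map one-to-one onto the HuggingFace TrainingArguments fields of the same names (which is how LLaMA-Factory consumes them):

from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="saves/llama3-8b/full/sft",
    dataloader_num_workers=4,     # DataLoader worker subprocesses (0 = load in the main process)
    save_only_model=False,        # False keeps optimizer/scheduler state in checkpoints
    resume_from_checkpoint=None,  # or a checkpoint directory to continue a run
)

Setting save_only_model to false explicitly is what makes resume_from_checkpoint useful: resuming needs the optimizer and scheduler state alongside the weights.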
@@ -20,6 +20,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4
 
 ### output
 output_dir: saves/qwen2_vl-7b/full/sft
@@ -27,6 +28,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
 
 ### train
 per_device_train_batch_size: 1
@@ -37,6 +39,7 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null
 
 ### eval
 # val_size: 0.1
@@ -18,6 +18,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4
 
 ### output
 output_dir: saves/llama3-8b/lora/dpo
@@ -25,6 +26,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
 
 ### train
 per_device_train_batch_size: 1
@@ -35,6 +37,7 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null
 
 ### eval
 # eval_dataset: dpo_en_demo
@@ -15,6 +15,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4
 
 ### output
 output_dir: saves/llama3-8b/lora/pretrain
@@ -22,6 +23,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
 
 ### train
 per_device_train_batch_size: 1
@@ -32,6 +34,7 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null
 
 ### eval
 # eval_dataset: c4_demo
@@ -16,6 +16,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4
 
 ### output
 output_dir: saves/llama3-8b/lora/reward
@@ -23,6 +24,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
 
 ### train
 per_device_train_batch_size: 1
@@ -33,6 +35,7 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null
 
 ### eval
 # eval_dataset: dpo_en_demo
@@ -16,6 +16,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4
 
 ### output
 output_dir: saves/llama3-8b/lora/sft
@@ -23,6 +24,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
 
 ### train
 per_device_train_batch_size: 1
@@ -33,6 +35,7 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null
 
 ### eval
 # eval_dataset: alpaca_en_demo
@@ -17,6 +17,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4
 
 ### output
 output_dir: saves/llama3-8b/lora/sft
@@ -24,6 +25,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
 
 ### train
 per_device_train_batch_size: 1
@@ -34,8 +36,10 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null
 
 ### eval
+# eval_dataset: alpaca_en_demo
 # val_size: 0.1
 # per_device_eval_batch_size: 1
 # eval_strategy: steps
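This variant also gains a commented-out eval_dataset line, completing the optional eval block. Uncommenting those four keys enables periodic evaluation; a rough sketch of the HF side, assuming a transformers version that accepts eval_strategy (4.41+) and noting that eval_dataset and val_size are LLaMA-Factory data arguments rather than TrainingArguments fields:

from transformers import TrainingArguments

eval_args = TrainingArguments(
    output_dir="saves/llama3-8b/lora/sft",
    per_device_eval_batch_size=1,
    eval_strategy="steps",  # evaluate every eval_steps during training
    eval_steps=500,         # assumed interval; not part of this diff
)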
@@ -17,6 +17,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4
 
 ### output
 output_dir: tmp_dir
@@ -24,9 +25,11 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
 
 ### ray
 ray_run_name: llama3_8b_sft_lora
+ray_storage_path: ./saves
 ray_num_workers: 4 # number of GPUs to use
 resources_per_worker:
   GPU: 1
@@ -41,8 +44,10 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null
 
 ### eval
+# eval_dataset: alpaca_en_demo
 # val_size: 0.1
 # per_device_eval_batch_size: 1
 # eval_strategy: steps
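The Ray example additionally gains ray_storage_path. A minimal sketch of where these keys plausibly land, assuming LLaMA-Factory forwards them to Ray Train's ScalingConfig and RunConfig (the actual wiring lives in the trainer code, which this diff does not show):

from ray.train import RunConfig, ScalingConfig

scaling = ScalingConfig(
    num_workers=4,                    # ray_num_workers: one worker per GPU
    resources_per_worker={"GPU": 1},  # resources_per_worker.GPU
)
run_config = RunConfig(
    name="llama3_8b_sft_lora",  # ray_run_name
    storage_path="./saves",     # ray_storage_path: where checkpoints/results persist
)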
@@ -16,6 +16,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4
 
 ### output
 output_dir: saves/llava1_5-7b/lora/sft
@@ -23,6 +24,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
 
 ### train
 per_device_train_batch_size: 1
@@ -33,6 +35,7 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null
 
 ### eval
 # val_size: 0.1
@@ -20,6 +20,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4
 
 ### output
 output_dir: saves/qwen2_vl-7b/lora/dpo
@@ -27,6 +28,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
 
 ### train
 per_device_train_batch_size: 1
@@ -37,6 +39,7 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null
 
 ### eval
 # val_size: 0.1
@@ -18,6 +18,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4
 
 ### output
 output_dir: saves/qwen2_vl-7b/lora/sft
@@ -25,6 +26,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
 
 ### train
 per_device_train_batch_size: 1
@@ -35,6 +37,7 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null
 
 ### eval
 # val_size: 0.1
@@ -521,9 +521,7 @@ class MiniCPMVPlugin(BasePlugin):
         processor: Optional["ProcessorMixin"],
     ) -> List[Dict[str, str]]:
         self._validate_input(processor, images, videos, audios)
-        num_image_tokens = 0
-        num_video_tokens = 0
-        num_audio_tokens = 0
+        num_image_tokens, num_video_tokens, num_audio_tokens = 0, 0, 0
         messages = deepcopy(messages)
         image_processor: "BaseImageProcessor" = getattr(processor, "image_processor")
         mm_inputs = {}
@@ -1038,7 +1036,7 @@ class Qwen2AudioPlugin(BasePlugin):
 
 
 @dataclass
-class Qwen2vlPlugin(BasePlugin):
+class Qwen2VLPlugin(BasePlugin):
     @override
     def _preprocess_image(self, image: "ImageObject", **kwargs) -> "ImageObject":
         image = super()._preprocess_image(image, **kwargs)
@@ -1124,7 +1122,10 @@ class Qwen2vlPlugin(BasePlugin):
         processor: Optional["ProcessorMixin"],
     ) -> List[Dict[str, str]]:
         self._validate_input(processor, images, videos, audios)
+        num_image_tokens, num_video_tokens = 0, 0
+        messages = deepcopy(messages)
         image_processor: "BaseImageProcessor" = getattr(processor, "image_processor")
 
         merge_length: int = getattr(image_processor, "merge_size") ** 2
         if self.expand_mm_tokens:
             mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
@@ -1134,8 +1135,6 @@ class Qwen2vlPlugin(BasePlugin):
             image_grid_thw = [None] * len(images)
             video_grid_thw = [None] * len(videos)
 
-        num_image_tokens, num_video_tokens = 0, 0
-        messages = deepcopy(messages)
         for message in messages:
             content = message["content"]
             while IMAGE_PLACEHOLDER in content:
@@ -1273,7 +1272,7 @@ PLUGINS = {
     "paligemma": PaliGemmaPlugin,
     "pixtral": PixtralPlugin,
     "qwen2_audio": Qwen2AudioPlugin,
-    "qwen2_vl": Qwen2vlPlugin,
+    "qwen2_vl": Qwen2VLPlugin,
     "video_llava": VideoLlavaPlugin,
 }
 
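Besides hoisting the token counters and the deepcopy above the expand_mm_tokens branch, this file renames Qwen2vlPlugin to Qwen2VLPlugin (PEP 8 CapWords). Because plugins are resolved through the string key "qwen2_vl", configs and callers are unaffected. A self-contained sketch (toy classes, not the real plugin API) of why a string-keyed registry makes the rename transparent:

from dataclasses import dataclass

@dataclass
class BasePlugin:
    image_token: str = "<image>"

@dataclass
class Qwen2VLPlugin(BasePlugin):  # renamed from Qwen2vlPlugin; the key below is unchanged
    pass

PLUGINS = {"qwen2_vl": Qwen2VLPlugin}

plugin = PLUGINS["qwen2_vl"]()  # lookups never mention the class name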
@@ -17,6 +17,7 @@ import shutil
 from typing import TYPE_CHECKING, Any, Dict, List, Optional
 
 import torch
+import torch.distributed as dist
 from transformers import PreTrainedModel
 
 from ..data import get_template_and_fix_tokenizer
@@ -76,6 +77,12 @@ def _training_function(config: Dict[str, Any]) -> None:
     else:
         raise ValueError(f"Unknown task: {finetuning_args.stage}.")
 
+    try:
+        if dist.is_initialized():
+            dist.destroy_process_group()
+    except Exception as e:
+        logger.warning(f"Failed to destroy process group: {e}.")
+
 
 def run_exp(args: Optional[Dict[str, Any]] = None, callbacks: Optional[List["TrainerCallback"]] = None) -> None:
     args = read_args(args)
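The new teardown follows the standard torch.distributed lifecycle: only a group that was actually initialized gets destroyed, and failures are logged rather than raised so cleanup can never mask a finished run. A minimal standalone sketch of the same pattern (the gloo backend and torchrun-style env detection are assumptions for the sketch; in LLaMA-Factory the process group is created by the HF Trainer):

import os
import torch.distributed as dist

def main() -> None:
    if "RANK" in os.environ:  # set by torchrun and similar launchers
        dist.init_process_group(backend="gloo")
    try:
        pass  # training would run here
    finally:
        try:
            if dist.is_initialized():
                dist.destroy_process_group()  # release communicator resources
        except Exception as exc:
            print(f"Failed to destroy process group: {exc}")

if __name__ == "__main__":
    main()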