From 7ad5b5c088a0517beaf8868f3d2244580e231ca5 Mon Sep 17 00:00:00 2001 From: Ting Date: Tue, 19 Nov 2024 17:15:47 +0800 Subject: [PATCH 1/4] support efficient tokens calculation on sft/dpo Former-commit-id: b9f00286d8a017ed9fd2876986da3b4d7034ef07 --- src/llamafactory/train/dpo/workflow.py | 10 ++++++++++ src/llamafactory/train/sft/workflow.py | 10 ++++++++++ 2 files changed, 20 insertions(+) diff --git a/src/llamafactory/train/dpo/workflow.py b/src/llamafactory/train/dpo/workflow.py index 3a8464ec..a94a7eff 100644 --- a/src/llamafactory/train/dpo/workflow.py +++ b/src/llamafactory/train/dpo/workflow.py @@ -16,6 +16,7 @@ # limitations under the License. from typing import TYPE_CHECKING, List, Optional +import torch.distributed as dist from ...data import PairwiseDataCollatorWithPadding, get_dataset, get_template_and_fix_tokenizer from ...extras.constants import IGNORE_INDEX @@ -64,6 +65,11 @@ def run_dpo( # Update arguments training_args.remove_unused_columns = False # important for multimodal and pairwise dataset + effi_token_num = 0.0 + for data in dataset_module["train_dataset"]: + effi_token_num += len(data["chosen_input_ids"]) + effi_token_num += len(data["rejected_input_ids"]) + # Initialize our Trainer trainer = CustomDPOTrainer( model=model, @@ -79,6 +85,10 @@ def run_dpo( # Training if training_args.do_train: train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) + train_result.metrics['effective_tokens_per_sec'] = effi_token_num * train_result.metrics['epoch'] / train_result.metrics['train_runtime'] + if dist.is_initialized(): + train_result.metrics['effective_tokens_per_sec'] = train_result.metrics['effective_tokens_per_sec'] / dist.get_world_size() + trainer.save_model() trainer.log_metrics("train", train_result.metrics) trainer.save_metrics("train", train_result.metrics) diff --git a/src/llamafactory/train/sft/workflow.py b/src/llamafactory/train/sft/workflow.py index 43a9aef1..c288b69f 100644 --- a/src/llamafactory/train/sft/workflow.py +++ b/src/llamafactory/train/sft/workflow.py @@ -16,6 +16,7 @@ # limitations under the License. 
from typing import TYPE_CHECKING, List, Optional +import torch.distributed as dist from ...data import SFTDataCollatorWith4DAttentionMask, get_dataset, get_template_and_fix_tokenizer from ...extras.constants import IGNORE_INDEX @@ -65,6 +66,10 @@ def run_sft( training_args.generation_num_beams = data_args.eval_num_beams or training_args.generation_num_beams training_args.remove_unused_columns = False # important for multimodal dataset + effi_token_num = 0.0 + for data in dataset_module["train_dataset"]: + effi_token_num += len(data["input_ids"]) + # Metric utils metric_module = {} if training_args.predict_with_generate: @@ -94,6 +99,10 @@ def run_sft( # Training if training_args.do_train: train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) + train_result.metrics['effective_tokens_per_sec'] = effi_token_num * train_result.metrics['epoch'] / train_result.metrics['train_runtime'] + if dist.is_initialized(): + train_result.metrics['effective_tokens_per_sec'] = train_result.metrics['effective_tokens_per_sec'] / dist.get_world_size() + trainer.save_model() trainer.log_metrics("train", train_result.metrics) trainer.save_metrics("train", train_result.metrics) @@ -123,3 +132,4 @@ def run_sft( # Create model card create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args) + From bf2b8df5407af7c3373239555600736fceac4848 Mon Sep 17 00:00:00 2001 From: Ting Date: Tue, 19 Nov 2024 19:10:07 +0800 Subject: [PATCH 2/4] update Former-commit-id: ef6e14550dd76810285cee9c268590d1d9423e54 --- src/llamafactory/train/dpo/workflow.py | 9 +++++++-- src/llamafactory/train/sft/workflow.py | 14 +++++++++----- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/src/llamafactory/train/dpo/workflow.py b/src/llamafactory/train/dpo/workflow.py index a94a7eff..04f534ce 100644 --- a/src/llamafactory/train/dpo/workflow.py +++ b/src/llamafactory/train/dpo/workflow.py @@ -16,6 +16,7 @@ # limitations under the License. from typing import TYPE_CHECKING, List, Optional + import torch.distributed as dist from ...data import PairwiseDataCollatorWithPadding, get_dataset, get_template_and_fix_tokenizer @@ -85,9 +86,13 @@ def run_dpo( # Training if training_args.do_train: train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) - train_result.metrics['effective_tokens_per_sec'] = effi_token_num * train_result.metrics['epoch'] / train_result.metrics['train_runtime'] + train_result.metrics["effective_tokens_per_sec"] = ( + effi_token_num * train_result.metrics["epoch"] / train_result.metrics["train_runtime"] + ) if dist.is_initialized(): - train_result.metrics['effective_tokens_per_sec'] = train_result.metrics['effective_tokens_per_sec'] / dist.get_world_size() + train_result.metrics["effective_tokens_per_sec"] = ( + train_result.metrics["effective_tokens_per_sec"] / dist.get_world_size() + ) trainer.save_model() trainer.log_metrics("train", train_result.metrics) diff --git a/src/llamafactory/train/sft/workflow.py b/src/llamafactory/train/sft/workflow.py index c288b69f..197a4866 100644 --- a/src/llamafactory/train/sft/workflow.py +++ b/src/llamafactory/train/sft/workflow.py @@ -16,6 +16,7 @@ # limitations under the License. 
from typing import TYPE_CHECKING, List, Optional + import torch.distributed as dist from ...data import SFTDataCollatorWith4DAttentionMask, get_dataset, get_template_and_fix_tokenizer @@ -66,9 +67,9 @@ def run_sft( training_args.generation_num_beams = data_args.eval_num_beams or training_args.generation_num_beams training_args.remove_unused_columns = False # important for multimodal dataset - effi_token_num = 0.0 + effective_token_num = 0.0 for data in dataset_module["train_dataset"]: - effi_token_num += len(data["input_ids"]) + effective_token_num += len(data["input_ids"]) # Metric utils metric_module = {} @@ -99,9 +100,13 @@ def run_sft( # Training if training_args.do_train: train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) - train_result.metrics['effective_tokens_per_sec'] = effi_token_num * train_result.metrics['epoch'] / train_result.metrics['train_runtime'] + train_result.metrics["effective_tokens_per_sec"] = ( + effective_token_num * train_result.metrics["epoch"] / train_result.metrics["train_runtime"] + ) if dist.is_initialized(): - train_result.metrics['effective_tokens_per_sec'] = train_result.metrics['effective_tokens_per_sec'] / dist.get_world_size() + train_result.metrics["effective_tokens_per_sec"] = ( + train_result.metrics["effective_tokens_per_sec"] / dist.get_world_size() + ) trainer.save_model() trainer.log_metrics("train", train_result.metrics) @@ -132,4 +137,3 @@ def run_sft( # Create model card create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args) - From 32656bc50dc2d51b41a4f2da485fba0070902b2f Mon Sep 17 00:00:00 2001 From: Ting Date: Tue, 19 Nov 2024 19:12:10 +0800 Subject: [PATCH 3/4] update Former-commit-id: f566ecc8d1f04615351acbe4f8480b75b2daed42 --- src/llamafactory/train/dpo/workflow.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/llamafactory/train/dpo/workflow.py b/src/llamafactory/train/dpo/workflow.py index 04f534ce..c0767880 100644 --- a/src/llamafactory/train/dpo/workflow.py +++ b/src/llamafactory/train/dpo/workflow.py @@ -66,10 +66,10 @@ def run_dpo( # Update arguments training_args.remove_unused_columns = False # important for multimodal and pairwise dataset - effi_token_num = 0.0 + effective_token_num = 0.0 for data in dataset_module["train_dataset"]: - effi_token_num += len(data["chosen_input_ids"]) - effi_token_num += len(data["rejected_input_ids"]) + effective_token_num += len(data["chosen_input_ids"]) + effective_token_num += len(data["rejected_input_ids"]) # Initialize our Trainer trainer = CustomDPOTrainer( @@ -87,7 +87,7 @@ def run_dpo( if training_args.do_train: train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) train_result.metrics["effective_tokens_per_sec"] = ( - effi_token_num * train_result.metrics["epoch"] / train_result.metrics["train_runtime"] + effective_token_num * train_result.metrics["epoch"] / train_result.metrics["train_runtime"] ) if dist.is_initialized(): train_result.metrics["effective_tokens_per_sec"] = ( From e27a0c3d53e49110019a24000da0462d0314eb0d Mon Sep 17 00:00:00 2001 From: Ting Date: Tue, 19 Nov 2024 20:33:18 +0800 Subject: [PATCH 4/4] code refactor Former-commit-id: 40627c601efc9f144a227dded8c6b40babff4e8b --- src/llamafactory/extras/misc.py | 9 +++++++++ src/llamafactory/hparams/finetuning_args.py | 4 ++++ src/llamafactory/train/dpo/workflow.py | 20 +++++++++----------- src/llamafactory/train/sft/workflow.py | 18 +++++++----------- 4 files changed, 29 insertions(+), 22 
deletions(-) diff --git a/src/llamafactory/extras/misc.py b/src/llamafactory/extras/misc.py index c6183d1a..f46c0f88 100644 --- a/src/llamafactory/extras/misc.py +++ b/src/llamafactory/extras/misc.py @@ -20,6 +20,7 @@ import os from typing import TYPE_CHECKING, Tuple, Union import torch +import torch.distributed as dist import transformers.dynamic_module_utils from transformers import InfNanRemoveLogitsProcessor, LogitsProcessorList from transformers.dynamic_module_utils import get_relative_imports @@ -263,3 +264,11 @@ def use_modelscope() -> bool: def use_openmind() -> bool: return os.environ.get("USE_OPENMIND_HUB", "0").lower() in ["true", "1"] + + +def cal_effective_tokens(effective_token_num, epoch, train_runtime) -> int: + r""" + calculate effective tokens. + """ + result = effective_token_num * epoch / train_runtime + return result / dist.get_world_size() if dist.is_initialized() else result diff --git a/src/llamafactory/hparams/finetuning_args.py b/src/llamafactory/hparams/finetuning_args.py index ba1306e1..8cfea728 100644 --- a/src/llamafactory/hparams/finetuning_args.py +++ b/src/llamafactory/hparams/finetuning_args.py @@ -346,6 +346,10 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA default=False, metadata={"help": "Whether or not to save the training loss curves."}, ) + include_effective_tokens_per_second: bool = field( + default=False, + metadata={"help": "Whether or not to compute effective tokens per second."}, + ) def __post_init__(self): def split_arg(arg): diff --git a/src/llamafactory/train/dpo/workflow.py b/src/llamafactory/train/dpo/workflow.py index c0767880..8c3e7401 100644 --- a/src/llamafactory/train/dpo/workflow.py +++ b/src/llamafactory/train/dpo/workflow.py @@ -17,10 +17,9 @@ from typing import TYPE_CHECKING, List, Optional -import torch.distributed as dist - from ...data import PairwiseDataCollatorWithPadding, get_dataset, get_template_and_fix_tokenizer from ...extras.constants import IGNORE_INDEX +from ...extras.misc import cal_effective_tokens from ...extras.ploting import plot_loss from ...hparams import ModelArguments from ...model import load_model, load_tokenizer @@ -67,9 +66,10 @@ def run_dpo( training_args.remove_unused_columns = False # important for multimodal and pairwise dataset effective_token_num = 0.0 - for data in dataset_module["train_dataset"]: - effective_token_num += len(data["chosen_input_ids"]) - effective_token_num += len(data["rejected_input_ids"]) + if finetuning_args.include_effective_tokens_per_second: + for data in dataset_module["train_dataset"]: + effective_token_num += len(data["chosen_input_ids"]) + effective_token_num += len(data["rejected_input_ids"]) # Initialize our Trainer trainer = CustomDPOTrainer( @@ -86,12 +86,10 @@ def run_dpo( # Training if training_args.do_train: train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) - train_result.metrics["effective_tokens_per_sec"] = ( - effective_token_num * train_result.metrics["epoch"] / train_result.metrics["train_runtime"] - ) - if dist.is_initialized(): - train_result.metrics["effective_tokens_per_sec"] = ( - train_result.metrics["effective_tokens_per_sec"] / dist.get_world_size() + + if finetuning_args.include_effective_tokens_per_second: + train_result.metrics["effective_tokens_per_sec"] = cal_effective_tokens( + effective_token_num, train_result.metrics["epoch"], train_result.metrics["train_runtime"] ) trainer.save_model() diff --git a/src/llamafactory/train/sft/workflow.py 
b/src/llamafactory/train/sft/workflow.py index 197a4866..d8dafc5f 100644 --- a/src/llamafactory/train/sft/workflow.py +++ b/src/llamafactory/train/sft/workflow.py @@ -17,11 +17,9 @@ from typing import TYPE_CHECKING, List, Optional -import torch.distributed as dist - from ...data import SFTDataCollatorWith4DAttentionMask, get_dataset, get_template_and_fix_tokenizer from ...extras.constants import IGNORE_INDEX -from ...extras.misc import get_logits_processor +from ...extras.misc import cal_effective_tokens, get_logits_processor from ...extras.ploting import plot_loss from ...model import load_model, load_tokenizer from ..trainer_utils import create_modelcard_and_push @@ -68,8 +66,9 @@ def run_sft( training_args.remove_unused_columns = False # important for multimodal dataset effective_token_num = 0.0 - for data in dataset_module["train_dataset"]: - effective_token_num += len(data["input_ids"]) + if finetuning_args.include_effective_tokens_per_second: + for data in dataset_module["train_dataset"]: + effective_token_num += len(data["input_ids"]) # Metric utils metric_module = {} @@ -100,12 +99,9 @@ def run_sft( # Training if training_args.do_train: train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) - train_result.metrics["effective_tokens_per_sec"] = ( - effective_token_num * train_result.metrics["epoch"] / train_result.metrics["train_runtime"] - ) - if dist.is_initialized(): - train_result.metrics["effective_tokens_per_sec"] = ( - train_result.metrics["effective_tokens_per_sec"] / dist.get_world_size() + if finetuning_args.include_effective_tokens_per_second: + train_result.metrics["effective_tokens_per_sec"] = cal_effective_tokens( + effective_token_num, train_result.metrics["epoch"], train_result.metrics["train_runtime"] ) trainer.save_model()
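The series converges on a single helper: patch 4 moves the throughput math into cal_effective_tokens() in src/llamafactory/extras/misc.py and gates the token counting (a full pass over the training dataset) behind the new include_effective_tokens_per_second finetuning argument, so the default training path is unchanged. A minimal, self-contained sketch of the same computation follows; the toy dataset and metric values are illustrative placeholders, not part of the patch, and the return annotation here is float because the division yields a float.

import torch.distributed as dist


def cal_effective_tokens(effective_token_num: float, epoch: float, train_runtime: float) -> float:
    # Effective tokens processed per second, averaged over ranks when
    # torch.distributed has been initialized (mirrors the patch-4 helper).
    result = effective_token_num * epoch / train_runtime
    return result / dist.get_world_size() if dist.is_initialized() else result


# Illustrative stand-ins (not from the patch): a tiny tokenized dataset and
# the metrics dict as returned by trainer.train().metrics.
toy_dataset = [
    {"input_ids": [1, 2, 3, 4]},
    {"input_ids": [5, 6, 7]},
]
effective_token_num = sum(len(example["input_ids"]) for example in toy_dataset)

metrics = {"epoch": 3.0, "train_runtime": 120.0}
metrics["effective_tokens_per_sec"] = cal_effective_tokens(
    effective_token_num, metrics["epoch"], metrics["train_runtime"]
)
print(metrics["effective_tokens_per_sec"])  # 7 tokens * 3 epochs / 120 s = 0.175

With these patches applied, the metric is reported only when include_effective_tokens_per_second is set to true in the finetuning arguments; for DPO the count sums both chosen_input_ids and rejected_input_ids, while SFT counts input_ids alone.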