From 7ad5b5c088a0517beaf8868f3d2244580e231ca5 Mon Sep 17 00:00:00 2001 From: Ting Date: Tue, 19 Nov 2024 17:15:47 +0800 Subject: [PATCH 1/4] support efficient tokens calculation on sft/dpo Former-commit-id: b9f00286d8a017ed9fd2876986da3b4d7034ef07 --- src/llamafactory/train/dpo/workflow.py | 10 ++++++++++ src/llamafactory/train/sft/workflow.py | 10 ++++++++++ 2 files changed, 20 insertions(+) diff --git a/src/llamafactory/train/dpo/workflow.py b/src/llamafactory/train/dpo/workflow.py index 3a8464ec..a94a7eff 100644 --- a/src/llamafactory/train/dpo/workflow.py +++ b/src/llamafactory/train/dpo/workflow.py @@ -16,6 +16,7 @@ # limitations under the License. from typing import TYPE_CHECKING, List, Optional +import torch.distributed as dist from ...data import PairwiseDataCollatorWithPadding, get_dataset, get_template_and_fix_tokenizer from ...extras.constants import IGNORE_INDEX @@ -64,6 +65,11 @@ def run_dpo( # Update arguments training_args.remove_unused_columns = False # important for multimodal and pairwise dataset + effi_token_num = 0.0 + for data in dataset_module["train_dataset"]: + effi_token_num += len(data["chosen_input_ids"]) + effi_token_num += len(data["rejected_input_ids"]) + # Initialize our Trainer trainer = CustomDPOTrainer( model=model, @@ -79,6 +85,10 @@ def run_dpo( # Training if training_args.do_train: train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) + train_result.metrics['effective_tokens_per_sec'] = effi_token_num * train_result.metrics['epoch'] / train_result.metrics['train_runtime'] + if dist.is_initialized(): + train_result.metrics['effective_tokens_per_sec'] = train_result.metrics['effective_tokens_per_sec'] / dist.get_world_size() + trainer.save_model() trainer.log_metrics("train", train_result.metrics) trainer.save_metrics("train", train_result.metrics) diff --git a/src/llamafactory/train/sft/workflow.py b/src/llamafactory/train/sft/workflow.py index 43a9aef1..c288b69f 100644 --- a/src/llamafactory/train/sft/workflow.py +++ b/src/llamafactory/train/sft/workflow.py @@ -16,6 +16,7 @@ # limitations under the License. 
from typing import TYPE_CHECKING, List, Optional +import torch.distributed as dist from ...data import SFTDataCollatorWith4DAttentionMask, get_dataset, get_template_and_fix_tokenizer from ...extras.constants import IGNORE_INDEX @@ -65,6 +66,10 @@ def run_sft( training_args.generation_num_beams = data_args.eval_num_beams or training_args.generation_num_beams training_args.remove_unused_columns = False # important for multimodal dataset + effi_token_num = 0.0 + for data in dataset_module["train_dataset"]: + effi_token_num += len(data["input_ids"]) + # Metric utils metric_module = {} if training_args.predict_with_generate: @@ -94,6 +99,10 @@ def run_sft( # Training if training_args.do_train: train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) + train_result.metrics['effective_tokens_per_sec'] = effi_token_num * train_result.metrics['epoch'] / train_result.metrics['train_runtime'] + if dist.is_initialized(): + train_result.metrics['effective_tokens_per_sec'] = train_result.metrics['effective_tokens_per_sec'] / dist.get_world_size() + trainer.save_model() trainer.log_metrics("train", train_result.metrics) trainer.save_metrics("train", train_result.metrics) @@ -123,3 +132,4 @@ def run_sft( # Create model card create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args) + From bf2b8df5407af7c3373239555600736fceac4848 Mon Sep 17 00:00:00 2001 From: Ting Date: Tue, 19 Nov 2024 19:10:07 +0800 Subject: [PATCH 2/4] update Former-commit-id: ef6e14550dd76810285cee9c268590d1d9423e54 --- src/llamafactory/train/dpo/workflow.py | 9 +++++++-- src/llamafactory/train/sft/workflow.py | 14 +++++++++----- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/src/llamafactory/train/dpo/workflow.py b/src/llamafactory/train/dpo/workflow.py index a94a7eff..04f534ce 100644 --- a/src/llamafactory/train/dpo/workflow.py +++ b/src/llamafactory/train/dpo/workflow.py @@ -16,6 +16,7 @@ # limitations under the License. from typing import TYPE_CHECKING, List, Optional + import torch.distributed as dist from ...data import PairwiseDataCollatorWithPadding, get_dataset, get_template_and_fix_tokenizer @@ -85,9 +86,13 @@ def run_dpo( # Training if training_args.do_train: train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) - train_result.metrics['effective_tokens_per_sec'] = effi_token_num * train_result.metrics['epoch'] / train_result.metrics['train_runtime'] + train_result.metrics["effective_tokens_per_sec"] = ( + effi_token_num * train_result.metrics["epoch"] / train_result.metrics["train_runtime"] + ) if dist.is_initialized(): - train_result.metrics['effective_tokens_per_sec'] = train_result.metrics['effective_tokens_per_sec'] / dist.get_world_size() + train_result.metrics["effective_tokens_per_sec"] = ( + train_result.metrics["effective_tokens_per_sec"] / dist.get_world_size() + ) trainer.save_model() trainer.log_metrics("train", train_result.metrics) diff --git a/src/llamafactory/train/sft/workflow.py b/src/llamafactory/train/sft/workflow.py index c288b69f..197a4866 100644 --- a/src/llamafactory/train/sft/workflow.py +++ b/src/llamafactory/train/sft/workflow.py @@ -16,6 +16,7 @@ # limitations under the License. 
from typing import TYPE_CHECKING, List, Optional + import torch.distributed as dist from ...data import SFTDataCollatorWith4DAttentionMask, get_dataset, get_template_and_fix_tokenizer @@ -66,9 +67,9 @@ def run_sft( training_args.generation_num_beams = data_args.eval_num_beams or training_args.generation_num_beams training_args.remove_unused_columns = False # important for multimodal dataset - effi_token_num = 0.0 + effective_token_num = 0.0 for data in dataset_module["train_dataset"]: - effi_token_num += len(data["input_ids"]) + effective_token_num += len(data["input_ids"]) # Metric utils metric_module = {} @@ -99,9 +100,13 @@ def run_sft( # Training if training_args.do_train: train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) - train_result.metrics['effective_tokens_per_sec'] = effi_token_num * train_result.metrics['epoch'] / train_result.metrics['train_runtime'] + train_result.metrics["effective_tokens_per_sec"] = ( + effective_token_num * train_result.metrics["epoch"] / train_result.metrics["train_runtime"] + ) if dist.is_initialized(): - train_result.metrics['effective_tokens_per_sec'] = train_result.metrics['effective_tokens_per_sec'] / dist.get_world_size() + train_result.metrics["effective_tokens_per_sec"] = ( + train_result.metrics["effective_tokens_per_sec"] / dist.get_world_size() + ) trainer.save_model() trainer.log_metrics("train", train_result.metrics) @@ -132,4 +137,3 @@ def run_sft( # Create model card create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args) - From 32656bc50dc2d51b41a4f2da485fba0070902b2f Mon Sep 17 00:00:00 2001 From: Ting Date: Tue, 19 Nov 2024 19:12:10 +0800 Subject: [PATCH 3/4] update Former-commit-id: f566ecc8d1f04615351acbe4f8480b75b2daed42 --- src/llamafactory/train/dpo/workflow.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/llamafactory/train/dpo/workflow.py b/src/llamafactory/train/dpo/workflow.py index 04f534ce..c0767880 100644 --- a/src/llamafactory/train/dpo/workflow.py +++ b/src/llamafactory/train/dpo/workflow.py @@ -66,10 +66,10 @@ def run_dpo( # Update arguments training_args.remove_unused_columns = False # important for multimodal and pairwise dataset - effi_token_num = 0.0 + effective_token_num = 0.0 for data in dataset_module["train_dataset"]: - effi_token_num += len(data["chosen_input_ids"]) - effi_token_num += len(data["rejected_input_ids"]) + effective_token_num += len(data["chosen_input_ids"]) + effective_token_num += len(data["rejected_input_ids"]) # Initialize our Trainer trainer = CustomDPOTrainer( @@ -87,7 +87,7 @@ def run_dpo( if training_args.do_train: train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) train_result.metrics["effective_tokens_per_sec"] = ( - effi_token_num * train_result.metrics["epoch"] / train_result.metrics["train_runtime"] + effective_token_num * train_result.metrics["epoch"] / train_result.metrics["train_runtime"] ) if dist.is_initialized(): train_result.metrics["effective_tokens_per_sec"] = ( From e27a0c3d53e49110019a24000da0462d0314eb0d Mon Sep 17 00:00:00 2001 From: Ting Date: Tue, 19 Nov 2024 20:33:18 +0800 Subject: [PATCH 4/4] code refactor Former-commit-id: 40627c601efc9f144a227dded8c6b40babff4e8b --- src/llamafactory/extras/misc.py | 9 +++++++++ src/llamafactory/hparams/finetuning_args.py | 4 ++++ src/llamafactory/train/dpo/workflow.py | 20 +++++++++----------- src/llamafactory/train/sft/workflow.py | 18 +++++++----------- 4 files changed, 29 insertions(+), 22 
deletions(-) diff --git a/src/llamafactory/extras/misc.py b/src/llamafactory/extras/misc.py index c6183d1a..f46c0f88 100644 --- a/src/llamafactory/extras/misc.py +++ b/src/llamafactory/extras/misc.py @@ -20,6 +20,7 @@ import os from typing import TYPE_CHECKING, Tuple, Union import torch +import torch.distributed as dist import transformers.dynamic_module_utils from transformers import InfNanRemoveLogitsProcessor, LogitsProcessorList from transformers.dynamic_module_utils import get_relative_imports @@ -263,3 +264,11 @@ def use_modelscope() -> bool: def use_openmind() -> bool: return os.environ.get("USE_OPENMIND_HUB", "0").lower() in ["true", "1"] + + +def cal_effective_tokens(effective_token_num, epoch, train_runtime) -> int: + r""" + calculate effective tokens. + """ + result = effective_token_num * epoch / train_runtime + return result / dist.get_world_size() if dist.is_initialized() else result diff --git a/src/llamafactory/hparams/finetuning_args.py b/src/llamafactory/hparams/finetuning_args.py index ba1306e1..8cfea728 100644 --- a/src/llamafactory/hparams/finetuning_args.py +++ b/src/llamafactory/hparams/finetuning_args.py @@ -346,6 +346,10 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA default=False, metadata={"help": "Whether or not to save the training loss curves."}, ) + include_effective_tokens_per_second: bool = field( + default=False, + metadata={"help": "Whether or not to compute effective tokens per second."}, + ) def __post_init__(self): def split_arg(arg): diff --git a/src/llamafactory/train/dpo/workflow.py b/src/llamafactory/train/dpo/workflow.py index c0767880..8c3e7401 100644 --- a/src/llamafactory/train/dpo/workflow.py +++ b/src/llamafactory/train/dpo/workflow.py @@ -17,10 +17,9 @@ from typing import TYPE_CHECKING, List, Optional -import torch.distributed as dist - from ...data import PairwiseDataCollatorWithPadding, get_dataset, get_template_and_fix_tokenizer from ...extras.constants import IGNORE_INDEX +from ...extras.misc import cal_effective_tokens from ...extras.ploting import plot_loss from ...hparams import ModelArguments from ...model import load_model, load_tokenizer @@ -67,9 +66,10 @@ def run_dpo( training_args.remove_unused_columns = False # important for multimodal and pairwise dataset effective_token_num = 0.0 - for data in dataset_module["train_dataset"]: - effective_token_num += len(data["chosen_input_ids"]) - effective_token_num += len(data["rejected_input_ids"]) + if finetuning_args.include_effective_tokens_per_second: + for data in dataset_module["train_dataset"]: + effective_token_num += len(data["chosen_input_ids"]) + effective_token_num += len(data["rejected_input_ids"]) # Initialize our Trainer trainer = CustomDPOTrainer( @@ -86,12 +86,10 @@ def run_dpo( # Training if training_args.do_train: train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) - train_result.metrics["effective_tokens_per_sec"] = ( - effective_token_num * train_result.metrics["epoch"] / train_result.metrics["train_runtime"] - ) - if dist.is_initialized(): - train_result.metrics["effective_tokens_per_sec"] = ( - train_result.metrics["effective_tokens_per_sec"] / dist.get_world_size() + + if finetuning_args.include_effective_tokens_per_second: + train_result.metrics["effective_tokens_per_sec"] = cal_effective_tokens( + effective_token_num, train_result.metrics["epoch"], train_result.metrics["train_runtime"] ) trainer.save_model() diff --git a/src/llamafactory/train/sft/workflow.py 
b/src/llamafactory/train/sft/workflow.py index 197a4866..d8dafc5f 100644 --- a/src/llamafactory/train/sft/workflow.py +++ b/src/llamafactory/train/sft/workflow.py @@ -17,11 +17,9 @@ from typing import TYPE_CHECKING, List, Optional -import torch.distributed as dist - from ...data import SFTDataCollatorWith4DAttentionMask, get_dataset, get_template_and_fix_tokenizer from ...extras.constants import IGNORE_INDEX -from ...extras.misc import get_logits_processor +from ...extras.misc import cal_effective_tokens, get_logits_processor from ...extras.ploting import plot_loss from ...model import load_model, load_tokenizer from ..trainer_utils import create_modelcard_and_push @@ -68,8 +66,9 @@ def run_sft( training_args.remove_unused_columns = False # important for multimodal dataset effective_token_num = 0.0 - for data in dataset_module["train_dataset"]: - effective_token_num += len(data["input_ids"]) + if finetuning_args.include_effective_tokens_per_second: + for data in dataset_module["train_dataset"]: + effective_token_num += len(data["input_ids"]) # Metric utils metric_module = {} @@ -100,12 +99,9 @@ def run_sft( # Training if training_args.do_train: train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) - train_result.metrics["effective_tokens_per_sec"] = ( - effective_token_num * train_result.metrics["epoch"] / train_result.metrics["train_runtime"] - ) - if dist.is_initialized(): - train_result.metrics["effective_tokens_per_sec"] = ( - train_result.metrics["effective_tokens_per_sec"] / dist.get_world_size() + if finetuning_args.include_effective_tokens_per_second: + train_result.metrics["effective_tokens_per_sec"] = cal_effective_tokens( + effective_token_num, train_result.metrics["epoch"], train_result.metrics["train_runtime"] ) trainer.save_model()
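The series converges on a single helper: patch 4 moves the throughput math into cal_effective_tokens() in src/llamafactory/extras/misc.py and gates the token counting (a full pass over the training dataset) behind the new include_effective_tokens_per_second finetuning argument, so the default training path is unchanged. A minimal, self-contained sketch of the same computation follows; the toy dataset and metric values are illustrative placeholders, not part of the patch, and the return annotation here is float because the division yields a float.

import torch.distributed as dist


def cal_effective_tokens(effective_token_num: float, epoch: float, train_runtime: float) -> float:
    # Effective tokens processed per second, averaged over ranks when
    # torch.distributed has been initialized (mirrors the patch-4 helper).
    result = effective_token_num * epoch / train_runtime
    return result / dist.get_world_size() if dist.is_initialized() else result


# Illustrative stand-ins (not from the patch): a tiny tokenized dataset and
# the metrics dict as returned by trainer.train().metrics.
toy_dataset = [
    {"input_ids": [1, 2, 3, 4]},
    {"input_ids": [5, 6, 7]},
]
effective_token_num = sum(len(example["input_ids"]) for example in toy_dataset)

metrics = {"epoch": 3.0, "train_runtime": 120.0}
metrics["effective_tokens_per_sec"] = cal_effective_tokens(
    effective_token_num, metrics["epoch"], metrics["train_runtime"]
)
print(metrics["effective_tokens_per_sec"])  # 7 tokens * 3 epochs / 120 s = 0.175

With these patches applied, the metric is reported only when include_effective_tokens_per_second is set to true in the finetuning arguments; for DPO the count sums both chosen_input_ids and rejected_input_ids, while SFT counts input_ids alone.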