From ab1775cd953b4dba82ae2bef5f039b128d3091d7 Mon Sep 17 00:00:00 2001 From: "-.-" Date: Sat, 7 Sep 2024 23:21:35 +0800 Subject: [PATCH 1/3] update cal_mfu.py Former-commit-id: 66ec36522c9bf8dfffc1065202362801875a104d --- scripts/cal_mfu.py | 126 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 scripts/cal_mfu.py diff --git a/scripts/cal_mfu.py b/scripts/cal_mfu.py new file mode 100644 index 00000000..2f408497 --- /dev/null +++ b/scripts/cal_mfu.py @@ -0,0 +1,126 @@ +# coding=utf-8 +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License + +import torch +from transformers import AutoConfig +import fire +def model_flops_counter( + batch_size: int, + seqlen: int, + model_config: dict, + is_backward: bool = True, + is_recompute: bool = False, + is_flashattn: bool = False, +) -> float: + """ + calculate the FLOPs of model per iteration + """ + hidden_size = model_config.hidden_size + num_attention_heads = model_config.num_attention_heads + num_key_value_heads = model_config.num_key_value_heads + vocab_size = model_config.vocab_size + intermediate_size = model_config.intermediate_size + num_hidden_layers = model_config.num_hidden_layers + """ + B: batch_size + S: seqlen + L: num_hidden_layers + H: hidden_size + V: vocab_size + I: intermediate_size + """ + ### MLP calculation + per_mlp_calculation = 2 * hidden_size * intermediate_size + mlp_calculation_per_layer = per_mlp_calculation * 3 + mlp_calculation = batch_size * seqlen * mlp_calculation_per_layer * num_hidden_layers + + ### Attention calculation + Q_calculation = 2 * hidden_size * hidden_size + O_calculation = 2 * hidden_size * hidden_size + K_calculation = 2 * hidden_size * hidden_size * num_key_value_heads / num_attention_heads + V_calculation = 2 * hidden_size * hidden_size * num_key_value_heads / num_attention_heads + + QKVO_calculation = Q_calculation + O_calculation + K_calculation + V_calculation # 8H^2 / coe + self_attn_calculation = seqlen * hidden_size * 2 * 2 # (4 * S * H) + attention_calculation = batch_size * seqlen * num_hidden_layers * (QKVO_calculation + self_attn_calculation) # BSL(8H^2/coe + 4S * H) + + #Embedding and LMhead calculation + embedding_calculation = hidden_size * vocab_size + lmhead_calculation = hidden_size * vocab_size + IO_calculation = 3 * batch_size * seqlen * (embedding_calculation + lmhead_calculation) # 2 *(1+2)BSHV + E = attention_calculation + mlp_calculation + coefficient = 3 + fix_term = 0 + if(is_recompute): + coefficient = 4 + if(is_flashattn): + fix_term = batch_size *seqlen * self_attn_calculation + + total_calculation = coefficient * E + IO_calculation + fix_term + + return total_calculation + + +def hardware_flops_counter( + seconds: float, # seconds used in given iterations + num_gpus: int = 1, +) -> float: + if "A100" in torch.cuda.get_device_name(): + return 312 * 1e12 * seconds * num_gpus + elif "V100" in torch.cuda.get_device_name(): + return 125 * 1e12 * seconds * num_gpus + +def compute_mfu( + batch_size: 
int, + seqlen: int, + model_config: dict, + num_iter: int, + seconds: float, + num_gpus: int = 1, +) -> float: + """ + compute MFU given model configuration, training config and training information + """ + percentage = (num_iter * model_flops_counter(batch_size,seqlen,model_config)) / hardware_flops_counter(seconds, num_gpus) + + print(f"MFU : {percentage* 100:.2f}%") + return percentage + +# User input + +### model_name +model_name = "meta-llama/Meta-Llama-3-8B-Instruct" + +### training config +batch_size = 8 +seqlen = 1*1024 +num_gpus = 1 + +### training information +num_iter = 225 +seconds = 605 # time used in {num_iter} iterations + +model_config = AutoConfig.from_pretrained(model_name) +if __name__ == "__main__": + fire.Fire( + compute_mfu( + batch_size=batch_size, + seqlen=seqlen, + model_config=model_config, + num_iter=num_iter, + seconds=seconds, + num_gpus=num_gpus + ) + ) From 4f28e0e5d2fc727665b044d3bd7ea831a460826c Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Sun, 8 Sep 2024 00:39:48 +0800 Subject: [PATCH 2/3] Update cal_mfu.py Former-commit-id: e71133cab14eed2c071cfd4ce044ae9905447427 --- scripts/cal_mfu.py | 217 +++++++++++++++++++++++++-------------------- 1 file changed, 120 insertions(+), 97 deletions(-) diff --git a/scripts/cal_mfu.py b/scripts/cal_mfu.py index 2f408497..c4e851d7 100644 --- a/scripts/cal_mfu.py +++ b/scripts/cal_mfu.py @@ -11,116 +11,139 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License +# limitations under the License. +import json +import os + +import fire import torch from transformers import AutoConfig -import fire -def model_flops_counter( + +from llamafactory.train.tuner import run_exp + + +BASE = 2 # gemm (add + mul) + + +def compute_model_flops( + model_name_or_path: str, batch_size: int, - seqlen: int, - model_config: dict, - is_backward: bool = True, - is_recompute: bool = False, - is_flashattn: bool = False, -) -> float: + seq_length: int, + include_backward: bool = True, + include_recompute: bool = False, + include_flashattn: bool = False, +) -> int: + r""" + Calculates the FLOPs of model per forward/backward pass. 
""" - calculate the FLOPs of model per iteration - """ - hidden_size = model_config.hidden_size - num_attention_heads = model_config.num_attention_heads - num_key_value_heads = model_config.num_key_value_heads - vocab_size = model_config.vocab_size - intermediate_size = model_config.intermediate_size - num_hidden_layers = model_config.num_hidden_layers - """ - B: batch_size - S: seqlen - L: num_hidden_layers - H: hidden_size - V: vocab_size - I: intermediate_size - """ - ### MLP calculation - per_mlp_calculation = 2 * hidden_size * intermediate_size - mlp_calculation_per_layer = per_mlp_calculation * 3 - mlp_calculation = batch_size * seqlen * mlp_calculation_per_layer * num_hidden_layers + config = AutoConfig.from_pretrained(model_name_or_path) + hidden_size = getattr(config, "hidden_size", None) + vocab_size = getattr(config, "vocab_size", None) + intermediate_size = getattr(config, "intermediate_size", None) + num_attention_heads = getattr(config, "num_attention_heads", None) + num_key_value_heads = getattr(config, "num_key_value_heads", None) + num_hidden_layers = getattr(config, "num_hidden_layers", None) + tie_word_embeddings = getattr(config, "tie_word_embeddings", False) - ### Attention calculation - Q_calculation = 2 * hidden_size * hidden_size - O_calculation = 2 * hidden_size * hidden_size - K_calculation = 2 * hidden_size * hidden_size * num_key_value_heads / num_attention_heads - V_calculation = 2 * hidden_size * hidden_size * num_key_value_heads / num_attention_heads - - QKVO_calculation = Q_calculation + O_calculation + K_calculation + V_calculation # 8H^2 / coe - self_attn_calculation = seqlen * hidden_size * 2 * 2 # (4 * S * H) - attention_calculation = batch_size * seqlen * num_hidden_layers * (QKVO_calculation + self_attn_calculation) # BSL(8H^2/coe + 4S * H) - - #Embedding and LMhead calculation - embedding_calculation = hidden_size * vocab_size - lmhead_calculation = hidden_size * vocab_size - IO_calculation = 3 * batch_size * seqlen * (embedding_calculation + lmhead_calculation) # 2 *(1+2)BSHV - E = attention_calculation + mlp_calculation - coefficient = 3 - fix_term = 0 - if(is_recompute): - coefficient = 4 - if(is_flashattn): - fix_term = batch_size *seqlen * self_attn_calculation - - total_calculation = coefficient * E + IO_calculation + fix_term - - return total_calculation + # mlp module + mlp_flops_per_token = 3 * BASE * hidden_size * intermediate_size # up, gate, down + mlp_flops = batch_size * seq_length * num_hidden_layers * mlp_flops_per_token + + # attn projector module + q_flops_per_token = BASE * hidden_size * hidden_size + o_flops_per_token = BASE * hidden_size * hidden_size + k_flops_per_token = BASE * hidden_size * hidden_size * num_key_value_heads // num_attention_heads + v_flops_per_token = BASE * hidden_size * hidden_size * num_key_value_heads // num_attention_heads + attn_proj_flops_per_token = q_flops_per_token + o_flops_per_token + k_flops_per_token + v_flops_per_token + attn_proj_flops = batch_size * seq_length * num_hidden_layers * attn_proj_flops_per_token + + # attn sdpa module + sdpa_flops_per_layer = 2 * BASE * hidden_size * seq_length * seq_length # (q * k^T) * v + sdpa_flops = batch_size * num_hidden_layers * sdpa_flops_per_layer + + # embedding module + embedding_flops_per_token = hidden_size * vocab_size + embedding_flops = batch_size * seq_length * embedding_flops_per_token + if tie_word_embeddings is False: + embedding_flops *= 2 + + non_embedding_flops = mlp_flops + attn_proj_flops + sdpa_flops + non_embedding_coeff, embedding_coeff 
= 1, 1 + if include_backward: + non_embedding_coeff += 2 + embedding_coeff += 2 + + if include_recompute: + non_embedding_coeff += 1 + + total_flops = non_embedding_coeff * non_embedding_flops + embedding_coeff * embedding_flops + + if include_flashattn: + total_flops += sdpa_flops + + return total_flops -def hardware_flops_counter( - seconds: float, # seconds used in given iterations - num_gpus: int = 1, -) -> float: - if "A100" in torch.cuda.get_device_name(): - return 312 * 1e12 * seconds * num_gpus - elif "V100" in torch.cuda.get_device_name(): - return 125 * 1e12 * seconds * num_gpus +def compute_device_flops() -> float: + device_name = torch.cuda.get_device_name() + device_count = torch.cuda.device_count() + if "H100" in device_name or "H800" in device_name: + return 989 * 1e12 * device_count + elif "A100" in device_name or "A800" in device_name: + return 312 * 1e12 * device_count + elif "V100" in device_name: + return 125 * 1e12 * device_count + elif "4090" in device_name: + return 98 * 1e12 * device_count + else: + raise NotImplementedError("Device not supported: {}.".format(device_name)) + def compute_mfu( + model_name_or_path: str, batch_size: int, - seqlen: int, - model_config: dict, - num_iter: int, - seconds: float, - num_gpus: int = 1, + seq_length: int, + finetuning_type: str = "lora", + flash_attn: str = "auto", + deepspeed_stage: int = 0, + disable_gc: bool = False, + liger_kernel: bool = False, ) -> float: + r""" + Computes MFU for given model and hyper-params. + Usage: python cal_mfu.py --model_name_or_path path_to_model --batch_size 1 --seq_length 1024 """ - compute MFU given model configuration, training config and training information - """ - percentage = (num_iter * model_flops_counter(batch_size,seqlen,model_config)) / hardware_flops_counter(seconds, num_gpus) - - print(f"MFU : {percentage* 100:.2f}%") - return percentage - -# User input + args = { + "model_name_or_path": model_name_or_path, + "flash_attn": flash_attn, + "disable_gradient_checkpointing": disable_gc, + "enable_liger_kernel": liger_kernel, + "stage": "pt", + "do_train": True, + "finetuning_type": finetuning_type, + "dataset": "c4_demo", + "cutoff_len": seq_length, + "output_dir": os.path.join("saves", "test_mfu"), + "overwrite_output_dir": True, + "per_device_train_batch_size": batch_size, + "max_steps": 100, + "bf16": True, + } + if deepspeed_stage in [2, 3]: + args["deepspeed"] = "examples/deepspeed/ds_z{}_config.json".format(deepspeed_stage) -### model_name -model_name = "meta-llama/Meta-Llama-3-8B-Instruct" + run_exp(args) + with open(os.path.join("saves", "test_mfu", "all_results.json"), "r", encoding="utf-8") as f: + result = json.load(f) -### training config -batch_size = 8 -seqlen = 1*1024 -num_gpus = 1 - -### training information -num_iter = 225 -seconds = 605 # time used in {num_iter} iterations - -model_config = AutoConfig.from_pretrained(model_name) -if __name__ == "__main__": - fire.Fire( - compute_mfu( - batch_size=batch_size, - seqlen=seqlen, - model_config=model_config, - num_iter=num_iter, - seconds=seconds, - num_gpus=num_gpus - ) + mfu_value = ( + result["train_samples_per_second"] + * compute_model_flops(model_name_or_path, batch_size, seq_length) + / compute_device_flops() ) + print("MFU: {:.2f}%".format(mfu_value * 100)) + + +if __name__ == "__main__": + fire.Fire(compute_mfu) From 4d8b782268d14d436946fd8223de4c4b016b63ec Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Sun, 8 Sep 2024 00:41:45 +0800 Subject: [PATCH 3/3] fix Former-commit-id: 
842c5455b0e7c4b8d307af950304d4ed52bbb3e8 --- scripts/cal_mfu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/cal_mfu.py b/scripts/cal_mfu.py index c4e851d7..0ae4dd42 100644 --- a/scripts/cal_mfu.py +++ b/scripts/cal_mfu.py @@ -138,7 +138,7 @@ def compute_mfu( result = json.load(f) mfu_value = ( - result["train_samples_per_second"] + result["train_steps_per_second"] * compute_model_flops(model_name_or_path, batch_size, seq_length) / compute_device_flops() )
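
As a quick sanity check on the formula the series converges on (this note is not part of the patches themselves): after PATCH 3/3, MFU is computed as optimizer steps per second, times model FLOPs per optimizer step, divided by the aggregate peak FLOPs of the devices. The sketch below is a minimal standalone version of that arithmetic; the throughput, per-step FLOPs, and peak-FLOPs numbers are illustrative assumptions, not values produced by this PR.

# Minimal sketch of the MFU formula from the final patch:
#   MFU = train_steps_per_second * model_flops_per_step / device_peak_flops
# All numbers below are hypothetical placeholders, not measurements from this PR.


def mfu(train_steps_per_second: float, model_flops_per_step: float, device_peak_flops: float) -> float:
    """Model FLOPs utilization expressed as a fraction in [0, 1]."""
    return train_steps_per_second * model_flops_per_step / device_peak_flops


if __name__ == "__main__":
    steps_per_second = 0.40   # hypothetical trainer throughput (optimizer steps / second)
    flops_per_step = 3.9e14   # hypothetical value standing in for compute_model_flops(...)
    peak_flops = 312e12       # single A100 peak, matching compute_device_flops() in the patch
    print("MFU: {:.2f}%".format(100 * mfu(steps_per_second, flops_per_step, peak_flops)))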