Mirror of https://github.com/hiyouga/LLaMA-Factory.git (synced 2025-08-02 03:32:50 +08:00)

Update cal_mfu.py

Former-commit-id: e71133cab14eed2c071cfd4ce044ae9905447427
Parent: ab1775cd95
Commit: 4f28e0e5d2
@@ -11,116 +11,139 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-# limitations under the License
+# limitations under the License.
 
+import json
+import os
+
+import fire
 import torch
 from transformers import AutoConfig
-import fire
 
-def model_flops_counter(
+from llamafactory.train.tuner import run_exp
+
+
+BASE = 2  # gemm (add + mul)
+
+
+def compute_model_flops(
+    model_name_or_path: str,
     batch_size: int,
-    seqlen: int,
-    model_config: dict,
-    is_backward: bool = True,
-    is_recompute: bool = False,
-    is_flashattn: bool = False,
-) -> float:
-    """
-    calculate the FLOPs of model per iteration
-    """
-    hidden_size = model_config.hidden_size
-    num_attention_heads = model_config.num_attention_heads
-    num_key_value_heads = model_config.num_key_value_heads
-    vocab_size = model_config.vocab_size
-    intermediate_size = model_config.intermediate_size
-    num_hidden_layers = model_config.num_hidden_layers
-    """
-    B: batch_size
-    S: seqlen
-    L: num_hidden_layers
-    H: hidden_size
-    V: vocab_size
-    I: intermediate_size
-    """
-    ### MLP calculation
-    per_mlp_calculation = 2 * hidden_size * intermediate_size
-    mlp_calculation_per_layer = per_mlp_calculation * 3
-    mlp_calculation = batch_size * seqlen * mlp_calculation_per_layer * num_hidden_layers
-
-    ### Attention calculation
-    Q_calculation = 2 * hidden_size * hidden_size
-    O_calculation = 2 * hidden_size * hidden_size
-    K_calculation = 2 * hidden_size * hidden_size * num_key_value_heads / num_attention_heads
-    V_calculation = 2 * hidden_size * hidden_size * num_key_value_heads / num_attention_heads
-
-    QKVO_calculation = Q_calculation + O_calculation + K_calculation + V_calculation # 8H^2 / coe
-    self_attn_calculation = seqlen * hidden_size * 2 * 2 # (4 * S * H)
-    attention_calculation = batch_size * seqlen * num_hidden_layers * (QKVO_calculation + self_attn_calculation) # BSL(8H^2/coe + 4S * H)
-
-    #Embedding and LMhead calculation
-    embedding_calculation = hidden_size * vocab_size
-    lmhead_calculation = hidden_size * vocab_size
-    IO_calculation = 3 * batch_size * seqlen * (embedding_calculation + lmhead_calculation) # 2 *(1+2)BSHV
-    E = attention_calculation + mlp_calculation
-    coefficient = 3
-    fix_term = 0
-    if(is_recompute):
-        coefficient = 4
-    if(is_flashattn):
-        fix_term = batch_size *seqlen * self_attn_calculation
-
-    total_calculation = coefficient * E + IO_calculation + fix_term
-
-    return total_calculation
+    seq_length: int,
+    include_backward: bool = True,
+    include_recompute: bool = False,
+    include_flashattn: bool = False,
+) -> int:
+    r"""
+    Calculates the FLOPs of model per forward/backward pass.
+    """
+    config = AutoConfig.from_pretrained(model_name_or_path)
+    hidden_size = getattr(config, "hidden_size", None)
+    vocab_size = getattr(config, "vocab_size", None)
+    intermediate_size = getattr(config, "intermediate_size", None)
+    num_attention_heads = getattr(config, "num_attention_heads", None)
+    num_key_value_heads = getattr(config, "num_key_value_heads", None)
+    num_hidden_layers = getattr(config, "num_hidden_layers", None)
+    tie_word_embeddings = getattr(config, "tie_word_embeddings", False)
+
+    # mlp module
+    mlp_flops_per_token = 3 * BASE * hidden_size * intermediate_size  # up, gate, down
+    mlp_flops = batch_size * seq_length * num_hidden_layers * mlp_flops_per_token
+
+    # attn projector module
+    q_flops_per_token = BASE * hidden_size * hidden_size
+    o_flops_per_token = BASE * hidden_size * hidden_size
+    k_flops_per_token = BASE * hidden_size * hidden_size * num_key_value_heads // num_attention_heads
+    v_flops_per_token = BASE * hidden_size * hidden_size * num_key_value_heads // num_attention_heads
+    attn_proj_flops_per_token = q_flops_per_token + o_flops_per_token + k_flops_per_token + v_flops_per_token
+    attn_proj_flops = batch_size * seq_length * num_hidden_layers * attn_proj_flops_per_token
+
+    # attn sdpa module
+    sdpa_flops_per_layer = 2 * BASE * hidden_size * seq_length * seq_length  # (q * k^T) * v
+    sdpa_flops = batch_size * num_hidden_layers * sdpa_flops_per_layer
+
+    # embedding module
+    embedding_flops_per_token = hidden_size * vocab_size
+    embedding_flops = batch_size * seq_length * embedding_flops_per_token
+    if tie_word_embeddings is False:
+        embedding_flops *= 2
+
+    non_embedding_flops = mlp_flops + attn_proj_flops + sdpa_flops
+    non_embedding_coeff, embedding_coeff = 1, 1
+    if include_backward:
+        non_embedding_coeff += 2
+        embedding_coeff += 2
+
+    if include_recompute:
+        non_embedding_coeff += 1
+
+    total_flops = non_embedding_coeff * non_embedding_flops + embedding_coeff * embedding_flops
+
+    if include_flashattn:
+        total_flops += sdpa_flops
+
+    return total_flops
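The per-token accounting in compute_model_flops above can be sanity-checked by hand. The following standalone sketch plugs in assumed, roughly Llama-3-8B-like shapes (hidden size 4096, intermediate size 14336, 32 layers, 32 query heads, 8 KV heads) and counts only the non-embedding forward FLOPs; the embedding/LM-head terms and the backward/recompute multipliers used by the function above are left out for brevity:

    # Standalone sketch; model shapes are assumed (roughly Llama-3-8B-like),
    # not read from a real config, and only forward, non-embedding FLOPs are counted.
    BASE = 2  # one multiply + one add per GEMM element
    hidden_size, intermediate_size = 4096, 14336
    num_hidden_layers, num_attention_heads, num_key_value_heads = 32, 32, 8
    batch_size, seq_length = 1, 1024

    mlp = 3 * BASE * hidden_size * intermediate_size  # up, gate, down projections
    qo = 2 * BASE * hidden_size * hidden_size  # q and o projections
    kv = 2 * BASE * hidden_size * hidden_size * num_key_value_heads // num_attention_heads
    per_token_per_layer = mlp + qo + kv
    sdpa_per_layer = 2 * BASE * hidden_size * seq_length * seq_length  # (q @ k^T) @ v

    forward_flops = (
        batch_size * seq_length * num_hidden_layers * per_token_per_layer
        + batch_size * num_hidden_layers * sdpa_per_layer
    )
    print(f"forward FLOPs per batch: {forward_flops:.3e}")  # roughly 1.5e13 for these shapes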
-def hardware_flops_counter(
-    seconds: float, # seconds used in given iterations
-    num_gpus: int = 1,
-) -> float:
-    if "A100" in torch.cuda.get_device_name():
-        return 312 * 1e12 * seconds * num_gpus
-    elif "V100" in torch.cuda.get_device_name():
-        return 125 * 1e12 * seconds * num_gpus
+def compute_device_flops() -> float:
+    device_name = torch.cuda.get_device_name()
+    device_count = torch.cuda.device_count()
+    if "H100" in device_name or "H800" in device_name:
+        return 989 * 1e12 * device_count
+    elif "A100" in device_name or "A800" in device_name:
+        return 312 * 1e12 * device_count
+    elif "V100" in device_name:
+        return 125 * 1e12 * device_count
+    elif "4090" in device_name:
+        return 98 * 1e12 * device_count
+    else:
+        raise NotImplementedError("Device not supported: {}.".format(device_name))
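compute_device_flops returns the aggregate peak throughput of all visible GPUs, so the denominator of the MFU ratio grows with the node size. A quick illustration using the per-device constant from the lookup above (the 8-GPU node is an assumed example, not probed via torch.cuda):

    # Aggregate peak for an assumed 8x A100 node, using the same constant as above.
    peak_per_a100 = 312 * 1e12  # dense BF16/FP16 tensor-core peak, FLOP/s
    num_devices = 8  # assumed example
    print(f"{peak_per_a100 * num_devices:.3e} FLOP/s")  # 2.496e+15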
 def compute_mfu(
+    model_name_or_path: str,
     batch_size: int,
-    seqlen: int,
-    model_config: dict,
-    num_iter: int,
-    seconds: float,
-    num_gpus: int = 1,
+    seq_length: int,
+    finetuning_type: str = "lora",
+    flash_attn: str = "auto",
+    deepspeed_stage: int = 0,
+    disable_gc: bool = False,
+    liger_kernel: bool = False,
 ) -> float:
-    """
-    compute MFU given model configuration, training config and training information
-    """
-    percentage = (num_iter * model_flops_counter(batch_size,seqlen,model_config)) / hardware_flops_counter(seconds, num_gpus)
-
-    print(f"MFU : {percentage* 100:.2f}%")
-    return percentage
-
-# User input
-
-### model_name
-model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
-
-### training config
-batch_size = 8
-seqlen = 1*1024
-num_gpus = 1
-
-### training information
-num_iter = 225
-seconds = 605 # time used in {num_iter} iterations
-
-model_config = AutoConfig.from_pretrained(model_name)
-
-if __name__ == "__main__":
-    fire.Fire(
-        compute_mfu(
-            batch_size=batch_size,
-            seqlen=seqlen,
-            model_config=model_config,
-            num_iter=num_iter,
-            seconds=seconds,
-            num_gpus=num_gpus
-        )
-    )
+    r"""
+    Computes MFU for given model and hyper-params.
+    Usage: python cal_mfu.py --model_name_or_path path_to_model --batch_size 1 --seq_length 1024
+    """
+    args = {
+        "model_name_or_path": model_name_or_path,
+        "flash_attn": flash_attn,
+        "disable_gradient_checkpointing": disable_gc,
+        "enable_liger_kernel": liger_kernel,
+        "stage": "pt",
+        "do_train": True,
+        "finetuning_type": finetuning_type,
+        "dataset": "c4_demo",
+        "cutoff_len": seq_length,
+        "output_dir": os.path.join("saves", "test_mfu"),
+        "overwrite_output_dir": True,
+        "per_device_train_batch_size": batch_size,
+        "max_steps": 100,
+        "bf16": True,
+    }
+    if deepspeed_stage in [2, 3]:
+        args["deepspeed"] = "examples/deepspeed/ds_z{}_config.json".format(deepspeed_stage)
+
+    run_exp(args)
+    with open(os.path.join("saves", "test_mfu", "all_results.json"), "r", encoding="utf-8") as f:
+        result = json.load(f)
+
+    mfu_value = (
+        result["train_samples_per_second"]
+        * compute_model_flops(model_name_or_path, batch_size, seq_length)
+        / compute_device_flops()
+    )
+    print("MFU: {:.2f}%".format(mfu_value * 100))
+
+
+if __name__ == "__main__":
+    fire.Fire(compute_mfu)
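Compared with the old script, which asked the user to supply an iteration count and wall-clock time by hand, the updated compute_mfu launches a short pretraining run via run_exp and reads the measured train_samples_per_second from all_results.json; MFU is then that measured throughput times the FLOPs estimate from compute_model_flops, divided by compute_device_flops(). A minimal sketch of the final ratio, with hypothetical numbers in place of the measured ones:

    # Hypothetical numbers for illustration only; in the script,
    # train_samples_per_second comes from saves/test_mfu/all_results.json.
    train_samples_per_second = 2.0  # assumed measured throughput
    model_flops_estimate = 1.0e14  # assumed output of compute_model_flops(...)
    device_peak_flops = 312 * 1e12  # a single A100, per the lookup above

    mfu = train_samples_per_second * model_flops_estimate / device_peak_flops
    print("MFU: {:.2f}%".format(mfu * 100))  # 64.10% with these assumed numbers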