From ab1775cd953b4dba82ae2bef5f039b128d3091d7 Mon Sep 17 00:00:00 2001 From: "-.-" Date: Sat, 7 Sep 2024 23:21:35 +0800 Subject: [PATCH 1/3] update cal_mfu.py Former-commit-id: 66ec36522c9bf8dfffc1065202362801875a104d --- scripts/cal_mfu.py | 126 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 scripts/cal_mfu.py diff --git a/scripts/cal_mfu.py b/scripts/cal_mfu.py new file mode 100644 index 00000000..2f408497 --- /dev/null +++ b/scripts/cal_mfu.py @@ -0,0 +1,126 @@ +# coding=utf-8 +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License + +import torch +from transformers import AutoConfig +import fire +def model_flops_counter( + batch_size: int, + seqlen: int, + model_config: dict, + is_backward: bool = True, + is_recompute: bool = False, + is_flashattn: bool = False, +) -> float: + """ + calculate the FLOPs of model per iteration + """ + hidden_size = model_config.hidden_size + num_attention_heads = model_config.num_attention_heads + num_key_value_heads = model_config.num_key_value_heads + vocab_size = model_config.vocab_size + intermediate_size = model_config.intermediate_size + num_hidden_layers = model_config.num_hidden_layers + """ + B: batch_size + S: seqlen + L: num_hidden_layers + H: hidden_size + V: vocab_size + I: intermediate_size + """ + ### MLP calculation + per_mlp_calculation = 2 * hidden_size * intermediate_size + mlp_calculation_per_layer = per_mlp_calculation * 3 + mlp_calculation = batch_size * seqlen * mlp_calculation_per_layer * num_hidden_layers + + ### Attention calculation + Q_calculation = 2 * hidden_size * hidden_size + O_calculation = 2 * hidden_size * hidden_size + K_calculation = 2 * hidden_size * hidden_size * num_key_value_heads / num_attention_heads + V_calculation = 2 * hidden_size * hidden_size * num_key_value_heads / num_attention_heads + + QKVO_calculation = Q_calculation + O_calculation + K_calculation + V_calculation # 8H^2 / coe + self_attn_calculation = seqlen * hidden_size * 2 * 2 # (4 * S * H) + attention_calculation = batch_size * seqlen * num_hidden_layers * (QKVO_calculation + self_attn_calculation) # BSL(8H^2/coe + 4S * H) + + #Embedding and LMhead calculation + embedding_calculation = hidden_size * vocab_size + lmhead_calculation = hidden_size * vocab_size + IO_calculation = 3 * batch_size * seqlen * (embedding_calculation + lmhead_calculation) # 2 *(1+2)BSHV + E = attention_calculation + mlp_calculation + coefficient = 3 + fix_term = 0 + if(is_recompute): + coefficient = 4 + if(is_flashattn): + fix_term = batch_size *seqlen * self_attn_calculation + + total_calculation = coefficient * E + IO_calculation + fix_term + + return total_calculation + + +def hardware_flops_counter( + seconds: float, # seconds used in given iterations + num_gpus: int = 1, +) -> float: + if "A100" in torch.cuda.get_device_name(): + return 312 * 1e12 * seconds * num_gpus + elif "V100" in torch.cuda.get_device_name(): + return 125 * 1e12 * seconds * num_gpus + +def compute_mfu( + batch_size: 
int, + seqlen: int, + model_config: dict, + num_iter: int, + seconds: float, + num_gpus: int = 1, +) -> float: + """ + compute MFU given model configuration, training config and training information + """ + percentage = (num_iter * model_flops_counter(batch_size,seqlen,model_config)) / hardware_flops_counter(seconds, num_gpus) + + print(f"MFU : {percentage* 100:.2f}%") + return percentage + +# User input + +### model_name +model_name = "meta-llama/Meta-Llama-3-8B-Instruct" + +### training config +batch_size = 8 +seqlen = 1*1024 +num_gpus = 1 + +### training information +num_iter = 225 +seconds = 605 # time used in {num_iter} iterations + +model_config = AutoConfig.from_pretrained(model_name) +if __name__ == "__main__": + fire.Fire( + compute_mfu( + batch_size=batch_size, + seqlen=seqlen, + model_config=model_config, + num_iter=num_iter, + seconds=seconds, + num_gpus=num_gpus + ) + ) From 4f28e0e5d2fc727665b044d3bd7ea831a460826c Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Sun, 8 Sep 2024 00:39:48 +0800 Subject: [PATCH 2/3] Update cal_mfu.py Former-commit-id: e71133cab14eed2c071cfd4ce044ae9905447427 --- scripts/cal_mfu.py | 217 +++++++++++++++++++++++++-------------------- 1 file changed, 120 insertions(+), 97 deletions(-) diff --git a/scripts/cal_mfu.py b/scripts/cal_mfu.py index 2f408497..c4e851d7 100644 --- a/scripts/cal_mfu.py +++ b/scripts/cal_mfu.py @@ -11,116 +11,139 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License +# limitations under the License. +import json +import os + +import fire import torch from transformers import AutoConfig -import fire -def model_flops_counter( + +from llamafactory.train.tuner import run_exp + + +BASE = 2 # gemm (add + mul) + + +def compute_model_flops( + model_name_or_path: str, batch_size: int, - seqlen: int, - model_config: dict, - is_backward: bool = True, - is_recompute: bool = False, - is_flashattn: bool = False, -) -> float: + seq_length: int, + include_backward: bool = True, + include_recompute: bool = False, + include_flashattn: bool = False, +) -> int: + r""" + Calculates the FLOPs of model per forward/backward pass. 
""" - calculate the FLOPs of model per iteration - """ - hidden_size = model_config.hidden_size - num_attention_heads = model_config.num_attention_heads - num_key_value_heads = model_config.num_key_value_heads - vocab_size = model_config.vocab_size - intermediate_size = model_config.intermediate_size - num_hidden_layers = model_config.num_hidden_layers - """ - B: batch_size - S: seqlen - L: num_hidden_layers - H: hidden_size - V: vocab_size - I: intermediate_size - """ - ### MLP calculation - per_mlp_calculation = 2 * hidden_size * intermediate_size - mlp_calculation_per_layer = per_mlp_calculation * 3 - mlp_calculation = batch_size * seqlen * mlp_calculation_per_layer * num_hidden_layers + config = AutoConfig.from_pretrained(model_name_or_path) + hidden_size = getattr(config, "hidden_size", None) + vocab_size = getattr(config, "vocab_size", None) + intermediate_size = getattr(config, "intermediate_size", None) + num_attention_heads = getattr(config, "num_attention_heads", None) + num_key_value_heads = getattr(config, "num_key_value_heads", None) + num_hidden_layers = getattr(config, "num_hidden_layers", None) + tie_word_embeddings = getattr(config, "tie_word_embeddings", False) - ### Attention calculation - Q_calculation = 2 * hidden_size * hidden_size - O_calculation = 2 * hidden_size * hidden_size - K_calculation = 2 * hidden_size * hidden_size * num_key_value_heads / num_attention_heads - V_calculation = 2 * hidden_size * hidden_size * num_key_value_heads / num_attention_heads - - QKVO_calculation = Q_calculation + O_calculation + K_calculation + V_calculation # 8H^2 / coe - self_attn_calculation = seqlen * hidden_size * 2 * 2 # (4 * S * H) - attention_calculation = batch_size * seqlen * num_hidden_layers * (QKVO_calculation + self_attn_calculation) # BSL(8H^2/coe + 4S * H) - - #Embedding and LMhead calculation - embedding_calculation = hidden_size * vocab_size - lmhead_calculation = hidden_size * vocab_size - IO_calculation = 3 * batch_size * seqlen * (embedding_calculation + lmhead_calculation) # 2 *(1+2)BSHV - E = attention_calculation + mlp_calculation - coefficient = 3 - fix_term = 0 - if(is_recompute): - coefficient = 4 - if(is_flashattn): - fix_term = batch_size *seqlen * self_attn_calculation - - total_calculation = coefficient * E + IO_calculation + fix_term - - return total_calculation + # mlp module + mlp_flops_per_token = 3 * BASE * hidden_size * intermediate_size # up, gate, down + mlp_flops = batch_size * seq_length * num_hidden_layers * mlp_flops_per_token + + # attn projector module + q_flops_per_token = BASE * hidden_size * hidden_size + o_flops_per_token = BASE * hidden_size * hidden_size + k_flops_per_token = BASE * hidden_size * hidden_size * num_key_value_heads // num_attention_heads + v_flops_per_token = BASE * hidden_size * hidden_size * num_key_value_heads // num_attention_heads + attn_proj_flops_per_token = q_flops_per_token + o_flops_per_token + k_flops_per_token + v_flops_per_token + attn_proj_flops = batch_size * seq_length * num_hidden_layers * attn_proj_flops_per_token + + # attn sdpa module + sdpa_flops_per_layer = 2 * BASE * hidden_size * seq_length * seq_length # (q * k^T) * v + sdpa_flops = batch_size * num_hidden_layers * sdpa_flops_per_layer + + # embedding module + embedding_flops_per_token = hidden_size * vocab_size + embedding_flops = batch_size * seq_length * embedding_flops_per_token + if tie_word_embeddings is False: + embedding_flops *= 2 + + non_embedding_flops = mlp_flops + attn_proj_flops + sdpa_flops + non_embedding_coeff, embedding_coeff 
= 1, 1 + if include_backward: + non_embedding_coeff += 2 + embedding_coeff += 2 + + if include_recompute: + non_embedding_coeff += 1 + + total_flops = non_embedding_coeff * non_embedding_flops + embedding_coeff * embedding_flops + + if include_flashattn: + total_flops += sdpa_flops + + return total_flops -def hardware_flops_counter( - seconds: float, # seconds used in given iterations - num_gpus: int = 1, -) -> float: - if "A100" in torch.cuda.get_device_name(): - return 312 * 1e12 * seconds * num_gpus - elif "V100" in torch.cuda.get_device_name(): - return 125 * 1e12 * seconds * num_gpus +def compute_device_flops() -> float: + device_name = torch.cuda.get_device_name() + device_count = torch.cuda.device_count() + if "H100" in device_name or "H800" in device_name: + return 989 * 1e12 * device_count + elif "A100" in device_name or "A800" in device_name: + return 312 * 1e12 * device_count + elif "V100" in device_name: + return 125 * 1e12 * device_count + elif "4090" in device_name: + return 98 * 1e12 * device_count + else: + raise NotImplementedError("Device not supported: {}.".format(device_name)) + def compute_mfu( + model_name_or_path: str, batch_size: int, - seqlen: int, - model_config: dict, - num_iter: int, - seconds: float, - num_gpus: int = 1, + seq_length: int, + finetuning_type: str = "lora", + flash_attn: str = "auto", + deepspeed_stage: int = 0, + disable_gc: bool = False, + liger_kernel: bool = False, ) -> float: + r""" + Computes MFU for given model and hyper-params. + Usage: python cal_mfu.py --model_name_or_path path_to_model --batch_size 1 --seq_length 1024 """ - compute MFU given model configuration, training config and training information - """ - percentage = (num_iter * model_flops_counter(batch_size,seqlen,model_config)) / hardware_flops_counter(seconds, num_gpus) - - print(f"MFU : {percentage* 100:.2f}%") - return percentage - -# User input + args = { + "model_name_or_path": model_name_or_path, + "flash_attn": flash_attn, + "disable_gradient_checkpointing": disable_gc, + "enable_liger_kernel": liger_kernel, + "stage": "pt", + "do_train": True, + "finetuning_type": finetuning_type, + "dataset": "c4_demo", + "cutoff_len": seq_length, + "output_dir": os.path.join("saves", "test_mfu"), + "overwrite_output_dir": True, + "per_device_train_batch_size": batch_size, + "max_steps": 100, + "bf16": True, + } + if deepspeed_stage in [2, 3]: + args["deepspeed"] = "examples/deepspeed/ds_z{}_config.json".format(deepspeed_stage) -### model_name -model_name = "meta-llama/Meta-Llama-3-8B-Instruct" + run_exp(args) + with open(os.path.join("saves", "test_mfu", "all_results.json"), "r", encoding="utf-8") as f: + result = json.load(f) -### training config -batch_size = 8 -seqlen = 1*1024 -num_gpus = 1 - -### training information -num_iter = 225 -seconds = 605 # time used in {num_iter} iterations - -model_config = AutoConfig.from_pretrained(model_name) -if __name__ == "__main__": - fire.Fire( - compute_mfu( - batch_size=batch_size, - seqlen=seqlen, - model_config=model_config, - num_iter=num_iter, - seconds=seconds, - num_gpus=num_gpus - ) + mfu_value = ( + result["train_samples_per_second"] + * compute_model_flops(model_name_or_path, batch_size, seq_length) + / compute_device_flops() ) + print("MFU: {:.2f}%".format(mfu_value * 100)) + + +if __name__ == "__main__": + fire.Fire(compute_mfu) From 4d8b782268d14d436946fd8223de4c4b016b63ec Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Sun, 8 Sep 2024 00:41:45 +0800 Subject: [PATCH 3/3] fix Former-commit-id: 
842c5455b0e7c4b8d307af950304d4ed52bbb3e8 --- scripts/cal_mfu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/cal_mfu.py b/scripts/cal_mfu.py index c4e851d7..0ae4dd42 100644 --- a/scripts/cal_mfu.py +++ b/scripts/cal_mfu.py @@ -138,7 +138,7 @@ def compute_mfu( result = json.load(f) mfu_value = ( - result["train_samples_per_second"] + result["train_steps_per_second"] * compute_model_flops(model_name_or_path, batch_size, seq_length) / compute_device_flops() )
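
As a quick sanity check on the formula the series converges on (this note is not part of the patches themselves): after PATCH 3/3, MFU is computed as optimizer steps per second, times model FLOPs per optimizer step, divided by the aggregate peak FLOPs of the devices. The sketch below is a minimal standalone version of that arithmetic; the throughput, per-step FLOPs, and peak-FLOPs numbers are illustrative assumptions, not values produced by this PR.

# Minimal sketch of the MFU formula from the final patch:
#   MFU = train_steps_per_second * model_flops_per_step / device_peak_flops
# All numbers below are hypothetical placeholders, not measurements from this PR.


def mfu(train_steps_per_second: float, model_flops_per_step: float, device_peak_flops: float) -> float:
    """Model FLOPs utilization expressed as a fraction in [0, 1]."""
    return train_steps_per_second * model_flops_per_step / device_peak_flops


if __name__ == "__main__":
    steps_per_second = 0.40   # hypothetical trainer throughput (optimizer steps / second)
    flops_per_step = 3.9e14   # hypothetical value standing in for compute_model_flops(...)
    peak_flops = 312e12       # single A100 peak, matching compute_device_flops() in the patch
    print("MFU: {:.2f}%".format(100 * mfu(steps_per_second, flops_per_step, peak_flops)))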