import torch
from typing import List, Optional

from transformers.modeling_utils import PreTrainedModel
from transformers.generation.utils import LogitsProcessorList
from transformers.generation.logits_process import LogitsProcessor

from llmtuner.extras.constants import LAYERNORM_NAMES


class AverageMeter:
    r"""
    Computes and stores the average and current value.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


# Avoids runtime error in model.generate(do_sample=True).
class InvalidScoreLogitsProcessor(LogitsProcessor):

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        if torch.isnan(scores).any() or torch.isinf(scores).any():
            scores.zero_()
            scores[..., 0] = 1.0
        return scores


def get_logits_processor() -> LogitsProcessorList:
    logits_processor = LogitsProcessorList()
    logits_processor.append(InvalidScoreLogitsProcessor())
    return logits_processor


def print_trainable_params(model: torch.nn.Module) -> None:
    trainable_params, all_param = 0, 0
    for param in model.parameters():
        num_params = param.numel()
        # if using DS Zero 3 and the weights are initialized empty
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel
        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params
    print("trainable params: {:d} || all params: {:d} || trainable%: {:.4f}".format(
        trainable_params, all_param, 100 * trainable_params / all_param))


# Includes: (1) cast the layernorm in fp32 (2) make output embedding layer require grads (3) upcast the lm_head to fp32
# Inspired by: https://github.com/huggingface/peft/blob/c0209c35abbf88c63aa267800d98a8e212ed0a42/src/peft/utils/other.py#L35
def prepare_model_for_training(
    model: PreTrainedModel,
    finetuning_type: str,
    output_layer_name: Optional[str] = "lm_head",
    use_gradient_checkpointing: Optional[bool] = True,
    layer_norm_names: Optional[List[str]] = LAYERNORM_NAMES
) -> PreTrainedModel:
    for name, param in model.named_parameters():
        if param.ndim == 1 and any(layer_norm_name in name for layer_norm_name in layer_norm_names):
            param.data = param.data.to(torch.float32)

    if use_gradient_checkpointing:
        if hasattr(model, "enable_input_require_grads"):
            model.enable_input_require_grads()
        else:
            def make_inputs_require_grad(module, input, output):
                output.requires_grad_(True)
            model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

        model.gradient_checkpointing_enable()
        model.config.use_cache = False  # turn off when gradient checkpointing is enabled

    if finetuning_type != "full" and hasattr(model, output_layer_name):
        output_layer: torch.nn.Linear = getattr(model, output_layer_name)
        input_dtype = output_layer.weight.dtype

        class CastOutputToFloat(torch.nn.Sequential):

            def forward(self, x: torch.Tensor) -> torch.Tensor:
                return super().forward(x.to(input_dtype)).to(torch.float32)

        new_output_layer = CastOutputToFloat(output_layer)
        # adapt to LLaMA-2's pretraining_tp (actually LLaMA models can automatically do casting but BLOOM models cannot)
        # (https://github.com/huggingface/transformers/blob/v4.31.0/src/transformers/models/llama/modeling_llama.py#L819)
        setattr(new_output_layer, "weight", output_layer.weight)
        setattr(model, output_layer_name, new_output_layer)

    return model


def torch_gc() -> None:
    r"""
    Collects GPU memory.
    """
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
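

# --- Usage sketch (illustrative, not part of the upstream module) ---
# A minimal example of how these helpers fit together before generation.
# The "gpt2" checkpoint, the prompt, and the generation settings below are
# placeholder assumptions chosen so the sketch runs without gated weights;
# any causal LM exposing an "lm_head" output layer is handled the same way.
if __name__ == "__main__":
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")

    # Cast layernorms to fp32, enable gradient checkpointing, wrap the lm_head.
    model = prepare_model_for_training(model, finetuning_type="lora")
    print_trainable_params(model)

    # Sample with the NaN/Inf guard installed; re-enable the KV cache since
    # prepare_model_for_training disabled it for gradient checkpointing.
    model.eval()
    model.config.use_cache = True
    input_ids = tokenizer("Hello, world!", return_tensors="pt").input_ids
    output_ids = model.generate(
        input_ids,
        do_sample=True,
        max_new_tokens=16,
        logits_processor=get_logits_processor(),
        pad_token_id=tokenizer.eos_token_id,
    )
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

    torch_gc()  # release cached GPU memory, if any was used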