From 61bc5bd0dd695073fcbd03376ee3989c7c3d4719 Mon Sep 17 00:00:00 2001
From: liuzc
Date: Tue, 16 Jan 2024 15:16:20 +0800
Subject: [PATCH] support resize embed for zero3

Former-commit-id: a5f6a7f4fb057511428011c37422c535f31b79d2
---
 src/llmtuner/model/patcher.py | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index 381436d2..60e4b4bc 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -5,6 +5,7 @@ import random
 from types import MethodType
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
 from datasets import load_dataset
+from contextlib import nullcontext
 from transformers import BitsAndBytesConfig, GPTQConfig, PreTrainedModel, PreTrainedTokenizerBase
 from transformers.integrations import is_deepspeed_zero3_enabled
 
@@ -28,7 +29,7 @@ SUPPORTED_CLASS_FOR_S2ATTN = []  # TODO: add llama
 def _noisy_mean_initialization(embed_weight: torch.Tensor, num_new_tokens: int):
     embedding_dim = embed_weight.size(1)
     avg_weight = embed_weight[:-num_new_tokens].mean(dim=0, keepdim=True)
-    noise_weight = torch.empty_like(avg_weight[-num_new_tokens:])
+    noise_weight = torch.empty_like(embed_weight[-num_new_tokens:])
     noise_weight.normal_(mean=0, std=(1.0 / math.sqrt(embedding_dim)))
     embed_weight[-num_new_tokens:] = avg_weight + noise_weight
 
@@ -37,17 +38,31 @@ def _resize_embedding_layer(model: "PreTrainedModel", tokenizer: "PreTrainedToke
     r"""
     Resize token embeddings.
     """
-    current_embedding_size = model.get_input_embeddings().weight.size(0)
+    if is_deepspeed_zero3_enabled():
+        import deepspeed
+        with deepspeed.zero.GatheredParameters(model.get_input_embeddings().weight, modifier_rank=None):
+            current_embedding_size = model.get_input_embeddings().weight.size(0)
+    else:
+        current_embedding_size = model.get_input_embeddings().weight.size(0)
     if len(tokenizer) > current_embedding_size:
         if not isinstance(model.get_output_embeddings(), torch.nn.Linear):
             logger.warning("Current model does not support resizing token embeddings.")
             return
 
         model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=64)
-        new_embedding_size = model.get_input_embeddings().weight.size(0)
-        num_new_tokens = new_embedding_size - current_embedding_size
-        _noisy_mean_initialization(model.get_input_embeddings().weight.data, num_new_tokens)
-        _noisy_mean_initialization(model.get_output_embeddings().weight.data, num_new_tokens)
+        if is_deepspeed_zero3_enabled():
+            import deepspeed
+            params = [model.get_input_embeddings().weight]
+            if model.get_output_embeddings() is not None and not model.config.tie_word_embeddings:
+                params.append(model.get_output_embeddings().weight)
+            context = deepspeed.zero.GatheredParameters(params, modifier_rank=0)
+        else:
+            context = nullcontext()
+        with context:
+            new_embedding_size = model.get_input_embeddings().weight.size(0)
+            num_new_tokens = new_embedding_size - current_embedding_size
+            _noisy_mean_initialization(model.get_input_embeddings().weight.data, num_new_tokens)
+            _noisy_mean_initialization(model.get_output_embeddings().weight.data, num_new_tokens)
 
         logger.info("Resized token embeddings from {} to {}.".format(current_embedding_size, new_embedding_size))
 
@@ -264,9 +279,6 @@ def patch_model(
         setattr(model, "_keys_to_ignore_on_save", ["lm_head.weight"])
 
     if model_args.resize_vocab:
-        if is_deepspeed_zero3_enabled():
-            raise ValueError("DeepSpeed ZeRO-3 is incompatible with vocab resizing.")
-
         _resize_embedding_layer(model, tokenizer)
 
     if is_trainable:
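
Note (not part of the patch): the key mechanism above is deepspeed.zero.GatheredParameters. Under ZeRO-3 every parameter is partitioned across ranks, so model.get_input_embeddings().weight on any single rank is only a placeholder shard; both reading its size and writing the new rows must happen while the full tensor is temporarily gathered. The sketch below isolates that pattern under those assumptions; the model argument is a placeholder for any Hugging Face PreTrainedModel, gather_and_patch_embeddings is a hypothetical helper, and the mean-fill is a simplified stand-in for the patch's _noisy_mean_initialization.

    # Standalone sketch of the ZeRO-3 gather pattern used in _resize_embedding_layer.
    # Assumes DeepSpeed is installed and the model was initialized under ZeRO-3.
    import deepspeed


    def gather_and_patch_embeddings(model, num_new_tokens: int) -> int:
        weight = model.get_input_embeddings().weight

        # Read-only gather: modifier_rank=None materializes the full tensor on every
        # rank; any in-place change made here is discarded on exit.
        with deepspeed.zero.GatheredParameters(weight, modifier_rank=None):
            vocab_size = weight.size(0)

        # Read-write gather: modifier_rank=0 means rank 0's edits to the gathered
        # tensor are kept and re-partitioned across ranks when the context exits.
        with deepspeed.zero.GatheredParameters([weight], modifier_rank=0):
            avg = weight.data[:-num_new_tokens].mean(dim=0, keepdim=True)
            weight.data[-num_new_tokens:] = avg  # simplified: the patch also adds Gaussian noise

        return vocab_size

This gather-then-modify pattern is also why the old guard that raised ValueError for ZeRO-3 with resize_vocab can be dropped: the resize and re-initialization become valid under parameter sharding.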