mirror of https://github.com/hiyouga/LLaMA-Factory.git (synced 2025-08-04 20:52:59 +08:00)
parent 92464eaf30
commit 3787d13816
@@ -1,7 +1,6 @@
 from typing import TYPE_CHECKING, Optional, Tuple

 from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
-from transformers.integrations import is_deepspeed_zero3_enabled
 from trl import AutoModelForCausalLMWithValueHead

 from ..extras.logging import get_logger
@@ -77,13 +76,7 @@ def load_model_and_tokenizer(
             logger.warning("Unsloth does not support loading adapters.")

     if model is None:
-        model = AutoModelForCausalLM.from_pretrained(
-            model_args.model_name_or_path,
-            config=config,
-            torch_dtype=model_args.compute_dtype,
-            low_cpu_mem_usage=(not is_deepspeed_zero3_enabled()),
-            **config_kwargs,
-        )
+        model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, config=config, **config_kwargs)

     patch_model(model, tokenizer, model_args, is_trainable)
     register_autoclass(config, model, tokenizer)
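The net effect of this hunk is that the loader no longer spells out `torch_dtype` and `low_cpu_mem_usage` at the call site; it relies on `patch_config` (last hunk below) having placed them into `config_kwargs` beforehand. A minimal sketch of the new data flow, using a tiny public checkpoint and placeholder values instead of the repository's `model_args` and helpers:

```python
import torch
from transformers import AutoConfig, AutoModelForCausalLM
from transformers.integrations import is_deepspeed_zero3_enabled

# Placeholder checkpoint, used only so the sketch is runnable.
model_name = "sshleifer/tiny-gpt2"
config = AutoConfig.from_pretrained(model_name)

# What patch_config now contributes (stand-ins for model_args.compute_dtype
# and get_current_device()).
config_kwargs = {"torch_dtype": torch.float16}
if not is_deepspeed_zero3_enabled():
    config_kwargs["device_map"] = {"": "cpu"}
    config_kwargs["low_cpu_mem_usage"] = True

# The simplified call site: the extra arguments arrive through **config_kwargs.
model = AutoModelForCausalLM.from_pretrained(model_name, config=config, **config_kwargs)
print(model.dtype)
```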
@@ -163,7 +163,6 @@ def _configure_quantization(
         if is_deepspeed_zero3_enabled():
             raise ValueError("DeepSpeed ZeRO-3 is incompatible with quantization.")

-        config_kwargs["device_map"] = {"": get_current_device()}
         quantization_config: Dict[str, Any] = getattr(config, "quantization_config", None)
         if quantization_config.get("quant_method", None) == "gptq" and quantization_config.get("bits", -1) == 4:
             quantization_config["use_exllama"] = False  # disable exllama
@@ -214,7 +213,6 @@ def _configure_quantization(
                 bnb_4bit_quant_type=model_args.quantization_type,
             )

-        config_kwargs["device_map"] = {"": get_current_device()}
         logger.info("Quantizing model to {} bit.".format(model_args.quantization_bit))

@@ -284,6 +282,11 @@ def patch_config(

     _configure_quantization(config, tokenizer, model_args, config_kwargs)

+    config_kwargs["torch_dtype"] = model_args.compute_dtype
+    if not is_deepspeed_zero3_enabled():
+        config_kwargs["device_map"] = {"": get_current_device()}
+        config_kwargs["low_cpu_mem_usage"] = True
+

 def patch_model(
     model: "PreTrainedModel", tokenizer: "PreTrainedTokenizer", model_args: "ModelArguments", is_trainable: bool
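These added lines are the counterpart of the removals above: the dtype, device map, and low-CPU-memory flags are now decided in one place rather than in the loader and in each quantization branch, and the guard keeps the old behaviour under DeepSpeed ZeRO-3, which initializes and partitions parameters itself and should not receive a `device_map` or `low_cpu_mem_usage`. A rough, self-contained sketch of that consolidation, where `DummyArgs` and the simplified `get_current_device` are illustrative stand-ins rather than the repository's `ModelArguments` and helper:

```python
from dataclasses import dataclass
from typing import Any, Dict

import torch
from transformers.integrations import is_deepspeed_zero3_enabled


@dataclass
class DummyArgs:
    # Stand-in for ModelArguments.compute_dtype in the real code base.
    compute_dtype: torch.dtype = torch.bfloat16


def get_current_device() -> str:
    # Simplified stand-in for the repository's get_current_device helper.
    return "cuda:0" if torch.cuda.is_available() else "cpu"


def populate_config_kwargs(model_args: DummyArgs) -> Dict[str, Any]:
    """Mimics the new tail of patch_config: one place decides the loading kwargs."""
    config_kwargs: Dict[str, Any] = {}
    config_kwargs["torch_dtype"] = model_args.compute_dtype
    if not is_deepspeed_zero3_enabled():
        # Under ZeRO-3 the engine owns parameter placement, so these stay unset.
        config_kwargs["device_map"] = {"": get_current_device()}
        config_kwargs["low_cpu_mem_usage"] = True
    return config_kwargs


print(populate_config_kwargs(DummyArgs()))
```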