Mirror of https://github.com/hiyouga/LLaMA-Factory.git
Synced 2025-12-28 09:40:34 +08:00

Compare commits: cff4483392 ... 4fd94141a4 (2 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 4fd94141a4 | |
| | 22d6ac29d5 | |
```diff
@@ -291,8 +291,8 @@ Read technical notes:
 | [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma/gemma2 |
 | [Gemma 3/Gemma 3n](https://huggingface.co/google) | 270M/1B/4B/6B/8B/12B/27B | gemma3/gemma3n |
 | [GLM-4/GLM-4-0414/GLM-Z1](https://huggingface.co/zai-org) | 9B/32B | glm4/glmz1 |
-| [GLM-4.1V/GLM-4.6V](https://huggingface.co/zai-org) | 9B/106B | glm4v |
-| [GLM-4.5/GLM-4.5V](https://huggingface.co/zai-org) | 106B/355B | glm4_moe/glm4v_moe |
+| [GLM-4.1V](https://huggingface.co/zai-org) | 9B | glm4v |
+| [GLM-4.5/GLM-4.5(6)V](https://huggingface.co/zai-org) | 9B/106B/355B | glm4_moe/glm4_5v |
 | [GPT-2](https://huggingface.co/openai-community) | 0.1B/0.4B/0.8B/1.5B | - |
 | [GPT-OSS](https://huggingface.co/openai) | 20B/120B | gpt |
 | [Granite 3.0-3.3](https://huggingface.co/ibm-granite) | 1B/2B/3B/8B | granite3 |
```
```diff
@@ -315,7 +315,7 @@ Read technical notes:
 | [MiMo](https://huggingface.co/XiaomiMiMo) | 7B | mimo |
 | [MiniCPM 1-4.1](https://huggingface.co/openbmb) | 0.5B/1B/2B/4B/8B | cpm/cpm3/cpm4 |
 | [MiniCPM-o-2.6/MiniCPM-V-2.6](https://huggingface.co/openbmb) | 8B | minicpm_o/minicpm_v |
-| [Ministral/Mistral-Nemo](https://huggingface.co/mistralai) | 8B/12B | ministral |
+| [Ministral(3)/Mistral-Nemo](https://huggingface.co/mistralai) | 3B/8B/12B/14B | ministral/ministral3 |
 | [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral |
 | [Mistral Small](https://huggingface.co/mistralai) | 24B | mistral_small |
 | [OLMo](https://huggingface.co/allenai) | 1B/7B | - |
```
```diff
@@ -293,8 +293,8 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc
 | [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma/gemma2 |
 | [Gemma 3/Gemma 3n](https://huggingface.co/google) | 270M/1B/4B/6B/8B/12B/27B | gemma3/gemma3n |
 | [GLM-4/GLM-4-0414/GLM-Z1](https://huggingface.co/zai-org) | 9B/32B | glm4/glmz1 |
-| [GLM-4.1V/GLM-4.6V](https://huggingface.co/zai-org) | 9B/106B | glm4v |
-| [GLM-4.5/GLM-4.5V](https://huggingface.co/zai-org) | 106B/355B | glm4_moe/glm4v_moe |
+| [GLM-4.1V](https://huggingface.co/zai-org) | 9B | glm4v |
+| [GLM-4.5/GLM-4.5(6)V](https://huggingface.co/zai-org) | 9B/106B/355B | glm4_moe/glm4_5v |
 | [GPT-2](https://huggingface.co/openai-community) | 0.1B/0.4B/0.8B/1.5B | - |
 | [GPT-OSS](https://huggingface.co/openai) | 20B/120B | gpt |
 | [Granite 3.0-3.3](https://huggingface.co/ibm-granite) | 1B/2B/3B/8B | granite3 |
```
```diff
@@ -317,7 +317,7 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc
 | [MiMo](https://huggingface.co/XiaomiMiMo) | 7B | mimo |
 | [MiniCPM 1-4.1](https://huggingface.co/openbmb) | 0.5B/1B/2B/4B/8B | cpm/cpm3/cpm4 |
 | [MiniCPM-o-2.6/MiniCPM-V-2.6](https://huggingface.co/openbmb) | 8B | minicpm_o/minicpm_v |
-| [Ministral/Mistral-Nemo](https://huggingface.co/mistralai) | 8B/12B | ministral |
+| [Ministral(3)/Mistral-Nemo](https://huggingface.co/mistralai) | 3B/8B/12B/14B | ministral/ministral3 |
 | [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral |
 | [Mistral Small](https://huggingface.co/mistralai) | 24B | mistral_small |
 | [OLMo](https://huggingface.co/allenai) | 1B/7B | - |
```
```diff
@@ -1128,7 +1128,7 @@ register_template(
 
 # copied from glm4 template
 register_template(
-    name="glm4v_moe",
+    name="glm4_5v",
     format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]),
     format_assistant=StringFormatter(slots=["\n{{content}}"]),
     format_system=StringFormatter(slots=["<|system|>\n{{content}}"]),
```
```diff
@@ -1687,6 +1687,19 @@ register_template(
 )
 
 
+register_template(
+    name="ministral3",
+    format_user=StringFormatter(slots=["[INST]{{content}}[/INST]"]),
+    format_system=StringFormatter(slots=["{{content}}\n\n"]),
+    format_function=FunctionFormatter(slots=["[TOOL_CALLS]{{content}}", {"eos_token"}], tool_format="mistral"),
+    format_observation=StringFormatter(slots=["""[TOOL_RESULTS]{"content": {{content}}}[/TOOL_RESULTS]"""]),
+    format_tools=ToolFormatter(tool_format="mistral"),
+    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+    template_class=Llama2Template,
+    mm_plugin=get_mm_plugin(name="pixtral", image_token="[IMG]"),
+)
+
+
 register_template(
     name="olmo",
     format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>\n"]),
```
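Since the new template delegates turn packing to `Llama2Template`, a sanity sketch of the prompt shape it should produce may help. The renderer below is a hypothetical stand-in that only mirrors the slot strings registered above (it is not LLaMA-Factory's encoder), and it assumes Llama2Template's usual behavior of folding the system text into the first user turn and a Mistral-family `<s>` bos token.

```python
# Hypothetical renderer mirroring the ministral3 slots above.
BOS = "<s>"  # assumed bos_token

def render_ministral3(system: str, turns: list[tuple[str, str]]) -> str:
    """Compose (user, assistant) turns the way the registered slots suggest."""
    prompt = BOS  # format_prefix: [{"bos_token"}]
    for i, (user, assistant) in enumerate(turns):
        if i == 0 and system:
            user = f"{system}\n\n{user}"  # format_system: "{{content}}\n\n"
        prompt += f"[INST]{user}[/INST]"  # format_user
        prompt += assistant
    return prompt

print(render_ministral3("You are helpful.", [("Hi!", "Hello!")]))
# -> <s>[INST]You are helpful.\n\nHi![/INST]Hello!
```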
```diff
@@ -141,6 +141,7 @@ class QuantizationMethod(str, Enum):
     EETQ = "eetq"
     HQQ = "hqq"
     MXFP4 = "mxfp4"
+    FP8 = "fp8"
 
 
 class RopeScaling(str, Enum):
```
```diff
@@ -969,14 +970,6 @@ register_model_group(
             DownloadSource.DEFAULT: "zai-org/GLM-4.1V-9B-Thinking",
             DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4.1V-9B-Thinking",
         },
-        "GLM-4.6V": {
-            DownloadSource.DEFAULT: "zai-org/GLM-4.6V",
-            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4.6V",
-        },
-        "GLM-4.6V-Flash": {
-            DownloadSource.DEFAULT: "zai-org/GLM-4.6V-Flash",
-            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4.6V-Flash",
-        },
     },
     template="glm4v",
     multimodal=True,
```
```diff
@@ -1011,9 +1004,17 @@ register_model_group(
         "GLM-4.5V-Air-Thinking": {
             DownloadSource.DEFAULT: "zai-org/GLM-4.5V",
             DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4.5V",
-        }
+        },
+        "GLM-4.6V": {
+            DownloadSource.DEFAULT: "zai-org/GLM-4.6V",
+            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4.6V",
+        },
+        "GLM-4.6V-Flash": {
+            DownloadSource.DEFAULT: "zai-org/GLM-4.6V-Flash",
+            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4.6V-Flash",
+        },
     },
-    template="glm4v_moe",
+    template="glm4_5v",
     multimodal=True,
 )
 
```
```diff
@@ -1977,6 +1978,25 @@ register_model_group(
     template="mistral",
 )
 
 
+register_model_group(
+    models={
+        "Ministral-3-3B-Instruct-2512": {
+            DownloadSource.DEFAULT: "mistralai/Ministral-3-3B-Instruct-2512",
+            DownloadSource.MODELSCOPE: "mistralai/Ministral-3-3B-Instruct-2512",
+        },
+        "Ministral-3-8B-Instruct-2512": {
+            DownloadSource.DEFAULT: "mistralai/Ministral-3-8B-Instruct-2512",
+            DownloadSource.MODELSCOPE: "mistralai/Ministral-3-8B-Instruct-2512",
+        },
+        "Ministral-3-14B-Instruct-2512": {
+            DownloadSource.DEFAULT: "mistralai/Ministral-3-14B-Instruct-2512",
+            DownloadSource.MODELSCOPE: "mistralai/Ministral-3-14B-Instruct-2512",
+        },
+    },
+    template="ministral3",
+    multimodal=True,
+)
+
 register_model_group(
     models={
```
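As a usage note, registration makes the new checkpoints addressable by display name. A hypothetical lookup, assuming the constants module exposes the upstream `SUPPORTED_MODELS` mapping that `register_model_group` populates:

```python
# Hypothetical lookup; assumes SUPPORTED_MODELS maps a display name to a
# {DownloadSource: repo_id} dict, as register_model_group populates upstream.
from llamafactory.extras.constants import SUPPORTED_MODELS, DownloadSource

paths = SUPPORTED_MODELS["Ministral-3-8B-Instruct-2512"]
print(paths[DownloadSource.DEFAULT])  # mistralai/Ministral-3-8B-Instruct-2512
```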
```diff
@@ -22,7 +22,7 @@ from typing import TYPE_CHECKING, Any
 
 import torch
 from datasets import load_dataset
-from transformers import BitsAndBytesConfig, EetqConfig, GPTQConfig, HqqConfig
+from transformers import BitsAndBytesConfig, EetqConfig, FineGrainedFP8Config, GPTQConfig, HqqConfig
 from transformers.integrations import is_deepspeed_zero3_enabled
 from transformers.modeling_utils import is_fsdp_enabled
 
```
```diff
@@ -83,6 +83,7 @@ def configure_quantization(
     config: "PretrainedConfig",
     tokenizer: "PreTrainedTokenizer",
     model_args: "ModelArguments",
+    is_trainable: bool,
     init_kwargs: dict[str, Any],
 ) -> None:
     r"""Priority: PTQ-quantized (train/infer) > AutoGPTQ (export) > On-the-fly quantization (train/infer)."""
```
```diff
@@ -109,6 +110,10 @@ def configure_quantization(
             check_version("aqlm>=1.1.0", mandatory=True)
             quantization_config["bits"] = 2
 
+        if quant_method == QuantizationMethod.FP8 and is_trainable:
+            quant_config = FineGrainedFP8Config(dequantize=True)
+            init_kwargs["quantization_config"] = quant_config
+
         quant_bits = quantization_config.get("bits", "?")
         logger.info_rank0(f"Loading {quant_bits}-bit {quant_method.upper()}-quantized model.")
 
```
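The intent of the new branch, as far as the diff shows: when a checkpoint was post-training-quantized to FP8 and the model is being loaded for training, passing `FineGrainedFP8Config(dequantize=True)` asks Transformers to expand the FP8 weights back to higher precision at load time so gradients can flow through ordinary linear layers. A minimal standalone sketch of the same load path, with a placeholder checkpoint name:

```python
from transformers import AutoModelForCausalLM, FineGrainedFP8Config

# Sketch: load an FP8 post-training-quantized checkpoint for fine-tuning.
# dequantize=True (the flag used in the diff) restores high-precision weights,
# trading memory for trainability; inference-only loads would omit it.
model = AutoModelForCausalLM.from_pretrained(
    "org/fp8-quantized-model",  # placeholder repo id
    quantization_config=FineGrainedFP8Config(dequantize=True),
)
```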
```diff
@@ -301,6 +301,7 @@ _register_composite_model(
 
 _register_composite_model(
     model_type="mistral3",
+    projector_key="model.multi_modal_projector",
 )
 
 
```
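Presumably the `projector_key` tells the composite-model logic where the multimodal projector lives in the mistral3 module tree, so its parameters can be matched by prefix (for example when deciding what to freeze or train). A toy sketch of prefix matching under that assumption; the model below is a stand-in, not the real architecture:

```python
import torch.nn as nn

# Stand-in module tree; only the parameter naming matters for this sketch.
model = nn.ModuleDict({"model": nn.ModuleDict({"multi_modal_projector": nn.Linear(16, 16)})})

projector_key = "model.multi_modal_projector"  # key registered in the diff above
for name, param in model.named_parameters():
    # e.g. train only the projector, freeze everything else
    param.requires_grad = name.startswith(projector_key)
```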
```diff
@@ -115,7 +115,7 @@ def patch_config(
     configure_attn_implementation(config, model_args)
     configure_rope(config, model_args)
     configure_longlora(config, model_args, is_trainable)
-    configure_quantization(config, tokenizer, model_args, init_kwargs)
+    configure_quantization(config, tokenizer, model_args, is_trainable, init_kwargs)
     configure_moe(config, model_args, is_trainable)
     configure_visual_model(config)
     configure_packing(model_args, is_trainable)
```
```diff
@@ -78,7 +78,7 @@ def run_sft(
     gen_kwargs = generating_args.to_dict(obey_generation_config=True)
 
     # Compatible with Transformers v4 and Transformers v5
-    if is_transformers_version_greater_than("5.0.0RC0"):
+    if is_transformers_version_greater_than("4.58.0"):
         extra_ids = getattr(tokenizer, "additional_special_tokens_ids", None)
         if not isinstance(extra_ids, list):
             extra_special_tokens = getattr(tokenizer, "_extra_special_tokens", [])
```
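Lowering the gate from `5.0.0RC0` to `4.58.0` makes the v5-style fallback for special-token ids engage on late 4.x releases as well. A sketch of the helper's presumed semantics; the real implementation lives in LLaMA-Factory's extras, and this stand-in just compares installed versions:

```python
# Stand-in for is_transformers_version_greater_than; assumes ">= version"
# semantics, matching how the call site above reads.
from packaging import version
import transformers

def is_transformers_version_greater_than(target: str) -> bool:
    return version.parse(transformers.__version__) >= version.parse(target)

print(is_transformers_version_greater_than("4.58.0"))
```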