[model] add qwen3 2507 models (#8750 )

[model] add glm4moe (#8689 )
2025-12-28 01:30:36 +08:00 · 2025-07-25 20:21:47 +08:00 · 2025-07-25 19:53:45 +08:00
4 changed files with 90 additions and 17 deletions
--- a/src/llamafactory/data/template.py
+++ b/src/llamafactory/data/template.py
@@ -1014,6 +1014,21 @@ register_template(
 )


+register_template(
+    name="glm4_moe",
+    format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]),
+    format_assistant=StringFormatter(slots=["\n{{content}}"]),
+    format_system=StringFormatter(slots=["<|system|>\n{{content}}"]),
+    format_function=FunctionFormatter(slots=["{{content}}"], tool_format="glm4_moe"),
+    format_observation=StringFormatter(slots=["<|observation|>\n{{content}}<|assistant|>"]),
+    format_tools=ToolFormatter(tool_format="glm4_moe"),
+    format_prefix=EmptyFormatter(slots=["[gMASK]<sop>"]),
+    stop_words=["<|user|>", "<|observation|>"],
+    efficient_eos=True,
+    template_class=ReasoningTemplate,
+)
+
+
 # copied from glm4 template
 register_template(
    name="glm4v",
--- a/src/llamafactory/data/tool_utils.py
+++ b/src/llamafactory/data/tool_utils.py
@@ -42,6 +42,18 @@ GLM4_TOOL_PROMPT = (
    "你的任务是针对用户的问题和要求提供适当的答复和支持。\n\n# 可用工具{tool_text}"
 )

+GLM4_MOE_TOOL_PROMPT = (
+    "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\n"
+    "You are provided with function signatures within <tools></tools> XML tags:\n<tools>{tool_text}"
+    "\n</tools>\n\nFor each function call, output the function name and arguments within the following XML format:"
+    "\n<tool_call>{{function-name}}"
+    "\n<arg_key>{{arg-key-1}}</arg_key>"
+    "\n<arg_value>{{arg-value-1}}</arg_value>"
+    "\n<arg_key>{{arg-key-2}}</arg_key>"
+    "\n<arg_value>{{arg-value-2}}</arg_value>"
+    "\n...\n</tool_call>\n"
+)
+
 LLAMA3_TOOL_PROMPT = (
    "Cutting Knowledge Date: December 2023\nToday Date: {date}\n\n"
    "You have access to the following functions. To call a function, please respond with JSON for a function call. "
@@ -303,12 +315,45 @@ class QwenToolUtils(ToolUtils):
        return results


+class GLM4MOEToolUtils(QwenToolUtils):
+    r"""GLM-4-MOE tool using template."""
+
+    @override
+    @staticmethod
+    def tool_formatter(tools: list[dict[str, Any]]) -> str:
+        tool_text = ""
+        for tool in tools:
+            wrapped_tool = tool if tool.get("type") == "function" else {"type": "function", "function": tool}
+            tool_text += "\n" + json.dumps(wrapped_tool, ensure_ascii=False)
+
+        return GLM4_MOE_TOOL_PROMPT.format(tool_text=tool_text)
+
+    @override
+    @staticmethod
+    def function_formatter(functions: list["FunctionCall"]) -> str:
+        function_json = [
+            {"func_name": name, "func_key_values": json.loads(arguments)} for name, arguments in functions
+        ]
+        function_texts = []
+        for func in function_json:
+            prompt = "\n<tool_call>" + func["func_name"]
+            for key, value in func["func_key_values"].items():
+                prompt += "\n<arg_key>" + key + "</arg_key>"
+                if not isinstance(value, str):
+                    value = json.dumps(value, ensure_ascii=False)
+                prompt += "\n<arg_value>" + value + "</arg_value>"
+            function_texts.append(prompt)
+
+        return "\n".join(function_texts)
+
+
 TOOLS = {
    "default": DefaultToolUtils(),
    "glm4": GLM4ToolUtils(),
    "llama3": Llama3ToolUtils(),
    "mistral": MistralToolUtils(),
    "qwen": QwenToolUtils(),
+    "glm4_moe": GLM4MOEToolUtils(),
 }


--- a/src/llamafactory/extras/constants.py
+++ b/src/llamafactory/extras/constants.py
@@ -143,7 +143,7 @@ def register_model_group(
    for name, path in models.items():
        SUPPORTED_MODELS[name] = path
        if template is not None and (
-            any(suffix in name for suffix in ("-Chat", "-Distill", "-Instruct")) or multimodal
+            any(suffix in name for suffix in ("-Chat", "-Distill", "-Instruct", "-Thinking")) or multimodal
        ):
            DEFAULT_TEMPLATE[name] = template

@@ -2680,67 +2680,75 @@ register_model_group(
            DownloadSource.DEFAULT: "Qwen/Qwen3-30B-A3B-Base",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-30B-A3B-Base",
        },
-        "Qwen3-0.6B-Instruct": {
+        "Qwen3-0.6B-Thinking": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-0.6B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-0.6B",
        },
-        "Qwen3-1.7B-Instruct": {
+        "Qwen3-1.7B-Thinking": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-1.7B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-1.7B",
        },
-        "Qwen3-4B-Instruct": {
+        "Qwen3-4B-Thinking": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-4B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-4B",
        },
-        "Qwen3-8B-Instruct": {
+        "Qwen3-8B-Thinking": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-8B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-8B",
        },
-        "Qwen3-14B-Instruct": {
+        "Qwen3-14B-Thinking": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-14B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-14B",
        },
-        "Qwen3-32B-Instruct": {
+        "Qwen3-32B-Thinking": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-32B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-32B",
        },
-        "Qwen3-30B-A3B-Instruct": {
+        "Qwen3-30B-A3B-Thinking": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-30B-A3B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-30B-A3B",
        },
-        "Qwen3-235B-A22B-Instruct": {
+        "Qwen3-235B-A22B-Thinking": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-235B-A22B",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-235B-A22B",
        },
-        "Qwen3-0.6B-Instruct-GPTQ-Int8": {
+        "Qwen3-235B-A22B-Instruct-2507": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-235B-A22B-Instruct-2507",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-235B-A22B-Instruct-2507",
+        },
+        "Qwen3-235B-A22B-Thinking-2507": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-235B-A22B-Thinking-2507",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-235B-A22B-Thinking-2507",
+        },
+        "Qwen3-0.6B-Thinking-GPTQ-Int8": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-0.6B-GPTQ-Int8",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-0.6B-GPTQ-Int8",
        },
-        "Qwen3-1.7B-Instruct-GPTQ-Int8": {
+        "Qwen3-1.7B-Thinking-GPTQ-Int8": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-1.7B-GPTQ-Int8",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-1.7B-GPTQ-Int8",
        },
-        "Qwen3-4B-Instruct-AWQ": {
+        "Qwen3-4B-Thinking-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-4B-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-4B-AWQ",
        },
-        "Qwen3-8B-Instruct-AWQ": {
+        "Qwen3-8B-Thinking-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-8B-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-8B-AWQ",
        },
-        "Qwen3-14B-Instruct-AWQ": {
+        "Qwen3-14B-Thinking-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-14B-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-14B-AWQ",
        },
-        "Qwen3-32B-Instruct-AWQ": {
+        "Qwen3-32B-Thinking-AWQ": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-32B-AWQ",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-32B-AWQ",
        },
-        "Qwen3-30B-A3B-Instruct-GPTQ-Int4": {
+        "Qwen3-30B-A3B-Thinking-GPTQ-Int4": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-30B-A3B-GPTQ-Int4",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-30B-A3B-GPTQ-Int4",
        },
-        "Qwen3-235B-A22B-Instruct-GPTQ-Int4": {
+        "Qwen3-235B-A22B-Thinking-GPTQ-Int4": {
            DownloadSource.DEFAULT: "Qwen/Qwen3-235B-A22B-GPTQ-Int4",
            DownloadSource.MODELSCOPE: "Qwen/Qwen3-235B-A22B-GPTQ-Int4",
        },
--- a/src/llamafactory/model/model_utils/moe.py
+++ b/src/llamafactory/model/model_utils/moe.py
@@ -57,6 +57,11 @@ def add_z3_leaf_module(model: "PreTrainedModel") -> None:

        _set_z3_leaf_modules(model, [GraniteMoeMoE])

+    if model_type == "glm4_moe":
+        from transformers.models.glm4_moe.modeling_glm4_moe import Glm4MoeMoE
+
+        _set_z3_leaf_modules(model, [Glm4MoeMoE])
+
    if model_type == "jamba":
        from transformers.models.jamba.modeling_jamba import JambaSparseMoeBlock
Author	SHA1	Message	Date
Yaowei Zheng	3307ff1d4a	[model] add qwen3 2507 models (#8750 )	2025-07-25 20:21:47 +08:00
Kingsley	2aadc90c2d	[model] add glm4moe (#8689 )	2025-07-25 19:53:45 +08:00