From a0e9a36a6f3630f81f64d24b873ac8b5ff2855e1 Mon Sep 17 00:00:00 2001 From: Yaowei Zheng Date: Mon, 11 Aug 2025 23:17:32 +0800 Subject: [PATCH] [model] add qwen3 nothink (#8869) --- README.md | 1 + README_zh.md | 1 + src/llamafactory/data/template.py | 22 +++++++++++++--- src/llamafactory/extras/constants.py | 31 ++++++++++++++--------- src/llamafactory/hparams/data_args.py | 2 +- src/llamafactory/hparams/training_args.py | 8 ++++-- 6 files changed, 47 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 2daeb2e3..f4ce635a 100644 --- a/README.md +++ b/README.md @@ -101,6 +101,7 @@ Choose your path: ## Blogs +- [Fine-tune GPT-OSS for Role-Playing using LLaMA-Factory](https://docs.llamafactory.com.cn/docs/documents/best-practice/gptoss/?utm_source=LLaMA-Factory) (Chinese) - [Fine-tune Llama3.1-70B for Medical Diagnosis using LLaMA-Factory](https://docs.alayanew.com/docs/documents/bestPractice/bigModel/llama70B/?utm_source=LLaMA-Factory) (Chinese) - [A One-Stop Code-Free Model Reinforcement Learning and Deployment Platform based on LLaMA-Factory and EasyR1](https://aws.amazon.com/cn/blogs/china/building-llm-model-hub-based-on-llamafactory-and-easyr1/) (Chinese) - [How Apoidea Group enhances visual information extraction from banking documents with multimodal models using LLaMA-Factory on Amazon SageMaker HyperPod](https://aws.amazon.com/cn/blogs/machine-learning/how-apoidea-group-enhances-visual-information-extraction-from-banking-documents-with-multimodal-models-using-llama-factory-on-amazon-sagemaker-hyperpod/) (English) diff --git a/README_zh.md b/README_zh.md index 83f8fc98..e970255c 100644 --- a/README_zh.md +++ b/README_zh.md @@ -103,6 +103,7 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc ## 官方博客 +- [使用 LLaMA-Factory 构建 GPT-OSS 角色扮演模型](https://docs.llamafactory.com.cn/docs/documents/best-practice/gptoss/?utm_source=LLaMA-Factory)(中文) - [使用 LLaMA-Factory 微调 Llama3.1-70B 医学诊断模型](https://docs.alayanew.com/docs/documents/bestPractice/bigModel/llama70B/?utm_source=LLaMA-Factory)(中文) - [基于 LLaMA-Factory 和 EasyR1 打造一站式无代码大模型强化学习和部署平台 LLM Model Hub](https://aws.amazon.com/cn/blogs/china/building-llm-model-hub-based-on-llamafactory-and-easyr1/)(中文) - [通过亚马逊 SageMaker HyperPod 上的 LLaMA-Factory 增强多模态模型银行文档的视觉信息提取](https://aws.amazon.com/cn/blogs/machine-learning/how-apoidea-group-enhances-visual-information-extraction-from-banking-documents-with-multimodal-models-using-llama-factory-on-amazon-sagemaker-hyperpod/)(英文) diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index ef6c7c15..b888b6b1 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -96,7 +96,7 @@ class Template: def add_thought(self, content: str = "") -> str: r"""Add empty thought to assistant message.""" - return f"{self.thought_words[0]}\n\n{self.thought_words[1]}\n\n" + content + return f"{self.thought_words[0]}{self.thought_words[1]}" + content def remove_thought(self, content: str) -> str: r"""Remove thought from assistant message.""" @@ -518,7 +518,7 @@ def register_template( format_prefix=format_prefix or default_prefix_formatter, default_system=default_system, stop_words=stop_words or [], - thought_words=thought_words or ("", ""), + thought_words=thought_words or ("\n", "\n\n\n"), efficient_eos=efficient_eos, replace_eos=replace_eos, replace_jinja_template=replace_jinja_template, @@ -579,7 +579,7 @@ def parse_template(tokenizer: "PreTrainedTokenizer") -> "Template": format_prefix=EmptyFormatter(slots=[prefix]) if prefix else EmptyFormatter(), default_system=default_system, stop_words=[], - thought_words=("", ""), + thought_words=("\n", "\n\n\n"), efficient_eos=False, replace_eos=False, replace_jinja_template=False, @@ -1750,6 +1750,22 @@ register_template( ) +# copied from qwen template +register_template( + name="qwen3_nothink", + format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), + format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]), + format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]), + format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="qwen"), + format_observation=StringFormatter( + slots=["<|im_start|>user\n\n{{content}}\n<|im_end|>\n<|im_start|>assistant\n"] + ), + format_tools=ToolFormatter(tool_format="qwen"), + stop_words=["<|im_end|>"], + replace_eos=True, +) + + # copied from chatml template register_template( name="qwen2_audio", diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index 8364e96c..f252a3a8 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -2767,10 +2767,6 @@ register_model_group( DownloadSource.DEFAULT: "Qwen/Qwen3-4B", DownloadSource.MODELSCOPE: "Qwen/Qwen3-4B", }, - "Qwen3-4B-Instruct-2507": { - DownloadSource.DEFAULT: "Qwen/Qwen3-4B-Instruct-2507", - DownloadSource.MODELSCOPE: "Qwen/Qwen3-4B-Instruct-2507", - }, "Qwen3-4B-Thinking-2507": { DownloadSource.DEFAULT: "Qwen/Qwen3-4B-Thinking-2507", DownloadSource.MODELSCOPE: "Qwen/Qwen3-4B-Thinking-2507", @@ -2791,10 +2787,6 @@ register_model_group( DownloadSource.DEFAULT: "Qwen/Qwen3-30B-A3B", DownloadSource.MODELSCOPE: "Qwen/Qwen3-30B-A3B", }, - "Qwen3-30B-A3B-Instruct-2507": { - DownloadSource.DEFAULT: "Qwen/Qwen3-30B-A3B-Instruct-2507", - DownloadSource.MODELSCOPE: "Qwen/Qwen3-30B-A3B-Instruct-2507", - }, "Qwen3-30B-A3B-Thinking-2507": { DownloadSource.DEFAULT: "Qwen/Qwen3-30B-A3B-Thinking-2507", DownloadSource.MODELSCOPE: "Qwen/Qwen3-30B-A3B-Thinking-2507", @@ -2803,10 +2795,6 @@ register_model_group( DownloadSource.DEFAULT: "Qwen/Qwen3-235B-A22B", DownloadSource.MODELSCOPE: "Qwen/Qwen3-235B-A22B", }, - "Qwen3-235B-A22B-Instruct-2507": { - DownloadSource.DEFAULT: "Qwen/Qwen3-235B-A22B-Instruct-2507", - DownloadSource.MODELSCOPE: "Qwen/Qwen3-235B-A22B-Instruct-2507", - }, "Qwen3-235B-A22B-Thinking-2507": { DownloadSource.DEFAULT: "Qwen/Qwen3-235B-A22B-Thinking-2507", DownloadSource.MODELSCOPE: "Qwen/Qwen3-235B-A22B-Thinking-2507", @@ -2848,6 +2836,25 @@ register_model_group( ) +register_model_group( + models={ + "Qwen3-4B-Instruct-2507": { + DownloadSource.DEFAULT: "Qwen/Qwen3-4B-Instruct-2507", + DownloadSource.MODELSCOPE: "Qwen/Qwen3-4B-Instruct-2507", + }, + "Qwen3-30B-A3B-Instruct-2507": { + DownloadSource.DEFAULT: "Qwen/Qwen3-30B-A3B-Instruct-2507", + DownloadSource.MODELSCOPE: "Qwen/Qwen3-30B-A3B-Instruct-2507", + }, + "Qwen3-235B-A22B-Instruct-2507": { + DownloadSource.DEFAULT: "Qwen/Qwen3-235B-A22B-Instruct-2507", + DownloadSource.MODELSCOPE: "Qwen/Qwen3-235B-A22B-Instruct-2507", + }, + }, + template="qwen3_nothink", +) + + register_model_group( models={ "Qwen2-Audio-7B": { diff --git a/src/llamafactory/hparams/data_args.py b/src/llamafactory/hparams/data_args.py index 90036706..e6844733 100644 --- a/src/llamafactory/hparams/data_args.py +++ b/src/llamafactory/hparams/data_args.py @@ -16,7 +16,7 @@ # limitations under the License. from dataclasses import asdict, dataclass, field -from typing import Any, Literal, Optional, Union +from typing import Any, Literal, Optional @dataclass diff --git a/src/llamafactory/hparams/training_args.py b/src/llamafactory/hparams/training_args.py index b37c0a2f..38cdf6af 100644 --- a/src/llamafactory/hparams/training_args.py +++ b/src/llamafactory/hparams/training_args.py @@ -50,7 +50,7 @@ class RayArguments: default="PACK", metadata={"help": "The placement strategy for Ray training. Default is PACK."}, ) - ray_init_kwargs: Optional[dict] = field( + ray_init_kwargs: Optional[Union[dict, str]] = field( default=None, metadata={"help": "The arguments to pass to ray.init for Ray training. Default is None."}, ) @@ -59,10 +59,14 @@ class RayArguments: self.use_ray = use_ray() if isinstance(self.resources_per_worker, str) and self.resources_per_worker.startswith("{"): self.resources_per_worker = _convert_str_dict(json.loads(self.resources_per_worker)) + + if isinstance(self.ray_init_kwargs, str) and self.ray_init_kwargs.startswith("{"): + self.ray_init_kwargs = _convert_str_dict(json.loads(self.ray_init_kwargs)) + if self.ray_storage_filesystem is not None: if self.ray_storage_filesystem not in ["s3", "gs", "gcs"]: raise ValueError( - f"ray_storage_filesystem must be one of ['s3', 'gs', 'gcs'], got {self.ray_storage_filesystem}" + f"ray_storage_filesystem must be one of ['s3', 'gs', 'gcs'], got {self.ray_storage_filesystem}." ) import pyarrow.fs as fs