From dc080399c68cd22a8fe13cb76fdb90bd635a2c93 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga
Date: Tue, 13 May 2025 15:59:55 +0800
Subject: [PATCH] [model] add seed coder and qwen3 quant models (#8039)

---
 README.md                                |  4 +-
 README_zh.md                             |  4 +-
 data/README.md                           |  2 +
 data/README_zh.md                        |  2 +
 src/llamafactory/data/parser.py          |  7 +--
 src/llamafactory/data/template.py        | 14 ++++++
 src/llamafactory/data/tool_utils.py      |  4 +-
 src/llamafactory/extras/constants.py     | 55 +++++++++++++++++++++---
 src/llamafactory/webui/components/top.py |  4 +-
 src/llamafactory/webui/control.py        | 11 +++++
 src/llamafactory/webui/locales.py        |  7 +++
 11 files changed, 94 insertions(+), 20 deletions(-)

diff --git a/README.md b/README.md
index 7e9dd1d3..38eef1dc 100644
--- a/README.md
+++ b/README.md
@@ -299,8 +299,9 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 | [Qwen (1-2.5) (Code/Math/MoE/QwQ)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen |
 | [Qwen3 (MoE)](https://huggingface.co/Qwen) | 0.6B/1.7B/4B/8B/14B/32B/235B | qwen3 |
 | [Qwen2-Audio](https://huggingface.co/Qwen) | 7B | qwen2_audio |
-| [Qwen2.5-Omni](https://huggingface.co/Qwen)\*\* | 3B/7B | qwen2_omni |
+| [Qwen2.5-Omni](https://huggingface.co/Qwen)\* | 3B/7B | qwen2_omni |
 | [Qwen2-VL/Qwen2.5-VL/QVQ](https://huggingface.co/Qwen) | 2B/3B/7B/32B/72B | qwen2_vl |
+| [Seed Coder](https://huggingface.co/ByteDance-Seed) | 8B | seed_coder |
 | [Skywork o1](https://huggingface.co/Skywork) | 8B | skywork_o1 |
 | [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - |
 | [TeleChat2](https://huggingface.co/Tele-AI) | 3B/7B/35B/115B | telechat2 |
@@ -423,6 +424,7 @@ You also can add a custom chat template to [template.py](src/llamafactory/data/t
 - [COIG-P (en&zh)](https://huggingface.co/datasets/m-a-p/COIG-P)
 - [RLHF-V (en)](https://huggingface.co/datasets/openbmb/RLHF-V-Dataset)
 - [VLFeedback (en)](https://huggingface.co/datasets/Zhihui/VLFeedback)
+- [RLAIF-V (en)](https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset)
 - [Orca DPO Pairs (en)](https://huggingface.co/datasets/Intel/orca_dpo_pairs)
 - [HH-RLHF (en)](https://huggingface.co/datasets/Anthropic/hh-rlhf)
 - [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
diff --git a/README_zh.md b/README_zh.md
index 8662d82e..f062ccee 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -286,8 +286,9 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc
 | [Qwen (1-2.5) (Code/Math/MoE/QwQ)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen |
 | [Qwen3 (MoE)](https://huggingface.co/Qwen) | 0.6B/1.7B/4B/8B/14B/32B/235B | qwen3 |
 | [Qwen2-Audio](https://huggingface.co/Qwen) | 7B | qwen2_audio |
-| [Qwen2.5-Omni](https://huggingface.co/Qwen)\*\* | 3B/7B | qwen2_omni |
+| [Qwen2.5-Omni](https://huggingface.co/Qwen)\* | 3B/7B | qwen2_omni |
 | [Qwen2-VL/Qwen2.5-VL/QVQ](https://huggingface.co/Qwen) | 2B/3B/7B/32B/72B | qwen2_vl |
+| [Seed Coder](https://huggingface.co/ByteDance-Seed) | 8B | seed_coder |
 | [Skywork o1](https://huggingface.co/Skywork) | 8B | skywork_o1 |
 | [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - |
 | [TeleChat2](https://huggingface.co/Tele-AI) | 3B/7B/35B/115B | telechat2 |
@@ -410,6 +411,7 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc
 - [COIG-P (en&zh)](https://huggingface.co/datasets/m-a-p/COIG-P)
 - [RLHF-V (en)](https://huggingface.co/datasets/openbmb/RLHF-V-Dataset)
 - [VLFeedback (en)](https://huggingface.co/datasets/Zhihui/VLFeedback)
+- [RLAIF-V (en)](https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset)
 - [Orca DPO Pairs (en)](https://huggingface.co/datasets/Intel/orca_dpo_pairs)
 - [HH-RLHF (en)](https://huggingface.co/datasets/Anthropic/hh-rlhf)
 - [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
diff --git a/data/README.md b/data/README.md
index de04335a..5c2e969a 100644
--- a/data/README.md
+++ b/data/README.md
@@ -1,5 +1,7 @@
 The [dataset_info.json](dataset_info.json) contains all available datasets. If you are using a custom dataset, please **make sure** to add a *dataset description* in `dataset_info.json` and specify `dataset: dataset_name` before training to use it.
 
+The `dataset_info.json` file should be put in the `dataset_dir` directory. You can change `dataset_dir` to use another directory. The default value is `./data`.
+
 Currently we support datasets in **alpaca** and **sharegpt** format.
 
 ```json
diff --git a/data/README_zh.md b/data/README_zh.md
index e1ebb87f..e36cbfe6 100644
--- a/data/README_zh.md
+++ b/data/README_zh.md
@@ -1,5 +1,7 @@
 [dataset_info.json](dataset_info.json) 包含了所有可用的数据集。如果您希望使用自定义数据集,请**务必**在 `dataset_info.json` 文件中添加*数据集描述*,并通过修改 `dataset: 数据集名称` 配置来使用数据集。
 
+其中 `dataset_info.json` 文件应放置在 `dataset_dir` 目录下。您可以通过修改 `dataset_dir` 参数来使用其他目录。默认值为 `./data`。
+
 目前我们支持 **alpaca** 格式和 **sharegpt** 格式的数据集。
 
 ```json
diff --git a/src/llamafactory/data/parser.py b/src/llamafactory/data/parser.py
index d7220c7b..3c692784 100644
--- a/src/llamafactory/data/parser.py
+++ b/src/llamafactory/data/parser.py
@@ -115,12 +115,7 @@ def get_dataset_list(dataset_names: Optional[list[str]], dataset_dir: str) -> li
     dataset_list: list[DatasetAttr] = []
     for name in dataset_names:
         if dataset_info is None:  # dataset_dir is ONLINE
-            if use_modelscope():
-                load_from = "ms_hub"
-            elif use_openmind():
-                load_from = "om_hub"
-            else:
-                load_from = "hf_hub"
+            load_from = "ms_hub" if use_modelscope() else "om_hub" if use_openmind() else "hf_hub"
             dataset_attr = DatasetAttr(load_from, dataset_name=name)
             dataset_list.append(dataset_attr)
             continue
diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py
index debc69da..b05c3b86 100644
--- a/src/llamafactory/data/template.py
+++ b/src/llamafactory/data/template.py
@@ -1622,6 +1622,20 @@ register_template(
 )
 
 
+register_template(
+    name="seed_coder",
+    format_user=StringFormatter(
+        slots=[{"bos_token"}, "user\n{{content}}", {"eos_token"}, {"bos_token"}, "assistant\n"]
+    ),
+    format_system=StringFormatter(slots=[{"bos_token"}, "system\n{{content}}", {"eos_token"}]),
+    default_system=(
+        "You are an AI programming assistant, utilizing the Seed-Coder model, developed by ByteDance Seed, "
+        "and you only answer questions related to computer science. For politically sensitive questions, "
+        "security and privacy issues, and other non-computer science questions, you will refuse to answer.\n\n"
+    ),
+)
+
+
 # copied from llama3 template
 register_template(
     name="skywork_o1",
diff --git a/src/llamafactory/data/tool_utils.py b/src/llamafactory/data/tool_utils.py
index 2c34e1a6..b2f2798b 100644
--- a/src/llamafactory/data/tool_utils.py
+++ b/src/llamafactory/data/tool_utils.py
@@ -237,9 +237,7 @@ class MistralToolUtils(ToolUtils):
     def tool_formatter(tools: list[dict[str, Any]]) -> str:
         wrapped_tools = []
         for tool in tools:
-            wrapped_tools.append(
-                tool if tool.get("type") == "function" else {"type": "function", "function": tool}
-            )
+            wrapped_tools.append(tool if tool.get("type") == "function" else {"type": "function", "function": tool})
 
         return "[AVAILABLE_TOOLS] " + json.dumps(wrapped_tools, ensure_ascii=False) + "[/AVAILABLE_TOOLS]"
 
diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py
index 073e92f9..a461aeee 100644
--- a/src/llamafactory/extras/constants.py
+++ b/src/llamafactory/extras/constants.py
@@ -887,12 +887,13 @@ register_model_group(
 
 
 register_model_group(
     models={
-        "Granite-3.2-1B-A400M-Base": {
+        "Granite-Vision-3.2-2B": {
             DownloadSource.DEFAULT: "ibm-granite/granite-vision-3.2-2b",
             DownloadSource.MODELSCOPE: "AI-ModelScope/granite-vision-3.2-2b",
         },
     },
     template="granite3_vision",
+    multimodal=True,
 )
 
@@ -2502,6 +2503,22 @@ register_model_group(
             DownloadSource.DEFAULT: "Qwen/Qwen3-235B-A22B",
             DownloadSource.MODELSCOPE: "Qwen/Qwen3-235B-A22B",
         },
+        "Qwen3-0.6B-Instruct-GPTQ-Int8": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-0.6B-GPTQ-Int8",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-0.6B-GPTQ-Int8",
+        },
+        "Qwen3-1.7B-Instruct-GPTQ-Int8": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-1.7B-GPTQ-Int8",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-1.7B-GPTQ-Int8",
+        },
+        "Qwen3-4B-Instruct-AWQ": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-4B-AWQ",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-4B-AWQ",
+        },
+        "Qwen3-8B-Instruct-AWQ": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-8B-AWQ",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-8B-AWQ",
+        },
         "Qwen3-14B-Instruct-AWQ": {
             DownloadSource.DEFAULT: "Qwen/Qwen3-14B-AWQ",
             DownloadSource.MODELSCOPE: "Qwen/Qwen3-14B-AWQ",
@@ -2510,6 +2527,14 @@ register_model_group(
             DownloadSource.DEFAULT: "Qwen/Qwen3-32B-AWQ",
             DownloadSource.MODELSCOPE: "Qwen/Qwen3-32B-AWQ",
         },
+        "Qwen3-30B-A3B-Instruct-GPTQ-Int4": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-30B-A3B-GPTQ-Int4",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-30B-A3B-GPTQ-Int4",
+        },
+        "Qwen3-235B-A22B-Instruct-GPTQ-Int4": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-235B-A22B-GPTQ-Int4",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-235B-A22B-GPTQ-Int4",
+        },
     },
     template="qwen3",
 )
@@ -2651,15 +2676,17 @@ register_model_group(
 
 register_model_group(
     models={
-        "SOLAR-10.7B-v1.0": {
-            DownloadSource.DEFAULT: "upstage/SOLAR-10.7B-v1.0",
+        "Seed-Coder-8B-Base": {
+            DownloadSource.DEFAULT: "ByteDance-Seed/Seed-Coder-8B-Base",
         },
-        "SOLAR-10.7B-Instruct-v1.0": {
-            DownloadSource.DEFAULT: "upstage/SOLAR-10.7B-Instruct-v1.0",
-            DownloadSource.MODELSCOPE: "AI-ModelScope/SOLAR-10.7B-Instruct-v1.0",
+        "Seed-Coder-8B-Instruct": {
+            DownloadSource.DEFAULT: "ByteDance-Seed/Seed-Coder-8B-Instruct",
+        },
+        "Seed-Coder-8B-Instruct-Reasoning": {
+            DownloadSource.DEFAULT: "ByteDance-Seed/Seed-Coder-8B-Reasoning-bf16",
         },
     },
-    template="solar",
+    template="seed_coder",
 )
 
 
@@ -2684,6 +2711,20 @@ register_model_group(
 )
 
 
+register_model_group(
+    models={
+        "SOLAR-10.7B-v1.0": {
+            DownloadSource.DEFAULT: "upstage/SOLAR-10.7B-v1.0",
+        },
+        "SOLAR-10.7B-Instruct-v1.0": {
+            DownloadSource.DEFAULT: "upstage/SOLAR-10.7B-Instruct-v1.0",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/SOLAR-10.7B-Instruct-v1.0",
+        },
+    },
+    template="solar",
+)
+
+
 register_model_group(
     models={
         "StarCoder2-3B": {
diff --git a/src/llamafactory/webui/components/top.py b/src/llamafactory/webui/components/top.py
index d6df1746..7d17e079 100644
--- a/src/llamafactory/webui/components/top.py
+++ b/src/llamafactory/webui/components/top.py
@@ -18,7 +18,7 @@ from ...data import TEMPLATES
 from ...extras.constants import METHODS, SUPPORTED_MODELS
 from ...extras.packages import is_gradio_available
 from ..common import save_config
-from ..control import can_quantize, can_quantize_to, get_model_info, list_checkpoints
+from ..control import can_quantize, can_quantize_to, check_template, get_model_info, list_checkpoints
 
 
 if is_gradio_available():
@@ -49,7 +49,7 @@ def create_top() -> dict[str, "Component"]:
 
     model_name.change(get_model_info, [model_name], [model_path, template], queue=False).then(
         list_checkpoints, [model_name, finetuning_type], [checkpoint_path], queue=False
-    )
+    ).then(check_template, [lang, template])
     model_name.input(save_config, inputs=[lang, model_name], queue=False)
     model_path.input(save_config, inputs=[lang, model_name, model_path], queue=False)
     finetuning_type.change(can_quantize, [finetuning_type], [quantization_bit], queue=False).then(
diff --git a/src/llamafactory/webui/control.py b/src/llamafactory/webui/control.py
index c45073dc..1103f27a 100644
--- a/src/llamafactory/webui/control.py
+++ b/src/llamafactory/webui/control.py
@@ -84,6 +84,17 @@ def get_model_info(model_name: str) -> tuple[str, str]:
     return get_model_path(model_name), get_template(model_name)
 
 
+def check_template(lang: str, template: str) -> None:
+    r"""Check if an instruct model is used.
+
+    Please use queue=True to show the warning message.
+
+    Inputs: top.lang, top.template
+    """
+    if template == "default":
+        gr.Warning(ALERTS["warn_no_instruct"][lang])
+
+
 def get_trainer_info(lang: str, output_path: os.PathLike, do_train: bool) -> tuple[str, "gr.Slider", dict[str, Any]]:
     r"""Get training infomation for monitor.
 
diff --git a/src/llamafactory/webui/locales.py b/src/llamafactory/webui/locales.py
index a1ac2c51..ed05bae7 100644
--- a/src/llamafactory/webui/locales.py
+++ b/src/llamafactory/webui/locales.py
@@ -2796,6 +2796,13 @@ ALERTS = {
         "ko": "출력 디렉토리가 이미 존재합니다. 위 출력 디렉토리에 저장된 학습을 재개합니다.",
         "ja": "出力ディレクトリが既に存在します。このチェックポイントからトレーニングを再開します。",
     },
+    "warn_no_instruct": {
+        "en": "You are using a non-instruct model, please fine-tune it first.",
+        "ru": "Вы используете модель без инструкции, пожалуйста, сначала выполните донастройку этой модели.",
+        "zh": "您正在使用非指令模型,请先对其进行微调。",
+        "ko": "당신은 지시하지 않은 모델을 사용하고 있습니다. 먼저 이를 미세 조정해 주세요.",
+        "ja": "インストラクションモデルを使用していません。まずモデルをアダプターに適合させてください。",
+    },
     "info_aborting": {
         "en": "Aborted, wait for terminating...",
         "ru": "Прервано, ожидание завершения...",