Mirror of https://github.com/hiyouga/LLaMA-Factory.git

[model] add seed coder and qwen3 quant models (#8039)

parent 68fc068cab
commit dc080399c6
@@ -299,8 +299,9 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 | [Qwen (1-2.5) (Code/Math/MoE/QwQ)](https://huggingface.co/Qwen)   | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen                |
 | [Qwen3 (MoE)](https://huggingface.co/Qwen)                        | 0.6B/1.7B/4B/8B/14B/32B/235B     | qwen3               |
 | [Qwen2-Audio](https://huggingface.co/Qwen)                        | 7B                               | qwen2_audio         |
-| [Qwen2.5-Omni](https://huggingface.co/Qwen)\*\*                   | 3B/7B                            | qwen2_omni          |
+| [Qwen2.5-Omni](https://huggingface.co/Qwen)\*                     | 3B/7B                            | qwen2_omni          |
 | [Qwen2-VL/Qwen2.5-VL/QVQ](https://huggingface.co/Qwen)            | 2B/3B/7B/32B/72B                 | qwen2_vl            |
+| [Seed Coder](https://huggingface.co/ByteDance-Seed)               | 8B                               | seed_coder          |
 | [Skywork o1](https://huggingface.co/Skywork)                      | 8B                               | skywork_o1          |
 | [StarCoder 2](https://huggingface.co/bigcode)                     | 3B/7B/15B                        | -                   |
 | [TeleChat2](https://huggingface.co/Tele-AI)                       | 3B/7B/35B/115B                   | telechat2           |
@@ -423,6 +424,7 @@ You also can add a custom chat template to [template.py](src/llamafactory/data/t
 - [COIG-P (en&zh)](https://huggingface.co/datasets/m-a-p/COIG-P)
 - [RLHF-V (en)](https://huggingface.co/datasets/openbmb/RLHF-V-Dataset)
 - [VLFeedback (en)](https://huggingface.co/datasets/Zhihui/VLFeedback)
+- [RLAIF-V (en)](https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset)
 - [Orca DPO Pairs (en)](https://huggingface.co/datasets/Intel/orca_dpo_pairs)
 - [HH-RLHF (en)](https://huggingface.co/datasets/Anthropic/hh-rlhf)
 - [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
@@ -286,8 +286,9 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc
 | [Qwen (1-2.5) (Code/Math/MoE/QwQ)](https://huggingface.co/Qwen)   | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen                |
 | [Qwen3 (MoE)](https://huggingface.co/Qwen)                        | 0.6B/1.7B/4B/8B/14B/32B/235B     | qwen3               |
 | [Qwen2-Audio](https://huggingface.co/Qwen)                        | 7B                               | qwen2_audio         |
-| [Qwen2.5-Omni](https://huggingface.co/Qwen)\*\*                   | 3B/7B                            | qwen2_omni          |
+| [Qwen2.5-Omni](https://huggingface.co/Qwen)\*                     | 3B/7B                            | qwen2_omni          |
 | [Qwen2-VL/Qwen2.5-VL/QVQ](https://huggingface.co/Qwen)            | 2B/3B/7B/32B/72B                 | qwen2_vl            |
+| [Seed Coder](https://huggingface.co/ByteDance-Seed)               | 8B                               | seed_coder          |
 | [Skywork o1](https://huggingface.co/Skywork)                      | 8B                               | skywork_o1          |
 | [StarCoder 2](https://huggingface.co/bigcode)                     | 3B/7B/15B                        | -                   |
 | [TeleChat2](https://huggingface.co/Tele-AI)                       | 3B/7B/35B/115B                   | telechat2           |
@@ -410,6 +411,7 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc
 - [COIG-P (en&zh)](https://huggingface.co/datasets/m-a-p/COIG-P)
 - [RLHF-V (en)](https://huggingface.co/datasets/openbmb/RLHF-V-Dataset)
 - [VLFeedback (en)](https://huggingface.co/datasets/Zhihui/VLFeedback)
+- [RLAIF-V (en)](https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset)
 - [Orca DPO Pairs (en)](https://huggingface.co/datasets/Intel/orca_dpo_pairs)
 - [HH-RLHF (en)](https://huggingface.co/datasets/Anthropic/hh-rlhf)
 - [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
@@ -1,5 +1,7 @@
 The [dataset_info.json](dataset_info.json) contains all available datasets. If you are using a custom dataset, please **make sure** to add a *dataset description* in `dataset_info.json` and specify `dataset: dataset_name` before training to use it.
 
+The `dataset_info.json` file should be put in the `dataset_dir` directory. You can change `dataset_dir` to use another directory. The default value is `./data`.
+
 Currently we support datasets in **alpaca** and **sharegpt** format.
 
 ```json
@@ -1,5 +1,7 @@
 [dataset_info.json](dataset_info.json) 包含了所有可用的数据集。如果您希望使用自定义数据集,请**务必**在 `dataset_info.json` 文件中添加*数据集描述*,并通过修改 `dataset: 数据集名称` 配置来使用数据集。
 
+其中 `dataset_info.json` 文件应放置在 `dataset_dir` 目录下。您可以通过修改 `dataset_dir` 参数来使用其他目录。默认值为 `./data`。
+
 目前我们支持 **alpaca** 格式和 **sharegpt** 格式的数据集。
 
 ```json
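For reference, each entry in `dataset_info.json` is keyed by the name you later pass as `dataset:`. Below is a minimal sketch of registering a custom alpaca-format file; `my_dataset` is a made-up name, and the `file_name`/`formatting` keys are assumed to follow the schema used by the built-in entries:

```python
import json

# Hypothetical entry for a custom dataset; "file_name" is resolved relative to
# dataset_dir (./data by default) and "formatting" selects alpaca or sharegpt.
entry = {"my_dataset": {"file_name": "my_dataset.json", "formatting": "alpaca"}}

# dataset_info.json already lists every built-in dataset, so merge, don't overwrite.
with open("data/dataset_info.json", encoding="utf-8") as f:
    dataset_info = json.load(f)

dataset_info.update(entry)

with open("data/dataset_info.json", "w", encoding="utf-8") as f:
    json.dump(dataset_info, f, indent=2, ensure_ascii=False)

# Then select it for training with: dataset: my_dataset
```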
@@ -115,12 +115,7 @@ def get_dataset_list(dataset_names: Optional[list[str]], dataset_dir: str) -> li
     dataset_list: list[DatasetAttr] = []
     for name in dataset_names:
         if dataset_info is None:  # dataset_dir is ONLINE
-            if use_modelscope():
-                load_from = "ms_hub"
-            elif use_openmind():
-                load_from = "om_hub"
-            else:
-                load_from = "hf_hub"
+            load_from = "ms_hub" if use_modelscope() else "om_hub" if use_openmind() else "hf_hub"
             dataset_attr = DatasetAttr(load_from, dataset_name=name)
             dataset_list.append(dataset_attr)
             continue
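The parser change above collapses the hub selection into a single chained conditional expression. Chaining associates to the right, so the checks still run in the original order; a toy sketch with plain booleans standing in for `use_modelscope()` and `use_openmind()`:

```python
# A chained conditional expression parses as:
#   "ms_hub" if use_modelscope else ("om_hub" if use_openmind else "hf_hub")
# so the hub checks run left to right, exactly like the if/elif/else it replaces.
def pick_hub(use_modelscope: bool, use_openmind: bool) -> str:
    return "ms_hub" if use_modelscope else "om_hub" if use_openmind else "hf_hub"

assert pick_hub(True, True) == "ms_hub"    # ModelScope wins when both are set
assert pick_hub(False, True) == "om_hub"
assert pick_hub(False, False) == "hf_hub"
```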
@@ -1622,6 +1622,20 @@ register_template(
 )
 
 
+register_template(
+    name="seed_coder",
+    format_user=StringFormatter(
+        slots=[{"bos_token"}, "user\n{{content}}", {"eos_token"}, {"bos_token"}, "assistant\n"]
+    ),
+    format_system=StringFormatter(slots=[{"bos_token"}, "system\n{{content}}", {"eos_token"}]),
+    default_system=(
+        "You are an AI programming assistant, utilizing the Seed-Coder model, developed by ByteDance Seed, "
+        "and you only answer questions related to computer science. For politically sensitive questions, "
+        "security and privacy issues, and other non-computer science questions, you will refuse to answer.\n\n"
+    ),
+)
+
+
 # copied from llama3 template
 register_template(
     name="skywork_o1",
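Reading the slots off the new `seed_coder` template: `{"bos_token"}` and `{"eos_token"}` are placeholders for the tokenizer's special tokens, each turn carries a plain-text role header, and the user formatter appends the assistant header to cue generation. A minimal sketch of the rendered string, with placeholder token values (the real ones come from the Seed-Coder tokenizer):

```python
# Sketch of how the seed_coder slots assemble into a prompt string.
# BOS/EOS are placeholders, not the model's actual special tokens.
BOS, EOS = "<bos>", "<eos>"

def render_system(content: str) -> str:
    # format_system: [{"bos_token"}, "system\n{{content}}", {"eos_token"}]
    return f"{BOS}system\n{content}{EOS}"

def render_user(content: str) -> str:
    # format_user ends with the assistant header to cue the model's reply:
    # [{"bos_token"}, "user\n{{content}}", {"eos_token"}, {"bos_token"}, "assistant\n"]
    return f"{BOS}user\n{content}{EOS}{BOS}assistant\n"

prompt = render_system("You are an AI programming assistant...") + render_user("Write hello world in C.")
print(prompt)
```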
@@ -237,9 +237,7 @@ class MistralToolUtils(ToolUtils):
     def tool_formatter(tools: list[dict[str, Any]]) -> str:
         wrapped_tools = []
         for tool in tools:
-            wrapped_tools.append(
-                tool if tool.get("type") == "function" else {"type": "function", "function": tool}
-            )
+            wrapped_tools.append(tool if tool.get("type") == "function" else {"type": "function", "function": tool})
 
         return "[AVAILABLE_TOOLS] " + json.dumps(wrapped_tools, ensure_ascii=False) + "[/AVAILABLE_TOOLS]"
 
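The reformatted `tool_formatter` wraps any bare JSON schema into an OpenAI-style function object before serializing, so bare and pre-wrapped tools produce the same envelope. A standalone sketch of that logic; `get_weather` is a made-up schema:

```python
import json
from typing import Any

def wrap_tools(tools: list[dict[str, Any]]) -> str:
    # Restatement of MistralToolUtils.tool_formatter: any bare tool schema is
    # wrapped as {"type": "function", "function": ...} before dumping.
    wrapped = [t if t.get("type") == "function" else {"type": "function", "function": t} for t in tools]
    return "[AVAILABLE_TOOLS] " + json.dumps(wrapped, ensure_ascii=False) + "[/AVAILABLE_TOOLS]"

# A bare schema and an already-wrapped one serialize identically:
bare = {"name": "get_weather", "parameters": {"type": "object", "properties": {}}}
assert wrap_tools([bare]) == wrap_tools([{"type": "function", "function": bare}])
print(wrap_tools([bare]))
```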
@@ -887,12 +887,13 @@ register_model_group(
 
 register_model_group(
     models={
-        "Granite-3.2-1B-A400M-Base": {
+        "Granite-Vision-3.2-2B": {
             DownloadSource.DEFAULT: "ibm-granite/granite-vision-3.2-2b",
             DownloadSource.MODELSCOPE: "AI-ModelScope/granite-vision-3.2-2b",
         },
     },
     template="granite3_vision",
+    multimodal=True,
 )
 
 
@@ -2502,6 +2503,22 @@ register_model_group(
             DownloadSource.DEFAULT: "Qwen/Qwen3-235B-A22B",
             DownloadSource.MODELSCOPE: "Qwen/Qwen3-235B-A22B",
         },
+        "Qwen3-0.6B-Instruct-GPTQ-Int8": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-0.6B-GPTQ-Int8",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-0.6B-GPTQ-Int8",
+        },
+        "Qwen3-1.7B-Instruct-GPTQ-Int8": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-1.7B-GPTQ-Int8",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-1.7B-GPTQ-Int8",
+        },
+        "Qwen3-4B-Instruct-AWQ": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-4B-AWQ",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-4B-AWQ",
+        },
+        "Qwen3-8B-Instruct-AWQ": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-8B-AWQ",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-8B-AWQ",
+        },
         "Qwen3-14B-Instruct-AWQ": {
             DownloadSource.DEFAULT: "Qwen/Qwen3-14B-AWQ",
             DownloadSource.MODELSCOPE: "Qwen/Qwen3-14B-AWQ",
@@ -2510,6 +2527,14 @@ register_model_group(
             DownloadSource.DEFAULT: "Qwen/Qwen3-32B-AWQ",
             DownloadSource.MODELSCOPE: "Qwen/Qwen3-32B-AWQ",
         },
+        "Qwen3-30B-A3B-Instruct-GPTQ-Int4": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-30B-A3B-GPTQ-Int4",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-30B-A3B-GPTQ-Int4",
+        },
+        "Qwen3-235B-A22B-Instruct-GPTQ-Int4": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-235B-A22B-GPTQ-Int4",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-235B-A22B-GPTQ-Int4",
+        },
     },
     template="qwen3",
 )
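Each `register_model_group` entry maps a display name to per-hub repo ids and binds the group to a chat template. A hedged sketch of how one of the newly registered quantized checkpoints would then be referenced, assuming LLaMA-Factory's standard `model_name_or_path` and `template` arguments:

```python
# Sketch: referencing a registered quantized checkpoint in a config.
# The AWQ repo id below is one of the DownloadSource.DEFAULT entries above.
inference_args = {
    "model_name_or_path": "Qwen/Qwen3-8B-AWQ",  # resolved from the registry
    "template": "qwen3",                        # template bound to the group
}
```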
@@ -2651,15 +2676,17 @@ register_model_group(
 
 register_model_group(
     models={
-        "SOLAR-10.7B-v1.0": {
-            DownloadSource.DEFAULT: "upstage/SOLAR-10.7B-v1.0",
+        "Seed-Coder-8B-Base": {
+            DownloadSource.DEFAULT: "ByteDance-Seed/Seed-Coder-8B-Base",
         },
-        "SOLAR-10.7B-Instruct-v1.0": {
-            DownloadSource.DEFAULT: "upstage/SOLAR-10.7B-Instruct-v1.0",
-            DownloadSource.MODELSCOPE: "AI-ModelScope/SOLAR-10.7B-Instruct-v1.0",
+        "Seed-Coder-8B-Instruct": {
+            DownloadSource.DEFAULT: "ByteDance-Seed/Seed-Coder-8B-Instruct",
         },
+        "Seed-Coder-8B-Instruct-Reasoning": {
+            DownloadSource.DEFAULT: "ByteDance-Seed/Seed-Coder-8B-Reasoning-bf16",
+        },
     },
-    template="solar",
+    template="seed_coder",
 )
 
 
@@ -2684,6 +2711,20 @@ register_model_group(
 )
 
 
+register_model_group(
+    models={
+        "SOLAR-10.7B-v1.0": {
+            DownloadSource.DEFAULT: "upstage/SOLAR-10.7B-v1.0",
+        },
+        "SOLAR-10.7B-Instruct-v1.0": {
+            DownloadSource.DEFAULT: "upstage/SOLAR-10.7B-Instruct-v1.0",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/SOLAR-10.7B-Instruct-v1.0",
+        },
+    },
+    template="solar",
+)
+
+
 register_model_group(
     models={
         "StarCoder2-3B": {
@@ -18,7 +18,7 @@ from ...data import TEMPLATES
 from ...extras.constants import METHODS, SUPPORTED_MODELS
 from ...extras.packages import is_gradio_available
 from ..common import save_config
-from ..control import can_quantize, can_quantize_to, get_model_info, list_checkpoints
+from ..control import can_quantize, can_quantize_to, check_template, get_model_info, list_checkpoints
 
 
 if is_gradio_available():
@@ -49,7 +49,7 @@ def create_top() -> dict[str, "Component"]:
 
     model_name.change(get_model_info, [model_name], [model_path, template], queue=False).then(
         list_checkpoints, [model_name, finetuning_type], [checkpoint_path], queue=False
-    )
+    ).then(check_template, [lang, template])
     model_name.input(save_config, inputs=[lang, model_name], queue=False)
     model_path.input(save_config, inputs=[lang, model_name, model_path], queue=False)
     finetuning_type.change(can_quantize, [finetuning_type], [quantization_bit], queue=False).then(
@@ -84,6 +84,17 @@ def get_model_info(model_name: str) -> tuple[str, str]:
     return get_model_path(model_name), get_template(model_name)
 
 
+def check_template(lang: str, template: str) -> None:
+    r"""Check if an instruct model is used.
+
+    Please use queue=True to show the warning message.
+
+    Inputs: top.lang, top.template
+    """
+    if template == "default":
+        gr.Warning(ALERTS["warn_no_instruct"][lang])
+
+
 def get_trainer_info(lang: str, output_path: os.PathLike, do_train: bool) -> tuple[str, "gr.Slider", dict[str, Any]]:
     r"""Get training information for monitor.
 
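Tying the WebUI hunks together: `check_template` is chained after `get_model_info` without `queue=False`, since per its docstring the warning only renders from a queued event. A self-contained sketch of the same pattern; the component names and demo are illustrative, not LLaMA-Factory code:

```python
# Minimal Gradio sketch: a queued .then() step raises gr.Warning when the
# auto-detected template is "default" (i.e. a base, non-instruct model).
import gradio as gr

def detect_template(model_name: str) -> str:
    # Hypothetical stand-in for get_model_info's template lookup.
    return "qwen3" if "Instruct" in model_name else "default"

def warn_if_base(template: str) -> None:
    if template == "default":
        gr.Warning("You are using a non-instruct model, please fine-tune it first.")

with gr.Blocks() as demo:
    model = gr.Textbox(label="Model name")
    template = gr.Textbox(label="Template")
    # queue stays enabled on the second step so the warning toast can render.
    model.change(detect_template, [model], [template], queue=False).then(warn_if_base, [template])

if __name__ == "__main__":
    demo.launch()
```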
@@ -2796,6 +2796,13 @@ ALERTS = {
         "ko": "출력 디렉토리가 이미 존재합니다. 위 출력 디렉토리에 저장된 학습을 재개합니다.",
         "ja": "出力ディレクトリが既に存在します。このチェックポイントからトレーニングを再開します。",
     },
+    "warn_no_instruct": {
+        "en": "You are using a non-instruct model, please fine-tune it first.",
+        "ru": "Вы используете модель без инструкции, пожалуйста, сначала выполните донастройку этой модели.",
+        "zh": "您正在使用非指令模型,请先对其进行微调。",
+        "ko": "지시 튜닝되지 않은 모델을 사용하고 있습니다. 먼저 미세 조정해 주세요.",
+        "ja": "インストラクションモデルを使用していません。まず微調整を行ってください。",
+    },
     "info_aborting": {
         "en": "Aborted, wait for terminating...",
         "ru": "Прервано, ожидание завершения...",