[model] update constants (#10220)

2026-06-10 17:28:55 +08:00 · 2026-02-26 21:13:56 +08:00
parent 2b8b871475
commit 122cd46084
12 changed files with 69 additions and 40 deletions
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -25,16 +25,16 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
-      
+
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
-          
+
      - name: Install dependencies
        run: |
          pip install -r docs/requirements.txt
-          
+
      - name: Build Sphinx
        run: |
          sphinx-build -b html docs/zh docs/_build/html/zh
@@ -56,10 +56,10 @@ jobs:
            > docs/_build/html/index.html
          touch docs/_build/html/.nojekyll
-          
+
      - name: Setup Pages
        uses: actions/configure-pages@v5
-        
+
      - name: Upload artifact
        uses: actions/upload-pages-artifact@v3
        with:
--- a/README.md
+++ b/README.md
@@ -291,7 +291,7 @@ Read technical notes:
 | [GPT-2](https://huggingface.co/openai-community)                  | 0.1B/0.4B/0.8B/1.5B              | -                    |
 | [GPT-OSS](https://huggingface.co/openai)                          | 20B/120B                         | gpt_oss              |
 | [Granite 3-4](https://huggingface.co/ibm-granite)                 | 1B/2B/3B/7B/8B                   | granite3/granite4    |
-| [Hunyuan/Hunyuan1.5 (MT)](https://huggingface.co/tencent/)        | 0.5B/1.8B/4B/7B/13B              | hunyuan/hunyuan_small |
+| [Hunyuan/Hunyuan1.5 (MT)](https://huggingface.co/tencent/)        | 0.5B/1.8B/4B/7B/13B              | hunyuan/hunyuan_small|
 | [InternLM 2-3](https://huggingface.co/internlm)                   | 7B/8B/20B                        | intern2              |
 | [InternVL 2.5-3.5](https://huggingface.co/OpenGVLab)              | 1B/2B/4B/8B/14B/30B/38B/78B/241B | intern_vl            |
 | [Intern-S1-mini](https://huggingface.co/internlm/)                | 8B                               | intern_s1            |
@@ -319,6 +319,7 @@ Read technical notes:
 | [Pixtral](https://huggingface.co/mistralai)                       | 12B                              | pixtral              |
 | [Qwen2 (Code/Math/MoE/QwQ)](https://huggingface.co/Qwen)          | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen                 |
 | [Qwen3 (MoE/Instruct/Thinking/Next)](https://huggingface.co/Qwen) | 0.6B/1.7B/4B/8B/14B/32B/80B/235B | qwen3/qwen3_nothink  |
+| [Qwen3.5](https://huggingface.co/Qwen)                            | 27B/35B/122B/397B                | qwen3_5              |
 | [Qwen2-Audio](https://huggingface.co/Qwen)                        | 7B                               | qwen2_audio          |
 | [Qwen2.5-Omni](https://huggingface.co/Qwen)                       | 3B/7B                            | qwen2_omni           |
 | [Qwen3-Omni](https://huggingface.co/Qwen)                         | 30B                              | qwen3_omni           |
--- a/README_zh.md
+++ b/README_zh.md
@@ -293,7 +293,7 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc
 | [GPT-2](https://huggingface.co/openai-community)                  | 0.1B/0.4B/0.8B/1.5B              | -                    |
 | [GPT-OSS](https://huggingface.co/openai)                          | 20B/120B                         | gpt_oss              |
 | [Granite 3-4](https://huggingface.co/ibm-granite)                 | 1B/2B/3B/7B/8B                   | granite3/granite4    |
-| [Hunyuan/Hunyuan1.5 (MT)](https://huggingface.co/tencent/)        | 0.5B/1.8B/4B/7B/13B              | hunyuan/hunyuan_small |
+| [Hunyuan/Hunyuan1.5 (MT)](https://huggingface.co/tencent/)        | 0.5B/1.8B/4B/7B/13B              | hunyuan/hunyuan_small|
 | [InternLM 2-3](https://huggingface.co/internlm)                   | 7B/8B/20B                        | intern2              |
 | [InternVL 2.5-3.5](https://huggingface.co/OpenGVLab)              | 1B/2B/4B/8B/14B/30B/38B/78B/241B | intern_vl            |
 | [Intern-S1-mini](https://huggingface.co/internlm/)                | 8B                               | intern_s1            |
@@ -321,6 +321,7 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc
 | [Pixtral](https://huggingface.co/mistralai)                       | 12B                              | pixtral              |
 | [Qwen2 (Code/Math/MoE/QwQ)](https://huggingface.co/Qwen)          | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen                 |
 | [Qwen3 (MoE/Instruct/Thinking/Next)](https://huggingface.co/Qwen) | 0.6B/1.7B/4B/8B/14B/32B/80B/235B | qwen3/qwen3_nothink  |
+| [Qwen3.5](https://huggingface.co/Qwen)                            | 27B/35B/122B/397B                | qwen3_5              |
 | [Qwen2-Audio](https://huggingface.co/Qwen)                        | 7B                               | qwen2_audio          |
 | [Qwen2.5-Omni](https://huggingface.co/Qwen)                       | 3B/7B                            | qwen2_omni           |
 | [Qwen3-Omni](https://huggingface.co/Qwen)                         | 30B                              | qwen3_omni           |
--- a/docs/_static/css/lang-switcher.css
+++ b/docs/_static/css/lang-switcher.css
@@ -47,4 +47,3 @@
  border-color: rgba(255, 255, 255, 0.45);
  box-shadow: 0 0 0 3px rgba(255, 255, 255, 0.12);
 }
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -1,33 +1,31 @@
 # Configuration file for the Sphinx documentation builder.
 import os
 import sys
 # Define common settings here
-project = 'LlamaFactory'
+project = "LlamaFactory"
-copyright = '2024, LlamaFactory Team'
+copyright = "2024, LlamaFactory Team"
-author = 'LlamaFactory Team'
+author = "LlamaFactory Team"
 extensions = [
-    'sphinx.ext.autodoc',
+    "sphinx.ext.autodoc",
-    'sphinx.ext.viewcode',
+    "sphinx.ext.viewcode",
-    'sphinx.ext.napoleon',
+    "sphinx.ext.napoleon",
-    'myst_parser',
+    "myst_parser",
 ]
-templates_path = ['_templates']
+templates_path = ["_templates"]
-exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
-html_theme = 'sphinx_rtd_theme'
+html_theme = "sphinx_rtd_theme"
-html_static_path = ['_static']
+html_static_path = ["_static"]
 html_js_files = [
-    'js/switcher.js',
+    "js/switcher.js",
 ]
 html_css_files = [
-    'css/lang-switcher.css',
+    "css/lang-switcher.css",
 ]
 myst_enable_extensions = [
--- a/docs/en/conf.py
+++ b/docs/en/conf.py
@@ -1,20 +1,22 @@
 import os
 import sys
-# Add parent dir to path to allow importing conf.py
-sys.path.insert(0, os.path.abspath('..'))
-from conf import *
+# Add parent dir to path to allow importing conf.py
+sys.path.insert(0, os.path.abspath(".."))
+from conf import *  # noqa: F403
 # Language settings
-language = 'en'
+language = "en"
-html_search_language = 'en'
+html_search_language = "en"
 # Static files
 # Point to the root _static directory
-html_static_path = ['../_static']
+html_static_path = ["../_static"]
 # Add custom JS for language switcher
 html_js_files = [
-    'js/switcher.js',
+    "js/switcher.js",
 ]
--- a/docs/zh/conf.py
+++ b/docs/zh/conf.py
@@ -1,20 +1,22 @@
 import os
 import sys
-# Add parent dir to path to allow importing conf.py
-sys.path.insert(0, os.path.abspath('..'))
-from conf import *
+# Add parent dir to path to allow importing conf.py
+sys.path.insert(0, os.path.abspath(".."))
+from conf import *  # noqa: F403
 # Language settings
-language = 'zh_CN'
+language = "zh_CN"
-html_search_language = 'zh'
+html_search_language = "zh"
 # Static files
 # Point to the root _static directory
-html_static_path = ['../_static']
+html_static_path = ["../_static"]
 # Add custom JS for language switcher
 html_js_files = [
-    'js/switcher.js',
+    "js/switcher.js",
 ]
--- a/examples/v1/train_full/train_full_deepspeed.yaml
+++ b/examples/v1/train_full/train_full_deepspeed.yaml
@@ -6,14 +6,14 @@ template: qwen3_nothink
 kernel_config:
    name: auto
-    include_kernels: auto 
+    include_kernels: auto
 dist_config:
    name: deepspeed
    config_file: examples/deepspeed/ds_z3_config.json
 ### data
-train_dataset: data/v1_sft_demo.yaml 
+train_dataset: data/v1_sft_demo.yaml
 ### training
 output_dir: outputs/Qwen3-0.6B-deepspeed
@@ -22,4 +22,3 @@ cutoff_len: 2048
 learning_rate: 1.0e-4
 bf16: true
 max_steps: 10
--- a/src/llamafactory/extras/constants.py
+++ b/src/llamafactory/extras/constants.py
@@ -2810,6 +2810,29 @@ register_model_group(
 )
+register_model_group(
+    models={
+        "Qwen3.5-27B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3.5-27B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3.5-27B",
+        },
+        "Qwen3.5-35B-A3B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3.5-35B-A3B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3.5-35B-A3B",
+        },
+        "Qwen3.5-122B-A10B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3.5-122B-A10B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3.5-122B-A10B",
+        },
+        "Qwen3.5-397B-A17B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3.5-397B-A17B",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3.5-397B-A17B",
+        },
+    },
+    template="qwen3_5",
+)
 register_model_group(
    models={
        "Qwen2-Audio-7B": {
--- a/src/llamafactory/model/model_utils/moe.py
+++ b/src/llamafactory/model/model_utils/moe.py
@@ -147,6 +147,7 @@ def add_z3_leaf_module(model: "PreTrainedModel") -> None:
        _set_z3_leaf_modules(model, [Qwen3NextSparseMoeBlock])
 def configure_moe(config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool) -> None:
    if not is_trainable or not model_args.moe_aux_loss_coef:
        return
--- a/src/llamafactory/train/mca/workflow.py
+++ b/src/llamafactory/train/mca/workflow.py
@@ -110,6 +110,7 @@ def _freeze_model_parameters(model: Any, finetuning_args: "FinetuningArguments")
            if any(name.startswith(k) for k in params_to_freeze):
                p.requires_grad_(False)
 def run_pt(
    model_args: "ModelArguments",
    data_args: "DataArguments",
--- a/tests/data/test_collator.py
+++ b/tests/data/test_collator.py
@@ -122,6 +122,8 @@ def test_multimodal_collator():
        **tokenizer_module["processor"].image_processor(fake_image),
    }
    if not is_transformers_version_greater_than("5.0.0"):
        # adapt position_ids and rope_deltas for transformers < 5.0.0
        # https://github.com/huggingface/transformers/pull/43972
        expected_input["position_ids"] = [[[0, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1]]] * 3
        expected_input["rope_deltas"] = [[-8]]