mirror of
https://github.com/hiyouga/LLaMA-Factory.git
synced 2026-02-27 00:05:58 +08:00
[model] update constants (#10220)
This commit is contained in:
10
.github/workflows/docs.yml
vendored
10
.github/workflows/docs.yml
vendored
@@ -25,16 +25,16 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.10'
|
||||
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip install -r docs/requirements.txt
|
||||
|
||||
|
||||
- name: Build Sphinx
|
||||
run: |
|
||||
sphinx-build -b html docs/zh docs/_build/html/zh
|
||||
@@ -56,10 +56,10 @@ jobs:
|
||||
> docs/_build/html/index.html
|
||||
|
||||
touch docs/_build/html/.nojekyll
|
||||
|
||||
|
||||
- name: Setup Pages
|
||||
uses: actions/configure-pages@v5
|
||||
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-pages-artifact@v3
|
||||
with:
|
||||
|
||||
@@ -291,7 +291,7 @@ Read technical notes:
|
||||
| [GPT-2](https://huggingface.co/openai-community) | 0.1B/0.4B/0.8B/1.5B | - |
|
||||
| [GPT-OSS](https://huggingface.co/openai) | 20B/120B | gpt_oss |
|
||||
| [Granite 3-4](https://huggingface.co/ibm-granite) | 1B/2B/3B/7B/8B | granite3/granite4 |
|
||||
| [Hunyuan/Hunyuan1.5 (MT)](https://huggingface.co/tencent/) | 0.5B/1.8B/4B/7B/13B | hunyuan/hunyuan_small |
|
||||
| [Hunyuan/Hunyuan1.5 (MT)](https://huggingface.co/tencent/) | 0.5B/1.8B/4B/7B/13B | hunyuan/hunyuan_small|
|
||||
| [InternLM 2-3](https://huggingface.co/internlm) | 7B/8B/20B | intern2 |
|
||||
| [InternVL 2.5-3.5](https://huggingface.co/OpenGVLab) | 1B/2B/4B/8B/14B/30B/38B/78B/241B | intern_vl |
|
||||
| [Intern-S1-mini](https://huggingface.co/internlm/) | 8B | intern_s1 |
|
||||
@@ -319,6 +319,7 @@ Read technical notes:
|
||||
| [Pixtral](https://huggingface.co/mistralai) | 12B | pixtral |
|
||||
| [Qwen2 (Code/Math/MoE/QwQ)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen |
|
||||
| [Qwen3 (MoE/Instruct/Thinking/Next)](https://huggingface.co/Qwen) | 0.6B/1.7B/4B/8B/14B/32B/80B/235B | qwen3/qwen3_nothink |
|
||||
| [Qwen3.5](https://huggingface.co/Qwen) | 27B/35B/122B/397B | qwen3_5 |
|
||||
| [Qwen2-Audio](https://huggingface.co/Qwen) | 7B | qwen2_audio |
|
||||
| [Qwen2.5-Omni](https://huggingface.co/Qwen) | 3B/7B | qwen2_omni |
|
||||
| [Qwen3-Omni](https://huggingface.co/Qwen) | 30B | qwen3_omni |
|
||||
|
||||
@@ -293,7 +293,7 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc
|
||||
| [GPT-2](https://huggingface.co/openai-community) | 0.1B/0.4B/0.8B/1.5B | - |
|
||||
| [GPT-OSS](https://huggingface.co/openai) | 20B/120B | gpt_oss |
|
||||
| [Granite 3-4](https://huggingface.co/ibm-granite) | 1B/2B/3B/7B/8B | granite3/granite4 |
|
||||
| [Hunyuan/Hunyuan1.5 (MT)](https://huggingface.co/tencent/) | 0.5B/1.8B/4B/7B/13B | hunyuan/hunyuan_small |
|
||||
| [Hunyuan/Hunyuan1.5 (MT)](https://huggingface.co/tencent/) | 0.5B/1.8B/4B/7B/13B | hunyuan/hunyuan_small|
|
||||
| [InternLM 2-3](https://huggingface.co/internlm) | 7B/8B/20B | intern2 |
|
||||
| [InternVL 2.5-3.5](https://huggingface.co/OpenGVLab) | 1B/2B/4B/8B/14B/30B/38B/78B/241B | intern_vl |
|
||||
| [Intern-S1-mini](https://huggingface.co/internlm/) | 8B | intern_s1 |
|
||||
@@ -321,6 +321,7 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc
|
||||
| [Pixtral](https://huggingface.co/mistralai) | 12B | pixtral |
|
||||
| [Qwen2 (Code/Math/MoE/QwQ)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen |
|
||||
| [Qwen3 (MoE/Instruct/Thinking/Next)](https://huggingface.co/Qwen) | 0.6B/1.7B/4B/8B/14B/32B/80B/235B | qwen3/qwen3_nothink |
|
||||
| [Qwen3.5](https://huggingface.co/Qwen) | 27B/35B/122B/397B | qwen3_5 |
|
||||
| [Qwen2-Audio](https://huggingface.co/Qwen) | 7B | qwen2_audio |
|
||||
| [Qwen2.5-Omni](https://huggingface.co/Qwen) | 3B/7B | qwen2_omni |
|
||||
| [Qwen3-Omni](https://huggingface.co/Qwen) | 30B | qwen3_omni |
|
||||
|
||||
1
docs/_static/css/lang-switcher.css
vendored
1
docs/_static/css/lang-switcher.css
vendored
@@ -47,4 +47,3 @@
|
||||
border-color: rgba(255, 255, 255, 0.45);
|
||||
box-shadow: 0 0 0 3px rgba(255, 255, 255, 0.12);
|
||||
}
|
||||
|
||||
|
||||
28
docs/conf.py
28
docs/conf.py
@@ -1,33 +1,31 @@
|
||||
# Configuration file for the Sphinx documentation builder.
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Define common settings here
|
||||
project = 'LlamaFactory'
|
||||
copyright = '2024, LlamaFactory Team'
|
||||
author = 'LlamaFactory Team'
|
||||
project = "LlamaFactory"
|
||||
copyright = "2024, LlamaFactory Team"
|
||||
author = "LlamaFactory Team"
|
||||
|
||||
extensions = [
|
||||
'sphinx.ext.autodoc',
|
||||
'sphinx.ext.viewcode',
|
||||
'sphinx.ext.napoleon',
|
||||
'myst_parser',
|
||||
"sphinx.ext.autodoc",
|
||||
"sphinx.ext.viewcode",
|
||||
"sphinx.ext.napoleon",
|
||||
"myst_parser",
|
||||
]
|
||||
|
||||
templates_path = ['_templates']
|
||||
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
|
||||
templates_path = ["_templates"]
|
||||
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
|
||||
|
||||
html_theme = 'sphinx_rtd_theme'
|
||||
html_theme = "sphinx_rtd_theme"
|
||||
|
||||
html_static_path = ['_static']
|
||||
html_static_path = ["_static"]
|
||||
|
||||
html_js_files = [
|
||||
'js/switcher.js',
|
||||
"js/switcher.js",
|
||||
]
|
||||
|
||||
html_css_files = [
|
||||
'css/lang-switcher.css',
|
||||
"css/lang-switcher.css",
|
||||
]
|
||||
|
||||
myst_enable_extensions = [
|
||||
|
||||
@@ -1,20 +1,22 @@
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Add parent dir to path to allow importing conf.py
|
||||
sys.path.insert(0, os.path.abspath('..'))
|
||||
|
||||
from conf import *
|
||||
# Add parent dir to path to allow importing conf.py
|
||||
sys.path.insert(0, os.path.abspath(".."))
|
||||
|
||||
from conf import * # noqa: F403
|
||||
|
||||
|
||||
# Language settings
|
||||
language = 'en'
|
||||
html_search_language = 'en'
|
||||
language = "en"
|
||||
html_search_language = "en"
|
||||
|
||||
# Static files
|
||||
# Point to the root _static directory
|
||||
html_static_path = ['../_static']
|
||||
html_static_path = ["../_static"]
|
||||
|
||||
# Add custom JS for language switcher
|
||||
html_js_files = [
|
||||
'js/switcher.js',
|
||||
"js/switcher.js",
|
||||
]
|
||||
|
||||
@@ -1,20 +1,22 @@
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Add parent dir to path to allow importing conf.py
|
||||
sys.path.insert(0, os.path.abspath('..'))
|
||||
|
||||
from conf import *
|
||||
# Add parent dir to path to allow importing conf.py
|
||||
sys.path.insert(0, os.path.abspath(".."))
|
||||
|
||||
from conf import * # noqa: F403
|
||||
|
||||
|
||||
# Language settings
|
||||
language = 'zh_CN'
|
||||
html_search_language = 'zh'
|
||||
language = "zh_CN"
|
||||
html_search_language = "zh"
|
||||
|
||||
# Static files
|
||||
# Point to the root _static directory
|
||||
html_static_path = ['../_static']
|
||||
html_static_path = ["../_static"]
|
||||
|
||||
# Add custom JS for language switcher
|
||||
html_js_files = [
|
||||
'js/switcher.js',
|
||||
"js/switcher.js",
|
||||
]
|
||||
|
||||
@@ -6,14 +6,14 @@ template: qwen3_nothink
|
||||
|
||||
kernel_config:
|
||||
name: auto
|
||||
include_kernels: auto
|
||||
include_kernels: auto
|
||||
|
||||
dist_config:
|
||||
name: deepspeed
|
||||
config_file: examples/deepspeed/ds_z3_config.json
|
||||
|
||||
### data
|
||||
train_dataset: data/v1_sft_demo.yaml
|
||||
train_dataset: data/v1_sft_demo.yaml
|
||||
|
||||
### training
|
||||
output_dir: outputs/Qwen3-0.6B-deepspeed
|
||||
@@ -22,4 +22,3 @@ cutoff_len: 2048
|
||||
learning_rate: 1.0e-4
|
||||
bf16: true
|
||||
max_steps: 10
|
||||
|
||||
|
||||
@@ -2810,6 +2810,29 @@ register_model_group(
|
||||
)
|
||||
|
||||
|
||||
register_model_group(
|
||||
models={
|
||||
"Qwen3.5-27B": {
|
||||
DownloadSource.DEFAULT: "Qwen/Qwen3.5-27B",
|
||||
DownloadSource.MODELSCOPE: "Qwen/Qwen3.5-27B",
|
||||
},
|
||||
"Qwen3.5-35B-A3B": {
|
||||
DownloadSource.DEFAULT: "Qwen/Qwen3.5-35B-A3B",
|
||||
DownloadSource.MODELSCOPE: "Qwen/Qwen3.5-35B-A3B",
|
||||
},
|
||||
"Qwen3.5-122B-A10B": {
|
||||
DownloadSource.DEFAULT: "Qwen/Qwen3.5-122B-A10B",
|
||||
DownloadSource.MODELSCOPE: "Qwen/Qwen3.5-122B-A10B",
|
||||
},
|
||||
"Qwen3.5-397B-A17B": {
|
||||
DownloadSource.DEFAULT: "Qwen/Qwen3.5-397B-A17B",
|
||||
DownloadSource.MODELSCOPE: "Qwen/Qwen3.5-397B-A17B",
|
||||
},
|
||||
},
|
||||
template="qwen3_5",
|
||||
)
|
||||
|
||||
|
||||
register_model_group(
|
||||
models={
|
||||
"Qwen2-Audio-7B": {
|
||||
|
||||
@@ -147,6 +147,7 @@ def add_z3_leaf_module(model: "PreTrainedModel") -> None:
|
||||
|
||||
_set_z3_leaf_modules(model, [Qwen3NextSparseMoeBlock])
|
||||
|
||||
|
||||
def configure_moe(config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool) -> None:
|
||||
if not is_trainable or not model_args.moe_aux_loss_coef:
|
||||
return
|
||||
|
||||
@@ -110,6 +110,7 @@ def _freeze_model_parameters(model: Any, finetuning_args: "FinetuningArguments")
|
||||
if any(name.startswith(k) for k in params_to_freeze):
|
||||
p.requires_grad_(False)
|
||||
|
||||
|
||||
def run_pt(
|
||||
model_args: "ModelArguments",
|
||||
data_args: "DataArguments",
|
||||
|
||||
@@ -122,6 +122,8 @@ def test_multimodal_collator():
|
||||
**tokenizer_module["processor"].image_processor(fake_image),
|
||||
}
|
||||
if not is_transformers_version_greater_than("5.0.0"):
|
||||
# adapt position_ids and rope_deltas for transformers < 5.0.0
|
||||
# https://github.com/huggingface/transformers/pull/43972
|
||||
expected_input["position_ids"] = [[[0, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1]]] * 3
|
||||
expected_input["rope_deltas"] = [[-8]]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user