From 95ac3f2373b82662c1bd855c284d3379e6a763d3 Mon Sep 17 00:00:00 2001
From: Yaowei Zheng <hiyouga@buaa.edu.cn>
Date: Wed, 31 Dec 2025 22:22:40 +0800
Subject: [PATCH] [release] Bye 2025 (#9702)

---
 .github/workflows/tests.yml                   |  2 +-
 README.md                                     | 12 ++--
 README_zh.md                                  | 12 ++--
 examples/README.md                            | 70 ++++++++-----------
 examples/README_zh.md                         | 70 ++++++++-----------
 examples/inference/llama3_lora_sft.yaml       |  5 --
 .../inference/{qwen2_5vl.yaml => qwen3.yaml}  |  4 +-
 ...ama3_full_sft.yaml => qwen3_full_sft.yaml} |  4 +-
 examples/inference/qwen3_lora_sft.yaml        |  5 ++
 .../inference/{llama3.yaml => qwen3vl.yaml}   |  4 +-
 .../infer_lora}/deepseek2_lora_sft_kt.yaml    |  0
 .../infer_lora}/deepseek3_kt.yaml             |  0
 .../infer_lora}/deepseek3_lora_sft_kt.yaml    |  0
 .../infer_lora}/qwen3moe_lora_sft_kt.yaml     |  0
 .../DeepSeek-V2-Chat-sft-amx.yaml             |  0
 .../kt_optimize_rules/DeepSeek-V2-Chat.yaml   |  0
 ...epSeek-V2-Lite-Chat-sft-amx-multi-gpu.yaml |  0
 .../DeepSeek-V2-Lite-Chat-sft-amx.yaml        |  0
 .../DeepSeek-V2-Lite-Chat-sft.yaml            |  0
 .../DeepSeek-V2-Lite-Chat.yaml                |  0
 .../DeepSeek-V3-Chat-amx.yaml                 |  0
 .../DeepSeek-V3-Chat-sft-amx-multi-gpu-4.yaml |  0
 .../DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml   |  0
 .../DeepSeek-V3-Chat-sft-amx.yaml             |  0
 .../kt_optimize_rules/Qwen3Moe-sft-amx.yaml   |  0
 .../train_lora/deepseek2_lora_sft_kt.yaml     |  0
 .../train_lora/deepseek3_lora_sft_kt.yaml     |  0
 .../train_lora/qwen3moe_lora_sft_kt.yaml      |  0
 ...ama3_full_sft.yaml => qwen3_full_sft.yaml} |  6 +-
 .../{llama3_gptq.yaml => qwen3_gptq.yaml}     |  6 +-
 ..._5vl_lora_sft.yaml => qwen3_lora_sft.yaml} |  8 +--
 ...a3_lora_sft.yaml => qwen3vl_lora_sft.yaml} |  8 +--
 ...ama3_full_sft.yaml => qwen3_full_sft.yaml} |  7 +-
 .../train_full/qwen3_full_sft_autotp.yaml     | 46 ------------
 ...vl_full_sft.yaml => qwen3vl_full_sft.yaml} |  7 +-
 examples/train_lora/llama3_lora_eval.yaml     | 19 -----
 examples/train_lora/llama3_lora_ppo.yaml      | 43 ------------
 examples/train_lora/llama3_lora_sft.yaml      | 46 ------------
 examples/train_lora/llama4_lora_sft_ds3.yaml  | 49 -------------
 ...ama3_lora_dpo.yaml => qwen3_lora_dpo.yaml} |  7 +-
 ...ama3_lora_kto.yaml => qwen3_lora_kto.yaml} |  7 +-
 ...pretrain.yaml => qwen3_lora_pretrain.yaml} |  5 +-
 ...ora_reward.yaml => qwen3_lora_reward.yaml} |  7 +-
 .../{llama3_lora_sft.sh => qwen3_lora_sft.sh} |  7 +-
 ...{gpt_lora_sft.yaml => qwen3_lora_sft.yaml} |  7 +-
 ...a_sft_ds3.yaml => qwen3_lora_sft_ds3.yaml} |  7 +-
 ...a_sft_ray.yaml => qwen3_lora_sft_ray.yaml} |  7 +-
 ..._preprocess.yaml => qwen3_preprocess.yaml} | 11 ++-
 ...vl_lora_dpo.yaml => qwen3vl_lora_dpo.yaml} |  7 +-
 ...vl_lora_sft.yaml => qwen3vl_lora_sft.yaml} |  7 +-
 .../train_qlora/llama3_lora_sft_aqlm.yaml     |  1 -
 examples/train_qlora/llama3_lora_sft_awq.yaml |  1 -
 .../train_qlora/llama3_lora_sft_gptq.yaml     |  1 -
 ...b_npu.yaml => qwen3_lora_sft_bnb_npu.yaml} |  7 +-
 ...sft_otfq.yaml => qwen3_lora_sft_otfq.yaml} |  7 +-
 pyproject.toml                                |  3 +-
 scripts/vllm_infer.py                         | 19 +++--
 src/llamafactory/extras/env.py                |  2 +-
 tests_v1/config/test_args_parser.py           |  2 +-
 59 files changed, 154 insertions(+), 401 deletions(-)
 delete mode 100644 examples/inference/llama3_lora_sft.yaml
 rename examples/inference/{qwen2_5vl.yaml => qwen3.yaml} (59%)
 rename examples/inference/{llama3_full_sft.yaml => qwen3_full_sft.yaml} (60%)
 create mode 100644 examples/inference/qwen3_lora_sft.yaml
 rename examples/inference/{llama3.yaml => qwen3vl.yaml} (59%)
 rename examples/{inference => ktransformers/infer_lora}/deepseek2_lora_sft_kt.yaml (100%)
 rename examples/{inference => ktransformers/infer_lora}/deepseek3_kt.yaml (100%)
 rename examples/{inference => ktransformers/infer_lora}/deepseek3_lora_sft_kt.yaml (100%)
 rename examples/{inference => ktransformers/infer_lora}/qwen3moe_lora_sft_kt.yaml (100%)
 rename examples/{ => ktransformers}/kt_optimize_rules/DeepSeek-V2-Chat-sft-amx.yaml (100%)
 rename examples/{ => ktransformers}/kt_optimize_rules/DeepSeek-V2-Chat.yaml (100%)
 rename examples/{ => ktransformers}/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx-multi-gpu.yaml (100%)
 rename examples/{ => ktransformers}/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml (100%)
 rename examples/{ => ktransformers}/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft.yaml (100%)
 rename examples/{ => ktransformers}/kt_optimize_rules/DeepSeek-V2-Lite-Chat.yaml (100%)
 rename examples/{ => ktransformers}/kt_optimize_rules/DeepSeek-V3-Chat-amx.yaml (100%)
 rename examples/{ => ktransformers}/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu-4.yaml (100%)
 rename examples/{ => ktransformers}/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml (100%)
 rename examples/{ => ktransformers}/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx.yaml (100%)
 rename examples/{ => ktransformers}/kt_optimize_rules/Qwen3Moe-sft-amx.yaml (100%)
 rename examples/{ => ktransformers}/train_lora/deepseek2_lora_sft_kt.yaml (100%)
 rename examples/{ => ktransformers}/train_lora/deepseek3_lora_sft_kt.yaml (100%)
 rename examples/{ => ktransformers}/train_lora/qwen3moe_lora_sft_kt.yaml (100%)
 rename examples/merge_lora/{llama3_full_sft.yaml => qwen3_full_sft.yaml} (56%)
 rename examples/merge_lora/{llama3_gptq.yaml => qwen3_gptq.yaml} (66%)
 rename examples/merge_lora/{qwen2_5vl_lora_sft.yaml => qwen3_lora_sft.yaml} (58%)
 rename examples/merge_lora/{llama3_lora_sft.yaml => qwen3vl_lora_sft.yaml} (57%)
 rename examples/train_full/{llama3_full_sft.yaml => qwen3_full_sft.yaml} (87%)
 delete mode 100644 examples/train_full/qwen3_full_sft_autotp.yaml
 rename examples/train_full/{qwen2_5vl_full_sft.yaml => qwen3vl_full_sft.yaml} (87%)
 delete mode 100644 examples/train_lora/llama3_lora_eval.yaml
 delete mode 100644 examples/train_lora/llama3_lora_ppo.yaml
 delete mode 100644 examples/train_lora/llama3_lora_sft.yaml
 delete mode 100644 examples/train_lora/llama4_lora_sft_ds3.yaml
 rename examples/train_lora/{llama3_lora_dpo.yaml => qwen3_lora_dpo.yaml} (86%)
 rename examples/train_lora/{llama3_lora_kto.yaml => qwen3_lora_kto.yaml} (84%)
 rename examples/train_lora/{llama3_lora_pretrain.yaml => qwen3_lora_pretrain.yaml} (86%)
 rename examples/train_lora/{llama3_lora_reward.yaml => qwen3_lora_reward.yaml} (85%)
 rename examples/train_lora/{llama3_lora_sft.sh => qwen3_lora_sft.sh} (84%)
 rename examples/train_lora/{gpt_lora_sft.yaml => qwen3_lora_sft.yaml} (87%)
 rename examples/train_lora/{llama3_lora_sft_ds3.yaml => qwen3_lora_sft_ds3.yaml} (87%)
 rename examples/train_lora/{llama3_lora_sft_ray.yaml => qwen3_lora_sft_ray.yaml} (87%)
 rename examples/train_lora/{llama3_preprocess.yaml => qwen3_preprocess.yaml} (58%)
 rename examples/train_lora/{qwen2_5vl_lora_dpo.yaml => qwen3vl_lora_dpo.yaml} (87%)
 rename examples/train_lora/{qwen2_5vl_lora_sft.yaml => qwen3vl_lora_sft.yaml} (86%)
 rename examples/train_qlora/{llama3_lora_sft_bnb_npu.yaml => qwen3_lora_sft_bnb_npu.yaml} (86%)
 rename examples/train_qlora/{llama3_lora_sft_otfq.yaml => qwen3_lora_sft_otfq.yaml} (86%)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index a10d7d850..f4166aab1 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -27,7 +27,7 @@ jobs:
         python:
           - "3.11"
           - "3.12"
-          # - "3.13" # enable after trl is upgraded
+          - "3.13"
         os:
           - "ubuntu-latest"
           - "windows-latest"
diff --git a/README.md b/README.md
index 84d468e19..b530e6087 100644
--- a/README.md
+++ b/README.md
@@ -639,7 +639,7 @@ cd transformers
 pip install .
 ```
 
-3. Set `double_quantization: false` in the configuration. You can refer to the [example](examples/train_qlora/llama3_lora_sft_bnb_npu.yaml).
+3. Set `double_quantization: false` in the configuration. You can refer to the [example](examples/train_qlora/qwen3_lora_sft_bnb_npu.yaml).
 
 </details>
 
@@ -654,12 +654,12 @@ You can also use **[Easy Dataset](https://github.com/ConardLi/easy-dataset)**, *
 
 ### Quickstart
 
-Use the following 3 commands to run LoRA **fine-tuning**, **inference** and **merging** of the Llama3-8B-Instruct model, respectively.
+Use the following 3 commands to run LoRA **fine-tuning**, **inference** and **merging** of the Qwen3-4B-Instruct model, respectively.
 
 ```bash
-llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
-llamafactory-cli chat examples/inference/llama3_lora_sft.yaml
-llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
+llamafactory-cli train examples/train_lora/qwen3_lora_sft.yaml
+llamafactory-cli chat examples/inference/qwen3_lora_sft.yaml
+llamafactory-cli export examples/merge_lora/qwen3_lora_sft.yaml
 ```
 
 See [examples/README.md](examples/README.md) for advanced usage (including distributed training).
@@ -782,7 +782,7 @@ When building the Docker image, use `-v ./hf_cache:/root/.cache/huggingface` arg
 ### Deploy with OpenAI-style API and vLLM
 
 ```bash
-API_PORT=8000 llamafactory-cli api examples/inference/llama3.yaml infer_backend=vllm vllm_enforce_eager=true
+API_PORT=8000 llamafactory-cli api examples/inference/qwen3.yaml infer_backend=vllm vllm_enforce_eager=true
 ```
 
 > [!TIP]
diff --git a/README_zh.md b/README_zh.md
index 3af79201c..8b786aabb 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -641,7 +641,7 @@ cd transformers
 pip install .
 ```
 
-3. 在训练参数中设置 `double_quantization: false`，可参考[示例](examples/train_qlora/llama3_lora_sft_bnb_npu.yaml)。
+3. 在训练参数中设置 `double_quantization: false`，可参考[示例](examples/train_qlora/qwen3_lora_sft_bnb_npu.yaml)。
 
 </details>
 
@@ -656,12 +656,12 @@ pip install .
 
 ### 快速开始
 
-下面三行命令分别对 Llama3-8B-Instruct 模型进行 LoRA **微调**、**推理**和**合并**。
+下面三行命令分别对 Qwen3-4B-Instruct 模型进行 LoRA **微调**、**推理**和**合并**。
 
 ```bash
-llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
-llamafactory-cli chat examples/inference/llama3_lora_sft.yaml
-llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
+llamafactory-cli train examples/train_lora/qwen3_lora_sft.yaml
+llamafactory-cli chat examples/inference/qwen3_lora_sft.yaml
+llamafactory-cli export examples/merge_lora/qwen3_lora_sft.yaml
 ```
 
 高级用法请参考 [examples/README_zh.md](examples/README_zh.md)（包括多 GPU 微调）。
@@ -787,7 +787,7 @@ docker exec -it llamafactory bash
 ### 利用 vLLM 部署 OpenAI API
 
 ```bash
-API_PORT=8000 llamafactory-cli api examples/inference/llama3.yaml infer_backend=vllm vllm_enforce_eager=true
+API_PORT=8000 llamafactory-cli api examples/inference/qwen3.yaml infer_backend=vllm vllm_enforce_eager=true
 ```
 
 > [!TIP]
diff --git a/examples/README.md b/examples/README.md
index 3fa7b1d1d..1d79d574c 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -18,19 +18,19 @@ By default, LLaMA-Factory uses all visible computing devices.
 Basic usage:
 
 ```bash
-llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
+llamafactory-cli train examples/train_lora/qwen3_lora_sft.yaml
 ```
 
 Advanced usage:
 
 ```bash
-CUDA_VISIBLE_DEVICES=0,1 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml \
+CUDA_VISIBLE_DEVICES=0,1 llamafactory-cli train examples/train_lora/qwen3_lora_sft.yaml \
     learning_rate=1e-5 \
     logging_steps=1
 ```
 
 ```bash
-bash examples/train_lora/llama3_lora_sft.sh
+bash examples/train_lora/qwen3_lora_sft.sh
 ```
 
 ## Examples
@@ -40,49 +40,43 @@ bash examples/train_lora/llama3_lora_sft.sh
 #### (Continuous) Pre-Training
 
 ```bash
-llamafactory-cli train examples/train_lora/llama3_lora_pretrain.yaml
+llamafactory-cli train examples/train_lora/qwen3_lora_pretrain.yaml
 ```
 
 #### Supervised Fine-Tuning
 
 ```bash
-llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
+llamafactory-cli train examples/train_lora/qwen3_lora_sft.yaml
 ```
 
 #### Multimodal Supervised Fine-Tuning
 
 ```bash
-llamafactory-cli train examples/train_lora/qwen2_5vl_lora_sft.yaml
+llamafactory-cli train examples/train_lora/qwen3vl_lora_sft.yaml
 ```
 
 #### DPO/ORPO/SimPO Training
 
 ```bash
-llamafactory-cli train examples/train_lora/llama3_lora_dpo.yaml
+llamafactory-cli train examples/train_lora/qwen3_lora_dpo.yaml
 ```
 
 #### Multimodal DPO/ORPO/SimPO Training
 
 ```bash
-llamafactory-cli train examples/train_lora/qwen2_5vl_lora_dpo.yaml
+llamafactory-cli train examples/train_lora/qwen3vl_lora_dpo.yaml
 ```
 
 #### Reward Modeling
 
 ```bash
-llamafactory-cli train examples/train_lora/llama3_lora_reward.yaml
-```
-
-#### PPO Training
-
-```bash
-llamafactory-cli train examples/train_lora/llama3_lora_ppo.yaml
+llamafactory-cli train examples/train_lora/qwen3_lora_reward.yaml
 ```
 
 #### KTO Training
 
 ```bash
-llamafactory-cli train examples/train_lora/llama3_lora_kto.yaml
+llamafactory-cli train examples/train_lora/qwen3_lora_kto.yaml
 ```
 
 #### Preprocess Dataset
@@ -90,32 +84,26 @@ llamafactory-cli train examples/train_lora/llama3_lora_kto.yaml
 It is useful for large dataset, use `tokenized_path` in config to load the preprocessed dataset.
 
 ```bash
-llamafactory-cli train examples/train_lora/llama3_preprocess.yaml
-```
-
-#### Evaluating on MMLU/CMMLU/C-Eval Benchmarks
-
-```bash
-llamafactory-cli eval examples/train_lora/llama3_lora_eval.yaml
+llamafactory-cli train examples/train_lora/qwen3_preprocess.yaml
 ```
 
 #### Supervised Fine-Tuning on Multiple Nodes
 
 ```bash
-FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
-FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/qwen3_lora_sft.yaml
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/qwen3_lora_sft.yaml
 ```
 
 #### Supervised Fine-Tuning with DeepSpeed ZeRO-3 (Weight Sharding)
 
 ```bash
-FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ds3.yaml
+FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/qwen3_lora_sft_ds3.yaml
 ```
 
 #### Supervised Fine-Tuning with Ray on 4 GPUs
 
 ```bash
-USE_RAY=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ray.yaml
+USE_RAY=1 llamafactory-cli train examples/train_lora/qwen3_lora_sft_ray.yaml
 ```
 
 ### QLoRA Fine-Tuning
@@ -123,13 +111,13 @@ USE_RAY=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ray.yaml
 #### Supervised Fine-Tuning with 4/8-bit Bitsandbytes/HQQ/EETQ Quantization (Recommended)
 
 ```bash
-llamafactory-cli train examples/train_qlora/llama3_lora_sft_otfq.yaml
+llamafactory-cli train examples/train_qlora/qwen3_lora_sft_otfq.yaml
 ```
 
 #### Supervised Fine-Tuning with 4-bit Bitsandbytes Quantization on Ascend NPU
 
 ```bash
-llamafactory-cli train examples/train_qlora/llama3_lora_sft_bnb_npu.yaml
+llamafactory-cli train examples/train_qlora/qwen3_lora_sft_bnb_npu.yaml
 ```
 
 #### Supervised Fine-Tuning with 4/8-bit GPTQ Quantization
@@ -155,14 +143,14 @@ llamafactory-cli train examples/train_qlora/llama3_lora_sft_aqlm.yaml
 #### Supervised Fine-Tuning on Single Node
 
 ```bash
-FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
+FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/qwen3_full_sft.yaml
 ```
 
 #### Supervised Fine-Tuning on Multiple Nodes
 
 ```bash
-FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
-FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/qwen3_full_sft.yaml
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/qwen3_full_sft.yaml
 ```
 
 ### Elastic and Fault-Tolerant Supervised Fine-Tuning on Multiple Nodes
@@ -170,13 +158,13 @@ FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500
 To launch an elastic job with `MAX_RESTARTS` failures retries, run the following on at least `MIN_NNODES` nodes and at most `MAX_NNODES` nodes. `RDZV_ID` should be set as a unique job id (shared by all nodes participating in the job). See also [torchrun](https://docs.pytorch.org/docs/stable/elastic/run.html).
 
 ```bash
-FORCE_TORCHRUN=1 MIN_NNODES=1 MAX_NNODES=3 MAX_RESTARTS=3 RDZV_ID=llamafactory MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
+FORCE_TORCHRUN=1 MIN_NNODES=1 MAX_NNODES=3 MAX_RESTARTS=3 RDZV_ID=llamafactory MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/qwen3_full_sft.yaml
 ```
 
 #### Multimodal Supervised Fine-Tuning
 
 ```bash
-FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/qwen2_5vl_full_sft.yaml
+FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/qwen3vl_full_sft.yaml
 ```
 
 ### Merging LoRA Adapters and Quantization
@@ -186,19 +174,19 @@ FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/qwen2_5vl_full_sft.y
 Note: DO NOT use quantized model or `quantization_bit` when merging LoRA adapters.
 
 ```bash
-llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
+llamafactory-cli export examples/merge_lora/qwen3_lora_sft.yaml
 ```
 
 #### Quantizing Model using AutoGPTQ
 
 ```bash
-llamafactory-cli export examples/merge_lora/llama3_gptq.yaml
+llamafactory-cli export examples/merge_lora/qwen3_gptq.yaml
 ```
 
 ### Save Ollama modelfile
 
 ```bash
-llamafactory-cli export examples/merge_lora/llama3_full_sft.yaml
+llamafactory-cli export examples/merge_lora/qwen3_full_sft.yaml
 ```
 
 ### Inferring LoRA Fine-Tuned Models
@@ -206,26 +194,26 @@ llamafactory-cli export examples/merge_lora/llama3_full_sft.yaml
 #### Evaluation using vLLM's Multi-GPU Inference
 
 ```
-python scripts/vllm_infer.py --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct --template llama3 --dataset alpaca_en_demo
+python scripts/vllm_infer.py --model_name_or_path Qwen/Qwen3-4B-Instruct-2507 --template qwen3_nothink --dataset alpaca_en_demo
 python scripts/eval_bleu_rouge.py generated_predictions.jsonl
 ```
 
 #### Use CLI ChatBox
 
 ```bash
-llamafactory-cli chat examples/inference/llama3_lora_sft.yaml
+llamafactory-cli chat examples/inference/qwen3_lora_sft.yaml
 ```
 
 #### Use Web UI ChatBox
 
 ```bash
-llamafactory-cli webchat examples/inference/llama3_lora_sft.yaml
+llamafactory-cli webchat examples/inference/qwen3_lora_sft.yaml
 ```
 
 #### Launch OpenAI-style API
 
 ```bash
-llamafactory-cli api examples/inference/llama3_lora_sft.yaml
+llamafactory-cli api examples/inference/qwen3_lora_sft.yaml
 ```
 
 ### Extras
diff --git a/examples/README_zh.md b/examples/README_zh.md
index aa42e4917..95f292838 100644
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -18,19 +18,19 @@ LLaMA-Factory 默认使用所有可见的计算设备。
 基础用法：
 
 ```bash
-llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
+llamafactory-cli train examples/train_lora/qwen3_lora_sft.yaml
 ```
 
 高级用法：
 
 ```bash
-CUDA_VISIBLE_DEVICES=0,1 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml \
+CUDA_VISIBLE_DEVICES=0,1 llamafactory-cli train examples/train_lora/qwen3_lora_sft.yaml \
     learning_rate=1e-5 \
     logging_steps=1
 ```
 
 ```bash
-bash examples/train_lora/llama3_lora_sft.sh
+bash examples/train_lora/qwen3_lora_sft.sh
 ```
 
 ## 示例
@@ -40,49 +40,43 @@ bash examples/train_lora/llama3_lora_sft.sh
 #### （增量）预训练
 
 ```bash
-llamafactory-cli train examples/train_lora/llama3_lora_pretrain.yaml
+llamafactory-cli train examples/train_lora/qwen3_lora_pretrain.yaml
 ```
 
 #### 指令监督微调
 
 ```bash
-llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
+llamafactory-cli train examples/train_lora/qwen3_lora_sft.yaml
 ```
 
 #### 多模态指令监督微调
 
 ```bash
-llamafactory-cli train examples/train_lora/qwen2_5vl_lora_sft.yaml
+llamafactory-cli train examples/train_lora/qwen3vl_lora_sft.yaml
 ```
 
 #### DPO/ORPO/SimPO 训练
 
 ```bash
-llamafactory-cli train examples/train_lora/llama3_lora_dpo.yaml
+llamafactory-cli train examples/train_lora/qwen3_lora_dpo.yaml
 ```
 
 #### 多模态 DPO/ORPO/SimPO 训练
 
 ```bash
-llamafactory-cli train examples/train_lora/qwen2_5vl_lora_dpo.yaml
+llamafactory-cli train examples/train_lora/qwen3vl_lora_dpo.yaml
 ```
 
 #### 奖励模型训练
 
 ```bash
-llamafactory-cli train examples/train_lora/llama3_lora_reward.yaml
-```
-
-#### PPO 训练
-
-```bash
-llamafactory-cli train examples/train_lora/llama3_lora_ppo.yaml
+llamafactory-cli train examples/train_lora/qwen3_lora_reward.yaml
 ```
 
 #### KTO 训练
 
 ```bash
-llamafactory-cli train examples/train_lora/llama3_lora_kto.yaml
+llamafactory-cli train examples/train_lora/qwen3_lora_kto.yaml
 ```
 
 #### 预处理数据集
@@ -90,20 +84,14 @@ llamafactory-cli train examples/train_lora/llama3_lora_kto.yaml
 对于大数据集有帮助，在配置中使用 `tokenized_path` 以加载预处理后的数据集。
 
 ```bash
-llamafactory-cli train examples/train_lora/llama3_preprocess.yaml
-```
-
-#### 在 MMLU/CMMLU/C-Eval 上评估
-
-```bash
-llamafactory-cli eval examples/train_lora/llama3_lora_eval.yaml
+llamafactory-cli train examples/train_lora/qwen3_preprocess.yaml
 ```
 
 #### 多机指令监督微调
 
 ```bash
-FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
-FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/qwen3_lora_sft.yaml
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/qwen3_lora_sft.yaml
 ```
 
 ### 支持弹性和容错的多机指令监督微调
@@ -111,19 +99,19 @@ FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500
 要启动一个支持弹性节点和容错的多机指令微调，在每个节点上执行以下命令。弹性节点数量范围为 `MIN_NNODES:MAX_NNODES`，每个节点最多允许因为错误重启 `MAX_RESTARTS` 次。`RDZV_ID` 应设置为一个唯一的作业 ID（由参与该作业的所有节点共享）。更多新可以参考官方文档 [torchrun](https://docs.pytorch.org/docs/stable/elastic/run.html)。
 
 ```bash
-FORCE_TORCHRUN=1 MIN_NNODES=1 MAX_NNODES=3 MAX_RESTARTS=3 RDZV_ID=llamafactory MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
+FORCE_TORCHRUN=1 MIN_NNODES=1 MAX_NNODES=3 MAX_RESTARTS=3 RDZV_ID=llamafactory MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/qwen3_full_sft.yaml
 ```
 
 #### 使用 DeepSpeed ZeRO-3 平均分配显存
 
 ```bash
-FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ds3.yaml
+FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/qwen3_lora_sft_ds3.yaml
 ```
 
 #### 使用 Ray 在 4 张 GPU 上微调
 
 ```bash
-USE_RAY=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ray.yaml
+USE_RAY=1 llamafactory-cli train examples/train_lora/qwen3_lora_sft_ray.yaml
 ```
 
 ### QLoRA 微调
@@ -131,13 +119,13 @@ USE_RAY=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ray.yaml
 #### 基于 4/8 比特 Bitsandbytes/HQQ/EETQ 量化进行指令监督微调（推荐）
 
 ```bash
-llamafactory-cli train examples/train_qlora/llama3_lora_sft_otfq.yaml
+llamafactory-cli train examples/train_qlora/qwen3_lora_sft_otfq.yaml
 ```
 
 #### 在 NPU 上基于 4 比特 Bitsandbytes 量化进行指令监督微调
 
 ```bash
-llamafactory-cli train examples/train_qlora/llama3_lora_sft_bnb_npu.yaml
+llamafactory-cli train examples/train_qlora/qwen3_lora_sft_bnb_npu.yaml
 ```
 
 #### 基于 4/8 比特 GPTQ 量化进行指令监督微调
@@ -163,20 +151,20 @@ llamafactory-cli train examples/train_qlora/llama3_lora_sft_aqlm.yaml
 #### 在单机上进行指令监督微调
 
 ```bash
-FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
+FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/qwen3_full_sft.yaml
 ```
 
 #### 在多机上进行指令监督微调
 
 ```bash
-FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
-FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/qwen3_full_sft.yaml
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/qwen3_full_sft.yaml
 ```
 
 #### 多模态指令监督微调
 
 ```bash
-FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/qwen2_5vl_full_sft.yaml
+FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/qwen3vl_full_sft.yaml
 ```
 
 ### 合并 LoRA 适配器与模型量化
@@ -186,19 +174,19 @@ FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/qwen2_5vl_full_sft.y
 注：请勿使用量化后的模型或 `quantization_bit` 参数来合并 LoRA 适配器。
 
 ```bash
-llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
+llamafactory-cli export examples/merge_lora/qwen3_lora_sft.yaml
 ```
 
 #### 使用 AutoGPTQ 量化模型
 
 ```bash
-llamafactory-cli export examples/merge_lora/llama3_gptq.yaml
+llamafactory-cli export examples/merge_lora/qwen3_gptq.yaml
 ```
 
 ### 保存 Ollama 配置文件
 
 ```bash
-llamafactory-cli export examples/merge_lora/llama3_full_sft.yaml
+llamafactory-cli export examples/merge_lora/qwen3_full_sft.yaml
 ```
 
 ### 推理 LoRA 模型
@@ -206,26 +194,26 @@ llamafactory-cli export examples/merge_lora/llama3_full_sft.yaml
 #### 使用 vLLM 多卡推理评估
 
 ```
-python scripts/vllm_infer.py --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct --template llama3 --dataset alpaca_en_demo
+python scripts/vllm_infer.py --model_name_or_path Qwen/Qwen3-4B-Instruct-2507 --template qwen3_nothink --dataset alpaca_en_demo
 python scripts/eval_bleu_rouge.py generated_predictions.jsonl
 ```
 
 #### 使用命令行对话框
 
 ```bash
-llamafactory-cli chat examples/inference/llama3_lora_sft.yaml
+llamafactory-cli chat examples/inference/qwen3_lora_sft.yaml
 ```
 
 #### 使用浏览器对话框
 
 ```bash
-llamafactory-cli webchat examples/inference/llama3_lora_sft.yaml
+llamafactory-cli webchat examples/inference/qwen3_lora_sft.yaml
 ```
 
 #### 启动 OpenAI 风格 API
 
 ```bash
-llamafactory-cli api examples/inference/llama3_lora_sft.yaml
+llamafactory-cli api examples/inference/qwen3_lora_sft.yaml
 ```
 
 ### 杂项
diff --git a/examples/inference/llama3_lora_sft.yaml b/examples/inference/llama3_lora_sft.yaml
deleted file mode 100644
index e7fd04254..000000000
--- a/examples/inference/llama3_lora_sft.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
-adapter_name_or_path: saves/llama3-8b/lora/sft
-template: llama3
-infer_backend: huggingface  # choices: [huggingface, vllm, sglang, ktransformers]
-trust_remote_code: true
diff --git a/examples/inference/qwen2_5vl.yaml b/examples/inference/qwen3.yaml
similarity index 59%
rename from examples/inference/qwen2_5vl.yaml
rename to examples/inference/qwen3.yaml
index 67b78d4fa..1c4232cd8 100644
--- a/examples/inference/qwen2_5vl.yaml
+++ b/examples/inference/qwen3.yaml
@@ -1,4 +1,4 @@
-model_name_or_path: Qwen/Qwen2.5-VL-7B-Instruct
-template: qwen2_vl
+model_name_or_path: Qwen/Qwen3-4B-Instruct-2507
+template: qwen3_nothink
 infer_backend: huggingface  # choices: [huggingface, vllm, sglang, ktransformers]
 trust_remote_code: true
diff --git a/examples/inference/llama3_full_sft.yaml b/examples/inference/qwen3_full_sft.yaml
similarity index 60%
rename from examples/inference/llama3_full_sft.yaml
rename to examples/inference/qwen3_full_sft.yaml
index 64fc24899..b11f0e803 100644
--- a/examples/inference/llama3_full_sft.yaml
+++ b/examples/inference/qwen3_full_sft.yaml
@@ -1,4 +1,4 @@
-model_name_or_path: saves/llama3-8b/full/sft
-template: llama3
+model_name_or_path: saves/qwen3-4b/full/sft
+template: qwen3_nothink
 infer_backend: huggingface  # choices: [huggingface, vllm, sglang, ktransformers]
 trust_remote_code: true
diff --git a/examples/inference/qwen3_lora_sft.yaml b/examples/inference/qwen3_lora_sft.yaml
new file mode 100644
index 000000000..44d8471c5
--- /dev/null
+++ b/examples/inference/qwen3_lora_sft.yaml
@@ -0,0 +1,5 @@
+model_name_or_path: Qwen/Qwen3-4B-Instruct-2507
+adapter_name_or_path: saves/qwen3-4b/lora/sft
+template: qwen3_nothink
+infer_backend: huggingface  # choices: [huggingface, vllm, sglang, ktransformers]
+trust_remote_code: true
diff --git a/examples/inference/llama3.yaml b/examples/inference/qwen3vl.yaml
similarity index 59%
rename from examples/inference/llama3.yaml
rename to examples/inference/qwen3vl.yaml
index 9315e7977..0c0b5dcbf 100644
--- a/examples/inference/llama3.yaml
+++ b/examples/inference/qwen3vl.yaml
@@ -1,4 +1,4 @@
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
-template: llama3
+model_name_or_path: Qwen/Qwen3-VL-4B-Instruct
+template: qwen3_vl_nothink
 infer_backend: huggingface  # choices: [huggingface, vllm, sglang, ktransformers]
 trust_remote_code: true
diff --git a/examples/inference/deepseek2_lora_sft_kt.yaml b/examples/ktransformers/infer_lora/deepseek2_lora_sft_kt.yaml
similarity index 100%
rename from examples/inference/deepseek2_lora_sft_kt.yaml
rename to examples/ktransformers/infer_lora/deepseek2_lora_sft_kt.yaml
diff --git a/examples/inference/deepseek3_kt.yaml b/examples/ktransformers/infer_lora/deepseek3_kt.yaml
similarity index 100%
rename from examples/inference/deepseek3_kt.yaml
rename to examples/ktransformers/infer_lora/deepseek3_kt.yaml
diff --git a/examples/inference/deepseek3_lora_sft_kt.yaml b/examples/ktransformers/infer_lora/deepseek3_lora_sft_kt.yaml
similarity index 100%
rename from examples/inference/deepseek3_lora_sft_kt.yaml
rename to examples/ktransformers/infer_lora/deepseek3_lora_sft_kt.yaml
diff --git a/examples/inference/qwen3moe_lora_sft_kt.yaml b/examples/ktransformers/infer_lora/qwen3moe_lora_sft_kt.yaml
similarity index 100%
rename from examples/inference/qwen3moe_lora_sft_kt.yaml
rename to examples/ktransformers/infer_lora/qwen3moe_lora_sft_kt.yaml
diff --git a/examples/kt_optimize_rules/DeepSeek-V2-Chat-sft-amx.yaml b/examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Chat-sft-amx.yaml
similarity index 100%
rename from examples/kt_optimize_rules/DeepSeek-V2-Chat-sft-amx.yaml
rename to examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Chat-sft-amx.yaml
diff --git a/examples/kt_optimize_rules/DeepSeek-V2-Chat.yaml b/examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Chat.yaml
similarity index 100%
rename from examples/kt_optimize_rules/DeepSeek-V2-Chat.yaml
rename to examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Chat.yaml
diff --git a/examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx-multi-gpu.yaml b/examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx-multi-gpu.yaml
similarity index 100%
rename from examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx-multi-gpu.yaml
rename to examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx-multi-gpu.yaml
diff --git a/examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml b/examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml
similarity index 100%
rename from examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml
rename to examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml
diff --git a/examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft.yaml b/examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft.yaml
similarity index 100%
rename from examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft.yaml
rename to examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft.yaml
diff --git a/examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat.yaml b/examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat.yaml
similarity index 100%
rename from examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat.yaml
rename to examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat.yaml
diff --git a/examples/kt_optimize_rules/DeepSeek-V3-Chat-amx.yaml b/examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-amx.yaml
similarity index 100%
rename from examples/kt_optimize_rules/DeepSeek-V3-Chat-amx.yaml
rename to examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-amx.yaml
diff --git a/examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu-4.yaml b/examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu-4.yaml
similarity index 100%
rename from examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu-4.yaml
rename to examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu-4.yaml
diff --git a/examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml b/examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
similarity index 100%
rename from examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
rename to examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
diff --git a/examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx.yaml b/examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx.yaml
similarity index 100%
rename from examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx.yaml
rename to examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx.yaml
diff --git a/examples/kt_optimize_rules/Qwen3Moe-sft-amx.yaml b/examples/ktransformers/kt_optimize_rules/Qwen3Moe-sft-amx.yaml
similarity index 100%
rename from examples/kt_optimize_rules/Qwen3Moe-sft-amx.yaml
rename to examples/ktransformers/kt_optimize_rules/Qwen3Moe-sft-amx.yaml
diff --git a/examples/train_lora/deepseek2_lora_sft_kt.yaml b/examples/ktransformers/train_lora/deepseek2_lora_sft_kt.yaml
similarity index 100%
rename from examples/train_lora/deepseek2_lora_sft_kt.yaml
rename to examples/ktransformers/train_lora/deepseek2_lora_sft_kt.yaml
diff --git a/examples/train_lora/deepseek3_lora_sft_kt.yaml b/examples/ktransformers/train_lora/deepseek3_lora_sft_kt.yaml
similarity index 100%
rename from examples/train_lora/deepseek3_lora_sft_kt.yaml
rename to examples/ktransformers/train_lora/deepseek3_lora_sft_kt.yaml
diff --git a/examples/train_lora/qwen3moe_lora_sft_kt.yaml b/examples/ktransformers/train_lora/qwen3moe_lora_sft_kt.yaml
similarity index 100%
rename from examples/train_lora/qwen3moe_lora_sft_kt.yaml
rename to examples/ktransformers/train_lora/qwen3moe_lora_sft_kt.yaml
diff --git a/examples/merge_lora/llama3_full_sft.yaml b/examples/merge_lora/qwen3_full_sft.yaml
similarity index 56%
rename from examples/merge_lora/llama3_full_sft.yaml
rename to examples/merge_lora/qwen3_full_sft.yaml
index dd6953720..9c6fb9255 100644
--- a/examples/merge_lora/llama3_full_sft.yaml
+++ b/examples/merge_lora/qwen3_full_sft.yaml
@@ -1,10 +1,10 @@
 ### model
-model_name_or_path: saves/llama3-8b/full/sft
-template: llama3
+model_name_or_path: saves/qwen3-4b/full/sft
+template: qwen3_nothink
 trust_remote_code: true
 
 ### export
-export_dir: output/llama3_full_sft
+export_dir: saves/qwen3_sft_merged
 export_size: 5
 export_device: cpu  # choices: [cpu, auto]
 export_legacy_format: false
diff --git a/examples/merge_lora/llama3_gptq.yaml b/examples/merge_lora/qwen3_gptq.yaml
similarity index 66%
rename from examples/merge_lora/llama3_gptq.yaml
rename to examples/merge_lora/qwen3_gptq.yaml
index 2a3d2fd6d..800bc8d04 100644
--- a/examples/merge_lora/llama3_gptq.yaml
+++ b/examples/merge_lora/qwen3_gptq.yaml
@@ -1,10 +1,10 @@
 ### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
-template: llama3
+model_name_or_path: Qwen/Qwen3-4B-Instruct-2507
+template: qwen3_nothink
 trust_remote_code: true
 
 ### export
-export_dir: output/llama3_gptq
+export_dir: saves/qwen3_gptq
 export_quantization_bit: 4
 export_quantization_dataset: data/c4_demo.jsonl
 export_size: 5
diff --git a/examples/merge_lora/qwen2_5vl_lora_sft.yaml b/examples/merge_lora/qwen3_lora_sft.yaml
similarity index 58%
rename from examples/merge_lora/qwen2_5vl_lora_sft.yaml
rename to examples/merge_lora/qwen3_lora_sft.yaml
index 38a5c7c4f..f4b93f1ba 100644
--- a/examples/merge_lora/qwen2_5vl_lora_sft.yaml
+++ b/examples/merge_lora/qwen3_lora_sft.yaml
@@ -1,13 +1,13 @@
 ### Note: DO NOT use quantized model or quantization_bit when merging lora adapters
 
 ### model
-model_name_or_path: Qwen/Qwen2.5-VL-7B-Instruct
-adapter_name_or_path: saves/qwen2_5vl-7b/lora/sft
-template: qwen2_vl
+model_name_or_path: Qwen/Qwen3-4B-Instruct-2507
+adapter_name_or_path: saves/qwen3-4b/lora/sft
+template: qwen3_nothink
 trust_remote_code: true
 
 ### export
-export_dir: output/qwen2_5vl_lora_sft
+export_dir: saves/qwen3_sft_merged
 export_size: 5
 export_device: cpu  # choices: [cpu, auto]
 export_legacy_format: false
diff --git a/examples/merge_lora/llama3_lora_sft.yaml b/examples/merge_lora/qwen3vl_lora_sft.yaml
similarity index 57%
rename from examples/merge_lora/llama3_lora_sft.yaml
rename to examples/merge_lora/qwen3vl_lora_sft.yaml
index 2b011d8d9..647b0c1ea 100644
--- a/examples/merge_lora/llama3_lora_sft.yaml
+++ b/examples/merge_lora/qwen3vl_lora_sft.yaml
@@ -1,13 +1,13 @@
 ### Note: DO NOT use quantized model or quantization_bit when merging lora adapters
 
 ### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
-adapter_name_or_path: saves/llama3-8b/lora/sft
-template: llama3
+model_name_or_path: Qwen/Qwen3-VL-4B-Instruct
+adapter_name_or_path: saves/qwen3-vl-4b/lora/sft
+template: qwen3_vl_nothink
 trust_remote_code: true
 
 ### export
-export_dir: output/llama3_lora_sft
+export_dir: saves/qwen3_vl_sft_merged
 export_size: 5
 export_device: cpu  # choices: [cpu, auto]
 export_legacy_format: false
diff --git a/examples/train_full/llama3_full_sft.yaml b/examples/train_full/qwen3_full_sft.yaml
similarity index 87%
rename from examples/train_full/llama3_full_sft.yaml
rename to examples/train_full/qwen3_full_sft.yaml
index fb7066a73..adb7a1dfe 100644
--- a/examples/train_full/llama3_full_sft.yaml
+++ b/examples/train_full/qwen3_full_sft.yaml
@@ -1,5 +1,5 @@
 ### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+model_name_or_path: Qwen/Qwen3-4B-Instruct-2507
 trust_remote_code: true
 
 ### method
@@ -10,15 +10,14 @@ deepspeed: examples/deepspeed/ds_z3_config.json  # choices: [ds_z0_config.json,
 
 ### dataset
 dataset: identity,alpaca_en_demo
-template: llama3
+template: qwen3_nothink
 cutoff_len: 2048
 max_samples: 1000
-overwrite_cache: true
 preprocessing_num_workers: 16
 dataloader_num_workers: 4
 
 ### output
-output_dir: saves/llama3-8b/full/sft
+output_dir: saves/qwen3-4b/full/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
diff --git a/examples/train_full/qwen3_full_sft_autotp.yaml b/examples/train_full/qwen3_full_sft_autotp.yaml
deleted file mode 100644
index 2726203f7..000000000
--- a/examples/train_full/qwen3_full_sft_autotp.yaml
+++ /dev/null
@@ -1,46 +0,0 @@
-### model
-model_name_or_path: Qwen/Qwen3-32B
-trust_remote_code: true
-use_v1_kernels: true
-
-### method
-stage: sft
-do_train: true
-finetuning_type: full
-deepspeed: examples/deepspeed/ds_z2_autotp_config.json
-
-### dataset
-dataset: identity,alpaca_en_demo
-template: qwen3
-cutoff_len: 2048
-max_samples: 1000
-overwrite_cache: true
-preprocessing_num_workers: 16
-dataloader_num_workers: 4
-
-### output
-output_dir: saves/qwen3-32b/full/sft_autotp
-logging_steps: 1
-save_steps: 500
-plot_loss: true
-overwrite_output_dir: true
-save_only_model: false
-report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]
-
-### train
-per_device_train_batch_size: 4
-gradient_accumulation_steps: 1
-learning_rate: 1.0e-4
-num_train_epochs: 3.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-bf16: true
-ddp_timeout: 180000000
-resume_from_checkpoint: null
-
-### eval
-# eval_dataset: alpaca_en_demo
-# val_size: 0.1
-# per_device_eval_batch_size: 1
-# eval_strategy: steps
-# eval_steps: 500
diff --git a/examples/train_full/qwen2_5vl_full_sft.yaml b/examples/train_full/qwen3vl_full_sft.yaml
similarity index 87%
rename from examples/train_full/qwen2_5vl_full_sft.yaml
rename to examples/train_full/qwen3vl_full_sft.yaml
index bd9ac90df..06c6d9528 100644
--- a/examples/train_full/qwen2_5vl_full_sft.yaml
+++ b/examples/train_full/qwen3vl_full_sft.yaml
@@ -1,5 +1,5 @@
 ### model
-model_name_or_path: Qwen/Qwen2.5-VL-7B-Instruct
+model_name_or_path: Qwen/Qwen3-VL-4B-Instruct
 image_max_pixels: 262144
 video_max_pixels: 16384
 trust_remote_code: true
@@ -15,15 +15,14 @@ deepspeed: examples/deepspeed/ds_z3_config.json
 
 ### dataset
 dataset: mllm_demo,identity,alpaca_en_demo
-template: qwen2_vl
+template: qwen3_vl_nothink
 cutoff_len: 2048
 max_samples: 1000
-overwrite_cache: true
 preprocessing_num_workers: 16
 dataloader_num_workers: 4
 
 ### output
-output_dir: saves/qwen2_5vl-7b/full/sft
+output_dir: saves/qwen3-vl-4b/full/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
diff --git a/examples/train_lora/llama3_lora_eval.yaml b/examples/train_lora/llama3_lora_eval.yaml
deleted file mode 100644
index 60d7c2f39..000000000
--- a/examples/train_lora/llama3_lora_eval.yaml
+++ /dev/null
@@ -1,19 +0,0 @@
-### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
-adapter_name_or_path: saves/llama3-8b/lora/sft
-trust_remote_code: true
-
-### method
-finetuning_type: lora
-
-### dataset
-task: mmlu_test  # choices: [mmlu_test, ceval_validation, cmmlu_test]
-template: fewshot
-lang: en
-n_shot: 5
-
-### output
-save_dir: saves/llama3-8b/lora/eval
-
-### eval
-batch_size: 4
diff --git a/examples/train_lora/llama3_lora_ppo.yaml b/examples/train_lora/llama3_lora_ppo.yaml
deleted file mode 100644
index 879448190..000000000
--- a/examples/train_lora/llama3_lora_ppo.yaml
+++ /dev/null
@@ -1,43 +0,0 @@
-### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
-reward_model: saves/llama3-8b/lora/reward
-trust_remote_code: true
-
-### method
-stage: ppo
-do_train: true
-finetuning_type: lora
-lora_rank: 8
-lora_target: all
-
-### dataset
-dataset: identity,alpaca_en_demo
-template: llama3
-cutoff_len: 2048
-max_samples: 1000
-overwrite_cache: true
-preprocessing_num_workers: 16
-dataloader_num_workers: 4
-
-### output
-output_dir: saves/llama3-8b/lora/ppo
-logging_steps: 10
-save_steps: 500
-plot_loss: true
-overwrite_output_dir: true
-report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]
-
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 8
-learning_rate: 1.0e-5
-num_train_epochs: 3.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-bf16: true
-ddp_timeout: 180000000
-
-### generate
-max_new_tokens: 512
-top_k: 0
-top_p: 0.9
diff --git a/examples/train_lora/llama3_lora_sft.yaml b/examples/train_lora/llama3_lora_sft.yaml
deleted file mode 100644
index 157d66104..000000000
--- a/examples/train_lora/llama3_lora_sft.yaml
+++ /dev/null
@@ -1,46 +0,0 @@
-### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
-trust_remote_code: true
-
-### method
-stage: sft
-do_train: true
-finetuning_type: lora
-lora_rank: 8
-lora_target: all
-
-### dataset
-dataset: identity,alpaca_en_demo
-template: llama3
-cutoff_len: 2048
-max_samples: 1000
-overwrite_cache: true
-preprocessing_num_workers: 16
-dataloader_num_workers: 4
-
-### output
-output_dir: saves/llama3-8b/lora/sft
-logging_steps: 10
-save_steps: 500
-plot_loss: true
-overwrite_output_dir: true
-save_only_model: false
-report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]
-
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 8
-learning_rate: 1.0e-4
-num_train_epochs: 3.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-bf16: true
-ddp_timeout: 180000000
-resume_from_checkpoint: null
-
-### eval
-# eval_dataset: alpaca_en_demo
-# val_size: 0.1
-# per_device_eval_batch_size: 1
-# eval_strategy: steps
-# eval_steps: 500
diff --git a/examples/train_lora/llama4_lora_sft_ds3.yaml b/examples/train_lora/llama4_lora_sft_ds3.yaml
deleted file mode 100644
index 6c5bb7bb7..000000000
--- a/examples/train_lora/llama4_lora_sft_ds3.yaml
+++ /dev/null
@@ -1,49 +0,0 @@
-# pip install git+https://github.com/hiyouga/transformers.git@llama4_train
-
-### model
-model_name_or_path: meta-llama/Llama-4-Scout-17B-16E-Instruct
-trust_remote_code: true
-
-### method
-stage: sft
-do_train: true
-finetuning_type: lora
-lora_rank: 8
-lora_target: all
-deepspeed: examples/deepspeed/ds_z3_config.json  # choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json]
-
-### dataset
-dataset: mllm_demo,identity,alpaca_en_demo
-template: llama4
-cutoff_len: 2048
-max_samples: 1000
-overwrite_cache: true
-preprocessing_num_workers: 16
-dataloader_num_workers: 4
-
-### output
-output_dir: saves/llama4-8b/lora/sft
-logging_steps: 10
-save_steps: 500
-plot_loss: true
-overwrite_output_dir: true
-save_only_model: false
-report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]
-
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 2
-learning_rate: 1.0e-4
-num_train_epochs: 3.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-bf16: true
-ddp_timeout: 180000000
-resume_from_checkpoint: null
-
-### eval
-# eval_dataset: alpaca_en_demo
-# val_size: 0.1
-# per_device_eval_batch_size: 1
-# eval_strategy: steps
-# eval_steps: 500
diff --git a/examples/train_lora/llama3_lora_dpo.yaml b/examples/train_lora/qwen3_lora_dpo.yaml
similarity index 86%
rename from examples/train_lora/llama3_lora_dpo.yaml
rename to examples/train_lora/qwen3_lora_dpo.yaml
index fd8c042c1..78f4d31f6 100644
--- a/examples/train_lora/llama3_lora_dpo.yaml
+++ b/examples/train_lora/qwen3_lora_dpo.yaml
@@ -1,5 +1,5 @@
 ### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+model_name_or_path: Qwen/Qwen3-4B-Instruct-2507
 trust_remote_code: true
 
 ### method
@@ -13,15 +13,14 @@ pref_loss: sigmoid  # choices: [sigmoid (dpo), orpo, simpo]
 
 ### dataset
 dataset: dpo_en_demo
-template: llama3
+template: qwen3_nothink
 cutoff_len: 2048
 max_samples: 1000
-overwrite_cache: true
 preprocessing_num_workers: 16
 dataloader_num_workers: 4
 
 ### output
-output_dir: saves/llama3-8b/lora/dpo
+output_dir: saves/qwen3-4b/lora/dpo
 logging_steps: 10
 save_steps: 500
 plot_loss: true
diff --git a/examples/train_lora/llama3_lora_kto.yaml b/examples/train_lora/qwen3_lora_kto.yaml
similarity index 84%
rename from examples/train_lora/llama3_lora_kto.yaml
rename to examples/train_lora/qwen3_lora_kto.yaml
index 113b9129f..51e67318a 100644
--- a/examples/train_lora/llama3_lora_kto.yaml
+++ b/examples/train_lora/qwen3_lora_kto.yaml
@@ -1,5 +1,5 @@
 ### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+model_name_or_path: Qwen/Qwen3-4B-Instruct-2507
 trust_remote_code: true
 
 ### method
@@ -12,15 +12,14 @@ pref_beta: 0.1
 
 ### dataset
 dataset: kto_en_demo
-template: llama3
+template: qwen3_nothink
 cutoff_len: 2048
 max_samples: 1000
-overwrite_cache: true
 preprocessing_num_workers: 16
 dataloader_num_workers: 4
 
 ### output
-output_dir: saves/llama3-8b/lora/kto
+output_dir: saves/qwen3-4b/lora/kto
 logging_steps: 10
 save_steps: 500
 plot_loss: true
diff --git a/examples/train_lora/llama3_lora_pretrain.yaml b/examples/train_lora/qwen3_lora_pretrain.yaml
similarity index 86%
rename from examples/train_lora/llama3_lora_pretrain.yaml
rename to examples/train_lora/qwen3_lora_pretrain.yaml
index 3c851d705..a14e9b462 100644
--- a/examples/train_lora/llama3_lora_pretrain.yaml
+++ b/examples/train_lora/qwen3_lora_pretrain.yaml
@@ -1,5 +1,5 @@
 ### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+model_name_or_path: Qwen/Qwen3-4B-Instruct-2507
 trust_remote_code: true
 
 ### method
@@ -13,12 +13,11 @@ lora_target: all
 dataset: c4_demo
 cutoff_len: 2048
 max_samples: 1000
-overwrite_cache: true
 preprocessing_num_workers: 16
 dataloader_num_workers: 4
 
 ### output
-output_dir: saves/llama3-8b/lora/pretrain
+output_dir: saves/qwen3-4b/lora/pretrain
 logging_steps: 10
 save_steps: 500
 plot_loss: true
diff --git a/examples/train_lora/llama3_lora_reward.yaml b/examples/train_lora/qwen3_lora_reward.yaml
similarity index 85%
rename from examples/train_lora/llama3_lora_reward.yaml
rename to examples/train_lora/qwen3_lora_reward.yaml
index 48230b552..17887c02d 100644
--- a/examples/train_lora/llama3_lora_reward.yaml
+++ b/examples/train_lora/qwen3_lora_reward.yaml
@@ -1,5 +1,5 @@
 ### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+model_name_or_path: Qwen/Qwen3-4B-Instruct-2507
 trust_remote_code: true
 
 ### method
@@ -11,15 +11,14 @@ lora_target: all
 
 ### dataset
 dataset: dpo_en_demo
-template: llama3
+template: qwen3_nothink
 cutoff_len: 2048
 max_samples: 1000
-overwrite_cache: true
 preprocessing_num_workers: 16
 dataloader_num_workers: 4
 
 ### output
-output_dir: saves/llama3-8b/lora/reward
+output_dir: saves/qwen3-4b/lora/reward
 logging_steps: 10
 save_steps: 500
 plot_loss: true
diff --git a/examples/train_lora/llama3_lora_sft.sh b/examples/train_lora/qwen3_lora_sft.sh
similarity index 84%
rename from examples/train_lora/llama3_lora_sft.sh
rename to examples/train_lora/qwen3_lora_sft.sh
index 59db2c58e..bc63ac2d1 100644
--- a/examples/train_lora/llama3_lora_sft.sh
+++ b/examples/train_lora/qwen3_lora_sft.sh
@@ -2,7 +2,7 @@
 
 set -x
 
-MODEL_PATH=meta-llama/Meta-Llama-3-8B-Instruct
+MODEL_PATH=Qwen/Qwen3-4B-Instruct-2507
 
 llamafactory-cli train \
     --model_name_or_path ${MODEL_PATH} \
@@ -13,13 +13,12 @@ llamafactory-cli train \
     --lora_rank 8 \
     --lora_target all \
     --dataset identity,alpaca_en_demo \
-    --template llama3 \
+    --template qwen3_nothink \
     --cutoff_len 2048 \
     --max_samples 1000 \
-    --overwrite_cache \
     --preprocessing_num_workers 16 \
     --dataloader_num_workers 4 \
-    --output_dir saves/llama3-8b/lora/sft \
+    --output_dir saves/qwen3-4b/lora/sft \
     --logging_steps 10 \
     --save_steps 500 \
     --plot_loss \
diff --git a/examples/train_lora/gpt_lora_sft.yaml b/examples/train_lora/qwen3_lora_sft.yaml
similarity index 87%
rename from examples/train_lora/gpt_lora_sft.yaml
rename to examples/train_lora/qwen3_lora_sft.yaml
index b07615b1c..ba19e261c 100644
--- a/examples/train_lora/gpt_lora_sft.yaml
+++ b/examples/train_lora/qwen3_lora_sft.yaml
@@ -1,5 +1,5 @@
 ### model
-model_name_or_path: openai/gpt-oss-20b
+model_name_or_path: Qwen/Qwen3-4B-Instruct-2507
 trust_remote_code: true
 
 ### method
@@ -11,15 +11,14 @@ lora_target: all
 
 ### dataset
 dataset: identity,alpaca_en_demo
-template: gpt
+template: qwen3_nothink
 cutoff_len: 2048
 max_samples: 1000
-overwrite_cache: true
 preprocessing_num_workers: 16
 dataloader_num_workers: 4
 
 ### output
-output_dir: saves/gpt-20b/lora/sft
+output_dir: saves/qwen3-4b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
diff --git a/examples/train_lora/llama3_lora_sft_ds3.yaml b/examples/train_lora/qwen3_lora_sft_ds3.yaml
similarity index 87%
rename from examples/train_lora/llama3_lora_sft_ds3.yaml
rename to examples/train_lora/qwen3_lora_sft_ds3.yaml
index e20b35179..6fcf1c6c2 100644
--- a/examples/train_lora/llama3_lora_sft_ds3.yaml
+++ b/examples/train_lora/qwen3_lora_sft_ds3.yaml
@@ -1,5 +1,5 @@
 ### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+model_name_or_path: Qwen/Qwen3-4B-Instruct-2507
 trust_remote_code: true
 
 ### method
@@ -12,15 +12,14 @@ deepspeed: examples/deepspeed/ds_z3_config.json  # choices: [ds_z0_config.json,
 
 ### dataset
 dataset: identity,alpaca_en_demo
-template: llama3
+template: qwen3_nothink
 cutoff_len: 2048
 max_samples: 1000
-overwrite_cache: true
 preprocessing_num_workers: 16
 dataloader_num_workers: 4
 
 ### output
-output_dir: saves/llama3-8b/lora/sft
+output_dir: saves/qwen3-4b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
diff --git a/examples/train_lora/llama3_lora_sft_ray.yaml b/examples/train_lora/qwen3_lora_sft_ray.yaml
similarity index 87%
rename from examples/train_lora/llama3_lora_sft_ray.yaml
rename to examples/train_lora/qwen3_lora_sft_ray.yaml
index 8c03bf9ee..0cbc59546 100644
--- a/examples/train_lora/llama3_lora_sft_ray.yaml
+++ b/examples/train_lora/qwen3_lora_sft_ray.yaml
@@ -1,5 +1,5 @@
 ### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct  # or use local absolute path
+model_name_or_path: Qwen/Qwen3-4B-Instruct-2507  # or use local absolute path
 trust_remote_code: true
 
 ### method
@@ -12,10 +12,9 @@ lora_target: all
 ### dataset
 dataset: identity,alpaca_en_demo
 dataset_dir: REMOTE:llamafactory/demo_data  # or use local absolute path
-template: llama3
+template: qwen3_nothink
 cutoff_len: 2048
 max_samples: 1000
-overwrite_cache: true
 preprocessing_num_workers: 16
 dataloader_num_workers: 4
 
@@ -29,7 +28,7 @@ save_only_model: false
 report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]
 
 ### ray
-ray_run_name: llama3_8b_sft_lora
+ray_run_name: qwen3_4b_sft_lora
 ray_storage_path: ./saves
 ray_num_workers: 4  # Number of GPUs to use.
 placement_strategy: PACK
diff --git a/examples/train_lora/llama3_preprocess.yaml b/examples/train_lora/qwen3_preprocess.yaml
similarity index 58%
rename from examples/train_lora/llama3_preprocess.yaml
rename to examples/train_lora/qwen3_preprocess.yaml
index fbaf01f0f..60901654c 100644
--- a/examples/train_lora/llama3_preprocess.yaml
+++ b/examples/train_lora/qwen3_preprocess.yaml
@@ -1,5 +1,5 @@
 ### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+model_name_or_path: Qwen/Qwen3-4B-Instruct-2507
 trust_remote_code: true
 
 ### method
@@ -11,13 +11,12 @@ lora_target: all
 
 ### dataset
 dataset: identity,alpaca_en_demo
-template: llama3
+template: qwen3_nothink
 cutoff_len: 2048
 max_samples: 1000
-overwrite_cache: true
 preprocessing_num_workers: 16
-tokenized_path: saves/llama3-8b/dataset/sft
+tokenized_path: saves/qwen3-4b/dataset/sft
 
-### output
-output_dir: saves/llama3-8b/lora/sft
+### output (not used)
+output_dir: saves/qwen3-4b/lora/sft
 overwrite_output_dir: true
diff --git a/examples/train_lora/qwen2_5vl_lora_dpo.yaml b/examples/train_lora/qwen3vl_lora_dpo.yaml
similarity index 87%
rename from examples/train_lora/qwen2_5vl_lora_dpo.yaml
rename to examples/train_lora/qwen3vl_lora_dpo.yaml
index 2140c90d5..12e9a615b 100644
--- a/examples/train_lora/qwen2_5vl_lora_dpo.yaml
+++ b/examples/train_lora/qwen3vl_lora_dpo.yaml
@@ -1,5 +1,5 @@
 ### model
-model_name_or_path: Qwen/Qwen2.5-VL-7B-Instruct
+model_name_or_path: Qwen/Qwen3-VL-4B-Instruct
 image_max_pixels: 262144
 video_max_pixels: 16384
 trust_remote_code: true
@@ -15,15 +15,14 @@ pref_loss: sigmoid  # choices: [sigmoid (dpo), orpo, simpo]
 
 ### dataset
 dataset: rlhf_v
-template: qwen2_vl
+template: qwen3_vl_nothink
 cutoff_len: 2048
 max_samples: 1000
-overwrite_cache: true
 preprocessing_num_workers: 16
 dataloader_num_workers: 4
 
 ### output
-output_dir: saves/qwen2_5vl-7b/lora/dpo
+output_dir: saves/qwen3-vl-4b/lora/dpo
 logging_steps: 10
 save_steps: 500
 plot_loss: true
diff --git a/examples/train_lora/qwen2_5vl_lora_sft.yaml b/examples/train_lora/qwen3vl_lora_sft.yaml
similarity index 86%
rename from examples/train_lora/qwen2_5vl_lora_sft.yaml
rename to examples/train_lora/qwen3vl_lora_sft.yaml
index 6177cfd5f..749bfe60c 100644
--- a/examples/train_lora/qwen2_5vl_lora_sft.yaml
+++ b/examples/train_lora/qwen3vl_lora_sft.yaml
@@ -1,5 +1,5 @@
 ### model
-model_name_or_path: Qwen/Qwen2.5-VL-7B-Instruct
+model_name_or_path: Qwen/Qwen3-VL-4B-Instruct
 image_max_pixels: 262144
 video_max_pixels: 16384
 trust_remote_code: true
@@ -13,15 +13,14 @@ lora_target: all
 
 ### dataset
 dataset: mllm_demo,identity,alpaca_en_demo  # video: mllm_video_demo
-template: qwen2_vl
+template: qwen3_vl_nothink
 cutoff_len: 2048
 max_samples: 1000
-overwrite_cache: true
 preprocessing_num_workers: 16
 dataloader_num_workers: 4
 
 ### output
-output_dir: saves/qwen2_5vl-7b/lora/sft
+output_dir: saves/qwen3-vl-4b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
diff --git a/examples/train_qlora/llama3_lora_sft_aqlm.yaml b/examples/train_qlora/llama3_lora_sft_aqlm.yaml
index a7d44c7ea..16a0a4a2c 100644
--- a/examples/train_qlora/llama3_lora_sft_aqlm.yaml
+++ b/examples/train_qlora/llama3_lora_sft_aqlm.yaml
@@ -14,7 +14,6 @@ dataset: identity,alpaca_en_demo
 template: llama3
 cutoff_len: 2048
 max_samples: 1000
-overwrite_cache: true
 preprocessing_num_workers: 16
 dataloader_num_workers: 4
 
diff --git a/examples/train_qlora/llama3_lora_sft_awq.yaml b/examples/train_qlora/llama3_lora_sft_awq.yaml
index 861edfde3..9c57c6a13 100644
--- a/examples/train_qlora/llama3_lora_sft_awq.yaml
+++ b/examples/train_qlora/llama3_lora_sft_awq.yaml
@@ -14,7 +14,6 @@ dataset: identity,alpaca_en_demo
 template: llama3
 cutoff_len: 2048
 max_samples: 1000
-overwrite_cache: true
 preprocessing_num_workers: 16
 dataloader_num_workers: 4
 
diff --git a/examples/train_qlora/llama3_lora_sft_gptq.yaml b/examples/train_qlora/llama3_lora_sft_gptq.yaml
index 729d8628b..fd23e65c1 100644
--- a/examples/train_qlora/llama3_lora_sft_gptq.yaml
+++ b/examples/train_qlora/llama3_lora_sft_gptq.yaml
@@ -14,7 +14,6 @@ dataset: identity,alpaca_en_demo
 template: llama3
 cutoff_len: 2048
 max_samples: 1000
-overwrite_cache: true
 preprocessing_num_workers: 16
 dataloader_num_workers: 4
 
diff --git a/examples/train_qlora/llama3_lora_sft_bnb_npu.yaml b/examples/train_qlora/qwen3_lora_sft_bnb_npu.yaml
similarity index 86%
rename from examples/train_qlora/llama3_lora_sft_bnb_npu.yaml
rename to examples/train_qlora/qwen3_lora_sft_bnb_npu.yaml
index d68ce665c..0301ee15a 100644
--- a/examples/train_qlora/llama3_lora_sft_bnb_npu.yaml
+++ b/examples/train_qlora/qwen3_lora_sft_bnb_npu.yaml
@@ -1,5 +1,5 @@
 ### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+model_name_or_path: Qwen/Qwen3-4B-Instruct-2507
 quantization_bit: 4
 quantization_method: bnb
 double_quantization: false
@@ -14,15 +14,14 @@ lora_target: all
 
 ### dataset
 dataset: identity,alpaca_en_demo
-template: llama3
+template: qwen3_nothink
 cutoff_len: 2048
 max_samples: 1000
-overwrite_cache: true
 preprocessing_num_workers: 16
 dataloader_num_workers: 4
 
 ### output
-output_dir: saves/llama3-8b/lora/sft
+output_dir: saves/qwen3-4b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
diff --git a/examples/train_qlora/llama3_lora_sft_otfq.yaml b/examples/train_qlora/qwen3_lora_sft_otfq.yaml
similarity index 86%
rename from examples/train_qlora/llama3_lora_sft_otfq.yaml
rename to examples/train_qlora/qwen3_lora_sft_otfq.yaml
index 1a157afec..3a0e3d457 100644
--- a/examples/train_qlora/llama3_lora_sft_otfq.yaml
+++ b/examples/train_qlora/qwen3_lora_sft_otfq.yaml
@@ -1,5 +1,5 @@
 ### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+model_name_or_path: Qwen/Qwen3-4B-Instruct-2507
 quantization_bit: 4  # choices: [8 (bnb/hqq/eetq), 4 (bnb/hqq), 3 (hqq), 2 (hqq)]
 quantization_method: bnb  # choices: [bnb, hqq, eetq]
 trust_remote_code: true
@@ -13,15 +13,14 @@ lora_target: all
 
 ### dataset
 dataset: identity,alpaca_en_demo
-template: llama3
+template: qwen3_nothink
 cutoff_len: 2048
 max_samples: 1000
-overwrite_cache: true
 preprocessing_num_workers: 16
 dataloader_num_workers: 4
 
 ### output
-output_dir: saves/llama3-8b/lora/sft
+output_dir: saves/qwen3-4b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
diff --git a/pyproject.toml b/pyproject.toml
index a60f47606..ef03b82f8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,8 +41,7 @@ dependencies = [
     "torch>=2.4.0",
     "torchvision>=0.19.0",
     "torchaudio>=2.4.0",
-    "transformers>=4.51.0,<=4.56.2,!=4.52.0; python_version < '3.10'",
-    "transformers>=4.51.0,<=4.57.1,!=4.52.0,!=4.57.0; python_version >= '3.10'",
+    "transformers>=4.51.0,<=4.57.1,!=4.52.0,!=4.57.0",
     "datasets>=2.16.0,<=4.0.0",
     "accelerate>=1.3.0,<=1.11.0",
     "peft>=0.14.0,<=0.17.1",
diff --git a/scripts/vllm_infer.py b/scripts/vllm_infer.py
index 4d74004e0..c794b7c7b 100644
--- a/scripts/vllm_infer.py
+++ b/scripts/vllm_infer.py
@@ -18,9 +18,10 @@ import time
 
 import av
 import fire
+from datasets import load_dataset
+from eval_bleu_rouge import compute_metrics
 from tqdm import tqdm
 from transformers import Seq2SeqTrainingArguments
-from datasets import load_dataset
 
 from llamafactory.data import get_dataset, get_template_and_fix_tokenizer
 from llamafactory.extras.constants import IGNORE_INDEX
@@ -29,8 +30,6 @@ from llamafactory.extras.packages import is_vllm_available
 from llamafactory.hparams import get_infer_args
 from llamafactory.model import load_tokenizer
 
-from eval_bleu_rouge import compute_metrics
-
 
 if is_vllm_available():
     from vllm import LLM, SamplingParams
@@ -235,10 +234,10 @@ def vllm_infer(
     print(f"{len(all_prompts)} total generated results have been saved at {save_name}.")
     print("*" * 70)
 
-    # Write all matrix results when matrix_save_name is not None, 
+    # Write all matrix results when matrix_save_name is not None,
     # The result matrix is referencing src.llamafactory.train.sft.workflow.run_sft # 127~132
     # trainer.save_metrics("predict", predict_results.metrics)
-    # 
+    #
     #   {
     #        "predict_bleu-4": 4.349975,
     #        "predict_model_preparation_time": 0.0128,
@@ -265,11 +264,11 @@ def vllm_infer(
             print(f"predict_{task}: {score:.4f}")
             average_score["predict_" + task] = score
 
-        average_score['predict_model_preparation_time'] = preparation_time
-        average_score['predict_runtime'] = predict_time
+        average_score["predict_model_preparation_time"] = preparation_time
+        average_score["predict_runtime"] = predict_time
         num_steps = len(range(0, len(train_dataset), batch_size))
-        average_score['predict_samples_per_second'] = len(dataset) / predict_time if predict_time > 0 else 0.0
-        average_score['predict_steps_per_second'] = num_steps / predict_time if predict_time > 0 else 0.0
+        average_score["predict_samples_per_second"] = len(dataset) / predict_time if predict_time > 0 else 0.0
+        average_score["predict_steps_per_second"] = num_steps / predict_time if predict_time > 0 else 0.0
 
         with open(matrix_save_name, "w", encoding="utf-8") as f:
             json.dump(average_score, f, indent=4)
@@ -280,4 +279,4 @@ def vllm_infer(
 
 
 if __name__ == "__main__":
-    fire.Fire(vllm_infer)
\ No newline at end of file
+    fire.Fire(vllm_infer)
diff --git a/src/llamafactory/extras/env.py b/src/llamafactory/extras/env.py
index d39b99453..d839ee924 100644
--- a/src/llamafactory/extras/env.py
+++ b/src/llamafactory/extras/env.py
@@ -19,7 +19,7 @@
 from collections import OrderedDict
 
 
-VERSION = "0.9.4.dev0"
+VERSION = "0.9.4"
 
 
 def print_env() -> None:
diff --git a/tests_v1/config/test_args_parser.py b/tests_v1/config/test_args_parser.py
index 945e0e572..b39f4532c 100644
--- a/tests_v1/config/test_args_parser.py
+++ b/tests_v1/config/test_args_parser.py
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
 import pathlib
+import sys
 from unittest.mock import patch
 
 from llamafactory.v1.config.arg_parser import get_args