[feat] Models trained and inferred with FP8 are dequantized by default (#9627 )

[example] add Qwen3 series examples (#9624 )
Co-authored-by: UsernameFull <tohowtodoit@gmail.com>
2026-04-17 02:16:02 +08:00 · 2025-12-18 22:54:35 +08:00 · 2025-12-18 21:27:00 +08:00 · 2025-12-18 21:26:25 +08:00 · 2025-12-18 21:26:04 +08:00
10 changed files with 258 additions and 151 deletions
--- a/examples/ascend/qwen3_full_sft_fsdp2.yaml
+++ b/examples/ascend/qwen3_full_sft_fsdp2.yaml
@@ -0,0 +1,45 @@
+# Start FSDP2 fine-tuning
+# accelerate launch \
+#     --config_file examples/accelerate/fsdp2_config.yaml \
+#     src/train.py examples/ascend/qwen3_full_sft_fsdp2.yaml
+# Set `num_processes` in fsdp2_config.yaml to 16 when running on A3
+
+### model
+model_name_or_path: Qwen/Qwen3-8B
+trust_remote_code: true
+use_v1_kernels: true
+flash_attn: fa2
+
+### method
+stage: sft
+do_train: true
+finetuning_type: full
+
+### dataset
+dataset: alpaca_en_demo
+template: qwen3
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+dataloader_num_workers: 4
+
+### output
+output_dir: saves/Qwen3-8B/full/sft
+logging_steps: 1
+save_steps: 500
+max_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+save_only_model: false
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]
+
+### train
+per_device_train_batch_size: 8
+gradient_accumulation_steps: 1
+learning_rate: 1.0e-5
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 1800
+resume_from_checkpoint: null
--- a/examples/ascend/qwen3moe_full_sft_fsdp.yaml
+++ b/examples/ascend/qwen3moe_full_sft_fsdp.yaml
@@ -0,0 +1,46 @@
+# Start FSDP fine-tuning
+# accelerate launch \
+#     --config_file examples/accelerate/fsdp_config.yaml \
+#     src/train.py examples/ascend/qwen3moe_full_sft_fsdp.yaml
+# Set `num_processes` in fsdp_config.yaml to 16 when running on A3
+
+### model
+model_name_or_path: Qwen/Qwen3-30B-A3B-Instruct-2507
+trust_remote_code: true
+use_v1_kernels: true
+flash_attn: fa2
+
+### method
+stage: sft
+do_train: true
+finetuning_type: full
+disable_gradient_checkpointing: false
+
+### dataset
+dataset: alpaca_zh
+template: qwen3
+cutoff_len: 1024
+overwrite_cache: true
+preprocessing_num_workers: 16
+dataloader_num_workers: 4
+
+### output
+output_dir: saves/Qwen3-30B-A3B-Instruct-2507/full/sft
+logging_steps: 1
+save_steps: 500
+max_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+save_only_model: true
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]
+
+### train
+per_device_train_batch_size: 4
+gradient_accumulation_steps: 1
+learning_rate: 1.0e-4
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+resume_from_checkpoint: null
+seed: 1234
--- a/examples/ascend/qwen3vlmoe_full_sft_fsdp2.yaml
+++ b/examples/ascend/qwen3vlmoe_full_sft_fsdp2.yaml
@@ -0,0 +1,48 @@
+# Start FSDP2 fine-tuning
+# accelerate launch \
+#     --config_file examples/accelerate/fsdp2_config.yaml \
+#     src/train.py examples/ascend/qwen3vlmoe_full_sft_fsdp2.yaml
+# Set `num_processes` in fsdp2_config.yaml to 16 when running on A3
+
+### model
+model_name_or_path: Qwen/Qwen3-VL-30B-A3B-Instruct
+image_max_pixels: 262144
+video_max_pixels: 16384
+trust_remote_code: true
+use_v1_kernels: true
+flash_attn: fa2
+
+### method
+stage: sft
+do_train: true
+finetuning_type: full
+disable_gradient_checkpointing: false
+
+### dataset
+dataset: llava_1k_en, llava_1k_zh
+template: qwen3_vl
+cutoff_len: 1024
+overwrite_cache: true
+preprocessing_num_workers: 16
+dataloader_num_workers: 4
+
+### output
+output_dir: saves/Qwen3-VL-30B-A3B-Instruct/full/sft
+logging_steps: 1
+save_steps: 500
+max_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+save_only_model: true
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]
+
+### train
+per_device_train_batch_size: 2
+gradient_accumulation_steps: 1
+learning_rate: 1.0e-4
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+resume_from_checkpoint: null
+seed: 1234
--- a/src/llamafactory/model/model_utils/quantization.py
+++ b/src/llamafactory/model/model_utils/quantization.py
@@ -110,7 +110,7 @@ def configure_quantization(
            check_version("aqlm>=1.1.0", mandatory=True)
            quantization_config["bits"] = 2

-        if quant_method == QuantizationMethod.FP8 and is_trainable:
+        if quant_method == QuantizationMethod.FP8:
            quant_config = FineGrainedFP8Config(dequantize=True)
            init_kwargs["quantization_config"] = quant_config

--- a/src/llamafactory/train/dpo/ktrainer.py
+++ b/src/llamafactory/train/dpo/ktrainer.py
@@ -0,0 +1,71 @@
+# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by the HuggingFace's TRL library.
+# https://github.com/huggingface/trl/blob/v0.8.0/trl/trainer/dpo_trainer.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING, Optional
+import torch
+import torch.nn.functional as F
+from transformers import Trainer
+from trl import DPOTrainer
+from trl.trainer import disable_dropout_in_model
+from typing_extensions import override
+
+from ...extras.constants import IGNORE_INDEX
+from ...extras.packages import is_transformers_version_greater_than
+from ..callbacks import SaveProcessorCallback
+from ..trainer_utils import create_custom_optimizer, create_custom_scheduler, get_batch_logps, nested_detach
+from .trainer import CustomDPOTrainer as BaseDPOTrainer
+from ktransformers.sft.lora import KTrainer
+
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedModel, ProcessorMixin
+
+    from ...hparams import FinetuningArguments
+
+
+class CustomDPOTrainer(KTrainer, BaseDPOTrainer):
+    @override
+    def concatenated_forward(
+        self, model: "PreTrainedModel", batch: dict[str, "torch.Tensor"], is_ref_model: bool = False
+    ) -> tuple["torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor"]:
+        r"""Forward the concatenated chosen/rejected batch and return per-half log-probs and logits (on CPU).
+
+        Log-probs are sums, or averages (divided by valid_length) when loss_type is IPO, ORPO or SimPO.
+        """
+        if self.finetuning_args.use_ref_model:
+            batch = nested_detach(batch, clone=True)  # avoid error
+        # Shallow-copy before dropping labels (DPO needs no in-forward loss) so the caller's batch is not mutated.
+        batch = dict(batch)
+        labels = batch.pop("labels")
+        all_logits: torch.Tensor = model(**batch, return_dict=True, use_cache=False).logits.to(torch.float32)
+        all_logits = all_logits.to("cpu")
+        labels = labels.to(all_logits.device)
+        all_logps, valid_length = get_batch_logps(
+            logits=all_logits, labels=labels, ld_alpha=(self.ld_alpha if not is_ref_model else None)
+        )
+        if self.loss_type in ["ipo", "orpo", "simpo"]:
+            all_logps = all_logps / valid_length
+
+        batch_size = batch["input_ids"].size(0) // 2
+        chosen_logps, rejected_logps = all_logps.split(batch_size, dim=0)
+        chosen_logits, rejected_logits = all_logits.split(batch_size, dim=0)
+        chosen_length, _ = valid_length.split(batch_size, dim=0)
+
+        if self.loss_type in ["ipo", "orpo", "simpo"]:
+            return chosen_logps, rejected_logps, chosen_logits, rejected_logits, chosen_logps
+        else:
+            return chosen_logps, rejected_logps, chosen_logits, rejected_logits, chosen_logps / chosen_length
--- a/src/llamafactory/train/dpo/workflow.py
+++ b/src/llamafactory/train/dpo/workflow.py
@@ -24,7 +24,6 @@ from ...extras.ploting import plot_loss
 from ...hparams import ModelArguments
 from ...model import load_model, load_tokenizer
 from ..trainer_utils import create_modelcard_and_push, create_ref_model
-from .trainer import CustomDPOTrainer


 if TYPE_CHECKING:
@@ -63,6 +62,16 @@ def run_dpo(
    else:
        ref_model = None
        
+        
+    if model_args.use_kt:
+        from ktransformers.util.globals import GLOBAL_CONFIG
+
+        GLOBAL_CONFIG._config["mod"] = "sft"
+        
+        from .ktrainer import CustomDPOTrainer
+    else:
+        from .trainer import CustomDPOTrainer
+
    # Initialize our Trainer
    trainer = CustomDPOTrainer(
        model=model,
--- a/src/llamafactory/train/ksft/__init__.py
+++ b/src/llamafactory/train/ksft/__init__.py
@@ -1,18 +0,0 @@
-# Copyright 2025 the KVCache.AI team, Approaching AI, and the LlamaFactory team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .workflow import run_sft
-
-
-__all__ = ["run_sft"]
--- a/src/llamafactory/train/ksft/workflow.py
+++ b/src/llamafactory/train/ksft/workflow.py
@@ -1,113 +0,0 @@
-# Copyright 2025 the KVCache.AI team, Approaching AI, and the LlamaFactory team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import TYPE_CHECKING, Optional
-
-from ...data import SFTDataCollatorWith4DAttentionMask, get_dataset, get_template_and_fix_tokenizer
-from ...extras.constants import IGNORE_INDEX
-from ...extras.logging import get_logger
-from ...extras.misc import calculate_tps
-from ...extras.ploting import plot_loss
-from ...model import load_model, load_tokenizer
-from ..trainer_utils import create_modelcard_and_push
-
-
-if TYPE_CHECKING:
-    from transformers import Seq2SeqTrainingArguments, TrainerCallback
-
-    from ...hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments
-
-
-logger = get_logger(__name__)
-
-
-def run_sft(
-    model_args: "ModelArguments",
-    data_args: "DataArguments",
-    training_args: "Seq2SeqTrainingArguments",
-    finetuning_args: "FinetuningArguments",
-    generating_args: "GeneratingArguments",
-    callbacks: Optional[list["TrainerCallback"]] = None,
-):
-    tokenizer_module = load_tokenizer(model_args)
-    tokenizer = tokenizer_module["tokenizer"]
-    template = get_template_and_fix_tokenizer(tokenizer, data_args)
-    dataset_module = get_dataset(template, model_args, data_args, training_args, stage="sft", **tokenizer_module)
-    model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train)
-
-    from ktransformers.util.globals import GLOBAL_CONFIG
-
-    GLOBAL_CONFIG._config["mod"] = "sft"
-
-    if getattr(model, "is_quantized", False) and not training_args.do_train:
-        setattr(model, "_hf_peft_config_loaded", True)  # hack here: make model compatible with prediction
-
-    data_collator = SFTDataCollatorWith4DAttentionMask(
-        template=template,
-        model=model if not training_args.predict_with_generate else None,
-        pad_to_multiple_of=8 if training_args.do_train else None,  # for shift short attention
-        label_pad_token_id=IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id,
-        block_diag_attn=model_args.block_diag_attn,
-        attn_implementation=getattr(model.config, "_attn_implementation", None),
-        compute_dtype=model_args.compute_dtype,
-        **tokenizer_module,
-    )
-
-    # Metric utils
-    metric_module = {}
-    if training_args.predict_with_generate:
-        raise NotImplementedError("`predict_with_generate` is not supported in KTransformers SFT yet.")
-    elif finetuning_args.compute_accuracy:
-        raise NotImplementedError("`compute_accuracy` is not supported in KTransformers SFT yet.")
-
-    # Initialize our Trainer
-    from ktransformers.sft.lora import KTrainer
-
-    trainer = KTrainer(
-        model=model,
-        args=training_args,
-        tokenizer=tokenizer_module,
-        data_collator=data_collator,
-        callbacks=callbacks,
-        **dataset_module,
-        **metric_module,
-    )
-    trainer.model_accepts_loss_kwargs = False
-
-    # Training
-    if training_args.do_train:
-        model.config.use_cache = False
-        train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
-        trainer.save_model()
-        if finetuning_args.include_effective_tokens_per_second:
-            train_result.metrics["effective_tokens_per_sec"] = calculate_tps(
-                dataset_module["train_dataset"], train_result.metrics, stage="sft"
-            )
-
-        trainer.log_metrics("train", train_result.metrics)
-        trainer.save_metrics("train", train_result.metrics)
-        trainer.save_state()
-        if trainer.is_world_process_zero() and finetuning_args.plot_loss:
-            keys = ["loss"]
-            if isinstance(dataset_module.get("eval_dataset"), dict):
-                keys += sum(
-                    [[f"eval_{key}_loss", f"eval_{key}_accuracy"] for key in dataset_module["eval_dataset"].keys()], []
-                )
-            else:
-                keys += ["eval_loss", "eval_accuracy"]
-
-            plot_loss(training_args.output_dir, keys=keys)
-
-    # Create model card
-    create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args)
--- a/src/llamafactory/train/sft/workflow.py
+++ b/src/llamafactory/train/sft/workflow.py
@@ -68,6 +68,12 @@ def run_sft(

    # Metric utils
    metric_module = {}
+    if model_args.use_kt:
+        if training_args.predict_with_generate:
+            raise NotImplementedError("`predict_with_generate` is not supported in KTransformers SFT yet.")
+        elif finetuning_args.compute_accuracy:
+            raise NotImplementedError("`compute_accuracy` is not supported in KTransformers SFT yet.")
+    
    if training_args.predict_with_generate:
        metric_module["compute_metrics"] = ComputeSimilarity(tokenizer=tokenizer)
    elif finetuning_args.compute_accuracy:
@@ -92,17 +98,36 @@ def run_sft(
    gen_kwargs["pad_token_id"] = tokenizer.pad_token_id

    # Initialize our Trainer
-    trainer = CustomSeq2SeqTrainer(
-        model=model,
-        args=training_args,
-        finetuning_args=finetuning_args,
-        data_collator=data_collator,
-        callbacks=callbacks,
-        gen_kwargs=gen_kwargs,
-        **dataset_module,
-        **tokenizer_module,
-        **metric_module,
-    )
+    if model_args.use_kt:
+        from ktransformers.util.globals import GLOBAL_CONFIG
+        from ktransformers.sft.lora import KTrainer
+
+        GLOBAL_CONFIG._config["mod"] = "sft"
+
+        trainer = KTrainer(
+            model=model,
+            args=training_args,
+            tokenizer=tokenizer_module,
+            data_collator=data_collator,
+            callbacks=callbacks,
+            **dataset_module,
+            **metric_module,
+        )
+        trainer.model_accepts_loss_kwargs = False
+        model.config.use_cache = False
+
+    else:
+        trainer = CustomSeq2SeqTrainer(
+            model=model,
+            args=training_args,
+            finetuning_args=finetuning_args,
+            data_collator=data_collator,
+            callbacks=callbacks,
+            gen_kwargs=gen_kwargs,
+            **dataset_module,
+            **tokenizer_module,
+            **metric_module,
+        )

    # Training
    if training_args.do_train:
--- a/src/llamafactory/train/tuner.py
+++ b/src/llamafactory/train/tuner.py
@@ -85,13 +85,7 @@ def _training_function(config: dict[str, Any]) -> None:
    elif finetuning_args.stage == "pt":
        run_pt(model_args, data_args, training_args, finetuning_args, callbacks)
    elif finetuning_args.stage == "sft":
-        if model_args.use_kt:
-            from .ksft.workflow import run_sft as run_sft_kt
-
-            run_sft_kt(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
-        else:
-            run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
-
+        run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
    elif finetuning_args.stage == "rm":
        run_rm(model_args, data_args, training_args, finetuning_args, callbacks)
    elif finetuning_args.stage == "ppo":
Author	SHA1	Message	Date
Xunpeng Xiao	8c74dca76a	[feat] Models trained and inferred with FP8 are dequantized by default (#9627 )	2025-12-18 22:54:35 +08:00
xvxuopop	e8deda53a1	[example] add Qwen3 series examples (#9624 ) Co-authored-by: UsernameFull <tohowtodoit@gmail.com>	2025-12-18 21:27:00 +08:00
mrhaoxx	a769fb94b9	[feat] support ktransformers for dpo (#9621 ) Co-authored-by: poryfly <porykid@gmail.com>	2025-12-18 21:26:25 +08:00
mrhaoxx	964569751f	[kt] refactor ktransformers integration (#9632 )	2025-12-18 21:26:04 +08:00