mirror of https://github.com/hiyouga/LLaMA-Factory.git (synced 2026-01-12 17:10:36 +08:00)
[v1] upgrade batching (#9751)
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
.github/instructions-v0.md (vendored, new file, 0 lines)
.github/instructions-v1.md (vendored, new file, 0 lines)
scripts/convert_ckpt/tiny_qwen3.py (new file, 32 lines)
@@ -0,0 +1,32 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from transformers import AutoTokenizer, Qwen3Config, Qwen3ForCausalLM
+
+
+if __name__ == "__main__":
+    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B-Instruct-2507")
+    config = Qwen3Config(
+        hidden_size=1408,
+        image_size=336,
+        intermediate_size=5632,
+        num_attention_heads=16,
+        num_hidden_layers=4,
+        vision_output_dim=4096,
+    )
+    model = Qwen3ForCausalLM.from_config(config)
+    model.save_pretrained("tiny-qwen3")
+    tokenizer.save_pretrained("tiny-qwen3")
+    model.push_to_hub("llamafactory/tiny-random-qwen3")
+    tokenizer.push_to_hub("llamafactory/tiny-random-qwen3")
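The script above publishes a 4-layer, randomly initialized Qwen3 checkpoint used as a test fixture. A minimal smoke-test sketch for loading it back (the repo id comes from the script; the prompt and generation settings are illustrative):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("llamafactory/tiny-random-qwen3")
model = AutoModelForCausalLM.from_pretrained("llamafactory/tiny-random-qwen3")

inputs = tokenizer("Hello", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=8)  # random weights, so output is gibberish
print(tokenizer.decode(outputs[0]))
```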
@@ -34,30 +34,29 @@ from ...accelerator.interface import DistributedInterface
 from ...config import BatchingStrategy
 from ...utils import logging
 from ...utils.helper import pad_and_truncate
-from ...utils.types import BatchInput, ModelInput, TorchDataset
+from ...utils.objects import StatefulBuffer
+from ...utils.types import BatchInfo, BatchInput, ModelInput, TorchDataset
 from .rendering import Renderer


 logger = logging.get_logger(__name__)


-def default_collate_fn(
-    buffer: list[ModelInput], buffer_tokens: int, micro_batch_size: int, num_micro_batch: int, cutoff_len: int
-) -> tuple[list[ModelInput], int, list[BatchInput]]:
+def default_collate_fn(buffer: StatefulBuffer, batch_info: BatchInfo) -> list[BatchInput] | None:
+    micro_batch_size = batch_info["micro_batch_size"]
+    num_micro_batch = batch_info["num_micro_batch"]
+    cutoff_len = batch_info["cutoff_len"]
     batch_size = micro_batch_size * num_micro_batch
     if len(buffer) < batch_size:
-        return buffer, buffer_tokens, None
+        return None

-    samples = buffer[:batch_size]
-    buffer = buffer[batch_size:]
-    buffer_tokens -= sum(len(sample["input_ids"]) for sample in samples)
-
+    samples = buffer.get(batch_size)
     batch = []
     for i in range(num_micro_batch):
         micro_batch = samples[i * micro_batch_size : (i + 1) * micro_batch_size]
         batch.append(default_collate(pad_and_truncate(micro_batch, cutoff_len)))

-    return buffer, buffer_tokens, batch
+    return batch


 class BatchGenerator(Iterator):
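The slicing that default_collate_fn performs is easy to see in isolation. Below is a standalone sketch of just that logic, with plain lists standing in for StatefulBuffer and the pad/collate step reduced to a comment (all sample data is made up):

```python
samples = [{"input_ids": list(range(i + 1))} for i in range(8)]  # fake tokenized samples
micro_batch_size, num_micro_batch = 2, 4
batch_size = micro_batch_size * num_micro_batch  # samples consumed per global batch

batch = []
for i in range(num_micro_batch):
    micro_batch = samples[i * micro_batch_size : (i + 1) * micro_batch_size]
    batch.append(micro_batch)  # the real code pads, truncates, and collates to tensors here

assert len(batch) == num_micro_batch
assert all(len(mb) == micro_batch_size for mb in batch)
```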
@@ -105,9 +104,14 @@ class BatchGenerator(Iterator):

         self._is_resuming: bool = False
         self._data_iter = iter(self._data_provider)
-        self._buffer: list[ModelInput] = []
-        self._buffer_tokens: int = 0
-        self._max_buffer_tokens: int = self.micro_batch_size * self.num_micro_batch * self.cutoff_len
+        self._buffer = StatefulBuffer()
+        self._batch_info: BatchInfo = {
+            "micro_batch_size": self.micro_batch_size,
+            "num_micro_batch": self.num_micro_batch,
+            "cutoff_len": self.cutoff_len,
+            "data_iter": self._data_iter,
+        }

         logger.info_rank0(
             f"Init unified data loader with global batch size {self.global_batch_size}, "
@@ -145,7 +149,7 @@ class BatchGenerator(Iterator):
         else:
             from ...plugins.trainer_plugins.batching import BatchingPlugin

-            self._length = BatchingPlugin(self.batching_strategy).compute_length()
+            self._length = BatchingPlugin(self.batching_strategy).compute_length(self._data_provider)
             raise NotImplementedError("Batching strategy other than NORMAL is not supported yet.")

     def __len__(self) -> int:
@@ -161,38 +165,34 @@ class BatchGenerator(Iterator):
         return self

     def __next__(self):
-        batch = self._next_batch()
+        self._fill_buffer()
+        batch = self._generate_batch()
         if batch is None:
             raise StopIteration

         return batch

-    def _next_batch(self) -> list[BatchInput] | None:
-        while self._buffer_tokens < self._max_buffer_tokens:
-            try:
-                samples: list[ModelInput] = next(self._data_iter)
-            except StopIteration:
-                break
-
-            num_tokens = sum(len(sample["input_ids"]) for sample in samples)
-            self._buffer.extend(samples)
-            self._buffer_tokens += num_tokens
-
-        return self._build_batch()
-
-    def _build_batch(self) -> list[BatchInput] | None:
+    def _fill_buffer(self) -> None:
         if self.batching_strategy == BatchingStrategy.NORMAL:
-            self._buffer, self._buffer_tokens, batch = default_collate_fn(
-                self._buffer, self._buffer_tokens, self.micro_batch_size, self.num_micro_batch, self.cutoff_len
-            )
-            return batch
+            while len(self._buffer) < self.micro_batch_size * self.num_micro_batch:
+                try:
+                    samples: list[ModelInput] = next(self._data_iter)
+                except StopIteration:
+                    break
+
+                self._buffer.put(samples)
         else:
             from ...plugins.trainer_plugins.batching import BatchingPlugin

-            self._buffer, self._buffer_tokens, batch = BatchingPlugin(self.batching_strategy)(
-                self._buffer, self._buffer_tokens, self.micro_batch_size, self.num_micro_batch, self.cutoff_len
-            )
-            return batch
+            BatchingPlugin(self.batching_strategy).fill_buffer(self._buffer, self._batch_info)
+
+    def _generate_batch(self) -> list[BatchInput] | None:
+        if self.batching_strategy == BatchingStrategy.NORMAL:
+            return default_collate_fn(self._buffer, self._batch_info)
+        else:
+            from ...plugins.trainer_plugins.batching import BatchingPlugin
+
+            return BatchingPlugin(self.batching_strategy).generate_batch(self._buffer, self._batch_info)

     def state_dict(self) -> dict[str, Any]:
         return {
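The old single _next_batch is now split into a fill phase (top up the StatefulBuffer from the data iterator) and a generate phase (collate a full batch or return None), which is what lets a BatchingPlugin override either half. A toy, self-contained sketch of that iterator shape (names and sizes are illustrative, not the project's API):

```python
class TwoPhaseIterator:
    """Toy version of the fill/generate split in BatchGenerator.__next__."""

    def __init__(self, data, batch_size):
        self._iter = iter(data)
        self._buffer = []
        self._batch_size = batch_size

    def __iter__(self):
        return self

    def __next__(self):
        # Fill phase: top up the buffer until a full batch is available.
        while len(self._buffer) < self._batch_size:
            try:
                self._buffer.append(next(self._iter))
            except StopIteration:
                break

        # Generate phase: emit a full batch, or stop (trailing partials are dropped).
        if len(self._buffer) < self._batch_size:
            raise StopIteration
        batch, self._buffer = self._buffer[: self._batch_size], self._buffer[self._batch_size :]
        return batch


print(list(TwoPhaseIterator(range(5), 2)))  # [[0, 1], [2, 3]], the trailing 4 is dropped
```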
@@ -22,7 +22,19 @@ from ...utils.types import Message, ModelInput, Processor, ToolCall


 class RenderingPlugin(BasePlugin):
-    pass
+    def render_messages(
+        self,
+        processor: Processor,
+        messages: list[Message],
+        tools: str | None = None,
+        is_generate: bool = False,
+    ) -> ModelInput:
+        """Render messages in the template format."""
+        return self["render_messages"](processor, messages, tools, is_generate)
+
+    def parse_messages(self, generated_text: str) -> Message:
+        """Parse messages in the template format."""
+        return self["parse_messages"](generated_text)


 def _update_model_input(
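Both methods dispatch through the BasePlugin registry: self["render_messages"] resolves a function registered under the plugin's name. A hedged sketch of how a template might be wired up; the "chatml" name is taken from the tests later in this commit, while the import path and the function body are assumptions:

```python
from llamafactory.v1.core.utils.rendering import RenderingPlugin  # path inferred from the test layout


@RenderingPlugin("chatml").register("render_messages")
def chatml_render_messages(processor, messages, tools=None, is_generate=False):
    # Hypothetical: build the chatml prompt, tokenize it with `processor`,
    # and return a ModelInput dict such as {"input_ids": [...], "labels": [...]}.
    ...


# RenderingPlugin("chatml").render_messages(...) would then route through the
# registry to chatml_render_messages.
```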
@@ -12,8 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from ...utils.objects import StatefulBuffer
 from ...utils.plugin import BasePlugin
+from ...utils.types import BatchInfo, BatchInput, DataLoader


 class BatchingPlugin(BasePlugin):
-    pass
+    def compute_length(self, dataloader: DataLoader) -> int:
+        """Compute the length of the batch generator."""
+        raise NotImplementedError()
+
+    def fill_buffer(self, buffer: StatefulBuffer, batch_info: BatchInfo) -> None:
+        """Fill the buffer with data."""
+        raise NotImplementedError()
+
+    def generate_batch(self, buffer: StatefulBuffer, batch_info: BatchInfo) -> list[BatchInput] | None:
+        """Generate a batch from the buffer."""
+        raise NotImplementedError()
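The three stubs mirror the call sites above: compute_length backs __len__, while fill_buffer and generate_batch back the two phases of __next__. The diff does not show how a concrete strategy is supplied, so the subclass below is purely illustrative (the token-budget logic and every name in it are assumptions):

```python
from llamafactory.v1.plugins.trainer_plugins.batching import BatchingPlugin


class TokenBudgetBatching(BatchingPlugin):
    """Invented example: fill the buffer until a token budget is reached."""

    def fill_buffer(self, buffer, batch_info):
        budget = batch_info["micro_batch_size"] * batch_info["num_micro_batch"] * batch_info["cutoff_len"]
        while buffer.size < budget:  # StatefulBuffer.size counts buffered tokens
            try:
                buffer.put(next(batch_info["data_iter"]))
            except StopIteration:
                break
```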
src/llamafactory/v1/utils/objects.py (new file, 67 lines)
@@ -0,0 +1,67 @@
+# Copyright 2025 Optuna, HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by the HuggingFace's transformers library.
+# https://github.com/huggingface/transformers/blob/v5.0.0rc0/src/transformers/utils/logging.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .types import ModelInput
+
+
+class StatefulBuffer:
+    """A buffer that stores model inputs."""
+
+    def __init__(self, max_buffer_size: int = 1_000_000_000) -> None:
+        self._buffer: list[ModelInput] = []
+        self._buffer_size: int = 0
+        self._max_buffer_size: int = max_buffer_size
+
+    def __len__(self) -> int:
+        return len(self._buffer)
+
+    @property
+    def size(self) -> int:
+        return self._buffer_size
+
+    def put(self, samples: list[ModelInput]) -> None:
+        """Add samples to the buffer."""
+        num_tokens = sum(len(sample["input_ids"]) for sample in samples)
+        if self._buffer_size + num_tokens > self._max_buffer_size:
+            raise ValueError(f"Buffer size exceeds max buffer size {self._max_buffer_size}.")
+
+        self._buffer.extend(samples)
+        self._buffer_size += num_tokens
+
+    def get(self, value: int) -> list[ModelInput]:
+        """Get samples from the buffer and remove them."""
+        samples = self._buffer[:value]
+        self._buffer_size -= sum(len(sample["input_ids"]) for sample in samples)
+        del self._buffer[:value]
+        return samples
+
+    def clear(self) -> None:
+        """Clear the buffer."""
+        self._buffer = []
+        self._buffer_size = 0
+
+    def state_dict(self) -> dict:
+        """Returns the state of the buffer."""
+        return {
+            "buffer": self._buffer,
+            "buffer_size": self._buffer_size,
+        }
+
+    def load_state_dict(self, state_dict: dict) -> None:
+        """Loads the state into the buffer."""
+        self._buffer = state_dict["buffer"]
+        self._buffer_size = state_dict["buffer_size"]
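A usage sketch for the class above; the sample dicts and sizes are made up, and the import path follows the new file's location:

```python
from llamafactory.v1.utils.objects import StatefulBuffer

buf = StatefulBuffer(max_buffer_size=100)
buf.put([{"input_ids": [1, 2, 3]}, {"input_ids": [4, 5]}])
assert len(buf) == 2 and buf.size == 5   # 2 samples, 5 buffered tokens

first = buf.get(1)                       # pops the oldest sample
assert first[0]["input_ids"] == [1, 2, 3] and buf.size == 2

state = buf.state_dict()                 # {"buffer": [...], "buffer_size": 2}
buf.clear()
buf.load_state_dict(state)               # restores samples and token count for resume
assert buf.size == 2
```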
@@ -15,6 +15,7 @@

 from collections import defaultdict
 from collections.abc import Callable
+from typing import Any

 from . import logging

@@ -26,33 +27,37 @@ class BasePlugin:
     """Base class for plugins.

     A plugin is a callable object that can be registered and called by name.

+    Example usage:
+    ```python
+    class PrintPlugin(BasePlugin):
+        def again(self):  # optional
+            self["again"]()
+
+
+    @PrintPlugin("hello").register()
+    def print_hello():
+        print("Hello world!")
+
+
+    @PrintPlugin("hello").register("again")
+    def print_hello_again():
+        print("Hello world! Again.")
+
+
+    PrintPlugin("hello")()
+    PrintPlugin("hello").again()
+    ```
     """

     _registry: dict[str, dict[str, Callable]] = defaultdict(dict)

-    def __init__(self, name: str | None = None):
-        """Initialize the plugin with a name.
-
-        Args:
-            name (str): The name of the plugin.
-        """
+    def __init__(self, name: str | None = None) -> None:
+        """Initialize the plugin with a name."""
         self.name = name

-    def register(self, method_name: str = "__call__"):
-        """Decorator to register a function as a plugin.
-
-        Example usage:
-        ```python
-        @PrintPlugin("hello").register()
-        def print_hello():
-            print("Hello world!")
-
-
-        @PrintPlugin("hello").register("again")
-        def print_hello_again():
-            print("Hello world! Again.")
-        ```
-        """
+    def register(self, method_name: str = "__call__") -> Callable:
+        """Decorator to register a function as a plugin."""
         if self.name is None:
             raise ValueError("Plugin name should be specified.")
@@ -65,27 +70,16 @@ class BasePlugin:

         return decorator

-    def __call__(self, *args, **kwargs):
-        """Call the registered function with the given arguments.
-
-        Example usage:
-        ```python
-        PrintPlugin("hello")()
-        ```
-        """
-        if "__call__" not in self._registry[self.name]:
-            raise ValueError(f"Method __call__ of plugin {self.name} is not registered.")
-
-        return self._registry[self.name]["__call__"](*args, **kwargs)
-
-    def __getattr__(self, method_name: str):
-        """Get the registered function with the given name.
-
-        Example usage:
-        ```python
-        PrintPlugin("hello").again()
-        ```
-        """
+    def __call__(self, *args, **kwargs) -> Any:
+        """Call the registered function with the given arguments."""
+        return self["__call__"](*args, **kwargs)
+
+    def __getattr__(self, method_name: str) -> Callable:
+        """Get the registered function with the given name."""
+        return self[method_name]
+
+    def __getitem__(self, method_name: str) -> Callable:
+        """Get the registered function with the given name."""
         if method_name not in self._registry[self.name]:
             raise ValueError(f"Method {method_name} of plugin {self.name} is not registered.")
@@ -98,7 +92,8 @@ if __name__ == "__main__":
     """

     class PrintPlugin(BasePlugin):
-        pass
+        def again(self):  # optional
+            self["again"]()

     @PrintPlugin("hello").register()
     def print_hello():
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from collections.abc import Iterator
 from typing import TYPE_CHECKING, Any, Literal, NotRequired, TypedDict, Union

@@ -161,3 +162,14 @@ class BatchInput(TypedDict, total=False):
     """Position ids for the model (optional)."""
     token_type_ids: NotRequired[Tensor]
     """Token type ids used in DPO, 0 represents the chosen messages, 1 represents the rejected messages."""
+
+
+class BatchInfo(TypedDict):
+    micro_batch_size: int
+    """Micro batch size."""
+    num_micro_batch: int
+    """Number of micro batches."""
+    cutoff_len: int
+    """Cutoff length."""
+    data_iter: Iterator[list[ModelInput]]
+    """Data iterator."""
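Since BatchInfo is a TypedDict, it is a plain dict at runtime; BatchGenerator.__init__ earlier in this diff builds it exactly this way (the values below are illustrative):

```python
from collections.abc import Iterator

# Stand-in for the project's data iterator, which yields lists of tokenized samples.
data_iter: Iterator[list[dict]] = iter([])

batch_info = {
    "micro_batch_size": 2,  # samples per micro batch
    "num_micro_batch": 4,   # micro batches per optimizer step
    "cutoff_len": 2048,     # maximum sequence length after truncation
    "data_iter": data_iter,
}
```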
@@ -58,3 +58,10 @@ def test_multi_device():
     master_port = find_available_port()
     world_size = 2
     mp.spawn(_all_reduce_tests, args=(world_size, master_port), nprocs=world_size)
+
+
+if __name__ == "__main__":
+    """
+    python tests_v1/accelerator/test_interface.py
+    """
+    test_all_device()
@@ -12,41 +12,41 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import pathlib
 import sys
+from pathlib import Path
 from unittest.mock import patch

 from llamafactory.v1.config.arg_parser import get_args


-def test_get_args_from_yaml(tmp_path: pathlib.Path):
+def test_get_args_from_yaml(tmp_path: Path):
     config_yaml = """
 ### model
-model: "llamafactory/tiny-random-qwen2.5"
+model: llamafactory/tiny-random-qwen3
 trust_remote_code: true
-model_class: "llm"
+model_class: llm
 kernel_config:
-  name: "auto"
-  include_kernels: "auto" # choice: null/true/false/auto/kernel_id1,kernel_id2,kernel_id3, default is null
+  name: auto
+  include_kernels: auto # choice: null/true/false/auto/kernel_id1,kernel_id2,kernel_id3, default is null
 peft_config:
-  name: "lora"
+  name: lora
   lora_rank: 0.8
 quant_config: null

 ### data
-dataset: "llamafactory/tiny-supervised-dataset"
-cutoff_len: 2048
+dataset: llamafactory/v1-sft-demo

 ### training
-output_dir: "outputs/test_run"
+output_dir: outputs/test_run
 micro_batch_size: 1
 global_batch_size: 1
+cutoff_len: 2048
 learning_rate: 1.0e-4
 bf16: false
 dist_config: null

 ### sample
-sample_backend: "hf"
+sample_backend: hf
 max_new_tokens: 128
 """
@@ -57,14 +57,26 @@ def test_get_args_from_yaml(tmp_path: pathlib.Path):

     with patch.object(sys, "argv", test_argv):
         data_args, model_args, training_args, sample_args = get_args()
+    assert data_args.dataset == "llamafactory/v1-sft-demo"
+    assert model_args.model == "llamafactory/tiny-random-qwen3"
+    assert model_args.kernel_config.name == "auto"
+    assert model_args.kernel_config.get("include_kernels") == "auto"
+    assert model_args.peft_config.name == "lora"
+    assert model_args.peft_config.get("lora_rank") == 0.8
     assert training_args.output_dir == "outputs/test_run"
     assert training_args.micro_batch_size == 1
     assert training_args.global_batch_size == 1
     assert training_args.learning_rate == 1.0e-4
     assert training_args.bf16 is False
     assert training_args.dist_config is None
-    assert model_args.model == "llamafactory/tiny-random-qwen2.5"
-    assert model_args.kernel_config.name == "auto"
-    assert model_args.kernel_config.get("include_kernels") == "auto"
-    assert model_args.peft_config.name == "lora"
-    assert model_args.peft_config.get("lora_rank") == 0.8
+    assert sample_args.sample_backend == "hf"
+
+
+if __name__ == "__main__":
+    """
+    python -m tests_v1.config.test_args_parser
+    """
+    import tempfile
+
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        test_get_args_from_yaml(tmp_path=Path(tmp_dir))
@@ -33,4 +33,7 @@ def test_map_dataset(num_samples: int):


 if __name__ == "__main__":
+    """
+    python -m tests_v1.core.test_data_engine
+    """
     test_map_dataset(1)
@@ -44,5 +44,8 @@ def test_tiny_qwen_with_kernel_plugin():


 if __name__ == "__main__":
+    """
+    python -m tests_v1.core.test_model_loader
+    """
     test_tiny_qwen()
     test_tiny_qwen_with_kernel_plugin()
@@ -46,4 +46,7 @@ def test_normal_batching():


 if __name__ == "__main__":
+    """
+    python -m tests_v1.core.utils.test_batching
+    """
     test_normal_batching()
@@ -219,6 +219,9 @@ def test_process_dpo_samples():


 if __name__ == "__main__":
+    """
+    python -m tests_v1.core.utils.test_rendering
+    """
     test_chatml_rendering()
     test_chatml_parse()
     test_chatml_rendering_remote(16)
@@ -120,6 +120,9 @@ def test_pair_converter(num_samples: int):


 if __name__ == "__main__":
+    """
+    python -m tests_v1.plugins.data_plugins.test_converter
+    """
     test_alpaca_converter(1)
     test_sharegpt_converter()
     test_pair_converter(1)
@@ -52,3 +52,12 @@ def test_init_on_default():
     )
     model_engine = ModelEngine(model_args=model_args)
     assert model_engine.model.device == DistributedInterface().current_device
+
+
+if __name__ == "__main__":
+    """
+    python tests_v1/plugins/model_plugins/test_init_plugin.py
+    """
+    test_init_on_meta()
+    test_init_on_rank0()
+    test_init_on_default()
@@ -38,4 +38,7 @@ def test_sync_sampler():


 if __name__ == "__main__":
+    """
+    python tests_v1/sampler/test_cli_sampler.py
+    """
     test_sync_sampler()