[v1] upgrade batching (#9751)

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Author: Yaowei Zheng
Date: 2026-01-12 00:21:36 +08:00 (committed by GitHub)
parent 15b87f3125
commit a296723697
18 changed files with 273 additions and 97 deletions

.github/instructions-v0.md (vendored, new file)

.github/instructions-v1.md (vendored, new file)

View File

@@ -0,0 +1,32 @@
# Copyright 2025 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from transformers import AutoTokenizer, Qwen3Config, Qwen3ForCausalLM


if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B-Instruct-2507")
    config = Qwen3Config(  # text-only config with tiny dimensions
        hidden_size=1408,
        intermediate_size=5632,
        num_attention_heads=16,
        num_hidden_layers=4,
    )
    model = Qwen3ForCausalLM(config)  # built directly from the config; weights are randomly initialized
    model.save_pretrained("tiny-qwen3")
    tokenizer.save_pretrained("tiny-qwen3")
    model.push_to_hub("llamafactory/tiny-random-qwen3")
    tokenizer.push_to_hub("llamafactory/tiny-random-qwen3")
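
A quick way to sanity-check the pushed checkpoint is to load it back through the Auto classes; a minimal sketch (the prompt is illustrative, and since the weights are random the generation is noise):

```python
# Smoke-test the tiny checkpoint produced above (illustrative only).
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("llamafactory/tiny-random-qwen3")
model = AutoModelForCausalLM.from_pretrained("llamafactory/tiny-random-qwen3")

inputs = tokenizer("Hello", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=8)  # random weights, so the output is gibberish
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```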

View File

@@ -34,30 +34,29 @@ from ...accelerator.interface import DistributedInterface
 from ...config import BatchingStrategy
 from ...utils import logging
 from ...utils.helper import pad_and_truncate
-from ...utils.types import BatchInput, ModelInput, TorchDataset
+from ...utils.objects import StatefulBuffer
+from ...utils.types import BatchInfo, BatchInput, ModelInput, TorchDataset
 from .rendering import Renderer


 logger = logging.get_logger(__name__)


-def default_collate_fn(
-    buffer: list[ModelInput], buffer_tokens: int, micro_batch_size: int, num_micro_batch: int, cutoff_len: int
-) -> tuple[list[ModelInput], int, list[BatchInput]]:
+def default_collate_fn(buffer: StatefulBuffer, batch_info: BatchInfo) -> list[BatchInput] | None:
+    micro_batch_size = batch_info["micro_batch_size"]
+    num_micro_batch = batch_info["num_micro_batch"]
+    cutoff_len = batch_info["cutoff_len"]
     batch_size = micro_batch_size * num_micro_batch
     if len(buffer) < batch_size:
-        return buffer, buffer_tokens, None
+        return None

-    samples = buffer[:batch_size]
-    buffer = buffer[batch_size:]
-    buffer_tokens -= sum(len(sample["input_ids"]) for sample in samples)
+    samples = buffer.get(batch_size)
     batch = []
     for i in range(num_micro_batch):
         micro_batch = samples[i * micro_batch_size : (i + 1) * micro_batch_size]
         batch.append(default_collate(pad_and_truncate(micro_batch, cutoff_len)))

-    return buffer, buffer_tokens, batch
+    return batch


 class BatchGenerator(Iterator):
@@ -105,9 +104,14 @@ class BatchGenerator(Iterator):
         self._is_resuming: bool = False
         self._data_iter = iter(self._data_provider)
-        self._buffer: list[ModelInput] = []
-        self._buffer_tokens: int = 0
-        self._max_buffer_tokens: int = self.micro_batch_size * self.num_micro_batch * self.cutoff_len
+        self._buffer = StatefulBuffer()
+        self._batch_info: BatchInfo = {
+            "micro_batch_size": self.micro_batch_size,
+            "num_micro_batch": self.num_micro_batch,
+            "cutoff_len": self.cutoff_len,
+            "data_iter": self._data_iter,
+        }

         logger.info_rank0(
             f"Init unified data loader with global batch size {self.global_batch_size}, "
@@ -145,7 +149,7 @@
         else:
             from ...plugins.trainer_plugins.batching import BatchingPlugin

-            self._length = BatchingPlugin(self.batching_strategy).compute_length()
+            self._length = BatchingPlugin(self.batching_strategy).compute_length(self._data_provider)
             raise NotImplementedError("Batching strategy other than NORMAL is not supported yet.")

     def __len__(self) -> int:
@@ -161,38 +165,34 @@ class BatchGenerator(Iterator):
         return self

     def __next__(self):
-        batch = self._next_batch()
+        self._fill_buffer()
+        batch = self._generate_batch()
         if batch is None:
             raise StopIteration

         return batch

-    def _next_batch(self) -> list[BatchInput] | None:
-        while self._buffer_tokens < self._max_buffer_tokens:
-            try:
-                samples: list[ModelInput] = next(self._data_iter)
-            except StopIteration:
-                break
-
-            num_tokens = sum(len(sample["input_ids"]) for sample in samples)
-            self._buffer.extend(samples)
-            self._buffer_tokens += num_tokens
-
-        return self._build_batch()
-
-    def _build_batch(self) -> list[BatchInput] | None:
+    def _fill_buffer(self) -> None:
         if self.batching_strategy == BatchingStrategy.NORMAL:
-            self._buffer, self._buffer_tokens, batch = default_collate_fn(
-                self._buffer, self._buffer_tokens, self.micro_batch_size, self.num_micro_batch, self.cutoff_len
-            )
-            return batch
+            while len(self._buffer) < self.micro_batch_size * self.num_micro_batch:
+                try:
+                    samples: list[ModelInput] = next(self._data_iter)
+                except StopIteration:
+                    break
+
+                self._buffer.put(samples)
         else:
             from ...plugins.trainer_plugins.batching import BatchingPlugin

-            self._buffer, self._buffer_tokens, batch = BatchingPlugin(self.batching_strategy)(
-                self._buffer, self._buffer_tokens, self.micro_batch_size, self.num_micro_batch, self.cutoff_len
-            )
-            return batch
+            BatchingPlugin(self.batching_strategy).fill_buffer(self._buffer, self._batch_info)
+
+    def _generate_batch(self) -> list[BatchInput] | None:
+        if self.batching_strategy == BatchingStrategy.NORMAL:
+            return default_collate_fn(self._buffer, self._batch_info)
+        else:
+            from ...plugins.trainer_plugins.batching import BatchingPlugin
+
+            return BatchingPlugin(self.batching_strategy).generate_batch(self._buffer, self._batch_info)

     def state_dict(self) -> dict[str, Any]:
         return {
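
To see the new split concretely, here is a self-contained sketch of the same control flow with plain lists standing in for StatefulBuffer and BatchInfo (padding and default_collate omitted; all names here are illustrative):

```python
from collections.abc import Iterator


def fill_buffer(buffer: list[dict], data_iter: Iterator[list[dict]], target: int) -> None:
    """Pull lists of samples from the iterator until one full batch is buffered."""
    while len(buffer) < target:
        try:
            buffer.extend(next(data_iter))
        except StopIteration:
            break  # iterator exhausted; generate_batch returns None and iteration stops


def generate_batch(buffer: list[dict], micro_batch_size: int, num_micro_batch: int) -> list[list[dict]] | None:
    """Pop one global batch from the buffer and split it into micro-batches."""
    batch_size = micro_batch_size * num_micro_batch
    if len(buffer) < batch_size:
        return None

    samples, buffer[:batch_size] = buffer[:batch_size], []  # in-place pop keeps the caller's list
    return [samples[i * micro_batch_size : (i + 1) * micro_batch_size] for i in range(num_micro_batch)]


# 7 samples arrive, micro_batch_size=2, num_micro_batch=2 -> one 4-sample batch; 3 samples stay buffered.
data = iter([[{"input_ids": [j, j]} for j in range(7)]])
buf: list[dict] = []
fill_buffer(buf, data, target=4)
print(generate_batch(buf, micro_batch_size=2, num_micro_batch=2))
print(len(buf))  # 3
```

The point of the split is that filling and collating become independently pluggable: a strategy plugin can override either side without re-implementing the iterator protocol.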

View File

@@ -22,7 +22,19 @@ from ...utils.types import Message, ModelInput, Processor, ToolCall
 class RenderingPlugin(BasePlugin):
-    pass
+    def render_messages(
+        self,
+        processor: Processor,
+        messages: list[Message],
+        tools: str | None = None,
+        is_generate: bool = False,
+    ) -> ModelInput:
+        """Render messages in the template format."""
+        return self["render_messages"](processor, messages, tools, is_generate)
+
+    def parse_messages(self, generated_text: str) -> Message:
+        """Parse messages in the template format."""
+        return self["parse_messages"](generated_text)


 def _update_model_input(
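
A template implementation registers concrete functions under these method names; a hedged sketch (the import path and the "chatml" plugin id are assumptions for illustration, not the repository's actual wiring):

```python
from llamafactory.v1.plugins.data_plugins.rendering import RenderingPlugin  # assumed path


@RenderingPlugin("chatml").register("render_messages")
def render_chatml(processor, messages, tools=None, is_generate=False):
    # Toy ChatML-style rendering; the real template also handles tools, images, etc.
    text = "".join(f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>\n" for m in messages)
    if is_generate:
        text += "<|im_start|>assistant\n"
    return {"input_ids": processor.tokenizer.encode(text)}


# RenderingPlugin("chatml").render_messages(processor, messages) now dispatches to render_chatml.
```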

View File

@@ -12,8 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from ...utils.objects import StatefulBuffer
 from ...utils.plugin import BasePlugin
+from ...utils.types import BatchInfo, BatchInput, DataLoader


 class BatchingPlugin(BasePlugin):
-    pass
+    def compute_length(self, dataloader: DataLoader) -> int:
+        """Compute the length of the batch generator."""
+        raise NotImplementedError()
+
+    def fill_buffer(self, buffer: StatefulBuffer, batch_info: BatchInfo) -> None:
+        """Fill the buffer with data."""
+        raise NotImplementedError()
+
+    def generate_batch(self, buffer: StatefulBuffer, batch_info: BatchInfo) -> list[BatchInput] | None:
+        """Generate a batch from the buffer."""
+        raise NotImplementedError()
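
Non-NORMAL strategies would register their hooks by name against this interface; a sketch of a hypothetical token-budget strategy (the plugin id and the logic are illustrative, reusing only the BatchInfo keys and StatefulBuffer API introduced in this commit):

```python
# Hypothetical "token_budget" strategy; looked up later via
# BatchingPlugin("token_budget")["fill_buffer"](buffer, batch_info).
@BatchingPlugin("token_budget").register("fill_buffer")
def fill_token_budget(buffer, batch_info):
    # Buffer roughly one global batch worth of tokens rather than a fixed sample count.
    budget = batch_info["micro_batch_size"] * batch_info["num_micro_batch"] * batch_info["cutoff_len"]
    while buffer.size < budget:
        try:
            buffer.put(next(batch_info["data_iter"]))
        except StopIteration:
            break
```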

View File

@@ -0,0 +1,67 @@
# Copyright 2025 Optuna, HuggingFace Inc. and the LlamaFactory team.
#
# This code is inspired by the HuggingFace's transformers library.
# https://github.com/huggingface/transformers/blob/v5.0.0rc0/src/transformers/utils/logging.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .types import ModelInput


class StatefulBuffer:
    """A buffer that stores model inputs."""

    def __init__(self, max_buffer_size: int = 1_000_000_000) -> None:
        self._buffer: list[ModelInput] = []
        self._buffer_size: int = 0
        self._max_buffer_size: int = max_buffer_size

    def __len__(self) -> int:
        return len(self._buffer)

    @property
    def size(self) -> int:
        return self._buffer_size

    def put(self, samples: list[ModelInput]) -> None:
        """Add samples to the buffer."""
        num_tokens = sum(len(sample["input_ids"]) for sample in samples)
        if self._buffer_size + num_tokens > self._max_buffer_size:
            raise ValueError(f"Buffer size exceeds max buffer size {self._max_buffer_size}.")

        self._buffer.extend(samples)
        self._buffer_size += num_tokens

    def get(self, value: int) -> list[ModelInput]:
        """Get samples from the buffer and remove them."""
        samples = self._buffer[:value]
        self._buffer_size -= sum(len(sample["input_ids"]) for sample in samples)
        del self._buffer[:value]
        return samples

    def clear(self) -> None:
        """Clear the buffer."""
        self._buffer = []
        self._buffer_size = 0

    def state_dict(self) -> dict:
        """Returns the state of the buffer."""
        return {
            "buffer": self._buffer,
            "buffer_size": self._buffer_size,
        }

    def load_state_dict(self, state_dict: dict) -> None:
        """Loads the state into the buffer."""
        self._buffer = state_dict["buffer"]
        self._buffer_size = state_dict["buffer_size"]
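
The buffer tracks a token count alongside the samples and can be checkpointed with the rest of the data pipeline; a quick usage sketch (the sample dicts are illustrative):

```python
buf = StatefulBuffer(max_buffer_size=100)
buf.put([{"input_ids": [1, 2, 3]}, {"input_ids": [4, 5]}])
assert len(buf) == 2 and buf.size == 5  # two samples, five tokens

oldest = buf.get(1)          # FIFO: removes the first sample, token count drops to 2
assert buf.size == 2

state = buf.state_dict()     # checkpointable alongside the data loader state
buf.load_state_dict(state)   # restores samples and token count exactly
```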

View File

@@ -15,6 +15,7 @@
 from collections import defaultdict
 from collections.abc import Callable
+from typing import Any

 from . import logging

@@ -26,23 +27,14 @@ class BasePlugin:
     """Base class for plugins.

     A plugin is a callable object that can be registered and called by name.
-    """
-
-    _registry: dict[str, dict[str, Callable]] = defaultdict(dict)
-
-    def __init__(self, name: str | None = None):
-        """Initialize the plugin with a name.
-
-        Args:
-            name (str): The name of the plugin.
-        """
-        self.name = name
-
-    def register(self, method_name: str = "__call__"):
-        """Decorator to register a function as a plugin.

     Example usage:
     ```python
+    class PrintPlugin(BasePlugin):
+        def again(self):  # optional
+            self["again"]()
+
     @PrintPlugin("hello").register()
     def print_hello():
         print("Hello world!")
@@ -51,8 +43,21 @@ class BasePlugin:
     @PrintPlugin("hello").register("again")
     def print_hello_again():
         print("Hello world! Again.")
+
+    PrintPlugin("hello")()
+    PrintPlugin("hello").again()
     ```
     """

+    _registry: dict[str, dict[str, Callable]] = defaultdict(dict)
+
+    def __init__(self, name: str | None = None) -> None:
+        """Initialize the plugin with a name."""
+        self.name = name
+
+    def register(self, method_name: str = "__call__") -> Callable:
+        """Decorator to register a function as a plugin."""
         if self.name is None:
             raise ValueError("Plugin name should be specified.")

@@ -65,27 +70,16 @@ class BasePlugin:
         return decorator

-    def __call__(self, *args, **kwargs):
-        """Call the registered function with the given arguments.
-
-        Example usage:
-        ```python
-        PrintPlugin("hello")()
-        ```
-        """
-        if "__call__" not in self._registry[self.name]:
-            raise ValueError(f"Method __call__ of plugin {self.name} is not registered.")
-
-        return self._registry[self.name]["__call__"](*args, **kwargs)
-
-    def __getattr__(self, method_name: str):
-        """Get the registered function with the given name.
-
-        Example usage:
-        ```python
-        PrintPlugin("hello").again()
-        ```
-        """
+    def __call__(self, *args, **kwargs) -> Any:
+        """Call the registered function with the given arguments."""
+        return self["__call__"](*args, **kwargs)
+
+    def __getattr__(self, method_name: str) -> Callable:
+        """Get the registered function with the given name."""
+        return self[method_name]
+
+    def __getitem__(self, method_name: str) -> Callable:
+        """Get the registered function with the given name."""
         if method_name not in self._registry[self.name]:
             raise ValueError(f"Method {method_name} of plugin {self.name} is not registered.")

@@ -98,7 +92,8 @@ if __name__ == "__main__":
     """

     class PrintPlugin(BasePlugin):
-        pass
+        def again(self):  # optional
+            self["again"]()

     @PrintPlugin("hello").register()
     def print_hello():
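
Because __call__ and __getattr__ now both route through __getitem__, a subclass can wrap a registered hook with extra logic in one place; a small sketch (TimedPlugin and its hook are illustrative, assuming the module path implied by the imports above):

```python
import time

from llamafactory.v1.utils.plugin import BasePlugin  # assumed path


class TimedPlugin(BasePlugin):
    def run(self, *args, **kwargs):
        start = time.perf_counter()
        result = self["run"](*args, **kwargs)  # one lookup, one error path
        print(f"run took {time.perf_counter() - start:.4f}s")
        return result


@TimedPlugin("demo").register("run")
def slow_run():
    return sum(range(100_000))


TimedPlugin("demo").run()  # prints the timing, returns 4999950000
```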

View File

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from collections.abc import Iterator
 from typing import TYPE_CHECKING, Any, Literal, NotRequired, TypedDict, Union
@@ -161,3 +162,14 @@ class BatchInput(TypedDict, total=False):
"""Position ids for the model (optional).""" """Position ids for the model (optional)."""
token_type_ids: NotRequired[Tensor] token_type_ids: NotRequired[Tensor]
"""Token type ids used in DPO, 0 represents the chosen messages, 1 represents the rejected messages.""" """Token type ids used in DPO, 0 represents the chosen messages, 1 represents the rejected messages."""
class BatchInfo(TypedDict):
micro_batch_size: int
"""Micro batch size."""
num_micro_batch: int
"""Number of micro batches."""
cutoff_len: int
"""Cutoff length."""
data_iter: Iterator[list[ModelInput]]
"""Data iterator."""

View File

@@ -58,3 +58,10 @@ def test_multi_device():
     master_port = find_available_port()
     world_size = 2
     mp.spawn(_all_reduce_tests, args=(world_size, master_port), nprocs=world_size)
+
+
+if __name__ == "__main__":
+    """
+    python tests_v1/accelerator/test_interface.py
+    """
+    test_all_device()

View File

@@ -12,41 +12,41 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import pathlib
 import sys
+from pathlib import Path
 from unittest.mock import patch

 from llamafactory.v1.config.arg_parser import get_args


-def test_get_args_from_yaml(tmp_path: pathlib.Path):
+def test_get_args_from_yaml(tmp_path: Path):
     config_yaml = """
 ### model
-model: "llamafactory/tiny-random-qwen2.5"
+model: llamafactory/tiny-random-qwen3
 trust_remote_code: true
-model_class: "llm"
+model_class: llm
 kernel_config:
-  name: "auto"
-  include_kernels: "auto" # choice: null/true/false/auto/kernel_id1,kernel_id2,kernel_id3, default is null
+  name: auto
+  include_kernels: auto # choice: null/true/false/auto/kernel_id1,kernel_id2,kernel_id3, default is null
 peft_config:
-  name: "lora"
+  name: lora
   lora_rank: 0.8
 quant_config: null

 ### data
-dataset: "llamafactory/tiny-supervised-dataset"
+dataset: llamafactory/v1-sft-demo
+cutoff_len: 2048

 ### training
-output_dir: "outputs/test_run"
+output_dir: outputs/test_run
 micro_batch_size: 1
 global_batch_size: 1
-cutoff_len: 2048
 learning_rate: 1.0e-4
 bf16: false
 dist_config: null

 ### sample
-sample_backend: "hf"
+sample_backend: hf
 max_new_tokens: 128
 """
@@ -57,14 +57,26 @@ def test_get_args_from_yaml(tmp_path: pathlib.Path):
     with patch.object(sys, "argv", test_argv):
         data_args, model_args, training_args, sample_args = get_args()

+    assert data_args.dataset == "llamafactory/v1-sft-demo"
+    assert model_args.model == "llamafactory/tiny-random-qwen3"
+    assert model_args.kernel_config.name == "auto"
+    assert model_args.kernel_config.get("include_kernels") == "auto"
+    assert model_args.peft_config.name == "lora"
+    assert model_args.peft_config.get("lora_rank") == 0.8
     assert training_args.output_dir == "outputs/test_run"
     assert training_args.micro_batch_size == 1
     assert training_args.global_batch_size == 1
     assert training_args.learning_rate == 1.0e-4
     assert training_args.bf16 is False
     assert training_args.dist_config is None
-    assert model_args.model == "llamafactory/tiny-random-qwen2.5"
-    assert model_args.kernel_config.name == "auto"
-    assert model_args.kernel_config.get("include_kernels") == "auto"
-    assert model_args.peft_config.name == "lora"
-    assert model_args.peft_config.get("lora_rank") == 0.8
+    assert sample_args.sample_backend == "hf"
+
+
+if __name__ == "__main__":
+    """
+    python -m tests_v1.config.test_args_parser
+    """
+    import tempfile
+
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        test_get_args_from_yaml(tmp_path=Path(tmp_dir))

View File

@@ -33,4 +33,7 @@ def test_map_dataset(num_samples: int):
 if __name__ == "__main__":
+    """
+    python -m tests_v1.core.test_data_engine
+    """
     test_map_dataset(1)

View File

@@ -44,5 +44,8 @@ def test_tiny_qwen_with_kernel_plugin():
 if __name__ == "__main__":
+    """
+    python -m tests_v1.core.test_model_loader
+    """
     test_tiny_qwen()
     test_tiny_qwen_with_kernel_plugin()

View File

@@ -46,4 +46,7 @@ def test_normal_batching():
 if __name__ == "__main__":
+    """
+    python -m tests_v1.core.utils.test_batching
+    """
     test_normal_batching()

View File

@@ -219,6 +219,9 @@ def test_process_dpo_samples():
 if __name__ == "__main__":
+    """
+    python -m tests_v1.core.utils.test_rendering
+    """
     test_chatml_rendering()
     test_chatml_parse()
     test_chatml_rendering_remote(16)

View File

@@ -120,6 +120,9 @@ def test_pair_converter(num_samples: int):
 if __name__ == "__main__":
+    """
+    python -m tests_v1.plugins.data_plugins.test_converter
+    """
     test_alpaca_converter(1)
     test_sharegpt_converter()
     test_pair_converter(1)

View File

@@ -52,3 +52,12 @@ def test_init_on_default():
     )
     model_engine = ModelEngine(model_args=model_args)
     assert model_engine.model.device == DistributedInterface().current_device
+
+
+if __name__ == "__main__":
+    """
+    python tests_v1/plugins/model_plugins/test_init_plugin.py
+    """
+    test_init_on_meta()
+    test_init_on_rank0()
+    test_init_on_default()

View File

@@ -38,4 +38,7 @@ def test_sync_sampler():
 if __name__ == "__main__":
+    """
+    python tests_v1/sampler/test_cli_sampler.py
+    """
     test_sync_sampler()