[v1] add batch generator (#9744)

2026-03-02 17:55:59 +08:00 · 2026-01-10 04:24:09 +08:00
parent d7d734d54c
commit b2effbd77c
26 changed files with 604 additions and 850 deletions
--- a/tests_v1/accelerator/test_interface.py
+++ b/tests_v1/accelerator/test_interface.py
@@ -56,4 +56,5 @@ def test_all_device():
@pytest.mark.require_distributed(2)
 def test_multi_device():
    master_port = find_available_port()
-    mp.spawn(_all_reduce_tests, args=(2, master_port), nprocs=2)
+    world_size = 2  # same value as the require_distributed(2) marker above
+    mp.spawn(_all_reduce_tests, args=(world_size, master_port), nprocs=world_size)
--- a/tests_v1/core/test_model_loader.py
+++ b/tests_v1/core/test_model_loader.py
@@ -14,28 +14,24 @@

 import torch

-from llamafactory.v1.config.model_args import ModelArguments, PluginConfig
+from llamafactory.v1.config.model_args import ModelArguments
 from llamafactory.v1.core.model_engine import ModelEngine


 def test_tiny_qwen():
-    from transformers import Qwen2Config, Qwen2ForCausalLM, Qwen2TokenizerFast
-
-    model_args = ModelArguments(model="llamafactory/tiny-random-qwen2.5")
+    model_args = ModelArguments(model="llamafactory/tiny-random-qwen3")
    model_engine = ModelEngine(model_args)
-    assert isinstance(model_engine.processor, Qwen2TokenizerFast)
-    assert isinstance(model_engine.model_config, Qwen2Config)
-    assert isinstance(model_engine.model, Qwen2ForCausalLM)
+    assert "Qwen2Tokenizer" in model_engine.processor.__class__.__name__  # also matches Qwen2TokenizerFast
+    assert "Qwen3Config" in model_engine.model_config.__class__.__name__  # compared by class name, not isinstance
+    assert "Qwen3ForCausalLM" in model_engine.model.__class__.__name__
    assert model_engine.model.dtype == torch.bfloat16


 def test_tiny_qwen_with_kernel_plugin():
-    from transformers import Qwen2ForCausalLM
-
    from llamafactory.v1.plugins.model_plugins.kernels.ops.rms_norm.npu_rms_norm import npu_rms_norm_forward

    model_args = ModelArguments(
-        model="llamafactory/tiny-random-qwen2.5", kernel_config=PluginConfig(name="auto", include_kernels="auto")
+        model="llamafactory/tiny-random-qwen3", kernel_config={"name": "auto", "include_kernels": "auto"}
    )
    model_engine = ModelEngine(model_args)
    # test enable apply kernel plugin
@@ -44,7 +40,7 @@ def test_tiny_qwen_with_kernel_plugin():
    else:
        assert model_engine.model.model.layers[0].input_layernorm.forward.__code__ != npu_rms_norm_forward.__code__

-    assert isinstance(model_engine.model, Qwen2ForCausalLM)
+    assert "Qwen3ForCausalLM" in model_engine.model.__class__.__name__


 if __name__ == "__main__":
--- a/tests_v1/core/utils/test_batching.py
+++ b/tests_v1/core/utils/test_batching.py
@@ -0,0 +1,49 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from llamafactory.v1.config import DataArguments, ModelArguments, TrainingArguments
+from llamafactory.v1.core.data_engine import DataEngine
+from llamafactory.v1.core.model_engine import ModelEngine
+from llamafactory.v1.core.utils.batching import BatchGenerator
+
+
+def test_normal_batching():  # smoke test: "normal" strategy batch count and first-batch shape
+    data_args = DataArguments(dataset="llamafactory/v1-sft-demo")
+    data_engine = DataEngine(data_args=data_args)
+    model_args = ModelArguments(model="llamafactory/tiny-random-qwen3")
+    model_engine = ModelEngine(model_args=model_args)
+    training_args = TrainingArguments(
+        micro_batch_size=4,  # checked below as dim 0 of input_ids
+        global_batch_size=8,  # divisor for the expected number of batches below
+        cutoff_len=10,  # checked below as dim 1 of input_ids
+        batching_workers=0,  # presumably no background workers — TODO confirm
+        batching_strategy="normal",
+    )
+    batch_generator = BatchGenerator(
+        data_engine,
+        model_engine.renderer,  # only the renderer of the model engine is passed in
+        micro_batch_size=training_args.micro_batch_size,
+        global_batch_size=training_args.global_batch_size,
+        cutoff_len=training_args.cutoff_len,
+        batching_workers=training_args.batching_workers,
+        batching_strategy=training_args.batching_strategy,
+    )
+    assert len(batch_generator) == len(data_engine) // training_args.global_batch_size  # i.e. no partial last batch
+    batch = next(iter(batch_generator))  # only the first batch is inspected
+    assert len(batch) == 2  # presumably 8 // 4 micro-batches — TODO confirm
+    assert batch[0]["input_ids"].shape == (4, 10)  # (micro_batch_size, cutoff_len)
+
+
+if __name__ == "__main__":  # allows running this test file directly as a script
+    test_normal_batching()
--- a/tests_v1/core/utils/test_data_loader.py
+++ b/tests_v1/core/utils/test_data_loader.py
@@ -1,171 +0,0 @@
-# Copyright 2025 the LlamaFactory team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Integration tests for DataLoader with different combinations of packing and dynamic batching.
-
-Tests the 4 scenarios:
-a) non pack + non dynamic.
-b) non pack + dynamic.
-c) pack + non dynamic.
-d) pack + dynamic.
-"""
-
-# import torch
-# from torch.utils.data import DataLoader as TorchDataLoader
-# from torch.utils.data import Dataset
-# from transformers import AutoTokenizer
-
-# from llamafactory.v1.config.data_args import DataArguments
-# from llamafactory.v1.core.data_engine import DataEngine
-# from llamafactory.v1.core.utils.data_collator import DefaultCollator
-# from llamafactory.v1.core.utils.data_loader import DataLoader
-# from llamafactory.v1.plugins.data_plugins.rendering import QwenTemplate
-# from llamafactory.v1.utils.batching_queue import TextBatchingQueue
-
-
-# class TensorDataset(Dataset):
-#     """Wrapper dataset that converts DataEngine samples to tensor format."""
-
-#     def __init__(self, data_engine: DataEngine, processor, template, max_samples: int = None):
-#         self.data_engine = data_engine
-#         self.processor = processor
-#         self.template = template
-#         self.max_samples = max_samples or len(data_engine)
-#         self.tokenizer = processor.tokenizer if hasattr(processor, "tokenizer") else processor
-
-#     def __len__(self):
-#         return min(self.max_samples, len(self.data_engine))
-
-#     def __getitem__(self, idx):
-#         # Get sample from DataEngine
-#         sample = self.data_engine[idx]
-
-#         # Extract messages from sample
-#         # DataEngine returns samples with format like {"messages": [...], ...}
-#         # For llamafactory/v1-sft-demo, the format should have "messages" field
-#         messages = None
-#         if "messages" in sample:
-#             messages = sample["messages"]
-#         elif "conversations" in sample:
-#             messages = sample["conversations"]
-#         elif "conversation" in sample:
-#             messages = sample["conversation"]
-#         else:
-#             # Try to find message-like fields (skip _dataset_name)
-#             for key, value in sample.items():
-#                 if key.startswith("_"):
-#                     continue
-#                 if isinstance(value, list) and len(value) > 0:
-#                     # Check if it looks like a message list
-#                     if isinstance(value[0], dict) and "role" in value[0]:
-#                         messages = value
-#                         break
-
-#         if messages is None:
-#             raise ValueError(f"Could not find messages in sample: {list(sample.keys())}")
-
-#         # Encode messages using template
-#         encoded = self.template.encode_messages(self.tokenizer, messages)
-
-#         # Convert to tensors
-#         return {
-#             "input_ids": torch.tensor(encoded["input_ids"], dtype=torch.long),
-#             "attention_mask": torch.tensor(encoded["attention_mask"], dtype=torch.long),
-#             "labels": torch.tensor(encoded["labels"], dtype=torch.long),
-#         }
-
-
-# def create_real_dataset(max_samples: int = 20, batch_size: int = 4):
-#     """Create a real dataset using DataEngine."""
-#     data_args = DataArguments(dataset="llamafactory/v1-sft-demo")
-#     data_engine = DataEngine(data_args)
-
-#     # Create processor and template
-#     processor = AutoTokenizer.from_pretrained("llamafactory/tiny-random-qwen2.5")
-#     template = QwenTemplate()
-
-#     # Create tensor dataset
-#     raw_data_dataset = TensorDataset(data_engine, processor, template, max_samples=max_samples)
-
-#     # Create torch DataLoader
-#     torch_dataloader = TorchDataLoader(
-#         raw_data_dataset,
-#         batch_size=batch_size,
-#         shuffle=False,
-#         collate_fn=lambda x: x,
-#     )
-
-#     return torch_dataloader, processor, template
-
-
-# class TestDataLoaderNonPackNonDynamic:
-#     """Test case a) non pack + non dynamic."""
-
-#     def test_basic_functionality(self):
-#         """Test DataLoader without packing and without dynamic batching."""
-#         # Create real dataset
-#         torch_dataloader, processor, template = create_real_dataset(max_samples=80, batch_size=8)
-
-#         # Create collator (non-packing)
-#         collator = DefaultCollator(processor=processor, template=template)
-
-#         # Create DataLoader without batching_queue (non-dynamic)
-#         data_loader = DataLoader(
-#             dataloader=torch_dataloader,
-#             collate_fn=collator,
-#             num_micro_batch=1,
-#             batching_queue=None,
-#         )
-
-#         # Iterate and check results
-#         batches = list(iter(data_loader))
-#         assert len(batches) > 0
-
-#         # Check first batch
-#         one_batch = batches[0]
-#         micro_batches = one_batch[0]
-#         assert "input_ids" in micro_batches
-#         assert "attention_mask" in micro_batches
-#         assert "labels" in micro_batches
-#         assert micro_batches["input_ids"].shape[0] == 1  # batch_size=1
-#         assert micro_batches["input_ids"].ndim == 2  # [batch_size, seq_len]
-
-
-# class TestDataLoaderNonPackDynamic:
-#     """Test case b) non pack + dynamic."""
-
-#     def test_basic_functionality(self):
-#         """Test DataLoader without packing but with dynamic batching."""
-#         # Create real dataset
-#         torch_dataloader, processor, template = create_real_dataset(max_samples=80, batch_size=8)
-#         collator = DefaultCollator(processor=processor, template=template)
-
-#         # Create batching queue for dynamic batching
-#         batching_queue = TextBatchingQueue(
-#             token_micro_bsz=120,
-#             buffer_size=8,
-#         )
-
-#         data_loader = DataLoader(
-#             dataloader=torch_dataloader,
-#             collate_fn=collator,
-#             num_micro_batch=4,
-#             batching_queue=batching_queue,
-#         )
-
-#         # Iterate and check
-#         batches = list(iter(data_loader))
-#         micro_batch_tokens_first = [micro_batch["attention_mask"].sum() for micro_batch in batches[0]]
-#         assert all(num_tokens <= 120 for num_tokens in micro_batch_tokens_first)
-#         assert len(batches) > 0
--- a/tests_v1/core/utils/test_rendering.py
+++ b/tests_v1/core/utils/test_rendering.py
@@ -184,6 +184,40 @@ def test_qwen3_nothink_rendering_remote(num_samples: int):
        assert v1_inputs["input_ids"][: len(prefix)] == prefix


+def test_process_sft_samples():  # process_samples on a sample that carries a single "messages" list
+    tokenizer: Processor = AutoTokenizer.from_pretrained("llamafactory/tiny-random-qwen3")
+    renderer = Renderer(template="chatml", processor=tokenizer)
+    hf_inputs = tokenizer.apply_chat_template(HF_MESSAGES)  # HF output used as the reference for input_ids
+
+    samples = [{"messages": V1_MESSAGES, "extra_info": "test", "_dataset_name": "default"}]
+    model_inputs = renderer.process_samples(samples)
+    assert len(model_inputs) == 1  # one output entry for the one input sample
+    assert model_inputs[0]["input_ids"] == hf_inputs
+    assert model_inputs[0]["extra_info"] == "test"  # extra keys are expected to be carried over unchanged
+    assert model_inputs[0]["_dataset_name"] == "default"
+
+
+def test_process_dpo_samples():  # process_samples on a sample with chosen/rejected message lists
+    tokenizer: Processor = AutoTokenizer.from_pretrained("llamafactory/tiny-random-qwen3")
+    renderer = Renderer(template="chatml", processor=tokenizer)
+    hf_inputs = tokenizer.apply_chat_template(HF_MESSAGES)  # HF output used as the reference for each half
+
+    samples = [
+        {
+            "chosen_messages": V1_MESSAGES,  # same messages on both sides keep the expected ids simple
+            "rejected_messages": V1_MESSAGES,
+            "extra_info": "test",
+            "_dataset_name": "default",
+        }
+    ]
+    model_inputs = renderer.process_samples(samples)
+    assert len(model_inputs) == 1  # the pair is expected to yield a single entry
+    assert model_inputs[0]["input_ids"] == hf_inputs * 2  # both sides use V1_MESSAGES, so the ids repeat
+    assert model_inputs[0]["token_type_ids"] == [0] * len(hf_inputs) + [1] * len(hf_inputs)  # 0 / 1 marks each half
+    assert model_inputs[0]["extra_info"] == "test"  # extra keys are expected to be carried over unchanged
+    assert model_inputs[0]["_dataset_name"] == "default"
+
+
 if __name__ == "__main__":
    test_chatml_rendering()
    test_chatml_parse()
@@ -191,3 +225,5 @@ if __name__ == "__main__":
    test_qwen3_nothink_rendering()
    test_qwen3_nothink_parse()
    test_qwen3_nothink_rendering_remote(16)
+    test_process_sft_samples()
+    test_process_dpo_samples()
--- a/tests_v1/plugins/model_plugins/test_init_plugin.py
+++ b/tests_v1/plugins/model_plugins/test_init_plugin.py
@@ -21,7 +21,7 @@ from llamafactory.v1.core.model_engine import ModelEngine
 def test_init_on_meta():
    _, model_args, *_ = get_args(
        dict(
-            model="llamafactory/tiny-random-qwen2.5",
+            model="llamafactory/tiny-random-qwen3",
            init_config={"name": "init_on_meta"},
        )
    )
@@ -32,7 +32,7 @@ def test_init_on_meta():
 def test_init_on_rank0():
    _, model_args, *_ = get_args(
        dict(
-            model="llamafactory/tiny-random-qwen2.5",
+            model="llamafactory/tiny-random-qwen3",
            init_config={"name": "init_on_rank0"},
        )
    )
@@ -46,7 +46,7 @@ def test_init_on_rank0():
 def test_init_on_default():
    _, model_args, *_ = get_args(
        dict(
-            model="llamafactory/tiny-random-qwen2.5",
+            model="llamafactory/tiny-random-qwen3",
            init_config={"name": "init_on_default"},
        )
    )
--- a/tests_v1/plugins/model_plugins/test_kernel_plugin.py
+++ b/tests_v1/plugins/model_plugins/test_kernel_plugin.py
@@ -43,7 +43,7 @@ def test_apply_kernel(mock_get_accelerator: MagicMock):
    reload_kernels()
    from llamafactory.v1.plugins.model_plugins.kernels.interface import apply_default_kernels

-    model = AutoModelForCausalLM.from_pretrained("llamafactory/tiny-random-qwen2.5")
+    model = AutoModelForCausalLM.from_pretrained("llamafactory/tiny-random-qwen3")
    original_rmsnorm_forward = model.model.layers[0].input_layernorm.forward
    original_swiglu_forward = model.model.layers[0].mlp.forward
    model = apply_default_kernels(model=model, include_kernels="npu_fused_rmsnorm")
@@ -62,7 +62,7 @@ def test_apply_all_kernels(mock_get_accelerator: MagicMock):
    reload_kernels()
    from llamafactory.v1.plugins.model_plugins.kernels.interface import apply_default_kernels

-    model = AutoModelForCausalLM.from_pretrained("llamafactory/tiny-random-qwen2.5")
+    model = AutoModelForCausalLM.from_pretrained("llamafactory/tiny-random-qwen3")

    original_rmsnorm_forward = model.model.layers[0].input_layernorm.forward
    original_swiglu_forward = model.model.layers[0].mlp.forward
--- a/tests_v1/utils/test_batching_queue.py
+++ b/tests_v1/utils/test_batching_queue.py
@@ -1,112 +0,0 @@
-# Copyright 2025 the LlamaFactory team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-
-from llamafactory.v1.utils.batching_queue import DynamicBatchSizeBuffer, TextBatchingQueue
-
-
-def create_sample(length: int):
-    """Helper to create a mock sample with a specific token length."""
-    return {"input_ids": torch.ones(length), "attention_mask": torch.ones(length)}
-
-
-class TestDynamicBatchSizeBuffer:
-    def test_append_and_token_count(self):
-        buffer = DynamicBatchSizeBuffer()
-        buffer.append(create_sample(10))
-        buffer.append(create_sample(20))
-
-        assert len(buffer) == 2
-        assert buffer.total_token_count == 30
-
-    def test_get_samples_within_budget(self):
-        buffer = DynamicBatchSizeBuffer()
-        buffer.append(create_sample(10))
-        buffer.append(create_sample(10))
-        buffer.append(create_sample(50))  # This one is large
-
-        # Request 25 tokens. Should get the first two (20 tokens total)
-        samples = buffer.get_samples(max_tokens_per_iteration=25)
-        assert len(samples) == 2
-
-    def test_force_return_first_sample(self):
-        buffer = DynamicBatchSizeBuffer()
-        buffer.append(create_sample(100))
-
-        # Even though budget is 50, force=True (default) should return the 100-token sample
-        samples = buffer.get_samples(max_tokens_per_iteration=50, force=True)
-        assert len(samples) == 1
-        assert len(samples[0]["input_ids"]) == 100
-
-    def test_flush_removes_used_samples(self):
-        buffer = DynamicBatchSizeBuffer()
-        buffer.append(create_sample(10))
-        buffer.append(create_sample(20))
-
-        # Take the first sample
-        buffer.get_samples(max_tokens_per_iteration=15)
-        buffer.flush()
-
-        assert len(buffer) == 1
-        assert buffer.total_token_count == 20
-        # The remaining sample should now be at the start
-        remaining = buffer.get_samples(max_tokens_per_iteration=50)
-        assert len(remaining[0]["input_ids"]) == 20
-
-
-class TestTextBatchingQueue:
-    def test_is_full_filled(self):
-        queue = TextBatchingQueue(token_micro_bsz=100, buffer_size=2)
-
-        queue.put_item(create_sample(10))
-        assert not queue.is_full_filled()  # Only 1 sample, buffer_size=2
-
-        queue.put_item(create_sample(10))
-        assert not queue.is_full_filled()  # 2 samples, but only 20 tokens (min 100)
-
-        queue.put_item(create_sample(90))
-        assert queue.is_full_filled()  # Meets both conditions
-
-    def test_warmup_logic(self):
-        # token_micro_bsz=1000, starts at 200, reaches 1000 at step 10
-        queue = TextBatchingQueue(token_micro_bsz=1000, bsz_warmup_steps=10, bsz_warmup_init_mbtoken=200)
-
-        # Step 0: should be init value
-        assert queue.get_cur_token_micro_bsz() == 200
-
-        # Step 5: halfway through warmup (200 + (800 * 5/10)) = 600
-        queue._step = 5
-        assert queue.get_cur_token_micro_bsz() == 600
-
-        # Step 11: past warmup
-        queue._step = 11
-        assert queue.get_cur_token_micro_bsz() == 1000
-
-    def test_get_micro_batch_integration(self):
-        queue = TextBatchingQueue(token_micro_bsz=50, buffer_size=1)
-        queue.put_item(create_sample(20))
-        queue.put_item(create_sample(20))
-        queue.put_item(create_sample(20))
-
-        # At step 0 (warmup not triggered as bsz_warmup_steps is -1 default),
-        # it should take samples up to 50 tokens.
-        batch = queue.get_micro_batch(step=0)
-
-        assert len(batch) == 2
-        assert queue.empty() is False
-
-        batch_2 = queue.get_micro_batch(step=1)
-        assert len(batch_2) == 1
-        assert queue.empty() is True