[v1] add cli sampler (#9721)

2026-01-13 17:40:34 +08:00 · 2026-01-06 23:31:27 +08:00
parent e944dc442c
commit ea0b4e2466
45 changed files with 1091 additions and 505 deletions
--- a/src/llamafactory/v1/plugins/data_plugins/loader.py
+++ b/src/llamafactory/v1/plugins/data_plugins/loader.py
@@ -49,7 +49,7 @@ def _get_builder_name(path: str) -> Literal["arrow", "csv", "json", "parquet", "
        raise ValueError(f"Unknown dataset filetype: {filetype}.")


-@DataLoaderPlugin("local").register
+@DataLoaderPlugin("local").register()
 def load_data_from_file(filepath: str, split: str, streaming: bool) -> HFDataset:
    if os.path.isdir(filepath):
        filetype = _get_builder_name(os.listdir(filepath)[0])
@@ -66,49 +66,43 @@ def load_data_from_file(filepath: str, split: str, streaming: bool) -> HFDataset
    return dataset


-class DataIndexPlugin(BasePlugin):
-    """Plugin for adjusting dataset index."""
+def adjust_data_index(
+    data_index: list[tuple[str, int]], size: int | None, weight: float | None
+) -> list[tuple[str, int]]:
+    """Adjust dataset index by size and weight.

-    def adjust_data_index(
-        self, data_index: list[tuple[str, int]], size: int | None, weight: float | None
-    ) -> list[tuple[str, int]]:
-        """Adjust dataset index by size and weight.
+    Args:
+        data_index (list[tuple[str, int]]): List of (dataset_name, sample_index).
+        size (Optional[int]): Desired dataset size.
+        weight (Optional[float]): Desired dataset weight.

-        Args:
-            data_index (list[tuple[str, int]]): List of (dataset_name, sample_index).
-            size (Optional[int]): Desired dataset size.
-            weight (Optional[float]): Desired dataset weight.
+    Returns:
+        list[tuple[str, int]]: Adjusted dataset index.
+    """
+    if size is not None:
+        data_index = random.choices(data_index, k=size)

-        Returns:
-            list[tuple[str, int]]: Adjusted dataset index.
-        """
-        if size is not None:
-            data_index = random.choices(data_index, k=size)
+    if weight is not None:
+        data_index = random.choices(data_index, k=int(len(data_index) * weight))

-        if weight is not None:
-            data_index = random.choices(data_index, k=int(len(data_index) * weight))
-
-        return data_index
+    return data_index


-class DataSelectorPlugin(BasePlugin):
-    """Plugin for selecting dataset samples."""
+def select_data_sample(
+    data_index: list[tuple[str, int]], index: slice | list[int] | Any
+) -> tuple[str, int] | list[tuple[str, int]]:
+    """Select dataset samples.

-    def select(
-        self, data_index: list[tuple[str, int]], index: slice | list[int] | Any
-    ) -> tuple[str, int] | list[tuple[str, int]]:
-        """Select dataset samples.
+    Args:
+        data_index (list[tuple[str, int]]): List of (dataset_name, sample_index).
+        index (Union[slice, list[int], Any]): Index of dataset samples.

-        Args:
-            data_index (list[tuple[str, int]]): List of (dataset_name, sample_index).
-            index (Union[slice, list[int], Any]): Index of dataset samples.
-
-        Returns:
-            Union[tuple[str, int], list[tuple[str, int]]]: Selected dataset samples.
-        """
-        if isinstance(index, slice):
-            return [data_index[i] for i in range(*index.indices(len(data_index)))]
-        elif isinstance(index, list):
-            return [data_index[i] for i in index]
-        else:
-            raise ValueError(f"Invalid index type {type(index)}.")
+    Returns:
+        Union[tuple[str, int], list[tuple[str, int]]]: Selected dataset samples.
+    """
+    if isinstance(index, slice):
+        return [data_index[i] for i in range(*index.indices(len(data_index)))]
+    elif isinstance(index, list):
+        return [data_index[i] for i in index]
+    else:
+        raise ValueError(f"Invalid index type {type(index)}.")