[data-loader] Allow dataset_dir to accept a dict for in-memory dataset_info (#8845)

This commit is contained in:
kahlun 2025-08-07 16:26:59 +08:00 committed by GitHub
parent bc54ed8efb
commit 2572111616
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 7 additions and 5 deletions

View File

@@ -15,7 +15,7 @@
import json import json
import os import os
from dataclasses import dataclass from dataclasses import dataclass
from typing import Any, Literal, Optional from typing import Any, Literal, Optional, Union
from huggingface_hub import hf_hub_download from huggingface_hub import hf_hub_download
@@ -90,12 +90,14 @@ class DatasetAttr:
self.set_attr(tag, attr["tags"]) self.set_attr(tag, attr["tags"])
def get_dataset_list(dataset_names: Optional[list[str]], dataset_dir: str) -> list["DatasetAttr"]: def get_dataset_list(dataset_names: Optional[list[str]], dataset_dir: Union[str, dict]) -> list["DatasetAttr"]:
r"""Get the attributes of the datasets.""" r"""Get the attributes of the datasets."""
if dataset_names is None: if dataset_names is None:
dataset_names = [] dataset_names = []
if dataset_dir == "ONLINE": if isinstance(dataset_dir, dict):
dataset_info = dataset_dir
elif dataset_dir == "ONLINE":
dataset_info = None dataset_info = None
else: else:
if dataset_dir.startswith("REMOTE:"): if dataset_dir.startswith("REMOTE:"):

View File

@@ -16,7 +16,7 @@
# limitations under the License. # limitations under the License.
from dataclasses import asdict, dataclass, field from dataclasses import asdict, dataclass, field
from typing import Any, Literal, Optional from typing import Any, Literal, Optional, Union
@dataclass @dataclass
@@ -35,7 +35,7 @@ class DataArguments:
default=None, default=None,
metadata={"help": "The name of dataset(s) to use for evaluation. Use commas to separate multiple datasets."}, metadata={"help": "The name of dataset(s) to use for evaluation. Use commas to separate multiple datasets."},
) )
dataset_dir: str = field( dataset_dir: Union[str, dict] = field(
default="data", default="data",
metadata={"help": "Path to the folder containing the datasets."}, metadata={"help": "Path to the folder containing the datasets."},
) )