[data-loader] Allow dataset_dir to accept a dict for in-memory dataset_info (#8845)

This commit is contained in:
kahlun 2025-08-07 16:26:59 +08:00 committed by GitHub
parent bc54ed8efb
commit 2572111616
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 7 additions and 5 deletions

View File

@@ -15,7 +15,7 @@
import json
import os
from dataclasses import dataclass
-from typing import Any, Literal, Optional
+from typing import Any, Literal, Optional, Union
from huggingface_hub import hf_hub_download
@@ -90,12 +90,14 @@ class DatasetAttr:
self.set_attr(tag, attr["tags"])
-def get_dataset_list(dataset_names: Optional[list[str]], dataset_dir: str) -> list["DatasetAttr"]:
+def get_dataset_list(dataset_names: Optional[list[str]], dataset_dir: Union[str, dict]) -> list["DatasetAttr"]:
r"""Get the attributes of the datasets."""
if dataset_names is None:
dataset_names = []
-if dataset_dir == "ONLINE":
+if isinstance(dataset_dir, dict):
+dataset_info = dataset_dir
+elif dataset_dir == "ONLINE":
dataset_info = None
else:
if dataset_dir.startswith("REMOTE:"):

View File

@@ -16,7 +16,7 @@
# limitations under the License.
from dataclasses import asdict, dataclass, field
-from typing import Any, Literal, Optional
+from typing import Any, Literal, Optional, Union
@dataclass
@@ -35,7 +35,7 @@ class DataArguments:
default=None,
metadata={"help": "The name of dataset(s) to use for evaluation. Use commas to separate multiple datasets."},
)
-dataset_dir: str = field(
+dataset_dir: Union[str, dict] = field(
default="data",
metadata={"help": "Path to the folder containing the datasets."},
)