Support loading datasets from the ModelScope hub (ms_hub)

This commit is contained in:
yuze.zyz
2023-12-08 18:00:57 +08:00
parent 00f5c9ee16
commit 9c2247d700
3 changed files with 54 additions and 21 deletions

View File

@@ -24,7 +24,7 @@ def get_dataset(
for dataset_attr in data_args.dataset_list:
logger.info("Loading dataset {}...".format(dataset_attr))
if dataset_attr.load_from == "hf_hub":
if dataset_attr.load_from in ("hf_hub", "ms_hub"):
data_path = dataset_attr.dataset_name
data_name = dataset_attr.subset
data_files = None
@@ -53,15 +53,22 @@ def get_dataset(
else:
raise NotImplementedError
dataset = load_dataset(
path=data_path,
name=data_name,
data_files=data_files,
split=data_args.split,
cache_dir=model_args.cache_dir,
token=model_args.hf_hub_token,
streaming=(data_args.streaming and (dataset_attr.load_from != "file"))
)
if int(os.environ.get('USE_MODELSCOPE_HUB', '0')) and dataset_attr.load_from == "ms_hub":
from modelscope import MsDataset
dataset = MsDataset.load(
dataset_name=data_path,
subset_name=data_name,
).to_hf_dataset()
else:
dataset = load_dataset(
path=data_path,
name=data_name,
data_files=data_files,
split=data_args.split,
cache_dir=model_args.cache_dir,
token=model_args.hf_hub_token,
streaming=(data_args.streaming and (dataset_attr.load_from != "file"))
)
if data_args.streaming and (dataset_attr.load_from == "file"):
dataset = dataset.to_iterable_dataset() # TODO: add num shards parameter