[data] support for specifying a dataset in cloud storage (#7567)

* add support for loading datasets from s3/gcs

* add comments to readme

* run linter and address comments

* add option to pass in kwargs to ray init (e.g. runtime env; see the sketch below)

* address comment

* revert mixed up changes
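One bullet above mentions forwarding kwargs to ray.init; the plumbing for that lives in files not shown in this excerpt. A minimal illustration of what the option enables, with an invented kwargs dict (runtime_env itself is a real ray.init() parameter):

```python
import ray

# Invented example kwargs; runtime_env lets Ray workers install the
# cloud-storage dependencies they need (e.g., s3fs/gcsfs) at startup.
ray_init_kwargs = {"runtime_env": {"pip": ["s3fs", "gcsfs"]}}
ray.init(**ray_init_kwargs)
```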
Author: Eric Tang
Date: 2025-04-09 20:31:35 -07:00
Committed by: GitHub
Parent: bb8d79bae2
Commit: a8caf09c7f
5 changed files with 63 additions and 6 deletions
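The diff below imports a new helper, read_cloud_json, from data_utils.py; its implementation is one of the five changed files but is not part of this excerpt. A minimal sketch of how such a helper can be written on top of fsspec (with the s3fs/gcsfs plugins installed); this is an assumption for illustration, not the PR's actual code:

```python
import json
from typing import Any

import fsspec  # assumed dependency; resolves s3:// and gs:// via s3fs/gcsfs


def read_cloud_json(cloud_path: str) -> list[Any]:
    """Read a JSON or JSONL file from cloud storage into a list of records."""
    with fsspec.open(cloud_path, "r") as f:
        text = f.read()

    if cloud_path.endswith((".jsonl", ".ndjson")):
        # one JSON object per line
        return [json.loads(line) for line in text.splitlines() if line.strip()]

    data = json.loads(text)
    return data if isinstance(data, list) else [data]
```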


@@ -16,13 +16,13 @@ import os
 from typing import TYPE_CHECKING, Literal, Optional, Union
 
 import numpy as np
-from datasets import load_dataset, load_from_disk
+from datasets import Dataset, load_dataset, load_from_disk
 
 from ..extras import logging
 from ..extras.constants import FILEEXT2TYPE
 from ..extras.misc import check_version, has_tokenized_data
 from .converter import align_dataset
-from .data_utils import get_dataset_module, merge_dataset, split_dataset
+from .data_utils import get_dataset_module, merge_dataset, read_cloud_json, split_dataset
 from .parser import get_dataset_list
 from .processor import (
     FeedbackDatasetProcessor,
@@ -67,6 +67,9 @@ def _load_single_dataset(
         data_name = dataset_attr.subset
         data_dir = dataset_attr.folder
 
+    elif dataset_attr.load_from == "cloud_file":
+        data_path = dataset_attr.dataset_name
+
     elif dataset_attr.load_from == "file":
         data_files = []
         local_path = os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)
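This hunk only records the cloud URI as data_path; nothing is fetched yet. The logic that classifies a dataset_name as "cloud_file" lives in parser.py, which this excerpt does not show. Presumably it keys off the URI scheme, roughly like this hypothetical sketch:

```python
import os

# Hypothetical scheme-based dispatch; the real mapping is in parser.py,
# which is not part of this excerpt.
CLOUD_SCHEMES = ("s3://", "gs://")


def infer_load_from(dataset_name: str, dataset_dir: str) -> str:
    if dataset_name.startswith(CLOUD_SCHEMES):
        return "cloud_file"  # handled by the new branch above
    if os.path.exists(os.path.join(dataset_dir, dataset_name)):
        return "file"  # local file or directory
    return "hf_hub"  # fall back to the Hugging Face Hub
```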
@@ -122,6 +125,8 @@ def _load_single_dataset(
             token=model_args.om_hub_token,
             streaming=data_args.streaming,
         )
+    elif dataset_attr.load_from == "cloud_file":
+        dataset = Dataset.from_list(read_cloud_json(data_path), split=dataset_attr.split)
     else:
         dataset = load_dataset(
             path=data_path,
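Taken together, the two hunks make a cloud dataset behave like a pre-materialized list of records: read_cloud_json fetches and parses the file, and Dataset.from_list wraps it. A hypothetical end-to-end use of the branch (bucket and key invented for illustration):

```python
from datasets import Dataset

# Invented s3 path; assumes a read_cloud_json helper like the sketch above.
records = read_cloud_json("s3://my-bucket/data/train.jsonl")  # -> list[dict]
dataset = Dataset.from_list(records, split="train")
print(len(dataset), dataset[0])
```

Note that, unlike the hub and local-file branches, this path loads the entire file into memory via Dataset.from_list, so data_args.streaming does not apply to cloud files.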