Mirror of https://github.com/hiyouga/LLaMA-Factory.git (synced 2026-03-01 17:26:03 +08:00).

[data] Support for specifying a dataset in cloud storage (#7567)

* Add support for loading datasets from S3/GCS
* Add comments to README
* Run linter and address comments
* Add option to pass in kwargs to `ray.init` (i.e. runtime env)
* Address comment
* Revert mixed-up changes
This commit is contained in:
@@ -16,13 +16,13 @@ import os

 from typing import TYPE_CHECKING, Literal, Optional, Union

 import numpy as np
-from datasets import load_dataset, load_from_disk
+from datasets import Dataset, load_dataset, load_from_disk

 from ..extras import logging
 from ..extras.constants import FILEEXT2TYPE
 from ..extras.misc import check_version, has_tokenized_data
 from .converter import align_dataset
-from .data_utils import get_dataset_module, merge_dataset, split_dataset
+from .data_utils import get_dataset_module, merge_dataset, read_cloud_json, split_dataset
 from .parser import get_dataset_list
 from .processor import (
     FeedbackDatasetProcessor,
@@ -67,6 +67,9 @@ def _load_single_dataset(
         data_name = dataset_attr.subset
         data_dir = dataset_attr.folder

+    elif dataset_attr.load_from == "cloud_file":
+        data_path = dataset_attr.dataset_name
+
     elif dataset_attr.load_from == "file":
         data_files = []
         local_path = os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)
@@ -122,6 +125,8 @@ def _load_single_dataset(
             token=model_args.om_hub_token,
             streaming=data_args.streaming,
         )
+    elif dataset_attr.load_from == "cloud_file":
+        dataset = Dataset.from_list(read_cloud_json(data_path), split=dataset_attr.split)
     else:
         dataset = load_dataset(
             path=data_path,
||||
Reference in New Issue
Block a user