From 596f496f192bda5364d809b9ffdc29eed5828ef1 Mon Sep 17 00:00:00 2001
From: "yuze.zyz"
Date: Fri, 8 Dec 2023 18:00:57 +0800
Subject: [PATCH 1/9] support ms dataset

Former-commit-id: 98638b35dc24045ac17b9b01d08d3a02372acef3
---
 src/llmtuner/data/loader.py       | 27 +++++++++++++++++----------
 src/llmtuner/hparams/data_args.py | 15 +++++++++++++--
 2 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py
index 8e9053ca..41c12422 100644
--- a/src/llmtuner/data/loader.py
+++ b/src/llmtuner/data/loader.py
@@ -24,7 +24,7 @@ def get_dataset(
     for dataset_attr in data_args.dataset_list:
         logger.info("Loading dataset {}...".format(dataset_attr))
 
-        if dataset_attr.load_from == "hf_hub":
+        if dataset_attr.load_from in ("hf_hub", "ms_hub"):
             data_path = dataset_attr.dataset_name
             data_name = dataset_attr.subset
             data_files = None
@@ -53,15 +53,22 @@ def get_dataset(
         else:
             raise NotImplementedError
 
-        dataset = load_dataset(
-            path=data_path,
-            name=data_name,
-            data_files=data_files,
-            split=data_args.split,
-            cache_dir=model_args.cache_dir,
-            token=model_args.hf_hub_token,
-            streaming=(data_args.streaming and (dataset_attr.load_from != "file"))
-        )
+        if int(os.environ.get('USE_MODELSCOPE_HUB', '0')) and dataset_attr.load_from == "ms_hub":
+            from modelscope import MsDataset
+            dataset = MsDataset.load(
+                dataset_name=data_path,
+                subset_name=data_name,
+            ).to_hf_dataset()
+        else:
+            dataset = load_dataset(
+                path=data_path,
+                name=data_name,
+                data_files=data_files,
+                split=data_args.split,
+                cache_dir=model_args.cache_dir,
+                token=model_args.hf_hub_token,
+                streaming=(data_args.streaming and (dataset_attr.load_from != "file"))
+            )
 
         if data_args.streaming and (dataset_attr.load_from == "file"):
             dataset = dataset.to_iterable_dataset() # TODO: add num shards parameter
diff --git a/src/llmtuner/hparams/data_args.py b/src/llmtuner/hparams/data_args.py
index cea89198..6f8bb738 100644
--- a/src/llmtuner/hparams/data_args.py
+++ b/src/llmtuner/hparams/data_args.py
@@ -2,7 +2,9 @@ import os
 import json
 from typing import List, Literal, Optional
 from dataclasses import dataclass, field
+from llmtuner.extras.logging import get_logger
 
+logger = get_logger(__name__)
 
 DATA_CONFIG = "dataset_info.json"
 
@@ -152,8 +154,17 @@ class DataArguments:
             if name not in dataset_info:
                 raise ValueError("Undefined dataset {} in {}.".format(name, DATA_CONFIG))
 
-            if "hf_hub_url" in dataset_info[name]:
-                dataset_attr = DatasetAttr("hf_hub", dataset_name=dataset_info[name]["hf_hub_url"])
+            if "hf_hub_url" in dataset_info[name] or 'ms_hub_url' in dataset_info[name]:
+                url_key_name = "hf_hub_url"
+                if int(os.environ.get('USE_MODELSCOPE_HUB', '0')):
+                    if 'ms_hub_url' in dataset_info[name]:
+                        url_key_name = 'ms_hub_url'
+                    else:
+                        logger.warning('You are using ModelScope Hub, but the specified dataset '
+                                       'has no `ms_hub_url` key, so `hf_hub_url` will be used instead.')
+
+                dataset_attr = DatasetAttr(url_key_name[:url_key_name.index('_url')],
+                                           dataset_name=dataset_info[name][url_key_name])
             elif "script_url" in dataset_info[name]:
                 dataset_attr = DatasetAttr("script", dataset_name=dataset_info[name]["script_url"])
             else:
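
The data_args.py hunk above introduces the hub-routing rule: when USE_MODELSCOPE_HUB=1 and a dataset's dataset_info.json entry carries an `ms_hub_url` key, the dataset resolves to the "ms_hub" loader; otherwise it keeps resolving through `hf_hub_url`. A minimal standalone sketch of that selection logic follows; the dataset entry and repository ids are hypothetical, and only the key names, the env-var check, and the prefix slicing come from the patch.

    import os

    # Hypothetical dataset_info.json entry -- only the key names come from the patch.
    dataset_info = {
        "example_sft": {
            "hf_hub_url": "someone/example-sft-data",        # illustrative repo id
            "ms_hub_url": "AI-ModelScope/example-sft-data",  # illustrative repo id
        }
    }

    name = "example_sft"
    url_key_name = "hf_hub_url"
    if int(os.environ.get('USE_MODELSCOPE_HUB', '0')) and 'ms_hub_url' in dataset_info[name]:
        url_key_name = 'ms_hub_url'

    # "ms_hub_url" -> "ms_hub", "hf_hub_url" -> "hf_hub"
    load_from = url_key_name[:url_key_name.index('_url')]
    dataset_name = dataset_info[name][url_key_name]
    print(load_from, dataset_name)

Run with USE_MODELSCOPE_HUB=1 this prints `ms_hub AI-ModelScope/example-sft-data`; unset, the `hf_hub_url` entry wins instead.
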
From b634e91c439365ab262ebc7fa936d1bcb6306357 Mon Sep 17 00:00:00 2001
From: "xingjun.wang"
Date: Tue, 12 Dec 2023 11:47:59 +0800
Subject: [PATCH 2/9] for test

Former-commit-id: 95ea942bd32402018e7c5dc61d50153c602ab67a
---
 src/llmtuner/data/loader.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py
index 41c12422..70beea05 100644
--- a/src/llmtuner/data/loader.py
+++ b/src/llmtuner/data/loader.py
@@ -59,6 +59,13 @@ def get_dataset(
                 dataset_name=data_path,
                 subset_name=data_name,
             ).to_hf_dataset()
+
+            def map_func(example):
+                # do something to example
+                example['input'] = example['input'] or ''
+                return example
+
+            dataset = dataset.ds_instance.map(map_func)
         else:
             dataset = load_dataset(
                 path=data_path,

From 79a376d1db4eb02a9c584daee75bcdfc2d7ed333 Mon Sep 17 00:00:00 2001
From: "xingjun.wang"
Date: Tue, 12 Dec 2023 11:52:59 +0800
Subject: [PATCH 3/9] for test

Former-commit-id: 33d9082320098f994bfa0c6353459afcb93165b7
---
 src/llmtuner/data/loader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py
index 70beea05..602d9a8c 100644
--- a/src/llmtuner/data/loader.py
+++ b/src/llmtuner/data/loader.py
@@ -65,7 +65,7 @@ def get_dataset(
                 example['input'] = example['input'] or ''
                 return example
 
-            dataset = dataset.ds_instance.map(map_func)
+            dataset = dataset.map(map_func)
         else:
             dataset = load_dataset(
                 path=data_path,

From 0baf32e219a8150249dae1e15ebd00816095c463 Mon Sep 17 00:00:00 2001
From: "xingjun.wang"
Date: Tue, 12 Dec 2023 12:03:23 +0800
Subject: [PATCH 4/9] update

Former-commit-id: e15fc417d897c3063a25d6eb7eb89d1916db3cc5
---
 src/llmtuner/data/loader.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py
index 602d9a8c..41c12422 100644
--- a/src/llmtuner/data/loader.py
+++ b/src/llmtuner/data/loader.py
@@ -59,13 +59,6 @@ def get_dataset(
                 dataset_name=data_path,
                 subset_name=data_name,
             ).to_hf_dataset()
-
-            def map_func(example):
-                # do something to example
-                example['input'] = example['input'] or ''
-                return example
-
-            dataset = dataset.map(map_func)
         else:
             dataset = load_dataset(
                 path=data_path,

From ed26bb3d825fec04a2f29952e97d34136ed6af0e Mon Sep 17 00:00:00 2001
From: "xingjun.wang"
Date: Tue, 12 Dec 2023 13:02:54 +0800
Subject: [PATCH 5/9] update args for MsDataset.load

Former-commit-id: c5f69357a167cbf99a93607177526e787419ea05
---
 src/llmtuner/data/loader.py        | 5 +++++
 src/llmtuner/hparams/model_args.py | 4 ++++
 2 files changed, 9 insertions(+)

diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py
index 41c12422..7bd52caa 100644
--- a/src/llmtuner/data/loader.py
+++ b/src/llmtuner/data/loader.py
@@ -58,6 +58,11 @@ def get_dataset(
             dataset = MsDataset.load(
                 dataset_name=data_path,
                 subset_name=data_name,
+                split=data_args.split,
+                data_files=data_files,
+                cache_dir=model_args.cache_dir,
+                token=model_args.ms_hub_token,
+                streaming=(data_args.streaming and (dataset_attr.load_from != "file")),
             ).to_hf_dataset()
         else:
             dataset = load_dataset(
diff --git a/src/llmtuner/hparams/model_args.py b/src/llmtuner/hparams/model_args.py
index ebf6cafa..c5819cea 100644
--- a/src/llmtuner/hparams/model_args.py
+++ b/src/llmtuner/hparams/model_args.py
@@ -59,6 +59,10 @@ class ModelArguments:
         default=None,
         metadata={"help": "Auth token to log in with Hugging Face Hub."}
     )
+    ms_hub_token: Optional[str] = field(
+        default=None,
+        metadata={"help": "Auth token to log in with ModelScope Hub."}
+    )
 
     def __post_init__(self):
         self.compute_dtype = None

From fb9e1c4087b7f0542762d81ebb1750304e3efaca Mon Sep 17 00:00:00 2001
From: "xingjun.wang"
Date: Tue, 12 Dec 2023 13:08:18 +0800
Subject: [PATCH 6/9] update cache dir

Former-commit-id: c8a1ce847fd7a75a06659133d92a0ac42e52a839
---
 src/llmtuner/data/loader.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py
index 7bd52caa..82130485 100644
--- a/src/llmtuner/data/loader.py
+++ b/src/llmtuner/data/loader.py
@@ -55,12 +55,14 @@ def get_dataset(
 
         if int(os.environ.get('USE_MODELSCOPE_HUB', '0')) and dataset_attr.load_from == "ms_hub":
             from modelscope import MsDataset
+            cache_dir = model_args.cache_dir
+            cache_dir = str(cache_dir) if cache_dir is not None else None
             dataset = MsDataset.load(
                 dataset_name=data_path,
                 subset_name=data_name,
                 split=data_args.split,
                 data_files=data_files,
-                cache_dir=model_args.cache_dir,
+                cache_dir=cache_dir,
                 token=model_args.ms_hub_token,
                 streaming=(data_args.streaming and (dataset_attr.load_from != "file")),
             ).to_hf_dataset()

From 39c2f03eabda7a2403f56714cf3ca1ae4282305e Mon Sep 17 00:00:00 2001
From: "xingjun.wang"
Date: Tue, 12 Dec 2023 14:14:40 +0800
Subject: [PATCH 7/9] add print info for test

Former-commit-id: e4ae2fccf0cbec57fb5fb01fd7cc352da69b23bf
---
 src/llmtuner/data/loader.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py
index 82130485..45e51d64 100644
--- a/src/llmtuner/data/loader.py
+++ b/src/llmtuner/data/loader.py
@@ -57,6 +57,15 @@ def get_dataset(
             from modelscope import MsDataset
             cache_dir = model_args.cache_dir
             cache_dir = str(cache_dir) if cache_dir is not None else None
+
+            print(f'>data_path: {data_path}, '
+                  f'>data_name: {data_name}, '
+                  f'>split: {data_args.split},'
+                  f'>data_files: {data_files}, '
+                  f'>cache_dir: {cache_dir},'
+                  f'>token: {model_args.ms_hub_token},'
+                  f'>streaming: {data_args.streaming}')
+
             dataset = MsDataset.load(
                 dataset_name=data_path,
                 subset_name=data_name,

From e54dad75da085cad226beaa020ecd2c006bb8485 Mon Sep 17 00:00:00 2001
From: "xingjun.wang"
Date: Tue, 12 Dec 2023 14:21:33 +0800
Subject: [PATCH 8/9] fix cache dir

Former-commit-id: 6231272b9c51d44196f1fbec026973231e489b67
---
 src/llmtuner/data/loader.py | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py
index 45e51d64..6421cfcf 100644
--- a/src/llmtuner/data/loader.py
+++ b/src/llmtuner/data/loader.py
@@ -55,16 +55,8 @@ def get_dataset(
         if int(os.environ.get('USE_MODELSCOPE_HUB', '0')) and dataset_attr.load_from == "ms_hub":
             from modelscope import MsDataset
-            cache_dir = model_args.cache_dir
-            cache_dir = str(cache_dir) if cache_dir is not None else None
-
-            print(f'>data_path: {data_path}, '
-                  f'>data_name: {data_name}, '
-                  f'>split: {data_args.split},'
-                  f'>data_files: {data_files}, '
-                  f'>cache_dir: {cache_dir},'
-                  f'>token: {model_args.ms_hub_token},'
-                  f'>streaming: {data_args.streaming}')
+            from modelscope.utils.config_ds import MS_DATASETS_CACHE
+            cache_dir = model_args.cache_dir or MS_DATASETS_CACHE
 
             dataset = MsDataset.load(
                 dataset_name=data_path,
                 subset_name=data_name,

From 3ee3fe0bbb17f810acf7cf74ebde0addac63b529 Mon Sep 17 00:00:00 2001
From: "xingjun.wang"
Date: Tue, 12 Dec 2023 14:23:05 +0800
Subject: [PATCH 9/9] add use_streaming

Former-commit-id: 80388abdb7ee88eb4afad92d8c706370c0574039
---
 src/llmtuner/data/loader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py
index 6421cfcf..5fe06755 100644
--- a/src/llmtuner/data/loader.py
+++ b/src/llmtuner/data/loader.py
@@ -65,7 +65,7 @@ def get_dataset(
                 data_files=data_files,
                 cache_dir=cache_dir,
                 token=model_args.ms_hub_token,
-                streaming=(data_args.streaming and (dataset_attr.load_from != "file")),
+                use_streaming=(data_args.streaming and (dataset_attr.load_from != "file")),
             ).to_hf_dataset()
         else:
             dataset = load_dataset(
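
Taken together, patches 1-9 leave the ModelScope branch of get_dataset() selecting MsDataset when USE_MODELSCOPE_HUB=1, falling back to ModelScope's dataset cache when no cache_dir is given, and converting the result back to a Hugging Face dataset. A standalone sketch of that final code path is below; it assumes the `modelscope` package is installed, and the dataset id, split, and token values are illustrative rather than taken from the patches.

    import os

    # The flag that selects the ModelScope branch inside llmtuner's get_dataset().
    os.environ['USE_MODELSCOPE_HUB'] = '1'

    from modelscope import MsDataset
    from modelscope.utils.config_ds import MS_DATASETS_CACHE

    # PATCH 8/9: model_args.cache_dir or MS_DATASETS_CACHE in the patched loader.
    cache_dir = MS_DATASETS_CACHE

    dataset = MsDataset.load(
        dataset_name='AI-ModelScope/alpaca-gpt4-data-zh',  # hypothetical ModelScope dataset id
        subset_name=None,
        split='train',
        data_files=None,
        cache_dir=cache_dir,
        token=None,            # model_args.ms_hub_token in the patched loader
        use_streaming=False,   # PATCH 9/9: renamed from streaming=
    ).to_hf_dataset()          # hand a datasets.Dataset back to the rest of the pipeline

    print(dataset)

Converting with .to_hf_dataset() is what lets the rest of the preprocessing in loader.py stay unchanged regardless of which hub the data came from.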