From c523613f0ae26f08147f098b32b8639a767af0ed Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Fri, 8 Dec 2023 18:00:57 +0800 Subject: [PATCH 01/14] support ms dataset Former-commit-id: 9c2247d700763f480d88a5dd46480cb32cfc174e --- data/dataset_info.json | 33 ++++++++++++++++++++++--------- src/llmtuner/data/loader.py | 27 +++++++++++++++---------- src/llmtuner/hparams/data_args.py | 15 ++++++++++++-- 3 files changed, 54 insertions(+), 21 deletions(-) diff --git a/data/dataset_info.json b/data/dataset_info.json index 2b3f4eb7..60235d47 100644 --- a/data/dataset_info.json +++ b/data/dataset_info.json @@ -69,22 +69,28 @@ } }, "guanaco": { - "hf_hub_url": "JosephusCheung/GuanacoDataset" + "hf_hub_url": "JosephusCheung/GuanacoDataset", + "ms_hub_yrl": "wyj123456/GuanacoDataset" }, "belle_2m": { - "hf_hub_url": "BelleGroup/train_2M_CN" + "hf_hub_url": "BelleGroup/train_2M_CN", + "ms_hub_yrl": "AI-ModelScope/train_2M_CN" }, "belle_1m": { - "hf_hub_url": "BelleGroup/train_1M_CN" + "hf_hub_url": "BelleGroup/train_1M_CN", + "ms_hub_yrl": "AI-ModelScope/train_1M_CN" }, "belle_0.5m": { - "hf_hub_url": "BelleGroup/train_0.5M_CN" + "hf_hub_url": "BelleGroup/train_0.5M_CN", + "ms_hub_yrl": "AI-ModelScope/train_0.5M_CN" }, "belle_dialog": { - "hf_hub_url": "BelleGroup/generated_chat_0.4M" + "hf_hub_url": "BelleGroup/generated_chat_0.4M", + "ms_hub_yrl": "AI-ModelScope/generated_chat_0.4M" }, "belle_math": { - "hf_hub_url": "BelleGroup/school_math_0.25M" + "hf_hub_url": "BelleGroup/school_math_0.25M", + "ms_hub_yrl": "AI-ModelScope/school_math_0.25M" }, "belle_multiturn": { "script_url": "belle_multiturn", @@ -95,10 +101,12 @@ "formatting": "sharegpt" }, "open_platypus": { - "hf_hub_url": "garage-bAInd/Open-Platypus" + "hf_hub_url": "garage-bAInd/Open-Platypus", + "ms_hub_yrl": "AI-ModelScope/Open-Platypus" }, "codealpaca": { - "hf_hub_url": "sahil2801/CodeAlpaca-20k" + "hf_hub_url": "sahil2801/CodeAlpaca-20k", + "ms_hub_yrl": "AI-ModelScope/CodeAlpaca-20k" }, "alpaca_cot": { "hf_hub_url": "QingyiSi/Alpaca-CoT" @@ -112,6 +120,7 @@ }, "mathinstruct": { "hf_hub_url": "TIGER-Lab/MathInstruct", + "ms_hub_yrl": "AI-ModelScope/MathInstruct", "columns": { "prompt": "instruction", "response": "output" @@ -126,13 +135,15 @@ }, "webqa": { "hf_hub_url": "suolyer/webqa", + "ms_hub_yrl": "AI-ModelScope/webqa", "columns": { "prompt": "input", "response": "output" } }, "webnovel": { - "hf_hub_url": "zxbsmk/webnovel_cn" + "hf_hub_url": "zxbsmk/webnovel_cn", + "ms_hub_yrl": "AI-ModelScope/webnovel_cn" }, "nectar_sft": { "hf_hub_url": "mlinmg/SFT-Nectar" @@ -146,10 +157,12 @@ }, "sharegpt_hyper": { "hf_hub_url": "totally-not-an-llm/sharegpt-hyperfiltered-3k", + "ms_hub_yrl": "AI-ModelScope/sharegpt-hyperfiltered-3k", "formatting": "sharegpt" }, "sharegpt4": { "hf_hub_url": "shibing624/sharegpt_gpt4", + "ms_hub_yrl": "AI-ModelScope/sharegpt_gpt4", "formatting": "sharegpt" }, "ultrachat_200k": { @@ -176,6 +189,7 @@ }, "evol_instruct": { "hf_hub_url": "WizardLM/WizardLM_evol_instruct_V2_196k", + "ms_hub_yrl": "AI-ModelScope/WizardLM_evol_instruct_V2_196k", "formatting": "sharegpt" }, "hh_rlhf_en": { @@ -251,6 +265,7 @@ }, "wikipedia_zh": { "hf_hub_url": "pleisto/wikipedia-cn-20230720-filtered", + "ms_hub_yrl": "AI-ModelScope/wikipedia-cn-20230720-filtered", "columns": { "prompt": "completion" } diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py index 8e9053ca..41c12422 100644 --- a/src/llmtuner/data/loader.py +++ b/src/llmtuner/data/loader.py @@ -24,7 +24,7 @@ def get_dataset( for dataset_attr in 
data_args.dataset_list: logger.info("Loading dataset {}...".format(dataset_attr)) - if dataset_attr.load_from == "hf_hub": + if dataset_attr.load_from in ("hf_hub", "ms_hub"): data_path = dataset_attr.dataset_name data_name = dataset_attr.subset data_files = None @@ -53,15 +53,22 @@ def get_dataset( else: raise NotImplementedError - dataset = load_dataset( - path=data_path, - name=data_name, - data_files=data_files, - split=data_args.split, - cache_dir=model_args.cache_dir, - token=model_args.hf_hub_token, - streaming=(data_args.streaming and (dataset_attr.load_from != "file")) - ) + if int(os.environ.get('USE_MODELSCOPE_HUB', '0')) and dataset_attr.load_from == "ms_hub": + from modelscope import MsDataset + dataset = MsDataset.load( + dataset_name=data_path, + subset_name=data_name, + ).to_hf_dataset() + else: + dataset = load_dataset( + path=data_path, + name=data_name, + data_files=data_files, + split=data_args.split, + cache_dir=model_args.cache_dir, + token=model_args.hf_hub_token, + streaming=(data_args.streaming and (dataset_attr.load_from != "file")) + ) if data_args.streaming and (dataset_attr.load_from == "file"): dataset = dataset.to_iterable_dataset() # TODO: add num shards parameter diff --git a/src/llmtuner/hparams/data_args.py b/src/llmtuner/hparams/data_args.py index cea89198..6f8bb738 100644 --- a/src/llmtuner/hparams/data_args.py +++ b/src/llmtuner/hparams/data_args.py @@ -2,7 +2,9 @@ import os import json from typing import List, Literal, Optional from dataclasses import dataclass, field +from llmtuner.extras.logging import get_logger +logger = get_logger(__name__) DATA_CONFIG = "dataset_info.json" @@ -152,8 +154,17 @@ class DataArguments: if name not in dataset_info: raise ValueError("Undefined dataset {} in {}.".format(name, DATA_CONFIG)) - if "hf_hub_url" in dataset_info[name]: - dataset_attr = DatasetAttr("hf_hub", dataset_name=dataset_info[name]["hf_hub_url"]) + if "hf_hub_url" in dataset_info[name] or 'ms_hub_url' in dataset_info[name]: + url_key_name = "hf_hub_url" + if int(os.environ.get('USE_MODELSCOPE_HUB', '0')): + if 'ms_hub_url' in dataset_info[name]: + url_key_name = 'ms_hub_url' + else: + logger.warning('You are using ModelScope Hub, but the specified dataset ' + 'has no `ms_hub_url` key, so `hf_hub_url` will be used instead.') + + dataset_attr = DatasetAttr(url_key_name[:url_key_name.index('_url')], + dataset_name=dataset_info[name][url_key_name]) elif "script_url" in dataset_info[name]: dataset_attr = DatasetAttr("script", dataset_name=dataset_info[name]["script_url"]) else: From 9c30cdb53da69f0083987e88206df58f4aaa77f4 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Fri, 8 Dec 2023 18:13:26 +0800 Subject: [PATCH 02/14] fix typo Former-commit-id: e4cf2a75caac75cb6320350ba179b8e2dcd87366 --- data/dataset_info.json | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/data/dataset_info.json b/data/dataset_info.json index 60235d47..6593cf34 100644 --- a/data/dataset_info.json +++ b/data/dataset_info.json @@ -70,27 +70,27 @@ }, "guanaco": { "hf_hub_url": "JosephusCheung/GuanacoDataset", - "ms_hub_yrl": "wyj123456/GuanacoDataset" + "ms_hub_url": "wyj123456/GuanacoDataset" }, "belle_2m": { "hf_hub_url": "BelleGroup/train_2M_CN", - "ms_hub_yrl": "AI-ModelScope/train_2M_CN" + "ms_hub_url": "AI-ModelScope/train_2M_CN" }, "belle_1m": { "hf_hub_url": "BelleGroup/train_1M_CN", - "ms_hub_yrl": "AI-ModelScope/train_1M_CN" + "ms_hub_url": "AI-ModelScope/train_1M_CN" }, "belle_0.5m": { "hf_hub_url": 
"BelleGroup/train_0.5M_CN", - "ms_hub_yrl": "AI-ModelScope/train_0.5M_CN" + "ms_hub_url": "AI-ModelScope/train_0.5M_CN" }, "belle_dialog": { "hf_hub_url": "BelleGroup/generated_chat_0.4M", - "ms_hub_yrl": "AI-ModelScope/generated_chat_0.4M" + "ms_hub_url": "AI-ModelScope/generated_chat_0.4M" }, "belle_math": { "hf_hub_url": "BelleGroup/school_math_0.25M", - "ms_hub_yrl": "AI-ModelScope/school_math_0.25M" + "ms_hub_url": "AI-ModelScope/school_math_0.25M" }, "belle_multiturn": { "script_url": "belle_multiturn", @@ -102,11 +102,11 @@ }, "open_platypus": { "hf_hub_url": "garage-bAInd/Open-Platypus", - "ms_hub_yrl": "AI-ModelScope/Open-Platypus" + "ms_hub_url": "AI-ModelScope/Open-Platypus" }, "codealpaca": { "hf_hub_url": "sahil2801/CodeAlpaca-20k", - "ms_hub_yrl": "AI-ModelScope/CodeAlpaca-20k" + "ms_hub_url": "AI-ModelScope/CodeAlpaca-20k" }, "alpaca_cot": { "hf_hub_url": "QingyiSi/Alpaca-CoT" @@ -120,7 +120,7 @@ }, "mathinstruct": { "hf_hub_url": "TIGER-Lab/MathInstruct", - "ms_hub_yrl": "AI-ModelScope/MathInstruct", + "ms_hub_url": "AI-ModelScope/MathInstruct", "columns": { "prompt": "instruction", "response": "output" @@ -135,7 +135,7 @@ }, "webqa": { "hf_hub_url": "suolyer/webqa", - "ms_hub_yrl": "AI-ModelScope/webqa", + "ms_hub_url": "AI-ModelScope/webqa", "columns": { "prompt": "input", "response": "output" @@ -143,7 +143,7 @@ }, "webnovel": { "hf_hub_url": "zxbsmk/webnovel_cn", - "ms_hub_yrl": "AI-ModelScope/webnovel_cn" + "ms_hub_url": "AI-ModelScope/webnovel_cn" }, "nectar_sft": { "hf_hub_url": "mlinmg/SFT-Nectar" @@ -157,12 +157,12 @@ }, "sharegpt_hyper": { "hf_hub_url": "totally-not-an-llm/sharegpt-hyperfiltered-3k", - "ms_hub_yrl": "AI-ModelScope/sharegpt-hyperfiltered-3k", + "ms_hub_url": "AI-ModelScope/sharegpt-hyperfiltered-3k", "formatting": "sharegpt" }, "sharegpt4": { "hf_hub_url": "shibing624/sharegpt_gpt4", - "ms_hub_yrl": "AI-ModelScope/sharegpt_gpt4", + "ms_hub_url": "AI-ModelScope/sharegpt_gpt4", "formatting": "sharegpt" }, "ultrachat_200k": { @@ -189,7 +189,7 @@ }, "evol_instruct": { "hf_hub_url": "WizardLM/WizardLM_evol_instruct_V2_196k", - "ms_hub_yrl": "AI-ModelScope/WizardLM_evol_instruct_V2_196k", + "ms_hub_url": "AI-ModelScope/WizardLM_evol_instruct_V2_196k", "formatting": "sharegpt" }, "hh_rlhf_en": { @@ -265,7 +265,7 @@ }, "wikipedia_zh": { "hf_hub_url": "pleisto/wikipedia-cn-20230720-filtered", - "ms_hub_yrl": "AI-ModelScope/wikipedia-cn-20230720-filtered", + "ms_hub_url": "AI-ModelScope/wikipedia-cn-20230720-filtered", "columns": { "prompt": "completion" } From 2918743520a488df914305f9d24e105d3465e784 Mon Sep 17 00:00:00 2001 From: "xingjun.wang" Date: Tue, 12 Dec 2023 11:47:59 +0800 Subject: [PATCH 03/14] for test Former-commit-id: 8a908a8c644f4a961001cdd8388a3a7fea992c55 --- src/llmtuner/data/loader.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py index 41c12422..70beea05 100644 --- a/src/llmtuner/data/loader.py +++ b/src/llmtuner/data/loader.py @@ -59,6 +59,13 @@ def get_dataset( dataset_name=data_path, subset_name=data_name, ).to_hf_dataset() + + def map_func(example): + # do something to example + example['input'] = example['input'] or '' + return example + + dataset = dataset.ds_instance.map(map_func) else: dataset = load_dataset( path=data_path, From 1d65d240719553f2b0d281992d56c6da1251780b Mon Sep 17 00:00:00 2001 From: "xingjun.wang" Date: Tue, 12 Dec 2023 11:52:59 +0800 Subject: [PATCH 04/14] for test Former-commit-id: 5b979147f093e86f44c4228ab34d04bdae94f89f --- 
src/llmtuner/data/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py index 70beea05..602d9a8c 100644 --- a/src/llmtuner/data/loader.py +++ b/src/llmtuner/data/loader.py @@ -65,7 +65,7 @@ def get_dataset( example['input'] = example['input'] or '' return example - dataset = dataset.ds_instance.map(map_func) + dataset = dataset.map(map_func) else: dataset = load_dataset( path=data_path, From 6520aecef1ef06b1e90f5867cbc9681be49cf627 Mon Sep 17 00:00:00 2001 From: "xingjun.wang" Date: Tue, 12 Dec 2023 12:03:23 +0800 Subject: [PATCH 05/14] update Former-commit-id: cfba1009d0fc31b5933b558b249d89248f723d6b --- src/llmtuner/data/loader.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py index 602d9a8c..41c12422 100644 --- a/src/llmtuner/data/loader.py +++ b/src/llmtuner/data/loader.py @@ -59,13 +59,6 @@ def get_dataset( dataset_name=data_path, subset_name=data_name, ).to_hf_dataset() - - def map_func(example): - # do something to example - example['input'] = example['input'] or '' - return example - - dataset = dataset.map(map_func) else: dataset = load_dataset( path=data_path, From 92fb73abd422dd04c37e8edf1b38b3e7bcd0b290 Mon Sep 17 00:00:00 2001 From: "xingjun.wang" Date: Tue, 12 Dec 2023 12:34:04 +0800 Subject: [PATCH 06/14] add open orca Former-commit-id: 0ce18a378255a1d075a38a364520ba7a1e56180f --- data/dataset_info.json | 1 + 1 file changed, 1 insertion(+) diff --git a/data/dataset_info.json b/data/dataset_info.json index 6593cf34..c9d9839a 100644 --- a/data/dataset_info.json +++ b/data/dataset_info.json @@ -113,6 +113,7 @@ }, "openorca": { "hf_hub_url": "Open-Orca/OpenOrca", + "ms_hub_url": "AI-ModelScope/OpenOrca", "columns": { "prompt": "question", "response": "response" From 9f17d36ccf983613350c988d3a302c9d12fb0fb7 Mon Sep 17 00:00:00 2001 From: "xingjun.wang" Date: Tue, 12 Dec 2023 12:44:15 +0800 Subject: [PATCH 07/14] add new datasets Former-commit-id: fe4acc66b0e2bd96c988315192beb161da2d51f8 --- data/dataset_info.json | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/data/dataset_info.json b/data/dataset_info.json index c9d9839a..01f4db21 100644 --- a/data/dataset_info.json +++ b/data/dataset_info.json @@ -129,6 +129,7 @@ }, "firefly": { "hf_hub_url": "YeungNLP/firefly-train-1.1M", + "ms_hub_url": "AI-ModelScope/firefly-train-1.1M", "columns": { "prompt": "input", "response": "target" @@ -151,6 +152,7 @@ }, "adgen": { "hf_hub_url": "HasturOfficial/adgen", + "ms_hub_url": "AI-ModelScope/adgen", "columns": { "prompt": "content", "response": "summary" @@ -168,6 +170,7 @@ }, "ultrachat_200k": { "hf_hub_url": "HuggingFaceH4/ultrachat_200k", + "ms_hub_url": "AI-ModelScope/ultrachat_200k", "columns": { "prompt": "messages", "query": "role", @@ -181,6 +184,7 @@ }, "lmsys_chat": { "hf_hub_url": "lmsys/lmsys-chat-1m", + "ms_hub_url": "AI-ModelScope/lmsys-chat-1m", "columns": { "prompt": "conversation", "query": "role", From 879209829ea2f7587bd47d62d282ef2b7a3a403b Mon Sep 17 00:00:00 2001 From: "xingjun.wang" Date: Tue, 12 Dec 2023 13:02:54 +0800 Subject: [PATCH 08/14] update args for MsDataset.load Former-commit-id: 09533e95edc5fa65a38b2f04c6d88506196021b3 --- data/dataset_info.json | 1 - src/llmtuner/data/loader.py | 5 +++++ src/llmtuner/hparams/model_args.py | 4 ++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/data/dataset_info.json b/data/dataset_info.json index 01f4db21..ceb81379 100644 --- a/data/dataset_info.json +++ 
b/data/dataset_info.json @@ -129,7 +129,6 @@ }, "firefly": { "hf_hub_url": "YeungNLP/firefly-train-1.1M", - "ms_hub_url": "AI-ModelScope/firefly-train-1.1M", "columns": { "prompt": "input", "response": "target" diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py index 41c12422..7bd52caa 100644 --- a/src/llmtuner/data/loader.py +++ b/src/llmtuner/data/loader.py @@ -58,6 +58,11 @@ def get_dataset( dataset = MsDataset.load( dataset_name=data_path, subset_name=data_name, + split=data_args.split, + data_files=data_files, + cache_dir=model_args.cache_dir, + token=model_args.ms_hub_token, + streaming=(data_args.streaming and (dataset_attr.load_from != "file")), ).to_hf_dataset() else: dataset = load_dataset( diff --git a/src/llmtuner/hparams/model_args.py b/src/llmtuner/hparams/model_args.py index ebf6cafa..c5819cea 100644 --- a/src/llmtuner/hparams/model_args.py +++ b/src/llmtuner/hparams/model_args.py @@ -59,6 +59,10 @@ class ModelArguments: default=None, metadata={"help": "Auth token to log in with Hugging Face Hub."} ) + ms_hub_token: Optional[str] = field( + default=None, + metadata={"help": "Auth token to log in with ModelScope Hub."} + ) def __post_init__(self): self.compute_dtype = None From e17f2a3f7ffe4edfe06fa9c113895bc9ab9ad121 Mon Sep 17 00:00:00 2001 From: "xingjun.wang" Date: Tue, 12 Dec 2023 13:08:18 +0800 Subject: [PATCH 09/14] update cache dir Former-commit-id: edc82b923a3fb03c5af100b5357e10f0c18b4523 --- src/llmtuner/data/loader.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py index 7bd52caa..82130485 100644 --- a/src/llmtuner/data/loader.py +++ b/src/llmtuner/data/loader.py @@ -55,12 +55,14 @@ def get_dataset( if int(os.environ.get('USE_MODELSCOPE_HUB', '0')) and dataset_attr.load_from == "ms_hub": from modelscope import MsDataset + cache_dir = model_args.cache_dir + cache_dir = str(cache_dir) if cache_dir is not None else None dataset = MsDataset.load( dataset_name=data_path, subset_name=data_name, split=data_args.split, data_files=data_files, - cache_dir=model_args.cache_dir, + cache_dir=cache_dir, token=model_args.ms_hub_token, streaming=(data_args.streaming and (dataset_attr.load_from != "file")), ).to_hf_dataset() From c1974c91e5d9cfdb1e57e60265125932eaa7ca17 Mon Sep 17 00:00:00 2001 From: "xingjun.wang" Date: Tue, 12 Dec 2023 14:14:40 +0800 Subject: [PATCH 10/14] add print info for test Former-commit-id: 168321a4da7612620b9528860306f03bf65d019a --- src/llmtuner/data/loader.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py index 82130485..45e51d64 100644 --- a/src/llmtuner/data/loader.py +++ b/src/llmtuner/data/loader.py @@ -57,6 +57,15 @@ def get_dataset( from modelscope import MsDataset cache_dir = model_args.cache_dir cache_dir = str(cache_dir) if cache_dir is not None else None + + print(f'>data_path: {data_path}, ' + f'>data_name: {data_name}, ' + f'>split: {data_args.split},' + f'>data_files: {data_files}, ' + f'>cache_dir: {cache_dir},' + f'>token: {model_args.ms_hub_token},' + f'>streaming: {data_args.streaming}') + dataset = MsDataset.load( dataset_name=data_path, subset_name=data_name, From 1bd75afae8285d9066b09b5af7905ada314de528 Mon Sep 17 00:00:00 2001 From: "xingjun.wang" Date: Tue, 12 Dec 2023 14:21:33 +0800 Subject: [PATCH 11/14] fix cache dir Former-commit-id: 1909f0d11732bd99fadc6c1191e026137c6a7dff --- src/llmtuner/data/loader.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 
deletions(-) diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py index 45e51d64..6421cfcf 100644 --- a/src/llmtuner/data/loader.py +++ b/src/llmtuner/data/loader.py @@ -55,16 +55,8 @@ def get_dataset( if int(os.environ.get('USE_MODELSCOPE_HUB', '0')) and dataset_attr.load_from == "ms_hub": from modelscope import MsDataset - cache_dir = model_args.cache_dir - cache_dir = str(cache_dir) if cache_dir is not None else None - - print(f'>data_path: {data_path}, ' - f'>data_name: {data_name}, ' - f'>split: {data_args.split},' - f'>data_files: {data_files}, ' - f'>cache_dir: {cache_dir},' - f'>token: {model_args.ms_hub_token},' - f'>streaming: {data_args.streaming}') + from modelscope.utils.config_ds import MS_DATASETS_CACHE + cache_dir = model_args.cache_dir or MS_DATASETS_CACHE dataset = MsDataset.load( dataset_name=data_path, From 6cb2c99e7dfbb55cdbd9880fec4afeb59d85435a Mon Sep 17 00:00:00 2001 From: "xingjun.wang" Date: Tue, 12 Dec 2023 14:23:05 +0800 Subject: [PATCH 12/14] add use_streaming Former-commit-id: adc98c86dad64f1a793017fa628b5cf19abbdd01 --- src/llmtuner/data/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py index 6421cfcf..5fe06755 100644 --- a/src/llmtuner/data/loader.py +++ b/src/llmtuner/data/loader.py @@ -65,7 +65,7 @@ def get_dataset( data_files=data_files, cache_dir=cache_dir, token=model_args.ms_hub_token, - streaming=(data_args.streaming and (dataset_attr.load_from != "file")), + use_streaming=(data_args.streaming and (dataset_attr.load_from != "file")), ).to_hf_dataset() else: dataset = load_dataset( From 277790d86896d506f19fa1c6e70187df6aeda3cd Mon Sep 17 00:00:00 2001 From: "xingjun.wang" Date: Tue, 12 Dec 2023 14:53:59 +0800 Subject: [PATCH 13/14] update dataset info Former-commit-id: 73b50a26b9c6282f28df87338fa4057759c38f69 --- data/dataset_info.json | 2 -- 1 file changed, 2 deletions(-) diff --git a/data/dataset_info.json b/data/dataset_info.json index ceb81379..09be3e39 100644 --- a/data/dataset_info.json +++ b/data/dataset_info.json @@ -159,7 +159,6 @@ }, "sharegpt_hyper": { "hf_hub_url": "totally-not-an-llm/sharegpt-hyperfiltered-3k", - "ms_hub_url": "AI-ModelScope/sharegpt-hyperfiltered-3k", "formatting": "sharegpt" }, "sharegpt4": { @@ -193,7 +192,6 @@ }, "evol_instruct": { "hf_hub_url": "WizardLM/WizardLM_evol_instruct_V2_196k", - "ms_hub_url": "AI-ModelScope/WizardLM_evol_instruct_V2_196k", "formatting": "sharegpt" }, "hh_rlhf_en": { From e331e8c2007e2a682b8fb497f9f92fbd138d619f Mon Sep 17 00:00:00 2001 From: "xingjun.wang" Date: Tue, 12 Dec 2023 15:00:37 +0800 Subject: [PATCH 14/14] modify guanaco Former-commit-id: e80a989d49366bf08f62d212d329a90a02d8167e --- data/dataset_info.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/dataset_info.json b/data/dataset_info.json index 09be3e39..e941d401 100644 --- a/data/dataset_info.json +++ b/data/dataset_info.json @@ -70,7 +70,7 @@ }, "guanaco": { "hf_hub_url": "JosephusCheung/GuanacoDataset", - "ms_hub_url": "wyj123456/GuanacoDataset" + "ms_hub_url": "AI-ModelScope/GuanacoDataset" }, "belle_2m": { "hf_hub_url": "BelleGroup/train_2M_CN",
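
Taken together, patches 01/14 through 14/14 let the data loader pull any dataset that defines an "ms_hub_url" entry in data/dataset_info.json from ModelScope Hub whenever the environment variable USE_MODELSCOPE_HUB=1 is set, falling back to the Hugging Face Hub path otherwise. A minimal standalone sketch of that loading branch follows; the helper name load_ms_dataset and the concrete dataset id are illustrative assumptions, while the MsDataset.load arguments mirror the final state of src/llmtuner/data/loader.py after patch 12/14.

    import os

    from modelscope import MsDataset
    from modelscope.utils.config_ds import MS_DATASETS_CACHE


    def load_ms_dataset(data_path, data_name=None, split="train", data_files=None,
                        cache_dir=None, token=None, streaming=False):
        # Mirrors the branch taken in get_dataset() when USE_MODELSCOPE_HUB=1 and
        # the dataset entry defines an "ms_hub_url" key (helper name is illustrative).
        cache_dir = cache_dir or MS_DATASETS_CACHE
        return MsDataset.load(
            dataset_name=data_path,
            subset_name=data_name,
            split=split,
            data_files=data_files,
            cache_dir=cache_dir,
            token=token,
            use_streaming=streaming,
        ).to_hf_dataset()


    if __name__ == "__main__":
        # In llmtuner itself this path is only taken when USE_MODELSCOPE_HUB=1 is
        # present in os.environ; here the helper is called directly for illustration.
        # "AI-ModelScope/CodeAlpaca-20k" is one of the ms_hub_url values added in patch 01/14.
        dataset = load_ms_dataset("AI-ModelScope/CodeAlpaca-20k")
        print(dataset)

Converting the result with to_hf_dataset() keeps the rest of the pipeline unchanged, since downstream preprocessing still receives a Hugging Face datasets object regardless of which hub served the data.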