From 43d1730aa5d39393a4de6e788c9cbcd6ee8041a0 Mon Sep 17 00:00:00 2001 From: codingma Date: Tue, 16 Apr 2024 10:43:14 +0800 Subject: [PATCH] support for previewing custom dataset in directory format Former-commit-id: 75aa6392e86e965dc4a04b288ec60cff4e83f1af --- src/llmtuner/webui/components/data.py | 38 ++++++++++++++++++--------- 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/src/llmtuner/webui/components/data.py b/src/llmtuner/webui/components/data.py index 46274417..c0f113ea 100644 --- a/src/llmtuner/webui/components/data.py +++ b/src/llmtuner/webui/components/data.py @@ -28,30 +28,44 @@ def can_preview(dataset_dir: str, dataset: list) -> "gr.Button": dataset_info = json.load(f) except Exception: return gr.Button(interactive=False) - + + local_path = os.path.join(dataset_dir, dataset_info[dataset[0]]["file_name"]) if ( - len(dataset) > 0 - and "file_name" in dataset_info[dataset[0]] - and os.path.isfile(os.path.join(dataset_dir, dataset_info[dataset[0]]["file_name"])) + len(dataset) > 0 + and "file_name" in dataset_info[dataset[0]] + and (os.path.isfile(local_path) + or (os.path.isdir(local_path)) and len(os.listdir(local_path)) != 0) ): return gr.Button(interactive=True) else: return gr.Button(interactive=False) +def load_single_data(data_file_path): + with open(os.path.join(data_file_path), "r", encoding="utf-8") as f: + if data_file_path.endswith(".json"): + data = json.load(f) + elif data_file_path.endswith(".jsonl"): + data = [json.loads(line) for line in f] + else: + data = [line for line in f] # noqa: C416 + return data + + def get_preview(dataset_dir: str, dataset: list, page_index: int) -> Tuple[int, list, "gr.Column"]: with open(os.path.join(dataset_dir, DATA_CONFIG), "r", encoding="utf-8") as f: dataset_info = json.load(f) data_file: str = dataset_info[dataset[0]]["file_name"] - with open(os.path.join(dataset_dir, data_file), "r", encoding="utf-8") as f: - if data_file.endswith(".json"): - data = json.load(f) - elif data_file.endswith(".jsonl"): - data = [json.loads(line) for line in f] - else: - data = [line for line in f] # noqa: C416 - return len(data), data[PAGE_SIZE * page_index : PAGE_SIZE * (page_index + 1)], gr.Column(visible=True) + local_path = os.path.join(dataset_dir, data_file) + if os.path.isdir(local_path): + data = [] + for file_name in os.listdir(local_path): + data.extend(load_single_data(os.path.join(local_path, file_name))) + else: + data = load_single_data(local_path) + + return len(data), data[PAGE_SIZE * page_index: PAGE_SIZE * (page_index + 1)], gr.Column(visible=True) def create_preview_box(dataset_dir: "gr.Textbox", dataset: "gr.Dropdown") -> Dict[str, "Component"]: