mirror of
https://github.com/hiyouga/LLaMA-Factory.git
synced 2025-08-03 20:22:49 +08:00
support for previewing custom dataset in directory format
Former-commit-id: 75aa6392e86e965dc4a04b288ec60cff4e83f1af
This commit is contained in:
parent
627f281e5b
commit
43d1730aa5
@ -29,28 +29,42 @@ def can_preview(dataset_dir: str, dataset: list) -> "gr.Button":
|
|||||||
except Exception:
|
except Exception:
|
||||||
return gr.Button(interactive=False)
|
return gr.Button(interactive=False)
|
||||||
|
|
||||||
|
local_path = os.path.join(dataset_dir, dataset_info[dataset[0]]["file_name"])
|
||||||
if (
|
if (
|
||||||
len(dataset) > 0
|
len(dataset) > 0
|
||||||
and "file_name" in dataset_info[dataset[0]]
|
and "file_name" in dataset_info[dataset[0]]
|
||||||
and os.path.isfile(os.path.join(dataset_dir, dataset_info[dataset[0]]["file_name"]))
|
and (os.path.isfile(local_path)
|
||||||
|
or (os.path.isdir(local_path)) and len(os.listdir(local_path)) != 0)
|
||||||
):
|
):
|
||||||
return gr.Button(interactive=True)
|
return gr.Button(interactive=True)
|
||||||
else:
|
else:
|
||||||
return gr.Button(interactive=False)
|
return gr.Button(interactive=False)
|
||||||
|
|
||||||
|
|
||||||
|
def load_single_data(data_file_path: str):
    """Load the contents of a single dataset file.

    The format is selected from the file extension:

    * ``.json``  - the whole file is parsed as one JSON document and the
      parsed value is returned unchanged.
    * ``.jsonl`` - every non-blank line is parsed as its own JSON document.
    * anything else - the raw text lines are returned (line endings kept).

    Args:
        data_file_path: Path of the file to read. It is opened as UTF-8 text.

    Returns:
        The parsed JSON value for ``.json`` files, otherwise a list with one
        entry per line of the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a ``.json`` / ``.jsonl`` file holds invalid JSON.
    """
    with open(data_file_path, "r", encoding="utf-8") as f:
        if data_file_path.endswith(".json"):
            return json.load(f)

        if data_file_path.endswith(".jsonl"):
            # A blank line (typically a trailing newline at the end of the
            # file) is not a JSON document; skip it instead of failing.
            return [json.loads(line) for line in f if line.strip()]

        return list(f)
|
||||||
|
|
||||||
|
|
||||||
def get_preview(dataset_dir: str, dataset: list, page_index: int) -> Tuple[int, list, "gr.Column"]:
    """Return one page of samples from the first selected dataset.

    The dataset's ``file_name`` entry is looked up in the ``DATA_CONFIG``
    file inside ``dataset_dir``. It may name a single file or a directory;
    for a directory, the contents of every regular file directly inside it
    are concatenated.

    Args:
        dataset_dir: Directory containing ``DATA_CONFIG`` and the data files.
        dataset: Selected dataset names; only ``dataset[0]`` is previewed.
        page_index: Zero-based index of the page to return.

    Returns:
        A tuple of (total number of samples, the samples on the requested
        page, a visible ``gr.Column``).
    """
    with open(os.path.join(dataset_dir, DATA_CONFIG), "r", encoding="utf-8") as f:
        dataset_info = json.load(f)

    data_file: str = dataset_info[dataset[0]]["file_name"]
    local_path = os.path.join(dataset_dir, data_file)
    if os.path.isdir(local_path):
        data = []
        # os.listdir() yields entries in arbitrary order; sort them so that a
        # given page index always shows the same samples.
        for file_name in sorted(os.listdir(local_path)):
            file_path = os.path.join(local_path, file_name)
            # Nested directories cannot be opened as data files; skip them.
            if os.path.isfile(file_path):
                data.extend(load_single_data(file_path))
    else:
        data = load_single_data(local_path)

    page_start = PAGE_SIZE * page_index
    return len(data), data[page_start : page_start + PAGE_SIZE], gr.Column(visible=True)
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user