From 130bfaf8e3b187859a17796b2d98d61eb2b56090 Mon Sep 17 00:00:00 2001 From: Eric Tang <46737979+erictang000@users.noreply.github.com> Date: Fri, 16 May 2025 00:35:38 -0700 Subject: [PATCH] [data] support loading folder from remote (#8078) --- src/llamafactory/data/data_utils.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/llamafactory/data/data_utils.py b/src/llamafactory/data/data_utils.py index 3ea4aff7..b23423be 100644 --- a/src/llamafactory/data/data_utils.py +++ b/src/llamafactory/data/data_utils.py @@ -169,11 +169,22 @@ def read_cloud_json(cloud_path): try: # Try with anonymous access first fs = setup_fs(cloud_path, anon=True) - return _read_json_with_fs(fs, cloud_path, lines=cloud_path.endswith(".jsonl")) except Exception: # Try again with credentials fs = setup_fs(cloud_path) - return _read_json_with_fs(fs, cloud_path, lines=cloud_path.endswith(".jsonl")) + + if fs.isdir(cloud_path): + files = [x["Key"] for x in fs.listdir(cloud_path)] + else: + files = [cloud_path] + # filter out non-JSON files + files = [file for file in files if file.endswith(".json") or file.endswith(".jsonl")] + if not files: + raise ValueError(f"No JSON/JSONL files found in the specified path: {cloud_path}") + data = [] + for file in files: + data.extend(_read_json_with_fs(fs, file, lines=file.endswith(".jsonl"))) + return data def _read_json_with_fs(fs, path, lines=True):