From 38c96417775bbbcd23cf306d0bc50bfcead14332 Mon Sep 17 00:00:00 2001
From: Noah <65912125+noahc1510@users.noreply.github.com>
Date: Thu, 13 Feb 2025 01:39:41 +0800
Subject: [PATCH] [data] improve error handling (#6128)

* sync from upstream

* update

* update

* fix

---------

Co-authored-by: hiyouga
Former-commit-id: 1569e6096fec07da5583f1a3435b0d23ae09b5ba
---
 pyproject.toml                     | 33 +++++++++++++++---------------
 src/llamafactory/data/converter.py | 22 +++++++++++---------
 2 files changed, 29 insertions(+), 26 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index ea5ed5ae..b94d25d6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,22 +9,7 @@ requires-python = ">=3.10"
 
 [project.scripts]
 llamafactory-cli = "llamafactory.cli:main"
-
-[tool.uv]
-conflicts = [
-    [
-        {extra = "aqlm" },
-        {extra = "torch-npu" },
-    ],
-    [
-        {extra = "torch-npu" },
-        {extra = "liger-kernel" },
-    ],
-    [
-        {extra = "vllm" },
-        {extra = "torch-npu" },
-    ]
-]
+lmf = "llamafactory.cli:main"
 
 [tool.ruff]
 target-version = "py38"
@@ -55,3 +40,19 @@ indent-style = "space"
 docstring-code-format = true
 skip-magic-trailing-comma = false
 line-ending = "auto"
+
+[tool.uv]
+conflicts = [
+    [
+        { extra = "torch-npu" },
+        { extra = "aqlm" },
+    ],
+    [
+        { extra = "torch-npu" },
+        { extra = "liger-kernel" },
+    ],
+    [
+        { extra = "torch-npu" },
+        { extra = "vllm" },
+    ]
+]
diff --git a/src/llamafactory/data/converter.py b/src/llamafactory/data/converter.py
index 2cab9b08..ec456cd1 100644
--- a/src/llamafactory/data/converter.py
+++ b/src/llamafactory/data/converter.py
@@ -36,23 +36,25 @@ class DatasetConverter:
     dataset_attr: "DatasetAttr"
     data_args: "DataArguments"
 
-    def _find_medias(self, inputs: Union[Any, Sequence[Any]]) -> Optional[List[Any]]:
+    def _find_medias(self, medias: Union[Any, Sequence[Any]]) -> Optional[List[Any]]:
         r"""
         Optionally concatenates media path to media dir when loading from local disk.
         """
-        if not isinstance(inputs, list):
-            inputs = [inputs]
-        elif len(inputs) == 0:
+        if not isinstance(medias, list):
+            medias = [medias] if medias is not None else []
+        elif len(medias) == 0:
             return None
         else:
-            inputs = inputs[:]
+            medias = medias[:]
 
-        if self.dataset_attr.load_from in ["script", "file"]:
-            for i in range(len(inputs)):
-                if isinstance(inputs[i], str) and os.path.isfile(os.path.join(self.data_args.media_dir, inputs[i])):
-                    inputs[i] = os.path.join(self.data_args.media_dir, inputs[i])
+        if self.dataset_attr.load_from in ["script", "file"] and isinstance(medias[0], str):
+            for i in range(len(medias)):
+                if os.path.isfile(os.path.join(self.data_args.media_dir, medias[i])):
+                    medias[i] = os.path.join(self.data_args.media_dir, medias[i])
+                else:
+                    logger.warning_rank0_once(f"Media {medias[i]} does not exist in `media_dir`. Use original path.")
 
-        return inputs
+        return medias
 
     @abstractmethod
     def __call__(self, example: Dict[str, Any]) -> Dict[str, Any]: