[data] improve error handling (#6128)

* sync from upstream

* update

* update

* fix

---------

Co-authored-by: hiyouga <hiyouga@buaa.edu.cn>
Former-commit-id: 4c7bfebcf1ed90800f5b0de4cf67b3036cb9dc13
This commit is contained in:
Noah 2025-02-13 01:39:41 +08:00 committed by GitHub
parent 9b852ebe25
commit 1adb46875f
2 changed files with 29 additions and 26 deletions

View File

@@ -9,22 +9,7 @@ requires-python = ">=3.10"
[project.scripts]
llamafactory-cli = "llamafactory.cli:main"
[tool.uv]
conflicts = [
[
{extra = "aqlm" },
{extra = "torch-npu" },
],
[
{extra = "torch-npu" },
{extra = "liger-kernel" },
],
[
{extra = "vllm" },
{extra = "torch-npu" },
]
]
lmf = "llamafactory.cli:main"
[tool.ruff]
target-version = "py38"
@@ -55,3 +40,19 @@ indent-style = "space"
docstring-code-format = true
skip-magic-trailing-comma = false
line-ending = "auto"
[tool.uv]
conflicts = [
[
{ extra = "torch-npu" },
{ extra = "aqlm" },
],
[
{ extra = "torch-npu" },
{ extra = "liger-kernel" },
],
[
{ extra = "torch-npu" },
{ extra = "vllm" },
]
]

View File

@@ -36,23 +36,25 @@ class DatasetConverter:
dataset_attr: "DatasetAttr"
data_args: "DataArguments"
def _find_medias(self, inputs: Union[Any, Sequence[Any]]) -> Optional[List[Any]]:
def _find_medias(self, medias: Union[Any, Sequence[Any]]) -> Optional[List[Any]]:
r"""
Optionally concatenates media path to media dir when loading from local disk.
"""
if not isinstance(inputs, list):
inputs = [inputs]
elif len(inputs) == 0:
if not isinstance(medias, list):
medias = [medias] if medias is not None else []
elif len(medias) == 0:
return None
else:
inputs = inputs[:]
medias = medias[:]
if self.dataset_attr.load_from in ["script", "file"]:
for i in range(len(inputs)):
if isinstance(inputs[i], str) and os.path.isfile(os.path.join(self.data_args.media_dir, inputs[i])):
inputs[i] = os.path.join(self.data_args.media_dir, inputs[i])
if self.dataset_attr.load_from in ["script", "file"] and isinstance(medias[0], str):
for i in range(len(medias)):
if os.path.isfile(os.path.join(self.data_args.media_dir, medias[i])):
medias[i] = os.path.join(self.data_args.media_dir, medias[i])
else:
logger.warning_rank0_once(f"Media {medias[i]} does not exist in `media_dir`. Use original path.")
return inputs
return medias
@abstractmethod
def __call__(self, example: Dict[str, Any]) -> Dict[str, Any]: