From fe4f4e9758d0222e5bac43ec3eba921532f6547d Mon Sep 17 00:00:00 2001
From: HJ <92386084+JJJYmmm@users.noreply.github.com>
Date: Mon, 10 Feb 2025 21:59:12 +0800
Subject: [PATCH] [data] fix: sharegpt converter (#6879)

* fix-sharegpt-format

* fix

---------

Co-authored-by: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Former-commit-id: 0fb44cb3a5499c8da79e73004adc9d16f792b4b3
---
 src/llamafactory/data/aligner.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/llamafactory/data/aligner.py b/src/llamafactory/data/aligner.py
index f634f21e..b71964b0 100644
--- a/src/llamafactory/data/aligner.py
+++ b/src/llamafactory/data/aligner.py
@@ -145,6 +145,7 @@ def convert_sharegpt(
         if message[dataset_attr.role_tag] not in accept_tags[turn_idx % 2]:
             logger.warning_rank0(f"Invalid role tag in {messages}.")
             broken_data = True
+            break
 
         aligned_messages.append(
             {"role": tag_mapping[message[dataset_attr.role_tag]], "content": message[dataset_attr.content_tag]}
@@ -156,7 +157,10 @@ def convert_sharegpt(
         logger.warning_rank0(f"Invalid message count in {messages}.")
         broken_data = True
 
-    if dataset_attr.kto_tag and isinstance(example[dataset_attr.kto_tag], bool):  # kto example
+    if broken_data:
+        logger.warning_rank0("Skipping this abnormal example.")
+        prompt, response = [], []
+    elif dataset_attr.kto_tag and isinstance(example[dataset_attr.kto_tag], bool):  # kto example
         prompt = aligned_messages[:-1]
         response = aligned_messages[-1:]
         if example[dataset_attr.kto_tag]:
@@ -186,10 +190,6 @@ def convert_sharegpt(
         prompt = aligned_messages[:-1]
         response = aligned_messages[-1:]
 
-    if broken_data:
-        logger.warning_rank0("Skipping this abnormal example.")
-        prompt, response = [], []
-
     regularize_medias = partial(_regularize_medias, dataset_attr=dataset_attr, data_args=data_args)
     output = {
         "_prompt": prompt,