Fix SFT encoding: add special tokens only when encoding the first turn's source

Former-commit-id: 2369a96a3200593421ae9afb06e08e2ac8010bb2
2026-03-01 01:06:00 +08:00 · 2023-07-11 19:50:33 +08:00
parent cc290a41e6
commit 8de7a01887
1 changed files with 1 additions and 1 deletions
--- a/src/utils/common.py
+++ b/src/utils/common.py
@@ -505,7 +505,7 @@ def preprocess_data(
            input_ids, labels = [], []

            for i in range(len(dialog) // 2):
-                source_ids = tokenizer.encode(text=dialog[2*i], add_special_tokens=True)
+                source_ids = tokenizer.encode(text=dialog[2*i], add_special_tokens=(i == 0))
                target_ids = tokenizer.encode(text=dialog[2*i+1], add_special_tokens=False)

                if len(source_ids) > data_args.max_source_length: