Fix BOS and EOS token handling in dataset preprocessing

2025-12-15 03:10:35 +08:00 · 2023-08-04 23:55:57 +08:00
parent 8172ad1b5e
commit d87c8fd8ab
2 changed files with 14 additions and 9 deletions
--- a/src/llmtuner/dsets/preprocess.py
+++ b/src/llmtuner/dsets/preprocess.py
@@ -30,7 +30,7 @@ def preprocess_dataset(
            yield query, response, history, prefix

    def preprocess_pretrain_dataset(examples: Dict[str, List[Any]]) -> Dict[str, Any]:
-        # build grouped texts with format `<bos> X1 X2 X3 ...` (without <eos>)
+        # build grouped texts with format `X1 X2 X3 ...` (without <eos>)
        tokenized_examples = tokenizer(examples["prompt"], add_special_tokens=False)
        concatenated_examples = {k: list(chain(*tokenized_examples[k])) for k in tokenized_examples.keys()}
        total_length = len(concatenated_examples[list(concatenated_examples.keys())[0]])
@@ -55,17 +55,17 @@ def preprocess_dataset(
        for query, response, history, prefix in construct_example(examples):
            input_ids, labels = [], []

-            for source_ids, target_ids in template.get_dialog(tokenizer, query, response, history, prefix): # TODO: fix bos
+            for source_ids, target_ids in template.get_dialog(tokenizer, query, response, history, prefix):
                if len(source_ids) > data_args.max_source_length:
                    source_ids = source_ids[:data_args.max_source_length]
-                if len(target_ids) > data_args.max_target_length - 1: # eos token
-                    target_ids = target_ids[:data_args.max_target_length - 1]
+                if len(target_ids) > data_args.max_target_length:
+                    target_ids = target_ids[:data_args.max_target_length]

-                if len(input_ids) + len(source_ids) + len(target_ids) + 1 > max_length:
+                if len(input_ids) + len(source_ids) + len(target_ids) > max_length:
                    break

-                input_ids += source_ids + target_ids + [tokenizer.eos_token_id]
-                labels += [IGNORE_INDEX] * len(source_ids) + target_ids + [tokenizer.eos_token_id]
+                input_ids += source_ids + target_ids
+                labels += [IGNORE_INDEX] * len(source_ids) + target_ids

            model_inputs["input_ids"].append(input_ids)
            model_inputs["attention_mask"].append([1] * len(input_ids))