Fix tokenizer config being changed after pretraining

Changing a tokenizer attribute during the preprocessing stage results in a wrong tokenizer being saved, for example with Baichuan2. Former-commit-id: 19942b5314b84267691f0a5657d0679f2ddbe58b
2025-11-28 19:24:20 +08:00 · 2023-11-08 15:50:46 +08:00 · 2023-11-08 15:50:46 +08:00 · 09a1681b63
commit 09a1681b63
parent f5ba2190fb
1 changed files with 4 additions and 0 deletions
--- a/src/llmtuner/dsets/preprocess.py
+++ b/src/llmtuner/dsets/preprocess.py
@@ -47,9 +47,13 @@ def preprocess_dataset(
            kwargs = dict(add_special_tokens=True)

        if hasattr(tokenizer, "add_eos_token"): # for LLaMA tokenizer
+            add_eos_token_flag = getattr(tokenizer, "add_eos_token")
            setattr(tokenizer, "add_eos_token", True)

        tokenized_examples = tokenizer(examples["prompt"], **kwargs)
+        # Make sure the saved tokenizer is the same as the original
+        if hasattr(tokenizer, "add_eos_token"):  # for Baichuan2 tokenizer
+            setattr(tokenizer, "add_eos_token", add_eos_token_flag)
        concatenated_examples = {k: list(chain(*tokenized_examples[k])) for k in tokenized_examples.keys()}
        total_length = len(concatenated_examples[list(concatenated_examples.keys())[0]])
        block_size = data_args.cutoff_len