fix #763

Former-commit-id: 370bdb6e4309db03e26cad311fb13e5cbb1fc1bf
2025-08-06 05:32:50 +08:00 · 2023-09-01 23:13:05 +08:00 · 2023-09-01 23:13:05 +08:00 · 3b5a9c60b6
commit 3b5a9c60b6
parent a4fd976048
2 changed files with 3 additions and 1 deletions
--- a/src/llmtuner/dsets/preprocess.py
+++ b/src/llmtuner/dsets/preprocess.py
@@ -34,6 +34,8 @@ def preprocess_dataset(
        # build grouped texts with format `X1 X2 X3 ...`
        if isinstance(getattr(tokenizer, "tokenizer", None), tiktoken.Encoding):
            kwargs = dict(allowed_special="all") # for tiktoken tokenizer (Qwen)
        else:
            kwargs = dict(add_special_tokens=True)
        if hasattr(tokenizer, "add_bos_token") and hasattr(tokenizer, "add_eos_token"):
            setattr(tokenizer, "add_bos_token", True) # for LLaMA tokenizer