This commit is contained in:
hiyouga
2023-09-01 23:13:05 +08:00
parent a9d1fb72f7
commit 370bdb6e43
2 changed files with 3 additions and 1 deletion

View File

@@ -34,6 +34,8 @@ def preprocess_dataset(
# build grouped texts with format `X1 X2 X3 ...`
if isinstance(getattr(tokenizer, "tokenizer", None), tiktoken.Encoding):
kwargs = dict(allowed_special="all") # for tiktoken tokenizer (Qwen)
else:
kwargs = dict(add_special_tokens=True)
if hasattr(tokenizer, "add_bos_token") and hasattr(tokenizer, "add_eos_token"):
setattr(tokenizer, "add_bos_token", True) # for LLaMA tokenizer