mirror of
https://github.com/hiyouga/LLaMA-Factory.git
synced 2025-08-06 05:32:50 +08:00
parent
a4fd976048
commit
3b5a9c60b6
@ -34,6 +34,8 @@ def preprocess_dataset(
|
|||||||
# build grouped texts with format `X1 X2 X3 ...`
|
# build grouped texts with format `X1 X2 X3 ...`
|
||||||
if isinstance(getattr(tokenizer, "tokenizer", None), tiktoken.Encoding):
|
if isinstance(getattr(tokenizer, "tokenizer", None), tiktoken.Encoding):
|
||||||
kwargs = dict(allowed_special="all") # for tiktoken tokenizer (Qwen)
|
kwargs = dict(allowed_special="all") # for tiktoken tokenizer (Qwen)
|
||||||
|
else:
|
||||||
|
kwargs = dict(add_special_tokens=True)
|
||||||
|
|
||||||
if hasattr(tokenizer, "add_bos_token") and hasattr(tokenizer, "add_eos_token"):
|
if hasattr(tokenizer, "add_bos_token") and hasattr(tokenizer, "add_eos_token"):
|
||||||
setattr(tokenizer, "add_bos_token", True) # for LLaMA tokenizer
|
setattr(tokenizer, "add_bos_token", True) # for LLaMA tokenizer
|
||||||
|
Loading…
x
Reference in New Issue
Block a user