From 39955c28ff247d50074c93e919bd8fc3014e83b0 Mon Sep 17 00:00:00 2001 From: hiyouga Date: Sat, 5 Aug 2023 17:06:05 +0800 Subject: [PATCH] fix qwen tokenizer #361 Former-commit-id: 7f18d2a3359bcaab0f208f8c4a4bb13b6638072b --- src/llmtuner/extras/template.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/llmtuner/extras/template.py b/src/llmtuner/extras/template.py index 066f6c79..a6193744 100644 --- a/src/llmtuner/extras/template.py +++ b/src/llmtuner/extras/template.py @@ -98,12 +98,16 @@ class Template: r""" Converts context to token ids. """ + if hasattr(tokenizer, "tokenizer"): # for tiktoken tokenizer (Qwen) + kwargs = dict(allowed_special="all") + else: + kwargs = dict(add_special_tokens=False) + token_ids = [] for elem in context: if isinstance(elem, str): elem = elem.replace("{{query}}", query, 1) - elem = elem.replace("", "[MASK]") - token_ids = token_ids + tokenizer.encode(elem, add_special_tokens=False) + token_ids = token_ids + tokenizer.encode(elem, **kwargs) elif isinstance(elem, dict): token_ids = token_ids + [tokenizer.convert_tokens_to_ids(elem.get("token"))] else: