mirror of
https://github.com/hiyouga/LLaMA-Factory.git
synced 2025-10-14 23:58:11 +08:00
fix qwen tokenizer #361
Former-commit-id: 78a2fa95c8ab669254a6c8fce8138c4395fb0a09
This commit is contained in:
parent
fdbb2c5378
commit
a70d56864e
@@ -98,12 +98,16 @@ class Template:
|
||||
r"""
|
||||
Converts context to token ids.
|
||||
"""
|
||||
if hasattr(tokenizer, "tokenizer"): # for tiktoken tokenizer (Qwen)
|
||||
kwargs = dict(allowed_special="all")
|
||||
else:
|
||||
kwargs = dict(add_special_tokens=False)
|
||||
|
||||
token_ids = []
|
||||
for elem in context:
|
||||
if isinstance(elem, str):
|
||||
elem = elem.replace("{{query}}", query, 1)
|
||||
elem = elem.replace("<mask>", "[MASK]")
|
||||
token_ids = token_ids + tokenizer.encode(elem, add_special_tokens=False)
|
||||
token_ids = token_ids + tokenizer.encode(elem, **kwargs)
|
||||
elif isinstance(elem, dict):
|
||||
token_ids = token_ids + [tokenizer.convert_tokens_to_ids(elem.get("token"))]
|
||||
else:
|
||||
|
Loading…
x
Reference in New Issue
Block a user