From a78cd5c5ad7a208c5581e222e1b05e8684db0de0 Mon Sep 17 00:00:00 2001
From: hiyouga
Date: Fri, 5 Jan 2024 16:14:56 +0800
Subject: [PATCH] fix qwen template

Former-commit-id: ed216bbc4693860eaf27c2a95108f705d530cbdb
---
 src/llmtuner/data/template.py | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index ee7c0da1..a53abc41 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -1,5 +1,4 @@
 import tiktoken
-from copy import deepcopy
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 
@@ -224,12 +223,13 @@ def get_template_and_fix_tokenizer(
     template = templates.get(name, None)
     assert template is not None, "Template {} does not exist.".format(name)
 
-    stop_words = deepcopy(template.stop_words)
+    stop_words = template.stop_words
     if template.replace_eos:
         if not stop_words:
             raise ValueError("Stop words are required to replace the EOS token.")
 
-        tokenizer.eos_token = stop_words.pop(0)
+        tokenizer.eos_token = stop_words[0]
+        stop_words = stop_words[1:]
         logger.info("Replace eos token: {}".format(tokenizer.eos_token))
 
     if stop_words:
@@ -601,26 +601,19 @@ register_template(
 register_template(
     name="qwen",
     prefix=[
-        {"token": "<|im_start|>"},
-        "system\n{{system}}"
+        "<|im_start|>system\n{{system}}<|im_end|>"
     ],
     prompt=[
-        {"token": "<|im_start|>"},
-        "user\n{{query}}",
-        {"token": "<|im_end|>"},
-        "\n",
-        {"token": "<|im_start|>"},
-        "assistant\n"
+        "<|im_start|>user\n{{query}}<|im_end|>\n<|im_start|>assistant\n"
     ],
     system="You are a helpful assistant.",
     sep=[
-        {"token": "<|im_end|>"},
         "\n"
     ],
     stop_words=[
         "<|im_end|>"
     ],
-    efficient_eos=True
+    replace_eos=True
 )
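
Below is a minimal sketch (not part of the patch) of why dropping the deepcopy import is safe with the new stop-word handling: the code now reads the first stop word and re-slices the list instead of popping in place, so template.stop_words is never mutated. FakeTemplate and FakeTokenizer are hypothetical stand-ins for the real Template dataclass and the Hugging Face tokenizer.

from dataclasses import dataclass, field
from typing import List

@dataclass
class FakeTemplate:
    # Stand-in for the real Template dataclass fields used here.
    stop_words: List[str] = field(default_factory=lambda: ["<|im_end|>"])
    replace_eos: bool = True

@dataclass
class FakeTokenizer:
    # Stand-in for a tokenizer with a writable eos_token attribute.
    eos_token: str = "</s>"

template = FakeTemplate()
tokenizer = FakeTokenizer()

# Logic from the patch: take the first stop word as the new EOS token,
# then re-slice rather than pop, leaving template.stop_words untouched.
stop_words = template.stop_words
if template.replace_eos:
    tokenizer.eos_token = stop_words[0]
    stop_words = stop_words[1:]

assert tokenizer.eos_token == "<|im_end|>"
assert template.stop_words == ["<|im_end|>"]  # original list is unchanged
assert stop_words == []  # no extra stop words left to register

With the previous pop(0) approach a defensive copy was required; slicing returns a new list, so the copy (and the deepcopy import) can go away.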