From da39715085086c2b5ab487c04a7b31083b783b9e Mon Sep 17 00:00:00 2001 From: d <913015993@qq.com> Date: Tue, 11 Jun 2024 16:21:48 +0800 Subject: [PATCH] =?UTF-8?q?=E7=BB=8F=E8=BF=87=E5=A4=A7=E9=87=8F=E7=9A=84?= =?UTF-8?q?=E5=A2=9E=E9=87=8F=E9=A2=84=E8=AE=AD=E7=BB=83=EF=BC=8C=E8=BF=9B?= =?UTF-8?q?=E8=A1=8C=E5=AF=B9=E6=AF=94=E8=AF=95=E9=AA=8C=EF=BC=8C=E5=8F=91?= =?UTF-8?q?=E7=8E=B0=E8=BF=99=E4=B8=AAbug=EF=BC=9Allama3=E5=9C=A8=E9=A2=84?= =?UTF-8?q?=E8=AE=AD=E7=BB=83=E6=97=B6=E4=BD=BF=E7=94=A8=E7=9A=84tokenizer?= =?UTF-8?q?.eos=5Ftoke=E6=98=AF'<|end=5Fof=5Ftext|>'=20=EF=BC=8C=E8=BF=99?= =?UTF-8?q?=E9=87=8C=E5=9C=A8=E6=AF=8F=E6=9D=A1=E6=95=B0=E6=8D=AE=E5=90=8E?= =?UTF-8?q?=E9=9D=A2=E4=B9=9F=E5=BE=97=E7=94=A8=E8=BF=99=E4=B8=AA=EF=BC=8C?= =?UTF-8?q?=E8=80=8C=E4=B8=8D=E6=98=AF'<|eot=5Fid|>'=EF=BC=8C=E5=90=A6?= =?UTF-8?q?=E5=88=99=E5=BE=88=E5=AE=B9=E6=98=93=E5=AF=BC=E8=87=B4=E4=B8=A5?= =?UTF-8?q?=E9=87=8D=E7=9A=84=E6=80=A7=E8=83=BD=E4=B8=8B=E9=99=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Former-commit-id: 6979f3f8480755604d8aea8164f6418126e094c5 --- src/llamafactory/data/processors/pretrain.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/llamafactory/data/processors/pretrain.py b/src/llamafactory/data/processors/pretrain.py index 87727b55..4050f74c 100644 --- a/src/llamafactory/data/processors/pretrain.py +++ b/src/llamafactory/data/processors/pretrain.py @@ -12,7 +12,8 @@ def preprocess_pretrain_dataset( examples: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizer", data_args: "DataArguments" ) -> Dict[str, List[List[int]]]: # build grouped texts with format `X1 X2 X3 ...` if packing is enabled - text_examples = [messages[0]["content"] + tokenizer.eos_token for messages in examples["prompt"]] + eos_token = '<|end_of_text|>' if data_args.template == 'llama3' else tokenizer.eos_token + text_examples = [messages[0]["content"] + eos_token for messages in examples["prompt"]] if not data_args.packing: if data_args.template == "gemma":