From da39715085086c2b5ab487c04a7b31083b783b9e Mon Sep 17 00:00:00 2001
From: d <913015993@qq.com>
Date: Tue, 11 Jun 2024 16:21:48 +0800
Subject: [PATCH 1/2] Fix a bug found through extensive continued pretraining
 and comparison experiments: the tokenizer.eos_token that llama3 uses during
 pretraining is '<|end_of_text|>', so that token must also be appended after
 each data sample here, not '<|eot_id|>'; otherwise it easily causes a severe
 performance drop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Former-commit-id: 6979f3f8480755604d8aea8164f6418126e094c5
---
 src/llamafactory/data/processors/pretrain.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/llamafactory/data/processors/pretrain.py b/src/llamafactory/data/processors/pretrain.py
index 87727b55..4050f74c 100644
--- a/src/llamafactory/data/processors/pretrain.py
+++ b/src/llamafactory/data/processors/pretrain.py
@@ -12,7 +12,8 @@ def preprocess_pretrain_dataset(
     examples: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizer", data_args: "DataArguments"
 ) -> Dict[str, List[List[int]]]:
     # build grouped texts with format `X1 X2 X3 ...` if packing is enabled
-    text_examples = [messages[0]["content"] + tokenizer.eos_token for messages in examples["prompt"]]
+    eos_token = '<|end_of_text|>' if data_args.template == 'llama3' else tokenizer.eos_token
+    text_examples = [messages[0]["content"] + eos_token for messages in examples["prompt"]]
 
     if not data_args.packing:
         if data_args.template == "gemma":

From bf3de9bfe851916a7cfae9c267a2a61b276951f2 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga
Date: Tue, 11 Jun 2024 17:02:14 +0800
Subject: [PATCH 2/2] Update pretrain.py

Former-commit-id: 0c292332374fb96c3fc753abde42d070a0c1dca8
---
 src/llamafactory/data/processors/pretrain.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llamafactory/data/processors/pretrain.py b/src/llamafactory/data/processors/pretrain.py
index 4050f74c..832c987e 100644
--- a/src/llamafactory/data/processors/pretrain.py
+++ b/src/llamafactory/data/processors/pretrain.py
@@ -12,7 +12,7 @@ def preprocess_pretrain_dataset(
     examples: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizer", data_args: "DataArguments"
 ) -> Dict[str, List[List[int]]]:
     # build grouped texts with format `X1 X2 X3 ...` if packing is enabled
-    eos_token = '<|end_of_text|>' if data_args.template == 'llama3' else tokenizer.eos_token
+    eos_token = "<|end_of_text|>" if data_args.template == "llama3" else tokenizer.eos_token
     text_examples = [messages[0]["content"] + eos_token for messages in examples["prompt"]]
 
     if not data_args.packing:
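
For context, a minimal self-contained sketch of the logic this patch series ends up with. The helper name build_text_examples and its flattened arguments are hypothetical, introduced only for illustration; the repository does this inline in preprocess_pretrain_dataset:

from typing import Any, Dict, List

def build_text_examples(
    examples: Dict[str, List[Any]], tokenizer_eos_token: str, template: str
) -> List[str]:
    # Llama 3's chat tokenizer reports "<|eot_id|>" as eos_token, but the base
    # model is pretrained with "<|end_of_text|>" as the document separator, so
    # override it when building pretraining data.
    eos_token = "<|end_of_text|>" if template == "llama3" else tokenizer_eos_token
    return [messages[0]["content"] + eos_token for messages in examples["prompt"]]

# Appending the pretraining EOS keeps packed corpora consistent with how the
# base model saw document boundaries:
assert build_text_examples(
    {"prompt": [[{"content": "hello"}]]}, "<|eot_id|>", "llama3"
) == ["hello<|end_of_text|>"]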