From bf3de9bfe851916a7cfae9c267a2a61b276951f2 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga
Date: Tue, 11 Jun 2024 17:02:14 +0800
Subject: [PATCH] Update pretrain.py

Former-commit-id: 0c292332374fb96c3fc753abde42d070a0c1dca8
---
 src/llamafactory/data/processors/pretrain.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llamafactory/data/processors/pretrain.py b/src/llamafactory/data/processors/pretrain.py
index 4050f74c..832c987e 100644
--- a/src/llamafactory/data/processors/pretrain.py
+++ b/src/llamafactory/data/processors/pretrain.py
@@ -12,7 +12,7 @@ def preprocess_pretrain_dataset(
     examples: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizer", data_args: "DataArguments"
 ) -> Dict[str, List[List[int]]]:
     # build grouped texts with format `X1 X2 X3 ...` if packing is enabled
-    eos_token = '<|end_of_text|>' if data_args.template == 'llama3' else tokenizer.eos_token
+    eos_token = "<|end_of_text|>" if data_args.template == "llama3" else tokenizer.eos_token
     text_examples = [messages[0]["content"] + eos_token for messages in examples["prompt"]]
     if not data_args.packing:
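
The change above only normalizes quote style; the underlying logic is to pick "<|end_of_text|>" as the document terminator when the llama3 template is used and fall back to the tokenizer's own eos_token otherwise, then append it to each prompt text. Below is a minimal, self-contained sketch of that behavior under stated assumptions: SimpleDataArgs and SimpleTokenizer are hypothetical stand-ins for LLaMA-Factory's DataArguments and a Hugging Face tokenizer, not the project's actual classes.

```python
# Sketch only: mirrors the eos_token selection and text_examples construction
# shown in the hunk, using stand-in types instead of the real library classes.
from dataclasses import dataclass
from typing import Any, Dict, List


@dataclass
class SimpleDataArgs:  # hypothetical stand-in for DataArguments
    template: str
    packing: bool = False


@dataclass
class SimpleTokenizer:  # hypothetical stand-in for a Hugging Face tokenizer
    eos_token: str = "</s>"


def build_text_examples(
    examples: Dict[str, List[Any]], tokenizer: SimpleTokenizer, data_args: SimpleDataArgs
) -> List[str]:
    # Llama 3 terminates documents with "<|end_of_text|>", so that token is
    # hard-coded for the llama3 template; other templates use the tokenizer's
    # configured eos_token (the logic of the "+" line in the diff).
    eos_token = "<|end_of_text|>" if data_args.template == "llama3" else tokenizer.eos_token
    return [messages[0]["content"] + eos_token for messages in examples["prompt"]]


if __name__ == "__main__":
    examples = {"prompt": [[{"content": "hello world"}], [{"content": "foo bar"}]]}
    print(build_text_examples(examples, SimpleTokenizer(), SimpleDataArgs(template="llama3")))
    # ['hello world<|end_of_text|>', 'foo bar<|end_of_text|>']
```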