mirror of
https://github.com/hiyouga/LLaMA-Factory.git
synced 2025-08-23 06:12:50 +08:00
Update pretrain.py
Former-commit-id: 0c292332374fb96c3fc753abde42d070a0c1dca8
This commit is contained in:
parent
da39715085
commit
bf3de9bfe8
@@ -12,7 +12,7 @@ def preprocess_pretrain_dataset(
|
||||
examples: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizer", data_args: "DataArguments"
|
||||
) -> Dict[str, List[List[int]]]:
|
||||
# build grouped texts with format `X1 X2 X3 ...` if packing is enabled
|
||||
-eos_token = '<|end_of_text|>' if data_args.template == 'llama3' else tokenizer.eos_token
|
||||
+eos_token = "<|end_of_text|>" if data_args.template == "llama3" else tokenizer.eos_token
|
||||
text_examples = [messages[0]["content"] + eos_token for messages in examples["prompt"]]
|
||||
|
||||
if not data_args.packing:
|
||||
|
Loading…
x
Reference in New Issue
Block a user