improve fix tokenizer

This commit is contained in:
hiyouga
2024-02-09 14:53:14 +08:00
parent d0daaa01f9
commit 54ea9684ed
7 changed files with 105 additions and 71 deletions

View File

@@ -142,7 +142,7 @@ def get_dataset(
stage: Literal["pt", "sft", "rm", "ppo"],
# split: Optional[str] = "train", # TODO: add split
) -> Union["Dataset", "IterableDataset"]:
- template = get_template_and_fix_tokenizer(data_args.template, tokenizer)
+ template = get_template_and_fix_tokenizer(tokenizer, data_args.template)
if data_args.train_on_prompt and template.efficient_eos:
raise ValueError("Current template does not support `train_on_prompt`.")