From e6ab1a57ea00d64e71b4f503ee1171c0e47c6630 Mon Sep 17 00:00:00 2001 From: hiyouga Date: Tue, 12 Mar 2024 21:21:54 +0800 Subject: [PATCH] patch for gemma cpt Former-commit-id: fc0b19c62f52a90d78b63761dda3d8970a42f2da --- src/llmtuner/data/preprocess.py | 4 ++++ src/llmtuner/extras/constants.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/llmtuner/data/preprocess.py b/src/llmtuner/data/preprocess.py index c2b87860..7fb0a9b6 100644 --- a/src/llmtuner/data/preprocess.py +++ b/src/llmtuner/data/preprocess.py @@ -37,6 +37,10 @@ def preprocess_pretrain_dataset( k: [t[i : i + block_size] for i in range(0, total_length, block_size)] for k, t in concatenated_examples.items() } + if data_args.template == "gemma": + for i in range(len(result["input_ids"])): + result["input_ids"][i][0] = tokenizer.bos_token_id + return result diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py index 03d93cc2..12ba8b23 100644 --- a/src/llmtuner/extras/constants.py +++ b/src/llmtuner/extras/constants.py @@ -503,7 +503,7 @@ register_model_group( }, "OLMo-7B-Chat": { DownloadSource.DEFAULT: "allenai/OLMo-7B-Instruct", - } + }, }, module="att_proj", template="olmo",