From 096c31bfb6eea04b6b3c0054cde262511f008e31 Mon Sep 17 00:00:00 2001
From: hiyouga
Date: Tue, 12 Mar 2024 21:21:54 +0800
Subject: [PATCH] patch for gemma cpt

Former-commit-id: 70a3052dd8a2d1322fa01ab19e369e465842d416
---
 src/llmtuner/data/preprocess.py  | 4 ++++
 src/llmtuner/extras/constants.py | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/llmtuner/data/preprocess.py b/src/llmtuner/data/preprocess.py
index c2b87860..7fb0a9b6 100644
--- a/src/llmtuner/data/preprocess.py
+++ b/src/llmtuner/data/preprocess.py
@@ -37,6 +37,10 @@ def preprocess_pretrain_dataset(
         k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
         for k, t in concatenated_examples.items()
     }
+    if data_args.template == "gemma":
+        for i in range(len(result["input_ids"])):
+            result["input_ids"][i][0] = tokenizer.bos_token_id
+
     return result
 
 
diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index 03d93cc2..12ba8b23 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -503,7 +503,7 @@ register_model_group(
         },
         "OLMo-7B-Chat": {
             DownloadSource.DEFAULT: "allenai/OLMo-7B-Instruct",
-        }
+        },
     },
     module="att_proj",
     template="olmo",