diff --git a/src/llmtuner/data/preprocess.py b/src/llmtuner/data/preprocess.py index c2b87860..7fb0a9b6 100644 --- a/src/llmtuner/data/preprocess.py +++ b/src/llmtuner/data/preprocess.py @@ -37,6 +37,10 @@ def preprocess_pretrain_dataset( k: [t[i : i + block_size] for i in range(0, total_length, block_size)] for k, t in concatenated_examples.items() } + if data_args.template == "gemma": + for i in range(len(result["input_ids"])): + result["input_ids"][i][0] = tokenizer.bos_token_id + return result diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py index 03d93cc2..12ba8b23 100644 --- a/src/llmtuner/extras/constants.py +++ b/src/llmtuner/extras/constants.py @@ -503,7 +503,7 @@ register_model_group( }, "OLMo-7B-Chat": { DownloadSource.DEFAULT: "allenai/OLMo-7B-Instruct", - } + }, }, module="att_proj", template="olmo",