diff --git a/scripts/cal_lr.py b/scripts/cal_lr.py
index ad6992cb..a103e082 100644
--- a/scripts/cal_lr.py
+++ b/scripts/cal_lr.py
@@ -44,6 +44,7 @@ def calculate_lr(
     template: str = "default",
     cutoff_len: int = 1024,  # i.e. maximum input length during training
     is_mistral: bool = False,  # mistral model uses a smaller learning rate,
+    packing: bool = False,
 ):
     r"""
     Calculates the optimal learning rate for 7B/13B models using LLaMA's hyper-parameters.
@@ -57,6 +58,7 @@ def calculate_lr(
         dataset_dir=dataset_dir,
         template=template,
         cutoff_len=cutoff_len,
+        packing=packing,
         output_dir="dummy_dir",
         overwrite_cache=True,
     )
@@ -69,7 +71,7 @@ def calculate_lr(
     elif stage == "sft":
         data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, label_pad_token_id=IGNORE_INDEX)
     else:
-        raise NotImplementedError
+        raise NotImplementedError("Unsupported stage: {}.".format(stage))
 
     dataloader = DataLoader(trainset, batch_size, shuffle=False, collate_fn=data_collator, pin_memory=True)
     valid_tokens, total_tokens = 0, 0
diff --git a/scripts/cal_ppl.py b/scripts/cal_ppl.py
index fb503629..61f76922 100644
--- a/scripts/cal_ppl.py
+++ b/scripts/cal_ppl.py
@@ -98,7 +98,7 @@ def cal_ppl(
             tokenizer=tokenizer, label_pad_token_id=IGNORE_INDEX, train_on_prompt=train_on_prompt
         )
     else:
-        raise NotImplementedError
+        raise NotImplementedError("Unsupported stage: {}.".format(stage))
 
     dataloader = DataLoader(trainset, batch_size, shuffle=False, collate_fn=data_collator, pin_memory=True)
     criterion = torch.nn.CrossEntropyLoss(reduction="none")
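
A minimal usage sketch for the new packing option (illustrative only, not part of this patch). It assumes calculate_lr keeps its existing keyword arguments such as model_name_or_path, dataset, stage and batch_size; only the packing flag is introduced by this diff, and the concrete values below are placeholders:

# Illustrative sketch: exercise the packing flag added to scripts/cal_lr.py.
# Every argument except packing is assumed to pre-exist; values are placeholders.
from cal_lr import calculate_lr

calculate_lr(
    model_name_or_path="path/to/llama-7b",  # placeholder checkpoint path
    dataset="alpaca_en",                    # placeholder dataset name
    stage="sft",                            # only "pt" and "sft" are handled above
    batch_size=4,
    cutoff_len=1024,
    packing=True,  # new: pack multiple short samples into one cutoff_len sequence
)

With this change, passing an unsupported stage now raises NotImplementedError carrying the stage name in its message instead of a bare exception.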