fix scripts

Former-commit-id: eb3e147d198a3ecb02c65f7733cec7cd9d3814a3
hiyouga 2024-12-05 03:47:28 +00:00
parent 9bbeba6323
commit 819f487c8f
4 changed files with 32 additions and 24 deletions

View File

@@ -22,9 +22,9 @@ import fire
 import torch
 from torch.utils.data import DataLoader
 from tqdm import tqdm
-from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq
+from transformers import DataCollatorForLanguageModeling
 
-from llamafactory.data import get_dataset, get_template_and_fix_tokenizer
+from llamafactory.data import get_dataset, get_template_and_fix_tokenizer, MultiModalDataCollatorForSeq2Seq
 from llamafactory.extras.constants import IGNORE_INDEX
 from llamafactory.hparams import get_train_args
 from llamafactory.model import load_tokenizer
@@ -71,7 +71,7 @@ def calculate_lr(
     if stage == "pt":
         data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
     elif stage == "sft":
-        data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, label_pad_token_id=IGNORE_INDEX)
+        data_collator = MultiModalDataCollatorForSeq2Seq(template=template, tokenizer=tokenizer, label_pad_token_id=IGNORE_INDEX)
     else:
         raise NotImplementedError(f"Stage does not supported: {stage}.")
 
@@ -81,14 +81,13 @@ def calculate_lr(
         valid_tokens += torch.sum(batch["labels"] != IGNORE_INDEX).item()
         total_tokens += torch.numel(batch["labels"])
 
-    batch_max_len = cutoff_len * batch_size  # max tokens in a batch
     valid_ratio = valid_tokens / total_tokens
-    batch_valid_len = batch_max_len * valid_ratio
-    lr = BASE_LR * math.sqrt(batch_valid_len / BASE_BS)  # lr ~ sqrt(batch_size)
+    token_batch_size = cutoff_len * batch_size * valid_ratio
+    lr = BASE_LR * math.sqrt(token_batch_size / BASE_BS)  # lr ~ sqrt(batch_size)
     lr = lr / 6.0 if is_mistral_or_gemma else lr
     print(
-        "Optimal learning rate is {:.2e} for valid ratio% {:.2f} and effective batch size {:.2f}".format(
-            lr, valid_ratio * 100, batch_valid_len
+        "Optimal learning rate is {:.2e} for valid ratio% {:.2f} and effective token batch size {:.2f}".format(
+            lr, valid_ratio * 100, token_batch_size
         )
     )
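
For reference, the updated learning-rate rule can be exercised on its own. A minimal standalone sketch, assuming BASE_LR = 3e-4 and BASE_BS = 4_000_000 as illustrative stand-ins for the constants the script defines at module level:

import math

BASE_LR = 3e-4       # assumed base learning rate (stand-in for the script's constant)
BASE_BS = 4_000_000  # assumed base token batch size (stand-in for the script's constant)

def sqrt_scaled_lr(cutoff_len: int, batch_size: int, valid_ratio: float, is_mistral_or_gemma: bool = False) -> float:
    # effective number of non-padding tokens consumed per optimizer step
    token_batch_size = cutoff_len * batch_size * valid_ratio
    lr = BASE_LR * math.sqrt(token_batch_size / BASE_BS)  # lr ~ sqrt(batch size)
    return lr / 6.0 if is_mistral_or_gemma else lr

# e.g. 1024-token cutoff, 8 sequences per step, 65% of label tokens unmasked
print(f"{sqrt_scaled_lr(1024, 8, 0.65):.2e}")

Scaling by the square root of the effective token batch size keeps the update magnitude roughly stable as the batch grows, and multiplying by the valid-token ratio discounts padding and masked prompt tokens.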

View File

@@ -20,16 +20,16 @@ import fire
 import torch
 from torch.utils.data import DataLoader
 from tqdm import tqdm
-from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq
+from transformers import DataCollatorForLanguageModeling
 
-from llamafactory.data import get_dataset, get_template_and_fix_tokenizer
+from llamafactory.data import MultiModalDataCollatorForSeq2Seq, get_dataset, get_template_and_fix_tokenizer
 from llamafactory.extras.constants import IGNORE_INDEX
 from llamafactory.hparams import get_train_args
 from llamafactory.model import load_model, load_tokenizer
 
 
 @dataclass
-class PairwiseDataCollatorWithPadding(DataCollatorForSeq2Seq):
+class PairwiseDataCollatorWithPadding(MultiModalDataCollatorForSeq2Seq):
     r"""
     Data collator for pairwise data.
     """
@@ -39,24 +39,25 @@ class PairwiseDataCollatorWithPadding(DataCollatorForSeq2Seq):
     def __call__(self, features: Sequence[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
         r"""
         Pads batched data to the longest sequence in the batch.
-
-        We generate 2 * n examples where the first n examples represent chosen examples and
-        the last n examples represent rejected examples.
         """
         chosen_features = []
         for feature in features:
-            prompt_len, answer_len = len(feature["prompt_ids"]), len(feature["chosen_ids"])
-            input_ids = feature["prompt_ids"] + feature["chosen_ids"]
-            attention_mask = [1] * (prompt_len + answer_len)
-            labels = input_ids if self.train_on_prompt else [IGNORE_INDEX] * prompt_len + feature["chosen_ids"]
-            chosen_features.append({"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels})
+            chosen_features.append(
+                {
+                    "input_ids": feature["chosen_input_ids"],
+                    "attention_mask": feature["chosen_attention_mask"],
+                    "labels": feature["chosen_input_ids"] if self.train_on_prompt else feature["chosen_labels"],
+                    "images": feature["images"],
+                    "videos": feature["videos"],
+                }
+            )
 
         return super().__call__(chosen_features)
 
 
 def calculate_ppl(
     model_name_or_path: str,
-    save_name: str,
+    save_name: str = "ppl.json",
     batch_size: int = 4,
     stage: Literal["pt", "sft", "rm"] = "sft",
     dataset: str = "alpaca_en_demo",
@@ -68,7 +69,8 @@ def calculate_ppl(
 ):
     r"""
     Calculates the ppl on the dataset of the pre-trained models.
-    Usage: python cal_ppl.py --model_name_or_path path_to_model --dataset alpaca_en_demo --save_name ppl.json
+    Usage: export CUDA_VISIBLE_DEVICES=0
+           python cal_ppl.py --model_name_or_path path_to_model --dataset alpaca_en_demo --save_name ppl.json
     """
     model_args, data_args, training_args, finetuning_args, _ = get_train_args(
         dict(
@@ -93,10 +95,12 @@ def calculate_ppl(
     if stage == "pt":
         data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
     elif stage == "sft":
-        data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, label_pad_token_id=IGNORE_INDEX)
+        data_collator = MultiModalDataCollatorForSeq2Seq(
+            template=template, tokenizer=tokenizer, label_pad_token_id=IGNORE_INDEX
+        )
     elif stage == "rm":
         data_collator = PairwiseDataCollatorWithPadding(
-            tokenizer=tokenizer, label_pad_token_id=IGNORE_INDEX, train_on_prompt=train_on_prompt
+            template=template, tokenizer=tokenizer, label_pad_token_id=IGNORE_INDEX, train_on_prompt=train_on_prompt
         )
     else:
         raise NotImplementedError(f"Stage does not supported: {stage}.")
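
The rewritten __call__ no longer assembles input_ids from prompt_ids and chosen_ids itself; it expects the dataset to provide pre-packed chosen_* fields plus the multimodal slots, and hands padding off to the parent collator. A hedged sketch of the per-example dict it now consumes (key names are taken from the diff above; the toy values are invented):

# Toy per-example dict for the rm stage (values invented for illustration).
example_feature = {
    "chosen_input_ids": [1, 2, 3, 4],        # prompt + chosen response, already tokenized
    "chosen_attention_mask": [1, 1, 1, 1],
    "chosen_labels": [-100, -100, 3, 4],     # prompt positions masked with IGNORE_INDEX (-100)
    "images": [],                            # multimodal slots, empty for text-only data
    "videos": [],
}
# PairwiseDataCollatorWithPadding keeps only the chosen side and delegates padding
# (and any image/video feature handling) to MultiModalDataCollatorForSeq2Seq.__call__.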

View File

@@ -31,7 +31,8 @@ def length_cdf(
 ):
     r"""
     Calculates the distribution of the input lengths in the dataset.
-    Usage: python length_cdf.py --model_name_or_path path_to_model --dataset alpaca_en_demo --template default
+    Usage: export CUDA_VISIBLE_DEVICES=0
+           python length_cdf.py --model_name_or_path path_to_model --dataset alpaca_en_demo --template default
     """
     model_args, data_args, training_args, _, _ = get_train_args(
         dict(

View File

@@ -86,6 +86,10 @@ class MultiModalDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
     template: Optional["Template"] = None
     processor: Optional["ProcessorMixin"] = None
 
+    def __post_init__(self):
+        if self.template is None:
+            raise ValueError("Template is required for MultiModalDataCollator.")
+
     def __call__(self, features: Sequence[Dict[str, Any]]) -> Dict[str, "torch.Tensor"]:
         batch_images, batch_videos, batch_imglens, batch_vidlens, batch_input_ids = [], [], [], [], []
         for feature in features:
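
The new __post_init__ turns a missing template into an immediate, readable error at construction time rather than a failure deep inside __call__, which is why the scripts above now pass template=template explicitly. A self-contained toy mirroring the pattern (the class name and template value are placeholders, not the real collator):

from dataclasses import dataclass
from typing import Optional

@dataclass
class CollatorSketch:
    # Optional so the dataclass can declare a default, but required in practice.
    template: Optional[str] = None

    def __post_init__(self):
        if self.template is None:
            raise ValueError("Template is required for MultiModalDataCollator.")

CollatorSketch(template="default")  # constructs fine
try:
    CollatorSketch()                # fails at construction time, like the updated collator
except ValueError as err:
    print(err)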