Former-commit-id: 338b8664edea5ae65192ac657bb013581245ae15
This commit is contained in:
hiyouga 2023-09-21 19:51:02 +08:00
parent d04585df59
commit 4581d09fa6
11 changed files with 116 additions and 101 deletions

View File

@ -44,10 +44,10 @@ def preprocess_dataset(
tokenized_examples = tokenizer(examples["prompt"], **kwargs) tokenized_examples = tokenizer(examples["prompt"], **kwargs)
concatenated_examples = {k: list(chain(*tokenized_examples[k])) for k in tokenized_examples.keys()} concatenated_examples = {k: list(chain(*tokenized_examples[k])) for k in tokenized_examples.keys()}
total_length = len(concatenated_examples[list(concatenated_examples.keys())[0]]) total_length = len(concatenated_examples[list(concatenated_examples.keys())[0]])
block_size = data_args.max_source_length block_size = data_args.cutoff_len
# we drop the small remainder, and if the total_length < block_size, we exclude this batch # we drop the small remainder, and if the total_length < block_size, we exclude this batch
total_length = (total_length // block_size) * block_size total_length = (total_length // block_size) * block_size
# split by chunks of max_source_length # split by chunks of cutoff_len
result = { result = {
k: [t[i: i + block_size] for i in range(0, total_length, block_size)] k: [t[i: i + block_size] for i in range(0, total_length, block_size)]
for k, t in concatenated_examples.items() for k, t in concatenated_examples.items()
@ -58,7 +58,6 @@ def preprocess_dataset(
# build inputs with format `<bos> X Y <eos>` and labels with format `<ignore> ... <ignore> Y <eos>` # build inputs with format `<bos> X Y <eos>` and labels with format `<ignore> ... <ignore> Y <eos>`
# for multiturn examples, we only mask the prompt part in each prompt-response pair. # for multiturn examples, we only mask the prompt part in each prompt-response pair.
model_inputs = {"input_ids": [], "attention_mask": [], "labels": []} model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
max_length = data_args.max_source_length + data_args.max_target_length
for query, response, history, system in construct_example(examples): for query, response, history, system in construct_example(examples):
input_ids, labels = [], [] input_ids, labels = [], []
@ -66,13 +65,14 @@ def preprocess_dataset(
for turn_idx, (source_ids, target_ids) in enumerate(template.encode_multiturn( for turn_idx, (source_ids, target_ids) in enumerate(template.encode_multiturn(
tokenizer, query, response, history, system tokenizer, query, response, history, system
)): )):
if len(source_ids) > data_args.max_source_length: total_len = len(source_ids) + len(target_ids)
source_ids = source_ids[:data_args.max_source_length] max_source_len = int(data_args.cutoff_len * (len(source_ids) / total_len))
if len(target_ids) > data_args.max_target_length: max_target_len = int(data_args.cutoff_len * (len(target_ids) / total_len))
target_ids = target_ids[:data_args.max_target_length]
if len(input_ids) + len(source_ids) + len(target_ids) > max_length: if len(source_ids) > max_source_len:
break source_ids = source_ids[:max_source_len]
if len(target_ids) > max_target_len:
target_ids = target_ids[:max_target_len]
if turn_idx != 0 and template.efficient_eos: if turn_idx != 0 and template.efficient_eos:
source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (len(source_ids) - 1) source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (len(source_ids) - 1)
@ -86,6 +86,10 @@ def preprocess_dataset(
input_ids += [tokenizer.eos_token_id] input_ids += [tokenizer.eos_token_id]
labels += [tokenizer.eos_token_id] labels += [tokenizer.eos_token_id]
if len(input_ids) > data_args.cutoff_len:
input_ids = input_ids[:data_args.cutoff_len]
labels = labels[:data_args.cutoff_len]
model_inputs["input_ids"].append(input_ids) model_inputs["input_ids"].append(input_ids)
model_inputs["attention_mask"].append([1] * len(input_ids)) model_inputs["attention_mask"].append([1] * len(input_ids))
model_inputs["labels"].append(labels) model_inputs["labels"].append(labels)
@ -97,19 +101,19 @@ def preprocess_dataset(
model_inputs = {"input_ids": [], "attention_mask": [], "labels": []} model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
for query, response, history, system in construct_example(examples): for query, response, history, system in construct_example(examples):
source_ids, target_ids = template.encode_oneturn(tokenizer, query, response, history, system) input_ids, labels = template.encode_oneturn(tokenizer, query, response, history, system)
if len(source_ids) > data_args.max_source_length:
source_ids = source_ids[:data_args.max_source_length]
if len(target_ids) > data_args.max_target_length:
target_ids = target_ids[:data_args.max_target_length]
if template.efficient_eos: if template.efficient_eos:
target_ids += [tokenizer.eos_token_id] labels += [tokenizer.eos_token_id]
model_inputs["input_ids"].append(source_ids) if len(input_ids) > data_args.cutoff_len:
model_inputs["attention_mask"].append([1] * len(source_ids)) input_ids = input_ids[:data_args.cutoff_len]
model_inputs["labels"].append(target_ids) if len(labels) > data_args.cutoff_len:
labels = labels[:data_args.cutoff_len]
model_inputs["input_ids"].append(input_ids)
model_inputs["attention_mask"].append([1] * len(input_ids))
model_inputs["labels"].append(labels)
return model_inputs return model_inputs
@ -120,17 +124,21 @@ def preprocess_dataset(
prompt_ids, chosen_ids = template.encode_oneturn(tokenizer, query, response[0], history, system) prompt_ids, chosen_ids = template.encode_oneturn(tokenizer, query, response[0], history, system)
_, rejected_ids = template.encode_oneturn(tokenizer, query, response[1], history, system) _, rejected_ids = template.encode_oneturn(tokenizer, query, response[1], history, system)
if len(prompt_ids) > data_args.max_source_length:
prompt_ids = prompt_ids[:data_args.max_source_length]
if len(chosen_ids) > data_args.max_target_length:
chosen_ids = chosen_ids[:data_args.max_target_length]
if len(rejected_ids) > data_args.max_target_length:
rejected_ids = rejected_ids[:data_args.max_target_length]
if template.efficient_eos: if template.efficient_eos:
chosen_ids += [tokenizer.eos_token_id] chosen_ids += [tokenizer.eos_token_id]
rejected_ids += [tokenizer.eos_token_id] rejected_ids += [tokenizer.eos_token_id]
total_len = len(prompt_ids) + max(len(chosen_ids), len(rejected_ids))
max_source_len = int(data_args.cutoff_len * (len(prompt_ids) / total_len))
max_target_len = int(data_args.cutoff_len * (max(len(chosen_ids), len(rejected_ids)) / total_len))
if len(prompt_ids) > max_source_len:
prompt_ids = prompt_ids[:max_source_len]
if len(chosen_ids) > max_target_len:
chosen_ids = chosen_ids[:max_target_len]
if len(rejected_ids) > max_target_len:
rejected_ids = rejected_ids[:max_target_len]
model_inputs["prompt_ids"].append(prompt_ids) model_inputs["prompt_ids"].append(prompt_ids)
model_inputs["chosen_ids"].append(chosen_ids) model_inputs["chosen_ids"].append(chosen_ids)
model_inputs["rejected_ids"].append(rejected_ids) model_inputs["rejected_ids"].append(rejected_ids)

View File

@ -42,12 +42,16 @@ class DataArguments:
default="train", default="train",
metadata={"help": "Which dataset split to use for training and evaluation."} metadata={"help": "Which dataset split to use for training and evaluation."}
) )
cutoff_len: Optional[int] = field(
default=1024,
metadata={"help": "The maximum length of the model inputs after tokenization."}
)
streaming: Optional[bool] = field( streaming: Optional[bool] = field(
default=False, default=False,
metadata={"help": "Enable streaming mode."} metadata={"help": "Enable streaming mode."}
) )
buffer_size: Optional[int] = field( buffer_size: Optional[int] = field(
default=1024, default=16384,
metadata={"help": "Size of the buffer to randomly sample examples from in streaming mode."} metadata={"help": "Size of the buffer to randomly sample examples from in streaming mode."}
) )
mix_strategy: Optional[Literal["concat", "interleave_under", "interleave_over"]] = field( mix_strategy: Optional[Literal["concat", "interleave_under", "interleave_over"]] = field(
@ -66,14 +70,6 @@ class DataArguments:
default=None, default=None,
metadata={"help": "The number of processes to use for the preprocessing."} metadata={"help": "The number of processes to use for the preprocessing."}
) )
max_source_length: Optional[int] = field(
default=512,
metadata={"help": "The maximum total input sequence length after tokenization."}
)
max_target_length: Optional[int] = field(
default=512,
metadata={"help": "The maximum total output sequence length after tokenization."}
)
max_samples: Optional[int] = field( max_samples: Optional[int] = field(
default=None, default=None,
metadata={"help": "For debugging purposes, truncate the number of examples for each dataset."} metadata={"help": "For debugging purposes, truncate the number of examples for each dataset."}

View File

@ -63,18 +63,10 @@ class ModelArguments:
default=None, default=None,
metadata={"help": "Auth token to log in with Hugging Face Hub."} metadata={"help": "Auth token to log in with Hugging Face Hub."}
) )
compute_dtype: Optional[torch.dtype] = field(
default=None,
metadata={"help": "Used in quantization configs. Do not specify this argument manually."}
)
model_max_length: Optional[int] = field(
default=None,
metadata={"help": "Used in rope scaling. Do not specify this argument manually."}
)
def __post_init__(self): def __post_init__(self):
if self.compute_dtype is not None or self.model_max_length is not None: self.compute_dtype = None
raise ValueError("These arguments cannot be specified.") self.model_max_length = None
if self.checkpoint_dir is not None: # support merging multiple lora weights if self.checkpoint_dir is not None: # support merging multiple lora weights
self.checkpoint_dir = [cd.strip() for cd in self.checkpoint_dir.split(",")] self.checkpoint_dir = [cd.strip() for cd in self.checkpoint_dir.split(",")]

View File

@ -173,7 +173,7 @@ def load_model_and_tokenizer(
) )
# Disable custom generate method (for Qwen) # Disable custom generate method (for Qwen)
if "GenerationMixin" not in str(model.generate.__func__): if isinstance(model, PreTrainedModel) and "GenerationMixin" not in str(model.generate.__func__):
model.generate = MethodType(PreTrainedModel.generate, model) model.generate = MethodType(PreTrainedModel.generate, model)
# Fix LM head (for ChatGLM2) # Fix LM head (for ChatGLM2)

View File

@ -213,7 +213,7 @@ def get_train_args(
else: else:
model_args.compute_dtype = torch.float32 model_args.compute_dtype = torch.float32
model_args.model_max_length = data_args.max_source_length + data_args.max_target_length model_args.model_max_length = data_args.cutoff_len
# Log on each process the small summary: # Log on each process the small summary:
logger.info("Process rank: {}, device: {}, n_gpu: {}\n distributed training: {}, compute dtype: {}".format( logger.info("Process rank: {}, device: {}, n_gpu: {}\n distributed training: {}, compute dtype: {}".format(

View File

@ -2,13 +2,13 @@ import os
import math import math
import torch import torch
from tqdm import tqdm from tqdm import tqdm
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
from transformers import GenerationConfig, Trainer, TrainerState, TrainerControl from transformers import GenerationConfig, Trainer, TrainerState, TrainerControl
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
from trl import PPOTrainer from trl import PPOTrainer
from trl.core import LengthSampler, PPODecorators, logprobs_from_logits from trl.core import PPODecorators, logprobs_from_logits
from llmtuner.extras.logging import get_logger from llmtuner.extras.logging import get_logger
from llmtuner.extras.misc import AverageMeter, count_parameters, get_logits_processor from llmtuner.extras.misc import AverageMeter, count_parameters, get_logits_processor
@ -47,7 +47,7 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
self.state = TrainerState() self.state = TrainerState()
self.control = TrainerControl() self.control = TrainerControl()
def ppo_train(self, max_target_length: int) -> None: def ppo_train(self) -> None:
r""" r"""
Implements training loop for the PPO stage, like _inner_training_loop() in Huggingface's Trainer. Implements training loop for the PPO stage, like _inner_training_loop() in Huggingface's Trainer.
""" """
@ -81,9 +81,7 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
pad_token_id=self.tokenizer.pad_token_id pad_token_id=self.tokenizer.pad_token_id
)) ))
length_sampler = LengthSampler(max_target_length // 2, max_target_length)
unwrapped_model: "AutoModelForCausalLMWithValueHead" = self.accelerator.unwrap_model(self.model) unwrapped_model: "AutoModelForCausalLMWithValueHead" = self.accelerator.unwrap_model(self.model)
dataiter = iter(self.dataloader) dataiter = iter(self.dataloader)
steps_trained = 0 steps_trained = 0
loss_meter = AverageMeter() loss_meter = AverageMeter()
@ -100,7 +98,7 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
self.model.eval() self.model.eval()
# Get inputs # Get inputs
queries, responses = self.get_inputs(batch, length_sampler, generating_args) queries, responses = self.get_inputs(batch, generating_args)
self.tokenizer.padding_side = "right" # change padding side self.tokenizer.padding_side = "right" # change padding side
rewards = self.get_rewards(queries, responses, unwrapped_model) rewards = self.get_rewards(queries, responses, unwrapped_model)
@ -156,13 +154,11 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
def get_inputs( def get_inputs(
self, self,
batch: Dict[str, torch.Tensor], batch: Dict[str, torch.Tensor],
length_sampler: Callable,
generating_args: Dict[str, Any] generating_args: Dict[str, Any]
) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
r""" r"""
Generates model's responses given queries. Generates model's responses given queries.
""" """
generating_args["max_new_tokens"] = length_sampler()
gen_kwargs = dict( gen_kwargs = dict(
generation_config=GenerationConfig(**generating_args), generation_config=GenerationConfig(**generating_args),
logits_processor=get_logits_processor(), logits_processor=get_logits_processor(),

View File

@ -79,7 +79,7 @@ def run_ppo(
# Training # Training
if training_args.do_train: if training_args.do_train:
ppo_trainer.ppo_train(max_target_length=data_args.max_target_length) ppo_trainer.ppo_train()
ppo_trainer.save_model() ppo_trainer.save_model()
ppo_trainer.save_state() # must be called after save_model to have a folder ppo_trainer.save_state() # must be called after save_model to have a folder
if ppo_trainer.is_world_process_zero() and model_args.plot_loss: if ppo_trainer.is_world_process_zero() and model_args.plot_loss:

View File

@ -28,12 +28,16 @@ def create_eval_tab(top_elems: Dict[str, "Component"], runner: "Runner") -> Dict
) )
with gr.Row(): with gr.Row():
max_source_length = gr.Slider(value=512, minimum=4, maximum=4096, step=1) cutoff_len = gr.Slider(value=1024, minimum=4, maximum=8192, step=1)
max_target_length = gr.Slider(value=512, minimum=4, maximum=4096, step=1)
max_samples = gr.Textbox(value="100000") max_samples = gr.Textbox(value="100000")
batch_size = gr.Slider(value=8, minimum=1, maximum=512, step=1) batch_size = gr.Slider(value=8, minimum=1, maximum=512, step=1)
predict = gr.Checkbox(value=True) predict = gr.Checkbox(value=True)
with gr.Row():
max_new_tokens = gr.Slider(10, 2048, value=128, step=1)
top_p = gr.Slider(0.01, 1, value=0.7, step=0.01)
temperature = gr.Slider(0.01, 1.5, value=0.95, step=0.01)
with gr.Row(): with gr.Row():
cmd_preview_btn = gr.Button() cmd_preview_btn = gr.Button()
start_btn = gr.Button() start_btn = gr.Button()
@ -55,11 +59,13 @@ def create_eval_tab(top_elems: Dict[str, "Component"], runner: "Runner") -> Dict
top_elems["system_prompt"], top_elems["system_prompt"],
dataset_dir, dataset_dir,
dataset, dataset,
max_source_length, cutoff_len,
max_target_length,
max_samples, max_samples,
batch_size, batch_size,
predict predict,
max_new_tokens,
top_p,
temperature
] ]
output_components = [ output_components = [
@ -78,11 +84,13 @@ def create_eval_tab(top_elems: Dict[str, "Component"], runner: "Runner") -> Dict
preview_count=preview_count, preview_count=preview_count,
preview_samples=preview_samples, preview_samples=preview_samples,
close_btn=close_btn, close_btn=close_btn,
max_source_length=max_source_length, cutoff_len=cutoff_len,
max_target_length=max_target_length,
max_samples=max_samples, max_samples=max_samples,
batch_size=batch_size, batch_size=batch_size,
predict=predict, predict=predict,
max_new_tokens=max_new_tokens,
top_p=top_p,
temperature=temperature,
cmd_preview_btn=cmd_preview_btn, cmd_preview_btn=cmd_preview_btn,
start_btn=start_btn, start_btn=start_btn,
stop_btn=stop_btn, stop_btn=stop_btn,

View File

@ -35,11 +35,11 @@ def create_train_tab(top_elems: Dict[str, "Component"], runner: "Runner") -> Dic
) )
with gr.Row(): with gr.Row():
max_source_length = gr.Slider(value=512, minimum=4, maximum=4096, step=1) cutoff_len = gr.Slider(value=1024, minimum=4, maximum=8192, step=1)
max_target_length = gr.Slider(value=512, minimum=4, maximum=4096, step=1)
learning_rate = gr.Textbox(value="5e-5") learning_rate = gr.Textbox(value="5e-5")
num_train_epochs = gr.Textbox(value="3.0") num_train_epochs = gr.Textbox(value="3.0")
max_samples = gr.Textbox(value="100000") max_samples = gr.Textbox(value="100000")
compute_type = gr.Radio(choices=["fp16", "bf16"], value="fp16")
with gr.Row(): with gr.Row():
batch_size = gr.Slider(value=4, minimum=1, maximum=512, step=1) batch_size = gr.Slider(value=4, minimum=1, maximum=512, step=1)
@ -55,7 +55,8 @@ def create_train_tab(top_elems: Dict[str, "Component"], runner: "Runner") -> Dic
logging_steps = gr.Slider(value=5, minimum=5, maximum=1000, step=5) logging_steps = gr.Slider(value=5, minimum=5, maximum=1000, step=5)
save_steps = gr.Slider(value=100, minimum=10, maximum=5000, step=10) save_steps = gr.Slider(value=100, minimum=10, maximum=5000, step=10)
warmup_steps = gr.Slider(value=0, minimum=0, maximum=5000, step=1) warmup_steps = gr.Slider(value=0, minimum=0, maximum=5000, step=1)
compute_type = gr.Radio(choices=["fp16", "bf16"], value="fp16") flash_attn = gr.Checkbox(value=False)
rope_scaling = gr.Checkbox(value=False)
with gr.Accordion(label="LoRA config", open=False) as lora_tab: with gr.Accordion(label="LoRA config", open=False) as lora_tab:
with gr.Row(): with gr.Row():
@ -107,11 +108,11 @@ def create_train_tab(top_elems: Dict[str, "Component"], runner: "Runner") -> Dic
training_stage, training_stage,
dataset_dir, dataset_dir,
dataset, dataset,
max_source_length, cutoff_len,
max_target_length,
learning_rate, learning_rate,
num_train_epochs, num_train_epochs,
max_samples, max_samples,
compute_type,
batch_size, batch_size,
gradient_accumulation_steps, gradient_accumulation_steps,
lr_scheduler_type, lr_scheduler_type,
@ -120,7 +121,8 @@ def create_train_tab(top_elems: Dict[str, "Component"], runner: "Runner") -> Dic
logging_steps, logging_steps,
save_steps, save_steps,
warmup_steps, warmup_steps,
compute_type, flash_attn,
rope_scaling,
lora_rank, lora_rank,
lora_dropout, lora_dropout,
lora_target, lora_target,
@ -151,11 +153,11 @@ def create_train_tab(top_elems: Dict[str, "Component"], runner: "Runner") -> Dic
preview_count=preview_count, preview_count=preview_count,
preview_samples=preview_samples, preview_samples=preview_samples,
close_btn=close_btn, close_btn=close_btn,
max_source_length=max_source_length, cutoff_len=cutoff_len,
max_target_length=max_target_length,
learning_rate=learning_rate, learning_rate=learning_rate,
num_train_epochs=num_train_epochs, num_train_epochs=num_train_epochs,
max_samples=max_samples, max_samples=max_samples,
compute_type=compute_type,
batch_size=batch_size, batch_size=batch_size,
gradient_accumulation_steps=gradient_accumulation_steps, gradient_accumulation_steps=gradient_accumulation_steps,
lr_scheduler_type=lr_scheduler_type, lr_scheduler_type=lr_scheduler_type,
@ -165,7 +167,8 @@ def create_train_tab(top_elems: Dict[str, "Component"], runner: "Runner") -> Dic
logging_steps=logging_steps, logging_steps=logging_steps,
save_steps=save_steps, save_steps=save_steps,
warmup_steps=warmup_steps, warmup_steps=warmup_steps,
compute_type=compute_type, flash_attn=flash_attn,
rope_scaling=rope_scaling,
lora_tab=lora_tab, lora_tab=lora_tab,
lora_rank=lora_rank, lora_rank=lora_rank,
lora_dropout=lora_dropout, lora_dropout=lora_dropout,

View File

@ -147,26 +147,16 @@ LOCALES = {
"value": "关闭" "value": "关闭"
} }
}, },
"max_source_length": { "cutoff_len": {
"en": { "en": {
"label": "Max source length", "label": "Cutoff length",
"info": "Max tokens in source sequence." "info": "Max tokens in input sequence."
}, },
"zh": { "zh": {
"label": "输入序列最大长度", "label": "截断长度",
"info": "输入序列分词后的最大长度。" "info": "输入序列分词后的最大长度。"
} }
}, },
"max_target_length": {
"en": {
"label": "Max target length",
"info": "Max tokens in target sequence."
},
"zh": {
"label": "输出序列最大长度",
"info": "输出序列分词后的最大长度。"
}
},
"learning_rate": { "learning_rate": {
"en": { "en": {
"label": "Learning rate", "label": "Learning rate",
@ -197,6 +187,16 @@ LOCALES = {
"info": "每个数据集最多使用的样本数。" "info": "每个数据集最多使用的样本数。"
} }
}, },
"compute_type": {
"en": {
"label": "Compute type",
"info": "Whether to use fp16 or bf16 mixed precision training."
},
"zh": {
"label": "计算类型",
"info": "是否启用 FP16 或 BF16 混合精度训练。"
}
},
"batch_size": { "batch_size": {
"en": { "en": {
"label": "Batch size", "label": "Batch size",
@ -277,14 +277,20 @@ LOCALES = {
"info": "学习率预热采用的步数。" "info": "学习率预热采用的步数。"
} }
}, },
"compute_type": { "flash_attn": {
"en": { "en": {
"label": "Compute type", "label": "Use FlashAttention-2"
"info": "Whether to use fp16 or bf16 mixed precision training."
}, },
"zh": { "zh": {
"label": "计算类型", "label": "使用 FlashAttention-2"
"info": "是否启用 FP16 或 BF16 混合精度训练。" }
},
"rope_scaling": {
"en": {
"label": "Use RoPE scaling"
},
"zh": {
"label": "使用 RoPE 插值"
} }
}, },
"lora_tab": { "lora_tab": {

View File

@ -73,11 +73,11 @@ class Runner:
training_stage: str, training_stage: str,
dataset_dir: str, dataset_dir: str,
dataset: List[str], dataset: List[str],
max_source_length: int, cutoff_len: int,
max_target_length: int,
learning_rate: str, learning_rate: str,
num_train_epochs: str, num_train_epochs: str,
max_samples: str, max_samples: str,
compute_type: str,
batch_size: int, batch_size: int,
gradient_accumulation_steps: int, gradient_accumulation_steps: int,
lr_scheduler_type: str, lr_scheduler_type: str,
@ -86,7 +86,8 @@ class Runner:
logging_steps: int, logging_steps: int,
save_steps: int, save_steps: int,
warmup_steps: int, warmup_steps: int,
compute_type: str, flash_attn: bool,
rope_scaling: bool,
lora_rank: int, lora_rank: int,
lora_dropout: float, lora_dropout: float,
lora_target: str, lora_target: str,
@ -120,8 +121,7 @@ class Runner:
system_prompt=system_prompt, system_prompt=system_prompt,
dataset_dir=dataset_dir, dataset_dir=dataset_dir,
dataset=",".join(dataset), dataset=",".join(dataset),
max_source_length=max_source_length, cutoff_len=cutoff_len,
max_target_length=max_target_length,
learning_rate=float(learning_rate), learning_rate=float(learning_rate),
num_train_epochs=float(num_train_epochs), num_train_epochs=float(num_train_epochs),
max_samples=int(max_samples), max_samples=int(max_samples),
@ -132,6 +132,8 @@ class Runner:
logging_steps=logging_steps, logging_steps=logging_steps,
save_steps=save_steps, save_steps=save_steps,
warmup_steps=warmup_steps, warmup_steps=warmup_steps,
flash_attn=flash_attn,
rope_scaling="linear" if rope_scaling else None,
lora_rank=lora_rank, lora_rank=lora_rank,
lora_dropout=lora_dropout, lora_dropout=lora_dropout,
lora_target=lora_target or DEFAULT_MODULE.get(model_name.split("-")[0], "q_proj,v_proj"), lora_target=lora_target or DEFAULT_MODULE.get(model_name.split("-")[0], "q_proj,v_proj"),
@ -168,11 +170,13 @@ class Runner:
system_prompt: str, system_prompt: str,
dataset_dir: str, dataset_dir: str,
dataset: List[str], dataset: List[str],
max_source_length: int, cutoff_len: int,
max_target_length: int,
max_samples: str, max_samples: str,
batch_size: int, batch_size: int,
predict: bool predict: bool,
max_new_tokens: int,
top_p: float,
temperature: float
) -> Tuple[str, str, List[str], str, Dict[str, Any]]: ) -> Tuple[str, str, List[str], str, Dict[str, Any]]:
if checkpoints: if checkpoints:
checkpoint_dir = ",".join( checkpoint_dir = ",".join(
@ -200,10 +204,12 @@ class Runner:
system_prompt=system_prompt, system_prompt=system_prompt,
dataset_dir=dataset_dir, dataset_dir=dataset_dir,
dataset=",".join(dataset), dataset=",".join(dataset),
max_source_length=max_source_length, cutoff_len=cutoff_len,
max_target_length=max_target_length,
max_samples=int(max_samples), max_samples=int(max_samples),
per_device_eval_batch_size=batch_size, per_device_eval_batch_size=batch_size,
max_new_tokens=max_new_tokens,
top_p=top_p,
temperature=temperature,
output_dir=output_dir output_dir=output_dir
) )