Mirror of https://github.com/hiyouga/LLaMA-Factory.git (synced 2025-08-23 06:12:50 +08:00)

release v0.6.1

Commit: fc066cad7f
Parent: 1421db282a
Former-commit-id: ca793028c69433eae405009c5ebb790c6c2d40c4
@@ -7,5 +7,5 @@ from .train import export_model, run_exp
 from .webui import create_ui, create_web_demo
 
 
-__version__ = "0.6.0"
+__version__ = "0.6.1"
 __all__ = ["create_app", "ChatModel", "Evaluator", "export_model", "run_exp", "create_ui", "create_web_demo"]
@@ -28,6 +28,7 @@ def run_dpo(
     tokenizer = load_tokenizer(model_args)
     dataset = get_dataset(tokenizer, model_args, data_args, training_args, stage="rm")
     model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train)
+
     data_collator = DPODataCollatorWithPadding(
         tokenizer=tokenizer,
         pad_to_multiple_of=8,
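Note: pad_to_multiple_of=8 in the collator above rounds every padded batch up to a length divisible by 8, which keeps fp16/bf16 matmuls aligned with tensor-core tile sizes. A minimal sketch of that rounding (the helper name is illustrative, not from the repo):

def round_up_to_multiple(length: int, multiple: int = 8) -> int:
    # Round a sequence length up to the next multiple, e.g. 509 -> 512.
    return ((length + multiple - 1) // multiple) * multiple


assert round_up_to_multiple(509) == 512
assert round_up_to_multiple(512) == 512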
@@ -6,20 +6,23 @@ from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
 import torch
 from tqdm import tqdm
 from transformers import GenerationConfig, Trainer, TrainerControl, TrainerState
+from transformers.optimization import get_scheduler
 from transformers.trainer_pt_utils import remove_dummy_checkpoint
 from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
 from transformers.utils import SAFE_WEIGHTS_NAME, WEIGHTS_NAME
-from trl import PPOTrainer
+from trl import PPOConfig, PPOTrainer
 from trl.core import PPODecorators, logprobs_from_logits
 
 from ...extras.callbacks import FixValueHeadModelCallback, LogCallback
 from ...extras.logging import get_logger
 from ...extras.misc import AverageMeter, count_parameters, get_current_device, get_logits_processor
+from ..utils import create_custom_optimzer, create_custom_scheduler
 from .utils import dump_layernorm, get_rewards_from_server, replace_model, restore_layernorm
 
 
 if TYPE_CHECKING:
-    from transformers import Seq2SeqTrainingArguments, TrainerCallback
+    from datasets import Dataset
+    from transformers import DataCollatorWithPadding, PreTrainedTokenizer, Seq2SeqTrainingArguments, TrainerCallback
     from trl import AutoModelForCausalLMWithValueHead
 
     from ...hparams import FinetuningArguments, GeneratingArguments, ModelArguments
@@ -40,10 +43,53 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
         finetuning_args: "FinetuningArguments",
         generating_args: "GeneratingArguments",
         callbacks: List["TrainerCallback"],
-        reward_model: "AutoModelForCausalLMWithValueHead",
-        **kwargs,
+        model: "AutoModelForCausalLMWithValueHead",
+        reward_model: Optional["AutoModelForCausalLMWithValueHead"],
+        ref_model: Optional["AutoModelForCausalLMWithValueHead"],
+        tokenizer: "PreTrainedTokenizer",
+        dataset: "Dataset",
+        data_collator: "DataCollatorWithPadding",
     ):
-        PPOTrainer.__init__(self, **kwargs)
+        backward_batch_size = training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps
+        ppo_config = PPOConfig(
+            model_name=model_args.model_name_or_path,
+            learning_rate=training_args.learning_rate,
+            mini_batch_size=training_args.per_device_train_batch_size,
+            batch_size=backward_batch_size * finetuning_args.ppo_buffer_size,
+            gradient_accumulation_steps=training_args.gradient_accumulation_steps,
+            ppo_epochs=finetuning_args.ppo_epochs,
+            max_grad_norm=training_args.max_grad_norm,
+            seed=training_args.seed,
+            optimize_device_cache=True,
+            target=finetuning_args.ppo_target,
+            use_score_scaling=finetuning_args.ppo_score_norm,
+            use_score_norm=finetuning_args.ppo_score_norm,
+            whiten_rewards=finetuning_args.ppo_whiten_rewards,
+            accelerator_kwargs={"step_scheduler_with_optimizer": False},
+            log_with=training_args.report_to[0] if training_args.report_to is not None else None,
+            project_kwargs={"logging_dir": training_args.logging_dir},
+        )
+
+        # Create optimizer and scheduler
+        if training_args.max_steps > 0:
+            num_training_steps = training_args.max_steps
+        else:
+            total_train_batch_size = backward_batch_size * finetuning_args.ppo_buffer_size * training_args.world_size
+            num_training_steps = training_args.num_train_epochs * math.ceil(len(dataset) / total_train_batch_size)
+
+        optimizer = self.create_optimizer(model, training_args, finetuning_args)
+        scheduler = self.create_scheduler(training_args, num_training_steps, optimizer)
+
+        PPOTrainer.__init__(
+            self,
+            config=ppo_config,
+            model=model,
+            ref_model=ref_model,
+            tokenizer=tokenizer,
+            dataset=dataset,
+            data_collator=data_collator,
+            lr_scheduler=scheduler,
+        )
 
         self.args = training_args
         self.model_args = model_args
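For orientation, the batch bookkeeping introduced above can be traced with concrete numbers; a minimal sketch of the same arithmetic, using illustrative hyperparameters that are not taken from the commit:

import math

# Illustrative values only.
per_device_train_batch_size = 2
gradient_accumulation_steps = 4
ppo_buffer_size = 1
world_size = 2
num_train_epochs = 3
dataset_len = 10_000

# Samples consumed per optimizer step on one device.
backward_batch_size = per_device_train_batch_size * gradient_accumulation_steps  # 8

# PPOConfig.batch_size: rollout buffer gathered before each PPO update phase.
batch_size = backward_batch_size * ppo_buffer_size  # 8

# Without max_steps, the step count is derived from the dataset size across all devices.
total_train_batch_size = backward_batch_size * ppo_buffer_size * world_size  # 16
num_training_steps = num_train_epochs * math.ceil(dataset_len / total_train_batch_size)  # 3 * 625 = 1875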
@@ -205,6 +251,44 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
             self.args, self.state, self.control, model=self.accelerator.unwrap_model(self.model)
         )
 
+    def create_optimizer(
+        self,
+        model: "AutoModelForCausalLMWithValueHead",
+        training_args: "Seq2SeqTrainingArguments",
+        finetuning_args: "FinetuningArguments",
+    ) -> "torch.optim.Optimizer":
+        optimizer = create_custom_optimzer(model, training_args, finetuning_args)
+        if optimizer is None:
+            decay_params, nodecay_params = [], []
+            decay_param_names = self.get_decay_parameter_names(model)
+            for name, param in model.named_parameters():
+                if param.requires_grad:
+                    if name in decay_param_names:
+                        decay_params.append(param)
+                    else:
+                        nodecay_params.append(param)
+
+            optim_class, optim_kwargs = Trainer.get_optimizer_cls_and_kwargs(training_args)
+            param_groups = [
+                dict(params=nodecay_params),
+                dict(params=decay_params, weight_decay=training_args.weight_decay),
+            ]
+            optimizer = optim_class(param_groups, **optim_kwargs)
+
+        return optimizer
+
+    def create_scheduler(
+        self, training_args: "Seq2SeqTrainingArguments", num_training_steps: int, optimizer: "torch.optim.Optimizer"
+    ) -> "torch.optim.lr_scheduler.LRScheduler":
+        create_custom_scheduler(training_args, num_training_steps, optimizer)
+        lr_scheduler = get_scheduler(
+            training_args.lr_scheduler_type,
+            optimizer=optimizer,
+            num_warmup_steps=training_args.get_warmup_steps(num_training_steps),
+            num_training_steps=num_training_steps,
+        )
+        return lr_scheduler
+
     @torch.no_grad()
     def get_inputs(self, batch: Dict[str, torch.Tensor]) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
         r"""
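The new create_optimizer keeps the common Transformers convention of applying weight decay only to a subset of parameters. A standalone sketch of the same grouping idea, assuming plain torch.optim.AdamW in place of Trainer.get_optimizer_cls_and_kwargs and a simple ndim/bias heuristic in place of get_decay_parameter_names:

import torch
from torch import nn


def build_grouped_adamw(model: nn.Module, lr: float = 1e-5, weight_decay: float = 0.01) -> torch.optim.AdamW:
    # Heuristic split: biases and 1-D tensors (e.g. LayerNorm weights) get no decay.
    decay, no_decay = [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        (no_decay if param.ndim <= 1 or name.endswith(".bias") else decay).append(param)
    param_groups = [
        {"params": no_decay, "weight_decay": 0.0},
        {"params": decay, "weight_decay": weight_decay},
    ]
    return torch.optim.AdamW(param_groups, lr=lr)


# Example: the Linear weight decays, its bias does not.
optimizer = build_grouped_adamw(nn.Linear(16, 16))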
@@ -1,19 +1,15 @@
 # Inspired by: https://github.com/lvwerra/trl/blob/main/examples/research_projects/stack_llama/scripts/rl_training.py
 
-import math
 from typing import TYPE_CHECKING, List, Optional
 
-from torch.optim import AdamW
 from transformers import DataCollatorWithPadding
-from transformers.optimization import get_scheduler
-from trl import PPOConfig
 
 from ...data import get_dataset
 from ...extras.callbacks import FixValueHeadModelCallback
 from ...extras.misc import fix_valuehead_checkpoint
 from ...extras.ploting import plot_loss
 from ...model import load_model, load_tokenizer
-from ..utils import create_custom_optimzer, create_custom_scheduler, create_ref_model, create_reward_model
+from ..utils import create_ref_model, create_reward_model
 from .trainer import CustomPPOTrainer
 
 
@@ -42,46 +38,6 @@ def run_ppo(
     ref_model = create_ref_model(model_args, finetuning_args, add_valuehead=True)
     reward_model = create_reward_model(model, model_args, finetuning_args)
 
-    # Create ppo config
-    backward_batch_size = training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps
-    ppo_config = PPOConfig(
-        model_name=model_args.model_name_or_path,
-        learning_rate=training_args.learning_rate,
-        mini_batch_size=training_args.per_device_train_batch_size,
-        batch_size=backward_batch_size * finetuning_args.ppo_buffer_size,
-        gradient_accumulation_steps=training_args.gradient_accumulation_steps,
-        ppo_epochs=finetuning_args.ppo_epochs,
-        max_grad_norm=training_args.max_grad_norm,
-        seed=training_args.seed,
-        optimize_device_cache=True,
-        target=finetuning_args.ppo_target,
-        use_score_scaling=finetuning_args.ppo_score_norm,
-        use_score_norm=finetuning_args.ppo_score_norm,
-        whiten_rewards=finetuning_args.ppo_whiten_rewards,
-        accelerator_kwargs={"step_scheduler_with_optimizer": False},
-        log_with=training_args.report_to[0] if training_args.report_to is not None else None,
-        project_kwargs={"logging_dir": training_args.logging_dir},
-    )
-
-    # Create optimizer and scheduler
-    if training_args.max_steps > 0:
-        num_training_steps = training_args.max_steps
-    else:
-        total_train_batch_size = backward_batch_size * finetuning_args.ppo_buffer_size * training_args.world_size
-        num_training_steps = training_args.num_train_epochs * math.ceil(len(dataset) / total_train_batch_size)
-
-    optimizer = create_custom_optimzer(model, training_args, finetuning_args)
-    if optimizer is None:
-        optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=training_args.learning_rate)
-
-    create_custom_scheduler(training_args, num_training_steps, optimizer)
-    lr_scheduler = get_scheduler(
-        training_args.lr_scheduler_type,
-        optimizer=optimizer,
-        num_warmup_steps=training_args.get_warmup_steps(num_training_steps),
-        num_training_steps=num_training_steps,
-    )
-
     # Initialize our Trainer
     ppo_trainer = CustomPPOTrainer(
         model_args=model_args,
@@ -89,15 +45,12 @@ def run_ppo(
         finetuning_args=finetuning_args,
         generating_args=generating_args,
         callbacks=callbacks + [FixValueHeadModelCallback()],
-        reward_model=reward_model,
-        config=ppo_config,
         model=model,
+        reward_model=reward_model,
         ref_model=ref_model,
         tokenizer=tokenizer,
         dataset=dataset,
         data_collator=data_collator,
-        optimizer=optimizer,
-        lr_scheduler=lr_scheduler,
     )
 
     # Training
@@ -70,7 +70,7 @@ def create_modelcard_and_push(
 
 def create_ref_model(
     model_args: "ModelArguments", finetuning_args: "FinetuningArguments", add_valuehead: bool = False
-) -> Union["PreTrainedModel", "AutoModelForCausalLMWithValueHead"]:
+) -> Optional[Union["PreTrainedModel", "AutoModelForCausalLMWithValueHead"]]:
     r"""
     Creates reference model for PPO/DPO training. Evaluation mode is not supported.
 
@@ -105,7 +105,7 @@ def create_ref_model(
 
 def create_reward_model(
     model: "AutoModelForCausalLMWithValueHead", model_args: "ModelArguments", finetuning_args: "FinetuningArguments"
-) -> "AutoModelForCausalLMWithValueHead":
+) -> Optional["AutoModelForCausalLMWithValueHead"]:
     r"""
     Creates reward model for PPO training.
     """
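The widened return types above mean both helpers can now return None (for example, when no reference or reward model is configured), so callers must branch on the result. A minimal illustration of that pattern with hypothetical names, not the repo's API:

from typing import Optional


class ValueHeadModel:
    """Stand-in for AutoModelForCausalLMWithValueHead (illustrative only)."""


def maybe_create_reward_model(reward_model_path: Optional[str]) -> Optional[ValueHeadModel]:
    # Mirrors the Optional[...] contract: nothing configured, no model object.
    if reward_model_path is None:
        return None
    return ValueHeadModel()


assert maybe_create_reward_model(None) is None
assert isinstance(maybe_create_reward_model("some/reward/model"), ValueHeadModel)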