mirror of https://github.com/hiyouga/LLaMA-Factory.git, synced 2025-12-18 04:40:35 +08:00
support rank0 logger

src/llamafactory/train/callbacks.py:

@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import json
-import logging
 import os
 import signal
 import sys
@@ -34,8 +33,8 @@ from transformers.utils import (
 )
 from typing_extensions import override
 
+from ..extras import logging
 from ..extras.constants import TRAINER_LOG, V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME
-from ..extras.logging import LoggerHandler, get_logger
 from ..extras.misc import get_peak_memory
 
 
@@ -48,7 +47,7 @@ if TYPE_CHECKING:
     from trl import AutoModelForCausalLMWithValueHead
 
 
-logger = get_logger(__name__)
+logger = logging.get_logger(__name__)
 
 
 def fix_valuehead_checkpoint(
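
The hunks above swap the per-module helper import for the package's own logging module. The new `logging.get_logger(__name__)` evidently returns a logger that also exposes `info_rank0`, `warning_rank0`, and `warning_once`, so call sites can log from the main process only instead of once per distributed rank. A minimal sketch of what such an extras/logging module could look like — the method names come from this diff, but the `_Logger` class and the `LOCAL_RANK` check are assumptions, not the verbatim LLaMA-Factory source:

# Sketch only: a rank-aware logger in the spirit of extras/logging.
# LOCAL_RANK is set by launchers such as torchrun and accelerate.
import logging
import os
from functools import lru_cache


class _Logger(logging.Logger):
    def info_rank0(self, *args, **kwargs) -> None:
        if int(os.getenv("LOCAL_RANK", "0")) == 0:  # main process only
            self.info(*args, **kwargs)

    def warning_rank0(self, *args, **kwargs) -> None:
        if int(os.getenv("LOCAL_RANK", "0")) == 0:
            self.warning(*args, **kwargs)

    @lru_cache(None)  # each distinct message fires at most once
    def warning_once(self, *args, **kwargs) -> None:
        if int(os.getenv("LOCAL_RANK", "0")) == 0:
            self.warning(*args, **kwargs)


def get_logger(name: str) -> logging.Logger:
    logging.setLoggerClass(_Logger)  # new loggers get the subclass
    try:
        return logging.getLogger(name)
    finally:
        logging.setLoggerClass(logging.Logger)  # restore the default
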
@@ -92,7 +91,7 @@ def fix_valuehead_checkpoint(
     else:
         torch.save(v_head_state_dict, os.path.join(output_dir, V_HEAD_WEIGHTS_NAME))
 
-    logger.info(f"Value head model saved at: {output_dir}")
+    logger.info_rank0(f"Value head model saved at: {output_dir}")
 
 
 class FixValueHeadModelCallback(TrainerCallback):
@@ -145,7 +144,7 @@ class PissaConvertCallback(TrainerCallback):
         if args.should_save:
             model = kwargs.pop("model")
             pissa_init_dir = os.path.join(args.output_dir, "pissa_init")
-            logger.info(f"Initial PiSSA adapter will be saved at: {pissa_init_dir}.")
+            logger.info_rank0(f"Initial PiSSA adapter will be saved at: {pissa_init_dir}.")
             if isinstance(model, PeftModel):
                 init_lora_weights = getattr(model.peft_config["default"], "init_lora_weights")
                 setattr(model.peft_config["default"], "init_lora_weights", True)
@@ -159,7 +158,7 @@ class PissaConvertCallback(TrainerCallback):
             pissa_init_dir = os.path.join(args.output_dir, "pissa_init")
             pissa_backup_dir = os.path.join(args.output_dir, "pissa_backup")
             pissa_convert_dir = os.path.join(args.output_dir, "pissa_converted")
-            logger.info(f"Converted PiSSA adapter will be saved at: {pissa_convert_dir}.")
+            logger.info_rank0(f"Converted PiSSA adapter will be saved at: {pissa_convert_dir}.")
             # 1. save a pissa backup with init_lora_weights: True
             # 2. save a converted lora with init_lora_weights: pissa
             # 3. load the pissa backup with init_lora_weights: True
@@ -200,8 +199,8 @@ class LogCallback(TrainerCallback):
         self.webui_mode = os.environ.get("LLAMABOARD_ENABLED", "0").lower() in ["true", "1"]
         if self.webui_mode:
             signal.signal(signal.SIGABRT, self._set_abort)
-            self.logger_handler = LoggerHandler(os.environ.get("LLAMABOARD_WORKDIR"))
-            logging.root.addHandler(self.logger_handler)
+            self.logger_handler = logging.LoggerHandler(os.environ.get("LLAMABOARD_WORKDIR"))
+            logging.add_handler(self.logger_handler)
             transformers.logging.add_handler(self.logger_handler)
 
     def _set_abort(self, signum, frame) -> None:
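
Here the Web-UI handler moves with the module: `LoggerHandler` is now reached through the package's `logging` namespace, and `logging.add_handler` replaces the direct mutation of the stdlib root logger, so the handler is scoped to this package rather than to every library that logs. One plausible shape for that helper (hypothetical code; only the names `LoggerHandler` and `add_handler` appear in the diff), mirroring how `transformers.utils.logging` exposes `add_handler`/`remove_handler`:

# Hypothetical helper: attach a handler to the package root logger
# instead of logging.root.
import logging


def _get_library_root_logger() -> logging.Logger:
    # e.g. "llamafactory" if this module lives at llamafactory/extras/logging.py
    return logging.getLogger(__name__.split(".")[0])


def add_handler(handler: logging.Handler) -> None:
    _get_library_root_logger().addHandler(handler)


def remove_handler(handler: logging.Handler) -> None:
    _get_library_root_logger().removeHandler(handler)
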
@@ -243,7 +242,7 @@ class LogCallback(TrainerCallback):
             and os.path.exists(os.path.join(args.output_dir, TRAINER_LOG))
             and args.overwrite_output_dir
         ):
-            logger.warning("Previous trainer log in this folder will be deleted.")
+            logger.warning_once("Previous trainer log in this folder will be deleted.")
             os.remove(os.path.join(args.output_dir, TRAINER_LOG))
 
     @override
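
`warning_once` goes a step further than `warning_rank0`: besides presumably logging only on the main process, it deduplicates, so a warning that would otherwise fire on every re-initialization is emitted a single time per process. `transformers` implements its own `warning_once` with `functools.lru_cache`; a sketch along the same lines:

import logging
from functools import lru_cache

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("llamafactory")


@lru_cache(None)  # remembers each distinct message; repeats become no-ops
def warning_once(msg: str) -> None:
    logger.warning(msg)


warning_once("Previous trainer log in this folder will be deleted.")
warning_once("Previous trainer log in this folder will be deleted.")  # suppressed
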
@@ -310,7 +309,7 @@ class LogCallback(TrainerCallback):
 
         logs = {k: v for k, v in logs.items() if v is not None}
         if self.webui_mode and all(key in logs for key in ["loss", "learning_rate", "epoch"]):
-            logger.info(
+            logger.info_rank0(
                 "{{'loss': {:.4f}, 'learning_rate': {:2.4e}, 'epoch': {:.2f}, 'throughput': {}}}".format(
                     logs["loss"], logs["learning_rate"], logs["epoch"], logs.get("throughput", "N/A")
                 )

src/llamafactory/train/ppo/trainer.py:

@@ -37,7 +37,7 @@ from trl.core import PPODecorators, logprobs_from_logits
 from trl.models.utils import unwrap_model_for_generation
 from typing_extensions import override
 
-from ...extras.logging import get_logger
+from ...extras import logging
 from ...extras.misc import AverageMeter, count_parameters, get_current_device, get_logits_processor
 from ..callbacks import FixValueHeadModelCallback, SaveProcessorCallback
 from ..trainer_utils import create_custom_optimizer, create_custom_scheduler
@@ -58,7 +58,7 @@ if TYPE_CHECKING:
     from ...hparams import FinetuningArguments, GeneratingArguments, ModelArguments
 
 
-logger = get_logger(__name__)
+logger = logging.get_logger(__name__)
 
 
 class CustomPPOTrainer(PPOTrainer, Trainer):
@@ -112,7 +112,7 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
             ]
             ppo_config.accelerator_kwargs["deepspeed_plugin"] = training_args.deepspeed_plugin
             if ppo_config.log_with is not None:
-                logger.warning("PPOTrainer cannot use external logger when DeepSpeed is enabled.")
+                logger.warning_rank0("PPOTrainer cannot use external logger when DeepSpeed is enabled.")
                 ppo_config.log_with = None
 
         # Create optimizer and scheduler
@@ -160,7 +160,7 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
             callbacks, self.accelerator.unwrap_model(self.model), self.tokenizer, self.optimizer, self.lr_scheduler
         )
         if self.args.max_steps > 0:
-            logger.info("max_steps is given, it will override any value given in num_train_epochs")
+            logger.info_rank0("max_steps is given, it will override any value given in num_train_epochs")
 
         self.amp_context = torch.autocast(self.current_device.type)
         warnings.simplefilter("ignore")  # remove gc warnings on ref model
@@ -216,20 +216,19 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
         self.state.is_local_process_zero = self.is_local_process_zero()
         self.state.is_world_process_zero = self.is_world_process_zero()
 
-        if self.is_world_process_zero():
-            logger.info("***** Running training *****")
-            logger.info(f"  Num examples = {num_examples:,}")
-            logger.info(f"  Num Epochs = {num_train_epochs:,}")
-            logger.info(f"  Instantaneous batch size per device = {self.args.per_device_train_batch_size:,}")
-            logger.info(
-                "  Total train batch size (w. parallel, buffer, distributed & accumulation) = {:,}".format(
-                    total_train_batch_size
-                )
+        logger.info_rank0("***** Running training *****")
+        logger.info_rank0(f"  Num examples = {num_examples:,}")
+        logger.info_rank0(f"  Num Epochs = {num_train_epochs:,}")
+        logger.info_rank0(f"  Instantaneous batch size per device = {self.args.per_device_train_batch_size:,}")
+        logger.info_rank0(
+            "  Total train batch size (w. parallel, buffer, distributed & accumulation) = {:,}".format(
+                total_train_batch_size
             )
-            logger.info(f"  Gradient Accumulation steps = {self.args.gradient_accumulation_steps:,}")
-            logger.info(f"  Num optimization epochs per batch = {self.finetuning_args.ppo_epochs:,}")
-            logger.info(f"  Total training steps = {max_steps:,}")
-            logger.info(f"  Number of trainable parameters = {count_parameters(self.model)[0]:,}")
+        )
+        logger.info_rank0(f"  Gradient Accumulation steps = {self.args.gradient_accumulation_steps:,}")
+        logger.info_rank0(f"  Num optimization epochs per batch = {self.finetuning_args.ppo_epochs:,}")
+        logger.info_rank0(f"  Total training steps = {max_steps:,}")
+        logger.info_rank0(f"  Number of trainable parameters = {count_parameters(self.model)[0]:,}")
 
         dataiter = iter(self.dataloader)
         loss_meter = AverageMeter()
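
One structural detail in this hunk: the `if self.is_world_process_zero():` guard disappears and the banner block dedents one level, because the rank check now lives inside the logging call itself. The equivalence, under the assumed `info_rank0` semantics sketched earlier (strictly, `is_world_process_zero` checks the global rank while a `LOCAL_RANK` test is per node; the sketch glosses over that distinction):

import logging
import os

logger = logging.getLogger("llamafactory")


def info_rank0(msg: str) -> None:
    # assumed behavior: the guard the old call sites spelled out by hand
    if int(os.getenv("LOCAL_RANK", "0")) == 0:
        logger.info(msg)


# old style: guard at the call site
# if trainer.is_world_process_zero():
#     logger.info("***** Running training *****")

# new style: guard folded into the helper
info_rank0("***** Running training *****")
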
@@ -269,7 +268,7 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
                     batch["response"] = self.tokenizer.batch_decode(responses, skip_special_tokens=True)
                     self.log_stats(stats, batch, rewards)
                 except Exception:
-                    logger.warning("Failed to save stats due to unknown errors.")
+                    logger.warning_rank0("Failed to save stats due to unknown errors.")
 
             self.state.global_step += 1
             self.callback_handler.on_step_end(self.args, self.state, self.control)
@@ -498,7 +497,7 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
             if self.args.should_save:
                 self._save(output_dir, state_dict=state_dict)
         except ValueError:
-            logger.warning(
+            logger.warning_rank0(
                 " stage3_gather_16bit_weights_on_model_save=false. Saving the full checkpoint instead,"
                 " use zero_to_fp32.py to recover weights"
             )

src/llamafactory/train/pt/trainer.py:

@@ -18,7 +18,6 @@ from typing import TYPE_CHECKING, Optional
 from transformers import Trainer
 from typing_extensions import override
 
-from ...extras.logging import get_logger
 from ...extras.packages import is_transformers_version_equal_to_4_46
 from ..callbacks import PissaConvertCallback, SaveProcessorCallback
 from ..trainer_utils import create_custom_optimizer, create_custom_scheduler
@@ -31,9 +30,6 @@ if TYPE_CHECKING:
     from ...hparams import FinetuningArguments
 
 
-logger = get_logger(__name__)
-
-
 class CustomTrainer(Trainer):
     r"""
     Inherits Trainer for custom optimizer.

src/llamafactory/train/rm/trainer.py:

@@ -24,7 +24,7 @@ import torch
 from transformers import Trainer
 from typing_extensions import override
 
-from ...extras.logging import get_logger
+from ...extras import logging
 from ...extras.packages import is_transformers_version_equal_to_4_46
 from ..callbacks import FixValueHeadModelCallback, PissaConvertCallback, SaveProcessorCallback
 from ..trainer_utils import create_custom_optimizer, create_custom_scheduler
@@ -37,7 +37,7 @@ if TYPE_CHECKING:
     from ...hparams import FinetuningArguments
 
 
-logger = get_logger(__name__)
+logger = logging.get_logger(__name__)
 
 
 class PairwiseTrainer(Trainer):
@@ -118,7 +118,7 @@ class PairwiseTrainer(Trainer):
             return
 
         output_prediction_file = os.path.join(self.args.output_dir, "generated_predictions.jsonl")
-        logger.info(f"Saving prediction results to {output_prediction_file}")
+        logger.info_rank0(f"Saving prediction results to {output_prediction_file}")
         chosen_scores, rejected_scores = predict_results.predictions
 
         with open(output_prediction_file, "w", encoding="utf-8") as writer:

src/llamafactory/train/sft/trainer.py:

@@ -25,8 +25,8 @@ import torch
 from transformers import Seq2SeqTrainer
 from typing_extensions import override
 
+from ...extras import logging
 from ...extras.constants import IGNORE_INDEX
-from ...extras.logging import get_logger
 from ...extras.packages import is_transformers_version_equal_to_4_46
 from ..callbacks import PissaConvertCallback, SaveProcessorCallback
 from ..trainer_utils import create_custom_optimizer, create_custom_scheduler
@@ -40,7 +40,7 @@ if TYPE_CHECKING:
     from ...hparams import FinetuningArguments
 
 
-logger = get_logger(__name__)
+logger = logging.get_logger(__name__)
 
 
 class CustomSeq2SeqTrainer(Seq2SeqTrainer):
@@ -142,7 +142,7 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
             return
 
         output_prediction_file = os.path.join(self.args.output_dir, "generated_predictions.jsonl")
-        logger.info(f"Saving prediction results to {output_prediction_file}")
+        logger.info_rank0(f"Saving prediction results to {output_prediction_file}")
 
         labels = np.where(
             predict_results.label_ids != IGNORE_INDEX, predict_results.label_ids, self.tokenizer.pad_token_id

src/llamafactory/train/trainer_utils.py:

@@ -28,8 +28,8 @@ from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
 from transformers.trainer_pt_utils import get_parameter_names
 from typing_extensions import override
 
+from ..extras import logging
 from ..extras.constants import IGNORE_INDEX
-from ..extras.logging import get_logger
 from ..extras.packages import is_galore_available
 from ..hparams import FinetuningArguments, ModelArguments
 from ..model import find_all_linear_modules, load_model, load_tokenizer, load_valuehead_params
@@ -46,7 +46,7 @@ if TYPE_CHECKING:
     from ..hparams import DataArguments
 
 
-logger = get_logger(__name__)
+logger = logging.get_logger(__name__)
 
 
 class DummyOptimizer(torch.optim.Optimizer):
@@ -116,7 +116,7 @@ def create_ref_model(
         ref_model = load_model(
             tokenizer, ref_model_args, ref_finetuning_args, is_trainable=False, add_valuehead=add_valuehead
         )
-        logger.info(f"Created reference model from {finetuning_args.ref_model}")
+        logger.info_rank0(f"Created reference model from {finetuning_args.ref_model}")
     else:
         if finetuning_args.finetuning_type == "lora":
             ref_model = None
@@ -127,7 +127,7 @@ def create_ref_model(
             ref_model = load_model(
                 tokenizer, ref_model_args, ref_finetuning_args, is_trainable=False, add_valuehead=add_valuehead
             )
-            logger.info("Created reference model from the model itself.")
+            logger.info_rank0("Created reference model from the model itself.")
 
     return ref_model
 
@@ -140,7 +140,7 @@ def create_reward_model(
     """
    if finetuning_args.reward_model_type == "api":
         assert finetuning_args.reward_model.startswith("http"), "Please provide full url."
-        logger.info(f"Use reward server {finetuning_args.reward_model}")
+        logger.info_rank0(f"Use reward server {finetuning_args.reward_model}")
         return finetuning_args.reward_model
     elif finetuning_args.reward_model_type == "lora":
         model.pretrained_model.load_adapter(finetuning_args.reward_model, "reward")
@@ -157,7 +157,7 @@ def create_reward_model(
         model.register_buffer(
             "default_head_bias", torch.zeros_like(vhead_params["v_head.summary.bias"]), persistent=False
         )
-        logger.info(f"Loaded adapter weights of reward model from {finetuning_args.reward_model}")
+        logger.info_rank0(f"Loaded adapter weights of reward model from {finetuning_args.reward_model}")
         return None
     else:
         reward_model_args = ModelArguments.copyfrom(
@@ -171,8 +171,8 @@ def create_reward_model(
         reward_model = load_model(
             tokenizer, reward_model_args, reward_finetuning_args, is_trainable=False, add_valuehead=True
         )
-        logger.info(f"Loaded full weights of reward model from {finetuning_args.reward_model}")
-        logger.warning("Please ensure the ppo model and reward model share SAME tokenizer and vocabulary.")
+        logger.info_rank0(f"Loaded full weights of reward model from {finetuning_args.reward_model}")
+        logger.warning_rank0("Please ensure the ppo model and reward model share SAME tokenizer and vocabulary.")
         return reward_model
 
@@ -265,7 +265,7 @@ def _create_galore_optimizer(
         ]
         optimizer = optim_class(param_groups, **optim_kwargs)
 
-    logger.info("Using GaLore optimizer, may cause hanging at the start of training, wait patiently.")
+    logger.info_rank0("Using GaLore optimizer, may cause hanging at the start of training, wait patiently.")
     return optimizer
 
@@ -305,7 +305,7 @@ def _create_loraplus_optimizer(
         dict(params=param_dict["embedding"], lr=embedding_lr, weight_decay=training_args.weight_decay),
     ]
     optimizer = optim_class(param_groups, **optim_kwargs)
-    logger.info(f"Using LoRA+ optimizer with loraplus lr ratio {finetuning_args.loraplus_lr_ratio:.2f}.")
+    logger.info_rank0(f"Using LoRA+ optimizer with loraplus lr ratio {finetuning_args.loraplus_lr_ratio:.2f}.")
     return optimizer
 
@@ -343,7 +343,7 @@ def _create_badam_optimizer(
             verbose=finetuning_args.badam_verbose,
             ds_zero3_enabled=is_deepspeed_zero3_enabled(),
         )
-        logger.info(
+        logger.info_rank0(
             f"Using BAdam optimizer with layer-wise update, switch mode is {finetuning_args.badam_switch_mode}, "
             f"switch block every {finetuning_args.badam_switch_interval} steps, "
             f"default start block is {finetuning_args.badam_start_block}"
@@ -362,7 +362,7 @@ def _create_badam_optimizer(
             include_embedding=False,
             **optim_kwargs,
         )
-        logger.info(
+        logger.info_rank0(
             f"Using BAdam optimizer with ratio-based update, update ratio is {finetuning_args.badam_update_ratio}, "
             f"mask mode is {finetuning_args.badam_mask_mode}"
         )
@@ -391,7 +391,7 @@ def _create_adam_mini_optimizer(
         n_heads=num_q_head,
         n_kv_heads=num_kv_head,
     )
-    logger.info("Using Adam-mini optimizer.")
+    logger.info_rank0("Using Adam-mini optimizer.")
     return optimizer
 

src/llamafactory/train/tuner.py:

@@ -20,8 +20,8 @@ import torch
 from transformers import PreTrainedModel
 
 from ..data import get_template_and_fix_tokenizer
+from ..extras import logging
 from ..extras.constants import V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME
-from ..extras.logging import get_logger
 from ..hparams import get_infer_args, get_train_args
 from ..model import load_model, load_tokenizer
 from .callbacks import LogCallback
@@ -37,7 +37,7 @@ if TYPE_CHECKING:
     from transformers import TrainerCallback
 
 
-logger = get_logger(__name__)
+logger = logging.get_logger(__name__)
 
 
 def run_exp(args: Optional[Dict[str, Any]] = None, callbacks: List["TrainerCallback"] = []) -> None:
@@ -91,7 +91,7 @@ def export_model(args: Optional[Dict[str, Any]] = None) -> None:
 
         setattr(model.config, "torch_dtype", output_dtype)
         model = model.to(output_dtype)
-        logger.info(f"Convert model dtype to: {output_dtype}.")
+        logger.info_rank0(f"Convert model dtype to: {output_dtype}.")
 
     model.save_pretrained(
         save_directory=model_args.export_dir,
@@ -117,13 +117,13 @@ def export_model(args: Optional[Dict[str, Any]] = None) -> None:
                 os.path.join(vhead_path, V_HEAD_SAFE_WEIGHTS_NAME),
                 os.path.join(model_args.export_dir, V_HEAD_SAFE_WEIGHTS_NAME),
             )
-            logger.info(f"Copied valuehead to {model_args.export_dir}.")
+            logger.info_rank0(f"Copied valuehead to {model_args.export_dir}.")
         elif os.path.exists(os.path.join(vhead_path, V_HEAD_WEIGHTS_NAME)):
             shutil.copy(
                 os.path.join(vhead_path, V_HEAD_WEIGHTS_NAME),
                 os.path.join(model_args.export_dir, V_HEAD_WEIGHTS_NAME),
             )
-            logger.info(f"Copied valuehead to {model_args.export_dir}.")
+            logger.info_rank0(f"Copied valuehead to {model_args.export_dir}.")
 
     try:
         tokenizer.padding_side = "left"  # restore padding side
@@ -138,4 +138,4 @@ def export_model(args: Optional[Dict[str, Any]] = None) -> None:
             processor.push_to_hub(model_args.export_hub_model_id, token=model_args.hf_hub_token)
 
     except Exception as e:
-        logger.warning(f"Cannot save tokenizer, please copy the files manually: {e}.")
+        logger.warning_rank0(f"Cannot save tokenizer, please copy the files manually: {e}.")
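
Taken together, the migration is mechanical across these files: import the package's `logging` module instead of the `get_logger` helper, build the logger the same way, and pick the rank-aware variant of each call. In sketch form (the import path assumes the repository's src/llamafactory layout):

# Before:
#   from ..extras.logging import get_logger
#   logger = get_logger(__name__)
#   logger.info("printed once per rank")

# After:
from llamafactory.extras import logging

logger = logging.get_logger(__name__)
logger.info_rank0("printed on the main process only")
logger.warning_rank0("rank-aware warning")
logger.warning_once("emitted a single time per process")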