support rank0 logger

hiyouga
2024-11-02 18:31:04 +08:00
parent bd08b8c441
commit c38aa29336
42 changed files with 316 additions and 252 deletions
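
This commit replaces the per-module `get_logger` helper from `extras.logging` with a project-wide `logging` module whose loggers expose rank-aware methods: `info_rank0` and `warning_rank0` emit only on the main process of a distributed run, and `warning_once` additionally deduplicates repeated messages. The new `extras/logging.py` itself is not part of this excerpt; the following is a minimal sketch of how such methods can be implemented, assuming the `LOCAL_RANK` environment variable that `torchrun` sets (the class and helper names here are illustrative, not the commit's actual code).

import logging
import os
from functools import lru_cache


def _is_rank0() -> bool:
    # torchrun/accelerate set LOCAL_RANK; a single-process run defaults to rank 0.
    return int(os.getenv("LOCAL_RANK", "0")) == 0


class _Rank0Logger(logging.Logger):
    def info_rank0(self, *args, **kwargs) -> None:
        if _is_rank0():
            self.info(*args, **kwargs)

    def warning_rank0(self, *args, **kwargs) -> None:
        if _is_rank0():
            self.warning(*args, **kwargs)

    @lru_cache(maxsize=None)
    def warning_once(self, *args, **kwargs) -> None:
        # lru_cache drops repeated calls with identical arguments, so the
        # same warning is emitted at most once per process.
        if _is_rank0():
            self.warning(*args, **kwargs)


def get_logger(name: str) -> "_Rank0Logger":
    # Only loggers created while the subclass is swapped in get the extra methods.
    logging.setLoggerClass(_Rank0Logger)
    try:
        return logging.getLogger(name)  # type: ignore[return-value]
    finally:
        logging.setLoggerClass(logging.Logger)

With this shape, every rank can call `logger.info_rank0(...)` unconditionally and a `torchrun --nproc_per_node=8` job still prints each message once.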

src/llamafactory/train/callbacks.py

@@ -13,7 +13,6 @@
# limitations under the License.
import json
-import logging
import os
import signal
import sys
@@ -34,8 +33,8 @@ from transformers.utils import (
)
from typing_extensions import override
+from ..extras import logging
from ..extras.constants import TRAINER_LOG, V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME
-from ..extras.logging import LoggerHandler, get_logger
from ..extras.misc import get_peak_memory
@@ -48,7 +47,7 @@ if TYPE_CHECKING:
from trl import AutoModelForCausalLMWithValueHead
-logger = get_logger(__name__)
+logger = logging.get_logger(__name__)
def fix_valuehead_checkpoint(
@@ -92,7 +91,7 @@ def fix_valuehead_checkpoint(
else:
torch.save(v_head_state_dict, os.path.join(output_dir, V_HEAD_WEIGHTS_NAME))
logger.info(f"Value head model saved at: {output_dir}")
logger.info_rank0(f"Value head model saved at: {output_dir}")
class FixValueHeadModelCallback(TrainerCallback):
@@ -145,7 +144,7 @@ class PissaConvertCallback(TrainerCallback):
if args.should_save:
model = kwargs.pop("model")
pissa_init_dir = os.path.join(args.output_dir, "pissa_init")
logger.info(f"Initial PiSSA adapter will be saved at: {pissa_init_dir}.")
logger.info_rank0(f"Initial PiSSA adapter will be saved at: {pissa_init_dir}.")
if isinstance(model, PeftModel):
init_lora_weights = getattr(model.peft_config["default"], "init_lora_weights")
setattr(model.peft_config["default"], "init_lora_weights", True)
@@ -159,7 +158,7 @@ class PissaConvertCallback(TrainerCallback):
pissa_init_dir = os.path.join(args.output_dir, "pissa_init")
pissa_backup_dir = os.path.join(args.output_dir, "pissa_backup")
pissa_convert_dir = os.path.join(args.output_dir, "pissa_converted")
logger.info(f"Converted PiSSA adapter will be saved at: {pissa_convert_dir}.")
logger.info_rank0(f"Converted PiSSA adapter will be saved at: {pissa_convert_dir}.")
# 1. save a pissa backup with init_lora_weights: True
# 2. save a converted lora with init_lora_weights: pissa
# 3. load the pissa backup with init_lora_weights: True
@@ -200,8 +199,8 @@ class LogCallback(TrainerCallback):
self.webui_mode = os.environ.get("LLAMABOARD_ENABLED", "0").lower() in ["true", "1"]
if self.webui_mode:
signal.signal(signal.SIGABRT, self._set_abort)
-self.logger_handler = LoggerHandler(os.environ.get("LLAMABOARD_WORKDIR"))
-logging.root.addHandler(self.logger_handler)
+self.logger_handler = logging.LoggerHandler(os.environ.get("LLAMABOARD_WORKDIR"))
+logging.add_handler(self.logger_handler)
transformers.logging.add_handler(self.logger_handler)
def _set_abort(self, signum, frame) -> None:
@@ -243,7 +242,7 @@ class LogCallback(TrainerCallback):
and os.path.exists(os.path.join(args.output_dir, TRAINER_LOG))
and args.overwrite_output_dir
):
logger.warning("Previous trainer log in this folder will be deleted.")
logger.warning_once("Previous trainer log in this folder will be deleted.")
os.remove(os.path.join(args.output_dir, TRAINER_LOG))
@override
@@ -310,7 +309,7 @@ class LogCallback(TrainerCallback):
logs = {k: v for k, v in logs.items() if v is not None}
if self.webui_mode and all(key in logs for key in ["loss", "learning_rate", "epoch"]):
-logger.info(
+logger.info_rank0(
"{{'loss': {:.4f}, 'learning_rate': {:2.4e}, 'epoch': {:.2f}, 'throughput': {}}}".format(
logs["loss"], logs["learning_rate"], logs["epoch"], logs.get("throughput", "N/A")
)
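
One hunk above changes more than a method name: `LogCallback` used to attach the LlamaBoard `LoggerHandler` to the stdlib root logger (`logging.root.addHandler`), and now goes through a module-level `logging.add_handler`, symmetric with the `transformers.logging.add_handler` call on the next line. A plausible sketch of that helper, assuming the project's loggers hang off a single package root (the root logger name is an assumption):

import logging as _logging  # stdlib; aliased to avoid clashing with extras.logging

_ROOT_NAME = "llamafactory"  # assumed package root logger name


def add_handler(handler: _logging.Handler) -> None:
    # Attach a handler (e.g. LoggerHandler) to the package root logger only,
    # instead of the process-wide root logger.
    _logging.getLogger(_ROOT_NAME).addHandler(handler)


def remove_handler(handler: _logging.Handler) -> None:
    _logging.getLogger(_ROOT_NAME).removeHandler(handler)

Scoping the handler to the package root keeps LlamaBoard from capturing records emitted by unrelated libraries that also log through the stdlib root.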

src/llamafactory/train/ppo/trainer.py

@@ -37,7 +37,7 @@ from trl.core import PPODecorators, logprobs_from_logits
from trl.models.utils import unwrap_model_for_generation
from typing_extensions import override
-from ...extras.logging import get_logger
+from ...extras import logging
from ...extras.misc import AverageMeter, count_parameters, get_current_device, get_logits_processor
from ..callbacks import FixValueHeadModelCallback, SaveProcessorCallback
from ..trainer_utils import create_custom_optimizer, create_custom_scheduler
@@ -58,7 +58,7 @@ if TYPE_CHECKING:
from ...hparams import FinetuningArguments, GeneratingArguments, ModelArguments
-logger = get_logger(__name__)
+logger = logging.get_logger(__name__)
class CustomPPOTrainer(PPOTrainer, Trainer):
@@ -112,7 +112,7 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
]
ppo_config.accelerator_kwargs["deepspeed_plugin"] = training_args.deepspeed_plugin
if ppo_config.log_with is not None:
logger.warning("PPOTrainer cannot use external logger when DeepSpeed is enabled.")
logger.warning_rank0("PPOTrainer cannot use external logger when DeepSpeed is enabled.")
ppo_config.log_with = None
# Create optimizer and scheduler
@@ -160,7 +160,7 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
callbacks, self.accelerator.unwrap_model(self.model), self.tokenizer, self.optimizer, self.lr_scheduler
)
if self.args.max_steps > 0:
logger.info("max_steps is given, it will override any value given in num_train_epochs")
logger.info_rank0("max_steps is given, it will override any value given in num_train_epochs")
self.amp_context = torch.autocast(self.current_device.type)
warnings.simplefilter("ignore") # remove gc warnings on ref model
@@ -216,20 +216,19 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
self.state.is_local_process_zero = self.is_local_process_zero()
self.state.is_world_process_zero = self.is_world_process_zero()
if self.is_world_process_zero():
logger.info("***** Running training *****")
logger.info(f" Num examples = {num_examples:,}")
logger.info(f" Num Epochs = {num_train_epochs:,}")
logger.info(f" Instantaneous batch size per device = {self.args.per_device_train_batch_size:,}")
logger.info(
" Total train batch size (w. parallel, buffer, distributed & accumulation) = {:,}".format(
total_train_batch_size
)
logger.info_rank0("***** Running training *****")
logger.info_rank0(f" Num examples = {num_examples:,}")
logger.info_rank0(f" Num Epochs = {num_train_epochs:,}")
logger.info_rank0(f" Instantaneous batch size per device = {self.args.per_device_train_batch_size:,}")
logger.info_rank0(
" Total train batch size (w. parallel, buffer, distributed & accumulation) = {:,}".format(
total_train_batch_size
)
logger.info(f" Gradient Accumulation steps = {self.args.gradient_accumulation_steps:,}")
logger.info(f" Num optimization epochs per batch = {self.finetuning_args.ppo_epochs:,}")
logger.info(f" Total training steps = {max_steps:,}")
logger.info(f" Number of trainable parameters = {count_parameters(self.model)[0]:,}")
)
logger.info_rank0(f" Gradient Accumulation steps = {self.args.gradient_accumulation_steps:,}")
logger.info_rank0(f" Num optimization epochs per batch = {self.finetuning_args.ppo_epochs:,}")
logger.info_rank0(f" Total training steps = {max_steps:,}")
logger.info_rank0(f" Number of trainable parameters = {count_parameters(self.model)[0]:,}")
dataiter = iter(self.dataloader)
loss_meter = AverageMeter()
@@ -269,7 +268,7 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
batch["response"] = self.tokenizer.batch_decode(responses, skip_special_tokens=True)
self.log_stats(stats, batch, rewards)
except Exception:
logger.warning("Failed to save stats due to unknown errors.")
logger.warning_rank0("Failed to save stats due to unknown errors.")
self.state.global_step += 1
self.callback_handler.on_step_end(self.args, self.state, self.control)
@@ -498,7 +497,7 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
if self.args.should_save:
self._save(output_dir, state_dict=state_dict)
except ValueError:
-logger.warning(
+logger.warning_rank0(
" stage3_gather_16bit_weights_on_model_save=false. Saving the full checkpoint instead,"
" use zero_to_fp32.py to recover weights"
)
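
The training-banner hunk above is the commit's pattern in miniature: the rank check moves from the call site into the logger, so per-call guards like `is_world_process_zero()` become redundant for logging. A toy before/after, reusing the `get_logger` sketch from the top of this page (`max_steps` is a stand-in value):

import os

os.environ["LOCAL_RANK"] = "1"  # pretend to be a non-zero rank

logger = get_logger("demo")  # rank-aware logger from the sketch above
max_steps = 1_000

# Before: the caller filters ranks, or the line prints once per process.
if os.getenv("LOCAL_RANK", "0") == "0":
    logger.info(f" Total training steps = {max_steps:,}")

# After: every rank calls it unconditionally; only rank 0 emits.
logger.info_rank0(f" Total training steps = {max_steps:,}")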

src/llamafactory/train/pt/trainer.py

@@ -18,7 +18,6 @@ from typing import TYPE_CHECKING, Optional
from transformers import Trainer
from typing_extensions import override
-from ...extras.logging import get_logger
from ...extras.packages import is_transformers_version_equal_to_4_46
from ..callbacks import PissaConvertCallback, SaveProcessorCallback
from ..trainer_utils import create_custom_optimizer, create_custom_scheduler
@@ -31,9 +30,6 @@ if TYPE_CHECKING:
from ...hparams import FinetuningArguments
-logger = get_logger(__name__)
class CustomTrainer(Trainer):
r"""
Inherits Trainer for custom optimizer.

src/llamafactory/train/rm/trainer.py

@@ -24,7 +24,7 @@ import torch
from transformers import Trainer
from typing_extensions import override
-from ...extras.logging import get_logger
+from ...extras import logging
from ...extras.packages import is_transformers_version_equal_to_4_46
from ..callbacks import FixValueHeadModelCallback, PissaConvertCallback, SaveProcessorCallback
from ..trainer_utils import create_custom_optimizer, create_custom_scheduler
@@ -37,7 +37,7 @@ if TYPE_CHECKING:
from ...hparams import FinetuningArguments
-logger = get_logger(__name__)
+logger = logging.get_logger(__name__)
class PairwiseTrainer(Trainer):
@@ -118,7 +118,7 @@ class PairwiseTrainer(Trainer):
return
output_prediction_file = os.path.join(self.args.output_dir, "generated_predictions.jsonl")
logger.info(f"Saving prediction results to {output_prediction_file}")
logger.info_rank0(f"Saving prediction results to {output_prediction_file}")
chosen_scores, rejected_scores = predict_results.predictions
with open(output_prediction_file, "w", encoding="utf-8") as writer:

src/llamafactory/train/sft/trainer.py

@@ -25,8 +25,8 @@ import torch
from transformers import Seq2SeqTrainer
from typing_extensions import override
+from ...extras import logging
from ...extras.constants import IGNORE_INDEX
-from ...extras.logging import get_logger
from ...extras.packages import is_transformers_version_equal_to_4_46
from ..callbacks import PissaConvertCallback, SaveProcessorCallback
from ..trainer_utils import create_custom_optimizer, create_custom_scheduler
@@ -40,7 +40,7 @@ if TYPE_CHECKING:
from ...hparams import FinetuningArguments
-logger = get_logger(__name__)
+logger = logging.get_logger(__name__)
class CustomSeq2SeqTrainer(Seq2SeqTrainer):
@@ -142,7 +142,7 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
return
output_prediction_file = os.path.join(self.args.output_dir, "generated_predictions.jsonl")
logger.info(f"Saving prediction results to {output_prediction_file}")
logger.info_rank0(f"Saving prediction results to {output_prediction_file}")
labels = np.where(
predict_results.label_ids != IGNORE_INDEX, predict_results.label_ids, self.tokenizer.pad_token_id

src/llamafactory/train/trainer_utils.py

@@ -28,8 +28,8 @@ from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
from transformers.trainer_pt_utils import get_parameter_names
from typing_extensions import override
+from ..extras import logging
from ..extras.constants import IGNORE_INDEX
-from ..extras.logging import get_logger
from ..extras.packages import is_galore_available
from ..hparams import FinetuningArguments, ModelArguments
from ..model import find_all_linear_modules, load_model, load_tokenizer, load_valuehead_params
@@ -46,7 +46,7 @@ if TYPE_CHECKING:
from ..hparams import DataArguments
-logger = get_logger(__name__)
+logger = logging.get_logger(__name__)
class DummyOptimizer(torch.optim.Optimizer):
@@ -116,7 +116,7 @@ def create_ref_model(
ref_model = load_model(
tokenizer, ref_model_args, ref_finetuning_args, is_trainable=False, add_valuehead=add_valuehead
)
logger.info(f"Created reference model from {finetuning_args.ref_model}")
logger.info_rank0(f"Created reference model from {finetuning_args.ref_model}")
else:
if finetuning_args.finetuning_type == "lora":
ref_model = None
@@ -127,7 +127,7 @@ def create_ref_model(
ref_model = load_model(
tokenizer, ref_model_args, ref_finetuning_args, is_trainable=False, add_valuehead=add_valuehead
)
logger.info("Created reference model from the model itself.")
logger.info_rank0("Created reference model from the model itself.")
return ref_model
@@ -140,7 +140,7 @@ def create_reward_model(
"""
if finetuning_args.reward_model_type == "api":
assert finetuning_args.reward_model.startswith("http"), "Please provide full url."
logger.info(f"Use reward server {finetuning_args.reward_model}")
logger.info_rank0(f"Use reward server {finetuning_args.reward_model}")
return finetuning_args.reward_model
elif finetuning_args.reward_model_type == "lora":
model.pretrained_model.load_adapter(finetuning_args.reward_model, "reward")
@@ -157,7 +157,7 @@ def create_reward_model(
model.register_buffer(
"default_head_bias", torch.zeros_like(vhead_params["v_head.summary.bias"]), persistent=False
)
logger.info(f"Loaded adapter weights of reward model from {finetuning_args.reward_model}")
logger.info_rank0(f"Loaded adapter weights of reward model from {finetuning_args.reward_model}")
return None
else:
reward_model_args = ModelArguments.copyfrom(
@@ -171,8 +171,8 @@ def create_reward_model(
reward_model = load_model(
tokenizer, reward_model_args, reward_finetuning_args, is_trainable=False, add_valuehead=True
)
logger.info(f"Loaded full weights of reward model from {finetuning_args.reward_model}")
logger.warning("Please ensure the ppo model and reward model share SAME tokenizer and vocabulary.")
logger.info_rank0(f"Loaded full weights of reward model from {finetuning_args.reward_model}")
logger.warning_rank0("Please ensure the ppo model and reward model share SAME tokenizer and vocabulary.")
return reward_model
@@ -265,7 +265,7 @@ def _create_galore_optimizer(
]
optimizer = optim_class(param_groups, **optim_kwargs)
logger.info("Using GaLore optimizer, may cause hanging at the start of training, wait patiently.")
logger.info_rank0("Using GaLore optimizer, may cause hanging at the start of training, wait patiently.")
return optimizer
@@ -305,7 +305,7 @@ def _create_loraplus_optimizer(
dict(params=param_dict["embedding"], lr=embedding_lr, weight_decay=training_args.weight_decay),
]
optimizer = optim_class(param_groups, **optim_kwargs)
logger.info(f"Using LoRA+ optimizer with loraplus lr ratio {finetuning_args.loraplus_lr_ratio:.2f}.")
logger.info_rank0(f"Using LoRA+ optimizer with loraplus lr ratio {finetuning_args.loraplus_lr_ratio:.2f}.")
return optimizer
@@ -343,7 +343,7 @@ def _create_badam_optimizer(
verbose=finetuning_args.badam_verbose,
ds_zero3_enabled=is_deepspeed_zero3_enabled(),
)
-logger.info(
+logger.info_rank0(
f"Using BAdam optimizer with layer-wise update, switch mode is {finetuning_args.badam_switch_mode}, "
f"switch block every {finetuning_args.badam_switch_interval} steps, "
f"default start block is {finetuning_args.badam_start_block}"
@@ -362,7 +362,7 @@ def _create_badam_optimizer(
include_embedding=False,
**optim_kwargs,
)
-logger.info(
+logger.info_rank0(
f"Using BAdam optimizer with ratio-based update, update ratio is {finetuning_args.badam_update_ratio}, "
f"mask mode is {finetuning_args.badam_mask_mode}"
)
@@ -391,7 +391,7 @@ def _create_adam_mini_optimizer(
n_heads=num_q_head,
n_kv_heads=num_kv_head,
)
logger.info("Using Adam-mini optimizer.")
logger.info_rank0("Using Adam-mini optimizer.")
return optimizer

src/llamafactory/train/tuner.py

@@ -20,8 +20,8 @@ import torch
from transformers import PreTrainedModel
from ..data import get_template_and_fix_tokenizer
+from ..extras import logging
from ..extras.constants import V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME
-from ..extras.logging import get_logger
from ..hparams import get_infer_args, get_train_args
from ..model import load_model, load_tokenizer
from .callbacks import LogCallback
@@ -37,7 +37,7 @@ if TYPE_CHECKING:
from transformers import TrainerCallback
-logger = get_logger(__name__)
+logger = logging.get_logger(__name__)
def run_exp(args: Optional[Dict[str, Any]] = None, callbacks: List["TrainerCallback"] = []) -> None:
@@ -91,7 +91,7 @@ def export_model(args: Optional[Dict[str, Any]] = None) -> None:
setattr(model.config, "torch_dtype", output_dtype)
model = model.to(output_dtype)
logger.info(f"Convert model dtype to: {output_dtype}.")
logger.info_rank0(f"Convert model dtype to: {output_dtype}.")
model.save_pretrained(
save_directory=model_args.export_dir,
@@ -117,13 +117,13 @@ def export_model(args: Optional[Dict[str, Any]] = None) -> None:
os.path.join(vhead_path, V_HEAD_SAFE_WEIGHTS_NAME),
os.path.join(model_args.export_dir, V_HEAD_SAFE_WEIGHTS_NAME),
)
logger.info(f"Copied valuehead to {model_args.export_dir}.")
logger.info_rank0(f"Copied valuehead to {model_args.export_dir}.")
elif os.path.exists(os.path.join(vhead_path, V_HEAD_WEIGHTS_NAME)):
shutil.copy(
os.path.join(vhead_path, V_HEAD_WEIGHTS_NAME),
os.path.join(model_args.export_dir, V_HEAD_WEIGHTS_NAME),
)
logger.info(f"Copied valuehead to {model_args.export_dir}.")
logger.info_rank0(f"Copied valuehead to {model_args.export_dir}.")
try:
tokenizer.padding_side = "left" # restore padding side
@@ -138,4 +138,4 @@ def export_model(args: Optional[Dict[str, Any]] = None) -> None:
processor.push_to_hub(model_args.export_hub_model_id, token=model_args.hf_hub_token)
except Exception as e:
logger.warning(f"Cannot save tokenizer, please copy the files manually: {e}.")
logger.warning_rank0(f"Cannot save tokenizer, please copy the files manually: {e}.")