option to avoid accelerate

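Summary: For debugging, introduce the PYTORCH3D_NO_ACCELERATE environment variable. When it is set (to any value), the training script does not create an `Accelerator`: it passes `accelerator=None` through the training code and runs on the `cuda:0` device instead.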

Reviewed By: shapovalov

Differential Revision: D37885393

fbshipit-source-id: de080080c0aa4b6d874028937083a0113bb97c23
This commit is contained in:
Jeremy Reizenstein 2022-07-17 13:15:59 -07:00 committed by Facebook GitHub Bot
parent 0f966217e5
commit 9b2e570536
2 changed files with 41 additions and 31 deletions

View File

@ -97,13 +97,16 @@ try:
except ModuleNotFoundError: except ModuleNotFoundError:
pass pass
no_accelerate = os.environ.get("PYTORCH3D_NO_ACCELERATE") is not None
def init_model( def init_model(
*,
cfg: DictConfig, cfg: DictConfig,
accelerator: Optional[Accelerator] = None,
force_load: bool = False, force_load: bool = False,
clear_stats: bool = False, clear_stats: bool = False,
load_model_only: bool = False, load_model_only: bool = False,
accelerator: Accelerator = None,
) -> Tuple[GenericModel, Stats, Optional[Dict[str, Any]]]: ) -> Tuple[GenericModel, Stats, Optional[Dict[str, Any]]]:
""" """
Returns an instance of `GenericModel`. Returns an instance of `GenericModel`.
@ -166,7 +169,7 @@ def init_model(
logger.info(" -> resuming") logger.info(" -> resuming")
map_location = None map_location = None
if not accelerator.is_local_main_process: if accelerator is not None and not accelerator.is_local_main_process:
map_location = { map_location = {
"cuda:%d" % 0: "cuda:%d" % accelerator.local_process_index "cuda:%d" % 0: "cuda:%d" % accelerator.local_process_index
} }
@ -228,13 +231,14 @@ def trainvalidate(
loader, loader,
optimizer, optimizer,
validation: bool, validation: bool,
*,
accelerator: Optional[Accelerator],
device: torch.device,
bp_var: str = "objective", bp_var: str = "objective",
metric_print_interval: int = 5, metric_print_interval: int = 5,
visualize_interval: int = 100, visualize_interval: int = 100,
visdom_env_root: str = "trainvalidate", visdom_env_root: str = "trainvalidate",
clip_grad: float = 0.0, clip_grad: float = 0.0,
device: str = "cuda:0",
accelerator: Accelerator = None,
**kwargs, **kwargs,
) -> None: ) -> None:
""" """
@ -286,7 +290,7 @@ def trainvalidate(
last_iter = it == n_batches - 1 last_iter = it == n_batches - 1
# move to gpu where possible (in place) # move to gpu where possible (in place)
net_input = net_input.to(accelerator.device) net_input = net_input.to(device)
# run the forward pass # run the forward pass
if not validation: if not validation:
@ -313,7 +317,7 @@ def trainvalidate(
# visualize results # visualize results
if ( if (
accelerator.is_local_main_process (accelerator is None or accelerator.is_local_main_process)
and visualize_interval > 0 and visualize_interval > 0
and it % visualize_interval == 0 and it % visualize_interval == 0
): ):
@ -331,7 +335,10 @@ def trainvalidate(
loss = preds[bp_var] loss = preds[bp_var]
assert torch.isfinite(loss).all(), "Non-finite loss!" assert torch.isfinite(loss).all(), "Non-finite loss!"
# backprop # backprop
accelerator.backward(loss) if accelerator is None:
loss.backward()
else:
accelerator.backward(loss)
if clip_grad > 0.0: if clip_grad > 0.0:
# Optionally clip the gradient norms. # Optionally clip the gradient norms.
total_norm = torch.nn.utils.clip_grad_norm( total_norm = torch.nn.utils.clip_grad_norm(
@ -353,15 +360,16 @@ def run_training(cfg: DictConfig) -> None:
""" """
# Initialize the accelerator # Initialize the accelerator
accelerator = Accelerator(device_placement=False) if no_accelerate:
logger.info(accelerator.state) accelerator = None
device = torch.device("cuda:0")
else:
accelerator = Accelerator(device_placement=False)
logger.info(accelerator.state)
device = accelerator.device
device = accelerator.device
logger.info(f"Running experiment on device: {device}") logger.info(f"Running experiment on device: {device}")
if accelerator.is_local_main_process:
logger.info(OmegaConf.to_yaml(cfg))
# set the debug mode # set the debug mode
if cfg.detect_anomaly: if cfg.detect_anomaly:
logger.info("Anomaly detection!") logger.info("Anomaly detection!")
@ -386,11 +394,11 @@ def run_training(cfg: DictConfig) -> None:
all_train_cameras = datasource.get_all_train_cameras() all_train_cameras = datasource.get_all_train_cameras()
# init the model # init the model
model, stats, optimizer_state = init_model(cfg, accelerator=accelerator) model, stats, optimizer_state = init_model(cfg=cfg, accelerator=accelerator)
start_epoch = stats.epoch + 1 start_epoch = stats.epoch + 1
# move model to gpu # move model to gpu
model.to(accelerator.device) model.to(device)
# only run evaluation on the test dataloader # only run evaluation on the test dataloader
if cfg.eval_only: if cfg.eval_only:
@ -403,7 +411,6 @@ def run_training(cfg: DictConfig) -> None:
model, model,
stats, stats,
device=device, device=device,
accelerator=accelerator,
) )
return return
@ -422,12 +429,15 @@ def run_training(cfg: DictConfig) -> None:
# Wrap all modules in the distributed library # Wrap all modules in the distributed library
# Note: we don't pass the scheduler to prepare as it # Note: we don't pass the scheduler to prepare as it
# doesn't need to be stepped at each optimizer step # doesn't need to be stepped at each optimizer step
( train_loader = dataloaders.train
model, val_loader = dataloaders.val
optimizer, if accelerator is not None:
train_loader, (
val_loader, model,
) = accelerator.prepare(model, optimizer, dataloaders.train, dataloaders.val) optimizer,
train_loader,
val_loader,
) = accelerator.prepare(model, optimizer, train_loader, val_loader)
past_scheduler_lrs = [] past_scheduler_lrs = []
# loop through epochs # loop through epochs
@ -485,19 +495,22 @@ def run_training(cfg: DictConfig) -> None:
task, task,
camera_difficulty_bin_breaks=cfg.camera_difficulty_bin_breaks, camera_difficulty_bin_breaks=cfg.camera_difficulty_bin_breaks,
device=device, device=device,
accelerator=accelerator,
) )
assert stats.epoch == epoch, "inconsistent stats!" assert stats.epoch == epoch, "inconsistent stats!"
# delete previous models if required # delete previous models if required
# save model only on the main process # save model only on the main process
if cfg.store_checkpoints and accelerator.is_local_main_process: if cfg.store_checkpoints and (
accelerator is None or accelerator.is_local_main_process
):
if cfg.store_checkpoints_purge > 0: if cfg.store_checkpoints_purge > 0:
for prev_epoch in range(epoch - cfg.store_checkpoints_purge): for prev_epoch in range(epoch - cfg.store_checkpoints_purge):
model_io.purge_epoch(cfg.exp_dir, prev_epoch) model_io.purge_epoch(cfg.exp_dir, prev_epoch)
outfile = model_io.get_checkpoint(cfg.exp_dir, epoch) outfile = model_io.get_checkpoint(cfg.exp_dir, epoch)
unwrapped_model = accelerator.unwrap_model(model) unwrapped_model = (
model if accelerator is None else accelerator.unwrap_model(model)
)
model_io.safe_save_model( model_io.safe_save_model(
unwrapped_model, stats, outfile, optimizer=optimizer unwrapped_model, stats, outfile, optimizer=optimizer
) )
@ -530,7 +543,6 @@ def _eval_and_dump(
model, model,
stats, stats,
device, device,
accelerator: Accelerator = None,
) -> None: ) -> None:
""" """
Run the evaluation loop with the test data loader and Run the evaluation loop with the test data loader and
@ -549,7 +561,6 @@ def _eval_and_dump(
task, task,
camera_difficulty_bin_breaks=cfg.camera_difficulty_bin_breaks, camera_difficulty_bin_breaks=cfg.camera_difficulty_bin_breaks,
device=device, device=device,
accelerator=accelerator,
) )
# add the evaluation epoch to the results # add the evaluation epoch to the results
@ -584,20 +595,19 @@ def _run_eval(
task: Task, task: Task,
camera_difficulty_bin_breaks: Tuple[float, float], camera_difficulty_bin_breaks: Tuple[float, float],
device, device,
accelerator: Accelerator = None,
): ):
""" """
Run the evaluation loop on the test dataloader Run the evaluation loop on the test dataloader
""" """
lpips_model = lpips.LPIPS(net="vgg") lpips_model = lpips.LPIPS(net="vgg")
lpips_model = lpips_model.to(accelerator.device) lpips_model = lpips_model.to(device)
model.eval() model.eval()
per_batch_eval_results = [] per_batch_eval_results = []
logger.info("Evaluating model ...") logger.info("Evaluating model ...")
for frame_data in tqdm.tqdm(loader): for frame_data in tqdm.tqdm(loader):
frame_data = frame_data.to(accelerator.device) frame_data = frame_data.to(device)
# mask out the unknown images so that the model does not see them # mask out the unknown images so that the model does not see them
frame_data_for_eval = _get_eval_frame_data(frame_data) frame_data_for_eval = _get_eval_frame_data(frame_data)

View File

@ -344,7 +344,7 @@ def export_scenes(
os.environ["CUDA_VISIBLE_DEVICES"] = str(config.gpu_idx) os.environ["CUDA_VISIBLE_DEVICES"] = str(config.gpu_idx)
# Load the previously trained model # Load the previously trained model
model, _, _ = init_model(config, force_load=True, load_model_only=True) model, _, _ = init_model(cfg=config, force_load=True, load_model_only=True)
model.cuda() model.cuda()
model.eval() model.eval()