mirror of
https://github.com/facebookresearch/pytorch3d.git
synced 2025-08-03 04:12:48 +08:00
option to avoid accelerate
Summary: For debugging, introduce the PYTORCH3D_NO_ACCELERATE environment variable. When it is set (to any value), training runs without an Accelerator.
Reviewed By: shapovalov
Differential Revision: D37885393
fbshipit-source-id: de080080c0aa4b6d874028937083a0113bb97c23
This commit is contained in:
parent
0f966217e5
commit
9b2e570536
@ -97,13 +97,16 @@ try:
|
|||||||
except ModuleNotFoundError:
|
except ModuleNotFoundError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
no_accelerate = os.environ.get("PYTORCH3D_NO_ACCELERATE") is not None
|
||||||
|
|
||||||
|
|
||||||
def init_model(
|
def init_model(
|
||||||
|
*,
|
||||||
cfg: DictConfig,
|
cfg: DictConfig,
|
||||||
|
accelerator: Optional[Accelerator] = None,
|
||||||
force_load: bool = False,
|
force_load: bool = False,
|
||||||
clear_stats: bool = False,
|
clear_stats: bool = False,
|
||||||
load_model_only: bool = False,
|
load_model_only: bool = False,
|
||||||
accelerator: Accelerator = None,
|
|
||||||
) -> Tuple[GenericModel, Stats, Optional[Dict[str, Any]]]:
|
) -> Tuple[GenericModel, Stats, Optional[Dict[str, Any]]]:
|
||||||
"""
|
"""
|
||||||
Returns an instance of `GenericModel`.
|
Returns an instance of `GenericModel`.
|
||||||
@ -166,7 +169,7 @@ def init_model(
|
|||||||
logger.info(" -> resuming")
|
logger.info(" -> resuming")
|
||||||
|
|
||||||
map_location = None
|
map_location = None
|
||||||
if not accelerator.is_local_main_process:
|
if accelerator is not None and not accelerator.is_local_main_process:
|
||||||
map_location = {
|
map_location = {
|
||||||
"cuda:%d" % 0: "cuda:%d" % accelerator.local_process_index
|
"cuda:%d" % 0: "cuda:%d" % accelerator.local_process_index
|
||||||
}
|
}
|
||||||
@ -228,13 +231,14 @@ def trainvalidate(
|
|||||||
loader,
|
loader,
|
||||||
optimizer,
|
optimizer,
|
||||||
validation: bool,
|
validation: bool,
|
||||||
|
*,
|
||||||
|
accelerator: Optional[Accelerator],
|
||||||
|
device: torch.device,
|
||||||
bp_var: str = "objective",
|
bp_var: str = "objective",
|
||||||
metric_print_interval: int = 5,
|
metric_print_interval: int = 5,
|
||||||
visualize_interval: int = 100,
|
visualize_interval: int = 100,
|
||||||
visdom_env_root: str = "trainvalidate",
|
visdom_env_root: str = "trainvalidate",
|
||||||
clip_grad: float = 0.0,
|
clip_grad: float = 0.0,
|
||||||
device: str = "cuda:0",
|
|
||||||
accelerator: Accelerator = None,
|
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""
|
"""
|
||||||
@ -286,7 +290,7 @@ def trainvalidate(
|
|||||||
last_iter = it == n_batches - 1
|
last_iter = it == n_batches - 1
|
||||||
|
|
||||||
# move to gpu where possible (in place)
|
# move to gpu where possible (in place)
|
||||||
net_input = net_input.to(accelerator.device)
|
net_input = net_input.to(device)
|
||||||
|
|
||||||
# run the forward pass
|
# run the forward pass
|
||||||
if not validation:
|
if not validation:
|
||||||
@ -313,7 +317,7 @@ def trainvalidate(
|
|||||||
|
|
||||||
# visualize results
|
# visualize results
|
||||||
if (
|
if (
|
||||||
accelerator.is_local_main_process
|
(accelerator is None or accelerator.is_local_main_process)
|
||||||
and visualize_interval > 0
|
and visualize_interval > 0
|
||||||
and it % visualize_interval == 0
|
and it % visualize_interval == 0
|
||||||
):
|
):
|
||||||
@ -331,6 +335,9 @@ def trainvalidate(
|
|||||||
loss = preds[bp_var]
|
loss = preds[bp_var]
|
||||||
assert torch.isfinite(loss).all(), "Non-finite loss!"
|
assert torch.isfinite(loss).all(), "Non-finite loss!"
|
||||||
# backprop
|
# backprop
|
||||||
|
if accelerator is None:
|
||||||
|
loss.backward()
|
||||||
|
else:
|
||||||
accelerator.backward(loss)
|
accelerator.backward(loss)
|
||||||
if clip_grad > 0.0:
|
if clip_grad > 0.0:
|
||||||
# Optionally clip the gradient norms.
|
# Optionally clip the gradient norms.
|
||||||
@ -353,14 +360,15 @@ def run_training(cfg: DictConfig) -> None:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
# Initialize the accelerator
|
# Initialize the accelerator
|
||||||
|
if no_accelerate:
|
||||||
|
accelerator = None
|
||||||
|
device = torch.device("cuda:0")
|
||||||
|
else:
|
||||||
accelerator = Accelerator(device_placement=False)
|
accelerator = Accelerator(device_placement=False)
|
||||||
logger.info(accelerator.state)
|
logger.info(accelerator.state)
|
||||||
|
|
||||||
device = accelerator.device
|
device = accelerator.device
|
||||||
logger.info(f"Running experiment on device: {device}")
|
|
||||||
|
|
||||||
if accelerator.is_local_main_process:
|
logger.info(f"Running experiment on device: {device}")
|
||||||
logger.info(OmegaConf.to_yaml(cfg))
|
|
||||||
|
|
||||||
# set the debug mode
|
# set the debug mode
|
||||||
if cfg.detect_anomaly:
|
if cfg.detect_anomaly:
|
||||||
@ -386,11 +394,11 @@ def run_training(cfg: DictConfig) -> None:
|
|||||||
all_train_cameras = datasource.get_all_train_cameras()
|
all_train_cameras = datasource.get_all_train_cameras()
|
||||||
|
|
||||||
# init the model
|
# init the model
|
||||||
model, stats, optimizer_state = init_model(cfg, accelerator=accelerator)
|
model, stats, optimizer_state = init_model(cfg=cfg, accelerator=accelerator)
|
||||||
start_epoch = stats.epoch + 1
|
start_epoch = stats.epoch + 1
|
||||||
|
|
||||||
# move model to gpu
|
# move model to gpu
|
||||||
model.to(accelerator.device)
|
model.to(device)
|
||||||
|
|
||||||
# only run evaluation on the test dataloader
|
# only run evaluation on the test dataloader
|
||||||
if cfg.eval_only:
|
if cfg.eval_only:
|
||||||
@ -403,7 +411,6 @@ def run_training(cfg: DictConfig) -> None:
|
|||||||
model,
|
model,
|
||||||
stats,
|
stats,
|
||||||
device=device,
|
device=device,
|
||||||
accelerator=accelerator,
|
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
@ -422,12 +429,15 @@ def run_training(cfg: DictConfig) -> None:
|
|||||||
# Wrap all modules in the distributed library
|
# Wrap all modules in the distributed library
|
||||||
# Note: we don't pass the scheduler to prepare as it
|
# Note: we don't pass the scheduler to prepare as it
|
||||||
# doesn't need to be stepped at each optimizer step
|
# doesn't need to be stepped at each optimizer step
|
||||||
|
train_loader = dataloaders.train
|
||||||
|
val_loader = dataloaders.val
|
||||||
|
if accelerator is not None:
|
||||||
(
|
(
|
||||||
model,
|
model,
|
||||||
optimizer,
|
optimizer,
|
||||||
train_loader,
|
train_loader,
|
||||||
val_loader,
|
val_loader,
|
||||||
) = accelerator.prepare(model, optimizer, dataloaders.train, dataloaders.val)
|
) = accelerator.prepare(model, optimizer, train_loader, val_loader)
|
||||||
|
|
||||||
past_scheduler_lrs = []
|
past_scheduler_lrs = []
|
||||||
# loop through epochs
|
# loop through epochs
|
||||||
@ -485,19 +495,22 @@ def run_training(cfg: DictConfig) -> None:
|
|||||||
task,
|
task,
|
||||||
camera_difficulty_bin_breaks=cfg.camera_difficulty_bin_breaks,
|
camera_difficulty_bin_breaks=cfg.camera_difficulty_bin_breaks,
|
||||||
device=device,
|
device=device,
|
||||||
accelerator=accelerator,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
assert stats.epoch == epoch, "inconsistent stats!"
|
assert stats.epoch == epoch, "inconsistent stats!"
|
||||||
|
|
||||||
# delete previous models if required
|
# delete previous models if required
|
||||||
# save model only on the main process
|
# save model only on the main process
|
||||||
if cfg.store_checkpoints and accelerator.is_local_main_process:
|
if cfg.store_checkpoints and (
|
||||||
|
accelerator is None or accelerator.is_local_main_process
|
||||||
|
):
|
||||||
if cfg.store_checkpoints_purge > 0:
|
if cfg.store_checkpoints_purge > 0:
|
||||||
for prev_epoch in range(epoch - cfg.store_checkpoints_purge):
|
for prev_epoch in range(epoch - cfg.store_checkpoints_purge):
|
||||||
model_io.purge_epoch(cfg.exp_dir, prev_epoch)
|
model_io.purge_epoch(cfg.exp_dir, prev_epoch)
|
||||||
outfile = model_io.get_checkpoint(cfg.exp_dir, epoch)
|
outfile = model_io.get_checkpoint(cfg.exp_dir, epoch)
|
||||||
unwrapped_model = accelerator.unwrap_model(model)
|
unwrapped_model = (
|
||||||
|
model if accelerator is None else accelerator.unwrap_model(model)
|
||||||
|
)
|
||||||
model_io.safe_save_model(
|
model_io.safe_save_model(
|
||||||
unwrapped_model, stats, outfile, optimizer=optimizer
|
unwrapped_model, stats, outfile, optimizer=optimizer
|
||||||
)
|
)
|
||||||
@ -530,7 +543,6 @@ def _eval_and_dump(
|
|||||||
model,
|
model,
|
||||||
stats,
|
stats,
|
||||||
device,
|
device,
|
||||||
accelerator: Accelerator = None,
|
|
||||||
) -> None:
|
) -> None:
|
||||||
"""
|
"""
|
||||||
Run the evaluation loop with the test data loader and
|
Run the evaluation loop with the test data loader and
|
||||||
@ -549,7 +561,6 @@ def _eval_and_dump(
|
|||||||
task,
|
task,
|
||||||
camera_difficulty_bin_breaks=cfg.camera_difficulty_bin_breaks,
|
camera_difficulty_bin_breaks=cfg.camera_difficulty_bin_breaks,
|
||||||
device=device,
|
device=device,
|
||||||
accelerator=accelerator,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# add the evaluation epoch to the results
|
# add the evaluation epoch to the results
|
||||||
@ -584,20 +595,19 @@ def _run_eval(
|
|||||||
task: Task,
|
task: Task,
|
||||||
camera_difficulty_bin_breaks: Tuple[float, float],
|
camera_difficulty_bin_breaks: Tuple[float, float],
|
||||||
device,
|
device,
|
||||||
accelerator: Accelerator = None,
|
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Run the evaluation loop on the test dataloader
|
Run the evaluation loop on the test dataloader
|
||||||
"""
|
"""
|
||||||
lpips_model = lpips.LPIPS(net="vgg")
|
lpips_model = lpips.LPIPS(net="vgg")
|
||||||
lpips_model = lpips_model.to(accelerator.device)
|
lpips_model = lpips_model.to(device)
|
||||||
|
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
per_batch_eval_results = []
|
per_batch_eval_results = []
|
||||||
logger.info("Evaluating model ...")
|
logger.info("Evaluating model ...")
|
||||||
for frame_data in tqdm.tqdm(loader):
|
for frame_data in tqdm.tqdm(loader):
|
||||||
frame_data = frame_data.to(accelerator.device)
|
frame_data = frame_data.to(device)
|
||||||
|
|
||||||
# mask out the unknown images so that the model does not see them
|
# mask out the unknown images so that the model does not see them
|
||||||
frame_data_for_eval = _get_eval_frame_data(frame_data)
|
frame_data_for_eval = _get_eval_frame_data(frame_data)
|
||||||
|
@ -344,7 +344,7 @@ def export_scenes(
|
|||||||
os.environ["CUDA_VISIBLE_DEVICES"] = str(config.gpu_idx)
|
os.environ["CUDA_VISIBLE_DEVICES"] = str(config.gpu_idx)
|
||||||
|
|
||||||
# Load the previously trained model
|
# Load the previously trained model
|
||||||
model, _, _ = init_model(config, force_load=True, load_model_only=True)
|
model, _, _ = init_model(cfg=config, force_load=True, load_model_only=True)
|
||||||
model.cuda()
|
model.cuda()
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user