Mirror of https://github.com/facebookresearch/pytorch3d.git, synced 2025-08-02 20:02:49 +08:00
Summary:

## Changes
- Added the Accelerate library and refactored `experiment.py` to use it.
- Moved `init_optimizer` and `ExperimentConfig` to a separate file to be compatible with submitit/hydra.
- Made some modifications to the data loaders etc. so that they work well with the Accelerate DDP wrappers.
- Loading/saving checkpoints now incorporates an unwrapping step to remove the DDP-wrapped model (see the sketch after this summary).

## Tests
Tested with both `torchrun` and `submitit/hydra` on two GPUs locally. Here are the commands:

**Torchrun**

Modules loaded:
```sh
1) anaconda3/2021.05
2) cuda/11.3
3) NCCL/2.9.8-3-cuda.11.3
4) gcc/5.2.0 (but unload gcc when using submitit)
```

```sh
torchrun --nnodes=1 --nproc_per_node=2 experiment.py --config-path ./configs --config-name repro_singleseq_nerf_test
```

**Submitit/Hydra local test**
```sh
~/pytorch3d/projects/implicitron_trainer$ HYDRA_FULL_ERROR=1 python3.9 experiment.py --config-name repro_singleseq_nerf_test --multirun --config-path ./configs hydra/launcher=submitit_local hydra.launcher.gpus_per_node=2 hydra.launcher.tasks_per_node=2 hydra.launcher.nodes=1
```

**Submitit/Hydra distributed test**
```sh
~/implicitron/pytorch3d$ python3.9 experiment.py --config-name repro_singleseq_nerf_test --multirun --config-path ./configs hydra/launcher=submitit_slurm hydra.launcher.gpus_per_node=8 hydra.launcher.tasks_per_node=8 hydra.launcher.nodes=1 hydra.launcher.partition=learnlab hydra.launcher.timeout_min=4320
```

## TODOs
- Fix distributed evaluation: currently this doesn't work because the input format to the evaluation function is not suitable for gathering across GPUs (it needs to be nested lists/tuples/dicts of objects that satisfy `is_torch_tensor`), and `frame_data` currently contains the `Cameras` type.
- Refactor the `accelerator` object to be accessible by all functions instead of needing to pass it around everywhere? Maybe have a `Trainer` class and add it as a method?
- Update the README with installation instructions for Accelerate, and with commands for running jobs with torchrun and submitit/hydra.

X-link: https://github.com/fairinternal/pytorch3d/pull/37

Reviewed By: davnov134, kjchalup

Differential Revision: D37543870

Pulled By: bottler

fbshipit-source-id: be9eb4e91244d4fe3740d87dafec622ae1e0cf76
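For reference, the unwrap-on-save step described under Changes looks roughly like the following with the public Accelerate API. This is a minimal sketch with toy placeholders (`model`, `optimizer`, `checkpoint_path`), not the code from this diff:

```python
import torch
from accelerate import Accelerator

# Toy stand-ins; the real trainer builds GenericModel and the Implicitron
# data loaders via ExperimentConfig (see the file below).
model = torch.nn.Linear(4, 4)
optimizer = torch.optim.Adam(model.parameters())
checkpoint_path = "checkpoint.pth"  # hypothetical path

accelerator = Accelerator()
model, optimizer = accelerator.prepare(model, optimizer)  # wraps model in DDP

# Saving: strip the DDP wrapper so the checkpoint stores plain module weights.
accelerator.wait_for_everyone()
unwrapped = accelerator.unwrap_model(model)
if accelerator.is_main_process:
    torch.save(unwrapped.state_dict(), checkpoint_path)

# Loading: restore into the unwrapped module. The prepared (wrapped) model
# shares parameters with it, so training can resume directly.
accelerator.wait_for_everyone()
unwrapped.load_state_dict(torch.load(checkpoint_path, map_location="cpu"))
```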
50 lines · 1.7 KiB · Python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from dataclasses import field
from typing import Tuple

from omegaconf import DictConfig
from pytorch3d.implicitron.dataset.data_source import ImplicitronDataSource
from pytorch3d.implicitron.models.generic_model import GenericModel
from pytorch3d.implicitron.tools.config import Configurable, get_default_args_field

from .optimization import init_optimizer


class ExperimentConfig(Configurable):
    generic_model_args: DictConfig = get_default_args_field(GenericModel)
    solver_args: DictConfig = get_default_args_field(init_optimizer)
    data_source_args: DictConfig = get_default_args_field(ImplicitronDataSource)
    architecture: str = "generic"
    detect_anomaly: bool = False
    eval_only: bool = False
    exp_dir: str = "./data/default_experiment/"
    exp_idx: int = 0
    gpu_idx: int = 0
    metric_print_interval: int = 5
    resume: bool = True
    resume_epoch: int = -1
    seed: int = 0
    store_checkpoints: bool = True
    store_checkpoints_purge: int = 1
    test_interval: int = -1
    test_when_finished: bool = False
    validation_interval: int = 1
    visdom_env: str = ""
    visdom_port: int = 8097
    visdom_server: str = "http://127.0.0.1"
    visualize_interval: int = 1000
    clip_grad: float = 0.0
    camera_difficulty_bin_breaks: Tuple[float, ...] = 0.97, 0.98

    hydra: dict = field(
        default_factory=lambda: {
            "run": {"dir": "."},  # Make hydra not change the working dir.
            "output_subdir": None,  # disable storing the .hydra logs
        }
    )
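Since `ExperimentConfig` is a `Configurable`, one plausible way to materialize it as a plain OmegaConf `DictConfig` and apply overrides is via `get_default_args` from the same config module. This is a sketch of the pattern, not necessarily how `experiment.py` consumes it:

```python
from omegaconf import OmegaConf
from pytorch3d.implicitron.tools.config import get_default_args

# Expand ExperimentConfig (including the nested generic_model_args /
# solver_args / data_source_args) into a DictConfig of default values.
cfg = get_default_args(ExperimentConfig)

# Dotlist overrides merge like any other OmegaConf config, mirroring the
# hydra command-line overrides used in the test commands above.
cfg = OmegaConf.merge(
    cfg, OmegaConf.from_dotlist(["exp_dir=./data/my_run/", "seed=42"])
)
print(cfg.exp_dir, cfg.visdom_port)
```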