Mirror of https://github.com/facebookresearch/pytorch3d.git (synced 2025-08-02 03:42:50 +08:00)
Summary:

## Changes:
- Added the Accelerate library and refactored experiment.py to use it
- Needed to move `init_optimizer` and `ExperimentConfig` to a separate file to be compatible with submitit/hydra
- Needed to make some modifications to data loaders etc. to work well with the Accelerate DDP wrappers
- Loading/saving checkpoints incorporates an unwrapping step to remove the DDP-wrapped model (see the first sketch after this summary)

## Tests
Tested with both `torchrun` and `submitit/hydra` on two GPUs locally. Here are the commands:

**Torchrun**

Modules loaded:
```sh
1) anaconda3/2021.05
2) cuda/11.3
3) NCCL/2.9.8-3-cuda.11.3
4) gcc/5.2.0 (but unload gcc when using submitit)
```

```sh
torchrun --nnodes=1 --nproc_per_node=2 experiment.py --config-path ./configs --config-name repro_singleseq_nerf_test
```

**Submitit/Hydra local test**

```sh
~/pytorch3d/projects/implicitron_trainer$ HYDRA_FULL_ERROR=1 python3.9 experiment.py --config-name repro_singleseq_nerf_test --multirun --config-path ./configs hydra/launcher=submitit_local hydra.launcher.gpus_per_node=2 hydra.launcher.tasks_per_node=2 hydra.launcher.nodes=1
```

**Submitit/Hydra distributed test**

```sh
~/implicitron/pytorch3d$ python3.9 experiment.py --config-name repro_singleseq_nerf_test --multirun --config-path ./configs hydra/launcher=submitit_slurm hydra.launcher.gpus_per_node=8 hydra.launcher.tasks_per_node=8 hydra.launcher.nodes=1 hydra.launcher.partition=learnlab hydra.launcher.timeout_min=4320
```

## TODOs:
- Fix distributed evaluation: currently this doesn't work because the input format to the evaluation function is not suitable for gathering across GPUs (it needs to be nested lists/tuples/dicts of objects that satisfy `is_torch_tensor`), and currently `frame_data` contains the `Cameras` type (see the second sketch after this summary).
- Refactor the `accelerator` object to be accessible by all functions instead of needing to pass it around everywhere? Maybe have a `Trainer` class and add it as a method?
- Update the README with installation instructions for Accelerate and commands for running jobs with torchrun and submitit/hydra.

X-link: https://github.com/fairinternal/pytorch3d/pull/37

Reviewed By: davnov134, kjchalup

Differential Revision: D37543870

Pulled By: bottler

fbshipit-source-id: be9eb4e91244d4fe3740d87dafec622ae1e0cf76
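As a concrete illustration of the checkpoint unwrapping mentioned in the Changes list, here is a minimal sketch using the public HuggingFace Accelerate API. The model, optimizer, data loader, and output path are placeholders for this sketch, not the actual `experiment.py` code.

```python
import torch
from accelerate import Accelerator

accelerator = Accelerator()

# Placeholder model/optimizer/loader; experiment.py builds the real ones
# from the Implicitron config.
model = torch.nn.Linear(4, 4)
optimizer = torch.optim.Adam(model.parameters())
loader = torch.utils.data.DataLoader(torch.zeros(8, 4), batch_size=2)

# prepare() wraps the model in DDP (among other things) when the script is
# launched across multiple processes with torchrun or submitit.
model, optimizer, loader = accelerator.prepare(model, optimizer, loader)

# ... training loop ...

if accelerator.is_main_process:
    # Strip the DDP wrapper before saving so the checkpoint loads cleanly
    # in a non-distributed run as well.
    unwrapped_model = accelerator.unwrap_model(model)
    accelerator.save(unwrapped_model.state_dict(), "checkpoint.pth")
```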
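The first TODO comes down to `Accelerator.gather` accepting only tensors, or nested lists/tuples/dicts of tensors. Below is a hedged sketch of that constraint; the dictionary keys and values are illustrative, not the actual `frame_data` fields, and a `Cameras` object itself would have to be decomposed into plain tensors before gathering.

```python
import torch
from accelerate import Accelerator

accelerator = Accelerator()

# gather() only handles torch.Tensors (possibly nested in lists/tuples/dicts),
# so per-rank evaluation outputs must be reduced to tensor leaves first.
per_rank_eval = {
    "psnr": torch.tensor([31.2], device=accelerator.device),
    "camera_R": torch.eye(3, device=accelerator.device)[None],
    "camera_T": torch.zeros(1, 3, device=accelerator.device),
}

# Concatenates each leaf tensor across all processes.
all_eval = accelerator.gather(per_rank_eval)
```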
88 lines
3.0 KiB
Python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import os
import unittest
from pathlib import Path

import torch
from hydra import compose, initialize_config_dir
from omegaconf import OmegaConf

from .. import experiment


def interactive_testing_requested() -> bool:
    """
    Certain tests are only useful when run interactively, and so are not regularly run.
    These are activated by this function returning True, which the user requests by
    setting the environment variable `PYTORCH3D_INTERACTIVE_TESTING` to 1.
    """
    return os.environ.get("PYTORCH3D_INTERACTIVE_TESTING", "") == "1"


DATA_DIR = Path(__file__).resolve().parent
IMPLICITRON_CONFIGS_DIR = Path(__file__).resolve().parent.parent / "configs"
# When True, test_yaml_contents writes out a fresh reference experiment.yaml
# before comparing against it.
DEBUG: bool = False

# TODO:
# - add enough files to skateboard_first_5 that this works on RE.
# - share common code with PyTorch3D tests?
# - deal with the temporary output files this test creates


class TestExperiment(unittest.TestCase):
    def setUp(self):
        self.maxDiff = None

    def test_from_defaults(self):
        # Test making minimal changes to the dataclass defaults.
        if not interactive_testing_requested():
            return
        cfg = OmegaConf.structured(experiment.ExperimentConfig)
        cfg.data_source_args.dataset_map_provider_class_type = (
            "JsonIndexDatasetMapProvider"
        )
        dataset_args = (
            cfg.data_source_args.dataset_map_provider_JsonIndexDatasetMapProvider_args
        )
        dataloader_args = (
            cfg.data_source_args.data_loader_map_provider_SequenceDataLoaderMapProvider_args
        )
        dataset_args.category = "skateboard"
        dataset_args.test_restrict_sequence_id = 0
        dataset_args.dataset_root = "manifold://co3d/tree/extracted"
        dataset_args.dataset_JsonIndexDataset_args.limit_sequences_to = 5
        dataloader_args.dataset_length_train = 1
        cfg.solver_args.max_epochs = 2

        device = torch.device("cuda:0")
        experiment.run_training(cfg, device)

    def test_yaml_contents(self):
        cfg = OmegaConf.structured(experiment.ExperimentConfig)
        yaml = OmegaConf.to_yaml(cfg, sort_keys=False)
        if DEBUG:
            (DATA_DIR / "experiment.yaml").write_text(yaml)
        self.assertEqual(yaml, (DATA_DIR / "experiment.yaml").read_text())

    def test_load_configs(self):
        config_files = []

        for pattern in ("repro_singleseq*.yaml", "repro_multiseq*.yaml"):
            config_files.extend(
                [
                    f
                    for f in IMPLICITRON_CONFIGS_DIR.glob(pattern)
                    if not f.name.endswith("_base.yaml")
                ]
            )

        for file in config_files:
            with self.subTest(file.name):
                with initialize_config_dir(config_dir=str(IMPLICITRON_CONFIGS_DIR)):
                    compose(file.name)
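For completeness, a hedged sketch of how the interactive-only tests gated by `interactive_testing_requested` above could be launched locally. The dotted module path is an assumption about the repository layout, not something stated in this file; adjust it to match your checkout.

```python
import os
import subprocess
import sys

# Opt in to the interactive-only tests (see interactive_testing_requested above).
env = dict(os.environ, PYTORCH3D_INTERACTIVE_TESTING="1")

# Assumed location of this test module within the repository.
subprocess.run(
    [sys.executable, "-m", "unittest", "-v",
     "projects.implicitron_trainer.tests.test_experiment"],
    env=env,
    check=True,
)
```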