MKL version fix in CI (#1820)

Summary: Fix for the "undefined symbol: iJIT_NotifyEvent" build issue. Pull Request resolved: https://github.com/facebookresearch/pytorch3d/pull/1820 Differential Revision: D58685326
2026-02-26 16:26:00 +08:00 · 2024-06-20 09:24:07 -07:00
252 changed files with 1203 additions and 1981 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -162,6 +162,90 @@ workflows:
    jobs:
      # - main:
      #     context: DOCKERHUB_TOKEN
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda113
          context: DOCKERHUB_TOKEN
          cu_version: cu113
          name: linux_conda_py38_cu113_pyt1120
          python_version: '3.8'
          pytorch_version: 1.12.0
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda116
          context: DOCKERHUB_TOKEN
          cu_version: cu116
          name: linux_conda_py38_cu116_pyt1120
          python_version: '3.8'
          pytorch_version: 1.12.0
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda113
          context: DOCKERHUB_TOKEN
          cu_version: cu113
          name: linux_conda_py38_cu113_pyt1121
          python_version: '3.8'
          pytorch_version: 1.12.1
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda116
          context: DOCKERHUB_TOKEN
          cu_version: cu116
          name: linux_conda_py38_cu116_pyt1121
          python_version: '3.8'
          pytorch_version: 1.12.1
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda116
          context: DOCKERHUB_TOKEN
          cu_version: cu116
          name: linux_conda_py38_cu116_pyt1130
          python_version: '3.8'
          pytorch_version: 1.13.0
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda117
          context: DOCKERHUB_TOKEN
          cu_version: cu117
          name: linux_conda_py38_cu117_pyt1130
          python_version: '3.8'
          pytorch_version: 1.13.0
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda116
          context: DOCKERHUB_TOKEN
          cu_version: cu116
          name: linux_conda_py38_cu116_pyt1131
          python_version: '3.8'
          pytorch_version: 1.13.1
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda117
          context: DOCKERHUB_TOKEN
          cu_version: cu117
          name: linux_conda_py38_cu117_pyt1131
          python_version: '3.8'
          pytorch_version: 1.13.1
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda117
          context: DOCKERHUB_TOKEN
          cu_version: cu117
          name: linux_conda_py38_cu117_pyt200
          python_version: '3.8'
          pytorch_version: 2.0.0
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda118
          context: DOCKERHUB_TOKEN
          cu_version: cu118
          name: linux_conda_py38_cu118_pyt200
          python_version: '3.8'
          pytorch_version: 2.0.0
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda117
          context: DOCKERHUB_TOKEN
          cu_version: cu117
          name: linux_conda_py38_cu117_pyt201
          python_version: '3.8'
          pytorch_version: 2.0.1
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda118
          context: DOCKERHUB_TOKEN
          cu_version: cu118
          name: linux_conda_py38_cu118_pyt201
          python_version: '3.8'
          pytorch_version: 2.0.1
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda118
          context: DOCKERHUB_TOKEN
@@ -247,33 +331,89 @@ workflows:
          python_version: '3.8'
          pytorch_version: 2.3.1
      - binary_linux_conda:
-          conda_docker_image: pytorch/conda-builder:cuda118
+          conda_docker_image: pytorch/conda-builder:cuda113
          context: DOCKERHUB_TOKEN
-          cu_version: cu118
+          cu_version: cu113
-          name: linux_conda_py38_cu118_pyt240
+          name: linux_conda_py39_cu113_pyt1120
-          python_version: '3.8'
+          python_version: '3.9'
-          pytorch_version: 2.4.0
+          pytorch_version: 1.12.0
      - binary_linux_conda:
-          conda_docker_image: pytorch/conda-builder:cuda121
+          conda_docker_image: pytorch/conda-builder:cuda116
          context: DOCKERHUB_TOKEN
-          cu_version: cu121
+          cu_version: cu116
-          name: linux_conda_py38_cu121_pyt240
+          name: linux_conda_py39_cu116_pyt1120
-          python_version: '3.8'
+          python_version: '3.9'
-          pytorch_version: 2.4.0
+          pytorch_version: 1.12.0
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda113
          context: DOCKERHUB_TOKEN
          cu_version: cu113
          name: linux_conda_py39_cu113_pyt1121
          python_version: '3.9'
          pytorch_version: 1.12.1
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda116
          context: DOCKERHUB_TOKEN
          cu_version: cu116
          name: linux_conda_py39_cu116_pyt1121
          python_version: '3.9'
          pytorch_version: 1.12.1
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda116
          context: DOCKERHUB_TOKEN
          cu_version: cu116
          name: linux_conda_py39_cu116_pyt1130
          python_version: '3.9'
          pytorch_version: 1.13.0
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda117
          context: DOCKERHUB_TOKEN
          cu_version: cu117
          name: linux_conda_py39_cu117_pyt1130
          python_version: '3.9'
          pytorch_version: 1.13.0
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda116
          context: DOCKERHUB_TOKEN
          cu_version: cu116
          name: linux_conda_py39_cu116_pyt1131
          python_version: '3.9'
          pytorch_version: 1.13.1
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda117
          context: DOCKERHUB_TOKEN
          cu_version: cu117
          name: linux_conda_py39_cu117_pyt1131
          python_version: '3.9'
          pytorch_version: 1.13.1
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda117
          context: DOCKERHUB_TOKEN
          cu_version: cu117
          name: linux_conda_py39_cu117_pyt200
          python_version: '3.9'
          pytorch_version: 2.0.0
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda118
          context: DOCKERHUB_TOKEN
          cu_version: cu118
-          name: linux_conda_py38_cu118_pyt241
+          name: linux_conda_py39_cu118_pyt200
-          python_version: '3.8'
+          python_version: '3.9'
-          pytorch_version: 2.4.1
+          pytorch_version: 2.0.0
      - binary_linux_conda:
-          conda_docker_image: pytorch/conda-builder:cuda121
+          conda_docker_image: pytorch/conda-builder:cuda117
          context: DOCKERHUB_TOKEN
-          cu_version: cu121
+          cu_version: cu117
-          name: linux_conda_py38_cu121_pyt241
+          name: linux_conda_py39_cu117_pyt201
-          python_version: '3.8'
+          python_version: '3.9'
-          pytorch_version: 2.4.1
+          pytorch_version: 2.0.1
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda118
          context: DOCKERHUB_TOKEN
          cu_version: cu118
          name: linux_conda_py39_cu118_pyt201
          python_version: '3.9'
          pytorch_version: 2.0.1
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda118
          context: DOCKERHUB_TOKEN
@@ -359,33 +499,89 @@ workflows:
          python_version: '3.9'
          pytorch_version: 2.3.1
      - binary_linux_conda:
-          conda_docker_image: pytorch/conda-builder:cuda118
+          conda_docker_image: pytorch/conda-builder:cuda113
          context: DOCKERHUB_TOKEN
-          cu_version: cu118
+          cu_version: cu113
-          name: linux_conda_py39_cu118_pyt240
+          name: linux_conda_py310_cu113_pyt1120
-          python_version: '3.9'
+          python_version: '3.10'
-          pytorch_version: 2.4.0
+          pytorch_version: 1.12.0
      - binary_linux_conda:
-          conda_docker_image: pytorch/conda-builder:cuda121
+          conda_docker_image: pytorch/conda-builder:cuda116
          context: DOCKERHUB_TOKEN
-          cu_version: cu121
+          cu_version: cu116
-          name: linux_conda_py39_cu121_pyt240
+          name: linux_conda_py310_cu116_pyt1120
-          python_version: '3.9'
+          python_version: '3.10'
-          pytorch_version: 2.4.0
+          pytorch_version: 1.12.0
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda113
          context: DOCKERHUB_TOKEN
          cu_version: cu113
          name: linux_conda_py310_cu113_pyt1121
          python_version: '3.10'
          pytorch_version: 1.12.1
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda116
          context: DOCKERHUB_TOKEN
          cu_version: cu116
          name: linux_conda_py310_cu116_pyt1121
          python_version: '3.10'
          pytorch_version: 1.12.1
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda116
          context: DOCKERHUB_TOKEN
          cu_version: cu116
          name: linux_conda_py310_cu116_pyt1130
          python_version: '3.10'
          pytorch_version: 1.13.0
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda117
          context: DOCKERHUB_TOKEN
          cu_version: cu117
          name: linux_conda_py310_cu117_pyt1130
          python_version: '3.10'
          pytorch_version: 1.13.0
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda116
          context: DOCKERHUB_TOKEN
          cu_version: cu116
          name: linux_conda_py310_cu116_pyt1131
          python_version: '3.10'
          pytorch_version: 1.13.1
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda117
          context: DOCKERHUB_TOKEN
          cu_version: cu117
          name: linux_conda_py310_cu117_pyt1131
          python_version: '3.10'
          pytorch_version: 1.13.1
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda117
          context: DOCKERHUB_TOKEN
          cu_version: cu117
          name: linux_conda_py310_cu117_pyt200
          python_version: '3.10'
          pytorch_version: 2.0.0
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda118
          context: DOCKERHUB_TOKEN
          cu_version: cu118
-          name: linux_conda_py39_cu118_pyt241
+          name: linux_conda_py310_cu118_pyt200
-          python_version: '3.9'
+          python_version: '3.10'
-          pytorch_version: 2.4.1
+          pytorch_version: 2.0.0
      - binary_linux_conda:
-          conda_docker_image: pytorch/conda-builder:cuda121
+          conda_docker_image: pytorch/conda-builder:cuda117
          context: DOCKERHUB_TOKEN
-          cu_version: cu121
+          cu_version: cu117
-          name: linux_conda_py39_cu121_pyt241
+          name: linux_conda_py310_cu117_pyt201
-          python_version: '3.9'
+          python_version: '3.10'
-          pytorch_version: 2.4.1
+          pytorch_version: 2.0.1
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda118
          context: DOCKERHUB_TOKEN
          cu_version: cu118
          name: linux_conda_py310_cu118_pyt201
          python_version: '3.10'
          pytorch_version: 2.0.1
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda118
          context: DOCKERHUB_TOKEN
@@ -470,34 +666,6 @@ workflows:
          name: linux_conda_py310_cu121_pyt231
          python_version: '3.10'
          pytorch_version: 2.3.1
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda118
          context: DOCKERHUB_TOKEN
          cu_version: cu118
          name: linux_conda_py310_cu118_pyt240
          python_version: '3.10'
          pytorch_version: 2.4.0
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda121
          context: DOCKERHUB_TOKEN
          cu_version: cu121
          name: linux_conda_py310_cu121_pyt240
          python_version: '3.10'
          pytorch_version: 2.4.0
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda118
          context: DOCKERHUB_TOKEN
          cu_version: cu118
          name: linux_conda_py310_cu118_pyt241
          python_version: '3.10'
          pytorch_version: 2.4.1
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda121
          context: DOCKERHUB_TOKEN
          cu_version: cu121
          name: linux_conda_py310_cu121_pyt241
          python_version: '3.10'
          pytorch_version: 2.4.1
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda118
          context: DOCKERHUB_TOKEN
@@ -582,34 +750,6 @@ workflows:
          name: linux_conda_py311_cu121_pyt231
          python_version: '3.11'
          pytorch_version: 2.3.1
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda118
          context: DOCKERHUB_TOKEN
          cu_version: cu118
          name: linux_conda_py311_cu118_pyt240
          python_version: '3.11'
          pytorch_version: 2.4.0
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda121
          context: DOCKERHUB_TOKEN
          cu_version: cu121
          name: linux_conda_py311_cu121_pyt240
          python_version: '3.11'
          pytorch_version: 2.4.0
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda118
          context: DOCKERHUB_TOKEN
          cu_version: cu118
          name: linux_conda_py311_cu118_pyt241
          python_version: '3.11'
          pytorch_version: 2.4.1
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda121
          context: DOCKERHUB_TOKEN
          cu_version: cu121
          name: linux_conda_py311_cu121_pyt241
          python_version: '3.11'
          pytorch_version: 2.4.1
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda118
          context: DOCKERHUB_TOKEN
@@ -652,34 +792,6 @@ workflows:
          name: linux_conda_py312_cu121_pyt231
          python_version: '3.12'
          pytorch_version: 2.3.1
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda118
          context: DOCKERHUB_TOKEN
          cu_version: cu118
          name: linux_conda_py312_cu118_pyt240
          python_version: '3.12'
          pytorch_version: 2.4.0
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda121
          context: DOCKERHUB_TOKEN
          cu_version: cu121
          name: linux_conda_py312_cu121_pyt240
          python_version: '3.12'
          pytorch_version: 2.4.0
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda118
          context: DOCKERHUB_TOKEN
          cu_version: cu118
          name: linux_conda_py312_cu118_pyt241
          python_version: '3.12'
          pytorch_version: 2.4.1
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda121
          context: DOCKERHUB_TOKEN
          cu_version: cu121
          name: linux_conda_py312_cu121_pyt241
          python_version: '3.12'
          pytorch_version: 2.4.1
      - binary_linux_conda_cuda:
          name: testrun_conda_cuda_py310_cu117_pyt201
          context: DOCKERHUB_TOKEN
--- a/.circleci/regenerate.py
+++ b/.circleci/regenerate.py
@@ -19,14 +19,18 @@ from packaging import version
 # The CUDA versions which have pytorch conda packages available for linux for each
 # version of pytorch.
 CONDA_CUDA_VERSIONS = {
    "1.12.0": ["cu113", "cu116"],
    "1.12.1": ["cu113", "cu116"],
    "1.13.0": ["cu116", "cu117"],
    "1.13.1": ["cu116", "cu117"],
    "2.0.0": ["cu117", "cu118"],
    "2.0.1": ["cu117", "cu118"],
    "2.1.0": ["cu118", "cu121"],
    "2.1.1": ["cu118", "cu121"],
    "2.1.2": ["cu118", "cu121"],
    "2.2.0": ["cu118", "cu121"],
    "2.2.2": ["cu118", "cu121"],
    "2.3.1": ["cu118", "cu121"],
    "2.4.0": ["cu118", "cu121"],
    "2.4.1": ["cu118", "cu121"],
 }
@@ -88,6 +92,7 @@ def workflow_pair(
    upload=False,
    filter_branch,
 ):
    w = []
    py = python_version.replace(".", "")
    pyt = pytorch_version.replace(".", "")
@@ -126,6 +131,7 @@ def generate_base_workflow(
    btype,
    filter_branch=None,
 ):
    d = {
        "name": base_workflow_name,
        "python_version": python_version,
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -1,23 +0,0 @@
 name: facebookresearch/pytorch3d/build_and_test
 on:
  pull_request:
    branches:
      - main
  push:
    branches:
      - main
 jobs:
  binary_linux_conda_cuda:
    runs-on: 4-core-ubuntu-gpu-t4
    env:
      PYTHON_VERSION: "3.12"
      BUILD_VERSION: "${{ github.run_number }}"
      PYTORCH_VERSION: "2.4.1"
      CU_VERSION: "cu121"
      JUST_TESTRUN: 1
    steps:
    - uses: actions/checkout@v4
    - name: Build and run tests
      run: |-
        conda create --name env --yes --quiet conda-build
        conda run --no-capture-output --name env python3 ./packaging/build_conda.py --use-conda-cuda
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -8,10 +8,11 @@
 The core library is written in PyTorch. Several components have underlying implementation in CUDA for improved performance. A subset of these components have CPU implementations in C++/PyTorch. It is advised to use PyTorch3D with GPU support in order to use all the features.
 - Linux or macOS or Windows
- Python
+- Python 3.8, 3.9 or 3.10
- PyTorch 2.1.0, 2.1.1, 2.1.2, 2.2.0, 2.2.1, 2.2.2, 2.3.0, 2.3.1, 2.4.0 or 2.4.1.
+- PyTorch 1.12.0, 1.12.1, 1.13.0, 2.0.0, 2.0.1, 2.1.0, 2.1.1, 2.1.2, 2.2.0, 2.2.1, 2.2.2, 2.3.0 or 2.3.1.
 - torchvision that matches the PyTorch installation. You can install them together as explained at pytorch.org to make sure of this.
 - gcc & g++ ≥ 4.9
 - [fvcore](https://github.com/facebookresearch/fvcore)
 - [ioPath](https://github.com/facebookresearch/iopath)
 - If CUDA is to be used, use a version which is supported by the corresponding pytorch version and at least version 9.2.
 - If CUDA older than 11.7 is to be used and you are building from source, the CUB library must be available. We recommend version 1.10.0.
@@ -21,7 +22,7 @@ The runtime dependencies can be installed by running:
 conda create -n pytorch3d python=3.9
 conda activate pytorch3d
 conda install pytorch=1.13.0 torchvision pytorch-cuda=11.6 -c pytorch -c nvidia
-conda install -c iopath iopath
+conda install -c fvcore -c iopath -c conda-forge fvcore iopath
 ```
 For the CUB build time dependency, which you only need if you have CUDA older than 11.7, if you are using conda, you can continue with
@@ -48,7 +49,6 @@ For developing on top of PyTorch3D or contributing, you will need to run the lin
 - tqdm
 - jupyter
 - imageio
 - fvcore
 - plotly
 - opencv-python
@@ -59,7 +59,6 @@ conda install jupyter
 pip install scikit-image matplotlib imageio plotly opencv-python
 # Tests/Linting
 conda install -c fvcore -c conda-forge fvcore
 pip install black usort flake8 flake8-bugbear flake8-comprehensions
 ```
@@ -98,7 +97,7 @@ version_str="".join([
    torch.version.cuda.replace(".",""),
    f"_pyt{pyt_version_str}"
 ])
-!pip install iopath
+!pip install fvcore iopath
 !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html
 ```
--- a/dev/linter.sh
+++ b/dev/linter.sh
@@ -10,7 +10,7 @@
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 DIR=$(dirname "${DIR}")
-if [[ -f "${DIR}/BUCK" ]]
+if [[ -f "${DIR}/TARGETS" ]]
 then
  pyfmt "${DIR}"
 else
@@ -36,5 +36,5 @@ then
  echo "Running pyre..."
  echo "To restart/kill pyre server, run 'pyre restart' or 'pyre kill' in fbcode/"
-  ( cd ~/fbsource/fbcode; arc pyre check //vision/fair/pytorch3d/... )
+  ( cd ~/fbsource/fbcode; pyre -l vision/fair/pytorch3d/ )
 fi
--- a/dev/run_tutorials.sh
+++ b/dev/run_tutorials.sh
@@ -23,7 +23,7 @@ conda init bash
 source ~/.bashrc
 conda create -y -n myenv python=3.8 matplotlib ipython ipywidgets nbconvert
 conda activate myenv
-conda install -y -c iopath iopath
+conda install -y -c fvcore -c iopath -c conda-forge fvcore iopath
 conda install -y -c pytorch pytorch=1.6.0 cudatoolkit=10.1 torchvision
 conda install -y -c pytorch3d-nightly pytorch3d
 pip install plotly scikit-image
--- a/docs/examples/pulsar_basic.py
+++ b/docs/examples/pulsar_basic.py
@@ -10,7 +10,6 @@ This example demonstrates the most trivial, direct interface of the pulsar
 sphere renderer. It renders and saves an image with 10 random spheres.
 Output: basic.png.
 """
 import logging
 import math
 from os import path
--- a/docs/examples/pulsar_basic_unified.py
+++ b/docs/examples/pulsar_basic_unified.py
 interface for sphere rendering. It renders and saves an image with
 10 random spheres.
 Output: basic-pt3d.png.
 """
 import logging
 from os import path
--- a/docs/examples/pulsar_cam.py
+++ b/docs/examples/pulsar_cam.py
@@ -14,7 +14,6 @@ distorted. Gradient-based optimization is used to converge towards the
 original camera parameters.
 Output: cam.gif.
 """
 import logging
 import math
 from os import path
--- a/docs/examples/pulsar_cam_unified.py
+++ b/docs/examples/pulsar_cam_unified.py
@@ -14,7 +14,6 @@ distorted. Gradient-based optimization is used to converge towards the
 original camera parameters.
 Output: cam-pt3d.gif
 """
 import logging
 from os import path
--- a/docs/examples/pulsar_multiview.py
+++ b/docs/examples/pulsar_multiview.py
@@ -18,7 +18,6 @@ This example is not available yet through the 'unified' interface,
 because opacity support has not landed in PyTorch3D for general data
 structures yet.
 """
 import logging
 import math
 from os import path
--- a/docs/examples/pulsar_optimization.py
+++ b/docs/examples/pulsar_optimization.py
@@ -13,7 +13,6 @@ The scene is initialized with random spheres. Gradient-based
 optimization is used to converge towards a faithful
 scene representation.
 """
 import logging
 import math
--- a/docs/examples/pulsar_optimization_unified.py
+++ b/docs/examples/pulsar_optimization_unified.py
@@ -13,7 +13,6 @@ The scene is initialized with random spheres. Gradient-based
 optimization is used to converge towards a faithful
 scene representation.
 """
 import logging
 import math
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -5,6 +5,7 @@ sphinx_rtd_theme
 sphinx_markdown_tables
 numpy
 iopath
 fvcore
 https://download.pytorch.org/whl/cpu/torchvision-0.15.2%2Bcpu-cp311-cp311-linux_x86_64.whl
 https://download.pytorch.org/whl/cpu/torch-2.0.1%2Bcpu-cp311-cp311-linux_x86_64.whl
 omegaconf
--- a/docs/tutorials/bundle_adjustment.ipynb
+++ b/docs/tutorials/bundle_adjustment.ipynb
@@ -96,7 +96,7 @@
    "        torch.version.cuda.replace(\".\",\"\"),\n",
    "        f\"_pyt{pyt_version_str}\"\n",
    "    ])\n",
-    "    !pip install iopath\n",
+    "    !pip install fvcore iopath\n",
    "    if sys.platform.startswith(\"linux\"):\n",
    "        print(\"Trying to install wheel for PyTorch3D\")\n",
    "        !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html\n",
--- a/docs/tutorials/camera_position_optimization_with_differentiable_rendering.ipynb
+++ b/docs/tutorials/camera_position_optimization_with_differentiable_rendering.ipynb
@@ -83,7 +83,7 @@
    "        torch.version.cuda.replace(\".\",\"\"),\n",
    "        f\"_pyt{pyt_version_str}\"\n",
    "    ])\n",
-    "    !pip install iopath\n",
+    "    !pip install fvcore iopath\n",
    "    if sys.platform.startswith(\"linux\"):\n",
    "        print(\"Trying to install wheel for PyTorch3D\")\n",
    "        !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html\n",
--- a/docs/tutorials/dataloaders_ShapeNetCore_R2N2.ipynb
+++ b/docs/tutorials/dataloaders_ShapeNetCore_R2N2.ipynb
@@ -58,7 +58,7 @@
    "        torch.version.cuda.replace(\".\",\"\"),\n",
    "        f\"_pyt{pyt_version_str}\"\n",
    "    ])\n",
-    "    !pip install iopath\n",
+    "    !pip install fvcore iopath\n",
    "    if sys.platform.startswith(\"linux\"):\n",
    "        print(\"Trying to install wheel for PyTorch3D\")\n",
    "        !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html\n",
--- a/docs/tutorials/deform_source_mesh_to_target_mesh.ipynb
+++ b/docs/tutorials/deform_source_mesh_to_target_mesh.ipynb
@@ -97,7 +97,7 @@
    "        torch.version.cuda.replace(\".\",\"\"),\n",
    "        f\"_pyt{pyt_version_str}\"\n",
    "    ])\n",
-    "    !pip install iopath\n",
+    "    !pip install fvcore iopath\n",
    "    if sys.platform.startswith(\"linux\"):\n",
    "        print(\"Trying to install wheel for PyTorch3D\")\n",
    "        !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html\n",
--- a/docs/tutorials/fit_simple_neural_radiance_field.ipynb
+++ b/docs/tutorials/fit_simple_neural_radiance_field.ipynb
@@ -63,7 +63,7 @@
    "        torch.version.cuda.replace(\".\",\"\"),\n",
    "        f\"_pyt{pyt_version_str}\"\n",
    "    ])\n",
-    "    !pip install iopath\n",
+    "    !pip install fvcore iopath\n",
    "    if sys.platform.startswith(\"linux\"):\n",
    "        print(\"Trying to install wheel for PyTorch3D\")\n",
    "        !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html\n",
--- a/docs/tutorials/fit_textured_mesh.ipynb
+++ b/docs/tutorials/fit_textured_mesh.ipynb
@@ -75,7 +75,7 @@
    "        torch.version.cuda.replace(\".\",\"\"),\n",
    "        f\"_pyt{pyt_version_str}\"\n",
    "    ])\n",
-    "    !pip install iopath\n",
+    "    !pip install fvcore iopath\n",
    "    if sys.platform.startswith(\"linux\"):\n",
    "        print(\"Trying to install wheel for PyTorch3D\")\n",
    "        !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html\n",
--- a/docs/tutorials/fit_textured_volume.ipynb
+++ b/docs/tutorials/fit_textured_volume.ipynb
@@ -54,7 +54,7 @@
    "        torch.version.cuda.replace(\".\",\"\"),\n",
    "        f\"_pyt{pyt_version_str}\"\n",
    "    ])\n",
-    "    !pip install iopath\n",
+    "    !pip install fvcore iopath\n",
    "    if sys.platform.startswith(\"linux\"):\n",
    "        print(\"Trying to install wheel for PyTorch3D\")\n",
    "        !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html\n",
--- a/docs/tutorials/implicitron_config_system.ipynb
+++ b/docs/tutorials/implicitron_config_system.ipynb
@@ -85,7 +85,7 @@
        "        torch.version.cuda.replace(\".\",\"\"),\n",
        "        f\"_pyt{pyt_version_str}\"\n",
        "    ])\n",
-        "    !pip install iopath\n",
+        "    !pip install fvcore iopath\n",
        "    if sys.platform.startswith(\"linux\"):\n",
        "        print(\"Trying to install wheel for PyTorch3D\")\n",
        "        !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html\n",
--- a/docs/tutorials/implicitron_volumes.ipynb
+++ b/docs/tutorials/implicitron_volumes.ipynb
@@ -79,7 +79,7 @@
        "        torch.version.cuda.replace(\".\",\"\"),\n",
        "        f\"_pyt{pyt_version_str}\"\n",
        "    ])\n",
-        "    !pip install iopath\n",
+        "    !pip install fvcore iopath\n",
        "    if sys.platform.startswith(\"linux\"):\n",
        "        print(\"Trying to install wheel for PyTorch3D\")\n",
        "        !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html\n",
--- a/docs/tutorials/render_colored_points.ipynb
+++ b/docs/tutorials/render_colored_points.ipynb
@@ -57,7 +57,7 @@
    "        torch.version.cuda.replace(\".\",\"\"),\n",
    "        f\"_pyt{pyt_version_str}\"\n",
    "    ])\n",
-    "    !pip install iopath\n",
+    "    !pip install fvcore iopath\n",
    "    if sys.platform.startswith(\"linux\"):\n",
    "        print(\"Trying to install wheel for PyTorch3D\")\n",
    "        !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html\n",
--- a/docs/tutorials/render_densepose.ipynb
+++ b/docs/tutorials/render_densepose.ipynb
@@ -64,7 +64,7 @@
    "        torch.version.cuda.replace(\".\",\"\"),\n",
    "        f\"_pyt{pyt_version_str}\"\n",
    "    ])\n",
-    "    !pip install iopath\n",
+    "    !pip install fvcore iopath\n",
    "    if sys.platform.startswith(\"linux\"):\n",
    "        print(\"Trying to install wheel for PyTorch3D\")\n",
    "        !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html\n",
--- a/docs/tutorials/render_textured_meshes.ipynb
+++ b/docs/tutorials/render_textured_meshes.ipynb
@@ -80,7 +80,7 @@
    "        torch.version.cuda.replace(\".\",\"\"),\n",
    "        f\"_pyt{pyt_version_str}\"\n",
    "    ])\n",
-    "    !pip install iopath\n",
+    "    !pip install fvcore iopath\n",
    "    if sys.platform.startswith(\"linux\"):\n",
    "        print(\"Trying to install wheel for PyTorch3D\")\n",
    "        !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html\n",
--- a/packaging/build_conda.py
+++ b/packaging/build_conda.py
@@ -4,11 +4,10 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 import argparse
 import os.path
 import runpy
 import subprocess
-from typing import List, Tuple
+from typing import List
 # required env vars:
 # CU_VERSION: E.g. cu112
@@ -24,7 +23,7 @@ pytorch_major_minor = tuple(int(i) for i in PYTORCH_VERSION.split(".")[:2])
 source_root_dir = os.environ["PWD"]
-def version_constraint(version) -> str:
+def version_constraint(version):
    """
    Given version "11.3" returns " >=11.3,<11.4"
    """
@@ -33,7 +32,7 @@ def version_constraint(version) -> str:
    return f" >={version},<{upper}"
-def get_cuda_major_minor() -> Tuple[str, str]:
+def get_cuda_major_minor():
    if CU_VERSION == "cpu":
        raise ValueError("fn only for cuda builds")
    if len(CU_VERSION) != 5 or CU_VERSION[:2] != "cu":
@@ -43,10 +42,11 @@ def get_cuda_major_minor() -> Tuple[str, str]:
    return major, minor
-def setup_cuda(use_conda_cuda: bool) -> List[str]:
+def setup_cuda():
    if CU_VERSION == "cpu":
-        return []
+        return
    major, minor = get_cuda_major_minor()
    os.environ["CUDA_HOME"] = f"/usr/local/cuda-{major}.{minor}/"
    os.environ["FORCE_CUDA"] = "1"
    basic_nvcc_flags = (
@@ -75,15 +75,6 @@ def setup_cuda(use_conda_cuda: bool) -> List[str]:
    if os.environ.get("JUST_TESTRUN", "0") != "1":
        os.environ["NVCC_FLAGS"] = nvcc_flags
    if use_conda_cuda:
        os.environ["CONDA_CUDA_TOOLKIT_BUILD_CONSTRAINT1"] = "- cuda-toolkit"
        os.environ["CONDA_CUDA_TOOLKIT_BUILD_CONSTRAINT2"] = (
            f"- cuda-version={major}.{minor}"
        )
        return ["-c", f"nvidia/label/cuda-{major}.{minor}.0"]
    else:
        os.environ["CUDA_HOME"] = f"/usr/local/cuda-{major}.{minor}/"
        return []
 def setup_conda_pytorch_constraint() -> List[str]:
@@ -104,7 +95,7 @@ def setup_conda_pytorch_constraint() -> List[str]:
        return ["-c", "pytorch", "-c", "nvidia"]
-def setup_conda_cudatoolkit_constraint() -> None:
+def setup_conda_cudatoolkit_constraint():
    if CU_VERSION == "cpu":
        os.environ["CONDA_CPUONLY_FEATURE"] = "- cpuonly"
        os.environ["CONDA_CUDATOOLKIT_CONSTRAINT"] = ""
@@ -125,14 +116,14 @@ def setup_conda_cudatoolkit_constraint() -> None:
    os.environ["CONDA_CUDATOOLKIT_CONSTRAINT"] = toolkit
-def do_build(start_args: List[str]) -> None:
+def do_build(start_args: List[str]):
    args = start_args.copy()
    test_flag = os.environ.get("TEST_FLAG")
    if test_flag is not None:
        args.append(test_flag)
-    args.extend(["-c", "bottler", "-c", "iopath", "-c", "conda-forge"])
+    args.extend(["-c", "bottler", "-c", "fvcore", "-c", "iopath", "-c", "conda-forge"])
    args.append("--no-anaconda-upload")
    args.extend(["--python", os.environ["PYTHON_VERSION"]])
    args.append("packaging/pytorch3d")
@@ -141,16 +132,8 @@ def do_build(start_args: List[str]) -> None:
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Build the conda package.")
    parser.add_argument(
        "--use-conda-cuda",
        action="store_true",
        help="get cuda from conda ignoring local cuda",
    )
    our_args = parser.parse_args()
    args = ["conda", "build"]
-    args += setup_cuda(use_conda_cuda=our_args.use_conda_cuda)
+    setup_cuda()
    init_path = source_root_dir + "/pytorch3d/__init__.py"
    build_version = runpy.run_path(init_path)["__version__"]
--- a/packaging/linux_wheels/README.md
+++ b/packaging/linux_wheels/README.md
@@ -26,6 +26,6 @@ version_str="".join([
    torch.version.cuda.replace(".",""),
    f"_pyt{pyt_version_str}"
 ])
-!pip install iopath
+!pip install fvcore iopath
 !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html
 ```
--- a/packaging/linux_wheels/inside.sh
+++ b/packaging/linux_wheels/inside.sh
@@ -144,7 +144,7 @@ do
            conda activate "$tag"
            # shellcheck disable=SC2086
            conda install -y -c pytorch $extra_channel "pytorch=$pytorch_version" "$cudatools=$CUDA_TAG"
-            pip install iopath
+            pip install fvcore iopath
            echo "python version" "$python_version" "pytorch version" "$pytorch_version" "cuda version" "$cu_version" "tag" "$tag"
            rm -rf dist
--- a/packaging/pytorch3d/meta.yaml
+++ b/packaging/pytorch3d/meta.yaml
@@ -8,13 +8,10 @@ source:
 requirements:
  build:
    - {{ compiler('c') }} # [win]
    {{ environ.get('CONDA_CUDA_TOOLKIT_BUILD_CONSTRAINT1', '') }}
    {{ environ.get('CONDA_CUDA_TOOLKIT_BUILD_CONSTRAINT2', '') }}
    {{ environ.get('CONDA_CUB_CONSTRAINT') }}
  host:
    - python
    - mkl =2023  # [x86_64]
    {{ environ.get('SETUPTOOLS_CONSTRAINT') }}
    {{ environ.get('CONDA_PYTORCH_BUILD_CONSTRAINT') }}
    {{ environ.get('CONDA_PYTORCH_MKL_CONSTRAINT') }}
@@ -25,7 +22,7 @@ requirements:
    - python
    - numpy >=1.11
    - torchvision >=0.5
-    - mkl =2023  # [x86_64]
+    - fvcore
    - iopath
    {{ environ.get('CONDA_PYTORCH_CONSTRAINT') }}
    {{ environ.get('CONDA_CUDATOOLKIT_CONSTRAINT') }}
@@ -51,11 +48,8 @@ test:
    - imageio
    - hydra-core
    - accelerate
    - matplotlib
    - tabulate
    - pandas
    - sqlalchemy
  commands:
    #pytest .
    python -m unittest discover -v -s tests -t .
--- a/projects/implicitron_trainer/experiment.py
+++ b/projects/implicitron_trainer/experiment.py
@@ -7,7 +7,7 @@
 # pyre-unsafe
-""" "
+""""
 This file is the entry point for launching experiments with Implicitron.
 Launch Training
@@ -44,7 +44,6 @@ The outputs of the experiment are saved and logged in multiple ways:
        config file.
 """
 import logging
 import os
 import warnings
@@ -100,7 +99,7 @@ except ModuleNotFoundError:
 no_accelerate = os.environ.get("PYTORCH3D_NO_ACCELERATE") is not None
-class Experiment(Configurable):
+class Experiment(Configurable):  # pyre-ignore: 13
    """
    This class is at the top level of Implicitron's config hierarchy. Its
    members are high-level components necessary for training an implicit rende-
@@ -121,16 +120,12 @@ class Experiment(Configurable):
            will be saved here.
    """
    # pyre-fixme[13]: Attribute `data_source` is never initialized.
    data_source: DataSourceBase
    data_source_class_type: str = "ImplicitronDataSource"
    # pyre-fixme[13]: Attribute `model_factory` is never initialized.
    model_factory: ModelFactoryBase
    model_factory_class_type: str = "ImplicitronModelFactory"
    # pyre-fixme[13]: Attribute `optimizer_factory` is never initialized.
    optimizer_factory: OptimizerFactoryBase
    optimizer_factory_class_type: str = "ImplicitronOptimizerFactory"
    # pyre-fixme[13]: Attribute `training_loop` is never initialized.
    training_loop: TrainingLoopBase
    training_loop_class_type: str = "ImplicitronTrainingLoop"
--- a/projects/implicitron_trainer/impl/model_factory.py
+++ b/projects/implicitron_trainer/impl/model_factory.py
@@ -26,6 +26,7 @@ logger = logging.getLogger(__name__)
 class ModelFactoryBase(ReplaceableBase):
    resume: bool = True  # resume from the last checkpoint
    def __call__(self, **kwargs) -> ImplicitronModelBase:
@@ -44,7 +45,7 @@ class ModelFactoryBase(ReplaceableBase):
@registry.register
-class ImplicitronModelFactory(ModelFactoryBase):
+class ImplicitronModelFactory(ModelFactoryBase):  # pyre-ignore [13]
    """
    A factory class that initializes an implicit rendering model.
@@ -60,7 +61,6 @@ class ImplicitronModelFactory(ModelFactoryBase):
    """
    # pyre-fixme[13]: Attribute `model` is never initialized.
    model: ImplicitronModelBase
    model_class_type: str = "GenericModel"
    resume: bool = True
@@ -115,9 +115,7 @@ class ImplicitronModelFactory(ModelFactoryBase):
                        "cuda:%d" % 0: "cuda:%d" % accelerator.local_process_index
                    }
                model_state_dict = torch.load(
-                    model_io.get_model_path(model_path),
+                    model_io.get_model_path(model_path), map_location=map_location
                    map_location=map_location,
                    weights_only=True,
                )
                try:
--- a/projects/implicitron_trainer/impl/optimizer_factory.py
+++ b/projects/implicitron_trainer/impl/optimizer_factory.py
@@ -123,7 +123,6 @@ class ImplicitronOptimizerFactory(OptimizerFactoryBase):
        """
        # Get the parameters to optimize
        if hasattr(model, "_get_param_groups"):  # use the model function
            # pyre-fixme[29]: `Union[Tensor, Module]` is not a function.
            p_groups = model._get_param_groups(self.lr, wd=self.weight_decay)
        else:
            p_groups = [
@@ -242,7 +241,7 @@ class ImplicitronOptimizerFactory(OptimizerFactoryBase):
                    map_location = {
                        "cuda:%d" % 0: "cuda:%d" % accelerator.local_process_index
                    }
-                optimizer_state = torch.load(opt_path, map_location, weights_only=True)
+                optimizer_state = torch.load(opt_path, map_location)
            else:
                raise FileNotFoundError(f"Optimizer state {opt_path} does not exist.")
        return optimizer_state
--- a/projects/implicitron_trainer/impl/training_loop.py
+++ b/projects/implicitron_trainer/impl/training_loop.py
@@ -30,13 +30,13 @@ from .utils import seed_all_random_engines
 logger = logging.getLogger(__name__)
 # pyre-fixme[13]: Attribute `evaluator` is never initialized.
 class TrainingLoopBase(ReplaceableBase):
    """
    Members:
        evaluator: An EvaluatorBase instance, used to evaluate training results.
    """
    # pyre-fixme[13]: Attribute `evaluator` is never initialized.
    evaluator: Optional[EvaluatorBase]
    evaluator_class_type: Optional[str] = "ImplicitronEvaluator"
@@ -161,6 +161,7 @@ class ImplicitronTrainingLoop(TrainingLoopBase):
        for epoch in range(start_epoch, self.max_epochs):
            # automatic new_epoch and plotting of stats at every epoch start
            with stats:
                # Make sure to re-seed random generators to ensure reproducibility
                # even after restart.
                seed_all_random_engines(seed + epoch)
@@ -394,7 +395,6 @@ class ImplicitronTrainingLoop(TrainingLoopBase):
            ):
                prefix = f"e{stats.epoch}_it{stats.it[trainmode]}"
                if hasattr(model, "visualize"):
                    # pyre-fixme[29]: `Union[Tensor, Module]` is not a function.
                    model.visualize(
                        viz,
                        visdom_env_imgs,
--- a/projects/implicitron_trainer/tests/test_experiment.py
+++ b/projects/implicitron_trainer/tests/test_experiment.py
@@ -53,8 +53,12 @@ class TestExperiment(unittest.TestCase):
        cfg.data_source_ImplicitronDataSource_args.dataset_map_provider_class_type = (
            "JsonIndexDatasetMapProvider"
        )
-        dataset_args = cfg.data_source_ImplicitronDataSource_args.dataset_map_provider_JsonIndexDatasetMapProvider_args
+        dataset_args = (
-        dataloader_args = cfg.data_source_ImplicitronDataSource_args.data_loader_map_provider_SequenceDataLoaderMapProvider_args
+            cfg.data_source_ImplicitronDataSource_args.dataset_map_provider_JsonIndexDatasetMapProvider_args
        )
        dataloader_args = (
            cfg.data_source_ImplicitronDataSource_args.data_loader_map_provider_SequenceDataLoaderMapProvider_args
        )
        dataset_args.category = "skateboard"
        dataset_args.test_restrict_sequence_id = 0
        dataset_args.dataset_root = "manifold://co3d/tree/extracted"
@@ -90,8 +94,12 @@ class TestExperiment(unittest.TestCase):
        cfg.data_source_ImplicitronDataSource_args.dataset_map_provider_class_type = (
            "JsonIndexDatasetMapProvider"
        )
-        dataset_args = cfg.data_source_ImplicitronDataSource_args.dataset_map_provider_JsonIndexDatasetMapProvider_args
+        dataset_args = (
-        dataloader_args = cfg.data_source_ImplicitronDataSource_args.data_loader_map_provider_SequenceDataLoaderMapProvider_args
+            cfg.data_source_ImplicitronDataSource_args.dataset_map_provider_JsonIndexDatasetMapProvider_args
        )
        dataloader_args = (
            cfg.data_source_ImplicitronDataSource_args.data_loader_map_provider_SequenceDataLoaderMapProvider_args
        )
        dataset_args.category = "skateboard"
        dataset_args.test_restrict_sequence_id = 0
        dataset_args.dataset_root = "manifold://co3d/tree/extracted"
@@ -103,7 +111,9 @@ class TestExperiment(unittest.TestCase):
        cfg.training_loop_ImplicitronTrainingLoop_args.max_epochs = 2
        cfg.training_loop_ImplicitronTrainingLoop_args.store_checkpoints = False
        cfg.optimizer_factory_ImplicitronOptimizerFactory_args.lr_policy = "Exponential"
-        cfg.optimizer_factory_ImplicitronOptimizerFactory_args.exponential_lr_step_size = 2
+        cfg.optimizer_factory_ImplicitronOptimizerFactory_args.exponential_lr_step_size = (
            2
        )
        if DEBUG:
            experiment.dump_cfg(cfg)
--- a/projects/implicitron_trainer/tests/test_optimizer_factory.py
+++ b/projects/implicitron_trainer/tests/test_optimizer_factory.py
@@ -81,9 +81,8 @@ class TestOptimizerFactory(unittest.TestCase):
    def test_param_overrides_self_param_group_assignment(self):
        pa, pb, pc = [torch.nn.Parameter(data=torch.tensor(i * 1.0)) for i in range(3)]
-        na, nb = (
+        na, nb = Node(params=[pa]), Node(
-            Node(params=[pa]),
+            params=[pb], param_groups={"self": "pb_self", "p1": "pb_param"}
            Node(params=[pb], param_groups={"self": "pb_self", "p1": "pb_param"}),
        )
        root = Node(children=[na, nb], params=[pc], param_groups={"m1": "pb_member"})
        param_groups = self._get_param_groups(root)
--- a/projects/nerf/nerf/dataset.py
+++ b/projects/nerf/nerf/dataset.py
@@ -84,9 +84,9 @@ def get_nerf_datasets(
    if autodownload and any(not os.path.isfile(p) for p in (cameras_path, image_path)):
        # Automatically download the data files if missing.
-        download_data([dataset_name], data_root=data_root)
+        download_data((dataset_name,), data_root=data_root)
-    train_data = torch.load(cameras_path, weights_only=True)
+    train_data = torch.load(cameras_path)
    n_cameras = train_data["cameras"]["R"].shape[0]
    _image_max_image_pixels = Image.MAX_IMAGE_PIXELS
--- a/projects/nerf/nerf/stats.py
+++ b/projects/nerf/nerf/stats.py
@@ -194,6 +194,7 @@ class Stats:
        it = self.it[stat_set]
        for stat in self.log_vars:
            if stat not in self.stats[stat_set]:
                self.stats[stat_set][stat] = AverageMeter()
--- a/projects/nerf/test_nerf.py
+++ b/projects/nerf/test_nerf.py
@@ -24,6 +24,7 @@ CONFIG_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "configs"
@hydra.main(config_path=CONFIG_DIR, config_name="lego")
 def main(cfg: DictConfig):
    # Device on which to run.
    if torch.cuda.is_available():
        device = "cuda"
@@ -62,7 +63,7 @@ def main(cfg: DictConfig):
        raise ValueError(f"Model checkpoint {checkpoint_path} does not exist!")
    print(f"Loading checkpoint {checkpoint_path}.")
-    loaded_data = torch.load(checkpoint_path, weights_only=True)
+    loaded_data = torch.load(checkpoint_path)
    # Do not load the cached xy grid.
    # - this allows setting an arbitrary evaluation image size.
    state_dict = {
--- a/projects/nerf/tests/test_raysampler.py
+++ b/projects/nerf/tests/test_raysampler.py
@@ -42,6 +42,7 @@ class TestRaysampler(unittest.TestCase):
        cameras, rays = [], []
        for _ in range(batch_size):
            R = random_rotations(1)
            T = torch.randn(1, 3)
            focal_length = torch.rand(1, 2) + 0.5
--- a/projects/nerf/train_nerf.py
+++ b/projects/nerf/train_nerf.py
@@ -25,6 +25,7 @@ CONFIG_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "configs"
@hydra.main(config_path=CONFIG_DIR, config_name="lego")
 def main(cfg: DictConfig):
    # Set the relevant seeds for reproducibility.
    np.random.seed(cfg.seed)
    torch.manual_seed(cfg.seed)
@@ -76,7 +77,7 @@ def main(cfg: DictConfig):
        # Resume training if requested.
        if cfg.resume and os.path.isfile(checkpoint_path):
            print(f"Resuming from checkpoint {checkpoint_path}.")
-            loaded_data = torch.load(checkpoint_path, weights_only=True)
+            loaded_data = torch.load(checkpoint_path)
            model.load_state_dict(loaded_data["model"])
            stats = pickle.loads(loaded_data["stats"])
            print(f"   => resuming from epoch {stats.epoch}.")
@@ -218,6 +219,7 @@ def main(cfg: DictConfig):
        # Validation
        if epoch % cfg.validation_epoch_interval == 0 and epoch > 0:
            # Sample a validation camera/image.
            val_batch = next(val_dataloader.__iter__())
            val_image, val_camera, camera_idx = val_batch[0].values()
--- a/pytorch3d/init.py
+++ b/pytorch3d/init.py
@@ -6,4 +6,4 @@
 # pyre-unsafe
-__version__ = "0.7.9"
+__version__ = "0.7.6"
--- a/pytorch3d/common/compat.py
+++ b/pytorch3d/common/compat.py
@@ -17,7 +17,7 @@ Some functions which depend on PyTorch or Python versions.
 def meshgrid_ij(
-    *A: Union[torch.Tensor, Sequence[torch.Tensor]],
+    *A: Union[torch.Tensor, Sequence[torch.Tensor]]
 ) -> Tuple[torch.Tensor, ...]:  # pragma: no cover
    """
    Like torch.meshgrid was before PyTorch 1.10.0, i.e. with indexing set to ij
--- a/pytorch3d/csrc/ball_query/ball_query.cu
+++ b/pytorch3d/csrc/ball_query/ball_query.cu
@@ -32,9 +32,7 @@ __global__ void BallQueryKernel(
    at::PackedTensorAccessor64<int64_t, 3, at::RestrictPtrTraits> idxs,
    at::PackedTensorAccessor64<scalar_t, 3, at::RestrictPtrTraits> dists,
    const int64_t K,
-    const float radius,
+    const float radius2) {
    const float radius2,
    const bool skip_points_outside_cube) {
  const int64_t N = p1.size(0);
  const int64_t chunks_per_cloud = (1 + (p1.size(1) - 1) / blockDim.x);
  const int64_t chunks_to_do = N * chunks_per_cloud;
@@ -53,19 +51,7 @@ __global__ void BallQueryKernel(
    // Iterate over points in p2 until desired count is reached or
    // all points have been considered
    for (int64_t j = 0, count = 0; j < lengths2[n] && count < K; ++j) {
-      if (skip_points_outside_cube) {
+      // Calculate the distance between the points
        bool is_within_radius = true;
        // Filter when any one coordinate is already outside the radius
        for (int d = 0; is_within_radius && d < D; ++d) {
          scalar_t abs_diff = fabs(p1[n][i][d] - p2[n][j][d]);
          is_within_radius = (abs_diff <= radius);
        }
        if (!is_within_radius) {
          continue;
        }
      }
      // Else, calculate the distance between the points and compare
      scalar_t dist2 = 0.0;
      for (int d = 0; d < D; ++d) {
        scalar_t diff = p1[n][i][d] - p2[n][j][d];
@@ -91,8 +77,7 @@ std::tuple<at::Tensor, at::Tensor> BallQueryCuda(
    const at::Tensor& lengths1, // (N,)
    const at::Tensor& lengths2, // (N,)
    int K,
-    float radius,
+    float radius) {
    bool skip_points_outside_cube) {
  // Check inputs are on the same device
  at::TensorArg p1_t{p1, "p1", 1}, p2_t{p2, "p2", 2},
      lengths1_t{lengths1, "lengths1", 3}, lengths2_t{lengths2, "lengths2", 4};
@@ -135,9 +120,7 @@ std::tuple<at::Tensor, at::Tensor> BallQueryCuda(
            idxs.packed_accessor64<int64_t, 3, at::RestrictPtrTraits>(),
            dists.packed_accessor64<float, 3, at::RestrictPtrTraits>(),
            K_64,
-            radius,
+            radius2);
            radius2,
            skip_points_outside_cube);
      }));
  AT_CUDA_CHECK(cudaGetLastError());
--- a/pytorch3d/csrc/ball_query/ball_query.h
+++ b/pytorch3d/csrc/ball_query/ball_query.h
@@ -25,9 +25,6 @@
 //      within the radius
 //    radius: the radius around each point within which the neighbors need to be
 //      located
 //    skip_points_outside_cube: If true, reduce multiplications of float values
 //      by not explicitly calculating distances to points that fall outside the
 //      D-cube with side length (2*radius) centered at each point in p1.
 //
 // Returns:
 //    p1_neighbor_idx: LongTensor of shape (N, P1, K), where
@@ -49,8 +46,7 @@ std::tuple<at::Tensor, at::Tensor> BallQueryCpu(
    const at::Tensor& lengths1,
    const at::Tensor& lengths2,
    const int K,
-    const float radius,
+    const float radius);
    const bool skip_points_outside_cube);
 // CUDA implementation
 std::tuple<at::Tensor, at::Tensor> BallQueryCuda(
@@ -59,8 +55,7 @@ std::tuple<at::Tensor, at::Tensor> BallQueryCuda(
    const at::Tensor& lengths1,
    const at::Tensor& lengths2,
    const int K,
-    const float radius,
+    const float radius);
    const bool skip_points_outside_cube);
 // Implementation which is exposed
 // Note: the backward pass reuses the KNearestNeighborBackward kernel
@@ -70,8 +65,7 @@ inline std::tuple<at::Tensor, at::Tensor> BallQuery(
    const at::Tensor& lengths1,
    const at::Tensor& lengths2,
    int K,
-    float radius,
+    float radius) {
    bool skip_points_outside_cube) {
  if (p1.is_cuda() || p2.is_cuda()) {
 #ifdef WITH_CUDA
    CHECK_CUDA(p1);
@@ -82,20 +76,16 @@ inline std::tuple<at::Tensor, at::Tensor> BallQuery(
        lengths1.contiguous(),
        lengths2.contiguous(),
        K,
-        radius,
+        radius);
        skip_points_outside_cube);
 #else
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
  CHECK_CPU(p1);
  CHECK_CPU(p2);
  return BallQueryCpu(
      p1.contiguous(),
      p2.contiguous(),
      lengths1.contiguous(),
      lengths2.contiguous(),
      K,
-      radius,
+      radius);
      skip_points_outside_cube);
 }
--- a/pytorch3d/csrc/ball_query/ball_query_cpu.cpp
+++ b/pytorch3d/csrc/ball_query/ball_query_cpu.cpp
@@ -6,8 +6,8 @@
 * LICENSE file in the root directory of this source tree.
 */
 #include <math.h>
 #include <torch/extension.h>
 #include <queue>
 #include <tuple>
 std::tuple<at::Tensor, at::Tensor> BallQueryCpu(
@@ -16,8 +16,7 @@ std::tuple<at::Tensor, at::Tensor> BallQueryCpu(
    const at::Tensor& lengths1,
    const at::Tensor& lengths2,
    int K,
-    float radius,
+    float radius) {
    bool skip_points_outside_cube) {
  const int N = p1.size(0);
  const int P1 = p1.size(1);
  const int D = p1.size(2);
@@ -39,16 +38,6 @@ std::tuple<at::Tensor, at::Tensor> BallQueryCpu(
    const int64_t length2 = lengths2_a[n];
    for (int64_t i = 0; i < length1; ++i) {
      for (int64_t j = 0, count = 0; j < length2 && count < K; ++j) {
        if (skip_points_outside_cube) {
          bool is_within_radius = true;
          for (int d = 0; is_within_radius && d < D; ++d) {
            float abs_diff = fabs(p1_a[n][i][d] - p2_a[n][j][d]);
            is_within_radius = (abs_diff <= radius);
          }
          if (!is_within_radius) {
            continue;
          }
        }
        float dist2 = 0;
        for (int d = 0; d < D; ++d) {
          float diff = p1_a[n][i][d] - p2_a[n][j][d];
--- a/pytorch3d/csrc/blending/sigmoid_alpha_blend.h
+++ b/pytorch3d/csrc/blending/sigmoid_alpha_blend.h
@@ -98,11 +98,6 @@ at::Tensor SigmoidAlphaBlendBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
  CHECK_CPU(distances);
  CHECK_CPU(pix_to_face);
  CHECK_CPU(alphas);
  CHECK_CPU(grad_alphas);
  return SigmoidAlphaBlendBackwardCpu(
      grad_alphas, alphas, distances, pix_to_face, sigma);
 }
--- a/pytorch3d/csrc/compositing/alpha_composite.cu
+++ b/pytorch3d/csrc/compositing/alpha_composite.cu
@@ -28,16 +28,17 @@ __global__ void alphaCompositeCudaForwardKernel(
    const at::PackedTensorAccessor64<float, 4, at::RestrictPtrTraits> alphas,
    const at::PackedTensorAccessor64<int64_t, 4, at::RestrictPtrTraits> points_idx) {
  // clang-format on
  const int64_t batch_size = result.size(0);
  const int64_t C = features.size(0);
  const int64_t H = points_idx.size(2);
  const int64_t W = points_idx.size(3);
  // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;
  const int num_pixels = C * H * W;
-  const auto num_threads = gridDim.y * blockDim.x;
+  const int num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;
  // Iterate over each feature in each pixel
  for (int pid = tid; pid < num_pixels; pid += num_threads) {
@@ -78,16 +79,17 @@ __global__ void alphaCompositeCudaBackwardKernel(
    const at::PackedTensorAccessor64<float, 4, at::RestrictPtrTraits> alphas,
    const at::PackedTensorAccessor64<int64_t, 4, at::RestrictPtrTraits> points_idx) {
  // clang-format on
  const int64_t batch_size = points_idx.size(0);
  const int64_t C = features.size(0);
  const int64_t H = points_idx.size(2);
  const int64_t W = points_idx.size(3);
  // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;
  const int num_pixels = C * H * W;
-  const auto num_threads = gridDim.y * blockDim.x;
+  const int num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;
  // Parallelize over each feature in each pixel in images of size H * W,
  // for each image in the batch of size batch_size
--- a/pytorch3d/csrc/compositing/alpha_composite.h
+++ b/pytorch3d/csrc/compositing/alpha_composite.h
@@ -74,9 +74,6 @@ torch::Tensor alphaCompositeForward(
    AT_ERROR("Not compiled with GPU support");
 #endif
  } else {
    CHECK_CPU(features);
    CHECK_CPU(alphas);
    CHECK_CPU(points_idx);
    return alphaCompositeCpuForward(features, alphas, points_idx);
  }
 }
@@ -104,11 +101,6 @@ std::tuple<torch::Tensor, torch::Tensor> alphaCompositeBackward(
    AT_ERROR("Not compiled with GPU support");
 #endif
  } else {
    CHECK_CPU(grad_outputs);
    CHECK_CPU(features);
    CHECK_CPU(alphas);
    CHECK_CPU(points_idx);
    return alphaCompositeCpuBackward(
        grad_outputs, features, alphas, points_idx);
  }
--- a/pytorch3d/csrc/compositing/norm_weighted_sum.cu
+++ b/pytorch3d/csrc/compositing/norm_weighted_sum.cu
@@ -28,16 +28,17 @@ __global__ void weightedSumNormCudaForwardKernel(
    const at::PackedTensorAccessor64<float, 4, at::RestrictPtrTraits> alphas,
    const at::PackedTensorAccessor64<int64_t, 4, at::RestrictPtrTraits> points_idx) {
  // clang-format on
  const int64_t batch_size = result.size(0);
  const int64_t C = features.size(0);
  const int64_t H = points_idx.size(2);
  const int64_t W = points_idx.size(3);
  // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;
  const int num_pixels = C * H * W;
-  const auto num_threads = gridDim.y * blockDim.x;
+  const int num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;
  // Parallelize over each feature in each pixel in images of size H * W,
  // for each image in the batch of size batch_size
@@ -91,16 +92,17 @@ __global__ void weightedSumNormCudaBackwardKernel(
    const at::PackedTensorAccessor64<float, 4, at::RestrictPtrTraits> alphas,
    const at::PackedTensorAccessor64<int64_t, 4, at::RestrictPtrTraits> points_idx) {
  // clang-format on
  const int64_t batch_size = points_idx.size(0);
  const int64_t C = features.size(0);
  const int64_t H = points_idx.size(2);
  const int64_t W = points_idx.size(3);
  // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;
  const int num_pixels = C * W * H;
-  const auto num_threads = gridDim.y * blockDim.x;
+  const int num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;
  // Parallelize over each feature in each pixel in images of size H * W,
  // for each image in the batch of size batch_size
--- a/pytorch3d/csrc/compositing/norm_weighted_sum.h
+++ b/pytorch3d/csrc/compositing/norm_weighted_sum.h
@@ -73,10 +73,6 @@ torch::Tensor weightedSumNormForward(
    AT_ERROR("Not compiled with GPU support");
 #endif
  } else {
    CHECK_CPU(features);
    CHECK_CPU(alphas);
    CHECK_CPU(points_idx);
    return weightedSumNormCpuForward(features, alphas, points_idx);
  }
 }
@@ -104,11 +100,6 @@ std::tuple<torch::Tensor, torch::Tensor> weightedSumNormBackward(
    AT_ERROR("Not compiled with GPU support");
 #endif
  } else {
    CHECK_CPU(grad_outputs);
    CHECK_CPU(features);
    CHECK_CPU(alphas);
    CHECK_CPU(points_idx);
    return weightedSumNormCpuBackward(
        grad_outputs, features, alphas, points_idx);
  }
--- a/pytorch3d/csrc/compositing/weighted_sum.cu
+++ b/pytorch3d/csrc/compositing/weighted_sum.cu
@@ -26,16 +26,17 @@ __global__ void weightedSumCudaForwardKernel(
    const at::PackedTensorAccessor64<float, 4, at::RestrictPtrTraits> alphas,
    const at::PackedTensorAccessor64<int64_t, 4, at::RestrictPtrTraits> points_idx) {
  // clang-format on
  const int64_t batch_size = result.size(0);
  const int64_t C = features.size(0);
  const int64_t H = points_idx.size(2);
  const int64_t W = points_idx.size(3);
  // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;
  const int num_pixels = C * H * W;
-  const auto num_threads = gridDim.y * blockDim.x;
+  const int num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;
  // Parallelize over each feature in each pixel in images of size H * W,
  // for each image in the batch of size batch_size
@@ -73,16 +74,17 @@ __global__ void weightedSumCudaBackwardKernel(
    const at::PackedTensorAccessor64<float, 4, at::RestrictPtrTraits> alphas,
    const at::PackedTensorAccessor64<int64_t, 4, at::RestrictPtrTraits> points_idx) {
  // clang-format on
  const int64_t batch_size = points_idx.size(0);
  const int64_t C = features.size(0);
  const int64_t H = points_idx.size(2);
  const int64_t W = points_idx.size(3);
  // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;
  const int num_pixels = C * H * W;
-  const auto num_threads = gridDim.y * blockDim.x;
+  const int num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;
  // Iterate over each pixel to compute the contribution to the
  // gradient for the features and weights
--- a/pytorch3d/csrc/compositing/weighted_sum.h
+++ b/pytorch3d/csrc/compositing/weighted_sum.h
@@ -72,9 +72,6 @@ torch::Tensor weightedSumForward(
    AT_ERROR("Not compiled with GPU support");
 #endif
  } else {
    CHECK_CPU(features);
    CHECK_CPU(alphas);
    CHECK_CPU(points_idx);
    return weightedSumCpuForward(features, alphas, points_idx);
  }
 }
@@ -101,11 +98,6 @@ std::tuple<torch::Tensor, torch::Tensor> weightedSumBackward(
    AT_ERROR("Not compiled with GPU support");
 #endif
  } else {
    CHECK_CPU(grad_outputs);
    CHECK_CPU(features);
    CHECK_CPU(alphas);
    CHECK_CPU(points_idx);
    return weightedSumCpuBackward(grad_outputs, features, alphas, points_idx);
  }
 }
--- a/pytorch3d/csrc/ext.cpp
+++ b/pytorch3d/csrc/ext.cpp
@@ -8,6 +8,7 @@
 // clang-format off
 #include "./pulsar/global.h" // Include before <torch/extension.h>.
 #include <torch/extension.h>
 // clang-format on
 #include "./pulsar/pytorch/renderer.h"
 #include "./pulsar/pytorch/tensor_util.h"
@@ -98,23 +99,21 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("marching_cubes", &MarchingCubes);
  // Pulsar.
  // Pulsar not enabled on AMD.
 #ifdef PULSAR_LOGGING_ENABLED
  c10::ShowLogInfoToStderr();
 #endif
  py::class_<
      pulsar::pytorch::Renderer,
      std::shared_ptr<pulsar::pytorch::Renderer>>(m, "PulsarRenderer")
-      .def(
+      .def(py::init<
-          py::init<
+           const uint&,
-              const uint&,
+           const uint&,
-              const uint&,
+           const uint&,
-              const uint&,
+           const bool&,
-              const bool&,
+           const bool&,
-              const bool&,
+           const float&,
-              const float&,
+           const uint&,
-              const uint&,
+           const uint&>())
              const uint&>())
      .def(
          "__eq__",
          [](const pulsar::pytorch::Renderer& a,
@@ -149,10 +148,10 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
          py::arg("gamma"),
          py::arg("max_depth"),
          py::arg("min_depth") /* = 0.f*/,
-          py::arg("bg_col") /* = std::nullopt not exposed properly in
+          py::arg(
-                               pytorch 1.1. */
+              "bg_col") /* = at::nullopt not exposed properly in pytorch 1.1. */
          ,
-          py::arg("opacity") /* = std::nullopt ... */,
+          py::arg("opacity") /* = at::nullopt ... */,
          py::arg("percent_allowed_difference") = 0.01f,
          py::arg("max_n_hits") = MAX_UINT,
          py::arg("mode") = 0)
--- a/pytorch3d/csrc/face_areas_normals/face_areas_normals.h
+++ b/pytorch3d/csrc/face_areas_normals/face_areas_normals.h
@@ -60,8 +60,6 @@ std::tuple<at::Tensor, at::Tensor> FaceAreasNormalsForward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
  CHECK_CPU(verts);
  CHECK_CPU(faces);
  return FaceAreasNormalsForwardCpu(verts, faces);
 }
@@ -82,9 +80,5 @@ at::Tensor FaceAreasNormalsBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
  CHECK_CPU(grad_areas);
  CHECK_CPU(grad_normals);
  CHECK_CPU(verts);
  CHECK_CPU(faces);
  return FaceAreasNormalsBackwardCpu(grad_areas, grad_normals, verts, faces);
 }
--- a/pytorch3d/csrc/gather_scatter/gather_scatter.cu
+++ b/pytorch3d/csrc/gather_scatter/gather_scatter.cu
@@ -20,14 +20,14 @@ __global__ void GatherScatterCudaKernel(
    const size_t V,
    const size_t D,
    const size_t E) {
-  const auto tid = threadIdx.x;
+  const int tid = threadIdx.x;
  // Reverse the vertex order if backward.
  const int v0_idx = backward ? 1 : 0;
  const int v1_idx = backward ? 0 : 1;
  // Edges are split evenly across the blocks.
-  for (auto e = blockIdx.x; e < E; e += gridDim.x) {
+  for (int e = blockIdx.x; e < E; e += gridDim.x) {
    // Get indices of vertices which form the edge.
    const int64_t v0 = edges[2 * e + v0_idx];
    const int64_t v1 = edges[2 * e + v1_idx];
@@ -35,7 +35,7 @@ __global__ void GatherScatterCudaKernel(
    // Split vertex features evenly across threads.
    // This implementation will be quite wasteful when D<128 since there will be
    // a lot of threads doing nothing.
-    for (auto d = tid; d < D; d += blockDim.x) {
+    for (int d = tid; d < D; d += blockDim.x) {
      const float val = input[v1 * D + d];
      float* address = output + v0 * D + d;
      atomicAdd(address, val);
--- a/pytorch3d/csrc/gather_scatter/gather_scatter.h
+++ b/pytorch3d/csrc/gather_scatter/gather_scatter.h
@@ -53,7 +53,5 @@ at::Tensor GatherScatter(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
  CHECK_CPU(input);
  CHECK_CPU(edges);
  return GatherScatterCpu(input, edges, directed, backward);
 }
--- a/pytorch3d/csrc/interp_face_attrs/interp_face_attrs.cu
+++ b/pytorch3d/csrc/interp_face_attrs/interp_face_attrs.cu
@@ -20,8 +20,8 @@ __global__ void InterpFaceAttrsForwardKernel(
    const size_t P,
    const size_t F,
    const size_t D) {
-  const auto tid = threadIdx.x + blockIdx.x * blockDim.x;
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  const auto num_threads = blockDim.x * gridDim.x;
+  const int num_threads = blockDim.x * gridDim.x;
  for (int pd = tid; pd < P * D; pd += num_threads) {
    const int p = pd / D;
    const int d = pd % D;
@@ -93,8 +93,8 @@ __global__ void InterpFaceAttrsBackwardKernel(
    const size_t P,
    const size_t F,
    const size_t D) {
-  const auto tid = threadIdx.x + blockIdx.x * blockDim.x;
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  const auto num_threads = blockDim.x * gridDim.x;
+  const int num_threads = blockDim.x * gridDim.x;
  for (int pd = tid; pd < P * D; pd += num_threads) {
    const int p = pd / D;
    const int d = pd % D;
--- a/pytorch3d/csrc/interp_face_attrs/interp_face_attrs.h
+++ b/pytorch3d/csrc/interp_face_attrs/interp_face_attrs.h
@@ -57,8 +57,6 @@ at::Tensor InterpFaceAttrsForward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
  CHECK_CPU(face_attrs);
  CHECK_CPU(barycentric_coords);
  return InterpFaceAttrsForwardCpu(pix_to_face, barycentric_coords, face_attrs);
 }
@@ -108,9 +106,6 @@ std::tuple<at::Tensor, at::Tensor> InterpFaceAttrsBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
  CHECK_CPU(face_attrs);
  CHECK_CPU(barycentric_coords);
  CHECK_CPU(grad_pix_attrs);
  return InterpFaceAttrsBackwardCpu(
      pix_to_face, barycentric_coords, face_attrs, grad_pix_attrs);
 }
--- a/pytorch3d/csrc/iou_box3d/iou_box3d.h
+++ b/pytorch3d/csrc/iou_box3d/iou_box3d.h
@@ -44,7 +44,5 @@ inline std::tuple<at::Tensor, at::Tensor> IoUBox3D(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
  CHECK_CPU(boxes1);
  CHECK_CPU(boxes2);
  return IoUBox3DCpu(boxes1.contiguous(), boxes2.contiguous());
 }
--- a/pytorch3d/csrc/iou_box3d/iou_box3d_cpu.cpp
+++ b/pytorch3d/csrc/iou_box3d/iou_box3d_cpu.cpp
@@ -7,7 +7,10 @@
 */
 #include <torch/extension.h>
 #include <torch/torch.h>
 #include <list>
 #include <numeric>
 #include <queue>
 #include <tuple>
 #include "iou_box3d/iou_utils.h"
--- a/pytorch3d/csrc/iou_box3d/iou_utils.cuh
+++ b/pytorch3d/csrc/iou_box3d/iou_utils.cuh
@@ -461,8 +461,10 @@ __device__ inline std::tuple<float3, float3> ArgMaxVerts(
 __device__ inline bool IsCoplanarTriTri(
    const FaceVerts& tri1,
    const FaceVerts& tri2) {
  const float3 tri1_ctr = FaceCenter({tri1.v0, tri1.v1, tri1.v2});
  const float3 tri1_n = FaceNormal({tri1.v0, tri1.v1, tri1.v2});
  const float3 tri2_ctr = FaceCenter({tri2.v0, tri2.v1, tri2.v2});
  const float3 tri2_n = FaceNormal({tri2.v0, tri2.v1, tri2.v2});
  // Check if parallel
@@ -498,6 +500,7 @@ __device__ inline bool IsCoplanarTriPlane(
    const FaceVerts& tri,
    const FaceVerts& plane,
    const float3& normal) {
  const float3 tri_ctr = FaceCenter({tri.v0, tri.v1, tri.v2});
  const float3 nt = FaceNormal({tri.v0, tri.v1, tri.v2});
  // check if parallel
@@ -725,7 +728,7 @@ __device__ inline int BoxIntersections(
      }
    }
    // Update the face_verts_out tris
-    num_tris = min(MAX_TRIS, offset);
+    num_tris = offset;
    for (int j = 0; j < num_tris; ++j) {
      face_verts_out[j] = tri_verts_updated[j];
    }
--- a/pytorch3d/csrc/knn/knn.h
+++ b/pytorch3d/csrc/knn/knn.h
@@ -74,8 +74,6 @@ std::tuple<at::Tensor, at::Tensor> KNearestNeighborIdx(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
  CHECK_CPU(p1);
  CHECK_CPU(p2);
  return KNearestNeighborIdxCpu(p1, p2, lengths1, lengths2, norm, K);
 }
@@ -142,8 +140,6 @@ std::tuple<at::Tensor, at::Tensor> KNearestNeighborBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
  CHECK_CPU(p1);
  CHECK_CPU(p2);
  return KNearestNeighborBackwardCpu(
      p1, p2, lengths1, lengths2, idxs, norm, grad_dists);
 }
--- a/pytorch3d/csrc/marching_cubes/marching_cubes.h
+++ b/pytorch3d/csrc/marching_cubes/marching_cubes.h
@@ -58,6 +58,5 @@ inline std::tuple<at::Tensor, at::Tensor, at::Tensor> MarchingCubes(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
  CHECK_CPU(vol);
  return MarchingCubesCpu(vol.contiguous(), isolevel);
 }
--- a/pytorch3d/csrc/packed_to_padded_tensor/packed_to_padded_tensor.h
+++ b/pytorch3d/csrc/packed_to_padded_tensor/packed_to_padded_tensor.h
@@ -88,8 +88,6 @@ at::Tensor PackedToPadded(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
  CHECK_CPU(inputs_packed);
  CHECK_CPU(first_idxs);
  return PackedToPaddedCpu(inputs_packed, first_idxs, max_size);
 }
@@ -107,7 +105,5 @@ at::Tensor PaddedToPacked(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
  CHECK_CPU(inputs_padded);
  CHECK_CPU(first_idxs);
  return PaddedToPackedCpu(inputs_padded, first_idxs, num_inputs);
 }
--- a/pytorch3d/csrc/point_mesh/point_mesh_cpu.cpp
+++ b/pytorch3d/csrc/point_mesh/point_mesh_cpu.cpp
@@ -174,8 +174,8 @@ std::tuple<at::Tensor, at::Tensor> HullHullDistanceForwardCpu(
  at::Tensor idxs = at::zeros({A_N,}, as_first_idx.options());
  // clang-format on
-  auto as_a = as.accessor<float, H1 == 1 ? 2 : 3>();
+  auto as_a = as.accessor < float, H1 == 1 ? 2 : 3 > ();
-  auto bs_a = bs.accessor<float, H2 == 1 ? 2 : 3>();
+  auto bs_a = bs.accessor < float, H2 == 1 ? 2 : 3 > ();
  auto as_first_idx_a = as_first_idx.accessor<int64_t, 1>();
  auto bs_first_idx_a = bs_first_idx.accessor<int64_t, 1>();
  auto dists_a = dists.accessor<float, 1>();
@@ -230,10 +230,10 @@ std::tuple<at::Tensor, at::Tensor> HullHullDistanceBackwardCpu(
  at::Tensor grad_as = at::zeros_like(as);
  at::Tensor grad_bs = at::zeros_like(bs);
-  auto as_a = as.accessor<float, H1 == 1 ? 2 : 3>();
+  auto as_a = as.accessor < float, H1 == 1 ? 2 : 3 > ();
-  auto bs_a = bs.accessor<float, H2 == 1 ? 2 : 3>();
+  auto bs_a = bs.accessor < float, H2 == 1 ? 2 : 3 > ();
-  auto grad_as_a = grad_as.accessor<float, H1 == 1 ? 2 : 3>();
+  auto grad_as_a = grad_as.accessor < float, H1 == 1 ? 2 : 3 > ();
-  auto grad_bs_a = grad_bs.accessor<float, H2 == 1 ? 2 : 3>();
+  auto grad_bs_a = grad_bs.accessor < float, H2 == 1 ? 2 : 3 > ();
  auto idx_bs_a = idx_bs.accessor<int64_t, 1>();
  auto grad_dists_a = grad_dists.accessor<float, 1>();
--- a/pytorch3d/csrc/point_mesh/point_mesh_cuda.cu
+++ b/pytorch3d/csrc/point_mesh/point_mesh_cuda.cu
@@ -110,7 +110,7 @@ __global__ void DistanceForwardKernel(
    __syncthreads();
    // Perform reduction in shared memory.
-    for (auto s = blockDim.x / 2; s > 32; s >>= 1) {
+    for (int s = blockDim.x / 2; s > 32; s >>= 1) {
      if (tid < s) {
        if (min_dists[tid] > min_dists[tid + s]) {
          min_dists[tid] = min_dists[tid + s];
@@ -502,8 +502,8 @@ __global__ void PointFaceArrayForwardKernel(
  const float3* tris_f3 = (float3*)tris;
  // Parallelize over P * S computations
-  const auto num_threads = gridDim.x * blockDim.x;
+  const int num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  for (int t_i = tid; t_i < P * T; t_i += num_threads) {
    const int t = t_i / P; // segment index.
@@ -576,8 +576,8 @@ __global__ void PointFaceArrayBackwardKernel(
  const float3* tris_f3 = (float3*)tris;
  // Parallelize over P * S computations
-  const auto num_threads = gridDim.x * blockDim.x;
+  const int num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  for (int t_i = tid; t_i < P * T; t_i += num_threads) {
    const int t = t_i / P; // triangle index.
@@ -683,8 +683,8 @@ __global__ void PointEdgeArrayForwardKernel(
  float3* segms_f3 = (float3*)segms;
  // Parallelize over P * S computations
-  const auto num_threads = gridDim.x * blockDim.x;
+  const int num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  for (int t_i = tid; t_i < P * S; t_i += num_threads) {
    const int s = t_i / P; // segment index.
@@ -752,8 +752,8 @@ __global__ void PointEdgeArrayBackwardKernel(
  float3* segms_f3 = (float3*)segms;
  // Parallelize over P * S computations
-  const auto num_threads = gridDim.x * blockDim.x;
+  const int num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  for (int t_i = tid; t_i < P * S; t_i += num_threads) {
    const int s = t_i / P; // segment index.
--- a/pytorch3d/csrc/point_mesh/point_mesh_cuda.h
+++ b/pytorch3d/csrc/point_mesh/point_mesh_cuda.h
@@ -88,10 +88,6 @@ std::tuple<torch::Tensor, torch::Tensor> PointFaceDistanceForward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
  CHECK_CPU(points);
  CHECK_CPU(points_first_idx);
  CHECK_CPU(tris);
  CHECK_CPU(tris_first_idx);
  return PointFaceDistanceForwardCpu(
      points, points_first_idx, tris, tris_first_idx, min_triangle_area);
 }
@@ -147,10 +143,6 @@ std::tuple<torch::Tensor, torch::Tensor> PointFaceDistanceBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
  CHECK_CPU(points);
  CHECK_CPU(tris);
  CHECK_CPU(idx_points);
  CHECK_CPU(grad_dists);
  return PointFaceDistanceBackwardCpu(
      points, tris, idx_points, grad_dists, min_triangle_area);
 }
@@ -229,10 +221,6 @@ std::tuple<torch::Tensor, torch::Tensor> FacePointDistanceForward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
  CHECK_CPU(points);
  CHECK_CPU(points_first_idx);
  CHECK_CPU(tris);
  CHECK_CPU(tris_first_idx);
  return FacePointDistanceForwardCpu(
      points, points_first_idx, tris, tris_first_idx, min_triangle_area);
 }
@@ -289,10 +277,6 @@ std::tuple<torch::Tensor, torch::Tensor> FacePointDistanceBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
  CHECK_CPU(points);
  CHECK_CPU(tris);
  CHECK_CPU(idx_tris);
  CHECK_CPU(grad_dists);
  return FacePointDistanceBackwardCpu(
      points, tris, idx_tris, grad_dists, min_triangle_area);
 }
@@ -362,10 +346,6 @@ std::tuple<torch::Tensor, torch::Tensor> PointEdgeDistanceForward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
  CHECK_CPU(points);
  CHECK_CPU(points_first_idx);
  CHECK_CPU(segms);
  CHECK_CPU(segms_first_idx);
  return PointEdgeDistanceForwardCpu(
      points, points_first_idx, segms, segms_first_idx, max_points);
 }
@@ -416,10 +396,6 @@ std::tuple<torch::Tensor, torch::Tensor> PointEdgeDistanceBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
  CHECK_CPU(points);
  CHECK_CPU(segms);
  CHECK_CPU(idx_points);
  CHECK_CPU(grad_dists);
  return PointEdgeDistanceBackwardCpu(points, segms, idx_points, grad_dists);
 }
@@ -488,10 +464,6 @@ std::tuple<torch::Tensor, torch::Tensor> EdgePointDistanceForward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
  CHECK_CPU(points);
  CHECK_CPU(points_first_idx);
  CHECK_CPU(segms);
  CHECK_CPU(segms_first_idx);
  return EdgePointDistanceForwardCpu(
      points, points_first_idx, segms, segms_first_idx, max_segms);
 }
@@ -542,10 +514,6 @@ std::tuple<torch::Tensor, torch::Tensor> EdgePointDistanceBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
  CHECK_CPU(points);
  CHECK_CPU(segms);
  CHECK_CPU(idx_segms);
  CHECK_CPU(grad_dists);
  return EdgePointDistanceBackwardCpu(points, segms, idx_segms, grad_dists);
 }
@@ -599,8 +567,6 @@ torch::Tensor PointFaceArrayDistanceForward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
  CHECK_CPU(points);
  CHECK_CPU(tris);
  return PointFaceArrayDistanceForwardCpu(points, tris, min_triangle_area);
 }
@@ -647,9 +613,6 @@ std::tuple<torch::Tensor, torch::Tensor> PointFaceArrayDistanceBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
  CHECK_CPU(points);
  CHECK_CPU(tris);
  CHECK_CPU(grad_dists);
  return PointFaceArrayDistanceBackwardCpu(
      points, tris, grad_dists, min_triangle_area);
 }
@@ -698,8 +661,6 @@ torch::Tensor PointEdgeArrayDistanceForward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
  CHECK_CPU(points);
  CHECK_CPU(segms);
  return PointEdgeArrayDistanceForwardCpu(points, segms);
 }
@@ -742,8 +703,5 @@ std::tuple<torch::Tensor, torch::Tensor> PointEdgeArrayDistanceBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
  CHECK_CPU(points);
  CHECK_CPU(segms);
  CHECK_CPU(grad_dists);
  return PointEdgeArrayDistanceBackwardCpu(points, segms, grad_dists);
 }
--- a/pytorch3d/csrc/points_to_volumes/points_to_volumes.h
+++ b/pytorch3d/csrc/points_to_volumes/points_to_volumes.h
@@ -104,12 +104,6 @@ inline void PointsToVolumesForward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
  CHECK_CPU(points_3d);
  CHECK_CPU(points_features);
  CHECK_CPU(volume_densities);
  CHECK_CPU(volume_features);
  CHECK_CPU(grid_sizes);
  CHECK_CPU(mask);
  PointsToVolumesForwardCpu(
      points_3d,
      points_features,
@@ -189,14 +183,6 @@ inline void PointsToVolumesBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
  CHECK_CPU(points_3d);
  CHECK_CPU(points_features);
  CHECK_CPU(grid_sizes);
  CHECK_CPU(mask);
  CHECK_CPU(grad_volume_densities);
  CHECK_CPU(grad_volume_features);
  CHECK_CPU(grad_points_3d);
  CHECK_CPU(grad_points_features);
  PointsToVolumesBackwardCpu(
      points_3d,
      points_features,
--- a/pytorch3d/csrc/points_to_volumes/points_to_volumes_cpu.cpp
+++ b/pytorch3d/csrc/points_to_volumes/points_to_volumes_cpu.cpp
@@ -8,7 +8,9 @@
 #include <torch/csrc/autograd/VariableTypeUtils.h>
 #include <torch/extension.h>
 #include <algorithm>
 #include <cmath>
 #include <thread>
 #include <vector>
 // In the x direction, the location {0, ..., grid_size_x - 1} correspond to
--- a/pytorch3d/csrc/pulsar/cuda/README.md
+++ b/pytorch3d/csrc/pulsar/cuda/README.md
--- a/pytorch3d/csrc/pulsar/cuda/commands.h
+++ b/pytorch3d/csrc/pulsar/cuda/commands.h
@@ -59,11 +59,6 @@ getLastCudaError(const char* errorMessage, const char* file, const int line) {
 #define SHARED __shared__
 #define ACTIVEMASK() __activemask()
 #define BALLOT(mask, val) __ballot_sync((mask), val)
 /* TODO (ROCM-6.2): None of the WARP_* are used anywhere and ROCM-6.2 natively
 * supports __shfl_*. Disabling until the move to ROCM-6.2.
 */
 #if !defined(USE_ROCM)
 /**
 * Find the cumulative sum within a warp up to the current
 * thread lane, with each mask thread contributing base.
@@ -120,7 +115,6 @@ INLINE DEVICE float3 WARP_SUM_FLOAT3(
  ret.z = WARP_SUM(group, mask, base.z);
  return ret;
 }
 #endif //! USE_ROCM
 // Floating point.
 // #define FMUL(a, b) __fmul_rn((a), (b))
@@ -148,7 +142,6 @@ INLINE DEVICE float3 WARP_SUM_FLOAT3(
 #define FMA(x, y, z) __fmaf_rn((x), (y), (z))
 #define I2F(a) __int2float_rn(a)
 #define FRCP(x) __frcp_rn(x)
 #if !defined(USE_ROCM)
 __device__ static float atomicMax(float* address, float val) {
  int* address_as_i = (int*)address;
  int old = *address_as_i, assumed;
@@ -173,7 +166,6 @@ __device__ static float atomicMin(float* address, float val) {
  } while (assumed != old);
  return __int_as_float(old);
 }
 #endif //! USE_ROCM
 #define DMAX(a, b) FMAX(a, b)
 #define DMIN(a, b) FMIN(a, b)
 #define DSQRT(a) sqrt(a)
@@ -417,7 +409,7 @@ __device__ static float atomicMin(float* address, float val) {
      (OUT_PTR),              \
      (NUM_SELECTED_PTR),     \
      (NUM_ITEMS),            \
-      (STREAM));
+      stream = (STREAM));
 #define COPY_HOST_DEV(PTR_D, PTR_H, TYPE, SIZE) \
  HANDLECUDA(cudaMemcpy(                        \
--- a/pytorch3d/csrc/pulsar/cuda/renderer.backward.gpu.cu
+++ b/pytorch3d/csrc/pulsar/cuda/renderer.backward.gpu.cu
--- a/pytorch3d/csrc/pulsar/cuda/renderer.backward_dbg.gpu.cu
+++ b/pytorch3d/csrc/pulsar/cuda/renderer.backward_dbg.gpu.cu
--- a/pytorch3d/csrc/pulsar/cuda/renderer.calc_gradients.gpu.cu
+++ b/pytorch3d/csrc/pulsar/cuda/renderer.calc_gradients.gpu.cu
--- a/pytorch3d/csrc/pulsar/cuda/renderer.calc_signature.gpu.cu
+++ b/pytorch3d/csrc/pulsar/cuda/renderer.calc_signature.gpu.cu
--- a/pytorch3d/csrc/pulsar/cuda/renderer.construct.gpu.cu
+++ b/pytorch3d/csrc/pulsar/cuda/renderer.construct.gpu.cu
--- a/pytorch3d/csrc/pulsar/cuda/renderer.create_selector.gpu.cu
+++ b/pytorch3d/csrc/pulsar/cuda/renderer.create_selector.gpu.cu
--- a/pytorch3d/csrc/pulsar/cuda/renderer.destruct.gpu.cu
+++ b/pytorch3d/csrc/pulsar/cuda/renderer.destruct.gpu.cu
--- a/pytorch3d/csrc/pulsar/cuda/renderer.fill_bg.gpu.cu
+++ b/pytorch3d/csrc/pulsar/cuda/renderer.fill_bg.gpu.cu
--- a/pytorch3d/csrc/pulsar/cuda/renderer.forward.gpu.cu
+++ b/pytorch3d/csrc/pulsar/cuda/renderer.forward.gpu.cu
--- a/pytorch3d/csrc/pulsar/cuda/renderer.norm_cam_gradients.gpu.cu
+++ b/pytorch3d/csrc/pulsar/cuda/renderer.norm_cam_gradients.gpu.cu
--- a/pytorch3d/csrc/pulsar/cuda/renderer.norm_sphere_gradients.gpu.cu
+++ b/pytorch3d/csrc/pulsar/cuda/renderer.norm_sphere_gradients.gpu.cu
--- a/pytorch3d/csrc/pulsar/cuda/renderer.render.gpu.cu
+++ b/pytorch3d/csrc/pulsar/cuda/renderer.render.gpu.cu
--- a/pytorch3d/csrc/pulsar/global.h
+++ b/pytorch3d/csrc/pulsar/global.h
@@ -15,8 +15,8 @@
 #endif
 #if defined(_WIN64) || defined(_WIN32)
-using uint = unsigned int;
+#define uint unsigned int
-using ushort = unsigned short;
+#define ushort unsigned short
 #endif
 #include "./logging.h" // <- include before torch/extension.h
@@ -36,13 +36,11 @@ using ushort = unsigned short;
 #pragma nv_diag_suppress 2951
 #pragma nv_diag_suppress 2967
 #else
 #if !defined(USE_ROCM)
 #pragma diag_suppress = attribute_not_allowed
 #pragma diag_suppress = 1866
 #pragma diag_suppress = 2941
 #pragma diag_suppress = 2951
 #pragma diag_suppress = 2967
 #endif //! USE_ROCM
 #endif
 #else // __CUDACC__
 #define INLINE inline
@@ -58,9 +56,7 @@ using ushort = unsigned short;
 #pragma clang diagnostic pop
 #ifdef WITH_CUDA
 #include <ATen/cuda/CUDAContext.h>
 #if !defined(USE_ROCM)
 #include <vector_functions.h>
 #endif //! USE_ROCM
 #else
 #ifndef cudaStream_t
 typedef void* cudaStream_t;
--- a/pytorch3d/csrc/pulsar/host/commands.h
+++ b/pytorch3d/csrc/pulsar/host/commands.h
@@ -357,11 +357,11 @@ void MAX_WS(
 //
 //
 #define END_PARALLEL() \
-  end_parallel:;       \
+  end_parallel :;      \
  }
 #define END_PARALLEL_NORET() }
 #define END_PARALLEL_2D() \
-  end_parallel:;          \
+  end_parallel :;         \
  }                       \
  }
 #define END_PARALLEL_2D_NORET() \
--- a/pytorch3d/csrc/pulsar/include/camera.device.h
+++ b/pytorch3d/csrc/pulsar/include/camera.device.h
@@ -14,7 +14,7 @@
 #include "./commands.h"
 namespace pulsar {
-IHD CamGradInfo::CamGradInfo(int x) {
+IHD CamGradInfo::CamGradInfo() {
  cam_pos = make_float3(0.f, 0.f, 0.f);
  pixel_0_0_center = make_float3(0.f, 0.f, 0.f);
  pixel_dir_x = make_float3(0.f, 0.f, 0.f);
--- a/pytorch3d/csrc/pulsar/include/camera.h
+++ b/pytorch3d/csrc/pulsar/include/camera.h
@@ -63,13 +63,18 @@ inline bool operator==(const CamInfo& a, const CamInfo& b) {
 };
 struct CamGradInfo {
-  HOST DEVICE CamGradInfo(int = 0);
+  HOST DEVICE CamGradInfo();
  float3 cam_pos;
  float3 pixel_0_0_center;
  float3 pixel_dir_x;
  float3 pixel_dir_y;
 };
 // TODO: remove once https://github.com/NVlabs/cub/issues/172 is resolved.
 struct IntWrapper {
  int val;
 };
 } // namespace pulsar
 #endif
--- a/pytorch3d/csrc/pulsar/include/commands.h
+++ b/pytorch3d/csrc/pulsar/include/commands.h
@@ -24,7 +24,7 @@
 // #pragma diag_suppress = 68
 #include <ATen/cuda/CUDAContext.h>
 // #pragma pop
-#include "../gpu/commands.h"
+#include "../cuda/commands.h"
 #else
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Weverything"
--- a/pytorch3d/csrc/pulsar/include/math.h
+++ b/pytorch3d/csrc/pulsar/include/math.h
@@ -46,7 +46,6 @@ IHD float3 outer_product_sum(const float3& a) {
 }
 // TODO: put intrinsics here.
 #if !defined(USE_ROCM)
 IHD float3 operator+(const float3& a, const float3& b) {
  return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
 }
@@ -94,7 +93,6 @@ IHD float3 operator*(const float3& a, const float3& b) {
 IHD float3 operator*(const float& a, const float3& b) {
  return b * a;
 }
 #endif //! USE_ROCM
 INLINE DEVICE float length(const float3& v) {
  // TODO: benchmark what's faster.
@@ -149,6 +147,11 @@ IHD CamGradInfo operator*(const CamGradInfo& a, const float& b) {
  return res;
 }
 IHD IntWrapper operator+(const IntWrapper& a, const IntWrapper& b) {
  IntWrapper res;
  res.val = a.val + b.val;
  return res;
 }
 } // namespace pulsar
 #endif
--- a/pytorch3d/csrc/pulsar/include/renderer.backward.device.h
+++ b/pytorch3d/csrc/pulsar/include/renderer.backward.device.h
@@ -155,8 +155,8 @@ void backward(
        stream);
    CHECKLAUNCH();
    SUM_WS(
-        self->ids_sorted_d,
+        (IntWrapper*)(self->ids_sorted_d),
-        self->n_grad_contributions_d,
+        (IntWrapper*)(self->n_grad_contributions_d),
        static_cast<int>(num_balls),
        self->workspace_d,
        self->workspace_size,
--- a/pytorch3d/csrc/pulsar/include/renderer.construct.device.h
+++ b/pytorch3d/csrc/pulsar/include/renderer.construct.device.h
@@ -52,7 +52,7 @@ HOST void construct(
  self->cam.film_width = width;
  self->cam.film_height = height;
  self->max_num_balls = max_num_balls;
-  MALLOC(self->result_d, float, width * height * n_channels);
+  MALLOC(self->result_d, float, width* height* n_channels);
  self->cam.orthogonal_projection = orthogonal_projection;
  self->cam.right_handed = right_handed_system;
  self->cam.background_normalization_depth = background_normalization_depth;
@@ -93,7 +93,7 @@ HOST void construct(
  MALLOC(self->di_sorted_d, DrawInfo, max_num_balls);
  MALLOC(self->region_flags_d, char, max_num_balls);
  MALLOC(self->num_selected_d, size_t, 1);
-  MALLOC(self->forw_info_d, float, width * height * (3 + 2 * n_track));
+  MALLOC(self->forw_info_d, float, width* height * (3 + 2 * n_track));
  MALLOC(self->min_max_pixels_d, IntersectInfo, 1);
  MALLOC(self->grad_pos_d, float3, max_num_balls);
  MALLOC(self->grad_col_d, float, max_num_balls* n_channels);
--- a/pytorch3d/csrc/pulsar/include/renderer.h
+++ b/pytorch3d/csrc/pulsar/include/renderer.h
@@ -255,7 +255,7 @@ GLOBAL void calc_signature(
 * for every iteration through the loading loop every thread could add a
 * 'hit' to the buffer.
 */
-#define RENDER_BUFFER_SIZE RENDER_BLOCK_SIZE * RENDER_BLOCK_SIZE * 2
+#define RENDER_BUFFER_SIZE RENDER_BLOCK_SIZE* RENDER_BLOCK_SIZE * 2
 /**
 * The threshold after which the spheres that are in the render buffer
 * are rendered and the buffer is flushed.
--- a/pytorch3d/csrc/pulsar/include/renderer.render.device.h
+++ b/pytorch3d/csrc/pulsar/include/renderer.render.device.h
@@ -283,15 +283,9 @@ GLOBAL void render(
          (percent_allowed_difference > 0.f &&
           max_closest_possible_intersection > depth_threshold) ||
          tracker.get_n_hits() >= max_n_hits;
 #if defined(__CUDACC__) && defined(__HIP_PLATFORM_AMD__)
      unsigned long long warp_done = __ballot(done);
      int warp_done_bit_cnt = __popcll(warp_done);
 #else
      uint warp_done = thread_warp.ballot(done);
      int warp_done_bit_cnt = POPC(warp_done);
 #endif //__CUDACC__ && __HIP_PLATFORM_AMD__
      if (thread_warp.thread_rank() == 0)
-        ATOMICADD_B(&n_pixels_done, warp_done_bit_cnt);
+        ATOMICADD_B(&n_pixels_done, POPC(warp_done));
      // This sync is necessary to keep n_loaded until all threads are done with
      // painting.
      thread_block.sync();
--- a/pytorch3d/csrc/pulsar/pytorch/renderer.cpp
+++ b/pytorch3d/csrc/pulsar/pytorch/renderer.cpp
@@ -213,8 +213,8 @@ std::tuple<size_t, size_t, bool, torch::Tensor> Renderer::arg_check(
    const float& gamma,
    const float& max_depth,
    float& min_depth,
-    const std::optional<torch::Tensor>& bg_col,
+    const c10::optional<torch::Tensor>& bg_col,
-    const std::optional<torch::Tensor>& opacity,
+    const c10::optional<torch::Tensor>& opacity,
    const float& percent_allowed_difference,
    const uint& max_n_hits,
    const uint& mode) {
@@ -668,8 +668,8 @@ std::tuple<torch::Tensor, torch::Tensor> Renderer::forward(
    const float& gamma,
    const float& max_depth,
    float min_depth,
-    const std::optional<torch::Tensor>& bg_col,
+    const c10::optional<torch::Tensor>& bg_col,
-    const std::optional<torch::Tensor>& opacity,
+    const c10::optional<torch::Tensor>& opacity,
    const float& percent_allowed_difference,
    const uint& max_n_hits,
    const uint& mode) {
@@ -888,14 +888,14 @@ std::tuple<torch::Tensor, torch::Tensor> Renderer::forward(
 };
 std::tuple<
-    std::optional<torch::Tensor>,
+    at::optional<torch::Tensor>,
-    std::optional<torch::Tensor>,
+    at::optional<torch::Tensor>,
-    std::optional<torch::Tensor>,
+    at::optional<torch::Tensor>,
-    std::optional<torch::Tensor>,
+    at::optional<torch::Tensor>,
-    std::optional<torch::Tensor>,
+    at::optional<torch::Tensor>,
-    std::optional<torch::Tensor>,
+    at::optional<torch::Tensor>,
-    std::optional<torch::Tensor>,
+    at::optional<torch::Tensor>,
-    std::optional<torch::Tensor>>
+    at::optional<torch::Tensor>>
 Renderer::backward(
    const torch::Tensor& grad_im,
    const torch::Tensor& image,
@@ -912,8 +912,8 @@ Renderer::backward(
    const float& gamma,
    const float& max_depth,
    float min_depth,
-    const std::optional<torch::Tensor>& bg_col,
+    const c10::optional<torch::Tensor>& bg_col,
-    const std::optional<torch::Tensor>& opacity,
+    const c10::optional<torch::Tensor>& opacity,
    const float& percent_allowed_difference,
    const uint& max_n_hits,
    const uint& mode,
@@ -922,7 +922,7 @@ Renderer::backward(
    const bool& dif_rad,
    const bool& dif_cam,
    const bool& dif_opy,
-    const std::optional<std::pair<uint, uint>>& dbg_pos) {
+    const at::optional<std::pair<uint, uint>>& dbg_pos) {
  this->ensure_on_device(this->device_tracker.device());
  size_t batch_size;
  size_t n_points;
@@ -1045,14 +1045,14 @@ Renderer::backward(
  }
  // Prepare the return value.
  std::tuple<
-      std::optional<torch::Tensor>,
+      at::optional<torch::Tensor>,
-      std::optional<torch::Tensor>,
+      at::optional<torch::Tensor>,
-      std::optional<torch::Tensor>,
+      at::optional<torch::Tensor>,
-      std::optional<torch::Tensor>,
+      at::optional<torch::Tensor>,
-      std::optional<torch::Tensor>,
+      at::optional<torch::Tensor>,
-      std::optional<torch::Tensor>,
+      at::optional<torch::Tensor>,
-      std::optional<torch::Tensor>,
+      at::optional<torch::Tensor>,
-      std::optional<torch::Tensor>>
+      at::optional<torch::Tensor>>
      ret;
  if (mode == 1 || (!dif_pos && !dif_col && !dif_rad && !dif_cam && !dif_opy)) {
    return ret;
--- a/pytorch3d/csrc/pulsar/pytorch/renderer.h
+++ b/pytorch3d/csrc/pulsar/pytorch/renderer.h
@@ -44,21 +44,21 @@ struct Renderer {
      const float& gamma,
      const float& max_depth,
      float min_depth,
-      const std::optional<torch::Tensor>& bg_col,
+      const c10::optional<torch::Tensor>& bg_col,
-      const std::optional<torch::Tensor>& opacity,
+      const c10::optional<torch::Tensor>& opacity,
      const float& percent_allowed_difference,
      const uint& max_n_hits,
      const uint& mode);
  std::tuple<
-      std::optional<torch::Tensor>,
+      at::optional<torch::Tensor>,
-      std::optional<torch::Tensor>,
+      at::optional<torch::Tensor>,
-      std::optional<torch::Tensor>,
+      at::optional<torch::Tensor>,
-      std::optional<torch::Tensor>,
+      at::optional<torch::Tensor>,
-      std::optional<torch::Tensor>,
+      at::optional<torch::Tensor>,
-      std::optional<torch::Tensor>,
+      at::optional<torch::Tensor>,
-      std::optional<torch::Tensor>,
+      at::optional<torch::Tensor>,
-      std::optional<torch::Tensor>>
+      at::optional<torch::Tensor>>
  backward(
      const torch::Tensor& grad_im,
      const torch::Tensor& image,
@@ -75,8 +75,8 @@ struct Renderer {
      const float& gamma,
      const float& max_depth,
      float min_depth,
-      const std::optional<torch::Tensor>& bg_col,
+      const c10::optional<torch::Tensor>& bg_col,
-      const std::optional<torch::Tensor>& opacity,
+      const c10::optional<torch::Tensor>& opacity,
      const float& percent_allowed_difference,
      const uint& max_n_hits,
      const uint& mode,
@@ -85,7 +85,7 @@ struct Renderer {
      const bool& dif_rad,
      const bool& dif_cam,
      const bool& dif_opy,
-      const std::optional<std::pair<uint, uint>>& dbg_pos);
+      const at::optional<std::pair<uint, uint>>& dbg_pos);
  // Infrastructure.
  /**
@@ -115,8 +115,8 @@ struct Renderer {
      const float& gamma,
      const float& max_depth,
      float& min_depth,
-      const std::optional<torch::Tensor>& bg_col,
+      const c10::optional<torch::Tensor>& bg_col,
-      const std::optional<torch::Tensor>& opacity,
+      const c10::optional<torch::Tensor>& opacity,
      const float& percent_allowed_difference,
      const uint& max_n_hits,
      const uint& mode);
--- a/pytorch3d/csrc/pulsar/pytorch/tensor_util.cpp
+++ b/pytorch3d/csrc/pulsar/pytorch/tensor_util.cpp
@@ -8,7 +8,6 @@
 #ifdef WITH_CUDA
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAException.h>
 #include <cuda_runtime_api.h>
 #endif
 #include <torch/extension.h>
@@ -34,13 +33,13 @@ torch::Tensor sphere_ids_from_result_info_nograd(
          .contiguous();
  if (forw_info.device().type() == c10::DeviceType::CUDA) {
 #ifdef WITH_CUDA
-    C10_CUDA_CHECK(cudaMemcpyAsync(
+    cudaMemcpyAsync(
        result.data_ptr(),
        tmp.data_ptr(),
        sizeof(uint32_t) * tmp.size(0) * tmp.size(1) * tmp.size(2) *
            tmp.size(3),
        cudaMemcpyDeviceToDevice,
-        at::cuda::getCurrentCUDAStream()));
+        at::cuda::getCurrentCUDAStream());
 #else
    throw std::runtime_error(
        "Copy on CUDA device initiated but built "
--- a/pytorch3d/csrc/pulsar/pytorch/util.cpp
+++ b/pytorch3d/csrc/pulsar/pytorch/util.cpp
@@ -7,7 +7,6 @@
 */
 #ifdef WITH_CUDA
 #include <c10/cuda/CUDAException.h>
 #include <cuda_runtime_api.h>
 namespace pulsar {
@@ -18,8 +17,7 @@ void cudaDevToDev(
    const void* src,
    const int& size,
    const cudaStream_t& stream) {
-  C10_CUDA_CHECK(
+  cudaMemcpyAsync(trg, src, size, cudaMemcpyDeviceToDevice, stream);
      cudaMemcpyAsync(trg, src, size, cudaMemcpyDeviceToDevice, stream));
 }
 void cudaDevToHost(
@@ -27,8 +25,7 @@ void cudaDevToHost(
    const void* src,
    const int& size,
    const cudaStream_t& stream) {
-  C10_CUDA_CHECK(
+  cudaMemcpyAsync(trg, src, size, cudaMemcpyDeviceToHost, stream);
      cudaMemcpyAsync(trg, src, size, cudaMemcpyDeviceToHost, stream));
 }
 } // namespace pytorch
--- a/pytorch3d/csrc/pulsar/warnings.cpp
+++ b/pytorch3d/csrc/pulsar/warnings.cpp
@@ -6,6 +6,9 @@
 * LICENSE file in the root directory of this source tree.
 */
 #include "./global.h"
 #include "./logging.h"
 /**
 * A compilation unit to provide warnings about the code and avoid
 * repeated messages.
--- a/pytorch3d/csrc/rasterize_coarse/bitmask.cuh
+++ b/pytorch3d/csrc/rasterize_coarse/bitmask.cuh
@@ -25,7 +25,7 @@ class BitMask {
  // Use all threads in the current block to clear all bits of this BitMask
  __device__ void block_clear() {
-    for (auto i = threadIdx.x; i < H * W * D; i += blockDim.x) {
+    for (int i = threadIdx.x; i < H * W * D; i += blockDim.x) {
      data[i] = 0;
    }
    __syncthreads();
--- a/Show More
+++ b/Show More
`@@ -6,4 +6,4 @@`

	`# pyre-unsafe`	`# pyre-unsafe`

	`__version__ = "0.7.9"`	`__version__ = "0.7.6"`