Initial commit

fbshipit-source-id: ad58e416e3ceeca85fae0583308968d04e78fe0d
2025-11-04 09:52:11 +08:00 · 2020-01-23 11:53:41 -08:00 · 2020-01-23 11:53:41 -08:00 · dbf06b504b
commit dbf06b504b
211 changed files with 47362 additions and 0 deletions
--- a/.circleci/check.sh
+++ b/.circleci/check.sh
@ -0,0 +1,6 @@
+#!/bin/bash -e
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+
+# Run this script before committing config.yml to verify it is valid yaml.
+# config.yml is opened by a relative path, so run this from the directory
+# that contains it. "OK" is printed only when the file parses.
+
+python -c 'import yaml; yaml.safe_load(open("config.yml"))' && echo OK
--- a/.circleci/config.in.yml
+++ b/.circleci/config.in.yml
@ -0,0 +1,199 @@
+version: 2.1
+
+#examples:
+#https://github.com/facebookresearch/ParlAI/blob/master/.circleci/config.yml
+#https://github.com/facebookresearch/hydra/blob/master/.circleci/config.yml
+#https://github.com/facebookresearch/habitat-api/blob/master/.circleci/config.yml
+
+#drive tests with nox or tox or pytest?
+
+# -------------------------------------------------------------------------------------
+# environments where we run our jobs
+# -------------------------------------------------------------------------------------
+
+
+setupcuda: &setupcuda
+  run:
+    name: Setup CUDA
+    working_directory: ~/
+    command: |
+      # download and install nvidia drivers, cuda, etc
+      wget --no-verbose --no-clobber -P ~/nvidia-downloads 'https://s3.amazonaws.com/ossci-linux/nvidia_driver/NVIDIA-Linux-x86_64-430.40.run'
+      wget --no-verbose --no-clobber -P ~/nvidia-downloads http://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/cuda_10.2.89_440.33.01_linux.run
+      sudo /bin/bash ~/nvidia-downloads/NVIDIA-Linux-x86_64-430.40.run --no-drm -q --ui=none
+      sudo sh ~/nvidia-downloads/cuda_10.2.89_440.33.01_linux.run --silent
+      echo "Done installing CUDA."
+      pyenv versions
+      nvidia-smi
+      pyenv global 3.7.0
+
+gpu: &gpu
+  environment:
+    CUDA_VERSION: "10.2"
+  machine:
+    image: default
+  resource_class: gpu.medium # tesla m60
+
+binary_common: &binary_common
+  parameters:
+    # Edit these defaults to do a release
+    build_version:
+      description: "version number of release binary; by default, build a nightly"
+      type: string
+      default: ""
+    pytorch_version:
+      description: "PyTorch version to build against; by default, use a nightly"
+      type: string
+      default: ""
+    # Don't edit these
+    python_version:
+      description: "Python version to build against (e.g., 3.7)"
+      type: string
+    cu_version:
+      description: "CUDA version to build against, in CU format (e.g., cpu or cu100)"
+      type: string
+    wheel_docker_image:
+      description: "Wheel only: what docker image to use"
+      type: string
+      default: "pytorch/manylinux-cuda101"
+  environment:
+    PYTHON_VERSION: << parameters.python_version >>
+    BUILD_VERSION: << parameters.build_version >>
+    PYTORCH_VERSION: << parameters.pytorch_version >>
+    CU_VERSION: << parameters.cu_version >>
+
+jobs:
+  # Build the extension in place, run the unit tests and build a wheel on a
+  # GPU machine.
+  main:
+    <<: *gpu
+    # This explicit machine image takes precedence over the one merged in
+    # from the gpu anchor.
+    machine:
+      image: ubuntu-1604:201903-01
+    steps:
+      - checkout
+      - <<: *setupcuda
+      - run: pip3 install --progress-bar off wheel matplotlib 'pillow<7'
+      - run: pip3 install --progress-bar off torch torchvision
+      # - run: conda create -p ~/conda_env python=3.7 numpy
+      # - run: conda activate ~/conda_env
+      # - run: conda install -c pytorch pytorch torchvision
+
+      - run: pip3 install --progress-bar off 'git+https://github.com/facebookresearch/fvcore'
+      # Append the CUDA 10.2 libraries to any existing library search path.
+      - run: LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-10.2/lib64 python3 setup.py build_ext --inplace
+      - run: LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-10.2/lib64 python -m unittest discover -v -s tests
+      - run: python3 setup.py bdist_wheel
+
+  binary_linux_wheel:
+    <<: *binary_common
+    docker:
+      - image: << parameters.wheel_docker_image >>
+    resource_class: 2xlarge+
+    steps:
+      - checkout
+      - run: packaging/build_wheel.sh
+      - store_artifacts:
+          path: dist
+      - persist_to_workspace:
+          root: dist
+          paths:
+            - "*"
+
+  binary_linux_conda:
+    <<: *binary_common
+    docker:
+      - image: "pytorch/conda-cuda"
+    resource_class: 2xlarge+
+    steps:
+      - checkout
+      # This is building with cuda but no gpu present,
+      # so we aren't running the tests.
+      - run: TEST_FLAG=--no-test packaging/build_conda.sh
+      - store_artifacts:
+          path: /opt/conda/conda-bld/linux-64
+      - persist_to_workspace:
+          root: /opt/conda/conda-bld/linux-64
+          paths:
+            - "*"
+
+  binary_linux_conda_cuda:
+    <<: *binary_common
+    machine:
+      image: ubuntu-1604:201903-01
+    resource_class: gpu.medium
+    steps:
+    - checkout
+    - run:
+        name: Setup environment
+        command: |
+          set -e
+
+          curl -L https://packagecloud.io/circleci/trusty/gpgkey | sudo apt-key add -
+          curl -L https://dl.google.com/linux/linux_signing_key.pub | sudo apt-key add -
+
+          sudo apt-get update
+
+          sudo apt-get install \
+              apt-transport-https \
+              ca-certificates \
+              curl \
+              gnupg-agent \
+              software-properties-common
+
+          curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
+
+          sudo add-apt-repository \
+             "deb [arch=amd64] https://download.docker.com/linux/ubuntu \
+             $(lsb_release -cs) \
+             stable"
+
+          sudo apt-get update
+          export DOCKER_VERSION="5:19.03.2~3-0~ubuntu-xenial"
+          sudo apt-get install docker-ce=${DOCKER_VERSION} docker-ce-cli=${DOCKER_VERSION} containerd.io=1.2.6-3
+
+          # Add the package repositories
+          distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
+          curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
+          curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
+
+          export NVIDIA_CONTAINER_VERSION="1.0.3-1"
+          sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit=${NVIDIA_CONTAINER_VERSION}
+          sudo systemctl restart docker
+
+          DRIVER_FN="NVIDIA-Linux-x86_64-410.104.run"
+          wget "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN"
+          sudo /bin/bash "$DRIVER_FN" -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false)
+          nvidia-smi
+
+    - run:
+        name: Pull docker image
+        command: |
+          set -e
+          export DOCKER_IMAGE=pytorch/conda-cuda
+          echo Pulling docker image $DOCKER_IMAGE
+          docker pull $DOCKER_IMAGE >/dev/null
+
+    - run:
+        name: Build and run tests
+        command: |
+          set -e
+
+          cd ${HOME}/project/
+
+          export DOCKER_IMAGE=pytorch/conda-cuda
+          export VARS_TO_PASS="-e PYTHON_VERSION -e BUILD_VERSION -e PYTORCH_VERSION -e UNICODE_ABI -e CU_VERSION"
+
+          docker run --gpus all  --ipc=host -v $(pwd):/remote -w /remote ${VARS_TO_PASS} ${DOCKER_IMAGE} ./packaging/build_conda.sh
+
+workflows:
+  version: 2
+  build_and_test:
+    jobs:
+      - main
+      {{workflows()}}
+      - binary_linux_conda:
+          cu_version: cu101
+          name: binary_linux_conda_py3.7_cu101
+          python_version: '3.7'
+      - binary_linux_conda_cuda:
+          name: testrun_conda_cuda_py3.7_cu100
+          python_version: "3.7"
+          pytorch_version: "1.4"
+          cu_version: "cu100"
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -0,0 +1,258 @@
+version: 2.1
+
+#examples:
+#https://github.com/facebookresearch/ParlAI/blob/master/.circleci/config.yml
+#https://github.com/facebookresearch/hydra/blob/master/.circleci/config.yml
+#https://github.com/facebookresearch/habitat-api/blob/master/.circleci/config.yml
+
+#drive tests with nox or tox or pytest?
+
+# -------------------------------------------------------------------------------------
+# environments where we run our jobs
+# -------------------------------------------------------------------------------------
+
+
+setupcuda: &setupcuda
+  run:
+    name: Setup CUDA
+    working_directory: ~/
+    command: |
+      # download and install nvidia drivers, cuda, etc
+      wget --no-verbose --no-clobber -P ~/nvidia-downloads 'https://s3.amazonaws.com/ossci-linux/nvidia_driver/NVIDIA-Linux-x86_64-430.40.run'
+      wget --no-verbose --no-clobber -P ~/nvidia-downloads http://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/cuda_10.2.89_440.33.01_linux.run
+      sudo /bin/bash ~/nvidia-downloads/NVIDIA-Linux-x86_64-430.40.run --no-drm -q --ui=none
+      sudo sh ~/nvidia-downloads/cuda_10.2.89_440.33.01_linux.run --silent
+      echo "Done installing CUDA."
+      pyenv versions
+      nvidia-smi
+      pyenv global 3.7.0
+
+gpu: &gpu
+  environment:
+    CUDA_VERSION: "10.2"
+  machine:
+    image: default
+  resource_class: gpu.medium # tesla m60
+
+binary_common: &binary_common
+  parameters:
+    # Edit these defaults to do a release
+    build_version:
+      description: "version number of release binary; by default, build a nightly"
+      type: string
+      default: ""
+    pytorch_version:
+      description: "PyTorch version to build against; by default, use a nightly"
+      type: string
+      default: ""
+    # Don't edit these
+    python_version:
+      description: "Python version to build against (e.g., 3.7)"
+      type: string
+    cu_version:
+      description: "CUDA version to build against, in CU format (e.g., cpu or cu100)"
+      type: string
+    wheel_docker_image:
+      description: "Wheel only: what docker image to use"
+      type: string
+      default: "pytorch/manylinux-cuda101"
+  environment:
+    PYTHON_VERSION: << parameters.python_version >>
+    BUILD_VERSION: << parameters.build_version >>
+    PYTORCH_VERSION: << parameters.pytorch_version >>
+    CU_VERSION: << parameters.cu_version >>
+
+jobs:
+  # Build the extension in place, run the unit tests and build a wheel on a
+  # GPU machine.
+  main:
+    <<: *gpu
+    # This explicit machine image takes precedence over the one merged in
+    # from the gpu anchor.
+    machine:
+      image: ubuntu-1604:201903-01
+    steps:
+      - checkout
+      - <<: *setupcuda
+      - run: pip3 install --progress-bar off wheel matplotlib 'pillow<7'
+      - run: pip3 install --progress-bar off torch torchvision
+      # - run: conda create -p ~/conda_env python=3.7 numpy
+      # - run: conda activate ~/conda_env
+      # - run: conda install -c pytorch pytorch torchvision
+
+      - run: pip3 install --progress-bar off 'git+https://github.com/facebookresearch/fvcore'
+      # Append the CUDA 10.2 libraries to any existing library search path.
+      - run: LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-10.2/lib64 python3 setup.py build_ext --inplace
+      - run: LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-10.2/lib64 python -m unittest discover -v -s tests
+      - run: python3 setup.py bdist_wheel
+
+  binary_linux_wheel:
+    <<: *binary_common
+    docker:
+      - image: << parameters.wheel_docker_image >>
+    resource_class: 2xlarge+
+    steps:
+      - checkout
+      - run: packaging/build_wheel.sh
+      - store_artifacts:
+          path: dist
+      - persist_to_workspace:
+          root: dist
+          paths:
+            - "*"
+
+  binary_linux_conda:
+    <<: *binary_common
+    docker:
+      - image: "pytorch/conda-cuda"
+    resource_class: 2xlarge+
+    steps:
+      - checkout
+      # This is building with cuda but no gpu present,
+      # so we aren't running the tests.
+      - run: TEST_FLAG=--no-test packaging/build_conda.sh
+      - store_artifacts:
+          path: /opt/conda/conda-bld/linux-64
+      - persist_to_workspace:
+          root: /opt/conda/conda-bld/linux-64
+          paths:
+            - "*"
+
+  binary_linux_conda_cuda:
+    <<: *binary_common
+    machine:
+      image: ubuntu-1604:201903-01
+    resource_class: gpu.medium
+    steps:
+    - checkout
+    - run:
+        name: Setup environment
+        command: |
+          set -e
+
+          curl -L https://packagecloud.io/circleci/trusty/gpgkey | sudo apt-key add -
+          curl -L https://dl.google.com/linux/linux_signing_key.pub | sudo apt-key add -
+
+          sudo apt-get update
+
+          sudo apt-get install \
+              apt-transport-https \
+              ca-certificates \
+              curl \
+              gnupg-agent \
+              software-properties-common
+
+          curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
+
+          sudo add-apt-repository \
+             "deb [arch=amd64] https://download.docker.com/linux/ubuntu \
+             $(lsb_release -cs) \
+             stable"
+
+          sudo apt-get update
+          export DOCKER_VERSION="5:19.03.2~3-0~ubuntu-xenial"
+          sudo apt-get install docker-ce=${DOCKER_VERSION} docker-ce-cli=${DOCKER_VERSION} containerd.io=1.2.6-3
+
+          # Add the package repositories
+          distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
+          curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
+          curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
+
+          export NVIDIA_CONTAINER_VERSION="1.0.3-1"
+          sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit=${NVIDIA_CONTAINER_VERSION}
+          sudo systemctl restart docker
+
+          DRIVER_FN="NVIDIA-Linux-x86_64-410.104.run"
+          wget "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN"
+          sudo /bin/bash "$DRIVER_FN" -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false)
+          nvidia-smi
+
+    - run:
+        name: Pull docker image
+        command: |
+          set -e
+          export DOCKER_IMAGE=pytorch/conda-cuda
+          echo Pulling docker image $DOCKER_IMAGE
+          docker pull $DOCKER_IMAGE >/dev/null
+
+    - run:
+        name: Build and run tests
+        command: |
+          set -e
+
+          cd ${HOME}/project/
+
+          export DOCKER_IMAGE=pytorch/conda-cuda
+          export VARS_TO_PASS="-e PYTHON_VERSION -e BUILD_VERSION -e PYTORCH_VERSION -e UNICODE_ABI -e CU_VERSION"
+
+          docker run --gpus all  --ipc=host -v $(pwd):/remote -w /remote ${VARS_TO_PASS} ${DOCKER_IMAGE} ./packaging/build_conda.sh
+
+workflows:
+  version: 2
+  build_and_test:
+    jobs:
+      - main
+      - binary_linux_conda:
+          build_version: 0.1.0
+          cu_version: cu92
+          name: binary_linux_conda_py3.6_cu92
+          python_version: '3.6'
+          pytorch_version: '1.4'
+          wheel_docker_image: pytorch/manylinux-cuda92
+      - binary_linux_conda:
+          build_version: 0.1.0
+          cu_version: cu100
+          name: binary_linux_conda_py3.6_cu100
+          python_version: '3.6'
+          pytorch_version: '1.4'
+          wheel_docker_image: pytorch/manylinux-cuda100
+      - binary_linux_conda:
+          build_version: 0.1.0
+          cu_version: cu101
+          name: binary_linux_conda_py3.6_cu101
+          python_version: '3.6'
+          pytorch_version: '1.4'
+      - binary_linux_conda:
+          build_version: 0.1.0
+          cu_version: cu92
+          name: binary_linux_conda_py3.7_cu92
+          python_version: '3.7'
+          pytorch_version: '1.4'
+          wheel_docker_image: pytorch/manylinux-cuda92
+      - binary_linux_conda:
+          build_version: 0.1.0
+          cu_version: cu100
+          name: binary_linux_conda_py3.7_cu100
+          python_version: '3.7'
+          pytorch_version: '1.4'
+          wheel_docker_image: pytorch/manylinux-cuda100
+      - binary_linux_conda:
+          build_version: 0.1.0
+          cu_version: cu101
+          name: binary_linux_conda_py3.7_cu101
+          python_version: '3.7'
+          pytorch_version: '1.4'
+      - binary_linux_conda:
+          build_version: 0.1.0
+          cu_version: cu92
+          name: binary_linux_conda_py3.8_cu92
+          python_version: '3.8'
+          pytorch_version: '1.4'
+          wheel_docker_image: pytorch/manylinux-cuda92
+      - binary_linux_conda:
+          build_version: 0.1.0
+          cu_version: cu100
+          name: binary_linux_conda_py3.8_cu100
+          python_version: '3.8'
+          pytorch_version: '1.4'
+          wheel_docker_image: pytorch/manylinux-cuda100
+      - binary_linux_conda:
+          build_version: 0.1.0
+          cu_version: cu101
+          name: binary_linux_conda_py3.8_cu101
+          python_version: '3.8'
+          pytorch_version: '1.4'
+      - binary_linux_conda:
+          cu_version: cu101
+          name: binary_linux_conda_py3.7_cu101
+          python_version: '3.7'
+      - binary_linux_conda_cuda:
+          name: testrun_conda_cuda_py3.7_cu100
+          python_version: "3.7"
+          pytorch_version: "1.4"
+          cu_version: "cu100"
--- a/.circleci/regenerate.py
+++ b/.circleci/regenerate.py
@ -0,0 +1,119 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+
+"""
+This script is adapted from the torchvision one.
+There is no python2.7 nor macos.
+TODO: python 3.8 when pytorch 1.4.
+"""
+
+import os.path
+import jinja2
+import yaml
+
+
+def workflows(prefix="", filter_branch=None, upload=False, indentation=6):
+    w = []
+    # add "wheel" here for pypi
+    for btype in ["conda"]:
+        for python_version in ["3.6", "3.7", "3.8"]:
+            for cu_version in ["cu92", "cu100", "cu101"]:
+                w += workflow_pair(
+                    btype=btype,
+                    python_version=python_version,
+                    cu_version=cu_version,
+                    prefix=prefix,
+                    upload=upload,
+                    filter_branch=filter_branch,
+                )
+
+    return indent(indentation, w)
+
+
+def workflow_pair(
+    *, btype, python_version, cu_version, prefix="", upload=False, filter_branch
+):
+
+    w = []
+    base_workflow_name = (
+        f"{prefix}binary_linux_{btype}_py{python_version}_{cu_version}"
+    )
+
+    w.append(
+        generate_base_workflow(
+            base_workflow_name=base_workflow_name,
+            python_version=python_version,
+            cu_version=cu_version,
+            btype=btype,
+            filter_branch=filter_branch,
+        )
+    )
+
+    if upload:
+        w.append(
+            generate_upload_workflow(
+                base_workflow_name=base_workflow_name,
+                btype=btype,
+                cu_version=cu_version,
+                filter_branch=filter_branch,
+            )
+        )
+
+    return w
+
+
+def generate_base_workflow(
+    *, base_workflow_name, python_version, cu_version, btype, filter_branch=None
+):
+
+    d = {
+        "name": base_workflow_name,
+        "python_version": python_version,
+        "cu_version": cu_version,
+        "build_version": "0.1.0",
+        "pytorch_version": "1.4",
+    }
+
+    if cu_version == "cu92":
+        d["wheel_docker_image"] = "pytorch/manylinux-cuda92"
+    elif cu_version == "cu100":
+        d["wheel_docker_image"] = "pytorch/manylinux-cuda100"
+
+    if filter_branch is not None:
+        d["filters"] = {"branches": {"only": filter_branch}}
+
+    return {f"binary_linux_{btype}": d}
+
+
+def generate_upload_workflow(
+    *, base_workflow_name, btype, cu_version, filter_branch
+):
+    d = {
+        "name": f"{base_workflow_name}_upload",
+        "context": "org-member",
+        "requires": [base_workflow_name],
+    }
+
+    if btype == "wheel":
+        d["subfolder"] = cu_version + "/"
+
+    if filter_branch is not None:
+        d["filters"] = {"branches": {"only": filter_branch}}
+
+    return {f"binary_{btype}_upload": d}
+
+
+def indent(indentation, data_list):
+    return ("\n" + " " * indentation).join(
+        yaml.dump(data_list, default_flow_style=False).splitlines()
+    )
+
+
+if __name__ == "__main__":
+    d = os.path.dirname(__file__)
+    env = jinja2.Environment(
+        loader=jinja2.FileSystemLoader(d), lstrip_blocks=True, autoescape=False
+    )
+
+    with open(os.path.join(d, "config.yml"), "w") as f:
+        f.write(env.get_template("config.in.yml").render(workflows=workflows))
--- a/.clang-format
+++ b/.clang-format
@ -0,0 +1,85 @@
+AccessModifierOffset: -1
+AlignAfterOpenBracket: AlwaysBreak
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlinesLeft: true
+AlignOperands:   false
+AlignTrailingComments: false
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: Empty
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: true
+AlwaysBreakTemplateDeclarations: true
+BinPackArguments: false
+BinPackParameters: false
+BraceWrapping:
+  AfterClass:      false
+  AfterControlStatement: false
+  AfterEnum:       false
+  AfterFunction:   false
+  AfterNamespace:  false
+  AfterObjCDeclaration: false
+  AfterStruct:     false
+  AfterUnion:      false
+  BeforeCatch:     false
+  BeforeElse:      false
+  IndentBraces:    false
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Attach
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: false
+ColumnLimit:     80
+CommentPragmas:  '^ IWYU pragma:'
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+DisableFormat:   false
+ForEachMacros:   [ FOR_EACH, FOR_EACH_ENUMERATE, FOR_EACH_KV, FOR_EACH_R, FOR_EACH_RANGE, ]
+IncludeCategories:
+  - Regex:           '^<.*\.h(pp)?>'
+    Priority:        1
+  - Regex:           '^<.*'
+    Priority:        2
+  - Regex:           '.*'
+    Priority:        3
+IndentCaseLabels: true
+IndentWidth:     2
+IndentWrappedFunctionNames: false
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: false
+PenaltyBreakBeforeFirstCallParameter: 1
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 200
+PointerAlignment: Left
+ReflowComments:  true
+SortIncludes:    true
+SpaceAfterCStyleCast: false
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeParens: ControlStatements
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles:  false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard:        Cpp11
+TabWidth:        8
+UseTab:          Never
--- a/.flake8
+++ b/.flake8
@ -0,0 +1,6 @@
+[flake8]
+ignore = E203, E266, E501, W503, E221
+max-line-length = 80
+max-complexity = 18
+select = B,C,E,F,W,T4,B9
+exclude = build,__init__.py
--- a/.github/CODE_OF_CONDUCT.md
+++ b/.github/CODE_OF_CONDUCT.md
@ -0,0 +1,76 @@
+# Code of Conduct
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as
+contributors and maintainers pledge to make participation in our project and
+our community a harassment-free experience for everyone, regardless of age, body
+size, disability, ethnicity, sex characteristics, gender identity and expression,
+level of experience, education, socio-economic status, nationality, personal
+appearance, race, religion, or sexual identity and orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment
+include:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+* The use of sexualized language or imagery and unwelcome sexual attention or
+  advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic
+  address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies within all project spaces, and it also applies when
+an individual is representing the project or its community in public spaces.
+Examples of representing a project or community include using an official
+project e-mail address, posting via an official social media account, or acting
+as an appointed representative at an online or offline event. Representation of
+a project may be further defined and clarified by project maintainers.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team at <opensource-conduct@fb.com>. All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an incident.
+Further details of specific enforcement policies may be posted separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see
+https://www.contributor-covenant.org/faq
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@ -0,0 +1,54 @@
+# Contributing to PyTorch3D
+We want to make contributing to this project as easy and transparent as
+possible.
+
+## Pull Requests
+We actively welcome your pull requests.
+
+However, if you're adding any significant features, please make sure to have a corresponding issue to outline your proposal and motivation and allow time for us to give feedback, *before* you send a PR.
+We do not always accept new features, and we take the following factors into consideration:
+
+- Whether the same feature can be achieved without modifying PyTorch3d directly. If any aspect of the API is not extensible, please highlight this in an issue so we can work on making this more extensible.
+- Whether the feature is potentially useful to a large audience, or only to a small portion of users.
+- Whether the proposed solution has a good design and interface.
+- Whether the proposed solution adds extra mental/practical overhead to users who don't need such a feature.
+- Whether the proposed solution breaks existing APIs.
+
+When sending a PR, please ensure you complete the following steps:
+
+1. Fork the repo and create your branch from `master`. Follow the instructions
+   in [INSTALL.md](../INSTALL.md) to build the repo.
+2. If you've added code that should be tested, add tests.
+3. If you've changed any APIs, please update the documentation.
+4. Ensure the test suite passes:
+    ```
+    cd pytorch3d/tests
+    python -m unittest -v
+    ```
+5. Make sure your code lints by running `dev/linter.sh` from the project root.
+6. If a PR contains multiple orthogonal changes, split it into multiple separate PRs.
+7. If you haven't already, complete the Contributor License Agreement ("CLA").
+
+## Contributor License Agreement ("CLA")
+In order to accept your pull request, we need you to submit a CLA. You only need
+to do this once to work on any of Facebook's open source projects.
+
+Complete your CLA here: <https://code.facebook.com/cla>
+
+## Issues
+We use GitHub issues to track public bugs. Please ensure your description is
+clear and has sufficient instructions to be able to reproduce the issue.
+
+Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
+disclosure of security bugs. In those cases, please go through the process
+outlined on that page and do not file a public issue.
+
+## Coding Style  
+We follow these [python](http://google.github.io/styleguide/pyguide.html) and [C++](https://google.github.io/styleguide/cppguide.html) style guides.
+
+For the linter to work, you will need to install `black`, `flake8`, `isort` and `clang-format`, and
+they need to be fairly up to date.
+
+## License
+By contributing to PyTorch3D, you agree that your contributions will be licensed
+under the LICENSE file in the root directory of this source tree.
--- a/.github/ISSUE_TEMPLATE/bugs.md
+++ b/.github/ISSUE_TEMPLATE/bugs.md
@ -0,0 +1,28 @@
+---
+name: "🐛 Bugs / Unexpected behaviors"
+about: Please report unexpected behaviors or bugs in PyTorch3d.
+
+---
+
+If you do not know the root cause of the problem / bug, and wish someone to help you, please
+post according to this template:
+
+## 🐛 Bugs / Unexpected behaviors
+<!-- A clear and concise description of the issue -->
+
+## Instructions To Reproduce the Issue:
+
+Please include the following (depending on what the issue is):
+
+1. Any changes you made (`git diff`) or code you wrote
+```
+<put diff or code here>
+```
+2. The exact command(s) you ran:
+3. What you observed (including the full logs):
+```
+<put logs here>
+```
+
+Please also simplify the steps as much as possible so they do not require additional resources to
+run, such as a private dataset.
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@ -0,0 +1 @@
+blank_issues_enabled: false
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@ -0,0 +1,19 @@
+---
+name: "\U0001F680 Feature Request"
+about: Submit a proposal/request for a new PyTorch3d feature
+
+---
+
+## 🚀 Feature
+<!-- A clear and concise description of the feature proposal -->
+
+## Motivation
+
+<!-- Please outline the motivation for the proposal.
+e.g. It would be great if I could do [...], I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too -->
+
+## Pitch
+
+<!-- A clear and concise description, optionally with code examples showing the functionality you want. -->
+
+NOTE: we only consider adding new features if they are useful for many users.
--- a/.github/ISSUE_TEMPLATE/questions-help.md
+++ b/.github/ISSUE_TEMPLATE/questions-help.md
@ -0,0 +1,18 @@
+---
+name: "❓ Questions"
+about: How do I do X with PyTorch3d? How does PyTorch3d do X?
+
+---
+
+## ❓ Questions on how to use PyTorch3d
+
+<!-- A clear and concise description of the question you need help with. -->
+
+NOTE:
+
+1. If you encountered any errors or unexpected issues while using PyTorch3d and need help resolving them,
+   please use the "Bugs / Unexpected behaviors" issue template.
+
+2. We do not answer general machine learning / computer vision questions that are not specific to
+   PyTorch3d, such as how a model works or what algorithm/methods can be
+   used to achieve X.
--- a/.github/bundle_adjust.gif
+++ b/.github/bundle_adjust.gif
--- a/.github/camera_position_teapot.gif
+++ b/.github/camera_position_teapot.gif
--- a/.github/dolphin_deform.gif
+++ b/.github/dolphin_deform.gif
--- a/.github/pytorch3dlogo.png
+++ b/.github/pytorch3dlogo.png
--- a/.github/render_textured_mesh.gif
+++ b/.github/render_textured_mesh.gif
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,4 @@
+build/
+dist/
+*.egg-info/
+**/__pycache__/
--- a/INSTALL.md
+++ b/INSTALL.md
@ -0,0 +1,74 @@
+# Installation
+
+
+## Requirements
+
+### Core library
+
+The core library is written in PyTorch. Several components have underlying implementations in CUDA for improved performance. A subset of these components have CPU implementations in C++/PyTorch. It is advised to use PyTorch3d with GPU support in order to use all the features.
+
+- Linux or macOS
+- Python ≥ 3.6
+- PyTorch 1.4
+- torchvision that matches the PyTorch installation. You can install them together at pytorch.org to make sure of this.
+- gcc & g++ ≥ 4.9
+- CUDA 9.2 or 10.0 or 10.1
+- [fvcore](https://github.com/facebookresearch/fvcore)
+
+These can be installed by running:
+```
+conda create -n pytorch3d python=3.6
+conda activate pytorch3d
+conda install -c pytorch pytorch torchvision cudatoolkit=10.0
+conda install -c conda-forge -c takatosp1 fvcore
+```
+
+### Tests/Linting and Demos
+
+For developing on top of PyTorch3d or contributing, you will need to run the linter and tests. If you want to run any of the notebook tutorials in `docs/tutorials` you will also need matplotlib.
+- scikit-image
+- black
+- isort
+- flake8
+- matplotlib
+- tqdm
+- jupyter
+- imageio
+
+These can be installed by running:
+```
+# Demos
+conda install jupyter
+pip install scikit-image matplotlib imageio
+
+# Tests/Linting
+pip install black isort flake8
+```
+
+## Build/Install PyTorch3d
+After installing the above dependencies, run one of the following commands:
+
+### 1. Install from Anaconda Cloud
+
+```
+# Anaconda Cloud
+conda install pytorch3d
+```
+
+### 2. Install from GitHub
+```
+pip install 'git+https://github.com/facebookresearch/pytorch3d.git'
+# (add --user if you don't have permission)
+```
+
+### 3. Install from a local clone
+```
+git clone https://github.com/facebookresearch/pytorch3d.git
+cd pytorch3d && pip install -e .
+```
+To rebuild after installing from a local clone, run `rm -rf build/ **/*.so` then `pip install -e .`. You often need to rebuild pytorch3d after reinstalling PyTorch.
+
+**Install from local clone on macOS:**
+```
+MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ pip install -e .
+```
--- a/LICENSE
+++ b/LICENSE
@ -0,0 +1,30 @@
+BSD License
+
+For PyTorch3d software
+
+Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+ * Neither the name Facebook nor the names of its contributors may be used to
+   endorse or promote products derived from this software without specific
+   prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/README.md
+++ b/README.md
@ -0,0 +1,77 @@
+<img src="https://github.com/facebookresearch/pytorch3d/blob/master/.github/pytorch3dlogo.png" width="900"/>
+
+[![CircleCI](https://circleci.com/gh/facebookresearch/pytorch3d.svg?style=svg)](https://circleci.com/gh/facebookresearch/pytorch3d)
+[![Anaconda-Server Badge](https://anaconda.org/pytorch3d/pytorch3d/badges/version.svg)](https://anaconda.org/pytorch3d/pytorch3d)
+
+# Introduction
+
+PyTorch3d provides efficient, reusable components for 3D Computer Vision research with [PyTorch](https://pytorch.org).
+
+Key features include:
+
+- Data structure for storing and manipulating triangle meshes
+- Efficient operations on triangle meshes (projective transformations, graph convolution, sampling, loss functions)
+- A differentiable mesh renderer
+
+PyTorch3d is designed to integrate smoothly with deep learning methods for predicting and manipulating 3D data.
+For this reason, all operators in PyTorch3d:
+
+- Are implemented using PyTorch tensors
+- Can handle minibatches of heterogeneous data
+- Can be differentiated
+- Can utilize GPUs for acceleration
+
+Within FAIR, PyTorch3d has been used to power research projects such as [Mesh R-CNN](https://arxiv.org/abs/1906.02739).
+
+## Installation
+
+For detailed instructions refer to [INSTALL.md](INSTALL.md).
+
+## License
+
+PyTorch3d is released under the [BSD-3-Clause License](LICENSE).
+
+## Tutorials
+
+Get started with PyTorch3d by trying one of the tutorial notebooks.
+
+|<img src="https://github.com/facebookresearch/pytorch3d/blob/master/.github/dolphin_deform.gif" width="310"/>|<img src="https://github.com/facebookresearch/pytorch3d/blob/master/.github/bundle_adjust.gif" width="310"/>|
+|:-----------------------------------------------------------------------------------------------------------:|:--------------------------------------------------:|
+| [Deform a sphere mesh to dolphin](https://github.com/facebookresearch/pytorch3d/blob/master/docs/tutorials/deform_source_mesh_to_target_mesh.ipynb)| [Bundle adjustment](https://github.com/facebookresearch/pytorch3d/blob/master/docs/tutorials/bundle_adjustment.ipynb) |
+
+| <img src="https://github.com/facebookresearch/pytorch3d/blob/master/.github/render_textured_mesh.gif" width="310"/> | <img src="https://github.com/facebookresearch/pytorch3d/blob/master/.github/camera_position_teapot.gif" width="310" height="310"/> |
+|:------------------------------------------------------------:|:--------------------------------------------------:|
+| [Render textured meshes](https://github.com/facebookresearch/pytorch3d/blob/master/docs/tutorials/render_textured_meshes.ipynb)| [Camera position optimization](https://github.com/facebookresearch/pytorch3d/blob/master/docs/tutorials/camera_position_optimization_with_differentiable_rendering.ipynb)|
+
+## Documentation
+
+Learn more about the API by reading the PyTorch3d [documentation](https://pytorch3d.readthedocs.org/).
+
+We also have deep dive notes on several API components:
+
+- [Heterogeneous Batching](https://github.com/facebookresearch/pytorch3d/tree/master/docs/notes/batching.md)
+- [Mesh IO](https://github.com/facebookresearch/pytorch3d/tree/master/docs/notes/meshes_io.md)
+- [Differentiable Rendering](https://github.com/facebookresearch/pytorch3d/tree/master/docs/notes/renderer_getting_started.md)
+
+## Development
+
+We welcome new contributions to PyTorch3d and we will be actively maintaining this library! Please refer to [CONTRIBUTING.md](./.github/CONTRIBUTING.md) for full instructions on how to run the code, tests and linter, and submit your pull requests.
+
+
+## Contributors
+
+PyTorch3d is written and maintained by the Facebook AI Research Computer Vision Team.
+
+## Citation
+
+If you find PyTorch3d useful in your research, please cite:
+
+```bibtex
+@misc{ravi2020pytorch3d,
+  author =       {Nikhila Ravi and Jeremy Reizenstein and David Novotny and Taylor Gordon
+                  and Wan-Yen Lo and Justin Johnson and Georgia Gkioxari},
+  title =        {PyTorch3D},
+  howpublished = {\url{https://github.com/facebookresearch/pytorch3d}},
+  year =         {2020}
+}
+```
--- a/dev/linter.sh
+++ b/dev/linter.sh
@ -0,0 +1,30 @@
+#!/bin/bash -e
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+
+# Run this script at project root by "./dev/linter.sh" before you commit
+
+# Check the installed black version first; if any command in this group fails,
+# the fallback group below prints an error and aborts the script.
+{
+  # The third space-separated field of `black --version` is taken as the version.
+  V=$(black --version|cut '-d ' -f3)
+  # Python assertion that "19.3" sorts before the detected version.
+  code='import distutils.version; assert "19.3" < distutils.version.LooseVersion("'$V'")'
+  # Errors from the check are hidden; only the exit status matters here.
+  python -c "${code}" 2> /dev/null
+} || {
+  echo "Linter requires black 19.3b0 or higher!"
+  exit 1
+}
+
+# Absolute path of the directory holding this script, then its parent directory,
+# which is what every tool below is pointed at.
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+DIR="${DIR}/.."
+
+echo "Running isort..."
+# -y applies the changes without prompting; -sp sets the settings path to DIR.
+isort -y -sp "${DIR}"
+
+echo "Running black..."
+# Format with a line length of 80.
+black -l 80 "${DIR}"
+
+echo "Running flake..."
+flake8 "${DIR}"
+
+echo "Running clang-format ..."
+# Reformat in place every file under DIR whose extension is in the list below;
+# -print0 / -0 keep paths containing whitespace intact.
+find "${DIR}" -regex ".*\.\(cpp\|c\|cc\|cu\|cuh\|cxx\|h\|hh\|hpp\|hxx\|tcc\|mm\|m\)" -print0 | xargs -0 clang-format -i
+
+# Run `arc lint` from DIR only when the arc command exists; `|| true` keeps a
+# missing arc or a failing lint from changing the script's exit status.
+(cd "${DIR}"; command -v arc > /dev/null && arc lint) || true
--- a/docs/.gitignore
+++ b/docs/.gitignore
@ -0,0 +1,7 @@
+source
+_build
+_static
+_template
+*-checkpoint.ipynb
+.ipynb_checkpoints
+.ipynb_checkpoints/**
--- a/docs/Makefile
+++ b/docs/Makefile
@ -0,0 +1,21 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# Minimal makefile for Sphinx documentation
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+# Ask sphinx-build, in make mode (-M), for its own help output.
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+# Neither name is a file to be produced. Listing Makefile here also keeps the
+# catch-all pattern rule below from being applied to the Makefile itself.
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+# $@ expands to the requested target name (e.g. "html" for `make html`).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--- a/docs/README.md
+++ b/docs/README.md
@ -0,0 +1,78 @@
+
+## Setup
+
+### Install dependencies
+
+```
+pip install -U recommonmark mock sphinx sphinx_rtd_theme sphinx_markdown_tables
+```
+
+### Add symlink to the root README.md
+
+We want to include the root readme as an overview. Before generating the docs create a symlink to the root readme.
+
+```
+cd docs        
+ln -s ../README.md  overview.md
+```
+
+In `conf.py` for deployment this is done using `subprocess.call`.
+
+### Add a new file
+
+Add a new `.md` or `.rst` file and add the name to the doc tree in `index.rst` e.g
+
+```
+.. toctree::
+   :maxdepth: 1
+   :caption: Intro Documentation
+
+   overview
+```
+
+To autogenerate docs from docstrings in the source code, add the import path for the function e.g.
+
+```
+Chamfer Loss
+--------------------
+
+.. autoclass:: loss.chamfer.chamfer_distance
+    :members:
+    :undoc-members:
+
+    .. automethod:: __init__
+
+```
+
+### Build
+
+From `pytorch3d/docs` run:
+
+```
+> make html
+```
+
+The website is generated in `_build/html`.
+
+### Common Issues
+
+Sphinx can be fussy, and sometimes about things you weren’t expecting. For example, you might encounter something like:
+
+```
+WARNING: toctree contains reference to nonexisting document u'overview'
+...
+checking consistency...
+<pytorch3d>/docs/overview.rst::
+WARNING: document isn't included in any toctree
+```
+
+You might have indented overview in the .. toctree:: in index.rst with four spaces, when Sphinx is expecting three.
+
+
+### View
+
+Start a python simple server:
+
+```
+> python -m http.server
+```
+
+Navigate to: `http://0.0.0.0:8000/`
--- a/docs/conf.py
+++ b/docs/conf.py
@ -0,0 +1,200 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# flake8: noqa
+
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+import os
+import sys
+
+import pytorch3d  # isort: skip
+
+import mock
+from recommonmark.parser import CommonMarkParser
+from recommonmark.states import DummyStateMachine
+from sphinx.builders.html import StandaloneHTMLBuilder
+from sphinx.ext.autodoc import between
+
+# Monkey patch to fix recommonmark 0.4 doc reference issues.
+orig_run_role = DummyStateMachine.run_role
+
+
+def run_role(self, name, options=None, content=None):
+    if name == "doc":
+        name = "any"
+    return orig_run_role(self, name, options, content)
+
+
+DummyStateMachine.run_role = run_role
+
+
+StandaloneHTMLBuilder.supported_image_types = [
+    "image/svg+xml",
+    "image/gif",
+    "image/png",
+    "image/jpeg",
+]
+
+# -- Path setup --------------------------------------------------------------
+
+
+sys.path.insert(0, os.path.abspath("../"))
+sys.path.insert(0, os.path.abspath("../pytorch3d"))
+sys.path.insert(0, os.path.abspath("../../"))
+
+DEPLOY = os.environ.get("READTHEDOCS") == "True"
+needs_sphinx = "1.7"
+
+
+# The short X.Y version
+version = pytorch3d.__version__
+# The full version, including alpha/beta/rc tags
+release = version
+
+try:
+    import torch  # noqa
+except ImportError:
+    for m in [
+        "torch",
+        "torchvision",
+        "torch.nn",
+        "torch.autograd",
+        "torch.autograd.function",
+        "torch.nn.modules",
+        "torch.nn.modules.utils",
+        "torch.utils",
+        "torch.utils.data",
+        "torchvision",
+        "torchvision.ops",
+    ]:
+        sys.modules[m] = mock.Mock(name=m)
+
+for m in ["cv2", "scipy", "numpy", "pytorch3d._C", "np.eye", "np.zeros"]:
+    sys.modules[m] = mock.Mock(name=m)
+
+# -- Project information -----------------------------------------------------
+
+project = "PyTorch3D"
+copyright = "2019, facebookresearch"
+author = "facebookresearch"
+
+# The full version, including alpha/beta/rc tags
+release = "v0.1"
+
+# -- General configuration ---------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+
+extensions = [
+    "sphinx_markdown_tables",
+    "sphinx.ext.autodoc",
+    "sphinx.ext.mathjax",
+    "sphinx.ext.napoleon",
+    "sphinx.ext.intersphinx",
+    "sphinx.ext.todo",
+    "sphinx.ext.coverage",
+    "sphinx.ext.viewcode",
+    "sphinx.ext.githubpages",
+]
+
+# -- Configurations for plugins ------------
+napoleon_google_docstring = True
+napoleon_include_init_with_doc = True
+napoleon_include_special_with_doc = True
+napoleon_numpy_docstring = False
+# napoleon_use_param = False
+napoleon_use_rtype = False
+autodoc_inherit_docstrings = False
+autodoc_member_order = "bysource"
+
+source_parsers = {".md": CommonMarkParser}
+
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+source_suffix = [".rst", ".md"]
+
+# The master toctree document.
+master_doc = "index"
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ["_templates"]
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "build", "README.md"]
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = "sphinx"
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+html_theme = "sphinx_rtd_theme"
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ["_static"]
+
+html_theme_options = {"collapse_navigation": True}
+
+
+def url_resolver(url):
+    if ".html" not in url:
+        url = url.replace("../", "")
+        return (
+            "https://github.com/facebookresearch/pytorch3d/blob/master/" + url
+        )
+    else:
+        if DEPLOY:
+            return "http://pytorch3d.readthedocs.io/" + url
+        else:
+            return "/" + url
+
+
+def setup(app):
+    # Add symlink to root README
+    if DEPLOY:
+        import subprocess
+
+        subprocess.call(["ln", "-s", "../README.md", "overview.md"])
+
+    from recommonmark.transform import AutoStructify
+
+    app.add_config_value(
+        "recommonmark_config",
+        {
+            "url_resolver": url_resolver,
+            "auto_toc_tree_section": "Contents",
+            "enable_math": True,
+            "enable_inline_math": True,
+            "enable_eval_rst": True,
+            "enable_auto_toc_tree": True,
+        },
+        True,
+    )
+
+    # Register a sphinx.ext.autodoc.between listener to ignore everything
+    # between lines that contain the word IGNORE
+    app.connect(
+        "autodoc-process-docstring", between("^.*IGNORE.*$", exclude=True)
+    )
+    app.add_transform(AutoStructify)
+
+    return app
--- a/docs/figs/architecture_overview.png
+++ b/docs/figs/architecture_overview.png
--- a/docs/figs/batch_intro.png
+++ b/docs/figs/batch_intro.png
--- a/docs/figs/batch_modes.gif
+++ b/docs/figs/batch_modes.gif
--- a/docs/figs/fullset_batch_size_16.png
+++ b/docs/figs/fullset_batch_size_16.png
--- a/docs/figs/meshrcnn.png
+++ b/docs/figs/meshrcnn.png
--- a/docs/figs/opengl_coordframes.png
+++ b/docs/figs/opengl_coordframes.png
--- a/docs/figs/p3d_naive_vs_coarse.png
+++ b/docs/figs/p3d_naive_vs_coarse.png
--- a/docs/figs/p3d_vs_softras.png
+++ b/docs/figs/p3d_vs_softras.png
--- a/docs/figs/transformations_overview.png
+++ b/docs/figs/transformations_overview.png
--- a/docs/index.rst
+++ b/docs/index.rst
@ -0,0 +1,17 @@
+Welcome to PyTorch3D's documentation!
+=====================================
+
+PyTorch3D is a library of reusable components for Deep Learning with 3D data.
+
+Table of Contents
+=================
+
+.. toctree::
+   :maxdepth: 2
+
+   overview
+
+.. toctree::
+   :maxdepth: 2
+
+   modules/index
--- a/docs/modules/index.rst
+++ b/docs/modules/index.rst
@ -0,0 +1,13 @@
+API Documentation
+==================
+
+.. toctree::
+
+    structures
+    io
+    loss
+    ops
+    renderer/index
+    transforms
+    utils 
+    
--- a/docs/modules/io.rst
+++ b/docs/modules/io.rst
@ -0,0 +1,7 @@
+pytorch3d.io 
+===========================
+
+.. automodule:: pytorch3d.io
+    :members:
+    :undoc-members:
+    :show-inheritance:
--- a/docs/modules/loss.rst
+++ b/docs/modules/loss.rst
@ -0,0 +1,9 @@
+pytorch3d.loss
+====================
+
+Loss functions for meshes and point clouds.
+
+.. automodule:: pytorch3d.loss
+    :members:
+    :undoc-members:
+    :show-inheritance:
--- a/docs/modules/ops.rst
+++ b/docs/modules/ops.rst
@ -0,0 +1,6 @@
+pytorch3d.ops 
+===========================
+
+.. automodule:: pytorch3d.ops
+    :members:
+    :undoc-members:
--- a/docs/modules/renderer/blending.rst
+++ b/docs/modules/renderer/blending.rst
@ -0,0 +1,7 @@
+blending
+===========================
+
+.. automodule:: pytorch3d.renderer.blending
+    :members:
+    :undoc-members:
+    :show-inheritance:
--- a/docs/modules/renderer/cameras.rst
+++ b/docs/modules/renderer/cameras.rst
@ -0,0 +1,7 @@
+cameras
+===========================
+
+.. automodule:: pytorch3d.renderer.cameras
+    :members:
+    :undoc-members:
+    :show-inheritance:
--- a/docs/modules/renderer/index.rst
+++ b/docs/modules/renderer/index.rst
@ -0,0 +1,15 @@
+pytorch3d.renderer 
+===========================
+
+.. toctree::
+
+    rasterizer
+    cameras
+    lighting
+    materials
+    texturing
+    blending
+    shading
+    shader
+    renderer
+    utils
--- a/docs/modules/renderer/lighting.rst
+++ b/docs/modules/renderer/lighting.rst
@ -0,0 +1,6 @@
+lighting
+===========================
+
+.. automodule:: pytorch3d.renderer.lighting
+    :members:
+    :undoc-members:
--- a/docs/modules/renderer/materials.rst
+++ b/docs/modules/renderer/materials.rst
@ -0,0 +1,7 @@
+materials
+===========================
+
+.. automodule:: pytorch3d.renderer.materials
+    :members:
+    :undoc-members:
+    :show-inheritance:
--- a/docs/modules/renderer/rasterizer.rst
+++ b/docs/modules/renderer/rasterizer.rst
@ -0,0 +1,10 @@
+rasterizer
+===========================
+
+.. automodule:: pytorch3d.renderer.mesh.rasterize_meshes
+    :members:
+    :undoc-members:
+    
+.. automodule:: pytorch3d.renderer.mesh.rasterizer
+    :members:
+    :undoc-members:
--- a/docs/modules/renderer/shader.rst
+++ b/docs/modules/renderer/shader.rst
@ -0,0 +1,7 @@
+shader
+===========================
+
+.. automodule:: pytorch3d.renderer.mesh.shader
+    :members:
+    :undoc-members:
+    
--- a/docs/modules/renderer/shading.rst
+++ b/docs/modules/renderer/shading.rst
@ -0,0 +1,7 @@
+shading
+===========================
+
+.. automodule:: pytorch3d.renderer.mesh.shading
+    :members:
+    :undoc-members:
+    
--- a/docs/modules/renderer/texturing.rst
+++ b/docs/modules/renderer/texturing.rst
@ -0,0 +1,7 @@
+texturing
+===========================
+
+.. automodule:: pytorch3d.renderer.mesh.texturing
+    :members:
+    :undoc-members:
+    
--- a/docs/modules/renderer/utils.rst
+++ b/docs/modules/renderer/utils.rst
@ -0,0 +1,6 @@
+utils
+===========================
+
+.. automodule:: pytorch3d.renderer.utils
+    :members:
+    :undoc-members:
--- a/docs/modules/structures.rst
+++ b/docs/modules/structures.rst
@ -0,0 +1,8 @@
+pytorch3d.structures 
+====================
+
+.. automodule:: pytorch3d.structures
+    :members:
+    :undoc-members:
+
+
--- a/docs/modules/transforms.rst
+++ b/docs/modules/transforms.rst
@ -0,0 +1,7 @@
+pytorch3d.transforms 
+===========================
+
+.. automodule:: pytorch3d.transforms
+    :members:
+    :undoc-members:
+    :show-inheritance:
--- a/docs/modules/utils.rst
+++ b/docs/modules/utils.rst
@ -0,0 +1,7 @@
+pytorch3d.utils
+====================
+
+.. automodule:: pytorch3d.utils
+    :members:
+    :undoc-members:
+    :show-inheritance:
--- a/docs/notes/batching.md
+++ b/docs/notes/batching.md
@ -0,0 +1,27 @@
+# Batching
+
+In deep learning, every optimization step operates on multiple input examples for robust training. Thus, efficient batching is crucial. For image inputs, batching is straightforward; N images are resized to the same height and width and stacked as a 4 dimensional tensor of shape `N x 3 x H x W`. For meshes, batching is less straightforward.
+
+<img src="../figs/batch_intro.png" alt="batch_intro" align="middle"/>
+
+## Batch modes for meshes
+
+Assume you want to construct a batch containing two meshes, with `mesh1 = (v1: V1 x 3, f1: F1 x 3)` containing `V1` vertices and `F1` faces, and `mesh2 = (v2: V2 x 3, f2: F2 x 3)` with `V2 (!= V1)` vertices and `F2 (!= F1)` faces. The [Meshes][meshes] data structure provides three different ways to batch *heterogeneous* meshes. If `meshes = Meshes(verts = [v1, v2], faces = [f1, f2])` is an instantiation of the data structure, then
+
+* List: Returns the examples in the batch as a list of tensors. Specifically, `meshes.verts_list()` returns the list of vertices `[v1, v2]`. Similarly, `meshes.faces_list()` returns the list of faces `[f1, f2]`.
+* Padded: The padded representation constructs a tensor by padding the extra values. Specifically, `meshes.verts_padded()` returns a tensor of shape `2 x max(V1, V2) x 3` and pads the extra vertices with `0`s. Similarly, `meshes.faces_padded()` returns a tensor of shape `2 x max(F1, F2) x 3` and pads the extra faces with `-1`s.
+* Packed: The packed representation concatenates the examples in the batch into a tensor. In particular, `meshes.verts_packed()` returns a tensor of shape `(V1 + V2) x 3`. Similarly, `meshes.faces_packed()` returns a tensor of shape `(F1 + F2) x 3` for the faces. In the packed mode, auxiliary variables are computed that enable efficient conversion between packed and padded or list modes.
+
+<img src="../figs/batch_modes.gif" alt="batch_modes" height="450" align="middle" />
+
+## Use cases for batch modes
+
+The need for different mesh batch modes is inherent to the way pytorch operators are implemented. To fully utilize the optimized pytorch ops, the [Meshes][meshes] data structure allows for efficient conversion between the different batch modes. This is crucial when aiming for a fast and efficient training cycle. An example of this is [Mesh R-CNN][meshrcnn]. Here, in the same forward pass different parts of the network assume different inputs, which are computed by converting between the different batch modes. In particular, [vert_align][vert_align] assumes a *padded* input tensor while immediately after [graph_conv][graphconv] assumes a *packed* input tensor.
+
+<img src="../figs/meshrcnn.png" alt="meshrcnn" width="700" align="middle" />
+
+
+[meshes]: https://github.com/facebookresearch/pytorch3d/blob/master/pytorch3d/structures/meshes.py
+[graphconv]: https://github.com/facebookresearch/pytorch3d/blob/master/pytorch3d/ops/graph_conv.py
+[vert_align]: https://github.com/facebookresearch/pytorch3d/blob/master/pytorch3d/ops/vert_align.py
+[meshrcnn]: https://github.com/facebookresearch/meshrcnn
--- a/docs/notes/meshes_io.md
+++ b/docs/notes/meshes_io.md
@ -0,0 +1,67 @@
+# Meshes and IO
+
+The Meshes object represents a batch of triangulated meshes, and is central to
+much of the functionality of pytorch3d. There is no insistence that each mesh in
+the batch has the same number of vertices or faces. When available, it can store
+other data which pertains to the mesh, for example face normals, face areas
+and textures.
+
+Two common file formats for storing single meshes are ".obj" and ".ply" files,
+and pytorch3d has functions for reading these.
+
+## OBJ
+
+Obj files have a standard way to store extra information about a mesh. Given an
+obj file, it can be read with
+
+```
+  verts, faces, aux = load_obj(filename)
+```
+
+which sets `verts` to be a (V,3)-tensor of vertices and `faces.verts_idx` to be
+an (F,3)-tensor of the vertex-indices of each of the corners of the faces.
+Faces which are not triangles will be split into triangles. `aux` is an object
+which may contain normals, uv coordinates, material colors and textures if they
+are present, and `faces` may additionally contain indices into these normals,
+textures and materials in its NamedTuple structure. A Meshes object containing a
+single mesh can be created from just the vertices and faces using
+```
+    meshes = Meshes(verts=[verts], faces=[faces.verts_idx])
+```
+
+If there is texture information in the `.obj` it can be used to initialize a
+`Textures` class which is passed into the `Meshes` constructor.  Currently we
+support loading of texture maps for meshes which have one texture map for the
+entire mesh e.g.
+
+```
+verts_uvs = aux.verts_uvs[None, ...]  # (1, V, 2)
+faces_uvs = faces.textures_idx[None, ...]  # (1, F, 3)
+tex_maps = aux.texture_images
+
+# tex_maps is a dictionary of {material name: texture image}.
+# Take the first image:
+texture_image = list(tex_maps.values())[0]
+texture_image = texture_image[None, ...]  # (1, H, W, 3)
+
+# Create a textures object
+tex = Textures(verts_uvs=verts_uvs, faces_uvs=faces_uvs, maps=texture_image)
+
+# Initialise the mesh with textures
+meshes = Meshes(verts=[verts], faces=[faces.verts_idx], textures=tex)
+```
+## PLY
+
+Ply files are flexible in the way they store additional information; pytorch3d
+provides a function just to read the vertices and faces from a ply file.
+The call
+```
+    verts, faces = load_ply(filename)
+```
+sets `verts` to be a (V,3)-tensor of vertices and `faces` to be an
+(F,3)-tensor of the vertex-indices of each of the corners of the faces. Faces which
+are not triangles will be split into triangles. A Meshes object containing a
+single mesh can be created from this data using
+```
+    meshes = Meshes(verts=[verts], faces=[faces])
+```
--- a/docs/notes/renderer.md
+++ b/docs/notes/renderer.md
@ -0,0 +1,115 @@
+# Differentiable Rendering
+
+Differentiable rendering is a relatively new and exciting research area in computer vision, bridging the gap between 2D and 3D by allowing 2D image pixels to be related back to 3D properties of a scene.
+
+For example, by rendering an image from a 3D shape predicted by a neural network, it is possible to compute a 2D loss with a reference image. Inverting the rendering step means we can relate the 2D loss from the pixels back to the 3D properties of the shape such as the positions of mesh vertices, enabling 3D shapes to be learnt without any explicit 3D supervision.
+
+We extensively researched existing codebases for differentiable rendering and found that:
+- the rendering pipeline is complex with more than 7 separate components which need to interoperate and be differentiable
+- popular existing approaches [[1](#1), [2](#2)] are based on the same core implementation which bundles many of the key components into large CUDA kernels which require significant expertise to understand, and has limited scope for extensions  
+- existing methods either do not support batching or assume that meshes in a batch have the same number of vertices and faces
+- existing projects only provide CUDA implementations so they cannot be used without GPUs
+
+In order to experiment with different approaches, we wanted a modular implementation that is easy to use and extend, and supports [heterogeneous batching](batching.md).
+
+Taking inspiration from existing work [[1](#1), [2](#2)], we have created a new, modular, differentiable renderer with **parallel implementations in PyTorch, C++ and CUDA**, as well as comprehensive documentation and tests, with the aim of helping to further research in this field.
+
+Our implementation decouples the rasterization and shading steps of rendering. The core rasterization step (based on [[2]](#2)) returns several intermediate variables and has an optimized implementation in CUDA. The rest of the pipeline is implemented purely in PyTorch, and is designed to be customized and extended. With this approach, the PyTorch3d differentiable renderer can be imported as a library.
+
+## <u>Get started</u>
+
+To learn more about the implementation and start using the renderer refer to [renderer_getting_started.md](renderer_getting_started.md), which also contains the [architecture overview](../figs/architecture_overview.png) and [coordinate transformation conventions](../figs/transformations_overview.png).
+
+
+## <u>Key features</u>
+
+### 1. CUDA support for fast rasterization of large meshes
+
+We implemented modular CUDA kernels for the forward and backward pass of rasterization, adapting a traditional graphics approach known as "coarse-to-fine" rasterization.
+
+First, the image is divided into a coarse grid and mesh faces are allocated to the grid cell in which they occur. This is followed by a refinement step which does pixel wise rasterization of the reduced subset of faces per grid cell. The grid cell size is a parameter which can be varied (`bin_size`).
+
+We additionally introduce a parameter `faces_per_pixel` which allows users to specify the top K faces which should be returned per pixel in the image (as opposed to traditional rasterization which returns only the index of the closest face in the mesh per pixel). The top K face properties can then be aggregated using different methods (such as the sigmoid/softmax approach proposed by Liu et al. in SoftRasterizer [[2]](#2)).
+
+We compared PyTorch3d with SoftRasterizer to measure the effect of both these design changes on the speed of rasterization. We selected a set of meshes of different sizes from ShapeNetV1 core, and rasterized one mesh in each batch to produce images of different sizes. We report the speed of the forward and backward passes.
+
+**Fig 1: PyTorch3d Naive vs Coarse-to-fine**
+
+This figure shows how the coarse-to-fine strategy for rasterization results in significant speed up compared to naive rasterization for large image size and large mesh sizes.
+
+<img src="../figs/p3d_naive_vs_coarse.png" width="1000">
+
+
+For small mesh and image sizes, the naive approach is slightly faster. We advise that you understand the data you are using and choose the rasterization setting which suits your performance requirements. It is easy to switch between the naive and coarse-to-fine options by adjusting the `bin_size` value when initializing the [rasterization settings](https://github.com/facebookresearch/pytorch3d/blob/master/pytorch3d/renderer/mesh/rasterizer.py#L26).
+
+Setting `bin_size = 0` will enable naive rasterization. If `bin_size > 0`, the coarse-to-fine approach is used. The default is `bin_size = None` in which case we set the bin size based on [heuristics](https://github.com/facebookresearch/pytorch3d/blob/master/pytorch3d/renderer/mesh/rasterize_meshes.py#L92).
+
+**Fig 2: PyTorch3d Coarse-to-fine vs SoftRasterizer**
+
+This figure shows the effect of the _combination_ of coarse-to-fine rasterization and caching the faces rasterized per pixel returned from the forward pass. For large meshes and image sizes, we again observe that the PyTorch3d rasterizer is significantly faster, noting that the speed is dominated by the forward pass and the backward pass is very fast.
+
+In the SoftRasterizer implementation, in both the forward and backward pass, there is a loop over every single face in the mesh for every pixel in the image. Therefore, the time for the full forward plus backward pass is ~2x the time for the forward pass. For small mesh and image sizes, the SoftRasterizer approach is slightly faster.
+
+<img src="../figs/p3d_vs_softras.png" width="1000">
+
+
+
+### 2. Support for Heterogeneous Batches
+
+PyTorch3d supports efficient rendering of batches of meshes where each mesh has different numbers of vertices and faces. This is done without using padded inputs.
+
+We again compare with SoftRasterizer which only supports batches of homogeneous meshes and test two cases: 1) a for loop over meshes in the batch, 2) padded inputs, and compare with the native heterogeneous batching support in PyTorch3d.
+
+We group meshes from ShapeNet into bins based on the number of faces in the mesh, and sample to compose a batch. We then render images of fixed size and measure the speed of the forward and backward passes.
+
+We tested with a range of increasingly large meshes and bin sizes.
+
+**Fig 3: PyTorch3d heterogeneous batching compared with SoftRasterizer**
+
+<img src="../figs/fullset_batch_size_16.png" width="700"/>
+
+This shows that for large meshes and large bin width (i.e. more variation in mesh size in the batch) the heterogeneous batching approach in PyTorch3d is faster than either of the workarounds with SoftRasterizer.
+
+(settings: batch size = 16, mesh sizes in bins ranging from 500-350k faces, image size = 64, faces per pixel = 100)
+
+---
+
+**NOTE: CUDA Memory usage**
+
+The SoftRasterizer forward CUDA kernel only outputs one `(N, H, W, 4)` FloatTensor compared with the PyTorch3d rasterizer forward CUDA kernel which outputs 4 tensors:
+
+  - `pix_to_face`, LongTensor `(N, H, W, K)`  
+  - `zbuf`, FloatTensor `(N, H, W, K)`
+  - `dist`, FloatTensor `(N, H, W, K)`
+  - `bary_coords`, FloatTensor `(N, H, W, K, 3)`
+
+where **N** = batch size, **H/W** are image height/width, **K** is the faces per pixel. The PyTorch3d backward pass returns gradients for `zbuf`, `dist` and `bary_coords`.
+
+Returning intermediate variables from rasterization has an associated memory cost. We can calculate the theoretical lower bound on the memory usage for the forward and backward pass as follows:
+
+```
+# Assume 4 bytes per float, and 8 bytes for long
+
+memory_forward_pass = ((N * H * W * K) * 2 + (N * H * W * K * 3)) * 4 + (N * H * W * K) * 8
+memory_backward_pass = ((N * H * W * K) * 2 + (N * H * W * K * 3)) * 4
+
+total_memory = memory_forward_pass + memory_backward_pass
+             = (N * H * W * K) * (5 * 4 * 2 + 8)
+             = (N * H * W * K) * 48
+```
+
+We need 48 bytes per face per pixel of the rasterized output. In order to remain within bounds for memory usage we can vary the batch size (**N**), image size (**H/W**) and faces per pixel (**K**).  For example, for a fixed batch size, if using a larger image size, try reducing the faces per pixel.
+
+---
+
+
+### 3. Modular design for easy experimentation and extensibility.
+
+We redesigned the rendering pipeline from the ground up to be modular and extensible and challenged many of the limitations in existing libraries. Refer to [renderer_getting_started.md](renderer_getting_started.md) for a detailed description of the architecture.
+
+
+### References
+
+<a id="1">[1]</a> Kato et al, 'Neural 3D Mesh Renderer', CVPR 2018
+
+<a id="2">[2]</a> Liu et al, 'Soft Rasterizer: A Differentiable Renderer for Image-based 3D Reasoning', ICCV 2019
--- a/docs/notes/renderer_getting_started.md
+++ b/docs/notes/renderer_getting_started.md
@ -0,0 +1,81 @@
+# Renderer Getting Started
+
+### Architecture Overview
+
+The renderer is designed to be modular, extensible and support batching and gradients for all inputs. The following figure describes all the components of the rendering pipeline.
+
+<img src="../figs/architecture_overview.png" width="1000">
+
+##### Fragments
+
+The **rasterizer** returns 4 output tensors in a named tuple.
+
+- **`pix_to_face`**: LongTensor of shape `(N, image_size, image_size, faces_per_pixel)` specifying the indices of the faces (in the packed faces) which overlap each pixel in the image.
+- **`zbuf`**: FloatTensor of shape `(N, image_size, image_size, faces_per_pixel)` giving the z-coordinates of the nearest faces at each pixel in world coordinates, sorted in ascending z-order.
+- **`bary_coords`**: FloatTensor of shape `(N, image_size, image_size, faces_per_pixel, 3)`
+  giving the barycentric coordinates in NDC units of the nearest faces at each pixel, sorted in ascending z-order.
+- **`pix_dists`**: FloatTensor of shape `(N, image_size, image_size, faces_per_pixel)` giving the signed Euclidean distance (in NDC units) in the x/y plane of each point closest to the pixel.
+
+
+See the renderer API reference for more details about each component in the pipeline.
+
+---
+
+**NOTE:**
+
+The differentiable renderer API is experimental and subject to change!
+
+---
+
+### Coordinate transformation conventions
+
+Rendering requires transformations between several different coordinate frames: world space, view/camera space, NDC space and screen space. At each step it is important to know where the camera is located, how the x,y,z axes are aligned and the possible range of values. The following figure outlines the conventions used in PyTorch3d.
+
+<img src="../figs/transformations_overview.png" width="1000">
+
+
+
+---
+
+**NOTE: PyTorch3d vs OpenGL**
+
+While we tried to emulate several aspects of OpenGL, the NDC coordinate system in PyTorch3d is **right-handed** compared with a **left-handed** NDC coordinate system in OpenGL (the projection matrix switches the handedness).
+
+In OpenGL, the camera at the origin is looking along `-z` axis in camera space, but it is looking along the `+z` axis in NDC space.
+
+<img align="center" src="../figs/opengl_coordframes.png" width="300">
+
+---
+### A simple renderer
+
+A renderer in PyTorch3d is composed of a **rasterizer** and a **shader**. Create a renderer in a few simple steps:
+
+```
+# Imports
+from pytorch3d.renderer import (
+    OpenGLPerspectiveCameras, look_at_view_transform,
+    RasterizationSettings, BlendParams,
+    MeshRenderer, MeshRasterizer, PhongShader
+)
+
+# Initialize an OpenGL perspective camera.
+R, T = look_at_view_transform(2.7, 10, 20)
+cameras = OpenGLPerspectiveCameras(device=device, R=R, T=T)
+
+# Define the settings for rasterization and shading. Here we set the output image to be of size
+# 512x512. As we are rendering images for visualization purposes only we will set faces_per_pixel=1
+# and blur_radius=0.0. Refer to rasterize_meshes.py for explanations of these parameters.
+raster_settings = RasterizationSettings(
+    image_size=512,
+    blur_radius=0.0,
+    faces_per_pixel=1,
+    bin_size=0
+)
+
+# Create a phong renderer by composing a rasterizer and a shader. Here we can use a predefined
+# PhongShader, passing in the device on which to initialize the default parameters
+renderer = MeshRenderer(
+    rasterizer=MeshRasterizer(cameras=cameras, raster_settings=raster_settings),
+    shader=PhongShader(device=device, cameras=cameras)
+)
+```
--- a/docs/tutorials/bundle_adjustment.ipynb
+++ b/docs/tutorials/bundle_adjustment.ipynb
--- a/docs/tutorials/camera_position_optimization_with_differentiable_rendering.ipynb
+++ b/docs/tutorials/camera_position_optimization_with_differentiable_rendering.ipynb
--- a/docs/tutorials/data/bundle_adjustment_final.png
+++ b/docs/tutorials/data/bundle_adjustment_final.png
--- a/docs/tutorials/data/bundle_adjustment_initialization.png
+++ b/docs/tutorials/data/bundle_adjustment_initialization.png
--- a/docs/tutorials/data/camera_graph.pth
+++ b/docs/tutorials/data/camera_graph.pth
--- a/docs/tutorials/data/cow_mesh/README.md
+++ b/docs/tutorials/data/cow_mesh/README.md
@ -0,0 +1,6 @@
+
+# Acknowledgements
+
+Thank you to Keenan Crane for allowing the cow mesh model to be used freely in the public domain.
+
+###### Source: http://www.cs.cmu.edu/~kmcrane/Projects/ModelRepository/
--- a/docs/tutorials/data/cow_mesh/cow.mtl
+++ b/docs/tutorials/data/cow_mesh/cow.mtl
@ -0,0 +1,9 @@
+newmtl material_1
+map_Kd cow_texture.png
+
+# Test colors
+
+Ka 1.000 1.000 1.000  # white
+Kd 1.000 1.000 1.000  # white
+Ks 0.000 0.000 0.000  # black
+Ns 10.0
--- a/docs/tutorials/data/cow_mesh/cow.obj
+++ b/docs/tutorials/data/cow_mesh/cow.obj
--- a/docs/tutorials/data/cow_mesh/cow_texture.png
+++ b/docs/tutorials/data/cow_mesh/cow_texture.png
--- a/docs/tutorials/data/teapot.obj
+++ b/docs/tutorials/data/teapot.obj
--- a/docs/tutorials/deform_source_mesh_to_target_mesh.ipynb
+++ b/docs/tutorials/deform_source_mesh_to_target_mesh.ipynb
--- a/docs/tutorials/render_textured_meshes.ipynb
+++ b/docs/tutorials/render_textured_meshes.ipynb
--- a/docs/tutorials/utils/init.py
+++ b/docs/tutorials/utils/init.py
@ -0,0 +1,8 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+
+from .camera_visualisation import (
+    get_camera_wireframe,
+    plot_camera_scene,
+    plot_cameras,
+)
+from .plot_image_grid import image_grid
--- a/docs/tutorials/utils/camera_visualisation.py
+++ b/docs/tutorials/utils/camera_visualisation.py
@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+
+import matplotlib.pyplot as plt
+import torch
+from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 unused import
+
+
+def get_camera_wireframe(scale: float = 0.3):
+    """
+    Returns a wireframe of a 3D line-plot of a camera symbol.
+    """
+    a = 0.5 * torch.tensor([-2, 1.5, 4])
+    b = 0.5 * torch.tensor([2, 1.5, 4])
+    c = 0.5 * torch.tensor([-2, -1.5, 4])
+    d = 0.5 * torch.tensor([2, -1.5, 4])
+    C = torch.zeros(3)
+    F = torch.tensor([0, 0, 3])
+    camera_points = [a, b, d, c, a, C, b, d, C, c, C, F]
+    lines = torch.stack([x.float() for x in camera_points]) * scale
+    return lines
+
+
+def plot_cameras(ax, cameras, color: str = "blue"):
+    """
+    Plots a set of `cameras` objects into the maplotlib axis `ax` with
+    color `color`.
+    """
+    cam_wires_canonical = get_camera_wireframe().cuda()[None]
+    cam_trans = cameras.get_world_to_view_transform().inverse()
+    cam_wires_trans = cam_trans.transform_points(cam_wires_canonical)
+    plot_handles = []
+    for wire in cam_wires_trans:
+        # the Z and Y axes are flipped intentionally here!
+        x_, z_, y_ = wire.detach().cpu().numpy().T.astype(float)
+        (h,) = ax.plot(x_, y_, z_, color=color, linewidth=0.3)
+        plot_handles.append(h)
+    return plot_handles
+
+
+def plot_camera_scene(cameras, cameras_gt, status: str):
+    """
+    Plots a set of predicted cameras `cameras` and their corresponding
+    ground truth locations `cameras_gt`. The plot is named with
+    a string passed inside the `status` argument.
+    """
+    fig = plt.figure()
+    ax = fig.gca(projection="3d")
+    ax.clear()
+    ax.set_title(status)
+    handle_cam = plot_cameras(ax, cameras, color="#FF7D1E")
+    handle_cam_gt = plot_cameras(ax, cameras_gt, color="#812CE5")
+    plot_radius = 3
+    ax.set_xlim3d([-plot_radius, plot_radius])
+    ax.set_ylim3d([3 - plot_radius, 3 + plot_radius])
+    ax.set_zlim3d([-plot_radius, plot_radius])
+    ax.set_xlabel("x")
+    ax.set_ylabel("z")
+    ax.set_zlabel("y")
+    labels_handles = {
+        "Estimated cameras": handle_cam[0],
+        "GT cameras": handle_cam_gt[0],
+    }
+    ax.legend(
+        labels_handles.values(),
+        labels_handles.keys(),
+        loc="upper center",
+        bbox_to_anchor=(0.5, 0),
+    )
+    plt.show()
+    return fig
--- a/docs/tutorials/utils/plot_image_grid.py
+++ b/docs/tutorials/utils/plot_image_grid.py
@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+
+import matplotlib.pyplot as plt
+
+
+def image_grid(
+    images,
+    rows=None,
+    cols=None,
+    fill: bool = True,
+    show_axes: bool = False,
+    rgb: bool = True,
+):
+    """
+    A util function for plotting a grid of images.
+
+    Args:
+        images: (N, H, W, 4) array of RGBA images
+        rows: number of rows in the grid
+        cols: number of columns in the grid
+        fill: boolean indicating if the space between images should be filled
+        show_axes: boolean indicating if the axes of the plots should be visible
+        rgb: boolean, If True, only RGB channels are plotted.
+            If False, only the alpha channel is plotted.
+
+    Returns:
+        None
+    """
+    if (rows is None) != (cols is None):
+        raise ValueError("Specify either both rows and cols or neither.")
+
+    if rows is None:
+        rows = len(images)
+        cols = 1
+
+    gridspec_kw = {"wspace": 0.0, "hspace": 0.0} if fill else {}
+    fig, axarr = plt.subplots(
+        rows, cols, gridspec_kw=gridspec_kw, figsize=(15, 9)
+    )
+    bleed = 0
+    fig.subplots_adjust(
+        left=bleed, bottom=bleed, right=(1 - bleed), top=(1 - bleed)
+    )
+
+    for ax, im in zip(axarr.ravel(), images):
+        if rgb:
+            # only render RGB channels
+            ax.imshow(im[..., :3])
+        else:
+            # only render Alpha channel
+            ax.imshow(im[..., 3])
+        if not show_axes:
+            ax.set_axis_off()
--- a/packaging/build_conda.sh
+++ b/packaging/build_conda.sh
@ -0,0 +1,15 @@
+#!/bin/bash
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+set -ex
+
+script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+. "$script_dir/pkg_helpers.bash"
+
+export BUILD_TYPE=conda
+setup_env 0.1.0
+export SOURCE_ROOT_DIR="$PWD"
+setup_conda_pytorch_constraint
+setup_conda_cudatoolkit_constraint
+setup_visual_studio_constraint
+# shellcheck disable=SC2086
+conda build $CONDA_CHANNEL_FLAGS ${TEST_FLAG:-} -c defaults -c conda-forge --no-anaconda-upload -c takatosp1 --python "$PYTHON_VERSION" packaging/pytorch3d
--- a/packaging/build_wheel.sh
+++ b/packaging/build_wheel.sh
@ -0,0 +1,14 @@
+#!/bin/bash
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+set -ex
+
+script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+. "$script_dir/pkg_helpers.bash"
+
+export BUILD_TYPE=wheel
+setup_env 0.1.0
+setup_wheel_python
+pip_install numpy
+setup_pip_pytorch_version
+python setup.py clean
+IS_WHEEL=1 python setup.py bdist_wheel
--- a/packaging/conda/build_pytorch3d.sh
+++ b/packaging/conda/build_pytorch3d.sh
@ -0,0 +1,215 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+# Source /remote/anaconda_token when it exists and is executable; a failure
+# while sourcing it is ignored.
+# NOTE(review): the guard checks the executable bit (-x) although the file is
+# only sourced -- confirm that -r/-f was not intended.
+if [[ -x "/remote/anaconda_token" ]]; then
+    . /remote/anaconda_token || true
+fi
+
+# From here on abort on the first failing command and trace every command.
+set -ex
+
+# Function to retry functions that sometimes timeout or have flaky failures
+retry () {
+    $*  || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
+}
+
+# Parse arguments and determine version
+###########################################################
+
+# Exactly three positional arguments are required:
+#   $1 cuda version ("cpu" or e.g. "101"), $2 package version, $3 build number.
+# NOTE(review): the usage text below also mentions DESIRED_PYTHON, which is
+# not a positional argument but a variable read further down in the script.
+if [ "$#" -ne 3 ]; then
+    echo "Illegal number of parameters. Pass cuda version, pytorch3d version, pytorch3d build number"
+    echo "CUDA version should be Mm with no dot, e.g. '80'"
+    echo "DESIRED_PYTHON should be M.m, e.g. '2.7'"
+    exit 1
+fi
+
+desired_cuda="$1"
+build_version="$2"
+build_number="$3"
+
+# `tr -d cuda.` deletes every 'c', 'u', 'd', 'a' and '.' character, so
+# "cuda10.1", "cu101" and "10.1" are all normalized to "101".
+if [[ "$desired_cuda" != cpu ]]; then
+  desired_cuda="$(echo $desired_cuda | tr -d cuda. )"
+fi
+echo "Building cuda version $desired_cuda and pytorch3d version: $build_version build_number: $build_number"
+
+# Derive cuver ("cpu" or "cu<Mm>") and, for CUDA builds, rewrite desired_cuda
+# from the dotless form into M.m.
+if [[ "$desired_cuda" == 'cpu' ]]; then
+    cpu_only=1
+    cuver="cpu"
+else
+    # Switch desired_cuda to be M.m to be consistent with other scripts in
+    # pytorch/builder
+    export FORCE_CUDA=1
+    cuda_nodot="$desired_cuda"
+
+    # Two digits -> "M.m" (e.g. 92 -> 9.2); three digits -> "MM.m"
+    # (e.g. 101 -> 10.1); any other length is rejected.
+    if [[ ${#cuda_nodot} -eq 2 ]]; then
+        desired_cuda="${desired_cuda:0:1}.${desired_cuda:1:1}"
+    elif [[ ${#cuda_nodot} -eq 3 ]]; then
+        desired_cuda="${desired_cuda:0:2}.${desired_cuda:2:1}"
+    else
+        echo "unknown cuda version $cuda_nodot"
+        exit 1
+    fi
+
+    cuver="cu$cuda_nodot"
+fi
+
+# Expose the requested version and build number to the build.
+export PYTORCH3D_BUILD_VERSION=$build_version
+export PYTORCH3D_BUILD_NUMBER=$build_number
+
+# Default to building for several Python versions when none was requested.
+if [[ -z "$DESIRED_PYTHON" ]]; then
+    DESIRED_PYTHON=('3.5' '3.6' '3.7')
+fi
+
+# Directory containing this script.
+SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
+
+# Default work directory: a time-stamped folder below the current directory,
+# written as a Windows path with backslashes.
+if [[ -z "$WIN_PACKAGE_WORK_DIR" ]]; then
+    WIN_PACKAGE_WORK_DIR="$(echo $(pwd -W) | tr '/' '\\')\\tmp_conda_$(date +%H%M%S)"
+fi
+
+mkdir -p "$WIN_PACKAGE_WORK_DIR" || true
+pytorch3d_rootdir="$(realpath ${WIN_PACKAGE_WORK_DIR})/pytorch3d-src"
+git config --system core.longpaths true
+
+# Clone the repository that contains this script (two levels above
+# SOURCE_DIR) into the work directory, unless a checkout already exists.
+if [[ ! -d "$pytorch3d_rootdir" ]]; then
+    rm -rf "$pytorch3d_rootdir"
+    # Fixed: the clone source was the literal text SOURCE_DIR/../.. because
+    # the `$` was missing.
+    git clone "$SOURCE_DIR/../.." "$pytorch3d_rootdir"
+
+    # pushd "$vision_rootdir"
+    # git checkout $PYTORCH_BRANCH
+    # popd
+fi
+
+cd "$SOURCE_DIR"
+
+# Install a private Miniconda into the work directory. tmp_conda and
+# miniconda_exe are exported because install_conda.bat reads them.
+export tmp_conda="${WIN_PACKAGE_WORK_DIR}\\conda"
+export miniconda_exe="${WIN_PACKAGE_WORK_DIR}\\miniconda.exe"
+rm -rf "$tmp_conda"
+rm -f "$miniconda_exe"
+# NOTE(review): -k disables TLS certificate verification for an installer
+# that is executed right afterwards -- consider dropping it.
+curl -sSk https://repo.continuum.io/miniconda/Miniconda3-latest-Windows-x86_64.exe -o "$miniconda_exe"
+"$SOURCE_DIR/install_conda.bat" && rm "$miniconda_exe"
+# Put the freshly installed conda first on PATH.
+pushd $tmp_conda
+export PATH="$(pwd):$(pwd)/Library/usr/bin:$(pwd)/Library/bin:$(pwd)/Scripts:$(pwd)/bin:$PATH"
+popd
+retry conda install -yq conda-build
+
+# Channel passed to `conda build -c` below; uploading is disabled.
+ANACONDA_USER=pytorch-nightly
+conda config --set anaconda_upload no
+
+
+# Export the constraint strings for the conda build; presumably they are
+# substituted into the recipe -- TODO confirm.
+# NOTE(review): TORCHVISION_PACKAGE_SUFFIX looks like a leftover from the
+# script this one was derived from -- confirm whether anything reads it.
+export TORCHVISION_PACKAGE_SUFFIX=""
+if [[ "$desired_cuda" == 'cpu' ]]; then
+    export CONDA_CUDATOOLKIT_CONSTRAINT=""
+    export CONDA_CPUONLY_FEATURE="- cpuonly # [not osx]"
+    export CUDA_VERSION="None"
+else
+    export CONDA_CPUONLY_FEATURE=""
+    # Sourced (not executed) so that the CUDA_VERSION / CUDNN_VERSION it
+    # exports are visible here.
+    . ./switch_cuda_version.sh $desired_cuda
+    # Pin cudatoolkit to the requested minor release; only the versions
+    # listed here are supported.
+    if [[ "$desired_cuda" == "10.1" ]]; then
+        export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=10.1,<10.2 # [not osx]"
+    elif [[ "$desired_cuda" == "10.0" ]]; then
+        export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=10.0,<10.1 # [not osx]"
+    elif [[ "$desired_cuda" == "9.2" ]]; then
+        export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=9.2,<9.3 # [not osx]"
+    elif [[ "$desired_cuda" == "9.0" ]]; then
+        export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=9.0,<9.1 # [not osx]"
+    elif [[ "$desired_cuda" == "8.0" ]]; then
+        export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=8.0,<8.1 # [not osx]"
+    else
+        echo "unhandled desired_cuda: $desired_cuda"
+        exit 1
+    fi
+fi
+
+# Without an explicit PYTORCH_VERSION, take the last matching entry that
+# `conda search` lists for the pytorch-nightly channel: the file name must
+# contain the CUDA tag (e.g. "cuda101" or "cpu") and "py<DESIRED_PYTHON>".
+# Anything from a '+' onwards is stripped from the version.
+# NOTE(review): the Python snippet reads os.environ['DESIRED_PYTHON'], but
+# this script never exports DESIRED_PYTHON and may set it to an array above
+# -- confirm this path works when the caller did not export it.
+if [[ -z "$PYTORCH_VERSION" ]]; then
+    export CONDA_CHANNEL_FLAGS="-c pytorch-nightly"
+    export PYTORCH_VERSION="$(conda search --json 'pytorch[channel=pytorch-nightly]' | \
+                                python -c "import os, sys, json, re; cuver = '$cuver'; \
+                                cuver = cuver.replace('cu', 'cuda') if cuver != 'cpu' else cuver; \
+                                print(re.sub(r'\\+.*$', '', \
+                                [x['version'] for x in json.load(sys.stdin)['pytorch'] \
+                                    if (x['platform'] == 'darwin' or cuver in x['fn']) \
+                                    and 'py' + os.environ['DESIRED_PYTHON'] in x['fn']][-1]))")"
+    if [[ -z "$PYTORCH_VERSION" ]]; then
+        echo "PyTorch version auto detection failed"
+        echo "No package found for desired_cuda=$desired_cuda and DESIRED_PYTHON=$DESIRED_PYTHON"
+        exit 1
+    fi
+else
+    export CONDA_CHANNEL_FLAGS="-c pytorch -c pytorch-nightly"
+fi
+# NOTE(review): both branches below export the same values; the cpu/cuda
+# distinction currently has no effect here.
+if [[ "$desired_cuda" == 'cpu' ]]; then
+    export CONDA_PYTORCH_BUILD_CONSTRAINT="- pytorch==$PYTORCH_VERSION"
+    export CONDA_PYTORCH_CONSTRAINT="- pytorch==$PYTORCH_VERSION"
+else
+    export CONDA_PYTORCH_BUILD_CONSTRAINT="- pytorch==${PYTORCH_VERSION}"
+    export CONDA_PYTORCH_CONSTRAINT="- pytorch==${PYTORCH_VERSION}"
+fi
+
+# Loop through all Python versions to build a package for each
+for py_ver in "${DESIRED_PYTHON[@]}"; do
+    # NOTE(review): build_string_suffix is never set in this script, so it
+    # expands to an empty string unless the caller provides it -- confirm.
+    build_string="py${py_ver}_${build_string_suffix}"
+    folder_tag="${build_string}_$(date +'%Y%m%d')"
+
+    # Create the conda package into this temporary folder. This is so we can find
+    # the package afterwards, as there's no easy way to extract the final filename
+    # from conda-build
+    output_folder="out_$folder_tag"
+    rm -rf "$output_folder"
+    mkdir "$output_folder"
+
+    export VSTOOLCHAIN_PACKAGE=vs2017
+
+    # We need to build the compiler activation scripts first on Windows
+    time VSDEVCMD_ARGS=${VSDEVCMD_ARGS[@]} \
+        conda build -c "$ANACONDA_USER" \
+                    --no-anaconda-upload \
+                    --output-folder "$output_folder" \
+                    ../$VSTOOLCHAIN_PACKAGE
+
+    # The recipe of this project lives in packaging/pytorch3d (i.e.
+    # ../pytorch3d from here). The previous ../torchvision path and
+    # '*torchvision*' package pattern were leftovers from another project and
+    # do not match this repository.
+    cp ../$VSTOOLCHAIN_PACKAGE/conda_build_config.yaml ../pytorch3d/conda_build_config.yaml
+
+    conda config --set anaconda_upload no
+    echo "Calling conda-build at $(date)"
+    # The CUDA 9.2 build differs from the others only by the additional
+    # numba/label/dev channel.
+    if [[ "$desired_cuda" == "9.2" ]]; then
+        time CMAKE_ARGS=${CMAKE_ARGS[@]} \
+            BUILD_VERSION="$PYTORCH3D_BUILD_VERSION" \
+            CU_VERSION="$cuver" \
+            SOURCE_ROOT_DIR="$pytorch3d_rootdir" \
+            conda build -c "$ANACONDA_USER" \
+                        -c defaults \
+                        -c conda-forge \
+                        -c "numba/label/dev" \
+                        --no-anaconda-upload \
+                        --python "$py_ver" \
+                        --output-folder "$output_folder" \
+                        --no-verify \
+                        --no-test \
+                        ../pytorch3d
+    else
+        time CMAKE_ARGS=${CMAKE_ARGS[@]} \
+            BUILD_VERSION="$PYTORCH3D_BUILD_VERSION" \
+            CU_VERSION="$cuver" \
+            SOURCE_ROOT_DIR="$pytorch3d_rootdir" \
+            conda build -c "$ANACONDA_USER" \
+                        -c defaults \
+                        -c conda-forge \
+                        --no-anaconda-upload \
+                        --python "$py_ver" \
+                        --output-folder "$output_folder" \
+                        --no-verify \
+                        --no-test \
+                        ../pytorch3d
+    fi
+    echo "Finished conda-build at $(date)"
+
+    # Extract the package for testing
+    ls -lah "$output_folder"
+    built_package="$(find $output_folder/ -name '*pytorch3d*.tar.bz2')"
+
+    # Copy the built package to the host machine for persistence before testing
+    if [[ -n "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then
+        mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true
+        cp "$built_package" "$PYTORCH_FINAL_PACKAGE_DIR/"
+    fi
+done
+
+
+# Stop aborting on errors again.
+set +e
--- a/packaging/conda/install_conda.bat
+++ b/packaging/conda/install_conda.bat
@ -0,0 +1,2 @@
+:: Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+:: Runs the Miniconda installer at %miniconda_exe% and waits for it to exit.
+:: %miniconda_exe% and %tmp_conda% must be set in the caller's environment
+:: (build_pytorch3d.sh exports both before invoking this file).
+:: Presumably /S runs the installer silently and /D selects the target
+:: directory -- see the conda silent-install documentation to confirm.
+start /wait "" "%miniconda_exe%" /S /InstallationType=JustMe /RegisterPython=0 /AddToPath=0 /D=%tmp_conda%
--- a/packaging/conda/switch_cuda_version.sh
+++ b/packaging/conda/switch_cuda_version.sh
@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+if [[ "$OSTYPE" == "msys" ]]; then
+    CUDA_DIR="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v$1"
+else
+    CUDA_DIR="/usr/local/cuda-$1"
+fi
+
+if ! ls "$CUDA_DIR"
+then
+    echo "folder $CUDA_DIR not found to switch"
+fi
+
+echo "Switching symlink to $CUDA_DIR"
+mkdir -p /usr/local
+rm -fr /usr/local/cuda
+ln -s "$CUDA_DIR" /usr/local/cuda
+
+if [[ "$OSTYPE" == "msys" ]]; then
+    export CUDA_VERSION=`ls /usr/local/cuda/bin/cudart64*.dll | head -1 | tr '._' ' ' | cut -d ' ' -f2`
+    export CUDNN_VERSION=`ls /usr/local/cuda/bin/cudnn64*.dll | head -1 | tr '._' ' ' | cut -d ' ' -f2`
+else
+    export CUDA_VERSION=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev)
+    export CUDNN_VERSION=$(ls /usr/local/cuda/lib64/libcudnn.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev)
+fi
+
+ls -alh /usr/local/cuda
+
+echo "CUDA_VERSION=$CUDA_VERSION"
+echo "CUDNN_VERSION=$CUDNN_VERSION"
--- a/packaging/pkg_helpers.bash
+++ b/packaging/pkg_helpers.bash
@ -0,0 +1,265 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+# shellcheck shell=bash
+# A set of useful bash functions for common functionality we need to do in
+# many build scripts
+
+
+# Derive CUDA-related build settings from CU_VERSION.
+#
+# Reads:
+#   CU_VERSION       (cu92, cu100, cu101 or cpu; anything else aborts)
+#   NO_CUDA_PACKAGE  (bool; non-empty means the package has no CUDA builds)
+#   BUILD_TYPE       (conda, wheel)
+#
+# Exports:
+#   VERSION_SUFFIX          (e.g., "")
+#   PYTORCH_VERSION_SUFFIX  (e.g., +cpu)
+#   WHEEL_DIR               (e.g., cu100/)
+#   CUDA_HOME   (e.g., /usr/local/cuda-9.2/, respected by torch.utils.cpp_extension)
+#   FORCE_CUDA  (respected by pytorch3d setup.py)
+#   NVCC_FLAGS  (respected by pytorch3d setup.py)
+#   CUDA_HOME, FORCE_CUDA and NVCC_FLAGS are not touched when CU_VERSION=cpu.
+#
+# Precondition: CUDA versions are installed in their conventional locations in
+# /usr/local/cuda-*
+#
+# Why two suffixes?  When a CUDA package is built on a platform with CUDA
+# support, VERSION_SUFFIX == PYTORCH_VERSION_SUFFIX.  A package with only CPU
+# bits (NO_CUDA_PACKAGE set) keeps VERSION_SUFFIX empty while
+# PYTORCH_VERSION_SUFFIX may still be e.g. +cpu, since that is how the matching
+# build of PyTorch is selected.  On OS X neither suffix is ever set.
+setup_cuda() {
+  # Defaults: no suffix anywhere.
+  export VERSION_SUFFIX=""
+  export PYTORCH_VERSION_SUFFIX=""
+  export WHEEL_DIR=""
+
+  # Suffixes only exist for wheels, and never on OS X.
+  if [[ "$BUILD_TYPE" == "wheel" ]]; then
+    if [[ "$(uname)" != Darwin ]]; then
+      # cu101 is the default CUDA and stays unsuffixed.
+      if [[ "$CU_VERSION" != "cu101" ]]; then
+        export PYTORCH_VERSION_SUFFIX="+$CU_VERSION"
+      fi
+      # Packages that do have CUDA builds follow pytorch's suffix scheme.
+      if [[ -z "$NO_CUDA_PACKAGE" ]]; then
+        export VERSION_SUFFIX="$PYTORCH_VERSION_SUFFIX"
+        export WHEEL_DIR="$CU_VERSION/"
+      fi
+    fi
+  fi
+
+  # Hard-coding gencode flags is temporary situation until
+  # https://github.com/pytorch/pytorch/pull/23408 lands
+  local flags_with_sm75="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_50,code=compute_50"
+  local flags_without_sm75="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_50,code=compute_50"
+
+  # Map CU_VERSION onto a toolkit release number and its gencode flags.
+  local toolkit_release selected_flags
+  case "$CU_VERSION" in
+    cu101)
+      toolkit_release=10.1
+      selected_flags="$flags_with_sm75"
+      ;;
+    cu100)
+      toolkit_release=10.0
+      selected_flags="$flags_with_sm75"
+      ;;
+    cu92)
+      toolkit_release=9.2
+      selected_flags="$flags_without_sm75"
+      ;;
+    cpu)
+      # Nothing CUDA-related to export.
+      return 0
+      ;;
+    *)
+      echo "Unrecognized CU_VERSION=$CU_VERSION"
+      exit 1
+      ;;
+  esac
+
+  if [[ "$OSTYPE" == "msys" ]]; then
+    export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v$toolkit_release"
+  else
+    export CUDA_HOME="/usr/local/cuda-$toolkit_release/"
+  fi
+  export FORCE_CUDA=1
+  export NVCC_FLAGS="$selected_flags"
+}
+
+# Finalise BUILD_VERSION: supply a nightly-style default when it is empty and
+# always append VERSION_SUFFIX.
+#
+# Inputs:
+#   $1             base version used for the default (e.g., 0.2.0)
+#   BUILD_VERSION  (e.g., 0.2.0 or empty)
+#   VERSION_SUFFIX (e.g., +cpu)
+#
+# Outputs:
+#   BUILD_VERSION (e.g., 0.2.0.dev20190807+cpu)
+#
+# Usage: setup_build_version 0.2.0
+setup_build_version() {
+  # An unset or empty BUILD_VERSION falls back to "<arg>.dev<YYYYMMDD>".
+  local base_version="${BUILD_VERSION:-$1.dev$(date "+%Y%m%d")}"
+  export BUILD_VERSION="$base_version$VERSION_SUFFIX"
+}
+
+# Set some useful variables for OS X, if applicable
+setup_macos() {
+  if [[ "$(uname)" == Darwin ]]; then
+    export MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++
+  fi
+}
+
+# Top-level entry point for things every package will need to do
+#
+# Runs, in order: setup_cuda, setup_build_version (forwarding the first
+# argument as the fallback base version) and setup_macos.
+#
+# Usage: setup_env 0.2.0
+setup_env() {
+  setup_cuda
+  setup_build_version "$1"
+  setup_macos
+}
+
+# Function to retry functions that sometimes timeout or have flaky failures
+#
+# Usage: retry <command> [args...]
+#
+# Runs the command up to five times, sleeping 1, 2, 4 and 8 seconds before the
+# successive retries, and stops at the first success. The retries run in
+# subshells. Arguments are forwarded with "$@" so that arguments containing
+# whitespace or glob characters reach the command unchanged.
+retry () {
+    "$@"  || (sleep 1 && "$@") || (sleep 2 && "$@") || (sleep 4 && "$@") || (sleep 8 && "$@")
+}
+
+# Put the Python interpreter selected by PYTHON_VERSION first in PATH.
+#
+# Inputs:
+#   PYTHON_VERSION (2.7, 3.5, 3.6, 3.7, 3.8)
+#   UNICODE_ABI (bool; only consulted for 2.7)
+#
+# Outputs:
+#   PATH modified to put correct Python version in PATH
+#
+# Precondition: If Linux, you are in a soumith/manylinux-cuda* Docker image
+setup_wheel_python() {
+  # OS X: build a fresh conda environment for the requested Python instead.
+  if [[ "$(uname)" == Darwin ]]; then
+    eval "$(conda shell.bash hook)"
+    conda env remove -n "env$PYTHON_VERSION" || true
+    conda create -yn "env$PYTHON_VERSION" python="$PYTHON_VERSION"
+    conda activate "env$PYTHON_VERSION"
+    return
+  fi
+
+  # Elsewhere: translate the version into the ABI tag that names the
+  # interpreter directory under /opt/python.
+  case "$PYTHON_VERSION" in
+    2.7)
+      # The wide-unicode ABI carries a trailing "u".
+      python_abi="cp27-cp27m${UNICODE_ABI:+u}"
+      ;;
+    3.5|3.6|3.7)
+      python_abi="cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m"
+      ;;
+    3.8)
+      python_abi=cp38-cp38
+      ;;
+    *)
+      echo "Unrecognized PYTHON_VERSION=$PYTHON_VERSION"
+      exit 1
+      ;;
+  esac
+  export PATH="/opt/python/$python_abi/bin:$PATH"
+}
+
+# Install with pip a bit more robustly than the default
+#
+# Forwards all arguments to `pip install --progress-bar off`, run through
+# retry so that a failing install is attempted again.
+pip_install() {
+  retry pip install --progress-bar off "$@"
+}
+
+# Install torch with pip, respecting PYTORCH_VERSION, and record the installed
+# version into PYTORCH_VERSION, if applicable
+setup_pip_pytorch_version() {
+  if [[ -z "$PYTORCH_VERSION" ]]; then
+    # Install latest prerelease version of torch, per our nightlies, consistent
+    # with the requested cuda version
+    pip_install --pre torch -f "https://download.pytorch.org/whl/nightly/${WHEEL_DIR}torch_nightly.html"
+    if [[ "$CUDA_VERSION" == "cpu" ]]; then
+      # CUDA and CPU are ABI compatible on the CPU-only parts, so strip
+      # in this case
+      export PYTORCH_VERSION="$(pip show torch | grep ^Version: | sed 's/Version:  *//' | sed 's/+.\+//')"
+    else
+      export PYTORCH_VERSION="$(pip show torch | grep ^Version: | sed 's/Version:  *//')"
+    fi
+  else
+    pip_install "torch==$PYTORCH_VERSION$CUDA_SUFFIX" \
+      -f https://download.pytorch.org/whl/torch_stable.html \
+      -f https://download.pytorch.org/whl/nightly/torch_nightly.html
+  fi
+}
+
+# Fill PYTORCH_VERSION with the latest conda nightly version, and
+# CONDA_CHANNEL_FLAGS with appropriate flags to retrieve these versions
+#
+# Inputs (read from the environment):
+#   PYTORCH_VERSION         (optional; auto-detected when empty)
+#   CU_VERSION, PYTHON_VERSION  (used to filter the nightly packages)
+#   PYTORCH_VERSION_SUFFIX  (appended to the pins below)
+#
+# Outputs:
+#   CONDA_CHANNEL_FLAGS, PYTORCH_VERSION,
+#   CONDA_PYTORCH_BUILD_CONSTRAINT, CONDA_PYTORCH_CONSTRAINT
+#
+# NOTE(review): this comment used to say "You MUST have populated CUDA_SUFFIX
+# before hand", but the body below never reads CUDA_SUFFIX.
+setup_conda_pytorch_constraint() {
+  if [[ -z "$PYTORCH_VERSION" ]]; then
+    export CONDA_CHANNEL_FLAGS="-c pytorch-nightly"
+    # Query the pytorch-nightly channel and keep the version of the last listed
+    # package whose file name contains "py$PYTHON_VERSION" and which is either
+    # a darwin package or mentions the CUDA tag (e.g. cu100 is matched as
+    # "cuda100" or "cuda10.0"; cpu is matched as "cpu").  Everything from the
+    # first "+" on is stripped from that version.  When nothing matches, the
+    # Python snippet fails and PYTORCH_VERSION stays empty, which the check
+    # right after turns into a hard error.
+    export PYTORCH_VERSION="$(conda search --json 'pytorch[channel=pytorch-nightly]' | \
+                              python -c "import os, sys, json, re; cuver = os.environ.get('CU_VERSION'); \
+                               cuver_1 = cuver.replace('cu', 'cuda') if cuver != 'cpu' else cuver; \
+                               cuver_2 = (cuver[:-1] + '.' + cuver[-1]).replace('cu', 'cuda') if cuver != 'cpu' else cuver; \
+                               print(re.sub(r'\\+.*$', '', \
+                                [x['version'] for x in json.load(sys.stdin)['pytorch'] \
+                                  if (x['platform'] == 'darwin' or cuver_1 in x['fn'] or cuver_2 in x['fn']) \
+                                    and 'py' + os.environ['PYTHON_VERSION'] in x['fn']][-1]))")"
+    if [[ -z "$PYTORCH_VERSION" ]]; then
+      echo "PyTorch version auto detection failed"
+      echo "No package found for CU_VERSION=$CU_VERSION and PYTHON_VERSION=$PYTHON_VERSION"
+      exit 1
+    fi
+  else
+    # An explicit version may come from either the release or nightly channel.
+    export CONDA_CHANNEL_FLAGS="-c pytorch -c pytorch-nightly"
+  fi
+  # For cpu the run-time pin omits the suffix; the build-time pin is the same
+  # in both branches.
+  if [[ "$CU_VERSION" == cpu ]]; then
+    export CONDA_PYTORCH_BUILD_CONSTRAINT="- pytorch==$PYTORCH_VERSION${PYTORCH_VERSION_SUFFIX}"
+    export CONDA_PYTORCH_CONSTRAINT="- pytorch==$PYTORCH_VERSION"
+  else
+    export CONDA_PYTORCH_BUILD_CONSTRAINT="- pytorch==${PYTORCH_VERSION}${PYTORCH_VERSION_SUFFIX}"
+    export CONDA_PYTORCH_CONSTRAINT="- pytorch==${PYTORCH_VERSION}${PYTORCH_VERSION_SUFFIX}"
+  fi
+}
+
+# Translate CUDA_VERSION into CUDA_CUDATOOLKIT_CONSTRAINT
+setup_conda_cudatoolkit_constraint() {
+  export CONDA_CPUONLY_FEATURE=""
+  if [[ "$(uname)" == Darwin ]]; then
+    export CONDA_CUDATOOLKIT_CONSTRAINT=""
+  else
+    case "$CU_VERSION" in
+      cu101)
+        export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=10.1,<10.2 # [not osx]"
+        ;;
+      cu100)
+        export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=10.0,<10.1 # [not osx]"
+        ;;
+      cu92)
+        export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=9.2,<9.3 # [not osx]"
+        ;;
+      cpu)
+        export CONDA_CUDATOOLKIT_CONSTRAINT=""
+        export CONDA_CPUONLY_FEATURE="- cpuonly"
+        ;;
+      *)
+        echo "Unrecognized CU_VERSION=$CU_VERSION"
+        exit 1
+        ;;
+    esac
+  fi
+}
+
+# Build the proper compiler package before building the final package
+#
+# Only acts when OSTYPE is msys.  Exports VSTOOLCHAIN_PACKAGE and
+# VSDEVCMD_ARGS, builds the packaging/$VSTOOLCHAIN_PACKAGE recipe using
+# CONDA_CHANNEL_FLAGS, and then replaces this package's variant config with
+# the toolchain's one.
+setup_visual_studio_constraint() {
+  if [[ "$OSTYPE" == "msys" ]]; then
+      export VSTOOLCHAIN_PACKAGE=vs2019
+      export VSDEVCMD_ARGS=''
+      # shellcheck disable=SC2086
+      conda build $CONDA_CHANNEL_FLAGS --no-anaconda-upload packaging/$VSTOOLCHAIN_PACKAGE
+      # The recipe in this repository lives in packaging/pytorch3d.
+      cp packaging/$VSTOOLCHAIN_PACKAGE/conda_build_config.yaml packaging/pytorch3d/conda_build_config.yaml
+  fi
+}
--- a/packaging/pytorch3d/conda_build_config.yaml
+++ b/packaging/pytorch3d/conda_build_config.yaml
@ -0,0 +1,24 @@
+# Build-variant configuration for the pytorch3d conda recipe.
+# NOTE(review): the trailing "# [win]" / "# [x86_64]" markers look like
+# conda-build line selectors restricting an entry to that platform - confirm.
+blas_impl:
+  - mkl                        # [x86_64]
+c_compiler:
+  - vs2017                     # [win]
+cxx_compiler:
+  - vs2017                     # [win]
+python:
+  - 3.5
+  - 3.6
+# This differs from target_platform in that it determines what subdir the compiler
+#    will target, not what subdir the compiler package will be itself.
+#    For example, we need a win-64 vs2008_win-32 package, so that we compile win-32
+#    code on win-64 miniconda.
+cross_compiler_target_platform:
+  - win-64                     # [win]
+target_platform:
+  - win-64                     # [win]
+vc:
+  - 14
+# vc, c_compiler and cxx_compiler are listed as one zip_keys group.
+zip_keys:
+  -                             # [win]
+    - vc                        # [win]
+    - c_compiler                # [win]
+    - cxx_compiler              # [win]
--- a/packaging/pytorch3d/meta.yaml
+++ b/packaging/pytorch3d/meta.yaml
@ -0,0 +1,60 @@
+# conda recipe for pytorch3d.  Several values are taken from environment
+# variables at render time: BUILD_VERSION, SOURCE_ROOT_DIR, CU_VERSION and the
+# CONDA_* constraint variables used below.
+package:
+  name: pytorch3d
+  version: "{{ environ.get('BUILD_VERSION') }}"
+
+source:
+ path: "{{ environ.get('SOURCE_ROOT_DIR') }}"
+
+requirements:
+  build:
+    - {{ compiler('c') }} # [win]
+
+  host:
+    - python
+    - setuptools
+    # The entries below are substituted verbatim, so each variable is expected
+    # to hold either an empty string or a complete "- <spec>" list item.
+    {{ environ.get('CONDA_PYTORCH_BUILD_CONSTRAINT') }}
+    {{ environ.get('CONDA_CUDATOOLKIT_CONSTRAINT') }}
+    {{ environ.get('CONDA_CPUONLY_FEATURE') }}
+
+  run:
+    - python
+    - numpy >=1.11
+    - six
+    - torchvision >=0.5
+    - fvcore
+    # Substituted verbatim, same convention as in the host section.
+    {{ environ.get('CONDA_PYTORCH_CONSTRAINT') }}
+    {{ environ.get('CONDA_CUDATOOLKIT_CONSTRAINT') }}
+
+build:
+  # Build string combines the Python tag with CU_VERSION; environ[...] (unlike
+  # environ.get) presumably fails the render when CU_VERSION is unset - confirm.
+  string: py{{py}}_{{ environ['CU_VERSION'] }}
+  script: python setup.py install --single-version-externally-managed --record=record.txt # [not win]
+  # Environment variables forwarded into the build script.
+  script_env:
+    - CUDA_HOME
+    - FORCE_CUDA
+    - NVCC_FLAGS
+  features:
+    {{ environ.get('CONDA_CPUONLY_FEATURE') }}
+
+test:
+  imports:
+    - pytorch3d
+  source_files:
+    - tests
+    - docs
+  requires:
+    - pytest
+    - scipy
+    - mock
+    - av
+    - ca-certificates
+    - typing
+  # The pytest invocation is commented out; the suite runs through unittest
+  # discovery over the tests directory.
+  commands:
+    #pytest .
+    python -m unittest discover -v -s tests
+
+
+about:
+  home: https://github.com/facebookresearch/pytorch3d
+  license: BSD
+  license_file: LICENSE
+  summary: '3d Geometry for pytorch'
--- a/packaging/vs2017/activate.bat
+++ b/packaging/vs2017/activate.bat
@ -0,0 +1,45 @@
+:: Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+:: Activation script for the vs2017 compiler package.
+:: Set env vars that tell distutils to use the compiler that we put on path
+SET DISTUTILS_USE_SDK=1
+SET MSSdk=1
+
+SET "VS_VERSION=15.0"
+SET "VS_MAJOR=15"
+SET "VS_YEAR=2017"
+
+:: NOTE(review): presumably these stop MSYS2 from rewriting the listed
+:: compiler switches and the CL variable as POSIX paths - confirm.
+set "MSYS2_ARG_CONV_EXCL=/AI;/AL;/OUT;/out"
+set "MSYS2_ENV_CONV_EXCL=CL"
+
+:: For Python 3.5+, ensure that we link with the dynamic runtime.  See
+:: http://stevedower.id.au/blog/building-for-python-3-5-part-two/ for more info
+set "PY_VCRUNTIME_REDIST=%PREFIX%\\bin\\vcruntime140.dll"
+
+:: Ask vswhere for installations in the version range [15,16) and store the
+:: first installation path that contains VC\Auxiliary\Build\vcvarsall.bat in
+:: VSINSTALLDIR (with a trailing backslash), then jump past the loop.
+:: If no installation qualifies, VSINSTALLDIR is left as it was.
+for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [15^,16^) -property installationPath`) do (
+    if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" (
+        set "VSINSTALLDIR=%%i\"
+        goto :vswhere
+    )
+)
+
+:vswhere
+
+:: Shorten PATH to avoid the `input line too long` error.
+:: PATH is split on semicolons; every entry that exists is replaced by its
+:: short-name form and entries that do not exist are dropped.
+SET MyPath=%PATH%
+
+setlocal EnableDelayedExpansion
+
+SET TempPath="%MyPath:;=";"%"
+SET var=
+FOR %%a IN (%TempPath%) DO (
+    IF EXIST %%~sa (
+        SET "var=!var!;%%~sa"
+    )
+)
+
+:: Drop the leading semicolon, then carry the result across endlocal.
+set "TempPath=!var:~1!"
+endlocal & set "PATH=%TempPath%"
+
+:: Shorten current directory too
+FOR %%A IN (.) DO CD "%%~sA"
+
+:: other things added by install_activate.bat at package build time
--- a/packaging/vs2017/conda_build_config.yaml
+++ b/packaging/vs2017/conda_build_config.yaml
@ -0,0 +1,24 @@
+# Build-variant configuration shipped with the vs2017 toolchain recipe.
+# NOTE(review): the trailing "# [win]" / "# [x86_64]" markers look like
+# conda-build line selectors restricting an entry to that platform - confirm.
+blas_impl:
+  - mkl                        # [x86_64]
+c_compiler:
+  - vs2017                     # [win]
+cxx_compiler:
+  - vs2017                     # [win]
+python:
+  - 3.5
+  - 3.6
+# This differs from target_platform in that it determines what subdir the compiler
+#    will target, not what subdir the compiler package will be itself.
+#    For example, we need a win-64 vs2008_win-32 package, so that we compile win-32
+#    code on win-64 miniconda.
+cross_compiler_target_platform:
+  - win-64                     # [win]
+target_platform:
+  - win-64                     # [win]
+vc:
+  - 14
+# vc, c_compiler and cxx_compiler are listed as one zip_keys group.
+zip_keys:
+  -                             # [win]
+    - vc                        # [win]
+    - c_compiler                # [win]
+    - cxx_compiler              # [win]
--- a/packaging/vs2017/install_activate.bat
+++ b/packaging/vs2017/install_activate.bat
@ -0,0 +1,30 @@
+:: Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+:: Installs activate.bat into the environment's activate.d folder as
+:: vs2017_compiler_vars.bat and appends target-specific commands to it:
+:: a CMAKE_GENERATOR setting followed by pushd / CALL / popd sequences that
+:: run the Visual Studio environment scripts relative to VSINSTALLDIR.
+:: When VSDEVCMD_ARGS is non-empty it is appended to each vcvarsall.bat call.
+:: Every pushd written to the generated script is paired with a popd that is
+:: written to the same script.
+set YEAR=2017
+set VER=15
+
+mkdir "%PREFIX%\etc\conda\activate.d"
+COPY "%RECIPE_DIR%\activate.bat" "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
+
+IF "%cross_compiler_target_platform%" == "win-64" (
+  set "target_platform=amd64"
+  echo SET "CMAKE_GENERATOR=Visual Studio %VER% %YEAR% Win64" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
+  echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
+  IF "%VSDEVCMD_ARGS%" == "" (
+    echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x64 >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
+    echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
+    echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
+    echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x86_amd64 >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
+  ) ELSE (
+    echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x64 %VSDEVCMD_ARGS% >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
+    echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
+    echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
+    echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x86_amd64 %VSDEVCMD_ARGS% >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
+  )
+  echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
+  ) else (
+  set "target_platform=x86"
+  echo SET "CMAKE_GENERATOR=Visual Studio %VER% %YEAR%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
+  echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
+  echo CALL "VC\Auxiliary\Build\vcvars32.bat" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
+  echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
+  )
--- a/packaging/vs2017/install_runtime.bat
+++ b/packaging/vs2017/install_runtime.bat
@ -0,0 +1,50 @@
+:: Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+:: Copies the UCRT DLLs from the Windows 10 SDK and the Visual Studio 2017
+:: CRT / OpenMP redistributable DLLs into both LIBRARY_BIN and PREFIX.
+:: ARCH selects the x64 or x86 flavour of the DLLs.
+:: robocopy exit codes below 8 mean success; 8 and above mean failure.
+set VC_PATH=x86
+if "%ARCH%"=="64" (
+   set VC_PATH=x64
+)
+
+set MSC_VER=2017
+
+rem :: This should always be present for VC installed with VS.  Not sure about VC installed with Visual C++ Build Tools 2015
+rem FOR /F "usebackq tokens=3*" %%A IN (`REG QUERY "HKEY_LOCAL_MACHINE\Software\Microsoft\DevDiv\VC\Servicing\14.0\IDE.x64" /v UpdateVersion`) DO (
+rem     set SP=%%A
+rem     )
+
+rem if not "%SP%" == "%PKG_VERSION%" (
+rem    echo "Version detected from registry: %SP%"
+rem    echo    "does not match version of package being built (%PKG_VERSION%)"
+rem    echo "Do you have current updates for VS 2015 installed?"
+rem    exit 1
+rem )
+
+
+REM ========== REQUIRES Win 10 SDK be installed, or files otherwise copied to location below!
+robocopy "C:\Program Files (x86)\Windows Kits\10\Redist\ucrt\DLLs\%VC_PATH%"  "%LIBRARY_BIN%" *.dll /E
+robocopy "C:\Program Files (x86)\Windows Kits\10\Redist\ucrt\DLLs\%VC_PATH%"  "%PREFIX%" *.dll /E
+if %ERRORLEVEL% GEQ 8 exit 1
+
+REM ========== This one comes from visual studio 2017
+set "VC_VER=141"
+
+REM Locate vcvarsall.bat of the first matching installation. Jump to a label
+REM placed after the loop so the rest of the script still runs once it is found.
+set "VS15VCVARSALL="
+for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [15^,16^) -property installationPath`) do (
+    if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" (
+        set "VS15VCVARSALL=%%i\VC\Auxiliary\Build\vcvarsall.bat"
+        goto :vcvars_found
+    )
+)
+
+:vcvars_found
+if not defined VS15VCVARSALL (
+    echo Could not locate vcvarsall.bat for Visual Studio 2017
+    exit 1
+)
+
+@setlocal
+REM vcvarsall.bat is expected to define VCToolsRedistDir, used right below.
+call "%VS15VCVARSALL%" x64
+
+set "REDIST_ROOT=%VCToolsRedistDir%%VC_PATH%"
+
+REM Every copy must run; abort only when robocopy reports a real failure.
+robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.CRT" "%LIBRARY_BIN%" *.dll /E
+if %ERRORLEVEL% GEQ 8 exit 1
+robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.CRT" "%PREFIX%" *.dll /E
+if %ERRORLEVEL% GEQ 8 exit 1
+robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.OpenMP" "%LIBRARY_BIN%" *.dll /E
+if %ERRORLEVEL% GEQ 8 exit 1
+robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.OpenMP" "%PREFIX%" *.dll /E
+if %ERRORLEVEL% GEQ 8 exit 1
+@endlocal
+REM robocopy reports success with non-zero codes, so finish with an explicit 0.
+exit 0
--- a/packaging/vs2017/meta.yaml
+++ b/packaging/vs2017/meta.yaml
@ -0,0 +1,45 @@
+# conda recipe producing three outputs: the vs2017 compiler activation
+# package, the vs2017 runtime package and the "vc" feature-tracking package.
+{% set vcver="14.1" %}
+{% set vcfeature="14" %}
+{% set vsyear="2017" %}
+{% set fullver="15.4.27004.2010" %}
+
+package:
+  name: vs{{ vsyear }}
+  version: {{ fullver }}
+
+build:
+  skip: True  [not win]
+  # VSDEVCMD_ARGS is forwarded into the build scripts.
+  script_env:
+    - VSDEVCMD_ARGS # [win]
+
+outputs:
+  # Activation package, named after cross_compiler_target_platform.
+  - name: vs{{ vsyear }}_{{ cross_compiler_target_platform }}
+    script: install_activate.bat
+    track_features:
+      # VS 2017 is binary-compatible with VS 2015/vc14.  Tools are "v141".
+      strong:
+        - vc{{ vcfeature }}
+    run_exports:
+      - vc {{ vcver }}
+    about:
+      summary: Activation and version verification of MSVC {{ vcver }} (VS {{ vsyear }}) compiler
+      license: BSD 3-clause
+  # Runtime package, populated by install_runtime.bat.
+  - name: vs{{ vsyear }}_runtime
+    script: install_runtime.bat
+  # Meta-package that depends on the runtime package of this same build.
+  - name: vc
+    version: {{ vcver }}
+    track_features:
+      - vc{{ vcfeature }}
+    requirements:
+      run:
+        - {{ pin_subpackage('vs' ~ vsyear ~ '_runtime') }}
+    about:
+      home: https://github.com/conda/conda/wiki/VC-features
+      license: Modified BSD License (3-clause)
+      license_family: BSD
+      summary: A meta-package to track VC features.
+      description: |
+          This metapackage is used to activate vc features without
+          depending on Python.
+      doc_url: https://github.com/conda/conda/wiki/VC-features
+      dev_url: https://github.com/conda/conda/wiki/VC-features
--- a/packaging/vs2019/activate.bat
+++ b/packaging/vs2019/activate.bat
@ -0,0 +1,45 @@
+:: Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+:: Activation script for the vs2019 compiler package.
+:: Set env vars that tell distutils to use the compiler that we put on path
+SET DISTUTILS_USE_SDK=1
+SET MSSdk=1
+
+SET "VS_VERSION=16.0"
+SET "VS_MAJOR=16"
+SET "VS_YEAR=2019"
+
+:: NOTE(review): presumably these stop MSYS2 from rewriting the listed
+:: compiler switches and the CL variable as POSIX paths - confirm.
+set "MSYS2_ARG_CONV_EXCL=/AI;/AL;/OUT;/out"
+set "MSYS2_ENV_CONV_EXCL=CL"
+
+:: For Python 3.5+, ensure that we link with the dynamic runtime.  See
+:: http://stevedower.id.au/blog/building-for-python-3-5-part-two/ for more info
+set "PY_VCRUNTIME_REDIST=%PREFIX%\\bin\\vcruntime140.dll"
+
+:: Ask vswhere for installations in the version range [16,17) and store the
+:: first installation path that contains VC\Auxiliary\Build\vcvarsall.bat in
+:: VSINSTALLDIR (with a trailing backslash), then jump past the loop.
+:: If no installation qualifies, VSINSTALLDIR is left as it was.
+for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [16^,17^) -property installationPath`) do (
+    if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" (
+        set "VSINSTALLDIR=%%i\"
+        goto :vswhere
+    )
+)
+
+:vswhere
+
+:: Shorten PATH to avoid the `input line too long` error.
+:: PATH is split on semicolons; every entry that exists is replaced by its
+:: short-name form and entries that do not exist are dropped.
+SET MyPath=%PATH%
+
+setlocal EnableDelayedExpansion
+
+SET TempPath="%MyPath:;=";"%"
+SET var=
+FOR %%a IN (%TempPath%) DO (
+    IF EXIST %%~sa (
+        SET "var=!var!;%%~sa"
+    )
+)
+
+:: Drop the leading semicolon, then carry the result across endlocal.
+set "TempPath=!var:~1!"
+endlocal & set "PATH=%TempPath%"
+
+:: Shorten current directory too
+FOR %%A IN (.) DO CD "%%~sA"
+
+:: other things added by install_activate.bat at package build time
--- a/packaging/vs2019/conda_build_config.yaml
+++ b/packaging/vs2019/conda_build_config.yaml
@ -0,0 +1,24 @@
+# Build-variant configuration shipped with the vs2019 toolchain recipe.
+# NOTE(review): the trailing "# [win]" / "# [x86_64]" markers look like
+# conda-build line selectors restricting an entry to that platform - confirm.
+blas_impl:
+  - mkl                        # [x86_64]
+c_compiler:
+  - vs2019                     # [win]
+cxx_compiler:
+  - vs2019                     # [win]
+python:
+  - 3.5
+  - 3.6
+# This differs from target_platform in that it determines what subdir the compiler
+#    will target, not what subdir the compiler package will be itself.
+#    For example, we need a win-64 vs2008_win-32 package, so that we compile win-32
+#    code on win-64 miniconda.
+cross_compiler_target_platform:
+  - win-64                     # [win]
+target_platform:
+  - win-64                     # [win]
+vc:
+  - 14
+# vc, c_compiler and cxx_compiler are listed as one zip_keys group.
+zip_keys:
+  -                             # [win]
+    - vc                        # [win]
+    - c_compiler                # [win]
+    - cxx_compiler              # [win]
--- a/packaging/vs2019/install_activate.bat
+++ b/packaging/vs2019/install_activate.bat
@ -0,0 +1,30 @@
+:: Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+:: Installs activate.bat into the environment's activate.d folder as
+:: vs2019_compiler_vars.bat and appends target-specific commands to it:
+:: a CMAKE_GENERATOR setting followed by pushd / CALL / popd sequences that
+:: run the Visual Studio environment scripts relative to VSINSTALLDIR.
+:: When VSDEVCMD_ARGS is non-empty it is appended to each vcvarsall.bat call.
+:: Every pushd written to the generated script is paired with a popd that is
+:: written to the same script.
+set YEAR=2019
+set VER=16
+
+mkdir "%PREFIX%\etc\conda\activate.d"
+COPY "%RECIPE_DIR%\activate.bat" "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
+
+IF "%cross_compiler_target_platform%" == "win-64" (
+  set "target_platform=amd64"
+  echo SET "CMAKE_GENERATOR=Visual Studio %VER% %YEAR% Win64" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
+  echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
+  IF "%VSDEVCMD_ARGS%" == "" (
+    echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x64 >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
+    echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
+    echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
+    echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x86_amd64 >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
+  ) ELSE (
+    echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x64 %VSDEVCMD_ARGS% >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
+    echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
+    echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
+    echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x86_amd64 %VSDEVCMD_ARGS% >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
+  )
+  echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
+  ) else (
+  set "target_platform=x86"
+  echo SET "CMAKE_GENERATOR=Visual Studio %VER% %YEAR%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
+  echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
+  echo CALL "VC\Auxiliary\Build\vcvars32.bat" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
+  echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
+  )
--- a/packaging/vs2019/install_runtime.bat
+++ b/packaging/vs2019/install_runtime.bat
@ -0,0 +1,50 @@
+:: Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+:: Copies the UCRT DLLs from the Windows 10 SDK and the Visual Studio 2019
+:: CRT / OpenMP redistributable DLLs into both LIBRARY_BIN and PREFIX.
+:: ARCH selects the x64 or x86 flavour of the DLLs.
+:: robocopy exit codes below 8 mean success; 8 and above mean failure.
+set VC_PATH=x86
+if "%ARCH%"=="64" (
+   set VC_PATH=x64
+)
+
+set MSC_VER=2019
+
+rem :: This should always be present for VC installed with VS.  Not sure about VC installed with Visual C++ Build Tools 2015
+rem FOR /F "usebackq tokens=3*" %%A IN (`REG QUERY "HKEY_LOCAL_MACHINE\Software\Microsoft\DevDiv\VC\Servicing\14.0\IDE.x64" /v UpdateVersion`) DO (
+rem     set SP=%%A
+rem     )
+
+rem if not "%SP%" == "%PKG_VERSION%" (
+rem    echo "Version detected from registry: %SP%"
+rem    echo    "does not match version of package being built (%PKG_VERSION%)"
+rem    echo "Do you have current updates for VS 2015 installed?"
+rem    exit 1
+rem )
+
+
+REM ========== REQUIRES Win 10 SDK be installed, or files otherwise copied to location below!
+robocopy "C:\Program Files (x86)\Windows Kits\10\Redist\ucrt\DLLs\%VC_PATH%"  "%LIBRARY_BIN%" *.dll /E
+robocopy "C:\Program Files (x86)\Windows Kits\10\Redist\ucrt\DLLs\%VC_PATH%"  "%PREFIX%" *.dll /E
+if %ERRORLEVEL% GEQ 8 exit 1
+
+REM ========== This one comes from visual studio 2019
+set "VC_VER=142"
+
+REM Locate vcvarsall.bat of the first matching installation. Jump to a label
+REM placed after the loop so the rest of the script still runs once it is found.
+set "VS15VCVARSALL="
+for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [16^,17^) -property installationPath`) do (
+    if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" (
+        set "VS15VCVARSALL=%%i\VC\Auxiliary\Build\vcvarsall.bat"
+        goto :vcvars_found
+    )
+)
+
+:vcvars_found
+if not defined VS15VCVARSALL (
+    echo Could not locate vcvarsall.bat for Visual Studio 2019
+    exit 1
+)
+
+@setlocal
+REM vcvarsall.bat is expected to define VCToolsRedistDir, used right below.
+call "%VS15VCVARSALL%" x64
+
+set "REDIST_ROOT=%VCToolsRedistDir%%VC_PATH%"
+
+REM Every copy must run; abort only when robocopy reports a real failure.
+robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.CRT" "%LIBRARY_BIN%" *.dll /E
+if %ERRORLEVEL% GEQ 8 exit 1
+robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.CRT" "%PREFIX%" *.dll /E
+if %ERRORLEVEL% GEQ 8 exit 1
+robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.OpenMP" "%LIBRARY_BIN%" *.dll /E
+if %ERRORLEVEL% GEQ 8 exit 1
+robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.OpenMP" "%PREFIX%" *.dll /E
+if %ERRORLEVEL% GEQ 8 exit 1
+@endlocal
+REM robocopy reports success with non-zero codes, so finish with an explicit 0.
+exit 0
--- a/packaging/vs2019/meta.yaml
+++ b/packaging/vs2019/meta.yaml
@ -0,0 +1,45 @@
+# conda recipe producing three outputs: the vs2019 compiler activation
+# package, the vs2019 runtime package and the "vc" feature-tracking package.
+# NOTE(review): fullver below is a 15.x number although this recipe targets
+# VS 2019 (major version 16 elsewhere in this package) - confirm it is intended.
+{% set vcver="14.2" %}
+{% set vcfeature="14" %}
+{% set vsyear="2019" %}
+{% set fullver="15.4.27004.2010" %}
+
+package:
+  name: vs{{ vsyear }}
+  version: {{ fullver }}
+
+build:
+  skip: True  [not win]
+  # VSDEVCMD_ARGS is forwarded into the build scripts.
+  script_env:
+    - VSDEVCMD_ARGS # [win]
+
+outputs:
+  # Activation package, named after cross_compiler_target_platform.
+  - name: vs{{ vsyear }}_{{ cross_compiler_target_platform }}
+    script: install_activate.bat
+    track_features:
+      # VS 2019 is binary-compatible with VS 2017/vc 14.1 and 2015/vc14.  Tools are "v142".
+      strong:
+        - vc{{ vcfeature }}
+    run_exports:
+      - vc {{ vcver }}
+    about:
+      summary: Activation and version verification of MSVC {{ vcver }} (VS {{ vsyear }}) compiler
+      license: BSD 3-clause
+  # Runtime package, populated by install_runtime.bat.
+  - name: vs{{ vsyear }}_runtime
+    script: install_runtime.bat
+  # Meta-package that depends on the runtime package of this same build.
+  - name: vc
+    version: {{ vcver }}
+    track_features:
+      - vc{{ vcfeature }}
+    requirements:
+      run:
+        - {{ pin_subpackage('vs' ~ vsyear ~ '_runtime') }}
+    about:
+      home: https://github.com/conda/conda/wiki/VC-features
+      license: Modified BSD License (3-clause)
+      license_family: BSD
+      summary: A meta-package to track VC features.
+      description: |
+          This metapackage is used to activate vc features without
+          depending on Python.
+      doc_url: https://github.com/conda/conda/wiki/VC-features
+      dev_url: https://github.com/conda/conda/wiki/VC-features
--- a/pytorch3d/init.py
+++ b/pytorch3d/init.py
@ -0,0 +1,3 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+__version__ = "0.1"
--- a/pytorch3d/csrc/ext.cpp
+++ b/pytorch3d/csrc/ext.cpp
@ -0,0 +1,27 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+
+#include <torch/extension.h>
+#include "face_areas_normals/face_areas_normals.h"
+#include "gather_scatter/gather_scatter.h"
+#include "nearest_neighbor_points/nearest_neighbor_points.h"
+#include "packed_to_padded_tensor/packed_to_padded_tensor.h"
+#include "rasterize_meshes/rasterize_meshes.h"
+#include "rasterize_points/rasterize_points.h"
+
+// Python bindings: every operator below is registered on the extension module
+// under the given Python-visible name.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  // Mesh / point-cloud utilities.
+  m.def("face_areas_normals", &face_areas_normals);
+  m.def("gather_scatter", &gather_scatter);
+  m.def("nn_points_idx", &nn_points_idx);
+  m.def("packed_to_padded_tensor", &packed_to_padded_tensor);
+
+  // Point rasterization.
+  m.def("rasterize_points", &RasterizePoints);
+  m.def("rasterize_points_backward", &RasterizePointsBackward);
+
+  // Mesh rasterization.
+  m.def("rasterize_meshes", &RasterizeMeshes);
+  m.def("rasterize_meshes_backward", &RasterizeMeshesBackward);
+
+  // Underscore-prefixed entry points exist for the test-suite only; users
+  // should not call them directly.
+  m.def("_rasterize_points_naive", &RasterizePointsNaive);
+  m.def("_rasterize_points_coarse", &RasterizePointsCoarse);
+  m.def("_rasterize_meshes_naive", &RasterizeMeshesNaive);
+  m.def("_rasterize_meshes_coarse", &RasterizeMeshesCoarse);
+  m.def("_rasterize_meshes_fine", &RasterizeMeshesFine);
+}
--- a/pytorch3d/csrc/face_areas_normals/face_areas_normals.cu
+++ b/pytorch3d/csrc/face_areas_normals/face_areas_normals.cu
@ -0,0 +1,80 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+
+#include <ATen/ATen.h>
+#include <tuple>
+
+// Computes the area and the unit normal of every face.
+//
+// Args:
+//    verts: vertex coordinates, read as 3 consecutive values per vertex.
+//    faces: vertex indices, read as 3 consecutive 64-bit indices per face.
+//    face_areas: output, one value per face.
+//    face_normals: output, 3 consecutive values per face.
+//    V: number of vertices (not used by the kernel body).
+//    F: number of faces.
+//
+// Indices are typed as int64_t rather than long, because long is not 64 bits
+// wide on every platform while the index tensor holds 64-bit integers.
+template <typename scalar_t>
+__global__ void face_areas_kernel(
+    const scalar_t* __restrict__ verts,
+    const int64_t* __restrict__ faces,
+    scalar_t* __restrict__ face_areas,
+    scalar_t* __restrict__ face_normals,
+    const size_t V,
+    const size_t F) {
+  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const size_t stride = gridDim.x * blockDim.x;
+
+  // Faces split evenly over the number of threads in the grid.
+  // Each thread computes the area & normal of its respective faces and writes
+  // them to the output tensors.
+  for (size_t f = tid; f < F; f += stride) {
+    const int64_t i0 = faces[3 * f + 0];
+    const int64_t i1 = faces[3 * f + 1];
+    const int64_t i2 = faces[3 * f + 2];
+
+    const scalar_t v0_x = verts[3 * i0 + 0];
+    const scalar_t v0_y = verts[3 * i0 + 1];
+    const scalar_t v0_z = verts[3 * i0 + 2];
+
+    const scalar_t v1_x = verts[3 * i1 + 0];
+    const scalar_t v1_y = verts[3 * i1 + 1];
+    const scalar_t v1_z = verts[3 * i1 + 2];
+
+    const scalar_t v2_x = verts[3 * i2 + 0];
+    const scalar_t v2_y = verts[3 * i2 + 1];
+    const scalar_t v2_z = verts[3 * i2 + 2];
+
+    // Edge vectors a = v1 - v0 and b = v2 - v0.
+    const scalar_t ax = v1_x - v0_x;
+    const scalar_t ay = v1_y - v0_y;
+    const scalar_t az = v1_z - v0_z;
+
+    const scalar_t bx = v2_x - v0_x;
+    const scalar_t by = v2_y - v0_y;
+    const scalar_t bz = v2_z - v0_z;
+
+    // c = a x b; its length is twice the triangle area.
+    const scalar_t cx = ay * bz - az * by;
+    const scalar_t cy = az * bx - ax * bz;
+    const scalar_t cz = ax * by - ay * bx;
+
+    scalar_t norm = sqrt(cx * cx + cy * cy + cz * cz);
+    face_areas[f] = norm / 2.0;
+    // Clamp before dividing so degenerate faces do not divide by zero.
+    norm = (norm < 1e-6) ? 1e-6 : norm; // max(norm, 1e-6)
+    face_normals[3 * f + 0] = cx / norm;
+    face_normals[3 * f + 1] = cy / norm;
+    face_normals[3 * f + 2] = cz / norm;
+  }
+}
+
+// Host entry point: allocates the outputs and launches face_areas_kernel.
+//
+// Args:
+//    verts: floating point tensor of shape (V, 3).
+//    faces: int64 tensor of shape (F, 3).
+//
+// Returns:
+//    (areas, normals) with shapes (F,) and (F, 3), created with the options
+//    of verts.
+std::tuple<at::Tensor, at::Tensor> face_areas_cuda(
+    at::Tensor verts,
+    at::Tensor faces) {
+  // The kernel indexes the raw pointers as dense row-major buffers, so make
+  // sure strided views are materialised first.
+  verts = verts.contiguous();
+  faces = faces.contiguous();
+
+  const auto V = verts.size(0);
+  const auto F = faces.size(0);
+
+  at::Tensor areas = at::empty({F}, verts.options());
+  at::Tensor normals = at::empty({F, 3}, verts.options());
+
+  const int blocks = 64;
+  const int threads = 512;
+  AT_DISPATCH_FLOATING_TYPES(verts.type(), "face_areas_kernel", ([&] {
+                               face_areas_kernel<scalar_t><<<blocks, threads>>>(
+                                   verts.data_ptr<scalar_t>(),
+                                   faces.data_ptr<int64_t>(),
+                                   areas.data_ptr<scalar_t>(),
+                                   normals.data_ptr<scalar_t>(),
+                                   V,
+                                   F);
+                             }));
+
+  return std::make_tuple(areas, normals);
+}
--- a/pytorch3d/csrc/face_areas_normals/face_areas_normals.h
+++ b/pytorch3d/csrc/face_areas_normals/face_areas_normals.h
@ -0,0 +1,36 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+
+#pragma once
+#include <torch/extension.h>
+#include <tuple>
+
+// Compute areas and normals of mesh faces using packed representation.
+//
+// Inputs:
+//    verts: FloatTensor of shape (V, 3) giving vertex positions.
+//    faces: LongTensor of shape (F, 3) giving faces.
+//
+// Returns:
+//    areas: FloatTensor of shape (F,) where areas[f] is the area of faces[f].
+//    normals: FloatTensor of shape (F, 3) where normals[f] is the normal of
+//    faces[f]
+//
+
+// Cuda implementation.
+std::tuple<at::Tensor, at::Tensor> face_areas_cuda(
+    at::Tensor verts,
+    at::Tensor faces);
+
+// Implementation which is exposed.
+// Forwards to face_areas_cuda when both tensors are CUDA tensors. Raises an
+// error if the extension was built without WITH_CUDA, or if either tensor is
+// not a CUDA tensor (there is no CPU implementation).
+// Marked `inline` because the definition lives in a header; without it,
+// including this header from more than one translation unit produces
+// duplicate-symbol errors at link time.
+inline std::tuple<at::Tensor, at::Tensor> face_areas_normals(
+    at::Tensor verts,
+    at::Tensor faces) {
+  if (verts.is_cuda() && faces.is_cuda()) {
+#ifdef WITH_CUDA
+    return face_areas_cuda(verts, faces);
+#else
+    AT_ERROR("Not compiled with GPU support.");
+#endif
+  }
+  AT_ERROR("Not implemented on the CPU.");
+}
--- a/pytorch3d/csrc/gather_scatter/gather_scatter.cu
+++ b/pytorch3d/csrc/gather_scatter/gather_scatter.cu
@ -0,0 +1,69 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+
+#include <ATen/ATen.h>
+
+// TODO(T47953967) to make this cuda kernel support all datatypes.
+//
+// For every edge (v0, v1), adds the feature row of v1 to the output row of
+// v0. For undirected graphs the row of v0 is also added to the row of v1.
+//
+// Args:
+//   input: float features, indexed as input[v * D + d].
+//   edges: vertex index pairs, indexed as edges[2 * e + {0, 1}].
+//   output: float buffer indexed like input. Values are only ever added to it
+//           (via atomicAdd), so the caller must initialise it.
+//   directed: if false, every edge also contributes in the reverse direction.
+//   backward: if true, the two endpoints of every edge are swapped.
+//   V: number of vertices (not read by the kernel).
+//   D: feature dimension.
+//   E: number of edges.
+__global__ void gather_scatter_kernel(
+    const float* __restrict__ input,
+    const long* __restrict__ edges,
+    float* __restrict__ output,
+    bool directed,
+    bool backward,
+    const size_t V,
+    const size_t D,
+    const size_t E) {
+  const size_t tid = threadIdx.x;
+
+  // Reverse the vertex order if backward.
+  const size_t v0_idx = backward ? 1 : 0;
+  const size_t v1_idx = backward ? 0 : 1;
+
+  // Edges are split evenly across the blocks. The counters are size_t so that
+  // they match the type of E and D and so that `2 * e` cannot overflow a
+  // 32-bit int for large edge counts.
+  for (size_t e = blockIdx.x; e < E; e += gridDim.x) {
+    // Get indices of vertices which form the edge.
+    const long v0 = edges[2 * e + v0_idx];
+    const long v1 = edges[2 * e + v1_idx];
+
+    // Split vertex features evenly across threads.
+    // This implementation will be quite wasteful when D<128 since there will be
+    // a lot of threads doing nothing.
+    for (size_t d = tid; d < D; d += blockDim.x) {
+      const float forward_val = input[v1 * D + d];
+      atomicAdd(output + v0 * D + d, forward_val);
+      if (!directed) {
+        const float reverse_val = input[v0 * D + d];
+        atomicAdd(output + v1 * D + d, reverse_val);
+      }
+    }
+    // NOTE(review): this kernel declares no shared memory, so this barrier
+    // looks unnecessary. It is kept as in the original; confirm before
+    // removing.
+    __syncthreads();
+  }
+}
+
+// Host-side launcher for gather_scatter_kernel.
+//
+// Args:
+//   input: tensor of shape (num_vertices, feature_dim); read as float.
+//   edges: tensor whose first dimension is the number of edges; read as long.
+//   directed, backward: forwarded unchanged to the kernel.
+//
+// Returns:
+//   A zero-initialised tensor with the same shape and options as `input`
+//   into which the kernel accumulates its results.
+at::Tensor gather_scatter_cuda(
+    const at::Tensor input,
+    const at::Tensor edges,
+    bool directed,
+    bool backward) {
+  const auto num_vertices = input.size(0);
+  const auto input_feature_dim = input.size(1);
+  const auto num_edges = edges.size(0);
+
+  auto output = at::zeros({num_vertices, input_feature_dim}, input.options());
+
+  // With no edges there is nothing to accumulate. Launching with zero blocks
+  // is an invalid launch configuration, so return the zeros directly.
+  if (num_edges == 0) {
+    return output;
+  }
+
+  const size_t threads = 128;
+  const size_t max_blocks = 1920;
+  const size_t edge_count = static_cast<size_t>(num_edges);
+  const size_t blocks = edge_count < max_blocks ? edge_count : max_blocks;
+
+  gather_scatter_kernel<<<blocks, threads>>>(
+      input.data_ptr<float>(),
+      edges.data_ptr<long>(),
+      output.data_ptr<float>(),
+      directed,
+      backward,
+      num_vertices,
+      input_feature_dim,
+      num_edges);
+
+  return output;
+}
--- a/pytorch3d/csrc/gather_scatter/gather_scatter.h
+++ b/pytorch3d/csrc/gather_scatter/gather_scatter.h
@ -0,0 +1,43 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+
+#pragma once
+#include <torch/extension.h>
+
+// Fused gather scatter operation for aggregating features of neighbor nodes
+// in a graph. This gather scatter operation is specific to graphs as edge
+// indices are used as input.
+//
+// Args:
+//   input: float32 Tensor of shape (V, D) where V is the number of vertices
+//          and D is the feature dimension.
+//   edges: int64 Tensor of shape (E, 2) giving the indices of the vertices that
+//          make up the edge. E is the number of edges.
+//  directed: Bool indicating if edges in the graph are directed. For a
+//            directed graph v0 -> v1 the updated feature for v0 depends on v1.
+//  backward: Bool indicating if the operation is the backward pass.
+//
+// Returns:
+//   output: float32 Tensor of same shape as input.
+
+// Cuda implementation.
+at::Tensor gather_scatter_cuda(
+    const at::Tensor input,
+    const at::Tensor edges,
+    bool directed,
+    bool backward);
+
+// Exposed implementation.
+// Forwards to gather_scatter_cuda when both tensors are CUDA tensors. Raises
+// an error if the extension was built without WITH_CUDA, or if either tensor
+// is not a CUDA tensor (there is no CPU implementation).
+// Marked `inline` because the definition lives in a header; without it,
+// including this header from more than one translation unit produces
+// duplicate-symbol errors at link time.
+inline at::Tensor gather_scatter(
+    const at::Tensor input,
+    const at::Tensor edges,
+    bool directed,
+    bool backward) {
+  if (input.is_cuda() && edges.is_cuda()) {
+#ifdef WITH_CUDA
+    return gather_scatter_cuda(input, edges, directed, backward);
+#else
+    AT_ERROR("Not compiled with GPU support.");
+#endif
+  }
+  AT_ERROR("Not implemented on the CPU");
+}
--- a/pytorch3d/csrc/nearest_neighbor_points/nearest_neighbor_points.cu
+++ b/pytorch3d/csrc/nearest_neighbor_points/nearest_neighbor_points.cu
@ -0,0 +1,265 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+
+#include <ATen/ATen.h>
+#include <float.h>
+
+// Final stage of an argmin reduction over two parallel arrays.
+//
+// For s = 32, 16, 8, 4, 2, 1 the calling thread compares slot `tid` with slot
+// `tid + s` and keeps the smaller distance together with its index. After the
+// last step slot 0 holds the minimum over slots [0, 64).
+//
+// Preconditions:
+//   * slots [0, 64) of both arrays hold valid candidates;
+//   * the function is called by all 32 threads of one warp (tid in [0, 32)),
+//     because __syncwarp() is used with its default full-warp mask.
+//
+// Each step reads the partner slot into locals, synchronises the warp, writes
+// its own slot and synchronises again. This gives the same result as lockstep
+// execution without depending on it: NVIDIA documents implicit
+// warp-synchronous code as unsafe on GPUs with independent thread scheduling
+// (Volta and newer).
+template <typename scalar_t>
+__device__ void warp_reduce(
+    volatile scalar_t* min_dists,
+    volatile long* min_idxs,
+    const size_t tid) {
+#pragma unroll
+  for (int s = 32; s > 0; s >>= 1) {
+    // Snapshot the partner slot before any thread of the warp overwrites it.
+    const scalar_t other_dist = min_dists[tid + s];
+    const long other_idx = min_idxs[tid + s];
+    __syncwarp();
+    if (min_dists[tid] > other_dist) {
+      min_idxs[tid] = other_idx;
+      min_dists[tid] = other_dist;
+    }
+    // Make this step's writes visible before the next step reads them.
+    __syncwarp();
+  }
+}
+
+//  CUDA kernel to compute nearest neighbors between two batches of pointclouds
+//  where each point is of dimension D.
+//
+//  Args:
+//    points1: First set of points, of shape (N, P1, D).
+//    points2: Second set of points, of shape (N, P2, D).
+//    idx: Output memory buffer of shape (N, P1).
+//    N: Batch size.
+//    P1: Number of points in points1.
+//    P2: Number of points in points2.
+//    D: Dimension of each point.
+//    D_2: Number of scalar_t slots reserved in the shared buffer for the query
+//         point; this is D rounded up so that memory access is aligned.
+//
+// Launch layout used by this kernel: blockIdx.y selects the batch element n,
+// blockIdx.x selects the query point i, and the threads of the block share the
+// work over the P2 candidate points. The dynamic shared buffer must provide
+// room for D_2 + blockDim.x values of scalar_t followed by blockDim.x longs.
+template <typename scalar_t>
+__global__ void nearest_neighbor_kernel(
+    const scalar_t* __restrict__ points1,
+    const scalar_t* __restrict__ points2,
+    long* __restrict__ idx,
+    const size_t N,
+    const size_t P1,
+    const size_t P2,
+    const size_t D,
+    const size_t D_2) {
+  // Each block will compute one element of the output idx[n, i]. Within the
+  // block we will use threads to compute the distances between points1[n, i]
+  // and points2[n, j] for all 0 <= j < P2, then use a block reduction to
+  // take an argmin of the distances.
+
+  // Shared buffers for the threads in the block. CUDA only allows declaration
+  // of a single shared buffer, so it needs to be manually sliced and cast to
+  // build several logical shared buffers of different types.
+  extern __shared__ char shared_buf[];
+  scalar_t* x = (scalar_t*)shared_buf; // scalar_t[D_2]
+  scalar_t* min_dists = &x[D_2]; // scalar_t[NUM_THREADS]
+  long* min_idxs = (long*)&min_dists[blockDim.x]; // long[NUM_THREADS]
+
+  const size_t n = blockIdx.y; // index of batch element.
+  const size_t i = blockIdx.x; // index of point within batch element.
+  const size_t tid = threadIdx.x;
+
+  // Thread 0 copies points1[n, i, :] into x.
+  if (tid == 0) {
+    for (size_t d = 0; d < D; d++) {
+      x[d] = points1[n * (P1 * D) + i * D + d];
+    }
+  }
+  // Every thread reads x below, so wait until thread 0 has filled it.
+  __syncthreads();
+
+  // Compute the distances between points1[n, i] and points2[n, j] for
+  // all 0 <= j < P2. Here each thread will reduce over P2 / blockDim.x
+  // in serial, and store its result to shared memory
+  // Threads with tid >= P2 never enter the loop and keep (FLT_MAX, 0).
+  scalar_t min_dist = FLT_MAX;
+  size_t min_idx = 0;
+  for (size_t j = tid; j < P2; j += blockDim.x) {
+    // Squared Euclidean distance between x and points2[n, j].
+    scalar_t dist = 0;
+    for (size_t d = 0; d < D; d++) {
+      scalar_t x_d = x[d];
+      scalar_t y_d = points2[n * (P2 * D) + j * D + d];
+      scalar_t diff = x_d - y_d;
+      dist += diff * diff;
+    }
+    // On this thread's first iteration (j == tid) the running minimum is
+    // seeded with the first distance instead of FLT_MAX.
+    min_dist = (j == tid) ? dist : min_dist;
+    // `<=` means that among equal distances the largest j seen by this thread
+    // is kept. min_idx must be updated before min_dist because both lines
+    // compare against the same min_dist value.
+    min_idx = (dist <= min_dist) ? j : min_idx;
+    min_dist = (dist <= min_dist) ? dist : min_dist;
+  }
+  min_dists[tid] = min_dist;
+  min_idxs[tid] = min_idx;
+  // All per-thread results must be in shared memory before the reduction.
+  __syncthreads();
+
+  // Perform reduction in shared memory.
+  // The loop halves the active range each step and stops once s reaches 32;
+  // the remaining steps (s = 32 .. 1) are handled by warp_reduce below.
+  for (int s = blockDim.x / 2; s > 32; s >>= 1) {
+    if (tid < s) {
+      if (min_dists[tid] > min_dists[tid + s]) {
+        min_dists[tid] = min_dists[tid + s];
+        min_idxs[tid] = min_idxs[tid + s];
+      }
+    }
+    __syncthreads();
+  }
+
+  // Unroll the last 6 iterations of the loop since they will happen
+  // synchronized within a single warp.
+  if (tid < 32)
+    warp_reduce<scalar_t>(min_dists, min_idxs, tid);
+
+  // Finally thread 0 writes the result to the output buffer.
+  if (tid == 0) {
+    idx[n * P1 + i] = min_idxs[0];
+  }
+}
+
+//  CUDA kernel to compute nearest neighbors between two sets of 3-dimensional
+//  pointclouds. This is a specialization of the nearest_neighbor_kernel
+//  to the case D=3.
+//
+//  Args:
+//    points1: First set of pointclouds, of shape (N, P1, 3).
+//    points2: Second set of pointclouds, of shape (N, P2, 3).
+//    idx: Output memory buffer of shape (N, P1).
+//    N: Batch size.
+//    P1: Number of points in points1.
+//    P2: Number of points in points2.
+//
+template <typename scalar_t>
+__global__ void nearest_neighbor_kernel_D3(
+    const scalar_t* __restrict__ points1,
+    const scalar_t* __restrict__ points2,
+    long* __restrict__ idx,
+    const size_t N,
+    const size_t P1,
+    const size_t P2) {
+  // One dynamically sized shared allocation backs both per-thread arrays: the
+  // best distances come first, immediately followed by the matching indices.
+  extern __shared__ char shared_buf[];
+  scalar_t* min_dists =
+      reinterpret_cast<scalar_t*>(shared_buf); // scalar_t[NUM_THREADS]
+  long* min_idxs =
+      reinterpret_cast<long*>(&min_dists[blockDim.x]); // long[NUM_THREADS]
+
+  const size_t D = 3;
+  const size_t tid = threadIdx.x;
+  const size_t i = blockIdx.x; // query point within the batch element.
+  const size_t n = blockIdx.y; // batch element.
+
+  // Load the query point points1[n, i] once into locals.
+  const size_t query_off = n * (P1 * D) + i * D;
+  const scalar_t x = points1[query_off + 0];
+  const scalar_t y = points1[query_off + 1];
+  const scalar_t z = points1[query_off + 2];
+
+  // Every thread scans candidates tid, tid + blockDim.x, ... of points2[n]
+  // and remembers the closest one it has seen.
+  scalar_t best_dist = FLT_MAX;
+  size_t best_j = 0;
+  for (size_t j = tid; j < P2; j += blockDim.x) {
+    const size_t cand_off = n * (P2 * D) + j * D;
+    const scalar_t dx = x - points2[cand_off + 0];
+    const scalar_t dy = y - points2[cand_off + 1];
+    const scalar_t dz = z - points2[cand_off + 2];
+    const scalar_t dist = dx * dx + dy * dy + dz * dz;
+    // The first candidate of this thread replaces the FLT_MAX seed.
+    if (j == tid) {
+      best_dist = dist;
+    }
+    // Ties go to the later candidate.
+    if (dist <= best_dist) {
+      best_j = j;
+      best_dist = dist;
+    }
+  }
+  min_dists[tid] = best_dist;
+  min_idxs[tid] = best_j;
+
+  // All per-thread results must be visible before the block reduction starts.
+  __syncthreads();
+
+  // Tree reduction in shared memory down to 64 remaining candidates.
+  for (int s = blockDim.x / 2; s > 32; s /= 2) {
+    if (tid < s && min_dists[tid] > min_dists[tid + s]) {
+      min_dists[tid] = min_dists[tid + s];
+      min_idxs[tid] = min_idxs[tid + s];
+    }
+
+    // Finish this level before any thread starts the next one.
+    __syncthreads();
+  }
+
+  // The last six levels are handled by the first warp on its own.
+  if (tid < 32) {
+    warp_reduce<scalar_t>(min_dists, min_idxs, tid);
+  }
+
+  // Thread 0 publishes the winning index.
+  if (tid == 0) {
+    idx[n * P1 + i] = min_idxs[0];
+  }
+}
+
+// Host-side launcher for the nearest-neighbor kernels.
+//
+// Args:
+//    p1: tensor of shape (N, P1, D); its scalar type selects the kernel.
+//    p2: tensor of shape (N, P2, D).
+//
+// Returns:
+//    idx: long tensor of shape (N, P1) on the same device as p1.
+//
+// Raises (via AT_ASSERTM) if p1 and p2 disagree on batch size or on D.
+//
+// NOTE(review): when P2 == 0 the kernels never enter their candidate loop and
+// write index 0 for every query point; confirm callers never pass an empty p2.
+at::Tensor nn_points_idx_cuda(at::Tensor p1, at::Tensor p2) {
+  const auto N = p1.size(0);
+  const auto P1 = p1.size(1);
+  const auto P2 = p2.size(1);
+  const auto D = p1.size(2);
+
+  // The kernels index points2 with the batch index of points1, so a smaller
+  // batch in p2 would be read out of bounds.
+  AT_ASSERTM(p2.size(0) == N, "Point sets must have same batch size.");
+  AT_ASSERTM(p2.size(2) == D, "Point sets must have same last dimension.");
+  auto idx = at::empty({N, P1}, p1.options().dtype(at::kLong));
+
+  // The grid is (P1, N). A zero-sized grid is an invalid launch configuration
+  // and there is nothing to compute, so return the empty result.
+  if (N == 0 || P1 == 0) {
+    return idx;
+  }
+
+  // On P100 with pointclouds of size (16, 5000, 3), 128 threads per block
+  // gives best results.
+  const int threads = 128;
+  const dim3 blocks(P1, N);
+
+  if (D == 3) {
+    // Use the specialized kernel for D=3.
+    AT_DISPATCH_FLOATING_TYPES(
+        p1.scalar_type(), "nearest_neighbor_v3_cuda", ([&] {
+          // Shared memory: scalar_t[threads] distances followed by
+          // long[threads] indices, matching how the kernel slices it.
+          const size_t shared_size =
+              threads * sizeof(scalar_t) + threads * sizeof(long);
+          nearest_neighbor_kernel_D3<scalar_t>
+              <<<blocks, threads, shared_size>>>(
+                  p1.data_ptr<scalar_t>(),
+                  p2.data_ptr<scalar_t>(),
+                  idx.data_ptr<long>(),
+                  N,
+                  P1,
+                  P2);
+        }));
+  } else {
+    // Use the general kernel for all other D.
+    AT_DISPATCH_FLOATING_TYPES(
+        p1.scalar_type(), "nearest_neighbor_v3_cuda", ([&] {
+          // To avoid misaligned memory access, the size of shared buffers
+          // need to be rounded to the next even size.
+          const size_t D_2 = D + (D % 2);
+          // Shared memory: scalar_t[D_2] query point, scalar_t[threads]
+          // distances, then long[threads] indices.
+          size_t shared_size = (D_2 + threads) * sizeof(scalar_t);
+          shared_size += threads * sizeof(long);
+          nearest_neighbor_kernel<scalar_t><<<blocks, threads, shared_size>>>(
+              p1.data_ptr<scalar_t>(),
+              p2.data_ptr<scalar_t>(),
+              idx.data_ptr<long>(),
+              N,
+              P1,
+              P2,
+              D,
+              D_2);
+        }));
+  }
+
+  return idx;
+}
--- a/pytorch3d/csrc/nearest_neighbor_points/nearest_neighbor_points.h
+++ b/pytorch3d/csrc/nearest_neighbor_points/nearest_neighbor_points.h
@ -0,0 +1,37 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+
+#pragma once
+#include <torch/extension.h>
+#include "pytorch3d_cutils.h"
+
+// Compute indices of nearest neighbors in pointcloud p2 to points
+// in pointcloud p1.
+//
+// Args:
+//    p1: FloatTensor of shape (N, P1, D) giving a batch of pointclouds each
+//        containing P1 points of dimension D.
+//    p2: FloatTensor of shape (N, P2, D) giving a batch of pointclouds each
+//        containing P2 points of dimension D.
+//
+// Returns:
+//    p1_neighbor_idx: LongTensor of shape (N, P1), where
+//                     p1_neighbor_idx[n, i] = j means that the nearest neighbor
+//                     to p1[n, i] in the cloud p2[n] is p2[n, j].
+//
+
+// Cuda implementation.
+at::Tensor nn_points_idx_cuda(at::Tensor p1, at::Tensor p2);
+
+// Implementation which is exposed.
+// Forwards to nn_points_idx_cuda when both tensors are CUDA tensors, after
+// applying CHECK_CONTIGUOUS_CUDA to each of them. Raises an error if the
+// extension was built without WITH_CUDA, or if either tensor is not a CUDA
+// tensor (there is no CPU implementation).
+// Marked `inline` because the definition lives in a header; without it,
+// including this header from more than one translation unit produces
+// duplicate-symbol errors at link time.
+inline at::Tensor nn_points_idx(at::Tensor p1, at::Tensor p2) {
+  if (p1.is_cuda() && p2.is_cuda()) {
+#ifdef WITH_CUDA
+    CHECK_CONTIGUOUS_CUDA(p1);
+    CHECK_CONTIGUOUS_CUDA(p2);
+    return nn_points_idx_cuda(p1, p2);
+#else
+    AT_ERROR("Not compiled with GPU support.");
+#endif
+  }
+  AT_ERROR("Not implemented on the CPU.");
+}
--- a/pytorch3d/csrc/packed_to_padded_tensor/packed_to_padded_tensor.cu
+++ b/pytorch3d/csrc/packed_to_padded_tensor/packed_to_padded_tensor.cu
@ -0,0 +1,52 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+
+#include <ATen/ATen.h>
+
+// Copies each batch element's slice of the packed `inputs` array into its own
+// row of `inputs_padded`.
+//
+// Args:
+//    inputs: packed values, indexed as inputs[start + f].
+//    first_idxs: first_idxs[b] is the offset in `inputs` where batch element b
+//                starts; the element ends where the next one starts, or at
+//                num_inputs for the last batch element.
+//    inputs_padded: output, indexed as inputs_padded[b * max_size + f]. Only
+//                   the copied entries are written.
+//    batch_size: number of entries in first_idxs.
+//    max_size: row length of inputs_padded.
+//    num_inputs: total number of entries in inputs.
+template <typename scalar_t>
+__global__ void packed_to_padded_tensor_kernel(
+    const scalar_t* __restrict__ inputs,
+    const long* __restrict__ first_idxs,
+    scalar_t* __restrict__ inputs_padded,
+    const size_t batch_size,
+    const size_t max_size,
+    const size_t num_inputs) {
+  // Batch elements split evenly across blocks (num blocks = batch_size) and
+  // values for each element split across threads in the block. Each thread
+  // copies the values of its respective input elements to the global
+  // inputs_padded tensor.
+  const size_t tid = threadIdx.x;
+  const size_t batch_idx = blockIdx.x;
+
+  const long start = first_idxs[batch_idx];
+  const long end = batch_idx + 1 < batch_size
+      ? first_idxs[batch_idx + 1]
+      : static_cast<long>(num_inputs);
+
+  // Keep the count in a signed 64-bit type: narrowing to int truncates large
+  // slices, and comparing a negative count against an unsigned loop index
+  // would turn it into a huge bound.
+  long count = end - start;
+  // Never write past the end of this batch element's row.
+  if (count > static_cast<long>(max_size)) {
+    count = static_cast<long>(max_size);
+  }
+
+  for (long f = static_cast<long>(tid); f < count; f += blockDim.x) {
+    inputs_padded[batch_idx * max_size + f] = inputs[start + f];
+  }
+}
+
+// Host-side launcher for packed_to_padded_tensor_kernel.
+//
+// Args:
+//    inputs: packed tensor; its first dimension is the number of inputs and
+//            its scalar type selects the kernel instantiation.
+//    first_idxs: tensor whose first dimension is the batch size; read as long.
+//    max_size: row length of the padded output.
+//
+// Returns:
+//    A zero-initialised tensor of shape (batch_size, max_size) with the same
+//    options as `inputs`, filled in by the kernel.
+at::Tensor packed_to_padded_tensor_cuda(
+    at::Tensor inputs,
+    at::Tensor first_idxs,
+    const long max_size) {
+  const auto num_inputs = inputs.size(0);
+  const auto batch_size = first_idxs.size(0);
+  at::Tensor inputs_padded =
+      at::zeros({batch_size, max_size}, inputs.options());
+
+  // One block is launched per batch element. Zero blocks is an invalid launch
+  // configuration and there is nothing to copy, so return early.
+  if (batch_size == 0) {
+    return inputs_padded;
+  }
+
+  const int threads = 512;
+  const int blocks = static_cast<int>(batch_size);
+  AT_DISPATCH_FLOATING_TYPES(
+      inputs.scalar_type(), "packed_to_padded_tensor_kernel", ([&] {
+        packed_to_padded_tensor_kernel<scalar_t><<<blocks, threads>>>(
+            inputs.data_ptr<scalar_t>(),
+            first_idxs.data_ptr<long>(),
+            inputs_padded.data_ptr<scalar_t>(),
+            batch_size,
+            max_size,
+            num_inputs);
+      }));
+
+  return inputs_padded;
+}
--- a/pytorch3d/csrc/packed_to_padded_tensor/packed_to_padded_tensor.h
+++ b/pytorch3d/csrc/packed_to_padded_tensor/packed_to_padded_tensor.h
@ -0,0 +1,44 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+
+#pragma once
+#include <torch/extension.h>
+
+// Converts a packed tensor into a padded tensor, restoring the batch dimension.
+// Refer to pytorch3d/structures/meshes.py for details on packed/padded tensors.
+//
+// Inputs:
+//    inputs: FloatTensor of shape (F,), representing the packed batch tensor.
+//           e.g. areas for faces in a batch of meshes.
+//    first_idxs: LongTensor of shape (N,) where N is the number of
+//                elements in the batch and `first_idxs[i] = f`
+//                means that the inputs for batch element i begin at
+//                `inputs[f]`.
+//    max_size: Max length of an element in the batch.
+// Returns:
+//    inputs_padded: FloatTensor of shape (N, max_size). The values for batch
+//                   element i, which start at `inputs[first_idxs[i]]`, will
+//                   be copied to `inputs_padded[i, :]`, with zeros padding
+//                   out the extra inputs.
+//
+
+// Cuda implementation.
+at::Tensor packed_to_padded_tensor_cuda(
+    at::Tensor inputs,
+    at::Tensor first_idxs,
+    const long max_size);
+
+// Implementation which is exposed.
+// Forwards to packed_to_padded_tensor_cuda when both tensors are CUDA
+// tensors. first_idxs is checked as well because the kernel dereferences its
+// data pointer on the device. Raises an error if the extension was built
+// without WITH_CUDA, or if either tensor is not a CUDA tensor (there is no
+// CPU implementation).
+// Marked `inline` because the definition lives in a header; without it,
+// including this header from more than one translation unit produces
+// duplicate-symbol errors at link time.
+inline at::Tensor packed_to_padded_tensor(
+    at::Tensor inputs,
+    at::Tensor first_idxs,
+    const long max_size) {
+  if (inputs.is_cuda() && first_idxs.is_cuda()) {
+#ifdef WITH_CUDA
+    return packed_to_padded_tensor_cuda(inputs, first_idxs, max_size);
+#else
+    AT_ERROR("Not compiled with GPU support.");
+#endif
+  }
+  AT_ERROR("Not implemented on the CPU.");
+}
--- a/Show More
+++ b/Show More