diff --git a/.circleci/config.in.yml b/.circleci/config.in.yml
index 607baf5d..323d36a3 100644
--- a/.circleci/config.in.yml
+++ b/.circleci/config.in.yml
@@ -18,20 +18,13 @@ setupcuda: &setupcuda
     working_directory: ~/
     command: |
       # download and install nvidia drivers, cuda, etc
-      wget --no-verbose --no-clobber -P ~/nvidia-downloads https://developer.download.nvidia.com/compute/cuda/11.2.2/local_installers/cuda_11.2.2_460.32.03_linux.run
-      sudo sh ~/nvidia-downloads/cuda_11.2.2_460.32.03_linux.run --silent
+      wget --no-verbose --no-clobber -P ~/nvidia-downloads https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.19.01_linux.run
+      sudo sh ~/nvidia-downloads/cuda_11.3.1_465.19.01_linux.run --silent
       echo "Done installing CUDA."
       pyenv versions
       nvidia-smi
       pyenv global 3.9.1
 
-gpu: &gpu
-  environment:
-    CUDA_VERSION: "10.2"
-  machine:
-    image: default
-  resource_class: gpu.medium # tesla m60
-
 binary_common: &binary_common
   parameters:
     # Edit these defaults to do a release`
@@ -54,42 +47,41 @@ binary_common: &binary_common
       description: "Wheel only: what docker image to use"
       type: string
       default: "pytorch/manylinux-cuda101"
+    conda_docker_image:
+      description: "what docker image to use for docker"
+      type: string
+      default: "pytorch/conda-cuda"
   environment:
     PYTHON_VERSION: << parameters.python_version >>
     BUILD_VERSION: << parameters.build_version >>
     PYTORCH_VERSION: << parameters.pytorch_version >>
     CU_VERSION: << parameters.cu_version >>
+    TESTRUN_DOCKER_IMAGE: << parameters.conda_docker_image >>
 
 jobs:
   main:
-    <<: *gpu
+    environment:
+      CUDA_VERSION: "11.3"
+    resource_class: gpu.nvidia.small.multi
     machine:
       image: ubuntu-2004:202101-01
     steps:
       - checkout
       - <<: *setupcuda
       - run: pip3 install --progress-bar off imageio wheel matplotlib 'pillow<7'
-      - run: pip3 install --progress-bar off torch torchvision
+      - run: pip3 install --progress-bar off torch==1.10.0+cu113 torchvision==0.11.1+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
       # - run: conda create -p ~/conda_env python=3.7 numpy
       # - run: conda activate ~/conda_env
       # - run: conda install -c pytorch pytorch torchvision
       - run: pip3 install --progress-bar off 'git+https://github.com/facebookresearch/fvcore'
       - run: pip3 install --progress-bar off 'git+https://github.com/facebookresearch/iopath'
-      - run:
-          name: get cub
-          command: |
-            cd ..
-            wget --no-verbose https://github.com/NVIDIA/cub/archive/1.10.0.tar.gz
-            tar xzf 1.10.0.tar.gz
-            # This expands to a directory called cub-1.10.0
       - run:
           name: build
           command: |
-            export LD_LIBRARY_PATH=$LD_LIBARY_PATH:/usr/local/cuda-11.2/lib64
-            export CUB_HOME=$(realpath ../cub-1.10.0)
+            export LD_LIBRARY_PATH=$LD_LIBARY_PATH:/usr/local/cuda-11.3/lib64
             python3 setup.py build_ext --inplace
-      - run: LD_LIBRARY_PATH=$LD_LIBARY_PATH:/usr/local/cuda-11.2/lib64 python -m unittest discover -v -s tests
+      - run: LD_LIBRARY_PATH=$LD_LIBARY_PATH:/usr/local/cuda-11.3/lib64 python -m unittest discover -v -s tests
       - run: python3 setup.py bdist_wheel
 
   binary_linux_wheel:
@@ -113,7 +105,7 @@ jobs:
   binary_linux_conda:
     <<: *binary_common
     docker:
-      - image: "pytorch/conda-cuda"
+      - image: "<< parameters.conda_docker_image >>"
         auth:
           username: $DOCKERHUB_USERNAME
           password: $DOCKERHUB_TOKEN
@@ -137,7 +129,7 @@ jobs:
     <<: *binary_common
     machine:
       image: ubuntu-1604:201903-01
-    resource_class: gpu.medium
+    resource_class: gpu.nvidia.small.multi
     steps:
       - checkout
       - run:
@@ -189,9 +181,8 @@ jobs:
             { docker login -u="$DOCKERHUB_USERNAME" -p="$DOCKERHUB_TOKEN" ; } 2> /dev/null
 
-            DOCKER_IMAGE=pytorch/conda-cuda
-            echo Pulling docker image $DOCKER_IMAGE
-            docker pull $DOCKER_IMAGE
+            echo Pulling docker image $TESTRUN_DOCKER_IMAGE
+            docker pull $TESTRUN_DOCKER_IMAGE
       - run:
           name: Build and run tests
           no_output_timeout: 20m
@@ -200,11 +191,10 @@ jobs:
 
             cd ${HOME}/project/
 
-            DOCKER_IMAGE=pytorch/conda-cuda
             export JUST_TESTRUN=1
             VARS_TO_PASS="-e PYTHON_VERSION -e BUILD_VERSION -e PYTORCH_VERSION -e CU_VERSION -e JUST_TESTRUN"
 
-            docker run --gpus all --ipc=host -v $(pwd):/remote -w /remote ${VARS_TO_PASS} ${DOCKER_IMAGE} ./packaging/build_conda.sh
+            docker run --gpus all --ipc=host -v $(pwd):/remote -w /remote ${VARS_TO_PASS} ${TESTRUN_DOCKER_IMAGE} ./packaging/build_conda.sh
 
   binary_macos_wheel:
     <<: *binary_common
@@ -228,27 +218,15 @@ workflows:
   version: 2
   build_and_test:
     jobs:
-      - main:
-          context: DOCKERHUB_TOKEN
+      # - main:
+      #     context: DOCKERHUB_TOKEN
       {{workflows()}}
       - binary_linux_conda_cuda:
-          name: testrun_conda_cuda_py37_cu102_pyt190
-          context: DOCKERHUB_TOKEN
-          python_version: "3.7"
-          pytorch_version: '1.9.0'
-          cu_version: "cu102"
-      - binary_linux_conda_cuda:
-          name: testrun_conda_cuda_py37_cu110_pyt170
+          name: testrun_conda_cuda_py37_cu102_pyt170
           context: DOCKERHUB_TOKEN
           python_version: "3.7"
           pytorch_version: '1.7.0'
-          cu_version: "cu110"
-      - binary_linux_conda_cuda:
-          name: testrun_conda_cuda_py39_cu111_pyt181
-          context: DOCKERHUB_TOKEN
-          python_version: "3.9"
-          pytorch_version: '1.8.1'
-          cu_version: "cu111"
+          cu_version: "cu102"
       - binary_macos_wheel:
           cu_version: cpu
           name: macos_wheel_py36_cpu
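As a hypothetical sanity check (nothing the diff itself adds), the torch==1.10.0+cu113 and torchvision==0.11.1+cu113 wheels pinned in the main job can be verified against the CUDA 11.3.1 toolkit that the setupcuda step installs:

    # Hypothetical check, not part of the repo: confirm the pinned cu113 wheel
    # matches the CUDA 11.3 toolkit and driver installed by setupcuda.
    import torch

    assert torch.version.cuda == "11.3", torch.version.cuda  # wheel was built against cu113
    assert torch.cuda.is_available()  # driver from the 11.3.1 runfile is loaded
    print(torch.__version__, torch.cuda.get_device_name(0))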
diff --git a/.circleci/config.yml b/.circleci/config.yml
index 10456672..268a2961 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -18,20 +18,13 @@ setupcuda: &setupcuda
     working_directory: ~/
     command: |
       # download and install nvidia drivers, cuda, etc
-      wget --no-verbose --no-clobber -P ~/nvidia-downloads https://developer.download.nvidia.com/compute/cuda/11.2.2/local_installers/cuda_11.2.2_460.32.03_linux.run
-      sudo sh ~/nvidia-downloads/cuda_11.2.2_460.32.03_linux.run --silent
+      wget --no-verbose --no-clobber -P ~/nvidia-downloads https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.19.01_linux.run
+      sudo sh ~/nvidia-downloads/cuda_11.3.1_465.19.01_linux.run --silent
       echo "Done installing CUDA."
       pyenv versions
       nvidia-smi
       pyenv global 3.9.1
 
-gpu: &gpu
-  environment:
-    CUDA_VERSION: "10.2"
-  machine:
-    image: default
-  resource_class: gpu.medium # tesla m60
-
 binary_common: &binary_common
   parameters:
     # Edit these defaults to do a release`
@@ -54,42 +47,41 @@ binary_common: &binary_common
      description: "Wheel only: what docker image to use"
      type: string
      default: "pytorch/manylinux-cuda101"
+    conda_docker_image:
+      description: "what docker image to use for docker"
+      type: string
+      default: "pytorch/conda-cuda"
   environment:
     PYTHON_VERSION: << parameters.python_version >>
     BUILD_VERSION: << parameters.build_version >>
     PYTORCH_VERSION: << parameters.pytorch_version >>
     CU_VERSION: << parameters.cu_version >>
+    TESTRUN_DOCKER_IMAGE: << parameters.conda_docker_image >>
 
 jobs:
   main:
-    <<: *gpu
+    environment:
+      CUDA_VERSION: "11.3"
+    resource_class: gpu.nvidia.small.multi
     machine:
       image: ubuntu-2004:202101-01
     steps:
       - checkout
       - <<: *setupcuda
       - run: pip3 install --progress-bar off imageio wheel matplotlib 'pillow<7'
-      - run: pip3 install --progress-bar off torch torchvision
+      - run: pip3 install --progress-bar off torch==1.10.0+cu113 torchvision==0.11.1+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
       # - run: conda create -p ~/conda_env python=3.7 numpy
       # - run: conda activate ~/conda_env
       # - run: conda install -c pytorch pytorch torchvision
       - run: pip3 install --progress-bar off 'git+https://github.com/facebookresearch/fvcore'
       - run: pip3 install --progress-bar off 'git+https://github.com/facebookresearch/iopath'
-      - run:
-          name: get cub
-          command: |
-            cd ..
-            wget --no-verbose https://github.com/NVIDIA/cub/archive/1.10.0.tar.gz
-            tar xzf 1.10.0.tar.gz
-            # This expands to a directory called cub-1.10.0
       - run:
           name: build
           command: |
-            export LD_LIBRARY_PATH=$LD_LIBARY_PATH:/usr/local/cuda-11.2/lib64
-            export CUB_HOME=$(realpath ../cub-1.10.0)
+            export LD_LIBRARY_PATH=$LD_LIBARY_PATH:/usr/local/cuda-11.3/lib64
             python3 setup.py build_ext --inplace
-      - run: LD_LIBRARY_PATH=$LD_LIBARY_PATH:/usr/local/cuda-11.2/lib64 python -m unittest discover -v -s tests
+      - run: LD_LIBRARY_PATH=$LD_LIBARY_PATH:/usr/local/cuda-11.3/lib64 python -m unittest discover -v -s tests
       - run: python3 setup.py bdist_wheel
 
   binary_linux_wheel:
@@ -113,7 +105,7 @@ jobs:
   binary_linux_conda:
     <<: *binary_common
     docker:
-      - image: "pytorch/conda-cuda"
+      - image: "<< parameters.conda_docker_image >>"
         auth:
           username: $DOCKERHUB_USERNAME
           password: $DOCKERHUB_TOKEN
@@ -137,7 +129,7 @@ jobs:
     <<: *binary_common
     machine:
       image: ubuntu-1604:201903-01
-    resource_class: gpu.medium
+    resource_class: gpu.nvidia.small.multi
     steps:
       - checkout
       - run:
@@ -189,9 +181,8 @@ jobs:
             { docker login -u="$DOCKERHUB_USERNAME" -p="$DOCKERHUB_TOKEN" ; } 2> /dev/null
 
-            DOCKER_IMAGE=pytorch/conda-cuda
-            echo Pulling docker image $DOCKER_IMAGE
-            docker pull $DOCKER_IMAGE
+            echo Pulling docker image $TESTRUN_DOCKER_IMAGE
+            docker pull $TESTRUN_DOCKER_IMAGE
       - run:
           name: Build and run tests
           no_output_timeout: 20m
@@ -200,11 +191,10 @@ jobs:
 
             cd ${HOME}/project/
 
-            DOCKER_IMAGE=pytorch/conda-cuda
             export JUST_TESTRUN=1
             VARS_TO_PASS="-e PYTHON_VERSION -e BUILD_VERSION -e PYTORCH_VERSION -e CU_VERSION -e JUST_TESTRUN"
 
-            docker run --gpus all --ipc=host -v $(pwd):/remote -w /remote ${VARS_TO_PASS} ${DOCKER_IMAGE} ./packaging/build_conda.sh
+            docker run --gpus all --ipc=host -v $(pwd):/remote -w /remote ${VARS_TO_PASS} ${TESTRUN_DOCKER_IMAGE} ./packaging/build_conda.sh
 
   binary_macos_wheel:
     <<: *binary_common
@@ -228,8 +218,8 @@ workflows:
   version: 2
   build_and_test:
     jobs:
-      - main:
-          context: DOCKERHUB_TOKEN
+      # - main:
+      #     context: DOCKERHUB_TOKEN
       - binary_linux_conda:
           context: DOCKERHUB_TOKEN
           cu_version: cu92
@@ -344,6 +334,19 @@ workflows:
           name: linux_conda_py36_cu111_pyt191
           python_version: '3.6'
           pytorch_version: 1.9.1
+      - binary_linux_conda:
+          context: DOCKERHUB_TOKEN
+          cu_version: cu102
+          name: linux_conda_py36_cu102_pyt1100
+          python_version: '3.6'
+          pytorch_version: 1.10.0
+      - binary_linux_conda:
+          conda_docker_image: pytorch/conda-builder:cuda113
+          context: DOCKERHUB_TOKEN
+          cu_version: cu113
+          name: linux_conda_py36_cu113_pyt1100
+          python_version: '3.6'
+          pytorch_version: 1.10.0
       - binary_linux_conda:
           context: DOCKERHUB_TOKEN
           cu_version: cu92
@@ -458,6 +461,19 @@ workflows:
           name: linux_conda_py37_cu111_pyt191
           python_version: '3.7'
           pytorch_version: 1.9.1
+      - binary_linux_conda:
+          context: DOCKERHUB_TOKEN
+          cu_version: cu102
+          name: linux_conda_py37_cu102_pyt1100
+          python_version: '3.7'
+          pytorch_version: 1.10.0
+      - binary_linux_conda:
+          conda_docker_image: pytorch/conda-builder:cuda113
+          context: DOCKERHUB_TOKEN
+          cu_version: cu113
+          name: linux_conda_py37_cu113_pyt1100
+          python_version: '3.7'
+          pytorch_version: 1.10.0
       - binary_linux_conda:
           context: DOCKERHUB_TOKEN
           cu_version: cu92
@@ -572,6 +588,19 @@ workflows:
           name: linux_conda_py38_cu111_pyt191
           python_version: '3.8'
           pytorch_version: 1.9.1
+      - binary_linux_conda:
+          context: DOCKERHUB_TOKEN
+          cu_version: cu102
+          name: linux_conda_py38_cu102_pyt1100
+          python_version: '3.8'
+          pytorch_version: 1.10.0
+      - binary_linux_conda:
+          conda_docker_image: pytorch/conda-builder:cuda113
+          context: DOCKERHUB_TOKEN
+          cu_version: cu113
+          name: linux_conda_py38_cu113_pyt1100
+          python_version: '3.8'
+          pytorch_version: 1.10.0
       - binary_linux_conda:
           context: DOCKERHUB_TOKEN
           cu_version: cu101
@@ -650,24 +679,25 @@ workflows:
           name: linux_conda_py39_cu111_pyt191
           python_version: '3.9'
           pytorch_version: 1.9.1
-      - binary_linux_conda_cuda:
-          name: testrun_conda_cuda_py37_cu102_pyt190
+      - binary_linux_conda:
           context: DOCKERHUB_TOKEN
-          python_version: "3.7"
-          pytorch_version: '1.9.0'
-          cu_version: "cu102"
+          cu_version: cu102
+          name: linux_conda_py39_cu102_pyt1100
+          python_version: '3.9'
+          pytorch_version: 1.10.0
+      - binary_linux_conda:
+          conda_docker_image: pytorch/conda-builder:cuda113
+          context: DOCKERHUB_TOKEN
+          cu_version: cu113
+          name: linux_conda_py39_cu113_pyt1100
+          python_version: '3.9'
+          pytorch_version: 1.10.0
       - binary_linux_conda_cuda:
-          name: testrun_conda_cuda_py37_cu110_pyt170
+          name: testrun_conda_cuda_py37_cu102_pyt170
           context: DOCKERHUB_TOKEN
           python_version: "3.7"
           pytorch_version: '1.7.0'
-          cu_version: "cu110"
-      - binary_linux_conda_cuda:
-          name: testrun_conda_cuda_py39_cu111_pyt181
-          context: DOCKERHUB_TOKEN
-          python_version: "3.9"
-          pytorch_version: '1.8.1'
-          cu_version: "cu111"
+          cu_version: "cu102"
       - binary_macos_wheel:
           cu_version: cpu
           name: macos_wheel_py36_cpu
diff --git a/.circleci/regenerate.py b/.circleci/regenerate.py
index 446ec84c..d727c882 100755
--- a/.circleci/regenerate.py
+++ b/.circleci/regenerate.py
@@ -26,9 +26,16 @@ CONDA_CUDA_VERSIONS = {
     "1.8.1": ["cu101", "cu102", "cu111"],
     "1.9.0": ["cu102", "cu111"],
     "1.9.1": ["cu102", "cu111"],
+    "1.10.0": ["cu102", "cu113"],
 }
 
 
+def conda_docker_image_for_cuda(cuda_version):
+    if cuda_version == "cu113":
+        return "pytorch/conda-builder:cuda113"
+    return None
+
+
 def pytorch_versions_for_python(python_version):
     if python_version in ["3.6", "3.7", "3.8"]:
         return list(CONDA_CUDA_VERSIONS)
@@ -113,6 +120,10 @@ def generate_base_workflow(
"context": "DOCKERHUB_TOKEN", } + conda_docker_image = conda_docker_image_for_cuda(cu_version) + if conda_docker_image is not None: + d["conda_docker_image"] = conda_docker_image + if filter_branch is not None: d["filters"] = {"branches": {"only": filter_branch}} diff --git a/packaging/linux_wheels/inside.sh b/packaging/linux_wheels/inside.sh index 8d9d2531..b3f30096 100644 --- a/packaging/linux_wheels/inside.sh +++ b/packaging/linux_wheels/inside.sh @@ -58,6 +58,16 @@ do for cu_version in ${CONDA_CUDA_VERSIONS[$pytorch_version]} do case "$cu_version" in + cu113) + export CUDA_HOME=/usr/local/cuda-11.3/ + export CUDA_TAG=11.3 + export NVCC_FLAGS="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_50,code=compute_50" + ;; + cu112) + export CUDA_HOME=/usr/local/cuda-11.2/ + export CUDA_TAG=11.2 + export NVCC_FLAGS="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_50,code=compute_50" + ;; cu111) export CUDA_HOME=/usr/local/cuda-11.1/ export CUDA_TAG=11.1 diff --git a/packaging/pkg_helpers.bash b/packaging/pkg_helpers.bash index 92f50be5..6aaa5cf7 100644 --- a/packaging/pkg_helpers.bash +++ b/packaging/pkg_helpers.bash @@ -51,6 +51,28 @@ setup_cuda() { # Now work out the CUDA settings case "$CU_VERSION" in + cu113) + if [[ "$OSTYPE" == "msys" ]]; then + export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.3" + else + export CUDA_HOME=/usr/local/cuda-11.3/ + fi + export FORCE_CUDA=1 + # Hard-coding gencode flags is temporary situation until + # https://github.com/pytorch/pytorch/pull/23408 lands + export NVCC_FLAGS="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_50,code=compute_50" + ;; + cu112) + if [[ "$OSTYPE" == "msys" ]]; then + export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.2" + else + export CUDA_HOME=/usr/local/cuda-11.2/ + fi + export FORCE_CUDA=1 + # Hard-coding gencode flags is temporary situation until + # https://github.com/pytorch/pytorch/pull/23408 lands + export NVCC_FLAGS="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_50,code=compute_50" + ;; cu111) if [[ "$OSTYPE" == "msys" ]]; then export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.1" @@ -267,9 +289,14 @@ setup_conda_cudatoolkit_constraint() { export CONDA_CUDATOOLKIT_CONSTRAINT="" else case "$CU_VERSION" in + cu113) + export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=11.3,<11.4 # [not osx]" + ;; + cu112) + export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=11.2,<11.3 # [not osx]" + ;; cu111) export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=11.1,<11.2 # [not osx]" - #export CONDA_CUB_CONSTRAINT="- nvidiacub" ;; cu110) export 
diff --git a/packaging/linux_wheels/inside.sh b/packaging/linux_wheels/inside.sh
index 8d9d2531..b3f30096 100644
--- a/packaging/linux_wheels/inside.sh
+++ b/packaging/linux_wheels/inside.sh
@@ -58,6 +58,16 @@ do
     for cu_version in ${CONDA_CUDA_VERSIONS[$pytorch_version]}
     do
         case "$cu_version" in
+            cu113)
+                export CUDA_HOME=/usr/local/cuda-11.3/
+                export CUDA_TAG=11.3
+                export NVCC_FLAGS="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_50,code=compute_50"
+            ;;
+            cu112)
+                export CUDA_HOME=/usr/local/cuda-11.2/
+                export CUDA_TAG=11.2
+                export NVCC_FLAGS="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_50,code=compute_50"
+            ;;
             cu111)
                 export CUDA_HOME=/usr/local/cuda-11.1/
                 export CUDA_TAG=11.1
diff --git a/packaging/pkg_helpers.bash b/packaging/pkg_helpers.bash
index 92f50be5..6aaa5cf7 100644
--- a/packaging/pkg_helpers.bash
+++ b/packaging/pkg_helpers.bash
@@ -51,6 +51,28 @@ setup_cuda() {
 
   # Now work out the CUDA settings
   case "$CU_VERSION" in
+    cu113)
+      if [[ "$OSTYPE" == "msys" ]]; then
+        export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.3"
+      else
+        export CUDA_HOME=/usr/local/cuda-11.3/
+      fi
+      export FORCE_CUDA=1
+      # Hard-coding gencode flags is temporary situation until
+      # https://github.com/pytorch/pytorch/pull/23408 lands
+      export NVCC_FLAGS="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_50,code=compute_50"
+      ;;
+    cu112)
+      if [[ "$OSTYPE" == "msys" ]]; then
+        export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.2"
+      else
+        export CUDA_HOME=/usr/local/cuda-11.2/
+      fi
+      export FORCE_CUDA=1
+      # Hard-coding gencode flags is temporary situation until
+      # https://github.com/pytorch/pytorch/pull/23408 lands
+      export NVCC_FLAGS="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_50,code=compute_50"
+      ;;
     cu111)
       if [[ "$OSTYPE" == "msys" ]]; then
         export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.1"
@@ -267,9 +289,14 @@ setup_conda_cudatoolkit_constraint() {
     export CONDA_CUDATOOLKIT_CONSTRAINT=""
   else
     case "$CU_VERSION" in
+      cu113)
+        export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=11.3,<11.4 # [not osx]"
+        ;;
+      cu112)
+        export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=11.2,<11.3 # [not osx]"
+        ;;
       cu111)
         export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=11.1,<11.2 # [not osx]"
-        #export CONDA_CUB_CONSTRAINT="- nvidiacub"
         ;;
       cu110)
         export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=11.0,<11.1 # [not osx]"