Accumulate points (#4)

Summary:
Code for accumulating point features in the z-buffer in three ways:
1. weighted sum
2. normalised weighted sum
3. alpha compositing
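
For context, a minimal single-pixel sketch of the three accumulation rules
(written for this summary, not taken from the diff; the function name, container
types and scalar loop are illustrative only):

#include <algorithm>
#include <vector>

// One channel of one pixel: `alphas` and `idx` are the K z-ordered alpha values
// and point indices for that pixel, `features` holds one channel per point.
// An index of -1 means no point covers the pixel at that slot.
struct PixelOut { float wsum, wsumnorm, alphacomposite; };

PixelOut composite_one_pixel(const std::vector<float>& alphas,
                             const std::vector<int>& idx,
                             const std::vector<float>& features) {
  PixelOut out{0.f, 0.f, 0.f};
  float sum_alpha = 0.f;
  float cum_alpha = 1.f;  // remaining transmittance (alpha compositing)
  for (size_t k = 0; k < idx.size(); ++k) {
    if (idx[k] < 0) continue;                        // sentinel: empty slot
    const float a = alphas[k];
    const float f = features[idx[k]];
    out.wsum += a * f;                               // 1. weighted sum
    sum_alpha += a;
    out.alphacomposite += cum_alpha * a * f;         // 3. alpha compositing
    cum_alpha *= (1.f - a);
  }
  // 2. normalised weighted sum, with the same epsilon clamp the kernels use
  out.wsumnorm = out.wsum / std::max(sum_alpha, 1e-4f);
  return out;
}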

Pull Request resolved: https://github.com/fairinternal/pytorch3d/pull/4

Reviewed By: nikhilaravi

Differential Revision: D20522422

Pulled By: gkioxari

fbshipit-source-id: 5023baa05f15e338f3821ef08f5552c2dcbfc06c
Author: Olivia
Date: 2020-03-19 11:19:39 -07:00
Committed by: Facebook GitHub Bot
Parent: 5218f45c2c
Commit: 53599770dd

21 changed files with 2466 additions and 4 deletions

@@ -0,0 +1,187 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <vector>
// TODO(gkioxari) support all data types once AtomicAdd supports doubles.
// Currently, support is for floats only.
__global__ void alphaCompositeCudaForwardKernel(
// clang-format off
torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> result,
const torch::PackedTensorAccessor<float, 2, torch::RestrictPtrTraits, size_t> features,
const torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> alphas,
const torch::PackedTensorAccessor<int64_t, 4, torch::RestrictPtrTraits, size_t> points_idx) {
// clang-format on
const int64_t batch_size = result.size(0);
const int64_t C = features.size(0);
const int64_t H = points_idx.size(2);
const int64_t W = points_idx.size(3);
// Get the batch and index
const int batch = blockIdx.x;
const int num_pixels = C * W * H;
const int num_threads = gridDim.y * blockDim.x;
const int tid = blockIdx.y * blockDim.x + threadIdx.x;
// Iterate over each feature in each pixel
for (int pid = tid; pid < num_pixels; pid += num_threads) {
int ch = pid / (W * H);
int j = (pid % (W * H)) / H;
int i = (pid % (W * H)) % H;
// Alpha-composite the per-point contributions, front to back
float cum_alpha = 1.;
// Iterate through the closest K points for this pixel
for (int k = 0; k < points_idx.size(1); ++k) {
int n_idx = points_idx[batch][k][j][i];
// Sentinel value is -1 indicating no point overlaps the pixel
if (n_idx < 0) {
continue;
}
float alpha = alphas[batch][k][j][i];
// TODO(gkioxari) It might be more efficient to have threads write in a
// local variable, and move atomicAdd outside of the loop such that
// atomicAdd is executed once per thread.
atomicAdd(
&result[batch][ch][j][i], features[ch][n_idx] * cum_alpha * alpha);
cum_alpha = cum_alpha * (1 - alpha);
}
}
}
// TODO(gkioxari) support all data types once AtomicAdd supports doubles.
// Currently, support is for floats only.
__global__ void alphaCompositeCudaBackwardKernel(
// clang-format off
torch::PackedTensorAccessor<float, 2, torch::RestrictPtrTraits, size_t> grad_features,
torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> grad_alphas,
const torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> grad_outputs,
const torch::PackedTensorAccessor<float, 2, torch::RestrictPtrTraits, size_t> features,
const torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> alphas,
const torch::PackedTensorAccessor<int64_t, 4, torch::RestrictPtrTraits, size_t> points_idx) {
// clang-format on
const int64_t batch_size = points_idx.size(0);
const int64_t C = features.size(0);
const int64_t H = points_idx.size(2);
const int64_t W = points_idx.size(3);
// Get the batch and index
const int batch = blockIdx.x;
const int num_pixels = C * W * H;
const int num_threads = gridDim.y * blockDim.x;
const int tid = blockIdx.y * blockDim.x + threadIdx.x;
// Parallelize over each feature in each pixel in images of size H * W,
// for each image in the batch of size batch_size
for (int pid = tid; pid < num_pixels; pid += num_threads) {
int ch = pid / (W * H);
int j = (pid % (W * H)) / H;
int i = (pid % (W * H)) % H;
// Alpha-composite the per-point contributions, front to back
float cum_alpha = 1.;
// Iterate through the closest K points for this pixel
for (int k = 0; k < points_idx.size(1); ++k) {
int n_idx = points_idx[batch][k][j][i];
// Sentinel value is -1 indicating no point overlaps the pixel
if (n_idx < 0) {
continue;
}
float alpha = alphas[batch][k][j][i];
// TODO(gkioxari) It might be more efficient to have threads write in a
// local variable, and move atomicAdd outside of the loop such that
// atomicAdd is executed once per thread.
atomicAdd(
&grad_alphas[batch][k][j][i],
cum_alpha * features[ch][n_idx] * grad_outputs[batch][ch][j][i]);
atomicAdd(
&grad_features[ch][n_idx],
cum_alpha * alpha * grad_outputs[batch][ch][j][i]);
// Iterate over the nearer points (t < k) to update their alpha gradients
for (int t = 0; t < k; ++t) {
int t_idx = points_idx[batch][t][j][i];
// Sentinel value is -1, indicating no point overlaps this pixel
if (t_idx < 0) {
continue;
}
float alpha_tvalue = alphas[batch][t][j][i];
// TODO(gkioxari) It might be more efficient to have threads write in a
// local variable, and move atomicAdd outside of the loop such that
// atomicAdd is executed once per thread.
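// Each weight w_k = alpha_k * prod_{l<k} (1 - alpha_l) = alpha * cum_alpha
// contains the factor (1 - alpha_t) for every nearer point t < k, so
// d w_k / d alpha_t = -w_k / (1 - alpha_t), which is the term accumulated below.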
atomicAdd(
&grad_alphas[batch][t][j][i],
-grad_outputs[batch][ch][j][i] * features[ch][n_idx] * cum_alpha *
alpha / (1 - alpha_tvalue));
}
cum_alpha = cum_alpha * (1 - alphas[batch][k][j][i]);
}
}
}
torch::Tensor alphaCompositeCudaForward(
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx) {
const int64_t batch_size = points_idx.size(0);
const int64_t C = features.size(0);
const int64_t H = points_idx.size(2);
const int64_t W = points_idx.size(3);
auto result = torch::zeros({batch_size, C, H, W}, features.options());
const dim3 threadsPerBlock(64);
const dim3 numBlocks(batch_size, 1024 / batch_size + 1);
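// Launch configuration (as written above): gridDim.x indexes the batch and
// gridDim.y appears to be chosen so the total number of blocks is roughly 1024;
// each thread then strides over the C * H * W feature-pixel entries of its image.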
// TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
// doubles. Currently, support is for floats only.
alphaCompositeCudaForwardKernel<<<numBlocks, threadsPerBlock>>>(
// clang-format off
result.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
features.packed_accessor<float, 2, torch::RestrictPtrTraits, size_t>(),
alphas.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
points_idx.packed_accessor<int64_t, 4, torch::RestrictPtrTraits, size_t>());
// clang-format on
return result;
}
std::tuple<torch::Tensor, torch::Tensor> alphaCompositeCudaBackward(
const torch::Tensor& grad_outputs,
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx) {
auto grad_features = torch::zeros_like(features);
auto grad_alphas = torch::zeros_like(alphas);
const int64_t bs = alphas.size(0);
const dim3 threadsPerBlock(64);
const dim3 numBlocks(bs, 1024 / bs + 1);
// TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
// doubles. Currently, support is for floats only.
alphaCompositeCudaBackwardKernel<<<numBlocks, threadsPerBlock>>>(
// clang-format off
grad_features.packed_accessor<float, 2, torch::RestrictPtrTraits, size_t>(),
grad_alphas.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
grad_outputs.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
features.packed_accessor<float, 2, torch::RestrictPtrTraits, size_t>(),
alphas.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
points_idx.packed_accessor<int64_t, 4, torch::RestrictPtrTraits, size_t>());
// clang-format on
return std::make_tuple(grad_features, grad_alphas);
}

@@ -0,0 +1,110 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <torch/extension.h>
#include "pytorch3d_cutils.h"
#include <vector>
// Perform alpha compositing of points in a z-buffer.
//
// Inputs:
// features: FloatTensor of shape (C, P) which gives the features
// of each point where C is the size of the feature and
// P the number of points.
// alphas: FloatTensor of shape (N, points_per_pixel, W, W) where
// points_per_pixel is the number of points in the z-buffer
// sorted in z-order, and W is the image size.
// points_idx: IntTensor of shape (N, points_per_pixel, W, W) giving the
// indices of the nearest points at each pixel, sorted in z-order.
// Returns:
// weighted_fs: FloatTensor of shape (N, C, W, W) giving the accumulated
// feature at each pixel. Concretely, it gives:
// weighted_fs[b,c,i,j] = sum_k cum_alpha_k *
// features[c,points_idx[b,k,i,j]]
// where cum_alpha_k =
// alphas[b,k,i,j] * prod_l=0..k-1 (1 - alphas[b,l,i,j])
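// Worked example (illustrative, not from the diff): with K = 3 and alphas
// 0.5, 0.4, 0.2 at one pixel, the per-point weights are
// w_0 = 0.5, w_1 = (1 - 0.5) * 0.4 = 0.2, w_2 = (1 - 0.5) * (1 - 0.4) * 0.2 = 0.06,
// so nearer points dominate and the weights sum to at most 1.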
// CUDA declarations
#ifdef WITH_CUDA
torch::Tensor alphaCompositeCudaForward(
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx);
std::tuple<torch::Tensor, torch::Tensor> alphaCompositeCudaBackward(
const torch::Tensor& grad_outputs,
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx);
#endif
// C++ declarations
torch::Tensor alphaCompositeCpuForward(
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx);
std::tuple<torch::Tensor, torch::Tensor> alphaCompositeCpuBackward(
const torch::Tensor& grad_outputs,
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx);
torch::Tensor alphaCompositeForward(
torch::Tensor& features,
torch::Tensor& alphas,
torch::Tensor& points_idx) {
features = features.contiguous();
alphas = alphas.contiguous();
points_idx = points_idx.contiguous();
if (features.type().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(features);
CHECK_CONTIGUOUS_CUDA(alphas);
CHECK_CONTIGUOUS_CUDA(points_idx);
#else
AT_ERROR("Not compiled with GPU support");
#endif
return alphaCompositeCudaForward(features, alphas, points_idx);
} else {
CHECK_CONTIGUOUS(features);
CHECK_CONTIGUOUS(alphas);
CHECK_CONTIGUOUS(points_idx);
return alphaCompositeCpuForward(features, alphas, points_idx);
}
}
std::tuple<torch::Tensor, torch::Tensor> alphaCompositeBackward(
torch::Tensor& grad_outputs,
torch::Tensor& features,
torch::Tensor& alphas,
torch::Tensor& points_idx) {
grad_outputs = grad_outputs.contiguous();
features = features.contiguous();
alphas = alphas.contiguous();
points_idx = points_idx.contiguous();
if (grad_outputs.type().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(grad_outputs);
CHECK_CONTIGUOUS_CUDA(features);
CHECK_CONTIGUOUS_CUDA(alphas);
CHECK_CONTIGUOUS_CUDA(points_idx);
#else
AT_ERROR("Not compiled with GPU support");
#endif
return alphaCompositeCudaBackward(
grad_outputs, features, alphas, points_idx);
} else {
CHECK_CONTIGUOUS(grad_outputs);
CHECK_CONTIGUOUS(features);
CHECK_CONTIGUOUS(alphas);
CHECK_CONTIGUOUS(points_idx);
return alphaCompositeCpuBackward(
grad_outputs, features, alphas, points_idx);
}
}
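
A hedged usage sketch of the dispatch entry point defined above, using the tensor
shapes from the doc comment (C = 3, P = 5, N = 1, points_per_pixel = 2, W = H = 4);
the driver below is illustrative and not part of the diff:

// Assumes this header is included and the CPU implementation is linked.
#include <iostream>
int main() {
  torch::Tensor features = torch::rand({3, 5});               // (C, P)
  torch::Tensor alphas = torch::rand({1, 2, 4, 4});           // (N, K, W, W), values in [0, 1)
  torch::Tensor points_idx =
      torch::randint(-1, 5, {1, 2, 4, 4}, torch::kInt64);     // (N, K, W, W), -1 = empty
  torch::Tensor out = alphaCompositeForward(features, alphas, points_idx);
  std::cout << out.sizes() << std::endl;                      // [1, 3, 4, 4]
  return 0;
}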

@@ -0,0 +1,114 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <torch/extension.h>
#include <cmath>
#include <vector>
torch::Tensor alphaCompositeCpuForward(
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx) {
const int64_t B = points_idx.size(0);
const int64_t K = points_idx.size(1);
const int64_t H = points_idx.size(2);
const int64_t W = points_idx.size(3);
const int64_t C = features.size(0);
torch::Tensor result = torch::zeros({B, C, H, W}, features.options());
auto features_a = features.accessor<float, 2>();
auto alphas_a = alphas.accessor<float, 4>();
auto points_idx_a = points_idx.accessor<int64_t, 4>();
auto result_a = result.accessor<float, 4>();
// Iterate over the batch
for (int b = 0; b < B; ++b) {
// Iterate over the features
for (int c = 0; c < C; ++c) {
// Iterate through the horizontal lines of the image from top to bottom
for (int j = 0; j < H; ++j) {
// Iterate over pixels in a horizontal line, left to right
for (int i = 0; i < W; ++i) {
float cum_alpha = 1.;
// Iterate through the closest K points for this pixel
for (int k = 0; k < K; ++k) {
int64_t n_idx = points_idx_a[b][k][j][i];
// Sentinel value is -1 indicating no point overlaps the pixel
if (n_idx < 0) {
continue;
}
float alpha = alphas_a[b][k][j][i];
result_a[b][c][j][i] += cum_alpha * alpha * features_a[c][n_idx];
cum_alpha = cum_alpha * (1 - alpha);
}
}
}
}
}
return result;
}
std::tuple<torch::Tensor, torch::Tensor> alphaCompositeCpuBackward(
const torch::Tensor& grad_outputs,
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx) {
torch::Tensor grad_features = torch::zeros_like(features);
torch::Tensor grad_alphas = torch::zeros_like(alphas);
const int64_t B = points_idx.size(0);
const int64_t K = points_idx.size(1);
const int64_t H = points_idx.size(2);
const int64_t W = points_idx.size(3);
const int64_t C = features.size(0);
auto grad_outputs_a = grad_outputs.accessor<float, 4>();
auto features_a = features.accessor<float, 2>();
auto alphas_a = alphas.accessor<float, 4>();
auto points_idx_a = points_idx.accessor<int64_t, 4>();
auto grad_features_a = grad_features.accessor<float, 2>();
auto grad_alphas_a = grad_alphas.accessor<float, 4>();
// Iterate over the batch
for (int b = 0; b < B; ++b) {
// Iterate over the features
for (int c = 0; c < C; ++c) {
// Iterate through the horizontal lines of the image from top to bottom
for (int j = 0; j < H; ++j) {
// Iterate over pixels in a horizontal line, left to right
for (int i = 0; i < W; ++i) {
float cum_alpha = 1.;
// Iterate through the closest K points for this pixel
for (int k = 0; k < K; ++k) {
int64_t n_idx = points_idx_a[b][k][j][i];
// Sentinel value is -1, indicating no point overlaps this pixel
if (n_idx < 0) {
continue;
}
float alpha = alphas_a[b][k][j][i];
grad_alphas_a[b][k][j][i] +=
grad_outputs_a[b][c][j][i] * features_a[c][n_idx] * cum_alpha;
grad_features_a[c][n_idx] +=
grad_outputs_a[b][c][j][i] * cum_alpha * alpha;
// Iterate over the nearer points (t < k) to update their alpha gradients
for (int t = 0; t < k; t++) {
int64_t t_idx = points_idx_a[b][t][j][i];
// Sentinel value is -1, indicating no point overlaps this pixel
if (t_idx < 0) {
continue;
}
float alpha_tvalue = alphas_a[b][t][j][i];
grad_alphas_a[b][t][j][i] -= grad_outputs_a[b][c][j][i] *
features_a[c][n_idx] * cum_alpha * alpha / (1 - alpha_tvalue);
}
cum_alpha = cum_alpha * (1 - alpha);
}
}
}
}
}
return std::make_tuple(grad_features, grad_alphas);
}

@@ -0,0 +1,202 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <vector>
__constant__ const float kEpsilon = 1e-4;
// TODO(gkioxari) support all data types once AtomicAdd supports doubles.
// Currently, support is for floats only.
__global__ void weightedSumNormCudaForwardKernel(
// clang-format off
torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> result,
const torch::PackedTensorAccessor<float, 2, torch::RestrictPtrTraits, size_t> features,
const torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> alphas,
const torch::PackedTensorAccessor<int64_t, 4, torch::RestrictPtrTraits, size_t> points_idx) {
// clang-format on
const int64_t batch_size = result.size(0);
const int64_t C = features.size(0);
const int64_t H = points_idx.size(2);
const int64_t W = points_idx.size(3);
// Get the batch and index
const int batch = blockIdx.x;
const int num_pixels = C * W * H;
const int num_threads = gridDim.y * blockDim.x;
const int tid = blockIdx.y * blockDim.x + threadIdx.x;
// Parallelize over each feature in each pixel in images of size H * W,
// for each image in the batch of size batch_size
for (int pid = tid; pid < num_pixels; pid += num_threads) {
int ch = pid / (W * H);
int j = (pid % (W * H)) / H;
int i = (pid % (W * H)) % H;
// Store the accumulated alpha value
float cum_alpha = 0.;
// Iterate through the closest K points for this pixel
for (int k = 0; k < points_idx.size(1); ++k) {
int n_idx = points_idx[batch][k][j][i];
// Sentinel value is -1 indicating no point overlaps the pixel
if (n_idx < 0) {
continue;
}
cum_alpha += alphas[batch][k][j][i];
}
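// Clamp the accumulated alpha away from zero so the normalization below never
// divides by (near) zero at sparsely covered pixels.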
if (cum_alpha < kEpsilon) {
cum_alpha = kEpsilon;
}
// Iterate through the closest K points for this pixel
for (int k = 0; k < points_idx.size(1); ++k) {
int n_idx = points_idx[batch][k][j][i];
// Sentinel value is -1 indicating no point overlaps the pixel
if (n_idx < 0) {
continue;
}
float alpha = alphas[batch][k][j][i];
// TODO(gkioxari) It might be more efficient to have threads write in a
// local variable, and move atomicAdd outside of the loop such that
// atomicAdd is executed once per thread.
atomicAdd(
&result[batch][ch][j][i], features[ch][n_idx] * alpha / cum_alpha);
}
}
}
// TODO(gkioxari) support all data types once AtomicAdd supports doubles.
// Currently, support is for floats only.
__global__ void weightedSumNormCudaBackwardKernel(
// clang-format off
torch::PackedTensorAccessor<float, 2, torch::RestrictPtrTraits, size_t> grad_features,
torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> grad_alphas,
const torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> grad_outputs,
const torch::PackedTensorAccessor<float, 2, torch::RestrictPtrTraits, size_t> features,
const torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> alphas,
const torch::PackedTensorAccessor<int64_t, 4, torch::RestrictPtrTraits, size_t> points_idx) {
// clang-format on
const int64_t batch_size = points_idx.size(0);
const int64_t C = features.size(0);
const int64_t H = points_idx.size(2);
const int64_t W = points_idx.size(3);
// Get the batch and index
const int batch = blockIdx.x;
const int num_pixels = C * W * H;
const int num_threads = gridDim.y * blockDim.x;
const int tid = blockIdx.y * blockDim.x + threadIdx.x;
// Parallelize over each feature in each pixel in images of size H * W,
// for each image in the batch of size batch_size
for (int pid = tid; pid < num_pixels; pid += num_threads) {
int ch = pid / (W * H);
int j = (pid % (W * H)) / H;
int i = (pid % (W * H)) % H;
float sum_alpha = 0.;
float sum_alphafs = 0.;
// Iterate through the closest K points for this pixel to calculate the
// cumulative sum of the alphas for this pixel
for (int k = 0; k < points_idx.size(1); ++k) {
int n_idx = points_idx[batch][k][j][i];
// Sentinel value is -1 indicating no point overlaps the pixel
if (n_idx < 0) {
continue;
}
sum_alpha += alphas[batch][k][j][i];
sum_alphafs += alphas[batch][k][j][i] * features[ch][n_idx];
}
if (sum_alpha < kEpsilon) {
sum_alpha = kEpsilon;
}
// Iterate again through the closest K points for this pixel to calculate
// the gradient.
for (int k = 0; k < points_idx.size(1); ++k) {
int n_idx = points_idx[batch][k][j][i];
// Sentinel value is -1 indicating no point overlaps the pixel
if (n_idx < 0) {
continue;
}
float alpha = alphas[batch][k][j][i];
// TODO(gkioxari) It might be more efficient to have threads write in a
// local variable, and move atomicAdd outside of the loop such that
// atomicAdd is executed once per thread.
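// For out = (sum_k alpha_k * f_k) / (sum_k alpha_k), the quotient rule gives
// d out / d alpha_k = (f_k * sum_alpha - sum_alphafs) / sum_alpha^2 and
// d out / d f_k = alpha_k / sum_alpha; these are the two terms accumulated
// below (ignoring the kEpsilon clamp on sum_alpha).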
atomicAdd(
&grad_alphas[batch][k][j][i],
(features[ch][n_idx] * sum_alpha - sum_alphafs) /
(sum_alpha * sum_alpha) * grad_outputs[batch][ch][j][i]);
atomicAdd(
&grad_features[ch][n_idx],
alpha * grad_outputs[batch][ch][j][i] / sum_alpha);
}
}
}
torch::Tensor weightedSumNormCudaForward(
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx) {
const int64_t batch_size = points_idx.size(0);
const int64_t C = features.size(0);
const int64_t H = points_idx.size(2);
const int64_t W = points_idx.size(3);
auto result = torch::zeros({batch_size, C, H, W}, features.options());
const dim3 threadsPerBlock(64);
const dim3 numBlocks(batch_size, 1024 / batch_size + 1);
// TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
// doubles. Currently, support is for floats only.
// clang-format off
weightedSumNormCudaForwardKernel<<<numBlocks, threadsPerBlock>>>(
result.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
features.packed_accessor<float, 2, torch::RestrictPtrTraits, size_t>(),
alphas.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
points_idx.packed_accessor<int64_t, 4, torch::RestrictPtrTraits, size_t>());
// clang-format on
return result;
}
std::tuple<torch::Tensor, torch::Tensor> weightedSumNormCudaBackward(
const torch::Tensor& grad_outputs,
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx) {
auto grad_features = torch::zeros_like(features);
auto grad_alphas = torch::zeros_like(alphas);
const int64_t bs = points_idx.size(0);
const dim3 threadsPerBlock(64);
const dim3 numBlocks(bs, 1024 / bs + 1);
// TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
// doubles. Currently, support is for floats only.
weightedSumNormCudaBackwardKernel<<<numBlocks, threadsPerBlock>>>(
// clang-format off
grad_features.packed_accessor<float, 2, torch::RestrictPtrTraits, size_t>(),
grad_alphas.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
grad_outputs.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
features.packed_accessor<float, 2, torch::RestrictPtrTraits, size_t>(),
alphas.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
points_idx.packed_accessor<int64_t, 4, torch::RestrictPtrTraits, size_t>());
// clang-format on
return std::make_tuple(grad_features, grad_alphas);
}

@@ -0,0 +1,109 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <torch/extension.h>
#include "pytorch3d_cutils.h"
#include <vector>
// Perform normalized weighted sum compositing of points in a z-buffer.
//
// Inputs:
// features: FloatTensor of shape (C, P) which gives the features
// of each point where C is the size of the feature and
// P the number of points.
// alphas: FloatTensor of shape (N, points_per_pixel, W, W) where
// points_per_pixel is the number of points in the z-buffer
// sorted in z-order, and W is the image size.
// points_idx: IntTensor of shape (N, points_per_pixel, W, W) giving the
// indices of the nearest points at each pixel, sorted in z-order.
// Returns:
// weighted_fs: FloatTensor of shape (N, C, W, W) giving the accumulated
// feature at each pixel. Concretely, it gives:
// weighted_fs[b,c,i,j] = (sum_k alphas[b,k,i,j] *
// features[c,points_idx[b,k,i,j]]) / (sum_k alphas[b,k,i,j])
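// Worked example (illustrative, not from the diff): with K = 3 and alphas
// 0.5, 0.4, 0.2 at one pixel, sum_k alphas = 1.1 and the per-point weights are
// 0.5/1.1, 0.4/1.1 and 0.2/1.1; unlike alpha compositing, the z-order of the
// points does not affect the result, only the relative alpha values do.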
// CUDA declarations
#ifdef WITH_CUDA
torch::Tensor weightedSumNormCudaForward(
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx);
std::tuple<torch::Tensor, torch::Tensor> weightedSumNormCudaBackward(
const torch::Tensor& grad_outputs,
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx);
#endif
// C++ declarations
torch::Tensor weightedSumNormCpuForward(
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx);
std::tuple<torch::Tensor, torch::Tensor> weightedSumNormCpuBackward(
const torch::Tensor& grad_outputs,
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx);
torch::Tensor weightedSumNormForward(
torch::Tensor& features,
torch::Tensor& alphas,
torch::Tensor& points_idx) {
features = features.contiguous();
alphas = alphas.contiguous();
points_idx = points_idx.contiguous();
if (features.type().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(features);
CHECK_CONTIGUOUS_CUDA(alphas);
CHECK_CONTIGUOUS_CUDA(points_idx);
#else
AT_ERROR("Not compiled with GPU support");
#endif
return weightedSumNormCudaForward(features, alphas, points_idx);
} else {
CHECK_CONTIGUOUS(features);
CHECK_CONTIGUOUS(alphas);
CHECK_CONTIGUOUS(points_idx);
return weightedSumNormCpuForward(features, alphas, points_idx);
}
}
std::tuple<torch::Tensor, torch::Tensor> weightedSumNormBackward(
torch::Tensor& grad_outputs,
torch::Tensor& features,
torch::Tensor& alphas,
torch::Tensor& points_idx) {
grad_outputs = grad_outputs.contiguous();
features = features.contiguous();
alphas = alphas.contiguous();
points_idx = points_idx.contiguous();
if (grad_outputs.type().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(grad_outputs);
CHECK_CONTIGUOUS_CUDA(features);
CHECK_CONTIGUOUS_CUDA(alphas);
CHECK_CONTIGUOUS_CUDA(points_idx);
#else
AT_ERROR("Not compiled with GPU support");
#endif
return weightedSumNormCudaBackward(
grad_outputs, features, alphas, points_idx);
} else {
CHECK_CONTIGUOUS(grad_outputs);
CHECK_CONTIGUOUS(features);
CHECK_CONTIGUOUS(alphas);
CHECK_CONTIGUOUS(points_idx);
return weightedSumNormCpuBackward(
grad_outputs, features, alphas, points_idx);
}
}

@@ -0,0 +1,134 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <torch/extension.h>
#include <cmath>
#include <vector>
// Epsilon float
const float kEps = 1e-4;
torch::Tensor weightedSumNormCpuForward(
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx) {
const int64_t B = points_idx.size(0);
const int64_t K = points_idx.size(1);
const int64_t H = points_idx.size(2);
const int64_t W = points_idx.size(3);
const int64_t C = features.size(0);
torch::Tensor result = torch::zeros({B, C, H, W}, features.options());
auto features_a = features.accessor<float, 2>();
auto alphas_a = alphas.accessor<float, 4>();
auto points_idx_a = points_idx.accessor<int64_t, 4>();
auto result_a = result.accessor<float, 4>();
// Iterate over the batch
for (int b = 0; b < B; ++b) {
// Iterate over the features
for (int c = 0; c < C; ++c) {
// Iterate through the horizontal lines of the image from top to bottom
for (int j = 0; j < H; ++j) {
// Iterate over pixels in a horizontal line, left to right
for (int i = 0; i < W; ++i) {
float t_alpha = 0.;
for (int k = 0; k < K; ++k) {
int64_t n_idx = points_idx_a[b][k][j][i];
if (n_idx < 0) {
continue;
}
t_alpha += alphas_a[b][k][j][i];
}
if (t_alpha < kEps) {
t_alpha = kEps;
}
// Iterate through the closest K points for this pixel and combine them
for (int k = 0; k < K; ++k) {
int64_t n_idx = points_idx_a[b][k][j][i];
// Sentinel value is -1 indicating no point overlaps the pixel
if (n_idx < 0) {
continue;
}
float alpha = alphas_a[b][k][j][i];
result_a[b][c][j][i] += alpha * features_a[c][n_idx] / t_alpha;
}
}
}
}
}
return result;
}
std::tuple<torch::Tensor, torch::Tensor> weightedSumNormCpuBackward(
const torch::Tensor& grad_outputs,
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx) {
torch::Tensor grad_features = torch::zeros_like(features);
torch::Tensor grad_alphas = torch::zeros_like(alphas);
const int64_t B = points_idx.size(0);
const int64_t K = points_idx.size(1);
const int64_t H = points_idx.size(2);
const int64_t W = points_idx.size(3);
const int64_t C = features.size(0);
auto grad_outputs_a = grad_outputs.accessor<float, 4>();
auto features_a = features.accessor<float, 2>();
auto alphas_a = alphas.accessor<float, 4>();
auto points_idx_a = points_idx.accessor<int64_t, 4>();
auto grad_features_a = grad_features.accessor<float, 2>();
auto grad_alphas_a = grad_alphas.accessor<float, 4>();
// Iterate over the batch
for (int b = 0; b < B; ++b) {
// Iterate over the features
for (int c = 0; c < C; ++c) {
// Iterate through the horizontal lines of the image from top to bottom
for (int j = 0; j < H; ++j) {
// Iterate over pixels in a horizontal line, left to right
for (int i = 0; i < W; ++i) {
float t_alpha = 0.;
float t_alphafs = 0.;
// Iterate through the closest K points for this pixel
for (int k = 0; k < K; ++k) {
int64_t n_idx = points_idx_a[b][k][j][i];
// Sentinel value is -1, indicating no point overlaps this pixel
if (n_idx < 0) {
continue;
}
t_alpha += alphas_a[b][k][j][i];
t_alphafs += alphas_a[b][k][j][i] * features_a[c][n_idx];
}
if (t_alpha < kEps) {
t_alpha = kEps;
}
// Iterate through the closest K points for this pixel ordered by z
// distance.
for (int k = 0; k < K; ++k) {
int64_t n_idx = points_idx_a[b][k][j][i];
// Sentinel value is -1 indicating no point overlaps the pixel
if (n_idx < 0) {
continue;
}
float alpha = alphas_a[b][k][j][i];
grad_alphas_a[b][k][j][i] += grad_outputs_a[b][c][j][i] *
(features_a[c][n_idx] * t_alpha - t_alphafs) /
(t_alpha * t_alpha);
grad_features_a[c][n_idx] +=
grad_outputs_a[b][c][j][i] * alpha / t_alpha;
}
}
}
}
}
return std::make_tuple(grad_features, grad_alphas);
}

@@ -0,0 +1,161 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <vector>
// TODO(gkioxari) support all data types once AtomicAdd supports doubles.
// Currently, support is for floats only.
__global__ void weightedSumCudaForwardKernel(
// clang-format off
torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> result,
const torch::PackedTensorAccessor<float, 2, torch::RestrictPtrTraits, size_t> features,
const torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> alphas,
const torch::PackedTensorAccessor<int64_t, 4, torch::RestrictPtrTraits, size_t> points_idx) {
// clang-format on
const int64_t batch_size = result.size(0);
const int64_t C = features.size(0);
const int64_t H = points_idx.size(2);
const int64_t W = points_idx.size(3);
// Get the batch and index
const int batch = blockIdx.x;
const int num_pixels = C * W * H;
const int num_threads = gridDim.y * blockDim.x;
const int tid = blockIdx.y * blockDim.x + threadIdx.x;
// Parallelize over each feature in each pixel in images of size H * W,
// for each image in the batch of size batch_size
for (int pid = tid; pid < num_pixels; pid += num_threads) {
int ch = pid / (W * H);
int j = (pid % (W * H)) / H;
int i = (pid % (W * H)) % H;
// Iterate through the closest K points for this pixel
for (int k = 0; k < points_idx.size(1); ++k) {
int n_idx = points_idx[batch][k][j][i];
// Sentinel value is -1 indicating no point overlaps the pixel
if (n_idx < 0) {
continue;
}
// Accumulate the values
float alpha = alphas[batch][k][j][i];
// TODO(gkioxari) It might be more efficient to have threads write in a
// local variable, and move atomicAdd outside of the loop such that
// atomicAdd is executed once per thread.
atomicAdd(&result[batch][ch][j][i], features[ch][n_idx] * alpha);
}
}
}
// TODO(gkioxari) support all data types once AtomicAdd supports doubles.
// Currently, support is for floats only.
__global__ void weightedSumCudaBackwardKernel(
// clang-format off
torch::PackedTensorAccessor<float, 2, torch::RestrictPtrTraits, size_t> grad_features,
torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> grad_alphas,
const torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> grad_outputs,
const torch::PackedTensorAccessor<float, 2, torch::RestrictPtrTraits, size_t> features,
const torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> alphas,
const torch::PackedTensorAccessor<int64_t, 4, torch::RestrictPtrTraits, size_t> points_idx) {
// clang-format on
const int64_t batch_size = points_idx.size(0);
const int64_t C = features.size(0);
const int64_t H = points_idx.size(2);
const int64_t W = points_idx.size(3);
// Get the batch and index
const int batch = blockIdx.x;
const int num_pixels = C * W * H;
const int num_threads = gridDim.y * blockDim.x;
const int tid = blockIdx.y * blockDim.x + threadIdx.x;
// Iterate over each pixel to compute the contribution to the
// gradient for the features and weights
for (int pid = tid; pid < num_pixels; pid += num_threads) {
int ch = pid / (W * H);
int j = (pid % (W * H)) / H;
int i = (pid % (W * H)) % H;
// Iterate through the closest K points for this pixel
for (int k = 0; k < points_idx.size(1); ++k) {
int n_idx = points_idx[batch][k][j][i];
// Sentinel value is -1 indicating no point overlaps the pixel
if (n_idx < 0) {
continue;
}
float alpha = alphas[batch][k][j][i];
// TODO(gkioxari) It might be more efficient to have threads write in a
// local variable, and move atomicAdd outside of the loop such that
// atomicAdd is executed once per thread.
atomicAdd(
&grad_alphas[batch][k][j][i],
features[ch][n_idx] * grad_outputs[batch][ch][j][i]);
atomicAdd(
&grad_features[ch][n_idx], alpha * grad_outputs[batch][ch][j][i]);
}
}
}
torch::Tensor weightedSumCudaForward(
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx) {
const int64_t batch_size = points_idx.size(0);
const int64_t C = features.size(0);
const int64_t H = points_idx.size(2);
const int64_t W = points_idx.size(3);
auto result = torch::zeros({batch_size, C, H, W}, features.options());
const dim3 threadsPerBlock(64);
const dim3 numBlocks(batch_size, 1024 / batch_size + 1);
// TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
// doubles. Currently, support is for floats only.
weightedSumCudaForwardKernel<<<numBlocks, threadsPerBlock>>>(
// clang-format off
result.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
features.packed_accessor<float, 2, torch::RestrictPtrTraits, size_t>(),
alphas.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
points_idx.packed_accessor<int64_t, 4, torch::RestrictPtrTraits, size_t>());
// clang-format on
return result;
}
std::tuple<torch::Tensor, torch::Tensor> weightedSumCudaBackward(
const torch::Tensor& grad_outputs,
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx) {
auto grad_features = torch::zeros_like(features);
auto grad_alphas = torch::zeros_like(alphas);
const int64_t bs = points_idx.size(0);
const dim3 threadsPerBlock(64);
const dim3 numBlocks(bs, 1024 / bs + 1);
// TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
// doubles. Currently, support is for floats only.
weightedSumCudaBackwardKernel<<<numBlocks, threadsPerBlock>>>(
// clang-format off
grad_features.packed_accessor<float, 2, torch::RestrictPtrTraits, size_t>(),
grad_alphas.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
grad_outputs.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
features.packed_accessor<float, 2, torch::RestrictPtrTraits, size_t>(),
alphas.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
points_idx.packed_accessor<int64_t, 4, torch::RestrictPtrTraits, size_t>());
// clang-format on
return std::make_tuple(grad_features, grad_alphas);
}

@@ -0,0 +1,107 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <torch/extension.h>
#include "pytorch3d_cutils.h"
#include <vector>
// Perform weighted sum compositing of points in a z-buffer.
//
// Inputs:
// features: FloatTensor of shape (C, P) which gives the features
// of each point where C is the size of the feature and
// P the number of points.
// alphas: FloatTensor of shape (N, points_per_pixel, W, W) where
// points_per_pixel is the number of points in the z-buffer
// sorted in z-order, and W is the image size.
// points_idx: IntTensor of shape (N, points_per_pixel, W, W) giving the
// indices of the nearest points at each pixel, sorted in z-order.
// Returns:
// weighted_fs: FloatTensor of shape (N, C, W, W) giving the accumulated
// feature at each pixel. Concretely, it gives:
// weighted_fs[b,c,i,j] = sum_k alphas[b,k,i,j] *
// features[c,points_idx[b,k,i,j]]
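// Note (illustrative, not from the diff): the weights are used as-is, so with
// alphas 0.5, 0.4, 0.2 at one pixel they sum to 1.1 and the output can exceed
// the range of the input features; norm_weighted_sum.h divides by this sum
// instead.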
// CUDA declarations
#ifdef WITH_CUDA
torch::Tensor weightedSumCudaForward(
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx);
std::tuple<torch::Tensor, torch::Tensor> weightedSumCudaBackward(
const torch::Tensor& grad_outputs,
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx);
#endif
// C++ declarations
torch::Tensor weightedSumCpuForward(
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx);
std::tuple<torch::Tensor, torch::Tensor> weightedSumCpuBackward(
const torch::Tensor& grad_outputs,
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx);
torch::Tensor weightedSumForward(
torch::Tensor& features,
torch::Tensor& alphas,
torch::Tensor& points_idx) {
features = features.contiguous();
alphas = alphas.contiguous();
points_idx = points_idx.contiguous();
if (features.type().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(features);
CHECK_CONTIGUOUS_CUDA(alphas);
CHECK_CONTIGUOUS_CUDA(points_idx);
#else
AT_ERROR("Not compiled with GPU support");
#endif
return weightedSumCudaForward(features, alphas, points_idx);
} else {
CHECK_CONTIGUOUS(features);
CHECK_CONTIGUOUS(alphas);
CHECK_CONTIGUOUS(points_idx);
return weightedSumCpuForward(features, alphas, points_idx);
}
}
std::tuple<torch::Tensor, torch::Tensor> weightedSumBackward(
torch::Tensor& grad_outputs,
torch::Tensor& features,
torch::Tensor& alphas,
torch::Tensor& points_idx) {
grad_outputs = grad_outputs.contiguous();
features = features.contiguous();
alphas = alphas.contiguous();
points_idx = points_idx.contiguous();
if (grad_outputs.type().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(grad_outputs);
CHECK_CONTIGUOUS_CUDA(features);
CHECK_CONTIGUOUS_CUDA(alphas);
CHECK_CONTIGUOUS_CUDA(points_idx);
#else
AT_ERROR("Not compiled with GPU support");
#endif
return weightedSumCudaBackward(grad_outputs, features, alphas, points_idx);
} else {
CHECK_CONTIGUOUS(grad_outputs);
CHECK_CONTIGUOUS(features);
CHECK_CONTIGUOUS(alphas);
CHECK_CONTIGUOUS(points_idx);
return weightedSumCpuBackward(grad_outputs, features, alphas, points_idx);
}
}

@@ -0,0 +1,98 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <torch/extension.h>
#include <cmath>
#include <vector>
torch::Tensor weightedSumCpuForward(
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx) {
const int64_t B = points_idx.size(0);
const int64_t K = points_idx.size(1);
const int64_t H = points_idx.size(2);
const int64_t W = points_idx.size(3);
const int64_t C = features.size(0);
torch::Tensor result = torch::zeros({B, C, H, W}, features.options());
auto features_a = features.accessor<float, 2>();
auto alphas_a = alphas.accessor<float, 4>();
auto points_idx_a = points_idx.accessor<int64_t, 4>();
auto result_a = result.accessor<float, 4>();
// Iterate over the batch
for (int b = 0; b < B; ++b) {
// Iterate over the features
for (int c = 0; c < C; ++c) {
// Iterate through the horizontal lines of the image from top to bottom
for (int j = 0; j < H; ++j) {
// Iterate over pixels in a horizontal line, left to right
for (int i = 0; i < W; ++i) {
// Iterate through the closest K points for this pixel
for (int k = 0; k < K; ++k) {
int64_t n_idx = points_idx_a[b][k][j][i];
// Sentinel value is -1 indicating no point overlaps the pixel
if (n_idx < 0) {
continue;
}
float alpha = alphas_a[b][k][j][i];
result_a[b][c][j][i] += alpha * features_a[c][n_idx];
}
}
}
}
}
return result;
}
std::tuple<torch::Tensor, torch::Tensor> weightedSumCpuBackward(
const torch::Tensor& grad_outputs,
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx) {
const int64_t B = points_idx.size(0);
const int64_t K = points_idx.size(1);
const int64_t H = points_idx.size(2);
const int64_t W = points_idx.size(3);
const int64_t C = features.size(0);
torch::Tensor grad_features = torch::zeros_like(features);
torch::Tensor grad_alphas = torch::zeros_like(alphas);
auto grad_outputs_a = grad_outputs.accessor<float, 4>();
auto features_a = features.accessor<float, 2>();
auto alphas_a = alphas.accessor<float, 4>();
auto points_idx_a = points_idx.accessor<int64_t, 4>();
auto grad_features_a = grad_features.accessor<float, 2>();
auto grad_alphas_a = grad_alphas.accessor<float, 4>();
// Iterate over the batch
for (int b = 0; b < B; ++b) {
// Iterate over the features
for (int c = 0; c < C; ++c) {
// Iterate through the horizontal lines of the image from top to bottom
for (int j = 0; j < H; ++j) {
// Iterate over pixels in a horizontal line, left to right
for (int i = 0; i < W; ++i) {
// Iterate through the closest K points for this pixel
for (int k = 0; k < K; ++k) {
int64_t n_idx = points_idx_a[b][k][j][i];
// Sentinel value is -1, indicating no point overlaps this pixel
if (n_idx < 0) {
continue;
}
float alpha = alphas_a[b][k][j][i];
grad_alphas_a[b][k][j][i] +=
grad_outputs_a[b][c][j][i] * features_a[c][n_idx];
grad_features_a[c][n_idx] += grad_outputs_a[b][c][j][i] * alpha;
}
}
}
}
}
return std::make_tuple(grad_features, grad_alphas);
}

@@ -1,6 +1,9 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <torch/extension.h>
#include "compositing/alpha_composite.h"
#include "compositing/norm_weighted_sum.h"
#include "compositing/weighted_sum.h"
#include "face_areas_normals/face_areas_normals.h"
#include "gather_scatter/gather_scatter.h"
#include "nearest_neighbor_points/nearest_neighbor_points.h"
@@ -20,6 +23,14 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("rasterize_meshes_backward", &RasterizeMeshesBackward);
m.def("rasterize_meshes", &RasterizeMeshes);
// Accumulation functions
m.def("accum_weightedsumnorm", &weightedSumNormForward);
m.def("accum_weightedsum", &weightedSumForward);
m.def("accum_alphacomposite", &alphaCompositeForward);
m.def("accum_weightedsumnorm_backward", &weightedSumNormBackward);
m.def("accum_weightedsum_backward", &weightedSumBackward);
m.def("accum_alphacomposite_backward", &alphaCompositeBackward);
// These are only visible for testing; users should not call them directly
m.def("_rasterize_points_coarse", &RasterizePointsCoarse);
m.def("_rasterize_points_naive", &RasterizePointsNaive);