Accumulate points (#4)

Summary:
Code for accumulating point features in the z-buffer in three ways:
1. weighted sum
2. normalised weighted sum
3. alpha compositing
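
For context, a minimal single-pixel sketch of the three accumulation rules
(written for this summary, not taken from the diff; the function name, container
types and scalar loop are illustrative only):

#include <algorithm>
#include <vector>

// One channel of one pixel: `alphas` and `idx` are the K z-ordered alpha values
// and point indices for that pixel, `features` holds one channel per point.
// An index of -1 means no point covers the pixel at that slot.
struct PixelOut { float wsum, wsumnorm, alphacomposite; };

PixelOut composite_one_pixel(const std::vector<float>& alphas,
                             const std::vector<int>& idx,
                             const std::vector<float>& features) {
  PixelOut out{0.f, 0.f, 0.f};
  float sum_alpha = 0.f;
  float cum_alpha = 1.f;  // remaining transmittance (alpha compositing)
  for (size_t k = 0; k < idx.size(); ++k) {
    if (idx[k] < 0) continue;                        // sentinel: empty slot
    const float a = alphas[k];
    const float f = features[idx[k]];
    out.wsum += a * f;                               // 1. weighted sum
    sum_alpha += a;
    out.alphacomposite += cum_alpha * a * f;         // 3. alpha compositing
    cum_alpha *= (1.f - a);
  }
  // 2. normalised weighted sum, with the same epsilon clamp the kernels use
  out.wsumnorm = out.wsum / std::max(sum_alpha, 1e-4f);
  return out;
}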

Pull Request resolved: https://github.com/fairinternal/pytorch3d/pull/4

Reviewed By: nikhilaravi

Differential Revision: D20522422

Pulled By: gkioxari

fbshipit-source-id: 5023baa05f15e338f3821ef08f5552c2dcbfc06c
Author: Olivia
Date: 2020-03-19 11:19:39 -07:00
Committed by: Facebook GitHub Bot
Parent: 5218f45c2c
Commit: 53599770dd

21 changed files with 2466 additions and 4 deletions

@@ -0,0 +1,187 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <vector>
// TODO(gkioxari) support all data types once AtomicAdd supports doubles.
// Currently, support is for floats only.
__global__ void alphaCompositeCudaForwardKernel(
// clang-format off
torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> result,
const torch::PackedTensorAccessor<float, 2, torch::RestrictPtrTraits, size_t> features,
const torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> alphas,
const torch::PackedTensorAccessor<int64_t, 4, torch::RestrictPtrTraits, size_t> points_idx) {
// clang-format on
const int64_t batch_size = result.size(0);
const int64_t C = features.size(0);
const int64_t H = points_idx.size(2);
const int64_t W = points_idx.size(3);
// Get the batch and index
const int batch = blockIdx.x;
const int num_pixels = C * W * H;
const int num_threads = gridDim.y * blockDim.x;
const int tid = blockIdx.y * blockDim.x + threadIdx.x;
// Iterate over each feature in each pixel
for (int pid = tid; pid < num_pixels; pid += num_threads) {
int ch = pid / (W * H);
int j = (pid % (W * H)) / H;
int i = (pid % (W * H)) % H;
// Alpha-composite the per-point contributions, front to back
float cum_alpha = 1.;
// Iterate through the closest K points for this pixel
for (int k = 0; k < points_idx.size(1); ++k) {
int n_idx = points_idx[batch][k][j][i];
// Sentinel value is -1 indicating no point overlaps the pixel
if (n_idx < 0) {
continue;
}
float alpha = alphas[batch][k][j][i];
// TODO(gkioxari) It might be more efficient to have threads write in a
// local variable, and move atomicAdd outside of the loop such that
// atomicAdd is executed once per thread.
atomicAdd(
&result[batch][ch][j][i], features[ch][n_idx] * cum_alpha * alpha);
cum_alpha = cum_alpha * (1 - alpha);
}
}
}
// TODO(gkioxari) support all data types once AtomicAdd supports doubles.
// Currently, support is for floats only.
__global__ void alphaCompositeCudaBackwardKernel(
// clang-format off
torch::PackedTensorAccessor<float, 2, torch::RestrictPtrTraits, size_t> grad_features,
torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> grad_alphas,
const torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> grad_outputs,
const torch::PackedTensorAccessor<float, 2, torch::RestrictPtrTraits, size_t> features,
const torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> alphas,
const torch::PackedTensorAccessor<int64_t, 4, torch::RestrictPtrTraits, size_t> points_idx) {
// clang-format on
const int64_t batch_size = points_idx.size(0);
const int64_t C = features.size(0);
const int64_t H = points_idx.size(2);
const int64_t W = points_idx.size(3);
// Get the batch and index
const int batch = blockIdx.x;
const int num_pixels = C * W * H;
const int num_threads = gridDim.y * blockDim.x;
const int tid = blockIdx.y * blockDim.x + threadIdx.x;
// Parallelize over each feature in each pixel in images of size H * W,
// for each image in the batch of size batch_size
for (int pid = tid; pid < num_pixels; pid += num_threads) {
int ch = pid / (W * H);
int j = (pid % (W * H)) / H;
int i = (pid % (W * H)) % H;
// Alpha-composite the per-point contributions, front to back
float cum_alpha = 1.;
// Iterate through the closest K points for this pixel
for (int k = 0; k < points_idx.size(1); ++k) {
int n_idx = points_idx[batch][k][j][i];
// Sentinel value is -1 indicating no point overlaps the pixel
if (n_idx < 0) {
continue;
}
float alpha = alphas[batch][k][j][i];
// TODO(gkioxari) It might be more efficient to have threads write in a
// local variable, and move atomicAdd outside of the loop such that
// atomicAdd is executed once per thread.
atomicAdd(
&grad_alphas[batch][k][j][i],
cum_alpha * features[ch][n_idx] * grad_outputs[batch][ch][j][i]);
atomicAdd(
&grad_features[ch][n_idx],
cum_alpha * alpha * grad_outputs[batch][ch][j][i]);
// Iterate over the nearer points (t < k) to update their alpha gradients
for (int t = 0; t < k; ++t) {
int t_idx = points_idx[batch][t][j][i];
// Sentinel value is -1, indicating no point overlaps this pixel
if (t_idx < 0) {
continue;
}
float alpha_tvalue = alphas[batch][t][j][i];
// TODO(gkioxari) It might be more efficient to have threads write in a
// local variable, and move atomicAdd outside of the loop such that
// atomicAdd is executed once per thread.
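// Each weight w_k = alpha_k * prod_{l<k} (1 - alpha_l) = alpha * cum_alpha
// contains the factor (1 - alpha_t) for every nearer point t < k, so
// d w_k / d alpha_t = -w_k / (1 - alpha_t), which is the term accumulated below.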
atomicAdd(
&grad_alphas[batch][t][j][i],
-grad_outputs[batch][ch][j][i] * features[ch][n_idx] * cum_alpha *
alpha / (1 - alpha_tvalue));
}
cum_alpha = cum_alpha * (1 - alphas[batch][k][j][i]);
}
}
}
torch::Tensor alphaCompositeCudaForward(
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx) {
const int64_t batch_size = points_idx.size(0);
const int64_t C = features.size(0);
const int64_t H = points_idx.size(2);
const int64_t W = points_idx.size(3);
auto result = torch::zeros({batch_size, C, H, W}, features.options());
const dim3 threadsPerBlock(64);
const dim3 numBlocks(batch_size, 1024 / batch_size + 1);
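// Launch configuration (as written above): gridDim.x indexes the batch and
// gridDim.y appears to be chosen so the total number of blocks is roughly 1024;
// each thread then strides over the C * H * W feature-pixel entries of its image.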
// TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
// doubles. Currently, support is for floats only.
alphaCompositeCudaForwardKernel<<<numBlocks, threadsPerBlock>>>(
// clang-format off
result.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
features.packed_accessor<float, 2, torch::RestrictPtrTraits, size_t>(),
alphas.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
points_idx.packed_accessor<int64_t, 4, torch::RestrictPtrTraits, size_t>());
// clang-format on
return result;
}
std::tuple<torch::Tensor, torch::Tensor> alphaCompositeCudaBackward(
const torch::Tensor& grad_outputs,
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx) {
auto grad_features = torch::zeros_like(features);
auto grad_alphas = torch::zeros_like(alphas);
const int64_t bs = alphas.size(0);
const dim3 threadsPerBlock(64);
const dim3 numBlocks(bs, 1024 / bs + 1);
// TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
// doubles. Currently, support is for floats only.
alphaCompositeCudaBackwardKernel<<<numBlocks, threadsPerBlock>>>(
// clang-format off
grad_features.packed_accessor<float, 2, torch::RestrictPtrTraits, size_t>(),
grad_alphas.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
grad_outputs.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
features.packed_accessor<float, 2, torch::RestrictPtrTraits, size_t>(),
alphas.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
points_idx.packed_accessor<int64_t, 4, torch::RestrictPtrTraits, size_t>());
// clang-format on
return std::make_tuple(grad_features, grad_alphas);
}

@@ -0,0 +1,110 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <torch/extension.h>
#include "pytorch3d_cutils.h"
#include <vector>
// Perform alpha compositing of points in a z-buffer.
//
// Inputs:
// features: FloatTensor of shape (C, P) which gives the features
// of each point where C is the size of the feature and
// P the number of points.
// alphas: FloatTensor of shape (N, points_per_pixel, W, W) where
// points_per_pixel is the number of points in the z-buffer
// sorted in z-order, and W is the image size.
// points_idx: IntTensor of shape (N, points_per_pixel, W, W) giving the
// indices of the nearest points at each pixel, sorted in z-order.
// Returns:
// weighted_fs: FloatTensor of shape (N, C, W, W) giving the accumulated
// feature at each pixel. Concretely, it gives:
// weighted_fs[b,c,i,j] = sum_k cum_alpha_k *
// features[c,points_idx[b,k,i,j]]
// where cum_alpha_k =
// alphas[b,k,i,j] * prod_l=0..k-1 (1 - alphas[b,l,i,j])
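// Worked example (illustrative, not from the diff): with K = 3 and alphas
// 0.5, 0.4, 0.2 at one pixel, the per-point weights are
// w_0 = 0.5, w_1 = (1 - 0.5) * 0.4 = 0.2, w_2 = (1 - 0.5) * (1 - 0.4) * 0.2 = 0.06,
// so nearer points dominate and the weights sum to at most 1.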
// CUDA declarations
#ifdef WITH_CUDA
torch::Tensor alphaCompositeCudaForward(
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx);
std::tuple<torch::Tensor, torch::Tensor> alphaCompositeCudaBackward(
const torch::Tensor& grad_outputs,
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx);
#endif
// C++ declarations
torch::Tensor alphaCompositeCpuForward(
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx);
std::tuple<torch::Tensor, torch::Tensor> alphaCompositeCpuBackward(
const torch::Tensor& grad_outputs,
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx);
torch::Tensor alphaCompositeForward(
torch::Tensor& features,
torch::Tensor& alphas,
torch::Tensor& points_idx) {
features = features.contiguous();
alphas = alphas.contiguous();
points_idx = points_idx.contiguous();
if (features.type().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(features);
CHECK_CONTIGUOUS_CUDA(alphas);
CHECK_CONTIGUOUS_CUDA(points_idx);
#else
AT_ERROR("Not compiled with GPU support");
#endif
return alphaCompositeCudaForward(features, alphas, points_idx);
} else {
CHECK_CONTIGUOUS(features);
CHECK_CONTIGUOUS(alphas);
CHECK_CONTIGUOUS(points_idx);
return alphaCompositeCpuForward(features, alphas, points_idx);
}
}
std::tuple<torch::Tensor, torch::Tensor> alphaCompositeBackward(
torch::Tensor& grad_outputs,
torch::Tensor& features,
torch::Tensor& alphas,
torch::Tensor& points_idx) {
grad_outputs = grad_outputs.contiguous();
features = features.contiguous();
alphas = alphas.contiguous();
points_idx = points_idx.contiguous();
if (grad_outputs.type().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(grad_outputs);
CHECK_CONTIGUOUS_CUDA(features);
CHECK_CONTIGUOUS_CUDA(alphas);
CHECK_CONTIGUOUS_CUDA(points_idx);
#else
AT_ERROR("Not compiled with GPU support");
#endif
return alphaCompositeCudaBackward(
grad_outputs, features, alphas, points_idx);
} else {
CHECK_CONTIGUOUS(grad_outputs);
CHECK_CONTIGUOUS(features);
CHECK_CONTIGUOUS(alphas);
CHECK_CONTIGUOUS(points_idx);
return alphaCompositeCpuBackward(
grad_outputs, features, alphas, points_idx);
}
}
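
A hedged usage sketch of the dispatch entry point defined above, using the tensor
shapes from the doc comment (C = 3, P = 5, N = 1, points_per_pixel = 2, W = H = 4);
the driver below is illustrative and not part of the diff:

// Assumes this header is included and the CPU implementation is linked.
#include <iostream>
int main() {
  torch::Tensor features = torch::rand({3, 5});               // (C, P)
  torch::Tensor alphas = torch::rand({1, 2, 4, 4});           // (N, K, W, W), values in [0, 1)
  torch::Tensor points_idx =
      torch::randint(-1, 5, {1, 2, 4, 4}, torch::kInt64);     // (N, K, W, W), -1 = empty
  torch::Tensor out = alphaCompositeForward(features, alphas, points_idx);
  std::cout << out.sizes() << std::endl;                      // [1, 3, 4, 4]
  return 0;
}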

@@ -0,0 +1,114 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <torch/extension.h>
#include <cmath>
#include <vector>
torch::Tensor alphaCompositeCpuForward(
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx) {
const int64_t B = points_idx.size(0);
const int64_t K = points_idx.size(1);
const int64_t H = points_idx.size(2);
const int64_t W = points_idx.size(3);
const int64_t C = features.size(0);
torch::Tensor result = torch::zeros({B, C, H, W}, features.options());
auto features_a = features.accessor<float, 2>();
auto alphas_a = alphas.accessor<float, 4>();
auto points_idx_a = points_idx.accessor<int64_t, 4>();
auto result_a = result.accessor<float, 4>();
// Iterate over the batch
for (int b = 0; b < B; ++b) {
// Iterate over the features
for (int c = 0; c < C; ++c) {
// Iterate through the horizontal lines of the image from top to bottom
for (int j = 0; j < H; ++j) {
// Iterate over pixels in a horizontal line, left to right
for (int i = 0; i < W; ++i) {
float cum_alpha = 1.;
// Iterate through the closest K points for this pixel
for (int k = 0; k < K; ++k) {
int64_t n_idx = points_idx_a[b][k][j][i];
// Sentinel value is -1 indicating no point overlaps the pixel
if (n_idx < 0) {
continue;
}
float alpha = alphas_a[b][k][j][i];
result_a[b][c][j][i] += cum_alpha * alpha * features_a[c][n_idx];
cum_alpha = cum_alpha * (1 - alpha);
}
}
}
}
}
return result;
}
std::tuple<torch::Tensor, torch::Tensor> alphaCompositeCpuBackward(
const torch::Tensor& grad_outputs,
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx) {
torch::Tensor grad_features = torch::zeros_like(features);
torch::Tensor grad_alphas = torch::zeros_like(alphas);
const int64_t B = points_idx.size(0);
const int64_t K = points_idx.size(1);
const int64_t H = points_idx.size(2);
const int64_t W = points_idx.size(3);
const int64_t C = features.size(0);
auto grad_outputs_a = grad_outputs.accessor<float, 4>();
auto features_a = features.accessor<float, 2>();
auto alphas_a = alphas.accessor<float, 4>();
auto points_idx_a = points_idx.accessor<int64_t, 4>();
auto grad_features_a = grad_features.accessor<float, 2>();
auto grad_alphas_a = grad_alphas.accessor<float, 4>();
// Iterate over the batch
for (int b = 0; b < B; ++b) {
// Iterate over the features
for (int c = 0; c < C; ++c) {
// Iterate through the horizontal lines of the image from top to bottom
for (int j = 0; j < H; ++j) {
// Iterate over pixels in a horizontal line, left to right
for (int i = 0; i < W; ++i) {
float cum_alpha = 1.;
// Iterate through the closest K points for this pixel
for (int k = 0; k < K; ++k) {
int64_t n_idx = points_idx_a[b][k][j][i];
// Sentinel value is -1, indicating no point overlaps this pixel
if (n_idx < 0) {
continue;
}
float alpha = alphas_a[b][k][j][i];
grad_alphas_a[b][k][j][i] +=
grad_outputs_a[b][c][j][i] * features_a[c][n_idx] * cum_alpha;
grad_features_a[c][n_idx] +=
grad_outputs_a[b][c][j][i] * cum_alpha * alpha;
// Iterate over the nearer points (t < k) to update their alpha gradients
for (int t = 0; t < k; t++) {
int64_t t_idx = points_idx_a[b][t][j][i];
// Sentinel value is -1, indicating no point overlaps this pixel
if (t_idx < 0) {
continue;
}
float alpha_tvalue = alphas_a[b][t][j][i];
grad_alphas_a[b][t][j][i] -= grad_outputs_a[b][c][j][i] *
features_a[c][n_idx] * cum_alpha * alpha / (1 - alpha_tvalue);
}
cum_alpha = cum_alpha * (1 - alpha);
}
}
}
}
}
return std::make_tuple(grad_features, grad_alphas);
}

@@ -0,0 +1,202 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <vector>
__constant__ const float kEpsilon = 1e-4;
// TODO(gkioxari) support all data types once AtomicAdd supports doubles.
// Currently, support is for floats only.
__global__ void weightedSumNormCudaForwardKernel(
// clang-format off
torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> result,
const torch::PackedTensorAccessor<float, 2, torch::RestrictPtrTraits, size_t> features,
const torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> alphas,
const torch::PackedTensorAccessor<int64_t, 4, torch::RestrictPtrTraits, size_t> points_idx) {
// clang-format on
const int64_t batch_size = result.size(0);
const int64_t C = features.size(0);
const int64_t H = points_idx.size(2);
const int64_t W = points_idx.size(3);
// Get the batch and index
const int batch = blockIdx.x;
const int num_pixels = C * W * H;
const int num_threads = gridDim.y * blockDim.x;
const int tid = blockIdx.y * blockDim.x + threadIdx.x;
// Parallelize over each feature in each pixel in images of size H * W,
// for each image in the batch of size batch_size
for (int pid = tid; pid < num_pixels; pid += num_threads) {
int ch = pid / (W * H);
int j = (pid % (W * H)) / H;
int i = (pid % (W * H)) % H;
// Store the accumulated alpha value
float cum_alpha = 0.;
// Iterate through the closest K points for this pixel
for (int k = 0; k < points_idx.size(1); ++k) {
int n_idx = points_idx[batch][k][j][i];
// Sentinel value is -1 indicating no point overlaps the pixel
if (n_idx < 0) {
continue;
}
cum_alpha += alphas[batch][k][j][i];
}
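// Clamp the accumulated alpha away from zero so the normalization below never
// divides by (near) zero at sparsely covered pixels.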
if (cum_alpha < kEpsilon) {
cum_alpha = kEpsilon;
}
// Iterate through the closest K points for this pixel
for (int k = 0; k < points_idx.size(1); ++k) {
int n_idx = points_idx[batch][k][j][i];
// Sentinel value is -1 indicating no point overlaps the pixel
if (n_idx < 0) {
continue;
}
float alpha = alphas[batch][k][j][i];
// TODO(gkioxari) It might be more efficient to have threads write in a
// local variable, and move atomicAdd outside of the loop such that
// atomicAdd is executed once per thread.
atomicAdd(
&result[batch][ch][j][i], features[ch][n_idx] * alpha / cum_alpha);
}
}
}
// TODO(gkioxari) support all data types once AtomicAdd supports doubles.
// Currently, support is for floats only.
__global__ void weightedSumNormCudaBackwardKernel(
// clang-format off
torch::PackedTensorAccessor<float, 2, torch::RestrictPtrTraits, size_t> grad_features,
torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> grad_alphas,
const torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> grad_outputs,
const torch::PackedTensorAccessor<float, 2, torch::RestrictPtrTraits, size_t> features,
const torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> alphas,
const torch::PackedTensorAccessor<int64_t, 4, torch::RestrictPtrTraits, size_t> points_idx) {
// clang-format on
const int64_t batch_size = points_idx.size(0);
const int64_t C = features.size(0);
const int64_t H = points_idx.size(2);
const int64_t W = points_idx.size(3);
// Get the batch and index
const int batch = blockIdx.x;
const int num_pixels = C * W * H;
const int num_threads = gridDim.y * blockDim.x;
const int tid = blockIdx.y * blockDim.x + threadIdx.x;
// Parallelize over each feature in each pixel in images of size H * W,
// for each image in the batch of size batch_size
for (int pid = tid; pid < num_pixels; pid += num_threads) {
int ch = pid / (W * H);
int j = (pid % (W * H)) / H;
int i = (pid % (W * H)) % H;
float sum_alpha = 0.;
float sum_alphafs = 0.;
// Iterate through the closest K points for this pixel to calculate the
// cumulative sum of the alphas for this pixel
for (int k = 0; k < points_idx.size(1); ++k) {
int n_idx = points_idx[batch][k][j][i];
// Sentinel value is -1 indicating no point overlaps the pixel
if (n_idx < 0) {
continue;
}
sum_alpha += alphas[batch][k][j][i];
sum_alphafs += alphas[batch][k][j][i] * features[ch][n_idx];
}
if (sum_alpha < kEpsilon) {
sum_alpha = kEpsilon;
}
// Iterate again through the closest K points for this pixel to calculate
// the gradient.
for (int k = 0; k < points_idx.size(1); ++k) {
int n_idx = points_idx[batch][k][j][i];
// Sentinel value is -1 indicating no point overlaps the pixel
if (n_idx < 0) {
continue;
}
float alpha = alphas[batch][k][j][i];
// TODO(gkioxari) It might be more efficient to have threads write in a
// local variable, and move atomicAdd outside of the loop such that
// atomicAdd is executed once per thread.
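// For out = (sum_k alpha_k * f_k) / (sum_k alpha_k), the quotient rule gives
// d out / d alpha_k = (f_k * sum_alpha - sum_alphafs) / sum_alpha^2 and
// d out / d f_k = alpha_k / sum_alpha; these are the two terms accumulated
// below (ignoring the kEpsilon clamp on sum_alpha).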
atomicAdd(
&grad_alphas[batch][k][j][i],
(features[ch][n_idx] * sum_alpha - sum_alphafs) /
(sum_alpha * sum_alpha) * grad_outputs[batch][ch][j][i]);
atomicAdd(
&grad_features[ch][n_idx],
alpha * grad_outputs[batch][ch][j][i] / sum_alpha);
}
}
}
torch::Tensor weightedSumNormCudaForward(
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx) {
const int64_t batch_size = points_idx.size(0);
const int64_t C = features.size(0);
const int64_t H = points_idx.size(2);
const int64_t W = points_idx.size(3);
auto result = torch::zeros({batch_size, C, H, W}, features.options());
const dim3 threadsPerBlock(64);
const dim3 numBlocks(batch_size, 1024 / batch_size + 1);
// TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
// doubles. Currently, support is for floats only.
// clang-format off
weightedSumNormCudaForwardKernel<<<numBlocks, threadsPerBlock>>>(
result.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
features.packed_accessor<float, 2, torch::RestrictPtrTraits, size_t>(),
alphas.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
points_idx.packed_accessor<int64_t, 4, torch::RestrictPtrTraits, size_t>());
// clang-format on
return result;
}
std::tuple<torch::Tensor, torch::Tensor> weightedSumNormCudaBackward(
const torch::Tensor& grad_outputs,
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx) {
auto grad_features = torch::zeros_like(features);
auto grad_alphas = torch::zeros_like(alphas);
const int64_t bs = points_idx.size(0);
const dim3 threadsPerBlock(64);
const dim3 numBlocks(bs, 1024 / bs + 1);
// TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
// doubles. Currently, support is for floats only.
weightedSumNormCudaBackwardKernel<<<numBlocks, threadsPerBlock>>>(
// clang-format off
grad_features.packed_accessor<float, 2, torch::RestrictPtrTraits, size_t>(),
grad_alphas.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
grad_outputs.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
features.packed_accessor<float, 2, torch::RestrictPtrTraits, size_t>(),
alphas.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
points_idx.packed_accessor<int64_t, 4, torch::RestrictPtrTraits, size_t>());
// clang-format on
return std::make_tuple(grad_features, grad_alphas);
}

@@ -0,0 +1,109 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <torch/extension.h>
#include "pytorch3d_cutils.h"
#include <vector>
// Perform normalized weighted sum compositing of points in a z-buffer.
//
// Inputs:
// features: FloatTensor of shape (C, P) which gives the features
// of each point where C is the size of the feature and
// P the number of points.
// alphas: FloatTensor of shape (N, points_per_pixel, W, W) where
// points_per_pixel is the number of points in the z-buffer
// sorted in z-order, and W is the image size.
// points_idx: IntTensor of shape (N, points_per_pixel, W, W) giving the
// indices of the nearest points at each pixel, sorted in z-order.
// Returns:
// weighted_fs: FloatTensor of shape (N, C, W, W) giving the accumulated
// feature at each pixel. Concretely, it gives:
// weighted_fs[b,c,i,j] = (sum_k alphas[b,k,i,j] *
// features[c,points_idx[b,k,i,j]]) / (sum_k alphas[b,k,i,j])
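// Worked example (illustrative, not from the diff): with K = 3 and alphas
// 0.5, 0.4, 0.2 at one pixel, sum_k alphas = 1.1 and the per-point weights are
// 0.5/1.1, 0.4/1.1 and 0.2/1.1; unlike alpha compositing, the z-order of the
// points does not affect the result, only the relative alpha values do.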
// CUDA declarations
#ifdef WITH_CUDA
torch::Tensor weightedSumNormCudaForward(
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx);
std::tuple<torch::Tensor, torch::Tensor> weightedSumNormCudaBackward(
const torch::Tensor& grad_outputs,
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx);
#endif
// C++ declarations
torch::Tensor weightedSumNormCpuForward(
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx);
std::tuple<torch::Tensor, torch::Tensor> weightedSumNormCpuBackward(
const torch::Tensor& grad_outputs,
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx);
torch::Tensor weightedSumNormForward(
torch::Tensor& features,
torch::Tensor& alphas,
torch::Tensor& points_idx) {
features = features.contiguous();
alphas = alphas.contiguous();
points_idx = points_idx.contiguous();
if (features.type().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(features);
CHECK_CONTIGUOUS_CUDA(alphas);
CHECK_CONTIGUOUS_CUDA(points_idx);
#else
AT_ERROR("Not compiled with GPU support");
#endif
return weightedSumNormCudaForward(features, alphas, points_idx);
} else {
CHECK_CONTIGUOUS(features);
CHECK_CONTIGUOUS(alphas);
CHECK_CONTIGUOUS(points_idx);
return weightedSumNormCpuForward(features, alphas, points_idx);
}
}
std::tuple<torch::Tensor, torch::Tensor> weightedSumNormBackward(
torch::Tensor& grad_outputs,
torch::Tensor& features,
torch::Tensor& alphas,
torch::Tensor& points_idx) {
grad_outputs = grad_outputs.contiguous();
features = features.contiguous();
alphas = alphas.contiguous();
points_idx = points_idx.contiguous();
if (grad_outputs.type().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(grad_outputs);
CHECK_CONTIGUOUS_CUDA(features);
CHECK_CONTIGUOUS_CUDA(alphas);
CHECK_CONTIGUOUS_CUDA(points_idx);
#else
AT_ERROR("Not compiled with GPU support");
#endif
return weightedSumNormCudaBackward(
grad_outputs, features, alphas, points_idx);
} else {
CHECK_CONTIGUOUS(grad_outputs);
CHECK_CONTIGUOUS(features);
CHECK_CONTIGUOUS(alphas);
CHECK_CONTIGUOUS(points_idx);
return weightedSumNormCpuBackward(
grad_outputs, features, alphas, points_idx);
}
}

@@ -0,0 +1,134 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <torch/extension.h>
#include <cmath>
#include <vector>
// Epsilon float
const float kEps = 1e-4;
torch::Tensor weightedSumNormCpuForward(
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx) {
const int64_t B = points_idx.size(0);
const int64_t K = points_idx.size(1);
const int64_t H = points_idx.size(2);
const int64_t W = points_idx.size(3);
const int64_t C = features.size(0);
torch::Tensor result = torch::zeros({B, C, H, W}, features.options());
auto features_a = features.accessor<float, 2>();
auto alphas_a = alphas.accessor<float, 4>();
auto points_idx_a = points_idx.accessor<int64_t, 4>();
auto result_a = result.accessor<float, 4>();
// Iterate over the batch
for (int b = 0; b < B; ++b) {
// Iterate over the features
for (int c = 0; c < C; ++c) {
// Iterate through the horizontal lines of the image from top to bottom
for (int j = 0; j < H; ++j) {
// Iterate over pixels in a horizontal line, left to right
for (int i = 0; i < W; ++i) {
float t_alpha = 0.;
for (int k = 0; k < K; ++k) {
int64_t n_idx = points_idx_a[b][k][j][i];
if (n_idx < 0) {
continue;
}
t_alpha += alphas_a[b][k][j][i];
}
if (t_alpha < kEps) {
t_alpha = kEps;
}
// Iterate through the closest K points for this pixel and combine them
for (int k = 0; k < K; ++k) {
int64_t n_idx = points_idx_a[b][k][j][i];
// Sentinel value is -1 indicating no point overlaps the pixel
if (n_idx < 0) {
continue;
}
float alpha = alphas_a[b][k][j][i];
result_a[b][c][j][i] += alpha * features_a[c][n_idx] / t_alpha;
}
}
}
}
}
return result;
}
std::tuple<torch::Tensor, torch::Tensor> weightedSumNormCpuBackward(
const torch::Tensor& grad_outputs,
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx) {
torch::Tensor grad_features = torch::zeros_like(features);
torch::Tensor grad_alphas = torch::zeros_like(alphas);
const int64_t B = points_idx.size(0);
const int64_t K = points_idx.size(1);
const int64_t H = points_idx.size(2);
const int64_t W = points_idx.size(3);
const int64_t C = features.size(0);
auto grad_outputs_a = grad_outputs.accessor<float, 4>();
auto features_a = features.accessor<float, 2>();
auto alphas_a = alphas.accessor<float, 4>();
auto points_idx_a = points_idx.accessor<int64_t, 4>();
auto grad_features_a = grad_features.accessor<float, 2>();
auto grad_alphas_a = grad_alphas.accessor<float, 4>();
// Iterate over the batch
for (int b = 0; b < B; ++b) {
// Iterate over the features
for (int c = 0; c < C; ++c) {
// Iterate through the horizontal lines of the image from top to bottom
for (int j = 0; j < H; ++j) {
// Iterate over pixels in a horizontal line, left to right
for (int i = 0; i < W; ++i) {
float t_alpha = 0.;
float t_alphafs = 0.;
// Iterate through the closest K points for this pixel
for (int k = 0; k < K; ++k) {
int64_t n_idx = points_idx_a[b][k][j][i];
// Sentinel value is -1, indicating no point overlaps this pixel
if (n_idx < 0) {
continue;
}
t_alpha += alphas_a[b][k][j][i];
t_alphafs += alphas_a[b][k][j][i] * features_a[c][n_idx];
}
if (t_alpha < kEps) {
t_alpha = kEps;
}
// Iterate through the closest K points for this pixel ordered by z
// distance.
for (int k = 0; k < K; ++k) {
int64_t n_idx = points_idx_a[b][k][j][i];
// Sentinel value is -1 indicating no point overlaps the pixel
if (n_idx < 0) {
continue;
}
float alpha = alphas_a[b][k][j][i];
grad_alphas_a[b][k][j][i] += grad_outputs_a[b][c][j][i] *
(features_a[c][n_idx] * t_alpha - t_alphafs) /
(t_alpha * t_alpha);
grad_features_a[c][n_idx] +=
grad_outputs_a[b][c][j][i] * alpha / t_alpha;
}
}
}
}
}
return std::make_tuple(grad_features, grad_alphas);
}

@@ -0,0 +1,161 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <vector>
// TODO(gkioxari) support all data types once AtomicAdd supports doubles.
// Currently, support is for floats only.
__global__ void weightedSumCudaForwardKernel(
// clang-format off
torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> result,
const torch::PackedTensorAccessor<float, 2, torch::RestrictPtrTraits, size_t> features,
const torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> alphas,
const torch::PackedTensorAccessor<int64_t, 4, torch::RestrictPtrTraits, size_t> points_idx) {
// clang-format on
const int64_t batch_size = result.size(0);
const int64_t C = features.size(0);
const int64_t H = points_idx.size(2);
const int64_t W = points_idx.size(3);
// Get the batch and index
const int batch = blockIdx.x;
const int num_pixels = C * W * H;
const int num_threads = gridDim.y * blockDim.x;
const int tid = blockIdx.y * blockDim.x + threadIdx.x;
// Parallelize over each feature in each pixel in images of size H * W,
// for each image in the batch of size batch_size
for (int pid = tid; pid < num_pixels; pid += num_threads) {
int ch = pid / (W * H);
int j = (pid % (W * H)) / H;
int i = (pid % (W * H)) % H;
// Iterate through the closest K points for this pixel
for (int k = 0; k < points_idx.size(1); ++k) {
int n_idx = points_idx[batch][k][j][i];
// Sentinel value is -1 indicating no point overlaps the pixel
if (n_idx < 0) {
continue;
}
// Accumulate the values
float alpha = alphas[batch][k][j][i];
// TODO(gkioxari) It might be more efficient to have threads write in a
// local variable, and move atomicAdd outside of the loop such that
// atomicAdd is executed once per thread.
atomicAdd(&result[batch][ch][j][i], features[ch][n_idx] * alpha);
}
}
}
// TODO(gkioxari) support all data types once AtomicAdd supports doubles.
// Currently, support is for floats only.
__global__ void weightedSumCudaBackwardKernel(
// clang-format off
torch::PackedTensorAccessor<float, 2, torch::RestrictPtrTraits, size_t> grad_features,
torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> grad_alphas,
const torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> grad_outputs,
const torch::PackedTensorAccessor<float, 2, torch::RestrictPtrTraits, size_t> features,
const torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> alphas,
const torch::PackedTensorAccessor<int64_t, 4, torch::RestrictPtrTraits, size_t> points_idx) {
// clang-format on
const int64_t batch_size = points_idx.size(0);
const int64_t C = features.size(0);
const int64_t H = points_idx.size(2);
const int64_t W = points_idx.size(3);
// Get the batch and index
const int batch = blockIdx.x;
const int num_pixels = C * W * H;
const int num_threads = gridDim.y * blockDim.x;
const int tid = blockIdx.y * blockDim.x + threadIdx.x;
// Iterate over each pixel to compute the contribution to the
// gradient for the features and weights
for (int pid = tid; pid < num_pixels; pid += num_threads) {
int ch = pid / (W * H);
int j = (pid % (W * H)) / H;
int i = (pid % (W * H)) % H;
// Iterate through the closest K points for this pixel
for (int k = 0; k < points_idx.size(1); ++k) {
int n_idx = points_idx[batch][k][j][i];
// Sentinel value is -1 indicating no point overlaps the pixel
if (n_idx < 0) {
continue;
}
float alpha = alphas[batch][k][j][i];
// TODO(gkioxari) It might be more efficient to have threads write in a
// local variable, and move atomicAdd outside of the loop such that
// atomicAdd is executed once per thread.
atomicAdd(
&grad_alphas[batch][k][j][i],
features[ch][n_idx] * grad_outputs[batch][ch][j][i]);
atomicAdd(
&grad_features[ch][n_idx], alpha * grad_outputs[batch][ch][j][i]);
}
}
}
torch::Tensor weightedSumCudaForward(
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx) {
const int64_t batch_size = points_idx.size(0);
const int64_t C = features.size(0);
const int64_t H = points_idx.size(2);
const int64_t W = points_idx.size(3);
auto result = torch::zeros({batch_size, C, H, W}, features.options());
const dim3 threadsPerBlock(64);
const dim3 numBlocks(batch_size, 1024 / batch_size + 1);
// TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
// doubles. Currently, support is for floats only.
weightedSumCudaForwardKernel<<<numBlocks, threadsPerBlock>>>(
// clang-format off
result.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
features.packed_accessor<float, 2, torch::RestrictPtrTraits, size_t>(),
alphas.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
points_idx.packed_accessor<int64_t, 4, torch::RestrictPtrTraits, size_t>());
// clang-format on
return result;
}
std::tuple<torch::Tensor, torch::Tensor> weightedSumCudaBackward(
const torch::Tensor& grad_outputs,
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx) {
auto grad_features = torch::zeros_like(features);
auto grad_alphas = torch::zeros_like(alphas);
const int64_t bs = points_idx.size(0);
const dim3 threadsPerBlock(64);
const dim3 numBlocks(bs, 1024 / bs + 1);
// TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
// doubles. Currently, support is for floats only.
weightedSumCudaBackwardKernel<<<numBlocks, threadsPerBlock>>>(
// clang-format off
grad_features.packed_accessor<float, 2, torch::RestrictPtrTraits, size_t>(),
grad_alphas.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
grad_outputs.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
features.packed_accessor<float, 2, torch::RestrictPtrTraits, size_t>(),
alphas.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
points_idx.packed_accessor<int64_t, 4, torch::RestrictPtrTraits, size_t>());
// clang-format on
return std::make_tuple(grad_features, grad_alphas);
}

@@ -0,0 +1,107 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <torch/extension.h>
#include "pytorch3d_cutils.h"
#include <vector>
// Perform weighted sum compositing of points in a z-buffer.
//
// Inputs:
// features: FloatTensor of shape (C, P) which gives the features
// of each point where C is the size of the feature and
// P the number of points.
// alphas: FloatTensor of shape (N, points_per_pixel, W, W) where
// points_per_pixel is the number of points in the z-buffer
// sorted in z-order, and W is the image size.
// points_idx: IntTensor of shape (N, points_per_pixel, W, W) giving the
// indices of the nearest points at each pixel, sorted in z-order.
// Returns:
// weighted_fs: FloatTensor of shape (N, C, W, W) giving the accumulated
// feature at each pixel. Concretely, it gives:
// weighted_fs[b,c,i,j] = sum_k alphas[b,k,i,j] *
// features[c,points_idx[b,k,i,j]]
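// Note (illustrative, not from the diff): the weights are used as-is, so with
// alphas 0.5, 0.4, 0.2 at one pixel they sum to 1.1 and the output can exceed
// the range of the input features; norm_weighted_sum.h divides by this sum
// instead.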
// CUDA declarations
#ifdef WITH_CUDA
torch::Tensor weightedSumCudaForward(
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx);
std::tuple<torch::Tensor, torch::Tensor> weightedSumCudaBackward(
const torch::Tensor& grad_outputs,
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx);
#endif
// C++ declarations
torch::Tensor weightedSumCpuForward(
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx);
std::tuple<torch::Tensor, torch::Tensor> weightedSumCpuBackward(
const torch::Tensor& grad_outputs,
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx);
torch::Tensor weightedSumForward(
torch::Tensor& features,
torch::Tensor& alphas,
torch::Tensor& points_idx) {
features = features.contiguous();
alphas = alphas.contiguous();
points_idx = points_idx.contiguous();
if (features.type().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(features);
CHECK_CONTIGUOUS_CUDA(alphas);
CHECK_CONTIGUOUS_CUDA(points_idx);
#else
AT_ERROR("Not compiled with GPU support");
#endif
return weightedSumCudaForward(features, alphas, points_idx);
} else {
CHECK_CONTIGUOUS(features);
CHECK_CONTIGUOUS(alphas);
CHECK_CONTIGUOUS(points_idx);
return weightedSumCpuForward(features, alphas, points_idx);
}
}
std::tuple<torch::Tensor, torch::Tensor> weightedSumBackward(
torch::Tensor& grad_outputs,
torch::Tensor& features,
torch::Tensor& alphas,
torch::Tensor& points_idx) {
grad_outputs = grad_outputs.contiguous();
features = features.contiguous();
alphas = alphas.contiguous();
points_idx = points_idx.contiguous();
if (grad_outputs.type().is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(grad_outputs);
CHECK_CONTIGUOUS_CUDA(features);
CHECK_CONTIGUOUS_CUDA(alphas);
CHECK_CONTIGUOUS_CUDA(points_idx);
#else
AT_ERROR("Not compiled with GPU support");
#endif
return weightedSumCudaBackward(grad_outputs, features, alphas, points_idx);
} else {
CHECK_CONTIGUOUS(grad_outputs);
CHECK_CONTIGUOUS(features);
CHECK_CONTIGUOUS(alphas);
CHECK_CONTIGUOUS(points_idx);
return weightedSumCpuBackward(grad_outputs, features, alphas, points_idx);
}
}

@@ -0,0 +1,98 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <torch/extension.h>
#include <cmath>
#include <vector>
torch::Tensor weightedSumCpuForward(
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx) {
const int64_t B = points_idx.size(0);
const int64_t K = points_idx.size(1);
const int64_t H = points_idx.size(2);
const int64_t W = points_idx.size(3);
const int64_t C = features.size(0);
torch::Tensor result = torch::zeros({B, C, H, W}, features.options());
auto features_a = features.accessor<float, 2>();
auto alphas_a = alphas.accessor<float, 4>();
auto points_idx_a = points_idx.accessor<int64_t, 4>();
auto result_a = result.accessor<float, 4>();
// Iterate over the batch
for (int b = 0; b < B; ++b) {
// Iterate over the features
for (int c = 0; c < C; ++c) {
// Iterate through the horizontal lines of the image from top to bottom
for (int j = 0; j < H; ++j) {
// Iterate over pixels in a horizontal line, left to right
for (int i = 0; i < W; ++i) {
// Iterate through the closest K points for this pixel
for (int k = 0; k < K; ++k) {
int64_t n_idx = points_idx_a[b][k][j][i];
// Sentinel value is -1 indicating no point overlaps the pixel
if (n_idx < 0) {
continue;
}
float alpha = alphas_a[b][k][j][i];
result_a[b][c][j][i] += alpha * features_a[c][n_idx];
}
}
}
}
}
return result;
}
std::tuple<torch::Tensor, torch::Tensor> weightedSumCpuBackward(
const torch::Tensor& grad_outputs,
const torch::Tensor& features,
const torch::Tensor& alphas,
const torch::Tensor& points_idx) {
const int64_t B = points_idx.size(0);
const int64_t K = points_idx.size(1);
const int64_t H = points_idx.size(2);
const int64_t W = points_idx.size(3);
const int64_t C = features.size(0);
torch::Tensor grad_features = torch::zeros_like(features);
torch::Tensor grad_alphas = torch::zeros_like(alphas);
auto grad_outputs_a = grad_outputs.accessor<float, 4>();
auto features_a = features.accessor<float, 2>();
auto alphas_a = alphas.accessor<float, 4>();
auto points_idx_a = points_idx.accessor<int64_t, 4>();
auto grad_features_a = grad_features.accessor<float, 2>();
auto grad_alphas_a = grad_alphas.accessor<float, 4>();
// Iterate over the batch
for (int b = 0; b < B; ++b) {
// Iterate over the features
for (int c = 0; c < C; ++c) {
// Iterate through the horizontal lines of the image from top to bottom
for (int j = 0; j < H; ++j) {
// Iterate over pixels in a horizontal line, left to right
for (int i = 0; i < W; ++i) {
// Iterate through the closest K points for this pixel
for (int k = 0; k < K; ++k) {
int64_t n_idx = points_idx_a[b][k][j][i];
// Sentinel value is -1, indicating no point overlaps this pixel
if (n_idx < 0) {
continue;
}
float alpha = alphas_a[b][k][j][i];
grad_alphas_a[b][k][j][i] +=
grad_outputs_a[b][c][j][i] * features_a[c][n_idx];
grad_features_a[c][n_idx] += grad_outputs_a[b][c][j][i] * alpha;
}
}
}
}
}
return std::make_tuple(grad_features, grad_alphas);
}

@@ -1,6 +1,9 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <torch/extension.h>
#include "compositing/alpha_composite.h"
#include "compositing/norm_weighted_sum.h"
#include "compositing/weighted_sum.h"
#include "face_areas_normals/face_areas_normals.h"
#include "gather_scatter/gather_scatter.h"
#include "nearest_neighbor_points/nearest_neighbor_points.h"
@@ -20,6 +23,14 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("rasterize_meshes_backward", &RasterizeMeshesBackward);
m.def("rasterize_meshes", &RasterizeMeshes);
// Accumulation functions
m.def("accum_weightedsumnorm", &weightedSumNormForward);
m.def("accum_weightedsum", &weightedSumForward);
m.def("accum_alphacomposite", &alphaCompositeForward);
m.def("accum_weightedsumnorm_backward", &weightedSumNormBackward);
m.def("accum_weightedsum_backward", &weightedSumBackward);
m.def("accum_alphacomposite_backward", &alphaCompositeBackward);
// These are only visible for testing; users should not call them directly
m.def("_rasterize_points_coarse", &RasterizePointsCoarse);
m.def("_rasterize_points_naive", &RasterizePointsNaive);