Mirror of https://github.com/facebookresearch/pytorch3d.git (synced 2026-02-07 06:22:19 +08:00)
Accumulate points (#4)

Summary: Code for accumulating points in the z-buffer in three ways: 1. weighted sum 2. normalised weighted sum 3. alpha compositing Pull Request resolved: https://github.com/fairinternal/pytorch3d/pull/4 Reviewed By: nikhilaravi Differential Revision: D20522422 Pulled By: gkioxari fbshipit-source-id: 5023baa05f15e338f3821ef08f5552c2dcbfc06c

Commit 53599770dd, parent 5218f45c2c, committed by Facebook GitHub Bot.
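
Not part of the commit message, but as a reading aid: writing f_c(p) for the feature of point p in channel c, and a_k, p_k for the alpha value and point index of the k-th closest point at a pixel (slots with the sentinel p_k = -1 are skipped), the three accumulation rules added below compute, per pixel,

    weighted sum:            weighted_fs[b,c,i,j] = \sum_k a_k \, f_c(p_k)
    normalized weighted sum: weighted_fs[b,c,i,j] = \sum_k a_k \, f_c(p_k) \,/\, \sum_k a_k
    alpha compositing:       weighted_fs[b,c,i,j] = \sum_k f_c(p_k) \, a_k \prod_{l<k} (1 - a_l)

with the K points per pixel traversed in increasing z-order.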
pytorch3d/csrc/compositing/alpha_composite.cu (new file, 187 lines)
@@ -0,0 +1,187 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.

#include <torch/extension.h>

#include <cuda.h>
#include <cuda_runtime.h>

#include <stdio.h>
#include <vector>

// TODO(gkioxari) support all data types once AtomicAdd supports doubles.
// Currently, support is for floats only.
__global__ void alphaCompositeCudaForwardKernel(
    // clang-format off
    torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> result,
    const torch::PackedTensorAccessor<float, 2, torch::RestrictPtrTraits, size_t> features,
    const torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> alphas,
    const torch::PackedTensorAccessor<int64_t, 4, torch::RestrictPtrTraits, size_t> points_idx) {
  // clang-format on
  const int64_t batch_size = result.size(0);
  const int64_t C = features.size(0);
  const int64_t H = points_idx.size(2);
  const int64_t W = points_idx.size(3);

  // Get the batch and index
  const int batch = blockIdx.x;

  const int num_pixels = C * W * H;
  const int num_threads = gridDim.y * blockDim.x;
  const int tid = blockIdx.y * blockDim.x + threadIdx.x;

  // Iterate over each feature in each pixel
  for (int pid = tid; pid < num_pixels; pid += num_threads) {
    int ch = pid / (W * H);
    int j = (pid % (W * H)) / H;
    int i = (pid % (W * H)) % H;

    // alphacomposite the different values
    float cum_alpha = 1.;
    // Iterate through the closest K points for this pixel
    for (int k = 0; k < points_idx.size(1); ++k) {
      int n_idx = points_idx[batch][k][j][i];

      // Sentinel value is -1 indicating no point overlaps the pixel
      if (n_idx < 0) {
        continue;
      }

      float alpha = alphas[batch][k][j][i];
      // TODO(gkioxari) It might be more efficient to have threads write in a
      // local variable, and move atomicAdd outside of the loop such that
      // atomicAdd is executed once per thread.
      atomicAdd(
          &result[batch][ch][j][i], features[ch][n_idx] * cum_alpha * alpha);
      cum_alpha = cum_alpha * (1 - alpha);
    }
  }
}

// TODO(gkioxari) support all data types once AtomicAdd supports doubles.
// Currently, support is for floats only.
__global__ void alphaCompositeCudaBackwardKernel(
    // clang-format off
    torch::PackedTensorAccessor<float, 2, torch::RestrictPtrTraits, size_t> grad_features,
    torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> grad_alphas,
    const torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> grad_outputs,
    const torch::PackedTensorAccessor<float, 2, torch::RestrictPtrTraits, size_t> features,
    const torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> alphas,
    const torch::PackedTensorAccessor<int64_t, 4, torch::RestrictPtrTraits, size_t> points_idx) {
  // clang-format on
  const int64_t batch_size = points_idx.size(0);
  const int64_t C = features.size(0);
  const int64_t H = points_idx.size(2);
  const int64_t W = points_idx.size(3);

  // Get the batch and index
  const int batch = blockIdx.x;

  const int num_pixels = C * W * H;
  const int num_threads = gridDim.y * blockDim.x;
  const int tid = blockIdx.y * blockDim.x + threadIdx.x;

  // Parallelize over each feature in each pixel in images of size H * W,
  // for each image in the batch of size batch_size
  for (int pid = tid; pid < num_pixels; pid += num_threads) {
    int ch = pid / (W * H);
    int j = (pid % (W * H)) / H;
    int i = (pid % (W * H)) % H;

    // alphacomposite the different values
    float cum_alpha = 1.;
    // Iterate through the closest K points for this pixel
    for (int k = 0; k < points_idx.size(1); ++k) {
      int n_idx = points_idx[batch][k][j][i];

      // Sentinel value is -1 indicating no point overlaps the pixel
      if (n_idx < 0) {
        continue;
      }
      float alpha = alphas[batch][k][j][i];

      // TODO(gkioxari) It might be more efficient to have threads write in a
      // local variable, and move atomicAdd outside of the loop such that
      // atomicAdd is executed once per thread.
      atomicAdd(
          &grad_alphas[batch][k][j][i],
          cum_alpha * features[ch][n_idx] * grad_outputs[batch][ch][j][i]);
      atomicAdd(
          &grad_features[ch][n_idx],
          cum_alpha * alpha * grad_outputs[batch][ch][j][i]);

      // Iterate over all (K-1) nearest points to update gradient
      for (int t = 0; t < k; ++t) {
        int t_idx = points_idx[batch][t][j][i];
        // Sentinel value is -1, indicating no point overlaps this pixel
        if (t_idx < 0) {
          continue;
        }
        float alpha_tvalue = alphas[batch][t][j][i];
        // TODO(gkioxari) It might be more efficient to have threads write in a
        // local variable, and move atomicAdd outside of the loop such that
        // atomicAdd is executed once per thread.
        atomicAdd(
            &grad_alphas[batch][t][j][i],
            -grad_outputs[batch][ch][j][i] * features[ch][n_idx] * cum_alpha *
                alpha / (1 - alpha_tvalue));
      }

      cum_alpha = cum_alpha * (1 - alphas[batch][k][j][i]);
    }
  }
}

torch::Tensor alphaCompositeCudaForward(
    const torch::Tensor& features,
    const torch::Tensor& alphas,
    const torch::Tensor& points_idx) {
  const int64_t batch_size = points_idx.size(0);
  const int64_t C = features.size(0);
  const int64_t H = points_idx.size(2);
  const int64_t W = points_idx.size(3);

  auto result = torch::zeros({batch_size, C, H, W}, features.options());

  const dim3 threadsPerBlock(64);
  const dim3 numBlocks(batch_size, 1024 / batch_size + 1);

  // TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
  // doubles. Currently, support is for floats only.
  alphaCompositeCudaForwardKernel<<<numBlocks, threadsPerBlock>>>(
      // clang-format off
      result.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
      features.packed_accessor<float, 2, torch::RestrictPtrTraits, size_t>(),
      alphas.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
      points_idx.packed_accessor<int64_t, 4, torch::RestrictPtrTraits, size_t>());
  // clang-format on

  return result;
}

std::tuple<torch::Tensor, torch::Tensor> alphaCompositeCudaBackward(
    const torch::Tensor& grad_outputs,
    const torch::Tensor& features,
    const torch::Tensor& alphas,
    const torch::Tensor& points_idx) {
  auto grad_features = torch::zeros_like(features);
  auto grad_alphas = torch::zeros_like(alphas);

  const int64_t bs = alphas.size(0);

  const dim3 threadsPerBlock(64);
  const dim3 numBlocks(bs, 1024 / bs + 1);

  // TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
  // doubles. Currently, support is for floats only.
  alphaCompositeCudaBackwardKernel<<<numBlocks, threadsPerBlock>>>(
      // clang-format off
      grad_features.packed_accessor<float, 2, torch::RestrictPtrTraits, size_t>(),
      grad_alphas.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
      grad_outputs.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
      features.packed_accessor<float, 2, torch::RestrictPtrTraits, size_t>(),
      alphas.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
      points_idx.packed_accessor<int64_t, 4, torch::RestrictPtrTraits, size_t>());
  // clang-format on

  return std::make_tuple(grad_features, grad_alphas);
}
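
A short derivation, not part of the diff, of what alphaCompositeCudaBackwardKernel accumulates. Writing w_k = \prod_{l<k} (1 - a_l) for the transmittance reaching the k-th point, the forward output for one pixel and channel is O_c = \sum_k f_c(p_k) \, a_k w_k, so

    \partial O_c / \partial f_c(p_k) = a_k w_k
    \partial O_c / \partial a_t = f_c(p_t) \, w_t \; - \; \sum_{k>t} f_c(p_k) \, a_k w_k / (1 - a_t)

The kernel scales both by grad_outputs (chain rule): the grad_features update and the first grad_alphas update correspond to the k = t terms, while the inner loop over t < k distributes the negative second-term contributions, which is where the division by (1 - alpha_tvalue) comes from.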
pytorch3d/csrc/compositing/alpha_composite.h (new file, 110 lines)
@@ -0,0 +1,110 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.

#include <torch/extension.h>
#include "pytorch3d_cutils.h"

#include <vector>

// Perform alpha compositing of points in a z-buffer.
//
// Inputs:
//     features: FloatTensor of shape (C, P) which gives the features
//             of each point where C is the size of the feature and
//             P the number of points.
//     alphas: FloatTensor of shape (N, points_per_pixel, W, W) where
//             points_per_pixel is the number of points in the z-buffer
//             sorted in z-order, and W is the image size.
//     points_idx: IntTensor of shape (N, points_per_pixel, W, W) giving the
//             indices of the nearest points at each pixel, sorted in z-order.
// Returns:
//     weighted_fs: FloatTensor of shape (N, C, W, W) giving the accumulated
//             feature for each point. Concretely, it gives:
//             weighted_fs[b,c,i,j] = sum_k cum_alpha_k *
//                 features[c,points_idx[b,k,i,j]]
//             where cum_alpha_k =
//                 alphas[b,k,i,j] * prod_l=0..k-1 (1 - alphas[b,l,i,j])

// CUDA declarations
#ifdef WITH_CUDA
torch::Tensor alphaCompositeCudaForward(
    const torch::Tensor& features,
    const torch::Tensor& alphas,
    const torch::Tensor& points_idx);

std::tuple<torch::Tensor, torch::Tensor> alphaCompositeCudaBackward(
    const torch::Tensor& grad_outputs,
    const torch::Tensor& features,
    const torch::Tensor& alphas,
    const torch::Tensor& points_idx);
#endif

// C++ declarations
torch::Tensor alphaCompositeCpuForward(
    const torch::Tensor& features,
    const torch::Tensor& alphas,
    const torch::Tensor& points_idx);

std::tuple<torch::Tensor, torch::Tensor> alphaCompositeCpuBackward(
    const torch::Tensor& grad_outputs,
    const torch::Tensor& features,
    const torch::Tensor& alphas,
    const torch::Tensor& points_idx);

torch::Tensor alphaCompositeForward(
    torch::Tensor& features,
    torch::Tensor& alphas,
    torch::Tensor& points_idx) {
  features = features.contiguous();
  alphas = alphas.contiguous();
  points_idx = points_idx.contiguous();

  if (features.type().is_cuda()) {
#ifdef WITH_CUDA
    CHECK_CONTIGUOUS_CUDA(features);
    CHECK_CONTIGUOUS_CUDA(alphas);
    CHECK_CONTIGUOUS_CUDA(points_idx);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
    return alphaCompositeCudaForward(features, alphas, points_idx);
  } else {
    CHECK_CONTIGUOUS(features);
    CHECK_CONTIGUOUS(alphas);
    CHECK_CONTIGUOUS(points_idx);

    return alphaCompositeCpuForward(features, alphas, points_idx);
  }
}

std::tuple<torch::Tensor, torch::Tensor> alphaCompositeBackward(
    torch::Tensor& grad_outputs,
    torch::Tensor& features,
    torch::Tensor& alphas,
    torch::Tensor& points_idx) {
  grad_outputs = grad_outputs.contiguous();
  features = features.contiguous();
  alphas = alphas.contiguous();
  points_idx = points_idx.contiguous();

  if (grad_outputs.type().is_cuda()) {
#ifdef WITH_CUDA
    CHECK_CONTIGUOUS_CUDA(grad_outputs);
    CHECK_CONTIGUOUS_CUDA(features);
    CHECK_CONTIGUOUS_CUDA(alphas);
    CHECK_CONTIGUOUS_CUDA(points_idx);
#else
    AT_ERROR("Not compiled with GPU support");
#endif

    return alphaCompositeCudaBackward(
        grad_outputs, features, alphas, points_idx);
  } else {
    CHECK_CONTIGUOUS(grad_outputs);
    CHECK_CONTIGUOUS(features);
    CHECK_CONTIGUOUS(alphas);
    CHECK_CONTIGUOUS(points_idx);

    return alphaCompositeCpuBackward(
        grad_outputs, features, alphas, points_idx);
  }
}
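
The dispatch functions above fix the expected layouts: features is (C, P), while alphas and points_idx are (N, K, W, W) with K = points_per_pixel. A minimal, hypothetical usage sketch follows (not part of the commit); it assumes libtorch is available, that this header and pytorch3d_cutils.h are on the include path, and that alpha_composite_cpu.cpp is compiled into the same binary. The random points_idx is only there to illustrate the layout; real indices come from the point rasterizer, z-sorted, with -1 marking empty slots.

// Hypothetical example, not part of this commit: exercises the CPU path of
// alphaCompositeForward with toy shapes.
#include <iostream>

#include <torch/torch.h>

#include "compositing/alpha_composite.h"

int main() {
  const int64_t C = 3, P = 5;         // feature size, number of points
  const int64_t N = 1, K = 2, W = 4;  // batch size, points per pixel, image size

  // features: (C, P); alphas and points_idx: (N, K, W, W).
  torch::Tensor features = torch::rand({C, P});
  torch::Tensor alphas = torch::rand({N, K, W, W});
  // Random indices in [-1, P) purely to illustrate the layout; -1 = no point.
  torch::Tensor points_idx =
      torch::randint(/*low=*/-1, /*high=*/P, {N, K, W, W}, torch::kInt64);

  // Composites the K nearest points per pixel into an (N, C, W, W) image.
  torch::Tensor image = alphaCompositeForward(features, alphas, points_idx);
  std::cout << image.sizes() << std::endl;  // expect [1, 3, 4, 4]
  return 0;
}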
pytorch3d/csrc/compositing/alpha_composite_cpu.cpp (new file, 114 lines)
@@ -0,0 +1,114 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.

#include <torch/extension.h>

#include <cmath>
#include <vector>

torch::Tensor alphaCompositeCpuForward(
    const torch::Tensor& features,
    const torch::Tensor& alphas,
    const torch::Tensor& points_idx) {
  const int64_t B = points_idx.size(0);
  const int64_t K = points_idx.size(1);
  const int64_t H = points_idx.size(2);
  const int64_t W = points_idx.size(3);
  const int64_t C = features.size(0);

  torch::Tensor result = torch::zeros({B, C, H, W}, features.options());

  auto features_a = features.accessor<float, 2>();
  auto alphas_a = alphas.accessor<float, 4>();
  auto points_idx_a = points_idx.accessor<int64_t, 4>();
  auto result_a = result.accessor<float, 4>();

  // Iterate over the batch
  for (int b = 0; b < B; ++b) {
    // Iterate over the features
    for (int c = 0; c < C; ++c) {
      // Iterate through the horizontal lines of the image from top to bottom
      for (int j = 0; j < H; ++j) {
        // Iterate over pixels in a horizontal line, left to right
        for (int i = 0; i < W; ++i) {
          float cum_alpha = 1.;
          // Iterate through the closest K points for this pixel
          for (int k = 0; k < K; ++k) {
            int64_t n_idx = points_idx_a[b][k][j][i];
            // Sentinel value is -1 indicating no point overlaps the pixel
            if (n_idx < 0) {
              continue;
            }
            float alpha = alphas_a[b][k][j][i];
            result_a[b][c][j][i] += cum_alpha * alpha * features_a[c][n_idx];
            cum_alpha = cum_alpha * (1 - alpha);
          }
        }
      }
    }
  }
  return result;
}

std::tuple<torch::Tensor, torch::Tensor> alphaCompositeCpuBackward(
    const torch::Tensor& grad_outputs,
    const torch::Tensor& features,
    const torch::Tensor& alphas,
    const torch::Tensor& points_idx) {
  torch::Tensor grad_features = torch::zeros_like(features);
  torch::Tensor grad_alphas = torch::zeros_like(alphas);

  const int64_t B = points_idx.size(0);
  const int64_t K = points_idx.size(1);
  const int64_t H = points_idx.size(2);
  const int64_t W = points_idx.size(3);
  const int64_t C = features.size(0);

  auto grad_outputs_a = grad_outputs.accessor<float, 4>();
  auto features_a = features.accessor<float, 2>();
  auto alphas_a = alphas.accessor<float, 4>();
  auto points_idx_a = points_idx.accessor<int64_t, 4>();
  auto grad_features_a = grad_features.accessor<float, 2>();
  auto grad_alphas_a = grad_alphas.accessor<float, 4>();

  // Iterate over the batch
  for (int b = 0; b < B; ++b) {
    // Iterate over the features
    for (int c = 0; c < C; ++c) {
      // Iterate through the horizontal lines of the image from top to bottom
      for (int j = 0; j < H; ++j) {
        // Iterate over pixels in a horizontal line, left to right
        for (int i = 0; i < W; ++i) {
          float cum_alpha = 1.;
          // Iterate through the closest K points for this pixel
          for (int k = 0; k < K; ++k) {
            int64_t n_idx = points_idx_a[b][k][j][i];
            // Sentinel value is -1, indicating no point overlaps this pixel
            if (n_idx < 0) {
              continue;
            }
            float alpha = alphas_a[b][k][j][i];
            grad_alphas_a[b][k][j][i] +=
                grad_outputs_a[b][c][j][i] * features_a[c][n_idx] * cum_alpha;
            grad_features_a[c][n_idx] +=
                grad_outputs_a[b][c][j][i] * cum_alpha * alpha;

            // Iterate over all (K-1) nearer points to update gradient
            for (int t = 0; t < k; t++) {
              int64_t t_idx = points_idx_a[b][t][j][i];
              // Sentinel value is -1, indicating no point overlaps this pixel
              if (t_idx < 0) {
                continue;
              }
              float alpha_tvalue = alphas_a[b][t][j][i];
              grad_alphas_a[b][t][j][i] -= grad_outputs_a[b][c][j][i] *
                  features_a[c][n_idx] * cum_alpha * alpha / (1 - alpha_tvalue);
            }

            cum_alpha = cum_alpha * (1 - alpha);
          }
        }
      }
    }
  }
  return std::make_tuple(grad_features, grad_alphas);
}
pytorch3d/csrc/compositing/norm_weighted_sum.cu (new file, 202 lines)
@@ -0,0 +1,202 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.

#include <torch/extension.h>

#include <cuda.h>
#include <cuda_runtime.h>

#include <stdio.h>
#include <vector>

__constant__ const float kEpsilon = 1e-4;

// TODO(gkioxari) support all data types once AtomicAdd supports doubles.
// Currently, support is for floats only.
__global__ void weightedSumNormCudaForwardKernel(
    // clang-format off
    torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> result,
    const torch::PackedTensorAccessor<float, 2, torch::RestrictPtrTraits, size_t> features,
    const torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> alphas,
    const torch::PackedTensorAccessor<int64_t, 4, torch::RestrictPtrTraits, size_t> points_idx) {
  // clang-format on
  const int64_t batch_size = result.size(0);
  const int64_t C = features.size(0);
  const int64_t H = points_idx.size(2);
  const int64_t W = points_idx.size(3);

  // Get the batch and index
  const int batch = blockIdx.x;

  const int num_pixels = C * W * H;
  const int num_threads = gridDim.y * blockDim.x;
  const int tid = blockIdx.y * blockDim.x + threadIdx.x;

  // Parallelize over each feature in each pixel in images of size H * W,
  // for each image in the batch of size batch_size
  for (int pid = tid; pid < num_pixels; pid += num_threads) {
    int ch = pid / (W * H);
    int j = (pid % (W * H)) / H;
    int i = (pid % (W * H)) % H;

    // Store the accumulated alpha value
    float cum_alpha = 0.;
    // Iterate through the closest K points for this pixel
    for (int k = 0; k < points_idx.size(1); ++k) {
      int n_idx = points_idx[batch][k][j][i];
      // Sentinel value is -1 indicating no point overlaps the pixel
      if (n_idx < 0) {
        continue;
      }

      cum_alpha += alphas[batch][k][j][i];
    }

    if (cum_alpha < kEpsilon) {
      cum_alpha = kEpsilon;
    }

    // Iterate through the closest K points for this pixel
    for (int k = 0; k < points_idx.size(1); ++k) {
      int n_idx = points_idx[batch][k][j][i];
      // Sentinel value is -1 indicating no point overlaps the pixel
      if (n_idx < 0) {
        continue;
      }
      float alpha = alphas[batch][k][j][i];
      // TODO(gkioxari) It might be more efficient to have threads write in a
      // local variable, and move atomicAdd outside of the loop such that
      // atomicAdd is executed once per thread.
      atomicAdd(
          &result[batch][ch][j][i], features[ch][n_idx] * alpha / cum_alpha);
    }
  }
}

// TODO(gkioxari) support all data types once AtomicAdd supports doubles.
// Currently, support is for floats only.
__global__ void weightedSumNormCudaBackwardKernel(
    // clang-format off
    torch::PackedTensorAccessor<float, 2, torch::RestrictPtrTraits, size_t> grad_features,
    torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> grad_alphas,
    const torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> grad_outputs,
    const torch::PackedTensorAccessor<float, 2, torch::RestrictPtrTraits, size_t> features,
    const torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> alphas,
    const torch::PackedTensorAccessor<int64_t, 4, torch::RestrictPtrTraits, size_t> points_idx) {
  // clang-format on
  const int64_t batch_size = points_idx.size(0);
  const int64_t C = features.size(0);
  const int64_t H = points_idx.size(2);
  const int64_t W = points_idx.size(3);

  // Get the batch and index
  const int batch = blockIdx.x;

  const int num_pixels = C * W * H;
  const int num_threads = gridDim.y * blockDim.x;
  const int tid = blockIdx.y * blockDim.x + threadIdx.x;

  // Parallelize over each feature in each pixel in images of size H * W,
  // for each image in the batch of size batch_size
  for (int pid = tid; pid < num_pixels; pid += num_threads) {
    int ch = pid / (W * H);
    int j = (pid % (W * H)) / H;
    int i = (pid % (W * H)) % H;

    float sum_alpha = 0.;
    float sum_alphafs = 0.;
    // Iterate through the closest K points for this pixel to calculate the
    // cumulative sum of the alphas for this pixel
    for (int k = 0; k < points_idx.size(1); ++k) {
      int n_idx = points_idx[batch][k][j][i];
      // Sentinel value is -1 indicating no point overlaps the pixel
      if (n_idx < 0) {
        continue;
      }

      sum_alpha += alphas[batch][k][j][i];
      sum_alphafs += alphas[batch][k][j][i] * features[ch][n_idx];
    }

    if (sum_alpha < kEpsilon) {
      sum_alpha = kEpsilon;
    }

    // Iterate again through the closest K points for this pixel to calculate
    // the gradient.
    for (int k = 0; k < points_idx.size(1); ++k) {
      int n_idx = points_idx[batch][k][j][i];

      // Sentinel value is -1 indicating no point overlaps the pixel
      if (n_idx < 0) {
        continue;
      }
      float alpha = alphas[batch][k][j][i];

      // TODO(gkioxari) It might be more efficient to have threads write in a
      // local variable, and move atomicAdd outside of the loop such that
      // atomicAdd is executed once per thread.
      atomicAdd(
          &grad_alphas[batch][k][j][i],
          (features[ch][n_idx] * sum_alpha - sum_alphafs) /
              (sum_alpha * sum_alpha) * grad_outputs[batch][ch][j][i]);
      atomicAdd(
          &grad_features[ch][n_idx],
          alpha * grad_outputs[batch][ch][j][i] / sum_alpha);
    }
  }
}

torch::Tensor weightedSumNormCudaForward(
    const torch::Tensor& features,
    const torch::Tensor& alphas,
    const torch::Tensor& points_idx) {
  const int64_t batch_size = points_idx.size(0);
  const int64_t C = features.size(0);
  const int64_t H = points_idx.size(2);
  const int64_t W = points_idx.size(3);

  auto result = torch::zeros({batch_size, C, H, W}, features.options());

  const dim3 threadsPerBlock(64);
  const dim3 numBlocks(batch_size, 1024 / batch_size + 1);

  // TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
  // doubles. Currently, support is for floats only.
  // clang-format off
  weightedSumNormCudaForwardKernel<<<numBlocks, threadsPerBlock>>>(
      result.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
      features.packed_accessor<float, 2, torch::RestrictPtrTraits, size_t>(),
      alphas.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
      points_idx.packed_accessor<int64_t, 4, torch::RestrictPtrTraits, size_t>());
  // clang-format on

  return result;
}

std::tuple<torch::Tensor, torch::Tensor> weightedSumNormCudaBackward(
    const torch::Tensor& grad_outputs,
    const torch::Tensor& features,
    const torch::Tensor& alphas,
    const torch::Tensor& points_idx) {
  auto grad_features = torch::zeros_like(features);
  auto grad_alphas = torch::zeros_like(alphas);

  const int64_t bs = points_idx.size(0);

  const dim3 threadsPerBlock(64);
  const dim3 numBlocks(bs, 1024 / bs + 1);

  // TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
  // doubles. Currently, support is for floats only.
  weightedSumNormCudaBackwardKernel<<<numBlocks, threadsPerBlock>>>(
      // clang-format off
      grad_features.packed_accessor<float, 2, torch::RestrictPtrTraits, size_t>(),
      grad_alphas.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
      grad_outputs.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
      features.packed_accessor<float, 2, torch::RestrictPtrTraits, size_t>(),
      alphas.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
      points_idx.packed_accessor<int64_t, 4, torch::RestrictPtrTraits, size_t>());
  // clang-format on

  return std::make_tuple(grad_features, grad_alphas);
}
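
A quotient-rule check, not part of the diff, for weightedSumNormCudaBackwardKernel above. With S = \sum_k a_k (clamped below by kEpsilon) and T = \sum_k a_k \, f_c(p_k), the forward output is O_c = T / S, so

    \partial O_c / \partial f_c(p_k) = a_k / S
    \partial O_c / \partial a_k = (f_c(p_k) \, S - T) / S^2

These are exactly the two quantities the kernel multiplies by grad_outputs and accumulates with atomicAdd; the clamp on S is treated as a constant, so the gradient ignores it.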
pytorch3d/csrc/compositing/norm_weighted_sum.h (new file, 109 lines)
@@ -0,0 +1,109 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.

#include <torch/extension.h>
#include "pytorch3d_cutils.h"

#include <vector>

// Perform normalized weighted sum compositing of points in a z-buffer.
//
// Inputs:
//     features: FloatTensor of shape (C, P) which gives the features
//             of each point where C is the size of the feature and
//             P the number of points.
//     alphas: FloatTensor of shape (N, points_per_pixel, W, W) where
//             points_per_pixel is the number of points in the z-buffer
//             sorted in z-order, and W is the image size.
//     points_idx: IntTensor of shape (N, points_per_pixel, W, W) giving the
//             indices of the nearest points at each pixel, sorted in z-order.
// Returns:
//     weighted_fs: FloatTensor of shape (N, C, W, W) giving the accumulated
//             feature in each point. Concretely, it gives:
//             weighted_fs[b,c,i,j] = sum_k alphas[b,k,i,j] *
//                 features[c,points_idx[b,k,i,j]] / sum_k alphas[b,k,i,j]

// CUDA declarations
#ifdef WITH_CUDA
torch::Tensor weightedSumNormCudaForward(
    const torch::Tensor& features,
    const torch::Tensor& alphas,
    const torch::Tensor& points_idx);

std::tuple<torch::Tensor, torch::Tensor> weightedSumNormCudaBackward(
    const torch::Tensor& grad_outputs,
    const torch::Tensor& features,
    const torch::Tensor& alphas,
    const torch::Tensor& points_idx);
#endif

// C++ declarations
torch::Tensor weightedSumNormCpuForward(
    const torch::Tensor& features,
    const torch::Tensor& alphas,
    const torch::Tensor& points_idx);

std::tuple<torch::Tensor, torch::Tensor> weightedSumNormCpuBackward(
    const torch::Tensor& grad_outputs,
    const torch::Tensor& features,
    const torch::Tensor& alphas,
    const torch::Tensor& points_idx);

torch::Tensor weightedSumNormForward(
    torch::Tensor& features,
    torch::Tensor& alphas,
    torch::Tensor& points_idx) {
  features = features.contiguous();
  alphas = alphas.contiguous();
  points_idx = points_idx.contiguous();

  if (features.type().is_cuda()) {
#ifdef WITH_CUDA
    CHECK_CONTIGUOUS_CUDA(features);
    CHECK_CONTIGUOUS_CUDA(alphas);
    CHECK_CONTIGUOUS_CUDA(points_idx);
#else
    AT_ERROR("Not compiled with GPU support");
#endif

    return weightedSumNormCudaForward(features, alphas, points_idx);
  } else {
    CHECK_CONTIGUOUS(features);
    CHECK_CONTIGUOUS(alphas);
    CHECK_CONTIGUOUS(points_idx);

    return weightedSumNormCpuForward(features, alphas, points_idx);
  }
}

std::tuple<torch::Tensor, torch::Tensor> weightedSumNormBackward(
    torch::Tensor& grad_outputs,
    torch::Tensor& features,
    torch::Tensor& alphas,
    torch::Tensor& points_idx) {
  grad_outputs = grad_outputs.contiguous();
  features = features.contiguous();
  alphas = alphas.contiguous();
  points_idx = points_idx.contiguous();

  if (grad_outputs.type().is_cuda()) {
#ifdef WITH_CUDA
    CHECK_CONTIGUOUS_CUDA(grad_outputs);
    CHECK_CONTIGUOUS_CUDA(features);
    CHECK_CONTIGUOUS_CUDA(alphas);
    CHECK_CONTIGUOUS_CUDA(points_idx);
#else
    AT_ERROR("Not compiled with GPU support");
#endif

    return weightedSumNormCudaBackward(
        grad_outputs, features, alphas, points_idx);
  } else {
    CHECK_CONTIGUOUS(grad_outputs);
    CHECK_CONTIGUOUS(features);
    CHECK_CONTIGUOUS(alphas);
    CHECK_CONTIGUOUS(points_idx);

    return weightedSumNormCpuBackward(
        grad_outputs, features, alphas, points_idx);
  }
}
pytorch3d/csrc/compositing/norm_weighted_sum_cpu.cpp (new file, 134 lines)
@@ -0,0 +1,134 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.

#include <torch/extension.h>

#include <cmath>
#include <vector>

// Epsilon float
const float kEps = 1e-4;

torch::Tensor weightedSumNormCpuForward(
    const torch::Tensor& features,
    const torch::Tensor& alphas,
    const torch::Tensor& points_idx) {
  const int64_t B = points_idx.size(0);
  const int64_t K = points_idx.size(1);
  const int64_t H = points_idx.size(2);
  const int64_t W = points_idx.size(3);
  const int64_t C = features.size(0);

  torch::Tensor result = torch::zeros({B, C, H, W}, features.options());

  auto features_a = features.accessor<float, 2>();
  auto alphas_a = alphas.accessor<float, 4>();
  auto points_idx_a = points_idx.accessor<int64_t, 4>();
  auto result_a = result.accessor<float, 4>();

  // Iterate over the batch
  for (int b = 0; b < B; ++b) {
    // Iterate over the features
    for (int c = 0; c < C; ++c) {
      // Iterate through the horizontal lines of the image from top to bottom
      for (int j = 0; j < H; ++j) {
        // Iterate over pixels in a horizontal line, left to right
        for (int i = 0; i < W; ++i) {
          float t_alpha = 0.;
          for (int k = 0; k < K; ++k) {
            int64_t n_idx = points_idx_a[b][k][j][i];
            if (n_idx < 0) {
              continue;
            }

            t_alpha += alphas_a[b][k][j][i];
          }

          if (t_alpha < kEps) {
            t_alpha = kEps;
          }

          // Iterate over the different zs to combine
          for (int k = 0; k < K; ++k) {
            int64_t n_idx = points_idx_a[b][k][j][i];
            // Sentinel value is -1 indicating no point overlaps the pixel
            if (n_idx < 0) {
              continue;
            }
            float alpha = alphas_a[b][k][j][i];
            result_a[b][c][j][i] += alpha * features_a[c][n_idx] / t_alpha;
          }
        }
      }
    }
  }
  return result;
}

std::tuple<torch::Tensor, torch::Tensor> weightedSumNormCpuBackward(
    const torch::Tensor& grad_outputs,
    const torch::Tensor& features,
    const torch::Tensor& alphas,
    const torch::Tensor& points_idx) {
  torch::Tensor grad_features = torch::zeros_like(features);
  torch::Tensor grad_alphas = torch::zeros_like(alphas);

  const int64_t B = points_idx.size(0);
  const int64_t K = points_idx.size(1);
  const int64_t H = points_idx.size(2);
  const int64_t W = points_idx.size(3);
  const int64_t C = features.size(0);

  auto grad_outputs_a = grad_outputs.accessor<float, 4>();
  auto features_a = features.accessor<float, 2>();
  auto alphas_a = alphas.accessor<float, 4>();
  auto points_idx_a = points_idx.accessor<int64_t, 4>();
  auto grad_features_a = grad_features.accessor<float, 2>();
  auto grad_alphas_a = grad_alphas.accessor<float, 4>();

  // Iterate over the batch
  for (int b = 0; b < B; ++b) {
    // Iterate over the features
    for (int c = 0; c < C; ++c) {
      // Iterate through the horizontal lines of the image from top to bottom
      for (int j = 0; j < H; ++j) {
        // Iterate over pixels in a horizontal line, left to right
        for (int i = 0; i < W; ++i) {
          float t_alpha = 0.;
          float t_alphafs = 0.;
          // Iterate through the closest K points for this pixel
          for (int k = 0; k < K; ++k) {
            int64_t n_idx = points_idx_a[b][k][j][i];
            // Sentinel value is -1, indicating no point overlaps this pixel
            if (n_idx < 0) {
              continue;
            }

            t_alpha += alphas_a[b][k][j][i];
            t_alphafs += alphas_a[b][k][j][i] * features_a[c][n_idx];
          }

          if (t_alpha < kEps) {
            t_alpha = kEps;
          }

          // Iterate through the closest K points for this pixel ordered by z
          // distance.
          for (int k = 0; k < K; ++k) {
            int64_t n_idx = points_idx_a[b][k][j][i];
            // Sentinel value is -1 indicating no point overlaps the pixel
            if (n_idx < 0) {
              continue;
            }
            float alpha = alphas_a[b][k][j][i];
            grad_alphas_a[b][k][j][i] += grad_outputs_a[b][c][j][i] *
                (features_a[c][n_idx] * t_alpha - t_alphafs) /
                (t_alpha * t_alpha);
            grad_features_a[c][n_idx] +=
                grad_outputs_a[b][c][j][i] * alpha / t_alpha;
          }
        }
      }
    }
  }
  return std::make_tuple(grad_features, grad_alphas);
}
pytorch3d/csrc/compositing/weighted_sum.cu (new file, 161 lines)
@@ -0,0 +1,161 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.

#include <torch/extension.h>

#include <cuda.h>
#include <cuda_runtime.h>

#include <stdio.h>
#include <vector>

// TODO(gkioxari) support all data types once AtomicAdd supports doubles.
// Currently, support is for floats only.
__global__ void weightedSumCudaForwardKernel(
    // clang-format off
    torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> result,
    const torch::PackedTensorAccessor<float, 2, torch::RestrictPtrTraits, size_t> features,
    const torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> alphas,
    const torch::PackedTensorAccessor<int64_t, 4, torch::RestrictPtrTraits, size_t> points_idx) {
  // clang-format on
  const int64_t batch_size = result.size(0);
  const int64_t C = features.size(0);
  const int64_t H = points_idx.size(2);
  const int64_t W = points_idx.size(3);

  // Get the batch and index
  const int batch = blockIdx.x;

  const int num_pixels = C * W * H;
  const int num_threads = gridDim.y * blockDim.x;
  const int tid = blockIdx.y * blockDim.x + threadIdx.x;

  // Parallelize over each feature in each pixel in images of size H * W,
  // for each image in the batch of size batch_size
  for (int pid = tid; pid < num_pixels; pid += num_threads) {
    int ch = pid / (W * H);
    int j = (pid % (W * H)) / H;
    int i = (pid % (W * H)) % H;

    // Iterate through the closest K points for this pixel
    for (int k = 0; k < points_idx.size(1); ++k) {
      int n_idx = points_idx[batch][k][j][i];
      // Sentinel value is -1 indicating no point overlaps the pixel
      if (n_idx < 0) {
        continue;
      }

      // Accumulate the values
      float alpha = alphas[batch][k][j][i];
      // TODO(gkioxari) It might be more efficient to have threads write in a
      // local variable, and move atomicAdd outside of the loop such that
      // atomicAdd is executed once per thread.
      atomicAdd(&result[batch][ch][j][i], features[ch][n_idx] * alpha);
    }
  }
}

// TODO(gkioxari) support all data types once AtomicAdd supports doubles.
// Currently, support is for floats only.
__global__ void weightedSumCudaBackwardKernel(
    // clang-format off
    torch::PackedTensorAccessor<float, 2, torch::RestrictPtrTraits, size_t> grad_features,
    torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> grad_alphas,
    const torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> grad_outputs,
    const torch::PackedTensorAccessor<float, 2, torch::RestrictPtrTraits, size_t> features,
    const torch::PackedTensorAccessor<float, 4, torch::RestrictPtrTraits, size_t> alphas,
    const torch::PackedTensorAccessor<int64_t, 4, torch::RestrictPtrTraits, size_t> points_idx) {
  // clang-format on
  const int64_t batch_size = points_idx.size(0);
  const int64_t C = features.size(0);
  const int64_t H = points_idx.size(2);
  const int64_t W = points_idx.size(3);

  // Get the batch and index
  const int batch = blockIdx.x;

  const int num_pixels = C * W * H;
  const int num_threads = gridDim.y * blockDim.x;
  const int tid = blockIdx.y * blockDim.x + threadIdx.x;

  // Iterate over each pixel to compute the contribution to the
  // gradient for the features and weights
  for (int pid = tid; pid < num_pixels; pid += num_threads) {
    int ch = pid / (W * H);
    int j = (pid % (W * H)) / H;
    int i = (pid % (W * H)) % H;

    // Iterate through the closest K points for this pixel
    for (int k = 0; k < points_idx.size(1); ++k) {
      int n_idx = points_idx[batch][k][j][i];
      // Sentinel value is -1 indicating no point overlaps the pixel
      if (n_idx < 0) {
        continue;
      }
      float alpha = alphas[batch][k][j][i];

      // TODO(gkioxari) It might be more efficient to have threads write in a
      // local variable, and move atomicAdd outside of the loop such that
      // atomicAdd is executed once per thread.
      atomicAdd(
          &grad_alphas[batch][k][j][i],
          features[ch][n_idx] * grad_outputs[batch][ch][j][i]);
      atomicAdd(
          &grad_features[ch][n_idx], alpha * grad_outputs[batch][ch][j][i]);
    }
  }
}

torch::Tensor weightedSumCudaForward(
    const torch::Tensor& features,
    const torch::Tensor& alphas,
    const torch::Tensor& points_idx) {
  const int64_t batch_size = points_idx.size(0);
  const int64_t C = features.size(0);
  const int64_t H = points_idx.size(2);
  const int64_t W = points_idx.size(3);

  auto result = torch::zeros({batch_size, C, H, W}, features.options());

  const dim3 threadsPerBlock(64);
  const dim3 numBlocks(batch_size, 1024 / batch_size + 1);

  // TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
  // doubles. Currently, support is for floats only.
  weightedSumCudaForwardKernel<<<numBlocks, threadsPerBlock>>>(
      // clang-format off
      result.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
      features.packed_accessor<float, 2, torch::RestrictPtrTraits, size_t>(),
      alphas.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
      points_idx.packed_accessor<int64_t, 4, torch::RestrictPtrTraits, size_t>());
  // clang-format on

  return result;
}

std::tuple<torch::Tensor, torch::Tensor> weightedSumCudaBackward(
    const torch::Tensor& grad_outputs,
    const torch::Tensor& features,
    const torch::Tensor& alphas,
    const torch::Tensor& points_idx) {
  auto grad_features = torch::zeros_like(features);
  auto grad_alphas = torch::zeros_like(alphas);

  const int64_t bs = points_idx.size(0);

  const dim3 threadsPerBlock(64);
  const dim3 numBlocks(bs, 1024 / bs + 1);

  // TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
  // doubles. Currently, support is for floats only.
  weightedSumCudaBackwardKernel<<<numBlocks, threadsPerBlock>>>(
      // clang-format off
      grad_features.packed_accessor<float, 2, torch::RestrictPtrTraits, size_t>(),
      grad_alphas.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
      grad_outputs.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
      features.packed_accessor<float, 2, torch::RestrictPtrTraits, size_t>(),
      alphas.packed_accessor<float, 4, torch::RestrictPtrTraits, size_t>(),
      points_idx.packed_accessor<int64_t, 4, torch::RestrictPtrTraits, size_t>());
  // clang-format on

  return std::make_tuple(grad_features, grad_alphas);
}
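
For completeness, and not part of the diff: the plain weighted sum O_c = \sum_k a_k \, f_c(p_k) has the simple gradients \partial O_c / \partial a_k = f_c(p_k) and \partial O_c / \partial f_c(p_k) = a_k, which is all weightedSumCudaBackwardKernel accumulates, scaled by grad_outputs.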
pytorch3d/csrc/compositing/weighted_sum.h (new file, 107 lines)
@@ -0,0 +1,107 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.

#include <torch/extension.h>
#include "pytorch3d_cutils.h"

#include <vector>

// Perform weighted sum compositing of points in a z-buffer.
//
// Inputs:
//     features: FloatTensor of shape (C, P) which gives the features
//             of each point where C is the size of the feature and
//             P the number of points.
//     alphas: FloatTensor of shape (N, points_per_pixel, W, W) where
//             points_per_pixel is the number of points in the z-buffer
//             sorted in z-order, and W is the image size.
//     points_idx: IntTensor of shape (N, points_per_pixel, W, W) giving the
//             indices of the nearest points at each pixel, sorted in z-order.
// Returns:
//     weighted_fs: FloatTensor of shape (N, C, W, W) giving the accumulated
//             feature in each point. Concretely, it gives:
//             weighted_fs[b,c,i,j] = sum_k alphas[b,k,i,j] *
//                 features[c,points_idx[b,k,i,j]]

// CUDA declarations
#ifdef WITH_CUDA
torch::Tensor weightedSumCudaForward(
    const torch::Tensor& features,
    const torch::Tensor& alphas,
    const torch::Tensor& points_idx);

std::tuple<torch::Tensor, torch::Tensor> weightedSumCudaBackward(
    const torch::Tensor& grad_outputs,
    const torch::Tensor& features,
    const torch::Tensor& alphas,
    const torch::Tensor& points_idx);
#endif

// C++ declarations
torch::Tensor weightedSumCpuForward(
    const torch::Tensor& features,
    const torch::Tensor& alphas,
    const torch::Tensor& points_idx);

std::tuple<torch::Tensor, torch::Tensor> weightedSumCpuBackward(
    const torch::Tensor& grad_outputs,
    const torch::Tensor& features,
    const torch::Tensor& alphas,
    const torch::Tensor& points_idx);

torch::Tensor weightedSumForward(
    torch::Tensor& features,
    torch::Tensor& alphas,
    torch::Tensor& points_idx) {
  features = features.contiguous();
  alphas = alphas.contiguous();
  points_idx = points_idx.contiguous();

  if (features.type().is_cuda()) {
#ifdef WITH_CUDA
    CHECK_CONTIGUOUS_CUDA(features);
    CHECK_CONTIGUOUS_CUDA(alphas);
    CHECK_CONTIGUOUS_CUDA(points_idx);
#else
    AT_ERROR("Not compiled with GPU support");
#endif

    return weightedSumCudaForward(features, alphas, points_idx);
  } else {
    CHECK_CONTIGUOUS(features);
    CHECK_CONTIGUOUS(alphas);
    CHECK_CONTIGUOUS(points_idx);

    return weightedSumCpuForward(features, alphas, points_idx);
  }
}

std::tuple<torch::Tensor, torch::Tensor> weightedSumBackward(
    torch::Tensor& grad_outputs,
    torch::Tensor& features,
    torch::Tensor& alphas,
    torch::Tensor& points_idx) {
  grad_outputs = grad_outputs.contiguous();
  features = features.contiguous();
  alphas = alphas.contiguous();
  points_idx = points_idx.contiguous();

  if (grad_outputs.type().is_cuda()) {
#ifdef WITH_CUDA
    CHECK_CONTIGUOUS_CUDA(grad_outputs);
    CHECK_CONTIGUOUS_CUDA(features);
    CHECK_CONTIGUOUS_CUDA(alphas);
    CHECK_CONTIGUOUS_CUDA(points_idx);
#else
    AT_ERROR("Not compiled with GPU support");
#endif

    return weightedSumCudaBackward(grad_outputs, features, alphas, points_idx);
  } else {
    CHECK_CONTIGUOUS(grad_outputs);
    CHECK_CONTIGUOUS(features);
    CHECK_CONTIGUOUS(alphas);
    CHECK_CONTIGUOUS(points_idx);

    return weightedSumCpuBackward(grad_outputs, features, alphas, points_idx);
  }
}
pytorch3d/csrc/compositing/weighted_sum_cpu.cpp (new file, 98 lines)
@@ -0,0 +1,98 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.

#include <torch/extension.h>

#include <cmath>
#include <vector>

torch::Tensor weightedSumCpuForward(
    const torch::Tensor& features,
    const torch::Tensor& alphas,
    const torch::Tensor& points_idx) {
  const int64_t B = points_idx.size(0);
  const int64_t K = points_idx.size(1);
  const int64_t H = points_idx.size(2);
  const int64_t W = points_idx.size(3);
  const int64_t C = features.size(0);

  torch::Tensor result = torch::zeros({B, C, H, W}, features.options());

  auto features_a = features.accessor<float, 2>();
  auto alphas_a = alphas.accessor<float, 4>();
  auto points_idx_a = points_idx.accessor<int64_t, 4>();
  auto result_a = result.accessor<float, 4>();

  // Iterate over the batch
  for (int b = 0; b < B; ++b) {
    // Iterate over the features
    for (int c = 0; c < C; ++c) {
      // Iterate through the horizontal lines of the image from top to bottom
      for (int j = 0; j < H; ++j) {
        // Iterate over pixels in a horizontal line, left to right
        for (int i = 0; i < W; ++i) {
          // Iterate through the closest K points for this pixel
          for (int k = 0; k < K; ++k) {
            int64_t n_idx = points_idx_a[b][k][j][i];
            // Sentinel value is -1 indicating no point overlaps the pixel
            if (n_idx < 0) {
              continue;
            }

            float alpha = alphas_a[b][k][j][i];
            result_a[b][c][j][i] += alpha * features_a[c][n_idx];
          }
        }
      }
    }
  }
  return result;
}

std::tuple<torch::Tensor, torch::Tensor> weightedSumCpuBackward(
    const torch::Tensor& grad_outputs,
    const torch::Tensor& features,
    const torch::Tensor& alphas,
    const torch::Tensor& points_idx) {
  const int64_t B = points_idx.size(0);
  const int64_t K = points_idx.size(1);
  const int64_t H = points_idx.size(2);
  const int64_t W = points_idx.size(3);
  const int64_t C = features.size(0);

  torch::Tensor grad_features = torch::zeros_like(features);
  torch::Tensor grad_alphas = torch::zeros_like(alphas);

  auto grad_outputs_a = grad_outputs.accessor<float, 4>();
  auto features_a = features.accessor<float, 2>();
  auto alphas_a = alphas.accessor<float, 4>();
  auto points_idx_a = points_idx.accessor<int64_t, 4>();
  auto grad_features_a = grad_features.accessor<float, 2>();
  auto grad_alphas_a = grad_alphas.accessor<float, 4>();

  // Iterate over the batch
  for (int b = 0; b < B; ++b) {
    // Iterate over the features
    for (int c = 0; c < C; ++c) {
      // Iterate through the horizontal lines of the image from top to bottom
      for (int j = 0; j < H; ++j) {
        // Iterate over pixels in a horizontal line, left to right
        for (int i = 0; i < W; ++i) {
          // Iterate through the closest K points for this pixel
          for (int k = 0; k < K; ++k) {
            int64_t n_idx = points_idx_a[b][k][j][i];
            // Sentinel value is -1, indicating no point overlaps this pixel
            if (n_idx < 0) {
              continue;
            }

            float alpha = alphas_a[b][k][j][i];
            grad_alphas_a[b][k][j][i] +=
                grad_outputs_a[b][c][j][i] * features_a[c][n_idx];
            grad_features_a[c][n_idx] += grad_outputs_a[b][c][j][i] * alpha;
          }
        }
      }
    }
  }
  return std::make_tuple(grad_features, grad_alphas);
}
@@ -1,6 +1,9 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.

#include <torch/extension.h>
#include "compositing/alpha_composite.h"
#include "compositing/norm_weighted_sum.h"
#include "compositing/weighted_sum.h"
#include "face_areas_normals/face_areas_normals.h"
#include "gather_scatter/gather_scatter.h"
#include "nearest_neighbor_points/nearest_neighbor_points.h"
@@ -20,6 +23,14 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("rasterize_meshes_backward", &RasterizeMeshesBackward);
  m.def("rasterize_meshes", &RasterizeMeshes);

  // Accumulation functions
  m.def("accum_weightedsumnorm", &weightedSumNormForward);
  m.def("accum_weightedsum", &weightedSumForward);
  m.def("accum_alphacomposite", &alphaCompositeForward);
  m.def("accum_weightedsumnorm_backward", &weightedSumNormBackward);
  m.def("accum_weightedsum_backward", &weightedSumBackward);
  m.def("accum_alphacomposite_backward", &alphaCompositeBackward);

  // These are only visible for testing; users should not call them directly
  m.def("_rasterize_points_coarse", &RasterizePointsCoarse);
  m.def("_rasterize_points_naive", &RasterizePointsNaive);