remove final nearest_neighbor files

Summary: A couple of files from the removed nearest_neighbor functionality were left behind; this removes them.

Reviewed By: nikhilaravi

Differential Revision: D21088624

fbshipit-source-id: 4bb29016b4e5f63102765b384c363733b60032fa
Jeremy Reizenstein 2020-04-17 09:23:49 -07:00 committed by Facebook GitHub Bot
parent 04d8bf6a43
commit e19df58766
2 changed files with 0 additions and 271 deletions


@@ -1,229 +0,0 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <ATen/ATen.h>
#include <float.h>
#include "utils/warp_reduce.cuh"
// CUDA kernel to compute nearest neighbors between two batches of pointclouds
// where each point is of dimension D.
//
// Args:
// points1: First set of points, of shape (N, P1, D).
// points2: Second set of points, of shape (N, P2, D).
// idx: Output memory buffer of shape (N, P1).
// N: Batch size.
// P1: Number of points in points1.
// P2: Number of points in points2.
// D_2: Size of the shared buffer for the query point; this is D rounded up
// to an even number so that memory accesses are aligned.
//
template <typename scalar_t>
__global__ void NearestNeighborKernel(
const scalar_t* __restrict__ points1,
const scalar_t* __restrict__ points2,
int64_t* __restrict__ idx,
const size_t N,
const size_t P1,
const size_t P2,
const size_t D,
const size_t D_2) {
// Each block will compute one element of the output idx[n, i]. Within the
// block we will use threads to compute the distances between points1[n, i]
// and points2[n, j] for all 0 <= j < P2, then use a block reduction to
// take an argmin of the distances.
// Shared buffers for the threads in the block. CUDA only allows declaration
// of a single shared buffer, so it needs to be manually sliced and cast to
// build several logical shared buffers of different types.
extern __shared__ char shared_buf[];
  scalar_t* x = (scalar_t*)shared_buf; // scalar_t[D_2]
scalar_t* min_dists = &x[D_2]; // scalar_t[NUM_THREADS]
int64_t* min_idxs = (int64_t*)&min_dists[blockDim.x]; // int64_t[NUM_THREADS]
const size_t n = blockIdx.y; // index of batch element.
const size_t i = blockIdx.x; // index of point within batch element.
const size_t tid = threadIdx.x;
// Thread 0 copies points1[n, i, :] into x.
if (tid == 0) {
for (size_t d = 0; d < D; d++) {
x[d] = points1[n * (P1 * D) + i * D + d];
}
}
__syncthreads();
  // Compute the distances between points1[n, i] and points2[n, j] for
  // all 0 <= j < P2. Each thread reduces over ceil(P2 / blockDim.x) points
  // in serial and stores its partial result in shared memory.
scalar_t min_dist = FLT_MAX;
size_t min_idx = 0;
for (size_t j = tid; j < P2; j += blockDim.x) {
scalar_t dist = 0;
for (size_t d = 0; d < D; d++) {
scalar_t x_d = x[d];
scalar_t y_d = points2[n * (P2 * D) + j * D + d];
scalar_t diff = x_d - y_d;
dist += diff * diff;
}
min_dist = (j == tid) ? dist : min_dist;
min_idx = (dist <= min_dist) ? j : min_idx;
min_dist = (dist <= min_dist) ? dist : min_dist;
}
min_dists[tid] = min_dist;
min_idxs[tid] = min_idx;
__syncthreads();
// Perform reduction in shared memory.
for (int s = blockDim.x / 2; s > 32; s >>= 1) {
if (tid < s) {
if (min_dists[tid] > min_dists[tid + s]) {
min_dists[tid] = min_dists[tid + s];
min_idxs[tid] = min_idxs[tid + s];
}
}
__syncthreads();
}
  // Unroll the last 6 iterations of the loop (s = 32 down to s = 1) since
  // they happen within a single warp, which executes in lockstep.
if (tid < 32)
WarpReduce<scalar_t>(min_dists, min_idxs, tid);
// Finally thread 0 writes the result to the output buffer.
if (tid == 0) {
idx[n * P1 + i] = min_idxs[0];
}
}
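// Note: WarpReduce comes from utils/warp_reduce.cuh and is not part of this
// diff. A minimal sketch of the conventional warp-synchronous argmin it is
// assumed to implement (volatile pointers keep shared-memory reads in the
// final warp from being cached in registers, so no __syncthreads is needed):
//
//   template <typename scalar_t>
//   __device__ void WarpReduce(
//       volatile scalar_t* min_dists,
//       volatile int64_t* min_idxs,
//       const size_t tid) {
//     // Handles the strides s = 32, 16, 8, 4, 2, 1 left over from the
//     // shared-memory reduction loop above.
//     for (int s = 32; s > 0; s >>= 1) {
//       if (min_dists[tid] > min_dists[tid + s]) {
//         min_dists[tid] = min_dists[tid + s];
//         min_idxs[tid] = min_idxs[tid + s];
//       }
//     }
//   }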
// CUDA kernel to compute nearest neighbors between two sets of 3-dimensional
// pointclouds. This is a specialization of NearestNeighborKernel to the
// case D=3.
//
// Args:
// points1: First set of pointclouds, of shape (N, P1, 3).
// points2: Second set of pointclouds, of shape (N, P2, 3).
// idx: Output memory buffer of shape (N, P1).
// N: Batch size.
// P1: Number of points in points1.
// P2: Number of points in points2.
//
template <typename scalar_t>
__global__ void NearestNeighborKernelD3(
const scalar_t* __restrict__ points1,
const scalar_t* __restrict__ points2,
int64_t* __restrict__ idx,
const size_t N,
const size_t P1,
const size_t P2) {
// Single shared memory buffer which is split and cast to different types.
extern __shared__ char shared_buf[];
scalar_t* min_dists = (scalar_t*)shared_buf; // scalar_t[NUM_THREADS]
int64_t* min_idxs = (int64_t*)&min_dists[blockDim.x]; // int64_t[NUM_THREADS]
const size_t D = 3;
const size_t n = blockIdx.y; // index of batch element.
const size_t i = blockIdx.x; // index of point within batch element.
const size_t tid = threadIdx.x;
// Retrieve the coordinates of points1[n, i] from global memory; these
// will be stored in registers for fast access.
const scalar_t x = points1[n * (P1 * D) + i * D + 0];
const scalar_t y = points1[n * (P1 * D) + i * D + 1];
const scalar_t z = points1[n * (P1 * D) + i * D + 2];
// Compute distances between points1[n, i] and all points2[n, j]
// for 0 <= j < P2
scalar_t min_dist = FLT_MAX;
size_t min_idx = 0;
// Distance computation for points in p2 spread across threads in the block.
for (size_t j = tid; j < P2; j += blockDim.x) {
scalar_t dx = x - points2[n * (P2 * D) + j * D + 0];
scalar_t dy = y - points2[n * (P2 * D) + j * D + 1];
scalar_t dz = z - points2[n * (P2 * D) + j * D + 2];
scalar_t dist = dx * dx + dy * dy + dz * dz;
min_dist = (j == tid) ? dist : min_dist;
min_idx = (dist <= min_dist) ? j : min_idx;
min_dist = (dist <= min_dist) ? dist : min_dist;
}
min_dists[tid] = min_dist;
min_idxs[tid] = min_idx;
// Synchronize local threads writing to the shared memory buffer.
__syncthreads();
// Perform reduction in shared memory.
for (int s = blockDim.x / 2; s > 32; s >>= 1) {
if (tid < s) {
if (min_dists[tid] > min_dists[tid + s]) {
min_dists[tid] = min_dists[tid + s];
min_idxs[tid] = min_idxs[tid + s];
}
}
// Synchronize local threads so that min_dists is correct.
__syncthreads();
}
  // Unroll the last 6 iterations of the loop (s = 32 down to s = 1) since
  // they happen within a single warp, which executes in lockstep.
if (tid < 32)
WarpReduce<scalar_t>(min_dists, min_idxs, tid);
// Finally thread 0 writes the result to the output buffer.
if (tid == 0) {
idx[n * P1 + i] = min_idxs[0];
}
}
at::Tensor NearestNeighborIdxCuda(at::Tensor p1, at::Tensor p2) {
const auto N = p1.size(0);
const auto P1 = p1.size(1);
const auto P2 = p2.size(1);
const auto D = p1.size(2);
AT_ASSERTM(p2.size(2) == D, "Point sets must have same last dimension.");
auto idx = at::empty({N, P1}, p1.options().dtype(at::kLong));
// On P100 with pointclouds of size (16, 5000, 3), 128 threads per block
// gives best results.
const int threads = 128;
const dim3 blocks(P1, N);
if (D == 3) {
// Use the specialized kernel for D=3.
AT_DISPATCH_FLOATING_TYPES(
p1.scalar_type(), "nearest_neighbor_v3_cuda", ([&] {
          size_t shared_size =
              threads * sizeof(scalar_t) + threads * sizeof(int64_t);
NearestNeighborKernelD3<scalar_t><<<blocks, threads, shared_size>>>(
p1.data_ptr<scalar_t>(),
p2.data_ptr<scalar_t>(),
idx.data_ptr<int64_t>(),
N,
P1,
P2);
}));
} else {
// Use the general kernel for all other D.
AT_DISPATCH_FLOATING_TYPES(
        p1.scalar_type(), "nearest_neighbor_cuda", ([&] {
          // To avoid misaligned memory access, the size of the shared buffer
          // needs to be rounded up to the next even size.
size_t D_2 = D + (D % 2);
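          // For example, D = 5 is padded to D_2 = 6; an even D is unchanged.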
          size_t shared_size = (D_2 + threads) * sizeof(scalar_t);
shared_size += threads * sizeof(int64_t);
NearestNeighborKernel<scalar_t><<<blocks, threads, shared_size>>>(
p1.data_ptr<scalar_t>(),
p2.data_ptr<scalar_t>(),
idx.data_ptr<int64_t>(),
N,
P1,
P2,
D,
D_2);
}));
}
return idx;
}
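
For reference, a minimal sketch of how this entry point would be driven from host code; the shapes mirror the P100 benchmark noted above, and the sketch is illustrative rather than part of the removed files:

// Both inputs must be contiguous CUDA tensors of shape (N, P, D) with
// matching N and D.
at::Tensor p1 = at::rand({16, 5000, 3}, at::device(at::kCUDA).dtype(at::kFloat));
at::Tensor p2 = at::rand({16, 5000, 3}, at::device(at::kCUDA).dtype(at::kFloat));
// idx[n][i] is the index j minimizing ||p1[n, i] - p2[n, j]||^2.
at::Tensor idx = NearestNeighborIdxCuda(p1, p2); // LongTensor, shape (16, 5000)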


@@ -1,42 +0,0 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#pragma once
#include <torch/extension.h>
#include "utils/pytorch3d_cutils.h"
// Compute indices of nearest neighbors in pointcloud p2 to points
// in pointcloud p1.
//
// Args:
// p1: FloatTensor of shape (N, P1, D) giving a batch of pointclouds each
// containing P1 points of dimension D.
// p2: FloatTensor of shape (N, P2, D) giving a batch of pointclouds each
// containing P2 points of dimension D.
//
// Returns:
// p1_neighbor_idx: LongTensor of shape (N, P1), where
// p1_neighbor_idx[n, i] = j means that the nearest neighbor
// to p1[n, i] in the cloud p2[n] is p2[n, j].
//
// CPU implementation.
at::Tensor NearestNeighborIdxCpu(at::Tensor p1, at::Tensor p2);
#ifdef WITH_CUDA
// CUDA implementation.
at::Tensor NearestNeighborIdxCuda(at::Tensor p1, at::Tensor p2);
#endif
// Exposed entry point: dispatches to the CUDA or CPU implementation.
at::Tensor NearestNeighborIdx(at::Tensor p1, at::Tensor p2) {
if (p1.is_cuda() && p2.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(p1);
CHECK_CONTIGUOUS_CUDA(p2);
return NearestNeighborIdxCuda(p1, p2);
#else
AT_ERROR("Not compiled with GPU support.");
#endif
}
return NearestNeighborIdxCpu(p1, p2);
}
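
The registration that exposed NearestNeighborIdx to Python is not part of this diff. A hedged sketch of what such a binding typically looks like with torch/extension.h; the exported name "nn_points_idx" is an assumption, not taken from the removed files:

#include <torch/extension.h>

// Hypothetical registration sketch; module and function names are assumptions.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("nn_points_idx", &NearestNeighborIdx);
}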