From e19df58766be1b2f02f466fd957e6fd0e566731c Mon Sep 17 00:00:00 2001
From: Jeremy Reizenstein
Date: Fri, 17 Apr 2020 09:23:49 -0700
Subject: [PATCH] remove final nearest_neighbor files

Summary: A couple of files for the removed nearest_neighbor functionality are left behind.

Reviewed By: nikhilaravi

Differential Revision: D21088624

fbshipit-source-id: 4bb29016b4e5f63102765b384c363733b60032fa
---
 .../nearest_neighbor_points.cu               | 229 ------------------
 .../nearest_neighbor_points.h                |  42 ----
 2 files changed, 271 deletions(-)
 delete mode 100644 pytorch3d/csrc/nearest_neighbor_points/nearest_neighbor_points.cu
 delete mode 100644 pytorch3d/csrc/nearest_neighbor_points/nearest_neighbor_points.h

diff --git a/pytorch3d/csrc/nearest_neighbor_points/nearest_neighbor_points.cu b/pytorch3d/csrc/nearest_neighbor_points/nearest_neighbor_points.cu
deleted file mode 100644
index f5c10904..00000000
--- a/pytorch3d/csrc/nearest_neighbor_points/nearest_neighbor_points.cu
+++ /dev/null
@@ -1,229 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
-
-#include <ATen/ATen.h>
-#include <float.h>
-#include "utils/warp_reduce.cuh"
-
-// CUDA kernel to compute nearest neighbors between two batches of pointclouds
-// where each point is of dimension D.
-//
-// Args:
-//    points1: First set of points, of shape (N, P1, D).
-//    points2: Second set of points, of shape (N, P2, D).
-//    idx: Output memory buffer of shape (N, P1).
-//    N: Batch size.
-//    P1: Number of points in points1.
-//    P2: Number of points in points2.
-//    D_2: Size of the shared buffer; this is D rounded up so that memory access
-//         is aligned.
-//
-template <typename scalar_t>
-__global__ void NearestNeighborKernel(
-    const scalar_t* __restrict__ points1,
-    const scalar_t* __restrict__ points2,
-    int64_t* __restrict__ idx,
-    const size_t N,
-    const size_t P1,
-    const size_t P2,
-    const size_t D,
-    const size_t D_2) {
-  // Each block will compute one element of the output idx[n, i]. Within the
-  // block we will use threads to compute the distances between points1[n, i]
-  // and points2[n, j] for all 0 <= j < P2, then use a block reduction to
-  // take an argmin of the distances.
-
-  // Shared buffers for the threads in the block. CUDA only allows declaration
-  // of a single shared buffer, so it needs to be manually sliced and cast to
-  // build several logical shared buffers of different types.
-  extern __shared__ char shared_buf[];
-  scalar_t* x = (scalar_t*)shared_buf; // scalar_t[D_2]
-  scalar_t* min_dists = &x[D_2]; // scalar_t[NUM_THREADS]
-  int64_t* min_idxs = (int64_t*)&min_dists[blockDim.x]; // int64_t[NUM_THREADS]
-
-  const size_t n = blockIdx.y; // index of batch element.
-  const size_t i = blockIdx.x; // index of point within batch element.
-  const size_t tid = threadIdx.x;
-
-  // Thread 0 copies points1[n, i, :] into x.
-  if (tid == 0) {
-    for (size_t d = 0; d < D; d++) {
-      x[d] = points1[n * (P1 * D) + i * D + d];
-    }
-  }
-  __syncthreads();
-
-  // Compute the distances between points1[n, i] and points2[n, j] for
-  // all 0 <= j < P2. Here each thread will reduce over P2 / blockDim.x
-  // in serial, and store its result to shared memory
-  scalar_t min_dist = FLT_MAX;
-  size_t min_idx = 0;
-  for (size_t j = tid; j < P2; j += blockDim.x) {
-    scalar_t dist = 0;
-    for (size_t d = 0; d < D; d++) {
-      scalar_t x_d = x[d];
-      scalar_t y_d = points2[n * (P2 * D) + j * D + d];
-      scalar_t diff = x_d - y_d;
-      dist += diff * diff;
-    }
-    min_dist = (j == tid) ? dist : min_dist;
-    min_idx = (dist <= min_dist) ? j : min_idx;
-    min_dist = (dist <= min_dist) ? dist : min_dist;
-  }
-  min_dists[tid] = min_dist;
-  min_idxs[tid] = min_idx;
-  __syncthreads();
-
-  // Perform reduction in shared memory.
-  for (int s = blockDim.x / 2; s > 32; s >>= 1) {
-    if (tid < s) {
-      if (min_dists[tid] > min_dists[tid + s]) {
-        min_dists[tid] = min_dists[tid + s];
-        min_idxs[tid] = min_idxs[tid + s];
-      }
-    }
-    __syncthreads();
-  }
-
-  // Unroll the last 6 iterations of the loop since they will happen
-  // synchronized within a single warp.
-  if (tid < 32)
-    WarpReduce<scalar_t>(min_dists, min_idxs, tid);
-
-  // Finally thread 0 writes the result to the output buffer.
-  if (tid == 0) {
-    idx[n * P1 + i] = min_idxs[0];
-  }
-}
-
-// CUDA kernel to compute nearest neighbors between two sets of 3-dimensional
-// pointclouds. This is a specialization of the nearest_neighbor_kernel
-// to the case D=3.
-//
-// Args:
-//    points1: First set of pointclouds, of shape (N, P1, 3).
-//    points2: Second set of pointclouds, of shape (N, P2, 3).
-//    idx: Output memory buffer of shape (N, P1).
-//    N: Batch size.
-//    P1: Number of points in points1.
-//    P2: Number of points in points2.
-//
-template <typename scalar_t>
-__global__ void NearestNeighborKernelD3(
-    const scalar_t* __restrict__ points1,
-    const scalar_t* __restrict__ points2,
-    int64_t* __restrict__ idx,
-    const size_t N,
-    const size_t P1,
-    const size_t P2) {
-  // Single shared memory buffer which is split and cast to different types.
-  extern __shared__ char shared_buf[];
-  scalar_t* min_dists = (scalar_t*)shared_buf; // scalar_t[NUM_THREADS]
-  int64_t* min_idxs = (int64_t*)&min_dists[blockDim.x]; // int64_t[NUM_THREADS]
-
-  const size_t D = 3;
-  const size_t n = blockIdx.y; // index of batch element.
-  const size_t i = blockIdx.x; // index of point within batch element.
-  const size_t tid = threadIdx.x;
-
-  // Retrieve the coordinates of points1[n, i] from global memory; these
-  // will be stored in registers for fast access.
-  const scalar_t x = points1[n * (P1 * D) + i * D + 0];
-  const scalar_t y = points1[n * (P1 * D) + i * D + 1];
-  const scalar_t z = points1[n * (P1 * D) + i * D + 2];
-
-  // Compute distances between points1[n, i] and all points2[n, j]
-  // for 0 <= j < P2
-  scalar_t min_dist = FLT_MAX;
-  size_t min_idx = 0;
-
-  // Distance computation for points in p2 spread across threads in the block.
-  for (size_t j = tid; j < P2; j += blockDim.x) {
-    scalar_t dx = x - points2[n * (P2 * D) + j * D + 0];
-    scalar_t dy = y - points2[n * (P2 * D) + j * D + 1];
-    scalar_t dz = z - points2[n * (P2 * D) + j * D + 2];
-    scalar_t dist = dx * dx + dy * dy + dz * dz;
-    min_dist = (j == tid) ? dist : min_dist;
-    min_idx = (dist <= min_dist) ? j : min_idx;
-    min_dist = (dist <= min_dist) ? dist : min_dist;
-  }
-  min_dists[tid] = min_dist;
-  min_idxs[tid] = min_idx;
-
-  // Synchronize local threads writing to the shared memory buffer.
-  __syncthreads();
-
-  // Perform reduction in shared memory.
-  for (int s = blockDim.x / 2; s > 32; s >>= 1) {
-    if (tid < s) {
-      if (min_dists[tid] > min_dists[tid + s]) {
-        min_dists[tid] = min_dists[tid + s];
-        min_idxs[tid] = min_idxs[tid + s];
-      }
-    }
-
-    // Synchronize local threads so that min_dists is correct.
-    __syncthreads();
-  }
-
-  // Unroll the last 6 iterations of the loop since they will happen
-  // synchronized within a single warp.
-  if (tid < 32)
-    WarpReduce<scalar_t>(min_dists, min_idxs, tid);
-
-  // Finally thread 0 writes the result to the output buffer.
-  if (tid == 0) {
-    idx[n * P1 + i] = min_idxs[0];
-  }
-}
-
-at::Tensor NearestNeighborIdxCuda(at::Tensor p1, at::Tensor p2) {
-  const auto N = p1.size(0);
-  const auto P1 = p1.size(1);
-  const auto P2 = p2.size(1);
-  const auto D = p1.size(2);
-
-  AT_ASSERTM(p2.size(2) == D, "Point sets must have same last dimension.");
-  auto idx = at::empty({N, P1}, p1.options().dtype(at::kLong));
-
-  // On P100 with pointclouds of size (16, 5000, 3), 128 threads per block
-  // gives best results.
-  const int threads = 128;
-  const dim3 blocks(P1, N);
-
-  if (D == 3) {
-    // Use the specialized kernel for D=3.
-    AT_DISPATCH_FLOATING_TYPES(
-        p1.scalar_type(), "nearest_neighbor_v3_cuda", ([&] {
-          size_t shared_size =
-              threads * sizeof(size_t) + threads * sizeof(int64_t);
-          NearestNeighborKernelD3<scalar_t><<<blocks, threads, shared_size>>>(
-              p1.data_ptr<scalar_t>(),
-              p2.data_ptr<scalar_t>(),
-              idx.data_ptr<int64_t>(),
-              N,
-              P1,
-              P2);
-        }));
-  } else {
-    // Use the general kernel for all other D.
-    AT_DISPATCH_FLOATING_TYPES(
-        p1.scalar_type(), "nearest_neighbor_v3_cuda", ([&] {
-          // To avoid misaligned memory access, the size of shared buffers
-          // need to be rounded to the next even size.
-          size_t D_2 = D + (D % 2);
-          size_t shared_size = (D_2 + threads) * sizeof(size_t);
-          shared_size += threads * sizeof(int64_t);
-          NearestNeighborKernel<scalar_t><<<blocks, threads, shared_size>>>(
-              p1.data_ptr<scalar_t>(),
-              p2.data_ptr<scalar_t>(),
-              idx.data_ptr<int64_t>(),
-              N,
-              P1,
-              P2,
-              D,
-              D_2);
-        }));
-  }
-
-  return idx;
-}
diff --git a/pytorch3d/csrc/nearest_neighbor_points/nearest_neighbor_points.h b/pytorch3d/csrc/nearest_neighbor_points/nearest_neighbor_points.h
deleted file mode 100644
index a88ed113..00000000
--- a/pytorch3d/csrc/nearest_neighbor_points/nearest_neighbor_points.h
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
-
-#pragma once
-#include <torch/extension.h>
-#include "utils/pytorch3d_cutils.h"
-
-// Compute indices of nearest neighbors in pointcloud p2 to points
-// in pointcloud p1.
-//
-// Args:
-//    p1: FloatTensor of shape (N, P1, D) giving a batch of pointclouds each
-//        containing P1 points of dimension D.
-//    p2: FloatTensor of shape (N, P2, D) giving a batch of pointclouds each
-//        containing P2 points of dimension D.
-//
-// Returns:
-//    p1_neighbor_idx: LongTensor of shape (N, P1), where
-//        p1_neighbor_idx[n, i] = j means that the nearest neighbor
-//        to p1[n, i] in the cloud p2[n] is p2[n, j].
-//
-
-// CPU implementation.
-at::Tensor NearestNeighborIdxCpu(at::Tensor p1, at::Tensor p2);
-
-#ifdef WITH_CUDA
-// Cuda implementation.
-at::Tensor NearestNeighborIdxCuda(at::Tensor p1, at::Tensor p2);
-#endif
-
-// Implementation which is exposed.
-at::Tensor NearestNeighborIdx(at::Tensor p1, at::Tensor p2) {
-  if (p1.is_cuda() && p2.is_cuda()) {
-#ifdef WITH_CUDA
-    CHECK_CONTIGUOUS_CUDA(p1);
-    CHECK_CONTIGUOUS_CUDA(p2);
-    return NearestNeighborIdxCuda(p1, p2);
-#else
-    AT_ERROR("Not compiled with GPU support.");
-#endif
-  }
-  return NearestNeighborIdxCpu(p1, p2);
-}
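
For reference, the deleted header documents NearestNeighborIdx(p1, p2) as taking p1 of shape (N, P1, D) and p2 of shape (N, P2, D) and returning a LongTensor idx of shape (N, P1), where idx[n, i] = j means that p2[n, j] is the closest point in p2[n] to p1[n, i]. The snippet below is a minimal ATen sketch of that contract only; the function name NearestNeighborIdxReference and the brute-force broadcast formulation are illustrative and are not part of the removed code.

#include <ATen/ATen.h>

// Illustrative reference, not the removed implementation: spells out the
// contract documented in nearest_neighbor_points.h using plain ATen ops.
at::Tensor NearestNeighborIdxReference(at::Tensor p1, at::Tensor p2) {
  // p1: (N, P1, D), p2: (N, P2, D) -> pairwise squared distances (N, P1, P2).
  at::Tensor diff = p1.unsqueeze(2) - p2.unsqueeze(1); // (N, P1, P2, D)
  at::Tensor dists = (diff * diff).sum(-1);            // (N, P1, P2)
  // Argmin over the P2 axis: for every p1[n, i], the index of its nearest
  // neighbor in p2[n]. Result is a LongTensor of shape (N, P1).
  return dists.argmin(-1);
}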
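
Both deleted kernels follow the same pattern: each thread makes a strided serial pass over the P2 candidate points, then the block runs a shared-memory tree reduction over (distance, index) pairs, with the final warp unrolled via WarpReduce from utils/warp_reduce.cuh. The standalone kernel below sketches that block-level argmin pattern in isolation; the names RowArgMinKernel, vals, and out are illustrative, and the warp-unrolled tail is replaced by reducing all the way down for simplicity.

#include <cfloat>
#include <cstdint>

// Illustrative sketch, not from the removed file: each block reduces one row
// of a row-major (rows, cols) matrix to the index of its minimum element.
// kThreads must equal blockDim.x and be a power of two, e.g.
//   RowArgMinKernel<128><<<rows, 128>>>(vals, out, cols);
template <int kThreads>
__global__ void RowArgMinKernel(
    const float* __restrict__ vals, // (rows, cols)
    int64_t* __restrict__ out,      // (rows,)
    const size_t cols) {
  __shared__ float min_vals[kThreads];
  __shared__ int64_t min_idxs[kThreads];

  const size_t row = blockIdx.x;
  const size_t tid = threadIdx.x;

  // Serial phase: each thread scans a strided slice of its row.
  float best = FLT_MAX;
  int64_t best_j = 0;
  for (size_t j = tid; j < cols; j += blockDim.x) {
    const float v = vals[row * cols + j];
    if (v < best) {
      best = v;
      best_j = static_cast<int64_t>(j);
    }
  }
  min_vals[tid] = best;
  min_idxs[tid] = best_j;
  __syncthreads();

  // Tree reduction: halve the number of active threads each step, keeping the
  // smaller value (and its index) from each pair of shared-memory slots.
  for (int s = kThreads / 2; s > 0; s >>= 1) {
    if (tid < s && min_vals[tid + s] < min_vals[tid]) {
      min_vals[tid] = min_vals[tid + s];
      min_idxs[tid] = min_idxs[tid + s];
    }
    __syncthreads();
  }

  // Thread 0 holds the row-wide argmin.
  if (tid == 0) {
    out[row] = min_idxs[0];
  }
}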