From e19df58766be1b2f02f466fd957e6fd0e566731c Mon Sep 17 00:00:00 2001
From: Jeremy Reizenstein
Date: Fri, 17 Apr 2020 09:23:49 -0700
Subject: [PATCH] remove final nearest_neighbor files

Summary: A couple of files for the removed nearest_neighbor functionality are left behind.

Reviewed By: nikhilaravi

Differential Revision: D21088624

fbshipit-source-id: 4bb29016b4e5f63102765b384c363733b60032fa
---
 .../nearest_neighbor_points.cu               | 229 ------------------
 .../nearest_neighbor_points.h                |  42 ----
 2 files changed, 271 deletions(-)
 delete mode 100644 pytorch3d/csrc/nearest_neighbor_points/nearest_neighbor_points.cu
 delete mode 100644 pytorch3d/csrc/nearest_neighbor_points/nearest_neighbor_points.h

diff --git a/pytorch3d/csrc/nearest_neighbor_points/nearest_neighbor_points.cu b/pytorch3d/csrc/nearest_neighbor_points/nearest_neighbor_points.cu
deleted file mode 100644
index f5c10904..00000000
--- a/pytorch3d/csrc/nearest_neighbor_points/nearest_neighbor_points.cu
+++ /dev/null
@@ -1,229 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
-
-#include <ATen/ATen.h>
-#include <float.h>
-#include "utils/warp_reduce.cuh"
-
-// CUDA kernel to compute nearest neighbors between two batches of pointclouds
-// where each point is of dimension D.
-//
-// Args:
-//    points1: First set of points, of shape (N, P1, D).
-//    points2: Second set of points, of shape (N, P2, D).
-//    idx: Output memory buffer of shape (N, P1).
-//    N: Batch size.
-//    P1: Number of points in points1.
-//    P2: Number of points in points2.
-//    D_2: Size of the shared buffer; this is D rounded up so that memory access
-//         is aligned.
-//
-template <typename scalar_t>
-__global__ void NearestNeighborKernel(
-    const scalar_t* __restrict__ points1,
-    const scalar_t* __restrict__ points2,
-    int64_t* __restrict__ idx,
-    const size_t N,
-    const size_t P1,
-    const size_t P2,
-    const size_t D,
-    const size_t D_2) {
-  // Each block will compute one element of the output idx[n, i]. Within the
-  // block we will use threads to compute the distances between points1[n, i]
-  // and points2[n, j] for all 0 <= j < P2, then use a block reduction to
-  // take an argmin of the distances.
-
-  // Shared buffers for the threads in the block. CUDA only allows declaration
-  // of a single shared buffer, so it needs to be manually sliced and cast to
-  // build several logical shared buffers of different types.
-  extern __shared__ char shared_buf[];
-  scalar_t* x = (scalar_t*)shared_buf; // scalar_t[D_2]
-  scalar_t* min_dists = &x[D_2]; // scalar_t[NUM_THREADS]
-  int64_t* min_idxs = (int64_t*)&min_dists[blockDim.x]; // int64_t[NUM_THREADS]
-
-  const size_t n = blockIdx.y; // index of batch element.
-  const size_t i = blockIdx.x; // index of point within batch element.
-  const size_t tid = threadIdx.x;
-
-  // Thread 0 copies points1[n, i, :] into x.
-  if (tid == 0) {
-    for (size_t d = 0; d < D; d++) {
-      x[d] = points1[n * (P1 * D) + i * D + d];
-    }
-  }
-  __syncthreads();
-
-  // Compute the distances between points1[n, i] and points2[n, j] for
-  // all 0 <= j < P2. Here each thread will reduce over P2 / blockDim.x
-  // in serial, and store its result to shared memory
-  scalar_t min_dist = FLT_MAX;
-  size_t min_idx = 0;
-  for (size_t j = tid; j < P2; j += blockDim.x) {
-    scalar_t dist = 0;
-    for (size_t d = 0; d < D; d++) {
-      scalar_t x_d = x[d];
-      scalar_t y_d = points2[n * (P2 * D) + j * D + d];
-      scalar_t diff = x_d - y_d;
-      dist += diff * diff;
-    }
-    min_dist = (j == tid) ? dist : min_dist;
-    min_idx = (dist <= min_dist) ? j : min_idx;
-    min_dist = (dist <= min_dist) ? dist : min_dist;
-  }
-  min_dists[tid] = min_dist;
-  min_idxs[tid] = min_idx;
-  __syncthreads();
-
-  // Perform reduction in shared memory.
-  for (int s = blockDim.x / 2; s > 32; s >>= 1) {
-    if (tid < s) {
-      if (min_dists[tid] > min_dists[tid + s]) {
-        min_dists[tid] = min_dists[tid + s];
-        min_idxs[tid] = min_idxs[tid + s];
-      }
-    }
-    __syncthreads();
-  }
-
-  // Unroll the last 6 iterations of the loop since they will happen
-  // synchronized within a single warp.
-  if (tid < 32)
-    WarpReduce<scalar_t>(min_dists, min_idxs, tid);
-
-  // Finally thread 0 writes the result to the output buffer.
-  if (tid == 0) {
-    idx[n * P1 + i] = min_idxs[0];
-  }
-}
-
-// CUDA kernel to compute nearest neighbors between two sets of 3-dimensional
-// pointclouds. This is a specialization of the nearest_neighbor_kernel
-// to the case D=3.
-//
-// Args:
-//    points1: First set of pointclouds, of shape (N, P1, 3).
-//    points2: Second set of pointclouds, of shape (N, P2, 3).
-//    idx: Output memory buffer of shape (N, P1).
-//    N: Batch size.
-//    P1: Number of points in points1.
-//    P2: Number of points in points2.
-//
-template <typename scalar_t>
-__global__ void NearestNeighborKernelD3(
-    const scalar_t* __restrict__ points1,
-    const scalar_t* __restrict__ points2,
-    int64_t* __restrict__ idx,
-    const size_t N,
-    const size_t P1,
-    const size_t P2) {
-  // Single shared memory buffer which is split and cast to different types.
-  extern __shared__ char shared_buf[];
-  scalar_t* min_dists = (scalar_t*)shared_buf; // scalar_t[NUM_THREADS]
-  int64_t* min_idxs = (int64_t*)&min_dists[blockDim.x]; // int64_t[NUM_THREADS]
-
-  const size_t D = 3;
-  const size_t n = blockIdx.y; // index of batch element.
-  const size_t i = blockIdx.x; // index of point within batch element.
-  const size_t tid = threadIdx.x;
-
-  // Retrieve the coordinates of points1[n, i] from global memory; these
-  // will be stored in registers for fast access.
-  const scalar_t x = points1[n * (P1 * D) + i * D + 0];
-  const scalar_t y = points1[n * (P1 * D) + i * D + 1];
-  const scalar_t z = points1[n * (P1 * D) + i * D + 2];
-
-  // Compute distances between points1[n, i] and all points2[n, j]
-  // for 0 <= j < P2
-  scalar_t min_dist = FLT_MAX;
-  size_t min_idx = 0;
-
-  // Distance computation for points in p2 spread across threads in the block.
-  for (size_t j = tid; j < P2; j += blockDim.x) {
-    scalar_t dx = x - points2[n * (P2 * D) + j * D + 0];
-    scalar_t dy = y - points2[n * (P2 * D) + j * D + 1];
-    scalar_t dz = z - points2[n * (P2 * D) + j * D + 2];
-    scalar_t dist = dx * dx + dy * dy + dz * dz;
-    min_dist = (j == tid) ? dist : min_dist;
-    min_idx = (dist <= min_dist) ? j : min_idx;
-    min_dist = (dist <= min_dist) ? dist : min_dist;
-  }
-  min_dists[tid] = min_dist;
-  min_idxs[tid] = min_idx;
-
-  // Synchronize local threads writing to the shared memory buffer.
-  __syncthreads();
-
-  // Perform reduction in shared memory.
-  for (int s = blockDim.x / 2; s > 32; s >>= 1) {
-    if (tid < s) {
-      if (min_dists[tid] > min_dists[tid + s]) {
-        min_dists[tid] = min_dists[tid + s];
-        min_idxs[tid] = min_idxs[tid + s];
-      }
-    }
-
-    // Synchronize local threads so that min_dists is correct.
-    __syncthreads();
-  }
-
-  // Unroll the last 6 iterations of the loop since they will happen
-  // synchronized within a single warp.
-  if (tid < 32)
-    WarpReduce<scalar_t>(min_dists, min_idxs, tid);
-
-  // Finally thread 0 writes the result to the output buffer.
-  if (tid == 0) {
-    idx[n * P1 + i] = min_idxs[0];
-  }
-}
-
-at::Tensor NearestNeighborIdxCuda(at::Tensor p1, at::Tensor p2) {
-  const auto N = p1.size(0);
-  const auto P1 = p1.size(1);
-  const auto P2 = p2.size(1);
-  const auto D = p1.size(2);
-
-  AT_ASSERTM(p2.size(2) == D, "Point sets must have same last dimension.");
-  auto idx = at::empty({N, P1}, p1.options().dtype(at::kLong));
-
-  // On P100 with pointclouds of size (16, 5000, 3), 128 threads per block
-  // gives best results.
-  const int threads = 128;
-  const dim3 blocks(P1, N);
-
-  if (D == 3) {
-    // Use the specialized kernel for D=3.
-    AT_DISPATCH_FLOATING_TYPES(
-        p1.scalar_type(), "nearest_neighbor_v3_cuda", ([&] {
-          size_t shared_size =
-              threads * sizeof(size_t) + threads * sizeof(int64_t);
-          NearestNeighborKernelD3<scalar_t><<<blocks, threads, shared_size>>>(
-              p1.data_ptr<scalar_t>(),
-              p2.data_ptr<scalar_t>(),
-              idx.data_ptr<int64_t>(),
-              N,
-              P1,
-              P2);
-        }));
-  } else {
-    // Use the general kernel for all other D.
-    AT_DISPATCH_FLOATING_TYPES(
-        p1.scalar_type(), "nearest_neighbor_v3_cuda", ([&] {
-          // To avoid misaligned memory access, the size of shared buffers
-          // need to be rounded to the next even size.
-          size_t D_2 = D + (D % 2);
-          size_t shared_size = (D_2 + threads) * sizeof(size_t);
-          shared_size += threads * sizeof(int64_t);
-          NearestNeighborKernel<scalar_t><<<blocks, threads, shared_size>>>(
-              p1.data_ptr<scalar_t>(),
-              p2.data_ptr<scalar_t>(),
-              idx.data_ptr<int64_t>(),
-              N,
-              P1,
-              P2,
-              D,
-              D_2);
-        }));
-  }
-
-  return idx;
-}
diff --git a/pytorch3d/csrc/nearest_neighbor_points/nearest_neighbor_points.h b/pytorch3d/csrc/nearest_neighbor_points/nearest_neighbor_points.h
deleted file mode 100644
index a88ed113..00000000
--- a/pytorch3d/csrc/nearest_neighbor_points/nearest_neighbor_points.h
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
-
-#pragma once
-#include <torch/extension.h>
-#include "utils/pytorch3d_cutils.h"
-
-// Compute indices of nearest neighbors in pointcloud p2 to points
-// in pointcloud p1.
-//
-// Args:
-//    p1: FloatTensor of shape (N, P1, D) giving a batch of pointclouds each
-//        containing P1 points of dimension D.
-//    p2: FloatTensor of shape (N, P2, D) giving a batch of pointclouds each
-//        containing P2 points of dimension D.
-//
-// Returns:
-//    p1_neighbor_idx: LongTensor of shape (N, P1), where
-//        p1_neighbor_idx[n, i] = j means that the nearest neighbor
-//        to p1[n, i] in the cloud p2[n] is p2[n, j].
-//
-
-// CPU implementation.
-at::Tensor NearestNeighborIdxCpu(at::Tensor p1, at::Tensor p2);
-
-#ifdef WITH_CUDA
-// Cuda implementation.
-at::Tensor NearestNeighborIdxCuda(at::Tensor p1, at::Tensor p2);
-#endif
-
-// Implementation which is exposed.
-at::Tensor NearestNeighborIdx(at::Tensor p1, at::Tensor p2) {
-  if (p1.is_cuda() && p2.is_cuda()) {
-#ifdef WITH_CUDA
-    CHECK_CONTIGUOUS_CUDA(p1);
-    CHECK_CONTIGUOUS_CUDA(p2);
-    return NearestNeighborIdxCuda(p1, p2);
-#else
-    AT_ERROR("Not compiled with GPU support.");
-#endif
-  }
-  return NearestNeighborIdxCpu(p1, p2);
-}
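
For reference, the deleted header documents NearestNeighborIdx(p1, p2) as taking p1 of shape (N, P1, D) and p2 of shape (N, P2, D) and returning a LongTensor idx of shape (N, P1), where idx[n, i] = j means that p2[n, j] is the closest point in p2[n] to p1[n, i]. The snippet below is a minimal ATen sketch of that contract only; the function name NearestNeighborIdxReference and the brute-force broadcast formulation are illustrative and are not part of the removed code.

#include <ATen/ATen.h>

// Illustrative reference, not the removed implementation: spells out the
// contract documented in nearest_neighbor_points.h using plain ATen ops.
at::Tensor NearestNeighborIdxReference(at::Tensor p1, at::Tensor p2) {
  // p1: (N, P1, D), p2: (N, P2, D) -> pairwise squared distances (N, P1, P2).
  at::Tensor diff = p1.unsqueeze(2) - p2.unsqueeze(1); // (N, P1, P2, D)
  at::Tensor dists = (diff * diff).sum(-1);            // (N, P1, P2)
  // Argmin over the P2 axis: for every p1[n, i], the index of its nearest
  // neighbor in p2[n]. Result is a LongTensor of shape (N, P1).
  return dists.argmin(-1);
}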
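
Both deleted kernels follow the same pattern: each thread makes a strided serial pass over the P2 candidate points, then the block runs a shared-memory tree reduction over (distance, index) pairs, with the final warp unrolled via WarpReduce from utils/warp_reduce.cuh. The standalone kernel below sketches that block-level argmin pattern in isolation; the names RowArgMinKernel, vals, and out are illustrative, and the warp-unrolled tail is replaced by reducing all the way down for simplicity.

#include <cfloat>
#include <cstdint>

// Illustrative sketch, not from the removed file: each block reduces one row
// of a row-major (rows, cols) matrix to the index of its minimum element.
// kThreads must equal blockDim.x and be a power of two, e.g.
//   RowArgMinKernel<128><<<rows, 128>>>(vals, out, cols);
template <int kThreads>
__global__ void RowArgMinKernel(
    const float* __restrict__ vals, // (rows, cols)
    int64_t* __restrict__ out,      // (rows,)
    const size_t cols) {
  __shared__ float min_vals[kThreads];
  __shared__ int64_t min_idxs[kThreads];

  const size_t row = blockIdx.x;
  const size_t tid = threadIdx.x;

  // Serial phase: each thread scans a strided slice of its row.
  float best = FLT_MAX;
  int64_t best_j = 0;
  for (size_t j = tid; j < cols; j += blockDim.x) {
    const float v = vals[row * cols + j];
    if (v < best) {
      best = v;
      best_j = static_cast<int64_t>(j);
    }
  }
  min_vals[tid] = best;
  min_idxs[tid] = best_j;
  __syncthreads();

  // Tree reduction: halve the number of active threads each step, keeping the
  // smaller value (and its index) from each pair of shared-memory slots.
  for (int s = kThreads / 2; s > 0; s >>= 1) {
    if (tid < s && min_vals[tid + s] < min_vals[tid]) {
      min_vals[tid] = min_vals[tid + s];
      min_idxs[tid] = min_idxs[tid + s];
    }
    __syncthreads();
  }

  // Thread 0 holds the row-wide argmin.
  if (tid == 0) {
    out[row] = min_idxs[0];
  }
}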