From 8ea4da293846e64bee5cc99c8cb008888479947f Mon Sep 17 00:00:00 2001
From: Max Wasylow
Date: Fri, 19 Aug 2022 09:17:19 -0700
Subject: [PATCH] Replacing custom CUDA block reductions with CUB in
 sample_farthest_points

Summary:
Remove the hardcoded block reduction operation from
`sample_farthest_points.cu` and replace it with `cub::BlockReduce`,
reducing the complexity of the code and letting an established library
do the thinking for us.

Reviewed By: bottler

Differential Revision: D38617147

fbshipit-source-id: b230029c55f05cda0aab1648d3105a8d3e92d27b
---
 .../sample_farthest_points.cu                 | 76 +++++++------------
 1 file changed, 26 insertions(+), 50 deletions(-)

diff --git a/pytorch3d/csrc/sample_farthest_points/sample_farthest_points.cu b/pytorch3d/csrc/sample_farthest_points/sample_farthest_points.cu
index a91e2df6..d9b8df81 100644
--- a/pytorch3d/csrc/sample_farthest_points/sample_farthest_points.cu
+++ b/pytorch3d/csrc/sample_farthest_points/sample_farthest_points.cu
@@ -12,6 +12,7 @@
 #include
 #include
 #include
+#include <cub/cub.cuh>
 #include "utils/warp_reduce.cuh"
 
 template <unsigned int block_size>
@@ -25,20 +26,19 @@ __global__ void FarthestPointSamplingKernel(
     const at::PackedTensorAccessor64 start_idxs
     // clang-format on
 ) {
+  typedef cub::BlockReduce<
+      cub::KeyValuePair<int64_t, float>,
+      block_size,
+      cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY>
+      BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  __shared__ int64_t selected_store;
+
   // Get constants
   const int64_t N = points.size(0);
   const int64_t P = points.size(1);
   const int64_t D = points.size(2);
 
-  // Create single shared memory buffer which is split and cast to different
-  // types: dists/dists_idx are used to save the maximum distances seen by the
-  // points processed by any one thread and the associated point indices.
-  // These values only need to be accessed by other threads in this block which
-  // are processing the same batch and not by other blocks.
-  extern __shared__ char shared_buf[];
-  float* dists = (float*)shared_buf; // block_size floats
-  int64_t* dists_idx = (int64_t*)&dists[block_size]; // block_size int64_t
-
   // Get batch index and thread index
   const int64_t batch_idx = blockIdx.x;
   const size_t tid = threadIdx.x;
@@ -82,43 +82,26 @@ __global__ void FarthestPointSamplingKernel(
       max_dist = (p_min_dist > max_dist) ? p_min_dist : max_dist;
     }
 
-    // After going through all points for this thread, save the max
-    // point and idx seen by this thread. Each thread sees P/block_size points.
-    dists[tid] = max_dist;
-    dists_idx[tid] = max_dist_idx;
-    // Sync to ensure all threads in the block have updated their max point.
-    __syncthreads();
-
-    // Parallelized block reduction to find the max point seen by
-    // all the threads in this block for iteration k.
-    // Each block represents one batch element so we can use a divide/conquer
-    // approach to find the max, syncing all threads after each step.
-
-    for (int s = block_size / 2; s > 0; s >>= 1) {
-      if (tid < s) {
-        // Compare the best point seen by two threads and update the shared
-        // memory at the location of the first thread index with the max out
-        // of the two threads.
-        if (dists[tid] < dists[tid + s]) {
-          dists[tid] = dists[tid + s];
-          dists_idx[tid] = dists_idx[tid + s];
-        }
-      }
-      __syncthreads();
-    }
-
-    // TODO(nikhilar): As reduction proceeds, the number of "active" threads
-    // decreases. When tid < 32, there should only be one warp left which could
-    // be unrolled.
-
-    // The overall max after reducing will be saved
-    // at the location of tid = 0.
-    selected = dists_idx[0];
+    // max_dist, max_dist_idx are now the max point and idx seen by this thread.
+    // Now find the index corresponding to the maximum distance seen by any
+    // thread. (This value is only on thread 0.)
+    selected =
+        BlockReduce(temp_storage)
+            .Reduce(
+                cub::KeyValuePair<int64_t, float>(max_dist_idx, max_dist),
+                cub::ArgMax(),
+                block_size)
+            .key;
 
     if (tid == 0) {
       // Write the farthest point for iteration k to global memory
       idxs[batch_idx][k] = selected;
+      selected_store = selected;
     }
+
+    // Ensure `selected` in all threads equals the global maximum.
+    __syncthreads();
+    selected = selected_store;
   }
 }
 
@@ -185,15 +168,8 @@ at::Tensor FarthestPointSamplingCuda(
   auto min_point_dist_a =
       min_point_dist.packed_accessor64();
 
-  // Initialize the shared memory which will be used to store the
-  // distance/index of the best point seen by each thread.
-  size_t shared_mem = threads * sizeof(float) + threads * sizeof(int64_t);
-  // TODO: using shared memory for min_point_dist gives an ~2x speed up
-  // compared to using a global (N, P) shaped tensor, however for
-  // larger pointclouds this may exceed the shared memory limit per block.
-  // If a speed up is required for smaller pointclouds, then the storage
-  // could be switched to shared memory if the required total shared memory is
-  // within the memory limit per block.
+  // TempStorage for the reduction uses static shared memory only.
+  size_t shared_mem = 0;
 
   // Support a case for all powers of 2 up to MAX_THREADS_PER_BLOCK possible per
   // block.
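Note for reviewers: the replacement reduction relies on cub::KeyValuePair
plus cub::ArgMax to pick the largest distance while carrying its point
index along, so the hand-rolled shared-memory tree reduction and its
dynamic shared memory allocation can both be dropped. Below is a minimal
standalone sketch of the same pattern, not part of this patch; the kernel
name, launch configuration, and test data are illustrative only.

#include <cfloat>
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>
#include <cub/cub.cuh>

template <unsigned int block_size>
__global__ void BlockArgMaxKernel(const float* vals, int n, int64_t* out_idx) {
  using KV = cub::KeyValuePair<int64_t, float>;
  typedef cub::BlockReduce<KV, block_size> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  // Each thread scans a strided slice of the input and keeps its best
  // (index, value) pair, mirroring the per-thread max_dist/max_dist_idx loop.
  KV best(0, -FLT_MAX);
  for (int i = threadIdx.x; i < n; i += block_size) {
    if (vals[i] > best.value) {
      best = KV(i, vals[i]);
    }
  }

  // ArgMax compares pairs by value; the reduced result is valid on thread 0.
  KV block_best = BlockReduce(temp_storage).Reduce(best, cub::ArgMax());
  if (threadIdx.x == 0) {
    *out_idx = block_best.key;
  }
}

int main() {
  const int n = 1 << 12;
  std::vector<float> h_vals(n);
  for (int i = 0; i < n; ++i) {
    h_vals[i] = static_cast<float>((i * 37) % 1001);  // arbitrary test data
  }

  float* d_vals = nullptr;
  int64_t* d_idx = nullptr;
  cudaMalloc(&d_vals, n * sizeof(float));
  cudaMalloc(&d_idx, sizeof(int64_t));
  cudaMemcpy(d_vals, h_vals.data(), n * sizeof(float), cudaMemcpyHostToDevice);

  constexpr unsigned int kBlock = 256;
  BlockArgMaxKernel<kBlock><<<1, kBlock>>>(d_vals, n, d_idx);

  int64_t h_idx = -1;
  cudaMemcpy(&h_idx, d_idx, sizeof(int64_t), cudaMemcpyDeviceToHost);
  printf("argmax index: %lld (value %f)\n", (long long)h_idx, h_vals[h_idx]);

  cudaFree(d_vals);
  cudaFree(d_idx);
  return 0;
}

The patch itself additionally passes the number of valid items
(block_size) to Reduce and selects BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY;
the sketch uses CUB's defaults, which is enough to show the pattern.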