mirror of
https://github.com/facebookresearch/pytorch3d.git
synced 2025-08-03 12:22:49 +08:00
Summary: Implementation of point to mesh distances. The current diff contains two types: (a) Point to Edge (b) Point to Face ``` Benchmark Avg Time(μs) Peak Time(μs) Iterations -------------------------------------------------------------------------------- POINT_MESH_EDGE_4_100_300_5000_cuda:0 2745 3138 183 POINT_MESH_EDGE_4_100_300_10000_cuda:0 4408 4499 114 POINT_MESH_EDGE_4_100_3000_5000_cuda:0 4978 5070 101 POINT_MESH_EDGE_4_100_3000_10000_cuda:0 9076 9187 56 POINT_MESH_EDGE_4_1000_300_5000_cuda:0 1411 1487 355 POINT_MESH_EDGE_4_1000_300_10000_cuda:0 4829 5030 104 POINT_MESH_EDGE_4_1000_3000_5000_cuda:0 7539 7620 67 POINT_MESH_EDGE_4_1000_3000_10000_cuda:0 12088 12272 42 POINT_MESH_EDGE_8_100_300_5000_cuda:0 3106 3222 161 POINT_MESH_EDGE_8_100_300_10000_cuda:0 8561 8648 59 POINT_MESH_EDGE_8_100_3000_5000_cuda:0 6932 7021 73 POINT_MESH_EDGE_8_100_3000_10000_cuda:0 24032 24176 21 POINT_MESH_EDGE_8_1000_300_5000_cuda:0 5272 5399 95 POINT_MESH_EDGE_8_1000_300_10000_cuda:0 11348 11430 45 POINT_MESH_EDGE_8_1000_3000_5000_cuda:0 17478 17683 29 POINT_MESH_EDGE_8_1000_3000_10000_cuda:0 25961 26236 20 POINT_MESH_EDGE_16_100_300_5000_cuda:0 8244 8323 61 POINT_MESH_EDGE_16_100_300_10000_cuda:0 18018 18071 28 POINT_MESH_EDGE_16_100_3000_5000_cuda:0 19428 19544 26 POINT_MESH_EDGE_16_100_3000_10000_cuda:0 44967 45135 12 POINT_MESH_EDGE_16_1000_300_5000_cuda:0 7825 7937 64 POINT_MESH_EDGE_16_1000_300_10000_cuda:0 18504 18571 28 POINT_MESH_EDGE_16_1000_3000_5000_cuda:0 65805 66132 8 POINT_MESH_EDGE_16_1000_3000_10000_cuda:0 90885 91089 6 -------------------------------------------------------------------------------- Benchmark Avg Time(μs) Peak Time(μs) Iterations -------------------------------------------------------------------------------- POINT_MESH_FACE_4_100_300_5000_cuda:0 1561 1685 321 POINT_MESH_FACE_4_100_300_10000_cuda:0 2818 2954 178 POINT_MESH_FACE_4_100_3000_5000_cuda:0 15893 16018 32 POINT_MESH_FACE_4_100_3000_10000_cuda:0 16350 16439 31 
POINT_MESH_FACE_4_1000_300_5000_cuda:0 3179 3278 158 POINT_MESH_FACE_4_1000_300_10000_cuda:0 2353 2436 213 POINT_MESH_FACE_4_1000_3000_5000_cuda:0 16262 16336 31 POINT_MESH_FACE_4_1000_3000_10000_cuda:0 9334 9448 54 POINT_MESH_FACE_8_100_300_5000_cuda:0 4377 4493 115 POINT_MESH_FACE_8_100_300_10000_cuda:0 9728 9822 52 POINT_MESH_FACE_8_100_3000_5000_cuda:0 26428 26544 19 POINT_MESH_FACE_8_100_3000_10000_cuda:0 42238 43031 12 POINT_MESH_FACE_8_1000_300_5000_cuda:0 3891 3982 129 POINT_MESH_FACE_8_1000_300_10000_cuda:0 5363 5429 94 POINT_MESH_FACE_8_1000_3000_5000_cuda:0 20998 21084 24 POINT_MESH_FACE_8_1000_3000_10000_cuda:0 39711 39897 13 POINT_MESH_FACE_16_100_300_5000_cuda:0 5955 6001 84 POINT_MESH_FACE_16_100_300_10000_cuda:0 12082 12144 42 POINT_MESH_FACE_16_100_3000_5000_cuda:0 44996 45176 12 POINT_MESH_FACE_16_100_3000_10000_cuda:0 73042 73197 7 POINT_MESH_FACE_16_1000_300_5000_cuda:0 8292 8374 61 POINT_MESH_FACE_16_1000_300_10000_cuda:0 19442 19506 26 POINT_MESH_FACE_16_1000_3000_5000_cuda:0 36059 36194 14 POINT_MESH_FACE_16_1000_3000_10000_cuda:0 64644 64822 8 -------------------------------------------------------------------------------- ``` Reviewed By: jcjohnson Differential Revision: D20590462 fbshipit-source-id: 42a39837b514a546ac9471bfaff60eefe7fae829
219 lines
5.1 KiB
Plaintext
219 lines
5.1 KiB
Plaintext
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
|
|
|
|
// This converts dynamic array lookups into static array lookups, for small
|
|
// arrays up to size 32.
|
|
//
|
|
// Suppose we have a small thread-local array:
|
|
//
|
|
// float vals[10];
|
|
//
|
|
// Ideally we should only index this array using static indices:
|
|
//
|
|
// for (int i = 0; i < 10; ++i) vals[i] = i * i;
|
|
//
|
|
// If we do so, then the CUDA compiler may be able to place the array into
|
|
// registers, which can have a big performance improvement. However if we
|
|
// access the array dynamically, then the compiler may force the array into
|
|
// local memory, which has the same latency as global memory.
|
|
//
|
|
// These functions convert dynamic array access into static array access
|
|
// using a brute-force lookup table. It can be used like this:
|
|
//
|
|
// float vals[10];
|
|
// int idx = 3;
|
|
// float val = 3.14f;
|
|
// RegisterIndexUtils<float, 10>::set(vals, idx, val);
|
|
// float val2 = RegisterIndexUtils<float, 10>::get(vals, idx);
|
|
//
|
|
// The implementation is based on fbcuda/RegisterUtils.cuh:
|
|
// https://github.com/facebook/fbcuda/blob/master/RegisterUtils.cuh
|
|
// To avoid depending on the entire library, we just reimplement these two
|
|
// functions. The fbcuda implementation is a bit more sophisticated, and uses
|
|
// the preprocessor to generate switch statements that go up to N for each
|
|
// value of N. We are lazy and just have a giant explicit switch statement.
|
|
//
|
|
// We might be able to use a template metaprogramming approach similar to
|
|
// DispatchKernel1D for this. However DispatchKernel1D is intended to be used
|
|
// for dispatching to the correct CUDA kernel on the host, while this is
|
|
// intended to run on the device. I was concerned that a metaprogramming
|
|
// approach for this might lead to extra function calls at runtime if the
|
|
// compiler fails to optimize them away, which could be very slow on device.
|
|
// However I didn't actually benchmark or test this.
|
|
// Static-index accessors for a small thread-local array of N elements.
//
// Both accessors turn a runtime index into a compile-time one by switching
// over every supported index (0..31), so each actual array access uses a
// constant subscript.
//
// Template parameters:
//   T: element type; must be default-constructible, since get() returns T()
//      for an out-of-range index.
//   N: number of elements in the array; must be at most 32.
template <typename T, int N>
struct RegisterIndexUtils {
  // The switch statements below only enumerate indices 0..31. Without this
  // check, an instantiation with N > 32 would compile but silently return
  // T() from get() and drop writes in set() for indices in [32, N).
  static_assert(
      N <= 32,
      "RegisterIndexUtils only supports arrays of at most 32 elements");

  // Returns arr[idx], or a value-initialized T() if idx is outside [0, N).
  __device__ __forceinline__ static T get(const T arr[N], int idx) {
    // Reject out-of-range indices up front; because of this check the cases
    // >= N in the switch below are never reached.
    if (idx < 0 || idx >= N) {
      return T();
    }
    switch (idx) {
      case 0: return arr[0];
      case 1: return arr[1];
      case 2: return arr[2];
      case 3: return arr[3];
      case 4: return arr[4];
      case 5: return arr[5];
      case 6: return arr[6];
      case 7: return arr[7];
      case 8: return arr[8];
      case 9: return arr[9];
      case 10: return arr[10];
      case 11: return arr[11];
      case 12: return arr[12];
      case 13: return arr[13];
      case 14: return arr[14];
      case 15: return arr[15];
      case 16: return arr[16];
      case 17: return arr[17];
      case 18: return arr[18];
      case 19: return arr[19];
      case 20: return arr[20];
      case 21: return arr[21];
      case 22: return arr[22];
      case 23: return arr[23];
      case 24: return arr[24];
      case 25: return arr[25];
      case 26: return arr[26];
      case 27: return arr[27];
      case 28: return arr[28];
      case 29: return arr[29];
      case 30: return arr[30];
      case 31: return arr[31];
    }
    // Every in-range index is handled by a case above; this return only
    // keeps all control paths returning a value.
    return T();
  }

  // Stores val into arr[idx]. Does nothing if idx is outside [0, N).
  __device__ __forceinline__ static void set(T arr[N], int idx, T val) {
    // Reject out-of-range indices up front; because of this check the cases
    // >= N in the switch below are never reached.
    if (idx < 0 || idx >= N) {
      return;
    }
    switch (idx) {
      case 0: arr[0] = val; break;
      case 1: arr[1] = val; break;
      case 2: arr[2] = val; break;
      case 3: arr[3] = val; break;
      case 4: arr[4] = val; break;
      case 5: arr[5] = val; break;
      case 6: arr[6] = val; break;
      case 7: arr[7] = val; break;
      case 8: arr[8] = val; break;
      case 9: arr[9] = val; break;
      case 10: arr[10] = val; break;
      case 11: arr[11] = val; break;
      case 12: arr[12] = val; break;
      case 13: arr[13] = val; break;
      case 14: arr[14] = val; break;
      case 15: arr[15] = val; break;
      case 16: arr[16] = val; break;
      case 17: arr[17] = val; break;
      case 18: arr[18] = val; break;
      case 19: arr[19] = val; break;
      case 20: arr[20] = val; break;
      case 21: arr[21] = val; break;
      case 22: arr[22] = val; break;
      case 23: arr[23] = val; break;
      case 24: arr[24] = val; break;
      case 25: arr[25] = val; break;
      case 26: arr[26] = val; break;
      case 27: arr[27] = val; break;
      case 28: arr[28] = val; break;
      case 29: arr[29] = val; break;
      case 30: arr[30] = val; break;
      case 31: arr[31] = val; break;
    }
  }
};
|