Georgia Gkioxari 487d4d6607 point mesh distances
Summary:
Implementation of point to mesh distances. The current diff contains two types:
(a) Point to Edge
(b) Point to Face

```

Benchmark                                       Avg Time(μs)      Peak Time(μs) Iterations
--------------------------------------------------------------------------------
POINT_MESH_EDGE_4_100_300_5000_cuda:0                2745            3138            183
POINT_MESH_EDGE_4_100_300_10000_cuda:0               4408            4499            114
POINT_MESH_EDGE_4_100_3000_5000_cuda:0               4978            5070            101
POINT_MESH_EDGE_4_100_3000_10000_cuda:0              9076            9187             56
POINT_MESH_EDGE_4_1000_300_5000_cuda:0               1411            1487            355
POINT_MESH_EDGE_4_1000_300_10000_cuda:0              4829            5030            104
POINT_MESH_EDGE_4_1000_3000_5000_cuda:0              7539            7620             67
POINT_MESH_EDGE_4_1000_3000_10000_cuda:0            12088           12272             42
POINT_MESH_EDGE_8_100_300_5000_cuda:0                3106            3222            161
POINT_MESH_EDGE_8_100_300_10000_cuda:0               8561            8648             59
POINT_MESH_EDGE_8_100_3000_5000_cuda:0               6932            7021             73
POINT_MESH_EDGE_8_100_3000_10000_cuda:0             24032           24176             21
POINT_MESH_EDGE_8_1000_300_5000_cuda:0               5272            5399             95
POINT_MESH_EDGE_8_1000_300_10000_cuda:0             11348           11430             45
POINT_MESH_EDGE_8_1000_3000_5000_cuda:0             17478           17683             29
POINT_MESH_EDGE_8_1000_3000_10000_cuda:0            25961           26236             20
POINT_MESH_EDGE_16_100_300_5000_cuda:0               8244            8323             61
POINT_MESH_EDGE_16_100_300_10000_cuda:0             18018           18071             28
POINT_MESH_EDGE_16_100_3000_5000_cuda:0             19428           19544             26
POINT_MESH_EDGE_16_100_3000_10000_cuda:0            44967           45135             12
POINT_MESH_EDGE_16_1000_300_5000_cuda:0              7825            7937             64
POINT_MESH_EDGE_16_1000_300_10000_cuda:0            18504           18571             28
POINT_MESH_EDGE_16_1000_3000_5000_cuda:0            65805           66132              8
POINT_MESH_EDGE_16_1000_3000_10000_cuda:0           90885           91089              6
--------------------------------------------------------------------------------

Benchmark                                       Avg Time(μs)      Peak Time(μs) Iterations
--------------------------------------------------------------------------------
POINT_MESH_FACE_4_100_300_5000_cuda:0                1561            1685            321
POINT_MESH_FACE_4_100_300_10000_cuda:0               2818            2954            178
POINT_MESH_FACE_4_100_3000_5000_cuda:0              15893           16018             32
POINT_MESH_FACE_4_100_3000_10000_cuda:0             16350           16439             31
POINT_MESH_FACE_4_1000_300_5000_cuda:0               3179            3278            158
POINT_MESH_FACE_4_1000_300_10000_cuda:0              2353            2436            213
POINT_MESH_FACE_4_1000_3000_5000_cuda:0             16262           16336             31
POINT_MESH_FACE_4_1000_3000_10000_cuda:0             9334            9448             54
POINT_MESH_FACE_8_100_300_5000_cuda:0                4377            4493            115
POINT_MESH_FACE_8_100_300_10000_cuda:0               9728            9822             52
POINT_MESH_FACE_8_100_3000_5000_cuda:0              26428           26544             19
POINT_MESH_FACE_8_100_3000_10000_cuda:0             42238           43031             12
POINT_MESH_FACE_8_1000_300_5000_cuda:0               3891            3982            129
POINT_MESH_FACE_8_1000_300_10000_cuda:0              5363            5429             94
POINT_MESH_FACE_8_1000_3000_5000_cuda:0             20998           21084             24
POINT_MESH_FACE_8_1000_3000_10000_cuda:0            39711           39897             13
POINT_MESH_FACE_16_100_300_5000_cuda:0               5955            6001             84
POINT_MESH_FACE_16_100_300_10000_cuda:0             12082           12144             42
POINT_MESH_FACE_16_100_3000_5000_cuda:0             44996           45176             12
POINT_MESH_FACE_16_100_3000_10000_cuda:0            73042           73197              7
POINT_MESH_FACE_16_1000_300_5000_cuda:0              8292            8374             61
POINT_MESH_FACE_16_1000_300_10000_cuda:0            19442           19506             26
POINT_MESH_FACE_16_1000_3000_5000_cuda:0            36059           36194             14
POINT_MESH_FACE_16_1000_3000_10000_cuda:0           64644           64822              8
--------------------------------------------------------------------------------
```

Reviewed By: jcjohnson

Differential Revision: D20590462

fbshipit-source-id: 42a39837b514a546ac9471bfaff60eefe7fae829
2020-04-11 00:21:24 -07:00

160 lines
4.5 KiB
Plaintext

// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#pragma once
#define MINK_H
#include "index_utils.cuh"
// A data structure to keep track of the smallest K keys seen so far as well
// as their associated values, intended to be used in device code.
// This data structure doesn't allocate any memory; keys and values are stored
// in arrays passed to the constructor.
//
// The implementation is generic; it can be used for any key type that supports
// the < operator, and can be used with any value type.
//
// Example usage:
//
// float keys[K];
// int values[K];
// MinK<float, int> mink(keys, values, K);
// for (...) {
// // Produce some key and value from somewhere
// mink.add(key, value);
// }
// mink.sort();
//
// Now keys and values store the smallest K keys seen so far and the values
// associated to these keys:
//
// for (int k = 0; k < K; ++k) {
// float key_k = keys[k];
// int value_k = values[k];
// }
template <typename key_t, typename value_t>
class MinK {
public:
// Constructor.
//
// Arguments:
// keys: Array in which to store keys
// values: Array in which to store values
// K: How many values to keep track of
__device__ MinK(key_t* keys, value_t* vals, int K)
: keys(keys), vals(vals), K(K), _size(0) {}
// Try to add a new key and associated value to the data structure. If the key
// is one of the smallest K seen so far then it will be kept; otherwise it
// it will not be kept.
//
// This takes O(1) operations if the new key is not kept, or if the structure
// currently contains fewer than K elements. Otherwise this takes O(K) time.
//
// Arguments:
// key: The key to add
// val: The value associated to the key
__device__ __forceinline__ void add(const key_t& key, const value_t& val) {
if (_size < K) {
keys[_size] = key;
vals[_size] = val;
if (_size == 0 || key > max_key) {
max_key = key;
max_idx = _size;
}
_size++;
} else if (key < max_key) {
keys[max_idx] = key;
vals[max_idx] = val;
max_key = key;
for (int k = 0; k < K; ++k) {
key_t cur_key = keys[k];
if (cur_key > max_key) {
max_key = cur_key;
max_idx = k;
}
}
}
}
// Get the number of items currently stored in the structure.
// This takes O(1) time.
__device__ __forceinline__ int size() {
return _size;
}
// Sort the items stored in the structure using bubble sort.
// This takes O(K^2) time.
__device__ __forceinline__ void sort() {
for (int i = 0; i < _size - 1; ++i) {
for (int j = 0; j < _size - i - 1; ++j) {
if (keys[j + 1] < keys[j]) {
key_t key = keys[j];
value_t val = vals[j];
keys[j] = keys[j + 1];
vals[j] = vals[j + 1];
keys[j + 1] = key;
vals[j + 1] = val;
}
}
}
}
private:
key_t* keys;
value_t* vals;
int K;
int _size;
key_t max_key;
int max_idx;
};
// This is a version of MinK that only touches the arrays using static indexing
// via RegisterIndexUtils. If the keys and values are stored in thread-local
// arrays, then this may allow the compiler to place them in registers for
// fast access.
//
// This has the same API as RegisterMinK, but doesn't support sorting.
// We found that sorting via RegisterIndexUtils gave very poor performance,
// and suspect it may have prevented the compiler from placing the arrays
// into registers.
template <typename key_t, typename value_t, int K>
class RegisterMinK {
public:
__device__ RegisterMinK(key_t* keys, value_t* vals)
: keys(keys), vals(vals), _size(0) {}
__device__ __forceinline__ void add(const key_t& key, const value_t& val) {
if (_size < K) {
RegisterIndexUtils<key_t, K>::set(keys, _size, key);
RegisterIndexUtils<value_t, K>::set(vals, _size, val);
if (_size == 0 || key > max_key) {
max_key = key;
max_idx = _size;
}
_size++;
} else if (key < max_key) {
RegisterIndexUtils<key_t, K>::set(keys, max_idx, key);
RegisterIndexUtils<value_t, K>::set(vals, max_idx, val);
max_key = key;
for (int k = 0; k < K; ++k) {
key_t cur_key = RegisterIndexUtils<key_t, K>::get(keys, k);
if (cur_key > max_key) {
max_key = cur_key;
max_idx = k;
}
}
}
}
__device__ __forceinline__ int size() {
return _size;
}
private:
key_t* keys;
value_t* vals;
int _size;
key_t max_key;
int max_idx;
};