Mirror of https://github.com/facebookresearch/pytorch3d.git, synced 2025-12-19 05:40:34 +08:00
Farthest point sampling C++
Summary: C++ implementation of iterative farthest point sampling.

Reviewed By: jcjohnson

Differential Revision: D30349887

fbshipit-source-id: d25990f857752633859fe00283e182858a870269
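For context, iterative farthest point sampling greedily builds a subset of K points: at each step it adds the point whose distance to the nearest already-selected point is largest. The snippet below is a minimal illustrative sketch of that loop for a single point cloud in plain PyTorch; the function name and the single-cloud setup are assumptions for illustration, and it is not the C++ kernel added by this commit (it omits batching, heterogeneous lengths, and the random start point that the real op supports).

```python
import torch


def fps_sketch(points: torch.Tensor, K: int) -> torch.Tensor:
    """Illustrative farthest point sampling for a single (P, D) point cloud."""
    P = points.shape[0]
    selected = torch.zeros(K, dtype=torch.int64)
    # Squared distance from every point to its nearest selected point so far.
    min_dists = torch.full((P,), float("inf"))
    # Start from point 0 (the real op can also use a random start point).
    selected[0] = 0
    for i in range(1, K):
        last = points[selected[i - 1]]
        dist_to_last = ((points - last) ** 2).sum(dim=1)
        min_dists = torch.minimum(min_dists, dist_to_last)
        # Pick the point farthest from the currently selected set.
        selected[i] = torch.argmax(min_dists)
    return selected
```

For example, `fps_sketch(torch.randn(100, 3), K=10)` returns the indices of 10 well-spread points.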
Committed by Facebook GitHub Bot
Parent: 3b7d78c7a7
Commit: d9f7611c4b

tests/bm_sample_farthest_points.py (new file, 37 lines)
@@ -0,0 +1,37 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from itertools import product
+
+from fvcore.common.benchmark import benchmark
+from test_sample_farthest_points import TestFPS
+
+
+def bm_fps() -> None:
+    kwargs_list = []
+    backends = ["cpu", "cuda:0"]
+    Ns = [8, 32]
+    Ps = [64, 256]
+    Ds = [3]
+    Ks = [24]
+    test_cases = product(Ns, Ps, Ds, Ks, backends)
+    for case in test_cases:
+        N, P, D, K, d = case
+        kwargs_list.append({"N": N, "P": P, "D": D, "K": K, "device": d})
+
+    benchmark(
+        TestFPS.sample_farthest_points_naive,
+        "FPS_NAIVE_PYTHON",
+        kwargs_list,
+        warmup_iters=1,
+    )
+
+    kwargs_list = [k for k in kwargs_list if k["device"] == "cpu"]
+    benchmark(TestFPS.sample_farthest_points, "FPS_CPU", kwargs_list, warmup_iters=1)
+
+
+if __name__ == "__main__":
+    bm_fps()
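As a usage note: the TestFPS static benchmark helpers (added in the test file below) return closures, and fvcore's benchmark utility times each closure for every kwargs combination in kwargs_list. A minimal way to run the file (an assumption about the working directory, since it imports test_sample_farthest_points as a sibling module) would be:

```python
# Assumed invocation; run from the tests/ directory so that
# test_sample_farthest_points is importable.
from bm_sample_farthest_points import bm_fps

bm_fps()  # prints timings for the FPS_NAIVE_PYTHON and FPS_CPU cases
```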
tests/test_sample_farthest_points.py

@@ -8,13 +8,15 @@ import unittest
 
 import torch
 from common_testing import TestCaseMixin, get_random_cuda_device
-from pytorch3d.ops.sample_farthest_points import sample_farthest_points_naive
+from pytorch3d.ops.sample_farthest_points import (
+    sample_farthest_points_naive,
+    sample_farthest_points,
+)
 from pytorch3d.ops.utils import masked_gather
 
 
 class TestFPS(TestCaseMixin, unittest.TestCase):
-    def test_simple(self):
-        device = get_random_cuda_device()
+    def _test_simple(self, fps_func, device="cpu"):
         # fmt: off
         points = torch.tensor(
             [
@@ -44,7 +46,7 @@ class TestFPS(TestCaseMixin, unittest.TestCase):
         )
         # fmt: on
         expected_inds = torch.tensor([[0, 4], [0, 7]], dtype=torch.int64, device=device)
-        out_points, out_inds = sample_farthest_points_naive(points, K=2)
+        out_points, out_inds = fps_func(points, K=2)
         self.assertClose(out_inds, expected_inds)
 
         # Gather the points
@@ -55,24 +57,37 @@ class TestFPS(TestCaseMixin, unittest.TestCase):
         expected_inds = torch.tensor(
             [[0, 4, 1], [0, 7, -1]], dtype=torch.int64, device=device
         )
-        out_points, out_inds = sample_farthest_points_naive(points, K=[3, 2])
+        out_points, out_inds = fps_func(points, K=[3, 2])
         self.assertClose(out_inds, expected_inds)
 
         # Gather the points
         expected_points = masked_gather(points, expected_inds)
         self.assertClose(out_points, expected_points)
 
-    def test_random_heterogeneous(self):
-        device = get_random_cuda_device()
-        N, P, D, K = 5, 40, 5, 8
-        points = torch.randn((N, P, D), device=device)
-        out_points, out_idxs = sample_farthest_points_naive(points, K=K)
+    def _test_compare_random_heterogeneous(self, device="cpu"):
+        N, P, D, K = 5, 20, 5, 8
+        points = torch.randn((N, P, D), device=device, dtype=torch.float32)
+        out_points_naive, out_idxs_naive = sample_farthest_points_naive(points, K=K)
+        out_points, out_idxs = sample_farthest_points(points, K=K)
         self.assertTrue(out_idxs.min() >= 0)
+        self.assertClose(out_idxs, out_idxs_naive)
+        self.assertClose(out_points, out_points_naive)
         for n in range(N):
             self.assertEqual(out_idxs[n].ne(-1).sum(), K)
 
         # Test case where K > P
         K = 30
+        points1 = torch.randn((N, P, D), dtype=torch.float32, device=device)
+        points2 = points1.clone()
+        points1.requires_grad = True
+        points2.requires_grad = True
         lengths = torch.randint(low=1, high=P, size=(N,), device=device)
-        out_points, out_idxs = sample_farthest_points_naive(points, lengths, K=50)
+        out_points_naive, out_idxs_naive = sample_farthest_points_naive(
+            points1, lengths, K=K
+        )
+        out_points, out_idxs = sample_farthest_points(points2, lengths, K=K)
+        self.assertClose(out_idxs, out_idxs_naive)
+        self.assertClose(out_points, out_points_naive)
 
         for n in range(N):
             # Check that for heterogeneous batches, the max number of
@@ -85,8 +100,15 @@ class TestFPS(TestCaseMixin, unittest.TestCase):
             vals, counts = torch.unique(out_idxs[n][val_mask], return_counts=True)
             self.assertTrue(counts.le(1).all())
 
-    def test_errors(self):
-        device = get_random_cuda_device()
+        # Check gradients
+        grad_sampled_points = torch.ones((N, K, D), dtype=torch.float32, device=device)
+        loss1 = (out_points_naive * grad_sampled_points).sum()
+        loss1.backward()
+        loss2 = (out_points * grad_sampled_points).sum()
+        loss2.backward()
+        self.assertClose(points1.grad, points2.grad, atol=5e-6)
+
+    def _test_errors(self, fps_func, device="cpu"):
         N, P, D, K = 5, 40, 5, 8
         points = torch.randn((N, P, D), device=device)
         wrong_batch_dim = torch.randint(low=1, high=K, size=(K,), device=device)
@@ -99,8 +121,7 @@ class TestFPS(TestCaseMixin, unittest.TestCase):
         with self.assertRaisesRegex(ValueError, "points and lengths must have"):
             sample_farthest_points_naive(points, lengths=wrong_batch_dim, K=K)
 
-    def test_random_start(self):
-        device = get_random_cuda_device()
+    def _test_random_start(self, fps_func, device="cpu"):
         N, P, D, K = 5, 40, 5, 8
         points = torch.randn((N, P, D), device=device)
         out_points, out_idxs = sample_farthest_points_naive(
@@ -109,3 +130,64 @@ class TestFPS(TestCaseMixin, unittest.TestCase):
         # Check the first index is not 0 for all batch elements
         # when random_start_point = True
         self.assertTrue(out_idxs[:, 0].sum() > 0)
+
+    def _test_gradcheck(self, fps_func, device="cpu"):
+        N, P, D, K = 2, 5, 3, 2
+        points = torch.randn(
+            (N, P, D), dtype=torch.float32, device=device, requires_grad=True
+        )
+        torch.autograd.gradcheck(
+            fps_func,
+            (points, None, K),
+            check_undefined_grad=False,
+            eps=2e-3,
+            atol=0.001,
+        )
+
+    def test_sample_farthest_points_naive(self):
+        device = get_random_cuda_device()
+        self._test_simple(sample_farthest_points_naive, device)
+        self._test_errors(sample_farthest_points_naive, device)
+        self._test_random_start(sample_farthest_points_naive, device)
+        self._test_gradcheck(sample_farthest_points_naive, device)
+
+    def test_sample_farthest_points_cpu(self):
+        self._test_simple(sample_farthest_points, "cpu")
+        self._test_errors(sample_farthest_points, "cpu")
+        self._test_compare_random_heterogeneous("cpu")
+        self._test_random_start(sample_farthest_points, "cpu")
+        self._test_gradcheck(sample_farthest_points, "cpu")
+
+    @staticmethod
+    def sample_farthest_points_naive(N: int, P: int, D: int, K: int, device: str):
+        device = torch.device(device)
+        pts = torch.randn(
+            N, P, D, dtype=torch.float32, device=device, requires_grad=True
+        )
+        grad_pts = torch.randn(N, K, D, dtype=torch.float32, device=device)
+        torch.cuda.synchronize()
+
+        def output():
+            out_points, _ = sample_farthest_points_naive(pts, K=K)
+            loss = (out_points * grad_pts).sum()
+            loss.backward()
+            torch.cuda.synchronize()
+
+        return output
+
+    @staticmethod
+    def sample_farthest_points(N: int, P: int, D: int, K: int, device: str):
+        device = torch.device(device)
+        pts = torch.randn(
+            N, P, D, dtype=torch.float32, device=device, requires_grad=True
+        )
+        grad_pts = torch.randn(N, K, D, dtype=torch.float32, device=device)
+        torch.cuda.synchronize()
+
+        def output():
+            out_points, _ = sample_farthest_points(pts, K=K)
+            loss = (out_points * grad_pts).sum()
+            loss.backward()
+            torch.cuda.synchronize()
+
+        return output
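Taken together, the tests show roughly how the new op is exercised. The snippet below is a minimal usage sketch with shapes and keyword names (`lengths`, `K`) taken from the tests above; the comments on the return values are inferred from the assertions (the `masked_gather` call and the `-1` padding checks), not from the op's documentation.

```python
import torch
from pytorch3d.ops.sample_farthest_points import (
    sample_farthest_points,
    sample_farthest_points_naive,
)

N, P, D, K = 5, 20, 5, 8
points = torch.randn((N, P, D), dtype=torch.float32)  # batch of point clouds
lengths = torch.randint(low=1, high=P, size=(N,))     # valid points per cloud

# The new C++ CPU path and the naive Python path are expected to agree.
out_points, out_idxs = sample_farthest_points(points, lengths, K=K)
out_points_naive, out_idxs_naive = sample_farthest_points_naive(points, lengths, K=K)

# out_points: (N, K, D) sampled points; out_idxs: (N, K) selected indices,
# padded with -1 where a cloud has fewer than K valid points.
```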