move benchmarks to separate directory

Summary: Move benchmarks to a separate directory as tests/ is getting big. Reviewed By: nikhilaravi Differential Revision: D32885462 fbshipit-source-id: a832662a494ee341ab77d95493c95b0af0a83f43
2026-02-06 05:52:17 +08:00 · 2021-12-07 10:22:17 -08:00
parent a6508ac3df
commit a0e2d2e3c3
43 changed files with 0 additions and 0 deletions
--- a/tests/benchmarks/__init__.py
+++ b/tests/benchmarks/__init__.py
--- a/tests/benchmarks/bm_acos_linear_extrapolation.py
+++ b/tests/benchmarks/bm_acos_linear_extrapolation.py
@@ -0,0 +1,27 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fvcore.common.benchmark import benchmark
+from test_acos_linear_extrapolation import TestAcosLinearExtrapolation
+
+
+def bm_acos_linear_extrapolation() -> None:
+    kwargs_list = [
+        {"batch_size": 1},
+        {"batch_size": 100},
+        {"batch_size": 10000},
+        {"batch_size": 1000000},
+    ]
+    benchmark(
+        TestAcosLinearExtrapolation.acos_linear_extrapolation,
+        "ACOS_LINEAR_EXTRAPOLATION",
+        kwargs_list,
+        warmup_iters=1,
+    )
+
+
+if __name__ == "__main__":
+    bm_acos_linear_extrapolation()
--- a/tests/benchmarks/bm_ball_query.py
+++ b/tests/benchmarks/bm_ball_query.py
@@ -0,0 +1,40 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from itertools import product
+
+from fvcore.common.benchmark import benchmark
+from test_ball_query import TestBallQuery
+
+
+def bm_ball_query() -> None:
+
+    backends = ["cpu", "cuda:0"]
+
+    kwargs_list = []
+    Ns = [32]
+    P1s = [256]
+    P2s = [128, 512]
+    Ds = [3, 10]
+    Ks = [3, 24, 100]
+    Rs = [0.1, 0.2, 5]
+    test_cases = product(Ns, P1s, P2s, Ds, Ks, Rs, backends)
+    for case in test_cases:
+        N, P1, P2, D, K, R, b = case
+        kwargs_list.append(
+            {"N": N, "P1": P1, "P2": P2, "D": D, "K": K, "radius": R, "device": b}
+        )
+
+    benchmark(
+        TestBallQuery.ball_query_square, "BALLQUERY_SQUARE", kwargs_list, warmup_iters=1
+    )
+    benchmark(
+        TestBallQuery.ball_query_ragged, "BALLQUERY_RAGGED", kwargs_list, warmup_iters=1
+    )
+
+
+if __name__ == "__main__":
+    bm_ball_query()
--- a/tests/benchmarks/bm_barycentric_clipping.py
+++ b/tests/benchmarks/bm_barycentric_clipping.py
@@ -0,0 +1,120 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from itertools import product
+
+import torch
+from fvcore.common.benchmark import benchmark
+from pytorch3d.renderer.cameras import FoVPerspectiveCameras, look_at_view_transform
+from pytorch3d.renderer.mesh.rasterizer import (
+    Fragments,
+    MeshRasterizer,
+    RasterizationSettings,
+)
+from pytorch3d.renderer.mesh.utils import (
+    _clip_barycentric_coordinates,
+    _interpolate_zbuf,
+)
+from pytorch3d.utils.ico_sphere import ico_sphere
+
+
+def baryclip_cuda(
+    num_meshes: int = 8,
+    ico_level: int = 5,
+    image_size: int = 64,
+    faces_per_pixel: int = 50,
+    device="cuda",
+):
+    # Init meshes
+    sphere_meshes = ico_sphere(ico_level, device).extend(num_meshes)
+    # Init transform
+    R, T = look_at_view_transform(1.0, 0.0, 0.0)
+    cameras = FoVPerspectiveCameras(device=device, R=R, T=T)
+    # Init rasterizer
+    raster_settings = RasterizationSettings(
+        image_size=image_size,
+        blur_radius=1e-4,
+        faces_per_pixel=faces_per_pixel,
+        clip_barycentric_coords=True,
+    )
+    rasterizer = MeshRasterizer(cameras=cameras, raster_settings=raster_settings)
+
+    torch.cuda.synchronize()
+
+    def raster_fn():
+        rasterizer(sphere_meshes)
+        torch.cuda.synchronize()
+
+    return raster_fn
+
+
+def baryclip_pytorch(
+    num_meshes: int = 8,
+    ico_level: int = 5,
+    image_size: int = 64,
+    faces_per_pixel: int = 50,
+    device="cuda",
+):
+    # Init meshes
+    sphere_meshes = ico_sphere(ico_level, device).extend(num_meshes)
+    # Init transform
+    R, T = look_at_view_transform(1.0, 0.0, 0.0)
+    cameras = FoVPerspectiveCameras(device=device, R=R, T=T)
+    # Init rasterizer
+    raster_settings = RasterizationSettings(
+        image_size=image_size,
+        blur_radius=1e-4,
+        faces_per_pixel=faces_per_pixel,
+        clip_barycentric_coords=False,
+    )
+    rasterizer = MeshRasterizer(cameras=cameras, raster_settings=raster_settings)
+
+    torch.cuda.synchronize()
+
+    def raster_fn():
+        fragments = rasterizer(sphere_meshes)
+
+        # Clip bary and reinterpolate
+        clipped_bary_coords = _clip_barycentric_coordinates(fragments.bary_coords)
+        clipped_zbuf = _interpolate_zbuf(
+            fragments.pix_to_face, clipped_bary_coords, sphere_meshes
+        )
+        fragments = Fragments(
+            bary_coords=clipped_bary_coords,
+            zbuf=clipped_zbuf,
+            dists=fragments.dists,
+            pix_to_face=fragments.pix_to_face,
+        )
+        torch.cuda.synchronize()
+
+    return raster_fn
+
+
+def bm_barycentric_clip() -> None:
+    if torch.cuda.is_available():
+        kwargs_list = []
+        num_meshes = [1, 8]
+        ico_level = [0, 4]
+        image_size = [64, 128, 256]
+        faces_per_pixel = [10, 75, 100]
+        test_cases = product(num_meshes, ico_level, image_size, faces_per_pixel)
+        for case in test_cases:
+            n, ic, im, nf = case
+            kwargs_list.append(
+                {
+                    "num_meshes": n,
+                    "ico_level": ic,
+                    "image_size": im,
+                    "faces_per_pixel": nf,
+                }
+            )
+
+        benchmark(baryclip_cuda, "BARY_CLIP_CUDA", kwargs_list, warmup_iters=1)
+        benchmark(baryclip_pytorch, "BARY_CLIP_PYTORCH", kwargs_list, warmup_iters=1)
+
+
+if __name__ == "__main__":
+    bm_barycentric_clip()
--- a/tests/benchmarks/bm_blending.py
+++ b/tests/benchmarks/bm_blending.py
@@ -0,0 +1,52 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from itertools import product
+
+from fvcore.common.benchmark import benchmark
+from test_blending import TestBlending
+
+
+def bm_blending() -> None:
+    devices = ["cuda"]
+    kwargs_list = []
+    num_meshes = [8]
+    image_size = [64, 128, 256]
+    faces_per_pixel = [50, 100]
+    backend = ["pytorch", "custom"]
+    test_cases = product(num_meshes, image_size, faces_per_pixel, devices, backend)
+
+    for case in test_cases:
+        n, s, k, d, b = case
+        kwargs_list.append(
+            {
+                "num_meshes": n,
+                "image_size": s,
+                "faces_per_pixel": k,
+                "device": d,
+                "backend": b,
+            }
+        )
+
+    benchmark(
+        TestBlending.bm_sigmoid_alpha_blending,
+        "SIGMOID_ALPHA_BLENDING_PYTORCH",
+        kwargs_list,
+        warmup_iters=1,
+    )
+
+    kwargs_list = [case for case in kwargs_list if case["backend"] == "pytorch"]
+    benchmark(
+        TestBlending.bm_softmax_blending,
+        "SOFTMAX_BLENDING_PYTORCH",
+        kwargs_list,
+        warmup_iters=1,
+    )
+
+
+if __name__ == "__main__":
+    bm_blending()
--- a/tests/benchmarks/bm_cameras_alignment.py
+++ b/tests/benchmarks/bm_cameras_alignment.py
@@ -0,0 +1,32 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import itertools
+
+from fvcore.common.benchmark import benchmark
+from test_cameras_alignment import TestCamerasAlignment
+
+
+def bm_cameras_alignment() -> None:
+
+    case_grid = {
+        "batch_size": [10, 100, 1000],
+        "mode": ["centers", "extrinsics"],
+        "estimate_scale": [False, True],
+    }
+    test_cases = itertools.product(*case_grid.values())
+    kwargs_list = [dict(zip(case_grid.keys(), case)) for case in test_cases]
+
+    benchmark(
+        TestCamerasAlignment.corresponding_cameras_alignment,
+        "CORRESPONDING_CAMERAS_ALIGNMENT",
+        kwargs_list,
+        warmup_iters=1,
+    )
+
+
+if __name__ == "__main__":
+    bm_cameras_alignment()
--- a/tests/benchmarks/bm_chamfer.py
+++ b/tests/benchmarks/bm_chamfer.py
@@ -0,0 +1,65 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from itertools import product
+
+import torch
+from fvcore.common.benchmark import benchmark
+from test_chamfer import TestChamfer
+
+
+def bm_chamfer() -> None:
+    # Currently disabled.
+    return
+    devices = ["cpu"]
+    if torch.cuda.is_available():
+        devices.append("cuda:0")
+
+    kwargs_list_naive = []
+    batch_size = [1, 32]
+    return_normals = [True, False]
+    test_cases = product(batch_size, return_normals, devices)
+
+    for case in test_cases:
+        b, n, d = case
+        kwargs_list_naive.append(
+            {"batch_size": b, "P1": 32, "P2": 64, "return_normals": n, "device": d}
+        )
+
+    benchmark(
+        TestChamfer.chamfer_naive_with_init,
+        "CHAMFER_NAIVE",
+        kwargs_list_naive,
+        warmup_iters=1,
+    )
+
+    if torch.cuda.is_available():
+        device = "cuda:0"
+        kwargs_list = []
+        batch_size = [1, 32]
+        P1 = [32, 1000, 10000]
+        P2 = [64, 3000, 30000]
+        return_normals = [True, False]
+        homogeneous = [True, False]
+        test_cases = product(batch_size, P1, P2, return_normals, homogeneous)
+
+        for case in test_cases:
+            b, p1, p2, n, h = case
+            kwargs_list.append(
+                {
+                    "batch_size": b,
+                    "P1": p1,
+                    "P2": p2,
+                    "return_normals": n,
+                    "homogeneous": h,
+                    "device": device,
+                }
+            )
+        benchmark(TestChamfer.chamfer_with_init, "CHAMFER", kwargs_list, warmup_iters=1)
+
+
+if __name__ == "__main__":
+    bm_chamfer()
--- a/tests/benchmarks/bm_cubify.py
+++ b/tests/benchmarks/bm_cubify.py
@@ -0,0 +1,21 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fvcore.common.benchmark import benchmark
+from test_cubify import TestCubify
+
+
+def bm_cubify() -> None:
+    kwargs_list = [
+        {"batch_size": 32, "V": 16},
+        {"batch_size": 64, "V": 16},
+        {"batch_size": 16, "V": 32},
+    ]
+    benchmark(TestCubify.cubify_with_init, "CUBIFY", kwargs_list, warmup_iters=1)
+
+
+if __name__ == "__main__":
+    bm_cubify()
--- a/tests/benchmarks/bm_face_areas_normals.py
+++ b/tests/benchmarks/bm_face_areas_normals.py
@@ -0,0 +1,47 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from itertools import product
+
+import torch
+from fvcore.common.benchmark import benchmark
+from test_face_areas_normals import TestFaceAreasNormals
+
+
+def bm_face_areas_normals() -> None:
+    kwargs_list = []
+    backend = ["cpu"]
+    if torch.cuda.is_available():
+        backend.append("cuda:0")
+
+    num_meshes = [2, 10, 32]
+    num_verts = [100, 1000]
+    num_faces = [300, 3000]
+
+    test_cases = product(num_meshes, num_verts, num_faces, backend)
+    for case in test_cases:
+        n, v, f, d = case
+        kwargs_list.append(
+            {"num_meshes": n, "num_verts": v, "num_faces": f, "device": d}
+        )
+    benchmark(
+        TestFaceAreasNormals.face_areas_normals_with_init,
+        "FACE_AREAS_NORMALS",
+        kwargs_list,
+        warmup_iters=1,
+    )
+
+    benchmark(
+        TestFaceAreasNormals.face_areas_normals_with_init_torch,
+        "FACE_AREAS_NORMALS_TORCH",
+        kwargs_list,
+        warmup_iters=1,
+    )
+
+
+if __name__ == "__main__":
+    bm_face_areas_normals()
--- a/tests/benchmarks/bm_graph_conv.py
+++ b/tests/benchmarks/bm_graph_conv.py
@@ -0,0 +1,50 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from itertools import product
+
+import torch
+from fvcore.common.benchmark import benchmark
+from test_graph_conv import TestGraphConv
+
+
+def bm_graph_conv() -> None:
+    backends = ["cpu"]
+    if torch.cuda.is_available():
+        backends.append("cuda")
+
+    kwargs_list = []
+    gconv_dim = [128, 256]
+    num_meshes = [32, 64]
+    num_verts = [100]
+    num_faces = [1000]
+    directed = [False, True]
+    test_cases = product(
+        gconv_dim, num_meshes, num_verts, num_faces, directed, backends
+    )
+    for case in test_cases:
+        g, n, v, f, d, b = case
+        kwargs_list.append(
+            {
+                "gconv_dim": g,
+                "num_meshes": n,
+                "num_verts": v,
+                "num_faces": f,
+                "directed": d,
+                "backend": b,
+            }
+        )
+    benchmark(
+        TestGraphConv.graph_conv_forward_backward,
+        "GRAPH CONV",
+        kwargs_list,
+        warmup_iters=1,
+    )
+
+
+if __name__ == "__main__":
+    bm_graph_conv()
--- a/tests/benchmarks/bm_interpolate_face_attributes.py
+++ b/tests/benchmarks/bm_interpolate_face_attributes.py
@@ -0,0 +1,84 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from itertools import product
+
+import torch
+from fvcore.common.benchmark import benchmark
+from pytorch3d.ops.interp_face_attrs import (
+    interpolate_face_attributes,
+    interpolate_face_attributes_python,
+)
+
+
+def _generate_data(N, S, K, F, D, device, requires_grad=False):
+    pix_to_face = torch.randint(-10, F, (N, S, S, K), device=device)
+    barycentric_coords = torch.randn(
+        N, S, S, K, 3, device=device, requires_grad=requires_grad
+    )
+    face_attrs = torch.randn(F, 3, D, device=device, requires_grad=requires_grad)
+    grad_pix_attrs = torch.randn(N, S, S, K, D, device=device)
+    return pix_to_face, barycentric_coords, face_attrs, grad_pix_attrs
+
+
+def _bm_forward(N, S, F, K, D, impl):
+    # The runtime depends on the values of pix_to_face. So for proper
+    # benchmarking we should probably take the average of multiple
+    # values of pix to face. But this doesn't easily fit into fvcore
+    # benchmarking, so instead we'll just set a manual seed to make sure
+    # that different impls will use the same data.
+    torch.manual_seed(0)
+    device = torch.device("cuda")
+    data = _generate_data(N, S, K, F, D, device, requires_grad=False)
+    args = data[:3]
+    torch.cuda.synchronize()
+    if impl == "cuda":
+        fun = interpolate_face_attributes
+    elif impl == "python":
+        fun = interpolate_face_attributes_python
+    return lambda: fun(*args)
+
+
+def _bm_forward_backward(N, S, F, K, D, impl):
+    torch.manual_seed(0)
+    device = torch.device("cuda")
+    data = _generate_data(N, S, K, F, D, device, requires_grad=True)
+    args, grad = data[:3], data[3]
+    torch.cuda.synchronize()
+    if impl == "cuda":
+        fun = interpolate_face_attributes
+    elif impl == "python":
+        fun = interpolate_face_attributes_python
+
+    def run():
+        out = fun(*args)
+        out.backward(gradient=grad)
+
+    return run
+
+
+def bm_interpolate_face_attribues() -> None:
+    # For now only benchmark on GPU
+    if not torch.cuda.is_available():
+        return
+
+    Ns = [1, 4]
+    Ss = [128]
+    Ks = [1, 10, 40]
+    Fs = [5000]
+    Ds = [1, 3, 16]
+    impls = ["python", "cuda"]
+    test_cases = product(Ns, Ss, Ks, Fs, Ds, impls)
+    kwargs_list = []
+    for case in test_cases:
+        N, S, K, F, D, impl = case
+        kwargs_list.append({"N": N, "S": S, "K": K, "F": F, "D": D, "impl": impl})
+    benchmark(_bm_forward, "FORWARD", kwargs_list, warmup_iters=3)
+    benchmark(_bm_forward_backward, "FORWARD+BACKWARD", kwargs_list, warmup_iters=3)
+
+
+if __name__ == "__main__":
+    bm_interpolate_face_attribues()
--- a/tests/benchmarks/bm_iou_box3d.py
+++ b/tests/benchmarks/bm_iou_box3d.py
@@ -0,0 +1,54 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from itertools import product
+
+from fvcore.common.benchmark import benchmark
+from test_iou_box3d import TestIoU3D
+
+
+def bm_iou_box3d() -> None:
+    # Realistic use cases
+    N = [30, 100]
+    M = [5, 10, 100]
+    kwargs_list = []
+    test_cases = product(N, M)
+    for case in test_cases:
+        n, m = case
+        kwargs_list.append({"N": n, "M": m, "device": "cuda:0"})
+    benchmark(TestIoU3D.iou, "3D_IOU", kwargs_list, warmup_iters=1)
+
+    # Comparison of C++/CUDA
+    kwargs_list = []
+    N = [1, 4, 8, 16]
+    devices = ["cpu", "cuda:0"]
+    test_cases = product(N, N, devices)
+    for case in test_cases:
+        n, m, d = case
+        kwargs_list.append({"N": n, "M": m, "device": d})
+    benchmark(TestIoU3D.iou, "3D_IOU", kwargs_list, warmup_iters=1)
+
+    # Naive PyTorch
+    N = [1, 4]
+    kwargs_list = []
+    test_cases = product(N, N)
+    for case in test_cases:
+        n, m = case
+        kwargs_list.append({"N": n, "M": m, "device": "cuda:0"})
+    benchmark(TestIoU3D.iou_naive, "3D_IOU_NAIVE", kwargs_list, warmup_iters=1)
+
+    # Sampling based method
+    num_samples = [2000, 5000]
+    kwargs_list = []
+    test_cases = product(N, N, num_samples)
+    for case in test_cases:
+        n, m, s = case
+        kwargs_list.append({"N": n, "M": m, "num_samples": s, "device": "cuda:0"})
+    benchmark(TestIoU3D.iou_sampling, "3D_IOU_SAMPLING", kwargs_list, warmup_iters=1)
+
+
+if __name__ == "__main__":
+    bm_iou_box3d()
--- a/tests/benchmarks/bm_knn.py
+++ b/tests/benchmarks/bm_knn.py
@@ -0,0 +1,34 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from itertools import product
+
+from fvcore.common.benchmark import benchmark
+from test_knn import TestKNN
+
+
+def bm_knn() -> None:
+
+    backends = ["cpu", "cuda:0"]
+
+    kwargs_list = []
+    Ns = [32]
+    P1s = [256]
+    P2s = [128, 512]
+    Ds = [3]
+    Ks = [24]
+    test_cases = product(Ns, P1s, P2s, Ds, Ks, backends)
+    for case in test_cases:
+        N, P1, P2, D, K, b = case
+        kwargs_list.append({"N": N, "P1": P1, "P2": P2, "D": D, "K": K, "device": b})
+
+    benchmark(TestKNN.knn_square, "KNN_SQUARE", kwargs_list, warmup_iters=1)
+
+    benchmark(TestKNN.knn_ragged, "KNN_RAGGED", kwargs_list, warmup_iters=1)
+
+
+if __name__ == "__main__":
+    bm_knn()
--- a/tests/benchmarks/bm_lighting.py
+++ b/tests/benchmarks/bm_lighting.py
@@ -0,0 +1,55 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from itertools import product
+
+import torch
+from fvcore.common.benchmark import benchmark
+from pytorch3d.renderer.lighting import diffuse, specular
+
+
+def _bm_diffuse_cuda_with_init(N, S, K):
+    device = torch.device("cuda")
+    normals = torch.randn(N, S, S, K, 3, device=device)
+    color = torch.randn(1, 3, device=device)
+    direction = torch.randn(N, S, S, K, 3, device=device)
+    args = (normals, color, direction)
+    torch.cuda.synchronize()
+    return lambda: diffuse(*args)
+
+
+def _bm_specular_cuda_with_init(N, S, K):
+    device = torch.device("cuda")
+    points = torch.randn(N, S, S, K, 3, device=device)
+    normals = torch.randn(N, S, S, K, 3, device=device)
+    direction = torch.randn(N, S, S, K, 3, device=device)
+    color = torch.randn(1, 3, device=device)
+    camera_position = torch.randn(N, 3, device=device)
+    shininess = torch.randn(N, device=device)
+    args = (points, normals, direction, color, camera_position, shininess)
+    torch.cuda.synchronize()
+    return lambda: specular(*args)
+
+
+def bm_lighting() -> None:
+    # For now only benchmark lighting on GPU
+    if not torch.cuda.is_available():
+        return
+
+    kwargs_list = []
+    Ns = [1, 8]
+    Ss = [128, 256]
+    Ks = [1, 10, 80]
+    test_cases = product(Ns, Ss, Ks)
+    for case in test_cases:
+        N, S, K = case
+        kwargs_list.append({"N": N, "S": S, "K": K})
+    benchmark(_bm_diffuse_cuda_with_init, "DIFFUSE", kwargs_list, warmup_iters=3)
+    benchmark(_bm_specular_cuda_with_init, "SPECULAR", kwargs_list, warmup_iters=3)
+
+
+if __name__ == "__main__":
+    bm_lighting()
--- a/tests/benchmarks/bm_main.py
+++ b/tests/benchmarks/bm_main.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import glob
+import os
+import subprocess
+import sys
+from os.path import dirname, isfile, join
+
+
+if __name__ == "__main__":
+    # pyre-ignore[16]
+    if len(sys.argv) > 1:
+        # Parse from flags.
+        # pyre-ignore[16]
+        file_names = [
+            join(dirname(__file__), n) for n in sys.argv if n.startswith("bm_")
+        ]
+    else:
+        # Get all the benchmark files (starting with "bm_").
+        bm_files = glob.glob(join(dirname(__file__), "bm_*.py"))
+        file_names = sorted(
+            f for f in bm_files if isfile(f) and not f.endswith("bm_main.py")
+        )
+
+    # Forward all important path information to the subprocesses through the
+    # environment.
+    os.environ["PATH"] = sys.path[0] + ":" + os.environ.get("PATH", "")
+    os.environ["LD_LIBRARY_PATH"] = (
+        sys.path[0] + ":" + os.environ.get("LD_LIBRARY_PATH", "")
+    )
+    os.environ["PYTHONPATH"] = ":".join(sys.path)
+    for file_name in file_names:
+        subprocess.check_call([sys.executable, file_name])
--- a/tests/benchmarks/bm_marching_cubes.py
+++ b/tests/benchmarks/bm_marching_cubes.py
@@ -0,0 +1,29 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fvcore.common.benchmark import benchmark
+from test_marching_cubes import TestMarchingCubes
+
+
+def bm_marching_cubes() -> None:
+    kwargs_list = [
+        {"batch_size": 1, "V": 5},
+        {"batch_size": 1, "V": 10},
+        {"batch_size": 1, "V": 20},
+        {"batch_size": 1, "V": 40},
+        {"batch_size": 5, "V": 5},
+        {"batch_size": 20, "V": 20},
+    ]
+    benchmark(
+        TestMarchingCubes.marching_cubes_with_init,
+        "MARCHING_CUBES",
+        kwargs_list,
+        warmup_iters=1,
+    )
+
+
+if __name__ == "__main__":
+    bm_marching_cubes()
--- a/tests/benchmarks/bm_mesh_edge_loss.py
+++ b/tests/benchmarks/bm_mesh_edge_loss.py
@@ -0,0 +1,29 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from itertools import product
+
+from fvcore.common.benchmark import benchmark
+from test_mesh_edge_loss import TestMeshEdgeLoss
+
+
+def bm_mesh_edge_loss() -> None:
+    kwargs_list = []
+    num_meshes = [1, 16, 32]
+    max_v = [100, 10000]
+    max_f = [300, 30000]
+    test_cases = product(num_meshes, max_v, max_f)
+    for case in test_cases:
+        n, v, f = case
+        kwargs_list.append({"num_meshes": n, "max_v": v, "max_f": f})
+    benchmark(
+        TestMeshEdgeLoss.mesh_edge_loss, "MESH_EDGE_LOSS", kwargs_list, warmup_iters=1
+    )
+
+
+if __name__ == "__main__":
+    bm_mesh_edge_loss()
--- a/tests/benchmarks/bm_mesh_io.py
+++ b/tests/benchmarks/bm_mesh_io.py
@@ -0,0 +1,105 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from itertools import product
+
+from fvcore.common.benchmark import benchmark
+from test_io_obj import TestMeshObjIO
+from test_io_ply import TestMeshPlyIO
+
+
+def bm_save_load() -> None:
+    simple_kwargs_list = [
+        {"V": 100, "F": 200},
+        {"V": 1000, "F": 2000},
+        {"V": 10000, "F": 20000},
+    ]
+    benchmark(
+        TestMeshObjIO.bm_load_simple_obj_with_init,
+        "LOAD_SIMPLE_OBJ",
+        simple_kwargs_list,
+        warmup_iters=1,
+    )
+    benchmark(
+        TestMeshObjIO.bm_save_simple_obj_with_init,
+        "SAVE_SIMPLE_OBJ",
+        simple_kwargs_list,
+        warmup_iters=1,
+    )
+    benchmark(
+        TestMeshPlyIO.bm_load_simple_ply_with_init,
+        "LOAD_SIMPLE_PLY",
+        simple_kwargs_list,
+        warmup_iters=1,
+    )
+    benchmark(
+        TestMeshPlyIO.bm_save_simple_ply_with_init,
+        "SAVE_SIMPLE_PLY",
+        simple_kwargs_list,
+        warmup_iters=1,
+    )
+
+    complex_kwargs_list = [{"N": 8}, {"N": 32}, {"N": 128}]
+    benchmark(
+        TestMeshObjIO.bm_load_complex_obj,
+        "LOAD_COMPLEX_OBJ",
+        complex_kwargs_list,
+        warmup_iters=1,
+    )
+    benchmark(
+        TestMeshObjIO.bm_save_complex_obj,
+        "SAVE_COMPLEX_OBJ",
+        complex_kwargs_list,
+        warmup_iters=1,
+    )
+    benchmark(
+        TestMeshPlyIO.bm_load_complex_ply,
+        "LOAD_COMPLEX_PLY",
+        complex_kwargs_list,
+        warmup_iters=1,
+    )
+    benchmark(
+        TestMeshPlyIO.bm_save_complex_ply,
+        "SAVE_COMPLEX_PLY",
+        complex_kwargs_list,
+        warmup_iters=1,
+    )
+
+    # Texture loading benchmarks
+    kwargs_list = [{"R": 2}, {"R": 4}, {"R": 10}, {"R": 15}, {"R": 20}]
+    benchmark(
+        TestMeshObjIO.bm_load_texture_atlas,
+        "PYTORCH3D_TEXTURE_ATLAS",
+        kwargs_list,
+        warmup_iters=1,
+    )
+
+    kwargs_list = []
+    S = [64, 256, 1024]
+    F = [100, 1000, 10000]
+    R = [5, 10, 20]
+    test_cases = product(S, F, R)
+
+    for case in test_cases:
+        s, f, r = case
+        kwargs_list.append({"S": s, "F": f, "R": r})
+
+    benchmark(
+        TestMeshObjIO.bm_bilinear_sampling_vectorized,
+        "BILINEAR_VECTORIZED",
+        kwargs_list,
+        warmup_iters=1,
+    )
+    benchmark(
+        TestMeshObjIO.bm_bilinear_sampling_grid_sample,
+        "BILINEAR_GRID_SAMPLE",
+        kwargs_list,
+        warmup_iters=1,
+    )
+
+
+if __name__ == "__main__":
+    bm_save_load()
--- a/tests/benchmarks/bm_mesh_laplacian_smoothing.py
+++ b/tests/benchmarks/bm_mesh_laplacian_smoothing.py
@@ -0,0 +1,40 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from itertools import product
+
+import torch
+from fvcore.common.benchmark import benchmark
+from test_mesh_laplacian_smoothing import TestLaplacianSmoothing
+
+
+def bm_mesh_laplacian_smoothing() -> None:
+    devices = ["cpu"]
+    if torch.cuda.is_available():
+        devices.append("cuda")
+
+    kwargs_list = []
+    num_meshes = [2, 10, 32]
+    num_verts = [100, 1000]
+    num_faces = [300, 3000]
+    test_cases = product(num_meshes, num_verts, num_faces, devices)
+    for case in test_cases:
+        n, v, f, d = case
+        kwargs_list.append(
+            {"num_meshes": n, "num_verts": v, "num_faces": f, "device": d}
+        )
+
+    benchmark(
+        TestLaplacianSmoothing.laplacian_smoothing_with_init,
+        "MESH_LAPLACIAN_SMOOTHING",
+        kwargs_list,
+        warmup_iters=1,
+    )
+
+
+if __name__ == "__main__":
+    bm_mesh_laplacian_smoothing()
--- a/tests/benchmarks/bm_mesh_normal_consistency.py
+++ b/tests/benchmarks/bm_mesh_normal_consistency.py
@@ -0,0 +1,37 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from itertools import product
+
+import torch
+from fvcore.common.benchmark import benchmark
+from test_mesh_normal_consistency import TestMeshNormalConsistency
+
+
+def bm_mesh_normal_consistency() -> None:
+    devices = ["cpu"]
+    if torch.cuda.is_available():
+        devices.append("cuda")
+
+    kwargs_list = []
+    num_meshes = [16, 32, 64]
+    levels = [2, 3]
+    test_cases = product(num_meshes, levels, devices)
+    for case in test_cases:
+        n, l, d = case
+        kwargs_list.append({"num_meshes": n, "level": l, "device": d})
+
+    benchmark(
+        TestMeshNormalConsistency.mesh_normal_consistency_with_ico,
+        "MESH_NORMAL_CONSISTENCY_ICO",
+        kwargs_list,
+        warmup_iters=1,
+    )
+
+
+if __name__ == "__main__":
+    bm_mesh_normal_consistency()
--- a/tests/benchmarks/bm_mesh_rasterizer_transform.py
+++ b/tests/benchmarks/bm_mesh_rasterizer_transform.py
@@ -0,0 +1,53 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from itertools import product
+
+import torch
+from fvcore.common.benchmark import benchmark
+from pytorch3d.renderer.cameras import FoVPerspectiveCameras, look_at_view_transform
+from pytorch3d.renderer.mesh.rasterizer import MeshRasterizer
+from pytorch3d.utils.ico_sphere import ico_sphere
+
+
+def rasterize_transform_with_init(num_meshes: int, ico_level: int = 5, device="cuda"):
+    # Init meshes
+    sphere_meshes = ico_sphere(ico_level, device).extend(num_meshes)
+    # Init transform
+    R, T = look_at_view_transform(1.0, 0.0, 0.0)
+    cameras = FoVPerspectiveCameras(device=device, R=R, T=T)
+    # Init rasterizer
+    rasterizer = MeshRasterizer(cameras=cameras)
+
+    torch.cuda.synchronize()
+
+    def raster_fn():
+        rasterizer.transform(sphere_meshes)
+        torch.cuda.synchronize()
+
+    return raster_fn
+
+
+def bm_mesh_rasterizer_transform() -> None:
+    if torch.cuda.is_available():
+        kwargs_list = []
+        num_meshes = [1, 8]
+        ico_level = [0, 1, 3, 4]
+        test_cases = product(num_meshes, ico_level)
+        for case in test_cases:
+            n, ic = case
+            kwargs_list.append({"num_meshes": n, "ico_level": ic})
+        benchmark(
+            rasterize_transform_with_init,
+            "MESH_RASTERIZER",
+            kwargs_list,
+            warmup_iters=1,
+        )
+
+
+if __name__ == "__main__":
+    bm_mesh_rasterizer_transform()
--- a/tests/benchmarks/bm_meshes.py
+++ b/tests/benchmarks/bm_meshes.py
@@ -0,0 +1,43 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from itertools import product
+
+import torch
+from fvcore.common.benchmark import benchmark
+from test_meshes import TestMeshes
+
+
+def bm_compute_packed_padded_meshes() -> None:
+    """Benchmark the packed and padded computation cases of `TestMeshes`."""
+    # "cuda" is only benchmarked when a CUDA device is available.
+    devices = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
+    grid = {
+        "num_meshes": [32, 128],
+        "max_v": [100, 1000, 10000],
+        "max_f": [300, 3000, 30000],
+        "device": devices,
+    }
+    # Both benchmarks share the same list of keyword arguments.
+    kwargs_list = [dict(zip(grid, combo)) for combo in product(*grid.values())]
+
+    benchmark(
+        TestMeshes.compute_packed_with_init,
+        "COMPUTE_PACKED",
+        kwargs_list,
+        warmup_iters=1,
+    )
+    benchmark(
+        TestMeshes.compute_padded_with_init,
+        "COMPUTE_PADDED",
+        kwargs_list,
+        warmup_iters=1,
+    )
+
+
+if __name__ == "__main__":
+    bm_compute_packed_padded_meshes()
--- a/tests/benchmarks/bm_packed_to_padded.py
+++ b/tests/benchmarks/bm_packed_to_padded.py
@@ -0,0 +1,48 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from itertools import product
+
+import torch
+from fvcore.common.benchmark import benchmark
+from test_packed_to_padded import TestPackedToPadded
+
+
+def bm_packed_to_padded() -> None:
+    """Benchmark both packed-to-padded cases of `TestPackedToPadded`.
+
+    The same keyword-argument grid is passed to both benchmarks; "cuda:0" is
+    only part of the grid when CUDA is available.
+    """
+    devices = ["cpu", "cuda:0"] if torch.cuda.is_available() else ["cpu"]
+
+    grid = {
+        "num_meshes": [2, 10, 32],
+        "num_verts": [100, 1000],
+        "num_faces": [300, 3000],
+        "num_d": [0, 1, 16],
+        "device": devices,
+    }
+    kwargs_list = [dict(zip(grid, combo)) for combo in product(*grid.values())]
+
+    benchmark(
+        TestPackedToPadded.packed_to_padded_with_init,
+        "PACKED_TO_PADDED",
+        kwargs_list,
+        warmup_iters=1,
+    )
+
+    benchmark(
+        TestPackedToPadded.packed_to_padded_with_init_torch,
+        "PACKED_TO_PADDED_TORCH",
+        kwargs_list,
+        warmup_iters=1,
+    )
+
+
+if __name__ == "__main__":
+    bm_packed_to_padded()
--- a/tests/benchmarks/bm_perspective_n_points.py
+++ b/tests/benchmarks/bm_perspective_n_points.py
@@ -0,0 +1,33 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import itertools
+
+from fvcore.common.benchmark import benchmark
+from test_perspective_n_points import TestPerspectiveNPoints
+
+
+def bm_perspective_n_points() -> None:
+    """Benchmark `TestPerspectiveNPoints.case_with_gaussian_points` over a grid."""
+    names = ("batch_size", "num_pts", "skip_q")
+    value_lists = ([1, 10, 100], [100, 100000], [False, True])
+
+    # One kwargs dict per element of the Cartesian product of the value lists.
+    test_cases = itertools.product(*value_lists)
+    kwargs_list = [dict(zip(names, case)) for case in test_cases]
+
+    # The benchmarked callable is a bound method, so an instance is needed.
+    pnp_test = TestPerspectiveNPoints()
+    benchmark(
+        pnp_test.case_with_gaussian_points,
+        "PerspectiveNPoints",
+        kwargs_list,
+        warmup_iters=1,
+    )
+
+
+if __name__ == "__main__":
+    bm_perspective_n_points()
--- a/tests/benchmarks/bm_point_mesh_distance.py
+++ b/tests/benchmarks/bm_point_mesh_distance.py
@@ -0,0 +1,44 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from itertools import product
+
+from fvcore.common.benchmark import benchmark
+from test_point_mesh_distance import TestPointMeshDistance
+
+
+def bm_point_mesh_distance() -> None:
+    """Benchmark the point-mesh edge and face cases of `TestPointMeshDistance`.
+
+    "cuda:0" is the only device in the grid and is used unconditionally.
+    """
+    grid = {
+        "N": [4, 8, 16],
+        "V": [100, 1000],
+        "F": [300, 3000],
+        "P": [5000, 10000],
+        "device": ["cuda:0"],
+    }
+    kwargs_list = [dict(zip(grid, combo)) for combo in product(*grid.values())]
+
+    benchmark(
+        TestPointMeshDistance.point_mesh_edge,
+        "POINT_MESH_EDGE",
+        kwargs_list,
+        warmup_iters=1,
+    )
+
+    benchmark(
+        TestPointMeshDistance.point_mesh_face,
+        "POINT_MESH_FACE",
+        kwargs_list,
+        warmup_iters=1,
+    )
+
+
+if __name__ == "__main__":
+    bm_point_mesh_distance()
--- a/tests/benchmarks/bm_pointclouds.py
+++ b/tests/benchmarks/bm_pointclouds.py
@@ -0,0 +1,38 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from itertools import product
+
+from fvcore.common.benchmark import benchmark
+from test_pointclouds import TestPointclouds
+
+
+def bm_compute_packed_padded_pointclouds() -> None:
+    """Benchmark the packed and padded computation cases of `TestPointclouds`.
+
+    Both benchmarks receive the same list of keyword arguments: one dict per
+    combination of cloud count, `max_p` and `features`.
+    """
+    cloud_counts = [32, 128]
+    max_points = [100, 10000]
+    feature_values = [1, 10, 300]
+    kwargs_list = [
+        {"num_clouds": count, "max_p": points, "features": feats}
+        for count, points, feats in product(cloud_counts, max_points, feature_values)
+    ]
+
+    timed_targets = (
+        (TestPointclouds.compute_packed_with_init, "COMPUTE_PACKED"),
+        (TestPointclouds.compute_padded_with_init, "COMPUTE_PADDED"),
+    )
+    for target, label in timed_targets:
+        # Same warm-up setting for both benchmarks.
+        benchmark(target, label, kwargs_list, warmup_iters=1)
+
+
+if __name__ == "__main__":
+    bm_compute_packed_padded_pointclouds()
--- a/tests/benchmarks/bm_points_alignment.py
+++ b/tests/benchmarks/bm_points_alignment.py
@@ -0,0 +1,80 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from copy import deepcopy
+from itertools import product
+
+from fvcore.common.benchmark import benchmark
+from test_points_alignment import TestCorrespondingPointsAlignment, TestICP
+
+
+def bm_iterative_closest_point() -> None:
+    """Benchmark `TestICP.iterative_closest_point` over a grid of settings."""
+    case_grid = {
+        "batch_size": [1, 10],
+        "dim": [3, 20],
+        "n_points_X": [100, 1000],
+        "n_points_Y": [100, 1000],
+        "use_pointclouds": [False],
+    }
+
+    # Take the names in dict order so that they line up with `.values()`.
+    # Sorting them (as before) only worked because the grid happens to be
+    # written in alphabetical order.
+    test_args = list(case_grid.keys())
+    test_cases = product(*case_grid.values())
+    kwargs_list = [dict(zip(test_args, case)) for case in test_cases]
+
+    # add the use_pointclouds=True test cases whenever we have dim==3
+    kwargs_to_add = [deepcopy(entry) for entry in kwargs_list if entry["dim"] == 3]
+    for entry_add in kwargs_to_add:
+        entry_add["use_pointclouds"] = True
+    kwargs_list.extend(kwargs_to_add)
+
+    benchmark(
+        TestICP.iterative_closest_point,
+        "IterativeClosestPoint",
+        kwargs_list,
+        warmup_iters=1,
+    )
+
+
+def bm_corresponding_points_alignment() -> None:
+    """Benchmark `corresponding_points_alignment` over a grid of settings."""
+    case_grid = {
+        "allow_reflection": [True, False],
+        "batch_size": [1, 10, 100],
+        "dim": [3, 20],
+        "estimate_scale": [True, False],
+        "n_points": [100, 10000],
+        "random_weights": [False, True],
+        "use_pointclouds": [False],
+    }
+
+    # Take the names in dict order so that they line up with `.values()`.
+    # Sorting them (as before) only worked because the grid happens to be
+    # written in alphabetical order.
+    test_args = list(case_grid.keys())
+    test_cases = product(*case_grid.values())
+    kwargs_list = [dict(zip(test_args, case)) for case in test_cases]
+
+    # add the use_pointclouds=True test cases whenever we have dim==3
+    kwargs_to_add = [deepcopy(entry) for entry in kwargs_list if entry["dim"] == 3]
+    for entry_add in kwargs_to_add:
+        entry_add["use_pointclouds"] = True
+    kwargs_list.extend(kwargs_to_add)
+
+    benchmark(
+        TestCorrespondingPointsAlignment.corresponding_points_alignment,
+        "CorrespodingPointsAlignment",
+        kwargs_list,
+        warmup_iters=1,
+    )
+
+
+if __name__ == "__main__":
+    bm_corresponding_points_alignment()
+    bm_iterative_closest_point()
--- a/tests/benchmarks/bm_points_to_volumes.py
+++ b/tests/benchmarks/bm_points_to_volumes.py
@@ -0,0 +1,33 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import itertools
+
+from fvcore.common.benchmark import benchmark
+from test_points_to_volumes import TestPointsToVolumes
+
+
+def bm_points_to_volumes() -> None:
+    """Benchmark `TestPointsToVolumes.add_points_to_volumes` on "cpu" and "cuda:0"."""
+    names = ("device", "batch_size", "interp_mode", "volume_size", "n_points")
+    value_lists = (
+        ["cpu", "cuda:0"],
+        [10, 100],
+        ["trilinear", "nearest"],
+        [[25, 25, 25], [101, 111, 121]],
+        [1000, 10000, 100000],
+    )
+    kwargs_list = [dict(zip(names, case)) for case in itertools.product(*value_lists)]
+    benchmark(
+        TestPointsToVolumes.add_points_to_volumes,
+        "ADD_POINTS_TO_VOLUMES",
+        kwargs_list,
+        warmup_iters=1,
+    )
+
+
+if __name__ == "__main__":
+    bm_points_to_volumes()
--- a/tests/benchmarks/bm_pulsar.py
+++ b/tests/benchmarks/bm_pulsar.py
@@ -0,0 +1,126 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Test render speed."""
+import logging
+import sys
+from os import path
+
+import torch
+from fvcore.common.benchmark import benchmark
+from pytorch3d.renderer.points.pulsar import Renderer
+from torch.autograd import Variable
+
+
+# Making sure you can run this, even if pulsar hasn't been installed yet.
+sys.path.insert(0, path.join(path.dirname(__file__), ".."))
+LOGGER = logging.getLogger(__name__)
+
+
+"""Measure the execution speed of the rendering.
+
+This measures a very pessimistic upper bound on runtime, because synchronization
+points have to be introduced in Python. On a pure PyTorch execution pipeline,
+results should be significantly faster. You can get pure CUDA timings through
+C++ by activating `PULSAR_TIMINGS_BATCHED_ENABLED` in the file
+`pytorch3d/csrc/pulsar/logging.h` or defining it for your compiler.
+"""
+
+
+def _bm_pulsar():
+    """Build a closure that runs one pulsar forward render and synchronizes CUDA.
+
+    The renderer is created with width = height = 1000 and one million points.
+    """
+    n_points = 1_000_000
+    width = height = 1_000
+    renderer = Renderer(width, height, n_points)
+    # Generate sample data (fixed seed, so the random inputs are reproducible).
+    torch.manual_seed(1)
+    positions = torch.rand(n_points, 3, dtype=torch.float32) * 10.0
+    positions[:, 2] += 25.0
+    positions[:, :2] -= 5.0
+    colors = torch.rand(n_points, 3, dtype=torch.float32)
+    radii = torch.rand(n_points, dtype=torch.float32)
+    cam_params = torch.tensor(
+        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 2.0], dtype=torch.float32
+    )
+    # Move everything to the GPU; the forward pass needs no gradients.
+    device = torch.device("cuda")
+    renderer = renderer.to(device)
+    pos_var, col_var, rad_var, cam_params_var = (
+        Variable(tensor.to(device), requires_grad=False)
+        for tensor in (positions, colors, radii, cam_params)
+    )
+
+    def bm_closure():
+        renderer.forward(
+            pos_var,
+            col_var,
+            rad_var,
+            cam_params_var,
+            1.0e-1,
+            45.0,
+            percent_allowed_difference=0.01,
+        )
+        torch.cuda.synchronize()
+
+    return bm_closure
+
+
+def _bm_pulsar_backward():
+    """Build a closure that backpropagates through one pulsar render.
+
+    The forward render runs once during setup; the closure only calls
+    `loss.backward`, retaining the graph so that it can be called repeatedly.
+    """
+    n_points = 1_000_000
+    width = height = 1_000
+    renderer = Renderer(width, height, n_points)
+    # Generate sample data (fixed seed, so the random inputs are reproducible).
+    torch.manual_seed(1)
+    positions = torch.rand(n_points, 3, dtype=torch.float32) * 10.0
+    positions[:, 2] += 25.0
+    positions[:, :2] -= 5.0
+    colors = torch.rand(n_points, 3, dtype=torch.float32)
+    radii = torch.rand(n_points, dtype=torch.float32)
+    cam_params = torch.tensor(
+        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 2.0], dtype=torch.float32
+    )
+    device = torch.device("cuda")
+    renderer = renderer.to(device)
+    pos_var, col_var, rad_var, cam_params_var = (
+        Variable(tensor.to(device), requires_grad=True)
+        for tensor in (positions, colors, radii, cam_params)
+    )
+    rendered = renderer.forward(
+        pos_var,
+        col_var,
+        rad_var,
+        cam_params_var,
+        1.0e-1,
+        45.0,
+        percent_allowed_difference=0.01,
+    )
+    loss = rendered.sum()
+
+    def bm_closure():
+        loss.backward(retain_graph=True)
+        torch.cuda.synchronize()
+
+    return bm_closure
+
+
+def bm_pulsar() -> None:
+    """Benchmark the pulsar forward and backward passes when CUDA is available."""
+    if torch.cuda.is_available():
+        # Neither setup function takes arguments, hence the single empty kwargs.
+        benchmark(_bm_pulsar, "PULSAR_FORWARD", [{}], warmup_iters=3)
+        benchmark(_bm_pulsar_backward, "PULSAR_BACKWARD", [{}], warmup_iters=3)
+
+
+if __name__ == "__main__":
+    bm_pulsar()
--- a/tests/benchmarks/bm_rasterize_meshes.py
+++ b/tests/benchmarks/bm_rasterize_meshes.py
@@ -0,0 +1,125 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from itertools import product
+
+import torch
+from fvcore.common.benchmark import benchmark
+from test_rasterize_meshes import TestRasterizeMeshes
+
+
+# ico levels:
+# 0: (12 verts, 20 faces)
+# 1: (42 verts, 80 faces)
+# 3: (642 verts, 1280 faces)
+# 4: (2562 verts, 5120 faces)
+# 5: (10242 verts, 20480 faces)
+# 6: (40962 verts, 81920 faces)
+
+
+def bm_rasterize_meshes() -> None:
+    """Benchmark the Python, CPU and CUDA mesh rasterization cases.
+
+    Four benchmarks are run in order:
+      * the Python case on a single tiny input,
+      * the CPU case (reported under the same "RASTERIZE_MESHES" label),
+      * the CUDA case on higher ico levels, square and non-square images,
+      * the CUDA clipping case, where the image plane intersects the mesh.
+
+    The two CUDA benchmarks are skipped when CUDA is not available.
+    """
+    # Keyword names of the CPU and CUDA grids, in the order of their products.
+    arg_names = (
+        "num_meshes",
+        "ico_level",
+        "image_size",
+        "blur_radius",
+        "faces_per_pixel",
+    )
+
+    # Pure Python implementation: one tiny case.
+    kwargs_list = [
+        {
+            "num_meshes": 1,
+            "ico_level": 0,
+            "image_size": 10,  # very slow with large image size
+            "blur_radius": 0.0,
+            "faces_per_pixel": 3,
+        }
+    ]
+
+    benchmark(
+        TestRasterizeMeshes.rasterize_meshes_python_with_init,
+        "RASTERIZE_MESHES",
+        kwargs_list,
+        warmup_iters=1,
+    )
+
+    # CPU implementation.
+    num_meshes = [1]
+    ico_level = [1]
+    image_size = [64, 128]
+    blur = [1e-6]
+    faces_per_pixel = [3, 50]
+    test_cases = product(num_meshes, ico_level, image_size, blur, faces_per_pixel)
+    kwargs_list = [dict(zip(arg_names, case)) for case in test_cases]
+
+    benchmark(
+        TestRasterizeMeshes.rasterize_meshes_cpu_with_init,
+        "RASTERIZE_MESHES",
+        kwargs_list,
+        warmup_iters=1,
+    )
+
+    if not torch.cuda.is_available():
+        return
+
+    # CUDA implementation.
+    num_meshes = [8, 16]
+    ico_level = [4, 5, 6]
+    # Square and non square cases
+    image_size = [64, 128, 512, (512, 256), (256, 512)]
+    blur = [1e-6]
+    faces_per_pixel = [40]
+    test_cases = product(num_meshes, ico_level, image_size, blur, faces_per_pixel)
+    kwargs_list = [dict(zip(arg_names, case)) for case in test_cases]
+
+    benchmark(
+        TestRasterizeMeshes.rasterize_meshes_cuda_with_init,
+        "RASTERIZE_MESHES_CUDA",
+        kwargs_list,
+        warmup_iters=1,
+    )
+
+    # Test a subset of the cases with the
+    # image plane intersecting the mesh.
+    num_meshes = [8, 16]
+    # Square and non square cases
+    image_size = [64, 128, 512, (512, 256), (256, 512)]
+    dist = [3, 0.8, 0.5]
+    kwargs_list = [
+        {
+            "num_meshes": n,
+            "ico_level": 4,
+            "image_size": im,
+            "blur_radius": 1e-6,
+            "faces_per_pixel": 40,
+            "dist": d,
+        }
+        for n, d, im in product(num_meshes, dist, image_size)
+    ]
+
+    benchmark(
+        TestRasterizeMeshes.bm_rasterize_meshes_with_clipping,
+        "RASTERIZE_MESHES_CUDA_CLIPPING",
+        kwargs_list,
+        warmup_iters=1,
+    )
+
+
+if __name__ == "__main__":
+    bm_rasterize_meshes()
--- a/tests/benchmarks/bm_rasterize_points.py
+++ b/tests/benchmarks/bm_rasterize_points.py
@@ -0,0 +1,105 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from itertools import product
+
+import torch
+from fvcore.common.benchmark import benchmark
+from pytorch3d.renderer.points.rasterize_points import (
+    rasterize_points,
+    rasterize_points_python,
+)
+from pytorch3d.structures.pointclouds import Pointclouds
+
+
+def _bm_python_with_init(N, P, img_size=32, radius=0.1, pts_per_pxl=3):
+    """Return a closure calling `rasterize_points_python` on random points."""
+    # Fixed seed: every call builds the same random (N, P, 3) point tensor.
+    torch.manual_seed(231)
+    pointclouds = Pointclouds(points=torch.randn(N, P, 3))
+    return lambda: rasterize_points_python(pointclouds, img_size, radius, pts_per_pxl)
+
+
+def _bm_rasterize_points_with_init(
+    N, P, img_size=32, radius=0.1, pts_per_pxl=3, device="cpu", expand_radius=False
+):
+    """Return a closure calling `rasterize_points` on random (N, P, 3) points."""
+    torch.manual_seed(231)
+    device = torch.device(device)
+    pointclouds = Pointclouds(points=torch.randn(N, P, 3, device=device))
+    if expand_radius:
+        points_padded = pointclouds.points_padded()
+        radius = torch.full((N, P), fill_value=radius).type_as(points_padded)
+
+    args = (pointclouds, img_size, radius, pts_per_pxl)
+    # Compare the type: a torch.device is never equal to the str "cuda".
+    if device.type == "cuda":
+        torch.cuda.synchronize(device)
+
+    def fn():
+        rasterize_points(*args)
+        if device.type == "cuda":
+            torch.cuda.synchronize(device)
+
+    return fn
+
+
+def bm_python_vs_cpu_vs_cuda() -> None:
+    """Benchmark `rasterize_points` on the CPU and, when available, on CUDA.
+
+    The CUDA benchmark repeats every CPU case and adds larger inputs, including
+    non-square images. On machines without CUDA it is skipped instead of
+    failing while creating tensors on an unavailable device.
+    """
+    num_meshes = [1]
+    num_points = [10000, 2000]
+    image_size = [128, 256]
+    radius = [1e-3, 0.01]
+    pts_per_pxl = [50, 100]
+    expand = [True, False]
+    test_cases = product(
+        num_meshes, num_points, image_size, radius, pts_per_pxl, expand
+    )
+    kwargs_list = [
+        {
+            "N": n,
+            "P": p,
+            "img_size": im,
+            "radius": r,
+            "pts_per_pxl": pts,
+            "device": "cpu",
+            "expand_radius": e,
+        }
+        for n, p, im, r, pts, e in test_cases
+    ]
+    benchmark(
+        _bm_rasterize_points_with_init, "RASTERIZE_CPU", kwargs_list, warmup_iters=1
+    )
+
+    if not torch.cuda.is_available():
+        return
+
+    # Larger cases for CUDA only: (N, P, img_size), square and non-square images.
+    large_cases = [
+        (32, 100000, 128),
+        (8, 200000, 512),
+        (8, 200000, 256),
+        (8, 200000, (512, 256)),
+        (8, 200000, (256, 512)),
+    ]
+    kwargs_list += [
+        {"N": n, "P": p, "img_size": im, "radius": 0.01, "pts_per_pxl": 50}
+        for n, p, im in large_cases
+    ]
+    for k in kwargs_list:
+        k["device"] = "cuda"
+    benchmark(
+        _bm_rasterize_points_with_init, "RASTERIZE_CUDA", kwargs_list, warmup_iters=1
+    )
+
+
+if __name__ == "__main__":
+    bm_python_vs_cpu_vs_cuda()
--- a/tests/benchmarks/bm_raymarching.py
+++ b/tests/benchmarks/bm_raymarching.py
@@ -0,0 +1,23 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import itertools
+
+from fvcore.common.benchmark import benchmark
+from pytorch3d.renderer import AbsorptionOnlyRaymarcher, EmissionAbsorptionRaymarcher
+from test_raymarching import TestRaymarching
+
+
+def bm_raymarching() -> None:
+    """Benchmark `TestRaymarching.raymarcher` for both raymarcher types."""
+    raymarcher_types = [EmissionAbsorptionRaymarcher, AbsorptionOnlyRaymarcher]
+    sizes = [10, 1000, 10000]  # used for both the ray count and points per ray
+    kwargs_list = [
+        {"raymarcher_type": marcher, "n_rays": n_rays, "n_pts_per_ray": n_pts}
+        for marcher, n_rays, n_pts in itertools.product(raymarcher_types, sizes, sizes)
+    ]
+
+    benchmark(TestRaymarching.raymarcher, "RAYMARCHER", kwargs_list, warmup_iters=1)
--- a/tests/benchmarks/bm_raysampling.py
+++ b/tests/benchmarks/bm_raysampling.py
@@ -0,0 +1,43 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import itertools
+
+from fvcore.common.benchmark import benchmark
+from pytorch3d.renderer import (
+    FoVOrthographicCameras,
+    FoVPerspectiveCameras,
+    GridRaysampler,
+    MonteCarloRaysampler,
+    NDCGridRaysampler,
+    OrthographicCameras,
+    PerspectiveCameras,
+)
+from test_raysampling import TestRaysampling
+
+
+def bm_raysampling() -> None:
+    """Benchmark `TestRaysampling.raysampler` for all sampler/camera pairings."""
+    grid = {
+        "raysampler_type": [GridRaysampler, NDCGridRaysampler, MonteCarloRaysampler],
+        "camera_type": [
+            PerspectiveCameras,
+            OrthographicCameras,
+            FoVPerspectiveCameras,
+            FoVOrthographicCameras,
+        ],
+        "batch_size": [1, 10],
+        "n_pts_per_ray": [10, 1000, 10000],
+        "image_width": [10, 300],
+        "image_height": [10, 300],
+    }
+    names = list(grid)
+    kwargs_list = [dict(zip(names, c)) for c in itertools.product(*grid.values())]
+    benchmark(TestRaysampling.raysampler, "RAYSAMPLER", kwargs_list, warmup_iters=1)
+
+
+if __name__ == "__main__":
+    bm_raysampling()
--- a/tests/benchmarks/bm_render_implicit.py
+++ b/tests/benchmarks/bm_render_implicit.py
@@ -0,0 +1,26 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import itertools
+
+from fvcore.common.benchmark import benchmark
+from pytorch3d.renderer import AbsorptionOnlyRaymarcher, EmissionAbsorptionRaymarcher
+from test_render_implicit import TestRenderImplicit
+
+
+def bm_render_volumes() -> None:
+    """Benchmark `TestRenderImplicit.renderer` (the implicit renderer) over a grid."""
+    names = ("batch_size", "raymarcher_type", "n_rays_per_image", "n_pts_per_ray")
+    value_lists = (
+        [1, 5],
+        [EmissionAbsorptionRaymarcher, AbsorptionOnlyRaymarcher],
+        [64 ** 2, 256 ** 2],
+        [16, 128],
+    )
+    kwargs_list = [dict(zip(names, case)) for case in itertools.product(*value_lists)]
+    benchmark(
+        TestRenderImplicit.renderer, "IMPLICIT_RENDERER", kwargs_list, warmup_iters=1
+    )
--- a/tests/benchmarks/bm_render_volumes.py
+++ b/tests/benchmarks/bm_render_volumes.py
@@ -0,0 +1,28 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import itertools
+
+from fvcore.common.benchmark import benchmark
+from pytorch3d.renderer import AbsorptionOnlyRaymarcher, EmissionAbsorptionRaymarcher
+from test_render_volumes import TestRenderVolumes
+
+
+def bm_render_volumes() -> None:
+    """Benchmark `TestRenderVolumes.renderer` over volume, batch and ray settings."""
+    grid = {
+        "volume_size": [(17, 17, 17), (129, 129, 129)],
+        "batch_size": [1, 5],
+        "shape": ["sphere", "cube"],
+        "raymarcher_type": [EmissionAbsorptionRaymarcher, AbsorptionOnlyRaymarcher],
+        "n_rays_per_image": [64 ** 2, 256 ** 2],
+        "n_pts_per_ray": [16, 128],
+    }
+    names = list(grid)
+    kwargs_list = [dict(zip(names, c)) for c in itertools.product(*grid.values())]
+    benchmark(
+        TestRenderVolumes.renderer, "VOLUME_RENDERER", kwargs_list, warmup_iters=1
+    )
--- a/tests/benchmarks/bm_sample_farthest_points.py
+++ b/tests/benchmarks/bm_sample_farthest_points.py
@@ -0,0 +1,46 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from itertools import product
+
+from fvcore.common.benchmark import benchmark
+from test_sample_farthest_points import TestFPS
+
+
+def bm_fps() -> None:
+    """Benchmark the naive and the regular farthest point sampling cases.
+
+    The naive version only runs the small cases; the regular version runs the
+    small cases plus larger ones. "cuda:0" is used unconditionally.
+    """
+    names = ("N", "P", "D", "K", "device")
+    backends = ["cpu", "cuda:0"]
+
+    Ns = [8, 32]
+    Ps = [64, 256]
+    small_cases = product(Ns, Ps, [3], [24], backends)
+    kwargs_list = [dict(zip(names, case)) for case in small_cases]
+    benchmark(
+        TestFPS.sample_farthest_points_naive,
+        "FPS_NAIVE_PYTHON",
+        kwargs_list,
+        warmup_iters=1,
+    )
+
+    # Add some larger batch sizes and pointcloud sizes
+    # NOTE(review): 18384 looks like a typo for 16384 (2 ** 14) - confirm.
+    Ns = [32]
+    Ps = [2048, 8192, 18384]
+    Ds = [3, 9]
+    Ks = [24, 48]
+    large_cases = product(Ns, Ps, Ds, Ks, backends)
+    kwargs_list.extend(dict(zip(names, case)) for case in large_cases)
+
+    benchmark(TestFPS.sample_farthest_points, "FPS", kwargs_list, warmup_iters=1)
+
+
+if __name__ == "__main__":
+    bm_fps()
--- a/tests/benchmarks/bm_sample_pdf.py
+++ b/tests/benchmarks/bm_sample_pdf.py
@@ -0,0 +1,37 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from itertools import product
+
+from fvcore.common.benchmark import benchmark
+from test_sample_pdf import TestSamplePDF
+
+
+def bm_sample_pdf() -> None:
+    """Benchmark `TestSamplePDF.bm_fn` for all four backends, CUDA ones included."""
+    backends = ["python_cuda", "cuda", "python_cpu", "cpu"]
+    sample_counts = [64]
+    batch_sizes = [1024, 10240]
+    bin_counts = [62, 600]
+
+    # One kwargs dict per combination; the backend varies slowest.
+    kwargs_list = [
+        {
+            "backend": backend,
+            "n_samples": n_samples,
+            "batch_size": batch_size,
+            "n_bins": n_bins,
+        }
+        for backend, n_samples, batch_size, n_bins in product(
+            backends, sample_counts, batch_sizes, bin_counts
+        )
+    ]
+
+    benchmark(TestSamplePDF.bm_fn, "SAMPLE_PDF", kwargs_list, warmup_iters=1)
+
+
+if __name__ == "__main__":
+    bm_sample_pdf()
--- a/tests/benchmarks/bm_sample_points_from_meshes.py
+++ b/tests/benchmarks/bm_sample_points_from_meshes.py
@@ -0,0 +1,46 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from itertools import product
+
+import torch
+from fvcore.common.benchmark import benchmark
+from test_sample_points_from_meshes import TestSamplePoints
+
+
+def bm_sample_points() -> None:
+    """Benchmark `TestSamplePoints.sample_points_with_init` over a grid.
+
+    Every combination of mesh count, vertex count, face count, sample count
+    and device is one case; "cuda:0" is only included when CUDA is available.
+    """
+    devices = ["cpu"]
+    if torch.cuda.is_available():
+        devices.append("cuda:0")
+
+    grid = {
+        "num_meshes": [2, 10, 32],
+        "num_verts": [100, 1000],
+        "num_faces": [300, 3000],
+        "num_samples": [5000, 10000],
+        "device": devices,
+    }
+    # The dict order fixes both the keyword names and the product order.
+    names = list(grid)
+    test_cases = product(*grid.values())
+    kwargs_list = [dict(zip(names, case)) for case in test_cases]
+
+    benchmark(
+        TestSamplePoints.sample_points_with_init,
+        "SAMPLE_MESH",
+        kwargs_list,
+        warmup_iters=1,
+    )
+
+
+if __name__ == "__main__":
+    bm_sample_points()
--- a/tests/benchmarks/bm_se3.py
+++ b/tests/benchmarks/bm_se3.py
@@ -0,0 +1,23 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fvcore.common.benchmark import benchmark
+from test_se3 import TestSE3
+
+
+def bm_se3() -> None:
+    """Benchmark the SE3 exponential and logarithm map cases of `TestSE3`."""
+    batch_sizes = (1, 10, 100, 1000)
+    kwargs_list = [{"batch_size": size} for size in batch_sizes]
+
+    # Both maps are benchmarked with the same list of batch sizes.
+    timed_targets = ((TestSE3.se3_expmap, "SE3_EXP"), (TestSE3.se3_logmap, "SE3_LOG"))
+    for target, label in timed_targets:
+        benchmark(target, label, kwargs_list, warmup_iters=1)
+
+
+if __name__ == "__main__":
+    bm_se3()
--- a/tests/benchmarks/bm_so3.py
+++ b/tests/benchmarks/bm_so3.py
@@ -0,0 +1,23 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fvcore.common.benchmark import benchmark
+from test_so3 import TestSO3
+
+
+def bm_so3() -> None:
+    """Benchmark the SO3 exponential and logarithm map cases of `TestSO3`."""
+    batch_sizes = (1, 10, 100, 1000)
+    kwargs_list = [{"batch_size": size} for size in batch_sizes]
+
+    # Both maps are benchmarked with the same list of batch sizes.
+    timed_targets = ((TestSO3.so3_expmap, "SO3_EXP"), (TestSO3.so3_logmap, "SO3_LOG"))
+    for target, label in timed_targets:
+        benchmark(target, label, kwargs_list, warmup_iters=1)
+
+
+if __name__ == "__main__":
+    bm_so3()
--- a/tests/benchmarks/bm_subdivide_meshes.py
+++ b/tests/benchmarks/bm_subdivide_meshes.py
@@ -0,0 +1,31 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from itertools import product
+
+from fvcore.common.benchmark import benchmark
+from test_subdivide_meshes import TestSubdivideMeshes
+
+
+def bm_subdivide() -> None:
+    """Benchmark `TestSubdivideMeshes.subdivide_meshes_with_init` over a grid."""
+    mesh_counts = [1, 16, 32]
+    topology_flags = [True, False]
+    kwargs_list = [
+        {"num_meshes": count, "same_topo": flag}
+        for count, flag in product(mesh_counts, topology_flags)
+    ]
+    benchmark(
+        TestSubdivideMeshes.subdivide_meshes_with_init,
+        "SUBDIVIDE",
+        kwargs_list,
+        warmup_iters=1,
+    )
+
+
+if __name__ == "__main__":
+    bm_subdivide()
--- a/tests/benchmarks/bm_symeig3x3.py
+++ b/tests/benchmarks/bm_symeig3x3.py
@@ -0,0 +1,94 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from itertools import product
+from typing import Any, Callable
+
+import torch
+from common_testing import get_random_cuda_device
+from fvcore.common.benchmark import benchmark
+from pytorch3d.common.workaround import symeig3x3
+from test_symeig3x3 import TestSymEig3x3
+
+
+# Module-level setup, executed at import time.
+# Restrict PyTorch to a single thread for the benchmarks in this file.
+torch.set_num_threads(1)
+
+# Device used below both for tracing the "*_traced_cuda" entry and for the
+# GPU benchmark cases.
+CUDA_DEVICE = get_random_cuda_device()
+
+
+def create_traced_func(func, device, batch_size):
+    """
+    Trace `func` with `torch.jit.trace` using one random example batch.
+
+    Args:
+        func: callable to trace; it is traced with a single positional input.
+        device: device passed to `TestSymEig3x3.create_random_sym3x3`.
+        batch_size: batch size passed to `TestSymEig3x3.create_random_sym3x3`.
+
+    Returns:
+        The object returned by `torch.jit.trace`.
+    """
+    example_input = TestSymEig3x3.create_random_sym3x3(device, batch_size)
+    return torch.jit.trace(func, (example_input,))
+
+
+# Maps a benchmark variant name to a callable taking one `inputs` argument.
+# The names are matched by substring in bm_symeig3x3: names containing "cuda"
+# are only run on CUDA devices, and names containing "torch" are skipped for
+# large batches.
+FUNC_NAME_TO_FUNC = {
+    "sym3x3eig": (lambda inputs: symeig3x3(inputs, eigenvectors=True)),
+    # Traced once, at import time, on CUDA_DEVICE with an example batch of 1024.
+    "sym3x3eig_traced_cuda": create_traced_func(
+        (lambda inputs: symeig3x3(inputs, eigenvectors=True)), CUDA_DEVICE, 1024
+    ),
+    # NOTE(review): torch.symeig is deprecated in newer PyTorch releases in
+    # favour of torch.linalg.eigh - confirm against the supported versions.
+    "torch_symeig": (lambda inputs: torch.symeig(inputs, eigenvectors=True)),
+    "torch_linalg_eigh": (lambda inputs: torch.linalg.eigh(inputs)),
+    "torch_pca_lowrank": (
+        lambda inputs: torch.pca_lowrank(inputs, center=False, niter=1)
+    ),
+    # Variants below do not request eigenvectors.
+    "sym3x3eig_no_vecs": (lambda inputs: symeig3x3(inputs, eigenvectors=False)),
+    "torch_symeig_no_vecs": (lambda inputs: torch.symeig(inputs, eigenvectors=False)),
+    "torch_linalg_eigvalsh_no_vecs": (lambda inputs: torch.linalg.eigvalsh(inputs)),
+}
+
+
+def test_symeig3x3(func_name, batch_size, device) -> Callable[[], Any]:
+    """
+    Build a zero-argument closure that runs one symeig implementation once.
+
+    Args:
+        func_name: key into `FUNC_NAME_TO_FUNC` selecting the implementation.
+        batch_size: batch size passed to `TestSymEig3x3.create_random_sym3x3`.
+        device: device passed to `TestSymEig3x3.create_random_sym3x3`; a
+            device string or a `torch.device`.
+
+    Returns:
+        A callable taking no arguments which applies the selected function to
+        the pre-generated inputs and discards the result. For CUDA devices the
+        callable also waits for the device to finish before returning.
+    """
+    func = FUNC_NAME_TO_FUNC[func_name]
+    inputs = TestSymEig3x3.create_random_sym3x3(device, batch_size)
+
+    # Synchronize only when benchmarking on CUDA, and name the target device
+    # explicitly: an argument-less torch.cuda.synchronize() waits on the
+    # *current* CUDA device (which may differ from `device`) and cannot run at
+    # all when CUDA is unavailable, even for CPU-only cases.
+    target = torch.device(device)
+    needs_sync = target.type == "cuda"
+    if needs_sync:
+        torch.cuda.synchronize(target)
+
+    # NOTE: this local name shadows the imported `symeig3x3` inside this
+    # function only; the callables stored in FUNC_NAME_TO_FUNC are unaffected.
+    def symeig3x3():
+        func(inputs)
+        if needs_sync:
+            torch.cuda.synchronize(target)
+
+    return symeig3x3
+
+
+def bm_symeig3x3() -> None:
+    """
+    Benchmark every entry of `FUNC_NAME_TO_FUNC` over batch sizes and devices.
+
+    Cases are generated for "cpu" and, when CUDA is available, `CUDA_DEVICE`.
+    A case is dropped when:
+      - the function name contains "cuda" but the device is not a CUDA device;
+      - the batch size exceeds 8192 and either the function name contains
+        "torch" or the device is "cpu".
+    The remaining cases are run with three warm-up iterations.
+    """
+    large_batch_threshold = 8192
+    batch_sizes = [16, 128, 1024, 8192, 65536, 1048576]
+    devices = ["cpu", CUDA_DEVICE] if torch.cuda.is_available() else ["cpu"]
+
+    kwargs_list = []
+    # Same iteration order as a cartesian product of names, sizes and devices.
+    for func_name in FUNC_NAME_TO_FUNC:
+        cuda_only = "cuda" in func_name
+        torch_builtin = "torch" in func_name
+        for batch_size in batch_sizes:
+            is_large = batch_size > large_batch_threshold
+            for device in devices:
+                # CUDA-only implementations are benchmarked on GPU only.
+                if cuda_only and not device.startswith("cuda"):
+                    continue
+
+                # Torch built-ins and CPU runs are too slow on large batches.
+                if is_large and (torch_builtin or device == "cpu"):
+                    continue
+
+                kwargs_list.append(
+                    {
+                        "func_name": func_name,
+                        "batch_size": batch_size,
+                        "device": device,
+                    }
+                )
+
+    benchmark(test_symeig3x3, "SYMEIG3X3", kwargs_list, warmup_iters=3)
+
+
+if __name__ == "__main__":
+    bm_symeig3x3()
--- a/tests/benchmarks/bm_vert_align.py
+++ b/tests/benchmarks/bm_vert_align.py
@@ -0,0 +1,37 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from itertools import product
+
+import torch
+from fvcore.common.benchmark import benchmark
+from test_vert_align import TestVertAlign
+
+
+def bm_vert_align() -> None:
+    """
+    Benchmark `TestVertAlign.vert_align_with_init`.
+
+    One case is generated for every combination of `num_meshes` in
+    (2, 10, 32), `num_verts` in (100, 1000), `num_faces` in (300, 3000) and
+    the devices "cpu" plus, when CUDA is available, "cuda". Each case is run
+    with a single warm-up iteration.
+    """
+    devices = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
+    mesh_counts = [2, 10, 32]
+    vert_counts = [100, 1000]
+    face_counts = [300, 3000]
+
+    kwargs_list = [
+        {"num_meshes": meshes, "num_verts": verts, "num_faces": faces, "device": dev}
+        for meshes, verts, faces, dev in product(
+            mesh_counts, vert_counts, face_counts, devices
+        )
+    ]
+
+    benchmark(
+        TestVertAlign.vert_align_with_init, "VERT_ALIGN", kwargs_list, warmup_iters=1
+    )
+
+
+if __name__ == "__main__":
+    bm_vert_align()