# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.

from itertools import product
import torch
from fvcore.common.benchmark import benchmark

from pytorch3d import _C
from pytorch3d.ops.knn import _knn_points_idx_naive


def bm_knn() -> None:
    """
    Entry point for the benchmark.

    The CPU benchmarks always run. The two CUDA benchmark suites allocate
    tensors on a CUDA device, so they are only run when
    ``torch.cuda.is_available()`` reports a usable device; otherwise they
    are skipped with a short notice instead of raising part-way through
    the run.
    """
    benchmark_knn_cpu()

    if not torch.cuda.is_available():
        # Without a CUDA device the setup functions used by the suites
        # below would fail when creating their inputs.
        print("CUDA is not available; skipping the CUDA KNN benchmarks.")
        return

    benchmark_knn_cuda_vs_naive()
    benchmark_knn_cuda_versions()


def benchmark_knn_cuda_versions() -> None:
    """
    Benchmark the different CUDA KNN versions against each other, and
    also against the existing 1-NN op.

    Every combination of batch size N, point count P, dimension D,
    neighbor count K and version is timed, except that version 2 is only
    run for K <= 32 and version 3 only for K <= 4.
    """
    batch_sizes = [1, 2]
    point_counts = [4096, 16384]
    dims = [3]
    neighbor_counts = [1, 4, 16, 64]
    versions = [0, 1, 2, 3]
    # Largest K benchmarked for a given version; a version that is not
    # listed here is run for every K.
    largest_k = {2: 32, 3: 4}

    knn_kwargs = [
        {"N": N, "D": D, "P": P, "K": K, "v": version}
        for N, P, D, K, version in product(
            batch_sizes, point_counts, dims, neighbor_counts, versions
        )
        if K <= largest_k.get(version, K)
    ]
    nn_kwargs = [
        {"N": N, "D": D, "P": P}
        for N, P, D in product(batch_sizes, point_counts, dims)
    ]

    benchmark(
        knn_cuda_with_init, "KNN_CUDA_VERSIONS", knn_kwargs, warmup_iters=1
    )
    benchmark(nn_cuda_with_init, "NN_CUDA", nn_kwargs, warmup_iters=1)


def benchmark_knn_cuda_vs_naive() -> None:
    """
    Benchmark the CUDA KNN op against the naive PyTorch version of KNN.

    The CUDA op is timed on every (N, P, D, K) combination; the naive
    version is only timed on the combinations with P <= 4096.
    """
    batch_sizes = [1, 2, 4]
    point_counts = [1024, 4096, 16384, 65536]
    dims = [3]
    neighbor_counts = [1, 2, 4, 8, 16]

    knn_kwargs = [
        {"N": N, "D": D, "P": P, "K": K}
        for N, P, D, K in product(
            batch_sizes, point_counts, dims, neighbor_counts
        )
    ]
    # The naive run gets its own copies of the small-P configurations.
    naive_kwargs = [dict(cfg) for cfg in knn_kwargs if cfg["P"] <= 4096]

    benchmark(
        knn_python_cuda_with_init,
        "KNN_CUDA_PYTHON",
        naive_kwargs,
        warmup_iters=1,
    )
    benchmark(knn_cuda_with_init, "KNN_CUDA", knn_kwargs, warmup_iters=1)


def benchmark_knn_cpu() -> None:
    """
    Benchmark the CPU code paths: the naive Python KNN, the `_C` KNN op
    and the `_C` NN op.

    The two KNN benchmarks share one list of (N, P, D, K) configurations;
    the NN benchmark uses the same N, P, D values without K.
    """
    batch_sizes = [1, 2]
    point_counts = [256, 512]
    dims = [3]
    neighbor_counts = [1, 2, 4]

    knn_kwargs = [
        {"N": N, "D": D, "P": P, "K": K}
        for N, P, D, K in product(
            batch_sizes, point_counts, dims, neighbor_counts
        )
    ]
    nn_kwargs = [
        {"N": N, "D": D, "P": P}
        for N, P, D in product(batch_sizes, point_counts, dims)
    ]

    benchmark(
        knn_python_cpu_with_init, "KNN_CPU_PYTHON", knn_kwargs, warmup_iters=1
    )
    benchmark(knn_cpu_with_init, "KNN_CPU_CPP", knn_kwargs, warmup_iters=1)
    benchmark(nn_cpu_with_init, "NN_CPU_CPP", nn_kwargs, warmup_iters=1)


def knn_cuda_with_init(N, D, P, K, v=-1):
    """
    Prepare random inputs on "cuda:0" and return a callable that runs
    `_C.knn_points_idx` on them.

    Args:
        N, P, D: sizes of the two random input tensors, each created
            with shape (N, P, D).
        K: forwarded to `_C.knn_points_idx` as its third argument.
        v: forwarded to `_C.knn_points_idx` as its fourth argument
            (presumably the implementation version -- confirm against
            the op's definition).

    Returns:
        A zero-argument function which calls the op and then
        synchronizes CUDA.
    """
    cuda0 = torch.device("cuda:0")
    size = (N, P, D)
    pts_a = torch.randn(size, device=cuda0)
    pts_b = torch.randn(size, device=cuda0)
    # Synchronize once here, after the inputs have been created and
    # before the callable is handed back.
    torch.cuda.synchronize()

    def run():
        _C.knn_points_idx(pts_a, pts_b, K, v)
        torch.cuda.synchronize()

    return run


def knn_cpu_with_init(N, D, P, K):
    """
    Prepare random CPU inputs and return a callable that runs
    `_C.knn_points_idx` on them.

    Args:
        N, P, D: sizes of the two random input tensors, each created
            with shape (N, P, D).
        K: forwarded to `_C.knn_points_idx` as its third argument.

    Returns:
        A zero-argument function which calls the op with 0 as its
        fourth argument.
    """
    cpu = torch.device("cpu")
    size = (N, P, D)
    pts_a = torch.randn(size, device=cpu)
    pts_b = torch.randn(size, device=cpu)

    def run():
        _C.knn_points_idx(pts_a, pts_b, K, 0)

    return run


def knn_python_cuda_with_init(N, D, P, K):
    """
    Prepare random inputs on the "cuda" device and return a callable
    that runs `_knn_points_idx_naive` on them.

    Args:
        N, P, D: sizes of the two random input tensors, each created
            with shape (N, P, D).
        K: forwarded to `_knn_points_idx_naive` as its third argument.

    Returns:
        A zero-argument function which calls the naive implementation
        and then synchronizes CUDA.
    """
    cuda = torch.device("cuda")
    size = (N, P, D)
    pts_a = torch.randn(size, device=cuda)
    pts_b = torch.randn(size, device=cuda)
    # Synchronize once here, after the inputs have been created and
    # before the callable is handed back.
    torch.cuda.synchronize()

    def run():
        _knn_points_idx_naive(pts_a, pts_b, K)
        torch.cuda.synchronize()

    return run


def knn_python_cpu_with_init(N, D, P, K):
    """
    Prepare random CPU inputs and return a callable that runs
    `_knn_points_idx_naive` on them.

    Args:
        N, P, D: sizes of the two random input tensors, each created
            with shape (N, P, D).
        K: forwarded to `_knn_points_idx_naive` as its third argument.

    Returns:
        A zero-argument function which calls the naive implementation.
    """
    cpu = torch.device("cpu")
    size = (N, P, D)
    pts_a = torch.randn(size, device=cpu)
    pts_b = torch.randn(size, device=cpu)

    def run():
        _knn_points_idx_naive(pts_a, pts_b, K)

    return run


def nn_cuda_with_init(N, D, P):
    """
    Prepare random inputs on the "cuda" device and return a callable
    that runs `_C.nn_points_idx` on them.

    Args:
        N, P, D: sizes of the two random input tensors, each created
            with shape (N, P, D).

    Returns:
        A zero-argument function which calls the op and then
        synchronizes CUDA.
    """
    cuda = torch.device("cuda")
    size = (N, P, D)
    pts_a = torch.randn(size, device=cuda)
    pts_b = torch.randn(size, device=cuda)
    # Synchronize once here, after the inputs have been created and
    # before the callable is handed back.
    torch.cuda.synchronize()

    def run():
        _C.nn_points_idx(pts_a, pts_b)
        torch.cuda.synchronize()

    return run


def nn_cpu_with_init(N, D, P):
    """
    Prepare random CPU inputs and return a callable that runs
    `_C.nn_points_idx` on them.

    Args:
        N, P, D: sizes of the two random input tensors, each created
            with shape (N, P, D).

    Returns:
        A zero-argument function which calls the op.
    """
    cpu = torch.device("cpu")
    size = (N, P, D)
    pts_a = torch.randn(size, device=cpu)
    pts_b = torch.randn(size, device=cpu)

    def run():
        _C.nn_points_idx(pts_a, pts_b)

    return run