# /// script
# requires-python = ">=3.13"
# dependencies = [
#     "kernels",
#     "numpy",
#     "torch",
# ]
# ///

import platform
from pathlib import Path

import kernels
import torch

# Load the locally built kernel from the ./build directory.
kernel = kernels.get_local_kernel(Path("build"), "__KERNEL_NAME_NORMALIZED__")

# Select device. Preference order: MPS on macOS, then XPU, then CUDA, then CPU.
# MPS is only chosen when the backend is actually usable; otherwise a Mac
# without MPS support would fail later, at tensor creation, with an opaque
# backend error instead of falling back.
if platform.system() == "Darwin" and torch.backends.mps.is_available():
    device = torch.device("mps")
elif hasattr(torch, "xpu") and torch.xpu.is_available():
    device = torch.device("xpu")
elif torch.version.cuda is not None and torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")

# Create input tensor
x = torch.tensor([1.0, 2.0, 3.0], device=device)
print(f"Input:  {x}")

# Run kernel (adds 1 to each element)
result = kernel.__KERNEL_NAME_NORMALIZED__(x)
print(f"Output: {result}")

# Verify result against a reference computed with plain PyTorch.
# An explicit raise is used instead of `assert` so the check still runs when
# Python is started with -O (which strips assert statements); the exception
# type and message are unchanged.
expected = x + 1.0
if not torch.allclose(result, expected):
    raise AssertionError("Kernel output doesn't match expected!")
print("Success!")