# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
|
|
JIT Compilation and Link-Time Optimization with cuda.core
|
|
|
|
Real-world GPU code is rarely a single source string. Libraries ship a
|
|
"main" kernel that is compiled once, then link in user-supplied device
|
|
functions at runtime to customize behavior without recompiling the whole
|
|
program.
|
|
|
|
cuda.core exposes this pattern through ``Program`` (NVRTC compilation)
|
|
and ``Linker`` (JIT linking of multiple object codes). Two modes are
|
|
shown here:
|
|
|
|
* **PTX linking**: compile each translation unit with
|
|
``relocatable_device_code=True`` to PTX and link to a CUBIN.
|
|
The two modules remain independently compiled: no cross-module
|
|
inlining.
|
|
|
|
* **LTO (Link-Time Optimization)**: compile each translation unit
|
|
with ``link_time_optimization=True`` to LTO IR, then link with
|
|
``LinkerOptions(link_time_optimization=True)``. The linker reruns
|
|
the optimizer across both modules and can inline the device function
|
|
into the main kernel, typically matching a single-source build.
|
|
|
|
The same kernel math runs in both modes and is verified against a
|
|
NumPy reference.
|
|
"""

import sys
from pathlib import Path

# Make the shared sample helpers (cuda_samples_utils) importable no matter
# which directory the sample is launched from.
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Utilities"))

try:
    import cupy as cp
    import numpy as np
    from cuda.core import (
        Device,
        LaunchConfig,
        Linker,
        LinkerOptions,
        Program,
        ProgramOptions,
        launch,
    )

    from cuda_samples_utils import print_gpu_info  # noqa: E402
except ImportError as e:
    print(f"Error: Required package not found: {e}")
    print("Please install from requirements.txt:")
    print(" pip install -r requirements.txt")
    sys.exit(1)


# --------------------------------------------------------------------------
# Module A: the "library" main kernel. It forwards each element through a
# user-supplied device function (resolved at link time) and writes the result.
# --------------------------------------------------------------------------
MAIN_SRC = r"""
// Forward declare the user-supplied hook. Its definition lives in a separate
// translation unit and is resolved by the Linker at runtime.
extern "C" __device__ float user_transform(float x);

extern "C" __global__
void apply_transform(const float* __restrict__ in,
                     float* __restrict__ out,
                     size_t N)
{
    // Grid-stride loop. Cast to size_t before multiplying so the index
    // arithmetic is done in 64 bits and cannot overflow for large N.
    size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t i = tid; i < N; i += stride) {
        out[i] = user_transform(in[i]);
    }
}
"""

# --------------------------------------------------------------------------
# Module B: the user-supplied "plug-in" device function. A different
# implementation of ``user_transform`` here produces different results without
# rebuilding MAIN_SRC.
# --------------------------------------------------------------------------
USER_SRC = r"""
extern "C" __device__
float user_transform(float x)
{
    // A deliberately non-trivial expression so LTO has something to inline /
    // optimize across the module boundary.
    float y = x * x + 3.0f * x - 1.0f;
    return y > 0.0f ? y : 0.0f;
}
"""
def host_reference(x: np.ndarray) -> np.ndarray:
    y = x * x + 3.0 * x - 1.0
    return np.where(y > 0.0, y, 0.0).astype(np.float32)


def link_ptx(device):
    """Compile both modules to PTX and link them into a cubin (no LTO)."""
    prog_opts = ProgramOptions(
        std="c++17", arch=f"sm_{device.arch}", relocatable_device_code=True
    )
    main_obj = Program(MAIN_SRC, "c++", options=prog_opts).compile("ptx")
    user_obj = Program(USER_SRC, "c++", options=prog_opts).compile("ptx")
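
    # Both inputs are PTX, so the device linker only resolves the
    # user_transform symbol; no cross-module optimization happens here.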
    linker = Linker(main_obj, user_obj, options=LinkerOptions(arch=f"sm_{device.arch}"))
    return linker.link("cubin")


def link_lto(device):
    """Compile both modules to LTO IR and link with LTO enabled."""
    prog_opts = ProgramOptions(
        std="c++17", arch=f"sm_{device.arch}", link_time_optimization=True
    )
    main_obj = Program(MAIN_SRC, "c++", options=prog_opts).compile("ltoir")
    user_obj = Program(USER_SRC, "c++", options=prog_opts).compile("ltoir")
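
    # The inputs are LTO IR, so the link step must also run with LTO enabled;
    # this is where the optimizer can inline user_transform into the kernel.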
    linker_opts = LinkerOptions(
        arch=f"sm_{device.arch}", link_time_optimization=True
    )
    linker = Linker(main_obj, user_obj, options=linker_opts)
    return linker.link("cubin")


def run_one_mode(mode, module, stream, d_in, d_out, size, expected):
    kernel = module.get_kernel("apply_transform")
    config = LaunchConfig(grid=(size + 255) // 256, block=256)
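    # Kernel arguments: CuPy device pointers are passed as raw integers and
    # the size_t parameter as np.uint64, so the ABI matches the C signature.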
    launch(
        stream,
        config,
        kernel,
        d_in.data.ptr,
        d_out.data.ptr,
        np.uint64(size),
    )
    stream.sync()
    actual = cp.asnumpy(d_out)
    if not np.allclose(actual, expected, rtol=1e-5, atol=1e-5):
        max_err = np.max(np.abs(actual - expected))
        print(f" [{mode}] verification FAILED (max_err={max_err})")
        return False
    print(f" [{mode}] result verified against NumPy reference")
    return True


def main() -> int:
    import argparse

    parser = argparse.ArgumentParser(
        description="JIT + LTO linking of two device modules with cuda.core"
    )
    parser.add_argument(
        "--elements", type=int, default=1 << 16,
        help="Number of float32 elements (default: 65536)",
    )
    parser.add_argument("--device", type=int, default=0, help="CUDA device id")
    args = parser.parse_args()

    device = Device(args.device)
    device.set_current()
    print_gpu_info(device)

    stream = device.create_stream()
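    # Route CuPy's own work (asarray, fill) onto the same cuda.core stream the
    # kernels launch on, so ordering between the two libraries is implicit.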
    cp.cuda.ExternalStream(int(stream.handle)).use()

    try:
        N = args.elements
        rng = np.random.default_rng(seed=0)
        host_in = rng.standard_normal(N).astype(np.float32)
        expected = host_reference(host_in)

        d_in = cp.asarray(host_in)
        d_out = cp.empty(N, dtype=cp.float32)
        device.sync()

        print("\n[1] PTX linking (no LTO)")
        ptx_module = link_ptx(device)
        ok_ptx = run_one_mode("ptx", ptx_module, stream, d_in, d_out, N, expected)
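
        # Clear the output buffer so the LTO run cannot pass by accident on
        # results left over from the PTX run.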
        d_out.fill(0)
        device.sync()

        print("\n[2] LTO linking (link-time optimization)")
        lto_module = link_lto(device)
        ok_lto = run_one_mode("lto", lto_module, stream, d_in, d_out, N, expected)

        print()
        if ok_ptx and ok_lto:
            print("Both PTX- and LTO-linked kernels produced matching results. Done.")
            return 0
        return 1
    finally:
        stream.close()
        cp.cuda.Stream.null.use()


if __name__ == "__main__":
    sys.exit(main())