Dheemanth aeab82ff30
CUDA 13.2 samples update (#432)
- Added Python samples for CUDA Python 1.0 release
- Renamed top-level `Samples` directory to `cpp` to accommodate Python samples.
2026-05-13 17:13:18 -05:00

390 lines
12 KiB
Python
Executable File

# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Device Query using CUDA Core API
This sample enumerates the properties of the CUDA devices present in the system.
"""
import platform
import sys
# cuda.bindings used for properties not yet exposed in cuda.core (see comments below)
try:
from cuda.bindings import driver as cuda, runtime as cudart
from cuda.core import Device, system
except ImportError as e:
print(f"Error: Required package not found: {e}")
print("Please install from requirements.txt:")
print(" pip install -r requirements.txt")
sys.exit(1)
def print_property(label, value, indent=2):
    """Print one "label / value" row with the value aligned at a fixed column.

    Parameters
    ----------
    label : str
        Property label.
    value : any
        Property value (rendered via str-style formatting).
    indent : int
        Number of leading spaces (default: 2).
    """
    # A 47-character label field keeps every value vertically aligned.
    print(f"{' ' * indent}{label:<47}{value}")
def fmt_bytes(size_in_bytes):
    """Render a byte count as "<N> MBytes (<raw> bytes)"."""
    mbytes = size_in_bytes / (1 << 20)  # 1 << 20 == 1024 * 1024
    return f"{mbytes:.0f} MBytes ({size_in_bytes} bytes)"
def fmt_hz(rate_in_khz):
    """Render a kHz rate as "<MHz> MHz (<GHz> GHz)"."""
    # Multiply (not divide) so float results match the original formatting exactly.
    mhz = rate_in_khz * 1e-3
    ghz = rate_in_khz * 1e-6
    return f"{mhz:.0f} MHz ({ghz:.2f} GHz)"
def fmt_yes_no(val):
    """Return "Yes" for truthy values, "No" otherwise."""
    if val:
        return "Yes"
    return "No"
def convert_sm_ver_to_cores(major, minor):
    """
    Look up the number of CUDA cores per SM for a compute capability.

    Information taken from:
    https://github.com/NVIDIA/cuda-samples/blob/master/Common/helper_cuda.h

    Parameters
    ----------
    major : int
        Major compute capability version
    minor : int
        Minor compute capability version

    Returns
    -------
    int
        Number of CUDA cores per SM, or 0 if unknown
    """
    # Table keyed by major version, then minor version.
    cores_by_major = {
        3: {0: 192, 2: 192, 5: 192, 7: 192},
        5: {0: 128, 2: 128, 3: 128},
        6: {0: 64, 1: 128, 2: 128},
        7: {0: 64, 2: 64, 5: 64},
        8: {0: 64, 6: 128, 7: 128, 9: 128},
        9: {0: 128},
        10: {0: 128, 1: 128, 3: 128},
        11: {0: 128},
        12: {0: 128, 1: 128},
    }
    return cores_by_major.get(major, {}).get(minor, 0)
def print_device_info(dev_id: int, device) -> None:
    """
    Print detailed information for a single CUDA device.
    Uses device.properties (cuda.core) for most fields; cuda.bindings for
    runtime version and global memory (not yet in high-level API).

    Parameters
    ----------
    dev_id : int
        Device ordinal, used only for display.
    device : cuda.core.Device
        Device whose properties are queried and printed.

    Raises
    ------
    RuntimeError
        If the runtime-version or memory-info query fails.
    """
    # Make this device current so the cuda.bindings calls below target it.
    device.set_current()
    props = device.properties
    print()
    print(f"Device {dev_id}: {device.name}")
    # cuda.bindings workaround: runtime version not in cuda.core
    driver_major, driver_minor = system.get_driver_version()
    err, runtime_version = cudart.cudaRuntimeGetVersion()
    if err != cudart.cudaError_t.cudaSuccess:
        raise RuntimeError(f"Failed to get CUDA runtime version: {err}")
    # Runtime version is packed as 1000*major + 10*minor (e.g. 12040 -> 12.4).
    runtime_major = runtime_version // 1000
    runtime_minor = (runtime_version % 1000) // 10
    print_property(
        "CUDA Driver Version / Runtime Version",
        f"{driver_major}.{driver_minor} / {runtime_major}.{runtime_minor}",
    )
    print_property(
        "CUDA Capability Major/Minor version number:",
        f"{props.compute_capability_major}.{props.compute_capability_minor}",
    )
    # cuda.bindings workaround: global memory (free/total) not in device.properties
    err, free_mem, total_mem_bytes = cuda.cuMemGetInfo()
    if err != cuda.CUresult.CUDA_SUCCESS:
        raise RuntimeError(f"Failed to get memory info: {err}")
    print_property("Total amount of global memory:", fmt_bytes(total_mem_bytes))
    # Cores/SM comes from the static lookup table; 0 means unknown SM version.
    sm_cores = convert_sm_ver_to_cores(
        props.compute_capability_major, props.compute_capability_minor
    )
    total_cores = sm_cores * props.multiprocessor_count
    print_property(
        f"({props.multiprocessor_count:3d}) Multiprocessors, "
        f"({sm_cores:3d}) CUDA Cores/MP:",
        f"{total_cores} CUDA Cores",
    )
    print_property("GPU Max Clock rate:", fmt_hz(props.clock_rate))
    # NOTE(review): "Mhz" casing kept as-is — presumably mirrors the classic
    # C deviceQuery sample's output; confirm before "fixing" to "MHz".
    print_property("Memory Clock rate:", f"{props.memory_clock_rate * 1e-3:.0f} Mhz")
    print_property("Memory Bus Width:", f"{props.global_memory_bus_width}-bit")
    # L2 line is skipped when the property reports a non-positive size.
    if props.l2_cache_size > 0:
        print_property("L2 Cache Size:", f"{props.l2_cache_size} bytes")
    print_property(
        "Maximum Texture Dimension Size (x,y,z)",
        f"1D=({props.maximum_texture1d_width}), "
        f"2D=({props.maximum_texture2d_width}, {props.maximum_texture2d_height}), "
        f"3D=({props.maximum_texture3d_width}, {props.maximum_texture3d_height}, "
        f"{props.maximum_texture3d_depth})",
    )
    print_property(
        "Maximum Layered 1D Texture Size, (num) layers",
        f"1D=({props.maximum_texture1d_layered_width}), "
        f"{props.maximum_texture1d_layered_layers} layers",
    )
    print_property(
        "Maximum Layered 2D Texture Size, (num) layers",
        f"2D=({props.maximum_texture2d_layered_width}, "
        f"{props.maximum_texture2d_layered_height}), "
        f"{props.maximum_texture2d_layered_layers} layers",
    )
    print_property(
        "Total amount of constant memory:", f"{props.total_constant_memory} bytes"
    )
    print_property(
        "Total amount of shared memory per block:",
        f"{props.max_shared_memory_per_block} bytes",
    )
    print_property(
        "Total shared memory per multiprocessor:",
        f"{props.max_shared_memory_per_multiprocessor} bytes",
    )
    print_property(
        "Total number of registers available per block:", props.max_registers_per_block
    )
    print_property("Warp size:", props.warp_size)
    print_property(
        "Maximum number of threads per multiprocessor:",
        props.max_threads_per_multiprocessor,
    )
    print_property("Maximum number of threads per block:", props.max_threads_per_block)
    print_property(
        "Max dimension size of a thread block (x,y,z):",
        f"({props.max_block_dim_x}, {props.max_block_dim_y}, {props.max_block_dim_z})",
    )
    print_property(
        "Max dimension size of a grid size (x,y,z):",
        f"({props.max_grid_dim_x}, {props.max_grid_dim_y}, {props.max_grid_dim_z})",
    )
    print_property("Maximum memory pitch:", f"{props.max_pitch} bytes")
    print_property("Texture alignment:", f"{props.texture_alignment} bytes")
    print_property(
        "Concurrent copy and kernel execution:",
        f"{fmt_yes_no(props.gpu_overlap)} with "
        f"{props.async_engine_count} copy engine(s)",
    )
    print_property("Run time limit on kernels:", fmt_yes_no(props.kernel_exec_timeout))
    print_property("Integrated GPU sharing Host Memory:", fmt_yes_no(props.integrated))
    print_property(
        "Support host page-locked memory mapping:",
        fmt_yes_no(props.can_map_host_memory),
    )
    print_property(
        "Device has ECC support:", "Enabled" if props.ecc_enabled else "Disabled"
    )
    # Driver model (TCC vs WDDM) is a Windows-only concept.
    if platform.system() == "Windows":
        mode = (
            "TCC (Tesla Compute Cluster Driver)"
            if props.tcc_driver
            else "WDDM (Windows Display Driver Model)"
        )
        print_property("CUDA Device Driver Mode (TCC or WDDM):", mode)
    print_property(
        "Device supports Unified Addressing (UVA):",
        fmt_yes_no(props.unified_addressing),
    )
    print_property("Device supports Managed Memory:", fmt_yes_no(props.managed_memory))
    print_property(
        "Device supports Compute Preemption:",
        fmt_yes_no(props.compute_preemption_supported),
    )
    print_property(
        "Supports Cooperative Kernel Launch:", fmt_yes_no(props.cooperative_launch)
    )
    print_property(
        "Device PCI Domain ID / Bus ID / location ID:",
        f"{props.pci_domain_id} / {props.pci_bus_id} / {props.pci_device_id}",
    )
    # Human-readable descriptions for the numeric compute-mode values.
    compute_modes = {
        0: (
            "Default (multiple host threads can use cudaSetDevice() "
            "with device simultaneously)"
        ),
        1: (
            "Exclusive (only one host thread in one process is able to "
            "use cudaSetDevice() with this device)"
        ),
        2: "Prohibited (no host thread can use cudaSetDevice() with this device)",
        3: (
            "Exclusive Process (many threads in one process is able to "
            "use cudaSetDevice() with this device)"
        ),
    }
    # Mode description goes on its own line; unknown values fall back to 'Unknown'.
    print_property("Compute Mode:", "")
    print(f" < {compute_modes.get(props.compute_mode, 'Unknown')} >")
def print_p2p_access_info(devices):
    """
    Report pairwise peer-to-peer (P2P) access capability between GPUs.

    Parameters
    ----------
    devices : tuple of Device
        Tuple of CUDA device objects
    """
    print()
    print("Peer-to-Peer (P2P) access support:")
    count = len(devices)
    # Every ordered pair of distinct devices (access can be asymmetric).
    for i in range(count):
        for j in range(count):
            if i == j:
                continue
            src, dst = devices[i], devices[j]
            try:
                supported = src.can_access_peer(dst)
                print(
                    f"> Peer access from {src.name} (GPU{i}) -> "
                    f"{dst.name} (GPU{j}) : {fmt_yes_no(supported)}"
                )
            except Exception as exc:
                print(
                    "Warning: Could not check peer access between "
                    f"device {i} and {j}: {exc}"
                )
def query_devices(show_p2p=True):
    """
    Query and display information about all CUDA devices.

    Parameters
    ----------
    show_p2p : bool
        Whether to show peer-to-peer access information (default: True)

    Returns
    -------
    bool
        True if successful, False otherwise
    """
    import traceback

    try:
        print("[CUDA Device Query using CUDA Core API]")
        devices = Device.get_all_devices()
    except Exception as exc:
        print(f"Error: Failed to get devices: {exc}")
        traceback.print_exc()
        return False
    # No devices is not an error: report and succeed.
    if not devices:
        print("There are no available device(s) that support CUDA")
        return True
    print(f"Detected {len(devices)} CUDA Capable device(s)")
    for ordinal, dev in enumerate(devices):
        try:
            print_device_info(ordinal, dev)
        except Exception as exc:
            print(f"Error: Failed to get information for device {ordinal}: {exc}")
            traceback.print_exc()
            return False
    # P2P info only makes sense with at least two devices.
    if show_p2p and len(devices) >= 2:
        print_p2p_access_info(devices)
    return True
def main():
    """
    Main entry point for the device query sample.

    Returns
    -------
    int
        Process exit status: 0 on success, 1 on failure.
    """
    import argparse

    cli = argparse.ArgumentParser(
        description="Query CUDA Device Properties using CUDA Core API",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    cli.add_argument(
        "--no-p2p", action="store_true", help="Skip peer-to-peer access information"
    )
    options = cli.parse_args()
    # Guard-clause form: bail out with a failure status first.
    if not query_devices(show_p2p=not options.no_p2p):
        return 1
    print("\nDone")
    return 0


if __name__ == "__main__":
    sys.exit(main())