Dheemanth aeab82ff30
CUDA 13.2 samples update (#432)
- Added Python samples for CUDA Python 1.0 release
- Renamed top-level `Samples` directory to `cpp` to accommodate Python samples.
2026-05-13 17:13:18 -05:00

216 lines
7.2 KiB
Python

# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
System Information via cuda.core.system (NVML)
Demonstrates the ``cuda.core.system`` module, which wraps NVIDIA Management
Library (NVML) functionality.
This sample prints:
* Driver and NVML versions
* Current process name
* Per-device: name, UUID, compute capability / arch, PCI info, memory usage,
temperature, performance state
* GPU-to-GPU topology and peer-to-peer status (when more than one GPU)
"""
import os
import sys
try:
from cuda.core import system
from cuda.core.system import (
CUDA_BINDINGS_NVML_IS_COMPATIBLE,
GpuP2PCapsIndex,
TemperatureSensors,
)
except ImportError as e:
print(f"Error: Required package not found: {e}")
print("Please install from requirements.txt:")
print(" pip install -r requirements.txt")
sys.exit(1)
def print_header(title: str) -> None:
    """Print *title* framed above and below by a 70-character rule."""
    rule = "=" * 70
    # Single write keeps the leading blank line and both rules together.
    print(f"\n{rule}\n{title}\n{rule}")
def format_bytes(nbytes: int) -> str:
    """Format a byte count as a human-readable string (two decimal places)."""
    suffixes = ("B", "KiB", "MiB", "GiB", "TiB", "PiB")
    value = float(nbytes)
    idx = 0
    # Scale down by 1024 until the value fits, capping at the last suffix.
    while value >= 1024.0 and idx < len(suffixes) - 1:
        value /= 1024.0
        idx += 1
    return f"{value:.2f} {suffixes[idx]}"
def print_driver_info() -> None:
    """Report CUDA driver version, NVML details, and the current process name."""
    print_header("Driver / NVML")
    driver_major, driver_minor = system.get_driver_version()
    print(f"CUDA driver version: {driver_major}.{driver_minor}")
    print(f"CUDA driver version (full): {system.get_driver_version_full()}")
    if not CUDA_BINDINGS_NVML_IS_COMPATIBLE:
        print(
            "NVML bindings are not compatible with this driver; "
            "device info will be limited."
        )
    else:
        print(f"NVML version: {system.get_nvml_version()}")
        try:
            branch = system.get_driver_branch()
        except Exception as e:  # noqa: BLE001 - driver branch is informational
            print(f"Driver branch: unavailable ({e})")
        else:
            print(f"Driver branch: {branch}")
    print(f"Current process: {system.get_process_name(os.getpid())}")
def print_device_info(device: "system.Device") -> None:
    """Print identity, memory, PCI, thermal, and p-state details for *device*.

    Each query is attempted independently so a single failing attribute
    degrades to an "unavailable" line instead of aborting the whole report.
    """
    print(f"\n-- Device {device.index} --")
    print(f"Name: {device.name}")
    print(f"UUID: {device.uuid}")

    def _report(label, supplier, err_label=None):
        # Evaluate the query lazily inside the try so any failure is caught.
        try:
            print(f"{label}: {supplier()}")
        except Exception as e:  # noqa: BLE001
            print(f"{err_label or label}: unavailable ({e})")

    def _cc_summary():
        cc_major, cc_minor = device.cuda_compute_capability
        return f"{cc_major}.{cc_minor}"

    def _memory_summary():
        mem = device.memory_info
        return (
            f"total={format_bytes(mem.total)}, "
            f"used={format_bytes(mem.used)}, "
            f"free={format_bytes(mem.free)}"
        )

    def _pci_summary():
        pci = device.pci_info
        return (
            f"domain={pci.domain:04x} bus={pci.bus:02x} "
            f"device={pci.device:02x} id={pci.bus_id}"
        )

    _report("Compute capability", _cc_summary)
    _report("Architecture", lambda: device.arch.name)
    _report("Brand", lambda: device.brand.name)
    _report("Memory", _memory_summary)
    _report("PCI", _pci_summary)
    # The success label mentions the sensor; the failure label does not.
    _report(
        "Temperature (GPU sensor)",
        lambda: f"{device.temperature.sensor(TemperatureSensors.TEMPERATURE_GPU)} C",
        err_label="Temperature",
    )
    _report("Performance state", lambda: device.performance_state)
def print_topology(devices: list) -> None:
    """Print pairwise topology level and P2P read/write status for all GPU pairs.

    Does nothing when fewer than two devices are present.
    """
    if len(devices) < 2:
        return
    print_header("GPU topology and peer-to-peer")
    count = len(devices)
    for i in range(count):
        for j in range(i + 1, count):
            dev_a, dev_b = devices[i], devices[j]
            try:
                topo = system.get_topology_common_ancestor(dev_a, dev_b).name
            except Exception as e:  # noqa: BLE001
                topo = f"unavailable ({e})"
            try:
                p2p_read = system.get_p2p_status(
                    dev_a, dev_b, GpuP2PCapsIndex.P2P_CAPS_INDEX_READ
                ).name
                p2p_write = system.get_p2p_status(
                    dev_a, dev_b, GpuP2PCapsIndex.P2P_CAPS_INDEX_WRITE
                ).name
            except Exception as e:  # noqa: BLE001
                # A failure on either direction reports both as unavailable.
                p2p_read = p2p_write = f"unavailable ({e})"
            print(
                f"Device {dev_a.index} <-> Device {dev_b.index}: "
                f"topology={topo}, p2p_read={p2p_read}, p2p_write={p2p_write}"
            )
def main() -> int:
    """Print driver, per-device, and topology information.

    Returns a process exit code (0 on success, including the no-device and
    NVML-incompatible cases, which are reported but not treated as errors).
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Print CUDA system / NVML information via cuda.core.system"
    )
    parser.add_argument(
        "--no-topology",
        action="store_true",
        help="Skip cross-device topology/P2P queries",
    )
    opts = parser.parse_args()

    print_driver_info()

    device_count = system.get_num_devices()
    print_header(f"Devices detected: {device_count}")
    if device_count == 0:
        print("No CUDA-capable devices found.")
        return 0
    if not CUDA_BINDINGS_NVML_IS_COMPATIBLE:
        # Without compatible NVML bindings, per-device queries would fail.
        print(
            "NVML is not compatible with the installed driver; skipping device detail."
        )
        return 0

    all_devices = [system.Device(index=i) for i in range(device_count)]
    for dev in all_devices:
        print_device_info(dev)
    if not opts.no_topology:
        print_topology(all_devices)
    print("\nDone")
    return 0
if __name__ == "__main__":
sys.exit(main())