mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2026-01-09 11:22:51 +08:00
Merge 13.0 changes into 13.1 dev branch
This commit is contained in:
commit
775d1b4dd3
52
README.md
52
README.md
@ -133,6 +133,58 @@ $ cd build
|
|||||||
$ cmake -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DCMAKE_LIBRARY_PATH=/usr/local/cuda/orin/lib64/ -DCMAKE_INCLUDE_PATH=/usr/local/cuda/orin/include -DBUILD_TEGRA=True ..
|
$ cmake -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DCMAKE_LIBRARY_PATH=/usr/local/cuda/orin/lib64/ -DCMAKE_INCLUDE_PATH=/usr/local/cuda/orin/include -DBUILD_TEGRA=True ..
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Cross Building for Automotive Linux Platforms from the DriveOS Docker containers
|
||||||
|
|
||||||
|
To build CUDA samples to the target platform from the DriveOS Docker containers, use the following instructions.
|
||||||
|
|
||||||
|
Mount the target Root Filesystem (RFS) in the container so that the CUDA cmake process has the correct paths to CUDA and other system libraries required to build the samples.
|
||||||
|
|
||||||
|
Create a temporary directory, `<temp>` is any temporary directory of your choosing, for example, you can use `/drive/temp`:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ mkdir /drive/<temp>
|
||||||
|
```
|
||||||
|
|
||||||
|
Mount the filesystem by running the following command:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ mount /drive/drive-linux/filesystem/targetfs-images/dev_nsr_desktop_ubuntu-24.04_thor_rfs.img /drive/temp
|
||||||
|
```
|
||||||
|
|
||||||
|
Configure the project by running the following cmake command:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ mkdir build && cd build
|
||||||
|
$ cmake .. -DBUILD_TEGRA=True \
|
||||||
|
-DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
|
||||||
|
-DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/toolchain-aarch64-linux.cmake \
|
||||||
|
-DTARGET_FS=/drive/temp \
|
||||||
|
-DCMAKE_LIBRARY_PATH=/drive/temp/usr/local/cuda-13.0/thor/lib64/ \
|
||||||
|
-DCMAKE_INCLUDE_PATH=/drive/temp/usr/local/cuda-13.0/thor/include/
|
||||||
|
```
|
||||||
|
|
||||||
|
Please note that the following libraries are not pre-installed in the DriveOS dev-nsr target filesystem:
|
||||||
|
* libdrm-dev
|
||||||
|
* Vulkan
|
||||||
|
|
||||||
|
This causes the cmake command to throw errors related to the missing files, and as a result, the related samples will not build in later steps. This issue will be addressed in a future DriveOS release.
|
||||||
|
|
||||||
|
To build the samples while ignoring the errors mentioned above, you can use `--ignore-errors`/`--keep-going`, or comment out the corresponding `add_subdirectory` command in the CMakeLists.txt in the parent folder for the samples requiring Vulkan and libdrm-dev:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ make -j$(nproc) --ignore-errors # or --keep-going
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
# In Samples/5_Domain_Specific/CMakeLists.txt
|
||||||
|
# add_subdirectory(simpleGL)
|
||||||
|
# add_subdirectory(simpleVulkan)
|
||||||
|
# add_subdirectory(simpleVulkanMMAP)
|
||||||
|
|
||||||
|
# In Samples/8_Platform_Specific/Tegra/CMakeLists.txt
|
||||||
|
# add_subdirectory(simpleGLES_EGLOutput)
|
||||||
|
```
|
||||||
|
|
||||||
### QNX
|
### QNX
|
||||||
|
|
||||||
Cross-compilation for QNX with CMake is supported in the CUDA 13.0 samples release and newer. An example build for
|
Cross-compilation for QNX with CMake is supported in the CUDA 13.0 samples release and newer. An example build for
|
||||||
|
|||||||
@ -224,7 +224,7 @@ __global__ void compute_gemm(const half *A, const half *B, const float *C, float
|
|||||||
// there's no such tile, all warps in this CTA exit.
|
// there's no such tile, all warps in this CTA exit.
|
||||||
for (unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) {
|
for (unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) {
|
||||||
const unsigned int block_tile_i = ((block_pos * BLOCK_ROW_TILES) / N_TILES) * (BLOCK_COL_TILES);
|
const unsigned int block_tile_i = ((block_pos * BLOCK_ROW_TILES) / N_TILES) * (BLOCK_COL_TILES);
|
||||||
const unsigned int block_tile_j = (block_pos * BLOCK_COL_TILES) % N_TILES;
|
const unsigned int block_tile_j = (block_pos * BLOCK_ROW_TILES) % N_TILES;
|
||||||
|
|
||||||
// Stop when there are no more D matrix tiles to compute in this CTA.
|
// Stop when there are no more D matrix tiles to compute in this CTA.
|
||||||
if (block_tile_i >= M_TILES) {
|
if (block_tile_i >= M_TILES) {
|
||||||
|
|||||||
@ -4,7 +4,9 @@ libNVVM and NVVM IR Samples
|
|||||||
Introduction
|
Introduction
|
||||||
------------
|
------------
|
||||||
|
|
||||||
The following samples illustrate the use of libNVVM and NVVM IR.
|
The following samples illustrate the use of libNVVM and NVVM IR. Running and
|
||||||
|
testing these samples requires an NVIDIA driver compatible with the CUDA
|
||||||
|
Toolkit being used for compilation.
|
||||||
|
|
||||||
- cuda-shared-memory - A directory containing NVVM IR programs that demonstrate
|
- cuda-shared-memory - A directory containing NVVM IR programs that demonstrate
|
||||||
CUDA 'shared' memory usage.
|
CUDA 'shared' memory usage.
|
||||||
@ -71,7 +73,7 @@ A Note About the cuda-c-linking Sample
|
|||||||
|
|
||||||
This sample requires a development package (or locally-built) LLVM library
|
This sample requires a development package (or locally-built) LLVM library
|
||||||
between versions 7 to 14 inclusive. LLVM 15 defaults to using opaque pointers,
|
between versions 7 to 14 inclusive. LLVM 15 defaults to using opaque pointers,
|
||||||
which are currently not supported in libNVVM.
|
which are not supported in libNVVM for pre-Blackwell architectures.
|
||||||
|
|
||||||
The LLVM_HOME environment variable is required for users who wish to build the
|
The LLVM_HOME environment variable is required for users who wish to build the
|
||||||
cuda-c-linking sample and have a locally built copy of LLVM that they wish to
|
cuda-c-linking sample and have a locally built copy of LLVM that they wish to
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
# Copyright (c) 1993-2023, NVIDIA CORPORATION. All rights reserved.
|
# Copyright (c) 1993-2025, NVIDIA CORPORATION. All rights reserved.
|
||||||
#
|
#
|
||||||
# Redistribution and use in source and binary forms, with or without
|
# Redistribution and use in source and binary forms, with or without
|
||||||
# modification, are permitted provided that the following conditions
|
# modification, are permitted provided that the following conditions
|
||||||
@ -29,7 +29,7 @@ if (LLVM_PACKAGE_VERSION VERSION_GREATER_EQUAL "15" OR
|
|||||||
LLVM_PACKAGE_VERSION VERSION_LESS "7")
|
LLVM_PACKAGE_VERSION VERSION_LESS "7")
|
||||||
message(STATUS "The cuda-c-linking sample is expected to build with "
|
message(STATUS "The cuda-c-linking sample is expected to build with "
|
||||||
"LLVM development libraries v7 to v14, opaque pointers are "
|
"LLVM development libraries v7 to v14, opaque pointers are "
|
||||||
"not yet supported in libNVVM.")
|
"not supported in libNVVM for pre-Blackwell architectures.")
|
||||||
return()
|
return()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
// Copyright (c) 1993-2023, NVIDIA CORPORATION. All rights reserved.
|
// Copyright (c) 1993-2025, NVIDIA CORPORATION. All rights reserved.
|
||||||
//
|
//
|
||||||
// Redistribution and use in source and binary forms, with or without
|
// Redistribution and use in source and binary forms, with or without
|
||||||
// modification, are permitted provided that the following conditions
|
// modification, are permitted provided that the following conditions
|
||||||
@ -216,8 +216,8 @@ int main(int argc, char **argv)
|
|||||||
checkCudaErrors(cuDeviceGetAttribute(&devMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
|
checkCudaErrors(cuDeviceGetAttribute(&devMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
|
||||||
checkCudaErrors(cuDeviceGetAttribute(&devMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
|
checkCudaErrors(cuDeviceGetAttribute(&devMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
|
||||||
outs() << "Device Compute Capability: " << devMajor << "." << devMinor << "\n";
|
outs() << "Device Compute Capability: " << devMajor << "." << devMinor << "\n";
|
||||||
if (devMajor < 5) {
|
if (devMajor < 7 && devMinor < 5) {
|
||||||
errs() << "ERROR: Device 0 is not sm_50 or later.\n";
|
errs() << "ERROR: Device 0 is not sm_75 or later.\n";
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
// Copyright (c) 2014-2023, NVIDIA CORPORATION. All rights reserved.
|
// Copyright (c) 2014-2025, NVIDIA CORPORATION. All rights reserved.
|
||||||
//
|
//
|
||||||
// Redistribution and use in source and binary forms, with or without
|
// Redistribution and use in source and binary forms, with or without
|
||||||
// modification, are permitted provided that the following conditions
|
// modification, are permitted provided that the following conditions
|
||||||
@ -181,8 +181,8 @@ static CUdevice cudaDeviceInit(int *major, int *minor)
|
|||||||
checkCudaErrors(cuDeviceGetAttribute(major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
|
checkCudaErrors(cuDeviceGetAttribute(major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
|
||||||
checkCudaErrors(cuDeviceGetAttribute(minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
|
checkCudaErrors(cuDeviceGetAttribute(minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
|
||||||
printf("compute capability = %d.%d\n", *major, *minor);
|
printf("compute capability = %d.%d\n", *major, *minor);
|
||||||
if (*major < 5) {
|
if (*major < 7 && *minor < 5) {
|
||||||
fprintf(stderr, "Device 0 is not sm_50 or later\n");
|
fprintf(stderr, "Device 0 is not sm_75 or later\n");
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
return cuDevice;
|
return cuDevice;
|
||||||
|
|||||||
@ -22,6 +22,6 @@ interleaved.
|
|||||||
|
|
||||||
For example,
|
For example,
|
||||||
|
|
||||||
$ ptxgen a.ll -arch=compute_50 b.bc
|
$ ptxgen a.ll -arch=compute_75 b.bc
|
||||||
|
|
||||||
links a.ll and b.bc, and generates PTX code for the compute_50 architecture.
|
links a.ll and b.bc, and generates PTX code for the compute_75 architecture.
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
// Copyright (c) 1993-2023, NVIDIA CORPORATION. All rights reserved.
|
// Copyright (c) 1993-2025, NVIDIA CORPORATION. All rights reserved.
|
||||||
//
|
//
|
||||||
// Redistribution and use in source and binary forms, with or without
|
// Redistribution and use in source and binary forms, with or without
|
||||||
// modification, are permitted provided that the following conditions
|
// modification, are permitted provided that the following conditions
|
||||||
@ -74,8 +74,8 @@ static CUdevice cudaDeviceInit(int *devMajor, int *devMinor)
|
|||||||
|
|
||||||
// Obtain the device's compute capability.
|
// Obtain the device's compute capability.
|
||||||
checkCudaErrors(cuDeviceGetAttribute(devMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
|
checkCudaErrors(cuDeviceGetAttribute(devMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
|
||||||
if (*devMajor < 5) {
|
if (*devMajor < 7 && *devMinor < 5) {
|
||||||
fprintf(stderr, "Device 0 is not sm_50 or later\n");
|
fprintf(stderr, "Device 0 is not sm_75 or later\n");
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
checkCudaErrors(cuDeviceGetAttribute(devMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
|
checkCudaErrors(cuDeviceGetAttribute(devMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
// Copyright (c) 2014-2023, NVIDIA CORPORATION. All rights reserved.
|
// Copyright (c) 2014-2025, NVIDIA CORPORATION. All rights reserved.
|
||||||
//
|
//
|
||||||
// Redistribution and use in source and binary forms, with or without
|
// Redistribution and use in source and binary forms, with or without
|
||||||
// modification, are permitted provided that the following conditions
|
// modification, are permitted provided that the following conditions
|
||||||
@ -172,8 +172,8 @@ static CUdevice cudaDeviceInit(int *major, int *minor)
|
|||||||
checkCudaErrors(cuDeviceGetAttribute(major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
|
checkCudaErrors(cuDeviceGetAttribute(major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
|
||||||
checkCudaErrors(cuDeviceGetAttribute(minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
|
checkCudaErrors(cuDeviceGetAttribute(minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
|
||||||
printf("compute capability = %d.%d\n", *major, *minor);
|
printf("compute capability = %d.%d\n", *major, *minor);
|
||||||
if (*major < 5) {
|
if (*major < 7 && *minor < 5) {
|
||||||
fprintf(stderr, "Device 0 is not sm_50 or later\n");
|
fprintf(stderr, "Device 0 is not sm_75 or later\n");
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user