Merge 13.0 changes into 13.1 dev branch

2026-03-13 05:25:41 +08:00 · 2025-07-29 14:14:27 -07:00 · 2025-07-29 14:14:27 -07:00 · 775d1b4dd3
commit 775d1b4dd3
parent eb8891b839 a5267b83a5
9 changed files with 73 additions and 19 deletions
--- a/README.md
+++ b/README.md
@ -133,6 +133,58 @@ $ cd build
 $ cmake -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DCMAKE_LIBRARY_PATH=/usr/local/cuda/orin/lib64/ -DCMAKE_INCLUDE_PATH=/usr/local/cuda/orin/include -DBUILD_TEGRA=True ..
 ```

+### Cross Building for Automotive Linux Platforms from the DriveOS Docker containers
+
+To build the CUDA samples for the target platform from the DriveOS Docker containers, use the following instructions.
+
+Mount the target Root Filesystem (RFS) in the container so that the CUDA cmake process has the correct paths to CUDA and other system libraries required to build the samples.
+
+Create a temporary directory to use as the mount point. Here `<temp>` is any directory name of your choosing; for example, you can use `/drive/temp`:
+
+```
+$ mkdir /drive/<temp>
+```
+
+Mount the filesystem by running the following command:
+
+```
+$ mount /drive/drive-linux/filesystem/targetfs-images/dev_nsr_desktop_ubuntu-24.04_thor_rfs.img /drive/temp
+```
+
+Configure the project by running the following cmake command:
+
+```
+$ mkdir build && cd build
+$ cmake .. -DBUILD_TEGRA=True \
+  -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
+  -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/toolchain-aarch64-linux.cmake \
+  -DTARGET_FS=/drive/temp \
+  -DCMAKE_LIBRARY_PATH=/drive/temp/usr/local/cuda-13.0/thor/lib64/ \
+  -DCMAKE_INCLUDE_PATH=/drive/temp/usr/local/cuda-13.0/thor/include/
+```
+
+Please note that the following libraries are not pre-installed in the DriveOS dev-nsr target filesystem:
+* libdrm-dev
+* Vulkan
+
+This causes the cmake command to throw errors related to the missing files, and as a result, the related samples will not build in later steps. This issue will be addressed in a future DriveOS release.
+
+To build the samples while ignoring the errors mentioned above, you can use `--ignore-errors`/`--keep-going`, or comment out the corresponding `add_subdirectory` commands in the CMakeLists.txt in the parent folder of the samples requiring Vulkan and libdrm-dev:
+
+```
+$ make -j$(nproc) --ignore-errors # or --keep-going
+```
+
+```
+# In Samples/5_Domain_Specific/CMakeLists.txt
+# add_subdirectory(simpleGL)
+# add_subdirectory(simpleVulkan)
+# add_subdirectory(simpleVulkanMMAP)
+
+# In Samples/8_Platform_Specific/Tegra/CMakeLists.txt
+# add_subdirectory(simpleGLES_EGLOutput)
+```
+
 ### QNX

 Cross-compilation for QNX with CMake is supported in the CUDA 13.0 samples release and newer. An example build for
--- a/Samples/3_CUDA_Features/cudaTensorCoreGemm/cudaTensorCoreGemm.cu
+++ b/Samples/3_CUDA_Features/cudaTensorCoreGemm/cudaTensorCoreGemm.cu
@ -224,7 +224,7 @@ __global__ void compute_gemm(const half *A, const half *B, const float *C, float
    // there's no such tile, all warps in this CTA exit.
    for (unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) {
        const unsigned int block_tile_i = ((block_pos * BLOCK_ROW_TILES) / N_TILES) * (BLOCK_COL_TILES);
-        const unsigned int block_tile_j = (block_pos * BLOCK_COL_TILES) % N_TILES;
+        const unsigned int block_tile_j = (block_pos * BLOCK_ROW_TILES) % N_TILES;

        // Stop when there are no more D matrix tiles to compute in this CTA.
        if (block_tile_i >= M_TILES) {
--- a/Samples/7_libNVVM/README.md
+++ b/Samples/7_libNVVM/README.md
@ -4,7 +4,9 @@ libNVVM and NVVM IR Samples
 Introduction
 ------------

-The following samples illustrate the use of libNVVM and NVVM IR.
+The following samples illustrate the use of libNVVM and NVVM IR. Running and
+testing these samples requires an NVIDIA driver compatible with the CUDA
+Toolkit being used for compilation.

 - cuda-shared-memory - A directory containing NVVM IR programs that demonstrate
  CUDA 'shared' memory usage.
@ -71,7 +73,7 @@ A Note About the cuda-c-linking Sample

 This sample requires a development package (or locally-built) LLVM library
 between versions 7 to 14 inclusive.  LLVM 15 defaults to using opaque pointers,
-which are currently not supported in libNVVM.
+which are not supported in libNVVM for pre-Blackwell architectures.

 The LLVM_HOME environment variable is required for users who wish to build the
 cuda-c-linking sample and have a locally built copy of LLVM that they wish to
--- a/Samples/7_libNVVM/cuda-c-linking/CMakeLists.txt
+++ b/Samples/7_libNVVM/cuda-c-linking/CMakeLists.txt
@ -1,4 +1,4 @@
-# Copyright (c) 1993-2023, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 1993-2025, NVIDIA CORPORATION. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@ -29,7 +29,7 @@ if (LLVM_PACKAGE_VERSION VERSION_GREATER_EQUAL "15" OR
    LLVM_PACKAGE_VERSION VERSION_LESS "7")
  message(STATUS "The cuda-c-linking sample is expected to build with "
                 "LLVM development libraries v7 to v14, opaque pointers are "
-                 "not yet supported in libNVVM.")
+                 "not supported in libNVVM for pre-Blackwell architectures.")
  return()
 endif ()

--- a/Samples/7_libNVVM/cuda-c-linking/cuda-c-linking.cpp
+++ b/Samples/7_libNVVM/cuda-c-linking/cuda-c-linking.cpp
@ -1,4 +1,4 @@
-// Copyright (c) 1993-2023, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 1993-2025, NVIDIA CORPORATION. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@ -216,8 +216,8 @@ int main(int argc, char **argv)
    checkCudaErrors(cuDeviceGetAttribute(&devMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
    checkCudaErrors(cuDeviceGetAttribute(&devMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
    outs() << "Device Compute Capability: " << devMajor << "." << devMinor << "\n";
-    if (devMajor < 5) {
-        errs() << "ERROR: Device 0 is not sm_50 or later.\n";
+    if (devMajor < 7 && devMinor < 5) {
+        errs() << "ERROR: Device 0 is not sm_75 or later.\n";
        return 1;
    }

--- a/Samples/7_libNVVM/device-side-launch/dsl.c
+++ b/Samples/7_libNVVM/device-side-launch/dsl.c
@ -1,4 +1,4 @@
-// Copyright (c) 2014-2023, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2014-2025, NVIDIA CORPORATION. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@ -181,8 +181,8 @@ static CUdevice cudaDeviceInit(int *major, int *minor)
    checkCudaErrors(cuDeviceGetAttribute(major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
    checkCudaErrors(cuDeviceGetAttribute(minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
    printf("compute capability = %d.%d\n", *major, *minor);
-    if (*major < 5) {
-        fprintf(stderr, "Device 0 is not sm_50 or later\n");
+    if (*major < 7 && *minor < 5) {
+        fprintf(stderr, "Device 0 is not sm_75 or later\n");
        exit(EXIT_FAILURE);
    }
    return cuDevice;
--- a/Samples/7_libNVVM/ptxgen/README.md
+++ b/Samples/7_libNVVM/ptxgen/README.md
@ -22,6 +22,6 @@ interleaved.

 For example,

-    $ ptxgen a.ll -arch=compute_50 b.bc
+    $ ptxgen a.ll -arch=compute_75 b.bc

-links a.ll and b.bc, and generates PTX code for the compute_50 architecture.
+links a.ll and b.bc, and generates PTX code for the compute_75 architecture.
--- a/Samples/7_libNVVM/simple/simple.c
+++ b/Samples/7_libNVVM/simple/simple.c
@ -1,4 +1,4 @@
-// Copyright (c) 1993-2023, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 1993-2025, NVIDIA CORPORATION. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@ -74,8 +74,8 @@ static CUdevice cudaDeviceInit(int *devMajor, int *devMinor)

    // Obtain the device's compute capability.
    checkCudaErrors(cuDeviceGetAttribute(devMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
-    if (*devMajor < 5) {
-        fprintf(stderr, "Device 0 is not sm_50 or later\n");
+    if (*devMajor < 7 && *devMinor < 5) {
+        fprintf(stderr, "Device 0 is not sm_75 or later\n");
        exit(EXIT_FAILURE);
    }
    checkCudaErrors(cuDeviceGetAttribute(devMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
--- a/Samples/7_libNVVM/uvmlite/uvmlite.c
+++ b/Samples/7_libNVVM/uvmlite/uvmlite.c
@ -1,4 +1,4 @@
-// Copyright (c) 2014-2023, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2014-2025, NVIDIA CORPORATION. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@ -172,8 +172,8 @@ static CUdevice cudaDeviceInit(int *major, int *minor)
    checkCudaErrors(cuDeviceGetAttribute(major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
    checkCudaErrors(cuDeviceGetAttribute(minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
    printf("compute capability = %d.%d\n", *major, *minor);
-    if (*major < 5) {
-        fprintf(stderr, "Device 0 is not sm_50 or later\n");
+    if (*major < 7 && *minor < 5) {
+        fprintf(stderr, "Device 0 is not sm_75 or later\n");
        exit(EXIT_FAILURE);
    }