cuda-samples/Samples/0_Introduction/simpleAttributes/simpleAttributes.cu

/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes CUDA
#include <cuda_runtime.h>
// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper functions for SDK examples
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest(int argc, char **argv);
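////////////////////////////////////////////////////////////////////////////////
//! Build a default access policy window: empty address range, zero hit ratio,
//! normal hit property and streaming miss property, i.e. nothing is marked
//! for L2 persistence.
////////////////////////////////////////////////////////////////////////////////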
cudaAccessPolicyWindow initAccessPolicyWindow(void) {
  cudaAccessPolicyWindow accessPolicyWindow = {0};
  accessPolicyWindow.base_ptr = (void *)0;
  accessPolicyWindow.num_bytes = 0;
  accessPolicyWindow.hitRatio = 0.f;
  accessPolicyWindow.hitProp = cudaAccessPropertyNormal;
  accessPolicyWindow.missProp = cudaAccessPropertyStreaming;
  return accessPolicyWindow;
}
////////////////////////////////////////////////////////////////////////////////
//! Simple test kernel for device functionality
//! @param data input data in global memory
//! @param dataSize input data size
//! @param trash large scratch buffer (bigData) in global memory
//! @param bigDataSize scratch buffer size
//! @param hitCount number of data accesses to perform per block
////////////////////////////////////////////////////////////////////////////////
static __global__ void kernCacheSegmentTest(int *data, int dataSize, int *trash,
                                            int bigDataSize, int hitCount) {
  __shared__ unsigned int hit;
  int row = blockIdx.y * blockDim.y + threadIdx.y;
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  int tID = row * blockDim.y + col;
  uint32_t psRand = tID;

  atomicExch(&hit, 0);
  __syncthreads();
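  // Each thread touches pseudo-random locations until the block-wide hit
  // counter reaches hitCount. Indices come from an xorshift-style generator;
  // even threads access the small buffer (data) covered by the persisting
  // window, odd threads access the large streaming buffer (trash).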
  while (hit < hitCount) {
    psRand ^= psRand << 13;
    psRand ^= psRand >> 17;
    psRand ^= psRand << 5;

    int idx = tID - psRand;
    if (idx < 0) {
      idx = -idx;
    }

    if ((tID % 2) == 0) {
      data[psRand % dataSize] = data[psRand % dataSize] + data[idx % dataSize];
    } else {
      trash[psRand % bigDataSize] =
          trash[psRand % bigDataSize] + trash[idx % bigDataSize];
    }

    atomicAdd(&hit, 1);
  }
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) { runTest(argc, argv); }
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv) {
  bool bTestResult = true;
  cudaAccessPolicyWindow accessPolicyWindow;
  cudaDeviceProp deviceProp;
  cudaStreamAttrValue streamAttrValue;
  cudaStream_t stream;
  cudaStreamAttrID streamAttrID;
  dim3 threads(32, 32);
  int *dataDevicePointer;
  int *dataHostPointer;
  int dataSize;
  int *bigDataDevicePointer;
  int *bigDataHostPointer;
  int bigDataSize;
  StopWatchInterface *timer = 0;

  printf("%s Starting...\n\n", argv[0]);

  // use command-line specified CUDA device, otherwise use device with highest
  // Gflops/s
  int devID = findCudaDevice(argc, (const char **)argv);

  sdkCreateTimer(&timer);
  sdkStartTimer(&timer);
  // Get device properties
  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
  dim3 blocks(deviceProp.maxGridSize[1], 1);

  // Make sure the device supports persisting L2 cache before running the test
  if (deviceProp.persistingL2CacheMaxSize == 0) {
    printf(
        "Waiving execution as device %d does not support persisting L2 "
        "Caching\n",
        devID);
    exit(EXIT_WAIVED);
  }
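  // The rest of the test reserves the maximum persisting L2 budget, attaches
  // an access policy window covering the small buffer to a stream, and then
  // launches a kernel whose accesses to that buffer can stay resident in L2.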
  // Create stream to associate with window
  checkCudaErrors(cudaStreamCreate(&stream));

  // Set the amount of L2 cache that may persist to the maximum the device
  // supports
  checkCudaErrors(cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize,
                                     deviceProp.persistingL2CacheMaxSize));

  // Stream attribute to set
  streamAttrID = cudaStreamAttributeAccessPolicyWindow;

  // Default window
  streamAttrValue.accessPolicyWindow = initAccessPolicyWindow();
  accessPolicyWindow = initAccessPolicyWindow();

  // Size both buffers: the small buffer is a quarter of the L2 cache, the
  // large buffer is four times the L2 cache
  bigDataSize = (deviceProp.l2CacheSize * 4) / sizeof(int);
  dataSize = (deviceProp.l2CacheSize / 4) / sizeof(int);
  // Allocate data
  checkCudaErrors(cudaMallocHost(&dataHostPointer, dataSize * sizeof(int)));
  checkCudaErrors(
      cudaMallocHost(&bigDataHostPointer, bigDataSize * sizeof(int)));

  for (int i = 0; i < bigDataSize; ++i) {
    if (i < dataSize) {
      dataHostPointer[i] = i;
    }

    bigDataHostPointer[bigDataSize - i - 1] = i;
  }

  checkCudaErrors(
      cudaMalloc((void **)&dataDevicePointer, dataSize * sizeof(int)));
  checkCudaErrors(
      cudaMalloc((void **)&bigDataDevicePointer, bigDataSize * sizeof(int)));
  checkCudaErrors(cudaMemcpyAsync(dataDevicePointer, dataHostPointer,
                                  dataSize * sizeof(int),
                                  cudaMemcpyHostToDevice, stream));
  checkCudaErrors(cudaMemcpyAsync(bigDataDevicePointer, bigDataHostPointer,
                                  bigDataSize * sizeof(int),
                                  cudaMemcpyHostToDevice, stream));
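  // The access policy window describes the range [base_ptr, base_ptr +
  // num_bytes). hitRatio is the fraction of accesses within that range treated
  // with hitProp (here persisting); the remainder use missProp. Addresses
  // outside the window are unaffected by the window.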
  // Make a window for the buffer of interest
  accessPolicyWindow.base_ptr = (void *)dataDevicePointer;
  accessPolicyWindow.num_bytes = dataSize * sizeof(int);
  accessPolicyWindow.hitRatio = 1.f;
  accessPolicyWindow.hitProp = cudaAccessPropertyPersisting;
  accessPolicyWindow.missProp = cudaAccessPropertyNormal;
  streamAttrValue.accessPolicyWindow = accessPolicyWindow;

  // Assign window to stream
  checkCudaErrors(
      cudaStreamSetAttribute(stream, streamAttrID, &streamAttrValue));

  // Demote any previous persisting lines
  checkCudaErrors(cudaCtxResetPersistingL2Cache());

  checkCudaErrors(cudaStreamSynchronize(stream));
  kernCacheSegmentTest<<<blocks, threads, 0, stream>>>(
      dataDevicePointer, dataSize, bigDataDevicePointer, bigDataSize, 0xAFFFF);
  checkCudaErrors(cudaStreamSynchronize(stream));

  // Check if kernel execution generated an error
  getLastCudaError("Kernel execution failed");

  // Free memory
  checkCudaErrors(cudaFreeHost(dataHostPointer));
  checkCudaErrors(cudaFreeHost(bigDataHostPointer));
  checkCudaErrors(cudaFree(dataDevicePointer));
  checkCudaErrors(cudaFree(bigDataDevicePointer));

  sdkStopTimer(&timer);
  printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
  sdkDeleteTimer(&timer);

  exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}