cuda-samples/Samples/0_Introduction/simpleAttributes/simpleAttributes.cu

/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes CUDA
#include <cuda_runtime.h>
// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper functions for SDK examples
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest(int argc, char **argv);
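////////////////////////////////////////////////////////////////////////////////
//! Build a default access policy window: empty address range, zero hit ratio,
//! normal hit property and streaming miss property, i.e. nothing is marked
//! for L2 persistence.
////////////////////////////////////////////////////////////////////////////////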
cudaAccessPolicyWindow initAccessPolicyWindow(void) {
  cudaAccessPolicyWindow accessPolicyWindow = {0};
  accessPolicyWindow.base_ptr = (void *)0;
  accessPolicyWindow.num_bytes = 0;
  accessPolicyWindow.hitRatio = 0.f;
  accessPolicyWindow.hitProp = cudaAccessPropertyNormal;
  accessPolicyWindow.missProp = cudaAccessPropertyStreaming;
  return accessPolicyWindow;
}
////////////////////////////////////////////////////////////////////////////////
//! Simple test kernel for device functionality
//! @param data input data in global memory
//! @param dataSize input data size
//! @param trash large scratch buffer (bigData) in global memory
//! @param bigDataSize scratch buffer size
//! @param hitCount number of data accesses to perform per block
////////////////////////////////////////////////////////////////////////////////
static __global__ void kernCacheSegmentTest(int *data, int dataSize, int *trash,
                                            int bigDataSize, int hitCount) {
  __shared__ unsigned int hit;
  int row = blockIdx.y * blockDim.y + threadIdx.y;
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  int tID = row * blockDim.y + col;
  uint32_t psRand = tID;

  atomicExch(&hit, 0);
  __syncthreads();
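  // Each thread touches pseudo-random locations until the block-wide hit
  // counter reaches hitCount. Indices come from an xorshift-style generator;
  // even threads access the small buffer (data) covered by the persisting
  // window, odd threads access the large streaming buffer (trash).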
  while (hit < hitCount) {
    psRand ^= psRand << 13;
    psRand ^= psRand >> 17;
    psRand ^= psRand << 5;

    int idx = tID - psRand;
    if (idx < 0) {
      idx = -idx;
    }

    if ((tID % 2) == 0) {
      data[psRand % dataSize] = data[psRand % dataSize] + data[idx % dataSize];
    } else {
      trash[psRand % bigDataSize] =
          trash[psRand % bigDataSize] + trash[idx % bigDataSize];
    }

    atomicAdd(&hit, 1);
  }
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) { runTest(argc, argv); }
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv) {
  bool bTestResult = true;
  cudaAccessPolicyWindow accessPolicyWindow;
  cudaDeviceProp deviceProp;
  cudaStreamAttrValue streamAttrValue;
  cudaStream_t stream;
  cudaStreamAttrID streamAttrID;
  dim3 threads(32, 32);
  int *dataDevicePointer;
  int *dataHostPointer;
  int dataSize;
  int *bigDataDevicePointer;
  int *bigDataHostPointer;
  int bigDataSize;
  StopWatchInterface *timer = 0;

  printf("%s Starting...\n\n", argv[0]);

  // use command-line specified CUDA device, otherwise use device with highest
  // Gflops/s
  int devID = findCudaDevice(argc, (const char **)argv);

  sdkCreateTimer(&timer);
  sdkStartTimer(&timer);
  // Get device properties
  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
  dim3 blocks(deviceProp.maxGridSize[1], 1);

  // Make sure the device supports persisting L2 cache before running the test
  if (deviceProp.persistingL2CacheMaxSize == 0) {
    printf(
        "Waiving execution as device %d does not support persisting L2 "
        "Caching\n",
        devID);
    exit(EXIT_WAIVED);
  }
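  // The rest of the test reserves the maximum persisting L2 budget, attaches
  // an access policy window covering the small buffer to a stream, and then
  // launches a kernel whose accesses to that buffer can stay resident in L2.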
  // Create stream to associate with window
  checkCudaErrors(cudaStreamCreate(&stream));

  // Set the amount of L2 cache that may persist to the maximum the device
  // supports
  checkCudaErrors(cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize,
                                     deviceProp.persistingL2CacheMaxSize));

  // Stream attribute to set
  streamAttrID = cudaStreamAttributeAccessPolicyWindow;

  // Default window
  streamAttrValue.accessPolicyWindow = initAccessPolicyWindow();
  accessPolicyWindow = initAccessPolicyWindow();

  // Size both buffers: the small buffer is a quarter of the L2 cache, the
  // large buffer is four times the L2 cache
  bigDataSize = (deviceProp.l2CacheSize * 4) / sizeof(int);
  dataSize = (deviceProp.l2CacheSize / 4) / sizeof(int);
  // Allocate data
  checkCudaErrors(cudaMallocHost(&dataHostPointer, dataSize * sizeof(int)));
  checkCudaErrors(
      cudaMallocHost(&bigDataHostPointer, bigDataSize * sizeof(int)));

  for (int i = 0; i < bigDataSize; ++i) {
    if (i < dataSize) {
      dataHostPointer[i] = i;
    }

    bigDataHostPointer[bigDataSize - i - 1] = i;
  }

  checkCudaErrors(
      cudaMalloc((void **)&dataDevicePointer, dataSize * sizeof(int)));
  checkCudaErrors(
      cudaMalloc((void **)&bigDataDevicePointer, bigDataSize * sizeof(int)));
  checkCudaErrors(cudaMemcpyAsync(dataDevicePointer, dataHostPointer,
                                  dataSize * sizeof(int),
                                  cudaMemcpyHostToDevice, stream));
  checkCudaErrors(cudaMemcpyAsync(bigDataDevicePointer, bigDataHostPointer,
                                  bigDataSize * sizeof(int),
                                  cudaMemcpyHostToDevice, stream));
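  // The access policy window describes the range [base_ptr, base_ptr +
  // num_bytes). hitRatio is the fraction of accesses within that range treated
  // with hitProp (here persisting); the remainder use missProp. Addresses
  // outside the window are unaffected by the window.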
  // Make a window for the buffer of interest
  accessPolicyWindow.base_ptr = (void *)dataDevicePointer;
  accessPolicyWindow.num_bytes = dataSize * sizeof(int);
  accessPolicyWindow.hitRatio = 1.f;
  accessPolicyWindow.hitProp = cudaAccessPropertyPersisting;
  accessPolicyWindow.missProp = cudaAccessPropertyNormal;
  streamAttrValue.accessPolicyWindow = accessPolicyWindow;

  // Assign window to stream
  checkCudaErrors(
      cudaStreamSetAttribute(stream, streamAttrID, &streamAttrValue));

  // Demote any previous persisting lines
  checkCudaErrors(cudaCtxResetPersistingL2Cache());

  checkCudaErrors(cudaStreamSynchronize(stream));
  kernCacheSegmentTest<<<blocks, threads, 0, stream>>>(
      dataDevicePointer, dataSize, bigDataDevicePointer, bigDataSize, 0xAFFFF);
  checkCudaErrors(cudaStreamSynchronize(stream));

  // Check if kernel execution generated an error
  getLastCudaError("Kernel execution failed");

  // Free memory
  checkCudaErrors(cudaFreeHost(dataHostPointer));
  checkCudaErrors(cudaFreeHost(bigDataHostPointer));
  checkCudaErrors(cudaFree(dataDevicePointer));
  checkCudaErrors(cudaFree(bigDataDevicePointer));

  sdkStopTimer(&timer);
  printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
  sdkDeleteTimer(&timer);

  exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}