mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2025-01-19 22:25:49 +08:00
215 lines
7.9 KiB
Plaintext
215 lines
7.9 KiB
Plaintext
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* * Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
|
* contributors may be used to endorse or promote products derived
|
|
* from this software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
// includes, system
|
|
#include <stdlib.h>
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <math.h>
|
|
|
|
// includes CUDA
|
|
#include <cuda_runtime.h>
|
|
|
|
// includes, project
|
|
#include <helper_cuda.h>
|
|
#include <helper_functions.h> // helper functions for SDK examples
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// declaration, forward
|
|
void runTest(int argc, char **argv);
|
|
|
|
cudaAccessPolicyWindow initAccessPolicyWindow(void) {
|
|
cudaAccessPolicyWindow accessPolicyWindow = {0};
|
|
accessPolicyWindow.base_ptr = (void *)0;
|
|
accessPolicyWindow.num_bytes = 0;
|
|
accessPolicyWindow.hitRatio = 0.f;
|
|
accessPolicyWindow.hitProp = cudaAccessPropertyNormal;
|
|
accessPolicyWindow.missProp = cudaAccessPropertyStreaming;
|
|
return accessPolicyWindow;
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
//! Simple test kernel for device functionality
|
|
//! @param data input data in global memory
|
|
//! @param dataSize input data size
|
|
//! @param bigData input bigData in global memory
|
|
//! @param bigDataSize input bigData size
|
|
//! @param hitcount how many data access are done within block
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
static __global__ void kernCacheSegmentTest(int *data, int dataSize, int *trash,
|
|
int bigDataSize, int hitCount) {
|
|
__shared__ unsigned int hit;
|
|
int row = blockIdx.y * blockDim.y + threadIdx.y;
|
|
int col = blockIdx.x * blockDim.x + threadIdx.x;
|
|
int tID = row * blockDim.y + col;
|
|
uint32_t psRand = tID;
|
|
|
|
atomicExch(&hit, 0);
|
|
__syncthreads();
|
|
while (hit < hitCount) {
|
|
psRand ^= psRand << 13;
|
|
psRand ^= psRand >> 17;
|
|
psRand ^= psRand << 5;
|
|
|
|
int idx = tID - psRand;
|
|
if (idx < 0) {
|
|
idx = -idx;
|
|
}
|
|
|
|
if ((tID % 2) == 0) {
|
|
data[psRand % dataSize] = data[psRand % dataSize] + data[idx % dataSize];
|
|
} else {
|
|
trash[psRand % bigDataSize] =
|
|
trash[psRand % bigDataSize] + trash[idx % bigDataSize];
|
|
}
|
|
|
|
atomicAdd(&hit, 1);
|
|
}
|
|
}
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// Program main
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
int main(int argc, char **argv) { runTest(argc, argv); }
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
//! Run a simple test for CUDA
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
void runTest(int argc, char **argv) {
|
|
bool bTestResult = true;
|
|
cudaAccessPolicyWindow accessPolicyWindow;
|
|
cudaDeviceProp deviceProp;
|
|
cudaStreamAttrValue streamAttrValue;
|
|
cudaStream_t stream;
|
|
cudaStreamAttrID streamAttrID;
|
|
dim3 threads(32, 32);
|
|
int *dataDevicePointer;
|
|
int *dataHostPointer;
|
|
int dataSize;
|
|
int *bigDataDevicePointer;
|
|
int *bigDataHostPointer;
|
|
int bigDataSize;
|
|
StopWatchInterface *timer = 0;
|
|
|
|
printf("%s Starting...\n\n", argv[0]);
|
|
|
|
// use command-line specified CUDA device, otherwise use device with highest
|
|
// Gflops/s
|
|
int devID = findCudaDevice(argc, (const char **)argv);
|
|
sdkCreateTimer(&timer);
|
|
sdkStartTimer(&timer);
|
|
// Get device properties
|
|
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
|
|
dim3 blocks(deviceProp.maxGridSize[1], 1);
|
|
|
|
// Make sure device the l2 optimization
|
|
if (deviceProp.persistingL2CacheMaxSize == 0) {
|
|
printf(
|
|
"Waiving execution as device %d does not support persisting L2 "
|
|
"Caching\n",
|
|
devID);
|
|
exit(EXIT_WAIVED);
|
|
}
|
|
|
|
// Create stream to assiocate with window
|
|
checkCudaErrors(cudaStreamCreate(&stream));
|
|
|
|
// Set the amount of l2 cache that will be persisting to maximum the device
|
|
// can support
|
|
checkCudaErrors(cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize,
|
|
deviceProp.persistingL2CacheMaxSize));
|
|
|
|
// Stream attribute to set
|
|
streamAttrID = cudaStreamAttributeAccessPolicyWindow;
|
|
|
|
// Default window
|
|
streamAttrValue.accessPolicyWindow = initAccessPolicyWindow();
|
|
accessPolicyWindow = initAccessPolicyWindow();
|
|
|
|
// Allocate size of both buffers
|
|
bigDataSize = (deviceProp.l2CacheSize * 4) / sizeof(int);
|
|
dataSize = (deviceProp.l2CacheSize / 4) / sizeof(int);
|
|
|
|
// Allocate data
|
|
checkCudaErrors(cudaMallocHost(&dataHostPointer, dataSize * sizeof(int)));
|
|
checkCudaErrors(
|
|
cudaMallocHost(&bigDataHostPointer, bigDataSize * sizeof(int)));
|
|
|
|
for (int i = 0; i < bigDataSize; ++i) {
|
|
if (i < dataSize) {
|
|
dataHostPointer[i] = i;
|
|
}
|
|
|
|
bigDataHostPointer[bigDataSize - i - 1] = i;
|
|
}
|
|
|
|
checkCudaErrors(
|
|
cudaMalloc((void **)&dataDevicePointer, dataSize * sizeof(int)));
|
|
checkCudaErrors(
|
|
cudaMalloc((void **)&bigDataDevicePointer, bigDataSize * sizeof(int)));
|
|
checkCudaErrors(cudaMemcpyAsync(dataDevicePointer, dataHostPointer,
|
|
dataSize * sizeof(int),
|
|
cudaMemcpyHostToDevice, stream));
|
|
checkCudaErrors(cudaMemcpyAsync(bigDataDevicePointer, bigDataHostPointer,
|
|
bigDataSize * sizeof(int),
|
|
cudaMemcpyHostToDevice, stream));
|
|
|
|
// Make a window for the buffer of interest
|
|
accessPolicyWindow.base_ptr = (void *)dataDevicePointer;
|
|
accessPolicyWindow.num_bytes = dataSize * sizeof(int);
|
|
accessPolicyWindow.hitRatio = 1.f;
|
|
accessPolicyWindow.hitProp = cudaAccessPropertyPersisting;
|
|
accessPolicyWindow.missProp = cudaAccessPropertyNormal;
|
|
streamAttrValue.accessPolicyWindow = accessPolicyWindow;
|
|
|
|
// Assign window to stream
|
|
checkCudaErrors(
|
|
cudaStreamSetAttribute(stream, streamAttrID, &streamAttrValue));
|
|
|
|
// Demote any previous persisting lines
|
|
checkCudaErrors(cudaCtxResetPersistingL2Cache());
|
|
|
|
checkCudaErrors(cudaStreamSynchronize(stream));
|
|
kernCacheSegmentTest<<<blocks, threads, 0, stream>>>(
|
|
dataDevicePointer, dataSize, bigDataDevicePointer, bigDataSize, 0xAFFFF);
|
|
|
|
checkCudaErrors(cudaStreamSynchronize(stream));
|
|
// check if kernel execution generated and error
|
|
getLastCudaError("Kernel execution failed");
|
|
|
|
// Free memory
|
|
checkCudaErrors(cudaFreeHost(dataHostPointer));
|
|
checkCudaErrors(cudaFreeHost(bigDataHostPointer));
|
|
checkCudaErrors(cudaFree(dataDevicePointer));
|
|
checkCudaErrors(cudaFree(bigDataDevicePointer));
|
|
|
|
sdkStopTimer(&timer);
|
|
printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
|
|
sdkDeleteTimer(&timer);
|
|
|
|
exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
|
|
}
|