// cuda-samples/Samples/systemWideAtomics/systemWideAtomics.cu

/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* A program demonstrating trivial use of system-wide atomics on migratable
* memory.
*/
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <math.h>
#include <stdint.h>
#include <cstdio>
#include <ctime>
// Function-like min/max helpers. The whole expansion is parenthesised so the
// result can be embedded in a larger expression (e.g. `2 * min(a, b)`).
// Each argument may be evaluated twice, so do not pass expressions with side
// effects.
#define min(a, b) (((a) < (b)) ? (a) : (b))
#define max(a, b) (((a) > (b)) ? (a) : (b))
// Number of times every GPU thread / simulated CPU thread repeats the full
// set of atomic operations.
#define LOOP_NUM 50
// Exercises the system-scope atomic intrinsics on a shared ten-slot array.
//
// Launch layout: 1D grid of 1D blocks (only the x dimension is used to build
// the thread id). No shared memory is used. Every thread operates on the same
// ten slots, so there is no per-thread element and no bounds check on tid.
// The *_system atomics require compute capability 6.0 or newer.
//
// Slot usage (atom_arr must hold at least 10 ints):
//   [0] add 10              [1] exchange with tid     [2] max with tid
//   [3] min with tid        [4] inc, wraps after 17   [5] dec, wraps to 137
//   [6] CAS (tid-1 -> tid)  [7] AND with 2*tid+7      [8] OR with bit `tid`
//   [9] XOR with tid
__global__ void atomicKernel(int *atom_arr) {
  unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;

  // Shifting a 32-bit value by 32 or more is undefined behaviour in C++, so
  // only thread ids that name a valid bit position contribute a bit; all
  // other threads OR in zero.
  const unsigned int orMask = (tid < 32u) ? (1u << tid) : 0u;

  for (int i = 0; i < LOOP_NUM; i++) {
    // Atomic addition
    atomicAdd_system(&atom_arr[0], 10);

    // Atomic exchange
    atomicExch_system(&atom_arr[1], tid);

    // Atomic maximum
    atomicMax_system(&atom_arr[2], tid);

    // Atomic minimum
    atomicMin_system(&atom_arr[3], tid);

    // Atomic increment (modulo 17+1)
    atomicInc_system((unsigned int *)&atom_arr[4], 17);

    // Atomic decrement
    atomicDec_system((unsigned int *)&atom_arr[5], 137);

    // Atomic compare-and-swap
    atomicCAS_system(&atom_arr[6], tid - 1, tid);

    // Bitwise atomic instructions

    // Atomic AND
    atomicAnd_system(&atom_arr[7], 2 * tid + 7);

    // Atomic OR (unsigned view of the slot, as for Inc/Dec above)
    atomicOr_system((unsigned int *)&atom_arr[8], orMask);

    // Atomic XOR
    atomicXor_system(&atom_arr[9], tid);
  }
}
// Host-side counterpart of atomicKernel.
//
// Replays the same ten atomic operations on atom_arr using the GCC __sync
// builtins, once per simulated thread id i in
// [no_of_threads, 2 * no_of_threads), each repeated LOOP_NUM times.
// Operations without a direct builtin (max, min, wrapping inc/dec) are
// implemented as compare-and-swap retry loops.
//
// atom_arr      : array of at least 10 ints, laid out as in atomicKernel
// no_of_threads : number of simulated threads; also the first id used
void atomicKernel_CPU(int *atom_arr, int no_of_threads) {
  for (int i = no_of_threads; i < 2 * no_of_threads; i++) {
    // Shifting a 32-bit value by 32 or more is undefined behaviour, so only
    // ids that name a valid bit position contribute a bit to the OR test.
    const unsigned int orMask = (i >= 0 && i < 32) ? (1u << i) : 0u;

    for (int j = 0; j < LOOP_NUM; j++) {
      // Atomic addition
      __sync_fetch_and_add(&atom_arr[0], 10);

      // Atomic exchange
      __sync_lock_test_and_set(&atom_arr[1], i);

      // Atomic maximum: retry until the CAS observes the value we read
      int old, expected;
      do {
        expected = atom_arr[2];
        old = __sync_val_compare_and_swap(&atom_arr[2], expected,
                                          max(expected, i));
      } while (old != expected);

      // Atomic minimum
      do {
        expected = atom_arr[3];
        old = __sync_val_compare_and_swap(&atom_arr[3], expected,
                                          min(expected, i));
      } while (old != expected);

      // Atomic increment (modulo 17+1)
      int limit = 17;
      do {
        expected = atom_arr[4];
        old = __sync_val_compare_and_swap(
            &atom_arr[4], expected, (expected >= limit) ? 0 : expected + 1);
      } while (old != expected);

      // Atomic decrement: wraps to `limit` from 0 or from anything above it
      limit = 137;
      do {
        expected = atom_arr[5];
        old = __sync_val_compare_and_swap(
            &atom_arr[5], expected,
            ((expected == 0) || (expected > limit)) ? limit : expected - 1);
      } while (old != expected);

      // Atomic compare-and-swap
      __sync_val_compare_and_swap(&atom_arr[6], i - 1, i);

      // Bitwise atomic instructions

      // Atomic AND
      __sync_fetch_and_and(&atom_arr[7], 2 * i + 7);

      // Atomic OR (unsigned view of the slot so the mask needs no sign
      // conversion)
      __sync_fetch_and_or((unsigned int *)&atom_arr[8], orMask);

      // Atomic XOR
      __sync_fetch_and_xor(&atom_arr[9], i);
    }
  }
}
////////////////////////////////////////////////////////////////////////////////
//! Check the result array of the atomics test against a serial reference
//! Each of the ten slots is compared with the value (or value range) expected
//! after `len` threads with ids [0, len) have each run every atomic LOOP_NUM
//! times. The first mismatch is reported on stdout.
//! @param testData  result array, at least 10 ints
//! @param len       total number of thread ids that took part
//! @return nonzero if every slot passes, 0 on the first failure
////////////////////////////////////////////////////////////////////////////////
int verify(int *testData, const int len) {
  // Number of times each atomic was issued in total.
  const int totalOps = (len > 0) ? len * LOOP_NUM : 0;

  // [0] atomicAdd: every operation adds 10, starting from 0.
  int val = 10 * totalOps;
  if (val != testData[0]) {
    printf("atomicAdd failed val = %d testData = %d\n", val, testData[0]);
    return false;
  }

  // [1] atomicExch: whichever thread wrote last, the value is some id in
  // [0, len).
  if (testData[1] < 0 || testData[1] >= len) {
    printf("atomicExch failed\n");
    return false;
  }

  // [2] atomicMax: largest id, i.e. len - 1 (sentinel if no ids took part).
  val = (len > 0) ? len - 1 : -(1 << 8);
  if (val != testData[2]) {
    printf("atomicMax failed\n");
    return false;
  }

  // [3] atomicMin: smallest id, i.e. 0 (sentinel if no ids took part).
  val = (len > 0) ? 0 : (1 << 8);
  if (val != testData[3]) {
    printf("atomicMin failed\n");
    return false;
  }

  // [4] atomicInc: counter that wraps to 0 after reaching 17.
  int limit = 17;
  val = 0;
  for (int i = 0; i < totalOps; ++i) {
    val = (val >= limit) ? 0 : val + 1;
  }
  if (val != testData[4]) {
    printf("atomicInc failed\n");
    return false;
  }

  // [5] atomicDec: counter that wraps to 137 from 0 (or from above 137).
  limit = 137;
  val = 0;
  for (int i = 0; i < totalOps; ++i) {
    val = ((val == 0) || (val > limit)) ? limit : val - 1;
  }
  if (val != testData[5]) {
    printf("atomicDec failed\n");
    return false;
  }

  // [6] atomicCAS: the slot must hold some id in [0, len).
  if (testData[6] < 0 || testData[6] >= len) {
    printf("atomicCAS failed\n");
    return false;
  }

  // [7] atomicAnd: 0xff ANDed with (2 * id + 7) for every id.
  val = 0xff;
  for (int i = 0; i < len; ++i) {
    val &= (2 * i + 7);
  }
  if (val != testData[7]) {
    printf("atomicAnd failed\n");
    return false;
  }

  // [8] atomicOr: one bit per id. Only ids 0..31 name a valid bit of a 32-bit
  // word (a larger shift count is undefined behaviour), so the reference is
  // built from those ids alone and compared as unsigned.
  unsigned int orBits = 0u;
  for (int i = 0; i < len && i < 32; ++i) {
    orBits |= (1u << i);
  }
  if (orBits != (unsigned int)testData[8]) {
    printf("atomicOr failed\n");
    return false;
  }

  // [9] atomicXor: 0xff XORed with every id.
  val = 0xff;
  for (int i = 0; i < len; ++i) {
    val ^= i;
  }
  if (val != testData[9]) {
    printf("atomicXor failed\n");
    return false;
  }

  return true;
}
// Program entry point.
//
// Selects a device, waives the test when the device cannot run it, allocates a
// ten-int array reachable from both CPU and GPU, runs atomicKernel and
// atomicKernel_CPU against it at the same time, then checks the result with
// verify(). Exits with EXIT_SUCCESS / EXIT_FAILURE / EXIT_WAIVED.
int main(int argc, char **argv) {
  // set device
  cudaDeviceProp device_prop;
  int dev_id = findCudaDevice(argc, (const char **)argv);
  checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id));

  if (!device_prop.managedMemory) {
    // This sample requires being run on a device that supports Unified Memory
    fprintf(stderr, "Unified Memory not supported on this device\n");
    exit(EXIT_WAIVED);
  }

  if (device_prop.computeMode == cudaComputeModeProhibited) {
    // This sample requires being run with a default or process exclusive mode
    fprintf(stderr,
            "This sample requires a device in either default or process "
            "exclusive mode\n");
    exit(EXIT_WAIVED);
  }

  if (device_prop.major < 6) {
    printf(
        "%s: requires a minimum CUDA compute 6.0 capability, waiving "
        "testing.\n",
        argv[0]);
    exit(EXIT_WAIVED);
  }

  unsigned int numThreads = 256;
  unsigned int numBlocks = 64;
  unsigned int numData = 10;

  int *atom_arr;

  if (device_prop.pageableMemoryAccess) {
    printf("CAN access pageable memory\n");
    atom_arr = (int *)malloc(sizeof(int) * numData);
    if (atom_arr == NULL) {
      fprintf(stderr, "Failed to allocate host memory for the atomics array\n");
      exit(EXIT_FAILURE);
    }
  } else {
    printf("CANNOT access pageable memory\n");
    // The CPU updates the array while the kernel may still be running; with
    // managed memory that is only permitted when the device reports
    // concurrent managed access.
    if (!device_prop.concurrentManagedAccess) {
      fprintf(stderr,
              "Concurrent CPU/GPU access to managed memory is not supported "
              "on this device, waiving testing.\n");
      exit(EXIT_WAIVED);
    }
    checkCudaErrors(cudaMallocManaged(&atom_arr, sizeof(int) * numData));
  }

  for (unsigned int i = 0; i < numData; i++) atom_arr[i] = 0;

  // To make the AND and XOR tests generate something other than 0...
  atom_arr[7] = atom_arr[9] = 0xff;

  atomicKernel<<<numBlocks, numThreads>>>(atom_arr);
  // A bad launch configuration is only reported through the last-error state.
  checkCudaErrors(cudaGetLastError());

  // Runs on the host concurrently with the kernel, using the second half of
  // the thread-id range.
  atomicKernel_CPU(atom_arr, numBlocks * numThreads);

  // Wait for the kernel; also surfaces any asynchronous execution error.
  checkCudaErrors(cudaDeviceSynchronize());

  // Compute & verify reference solution
  int testResult = verify(atom_arr, 2 * numThreads * numBlocks);

  // Release with the allocator that matches the allocation path above.
  if (device_prop.pageableMemoryAccess) {
    free(atom_arr);
  } else {
    checkCudaErrors(cudaFree(atom_arr));
  }

  printf("systemWideAtomics completed, returned %s \n",
         testResult ? "OK" : "ERROR!");
  exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}