/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* A program demonstrating trivial use of system-wide atomics on migratable * memory. */ #include #include #include #include #include #include #define min(a, b) (a) < (b) ? (a) : (b) #define max(a, b) (a) > (b) ? (a) : (b) #define LOOP_NUM 50 __global__ void atomicKernel(int *atom_arr) { unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x; for (int i = 0; i < LOOP_NUM; i++) { // Atomic addition atomicAdd_system(&atom_arr[0], 10); // Atomic exchange atomicExch_system(&atom_arr[1], tid); // Atomic maximum atomicMax_system(&atom_arr[2], tid); // Atomic minimum atomicMin_system(&atom_arr[3], tid); // Atomic increment (modulo 17+1) atomicInc_system((unsigned int *)&atom_arr[4], 17); // Atomic decrement atomicDec_system((unsigned int *)&atom_arr[5], 137); // Atomic compare-and-swap atomicCAS_system(&atom_arr[6], tid - 1, tid); // Bitwise atomic instructions // Atomic AND atomicAnd_system(&atom_arr[7], 2 * tid + 7); // Atomic OR atomicOr_system(&atom_arr[8], 1 << tid); // Atomic XOR atomicXor_system(&atom_arr[9], tid); } } void atomicKernel_CPU(int *atom_arr, int no_of_threads) { for (int i = no_of_threads; i < 2 * no_of_threads; i++) { for (int j = 0; j < LOOP_NUM; j++) { // Atomic addition __sync_fetch_and_add(&atom_arr[0], 10); // Atomic exchange __sync_lock_test_and_set(&atom_arr[1], i); // Atomic maximum int old, expected; do { expected = atom_arr[2]; old = __sync_val_compare_and_swap(&atom_arr[2], expected, max(expected, i)); } while (old != expected); // Atomic minimum do { expected = atom_arr[3]; old = __sync_val_compare_and_swap(&atom_arr[3], expected, min(expected, i)); } while (old != expected); // Atomic increment (modulo 17+1) int limit = 17; do { expected = atom_arr[4]; old = __sync_val_compare_and_swap( &atom_arr[4], expected, (expected >= limit) ? 0 : expected + 1); } while (old != expected); // Atomic decrement limit = 137; do { expected = atom_arr[5]; old = __sync_val_compare_and_swap( &atom_arr[5], expected, ((expected == 0) || (expected > limit)) ? limit : expected - 1); } while (old != expected); // Atomic compare-and-swap __sync_val_compare_and_swap(&atom_arr[6], i - 1, i); // Bitwise atomic instructions // Atomic AND __sync_fetch_and_and(&atom_arr[7], 2 * i + 7); // Atomic OR __sync_fetch_and_or(&atom_arr[8], 1 << i); // Atomic XOR // 11th element should be 0xff __sync_fetch_and_xor(&atom_arr[9], i); } } } //////////////////////////////////////////////////////////////////////////////// //! Compute reference data set //! Each element is multiplied with the number of threads / array length //! @param reference reference data, computed but preallocated //! @param idata input data as provided to device //! @param len number of elements in reference / idata //////////////////////////////////////////////////////////////////////////////// int verify(int *testData, const int len) { int val = 0; for (int i = 0; i < len * LOOP_NUM; ++i) { val += 10; } if (val != testData[0]) { printf("atomicAdd failed val = %d testData = %d\n", val, testData[0]); return false; } val = 0; bool found = false; for (int i = 0; i < len; ++i) { // second element should be a member of [0, len) if (i == testData[1]) { found = true; break; } } if (!found) { printf("atomicExch failed\n"); return false; } val = -(1 << 8); for (int i = 0; i < len; ++i) { // third element should be len-1 val = max(val, i); } if (val != testData[2]) { printf("atomicMax failed\n"); return false; } val = 1 << 8; for (int i = 0; i < len; ++i) { val = min(val, i); } if (val != testData[3]) { printf("atomicMin failed\n"); return false; } int limit = 17; val = 0; for (int i = 0; i < len * LOOP_NUM; ++i) { val = (val >= limit) ? 0 : val + 1; } if (val != testData[4]) { printf("atomicInc failed\n"); return false; } limit = 137; val = 0; for (int i = 0; i < len * LOOP_NUM; ++i) { val = ((val == 0) || (val > limit)) ? limit : val - 1; } if (val != testData[5]) { printf("atomicDec failed\n"); return false; } found = false; for (int i = 0; i < len; ++i) { // seventh element should be a member of [0, len) if (i == testData[6]) { found = true; break; } } if (!found) { printf("atomicCAS failed\n"); return false; } val = 0xff; for (int i = 0; i < len; ++i) { // 8th element should be 1 val &= (2 * i + 7); } if (val != testData[7]) { printf("atomicAnd failed\n"); return false; } val = 0; for (int i = 0; i < len; ++i) { // 9th element should be 0xff val |= (1 << i); } if (val != testData[8]) { printf("atomicOr failed\n"); return false; } val = 0xff; for (int i = 0; i < len; ++i) { // 11th element should be 0xff val ^= i; } if (val != testData[9]) { printf("atomicXor failed\n"); return false; } return true; } int main(int argc, char **argv) { // set device cudaDeviceProp device_prop; int dev_id = findCudaDevice(argc, (const char **)argv); checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id)); if (!device_prop.managedMemory) { // This samples requires being run on a device that supports Unified Memory fprintf(stderr, "Unified Memory not supported on this device\n"); exit(EXIT_WAIVED); } if (device_prop.computeMode == cudaComputeModeProhibited) { // This sample requires being run with a default or process exclusive mode fprintf(stderr, "This sample requires a device in either default or process " "exclusive mode\n"); exit(EXIT_WAIVED); } if (device_prop.major < 6) { printf( "%s: requires a minimum CUDA compute 6.0 capability, waiving " "testing.\n", argv[0]); exit(EXIT_WAIVED); } unsigned int numThreads = 256; unsigned int numBlocks = 64; unsigned int numData = 10; int *atom_arr; if (device_prop.pageableMemoryAccess) { printf("CAN access pageable memory\n"); atom_arr = (int *)malloc(sizeof(int) * numData); } else { printf("CANNOT access pageable memory\n"); checkCudaErrors(cudaMallocManaged(&atom_arr, sizeof(int) * numData)); } for (unsigned int i = 0; i < numData; i++) atom_arr[i] = 0; // To make the AND and XOR tests generate something other than 0... atom_arr[7] = atom_arr[9] = 0xff; atomicKernel<<>>(atom_arr); atomicKernel_CPU(atom_arr, numBlocks * numThreads); checkCudaErrors(cudaDeviceSynchronize()); // Compute & verify reference solution int testResult = verify(atom_arr, 2 * numThreads * numBlocks); if (device_prop.pageableMemoryAccess) { free(atom_arr); } else { cudaFree(atom_arr); } printf("systemWideAtomics completed, returned %s \n", testResult ? "OK" : "ERROR!"); exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); }