cuda-samples/Samples/systemWideAtomics/systemWideAtomics.cu

/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* A program demonstrating trivial use of system-wide atomics on migratable
 * memory.
 */

#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <math.h>
#include <stdint.h>
#include <cstdio>
#include <ctime>

// Function-like min/max helpers. The whole expansion is wrapped in
// parentheses so the conditional keeps its meaning when the macro is used
// inside a larger expression (e.g. `2 * min(a, b)`). Each argument may be
// evaluated twice, so do not pass expressions with side effects.
#define min(a, b) (((a) < (b)) ? (a) : (b))
#define max(a, b) (((a) > (b)) ? (a) : (b))

// Number of times each GPU thread / emulated CPU thread repeats the full
// sequence of atomic operations.
#define LOOP_NUM 50

/**
 * GPU side of the test: every thread applies LOOP_NUM rounds of system-wide
 * atomic operations to the ten shared counters in atom_arr.
 *
 * Launch layout: one-dimensional grid of one-dimensional blocks. The flat
 * global thread index (tid) is used as the per-thread operand. There is no
 * bounds guard because every thread touches the same fixed elements
 * atom_arr[0..9], independent of the grid size.
 *
 * Element usage:
 *   [0] add 10            [1] exchange with tid     [2] max with tid
 *   [3] min with tid      [4] inc, wraps after 17   [5] dec, limit 137
 *   [6] CAS tid-1 -> tid  [7] AND with 2*tid+7      [8] OR with one bit
 *   [9] XOR with tid
 *
 * Uses the *_system atomic variants; main() waives the sample on devices
 * below compute capability 6.0.
 *
 * @param atom_arr  array of at least 10 ints that the device can access
 */
__global__ void atomicKernel(int *atom_arr) {
  unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;

  for (int i = 0; i < LOOP_NUM; i++) {
    // Atomic addition
    atomicAdd_system(&atom_arr[0], 10);

    // Atomic exchange
    atomicExch_system(&atom_arr[1], tid);

    // Atomic maximum
    atomicMax_system(&atom_arr[2], tid);

    // Atomic minimum
    atomicMin_system(&atom_arr[3], tid);

    // Atomic increment (modulo 17+1)
    atomicInc_system((unsigned int *)&atom_arr[4], 17);

    // Atomic decrement
    atomicDec_system((unsigned int *)&atom_arr[5], 137);

    // Atomic compare-and-swap
    atomicCAS_system(&atom_arr[6], tid - 1, tid);

    // Bitwise atomic instructions

    // Atomic AND
    atomicAnd_system(&atom_arr[7], 2 * tid + 7);

    // Atomic OR
    // Shifting a 32-bit value by 32 or more is undefined behavior, so only
    // the low five bits of tid select which bit this thread sets.
    atomicOr_system(&atom_arr[8], static_cast<int>(1u << (tid & 31u)));

    // Atomic XOR
    atomicXor_system(&atom_arr[9], tid);
  }
}

/**
 * Host side of the test: emulates no_of_threads additional "threads" with ids
 * in [no_of_threads, 2 * no_of_threads), each performing LOOP_NUM rounds of
 * the same ten atomic operations that atomicKernel performs, using the
 * compiler's __sync_* builtins.
 *
 * Operations without a direct builtin (max, min, wrapping increment and
 * decrement) are built from a compare-and-swap retry loop: read the current
 * value, compute the new one, and retry if another party changed the element
 * in between.
 *
 * @param atom_arr       array of at least 10 ints shared with the GPU kernel
 * @param no_of_threads  number of emulated threads; also the first id used
 */
void atomicKernel_CPU(int *atom_arr, int no_of_threads) {
  for (int i = no_of_threads; i < 2 * no_of_threads; i++) {
    for (int j = 0; j < LOOP_NUM; j++) {
      // Atomic addition
      __sync_fetch_and_add(&atom_arr[0], 10);

      // Atomic exchange
      __sync_lock_test_and_set(&atom_arr[1], i);

      // Atomic maximum
      int old, expected;
      do {
        expected = atom_arr[2];
        old = __sync_val_compare_and_swap(&atom_arr[2], expected,
                                          (expected > i) ? expected : i);
      } while (old != expected);

      // Atomic minimum
      do {
        expected = atom_arr[3];
        old = __sync_val_compare_and_swap(&atom_arr[3], expected,
                                          (expected < i) ? expected : i);
      } while (old != expected);

      // Atomic increment (modulo 17+1)
      int limit = 17;
      do {
        expected = atom_arr[4];
        old = __sync_val_compare_and_swap(
            &atom_arr[4], expected, (expected >= limit) ? 0 : expected + 1);
      } while (old != expected);

      // Atomic decrement (resets to the limit at 0 or when above the limit)
      limit = 137;
      do {
        expected = atom_arr[5];
        old = __sync_val_compare_and_swap(
            &atom_arr[5], expected,
            ((expected == 0) || (expected > limit)) ? limit : expected - 1);
      } while (old != expected);

      // Atomic compare-and-swap
      __sync_val_compare_and_swap(&atom_arr[6], i - 1, i);

      // Bitwise atomic instructions

      // Atomic AND
      __sync_fetch_and_and(&atom_arr[7], 2 * i + 7);

      // Atomic OR
      // Shifting a 32-bit value by 32 or more is undefined behavior, so only
      // the low five bits of the id select which bit is set.
      __sync_fetch_and_or(&atom_arr[8], static_cast<int>(1u << (i & 31)));

      // Atomic XOR
      __sync_fetch_and_xor(&atom_arr[9], i);
    }
  }
}

////////////////////////////////////////////////////////////////////////////////
//! Check the final contents of the atomics array against values recomputed
//! sequentially on the host.
//! Elements whose final value depends on execution order (exchange and
//! compare-and-swap) are only checked for being a valid thread id.
//! @param testData  array of at least 10 ints holding the results to check
//! @param len       total number of logical threads (ids 0 .. len-1) that
//!                  performed the atomic operations
//! @return true (non-zero) if all ten checks pass; false (0) after printing
//!         the name of the first failing operation
////////////////////////////////////////////////////////////////////////////////
int verify(int *testData, const int len) {
  // [0] atomicAdd: one +10 per logical thread per round.
  int val = 0;

  for (int i = 0; i < len * LOOP_NUM; ++i) {
    val += 10;
  }

  if (val != testData[0]) {
    printf("atomicAdd failed val = %d testData = %d\n", val, testData[0]);
    return false;
  }

  // [1] atomicExch: second element should be a member of [0, len)
  if (testData[1] < 0 || testData[1] >= len) {
    printf("atomicExch failed\n");
    return false;
  }

  // [2] atomicMax: third element should be len-1
  val = -(1 << 8);

  for (int i = 0; i < len; ++i) {
    if (i > val) val = i;
  }

  if (val != testData[2]) {
    printf("atomicMax failed\n");
    return false;
  }

  // [3] atomicMin: fourth element should be the smallest id
  val = 1 << 8;

  for (int i = 0; i < len; ++i) {
    if (i < val) val = i;
  }

  if (val != testData[3]) {
    printf("atomicMin failed\n");
    return false;
  }

  // [4] atomicInc: counter that wraps to 0 after reaching the limit
  int limit = 17;
  val = 0;

  for (int i = 0; i < len * LOOP_NUM; ++i) {
    val = (val >= limit) ? 0 : val + 1;
  }

  if (val != testData[4]) {
    printf("atomicInc failed\n");
    return false;
  }

  // [5] atomicDec: counter that resets to the limit at 0 (or when above it)
  limit = 137;
  val = 0;

  for (int i = 0; i < len * LOOP_NUM; ++i) {
    val = ((val == 0) || (val > limit)) ? limit : val - 1;
  }

  if (val != testData[5]) {
    printf("atomicDec failed\n");
    return false;
  }

  // [6] atomicCAS: seventh element should be a member of [0, len)
  if (testData[6] < 0 || testData[6] >= len) {
    printf("atomicCAS failed\n");
    return false;
  }

  // [7] atomicAnd: 0xff AND-ed with 2*i+7 for every id (1 once len >= 2)
  val = 0xff;

  for (int i = 0; i < len; ++i) {
    val &= (2 * i + 7);
  }

  if (val != testData[7]) {
    printf("atomicAnd failed\n");
    return false;
  }

  // [8] atomicOr: one bit per id. Shifting a 32-bit value by 32 or more is
  // undefined behavior, so the bit index is the id modulo 32; all 32 bits are
  // set once len >= 32.
  val = 0;

  for (int i = 0; i < len; ++i) {
    val |= static_cast<int>(1u << (i & 31));
  }

  if (val != testData[8]) {
    printf("atomicOr failed\n");
    return false;
  }

  // [9] atomicXor: 0xff XOR-ed with every id (stays 0xff when len is a
  // multiple of 4, because the ids then cancel out)
  val = 0xff;

  for (int i = 0; i < len; ++i) {
    val ^= i;
  }

  if (val != testData[9]) {
    printf("atomicXor failed\n");
    return false;
  }

  return true;
}

int main(int argc, char **argv) {
  // set device
  cudaDeviceProp device_prop;
  int dev_id = findCudaDevice(argc, (const char **)argv);
  checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id));

  if (!device_prop.managedMemory) {
    // This samples requires being run on a device that supports Unified Memory
    fprintf(stderr, "Unified Memory not supported on this device\n");
    exit(EXIT_WAIVED);
  }

  if (device_prop.computeMode == cudaComputeModeProhibited) {
    // This sample requires being run with a default or process exclusive mode
    fprintf(stderr,
            "This sample requires a device in either default or process "
            "exclusive mode\n");
    exit(EXIT_WAIVED);
  }

  if (device_prop.major < 6) {
    printf(
        "%s: requires a minimum CUDA compute 6.0 capability, waiving "
        "testing.\n",
        argv[0]);
    exit(EXIT_WAIVED);
  }

  unsigned int numThreads = 256;
  unsigned int numBlocks = 64;
  unsigned int numData = 10;

  int *atom_arr;

  if (device_prop.pageableMemoryAccess) {
    printf("CAN access pageable memory\n");
    atom_arr = (int *)malloc(sizeof(int) * numData);
  } else {
    printf("CANNOT access pageable memory\n");
    checkCudaErrors(cudaMallocManaged(&atom_arr, sizeof(int) * numData));
  }

  for (unsigned int i = 0; i < numData; i++) atom_arr[i] = 0;

  // To make the AND and XOR tests generate something other than 0...
  atom_arr[7] = atom_arr[9] = 0xff;

  atomicKernel<<<numBlocks, numThreads>>>(atom_arr);
  atomicKernel_CPU(atom_arr, numBlocks * numThreads);

  checkCudaErrors(cudaDeviceSynchronize());

  // Compute & verify reference solution
  int testResult = verify(atom_arr, 2 * numThreads * numBlocks);

  if (device_prop.pageableMemoryAccess) {
    free(atom_arr);
  } else {
    cudaFree(atom_arr);
  }

  printf("systemWideAtomics completed, returned %s \n",
         testResult ? "OK" : "ERROR!");
  exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
Add and update samples with CUDA 10.1 Update 1 support 2019-04-10 22:42:09 +08:00			`/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.`
Add and Update samples for CUDA 10.0 2018-08-25 01:05:15 +08:00			`*`
			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions`
			`* are met:`
			`* * Redistributions of source code must retain the above copyright`
			`* notice, this list of conditions and the following disclaimer.`
			`* * Redistributions in binary form must reproduce the above copyright`
			`* notice, this list of conditions and the following disclaimer in the`
			`* documentation and/or other materials provided with the distribution.`
			`* * Neither the name of NVIDIA CORPORATION nor the names of its`
			`* contributors may be used to endorse or promote products derived`
			`* from this software without specific prior written permission.`
			`*`
			* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
			`* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR`
			`* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR`
			`* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,`
			`* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,`
			`* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR`
			`* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY`
			`* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`*/`

			`/* A program demonstrating trivial use of system-wide atomics on migratable`
			`* memory.`
			`*/`

			`#include <cuda_runtime.h>`
			`#include <helper_cuda.h>`
			`#include <math.h>`
			`#include <stdint.h>`
			`#include <cstdio>`
			`#include <ctime>`

			`#define min(a, b) (a) < (b) ? (a) : (b)`
			`#define max(a, b) (a) > (b) ? (a) : (b)`

			`#define LOOP_NUM 50`

			`__global__ void atomicKernel(int *atom_arr) {`
			`unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;`

			`for (int i = 0; i < LOOP_NUM; i++) {`
			`// Atomic addition`
			`atomicAdd_system(&atom_arr[0], 10);`

			`// Atomic exchange`
			`atomicExch_system(&atom_arr[1], tid);`

			`// Atomic maximum`
			`atomicMax_system(&atom_arr[2], tid);`

			`// Atomic minimum`
			`atomicMin_system(&atom_arr[3], tid);`

			`// Atomic increment (modulo 17+1)`
			`atomicInc_system((unsigned int *)&atom_arr[4], 17);`

			`// Atomic decrement`
			`atomicDec_system((unsigned int *)&atom_arr[5], 137);`

			`// Atomic compare-and-swap`
			`atomicCAS_system(&atom_arr[6], tid - 1, tid);`

			`// Bitwise atomic instructions`

			`// Atomic AND`
			`atomicAnd_system(&atom_arr[7], 2 * tid + 7);`

			`// Atomic OR`
			`atomicOr_system(&atom_arr[8], 1 << tid);`

			`// Atomic XOR`
			`atomicXor_system(&atom_arr[9], tid);`
			`}`
			`}`

			`void atomicKernel_CPU(int *atom_arr, int no_of_threads) {`
			`for (int i = no_of_threads; i < 2 * no_of_threads; i++) {`
			`for (int j = 0; j < LOOP_NUM; j++) {`
			`// Atomic addition`
			`__sync_fetch_and_add(&atom_arr[0], 10);`

			`// Atomic exchange`
			`__sync_lock_test_and_set(&atom_arr[1], i);`

			`// Atomic maximum`
			`int old, expected;`
			`do {`
			`expected = atom_arr[2];`
			`old = __sync_val_compare_and_swap(&atom_arr[2], expected,`
			`max(expected, i));`
			`} while (old != expected);`

			`// Atomic minimum`
			`do {`
			`expected = atom_arr[3];`
			`old = __sync_val_compare_and_swap(&atom_arr[3], expected,`
			`min(expected, i));`
			`} while (old != expected);`

			`// Atomic increment (modulo 17+1)`
			`int limit = 17;`
			`do {`
			`expected = atom_arr[4];`
			`old = __sync_val_compare_and_swap(`
			`&atom_arr[4], expected, (expected >= limit) ? 0 : expected + 1);`
			`} while (old != expected);`

			`// Atomic decrement`
			`limit = 137;`
			`do {`
			`expected = atom_arr[5];`
			`old = __sync_val_compare_and_swap(`
			`&atom_arr[5], expected,`
			`((expected == 0) \|\| (expected > limit)) ? limit : expected - 1);`
			`} while (old != expected);`

			`// Atomic compare-and-swap`
			`__sync_val_compare_and_swap(&atom_arr[6], i - 1, i);`

			`// Bitwise atomic instructions`

			`// Atomic AND`
			`__sync_fetch_and_and(&atom_arr[7], 2 * i + 7);`

			`// Atomic OR`
			`__sync_fetch_and_or(&atom_arr[8], 1 << i);`

			`// Atomic XOR`
			`// 11th element should be 0xff`
			`__sync_fetch_and_xor(&atom_arr[9], i);`
			`}`
			`}`
			`}`

			`////////////////////////////////////////////////////////////////////////////////`
			`//! Compute reference data set`
			`//! Each element is multiplied with the number of threads / array length`
			`//! @param reference reference data, computed but preallocated`
			`//! @param idata input data as provided to device`
			`//! @param len number of elements in reference / idata`
			`////////////////////////////////////////////////////////////////////////////////`
			`int verify(int *testData, const int len) {`
			`int val = 0;`

			`for (int i = 0; i < len * LOOP_NUM; ++i) {`
			`val += 10;`
			`}`

			`if (val != testData[0]) {`
			`printf("atomicAdd failed val = %d testData = %d\n", val, testData[0]);`
			`return false;`
			`}`

			`val = 0;`

			`bool found = false;`

			`for (int i = 0; i < len; ++i) {`
			`// second element should be a member of [0, len)`
			`if (i == testData[1]) {`
			`found = true;`
			`break;`
			`}`
			`}`

			`if (!found) {`
			`printf("atomicExch failed\n");`
			`return false;`
			`}`

			`val = -(1 << 8);`

			`for (int i = 0; i < len; ++i) {`
			`// third element should be len-1`
			`val = max(val, i);`
			`}`

			`if (val != testData[2]) {`
			`printf("atomicMax failed\n");`
			`return false;`
			`}`

			`val = 1 << 8;`

			`for (int i = 0; i < len; ++i) {`
			`val = min(val, i);`
			`}`

			`if (val != testData[3]) {`
			`printf("atomicMin failed\n");`
			`return false;`
			`}`

			`int limit = 17;`
			`val = 0;`

			`for (int i = 0; i < len * LOOP_NUM; ++i) {`
			`val = (val >= limit) ? 0 : val + 1;`
			`}`

			`if (val != testData[4]) {`
			`printf("atomicInc failed\n");`
			`return false;`
			`}`

			`limit = 137;`
			`val = 0;`

			`for (int i = 0; i < len * LOOP_NUM; ++i) {`
			`val = ((val == 0) \|\| (val > limit)) ? limit : val - 1;`
			`}`

			`if (val != testData[5]) {`
			`printf("atomicDec failed\n");`
			`return false;`
			`}`

			`found = false;`

			`for (int i = 0; i < len; ++i) {`
			`// seventh element should be a member of [0, len)`
			`if (i == testData[6]) {`
			`found = true;`
			`break;`
			`}`
			`}`

			`if (!found) {`
			`printf("atomicCAS failed\n");`
			`return false;`
			`}`

			`val = 0xff;`

			`for (int i = 0; i < len; ++i) {`
			`// 8th element should be 1`
			`val &= (2 * i + 7);`
			`}`

			`if (val != testData[7]) {`
			`printf("atomicAnd failed\n");`
			`return false;`
			`}`

			`val = 0;`

			`for (int i = 0; i < len; ++i) {`
			`// 9th element should be 0xff`
			`val \|= (1 << i);`
			`}`

			`if (val != testData[8]) {`
			`printf("atomicOr failed\n");`
			`return false;`
			`}`

			`val = 0xff;`

			`for (int i = 0; i < len; ++i) {`
			`// 11th element should be 0xff`
			`val ^= i;`
			`}`

			`if (val != testData[9]) {`
			`printf("atomicXor failed\n");`
			`return false;`
			`}`

			`return true;`
			`}`

			`int main(int argc, char **argv) {`
			`// set device`
			`cudaDeviceProp device_prop;`
			`int dev_id = findCudaDevice(argc, (const char **)argv);`
			`checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id));`

			`if (!device_prop.managedMemory) {`
			`// This samples requires being run on a device that supports Unified Memory`
			`fprintf(stderr, "Unified Memory not supported on this device\n");`
			`exit(EXIT_WAIVED);`
			`}`

			`if (device_prop.computeMode == cudaComputeModeProhibited) {`
			`// This sample requires being run with a default or process exclusive mode`
			`fprintf(stderr,`
			`"This sample requires a device in either default or process "`
			`"exclusive mode\n");`
			`exit(EXIT_WAIVED);`
			`}`

			`if (device_prop.major < 6) {`
			`printf(`
			`"%s: requires a minimum CUDA compute 6.0 capability, waiving "`
			`"testing.\n",`
			`argv[0]);`
			`exit(EXIT_WAIVED);`
			`}`

			`unsigned int numThreads = 256;`
			`unsigned int numBlocks = 64;`
			`unsigned int numData = 10;`

			`int *atom_arr;`

			`if (device_prop.pageableMemoryAccess) {`
			`printf("CAN access pageable memory\n");`
			`atom_arr = (int *)malloc(sizeof(int) * numData);`
			`} else {`
			`printf("CANNOT access pageable memory\n");`
			`checkCudaErrors(cudaMallocManaged(&atom_arr, sizeof(int) * numData));`
			`}`

			`for (unsigned int i = 0; i < numData; i++) atom_arr[i] = 0;`

			`// To make the AND and XOR tests generate something other than 0...`
			`atom_arr[7] = atom_arr[9] = 0xff;`

			`atomicKernel<<<numBlocks, numThreads>>>(atom_arr);`
			`atomicKernel_CPU(atom_arr, numBlocks * numThreads);`

			`checkCudaErrors(cudaDeviceSynchronize());`

			`// Compute & verify reference solution`
			`int testResult = verify(atom_arr, 2 * numThreads * numBlocks);`

			`if (device_prop.pageableMemoryAccess) {`
			`free(atom_arr);`
			`} else {`
			`cudaFree(atom_arr);`
			`}`

			`printf("systemWideAtomics completed, returned %s \n",`
			`testResult ? "OK" : "ERROR!");`
			`exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);`
			`}`