cuda-samples/Samples/2_Concepts_and_Techniques/streamOrderedAllocationP2P/streamOrderedAllocationP2P.cu

/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This sample demonstrates peer-to-peer access to stream-ordered memory
 * allocated with cudaMallocAsync and the cudaMemPool family of APIs, using a
 * simple kernel that reads a vector from a peer device and scales its elements.
 */

// System includes
#include <assert.h>
#include <stdio.h>
#include <iostream>
#include <map>
#include <set>
#include <utility>

// CUDA runtime
#include <cuda_runtime.h>

// helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>

// Kernel used to demonstrate reading cudaMallocAsync memory via P2P from a
// peer device: each thread loads one element of `src`, doubles it and stores
// the result at the same index in `dst`.
//
// Indexing uses only the x dimension of the grid and block. Threads whose
// global index is >= N do nothing.
__global__ void copyP2PAndScale(const int *src, int *dst, int N) {
  const int gid = threadIdx.x + blockDim.x * blockIdx.x;

  // Threads past the end of the vector have no element to process.
  if (gid >= N) return;

  // scale & store src vector.
  dst[gid] = src[gid] * 2;
}

// Groups the memory-pool-capable GPUs in the system by compute capability.
//
// Returns a multimap keyed by (major, minor) compute capability whose values
// are the device ordinals reporting that capability. Devices for which
// cudaDevAttrMemoryPoolsSupported is 0 are left out.
std::multimap<std::pair<int, int>, int> getIdenticalGPUs() {
  std::multimap<std::pair<int, int>, int> gpusByVersion;

  int deviceCount = 0;
  checkCudaErrors(cudaGetDeviceCount(&deviceCount));

  for (int dev = 0; dev < deviceCount; ++dev) {
    int poolsSupported = 0;
    checkCudaErrors(cudaDeviceGetAttribute(
        &poolsSupported, cudaDevAttrMemoryPoolsSupported, dev));

    // Skip devices that do not support memory pools.
    if (!poolsSupported) continue;

    int ccMajor = 0;
    int ccMinor = 0;
    checkCudaErrors(cudaDeviceGetAttribute(
        &ccMajor, cudaDevAttrComputeCapabilityMajor, dev));
    checkCudaErrors(cudaDeviceGetAttribute(
        &ccMinor, cudaDevAttrComputeCapabilityMinor, dev));

    gpusByVersion.emplace(std::make_pair(ccMajor, ccMinor), dev);
  }

  return gpusByVersion;
}

// Selects two GPUs of identical compute capability that support memory pools
// and can access each other's memory peer-to-peer.
//
// The architecture group is chosen from getIdenticalGPUs(): the group with the
// most devices wins, and ties go to the higher compute capability. Within that
// group, the first pair of devices reporting peer access in BOTH directions is
// returned as (lower device id, higher device id).
//
// Exits the process with EXIT_WAIVED when no architecture group has at least
// two devices, or when no pair in the chosen group is P2P capable.
std::pair<int, int> getP2PCapableGpuPair() {
  constexpr size_t kNumGpusRequired = 2;

  auto gpusByArch = getIdenticalGPUs();

  auto it = gpusByArch.begin();
  auto end = gpusByArch.end();

  auto bestFit = std::make_pair(it, it);
  // Number of devices in an architecture group. Converted to size_t so the
  // comparisons against kNumGpusRequired do not mix signed and unsigned.
  auto groupSize = [](decltype(bestFit) p) {
    return static_cast<size_t>(std::distance(p.first, p.second));
  };

  // Visit each unique key once, in ascending compute-capability order.
  for (; it != end; it = gpusByArch.upper_bound(it->first)) {
    // first and second are iterators bounded within the architecture group
    auto testFit = gpusByArch.equal_range(it->first);
    // "<=" lets a later (higher) architecture win when group sizes are equal.
    if (groupSize(bestFit) <= groupSize(testFit)) bestFit = testFit;
  }

  if (groupSize(bestFit) < kNumGpusRequired) {
    printf(
        "No Two or more GPUs with same architecture capable of cuda Memory "
        "Pools found."
        "\nWaiving the sample\n");
    exit(EXIT_WAIVED);
  }

  // Ordered set, so iteration yields the selected device ids in ascending
  // order.
  std::set<int> bestFitDeviceIds;

  // check & select peer-to-peer access capable GPU devices.
  for (auto itr = bestFit.first; itr != bestFit.second; ++itr) {
    const int deviceId = itr->second;
    checkCudaErrors(cudaSetDevice(deviceId));

    // Only devices after `itr` are tested; pairs with earlier devices were
    // already examined in previous iterations of the outer loop.
    auto peerItr = itr;
    for (++peerItr; peerItr != bestFit.second; ++peerItr) {
      const int peerId = peerItr->second;

      int access = 0;
      checkCudaErrors(cudaDeviceCanAccessPeer(&access, deviceId, peerId));
      printf("Device=%d %s Access Peer Device=%d\n", deviceId,
             access ? "CAN" : "CANNOT", peerId);

      // The sample places the source buffer on the lower-numbered device and
      // reads it from the higher-numbered one, so the reverse direction must
      // be accessible as well.
      int reverseAccess = 0;
      checkCudaErrors(
          cudaDeviceCanAccessPeer(&reverseAccess, peerId, deviceId));
      printf("Device=%d %s Access Peer Device=%d\n", peerId,
             reverseAccess ? "CAN" : "CANNOT", deviceId);

      // Not a usable pair; the CAN/CANNOT lines above already explain why.
      if (!(access && reverseAccess)) continue;

      if (bestFitDeviceIds.size() < kNumGpusRequired) {
        bestFitDeviceIds.emplace(deviceId);
        bestFitDeviceIds.emplace(peerId);
      } else {
        // A pair has already been chosen; further capable peers are unused.
        printf("Ignoring device %i (max devices exceeded)\n", peerId);
      }
    }

    if (bestFitDeviceIds.size() >= kNumGpusRequired) {
      printf("Selected p2p capable devices - ");
      for (int selectedId : bestFitDeviceIds) {
        printf("deviceId = %d  ", selectedId);
      }
      printf("\n");
      break;
    }
  }

  // Fewer than two selected devices means no pair in the chosen architecture
  // group is p2p capable, so the sample cannot run and is waived.
  if (bestFitDeviceIds.size() < kNumGpusRequired) {
    printf("No Two or more Devices p2p capable found.. exiting..\n");
    exit(EXIT_WAIVED);
  }

  // Devices are only ever inserted as a pair, so the set holds exactly two
  // ids here.
  auto selected = bestFitDeviceIds.begin();
  const int firstDevice = *selected;
  ++selected;
  const int secondDevice = *selected;

  return std::make_pair(firstDevice, secondDevice);
}

// Runs the sample end to end:
//   1. fills a host vector and uploads it to a cudaMallocAsync allocation on
//      the first selected GPU (stream1),
//   2. grants the second selected GPU read/write access to the first GPU's
//      default memory pool,
//   3. launches copyP2PAndScale on the second GPU (stream2), which reads the
//      first GPU's buffer and writes the doubled values to a local buffer,
//   4. copies the result back and verifies it on the host.
//
// Returns EXIT_SUCCESS when every output element equals twice its input, and
// EXIT_FAILURE on the first mismatch or when host allocation fails. Results of
// CUDA API calls are passed to checkCudaErrors (from helper_cuda.h).
int memPoolP2PCopy() {
  int *dev0_srcVec = nullptr, *dev1_dstVec = nullptr;  // Device buffers
  cudaStream_t stream1, stream2;
  cudaMemPool_t memPool;
  cudaEvent_t waitOnStream1;

  // Allocate CPU memory.
  const size_t nelem = 1048576;
  const size_t bytes = nelem * sizeof(int);

  int *a = (int *)malloc(bytes);
  int *output = (int *)malloc(bytes);
  if (a == NULL || output == NULL) {
    fprintf(stderr, "Failed to allocate host buffers\n");
    free(a);
    free(output);
    return EXIT_FAILURE;
  }

  /* Initialize the vectors. */
  // Use small non-zero values: the previous `rand() / (int)RAND_MAX` was an
  // integer division that produced 0 for everything except rand() == RAND_MAX,
  // so a zeroed output buffer would have passed verification. Keeping values
  // below 1024 guarantees 2 * a[n] cannot overflow.
  for (size_t n = 0; n < nelem; n++) {
    a[n] = rand() % 1024;
  }

  auto p2pDevices = getP2PCapableGpuPair();
  printf("selected devices = %d & %d\n", p2pDevices.first, p2pDevices.second);
  checkCudaErrors(cudaSetDevice(p2pDevices.first));
  checkCudaErrors(cudaEventCreate(&waitOnStream1));

  checkCudaErrors(cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking));

  // Get the default mempool for device p2pDevices.first from the pair
  checkCudaErrors(cudaDeviceGetDefaultMemPool(&memPool, p2pDevices.first));

  // Allocate memory in a stream from the pool set above.
  checkCudaErrors(cudaMallocAsync(&dev0_srcVec, bytes, stream1));

  checkCudaErrors(
      cudaMemcpyAsync(dev0_srcVec, a, bytes, cudaMemcpyHostToDevice, stream1));
  // Marks the point in stream1 after which the source data is in place.
  checkCudaErrors(cudaEventRecord(waitOnStream1, stream1));

  checkCudaErrors(cudaSetDevice(p2pDevices.second));
  checkCudaErrors(cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking));

  // Allocate memory in p2pDevices.second device
  checkCudaErrors(cudaMallocAsync(&dev1_dstVec, bytes, stream2));

  // Setup peer mappings for p2pDevices.second device
  cudaMemAccessDesc desc;
  memset(&desc, 0, sizeof(cudaMemAccessDesc));
  desc.location.type = cudaMemLocationTypeDevice;
  desc.location.id = p2pDevices.second;
  desc.flags = cudaMemAccessFlagsProtReadWrite;
  checkCudaErrors(cudaMemPoolSetAccess(memPool, &desc, 1));

  printf("> copyP2PAndScale kernel running ...\n");
  const unsigned int threadsPerBlock = 256;
  dim3 block(threadsPerBlock);
  // Integer ceiling division so a partial final block is still launched.
  dim3 grid((unsigned int)((nelem + threadsPerBlock - 1) / threadsPerBlock));
  // stream2 must not read the source buffer before stream1's upload is done.
  checkCudaErrors(cudaStreamWaitEvent(stream2, waitOnStream1));
  copyP2PAndScale<<<grid, block, 0, stream2>>>(dev0_srcVec, dev1_dstVec,
                                               (int)nelem);
  // A kernel launch returns no status itself; pick up launch errors here.
  checkCudaErrors(cudaGetLastError());

  checkCudaErrors(cudaMemcpyAsync(output, dev1_dstVec, bytes,
                                  cudaMemcpyDeviceToHost, stream2));
  checkCudaErrors(cudaFreeAsync(dev0_srcVec, stream2));
  checkCudaErrors(cudaFreeAsync(dev1_dstVec, stream2));
  // Host may only read `output` after all work queued on stream2 completes.
  checkCudaErrors(cudaStreamSynchronize(stream2));

  /* Compare the results */
  printf("> Checking the results from copyP2PAndScale() ...\n");

  int status = EXIT_SUCCESS;
  for (size_t n = 0; n < nelem; n++) {
    if ((2 * a[n]) != output[n]) {
      printf("mismatch i = %d expected = %d val = %d\n", (int)n, 2 * a[n],
             output[n]);
      status = EXIT_FAILURE;
      break;
    }
  }

  // Release host and CUDA resources on both the success and failure paths.
  free(a);
  free(output);
  checkCudaErrors(cudaEventDestroy(waitOnStream1));
  checkCudaErrors(cudaStreamDestroy(stream1));
  checkCudaErrors(cudaStreamDestroy(stream2));

  if (status == EXIT_SUCCESS) {
    printf("PASSED\n");
  }

  return status;
}

// Program entry point: runs the memory pool P2P copy sample and returns its
// status as the process exit code. Command-line arguments are not used.
int main(int argc, char **argv) {
  return memPoolP2PCopy();
}
add and update samples for CUDA 11.6 2022-01-13 14:05:24 +08:00			`/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.`
Add and update samples with CUDA 11.2 support 2020-12-10 03:35:32 +08:00			`*`
			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions`
			`* are met:`
			`* * Redistributions of source code must retain the above copyright`
			`* notice, this list of conditions and the following disclaimer.`
			`* * Redistributions in binary form must reproduce the above copyright`
			`* notice, this list of conditions and the following disclaimer in the`
			`* documentation and/or other materials provided with the distribution.`
			`* * Neither the name of NVIDIA CORPORATION nor the names of its`
			`* contributors may be used to endorse or promote products derived`
			`* from this software without specific prior written permission.`
			`*`
			* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
			`* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR`
			`* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR`
			`* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,`
			`* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,`
			`* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR`
			`* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY`
			`* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`*/`

			`/*`
			`* This sample demonstrates peer-to-peer access of stream ordered memory`
			`* allocated with cudaMallocAsync and cudaMemPool family of APIs through simple`
			`* kernel which does peer-to-peer to access & scales vector elements.`
			`*/`

			`// System includes`
			`#include <assert.h>`
			`#include <stdio.h>`
			`#include <iostream>`
			`#include <map>`
			`#include <set>`
			`#include <utility>`

			`// CUDA runtime`
			`#include <cuda_runtime.h>`

			`// helper functions and utilities to work with CUDA`
			`#include <helper_cuda.h>`
			`#include <helper_functions.h>`

			`// Simple kernel to demonstrate copying cudaMallocAsync memory via P2P to peer`
			`// device`
			`__global__ void copyP2PAndScale(const int src, int dst, int N) {`
			`int idx = blockIdx.x * blockDim.x + threadIdx.x;`

			`if (idx < N) {`
			`// scale & store src vector.`
			`dst[idx] = 2 * src[idx];`
			`}`
			`}`

			`// Map of device version to device number`
			`std::multimap<std::pair<int, int>, int> getIdenticalGPUs() {`
			`int numGpus = 0;`
			`checkCudaErrors(cudaGetDeviceCount(&numGpus));`

			`std::multimap<std::pair<int, int>, int> identicalGpus;`

			`for (int i = 0; i < numGpus; i++) {`
			`int isMemPoolSupported = 0;`
			`checkCudaErrors(cudaDeviceGetAttribute(&isMemPoolSupported,`
			`cudaDevAttrMemoryPoolsSupported, i));`

			`// Filter unsupported devices`
			`if (isMemPoolSupported) {`
			`int major = 0, minor = 0;`
			`checkCudaErrors(`
			`cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, i));`
			`checkCudaErrors(`
			`cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, i));`
			`identicalGpus.emplace(std::make_pair(major, minor), i);`
			`}`
			`}`

			`return identicalGpus;`
			`}`

			`std::pair<int, int> getP2PCapableGpuPair() {`
			`constexpr size_t kNumGpusRequired = 2;`

			`auto gpusByArch = getIdenticalGPUs();`

			`auto it = gpusByArch.begin();`
			`auto end = gpusByArch.end();`

			`auto bestFit = std::make_pair(it, it);`
			`// use std::distance to find the largest number of GPUs amongst architectures`
			`auto distance = [](decltype(bestFit) p) {`
			`return std::distance(p.first, p.second);`
			`};`

			`// Read each unique key/pair element in order`
			`for (; it != end; it = gpusByArch.upper_bound(it->first)) {`
			`// first and second are iterators bounded within the architecture group`
			`auto testFit = gpusByArch.equal_range(it->first);`
			`// Always use devices with highest architecture version or whichever has the`
			`// most devices available`
			`if (distance(bestFit) <= distance(testFit)) bestFit = testFit;`
			`}`

			`if (distance(bestFit) < kNumGpusRequired) {`
			`printf(`
			`"No Two or more GPUs with same architecture capable of cuda Memory "`
			`"Pools found."`
			`"\nWaiving the sample\n");`
			`exit(EXIT_WAIVED);`
			`}`

			`std::set<int> bestFitDeviceIds;`

			`// check & select peer-to-peer access capable GPU devices.`
			`int devIds[2];`
			`for (auto itr = bestFit.first; itr != bestFit.second; itr++) {`
			`int deviceId = itr->second;`
			`checkCudaErrors(cudaSetDevice(deviceId));`

add and update samples with CUDA 11.3 support 2021-04-16 14:24:26 +08:00			`std::for_each(itr, bestFit.second, [&deviceId, &bestFitDeviceIds,`
			`&kNumGpusRequired](`
			`decltype(*itr) mapPair) {`
			`if (deviceId != mapPair.second) {`
			`int access = 0;`
			`checkCudaErrors(`
			`cudaDeviceCanAccessPeer(&access, deviceId, mapPair.second));`
			`printf("Device=%d %s Access Peer Device=%d\n", deviceId,`
			`access ? "CAN" : "CANNOT", mapPair.second);`
			`if (access && bestFitDeviceIds.size() < kNumGpusRequired) {`
			`bestFitDeviceIds.emplace(deviceId);`
			`bestFitDeviceIds.emplace(mapPair.second);`
			`} else {`
			`printf("Ignoring device %i (max devices exceeded)\n", mapPair.second);`
			`}`
			`}`
			`});`
Add and update samples with CUDA 11.2 support 2020-12-10 03:35:32 +08:00
			`if (bestFitDeviceIds.size() >= kNumGpusRequired) {`
			`printf("Selected p2p capable devices - ");`
			`int i = 0;`
			`for (auto devicesItr = bestFitDeviceIds.begin();`
			`devicesItr != bestFitDeviceIds.end(); devicesItr++) {`
			`devIds[i++] = *devicesItr;`
			`printf("deviceId = %d ", *devicesItr);`
			`}`
			`printf("\n");`
			`break;`
			`}`
			`}`

			`// if bestFitDeviceIds.size() == 0 it means the GPUs in system are not p2p`
			`// capable, hence we add it without p2p capability check.`
			`if (!bestFitDeviceIds.size()) {`
			`printf("No Two or more Devices p2p capable found.. exiting..\n");`
			`exit(EXIT_WAIVED);`
			`}`

			`auto p2pGpuPair = std::make_pair(devIds[0], devIds[1]);`

			`return p2pGpuPair;`
			`}`

			`int memPoolP2PCopy() {`
			`int dev0_srcVec, dev1_dstVec; // Device buffers`
			`cudaStream_t stream1, stream2;`
			`cudaMemPool_t memPool;`
			`cudaEvent_t waitOnStream1;`

			`// Allocate CPU memory.`
			`size_t nelem = 1048576;`
			`size_t bytes = nelem * sizeof(int);`

			`int a = (int )malloc(bytes);`
			`int output = (int )malloc(bytes);`

			`/* Initialize the vectors. */`
			`for (int n = 0; n < nelem; n++) {`
			`a[n] = rand() / (int)RAND_MAX;`
			`}`

			`auto p2pDevices = getP2PCapableGpuPair();`
			`printf("selected devices = %d & %d\n", p2pDevices.first, p2pDevices.second);`
			`checkCudaErrors(cudaSetDevice(p2pDevices.first));`
			`checkCudaErrors(cudaEventCreate(&waitOnStream1));`

			`checkCudaErrors(cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking));`

			`// Get the default mempool for device p2pDevices.first from the pair`
			`checkCudaErrors(cudaDeviceGetDefaultMemPool(&memPool, p2pDevices.first));`

			`// Allocate memory in a stream from the pool set above.`
			`checkCudaErrors(cudaMallocAsync(&dev0_srcVec, bytes, stream1));`

			`checkCudaErrors(`
			`cudaMemcpyAsync(dev0_srcVec, a, bytes, cudaMemcpyHostToDevice, stream1));`
			`checkCudaErrors(cudaEventRecord(waitOnStream1, stream1));`

			`checkCudaErrors(cudaSetDevice(p2pDevices.second));`
			`checkCudaErrors(cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking));`

			`// Allocate memory in p2pDevices.second device`
			`checkCudaErrors(cudaMallocAsync(&dev1_dstVec, bytes, stream2));`

			`// Setup peer mappings for p2pDevices.second device`
			`cudaMemAccessDesc desc;`
			`memset(&desc, 0, sizeof(cudaMemAccessDesc));`
			`desc.location.type = cudaMemLocationTypeDevice;`
			`desc.location.id = p2pDevices.second;`
			`desc.flags = cudaMemAccessFlagsProtReadWrite;`
			`checkCudaErrors(cudaMemPoolSetAccess(memPool, &desc, 1));`

			`printf("> copyP2PAndScale kernel running ...\n");`
			`dim3 block(256);`
			`dim3 grid((unsigned int)ceil(nelem / (int)block.x));`
			`checkCudaErrors(cudaStreamWaitEvent(stream2, waitOnStream1));`
			`copyP2PAndScale<<<grid, block, 0, stream2>>>(dev0_srcVec, dev1_dstVec, nelem);`

			`checkCudaErrors(cudaMemcpyAsync(output, dev1_dstVec, bytes,`
			`cudaMemcpyDeviceToHost, stream2));`
			`checkCudaErrors(cudaFreeAsync(dev0_srcVec, stream2));`
			`checkCudaErrors(cudaFreeAsync(dev1_dstVec, stream2));`
			`checkCudaErrors(cudaStreamSynchronize(stream2));`

			`/* Compare the results */`
			`printf("> Checking the results from copyP2PAndScale() ...\n");`

			`for (int n = 0; n < nelem; n++) {`
			`if ((2 * a[n]) != output[n]) {`
			`printf("mismatch i = %d expected = %d val = %d\n", n, 2 * a[n],`
			`output[n]);`
			`return EXIT_FAILURE;`
			`}`
			`}`

			`free(a);`
			`free(output);`
			`checkCudaErrors(cudaStreamDestroy(stream1));`
			`checkCudaErrors(cudaStreamDestroy(stream2));`
			`printf("PASSED\n");`

			`return EXIT_SUCCESS;`
			`}`

			`int main(int argc, char **argv) {`
			`int ret = memPoolP2PCopy();`
			`return ret;`
			`}`