cuda-samples/Samples/streamOrderedAllocationP2P/streamOrderedAllocationP2P.cu

253 lines
8.7 KiB
Plaintext
Raw Normal View History

/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* This sample demonstrates peer-to-peer access to stream-ordered memory
* allocated with cudaMallocAsync and the cudaMemPool family of APIs, using a
* simple kernel that reads a vector over peer-to-peer access and scales its
* elements.
*/
// System includes
#include <assert.h>
#include <stdio.h>
#include <iostream>
#include <map>
#include <set>
#include <utility>
// CUDA runtime
#include <cuda_runtime.h>
// helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>
// Element-wise scaling kernel used to demonstrate reading cudaMallocAsync
// memory through P2P from a peer device.
//   src : input vector of N ints
//   dst : output vector of N ints; dst[i] receives 2 * src[i]
//   N   : number of elements; threads whose global index is >= N do nothing
// Uses a 1D launch: one thread handles one element.
__global__ void copyP2PAndScale(const int *src, int *dst, int N) {
  const int gid = threadIdx.x + blockDim.x * blockIdx.x;

  // Surplus threads past the end of the vector have no work.
  if (gid >= N) {
    return;
  }

  // Scale the source element and store it.
  dst[gid] = src[gid] * 2;
}
// Groups the visible GPUs by compute capability.
// Returns a multimap whose key is the (major, minor) compute capability and
// whose value is the device number. Devices that do not report
// cudaDevAttrMemoryPoolsSupported are left out.
std::multimap<std::pair<int, int>, int> getIdenticalGPUs() {
  std::multimap<std::pair<int, int>, int> gpusByVersion;

  int deviceCount = 0;
  checkCudaErrors(cudaGetDeviceCount(&deviceCount));

  for (int dev = 0; dev < deviceCount; ++dev) {
    int poolsSupported = 0;
    checkCudaErrors(cudaDeviceGetAttribute(
        &poolsSupported, cudaDevAttrMemoryPoolsSupported, dev));

    // Skip devices without memory pool support.
    if (!poolsSupported) {
      continue;
    }

    int ccMajor = 0;
    int ccMinor = 0;
    checkCudaErrors(cudaDeviceGetAttribute(
        &ccMajor, cudaDevAttrComputeCapabilityMajor, dev));
    checkCudaErrors(cudaDeviceGetAttribute(
        &ccMinor, cudaDevAttrComputeCapabilityMinor, dev));

    gpusByVersion.insert(
        std::make_pair(std::make_pair(ccMajor, ccMinor), dev));
  }

  return gpusByVersion;
}
// Selects two GPUs of identical compute capability that can access each
// other's memory peer-to-peer.
//
// The candidates come from getIdenticalGPUs(). The architecture group with the
// most devices is chosen; on a tie the group with the higher compute
// capability wins, because groups are visited in ascending key order and the
// comparison is "<=".
//
// Returns the two device numbers as (lower id, higher id).
// Calls exit(EXIT_WAIVED) when fewer than two suitable devices exist or when
// no pair in the chosen group supports peer access in both directions.
std::pair<int, int> getP2PCapableGpuPair() {
  constexpr size_t kNumGpusRequired = 2;

  auto gpusByArch = getIdenticalGPUs();
  auto it = gpusByArch.begin();
  auto end = gpusByArch.end();
  auto bestFit = std::make_pair(it, it);

  // Number of GPUs inside an [first, second) architecture group. The distance
  // is never negative here, so the cast keeps comparisons unsigned.
  auto groupSize = [](decltype(bestFit) p) {
    return static_cast<size_t>(std::distance(p.first, p.second));
  };

  // Visit each unique architecture key once, in ascending order.
  for (; it != end; it = gpusByArch.upper_bound(it->first)) {
    // first and second are iterators bounded within the architecture group
    auto testFit = gpusByArch.equal_range(it->first);
    // Prefer the largest group; "<=" lets a later (higher) architecture win
    // ties.
    if (groupSize(bestFit) <= groupSize(testFit)) bestFit = testFit;
  }

  if (groupSize(bestFit) < kNumGpusRequired) {
    printf(
        "No Two or more GPUs with same architecture capable of cuda Memory "
        "Pools found."
        "\nWaiving the sample\n");
    exit(EXIT_WAIVED);
  }

  // Check & select peer-to-peer access capable GPU devices.
  bool pairFound = false;
  int devIds[2] = {-1, -1};

  for (auto itr = bestFit.first; itr != bestFit.second && !pairFound; ++itr) {
    const int deviceId = itr->second;
    checkCudaErrors(cudaSetDevice(deviceId));

    // Only pair deviceId with devices that follow it in the group, so each
    // unordered pair is examined once.
    auto peerItr = itr;
    for (++peerItr; peerItr != bestFit.second; ++peerItr) {
      const int peerId = peerItr->second;

      // Peer access is queried in both directions: the caller allocates on
      // one device of the pair and reads that memory from the other, so the
      // direction actually exercised must be verified as well.
      int access = 0;
      int reverseAccess = 0;
      checkCudaErrors(cudaDeviceCanAccessPeer(&access, deviceId, peerId));
      checkCudaErrors(
          cudaDeviceCanAccessPeer(&reverseAccess, peerId, deviceId));
      printf("Device=%d %s Access Peer Device=%d\n", deviceId,
             access ? "CAN" : "CANNOT", peerId);
      printf("Device=%d %s Access Peer Device=%d\n", peerId,
             reverseAccess ? "CAN" : "CANNOT", deviceId);

      if (pairFound) {
        // A pair has already been selected during this scan.
        printf("Ignoring device %i (max devices exceeded)\n", peerId);
      } else if (access && reverseAccess) {
        // Store the pair in ascending device-id order.
        devIds[0] = (deviceId < peerId) ? deviceId : peerId;
        devIds[1] = (deviceId < peerId) ? peerId : deviceId;
        pairFound = true;
      } else {
        printf("Ignoring device %i (no peer access with device %i)\n", peerId,
               deviceId);
      }
    }
  }

  // No pair in the selected group is p2p capable: waive the sample.
  if (!pairFound) {
    printf("No Two or more Devices p2p capable found.. exiting..\n");
    exit(EXIT_WAIVED);
  }

  printf("Selected p2p capable devices - ");
  printf("deviceId = %d ", devIds[0]);
  printf("deviceId = %d ", devIds[1]);
  printf("\n");

  return std::make_pair(devIds[0], devIds[1]);
}
// Runs the P2P stream-ordered allocation demo end to end.
//
// Steps:
//   1. Fill a host vector with small random integers.
//   2. On the first selected device, allocate a source buffer with
//      cudaMallocAsync from the device's default memory pool and upload the
//      host vector in stream1.
//   3. On the second selected device, allocate a destination buffer and grant
//      that device read/write access to the first device's pool.
//   4. Launch copyP2PAndScale in stream2 (after stream1's upload has
//      completed) to read the source over P2P and write the doubled values.
//   5. Copy the result back and compare against the host reference.
//
// Returns EXIT_SUCCESS when every element matches, EXIT_FAILURE otherwise or
// when host allocation fails. CUDA errors terminate via checkCudaErrors.
int memPoolP2PCopy() {
  int *dev0_srcVec = nullptr, *dev1_dstVec = nullptr;  // Device buffers
  cudaStream_t stream1, stream2;
  cudaMemPool_t memPool;
  cudaEvent_t waitOnStream1;

  // Allocate CPU memory.
  const int nelem = 1048576;
  const size_t bytes = static_cast<size_t>(nelem) * sizeof(int);
  int *a = (int *)malloc(bytes);
  int *output = (int *)malloc(bytes);
  if (a == NULL || output == NULL) {
    printf("Failed to allocate host memory\n");
    free(a);  // free(NULL) is a no-op
    free(output);
    return EXIT_FAILURE;
  }

  /* Initialize the vectors. */
  // Values are kept small so that doubling cannot overflow. An integer
  // division by RAND_MAX would yield 0 for almost every element and make the
  // result check meaningless.
  const int kMaxValue = 1024;
  for (int n = 0; n < nelem; n++) {
    a[n] = rand() % kMaxValue;
  }

  auto p2pDevices = getP2PCapableGpuPair();
  printf("selected devices = %d & %d\n", p2pDevices.first, p2pDevices.second);

  checkCudaErrors(cudaSetDevice(p2pDevices.first));
  checkCudaErrors(cudaEventCreate(&waitOnStream1));
  checkCudaErrors(cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking));

  // Get the default mempool for device p2pDevices.first from the pair
  checkCudaErrors(cudaDeviceGetDefaultMemPool(&memPool, p2pDevices.first));

  // Allocate memory in a stream from the pool set above.
  checkCudaErrors(cudaMallocAsync(&dev0_srcVec, bytes, stream1));
  checkCudaErrors(
      cudaMemcpyAsync(dev0_srcVec, a, bytes, cudaMemcpyHostToDevice, stream1));
  // Marks the point in stream1 at which the source data is ready.
  checkCudaErrors(cudaEventRecord(waitOnStream1, stream1));

  checkCudaErrors(cudaSetDevice(p2pDevices.second));
  checkCudaErrors(cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking));

  // Allocate memory in p2pDevices.second device
  checkCudaErrors(cudaMallocAsync(&dev1_dstVec, bytes, stream2));

  // Setup peer mappings for p2pDevices.second device
  cudaMemAccessDesc desc = {};
  desc.location.type = cudaMemLocationTypeDevice;
  desc.location.id = p2pDevices.second;
  desc.flags = cudaMemAccessFlagsProtReadWrite;
  checkCudaErrors(cudaMemPoolSetAccess(memPool, &desc, 1));

  printf("> copyP2PAndScale kernel running ...\n");
  dim3 block(256);
  // Integer ceiling division so a partial final block is still launched.
  dim3 grid((nelem + block.x - 1) / block.x);

  // stream2 must not read the source before stream1 has finished uploading.
  checkCudaErrors(cudaStreamWaitEvent(stream2, waitOnStream1));
  copyP2PAndScale<<<grid, block, 0, stream2>>>(dev0_srcVec, dev1_dstVec, nelem);
  // Kernel launches do not return an error code; query it explicitly.
  checkCudaErrors(cudaGetLastError());

  checkCudaErrors(cudaMemcpyAsync(output, dev1_dstVec, bytes,
                                  cudaMemcpyDeviceToHost, stream2));
  checkCudaErrors(cudaFreeAsync(dev0_srcVec, stream2));
  checkCudaErrors(cudaFreeAsync(dev1_dstVec, stream2));
  checkCudaErrors(cudaStreamSynchronize(stream2));

  /* Compare the results */
  printf("> Checking the results from copyP2PAndScale() ...\n");
  int result = EXIT_SUCCESS;
  for (int n = 0; n < nelem; n++) {
    if ((2 * a[n]) != output[n]) {
      printf("mismatch i = %d expected = %d val = %d\n", n, 2 * a[n],
             output[n]);
      result = EXIT_FAILURE;
      break;
    }
  }

  // Release host and CUDA resources on both the success and failure paths.
  free(a);
  free(output);
  checkCudaErrors(cudaEventDestroy(waitOnStream1));
  checkCudaErrors(cudaStreamDestroy(stream1));
  checkCudaErrors(cudaStreamDestroy(stream2));

  if (result == EXIT_SUCCESS) {
    printf("PASSED\n");
  }
  return result;
}
// Program entry point: runs the memory pool P2P demo and uses its result as
// the process exit status. Command-line arguments are not used.
int main(int argc, char **argv) {
  (void)argc;
  (void)argv;
  return memPoolP2PCopy();
}