mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2025-01-20 03:45:54 +08:00
253 lines
8.7 KiB
Plaintext
253 lines
8.7 KiB
Plaintext
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* * Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
|
* contributors may be used to endorse or promote products derived
|
|
* from this software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
/*
|
|
* This sample demonstrates peer-to-peer access of stream ordered memory
|
|
* allocated with cudaMallocAsync and cudaMemPool family of APIs through simple
|
|
* kernel which does peer-to-peer to access & scales vector elements.
|
|
*/
|
|
|
|
// System includes
|
|
#include <assert.h>
|
|
#include <stdio.h>
|
|
#include <iostream>
|
|
#include <map>
|
|
#include <set>
|
|
#include <utility>
|
|
|
|
// CUDA runtime
|
|
#include <cuda_runtime.h>
|
|
|
|
// helper functions and utilities to work with CUDA
|
|
#include <helper_cuda.h>
|
|
#include <helper_functions.h>
|
|
|
|
// Simple kernel to demonstrate copying cudaMallocAsync memory via P2P to peer
|
|
// device
|
|
__global__ void copyP2PAndScale(const int *src, int *dst, int N) {
|
|
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
if (idx < N) {
|
|
// scale & store src vector.
|
|
dst[idx] = 2 * src[idx];
|
|
}
|
|
}
|
|
|
|
// Map of device version to device number
|
|
std::multimap<std::pair<int, int>, int> getIdenticalGPUs() {
|
|
int numGpus = 0;
|
|
checkCudaErrors(cudaGetDeviceCount(&numGpus));
|
|
|
|
std::multimap<std::pair<int, int>, int> identicalGpus;
|
|
|
|
for (int i = 0; i < numGpus; i++) {
|
|
int isMemPoolSupported = 0;
|
|
checkCudaErrors(cudaDeviceGetAttribute(&isMemPoolSupported,
|
|
cudaDevAttrMemoryPoolsSupported, i));
|
|
|
|
// Filter unsupported devices
|
|
if (isMemPoolSupported) {
|
|
int major = 0, minor = 0;
|
|
checkCudaErrors(
|
|
cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, i));
|
|
checkCudaErrors(
|
|
cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, i));
|
|
identicalGpus.emplace(std::make_pair(major, minor), i);
|
|
}
|
|
}
|
|
|
|
return identicalGpus;
|
|
}
|
|
|
|
std::pair<int, int> getP2PCapableGpuPair() {
|
|
constexpr size_t kNumGpusRequired = 2;
|
|
|
|
auto gpusByArch = getIdenticalGPUs();
|
|
|
|
auto it = gpusByArch.begin();
|
|
auto end = gpusByArch.end();
|
|
|
|
auto bestFit = std::make_pair(it, it);
|
|
// use std::distance to find the largest number of GPUs amongst architectures
|
|
auto distance = [](decltype(bestFit) p) {
|
|
return std::distance(p.first, p.second);
|
|
};
|
|
|
|
// Read each unique key/pair element in order
|
|
for (; it != end; it = gpusByArch.upper_bound(it->first)) {
|
|
// first and second are iterators bounded within the architecture group
|
|
auto testFit = gpusByArch.equal_range(it->first);
|
|
// Always use devices with highest architecture version or whichever has the
|
|
// most devices available
|
|
if (distance(bestFit) <= distance(testFit)) bestFit = testFit;
|
|
}
|
|
|
|
if (distance(bestFit) < kNumGpusRequired) {
|
|
printf(
|
|
"No Two or more GPUs with same architecture capable of cuda Memory "
|
|
"Pools found."
|
|
"\nWaiving the sample\n");
|
|
exit(EXIT_WAIVED);
|
|
}
|
|
|
|
std::set<int> bestFitDeviceIds;
|
|
|
|
// check & select peer-to-peer access capable GPU devices.
|
|
int devIds[2];
|
|
for (auto itr = bestFit.first; itr != bestFit.second; itr++) {
|
|
int deviceId = itr->second;
|
|
checkCudaErrors(cudaSetDevice(deviceId));
|
|
|
|
std::for_each(itr, bestFit.second, [&deviceId, &bestFitDeviceIds,
|
|
&kNumGpusRequired](
|
|
decltype(*itr) mapPair) {
|
|
if (deviceId != mapPair.second) {
|
|
int access = 0;
|
|
checkCudaErrors(
|
|
cudaDeviceCanAccessPeer(&access, deviceId, mapPair.second));
|
|
printf("Device=%d %s Access Peer Device=%d\n", deviceId,
|
|
access ? "CAN" : "CANNOT", mapPair.second);
|
|
if (access && bestFitDeviceIds.size() < kNumGpusRequired) {
|
|
bestFitDeviceIds.emplace(deviceId);
|
|
bestFitDeviceIds.emplace(mapPair.second);
|
|
} else {
|
|
printf("Ignoring device %i (max devices exceeded)\n", mapPair.second);
|
|
}
|
|
}
|
|
});
|
|
|
|
if (bestFitDeviceIds.size() >= kNumGpusRequired) {
|
|
printf("Selected p2p capable devices - ");
|
|
int i = 0;
|
|
for (auto devicesItr = bestFitDeviceIds.begin();
|
|
devicesItr != bestFitDeviceIds.end(); devicesItr++) {
|
|
devIds[i++] = *devicesItr;
|
|
printf("deviceId = %d ", *devicesItr);
|
|
}
|
|
printf("\n");
|
|
break;
|
|
}
|
|
}
|
|
|
|
// if bestFitDeviceIds.size() == 0 it means the GPUs in system are not p2p
|
|
// capable, hence we add it without p2p capability check.
|
|
if (!bestFitDeviceIds.size()) {
|
|
printf("No Two or more Devices p2p capable found.. exiting..\n");
|
|
exit(EXIT_WAIVED);
|
|
}
|
|
|
|
auto p2pGpuPair = std::make_pair(devIds[0], devIds[1]);
|
|
|
|
return p2pGpuPair;
|
|
}
|
|
|
|
int memPoolP2PCopy() {
|
|
int *dev0_srcVec, *dev1_dstVec; // Device buffers
|
|
cudaStream_t stream1, stream2;
|
|
cudaMemPool_t memPool;
|
|
cudaEvent_t waitOnStream1;
|
|
|
|
// Allocate CPU memory.
|
|
size_t nelem = 1048576;
|
|
size_t bytes = nelem * sizeof(int);
|
|
|
|
int *a = (int *)malloc(bytes);
|
|
int *output = (int *)malloc(bytes);
|
|
|
|
/* Initialize the vectors. */
|
|
for (int n = 0; n < nelem; n++) {
|
|
a[n] = rand() / (int)RAND_MAX;
|
|
}
|
|
|
|
auto p2pDevices = getP2PCapableGpuPair();
|
|
printf("selected devices = %d & %d\n", p2pDevices.first, p2pDevices.second);
|
|
checkCudaErrors(cudaSetDevice(p2pDevices.first));
|
|
checkCudaErrors(cudaEventCreate(&waitOnStream1));
|
|
|
|
checkCudaErrors(cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking));
|
|
|
|
// Get the default mempool for device p2pDevices.first from the pair
|
|
checkCudaErrors(cudaDeviceGetDefaultMemPool(&memPool, p2pDevices.first));
|
|
|
|
// Allocate memory in a stream from the pool set above.
|
|
checkCudaErrors(cudaMallocAsync(&dev0_srcVec, bytes, stream1));
|
|
|
|
checkCudaErrors(
|
|
cudaMemcpyAsync(dev0_srcVec, a, bytes, cudaMemcpyHostToDevice, stream1));
|
|
checkCudaErrors(cudaEventRecord(waitOnStream1, stream1));
|
|
|
|
checkCudaErrors(cudaSetDevice(p2pDevices.second));
|
|
checkCudaErrors(cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking));
|
|
|
|
// Allocate memory in p2pDevices.second device
|
|
checkCudaErrors(cudaMallocAsync(&dev1_dstVec, bytes, stream2));
|
|
|
|
// Setup peer mappings for p2pDevices.second device
|
|
cudaMemAccessDesc desc;
|
|
memset(&desc, 0, sizeof(cudaMemAccessDesc));
|
|
desc.location.type = cudaMemLocationTypeDevice;
|
|
desc.location.id = p2pDevices.second;
|
|
desc.flags = cudaMemAccessFlagsProtReadWrite;
|
|
checkCudaErrors(cudaMemPoolSetAccess(memPool, &desc, 1));
|
|
|
|
printf("> copyP2PAndScale kernel running ...\n");
|
|
dim3 block(256);
|
|
dim3 grid((unsigned int)ceil(nelem / (int)block.x));
|
|
checkCudaErrors(cudaStreamWaitEvent(stream2, waitOnStream1));
|
|
copyP2PAndScale<<<grid, block, 0, stream2>>>(dev0_srcVec, dev1_dstVec, nelem);
|
|
|
|
checkCudaErrors(cudaMemcpyAsync(output, dev1_dstVec, bytes,
|
|
cudaMemcpyDeviceToHost, stream2));
|
|
checkCudaErrors(cudaFreeAsync(dev0_srcVec, stream2));
|
|
checkCudaErrors(cudaFreeAsync(dev1_dstVec, stream2));
|
|
checkCudaErrors(cudaStreamSynchronize(stream2));
|
|
|
|
/* Compare the results */
|
|
printf("> Checking the results from copyP2PAndScale() ...\n");
|
|
|
|
for (int n = 0; n < nelem; n++) {
|
|
if ((2 * a[n]) != output[n]) {
|
|
printf("mismatch i = %d expected = %d val = %d\n", n, 2 * a[n],
|
|
output[n]);
|
|
return EXIT_FAILURE;
|
|
}
|
|
}
|
|
|
|
free(a);
|
|
free(output);
|
|
checkCudaErrors(cudaStreamDestroy(stream1));
|
|
checkCudaErrors(cudaStreamDestroy(stream2));
|
|
printf("PASSED\n");
|
|
|
|
return EXIT_SUCCESS;
|
|
}
|
|
|
|
int main(int argc, char **argv) {
|
|
int ret = memPoolP2PCopy();
|
|
return ret;
|
|
}
|