/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "multidevicealloc_memmap.hpp"

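// Round x up to the nearest multiple of y.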
static size_t round_up(size_t x, size_t y) { return ((x + y - 1) / y) * y; }

CUresult simpleMallocMultiDeviceMmap(
    CUdeviceptr *dptr, size_t *allocationSize, size_t size,
    const std::vector<CUdevice> &residentDevices,
    const std::vector<CUdevice> &mappingDevices, size_t align) {
  CUresult status = CUDA_SUCCESS;
  size_t min_granularity = 0;
  size_t stripeSize;

  // Set up the properties common to all the chunks.
  // The allocations will be device-pinned memory.
  // This property structure describes the physical location where the memory
  // will be allocated via cuMemCreate, along with additional properties. In
  // this case, the allocation will be pinned device memory local to a given
  // device.
  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
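
  // The virtual memory management APIs used below (cuMemCreate, cuMemMap,
  // cuMemSetAccess, ...) require device support. A caller can verify this
  // with a check along these lines (illustrative sketch, not part of this
  // sample; 'dev' is a hypothetical CUdevice handle):
  //   int vmmSupported = 0;
  //   cuDeviceGetAttribute(&vmmSupported,
  //                        CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED,
  //                        dev);
  //   // vmmSupported != 0 means these APIs are usable on 'dev'.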

  // Get the minimum granularity needed for the resident devices
  // (the max of the minimum granularity of each participating device).
  for (size_t idx = 0; idx < residentDevices.size(); idx++) {
    size_t granularity = 0;

    // Get the minimum granularity for residentDevices[idx].
    prop.location.id = residentDevices[idx];
    status = cuMemGetAllocationGranularity(&granularity, &prop,
                                           CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    if (status != CUDA_SUCCESS) {
      goto done;
    }
    if (min_granularity < granularity) {
      min_granularity = granularity;
    }
  }

  // Get the minimum granularity needed for the accessing devices
  // (the max of the minimum granularity of each participating device).
  for (size_t idx = 0; idx < mappingDevices.size(); idx++) {
    size_t granularity = 0;

    // Get the minimum granularity for mappingDevices[idx].
    prop.location.id = mappingDevices[idx];
    status = cuMemGetAllocationGranularity(&granularity, &prop,
                                           CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    if (status != CUDA_SUCCESS) {
      goto done;
    }
    if (min_granularity < granularity) {
      min_granularity = granularity;
    }
  }

  // Round up the size such that it can be split evenly into a stripe size
  // that meets the granularity requirements. Essentially,
  // size = N * residentDevices.size() * min_granularity is the requirement,
  // since each piece of the allocation will be
  // stripeSize = N * min_granularity, and the min_granularity requirement
  // applies to each stripeSize piece of the allocation.
  size = round_up(size, residentDevices.size() * min_granularity);
  stripeSize = size / residentDevices.size();
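  // Illustrative arithmetic (numbers assumed for this example): with two
  // resident devices and min_granularity = 2 MiB, a 5 MiB request rounds up
  // to 8 MiB, and each device backs one 4 MiB stripe.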

  // Return the rounded-up size to the caller for use in the free.
  if (allocationSize) {
    *allocationSize = size;
  }

  // Reserve the required contiguous VA space for the allocations.
  status = cuMemAddressReserve(dptr, size, align, 0, 0);
  if (status != CUDA_SUCCESS) {
    goto done;
  }

  // Create and map the backings on each GPU.
  // Note: we reuse the CUmemAllocationProp prop from earlier, with prop.type
  // and prop.location.type already specified.
  for (size_t idx = 0; idx < residentDevices.size(); idx++) {
    CUresult status2 = CUDA_SUCCESS;

    // Set the location for this chunk to this device.
    prop.location.id = residentDevices[idx];

    // Create the allocation as a pinned allocation on this device.
    CUmemGenericAllocationHandle allocationHandle;
    status = cuMemCreate(&allocationHandle, stripeSize, &prop, 0);
    if (status != CUDA_SUCCESS) {
      goto done;
    }

    // Assign the chunk to the appropriate VA range and release the handle.
    // After mapping the memory, it can be referenced by virtual address.
    // Since we do not need to make any other mappings of this memory or
    // export it, we no longer need the allocationHandle and can release it.
    // The allocation will be kept live until it is unmapped.
    status = cuMemMap(*dptr + (stripeSize * idx), stripeSize, 0,
                      allocationHandle, 0);

    // The handle needs to be released even if the mapping failed.
    status2 = cuMemRelease(allocationHandle);
    if (status == CUDA_SUCCESS) {
      // cuMemRelease should not have failed here, as the handle was just
      // allocated successfully; however, return an error if it does.
      status = status2;
    }

    // Clean up in case of any mapping failures.
    if (status != CUDA_SUCCESS) {
      goto done;
    }
  }

  {
    // Each access descriptor describes the mapping requirements for a single
    // device.
    std::vector<CUmemAccessDesc> accessDescriptors;
    accessDescriptors.resize(mappingDevices.size());

    // Prepare the access descriptor array indicating where and how the
    // backings should be visible.
    for (size_t idx = 0; idx < mappingDevices.size(); idx++) {
      // Specify which device we are adding mappings for.
      accessDescriptors[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
      accessDescriptors[idx].location.id = mappingDevices[idx];

      // Specify both read and write access.
      accessDescriptors[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    }

    // Apply the access descriptors to the whole VA range.
    status = cuMemSetAccess(*dptr, size, &accessDescriptors[0],
                            accessDescriptors.size());
    if (status != CUDA_SUCCESS) {
      goto done;
    }
  }

done:
  if (status != CUDA_SUCCESS) {
    // Failure path: tear down the reservation and any mappings made so far.
    if (*dptr) {
      simpleFreeMultiDeviceMmap(*dptr, size);
    }
  }

  return status;
}

CUresult simpleFreeMultiDeviceMmap(CUdeviceptr dptr, size_t size) {
  CUresult status = CUDA_SUCCESS;

  // Unmap the mapped virtual memory region.
  // Since the handles to the mapped backing stores have already been released
  // by cuMemRelease, and these are the only/last mappings referencing them,
  // the backing stores will be freed.
  // Once the memory is unmapped, accessing the specified VA range will
  // result in a fault (until it is remapped).
  status = cuMemUnmap(dptr, size);
  if (status != CUDA_SUCCESS) {
    return status;
  }
  // Free the virtual address region. This allows the virtual address region
  // to be reused by future cuMemAddressReserve calls. It also allows the
  // virtual address region to be used by other allocations made through
  // operating system calls like malloc and mmap.
  status = cuMemAddressFree(dptr, size);
  if (status != CUDA_SUCCESS) {
    return status;
  }

  return status;
}
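
// A minimal usage sketch (assumed caller code, not part of the sample): it
// presumes cuInit() has been called, a context is current, and every device
// involved reports CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED.
//
//   std::vector<CUdevice> devs = {dev0, dev1};  // hypothetical CUdevice handles
//   CUdeviceptr ptr = 0;                        // start zeroed: the failure
//                                               // path frees only if *dptr != 0
//   size_t allocatedSize = 0;
//   CUresult st = simpleMallocMultiDeviceMmap(&ptr, &allocatedSize,
//                                             1ULL << 20,  // request 1 MiB
//                                             devs,  // backing striped across both
//                                             devs,  // readable/writable on both
//                                             0);    // default alignment
//   if (st == CUDA_SUCCESS) {
//     // ... use ptr in kernels or cuMemcpy on either device ...
//     simpleFreeMultiDeviceMmap(ptr, allocatedSize);  // pass the rounded-up size
//   }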