/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "multidevicealloc_memmap.hpp"

// Round x up to the next multiple of y.
static size_t round_up(size_t x, size_t y) { return ((x + y - 1) / y) * y; }

CUresult simpleMallocMultiDeviceMmap(
    CUdeviceptr *dptr, size_t *allocationSize, size_t size,
    const std::vector<CUdevice> &residentDevices,
    const std::vector<CUdevice> &mappingDevices, size_t align) {
  CUresult status = CUDA_SUCCESS;
  size_t min_granularity = 0;
  size_t stripeSize;

  // Set up the properties common to all the chunks.
  // The allocations will be device pinned memory.
  // This property structure describes the physical location where the memory
  // will be allocated via cuMemCreate, along with additional properties. In
  // this case, the allocation will be pinned device memory local to a given
  // device.
  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;

  // Get the minimum granularity needed for the resident devices
  // (the max of the minimum granularity of each participating device).
  for (size_t idx = 0; idx < residentDevices.size(); idx++) {
    size_t granularity = 0;

    // Get the minimum granularity for residentDevices[idx].
    prop.location.id = residentDevices[idx];
    status = cuMemGetAllocationGranularity(&granularity, &prop,
                                           CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    if (status != CUDA_SUCCESS) {
      goto done;
    }
    if (min_granularity < granularity) {
      min_granularity = granularity;
    }
  }

  // Get the minimum granularity needed for the accessing devices
  // (the max of the minimum granularity of each participating device).
  for (size_t idx = 0; idx < mappingDevices.size(); idx++) {
    size_t granularity = 0;

    // Get the minimum granularity for mappingDevices[idx].
    prop.location.id = mappingDevices[idx];
    status = cuMemGetAllocationGranularity(&granularity, &prop,
                                           CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    if (status != CUDA_SUCCESS) {
      goto done;
    }
    if (min_granularity < granularity) {
      min_granularity = granularity;
    }
  }

  // Round up the size such that it can be evenly split into a stripe size
  // that meets the granularity requirements. Essentially
  // size = N * residentDevices.size() * min_granularity is the requirement,
  // since each piece of the allocation will be
  // stripeSize = N * min_granularity, and the min_granularity requirement
  // applies to each stripeSize piece of the allocation.
  size = round_up(size, residentDevices.size() * min_granularity);
  stripeSize = size / residentDevices.size();

  // Return the rounded-up size to the caller for use in the free.
  if (allocationSize) {
    *allocationSize = size;
  }

  // Reserve the required contiguous VA space for the allocations.
  status = cuMemAddressReserve(dptr, size, align, 0, 0);
  if (status != CUDA_SUCCESS) {
    goto done;
  }

  // Create and map the backings on each GPU.
  // Note: reusing CUmemAllocationProp prop from earlier with prop.type and
  // prop.location.type already specified.
  for (size_t idx = 0; idx < residentDevices.size(); idx++) {
    CUresult status2 = CUDA_SUCCESS;

    // Set the location for this chunk to this device.
    prop.location.id = residentDevices[idx];

    // Create the allocation as a pinned allocation on this device.
    CUmemGenericAllocationHandle allocationHandle;
    status = cuMemCreate(&allocationHandle, stripeSize, &prop, 0);
    if (status != CUDA_SUCCESS) {
      goto done;
    }

    // Assign the chunk to the appropriate VA range and release the handle.
    // After mapping the memory, it can be referenced by virtual address.
    // Since we do not need to make any other mappings of this memory or
    // export it, we no longer need the allocationHandle and can release it.
    // The allocation will be kept live until it is unmapped.
    status = cuMemMap(*dptr + (stripeSize * idx), stripeSize, 0,
                      allocationHandle, 0);

    // The handle needs to be released even if the mapping failed.
    status2 = cuMemRelease(allocationHandle);

    if (status == CUDA_SUCCESS) {
      // cuMemRelease should not have failed here, as the handle was just
      // allocated successfully; however, return an error if it does.
      status = status2;
    }

    // Clean up in case of any mapping failures.
    if (status != CUDA_SUCCESS) {
      goto done;
    }
  }

  {
    // Each access descriptor describes the mapping requirement for a single
    // device.
    std::vector<CUmemAccessDesc> accessDescriptors;
    accessDescriptors.resize(mappingDevices.size());

    // Prepare the access descriptor array indicating where and how the
    // backings should be visible.
    for (size_t idx = 0; idx < mappingDevices.size(); idx++) {
      // Specify which device we are adding mappings for.
      accessDescriptors[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
      accessDescriptors[idx].location.id = mappingDevices[idx];

      // Specify both read and write access.
      accessDescriptors[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    }

    // Apply the access descriptors to the whole VA range.
    status = cuMemSetAccess(*dptr, size, &accessDescriptors[0],
                            accessDescriptors.size());
    if (status != CUDA_SUCCESS) {
      goto done;
    }
  }

done:
  if (status != CUDA_SUCCESS) {
    if (*dptr) {
      simpleFreeMultiDeviceMmap(*dptr, size);
    }
  }
  return status;
}

CUresult simpleFreeMultiDeviceMmap(CUdeviceptr dptr, size_t size) {
  CUresult status = CUDA_SUCCESS;

  // Unmap the mapped virtual memory region.
  // Since the handles to the mapped backing stores have already been released
  // by cuMemRelease, and these are the only/last mappings referencing them,
  // the backing stores will be freed.
  // Since the memory has been unmapped after this call, accessing the
  // specified VA range will result in a fault (until it is remapped).
  status = cuMemUnmap(dptr, size);
  if (status != CUDA_SUCCESS) {
    return status;
  }

  // Free the virtual address region. This allows the virtual address region
  // to be reused by future cuMemAddressReserve calls. This also allows the
  // virtual address region to be used by other allocations made through
  // operating system calls like malloc and mmap.
  status = cuMemAddressFree(dptr, size);
  if (status != CUDA_SUCCESS) {
    return status;
  }

  return status;
}
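
/*
 * Usage sketch (illustrative only; not part of the original sample): allocate
 * a buffer whose physical backing is striped across every visible device,
 * make it accessible from all of them, touch it, and free it again. The
 * function name exampleStripedAlloc, the choice of using all visible devices
 * as both resident and mapping devices, and the error-handling style are
 * assumptions for illustration. It also assumes cuInit() has been called and
 * a CUDA context is current on the calling thread, as required by the helpers
 * above.
 */
CUresult exampleStripedAlloc(size_t requestedBytes) {
  // Enumerate all visible devices; they serve as both the resident (backing)
  // and the mapping (accessing) devices in this sketch.
  int deviceCount = 0;
  CUresult status = cuDeviceGetCount(&deviceCount);
  if (status != CUDA_SUCCESS || deviceCount == 0) {
    return status;
  }

  std::vector<CUdevice> devices(deviceCount);
  for (int i = 0; i < deviceCount; i++) {
    status = cuDeviceGet(&devices[i], i);
    if (status != CUDA_SUCCESS) {
      return status;
    }
  }

  CUdeviceptr dptr = 0;
  size_t allocationSize = 0;

  // Stripe the physical backing across all devices and map the result into
  // every device's address space (align = 0 requests default alignment).
  status = simpleMallocMultiDeviceMmap(&dptr, &allocationSize, requestedBytes,
                                       devices, devices, 0);
  if (status != CUDA_SUCCESS) {
    return status;
  }

  // The pointer behaves like any other device pointer. Note that the usable
  // size is allocationSize, which may be larger than requestedBytes because
  // of granularity rounding.
  status = cuMemsetD8(dptr, 0, allocationSize);

  // Always free with the rounded-up size returned by the allocator.
  CUresult freeStatus = simpleFreeMultiDeviceMmap(dptr, allocationSize);
  return (status != CUDA_SUCCESS) ? status : freeStatus;
}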