/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This sample demonstrates Inter Process Communication using the
 * cuMemMap APIs, with one process per GPU for computation.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <vector>

#include "cuda.h"
#include "helper_multiprocess.h"

// includes, project
#include <helper_functions.h>
#include "helper_cuda_drvapi.h"

// includes, CUDA
#include <builtin_types.h>

using namespace std;

// For direct NVLINK and PCI-E peers, at max 8 simultaneous peers are allowed.
// For NVSWITCH connected peers like DGX-2, simultaneous peers are not limited
// in the same way.
#define MAX_DEVICES (32)
#define PROCESSES_PER_DEVICE 1
#define DATA_BUF_SIZE (4ULL * 1024ULL * 1024ULL)

static const char ipcName[] = "memmap_ipc_pipe";
static const char shmName[] = "memmap_ipc_shm";

typedef struct shmStruct_st {
  size_t nprocesses;
  int barrier;
  int sense;
} shmStruct;

bool findModulePath(const char *, string &, char **, string &);

// define input ptx file for different platforms
#if defined(_WIN64) || defined(__LP64__)
#define PTX_FILE "memMapIpc_kernel64.ptx"
#else
#define PTX_FILE "memMapIpc_kernel32.ptx"
#endif

// `ipcHandleTypeFlag` specifies the platform-specific handle type this sample
// uses for importing and exporting memory allocations. On Linux this sample
// specifies the type as CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, meaning
// that file descriptors will be used. On Windows this sample specifies the
// type as CU_MEM_HANDLE_TYPE_WIN32, meaning that NT HANDLEs will be used.
// The ipcHandleTypeFlag variable is a convenience variable and is passed by
// value to individual requests.
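// For orientation, a minimal sketch of how this flag threads through the
// calls made later in this sample (names abbreviated, not a complete
// program):
//
//   CUmemAllocationProp prop = {};
//   prop.requestedHandleTypes = ipcHandleTypeFlag;  // declare export intent
//   cuMemCreate(&allocHandle, size, &prop, 0);
//   ShareableHandle sh;  // int fd on Linux, HANDLE on Windows
//   cuMemExportToShareableHandle((void *)&sh, allocHandle,
//                                ipcHandleTypeFlag, 0);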
#if defined(__linux__)
CUmemAllocationHandleType ipcHandleTypeFlag =
    CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
#else
CUmemAllocationHandleType ipcHandleTypeFlag = CU_MEM_HANDLE_TYPE_WIN32;
#endif

#if defined(__linux__)
#define cpu_atomic_add32(a, x) __sync_add_and_fetch(a, x)
#elif defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
#define cpu_atomic_add32(a, x) InterlockedAdd((volatile LONG *)a, x)
#else
#error Unsupported system
#endif

CUmodule cuModule;
CUfunction _memMapIpc_kernel;

// A classic sense-reversing barrier: the last process to check in flips
// `sense`, releasing everyone; the last one to check out flips it back.
static void barrierWait(volatile int *barrier, volatile int *sense,
                        unsigned int n) {
  int count;

  // Check-in
  count = cpu_atomic_add32(barrier, 1);
  if (count == (int)n) {  // Last one in
    *sense = 1;
  }
  while (!*sense)
    ;

  // Check-out
  count = cpu_atomic_add32(barrier, -1);
  if (count == 0) {  // Last one out
    *sense = 0;
  }
  while (*sense)
    ;
}

// Windows-specific LPSECURITYATTRIBUTES
void getDefaultSecurityDescriptor(CUmemAllocationProp *prop) {
#if defined(__linux__)
  return;
#elif defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
  static const char sddl[] = "D:P(OA;;GARCSDWDWOCCDCLCSWLODTWPRPCRFA;;;WD)";
  static OBJECT_ATTRIBUTES objAttributes;
  static bool objAttributesConfigured = false;

  if (!objAttributesConfigured) {
    PSECURITY_DESCRIPTOR secDesc;
    BOOL result = ConvertStringSecurityDescriptorToSecurityDescriptorA(
        sddl, SDDL_REVISION_1, &secDesc, NULL);
    if (result == 0) {
      printf("IPC failure: getDefaultSecurityDescriptor Failed! (%d)\n",
             GetLastError());
    }

    InitializeObjectAttributes(&objAttributes, NULL, 0, NULL, secDesc);

    objAttributesConfigured = true;
  }

  prop->win32HandleMetaData = &objAttributes;
  return;
#endif
}

static void memMapAllocateAndExportMemory(
    unsigned char backingDevice, size_t allocSize,
    std::vector<CUmemGenericAllocationHandle> &allocationHandles,
    std::vector<ShareableHandle> &shareableHandles) {
  // This property structure describes the physical location where the memory
  // will be allocated via cuMemCreate, along with additional properties.
  CUmemAllocationProp prop = {};

  // The allocations will be device-pinned memory backed on backingDevice and
  // exportable with the specified handle type.
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;

  // Back all allocations on backingDevice.
  prop.location.id = (int)backingDevice;

  // Passing requestedHandleTypes indicates the intention to export this
  // allocation to a platform-specific handle. This sample requests a file
  // descriptor on Linux and an NT HANDLE on Windows.
  prop.requestedHandleTypes = ipcHandleTypeFlag;

  // Get the minimum granularity supported for allocation with cuMemCreate().
  size_t granularity = 0;
  checkCudaErrors(cuMemGetAllocationGranularity(
      &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
  if (allocSize % granularity) {
    printf(
        "Allocation size is not a multiple of minimum supported granularity "
        "for this device. Exiting...\n");
    exit(EXIT_FAILURE);
  }
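  // Had an arbitrary size been requested, it could instead be rounded up to
  // the granularity (a sketch; unused here since the sample simply exits
  // when DATA_BUF_SIZE is not already aligned):
  //
  //   size_t padded =
  //       ((allocSize + granularity - 1) / granularity) * granularity;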
  // The Windows-specific LPSECURITYATTRIBUTES is required when
  // CU_MEM_HANDLE_TYPE_WIN32 is used. The security attribute defines the
  // scope of which exported allocations may be transferred to other
  // processes. For all other handle types, pass NULL.
  getDefaultSecurityDescriptor(&prop);

  for (int i = 0; i < allocationHandles.size(); i++) {
    // Create the allocation as a pinned allocation on the device specified
    // in prop.location.id.
    checkCudaErrors(cuMemCreate(&allocationHandles[i], allocSize, &prop, 0));

    // Export the allocation to a platform-specific handle. The type of
    // handle requested here must match the requestedHandleTypes field in the
    // prop structure passed to cuMemCreate.
    checkCudaErrors(cuMemExportToShareableHandle((void *)&shareableHandles[i],
                                                 allocationHandles[i],
                                                 ipcHandleTypeFlag, 0));
  }
}

static void memMapImportAndMapMemory(
    CUdeviceptr d_ptr, size_t mapSize,
    std::vector<ShareableHandle> &shareableHandles, int mapDevice) {
  std::vector<CUmemGenericAllocationHandle> allocationHandles;
  allocationHandles.resize(shareableHandles.size());

  // The accessDescriptor describes the mapping requirement for the mapDevice
  // passed as an argument.
  CUmemAccessDesc accessDescriptor;

  // Specify the location for mapping the imported allocations.
  accessDescriptor.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  accessDescriptor.location.id = mapDevice;

  // Specify both read and write accesses.
  accessDescriptor.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;

  for (int i = 0; i < shareableHandles.size(); i++) {
    // Import the memory allocation back into a CUDA handle from the
    // platform-specific handle.
    checkCudaErrors(cuMemImportFromShareableHandle(
        &allocationHandles[i], (void *)(uintptr_t)shareableHandles[i],
        ipcHandleTypeFlag));

    // Assign the chunk to the appropriate VA range and release the handle.
    // After mapping the memory, it can be referenced by virtual address.
    checkCudaErrors(
        cuMemMap(d_ptr + (i * mapSize), mapSize, 0, allocationHandles[i], 0));

    // Since we do not need to make any other mappings of this memory or
    // export it, we no longer need the allocationHandle and can release it.
    // The allocation will be kept alive until it is unmapped.
    checkCudaErrors(cuMemRelease(allocationHandles[i]));
  }

  // Retain peer access and map all chunks to mapDevice.
  checkCudaErrors(cuMemSetAccess(d_ptr, shareableHandles.size() * mapSize,
                                 &accessDescriptor, 1));
}
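// Layout of the reserved VA range after memMapImportAndMapMemory, with
// N = shareableHandles.size() and each chunk mapSize bytes:
//
//   d_ptr + 0 * mapSize        -> physical chunk 0
//   d_ptr + 1 * mapSize        -> physical chunk 1
//   ...
//   d_ptr + (N - 1) * mapSize  -> physical chunk N - 1
//
// The chunks are physically distinct allocations but form one contiguous
// virtual range, so a single kernel pointer can address all of them.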
static void memMapUnmapAndFreeMemory(CUdeviceptr dptr, size_t size) {
  // Unmap the mapped virtual memory region. Since the handles to the mapped
  // backing stores have already been released by cuMemRelease, and these are
  // the only/last mappings referencing them, the backing stores will be
  // freed. Once unmapped, accessing the specified VA range will result in a
  // fault (until it is remapped).
  checkCudaErrors(cuMemUnmap(dptr, size));

  // Free the virtual address region. This allows the virtual address region
  // to be reused by future cuMemAddressReserve calls. It also allows the
  // virtual address region to be used by other allocations made through
  // operating system calls like malloc and mmap.
  checkCudaErrors(cuMemAddressFree(dptr, size));
}

static void memMapGetDeviceFunction(char **argv) {
  // First search for the module path before we load the results.
  string module_path, ptx_source;
  if (!findModulePath(PTX_FILE, module_path, argv, ptx_source)) {
    if (!findModulePath("memMapIpc_kernel.cubin", module_path, argv,
                        ptx_source)) {
      printf("> findModulePath could not find ptx or cubin\n");
      exit(EXIT_FAILURE);
    }
  } else {
    printf("> initCUDA loading module: <%s>\n", module_path.c_str());
  }

  // Create module from binary file (PTX or CUBIN)
  if (module_path.rfind("ptx") != string::npos) {
    // In this branch we use compilation with parameters.
    const unsigned int jitNumOptions = 3;
    CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
    void **jitOptVals = new void *[jitNumOptions];

    // Set up the size of the compilation log buffer.
    jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
    int jitLogBufferSize = 1024;
    jitOptVals[0] = (void *)(size_t)jitLogBufferSize;

    // Set up the pointer to the compilation log buffer.
    jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
    char *jitLogBuffer = new char[jitLogBufferSize];
    jitOptVals[1] = jitLogBuffer;

    // Set the maximum number of registers for the kernel.
    jitOptions[2] = CU_JIT_MAX_REGISTERS;
    int jitRegCount = 32;
    jitOptVals[2] = (void *)(size_t)jitRegCount;

    checkCudaErrors(cuModuleLoadDataEx(&cuModule, ptx_source.c_str(),
                                       jitNumOptions, jitOptions,
                                       (void **)jitOptVals));
    printf("> PTX JIT log:\n%s\n", jitLogBuffer);

    delete[] jitOptions;
    delete[] jitOptVals;
    delete[] jitLogBuffer;
  } else {
    checkCudaErrors(cuModuleLoad(&cuModule, module_path.c_str()));
  }

  // Get the function handle from the module.
  checkCudaErrors(
      cuModuleGetFunction(&_memMapIpc_kernel, cuModule, "memMapIpc_kernel"));
}
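// For reference, the kernel retrieved above comes from memMapIpc_kernel.cu
// and, judging from how it is launched below (a device pointer, a size and
// a fill value), has a signature along these lines (a sketch assuming a
// simple grid-stride fill; see the actual .cu file for the real body):
//
//   extern "C" __global__ void memMapIpc_kernel(char *ptr, int sz,
//                                               char val) {
//     for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < sz;
//          i += gridDim.x * blockDim.x) {
//       ptr[i] = val;
//     }
//   }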
static void childProcess(int devId, int id, char **argv) {
  volatile shmStruct *shm = NULL;
  sharedMemoryInfo info;
  ipcHandle *ipcChildHandle = NULL;
  int blocks = 0;
  int threads = 128;

  checkIpcErrors(ipcOpenSocket(ipcChildHandle));

  if (sharedMemoryOpen(shmName, sizeof(shmStruct), &info) != 0) {
    printf("Failed to create shared memory slab\n");
    exit(EXIT_FAILURE);
  }
  shm = (volatile shmStruct *)info.addr;
  int procCount = (int)shm->nprocesses;

  barrierWait(&shm->barrier, &shm->sense, (unsigned int)(procCount + 1));

  // Receive all allocation handles shared by the parent.
  std::vector<ShareableHandle> shHandle(procCount);
  checkIpcErrors(ipcRecvShareableHandles(ipcChildHandle, shHandle));

  CUcontext ctx;
  CUdevice device;
  CUstream stream;
  int multiProcessorCount;

  checkCudaErrors(cuDeviceGet(&device, devId));
  checkCudaErrors(cuCtxCreate(&ctx, 0, device));
  checkCudaErrors(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));

  // Obtain the kernel function for the sample.
  memMapGetDeviceFunction(argv);

  checkCudaErrors(cuOccupancyMaxActiveBlocksPerMultiprocessor(
      &blocks, _memMapIpc_kernel, threads, 0));
  checkCudaErrors(cuDeviceGetAttribute(
      &multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
  blocks *= multiProcessorCount;

  CUdeviceptr d_ptr = 0ULL;

  // Reserve the required contiguous VA space for the allocations.
  checkCudaErrors(cuMemAddressReserve(&d_ptr, procCount * DATA_BUF_SIZE,
                                      DATA_BUF_SIZE, 0, 0));

  // Import the memory allocations shared by the parent and map them into
  // our address space.
  memMapImportAndMapMemory(d_ptr, DATA_BUF_SIZE, shHandle, devId);

  // Since we have imported the allocations shared by the parent, we can
  // close all the ShareableHandles.
  for (int i = 0; i < procCount; i++) {
    checkIpcErrors(ipcCloseShareableHandle(shHandle[i]));
  }
  checkIpcErrors(ipcCloseSocket(ipcChildHandle));

  for (int i = 0; i < procCount; i++) {
    size_t bufferId = (i + id) % procCount;

    // Build arguments to be passed to the CUDA kernel.
    CUdeviceptr ptr = d_ptr + (bufferId * DATA_BUF_SIZE);
    int size = DATA_BUF_SIZE;
    char val = (char)id;

    void *args[] = {&ptr, &size, &val};

    // Push a simple kernel on the buffer.
    checkCudaErrors(cuLaunchKernel(_memMapIpc_kernel, blocks, 1, 1, threads, 1,
                                   1, 0, stream, args, 0));
    checkCudaErrors(cuStreamSynchronize(stream));

    // Wait for all my sibling processes to push this stage of their work
    // before proceeding to the next. This makes the data in the buffer
    // deterministic.
    barrierWait(&shm->barrier, &shm->sense, (unsigned int)procCount);
    if (id == 0) {
      printf("Step %llu done\n", (unsigned long long)i);
    }
  }

  printf("Process %d: verifying...\n", id);

  // Copy the data onto the host and verify that it matches the expected
  // value. At step i process `id` fills buffer (i + id) % procCount, so
  // after procCount barrier-separated steps the last writer of buffer `id`
  // is the sibling (id + 1) % procCount.
  std::vector<char> verification_buffer(DATA_BUF_SIZE);
  checkCudaErrors(cuMemcpyDtoHAsync(&verification_buffer[0],
                                    d_ptr + (id * DATA_BUF_SIZE),
                                    DATA_BUF_SIZE, stream));
  checkCudaErrors(cuStreamSynchronize(stream));

  // The contents should have the id of the sibling just after me.
  char compareId = (char)((id + 1) % procCount);
  for (unsigned long long j = 0; j < DATA_BUF_SIZE; j++) {
    if (verification_buffer[j] != compareId) {
      printf("Process %d: Verification mismatch at %llu: %d != %d\n", id, j,
             (int)verification_buffer[j], (int)compareId);
      break;
    }
  }

  // Clean up!
  checkCudaErrors(cuStreamDestroy(stream));
  checkCudaErrors(cuCtxDestroy(ctx));

  // Unmap the allocations from our address space. Unmapping will also free
  // the backing stores, as we have already released the imported handles
  // with cuMemRelease. Finally, free up the virtual address space we
  // reserved with cuMemAddressReserve.
  memMapUnmapAndFreeMemory(d_ptr, procCount * DATA_BUF_SIZE);

  exit(EXIT_SUCCESS);
}
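// Overall handshake between parentProcess (below) and each childProcess
// (above), summarizing the code in this file:
//
//   parent: cuMemCreate + cuMemExportToShareableHandle, one per child
//   parent: spawn children, then all rendezvous on the shm barrier
//   parent: ipcSendShareableHandles --socket--> child: ipcRecvShareableHandles
//   child:  import, cuMemMap, launch kernels, verify, exit
//   parent: waitProcess on every child, then cuMemRelease each allocation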
static void parentProcess(char *app) {
  int devCount, i, nprocesses = 0;
  volatile shmStruct *shm = NULL;
  sharedMemoryInfo info;
  std::vector<Process> processes;

  checkCudaErrors(cuDeviceGetCount(&devCount));
  std::vector<CUdevice> devices(devCount);

  if (sharedMemoryCreate(shmName, sizeof(*shm), &info) != 0) {
    printf("Failed to create shared memory slab\n");
    exit(EXIT_FAILURE);
  }
  shm = (volatile shmStruct *)info.addr;
  memset((void *)shm, 0, sizeof(*shm));

  for (i = 0; i < devCount; i++) {
    checkCudaErrors(cuDeviceGet(&devices[i], i));
  }

  std::vector<CUcontext> ctxs;
  std::vector<unsigned char> selectedDevices;

  // Pick all the devices that can access each other's memory for this test.
  // Keep in mind that CUDA has minimal support for fork() without a
  // corresponding exec() in the child process, but in this case our
  // spawnProcess will always exec, so no need to worry.
  for (i = 0; i < devCount; i++) {
    bool allPeers = true;
    int deviceComputeMode;
    int deviceSupportsIpcHandle;
    int attributeVal = 0;

    checkCudaErrors(cuDeviceGet(&devices[i], i));
    checkCudaErrors(cuDeviceGetAttribute(
        &deviceComputeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, devices[i]));
    checkCudaErrors(cuDeviceGetAttribute(
        &attributeVal,
        CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, devices[i]));
#if defined(__linux__)
    checkCudaErrors(cuDeviceGetAttribute(
        &deviceSupportsIpcHandle,
        CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED,
        devices[i]));
#else
    checkCudaErrors(cuDeviceGetAttribute(
        &deviceSupportsIpcHandle,
        CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED, devices[i]));
#endif

    // Check that the selected device supports virtual address management.
    if (attributeVal == 0) {
      printf("Device %d doesn't support VIRTUAL ADDRESS MANAGEMENT.\n",
             devices[i]);
      continue;
    }

    // This sample requires two processes accessing each device, so we need
    // to ensure exclusive or prohibited mode is not set.
    if (deviceComputeMode != CU_COMPUTEMODE_DEFAULT) {
      printf("Device %d is in an unsupported compute mode for this sample\n",
             i);
      continue;
    }

    if (!deviceSupportsIpcHandle) {
      printf(
          "Device %d does not support requested handle type for IPC, "
          "skipping...\n",
          i);
      continue;
    }

    for (int j = 0; j < nprocesses; j++) {
      int canAccessPeerIJ, canAccessPeerJI;
      checkCudaErrors(
          cuDeviceCanAccessPeer(&canAccessPeerJI, devices[j], devices[i]));
      checkCudaErrors(
          cuDeviceCanAccessPeer(&canAccessPeerIJ, devices[i], devices[j]));
      if (!canAccessPeerIJ || !canAccessPeerJI) {
        allPeers = false;
        break;
      }
    }

    if (allPeers) {
      CUcontext ctx;
      checkCudaErrors(cuCtxCreate(&ctx, 0, devices[i]));
      ctxs.push_back(ctx);

      // Enable peers here. This isn't necessary for IPC, but it will set up
      // the peers for the device. For systems that only allow 8 peers per
      // GPU at a time, this acts to remove devices from CanAccessPeer.
      // Note: ctxs is indexed by selection order, not by device index, so
      // the context just created lives at ctxs[nprocesses].
      for (int j = 0; j < nprocesses; j++) {
        checkCudaErrors(cuCtxSetCurrent(ctxs[nprocesses]));
        checkCudaErrors(cuCtxEnablePeerAccess(ctxs[j], 0));
        checkCudaErrors(cuCtxSetCurrent(ctxs[j]));
        checkCudaErrors(cuCtxEnablePeerAccess(ctxs[nprocesses], 0));
      }

      selectedDevices.push_back(i);
      nprocesses++;
      if (nprocesses >= MAX_DEVICES) {
        break;
      }
    } else {
      printf(
          "Device %d is not peer capable with some other selected peers, "
          "skipping\n",
          i);
    }
  }

  for (int i = 0; i < ctxs.size(); ++i) {
    checkCudaErrors(cuCtxDestroy(ctxs[i]));
  }

  if (nprocesses == 0) {
    printf("No CUDA devices support IPC\n");
    exit(EXIT_WAIVED);
  }
  shm->nprocesses = nprocesses;

  unsigned char firstSelectedDevice = selectedDevices[0];

  std::vector<ShareableHandle> shHandles(nprocesses);
  std::vector<CUmemGenericAllocationHandle> allocationHandles(nprocesses);

  // Allocate `nprocesses` memory chunks and obtain a shareable handle for
  // each allocation. Share all memory allocations with all children.
  memMapAllocateAndExportMemory(firstSelectedDevice, DATA_BUF_SIZE,
                                allocationHandles, shHandles);
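  // Note: the shareable handles are sent to the children over a local
  // socket (ipcSendShareableHandles below) rather than through the
  // shared-memory slab. On Linux a file descriptor is only meaningful
  // inside its owning process; it has to be duplicated into the receiver,
  // which the helper implements with SCM_RIGHTS ancillary data on a Unix
  // domain socket. On Windows the helper similarly duplicates the NT
  // HANDLE into each child process.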
  // Launch the child processes!
  for (i = 0; i < nprocesses; i++) {
    char devIdx[10];
    char procIdx[10];
    char *const args[] = {app, devIdx, procIdx, NULL};
    Process process;

    SPRINTF(devIdx, "%d", selectedDevices[i]);
    SPRINTF(procIdx, "%d", i);

    if (spawnProcess(&process, app, args)) {
      printf("Failed to create process\n");
      exit(EXIT_FAILURE);
    }

    processes.push_back(process);
  }

  barrierWait(&shm->barrier, &shm->sense, (unsigned int)(nprocesses + 1));

  ipcHandle *ipcParentHandle = NULL;
  checkIpcErrors(ipcCreateSocket(ipcParentHandle, ipcName, processes));
  checkIpcErrors(
      ipcSendShareableHandles(ipcParentHandle, shHandles, processes));

  // Close the shareable handles as they are not needed anymore.
  for (int i = 0; i < nprocesses; i++) {
    checkIpcErrors(ipcCloseShareableHandle(shHandles[i]));
  }

  // And wait for them to finish.
  for (i = 0; i < processes.size(); i++) {
    if (waitProcess(&processes[i]) != EXIT_SUCCESS) {
      printf("Process %d failed!\n", i);
      exit(EXIT_FAILURE);
    }
  }

  for (i = 0; i < nprocesses; i++) {
    checkCudaErrors(cuMemRelease(allocationHandles[i]));
  }

  checkIpcErrors(ipcCloseSocket(ipcParentHandle));
  sharedMemoryClose(&info);
}

// Host code
int main(int argc, char **argv) {
#if defined(__arm__) || defined(__aarch64__)
  printf("Not supported on ARM\n");
  return EXIT_WAIVED;
#else
  // Initialize
  checkCudaErrors(cuInit(0));

  if (argc == 1) {
    parentProcess(argv[0]);
  } else {
    childProcess(atoi(argv[1]), atoi(argv[2]), argv);
  }
  return EXIT_SUCCESS;
#endif
}

bool inline findModulePath(const char *module_file, string &module_path,
                           char **argv, string &ptx_source) {
  char *actual_path = sdkFindFilePath(module_file, argv[0]);

  if (actual_path) {
    module_path = actual_path;
  } else {
    printf("> findModulePath file not found: <%s> \n", module_file);
    return false;
  }

  if (module_path.empty()) {
    printf("> findModulePath could not find file: <%s> \n", module_file);
    return false;
  } else {
    printf("> findModulePath found file at <%s>\n", module_path.c_str());
    if (module_path.rfind(".ptx") != string::npos) {
      FILE *fp = fopen(module_path.c_str(), "rb");
      if (!fp) {  // Guard against a path that exists but cannot be opened.
        printf("> findModulePath could not open file: <%s>\n",
               module_path.c_str());
        return false;
      }
      fseek(fp, 0, SEEK_END);
      int file_size = ftell(fp);
      char *buf = new char[file_size + 1];
      fseek(fp, 0, SEEK_SET);
      fread(buf, sizeof(char), file_size, fp);
      fclose(fp);
      buf[file_size] = '\0';
      ptx_source = buf;
      delete[] buf;
    }
    return true;
  }
}
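// Usage sketch: invoke the binary with no arguments. main() takes the
// parentProcess() path, which selects peer-capable devices, exports one
// buffer per selected device, and respawns this same binary as
// `<app> <deviceIdx> <procIdx>` for each of them; those invocations take
// the childProcess() path above.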