diff --git a/Common/nvrtc_helper.h b/Common/nvrtc_helper.h index 4544b194..aab3c681 100644 --- a/Common/nvrtc_helper.h +++ b/Common/nvrtc_helper.h @@ -187,6 +187,7 @@ CUmodule loadCUBIN(char *cubin, int argc, char **argv) { CUcontext context; int major = 0, minor = 0; char deviceName[256]; + CUctxCreateParams ctxCreateParams = {}; // Picks the best CUDA device available CUdevice cuDevice = findCudaDeviceDRV(argc, (const char **)argv); @@ -200,7 +201,7 @@ CUmodule loadCUBIN(char *cubin, int argc, char **argv) { printf("> GPU Device has SM %d.%d compute capability\n", major, minor); checkCudaErrors(cuInit(0)); - checkCudaErrors(cuCtxCreate(&context, 0, cuDevice)); + checkCudaErrors(cuCtxCreate(&context, &ctxCreateParams, 0, cuDevice)); checkCudaErrors(cuModuleLoadData(&module, cubin)); free(cubin); diff --git a/Samples/0_Introduction/UnifiedMemoryStreams/UnifiedMemoryStreams.cu b/Samples/0_Introduction/UnifiedMemoryStreams/UnifiedMemoryStreams.cu index ca5cfcbd..cbede7d6 100644 --- a/Samples/0_Introduction/UnifiedMemoryStreams/UnifiedMemoryStreams.cu +++ b/Samples/0_Introduction/UnifiedMemoryStreams/UnifiedMemoryStreams.cu @@ -247,7 +247,9 @@ int main(int argc, char **argv) exit(EXIT_WAIVED); } - if (device_prop.computeMode == cudaComputeModeProhibited) { + int computeMode; + checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, dev_id)); + if (computeMode == cudaComputeModeProhibited) { // This sample requires being run with a default or process exclusive mode fprintf(stderr, "This sample requires a device in either default or process " diff --git a/Samples/0_Introduction/matrixMulDrv/matrixMulDrv.cpp b/Samples/0_Introduction/matrixMulDrv/matrixMulDrv.cpp index 8f3c83d3..815801b8 100644 --- a/Samples/0_Introduction/matrixMulDrv/matrixMulDrv.cpp +++ b/Samples/0_Introduction/matrixMulDrv/matrixMulDrv.cpp @@ -268,9 +268,10 @@ void randomInit(float *data, int size) static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *blk_size) { 
- CUfunction cuFunction = 0; - int major = 0, minor = 0; - char deviceName[100]; + CUfunction cuFunction = 0; + int major = 0, minor = 0; + char deviceName[100]; + CUctxCreateParams ctxCreateParams = {}; cuDevice = findCudaDeviceDRV(argc, (const char **)argv); @@ -283,7 +284,7 @@ static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *blk_size checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, cuDevice)); printf(" Total amount of global memory: %llu bytes\n", (long long unsigned int)totalGlobalMem); - checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice)); + checkCudaErrors(cuCtxCreate(&cuContext, &ctxCreateParams, 0, cuDevice)); // first search for the module path before we load the results std::string module_path; diff --git a/Samples/0_Introduction/simpleDrvRuntime/simpleDrvRuntime.cpp b/Samples/0_Introduction/simpleDrvRuntime/simpleDrvRuntime.cpp index 77bcf10a..d4499e00 100644 --- a/Samples/0_Introduction/simpleDrvRuntime/simpleDrvRuntime.cpp +++ b/Samples/0_Introduction/simpleDrvRuntime/simpleDrvRuntime.cpp @@ -80,19 +80,20 @@ static void check(CUresult result, char const *const func, const char *const fil int main(int argc, char **argv) { printf("simpleDrvRuntime..\n"); - int N = 50000, devID = 0; - size_t size = N * sizeof(float); - CUdevice cuDevice; - CUfunction vecAdd_kernel; - CUmodule cuModule = 0; - CUcontext cuContext; + int N = 50000, devID = 0; + size_t size = N * sizeof(float); + CUdevice cuDevice; + CUfunction vecAdd_kernel; + CUmodule cuModule = 0; + CUcontext cuContext; + CUctxCreateParams ctxCreateParams = {}; // Initialize checkCudaDrvErrors(cuInit(0)); cuDevice = findCudaDevice(argc, (const char **)argv); // Create context - checkCudaDrvErrors(cuCtxCreate(&cuContext, 0, cuDevice)); + checkCudaDrvErrors(cuCtxCreate(&cuContext, &ctxCreateParams, 0, cuDevice)); // first search for the module path before we load the results string module_path; diff --git a/Samples/0_Introduction/simpleHyperQ/simpleHyperQ.cu 
b/Samples/0_Introduction/simpleHyperQ/simpleHyperQ.cu index 2972d88b..2a1ac958 100644 --- a/Samples/0_Introduction/simpleHyperQ/simpleHyperQ.cu +++ b/Samples/0_Introduction/simpleHyperQ/simpleHyperQ.cu @@ -127,6 +127,10 @@ int main(int argc, char **argv) checkCudaErrors(cudaGetDevice(&cuda_device)); checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device)); + // Get device clock rate + int clockRate; + checkCudaErrors(cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, cuda_device)); + // HyperQ is available in devices of Compute Capability 3.5 and higher if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5)) { if (deviceProp.concurrentKernels == 0) { @@ -170,9 +174,9 @@ int main(int argc, char **argv) #if defined(__arm__) || defined(__aarch64__) // the kernel takes more time than the channel reset time on arm archs, so to // prevent hangs reduce time_clocks. - clock_t time_clocks = (clock_t)(kernel_time * (deviceProp.clockRate / 100)); + clock_t time_clocks = (clock_t)(kernel_time * (clockRate / 100)); #else - clock_t time_clocks = (clock_t)(kernel_time * deviceProp.clockRate); + clock_t time_clocks = (clock_t)(kernel_time * clockRate); #endif clock_t total_clocks = 0; diff --git a/Samples/0_Introduction/simpleIPC/simpleIPC.cu b/Samples/0_Introduction/simpleIPC/simpleIPC.cu index ab59fc4d..29403ca5 100644 --- a/Samples/0_Introduction/simpleIPC/simpleIPC.cu +++ b/Samples/0_Introduction/simpleIPC/simpleIPC.cu @@ -247,7 +247,9 @@ static void parentProcess(char *app) } // This sample requires two processes accessing each device, so we need // to ensure exclusive or prohibited mode is not set - if (prop.computeMode != cudaComputeModeDefault) { + int computeMode; + checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, i)); + if (computeMode != cudaComputeModeDefault) { printf("Device %d is in an unsupported compute mode for this sample\n", i); continue; } diff --git 
a/Samples/0_Introduction/simpleMultiCopy/simpleMultiCopy.cu b/Samples/0_Introduction/simpleMultiCopy/simpleMultiCopy.cu index 20e0d450..de28f396 100644 --- a/Samples/0_Introduction/simpleMultiCopy/simpleMultiCopy.cu +++ b/Samples/0_Introduction/simpleMultiCopy/simpleMultiCopy.cu @@ -218,9 +218,11 @@ int main(int argc, char *argv[]) printf("\n"); printf("Relevant properties of this CUDA device\n"); + int canOverlap; + checkCudaErrors(cudaDeviceGetAttribute(&canOverlap, cudaDevAttrGpuOverlap, cuda_device)); printf("(%s) Can overlap one CPU<>GPU data transfer with GPU kernel execution " - "(device property \"deviceOverlap\")\n", - deviceProp.deviceOverlap ? "X" : " "); + "(device property \"cudaDevAttrGpuOverlap\")\n", + canOverlap ? "X" : " "); // printf("(%s) Can execute several GPU kernels simultaneously (compute // capability >= 2.0)\n", deviceProp.major >= 2 ? "X": " "); printf("(%s) Can overlap two CPU<>GPU data transfers with GPU kernel execution\n" diff --git a/Samples/0_Introduction/simpleTextureDrv/simpleTextureDrv.cpp b/Samples/0_Introduction/simpleTextureDrv/simpleTextureDrv.cpp index 322b7eb5..aa6b2627 100644 --- a/Samples/0_Introduction/simpleTextureDrv/simpleTextureDrv.cpp +++ b/Samples/0_Introduction/simpleTextureDrv/simpleTextureDrv.cpp @@ -309,10 +309,11 @@ void runTest(int argc, char **argv) //////////////////////////////////////////////////////////////////////////////// static CUresult initCUDA(int argc, char **argv, CUfunction *transform) { - CUfunction cuFunction = 0; - int major = 0, minor = 0, devID = 0; - char deviceName[100]; - string module_path; + CUfunction cuFunction = 0; + int major = 0, minor = 0, devID = 0; + char deviceName[100]; + string module_path; + CUctxCreateParams ctxCreateParams = {}; cuDevice = findCudaDeviceDRV(argc, (const char **)argv); @@ -322,7 +323,7 @@ static CUresult initCUDA(int argc, char **argv, CUfunction *transform) checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice)); printf("> GPU Device 
has SM %d.%d compute capability\n", major, minor); - checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice)); + checkCudaErrors(cuCtxCreate(&cuContext, &ctxCreateParams, 0, cuDevice)); // first search for the module_path before we try to load the results std::ostringstream fatbin; diff --git a/Samples/0_Introduction/systemWideAtomics/systemWideAtomics.cu b/Samples/0_Introduction/systemWideAtomics/systemWideAtomics.cu index 28e40ebc..121ba207 100644 --- a/Samples/0_Introduction/systemWideAtomics/systemWideAtomics.cu +++ b/Samples/0_Introduction/systemWideAtomics/systemWideAtomics.cu @@ -287,7 +287,9 @@ int main(int argc, char **argv) exit(EXIT_WAIVED); } - if (device_prop.computeMode == cudaComputeModeProhibited) { + int computeMode; + checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, dev_id)); + if (computeMode == cudaComputeModeProhibited) { // This sample requires being run with a default or process exclusive mode fprintf(stderr, "This sample requires a device in either default or process " diff --git a/Samples/0_Introduction/vectorAddDrv/vectorAddDrv.cpp b/Samples/0_Introduction/vectorAddDrv/vectorAddDrv.cpp index 16f6cbfd..e6a1a051 100644 --- a/Samples/0_Introduction/vectorAddDrv/vectorAddDrv.cpp +++ b/Samples/0_Introduction/vectorAddDrv/vectorAddDrv.cpp @@ -75,15 +75,16 @@ bool findModulePath(const char *, string &, char **, string &); int main(int argc, char **argv) { printf("Vector Addition (Driver API)\n"); - int N = 50000, devID = 0; - size_t size = N * sizeof(float); + int N = 50000, devID = 0; + size_t size = N * sizeof(float); + CUctxCreateParams ctxCreateParams = {}; // Initialize checkCudaErrors(cuInit(0)); cuDevice = findCudaDeviceDRV(argc, (const char **)argv); // Create context - checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice)); + checkCudaErrors(cuCtxCreate(&cuContext, &ctxCreateParams, 0, cuDevice)); // first search for the module path before we load the results string module_path; diff --git 
a/Samples/1_Utilities/deviceQuery/deviceQuery.cpp b/Samples/1_Utilities/deviceQuery/deviceQuery.cpp index 4deffb87..4d7da97b 100644 --- a/Samples/1_Utilities/deviceQuery/deviceQuery.cpp +++ b/Samples/1_Utilities/deviceQuery/deviceQuery.cpp @@ -59,6 +59,7 @@ template inline void getCudaAttribute(T *attribute, CUdevice_attribute #endif /* CUDART_VERSION < 5000 */ + //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// @@ -128,14 +129,20 @@ int main(int argc, char **argv) deviceProp.multiProcessorCount, _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor), _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount); + int clockRate; + checkCudaErrors(cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, dev)); printf(" GPU Max Clock rate: %.0f MHz (%0.2f " "GHz)\n", - deviceProp.clockRate * 1e-3f, - deviceProp.clockRate * 1e-6f); - + clockRate * 1e-3f, + clockRate * 1e-6f); #if CUDART_VERSION >= 5000 - // This is supported in CUDA 5.0 (runtime API device properties) - printf(" Memory Clock rate: %.0f Mhz\n", deviceProp.memoryClockRate * 1e-3f); + int memoryClockRate; +#if CUDART_VERSION >= 13000 + checkCudaErrors(cudaDeviceGetAttribute(&memoryClockRate, cudaDevAttrMemoryClockRate, dev)); +#else + memoryClockRate = deviceProp.memoryClockRate; +#endif + printf(" Memory Clock rate: %.0f Mhz\n", memoryClockRate * 1e-3f); printf(" Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth); if (deviceProp.l2CacheSize) { @@ -194,12 +201,15 @@ int main(int argc, char **argv) deviceProp.maxGridSize[2]); printf(" Maximum memory pitch: %zu bytes\n", deviceProp.memPitch); printf(" Texture alignment: %zu bytes\n", deviceProp.textureAlignment); + int gpuOverlap; + checkCudaErrors(cudaDeviceGetAttribute(&gpuOverlap, cudaDevAttrGpuOverlap, dev)); printf(" Concurrent copy and kernel execution: %s with %d copy " "engine(s)\n", - 
(deviceProp.deviceOverlap ? "Yes" : "No"), + (gpuOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount); - printf(" Run time limit on kernels: %s\n", - deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No"); + int kernelExecTimeout; + checkCudaErrors(cudaDeviceGetAttribute(&kernelExecTimeout, cudaDevAttrKernelExecTimeout, dev)); + printf(" Run time limit on kernels: %s\n", kernelExecTimeout ? "Yes" : "No"); printf(" Integrated GPU sharing Host Memory: %s\n", deviceProp.integrated ? "Yes" : "No"); printf(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No"); printf(" Alignment requirement for Surfaces: %s\n", deviceProp.surfaceAlignment ? "Yes" : "No"); @@ -213,8 +223,11 @@ int main(int argc, char **argv) printf(" Device supports Compute Preemption: %s\n", deviceProp.computePreemptionSupported ? "Yes" : "No"); printf(" Supports Cooperative Kernel Launch: %s\n", deviceProp.cooperativeLaunch ? "Yes" : "No"); + // The property cooperativeMultiDeviceLaunch is deprecated in CUDA 13.0 +#if CUDART_VERSION < 13000 printf(" Supports MultiDevice Co-op Kernel Launch: %s\n", deviceProp.cooperativeMultiDeviceLaunch ? 
"Yes" : "No"); +#endif printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n", deviceProp.pciDomainID, deviceProp.pciBusID, @@ -230,8 +243,10 @@ int main(int argc, char **argv) "::cudaSetDevice() with this device)", "Unknown", NULL}; + int computeMode; + checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, dev)); printf(" Compute Mode:\n"); - printf(" < %s >\n", sComputeMode[deviceProp.computeMode]); + printf(" < %s >\n", sComputeMode[computeMode]); } // If there are 2 or more GPUs, query to determine whether RDMA is supported diff --git a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/cuda_consumer.cpp b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/cuda_consumer.cpp index a1bbc02a..fc1c5418 100644 --- a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/cuda_consumer.cpp +++ b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/cuda_consumer.cpp @@ -190,8 +190,9 @@ done: CUresult cudaDeviceCreateConsumer(test_cuda_consumer_s *cudaConsumer) { - CUdevice device; - CUresult status = CUDA_SUCCESS; + CUdevice device; + CUresult status = CUDA_SUCCESS; + CUctxCreateParams ctxCreateParams = {}; if (CUDA_SUCCESS != (status = cuInit(0))) { printf("Failed to initialize CUDA\n"); @@ -203,7 +204,7 @@ CUresult cudaDeviceCreateConsumer(test_cuda_consumer_s *cudaConsumer) return status; } - if (CUDA_SUCCESS != (status = cuCtxCreate(&cudaConsumer->context, 0, device))) { + if (CUDA_SUCCESS != (status = cuCtxCreate(&cudaConsumer->context, &ctxCreateParams, 0, device))) { printf("failed to create CUDA context\n"); return status; } diff --git a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/cuda_producer.cpp b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/cuda_producer.cpp index e862e541..2416bb37 100644 --- a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/cuda_producer.cpp +++ b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/cuda_producer.cpp @@ -182,8 +182,9 @@ 
done: CUresult cudaDeviceCreateProducer(test_cuda_producer_s *cudaProducer) { - CUdevice device; - CUresult status = CUDA_SUCCESS; + CUdevice device; + CUresult status = CUDA_SUCCESS; + CUctxCreateParams ctxCreateParams = {}; if (CUDA_SUCCESS != (status = cuInit(0))) { printf("Failed to initialize CUDA\n"); @@ -195,7 +196,7 @@ CUresult cudaDeviceCreateProducer(test_cuda_producer_s *cudaProducer) return status; } - if (CUDA_SUCCESS != (status = cuCtxCreate(&cudaProducer->context, 0, device))) { + if (CUDA_SUCCESS != (status = cuCtxCreate(&cudaProducer->context, &ctxCreateParams, 0, device))) { printf("failed to create CUDA context\n"); return status; } diff --git a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/cuda_consumer.cpp b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/cuda_consumer.cpp index 358b8a16..08f4b8c8 100644 --- a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/cuda_consumer.cpp +++ b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/cuda_consumer.cpp @@ -302,7 +302,8 @@ CUresult cudaDeviceCreateConsumer(test_cuda_consumer_s *cudaConsumer, CUdevice d major, minor); - if (CUDA_SUCCESS != (status = cuCtxCreate(&cudaConsumer->context, 0, device))) { + CUctxCreateParams ctxCreateParams = {}; + if (CUDA_SUCCESS != (status = cuCtxCreate(&cudaConsumer->context, &ctxCreateParams, 0, device))) { printf("failed to create CUDA context\n"); return status; } diff --git a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/cuda_producer.cpp b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/cuda_producer.cpp index 6d356841..c379291f 100644 --- a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/cuda_producer.cpp +++ b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/cuda_producer.cpp @@ -316,7 +316,8 @@ CUresult cudaDeviceCreateProducer(test_cuda_producer_s *cudaProducer, CUdevice d exit(2); // EXIT_WAIVED } - if (CUDA_SUCCESS != (status = cuCtxCreate(&cudaProducer->context, 0, device))) { + 
CUctxCreateParams ctxCreateParams = {}; + if (CUDA_SUCCESS != (status = cuCtxCreate(&cudaProducer->context, &ctxCreateParams, 0, device))) { printf("failed to create CUDA context\n"); return status; } diff --git a/Samples/2_Concepts_and_Techniques/segmentationTreeThrust/segmentationTree.cu b/Samples/2_Concepts_and_Techniques/segmentationTreeThrust/segmentationTree.cu index 7d7c84df..b0c63605 100644 --- a/Samples/2_Concepts_and_Techniques/segmentationTreeThrust/segmentationTree.cu +++ b/Samples/2_Concepts_and_Techniques/segmentationTreeThrust/segmentationTree.cu @@ -69,6 +69,9 @@ #include #include +// for cuda::std::identity +#include <cuda/std/functional> + // Sample framework includes. #include #include @@ -680,7 +683,7 @@ private: thrust::make_counting_iterator(validEdgesCount), dEdgesFlags, dVertices_, - thrust::identity<uint>()) + cuda::std::identity()) .get(); pools.uintEdges.put(dEdgesFlags); diff --git a/Samples/2_Concepts_and_Techniques/streamOrderedAllocationIPC/streamOrderedAllocationIPC.cu b/Samples/2_Concepts_and_Techniques/streamOrderedAllocationIPC/streamOrderedAllocationIPC.cu index 3bfa9fe5..b473e8da 100644 --- a/Samples/2_Concepts_and_Techniques/streamOrderedAllocationIPC/streamOrderedAllocationIPC.cu +++ b/Samples/2_Concepts_and_Techniques/streamOrderedAllocationIPC/streamOrderedAllocationIPC.cu @@ -322,7 +322,9 @@ static void parentProcess(char *app) } // This sample requires two processes accessing each device, so we need // to ensure exclusive or prohibited mode is not set - if (prop.computeMode != cudaComputeModeDefault) { + int computeMode; + checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, i)); + if (computeMode != cudaComputeModeDefault) { printf("Device %d is in an unsupported compute mode for this sample\n", i); continue; } diff --git a/Samples/2_Concepts_and_Techniques/threadMigration/threadMigration.cpp b/Samples/2_Concepts_and_Techniques/threadMigration/threadMigration.cpp index 96c280aa..f6f4cdfc 100644 ---
a/Samples/2_Concepts_and_Techniques/threadMigration/threadMigration.cpp +++ b/Samples/2_Concepts_and_Techniques/threadMigration/threadMigration.cpp @@ -118,13 +118,14 @@ bool runTest(int argc, char **argv); // to be setup and the CUDA module (CUBIN) is built by NVCC static CUresult InitCUDAContext(CUDAContext *pContext, CUdevice hcuDevice, int deviceID, char **argv) { - CUcontext hcuContext = 0; - CUmodule hcuModule = 0; - CUfunction hcuFunction = 0; - CUdeviceptr dptr = 0; + CUcontext hcuContext = 0; + CUmodule hcuModule = 0; + CUfunction hcuFunction = 0; + CUdeviceptr dptr = 0; + CUctxCreateParams ctxCreateParams = {}; // cuCtxCreate: Function works on floating contexts and current context - CUresult status = cuCtxCreate(&hcuContext, 0, hcuDevice); + CUresult status = cuCtxCreate(&hcuContext, &ctxCreateParams, 0, hcuDevice); if (CUDA_SUCCESS != status) { fprintf(stderr, "cuCtxCreate for failed %d\n", deviceID, status); diff --git a/Samples/3_CUDA_Features/graphConditionalNodes/graphConditionalNodes.cu b/Samples/3_CUDA_Features/graphConditionalNodes/graphConditionalNodes.cu index 5fbdbfbe..22ade6a7 100644 --- a/Samples/3_CUDA_Features/graphConditionalNodes/graphConditionalNodes.cu +++ b/Samples/3_CUDA_Features/graphConditionalNodes/graphConditionalNodes.cu @@ -97,13 +97,13 @@ void simpleIfGraph(void) params.kernel.kernelParams = kernelArgs; kernelArgs[0] = &dPtr; kernelArgs[1] = &handle; - checkCudaErrors(cudaGraphAddNode(&kernelNode, graph, NULL, 0, &params)); + checkCudaErrors(cudaGraphAddNode(&kernelNode, graph, NULL, NULL, 0, &params)); cudaGraphNodeParams cParams = {cudaGraphNodeTypeConditional}; cParams.conditional.handle = handle; cParams.conditional.type = cudaGraphCondTypeIf; cParams.conditional.size = 1; - checkCudaErrors(cudaGraphAddNode(&conditionalNode, graph, &kernelNode, 1, &cParams)); + checkCudaErrors(cudaGraphAddNode(&conditionalNode, graph, &kernelNode, NULL, 1, &cParams)); cudaGraph_t bodyGraph = cParams.conditional.phGraph_out[0]; @@ -111,7 +111,7 @@
void simpleIfGraph(void) cudaGraphNode_t bodyNode; params.kernel.func = (void *)ifGraphKernelC; params.kernel.kernelParams = nullptr; - checkCudaErrors(cudaGraphAddNode(&bodyNode, bodyGraph, NULL, 0, &params)); + checkCudaErrors(cudaGraphAddNode(&bodyNode, bodyGraph, NULL, NULL, 0, &params)); checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); @@ -182,7 +182,7 @@ void simpleDoWhileGraph(void) cParams.conditional.handle = handle; cParams.conditional.type = cudaGraphCondTypeWhile; cParams.conditional.size = 1; - checkCudaErrors(cudaGraphAddNode(&conditionalNode, graph, NULL, 0, &cParams)); + checkCudaErrors(cudaGraphAddNode(&conditionalNode, graph, NULL, NULL, 0, &cParams)); cudaGraph_t bodyGraph = cParams.conditional.phGraph_out[0]; @@ -267,7 +267,8 @@ void capturedWhileGraph(void) checkCudaErrors(cudaStreamBeginCapture(captureStream, cudaStreamCaptureModeGlobal)); // Obtain the handle of the graph - checkCudaErrors(cudaStreamGetCaptureInfo(captureStream, &status, NULL, &graph, &dependencies, &numDependencies)); + checkCudaErrors( + cudaStreamGetCaptureInfo(captureStream, &status, NULL, &graph, &dependencies, NULL, &numDependencies)); // Create the conditional handle cudaGraphConditionalHandle handle; @@ -277,7 +278,8 @@ void capturedWhileGraph(void) capturedWhileKernel<<<1, 1, 0, captureStream>>>(dPtr, handle); // Obtain the handle for node A - checkCudaErrors(cudaStreamGetCaptureInfo(captureStream, &status, NULL, &graph, &dependencies, &numDependencies)); + checkCudaErrors( + cudaStreamGetCaptureInfo(captureStream, &status, NULL, &graph, &dependencies, NULL, &numDependencies)); // Insert conditional node B cudaGraphNode_t conditionalNode; @@ -285,13 +287,13 @@ void capturedWhileGraph(void) cParams.conditional.handle = handle; cParams.conditional.type = cudaGraphCondTypeWhile; cParams.conditional.size = 1; - checkCudaErrors(cudaGraphAddNode(&conditionalNode, graph, dependencies, numDependencies, &cParams)); +
checkCudaErrors(cudaGraphAddNode(&conditionalNode, graph, dependencies, NULL, numDependencies, &cParams)); cudaGraph_t bodyGraph = cParams.conditional.phGraph_out[0]; // Update stream capture dependencies to account for the node we manually added - checkCudaErrors( - cudaStreamUpdateCaptureDependencies(captureStream, &conditionalNode, 1, cudaStreamSetCaptureDependencies)); + checkCudaErrors(cudaStreamUpdateCaptureDependencies( + captureStream, &conditionalNode, NULL, 1, cudaStreamSetCaptureDependencies)); // Insert kernel node D capturedWhileEmptyKernel<<<1, 1, 0, captureStream>>>(); @@ -380,13 +382,13 @@ void simpleIfElseGraph(void) params.kernel.kernelParams = kernelArgs; kernelArgs[0] = &dPtr; kernelArgs[1] = &handle; - checkCudaErrors(cudaGraphAddNode(&kernelNode, graph, NULL, 0, &params)); + checkCudaErrors(cudaGraphAddNode(&kernelNode, graph, NULL, NULL, 0, &params)); cudaGraphNodeParams cParams = {cudaGraphNodeTypeConditional}; cParams.conditional.handle = handle; cParams.conditional.type = cudaGraphCondTypeIf; cParams.conditional.size = 2; // Set size to 2 to indicate an ELSE graph will be used - checkCudaErrors(cudaGraphAddNode(&conditionalNode, graph, &kernelNode, 1, &cParams)); + checkCudaErrors(cudaGraphAddNode(&conditionalNode, graph, &kernelNode, NULL, 1, &cParams)); cudaGraph_t bodyGraph = cParams.conditional.phGraph_out[0]; @@ -394,7 +396,7 @@ void simpleIfElseGraph(void) cudaGraphNode_t trueBodyNode; params.kernel.func = (void *)ifGraphKernelC; params.kernel.kernelParams = nullptr; - checkCudaErrors(cudaGraphAddNode(&trueBodyNode, bodyGraph, NULL, 0, &params)); + checkCudaErrors(cudaGraphAddNode(&trueBodyNode, bodyGraph, NULL, NULL, 0, &params)); // Populate the body of the second graph in the conditional node, executed if the condition is false bodyGraph = cParams.conditional.phGraph_out[1]; @@ -402,7 +404,7 @@ void simpleIfElseGraph(void) cudaGraphNode_t falseBodyNode; params.kernel.func = (void *)ifGraphKernelD; params.kernel.kernelParams = nullptr; -
checkCudaErrors(cudaGraphAddNode(&falseBodyNode, bodyGraph, NULL, 0, &params)); + checkCudaErrors(cudaGraphAddNode(&falseBodyNode, bodyGraph, NULL, NULL, 0, &params)); checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); @@ -484,25 +486,25 @@ void simpleSwitchGraph(void) params.kernel.kernelParams = kernelArgs; kernelArgs[0] = &dPtr; kernelArgs[1] = &handle; - checkCudaErrors(cudaGraphAddNode(&kernelNode, graph, NULL, 0, &params)); + checkCudaErrors(cudaGraphAddNode(&kernelNode, graph, NULL, NULL, 0, &params)); cudaGraphNodeParams cParams = {cudaGraphNodeTypeConditional}; cParams.conditional.handle = handle; cParams.conditional.type = cudaGraphCondTypeSwitch; cParams.conditional.size = 4; - checkCudaErrors(cudaGraphAddNode(&conditionalNode, graph, &kernelNode, 1, &cParams)); + checkCudaErrors(cudaGraphAddNode(&conditionalNode, graph, &kernelNode, NULL, 1, &cParams)); // Populate the four graph bodies within the SWITCH conditional graph cudaGraphNode_t bodyNode; params.kernel.kernelParams = nullptr; params.kernel.func = (void *)switchGraphKernelC; - checkCudaErrors(cudaGraphAddNode(&bodyNode, cParams.conditional.phGraph_out[0], NULL, 0, &params)); + checkCudaErrors(cudaGraphAddNode(&bodyNode, cParams.conditional.phGraph_out[0], NULL, NULL, 0, &params)); params.kernel.func = (void *)switchGraphKernelD; - checkCudaErrors(cudaGraphAddNode(&bodyNode, cParams.conditional.phGraph_out[1], NULL, 0, &params)); + checkCudaErrors(cudaGraphAddNode(&bodyNode, cParams.conditional.phGraph_out[1], NULL, NULL, 0, &params)); params.kernel.func = (void *)switchGraphKernelE; - checkCudaErrors(cudaGraphAddNode(&bodyNode, cParams.conditional.phGraph_out[2], NULL, 0, &params)); + checkCudaErrors(cudaGraphAddNode(&bodyNode, cParams.conditional.phGraph_out[2], NULL, NULL, 0, &params)); params.kernel.func = (void *)switchGraphKernelF; - checkCudaErrors(cudaGraphAddNode(&bodyNode, cParams.conditional.phGraph_out[3], NULL, 0, &params)); + checkCudaErrors(cudaGraphAddNode(&bodyNode, cParams.conditional.phGraph_out[3],
NULL, NULL, 0, &params)); checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); diff --git a/Samples/3_CUDA_Features/graphMemoryFootprint/graphMemoryFootprint.cu b/Samples/3_CUDA_Features/graphMemoryFootprint/graphMemoryFootprint.cu index 84fa45f1..8ca79852 100644 --- a/Samples/3_CUDA_Features/graphMemoryFootprint/graphMemoryFootprint.cu +++ b/Samples/3_CUDA_Features/graphMemoryFootprint/graphMemoryFootprint.cu @@ -149,9 +149,9 @@ void createSimpleAllocFreeGraph(cudaGraphExec_t *graphExec, float **dPtr, size_t checkCudaErrors(cudaGraphAddMemAllocNode(&allocNodeA, graph, NULL, 0, &allocParams)); *dPtr = (float *)allocParams.dptr; - cudaDeviceProp deviceProp; - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, device)); - clock_t time_clocks = (clock_t)((kernelTime / 1000.0) * deviceProp.clockRate); + int clockRate; + checkCudaErrors(cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, device)); + clock_t time_clocks = (clock_t)((kernelTime / 1000.0) * clockRate); void *blockDeviceArgs[1] = {(void *)&time_clocks}; diff --git a/Samples/3_CUDA_Features/memMapIPCDrv/memMapIpc.cpp b/Samples/3_CUDA_Features/memMapIPCDrv/memMapIpc.cpp index 0fe208d2..09937acc 100644 --- a/Samples/3_CUDA_Features/memMapIPCDrv/memMapIpc.cpp +++ b/Samples/3_CUDA_Features/memMapIPCDrv/memMapIpc.cpp @@ -340,13 +340,14 @@ static void childProcess(int devId, int id, char **argv) std::vector<ShareableHandle> shHandle(procCount); checkIpcErrors(ipcRecvShareableHandles(ipcChildHandle, shHandle)); - CUcontext ctx; - CUdevice device; - CUstream stream; - int multiProcessorCount; + CUcontext ctx; + CUdevice device; + CUstream stream; + int multiProcessorCount; + CUctxCreateParams ctx_params = {}; checkCudaErrors(cuDeviceGet(&device, devId)); - checkCudaErrors(cuCtxCreate(&ctx, 0, device)); + checkCudaErrors(cuCtxCreate(&ctx, &ctx_params, 0, device)); checkCudaErrors(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); // Obtain kernel function for the sample @@ -518,8 +519,9 @@ static void
parentProcess(char *app) } } if (allPeers) { - CUcontext ctx; - checkCudaErrors(cuCtxCreate(&ctx, 0, devices[i])); + CUcontext ctx; + CUctxCreateParams ctx_params = {}; + checkCudaErrors(cuCtxCreate(&ctx, &ctx_params, 0, devices[i])); ctxs.push_back(ctx); // Enable peers here. This isn't necessary for IPC, but it will diff --git a/Samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu b/Samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu index 3b20c063..6344d5e7 100644 --- a/Samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu +++ b/Samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu @@ -585,9 +585,12 @@ int main(int argc, char **argv) genTridiag(I, J, val_cpu, N, nz); memcpy(val, val_cpu, sizeof(float) * nz); - checkCudaErrors(cudaMemAdvise(I, sizeof(int) * (N + 1), cudaMemAdviseSetReadMostly, 0)); - checkCudaErrors(cudaMemAdvise(J, sizeof(int) * nz, cudaMemAdviseSetReadMostly, 0)); - checkCudaErrors(cudaMemAdvise(val, sizeof(float) * nz, cudaMemAdviseSetReadMostly, 0)); + cudaMemLocation deviceLoc; + deviceLoc.type = cudaMemLocationTypeDevice; + deviceLoc.id = 0; // Device location with initial device 0 + checkCudaErrors(cudaMemAdvise(I, sizeof(int) * (N + 1), cudaMemAdviseSetReadMostly, deviceLoc)); + checkCudaErrors(cudaMemAdvise(J, sizeof(int) * nz, cudaMemAdviseSetReadMostly, deviceLoc)); + checkCudaErrors(cudaMemAdvise(val, sizeof(float) * nz, cudaMemAdviseSetReadMostly, deviceLoc)); checkCudaErrors(cudaMallocManaged((void **)&x, sizeof(float) * N)); @@ -648,26 +651,30 @@ int main(int argc, char **argv) int offset_p = device_count * totalThreadsPerGPU; int offset_x = device_count * totalThreadsPerGPU; - checkCudaErrors(cudaMemPrefetchAsync(I, sizeof(int) * N, *deviceId, nStreams[device_count])); - checkCudaErrors(cudaMemPrefetchAsync(val, sizeof(float) * nz, *deviceId, nStreams[device_count])); - 
checkCudaErrors(cudaMemPrefetchAsync(J, sizeof(float) * nz, *deviceId, nStreams[device_count])); + // Create device location with specific device ID + cudaMemLocation deviceLoc; + deviceLoc.type = cudaMemLocationTypeDevice; + deviceLoc.id = *deviceId; + checkCudaErrors(cudaMemPrefetchAsync(I, sizeof(int) * N, deviceLoc, 0, nStreams[device_count])); + checkCudaErrors(cudaMemPrefetchAsync(val, sizeof(float) * nz, deviceLoc, 0, nStreams[device_count])); + checkCudaErrors(cudaMemPrefetchAsync(J, sizeof(float) * nz, deviceLoc, 0, nStreams[device_count])); if (offset_Ax <= N) { for (int i = 0; i < perGPUIter; i++) { cudaMemAdvise( - Ax + offset_Ax, sizeof(float) * totalThreadsPerGPU, cudaMemAdviseSetPreferredLocation, *deviceId); + Ax + offset_Ax, sizeof(float) * totalThreadsPerGPU, cudaMemAdviseSetPreferredLocation, deviceLoc); cudaMemAdvise( - r + offset_r, sizeof(float) * totalThreadsPerGPU, cudaMemAdviseSetPreferredLocation, *deviceId); + r + offset_r, sizeof(float) * totalThreadsPerGPU, cudaMemAdviseSetPreferredLocation, deviceLoc); cudaMemAdvise( - x + offset_x, sizeof(float) * totalThreadsPerGPU, cudaMemAdviseSetPreferredLocation, *deviceId); + x + offset_x, sizeof(float) * totalThreadsPerGPU, cudaMemAdviseSetPreferredLocation, deviceLoc); cudaMemAdvise( - p + offset_p, sizeof(float) * totalThreadsPerGPU, cudaMemAdviseSetPreferredLocation, *deviceId); + p + offset_p, sizeof(float) * totalThreadsPerGPU, cudaMemAdviseSetPreferredLocation, deviceLoc); cudaMemAdvise( - Ax + offset_Ax, sizeof(float) * totalThreadsPerGPU, cudaMemAdviseSetAccessedBy, *deviceId); - cudaMemAdvise(r + offset_r, sizeof(float) * totalThreadsPerGPU, cudaMemAdviseSetAccessedBy, *deviceId); - cudaMemAdvise(p + offset_p, sizeof(float) * totalThreadsPerGPU, cudaMemAdviseSetAccessedBy, *deviceId); - cudaMemAdvise(x + offset_x, sizeof(float) * totalThreadsPerGPU, cudaMemAdviseSetAccessedBy, *deviceId); + Ax + offset_Ax, sizeof(float) * totalThreadsPerGPU, cudaMemAdviseSetAccessedBy, deviceLoc); + 
cudaMemAdvise(r + offset_r, sizeof(float) * totalThreadsPerGPU, cudaMemAdviseSetAccessedBy, deviceLoc); + cudaMemAdvise(p + offset_p, sizeof(float) * totalThreadsPerGPU, cudaMemAdviseSetAccessedBy, deviceLoc); + cudaMemAdvise(x + offset_x, sizeof(float) * totalThreadsPerGPU, cudaMemAdviseSetAccessedBy, deviceLoc); offset_Ax += totalThreadsPerGPU * kNumGpusRequired; offset_r += totalThreadsPerGPU * kNumGpusRequired; @@ -739,8 +746,11 @@ int main(int argc, char **argv) deviceId++; } - checkCudaErrors(cudaMemPrefetchAsync(x, sizeof(float) * N, cudaCpuDeviceId)); - checkCudaErrors(cudaMemPrefetchAsync(dot_result, sizeof(double), cudaCpuDeviceId)); + // Prefetch back to host memory (cudaMemLocationTypeHost replaces cudaCpuDeviceId) + cudaMemLocation hostLoc = {}; + hostLoc.type = cudaMemLocationTypeHost; + checkCudaErrors(cudaMemPrefetchAsync(x, sizeof(float) * N, hostLoc, 0)); + checkCudaErrors(cudaMemPrefetchAsync(dot_result, sizeof(double), hostLoc, 0)); deviceId = bestFitDeviceIds.begin(); device_count = 0; diff --git a/Samples/4_CUDA_Libraries/cudaNvSci/CMakeLists.txt b/Samples/4_CUDA_Libraries/cudaNvSci/CMakeLists.txt index a169d00a..91b19005 100644 --- a/Samples/4_CUDA_Libraries/cudaNvSci/CMakeLists.txt +++ b/Samples/4_CUDA_Libraries/cudaNvSci/CMakeLists.txt @@ -8,7 +8,7 @@ find_package(CUDAToolkit REQUIRED) set(CMAKE_POSITION_INDEPENDENT_CODE ON) -set(CMAKE_CUDA_ARCHITECTURES 53 72 75 80 86 87 90) +set(CMAKE_CUDA_ARCHITECTURES 75 80 86 87 90) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") if(ENABLE_CUDA_DEBUG) diff --git a/Samples/4_CUDA_Libraries/jitLto/jitLto.cpp b/Samples/4_CUDA_Libraries/jitLto/jitLto.cpp index 0bf1d732..0a57dfee 100644 --- a/Samples/4_CUDA_Libraries/jitLto/jitLto.cpp +++ b/Samples/4_CUDA_Libraries/jitLto/jitLto.cpp @@ -146,13 +146,14 @@ int main(int argc, char *argv[]) getLTOIR(lto_saxpy, "lto_saxpy.cu", &ltoIR1, &ltoIR1Size); getLTOIR(lto_compute, "lto_compute.cu", &ltoIR2, &ltoIR2Size); - CUdevice cuDevice; - CUcontext context; - CUmodule
module; - CUfunction kernel; + CUdevice cuDevice; + CUcontext context; + CUmodule module; + CUfunction kernel; + CUctxCreateParams ctxCreateParams = {}; CUDA_SAFE_CALL(cuInit(0)); CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, 0)); - CUDA_SAFE_CALL(cuCtxCreate(&context, 0, cuDevice)); + CUDA_SAFE_CALL(cuCtxCreate(&context, &ctxCreateParams, 0, cuDevice)); // Dynamically determine the arch to link for int major = 0; diff --git a/Samples/4_CUDA_Libraries/simpleCUBLASXT/simpleCUBLASXT.cpp b/Samples/4_CUDA_Libraries/simpleCUBLASXT/simpleCUBLASXT.cpp index 44f85de9..b100f1ae 100644 --- a/Samples/4_CUDA_Libraries/simpleCUBLASXT/simpleCUBLASXT.cpp +++ b/Samples/4_CUDA_Libraries/simpleCUBLASXT/simpleCUBLASXT.cpp @@ -84,13 +84,17 @@ void findMultipleBestGPUs(int &num_of_devices, int *device_ids) cudaDeviceProp deviceProp; int devices_prohibited = 0; + int computeMode; + int clockRate; + while (current_device < device_count) { cudaGetDeviceProperties(&deviceProp, current_device); - + checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device)); + checkCudaErrors(cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, current_device)); // If this GPU is not running on Compute Mode prohibited, // then we can add it to the list int sm_per_multiproc; - if (deviceProp.computeMode != cudaComputeModeProhibited) { + if (computeMode != cudaComputeModeProhibited) { if (deviceProp.major == 9999 && deviceProp.minor == 9999) { sm_per_multiproc = 1; } @@ -99,7 +103,7 @@ void findMultipleBestGPUs(int &num_of_devices, int *device_ids) } gpu_stats[current_device].compute_perf = - (uint64_t)deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate; + (uint64_t)deviceProp.multiProcessorCount * sm_per_multiproc * clockRate; gpu_stats[current_device].device_id = current_device; } else { diff --git a/Samples/5_Domain_Specific/simpleVulkan/SineWaveSimulation.cu b/Samples/5_Domain_Specific/simpleVulkan/SineWaveSimulation.cu index 7c6a7737..032bf2f5 100644 --- 
a/Samples/5_Domain_Specific/simpleVulkan/SineWaveSimulation.cu +++ b/Samples/5_Domain_Specific/simpleVulkan/SineWaveSimulation.cu @@ -94,8 +94,10 @@ int SineWaveSimulation::initCuda(uint8_t *vkDeviceUUID, size_t UUID_SIZE) // Find the GPU which is selected by Vulkan while (current_device < device_count) { cudaGetDeviceProperties(&deviceProp, current_device); + int computeMode; + checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device)); - if ((deviceProp.computeMode != cudaComputeModeProhibited)) { + if ((computeMode != cudaComputeModeProhibited)) { // Compare the cuda device UUID with vulkan UUID int ret = memcmp((void *)&deviceProp.uuid, vkDeviceUUID, UUID_SIZE); if (ret == 0) { diff --git a/Samples/5_Domain_Specific/vulkanImageCUDA/vulkanImageCUDA.cu b/Samples/5_Domain_Specific/vulkanImageCUDA/vulkanImageCUDA.cu index f782bbb9..702d6ee3 100644 --- a/Samples/5_Domain_Specific/vulkanImageCUDA/vulkanImageCUDA.cu +++ b/Samples/5_Domain_Specific/vulkanImageCUDA/vulkanImageCUDA.cu @@ -830,6 +830,7 @@ private: int devices_prohibited = 0; cudaDeviceProp deviceProp; + int computeMode; checkCudaErrors(cudaGetDeviceCount(&device_count)); if (device_count == 0) { @@ -840,8 +841,8 @@ private: // Find the GPU which is selected by Vulkan while (current_device < device_count) { cudaGetDeviceProperties(&deviceProp, current_device); - - if ((deviceProp.computeMode != cudaComputeModeProhibited)) { + checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device)); + if ((computeMode != cudaComputeModeProhibited)) { // Compare the cuda device UUID with vulkan UUID int ret = memcmp(&deviceProp.uuid, &vkDeviceUUID, VK_UUID_SIZE); if (ret == 0) { diff --git a/Samples/6_Performance/UnifiedMemoryPerf/matrixMultiplyPerf.cu b/Samples/6_Performance/UnifiedMemoryPerf/matrixMultiplyPerf.cu index b260fbcc..59533cfd 100644 --- a/Samples/6_Performance/UnifiedMemoryPerf/matrixMultiplyPerf.cu +++ 
b/Samples/6_Performance/UnifiedMemoryPerf/matrixMultiplyPerf.cu @@ -335,9 +335,11 @@ void runMatrixMultiplyKernel(unsigned int matrixDim, checkCudaErrors(cudaMallocManaged(&dptrA, size)); checkCudaErrors(cudaMallocManaged(&dptrB, size)); checkCudaErrors(cudaMallocManaged(&dptrC, size)); - checkCudaErrors(cudaMemPrefetchAsync(dptrA, size, cudaCpuDeviceId)); - checkCudaErrors(cudaMemPrefetchAsync(dptrB, size, cudaCpuDeviceId)); - checkCudaErrors(cudaMemPrefetchAsync(dptrC, size, cudaCpuDeviceId)); + cudaMemLocation hostLoc = {}; + hostLoc.type = cudaMemLocationTypeHost; + checkCudaErrors(cudaMemPrefetchAsync(dptrA, size, hostLoc, 0)); + checkCudaErrors(cudaMemPrefetchAsync(dptrB, size, hostLoc, 0)); + checkCudaErrors(cudaMemPrefetchAsync(dptrC, size, hostLoc, 0)); } else { checkCudaErrors(cudaMallocManaged(&dptrA, size, cudaMemAttachHost)); @@ -402,9 +404,12 @@ void runMatrixMultiplyKernel(unsigned int matrixDim, } if (hintsRequired) { if (deviceProp.concurrentManagedAccess) { - checkCudaErrors(cudaMemPrefetchAsync(dptrA, size, device_id, streamToRunOn)); - checkCudaErrors(cudaMemPrefetchAsync(dptrB, size, device_id, streamToRunOn)); - checkCudaErrors(cudaMemPrefetchAsync(dptrC, size, device_id, streamToRunOn)); + cudaMemLocation deviceLoc; + deviceLoc.type = cudaMemLocationTypeDevice; + deviceLoc.id = device_id; + checkCudaErrors(cudaMemPrefetchAsync(dptrA, size, deviceLoc, 0, streamToRunOn)); + checkCudaErrors(cudaMemPrefetchAsync(dptrB, size, deviceLoc, 0, streamToRunOn)); + checkCudaErrors(cudaMemPrefetchAsync(dptrC, size, deviceLoc, 0, streamToRunOn)); } else { checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrA, 0, cudaMemAttachGlobal)); @@ -437,9 +442,11 @@ void runMatrixMultiplyKernel(unsigned int matrixDim, sdkStartTimer(&gpuTransferCallsTimer); if (hintsRequired) { if (deviceProp.concurrentManagedAccess) { - checkCudaErrors(cudaMemPrefetchAsync(dptrA, size, cudaCpuDeviceId)); - checkCudaErrors(cudaMemPrefetchAsync(dptrB, size, cudaCpuDeviceId)); -
checkCudaErrors(cudaMemPrefetchAsync(dptrC, size, cudaCpuDeviceId)); + cudaMemLocation hostLoc = {}; + hostLoc.type = cudaMemLocationTypeHost; + checkCudaErrors(cudaMemPrefetchAsync(dptrA, size, hostLoc, 0)); + checkCudaErrors(cudaMemPrefetchAsync(dptrB, size, hostLoc, 0)); + checkCudaErrors(cudaMemPrefetchAsync(dptrC, size, hostLoc, 0)); } else { checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrA, 0, cudaMemAttachHost)); diff --git a/Samples/7_libNVVM/device-side-launch/dsl.c b/Samples/7_libNVVM/device-side-launch/dsl.c index cc493743..17cfe835 100644 --- a/Samples/7_libNVVM/device-side-launch/dsl.c +++ b/Samples/7_libNVVM/device-side-launch/dsl.c @@ -195,7 +195,7 @@ static CUresult buildKernel(CUcontext *phContext, CUdevice *phDevice, CUmodule * // Initialize CUDA and obtain the device's compute capability. int major = 0, minor = 0; *phDevice = cudaDeviceInit(&major, &minor); - checkCudaErrors(cuCtxCreate(phContext, 0, *phDevice)); + checkCudaErrors(cuCtxCreate(phContext, NULL, 0, *phDevice)); // Get the NVVM IR from file. size_t size = 0; diff --git a/Samples/7_libNVVM/simple/simple.c b/Samples/7_libNVVM/simple/simple.c index 94035e30..58c31d5b 100644 --- a/Samples/7_libNVVM/simple/simple.c +++ b/Samples/7_libNVVM/simple/simple.c @@ -89,7 +89,7 @@ initCUDA(CUcontext *phContext, CUdevice *phDevice, CUmodule *phModule, CUfunctio assert(phContext && phDevice && phModule && phKernel && ptx); // Create a CUDA context on the device. - checkCudaErrors(cuCtxCreate(phContext, 0, *phDevice)); + checkCudaErrors(cuCtxCreate(phContext, NULL, 0, *phDevice)); // Load the PTX.
checkCudaErrors(cuModuleLoadDataEx(phModule, ptx, 0, 0, 0)); diff --git a/Samples/7_libNVVM/uvmlite/uvmlite.c b/Samples/7_libNVVM/uvmlite/uvmlite.c index 9cfead84..f977e092 100644 --- a/Samples/7_libNVVM/uvmlite/uvmlite.c +++ b/Samples/7_libNVVM/uvmlite/uvmlite.c @@ -206,7 +206,7 @@ static CUresult buildKernel(CUcontext *phContext, CUdevice *phDevice, CUmodule * *phDevice = cudaDeviceInit(&major, &minor); // Create a context on the device. - checkCudaErrors(cuCtxCreate(phContext, 0, *phDevice)); + checkCudaErrors(cuCtxCreate(phContext, NULL, 0, *phDevice)); // Get the NVVM IR from file. size_t size = 0;