/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/******************************************************************************
 *
 * Module: threadMigration.cpp
 *
 * Description:
 *   Simple sample demonstrating multi-GPU/multithread functionality using
 *   the CUDA Context Management API. This API allows a CUDA context to be
 *   associated with a CPU process. A host thread may have only one device
 *   context current at a time.
 *
 *   Refer to the CUDA programming guide 4.5.3.3 on Context Management
 *
 ******************************************************************************/

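// Overview (comment added for clarity): InitCUDAContext() creates one CUDA
// context per device and loads "kernelFunction" from the fatbin; ThreadProc()
// then runs in several host threads that share a context by attaching it with
// cuCtxPushCurrent and detaching it with cuCtxPopCurrent. Each thread verifies
// that element i of the kernel's output buffer equals 32 - i.
// Typical invocation (assuming the usual cuda-samples flag syntax):
//   ./threadMigration -numthreads=4
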
#define MAXTHREADS 256
#define NUM_INTS 32

#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
// Windows threads use different data structures
#include <windows.h>
DWORD rgdwThreadIds[MAXTHREADS];
HANDLE rghThreads[MAXTHREADS];
CRITICAL_SECTION g_cs;

#define ENTERCRITICALSECTION EnterCriticalSection(&g_cs);
#define LEAVECRITICALSECTION LeaveCriticalSection(&g_cs);
#define STRICMP stricmp
#else

// Includes POSIX thread headers for Linux thread support
#include <pthread.h>
#include <stdint.h>
pthread_t rghThreads[MAXTHREADS];
pthread_mutex_t g_mutex;

#define ENTERCRITICALSECTION pthread_mutex_lock(&g_mutex);
#define LEAVECRITICALSECTION pthread_mutex_unlock(&g_mutex);
#define STRICMP strcasecmp
#endif

#include <stdlib.h>
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <helper_cuda_drvapi.h>

#include <iostream>
#include <sstream>  // for the std::ostringstream that holds the fatbin image
#include <cstring>

using namespace std;

int NumThreads;
int ThreadLaunchCount;

typedef struct _CUDAContext_st {
  CUcontext hcuContext;
  CUmodule hcuModule;
  CUfunction hcuFunction;
  CUdeviceptr dptr;
  int deviceID;
  int threadNum;
} CUDAContext;

CUDAContext g_ThreadParams[MAXTHREADS];

// define input fatbin file
#ifndef FATBIN_FILE
#define FATBIN_FILE "threadMigration_kernel64.fatbin"
#endif

bool gbAutoQuit = false;

////////////////////////////////////////////////////////////////////////////////
// declaration, forward
bool runTest(int argc, char **argv);

#define CLEANUP_ON_ERROR(dptr, hcuModule, hcuContext, status) \
  if (dptr) cuMemFree(dptr);                                  \
  if (hcuModule) cuModuleUnload(hcuModule);                   \
  if (hcuContext) cuCtxDestroy(hcuContext);                   \
  return status;

#define THREAD_QUIT \
  printf("Error\n"); \
  return 0;

// This sample uses the Driver API interface. The CUDA context needs to be
// set up, and the CUDA module (FATBIN) is built by NVCC.
static CUresult InitCUDAContext(CUDAContext *pContext, CUdevice hcuDevice,
                                int deviceID, char **argv) {
  CUcontext hcuContext = 0;
  CUmodule hcuModule = 0;
  CUfunction hcuFunction = 0;
  CUdeviceptr dptr = 0;

  // cuCtxCreate: Function works on floating contexts and current context
  CUresult status = cuCtxCreate(&hcuContext, 0, hcuDevice);

  if (CUDA_SUCCESS != status) {
    fprintf(stderr, "cuCtxCreate for <deviceID=%d> failed %d\n", deviceID,
            status);
    CLEANUP_ON_ERROR(dptr, hcuModule, hcuContext, status);
  }

  status = CUDA_ERROR_INVALID_IMAGE;

  string module_path, ptx_source;
  std::ostringstream fatbin;

  if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
    exit(EXIT_FAILURE);
  } else {
    printf("> initCUDA loading module: <%s>\n", module_path.c_str());
  }

  if (!fatbin.str().size()) {
    printf("fatbin file empty. exiting..\n");
    exit(EXIT_FAILURE);
  }

  // Create module from binary file (FATBIN)
  checkCudaErrors(cuModuleLoadData(&hcuModule, fatbin.str().c_str()));

  status = cuModuleGetFunction(&hcuFunction, hcuModule, "kernelFunction");

  if (CUDA_SUCCESS != status) {
    fprintf(stderr, "cuModuleGetFunction failed %d\n", status);
    CLEANUP_ON_ERROR(dptr, hcuModule, hcuContext, status);
  }

  // Here we must release the CUDA context from the thread context: cuCtxCreate
  // made the new context current to this thread, and popping it leaves the
  // context floating so each worker thread can attach it with cuCtxPushCurrent.
  status = cuCtxPopCurrent(NULL);

  if (CUDA_SUCCESS != status) {
    fprintf(stderr, "cuCtxPopCurrent failed %d\n", status);
    CLEANUP_ON_ERROR(dptr, hcuModule, hcuContext, status);
  }

  pContext->hcuContext = hcuContext;
  pContext->hcuModule = hcuModule;
  pContext->hcuFunction = hcuFunction;
  pContext->deviceID = deviceID;

  return CUDA_SUCCESS;
}

// ThreadProc launches the CUDA kernel on a CUDA context.
// We have more than one thread that talks to a CUDA context
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
DWORD WINAPI ThreadProc(CUDAContext *pParams)
#else
void *ThreadProc(CUDAContext *pParams)
#endif
{
  int wrong = 0;
  int *pInt = 0;

  printf("<CUDA Device=%d, Context=%p, Thread=%d> - ThreadProc() Launched...\n",
         pParams->deviceID, pParams->hcuContext, pParams->threadNum);

  // cuCtxPushCurrent: Attach the caller CUDA context to the thread context.
  CUresult status = cuCtxPushCurrent(pParams->hcuContext);

  if (CUDA_SUCCESS != status) {
    THREAD_QUIT;
  }
  checkCudaErrors(cuMemAlloc(&pParams->dptr, NUM_INTS * sizeof(int)));

  // There are two ways to launch CUDA kernels via the Driver API.
  // In this CUDA Sample, we illustrate both ways to pass and specify
  // kernel parameters. By default we use the simpler method.

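  // Note (added): both launch paths below use one block of 32 threads
  // (gridDim = 1, blockDim = 32), matching NUM_INTS, so each kernel thread
  // presumably produces one of the NUM_INTS output integers.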
  if (1) {
    // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
    // Launching (simpler method)
    void *args[5] = {&pParams->dptr};

    // new CUDA 4.0 Driver API Kernel launch call
    status = cuLaunchKernel(pParams->hcuFunction, 1, 1, 1, 32, 1, 1, 0, NULL,
                            args, NULL);

    if (CUDA_SUCCESS != status) {
      fprintf(stderr, "cuLaunchKernel failed %d\n", status);
      THREAD_QUIT;
    }
  } else {
    // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
    // Launching (advanced method)
    int offset = 0;
    char argBuffer[256];

    // Pack the launch parameters into argBuffer; the CUdeviceptr value itself
    // is copied into the buffer (it is not dereferenced here).
    *((CUdeviceptr *)&argBuffer[offset]) = pParams->dptr;
    offset += sizeof(CUdeviceptr);

    void *kernel_launch_config[5] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
                                     CU_LAUNCH_PARAM_BUFFER_SIZE, &offset,
                                     CU_LAUNCH_PARAM_END};

    // new CUDA 4.0 Driver API Kernel launch call
    status = cuLaunchKernel(pParams->hcuFunction, 1, 1, 1, 32, 1, 1, 0, 0, NULL,
                            (void **)&kernel_launch_config);

    if (CUDA_SUCCESS != status) {
      fprintf(stderr, "cuLaunchKernel failed %d\n", status);
      THREAD_QUIT;
    }
  }

  pInt = (int *)malloc(NUM_INTS * sizeof(int));

  if (!pInt) return 0;

  if (CUDA_SUCCESS ==
      cuMemcpyDtoH(pInt, pParams->dptr, NUM_INTS * sizeof(int))) {
    for (int i = 0; i < NUM_INTS; i++) {
      if (pInt[i] != 32 - i) {
        printf("<CUDA Device=%d, Context=%p, Thread=%d> error [%d]=%d!\n",
               pParams->deviceID, pParams->hcuContext, pParams->threadNum, i,
               pInt[i]);
        wrong++;
      }
    }

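    // ThreadLaunchCount is shared by all worker threads, so the update is
    // guarded by the critical section / mutex defined above.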
    ENTERCRITICALSECTION

    if (!wrong) ThreadLaunchCount += 1;

    LEAVECRITICALSECTION
  }

  free(pInt);
  fflush(stdout);
  checkCudaErrors(cuMemFree(pParams->dptr));

  // cuCtxPopCurrent: Detach the current CUDA context from the calling thread.
  checkCudaErrors(cuCtxPopCurrent(NULL));

  printf("<CUDA Device=%d, Context=%p, Thread=%d> - ThreadProc() Finished!\n\n",
         pParams->deviceID, pParams->hcuContext, pParams->threadNum);

  return 0;
}

bool FinalErrorCheck(CUDAContext *pContext, int NumThreads, int deviceCount) {
  if (ThreadLaunchCount != NumThreads * deviceCount) {
    printf("<Expected=%d, Actual=%d> ThreadLaunchCount(s)\n",
           NumThreads * deviceCount, ThreadLaunchCount);
    return false;
  } else {
    for (int iDevice = 0; iDevice < deviceCount; iDevice++) {
      // cuCtxDestroy called on current context or a floating context
      if (CUDA_SUCCESS != cuCtxDestroy(pContext[iDevice].hcuContext))
        return false;
    }

    return true;
  }
}

int main(int argc, char **argv) {
  printf("Starting threadMigration\n");

  bool bTestResult = runTest(argc, argv);

  exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

bool runTest(int argc, char **argv) {
  printf("[ threadMigration ] API test...\n");
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
  InitializeCriticalSection(&g_cs);
#else
  pthread_mutex_init(&g_mutex, NULL);
#endif
  // By default, we will launch 2 CUDA threads for each device
  NumThreads = 2;

  if (argc > 1) {
    // If we are doing the QAtest or automated testing, we quit without
    // prompting
    if (checkCmdLineFlag(argc, (const char **)argv, "qatest") ||
        checkCmdLineFlag(argc, (const char **)argv, "noprompt")) {
      gbAutoQuit = true;
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "numthreads")) {
      NumThreads =
          getCmdLineArgumentInt(argc, (const char **)argv, "numthreads");

      if (NumThreads < 1 || NumThreads > 15) {
        printf(
            "Usage: \"threadMigration -numthreads=<threads>\", <threads> "
            "ranges 1-15\n");
        // Report failure on an invalid thread count (this function returns
        // bool, so returning 1 here would have signalled success).
        return false;
      }
    }
  }

  int deviceCount;
  int hcuDevice = 0;
  CUresult status;
  status = cuInit(0);

  if (CUDA_SUCCESS != status) return false;

  status = cuDeviceGetCount(&deviceCount);

  if (CUDA_SUCCESS != status) return false;

  printf("> %d CUDA device(s), %d Thread(s)/device to launch\n\n", deviceCount,
         NumThreads);

  if (deviceCount == 0) {
    return false;
  }

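  // One CUDA context is created per device below; NumThreads host threads are
  // then launched against each context (see the inner loop over iThread).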
  int ihThread = 0;
  int ThreadIndex = 0;

  CUDAContext *pContext =
      (CUDAContext *)malloc(sizeof(CUDAContext) * deviceCount);

  for (int iDevice = 0; iDevice < deviceCount; iDevice++) {
    char szName[256];
    status = cuDeviceGet(&hcuDevice, iDevice);

    if (CUDA_SUCCESS != status) return false;

    status = cuDeviceGetName(szName, 256, hcuDevice);

    if (CUDA_SUCCESS != status) return false;

    {
      int major = 0, minor = 0;
      checkCudaErrors(cuDeviceGetAttribute(
          &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hcuDevice));
      checkCudaErrors(cuDeviceGetAttribute(
          &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hcuDevice));
      int sharedMemPerBlock;
      checkCudaErrors(cuDeviceGetAttribute(
          &sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
          hcuDevice));
      int totalConstantMemory;
      checkCudaErrors(cuDeviceGetAttribute(
          &totalConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY,
          hcuDevice));
      int regsPerBlock;
      checkCudaErrors(cuDeviceGetAttribute(
          &regsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,
          hcuDevice));
      int clockRate;
      checkCudaErrors(cuDeviceGetAttribute(
          &clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, hcuDevice));
      printf("Device %d: \"%s\" (Compute %d.%d)\n", iDevice, szName, major,
             minor);
      printf("\tsharedMemPerBlock: %d\n", sharedMemPerBlock);
      printf("\tconstantMemory   : %d\n", totalConstantMemory);
      printf("\tregsPerBlock     : %d\n", regsPerBlock);
      printf("\tclockRate        : %d\n", clockRate);
      printf("\n");
    }

    if (CUDA_SUCCESS !=
        InitCUDAContext(&pContext[iDevice], hcuDevice, iDevice, argv)) {
      return FinalErrorCheck(pContext, NumThreads, deviceCount);
    } else {
      for (int iThread = 0; iThread < NumThreads; iThread++, ihThread++) {
        g_ThreadParams[ThreadIndex].hcuContext = pContext[iDevice].hcuContext;
        g_ThreadParams[ThreadIndex].hcuModule = pContext[iDevice].hcuModule;
        g_ThreadParams[ThreadIndex].hcuFunction = pContext[iDevice].hcuFunction;
        g_ThreadParams[ThreadIndex].deviceID = pContext[iDevice].deviceID;
        g_ThreadParams[ThreadIndex].threadNum = iThread;
        // Launch NumThreads worker threads for each CUDA context
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
        rghThreads[ThreadIndex] = CreateThread(
            NULL, 0, (LPTHREAD_START_ROUTINE)ThreadProc,
            &g_ThreadParams[ThreadIndex], 0, &rgdwThreadIds[ThreadIndex]);
#else  // Assume we are running linux
        pthread_create(&rghThreads[ThreadIndex], NULL,
                       (void *(*)(void *))ThreadProc,
                       &g_ThreadParams[ThreadIndex]);
#endif
        ThreadIndex += 1;
      }
    }
  }

  // Wait until all workers are done
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
  WaitForMultipleObjects(ThreadIndex, rghThreads, TRUE, INFINITE);
#else

  for (int i = 0; i < ThreadIndex; i++) {
    pthread_join(rghThreads[i], NULL);
  }

#endif

  bool ret_status = FinalErrorCheck(pContext, NumThreads, deviceCount);
  free(pContext);
  return ret_status;
}