cuda-samples/Samples/threadMigration/threadMigration.cpp

/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/******************************************************************************
*
*   Module: threadMigration.cpp
*
*   Description:
*     Simple sample demonstrating multi-GPU/multithread functionality using
*     the CUDA Context Management API.  This API allows the a CUDA context to
*     be associated with a CPU process. A host thread may have only one device
*     context current at a time.
*
*    Refer to the CUDA programming guide 4.5.3.3 on Context Management
*
******************************************************************************/

#define MAXTHREADS 256
#define NUM_INTS 32

#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
// Windows threads use different data structures
#include <windows.h>
DWORD rgdwThreadIds[MAXTHREADS];
HANDLE rghThreads[MAXTHREADS];
CRITICAL_SECTION g_cs;

#define ENTERCRITICALSECTION EnterCriticalSection(&g_cs);
#define LEAVECRITICALSECTION LeaveCriticalSection(&g_cs);
#define STRICMP stricmp
#else

// Includes POSIX thread headers for Linux thread support
#include <pthread.h>
#include <stdint.h>
pthread_t rghThreads[MAXTHREADS];
pthread_mutex_t g_mutex;

#define ENTERCRITICALSECTION pthread_mutex_lock(&g_mutex);
#define LEAVECRITICALSECTION pthread_mutex_unlock(&g_mutex);
#define STRICMP strcasecmp
#endif

#include <stdlib.h>
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <helper_cuda_drvapi.h>

#include <iostream>
#include <cstring>

using namespace std;

int NumThreads;
int ThreadLaunchCount;

typedef struct _CUDAContext_st {
  CUcontext hcuContext;
  CUmodule hcuModule;
  CUfunction hcuFunction;
  CUdeviceptr dptr;
  int deviceID;
  int threadNum;
} CUDAContext;

CUDAContext g_ThreadParams[MAXTHREADS];

// define input fatbin file
#ifndef FATBIN_FILE
#define FATBIN_FILE "threadMigration_kernel64.fatbin"
#endif

bool gbAutoQuit = false;

////////////////////////////////////////////////////////////////////////////////
// declaration, forward
bool runTest(int argc, char **argv);

#define CLEANUP_ON_ERROR(dptr, hcuModule, hcuContext, status) \
  if (dptr) cuMemFree(dptr);                                  \
  if (hcuModule) cuModuleUnload(hcuModule);                   \
  if (hcuContext) cuCtxDestroy(hcuContext);                   \
  return status;

#define THREAD_QUIT  \
  printf("Error\n"); \
  return 0;

// This sample uses the Driver API interface.  The CUDA context needs
// to be setup and the CUDA module (CUBIN) is built by NVCC
static CUresult InitCUDAContext(CUDAContext *pContext, CUdevice hcuDevice,
                                int deviceID, char **argv) {
  CUcontext hcuContext = 0;
  CUmodule hcuModule = 0;
  CUfunction hcuFunction = 0;
  CUdeviceptr dptr = 0;

  // cuCtxCreate: Function works on floating contexts and current context
  CUresult status = cuCtxCreate(&hcuContext, 0, hcuDevice);

  if (CUDA_SUCCESS != status) {
    fprintf(stderr, "cuCtxCreate for <deviceID=%d> failed %d\n", deviceID,
            status);
    CLEANUP_ON_ERROR(dptr, hcuModule, hcuContext, status);
  }

  status = CUDA_ERROR_INVALID_IMAGE;

  string module_path, ptx_source;
  std::ostringstream fatbin;

  if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
    exit(EXIT_FAILURE);
  } else {
    printf("> initCUDA loading module: <%s>\n", module_path.c_str());
  }

  if (!fatbin.str().size()) {
    printf("fatbin file empty. exiting..\n");
    exit(EXIT_FAILURE);
  }

  // Create module from binary file (FATBIN)
  checkCudaErrors(cuModuleLoadData(&hcuModule, fatbin.str().c_str()));

  status = cuModuleGetFunction(&hcuFunction, hcuModule, "kernelFunction");

  if (CUDA_SUCCESS != status) {
    fprintf(stderr, "cuModuleGetFunction failed %d\n", status);
    CLEANUP_ON_ERROR(dptr, hcuModule, hcuContext, status);
  }

  // Here we must release the CUDA context from the thread context
  status = cuCtxPopCurrent(NULL);

  if (CUDA_SUCCESS != status) {
    fprintf(stderr, "cuCtxPopCurrent failed %d\n", status);
    CLEANUP_ON_ERROR(dptr, hcuModule, hcuContext, status);
  }

  pContext->hcuContext = hcuContext;
  pContext->hcuModule = hcuModule;
  pContext->hcuFunction = hcuFunction;
  pContext->deviceID = deviceID;

  return CUDA_SUCCESS;
}

// ThreadProc launches the CUDA kernel on a CUDA context.
// We have more than one thread that talks to a CUDA context
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
DWORD WINAPI ThreadProc(CUDAContext *pParams)
#else
void *ThreadProc(CUDAContext *pParams)
#endif
{
  int wrong = 0;
  int *pInt = 0;

  printf("<CUDA Device=%d, Context=%p, Thread=%d> - ThreadProc() Launched...\n",
         pParams->deviceID, pParams->hcuContext, pParams->threadNum);

  // cuCtxPushCurrent: Attach the caller CUDA context to the thread context.
  CUresult status = cuCtxPushCurrent(pParams->hcuContext);

  if (CUDA_SUCCESS != status) {
    THREAD_QUIT;
  }
  checkCudaErrors(cuMemAlloc(&pParams->dptr, NUM_INTS * sizeof(int)));

  // There are two ways to launch CUDA kernels via the Driver API.
  // In this CUDA Sample, we illustrate both ways to pass parameters
  // and specify parameters.  By default we use the simpler method.

  if (1) {
    // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
    // Launching (simpler method)
    void *args[5] = {&pParams->dptr};

    // new CUDA 4.0 Driver API Kernel launch call
    status = cuLaunchKernel(pParams->hcuFunction, 1, 1, 1, 32, 1, 1, 0, NULL,
                            args, NULL);

    if (CUDA_SUCCESS != status) {
      fprintf(stderr, "cuLaunch failed %d\n", status);
      THREAD_QUIT;
    }
  } else {
    // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
    // Launching (advanced method)
    int offset = 0;
    char argBuffer[256];

    // pass in launch parameters (not actually de-referencing CUdeviceptr).
    // CUdeviceptr is storing the value of the parameters
    *((CUdeviceptr *)&argBuffer[offset]) = pParams->dptr;
    offset += sizeof(CUdeviceptr);

    void *kernel_launch_config[5] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
                                     CU_LAUNCH_PARAM_BUFFER_SIZE, &offset,
                                     CU_LAUNCH_PARAM_END};

    // new CUDA 4.0 Driver API Kernel launch call
    status = cuLaunchKernel(pParams->hcuFunction, 1, 1, 1, 32, 1, 1, 0, 0, NULL,
                            (void **)&kernel_launch_config);

    if (CUDA_SUCCESS != status) {
      fprintf(stderr, "cuLaunch failed %d\n", status);
      THREAD_QUIT;
    }
  }

  pInt = (int *)malloc(NUM_INTS * sizeof(int));

  if (!pInt) return 0;

  if (CUDA_SUCCESS ==
      cuMemcpyDtoH(pInt, pParams->dptr, NUM_INTS * sizeof(int))) {
    for (int i = 0; i < NUM_INTS; i++) {
      if (pInt[i] != 32 - i) {
        printf("<CUDA Device=%d, Context=%p, Thread=%d> error [%d]=%d!\n",
               pParams->deviceID, pParams->hcuContext, pParams->threadNum, i,
               pInt[i]);
        wrong++;
      }
    }

    ENTERCRITICALSECTION

    if (!wrong) ThreadLaunchCount += 1;

    LEAVECRITICALSECTION
  }

  free(pInt);
  fflush(stdout);
  checkCudaErrors(cuMemFree(pParams->dptr));

  // cuCtxPopCurrent: Detach the current CUDA context from the calling thread.
  checkCudaErrors(cuCtxPopCurrent(NULL));

  printf("<CUDA Device=%d, Context=%p, Thread=%d> - ThreadProc() Finished!\n\n",
         pParams->deviceID, pParams->hcuContext, pParams->threadNum);

  return 0;
}

bool FinalErrorCheck(CUDAContext *pContext, int NumThreads, int deviceCount) {
  if (ThreadLaunchCount != NumThreads * deviceCount) {
    printf("<Expected=%d, Actual=%d> ThreadLaunchCounts(s)\n",
           NumThreads * deviceCount, ThreadLaunchCount);
    return false;
  } else {
    for (int iDevice = 0; iDevice < deviceCount; iDevice++) {
      // cuCtxDestroy called on current context or a floating context
      if (CUDA_SUCCESS != cuCtxDestroy(pContext[iDevice].hcuContext))
        return false;
    }

    return true;
  }
}

int main(int argc, char **argv) {
  printf("Starting threadMigration\n");

  bool bTestResult = runTest(argc, argv);

  exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

bool runTest(int argc, char **argv) {
  printf("[ threadMigration ] API test...\n");
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
  InitializeCriticalSection(&g_cs);
#else
  pthread_mutex_init(&g_mutex, NULL);
#endif
  // By default, we will launch 2 CUDA threads for each device
  NumThreads = 2;

  if (argc > 1) {
    // If we are doing the QAtest or automated testing, we quit without
    // prompting
    if (checkCmdLineFlag(argc, (const char **)argv, "qatest") ||
        checkCmdLineFlag(argc, (const char **)argv, "noprompt")) {
      gbAutoQuit = true;
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "numthreads")) {
      NumThreads =
          getCmdLineArgumentInt(argc, (const char **)argv, "numthreads");

      if (NumThreads < 1 || NumThreads > 15) {
        printf(
            "Usage: \"threadMigration -n=<threads>\", <threads> ranges 1-15\n");
        return 1;
      }
    }
  }

  int deviceCount;
  int hcuDevice = 0;
  CUresult status;
  status = cuInit(0);

  if (CUDA_SUCCESS != status) return false;

  status = cuDeviceGetCount(&deviceCount);

  if (CUDA_SUCCESS != status) return false;

  printf("> %d CUDA device(s), %d Thread(s)/device to launched\n\n",
         deviceCount, NumThreads);

  if (deviceCount == 0) {
    return false;
  }

  int ihThread = 0;
  int ThreadIndex = 0;

  CUDAContext *pContext =
      (CUDAContext *)malloc(sizeof(CUDAContext) * deviceCount);

  for (int iDevice = 0; iDevice < deviceCount; iDevice++) {
    char szName[256];
    status = cuDeviceGet(&hcuDevice, iDevice);

    if (CUDA_SUCCESS != status) return false;

    status = cuDeviceGetName(szName, 256, hcuDevice);

    if (CUDA_SUCCESS != status) return false;

    {
      int major = 0, minor = 0;
      checkCudaErrors(cuDeviceGetAttribute(
          &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hcuDevice));
      checkCudaErrors(cuDeviceGetAttribute(
          &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hcuDevice));
      int sharedMemPerBlock;
      checkCudaErrors(cuDeviceGetAttribute(
          &sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
          hcuDevice));
      int totalConstantMemory;
      checkCudaErrors(cuDeviceGetAttribute(
          &totalConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY,
          hcuDevice));
      int regsPerBlock;
      checkCudaErrors(cuDeviceGetAttribute(
          &regsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,
          hcuDevice));
      int clockRate;
      checkCudaErrors(cuDeviceGetAttribute(
          &clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, hcuDevice));
      printf("Device %d: \"%s\" (Compute %d.%d)\n", iDevice, szName, major,
             minor);
      printf("\tsharedMemPerBlock: %d\n", sharedMemPerBlock);
      printf("\tconstantMemory   : %d\n", totalConstantMemory);
      printf("\tregsPerBlock     : %d\n", regsPerBlock);
      printf("\tclockRate        : %d\n", clockRate);
      printf("\n");
    }

    if (CUDA_SUCCESS !=
        InitCUDAContext(&pContext[iDevice], hcuDevice, iDevice, argv)) {
      return FinalErrorCheck(pContext, NumThreads, deviceCount);
    } else {
      for (int iThread = 0; iThread < NumThreads; iThread++, ihThread++) {
        g_ThreadParams[ThreadIndex].hcuContext = pContext[iDevice].hcuContext;
        g_ThreadParams[ThreadIndex].hcuModule = pContext[iDevice].hcuModule;
        g_ThreadParams[ThreadIndex].hcuFunction = pContext[iDevice].hcuFunction;
        g_ThreadParams[ThreadIndex].deviceID = pContext[iDevice].deviceID;
        g_ThreadParams[ThreadIndex].threadNum = iThread;
        // Launch (NumThreads) for each CUDA context
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
        rghThreads[ThreadIndex] = CreateThread(
            NULL, 0, (LPTHREAD_START_ROUTINE)ThreadProc,
            &g_ThreadParams[ThreadIndex], 0, &rgdwThreadIds[ThreadIndex]);
#else  // Assume we are running linux
        pthread_create(&rghThreads[ThreadIndex], NULL,
                       (void *(*)(void *))ThreadProc,
                       &g_ThreadParams[ThreadIndex]);
#endif
        ThreadIndex += 1;
      }
    }
  }

  // Wait until all workers are done
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
  WaitForMultipleObjects(ThreadIndex, rghThreads, TRUE, INFINITE);
#else

  for (int i = 0; i < ThreadIndex; i++) {
    pthread_join(rghThreads[i], NULL);
  }

#endif

  bool ret_status = FinalErrorCheck(pContext, NumThreads, deviceCount);
  free(pContext);
  return ret_status;
}
add and update samples for CUDA 11.5 2021-10-21 19:04:49 +08:00			`/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.`
			`*`
			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions`
			`* are met:`
			`* * Redistributions of source code must retain the above copyright`
			`* notice, this list of conditions and the following disclaimer.`
			`* * Redistributions in binary form must reproduce the above copyright`
			`* notice, this list of conditions and the following disclaimer in the`
			`* documentation and/or other materials provided with the distribution.`
			`* * Neither the name of NVIDIA CORPORATION nor the names of its`
			`* contributors may be used to endorse or promote products derived`
			`* from this software without specific prior written permission.`
			`*`
			* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
			`* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR`
			`* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR`
			`* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,`
			`* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,`
			`* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR`
			`* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY`
			`* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`*/`

			`/******************************************************************************`
			`*`
			`* Module: threadMigration.cpp`
			`*`
			`* Description:`
			`* Simple sample demonstrating multi-GPU/multithread functionality using`
			`* the CUDA Context Management API. This API allows the a CUDA context to`
			`* be associated with a CPU process. A host thread may have only one device`
			`* context current at a time.`
			`*`
			`* Refer to the CUDA programming guide 4.5.3.3 on Context Management`
			`*`
			`******************************************************************************/`

			`#define MAXTHREADS 256`
			`#define NUM_INTS 32`

			`#if defined(WIN32) \|\| defined(_WIN32) \|\| defined(WIN64) \|\| defined(_WIN64)`
			`// Windows threads use different data structures`
			`#include <windows.h>`
			`DWORD rgdwThreadIds[MAXTHREADS];`
			`HANDLE rghThreads[MAXTHREADS];`
			`CRITICAL_SECTION g_cs;`

			`#define ENTERCRITICALSECTION EnterCriticalSection(&g_cs);`
			`#define LEAVECRITICALSECTION LeaveCriticalSection(&g_cs);`
			`#define STRICMP stricmp`
			`#else`

			`// Includes POSIX thread headers for Linux thread support`
			`#include <pthread.h>`
			`#include <stdint.h>`
			`pthread_t rghThreads[MAXTHREADS];`
			`pthread_mutex_t g_mutex;`

			`#define ENTERCRITICALSECTION pthread_mutex_lock(&g_mutex);`
			`#define LEAVECRITICALSECTION pthread_mutex_unlock(&g_mutex);`
			`#define STRICMP strcasecmp`
			`#endif`

			`#include <stdlib.h>`
			`#include <stdio.h>`
			`#include <cuda.h>`
			`#include <cuda_runtime_api.h>`
			`#include <helper_cuda_drvapi.h>`

			`#include <iostream>`
			`#include <cstring>`

			`using namespace std;`

			`int NumThreads;`
			`int ThreadLaunchCount;`

			`typedef struct _CUDAContext_st {`
			`CUcontext hcuContext;`
			`CUmodule hcuModule;`
			`CUfunction hcuFunction;`
			`CUdeviceptr dptr;`
			`int deviceID;`
			`int threadNum;`
			`} CUDAContext;`

			`CUDAContext g_ThreadParams[MAXTHREADS];`

			`// define input fatbin file`
			`#ifndef FATBIN_FILE`
			`#define FATBIN_FILE "threadMigration_kernel64.fatbin"`
			`#endif`

			`bool gbAutoQuit = false;`

			`////////////////////////////////////////////////////////////////////////////////`
			`// declaration, forward`
			`bool runTest(int argc, char **argv);`

			`#define CLEANUP_ON_ERROR(dptr, hcuModule, hcuContext, status) \`
			`if (dptr) cuMemFree(dptr); \`
			`if (hcuModule) cuModuleUnload(hcuModule); \`
			`if (hcuContext) cuCtxDestroy(hcuContext); \`
			`return status;`

			`#define THREAD_QUIT \`
			`printf("Error\n"); \`
			`return 0;`

			`// This sample uses the Driver API interface. The CUDA context needs`
			`// to be setup and the CUDA module (CUBIN) is built by NVCC`
			`static CUresult InitCUDAContext(CUDAContext *pContext, CUdevice hcuDevice,`
			`int deviceID, char **argv) {`
			`CUcontext hcuContext = 0;`
			`CUmodule hcuModule = 0;`
			`CUfunction hcuFunction = 0;`
			`CUdeviceptr dptr = 0;`

			`// cuCtxCreate: Function works on floating contexts and current context`
			`CUresult status = cuCtxCreate(&hcuContext, 0, hcuDevice);`

			`if (CUDA_SUCCESS != status) {`
			`fprintf(stderr, "cuCtxCreate for <deviceID=%d> failed %d\n", deviceID,`
			`status);`
			`CLEANUP_ON_ERROR(dptr, hcuModule, hcuContext, status);`
			`}`

			`status = CUDA_ERROR_INVALID_IMAGE;`

			`string module_path, ptx_source;`
			`std::ostringstream fatbin;`

			`if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {`
			`exit(EXIT_FAILURE);`
			`} else {`
			`printf("> initCUDA loading module: <%s>\n", module_path.c_str());`
			`}`

			`if (!fatbin.str().size()) {`
			`printf("fatbin file empty. exiting..\n");`
			`exit(EXIT_FAILURE);`
			`}`

			`// Create module from binary file (FATBIN)`
			`checkCudaErrors(cuModuleLoadData(&hcuModule, fatbin.str().c_str()));`

			`status = cuModuleGetFunction(&hcuFunction, hcuModule, "kernelFunction");`

			`if (CUDA_SUCCESS != status) {`
			`fprintf(stderr, "cuModuleGetFunction failed %d\n", status);`
			`CLEANUP_ON_ERROR(dptr, hcuModule, hcuContext, status);`
			`}`

			`// Here we must release the CUDA context from the thread context`
			`status = cuCtxPopCurrent(NULL);`

			`if (CUDA_SUCCESS != status) {`
			`fprintf(stderr, "cuCtxPopCurrent failed %d\n", status);`
			`CLEANUP_ON_ERROR(dptr, hcuModule, hcuContext, status);`
			`}`

			`pContext->hcuContext = hcuContext;`
			`pContext->hcuModule = hcuModule;`
			`pContext->hcuFunction = hcuFunction;`
			`pContext->deviceID = deviceID;`

			`return CUDA_SUCCESS;`
			`}`

			`// ThreadProc launches the CUDA kernel on a CUDA context.`
			`// We have more than one thread that talks to a CUDA context`
			`#if defined(WIN32) \|\| defined(_WIN32) \|\| defined(WIN64) \|\| defined(_WIN64)`
			`DWORD WINAPI ThreadProc(CUDAContext *pParams)`
			`#else`
			`void ThreadProc(CUDAContext pParams)`
			`#endif`
			`{`
			`int wrong = 0;`
			`int *pInt = 0;`

			`printf("<CUDA Device=%d, Context=%p, Thread=%d> - ThreadProc() Launched...\n",`
			`pParams->deviceID, pParams->hcuContext, pParams->threadNum);`

			`// cuCtxPushCurrent: Attach the caller CUDA context to the thread context.`
			`CUresult status = cuCtxPushCurrent(pParams->hcuContext);`

			`if (CUDA_SUCCESS != status) {`
			`THREAD_QUIT;`
			`}`
			`checkCudaErrors(cuMemAlloc(&pParams->dptr, NUM_INTS * sizeof(int)));`

			`// There are two ways to launch CUDA kernels via the Driver API.`
			`// In this CUDA Sample, we illustrate both ways to pass parameters`
			`// and specify parameters. By default we use the simpler method.`

			`if (1) {`
			`// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel`
			`// Launching (simpler method)`
			`void *args[5] = {&pParams->dptr};`

			`// new CUDA 4.0 Driver API Kernel launch call`
			`status = cuLaunchKernel(pParams->hcuFunction, 1, 1, 1, 32, 1, 1, 0, NULL,`
			`args, NULL);`

			`if (CUDA_SUCCESS != status) {`
			`fprintf(stderr, "cuLaunch failed %d\n", status);`
			`THREAD_QUIT;`
			`}`
			`} else {`
			`// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel`
			`// Launching (advanced method)`
			`int offset = 0;`
			`char argBuffer[256];`

			`// pass in launch parameters (not actually de-referencing CUdeviceptr).`
			`// CUdeviceptr is storing the value of the parameters`
			`((CUdeviceptr )&argBuffer[offset]) = pParams->dptr;`
			`offset += sizeof(CUdeviceptr);`

			`void *kernel_launch_config[5] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,`
			`CU_LAUNCH_PARAM_BUFFER_SIZE, &offset,`
			`CU_LAUNCH_PARAM_END};`

			`// new CUDA 4.0 Driver API Kernel launch call`
			`status = cuLaunchKernel(pParams->hcuFunction, 1, 1, 1, 32, 1, 1, 0, 0, NULL,`
			`(void **)&kernel_launch_config);`

			`if (CUDA_SUCCESS != status) {`
			`fprintf(stderr, "cuLaunch failed %d\n", status);`
			`THREAD_QUIT;`
			`}`
			`}`

			`pInt = (int )malloc(NUM_INTS sizeof(int));`

			`if (!pInt) return 0;`

			`if (CUDA_SUCCESS ==`
			`cuMemcpyDtoH(pInt, pParams->dptr, NUM_INTS * sizeof(int))) {`
			`for (int i = 0; i < NUM_INTS; i++) {`
			`if (pInt[i] != 32 - i) {`
			`printf("<CUDA Device=%d, Context=%p, Thread=%d> error [%d]=%d!\n",`
			`pParams->deviceID, pParams->hcuContext, pParams->threadNum, i,`
			`pInt[i]);`
			`wrong++;`
			`}`
			`}`

			`ENTERCRITICALSECTION`

			`if (!wrong) ThreadLaunchCount += 1;`

			`LEAVECRITICALSECTION`
			`}`

			`free(pInt);`
			`fflush(stdout);`
			`checkCudaErrors(cuMemFree(pParams->dptr));`

			`// cuCtxPopCurrent: Detach the current CUDA context from the calling thread.`
			`checkCudaErrors(cuCtxPopCurrent(NULL));`

			`printf("<CUDA Device=%d, Context=%p, Thread=%d> - ThreadProc() Finished!\n\n",`
			`pParams->deviceID, pParams->hcuContext, pParams->threadNum);`

			`return 0;`
			`}`

			`bool FinalErrorCheck(CUDAContext *pContext, int NumThreads, int deviceCount) {`
			`if (ThreadLaunchCount != NumThreads * deviceCount) {`
			`printf("<Expected=%d, Actual=%d> ThreadLaunchCounts(s)\n",`
			`NumThreads * deviceCount, ThreadLaunchCount);`
			`return false;`
			`} else {`
			`for (int iDevice = 0; iDevice < deviceCount; iDevice++) {`
			`// cuCtxDestroy called on current context or a floating context`
			`if (CUDA_SUCCESS != cuCtxDestroy(pContext[iDevice].hcuContext))`
			`return false;`
			`}`

			`return true;`
			`}`
			`}`

			`int main(int argc, char **argv) {`
			`printf("Starting threadMigration\n");`

			`bool bTestResult = runTest(argc, argv);`

			`exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);`
			`}`

			`bool runTest(int argc, char **argv) {`
			`printf("[ threadMigration ] API test...\n");`
			`#if defined(WIN32) \|\| defined(_WIN32) \|\| defined(WIN64) \|\| defined(_WIN64)`
			`InitializeCriticalSection(&g_cs);`
			`#else`
			`pthread_mutex_init(&g_mutex, NULL);`
			`#endif`
			`// By default, we will launch 2 CUDA threads for each device`
			`NumThreads = 2;`

			`if (argc > 1) {`
			`// If we are doing the QAtest or automated testing, we quit without`
			`// prompting`
			`if (checkCmdLineFlag(argc, (const char **)argv, "qatest") \|\|`
			`checkCmdLineFlag(argc, (const char **)argv, "noprompt")) {`
			`gbAutoQuit = true;`
			`}`

			`if (checkCmdLineFlag(argc, (const char **)argv, "numthreads")) {`
			`NumThreads =`
			`getCmdLineArgumentInt(argc, (const char **)argv, "numthreads");`

			`if (NumThreads < 1 \|\| NumThreads > 15) {`
			`printf(`
			`"Usage: \"threadMigration -n=<threads>\", <threads> ranges 1-15\n");`
			`return 1;`
			`}`
			`}`
			`}`

			`int deviceCount;`
			`int hcuDevice = 0;`
			`CUresult status;`
			`status = cuInit(0);`

			`if (CUDA_SUCCESS != status) return false;`

			`status = cuDeviceGetCount(&deviceCount);`

			`if (CUDA_SUCCESS != status) return false;`

			`printf("> %d CUDA device(s), %d Thread(s)/device to launched\n\n",`
			`deviceCount, NumThreads);`

			`if (deviceCount == 0) {`
			`return false;`
			`}`

			`int ihThread = 0;`
			`int ThreadIndex = 0;`

			`CUDAContext *pContext =`
			`(CUDAContext )malloc(sizeof(CUDAContext) deviceCount);`

			`for (int iDevice = 0; iDevice < deviceCount; iDevice++) {`
			`char szName[256];`
			`status = cuDeviceGet(&hcuDevice, iDevice);`

			`if (CUDA_SUCCESS != status) return false;`

			`status = cuDeviceGetName(szName, 256, hcuDevice);`

			`if (CUDA_SUCCESS != status) return false;`

			`{`
			`int major = 0, minor = 0;`
			`checkCudaErrors(cuDeviceGetAttribute(`
			`&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hcuDevice));`
			`checkCudaErrors(cuDeviceGetAttribute(`
			`&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hcuDevice));`
			`int sharedMemPerBlock;`
			`checkCudaErrors(cuDeviceGetAttribute(`
			`&sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,`
			`hcuDevice));`
			`int totalConstantMemory;`
			`checkCudaErrors(cuDeviceGetAttribute(`
			`&totalConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY,`
			`hcuDevice));`
			`int regsPerBlock;`
			`checkCudaErrors(cuDeviceGetAttribute(`
			`&regsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,`
			`hcuDevice));`
			`int clockRate;`
			`checkCudaErrors(cuDeviceGetAttribute(`
			`&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, hcuDevice));`
			`printf("Device %d: \"%s\" (Compute %d.%d)\n", iDevice, szName, major,`
			`minor);`
			`printf("\tsharedMemPerBlock: %d\n", sharedMemPerBlock);`
			`printf("\tconstantMemory : %d\n", totalConstantMemory);`
			`printf("\tregsPerBlock : %d\n", regsPerBlock);`
			`printf("\tclockRate : %d\n", clockRate);`
			`printf("\n");`
			`}`

			`if (CUDA_SUCCESS !=`
			`InitCUDAContext(&pContext[iDevice], hcuDevice, iDevice, argv)) {`
			`return FinalErrorCheck(pContext, NumThreads, deviceCount);`
			`} else {`
			`for (int iThread = 0; iThread < NumThreads; iThread++, ihThread++) {`
			`g_ThreadParams[ThreadIndex].hcuContext = pContext[iDevice].hcuContext;`
			`g_ThreadParams[ThreadIndex].hcuModule = pContext[iDevice].hcuModule;`
			`g_ThreadParams[ThreadIndex].hcuFunction = pContext[iDevice].hcuFunction;`
			`g_ThreadParams[ThreadIndex].deviceID = pContext[iDevice].deviceID;`
			`g_ThreadParams[ThreadIndex].threadNum = iThread;`
			`// Launch (NumThreads) for each CUDA context`
			`#if defined(WIN32) \|\| defined(_WIN32) \|\| defined(WIN64) \|\| defined(_WIN64)`
			`rghThreads[ThreadIndex] = CreateThread(`
			`NULL, 0, (LPTHREAD_START_ROUTINE)ThreadProc,`
			`&g_ThreadParams[ThreadIndex], 0, &rgdwThreadIds[ThreadIndex]);`
			`#else // Assume we are running linux`
			`pthread_create(&rghThreads[ThreadIndex], NULL,`
			`(void ()(void *))ThreadProc,`
			`&g_ThreadParams[ThreadIndex]);`
			`#endif`
			`ThreadIndex += 1;`
			`}`
			`}`
			`}`

			`// Wait until all workers are done`
			`#if defined(WIN32) \|\| defined(_WIN32) \|\| defined(WIN64) \|\| defined(_WIN64)`
			`WaitForMultipleObjects(ThreadIndex, rghThreads, TRUE, INFINITE);`
			`#else`

			`for (int i = 0; i < ThreadIndex; i++) {`
			`pthread_join(rghThreads[i], NULL);`
			`}`

			`#endif`

			`bool ret_status = FinalErrorCheck(pContext, NumThreads, deviceCount);`
			`free(pContext);`
			`return ret_status;`
			`}`