// deviceQuery.cpp, mirrored from https://github.com/NVIDIA/cuda-samples.git

/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* This sample queries the properties of the CUDA devices present in the system
 * via CUDA Runtime API. */

// std::system includes

#include <cuda_runtime.h>
#include <helper_cuda.h>
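// Note: helper_cuda.h is not a CUDA toolkit header; it ships with the samples
// (in the repository's Common/ directory) and provides checkCudaErrors() and
// the _ConvertSMVer2Cores() lookup used below.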

#include <iostream>
#include <memory>
#include <string>

int *pArgc = NULL;
char **pArgv = NULL;

#if CUDART_VERSION < 5000

// CUDA-C includes
#include <cuda.h>

// This function wraps the CUDA Driver API into a template function
template <class T>
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute,
                             int device) {
  CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device);

  if (CUDA_SUCCESS != error) {
    fprintf(
        stderr,
        "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n",
        error, __FILE__, __LINE__);

    exit(EXIT_FAILURE);
  }
}

#endif /* CUDART_VERSION < 5000 */
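
// Illustrative use of the getCudaAttribute<> wrapper above (hypothetical
// device index 0; only compiled when CUDART_VERSION < 5000):
//   int memClkKHz;
//   getCudaAttribute<int>(&memClkKHz, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, 0);
// The pre-5.0 branch in main() below uses this same pattern.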

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
  pArgc = &argc;
  pArgv = argv;

  printf("%s Starting...\n\n", argv[0]);
  printf(
      " CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");

  int deviceCount = 0;
  cudaError_t error_id = cudaGetDeviceCount(&deviceCount);

  if (error_id != cudaSuccess) {
    printf("cudaGetDeviceCount returned %d\n-> %s\n",
           static_cast<int>(error_id), cudaGetErrorString(error_id));
    printf("Result = FAIL\n");
    exit(EXIT_FAILURE);
  }
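
  // A failure above typically means no CUDA-capable driver is installed or
  // the installed driver is older than this runtime (for example,
  // cudaErrorInsufficientDriver).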

  // This function call returns 0 if there are no CUDA capable devices.
  if (deviceCount == 0) {
    printf("There are no available device(s) that support CUDA\n");
  } else {
    printf("Detected %d CUDA Capable device(s)\n", deviceCount);
  }

  int dev, driverVersion = 0, runtimeVersion = 0;

  for (dev = 0; dev < deviceCount; ++dev) {
    cudaSetDevice(dev);
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);

    printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);

    // Console log
    cudaDriverGetVersion(&driverVersion);
    cudaRuntimeGetVersion(&runtimeVersion);
    printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n",
           driverVersion / 1000, (driverVersion % 100) / 10,
           runtimeVersion / 1000, (runtimeVersion % 100) / 10);
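    // Both calls report the version encoded as 1000 * major + 10 * minor,
    // so the arithmetic above decodes e.g. 12040 to "12.4".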
    printf("  CUDA Capability Major/Minor version number:    %d.%d\n",
           deviceProp.major, deviceProp.minor);

    char msg[256];
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(msg, sizeof(msg),
              "  Total amount of global memory:                 %.0f MBytes "
              "(%llu bytes)\n",
              static_cast<float>(deviceProp.totalGlobalMem / 1048576.0f),
              (unsigned long long)deviceProp.totalGlobalMem);
#else
    snprintf(msg, sizeof(msg),
             "  Total amount of global memory:                 %.0f MBytes "
             "(%llu bytes)\n",
             static_cast<float>(deviceProp.totalGlobalMem / 1048576.0f),
             (unsigned long long)deviceProp.totalGlobalMem);
#endif
    printf("%s", msg);

    printf("  (%2d) Multiprocessors, (%3d) CUDA Cores/MP:     %d CUDA Cores\n",
           deviceProp.multiProcessorCount,
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
               deviceProp.multiProcessorCount);
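    // _ConvertSMVer2Cores() (helper_cuda.h) maps a compute capability such as
    // 8.6 to the number of CUDA cores per SM via a lookup table; the total is
    // cores-per-SM times the multiprocessor count.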
    printf(
        "  GPU Max Clock rate:                            %.0f MHz (%0.2f "
        "GHz)\n",
        deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);
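    // deviceProp.clockRate is reported in kHz, hence the 1e-3f (MHz) and
    // 1e-6f (GHz) scale factors.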

#if CUDART_VERSION >= 5000
    // This is supported in CUDA 5.0 (runtime API device properties)
    printf("  Memory Clock rate:                             %.0f MHz\n",
           deviceProp.memoryClockRate * 1e-3f);
    printf("  Memory Bus Width:                              %d-bit\n",
           deviceProp.memoryBusWidth);

    if (deviceProp.l2CacheSize) {
      printf("  L2 Cache Size:                                 %d bytes\n",
             deviceProp.l2CacheSize);
    }

#else
    // This is only available in CUDA 4.0-4.2 (but these attributes were only
    // exposed in the CUDA Driver API)
    int memoryClock;
    getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
                          dev);
    printf("  Memory Clock rate:                             %.0f MHz\n",
           memoryClock * 1e-3f);
    int memBusWidth;
    getCudaAttribute<int>(&memBusWidth,
                          CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
    printf("  Memory Bus Width:                              %d-bit\n",
           memBusWidth);
    int L2CacheSize;
    getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);

    if (L2CacheSize) {
      printf("  L2 Cache Size:                                 %d bytes\n",
             L2CacheSize);
    }

#endif
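    // Both branches report the memory clock in kHz, matching the 1e-3f
    // scaling to MHz above.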

    printf(
        "  Maximum Texture Dimension Size (x,y,z)         1D=(%d), 2D=(%d, "
        "%d), 3D=(%d, %d, %d)\n",
        deviceProp.maxTexture1D, deviceProp.maxTexture2D[0],
        deviceProp.maxTexture2D[1], deviceProp.maxTexture3D[0],
        deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
    printf(
        "  Maximum Layered 1D Texture Size, (num) layers  1D=(%d), %d layers\n",
        deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]);
    printf(
        "  Maximum Layered 2D Texture Size, (num) layers  2D=(%d, %d), %d "
        "layers\n",
        deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1],
        deviceProp.maxTexture2DLayered[2]);

    printf("  Total amount of constant memory:               %lu bytes\n",
           deviceProp.totalConstMem);
    printf("  Total amount of shared memory per block:       %lu bytes\n",
           deviceProp.sharedMemPerBlock);
    printf("  Total number of registers available per block: %d\n",
           deviceProp.regsPerBlock);
    printf("  Warp size:                                     %d\n",
           deviceProp.warpSize);
    printf("  Maximum number of threads per multiprocessor:  %d\n",
           deviceProp.maxThreadsPerMultiProcessor);
    printf("  Maximum number of threads per block:           %d\n",
           deviceProp.maxThreadsPerBlock);
    printf("  Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
           deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1],
           deviceProp.maxThreadsDim[2]);
    printf("  Max dimension size of a grid size    (x,y,z): (%d, %d, %d)\n",
           deviceProp.maxGridSize[0], deviceProp.maxGridSize[1],
           deviceProp.maxGridSize[2]);
    printf("  Maximum memory pitch:                          %lu bytes\n",
           deviceProp.memPitch);
    printf("  Texture alignment:                             %lu bytes\n",
           deviceProp.textureAlignment);
    printf(
        "  Concurrent copy and kernel execution:          %s with %d copy "
        "engine(s)\n",
        (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
    printf("  Run time limit on kernels:                     %s\n",
           deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
    printf("  Integrated GPU sharing Host Memory:            %s\n",
           deviceProp.integrated ? "Yes" : "No");
    printf("  Support host page-locked memory mapping:       %s\n",
           deviceProp.canMapHostMemory ? "Yes" : "No");
    printf("  Alignment requirement for Surfaces:            %s\n",
           deviceProp.surfaceAlignment ? "Yes" : "No");
    printf("  Device has ECC support:                        %s\n",
           deviceProp.ECCEnabled ? "Enabled" : "Disabled");
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    printf("  CUDA Device Driver Mode (TCC or WDDM):         %s\n",
           deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)"
                                : "WDDM (Windows Display Driver Model)");
#endif
    printf("  Device supports Unified Addressing (UVA):      %s\n",
           deviceProp.unifiedAddressing ? "Yes" : "No");
    printf("  Device supports Compute Preemption:            %s\n",
           deviceProp.computePreemptionSupported ? "Yes" : "No");
    printf("  Supports Cooperative Kernel Launch:            %s\n",
           deviceProp.cooperativeLaunch ? "Yes" : "No");
    printf("  Supports MultiDevice Co-op Kernel Launch:      %s\n",
           deviceProp.cooperativeMultiDeviceLaunch ? "Yes" : "No");
    printf("  Device PCI Domain ID / Bus ID / location ID:   %d / %d / %d\n",
           deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);

    const char *sComputeMode[] = {
        "Default (multiple host threads can use ::cudaSetDevice() with device "
        "simultaneously)",
        "Exclusive (only one host thread in one process is able to use "
        "::cudaSetDevice() with this device)",
        "Prohibited (no host thread can use ::cudaSetDevice() with this "
        "device)",
        "Exclusive Process (many threads in one process are able to use "
        "::cudaSetDevice() with this device)",
        "Unknown",
        NULL};
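    // deviceProp.computeMode is a cudaComputeMode enum value (Default = 0,
    // Exclusive = 1, Prohibited = 2, Exclusive Process = 3) and indexes the
    // table above directly.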
    printf("  Compute Mode:\n");
    printf("     < %s >\n", sComputeMode[deviceProp.computeMode]);
  }

  // If there are 2 or more GPUs, query whether peer-to-peer (P2P) access is
  // supported between them
  if (deviceCount >= 2) {
    cudaDeviceProp prop[64];
    int gpuid[64];  // we want to find the first two GPUs that can support P2P
    int gpu_p2p_count = 0;

    for (int i = 0; i < deviceCount; i++) {
      checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));

      // Only boards based on Fermi or later can support P2P
      if ((prop[i].major >= 2)
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
          // on Windows (64-bit), the Tesla Compute Cluster driver for Windows
          // must be enabled to support this
          && prop[i].tccDriver
#endif
      ) {
        // This is an array of P2P capable GPUs
        gpuid[gpu_p2p_count++] = i;
      }
    }

    // Show all the combinations of P2P-capable GPUs
    int can_access_peer;

    if (gpu_p2p_count >= 2) {
      for (int i = 0; i < gpu_p2p_count; i++) {
        for (int j = 0; j < gpu_p2p_count; j++) {
          if (gpuid[i] == gpuid[j]) {
            continue;
          }
          checkCudaErrors(
              cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
          printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n",
                 prop[gpuid[i]].name, gpuid[i], prop[gpuid[j]].name, gpuid[j],
                 can_access_peer ? "Yes" : "No");
        }
      }
    }
  }
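
  // Peer capability is direction-specific, which is why both (i, j) and
  // (j, i) orderings are queried above. cudaDeviceCanAccessPeer() only
  // reports capability; an application still calls
  // cudaDeviceEnablePeerAccess() to actually enable the mapping.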

  // csv masterlog info
  // *****************************
  // exe and CUDA driver name
  printf("\n");
  std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
  char cTemp[16];

  // driver version
  sProfileString += ", CUDA Driver Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
  sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
#else
  snprintf(cTemp, sizeof(cTemp), "%d.%d", driverVersion / 1000,
           (driverVersion % 100) / 10);
#endif
  sProfileString += cTemp;

  // Runtime version
  sProfileString += ", CUDA Runtime Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
  sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
#else
  snprintf(cTemp, sizeof(cTemp), "%d.%d", runtimeVersion / 1000,
           (runtimeVersion % 100) / 10);
#endif
  sProfileString += cTemp;

  // Device count
  sProfileString += ", NumDevs = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
  sprintf_s(cTemp, 10, "%d", deviceCount);
#else
  snprintf(cTemp, sizeof(cTemp), "%d", deviceCount);
#endif
  sProfileString += cTemp;
  sProfileString += "\n";
  printf("%s", sProfileString.c_str());

  printf("Result = PASS\n");

  // finish
  exit(EXIT_SUCCESS);
}