cuda-samples/Samples/alignedTypes/alignedTypes.cu

/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This is a simple test showing huge access speed gap
 * between aligned and misaligned structures
 * (those having/missing __align__ keyword).
 * It measures per-element copy throughput for
 * aligned and misaligned structures on
 * big chunks of data.
 */

// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes, project
#include <helper_cuda.h>  // helper functions for CUDA error checking and initialization
#include <helper_functions.h>  // helper utility functions

////////////////////////////////////////////////////////////////////////////////
// Misaligned types
////////////////////////////////////////////////////////////////////////////////
// Single-byte scalar element; main() runs the copy test on it with a packed
// size of 1 byte.
typedef unsigned char uint8;

// Two-byte scalar element; main() runs the copy test on it with a packed
// size of 2 bytes.
typedef unsigned short int uint16;

// Four unsigned char channels with no __align__ specifier, so the struct only
// has the alignment its byte-sized members require. Counterpart of RGBA8.
typedef struct {
  unsigned char r, g, b, a;
} RGBA8_misaligned;

// Two unsigned int fields (8 packed bytes as validated by main()) with no
// __align__ specifier. Counterpart of LA32.
typedef struct {
  unsigned int l, a;
} LA32_misaligned;

// Three unsigned int fields (12 packed bytes as validated by main()) with no
// __align__ specifier. Counterpart of RGB32.
typedef struct {
  unsigned int r, g, b;
} RGB32_misaligned;

// Four unsigned int fields (16 packed bytes as validated by main()) with no
// __align__ specifier. Counterpart of RGBA32.
typedef struct {
  unsigned int r, g, b, a;
} RGBA32_misaligned;

////////////////////////////////////////////////////////////////////////////////
// Aligned types
////////////////////////////////////////////////////////////////////////////////
// Same four unsigned char channels as RGBA8_misaligned, but declared with
// __align__(4) so the alignment equals the 4-byte packed size.
typedef struct __align__(4) {
  unsigned char r, g, b, a;
}
RGBA8;

// Plain unsigned int scalar; main() tests it in the aligned group with a
// packed size of 4 bytes.
typedef unsigned int I32;

// Same two unsigned int fields as LA32_misaligned, but declared with
// __align__(8) so the alignment equals the 8-byte packed size.
typedef struct __align__(8) {
  unsigned int l, a;
}
LA32;

// Same three unsigned int fields as RGB32_misaligned, declared with
// __align__(16). The fields occupy 12 bytes (the packed size main() passes to
// the validator); the 16-byte alignment implies trailing padding, which the
// validator deliberately does not compare.
typedef struct __align__(16) {
  unsigned int r, g, b;
}
RGB32;

// Same four unsigned int fields as RGBA32_misaligned, but declared with
// __align__(16) so the alignment equals the 16-byte packed size.
typedef struct __align__(16) {
  unsigned int r, g, b, a;
}
RGBA32;

////////////////////////////////////////////////////////////////////////////////
// Because G80 class hardware natively supports global memory operations
// only with data elements of 4, 8 and 16 bytes, if structure size
// exceeds 16 bytes, it can't be efficiently read or written,
// since more than one global memory non-coalescable load/store instructions
// will be generated, even if __align__ option is supplied.
// "Structure of arrays" storage strategy offers best performance
// in general case. See section 5.1.2 of the Programming Guide.
////////////////////////////////////////////////////////////////////////////////
// Pair of RGBA32 values in one 16-byte-aligned struct (32 packed bytes as
// validated by main()); used as the "larger than 16 bytes" case described
// in the comment above.
typedef struct __align__(16) {
  RGBA32 c1, c2;
}
RGBA32_2;

////////////////////////////////////////////////////////////////////////////////
// Common host and device functions
////////////////////////////////////////////////////////////////////////////////
// Round a / b to nearest higher integer value
int iDivUp(int a, int b) {
  // Start from the truncated quotient and bump it by one whenever the
  // division leaves a non-zero remainder.
  int quotient = a / b;
  if (a % b != 0) {
    ++quotient;
  }
  return quotient;
}

// Round a / b to nearest lower integer value
int iDivDown(int a, int b) {
  // Plain integer division: the fractional part of a / b is discarded.
  const int quotient = a / b;
  return quotient;
}

// Align a to nearest higher multiple of b
int iAlignUp(int a, int b) {
  const int remainder = a % b;

  // Already a multiple of b: nothing to do.
  if (remainder == 0) {
    return a;
  }

  // Drop the remainder, then step up to the next multiple of b.
  return a - remainder + b;
}

// Align a to nearest lower multiple of b
int iAlignDown(int a, int b) {
  // Subtract whatever is left over after dividing by b.
  const int remainder = a % b;
  return a - remainder;
}

////////////////////////////////////////////////////////////////////////////////
// Simple CUDA kernel.
// Copy is carried out on per-element basis,
// so it's not per-byte in case of padded structures.
////////////////////////////////////////////////////////////////////////////////
template <class TData>
__global__ void testKernel(TData *d_odata, TData *d_idata, int numElements) {
  // Only the .x launch dimensions are consulted, so the kernel treats the
  // launch as one-dimensional.
  // Total thread count of the launch; each thread advances by this amount.
  const int stride = blockDim.x * gridDim.x;

  // Start at this thread's flat index and copy one whole TData element per
  // step until the end of the array is passed.
  int idx = blockDim.x * blockIdx.x + threadIdx.x;

  while (idx < numElements) {
    d_odata[idx] = d_idata[idx];
    idx += stride;
  }
}

////////////////////////////////////////////////////////////////////////////////
// Validation routine for simple copy kernel.
// We must know "packed" size of TData (number_of_fields * sizeof(simple_type))
// and compare only these "packed" parts of the structure,
// containing actual user data. The compiler behavior with padding bytes
// is undefined, since padding is merely a placeholder
// and doesn't contain any user data.
////////////////////////////////////////////////////////////////////////////////
// Compares the first packedElementSize bytes of every element in h_odata
// against the corresponding element of h_idata.
// Returns 1 when all compared bytes match, 0 at the first mismatch.
template <class TData>
int testCPU(TData *h_odata, TData *h_idata, int numElements,
            int packedElementSize) {
  for (int elem = 0; elem < numElements; ++elem) {
    // Work on local copies of the two elements being compared.
    TData expected = h_idata[elem];
    TData actual = h_odata[elem];

    const char *expectedBytes = (const char *)&expected;
    const char *actualBytes = (const char *)&actual;

    // Bytes past packedElementSize (padding) are intentionally ignored.
    for (int byte = 0; byte < packedElementSize; ++byte) {
      if (expectedBytes[byte] != actualBytes[byte]) {
        return 0;
      }
    }
  }

  return 1;
}

////////////////////////////////////////////////////////////////////////////////
// Data configuration
////////////////////////////////////////////////////////////////////////////////
// Memory chunk size in bytes. main() may scale this down before allocating,
// and the same buffers are then reused for every element type tested.
const int MEM_SIZE = 50000000;
// Number of kernel launches timed per test; runTest() divides the elapsed
// time by this value to report an average.
const int NUM_ITERATIONS = 32;

// GPU input and output data (allocated and freed in main())
unsigned char *d_idata, *d_odata;
// CPU input data and instance of GPU output data (allocated and freed in
// main())
unsigned char *h_idataCPU, *h_odataGPU;
// Timer shared by all tests; created and deleted in main(), reset at the
// start of each runTest() call.
StopWatchInterface *hTimer = NULL;

// Times NUM_ITERATIONS launches of testKernel<TData> over the global
// d_idata -> d_odata buffers, prints the average time and throughput, then
// validates the copied data on the host.
//   packedElementSize: number of leading bytes per element that hold user data
//   memory_size:       size in bytes of the host and device buffers
// Returns 0 when validation succeeds and 1 when it fails.
template <class TData>
int runTest(int packedElementSize, int memory_size) {
  // Number of whole TData elements that fit in the buffer, and the number of
  // bytes those elements span.
  const int elementCount = iDivDown(memory_size, sizeof(TData));
  const int alignedBytes = iAlignDown(memory_size, sizeof(TData));

  // Zero the output buffer before the test runs
  checkCudaErrors(cudaMemset(d_odata, 0, memory_size));

  // Make sure the device is idle before the timer starts
  checkCudaErrors(cudaDeviceSynchronize());
  sdkResetTimer(&hTimer);
  sdkStartTimer(&hTimer);

  for (int iter = 0; iter < NUM_ITERATIONS; ++iter) {
    testKernel<TData>
        <<<64, 256>>>((TData *)d_odata, (TData *)d_idata, elementCount);
    getLastCudaError("testKernel() execution failed\n");
  }

  // Wait for all launches to finish before stopping the timer
  checkCudaErrors(cudaDeviceSynchronize());
  sdkStopTimer(&hTimer);

  // Average time per launch in milliseconds; throughput is bytes per second
  // divided by 2^30.
  const double avgTimeMs = sdkGetTimerValue(&hTimer) / NUM_ITERATIONS;
  const double throughput =
      (double)alignedBytes / (avgTimeMs * 0.001 * 1073741824.0);
  printf("Avg. time: %f ms / Copy throughput: %f GB/s.\n", avgTimeMs,
         throughput);

  // Copy the GPU output back to the host and compare it with the input
  checkCudaErrors(
      cudaMemcpy(h_odataGPU, d_odata, memory_size, cudaMemcpyDeviceToHost));
  const int passed = testCPU((TData *)h_odataGPU, (TData *)h_idataCPU,
                             elementCount, packedElementSize);

  if (passed) {
    printf("\tTEST OK\n");
  } else {
    printf("\tTEST FAILURE\n");
  }

  return !passed;
}

// Program entry point.
// Selects a CUDA device, sizes the working buffers from the device's core
// count, fills the input with a repeating byte pattern, and runs the copy
// test once per element type (misaligned types first, then aligned ones).
// Exits with EXIT_SUCCESS when every test validates, EXIT_FAILURE otherwise
// (including when host memory cannot be allocated).
int main(int argc, char **argv) {
  int i, nTotalFailures = 0;

  int devID;
  cudaDeviceProp deviceProp;
  printf("[%s] - Starting...\n", argv[0]);

  // find first CUDA device
  devID = findCudaDevice(argc, (const char **)argv);

  // get number of SMs on this GPU
  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
  printf("[%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n", deviceProp.name,
         deviceProp.multiProcessorCount,
         _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
         _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
             deviceProp.multiProcessorCount);

  // Anything that is less than 192 Cores will have a scaled down workload
  // (scale_factor is clamped to a minimum of 1, so the workload never grows
  // beyond MEM_SIZE).
  float scale_factor =
      max((192.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
                     (float)deviceProp.multiProcessorCount)),
          1.0f);

  int MemorySize = (int)(MEM_SIZE / scale_factor) &
                   0xffffff00;  // force multiple of 256 bytes

  printf("> Compute scaling value = %4.2f\n", scale_factor);
  printf("> Memory Size = %d\n", MemorySize);

  sdkCreateTimer(&hTimer);

  printf("Allocating memory...\n");
  h_idataCPU = (unsigned char *)malloc(MemorySize);
  h_odataGPU = (unsigned char *)malloc(MemorySize);

  // malloc signals failure by returning NULL; without this check the fill
  // loop and the memcpy calls below would dereference a null pointer.
  if (h_idataCPU == NULL || h_odataGPU == NULL) {
    fprintf(stderr, "Failed to allocate %d bytes of host memory\n",
            MemorySize);
    free(h_idataCPU);
    free(h_odataGPU);
    sdkDeleteTimer(&hTimer);
    exit(EXIT_FAILURE);
  }

  checkCudaErrors(cudaMalloc((void **)&d_idata, MemorySize));
  checkCudaErrors(cudaMalloc((void **)&d_odata, MemorySize));

  printf("Generating host input data array...\n");

  // Repeating byte pattern 1, 2, ..., 255, 0 (the value 256 wraps to 0 when
  // stored into an unsigned char).
  for (i = 0; i < MemorySize; i++) {
    h_idataCPU[i] = (i & 0xFF) + 1;
  }

  printf("Uploading input data to GPU memory...\n");
  checkCudaErrors(
      cudaMemcpy(d_idata, h_idataCPU, MemorySize, cudaMemcpyHostToDevice));

  // The first argument of each runTest call is the packed size in bytes of
  // one element, i.e. the bytes that carry user data and get validated.
  printf("Testing misaligned types...\n");
  printf("uint8...\n");
  nTotalFailures += runTest<uint8>(1, MemorySize);

  printf("uint16...\n");
  nTotalFailures += runTest<uint16>(2, MemorySize);

  printf("RGBA8_misaligned...\n");
  nTotalFailures += runTest<RGBA8_misaligned>(4, MemorySize);

  printf("LA32_misaligned...\n");
  nTotalFailures += runTest<LA32_misaligned>(8, MemorySize);

  printf("RGB32_misaligned...\n");
  nTotalFailures += runTest<RGB32_misaligned>(12, MemorySize);

  printf("RGBA32_misaligned...\n");
  nTotalFailures += runTest<RGBA32_misaligned>(16, MemorySize);

  printf("Testing aligned types...\n");
  printf("RGBA8...\n");
  nTotalFailures += runTest<RGBA8>(4, MemorySize);

  printf("I32...\n");
  nTotalFailures += runTest<I32>(4, MemorySize);

  printf("LA32...\n");
  nTotalFailures += runTest<LA32>(8, MemorySize);

  printf("RGB32...\n");
  nTotalFailures += runTest<RGB32>(12, MemorySize);

  printf("RGBA32...\n");
  nTotalFailures += runTest<RGBA32>(16, MemorySize);

  printf("RGBA32_2...\n");
  nTotalFailures += runTest<RGBA32_2>(32, MemorySize);

  printf("\n[alignedTypes] -> Test Results: %d Failures\n", nTotalFailures);

  printf("Shutting down...\n");
  checkCudaErrors(cudaFree(d_idata));
  checkCudaErrors(cudaFree(d_odata));
  free(h_odataGPU);
  free(h_idataCPU);

  sdkDeleteTimer(&hTimer);

  if (nTotalFailures != 0) {
    printf("Test failed!\n");
    exit(EXIT_FAILURE);
  }

  printf("Test passed\n");
  exit(EXIT_SUCCESS);
}
add and update samples for CUDA 11.5 2021-10-21 19:04:49 +08:00			`/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.`
			`*`
			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions`
			`* are met:`
			`* * Redistributions of source code must retain the above copyright`
			`* notice, this list of conditions and the following disclaimer.`
			`* * Redistributions in binary form must reproduce the above copyright`
			`* notice, this list of conditions and the following disclaimer in the`
			`* documentation and/or other materials provided with the distribution.`
			`* * Neither the name of NVIDIA CORPORATION nor the names of its`
			`* contributors may be used to endorse or promote products derived`
			`* from this software without specific prior written permission.`
			`*`
			* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
			`* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR`
			`* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR`
			`* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,`
			`* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,`
			`* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR`
			`* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY`
			`* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`*/`

			`/*`
			`* This is a simple test showing huge access speed gap`
			`* between aligned and misaligned structures`
			`* (those having/missing __align__ keyword).`
			`* It measures per-element copy throughput for`
			`* aligned and misaligned structures on`
			`* big chunks of data.`
			`*/`

			`// includes, system`
			`#include <math.h>`
			`#include <stdio.h>`
			`#include <stdlib.h>`
			`#include <string.h>`

			`// includes, project`
			`#include <helper_cuda.h> // helper functions for CUDA error checking and initialization`
			`#include <helper_functions.h> // helper utility functions`

			`////////////////////////////////////////////////////////////////////////////////`
			`// Misaligned types`
			`////////////////////////////////////////////////////////////////////////////////`
			`typedef unsigned char uint8;`

			`typedef unsigned short int uint16;`

			`typedef struct {`
			`unsigned char r, g, b, a;`
			`} RGBA8_misaligned;`

			`typedef struct {`
			`unsigned int l, a;`
			`} LA32_misaligned;`

			`typedef struct {`
			`unsigned int r, g, b;`
			`} RGB32_misaligned;`

			`typedef struct {`
			`unsigned int r, g, b, a;`
			`} RGBA32_misaligned;`

			`////////////////////////////////////////////////////////////////////////////////`
			`// Aligned types`
			`////////////////////////////////////////////////////////////////////////////////`
			`typedef struct __align__(4) {`
			`unsigned char r, g, b, a;`
			`}`
			`RGBA8;`

			`typedef unsigned int I32;`

			`typedef struct __align__(8) {`
			`unsigned int l, a;`
			`}`
			`LA32;`

			`typedef struct __align__(16) {`
			`unsigned int r, g, b;`
			`}`
			`RGB32;`

			`typedef struct __align__(16) {`
			`unsigned int r, g, b, a;`
			`}`
			`RGBA32;`

			`////////////////////////////////////////////////////////////////////////////////`
			`// Because G80 class hardware natively supports global memory operations`
			`// only with data elements of 4, 8 and 16 bytes, if structure size`
			`// exceeds 16 bytes, it can't be efficiently read or written,`
			`// since more than one global memory non-coalescable load/store instructions`
			`// will be generated, even if __align__ option is supplied.`
			`// "Structure of arrays" storage strategy offers best performance`
			`// in general case. See section 5.1.2 of the Programming Guide.`
			`////////////////////////////////////////////////////////////////////////////////`
			`typedef struct __align__(16) {`
			`RGBA32 c1, c2;`
			`}`
			`RGBA32_2;`

			`////////////////////////////////////////////////////////////////////////////////`
			`// Common host and device functions`
			`////////////////////////////////////////////////////////////////////////////////`
			`// Round a / b to nearest higher integer value`
			`int iDivUp(int a, int b) { return (a % b != 0) ? (a / b + 1) : (a / b); }`

			`// Round a / b to nearest lower integer value`
			`int iDivDown(int a, int b) { return a / b; }`

			`// Align a to nearest higher multiple of b`
			`int iAlignUp(int a, int b) { return (a % b != 0) ? (a - a % b + b) : a; }`

			`// Align a to nearest lower multiple of b`
			`int iAlignDown(int a, int b) { return a - a % b; }`

			`////////////////////////////////////////////////////////////////////////////////`
			`// Simple CUDA kernel.`
			`// Copy is carried out on per-element basis,`
			`// so it's not per-byte in case of padded structures.`
			`////////////////////////////////////////////////////////////////////////////////`
			`template <class TData>`
			`__global__ void testKernel(TData d_odata, TData d_idata, int numElements) {`
			`const int tid = blockDim.x * blockIdx.x + threadIdx.x;`
			`const int numThreads = blockDim.x * gridDim.x;`

			`for (int pos = tid; pos < numElements; pos += numThreads) {`
			`d_odata[pos] = d_idata[pos];`
			`}`
			`}`

			`////////////////////////////////////////////////////////////////////////////////`
			`// Validation routine for simple copy kernel.`
			`// We must know "packed" size of TData (number_of_fields * sizeof(simple_type))`
			`// and compare only these "packed" parts of the structure,`
			`// containing actual user data. The compiler behavior with padding bytes`
			`// is undefined, since padding is merely a placeholder`
			`// and doesn't contain any user data.`
			`////////////////////////////////////////////////////////////////////////////////`
			`template <class TData>`
			`int testCPU(TData h_odata, TData h_idata, int numElements,`
			`int packedElementSize) {`
			`for (int pos = 0; pos < numElements; pos++) {`
			`TData src = h_idata[pos];`
			`TData dst = h_odata[pos];`

			`for (int i = 0; i < packedElementSize; i++)`
			`if (((char )&src)[i] != ((char )&dst)[i]) {`
			`return 0;`
			`}`
			`}`

			`return 1;`
			`}`

			`////////////////////////////////////////////////////////////////////////////////`
			`// Data configuration`
			`////////////////////////////////////////////////////////////////////////////////`
			`// Memory chunk size in bytes. Reused for test`
			`const int MEM_SIZE = 50000000;`
			`const int NUM_ITERATIONS = 32;`

			`// GPU input and output data`
			`unsigned char d_idata, d_odata;`
			`// CPU input data and instance of GPU output data`
			`unsigned char h_idataCPU, h_odataGPU;`
			`StopWatchInterface *hTimer = NULL;`

			`template <class TData>`
			`int runTest(int packedElementSize, int memory_size) {`
			`const int totalMemSizeAligned = iAlignDown(memory_size, sizeof(TData));`
			`const int numElements = iDivDown(memory_size, sizeof(TData));`

			`// Clean output buffer before current test`
			`checkCudaErrors(cudaMemset(d_odata, 0, memory_size));`
			`// Run test`
			`checkCudaErrors(cudaDeviceSynchronize());`
			`sdkResetTimer(&hTimer);`
			`sdkStartTimer(&hTimer);`

			`for (int i = 0; i < NUM_ITERATIONS; i++) {`
			`testKernel<TData>`
			`<<<64, 256>>>((TData )d_odata, (TData )d_idata, numElements);`
			`getLastCudaError("testKernel() execution failed\n");`
			`}`

			`checkCudaErrors(cudaDeviceSynchronize());`
			`sdkStopTimer(&hTimer);`
			`double gpuTime = sdkGetTimerValue(&hTimer) / NUM_ITERATIONS;`
			`printf("Avg. time: %f ms / Copy throughput: %f GB/s.\n", gpuTime,`
			`(double)totalMemSizeAligned / (gpuTime * 0.001 * 1073741824.0));`

			`// Read back GPU results and run validation`
			`checkCudaErrors(`
			`cudaMemcpy(h_odataGPU, d_odata, memory_size, cudaMemcpyDeviceToHost));`
			`int flag = testCPU((TData )h_odataGPU, (TData )h_idataCPU, numElements,`
			`packedElementSize);`

			`printf(flag ? "\tTEST OK\n" : "\tTEST FAILURE\n");`

			`return !flag;`
			`}`

			`int main(int argc, char **argv) {`
			`int i, nTotalFailures = 0;`

			`int devID;`
			`cudaDeviceProp deviceProp;`
			`printf("[%s] - Starting...\n", argv[0]);`

			`// find first CUDA device`
			`devID = findCudaDevice(argc, (const char **)argv);`

			`// get number of SMs on this GPU`
			`checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));`
			`printf("[%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n", deviceProp.name,`
			`deviceProp.multiProcessorCount,`
			`_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),`
			`_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *`
			`deviceProp.multiProcessorCount);`

			`// Anything that is less than 192 Cores will have a scaled down workload`
			`float scale_factor =`
			`max((192.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *`
			`(float)deviceProp.multiProcessorCount)),`
			`1.0f);`

			`int MemorySize = (int)(MEM_SIZE / scale_factor) &`
			`0xffffff00; // force multiple of 256 bytes`

			`printf("> Compute scaling value = %4.2f\n", scale_factor);`
			`printf("> Memory Size = %d\n", MemorySize);`

			`sdkCreateTimer(&hTimer);`

			`printf("Allocating memory...\n");`
			`h_idataCPU = (unsigned char *)malloc(MemorySize);`
			`h_odataGPU = (unsigned char *)malloc(MemorySize);`
			`checkCudaErrors(cudaMalloc((void **)&d_idata, MemorySize));`
			`checkCudaErrors(cudaMalloc((void **)&d_odata, MemorySize));`

			`printf("Generating host input data array...\n");`

			`for (i = 0; i < MemorySize; i++) {`
			`h_idataCPU[i] = (i & 0xFF) + 1;`
			`}`

			`printf("Uploading input data to GPU memory...\n");`
			`checkCudaErrors(`
			`cudaMemcpy(d_idata, h_idataCPU, MemorySize, cudaMemcpyHostToDevice));`

			`printf("Testing misaligned types...\n");`
			`printf("uint8...\n");`
			`nTotalFailures += runTest<uint8>(1, MemorySize);`

			`printf("uint16...\n");`
			`nTotalFailures += runTest<uint16>(2, MemorySize);`

			`printf("RGBA8_misaligned...\n");`
			`nTotalFailures += runTest<RGBA8_misaligned>(4, MemorySize);`

			`printf("LA32_misaligned...\n");`
			`nTotalFailures += runTest<LA32_misaligned>(8, MemorySize);`

			`printf("RGB32_misaligned...\n");`
			`nTotalFailures += runTest<RGB32_misaligned>(12, MemorySize);`

			`printf("RGBA32_misaligned...\n");`
			`nTotalFailures += runTest<RGBA32_misaligned>(16, MemorySize);`

			`printf("Testing aligned types...\n");`
			`printf("RGBA8...\n");`
			`nTotalFailures += runTest<RGBA8>(4, MemorySize);`

			`printf("I32...\n");`
			`nTotalFailures += runTest<I32>(4, MemorySize);`

			`printf("LA32...\n");`
			`nTotalFailures += runTest<LA32>(8, MemorySize);`

			`printf("RGB32...\n");`
			`nTotalFailures += runTest<RGB32>(12, MemorySize);`

			`printf("RGBA32...\n");`
			`nTotalFailures += runTest<RGBA32>(16, MemorySize);`

			`printf("RGBA32_2...\n");`
			`nTotalFailures += runTest<RGBA32_2>(32, MemorySize);`

			`printf("\n[alignedTypes] -> Test Results: %d Failures\n", nTotalFailures);`

			`printf("Shutting down...\n");`
			`checkCudaErrors(cudaFree(d_idata));`
			`checkCudaErrors(cudaFree(d_odata));`
			`free(h_odataGPU);`
			`free(h_idataCPU);`

			`sdkDeleteTimer(&hTimer);`

			`if (nTotalFailures != 0) {`
			`printf("Test failed!\n");`
			`exit(EXIT_FAILURE);`
			`}`

			`printf("Test passed\n");`
			`exit(EXIT_SUCCESS);`
			`}`