// cuda-samples/Samples/simpleGLES_screen/simpleGLES_screen.cu

/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
   This example demonstrates how to use the CUDA C bindings to OpenGL ES to
   dynamically modify a vertex buffer using a CUDA C kernel.

   The steps are:
   1. Create an empty vertex buffer object (VBO)
   2. Register the VBO with CUDA C
   3. Map the VBO for writing from CUDA C
   4. Run CUDA C kernel to modify the vertex positions
   5. Unmap the VBO
   6. Render the results using OpenGL ES

   Host code
 */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#include <stdarg.h>
#include <unistd.h>
#include <screen/screen.h>

#include "graphics_interface.c"

// includes, cuda
#include <cuda_runtime.h>
#include <cuda_gl_interop.h>

// Utilities and timing functions
#include <helper_functions.h>    // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include <helper_cuda.h>         // helper functions for CUDA error check

#include <vector_types.h>

// Tolerance arguments handed to sdkCompareBin2BinFloat() in runAutoTest().
#define MAX_EPSILON_ERROR 0.0f
#define THRESHOLD 0.0f
#define REFRESH_DELAY 1  // ms

// GUI interaction mode identifiers (values for gui_mode).
#define GUI_IDLE 0x100
#define GUI_ROTATE 0x101
#define GUI_TRANSLATE 0x102

// Current GUI mode; not assigned anywhere in this file.
int gui_mode;

////////////////////////////////////////////////////////////////////////////////
// Default configuration
// These defaults can be overridden from the command line via the "width",
// "height" and "dispno" flags parsed in runTest().
unsigned int window_width = 512;
unsigned int window_height = 512;
unsigned int dispno = 0;

// constants
// Mesh dimensions in vertices; the vertex buffer stores one float4 position
// per vertex (mesh_width * mesh_height entries).
const unsigned int mesh_width = 256;
const unsigned int mesh_height = 256;

// OpenGL ES variables and interop with CUDA C
// mesh_vao / mesh_vbo are generated in InitGraphicsState().
GLuint mesh_vao, mesh_vbo;
// CUDA registration handle for mesh_vbo (see cudaGraphicsGLRegisterBuffer).
struct cudaGraphicsResource *cuda_vbo_resource;
// Plain cudaMalloc'ed buffer used instead of the GL VBO when runTest() is
// given a reference file; NULL at all other times.
void *d_vbo_buffer = NULL;

// Animation phase passed to the kernel as `time`; advanced once per frame in
// display_thisframe().
float g_fAnim = 0.0;

// UI / mouse controls
int mouse_old_x, mouse_old_y;
int mouse_buttons = 0;
float rotate_x = 0.0, rotate_y = 0.0;
float translate_z = -3.0;

// Stopwatch used by display_thisframe() and computeFPS() for frame timing.
StopWatchInterface *timer = NULL;

// Frame statistics
int frame;
int fpsCount = 0;  // FPS count for averaging
int fpsLimit = 1;  // FPS limit for sampling
int g_Index = 0;
float avgFPS = 0.0f;
unsigned int frameCount = 0;
// Number of failed checks; determines the process exit status in main().
unsigned int g_TotalErrors = 0;

// The default number of seconds after which the test will end.
#define TIME_LIMIT 10.0  // 10 secs

// Flag indicating it is time to shut down
// Set by the close/key callbacks and by the time-limit check in runTest().
static GLboolean shutdown = GL_FALSE;

// Window-close callback: flags the application for shutdown.
static void closeCB_app(void) {
  shutdown = GL_TRUE;
}

// Key callback: a press of 'q', 'Q' or Escape flags the application for
// shutdown. Key releases (state == 0) are ignored.
static void keyCB_app(char key, int state) {
  const bool is_press = (state != 0);
  const bool is_quit_key =
      (key == 'q') || (key == 'Q') || (key == NvGlDemoKeyCode_Escape);

  if (is_press && is_quit_key) {
    shutdown = GL_TRUE;
  }
}

// Auto-Verification Code
// Not read or written anywhere else in this file.
bool g_bQAReadback = false;

// Copies of main()'s argc/argv, stored so that other functions can reach the
// command line; pArgv[0] is passed to sdkFindFilePath() when locating data
// and shader files.
int *pArgc = NULL;
char **pArgv = NULL;

// Larger of two values. Every use of an argument is parenthesized so that
// low-precedence argument expressions (e.g. a ternary) still group correctly.
// Arguments may be evaluated twice, so avoid side effects.
#define MAX(a, b) (((a) > (b)) ? (a) : (b))

////////////////////////////////////////////////////////////////////////////////
// declaration, forward

// CUDA functionality
// Maps the registered VBO, runs the kernel on it and unmaps it again.
void runCuda(struct cudaGraphicsResource **vbo_resource);
// Runs the kernel on d_vbo_buffer, dumps the result and compares it with
// ref_file.
void runAutoTest(int devID, char **argv, char *ref_file);
// Maps the VBO through OpenGL ES and optionally writes a regression file.
void checkResultCuda(int argc, char **argv, const GLuint &vbo);

// Sample name used for the window title and the start/finish log lines.
const char *sSDKsample = "simpleGLES on Screen (VBO)";

// Updates the frame counters, refreshes the averaged FPS once every
// `fpsLimit` frames and writes the current value into the window title.
void computeFPS() {
  frameCount++;
  fpsCount++;

  if (fpsCount == fpsLimit) {
    // Average frame time in milliseconds as reported by the stopwatch.
    const float avg_ms = sdkGetAverageTimerValue(&timer);

    // A zero (or negative) average would turn the division into inf/NaN and
    // make the int conversion below undefined; keep the previous FPS value
    // in that case.
    if (avg_ms > 0.0f) {
      avgFPS = 1.f / (avg_ms / 1000.f);
    }

    fpsCount = 0;
    // Re-sample roughly once per second, but at least every frame.
    fpsLimit = (avgFPS > 1.f) ? (int)avgFPS : 1;

    sdkResetTimer(&timer);
  }

  char fps[256];
  // snprintf bounds the write to the title buffer.
  snprintf(fps, sizeof(fps),
           "Cuda/OpenGL ES Interop (VBO): %3.1f fps (Max 1000 fps)", avgFPS);
  graphics_set_windowtitle(fps);
}

///////////////////////////////////////////////////////////////////////////////
//! Simple kernel to modify vertex positions in sine wave pattern
//! Expects a 2D launch providing at least one thread per mesh vertex; threads
//! that fall outside the width x height mesh return without writing.
//! @param pos     output vertices in global memory, width * height float4
//! @param width   mesh width in vertices
//! @param height  mesh height in vertices
//! @param time    phase offset added to the sine / cosine arguments
///////////////////////////////////////////////////////////////////////////////
__global__ void simple_vbo_kernel(float4 *pos, unsigned int width,
                                  unsigned int height, float time) {
  const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

  // the grid is rounded up to whole blocks, so guard the tail threads
  if (x >= width || y >= height) {
    return;
  }

  // calculate uv coordinates, remapped from [0, 1) to [-1, 1)
  float u = x / (float)width;
  float v = y / (float)height;
  u = u * 2.0f - 1.0f;
  v = v * 2.0f - 1.0f;

  // calculate simple sine wave pattern
  const float freq = 4.0f;
  const float w = sinf(u * freq + time) * cosf(v * freq + time) * 0.5f;

  // write output vertex (row-major, one float4 per vertex)
  pos[y * width + x] = make_float4(u, w, v, 1.0f);
}

///////////////////////////////////////////////////////////////////////////////
//! Launch simple_vbo_kernel over the whole mesh using 8x8 thread blocks
//! @param pos          device pointer to mesh_width * mesh_height float4
//! @param mesh_width   mesh width in vertices
//! @param mesh_height  mesh height in vertices
//! @param time         animation phase forwarded to the kernel
//! Terminates the process if the launch itself is rejected.
///////////////////////////////////////////////////////////////////////////////
void launch_kernel(float4 *pos, unsigned int mesh_width,
                   unsigned int mesh_height, float time) {
  // an empty mesh needs no work, and a zero-sized grid is an invalid launch
  if (mesh_width == 0 || mesh_height == 0) {
    return;
  }

  // execute the kernel; ceil-divide so sizes that are not a multiple of the
  // block size are still fully covered
  dim3 block(8, 8, 1);
  dim3 grid((mesh_width + block.x - 1) / block.x,
            (mesh_height + block.y - 1) / block.y, 1);
  simple_vbo_kernel<<<grid, block>>>(pos, mesh_width, mesh_height, time);

  // a kernel launch returns no status; configuration errors must be fetched
  // explicitly
  const cudaError_t launch_status = cudaGetLastError();
  if (launch_status != cudaSuccess) {
    fprintf(stderr, "simple_vbo_kernel launch failed: %s\n",
            cudaGetErrorString(launch_status));
    exit(EXIT_FAILURE);
  }
}

////////////////////////////////////////////////////////////////////////////////
//! Run the Cuda part of the computation
//! Maps the registered OpenGL ES buffer into the CUDA address space, runs the
//! vertex kernel on it and unmaps it again so that GL can use the buffer.
//! @param vbo_resource  address of the CUDA graphics resource for the VBO
////////////////////////////////////////////////////////////////////////////////
void runCuda(struct cudaGraphicsResource **vbo_resource) {
  // map OpenGL buffer object for writing from CUDA
  float4 *dptr = NULL;
  size_t num_bytes = 0;
  checkCudaErrors(cudaGraphicsMapResources(1, vbo_resource, 0));
  checkCudaErrors(cudaGraphicsResourceGetMappedPointer(
      (void **)&dptr, &num_bytes, *vbo_resource));

  launch_kernel(dptr, mesh_width, mesh_height, g_fAnim);

  // unmap buffer object
  checkCudaErrors(cudaGraphicsUnmapResources(1, vbo_resource, 0));
}

#ifndef FOPEN
#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))
#endif

// Writes `bytes` bytes starting at `data` to the binary file `filename`
// (created or truncated). Failures to open or write the file are reported on
// stderr and the function returns without touching an invalid FILE handle.
void sdkDumpBin2(void *data, unsigned int bytes, const char *filename) {
  printf("sdkDumpBin: <%s>\n", filename);

  FILE *fp = NULL;
  FOPEN(fp, filename, "wb");
  if (fp == NULL) {
    fprintf(stderr, "sdkDumpBin2: unable to open <%s> for writing\n",
            filename);
    return;
  }

  // fwrite with a single item returns 1 on success; an empty dump is valid
  if (bytes > 0 && fwrite(data, bytes, 1, fp) != 1) {
    fprintf(stderr, "sdkDumpBin2: failed to write %u bytes to <%s>\n", bytes,
            filename);
  }

  fflush(fp);
  fclose(fp);
}

////////////////////////////////////////////////////////////////////////////////
//! Run the kernel without OpenGL and compare its output with a reference file
//! The kernel writes into d_vbo_buffer; the start of that buffer is copied to
//! the host, dumped to "simpleGLES_screen.bin" and compared with the
//! reference. A mismatch increments g_TotalErrors.
//! @param devID     CUDA device id (not used by this function)
//! @param argv      command line; argv[0] is used to locate the reference file
//! @param ref_file  name of the reference file to compare against
////////////////////////////////////////////////////////////////////////////////
void runAutoTest(int devID, char **argv, char *ref_file) {
  (void)devID;

  // NOTE(review): only mesh_width * mesh_height floats are dumped, i.e. a
  // quarter of the float4 vertex buffer - presumably to match the size of the
  // shipped reference file; confirm before changing.
  const size_t dump_bytes = mesh_width * mesh_height * sizeof(float);

  void *imageData = malloc(dump_bytes);
  if (imageData == NULL) {
    fprintf(stderr, "%s: unable to allocate %lu bytes of host memory\n",
            __FUNCTION__, (unsigned long)dump_bytes);
    g_TotalErrors++;
    return;
  }

  // execute the kernel
  launch_kernel((float4 *)d_vbo_buffer, mesh_width, mesh_height, g_fAnim);

  checkCudaErrors(cudaDeviceSynchronize());
  getLastCudaError("launch_kernel failed");

  checkCudaErrors(
      cudaMemcpy(imageData, d_vbo_buffer, dump_bytes, cudaMemcpyDeviceToHost));

  sdkDumpBin2(imageData, (unsigned int)dump_bytes, "simpleGLES_screen.bin");
  free(imageData);

  // sdkFindFilePath() hands back a heap string (or NULL) that we must free
  char *reference_file = sdkFindFilePath(ref_file, argv[0]);

  if (reference_file == NULL) {
    // the comparison is skipped, as before, but no longer silently
    fprintf(stderr, "%s: reference file <%s> not found, skipping compare\n",
            __FUNCTION__, ref_file);
  } else if (!sdkCompareBin2BinFloat("simpleGLES_screen.bin", reference_file,
                                     mesh_width * mesh_height * sizeof(float),
                                     MAX_EPSILON_ERROR, THRESHOLD, pArgv[0])) {
    g_TotalErrors++;
  }

  free(reference_file);
}

////////////////////////////////////////////////////////////////////////////////
//! Per-frame callback: regenerates the mesh with CUDA, draws it as points and
//! updates the FPS statistics
//! @param time_delta  amount added to the animation phase after this frame
////////////////////////////////////////////////////////////////////////////////
void display_thisframe(float time_delta) {
  sdkStartTimer(&timer);

  // let the CUDA kernel rewrite the vertex positions stored in the VBO
  runCuda(&cuda_vbo_resource);

  // one point is drawn per mesh vertex
  const unsigned int vertex_count = mesh_width * mesh_height;

  glClear(GL_DEPTH_BUFFER_BIT | GL_COLOR_BUFFER_BIT);
  glDrawArrays(GL_POINTS, 0, vertex_count);

  // wait for GL to finish before the stopwatch is stopped
  glFinish();

  g_fAnim = g_fAnim + time_delta;

  sdkStopTimer(&timer);
  computeFPS();
}

////////////////////////////////////////////////////////////////////////////////
//! Check if the result is correct or write data to file for external
//! regression testing
//! Does nothing while d_vbo_buffer is set (the kernel then wrote to a plain
//! CUDA buffer rather than to the GL VBO). Otherwise the VBO is temporarily
//! unregistered from CUDA, mapped for reading through OpenGL ES, optionally
//! written to "./data/regression.dat", and registered with CUDA again.
//! @param argc  command line argument count
//! @param argv  command line; the "regression" flag enables the file dump
//! @param vbo   OpenGL ES buffer object holding the vertex data
////////////////////////////////////////////////////////////////////////////////
void checkResultCuda(int argc, char **argv, const GLuint &vbo) {
  if (d_vbo_buffer) {
    return;
  }

  printf("%s: Mapping result buffer from OpenGL ES\n", __FUNCTION__);

  checkCudaErrors(cudaGraphicsUnregisterResource(cuda_vbo_resource));

  // map buffer object; glMapBufferRange takes a bitfield of GL_MAP_*_BIT
  // flags, not the GL_READ_ONLY enum
  glBindBuffer(GL_ARRAY_BUFFER, vbo);
  float *data = (float *)glMapBufferRange(
      GL_ARRAY_BUFFER, 0, mesh_width * mesh_height * 4 * sizeof(float),
      GL_MAP_READ_BIT);

  if (data == NULL) {
    fprintf(stderr, "%s: glMapBufferRange failed.\n", __FUNCTION__);
    fflush(stderr);
  } else {
    // check result
    if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
      // write file for regression test
      // NOTE(review): writes 3 floats per vertex although the buffer holds
      // 4 (float4) - confirm the intended regression file layout.
      sdkWriteFile<float>("./data/regression.dat", data,
                          mesh_width * mesh_height * 3, 0.0, false);
    }

    // unmap GL buffer object
    if (!glUnmapBuffer(GL_ARRAY_BUFFER)) {
      fprintf(stderr, "Unmap buffer failed.\n");
      fflush(stderr);
    }
  }

  // hand the buffer back to CUDA regardless of the mapping outcome
  checkCudaErrors(cudaGraphicsGLRegisterBuffer(
      &cuda_vbo_resource, vbo, cudaGraphicsMapFlagsWriteDiscard));

  CHECK_GLERROR();
}

// GLSL program object used to draw the mesh; created by ShaderCreate() and
// made current in InitGraphicsState(). 0 until then.
GLuint mesh_shader = 0;

// Reads the GLSL source in `filename`, compiles it as a shader of type
// `shaderType` and attaches the result to the program `new_shaderprogram`.
// The shader object is flagged for deletion right after attaching.
// Any failure (file missing or unreadable, out of memory, compile error)
// prints a diagnostic and terminates the process.
void readAndCompileShaderFromGLSLFile(GLuint new_shaderprogram,
                                      const char *filename, GLenum shaderType) {
  FILE *file = fopen(filename, "rb");  // open shader text file
  if (!file) {
    error_exit("Filename %s does not exist\n", filename);
    exit(EXIT_FAILURE);  // in case error_exit() returns
  }

  // get the size of the file and read it
  long file_len = -1;
  if (fseek(file, 0, SEEK_END) == 0) {
    file_len = ftell(file);
  }
  if (file_len < 0 || fseek(file, 0, SEEK_SET) != 0) {
    printf("Unable to determine the size of %s\n", filename);
    fclose(file);
    exit(EXIT_FAILURE);
  }

  // zero-filled, with one extra byte so the source is NUL-terminated
  char *data = (char *)calloc((size_t)file_len + 1, sizeof(char));
  if (!data) {
    printf("Out of memory while reading %s\n", filename);
    fclose(file);
    exit(EXIT_FAILURE);
  }

  const size_t res = fread(data, 1, (size_t)file_len, file);
  fclose(file);
  if (res != (size_t)file_len) {
    printf("Warning: short read on %s (%lu of %ld bytes)\n", filename,
           (unsigned long)res, file_len);
  }

  // hand GL only the bytes that were actually read
  GLint size = (GLint)res;

  GLuint shader = glCreateShader(shaderType);
  glShaderSource(shader, 1, (const GLchar **)&data, &size);
  glCompileShader(shader);

  CHECK_GLERROR();
  GLint compile_success = 0;
  glGetShaderiv(shader, GL_COMPILE_STATUS, &compile_success);
  CHECK_GLERROR();

  if (compile_success == GL_FALSE) {
    printf("Compilation of %s failed!\n Reason:\n", filename);

    GLint maxLength = 0;
    glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &maxLength);

    // heap buffer instead of a variable-length array: the reported length
    // may be zero
    if (maxLength > 0) {
      char *errorLog = (char *)malloc((size_t)maxLength);
      if (errorLog) {
        glGetShaderInfoLog(shader, maxLength, NULL, errorLog);
        printf("%s", errorLog);
        free(errorLog);
      }
    }

    glDeleteShader(shader);
    free(data);
    exit(1);
  }

  glAttachShader(new_shaderprogram, shader);
  glDeleteShader(shader);

  free(data);
}

// Creates a GLSL program, compiles and attaches the given vertex and
// fragment shader files (either may be NULL to skip that stage), links the
// program and returns its handle. A link failure prints the program info log
// and terminates the process.
GLuint ShaderCreate(const char *vshader_filename,
                    const char *fshader_filename) {
  // the file names are optional, so never hand a NULL pointer to "%s"
  const char *vname = vshader_filename ? vshader_filename : "(none)";
  const char *fname = fshader_filename ? fshader_filename : "(none)";

  printf("Loading GLSL shaders %s %s\n", vname, fname);

  GLuint new_shaderprogram = glCreateProgram();

  CHECK_GLERROR();
  if (vshader_filename) {
    readAndCompileShaderFromGLSLFile(new_shaderprogram, vshader_filename,
                                     GL_VERTEX_SHADER);
  }

  CHECK_GLERROR();
  if (fshader_filename) {
    readAndCompileShaderFromGLSLFile(new_shaderprogram, fshader_filename,
                                     GL_FRAGMENT_SHADER);
  }

  CHECK_GLERROR();

  glLinkProgram(new_shaderprogram);

  CHECK_GLERROR();
  GLint link_success = GL_FALSE;
  glGetProgramiv(new_shaderprogram, GL_LINK_STATUS, &link_success);

  if (link_success == GL_FALSE) {
    printf("Linking of %s with %s failed!\n Reason:\n", vname, fname);

    // a program object must be queried with the glGetProgram* entry points;
    // the glGetShader* variants reject it
    GLint maxLength = 0;
    glGetProgramiv(new_shaderprogram, GL_INFO_LOG_LENGTH, &maxLength);

    // heap buffer instead of a variable-length array: the reported length
    // may be zero
    if (maxLength > 0) {
      char *errorLog = (char *)malloc((size_t)maxLength);
      if (errorLog) {
        glGetProgramInfoLog(new_shaderprogram, maxLength, NULL, errorLog);
        printf("%s", errorLog);
        free(errorLog);
      }
    }

    exit(EXIT_FAILURE);
  }

  return new_shaderprogram;
}

//===========================================================================
// InitGraphicsState() - set up the OpenGL ES objects used for rendering
//
// Prints the GL implementation strings, creates the VAO and the vertex
// buffer (one float4 per mesh vertex), registers the buffer with CUDA and
// builds and activates the mesh shader program. Exits if a shader file
// cannot be located.
//===========================================================================
static void InitGraphicsState(void) {
  const char *version_str = (const char *)glGetString(GL_VERSION);
  const char *vendor_str = (const char *)glGetString(GL_VENDOR);
  const char *renderer_str = (const char *)glGetString(GL_RENDERER);

  printf("Version: %s\n", version_str);
  printf("Vendor: %s\n", vendor_str);
  printf("Renderer: %s\n", renderer_str);

  // RENDERING SETUP (OpenGL ES or OpenGL Core Profile!)
  // vertex array object that records the attribute layout below
  glGenVertexArrays(1, &mesh_vao);
  glBindVertexArray(mesh_vao);

  // vertex buffer: storage is allocated but left uninitialized
  glGenBuffers(1, &mesh_vbo);
  glBindBuffer(GL_ARRAY_BUFFER, mesh_vbo);

  const unsigned int vbo_bytes = mesh_width * mesh_height * 4 * sizeof(float);
  glBufferData(GL_ARRAY_BUFFER, vbo_bytes, NULL, GL_DYNAMIC_DRAW);

  // attribute 0: four tightly packed floats per vertex
  glVertexAttribPointer((GLuint)0, 4, GL_FLOAT, GL_FALSE, 0, 0);
  glEnableVertexAttribArray(0);

  // make the buffer accessible from CUDA
  checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, mesh_vbo,
                                               cudaGraphicsMapFlagsNone));

  // GLSL stuff: locate the shader sources relative to the executable
  char *vert_path = sdkFindFilePath("mesh.vert.glsl", pArgv[0]);
  char *frag_path = sdkFindFilePath("mesh.frag.glsl", pArgv[0]);

  if (!vert_path || !frag_path) {
    printf("Error finding shader file\n");
    exit(EXIT_FAILURE);
  }

  mesh_shader = ShaderCreate(vert_path, frag_path);
  CHECK_GLERROR();

  free(vert_path);
  free(frag_path);

  glUseProgram(mesh_shader);
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
//! With a reference file the kernel is run once on a plain CUDA buffer and
//! its output is compared against that file (no window is opened). Without
//! one, a QNX Screen window with an OpenGL ES context is created and the
//! animated mesh is rendered until the time limit expires or the user quits.
//! @param argc      command line argument count
//! @param argv      command line arguments
//! @param ref_file  reference file name, or NULL for interactive rendering
//! @return always true; failures are counted in g_TotalErrors or abort
////////////////////////////////////////////////////////////////////////////////
bool runTest(int argc, char **argv, char *ref_file) {
  // command line mode only
  if (ref_file != NULL) {
    // This will pick the best possible CUDA capable device
    // int devID = findCudaDevice(argc, (const char **)argv);
#if defined(__aarch64__) || defined(__arm__)
    // find iGPU on the system which is compute capable which will perform
    // GLES-CUDA interop
    int devID = findIntegratedGPU();
#else
    // use command-line specified CUDA device, otherwise use device with highest
    // Gflops/s
    int devID = findCudaDevice(argc, (const char **)argv);
#endif

    // create VBO
    checkCudaErrors(cudaMalloc((void **)&d_vbo_buffer,
                               mesh_width * mesh_height * 4 * sizeof(float)));

    // run the cuda part
    runAutoTest(devID, argv, ref_file);

    // check result of Cuda step (a no-op while d_vbo_buffer is set)
    checkResultCuda(argc, argv, mesh_vbo);

    checkCudaErrors(cudaFree(d_vbo_buffer));
    d_vbo_buffer = NULL;
  } else {
    double endTime = TIME_LIMIT;

    // this would use command-line specified CUDA device, note that CUDA
    // defaults to highest Gflops/s device
    if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
      error_exit("Device setting not yet implemented!\n");
    }

    // display selection
    if (checkCmdLineFlag(argc, (const char **)argv, "dispno")) {
      dispno = getCmdLineArgumentInt(argc, (const char **)argv, "dispno");
    }

    // Window width
    if (checkCmdLineFlag(argc, (const char **)argv, "width")) {
      window_width = getCmdLineArgumentInt(argc, (const char **)argv, "width");
    }

    // Window Height
    if (checkCmdLineFlag(argc, (const char **)argv, "height")) {
      window_height =
          getCmdLineArgumentInt(argc, (const char **)argv, "height");
    }

    // Determine how long to run for in secs: default is 10s
    if (checkCmdLineFlag(argc, (const char **)argv, "runtime")) {
      endTime = getCmdLineArgumentInt(argc, (const char **)argv, "runtime");
    }

    SetCloseCB(closeCB_app);
    SetKeyCB(keyCB_app);

    // create QNX screen window and set up associated OpenGL ES context
    graphics_setup_window(0, 0, window_width, window_height, sSDKsample,
                          dispno);

#if defined(__aarch64__) || defined(__arm__)
    // find iGPU on the system which is compute capable which will perform
    // GLES-CUDA interop
    int devID = findIntegratedGPU();
#else
    // use command-line specified CUDA device, otherwise use device with highest
    // Gflops/s
    int devID = findCudaDevice(argc, (const char **)argv);
#endif
    (void)devID;  // the call is made for device selection; the id is unused

    InitGraphicsState();  // set up GLES stuff

    glClearColor(0, 0.5, 1, 1);  // blue-ish background
    glClear(GL_COLOR_BUFFER_BIT);

    graphics_swap_buffers();

    // The stopwatch used by display_thisframe()/computeFPS() is created
    // nowhere else; without it all frame timing runs on a NULL timer.
    sdkCreateTimer(&timer);

    struct timeval begin, end;
    gettimeofday(&begin, NULL);

    // Print runtime
    // NOTE(review): a negative runtime prints "running forever" but still
    // falls back to TIME_LIMIT - confirm which behavior is intended.
    if (endTime < 0.0) {
      endTime = TIME_LIMIT;
      printf(" running forever...\n");
    } else {
      printf(" running for %f seconds...\n", endTime);
    }

    while (!shutdown) {
      display_thisframe(0.010);
      usleep(1000);
      graphics_swap_buffers();
      CheckEvents();

      gettimeofday(&end, NULL);
      double elapsed = (end.tv_sec - begin.tv_sec) +
                       ((end.tv_usec - begin.tv_usec) / 1000000.0);

      // Check whether time limit has been exceeded
      if (!shutdown) shutdown = (endTime <= elapsed);
    }

    sdkDeleteTimer(&timer);

    // NOTE: Before destroying OpenGL ES context, must unregister all shared
    // resources from CUDA !
    checkCudaErrors(cudaGraphicsUnregisterResource(cuda_vbo_resource));

    graphics_close_window();  // close window and destroy OpenGL ES context
  }

  return true;
}

////////////////////////////////////////////////////////////////////////////////
// Program entry point
//
// Publishes argc/argv through pArgc/pArgv, picks up an optional reference
// file name from the "file" command line flag, runs the sample and exits with
// EXIT_SUCCESS only if no errors were counted.
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
  pArgc = &argc;
  pArgv = argv;

  char *ref_file = NULL;

#if defined(__linux__)
  setenv("DISPLAY", ":0", 0);
#endif

  printf("%s starting...\n", sSDKsample);

  // In this mode, we run without OpenGL and see if VBO is generated
  // correctly
  if (argc > 1 && checkCmdLineFlag(argc, (const char **)argv, "file")) {
    getCmdLineArgumentString(argc, (const char **)argv, "file",
                             (char **)&ref_file);
  }

  printf("\n");

  runTest(argc, argv, ref_file);

  const bool passed = (g_TotalErrors == 0);
  printf("%s completed, returned %s\n", sSDKsample, passed ? "OK" : "ERROR!");

  exit(passed ? EXIT_SUCCESS : EXIT_FAILURE);
}
add and update samples for CUDA 11.5 2021-10-21 19:04:49 +08:00			`/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.`
			`*`
			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions`
			`* are met:`
			`* * Redistributions of source code must retain the above copyright`
			`* notice, this list of conditions and the following disclaimer.`
			`* * Redistributions in binary form must reproduce the above copyright`
			`* notice, this list of conditions and the following disclaimer in the`
			`* documentation and/or other materials provided with the distribution.`
			`* * Neither the name of NVIDIA CORPORATION nor the names of its`
			`* contributors may be used to endorse or promote products derived`
			`* from this software without specific prior written permission.`
			`*`
			* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
			`* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR`
			`* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR`
			`* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,`
			`* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,`
			`* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR`
			`* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY`
			`* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`*/`

			`/*`
			`This example demonstrates how to use the CUDA C bindings to OpenGL ES to`
			`dynamically modify a vertex buffer using a CUDA C kernel.`

			`The steps are:`
			`1. Create an empty vertex buffer object (VBO)`
			`2. Register the VBO with CUDA C`
			`3. Map the VBO for writing from CUDA C`
			`4. Run CUDA C kernel to modify the vertex positions`
			`5. Unmap the VBO`
			`6. Render the results using OpenGL ES`

			`Host code`
			`*/`

			`#include <stdlib.h>`
			`#include <stdio.h>`
			`#include <string.h>`
			`#include <math.h>`

			`#include <stdarg.h>`
			`#include <unistd.h>`
			`#include <screen/screen.h>`

			`#include "graphics_interface.c"`

			`// includes, cuda`
			`#include <cuda_runtime.h>`
			`#include <cuda_gl_interop.h>`

			`// Utilities and timing functions`
			`#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h`

			`// CUDA helper functions`
			`#include <helper_cuda.h> // helper functions for CUDA error check`

			`#include <vector_types.h>`

			`#define MAX_EPSILON_ERROR 0.0f`
			`#define THRESHOLD 0.0f`
			`#define REFRESH_DELAY 1 // ms`

			`#define GUI_IDLE 0x100`
			`#define GUI_ROTATE 0x101`
			`#define GUI_TRANSLATE 0x102`

			`int gui_mode;`

			`////////////////////////////////////////////////////////////////////////////////`
			`// Default configuration`
			`unsigned int window_width = 512;`
			`unsigned int window_height = 512;`
			`unsigned int dispno = 0;`

			`// constants`
			`const unsigned int mesh_width = 256;`
			`const unsigned int mesh_height = 256;`

			`// OpenGL ES variables and interop with CUDA C`
			`GLuint mesh_vao, mesh_vbo;`
			`struct cudaGraphicsResource *cuda_vbo_resource;`
			`void *d_vbo_buffer = NULL;`

			`float g_fAnim = 0.0;`

			`// UI / mouse controls`
			`int mouse_old_x, mouse_old_y;`
			`int mouse_buttons = 0;`
			`float rotate_x = 0.0, rotate_y = 0.0;`
			`float translate_z = -3.0;`

			`StopWatchInterface *timer = NULL;`

			`// Frame statistics`
			`int frame;`
			`int fpsCount = 0; // FPS count for averaging`
			`int fpsLimit = 1; // FPS limit for sampling`
			`int g_Index = 0;`
			`float avgFPS = 0.0f;`
			`unsigned int frameCount = 0;`
			`unsigned int g_TotalErrors = 0;`

			`// The default number of seconds after which the test will end.`
			`#define TIME_LIMIT 10.0 // 10 secs`

			`// Flag indicating it is time to shut down`
			`static GLboolean shutdown = GL_FALSE;`

			`// Callback to close window`
			`static void closeCB_app(void) { shutdown = GL_TRUE; }`

			`// Callback to handle key presses`
			`static void keyCB_app(char key, int state) {`
			`// Ignoring releases`
			`if (!state) return;`

			`if ((key == 'q') \|\| (key == 'Q') \|\| (key == NvGlDemoKeyCode_Escape))`
			`shutdown = GL_TRUE;`
			`}`

			`// Auto-Verification Code`
			`bool g_bQAReadback = false;`

			`int *pArgc = NULL;`
			`char **pArgv = NULL;`

			`#define MAX(a, b) ((a > b) ? a : b)`

			`////////////////////////////////////////////////////////////////////////////////`
			`// declaration, forward`

			`// CUDA functionality`
			`void runCuda(struct cudaGraphicsResource **vbo_resource);`
			`void runAutoTest(int devID, char **argv, char *ref_file);`
			`void checkResultCuda(int argc, char **argv, const GLuint &vbo);`

			`const char *sSDKsample = "simpleGLES on Screen (VBO)";`

			`void computeFPS() {`
			`frameCount++;`
			`fpsCount++;`

			`if (fpsCount == fpsLimit) {`
			`avgFPS = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f);`
			`fpsCount = 0;`
			`fpsLimit = (int)MAX(avgFPS, 1.f);`

			`sdkResetTimer(&timer);`
			`}`

			`char fps[256];`
			`sprintf(fps, "Cuda/OpenGL ES Interop (VBO): %3.1f fps (Max 1000 fps)",`
			`avgFPS);`
			`graphics_set_windowtitle(fps);`
			`}`

			`///////////////////////////////////////////////////////////////////////////////`
			`//! Simple kernel to modify vertex positions in sine wave pattern`
			`//! @param data data in global memory`
			`///////////////////////////////////////////////////////////////////////////////`
			`__global__ void simple_vbo_kernel(float4 *pos, unsigned int width,`
			`unsigned int height, float time) {`
			`unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;`
			`unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;`

			`// calculate uv coordinates`
			`float u = x / (float)width;`
			`float v = y / (float)height;`
			`u = u * 2.0f - 1.0f;`
			`v = v * 2.0f - 1.0f;`

			`// calculate simple sine wave pattern`
			`float freq = 4.0f;`
			`float w = sinf(u * freq + time) * cosf(v * freq + time) * 0.5f;`

			`// write output vertex`
			`pos[y * width + x] = make_float4(u, w, v, 1.0f);`
			`}`

			`void launch_kernel(float4 *pos, unsigned int mesh_width,`
			`unsigned int mesh_height, float time) {`
			`// execute the kernel`
			`dim3 block(8, 8, 1);`
			`dim3 grid(mesh_width / block.x, mesh_height / block.y, 1);`
			`simple_vbo_kernel<<<grid, block>>>(pos, mesh_width, mesh_height, time);`
			`}`

			`////////////////////////////////////////////////////////////////////////////////`
			`//! Run the Cuda part of the computation`
			`////////////////////////////////////////////////////////////////////////////////`
			`void runCuda(struct cudaGraphicsResource **vbo_resource) {`
			`// map OpenGL buffer object for writing from CUDA`
			`float4 *dptr;`
			`cudaGraphicsMapResources(1, vbo_resource, 0);`
			`size_t num_bytes;`
			`cudaGraphicsResourceGetMappedPointer((void **)&dptr, &num_bytes,`
			`*vbo_resource);`

			`launch_kernel(dptr, mesh_width, mesh_height, g_fAnim);`

			`// unmap buffer object`
			`cudaGraphicsUnmapResources(1, vbo_resource, 0);`
			`}`

			`#ifndef FOPEN`
			`#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))`
			`#endif`

			`void sdkDumpBin2(void *data, unsigned int bytes, const char *filename) {`
			`printf("sdkDumpBin: <%s>\n", filename);`
			`FILE *fp;`
			`FOPEN(fp, filename, "wb");`
			`fwrite(data, bytes, 1, fp);`
			`fflush(fp);`
			`fclose(fp);`
			`}`

			`////////////////////////////////////////////////////////////////////////////////`
			`//! Run the Cuda part of the computation`
			`////////////////////////////////////////////////////////////////////////////////`
			`void runAutoTest(int devID, char **argv, char *ref_file) {`
			`char *reference_file = NULL;`
			`void *imageData = malloc(mesh_width * mesh_height * sizeof(float));`

			`// execute the kernel`
			`launch_kernel((float4 *)d_vbo_buffer, mesh_width, mesh_height, g_fAnim);`

			`cudaDeviceSynchronize();`
			`getLastCudaError("launch_kernel failed");`

			`cudaMemcpy(imageData, d_vbo_buffer, mesh_width * mesh_height * sizeof(float),`
			`cudaMemcpyDeviceToHost);`

			`sdkDumpBin2(imageData, mesh_width * mesh_height * sizeof(float),`
			`"simpleGLES_screen.bin");`
			`reference_file = sdkFindFilePath(ref_file, argv[0]);`

			`if (reference_file &&`
			`!sdkCompareBin2BinFloat("simpleGLES_screen.bin", reference_file,`
			`mesh_width * mesh_height * sizeof(float),`
			`MAX_EPSILON_ERROR, THRESHOLD, pArgv[0])) {`
			`g_TotalErrors++;`
			`}`
			`}`

			`////////////////////////////////////////////////////////////////////////////////`
			`//! Display callback`
			`////////////////////////////////////////////////////////////////////////////////`
			`void display_thisframe(float time_delta) {`
			`sdkStartTimer(&timer);`

			`// run CUDA kernel to generate vertex positions`
			`runCuda(&cuda_vbo_resource);`

			`glClear(GL_COLOR_BUFFER_BIT \| GL_DEPTH_BUFFER_BIT);`

			`glDrawArrays(GL_POINTS, 0, mesh_width * mesh_height);`

			`glFinish();`

			`g_fAnim += time_delta;`

			`sdkStopTimer(&timer);`
			`computeFPS();`
			`}`

			`////////////////////////////////////////////////////////////////////////////////`
			`//! Check if the result is correct or write data to file for external`
			`//! regression testing`
			`////////////////////////////////////////////////////////////////////////////////`
			`void checkResultCuda(int argc, char **argv, const GLuint &vbo) {`
			`if (!d_vbo_buffer) {`
			`printf("%s: Mapping result buffer from OpenGL ES\n", __FUNCTION__);`

			`cudaGraphicsUnregisterResource(cuda_vbo_resource);`

			`// map buffer object`
			`glBindBuffer(GL_ARRAY_BUFFER, vbo);`
			`float *data = (float *)glMapBufferRange(`
			`GL_ARRAY_BUFFER, 0, mesh_width * mesh_height * 4 * sizeof(float),`
			`GL_READ_ONLY);`

			`// check result`
			`if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {`
			`// write file for regression test`
			`sdkWriteFile<float>("./data/regression.dat", data,`
			`mesh_width * mesh_height * 3, 0.0, false);`
			`}`

			`// unmap GL buffer object`
			`if (!glUnmapBuffer(GL_ARRAY_BUFFER)) {`
			`fprintf(stderr, "Unmap buffer failed.\n");`
			`fflush(stderr);`
			`}`

			`checkCudaErrors(cudaGraphicsGLRegisterBuffer(`
			`&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsWriteDiscard));`

			`CHECK_GLERROR();`
			`}`
			`}`

			`GLuint mesh_shader = 0;`

			`void readAndCompileShaderFromGLSLFile(GLuint new_shaderprogram,`
			`const char *filename, GLenum shaderType) {`
			`FILE *file = fopen(filename, "rb"); // open shader text file`
			`if (!file) {`
			`error_exit("Filename %s does not exist\n", filename);`
			`}`

			`// get the size of the file and read it`
			`fseek(file, 0, SEEK_END);`
			`GLint size = ftell(file);`
			`char *data = (char *)malloc(sizeof(char) * (size + 1));`
			`memset(data, 0, sizeof(char) * (size + 1));`
			`fseek(file, 0, SEEK_SET);`
			`size_t res = fread(data, 1, size, file);`
			`fclose(file);`

			`GLuint shader = glCreateShader(shaderType);`
			`glShaderSource(shader, 1, (const GLchar **)&data, &size);`
			`glCompileShader(shader);`

			`CHECK_GLERROR();`
			`GLint compile_success = 0;`
			`glGetShaderiv(shader, GL_COMPILE_STATUS, &compile_success);`
			`CHECK_GLERROR();`

			`if (compile_success == GL_FALSE) {`
			`printf("Compilation of %s failed!\n Reason:\n", filename);`

			`GLint maxLength = 0;`
			`glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &maxLength);`

			`char errorLog[maxLength];`
			`glGetShaderInfoLog(shader, maxLength, &maxLength, &errorLog[0]);`

			`printf("%s", errorLog);`

			`glDeleteShader(shader);`
			`exit(1);`
			`}`

			`glAttachShader(new_shaderprogram, shader);`
			`glDeleteShader(shader);`

			`free(data);`
			`}`

			`GLuint ShaderCreate(const char *vshader_filename,`
			`const char *fshader_filename) {`
			`printf("Loading GLSL shaders %s %s\n", vshader_filename, fshader_filename);`

			`GLuint new_shaderprogram = glCreateProgram();`

			`CHECK_GLERROR();`
			`if (vshader_filename) {`
			`readAndCompileShaderFromGLSLFile(new_shaderprogram, vshader_filename,`
			`GL_VERTEX_SHADER);`
			`}`

			`CHECK_GLERROR();`
			`if (fshader_filename) {`
			`readAndCompileShaderFromGLSLFile(new_shaderprogram, fshader_filename,`
			`GL_FRAGMENT_SHADER);`
			`}`

			`CHECK_GLERROR();`

			`glLinkProgram(new_shaderprogram);`

			`CHECK_GLERROR();`
			`GLint link_success;`
			`glGetProgramiv(new_shaderprogram, GL_LINK_STATUS, &link_success);`

			`if (link_success == GL_FALSE) {`
			`printf("Linking of %s with %s failed!\n Reason:\n", vshader_filename,`
			`fshader_filename);`

			`GLint maxLength = 0;`
			`glGetShaderiv(new_shaderprogram, GL_INFO_LOG_LENGTH, &maxLength);`

			`char errorLog[maxLength];`
			`glGetShaderInfoLog(new_shaderprogram, maxLength, &maxLength, &errorLog[0]);`

			`printf("%s", errorLog);`

			`exit(EXIT_FAILURE);`
			`}`

			`return new_shaderprogram;`
			`}`

			`//===========================================================================`
			`// InitGraphicsState() - initialize OpenGL`
			`//===========================================================================`
			`static void InitGraphicsState(void) {`
			`char GL_version = (char )glGetString(GL_VERSION);`
			`char GL_vendor = (char )glGetString(GL_VENDOR);`
			`char GL_renderer = (char )glGetString(GL_RENDERER);`

			`printf("Version: %s\n", GL_version);`
			`printf("Vendor: %s\n", GL_vendor);`
			`printf("Renderer: %s\n", GL_renderer);`

			`// RENDERING SETUP (OpenGL ES or OpenGL Core Profile!)`
			`glGenVertexArrays(1, &mesh_vao); // Features' Vertex Array Object allocation`
			`glBindVertexArray(mesh_vao); // bind VAO`

			`// initialize buffer object`
			`glGenBuffers(1, &mesh_vbo);`
			`glBindBuffer(GL_ARRAY_BUFFER, mesh_vbo);`

			`unsigned int size = mesh_width * mesh_height * 4 * sizeof(float);`
			`glBufferData(GL_ARRAY_BUFFER, size, NULL, GL_DYNAMIC_DRAW);`
			`glVertexAttribPointer((GLuint)0, 4, GL_FLOAT, GL_FALSE, 0, 0);`
			`glEnableVertexAttribArray(0);`

			`checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, mesh_vbo,`
			`cudaGraphicsMapFlagsNone));`

			`// GLSL stuff`
			`char *vertex_shader_path = sdkFindFilePath("mesh.vert.glsl", pArgv[0]);`
			`char *fragment_shader_path = sdkFindFilePath("mesh.frag.glsl", pArgv[0]);`

			`if (vertex_shader_path == NULL \|\| fragment_shader_path == NULL) {`
			`printf("Error finding shader file\n");`
			`exit(EXIT_FAILURE);`
			`}`

			`mesh_shader = ShaderCreate(vertex_shader_path, fragment_shader_path);`
			`CHECK_GLERROR();`

			`free(vertex_shader_path);`
			`free(fragment_shader_path);`

			`glUseProgram(mesh_shader);`
			`}`

			`////////////////////////////////////////////////////////////////////////////////`
			`//! Run a simple test for CUDA`
			`////////////////////////////////////////////////////////////////////////////////`
			`bool runTest(int argc, char *argv, char ref_file) {`
			`// command line mode only`
			`if (ref_file != NULL) {`
			`// This will pick the best possible CUDA capable device`
			`// int devID = findCudaDevice(argc, (const char **)argv);`
			`#if defined(__aarch64__) \|\| defined(__arm__)`
			`// find iGPU on the system which is compute capable which will perform`
			`// GLES-CUDA interop`
			`int devID = findIntegratedGPU();`
			`#else`
			`// use command-line specified CUDA device, otherwise use device with highest`
			`// Gflops/s`
			`int devID = findCudaDevice(argc, (const char **)argv);`
			`#endif`

			`// create VBO`
			`checkCudaErrors(cudaMalloc((void **)&d_vbo_buffer,`
			`mesh_width * mesh_height * 4 * sizeof(float)));`

			`// run the cuda part`
			`runAutoTest(devID, argv, ref_file);`

			`// check result of Cuda step`
			`checkResultCuda(argc, argv, mesh_vbo);`

			`cudaFree(d_vbo_buffer);`
			`d_vbo_buffer = NULL;`
			`} else {`
			`double endTime = TIME_LIMIT;`

			`// this would use command-line specified CUDA device, note that CUDA`
			`// defaults to highest Gflops/s device`
			`if (checkCmdLineFlag(argc, (const char **)argv, "device")) {`
			`error_exit("Device setting not yet implemented!\n");`
			`}`

			`// display selection`
			`if (checkCmdLineFlag(argc, (const char **)argv, "dispno")) {`
			`dispno = getCmdLineArgumentInt(argc, (const char **)argv, "dispno");`
			`}`

			`// Window width`
			`if (checkCmdLineFlag(argc, (const char **)argv, "width")) {`
			`window_width = getCmdLineArgumentInt(argc, (const char **)argv, "width");`
			`}`

			`// Window Height`
			`if (checkCmdLineFlag(argc, (const char **)argv, "height")) {`
			`window_height =`
			`getCmdLineArgumentInt(argc, (const char **)argv, "height");`
			`}`

			`// Determine how long to run for in secs: default is 10s`
			`if (checkCmdLineFlag(argc, (const char **)argv, "runtime")) {`
			`endTime = getCmdLineArgumentInt(argc, (const char **)argv, "runtime");`
			`}`

			`SetCloseCB(closeCB_app);`
			`SetKeyCB(keyCB_app);`

			`// create QNX screen window and set up associated OpenGL ES context`
			`graphics_setup_window(0, 0, window_width, window_height, sSDKsample,`
			`dispno);`

			`#if defined(__aarch64__) \|\| defined(__arm__)`
			`// find iGPU on the system which is compute capable which will perform`
			`// GLES-CUDA interop`
			`int devID = findIntegratedGPU();`
			`#else`
			`// use command-line specified CUDA device, otherwise use device with highest`
			`// Gflops/s`
			`int devID = findCudaDevice(argc, (const char **)argv);`
			`#endif`
			`InitGraphicsState(); // set up GLES stuff`

			`glClearColor(0, 0.5, 1, 1); // blue-ish background`
			`glClear(GL_COLOR_BUFFER_BIT);`

			`graphics_swap_buffers();`

			`int frame = 0;`

			`struct timeval begin, end;`
			`gettimeofday(&begin, NULL);`

			`// Print runtime`
			`if (endTime < 0.0) {`
			`endTime = TIME_LIMIT;`
			`printf(" running forever...\n");`
			`} else {`
			`printf(" running for %f seconds...\n", endTime);`
			`}`

			`while (!shutdown) {`
			`frame++;`
			`display_thisframe(0.010);`
			`usleep(1000);`
			`graphics_swap_buffers();`
			`CheckEvents();`

			`gettimeofday(&end, 0);`
			`double elapsed = (end.tv_sec - begin.tv_sec) +`
			`((end.tv_usec - begin.tv_usec) / 1000000.0);`

			`// Check whether time limit has been exceeded`
			`if (!shutdown) shutdown = (endTime <= elapsed);`
			`}`

			`// NOTE: Before destroying OpenGL ES context, must unregister all shared`
			`// resources from CUDA !`
			`checkCudaErrors(cudaGraphicsUnregisterResource(cuda_vbo_resource));`

			`graphics_close_window(); // close window and destroy OpenGL ES context`
			`}`

			`return true;`
			`}`

			`////////////////////////////////////////////////////////////////////////////////`
			`// Program main`
			`////////////////////////////////////////////////////////////////////////////////`
			// Entry point: publishes the command line through pArgc/pArgv, reads the
			// optional "file" argument, runs the test, and exits with EXIT_SUCCESS when
			// g_TotalErrors is zero and EXIT_FAILURE otherwise.
			int main(int argc, char **argv) {
			  char *ref_file = NULL;

			  pArgc = &argc;
			  pArgv = argv;

			#if defined(__linux__)
			  // Final argument 0: an existing DISPLAY value is left untouched.
			  setenv("DISPLAY", ":0", 0);
			#endif

			  printf("%s starting...\n", sSDKsample);

			  // With a "file" argument we run without OpenGL and see if the VBO is
			  // generated correctly.
			  const bool file_requested =
			      (argc > 1) && checkCmdLineFlag(argc, (const char **)argv, "file");
			  if (file_requested) {
			    getCmdLineArgumentString(argc, (const char **)argv, "file",
			                             (char **)&ref_file);
			  }

			  printf("\n");

			  runTest(argc, argv, ref_file);

			  const bool passed = (g_TotalErrors == 0);
			  printf("%s completed, returned %s\n", sSDKsample,
			         passed ? "OK" : "ERROR!");

			  exit(passed ? EXIT_SUCCESS : EXIT_FAILURE);
			}