cuda-samples/Samples/5_Domain_Specific/simpleGLES/simpleGLES.cu

/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
    This example demonstrates how to use the CUDA C bindings to OpenGL ES to
    dynamically modify a vertex buffer using a CUDA C kernel.

    The steps are:
    1. Create an empty vertex buffer object (VBO)
    2. Register the VBO with CUDA C
    3. Map the VBO for writing from CUDA C
    4. Run CUDA C kernel to modify the vertex positions
    5. Unmap the VBO
    6. Render the results using OpenGL ES

    Host code
*/

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#include <stdarg.h>
#include <unistd.h>
#include <X11/Xlib.h>
#include <X11/Xutil.h>

// Writes a printf-style formatted message to stderr and then terminates the
// process with exit status 1. This function does not return.
//
// format: printf-style format string; the remaining arguments supply the
//         values it refers to.
void error_exit(const char *format, ...) {
  va_list ap;

  va_start(ap, format);
  (void)vfprintf(stderr, format, ap);
  va_end(ap);

  exit(1);
}

#include "graphics_interface.c"

#ifdef _WIN32
#define WINDOWS_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#endif

// includes, cuda
#include <cuda_runtime.h>
#include <cuda_gl_interop.h>

// Utilities and timing functions
#include <helper_functions.h>  // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include <helper_cuda.h>  // helper functions for CUDA error check
//#include <helper_cuda_gl.h>      // helper functions for CUDA/GL interop

#include <vector_types.h>

// Comparison tolerances passed to sdkCompareBin2BinFloat() in runAutoTest().
#define MAX_EPSILON_ERROR 0.0f
#define THRESHOLD 0.0f
// NOTE(review): REFRESH_DELAY appears unused in this file - confirm.
#define REFRESH_DELAY 1  // ms

// Values stored in gui_mode by the X11 event loop in runTest().
#define GUI_IDLE 0x100
#define GUI_ROTATE 0x101
#define GUI_TRANSLATE 0x102

// Current mouse interaction mode: set to GUI_TRANSLATE / GUI_ROTATE on a
// button press and back to GUI_IDLE on release.
int gui_mode;

////////////////////////////////////////////////////////////////////////////////
// constants
// Window dimensions handed to graphics_setup_window().
const unsigned int window_width = 512;
const unsigned int window_height = 512;

// Mesh dimensions in vertices; the vertex buffer holds
// mesh_width * mesh_height entries of four floats each.
const unsigned int mesh_width = 256;
const unsigned int mesh_height = 256;

// OpenGL ES variables and interop with CUDA C
// Vertex array object and vertex buffer object created in InitGraphicsState().
GLuint mesh_vao, mesh_vbo;
// CUDA handle obtained by registering mesh_vbo with cudaGraphicsGLRegisterBuffer.
struct cudaGraphicsResource *cuda_vbo_resource;
// Device buffer from cudaMalloc, used only when a reference file is given
// (see runTest / runAutoTest); NULL otherwise.
void *d_vbo_buffer = NULL;

// Animation time passed to the kernel; advanced once per frame by
// display_thisframe().
float g_fAnim = 0.0;

// UI / mouse controls
// Pointer position recorded at the most recent button/motion event.
int mouse_old_x, mouse_old_y;
// NOTE(review): mouse_buttons appears unused in this file - confirm.
int mouse_buttons = 0;
// Updated by mouse drags in runTest(). The view-matrix code that would apply
// them is commented out in display_thisframe(), so they currently only show up
// in the printed log.
float rotate_x = 0.0, rotate_y = 0.0;
float translate_z = -3.0;

// Stopwatch created in runTest(); started/stopped around each frame and read
// by computeFPS().
StopWatchInterface *timer = NULL;

// Frame statistics
// NOTE(review): this global is shadowed by a local `frame` in runTest() and
// appears otherwise unused - confirm.
int frame;
int fpsCount = 0;  // FPS count for averaging
int fpsLimit = 1;  // FPS limit for sampling
// NOTE(review): g_Index appears unused in this file - confirm.
int g_Index = 0;
float avgFPS = 0.0f;
unsigned int frameCount = 0;
// Incremented by runAutoTest() on a reference mismatch; main() exits with
// EXIT_FAILURE when it is non-zero.
unsigned int g_TotalErrors = 0;

// Auto-Verification Code
// NOTE(review): g_bQAReadback appears unused in this file - confirm.
bool g_bQAReadback = false;

// Copies of main()'s argc address and argv, so helpers can reach the
// executable path (pArgv[0]).
int *pArgc = NULL;
char **pArgv = NULL;

// Evaluates to the larger of a and b. Every use of an argument is
// parenthesized so that arguments containing low-precedence operators
// (e.g. `x & mask`, `c ? p : q`) expand correctly. The selected argument is
// still evaluated twice, so avoid arguments with side effects.
#define MAX(a, b) (((a) > (b)) ? (a) : (b))

////////////////////////////////////////////////////////////////////////////////
// declaration, forward

// CUDA functionality
// Maps the registered VBO, runs the kernel on it and unmaps it again.
void runCuda(struct cudaGraphicsResource **vbo_resource);
// Runs the kernel on d_vbo_buffer, dumps the result to a file and compares it
// with the given reference file.
void runAutoTest(int devID, char **argv, char *ref_file);
// Reads the VBO back through OpenGL ES; optionally writes a regression file.
void checkResultCuda(int argc, char **argv, const GLuint &vbo);

// Sample name used for the window title and the start/completion messages.
const char *sSDKsample = "simpleGLES (VBO)";

// Updates the frame statistics and shows the current estimate in the window
// title. Called once per rendered frame by display_thisframe().
//
// Every fpsLimit calls, avgFPS is recomputed from the average value of the
// global stopwatch `timer` (treated as milliseconds, given the /1000 below),
// the stopwatch is reset, and fpsLimit is set to the new avgFPS (at least 1),
// so the estimate is refreshed roughly once per second.
void computeFPS() {
  frameCount++;
  fpsCount++;

  if (fpsCount == fpsLimit) {
    const float avgFrameTime = sdkGetAverageTimerValue(&timer);

    // A zero average would make avgFPS infinite, and converting an infinite
    // float to int below is undefined behavior. Keep the previous estimate
    // in that case.
    if (avgFrameTime > 0.f) {
      avgFPS = 1.f / (avgFrameTime / 1000.f);
    }

    fpsCount = 0;
    fpsLimit = (int)MAX(avgFPS, 1.f);

    sdkResetTimer(&timer);
  }

  char fps[256];
  // snprintf bounds the write to the buffer regardless of the value printed.
  snprintf(fps, sizeof(fps),
           "Cuda/OpenGL ES Interop (VBO): %3.1f fps (Max 1000 fps)", avgFPS);
  graphics_set_windowtitle(fps);
}

///////////////////////////////////////////////////////////////////////////////
//! Simple kernel to modify vertex positions in sine wave pattern
//!
//! Expects a 2D launch with one thread per vertex. Threads whose (x, y) lies
//! outside width x height return without writing, so the grid may be larger
//! than the mesh.
//!
//! @param pos     output vertices in global memory; width * height float4
//!                elements, indexed as y * width + x
//! @param width   number of vertices along x
//! @param height  number of vertices along y
//! @param time    phase offset added to the sine and cosine arguments
///////////////////////////////////////////////////////////////////////////////
__global__ void simple_vbo_kernel(float4 *pos, unsigned int width,
                                  unsigned int height, float time) {
  unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

  // Guard the grid tail: the launch rounds the grid up to whole blocks.
  if (x >= width || y >= height) return;

  // calculate uv coordinates, mapped from [0, 1) to [-1, 1)
  float u = x / (float)width;
  float v = y / (float)height;
  u = u * 2.0f - 1.0f;
  v = v * 2.0f - 1.0f;

  // calculate simple sine wave pattern
  float freq = 4.0f;
  float w = sinf(u * freq + time) * cosf(v * freq + time) * 0.5f;

  // write output vertex; index computed in size_t so it cannot wrap
  pos[(size_t)y * width + x] = make_float4(u, w, v, 1.0f);
}

//! Launches simple_vbo_kernel over the whole mesh with 8x8 thread blocks.
//!
//! The launch is asynchronous; only launch-configuration errors are checked
//! here (getLastCudaError reports them and terminates the program).
//!
//! @param pos          device pointer to mesh_width * mesh_height float4s
//! @param mesh_width   number of vertices along x
//! @param mesh_height  number of vertices along y
//! @param time         animation phase forwarded to the kernel
void launch_kernel(float4 *pos, unsigned int mesh_width,
                   unsigned int mesh_height, float time) {
  // A grid with a zero dimension is an invalid launch configuration.
  if (mesh_width == 0 || mesh_height == 0) return;

  // execute the kernel; the grid is rounded up so that mesh sizes that are
  // not a multiple of the block size are still fully covered
  dim3 block(8, 8, 1);
  dim3 grid((mesh_width + block.x - 1) / block.x,
            (mesh_height + block.y - 1) / block.y, 1);
  simple_vbo_kernel<<<grid, block>>>(pos, mesh_width, mesh_height, time);

  getLastCudaError("simple_vbo_kernel launch failed");
}

////////////////////////////////////////////////////////////////////////////////
//! Run the Cuda part of the computation
//!
//! Maps the registered vertex buffer into the CUDA address space, launches the
//! kernel on it with the current animation time (g_fAnim) and unmaps it again.
//! Any CUDA API failure is reported and terminates the program.
//!
//! @param vbo_resource  address of the CUDA graphics resource handle that was
//!                      registered for the vertex buffer
////////////////////////////////////////////////////////////////////////////////
void runCuda(struct cudaGraphicsResource **vbo_resource) {
  float4 *dptr = NULL;
  size_t num_bytes = 0;

  // map OpenGL buffer object for writing from CUDA
  checkCudaErrors(cudaGraphicsMapResources(1, vbo_resource, 0));
  checkCudaErrors(cudaGraphicsResourceGetMappedPointer(
      (void **)&dptr, &num_bytes, *vbo_resource));

  // The kernel writes one float4 per mesh vertex; refuse to run on a mapping
  // that is too small rather than writing out of bounds on the device.
  const size_t required_bytes =
      (size_t)mesh_width * mesh_height * sizeof(float4);
  if (num_bytes < required_bytes) {
    error_exit("Mapped VBO is too small: %lu bytes, need %lu\n",
               (unsigned long)num_bytes, (unsigned long)required_bytes);
  }

  // execute the kernel
  launch_kernel(dptr, mesh_width, mesh_height, g_fAnim);

  // unmap buffer object
  checkCudaErrors(cudaGraphicsUnmapResources(1, vbo_resource, 0));
}

// FOPEN(fHandle, filename, mode): opens `filename` and stores the stream in
// fHandle. Only defined here if no earlier header already provides it.
#ifdef _WIN32
#ifndef FOPEN
#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode)
#endif
#else
#ifndef FOPEN
#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))
#endif
#endif

// Writes `bytes` bytes starting at `data` to `filename` in binary mode,
// replacing any existing file.
//
// data:     start of the host buffer to dump
// bytes:    number of bytes to write
// filename: path of the output file
//
// Failures to open or write the file are reported on stderr; the function
// then returns without terminating the program.
void sdkDumpBin2(void *data, unsigned int bytes, const char *filename) {
  printf("sdkDumpBin: <%s>\n", filename);

  FILE *fp = NULL;
  FOPEN(fp, filename, "wb");

  // Writing to or closing a NULL stream would crash, so stop here.
  if (fp == NULL) {
    fprintf(stderr, "sdkDumpBin2: unable to open <%s> for writing\n",
            filename);
    return;
  }

  if (bytes > 0 && fwrite(data, 1, bytes, fp) != bytes) {
    fprintf(stderr, "sdkDumpBin2: short write to <%s>\n", filename);
  }

  fflush(fp);
  fclose(fp);
}

////////////////////////////////////////////////////////////////////////////////
//! Run the kernel once on d_vbo_buffer and compare the result with a
//! reference file
//!
//! The kernel output is copied back to the host, dumped to "simpleGL.bin" and
//! compared against the reference with sdkCompareBin2BinFloat. A mismatch
//! increments g_TotalErrors. CUDA failures terminate the program.
//!
//! @param devID     CUDA device id (not used by this function)
//! @param argv      program arguments; argv[0] is used to locate the
//!                  reference file
//! @param ref_file  name of the reference file to look up
////////////////////////////////////////////////////////////////////////////////
void runAutoTest(int devID, char **argv, char *ref_file) {
  // NOTE(review): d_vbo_buffer is allocated with 4 floats per vertex (see
  // runTest), but only mesh_width * mesh_height floats are copied back and
  // dumped, and the same byte count is passed on as the comparison size.
  // Presumably this matches how the reference file was produced - confirm
  // before changing any of these sizes.
  const size_t dump_bytes = mesh_width * mesh_height * sizeof(float);

  void *imageData = malloc(dump_bytes);
  if (imageData == NULL) {
    error_exit("runAutoTest: cannot allocate %lu bytes of host memory\n",
               (unsigned long)dump_bytes);
  }

  // execute the kernel
  launch_kernel((float4 *)d_vbo_buffer, mesh_width, mesh_height, g_fAnim);

  checkCudaErrors(cudaDeviceSynchronize());
  getLastCudaError("launch_kernel failed");

  checkCudaErrors(cudaMemcpy(imageData, d_vbo_buffer, dump_bytes,
                             cudaMemcpyDeviceToHost));

  sdkDumpBin2(imageData, (unsigned int)dump_bytes, "simpleGL.bin");
  free(imageData);  // the host copy is only needed for the dump

  char *reference_file = sdkFindFilePath(ref_file, argv[0]);
  if (reference_file == NULL) {
    // Kept non-fatal as before, but no longer silent.
    fprintf(stderr,
            "runAutoTest: reference file <%s> not found, comparison skipped\n",
            ref_file);
    return;
  }

  if (!sdkCompareBin2BinFloat("simpleGL.bin", reference_file,
                              (unsigned int)dump_bytes, MAX_EPSILON_ERROR,
                              THRESHOLD, pArgv[0])) {
    g_TotalErrors++;
  }

  // Paths returned by sdkFindFilePath are released with free() elsewhere in
  // this file (InitGraphicsState), so do the same here.
  free(reference_file);
}

////////////////////////////////////////////////////////////////////////////////
//! Per-frame callback: regenerate the mesh with CUDA, draw it as points and
//! update the frame statistics
//!
//! @param time_delta  amount added to the animation time g_fAnim after drawing
////////////////////////////////////////////////////////////////////////////////
void display_thisframe(float time_delta) {
  // One point is drawn per mesh vertex.
  const unsigned int vertex_count = mesh_width * mesh_height;

  sdkStartTimer(&timer);

  // Let the CUDA kernel write this frame's vertex positions into the VBO.
  runCuda(&cuda_vbo_resource);

  glClear(GL_DEPTH_BUFFER_BIT | GL_COLOR_BUFFER_BIT);

  // No view matrix is set here: the fixed-function calls (glMatrixMode,
  // glLoadIdentity, glTranslatef with translate_z, glRotatef with rotate_x /
  // rotate_y) do not work in OpenGL ES and would have to be moved into the
  // shader.
  glDrawArrays(GL_POINTS, 0, vertex_count);

  // Wait for the draw to complete so that it is included in the timing.
  glFinish();

  g_fAnim = g_fAnim + time_delta;

  sdkStopTimer(&timer);
  computeFPS();
}

////////////////////////////////////////////////////////////////////////////////
//! Check if the result is correct or write data to file for external
//! regression testing
//!
//! Does nothing while d_vbo_buffer is set. Otherwise the VBO is unregistered
//! from CUDA, mapped for reading through OpenGL ES, optionally written to
//! "./data/regression.dat" (when the "regression" command-line flag is
//! present), unmapped, and registered with CUDA again.
//!
//! @param argc  number of program arguments
//! @param argv  program arguments, searched for the "regression" flag
//! @param vbo   OpenGL ES buffer object to read back
////////////////////////////////////////////////////////////////////////////////
void checkResultCuda(int argc, char **argv, const GLuint &vbo) {
  if (d_vbo_buffer) {
    return;
  }

  printf("%s: Mapping result buffer from OpenGL ES\n", __FUNCTION__);

  checkCudaErrors(cudaGraphicsUnregisterResource(cuda_vbo_resource));

  // map buffer object
  // glMapBufferRange takes a bitfield of GL_MAP_*_BIT flags as its access
  // argument; the enum GL_READ_ONLY is not a valid value for it.
  glBindBuffer(GL_ARRAY_BUFFER, vbo);
  float *data = (float *)glMapBufferRange(
      GL_ARRAY_BUFFER, 0, mesh_width * mesh_height * 4 * sizeof(float),
      GL_MAP_READ_BIT);

  if (data == NULL) {
    // Nothing was mapped, so there is nothing to write or unmap.
    fprintf(stderr, "Map buffer failed.\n");
    fflush(stderr);
  } else {
    // check result
    if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
      // write file for regression test
      // NOTE(review): writes 3 floats per vertex although the buffer holds 4
      // per vertex - confirm this element count is intended.
      sdkWriteFile<float>("./data/regression.dat", data,
                          mesh_width * mesh_height * 3, 0.0, false);
    }

    // unmap GL buffer object
    if (!glUnmapBuffer(GL_ARRAY_BUFFER)) {
      fprintf(stderr, "Unmap buffer failed.\n");
      fflush(stderr);
    }
  }

  checkCudaErrors(cudaGraphicsGLRegisterBuffer(
      &cuda_vbo_resource, vbo, cudaGraphicsMapFlagsWriteDiscard));

  GET_GLERROR(0);
}

// GLSL program object used to draw the mesh; created and made current in
// InitGraphicsState().
GLuint mesh_shader = 0;

// Reads a GLSL source file, compiles it as a shader of the given type and
// attaches the result to a program object.
//
// new_shaderprogram: program object the compiled shader is attached to
// filename:          path of the GLSL source file
// shaderType:        shader type passed to glCreateShader
//
// Terminates the program if the file cannot be read or the shader does not
// compile; on a compile failure the shader info log is printed first.
void readAndCompileShaderFromGLSLFile(GLuint new_shaderprogram,
                                      const char *filename, GLenum shaderType) {
  FILE *file = fopen(filename, "rb");  // open shader text file
  if (!file) error_exit("Filename %s does not exist\n", filename);

  /* get the size of the file */
  long file_size = -1;
  if (fseek(file, 0, SEEK_END) == 0) file_size = ftell(file);
  if (file_size < 0 || fseek(file, 0, SEEK_SET) != 0) {
    fclose(file);
    error_exit("Cannot determine the size of %s\n", filename);
  }

  // glShaderSource takes the length as a GLint; reject files that do not fit.
  GLint size = (GLint)file_size;
  if ((long)size != file_size) {
    fclose(file);
    error_exit("Shader file %s is too large\n", filename);
  }

  /* read the whole file into a zero-terminated buffer */
  char *data = (char *)calloc((size_t)file_size + 1, sizeof(char));
  if (!data) {
    fclose(file);
    error_exit("Out of memory while reading %s\n", filename);
  }

  const size_t bytes_read = fread(data, 1, (size_t)file_size, file);
  fclose(file);

  if (bytes_read != (size_t)file_size) {
    free(data);
    error_exit("Could not read all of %s\n", filename);
  }

  GLuint shader = glCreateShader(shaderType);
  const GLchar *source = data;
  glShaderSource(shader, 1, &source, &size);
  glCompileShader(shader);

  // glShaderSource copies the source into the shader object, so the host
  // buffer can be released before either exit path below.
  free(data);

  GET_GLERROR(0);
  GLint compile_success = 0;
  glGetShaderiv(shader, GL_COMPILE_STATUS, &compile_success);
  GET_GLERROR(0);

  if (compile_success == GL_FALSE) {
    printf("Compilation of %s failed!\n Reason:\n", filename);

    GLint maxLength = 0;
    glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &maxLength);

    // Heap buffer instead of a variable-length array: the reported length can
    // be zero, and VLAs are not standard C++.
    if (maxLength > 0) {
      char *errorLog = (char *)malloc((size_t)maxLength);
      if (errorLog) {
        errorLog[0] = '\0';
        glGetShaderInfoLog(shader, maxLength, NULL, errorLog);
        printf("%s", errorLog);
        free(errorLog);
      }
    }

    glDeleteShader(shader);
    exit(1);
  }

  glAttachShader(new_shaderprogram, shader);
  // This only flags the shader for deletion; GL keeps it alive for as long as
  // it is attached to the program.
  glDeleteShader(shader);
}

// Creates a GLSL program from a vertex and a fragment shader source file.
//
// vshader_filename: path of the vertex shader source, or NULL to skip it
// fshader_filename: path of the fragment shader source, or NULL to skip it
//
// Returns the linked program object. Terminates the program if compilation or
// linking fails; on a link failure the program info log is printed first.
GLuint ShaderCreate(const char *vshader_filename,
                    const char *fshader_filename) {
  // Either filename may be NULL (see the checks below); never hand NULL to %s.
  const char *vshader_label = vshader_filename ? vshader_filename : "(none)";
  const char *fshader_label = fshader_filename ? fshader_filename : "(none)";

  printf("Loading GLSL shaders %s %s\n", vshader_label, fshader_label);

  GLuint new_shaderprogram = glCreateProgram();

  GET_GLERROR(0);
  if (vshader_filename)
    readAndCompileShaderFromGLSLFile(new_shaderprogram, vshader_filename,
                                     GL_VERTEX_SHADER);

  GET_GLERROR(0);
  if (fshader_filename)
    readAndCompileShaderFromGLSLFile(new_shaderprogram, fshader_filename,
                                     GL_FRAGMENT_SHADER);

  GET_GLERROR(0);

  glLinkProgram(new_shaderprogram);

  GET_GLERROR(0);
  GLint link_success = GL_FALSE;
  glGetProgramiv(new_shaderprogram, GL_LINK_STATUS, &link_success);

  if (link_success == GL_FALSE) {
    printf("Linking of %s with %s failed!\n Reason:\n", vshader_label,
           fshader_label);

    // new_shaderprogram is a program object, so its log must be queried with
    // the glGetProgram* calls; the glGetShader* calls are for shader objects.
    GLint maxLength = 0;
    glGetProgramiv(new_shaderprogram, GL_INFO_LOG_LENGTH, &maxLength);

    // Heap buffer instead of a variable-length array: the reported length can
    // be zero, and VLAs are not standard C++.
    if (maxLength > 0) {
      char *errorLog = (char *)malloc((size_t)maxLength);
      if (errorLog) {
        errorLog[0] = '\0';
        glGetProgramInfoLog(new_shaderprogram, maxLength, NULL, errorLog);
        printf("%s", errorLog);
        free(errorLog);
      }
    }

    exit(EXIT_FAILURE);
  }

  return new_shaderprogram;
}

//===========================================================================
// InitGraphicsState() - initialize OpenGL
//
// Prints the GL version/vendor/renderer strings, creates the vertex array and
// vertex buffer for the mesh, registers the buffer with CUDA, and builds and
// activates the mesh shader program. Requires a current OpenGL ES context.
// Terminates the program if the shader files cannot be found or CUDA
// registration fails.
//===========================================================================
static void InitGraphicsState(void) {
  const char *GL_version = (const char *)glGetString(GL_VERSION);
  const char *GL_vendor = (const char *)glGetString(GL_VENDOR);
  const char *GL_renderer = (const char *)glGetString(GL_RENDERER);

  // glGetString returns NULL on error; passing NULL to %s is undefined.
  printf("Version: %s\n", GL_version ? GL_version : "(unknown)");
  printf("Vendor: %s\n", GL_vendor ? GL_vendor : "(unknown)");
  printf("Renderer: %s\n", GL_renderer ? GL_renderer : "(unknown)");

  // RENDERING SETUP (OpenGL ES or OpenGL Core Profile!)
  glGenVertexArrays(1, &mesh_vao);  // Features' Vertex Array Object allocation
  glBindVertexArray(mesh_vao);      // bind VAO

  // initialize buffer object
  glGenBuffers(1, &mesh_vbo);
  glBindBuffer(GL_ARRAY_BUFFER, mesh_vbo);

  // four floats (x, y, z, w) per mesh vertex; contents are left undefined
  // here and filled in by the CUDA kernel each frame
  unsigned int size = mesh_width * mesh_height * 4 * sizeof(float);
  glBufferData(GL_ARRAY_BUFFER, size, NULL, GL_DYNAMIC_DRAW);
  // attribute 0: 4 tightly packed floats per vertex, starting at offset 0
  glVertexAttribPointer((GLuint)0, 4, GL_FLOAT, GL_FALSE, 0, 0);
  glEnableVertexAttribArray(0);

  checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, mesh_vbo,
                                               cudaGraphicsMapFlagsNone));
  // The VAO is intentionally left bound: it is the only one used throughout.

  // GLSL stuff
  char *vertex_shader_path = sdkFindFilePath("mesh.vert.glsl", pArgv[0]);
  char *fragment_shader_path = sdkFindFilePath("mesh.frag.glsl", pArgv[0]);

  if (vertex_shader_path == NULL || fragment_shader_path == NULL) {
    printf("Error finding shader file\n");
    exit(EXIT_FAILURE);
  }

  mesh_shader = ShaderCreate(vertex_shader_path, fragment_shader_path);
  GET_GLERROR(0);

  free(vertex_shader_path);
  free(fragment_shader_path);

  glUseProgram(mesh_shader);
}

////////////////////////////////////////////////////////////////////////////////
//! Process one X11 event for the interactive loop in runTest()
//!
//! Logs expose, key and button events, switches gui_mode on button
//! press/release and updates rotate_x / rotate_y / translate_z on pointer
//! motion according to gui_mode.
//!
//! @param event  event obtained from XNextEvent
//! @return true if the ESC key was pressed (the caller should stop the loop),
//!         false otherwise
////////////////////////////////////////////////////////////////////////////////
static bool handleX11Event(XEvent &event) {
  if (event.type == Expose && event.xexpose.count == 0) {
    printf("Redraw requested!\n");
  }

  if (event.type == KeyPress) {
    KeySym key;
    char text[255];

    if (XLookupString(&event.xkey, text, 255, &key, 0) == 1) {
      if (text[0] == 27) return true;  // 27 == ESC

      printf("You pressed the %c key!\n", text[0]);
    }
  }

  if (event.type == ButtonPress) {
    printf("Mouse button %d press at (%d,%d)\n", event.xbutton.button,
           event.xbutton.x, event.xbutton.y);

    if (event.xbutton.button == Button1) gui_mode = GUI_TRANSLATE;
    if (event.xbutton.button == Button3) gui_mode = GUI_ROTATE;
    mouse_old_x = event.xbutton.x;
    mouse_old_y = event.xbutton.y;
  }

  if (event.type == ButtonRelease) {
    printf("Mouse button %d released at (%d,%d)\n", event.xbutton.button,
           event.xbutton.x, event.xbutton.y);

    gui_mode = GUI_IDLE;
    mouse_old_x = event.xbutton.x;
    mouse_old_y = event.xbutton.y;
  }

  if (event.type == MotionNotify) {
    // pointer movement since the previous button/motion event
    float dx = (float)(event.xmotion.x - mouse_old_x);
    float dy = (float)(event.xmotion.y - mouse_old_y);

    if (gui_mode == GUI_ROTATE) {
      rotate_x += dy * 0.2f;
      rotate_y += dx * 0.2f;
      printf("rot x %f y %f\n", rotate_x, rotate_y);
    }
    if (gui_mode == GUI_TRANSLATE) {
      translate_z += dy * 0.01f;
      printf("translate z %f\n", translate_z);
    }

    mouse_old_x = event.xmotion.x;
    mouse_old_y = event.xmotion.y;
  }

  return false;
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
//!
//! With a reference file: runs the kernel once on a plain device buffer and
//! compares the output against the reference (no window is created).
//! Without one: opens a window, sets up OpenGL ES and renders frames until
//! ESC is pressed.
//!
//! @param argc      number of program arguments
//! @param argv      program arguments
//! @param ref_file  reference file name, or NULL for interactive mode
//! @return always true; comparison failures are recorded in g_TotalErrors
////////////////////////////////////////////////////////////////////////////////
bool runTest(int argc, char **argv, char *ref_file) {
  // Create the CUTIL timer
  sdkCreateTimer(&timer);

  int devID = 0;
#if defined(__aarch64__) || defined(__arm__)
  // find iGPU on the system which is compute capable which will perform
  // GLES-CUDA interop
  devID = findIntegratedGPU();
#else
  // use command-line specified CUDA device, otherwise use device with highest
  // Gflops/s
  devID = findCudaDevice(argc, (const char **)argv);
#endif

  // command line mode only
  if (ref_file != NULL) {
    // create VBO
    checkCudaErrors(cudaMalloc((void **)&d_vbo_buffer,
                               mesh_width * mesh_height * 4 * sizeof(float)));

    // run the cuda part
    runAutoTest(devID, argv, ref_file);

    // check result of Cuda step
    // NOTE(review): checkResultCuda returns immediately while d_vbo_buffer is
    // non-NULL, so this call currently does nothing - confirm intent.
    checkResultCuda(argc, argv, mesh_vbo);

    checkCudaErrors(cudaFree(d_vbo_buffer));
    d_vbo_buffer = NULL;
  } else {
    // this would use command-line specified CUDA device, note that CUDA
    // defaults to highest Gflops/s device
    if (checkCmdLineFlag(argc, (const char **)argv, "device"))
      error_exit("Device setting not yet implemented!\n");

    // create X11 window and set up associated OpenGL ES context
    graphics_setup_window(0, 0, window_width, window_height, sSDKsample);

    InitGraphicsState();  // set up GLES stuff

    glClearColor(0, 0.5, 1, 1);  // blue-ish background
    glClear(GL_COLOR_BUFFER_BIT);

    graphics_swap_buffers();

    // NOTE(review): this counter is never incremented (the only increment
    // lived in a commented-out printf), so the 100000-frame cap never
    // triggers and the loop runs until ESC is pressed. Behavior is kept
    // as-is; confirm whether the cap was meant to be active.
    int rendered_frames = 0;

    while (rendered_frames < 100000) {
      // handle at most one pending event per frame
      if (XPending(display)) {
        XEvent event;
        XNextEvent(display, &event);

        if (handleX11Event(event)) break;
      }

      display_thisframe(0.010f);
      usleep(1000);  // need not take full CPU and GPU

      graphics_swap_buffers();
    }

    // NOTE: Before destroying OpenGL ES context, must unregister all shared
    // resources from CUDA !
    checkCudaErrors(cudaGraphicsUnregisterResource(cuda_vbo_resource));

    graphics_close_window();  // close window and destroy OpenGL ES context
  }

  // Release the timer on both paths (it was previously leaked in
  // reference-file mode).
  sdkDeleteTimer(&timer);

  return true;
}

////////////////////////////////////////////////////////////////////////////////
// Program entry point
//
// Records argc/argv in the pArgc/pArgv globals, picks up an optional "file"
// command-line argument naming a reference file, runs the sample via runTest()
// and exits with EXIT_SUCCESS when g_TotalErrors is zero, EXIT_FAILURE
// otherwise.
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
  pArgv = argv;
  pArgc = &argc;

#if defined(__linux__)
  // Last argument 0: an already-set DISPLAY is left untouched.
  setenv("DISPLAY", ":0", 0);
#endif

  printf("%s starting...\n", sSDKsample);

  // With a reference file the sample runs without OpenGL and only checks
  // that the vertex data is generated correctly.
  char *ref_file = NULL;
  const char **const_argv = (const char **)argv;

  if (argc > 1 && checkCmdLineFlag(argc, const_argv, "file")) {
    getCmdLineArgumentString(argc, const_argv, "file", (char **)&ref_file);
  }

  printf("\n");

  runTest(argc, argv, ref_file);

  const bool passed = (g_TotalErrors == 0);

  printf("%s completed, returned %s\n", sSDKsample, passed ? "OK" : "ERROR!");

  exit(passed ? EXIT_SUCCESS : EXIT_FAILURE);
}
add and update samples for CUDA 11.6 2022-01-13 14:05:24 +08:00			`/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.`
add and update samples for CUDA 11.5 2021-10-21 19:04:49 +08:00			`*`
			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions`
			`* are met:`
			`* * Redistributions of source code must retain the above copyright`
			`* notice, this list of conditions and the following disclaimer.`
			`* * Redistributions in binary form must reproduce the above copyright`
			`* notice, this list of conditions and the following disclaimer in the`
			`* documentation and/or other materials provided with the distribution.`
			`* * Neither the name of NVIDIA CORPORATION nor the names of its`
			`* contributors may be used to endorse or promote products derived`
			`* from this software without specific prior written permission.`
			`*`
			* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
			`* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR`
			`* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR`
			`* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,`
			`* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,`
			`* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR`
			`* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY`
			`* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`*/`

			`/*`
			`This example demonstrates how to use the CUDA C bindings to OpenGL ES to`
			`dynamically modify a vertex buffer using a CUDA C kernel.`

			`The steps are:`
			`1. Create an empty vertex buffer object (VBO)`
			`2. Register the VBO with CUDA C`
			`3. Map the VBO for writing from CUDA C`
			`4. Run CUDA C kernel to modify the vertex positions`
			`5. Unmap the VBO`
			`6. Render the results using OpenGL ES`

			`Host code`
			`*/`

			`// includes, system`
			`#include <stdlib.h>`
			`#include <stdio.h>`
			`#include <string.h>`
			`#include <math.h>`

			`#include <stdarg.h>`
			`#include <unistd.h>`
			`#include <X11/Xlib.h>`
			`#include <X11/Xutil.h>`

			`void error_exit(const char *format, ...) {`
			`va_list args;`
			`va_start(args, format);`
			`vfprintf(stderr, format, args);`
			`va_end(args);`
			`exit(1);`
			`}`

			`#include "graphics_interface.c"`

			`#ifdef _WIN32`
			`#define WINDOWS_LEAN_AND_MEAN`
			`#define NOMINMAX`
			`#include <windows.h>`
			`#endif`

			`// includes, cuda`
			`#include <cuda_runtime.h>`
			`#include <cuda_gl_interop.h>`

			`// Utilities and timing functions`
			`#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h`

			`// CUDA helper functions`
			`#include <helper_cuda.h> // helper functions for CUDA error check`
			`//#include <helper_cuda_gl.h> // helper functions for CUDA/GL interop`

			`#include <vector_types.h>`

			`#define MAX_EPSILON_ERROR 0.0f`
			`#define THRESHOLD 0.0f`
			`#define REFRESH_DELAY 1 // ms`

			`#define GUI_IDLE 0x100`
			`#define GUI_ROTATE 0x101`
			`#define GUI_TRANSLATE 0x102`

			`int gui_mode;`

			`////////////////////////////////////////////////////////////////////////////////`
			`// constants`
			`const unsigned int window_width = 512;`
			`const unsigned int window_height = 512;`

			`const unsigned int mesh_width = 256;`
			`const unsigned int mesh_height = 256;`

			`// OpenGL ES variables and interop with CUDA C`
			`GLuint mesh_vao, mesh_vbo;`
			`struct cudaGraphicsResource *cuda_vbo_resource;`
			`void *d_vbo_buffer = NULL;`

			`float g_fAnim = 0.0;`

			`// UI / mouse controls`
			`int mouse_old_x, mouse_old_y;`
			`int mouse_buttons = 0;`
			`float rotate_x = 0.0, rotate_y = 0.0;`
			`float translate_z = -3.0;`

			`StopWatchInterface *timer = NULL;`

			`// Frame statistics`
			`int frame;`
			`int fpsCount = 0; // FPS count for averaging`
			`int fpsLimit = 1; // FPS limit for sampling`
			`int g_Index = 0;`
			`float avgFPS = 0.0f;`
			`unsigned int frameCount = 0;`
			`unsigned int g_TotalErrors = 0;`

			`// Auto-Verification Code`
			`bool g_bQAReadback = false;`

			`int *pArgc = NULL;`
			`char **pArgv = NULL;`

			`#define MAX(a, b) ((a > b) ? a : b)`

			`////////////////////////////////////////////////////////////////////////////////`
			`// declaration, forward`

			`// CUDA functionality`
			`void runCuda(struct cudaGraphicsResource **vbo_resource);`
			`void runAutoTest(int devID, char *argv, char ref_file);`
			`void checkResultCuda(int argc, char **argv, const GLuint &vbo);`

			`const char *sSDKsample = "simpleGLES (VBO)";`

			`void computeFPS() {`
			`frameCount++;`
			`fpsCount++;`

			`if (fpsCount == fpsLimit) {`
			`avgFPS = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f);`
			`fpsCount = 0;`
			`fpsLimit = (int)MAX(avgFPS, 1.f);`

			`sdkResetTimer(&timer);`
			`}`

			`char fps[256];`
			`sprintf(fps, "Cuda/OpenGL ES Interop (VBO): %3.1f fps (Max 1000 fps)",`
			`avgFPS);`
			`graphics_set_windowtitle(fps);`
			`}`

			`///////////////////////////////////////////////////////////////////////////////`
			`//! Simple kernel to modify vertex positions in sine wave pattern`
			`//! @param data data in global memory`
			`///////////////////////////////////////////////////////////////////////////////`
			`__global__ void simple_vbo_kernel(float4 *pos, unsigned int width,`
			`unsigned int height, float time) {`
			`unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;`
			`unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;`

			`// calculate uv coordinates`
			`float u = x / (float)width;`
			`float v = y / (float)height;`
			`u = u * 2.0f - 1.0f;`
			`v = v * 2.0f - 1.0f;`

			`// calculate simple sine wave pattern`
			`float freq = 4.0f;`
			`float w = sinf(u * freq + time) * cosf(v * freq + time) * 0.5f;`

			`// write output vertex`
			`pos[y * width + x] = make_float4(u, w, v, 1.0f);`
			`}`

			`void launch_kernel(float4 *pos, unsigned int mesh_width,`
			`unsigned int mesh_height, float time) {`
			`// execute the kernel`
			`dim3 block(8, 8, 1);`
			`dim3 grid(mesh_width / block.x, mesh_height / block.y, 1);`
			`simple_vbo_kernel<<<grid, block>>>(pos, mesh_width, mesh_height, time);`
			`}`

			`////////////////////////////////////////////////////////////////////////////////`
			`//! Run the Cuda part of the computation`
			`////////////////////////////////////////////////////////////////////////////////`
			`void runCuda(struct cudaGraphicsResource **vbo_resource) {`
			`// map OpenGL buffer object for writing from CUDA`
			`float4 *dptr;`
			`cudaGraphicsMapResources(1, vbo_resource, 0);`
			`size_t num_bytes;`
			`cudaGraphicsResourceGetMappedPointer((void **)&dptr, &num_bytes,`
			`*vbo_resource);`
			`// printf("Sample CUDA mapped VBO: May access %ld bytes\n", num_bytes);`

			`// execute the kernel`
			`// dim3 block(8, 8, 1);`
			`// dim3 grid(mesh_width / block.x, mesh_height / block.y, 1);`
			`// kernel<<< grid, block>>>(dptr, mesh_width, mesh_height, g_fAnim);`

			`launch_kernel(dptr, mesh_width, mesh_height, g_fAnim);`

			`// unmap buffer object`
			`cudaGraphicsUnmapResources(1, vbo_resource, 0);`
			`}`

			`#ifdef _WIN32`
			`#ifndef FOPEN`
			`#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode)`
			`#endif`
			`#else`
			`#ifndef FOPEN`
			`#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))`
			`#endif`
			`#endif`

			`void sdkDumpBin2(void data, unsigned int bytes, const char filename) {`
			`printf("sdkDumpBin: <%s>\n", filename);`
			`FILE *fp;`
			`FOPEN(fp, filename, "wb");`
			`fwrite(data, bytes, 1, fp);`
			`fflush(fp);`
			`fclose(fp);`
			`}`

			`////////////////////////////////////////////////////////////////////////////////`
			`//! Run the Cuda part of the computation`
			`////////////////////////////////////////////////////////////////////////////////`
			`void runAutoTest(int devID, char *argv, char ref_file) {`
			`char *reference_file = NULL;`
			`void imageData = malloc(mesh_width mesh_height * sizeof(float));`

			`// execute the kernel`
			`launch_kernel((float4 *)d_vbo_buffer, mesh_width, mesh_height, g_fAnim);`

			`cudaDeviceSynchronize();`
			`getLastCudaError("launch_kernel failed");`

			`cudaMemcpy(imageData, d_vbo_buffer, mesh_width * mesh_height * sizeof(float),`
			`cudaMemcpyDeviceToHost);`

			`sdkDumpBin2(imageData, mesh_width * mesh_height * sizeof(float),`
			`"simpleGL.bin");`
			`reference_file = sdkFindFilePath(ref_file, argv[0]);`

			`if (reference_file &&`
			`!sdkCompareBin2BinFloat("simpleGL.bin", reference_file,`
			`mesh_width * mesh_height * sizeof(float),`
			`MAX_EPSILON_ERROR, THRESHOLD, pArgv[0])) {`
			`g_TotalErrors++;`
			`}`
			`}`

			`////////////////////////////////////////////////////////////////////////////////`
			`//! Display callback`
			`////////////////////////////////////////////////////////////////////////////////`
			`void display_thisframe(float time_delta) {`
			`sdkStartTimer(&timer);`

			`// run CUDA kernel to generate vertex positions`
			`runCuda(&cuda_vbo_resource);`

			`glClear(GL_COLOR_BUFFER_BIT \| GL_DEPTH_BUFFER_BIT);`
			`// GET_GLERROR(0);`

			`// set view matrix: broken, it doesn't work in OpenGL ES! Must put into shader`
			`// glMatrixMode(GL_MODELVIEW);`
			`// glLoadIdentity();`
			`// glTranslatef(0.0, 0.0, translate_z);`
			`// glRotatef(rotate_x, 1.0, 0.0, 0.0);`
			`// glRotatef(rotate_y, 0.0, 1.0, 0.0);`

			`glDrawArrays(GL_POINTS, 0, mesh_width * mesh_height);`

			`// GET_GLERROR(0);`
			`glFinish();`
			`// GET_GLERROR(0);`

			`g_fAnim += time_delta;`

			`sdkStopTimer(&timer);`
			`computeFPS();`
			`}`

			`////////////////////////////////////////////////////////////////////////////////`
			`//! Check if the result is correct or write data to file for external`
			`//! regression testing`
			`////////////////////////////////////////////////////////////////////////////////`
			`void checkResultCuda(int argc, char **argv, const GLuint &vbo) {`
			`if (!d_vbo_buffer) {`
			`printf("%s: Mapping result buffer from OpenGL ES\n", __FUNCTION__);`

			`cudaGraphicsUnregisterResource(cuda_vbo_resource);`

			`// map buffer object`
			`glBindBuffer(GL_ARRAY_BUFFER, vbo);`
			`float data = (float )glMapBufferRange(`
			`GL_ARRAY_BUFFER, 0, mesh_width * mesh_height * 4 * sizeof(float),`
			`GL_READ_ONLY);`

			`// check result`
			`if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {`
			`// write file for regression test`
			`sdkWriteFile<float>("./data/regression.dat", data,`
			`mesh_width * mesh_height * 3, 0.0, false);`
			`}`

			`// unmap GL buffer object`
			`if (!glUnmapBuffer(GL_ARRAY_BUFFER)) {`
			`fprintf(stderr, "Unmap buffer failed.\n");`
			`fflush(stderr);`
			`}`

			`checkCudaErrors(cudaGraphicsGLRegisterBuffer(`
			`&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsWriteDiscard));`

			`GET_GLERROR(0);`
			`}`
			`}`

			`GLuint mesh_shader = 0;`

			// Loads one GLSL source file, compiles it as a shader of type `shaderType`
			// and attaches the compiled shader to `new_shaderprogram`.
			//
			//   new_shaderprogram  program object the compiled shader is attached to
			//   filename           path of the GLSL text file to read
			//   shaderType         shader kind handed to glCreateShader()
			//
			// Failures (file cannot be opened, sized or read, allocation failure) are
			// reported through error_exit(); a compile error prints the shader info log
			// and calls exit(1).
			// NOTE(review): error_exit() is assumed not to return - the pre-existing
			// missing-file check already depended on that.
			void readAndCompileShaderFromGLSLFile(GLuint new_shaderprogram,
			                                      const char *filename, GLenum shaderType) {
			  FILE *file = fopen(filename, "rb");  // open shader text file
			  if (!file) error_exit("Filename %s does not exist\n", filename);

			  /* get the size of the file and read it */
			  long file_size = -1;
			  if (fseek(file, 0, SEEK_END) == 0) file_size = ftell(file);
			  if (file_size < 0 || fseek(file, 0, SEEK_SET) != 0) {
			    fclose(file);
			    error_exit("Could not determine the size of %s\n", filename);
			  }

			  // calloc zero-fills, so the one extra byte keeps the text NUL-terminated.
			  char *data = (char *)calloc((size_t)file_size + 1, sizeof(char));
			  if (!data) {
			    fclose(file);
			    error_exit("Out of memory while reading %s\n", filename);
			  }

			  size_t bytes_read = fread(data, 1, (size_t)file_size, file);
			  bool read_failed = (ferror(file) != 0);
			  fclose(file);
			  if (read_failed) {
			    free(data);
			    error_exit("Could not read %s\n", filename);
			  }

			  // Hand GL the number of bytes that were actually read, not the size that
			  // ftell() reported.
			  GLint size = (GLint)bytes_read;

			  GLuint shader = glCreateShader(shaderType);
			  glShaderSource(shader, 1, (const GLchar **)&data, &size);
			  glCompileShader(shader);

			  GET_GLERROR(0);
			  GLint compile_success = 0;
			  glGetShaderiv(shader, GL_COMPILE_STATUS, &compile_success);
			  GET_GLERROR(0);

			  if (compile_success == GL_FALSE) {
			    printf("Compilation of %s failed!\n Reason:\n", filename);

			    GLint maxLength = 0;
			    glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &maxLength);

			    // Heap buffer instead of a variable-length array; nothing is printed
			    // when the driver reports an empty log.
			    if (maxLength > 0) {
			      char *errorLog = (char *)calloc((size_t)maxLength, sizeof(char));
			      if (errorLog) {
			        glGetShaderInfoLog(shader, maxLength, NULL, errorLog);
			        printf("%s", errorLog);
			        free(errorLog);
			      }
			    }

			    glDeleteShader(shader);
			    free(data);
			    exit(1);
			  }

			  glAttachShader(new_shaderprogram, shader);
			  // Deleting a shader that is still attached only flags it for deletion; GL
			  // releases it once it is detached from the program, so this is safe.
			  glDeleteShader(shader);

			  free(data);
			}

			// Creates a GL program object, compiles and attaches the given vertex and
			// fragment shader source files, and links the program.
			//
			//   vshader_filename  path of the vertex shader source; NULL skips the stage
			//   fshader_filename  path of the fragment shader source; NULL skips the stage
			//
			// Returns the linked program object. When linking fails, the program info
			// log is printed and the process exits with EXIT_FAILURE.
			GLuint ShaderCreate(const char *vshader_filename,
			                    const char *fshader_filename) {
			  // Either name may be NULL (see the checks below), and passing NULL to a
			  // printf "%s" conversion is undefined, so substitute a placeholder.
			  const char *vshader_label = vshader_filename ? vshader_filename : "(none)";
			  const char *fshader_label = fshader_filename ? fshader_filename : "(none)";

			  printf("Loading GLSL shaders %s %s\n", vshader_label, fshader_label);

			  GLuint new_shaderprogram = glCreateProgram();

			  GET_GLERROR(0);
			  if (vshader_filename) {
			    readAndCompileShaderFromGLSLFile(new_shaderprogram, vshader_filename,
			                                     GL_VERTEX_SHADER);
			  }

			  GET_GLERROR(0);
			  if (fshader_filename) {
			    readAndCompileShaderFromGLSLFile(new_shaderprogram, fshader_filename,
			                                     GL_FRAGMENT_SHADER);
			  }

			  GET_GLERROR(0);

			  glLinkProgram(new_shaderprogram);

			  GET_GLERROR(0);
			  // Start from "failed" so a query that leaves the value untouched is not
			  // mistaken for success.
			  GLint link_success = GL_FALSE;
			  glGetProgramiv(new_shaderprogram, GL_LINK_STATUS, &link_success);

			  if (link_success == GL_FALSE) {
			    printf("Linking of %s with %s failed!\n Reason:\n", vshader_label,
			           fshader_label);

			    // The handle is a program object, so its log must be fetched with the
			    // glGetProgram* entry points; the glGetShader* ones reject it.
			    GLint maxLength = 0;
			    glGetProgramiv(new_shaderprogram, GL_INFO_LOG_LENGTH, &maxLength);

			    if (maxLength > 0) {
			      char *errorLog = (char *)calloc((size_t)maxLength, sizeof(char));
			      if (errorLog) {
			        glGetProgramInfoLog(new_shaderprogram, maxLength, NULL, errorLog);
			        printf("%s", errorLog);
			        free(errorLog);
			      }
			    }

			    exit(EXIT_FAILURE);
			  }

			  return new_shaderprogram;
			}

			//===========================================================================
			// InitGraphicsState() - initialize OpenGL
			//
			// Prints the GL version/vendor/renderer strings, creates the mesh vertex
			// array object and vertex buffer (mesh_width * mesh_height vertices of four
			// floats each), registers the buffer with CUDA, and builds and activates the
			// mesh shader program. Exits the process when a shader file cannot be found.
			// runTest() calls this right after graphics_setup_window().
			//===========================================================================
			static void InitGraphicsState(void) {
			  // glGetString() hands back read-only strings and may return NULL on error,
			  // so keep them const and never pass NULL to a "%s" conversion.
			  const char *GL_version = (const char *)glGetString(GL_VERSION);
			  const char *GL_vendor = (const char *)glGetString(GL_VENDOR);
			  const char *GL_renderer = (const char *)glGetString(GL_RENDERER);

			  printf("Version: %s\n", GL_version ? GL_version : "(unknown)");
			  printf("Vendor: %s\n", GL_vendor ? GL_vendor : "(unknown)");
			  printf("Renderer: %s\n", GL_renderer ? GL_renderer : "(unknown)");

			  // RENDERING SETUP (OpenGL ES or OpenGL Core Profile!)
			  glGenVertexArrays(1, &mesh_vao);  // Features' Vertex Array Object allocation
			  glBindVertexArray(mesh_vao);      // bind VAO

			  // initialize buffer object
			  glGenBuffers(1, &mesh_vbo);
			  glBindBuffer(GL_ARRAY_BUFFER, mesh_vbo);

			  // storage is allocated but left uninitialized (NULL data pointer)
			  unsigned int size = mesh_width * mesh_height * 4 * sizeof(float);
			  glBufferData(GL_ARRAY_BUFFER, size, NULL, GL_DYNAMIC_DRAW);
			  glVertexAttribPointer((GLuint)0, 4, GL_FLOAT, GL_FALSE, 0, 0);
			  glEnableVertexAttribArray(0);

			  // Registration takes cudaGraphicsRegisterFlags*; the *MapFlags* constant
			  // used previously has the same value (0) but belongs to a different enum.
			  checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, mesh_vbo,
			                                               cudaGraphicsRegisterFlagsNone));
			  // glBindVertexArray(0); // keep above Vertex Array Object bound (it's the
			  // only one throughout)

			  // GLSL stuff
			  char *vertex_shader_path = sdkFindFilePath("mesh.vert.glsl", pArgv[0]);
			  char *fragment_shader_path = sdkFindFilePath("mesh.frag.glsl", pArgv[0]);

			  if (vertex_shader_path == NULL || fragment_shader_path == NULL) {
			    printf("Error finding shader file\n");
			    exit(EXIT_FAILURE);
			  }

			  mesh_shader = ShaderCreate(vertex_shader_path, fragment_shader_path);
			  GET_GLERROR(0);

			  free(vertex_shader_path);
			  free(fragment_shader_path);

			  glUseProgram(mesh_shader);
			}

			////////////////////////////////////////////////////////////////////////////////
			//! Handle one X11 event of the interactive viewer
			//! Updates gui_mode, the remembered mouse position and the rotate/translate
			//! state from button and motion events, and logs what happened.
			//! @param event  event previously fetched with XNextEvent()
			//! @return false when ESC (character 27) was pressed, true otherwise
			////////////////////////////////////////////////////////////////////////////////
			static bool processX11Event(XEvent &event) {
			  if (event.type == Expose && event.xexpose.count == 0) {
			    printf("Redraw requested!\n");
			  }

			  if (event.type == KeyPress) {
			    KeySym key;
			    char text[255];

			    if (XLookupString(&event.xkey, text, 255, &key, 0) == 1) {
			      if (text[0] == 27) return false;  // ESC: leave the render loop

			      printf("You pressed the %c key!\n", text[0]);
			    }
			  }

			  if (event.type == ButtonPress) {
			    printf("Mouse button %d press at (%d,%d)\n", event.xbutton.button,
			           event.xbutton.x, event.xbutton.y);

			    if (event.xbutton.button == Button1) gui_mode = GUI_TRANSLATE;
			    if (event.xbutton.button == Button3) gui_mode = GUI_ROTATE;
			    mouse_old_x = event.xbutton.x;
			    mouse_old_y = event.xbutton.y;
			  }

			  if (event.type == ButtonRelease) {
			    printf("Mouse button %d released at (%d,%d)\n", event.xbutton.button,
			           event.xbutton.x, event.xbutton.y);

			    gui_mode = GUI_IDLE;
			    mouse_old_x = event.xbutton.x;
			    mouse_old_y = event.xbutton.y;
			  }

			  if (event.type == MotionNotify) {
			    // printf("Mouse motion towards %d %d, GUI mode is 0x%x\n",
			    // event.xmotion.x, event.xmotion.y, gui_mode);
			    float dx = (float)(event.xmotion.x - mouse_old_x);
			    float dy = (float)(event.xmotion.y - mouse_old_y);

			    if (gui_mode == GUI_ROTATE) {
			      rotate_x += dy * 0.2f;
			      rotate_y += dx * 0.2f;
			      printf("rot x %f y %f\n", rotate_x, rotate_y);
			    }
			    if (gui_mode == GUI_TRANSLATE) {
			      translate_z += dy * 0.01f;
			      printf("translate z %f\n", translate_z);
			    }

			    mouse_old_x = event.xmotion.x;
			    mouse_old_y = event.xmotion.y;
			  }

			  return true;
			}

			////////////////////////////////////////////////////////////////////////////////
			//! Run a simple test for CUDA
			//! With a reference file the test runs without a window: a device buffer is
			//! allocated, runAutoTest() and checkResultCuda() are called, and the buffer
			//! is freed. Without one, an X11 window with an OpenGL ES context is opened
			//! and frames are rendered until ESC is pressed.
			//! @param argc      number of command-line arguments
			//! @param argv      command-line arguments
			//! @param ref_file  reference file name, or NULL for interactive mode
			//! @return always true
			////////////////////////////////////////////////////////////////////////////////
			bool runTest(int argc, char **argv, char *ref_file) {
			  // Create the CUTIL timer
			  sdkCreateTimer(&timer);

			  int devID = 0;
			#if defined(__aarch64__) || defined(__arm__)
			  // find iGPU on the system which is compute capable which will perform
			  // GLES-CUDA interop
			  devID = findIntegratedGPU();
			#else
			  // use command-line specified CUDA device, otherwise use device with highest
			  // Gflops/s
			  devID = findCudaDevice(argc, (const char **)argv);
			#endif

			  // command line mode only
			  if (ref_file != NULL) {
			    // create VBO (a plain device allocation stands in for it in this mode)
			    checkCudaErrors(cudaMalloc((void **)&d_vbo_buffer,
			                               mesh_width * mesh_height * 4 * sizeof(float)));

			    // run the cuda part
			    runAutoTest(devID, argv, ref_file);

			    // check result of Cuda step
			    checkResultCuda(argc, argv, mesh_vbo);

			    checkCudaErrors(cudaFree(d_vbo_buffer));
			    d_vbo_buffer = NULL;
			  } else {
			    // this would use command-line specified CUDA device, note that CUDA
			    // defaults to highest Gflops/s device
			    if (checkCmdLineFlag(argc, (const char **)argv, "device"))
			      error_exit("Device setting not yet implemented!\n");

			    // create X11 window and set up associated OpenGL ES context
			    graphics_setup_window(0, 0, window_width, window_height, sSDKsample);

			    InitGraphicsState();  // set up GLES stuff

			    glClearColor(0, 0.5, 1, 1);  // blue-ish background
			    glClear(GL_COLOR_BUFFER_BIT);

			    // printf("WP%d\n", __LINE__);
			    graphics_swap_buffers();

			    // NOTE(review): `frame` is never advanced (its only increment lives in
			    // the commented-out printf below), so the bound never triggers and the
			    // loop ends only when ESC is pressed. Left unchanged on purpose.
			    int frame = 0;

			    while (frame < 100000) {
			      // at most one pending event is handled per rendered frame
			      if (XPending(display)) {
			        XEvent event;
			        XNextEvent(display, &event);

			        if (!processX11Event(event)) break;
			      }

			      display_thisframe(0.010);
			      usleep(1000);  // need not take full CPU and GPU

			      graphics_swap_buffers();
			      // printf("frame %d\n",frame++);
			    }

			    // NOTE: Before destroying OpenGL ES context, must unregister all shared
			    // resources from CUDA !
			    // A failure is reported but does not abort, so the window is still
			    // closed cleanly below.
			    cudaError_t unregister_status =
			        cudaGraphicsUnregisterResource(cuda_vbo_resource);
			    if (unregister_status != cudaSuccess) {
			      fprintf(stderr, "cudaGraphicsUnregisterResource failed: %s\n",
			              cudaGetErrorString(unregister_status));
			    }

			    graphics_close_window();  // close window and destroy OpenGL ES context

			    sdkDeleteTimer(&timer);
			  }

			  return true;
			}

			////////////////////////////////////////////////////////////////////////////////
			// Program main
			//
			// Publishes argc/argv through pArgc/pArgv, picks up an optional reference
			// file name from the "file" command-line flag, runs the test and exits with
			// EXIT_SUCCESS when g_TotalErrors is zero, EXIT_FAILURE otherwise.
			////////////////////////////////////////////////////////////////////////////////
			int main(int argc, char **argv) {
			  pArgc = &argc;
			  pArgv = argv;

			#if defined(__linux__)
			  // overwrite flag is 0: an already-set DISPLAY is left untouched
			  setenv("DISPLAY", ":0", 0);
			#endif

			  printf("%s starting...\n", sSDKsample);

			  const char **cmdline = (const char **)argv;
			  char *reference_name = NULL;

			  // In this mode, we run without OpenGL and see if VBO is generated
			  // correctly
			  if (argc > 1 && checkCmdLineFlag(argc, cmdline, "file")) {
			    getCmdLineArgumentString(argc, cmdline, "file", &reference_name);
			  }

			  printf("\n");

			  runTest(argc, argv, reference_name);

			  const bool all_passed = (g_TotalErrors == 0);
			  printf("%s completed, returned %s\n", sSDKsample,
			         all_passed ? "OK" : "ERROR!");

			  exit(all_passed ? EXIT_SUCCESS : EXIT_FAILURE);
			}