/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Bicubic texture filtering sample sgreen 6/2008 This sample demonstrates how to efficiently implement bicubic texture filtering in CUDA. Bicubic filtering is a higher order interpolation method that produces smoother results than bilinear interpolation: http://en.wikipedia.org/wiki/Bicubic It requires reading a 4 x 4 pixel neighbourhood rather than the 2 x 2 area required by bilinear filtering. Current graphics hardware doesn't support bicubic filtering natively, but it is possible to compose a bicubic filter using just 4 bilinear lookups by offsetting the sample position within each texel and weighting the samples correctly. The only disadvantage to this method is that the hardware only maintains 9-bits of filtering precision within each texel. See "Fast Third-Order Texture Filtering", Sigg & Hadwiger, GPU Gems 2: https://developer.nvidia.com/gpugems/gpugems2/part-iii-high-quality-rendering/chapter-20-fast-third-order-texture-filtering v1.1 - updated to include the brute force method using 16 texture lookups. v1.2 - added Catmull-Rom interpolation Example performance results from GeForce 8800 GTS: Bilinear - 5500 MPixels/sec Bicubic - 1400 MPixels/sec Fast Bicubic - 2100 MPixels/sec */ // OpenGL Graphics includes #include #if defined(__APPLE__) || defined(MACOSX) #pragma clang diagnostic ignored "-Wdeprecated-declarations" #include #else #include #endif // Includes #include #include #include #include // CUDA system and GL includes #include #include // Helper functions #include // CUDA SDK Helper functions #include // CUDA device initialization helper functions typedef unsigned int uint; typedef unsigned char uchar; #define USE_BUFFER_TEX 0 #ifndef MAX #define MAX(a, b) ((a < b) ? b : a) #endif // Auto-Verification Code const int frameCheckNumber = 4; int fpsCount = 0; // FPS count for averaging int fpsLimit = 4; // FPS limit for sampling int g_Index = 0; unsigned int frameCount = 0; unsigned int g_TotalErrors = 0; StopWatchInterface *timer = 0; bool g_Verify = false; int *pArgc = NULL; char **pArgv = NULL; #define MAX_EPSILON_ERROR 5.0f #define REFRESH_DELAY 10 // ms static const char *sSDKsample = "CUDA BicubicTexture"; // Define the files that are to be save and the reference images for validation const char *sFilterMode[] = {"Nearest", "Bilinear", "Bicubic", "Fast Bicubic", "Catmull-Rom", NULL}; const char *sOriginal[] = {"0_nearest.ppm", "1_bilinear.ppm", "2_bicubic.ppm", "3_fastbicubic.ppm", "4_catmull-rom.ppm", NULL}; const char *sReference[] = {"0_nearest.ppm", "1_bilinear.ppm", "2_bicubic.ppm", "3_fastbicubic.ppm", "4_catmull-rom.ppm", NULL}; const char *srcImageFilename = "lena_bw.pgm"; char *dumpFilename = NULL; uint width = 512, height = 512; uint imageWidth, imageHeight; dim3 blockSize(16, 16); dim3 gridSize(width / blockSize.x, height / blockSize.y); enum eFilterMode { MODE_NEAREST, MODE_BILINEAR, MODE_BICUBIC, MODE_FAST_BICUBIC, MODE_CATMULL_ROM, NUM_MODES }; eFilterMode g_FilterMode = MODE_FAST_BICUBIC; bool drawCurves = false; GLuint pbo = 0; // OpenGL pixel buffer object struct cudaGraphicsResource *cuda_pbo_resource; // handles OpenGL-CUDA exchange GLuint displayTex = 0; GLuint bufferTex = 0; GLuint fprog; // fragment program (shader) float tx = 9.0f, ty = 10.0f; // image translation float scale = 1.0f / 16.0f; // image scale float cx, cy; // image centre void display(); void initGLBuffers(); void runBenchmark(int iterations); void cleanup(); #define GL_TEXTURE_TYPE GL_TEXTURE_RECTANGLE_ARB //#define GL_TEXTURE_TYPE GL_TEXTURE_2D extern "C" void initGL(int *argc, char **argv); extern "C" void loadImageData(int argc, char **argv); extern "C" void initTexture(int imageWidth, int imageHeight, uchar *h_data); extern "C" void freeTexture(); extern "C" void render(int width, int height, float tx, float ty, float scale, float cx, float cy, dim3 blockSize, dim3 gridSize, eFilterMode filter_mode, uchar4 *output); // w0, w1, w2, and w3 are the four cubic B-spline basis functions float bspline_w0(float a) { return (1.0f / 6.0f) * (-a * a * a + 3.0f * a * a - 3.0f * a + 1.0f); } float bspline_w1(float a) { return (1.0f / 6.0f) * (3.0f * a * a * a - 6.0f * a * a + 4.0f); } float bspline_w2(float a) { return (1.0f / 6.0f) * (-3.0f * a * a * a + 3.0f * a * a + 3.0f * a + 1.0f); } __host__ __device__ float bspline_w3(float a) { return (1.0f / 6.0f) * (a * a * a); } void computeFPS() { frameCount++; fpsCount++; if (fpsCount == fpsLimit - 1) { g_Verify = true; } if (fpsCount == fpsLimit) { char fps[256]; float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); sprintf(fps, "%s %s <%s>: %3.1f fps", "", sSDKsample, sFilterMode[g_FilterMode], ifps); glutSetWindowTitle(fps); fpsCount = 0; sdkResetTimer(&timer); } } void plotCurve(float (*func)(float)) { const int steps = 100; glBegin(GL_LINE_STRIP); for (int i = 0; i < steps; i++) { float x = i / (float)(steps - 1); glVertex2f(x, func(x)); } glEnd(); } // display results using OpenGL (called by GLUT) void display() { sdkStartTimer(&timer); // map PBO to get CUDA device pointer uchar4 *d_output; checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); size_t num_bytes; checkCudaErrors(cudaGraphicsResourceGetMappedPointer( (void **)&d_output, &num_bytes, cuda_pbo_resource)); render(imageWidth, imageHeight, tx, ty, scale, cx, cy, blockSize, gridSize, g_FilterMode, d_output); checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); // Common display path { // display results glClear(GL_COLOR_BUFFER_BIT); #if USE_BUFFER_TEX // display using buffer texture glBindTexture(GL_TEXTURE_BUFFER_EXT, bufferTex); glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, fprog); glEnable(GL_FRAGMENT_PROGRAM_ARB); glProgramLocalParameterI4iNV(GL_FRAGMENT_PROGRAM_ARB, 0, width, 0, 0, 0); #else // download image from PBO to OpenGL texture glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); glBindTexture(GL_TEXTURE_TYPE, displayTex); glPixelStorei(GL_UNPACK_ALIGNMENT, 1); glTexSubImage2D(GL_TEXTURE_TYPE, 0, 0, 0, width, height, GL_BGRA, GL_UNSIGNED_BYTE, 0); glEnable(GL_TEXTURE_TYPE); #endif // draw textured quad glDisable(GL_DEPTH_TEST); glBegin(GL_QUADS); glTexCoord2f(0.0f, (GLfloat)height); glVertex2f(0.0f, 0.0f); glTexCoord2f((GLfloat)width, (GLfloat)height); glVertex2f(1.0f, 0.0f); glTexCoord2f((GLfloat)width, 0.0f); glVertex2f(1.0f, 1.0f); glTexCoord2f(0.0f, 0.0f); glVertex2f(0.0f, 1.0f); glEnd(); glDisable(GL_TEXTURE_TYPE); glDisable(GL_FRAGMENT_PROGRAM_ARB); glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); if (drawCurves) { // draw spline curves glPushMatrix(); glScalef(0.25, 0.25, 1.0); glTranslatef(0.0, 2.0, 0.0); glColor3f(1.0, 0.0, 0.0); plotCurve(bspline_w3); glTranslatef(1.0, 0.0, 0.0); glColor3f(0.0, 1.0, 0.0); plotCurve(bspline_w2); glTranslatef(1.0, 0.0, 0.0); glColor3f(0.0, 0.0, 1.0); plotCurve(bspline_w1); glTranslatef(1.0, 0.0, 0.0); glColor3f(1.0, 0.0, 1.0); plotCurve(bspline_w0); glPopMatrix(); glColor3f(1.0, 1.0, 1.0); } } glutSwapBuffers(); glutReportErrors(); sdkStopTimer(&timer); computeFPS(); } // GLUT callback functions void timerEvent(int value) { if (glutGetWindow()) { glutPostRedisplay(); glutTimerFunc(REFRESH_DELAY, timerEvent, 0); } } void keyboard(unsigned char key, int /*x*/, int /*y*/) { switch (key) { case 27: #if defined(__APPLE__) || defined(MACOSX) exit(EXIT_SUCCESS); #else glutDestroyWindow(glutGetWindow()); return; #endif case '1': g_FilterMode = MODE_NEAREST; break; case '2': g_FilterMode = MODE_BILINEAR; break; case '3': g_FilterMode = MODE_BICUBIC; break; case '4': g_FilterMode = MODE_FAST_BICUBIC; break; case '5': g_FilterMode = MODE_CATMULL_ROM; break; case '=': case '+': scale *= 0.5f; break; case '-': scale *= 2.0f; break; case 'r': scale = 1.0f; tx = ty = 0.0f; break; case 'd': printf("%f, %f, %f\n", tx, ty, scale); break; case 'b': runBenchmark(500); break; case 'c': drawCurves ^= 1; break; default: break; } if (key >= '1' && key <= '5') { printf("> FilterMode[%d] = %s\n", g_FilterMode + 1, sFilterMode[g_FilterMode]); } } int ox, oy; int buttonState = 0; void mouse(int button, int state, int x, int y) { if (state == GLUT_DOWN) { buttonState |= 1 << button; } else if (state == GLUT_UP) { buttonState = 0; } ox = x; oy = y; } void motion(int x, int y) { float dx, dy; dx = (float)(x - ox); dy = (float)(y - oy); if (buttonState & 1) { // left = translate tx -= dx * scale; ty -= dy * scale; } else if (buttonState & 2) { // middle = zoom scale -= dy / 1000.0f; } ox = x; oy = y; } void reshape(int x, int y) { width = x; height = y; imageWidth = width; imageHeight = height; initGLBuffers(); glViewport(0, 0, x, y); glMatrixMode(GL_MODELVIEW); glLoadIdentity(); glMatrixMode(GL_PROJECTION); glLoadIdentity(); glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); } void cleanup() { freeTexture(); checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_resource)); glDeleteBuffers(1, &pbo); #if USE_BUFFER_TEX glDeleteTextures(1, &bufferTex); glDeleteProgramsARB(1, &fprog); #else glDeleteTextures(1, &displayTex); #endif sdkDeleteTimer(&timer); } int iDivUp(int a, int b) { return (a % b != 0) ? (a / b + 1) : (a / b); } void initGLBuffers() { if (pbo) { // delete old buffer checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_resource)); glDeleteBuffers(1, &pbo); } // create pixel buffer object for display glGenBuffers(1, &pbo); glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, width * height * sizeof(uchar4), 0, GL_STREAM_DRAW_ARB); glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); checkCudaErrors(cudaGraphicsGLRegisterBuffer( &cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard)); #if USE_BUFFER_TEX // create buffer texture, attach to pbo if (bufferTex) { glDeleteTextures(1, &bufferTex); } glGenTextures(1, &bufferTex); glBindTexture(GL_TEXTURE_BUFFER_EXT, bufferTex); glTexBufferEXT(GL_TEXTURE_BUFFER_EXT, GL_RGBA8, pbo); glBindTexture(GL_TEXTURE_BUFFER_EXT, 0); #else // create texture for display if (displayTex) { glDeleteTextures(1, &displayTex); } glGenTextures(1, &displayTex); glBindTexture(GL_TEXTURE_TYPE, displayTex); glTexImage2D(GL_TEXTURE_TYPE, 0, GL_RGBA8, width, height, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); glTexParameteri(GL_TEXTURE_TYPE, GL_TEXTURE_MIN_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_TYPE, GL_TEXTURE_MAG_FILTER, GL_NEAREST); glBindTexture(GL_TEXTURE_TYPE, 0); #endif // calculate new grid size gridSize = dim3(iDivUp(width, blockSize.x), iDivUp(height, blockSize.y)); } void mainMenu(int i) { keyboard(i, 0, 0); } void initMenus() { glutCreateMenu(mainMenu); glutAddMenuEntry("Nearest [1]", '1'); glutAddMenuEntry("Bilinear [2]", '2'); glutAddMenuEntry("Bicubic [3]", '3'); glutAddMenuEntry("Fast Bicubic [4]", '4'); glutAddMenuEntry("Catmull-Rom [5]", '5'); glutAddMenuEntry("Zoom in [=]", '='); glutAddMenuEntry("Zoom out [-]", '-'); glutAddMenuEntry("Benchmark [b]", 'b'); glutAddMenuEntry("DrawCurves [c]", 'c'); glutAddMenuEntry("Quit [esc]", 27); glutAttachMenu(GLUT_RIGHT_BUTTON); } void runBenchmark(int iterations) { printf("[%s] (Benchmark Mode)\n", sSDKsample); sdkCreateTimer(&timer); uchar4 *d_output; checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); size_t num_bytes; checkCudaErrors(cudaGraphicsResourceGetMappedPointer( (void **)&d_output, &num_bytes, cuda_pbo_resource)); sdkStartTimer(&timer); for (int i = 0; i < iterations; ++i) { render(imageWidth, imageHeight, tx, ty, scale, cx, cy, blockSize, gridSize, g_FilterMode, d_output); } cudaDeviceSynchronize(); sdkStopTimer(&timer); float time = sdkGetTimerValue(&timer) / (float)iterations; checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); printf("time: %0.3f ms, %f Mpixels/sec\n", time, (width * height / (time * 0.001f)) / 1e6); } void runAutoTest(int argc, char **argv, const char *dump_filename, eFilterMode filter_mode) { cudaDeviceProp deviceProps; int devID = findCudaDevice(argc, (const char **)argv); checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); printf("[%s] (automated testing w/ readback)\n", sSDKsample); printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount); loadImageData(argc, argv); uchar4 *d_output; checkCudaErrors(cudaMalloc((void **)&d_output, imageWidth * imageHeight * 4)); unsigned int *h_result = (unsigned int *)malloc(width * height * sizeof(unsigned int)); printf("AutoTest: %s Filter Mode: <%s>\n", sSDKsample, sFilterMode[g_FilterMode]); render(imageWidth, imageHeight, tx, ty, scale, cx, cy, blockSize, gridSize, filter_mode, d_output); // check if kernel execution generated an error getLastCudaError("Error: render (bicubicTexture) Kernel execution FAILED"); checkCudaErrors(cudaDeviceSynchronize()); cudaMemcpy(h_result, d_output, imageWidth * imageHeight * 4, cudaMemcpyDeviceToHost); sdkSavePPM4ub(dump_filename, (unsigned char *)h_result, imageWidth, imageHeight); checkCudaErrors(cudaFree(d_output)); free(h_result); } #if USE_BUFFER_TEX // fragment program for reading from buffer texture static const char *shaderCode = "!!NVfp4.0\n" "INT PARAM width = program.local[0];\n" "INT TEMP index;\n" "FLR.S index, fragment.texcoord;\n" "MAD.S index.x, index.y, width, index.x;\n" // compute 1D index from 2D // coords "TXF result.color, index.x, texture[0], BUFFER;\n" "END"; #endif GLuint compileASMShader(GLenum program_type, const char *code) { GLuint program_id; glGenProgramsARB(1, &program_id); glBindProgramARB(program_type, program_id); glProgramStringARB(program_type, GL_PROGRAM_FORMAT_ASCII_ARB, (GLsizei)strlen(code), (GLubyte *)code); GLint error_pos; glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &error_pos); if (error_pos != -1) { const GLubyte *error_string; error_string = glGetString(GL_PROGRAM_ERROR_STRING_ARB); fprintf(stderr, "Program error at position: %d\n%s\n", (int)error_pos, error_string); return 0; } return program_id; } void initialize(int argc, char **argv) { printf("[%s] (OpenGL Mode)\n", sSDKsample); initGL(&argc, argv); // use command-line specified CUDA device, otherwise use device with highest // Gflops/s int devID = findCudaDevice(argc, (const char **)argv); // get number of SMs on this GPU cudaDeviceProp deviceProps; checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount); // Create the timer (for fps measurement) sdkCreateTimer(&timer); // load image from disk loadImageData(argc, argv); printf( "\n" "\tControls\n" "\t=/- : Zoom in/out\n" "\tb : Run Benchmark g_FilterMode\n" "\tc : Draw Bicubic Spline Curve\n" "\t[esc] - Quit\n\n" "\tPress number keys to change filtering g_FilterMode:\n\n" "\t1 : nearest filtering\n" "\t2 : bilinear filtering\n" "\t3 : bicubic filtering\n" "\t4 : fast bicubic filtering\n" "\t5 : Catmull-Rom filtering\n\n"); initGLBuffers(); #if USE_BUFFER_TEX fprog = compileASMShader(GL_FRAGMENT_PROGRAM_ARB, shaderCode); if (!fprog) { exit(EXIT_SUCCESS); } #endif } void initGL(int *argc, char **argv) { // initialize GLUT callback functions glutInit(argc, argv); glutInitDisplayMode(GLUT_RGBA | GLUT_ALPHA | GLUT_DOUBLE | GLUT_DEPTH); glutInitWindowSize(width, height); glutCreateWindow("CUDA bicubic texture filtering"); glutDisplayFunc(display); glutKeyboardFunc(keyboard); glutMouseFunc(mouse); glutMotionFunc(motion); glutReshapeFunc(reshape); glutTimerFunc(REFRESH_DELAY, timerEvent, 0); #if defined(__APPLE__) || defined(MACOSX) atexit(cleanup); #else glutCloseFunc(cleanup); #endif initMenus(); if (!isGLVersionSupported(2, 0) || !areGLExtensionsSupported("GL_ARB_pixel_buffer_object")) { fprintf(stderr, "Required OpenGL extensions are missing."); exit(EXIT_FAILURE); } #if USE_BUFFER_TEX if (!areGLExtensionsSupported("GL_EXT_texture_buffer_object")) { fprintf(stderr, "OpenGL extension: GL_EXT_texture_buffer_object missing.\n"); exit(EXIT_FAILURE); } if (!areGLExtensionsSupported("GL_NV_gpu_program4")) { fprintf(stderr, "OpenGL extension: GL_NV_gpu_program4 missing.\n"); exit(EXIT_FAILURE); } #endif } void loadImageData(int argc, char **argv) { // load image from disk uchar *h_data = NULL; char *srcImagePath = NULL; if ((srcImagePath = sdkFindFilePath(srcImageFilename, argv[0])) == NULL) { printf("bicubicTexture loadImageData() could not find <%s>\nExiting...\n", srcImageFilename); exit(EXIT_FAILURE); } sdkLoadPGM(srcImagePath, &h_data, &imageWidth, &imageHeight); printf("Loaded '%s', %d x %d pixels\n", srcImageFilename, imageWidth, imageHeight); cx = imageWidth * 0.5f; cy = imageHeight * 0.5f; // initialize texture initTexture(imageWidth, imageHeight, h_data); } void printHelp() { printf("bicubicTexture Usage:\n"); printf("\t-file=output.ppm (output file to save to disk)\n"); printf( "\t-mode=n (0=Nearest, 1=Bilinear, 2=Bicubic, 3=Fast-Bicubic, " "4=Catmull-Rom\n"); } //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { pArgc = &argc; pArgv = argv; // parse arguments char *filename; #if defined(__linux__) setenv("DISPLAY", ":0", 0); #endif printf("Starting bicubicTexture\n"); if (checkCmdLineFlag(argc, (const char **)argv, "help")) { printHelp(); exit(EXIT_SUCCESS); } if (checkCmdLineFlag(argc, (const char **)argv, "mode")) { g_FilterMode = (eFilterMode)getCmdLineArgumentInt(argc, (const char **)argv, "mode"); if (g_FilterMode < 0 || g_FilterMode >= NUM_MODES) { printf("Invalid Mode setting %d\n", g_FilterMode); exit(EXIT_FAILURE); } } if (getCmdLineArgumentString(argc, (const char **)argv, "file", &filename)) { dumpFilename = filename; fpsLimit = frameCheckNumber; // Running CUDA kernel (bicubicFiltering) without visualization (QA // Testing/Verification) runAutoTest(argc, argv, (const char *)dumpFilename, g_FilterMode); } else { // This runs the CUDA kernel (bicubicFiltering) + OpenGL visualization initialize(argc, argv); glutMainLoop(); } exit(EXIT_SUCCESS); }