cuda-samples/Samples/FunctionPointers/FunctionPointers_kernels.cu

403 lines
17 KiB
Plaintext
Raw Normal View History

2021-10-21 19:04:49 +08:00
/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdio.h>
#include <stdlib.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#include <helper_cuda.h>
#include "FunctionPointers_kernels.h"
// Texture object for reading image
cudaTextureObject_t tex;
extern __shared__ unsigned char LocalBlock[];
static cudaArray *array = NULL;
#define RADIUS 1
// pixel value used for thresholding function,
// works well with sample image 'lena'
#define THRESHOLD 150.0f
#ifdef FIXED_BLOCKWIDTH
#define BlockWidth 80
#define SharedPitch 384
#endif
// A function pointer can be declared explicitly like this line:
//__device__ unsigned char (*pointFunction)(unsigned char, float ) = NULL;
// or by using typedef's like below:
typedef unsigned char (*blockFunction_t)(unsigned char, unsigned char,
unsigned char, unsigned char,
unsigned char, unsigned char,
unsigned char, unsigned char,
unsigned char, float);
typedef unsigned char (*pointFunction_t)(unsigned char, float);
__device__ blockFunction_t blockFunction;
__device__ unsigned char ComputeSobel(unsigned char ul, // upper left
unsigned char um, // upper middle
unsigned char ur, // upper right
unsigned char ml, // middle left
unsigned char mm, // middle (unused)
unsigned char mr, // middle right
unsigned char ll, // lower left
unsigned char lm, // lower middle
unsigned char lr, // lower right
float fScale) {
short Horz = ur + 2 * mr + lr - ul - 2 * ml - ll;
short Vert = ul + 2 * um + ur - ll - 2 * lm - lr;
short Sum = (short)(fScale * (abs((int)Horz) + abs((int)Vert)));
return (unsigned char)((Sum < 0) ? 0 : ((Sum > 255) ? 255 : Sum));
}
// define a function pointer and initialize to NULL
__device__ unsigned char (*varFunction)(unsigned char, unsigned char,
unsigned char, unsigned char,
unsigned char, unsigned char,
unsigned char, unsigned char,
unsigned char, float x) = NULL;
__device__ unsigned char ComputeBox(unsigned char ul, // upper left
unsigned char um, // upper middle
unsigned char ur, // upper right
unsigned char ml, // middle left
unsigned char mm, // middle...middle
unsigned char mr, // middle right
unsigned char ll, // lower left
unsigned char lm, // lower middle
unsigned char lr, // lower right
float fscale) {
short Sum = (short)(ul + um + ur + ml + mm + mr + ll + lm + lr) / 9;
Sum *= fscale;
return (unsigned char)((Sum < 0) ? 0 : ((Sum > 255) ? 255 : Sum));
}
__device__ unsigned char Threshold(unsigned char in, float thresh) {
if (in > thresh) {
return 0xFF;
} else {
return 0;
}
}
// Declare function tables, one for the point function chosen, one for the
// block function chosen. The number of entries is determined by the
// enum in FunctionPointers_kernels.h
__device__ blockFunction_t blockFunction_table[LAST_BLOCK_FILTER];
__device__ pointFunction_t pointFunction_table[LAST_POINT_FILTER];
// Declare device side function pointers. We retrieve them later with
// cudaMemcpyFromSymbol to set our function tables above in some
// particular order specified at runtime.
__device__ blockFunction_t pComputeSobel = ComputeSobel;
__device__ blockFunction_t pComputeBox = ComputeBox;
__device__ pointFunction_t pComputeThreshold = Threshold;
// Allocate host side tables to mirror the device side, and later, we
// fill these tables with the function pointers. This lets us send
// the pointers to the kernel on invocation, as a method of choosing
// which function to run.
blockFunction_t h_blockFunction_table[2];
pointFunction_t h_pointFunction_table[2];
// Perform a filter operation on the data, using shared memory
// The actual operation performed is
// determined by the function pointer "blockFunction" and selected
// by the integer argument "blockOperation" and has access
// to an apron around the current pixel being processed.
// Following the block operation, a per-pixel operation,
// pointed to by pPointFunction is performed before the final
// pixel is produced.
__global__ void SobelShared(uchar4 *pSobelOriginal, unsigned short SobelPitch,
#ifndef FIXED_BLOCKWIDTH
short BlockWidth, short SharedPitch,
#endif
short w, short h, float fScale, int blockOperation,
pointFunction_t pPointFunction,
cudaTextureObject_t tex) {
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
short u = 4 * blockIdx.x * BlockWidth;
short v = blockIdx.y * blockDim.y + threadIdx.y;
short ib;
int SharedIdx = threadIdx.y * SharedPitch;
for (ib = threadIdx.x; ib < BlockWidth + 2 * RADIUS; ib += blockDim.x) {
LocalBlock[SharedIdx + 4 * ib + 0] = tex2D<unsigned char>(
tex, (float)(u + 4 * ib - RADIUS + 0), (float)(v - RADIUS));
LocalBlock[SharedIdx + 4 * ib + 1] = tex2D<unsigned char>(
tex, (float)(u + 4 * ib - RADIUS + 1), (float)(v - RADIUS));
LocalBlock[SharedIdx + 4 * ib + 2] = tex2D<unsigned char>(
tex, (float)(u + 4 * ib - RADIUS + 2), (float)(v - RADIUS));
LocalBlock[SharedIdx + 4 * ib + 3] = tex2D<unsigned char>(
tex, (float)(u + 4 * ib - RADIUS + 3), (float)(v - RADIUS));
}
if (threadIdx.y < RADIUS * 2) {
//
// copy trailing RADIUS*2 rows of pixels into shared
//
SharedIdx = (blockDim.y + threadIdx.y) * SharedPitch;
for (ib = threadIdx.x; ib < BlockWidth + 2 * RADIUS; ib += blockDim.x) {
LocalBlock[SharedIdx + 4 * ib + 0] =
tex2D<unsigned char>(tex, (float)(u + 4 * ib - RADIUS + 0),
(float)(v + blockDim.y - RADIUS));
LocalBlock[SharedIdx + 4 * ib + 1] =
tex2D<unsigned char>(tex, (float)(u + 4 * ib - RADIUS + 1),
(float)(v + blockDim.y - RADIUS));
LocalBlock[SharedIdx + 4 * ib + 2] =
tex2D<unsigned char>(tex, (float)(u + 4 * ib - RADIUS + 2),
(float)(v + blockDim.y - RADIUS));
LocalBlock[SharedIdx + 4 * ib + 3] =
tex2D<unsigned char>(tex, (float)(u + 4 * ib - RADIUS + 3),
(float)(v + blockDim.y - RADIUS));
}
}
cg::sync(cta);
u >>= 2; // index as uchar4 from here
uchar4 *pSobel = (uchar4 *)(((char *)pSobelOriginal) + v * SobelPitch);
SharedIdx = threadIdx.y * SharedPitch;
blockFunction = blockFunction_table[blockOperation];
for (ib = threadIdx.x; ib < BlockWidth; ib += blockDim.x) {
uchar4 out;
unsigned char pix00 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 0];
unsigned char pix01 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 1];
unsigned char pix02 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 2];
unsigned char pix10 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 0];
unsigned char pix11 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 1];
unsigned char pix12 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 2];
unsigned char pix20 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 0];
unsigned char pix21 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 1];
unsigned char pix22 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 2];
out.x = (*blockFunction)(pix00, pix01, pix02, pix10, pix11, pix12, pix20,
pix21, pix22, fScale);
pix00 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 3];
pix10 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 3];
pix20 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 3];
out.y = (*blockFunction)(pix01, pix02, pix00, pix11, pix12, pix10, pix21,
pix22, pix20, fScale);
pix01 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 4];
pix11 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 4];
pix21 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 4];
out.z = (*blockFunction)(pix02, pix00, pix01, pix12, pix10, pix11, pix22,
pix20, pix21, fScale);
pix02 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 5];
pix12 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 5];
pix22 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 5];
out.w = (*blockFunction)(pix00, pix01, pix02, pix10, pix11, pix12, pix20,
pix21, pix22, fScale);
if (pPointFunction != NULL) {
out.x = (*pPointFunction)(out.x, THRESHOLD);
out.y = (*pPointFunction)(out.y, THRESHOLD);
out.z = (*pPointFunction)(out.z, THRESHOLD);
out.w = (*pPointFunction)(out.w, THRESHOLD);
}
if (u + ib < w / 4 && v < h) {
pSobel[u + ib] = out;
}
}
cg::sync(cta);
}
__global__ void SobelCopyImage(Pixel *pSobelOriginal, unsigned int Pitch, int w,
int h, float fscale, cudaTextureObject_t tex) {
unsigned char *pSobel =
(unsigned char *)(((char *)pSobelOriginal) + blockIdx.x * Pitch);
for (int i = threadIdx.x; i < w; i += blockDim.x) {
pSobel[i] = min(
max((tex2D<unsigned char>(tex, (float)i, (float)blockIdx.x) * fscale),
0.f),
255.f);
}
}
// Perform block and pointer filtering using texture lookups.
// The block and point operations are determined by the
// input argument (see comment above for "SobelShared" function)
__global__ void SobelTex(Pixel *pSobelOriginal, unsigned int Pitch, int w,
int h, float fScale, int blockOperation,
pointFunction_t pPointOperation,
cudaTextureObject_t tex) {
unsigned char *pSobel =
(unsigned char *)(((char *)pSobelOriginal) + blockIdx.x * Pitch);
unsigned char tmp = 0;
for (int i = threadIdx.x; i < w; i += blockDim.x) {
unsigned char pix00 =
tex2D<unsigned char>(tex, (float)i - 1, (float)blockIdx.x - 1);
unsigned char pix01 =
tex2D<unsigned char>(tex, (float)i + 0, (float)blockIdx.x - 1);
unsigned char pix02 =
tex2D<unsigned char>(tex, (float)i + 1, (float)blockIdx.x - 1);
unsigned char pix10 =
tex2D<unsigned char>(tex, (float)i - 1, (float)blockIdx.x + 0);
unsigned char pix11 =
tex2D<unsigned char>(tex, (float)i + 0, (float)blockIdx.x + 0);
unsigned char pix12 =
tex2D<unsigned char>(tex, (float)i + 1, (float)blockIdx.x + 0);
unsigned char pix20 =
tex2D<unsigned char>(tex, (float)i - 1, (float)blockIdx.x + 1);
unsigned char pix21 =
tex2D<unsigned char>(tex, (float)i + 0, (float)blockIdx.x + 1);
unsigned char pix22 =
tex2D<unsigned char>(tex, (float)i + 1, (float)blockIdx.x + 1);
tmp = (*(blockFunction_table[blockOperation]))(
pix00, pix01, pix02, pix10, pix11, pix12, pix20, pix21, pix22, fScale);
if (pPointOperation != NULL) {
tmp = (*pPointOperation)(tmp, 150.0);
}
pSobel[i] = tmp;
}
}
extern "C" void setupTexture(int iw, int ih, Pixel *data, int Bpp) {
cudaChannelFormatDesc desc;
if (Bpp == 1) {
desc = cudaCreateChannelDesc<unsigned char>();
} else {
desc = cudaCreateChannelDesc<uchar4>();
}
checkCudaErrors(cudaMallocArray(&array, &desc, iw, ih));
checkCudaErrors(cudaMemcpy2DToArray(
array, 0, 0, data, iw * Bpp * sizeof(Pixel), iw * Bpp * sizeof(Pixel), ih,
cudaMemcpyHostToDevice));
cudaResourceDesc texRes;
memset(&texRes, 0, sizeof(cudaResourceDesc));
texRes.resType = cudaResourceTypeArray;
texRes.res.array.array = array;
cudaTextureDesc texDescr;
memset(&texDescr, 0, sizeof(cudaTextureDesc));
checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
}
extern "C" void deleteTexture(void) {
checkCudaErrors(cudaFreeArray(array));
checkCudaErrors(cudaDestroyTextureObject(tex));
}
// Copy the pointers from the function tables to the host side
void setupFunctionTables() {
// Dynamically assign the function table.
// Copy the function pointers to their appropriate locations according to the
// enum
checkCudaErrors(cudaMemcpyFromSymbol(&h_blockFunction_table[SOBEL_FILTER],
pComputeSobel, sizeof(blockFunction_t)));
checkCudaErrors(cudaMemcpyFromSymbol(&h_blockFunction_table[BOX_FILTER],
pComputeBox, sizeof(blockFunction_t)));
// do the same for the point function, where the 2nd function is NULL ("no-op"
// filter, skipped in kernel code)
checkCudaErrors(cudaMemcpyFromSymbol(&h_pointFunction_table[THRESHOLD_FILTER],
pComputeThreshold,
sizeof(pointFunction_t)));
h_pointFunction_table[NULL_FILTER] = NULL;
// now copy the function tables back to the device, so if we wish we can use
// an index into the table to choose them
// We have now set the order in the function table according to our enum.
checkCudaErrors(
cudaMemcpyToSymbol(blockFunction_table, h_blockFunction_table,
sizeof(blockFunction_t) * LAST_BLOCK_FILTER));
checkCudaErrors(
cudaMemcpyToSymbol(pointFunction_table, h_pointFunction_table,
sizeof(pointFunction_t) * LAST_POINT_FILTER));
}
// Wrapper for the __global__ call that sets up the texture and threads
// Below two methods for selecting the image processing function to run are
// shown.
// BlockOperation is an integer kernel argument used as an index into the
// blockFunction_table on the device side
// pPointOp is itself a function pointer passed as a kernel argument, retrieved
// from a host side copy of the function table
extern "C" void sobelFilter(Pixel *odata, int iw, int ih,
enum SobelDisplayMode mode, float fScale,
int blockOperation, int pointOperation) {
pointFunction_t pPointOp = h_pointFunction_table[pointOperation];
switch (mode) {
case SOBELDISPLAY_IMAGE:
SobelCopyImage<<<ih, 384>>>(odata, iw, iw, ih, fScale, tex);
break;
case SOBELDISPLAY_SOBELTEX:
SobelTex<<<ih, 384>>>(odata, iw, iw, ih, fScale, blockOperation, pPointOp,
tex);
break;
case SOBELDISPLAY_SOBELSHARED: {
dim3 threads(16, 4);
#ifndef FIXED_BLOCKWIDTH
int BlockWidth = 80; // must be divisible by 16 for coalescing
#endif
dim3 blocks = dim3(iw / (4 * BlockWidth) + (0 != iw % (4 * BlockWidth)),
ih / threads.y + (0 != ih % threads.y));
int SharedPitch = ~0x3f & (4 * (BlockWidth + 2 * RADIUS) + 0x3f);
int sharedMem = SharedPitch * (threads.y + 2 * RADIUS);
// for the shared kernel, width must be divisible by 4
iw &= ~3;
SobelShared<<<blocks, threads, sharedMem>>>(
(uchar4 *)odata, iw,
#ifndef FIXED_BLOCKWIDTH
BlockWidth, SharedPitch,
#endif
iw, ih, fScale, blockOperation, pPointOp, tex);
} break;
}
}