mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2024-11-25 03:29:18 +08:00
403 lines
17 KiB
Plaintext
403 lines
17 KiB
Plaintext
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* * Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
|
* contributors may be used to endorse or promote products derived
|
|
* from this software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <cooperative_groups.h>
|
|
|
|
namespace cg = cooperative_groups;
|
|
|
|
#include <helper_cuda.h>
|
|
|
|
#include "FunctionPointers_kernels.h"
|
|
|
|
// Texture object for reading image
|
|
cudaTextureObject_t tex;
|
|
extern __shared__ unsigned char LocalBlock[];
|
|
static cudaArray *array = NULL;
|
|
|
|
#define RADIUS 1
|
|
|
|
// pixel value used for thresholding function,
|
|
// works well with sample image 'teapot512'
|
|
#define THRESHOLD 150.0f
|
|
|
|
#ifdef FIXED_BLOCKWIDTH
|
|
#define BlockWidth 80
|
|
#define SharedPitch 384
|
|
#endif
|
|
|
|
// A function pointer can be declared explicitly like this line:
|
|
//__device__ unsigned char (*pointFunction)(unsigned char, float ) = NULL;
|
|
// or by using typedef's like below:
|
|
|
|
typedef unsigned char (*blockFunction_t)(unsigned char, unsigned char,
|
|
unsigned char, unsigned char,
|
|
unsigned char, unsigned char,
|
|
unsigned char, unsigned char,
|
|
unsigned char, float);
|
|
|
|
typedef unsigned char (*pointFunction_t)(unsigned char, float);
|
|
|
|
__device__ blockFunction_t blockFunction;
|
|
|
|
__device__ unsigned char ComputeSobel(unsigned char ul, // upper left
|
|
unsigned char um, // upper middle
|
|
unsigned char ur, // upper right
|
|
unsigned char ml, // middle left
|
|
unsigned char mm, // middle (unused)
|
|
unsigned char mr, // middle right
|
|
unsigned char ll, // lower left
|
|
unsigned char lm, // lower middle
|
|
unsigned char lr, // lower right
|
|
float fScale) {
|
|
short Horz = ur + 2 * mr + lr - ul - 2 * ml - ll;
|
|
short Vert = ul + 2 * um + ur - ll - 2 * lm - lr;
|
|
short Sum = (short)(fScale * (abs((int)Horz) + abs((int)Vert)));
|
|
return (unsigned char)((Sum < 0) ? 0 : ((Sum > 255) ? 255 : Sum));
|
|
}
|
|
|
|
// define a function pointer and initialize to NULL
|
|
__device__ unsigned char (*varFunction)(unsigned char, unsigned char,
|
|
unsigned char, unsigned char,
|
|
unsigned char, unsigned char,
|
|
unsigned char, unsigned char,
|
|
unsigned char, float x) = NULL;
|
|
|
|
__device__ unsigned char ComputeBox(unsigned char ul, // upper left
|
|
unsigned char um, // upper middle
|
|
unsigned char ur, // upper right
|
|
unsigned char ml, // middle left
|
|
unsigned char mm, // middle...middle
|
|
unsigned char mr, // middle right
|
|
unsigned char ll, // lower left
|
|
unsigned char lm, // lower middle
|
|
unsigned char lr, // lower right
|
|
float fscale) {
|
|
short Sum = (short)(ul + um + ur + ml + mm + mr + ll + lm + lr) / 9;
|
|
Sum *= fscale;
|
|
return (unsigned char)((Sum < 0) ? 0 : ((Sum > 255) ? 255 : Sum));
|
|
}
|
|
__device__ unsigned char Threshold(unsigned char in, float thresh) {
|
|
if (in > thresh) {
|
|
return 0xFF;
|
|
} else {
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
// Declare function tables, one for the point function chosen, one for the
|
|
// block function chosen. The number of entries is determined by the
|
|
// enum in FunctionPointers_kernels.h
|
|
__device__ blockFunction_t blockFunction_table[LAST_BLOCK_FILTER];
|
|
__device__ pointFunction_t pointFunction_table[LAST_POINT_FILTER];
|
|
|
|
// Declare device side function pointers. We retrieve them later with
|
|
// cudaMemcpyFromSymbol to set our function tables above in some
|
|
// particular order specified at runtime.
|
|
__device__ blockFunction_t pComputeSobel = ComputeSobel;
|
|
__device__ blockFunction_t pComputeBox = ComputeBox;
|
|
__device__ pointFunction_t pComputeThreshold = Threshold;
|
|
|
|
// Allocate host side tables to mirror the device side, and later, we
|
|
// fill these tables with the function pointers. This lets us send
|
|
// the pointers to the kernel on invocation, as a method of choosing
|
|
// which function to run.
|
|
blockFunction_t h_blockFunction_table[2];
|
|
pointFunction_t h_pointFunction_table[2];
|
|
|
|
// Perform a filter operation on the data, using shared memory
|
|
// The actual operation performed is
|
|
// determined by the function pointer "blockFunction" and selected
|
|
// by the integer argument "blockOperation" and has access
|
|
// to an apron around the current pixel being processed.
|
|
// Following the block operation, a per-pixel operation,
|
|
// pointed to by pPointFunction is performed before the final
|
|
// pixel is produced.
|
|
__global__ void SobelShared(uchar4 *pSobelOriginal, unsigned short SobelPitch,
|
|
#ifndef FIXED_BLOCKWIDTH
|
|
short BlockWidth, short SharedPitch,
|
|
#endif
|
|
short w, short h, float fScale, int blockOperation,
|
|
pointFunction_t pPointFunction,
|
|
cudaTextureObject_t tex) {
|
|
// Handle to thread block group
|
|
cg::thread_block cta = cg::this_thread_block();
|
|
short u = 4 * blockIdx.x * BlockWidth;
|
|
short v = blockIdx.y * blockDim.y + threadIdx.y;
|
|
short ib;
|
|
|
|
int SharedIdx = threadIdx.y * SharedPitch;
|
|
|
|
for (ib = threadIdx.x; ib < BlockWidth + 2 * RADIUS; ib += blockDim.x) {
|
|
LocalBlock[SharedIdx + 4 * ib + 0] = tex2D<unsigned char>(
|
|
tex, (float)(u + 4 * ib - RADIUS + 0), (float)(v - RADIUS));
|
|
LocalBlock[SharedIdx + 4 * ib + 1] = tex2D<unsigned char>(
|
|
tex, (float)(u + 4 * ib - RADIUS + 1), (float)(v - RADIUS));
|
|
LocalBlock[SharedIdx + 4 * ib + 2] = tex2D<unsigned char>(
|
|
tex, (float)(u + 4 * ib - RADIUS + 2), (float)(v - RADIUS));
|
|
LocalBlock[SharedIdx + 4 * ib + 3] = tex2D<unsigned char>(
|
|
tex, (float)(u + 4 * ib - RADIUS + 3), (float)(v - RADIUS));
|
|
}
|
|
|
|
if (threadIdx.y < RADIUS * 2) {
|
|
//
|
|
// copy trailing RADIUS*2 rows of pixels into shared
|
|
//
|
|
SharedIdx = (blockDim.y + threadIdx.y) * SharedPitch;
|
|
|
|
for (ib = threadIdx.x; ib < BlockWidth + 2 * RADIUS; ib += blockDim.x) {
|
|
LocalBlock[SharedIdx + 4 * ib + 0] =
|
|
tex2D<unsigned char>(tex, (float)(u + 4 * ib - RADIUS + 0),
|
|
(float)(v + blockDim.y - RADIUS));
|
|
LocalBlock[SharedIdx + 4 * ib + 1] =
|
|
tex2D<unsigned char>(tex, (float)(u + 4 * ib - RADIUS + 1),
|
|
(float)(v + blockDim.y - RADIUS));
|
|
LocalBlock[SharedIdx + 4 * ib + 2] =
|
|
tex2D<unsigned char>(tex, (float)(u + 4 * ib - RADIUS + 2),
|
|
(float)(v + blockDim.y - RADIUS));
|
|
LocalBlock[SharedIdx + 4 * ib + 3] =
|
|
tex2D<unsigned char>(tex, (float)(u + 4 * ib - RADIUS + 3),
|
|
(float)(v + blockDim.y - RADIUS));
|
|
}
|
|
}
|
|
|
|
cg::sync(cta);
|
|
|
|
u >>= 2; // index as uchar4 from here
|
|
uchar4 *pSobel = (uchar4 *)(((char *)pSobelOriginal) + v * SobelPitch);
|
|
SharedIdx = threadIdx.y * SharedPitch;
|
|
|
|
blockFunction = blockFunction_table[blockOperation];
|
|
|
|
for (ib = threadIdx.x; ib < BlockWidth; ib += blockDim.x) {
|
|
uchar4 out;
|
|
|
|
unsigned char pix00 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 0];
|
|
unsigned char pix01 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 1];
|
|
unsigned char pix02 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 2];
|
|
unsigned char pix10 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 0];
|
|
unsigned char pix11 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 1];
|
|
unsigned char pix12 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 2];
|
|
unsigned char pix20 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 0];
|
|
unsigned char pix21 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 1];
|
|
unsigned char pix22 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 2];
|
|
|
|
out.x = (*blockFunction)(pix00, pix01, pix02, pix10, pix11, pix12, pix20,
|
|
pix21, pix22, fScale);
|
|
|
|
pix00 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 3];
|
|
pix10 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 3];
|
|
pix20 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 3];
|
|
out.y = (*blockFunction)(pix01, pix02, pix00, pix11, pix12, pix10, pix21,
|
|
pix22, pix20, fScale);
|
|
|
|
pix01 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 4];
|
|
pix11 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 4];
|
|
pix21 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 4];
|
|
out.z = (*blockFunction)(pix02, pix00, pix01, pix12, pix10, pix11, pix22,
|
|
pix20, pix21, fScale);
|
|
|
|
pix02 = LocalBlock[SharedIdx + 4 * ib + 0 * SharedPitch + 5];
|
|
pix12 = LocalBlock[SharedIdx + 4 * ib + 1 * SharedPitch + 5];
|
|
pix22 = LocalBlock[SharedIdx + 4 * ib + 2 * SharedPitch + 5];
|
|
out.w = (*blockFunction)(pix00, pix01, pix02, pix10, pix11, pix12, pix20,
|
|
pix21, pix22, fScale);
|
|
|
|
if (pPointFunction != NULL) {
|
|
out.x = (*pPointFunction)(out.x, THRESHOLD);
|
|
out.y = (*pPointFunction)(out.y, THRESHOLD);
|
|
out.z = (*pPointFunction)(out.z, THRESHOLD);
|
|
out.w = (*pPointFunction)(out.w, THRESHOLD);
|
|
}
|
|
|
|
if (u + ib < w / 4 && v < h) {
|
|
pSobel[u + ib] = out;
|
|
}
|
|
}
|
|
|
|
cg::sync(cta);
|
|
}
|
|
|
|
__global__ void SobelCopyImage(Pixel *pSobelOriginal, unsigned int Pitch, int w,
|
|
int h, float fscale, cudaTextureObject_t tex) {
|
|
unsigned char *pSobel =
|
|
(unsigned char *)(((char *)pSobelOriginal) + blockIdx.x * Pitch);
|
|
|
|
for (int i = threadIdx.x; i < w; i += blockDim.x) {
|
|
pSobel[i] = min(
|
|
max((tex2D<unsigned char>(tex, (float)i, (float)blockIdx.x) * fscale),
|
|
0.f),
|
|
255.f);
|
|
}
|
|
}
|
|
|
|
// Perform block and pointer filtering using texture lookups.
|
|
// The block and point operations are determined by the
|
|
// input argument (see comment above for "SobelShared" function)
|
|
__global__ void SobelTex(Pixel *pSobelOriginal, unsigned int Pitch, int w,
|
|
int h, float fScale, int blockOperation,
|
|
pointFunction_t pPointOperation,
|
|
cudaTextureObject_t tex) {
|
|
unsigned char *pSobel =
|
|
(unsigned char *)(((char *)pSobelOriginal) + blockIdx.x * Pitch);
|
|
unsigned char tmp = 0;
|
|
|
|
for (int i = threadIdx.x; i < w; i += blockDim.x) {
|
|
unsigned char pix00 =
|
|
tex2D<unsigned char>(tex, (float)i - 1, (float)blockIdx.x - 1);
|
|
unsigned char pix01 =
|
|
tex2D<unsigned char>(tex, (float)i + 0, (float)blockIdx.x - 1);
|
|
unsigned char pix02 =
|
|
tex2D<unsigned char>(tex, (float)i + 1, (float)blockIdx.x - 1);
|
|
unsigned char pix10 =
|
|
tex2D<unsigned char>(tex, (float)i - 1, (float)blockIdx.x + 0);
|
|
unsigned char pix11 =
|
|
tex2D<unsigned char>(tex, (float)i + 0, (float)blockIdx.x + 0);
|
|
unsigned char pix12 =
|
|
tex2D<unsigned char>(tex, (float)i + 1, (float)blockIdx.x + 0);
|
|
unsigned char pix20 =
|
|
tex2D<unsigned char>(tex, (float)i - 1, (float)blockIdx.x + 1);
|
|
unsigned char pix21 =
|
|
tex2D<unsigned char>(tex, (float)i + 0, (float)blockIdx.x + 1);
|
|
unsigned char pix22 =
|
|
tex2D<unsigned char>(tex, (float)i + 1, (float)blockIdx.x + 1);
|
|
tmp = (*(blockFunction_table[blockOperation]))(
|
|
pix00, pix01, pix02, pix10, pix11, pix12, pix20, pix21, pix22, fScale);
|
|
|
|
if (pPointOperation != NULL) {
|
|
tmp = (*pPointOperation)(tmp, 150.0);
|
|
}
|
|
|
|
pSobel[i] = tmp;
|
|
}
|
|
}
|
|
|
|
extern "C" void setupTexture(int iw, int ih, Pixel *data, int Bpp) {
|
|
cudaChannelFormatDesc desc;
|
|
|
|
if (Bpp == 1) {
|
|
desc = cudaCreateChannelDesc<unsigned char>();
|
|
} else {
|
|
desc = cudaCreateChannelDesc<uchar4>();
|
|
}
|
|
|
|
checkCudaErrors(cudaMallocArray(&array, &desc, iw, ih));
|
|
checkCudaErrors(cudaMemcpy2DToArray(
|
|
array, 0, 0, data, iw * Bpp * sizeof(Pixel), iw * Bpp * sizeof(Pixel), ih,
|
|
cudaMemcpyHostToDevice));
|
|
|
|
cudaResourceDesc texRes;
|
|
memset(&texRes, 0, sizeof(cudaResourceDesc));
|
|
|
|
texRes.resType = cudaResourceTypeArray;
|
|
texRes.res.array.array = array;
|
|
|
|
cudaTextureDesc texDescr;
|
|
memset(&texDescr, 0, sizeof(cudaTextureDesc));
|
|
|
|
checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
|
|
}
|
|
|
|
extern "C" void deleteTexture(void) {
|
|
checkCudaErrors(cudaFreeArray(array));
|
|
checkCudaErrors(cudaDestroyTextureObject(tex));
|
|
}
|
|
|
|
// Copy the pointers from the function tables to the host side
|
|
void setupFunctionTables() {
|
|
// Dynamically assign the function table.
|
|
// Copy the function pointers to their appropriate locations according to the
|
|
// enum
|
|
checkCudaErrors(cudaMemcpyFromSymbol(&h_blockFunction_table[SOBEL_FILTER],
|
|
pComputeSobel, sizeof(blockFunction_t)));
|
|
checkCudaErrors(cudaMemcpyFromSymbol(&h_blockFunction_table[BOX_FILTER],
|
|
pComputeBox, sizeof(blockFunction_t)));
|
|
|
|
// do the same for the point function, where the 2nd function is NULL ("no-op"
|
|
// filter, skipped in kernel code)
|
|
checkCudaErrors(cudaMemcpyFromSymbol(&h_pointFunction_table[THRESHOLD_FILTER],
|
|
pComputeThreshold,
|
|
sizeof(pointFunction_t)));
|
|
h_pointFunction_table[NULL_FILTER] = NULL;
|
|
|
|
// now copy the function tables back to the device, so if we wish we can use
|
|
// an index into the table to choose them
|
|
// We have now set the order in the function table according to our enum.
|
|
checkCudaErrors(
|
|
cudaMemcpyToSymbol(blockFunction_table, h_blockFunction_table,
|
|
sizeof(blockFunction_t) * LAST_BLOCK_FILTER));
|
|
checkCudaErrors(
|
|
cudaMemcpyToSymbol(pointFunction_table, h_pointFunction_table,
|
|
sizeof(pointFunction_t) * LAST_POINT_FILTER));
|
|
}
|
|
|
|
// Wrapper for the __global__ call that sets up the texture and threads
|
|
// Below two methods for selecting the image processing function to run are
|
|
// shown.
|
|
// BlockOperation is an integer kernel argument used as an index into the
|
|
// blockFunction_table on the device side
|
|
// pPointOp is itself a function pointer passed as a kernel argument, retrieved
|
|
// from a host side copy of the function table
|
|
extern "C" void sobelFilter(Pixel *odata, int iw, int ih,
|
|
enum SobelDisplayMode mode, float fScale,
|
|
int blockOperation, int pointOperation) {
|
|
pointFunction_t pPointOp = h_pointFunction_table[pointOperation];
|
|
|
|
switch (mode) {
|
|
case SOBELDISPLAY_IMAGE:
|
|
SobelCopyImage<<<ih, 384>>>(odata, iw, iw, ih, fScale, tex);
|
|
break;
|
|
|
|
case SOBELDISPLAY_SOBELTEX:
|
|
SobelTex<<<ih, 384>>>(odata, iw, iw, ih, fScale, blockOperation, pPointOp,
|
|
tex);
|
|
break;
|
|
|
|
case SOBELDISPLAY_SOBELSHARED: {
|
|
dim3 threads(16, 4);
|
|
#ifndef FIXED_BLOCKWIDTH
|
|
int BlockWidth = 80; // must be divisible by 16 for coalescing
|
|
#endif
|
|
dim3 blocks = dim3(iw / (4 * BlockWidth) + (0 != iw % (4 * BlockWidth)),
|
|
ih / threads.y + (0 != ih % threads.y));
|
|
int SharedPitch = ~0x3f & (4 * (BlockWidth + 2 * RADIUS) + 0x3f);
|
|
int sharedMem = SharedPitch * (threads.y + 2 * RADIUS);
|
|
|
|
// for the shared kernel, width must be divisible by 4
|
|
iw &= ~3;
|
|
|
|
SobelShared<<<blocks, threads, sharedMem>>>(
|
|
(uchar4 *)odata, iw,
|
|
#ifndef FIXED_BLOCKWIDTH
|
|
BlockWidth, SharedPitch,
|
|
#endif
|
|
iw, ih, fScale, blockOperation, pPointOp, tex);
|
|
} break;
|
|
}
|
|
}
|