// cuda-samples/Samples/0_Introduction/simplePitchLinearTexture/simplePitchLinearTexture.cu
// 2022-01-13 11:35:24 +05:30
//
// 312 lines
// 11 KiB
// Plaintext

/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* pitchLinearTexture
*
* This example demonstrates how to use textures bound to pitch linear memory.
* It performs a shift of matrix elements using wrap addressing mode (aka
* periodic boundary conditions) on two arrays, a pitch linear and a CUDA array,
* in order to highlight the differences in using each.
*
* Binding textures to pitch linear memory was introduced in CUDA 2.2. It
* allows use of texture features such as wrap addressing mode and filtering,
* which are not possible with textures bound to regular linear memory.
*/
// includes, system
#include <stdio.h>
#ifdef _WIN32
#define WINDOWS_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#endif
// Includes CUDA
#include <cuda_runtime.h>
// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h
// CUDA helper functions
#include <helper_cuda.h> // helper functions for CUDA error check
// Number of back-to-back launches of each kernel inside the timed region
#define NUM_REPS 100 // number of repetitions performed
// Edge length of the square thread block; runTest requires nx and ny to be
// multiples of this value
#define TILE_DIM 16 // tile/block size
// Sample name printed in the start and completion messages
const char *sSDKsample = "simplePitchLinearTexture";
// Auto-Verification Code
// Overall pass/fail flag: written by runTest, turned into the process exit
// status by main
bool bTestResult = true;
////////////////////////////////////////////////////////////////////////////////
// NB: (1) The second argument "pitch" is in elements, not bytes
// (2) normalized coordinates are used (required for wrap address mode)
////////////////////////////////////////////////////////////////////////////////
//! Shifts matrix elements using pitch linear array
//!
//! Each thread handles one output element (xid, yid), derived from its block
//! and thread indices, and stores the texel fetched at the normalized
//! coordinate ((xid + shiftX) / width, (yid + shiftY) / height).
//! Threads whose element lies outside width x height return without touching
//! memory, so the launch grid may overhang the matrix.
//!
//! @param odata    output data in global memory
//! @param pitch    row stride of odata, in elements (not bytes)
//! @param width    number of elements per row to produce
//! @param height   number of rows to produce
//! @param shiftX   offset added to the x index before sampling
//! @param shiftY   offset added to the y index before sampling
//! @param texRefPL texture object the input is fetched from; it is sampled
//!                 with normalized coordinates
////////////////////////////////////////////////////////////////////////////////
__global__ void shiftPitchLinear(float *odata, int pitch, int width, int height,
                                 int shiftX, int shiftY,
                                 cudaTextureObject_t texRefPL) {
  const int xid = blockIdx.x * blockDim.x + threadIdx.x;
  const int yid = blockIdx.y * blockDim.y + threadIdx.y;

  // Guard against grids that do not tile the matrix exactly: without it an
  // overhanging thread would write past the end of a row or of the buffer
  if (xid >= width || yid >= height) {
    return;
  }

  // Normalized texture coordinates of the shifted source element.
  // NOTE(review): these address the texel's lower edge, not its center;
  // confirm this is still exact before using sizes that are not powers of two
  const float u = (xid + shiftX) / (float)width;
  const float v = (yid + shiftY) / (float)height;

  odata[yid * pitch + xid] = tex2D<float>(texRefPL, u, v);
}
////////////////////////////////////////////////////////////////////////////////
//! Shifts matrix elements using regular array
//!
//! Each thread handles one output element (xid, yid), derived from its block
//! and thread indices, and stores the texel fetched at the normalized
//! coordinate ((xid + shiftX) / width, (yid + shiftY) / height).
//! Threads whose element lies outside width x height return without touching
//! memory, so the launch grid may overhang the matrix.
//!
//! @param odata       output data in global memory
//! @param pitch       row stride of odata, in elements (not bytes)
//! @param width       number of elements per row to produce
//! @param height      number of rows to produce
//! @param shiftX      offset added to the x index before sampling
//! @param shiftY      offset added to the y index before sampling
//! @param texRefArray texture object the input is fetched from; it is sampled
//!                    with normalized coordinates
////////////////////////////////////////////////////////////////////////////////
__global__ void shiftArray(float *odata, int pitch, int width, int height,
                           int shiftX, int shiftY,
                           cudaTextureObject_t texRefArray) {
  const int xid = blockIdx.x * blockDim.x + threadIdx.x;
  const int yid = blockIdx.y * blockDim.y + threadIdx.y;

  // Guard against grids that do not tile the matrix exactly: without it an
  // overhanging thread would write past the end of a row or of the buffer
  if (xid >= width || yid >= height) {
    return;
  }

  // Normalized texture coordinates of the shifted source element.
  // NOTE(review): these address the texel's lower edge, not its center;
  // confirm this is still exact before using sizes that are not powers of two
  const float u = (xid + shiftX) / (float)width;
  const float v = (yid + shiftY) / (float)height;

  odata[yid * pitch + xid] = tex2D<float>(texRefArray, u, v);
}
////////////////////////////////////////////////////////////////////////////////
// Declaration, forward
// runTest is defined after main, which calls it
void runTest(int argc, char **argv);
////////////////////////////////////////////////////////////////////////////////
// Program entry point: announce the sample, run the test, then report the
// outcome and exit with a matching status code
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
  printf("%s starting...\n\n", sSDKsample);

  // runTest records its verdict in the global bTestResult
  runTest(argc, argv);

  const char *verdict = "ERROR!";
  int exitCode = EXIT_FAILURE;
  if (bTestResult) {
    verdict = "OK";
    exitCode = EXIT_SUCCESS;
  }

  printf("%s completed, returned %s\n", sSDKsample, verdict);
  exit(exitCode);
}
////////////////////////////////////////////////////////////////////////////////
//! Builds the texture descriptor shared by both textures of this sample:
//! normalized coordinates, point filtering, wrap addressing in both
//! dimensions and element-type reads.
////////////////////////////////////////////////////////////////////////////////
static cudaTextureDesc makeWrapPointTextureDesc() {
  cudaTextureDesc desc;
  memset(&desc, 0, sizeof(cudaTextureDesc));
  desc.normalizedCoords = true;
  desc.filterMode = cudaFilterModePoint;
  desc.addressMode[0] = cudaAddressModeWrap;
  desc.addressMode[1] = cudaAddressModeWrap;
  desc.readMode = cudaReadModeElementType;
  return desc;
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
//!
//! Fills an nx x ny float matrix on the host, uploads it both to pitch linear
//! memory and to a CUDA array, creates one texture object on each, and runs
//! shiftPitchLinear and shiftArray NUM_REPS times each. Every kernel's output
//! is compared against a host-computed wrapped shift; a mismatch clears the
//! global bTestResult. Bandwidth and texture fetch rate are printed from the
//! event timings.
//!
//! @param argc argument count, forwarded to findCudaDevice
//! @param argv argument vector, forwarded to findCudaDevice
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv) {
  // Set array size
  const int nx = 2048;
  const int ny = 2048;

  // Setup shifts applied to x and y data
  const int x_shift = 5;
  const int y_shift = 7;

  if ((nx % TILE_DIM != 0) || (ny % TILE_DIM != 0)) {
    printf("nx and ny must be multiples of TILE_DIM\n");
    exit(EXIT_FAILURE);
  }

  // Setup execution configuration parameters
  dim3 dimGrid(nx / TILE_DIM, ny / TILE_DIM), dimBlock(TILE_DIM, TILE_DIM);

  // This will pick the best possible CUDA capable device
  // (the returned device ID is not used afterwards)
  findCudaDevice(argc, (const char **)argv);

  // CUDA events for timing
  cudaEvent_t start, stop;
  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&stop));

  // Host rows are tightly packed, so the host pitch equals the row size
  const size_t rowBytes = nx * sizeof(float);
  const size_t h_pitchBytes = rowBytes;

  // Host allocation and initialization
  float *h_idata = (float *)malloc(rowBytes * ny);
  float *h_odata = (float *)malloc(rowBytes * ny);
  float *gold = (float *)malloc(rowBytes * ny);

  if (h_idata == NULL || h_odata == NULL || gold == NULL) {
    printf("Host memory allocation failed\n");
    free(h_idata);
    free(h_odata);
    free(gold);
    exit(EXIT_FAILURE);
  }

  for (int i = 0; i < nx * ny; ++i) {
    h_idata[i] = (float)i;
  }

  // Device memory allocation
  // Pitch linear input data; keep its pitch separate from the output pitch
  // so each buffer is always addressed with the stride it was allocated with
  float *d_idataPL;
  size_t d_idataPitchBytes;
  checkCudaErrors(cudaMallocPitch((void **)&d_idataPL, &d_idataPitchBytes,
                                  rowBytes, ny));

  // Array input data
  cudaArray *d_idataArray;
  cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
  checkCudaErrors(cudaMallocArray(&d_idataArray, &channelDesc, nx, ny));

  // Pitch linear output data
  float *d_odata;
  size_t d_odataPitchBytes;
  checkCudaErrors(cudaMallocPitch((void **)&d_odata, &d_odataPitchBytes,
                                  rowBytes, ny));

  // The kernels take the output pitch in elements, not bytes
  const int odataPitchElems = (int)(d_odataPitchBytes / sizeof(float));

  // Copy host data to device
  // Pitch linear
  checkCudaErrors(cudaMemcpy2D(d_idataPL, d_idataPitchBytes, h_idata,
                               h_pitchBytes, rowBytes, ny,
                               cudaMemcpyHostToDevice));

  // Array (cudaMemcpyToArray is deprecated; use the 2D variant instead)
  checkCudaErrors(cudaMemcpy2DToArray(d_idataArray, 0, 0, h_idata,
                                      h_pitchBytes, rowBytes, ny,
                                      cudaMemcpyHostToDevice));

  cudaTextureObject_t texRefPL;
  cudaTextureObject_t texRefArray;
  cudaTextureDesc texDescr = makeWrapPointTextureDesc();

  // Texture on the pitch linear buffer: the pitch must be the device pitch
  // returned by cudaMallocPitch, which may be larger than the host row size
  cudaResourceDesc texRes;
  memset(&texRes, 0, sizeof(cudaResourceDesc));
  texRes.resType = cudaResourceTypePitch2D;
  texRes.res.pitch2D.devPtr = d_idataPL;
  texRes.res.pitch2D.desc = channelDesc;
  texRes.res.pitch2D.width = nx;
  texRes.res.pitch2D.height = ny;
  texRes.res.pitch2D.pitchInBytes = d_idataPitchBytes;
  checkCudaErrors(cudaCreateTextureObject(&texRefPL, &texRes, &texDescr, NULL));

  // Texture on the CUDA array, with the same sampling configuration
  memset(&texRes, 0, sizeof(cudaResourceDesc));
  texRes.resType = cudaResourceTypeArray;
  texRes.res.array.array = d_idataArray;
  checkCudaErrors(
      cudaCreateTextureObject(&texRefArray, &texRes, &texDescr, NULL));

  // Reference calculation
  for (int j = 0; j < ny; ++j) {
    int jshift = (j + y_shift) % ny;

    for (int i = 0; i < nx; ++i) {
      int ishift = (i + x_shift) % nx;
      gold[j * nx + i] = h_idata[jshift * nx + ishift];
    }
  }

  // Run ShiftPitchLinear kernel
  checkCudaErrors(cudaMemset2D(d_odata, d_odataPitchBytes, 0, rowBytes, ny));

  checkCudaErrors(cudaEventRecord(start, 0));

  for (int i = 0; i < NUM_REPS; ++i) {
    shiftPitchLinear<<<dimGrid, dimBlock>>>(d_odata, odataPitchElems, nx, ny,
                                            x_shift, y_shift, texRefPL);
  }

  checkCudaErrors(cudaEventRecord(stop, 0));
  // Kernel launches return no status; surface any launch error here
  checkCudaErrors(cudaGetLastError());
  checkCudaErrors(cudaEventSynchronize(stop));
  float timePL;
  checkCudaErrors(cudaEventElapsedTime(&timePL, start, stop));

  // Check results
  checkCudaErrors(cudaMemcpy2D(h_odata, h_pitchBytes, d_odata,
                               d_odataPitchBytes, rowBytes, ny,
                               cudaMemcpyDeviceToHost));

  bool res = compareData(gold, h_odata, nx * ny, 0.0f, 0.15f);

  bTestResult = true;

  if (res == false) {
    printf("*** shiftPitchLinear failed ***\n");
    bTestResult = false;
  }

  // Run ShiftArray kernel
  checkCudaErrors(cudaMemset2D(d_odata, d_odataPitchBytes, 0, rowBytes, ny));

  checkCudaErrors(cudaEventRecord(start, 0));

  for (int i = 0; i < NUM_REPS; ++i) {
    shiftArray<<<dimGrid, dimBlock>>>(d_odata, odataPitchElems, nx, ny,
                                      x_shift, y_shift, texRefArray);
  }

  checkCudaErrors(cudaEventRecord(stop, 0));
  // Kernel launches return no status; surface any launch error here
  checkCudaErrors(cudaGetLastError());
  checkCudaErrors(cudaEventSynchronize(stop));
  float timeArray;
  checkCudaErrors(cudaEventElapsedTime(&timeArray, start, stop));

  // Check results
  checkCudaErrors(cudaMemcpy2D(h_odata, h_pitchBytes, d_odata,
                               d_odataPitchBytes, rowBytes, ny,
                               cudaMemcpyDeviceToHost));

  res = compareData(gold, h_odata, nx * ny, 0.0f, 0.15f);

  if (res == false) {
    printf("*** shiftArray failed ***\n");
    bTestResult = false;
  }

  // Event times are in milliseconds and cover NUM_REPS launches
  float bandwidthPL =
      2.f * 1000.f * nx * ny * sizeof(float) / (1.e+9f) / (timePL / NUM_REPS);
  float bandwidthArray = 2.f * 1000.f * nx * ny * sizeof(float) / (1.e+9f) /
                         (timeArray / NUM_REPS);

  printf("\nBandwidth (GB/s) for pitch linear: %.2e; for array: %.2e\n",
         bandwidthPL, bandwidthArray);

  float fetchRatePL = nx * ny / 1.e+6f / (timePL / (1000.0f * NUM_REPS));
  float fetchRateArray = nx * ny / 1.e+6f / (timeArray / (1000.0f * NUM_REPS));

  printf(
      "\nTexture fetch rate (Mpix/s) for pitch linear: "
      "%.2e; for array: %.2e\n\n",
      fetchRatePL, fetchRateArray);

  // Cleanup
  free(h_idata);
  free(h_odata);
  free(gold);

  checkCudaErrors(cudaDestroyTextureObject(texRefPL));
  checkCudaErrors(cudaDestroyTextureObject(texRefArray));
  checkCudaErrors(cudaFree(d_idataPL));
  checkCudaErrors(cudaFreeArray(d_idataArray));
  checkCudaErrors(cudaFree(d_odata));

  checkCudaErrors(cudaEventDestroy(start));
  checkCudaErrors(cudaEventDestroy(stop));
}