// Mirror of https://github.com/NVIDIA/cuda-samples.git
// (synced 2024-11-29 03:49:18 +08:00; 312 lines, 11 KiB)
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
|
|
|
|
/* pitchLinearTexture
 *
 * This example demonstrates how to use textures bound to pitch linear memory.
 * It performs a shift of matrix elements using wrap addressing mode (aka
 * periodic boundary conditions) on two arrays, a pitch linear and a CUDA array,
 * in order to highlight the differences in using each.
 *
 * Binding textures to pitch linear memory was introduced in CUDA 2.2. It
 * allows the use of texture features such as wrap addressing mode and
 * filtering, which are not possible with textures bound to regular linear
 * memory.
 */
|
|
|
|
// includes, system
|
|
#include <stdio.h>
|
|
|
|
#ifdef _WIN32
|
|
#define WINDOWS_LEAN_AND_MEAN
|
|
#define NOMINMAX
|
|
#include <windows.h>
|
|
#endif
|
|
|
|
// Includes CUDA
|
|
#include <cuda_runtime.h>
|
|
|
|
// Utilities and timing functions
|
|
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h
|
|
|
|
// CUDA helper functions
|
|
#include <helper_cuda.h> // helper functions for CUDA error check
|
|
|
|
// Number of repetitions performed for each timed kernel
#define NUM_REPS 100
// Tile/block size (threads per block edge)
#define TILE_DIM 16

// Sample name printed in the start and completion messages
const char *sSDKsample = "simplePitchLinearTexture";

// Auto-verification flag: overall pass/fail result reported by main()
bool bTestResult = true;
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
//! Shifts matrix elements using a texture bound to pitch linear memory.
//!
//! Each thread handles one output element (xid, yid) derived from a 2D
//! grid of 2D blocks and stores the texel fetched at the shifted position
//! (xid + shiftX, yid + shiftY).
//!
//! NB: (1) "pitch" is the row stride of odata in elements, not bytes.
//!     (2) Normalized coordinates are used (required for wrap address mode);
//!         the texture object is expected to have been created accordingly.
//!
//! @param odata     output data in global memory
//! @param pitch     row stride of odata, in float elements
//! @param width     number of columns of the image
//! @param height    number of rows of the image
//! @param shiftX    shift applied to the x coordinate, in texels
//! @param shiftY    shift applied to the y coordinate, in texels
//! @param texRefPL  texture object to sample from
////////////////////////////////////////////////////////////////////////////////
__global__ void shiftPitchLinear(float *odata, int pitch, int width, int height,
                                 int shiftX, int shiftY,
                                 cudaTextureObject_t texRefPL) {
  const int xid = blockIdx.x * blockDim.x + threadIdx.x;
  const int yid = blockIdx.y * blockDim.y + threadIdx.y;

  // Threads outside the image do nothing, so a grid that over-covers the
  // image (sizes that are not a multiple of the block size) stays in bounds.
  if (xid >= width || yid >= height) {
    return;
  }

  // Sample at the texel centre (+0.5) rather than on its lower edge, so that
  // floating-point rounding of the normalized coordinate cannot select the
  // neighbouring texel.
  const float u = (xid + shiftX + 0.5f) / (float)width;
  const float v = (yid + shiftY + 0.5f) / (float)height;

  // Index in size_t so yid * pitch cannot overflow int for large images.
  odata[(size_t)yid * pitch + xid] = tex2D<float>(texRefPL, u, v);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
//! Shifts matrix elements using a texture backed by a regular CUDA array.
//!
//! Each thread handles one output element (xid, yid) derived from a 2D
//! grid of 2D blocks and stores the texel fetched at the shifted position
//! (xid + shiftX, yid + shiftY). Coordinates passed to the texture fetch are
//! divided by width/height, i.e. normalized.
//!
//! @param odata        output data in global memory
//! @param pitch        row stride of odata, in float elements (not bytes)
//! @param width        number of columns of the image
//! @param height       number of rows of the image
//! @param shiftX       shift applied to the x coordinate, in texels
//! @param shiftY       shift applied to the y coordinate, in texels
//! @param texRefArray  texture object to sample from
////////////////////////////////////////////////////////////////////////////////
__global__ void shiftArray(float *odata, int pitch, int width, int height,
                           int shiftX, int shiftY,
                           cudaTextureObject_t texRefArray) {
  const int xid = blockIdx.x * blockDim.x + threadIdx.x;
  const int yid = blockIdx.y * blockDim.y + threadIdx.y;

  // Threads outside the image do nothing, so a grid that over-covers the
  // image (sizes that are not a multiple of the block size) stays in bounds.
  if (xid >= width || yid >= height) {
    return;
  }

  // Sample at the texel centre (+0.5) rather than on its lower edge, so that
  // floating-point rounding of the normalized coordinate cannot select the
  // neighbouring texel.
  const float u = (xid + shiftX + 0.5f) / (float)width;
  const float v = (yid + shiftY + 0.5f) / (float)height;

  // Index in size_t so yid * pitch cannot overflow int for large images.
  odata[(size_t)yid * pitch + xid] = tex2D<float>(texRefArray, u, v);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// Declaration, forward
|
|
void runTest(int argc, char **argv);
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
// Program entry point: announces the sample, runs the test, reports the
// outcome stored in bTestResult and exits with a matching status code.
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
  printf("%s starting...\n\n", sSDKsample);

  // runTest() records its verdict in the global bTestResult.
  runTest(argc, argv);

  const char *verdict = bTestResult ? "OK" : "ERROR!";
  printf("%s completed, returned %s\n", sSDKsample, verdict);

  const int exitCode = bTestResult ? EXIT_SUCCESS : EXIT_FAILURE;
  exit(exitCode);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
//!
//! Fills an nx-by-ny float matrix on the host, uploads it both to pitch
//! linear device memory and to a CUDA array, creates one texture object per
//! resource (normalized coordinates, point filtering, wrap addressing), and
//! runs shiftPitchLinear and shiftArray NUM_REPS times each. The output of
//! each kernel is compared with a host-computed periodic shift; the global
//! bTestResult is set to false if either comparison fails. Bandwidth and
//! texture fetch rates derived from the event timings are printed.
//!
//! @param argc  command line argument count, forwarded to findCudaDevice
//! @param argv  command line arguments, forwarded to findCudaDevice
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv) {
  // Set array size
  const int nx = 2048;
  const int ny = 2048;

  // Setup shifts applied to x and y data
  const int x_shift = 5;
  const int y_shift = 7;

  if ((nx % TILE_DIM != 0) || (ny % TILE_DIM != 0)) {
    printf("nx and ny must be multiples of TILE_DIM\n");
    exit(EXIT_FAILURE);
  }

  // Setup execution configuration parameters
  dim3 dimGrid(nx / TILE_DIM, ny / TILE_DIM), dimBlock(TILE_DIM, TILE_DIM);

  // This will pick the best possible CUDA capable device
  findCudaDevice(argc, (const char **)argv);

  // CUDA events for timing
  cudaEvent_t start, stop;
  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&stop));

  // Host allocation and initialization
  const size_t numElements = (size_t)nx * ny;
  const size_t rowBytes = nx * sizeof(float);  // bytes of payload per row

  float *h_idata = (float *)malloc(sizeof(float) * numElements);
  float *h_odata = (float *)malloc(sizeof(float) * numElements);
  float *gold = (float *)malloc(sizeof(float) * numElements);

  if (h_idata == NULL || h_odata == NULL || gold == NULL) {
    fprintf(stderr, "Host memory allocation failed\n");
    free(h_idata);
    free(h_odata);
    free(gold);
    exit(EXIT_FAILURE);
  }

  for (size_t i = 0; i < numElements; ++i) {
    h_idata[i] = (float)i;
  }

  // Device memory allocation
  // Pitch linear input data. Each pitched allocation keeps its own pitch so
  // the input pitch is not overwritten by the output allocation below.
  float *d_idataPL;
  size_t d_ipitchBytes;

  checkCudaErrors(
      cudaMallocPitch((void **)&d_idataPL, &d_ipitchBytes, rowBytes, ny));

  // Array input data
  cudaArray *d_idataArray;
  cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();

  checkCudaErrors(cudaMallocArray(&d_idataArray, &channelDesc, nx, ny));

  // Pitch linear output data
  float *d_odata;
  size_t d_opitchBytes;

  checkCudaErrors(
      cudaMallocPitch((void **)&d_odata, &d_opitchBytes, rowBytes, ny));

  // Copy host data to device
  // Pitch linear (host rows are tightly packed)
  const size_t h_pitchBytes = rowBytes;

  checkCudaErrors(cudaMemcpy2D(d_idataPL, d_ipitchBytes, h_idata, h_pitchBytes,
                               rowBytes, ny, cudaMemcpyHostToDevice));

  // Array (cudaMemcpy2DToArray replaces the deprecated cudaMemcpyToArray)
  checkCudaErrors(cudaMemcpy2DToArray(d_idataArray, 0, 0, h_idata,
                                      h_pitchBytes, rowBytes, ny,
                                      cudaMemcpyHostToDevice));

  // Both textures share the same sampling state: normalized coordinates
  // (required for wrap addressing), point filtering, unconverted reads.
  cudaTextureDesc texDescr;
  memset(&texDescr, 0, sizeof(cudaTextureDesc));

  texDescr.normalizedCoords = true;
  texDescr.filterMode = cudaFilterModePoint;
  texDescr.addressMode[0] = cudaAddressModeWrap;
  texDescr.addressMode[1] = cudaAddressModeWrap;
  texDescr.readMode = cudaReadModeElementType;

  // Texture over the pitch linear input
  cudaTextureObject_t texRefPL;
  cudaResourceDesc texRes;
  memset(&texRes, 0, sizeof(cudaResourceDesc));

  texRes.resType = cudaResourceTypePitch2D;
  texRes.res.pitch2D.devPtr = d_idataPL;
  texRes.res.pitch2D.desc = channelDesc;
  texRes.res.pitch2D.width = nx;
  texRes.res.pitch2D.height = ny;
  // Must be the pitch of the device allocation, not the host row size.
  texRes.res.pitch2D.pitchInBytes = d_ipitchBytes;

  checkCudaErrors(cudaCreateTextureObject(&texRefPL, &texRes, &texDescr, NULL));

  // Texture over the CUDA array input
  cudaTextureObject_t texRefArray;
  memset(&texRes, 0, sizeof(cudaResourceDesc));

  texRes.resType = cudaResourceTypeArray;
  texRes.res.array.array = d_idataArray;

  checkCudaErrors(
      cudaCreateTextureObject(&texRefArray, &texRes, &texDescr, NULL));

  // Reference calculation: periodic shift computed on the host
  for (int j = 0; j < ny; ++j) {
    int jshift = (j + y_shift) % ny;

    for (int i = 0; i < nx; ++i) {
      int ishift = (i + x_shift) % nx;
      gold[j * nx + i] = h_idata[jshift * nx + ishift];
    }
  }

  // The kernels take the output row stride in elements, not bytes.
  const int odataPitchElems = (int)(d_opitchBytes / sizeof(float));

  // Run ShiftPitchLinear kernel
  checkCudaErrors(cudaMemset2D(d_odata, d_opitchBytes, 0, rowBytes, ny));

  checkCudaErrors(cudaEventRecord(start, 0));

  for (int i = 0; i < NUM_REPS; ++i) {
    shiftPitchLinear<<<dimGrid, dimBlock>>>(d_odata, odataPitchElems, nx, ny,
                                            x_shift, y_shift, texRefPL);
  }

  // Kernel launches do not return a status; pick up launch errors here.
  checkCudaErrors(cudaGetLastError());

  checkCudaErrors(cudaEventRecord(stop, 0));
  checkCudaErrors(cudaEventSynchronize(stop));
  float timePL;
  checkCudaErrors(cudaEventElapsedTime(&timePL, start, stop));

  // Check results
  checkCudaErrors(cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_opitchBytes,
                               rowBytes, ny, cudaMemcpyDeviceToHost));

  bool res = compareData(gold, h_odata, nx * ny, 0.0f, 0.15f);

  bTestResult = true;

  if (res == false) {
    printf("*** shiftPitchLinear failed ***\n");
    bTestResult = false;
  }

  // Run ShiftArray kernel
  checkCudaErrors(cudaMemset2D(d_odata, d_opitchBytes, 0, rowBytes, ny));
  checkCudaErrors(cudaEventRecord(start, 0));

  for (int i = 0; i < NUM_REPS; ++i) {
    shiftArray<<<dimGrid, dimBlock>>>(d_odata, odataPitchElems, nx, ny,
                                      x_shift, y_shift, texRefArray);
  }

  // Kernel launches do not return a status; pick up launch errors here.
  checkCudaErrors(cudaGetLastError());

  checkCudaErrors(cudaEventRecord(stop, 0));
  checkCudaErrors(cudaEventSynchronize(stop));
  float timeArray;
  checkCudaErrors(cudaEventElapsedTime(&timeArray, start, stop));

  // Check results
  checkCudaErrors(cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_opitchBytes,
                               rowBytes, ny, cudaMemcpyDeviceToHost));
  res = compareData(gold, h_odata, nx * ny, 0.0f, 0.15f);

  if (res == false) {
    printf("*** shiftArray failed ***\n");
    bTestResult = false;
  }

  // Factor 2: every element is read once and written once per launch.
  // Event times are in milliseconds, hence the factor 1000.
  float bandwidthPL =
      2.f * 1000.f * nx * ny * sizeof(float) / (1.e+9f) / (timePL / NUM_REPS);
  float bandwidthArray = 2.f * 1000.f * nx * ny * sizeof(float) / (1.e+9f) /
                         (timeArray / NUM_REPS);

  printf("\nBandwidth (GB/s) for pitch linear: %.2e; for array: %.2e\n",
         bandwidthPL, bandwidthArray);

  float fetchRatePL = nx * ny / 1.e+6f / (timePL / (1000.0f * NUM_REPS));
  float fetchRateArray = nx * ny / 1.e+6f / (timeArray / (1000.0f * NUM_REPS));

  printf(
      "\nTexture fetch rate (Mpix/s) for pitch linear: "
      "%.2e; for array: %.2e\n\n",
      fetchRatePL, fetchRateArray);

  // Cleanup
  free(h_idata);
  free(h_odata);
  free(gold);

  checkCudaErrors(cudaDestroyTextureObject(texRefPL));
  checkCudaErrors(cudaDestroyTextureObject(texRefArray));
  checkCudaErrors(cudaFree(d_idataPL));
  checkCudaErrors(cudaFreeArray(d_idataArray));
  checkCudaErrors(cudaFree(d_odata));

  checkCudaErrors(cudaEventDestroy(start));
  checkCudaErrors(cudaEventDestroy(stop));
}
|