/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This sample illustrates the usage of CUDA events for both GPU timing and
 * overlapping CPU and GPU execution. Events are inserted into a stream
 * of CUDA calls. Since CUDA stream calls are asynchronous, the CPU can
 * perform computations while the GPU is executing (including DMA memcopies
 * between the host and device). The CPU can query CUDA events to determine
 * whether the GPU has completed tasks.
 */
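// Note: helper_cuda.h and helper_functions.h below come from the CUDA
// samples' Common directory; building this file outside the samples tree is
// assumed to require that directory on the include path
// (e.g. nvcc -I<cuda-samples>/Common <this file>).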
// includes, system
#include <stdio.h>

// includes CUDA Runtime
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>

// includes, project
#include <helper_cuda.h>
#include <helper_functions.h>  // helper utility functions
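// increment_kernel: each thread adds inc_value to exactly one element of
// g_data. The launch configuration in main() creates one thread per element
// (n is a multiple of the block size), so no bounds check is needed here.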
__global__ void increment_kernel(int *g_data, int inc_value) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  g_data[idx] = g_data[idx] + inc_value;
}

bool correct_output(int *data, const int n, const int x) {
  for (int i = 0; i < n; i++)
    if (data[i] != x) {
      printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
      return false;
    }

  return true;
}

int main(int argc, char *argv[]) {
  int devID;
  cudaDeviceProp deviceProps;

  printf("[%s] - Starting...\n", argv[0]);

  // This will pick the best possible CUDA capable device
  devID = findCudaDevice(argc, (const char **)argv);

  // get device name
  checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
  printf("CUDA device [%s]\n", deviceProps.name);

  int n = 16 * 1024 * 1024;
  int nbytes = n * sizeof(int);
  int value = 26;

  // allocate host memory
  int *a = 0;
  checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
  memset(a, 0, nbytes);
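  // Note: cudaMallocHost allocates page-locked (pinned) host memory. Pinned
  // memory is what allows the cudaMemcpyAsync calls below to return to the
  // CPU immediately and overlap the transfers with host execution; with
  // pageable memory the copies could fall back to a non-overlapping path.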
  // allocate device memory
  int *d_a = 0;
  checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
  checkCudaErrors(cudaMemset(d_a, 255, nbytes));

  // set kernel launch configuration
  dim3 threads = dim3(512, 1);
  dim3 blocks = dim3(n / threads.x, 1);

  // create cuda event handles
  cudaEvent_t start, stop;
  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&stop));

  StopWatchInterface *timer = NULL;
  sdkCreateTimer(&timer);
  sdkResetTimer(&timer);
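  // Block the CPU until the device is idle, so the timings below measure
  // only the work issued by this sample.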
  checkCudaErrors(cudaDeviceSynchronize());
  float gpu_time = 0.0f;

  // asynchronously issue work to the GPU (all to stream 0)
  checkCudaErrors(cudaProfilerStart());
  sdkStartTimer(&timer);
  cudaEventRecord(start, 0);
  cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0);
  increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
  cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0);
  cudaEventRecord(stop, 0);
  sdkStopTimer(&timer);
  checkCudaErrors(cudaProfilerStop());
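  // Everything above was merely *issued* to stream 0; control returned to
  // the CPU immediately, so the stopwatch measured launch/enqueue overhead,
  // not GPU execution time. The GPU may well still be running here.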
  // have the CPU do some work while waiting for the GPU to finish
  unsigned long int counter = 0;
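  // cudaEventQuery polls the event without blocking: it returns
  // cudaErrorNotReady while work queued before the event is still running,
  // and cudaSuccess once it has completed. cudaEventSynchronize(stop) would
  // be the blocking alternative, at the cost of the CPU doing no work.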
  while (cudaEventQuery(stop) == cudaErrorNotReady) {
    counter++;
  }

  checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));
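  // cudaEventElapsedTime reports the time between the two recorded events in
  // milliseconds (the documented resolution is roughly half a microsecond).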
  // print the cpu and gpu times (both in milliseconds)
  printf("time spent executing by the GPU: %.2f ms\n", gpu_time);
  printf("time spent by CPU in CUDA calls: %.2f ms\n", sdkGetTimerValue(&timer));
  printf("CPU executed %lu iterations while waiting for GPU to finish\n",
         counter);

  // check the output for correctness
  bool bFinalResults = correct_output(a, n, value);

  // release resources
  checkCudaErrors(cudaEventDestroy(start));
  checkCudaErrors(cudaEventDestroy(stop));
  checkCudaErrors(cudaFreeHost(a));
  checkCudaErrors(cudaFree(d_a));

  exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
}