/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * This sample illustrates the usage of CUDA events for both GPU timing and * overlapping CPU and GPU execution. Events are inserted into a stream * of CUDA calls. Since CUDA stream calls are asynchronous, the CPU can * perform computations while GPU is executing (including DMA memcopies * between the host and device). CPU can query CUDA events to determine * whether GPU has completed tasks. */ // includes, system #include // includes CUDA Runtime #include // includes, project #include #include // helper utility functions __global__ void increment_kernel(int *g_data, int inc_value) { int idx = blockIdx.x * blockDim.x + threadIdx.x; g_data[idx] = g_data[idx] + inc_value; } bool correct_output(int *data, const int n, const int x) { for (int i = 0; i < n; i++) if (data[i] != x) { printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x); return false; } return true; } int main(int argc, char *argv[]) { int devID; cudaDeviceProp deviceProps; printf("[%s] - Starting...\n", argv[0]); // This will pick the best possible CUDA capable device devID = findCudaDevice(argc, (const char **)argv); // get device name checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); printf("CUDA device [%s]\n", deviceProps.name); int n = 16 * 1024 * 1024; int nbytes = n * sizeof(int); int value = 26; // allocate host memory int *a = 0; checkCudaErrors(cudaMallocHost((void **)&a, nbytes)); memset(a, 0, nbytes); // allocate device memory int *d_a = 0; checkCudaErrors(cudaMalloc((void **)&d_a, nbytes)); checkCudaErrors(cudaMemset(d_a, 255, nbytes)); // set kernel launch configuration dim3 threads = dim3(512, 1); dim3 blocks = dim3(n / threads.x, 1); // create cuda event handles cudaEvent_t start, stop; checkCudaErrors(cudaEventCreate(&start)); checkCudaErrors(cudaEventCreate(&stop)); StopWatchInterface *timer = NULL; sdkCreateTimer(&timer); sdkResetTimer(&timer); checkCudaErrors(cudaDeviceSynchronize()); float gpu_time = 0.0f; // asynchronously issue work to the GPU (all to stream 0) sdkStartTimer(&timer); cudaEventRecord(start, 0); cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0); increment_kernel<<>>(d_a, value); cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0); cudaEventRecord(stop, 0); sdkStopTimer(&timer); // have CPU do some work while waiting for stage 1 to finish unsigned long int counter = 0; while (cudaEventQuery(stop) == cudaErrorNotReady) { counter++; } checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop)); // print the cpu and gpu times printf("time spent executing by the GPU: %.2f\n", gpu_time); printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer)); printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter); // check the output for correctness bool bFinalResults = correct_output(a, n, value); // release resources checkCudaErrors(cudaEventDestroy(start)); checkCudaErrors(cudaEventDestroy(stop)); checkCudaErrors(cudaFreeHost(a)); checkCudaErrors(cudaFree(d_a)); exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE); }