/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Quadro and Tesla GPUs with compute capability >= 2.0 can overlap two
 * memcopies with kernel execution. This sample illustrates the usage of CUDA
 * streams to achieve overlapping of kernel execution with copying data to
 * and from the device.
 *
 * Additionally, this sample uses CUDA events to measure elapsed time for
 * CUDA calls. Events are a part of the CUDA API and provide a
 * system-independent way to measure execution times on CUDA devices with
 * approximately 0.5 microsecond precision.
 *
 * Elapsed times are averaged over nreps repetitions (10 by default).
 */
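
/*
 * Overview of the scheme implemented below: the sample keeps STREAM_COUNT
 * host/device buffer pairs and cycles through them in processWithStreams().
 * In each iteration the kernel for the current buffer, the upload of the
 * next input buffer, and the download of the current result are issued into
 * different streams, so a device with two copy engines can overlap both
 * transfers with compute.
 */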

const char *sSDKname = "simpleMultiCopy";

// includes, system
#include <stdio.h>

// include CUDA
#include <cuda_runtime.h>

// includes, project
#include <helper_cuda.h>
#include <helper_functions.h>  // helper for shared functions common to CUDA Samples

// includes, kernels
// Declare the CUDA kernel here, together with the main() code needed to
// launch the compute workload on the system.
__global__ void incKernel(int *g_out, int *g_in, int N, int inner_reps) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;

  if (idx < N) {
    // The loop repeats the same increment inner_reps times purely to
    // lengthen the kernel's runtime for the overlap measurements.
    for (int i = 0; i < inner_reps; ++i) {
      g_out[idx] = g_in[idx] + 1;
    }
  }
}

#define STREAM_COUNT 4

// Uncomment to simulate data source/sink IO times
//#define SIMULATE_IO

int *h_data_source;
int *h_data_sink;

int *h_data_in[STREAM_COUNT];
int *d_data_in[STREAM_COUNT];

int *h_data_out[STREAM_COUNT];
int *d_data_out[STREAM_COUNT];

cudaEvent_t cycleDone[STREAM_COUNT];
cudaStream_t stream[STREAM_COUNT];
cudaEvent_t start, stop;

int N = 1 << 22;
int nreps = 10;  // number of times each experiment is repeated
int inner_reps = 5;

int memsize;

dim3 block(512);
dim3 grid;

int thread_blocks;

float processWithStreams(int streams_used);
void init();
bool test();

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char *argv[]) {
  int cuda_device = 0;
  float scale_factor;
  cudaDeviceProp deviceProp;

  printf("[%s] - Starting...\n", sSDKname);

  if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
    cuda_device = getCmdLineArgumentInt(argc, (const char **)argv, "device=");

    if (cuda_device < 0) {
      printf("Invalid command line parameters\n");
      exit(EXIT_FAILURE);
    } else {
      printf("cuda_device = %d\n", cuda_device);
      cuda_device = gpuDeviceInit(cuda_device);

      if (cuda_device < 0) {
        printf("No CUDA Capable devices found, exiting...\n");
        exit(EXIT_SUCCESS);
      }
    }
  } else {
    // Otherwise pick the device with the highest Gflops/s
    cuda_device = gpuGetMaxGflopsDeviceId();
    checkCudaErrors(cudaSetDevice(cuda_device));
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
    printf("> Using CUDA device [%d]: %s\n", cuda_device, deviceProp.name);
  }

  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
  printf("[%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n", deviceProp.name,
         deviceProp.multiProcessorCount,
         _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
         _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
             deviceProp.multiProcessorCount);

  // Anything that has less than 32 cores will have a scaled-down workload
  scale_factor =
      max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
                    (float)deviceProp.multiProcessorCount)),
          1.0f);
  N = (int)((float)N / scale_factor);

  printf("> Device name: %s\n", deviceProp.name);
  printf("> CUDA Capability %d.%d hardware with %d multi-processors\n",
         deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
  printf("> scale_factor = %.2f\n", 1.0f / scale_factor);
  printf("> array_size = %d\n\n", N);

  memsize = N * sizeof(int);

  thread_blocks = N / block.x;

  grid.x = thread_blocks % 65535;
  grid.y = (thread_blocks / 65535 + 1);

  // Allocate resources
  h_data_source = (int *)malloc(memsize);
  h_data_sink = (int *)malloc(memsize);
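
  // The per-stream host buffers below are allocated with cudaHostAlloc,
  // i.e. page-locked (pinned) memory. Pinned memory is what allows
  // cudaMemcpyAsync to execute truly asynchronously and overlap with kernel
  // work; with pageable malloc buffers the copies would be staged and
  // effectively serialize.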
  for (int i = 0; i < STREAM_COUNT; ++i) {
    checkCudaErrors(
        cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault));
    checkCudaErrors(cudaMalloc(&d_data_in[i], memsize));
    checkCudaErrors(cudaMemset(d_data_in[i], 0, memsize));

    checkCudaErrors(
        cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault));
    checkCudaErrors(cudaMalloc(&d_data_out[i], memsize));

    checkCudaErrors(cudaStreamCreate(&stream[i]));
    checkCudaErrors(cudaEventCreate(&cycleDone[i]));

    cudaEventRecord(cycleDone[i], stream[i]);
  }

  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  init();

  // Kernel warmup
  incKernel<<<grid, block>>>(d_data_out[0], d_data_in[0], N, inner_reps);

  // Time copies and kernel
  cudaEventRecord(start, 0);
  checkCudaErrors(cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize,
                                  cudaMemcpyHostToDevice, 0));
  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);

  float memcpy_h2d_time;
  cudaEventElapsedTime(&memcpy_h2d_time, start, stop);

  cudaEventRecord(start, 0);
  checkCudaErrors(cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize,
                                  cudaMemcpyDeviceToHost, 0));
  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);

  float memcpy_d2h_time;
  cudaEventElapsedTime(&memcpy_d2h_time, start, stop);

  cudaEventRecord(start, 0);
  incKernel<<<grid, block>>>(d_data_out[0], d_data_in[0], N, inner_reps);
  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);

  float kernel_time;
  cudaEventElapsedTime(&kernel_time, start, stop);

  printf("\n");
  printf("Relevant properties of this CUDA device\n");
  printf(
      "(%s) Can overlap one CPU<>GPU data transfer with GPU kernel execution "
      "(device property \"deviceOverlap\")\n",
      deviceProp.deviceOverlap ? "X" : " ");
  // printf("(%s) Can execute several GPU kernels simultaneously (compute
  // capability >= 2.0)\n", deviceProp.major >= 2 ? "X": " ");
  printf(
      "(%s) Can overlap two CPU<>GPU data transfers with GPU kernel "
      "execution\n"
      "    (Compute Capability >= 2.0 AND (Tesla product OR Quadro "
      "4000/5000/6000/K5000))\n",
      (deviceProp.major >= 2 && deviceProp.asyncEngineCount > 1) ? "X" : " ");

  printf("\n");
  printf("Measured timings (throughput):\n");
  printf(" Memcpy host to device\t: %f ms (%f GB/s)\n", memcpy_h2d_time,
         (memsize * 1e-6) / memcpy_h2d_time);
  printf(" Memcpy device to host\t: %f ms (%f GB/s)\n", memcpy_d2h_time,
         (memsize * 1e-6) / memcpy_d2h_time);
  printf(" Kernel\t\t\t: %f ms (%f GB/s)\n", kernel_time,
         (inner_reps * memsize * 2e-6) / kernel_time);

  printf("\n");
  printf(
      "Theoretical limits for speedup gained from overlapped data "
      "transfers:\n");
  printf("No overlap at all (transfer-kernel-transfer): %f ms \n",
         memcpy_h2d_time + memcpy_d2h_time + kernel_time);
  printf("Compute can overlap with one transfer: %f ms\n",
         max((memcpy_h2d_time + memcpy_d2h_time), kernel_time));
  printf("Compute can overlap with both data transfers: %f ms\n",
         max(max(memcpy_h2d_time, memcpy_d2h_time), kernel_time));
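
  // The pipeline below is run twice: once with a single stream, which
  // serializes every copy and kernel, and once with STREAM_COUNT streams,
  // which lets copies and kernels from different iterations overlap.
  // Comparing the two against the theoretical limits printed above shows
  // how much of the possible overlap is actually realized.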
  // Process pipelined work
  float serial_time = processWithStreams(1);
  float overlap_time = processWithStreams(STREAM_COUNT);

  printf("\nAverage measured timings over %d repetitions:\n", nreps);
  printf(" Avg. time when execution fully serialized\t: %f ms\n",
         serial_time / nreps);
  printf(" Avg. time when overlapped using %d streams\t: %f ms\n",
         STREAM_COUNT, overlap_time / nreps);
  printf(" Avg. speedup gained (serialized - overlapped)\t: %f ms\n",
         (serial_time - overlap_time) / nreps);

  // Each repetition moves memsize bytes up plus memsize bytes down, hence
  // the factor of 2; the 1e-6 scale converts bytes/ms to GB/s.
  printf("\nMeasured throughput:\n");
  printf(" Fully serialized execution\t\t: %f GB/s\n",
         (nreps * (memsize * 2e-6)) / serial_time);
  printf(" Overlapped using %d streams\t\t: %f GB/s\n", STREAM_COUNT,
         (nreps * (memsize * 2e-6)) / overlap_time);

  // Verify the results; the outcome determines the final exit status
  bool bResults = test();

  // Free resources
  free(h_data_source);
  free(h_data_sink);

  for (int i = 0; i < STREAM_COUNT; ++i) {
    cudaFreeHost(h_data_in[i]);
    cudaFree(d_data_in[i]);

    cudaFreeHost(h_data_out[i]);
    cudaFree(d_data_out[i]);

    cudaStreamDestroy(stream[i]);
    cudaEventDestroy(cycleDone[i]);
  }

  cudaEventDestroy(start);
  cudaEventDestroy(stop);

  // Test result
  exit(bResults ? EXIT_SUCCESS : EXIT_FAILURE);
}

float processWithStreams(int streams_used) {
  int current_stream = 0;

  float time;

  // Do processing in a loop
  //
  // Note: All memory commands are processed in the order they are issued,
  // independent of the stream they are enqueued in. Hence the pattern by
  // which the copy and kernel commands are enqueued in the streams
  // has an influence on the achieved overlap.

  cudaEventRecord(start, 0);

  for (int i = 0; i < nreps; ++i) {
    int next_stream = (current_stream + 1) % streams_used;

#ifdef SIMULATE_IO
    // Store the result
    memcpy(h_data_sink, h_data_out[current_stream], memsize);

    // Read new input
    memcpy(h_data_in[next_stream], h_data_source, memsize);
#endif

    // Ensure that processing and copying of the last cycle has finished
    cudaEventSynchronize(cycleDone[next_stream]);

    // Process current frame
    incKernel<<<grid, block, 0, stream[current_stream]>>>(
        d_data_out[current_stream], d_data_in[current_stream], N, inner_reps);

    // Upload next frame
    checkCudaErrors(
        cudaMemcpyAsync(d_data_in[next_stream], h_data_in[next_stream],
                        memsize, cudaMemcpyHostToDevice, stream[next_stream]));

    // Download current frame
    checkCudaErrors(cudaMemcpyAsync(
        h_data_out[current_stream], d_data_out[current_stream], memsize,
        cudaMemcpyDeviceToHost, stream[current_stream]));

    checkCudaErrors(
        cudaEventRecord(cycleDone[current_stream], stream[current_stream]));

    current_stream = next_stream;
  }

  cudaEventRecord(stop, 0);

  cudaDeviceSynchronize();

  cudaEventElapsedTime(&time, start, stop);

  return time;
}

void init() {
  for (int i = 0; i < N; ++i) {
    h_data_source[i] = 0;
  }

  for (int i = 0; i < STREAM_COUNT; ++i) {
    memcpy(h_data_in[i], h_data_source, memsize);
  }
}

bool test() {
  bool passed = true;

  for (int j = 0; j < STREAM_COUNT; ++j) {
    for (int i = 0; i < N; ++i) {
      passed &= (h_data_out[j][i] == 1);
    }
  }

  return passed;
}
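
/*
 * Example invocation (the binary name is assumed from sSDKname; adjust to
 * match your build output):
 *   ./simpleMultiCopy              // auto-selects the highest Gflops/s device
 *   ./simpleMultiCopy --device=1   // runs on CUDA device 1
 */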