cuda-samples/Samples/0_Introduction/simpleMultiCopy/simpleMultiCopy.cu
2022-01-13 11:35:24 +05:30

374 lines
12 KiB
Plaintext

/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Quadro and Tesla GPUs with compute capability >= 2.0 can overlap two
* memcopies with kernel execution. This sample illustrates the usage of CUDA
* streams to achieve overlapping of kernel execution with copying data to and
* from the device.
*
* Additionally, this sample uses CUDA events to measure elapsed time for
* CUDA calls. Events are a part of CUDA API and provide a system independent
* way to measure execution times on CUDA devices with approximately 0.5
* microsecond precision.
*
* Elapsed times are averaged over nreps repetitions (10 by default).
*
*/
const char *sSDKname = "simpleMultiCopy";
// includes, system
#include <stdio.h>
// include CUDA
#include <cuda_runtime.h>
// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper for shared that are common to CUDA Samples
// includes, kernels
// Declare the CUDA kernels here and main() code that is needed to launch
// Compute workload on the system
__global__ void incKernel(int *g_out, int *g_in, int N, int inner_reps) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < N) {
for (int i = 0; i < inner_reps; ++i) {
g_out[idx] = g_in[idx] + 1;
}
}
}
#define STREAM_COUNT 4
// Uncomment to simulate data source/sink IO times
//#define SIMULATE_IO
int *h_data_source;
int *h_data_sink;
int *h_data_in[STREAM_COUNT];
int *d_data_in[STREAM_COUNT];
int *h_data_out[STREAM_COUNT];
int *d_data_out[STREAM_COUNT];
cudaEvent_t cycleDone[STREAM_COUNT];
cudaStream_t stream[STREAM_COUNT];
cudaEvent_t start, stop;
int N = 1 << 22;
int nreps = 10; // number of times each experiment is repeated
int inner_reps = 5;
int memsize;
dim3 block(512);
dim3 grid;
int thread_blocks;
float processWithStreams(int streams_used);
void init();
bool test();
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char *argv[]) {
int cuda_device = 0;
float scale_factor;
cudaDeviceProp deviceProp;
printf("[%s] - Starting...\n", sSDKname);
if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
cuda_device = getCmdLineArgumentInt(argc, (const char **)argv, "device=");
if (cuda_device < 0) {
printf("Invalid command line parameters\n");
exit(EXIT_FAILURE);
} else {
printf("cuda_device = %d\n", cuda_device);
cuda_device = gpuDeviceInit(cuda_device);
if (cuda_device < 0) {
printf("No CUDA Capable devices found, exiting...\n");
exit(EXIT_SUCCESS);
}
}
} else {
// Otherwise pick the device with the highest Gflops/s
cuda_device = gpuGetMaxGflopsDeviceId();
checkCudaErrors(cudaSetDevice(cuda_device));
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
printf("> Using CUDA device [%d]: %s\n", cuda_device, deviceProp.name);
}
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
printf("[%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n", deviceProp.name,
deviceProp.multiProcessorCount,
_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
deviceProp.multiProcessorCount);
// Anything that is less than 32 Cores will have scaled down workload
scale_factor =
max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
(float)deviceProp.multiProcessorCount)),
1.0f);
N = (int)((float)N / scale_factor);
printf("> Device name: %s\n", deviceProp.name);
printf("> CUDA Capability %d.%d hardware with %d multi-processors\n",
deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
printf("> scale_factor = %.2f\n", 1.0f / scale_factor);
printf("> array_size = %d\n\n", N);
memsize = N * sizeof(int);
thread_blocks = N / block.x;
grid.x = thread_blocks % 65535;
grid.y = (thread_blocks / 65535 + 1);
// Allocate resources
h_data_source = (int *)malloc(memsize);
h_data_sink = (int *)malloc(memsize);
for (int i = 0; i < STREAM_COUNT; ++i) {
checkCudaErrors(
cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault));
checkCudaErrors(cudaMalloc(&d_data_in[i], memsize));
checkCudaErrors(cudaMemset(d_data_in[i], 0, memsize));
checkCudaErrors(
cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault));
checkCudaErrors(cudaMalloc(&d_data_out[i], memsize));
checkCudaErrors(cudaStreamCreate(&stream[i]));
checkCudaErrors(cudaEventCreate(&cycleDone[i]));
cudaEventRecord(cycleDone[i], stream[i]);
}
cudaEventCreate(&start);
cudaEventCreate(&stop);
init();
// Kernel warmup
incKernel<<<grid, block>>>(d_data_out[0], d_data_in[0], N, inner_reps);
// Time copies and kernel
cudaEventRecord(start, 0);
checkCudaErrors(cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize,
cudaMemcpyHostToDevice, 0));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float memcpy_h2d_time;
cudaEventElapsedTime(&memcpy_h2d_time, start, stop);
cudaEventRecord(start, 0);
checkCudaErrors(cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize,
cudaMemcpyDeviceToHost, 0));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float memcpy_d2h_time;
cudaEventElapsedTime(&memcpy_d2h_time, start, stop);
cudaEventRecord(start, 0);
incKernel<<<grid, block, 0, 0>>>(d_data_out[0], d_data_in[0], N, inner_reps);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float kernel_time;
cudaEventElapsedTime(&kernel_time, start, stop);
printf("\n");
printf("Relevant properties of this CUDA device\n");
printf(
"(%s) Can overlap one CPU<>GPU data transfer with GPU kernel execution "
"(device property \"deviceOverlap\")\n",
deviceProp.deviceOverlap ? "X" : " ");
// printf("(%s) Can execute several GPU kernels simultaneously (compute
// capability >= 2.0)\n", deviceProp.major >= 2 ? "X": " ");
printf(
"(%s) Can overlap two CPU<>GPU data transfers with GPU kernel execution\n"
" (Compute Capability >= 2.0 AND (Tesla product OR Quadro "
"4000/5000/6000/K5000)\n",
(deviceProp.major >= 2 && deviceProp.asyncEngineCount > 1) ? "X" : " ");
printf("\n");
printf("Measured timings (throughput):\n");
printf(" Memcpy host to device\t: %f ms (%f GB/s)\n", memcpy_h2d_time,
(memsize * 1e-6) / memcpy_h2d_time);
printf(" Memcpy device to host\t: %f ms (%f GB/s)\n", memcpy_d2h_time,
(memsize * 1e-6) / memcpy_d2h_time);
printf(" Kernel\t\t\t: %f ms (%f GB/s)\n", kernel_time,
(inner_reps * memsize * 2e-6) / kernel_time);
printf("\n");
printf(
"Theoretical limits for speedup gained from overlapped data "
"transfers:\n");
printf("No overlap at all (transfer-kernel-transfer): %f ms \n",
memcpy_h2d_time + memcpy_d2h_time + kernel_time);
printf("Compute can overlap with one transfer: %f ms\n",
max((memcpy_h2d_time + memcpy_d2h_time), kernel_time));
printf("Compute can overlap with both data transfers: %f ms\n",
max(max(memcpy_h2d_time, memcpy_d2h_time), kernel_time));
// Process pipelined work
float serial_time = processWithStreams(1);
float overlap_time = processWithStreams(STREAM_COUNT);
printf("\nAverage measured timings over %d repetitions:\n", nreps);
printf(" Avg. time when execution fully serialized\t: %f ms\n",
serial_time / nreps);
printf(" Avg. time when overlapped using %d streams\t: %f ms\n", STREAM_COUNT,
overlap_time / nreps);
printf(" Avg. speedup gained (serialized - overlapped)\t: %f ms\n",
(serial_time - overlap_time) / nreps);
printf("\nMeasured throughput:\n");
printf(" Fully serialized execution\t\t: %f GB/s\n",
(nreps * (memsize * 2e-6)) / serial_time);
printf(" Overlapped using %d streams\t\t: %f GB/s\n", STREAM_COUNT,
(nreps * (memsize * 2e-6)) / overlap_time);
// Verify the results, we will use the results for final output
bool bResults = test();
// Free resources
free(h_data_source);
free(h_data_sink);
for (int i = 0; i < STREAM_COUNT; ++i) {
cudaFreeHost(h_data_in[i]);
cudaFree(d_data_in[i]);
cudaFreeHost(h_data_out[i]);
cudaFree(d_data_out[i]);
cudaStreamDestroy(stream[i]);
cudaEventDestroy(cycleDone[i]);
}
cudaEventDestroy(start);
cudaEventDestroy(stop);
// Test result
exit(bResults ? EXIT_SUCCESS : EXIT_FAILURE);
}
float processWithStreams(int streams_used) {
int current_stream = 0;
float time;
// Do processing in a loop
//
// Note: All memory commands are processed in the order they are issued,
// independent of the stream they are enqueued in. Hence the pattern by
// which the copy and kernel commands are enqueued in the stream
// has an influence on the achieved overlap.
cudaEventRecord(start, 0);
for (int i = 0; i < nreps; ++i) {
int next_stream = (current_stream + 1) % streams_used;
#ifdef SIMULATE_IO
// Store the result
memcpy(h_data_sink, h_data_out[current_stream], memsize);
// Read new input
memcpy(h_data_in[next_stream], h_data_source, memsize);
#endif
// Ensure that processing and copying of the last cycle has finished
cudaEventSynchronize(cycleDone[next_stream]);
// Process current frame
incKernel<<<grid, block, 0, stream[current_stream]>>>(
d_data_out[current_stream], d_data_in[current_stream], N, inner_reps);
// Upload next frame
checkCudaErrors(
cudaMemcpyAsync(d_data_in[next_stream], h_data_in[next_stream], memsize,
cudaMemcpyHostToDevice, stream[next_stream]));
// Download current frame
checkCudaErrors(cudaMemcpyAsync(
h_data_out[current_stream], d_data_out[current_stream], memsize,
cudaMemcpyDeviceToHost, stream[current_stream]));
checkCudaErrors(
cudaEventRecord(cycleDone[current_stream], stream[current_stream]));
current_stream = next_stream;
}
cudaEventRecord(stop, 0);
cudaDeviceSynchronize();
cudaEventElapsedTime(&time, start, stop);
return time;
}
void init() {
for (int i = 0; i < N; ++i) {
h_data_source[i] = 0;
}
for (int i = 0; i < STREAM_COUNT; ++i) {
memcpy(h_data_in[i], h_data_source, memsize);
}
}
bool test() {
bool passed = true;
for (int j = 0; j < STREAM_COUNT; ++j) {
for (int i = 0; i < N; ++i) {
passed &= (h_data_out[j][i] == 1);
}
}
return passed;
}