mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2024-11-28 20:39:17 +08:00
374 lines
12 KiB
Plaintext
374 lines
12 KiB
Plaintext
|
/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||
|
*
|
||
|
* Redistribution and use in source and binary forms, with or without
|
||
|
* modification, are permitted provided that the following conditions
|
||
|
* are met:
|
||
|
* * Redistributions of source code must retain the above copyright
|
||
|
* notice, this list of conditions and the following disclaimer.
|
||
|
* * Redistributions in binary form must reproduce the above copyright
|
||
|
* notice, this list of conditions and the following disclaimer in the
|
||
|
* documentation and/or other materials provided with the distribution.
|
||
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
||
|
* contributors may be used to endorse or promote products derived
|
||
|
* from this software without specific prior written permission.
|
||
|
*
|
||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||
|
*/
|
||
|
|
||
|
/*
|
||
|
* Quadro and Tesla GPUs with compute capability >= 2.0 can overlap two
|
||
|
* memcopies with kernel execution. This sample illustrates the usage of CUDA
|
||
|
* streams to achieve overlapping of kernel execution with copying data to and
|
||
|
* from the device.
|
||
|
*
|
||
|
* Additionally, this sample uses CUDA events to measure elapsed time for
|
||
|
* CUDA calls. Events are a part of CUDA API and provide a system independent
|
||
|
* way to measure execution times on CUDA devices with approximately 0.5
|
||
|
* microsecond precision.
|
||
|
*
|
||
|
* Elapsed times are averaged over nreps repetitions (10 by default).
|
||
|
*
|
||
|
*/
|
||
|
|
||
|
const char *sSDKname = "simpleMultiCopy";
|
||
|
|
||
|
// includes, system
|
||
|
#include <stdio.h>
|
||
|
|
||
|
// include CUDA
|
||
|
#include <cuda_runtime.h>
|
||
|
|
||
|
// includes, project
|
||
|
#include <helper_cuda.h>
|
||
|
#include <helper_functions.h> // helper for shared that are common to CUDA Samples
|
||
|
|
||
|
// includes, kernels
|
||
|
// Declare the CUDA kernels here and main() code that is needed to launch
|
||
|
// Compute workload on the system
|
||
|
__global__ void incKernel(int *g_out, int *g_in, int N, int inner_reps) {
|
||
|
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||
|
|
||
|
if (idx < N) {
|
||
|
for (int i = 0; i < inner_reps; ++i) {
|
||
|
g_out[idx] = g_in[idx] + 1;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
#define STREAM_COUNT 4
|
||
|
|
||
|
// Uncomment to simulate data source/sink IO times
|
||
|
//#define SIMULATE_IO
|
||
|
|
||
|
int *h_data_source;
|
||
|
int *h_data_sink;
|
||
|
|
||
|
int *h_data_in[STREAM_COUNT];
|
||
|
int *d_data_in[STREAM_COUNT];
|
||
|
|
||
|
int *h_data_out[STREAM_COUNT];
|
||
|
int *d_data_out[STREAM_COUNT];
|
||
|
|
||
|
cudaEvent_t cycleDone[STREAM_COUNT];
|
||
|
cudaStream_t stream[STREAM_COUNT];
|
||
|
|
||
|
cudaEvent_t start, stop;
|
||
|
|
||
|
int N = 1 << 22;
|
||
|
int nreps = 10; // number of times each experiment is repeated
|
||
|
int inner_reps = 5;
|
||
|
|
||
|
int memsize;
|
||
|
|
||
|
dim3 block(512);
|
||
|
dim3 grid;
|
||
|
|
||
|
int thread_blocks;
|
||
|
|
||
|
float processWithStreams(int streams_used);
|
||
|
void init();
|
||
|
bool test();
|
||
|
|
||
|
////////////////////////////////////////////////////////////////////////////////
|
||
|
// Program main
|
||
|
////////////////////////////////////////////////////////////////////////////////
|
||
|
int main(int argc, char *argv[]) {
|
||
|
int cuda_device = 0;
|
||
|
float scale_factor;
|
||
|
cudaDeviceProp deviceProp;
|
||
|
|
||
|
printf("[%s] - Starting...\n", sSDKname);
|
||
|
|
||
|
if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
|
||
|
cuda_device = getCmdLineArgumentInt(argc, (const char **)argv, "device=");
|
||
|
|
||
|
if (cuda_device < 0) {
|
||
|
printf("Invalid command line parameters\n");
|
||
|
exit(EXIT_FAILURE);
|
||
|
} else {
|
||
|
printf("cuda_device = %d\n", cuda_device);
|
||
|
cuda_device = gpuDeviceInit(cuda_device);
|
||
|
|
||
|
if (cuda_device < 0) {
|
||
|
printf("No CUDA Capable devices found, exiting...\n");
|
||
|
exit(EXIT_SUCCESS);
|
||
|
}
|
||
|
}
|
||
|
} else {
|
||
|
// Otherwise pick the device with the highest Gflops/s
|
||
|
cuda_device = gpuGetMaxGflopsDeviceId();
|
||
|
checkCudaErrors(cudaSetDevice(cuda_device));
|
||
|
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
|
||
|
printf("> Using CUDA device [%d]: %s\n", cuda_device, deviceProp.name);
|
||
|
}
|
||
|
|
||
|
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
|
||
|
printf("[%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n", deviceProp.name,
|
||
|
deviceProp.multiProcessorCount,
|
||
|
_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
|
||
|
_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
|
||
|
deviceProp.multiProcessorCount);
|
||
|
|
||
|
// Anything that is less than 32 Cores will have scaled down workload
|
||
|
scale_factor =
|
||
|
max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
|
||
|
(float)deviceProp.multiProcessorCount)),
|
||
|
1.0f);
|
||
|
N = (int)((float)N / scale_factor);
|
||
|
|
||
|
printf("> Device name: %s\n", deviceProp.name);
|
||
|
printf("> CUDA Capability %d.%d hardware with %d multi-processors\n",
|
||
|
deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
|
||
|
printf("> scale_factor = %.2f\n", 1.0f / scale_factor);
|
||
|
printf("> array_size = %d\n\n", N);
|
||
|
|
||
|
memsize = N * sizeof(int);
|
||
|
|
||
|
thread_blocks = N / block.x;
|
||
|
|
||
|
grid.x = thread_blocks % 65535;
|
||
|
grid.y = (thread_blocks / 65535 + 1);
|
||
|
|
||
|
// Allocate resources
|
||
|
|
||
|
h_data_source = (int *)malloc(memsize);
|
||
|
h_data_sink = (int *)malloc(memsize);
|
||
|
|
||
|
for (int i = 0; i < STREAM_COUNT; ++i) {
|
||
|
checkCudaErrors(
|
||
|
cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault));
|
||
|
checkCudaErrors(cudaMalloc(&d_data_in[i], memsize));
|
||
|
checkCudaErrors(cudaMemset(d_data_in[i], 0, memsize));
|
||
|
|
||
|
checkCudaErrors(
|
||
|
cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault));
|
||
|
checkCudaErrors(cudaMalloc(&d_data_out[i], memsize));
|
||
|
|
||
|
checkCudaErrors(cudaStreamCreate(&stream[i]));
|
||
|
checkCudaErrors(cudaEventCreate(&cycleDone[i]));
|
||
|
|
||
|
cudaEventRecord(cycleDone[i], stream[i]);
|
||
|
}
|
||
|
|
||
|
cudaEventCreate(&start);
|
||
|
cudaEventCreate(&stop);
|
||
|
|
||
|
init();
|
||
|
|
||
|
// Kernel warmup
|
||
|
incKernel<<<grid, block>>>(d_data_out[0], d_data_in[0], N, inner_reps);
|
||
|
|
||
|
// Time copies and kernel
|
||
|
cudaEventRecord(start, 0);
|
||
|
checkCudaErrors(cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize,
|
||
|
cudaMemcpyHostToDevice, 0));
|
||
|
cudaEventRecord(stop, 0);
|
||
|
cudaEventSynchronize(stop);
|
||
|
|
||
|
float memcpy_h2d_time;
|
||
|
cudaEventElapsedTime(&memcpy_h2d_time, start, stop);
|
||
|
|
||
|
cudaEventRecord(start, 0);
|
||
|
checkCudaErrors(cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize,
|
||
|
cudaMemcpyDeviceToHost, 0));
|
||
|
cudaEventRecord(stop, 0);
|
||
|
cudaEventSynchronize(stop);
|
||
|
|
||
|
float memcpy_d2h_time;
|
||
|
cudaEventElapsedTime(&memcpy_d2h_time, start, stop);
|
||
|
|
||
|
cudaEventRecord(start, 0);
|
||
|
incKernel<<<grid, block, 0, 0>>>(d_data_out[0], d_data_in[0], N, inner_reps);
|
||
|
cudaEventRecord(stop, 0);
|
||
|
cudaEventSynchronize(stop);
|
||
|
|
||
|
float kernel_time;
|
||
|
cudaEventElapsedTime(&kernel_time, start, stop);
|
||
|
|
||
|
printf("\n");
|
||
|
printf("Relevant properties of this CUDA device\n");
|
||
|
printf(
|
||
|
"(%s) Can overlap one CPU<>GPU data transfer with GPU kernel execution "
|
||
|
"(device property \"deviceOverlap\")\n",
|
||
|
deviceProp.deviceOverlap ? "X" : " ");
|
||
|
// printf("(%s) Can execute several GPU kernels simultaneously (compute
|
||
|
// capability >= 2.0)\n", deviceProp.major >= 2 ? "X": " ");
|
||
|
printf(
|
||
|
"(%s) Can overlap two CPU<>GPU data transfers with GPU kernel execution\n"
|
||
|
" (Compute Capability >= 2.0 AND (Tesla product OR Quadro "
|
||
|
"4000/5000/6000/K5000)\n",
|
||
|
(deviceProp.major >= 2 && deviceProp.asyncEngineCount > 1) ? "X" : " ");
|
||
|
|
||
|
printf("\n");
|
||
|
printf("Measured timings (throughput):\n");
|
||
|
printf(" Memcpy host to device\t: %f ms (%f GB/s)\n", memcpy_h2d_time,
|
||
|
(memsize * 1e-6) / memcpy_h2d_time);
|
||
|
printf(" Memcpy device to host\t: %f ms (%f GB/s)\n", memcpy_d2h_time,
|
||
|
(memsize * 1e-6) / memcpy_d2h_time);
|
||
|
printf(" Kernel\t\t\t: %f ms (%f GB/s)\n", kernel_time,
|
||
|
(inner_reps * memsize * 2e-6) / kernel_time);
|
||
|
|
||
|
printf("\n");
|
||
|
printf(
|
||
|
"Theoretical limits for speedup gained from overlapped data "
|
||
|
"transfers:\n");
|
||
|
printf("No overlap at all (transfer-kernel-transfer): %f ms \n",
|
||
|
memcpy_h2d_time + memcpy_d2h_time + kernel_time);
|
||
|
printf("Compute can overlap with one transfer: %f ms\n",
|
||
|
max((memcpy_h2d_time + memcpy_d2h_time), kernel_time));
|
||
|
printf("Compute can overlap with both data transfers: %f ms\n",
|
||
|
max(max(memcpy_h2d_time, memcpy_d2h_time), kernel_time));
|
||
|
|
||
|
// Process pipelined work
|
||
|
float serial_time = processWithStreams(1);
|
||
|
float overlap_time = processWithStreams(STREAM_COUNT);
|
||
|
|
||
|
printf("\nAverage measured timings over %d repetitions:\n", nreps);
|
||
|
printf(" Avg. time when execution fully serialized\t: %f ms\n",
|
||
|
serial_time / nreps);
|
||
|
printf(" Avg. time when overlapped using %d streams\t: %f ms\n", STREAM_COUNT,
|
||
|
overlap_time / nreps);
|
||
|
printf(" Avg. speedup gained (serialized - overlapped)\t: %f ms\n",
|
||
|
(serial_time - overlap_time) / nreps);
|
||
|
|
||
|
printf("\nMeasured throughput:\n");
|
||
|
printf(" Fully serialized execution\t\t: %f GB/s\n",
|
||
|
(nreps * (memsize * 2e-6)) / serial_time);
|
||
|
printf(" Overlapped using %d streams\t\t: %f GB/s\n", STREAM_COUNT,
|
||
|
(nreps * (memsize * 2e-6)) / overlap_time);
|
||
|
|
||
|
// Verify the results, we will use the results for final output
|
||
|
bool bResults = test();
|
||
|
|
||
|
// Free resources
|
||
|
|
||
|
free(h_data_source);
|
||
|
free(h_data_sink);
|
||
|
|
||
|
for (int i = 0; i < STREAM_COUNT; ++i) {
|
||
|
cudaFreeHost(h_data_in[i]);
|
||
|
cudaFree(d_data_in[i]);
|
||
|
|
||
|
cudaFreeHost(h_data_out[i]);
|
||
|
cudaFree(d_data_out[i]);
|
||
|
|
||
|
cudaStreamDestroy(stream[i]);
|
||
|
cudaEventDestroy(cycleDone[i]);
|
||
|
}
|
||
|
|
||
|
cudaEventDestroy(start);
|
||
|
cudaEventDestroy(stop);
|
||
|
|
||
|
// Test result
|
||
|
exit(bResults ? EXIT_SUCCESS : EXIT_FAILURE);
|
||
|
}
|
||
|
|
||
|
float processWithStreams(int streams_used) {
|
||
|
int current_stream = 0;
|
||
|
|
||
|
float time;
|
||
|
|
||
|
// Do processing in a loop
|
||
|
//
|
||
|
// Note: All memory commands are processed in the order they are issued,
|
||
|
// independent of the stream they are enqueued in. Hence the pattern by
|
||
|
// which the copy and kernel commands are enqueued in the stream
|
||
|
// has an influence on the achieved overlap.
|
||
|
|
||
|
cudaEventRecord(start, 0);
|
||
|
|
||
|
for (int i = 0; i < nreps; ++i) {
|
||
|
int next_stream = (current_stream + 1) % streams_used;
|
||
|
|
||
|
#ifdef SIMULATE_IO
|
||
|
// Store the result
|
||
|
memcpy(h_data_sink, h_data_out[current_stream], memsize);
|
||
|
|
||
|
// Read new input
|
||
|
memcpy(h_data_in[next_stream], h_data_source, memsize);
|
||
|
#endif
|
||
|
|
||
|
// Ensure that processing and copying of the last cycle has finished
|
||
|
cudaEventSynchronize(cycleDone[next_stream]);
|
||
|
|
||
|
// Process current frame
|
||
|
incKernel<<<grid, block, 0, stream[current_stream]>>>(
|
||
|
d_data_out[current_stream], d_data_in[current_stream], N, inner_reps);
|
||
|
|
||
|
// Upload next frame
|
||
|
checkCudaErrors(
|
||
|
cudaMemcpyAsync(d_data_in[next_stream], h_data_in[next_stream], memsize,
|
||
|
cudaMemcpyHostToDevice, stream[next_stream]));
|
||
|
|
||
|
// Download current frame
|
||
|
checkCudaErrors(cudaMemcpyAsync(
|
||
|
h_data_out[current_stream], d_data_out[current_stream], memsize,
|
||
|
cudaMemcpyDeviceToHost, stream[current_stream]));
|
||
|
|
||
|
checkCudaErrors(
|
||
|
cudaEventRecord(cycleDone[current_stream], stream[current_stream]));
|
||
|
|
||
|
current_stream = next_stream;
|
||
|
}
|
||
|
|
||
|
cudaEventRecord(stop, 0);
|
||
|
|
||
|
cudaDeviceSynchronize();
|
||
|
|
||
|
cudaEventElapsedTime(&time, start, stop);
|
||
|
|
||
|
return time;
|
||
|
}
|
||
|
|
||
|
void init() {
|
||
|
for (int i = 0; i < N; ++i) {
|
||
|
h_data_source[i] = 0;
|
||
|
}
|
||
|
|
||
|
for (int i = 0; i < STREAM_COUNT; ++i) {
|
||
|
memcpy(h_data_in[i], h_data_source, memsize);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
bool test() {
|
||
|
bool passed = true;
|
||
|
|
||
|
for (int j = 0; j < STREAM_COUNT; ++j) {
|
||
|
for (int i = 0; i < N; ++i) {
|
||
|
passed &= (h_data_out[j][i] == 1);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return passed;
|
||
|
}
|