mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2025-01-19 22:15:55 +08:00
446 lines
16 KiB
Plaintext
446 lines
16 KiB
Plaintext
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* * Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
|
* contributors may be used to endorse or promote products derived
|
|
* from this software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
/*
|
|
* This sample illustrates the usage of CUDA streams for overlapping
|
|
* kernel execution with device/host memcopies. The kernel is used to
|
|
* initialize an array to a specific value, after which the array is
|
|
* copied to the host (CPU) memory. To increase performance, multiple
|
|
* kernel/memcopy pairs are launched asynchronously, each pair in its
|
|
* own stream. Devices with Compute Capability 1.1 can overlap a kernel
|
|
* and a memcopy as long as they are issued in different streams. Kernels
|
|
* are serialized. Thus, if n pairs are launched, streamed approach
|
|
* can reduce the memcopy cost to the (1/n)th of a single copy of the entire
|
|
* data set.
|
|
*
|
|
* Additionally, this sample uses CUDA events to measure elapsed time for
|
|
* CUDA calls. Events are a part of CUDA API and provide a system independent
|
|
* way to measure execution times on CUDA devices with approximately 0.5
|
|
* microsecond precision.
|
|
*
|
|
* Elapsed times are averaged over nreps repetitions (10 by default).
|
|
*
|
|
*/
|
|
|
|
const char *sSDKsample = "simpleStreams";
|
|
|
|
const char *sEventSyncMethod[] = {"cudaEventDefault", "cudaEventBlockingSync",
|
|
"cudaEventDisableTiming", NULL};
|
|
|
|
const char *sDeviceSyncMethod[] = {
|
|
"cudaDeviceScheduleAuto", "cudaDeviceScheduleSpin",
|
|
"cudaDeviceScheduleYield", "INVALID",
|
|
"cudaDeviceScheduleBlockingSync", NULL};
|
|
|
|
// System includes
|
|
#include <stdio.h>
|
|
#include <assert.h>
|
|
|
|
// CUDA runtime
|
|
#include <cuda_runtime.h>
|
|
|
|
// helper functions and utilities to work with CUDA
|
|
#include <helper_functions.h>
|
|
#include <helper_cuda.h>
|
|
|
|
#ifndef WIN32
|
|
#include <sys/mman.h> // for mmap() / munmap()
|
|
#endif
|
|
|
|
// Macro to aligned up to the memory size in question
|
|
#define MEMORY_ALIGNMENT 4096
|
|
#define ALIGN_UP(x, size) (((size_t)x + (size - 1)) & (~(size - 1)))
|
|
|
|
__global__ void init_array(int *g_data, int *factor, int num_iterations) {
|
|
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
for (int i = 0; i < num_iterations; i++) {
|
|
g_data[idx] += *factor; // non-coalesced on purpose, to burn time
|
|
}
|
|
}
|
|
|
|
bool correct_data(int *a, const int n, const int c) {
|
|
for (int i = 0; i < n; i++) {
|
|
if (a[i] != c) {
|
|
printf("%d: %d %d\n", i, a[i], c);
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
inline void AllocateHostMemory(bool bPinGenericMemory, int **pp_a,
|
|
int **ppAligned_a, int nbytes) {
|
|
#if CUDART_VERSION >= 4000
|
|
#if !defined(__arm__) && !defined(__aarch64__)
|
|
if (bPinGenericMemory) {
|
|
// allocate a generic page-aligned chunk of system memory
|
|
#ifdef WIN32
|
|
printf(
|
|
"> VirtualAlloc() allocating %4.2f Mbytes of (generic page-aligned "
|
|
"system memory)\n",
|
|
(float)nbytes / 1048576.0f);
|
|
*pp_a = (int *)VirtualAlloc(NULL, (nbytes + MEMORY_ALIGNMENT),
|
|
MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
|
|
#else
|
|
printf(
|
|
"> mmap() allocating %4.2f Mbytes (generic page-aligned system "
|
|
"memory)\n",
|
|
(float)nbytes / 1048576.0f);
|
|
*pp_a = (int *)mmap(NULL, (nbytes + MEMORY_ALIGNMENT),
|
|
PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
|
|
#endif
|
|
|
|
*ppAligned_a = (int *)ALIGN_UP(*pp_a, MEMORY_ALIGNMENT);
|
|
|
|
printf(
|
|
"> cudaHostRegister() registering %4.2f Mbytes of generic allocated "
|
|
"system memory\n",
|
|
(float)nbytes / 1048576.0f);
|
|
// pin allocate memory
|
|
checkCudaErrors(
|
|
cudaHostRegister(*ppAligned_a, nbytes, cudaHostRegisterMapped));
|
|
} else
|
|
#endif
|
|
#endif
|
|
{
|
|
printf("> cudaMallocHost() allocating %4.2f Mbytes of system memory\n",
|
|
(float)nbytes / 1048576.0f);
|
|
// allocate host memory (pinned is required for achieve asynchronicity)
|
|
checkCudaErrors(cudaMallocHost((void **)pp_a, nbytes));
|
|
*ppAligned_a = *pp_a;
|
|
}
|
|
}
|
|
|
|
inline void FreeHostMemory(bool bPinGenericMemory, int **pp_a,
|
|
int **ppAligned_a, int nbytes) {
|
|
#if CUDART_VERSION >= 4000
|
|
#if !defined(__arm__) && !defined(__aarch64__)
|
|
// CUDA 4.0 support pinning of generic host memory
|
|
if (bPinGenericMemory) {
|
|
// unpin and delete host memory
|
|
checkCudaErrors(cudaHostUnregister(*ppAligned_a));
|
|
#ifdef WIN32
|
|
VirtualFree(*pp_a, 0, MEM_RELEASE);
|
|
#else
|
|
munmap(*pp_a, nbytes);
|
|
#endif
|
|
} else
|
|
#endif
|
|
#endif
|
|
{
|
|
cudaFreeHost(*pp_a);
|
|
}
|
|
}
|
|
|
|
static const char *sSyncMethod[] = {
|
|
"0 (Automatic Blocking)",
|
|
"1 (Spin Blocking)",
|
|
"2 (Yield Blocking)",
|
|
"3 (Undefined Blocking Method)",
|
|
"4 (Blocking Sync Event) = low CPU utilization",
|
|
NULL};
|
|
|
|
void printHelp() {
|
|
printf("Usage: %s [options below]\n", sSDKsample);
|
|
printf("\t--sync_method=n for CPU/GPU synchronization\n");
|
|
printf("\t n=%s\n", sSyncMethod[0]);
|
|
printf("\t n=%s\n", sSyncMethod[1]);
|
|
printf("\t n=%s\n", sSyncMethod[2]);
|
|
printf("\t <Default> n=%s\n", sSyncMethod[4]);
|
|
printf(
|
|
"\t--use_generic_memory (default) use generic page-aligned for system "
|
|
"memory\n");
|
|
printf(
|
|
"\t--use_cuda_malloc_host (optional) use cudaMallocHost to allocate "
|
|
"system memory\n");
|
|
}
|
|
|
|
#if defined(__APPLE__) || defined(MACOSX)
|
|
#define DEFAULT_PINNED_GENERIC_MEMORY false
|
|
#else
|
|
#define DEFAULT_PINNED_GENERIC_MEMORY true
|
|
#endif
|
|
|
|
int main(int argc, char **argv) {
|
|
int cuda_device = 0;
|
|
int nstreams = 4; // number of streams for CUDA calls
|
|
int nreps = 10; // number of times each experiment is repeated
|
|
int n = 16 * 1024 * 1024; // number of ints in the data set
|
|
int nbytes = n * sizeof(int); // number of data bytes
|
|
dim3 threads, blocks; // kernel launch configuration
|
|
float elapsed_time, time_memcpy, time_kernel; // timing variables
|
|
float scale_factor = 1.0f;
|
|
|
|
// allocate generic memory and pin it laster instead of using cudaHostAlloc()
|
|
|
|
bool bPinGenericMemory =
|
|
DEFAULT_PINNED_GENERIC_MEMORY; // we want this to be the default behavior
|
|
int device_sync_method =
|
|
cudaDeviceBlockingSync; // by default we use BlockingSync
|
|
|
|
int niterations; // number of iterations for the loop inside the kernel
|
|
|
|
printf("[ %s ]\n\n", sSDKsample);
|
|
|
|
if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
|
|
printHelp();
|
|
return EXIT_SUCCESS;
|
|
}
|
|
|
|
if ((device_sync_method = getCmdLineArgumentInt(argc, (const char **)argv,
|
|
"sync_method")) >= 0) {
|
|
if (device_sync_method == 0 || device_sync_method == 1 ||
|
|
device_sync_method == 2 || device_sync_method == 4) {
|
|
printf("Device synchronization method set to = %s\n",
|
|
sSyncMethod[device_sync_method]);
|
|
printf("Setting reps to 100 to demonstrate steady state\n");
|
|
nreps = 100;
|
|
} else {
|
|
printf("Invalid command line option sync_method=\"%d\"\n",
|
|
device_sync_method);
|
|
return EXIT_FAILURE;
|
|
}
|
|
} else {
|
|
printHelp();
|
|
return EXIT_SUCCESS;
|
|
}
|
|
|
|
if (checkCmdLineFlag(argc, (const char **)argv, "use_generic_memory")) {
|
|
#if defined(__APPLE__) || defined(MACOSX)
|
|
bPinGenericMemory = false; // Generic Pinning of System Paged memory not
|
|
// currently supported on Mac OSX
|
|
#else
|
|
bPinGenericMemory = true;
|
|
#endif
|
|
}
|
|
|
|
if (checkCmdLineFlag(argc, (const char **)argv, "use_cuda_malloc_host")) {
|
|
bPinGenericMemory = false;
|
|
}
|
|
|
|
printf("\n> ");
|
|
cuda_device = findCudaDevice(argc, (const char **)argv);
|
|
|
|
// check the compute capability of the device
|
|
int num_devices = 0;
|
|
checkCudaErrors(cudaGetDeviceCount(&num_devices));
|
|
|
|
if (0 == num_devices) {
|
|
printf(
|
|
"your system does not have a CUDA capable device, waiving test...\n");
|
|
return EXIT_WAIVED;
|
|
}
|
|
|
|
// check if the command-line chosen device ID is within range, exit if not
|
|
if (cuda_device >= num_devices) {
|
|
printf(
|
|
"cuda_device=%d is invalid, must choose device ID between 0 and %d\n",
|
|
cuda_device, num_devices - 1);
|
|
return EXIT_FAILURE;
|
|
}
|
|
|
|
checkCudaErrors(cudaSetDevice(cuda_device));
|
|
|
|
// Checking for compute capabilities
|
|
cudaDeviceProp deviceProp;
|
|
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
|
|
|
|
niterations = 5;
|
|
|
|
// Check if GPU can map host memory (Generic Method), if not then we override
|
|
// bPinGenericMemory to be false
|
|
if (bPinGenericMemory) {
|
|
printf("Device: <%s> canMapHostMemory: %s\n", deviceProp.name,
|
|
deviceProp.canMapHostMemory ? "Yes" : "No");
|
|
|
|
if (deviceProp.canMapHostMemory == 0) {
|
|
printf(
|
|
"Using cudaMallocHost, CUDA device does not support mapping of "
|
|
"generic host memory\n");
|
|
bPinGenericMemory = false;
|
|
}
|
|
}
|
|
|
|
// Anything that is less than 32 Cores will have scaled down workload
|
|
scale_factor =
|
|
max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
|
|
(float)deviceProp.multiProcessorCount)),
|
|
1.0f);
|
|
n = (int)rint((float)n / scale_factor);
|
|
|
|
printf("> CUDA Capable: SM %d.%d hardware\n", deviceProp.major,
|
|
deviceProp.minor);
|
|
printf("> %d Multiprocessor(s) x %d (Cores/Multiprocessor) = %d (Cores)\n",
|
|
deviceProp.multiProcessorCount,
|
|
_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
|
|
_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
|
|
deviceProp.multiProcessorCount);
|
|
|
|
printf("> scale_factor = %1.4f\n", 1.0f / scale_factor);
|
|
printf("> array_size = %d\n\n", n);
|
|
|
|
// enable use of blocking sync, to reduce CPU usage
|
|
printf("> Using CPU/GPU Device Synchronization method (%s)\n",
|
|
sDeviceSyncMethod[device_sync_method]);
|
|
checkCudaErrors(cudaSetDeviceFlags(
|
|
device_sync_method | (bPinGenericMemory ? cudaDeviceMapHost : 0)));
|
|
|
|
// allocate host memory
|
|
int c = 5; // value to which the array will be initialized
|
|
int *h_a = 0; // pointer to the array data in host memory
|
|
int *hAligned_a = 0; // pointer to the array data in host memory (aligned to
|
|
// MEMORY_ALIGNMENT)
|
|
|
|
// Allocate Host memory (could be using cudaMallocHost or VirtualAlloc/mmap if
|
|
// using the new CUDA 4.0 features
|
|
AllocateHostMemory(bPinGenericMemory, &h_a, &hAligned_a, nbytes);
|
|
|
|
// allocate device memory
|
|
int *d_a = 0,
|
|
*d_c = 0; // pointers to data and init value in the device memory
|
|
checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
|
|
checkCudaErrors(cudaMemset(d_a, 0x0, nbytes));
|
|
checkCudaErrors(cudaMalloc((void **)&d_c, sizeof(int)));
|
|
checkCudaErrors(cudaMemcpy(d_c, &c, sizeof(int), cudaMemcpyHostToDevice));
|
|
|
|
printf("\nStarting Test\n");
|
|
|
|
// allocate and initialize an array of stream handles
|
|
cudaStream_t *streams =
|
|
(cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));
|
|
|
|
for (int i = 0; i < nstreams; i++) {
|
|
checkCudaErrors(cudaStreamCreate(&(streams[i])));
|
|
}
|
|
|
|
// create CUDA event handles
|
|
// use blocking sync
|
|
cudaEvent_t start_event, stop_event;
|
|
int eventflags =
|
|
((device_sync_method == cudaDeviceBlockingSync) ? cudaEventBlockingSync
|
|
: cudaEventDefault);
|
|
|
|
checkCudaErrors(cudaEventCreateWithFlags(&start_event, eventflags));
|
|
checkCudaErrors(cudaEventCreateWithFlags(&stop_event, eventflags));
|
|
|
|
// time memcopy from device
|
|
checkCudaErrors(cudaEventRecord(start_event, 0)); // record in stream-0, to
|
|
// ensure that all previous
|
|
// CUDA calls have
|
|
// completed
|
|
checkCudaErrors(cudaMemcpyAsync(hAligned_a, d_a, nbytes,
|
|
cudaMemcpyDeviceToHost, streams[0]));
|
|
checkCudaErrors(cudaEventRecord(stop_event, 0));
|
|
checkCudaErrors(cudaEventSynchronize(
|
|
stop_event)); // block until the event is actually recorded
|
|
checkCudaErrors(cudaEventElapsedTime(&time_memcpy, start_event, stop_event));
|
|
printf("memcopy:\t%.2f\n", time_memcpy);
|
|
|
|
// time kernel
|
|
threads = dim3(512, 1);
|
|
blocks = dim3(n / threads.x, 1);
|
|
checkCudaErrors(cudaEventRecord(start_event, 0));
|
|
init_array<<<blocks, threads, 0, streams[0]>>>(d_a, d_c, niterations);
|
|
checkCudaErrors(cudaEventRecord(stop_event, 0));
|
|
checkCudaErrors(cudaEventSynchronize(stop_event));
|
|
checkCudaErrors(cudaEventElapsedTime(&time_kernel, start_event, stop_event));
|
|
printf("kernel:\t\t%.2f\n", time_kernel);
|
|
|
|
//////////////////////////////////////////////////////////////////////
|
|
// time non-streamed execution for reference
|
|
threads = dim3(512, 1);
|
|
blocks = dim3(n / threads.x, 1);
|
|
checkCudaErrors(cudaEventRecord(start_event, 0));
|
|
|
|
for (int k = 0; k < nreps; k++) {
|
|
init_array<<<blocks, threads>>>(d_a, d_c, niterations);
|
|
checkCudaErrors(
|
|
cudaMemcpy(hAligned_a, d_a, nbytes, cudaMemcpyDeviceToHost));
|
|
}
|
|
|
|
checkCudaErrors(cudaEventRecord(stop_event, 0));
|
|
checkCudaErrors(cudaEventSynchronize(stop_event));
|
|
checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
|
|
printf("non-streamed:\t%.2f\n", elapsed_time / nreps);
|
|
|
|
//////////////////////////////////////////////////////////////////////
|
|
// time execution with nstreams streams
|
|
threads = dim3(512, 1);
|
|
blocks = dim3(n / (nstreams * threads.x), 1);
|
|
memset(hAligned_a, 255,
|
|
nbytes); // set host memory bits to all 1s, for testing correctness
|
|
checkCudaErrors(cudaMemset(
|
|
d_a, 0, nbytes)); // set device memory to all 0s, for testing correctness
|
|
checkCudaErrors(cudaEventRecord(start_event, 0));
|
|
|
|
for (int k = 0; k < nreps; k++) {
|
|
// asynchronously launch nstreams kernels, each operating on its own portion
|
|
// of data
|
|
for (int i = 0; i < nstreams; i++) {
|
|
init_array<<<blocks, threads, 0, streams[i]>>>(d_a + i * n / nstreams,
|
|
d_c, niterations);
|
|
}
|
|
|
|
// asynchronously launch nstreams memcopies. Note that memcopy in stream x
|
|
// will only
|
|
// commence executing when all previous CUDA calls in stream x have
|
|
// completed
|
|
for (int i = 0; i < nstreams; i++) {
|
|
checkCudaErrors(cudaMemcpyAsync(hAligned_a + i * n / nstreams,
|
|
d_a + i * n / nstreams, nbytes / nstreams,
|
|
cudaMemcpyDeviceToHost, streams[i]));
|
|
}
|
|
}
|
|
|
|
checkCudaErrors(cudaEventRecord(stop_event, 0));
|
|
checkCudaErrors(cudaEventSynchronize(stop_event));
|
|
checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
|
|
printf("%d streams:\t%.2f\n", nstreams, elapsed_time / nreps);
|
|
|
|
// check whether the output is correct
|
|
printf("-------------------------------\n");
|
|
bool bResults = correct_data(hAligned_a, n, c * nreps * niterations);
|
|
|
|
// release resources
|
|
for (int i = 0; i < nstreams; i++) {
|
|
checkCudaErrors(cudaStreamDestroy(streams[i]));
|
|
}
|
|
|
|
checkCudaErrors(cudaEventDestroy(start_event));
|
|
checkCudaErrors(cudaEventDestroy(stop_event));
|
|
|
|
// Free cudaMallocHost or Generic Host allocated memory (from CUDA 4.0)
|
|
FreeHostMemory(bPinGenericMemory, &h_a, &hAligned_a, nbytes);
|
|
|
|
checkCudaErrors(cudaFree(d_a));
|
|
checkCudaErrors(cudaFree(d_c));
|
|
|
|
return bResults ? EXIT_SUCCESS : EXIT_FAILURE;
|
|
}
|