cuda-samples/Samples/streamOrderedAllocation/streamOrderedAllocation.cu

/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* This sample demonstrates stream ordered memory allocation on a GPU using
* cudaMallocAsync and cudaMemPool family of APIs.
*
* basicStreamOrderedAllocation(): demonstrates stream ordered allocation using
* cudaMallocAsync/cudaFreeAsync APIs with default settings.
*
* streamOrderedAllocationPostSync(): demonstrates that when the application
* synchronizes between allocations, setting a non-zero release threshold on the
* pool keeps the synchronization from returning the pool's memory to the OS.
*/
// System includes
#include <assert.h>
#include <stdio.h>
#include <cstdint>
#include <cstdlib>
// CUDA runtime
#include <cuda_runtime.h>
// helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>
#define MAX_ITER 20
/* Add two vectors on the GPU */
__global__ void vectorAddGPU(const float *a, const float *b, float *c, int N) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < N) {
    c[idx] = a[idx] + b[idx];
  }
}
int basicStreamOrderedAllocation(const int dev, const int nelem, const float *a,
                                 const float *b, float *c) {
  float *d_a, *d_b, *d_c;  // Device buffers
  float errorNorm, refNorm, ref, diff;
  size_t bytes = nelem * sizeof(float);
  cudaStream_t stream;

  printf("Starting basicStreamOrderedAllocation()\n");
  checkCudaErrors(cudaSetDevice(dev));
  checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
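  // cudaMallocAsync returns an allocation from the device's default memory
  // pool. The allocation is stream ordered: it becomes usable by work enqueued
  // on 'stream' after this call.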
  checkCudaErrors(cudaMallocAsync(&d_a, bytes, stream));
  checkCudaErrors(cudaMallocAsync(&d_b, bytes, stream));
  checkCudaErrors(cudaMallocAsync(&d_c, bytes, stream));
  checkCudaErrors(
      cudaMemcpyAsync(d_a, a, bytes, cudaMemcpyHostToDevice, stream));
  checkCudaErrors(
      cudaMemcpyAsync(d_b, b, bytes, cudaMemcpyHostToDevice, stream));

  dim3 block(256);
  dim3 grid((unsigned int)ceil(nelem / (float)block.x));
  vectorAddGPU<<<grid, block, 0, stream>>>(d_a, d_b, d_c, nelem);
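  // The frees are also stream ordered: they are enqueued after the kernel
  // launch, so d_a and d_b are only returned to the pool once the kernel has
  // finished reading them.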
  checkCudaErrors(cudaFreeAsync(d_a, stream));
  checkCudaErrors(cudaFreeAsync(d_b, stream));
  checkCudaErrors(
      cudaMemcpyAsync(c, d_c, bytes, cudaMemcpyDeviceToHost, stream));
  checkCudaErrors(cudaFreeAsync(d_c, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));

  /* Compare the results */
  printf("> Checking the results from vectorAddGPU() ...\n");
  errorNorm = 0.f;
  refNorm = 0.f;
  for (int n = 0; n < nelem; n++) {
    ref = a[n] + b[n];
    diff = c[n] - ref;
    errorNorm += diff * diff;
    refNorm += ref * ref;
  }
  errorNorm = (float)sqrt((double)errorNorm);
  refNorm = (float)sqrt((double)refNorm);
  if (errorNorm / refNorm < 1.e-6f)
    printf("basicStreamOrderedAllocation PASSED\n");

  checkCudaErrors(cudaStreamDestroy(stream));

  return errorNorm / refNorm < 1.e-6f ? EXIT_SUCCESS : EXIT_FAILURE;
}
// streamOrderedAllocationPostSync(): If the application wants the memory to
// persist in the pool beyond synchronization, it sets the release threshold on
// the pool. This way, once the application reaches its "steady state", it no
// longer allocates or frees memory from the OS.
int streamOrderedAllocationPostSync(const int dev, const int nelem,
                                    const float *a, const float *b, float *c) {
  float *d_a, *d_b, *d_c;  // Device buffers
  float errorNorm, refNorm, ref, diff;
  size_t bytes = nelem * sizeof(float);
  cudaStream_t stream;
  cudaMemPool_t memPool;
  cudaEvent_t start, end;

  printf("Starting streamOrderedAllocationPostSync()\n");
  checkCudaErrors(cudaSetDevice(dev));
  checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&end));
  checkCudaErrors(cudaDeviceGetDefaultMemPool(&memPool, dev));
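  // cudaMallocAsync with no explicit pool argument allocates from this default
  // pool, so the attribute set below applies to the allocations in the loop.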
  uint64_t thresholdVal = UINT64_MAX;
  // Set a high release threshold on the default pool so that cudaFreeAsync
  // will not actually release memory back to the system. By default, the
  // release threshold of a memory pool is zero, which allows the CUDA driver
  // to release a memory chunk back to the system as soon as it no longer
  // contains any active suballocations.
  checkCudaErrors(cudaMemPoolSetAttribute(
      memPool, cudaMemPoolAttrReleaseThreshold, (void *)&thresholdVal));
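  // Optional (illustrative sketch, not part of the original sample): the
  // pool's current footprint can be inspected with cudaMemPoolGetAttribute,
  // e.g. cudaMemPoolAttrReservedMemCurrent (available in newer CUDA releases,
  // 11.3+):
  //
  //   uint64_t reserved = 0;
  //   checkCudaErrors(cudaMemPoolGetAttribute(
  //       memPool, cudaMemPoolAttrReservedMemCurrent, &reserved));
  //   printf("Pool reserved bytes: %llu\n", (unsigned long long)reserved);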

  // Record the start event
  checkCudaErrors(cudaEventRecord(start, stream));
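  // After the first iteration the pool already holds enough memory for all
  // three buffers, so subsequent cudaMallocAsync calls are serviced from the
  // pool without requesting more memory from the OS.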
  for (int i = 0; i < MAX_ITER; i++) {
    checkCudaErrors(cudaMallocAsync(&d_a, bytes, stream));
    checkCudaErrors(cudaMallocAsync(&d_b, bytes, stream));
    checkCudaErrors(cudaMallocAsync(&d_c, bytes, stream));
    checkCudaErrors(
        cudaMemcpyAsync(d_a, a, bytes, cudaMemcpyHostToDevice, stream));
    checkCudaErrors(
        cudaMemcpyAsync(d_b, b, bytes, cudaMemcpyHostToDevice, stream));

    dim3 block(256);
    dim3 grid((unsigned int)ceil(nelem / (float)block.x));
    vectorAddGPU<<<grid, block, 0, stream>>>(d_a, d_b, d_c, nelem);
    checkCudaErrors(cudaFreeAsync(d_a, stream));
    checkCudaErrors(cudaFreeAsync(d_b, stream));
    checkCudaErrors(
        cudaMemcpyAsync(c, d_c, bytes, cudaMemcpyDeviceToHost, stream));
    checkCudaErrors(cudaFreeAsync(d_c, stream));
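    // Because of the high release threshold set above, this synchronization
    // does not return the freed memory to the OS; it stays in the pool for the
    // next iteration.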
    checkCudaErrors(cudaStreamSynchronize(stream));
  }
  checkCudaErrors(cudaEventRecord(end, stream));
  // Wait for the end event to complete
  checkCudaErrors(cudaEventSynchronize(end));

  float msecTotal = 0.0f;
  checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, end));
  printf("Total elapsed time = %f ms over %d iterations\n", msecTotal,
         MAX_ITER);

  /* Compare the results */
  printf("> Checking the results from vectorAddGPU() ...\n");
  errorNorm = 0.f;
  refNorm = 0.f;
  for (int n = 0; n < nelem; n++) {
    ref = a[n] + b[n];
    diff = c[n] - ref;
    errorNorm += diff * diff;
    refNorm += ref * ref;
  }
  errorNorm = (float)sqrt((double)errorNorm);
  refNorm = (float)sqrt((double)refNorm);
  if (errorNorm / refNorm < 1.e-6f)
    printf("streamOrderedAllocationPostSync PASSED\n");

  checkCudaErrors(cudaEventDestroy(start));
  checkCudaErrors(cudaEventDestroy(end));
  checkCudaErrors(cudaStreamDestroy(stream));

  return errorNorm / refNorm < 1.e-6f ? EXIT_SUCCESS : EXIT_FAILURE;
}
int main(int argc, char **argv) {
  int nelem;
  int dev = 0;  // use default device 0
  size_t bytes;
  float *a, *b, *c;  // Host

  if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
    printf("Usage: streamOrderedAllocation [OPTION]\n\n");
    printf("Options:\n");
    printf("  --device=[device #]  Specify the device to be used\n");
    return EXIT_SUCCESS;
  }

  dev = findCudaDevice(argc, (const char **)argv);
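  // Stream-ordered allocation requires a device and driver with memory pool
  // support (CUDA 11.2 or newer); otherwise the sample is waived.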
  int isMemPoolSupported = 0;
  checkCudaErrors(cudaDeviceGetAttribute(&isMemPoolSupported,
                                         cudaDevAttrMemoryPoolsSupported, dev));
  if (!isMemPoolSupported) {
    printf("Waiving execution as device does not support Memory Pools\n");
    exit(EXIT_WAIVED);
  }

  // Allocate CPU memory.
  nelem = 1048576;
  bytes = nelem * sizeof(float);
  a = (float *)malloc(bytes);
  b = (float *)malloc(bytes);
  c = (float *)malloc(bytes);

  /* Initialize the vectors. */
  for (int n = 0; n < nelem; n++) {
    a[n] = rand() / (float)RAND_MAX;
    b[n] = rand() / (float)RAND_MAX;
  }

  int ret1 = basicStreamOrderedAllocation(dev, nelem, a, b, c);
  int ret2 = streamOrderedAllocationPostSync(dev, nelem, a, b, c);

  /* Memory clean up */
  free(a);
  free(b);
  free(c);

  return ((ret1 == EXIT_SUCCESS && ret2 == EXIT_SUCCESS) ? EXIT_SUCCESS
                                                         : EXIT_FAILURE);
}