mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2024-11-24 18:59:16 +08:00
244 lines
8.8 KiB
Plaintext
244 lines
8.8 KiB
Plaintext
/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* * Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
|
* contributors may be used to endorse or promote products derived
|
|
* from this software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
/*
|
|
* This sample demonstrates stream ordered memory allocation on a GPU using
|
|
* cudaMallocAsync and cudaMemPool family of APIs.
|
|
*
|
|
* basicStreamOrderedAllocation(): demonstrates stream ordered allocation using
|
|
* cudaMallocAsync/cudaFreeAsync APIs with default settings.
|
|
*
|
|
* streamOrderedAllocationPostSync(): demonstrates if there's a synchronization
|
|
* in between allocations, then setting the release threshold on the pool will
|
|
* make sure the synchronize will not free memory back to the OS.
|
|
*/
|
|
|
|
// System includes
|
|
#include <assert.h>
|
|
#include <stdio.h>
|
|
#include <climits>
|
|
|
|
// CUDA runtime
|
|
#include <cuda_runtime.h>
|
|
|
|
// helper functions and utilities to work with CUDA
|
|
#include <helper_cuda.h>
|
|
#include <helper_functions.h>
|
|
|
|
#define MAX_ITER 20
|
|
|
|
/* Add two vectors on the GPU */
|
|
__global__ void vectorAddGPU(const float *a, const float *b, float *c, int N) {
|
|
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
if (idx < N) {
|
|
c[idx] = a[idx] + b[idx];
|
|
}
|
|
}
|
|
|
|
int basicStreamOrderedAllocation(const int dev, const int nelem, const float *a,
|
|
const float *b, float *c) {
|
|
float *d_a, *d_b, *d_c; // Device buffers
|
|
float errorNorm, refNorm, ref, diff;
|
|
size_t bytes = nelem * sizeof(float);
|
|
|
|
cudaStream_t stream;
|
|
printf("Starting basicStreamOrderedAllocation()\n");
|
|
checkCudaErrors(cudaSetDevice(dev));
|
|
checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
|
|
|
|
checkCudaErrors(cudaMallocAsync(&d_a, bytes, stream));
|
|
checkCudaErrors(cudaMallocAsync(&d_b, bytes, stream));
|
|
checkCudaErrors(cudaMallocAsync(&d_c, bytes, stream));
|
|
checkCudaErrors(
|
|
cudaMemcpyAsync(d_a, a, bytes, cudaMemcpyHostToDevice, stream));
|
|
checkCudaErrors(
|
|
cudaMemcpyAsync(d_b, b, bytes, cudaMemcpyHostToDevice, stream));
|
|
|
|
dim3 block(256);
|
|
dim3 grid((unsigned int)ceil(nelem / (float)block.x));
|
|
vectorAddGPU<<<grid, block, 0, stream>>>(d_a, d_b, d_c, nelem);
|
|
|
|
checkCudaErrors(cudaFreeAsync(d_a, stream));
|
|
checkCudaErrors(cudaFreeAsync(d_b, stream));
|
|
checkCudaErrors(
|
|
cudaMemcpyAsync(c, d_c, bytes, cudaMemcpyDeviceToHost, stream));
|
|
checkCudaErrors(cudaFreeAsync(d_c, stream));
|
|
checkCudaErrors(cudaStreamSynchronize(stream));
|
|
|
|
/* Compare the results */
|
|
printf("> Checking the results from vectorAddGPU() ...\n");
|
|
errorNorm = 0.f;
|
|
refNorm = 0.f;
|
|
|
|
for (int n = 0; n < nelem; n++) {
|
|
ref = a[n] + b[n];
|
|
diff = c[n] - ref;
|
|
errorNorm += diff * diff;
|
|
refNorm += ref * ref;
|
|
}
|
|
|
|
errorNorm = (float)sqrt((double)errorNorm);
|
|
refNorm = (float)sqrt((double)refNorm);
|
|
if (errorNorm / refNorm < 1.e-6f)
|
|
printf("basicStreamOrderedAllocation PASSED\n");
|
|
|
|
checkCudaErrors(cudaStreamDestroy(stream));
|
|
|
|
return errorNorm / refNorm < 1.e-6f ? EXIT_SUCCESS : EXIT_FAILURE;
|
|
}
|
|
|
|
// streamOrderedAllocationPostSync(): demonstrates If the application wants the
|
|
// memory to persist in the pool beyond synchronization, then it sets the
|
|
// release threshold on the pool. This way, when the application reaches the
|
|
// "steady state", it is no longer allocating/freeing memory from the OS.
|
|
int streamOrderedAllocationPostSync(const int dev, const int nelem,
|
|
const float *a, const float *b, float *c) {
|
|
float *d_a, *d_b, *d_c; // Device buffers
|
|
float errorNorm, refNorm, ref, diff;
|
|
size_t bytes = nelem * sizeof(float);
|
|
|
|
cudaStream_t stream;
|
|
cudaMemPool_t memPool;
|
|
cudaEvent_t start, end;
|
|
printf("Starting streamOrderedAllocationPostSync()\n");
|
|
checkCudaErrors(cudaSetDevice(dev));
|
|
checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
|
|
checkCudaErrors(cudaEventCreate(&start));
|
|
checkCudaErrors(cudaEventCreate(&end));
|
|
|
|
checkCudaErrors(cudaDeviceGetDefaultMemPool(&memPool, dev));
|
|
uint64_t thresholdVal = ULONG_MAX;
|
|
// set high release threshold on the default pool so that cudaFreeAsync will
|
|
// not actually release memory to the system. By default, the release
|
|
// threshold for a memory pool is set to zero. This implies that the CUDA
|
|
// driver is allowed to release a memory chunk back to the system as long as
|
|
// it does not contain any active suballocations.
|
|
checkCudaErrors(cudaMemPoolSetAttribute(
|
|
memPool, cudaMemPoolAttrReleaseThreshold, (void *)&thresholdVal));
|
|
|
|
// Record the start event
|
|
checkCudaErrors(cudaEventRecord(start, stream));
|
|
for (int i = 0; i < MAX_ITER; i++) {
|
|
checkCudaErrors(cudaMallocAsync(&d_a, bytes, stream));
|
|
checkCudaErrors(cudaMallocAsync(&d_b, bytes, stream));
|
|
checkCudaErrors(cudaMallocAsync(&d_c, bytes, stream));
|
|
checkCudaErrors(
|
|
cudaMemcpyAsync(d_a, a, bytes, cudaMemcpyHostToDevice, stream));
|
|
checkCudaErrors(
|
|
cudaMemcpyAsync(d_b, b, bytes, cudaMemcpyHostToDevice, stream));
|
|
|
|
dim3 block(256);
|
|
dim3 grid((unsigned int)ceil(nelem / (float)block.x));
|
|
vectorAddGPU<<<grid, block, 0, stream>>>(d_a, d_b, d_c, nelem);
|
|
|
|
checkCudaErrors(cudaFreeAsync(d_a, stream));
|
|
checkCudaErrors(cudaFreeAsync(d_b, stream));
|
|
checkCudaErrors(
|
|
cudaMemcpyAsync(c, d_c, bytes, cudaMemcpyDeviceToHost, stream));
|
|
checkCudaErrors(cudaFreeAsync(d_c, stream));
|
|
checkCudaErrors(cudaStreamSynchronize(stream));
|
|
}
|
|
checkCudaErrors(cudaEventRecord(end, stream));
|
|
// Wait for the end event to complete
|
|
checkCudaErrors(cudaEventSynchronize(end));
|
|
|
|
float msecTotal = 0.0f;
|
|
checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, end));
|
|
printf("Total elapsed time = %f ms over %d iterations\n", msecTotal,
|
|
MAX_ITER);
|
|
|
|
/* Compare the results */
|
|
printf("> Checking the results from vectorAddGPU() ...\n");
|
|
errorNorm = 0.f;
|
|
refNorm = 0.f;
|
|
|
|
for (int n = 0; n < nelem; n++) {
|
|
ref = a[n] + b[n];
|
|
diff = c[n] - ref;
|
|
errorNorm += diff * diff;
|
|
refNorm += ref * ref;
|
|
}
|
|
|
|
errorNorm = (float)sqrt((double)errorNorm);
|
|
refNorm = (float)sqrt((double)refNorm);
|
|
if (errorNorm / refNorm < 1.e-6f)
|
|
printf("streamOrderedAllocationPostSync PASSED\n");
|
|
|
|
checkCudaErrors(cudaStreamDestroy(stream));
|
|
|
|
return errorNorm / refNorm < 1.e-6f ? EXIT_SUCCESS : EXIT_FAILURE;
|
|
}
|
|
|
|
int main(int argc, char **argv) {
|
|
int nelem;
|
|
int dev = 0; // use default device 0
|
|
size_t bytes;
|
|
float *a, *b, *c; // Host
|
|
|
|
if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
|
|
printf("Usage: streamOrderedAllocation [OPTION]\n\n");
|
|
printf("Options:\n");
|
|
printf(" --device=[device #] Specify the device to be used\n");
|
|
return EXIT_SUCCESS;
|
|
}
|
|
|
|
dev = findCudaDevice(argc, (const char **)argv);
|
|
|
|
int isMemPoolSupported = 0;
|
|
checkCudaErrors(cudaDeviceGetAttribute(&isMemPoolSupported,
|
|
cudaDevAttrMemoryPoolsSupported, dev));
|
|
if (!isMemPoolSupported) {
|
|
printf("Waiving execution as device does not support Memory Pools\n");
|
|
exit(EXIT_WAIVED);
|
|
}
|
|
|
|
// Allocate CPU memory.
|
|
nelem = 1048576;
|
|
bytes = nelem * sizeof(float);
|
|
|
|
a = (float *)malloc(bytes);
|
|
b = (float *)malloc(bytes);
|
|
c = (float *)malloc(bytes);
|
|
/* Initialize the vectors. */
|
|
for (int n = 0; n < nelem; n++) {
|
|
a[n] = rand() / (float)RAND_MAX;
|
|
b[n] = rand() / (float)RAND_MAX;
|
|
}
|
|
|
|
int ret1 = basicStreamOrderedAllocation(dev, nelem, a, b, c);
|
|
int ret2 = streamOrderedAllocationPostSync(dev, nelem, a, b, c);
|
|
|
|
/* Memory clean up */
|
|
free(a);
|
|
free(b);
|
|
free(c);
|
|
|
|
return ((ret1 == EXIT_SUCCESS && ret2 == EXIT_SUCCESS) ? EXIT_SUCCESS
|
|
: EXIT_FAILURE);
|
|
}
|