/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // This sample demonstrates dynamic global memory allocation through device C++ // new and delete operators and virtual function declarations available with // CUDA 4.0. 
#include <cooperative_groups.h>  // cg::thread_block, cg::sync
#include <stdio.h>               // printf

namespace cg = cooperative_groups;

#include <helper_cuda.h>  // checkCudaErrors, findCudaDevice
#include <stdlib.h>       // exit, EXIT_SUCCESS, EXIT_FAILURE

#include <algorithm>  // std::sort
#include <vector>     // std::vector

const char *sSDKsample = "newdelete";

#include "container.hpp"

////////////////////////////////////////////////////////////////////////////////
//
// Kernels to allocate and instantiate Container objects on the device heap
//
////////////////////////////////////////////////////////////////////////////////

// Creates a Vector<int> with device-side operator new and publishes its
// address through *g_container.
//
// Launch configuration: <<<1, 1>>>. Every launched thread executes the
// allocation, so more than one thread would overwrite the pointer and leak
// the other instances.
//
//   g_container  device pointer to the slot that receives the new object
//   max_size     forwarded unchanged to the Vector<int> constructor
__global__ void vectorCreate(Container<int> **g_container, int max_size) {
  // The Vector object and the data storage are allocated in device heap memory.
  // This makes it persistent for the lifetime of the CUDA context.
  // The grid has only one thread as only a single object instance is needed.
  *g_container = new Vector<int>(max_size);
}

////////////////////////////////////////////////////////////////////////////////
//
// Kernels to fill and consume shared Container objects.
//
////////////////////////////////////////////////////////////////////////////////

// Populates the shared Container: thread 0 of every block pushes that block's
// index, so a 1-D grid of B blocks pushes the values 0 .. B-1 (in no
// particular order). The remaining threads of each block do nothing.
//
//   g_container  device pointer to the slot holding the Container to fill
__global__ void containerFill(Container<int> **g_container) {
  if (threadIdx.x == 0) {
    (*g_container)->push(blockIdx.x);
  }
}

// Drains the shared Container: every thread of the grid attempts one pop and
// records the outcome at its flat global index.
//
//   g_container  device pointer to the slot holding the Container to drain
//   d_result     device array with at least gridDim.x * blockDim.x ints;
//                element idx receives the popped value, or -1 when pop()
//                reports failure
//
// There is no bounds check: the caller must size d_result to the launch.
__global__ void containerConsume(Container<int> **g_container, int *d_result) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;

  int v;

  if ((*g_container)->pop(v)) {
    d_result[idx] = v;
  } else {
    d_result[idx] = -1;
  }
}

////////////////////////////////////////////////////////////////////////////////
//
// Kernel to delete shared Container objects.
//
////////////////////////////////////////////////////////////////////////////////

// Destroys the Container whose address is stored in *g_container using
// device-side operator delete.
//
// Launch configuration: <<<1, 1>>>. Every launched thread executes the
// delete, so more than one thread would free the object repeatedly.
// The pointer stored in *g_container is left dangling afterwards.
__global__ void containerDelete(Container<int> **g_container) {
  delete *g_container;
}

////////////////////////////////////////////////////////////////////////////////
//
// Kernels using placement new to put shared Vector objects and data in
// shared memory
//
////////////////////////////////////////////////////////////////////////////////

// Builds a Vector<int> inside shared memory with placement new, lets every
// even-numbered thread push one value, then lets every thread attempt a pop.
//
// Launch configuration: a single block (results are indexed by threadIdx.x
// only). The backing array holds 1024 ints and at most blockDim.x / 2 values
// are pushed.
//
//   d_result  device array with at least blockDim.x ints; element
//             threadIdx.x receives the popped value, or -1 when pop()
//             reports failure
__global__ void placementNew(int *d_result) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();

  // Raw, suitably aligned storage for the object itself and for its elements.
  __shared__ unsigned char __align__(8) s_buffer[sizeof(Vector<int>)];
  __shared__ int __align__(8) s_data[1024];
  __shared__ Vector<int> *s_vector;

  // The first thread of the block initializes the shared Vector object.
  // The placement new operator enables the Vector object and the data array to
  // be placed in shared memory.
  if (threadIdx.x == 0) {
    s_vector = new (s_buffer) Vector<int>(1024, s_data);
  }

  // Make the constructed object (and s_vector) visible to the whole block.
  cg::sync(cta);

  // Even threads push threadIdx.x / 2, i.e. the values 0, 1, 2, ...
  if ((threadIdx.x & 1) == 0) {
    s_vector->push(threadIdx.x >> 1);
  }

  // Need to sync as the vector implementation does not support concurrent
  // push/pop operations.
  cg::sync(cta);

  int v;

  if (s_vector->pop(v)) {
    d_result[threadIdx.x] = v;
  } else {
    d_result[threadIdx.x] = -1;
  }

  // Note: deleting objects placed in shared memory is not necessary (lifetime
  // of shared memory is that of the block)
}

// User-defined element type used to exercise Vector with a non-scalar payload.
struct ComplexType_t {
  int a;
  int b;
  float c;
  float d;
};

// Same scenario as placementNew, but the Vector stores ComplexType_t elements.
// Only the `a` member of each popped element is reported.
//
// Launch configuration: a single block (results are indexed by threadIdx.x
// only). The backing array holds 1024 elements and at most blockDim.x / 2
// elements are pushed.
//
//   d_result  device array with at least blockDim.x ints; element
//             threadIdx.x receives v.a of the popped element, or -1 when
//             pop() reports failure
__global__ void complexVector(int *d_result) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();

  // Raw, suitably aligned storage for the object itself and for its elements.
  __shared__ unsigned char __align__(8) s_buffer[sizeof(Vector<ComplexType_t>)];
  __shared__ ComplexType_t __align__(8) s_data[1024];
  __shared__ Vector<ComplexType_t> *s_vector;

  // The first thread of the block initializes the shared Vector object.
  // The placement new operator enables the Vector object and the data array to
  // be placed in shared memory.
  if (threadIdx.x == 0) {
    s_vector = new (s_buffer) Vector<ComplexType_t>(1024, s_data);
  }

  // Make the constructed object (and s_vector) visible to the whole block.
  cg::sync(cta);

  if ((threadIdx.x & 1) == 0) {
    ComplexType_t data;
    data.a = threadIdx.x >> 1;  // the only field checked by the host
    data.b = blockIdx.x;
    data.c = threadIdx.x / (float)(blockDim.x);
    data.d = blockIdx.x / (float)(gridDim.x);

    s_vector->push(data);
  }

  // All pushes must be complete before any thread starts popping.
  cg::sync(cta);

  ComplexType_t v;

  if (s_vector->pop(v)) {
    d_result[threadIdx.x] = v.a;
  } else {
    d_result[threadIdx.x] = -1;
  }

  // Note: deleting objects placed in shared memory is not necessary (lifetime
  // of shared memory is that of the block)
}

////////////////////////////////////////////////////////////////////////////////
//
// Host code
//
////////////////////////////////////////////////////////////////////////////////

// Copies N ints from device memory and validates them.
//
// After sorting, all -1 entries (failed pops) come first. From the first
// entry that is not -1 onwards, the values must be exactly 0, 1, 2, ... with
// no gaps and no duplicates.
//
//   d_result  device array of N ints
//   N         number of elements to check
//
// Returns true when the sequence is valid. An empty input, or one made up
// entirely of -1 entries, is reported as valid.
bool checkResult(int *d_result, int N) {
  if (N <= 0) {
    // Nothing to copy or inspect; avoids indexing into an empty vector.
    return true;
  }

  std::vector<int> h_result(N);

  checkCudaErrors(cudaMemcpy(h_result.data(), d_result, N * sizeof(int),
                             cudaMemcpyDeviceToHost));
  std::sort(h_result.begin(), h_result.end());

  bool success = true;
  bool test = false;  // becomes true at the first entry that is not -1

  int value = 0;  // next expected value once `test` is set

  for (int i = 0; i < N; ++i) {
    if (h_result[i] != -1) {
      test = true;
    }

    // `value` only advances while `test` is set (short-circuit evaluation).
    if (test && (value++) != h_result[i]) {
      success = false;
    }
  }

  return success;
}

// Fills the container, drains it with blocks * threads pop attempts, deletes
// it on the device and validates the popped values.
//
//   d_container  device slot holding a Container created on the device; the
//                object is destroyed by this function
//   blocks       grid size used for the fill and consume kernels
//   threads      block size used for the fill and consume kernels
//
// Returns the verdict of checkResult over all blocks * threads results.
bool testContainer(Container<int> **d_container, int blocks, int threads) {
  int *d_result;
  checkCudaErrors(cudaMalloc(&d_result, blocks * threads * sizeof(int)));

  containerFill<<<blocks, threads>>>(d_container);
  checkCudaErrors(cudaGetLastError());

  containerConsume<<<blocks, threads>>>(d_container, d_result);
  checkCudaErrors(cudaGetLastError());

  containerDelete<<<1, 1>>>(d_container);
  checkCudaErrors(cudaGetLastError());

  // Surfaces any fault raised while the kernels were executing.
  checkCudaErrors(cudaDeviceSynchronize());

  bool success = checkResult(d_result, blocks * threads);

  checkCudaErrors(cudaFree(d_result));

  return success;
}

// Runs placementNew on one block of `threads` threads and validates the
// results. Returns the verdict of checkResult.
bool testPlacementNew(int threads) {
  int *d_result;
  checkCudaErrors(cudaMalloc(&d_result, threads * sizeof(int)));

  placementNew<<<1, threads>>>(d_result);
  checkCudaErrors(cudaGetLastError());
  checkCudaErrors(cudaDeviceSynchronize());

  bool success = checkResult(d_result, threads);

  checkCudaErrors(cudaFree(d_result));

  return success;
}

// Runs complexVector on one block of `threads` threads and validates the
// results. Returns the verdict of checkResult.
bool testComplexType(int threads) {
  int *d_result;
  checkCudaErrors(cudaMalloc(&d_result, threads * sizeof(int)));

  complexVector<<<1, threads>>>(d_result);
  checkCudaErrors(cudaGetLastError());
  checkCudaErrors(cudaDeviceSynchronize());

  bool success = checkResult(d_result, threads);

  checkCudaErrors(cudaFree(d_result));

  return success;
}

////////////////////////////////////////////////////////////////////////////////
//
// MAIN
//
////////////////////////////////////////////////////////////////////////////////

// Runs the three tests in sequence and exits with EXIT_SUCCESS only when all
// of them pass.
int main(int argc, char **argv) {
  printf("%s Starting...\n\n", sSDKsample);

  // use command-line specified CUDA device, otherwise use device with highest
  // Gflops/s
  findCudaDevice(argc, (const char **)argv);

  // set the heap size for device size new/delete to 128 MB
  checkCudaErrors(cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128 * (1 << 20)));

  // Device-side slot that holds a single Container<int>* written by
  // vectorCreate.
  Container<int> **d_container;
  checkCudaErrors(cudaMalloc(&d_container, sizeof(Container<int> *)));

  bool bTest = false;
  int test_passed = 0;

  printf(" > Container = Vector test ");
  vectorCreate<<<1, 1>>>(d_container, 128 * 128);
  checkCudaErrors(cudaGetLastError());
  bTest = testContainer(d_container, 128, 128);
  printf(bTest ? "OK\n\n" : "NOT OK\n\n");
  test_passed += (bTest ? 1 : 0);

  checkCudaErrors(cudaFree(d_container));

  printf(" > Container = Vector, using placement new on SMEM buffer test ");
  bTest = testPlacementNew(1024);
  printf(bTest ? "OK\n\n" : "NOT OK\n\n");
  test_passed += (bTest ? 1 : 0);

  printf(" > Container = Vector, with user defined datatype test ");
  bTest = testComplexType(1024);
  printf(bTest ? "OK\n\n" : "NOT OK\n\n");
  test_passed += (bTest ? 1 : 0);

  printf("Test Summary: %d/3 succesfully run\n", test_passed);

  exit(test_passed == 3 ? EXIT_SUCCESS : EXIT_FAILURE);
}