/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include __forceinline__ __device__ float2 operator+(float2 a, float2 b) { float2 c; c.x = a.x + b.x; c.y = a.y + b.y; return c; } __forceinline__ __device__ float2 operator-(float2 a, float2 b) { float2 c; c.x = a.x - b.x; c.y = a.y - b.y; return c; } __forceinline__ __device__ float2 operator*(float a, float2 b) { float2 c; c.x = a * b.x; c.y = a * b.y; return c; } __forceinline__ __device__ float length(float2 a) { return sqrtf(a.x * a.x + a.y * a.y); } #define MAX_TESSELLATION 32 struct BezierLine { float2 CP[3]; float2 *vertexPos; int nVertices; }; __global__ void computeBezierLinePositions(int lidx, BezierLine *bLines, int nTessPoints) { int idx = threadIdx.x + blockDim.x * blockIdx.x; if (idx < nTessPoints) { float u = (float)idx / (float)(nTessPoints - 1); float omu = 1.0f - u; float B3u[3]; B3u[0] = omu * omu; B3u[1] = 2.0f * u * omu; B3u[2] = u * u; float2 position = {0, 0}; for (int i = 0; i < 3; i++) { position = position + B3u[i] * bLines[lidx].CP[i]; } bLines[lidx].vertexPos[idx] = position; } } __global__ void computeBezierLinesCDP(BezierLine *bLines, int nLines) { int lidx = threadIdx.x + blockDim.x * blockIdx.x; if (lidx < nLines) { float curvature = length(bLines[lidx].CP[1] - 0.5f * (bLines[lidx].CP[0] + bLines[lidx].CP[2])) / length(bLines[lidx].CP[2] - bLines[lidx].CP[0]); int nTessPoints = min(max((int)(curvature * 16.0f), 4), MAX_TESSELLATION); if (bLines[lidx].vertexPos == NULL) { bLines[lidx].nVertices = nTessPoints; cudaMalloc((void **)&bLines[lidx].vertexPos, nTessPoints * sizeof(float2)); } computeBezierLinePositions<<>>(lidx, bLines, bLines[lidx].nVertices); } } __global__ void freeVertexMem(BezierLine *bLines, int nLines) { int lidx = threadIdx.x + blockDim.x * blockIdx.x; if (lidx < nLines) cudaFree(bLines[lidx].vertexPos); } unsigned int checkCapableSM35Device(int argc, char **argv) { // Get device properties cudaDeviceProp properties; int device_count = 0, device = -1; if (checkCmdLineFlag(argc, (const char **)argv, "device")) { device = getCmdLineArgumentInt(argc, (const char **)argv, "device"); cudaDeviceProp properties; checkCudaErrors(cudaGetDeviceProperties(&properties, device)); if (properties.major > 3 || (properties.major == 3 && properties.minor >= 5)) { printf("Running on GPU %d (%s)\n", device, properties.name); } else { printf( "cdpBezierTessellation requires GPU devices with compute SM 3.5 or " "higher."); printf("Current GPU device has compute SM %d.%d. Exiting...\n", properties.major, properties.minor); return EXIT_FAILURE; } } else { checkCudaErrors(cudaGetDeviceCount(&device_count)); for (int i = 0; i < device_count; ++i) { checkCudaErrors(cudaGetDeviceProperties(&properties, i)); if (properties.major > 3 || (properties.major == 3 && properties.minor >= 5)) { device = i; printf("Running on GPU %d (%s)\n", i, properties.name); break; } printf("GPU %d %s does not support CUDA Dynamic Parallelism\n", i, properties.name); } } if (device == -1) { fprintf(stderr, "cdpBezierTessellation requires GPU devices with compute SM 3.5 or " "higher. Exiting...\n"); return EXIT_WAIVED; } return EXIT_SUCCESS; } #define N_LINES 256 #define BLOCK_DIM 64 int main(int argc, char **argv) { BezierLine *bLines_h = new BezierLine[N_LINES]; float2 last = {0, 0}; for (int i = 0; i < N_LINES; i++) { bLines_h[i].CP[0] = last; for (int j = 1; j < 3; j++) { bLines_h[i].CP[j].x = (float)rand() / (float)RAND_MAX; bLines_h[i].CP[j].y = (float)rand() / (float)RAND_MAX; } last = bLines_h[i].CP[2]; bLines_h[i].vertexPos = NULL; bLines_h[i].nVertices = 0; } unsigned int sm35Ret = checkCapableSM35Device(argc, argv); if (sm35Ret != EXIT_SUCCESS) { exit(sm35Ret); } BezierLine *bLines_d; checkCudaErrors(cudaMalloc((void **)&bLines_d, N_LINES * sizeof(BezierLine))); checkCudaErrors(cudaMemcpy(bLines_d, bLines_h, N_LINES * sizeof(BezierLine), cudaMemcpyHostToDevice)); printf("Computing Bezier Lines (CUDA Dynamic Parallelism Version) ... "); computeBezierLinesCDP<<<(unsigned int)ceil((float)N_LINES / (float)BLOCK_DIM), BLOCK_DIM>>>(bLines_d, N_LINES); printf("Done!\n"); // Do something to draw the lines here freeVertexMem<<<(unsigned int)ceil((float)N_LINES / (float)BLOCK_DIM), BLOCK_DIM>>>(bLines_d, N_LINES); checkCudaErrors(cudaFree(bLines_d)); delete[] bLines_h; exit(EXIT_SUCCESS); }