cuda-samples/Samples/cdpBezierTessellation/BezierLineCDP.cu

/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <stdio.h>
#include <cuda_runtime_api.h>
#include <helper_cuda.h>
#include <string.h>

__forceinline__ __device__ float2 operator+(float2 a, float2 b) {
  float2 c;
  c.x = a.x + b.x;
  c.y = a.y + b.y;
  return c;
}

__forceinline__ __device__ float2 operator-(float2 a, float2 b) {
  float2 c;
  c.x = a.x - b.x;
  c.y = a.y - b.y;
  return c;
}

__forceinline__ __device__ float2 operator*(float a, float2 b) {
  float2 c;
  c.x = a * b.x;
  c.y = a * b.y;
  return c;
}

__forceinline__ __device__ float length(float2 a) {
  return sqrtf(a.x * a.x + a.y * a.y);
}

#define MAX_TESSELLATION 32
struct BezierLine {
  float2 CP[3];
  float2 *vertexPos;
  int nVertices;
};

__global__ void computeBezierLinePositions(int lidx, BezierLine *bLines,
                                           int nTessPoints) {
  int idx = threadIdx.x + blockDim.x * blockIdx.x;

  if (idx < nTessPoints) {
    float u = (float)idx / (float)(nTessPoints - 1);
    float omu = 1.0f - u;

    float B3u[3];

    B3u[0] = omu * omu;
    B3u[1] = 2.0f * u * omu;
    B3u[2] = u * u;

    float2 position = {0, 0};

    for (int i = 0; i < 3; i++) {
      position = position + B3u[i] * bLines[lidx].CP[i];
    }

    bLines[lidx].vertexPos[idx] = position;
  }
}

__global__ void computeBezierLinesCDP(BezierLine *bLines, int nLines) {
  int lidx = threadIdx.x + blockDim.x * blockIdx.x;

  if (lidx < nLines) {
    float curvature = length(bLines[lidx].CP[1] -
                             0.5f * (bLines[lidx].CP[0] + bLines[lidx].CP[2])) /
                      length(bLines[lidx].CP[2] - bLines[lidx].CP[0]);
    int nTessPoints = min(max((int)(curvature * 16.0f), 4), MAX_TESSELLATION);

    if (bLines[lidx].vertexPos == NULL) {
      bLines[lidx].nVertices = nTessPoints;
      cudaMalloc((void **)&bLines[lidx].vertexPos,
                 nTessPoints * sizeof(float2));
    }

    computeBezierLinePositions<<<ceilf((float)bLines[lidx].nVertices / 32.0f),
                                 32>>>(lidx, bLines, bLines[lidx].nVertices);
  }
}

__global__ void freeVertexMem(BezierLine *bLines, int nLines) {
  int lidx = threadIdx.x + blockDim.x * blockIdx.x;

  if (lidx < nLines) cudaFree(bLines[lidx].vertexPos);
}

unsigned int checkCapableSM35Device(int argc, char **argv) {
  // Get device properties
  cudaDeviceProp properties;
  int device_count = 0, device = -1;

  if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
    device = getCmdLineArgumentInt(argc, (const char **)argv, "device");

    cudaDeviceProp properties;
    checkCudaErrors(cudaGetDeviceProperties(&properties, device));

    if (properties.major > 3 ||
        (properties.major == 3 && properties.minor >= 5)) {
      printf("Running on GPU  %d (%s)\n", device, properties.name);
    } else {
      printf(
          "cdpBezierTessellation requires GPU devices with compute SM 3.5 or "
          "higher.");
      printf("Current GPU device has compute SM %d.%d. Exiting...\n",
             properties.major, properties.minor);
      return EXIT_FAILURE;
    }

  } else {
    checkCudaErrors(cudaGetDeviceCount(&device_count));

    for (int i = 0; i < device_count; ++i) {
      checkCudaErrors(cudaGetDeviceProperties(&properties, i));

      if (properties.major > 3 ||
          (properties.major == 3 && properties.minor >= 5)) {
        device = i;
        printf("Running on GPU %d (%s)\n", i, properties.name);
        break;
      }

      printf("GPU %d %s does not support CUDA Dynamic Parallelism\n", i,
             properties.name);
    }
  }
  if (device == -1) {
    fprintf(stderr,
            "cdpBezierTessellation requires GPU devices with compute SM 3.5 or "
            "higher.  Exiting...\n");
    return EXIT_WAIVED;
  }

  return EXIT_SUCCESS;
}

#define N_LINES 256
#define BLOCK_DIM 64
int main(int argc, char **argv) {
  BezierLine *bLines_h = new BezierLine[N_LINES];

  float2 last = {0, 0};

  for (int i = 0; i < N_LINES; i++) {
    bLines_h[i].CP[0] = last;

    for (int j = 1; j < 3; j++) {
      bLines_h[i].CP[j].x = (float)rand() / (float)RAND_MAX;
      bLines_h[i].CP[j].y = (float)rand() / (float)RAND_MAX;
    }

    last = bLines_h[i].CP[2];
    bLines_h[i].vertexPos = NULL;
    bLines_h[i].nVertices = 0;
  }

  unsigned int sm35Ret = checkCapableSM35Device(argc, argv);
  if (sm35Ret != EXIT_SUCCESS) {
    exit(sm35Ret);
  }

  BezierLine *bLines_d;
  checkCudaErrors(cudaMalloc((void **)&bLines_d, N_LINES * sizeof(BezierLine)));
  checkCudaErrors(cudaMemcpy(bLines_d, bLines_h, N_LINES * sizeof(BezierLine),
                             cudaMemcpyHostToDevice));
  printf("Computing Bezier Lines (CUDA Dynamic Parallelism Version) ... ");
  computeBezierLinesCDP<<<(unsigned int)ceil((float)N_LINES / (float)BLOCK_DIM),
                          BLOCK_DIM>>>(bLines_d, N_LINES);
  printf("Done!\n");

  // Do something to draw the lines here

  freeVertexMem<<<(unsigned int)ceil((float)N_LINES / (float)BLOCK_DIM),
                  BLOCK_DIM>>>(bLines_d, N_LINES);
  checkCudaErrors(cudaFree(bLines_d));
  delete[] bLines_h;

  exit(EXIT_SUCCESS);
}
add and update samples for CUDA 11.5 2021-10-21 19:04:49 +08:00			`/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.`
			`*`
			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions`
			`* are met:`
			`* * Redistributions of source code must retain the above copyright`
			`* notice, this list of conditions and the following disclaimer.`
			`* * Redistributions in binary form must reproduce the above copyright`
			`* notice, this list of conditions and the following disclaimer in the`
			`* documentation and/or other materials provided with the distribution.`
			`* * Neither the name of NVIDIA CORPORATION nor the names of its`
			`* contributors may be used to endorse or promote products derived`
			`* from this software without specific prior written permission.`
			`*`
			* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
			`* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR`
			`* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR`
			`* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,`
			`* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,`
			`* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR`
			`* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY`
			`* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`*/`

			`#include <stdio.h>`
			`#include <cuda_runtime_api.h>`
			`#include <helper_cuda.h>`
			`#include <string.h>`

			`__forceinline__ __device__ float2 operator+(float2 a, float2 b) {`
			`float2 c;`
			`c.x = a.x + b.x;`
			`c.y = a.y + b.y;`
			`return c;`
			`}`

			`__forceinline__ __device__ float2 operator-(float2 a, float2 b) {`
			`float2 c;`
			`c.x = a.x - b.x;`
			`c.y = a.y - b.y;`
			`return c;`
			`}`

			`__forceinline__ __device__ float2 operator*(float a, float2 b) {`
			`float2 c;`
			`c.x = a * b.x;`
			`c.y = a * b.y;`
			`return c;`
			`}`

			`__forceinline__ __device__ float length(float2 a) {`
			`return sqrtf(a.x * a.x + a.y * a.y);`
			`}`

			`#define MAX_TESSELLATION 32`
			`struct BezierLine {`
			`float2 CP[3];`
			`float2 *vertexPos;`
			`int nVertices;`
			`};`

			`__global__ void computeBezierLinePositions(int lidx, BezierLine *bLines,`
			`int nTessPoints) {`
			`int idx = threadIdx.x + blockDim.x * blockIdx.x;`

			`if (idx < nTessPoints) {`
			`float u = (float)idx / (float)(nTessPoints - 1);`
			`float omu = 1.0f - u;`

			`float B3u[3];`

			`B3u[0] = omu * omu;`
			`B3u[1] = 2.0f * u * omu;`
			`B3u[2] = u * u;`

			`float2 position = {0, 0};`

			`for (int i = 0; i < 3; i++) {`
			`position = position + B3u[i] * bLines[lidx].CP[i];`
			`}`

			`bLines[lidx].vertexPos[idx] = position;`
			`}`
			`}`

			`__global__ void computeBezierLinesCDP(BezierLine *bLines, int nLines) {`
			`int lidx = threadIdx.x + blockDim.x * blockIdx.x;`

			`if (lidx < nLines) {`
			`float curvature = length(bLines[lidx].CP[1] -`
			`0.5f * (bLines[lidx].CP[0] + bLines[lidx].CP[2])) /`
			`length(bLines[lidx].CP[2] - bLines[lidx].CP[0]);`
			`int nTessPoints = min(max((int)(curvature * 16.0f), 4), MAX_TESSELLATION);`

			`if (bLines[lidx].vertexPos == NULL) {`
			`bLines[lidx].nVertices = nTessPoints;`
			`cudaMalloc((void **)&bLines[lidx].vertexPos,`
			`nTessPoints * sizeof(float2));`
			`}`

			`computeBezierLinePositions<<<ceilf((float)bLines[lidx].nVertices / 32.0f),`
			`32>>>(lidx, bLines, bLines[lidx].nVertices);`
			`}`
			`}`

			`__global__ void freeVertexMem(BezierLine *bLines, int nLines) {`
			`int lidx = threadIdx.x + blockDim.x * blockIdx.x;`

			`if (lidx < nLines) cudaFree(bLines[lidx].vertexPos);`
			`}`

			`unsigned int checkCapableSM35Device(int argc, char **argv) {`
			`// Get device properties`
			`cudaDeviceProp properties;`
			`int device_count = 0, device = -1;`

			`if (checkCmdLineFlag(argc, (const char **)argv, "device")) {`
			`device = getCmdLineArgumentInt(argc, (const char **)argv, "device");`

			`cudaDeviceProp properties;`
			`checkCudaErrors(cudaGetDeviceProperties(&properties, device));`

			`if (properties.major > 3 \|\|`
			`(properties.major == 3 && properties.minor >= 5)) {`
			`printf("Running on GPU %d (%s)\n", device, properties.name);`
			`} else {`
			`printf(`
			`"cdpBezierTessellation requires GPU devices with compute SM 3.5 or "`
			`"higher.");`
			`printf("Current GPU device has compute SM %d.%d. Exiting...\n",`
			`properties.major, properties.minor);`
			`return EXIT_FAILURE;`
			`}`

			`} else {`
			`checkCudaErrors(cudaGetDeviceCount(&device_count));`

			`for (int i = 0; i < device_count; ++i) {`
			`checkCudaErrors(cudaGetDeviceProperties(&properties, i));`

			`if (properties.major > 3 \|\|`
			`(properties.major == 3 && properties.minor >= 5)) {`
			`device = i;`
			`printf("Running on GPU %d (%s)\n", i, properties.name);`
			`break;`
			`}`

			`printf("GPU %d %s does not support CUDA Dynamic Parallelism\n", i,`
			`properties.name);`
			`}`
			`}`
			`if (device == -1) {`
			`fprintf(stderr,`
			`"cdpBezierTessellation requires GPU devices with compute SM 3.5 or "`
			`"higher. Exiting...\n");`
			`return EXIT_WAIVED;`
			`}`

			`return EXIT_SUCCESS;`
			`}`

			`#define N_LINES 256`
			`#define BLOCK_DIM 64`
			`int main(int argc, char **argv) {`
			`BezierLine *bLines_h = new BezierLine[N_LINES];`

			`float2 last = {0, 0};`

			`for (int i = 0; i < N_LINES; i++) {`
			`bLines_h[i].CP[0] = last;`

			`for (int j = 1; j < 3; j++) {`
			`bLines_h[i].CP[j].x = (float)rand() / (float)RAND_MAX;`
			`bLines_h[i].CP[j].y = (float)rand() / (float)RAND_MAX;`
			`}`

			`last = bLines_h[i].CP[2];`
			`bLines_h[i].vertexPos = NULL;`
			`bLines_h[i].nVertices = 0;`
			`}`

			`unsigned int sm35Ret = checkCapableSM35Device(argc, argv);`
			`if (sm35Ret != EXIT_SUCCESS) {`
			`exit(sm35Ret);`
			`}`

			`BezierLine *bLines_d;`
			`checkCudaErrors(cudaMalloc((void *)&bLines_d, N_LINES sizeof(BezierLine)));`
			`checkCudaErrors(cudaMemcpy(bLines_d, bLines_h, N_LINES * sizeof(BezierLine),`
			`cudaMemcpyHostToDevice));`
			`printf("Computing Bezier Lines (CUDA Dynamic Parallelism Version) ... ");`
			`computeBezierLinesCDP<<<(unsigned int)ceil((float)N_LINES / (float)BLOCK_DIM),`
			`BLOCK_DIM>>>(bLines_d, N_LINES);`
			`printf("Done!\n");`

			`// Do something to draw the lines here`

			`freeVertexMem<<<(unsigned int)ceil((float)N_LINES / (float)BLOCK_DIM),`
			`BLOCK_DIM>>>(bLines_d, N_LINES);`
			`checkCudaErrors(cudaFree(bLines_d));`
			`delete[] bLines_h;`

			`exit(EXIT_SUCCESS);`
			`}`