cuda-samples/Samples/2_Concepts_and_Techniques/segmentationTreeThrust/kernels.cuh

/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Various kernels and functors used throughout the algorithm. For details
 * on usage see "SegmentationTreeBuilder::invokeStep()".
 */

#ifndef _KERNELS_H_
#define _KERNELS_H_

#include <stdio.h>
#include <thrust/functional.h>

#include "common.cuh"

// Functors used with thrust library.
template <typename Input>
struct IsGreaterEqualThan : public thrust::unary_function<Input, bool>
{
    __host__ __device__ IsGreaterEqualThan(uint upperBound) :
        upperBound_(upperBound) {}

    __host__ __device__ bool operator()(const Input &value) const
    {
        return value >= upperBound_;
    }

    uint upperBound_;
};

// CUDA kernels.
__global__ void addScalar(uint *array, int scalar, uint size)
{
    uint tid = blockIdx.x * blockDim.x + threadIdx.x;

    if (tid < size)
    {
        array[tid] += scalar;
    }
}

__global__ void markSegments(const uint *verticesOffsets,
                             uint *flags,
                             uint verticesCount)
{
    uint tid = blockIdx.x * blockDim.x + threadIdx.x;

    if (tid < verticesCount)
    {
        flags[verticesOffsets[tid]] = 1;
    }
}

__global__ void getVerticesMapping(const uint *clusteredVerticesIDs,
                                   const uint *newVerticesIDs,
                                   uint *verticesMapping,
                                   uint verticesCount)
{
    uint tid = blockIdx.x * blockDim.x + threadIdx.x;

    if (tid < verticesCount)
    {
        uint vertexID = clusteredVerticesIDs[tid];
        verticesMapping[vertexID] = newVerticesIDs[tid];
    }
}

__global__ void getSuccessors(const uint *verticesOffsets,
                              const uint *minScannedEdges,
                              uint *successors,
                              uint verticesCount,
                              uint edgesCount)
{
    uint tid = blockIdx.x * blockDim.x + threadIdx.x;

    if (tid < verticesCount)
    {
        uint successorPos = (tid < verticesCount - 1) ?
                            (verticesOffsets[tid + 1] - 1) :
                            (edgesCount - 1);

        successors[tid] = minScannedEdges[successorPos];
    }
}

__global__ void removeCycles(uint *successors,
                             uint verticesCount)
{
    uint tid = blockIdx.x * blockDim.x + threadIdx.x;

    if (tid < verticesCount)
    {
        uint successor = successors[tid];
        uint nextSuccessor = successors[successor];

        if (tid == nextSuccessor)
        {
            if (tid < successor)
            {
                successors[tid] = tid;
            }
            else
            {
                successors[successor] = successor;
            }
        }
    }
}

__global__ void getRepresentatives(const uint *successors,
                                   uint *representatives,
                                   uint verticesCount)
{
    uint tid = blockIdx.x * blockDim.x + threadIdx.x;

    if (tid < verticesCount)
    {
        uint successor = successors[tid];
        uint nextSuccessor = successors[successor];

        while (successor != nextSuccessor)
        {
            successor = nextSuccessor;
            nextSuccessor = successors[nextSuccessor];
        }

        representatives[tid] = successor;
    }
}

__global__ void invalidateLoops(const uint *startpoints,
                                const uint *verticesMapping,
                                uint *edges,
                                uint edgesCount)
{
    uint tid = blockIdx.x * blockDim.x + threadIdx.x;

    if (tid < edgesCount)
    {
        uint startpoint = startpoints[tid];
        uint &endpoint = edges[tid];

        uint newStartpoint = verticesMapping[startpoint];
        uint newEndpoint = verticesMapping[endpoint];

        if (newStartpoint == newEndpoint)
        {
            endpoint = UINT_MAX;
        }
    }
}

__global__ void calculateEdgesInfo(const uint *startpoints,
                                   const uint *verticesMapping,
                                   const uint *edges,
                                   const float *weights,
                                   uint *newStartpoints,
                                   uint *survivedEdgesIDs,
                                   uint edgesCount,
                                   uint newVerticesCount)
{
    uint tid = blockIdx.x * blockDim.x + threadIdx.x;

    if (tid < edgesCount)
    {
        uint startpoint = startpoints[tid];
        uint endpoint = edges[tid];

        newStartpoints[tid] = endpoint < UINT_MAX ?
                              verticesMapping[startpoint] :
                              newVerticesCount + verticesMapping[startpoint];

        survivedEdgesIDs[tid] = endpoint < UINT_MAX ?
                                tid :
                                UINT_MAX;
    }
}

__global__ void makeNewEdges(const uint *survivedEdgesIDs,
                             const uint *verticesMapping,
                             const uint *edges,
                             const float *weights,
                             uint *newEdges,
                             float *newWeights,
                             uint edgesCount)
{
    uint tid = blockIdx.x * blockDim.x + threadIdx.x;

    if (tid < edgesCount)
    {
        uint edgeID = survivedEdgesIDs[tid];
        uint oldEdge = edges[edgeID];

        newEdges[tid] = verticesMapping[oldEdge];
        newWeights[tid] = weights[edgeID];
    }
}

#endif // #ifndef _KERNELS_H_
add and update samples for CUDA 11.6 2022-01-13 14:05:24 +08:00			`/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.`
add and update samples for CUDA 11.5 2021-10-21 19:04:49 +08:00			`*`
add and update samples for CUDA 11.6 2022-01-13 14:05:24 +08:00			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions`
			`* are met:`
			`* * Redistributions of source code must retain the above copyright`
			`* notice, this list of conditions and the following disclaimer.`
			`* * Redistributions in binary form must reproduce the above copyright`
			`* notice, this list of conditions and the following disclaimer in the`
			`* documentation and/or other materials provided with the distribution.`
			`* * Neither the name of NVIDIA CORPORATION nor the names of its`
			`* contributors may be used to endorse or promote products derived`
			`* from this software without specific prior written permission.`
add and update samples for CUDA 11.5 2021-10-21 19:04:49 +08:00			`*`
add and update samples for CUDA 11.6 2022-01-13 14:05:24 +08:00			* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
			`* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR`
			`* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR`
			`* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,`
			`* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,`
			`* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR`
			`* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY`
			`* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
add and update samples for CUDA 11.5 2021-10-21 19:04:49 +08:00			`*/`

			`/*`
			`* Various kernels and functors used throughout the algorithm. For details`
			`* on usage see "SegmentationTreeBuilder::invokeStep()".`
			`*/`

			`#ifndef _KERNELS_H_`
			`#define _KERNELS_H_`

			`#include <stdio.h>`
			`#include <thrust/functional.h>`

			`#include "common.cuh"`

			`// Functors used with thrust library.`
			`template <typename Input>`
			`struct IsGreaterEqualThan : public thrust::unary_function<Input, bool>`
			`{`
			`__host__ __device__ IsGreaterEqualThan(uint upperBound) :`
			`upperBound_(upperBound) {}`

			`__host__ __device__ bool operator()(const Input &value) const`
			`{`
			`return value >= upperBound_;`
			`}`

			`uint upperBound_;`
			`};`

			`// CUDA kernels.`
			`__global__ void addScalar(uint *array, int scalar, uint size)`
			`{`
			`uint tid = blockIdx.x * blockDim.x + threadIdx.x;`

			`if (tid < size)`
			`{`
			`array[tid] += scalar;`
			`}`
			`}`

			`__global__ void markSegments(const uint *verticesOffsets,`
			`uint *flags,`
			`uint verticesCount)`
			`{`
			`uint tid = blockIdx.x * blockDim.x + threadIdx.x;`

			`if (tid < verticesCount)`
			`{`
			`flags[verticesOffsets[tid]] = 1;`
			`}`
			`}`

			`__global__ void getVerticesMapping(const uint *clusteredVerticesIDs,`
			`const uint *newVerticesIDs,`
			`uint *verticesMapping,`
			`uint verticesCount)`
			`{`
			`uint tid = blockIdx.x * blockDim.x + threadIdx.x;`

			`if (tid < verticesCount)`
			`{`
			`uint vertexID = clusteredVerticesIDs[tid];`
			`verticesMapping[vertexID] = newVerticesIDs[tid];`
			`}`
			`}`

			`__global__ void getSuccessors(const uint *verticesOffsets,`
			`const uint *minScannedEdges,`
			`uint *successors,`
			`uint verticesCount,`
			`uint edgesCount)`
			`{`
			`uint tid = blockIdx.x * blockDim.x + threadIdx.x;`

			`if (tid < verticesCount)`
			`{`
			`uint successorPos = (tid < verticesCount - 1) ?`
			`(verticesOffsets[tid + 1] - 1) :`
			`(edgesCount - 1);`

			`successors[tid] = minScannedEdges[successorPos];`
			`}`
			`}`

			`__global__ void removeCycles(uint *successors,`
			`uint verticesCount)`
			`{`
			`uint tid = blockIdx.x * blockDim.x + threadIdx.x;`

			`if (tid < verticesCount)`
			`{`
			`uint successor = successors[tid];`
			`uint nextSuccessor = successors[successor];`

			`if (tid == nextSuccessor)`
			`{`
			`if (tid < successor)`
			`{`
			`successors[tid] = tid;`
			`}`
			`else`
			`{`
			`successors[successor] = successor;`
			`}`
			`}`
			`}`
			`}`

			`__global__ void getRepresentatives(const uint *successors,`
			`uint *representatives,`
			`uint verticesCount)`
			`{`
			`uint tid = blockIdx.x * blockDim.x + threadIdx.x;`

			`if (tid < verticesCount)`
			`{`
			`uint successor = successors[tid];`
			`uint nextSuccessor = successors[successor];`

			`while (successor != nextSuccessor)`
			`{`
			`successor = nextSuccessor;`
			`nextSuccessor = successors[nextSuccessor];`
			`}`

			`representatives[tid] = successor;`
			`}`
			`}`

			`__global__ void invalidateLoops(const uint *startpoints,`
			`const uint *verticesMapping,`
			`uint *edges,`
			`uint edgesCount)`
			`{`
			`uint tid = blockIdx.x * blockDim.x + threadIdx.x;`

			`if (tid < edgesCount)`
			`{`
			`uint startpoint = startpoints[tid];`
			`uint &endpoint = edges[tid];`

			`uint newStartpoint = verticesMapping[startpoint];`
			`uint newEndpoint = verticesMapping[endpoint];`

			`if (newStartpoint == newEndpoint)`
			`{`
			`endpoint = UINT_MAX;`
			`}`
			`}`
			`}`

			`__global__ void calculateEdgesInfo(const uint *startpoints,`
			`const uint *verticesMapping,`
			`const uint *edges,`
			`const float *weights,`
			`uint *newStartpoints,`
			`uint *survivedEdgesIDs,`
			`uint edgesCount,`
			`uint newVerticesCount)`
			`{`
			`uint tid = blockIdx.x * blockDim.x + threadIdx.x;`

			`if (tid < edgesCount)`
			`{`
			`uint startpoint = startpoints[tid];`
			`uint endpoint = edges[tid];`

			`newStartpoints[tid] = endpoint < UINT_MAX ?`
			`verticesMapping[startpoint] :`
			`newVerticesCount + verticesMapping[startpoint];`

			`survivedEdgesIDs[tid] = endpoint < UINT_MAX ?`
			`tid :`
			`UINT_MAX;`
			`}`
			`}`

			`__global__ void makeNewEdges(const uint *survivedEdgesIDs,`
			`const uint *verticesMapping,`
			`const uint *edges,`
			`const float *weights,`
			`uint *newEdges,`
			`float *newWeights,`
			`uint edgesCount)`
			`{`
			`uint tid = blockIdx.x * blockDim.x + threadIdx.x;`

			`if (tid < edgesCount)`
			`{`
			`uint edgeID = survivedEdgesIDs[tid];`
			`uint oldEdge = edges[edgeID];`

			`newEdges[tid] = verticesMapping[oldEdge];`
			`newWeights[tid] = weights[edgeID];`
			`}`
			`}`

			`#endif // #ifndef _KERNELS_H_`