mirror of https://github.com/NVIDIA/cuda-samples.git
synced 2025-07-01 20:20:29 +08:00
897 lines · 33 KiB · Plaintext
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
|
|
|
|
/*
 * This application demonstrates an approach to the image segmentation
 * trees construction. It is based on Boruvka's MST algorithm.
 * Here's the complete list of references:
 * 1) V. Vineet et al, "Fast Minimum Spanning Tree for
 *    Large Graphs on the GPU";
 * 2) P. Felzenszwalb et al, "Efficient Graph-Based Image Segmentation";
 * 3) A. Ion et al, "Considerations Regarding the Minimum Spanning
 *    Tree Pyramid Segmentation Method".
 */
|
|
|
|
// System includes.
|
|
#include <math.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
// STL includes.
|
|
#include <algorithm>
|
|
#include <deque>
|
|
#include <fstream>
|
|
#include <iostream>
|
|
#include <iterator>
|
|
#include <list>
|
|
#include <vector>
|
|
|
|
// Thrust library includes.
|
|
#include <thrust/adjacent_difference.h>
|
|
#include <thrust/copy.h>
|
|
#include <thrust/device_free.h>
|
|
#include <thrust/device_malloc.h>
|
|
#include <thrust/fill.h>
|
|
#include <thrust/find.h>
|
|
#include <thrust/for_each.h>
|
|
#include <thrust/iterator/counting_iterator.h>
|
|
#include <thrust/iterator/discard_iterator.h>
|
|
#include <thrust/iterator/transform_iterator.h>
|
|
#include <thrust/iterator/zip_iterator.h>
|
|
#include <thrust/reduce.h>
|
|
#include <thrust/scan.h>
|
|
#include <thrust/sequence.h>
|
|
#include <thrust/sort.h>
|
|
#include <thrust/unique.h>
|
|
|
|
// Sample framework includes.
|
|
#include <helper_cuda.h>
|
|
#include <helper_functions.h>
|
|
|
|
// Project includes.
|
|
#include "common.cuh"
|
|
|
|
// Kernels.
|
|
#include "kernels.cuh"
|
|
|
|
using std::cin;
|
|
using std::cout;
|
|
using std::deque;
|
|
using std::endl;
|
|
using std::list;
|
|
using std::vector;
|
|
|
|
// Very simple von Neumann middle-square prng. rand() is different across
// various OS platforms, which makes testing and the output inconsistent.
// Returns the next pseudo-random value. State lives in a function-local
// static, so the sequence is process-global and not thread-safe.
int myrand(void)
{
    static int seed = 72191;
    char sq[22];

    // Square the seed in unsigned arithmetic: the original "seed *= seed"
    // overflows a signed int (72191^2 > INT_MAX), which is undefined
    // behaviour. Unsigned multiplication wraps mod 2^32 and reproduces the
    // historical sequence on two's-complement targets.
    seed = static_cast<int>(static_cast<unsigned int>(seed) * static_cast<unsigned int>(seed));
    // snprintf instead of sprintf: bounds the write to the local buffer.
    snprintf(sq, sizeof(sq), "%010d", seed);
    // pull the middle 5 digits out of sq
    sq[8] = 0;
    seed = atoi(&sq[3]);

    return seed;
}
|
|
|
|
// Simple memory pool class. It is nothing more than array of fixed-sized
// arrays. One contiguous device buffer is allocated up front and sliced
// into chunks, so there is a single device_malloc/device_free pair for
// the whole pool.
template <typename T> class DeviceMemoryPool
{
public:
    // The parameters of the constructor are as follows:
    // 1) uint chunkSize --- size (in elements of T) of the particular array;
    // 2) uint chunksCount --- number of fixed-sized arrays.
    // Aborts the process if the device allocation fails.
    DeviceMemoryPool(uint chunkSize, uint chunksCount)
        : chunkSize_(chunkSize)
    {
        // Round each chunk up to a multiple of 512 bytes so every chunk
        // starts at a well-aligned offset inside the common allocation.
        chunkRawSize_ = (chunkSize * sizeof(T) + 511) & ~511;

        try {
            basePtr_ = thrust::device_malloc(chunkRawSize_ * chunksCount);
        }
        catch (thrust::system_error &e) {
            cout << "Pool memory allocation failed (" << e.what() << ")" << endl;
            exit(EXIT_FAILURE);
        }

        // Populate the free list with the start address of every chunk.
        for (uint chunkIndex = 0; chunkIndex < chunksCount; ++chunkIndex) {
            chunks_.push_back(thrust::device_ptr<T>(
                reinterpret_cast<T *>(static_cast<char *>(basePtr_.get()) + chunkRawSize_ * chunkIndex)));
        }
    }

    ~DeviceMemoryPool()
    {
        try {
            thrust::device_free(basePtr_);
        }
        catch (thrust::system_error &e) {
            // This path releases the pool's buffer; the old message wrongly
            // reported it as an allocation failure.
            cout << "Pool memory deallocation failed (" << e.what() << ")" << endl;
            exit(EXIT_FAILURE);
        }
    }

    // Returns an address of the first available array
    // in the memory pool. Exhausting the pool previously dereferenced
    // back() of an empty list (undefined behaviour); fail loudly instead.
    thrust::device_ptr<T> get()
    {
        if (chunks_.empty()) {
            cout << "Pool is out of free chunks" << endl;
            exit(EXIT_FAILURE);
        }

        thrust::device_ptr<T> ptr(chunks_.back());
        chunks_.pop_back();

        return ptr;
    }

    // Pushes an address stored in "ptr" to the list
    // of available arrays of the memory pool.
    // It should be noted that it is user who is responsible for returning
    // the previously requested memory to the appropriate pool.
    inline void put(const thrust::device_ptr<T> &ptr) { chunks_.push_back(ptr); }

    // Number of chunks currently available through get().
    uint totalFreeChunks() const { return chunks_.size(); }

private:
    uint chunkSize_, chunkRawSize_;    // elements per chunk / padded bytes per chunk
    thrust::device_ptr<void> basePtr_; // start of the single device allocation

    list<thrust::device_ptr<T>> chunks_; // free list of chunk start addresses
};
|
|
|
|
// Graph structure: a CSR-like adjacency representation of a weighted
// directed graph, stored in three parallel host vectors.
struct Graph
{
    Graph() {}

    // Pre-sizes the three arrays for the given vertex and edge counts.
    Graph(unsigned int verticesCount, unsigned int edgesCount)
        : vertices(verticesCount)
        , edges(edgesCount)
        , weights(edgesCount)
    {
    }

    // Per-vertex offsets into "edges"/"weights". Entry i is the index of
    // the first outgoing edge of vertex #i; e.g. "vertices[0]" locates the
    // first edge of vertex #0, "vertices[1]" that of vertex #1, and so on.
    std::vector<unsigned int> vertices;

    // Endpoint vertex index of each edge; "edges[vertices[0]]" is the
    // first neighbour of vertex #0.
    std::vector<unsigned int> edges;

    // Weight of the edge with the same index in "edges".
    std::vector<float> weights;
};
|
|
|
|
// Simple segmentation tree class.
// Each level of the tree corresponds to the segmentation.
// See "Level" class for the details.
class Pyramid
{
public:
    // Appends the next level. "superVerticesOffsets" (totalSuperNodes
    // elements) and "verticesIDs" (totalNodes elements) are device arrays;
    // they are copied to the host immediately, so the caller may reuse the
    // device memory afterwards.
    void addLevel(uint totalSuperNodes,
                  uint totalNodes,
                  thrust::device_ptr<uint> superVerticesOffsets,
                  thrust::device_ptr<uint> verticesIDs)
    {
        levels_.push_back(Level(totalSuperNodes, totalNodes));
        levels_.back().buildFromDeviceData(superVerticesOffsets, verticesIDs);
    }

    // Number of levels collected so far.
    uint levelsCount() const { return static_cast<uint>(levels_.size()); }

    // Writes every level of a width x height image as "level_<n>.ppm",
    // iterating from the most recently added level downwards.
    // NOTE(review): assumes at least one level exists — log10(0) below is
    // undefined for an empty pyramid; confirm callers guarantee this.
    void dump(uint width, uint height) const
    {
        char filename[256], format[256];
        uint levelIndex = 0;

        // Zero-pad level indices so the generated file names sort naturally.
        uint requiredDigitsCount = static_cast<uint>(log10(static_cast<float>(levelsCount()))) + 1;
        sprintf(format, "level_%%0%uu.ppm", requiredDigitsCount);

        for (LevelsIterator level = levels_.rbegin(); level != levels_.rend(); ++level, ++levelIndex) {

            sprintf(filename, format, levelIndex);
            dumpLevel(level, width, height, filename);
        }
    }

private:
    // Level of the segmentation tree.
    class Level
    {
    public:
        // Only sizes the host vectors; buildFromDeviceData() fills them.
        Level(uint totalSuperNodes, uint totalNodes)
            : superNodesOffsets_(totalSuperNodes)
            , nodes_(totalNodes)
        {
        }

        // Copies this level's description from device to host memory.
        void buildFromDeviceData(thrust::device_ptr<uint> superVerticesOffsets, thrust::device_ptr<uint> verticesIDs)
        {
            checkCudaErrors(cudaMemcpy(&(superNodesOffsets_[0]),
                                       superVerticesOffsets.get(),
                                       sizeof(uint) * superNodesOffsets_.size(),
                                       cudaMemcpyDeviceToHost));

            checkCudaErrors(
                cudaMemcpy(&(nodes_[0]), verticesIDs.get(), sizeof(uint) * nodes_.size(), cudaMemcpyDeviceToHost));
        }

    private:
        friend class Pyramid;

        // The pair of the following vectors describes the
        // relation between the consecutive levels.
        // Consider an example. Let the index of the current level be n.
        // Then nodes of level #(n-1) with indices stored in
        // "nodes[superNodesOffsets_[0]]",
        // "nodes[superNodesOffsets_[0] + 1]",
        // ...,
        // "nodes[superNodesOffsets_[1] - 1]"
        // correspond to vertex #0 of level #n. An so on.
        vector<uint> superNodesOffsets_;
        vector<uint> nodes_;
    };

    typedef list<Level>::const_reverse_iterator LevelsIterator;

    // Dumps level to the file "level_n.ppm" where n
    // is index of the level. Segments are drawn in random colors.
    // Segment membership is propagated from the given level down to the
    // base level (individual pixels) through a work queue of
    // (node ID at the current level, segment index) pairs.
    void dumpLevel(LevelsIterator level, uint width, uint height, const char *filename) const
    {
        deque<std::pair<uint, uint>> nodesQueue;

        uint totalSegments;

        // Seed the queue with the nodes of the requested level; every
        // super-node of this level becomes its own segment.
        {
            const vector<uint> &superNodesOffsets = level->superNodesOffsets_;
            const vector<uint> &nodes = level->nodes_;

            totalSegments = static_cast<uint>(superNodesOffsets.size());

            for (uint superNodeIndex = 0, nodeIndex = 0; superNodeIndex < superNodesOffsets.size(); ++superNodeIndex) {

                // The last super-node's range extends to the end of "nodes".
                uint superNodeEnd = superNodeIndex + 1 < superNodesOffsets.size()
                                        ? superNodesOffsets[superNodeIndex + 1]
                                        : static_cast<uint>(nodes.size());

                for (; nodeIndex < superNodeEnd; ++nodeIndex) {
                    nodesQueue.push_back(std::make_pair(nodes[nodeIndex], superNodeIndex));
                }
            }
        }

        ++level;

        // Expand queue entries one level at a time until node IDs refer to
        // base-level nodes (pixels).
        while (level != levels_.rend()) {
            uint superNodesCount = static_cast<uint>(nodesQueue.size());

            const vector<uint> &superNodesOffsets = level->superNodesOffsets_;
            const vector<uint> &nodes = level->nodes_;

            while (superNodesCount--) {
                std::pair<uint, uint> currentNode = nodesQueue.front();
                nodesQueue.pop_front();

                uint superNodeBegin = superNodesOffsets[currentNode.first];

                uint superNodeEnd = currentNode.first + 1 < superNodesOffsets.size()
                                        ? superNodesOffsets[currentNode.first + 1]
                                        : static_cast<uint>(nodes.size());

                // Children inherit the segment index of their parent.
                for (uint nodeIndex = superNodeBegin; nodeIndex < superNodeEnd; ++nodeIndex) {

                    nodesQueue.push_back(std::make_pair(nodes[nodeIndex], currentNode.second));
                }
            }

            ++level;
        }

        // One random RGB colour per segment (deterministic via myrand()).
        vector<uint> colors(3 * totalSegments);

        for (uint colorIndex = 0; colorIndex < totalSegments; ++colorIndex) {
            colors[colorIndex * 3] = myrand() % 256;
            colors[colorIndex * 3 + 1] = myrand() % 256;
            colors[colorIndex * 3 + 2] = myrand() % 256;
        }

        uchar *image = new uchar[width * height * 3];

        // At this point each queue entry maps a pixel index to its segment.
        while (!nodesQueue.empty()) {
            std::pair<uint, uint> currentNode = nodesQueue.front();
            nodesQueue.pop_front();

            uint pixelIndex = currentNode.first;
            uint pixelSegment = currentNode.second;

            image[pixelIndex * 3] = colors[pixelSegment * 3];
            image[pixelIndex * 3 + 1] = colors[pixelSegment * 3 + 1];
            image[pixelIndex * 3 + 2] = colors[pixelSegment * 3 + 2];
        }

        __savePPM(filename, image, width, height, 3);

        delete[] image;
    }

    // Levels in insertion order (front = first level added).
    list<Level> levels_;
};
|
|
|
|
// The class that encapsulates the main algorithm.
class SegmentationTreeBuilder
{
public:
    SegmentationTreeBuilder()
        : verticesCount_(0)
        , edgesCount_(0)
    {
    }

    ~SegmentationTreeBuilder() {}

    // Repeatedly invokes the step of the algorithm
    // until the limiting segmentation is found.
    // Returns time (in ms) spent on building the tree.
    float run(const Graph &graph, Pyramid &segmentations)
    {
        cudaEvent_t start, stop;

        checkCudaErrors(cudaEventCreate(&start));
        checkCudaErrors(cudaEventCreate(&stop));

        checkCudaErrors(cudaEventRecord(start, 0));

        // Allocate required memory pools. We need just 4 types of arrays.
        MemoryPoolsCollection pools = {
            DeviceMemoryPool<uint>(static_cast<uint>(graph.vertices.size()), kUintVerticesPoolsRequired),
            DeviceMemoryPool<float>(static_cast<uint>(graph.vertices.size()), kFloatVerticesPoolsRequired),
            DeviceMemoryPool<uint>(static_cast<uint>(graph.edges.size()), kUintEdgesPoolsRequired),
            DeviceMemoryPool<float>(static_cast<uint>(graph.edges.size()), kFloatEdgesPoolsRequired)};

        // Initialize internal variables
        try {
            initalizeData(graph, pools);
        }
        catch (thrust::system_error &e) {
            cout << "Initialization failed (" << e.what() << ")" << endl;
            exit(EXIT_FAILURE);
        }

        // Run steps
        AlgorithmStatus status;

        try {
            do {
                status = invokeStep(pools, segmentations);
            } while (status != ALGORITHM_FINISHED);
        }
        catch (thrust::system_error &e) {
            cout << "Algorithm failed (" << e.what() << ")" << endl;
            exit(EXIT_FAILURE);
        }

        checkCudaErrors(cudaEventRecord(stop, 0));
        checkCudaErrors(cudaEventSynchronize(stop));

        float elapsedTime;
        checkCudaErrors(cudaEventElapsedTime(&elapsedTime, start, stop));

        // The events were previously leaked; release them once timing is done.
        checkCudaErrors(cudaEventDestroy(start));
        checkCudaErrors(cudaEventDestroy(stop));

        return elapsedTime;
    }

private:
    // Prints current device memory usage (debugging helper).
    void printMemoryUsage()
    {
        size_t availableMemory, totalMemory, usedMemory;

        cudaMemGetInfo(&availableMemory, &totalMemory);
        usedMemory = totalMemory - availableMemory;

        cout << "Device memory: used " << usedMemory << " available " << availableMemory << " total " << totalMemory
             << endl;
    }

    // The four pools an algorithm step draws its scratch arrays from.
    struct MemoryPoolsCollection
    {
        DeviceMemoryPool<uint> uintVertices;
        DeviceMemoryPool<float> floatVertices;
        DeviceMemoryPool<uint> uintEdges;
        DeviceMemoryPool<float> floatEdges;
    };

    // Upper bounds on how many arrays of each kind one step needs at once.
    static const uint kUintVerticesPoolsRequired = 8;
    static const uint kFloatVerticesPoolsRequired = 3;
    static const uint kUintEdgesPoolsRequired = 8;
    static const uint kFloatEdgesPoolsRequired = 4;

    // Copies the host graph to the device and clears the output flags.
    void initalizeData(const Graph &graph, MemoryPoolsCollection &pools)
    {
        // Get memory for the internal variables
        verticesCount_ = static_cast<uint>(graph.vertices.size());
        edgesCount_ = static_cast<uint>(graph.edges.size());

        dVertices_ = pools.uintVertices.get();
        dEdges_ = pools.uintEdges.get();
        dWeights_ = pools.floatEdges.get();

        dOutputEdgesFlags_ = pools.uintEdges.get();

        // Copy graph to the device memory
        checkCudaErrors(
            cudaMemcpy(dVertices_.get(), &(graph.vertices[0]), sizeof(uint) * verticesCount_, cudaMemcpyHostToDevice));
        checkCudaErrors(
            cudaMemcpy(dEdges_.get(), &(graph.edges[0]), sizeof(uint) * edgesCount_, cudaMemcpyHostToDevice));
        checkCudaErrors(
            cudaMemcpy(dWeights_.get(), &(graph.weights[0]), sizeof(float) * edgesCount_, cudaMemcpyHostToDevice));

        thrust::fill(dOutputEdgesFlags_, dOutputEdgesFlags_ + edgesCount_, 0);
    }

    static const uint kMaxThreadsPerBlock = 256;

    // Calculates grid parameters of the consecutive kernel calls
    // based on the number of elements in the array.
    void calculateThreadsDistribution(uint totalElements, uint &blocksCount, uint &threadsPerBlockCount)
    {
        if (totalElements > kMaxThreadsPerBlock) {
            // Ceil-division so the last partial block is still launched.
            blocksCount = (totalElements + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock;

            threadsPerBlockCount = kMaxThreadsPerBlock;
        }
        else {
            blocksCount = 1;
            threadsPerBlockCount = totalElements;
        }
    }

    enum AlgorithmStatus { ALGORITHM_NOT_FINISHED, ALGORITHM_FINISHED };

    // Performs one Boruvka contraction step: find the minimal outgoing
    // edge of every vertex, merge vertices along those edges, rebuild the
    // reduced graph and record the grouping as a new pyramid level.
    AlgorithmStatus invokeStep(MemoryPoolsCollection &pools, Pyramid &segmentations)
    {
        uint blocksCount, threadsPerBlockCount;

        calculateThreadsDistribution(edgesCount_, blocksCount, threadsPerBlockCount);
        dim3 gridDimsForEdges(blocksCount, 1, 1);
        dim3 blockDimsForEdges(threadsPerBlockCount, 1, 1);

        calculateThreadsDistribution(verticesCount_, blocksCount, threadsPerBlockCount);
        dim3 gridDimsForVertices(blocksCount, 1, 1);
        dim3 blockDimsForVertices(threadsPerBlockCount, 1, 1);

        thrust::device_ptr<uint> dEdgesFlags = pools.uintEdges.get();

        thrust::fill(dEdgesFlags, dEdgesFlags + edgesCount_, 0);

        // Mark the first edge for each vertex in "dEdgesFlags"
        markSegments<<<gridDimsForVertices, blockDimsForVertices, 0>>>(
            dVertices_.get(), dEdgesFlags.get(), verticesCount_);
        getLastCudaError("markSegments launch failed.");

        // Now find minimum edges for each vertex.
        thrust::device_ptr<uint> dMinScannedEdges = pools.uintEdges.get();
        thrust::device_ptr<float> dMinScannedWeights = pools.floatEdges.get();

        thrust::inclusive_scan_by_key(
            dEdgesFlags,
            dEdgesFlags + edgesCount_,
            thrust::make_zip_iterator(thrust::make_tuple(dWeights_, dEdges_)),
            thrust::make_zip_iterator(thrust::make_tuple(dMinScannedWeights, dMinScannedEdges)),
            thrust::greater_equal<uint>(),
            thrust::minimum<thrust::tuple<float, uint>>());

        // To make things clear.
        // Let "dEdgesFlags" denote groups of edges that
        // correspond to the same vertices. Then the last edge of each group
        // (in "dMinScannedEdges" and "dMinScannedWeights") is now minimal.

        // Calculate a successor vertex for each vertex. A successor of the
        // vertex v is a neighbouring vertex connected to v
        // by the minimal edge.
        thrust::device_ptr<uint> dSuccessors = pools.uintVertices.get();

        getSuccessors<<<gridDimsForVertices, blockDimsForVertices, 0>>>(
            dVertices_.get(), dMinScannedEdges.get(), dSuccessors.get(), verticesCount_, edgesCount_);
        getLastCudaError("getSuccessors launch failed.");

        pools.uintEdges.put(dMinScannedEdges);
        pools.floatEdges.put(dMinScannedWeights);

        // Remove cyclic successor dependencies. Note that there can be only
        // two vertices in a cycle. See [1] for details.
        removeCycles<<<gridDimsForVertices, blockDimsForVertices, 0>>>(dSuccessors.get(), verticesCount_);
        getLastCudaError("removeCycles launch failed.");

        // Build up an array of startpoints for edges. As already stated,
        // each group of edges denoted by "dEdgesFlags"
        // has the same startpoint.
        thrust::device_ptr<uint> dStartpoints = pools.uintEdges.get();

        thrust::inclusive_scan(dEdgesFlags, dEdgesFlags + edgesCount_, dStartpoints);

        addScalar<<<gridDimsForEdges, blockDimsForEdges, 0>>>(dStartpoints.get(), -1, edgesCount_);
        getLastCudaError("addScalar launch failed.");

        // Shrink the chains of successors. New successors will eventually
        // represent superpixels of the new level.
        thrust::device_ptr<uint> dRepresentatives = pools.uintVertices.get();

        getRepresentatives<<<gridDimsForVertices, blockDimsForVertices, 0>>>(
            dSuccessors.get(), dRepresentatives.get(), verticesCount_);
        getLastCudaError("getRepresentatives launch failed.");

        swap(dSuccessors, dRepresentatives);

        pools.uintVertices.put(dRepresentatives);

        // Group vertices by successors' indices.
        thrust::device_ptr<uint> dClusteredVerticesIDs = pools.uintVertices.get();

        thrust::sequence(dClusteredVerticesIDs, dClusteredVerticesIDs + verticesCount_);

        thrust::sort(thrust::make_zip_iterator(thrust::make_tuple(thrust::device_ptr<uint>(dSuccessors),
                                                                  thrust::device_ptr<uint>(dClusteredVerticesIDs))),
                     thrust::make_zip_iterator(
                         thrust::make_tuple(thrust::device_ptr<uint>(dSuccessors + verticesCount_),
                                            thrust::device_ptr<uint>(dClusteredVerticesIDs + verticesCount_))));

        // Mark those groups.
        thrust::device_ptr<uint> dVerticesFlags_ = pools.uintVertices.get();

        thrust::fill(dVerticesFlags_, dVerticesFlags_ + verticesCount_, 0);

        thrust::adjacent_difference(
            dSuccessors, dSuccessors + verticesCount_, dVerticesFlags_, thrust::not_equal_to<uint>());

        // adjacent_difference copies the first element verbatim; reset it.
        checkCudaErrors(cudaMemset((void *)dVerticesFlags_.get(), 0, sizeof(uint)));

        // Assign new indices to the successors (the indices of vertices
        // at the new level).
        thrust::device_ptr<uint> dNewVerticesIDs_ = pools.uintVertices.get();

        thrust::inclusive_scan(dVerticesFlags_, dVerticesFlags_ + verticesCount_, dNewVerticesIDs_);

        pools.uintVertices.put(dVerticesFlags_);

        // Now we can calculate number of resulting superpixels easily.
        uint newVerticesCount;
        checkCudaErrors(cudaMemcpy(
            &newVerticesCount, (dNewVerticesIDs_ + verticesCount_ - 1).get(), sizeof(uint), cudaMemcpyDeviceToHost));
        ++newVerticesCount;

        // There are two special cases when we can stop our algorithm:
        // 1) number of vertices in the graph remained unchanged;
        // 2) only one vertex remains.
        if (newVerticesCount == verticesCount_) {
            return ALGORITHM_FINISHED;
        }
        else if (newVerticesCount == 1) {
            thrust::device_ptr<uint> dDummyVerticesOffsets = pools.uintVertices.get();

            checkCudaErrors(cudaMemset((void *)dDummyVerticesOffsets.get(), 0, sizeof(uint)));

            thrust::device_ptr<uint> dDummyVerticesIDs = pools.uintVertices.get();

            thrust::sequence(dDummyVerticesIDs, dDummyVerticesIDs + verticesCount_);

            segmentations.addLevel(1, verticesCount_, dDummyVerticesOffsets, dDummyVerticesIDs);

            return ALGORITHM_FINISHED;
        }

        // Calculate how old vertices IDs map to new vertices IDs.
        thrust::device_ptr<uint> dVerticesMapping = pools.uintVertices.get();

        getVerticesMapping<<<gridDimsForVertices, blockDimsForVertices, 0>>>(
            dClusteredVerticesIDs.get(), dNewVerticesIDs_.get(), dVerticesMapping.get(), verticesCount_);
        getLastCudaError("getVerticesMapping launch failed.");

        pools.uintVertices.put(dNewVerticesIDs_);
        pools.uintVertices.put(dClusteredVerticesIDs);
        pools.uintVertices.put(dSuccessors);

        // Invalidate self-loops in the reduced graph (the graph
        // produced by merging all old vertices that have
        // the same successor).
        invalidateLoops<<<gridDimsForEdges, blockDimsForEdges, 0>>>(
            dStartpoints.get(), dVerticesMapping.get(), dEdges_.get(), edgesCount_);
        getLastCudaError("invalidateLoops launch failed.");

        // Calculate various information about the surviving
        // (new startpoints IDs and IDs of edges) and
        // non-surviving/contracted edges (their weights).
        thrust::device_ptr<uint> dNewStartpoints = pools.uintEdges.get();
        thrust::device_ptr<uint> dSurvivedEdgesIDs = pools.uintEdges.get();

        calculateEdgesInfo<<<gridDimsForEdges, blockDimsForEdges, 0>>>(dStartpoints.get(),
                                                                      dVerticesMapping.get(),
                                                                      dEdges_.get(),
                                                                      dWeights_.get(),
                                                                      dNewStartpoints.get(),
                                                                      dSurvivedEdgesIDs.get(),
                                                                      edgesCount_,
                                                                      newVerticesCount);
        getLastCudaError("calculateEdgesInfo launch failed.");

        pools.uintEdges.put(dStartpoints);

        // Group that information by the new startpoints IDs.
        // Keep in mind that we want to build new (reduced) graph and apply
        // the step of the algorithm to that one. Hence we need to
        // preserve the structure of the original graph: neighbours and
        // weights should be grouped by vertex.
        thrust::sort(thrust::make_zip_iterator(thrust::make_tuple(dNewStartpoints, dSurvivedEdgesIDs)),
                     thrust::make_zip_iterator(
                         thrust::make_tuple(dNewStartpoints + edgesCount_, dSurvivedEdgesIDs + edgesCount_)));

        // Find the group of contracted edges.
        uint *invalidEdgesPtr =
            thrust::find_if(dNewStartpoints, dNewStartpoints + edgesCount_, IsGreaterEqualThan<uint>(newVerticesCount))
                .get();

        // Calculate how many edges there are in the reduced graph.
        uint validEdgesCount = static_cast<uint>(invalidEdgesPtr - dNewStartpoints.get());

        // Mark groups of edges corresponding to the same vertex in the
        // reduced graph.
        thrust::adjacent_difference(
            dNewStartpoints, dNewStartpoints + edgesCount_, dEdgesFlags, thrust::not_equal_to<uint>());

        // First flag must be exactly 1 (start of the first group).
        checkCudaErrors(cudaMemset((void *)dEdgesFlags.get(), 0, sizeof(uint)));
        checkCudaErrors(cudaMemset((void *)dEdgesFlags.get(), 1, 1));

        pools.uintEdges.put(dNewStartpoints);

        // Now we are able to build the reduced graph. See "Graph"
        // class for the details on the graph's internal structure.

        // Calculate vertices' offsets for the reduced graph.
        // (A stray ".get()" on the discarded result iterator was removed.)
        thrust::copy_if(thrust::make_counting_iterator(0U),
                        thrust::make_counting_iterator(validEdgesCount),
                        dEdgesFlags,
                        dVertices_,
                        thrust::identity<uint>());

        pools.uintEdges.put(dEdgesFlags);

        // Build up a neighbourhood for each vertex in the reduced graph
        // (this includes recalculating edges' weights).
        calculateThreadsDistribution(validEdgesCount, blocksCount, threadsPerBlockCount);
        dim3 newGridDimsForEdges(blocksCount, 1, 1);
        dim3 newBlockDimsForEdges(threadsPerBlockCount, 1, 1);

        thrust::device_ptr<uint> dNewEdges = pools.uintEdges.get();
        thrust::device_ptr<float> dNewWeights = pools.floatEdges.get();

        makeNewEdges<<<newGridDimsForEdges, newBlockDimsForEdges, 0>>>(dSurvivedEdgesIDs.get(),
                                                                      dVerticesMapping.get(),
                                                                      dEdges_.get(),
                                                                      dWeights_.get(),
                                                                      dNewEdges.get(),
                                                                      dNewWeights.get(),
                                                                      validEdgesCount);
        getLastCudaError("makeNewEdges launch failed.");

        swap(dEdges_, dNewEdges);
        swap(dWeights_, dNewWeights);

        pools.uintEdges.put(dNewEdges);
        pools.floatEdges.put(dNewWeights);

        pools.uintEdges.put(dSurvivedEdgesIDs);

        // The graph's reconstruction is now finished.

        // Build new level of the segmentation tree. It is a trivial task
        // as we already have "dVerticesMapping" that contains all
        // sufficient information about the vertices' transformations.
        thrust::device_ptr<uint> dVerticesIDs = pools.uintVertices.get();
        thrust::device_ptr<uint> dNewVerticesOffsets = pools.uintVertices.get();

        thrust::sequence(dVerticesIDs, dVerticesIDs + verticesCount_);

        thrust::sort_by_key(dVerticesMapping, dVerticesMapping + verticesCount_, dVerticesIDs);

        thrust::unique_by_key_copy(dVerticesMapping,
                                   dVerticesMapping + verticesCount_,
                                   thrust::make_counting_iterator(0),
                                   thrust::make_discard_iterator(),
                                   dNewVerticesOffsets);

        segmentations.addLevel(newVerticesCount, verticesCount_, dNewVerticesOffsets, dVerticesIDs);

        pools.uintVertices.put(dVerticesIDs);
        pools.uintVertices.put(dNewVerticesOffsets);
        pools.uintVertices.put(dVerticesMapping);

        // We can now safely set new counts for vertices and edges.
        verticesCount_ = newVerticesCount;
        edgesCount_ = validEdgesCount;

        return ALGORITHM_NOT_FINISHED;
    }

    // Current (possibly reduced) graph dimensions.
    uint verticesCount_;
    uint edgesCount_;

    // Device-side CSR representation of the current graph.
    thrust::device_ptr<uint> dVertices_;
    thrust::device_ptr<uint> dEdges_;
    thrust::device_ptr<float> dWeights_;

    thrust::device_ptr<uint> dOutputEdgesFlags_;
};
|
|
|
|
// Loads PPM image "filename" (located relative to the executable path)
// into "data" as packed RGB triplets; "width" and "height" receive the
// image dimensions. Returns 0 on success and -1 on any failure.
int loadImage(const char *filename, const char *executablePath, vector<uchar3> &data, uint &width, uint &height)
{
    const char *imagePath = sdkFindFilePath(filename, executablePath);

    if (imagePath == NULL) {
        return -1;
    }

    uchar *dataHandle = NULL;
    unsigned int channels;

    if (!__loadPPM(imagePath, &dataHandle, &width, &height, &channels)) {
        return -1;
    }

    // The pixel buffer is reinterpreted as uchar3 below, so reject images
    // that were not loaded as 3-channel RGB (e.g. grayscale PGM files),
    // which would otherwise cause an out-of-bounds read.
    if (channels != 3) {
        free(reinterpret_cast<void *>(dataHandle));
        return -1;
    }

    data.assign(reinterpret_cast<uchar3 *>(dataHandle), reinterpret_cast<uchar3 *>(dataHandle) + width * height);

    free(reinterpret_cast<void *>(dataHandle));

    return 0;
}
|
|
|
|
// Euclidean distance between two pixels treated as points in 3-D
// (R, G, B) colour space.
inline float distance(const uchar3 &first, const uchar3 &second)
{
    const int deltaX = int(first.x) - int(second.x);
    const int deltaY = int(first.y) - int(second.y);
    const int deltaZ = int(first.z) - int(second.z);

    // Maximum possible value is 3 * 255^2, far within uint range.
    const uint squaredNorm = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ;

    return sqrt(static_cast<float>(squaredNorm));
}
|
|
|
|
// Builds a net-graph for the image with 4-connected pixels.
// Each pixel becomes a vertex; every pair of horizontally or vertically
// adjacent pixels is connected by two directed edges whose weight is the
// colour-space distance between the pixels.
void buildGraph(const vector<uchar3> &image, uint width, uint height, Graph &graph)
{
    uint totalNodes = static_cast<uint>(image.size());
    // Directed edge count of a 4-connected width x height grid:
    // 4*w*h - 2*(w + h).
    uint totalEdges = 4 * totalNodes - 2 * (width + height);

    graph.vertices.resize(totalNodes);
    graph.edges.reserve(totalEdges);
    // The original code called graph.weights.reserve(graph.edges.size()),
    // but reserve() does not change size(), so it reserved 0 elements;
    // reserve the true edge count for both arrays instead.
    graph.weights.reserve(totalEdges);

    uint edgesProcessed = 0;

    for (uint y = 0; y < height; ++y) {
        for (uint x = 0; x < width; ++x) {
            uint nodeIndex = y * width + x;
            const uchar3 &centerPixel = image[nodeIndex];

            // CSR-style offset: index of this vertex's first outgoing edge.
            graph.vertices[nodeIndex] = edgesProcessed;

            // Neighbour in the previous row (y - 1).
            if (y > 0) {
                uint neighbourIndex = (y - 1) * width + x;
                const uchar3 &neighbourPixel = image[neighbourIndex];

                graph.edges.push_back(neighbourIndex);
                graph.weights.push_back(distance(centerPixel, neighbourPixel));

                ++edgesProcessed;
            }

            // Neighbour in the next row (y + 1).
            if (y + 1 < height) {
                uint neighbourIndex = (y + 1) * width + x;
                const uchar3 &neighbourPixel = image[neighbourIndex];

                graph.edges.push_back(neighbourIndex);
                graph.weights.push_back(distance(centerPixel, neighbourPixel));

                ++edgesProcessed;
            }

            // Neighbour in the previous column (x - 1).
            if (x > 0) {
                uint neighbourIndex = y * width + x - 1;
                const uchar3 &neighbourPixel = image[neighbourIndex];

                graph.edges.push_back(neighbourIndex);
                graph.weights.push_back(distance(centerPixel, neighbourPixel));

                ++edgesProcessed;
            }

            // Neighbour in the next column (x + 1).
            if (x + 1 < width) {
                uint neighbourIndex = y * width + x + 1;
                const uchar3 &neighbourPixel = image[neighbourIndex];

                graph.edges.push_back(neighbourIndex);
                graph.weights.push_back(distance(centerPixel, neighbourPixel));

                ++edgesProcessed;
            }
        }
    }
}
|
|
|
|
// Image used when no --file=<name> argument is supplied. Declared const:
// the pointed-to string literal is immutable (main() casts the constness
// away only to satisfy getCmdLineArgumentString's char** signature).
static const char *kDefaultImageName = "test.ppm";
|
|
|
|
// Entry point: load the input image, build its pixel graph, construct the
// segmentation tree on the GPU, dump every level as a PPM file and compare
// two dumped levels against the reference images.
int main(int argc, char **argv)
{
    printf("%s Starting...\n\n", argv[0]);

    // Select the input image: the default unless --file=<name> is given.
    char *imageName = (char *)kDefaultImageName;

    if (checkCmdLineFlag(argc, (const char **)argv, "file")) {
        getCmdLineArgumentString(argc, (const char **)argv, "file", &imageName);
    }

    vector<uchar3> image;
    uint imageWidth, imageHeight;

    if (loadImage(imageName, argv[0], image, imageWidth, imageHeight) != 0) {
        printf("Failed to open <%s>, program exit...\n", imageName);
        exit(EXIT_FAILURE);
    }

    findCudaDevice(argc, (const char **)argv);

    // Turn the image into a 4-connected net-graph.
    Graph graph;
    buildGraph(image, imageWidth, imageHeight, graph);

    cout << "* Building segmentation tree... ";
    cout.flush();

    Pyramid segmentations;
    SegmentationTreeBuilder algo;
    float elapsedTime = algo.run(graph, segmentations);

    cout << "done in " << elapsedTime << " (ms)" << endl;

    cout << "* Dumping levels for each tree..." << endl << endl;
    segmentations.dump(imageWidth, imageHeight);

    // Validate two dumped levels against the shipped reference images.
    bool bResults[2];
    bResults[0] = sdkComparePPM("level_00.ppm", sdkFindFilePath("ref_00.ppm", argv[0]), 5.0f, 0.15f, false);
    bResults[1] = sdkComparePPM("level_09.ppm", sdkFindFilePath("ref_09.ppm", argv[0]), 5.0f, 0.15f, false);

    exit((bResults[0] && bResults[1]) ? EXIT_SUCCESS : EXIT_FAILURE);
}
|