mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2024-11-25 00:49:17 +08:00
1083 lines
38 KiB
Plaintext
1083 lines
38 KiB
Plaintext
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* * Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
|
* contributors may be used to endorse or promote products derived
|
|
* from this software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
/*
|
|
* This application demonstrates an approach to the image segmentation
|
|
* trees construction. It is based on Boruvka's MST algorithm.
|
|
* Here's the complete list of references:
|
|
* 1) V. Vineet et al, "Fast Minimum Spanning Tree for
|
|
* Large Graphs on the GPU";
|
|
* 2) P. Felzenszwalb et al, "Efficient Graph-Based Image Segmentation";
|
|
* 3) A. Ion et al, "Considerations Regarding the Minimum Spanning
|
|
* Tree Pyramid Segmentation Method".
|
|
*/
|
|
|
|
// System includes.
|
|
#include <stdlib.h>
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <math.h>
|
|
|
|
// STL includes.
|
|
#include <iostream>
|
|
#include <fstream>
|
|
#include <iterator>
|
|
#include <vector>
|
|
#include <list>
|
|
#include <deque>
|
|
#include <algorithm>
|
|
|
|
// Thrust library includes.
|
|
#include <thrust/iterator/discard_iterator.h>
|
|
#include <thrust/iterator/counting_iterator.h>
|
|
#include <thrust/iterator/transform_iterator.h>
|
|
#include <thrust/iterator/zip_iterator.h>
|
|
#include <thrust/for_each.h>
|
|
#include <thrust/reduce.h>
|
|
#include <thrust/unique.h>
|
|
#include <thrust/scan.h>
|
|
#include <thrust/copy.h>
|
|
#include <thrust/sequence.h>
|
|
#include <thrust/fill.h>
|
|
#include <thrust/sort.h>
|
|
#include <thrust/adjacent_difference.h>
|
|
#include <thrust/find.h>
|
|
|
|
#include <thrust/device_malloc.h>
|
|
#include <thrust/device_free.h>
|
|
|
|
// Sample framework includes.
|
|
#include <helper_functions.h>
|
|
#include <helper_cuda.h>
|
|
|
|
// Project includes.
|
|
#include "common.cuh"
|
|
|
|
// Kernels.
|
|
#include "kernels.cuh"
|
|
|
|
using std::cin;
|
|
using std::cout;
|
|
using std::endl;
|
|
using std::vector;
|
|
using std::list;
|
|
using std::deque;
|
|
|
|
// Very simple von Neumann middle-square prng. rand() is different across
|
|
// various OS platforms, which makes testing and the output inconsistent.
|
|
int myrand(void)
|
|
{
|
|
static int seed = 72191;
|
|
char sq[22];
|
|
|
|
seed *= seed;
|
|
sprintf(sq, "%010d", seed);
|
|
// pull the middle 5 digits out of sq
|
|
sq[8] = 0;
|
|
seed = atoi(&sq[3]);
|
|
|
|
return seed;
|
|
}
|
|
|
|
// Simple memory pool class. It is nothing more than array of fixed-sized
|
|
// arrays.
|
|
template <typename T>
|
|
class DeviceMemoryPool
|
|
{
|
|
public:
|
|
// The parameters of the constructor are as follows:
|
|
// 1) uint chunkSize --- size of the particular array;
|
|
// 2) uint chunksCount --- number of fixed-sized arrays.
|
|
DeviceMemoryPool(uint chunkSize, uint chunksCount) :
|
|
chunkSize_(chunkSize)
|
|
{
|
|
chunkRawSize_ = (chunkSize * sizeof(T) + 511) & ~511;
|
|
|
|
try
|
|
{
|
|
basePtr_ =
|
|
thrust::device_malloc(chunkRawSize_ * chunksCount);
|
|
}
|
|
catch (thrust::system_error &e)
|
|
{
|
|
cout << "Pool memory allocation failed (" << e.what() << ")"
|
|
<< endl;
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
for (uint chunkIndex = 0; chunkIndex < chunksCount; ++chunkIndex)
|
|
{
|
|
chunks_.push_back(
|
|
thrust::device_ptr<T>(
|
|
reinterpret_cast<T *>(
|
|
static_cast<char *>(basePtr_.get()) +
|
|
chunkRawSize_ * chunkIndex)));
|
|
}
|
|
}
|
|
|
|
~DeviceMemoryPool()
|
|
{
|
|
try
|
|
{
|
|
thrust::device_free(basePtr_);
|
|
}
|
|
catch (thrust::system_error &e)
|
|
{
|
|
cout << "Pool memory allocation failed (" << e.what() << ")"
|
|
<< endl;
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
}
|
|
|
|
// Returns an address of the first available array
|
|
// in the memory pool.
|
|
thrust::device_ptr<T> get()
|
|
{
|
|
thrust::device_ptr<T> ptr(chunks_.back());
|
|
chunks_.pop_back();
|
|
|
|
return ptr;
|
|
}
|
|
|
|
// Pushes an address stored in "ptr" to the list
|
|
// of available arrays of the memory pool.
|
|
// It should be noted that it is user who is responsible for returning
|
|
// the previously requested memory to the appropriate pool.
|
|
inline void put(const thrust::device_ptr<T> &ptr)
|
|
{
|
|
chunks_.push_back(ptr);
|
|
}
|
|
|
|
uint totalFreeChunks() const
|
|
{
|
|
return chunks_.size();
|
|
}
|
|
|
|
private:
|
|
uint chunkSize_, chunkRawSize_;
|
|
thrust::device_ptr<void> basePtr_;
|
|
|
|
list< thrust::device_ptr<T> > chunks_;
|
|
};
|
|
|
|
// Graph structure.
|
|
struct Graph
|
|
{
|
|
Graph() {}
|
|
|
|
Graph(uint verticesCount, uint edgesCount) :
|
|
vertices(verticesCount),
|
|
edges(edgesCount),
|
|
weights(edgesCount)
|
|
{}
|
|
|
|
// This vector stores offsets for each vertex in "edges" and "weights"
|
|
// vectors. For example:
|
|
// "vertices[0]" is an index of the first outgoing edge of vertex #0,
|
|
// "vertices[1]" is an index of the first outgoing edge of vertex #1, etc.
|
|
vector<uint> vertices;
|
|
|
|
// This vector stores indices of endpoints of the corresponding edges.
|
|
// For example, "edges[vertices[0]]" is the first neighbouring vertex
|
|
// of vertex #0.
|
|
vector<uint> edges;
|
|
|
|
// This vector stores weights of the corresponding edges.
|
|
vector<float> weights;
|
|
};
|
|
|
|
// Simple segmentation tree class.
|
|
// Each level of the tree corresponds to the segmentation.
|
|
// See "Level" class for the details.
|
|
class Pyramid
|
|
{
|
|
public:
|
|
void addLevel(uint totalSuperNodes,
|
|
uint totalNodes,
|
|
thrust::device_ptr<uint> superVerticesOffsets,
|
|
thrust::device_ptr<uint> verticesIDs)
|
|
{
|
|
levels_.push_back(Level(totalSuperNodes, totalNodes));
|
|
levels_.back().buildFromDeviceData(superVerticesOffsets,
|
|
verticesIDs);
|
|
}
|
|
|
|
uint levelsCount() const
|
|
{
|
|
return static_cast<uint>(levels_.size());
|
|
}
|
|
|
|
void dump(uint width, uint height) const
|
|
{
|
|
char filename[256], format[256];
|
|
uint levelIndex = 0;
|
|
|
|
uint requiredDigitsCount =
|
|
static_cast<uint>(log10(static_cast<float>(levelsCount()))) +
|
|
1;
|
|
sprintf(format, "level_%%0%uu.ppm", requiredDigitsCount);
|
|
|
|
for (LevelsIterator level = levels_.rbegin();
|
|
level != levels_.rend();
|
|
++level, ++levelIndex)
|
|
{
|
|
|
|
sprintf(filename, format, levelIndex);
|
|
dumpLevel(level, width, height, filename);
|
|
}
|
|
}
|
|
|
|
private:
|
|
// Level of the segmentation tree.
|
|
class Level
|
|
{
|
|
public:
|
|
Level(uint totalSuperNodes, uint totalNodes) :
|
|
superNodesOffsets_(totalSuperNodes), nodes_(totalNodes)
|
|
{
|
|
}
|
|
|
|
void buildFromDeviceData(
|
|
thrust::device_ptr<uint> superVerticesOffsets,
|
|
thrust::device_ptr<uint> verticesIDs)
|
|
{
|
|
checkCudaErrors(
|
|
cudaMemcpy(&(superNodesOffsets_[0]),
|
|
superVerticesOffsets.get(),
|
|
sizeof(uint) * superNodesOffsets_.size(),
|
|
cudaMemcpyDeviceToHost));
|
|
|
|
checkCudaErrors(
|
|
cudaMemcpy(&(nodes_[0]),
|
|
verticesIDs.get(),
|
|
sizeof(uint) * nodes_.size(),
|
|
cudaMemcpyDeviceToHost));
|
|
}
|
|
|
|
private:
|
|
friend class Pyramid;
|
|
|
|
// The pair of the following vectors describes the
|
|
// relation between the consecutive levels.
|
|
// Consider an example. Let the index of the current level be n.
|
|
// Then nodes of level #(n-1) with indices stored in
|
|
// "nodes[superNodesOffsets_[0]]",
|
|
// "nodes[superNodesOffsets_[0] + 1]",
|
|
// ...,
|
|
// "nodes[superNodesOffsets_[1] - 1]"
|
|
// correspond to vertex #0 of level #n. An so on.
|
|
vector<uint> superNodesOffsets_;
|
|
vector<uint> nodes_;
|
|
};
|
|
|
|
typedef list<Level>::const_reverse_iterator LevelsIterator;
|
|
|
|
// Dumps level to the file "level_n.ppm" where n
|
|
// is index of the level. Segments are drawn in random colors.
|
|
void dumpLevel(LevelsIterator level,
|
|
uint width,
|
|
uint height,
|
|
const char *filename) const
|
|
{
|
|
deque< std::pair<uint, uint> > nodesQueue;
|
|
|
|
uint totalSegments;
|
|
|
|
{
|
|
const vector<uint> &superNodesOffsets =
|
|
level->superNodesOffsets_;
|
|
const vector<uint> &nodes =
|
|
level->nodes_;
|
|
|
|
totalSegments = static_cast<uint>(superNodesOffsets.size());
|
|
|
|
for (uint superNodeIndex = 0, nodeIndex = 0;
|
|
superNodeIndex < superNodesOffsets.size();
|
|
++superNodeIndex)
|
|
{
|
|
|
|
uint superNodeEnd =
|
|
superNodeIndex + 1 < superNodesOffsets.size() ?
|
|
superNodesOffsets[superNodeIndex + 1] :
|
|
static_cast<uint>(nodes.size());
|
|
|
|
for (; nodeIndex < superNodeEnd; ++nodeIndex)
|
|
{
|
|
nodesQueue.push_back(std::make_pair(nodes[nodeIndex],
|
|
superNodeIndex));
|
|
}
|
|
}
|
|
}
|
|
|
|
++level;
|
|
|
|
while (level != levels_.rend())
|
|
{
|
|
uint superNodesCount = static_cast<uint>(nodesQueue.size());
|
|
|
|
const vector<uint> &superNodesOffsets =
|
|
level->superNodesOffsets_;
|
|
const vector<uint> &nodes =
|
|
level->nodes_;
|
|
|
|
while (superNodesCount--)
|
|
{
|
|
std::pair<uint, uint> currentNode = nodesQueue.front();
|
|
nodesQueue.pop_front();
|
|
|
|
uint superNodeBegin = superNodesOffsets[currentNode.first];
|
|
|
|
uint superNodeEnd =
|
|
currentNode.first + 1 < superNodesOffsets.size() ?
|
|
superNodesOffsets[currentNode.first + 1] :
|
|
static_cast<uint>(nodes.size());
|
|
|
|
for (uint nodeIndex = superNodeBegin;
|
|
nodeIndex < superNodeEnd;
|
|
++nodeIndex)
|
|
{
|
|
|
|
nodesQueue.push_back(
|
|
std::make_pair(nodes[nodeIndex],
|
|
currentNode.second));
|
|
}
|
|
}
|
|
|
|
++level;
|
|
}
|
|
|
|
vector<uint> colors(3 * totalSegments);
|
|
|
|
for (uint colorIndex = 0; colorIndex < totalSegments; ++colorIndex)
|
|
{
|
|
colors[colorIndex * 3 ] = myrand() % 256;
|
|
colors[colorIndex * 3 + 1] = myrand() % 256;
|
|
colors[colorIndex * 3 + 2] = myrand() % 256;
|
|
}
|
|
|
|
uchar *image = new uchar[width * height * 3];
|
|
|
|
while (!nodesQueue.empty())
|
|
{
|
|
std::pair<uint, uint> currentNode = nodesQueue.front();
|
|
nodesQueue.pop_front();
|
|
|
|
uint pixelIndex = currentNode.first;
|
|
uint pixelSegment = currentNode.second;
|
|
|
|
image[pixelIndex * 3 ] = colors[pixelSegment * 3 ];
|
|
image[pixelIndex * 3 + 1] = colors[pixelSegment * 3 + 1];
|
|
image[pixelIndex * 3 + 2] = colors[pixelSegment * 3 + 2];
|
|
}
|
|
|
|
__savePPM(filename, image, width, height, 3);
|
|
|
|
delete[] image;
|
|
}
|
|
|
|
list<Level> levels_;
|
|
};
|
|
|
|
// The class that encapsulates the main algorithm.
|
|
class SegmentationTreeBuilder
|
|
{
|
|
public:
|
|
SegmentationTreeBuilder():verticesCount_(0),edgesCount_(0) {}
|
|
|
|
~SegmentationTreeBuilder() {}
|
|
|
|
// Repeatedly invokes the step of the algorithm
|
|
// until the limiting segmentation is found.
|
|
// Returns time (in ms) spent on building the tree.
|
|
float run(const Graph &graph, Pyramid &segmentations)
|
|
{
|
|
cudaEvent_t start, stop;
|
|
|
|
cudaEventCreate(&start);
|
|
cudaEventCreate(&stop);
|
|
|
|
cudaEventRecord(start, 0);
|
|
|
|
// Allocate required memory pools. We need just 4 types of arrays.
|
|
MemoryPoolsCollection pools =
|
|
{
|
|
DeviceMemoryPool<uint>(
|
|
static_cast<uint>(graph.vertices.size()),
|
|
kUintVerticesPoolsRequired),
|
|
DeviceMemoryPool<float>(
|
|
static_cast<uint>(graph.vertices.size()),
|
|
kFloatVerticesPoolsRequired),
|
|
DeviceMemoryPool<uint>(
|
|
static_cast<uint>(graph.edges.size()),
|
|
kUintEdgesPoolsRequired),
|
|
DeviceMemoryPool<float>(
|
|
static_cast<uint>(graph.edges.size()),
|
|
kFloatEdgesPoolsRequired)
|
|
};
|
|
|
|
// Initialize internal variables
|
|
try
|
|
{
|
|
initalizeData(graph, pools);
|
|
}
|
|
catch (thrust::system_error &e)
|
|
{
|
|
cout << "Initialization failed (" << e.what() << ")" << endl;
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
// Run steps
|
|
AlgorithmStatus status;
|
|
|
|
try
|
|
{
|
|
do
|
|
{
|
|
status = invokeStep(pools, segmentations);
|
|
}
|
|
while (status != ALGORITHM_FINISHED);
|
|
}
|
|
catch (thrust::system_error &e)
|
|
{
|
|
cout << "Algorithm failed (" << e.what() << ")" << endl;
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
cudaEventRecord(stop, 0);
|
|
cudaEventSynchronize(stop);
|
|
|
|
float elapsedTime;
|
|
cudaEventElapsedTime(&elapsedTime, start, stop);
|
|
|
|
return elapsedTime;
|
|
}
|
|
|
|
private:
|
|
void printMemoryUsage()
|
|
{
|
|
size_t availableMemory, totalMemory, usedMemory;
|
|
|
|
cudaMemGetInfo(&availableMemory, &totalMemory);
|
|
usedMemory = totalMemory - availableMemory;
|
|
|
|
cout << "Device memory: used " << usedMemory
|
|
<< " available " << availableMemory
|
|
<< " total " << totalMemory << endl;
|
|
}
|
|
|
|
struct MemoryPoolsCollection
|
|
{
|
|
DeviceMemoryPool<uint> uintVertices;
|
|
DeviceMemoryPool<float> floatVertices;
|
|
DeviceMemoryPool<uint> uintEdges;
|
|
DeviceMemoryPool<float> floatEdges;
|
|
};
|
|
|
|
static const uint kUintVerticesPoolsRequired = 8;
|
|
static const uint kFloatVerticesPoolsRequired = 3;
|
|
static const uint kUintEdgesPoolsRequired = 8;
|
|
static const uint kFloatEdgesPoolsRequired = 4;
|
|
|
|
void initalizeData(const Graph &graph, MemoryPoolsCollection &pools)
|
|
{
|
|
// Get memory for the internal variables
|
|
verticesCount_ = static_cast<uint>(graph.vertices.size());
|
|
edgesCount_ = static_cast<uint>(graph.edges.size());
|
|
|
|
dVertices_ = pools.uintVertices.get();
|
|
dEdges_ = pools.uintEdges.get();
|
|
dWeights_ = pools.floatEdges.get();
|
|
|
|
dOutputEdgesFlags_ = pools.uintEdges.get();
|
|
|
|
// Copy graph to the device memory
|
|
checkCudaErrors(cudaMemcpy(dVertices_.get(),
|
|
&(graph.vertices[0]),
|
|
sizeof(uint) * verticesCount_,
|
|
cudaMemcpyHostToDevice));
|
|
checkCudaErrors(cudaMemcpy(dEdges_.get(),
|
|
&(graph.edges[0]),
|
|
sizeof(uint) * edgesCount_,
|
|
cudaMemcpyHostToDevice));
|
|
checkCudaErrors(cudaMemcpy(dWeights_.get(),
|
|
&(graph.weights[0]),
|
|
sizeof(float) * edgesCount_,
|
|
cudaMemcpyHostToDevice));
|
|
|
|
|
|
thrust::fill(dOutputEdgesFlags_,
|
|
dOutputEdgesFlags_ + edgesCount_,
|
|
0);
|
|
}
|
|
|
|
static const uint kMaxThreadsPerBlock = 256;
|
|
|
|
// Calculates grid parameters of the consecutive kernel calls
|
|
// based on the number of elements in the array.
|
|
void calculateThreadsDistribution(uint totalElements,
|
|
uint &blocksCount,
|
|
uint &threadsPerBlockCount)
|
|
{
|
|
if (totalElements > kMaxThreadsPerBlock)
|
|
{
|
|
blocksCount =
|
|
(totalElements + kMaxThreadsPerBlock - 1) /
|
|
kMaxThreadsPerBlock;
|
|
|
|
threadsPerBlockCount = kMaxThreadsPerBlock;
|
|
}
|
|
else
|
|
{
|
|
blocksCount = 1;
|
|
threadsPerBlockCount = totalElements;
|
|
}
|
|
}
|
|
|
|
enum AlgorithmStatus { ALGORITHM_NOT_FINISHED, ALGORITHM_FINISHED };
|
|
|
|
AlgorithmStatus invokeStep(MemoryPoolsCollection &pools,
|
|
Pyramid &segmentations)
|
|
{
|
|
uint blocksCount, threadsPerBlockCount;
|
|
|
|
calculateThreadsDistribution(edgesCount_,
|
|
blocksCount,
|
|
threadsPerBlockCount);
|
|
dim3 gridDimsForEdges(blocksCount, 1, 1);
|
|
dim3 blockDimsForEdges(threadsPerBlockCount, 1, 1);
|
|
|
|
calculateThreadsDistribution(verticesCount_,
|
|
blocksCount,
|
|
threadsPerBlockCount);
|
|
dim3 gridDimsForVertices(blocksCount, 1, 1);
|
|
dim3 blockDimsForVertices(threadsPerBlockCount, 1, 1);
|
|
|
|
thrust::device_ptr<uint> dEdgesFlags = pools.uintEdges.get();
|
|
|
|
thrust::fill(dEdgesFlags, dEdgesFlags + edgesCount_, 0);
|
|
|
|
// Mark the first edge for each vertex in "dEdgesFlags"
|
|
markSegments<<< gridDimsForVertices, blockDimsForVertices, 0 >>>
|
|
(dVertices_.get(), dEdgesFlags.get(), verticesCount_);
|
|
getLastCudaError("markSegments launch failed.");
|
|
|
|
// Now find minimum edges for each vertex.
|
|
thrust::device_ptr<uint> dMinScannedEdges =
|
|
pools.uintEdges.get();
|
|
thrust::device_ptr<float> dMinScannedWeights =
|
|
pools.floatEdges.get();
|
|
|
|
thrust::inclusive_scan_by_key(
|
|
dEdgesFlags,
|
|
dEdgesFlags + edgesCount_,
|
|
thrust::make_zip_iterator(
|
|
thrust::make_tuple(dWeights_, dEdges_)),
|
|
thrust::make_zip_iterator(
|
|
thrust::make_tuple(dMinScannedWeights, dMinScannedEdges)),
|
|
thrust::greater_equal<uint>(),
|
|
thrust::minimum< thrust::tuple<float, uint> >());
|
|
|
|
// To make things clear.
|
|
// Let "dEdgesFlags" denote groups of edges that
|
|
// correspond to the same vertices. Then the last edge of each group
|
|
// (in "dMinScannedEdges" and "dMinScannedWeights") is now minimal.
|
|
|
|
// Calculate a successor vertex for each vertex. A successor of the
|
|
// vertex v is a neighbouring vertex connected to v
|
|
// by the minimal edge.
|
|
thrust::device_ptr<uint> dSuccessors = pools.uintVertices.get();
|
|
|
|
getSuccessors<<< gridDimsForVertices, blockDimsForVertices, 0 >>>
|
|
(dVertices_.get(),
|
|
dMinScannedEdges.get(),
|
|
dSuccessors.get(),
|
|
verticesCount_,
|
|
edgesCount_);
|
|
getLastCudaError("getSuccessors launch failed.");
|
|
|
|
pools.uintEdges.put(dMinScannedEdges);
|
|
pools.floatEdges.put(dMinScannedWeights);
|
|
|
|
// Remove cyclic successor dependencies. Note that there can be only
|
|
// two vertices in a cycle. See [1] for details.
|
|
removeCycles<<< gridDimsForVertices, blockDimsForVertices, 0 >>>
|
|
(dSuccessors.get(), verticesCount_);
|
|
getLastCudaError("removeCycles launch failed.");
|
|
|
|
// Build up an array of startpoints for edges. As already stated,
|
|
// each group of edges denoted by "dEdgesFlags"
|
|
// has the same startpoint.
|
|
thrust::device_ptr<uint> dStartpoints = pools.uintEdges.get();
|
|
|
|
thrust::inclusive_scan(dEdgesFlags,
|
|
dEdgesFlags + edgesCount_,
|
|
dStartpoints);
|
|
|
|
addScalar<<< gridDimsForEdges, blockDimsForEdges, 0 >>>
|
|
(dStartpoints.get(), -1, edgesCount_);
|
|
getLastCudaError("addScalar launch failed.");
|
|
|
|
// Shrink the chains of successors. New successors will eventually
|
|
// represent superpixels of the new level.
|
|
thrust::device_ptr<uint> dRepresentatives =
|
|
pools.uintVertices.get();
|
|
|
|
getRepresentatives
|
|
<<< gridDimsForVertices, blockDimsForVertices, 0 >>>
|
|
(dSuccessors.get(),
|
|
dRepresentatives.get(),
|
|
verticesCount_);
|
|
getLastCudaError("getRepresentatives launch failed.");
|
|
|
|
swap(dSuccessors, dRepresentatives);
|
|
|
|
pools.uintVertices.put(dRepresentatives);
|
|
|
|
// Group vertices by successors' indices.
|
|
thrust::device_ptr<uint> dClusteredVerticesIDs =
|
|
pools.uintVertices.get();
|
|
|
|
thrust::sequence(dClusteredVerticesIDs,
|
|
dClusteredVerticesIDs + verticesCount_);
|
|
|
|
thrust::sort(
|
|
thrust::make_zip_iterator(
|
|
thrust::make_tuple(
|
|
thrust::device_ptr<uint> (dSuccessors),
|
|
thrust::device_ptr<uint> (dClusteredVerticesIDs))),
|
|
thrust::make_zip_iterator(
|
|
thrust::make_tuple(
|
|
thrust::device_ptr<uint> (dSuccessors +
|
|
verticesCount_),
|
|
thrust::device_ptr<uint> (dClusteredVerticesIDs +
|
|
verticesCount_))));
|
|
|
|
// Mark those groups.
|
|
thrust::device_ptr<uint> dVerticesFlags_ = pools.uintVertices.get();
|
|
|
|
thrust::fill(dVerticesFlags_, dVerticesFlags_ + verticesCount_, 0);
|
|
|
|
thrust::adjacent_difference(dSuccessors,
|
|
dSuccessors + verticesCount_,
|
|
dVerticesFlags_,
|
|
thrust::not_equal_to<uint>());
|
|
|
|
cudaMemset((void *) dVerticesFlags_.get(), 0, sizeof(uint));
|
|
|
|
// Assign new indices to the successors (the indices of vertices
|
|
// at the new level).
|
|
thrust::device_ptr<uint> dNewVerticesIDs_ =
|
|
pools.uintVertices.get();
|
|
|
|
thrust::inclusive_scan(dVerticesFlags_,
|
|
dVerticesFlags_ + verticesCount_,
|
|
dNewVerticesIDs_);
|
|
|
|
pools.uintVertices.put(dVerticesFlags_);
|
|
|
|
// Now we can calculate number of resulting superpixels easily.
|
|
uint newVerticesCount;
|
|
cudaMemcpy(&newVerticesCount,
|
|
(dNewVerticesIDs_ + verticesCount_ - 1).get(),
|
|
sizeof(uint),
|
|
cudaMemcpyDeviceToHost);
|
|
++newVerticesCount;
|
|
|
|
// There are two special cases when we can stop our algorithm:
|
|
// 1) number of vertices in the graph remained unchanged;
|
|
// 2) only one vertex remains.
|
|
if (newVerticesCount == verticesCount_)
|
|
{
|
|
return ALGORITHM_FINISHED;
|
|
}
|
|
else if (newVerticesCount == 1)
|
|
{
|
|
thrust::device_ptr<uint> dDummyVerticesOffsets =
|
|
pools.uintVertices.get();
|
|
|
|
cudaMemset((void *) dDummyVerticesOffsets.get(),
|
|
0,
|
|
sizeof(uint));
|
|
|
|
thrust::device_ptr<uint> dDummyVerticesIDs =
|
|
pools.uintVertices.get();
|
|
|
|
thrust::sequence(dDummyVerticesIDs,
|
|
dDummyVerticesIDs + verticesCount_);
|
|
|
|
segmentations.addLevel(1,
|
|
verticesCount_,
|
|
dDummyVerticesOffsets,
|
|
dDummyVerticesIDs);
|
|
|
|
return ALGORITHM_FINISHED;
|
|
}
|
|
|
|
// Calculate how old vertices IDs map to new vertices IDs.
|
|
thrust::device_ptr<uint> dVerticesMapping =
|
|
pools.uintVertices.get();
|
|
|
|
getVerticesMapping
|
|
<<< gridDimsForVertices, blockDimsForVertices, 0 >>>
|
|
(dClusteredVerticesIDs.get(),
|
|
dNewVerticesIDs_.get(),
|
|
dVerticesMapping.get(),
|
|
verticesCount_);
|
|
getLastCudaError("getVerticesMapping launch failed.");
|
|
|
|
pools.uintVertices.put(dNewVerticesIDs_);
|
|
pools.uintVertices.put(dClusteredVerticesIDs);
|
|
pools.uintVertices.put(dSuccessors);
|
|
|
|
// Invalidate self-loops in the reduced graph (the graph
|
|
// produced by merging all old vertices that have
|
|
// the same successor).
|
|
invalidateLoops<<< gridDimsForEdges, blockDimsForEdges, 0 >>>
|
|
(dStartpoints.get(),
|
|
dVerticesMapping.get(),
|
|
dEdges_.get(),
|
|
edgesCount_);
|
|
getLastCudaError("invalidateLoops launch failed.");
|
|
|
|
// Calculate various information about the surviving
|
|
// (new startpoints IDs and IDs of edges) and
|
|
// non-surviving/contracted edges (their weights).
|
|
thrust::device_ptr<uint> dNewStartpoints = pools.uintEdges.get();
|
|
thrust::device_ptr<uint> dSurvivedEdgesIDs = pools.uintEdges.get();
|
|
|
|
calculateEdgesInfo<<< gridDimsForEdges, blockDimsForEdges, 0 >>>
|
|
(dStartpoints.get(),
|
|
dVerticesMapping.get(),
|
|
dEdges_.get(),
|
|
dWeights_.get(),
|
|
dNewStartpoints.get(),
|
|
dSurvivedEdgesIDs.get(),
|
|
edgesCount_,
|
|
newVerticesCount);
|
|
getLastCudaError("calculateEdgesInfo launch failed.");
|
|
|
|
pools.uintEdges.put(dStartpoints);
|
|
|
|
// Group that information by the new startpoints IDs.
|
|
// Keep in mind that we want to build new (reduced) graph and apply
|
|
// the step of the algorithm to that one. Hence we need to
|
|
// preserve the structure of the original graph: neighbours and
|
|
// weights should be grouped by vertex.
|
|
thrust::sort(
|
|
thrust::make_zip_iterator(
|
|
thrust::make_tuple(dNewStartpoints,
|
|
dSurvivedEdgesIDs)),
|
|
thrust::make_zip_iterator(
|
|
thrust::make_tuple(dNewStartpoints + edgesCount_,
|
|
dSurvivedEdgesIDs + edgesCount_)));
|
|
|
|
// Find the group of contracted edges.
|
|
uint *invalidEdgesPtr =
|
|
thrust::find_if(
|
|
dNewStartpoints,
|
|
dNewStartpoints + edgesCount_,
|
|
IsGreaterEqualThan<uint>(newVerticesCount)).get();
|
|
|
|
// Calculate how many edges there are in the reduced graph.
|
|
uint validEdgesCount =
|
|
static_cast<uint>(invalidEdgesPtr - dNewStartpoints.get());
|
|
|
|
// Mark groups of edges corresponding to the same vertex in the
|
|
// reduced graph.
|
|
thrust::adjacent_difference(dNewStartpoints,
|
|
dNewStartpoints + edgesCount_,
|
|
dEdgesFlags,
|
|
thrust::not_equal_to<uint>());
|
|
|
|
cudaMemset((void *) dEdgesFlags.get(), 0, sizeof(uint));
|
|
cudaMemset((void *) dEdgesFlags.get(), 1, 1);
|
|
|
|
pools.uintEdges.put(dNewStartpoints);
|
|
|
|
// Now we are able to build the reduced graph. See "Graph"
|
|
// class for the details on the graph's internal structure.
|
|
|
|
// Calculate vertices' offsets for the reduced graph.
|
|
thrust::copy_if(thrust::make_counting_iterator(0U),
|
|
thrust::make_counting_iterator(validEdgesCount),
|
|
dEdgesFlags,
|
|
dVertices_,
|
|
thrust::identity<uint>()).get();
|
|
|
|
pools.uintEdges.put(dEdgesFlags);
|
|
|
|
// Build up a neighbourhood for each vertex in the reduced graph
|
|
// (this includes recalculating edges' weights).
|
|
calculateThreadsDistribution(validEdgesCount,
|
|
blocksCount,
|
|
threadsPerBlockCount);
|
|
dim3 newGridDimsForEdges(blocksCount, 1, 1);
|
|
dim3 newBlockDimsForEdges(threadsPerBlockCount, 1, 1);
|
|
|
|
thrust::device_ptr<uint> dNewEdges = pools.uintEdges.get();
|
|
thrust::device_ptr<float> dNewWeights = pools.floatEdges.get();
|
|
|
|
makeNewEdges<<< newGridDimsForEdges,
|
|
newBlockDimsForEdges,
|
|
0 >>>
|
|
(dSurvivedEdgesIDs.get(),
|
|
dVerticesMapping.get(),
|
|
dEdges_.get(),
|
|
dWeights_.get(),
|
|
dNewEdges.get(),
|
|
dNewWeights.get(),
|
|
validEdgesCount);
|
|
getLastCudaError("makeNewEdges launch failed.");
|
|
|
|
swap(dEdges_, dNewEdges);
|
|
swap(dWeights_, dNewWeights);
|
|
|
|
pools.uintEdges.put(dNewEdges);
|
|
pools.floatEdges.put(dNewWeights);
|
|
|
|
pools.uintEdges.put(dSurvivedEdgesIDs);
|
|
|
|
// The graph's reconstruction is now finished.
|
|
|
|
// Build new level of the segmentation tree. It is a trivial task
|
|
// as we already have "dVerticesMapping" that contains all
|
|
// sufficient information about the vertices' transformations.
|
|
thrust::device_ptr<uint> dVerticesIDs =
|
|
pools.uintVertices.get();
|
|
thrust::device_ptr<uint> dNewVerticesOffsets =
|
|
pools.uintVertices.get();
|
|
|
|
thrust::sequence(dVerticesIDs, dVerticesIDs + verticesCount_);
|
|
|
|
thrust::sort_by_key(dVerticesMapping,
|
|
dVerticesMapping + verticesCount_,
|
|
dVerticesIDs);
|
|
|
|
thrust::unique_by_key_copy(dVerticesMapping,
|
|
dVerticesMapping + verticesCount_,
|
|
thrust::make_counting_iterator(0),
|
|
thrust::make_discard_iterator(),
|
|
dNewVerticesOffsets);
|
|
|
|
segmentations.addLevel(newVerticesCount,
|
|
verticesCount_,
|
|
dNewVerticesOffsets,
|
|
dVerticesIDs);
|
|
|
|
pools.uintVertices.put(dVerticesIDs);
|
|
pools.uintVertices.put(dNewVerticesOffsets);
|
|
pools.uintVertices.put(dVerticesMapping);
|
|
|
|
// We can now safely set new counts for vertices and edges.
|
|
verticesCount_ = newVerticesCount;
|
|
edgesCount_ = validEdgesCount;
|
|
|
|
return ALGORITHM_NOT_FINISHED;
|
|
}
|
|
|
|
uint verticesCount_;
|
|
uint edgesCount_;
|
|
|
|
thrust::device_ptr<uint> dVertices_;
|
|
thrust::device_ptr<uint> dEdges_;
|
|
thrust::device_ptr<float> dWeights_;
|
|
|
|
thrust::device_ptr<uint> dOutputEdgesFlags_;
|
|
};
|
|
|
|
// Loads PPM image.
|
|
int loadImage(const char *filename,
|
|
const char *executablePath,
|
|
vector<uchar3> &data,
|
|
uint &width,
|
|
uint &height)
|
|
{
|
|
const char *imagePath = sdkFindFilePath(filename, executablePath);
|
|
|
|
if (imagePath == NULL)
|
|
{
|
|
return -1;
|
|
}
|
|
|
|
uchar *dataHandle = NULL;
|
|
unsigned int channels;
|
|
|
|
if (!__loadPPM(imagePath, &dataHandle, &width, &height, &channels))
|
|
{
|
|
return -1;
|
|
}
|
|
|
|
data.assign(reinterpret_cast<uchar3 *>(dataHandle),
|
|
reinterpret_cast<uchar3 *>(dataHandle) + width * height);
|
|
|
|
free(reinterpret_cast<void *>(dataHandle));
|
|
|
|
return 0;
|
|
}
|
|
|
|
inline float distance(const uchar3 &first, const uchar3 &second)
|
|
{
|
|
int dx = static_cast<int>(first.x) - static_cast<int>(second.x);
|
|
int dy = static_cast<int>(first.y) - static_cast<int>(second.y);
|
|
int dz = static_cast<int>(first.z) - static_cast<int>(second.z);
|
|
|
|
uint sqrResult = dx * dx + dy * dy + dz * dz;
|
|
|
|
return sqrt(static_cast<float>(sqrResult));
|
|
}
|
|
|
|
// Builds a net-graph for the image with 4-connected pixels.
|
|
void buildGraph(const vector<uchar3> &image,
|
|
uint width,
|
|
uint height,
|
|
Graph &graph)
|
|
{
|
|
uint totalNodes = static_cast<uint>(image.size());
|
|
|
|
graph.vertices.resize(totalNodes);
|
|
graph.edges.reserve(4 * totalNodes - 2 * (width + height));
|
|
graph.weights.reserve(graph.edges.size());
|
|
|
|
uint edgesProcessed = 0;
|
|
|
|
for (uint y = 0; y < height; ++y)
|
|
{
|
|
for (uint x = 0; x < width; ++x)
|
|
{
|
|
uint nodeIndex = y * width + x;
|
|
const uchar3 ¢erPixel = image[nodeIndex];
|
|
|
|
graph.vertices[nodeIndex] = edgesProcessed;
|
|
|
|
if (y > 0)
|
|
{
|
|
uint lowerNodeIndex = (y - 1) * width + x;
|
|
const uchar3 &lowerPixel = image[lowerNodeIndex];
|
|
|
|
graph.edges.push_back(lowerNodeIndex);
|
|
graph.weights.push_back(distance(centerPixel, lowerPixel));
|
|
|
|
++edgesProcessed;
|
|
}
|
|
|
|
if (y + 1 < height)
|
|
{
|
|
uint upperNodeIndex = (y + 1) * width + x;
|
|
const uchar3 &upperPixel = image[upperNodeIndex];
|
|
|
|
graph.edges.push_back(upperNodeIndex);
|
|
graph.weights.push_back(distance(centerPixel, upperPixel));
|
|
|
|
++edgesProcessed;
|
|
}
|
|
|
|
if (x > 0)
|
|
{
|
|
uint leftNodeIndex = y * width + x - 1;
|
|
const uchar3 &leftPixel = image[leftNodeIndex];
|
|
|
|
graph.edges.push_back(leftNodeIndex);
|
|
graph.weights.push_back(distance(centerPixel, leftPixel));
|
|
|
|
++edgesProcessed;
|
|
}
|
|
|
|
if (x + 1 < width)
|
|
{
|
|
uint rightNodeIndex = y * width + x + 1;
|
|
const uchar3 &rightPixel = image[rightNodeIndex];
|
|
|
|
graph.edges.push_back(rightNodeIndex);
|
|
graph.weights.push_back(distance(centerPixel, rightPixel));
|
|
|
|
++edgesProcessed;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static char *kDefaultImageName = (char*)"test.ppm";
|
|
|
|
int main(int argc, char **argv)
|
|
{
|
|
vector<uchar3> image;
|
|
uint imageWidth, imageHeight;
|
|
char *imageName;
|
|
|
|
printf("%s Starting...\n\n", argv[0]);
|
|
|
|
imageName = (char *)kDefaultImageName;
|
|
|
|
if (checkCmdLineFlag(argc, (const char **) argv, "file"))
|
|
{
|
|
getCmdLineArgumentString(argc,
|
|
(const char **) argv,
|
|
"file",
|
|
&imageName);
|
|
}
|
|
|
|
if (loadImage(imageName, argv[0], image, imageWidth, imageHeight) != 0)
|
|
{
|
|
printf("Failed to open <%s>, program exit...\n", imageName);
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
findCudaDevice(argc, (const char **)argv);
|
|
|
|
Graph graph;
|
|
buildGraph(image, imageWidth, imageHeight, graph);
|
|
|
|
Pyramid segmentations;
|
|
|
|
cout << "* Building segmentation tree... ";
|
|
cout.flush();
|
|
|
|
SegmentationTreeBuilder algo;
|
|
float elapsedTime = algo.run(graph, segmentations);
|
|
|
|
cout << "done in " << elapsedTime << " (ms)" << endl;
|
|
|
|
cout << "* Dumping levels for each tree..." << endl << endl;
|
|
|
|
segmentations.dump(imageWidth, imageHeight);
|
|
|
|
bool bResults[2];
|
|
|
|
bResults[0] = sdkComparePPM("level_00.ppm",
|
|
sdkFindFilePath("ref_00.ppm", argv[0]),
|
|
5.0f,
|
|
0.15f,
|
|
false);
|
|
bResults[1] = sdkComparePPM("level_09.ppm",
|
|
sdkFindFilePath("ref_09.ppm", argv[0]),
|
|
5.0f,
|
|
0.15f,
|
|
false);
|
|
|
|
exit((bResults[0] && bResults[1]) ? EXIT_SUCCESS : EXIT_FAILURE);
|
|
}
|