/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

////////////////////////////////////////////////////////////////////////////////
//
//  cdpAdvancedQuicksort.cu
//
//  Implementation of a parallel quicksort in CUDA. It comes in several parts:
//
//  1. A small-set insertion sort. We do this on any set with <=32 elements.
//  2. A partitioning kernel, which - given a pivot - separates an input array
//     into elements <=pivot and >pivot. Two quicksorts will then be launched
//     to resolve each of these.
//  3. A quicksort co-ordinator, which figures out what kernels to launch
//     and when.
//
////////////////////////////////////////////////////////////////////////////////
#include <thrust/random.h>
#include <thrust/device_vector.h>
#include <cooperative_groups.h>

namespace cg = cooperative_groups;

#include <helper_cuda.h>
#include <helper_string.h>
#include "cdpQuicksort.h"

////////////////////////////////////////////////////////////////////////////////
// Inline PTX call to return index of highest non-zero bit in a word
////////////////////////////////////////////////////////////////////////////////
static __device__ __forceinline__ unsigned int __qsflo(unsigned int word) {
  unsigned int ret;
  asm volatile("bfind.u32 %0, %1;" : "=r"(ret) : "r"(word));
  return ret;
}

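// Editorial note (not part of the original sample): for a non-zero word,
// bfind.u32 returns the bit position of the most significant set bit, i.e.
// the same value as 31 - __clz(word); for word == 0 it returns 0xFFFFFFFF.
// The kernels below use it to round a length up to the next power of two for
// the bitonic stages, e.g. for a length n = 5:
//   __qsflo(5 - 1) = 2, so 1 << (2 + 1) = 8 threads are launched.
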
////////////////////////////////////////////////////////////////////////////////
//
//  ringbufAlloc
//
//  Allocates from a ringbuffer. Allows for not failing when we run out
//  of stack for tracking the offset counts for each sort subsection.
//
//  We use the atomicMax trick to allow out-of-order retirement. If we
//  hit the size limit on the ringbuffer, then we spin-wait for people
//  to complete.
//
////////////////////////////////////////////////////////////////////////////////
template <typename T>
static __device__ T *ringbufAlloc(qsortRingbuf *ringbuf) {
  // Wait for there to be space in the ring buffer. We'll retry only a fixed
  // number of times and then fail, to avoid an out-of-memory deadlock.
  // (Decrement in the loop body so the retry counter stops at zero rather
  // than wrapping past it.)
  unsigned int loop = 10000;

  while (((ringbuf->head - ringbuf->tail) >= ringbuf->stacksize) && (loop > 0))
    loop--;

  if (loop == 0) return NULL;

  // Note that the element includes a little index book-keeping, for freeing
  // later.
  unsigned int index = atomicAdd((unsigned int *)&ringbuf->head, 1);
  T *ret = (T *)(ringbuf->stackbase) + (index & (ringbuf->stacksize - 1));
  ret->index = index;

  return ret;
}

////////////////////////////////////////////////////////////////////////////////
//
//  ringbufFree
//
//  Releases an element from the ring buffer. If every element is released
//  up to and including this one, we can advance the tail to indicate that
//  space is now available.
//
////////////////////////////////////////////////////////////////////////////////
template <typename T>
static __device__ void ringbufFree(qsortRingbuf *ringbuf, T *data) {
  unsigned int index = data->index;  // Non-wrapped index to free
  unsigned int count = atomicAdd((unsigned int *)&(ringbuf->count), 1) + 1;
  unsigned int max = atomicMax((unsigned int *)&(ringbuf->max), index + 1);

  // Update the tail if need be. Note we update "max" to be the new value in
  // ringbuf->max
  if (max < (index + 1)) max = index + 1;

  if (max == count) atomicMax((unsigned int *)&(ringbuf->tail), count);
}

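// Worked example (editorial note, not part of the original sample): suppose
// elements with indices 0, 1 and 2 are live and element 2 is freed first.
// Then count becomes 1 and max becomes 3, so the tail is not moved
// (max != count). When elements 0 and 1 are later freed, count reaches 3,
// max is still 3, and the tail is advanced to 3 in a single step, releasing
// all three slots despite the out-of-order retirement.
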
////////////////////////////////////////////////////////////////////////////////
//
//  qsort_warp
//
//  Simplest possible implementation: does a per-warp quicksort with no
//  inter-warp communication. This has a high atomic issue rate, but the
//  rest should actually be fairly quick because of low work per thread.
//
//  A warp finds its section of the data, then writes all data <pivot to one
//  buffer and all data >pivot to the other. Atomics are used to get a unique
//  section of the buffer.
//
//  Obvious optimisation: do multiple chunks per warp, to increase in-flight
//  loads and cover the instruction overhead.
//
////////////////////////////////////////////////////////////////////////////////
__global__ void qsort_warp(unsigned *indata, unsigned *outdata,
                           unsigned int offset, unsigned int len,
                           qsortAtomicData *atomicData,
                           qsortRingbuf *atomicDataStack,
                           unsigned int source_is_indata, unsigned int depth) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  // Find my data offset, based on warp ID
  unsigned int thread_id = threadIdx.x + (blockIdx.x << QSORT_BLOCKSIZE_SHIFT);
  // unsigned int warp_id = threadIdx.x >> 5;  // Used for debug only
  unsigned int lane_id = threadIdx.x & (warpSize - 1);

  // Exit if I'm outside the range of sort to be done
  if (thread_id >= len) return;

  //
  // First part of the algorithm. Each warp counts the number of elements
  // that are greater/less than the pivot.
  //
  // When a warp knows its count, it updates an atomic counter.
  //

  // Read in the data and the pivot. Arbitrary pivot selection for now.
  unsigned pivot = indata[offset + len / 2];
  unsigned data = indata[offset + thread_id];

  // Count how many are <= and how many are > pivot.
  // If all are <= pivot then we adjust the comparison
  // because otherwise the sort will move nothing and
  // we'll iterate forever.
  cg::coalesced_group active = cg::coalesced_threads();
  unsigned int greater = (data > pivot);
  unsigned int gt_mask = active.ballot(greater);

  if (gt_mask == 0) {
    greater = (data >= pivot);
    gt_mask = active.ballot(greater);  // Must re-ballot for adjusted comparator
  }

  unsigned int lt_mask = active.ballot(!greater);
  unsigned int gt_count = __popc(gt_mask);
  unsigned int lt_count = __popc(lt_mask);

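  // Worked example (editorial note, not part of the original sample): with
  // four active lanes holding {3, 9, 7, 2} and pivot = 7, "greater" is
  // {0, 1, 0, 0}, so gt_mask has one bit set and lt_mask has three. Hence
  // gt_count = 1 and lt_count = 3: the warp reserves one slot at the top of
  // the output range and three at the bottom.
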
  // Atomically adjust the lt_ and gt_offsets by this amount. Only one thread
  // need do this. Share the result using shfl
  unsigned int lt_offset, gt_offset;

  if (lane_id == 0) {
    if (lt_count > 0)
      lt_offset = atomicAdd((unsigned int *)&atomicData->lt_offset, lt_count);

    if (gt_count > 0)
      gt_offset =
          len - (atomicAdd((unsigned int *)&atomicData->gt_offset, gt_count) +
                 gt_count);
  }

  lt_offset =
      active.shfl((int)lt_offset, 0);  // Everyone pulls the offsets from lane 0
  gt_offset = active.shfl((int)gt_offset, 0);

  // Now compute my own personal offset within this. I need to know how many
  // threads with a lane ID less than mine are going to write to the same
  // buffer as me. We can use popc to implement a single-operation warp scan
  // in this case.
  unsigned lane_mask_lt;
  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lane_mask_lt));
  unsigned int my_mask = greater ? gt_mask : lt_mask;
  unsigned int my_offset = __popc(my_mask & lane_mask_lt);

  // Move data.
  my_offset += greater ? gt_offset : lt_offset;
  outdata[offset + my_offset] = data;

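  // Worked example (editorial note, not part of the original sample):
  // %lanemask_lt has a bit set for every lane below the calling lane, so
  // __popc(my_mask & lane_mask_lt) counts how many lower-numbered lanes write
  // to the same buffer. If lt_mask = 0b1011 and I am lane 3, then
  // lane_mask_lt = 0b0111 and my_offset = __popc(0b0011) = 2: I write to the
  // third slot of this warp's "less-than" allocation.
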
  // Count up if we're the last warp in. If so, then Kepler will launch the
  // next set of sorts directly from here.
  if (lane_id == 0) {
    // Count "elements written". If I wrote the last one, then trigger the
    // next qsorts
    unsigned int mycount = lt_count + gt_count;

    if (atomicAdd((unsigned int *)&atomicData->sorted_count, mycount) +
            mycount ==
        len) {
      // We're the last warp to do any sorting. Therefore it's up to us to
      // launch the next stage.
      unsigned int lt_len = atomicData->lt_offset;
      unsigned int gt_len = atomicData->gt_offset;

      cudaStream_t lstream, rstream;
      cudaStreamCreateWithFlags(&lstream, cudaStreamNonBlocking);
      cudaStreamCreateWithFlags(&rstream, cudaStreamNonBlocking);

      // Begin by freeing our atomicData storage. It's better for the
      // ringbuffer algorithm if we free when we're done, rather than
      // re-using (makes for less fragmentation).
      ringbufFree<qsortAtomicData>(atomicDataStack, atomicData);

      // Exceptional case: if "lt_len" is zero, then all values in the batch
      // are equal. We are then done (may need to copy into correct buffer,
      // though)
      if (lt_len == 0) {
        if (source_is_indata)
          cudaMemcpyAsync(indata + offset, outdata + offset,
                          gt_len * sizeof(unsigned), cudaMemcpyDeviceToDevice,
                          lstream);

        return;
      }

      // Start with lower half first
      if (lt_len > BITONICSORT_LEN) {
        // If we've exceeded maximum depth, fall through to backup
        // big_bitonicsort
        if (depth >= QSORT_MAXDEPTH) {
          // The final bitonic stage sorts in-place in "outdata". We therefore
          // re-use "indata" as the out-of-range tracking buffer. For (2^n)+1
          // elements we need (2^(n+1)) bytes of oor buffer. The backup qsort
          // buffer is at least this large when sizeof(QTYPE) >= 2.
          big_bitonicsort<<<1, BITONICSORT_LEN, 0, lstream>>>(
              outdata, source_is_indata ? indata : outdata, indata, offset,
              lt_len);
        } else {
          // Launch another quicksort. We need to allocate more storage for
          // the atomic data.
          if ((atomicData = ringbufAlloc<qsortAtomicData>(atomicDataStack)) ==
              NULL)
            printf("Stack-allocation error. Failing left child launch.\n");
          else {
            atomicData->lt_offset = atomicData->gt_offset =
                atomicData->sorted_count = 0;
            unsigned int numblocks =
                (unsigned int)(lt_len + (QSORT_BLOCKSIZE - 1)) >>
                QSORT_BLOCKSIZE_SHIFT;
            qsort_warp<<<numblocks, QSORT_BLOCKSIZE, 0, lstream>>>(
                outdata, indata, offset, lt_len, atomicData, atomicDataStack,
                !source_is_indata, depth + 1);
          }
        }
      } else if (lt_len > 1) {
        // Final stage uses a bitonic sort instead. It's important to
        // make sure the final stage ends up in the correct (original) buffer.
        // We launch the smallest power-of-2 number of threads that we can.
        unsigned int bitonic_len = 1 << (__qsflo(lt_len - 1U) + 1);
        bitonicsort<<<1, bitonic_len, 0, lstream>>>(
            outdata, source_is_indata ? indata : outdata, offset, lt_len);
      }
      // Finally, if we sorted just one single element, we must still make
      // sure that it winds up in the correct place.
      else if (source_is_indata && (lt_len == 1))
        indata[offset] = outdata[offset];

      if (cudaPeekAtLastError() != cudaSuccess)
        printf("Left-side launch fail: %s\n",
               cudaGetErrorString(cudaGetLastError()));

      // Now the upper half.
      if (gt_len > BITONICSORT_LEN) {
        // If we've exceeded maximum depth, fall through to backup
        // big_bitonicsort
        if (depth >= QSORT_MAXDEPTH)
          big_bitonicsort<<<1, BITONICSORT_LEN, 0, rstream>>>(
              outdata, source_is_indata ? indata : outdata, indata,
              offset + lt_len, gt_len);
        else {
          // Allocate new atomic storage for this launch
          if ((atomicData = ringbufAlloc<qsortAtomicData>(atomicDataStack)) ==
              NULL)
            printf("Stack allocation error! Failing right-side launch.\n");
          else {
            atomicData->lt_offset = atomicData->gt_offset =
                atomicData->sorted_count = 0;
            unsigned int numblocks =
                (unsigned int)(gt_len + (QSORT_BLOCKSIZE - 1)) >>
                QSORT_BLOCKSIZE_SHIFT;
            qsort_warp<<<numblocks, QSORT_BLOCKSIZE, 0, rstream>>>(
                outdata, indata, offset + lt_len, gt_len, atomicData,
                atomicDataStack, !source_is_indata, depth + 1);
          }
        }
      } else if (gt_len > 1) {
        unsigned int bitonic_len = 1 << (__qsflo(gt_len - 1U) + 1);
        bitonicsort<<<1, bitonic_len, 0, rstream>>>(
            outdata, source_is_indata ? indata : outdata, offset + lt_len,
            gt_len);
      } else if (source_is_indata && (gt_len == 1))
        indata[offset + lt_len] = outdata[offset + lt_len];

      if (cudaPeekAtLastError() != cudaSuccess)
        printf("Right-side launch fail: %s\n",
               cudaGetErrorString(cudaGetLastError()));
    }
  }
}

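// Editorial note (not part of the original sample): each recursive launch
// above swaps the roles of "indata" and "outdata" and flips
// "source_is_indata", so the partition at the next depth reads from whichever
// buffer this level wrote. The final bitonic/copy stages then use
// "source_is_indata ? indata : outdata" as their destination, which is what
// guarantees every fully-sorted range ends up back in the caller's original
// buffer.
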
////////////////////////////////////////////////////////////////////////////////
//
//  run_quicksort_cdp
//
//  Host-side code to run the Kepler version of quicksort. It's pretty
//  simple, because all launch control is handled on the device via CDP.
//
//  All parallel quicksorts require an equal-sized scratch buffer. This
//  must be passed in ahead of time.
//
//  Returns the time elapsed for the sort.
//
////////////////////////////////////////////////////////////////////////////////
float run_quicksort_cdp(unsigned *gpudata, unsigned *scratchdata,
                        unsigned int count, cudaStream_t stream) {
  unsigned int stacksize = QSORT_STACK_ELEMS;

  // This is the stack, for atomic tracking of each sort's status
  qsortAtomicData *gpustack;
  checkCudaErrors(
      cudaMalloc((void **)&gpustack, stacksize * sizeof(qsortAtomicData)));
  checkCudaErrors(cudaMemset(
      gpustack, 0, sizeof(qsortAtomicData)));  // Only need to set first entry to 0

  // Create the memory ringbuffer used for handling the stack.
  // Initialise everything to where it needs to be.
  qsortRingbuf buf;
  qsortRingbuf *ringbuf;
  checkCudaErrors(cudaMalloc((void **)&ringbuf, sizeof(qsortRingbuf)));
  buf.head = 1;  // We start with one allocation
  buf.tail = 0;
  buf.count = 0;
  buf.max = 0;
  buf.stacksize = stacksize;
  buf.stackbase = gpustack;
  checkCudaErrors(
      cudaMemcpy(ringbuf, &buf, sizeof(buf), cudaMemcpyHostToDevice));

  // Timing events...
  cudaEvent_t ev1, ev2;
  checkCudaErrors(cudaEventCreate(&ev1));
  checkCudaErrors(cudaEventCreate(&ev2));
  checkCudaErrors(cudaEventRecord(ev1));

  // Now we trivially launch the qsort kernel
  if (count > BITONICSORT_LEN) {
    unsigned int numblocks =
        (unsigned int)(count + (QSORT_BLOCKSIZE - 1)) >> QSORT_BLOCKSIZE_SHIFT;
    qsort_warp<<<numblocks, QSORT_BLOCKSIZE, 0, stream>>>(
        gpudata, scratchdata, 0U, count, gpustack, ringbuf, true, 0);
  } else {
    bitonicsort<<<1, BITONICSORT_LEN>>>(gpudata, gpudata, 0, count);
  }

  checkCudaErrors(cudaGetLastError());
  checkCudaErrors(cudaEventRecord(ev2));
  checkCudaErrors(cudaDeviceSynchronize());

  float elapse = 0.0f;

  if (cudaPeekAtLastError() != cudaSuccess)
    printf("Launch failure: %s\n", cudaGetErrorString(cudaGetLastError()));
  else
    checkCudaErrors(cudaEventElapsedTime(&elapse, ev1, ev2));

  // Sanity check that the stack allocator is doing the right thing
  checkCudaErrors(
      cudaMemcpy(&buf, ringbuf, sizeof(*ringbuf), cudaMemcpyDeviceToHost));

  if (count > BITONICSORT_LEN && buf.head != buf.tail) {
    printf("Stack allocation error!\nRingbuf:\n");
    printf("\t head = %u\n", buf.head);
    printf("\t tail = %u\n", buf.tail);
    printf("\tcount = %u\n", buf.count);
    printf("\t max = %u\n", buf.max);
  }

  // Release our stack data once we're done
  checkCudaErrors(cudaFree(ringbuf));
  checkCudaErrors(cudaFree(gpustack));

  return elapse;
}

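// Illustrative usage sketch (editorial addition, not part of the original
// sample). It assumes "n" unsigned keys already live in a device buffer
// "d_keys"; the names are hypothetical. Note that CDP samples such as this
// one are normally compiled with relocatable device code (-rdc=true) and
// linked against cudadevrt.
//
//   unsigned *d_keys, *d_scratch;  // device input and equal-sized scratch
//   checkCudaErrors(cudaMalloc((void **)&d_keys, n * sizeof(unsigned)));
//   checkCudaErrors(cudaMalloc((void **)&d_scratch, n * sizeof(unsigned)));
//   checkCudaErrors(cudaMemcpy(d_keys, h_keys, n * sizeof(unsigned),
//                              cudaMemcpyHostToDevice));
//   float ms = run_quicksort_cdp(d_keys, d_scratch, n, NULL);  // sorts d_keys
//   printf("sorted %u keys in %.3f ms\n", n, ms);
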
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
int run_qsort(unsigned int size, int seed, int debug, int loop, int verbose) {
  if (seed > 0) srand(seed);

  // Create and set up our test
  unsigned *gpudata, *scratchdata;
  checkCudaErrors(cudaMalloc((void **)&gpudata, size * sizeof(unsigned)));
  checkCudaErrors(cudaMalloc((void **)&scratchdata, size * sizeof(unsigned)));

  // Create CPU data.
  unsigned *data = new unsigned[size];
  unsigned int min = loop ? loop : size;
  unsigned int max = size;
  loop = (loop == 0) ? 1 : loop;

  for (size = min; size <= max; size += loop) {
    if (verbose) printf(" Input: ");

    for (unsigned int i = 0; i < size; i++) {
      // Build data 8 bits at a time
      data[i] = 0;
      char *ptr = (char *)&(data[i]);

      for (unsigned j = 0; j < sizeof(unsigned); j++) {
        // Easy-to-read data in debug mode
        if (debug) {
          *ptr++ = (char)(rand() % 10);
          break;
        }

        *ptr++ = (char)(rand() & 255);
      }

      if (verbose) {
        if (i && !(i % 32)) printf("\n ");

        printf("%u ", data[i]);
      }
    }

    if (verbose) printf("\n");

    checkCudaErrors(cudaMemcpy(gpudata, data, size * sizeof(unsigned),
                               cudaMemcpyHostToDevice));

    // So we're now populated and ready to go! We size our launch as
    // blocks of up to BLOCKSIZE threads, and appropriate grid size.
    // One thread is launched per element.
    float elapse;
    elapse = run_quicksort_cdp(gpudata, scratchdata, size, NULL);

    // run_bitonicsort<SORTTYPE>(gpudata, scratchdata, size, verbose);
    checkCudaErrors(cudaDeviceSynchronize());

    // Copy back the data and verify correct sort
    checkCudaErrors(cudaMemcpy(data, gpudata, size * sizeof(unsigned),
                               cudaMemcpyDeviceToHost));

    if (verbose) {
      printf("Output: ");

      for (unsigned int i = 0; i < size; i++) {
        if (i && !(i % 32)) printf("\n ");

        printf("%u ", data[i]);
      }

      printf("\n");
    }

    unsigned int check;

    for (check = 1; check < size; check++) {
      if (data[check] < data[check - 1]) {
        printf("FAILED at element: %u\n", check);
        break;
      }
    }

    if (check != size) {
      printf(" cdpAdvancedQuicksort FAILED\n");
      exit(EXIT_FAILURE);
    } else
      printf(" cdpAdvancedQuicksort PASSED\n");

    // Display the time between event recordings
    printf("Sorted %u elems in %.3f ms (%.3f Melems/sec)\n", size, elapse,
           (float)size / (elapse * 1000.0f));
    fflush(stdout);
  }

  // Release everything and we're done
  checkCudaErrors(cudaFree(scratchdata));
  checkCudaErrors(cudaFree(gpudata));
  delete[] data;
  return 0;
}

static void usage() {
  printf(
      "Syntax: cdpAdvancedQuicksort [-size=<num>] [-seed=<num>] [-debug] "
      "[-loop-step=<num>] [-verbose]\n");
  printf(
      "If loop-step is non-zero, will run from loop-step->array_len in steps "
      "of loop-step\n");
}

// Host side entry
int main(int argc, char *argv[]) {
  int size = 1000000;
  unsigned int seed = 0;
  int debug = 0;
  int loop = 0;
  int verbose = 0;

  if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
      checkCmdLineFlag(argc, (const char **)argv, "h")) {
    usage();
    printf("&&&& cdpAdvancedQuicksort WAIVED\n");
    exit(EXIT_WAIVED);
  }

  if (checkCmdLineFlag(argc, (const char **)argv, "size")) {
    size = getCmdLineArgumentInt(argc, (const char **)argv, "size");
  }

  if (checkCmdLineFlag(argc, (const char **)argv, "seed")) {
    seed = getCmdLineArgumentInt(argc, (const char **)argv, "seed");
  }

  if (checkCmdLineFlag(argc, (const char **)argv, "loop-step")) {
    loop = getCmdLineArgumentInt(argc, (const char **)argv, "loop-step");
  }

  if (checkCmdLineFlag(argc, (const char **)argv, "debug")) {
    debug = 1;
  }

  if (checkCmdLineFlag(argc, (const char **)argv, "verbose")) {
    verbose = 1;
  }

  // Get device properties
  int cuda_device = findCudaDevice(argc, (const char **)argv);
  cudaDeviceProp properties;
  checkCudaErrors(cudaGetDeviceProperties(&properties, cuda_device));
  int cdpCapable =
      (properties.major == 3 && properties.minor >= 5) || properties.major >= 4;

  printf("GPU device %s has compute capability SM %d.%d\n", properties.name,
         properties.major, properties.minor);

  if (!cdpCapable) {
    printf(
        "cdpAdvancedQuicksort requires SM 3.5 or higher to use CUDA Dynamic "
        "Parallelism. Exiting...\n");
    exit(EXIT_WAIVED);
  }

  printf("Running qsort on %d elements with seed %u, on %s\n", size, seed,
         properties.name);

  run_qsort(size, seed, debug, loop, verbose);

  exit(EXIT_SUCCESS);
}
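
// Example invocations (editorial note, not part of the original sample; the
// binary name follows the sample's convention and may differ per build):
//   ./cdpAdvancedQuicksort -size=100000 -seed=100 -verbose
//   ./cdpAdvancedQuicksort -size=1024 -loop-step=128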