/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

////////////////////////////////////////////////////////////////////////////////
//
//  cdpAdvancedQuicksort.cu
//
//  Implementation of a parallel quicksort in CUDA. It comes in
//  several parts:
//
//  1. A small-set insertion sort. We do this on any set with <=32 elements
//  2. A partitioning kernel, which - given a pivot - separates an input
//     array into elements <=pivot, and >pivot. Two quicksorts will then
//     be launched to resolve each of these.
//  3. A quicksort co-ordinator, which figures out what kernels to launch
//     and when.
//
////////////////////////////////////////////////////////////////////////////////

#include <cooperative_groups.h>
#include <cstdio>
#include <cstdlib>

namespace cg = cooperative_groups;
#include <helper_cuda.h>
#include <helper_string.h>
#include "cdpQuicksort.h"

////////////////////////////////////////////////////////////////////////////////
// Inline PTX call to return index of highest non-zero bit in a word
////////////////////////////////////////////////////////////////////////////////
static __device__ __forceinline__ unsigned int __qsflo(unsigned int word) {
  unsigned int ret;
  asm volatile("bfind.u32 %0, %1;" : "=r"(ret) : "r"(word));
  return ret;
}

////////////////////////////////////////////////////////////////////////////////
//
//  ringbufAlloc
//
//  Allocates from a ringbuffer. Allows for not failing when we run out
//  of stack for tracking the offset counts for each sort subsection.
//
//  We use the atomicMax trick to allow out-of-order retirement. If we
//  hit the size limit on the ringbuffer, then we spin-wait for people
//  to complete.
//
////////////////////////////////////////////////////////////////////////////////
template <typename T>
static __device__ T *ringbufAlloc(qsortRingbuf *ringbuf) {
  // Wait for there to be space in the ring buffer. We'll retry only a fixed
  // number of times and then fail, to avoid an out-of-memory deadlock.
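  //
  // Worked example (illustrative numbers only): with stacksize == 8,
  // head == 10 and tail == 2, head - tail == 8, the buffer is full and we
  // spin here. Once ringbufFree() advances tail to 3, the next atomicAdd on
  // head hands out index 10, which wraps to slot 10 & (8 - 1) == 2. The
  // "index & (stacksize - 1)" wrap below relies on the stack size being a
  // power of two.
  //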
  unsigned int loop = 10000;
  while (((ringbuf->head - ringbuf->tail) >= ringbuf->stacksize) &&
         (--loop > 0))
    ;
  if (loop == 0) return NULL;

  // Note that the element includes a little index book-keeping, for freeing
  // later.
  unsigned int index = atomicAdd((unsigned int *)&ringbuf->head, 1);
  T *ret = (T *)(ringbuf->stackbase) + (index & (ringbuf->stacksize - 1));
  ret->index = index;

  return ret;
}

////////////////////////////////////////////////////////////////////////////////
//
//  ringbufFree
//
//  Releases an element from the ring buffer. If every element is released
//  up to and including this one, we can advance the tail to indicate that
//  space is now available.
//
////////////////////////////////////////////////////////////////////////////////
template <typename T>
static __device__ void ringbufFree(qsortRingbuf *ringbuf, T *data) {
  unsigned int index = data->index;  // Non-wrapped index to free
  unsigned int count = atomicAdd((unsigned int *)&(ringbuf->count), 1) + 1;
  unsigned int max = atomicMax((unsigned int *)&(ringbuf->max), index + 1);

  // Update the tail if need be. Note we update "max" to be the new value in
  // ringbuf->max
  if (max < (index + 1)) max = index + 1;

  if (max == count) atomicMax((unsigned int *)&(ringbuf->tail), count);
}

////////////////////////////////////////////////////////////////////////////////
//
//  qsort_warp
//
//  Simplest possible implementation, does a per-warp quicksort with no
//  inter-warp communication. This has a high atomic issue rate, but the
//  rest should actually be fairly quick because of low work per thread.
//
//  A warp finds its section of the data, then writes all data <=pivot to
//  one buffer and all data >pivot to the other. Atomics are used to get a
//  unique section of the buffer.
//
//  Obvious optimisation: do multiple chunks per warp, to increase in-flight
//  loads and cover the instruction overhead.
//
////////////////////////////////////////////////////////////////////////////////
__global__ void qsort_warp(unsigned *indata, unsigned *outdata,
                           unsigned int offset, unsigned int len,
                           qsortAtomicData *atomicData,
                           qsortRingbuf *atomicDataStack,
                           unsigned int source_is_indata, unsigned int depth) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  // Find my data offset, based on warp ID
  unsigned int thread_id = threadIdx.x + (blockIdx.x << QSORT_BLOCKSIZE_SHIFT);
  // unsigned int warp_id = threadIdx.x >> 5;   // Used for debug only
  unsigned int lane_id = threadIdx.x & (warpSize - 1);

  // Exit if I'm outside the range of sort to be done
  if (thread_id >= len) return;

  //
  // First part of the algorithm. Each warp counts the number of elements that
  // are greater/less than the pivot.
  //
  // When a warp knows its count, it updates an atomic counter.
  //

  // Read in the data and the pivot. Arbitrary pivot selection for now.
  unsigned pivot = indata[offset + len / 2];
  unsigned data = indata[offset + thread_id];

  // Count how many are <= and how many are > pivot.
  // If all are <= pivot then we adjust the comparison
  // because otherwise the sort will move nothing and
  // we'll iterate forever.
  cg::coalesced_group active = cg::coalesced_threads();
  unsigned int greater = (data > pivot);
  unsigned int gt_mask = active.ballot(greater);

  if (gt_mask == 0) {
    greater = (data >= pivot);
    gt_mask = active.ballot(greater);  // Must re-ballot for adjusted comparator
  }

  unsigned int lt_mask = active.ballot(!greater);
  unsigned int gt_count = __popc(gt_mask);
  unsigned int lt_count = __popc(lt_mask);
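
  // Worked example (illustrative numbers only): for an 8-thread coalesced
  // group whose "greater" flags are 1,0,0,1,1,0,1,0 (lane 0 first), ballot()
  // packs them into gt_mask = 0b01011001, so gt_count = __popc(gt_mask) = 4
  // and lt_count = 4. The same masks are reused below with %lanemask_lt to
  // give each lane its write slot without a shared-memory scan.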
  // Atomically adjust the lt_ and gt_offsets by this amount. Only one thread
  // need do this. Share the result using shfl.
  unsigned int lt_offset, gt_offset;

  if (lane_id == 0) {
    if (lt_count > 0)
      lt_offset = atomicAdd((unsigned int *)&atomicData->lt_offset, lt_count);

    if (gt_count > 0)
      gt_offset =
          len -
          (atomicAdd((unsigned int *)&atomicData->gt_offset, gt_count) +
           gt_count);
  }

  lt_offset =
      active.shfl((int)lt_offset, 0);  // Everyone pulls the offsets from lane 0
  gt_offset = active.shfl((int)gt_offset, 0);

  // Now compute my own personal offset within this. I need to know how many
  // threads with a lane ID less than mine are going to write to the same
  // buffer as me. We can use popc to implement a single-operation warp scan
  // in this case.
  unsigned lane_mask_lt;
  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lane_mask_lt));
  unsigned int my_mask = greater ? gt_mask : lt_mask;
  unsigned int my_offset = __popc(my_mask & lane_mask_lt);

  // Move data.
  my_offset += greater ? gt_offset : lt_offset;
  outdata[offset + my_offset] = data;

  // Count up if we're the last warp in. If so, then Kepler will launch the
  // next set of sorts directly from here.
  if (lane_id == 0) {
    // Count "elements written". If I wrote the last one, then trigger the next
    // qsorts
    unsigned int mycount = lt_count + gt_count;

    if (atomicAdd((unsigned int *)&atomicData->sorted_count, mycount) +
            mycount ==
        len) {
      // We're the last warp to do any sorting. Therefore it's up to us to
      // launch the next stage.
      unsigned int lt_len = atomicData->lt_offset;
      unsigned int gt_len = atomicData->gt_offset;
      cudaStream_t lstream, rstream;
      cudaStreamCreateWithFlags(&lstream, cudaStreamNonBlocking);
      cudaStreamCreateWithFlags(&rstream, cudaStreamNonBlocking);

      // Begin by freeing our atomicData storage. It's better for the
      // ringbuffer algorithm if we free when we're done, rather than
      // re-using (makes for less fragmentation).
      ringbufFree<qsortAtomicData>(atomicDataStack, atomicData);

      // Exceptional case: if "lt_len" is zero, then all values in the batch
      // are equal. We are then done (may need to copy into correct buffer,
      // though)
      if (lt_len == 0) {
        if (source_is_indata)
          cudaMemcpyAsync(indata + offset, outdata + offset,
                          gt_len * sizeof(unsigned), cudaMemcpyDeviceToDevice,
                          lstream);
        return;
      }

      // Start with lower half first
      if (lt_len > BITONICSORT_LEN) {
        // If we've exceeded maximum depth, fall through to backup
        // big_bitonicsort
        if (depth >= QSORT_MAXDEPTH) {
          // The final bitonic stage sorts in-place in "outdata". We therefore
          // re-use "indata" as the out-of-range tracking buffer. For (2^n)+1
          // elements we need (2^(n+1)) bytes of oor buffer. The backup qsort
          // buffer is at least this large when sizeof(QTYPE) >= 2.
          big_bitonicsort<<<1, BITONICSORT_LEN, 0, lstream>>>(
              outdata, source_is_indata ? indata : outdata, indata, offset,
              lt_len);
        } else {
          // Launch another quicksort. We need to allocate more storage for the
          // atomic data.
          if ((atomicData = ringbufAlloc<qsortAtomicData>(atomicDataStack)) ==
              NULL)
            printf("Stack-allocation error. Failing left child launch.\n");
          else {
            atomicData->lt_offset = atomicData->gt_offset =
                atomicData->sorted_count = 0;
            unsigned int numblocks =
                (unsigned int)(lt_len + (QSORT_BLOCKSIZE - 1)) >>
                QSORT_BLOCKSIZE_SHIFT;
            qsort_warp<<<numblocks, QSORT_BLOCKSIZE, 0, lstream>>>(
                outdata, indata, offset, lt_len, atomicData, atomicDataStack,
                !source_is_indata, depth + 1);
          }
        }
      } else if (lt_len > 1) {
        // Final stage uses a bitonic sort instead. It's important to
        // make sure the final stage ends up in the correct (original) buffer.
        // We launch the smallest power-of-2 number of threads that we can.
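        //
        // Illustration: __qsflo() maps to PTX bfind.u32, the index of the
        // highest set bit. For lt_len = 300, __qsflo(299) = 8, so
        // bitonic_len = 1 << 9 = 512, the smallest power of two >= 300.
        // An exact power of two stays put: lt_len = 256 gives
        // __qsflo(255) = 7 and bitonic_len = 256.
        //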
        unsigned int bitonic_len = 1 << (__qsflo(lt_len - 1U) + 1);
        bitonicsort<<<1, bitonic_len, 0, lstream>>>(
            outdata, source_is_indata ? indata : outdata, offset, lt_len);
      }
      // Finally, if we sorted just one single element, we must still make
      // sure that it winds up in the correct place.
      else if (source_is_indata && (lt_len == 1))
        indata[offset] = outdata[offset];

      if (cudaPeekAtLastError() != cudaSuccess)
        printf("Left-side launch fail: %s\n",
               cudaGetErrorString(cudaGetLastError()));

      // Now the upper half.
      if (gt_len > BITONICSORT_LEN) {
        // If we've exceeded maximum depth, fall through to backup
        // big_bitonicsort
        if (depth >= QSORT_MAXDEPTH)
          big_bitonicsort<<<1, BITONICSORT_LEN, 0, rstream>>>(
              outdata, source_is_indata ? indata : outdata, indata,
              offset + lt_len, gt_len);
        else {
          // Allocate new atomic storage for this launch
          if ((atomicData = ringbufAlloc<qsortAtomicData>(atomicDataStack)) ==
              NULL)
            printf("Stack allocation error! Failing right-side launch.\n");
          else {
            atomicData->lt_offset = atomicData->gt_offset =
                atomicData->sorted_count = 0;
            unsigned int numblocks =
                (unsigned int)(gt_len + (QSORT_BLOCKSIZE - 1)) >>
                QSORT_BLOCKSIZE_SHIFT;
            qsort_warp<<<numblocks, QSORT_BLOCKSIZE, 0, rstream>>>(
                outdata, indata, offset + lt_len, gt_len, atomicData,
                atomicDataStack, !source_is_indata, depth + 1);
          }
        }
      } else if (gt_len > 1) {
        unsigned int bitonic_len = 1 << (__qsflo(gt_len - 1U) + 1);
        bitonicsort<<<1, bitonic_len, 0, rstream>>>(
            outdata, source_is_indata ? indata : outdata, offset + lt_len,
            gt_len);
      } else if (source_is_indata && (gt_len == 1))
        indata[offset + lt_len] = outdata[offset + lt_len];

      if (cudaPeekAtLastError() != cudaSuccess)
        printf("Right-side launch fail: %s\n",
               cudaGetErrorString(cudaGetLastError()));
    }
  }
}

////////////////////////////////////////////////////////////////////////////////
//
//  run_quicksort_cdp
//
//  Host-side code to run the Kepler version of quicksort. It's pretty
//  simple, because all launch control is handled on the device via CDP.
//
//  All parallel quicksorts require an equal-sized scratch buffer. This
//  must be passed in ahead of time.
//
//  Returns the time elapsed for the sort.
//
////////////////////////////////////////////////////////////////////////////////
float run_quicksort_cdp(unsigned *gpudata, unsigned *scratchdata,
                        unsigned int count, cudaStream_t stream) {
  unsigned int stacksize = QSORT_STACK_ELEMS;

  // This is the stack, for atomic tracking of each sort's status
  qsortAtomicData *gpustack;
  checkCudaErrors(
      cudaMalloc((void **)&gpustack, stacksize * sizeof(qsortAtomicData)));
  checkCudaErrors(cudaMemset(
      gpustack, 0, sizeof(qsortAtomicData)));  // Only need set first entry to 0

  // Create the memory ringbuffer used for handling the stack.
  // Initialise everything to where it needs to be.
  qsortRingbuf buf;
  qsortRingbuf *ringbuf;
  checkCudaErrors(cudaMalloc((void **)&ringbuf, sizeof(qsortRingbuf)));
  buf.head = 1;  // We start with one allocation
  buf.tail = 0;
  buf.count = 0;
  buf.max = 0;
  buf.stacksize = stacksize;
  buf.stackbase = gpustack;
  checkCudaErrors(
      cudaMemcpy(ringbuf, &buf, sizeof(buf), cudaMemcpyHostToDevice));
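
  // Head starts at 1 rather than 0 because the first qsortAtomicData slot -
  // gpustack[0], zeroed by the cudaMemset above - is handed straight to the
  // initial qsort_warp launch below instead of going through ringbufAlloc().
  // Every device-side free, including the one for that first slot, advances
  // the tail, so a clean run should end with head == tail, which is what the
  // sanity check after the sort verifies.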

  // Timing events...
  cudaEvent_t ev1, ev2;
  checkCudaErrors(cudaEventCreate(&ev1));
  checkCudaErrors(cudaEventCreate(&ev2));
  checkCudaErrors(cudaEventRecord(ev1));

  // Now we trivially launch the qsort kernel
  if (count > BITONICSORT_LEN) {
    unsigned int numblocks =
        (unsigned int)(count + (QSORT_BLOCKSIZE - 1)) >> QSORT_BLOCKSIZE_SHIFT;
    qsort_warp<<<numblocks, QSORT_BLOCKSIZE, 0, stream>>>(
        gpudata, scratchdata, 0U, count, gpustack, ringbuf, true, 0);
  } else {
    bitonicsort<<<1, BITONICSORT_LEN>>>(gpudata, gpudata, 0, count);
  }

  checkCudaErrors(cudaGetLastError());
  checkCudaErrors(cudaEventRecord(ev2));
  checkCudaErrors(cudaDeviceSynchronize());

  float elapse = 0.0f;

  if (cudaPeekAtLastError() != cudaSuccess)
    printf("Launch failure: %s\n", cudaGetErrorString(cudaGetLastError()));
  else
    checkCudaErrors(cudaEventElapsedTime(&elapse, ev1, ev2));

  // Sanity check that the stack allocator is doing the right thing
  checkCudaErrors(
      cudaMemcpy(&buf, ringbuf, sizeof(*ringbuf), cudaMemcpyDeviceToHost));

  if (count > BITONICSORT_LEN && buf.head != buf.tail) {
    printf("Stack allocation error!\nRingbuf:\n");
    printf("\t head = %u\n", buf.head);
    printf("\t tail = %u\n", buf.tail);
    printf("\tcount = %u\n", buf.count);
    printf("\t max = %u\n", buf.max);
  }

  // Release our stack data once we're done
  checkCudaErrors(cudaFree(ringbuf));
  checkCudaErrors(cudaFree(gpustack));

  return elapse;
}

////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
int run_qsort(unsigned int size, int seed, int debug, int loop, int verbose) {
  if (seed > 0) srand(seed);

  // Create and set up our test
  unsigned *gpudata, *scratchdata;
  checkCudaErrors(cudaMalloc((void **)&gpudata, size * sizeof(unsigned)));
  checkCudaErrors(cudaMalloc((void **)&scratchdata, size * sizeof(unsigned)));

  // Create CPU data.
  unsigned *data = new unsigned[size];
  unsigned int min = loop ? loop : size;
  unsigned int max = size;
  loop = (loop == 0) ? 1 : loop;

  for (size = min; size <= max; size += loop) {
    if (verbose) printf(" Input: ");

    for (unsigned int i = 0; i < size; i++) {
      // Build data 8 bits at a time
      data[i] = 0;
      char *ptr = (char *)&(data[i]);

      for (unsigned j = 0; j < sizeof(unsigned); j++) {
        // Easy-to-read data in debug mode
        if (debug) {
          *ptr++ = (char)(rand() % 10);
          break;
        }

        *ptr++ = (char)(rand() & 255);
      }

      if (verbose) {
        if (i && !(i % 32)) printf("\n ");
        printf("%u ", data[i]);
      }
    }

    if (verbose) printf("\n");

    checkCudaErrors(cudaMemcpy(gpudata, data, size * sizeof(unsigned),
                               cudaMemcpyHostToDevice));

    // So we're now populated and ready to go! We size our launch as
    // blocks of up to BLOCKSIZE threads, and appropriate grid size.
    // One thread is launched per element.
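    //
    // For illustration: if QSORT_BLOCKSIZE were 512 (i.e. a shift of 9; the
    // real value comes from cdpQuicksort.h), a 1,000,000-element sort would
    // launch (1000000 + 511) >> 9 = 1954 blocks from run_quicksort_cdp().
    //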
    float elapse;
    elapse = run_quicksort_cdp(gpudata, scratchdata, size, NULL);
    // run_bitonicsort(gpudata, scratchdata, size, verbose);
    checkCudaErrors(cudaDeviceSynchronize());

    // Copy back the data and verify correct sort
    checkCudaErrors(cudaMemcpy(data, gpudata, size * sizeof(unsigned),
                               cudaMemcpyDeviceToHost));

    if (verbose) {
      printf("Output: ");

      for (unsigned int i = 0; i < size; i++) {
        if (i && !(i % 32)) printf("\n ");
        printf("%u ", data[i]);
      }

      printf("\n");
    }

    unsigned int check;
    for (check = 1; check < size; check++) {
      if (data[check] < data[check - 1]) {
        printf("FAILED at element: %d\n", check);
        break;
      }
    }

    if (check != size) {
      printf(" cdpAdvancedQuicksort FAILED\n");
      exit(EXIT_FAILURE);
    } else
      printf(" cdpAdvancedQuicksort PASSED\n");

    // Display the time between event recordings
    printf("Sorted %u elems in %.3f ms (%.3f Melems/sec)\n", size, elapse,
           (float)size / (elapse * 1000.0f));
    fflush(stdout);
  }

  // Release everything and we're done
  checkCudaErrors(cudaFree(scratchdata));
  checkCudaErrors(cudaFree(gpudata));
  delete[] data;

  return 0;
}

static void usage() {
  printf(
      "Syntax: cdpAdvancedQuicksort [-size=<num>] [-seed=<num>] [-debug] "
      "[-loop-step=<num>] [-verbose]\n");
  printf(
      "If loop_step is non-zero, will run from 1->array_len in steps of "
      "loop_step\n");
}

// Host side entry
int main(int argc, char *argv[]) {
  int size = 1000000;
  unsigned int seed = 0;
  int debug = 0;
  int loop = 0;
  int verbose = 0;

  if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
      checkCmdLineFlag(argc, (const char **)argv, "h")) {
    usage();
    printf("&&&& cdpAdvancedQuicksort WAIVED\n");
    exit(EXIT_WAIVED);
  }

  if (checkCmdLineFlag(argc, (const char **)argv, "size")) {
    size = getCmdLineArgumentInt(argc, (const char **)argv, "size");
  }

  if (checkCmdLineFlag(argc, (const char **)argv, "seed")) {
    seed = getCmdLineArgumentInt(argc, (const char **)argv, "seed");
  }

  if (checkCmdLineFlag(argc, (const char **)argv, "loop-step")) {
    loop = getCmdLineArgumentInt(argc, (const char **)argv, "loop-step");
  }

  if (checkCmdLineFlag(argc, (const char **)argv, "debug")) {
    debug = 1;
  }

  if (checkCmdLineFlag(argc, (const char **)argv, "verbose")) {
    verbose = 1;
  }

  // Get device properties
  int cuda_device = findCudaDevice(argc, (const char **)argv);
  cudaDeviceProp properties;
  checkCudaErrors(cudaGetDeviceProperties(&properties, cuda_device));
  int cdpCapable =
      (properties.major == 3 && properties.minor >= 5) || properties.major >= 4;

  printf("GPU device %s has compute capabilities (SM %d.%d)\n",
         properties.name, properties.major, properties.minor);

  if (!cdpCapable) {
    printf(
        "cdpAdvancedQuicksort requires SM 3.5 or higher to use CUDA Dynamic "
        "Parallelism. Exiting...\n");
    exit(EXIT_WAIVED);
  }

  printf("Running qsort on %d elements with seed %d, on %s\n", size, seed,
         properties.name);

  run_qsort(size, seed, debug, loop, verbose);

  exit(EXIT_SUCCESS);
}
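
////////////////////////////////////////////////////////////////////////////////
//
//  Build/run sketch (not part of the sample's build system; the architecture,
//  include path, and companion translation unit are placeholders taken from a
//  typical CUDA samples layout). CUDA Dynamic Parallelism requires relocatable
//  device code and the device runtime library, e.g.:
//
//      nvcc -arch=sm_70 -rdc=true -I<path-to-samples>/Common \
//           cdpAdvancedQuicksort.cu cdpBitonicSort.cu \
//           -o cdpAdvancedQuicksort -lcudadevrt
//
//      ./cdpAdvancedQuicksort -size=100000 -verbose
//
////////////////////////////////////////////////////////////////////////////////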