/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #define MAX_DEPTH 16 #define INSERTION_SORT 32 //////////////////////////////////////////////////////////////////////////////// // Selection sort used when depth gets too big or the number of elements drops // below a threshold. //////////////////////////////////////////////////////////////////////////////// __device__ void selection_sort(unsigned int *data, int left, int right) { for (int i = left; i <= right; ++i) { unsigned min_val = data[i]; int min_idx = i; // Find the smallest value in the range [left, right]. for (int j = i + 1; j <= right; ++j) { unsigned val_j = data[j]; if (val_j < min_val) { min_idx = j; min_val = val_j; } } // Swap the values. if (i != min_idx) { data[min_idx] = data[i]; data[i] = min_val; } } } //////////////////////////////////////////////////////////////////////////////// // Very basic quicksort algorithm, recursively launching the next level. //////////////////////////////////////////////////////////////////////////////// __global__ void cdp_simple_quicksort(unsigned int *data, int left, int right, int depth) { // If we're too deep or there are few elements left, we use an insertion // sort... if (depth >= MAX_DEPTH || right - left <= INSERTION_SORT) { selection_sort(data, left, right); return; } unsigned int *lptr = data + left; unsigned int *rptr = data + right; unsigned int pivot = data[(left + right) / 2]; // Do the partitioning. while (lptr <= rptr) { // Find the next left- and right-hand values to swap unsigned int lval = *lptr; unsigned int rval = *rptr; // Move the left pointer as long as the pointed element is smaller than the // pivot. while (lval < pivot) { lptr++; lval = *lptr; } // Move the right pointer as long as the pointed element is larger than the // pivot. while (rval > pivot) { rptr--; rval = *rptr; } // If the swap points are valid, do the swap! if (lptr <= rptr) { *lptr++ = rval; *rptr-- = lval; } } // Now the recursive part int nright = rptr - data; int nleft = lptr - data; // Launch a new block to sort the left part. if (left < (rptr - data)) { cudaStream_t s; cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking); cdp_simple_quicksort<<<1, 1, 0, s>>>(data, left, nright, depth + 1); cudaStreamDestroy(s); } // Launch a new block to sort the right part. if ((lptr - data) < right) { cudaStream_t s1; cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking); cdp_simple_quicksort<<<1, 1, 0, s1>>>(data, nleft, right, depth + 1); cudaStreamDestroy(s1); } } //////////////////////////////////////////////////////////////////////////////// // Call the quicksort kernel from the host. //////////////////////////////////////////////////////////////////////////////// void run_qsort(unsigned int *data, unsigned int nitems) { // Prepare CDP for the max depth 'MAX_DEPTH'. checkCudaErrors(cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, MAX_DEPTH)); // Launch on device int left = 0; int right = nitems - 1; std::cout << "Launching kernel on the GPU" << std::endl; cdp_simple_quicksort<<<1, 1>>>(data, left, right, 0); checkCudaErrors(cudaDeviceSynchronize()); } //////////////////////////////////////////////////////////////////////////////// // Initialize data on the host. //////////////////////////////////////////////////////////////////////////////// void initialize_data(unsigned int *dst, unsigned int nitems) { // Fixed seed for illustration srand(2047); // Fill dst with random values for (unsigned i = 0; i < nitems; i++) dst[i] = rand() % nitems; } //////////////////////////////////////////////////////////////////////////////// // Verify the results. //////////////////////////////////////////////////////////////////////////////// void check_results(int n, unsigned int *results_d) { unsigned int *results_h = new unsigned[n]; checkCudaErrors(cudaMemcpy(results_h, results_d, n * sizeof(unsigned), cudaMemcpyDeviceToHost)); for (int i = 1; i < n; ++i) if (results_h[i - 1] > results_h[i]) { std::cout << "Invalid item[" << i - 1 << "]: " << results_h[i - 1] << " greater than " << results_h[i] << std::endl; exit(EXIT_FAILURE); } std::cout << "OK" << std::endl; delete[] results_h; } //////////////////////////////////////////////////////////////////////////////// // Main entry point. //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { int num_items = 128; bool verbose = false; if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "h")) { std::cerr << "Usage: " << argv[0] << " num_items=\twhere num_items is the number of " "items to sort" << std::endl; exit(EXIT_SUCCESS); } if (checkCmdLineFlag(argc, (const char **)argv, "v")) { verbose = true; } if (checkCmdLineFlag(argc, (const char **)argv, "num_items")) { num_items = getCmdLineArgumentInt(argc, (const char **)argv, "num_items"); if (num_items < 1) { std::cerr << "ERROR: num_items has to be greater than 1" << std::endl; exit(EXIT_FAILURE); } } // Find/set device and get device properties int device = -1; cudaDeviceProp deviceProp; device = findCudaDevice(argc, (const char **)argv); checkCudaErrors(cudaGetDeviceProperties(&deviceProp, device)); if (!(deviceProp.major > 3 || (deviceProp.major == 3 && deviceProp.minor >= 5))) { printf("GPU %d - %s does not support CUDA Dynamic Parallelism\n Exiting.", device, deviceProp.name); exit(EXIT_WAIVED); } // Create input data unsigned int *h_data = 0; unsigned int *d_data = 0; // Allocate CPU memory and initialize data. std::cout << "Initializing data:" << std::endl; h_data = (unsigned int *)malloc(num_items * sizeof(unsigned int)); initialize_data(h_data, num_items); if (verbose) { for (int i = 0; i < num_items; i++) std::cout << "Data [" << i << "]: " << h_data[i] << std::endl; } // Allocate GPU memory. checkCudaErrors( cudaMalloc((void **)&d_data, num_items * sizeof(unsigned int))); checkCudaErrors(cudaMemcpy(d_data, h_data, num_items * sizeof(unsigned int), cudaMemcpyHostToDevice)); // Execute std::cout << "Running quicksort on " << num_items << " elements" << std::endl; run_qsort(d_data, num_items); // Check result std::cout << "Validating results: "; check_results(num_items, d_data); free(h_data); checkCudaErrors(cudaFree(d_data)); exit(EXIT_SUCCESS); }