/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // This sample demonstrates Instantiated CUDA Graph Update // with Jacobi Iterative Method in 3 different methods: // 1 - JacobiMethodGpuCudaGraphExecKernelSetParams() - CUDA Graph with // cudaGraphExecKernelNodeSetParams() 2 - JacobiMethodGpuCudaGraphExecUpdate() - // CUDA Graph with cudaGraphExecUpdate() 3 - JacobiMethodGpu() - Non CUDA Graph // method // Jacobi method on a linear system A*x = b, // where A is diagonally dominant and the exact solution consists // of all ones. // The dimension N_ROWS is included in jacobi.h #include #include #include #include #include #include #include "jacobi.h" // Run the Jacobi method for A*x = b on GPU with CUDA Graph - // cudaGraphExecKernelNodeSetParams(). extern double JacobiMethodGpuCudaGraphExecKernelSetParams( const float *A, const double *b, const float conv_threshold, const int max_iter, double *x, double *x_new, cudaStream_t stream); // Run the Jacobi method for A*x = b on GPU with Instantiated CUDA Graph Update // API - cudaGraphExecUpdate(). extern double JacobiMethodGpuCudaGraphExecUpdate( const float *A, const double *b, const float conv_threshold, const int max_iter, double *x, double *x_new, cudaStream_t stream); // Run the Jacobi method for A*x = b on GPU without CUDA Graph. extern double JacobiMethodGpu(const float *A, const double *b, const float conv_threshold, const int max_iter, double *x, double *x_new, cudaStream_t stream); // creates N_ROWS x N_ROWS matrix A with N_ROWS+1 on the diagonal and 1 // elsewhere. The elements of the right hand side b all equal 2*n, hence the // exact solution x to A*x = b is a vector of ones. void createLinearSystem(float *A, double *b); // Run the Jacobi method for A*x = b on CPU. void JacobiMethodCPU(float *A, double *b, float conv_threshold, int max_iter, int *numit, double *x); int main(int argc, char **argv) { if (checkCmdLineFlag(argc, (const char **)argv, "help")) { printf("Command line: jacobiCudaGraphs [-option]\n"); printf("Valid options:\n"); printf( "-gpumethod=<0,1 or 2> : 0 - [Default] " "JacobiMethodGpuCudaGraphExecKernelSetParams\n"); printf(" : 1 - JacobiMethodGpuCudaGraphExecUpdate\n"); printf(" : 2 - JacobiMethodGpu - Non CUDA Graph\n"); printf("-device=device_num : cuda device id"); printf("-help : Output a help message\n"); exit(EXIT_SUCCESS); } int gpumethod = 0; if (checkCmdLineFlag(argc, (const char **)argv, "gpumethod")) { gpumethod = getCmdLineArgumentInt(argc, (const char **)argv, "gpumethod"); if (gpumethod < 0 || gpumethod > 2) { printf("Error: gpumethod must be 0 or 1 or 2, gpumethod=%d is invalid\n", gpumethod); exit(EXIT_SUCCESS); } } int dev = findCudaDevice(argc, (const char **)argv); double *b = NULL; float *A = NULL; checkCudaErrors(cudaMallocHost(&b, N_ROWS * sizeof(double))); memset(b, 0, N_ROWS * sizeof(double)); checkCudaErrors(cudaMallocHost(&A, N_ROWS * N_ROWS * sizeof(float))); memset(A, 0, N_ROWS * N_ROWS * sizeof(float)); createLinearSystem(A, b); double *x = NULL; // start with array of all zeroes x = (double *)calloc(N_ROWS, sizeof(double)); float conv_threshold = 1.0e-2; int max_iter = 4 * N_ROWS * N_ROWS; int cnt = 0; // create timer StopWatchInterface *timerCPU = NULL, *timerGpu = NULL; sdkCreateTimer(&timerCPU); sdkStartTimer(&timerCPU); JacobiMethodCPU(A, b, conv_threshold, max_iter, &cnt, x); double sum = 0.0; // Compute error for (int i = 0; i < N_ROWS; i++) { double d = x[i] - 1.0; sum += fabs(d); } sdkStopTimer(&timerCPU); printf("CPU iterations : %d\n", cnt); printf("CPU error : %.3e\n", sum); printf("CPU Processing time: %f (ms)\n", sdkGetTimerValue(&timerCPU)); float *d_A; double *d_b, *d_x, *d_x_new; cudaStream_t stream1; checkCudaErrors(cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking)); checkCudaErrors(cudaMalloc(&d_b, sizeof(double) * N_ROWS)); checkCudaErrors(cudaMalloc(&d_A, sizeof(float) * N_ROWS * N_ROWS)); checkCudaErrors(cudaMalloc(&d_x, sizeof(double) * N_ROWS)); checkCudaErrors(cudaMalloc(&d_x_new, sizeof(double) * N_ROWS)); checkCudaErrors(cudaMemsetAsync(d_x, 0, sizeof(double) * N_ROWS, stream1)); checkCudaErrors( cudaMemsetAsync(d_x_new, 0, sizeof(double) * N_ROWS, stream1)); checkCudaErrors(cudaMemcpyAsync(d_A, A, sizeof(float) * N_ROWS * N_ROWS, cudaMemcpyHostToDevice, stream1)); checkCudaErrors(cudaMemcpyAsync(d_b, b, sizeof(double) * N_ROWS, cudaMemcpyHostToDevice, stream1)); sdkCreateTimer(&timerGpu); sdkStartTimer(&timerGpu); double sumGPU = 0.0; if (gpumethod == 0) { sumGPU = JacobiMethodGpuCudaGraphExecKernelSetParams( d_A, d_b, conv_threshold, max_iter, d_x, d_x_new, stream1); } else if (gpumethod == 1) { sumGPU = JacobiMethodGpuCudaGraphExecUpdate( d_A, d_b, conv_threshold, max_iter, d_x, d_x_new, stream1); } else if (gpumethod == 2) { sumGPU = JacobiMethodGpu(d_A, d_b, conv_threshold, max_iter, d_x, d_x_new, stream1); } sdkStopTimer(&timerGpu); printf("GPU Processing time: %f (ms)\n", sdkGetTimerValue(&timerGpu)); checkCudaErrors(cudaFree(d_b)); checkCudaErrors(cudaFree(d_A)); checkCudaErrors(cudaFree(d_x)); checkCudaErrors(cudaFree(d_x_new)); checkCudaErrors(cudaFreeHost(A)); checkCudaErrors(cudaFreeHost(b)); printf("&&&& jacobiCudaGraphs %s\n", (fabs(sum - sumGPU) < conv_threshold) ? "PASSED" : "FAILED"); return (fabs(sum - sumGPU) < conv_threshold) ? EXIT_SUCCESS : EXIT_FAILURE; } void createLinearSystem(float *A, double *b) { int i, j; for (i = 0; i < N_ROWS; i++) { b[i] = 2.0 * N_ROWS; for (j = 0; j < N_ROWS; j++) A[i * N_ROWS + j] = 1.0; A[i * N_ROWS + i] = N_ROWS + 1.0; } } void JacobiMethodCPU(float *A, double *b, float conv_threshold, int max_iter, int *num_iter, double *x) { double *x_new; x_new = (double *)calloc(N_ROWS, sizeof(double)); int k; for (k = 0; k < max_iter; k++) { double sum = 0.0; for (int i = 0; i < N_ROWS; i++) { double temp_dx = b[i]; for (int j = 0; j < N_ROWS; j++) temp_dx -= A[i * N_ROWS + j] * x[j]; temp_dx /= A[i * N_ROWS + i]; x_new[i] += temp_dx; sum += fabs(temp_dx); } for (int i = 0; i < N_ROWS; i++) x[i] = x_new[i]; if (sum <= conv_threshold) break; } *num_iter = k + 1; free(x_new); }