/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

// This sample demonstrates Instantiated CUDA Graph Update
// with Jacobi Iterative Method in 3 different methods:
// 1 - JacobiMethodGpuCudaGraphExecKernelSetParams() - CUDA Graph with
// cudaGraphExecKernelNodeSetParams() 2 - JacobiMethodGpuCudaGraphExecUpdate() -
// CUDA Graph with cudaGraphExecUpdate() 3 - JacobiMethodGpu() - Non CUDA Graph
// method

// Jacobi method on a linear system A*x = b,
// where A is diagonally dominant and the exact solution consists
// of all ones.
// The dimension N_ROWS is included in jacobi.h

#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <helper_timer.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include "jacobi.h"

// Run the Jacobi method for A*x = b on GPU with CUDA Graph -
// cudaGraphExecKernelNodeSetParams().
extern double JacobiMethodGpuCudaGraphExecKernelSetParams(const float  *A,
                                                          const double *b,
                                                          const float   conv_threshold,
                                                          const int     max_iter,
                                                          double       *x,
                                                          double       *x_new,
                                                          cudaStream_t  stream);

// Run the Jacobi method for A*x = b on GPU with Instantiated CUDA Graph Update
// API - cudaGraphExecUpdate().
extern double JacobiMethodGpuCudaGraphExecUpdate(const float  *A,
                                                 const double *b,
                                                 const float   conv_threshold,
                                                 const int     max_iter,
                                                 double       *x,
                                                 double       *x_new,
                                                 cudaStream_t  stream);

// Run the Jacobi method for A*x = b on GPU without CUDA Graph.
extern double JacobiMethodGpu(const float  *A,
                              const double *b,
                              const float   conv_threshold,
                              const int     max_iter,
                              double       *x,
                              double       *x_new,
                              cudaStream_t  stream);

// creates N_ROWS x N_ROWS matrix A with N_ROWS+1 on the diagonal and 1
// elsewhere. The elements of the right hand side b all equal 2*n, hence the
// exact solution x to A*x = b is a vector of ones.
void createLinearSystem(float *A, double *b);

// Run the Jacobi method for A*x = b on CPU.
void JacobiMethodCPU(float *A, double *b, float conv_threshold, int max_iter, int *numit, double *x);

int main(int argc, char **argv)
{
    if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
        printf("Command line: jacobiCudaGraphs [-option]\n");
        printf("Valid options:\n");
        printf("-gpumethod=<0,1 or 2>  : 0 - [Default] "
               "JacobiMethodGpuCudaGraphExecKernelSetParams\n");
        printf("                       : 1 - JacobiMethodGpuCudaGraphExecUpdate\n");
        printf("                       : 2 - JacobiMethodGpu - Non CUDA Graph\n");
        printf("-device=device_num     : cuda device id");
        printf("-help         : Output a help message\n");
        exit(EXIT_SUCCESS);
    }

    int gpumethod = 0;
    if (checkCmdLineFlag(argc, (const char **)argv, "gpumethod")) {
        gpumethod = getCmdLineArgumentInt(argc, (const char **)argv, "gpumethod");

        if (gpumethod < 0 || gpumethod > 2) {
            printf("Error: gpumethod must be 0 or 1 or 2, gpumethod=%d is invalid\n", gpumethod);
            exit(EXIT_SUCCESS);
        }
    }

    int dev = findCudaDevice(argc, (const char **)argv);

    double *b = NULL;
    float  *A = NULL;
    checkCudaErrors(cudaMallocHost(&b, N_ROWS * sizeof(double)));
    memset(b, 0, N_ROWS * sizeof(double));
    checkCudaErrors(cudaMallocHost(&A, N_ROWS * N_ROWS * sizeof(float)));
    memset(A, 0, N_ROWS * N_ROWS * sizeof(float));

    createLinearSystem(A, b);
    double *x = NULL;
    // start with array of all zeroes
    x = (double *)calloc(N_ROWS, sizeof(double));

    float conv_threshold = 1.0e-2;
    int   max_iter       = 4 * N_ROWS * N_ROWS;
    int   cnt            = 0;

    // create timer
    StopWatchInterface *timerCPU = NULL, *timerGpu = NULL;
    sdkCreateTimer(&timerCPU);

    sdkStartTimer(&timerCPU);
    JacobiMethodCPU(A, b, conv_threshold, max_iter, &cnt, x);

    double sum = 0.0;
    // Compute error
    for (int i = 0; i < N_ROWS; i++) {
        double d = x[i] - 1.0;
        sum += fabs(d);
    }
    sdkStopTimer(&timerCPU);
    printf("CPU iterations : %d\n", cnt);
    printf("CPU error : %.3e\n", sum);
    printf("CPU Processing time: %f (ms)\n", sdkGetTimerValue(&timerCPU));

    float       *d_A;
    double      *d_b, *d_x, *d_x_new;
    cudaStream_t stream1;
    checkCudaErrors(cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking));
    checkCudaErrors(cudaMalloc(&d_b, sizeof(double) * N_ROWS));
    checkCudaErrors(cudaMalloc(&d_A, sizeof(float) * N_ROWS * N_ROWS));
    checkCudaErrors(cudaMalloc(&d_x, sizeof(double) * N_ROWS));
    checkCudaErrors(cudaMalloc(&d_x_new, sizeof(double) * N_ROWS));

    checkCudaErrors(cudaMemsetAsync(d_x, 0, sizeof(double) * N_ROWS, stream1));
    checkCudaErrors(cudaMemsetAsync(d_x_new, 0, sizeof(double) * N_ROWS, stream1));
    checkCudaErrors(cudaMemcpyAsync(d_A, A, sizeof(float) * N_ROWS * N_ROWS, cudaMemcpyHostToDevice, stream1));
    checkCudaErrors(cudaMemcpyAsync(d_b, b, sizeof(double) * N_ROWS, cudaMemcpyHostToDevice, stream1));

    sdkCreateTimer(&timerGpu);
    sdkStartTimer(&timerGpu);

    double sumGPU = 0.0;
    if (gpumethod == 0) {
        sumGPU = JacobiMethodGpuCudaGraphExecKernelSetParams(d_A, d_b, conv_threshold, max_iter, d_x, d_x_new, stream1);
    }
    else if (gpumethod == 1) {
        sumGPU = JacobiMethodGpuCudaGraphExecUpdate(d_A, d_b, conv_threshold, max_iter, d_x, d_x_new, stream1);
    }
    else if (gpumethod == 2) {
        sumGPU = JacobiMethodGpu(d_A, d_b, conv_threshold, max_iter, d_x, d_x_new, stream1);
    }

    sdkStopTimer(&timerGpu);
    printf("GPU Processing time: %f (ms)\n", sdkGetTimerValue(&timerGpu));

    checkCudaErrors(cudaFree(d_b));
    checkCudaErrors(cudaFree(d_A));
    checkCudaErrors(cudaFree(d_x));
    checkCudaErrors(cudaFree(d_x_new));

    checkCudaErrors(cudaFreeHost(A));
    checkCudaErrors(cudaFreeHost(b));

    printf("&&&& jacobiCudaGraphs %s\n", (fabs(sum - sumGPU) < conv_threshold) ? "PASSED" : "FAILED");

    return (fabs(sum - sumGPU) < conv_threshold) ? EXIT_SUCCESS : EXIT_FAILURE;
}

void createLinearSystem(float *A, double *b)
{
    int i, j;
    for (i = 0; i < N_ROWS; i++) {
        b[i] = 2.0 * N_ROWS;
        for (j = 0; j < N_ROWS; j++)
            A[i * N_ROWS + j] = 1.0;
        A[i * N_ROWS + i] = N_ROWS + 1.0;
    }
}

void JacobiMethodCPU(float *A, double *b, float conv_threshold, int max_iter, int *num_iter, double *x)
{
    double *x_new;
    x_new = (double *)calloc(N_ROWS, sizeof(double));
    int k;

    for (k = 0; k < max_iter; k++) {
        double sum = 0.0;
        for (int i = 0; i < N_ROWS; i++) {
            double temp_dx = b[i];
            for (int j = 0; j < N_ROWS; j++)
                temp_dx -= A[i * N_ROWS + j] * x[j];
            temp_dx /= A[i * N_ROWS + i];
            x_new[i] += temp_dx;
            sum += fabs(temp_dx);
        }

        for (int i = 0; i < N_ROWS; i++)
            x[i] = x_new[i];

        if (sum <= conv_threshold)
            break;
    }
    *num_iter = k + 1;
    free(x_new);
}