cuda-samples/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelCholesky/cuSolverSp_LowlevelCholesky.cpp

/*
 * Copyright 2015 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>

#include "cusolverSp.h"

#include "cusolverSp_LOWLEVEL_PREVIEW.h"

#include <cuda_runtime.h>

#include "helper_cuda.h"
#include "helper_cusolver.h"

template <typename T_ELEM>
int loadMMSparseMatrix(
    char *filename,
    char elem_type,
    bool csrFormat,
    int *m,
    int *n,
    int *nnz,
    T_ELEM **aVal,
    int **aRowInd,
    int **aColInd,
    int extendSymMatrix);

void UsageSP(void)
{
    printf( "<options>\n");
    printf( "-h          : display this help\n");
    printf( "-file=<filename> : filename containing a matrix in MM format\n");
    printf( "-device=<device_id> : <device_id> if want to run on specific GPU\n");

    exit( 0 );
}

void parseCommandLineArguments(int argc, char *argv[], struct testOpts &opts)
{
    memset(&opts, 0, sizeof(opts));

    if (checkCmdLineFlag(argc, (const char **)argv, "-h"))
    {
        UsageSP();
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "file"))
    {
        char *fileName = 0;
        getCmdLineArgumentString(argc, (const char **)argv, "file", &fileName);

        if (fileName)
        {
            opts.sparse_mat_filename = fileName;
        }
        else
        {
            printf("\nIncorrect filename passed to -file \n ");
            UsageSP();
        }
    }
}


int main (int argc, char *argv[])
{
    struct testOpts opts;
    cusolverSpHandle_t cusolverSpH = NULL; // reordering, permutation and 1st LU factorization
    cusparseHandle_t   cusparseH = NULL;   // residual evaluation
    cudaStream_t stream = NULL;
    cusparseMatDescr_t descrA = NULL; // A is a base-0 general matrix

    csrcholInfoHost_t h_info = NULL; // opaque info structure for LU with parital pivoting
    csrcholInfo_t d_info = NULL; // opaque info structure for LU with parital pivoting

    int rowsA = 0; // number of rows of A
    int colsA = 0; // number of columns of A
    int nnzA  = 0; // number of nonzeros of A
    int baseA = 0; // base index in CSR format

    // CSR(A) from I/O
    int *h_csrRowPtrA = NULL; // <int> n+1 
    int *h_csrColIndA = NULL; // <int> nnzA 
    double *h_csrValA = NULL; // <double> nnzA 

    double *h_x = NULL; // <double> n,  x = A \ b
    double *h_b = NULL; // <double> n, b = ones(m,1)
    double *h_r = NULL; // <double> n, r = b - A*x

    size_t size_internal = 0; 
    size_t size_chol = 0; // size of working space for csrlu
    void *buffer_cpu = NULL; // working space for Cholesky
    void *buffer_gpu = NULL; // working space for Cholesky

    int *d_csrRowPtrA = NULL; // <int> n+1
    int *d_csrColIndA = NULL; // <int> nnzA
    double *d_csrValA = NULL; // <double> nnzA
    double *d_x = NULL; // <double> n, x = A \ b 
    double *d_b = NULL; // <double> n, a copy of h_b
    double *d_r = NULL; // <double> n, r = b - A*x

    // the constants used in residual evaluation, r = b - A*x
    const double minus_one = -1.0;
    const double one = 1.0;
    // the constant used in cusolverSp
    // singularity is -1 if A is invertible under tol
    // tol determines the condition of singularity
    int singularity = 0; 
    const double tol = 1.e-14;

    double x_inf = 0.0; // |x|
    double r_inf = 0.0; // |r|
    double A_inf = 0.0; // |A|
    int errors = 0;

    parseCommandLineArguments(argc, argv, opts);

    findCudaDevice(argc, (const char **)argv);

    if (opts.sparse_mat_filename == NULL)
    {
        opts.sparse_mat_filename =  sdkFindFilePath("lap2D_5pt_n100.mtx", argv[0]);
        if (opts.sparse_mat_filename != NULL)
            printf("Using default input file [%s]\n", opts.sparse_mat_filename);
        else
            printf("Could not find lap2D_5pt_n100.mtx\n");
    }
    else
    {
        printf("Using input file [%s]\n", opts.sparse_mat_filename);
    }

    printf("step 1: read matrix market format\n");

    if (opts.sparse_mat_filename)
    {
        if (loadMMSparseMatrix<double>(opts.sparse_mat_filename, 'd', true , &rowsA, &colsA,
               &nnzA, &h_csrValA, &h_csrRowPtrA, &h_csrColIndA, true)) 
        {
            return 1;
        }
        baseA = h_csrRowPtrA[0]; // baseA = {0,1}
    }
    else
    {
        fprintf(stderr, "Error: input matrix is not provided\n");
        return 1;
    }

    if ( rowsA != colsA ) 
    {
        fprintf(stderr, "Error: only support square matrix\n");
        return 1;
    }

    printf("sparse matrix A is %d x %d with %d nonzeros, base=%d\n", rowsA, colsA, nnzA, baseA);

    checkCudaErrors(cusolverSpCreate(&cusolverSpH));

    checkCudaErrors(cusparseCreate(&cusparseH));

    checkCudaErrors(cudaStreamCreate(&stream));

    checkCudaErrors(cusolverSpSetStream(cusolverSpH, stream));

    checkCudaErrors(cusparseSetStream(cusparseH, stream));

    checkCudaErrors(cusparseCreateMatDescr(&descrA));

    checkCudaErrors(cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL));
    if (baseA)
    {
        checkCudaErrors(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE));
    }
    else
    {
        checkCudaErrors(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO));
    }

    h_x    = (double*)malloc(sizeof(double)*colsA);
    h_b    = (double*)malloc(sizeof(double)*rowsA);
    h_r    = (double*)malloc(sizeof(double)*rowsA);

    assert(NULL != h_x);
    assert(NULL != h_b);
    assert(NULL != h_r);

    checkCudaErrors(cudaMalloc((void **)&d_csrRowPtrA, sizeof(int)*(rowsA+1)));
    checkCudaErrors(cudaMalloc((void **)&d_csrColIndA, sizeof(int)*nnzA));
    checkCudaErrors(cudaMalloc((void **)&d_csrValA   , sizeof(double)*nnzA));
    checkCudaErrors(cudaMalloc((void **)&d_x, sizeof(double)*colsA));
    checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(double)*rowsA));
    checkCudaErrors(cudaMalloc((void **)&d_r, sizeof(double)*rowsA));

    for(int row = 0 ; row < rowsA ; row++)
    {
        h_b[row] = 1.0;
    }

    printf("step 2: create opaque info structure\n");
    checkCudaErrors(cusolverSpCreateCsrcholInfoHost(&h_info));

    printf("step 3: analyze chol(A) to know structure of L\n");
    checkCudaErrors(cusolverSpXcsrcholAnalysisHost(
        cusolverSpH, rowsA, nnzA,
        descrA, h_csrRowPtrA, h_csrColIndA,
        h_info));

    printf("step 4: workspace for chol(A)\n");
    checkCudaErrors(cusolverSpDcsrcholBufferInfoHost(
        cusolverSpH, rowsA, nnzA,
        descrA, h_csrValA, h_csrRowPtrA, h_csrColIndA,
        h_info,
        &size_internal,
        &size_chol));

    if (buffer_cpu) 
    {
        free(buffer_cpu); 
    }
    buffer_cpu = (void*)malloc(sizeof(char)*size_chol);
    assert(NULL != buffer_cpu);

    printf("step 5: compute A = L*L^T \n");
    checkCudaErrors(cusolverSpDcsrcholFactorHost(
        cusolverSpH, rowsA, nnzA,
        descrA, h_csrValA, h_csrRowPtrA, h_csrColIndA,
        h_info,
        buffer_cpu));

    printf("step 6: check if the matrix is singular \n");
    checkCudaErrors(cusolverSpDcsrcholZeroPivotHost(
        cusolverSpH, h_info, tol, &singularity));

    if ( 0 <= singularity)
    {
        fprintf(stderr, "Error: A is not invertible, singularity=%d\n", singularity);
        return 1;
    }

    printf("step 7: solve A*x = b \n");
    checkCudaErrors(cusolverSpDcsrcholSolveHost(
        cusolverSpH, rowsA, h_b, h_x, h_info, buffer_cpu));

    printf("step 8: evaluate residual r = b - A*x (result on CPU)\n");
    // use GPU gemv to compute r = b - A*x
    checkCudaErrors(cudaMemcpy(d_csrRowPtrA, h_csrRowPtrA, sizeof(int)*(rowsA+1), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_csrColIndA, h_csrColIndA, sizeof(int)*nnzA     , cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_csrValA   , h_csrValA   , sizeof(double)*nnzA  , cudaMemcpyHostToDevice));

    checkCudaErrors(cudaMemcpy(d_r, h_b, sizeof(double)*rowsA, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_x, h_x, sizeof(double)*colsA, cudaMemcpyHostToDevice));

    /* Wrap raw data into cuSPARSE generic API objects */
    cusparseSpMatDescr_t matA = NULL;
    if (baseA)
    {
        checkCudaErrors(cusparseCreateCsr(
        &matA, rowsA, colsA, nnzA, d_csrRowPtrA, d_csrColIndA, d_csrValA, CUSPARSE_INDEX_32I,
        CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ONE, CUDA_R_64F));
    }
    else
    {
        checkCudaErrors(cusparseCreateCsr(
        &matA, rowsA, colsA, nnzA, d_csrRowPtrA, d_csrColIndA, d_csrValA, CUSPARSE_INDEX_32I,
        CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F));
    }

    cusparseDnVecDescr_t vecx = NULL;
    checkCudaErrors(cusparseCreateDnVec(&vecx, colsA, d_x, CUDA_R_64F));
    cusparseDnVecDescr_t vecAx = NULL;
    checkCudaErrors(cusparseCreateDnVec(&vecAx, rowsA, d_r, CUDA_R_64F));

    /* Allocate workspace for cuSPARSE */
    size_t bufferSize = 0;
    checkCudaErrors(cusparseSpMV_bufferSize(
        cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE, &minus_one, matA, vecx,
        &one, vecAx, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize));
    void *buffer = NULL;
    checkCudaErrors(cudaMalloc(&buffer, bufferSize));

    checkCudaErrors(cusparseSpMV(
        cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE, &minus_one, matA, vecx,
        &one, vecAx, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, buffer));

    checkCudaErrors(cudaMemcpy(h_r, d_r, sizeof(double)*rowsA, cudaMemcpyDeviceToHost));

    x_inf = vec_norminf(colsA, h_x);
    r_inf = vec_norminf(rowsA, h_r);
    A_inf = csr_mat_norminf(rowsA, colsA, nnzA, descrA, h_csrValA, h_csrRowPtrA, h_csrColIndA);

    printf("(CPU) |b - A*x| = %E \n", r_inf);
    printf("(CPU) |A| = %E \n", A_inf);
    printf("(CPU) |x| = %E \n", x_inf);
    printf("(CPU) |b - A*x|/(|A|*|x|) = %E \n", r_inf/(A_inf * x_inf));

    printf("step 9: create opaque info structure\n");
    checkCudaErrors(cusolverSpCreateCsrcholInfo(&d_info));

    checkCudaErrors(cudaMemcpy(d_csrRowPtrA, h_csrRowPtrA, sizeof(int)*(rowsA+1), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_csrColIndA, h_csrColIndA, sizeof(int)*nnzA     , cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_csrValA   , h_csrValA   , sizeof(double)*nnzA  , cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_b, h_b, sizeof(double)*rowsA, cudaMemcpyHostToDevice));

    printf("step 10: analyze chol(A) to know structure of L\n");
    checkCudaErrors(cusolverSpXcsrcholAnalysis(
        cusolverSpH, rowsA, nnzA,
        descrA, d_csrRowPtrA, d_csrColIndA,
        d_info));

    printf("step 11: workspace for chol(A)\n");
    checkCudaErrors(cusolverSpDcsrcholBufferInfo(
        cusolverSpH, rowsA, nnzA,
        descrA, d_csrValA, d_csrRowPtrA, d_csrColIndA,
        d_info,
        &size_internal,
        &size_chol));

    if (buffer_gpu) {
        checkCudaErrors(cudaFree(buffer_gpu));
    }
    checkCudaErrors(cudaMalloc(&buffer_gpu, sizeof(char)*size_chol));

    printf("step 12: compute A = L*L^T \n");
    checkCudaErrors(cusolverSpDcsrcholFactor(
        cusolverSpH, rowsA, nnzA,
        descrA, d_csrValA, d_csrRowPtrA, d_csrColIndA,
        d_info,
        buffer_gpu));

    printf("step 13: check if the matrix is singular \n");
    checkCudaErrors(cusolverSpDcsrcholZeroPivot(
        cusolverSpH, d_info, tol, &singularity));

    if ( 0 <= singularity){
        fprintf(stderr, "Error: A is not invertible, singularity=%d\n", singularity);
        return 1;
    }

    printf("step 14: solve A*x = b \n");
    checkCudaErrors(cusolverSpDcsrcholSolve(
        cusolverSpH, rowsA, d_b, d_x, d_info, buffer_gpu));

    checkCudaErrors(cudaMemcpy(d_r, h_b, sizeof(double)*rowsA, cudaMemcpyHostToDevice));

    checkCudaErrors(cusparseSpMV(
        cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE, &minus_one, matA, vecx,
        &one, vecAx, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, buffer));

    checkCudaErrors(cudaMemcpy(h_r, d_r, sizeof(double)*rowsA, cudaMemcpyDeviceToHost));

    r_inf = vec_norminf(rowsA, h_r);

    printf("(GPU) |b - A*x| = %E \n", r_inf);
    printf("(GPU) |b - A*x|/(|A|*|x|) = %E \n", r_inf/(A_inf * x_inf));


    if (cusolverSpH) { checkCudaErrors(cusolverSpDestroy(cusolverSpH)); }
    if (cusparseH  ) { checkCudaErrors(cusparseDestroy(cusparseH)); }
    if (stream     ) { checkCudaErrors(cudaStreamDestroy(stream)); }
    if (descrA     ) { checkCudaErrors(cusparseDestroyMatDescr(descrA)); }
    if (h_info     ) { checkCudaErrors(cusolverSpDestroyCsrcholInfoHost(h_info)); }
    if (d_info     ) { checkCudaErrors(cusolverSpDestroyCsrcholInfo(d_info)); }
    if (matA       ) { checkCudaErrors(cusparseDestroySpMat(matA)); }
    if (vecx       ) { checkCudaErrors(cusparseDestroyDnVec(vecx)); }
    if (vecAx      ) { checkCudaErrors(cusparseDestroyDnVec(vecAx)); }

    if (h_csrValA   ) { free(h_csrValA); }
    if (h_csrRowPtrA) { free(h_csrRowPtrA); }
    if (h_csrColIndA) { free(h_csrColIndA); }

    if (h_x   ) { free(h_x); }
    if (h_b   ) { free(h_b); }
    if (h_r   ) { free(h_r); }

    if (buffer_cpu) { free(buffer_cpu); }
    if (buffer_gpu) { checkCudaErrors(cudaFree(buffer_gpu)); }

    if (d_csrValA   ) { checkCudaErrors(cudaFree(d_csrValA)); }
    if (d_csrRowPtrA) { checkCudaErrors(cudaFree(d_csrRowPtrA)); }
    if (d_csrColIndA) { checkCudaErrors(cudaFree(d_csrColIndA)); }
    if (d_x) { checkCudaErrors(cudaFree(d_x)); }
    if (d_b) { checkCudaErrors(cudaFree(d_b)); }
    if (d_r) { checkCudaErrors(cudaFree(d_r)); }

    return 0;
}
add and update samples for CUDA 11.5 2021-10-21 19:04:49 +08:00			`/*`
			`* Copyright 2015 NVIDIA Corporation. All rights reserved.`
			`*`
			`* Please refer to the NVIDIA end user license agreement (EULA) associated`
			`* with this source code for terms and conditions that govern your use of`
			`* this software. Any use, reproduction, disclosure, or distribution of`
			`* this software and related documentation outside the terms of the EULA`
			`* is strictly prohibited.`
			`*`
			`*/`

			`#include <stdio.h>`
			`#include <stdlib.h>`
			`#include <string.h>`
			`#include <ctype.h>`
			`#include <assert.h>`

			`#include "cusolverSp.h"`

			`#include "cusolverSp_LOWLEVEL_PREVIEW.h"`

			`#include <cuda_runtime.h>`

			`#include "helper_cuda.h"`
			`#include "helper_cusolver.h"`

			`template <typename T_ELEM>`
			`int loadMMSparseMatrix(`
			`char *filename,`
			`char elem_type,`
			`bool csrFormat,`
			`int *m,`
			`int *n,`
			`int *nnz,`
			`T_ELEM **aVal,`
			`int **aRowInd,`
			`int **aColInd,`
			`int extendSymMatrix);`

			`void UsageSP(void)`
			`{`
			`printf( "<options>\n");`
			`printf( "-h : display this help\n");`
			`printf( "-file=<filename> : filename containing a matrix in MM format\n");`
			`printf( "-device=<device_id> : <device_id> if want to run on specific GPU\n");`

			`exit( 0 );`
			`}`

			`void parseCommandLineArguments(int argc, char *argv[], struct testOpts &opts)`
			`{`
			`memset(&opts, 0, sizeof(opts));`

			`if (checkCmdLineFlag(argc, (const char **)argv, "-h"))`
			`{`
			`UsageSP();`
			`}`

			`if (checkCmdLineFlag(argc, (const char **)argv, "file"))`
			`{`
			`char *fileName = 0;`
			`getCmdLineArgumentString(argc, (const char **)argv, "file", &fileName);`

			`if (fileName)`
			`{`
			`opts.sparse_mat_filename = fileName;`
			`}`
			`else`
			`{`
			`printf("\nIncorrect filename passed to -file \n ");`
			`UsageSP();`
			`}`
			`}`
			`}`


			`int main (int argc, char *argv[])`
			`{`
			`struct testOpts opts;`
			`cusolverSpHandle_t cusolverSpH = NULL; // reordering, permutation and 1st LU factorization`
			`cusparseHandle_t cusparseH = NULL; // residual evaluation`
			`cudaStream_t stream = NULL;`
			`cusparseMatDescr_t descrA = NULL; // A is a base-0 general matrix`

			`csrcholInfoHost_t h_info = NULL; // opaque info structure for LU with parital pivoting`
			`csrcholInfo_t d_info = NULL; // opaque info structure for LU with parital pivoting`

			`int rowsA = 0; // number of rows of A`
			`int colsA = 0; // number of columns of A`
			`int nnzA = 0; // number of nonzeros of A`
			`int baseA = 0; // base index in CSR format`

			`// CSR(A) from I/O`
			`int *h_csrRowPtrA = NULL; // <int> n+1`
			`int *h_csrColIndA = NULL; // <int> nnzA`
			`double *h_csrValA = NULL; // <double> nnzA`

			`double *h_x = NULL; // <double> n, x = A \ b`
			`double *h_b = NULL; // <double> n, b = ones(m,1)`
			`double h_r = NULL; // <double> n, r = b - Ax`

			`size_t size_internal = 0;`
			`size_t size_chol = 0; // size of working space for csrlu`
			`void *buffer_cpu = NULL; // working space for Cholesky`
			`void *buffer_gpu = NULL; // working space for Cholesky`

			`int *d_csrRowPtrA = NULL; // <int> n+1`
			`int *d_csrColIndA = NULL; // <int> nnzA`
			`double *d_csrValA = NULL; // <double> nnzA`
			`double *d_x = NULL; // <double> n, x = A \ b`
			`double *d_b = NULL; // <double> n, a copy of h_b`
			`double d_r = NULL; // <double> n, r = b - Ax`

			`// the constants used in residual evaluation, r = b - A*x`
			`const double minus_one = -1.0;`
			`const double one = 1.0;`
			`// the constant used in cusolverSp`
			`// singularity is -1 if A is invertible under tol`
			`// tol determines the condition of singularity`
			`int singularity = 0;`
			`const double tol = 1.e-14;`

			`double x_inf = 0.0; // \|x\|`
			`double r_inf = 0.0; // \|r\|`
			`double A_inf = 0.0; // \|A\|`
			`int errors = 0;`

			`parseCommandLineArguments(argc, argv, opts);`

			`findCudaDevice(argc, (const char **)argv);`

			`if (opts.sparse_mat_filename == NULL)`
			`{`
			`opts.sparse_mat_filename = sdkFindFilePath("lap2D_5pt_n100.mtx", argv[0]);`
			`if (opts.sparse_mat_filename != NULL)`
			`printf("Using default input file [%s]\n", opts.sparse_mat_filename);`
			`else`
			`printf("Could not find lap2D_5pt_n100.mtx\n");`
			`}`
			`else`
			`{`
			`printf("Using input file [%s]\n", opts.sparse_mat_filename);`
			`}`

			`printf("step 1: read matrix market format\n");`

			`if (opts.sparse_mat_filename)`
			`{`
			`if (loadMMSparseMatrix<double>(opts.sparse_mat_filename, 'd', true , &rowsA, &colsA,`
			`&nnzA, &h_csrValA, &h_csrRowPtrA, &h_csrColIndA, true))`
			`{`
			`return 1;`
			`}`
			`baseA = h_csrRowPtrA[0]; // baseA = {0,1}`
			`}`
			`else`
			`{`
			`fprintf(stderr, "Error: input matrix is not provided\n");`
			`return 1;`
			`}`

			`if ( rowsA != colsA )`
			`{`
			`fprintf(stderr, "Error: only support square matrix\n");`
			`return 1;`
			`}`

			`printf("sparse matrix A is %d x %d with %d nonzeros, base=%d\n", rowsA, colsA, nnzA, baseA);`

			`checkCudaErrors(cusolverSpCreate(&cusolverSpH));`

			`checkCudaErrors(cusparseCreate(&cusparseH));`

			`checkCudaErrors(cudaStreamCreate(&stream));`

			`checkCudaErrors(cusolverSpSetStream(cusolverSpH, stream));`

			`checkCudaErrors(cusparseSetStream(cusparseH, stream));`

			`checkCudaErrors(cusparseCreateMatDescr(&descrA));`

			`checkCudaErrors(cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL));`
			`if (baseA)`
			`{`
			`checkCudaErrors(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE));`
			`}`
			`else`
			`{`
			`checkCudaErrors(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO));`
			`}`

			`h_x = (double)malloc(sizeof(double)colsA);`
			`h_b = (double)malloc(sizeof(double)rowsA);`
			`h_r = (double)malloc(sizeof(double)rowsA);`

			`assert(NULL != h_x);`
			`assert(NULL != h_b);`
			`assert(NULL != h_r);`

			`checkCudaErrors(cudaMalloc((void *)&d_csrRowPtrA, sizeof(int)(rowsA+1)));`
			`checkCudaErrors(cudaMalloc((void *)&d_csrColIndA, sizeof(int)nnzA));`
			`checkCudaErrors(cudaMalloc((void *)&d_csrValA , sizeof(double)nnzA));`
			`checkCudaErrors(cudaMalloc((void *)&d_x, sizeof(double)colsA));`
			`checkCudaErrors(cudaMalloc((void *)&d_b, sizeof(double)rowsA));`
			`checkCudaErrors(cudaMalloc((void *)&d_r, sizeof(double)rowsA));`

			`for(int row = 0 ; row < rowsA ; row++)`
			`{`
			`h_b[row] = 1.0;`
			`}`

			`printf("step 2: create opaque info structure\n");`
			`checkCudaErrors(cusolverSpCreateCsrcholInfoHost(&h_info));`

			`printf("step 3: analyze chol(A) to know structure of L\n");`
			`checkCudaErrors(cusolverSpXcsrcholAnalysisHost(`
			`cusolverSpH, rowsA, nnzA,`
			`descrA, h_csrRowPtrA, h_csrColIndA,`
			`h_info));`

			`printf("step 4: workspace for chol(A)\n");`
			`checkCudaErrors(cusolverSpDcsrcholBufferInfoHost(`
			`cusolverSpH, rowsA, nnzA,`
			`descrA, h_csrValA, h_csrRowPtrA, h_csrColIndA,`
			`h_info,`
			`&size_internal,`
			`&size_chol));`

			`if (buffer_cpu)`
			`{`
			`free(buffer_cpu);`
			`}`
			`buffer_cpu = (void)malloc(sizeof(char)size_chol);`
			`assert(NULL != buffer_cpu);`

			`printf("step 5: compute A = L*L^T \n");`
			`checkCudaErrors(cusolverSpDcsrcholFactorHost(`
			`cusolverSpH, rowsA, nnzA,`
			`descrA, h_csrValA, h_csrRowPtrA, h_csrColIndA,`
			`h_info,`
			`buffer_cpu));`

			`printf("step 6: check if the matrix is singular \n");`
			`checkCudaErrors(cusolverSpDcsrcholZeroPivotHost(`
			`cusolverSpH, h_info, tol, &singularity));`

			`if ( 0 <= singularity)`
			`{`
			`fprintf(stderr, "Error: A is not invertible, singularity=%d\n", singularity);`
			`return 1;`
			`}`

			`printf("step 7: solve A*x = b \n");`
			`checkCudaErrors(cusolverSpDcsrcholSolveHost(`
			`cusolverSpH, rowsA, h_b, h_x, h_info, buffer_cpu));`

			`printf("step 8: evaluate residual r = b - A*x (result on CPU)\n");`
			`// use GPU gemv to compute r = b - A*x`
			`checkCudaErrors(cudaMemcpy(d_csrRowPtrA, h_csrRowPtrA, sizeof(int)*(rowsA+1), cudaMemcpyHostToDevice));`
			`checkCudaErrors(cudaMemcpy(d_csrColIndA, h_csrColIndA, sizeof(int)*nnzA , cudaMemcpyHostToDevice));`
			`checkCudaErrors(cudaMemcpy(d_csrValA , h_csrValA , sizeof(double)*nnzA , cudaMemcpyHostToDevice));`

			`checkCudaErrors(cudaMemcpy(d_r, h_b, sizeof(double)*rowsA, cudaMemcpyHostToDevice));`
			`checkCudaErrors(cudaMemcpy(d_x, h_x, sizeof(double)*colsA, cudaMemcpyHostToDevice));`

			`/* Wrap raw data into cuSPARSE generic API objects */`
			`cusparseSpMatDescr_t matA = NULL;`
			`if (baseA)`
			`{`
			`checkCudaErrors(cusparseCreateCsr(`
			`&matA, rowsA, colsA, nnzA, d_csrRowPtrA, d_csrColIndA, d_csrValA, CUSPARSE_INDEX_32I,`
			`CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ONE, CUDA_R_64F));`
			`}`
			`else`
			`{`
			`checkCudaErrors(cusparseCreateCsr(`
			`&matA, rowsA, colsA, nnzA, d_csrRowPtrA, d_csrColIndA, d_csrValA, CUSPARSE_INDEX_32I,`
			`CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F));`
			`}`

			`cusparseDnVecDescr_t vecx = NULL;`
			`checkCudaErrors(cusparseCreateDnVec(&vecx, colsA, d_x, CUDA_R_64F));`
			`cusparseDnVecDescr_t vecAx = NULL;`
			`checkCudaErrors(cusparseCreateDnVec(&vecAx, rowsA, d_r, CUDA_R_64F));`

			`/* Allocate workspace for cuSPARSE */`
			`size_t bufferSize = 0;`
			`checkCudaErrors(cusparseSpMV_bufferSize(`
			`cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE, &minus_one, matA, vecx,`
			`&one, vecAx, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize));`
			`void *buffer = NULL;`
			`checkCudaErrors(cudaMalloc(&buffer, bufferSize));`

			`checkCudaErrors(cusparseSpMV(`
			`cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE, &minus_one, matA, vecx,`
			`&one, vecAx, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, buffer));`

			`checkCudaErrors(cudaMemcpy(h_r, d_r, sizeof(double)*rowsA, cudaMemcpyDeviceToHost));`

			`x_inf = vec_norminf(colsA, h_x);`
			`r_inf = vec_norminf(rowsA, h_r);`
			`A_inf = csr_mat_norminf(rowsA, colsA, nnzA, descrA, h_csrValA, h_csrRowPtrA, h_csrColIndA);`

			`printf("(CPU) \|b - A*x\| = %E \n", r_inf);`
			`printf("(CPU) \|A\| = %E \n", A_inf);`
			`printf("(CPU) \|x\| = %E \n", x_inf);`
			`printf("(CPU) \|b - Ax\|/(\|A\|\|x\|) = %E \n", r_inf/(A_inf * x_inf));`

			`printf("step 9: create opaque info structure\n");`
			`checkCudaErrors(cusolverSpCreateCsrcholInfo(&d_info));`

			`checkCudaErrors(cudaMemcpy(d_csrRowPtrA, h_csrRowPtrA, sizeof(int)*(rowsA+1), cudaMemcpyHostToDevice));`
			`checkCudaErrors(cudaMemcpy(d_csrColIndA, h_csrColIndA, sizeof(int)*nnzA , cudaMemcpyHostToDevice));`
			`checkCudaErrors(cudaMemcpy(d_csrValA , h_csrValA , sizeof(double)*nnzA , cudaMemcpyHostToDevice));`
			`checkCudaErrors(cudaMemcpy(d_b, h_b, sizeof(double)*rowsA, cudaMemcpyHostToDevice));`

			`printf("step 10: analyze chol(A) to know structure of L\n");`
			`checkCudaErrors(cusolverSpXcsrcholAnalysis(`
			`cusolverSpH, rowsA, nnzA,`
			`descrA, d_csrRowPtrA, d_csrColIndA,`
			`d_info));`

			`printf("step 11: workspace for chol(A)\n");`
			`checkCudaErrors(cusolverSpDcsrcholBufferInfo(`
			`cusolverSpH, rowsA, nnzA,`
			`descrA, d_csrValA, d_csrRowPtrA, d_csrColIndA,`
			`d_info,`
			`&size_internal,`
			`&size_chol));`

			`if (buffer_gpu) {`
			`checkCudaErrors(cudaFree(buffer_gpu));`
			`}`
			`checkCudaErrors(cudaMalloc(&buffer_gpu, sizeof(char)*size_chol));`

			`printf("step 12: compute A = L*L^T \n");`
			`checkCudaErrors(cusolverSpDcsrcholFactor(`
			`cusolverSpH, rowsA, nnzA,`
			`descrA, d_csrValA, d_csrRowPtrA, d_csrColIndA,`
			`d_info,`
			`buffer_gpu));`

			`printf("step 13: check if the matrix is singular \n");`
			`checkCudaErrors(cusolverSpDcsrcholZeroPivot(`
			`cusolverSpH, d_info, tol, &singularity));`

			`if ( 0 <= singularity){`
			`fprintf(stderr, "Error: A is not invertible, singularity=%d\n", singularity);`
			`return 1;`
			`}`

			`printf("step 14: solve A*x = b \n");`
			`checkCudaErrors(cusolverSpDcsrcholSolve(`
			`cusolverSpH, rowsA, d_b, d_x, d_info, buffer_gpu));`

			`checkCudaErrors(cudaMemcpy(d_r, h_b, sizeof(double)*rowsA, cudaMemcpyHostToDevice));`

			`checkCudaErrors(cusparseSpMV(`
			`cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE, &minus_one, matA, vecx,`
			`&one, vecAx, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, buffer));`

			`checkCudaErrors(cudaMemcpy(h_r, d_r, sizeof(double)*rowsA, cudaMemcpyDeviceToHost));`

			`r_inf = vec_norminf(rowsA, h_r);`

			`printf("(GPU) \|b - A*x\| = %E \n", r_inf);`
			`printf("(GPU) \|b - Ax\|/(\|A\|\|x\|) = %E \n", r_inf/(A_inf * x_inf));`


			`if (cusolverSpH) { checkCudaErrors(cusolverSpDestroy(cusolverSpH)); }`
			`if (cusparseH ) { checkCudaErrors(cusparseDestroy(cusparseH)); }`
			`if (stream ) { checkCudaErrors(cudaStreamDestroy(stream)); }`
			`if (descrA ) { checkCudaErrors(cusparseDestroyMatDescr(descrA)); }`
			`if (h_info ) { checkCudaErrors(cusolverSpDestroyCsrcholInfoHost(h_info)); }`
			`if (d_info ) { checkCudaErrors(cusolverSpDestroyCsrcholInfo(d_info)); }`
			`if (matA ) { checkCudaErrors(cusparseDestroySpMat(matA)); }`
			`if (vecx ) { checkCudaErrors(cusparseDestroyDnVec(vecx)); }`
			`if (vecAx ) { checkCudaErrors(cusparseDestroyDnVec(vecAx)); }`

			`if (h_csrValA ) { free(h_csrValA); }`
			`if (h_csrRowPtrA) { free(h_csrRowPtrA); }`
			`if (h_csrColIndA) { free(h_csrColIndA); }`

			`if (h_x ) { free(h_x); }`
			`if (h_b ) { free(h_b); }`
			`if (h_r ) { free(h_r); }`

			`if (buffer_cpu) { free(buffer_cpu); }`
			`if (buffer_gpu) { checkCudaErrors(cudaFree(buffer_gpu)); }`

			`if (d_csrValA ) { checkCudaErrors(cudaFree(d_csrValA)); }`
			`if (d_csrRowPtrA) { checkCudaErrors(cudaFree(d_csrRowPtrA)); }`
			`if (d_csrColIndA) { checkCudaErrors(cudaFree(d_csrColIndA)); }`
			`if (d_x) { checkCudaErrors(cudaFree(d_x)); }`
			`if (d_b) { checkCudaErrors(cudaFree(d_b)); }`
			`if (d_r) { checkCudaErrors(cudaFree(d_r)); }`

			`return 0;`
			`}`