cuda-samples/Samples/cuSolverDn_LinearSolver/cuSolverDn_LinearSolver.cpp

/* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
 * Test three linear solvers: Cholesky, LU and QR.
 * The user has to prepare a sparse matrix in "matrix market format" (with
 * extension .mtx). For example, the user can download matrices from the
 * Florida Sparse Matrix Collection.
 * (http://www.cise.ufl.edu/research/sparse/matrices/)
*
 * The user needs to choose a solver with the switch -R=<solver> and
 * to provide the path of the matrix with the switch -file=<file>, then
* the program solves
* A*x = b where b = ones(m,1)
* and reports relative error
* |b-A*x|/(|A|*|x|)
*
* The elapsed time is also reported so the user can compare efficiency of
* different solvers.
*
* How to use
 * ./cuSolverDn_LinearSolver                        // Default: Cholesky
 * ./cuSolverDn_LinearSolver -R=chol -file=<file>   // Cholesky factorization
 * ./cuSolverDn_LinearSolver -R=lu   -file=<file>   // LU with partial pivoting
 * ./cuSolverDn_LinearSolver -R=qr   -file=<file>   // QR factorization
*
 * Remark: the absolute error on the solution x is meaningless without knowing
 * the condition number of A. The relative residual should be close to
 * machine precision, i.e. about 1.e-15.
*/
#include <assert.h>
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include "cusolverDn.h"
#include "helper_cuda.h"
#include "helper_cusolver.h"
template <typename T_ELEM>
int loadMMSparseMatrix(char *filename, char elem_type, bool csrFormat, int *m,
int *n, int *nnz, T_ELEM **aVal, int **aRowInd,
int **aColInd, int extendSymMatrix);
void UsageDN(void) {
printf("<options>\n");
printf("-h : display this help\n");
printf("-R=<name> : choose a linear solver\n");
printf(" chol (cholesky factorization), this is default\n");
printf(" qr (QR factorization)\n");
printf(" lu (LU factorization)\n");
printf("-lda=<int> : leading dimension of A , m by default\n");
printf("-file=<filename>: filename containing a matrix in MM format\n");
printf("-device=<device_id> : <device_id> if want to run on specific GPU\n");
exit(0);
}
/*
* solve A*x = b by Cholesky factorization
*
*/
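// Workflow note (descriptive comment added for clarity): potrf factors
// A = L*L^T in place (lower triangle), so the routine works on a scratch
// copy of A; potrs then performs the forward and backward triangular solves
// on a copy of b placed in x. The reported time covers both the
// factorization and the solve.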
int linearSolverCHOL(cusolverDnHandle_t handle, int n, const double *Acopy,
int lda, const double *b, double *x) {
int bufferSize = 0;
int *info = NULL;
double *buffer = NULL;
double *A = NULL;
int h_info = 0;
double start, stop;
double time_solve;
cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
checkCudaErrors(cusolverDnDpotrf_bufferSize(handle, uplo, n, (double *)Acopy,
lda, &bufferSize));
checkCudaErrors(cudaMalloc(&info, sizeof(int)));
checkCudaErrors(cudaMalloc(&buffer, sizeof(double) * bufferSize));
checkCudaErrors(cudaMalloc(&A, sizeof(double) * lda * n));
// prepare a copy of A because potrf will overwrite A with L
checkCudaErrors(
cudaMemcpy(A, Acopy, sizeof(double) * lda * n, cudaMemcpyDeviceToDevice));
checkCudaErrors(cudaMemset(info, 0, sizeof(int)));
start = second();
start = second();
checkCudaErrors(
cusolverDnDpotrf(handle, uplo, n, A, lda, buffer, bufferSize, info));
checkCudaErrors(
cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost));
if (0 != h_info) {
fprintf(stderr, "Error: Cholesky factorization failed\n");
}
checkCudaErrors(
cudaMemcpy(x, b, sizeof(double) * n, cudaMemcpyDeviceToDevice));
checkCudaErrors(cusolverDnDpotrs(handle, uplo, n, 1, A, lda, x, n, info));
checkCudaErrors(cudaDeviceSynchronize());
stop = second();
time_solve = stop - start;
fprintf(stdout, "timing: cholesky = %10.6f sec\n", time_solve);
if (info) {
checkCudaErrors(cudaFree(info));
}
if (buffer) {
checkCudaErrors(cudaFree(buffer));
}
if (A) {
checkCudaErrors(cudaFree(A));
}
return 0;
}
/*
* solve A*x = b by LU with partial pivoting
*
*/
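// Workflow note (descriptive comment added for clarity): getrf computes
// P*A = L*U with partial pivoting, recording the row interchanges in ipiv
// and overwriting the scratch copy of A with both L and U; getrs then
// applies the permutation and the two triangular solves to a copy of b
// placed in x.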
int linearSolverLU(cusolverDnHandle_t handle, int n, const double *Acopy,
int lda, const double *b, double *x) {
int bufferSize = 0;
int *info = NULL;
double *buffer = NULL;
double *A = NULL;
int *ipiv = NULL; // pivoting sequence
int h_info = 0;
double start, stop;
double time_solve;
checkCudaErrors(cusolverDnDgetrf_bufferSize(handle, n, n, (double *)Acopy,
lda, &bufferSize));
checkCudaErrors(cudaMalloc(&info, sizeof(int)));
checkCudaErrors(cudaMalloc(&buffer, sizeof(double) * bufferSize));
checkCudaErrors(cudaMalloc(&A, sizeof(double) * lda * n));
checkCudaErrors(cudaMalloc(&ipiv, sizeof(int) * n));
// prepare a copy of A because getrf will overwrite A with L
checkCudaErrors(
cudaMemcpy(A, Acopy, sizeof(double) * lda * n, cudaMemcpyDeviceToDevice));
checkCudaErrors(cudaMemset(info, 0, sizeof(int)));
start = second();
start = second();
checkCudaErrors(cusolverDnDgetrf(handle, n, n, A, lda, buffer, ipiv, info));
checkCudaErrors(
cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost));
if (0 != h_info) {
fprintf(stderr, "Error: LU factorization failed\n");
}
checkCudaErrors(
cudaMemcpy(x, b, sizeof(double) * n, cudaMemcpyDeviceToDevice));
checkCudaErrors(
cusolverDnDgetrs(handle, CUBLAS_OP_N, n, 1, A, lda, ipiv, x, n, info));
checkCudaErrors(cudaDeviceSynchronize());
stop = second();
time_solve = stop - start;
fprintf(stdout, "timing: LU = %10.6f sec\n", time_solve);
if (info) {
checkCudaErrors(cudaFree(info));
}
if (buffer) {
checkCudaErrors(cudaFree(buffer));
}
if (A) {
checkCudaErrors(cudaFree(A));
}
if (ipiv) {
checkCudaErrors(cudaFree(ipiv));
}
return 0;
}
/*
* solve A*x = b by QR
*
*/
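// Workflow note (descriptive comment added for clarity): geqrf computes
// A = Q*R, storing R in the upper triangle of the scratch copy and the
// Householder vectors that define Q below the diagonal (scalar factors in
// tau); ormqr applies Q^T to b, and a cuBLAS trsm finishes by solving the
// upper-triangular system R*x = Q^T*b. One workspace sized for the larger
// of the two cuSOLVER calls is shared between geqrf and ormqr.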
int linearSolverQR(cusolverDnHandle_t handle, int n, const double *Acopy,
int lda, const double *b, double *x) {
cublasHandle_t cublasHandle = NULL; // used in residual evaluation
int bufferSize = 0;
int bufferSize_geqrf = 0;
int bufferSize_ormqr = 0;
int *info = NULL;
double *buffer = NULL;
double *A = NULL;
double *tau = NULL;
int h_info = 0;
double start, stop;
double time_solve;
const double one = 1.0;
checkCudaErrors(cublasCreate(&cublasHandle));
checkCudaErrors(cusolverDnDgeqrf_bufferSize(handle, n, n, (double *)Acopy,
lda, &bufferSize_geqrf));
  checkCudaErrors(cusolverDnDormqr_bufferSize(handle, CUBLAS_SIDE_LEFT,
                                              CUBLAS_OP_T, n, 1, n,
                                              (double *)Acopy, lda, NULL, x, n,
                                              &bufferSize_ormqr));
printf("buffer_geqrf = %d, buffer_ormqr = %d \n", bufferSize_geqrf,
bufferSize_ormqr);
bufferSize = (bufferSize_geqrf > bufferSize_ormqr) ? bufferSize_geqrf
: bufferSize_ormqr;
checkCudaErrors(cudaMalloc(&info, sizeof(int)));
checkCudaErrors(cudaMalloc(&buffer, sizeof(double) * bufferSize));
checkCudaErrors(cudaMalloc(&A, sizeof(double) * lda * n));
checkCudaErrors(cudaMalloc((void **)&tau, sizeof(double) * n));
  // prepare a copy of A because geqrf will overwrite A with R and the
  // Householder vectors
checkCudaErrors(
cudaMemcpy(A, Acopy, sizeof(double) * lda * n, cudaMemcpyDeviceToDevice));
checkCudaErrors(cudaMemset(info, 0, sizeof(int)));
start = second();
start = second();
// compute QR factorization
checkCudaErrors(
cusolverDnDgeqrf(handle, n, n, A, lda, tau, buffer, bufferSize, info));
checkCudaErrors(
cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost));
if (0 != h_info) {
fprintf(stderr, "Error: LU factorization failed\n");
}
checkCudaErrors(
cudaMemcpy(x, b, sizeof(double) * n, cudaMemcpyDeviceToDevice));
// compute Q^T*b
checkCudaErrors(cusolverDnDormqr(handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_T, n, 1,
n, A, lda, tau, x, n, buffer, bufferSize,
info));
// x = R \ Q^T*b
checkCudaErrors(cublasDtrsm(cublasHandle, CUBLAS_SIDE_LEFT,
CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
CUBLAS_DIAG_NON_UNIT, n, 1, &one, A, lda, x, n));
checkCudaErrors(cudaDeviceSynchronize());
stop = second();
time_solve = stop - start;
fprintf(stdout, "timing: QR = %10.6f sec\n", time_solve);
if (cublasHandle) {
checkCudaErrors(cublasDestroy(cublasHandle));
}
if (info) {
checkCudaErrors(cudaFree(info));
}
if (buffer) {
checkCudaErrors(cudaFree(buffer));
}
if (A) {
checkCudaErrors(cudaFree(A));
}
if (tau) {
checkCudaErrors(cudaFree(tau));
}
return 0;
}
void parseCommandLineArguments(int argc, char *argv[], struct testOpts &opts) {
memset(&opts, 0, sizeof(opts));
if (checkCmdLineFlag(argc, (const char **)argv, "-h")) {
UsageDN();
}
if (checkCmdLineFlag(argc, (const char **)argv, "R")) {
char *solverType = NULL;
getCmdLineArgumentString(argc, (const char **)argv, "R", &solverType);
if (solverType) {
if ((STRCASECMP(solverType, "chol") != 0) &&
(STRCASECMP(solverType, "lu") != 0) &&
(STRCASECMP(solverType, "qr") != 0)) {
printf("\nIncorrect argument passed to -R option\n");
UsageDN();
} else {
opts.testFunc = solverType;
}
}
}
if (checkCmdLineFlag(argc, (const char **)argv, "file")) {
char *fileName = 0;
getCmdLineArgumentString(argc, (const char **)argv, "file", &fileName);
if (fileName) {
opts.sparse_mat_filename = fileName;
} else {
printf("\nIncorrect filename passed to -file \n ");
UsageDN();
}
}
if (checkCmdLineFlag(argc, (const char **)argv, "lda")) {
opts.lda = getCmdLineArgumentInt(argc, (const char **)argv, "lda");
}
}
int main(int argc, char *argv[]) {
struct testOpts opts;
cusolverDnHandle_t handle = NULL;
cublasHandle_t cublasHandle = NULL; // used in residual evaluation
cudaStream_t stream = NULL;
int rowsA = 0; // number of rows of A
int colsA = 0; // number of columns of A
int nnzA = 0; // number of nonzeros of A
int baseA = 0; // base index in CSR format
int lda = 0; // leading dimension in dense matrix
// CSR(A) from I/O
int *h_csrRowPtrA = NULL;
int *h_csrColIndA = NULL;
double *h_csrValA = NULL;
double *h_A = NULL; // dense matrix from CSR(A)
double *h_x = NULL; // a copy of d_x
double *h_b = NULL; // b = ones(m,1)
double *h_r = NULL; // r = b - A*x, a copy of d_r
double *d_A = NULL; // a copy of h_A
double *d_x = NULL; // x = A \ b
double *d_b = NULL; // a copy of h_b
double *d_r = NULL; // r = b - A*x
// the constants are used in residual evaluation, r = b - A*x
const double minus_one = -1.0;
const double one = 1.0;
double x_inf = 0.0;
double r_inf = 0.0;
double A_inf = 0.0;
int errors = 0;
parseCommandLineArguments(argc, argv, opts);
if (NULL == opts.testFunc) {
opts.testFunc = "chol"; // By default running Cholesky as NO solver
// selected with -R option.
}
findCudaDevice(argc, (const char **)argv);
printf("step 1: read matrix market format\n");
if (opts.sparse_mat_filename == NULL) {
opts.sparse_mat_filename = sdkFindFilePath("gr_900_900_crg.mtx", argv[0]);
if (opts.sparse_mat_filename != NULL)
printf("Using default input file [%s]\n", opts.sparse_mat_filename);
else
printf("Could not find gr_900_900_crg.mtx\n");
} else {
printf("Using input file [%s]\n", opts.sparse_mat_filename);
}
if (opts.sparse_mat_filename == NULL) {
fprintf(stderr, "Error: input matrix is not provided\n");
return EXIT_FAILURE;
}
if (loadMMSparseMatrix<double>(opts.sparse_mat_filename, 'd', true, &rowsA,
&colsA, &nnzA, &h_csrValA, &h_csrRowPtrA,
&h_csrColIndA, true)) {
exit(EXIT_FAILURE);
}
baseA = h_csrRowPtrA[0]; // baseA = {0,1}
printf("sparse matrix A is %d x %d with %d nonzeros, base=%d\n", rowsA, colsA,
nnzA, baseA);
if (rowsA != colsA) {
fprintf(stderr, "Error: only support square matrix\n");
exit(EXIT_FAILURE);
}
printf("step 2: convert CSR(A) to dense matrix\n");
lda = opts.lda ? opts.lda : rowsA;
if (lda < rowsA) {
fprintf(stderr, "Error: lda must be greater or equal to dimension of A\n");
exit(EXIT_FAILURE);
}
h_A = (double *)malloc(sizeof(double) * lda * colsA);
h_x = (double *)malloc(sizeof(double) * colsA);
h_b = (double *)malloc(sizeof(double) * rowsA);
h_r = (double *)malloc(sizeof(double) * rowsA);
assert(NULL != h_A);
assert(NULL != h_x);
assert(NULL != h_b);
assert(NULL != h_r);
memset(h_A, 0, sizeof(double) * lda * colsA);
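  // Expand CSR(A) into the column-major dense array h_A: entry (row, col) is
  // stored at h_A[row + col * lda], and baseA is subtracted so that both
  // 0-based and 1-based matrix market indexing are handled.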
for (int row = 0; row < rowsA; row++) {
const int start = h_csrRowPtrA[row] - baseA;
const int end = h_csrRowPtrA[row + 1] - baseA;
for (int colidx = start; colidx < end; colidx++) {
const int col = h_csrColIndA[colidx] - baseA;
const double Areg = h_csrValA[colidx];
h_A[row + col * lda] = Areg;
}
}
printf("step 3: set right hand side vector (b) to 1\n");
for (int row = 0; row < rowsA; row++) {
h_b[row] = 1.0;
}
// verify if A is symmetric or not.
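  // Cholesky requires a symmetric positive definite matrix. Only symmetry
  // (A(i,j) == A(j,i)) is checked here; a matrix that is symmetric but not
  // positive definite is caught later, when potrf returns a nonzero info.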
if (0 == strcmp(opts.testFunc, "chol")) {
int issym = 1;
for (int j = 0; j < colsA; j++) {
for (int i = j; i < rowsA; i++) {
double Aij = h_A[i + j * lda];
double Aji = h_A[j + i * lda];
if (Aij != Aji) {
issym = 0;
break;
}
}
}
if (!issym) {
printf("Error: A has no symmetric pattern, please use LU or QR \n");
exit(EXIT_FAILURE);
}
}
checkCudaErrors(cusolverDnCreate(&handle));
checkCudaErrors(cublasCreate(&cublasHandle));
checkCudaErrors(cudaStreamCreate(&stream));
checkCudaErrors(cusolverDnSetStream(handle, stream));
checkCudaErrors(cublasSetStream(cublasHandle, stream));
checkCudaErrors(cudaMalloc((void **)&d_A, sizeof(double) * lda * colsA));
checkCudaErrors(cudaMalloc((void **)&d_x, sizeof(double) * colsA));
checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(double) * rowsA));
checkCudaErrors(cudaMalloc((void **)&d_r, sizeof(double) * rowsA));
printf("step 4: prepare data on device\n");
checkCudaErrors(cudaMemcpy(d_A, h_A, sizeof(double) * lda * colsA,
cudaMemcpyHostToDevice));
checkCudaErrors(
cudaMemcpy(d_b, h_b, sizeof(double) * rowsA, cudaMemcpyHostToDevice));
printf("step 5: solve A*x = b \n");
// d_A and d_b are read-only
if (0 == strcmp(opts.testFunc, "chol")) {
linearSolverCHOL(handle, rowsA, d_A, lda, d_b, d_x);
} else if (0 == strcmp(opts.testFunc, "lu")) {
linearSolverLU(handle, rowsA, d_A, lda, d_b, d_x);
} else if (0 == strcmp(opts.testFunc, "qr")) {
linearSolverQR(handle, rowsA, d_A, lda, d_b, d_x);
} else {
fprintf(stderr, "Error: %s is unknown function\n", opts.testFunc);
exit(EXIT_FAILURE);
}
printf("step 6: evaluate residual\n");
checkCudaErrors(
cudaMemcpy(d_r, d_b, sizeof(double) * rowsA, cudaMemcpyDeviceToDevice));
// r = b - A*x
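  // The n-by-1 vectors x and r are treated as dense matrices, so gemm
  // computes r := (-1)*A*x + 1*r, with d_r pre-loaded with a copy of b.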
checkCudaErrors(cublasDgemm_v2(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, rowsA,
1, colsA, &minus_one, d_A, lda, d_x, rowsA,
&one, d_r, rowsA));
checkCudaErrors(
cudaMemcpy(h_x, d_x, sizeof(double) * colsA, cudaMemcpyDeviceToHost));
checkCudaErrors(
cudaMemcpy(h_r, d_r, sizeof(double) * rowsA, cudaMemcpyDeviceToHost));
x_inf = vec_norminf(colsA, h_x);
r_inf = vec_norminf(rowsA, h_r);
A_inf = mat_norminf(rowsA, colsA, h_A, lda);
printf("|b - A*x| = %E \n", r_inf);
printf("|A| = %E \n", A_inf);
printf("|x| = %E \n", x_inf);
printf("|b - A*x|/(|A|*|x|) = %E \n", r_inf / (A_inf * x_inf));
if (handle) {
checkCudaErrors(cusolverDnDestroy(handle));
}
if (cublasHandle) {
checkCudaErrors(cublasDestroy(cublasHandle));
}
if (stream) {
checkCudaErrors(cudaStreamDestroy(stream));
}
if (h_csrValA) {
free(h_csrValA);
}
if (h_csrRowPtrA) {
free(h_csrRowPtrA);
}
if (h_csrColIndA) {
free(h_csrColIndA);
}
if (h_A) {
free(h_A);
}
if (h_x) {
free(h_x);
}
if (h_b) {
free(h_b);
}
if (h_r) {
free(h_r);
}
if (d_A) {
checkCudaErrors(cudaFree(d_A));
}
if (d_x) {
checkCudaErrors(cudaFree(d_x));
}
if (d_b) {
checkCudaErrors(cudaFree(d_b));
}
if (d_r) {
checkCudaErrors(cudaFree(d_r));
}
return 0;
}