mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2024-12-01 14:29:16 +08:00
585 lines
18 KiB
C++
585 lines
18 KiB
C++
|
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||
|
*
|
||
|
* Redistribution and use in source and binary forms, with or without
|
||
|
* modification, are permitted provided that the following conditions
|
||
|
* are met:
|
||
|
* * Redistributions of source code must retain the above copyright
|
||
|
* notice, this list of conditions and the following disclaimer.
|
||
|
* * Redistributions in binary form must reproduce the above copyright
|
||
|
* notice, this list of conditions and the following disclaimer in the
|
||
|
* documentation and/or other materials provided with the distribution.
|
||
|
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
||
|
* contributors may be used to endorse or promote products derived
|
||
|
* from this software without specific prior written permission.
|
||
|
*
|
||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||
|
*/
|
||
|
|
||
|
/*
|
||
|
* Test three linear solvers, including Cholesky, LU and QR.
|
||
|
* The user has to prepare a sparse matrix of "matrix market format" (with
|
||
|
* extension .mtx). For example, the user can download matrices in Florida
|
||
|
* Sparse Matrix Collection.
|
||
|
* (http://www.cise.ufl.edu/research/sparse/matrices/)
|
||
|
*
|
||
|
* The user needs to choose a solver by switch -R<solver> and
|
||
|
* to provide the path of the matrix by switch -F<file>, then
|
||
|
* the program solves
|
||
|
* A*x = b where b = ones(m,1)
|
||
|
* and reports relative error
|
||
|
* |b-A*x|/(|A|*|x|)
|
||
|
*
|
||
|
* The elapsed time is also reported so the user can compare efficiency of
|
||
|
* different solvers.
|
||
|
*
|
||
|
* How to use
|
||
|
* ./cuSolverDn_LinearSolver // Default: cholesky
|
||
|
* ./cuSolverDn_LinearSolver -R=chol -filefile> // cholesky factorization
|
||
|
* ./cuSolverDn_LinearSolver -R=lu -file<file> // LU with partial
|
||
|
* pivoting
|
||
|
* ./cuSolverDn_LinearSolver -R=qr -file<file> // QR factorization
|
||
|
*
|
||
|
* Remark: the absolute error on solution x is meaningless without knowing
|
||
|
* condition number of A. The relative error on residual should be close to
|
||
|
* machine zero, i.e. 1.e-15.
|
||
|
*/
|
||
|
|
||
|
#include <assert.h>
|
||
|
#include <ctype.h>
|
||
|
#include <stdio.h>
|
||
|
#include <stdlib.h>
|
||
|
#include <string.h>
|
||
|
|
||
|
#include <cuda_runtime.h>
|
||
|
|
||
|
#include "cublas_v2.h"
|
||
|
#include "cusolverDn.h"
|
||
|
#include "helper_cuda.h"
|
||
|
|
||
|
#include "helper_cusolver.h"
|
||
|
|
||
|
template <typename T_ELEM>
|
||
|
int loadMMSparseMatrix(char *filename, char elem_type, bool csrFormat, int *m,
|
||
|
int *n, int *nnz, T_ELEM **aVal, int **aRowInd,
|
||
|
int **aColInd, int extendSymMatrix);
|
||
|
|
||
|
void UsageDN(void) {
|
||
|
printf("<options>\n");
|
||
|
printf("-h : display this help\n");
|
||
|
printf("-R=<name> : choose a linear solver\n");
|
||
|
printf(" chol (cholesky factorization), this is default\n");
|
||
|
printf(" qr (QR factorization)\n");
|
||
|
printf(" lu (LU factorization)\n");
|
||
|
printf("-lda=<int> : leading dimension of A , m by default\n");
|
||
|
printf("-file=<filename>: filename containing a matrix in MM format\n");
|
||
|
printf("-device=<device_id> : <device_id> if want to run on specific GPU\n");
|
||
|
|
||
|
exit(0);
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* solve A*x = b by Cholesky factorization
|
||
|
*
|
||
|
*/
|
||
|
int linearSolverCHOL(cusolverDnHandle_t handle, int n, const double *Acopy,
|
||
|
int lda, const double *b, double *x) {
|
||
|
int bufferSize = 0;
|
||
|
int *info = NULL;
|
||
|
double *buffer = NULL;
|
||
|
double *A = NULL;
|
||
|
int h_info = 0;
|
||
|
double start, stop;
|
||
|
double time_solve;
|
||
|
cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
|
||
|
|
||
|
checkCudaErrors(cusolverDnDpotrf_bufferSize(handle, uplo, n, (double *)Acopy,
|
||
|
lda, &bufferSize));
|
||
|
|
||
|
checkCudaErrors(cudaMalloc(&info, sizeof(int)));
|
||
|
checkCudaErrors(cudaMalloc(&buffer, sizeof(double) * bufferSize));
|
||
|
checkCudaErrors(cudaMalloc(&A, sizeof(double) * lda * n));
|
||
|
|
||
|
// prepare a copy of A because potrf will overwrite A with L
|
||
|
checkCudaErrors(
|
||
|
cudaMemcpy(A, Acopy, sizeof(double) * lda * n, cudaMemcpyDeviceToDevice));
|
||
|
checkCudaErrors(cudaMemset(info, 0, sizeof(int)));
|
||
|
|
||
|
start = second();
|
||
|
start = second();
|
||
|
|
||
|
checkCudaErrors(
|
||
|
cusolverDnDpotrf(handle, uplo, n, A, lda, buffer, bufferSize, info));
|
||
|
|
||
|
checkCudaErrors(
|
||
|
cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost));
|
||
|
|
||
|
if (0 != h_info) {
|
||
|
fprintf(stderr, "Error: Cholesky factorization failed\n");
|
||
|
}
|
||
|
|
||
|
checkCudaErrors(
|
||
|
cudaMemcpy(x, b, sizeof(double) * n, cudaMemcpyDeviceToDevice));
|
||
|
|
||
|
checkCudaErrors(cusolverDnDpotrs(handle, uplo, n, 1, A, lda, x, n, info));
|
||
|
|
||
|
checkCudaErrors(cudaDeviceSynchronize());
|
||
|
stop = second();
|
||
|
|
||
|
time_solve = stop - start;
|
||
|
fprintf(stdout, "timing: cholesky = %10.6f sec\n", time_solve);
|
||
|
|
||
|
if (info) {
|
||
|
checkCudaErrors(cudaFree(info));
|
||
|
}
|
||
|
if (buffer) {
|
||
|
checkCudaErrors(cudaFree(buffer));
|
||
|
}
|
||
|
if (A) {
|
||
|
checkCudaErrors(cudaFree(A));
|
||
|
}
|
||
|
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* solve A*x = b by LU with partial pivoting
|
||
|
*
|
||
|
*/
|
||
|
int linearSolverLU(cusolverDnHandle_t handle, int n, const double *Acopy,
|
||
|
int lda, const double *b, double *x) {
|
||
|
int bufferSize = 0;
|
||
|
int *info = NULL;
|
||
|
double *buffer = NULL;
|
||
|
double *A = NULL;
|
||
|
int *ipiv = NULL; // pivoting sequence
|
||
|
int h_info = 0;
|
||
|
double start, stop;
|
||
|
double time_solve;
|
||
|
|
||
|
checkCudaErrors(cusolverDnDgetrf_bufferSize(handle, n, n, (double *)Acopy,
|
||
|
lda, &bufferSize));
|
||
|
|
||
|
checkCudaErrors(cudaMalloc(&info, sizeof(int)));
|
||
|
checkCudaErrors(cudaMalloc(&buffer, sizeof(double) * bufferSize));
|
||
|
checkCudaErrors(cudaMalloc(&A, sizeof(double) * lda * n));
|
||
|
checkCudaErrors(cudaMalloc(&ipiv, sizeof(int) * n));
|
||
|
|
||
|
// prepare a copy of A because getrf will overwrite A with L
|
||
|
checkCudaErrors(
|
||
|
cudaMemcpy(A, Acopy, sizeof(double) * lda * n, cudaMemcpyDeviceToDevice));
|
||
|
checkCudaErrors(cudaMemset(info, 0, sizeof(int)));
|
||
|
|
||
|
start = second();
|
||
|
start = second();
|
||
|
|
||
|
checkCudaErrors(cusolverDnDgetrf(handle, n, n, A, lda, buffer, ipiv, info));
|
||
|
checkCudaErrors(
|
||
|
cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost));
|
||
|
|
||
|
if (0 != h_info) {
|
||
|
fprintf(stderr, "Error: LU factorization failed\n");
|
||
|
}
|
||
|
|
||
|
checkCudaErrors(
|
||
|
cudaMemcpy(x, b, sizeof(double) * n, cudaMemcpyDeviceToDevice));
|
||
|
checkCudaErrors(
|
||
|
cusolverDnDgetrs(handle, CUBLAS_OP_N, n, 1, A, lda, ipiv, x, n, info));
|
||
|
checkCudaErrors(cudaDeviceSynchronize());
|
||
|
stop = second();
|
||
|
|
||
|
time_solve = stop - start;
|
||
|
fprintf(stdout, "timing: LU = %10.6f sec\n", time_solve);
|
||
|
|
||
|
if (info) {
|
||
|
checkCudaErrors(cudaFree(info));
|
||
|
}
|
||
|
if (buffer) {
|
||
|
checkCudaErrors(cudaFree(buffer));
|
||
|
}
|
||
|
if (A) {
|
||
|
checkCudaErrors(cudaFree(A));
|
||
|
}
|
||
|
if (ipiv) {
|
||
|
checkCudaErrors(cudaFree(ipiv));
|
||
|
}
|
||
|
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* solve A*x = b by QR
|
||
|
*
|
||
|
*/
|
||
|
int linearSolverQR(cusolverDnHandle_t handle, int n, const double *Acopy,
|
||
|
int lda, const double *b, double *x) {
|
||
|
cublasHandle_t cublasHandle = NULL; // used in residual evaluation
|
||
|
int bufferSize = 0;
|
||
|
int bufferSize_geqrf = 0;
|
||
|
int bufferSize_ormqr = 0;
|
||
|
int *info = NULL;
|
||
|
double *buffer = NULL;
|
||
|
double *A = NULL;
|
||
|
double *tau = NULL;
|
||
|
int h_info = 0;
|
||
|
double start, stop;
|
||
|
double time_solve;
|
||
|
const double one = 1.0;
|
||
|
|
||
|
checkCudaErrors(cublasCreate(&cublasHandle));
|
||
|
|
||
|
checkCudaErrors(cusolverDnDgeqrf_bufferSize(handle, n, n, (double *)Acopy,
|
||
|
lda, &bufferSize_geqrf));
|
||
|
checkCudaErrors(cusolverDnDormqr_bufferSize(handle, CUBLAS_SIDE_LEFT,
|
||
|
CUBLAS_OP_T, n, 1, n, A, lda,
|
||
|
NULL, x, n, &bufferSize_ormqr));
|
||
|
|
||
|
printf("buffer_geqrf = %d, buffer_ormqr = %d \n", bufferSize_geqrf,
|
||
|
bufferSize_ormqr);
|
||
|
|
||
|
bufferSize = (bufferSize_geqrf > bufferSize_ormqr) ? bufferSize_geqrf
|
||
|
: bufferSize_ormqr;
|
||
|
|
||
|
checkCudaErrors(cudaMalloc(&info, sizeof(int)));
|
||
|
checkCudaErrors(cudaMalloc(&buffer, sizeof(double) * bufferSize));
|
||
|
checkCudaErrors(cudaMalloc(&A, sizeof(double) * lda * n));
|
||
|
checkCudaErrors(cudaMalloc((void **)&tau, sizeof(double) * n));
|
||
|
|
||
|
// prepare a copy of A because getrf will overwrite A with L
|
||
|
checkCudaErrors(
|
||
|
cudaMemcpy(A, Acopy, sizeof(double) * lda * n, cudaMemcpyDeviceToDevice));
|
||
|
|
||
|
checkCudaErrors(cudaMemset(info, 0, sizeof(int)));
|
||
|
|
||
|
start = second();
|
||
|
start = second();
|
||
|
|
||
|
// compute QR factorization
|
||
|
checkCudaErrors(
|
||
|
cusolverDnDgeqrf(handle, n, n, A, lda, tau, buffer, bufferSize, info));
|
||
|
|
||
|
checkCudaErrors(
|
||
|
cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost));
|
||
|
|
||
|
if (0 != h_info) {
|
||
|
fprintf(stderr, "Error: LU factorization failed\n");
|
||
|
}
|
||
|
|
||
|
checkCudaErrors(
|
||
|
cudaMemcpy(x, b, sizeof(double) * n, cudaMemcpyDeviceToDevice));
|
||
|
|
||
|
// compute Q^T*b
|
||
|
checkCudaErrors(cusolverDnDormqr(handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_T, n, 1,
|
||
|
n, A, lda, tau, x, n, buffer, bufferSize,
|
||
|
info));
|
||
|
|
||
|
// x = R \ Q^T*b
|
||
|
checkCudaErrors(cublasDtrsm(cublasHandle, CUBLAS_SIDE_LEFT,
|
||
|
CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
|
||
|
CUBLAS_DIAG_NON_UNIT, n, 1, &one, A, lda, x, n));
|
||
|
checkCudaErrors(cudaDeviceSynchronize());
|
||
|
stop = second();
|
||
|
|
||
|
time_solve = stop - start;
|
||
|
fprintf(stdout, "timing: QR = %10.6f sec\n", time_solve);
|
||
|
|
||
|
if (cublasHandle) {
|
||
|
checkCudaErrors(cublasDestroy(cublasHandle));
|
||
|
}
|
||
|
if (info) {
|
||
|
checkCudaErrors(cudaFree(info));
|
||
|
}
|
||
|
if (buffer) {
|
||
|
checkCudaErrors(cudaFree(buffer));
|
||
|
}
|
||
|
if (A) {
|
||
|
checkCudaErrors(cudaFree(A));
|
||
|
}
|
||
|
if (tau) {
|
||
|
checkCudaErrors(cudaFree(tau));
|
||
|
}
|
||
|
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
void parseCommandLineArguments(int argc, char *argv[], struct testOpts &opts) {
|
||
|
memset(&opts, 0, sizeof(opts));
|
||
|
|
||
|
if (checkCmdLineFlag(argc, (const char **)argv, "-h")) {
|
||
|
UsageDN();
|
||
|
}
|
||
|
|
||
|
if (checkCmdLineFlag(argc, (const char **)argv, "R")) {
|
||
|
char *solverType = NULL;
|
||
|
getCmdLineArgumentString(argc, (const char **)argv, "R", &solverType);
|
||
|
|
||
|
if (solverType) {
|
||
|
if ((STRCASECMP(solverType, "chol") != 0) &&
|
||
|
(STRCASECMP(solverType, "lu") != 0) &&
|
||
|
(STRCASECMP(solverType, "qr") != 0)) {
|
||
|
printf("\nIncorrect argument passed to -R option\n");
|
||
|
UsageDN();
|
||
|
} else {
|
||
|
opts.testFunc = solverType;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (checkCmdLineFlag(argc, (const char **)argv, "file")) {
|
||
|
char *fileName = 0;
|
||
|
getCmdLineArgumentString(argc, (const char **)argv, "file", &fileName);
|
||
|
|
||
|
if (fileName) {
|
||
|
opts.sparse_mat_filename = fileName;
|
||
|
} else {
|
||
|
printf("\nIncorrect filename passed to -file \n ");
|
||
|
UsageDN();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (checkCmdLineFlag(argc, (const char **)argv, "lda")) {
|
||
|
opts.lda = getCmdLineArgumentInt(argc, (const char **)argv, "lda");
|
||
|
}
|
||
|
}
|
||
|
|
||
|
int main(int argc, char *argv[]) {
|
||
|
struct testOpts opts;
|
||
|
cusolverDnHandle_t handle = NULL;
|
||
|
cublasHandle_t cublasHandle = NULL; // used in residual evaluation
|
||
|
cudaStream_t stream = NULL;
|
||
|
|
||
|
int rowsA = 0; // number of rows of A
|
||
|
int colsA = 0; // number of columns of A
|
||
|
int nnzA = 0; // number of nonzeros of A
|
||
|
int baseA = 0; // base index in CSR format
|
||
|
int lda = 0; // leading dimension in dense matrix
|
||
|
|
||
|
// CSR(A) from I/O
|
||
|
int *h_csrRowPtrA = NULL;
|
||
|
int *h_csrColIndA = NULL;
|
||
|
double *h_csrValA = NULL;
|
||
|
|
||
|
double *h_A = NULL; // dense matrix from CSR(A)
|
||
|
double *h_x = NULL; // a copy of d_x
|
||
|
double *h_b = NULL; // b = ones(m,1)
|
||
|
double *h_r = NULL; // r = b - A*x, a copy of d_r
|
||
|
|
||
|
double *d_A = NULL; // a copy of h_A
|
||
|
double *d_x = NULL; // x = A \ b
|
||
|
double *d_b = NULL; // a copy of h_b
|
||
|
double *d_r = NULL; // r = b - A*x
|
||
|
|
||
|
// the constants are used in residual evaluation, r = b - A*x
|
||
|
const double minus_one = -1.0;
|
||
|
const double one = 1.0;
|
||
|
|
||
|
double x_inf = 0.0;
|
||
|
double r_inf = 0.0;
|
||
|
double A_inf = 0.0;
|
||
|
int errors = 0;
|
||
|
|
||
|
parseCommandLineArguments(argc, argv, opts);
|
||
|
|
||
|
if (NULL == opts.testFunc) {
|
||
|
opts.testFunc = "chol"; // By default running Cholesky as NO solver
|
||
|
// selected with -R option.
|
||
|
}
|
||
|
|
||
|
findCudaDevice(argc, (const char **)argv);
|
||
|
|
||
|
printf("step 1: read matrix market format\n");
|
||
|
|
||
|
if (opts.sparse_mat_filename == NULL) {
|
||
|
opts.sparse_mat_filename = sdkFindFilePath("gr_900_900_crg.mtx", argv[0]);
|
||
|
if (opts.sparse_mat_filename != NULL)
|
||
|
printf("Using default input file [%s]\n", opts.sparse_mat_filename);
|
||
|
else
|
||
|
printf("Could not find gr_900_900_crg.mtx\n");
|
||
|
} else {
|
||
|
printf("Using input file [%s]\n", opts.sparse_mat_filename);
|
||
|
}
|
||
|
|
||
|
if (opts.sparse_mat_filename == NULL) {
|
||
|
fprintf(stderr, "Error: input matrix is not provided\n");
|
||
|
return EXIT_FAILURE;
|
||
|
}
|
||
|
|
||
|
if (loadMMSparseMatrix<double>(opts.sparse_mat_filename, 'd', true, &rowsA,
|
||
|
&colsA, &nnzA, &h_csrValA, &h_csrRowPtrA,
|
||
|
&h_csrColIndA, true)) {
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
baseA = h_csrRowPtrA[0]; // baseA = {0,1}
|
||
|
|
||
|
printf("sparse matrix A is %d x %d with %d nonzeros, base=%d\n", rowsA, colsA,
|
||
|
nnzA, baseA);
|
||
|
|
||
|
if (rowsA != colsA) {
|
||
|
fprintf(stderr, "Error: only support square matrix\n");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
|
||
|
printf("step 2: convert CSR(A) to dense matrix\n");
|
||
|
|
||
|
lda = opts.lda ? opts.lda : rowsA;
|
||
|
if (lda < rowsA) {
|
||
|
fprintf(stderr, "Error: lda must be greater or equal to dimension of A\n");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
|
||
|
h_A = (double *)malloc(sizeof(double) * lda * colsA);
|
||
|
h_x = (double *)malloc(sizeof(double) * colsA);
|
||
|
h_b = (double *)malloc(sizeof(double) * rowsA);
|
||
|
h_r = (double *)malloc(sizeof(double) * rowsA);
|
||
|
assert(NULL != h_A);
|
||
|
assert(NULL != h_x);
|
||
|
assert(NULL != h_b);
|
||
|
assert(NULL != h_r);
|
||
|
|
||
|
memset(h_A, 0, sizeof(double) * lda * colsA);
|
||
|
|
||
|
for (int row = 0; row < rowsA; row++) {
|
||
|
const int start = h_csrRowPtrA[row] - baseA;
|
||
|
const int end = h_csrRowPtrA[row + 1] - baseA;
|
||
|
for (int colidx = start; colidx < end; colidx++) {
|
||
|
const int col = h_csrColIndA[colidx] - baseA;
|
||
|
const double Areg = h_csrValA[colidx];
|
||
|
h_A[row + col * lda] = Areg;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
printf("step 3: set right hand side vector (b) to 1\n");
|
||
|
for (int row = 0; row < rowsA; row++) {
|
||
|
h_b[row] = 1.0;
|
||
|
}
|
||
|
|
||
|
// verify if A is symmetric or not.
|
||
|
if (0 == strcmp(opts.testFunc, "chol")) {
|
||
|
int issym = 1;
|
||
|
for (int j = 0; j < colsA; j++) {
|
||
|
for (int i = j; i < rowsA; i++) {
|
||
|
double Aij = h_A[i + j * lda];
|
||
|
double Aji = h_A[j + i * lda];
|
||
|
if (Aij != Aji) {
|
||
|
issym = 0;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
if (!issym) {
|
||
|
printf("Error: A has no symmetric pattern, please use LU or QR \n");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
checkCudaErrors(cusolverDnCreate(&handle));
|
||
|
checkCudaErrors(cublasCreate(&cublasHandle));
|
||
|
checkCudaErrors(cudaStreamCreate(&stream));
|
||
|
|
||
|
checkCudaErrors(cusolverDnSetStream(handle, stream));
|
||
|
checkCudaErrors(cublasSetStream(cublasHandle, stream));
|
||
|
|
||
|
checkCudaErrors(cudaMalloc((void **)&d_A, sizeof(double) * lda * colsA));
|
||
|
checkCudaErrors(cudaMalloc((void **)&d_x, sizeof(double) * colsA));
|
||
|
checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(double) * rowsA));
|
||
|
checkCudaErrors(cudaMalloc((void **)&d_r, sizeof(double) * rowsA));
|
||
|
|
||
|
printf("step 4: prepare data on device\n");
|
||
|
checkCudaErrors(cudaMemcpy(d_A, h_A, sizeof(double) * lda * colsA,
|
||
|
cudaMemcpyHostToDevice));
|
||
|
checkCudaErrors(
|
||
|
cudaMemcpy(d_b, h_b, sizeof(double) * rowsA, cudaMemcpyHostToDevice));
|
||
|
|
||
|
printf("step 5: solve A*x = b \n");
|
||
|
// d_A and d_b are read-only
|
||
|
if (0 == strcmp(opts.testFunc, "chol")) {
|
||
|
linearSolverCHOL(handle, rowsA, d_A, lda, d_b, d_x);
|
||
|
} else if (0 == strcmp(opts.testFunc, "lu")) {
|
||
|
linearSolverLU(handle, rowsA, d_A, lda, d_b, d_x);
|
||
|
} else if (0 == strcmp(opts.testFunc, "qr")) {
|
||
|
linearSolverQR(handle, rowsA, d_A, lda, d_b, d_x);
|
||
|
} else {
|
||
|
fprintf(stderr, "Error: %s is unknown function\n", opts.testFunc);
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
printf("step 6: evaluate residual\n");
|
||
|
checkCudaErrors(
|
||
|
cudaMemcpy(d_r, d_b, sizeof(double) * rowsA, cudaMemcpyDeviceToDevice));
|
||
|
|
||
|
// r = b - A*x
|
||
|
checkCudaErrors(cublasDgemm_v2(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, rowsA,
|
||
|
1, colsA, &minus_one, d_A, lda, d_x, rowsA,
|
||
|
&one, d_r, rowsA));
|
||
|
|
||
|
checkCudaErrors(
|
||
|
cudaMemcpy(h_x, d_x, sizeof(double) * colsA, cudaMemcpyDeviceToHost));
|
||
|
checkCudaErrors(
|
||
|
cudaMemcpy(h_r, d_r, sizeof(double) * rowsA, cudaMemcpyDeviceToHost));
|
||
|
|
||
|
x_inf = vec_norminf(colsA, h_x);
|
||
|
r_inf = vec_norminf(rowsA, h_r);
|
||
|
A_inf = mat_norminf(rowsA, colsA, h_A, lda);
|
||
|
|
||
|
printf("|b - A*x| = %E \n", r_inf);
|
||
|
printf("|A| = %E \n", A_inf);
|
||
|
printf("|x| = %E \n", x_inf);
|
||
|
printf("|b - A*x|/(|A|*|x|) = %E \n", r_inf / (A_inf * x_inf));
|
||
|
|
||
|
if (handle) {
|
||
|
checkCudaErrors(cusolverDnDestroy(handle));
|
||
|
}
|
||
|
if (cublasHandle) {
|
||
|
checkCudaErrors(cublasDestroy(cublasHandle));
|
||
|
}
|
||
|
if (stream) {
|
||
|
checkCudaErrors(cudaStreamDestroy(stream));
|
||
|
}
|
||
|
|
||
|
if (h_csrValA) {
|
||
|
free(h_csrValA);
|
||
|
}
|
||
|
if (h_csrRowPtrA) {
|
||
|
free(h_csrRowPtrA);
|
||
|
}
|
||
|
if (h_csrColIndA) {
|
||
|
free(h_csrColIndA);
|
||
|
}
|
||
|
|
||
|
if (h_A) {
|
||
|
free(h_A);
|
||
|
}
|
||
|
if (h_x) {
|
||
|
free(h_x);
|
||
|
}
|
||
|
if (h_b) {
|
||
|
free(h_b);
|
||
|
}
|
||
|
if (h_r) {
|
||
|
free(h_r);
|
||
|
}
|
||
|
|
||
|
if (d_A) {
|
||
|
checkCudaErrors(cudaFree(d_A));
|
||
|
}
|
||
|
if (d_x) {
|
||
|
checkCudaErrors(cudaFree(d_x));
|
||
|
}
|
||
|
if (d_b) {
|
||
|
checkCudaErrors(cudaFree(d_b));
|
||
|
}
|
||
|
if (d_r) {
|
||
|
checkCudaErrors(cudaFree(d_r));
|
||
|
}
|
||
|
|
||
|
return 0;
|
||
|
}
|