/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This example demonstrates how to get better performance by
 * batching CUBLAS calls with the use of streams.
 */

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
#include <windows.h>
#endif

/* Using updated (v2) interfaces to cublas and cusparse */
#include <cublas_v2.h>
#include <cuda_runtime.h>

// Utilities and system includes
#include <helper_cuda.h>

#include "batchCUBLAS.h"

const char *sSDKname = "batchCUBLAS";
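/*
 * Note: this file depends on helpers declared in batchCUBLAS.h, which is not
 * reproduced here. As a rough sketch (assumed signatures, not the shipped
 * header), the ones used below look approximately like:
 *
 *   template <typename T> T cuGet(double a, double b = 0.0);  // make element
 *   template <typename T> bool cuEqual(T a, T b);             // compare
 *   unsigned int cuRand(void);                 // simple pseudo-random source
 *   int imax(int a, int b);                    // integer max
 *   double second(void);                       // wall-clock time in seconds
 *   unsigned int floatAsUInt(float f);         // bit pattern of a float
 *   unsigned long long doubleAsULL(double d);  // bit pattern of a double
 *
 * plus SWITCH_CHAR, DEV_VER_DBL_SUPPORT, the CUBLASTEST_PASSED /
 * CUBLASTEST_FAILED status codes, and the TESTGEN() name-decorating macro
 * used to declare the test generator.
 */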
//==============================================================================
// Device information utilities
//==============================================================================

#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */

int getDeviceVersion(void) {
  int device;
  struct cudaDeviceProp properties;

  if (cudaGetDevice(&device) != cudaSuccess) {
    printf("failed to get device\n");
    return 0;
  }

  if (cudaGetDeviceProperties(&properties, device) != cudaSuccess) {
    printf("failed to get properties\n");
    return 0;
  }

  return properties.major * 100 + properties.minor * 10;
}

size_t getDeviceMemory(void) {
  struct cudaDeviceProp properties;
  int device;

  if (cudaGetDevice(&device) != cudaSuccess) {
    return 0;
  }

  if (cudaGetDeviceProperties(&properties, device) != cudaSuccess) {
    return 0;
  }

  return properties.totalGlobalMem;
}

#if defined(__cplusplus)
}
#endif /* __cplusplus */

//==============================================================================
// random utilities
//==============================================================================

template <typename T_ELEM>
void fillupMatrix(T_ELEM *A, int lda, int rows, int cols, int seed = 0);

template <typename T_ELEM>
void fillupMatrix(T_ELEM *A, int lda, int rows, int cols, int seed) {
  for (int j = 0; j < cols; j++) {
    for (int i = 0; i < rows; i++) {
      A[i + lda * j] = cuGet<T_ELEM>(
          ((double)(((lda * i + j + seed) % 253) + 1)) / 256.0,
          ((double)((((cols * i + j) + 123 + seed) % 253) + 1)) / 256.0);
    }
  }
}

/* Explicit instantiation */
template void fillupMatrix<float>(float *A, int lda, int rows, int cols,
                                  int seed);
template void fillupMatrix<double>(double *A, int lda, int rows, int cols,
                                   int seed);

/* For debugging */
void printCuType(const char *str, float A) {
  fprintf(stdout, "%s (0x%08x, %g)", str, floatAsUInt(A), A);
}

void printCuType(const char *str, double A) {
  fprintf(stdout, "%s (0x%016llx, %g)", str, doubleAsULL(A), A);
}

//==============================================================================
// defines and structures
//==============================================================================

#define CUBLAS_SGEMM_MAX_ULP_ERR (.3)
#define CUBLAS_DGEMM_MAX_ULP_ERR (1.e-3)
#define CUBLAS_SGEMM_MAX_RELATIVE_ERR (6.e-6)
#define CUBLAS_DGEMM_MAX_RELATIVE_ERR (0.0)
#define CUBLAS_GEMM_TEST_COUNT (30)
#define BENCH_MATRIX_M (128)
#define BENCH_MATRIX_K (128)
#define BENCH_MATRIX_N (128)

#define CLEANUP()                           \
  do {                                      \
    if (A) free(A);                         \
    if (B) free(B);                         \
    if (C) free(C);                         \
    for (int i = 0; i < opts.N; ++i) {      \
      if (devPtrA[i]) cudaFree(devPtrA[i]); \
      if (devPtrB[i]) cudaFree(devPtrB[i]); \
      if (devPtrC[i]) cudaFree(devPtrC[i]); \
    }                                       \
    if (devPtrA) free(devPtrA);             \
    if (devPtrB) free(devPtrB);             \
    if (devPtrC) free(devPtrC);             \
    if (devPtrA_dev) cudaFree(devPtrA_dev); \
    if (devPtrB_dev) cudaFree(devPtrB_dev); \
    if (devPtrC_dev) cudaFree(devPtrC_dev); \
    fflush(stdout);                         \
  } while (0)

enum testMethod { tmRegular, tmStream, tmBatched };

struct gemmOpts {
  int m;
  int n;
  int k;
  testMethod test_method;
  char *elem_type;
  int N;  // number of multiplications
};

template <typename T_ELEM>
struct gemmTestParams {
  cublasOperation_t transa;
  cublasOperation_t transb;
  int m;
  int n;
  int k;
  T_ELEM alpha;
  T_ELEM beta;
};

//==============================================================================
// template wrappers for cuda functions
//==============================================================================

static inline cublasStatus_t cublasXgemm(cublasHandle_t handle,
                                         cublasOperation_t transa,
                                         cublasOperation_t transb, int m, int n,
                                         int k, float *alpha, const float *A,
                                         int lda, float *B, int ldb,
                                         float *beta, float *C, int ldc) {
  return cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb,
                     beta, C, ldc);
}

static inline cublasStatus_t cublasXgemm(cublasHandle_t handle,
                                         cublasOperation_t transa,
                                         cublasOperation_t transb, int m, int n,
                                         int k, double *alpha, const double *A,
                                         int lda, double *B, int ldb,
                                         double *beta, double *C, int ldc) {
  return cublasDgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb,
                     beta, C, ldc);
}

static inline cublasStatus_t cublasXgemmBatched(
    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
    int m, int n, int k, float *alpha, const float *Aarray[], int lda,
    const float *Barray[], int ldb, float *beta, float *Carray[], int ldc,
    int batchCount) {
#if CUDART_VERSION >= 4010
  return cublasSgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray,
                            lda, Barray, ldb, beta, Carray, ldc, batchCount);
#else
  return CUBLAS_STATUS_SUCCESS;
#endif
}

static inline cublasStatus_t cublasXgemmBatched(
    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
    int m, int n, int k, double *alpha, const double *Aarray[], int lda,
    const double *Barray[], int ldb, double *beta, double *Carray[], int ldc,
    int batchCount) {
#if CUDART_VERSION >= 4010
  return cublasDgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray,
                            lda, Barray, ldb, beta, Carray, ldc, batchCount);
#else
  return CUBLAS_STATUS_SUCCESS;
#endif
}

//==============================================================================
// Primary Application code
//==============================================================================
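/*
 * Each benchmark pass performs the same set of N GEMMs one of three ways:
 *
 *   tmRegular - N cublasXgemm calls issued sequentially on the default stream
 *   tmStream  - N cublasXgemm calls, each issued on its own CUDA stream, so
 *               independent multiplications can overlap on the device
 *   tmBatched - a single cublasXgemmBatched call that performs all N
 *               multiplications in one launch (available in CUBLAS >= 4.1)
 */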
static int processArgs(int argc, char *argv[], struct gemmOpts *opts) {
  int error = 0;
  int oldError;
  memset(opts, 0, sizeof(*opts));
  static char default_type[] = "d";  // default double
  opts->elem_type = default_type;
  opts->N = 10;

  while (argc) {
    oldError = error;

    if (*argv[0] == SWITCH_CHAR) {
      switch (*(argv[0] + 1)) {
        case 'm':
          opts->m = (int)atol(argv[0] + 2);
          break;

        case 'n':
          opts->n = (int)atol(argv[0] + 2);
          break;

        case 'k':
          opts->k = (int)atol(argv[0] + 2);
          break;

        case 'N':
          opts->N = (int)atol(argv[0] + 2);
          break;

        default:
          break;
      }
    }

    if (error > oldError) {
      fprintf(stderr, "Invalid switch '%c%s'\n", SWITCH_CHAR, argv[0] + 1);
    }

    argc -= 1;
    argv++;
  }

  return error;
}

template <typename T_ELEM>
static int TESTGEN(gemm)(const struct gemmOpts *opts, int matrixM, int matrixN,
                         int matrixK, int &numTests,
                         struct gemmTestParams<T_ELEM> *params) {
  static T_ELEM alpha[] = {cuGet<T_ELEM>(0, 0), cuGet<T_ELEM>(-1, -1),
                           cuGet<T_ELEM>(1, -2), cuGet<T_ELEM>(2, -1),
                           cuGet<T_ELEM>(0, -3)};
  static T_ELEM beta[] = {cuGet<T_ELEM>(0, 0), cuGet<T_ELEM>(-1, -1),
                          cuGet<T_ELEM>(1, -2), cuGet<T_ELEM>(2, -1),
                          cuGet<T_ELEM>(0, -3)};

#define NBR_ALPHAS (sizeof(alpha) / sizeof(alpha[0]))
#define NBR_BETAS (sizeof(beta) / sizeof(beta[0]))

  static T_ELEM theAlpha;
  static T_ELEM theBeta;
  static int state;
  static int m;
  static int n;
  static int k;

  if (numTests-- <= 0) {
    return -1;
  }

  theAlpha = alpha[cuRand() % NBR_ALPHAS];
  theBeta = beta[cuRand() % NBR_BETAS];
  params->transa = CUBLAS_OP_N;
  params->transb = CUBLAS_OP_N;
  m = matrixM;
  n = matrixN;
  k = matrixK;
  params->m = m;
  params->n = n;
  params->k = k;
  params->alpha = theAlpha;
  params->beta = theBeta;

  printf("#### args: ta=%d tb=%d m=%d n=%d k=%d ",
         (unsigned int)params->transa, (unsigned int)params->transb, params->m,
         params->n, params->k);
  printCuType(" alpha =", params->alpha);
  printCuType(" beta=", params->beta);
  printf("\n");

  m = cuRand() % matrixM;
  n = cuRand() % matrixN;
  k = cuRand() % matrixK;
  state = cuRand() % 9;
  return 0;
}

template <typename T_ELEM>
void fillupMatrixDebug(T_ELEM *A, int lda, int rows, int cols) {
  for (int j = 0; j < cols; j++) {
    for (int i = 0; i < rows; i++) {
      A[i + lda * j] = cuGet<T_ELEM>(i + j);
    }
  }
}
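/*
 * test_gemm_loop allocates N copies of A, B and C on the device, runs the
 * GEMM N times with the selected method (sequential, streamed, or batched),
 * and reports the elapsed time and effective GFLOPS. Matrices are stored
 * column-major with the leading dimension equal to the row count, matching
 * the A[i + lda * j] indexing used by the fill helpers above.
 */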
template <typename T_ELEM>
int test_gemm_loop(struct gemmOpts &opts, float err, double max_relative_error,
                   cublasHandle_t handle) {
  struct gemmTestParams<T_ELEM> params;
  cudaStream_t *streamArray = 0;
  cublasStatus_t status1, status2, status3;
  T_ELEM *A = NULL;
  T_ELEM *B = NULL;
  T_ELEM *C = NULL;
  T_ELEM **devPtrA = 0;
  T_ELEM **devPtrB = 0;
  T_ELEM **devPtrC = 0;
  T_ELEM **devPtrA_dev = NULL;
  T_ELEM **devPtrB_dev = NULL;
  T_ELEM **devPtrC_dev = NULL;
  int matrixM, matrixN, matrixK;
  int rowsA, rowsB, rowsC;
  int colsA, colsB, colsC;
  int matrixSizeA, matrixSizeB, matrixSizeC;
  int errors;
  double start, stop;

  printf("Testing %cgemm\n", *opts.elem_type);

  matrixM = (opts.m) ? opts.m : BENCH_MATRIX_M;
  matrixN = (opts.n) ? opts.n : BENCH_MATRIX_N;
  matrixK = (opts.k) ? opts.k : BENCH_MATRIX_K;

  rowsA = imax(1, matrixM);
  colsA = imax(1, matrixK);
  rowsB = imax(1, matrixK);
  colsB = imax(1, matrixN);
  rowsC = imax(1, matrixM);
  colsC = imax(1, matrixN);

  matrixSizeA = rowsA * colsA;
  matrixSizeB = rowsB * colsB;
  matrixSizeC = rowsC * colsC;

  devPtrA = (T_ELEM **)malloc(opts.N * sizeof(*devPtrA));
  devPtrB = (T_ELEM **)malloc(opts.N * sizeof(*devPtrB));
  devPtrC = (T_ELEM **)malloc(opts.N * sizeof(*devPtrC));

  for (int i = 0; i < opts.N; i++) {
    cudaError_t err1 =
        cudaMalloc((void **)&devPtrA[i], matrixSizeA * sizeof(devPtrA[0][0]));
    cudaError_t err2 =
        cudaMalloc((void **)&devPtrB[i], matrixSizeB * sizeof(devPtrB[0][0]));
    cudaError_t err3 =
        cudaMalloc((void **)&devPtrC[i], matrixSizeC * sizeof(devPtrC[0][0]));

    if ((err1 != cudaSuccess) || (err2 != cudaSuccess) ||
        (err3 != cudaSuccess)) {
      CLEANUP();
      fprintf(stderr, "!!!! GPU memory allocation error\n");
      return CUBLASTEST_FAILED;
    }
  }

  // For batched processing we need those arrays on the device
  if (opts.test_method == tmBatched) {
    cudaError_t err1 =
        cudaMalloc((void **)&devPtrA_dev, opts.N * sizeof(*devPtrA));
    cudaError_t err2 =
        cudaMalloc((void **)&devPtrB_dev, opts.N * sizeof(*devPtrB));
    cudaError_t err3 =
        cudaMalloc((void **)&devPtrC_dev, opts.N * sizeof(*devPtrC));

    if ((err1 != cudaSuccess) || (err2 != cudaSuccess) ||
        (err3 != cudaSuccess)) {
      CLEANUP();
      fprintf(stderr, "!!!! GPU memory allocation error\n");
      return CUBLASTEST_FAILED;
    }

    err1 = cudaMemcpy(devPtrA_dev, devPtrA, opts.N * sizeof(*devPtrA),
                      cudaMemcpyHostToDevice);
    err2 = cudaMemcpy(devPtrB_dev, devPtrB, opts.N * sizeof(*devPtrB),
                      cudaMemcpyHostToDevice);
    err3 = cudaMemcpy(devPtrC_dev, devPtrC, opts.N * sizeof(*devPtrC),
                      cudaMemcpyHostToDevice);

    if ((err1 != cudaSuccess) || (err2 != cudaSuccess) ||
        (err3 != cudaSuccess)) {
      CLEANUP();
      fprintf(stderr, "!!!! cannot copy pointer array to device\n");
      return CUBLASTEST_FAILED;
    }
  }

  A = (T_ELEM *)malloc(matrixSizeA * sizeof(A[0]));
  B = (T_ELEM *)malloc(matrixSizeB * sizeof(B[0]));
  C = (T_ELEM *)malloc(matrixSizeC * sizeof(C[0]));

  if ((!A) || (!B) || (!C)) {
    CLEANUP();
    fprintf(stderr, "!!!! system memory allocation error\n");
    return CUBLASTEST_FAILED;
  }

  streamArray = (cudaStream_t *)malloc(opts.N * sizeof(cudaStream_t));

  for (int i = 0; i < opts.N; i++) {
    if (opts.test_method == tmStream) {
      cudaError_t cudaErr = cudaStreamCreate(&streamArray[i]);

      if (cudaErr != cudaSuccess) {
        CLEANUP();
        fprintf(stderr, "!!!! cannot create stream\n");
        return CUBLASTEST_FAILED;
      }
    } else {
      streamArray[i] = 0;
    }
  }
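  /*
   * Main timed loop. Note that cublas<t>gemmBatched expects the arrays of
   * device pointers themselves to reside in device memory, which is why
   * devPtrA/B/C were mirrored into devPtrA_dev/B_dev/C_dev above. For the
   * streamed variant, issuing each GEMM on its own stream lets small,
   * independent kernels overlap on the GPU.
   */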
  errors = 0;
  int numTests = 1;

  while (TESTGEN(gemm)(&opts, matrixM, matrixN, matrixK, numTests, &params) ==
         0) {
    printf("#### args: lda=%d ldb=%d ldc=%d\n", rowsA, rowsB, rowsC);

    // Fill with NaNs first, so the lda padding is all NaN
    memset(A, 0xFF, matrixSizeA * sizeof(A[0]));
    fillupMatrixDebug(A, rowsA, params.m, params.k);
    memset(B, 0xFF, matrixSizeB * sizeof(B[0]));
    fillupMatrix(B, rowsB, params.k, params.n, 121);

    if (!cuEqual(params.beta, cuGet<T_ELEM>(0))) {
      fillupMatrix(C, rowsC, params.m, params.n);
    } else {
      /* fill with SNaNs to make sure the GEMM doesn't read C when beta == 0 */
      memset(C, 0xFF, matrixSizeC * sizeof(C[0]));
    }

    double flopsCoef = 2.0;

    for (int i = 0; i < opts.N; i++) {
      status1 = cublasSetMatrix(rowsA, colsA, sizeof(A[0]), A, rowsA,
                                devPtrA[i], rowsA);
      status2 = cublasSetMatrix(rowsB, colsB, sizeof(B[0]), B, rowsB,
                                devPtrB[i], rowsB);
      status3 = cublasSetMatrix(rowsC, colsC, sizeof(C[0]), C, rowsC,
                                devPtrC[i], rowsC);

      if ((status1 != CUBLAS_STATUS_SUCCESS) || (status2 != status1) ||
          (status3 != status1)) {
        CLEANUP();
        fprintf(stderr, "!!!! GPU access error (write)\n");
        return CUBLASTEST_FAILED;
      }
    }

    start = second();

    if (opts.test_method == tmBatched) {
      cublasSetStream(handle, streamArray[0]);
      status1 = cublasXgemmBatched(handle, params.transa, params.transb,
                                   params.m, params.n, params.k, &params.alpha,
                                   (const T_ELEM **)devPtrA_dev, rowsA,
                                   (const T_ELEM **)devPtrB_dev, rowsB,
                                   &params.beta, devPtrC_dev, rowsC, opts.N);

      if (status1 != CUBLAS_STATUS_SUCCESS) {
        cudaError_t cudaStatus = cudaGetLastError();
        CLEANUP();
        fprintf(stderr,
                "!!!! GPU program execution error : cublas Error=%d, cuda "
                "Error=%d,(%s)\n",
                status1, cudaStatus, cudaGetErrorString(cudaStatus));
        return CUBLASTEST_FAILED;
      }
    } else {
      for (int i = 0; i < opts.N; i++) {
        cublasSetStream(handle, streamArray[i]);
        status1 = cublasXgemm(handle, params.transa, params.transb, params.m,
                              params.n, params.k, &params.alpha, devPtrA[i],
                              rowsA, devPtrB[i], rowsB, &params.beta,
                              devPtrC[i], rowsC);

        if (status1 != CUBLAS_STATUS_SUCCESS) {
          cudaError_t cudaStatus = cudaGetLastError();
          CLEANUP();
          fprintf(stderr,
                  "!!!! GPU program execution error : cublas Error=%d, cuda "
                  "Error=%d,(%s)\n",
                  status1, cudaStatus, cudaGetErrorString(cudaStatus));
          return CUBLASTEST_FAILED;
        }
      }
    }

    cudaError_t cudaStatus = cudaDeviceSynchronize();

    if (cudaStatus != cudaSuccess) {
      CLEANUP();
      fprintf(stderr,
              "!!!! GPU program execution error on cudaDeviceSynchronize : "
              "cudaError=%d,(%s)\n",
              cudaStatus, cudaGetErrorString(cudaStatus));
      return CUBLASTEST_FAILED;
    }

    stop = second();

    fprintf(stdout, "^^^^ elapsed = %10.8f sec GFLOPS=%g\n", (stop - start),
            opts.N * (1e-9 * flopsCoef * params.m * params.n * params.k) /
                (stop - start));
  }  // end while (TESTGEN...

  // Release the streams created for the streamed variant
  for (int i = 0; i < opts.N; i++) {
    if (opts.test_method == tmStream) {
      cudaStreamDestroy(streamArray[i]);
    }
  }

  free(streamArray);

  CLEANUP();
  fprintf(stdout, "@@@@ %cgemm test %s\n", *opts.elem_type,
          errors ? "FAIL" : "OK");
  return CUBLASTEST_PASSED;
}
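/*
 * The GFLOPS figure above counts 2*m*n*k floating-point operations per GEMM
 * (one multiply and one add per inner-product element, hence flopsCoef = 2),
 * times the N multiplications, divided by the elapsed wall-clock time.
 */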
"FAIL" : "OK"); return CUBLASTEST_PASSED; } int main(int argc, char *argv[]) { struct gemmOpts opts; int errors, nTimes, nTotalErrors = 0; int status = CUBLASTEST_PASSED; printf("%s Starting...\n\n", sSDKname); int dev = findCudaDevice(argc, (const char **)argv); if (dev == -1) { return CUBLASTEST_FAILED; } errors = processArgs(argc, argv, &opts); if (errors) { fprintf(stdout, "\n Usage: batchcublas [-mSIZE_M] [-nSIZE_N] [-kSIZE_N] " "[-NSIZE_NUM_ITERATIONS] [-qatest] [-noprompt]\n"); return CUBLASTEST_FAILED; } cublasHandle_t handle; if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { fprintf(stdout, "CUBLAS initialization failed!\n"); exit(EXIT_FAILURE); } // Run single kernels fprintf(stdout, "\n ==== Running single kernels ==== \n\n"); nTimes = opts.N; opts.N = 1; *(opts.elem_type) = 's'; status = test_gemm_loop(opts, (float)CUBLAS_SGEMM_MAX_ULP_ERR, (double)CUBLAS_SGEMM_MAX_RELATIVE_ERR, handle); // Run Double version *(opts.elem_type) = 'd'; if (getDeviceVersion() < DEV_VER_DBL_SUPPORT) { fprintf(stdout, "@@@@ dgemm test WAIVED due to lack of DP support\n"); exit(EXIT_WAIVED); } status = test_gemm_loop(opts, (float)CUBLAS_DGEMM_MAX_ULP_ERR, (double)CUBLAS_DGEMM_MAX_RELATIVE_ERR, handle); nTotalErrors += (status == CUBLASTEST_PASSED ? 0 : 1); opts.N = nTimes; // Run with and without streams and then batched. The batched functions are a // feature new feature in 4.1 #if CUDART_VERSION >= 4010 for (int ii = 0; ii < 3; ii++) { #else for (int ii = 0; ii < 2; ii++) { #endif switch (ii) { case 0: opts.test_method = tmRegular; fprintf(stdout, "\n ==== Running N=%d without streams ==== \n\n", opts.N); break; case 1: opts.test_method = tmStream; fprintf(stdout, "\n ==== Running N=%d with streams ==== \n\n", opts.N); break; case 2: opts.test_method = tmBatched; fprintf(stdout, "\n ==== Running N=%d batched ==== \n\n", opts.N); break; } // Run single version *(opts.elem_type) = 's'; status = test_gemm_loop(opts, (float)CUBLAS_SGEMM_MAX_ULP_ERR, (double)CUBLAS_SGEMM_MAX_RELATIVE_ERR, handle); nTotalErrors += (status == CUBLASTEST_PASSED ? 0 : 1); // Run Double version *(opts.elem_type) = 'd'; // Test doesn't meet minSpec, will will wave the DP test if (getDeviceVersion() < DEV_VER_DBL_SUPPORT) { fprintf(stdout, "@@@@ dgemm test WAIVED due to lack of DP support\n"); exit(EXIT_WAIVED); } else { status = test_gemm_loop(opts, (float)CUBLAS_DGEMM_MAX_ULP_ERR, (double)CUBLAS_DGEMM_MAX_RELATIVE_ERR, handle); nTotalErrors += (status == CUBLASTEST_PASSED ? 0 : 1); } } cublasDestroy(handle); printf("\nTest Summary\n"); printf("%d error(s)\n", nTotalErrors); exit(nTotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); }