cudaTensorCoreGemm -maxrregcount=255 cudaMallocManaged cudaDeviceSynchronize cudaFuncSetAttribute cudaEventCreate cudaEventRecord cudaEventSynchronize cudaEventElapsedTime cudaFree whole ./ ../ ../../common/inc Matrix Multiply WMMA Tensor Cores true cudaTensorCoreGemm.cu 1:CUDA Basic Topics sm70 sm75 x86_64 linux windows7 ppc64le linux 7.0 CUDA Tensor Core GEMM exe