reduction --std=c++11 cudaFree cudaDeviceSynchronize cudaMalloc cudaSetDevice cudaMemcpy cudaGetDeviceProperties cudaGetDevice whole ./ ../ ../../../Common Data-Parallel Algorithms Performance Strategies CUDA GPGPU Parallel Reduction CPP11 true reduction.cpp -kernel 0 -kernel 1 -kernel 2 -kernel 3 -kernel 4 -kernel 5 -kernel 6 CPP11 1:CUDA Advanced Topics 1:Data-Parallel Algorithms 1:Performance Strategies sm35 sm37 sm50 sm52 sm53 sm60 sm61 sm70 sm72 sm75 sm80 sm86 sm87 x86_64 linux windows7 x86_64 macosx arm sbsa ppc64le linux all CUDA Parallel Reduction exe