mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2025-01-19 02:15:47 +08:00
Removing stray cpp from master
This commit is contained in:
parent
8004ad59ab
commit
5688ee0013
|
@ -1,251 +0,0 @@
|
|||
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
* contributors may be used to endorse or promote products derived
|
||||
* from this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <cuda.h>
#include <nvJitLink.h>
#include <nvrtc.h>

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
|
||||
|
||||
// Threads per block: passed as the x block dimension of the saxpy launch.
#define NUM_THREADS 128
|
||||
// Blocks in the grid: passed as the x grid dimension of the saxpy launch.
// The host and device buffers hold NUM_THREADS * NUM_BLOCKS floats.
#define NUM_BLOCKS 32
|
||||
|
||||
// Evaluates the NVRTC call `x` exactly once. If it does not return
// NVRTC_SUCCESS, prints the stringified call together with
// nvrtcGetErrorString() to stderr and terminates the process with status 1.
//
// The argument is parenthesized and the macro-local uses a reserved-looking
// name so that a call expression mentioning a caller variable called
// `result` is not captured by the macro's own temporary.
#define NVRTC_SAFE_CALL(x)                                        \
  do {                                                            \
    nvrtcResult nvrtcSafeCallStatus_ = (x);                       \
    if (nvrtcSafeCallStatus_ != NVRTC_SUCCESS) {                  \
      std::cerr << "\nerror: " #x " failed with error "           \
                << nvrtcGetErrorString(nvrtcSafeCallStatus_)      \
                << '\n';                                          \
      exit(1);                                                    \
    }                                                             \
  } while(0)
|
||||
// Evaluates the CUDA driver API call `x` exactly once. If it does not return
// CUDA_SUCCESS, prints the stringified call and the error name to stderr and
// terminates the process with status 1.
//
// The name pointer starts out NULL and the result of cuGetErrorName() is
// checked: for an unrecognized error code the lookup fails and leaves no
// usable string, and streaming a null `const char *` is undefined behavior.
// In that case a fixed placeholder is printed instead.
#define CUDA_SAFE_CALL(x)                                         \
  do {                                                            \
    CUresult cuSafeCallStatus_ = (x);                             \
    if (cuSafeCallStatus_ != CUDA_SUCCESS) {                      \
      const char *cuSafeCallName_ = NULL;                         \
      if (cuGetErrorName(cuSafeCallStatus_, &cuSafeCallName_)     \
              != CUDA_SUCCESS ||                                  \
          cuSafeCallName_ == NULL) {                              \
        cuSafeCallName_ = "unknown CUDA error";                   \
      }                                                           \
      std::cerr << "\nerror: " #x " failed with error "           \
                << cuSafeCallName_ << '\n';                       \
      exit(1);                                                    \
    }                                                             \
  } while(0)
|
||||
// Evaluates the nvJitLink call `x` exactly once. If it does not return
// NVJITLINK_SUCCESS, prints the stringified call and the numeric result to
// stderr, then tries to fetch and print the error log attached to handle
// `h`, and finally terminates the process with status 1.
//
// Differences from a naive version:
//  - the log buffer is released on every path, not only when the log could
//    be retrieved;
//  - the allocation is checked before use;
//  - one extra, pre-zeroed byte is allocated (matching the logSize+1 used in
//    main) so the buffer is NUL-terminated whether or not the reported size
//    counts the terminator;
//  - macro-locals use names unlikely to collide with caller variables.
#define NVJITLINK_SAFE_CALL(h,x)                                  \
  do {                                                            \
    nvJitLinkResult jlSafeCallStatus_ = (x);                      \
    if (jlSafeCallStatus_ != NVJITLINK_SUCCESS) {                 \
      std::cerr << "\nerror: " #x " failed with error "           \
                << jlSafeCallStatus_ << '\n';                     \
      size_t jlSafeCallLogSize_ = 0;                              \
      jlSafeCallStatus_ =                                         \
          nvJitLinkGetErrorLogSize((h), &jlSafeCallLogSize_);     \
      if (jlSafeCallStatus_ == NVJITLINK_SUCCESS &&               \
          jlSafeCallLogSize_ > 0) {                               \
        char *jlSafeCallLog_ =                                    \
            (char*)malloc(jlSafeCallLogSize_ + 1);                \
        if (jlSafeCallLog_ != NULL) {                             \
          jlSafeCallLog_[jlSafeCallLogSize_] = '\0';              \
          jlSafeCallStatus_ =                                     \
              nvJitLinkGetErrorLog((h), jlSafeCallLog_);          \
          if (jlSafeCallStatus_ == NVJITLINK_SUCCESS) {           \
            std::cerr << "error log: " << jlSafeCallLog_ << '\n'; \
          }                                                       \
          free(jlSafeCallLog_);                                   \
        }                                                         \
      }                                                           \
      exit(1);                                                    \
    }                                                             \
  } while(0)
|
||||
|
||||
// Device source for the saxpy kernel, compiled at run time with NVRTC.
// It only declares compute(); the definition lives in lto_compute and is
// resolved when the two LTO IR modules are linked.
//
// The kernel is launched with a 1-D grid of 1-D blocks, one element per
// thread; threads whose index is >= n do nothing. The index is widened to
// size_t before the multiply so that blockIdx.x * blockDim.x cannot wrap in
// 32-bit arithmetic for large launches.
const char *lto_saxpy =
    "\n"
    "extern __device__ float compute(float a, float x, float y);\n"
    "\n"
    "extern \"C\" __global__\n"
    "void saxpy(float a, float *x, float *y, float *out, size_t n)\n"
    "{\n"
    "  size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;\n"
    "  if (tid < n) {\n"
    "    out[tid] = compute(a, x[tid], y[tid]);\n"
    "  }\n"
    "}\n";
|
||||
|
||||
// Device source defining compute(a, x, y) = a * x + y. It is compiled to
// LTO IR on its own and linked against the saxpy kernel, which only
// declares this function.
const char *lto_compute =
    " \n"
    "__device__ float compute(float a, float x, float y) { \n"
    "return a * x + y; \n"
    "} \n";
|
||||
|
||||
// compile code into LTOIR, returning the IR and its size
|
||||
static void getLTOIR (const char *code, const char *name,
|
||||
char **ltoIR, size_t *ltoIRSize)
|
||||
{
|
||||
// Create an instance of nvrtcProgram with the code string.
|
||||
nvrtcProgram prog;
|
||||
NVRTC_SAFE_CALL(
|
||||
nvrtcCreateProgram(&prog, // prog
|
||||
code, // buffer
|
||||
name, // name
|
||||
0, // numHeaders
|
||||
NULL, // headers
|
||||
NULL)); // includeNames
|
||||
|
||||
// specify that LTO IR should be generated for LTO operation
|
||||
const char *opts[] = {"-dlto",
|
||||
"--relocatable-device-code=true"};
|
||||
nvrtcResult compileResult = nvrtcCompileProgram(prog, // prog
|
||||
2, // numOptions
|
||||
opts); // options
|
||||
// Obtain compilation log from the program.
|
||||
size_t logSize;
|
||||
NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize));
|
||||
char *log = new char[logSize];
|
||||
NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log));
|
||||
std::cout << log << '\n';
|
||||
delete[] log;
|
||||
if (compileResult != NVRTC_SUCCESS) {
|
||||
exit(1);
|
||||
}
|
||||
// Obtain generated LTO IR from the program.
|
||||
NVRTC_SAFE_CALL(nvrtcGetLTOIRSize(prog, ltoIRSize));
|
||||
*ltoIR = new char[*ltoIRSize];
|
||||
NVRTC_SAFE_CALL(nvrtcGetLTOIR(prog, *ltoIR));
|
||||
// Destroy the program.
|
||||
NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog));
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
char *ltoIR1;
|
||||
char *ltoIR2;
|
||||
size_t ltoIR1Size;
|
||||
size_t ltoIR2Size;
|
||||
// getLTOIR uses nvrtc to get the LTOIR.
|
||||
// We could also use nvcc offline with -dlto -fatbin
|
||||
// to generate the IR, but using nvrtc keeps the build simpler.
|
||||
getLTOIR(lto_saxpy, "lto_saxpy.cu", <oIR1, <oIR1Size);
|
||||
getLTOIR(lto_compute, "lto_compute.cu", <oIR2, <oIR2Size);
|
||||
|
||||
CUdevice cuDevice;
|
||||
CUcontext context;
|
||||
CUmodule module;
|
||||
CUfunction kernel;
|
||||
CUDA_SAFE_CALL(cuInit(0));
|
||||
CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, 0));
|
||||
CUDA_SAFE_CALL(cuCtxCreate(&context, 0, cuDevice));
|
||||
|
||||
// Dynamically determine the arch to link for
|
||||
int major = 0;
|
||||
int minor = 0;
|
||||
CUDA_SAFE_CALL(cuDeviceGetAttribute(&major,
|
||||
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
|
||||
CUDA_SAFE_CALL(cuDeviceGetAttribute(&minor,
|
||||
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
|
||||
int arch = major*10 + minor;
|
||||
char smbuf[16];
|
||||
memset(smbuf,0,16);
|
||||
sprintf(smbuf, "-arch=sm_%d", arch);
|
||||
|
||||
// Load the generated LTO IR and link them together
|
||||
nvJitLinkHandle handle;
|
||||
const char *lopts[] = {"-lto", smbuf};
|
||||
NVJITLINK_SAFE_CALL(handle, nvJitLinkCreate(&handle, 2, lopts));
|
||||
|
||||
NVJITLINK_SAFE_CALL(handle, nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR,
|
||||
(void *)ltoIR1, ltoIR1Size, "lto_saxpy"));
|
||||
NVJITLINK_SAFE_CALL(handle, nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR,
|
||||
(void *)ltoIR2, ltoIR2Size, "lto_compute"));
|
||||
|
||||
// The call to nvJitLinkComplete causes linker to link together the two
|
||||
// LTO IR modules, do optimization on the linked LTO IR,
|
||||
// and generate cubin from it.
|
||||
NVJITLINK_SAFE_CALL(handle, nvJitLinkComplete(handle));
|
||||
|
||||
// check error log
|
||||
size_t logSize;
|
||||
NVJITLINK_SAFE_CALL(handle, nvJitLinkGetErrorLogSize(handle, &logSize));
|
||||
if (logSize > 0) {
|
||||
char *log = (char*)malloc(logSize+1);
|
||||
NVJITLINK_SAFE_CALL(handle, nvJitLinkGetErrorLog(handle, log));
|
||||
std::cout << "Error log: " << log << std::endl;
|
||||
free(log);
|
||||
}
|
||||
|
||||
// get linked cubin
|
||||
size_t cubinSize;
|
||||
NVJITLINK_SAFE_CALL(handle, nvJitLinkGetLinkedCubinSize(handle, &cubinSize));
|
||||
void *cubin = malloc(cubinSize);
|
||||
NVJITLINK_SAFE_CALL(handle, nvJitLinkGetLinkedCubin(handle, cubin));
|
||||
|
||||
NVJITLINK_SAFE_CALL(handle, nvJitLinkDestroy(&handle));
|
||||
delete[] ltoIR1;
|
||||
delete[] ltoIR2;
|
||||
|
||||
// cubin is linked, so now load it
|
||||
CUDA_SAFE_CALL(cuModuleLoadData(&module, cubin));
|
||||
CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, "saxpy"));
|
||||
|
||||
// Generate input for execution, and create output buffers.
|
||||
size_t n = NUM_THREADS * NUM_BLOCKS;
|
||||
size_t bufferSize = n * sizeof(float);
|
||||
float a = 5.1f;
|
||||
float *hX = new float[n], *hY = new float[n], *hOut = new float[n];
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
hX[i] = static_cast<float>(i);
|
||||
hY[i] = static_cast<float>(i * 2);
|
||||
}
|
||||
CUdeviceptr dX, dY, dOut;
|
||||
CUDA_SAFE_CALL(cuMemAlloc(&dX, bufferSize));
|
||||
CUDA_SAFE_CALL(cuMemAlloc(&dY, bufferSize));
|
||||
CUDA_SAFE_CALL(cuMemAlloc(&dOut, bufferSize));
|
||||
CUDA_SAFE_CALL(cuMemcpyHtoD(dX, hX, bufferSize));
|
||||
CUDA_SAFE_CALL(cuMemcpyHtoD(dY, hY, bufferSize));
|
||||
// Execute SAXPY.
|
||||
void *args[] = { &a, &dX, &dY, &dOut, &n };
|
||||
CUDA_SAFE_CALL(
|
||||
cuLaunchKernel(kernel,
|
||||
NUM_BLOCKS, 1, 1, // grid dim
|
||||
NUM_THREADS, 1, 1, // block dim
|
||||
0, NULL, // shared mem and stream
|
||||
args, 0)); // arguments
|
||||
CUDA_SAFE_CALL(cuCtxSynchronize());
|
||||
// Retrieve and print output.
|
||||
CUDA_SAFE_CALL(cuMemcpyDtoH(hOut, dOut, bufferSize));
|
||||
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
std::cout << a << " * " << hX[i] << " + " << hY[i]
|
||||
<< " = " << hOut[i] << '\n';
|
||||
}
|
||||
// check last value to verify
|
||||
if (hOut[n-1] == 29074.5) {
|
||||
std::cout << "PASSED!\n";
|
||||
} else {
|
||||
std::cout << "values not expected?\n";
|
||||
}
|
||||
// Release resources.
|
||||
CUDA_SAFE_CALL(cuMemFree(dX));
|
||||
CUDA_SAFE_CALL(cuMemFree(dY));
|
||||
CUDA_SAFE_CALL(cuMemFree(dOut));
|
||||
CUDA_SAFE_CALL(cuModuleUnload(module));
|
||||
CUDA_SAFE_CALL(cuCtxDestroy(context));
|
||||
free(cubin);
|
||||
delete[] hX;
|
||||
delete[] hY;
|
||||
delete[] hOut;
|
||||
return 0;
|
||||
}
|
Loading…
Reference in New Issue
Block a user