mirror of
https://github.com/NVIDIA/cuda-samples.git
synced 2025-07-02 13:20:30 +08:00
Compare commits
6 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
8a9e2c830c | ||
|
adacf1cffd | ||
|
da3b7a2b3c | ||
|
5987a9e9fa | ||
|
107f3f537f | ||
|
b530f1cf42 |
@ -1,9 +1,5 @@
|
|||||||
# 1. Utilities
|
# 1. Utilities
|
||||||
|
|
||||||
|
|
||||||
### [bandwidthTest](./bandwidthTest)
|
|
||||||
This is a simple test program to measure the memcopy bandwidth of the GPU and memcpy bandwidth across PCI-e. This test application is capable of measuring device to device copy bandwidth, host to device copy bandwidth for pageable and page-locked memory, and device to host copy bandwidth for pageable and page-locked memory.
|
|
||||||
|
|
||||||
### [deviceQuery](./deviceQuery)
|
### [deviceQuery](./deviceQuery)
|
||||||
This sample enumerates the properties of the CUDA devices present in the system.
|
This sample enumerates the properties of the CUDA devices present in the system.
|
||||||
|
|
||||||
@ -12,3 +8,8 @@ This sample enumerates the properties of the CUDA devices present using CUDA Dri
|
|||||||
|
|
||||||
### [topologyQuery](./topologyQuery)
|
### [topologyQuery](./topologyQuery)
|
||||||
A simple example of how to query the topology of a system with multiple GPUs.
|
A simple example of how to query the topology of a system with multiple GPUs.
|
||||||
|
|
||||||
|
## Note
|
||||||
|
|
||||||
|
### bandwidthTest
|
||||||
|
The bandwidthTest sample was out-of-date and has been removed as of the CUDA Samples 12.9 release (see the [change log](../../CHANGELOG.md)). For up-to-date bandwidth measurements, refer instead to the [NVBandwidth](https://github.com/nvidia/nvbandwidth) utility.
|
||||||
|
@ -34,8 +34,10 @@
|
|||||||
#include <vulkan/vulkan.h>
|
#include <vulkan/vulkan.h>
|
||||||
#ifdef _WIN64
|
#ifdef _WIN64
|
||||||
#define NOMINMAX
|
#define NOMINMAX
|
||||||
#include <vulkan/vulkan_win32.h>
|
// Include windows.h first; the Windows-specific headers below depend on it
|
||||||
#include <windows.h>
|
#include <windows.h>
|
||||||
|
// Include vulkan_win32.h only after windows.h
|
||||||
|
#include <vulkan/vulkan_win32.h>
|
||||||
#endif /* _WIN64 */
|
#endif /* _WIN64 */
|
||||||
|
|
||||||
/* remove _VK_TIMELINE_SEMAPHORE to use binary semaphores */
|
/* remove _VK_TIMELINE_SEMAPHORE to use binary semaphores */
|
||||||
|
@ -34,8 +34,10 @@
|
|||||||
#include <vulkan/vulkan.h>
|
#include <vulkan/vulkan.h>
|
||||||
#ifdef _WIN64
|
#ifdef _WIN64
|
||||||
#define NOMINMAX
|
#define NOMINMAX
|
||||||
#include <vulkan/vulkan_win32.h>
|
// Include windows.h first, since the other Windows headers depend on it
|
||||||
#include <windows.h>
|
#include <windows.h>
|
||||||
|
// Add other Windows headers
|
||||||
|
#include <vulkan/vulkan_win32.h>
|
||||||
#endif /* _WIN64 */
|
#endif /* _WIN64 */
|
||||||
|
|
||||||
struct GLFWwindow;
|
struct GLFWwindow;
|
||||||
|
@ -27,10 +27,12 @@
|
|||||||
|
|
||||||
#define GLFW_INCLUDE_VULKAN
|
#define GLFW_INCLUDE_VULKAN
|
||||||
#ifdef _WIN64
|
#ifdef _WIN64
|
||||||
|
// Include windows.h first, since the other Windows headers depend on it
|
||||||
|
#include <windows.h>
|
||||||
|
// Add other Windows headers
|
||||||
#include <VersionHelpers.h>
|
#include <VersionHelpers.h>
|
||||||
#include <aclapi.h>
|
#include <aclapi.h>
|
||||||
#include <dxgi1_2.h>
|
#include <dxgi1_2.h>
|
||||||
#include <windows.h>
|
|
||||||
#define _USE_MATH_DEFINES
|
#define _USE_MATH_DEFINES
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -103,7 +103,7 @@ __global__ void copySharedMem(float *odata, float *idata, int width, int height)
|
|||||||
|
|
||||||
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
|
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
|
||||||
if (xIndex < width && yIndex < height) {
|
if (xIndex < width && yIndex < height) {
|
||||||
tile[threadIdx.y][threadIdx.x] = idata[index];
|
tile[threadIdx.y + i][threadIdx.x] = idata[index + i * width];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -111,7 +111,7 @@ __global__ void copySharedMem(float *odata, float *idata, int width, int height)
|
|||||||
|
|
||||||
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
|
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) {
|
||||||
if (xIndex < height && yIndex < width) {
|
if (xIndex < height && yIndex < width) {
|
||||||
odata[index] = tile[threadIdx.y][threadIdx.x];
|
odata[index + i * width] = tile[threadIdx.y + i][threadIdx.x];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -596,6 +596,17 @@ int main(int argc, char **argv)
|
|||||||
(size_x * size_y),
|
(size_x * size_y),
|
||||||
1,
|
1,
|
||||||
TILE_DIM * BLOCK_ROWS);
|
TILE_DIM * BLOCK_ROWS);
|
||||||
|
|
||||||
|
// Reset d_odata to zero before starting the next loop iteration to avoid
|
||||||
|
// carrying over results from previous kernels. Without this reset, residual
|
||||||
|
// data from a prior kernel (e.g., 'copy') could make a subsequent
|
||||||
|
// kernel (e.g., 'copySharedMem') appear correct even if it performs no work,
|
||||||
|
// leading to false positives in compareData.
|
||||||
|
for (int i = 0; i < (size_x * size_y); ++i) {
|
||||||
|
h_odata[i] = 0;
|
||||||
|
}
|
||||||
|
// copy host data to device
|
||||||
|
checkCudaErrors(cudaMemcpy(d_odata, h_odata, mem_size, cudaMemcpyHostToDevice));
|
||||||
}
|
}
|
||||||
|
|
||||||
// cleanup
|
// cleanup
|
||||||
|
Loading…
x
Reference in New Issue
Block a user