Bug 5809031: Explicitly ensure P2P is disabled before fallback detection

This commit is contained in:
Shawn Zeng 2026-01-26 11:37:52 +08:00
parent 5bbae96fd3
commit 9b7a6f907a

View File

@@ -124,16 +124,26 @@ bool detectFallback(int numGPUs)
cudaStream_t s; cudaStream_t s;
cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking); cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking);
size_t testElems = 1; // 比如 1M ints (4MB) size_t testElems = 1;
cudaMalloc(&tmp0, testElems * sizeof(int)); cudaMalloc(&tmp0, testElems * sizeof(int));
cudaSetDevice(1); cudaSetDevice(1);
cudaMalloc(&tmp1, testElems * sizeof(int)); cudaMalloc(&tmp1, testElems * sizeof(int));
cudaCheckError(); cudaCheckError();
// Explicitly ensure P2P is disabled for this test
// (Clear any pre-existing P2P access if it happens to be enabled)
cudaSetDevice(0);
cudaDeviceDisablePeerAccess(1);
cudaGetLastError(); // Clear error if peer access was not enabled
cudaSetDevice(1);
cudaDeviceDisablePeerAccess(0);
cudaGetLastError(); // Clear error if peer access was not enabled
bool needsFallback = false; bool needsFallback = false;
cudaError_t testErr = cudaMemcpyPeerAsync(tmp1, 1, tmp0, 0, testElems * sizeof(int), s); cudaError_t testErr = cudaMemcpyPeerAsync(tmp1, 1, tmp0, 0, testElems * sizeof(int), s);
if (testErr == cudaErrorNotSupported) { if (testErr == cudaErrorPeerAccessNotEnabled || testErr == cudaErrorNotSupported) {
needsFallback = true; needsFallback = true;
printf("Note: cudaMemcpyPeerAsync reported '%s' - will use host-mediated copy when P2P is disabled\n", printf("Note: cudaMemcpyPeerAsync reported '%s' - will use host-mediated copy when P2P is disabled\n",
cudaGetErrorString(testErr)); cudaGetErrorString(testErr));
@@ -208,7 +218,7 @@ void performP2PCopy(int *dest,
cudaCheckError(); cudaCheckError();
} }
else { else {
// Use cudaMemcpyPeerAsync (original behavior) // Use cudaMemcpyPeerAsync
for (int r = 0; r < repeat; r++) { for (int r = 0; r < repeat; r++) {
cudaMemcpyPeerAsync(dest, destDevice, src, srcDevice, sizeof(int) * num_elems, streamToRun); cudaMemcpyPeerAsync(dest, destDevice, src, srcDevice, sizeof(int) * num_elems, streamToRun);
} }