Some samples require multiple GPUs. Update 'run_tests.py' to skip them on single- or no-GPU systems.

Rob Armstrong 2025-04-30 09:45:20 -07:00
parent ee15cc0fe2
commit c14a0114d6
2 changed files with 72 additions and 7 deletions

run_tests.py

@@ -123,13 +123,44 @@ def run_single_test_instance(executable, args, output_file, global_args, run_des
        safe_print(f"Error running {exe_name} {run_description}: {str(e)}")
        return {"name": exe_name, "description": run_description, "return_code": -1, "status": f"Error: {str(e)}"}

def run_test(executable, output_dir, args_config, global_args=None):
    """Deprecated: This function is replaced by the parallel execution logic in main."""
    # This function is no longer called directly by the main logic.
    # It remains here temporarily in case it's needed for reference or single-threaded debugging.
    # The core logic is now in run_single_test_instance and managed by ThreadPoolExecutor.
    print("Warning: run_test function called directly - this indicates an issue in the refactoring.")
    return 1  # Indicate failure if called

def get_gpu_count():
    """Return the number of NVIDIA GPUs visible on the system.

    The function first tries to use the `nvidia-smi` CLI, which should be
    available on most systems with a CUDA-capable driver installed. If the
    command is not present or fails, we fall back to checking the
    CUDA_VISIBLE_DEVICES environment variable. The fallback is conservative:
    if we cannot determine the GPU count, we assume 0."""
    # Try the recommended NVML/nvidia-smi approach first
    try:
        smi = subprocess.run(
            ["nvidia-smi", "-L"],
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            text=True,
            check=False,
        )
        if smi.returncode == 0:
            # Each GPU is reported on its own line that starts with "GPU 0:", "GPU 1:", etc.
            gpu_lines = [ln for ln in smi.stdout.strip().splitlines() if ln.strip().lower().startswith("gpu ")]
            if gpu_lines:
                return len(gpu_lines)
    except FileNotFoundError:
        # nvidia-smi is missing: maybe WSL, no driver inside a container, etc.
        pass
    except Exception:
        # Any unexpected error: treat the count as unknown → 0
        pass

    # Fallback: attempt to infer from CUDA_VISIBLE_DEVICES if it is set and not empty
    visible = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip()
    if visible and visible not in {"no", "none"}:
        # Handles comma-separated lists like "0,1,2" as well as single values
        return len([v for v in visible.split(',') if v])

    # Unable to determine; assume no GPUs
    return 0

def main():
    parser = argparse.ArgumentParser(description='Run all executables and capture output')
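For comparison, the same count can be obtained without shelling out to nvidia-smi by querying NVML directly. A minimal sketch, assuming the optional pynvml package (not a dependency of this script), with the same conservative fallback to 0:

    # Hypothetical NVML-based alternative to get_gpu_count(); requires pynvml.
    def get_gpu_count_nvml():
        try:
            import pynvml
            pynvml.nvmlInit()
            try:
                return pynvml.nvmlDeviceGetCount()
            finally:
                pynvml.nvmlShutdown()
        except Exception:
            # NVML unavailable (no driver, pynvml not installed, etc.) → unknown → 0
            return 0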
@@ -149,6 +180,14 @@ def main():
    # Load arguments configuration
    args_config = load_args_config(args.config)

    # Determine how many GPUs are available
    gpu_count = get_gpu_count()
    if gpu_count == 0:
        print("No NVIDIA GPU detected; cannot run CUDA samples. Exiting.")
        return 1
    else:
        print(f"Detected {gpu_count} GPU(s).")

    executables = find_executables(args.dir)
    if not executables:
        print("No executables found!")
@@ -167,6 +206,14 @@ def main():
            safe_print(f"Skipping {exe_name} (marked as skip in config)")
            continue

        # Skip if the sample requires more GPUs than available
        required_gpus = args_config.get(base_name, {}).get("min_gpus", 1)
        if required_gpus > gpu_count:
            safe_print(
                f"Skipping {exe_name} (requires {required_gpus} GPU(s), only {gpu_count} available)"
            )
            continue

        arg_sets_configs = []
        if base_name in args_config:
            config = args_config[base_name]
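Note that the lookup chains two dict.get calls, so a sample with no config entry, or an entry without a min_gpus key, defaults to requiring a single GPU. A small self-contained illustration (vectorAdd and unlistedSample are hypothetical names used only for this example):

    args_config = {
        "simpleP2P": {"min_gpus": 2},  # multi-GPU entry, as in the config change below
        "vectorAdd": {},               # hypothetical entry without a min_gpus key
    }
    gpu_count = 1

    for base_name in ("simpleP2P", "vectorAdd", "unlistedSample"):
        required_gpus = args_config.get(base_name, {}).get("min_gpus", 1)
        print(base_name, "skipped" if required_gpus > gpu_count else "runs")
    # simpleP2P skipped / vectorAdd runs / unlistedSample runs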


@@ -325,5 +325,23 @@
                ]
            }
        ]
    },
    "simpleP2P": {
        "min_gpus": 2
    },
    "conjugateGradientMultiDeviceCG": {
        "min_gpus": 2
    },
    "simpleCUFFT_2d_MGPU": {
        "min_gpus": 2
    },
    "simpleCUFFT_MGPU": {
        "min_gpus": 2
    },
    "streamOrderedAllocationP2P": {
        "min_gpus": 2
    },
    "EGLStream_CUDA_CrossGPU": {
        "min_gpus": 2
    }
}
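A quick way to sanity-check that the six new entries parse as intended. A sketch, assuming the config is valid JSON at whatever path args.config points to (the file name below is only a placeholder):

    import json

    with open("test_args.json") as f:  # placeholder path; use the file passed via args.config
        cfg = json.load(f)

    multi_gpu = {name: entry["min_gpus"]
                 for name, entry in cfg.items()
                 if isinstance(entry, dict) and "min_gpus" in entry}
    print(multi_gpu)  # expect all six samples above, each mapped to 2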