diff --git a/run_tests.py b/run_tests.py
index 98b7dee1..d14c6df7 100644
--- a/run_tests.py
+++ b/run_tests.py
@@ -123,13 +123,44 @@ def run_single_test_instance(executable, args, output_file, global_args, run_des
         safe_print(f"Error running {exe_name} {run_description}: {str(e)}")
         return {"name": exe_name, "description": run_description, "return_code": -1, "status": f"Error: {str(e)}"}
 
-def run_test(executable, output_dir, args_config, global_args=None):
-    """Deprecated: This function is replaced by the parallel execution logic in main."""
-    # This function is no longer called directly by the main logic.
-    # It remains here temporarily in case it's needed for reference or single-threaded debugging.
-    # The core logic is now in run_single_test_instance and managed by ThreadPoolExecutor.
-    print("Warning: run_test function called directly - this indicates an issue in the refactoring.")
-    return 1  # Indicate failure if called
+def get_gpu_count():
+    """Return the number of NVIDIA GPUs visible on the system.
+
+    The function first tries the `nvidia-smi` CLI, which should be
+    available on most systems with a CUDA-capable driver installed. If the
+    command is not present or fails, we fall back to checking the
+    CUDA_VISIBLE_DEVICES environment variable. The fallback is conservative:
+    if the GPU count cannot be determined, we assume 0."""
+
+    # Try the recommended NVML/nvidia-smi approach first
+    try:
+        smi = subprocess.run(
+            ["nvidia-smi", "-L"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.DEVNULL,
+            text=True,
+            check=False,
+        )
+        if smi.returncode == 0:
+            # Each GPU is reported on its own line that starts with "GPU 0:" etc.
+            gpu_lines = [ln for ln in smi.stdout.strip().splitlines() if ln.strip().lower().startswith("gpu ")]
+            if gpu_lines:
+                return len(gpu_lines)
+    except FileNotFoundError:
+        # nvidia-smi is missing – e.g. WSL or a container without the driver
+        pass
+    except Exception:
+        # Any unexpected error – treat the count as unknown → 0
+        pass
+
+    # Fallback: infer from CUDA_VISIBLE_DEVICES if it is set and non-empty
+    visible = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip()
+    if visible and visible not in {"no", "none"}:
+        # Handles a comma-separated list like "0,1,2" as well as single values
+        return len([v for v in visible.split(',') if v])
+
+    # Unable to determine, assume no GPUs
+    return 0
 
 def main():
     parser = argparse.ArgumentParser(description='Run all executables and capture output')
@@ -149,6 +180,14 @@ def main():
     # Load arguments configuration
     args_config = load_args_config(args.config)
 
+    # Determine how many GPUs are available
+    gpu_count = get_gpu_count()
+    if gpu_count == 0:
+        print("No NVIDIA GPU detected – cannot run CUDA samples. Exiting.")
+        return 1
+    else:
+        print(f"Detected {gpu_count} GPU(s).")
+
     executables = find_executables(args.dir)
     if not executables:
         print("No executables found!")
@@ -167,6 +206,14 @@ def main():
             safe_print(f"Skipping {exe_name} (marked as skip in config)")
             continue
 
+        # Skip if the sample requires more GPUs than are available
+        required_gpus = args_config.get(base_name, {}).get("min_gpus", 1)
+        if required_gpus > gpu_count:
+            safe_print(
+                f"Skipping {exe_name} (requires {required_gpus} GPU(s), only {gpu_count} available)"
+            )
+            continue
+
         arg_sets_configs = []
         if base_name in args_config:
             config = args_config[base_name]
diff --git a/test_args.json b/test_args.json
index 04b79549..ec5fd0c3 100644
--- a/test_args.json
+++ b/test_args.json
@@ -325,5 +325,23 @@
         ]
       }
     ]
+  },
+  "simpleP2P": {
+    "min_gpus": 2
+  },
+  "conjugateGradientMultiDeviceCG": {
+    "min_gpus": 2
+  },
+  "simpleCUFFT_2d_MGPU": {
+    "min_gpus": 2
+  },
+  "simpleCUFFT_MGPU": {
+    "min_gpus": 2
+  },
+  "streamOrderedAllocationP2P": {
+    "min_gpus": 2
+  },
+  "EGLStream_CUDA_CrossGPU": {
+    "min_gpus": 2
   }
 }