cuda-samples/run_tests.py

308 lines
12 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

## Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions
## are met:
## * Redistributions of source code must retain the above copyright
## notice, this list of conditions and the following disclaimer.
## * Redistributions in binary form must reproduce the above copyright
## notice, this list of conditions and the following disclaimer in the
## documentation and/or other materials provided with the distribution.
## * Neither the name of NVIDIA CORPORATION nor the names of its
## contributors may be used to endorse or promote products derived
## from this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
## EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
## IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
## PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
## CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
## EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
## PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
## OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
## (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
## OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
##
## For additional information on the license terms, see the CUDA EULA at
## https://docs.nvidia.com/cuda/eula/index.html
import os
import sys
import json
import subprocess
import argparse
from pathlib import Path
import concurrent.futures
import threading
# Single lock shared by every caller of safe_print().
print_lock = threading.Lock()


def safe_print(*args, **kwargs):
    """Forward to print() while holding ``print_lock``.

    All positional and keyword arguments are passed to the built-in
    ``print`` unchanged. Calls made through this function are serialised,
    so output produced by concurrent workers is not interleaved.
    """
    print_lock.acquire()
    try:
        print(*args, **kwargs)
    finally:
        # Always release, even if print() raises (e.g. closed stream).
        print_lock.release()
def normalize_exe_name(name):
    """Return the platform-independent base name of an executable.

    The result is the final path component of *name* with its last suffix
    removed, so ``"sample.exe"`` and ``"sample"`` both map to ``"sample"``.
    Note that any trailing suffix is dropped, not only ``.exe``.
    """
    exe_path = Path(name)
    return exe_path.stem
def load_args_config(config_file):
    """Load the per-executable argument configuration from a JSON file.

    Args:
        config_file: Path to the JSON configuration file, or a falsy value
            (``None`` / ``""``) when no configuration was requested.

    Returns:
        The parsed top-level JSON object as a ``dict``. An empty dict is
        returned when no file was requested, the file does not exist or
        cannot be read, the content is not valid JSON, or the top-level
        JSON value is not an object. Problems are reported as warnings on
        stdout; this function does not raise for them.
    """
    if not config_file:
        return {}
    if not os.path.exists(config_file):
        # A path was given explicitly; staying silent would hide a typo and
        # make every sample run with default arguments.
        print(f"Warning: Config file not found: {config_file}")
        return {}

    try:
        # Decode explicitly instead of relying on the locale default, and use
        # the "-sig" variant so a leading UTF-8 BOM does not break parsing.
        with open(config_file, 'r', encoding='utf-8-sig') as f:
            config = json.load(f)
    except json.JSONDecodeError:
        print("Warning: Failed to parse config file as JSON")
        return {}
    except (OSError, ValueError, RecursionError) as e:
        # Unreadable file, undecodable bytes, or pathologically nested JSON.
        print(f"Warning: Error reading config file: {str(e)}")
        return {}

    # Validate the config format
    if not isinstance(config, dict):
        print("Warning: Config file must contain a dictionary/object")
        return {}
    return config
def _is_shared_library(path):
    """Return True when *path* is named like a shared library, versioned or not."""
    suffixes = [suffix.lower() for suffix in path.suffixes]
    if not suffixes:
        return False
    if suffixes[-1] in ('.dll', '.so', '.dylib'):
        return True
    if '.so' in suffixes:
        # Versioned ELF libraries (libfoo.so.12, libfoo.so.12.4.1) end in a
        # numeric suffix, so checking only the last suffix misses them.
        last_so = len(suffixes) - 1 - suffixes[::-1].index('.so')
        return all(suffix[1:].isdigit() for suffix in suffixes[last_so + 1:])
    return False


def find_executables(root_dir):
    """Find all executable files below *root_dir*, recursively.

    Args:
        root_dir: Directory to search (string or path-like).

    Returns:
        A sorted list of ``pathlib.Path`` objects, one per regular file for
        which ``os.access(path, os.X_OK)`` is true and whose name does not
        look like a shared library (``.dll``, ``.so``, ``.dylib``, or a
        versioned ``.so.N[.N...]``). Sorting makes the run order independent
        of the order in which the filesystem lists entries.
    """
    found = [
        path
        for path in Path(root_dir).rglob('*')
        if path.is_file()
        and os.access(path, os.X_OK)
        and not _is_shared_library(path)
    ]
    found.sort()
    return found
def run_single_test_instance(executable, args, output_file, global_args, run_description):
    """Run a single instance of a test executable with specific arguments.

    The executable is launched as ``./<name>`` from inside its own directory,
    with stdout and stderr both redirected to *output_file*. The run is
    aborted after 300 seconds.

    Args:
        executable: ``pathlib.Path`` of the executable to run.
        args: Iterable of arguments specific to this run.
        global_args: Optional iterable of arguments appended after *args*;
            ignored when falsy.
        output_file: Path of the file that receives the combined output
            (opened for writing, so existing content is replaced).
        run_description: Short label included in progress messages.

    Returns:
        A dict with keys ``name``, ``description``, ``return_code`` and
        ``status``. ``status`` is ``"Passed"`` for exit code 0, ``"Failed"``
        for any other exit code, ``"Timeout"`` when the time limit was hit,
        or ``"Error: ..."`` when the process could not be run; the last two
        cases report ``return_code`` -1. Exceptions are not propagated.
    """
    exe_path = str(executable)
    exe_name = executable.name
    # dirname() is "" for a bare relative path such as Path("sample"), which
    # is what rglob() yields for files directly under "."; an empty string is
    # not a usable working directory, so fall back to the current directory.
    work_dir = os.path.dirname(exe_path) or os.curdir
    safe_print(f"Starting {exe_name} {run_description}")
    try:
        cmd = [f"./{exe_name}"]
        # Config values come from JSON and may be numbers; the command line
        # must consist of strings only.
        cmd.extend(str(arg) for arg in args)
        if global_args:
            cmd.extend(str(arg) for arg in global_args)
        safe_print(f"    Command ({exe_name} {run_description}): {' '.join(cmd)}")
        # Run the executable in its own directory using cwd
        with open(output_file, 'w') as f:
            result = subprocess.run(
                cmd,
                stdout=f,
                stderr=subprocess.STDOUT,
                timeout=300,  # 5 minute timeout
                cwd=work_dir,  # Execute in the executable's directory
            )
        status = "Passed" if result.returncode == 0 else "Failed"
        safe_print(f"    Finished {exe_name} {run_description}: {status} (code {result.returncode})")
        return {"name": exe_name, "description": run_description, "return_code": result.returncode, "status": status}
    except subprocess.TimeoutExpired:
        safe_print(f"Error ({exe_name} {run_description}): Timed out after 5 minutes")
        return {"name": exe_name, "description": run_description, "return_code": -1, "status": "Timeout"}
    except Exception as e:
        # Worker boundary: report any launch/IO failure as a failed run
        # instead of letting it escape into the thread pool.
        safe_print(f"Error running {exe_name} {run_description}: {str(e)}")
        return {"name": exe_name, "description": run_description, "return_code": -1, "status": f"Error: {str(e)}"}
def _count_visible_devices(visible):
    """Count the devices named by a CUDA_VISIBLE_DEVICES-style string."""
    visible = visible.strip()
    if not visible or visible.lower() in {"no", "none"}:
        return 0
    count = 0
    for token in visible.split(','):
        token = token.strip()
        if not token:
            continue
        if token.startswith('-') and token[1:].isdigit():
            # A negative index (typically "-1") is invalid; CUDA exposes only
            # the devices listed before the first invalid entry.
            break
        count += 1
    return count


def get_gpu_count():
    """Return the number of NVIDIA GPUs visible on the system.

    The function first runs ``nvidia-smi -L`` and counts the output lines
    that start with ``GPU ``. If the command is missing, fails, or lists no
    GPU, it falls back to the ``CUDA_VISIBLE_DEVICES`` environment variable
    and counts the comma-separated entries that precede the first negative
    index. The fallback is conservative: if the GPU count cannot be
    determined, 0 is returned.

    Returns:
        The detected GPU count as an ``int`` (0 when unknown).
    """
    # Try the recommended NVML/nvidia-smi approach first
    try:
        smi = subprocess.run(
            ["nvidia-smi", "-L"],
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            text=True,
            check=False,
        )
    except (OSError, ValueError, subprocess.SubprocessError):
        # nvidia-smi is missing or unusable (e.g. WSL, or no driver inside a
        # container), or its output could not be decoded: treat the count as
        # unknown and use the environment fallback below.
        smi = None

    if smi is not None and smi.returncode == 0:
        # Each GPU is reported on its own line that starts with "GPU 0:" etc.
        gpu_lines = [
            line
            for line in smi.stdout.strip().splitlines()
            if line.strip().lower().startswith("gpu ")
        ]
        if gpu_lines:
            return len(gpu_lines)

    # Fallback: infer from CUDA_VISIBLE_DEVICES; unset or empty means 0.
    return _count_visible_devices(os.environ.get("CUDA_VISIBLE_DEVICES", ""))
def _config_entry(args_config, base_name):
    """Return the config dict for *base_name*, or {} when absent or malformed."""
    entry = args_config.get(base_name, {})
    if isinstance(entry, dict):
        return entry
    safe_print(f"Warning: Config entry for {base_name} must be an object; ignoring it")
    return {}


def _collect_run_configs(exe_name, base_name, config):
    """Return the list of run configurations (dicts with "args") for one executable."""
    run_configs = []
    any_run_skipped = False
    if "args" in config:
        if isinstance(config["args"], list):
            run_configs.append({"args": config["args"]})  # Wrap in dict for consistency
        else:
            safe_print(f"Warning: Arguments for {base_name} must be a list")
    elif "runs" in config:
        for i, run_config in enumerate(config["runs"]):
            if not isinstance(run_config, dict):
                safe_print(f"Warning: Run {i+1} for {base_name} must be an object")
                continue
            if run_config.get("skip", False):
                safe_print(f"Skipping run {i+1} for {exe_name} (marked as skip in config)")
                any_run_skipped = True
                continue
            if isinstance(run_config.get("args", []), list):
                run_configs.append(run_config)
            else:
                safe_print(f"Warning: Arguments for {base_name} run {i+1} must be a list")
    # If no specific args are defined, create one run with no args. Do not do
    # so when runs were explicitly skipped and none remain: falling back to a
    # default run would execute a sample the config asked to skip.
    if not run_configs and not any_run_skipped:
        run_configs.append({"args": []})
    return run_configs


def _build_tasks(executables, args_config, gpu_count, output_dir, global_args):
    """Expand executables and their config into one task dict per planned run."""
    tasks = []
    for exe in executables:
        exe_name = exe.name
        base_name = normalize_exe_name(exe_name)
        config = _config_entry(args_config, base_name)
        # Check if this executable should be skipped globally
        if config.get("skip", False):
            safe_print(f"Skipping {exe_name} (marked as skip in config)")
            continue
        # Skip if the sample requires more GPUs than available
        required_gpus = config.get("min_gpus", 1)
        if required_gpus > gpu_count:
            safe_print(
                f"Skipping {exe_name} (requires {required_gpus} GPU(s), only {gpu_count} available)"
            )
            continue
        run_configs = _collect_run_configs(exe_name, base_name, config)
        num_runs = len(run_configs)
        for i, run_config in enumerate(run_configs):
            # Multi-run executables get a numbered label and output file.
            run_desc = f"(run {i+1}/{num_runs})" if num_runs > 1 else ""
            run_suffix = f".run{i+1}" if num_runs > 1 else ""
            tasks.append({
                "executable": exe,
                "args": run_config.get("args", []),
                "output_file": os.path.abspath(f"{output_dir}/APM_{exe_name}{run_suffix}.txt"),
                "global_args": global_args,
                "description": run_desc,
            })
    return tasks


def _run_tasks(tasks, parallel):
    """Run all tasks on a thread pool and return the result dicts of failed runs."""
    failed = []
    total_runs = len(tasks)
    with concurrent.futures.ThreadPoolExecutor(max_workers=parallel) as executor:
        future_to_task = {
            executor.submit(
                run_single_test_instance,
                task["executable"],
                task["args"],
                task["output_file"],
                task["global_args"],
                task["description"],
            ): task
            for task in tasks
        }
        finished = concurrent.futures.as_completed(future_to_task)
        for completed_runs, future in enumerate(finished, start=1):
            task_info = future_to_task[future]
            safe_print(f"Progress: {completed_runs}/{total_runs} runs completed.")
            try:
                result = future.result()
            except Exception as exc:
                safe_print(f'Task {task_info["executable"].name} {task_info["description"]} generated an exception: {exc}')
                failed.append({
                    "name": task_info["executable"].name,
                    "description": task_info["description"],
                    "return_code": -1,
                    "status": f"Execution Exception: {exc}",
                })
            else:
                if result["return_code"] != 0:
                    failed.append(result)
    return failed


def main():
    """Discover executables, run them (optionally in parallel) and summarise.

    Command-line options: ``--dir`` (search root), ``--config`` (JSON file
    with per-executable settings), ``--output`` (directory for the
    ``APM_<name>[.runN].txt`` output files), ``--parallel`` (worker count,
    at least 1) and ``--args`` (arguments appended to every command).

    Per-executable config entries are keyed by the normalized executable
    name and may contain ``skip``, ``min_gpus``, and either ``args`` (a
    list) or ``runs`` (a list of objects with optional ``skip`` / ``args``).

    Returns:
        The process exit code: 0 when every run passed; 1 when no GPU or no
        executable was found; otherwise the return code of the first failed
        run whose code is not -1, or 1 if there is none.
    """
    parser = argparse.ArgumentParser(description='Run all executables and capture output')
    parser.add_argument('--dir', default='.', help='Root directory to search for executables')
    parser.add_argument('--config', help='JSON configuration file for executable arguments')
    parser.add_argument('--output', default='.',  # Default to current directory
                        help='Output directory for test results')
    parser.add_argument('--parallel', type=int, default=1, help='Number of parallel tests to run')
    parser.add_argument('--args', nargs=argparse.REMAINDER,
                        help='Global arguments to pass to all executables')
    args = parser.parse_args()
    if args.parallel < 1:
        # ThreadPoolExecutor rejects a non-positive worker count with a bare
        # ValueError; report it as a usage error instead.
        parser.error("--parallel must be at least 1")

    # Create output directory if it doesn't exist
    if args.output:
        os.makedirs(args.output, exist_ok=True)

    # Load arguments configuration
    args_config = load_args_config(args.config)

    # Determine how many GPUs are available
    gpu_count = get_gpu_count()
    if gpu_count == 0:
        print("No NVIDIA GPU detected cannot run CUDA samples. Exiting.")
        return 1
    print(f"Detected {gpu_count} GPU(s).")

    executables = find_executables(args.dir)
    if not executables:
        print("No executables found!")
        return 1
    print(f"Found {len(executables)} executables")
    print(f"Running tests with up to {args.parallel} parallel tasks")
    print("----------------------------------------" + "-" * len(str(args.parallel)) + "\n")

    tasks = _build_tasks(executables, args_config, gpu_count, args.output, args.args)
    total_runs = len(tasks)
    failed = _run_tasks(tasks, args.parallel)

    # Print summary
    print("\nTest Summary:")
    print(f"Ran {total_runs} test runs for {len(executables)} executables.")
    if not failed:
        print("All test runs passed!")
        return 0

    print(f"Failed runs ({len(failed)}):")
    for fail in failed:
        print(f"  {fail['name']} {fail['description']}: {fail['status']} (code {fail['return_code']})")
    # Return the return code of the first failure, or 1 if only exceptions occurred
    return next((f["return_code"] for f in failed if f["return_code"] != -1), 1)
# Script entry point: use main()'s return value as the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())