2025-03-28 15:07:07 -07:00
|
|
|
|
## Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
|
|
|
|
##
|
|
|
|
|
|
## Redistribution and use in source and binary forms, with or without
|
|
|
|
|
|
## modification, are permitted provided that the following conditions
|
|
|
|
|
|
## are met:
|
|
|
|
|
|
## * Redistributions of source code must retain the above copyright
|
|
|
|
|
|
## notice, this list of conditions and the following disclaimer.
|
|
|
|
|
|
## * Redistributions in binary form must reproduce the above copyright
|
|
|
|
|
|
## notice, this list of conditions and the following disclaimer in the
|
|
|
|
|
|
## documentation and/or other materials provided with the distribution.
|
|
|
|
|
|
## * Neither the name of NVIDIA CORPORATION nor the names of its
|
|
|
|
|
|
## contributors may be used to endorse or promote products derived
|
|
|
|
|
|
## from this software without specific prior written permission.
|
|
|
|
|
|
##
|
|
|
|
|
|
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
|
|
|
|
|
## EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
|
|
## IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
|
|
|
|
## PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
|
|
|
|
## CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
|
|
|
|
## EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
|
|
|
|
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
|
|
|
|
## PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
|
|
|
|
## OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
|
|
|
|
## (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
|
|
|
|
## OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
|
|
##
|
|
|
|
|
|
## For additional information on the license terms, see the CUDA EULA at
|
|
|
|
|
|
## https://docs.nvidia.com/cuda/eula/index.html
|
|
|
|
|
|
|
|
|
|
|
|
import os
|
|
|
|
|
|
import sys
|
|
|
|
|
|
import json
|
|
|
|
|
|
import subprocess
|
|
|
|
|
|
import argparse
|
|
|
|
|
|
from pathlib import Path
|
2025-04-07 08:48:44 -07:00
|
|
|
|
import concurrent.futures
|
|
|
|
|
|
import threading
|
|
|
|
|
|
|
|
|
|
|
|
print_lock = threading.Lock()
|
|
|
|
|
|
|
|
|
|
|
|
def safe_print(*args, **kwargs):
|
|
|
|
|
|
"""Thread-safe print function"""
|
|
|
|
|
|
with print_lock:
|
|
|
|
|
|
print(*args, **kwargs)
|
2025-03-28 15:07:07 -07:00
|
|
|
|
|
|
|
|
|
|
def normalize_exe_name(name):
|
|
|
|
|
|
"""Normalize executable name across platforms by removing .exe if present"""
|
|
|
|
|
|
return Path(name).stem
|
|
|
|
|
|
|
|
|
|
|
|
def load_args_config(config_file):
|
|
|
|
|
|
"""Load arguments configuration from JSON file"""
|
|
|
|
|
|
if not config_file or not os.path.exists(config_file):
|
|
|
|
|
|
return {}
|
|
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
|
with open(config_file, 'r') as f:
|
|
|
|
|
|
config = json.load(f)
|
|
|
|
|
|
|
|
|
|
|
|
# Validate the config format
|
|
|
|
|
|
if not isinstance(config, dict):
|
|
|
|
|
|
print("Warning: Config file must contain a dictionary/object")
|
|
|
|
|
|
return {}
|
|
|
|
|
|
|
|
|
|
|
|
return config
|
|
|
|
|
|
except json.JSONDecodeError:
|
|
|
|
|
|
print("Warning: Failed to parse config file as JSON")
|
|
|
|
|
|
return {}
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
|
print(f"Warning: Error reading config file: {str(e)}")
|
|
|
|
|
|
return {}
|
|
|
|
|
|
|
|
|
|
|
|
def find_executables(root_dir):
|
|
|
|
|
|
"""Find all executable files recursively"""
|
|
|
|
|
|
executables = []
|
|
|
|
|
|
|
|
|
|
|
|
for path in Path(root_dir).rglob('*'):
|
|
|
|
|
|
# Skip directories
|
|
|
|
|
|
if not path.is_file():
|
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
|
|
# Check if file is executable
|
|
|
|
|
|
if os.access(path, os.X_OK):
|
|
|
|
|
|
# Skip if it's a library file
|
|
|
|
|
|
if path.suffix.lower() in ('.dll', '.so', '.dylib'):
|
|
|
|
|
|
continue
|
|
|
|
|
|
executables.append(path)
|
|
|
|
|
|
|
|
|
|
|
|
return executables
|
|
|
|
|
|
|
2025-04-07 08:48:44 -07:00
|
|
|
|
def run_single_test_instance(executable, args, output_file, global_args, run_description):
|
|
|
|
|
|
"""Run a single instance of a test executable with specific arguments."""
|
2025-03-28 15:07:07 -07:00
|
|
|
|
exe_path = str(executable)
|
|
|
|
|
|
exe_name = executable.name
|
|
|
|
|
|
|
2025-04-07 08:48:44 -07:00
|
|
|
|
safe_print(f"Starting {exe_name} {run_description}")
|
2025-03-28 15:07:07 -07:00
|
|
|
|
|
2025-04-07 08:48:44 -07:00
|
|
|
|
try:
|
|
|
|
|
|
cmd = [f"./{exe_name}"]
|
|
|
|
|
|
cmd.extend(args)
|
|
|
|
|
|
if global_args:
|
|
|
|
|
|
cmd.extend(global_args)
|
|
|
|
|
|
|
|
|
|
|
|
safe_print(f" Command ({exe_name} {run_description}): {' '.join(cmd)}")
|
|
|
|
|
|
|
|
|
|
|
|
# Run the executable in its own directory using cwd
|
|
|
|
|
|
with open(output_file, 'w') as f:
|
|
|
|
|
|
result = subprocess.run(
|
|
|
|
|
|
cmd,
|
|
|
|
|
|
stdout=f,
|
|
|
|
|
|
stderr=subprocess.STDOUT,
|
|
|
|
|
|
timeout=300, # 5 minute timeout
|
|
|
|
|
|
cwd=os.path.dirname(exe_path) # Execute in the executable's directory
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
status = "Passed" if result.returncode == 0 else "Failed"
|
|
|
|
|
|
safe_print(f" Finished {exe_name} {run_description}: {status} (code {result.returncode})")
|
|
|
|
|
|
return {"name": exe_name, "description": run_description, "return_code": result.returncode, "status": status}
|
|
|
|
|
|
|
|
|
|
|
|
except subprocess.TimeoutExpired:
|
|
|
|
|
|
safe_print(f"Error ({exe_name} {run_description}): Timed out after 5 minutes")
|
|
|
|
|
|
return {"name": exe_name, "description": run_description, "return_code": -1, "status": "Timeout"}
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
|
safe_print(f"Error running {exe_name} {run_description}: {str(e)}")
|
|
|
|
|
|
return {"name": exe_name, "description": run_description, "return_code": -1, "status": f"Error: {str(e)}"}
|
2025-03-28 15:07:07 -07:00
|
|
|
|
|
2025-04-30 09:45:20 -07:00
|
|
|
|
def get_gpu_count():
|
|
|
|
|
|
"""Return the number of NVIDIA GPUs visible on the system.
|
|
|
|
|
|
|
|
|
|
|
|
The function first tries to use the `nvidia-smi` CLI which should be
|
|
|
|
|
|
available on most systems with a CUDA-capable driver installed. If the
|
|
|
|
|
|
command is not present or fails we fall back to checking the
|
|
|
|
|
|
CUDA_VISIBLE_DEVICES environment variable. The fallback is conservative
|
|
|
|
|
|
– if we cannot determine the GPU count we assume 0."""
|
|
|
|
|
|
|
|
|
|
|
|
# Try the recommended NVML/nvidia-smi approach first
|
|
|
|
|
|
try:
|
|
|
|
|
|
smi = subprocess.run(
|
|
|
|
|
|
["nvidia-smi", "-L"],
|
|
|
|
|
|
stdout=subprocess.PIPE,
|
|
|
|
|
|
stderr=subprocess.DEVNULL,
|
|
|
|
|
|
text=True,
|
|
|
|
|
|
check=False,
|
|
|
|
|
|
)
|
|
|
|
|
|
if smi.returncode == 0:
|
|
|
|
|
|
# Each GPU is reported on its own line that starts with "GPU 0:" etc.
|
|
|
|
|
|
gpu_lines = [ln for ln in smi.stdout.strip().splitlines() if ln.strip().lower().startswith("gpu ")]
|
|
|
|
|
|
if gpu_lines:
|
|
|
|
|
|
return len(gpu_lines)
|
|
|
|
|
|
except FileNotFoundError:
|
|
|
|
|
|
# nvidia-smi is missing – may be WSL/no driver inside container etc.
|
|
|
|
|
|
pass
|
|
|
|
|
|
except Exception:
|
|
|
|
|
|
# Any unexpected error – treat as unknown → 0
|
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
|
|
# Fallback: attempt to infer from CUDA_VISIBLE_DEVICES if it is set and not empty
|
|
|
|
|
|
visible = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip()
|
|
|
|
|
|
if visible and visible not in {"no", "none"}:
|
|
|
|
|
|
# Handles comma-separated list like "0,1,2" or single values
|
|
|
|
|
|
return len([v for v in visible.split(',') if v])
|
|
|
|
|
|
|
|
|
|
|
|
# Unable to determine, assume no GPUs
|
|
|
|
|
|
return 0
|
2025-03-28 15:07:07 -07:00
|
|
|
|
|
|
|
|
|
|
def main():
|
|
|
|
|
|
parser = argparse.ArgumentParser(description='Run all executables and capture output')
|
|
|
|
|
|
parser.add_argument('--dir', default='.', help='Root directory to search for executables')
|
|
|
|
|
|
parser.add_argument('--config', help='JSON configuration file for executable arguments')
|
|
|
|
|
|
parser.add_argument('--output', default='.', # Default to current directory
|
|
|
|
|
|
help='Output directory for test results')
|
2025-04-07 08:48:44 -07:00
|
|
|
|
parser.add_argument('--parallel', type=int, default=1, help='Number of parallel tests to run')
|
2025-03-28 15:07:07 -07:00
|
|
|
|
parser.add_argument('--args', nargs=argparse.REMAINDER,
|
|
|
|
|
|
help='Global arguments to pass to all executables')
|
|
|
|
|
|
args = parser.parse_args()
|
|
|
|
|
|
|
|
|
|
|
|
# Create output directory if it doesn't exist
|
|
|
|
|
|
if args.output:
|
|
|
|
|
|
os.makedirs(args.output, exist_ok=True)
|
|
|
|
|
|
|
|
|
|
|
|
# Load arguments configuration
|
|
|
|
|
|
args_config = load_args_config(args.config)
|
|
|
|
|
|
|
2025-04-30 09:45:20 -07:00
|
|
|
|
# Determine how many GPUs are available
|
|
|
|
|
|
gpu_count = get_gpu_count()
|
|
|
|
|
|
if gpu_count == 0:
|
|
|
|
|
|
print("No NVIDIA GPU detected – cannot run CUDA samples. Exiting.")
|
|
|
|
|
|
return 1
|
|
|
|
|
|
else:
|
|
|
|
|
|
print(f"Detected {gpu_count} GPU(s).")
|
|
|
|
|
|
|
2025-03-28 15:07:07 -07:00
|
|
|
|
executables = find_executables(args.dir)
|
|
|
|
|
|
if not executables:
|
|
|
|
|
|
print("No executables found!")
|
|
|
|
|
|
return 1
|
|
|
|
|
|
|
|
|
|
|
|
print(f"Found {len(executables)} executables")
|
2025-05-01 09:54:25 -07:00
|
|
|
|
print(f"Running tests with up to {args.parallel} parallel tasks")
|
2025-05-01 10:14:09 -07:00
|
|
|
|
print("----------------------------------------" + "-" * len(str(args.parallel)) + "\n")
|
2025-03-28 15:07:07 -07:00
|
|
|
|
|
2025-04-07 08:48:44 -07:00
|
|
|
|
tasks = []
|
2025-03-28 15:07:07 -07:00
|
|
|
|
for exe in executables:
|
2025-04-07 08:48:44 -07:00
|
|
|
|
exe_name = exe.name
|
|
|
|
|
|
base_name = normalize_exe_name(exe_name)
|
|
|
|
|
|
|
|
|
|
|
|
# Check if this executable should be skipped globally
|
|
|
|
|
|
if base_name in args_config and args_config[base_name].get("skip", False):
|
|
|
|
|
|
safe_print(f"Skipping {exe_name} (marked as skip in config)")
|
|
|
|
|
|
continue
|
|
|
|
|
|
|
2025-04-30 09:45:20 -07:00
|
|
|
|
# Skip if the sample requires more GPUs than available
|
|
|
|
|
|
required_gpus = args_config.get(base_name, {}).get("min_gpus", 1)
|
|
|
|
|
|
if required_gpus > gpu_count:
|
|
|
|
|
|
safe_print(
|
|
|
|
|
|
f"Skipping {exe_name} (requires {required_gpus} GPU(s), only {gpu_count} available)"
|
|
|
|
|
|
)
|
|
|
|
|
|
continue
|
|
|
|
|
|
|
2025-04-07 08:48:44 -07:00
|
|
|
|
arg_sets_configs = []
|
|
|
|
|
|
if base_name in args_config:
|
|
|
|
|
|
config = args_config[base_name]
|
|
|
|
|
|
if "args" in config:
|
|
|
|
|
|
if isinstance(config["args"], list):
|
|
|
|
|
|
arg_sets_configs.append({"args": config["args"]}) # Wrap in dict for consistency
|
|
|
|
|
|
else:
|
|
|
|
|
|
safe_print(f"Warning: Arguments for {base_name} must be a list")
|
|
|
|
|
|
elif "runs" in config:
|
|
|
|
|
|
for i, run_config in enumerate(config["runs"]):
|
|
|
|
|
|
if run_config.get("skip", False):
|
|
|
|
|
|
safe_print(f"Skipping run {i+1} for {exe_name} (marked as skip in config)")
|
|
|
|
|
|
continue
|
|
|
|
|
|
if isinstance(run_config.get("args", []), list):
|
|
|
|
|
|
arg_sets_configs.append(run_config)
|
|
|
|
|
|
else:
|
|
|
|
|
|
safe_print(f"Warning: Arguments for {base_name} run {i+1} must be a list")
|
|
|
|
|
|
|
|
|
|
|
|
# If no specific args defined, create one run with no args
|
|
|
|
|
|
if not arg_sets_configs:
|
|
|
|
|
|
arg_sets_configs.append({"args": []})
|
|
|
|
|
|
|
|
|
|
|
|
# Create tasks for each run configuration
|
|
|
|
|
|
num_runs = len(arg_sets_configs)
|
|
|
|
|
|
for i, run_config in enumerate(arg_sets_configs):
|
|
|
|
|
|
current_args = run_config.get("args", [])
|
|
|
|
|
|
run_desc = f"(run {i+1}/{num_runs})" if num_runs > 1 else ""
|
|
|
|
|
|
|
|
|
|
|
|
# Create output file name
|
|
|
|
|
|
if num_runs > 1:
|
|
|
|
|
|
output_file = os.path.abspath(f"{args.output}/APM_{exe_name}.run{i+1}.txt")
|
|
|
|
|
|
else:
|
|
|
|
|
|
output_file = os.path.abspath(f"{args.output}/APM_{exe_name}.txt")
|
|
|
|
|
|
|
|
|
|
|
|
tasks.append({
|
|
|
|
|
|
"executable": exe,
|
|
|
|
|
|
"args": current_args,
|
|
|
|
|
|
"output_file": output_file,
|
|
|
|
|
|
"global_args": args.args,
|
|
|
|
|
|
"description": run_desc
|
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
|
|
failed = []
|
|
|
|
|
|
total_runs = len(tasks)
|
|
|
|
|
|
completed_runs = 0
|
|
|
|
|
|
|
|
|
|
|
|
with concurrent.futures.ThreadPoolExecutor(max_workers=args.parallel) as executor:
|
|
|
|
|
|
future_to_task = {
|
|
|
|
|
|
executor.submit(run_single_test_instance,
|
|
|
|
|
|
task["executable"],
|
|
|
|
|
|
task["args"],
|
|
|
|
|
|
task["output_file"],
|
|
|
|
|
|
task["global_args"],
|
|
|
|
|
|
task["description"]): task
|
|
|
|
|
|
for task in tasks
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
for future in concurrent.futures.as_completed(future_to_task):
|
|
|
|
|
|
task_info = future_to_task[future]
|
|
|
|
|
|
completed_runs += 1
|
|
|
|
|
|
safe_print(f"Progress: {completed_runs}/{total_runs} runs completed.")
|
|
|
|
|
|
try:
|
|
|
|
|
|
result = future.result()
|
|
|
|
|
|
if result["return_code"] != 0:
|
|
|
|
|
|
failed.append(result)
|
|
|
|
|
|
except Exception as exc:
|
|
|
|
|
|
safe_print(f'Task {task_info["executable"].name} {task_info["description"]} generated an exception: {exc}')
|
|
|
|
|
|
failed.append({
|
|
|
|
|
|
"name": task_info["executable"].name,
|
|
|
|
|
|
"description": task_info["description"],
|
|
|
|
|
|
"return_code": -1,
|
|
|
|
|
|
"status": f"Execution Exception: {exc}"
|
|
|
|
|
|
})
|
2025-03-28 15:07:07 -07:00
|
|
|
|
|
|
|
|
|
|
# Print summary
|
|
|
|
|
|
print("\nTest Summary:")
|
2025-04-07 08:48:44 -07:00
|
|
|
|
print(f"Ran {total_runs} test runs for {len(executables)} executables.")
|
2025-03-28 15:07:07 -07:00
|
|
|
|
if failed:
|
2025-04-07 08:48:44 -07:00
|
|
|
|
print(f"Failed runs ({len(failed)}):")
|
|
|
|
|
|
for fail in failed:
|
|
|
|
|
|
print(f" {fail['name']} {fail['description']}: {fail['status']} (code {fail['return_code']})")
|
|
|
|
|
|
# Return the return code of the first failure, or 1 if only exceptions occurred
|
|
|
|
|
|
first_failure_code = next((f["return_code"] for f in failed if f["return_code"] != -1), 1)
|
|
|
|
|
|
return first_failure_code
|
2025-03-28 15:07:07 -07:00
|
|
|
|
else:
|
2025-04-07 08:48:44 -07:00
|
|
|
|
print("All test runs passed!")
|
2025-03-28 15:07:07 -07:00
|
|
|
|
return 0
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
|
|
|
sys.exit(main())
|