#!/usr/bin/env bash
#
# Verify that the GPU metric node is being scraped by Prometheus.
#
# Checks performed:
#   1) nodes.json lists the GPU node hostname            (warning only)
#   2) Prometheus has an "up" target on :9100            (required)
#      and on :9400                                      (optional)
#   3) DCGM_FI_DEV_GPU_UTIL returns at least one sample  (optional)
#
# Environment (may also be provided through "$ROOT/.env"):
#   PROMETHEUS_PORT    Prometheus port on 127.0.0.1 (default 9090)
#   GRAFANA_PORT       Grafana port (default 3000)
#   GPU_NODE_HOSTNAME  hostname looked up in nodes.json
#                      (default swarm-metric-gpu-001)
#
# Exit status: 0 when all required checks pass, 1 otherwise.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

# Export everything defined in .env so the defaults below can be overridden.
ENV_FILE="$ROOT/.env"
if [[ -f "$ENV_FILE" ]]; then
  set -a
  # shellcheck disable=SC1090
  source "$ENV_FILE"
  set +a
fi

PROM_PORT="${PROMETHEUS_PORT:-9090}"
# Not read by any check in this script; kept so existing configuration
# handling is unchanged.
# shellcheck disable=SC2034
GRAF_PORT="${GRAFANA_PORT:-3000}"

ok()   { echo "[OK] $*"; }
warn() { echo "[WARN] $*"; }
err()  { echo "[ERR] $*" >&2; }
fail() { err "$*"; exit 1; }

# Every jq call below is silenced, so a missing binary would otherwise show
# up as a misleading "no up targets" failure.
for required_tool in curl jq; do
  command -v "$required_tool" >/dev/null 2>&1 \
    || fail "required tool not found: $required_tool"
done

GPU_HOST="${GPU_NODE_HOSTNAME:-swarm-metric-gpu-001}"

# 1) nodes.json contains gpu node hostname
NODES_JSON="$ROOT/private-server/argus/metric/prometheus/nodes.json"
if [[ ! -f "$NODES_JSON" ]]; then
  warn "nodes.json not found at $NODES_JSON"
elif jq -e --arg h "$GPU_HOST" '.[] | select(.hostname==$h)' "$NODES_JSON" >/dev/null 2>&1; then
  ok "nodes.json contains $GPU_HOST"
else
  warn "nodes.json does not list $GPU_HOST"
fi

# 2) Prometheus targets health for :9100 (must) and :9400 (optional)
targets_json="$ROOT/tmp/gpu-verify/targets.json"
mkdir -p "$(dirname "$targets_json")"
if ! curl -fsS "http://127.0.0.1:${PROM_PORT}/api/v1/targets" -o "$targets_json"; then
  fail "failed to fetch Prometheus targets"
fi

# Succeeds when an active target whose scrapeUrl contains "<GPU_IP>:<port>"
# is healthy. Callers must make sure GPU_IP is non-empty first.
# Arguments: $1 - exporter port
gpu_target_up() {
  jq -e --arg needle "${GPU_IP}:$1" \
    '.data.activeTargets[] | select(.scrapeUrl | contains($needle)) | select(.health=="up")' \
    "$targets_json" >/dev/null 2>&1
}

# Succeeds when any active target whose scrapeUrl matches ":<port>" is healthy.
# Arguments: $1 - exporter port
any_target_up() {
  jq -e --arg pat ":$1" \
    '.data.activeTargets[] | select(.scrapeUrl | test($pat)) | select(.health=="up")' \
    "$targets_json" >/dev/null 2>&1
}

# derive gpu node overlay IP; best effort, empty when docker or the
# container/network is unavailable
GPU_IP=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' argus-metric-gpu-node-swarm 2>/dev/null || true)
if [[ -z "$GPU_IP" ]]; then
  # With an empty IP the per-node filter would match every target on the
  # port and report the GPU node as up without actually checking it.
  warn "could not determine GPU node IP; falling back to port-only target checks"
fi

if [[ -n "$GPU_IP" ]] && gpu_target_up 9100; then
  ok "node-exporter 9100 up for GPU node ($GPU_IP)"
elif any_target_up 9100; then
  # fallback: any 9100 up
  ok "node-exporter 9100 has at least one up target (fallback)"
else
  fail "node-exporter 9100 has no up targets"
fi

if [[ -n "$GPU_IP" ]] && gpu_target_up 9400; then
  ok "dcgm-exporter 9400 up for GPU node"
elif any_target_up 9400; then
  ok "dcgm-exporter 9400 has up target (not necessarily GPU node)"
else
  warn "dcgm-exporter 9400 down or missing (acceptable in some envs)"
fi

# 3) Quick PromQL sample for DCGM metric (optional)
dcgm_json="$ROOT/tmp/gpu-verify/dcgm.json"
if curl -fsS "http://127.0.0.1:${PROM_PORT}/api/v1/query?query=DCGM_FI_DEV_GPU_UTIL" -o "$dcgm_json"; then
  if jq -e '.data.result | length > 0' "$dcgm_json" >/dev/null 2>&1; then
    ok "DCGM_FI_DEV_GPU_UTIL has samples"
  else
    warn "no samples for DCGM_FI_DEV_GPU_UTIL (not blocking)"
  fi
else
  warn "failed to query DCGM_FI_DEV_GPU_UTIL (not blocking)"
fi

echo "[DONE] gpu metric verify"