#!/usr/bin/env bash
#
# GPU metric verification.
#
# Checks performed by this script:
#   1) nodes.json lists the GPU node hostname (advisory)
#   2) Prometheus reports an "up" target on :9100 (required) and :9400 (optional)
#   3) A PromQL query for DCGM_FI_DEV_GPU_UTIL returns samples (advisory)
#
# Environment (may also be provided through "$ROOT/.env"):
#   PROMETHEUS_PORT    Prometheus port on 127.0.0.1 (default: 9090)
#   GRAFANA_PORT       Grafana port (default: 3000)
#   GPU_NODE_HOSTNAME  hostname looked up in nodes.json
#                      (default: swarm-metric-gpu-001)
#
# External tools used: curl, jq, docker.

set -euo pipefail

# Resolve paths relative to this script so it works from any working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

# Load the optional .env file; `set -a` exports every variable it defines.
ENV_FILE="$ROOT/.env"
if [[ -f "$ENV_FILE" ]]; then
  set -a
  # shellcheck source=/dev/null
  source "$ENV_FILE"
  set +a
fi

PROM_PORT="${PROMETHEUS_PORT:-9090}"
# shellcheck disable=SC2034  # defined for parity with .env; not referenced below
GRAF_PORT="${GRAFANA_PORT:-3000}"

# Logging helpers.
#   ok   - success line on stdout
#   warn - non-fatal warning on stdout
#   err  - error line on stderr
#   fail - report through err, then terminate the script with status 1
# All arguments are joined into a single message ("$*").
ok() { printf '[OK] %s\n' "$*"; }
warn() { printf '[WARN] %s\n' "$*"; }
err() { printf '[ERR] %s\n' "$*" >&2; }
fail() {
  err "$*"
  exit 1
}

GPU_HOST="${GPU_NODE_HOSTNAME:-swarm-metric-gpu-001}"

# Every check below parses JSON with jq; without it they would all report
# misleading "missing"/"down" results, so stop early with a clear message.
command -v jq >/dev/null 2>&1 || fail "jq is required but was not found in PATH"

# 1) nodes.json contains gpu node hostname
# Advisory only: a missing file or missing entry produces a warning, not a failure.
NODES_JSON="$ROOT/private-server/argus/metric/prometheus/nodes.json"
if [[ ! -f "$NODES_JSON" ]]; then
  warn "nodes.json not found at $NODES_JSON"
elif jq -e --arg h "$GPU_HOST" '.[] | select(.hostname==$h)' "$NODES_JSON" >/dev/null 2>&1; then
  # jq -e exits 0 only when the filter emitted at least one matching entry.
  ok "nodes.json contains $GPU_HOST"
else
  warn "nodes.json does not list $GPU_HOST"
fi

# 2) Prometheus targets health for :9100 (must) and :9400 (optional)
# Snapshot the targets API response once; the health checks below read this file.
targets_json="$ROOT/tmp/gpu-verify/targets.json"
mkdir -p "$(dirname "$targets_json")"
# -f turns HTTP error statuses into a non-zero exit; -sS hides progress but
# still prints curl's own error message.
if ! curl -fsS "http://127.0.0.1:${PROM_PORT}/api/v1/targets" -o "$targets_json"; then
  fail "failed to fetch Prometheus targets"
fi

# derive gpu node overlay IP
# Best effort: any docker error (docker unavailable, container missing, not
# attached to argus-sys-net) is discarded and leaves GPU_IP empty.
GPU_IP=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' argus-metric-gpu-node-swarm 2>/dev/null || true)
if [[ -z "$GPU_IP" ]]; then
  warn "could not determine GPU node overlay IP; using port-only fallback checks"
fi

# node-exporter (:9100) is mandatory.
# The IP-specific check only runs when an IP was resolved: with an empty IP the
# substring match would degrade to ":9100" and falsely report the GPU node as up.
# The "//" anchor keeps e.g. 10.0.1.1 from matching 110.0.1.1.
if [[ -n "$GPU_IP" ]] && jq -e --arg ip "$GPU_IP" \
  '.data.activeTargets[] | select(.scrapeUrl | contains("//" + $ip + ":9100")) | select(.health=="up")' \
  "$targets_json" >/dev/null 2>&1; then
  ok "node-exporter 9100 up for GPU node ($GPU_IP)"
elif jq -e '.data.activeTargets[] | select(.scrapeUrl | test(":9100")) | select(.health=="up")' \
  "$targets_json" >/dev/null 2>&1; then
  # fallback: any 9100 up
  ok "node-exporter 9100 has at least one up target (fallback)"
else
  fail "node-exporter 9100 has no up targets"
fi

# dcgm-exporter (:9400) is optional: absence only produces a warning.
# As with :9100, skip the IP-specific match when no GPU node IP is known, since
# an empty IP would match any :9400 target and mislabel it as the GPU node.
if [[ -n "${GPU_IP:-}" ]] && jq -e --arg ip "$GPU_IP" \
  '.data.activeTargets[] | select(.scrapeUrl | contains("//" + $ip + ":9400")) | select(.health=="up")' \
  "$targets_json" >/dev/null 2>&1; then
  ok "dcgm-exporter 9400 up for GPU node"
elif jq -e '.data.activeTargets[] | select(.scrapeUrl | test(":9400")) | select(.health=="up")' \
  "$targets_json" >/dev/null 2>&1; then
  ok "dcgm-exporter 9400 has up target (not necessarily GPU node)"
else
  warn "dcgm-exporter 9400 down or missing (acceptable in some envs)"
fi

# 3) Quick PromQL sample for DCGM metric (optional)
# Neither a failed request nor an empty result aborts the script.
dcgm_json="$ROOT/tmp/gpu-verify/dcgm.json"
mkdir -p "$(dirname "$dcgm_json")"
if curl -fsS "http://127.0.0.1:${PROM_PORT}/api/v1/query?query=DCGM_FI_DEV_GPU_UTIL" -o "$dcgm_json"; then
  if jq -e '.data.result | length > 0' "$dcgm_json" >/dev/null 2>&1; then
    ok "DCGM_FI_DEV_GPU_UTIL has samples"
  else
    warn "no samples for DCGM_FI_DEV_GPU_UTIL (not blocking)"
  fi
else
  # Previously a failed query was skipped without any [WARN] line.
  warn "failed to query DCGM_FI_DEV_GPU_UTIL from Prometheus (not blocking)"
fi

echo "[DONE] gpu metric verify"