diff --git a/deployment/build/build_cpu_node_image.sh b/deployment/build/build_cpu_node_image.sh
new file mode 100755
index 0000000..276c9c0
--- /dev/null
+++ b/deployment/build/build_cpu_node_image.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+# Build the CPU node-bundle image by delegating to build_images.sh.
+set -euo pipefail
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+cd "$ROOT_DIR"
+
+# NOTE(review): the usage text and option loop were truncated in the reviewed
+# copy of this patch; they are reconstructed from the variables used below.
+# Confirm the flag names against the original script before merging.
+usage() {
+  cat <<EOF
+Usage: ${0##*/} [--version VERSION]
+
+Build the CPU node-bundle image (argus-sys-metric-test-node-bundle:latest).
+
+Options:
+  --version VERSION  forwarded to build_images.sh as --client-version
+  -h, --help         show this help
+EOF
+}
+
+VERSION=""
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --version) VERSION="${2:?--version requires a value}"; shift 2;;
+    -h|--help) usage; exit 0;;
+    *) echo "unknown arg: $1" >&2; usage; exit 1;;
+  esac
+done
+
+CMD=("./deployment/build/build_images.sh" "--with-node-bundle")
+if [[ -n "$VERSION" ]]; then CMD+=("--client-version" "$VERSION"); fi
+
+echo "[CPU-BUNDLE] invoking: ${CMD[*]}"
+"${CMD[@]}"
+
+echo "[CPU-BUNDLE] built image: argus-sys-metric-test-node-bundle:latest"
+docker image inspect argus-sys-metric-test-node-bundle:latest >/dev/null 2>&1 || {
+  echo "[ERR] expected image not found" >&2; exit 1; }
+
+echo "[CPU-BUNDLE] done"
diff --git a/deployment/build/build_gpu_node_image.sh b/deployment/build/build_gpu_node_image.sh
new file mode 100755
index 0000000..d8414aa
--- /dev/null
+++ b/deployment/build/build_gpu_node_image.sh
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+# Build the GPU node-bundle image: run build_images.sh on top of the GPU base
+# image, then re-tag the resulting bundle image as $OUT_TAG.
+set -euo pipefail
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+cd "$ROOT_DIR"
+
+# NOTE(review): the usage text, option loop and the OUT_TAG default were
+# truncated in the reviewed copy of this patch; they are reconstructed from the
+# variables used below. Confirm flag names and default against the original.
+VERSION=""
+OUT_TAG="argus-sys-metric-test-node-bundle-gpu:latest"
+
+usage() {
+  cat <<EOF
+Usage: ${0##*/} [--version VERSION] [--tag IMAGE:TAG]
+
+Build the GPU node-bundle image.
+
+Options:
+  --version VERSION  forwarded to build_images.sh as --client-version
+  --tag IMAGE:TAG    output image tag (default: $OUT_TAG)
+  -h, --help         show this help
+EOF
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --version) VERSION="${2:?--version requires a value}"; shift 2;;
+    --tag) OUT_TAG="${2:?--tag requires a value}"; shift 2;;
+    -h|--help) usage; exit 0;;
+    *) echo "unknown arg: $1" >&2; usage; exit 1;;
+  esac
+done
+
+BASE_IMAGE="argus-sys-metric-test-gpu-node:latest"
+
+CMD=("./deployment/build/build_images.sh" "--with-node-bundle" "--base-image" "$BASE_IMAGE")
+if [[ -n "$VERSION" ]]; then CMD+=("--client-version" "$VERSION"); fi
+
+echo "[GPU-BUNDLE] invoking: ${CMD[*]}"
+"${CMD[@]}"
+
+echo "[GPU-BUNDLE] re-tagging to $OUT_TAG"
+docker image inspect argus-sys-metric-test-node-bundle:latest >/dev/null 2>&1 || {
+  echo "[ERR] base bundle image missing: argus-sys-metric-test-node-bundle:latest" >&2; exit 1; }
+docker tag argus-sys-metric-test-node-bundle:latest "$OUT_TAG"
+docker image inspect "$OUT_TAG" >/dev/null 2>&1 || { echo "[ERR] re-tag failed" >&2; exit 1; }
+
+echo "[GPU-BUNDLE] built image: $OUT_TAG (base=$BASE_IMAGE)"
diff --git a/src/sys/build/node-bundle/.gitignore b/src/sys/build/node-bundle/.gitignore
new file mode 100644
index 0000000..8d4322e
--- /dev/null
+++ b/src/sys/build/node-bundle/.gitignore
@@ -0,0 +1 @@
+bundle/*.tar.gz
diff --git a/src/sys/build/node-bundle/bundle/.gitignore b/src/sys/build/node-bundle/bundle/.gitignore
deleted file mode 100644
index 11b12c6..0000000
--- a/src/sys/build/node-bundle/bundle/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-argus-metric_*.tar.gz
diff --git a/src/sys/swarm_tests/.env.example b/src/sys/swarm_tests/.env.example
index 9287dda..b7cd948 100644
--- a/src/sys/swarm_tests/.env.example
+++ b/src/sys/swarm_tests/.env.example
@@ -19,3 +19,6 @@ WEB_PROXY_PORT_8085=8085
 
 ARGUS_BUILD_UID=2133
 ARGUS_BUILD_GID=2015
+# Node bundle images
+NODE_BUNDLE_IMAGE_TAG=argus-sys-metric-test-node-bundle:latest
+NODE_GPU_BUNDLE_IMAGE_TAG=argus-sys-metric-test-node-bundle-gpu:latest
diff --git a/src/sys/swarm_tests/docker-compose.gpu-node.yml b/src/sys/swarm_tests/docker-compose.gpu-node.yml
new file mode 100644
index 0000000..e6dd051
--- /dev/null
+++ b/src/sys/swarm_tests/docker-compose.gpu-node.yml
@@ -0,0 +1,38 @@
+version: "3.8"
+
+networks:
+  argus-sys-net:
+    external: true
+
+services:
+  metric-gpu-node:
+    image: ${NODE_GPU_BUNDLE_IMAGE_TAG:-argus-sys-metric-test-node-bundle-gpu:latest}
+    container_name: argus-metric-gpu-node-swarm
+    hostname: ${GPU_NODE_HOSTNAME:-swarm-metric-gpu-001}
+    restart: unless-stopped
+    privileged: true
+    runtime: nvidia
+    environment:
+      - TZ=Asia/Shanghai
+      - DEBIAN_FRONTEND=noninteractive
+      - MASTER_ENDPOINT=${MASTER_ENDPOINT:-http://master.argus.com:3000}
+      - FTPIP=${FTPIP}
+      - BINDIP=${BINDIP}
+      - FTP_USER=${FTP_USER:-ftpuser}
+      # NOTE(review): a default FTP credential is committed here in plain text;
+      # prefer supplying FTP_PASSWORD via .env and dropping the fallback.
+      - FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!}
+      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
+      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
+      - AGENT_ENV=${AGENT_ENV:-dev2}
+      - AGENT_USER=${AGENT_USER:-yuyr}
+      - AGENT_INSTANCE=${AGENT_INSTANCE:-gpu001sX}
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
+      - GPU_MODE=gpu
+    dns:
+      - ${BINDIP}
+    networks: [argus-sys-net]
+    volumes:
+      - ./private-gpu-nodes/argus/agent:/private/argus/agent
+    command: ["sleep", "infinity"]
diff --git a/src/sys/swarm_tests/scripts/05_gpu_node_up.sh b/src/sys/swarm_tests/scripts/05_gpu_node_up.sh
new file mode 100755
index 0000000..78dcf69
--- /dev/null
+++ b/src/sys/swarm_tests/scripts/05_gpu_node_up.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+# Start the GPU metric node compose project and report whether the container
+# can list the host GPUs.
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+ENV_FILE="$ROOT/.env"; [[ -f "$ENV_FILE" ]] && { set -a; source "$ENV_FILE"; set +a; }
+ENV_NODES_FILE="$ROOT/.env.nodes"; [[ -f "$ENV_NODES_FILE" ]] && { set -a; source "$ENV_NODES_FILE"; set +a; }
+
+PROJECT="${GPU_PROJECT:-argus-swarm-gpu}"
+COMPOSE_FILE="$ROOT/docker-compose.gpu-node.yml"
+CONTAINER="argus-metric-gpu-node-swarm"
+
+# .env.nodes is optional above, so only hand it to compose when it exists;
+# docker compose aborts on a missing --env-file.
+COMPOSE=(docker compose -p "$PROJECT")
+if [[ -f "$ENV_NODES_FILE" ]]; then COMPOSE+=(--env-file "$ENV_NODES_FILE"); fi
+COMPOSE+=(-f "$COMPOSE_FILE")
+
+# Prepare private dir
+mkdir -p "$ROOT/private-gpu-nodes/argus/agent"
+
+echo "[GPU] checking host NVIDIA driver/runtime"
+if ! command -v nvidia-smi >/dev/null 2>&1; then
+  echo "[ERR] nvidia-smi not found on host; install NVIDIA driver/runtime first" >&2
+  exit 1
+fi
+
+echo "[GPU] starting compose project: $PROJECT"
+"${COMPOSE[@]}" up -d
+"${COMPOSE[@]}" ps
+
+echo "[GPU] container GPU visibility"
+# Run nvidia-smi once and reuse its output instead of exec'ing twice.
+if gpu_list="$(docker exec "$CONTAINER" nvidia-smi -L 2>/dev/null)"; then
+  echo "$gpu_list"
+else
+  echo "[WARN] nvidia-smi failed inside container; check --gpus/runtime/driver" >&2
+fi
+
+echo "[GPU] done"
diff --git a/src/sys/swarm_tests/scripts/06_gpu_metric_verify.sh b/src/sys/swarm_tests/scripts/06_gpu_metric_verify.sh
new file mode 100755
index 0000000..47d94eb
--- /dev/null
+++ b/src/sys/swarm_tests/scripts/06_gpu_metric_verify.sh
@@ -0,0 +1,88 @@
+#!/usr/bin/env bash
+# Verify that the GPU node is registered with and scraped by Prometheus:
+#   1) nodes.json lists the GPU hostname (warning only)
+#   2) node-exporter :9100 is up (required); dcgm-exporter :9400 is up (optional)
+#   3) DCGM_FI_DEV_GPU_UTIL returns samples (optional)
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+ENV_FILE="$ROOT/.env"; [[ -f "$ENV_FILE" ]] && { set -a; source "$ENV_FILE"; set +a; }
+
+PROM_PORT="${PROMETHEUS_PORT:-9090}"
+
+ok(){ echo "[OK] $*"; }
+warn(){ echo "[WARN] $*"; }
+err(){ echo "[ERR] $*" >&2; }
+fail(){ err "$*"; exit 1; }
+
+# jq/curl errors are silenced inside the checks below, so a missing tool would
+# otherwise be misreported as a missing target.
+for tool in curl jq; do
+  command -v "$tool" >/dev/null 2>&1 || fail "$tool is required but not installed"
+done
+
+GPU_HOST="${GPU_NODE_HOSTNAME:-swarm-metric-gpu-001}"
+CONTAINER="argus-metric-gpu-node-swarm"
+WORK_DIR="$ROOT/tmp/gpu-verify"
+mkdir -p "$WORK_DIR"
+
+# has_up_target JSON_FILE PORT [IP]
+# Succeeds when an active target on PORT (restricted to IP when given) is "up".
+has_up_target() {
+  local file=$1 port=$2 ip=${3:-}
+  jq -e --arg needle "${ip:+//$ip}:$port" \
+    '[.data.activeTargets[] | select(.health == "up") | select(.scrapeUrl | contains($needle))] | length > 0' \
+    "$file" >/dev/null 2>&1
+}
+
+# 1) nodes.json contains gpu node hostname
+NODES_JSON="$ROOT/private-server/argus/metric/prometheus/nodes.json"
+if [[ ! -f "$NODES_JSON" ]]; then
+  warn "nodes.json not found at $NODES_JSON"
+elif jq -e --arg h "$GPU_HOST" '.[] | select(.hostname==$h)' "$NODES_JSON" >/dev/null 2>&1; then
+  ok "nodes.json contains $GPU_HOST"
+else
+  warn "nodes.json does not list $GPU_HOST"
+fi
+
+# 2) Prometheus targets health for :9100 (must) and :9400 (optional)
+targets_json="$WORK_DIR/targets.json"
+if ! curl -fsS "http://127.0.0.1:${PROM_PORT}/api/v1/targets" -o "$targets_json"; then
+  fail "failed to fetch Prometheus targets"
+fi
+
+# Derive the GPU node overlay IP. It stays empty when the container or network
+# is missing; an empty IP must not be matched, or any :PORT target would pass.
+GPU_IP=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' "$CONTAINER" 2>/dev/null || true)
+if [[ -z "$GPU_IP" ]]; then
+  warn "could not resolve overlay IP of $CONTAINER; using port-only fallback checks"
+fi
+
+if [[ -n "$GPU_IP" ]] && has_up_target "$targets_json" 9100 "$GPU_IP"; then
+  ok "node-exporter 9100 up for GPU node ($GPU_IP)"
+elif has_up_target "$targets_json" 9100; then
+  ok "node-exporter 9100 has at least one up target (fallback)"
+else
+  fail "node-exporter 9100 has no up targets"
+fi
+
+if [[ -n "$GPU_IP" ]] && has_up_target "$targets_json" 9400 "$GPU_IP"; then
+  ok "dcgm-exporter 9400 up for GPU node"
+elif has_up_target "$targets_json" 9400; then
+  ok "dcgm-exporter 9400 has up target (not necessarily GPU node)"
+else
+  warn "dcgm-exporter 9400 down or missing (acceptable in some envs)"
+fi
+
+# 3) Quick PromQL sample for DCGM metric (optional)
+dcgm_json="$WORK_DIR/dcgm.json"
+if curl -fsS "http://127.0.0.1:${PROM_PORT}/api/v1/query?query=DCGM_FI_DEV_GPU_UTIL" -o "$dcgm_json"; then
+  if jq -e '.data.result | length > 0' "$dcgm_json" >/dev/null 2>&1; then
+    ok "DCGM_FI_DEV_GPU_UTIL has samples"
+  else
+    warn "no samples for DCGM_FI_DEV_GPU_UTIL (not blocking)"
+  fi
+fi
+
+echo "[DONE] gpu metric verify"