[#37] 增加gpu bundle node镜像构建

This commit is contained in:
yuyr 2025-11-07 10:23:59 +08:00
parent 0b9268332f
commit 7548e46d1f
8 changed files with 234 additions and 1 deletions

View File

@ -0,0 +1,39 @@
#!/usr/bin/env bash
# Strict mode: abort on command failure, on unset variables, and on a
# failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Work from the directory two levels above this script (presumably the
# repository root) so relative paths used later resolve from there.
ROOT_DIR="$(
  cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd
)"
cd "$ROOT_DIR"
# Print the help text for the CPU node-bundle wrapper to stdout.
usage() {
  local self
  # A basename failure leaves the name empty instead of aborting the script.
  self="$(basename "$0")" || true
  printf '%s\n' \
    "Build CPU node-bundle image (wrapper)" \
    "Usage: ${self} [--client-version YYYYMMDD]" \
    "Examples:" \
    "${self} --client-version 20251106" \
    "${self} # auto-detect artifact version via packaging"
}
# Tag that is inspected after the build to confirm the image exists.
readonly BUNDLE_IMAGE="argus-sys-metric-test-node-bundle:latest"

# Parse CLI options. VERSION stays empty unless --client-version is given,
# in which case it is forwarded to the build script.
VERSION=""
while [[ $# -gt 0 ]]; do
  case "$1" in
    --client-version)
      # Without this check `shift 2` fails when the value is missing and
      # `set -e` terminates the script without any message.
      if [[ $# -lt 2 ]]; then
        echo "[ERR] --client-version requires a value" >&2
        usage
        exit 1
      fi
      VERSION="$2"
      shift 2
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown arg: $1" >&2
      usage
      exit 1
      ;;
  esac
done

# Assemble the build command as an array so each argument stays one word.
CMD=("./deployment/build/build_images.sh" "--with-node-bundle")
if [[ -n "$VERSION" ]]; then
  CMD+=("--client-version" "$VERSION")
fi

echo "[CPU-BUNDLE] invoking: ${CMD[*]}"
"${CMD[@]}"

# Confirm the image exists before announcing it as built.
if ! docker image inspect "$BUNDLE_IMAGE" >/dev/null 2>&1; then
  echo "[ERR] expected image not found" >&2
  exit 1
fi
echo "[CPU-BUNDLE] built image: $BUNDLE_IMAGE"
echo "[CPU-BUNDLE] done"

View File

@ -0,0 +1,49 @@
#!/usr/bin/env bash
# Strict mode: stop on errors, on unset variables, and on failures inside
# pipelines.
set -o errexit -o nounset -o pipefail

# Switch to the directory two levels above this script (presumably the
# repository root) so relative paths used later resolve from there.
ROOT_DIR="$(
  cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd
)"
cd "$ROOT_DIR"
# Print the help text for the GPU node-bundle wrapper to stdout.
usage() {
  local self
  # A basename failure leaves the name empty instead of aborting the script.
  self="$(basename "$0")" || true
  printf '%s\n' \
    "Build GPU node-bundle image (wrapper)" \
    "Usage: ${self} [--client-version YYYYMMDD] [--tag IMAGE:TAG]" \
    "Defaults:" \
    "base-image = argus-sys-metric-test-gpu-node:latest" \
    "output tag = argus-sys-metric-test-node-bundle-gpu:latest" \
    "Examples:" \
    "${self} --client-version 20251106" \
    "${self} --client-version 20251106 --tag myrepo/node-bundle-gpu:20251106"
}
# Base image handed to the build script, and the tag this script inspects
# and re-tags once the build has finished.
# NOTE(review): BUILT_IMAGE is the same tag the CPU wrapper verifies, so a
# GPU build presumably replaces the local CPU bundle tag — confirm intended.
readonly BASE_IMAGE="argus-sys-metric-test-gpu-node:latest"
readonly BUILT_IMAGE="argus-sys-metric-test-node-bundle:latest"

# Abort with usage when an option is not followed by a value.
#   $1 - option name, $2 - number of arguments left (including the option)
need_value() {
  if (( $2 < 2 )); then
    echo "[ERR] $1 requires a value" >&2
    usage
    exit 1
  fi
}

# Parse CLI options. Without the need_value guard, `shift 2` fails on a
# missing value and `set -e` ends the script without any message.
VERSION=""
OUT_TAG="argus-sys-metric-test-node-bundle-gpu:latest"
while [[ $# -gt 0 ]]; do
  case "$1" in
    --client-version)
      need_value "$1" "$#"
      VERSION="$2"
      shift 2
      ;;
    --tag)
      need_value "$1" "$#"
      OUT_TAG="$2"
      shift 2
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown arg: $1" >&2
      usage
      exit 1
      ;;
  esac
done

# Reject an empty tag up front; otherwise the problem only surfaces when
# `docker tag` fails after the whole build has already run.
if [[ -z "$OUT_TAG" ]]; then
  echo "[ERR] --tag must not be empty" >&2
  exit 1
fi

# Assemble the build command as an array so each argument stays one word.
CMD=("./deployment/build/build_images.sh" "--with-node-bundle" "--base-image" "$BASE_IMAGE")
if [[ -n "$VERSION" ]]; then
  CMD+=("--client-version" "$VERSION")
fi

echo "[GPU-BUNDLE] invoking: ${CMD[*]}"
"${CMD[@]}"

echo "[GPU-BUNDLE] re-tagging to $OUT_TAG"
if ! docker image inspect "$BUILT_IMAGE" >/dev/null 2>&1; then
  echo "[ERR] base bundle image missing: $BUILT_IMAGE" >&2
  exit 1
fi
docker tag "$BUILT_IMAGE" "$OUT_TAG"
if ! docker image inspect "$OUT_TAG" >/dev/null 2>&1; then
  echo "[ERR] re-tag failed" >&2
  exit 1
fi
echo "[GPU-BUNDLE] built image: $OUT_TAG (base=$BASE_IMAGE)"

1
src/sys/build/node-bundle/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
bundle/*.tar.gz

View File

@ -1 +0,0 @@
argus-metric_*.tar.gz

View File

@ -19,3 +19,6 @@ WEB_PROXY_PORT_8085=8085
ARGUS_BUILD_UID=2133
ARGUS_BUILD_GID=2015
# Node bundle images
NODE_BUNDLE_IMAGE_TAG=argus-sys-metric-test-node-bundle:latest
NODE_GPU_BUNDLE_IMAGE_TAG=argus-sys-metric-test-node-bundle-gpu:latest

View File

@ -0,0 +1,36 @@
# Compose definition for one GPU metric node container attached to the
# externally managed argus-sys-net network.
version: "3.8"

networks:
  argus-sys-net:
    # Must already exist; compose will not create it.
    external: true

services:
  metric-gpu-node:
    image: ${NODE_GPU_BUNDLE_IMAGE_TAG:-argus-sys-metric-test-node-bundle-gpu:latest}
    container_name: argus-metric-gpu-node-swarm
    hostname: ${GPU_NODE_HOSTNAME:-swarm-metric-gpu-001}
    restart: unless-stopped
    privileged: true
    runtime: nvidia
    environment:
      - TZ=Asia/Shanghai
      - DEBIAN_FRONTEND=noninteractive
      - MASTER_ENDPOINT=${MASTER_ENDPOINT:-http://master.argus.com:3000}
      # FTPIP and BINDIP have no default and must come from the environment
      # or an env file.
      - FTPIP=${FTPIP}
      - BINDIP=${BINDIP}
      - FTP_USER=${FTP_USER:-ftpuser}
      # NOTE(review): a default password is committed here; consider
      # supplying FTP_PASSWORD from the environment instead.
      - FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!}
      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
      - AGENT_ENV=${AGENT_ENV:-dev2}
      - AGENT_USER=${AGENT_USER:-yuyr}
      - AGENT_INSTANCE=${AGENT_INSTANCE:-gpu001sX}
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
      - GPU_MODE=gpu
    # The container resolves names through the address given in BINDIP.
    dns:
      - ${BINDIP}
    networks:
      - argus-sys-net
    volumes:
      - ./private-gpu-nodes/argus/agent:/private/argus/agent
    # Overrides the container command with an indefinite sleep.
    command: ["sleep", "infinity"]

View File

@ -0,0 +1,33 @@
#!/usr/bin/env bash
# Start the GPU metric node compose project and report whether the
# container can see a GPU.
#
# Optional inputs:
#   $ROOT/.env, $ROOT/.env.nodes  sourced and exported when present
#   GPU_PROJECT                   compose project name (default: argus-swarm-gpu)
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

# Both env files are optional; `set -a` exports everything they define so
# child processes such as docker compose inherit the values.
ENV_FILE="$ROOT/.env"
if [[ -f "$ENV_FILE" ]]; then
  set -a
  # shellcheck disable=SC1090
  source "$ENV_FILE"
  set +a
fi
ENV_NODES_FILE="$ROOT/.env.nodes"
if [[ -f "$ENV_NODES_FILE" ]]; then
  set -a
  # shellcheck disable=SC1090
  source "$ENV_NODES_FILE"
  set +a
fi

PROJECT="${GPU_PROJECT:-argus-swarm-gpu}"
COMPOSE_FILE="$ROOT/docker-compose.gpu-node.yml"
GPU_CONTAINER="argus-metric-gpu-node-swarm"

# Shared compose invocation. --env-file is added only when the file exists,
# because a missing path makes docker compose abort even though the file is
# treated as optional above.
COMPOSE=(docker compose -p "$PROJECT")
if [[ -f "$ENV_NODES_FILE" ]]; then
  COMPOSE+=(--env-file "$ENV_NODES_FILE")
fi
COMPOSE+=(-f "$COMPOSE_FILE")

# Prepare private dir
mkdir -p "$ROOT/private-gpu-nodes/argus/agent"

echo "[GPU] checking host NVIDIA driver/runtime"
if ! command -v nvidia-smi >/dev/null 2>&1; then
  echo "[ERR] nvidia-smi not found on host; install NVIDIA driver/runtime first" >&2
  exit 1
fi

echo "[GPU] starting compose project: $PROJECT"
"${COMPOSE[@]}" up -d
"${COMPOSE[@]}" ps

echo "[GPU] container GPU visibility"
# Run nvidia-smi once in the container and reuse its output; a failure is
# reported as a warning and does not fail the script.
if gpu_list="$(docker exec "$GPU_CONTAINER" nvidia-smi -L 2>/dev/null)"; then
  printf '%s\n' "$gpu_list"
else
  echo "[WARN] nvidia-smi failed inside container; check --gpus/runtime/driver" >&2
fi
echo "[GPU] done"

View File

@ -0,0 +1,73 @@
#!/usr/bin/env bash
# GPU metric verification: environment setup.
#
# Optional inputs:
#   $ROOT/.env, $ROOT/.env.nodes  sourced and exported when present
#   PROMETHEUS_PORT               Prometheus port (default: 9090)
#   GRAFANA_PORT                  Grafana port (default: 3000)
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

ENV_FILE="$ROOT/.env"
if [[ -f "$ENV_FILE" ]]; then
  set -a
  # shellcheck disable=SC1090
  source "$ENV_FILE"
  set +a
fi

# Also load the node-level env file, so overrides placed there (for example
# GPU_NODE_HOSTNAME) are seen by the checks in this script as well.
ENV_NODES_FILE="$ROOT/.env.nodes"
if [[ -f "$ENV_NODES_FILE" ]]; then
  set -a
  # shellcheck disable=SC1090
  source "$ENV_NODES_FILE"
  set +a
fi

PROM_PORT="${PROMETHEUS_PORT:-9090}"
GRAF_PORT="${GRAFANA_PORT:-3000}"
# Report a passed check on stdout.
ok() {
  printf '[OK] %s\n' "$*"
}

# Report a non-blocking problem on stdout.
warn() {
  printf '[WARN] %s\n' "$*"
}

# Report an error on stderr.
err() {
  printf '[ERR] %s\n' "$*" >&2
}

# Report an error on stderr and terminate with exit status 1.
fail() {
  printf '[ERR] %s\n' "$*" >&2
  exit 1
}
# Hostname looked up in nodes.json.
GPU_HOST="${GPU_NODE_HOSTNAME:-swarm-metric-gpu-001}"

# Every check below parses JSON with jq. Without it the jq calls would all
# fail quietly and be reported as missing targets, so stop with a clear
# message instead.
command -v jq >/dev/null 2>&1 \
  || fail "jq not found; required to parse nodes.json and Prometheus targets"

# Succeeds when the saved targets snapshot ($targets_json) has at least one
# target with health "up" whose scrapeUrl contains the substring $1.
has_up_target() {
  jq -e --arg needle "$1" \
    '.data.activeTargets[] | select(.scrapeUrl | contains($needle)) | select(.health=="up")' \
    "$targets_json" >/dev/null 2>&1
}

# 1) nodes.json contains gpu node hostname
NODES_JSON="$ROOT/private-server/argus/metric/prometheus/nodes.json"
if [[ ! -f "$NODES_JSON" ]]; then
  warn "nodes.json not found at $NODES_JSON"
elif jq -e --arg h "$GPU_HOST" '.[] | select(.hostname==$h)' "$NODES_JSON" >/dev/null 2>&1; then
  ok "nodes.json contains $GPU_HOST"
else
  warn "nodes.json does not list $GPU_HOST"
fi

# 2) Prometheus targets health for :9100 (must) and :9400 (optional)
targets_json="$ROOT/tmp/gpu-verify/targets.json"
mkdir -p "$(dirname "$targets_json")"
if ! curl -fsS "http://127.0.0.1:${PROM_PORT}/api/v1/targets" -o "$targets_json"; then
  fail "failed to fetch Prometheus targets"
fi

# Derive the GPU node IP on argus-sys-net. The lookup is best-effort: when
# the container or network is absent the result is an empty string.
GPU_IP="$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' argus-metric-gpu-node-swarm 2>/dev/null || true)"
if [[ -z "$GPU_IP" ]]; then
  warn "could not determine GPU node IP on argus-sys-net; using port-only fallback checks"
fi

# The IP-specific match runs only with a non-empty IP. With an empty IP the
# needle would shrink to ":9100" and any host's target would be reported as
# the GPU node.
if [[ -n "$GPU_IP" ]] && has_up_target "${GPU_IP}:9100"; then
  ok "node-exporter 9100 up for GPU node ($GPU_IP)"
elif has_up_target ":9100"; then
  # fallback: any 9100 up
  ok "node-exporter 9100 has at least one up target (fallback)"
else
  fail "node-exporter 9100 has no up targets"
fi

if [[ -n "$GPU_IP" ]] && has_up_target "${GPU_IP}:9400"; then
  ok "dcgm-exporter 9400 up for GPU node"
elif has_up_target ":9400"; then
  ok "dcgm-exporter 9400 has up target (not necessarily GPU node)"
else
  warn "dcgm-exporter 9400 down or missing (acceptable in some envs)"
fi

# 3) Quick PromQL sample for DCGM metric (optional)
dcgm_json="$ROOT/tmp/gpu-verify/dcgm.json"
if curl -fsS "http://127.0.0.1:${PROM_PORT}/api/v1/query?query=DCGM_FI_DEV_GPU_UTIL" -o "$dcgm_json"; then
  if jq -e '.data.result | length > 0' "$dcgm_json" >/dev/null 2>&1; then
    ok "DCGM_FI_DEV_GPU_UTIL has samples"
  else
    warn "no samples for DCGM_FI_DEV_GPU_UTIL (not blocking)"
  fi
else
  warn "failed to query Prometheus for DCGM_FI_DEV_GPU_UTIL (not blocking)"
fi

echo "[DONE] gpu metric verify"