[#37] 增加gpu bundle node镜像构建
This commit is contained in:
parent
0b9268332f
commit
7548e46d1f
39
deployment/build/build_cpu_node_image.sh
Executable file
39
deployment/build/build_cpu_node_image.sh
Executable file
@ -0,0 +1,39 @@
|
||||
#!/usr/bin/env bash
# Wrapper script: builds the CPU node-bundle image.
set -euo pipefail

# Locate the directory two levels above this script and make it the working
# directory, so paths used later can be written relative to it.
script_dir="$(dirname "${BASH_SOURCE[0]}")"
ROOT_DIR="$(cd "${script_dir}/../.." && pwd)"
cd "$ROOT_DIR"
|
||||
|
||||
# Print the help text of the CPU wrapper to stdout.
usage() {
  local prog
  prog="$(basename "$0")"
  cat <<EOF
Build CPU node-bundle image (wrapper)

Usage: ${prog} [--client-version YYYYMMDD]

Examples:
${prog} --client-version 20251106
${prog} # auto-detect artifact version via packaging
EOF
}
|
||||
|
||||
# Client version forwarded to the build; when empty the flag is not forwarded.
VERSION=""

#######################################
# Parse the wrapper's command-line flags.
# Globals:   VERSION (written)
# Arguments: the script's own arguments
# Outputs:   diagnostics on stderr; help text via usage
# Returns:   exits 0 after -h/--help, exits 1 on an unknown flag or on a
#            flag whose value is missing
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --client-version)
        # Without this guard `shift 2` fails when the value is missing and
        # `set -e` ends the script without printing anything.
        if [[ $# -lt 2 ]]; then
          echo "[ERR] --client-version requires a value" >&2
          exit 1
        fi
        VERSION="$2"
        shift 2
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown arg: $1" >&2
        usage
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
|
||||
|
||||
# Tag this wrapper expects to exist once the delegated build has finished.
BUNDLE_IMAGE="argus-sys-metric-test-node-bundle:latest"

# Assemble the delegated build command as an array so every argument is
# passed through as its own word.
CMD=("./deployment/build/build_images.sh" "--with-node-bundle")
if [[ -n "$VERSION" ]]; then
  CMD+=("--client-version" "$VERSION")
fi

echo "[CPU-BUNDLE] invoking: ${CMD[*]}"
"${CMD[@]}"

# Verify first, so the "built image" line is only printed for an image that
# is actually present in the local docker image store.
if ! docker image inspect "$BUNDLE_IMAGE" >/dev/null 2>&1; then
  echo "[ERR] expected image not found" >&2
  exit 1
fi
echo "[CPU-BUNDLE] built image: $BUNDLE_IMAGE"

echo "[CPU-BUNDLE] done"
|
||||
|
||||
49
deployment/build/build_gpu_node_image.sh
Executable file
49
deployment/build/build_gpu_node_image.sh
Executable file
@ -0,0 +1,49 @@
|
||||
#!/usr/bin/env bash
# Wrapper script: builds the GPU node-bundle image.
set -euo pipefail

# Locate the directory two levels above this script and make it the working
# directory, so paths used later can be written relative to it.
script_dir="$(dirname "${BASH_SOURCE[0]}")"
ROOT_DIR="$(cd "${script_dir}/../.." && pwd)"
cd "$ROOT_DIR"
|
||||
|
||||
# Print the help text of the GPU wrapper to stdout.
usage() {
  local prog
  prog="$(basename "$0")"
  cat <<EOF
Build GPU node-bundle image (wrapper)

Usage: ${prog} [--client-version YYYYMMDD] [--tag IMAGE:TAG]

Defaults:
base-image = argus-sys-metric-test-gpu-node:latest
output tag = argus-sys-metric-test-node-bundle-gpu:latest

Examples:
${prog} --client-version 20251106
${prog} --client-version 20251106 --tag myrepo/node-bundle-gpu:20251106
EOF
}
|
||||
|
||||
# Client version forwarded to the build; when empty the flag is not forwarded.
VERSION=""
# Tag the finished bundle image is re-tagged to.
OUT_TAG="argus-sys-metric-test-node-bundle-gpu:latest"

#######################################
# Parse the wrapper's command-line flags.
# Globals:   VERSION, OUT_TAG (written)
# Arguments: the script's own arguments
# Outputs:   diagnostics on stderr; help text via usage
# Returns:   exits 0 after -h/--help, exits 1 on an unknown flag, on a flag
#            whose value is missing, or on an empty --tag
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --client-version)
        # Without this guard `shift 2` fails when the value is missing and
        # `set -e` ends the script without printing anything.
        if [[ $# -lt 2 ]]; then
          echo "[ERR] --client-version requires a value" >&2
          exit 1
        fi
        VERSION="$2"
        shift 2
        ;;
      --tag)
        # An empty tag cannot be used as a `docker tag` target; reject it
        # here rather than after the whole build has run.
        if [[ $# -lt 2 || -z "$2" ]]; then
          echo "[ERR] --tag requires a non-empty IMAGE:TAG value" >&2
          exit 1
        fi
        OUT_TAG="$2"
        shift 2
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown arg: $1" >&2
        usage
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
|
||||
|
||||
# Base image handed to build_images.sh through --base-image.
BASE_IMAGE="argus-sys-metric-test-gpu-node:latest"

# Tag the bundle is looked up under once the build has finished.
# NOTE(review): the CPU wrapper checks for this same tag, so a GPU build
# presumably replaces whatever it pointed to before — confirm this is intended.
BUILT_IMAGE="argus-sys-metric-test-node-bundle:latest"

# Delegated build command, kept as an array so arguments stay separate words.
CMD=(
  "./deployment/build/build_images.sh"
  "--with-node-bundle"
  "--base-image" "$BASE_IMAGE"
)
[[ -z "$VERSION" ]] || CMD+=("--client-version" "$VERSION")

echo "[GPU-BUNDLE] invoking: ${CMD[*]}"
"${CMD[@]}"

echo "[GPU-BUNDLE] re-tagging to $OUT_TAG"
if ! docker image inspect "$BUILT_IMAGE" >/dev/null 2>&1; then
  echo "[ERR] base bundle image missing: $BUILT_IMAGE" >&2
  exit 1
fi
docker tag "$BUILT_IMAGE" "$OUT_TAG"
if ! docker image inspect "$OUT_TAG" >/dev/null 2>&1; then
  echo "[ERR] re-tag failed" >&2
  exit 1
fi

echo "[GPU-BUNDLE] built image: $OUT_TAG (base=$BASE_IMAGE)"
|
||||
|
||||
1
src/sys/build/node-bundle/.gitignore
vendored
Normal file
1
src/sys/build/node-bundle/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
bundle/*.tar.gz
|
||||
1
src/sys/build/node-bundle/bundle/.gitignore
vendored
1
src/sys/build/node-bundle/bundle/.gitignore
vendored
@ -1 +0,0 @@
|
||||
argus-metric_*.tar.gz
|
||||
@ -19,3 +19,6 @@ WEB_PROXY_PORT_8085=8085
|
||||
ARGUS_BUILD_UID=2133
|
||||
ARGUS_BUILD_GID=2015
|
||||
|
||||
# Node bundle images
|
||||
NODE_BUNDLE_IMAGE_TAG=argus-sys-metric-test-node-bundle:latest
|
||||
NODE_GPU_BUNDLE_IMAGE_TAG=argus-sys-metric-test-node-bundle-gpu:latest
|
||||
|
||||
36
src/sys/swarm_tests/docker-compose.gpu-node.yml
Normal file
36
src/sys/swarm_tests/docker-compose.gpu-node.yml
Normal file
@ -0,0 +1,36 @@
|
||||
# Compose definition for a single GPU metric node container.
version: "3.8"

networks:
  # Pre-existing network; compose attaches to it instead of creating it.
  argus-sys-net:
    external: true

services:
  metric-gpu-node:
    image: ${NODE_GPU_BUNDLE_IMAGE_TAG:-argus-sys-metric-test-node-bundle-gpu:latest}
    container_name: argus-metric-gpu-node-swarm
    hostname: ${GPU_NODE_HOSTNAME:-swarm-metric-gpu-001}
    restart: unless-stopped
    privileged: true
    runtime: nvidia
    environment:
      - TZ=Asia/Shanghai
      - DEBIAN_FRONTEND=noninteractive
      - MASTER_ENDPOINT=${MASTER_ENDPOINT:-http://master.argus.com:3000}
      # FTPIP and BINDIP have no fallback: they must come from the
      # environment or the env file passed to compose.
      - FTPIP=${FTPIP}
      - BINDIP=${BINDIP}
      - FTP_USER=${FTP_USER:-ftpuser}
      # NOTE(review): a concrete password is committed as the fallback value;
      # consider requiring FTP_PASSWORD from the environment instead.
      - FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!}
      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
      - AGENT_ENV=${AGENT_ENV:-dev2}
      - AGENT_USER=${AGENT_USER:-yuyr}
      - AGENT_INSTANCE=${AGENT_INSTANCE:-gpu001sX}
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
      - GPU_MODE=gpu
    dns:
      - ${BINDIP}
    networks: [argus-sys-net]
    volumes:
      - ./private-gpu-nodes/argus/agent:/private/argus/agent
    # The container's main process only sleeps.
    # NOTE(review): presumably the node services are started some other way — confirm.
    command: ["sleep", "infinity"]
|
||||
33
src/sys/swarm_tests/scripts/05_gpu_node_up.sh
Executable file
33
src/sys/swarm_tests/scripts/05_gpu_node_up.sh
Executable file
@ -0,0 +1,33 @@
|
||||
#!/usr/bin/env bash
# Starts the GPU metric node compose project.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

# Both env files are optional. They are sourced with auto-export (set -a) so
# that every variable they define is exported to child processes.
ENV_FILE="$ROOT/.env"
if [[ -f "$ENV_FILE" ]]; then
  set -a
  source "$ENV_FILE"
  set +a
fi

ENV_NODES_FILE="$ROOT/.env.nodes"
if [[ -f "$ENV_NODES_FILE" ]]; then
  set -a
  source "$ENV_NODES_FILE"
  set +a
fi

# Compose project name (overridable through GPU_PROJECT) and compose file.
PROJECT="${GPU_PROJECT:-argus-swarm-gpu}"
COMPOSE_FILE="$ROOT/docker-compose.gpu-node.yml"
|
||||
|
||||
# Create the host directory used for the node's private agent data.
mkdir -p "$ROOT/private-gpu-nodes/argus/agent"

echo "[GPU] checking host NVIDIA driver/runtime"
# Stop early when nvidia-smi is not on the host's PATH.
command -v nvidia-smi >/dev/null 2>&1 || {
  echo "[ERR] nvidia-smi not found on host; install NVIDIA driver/runtime first" >&2
  exit 1
}
|
||||
|
||||
echo "[GPU] starting compose project: $PROJECT"

# Shared `docker compose` prefix for both calls below. --env-file is only
# added when the file exists: compose aborts on a missing env file, whereas
# this script treats .env.nodes as optional when sourcing it.
compose=(docker compose -p "$PROJECT")
if [[ -f "$ENV_NODES_FILE" ]]; then
  compose+=(--env-file "$ENV_NODES_FILE")
else
  echo "[WARN] $ENV_NODES_FILE not found; relying on variables already in the environment" >&2
fi
compose+=(-f "$COMPOSE_FILE")

"${compose[@]}" up -d
"${compose[@]}" ps
|
||||
|
||||
echo "[GPU] container GPU visibility"

# Probe quietly first; the GPU list is only printed when the probe succeeds.
# A failing probe is reported as a warning and does not abort the script.
gpu_container="argus-metric-gpu-node-swarm"
if docker exec "$gpu_container" nvidia-smi -L >/dev/null 2>&1; then
  docker exec "$gpu_container" nvidia-smi -L || true
else
  echo "[WARN] nvidia-smi failed inside container; check --gpus/runtime/driver" >&2
fi

echo "[GPU] done"
|
||||
|
||||
73
src/sys/swarm_tests/scripts/06_gpu_metric_verify.sh
Executable file
73
src/sys/swarm_tests/scripts/06_gpu_metric_verify.sh
Executable file
@ -0,0 +1,73 @@
|
||||
#!/usr/bin/env bash
# Verifies GPU node metrics.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

# Optional env file, sourced with auto-export (set -a) so that every variable
# it defines is exported.
ENV_FILE="$ROOT/.env"
if [[ -f "$ENV_FILE" ]]; then
  set -a
  source "$ENV_FILE"
  set +a
fi

# Service ports, overridable through PROMETHEUS_PORT / GRAFANA_PORT.
PROM_PORT="${PROMETHEUS_PORT:-9090}"
GRAF_PORT="${GRAFANA_PORT:-3000}"
|
||||
|
||||
# Reporting helpers. ok and warn write to stdout, err writes to stderr,
# and fail reports through err and then exits with status 1.
ok() {
  printf '[OK] %s\n' "$*"
}

warn() {
  printf '[WARN] %s\n' "$*"
}

err() {
  printf '[ERR] %s\n' "$*" >&2
}

fail() {
  err "$*"
  exit 1
}
|
||||
|
||||
# Hostname the GPU node is expected to have (overridable via GPU_NODE_HOSTNAME).
GPU_HOST="${GPU_NODE_HOSTNAME:-swarm-metric-gpu-001}"

# 1) nodes.json contains gpu node hostname
# Every outcome of this step is informational: a missing file or a missing
# entry only produces a warning.
NODES_JSON="$ROOT/private-server/argus/metric/prometheus/nodes.json"
if [[ ! -f "$NODES_JSON" ]]; then
  warn "nodes.json not found at $NODES_JSON"
elif jq -e --arg h "$GPU_HOST" '.[] | select(.hostname==$h)' "$NODES_JSON" >/dev/null 2>&1; then
  ok "nodes.json contains $GPU_HOST"
else
  warn "nodes.json does not list $GPU_HOST"
fi
|
||||
|
||||
# 2) Prometheus targets health for :9100 (must) and :9400 (optional)
targets_json="$ROOT/tmp/gpu-verify/targets.json"
mkdir -p "$(dirname "$targets_json")"
if ! curl -fsS "http://127.0.0.1:${PROM_PORT}/api/v1/targets" -o "$targets_json"; then
  fail "failed to fetch Prometheus targets"
fi

# IP of the GPU node container on argus-sys-net. Left empty when docker
# inspect fails (errors are suppressed on purpose).
GPU_IP=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' argus-metric-gpu-node-swarm 2>/dev/null || true)
if [[ -z "$GPU_IP" ]]; then
  warn "could not resolve GPU node IP on argus-sys-net; only the generic fallback checks will run"
fi

#######################################
# Succeed when the fetched targets list holds an active target with health
# "up" whose scrapeUrl contains ":PORT", or "IP:PORT" when IP is given.
# Globals:   targets_json (read)
# Arguments: $1 - port, $2 - optional IP address
# Returns:   0 when such a target exists, non-zero otherwise (including jq
#            errors, which are silenced)
#######################################
target_up() {
  local port=$1
  local ip=${2:-}
  jq -e --arg needle "${ip}:${port}" \
    '.data.activeTargets[] | select(.scrapeUrl | contains($needle)) | select(.health=="up")' \
    "$targets_json" >/dev/null 2>&1
}

# The node-specific checks only run with a known IP. With an empty IP the
# match would reduce to ":PORT" and any up target would be reported as the
# GPU node.
if [[ -n "$GPU_IP" ]] && target_up 9100 "$GPU_IP"; then
  ok "node-exporter 9100 up for GPU node ($GPU_IP)"
elif target_up 9100; then
  # fallback: any 9100 up
  ok "node-exporter 9100 has at least one up target (fallback)"
else
  fail "node-exporter 9100 has no up targets"
fi

if [[ -n "$GPU_IP" ]] && target_up 9400 "$GPU_IP"; then
  ok "dcgm-exporter 9400 up for GPU node"
elif target_up 9400; then
  ok "dcgm-exporter 9400 has up target (not necessarily GPU node)"
else
  warn "dcgm-exporter 9400 down or missing (acceptable in some envs)"
fi
|
||||
|
||||
# 3) Quick PromQL sample for DCGM metric (optional)
# A failed query is skipped without a message from this script; an empty
# result only produces a warning.
dcgm_json="$ROOT/tmp/gpu-verify/dcgm.json"
if curl -fsS "http://127.0.0.1:${PROM_PORT}/api/v1/query?query=DCGM_FI_DEV_GPU_UTIL" -o "$dcgm_json"; then
  if jq -e '.data.result | length > 0' "$dcgm_json" >/dev/null 2>&1; then
    ok "DCGM_FI_DEV_GPU_UTIL has samples"
  else
    warn "no samples for DCGM_FI_DEV_GPU_UTIL (not blocking)"
  fi
fi

echo "[DONE] gpu metric verify"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user