#!/usr/bin/env bash
# Argus server installer.
#
# This first section:
#   1. checks required commands and loads compose/.env,
#   2. makes sure Docker Swarm is active and the overlay network exists,
#   3. imports every images/*.tar.gz into the local Docker daemon,
#   4. starts the compose stack and waits (best-effort) until the base
#      services answer their health endpoints.
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
PKG_ROOT="$ROOT_DIR"
ENV_FILE="$PKG_ROOT/compose/.env"
COMPOSE_FILE="$PKG_ROOT/compose/docker-compose.yml"

# info MSG... — print a blue-tagged progress message on stdout.
info() { echo -e "\033[34m[INSTALL]\033[0m $*"; }

# err MSG... — print a red-tagged error message on stderr.
err() { echo -e "\033[31m[ERROR]\033[0m $*" >&2; }

# require CMD... — report every command that is not on PATH.
# Returns: 0 when all commands exist, 1 otherwise (which aborts under set -e).
require() {
  local ok=1 c
  for c in "$@"; do
    command -v "$c" >/dev/null 2>&1 || { err "缺少依赖: $c"; ok=0; }
  done
  [[ $ok -eq 1 ]]
}

require docker curl jq awk sed tar gzip

[[ -f "$ENV_FILE" ]] || { err "缺少 compose/.env,请先运行 scripts/config.sh"; exit 1; }
info "使用环境文件: $ENV_FILE"
# Export everything the env file defines so docker compose sees it too.
set -a
# shellcheck disable=SC1090
source "$ENV_FILE"
set +a

# Compatibility: if .env does not define SWARM_MANAGER_ADDR, fall back to the
# value stored in an existing cluster-info.env so an empty value is never
# written back later.
SMADDR="${SWARM_MANAGER_ADDR:-}"
CI_FILE="$PKG_ROOT/cluster-info.env"
if [[ -z "$SMADDR" && -f "$CI_FILE" ]]; then
  SMADDR=$(sed -n 's/^SWARM_MANAGER_ADDR=\(.*\)$/\1/p' "$CI_FILE" | head -n1)
fi
SWARM_MANAGER_ADDR="$SMADDR"

# Swarm init & overlay
# Ask the daemon for the node state directly instead of grepping the
# human-readable `docker info` output: `... | grep -q` can report a false
# failure under pipefail when grep exits before docker finishes writing.
swarm_state=$(docker info --format '{{.Swarm.LocalNodeState}}' 2>/dev/null || true)
if [[ "$swarm_state" != active ]]; then
  [[ -n "${SWARM_MANAGER_ADDR:-}" ]] || { err "SWARM_MANAGER_ADDR 未设置,请在 scripts/config.sh 中配置"; exit 1; }
  info "初始化 Swarm (--advertise-addr $SWARM_MANAGER_ADDR)"
  # Still best-effort (the script continues), but no longer silent.
  docker swarm init --advertise-addr "$SWARM_MANAGER_ADDR" >/dev/null 2>&1 \
    || err "docker swarm init 失败,后续 overlay 网络创建可能失败"
else
  info "Swarm 已激活"
fi

NET_NAME="${ARGUS_OVERLAY_NET:-argus-sys-net}"
if ! docker network inspect "$NET_NAME" >/dev/null 2>&1; then
  info "创建 overlay 网络: $NET_NAME"
  docker network create -d overlay --attachable "$NET_NAME" >/dev/null
else
  info "overlay 网络已存在: $NET_NAME"
fi

# Load images
IMAGES_DIR="$PKG_ROOT/images"
shopt -s nullglob # an empty directory must yield an empty array, not the literal pattern
tars=("$IMAGES_DIR"/*.tar.gz)
shopt -u nullglob
if [[ ${#tars[@]} -eq 0 ]]; then
  err "images 目录为空,缺少镜像 tar.gz"
  exit 1
fi
total=${#tars[@]}
idx=0
for tgz in "${tars[@]}"; do
  idx=$((idx + 1))
  info "导入镜像 ($idx/$total): $(basename "$tgz")"
  # Stream the archive into docker instead of writing an uncompressed copy to
  # a temp file: no extra disk usage and nothing left behind if the import
  # aborts. pipefail makes a gunzip error fail the pipeline as well.
  gunzip -c "$tgz" | docker load >/dev/null
done

# Compose up
info "启动服务栈 (docker compose up -d)"
docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" up -d
docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" ps

# Wait readiness (best-effort)

# code URL — print the HTTP status code returned by URL, or 000 on any curl
# failure. curl already emits "000" through -w when the request fails, so the
# result is captured and replaced rather than appended to (the previous
# `curl ... || echo 000` printed "000000"). --max-time keeps a hung endpoint
# from stalling the caller.
code() {
  local status
  status=$(curl -4 -s -o /dev/null --max-time 10 -w '%{http_code}' "$1") || status=000
  echo "${status:-000}"
}

# prom_ok — succeed when a TCP connection to the Prometheus port on localhost
# can be opened. The connection lives in a subshell so the descriptor is
# closed again immediately.
prom_ok() {
  (exec 3<>"/dev/tcp/127.0.0.1/${PROMETHEUS_PORT:-9090}") >/dev/null 2>&1
}

# kb_ok — succeed when the Kibana /api/status body contains
# "level": "available".
kb_ok() {
  local body
  body=$(curl -s --max-time 10 "http://127.0.0.1:${KIBANA_PORT:-5601}/api/status" || true)
  # A here-string avoids the echo|grep pipeline, which could fail with SIGPIPE
  # under pipefail once grep -q exits on an early match.
  grep -q '"level"[[:space:]]*:[[:space:]]*"available"' <<<"$body"
}

RETRIES=${RETRIES:-60}
SLEEP=${SLEEP:-5}
ok=0
info "等待基础服务就绪 (<= $((RETRIES * SLEEP))s)"
for ((i = 1; i <= RETRIES; i++)); do
  e1=$(code "http://127.0.0.1:${MASTER_PORT:-32300}/readyz")
  e2=$(code "http://127.0.0.1:${ES_HTTP_PORT:-9200}/_cluster/health")
  e3=000
  if prom_ok; then e3=200; fi
  e4=$(code "http://127.0.0.1:${GRAFANA_PORT:-3000}/api/health")
  e5=$(code "http://127.0.0.1:${ALERTMANAGER_PORT:-9093}/api/v2/status")
  e6=000
  if kb_ok; then e6=200; fi
  info "[ready] t=$((i * SLEEP))s master=$e1 es=$e2 prom=$e3 graf=$e4 alert=$e5 kibana=$e6"

  # Count how many of the six probes reported 200 in this round.
  ok=0
  for st in "$e1" "$e2" "$e3" "$e4" "$e5" "$e6"; do
    if [[ "$st" == 200 ]]; then ok=$((ok + 1)); fi
  done
  if [[ $ok -ge 6 ]]; then break; fi
  sleep "$SLEEP"
done
# Not fatal: the install continues and the operator can re-check later.
[[ $ok -ge 6 ]] || err "部分服务未就绪(可稍后重试 selfcheck)"

# Resolve overlay IPs
# overlay_ip CONTAINER — print CONTAINER's IP address on the $NET_NAME network.
# Prints nothing and still returns 0 when docker inspect fails (for example
# the container does not exist or is not attached to that network).
overlay_ip() {
  docker inspect \
    -f "{{ (index .NetworkSettings.Networks \"${NET_NAME}\").IPAddress }}" \
    "$1" 2>/dev/null || true
}

bind_c=argus-bind-sys
ftp_c=argus-ftp
BINDIP=$(overlay_ip "$bind_c")
FTPIP=$(overlay_ip "$ftp_c")
info "解析 overlay IP: BINDIP=${BINDIP:-} FTPIP=${FTPIP:-}"

# Swarm join tokens (left empty when this node cannot provide them)
TOKEN_WORKER=$(docker swarm join-token -q worker 2>/dev/null || true)
TOKEN_MANAGER=$(docker swarm join-token -q manager 2>/dev/null || true)

# cluster-info.env
# NOTE(review): this file and the report below contain Swarm join tokens in
# clear text and are created with the default umask — confirm that is intended.
CI="$PKG_ROOT/cluster-info.env"
info "写入 cluster-info.env (manager/token/IP)"
cat >"$CI" <<EOF
SWARM_MANAGER_ADDR=${SWARM_MANAGER_ADDR:-}
BINDIP=${BINDIP:-}
FTPIP=${FTPIP:-}
SWARM_JOIN_TOKEN_WORKER=${TOKEN_WORKER:-}
SWARM_JOIN_TOKEN_MANAGER=${TOKEN_MANAGER:-}
EOF
info "已输出 $CI"

# Install report
ts=$(date +%Y%m%d-%H%M%S)
RPT="$PKG_ROOT/安装报告_${ts}.md"

# Probe each service once, up front, so the report body is plain expansion.
h_master=$(code "http://127.0.0.1:${MASTER_PORT:-32300}/readyz")
h_es=$(code "http://127.0.0.1:${ES_HTTP_PORT:-9200}/_cluster/health")
h_grafana=$(code "http://127.0.0.1:${GRAFANA_PORT:-3000}/api/health")
h_prom=000
if prom_ok; then h_prom=200; fi
h_alert=$(code "http://127.0.0.1:${ALERTMANAGER_PORT:-9093}/api/v2/status")
h_kibana=not-ready
if kb_ok; then h_kibana=available; fi

# Every port is expanded with the same default the health probes use (or an
# empty default where none exists), so a variable missing from .env no longer
# aborts the script under `set -u` while the report is being written.
cat >"$RPT" <<EOF
# Argus Server 安装报告 (${ts})

## 端口映射
- MASTER_PORT=${MASTER_PORT:-32300}
- ES_HTTP_PORT=${ES_HTTP_PORT:-9200}
- KIBANA_PORT=${KIBANA_PORT:-5601}
- PROMETHEUS_PORT=${PROMETHEUS_PORT:-9090}
- GRAFANA_PORT=${GRAFANA_PORT:-3000}
- ALERTMANAGER_PORT=${ALERTMANAGER_PORT:-9093}
- WEB_PROXY_PORT_8080=${WEB_PROXY_PORT_8080:-} ... 8085=${WEB_PROXY_PORT_8085:-}

## Swarm/Overlay
- SWARM_MANAGER_ADDR=${SWARM_MANAGER_ADDR:-}
- NET=${NET_NAME}
- JOIN_TOKEN_WORKER=${TOKEN_WORKER:-}
- JOIN_TOKEN_MANAGER=${TOKEN_MANAGER:-}

## Overlay IPs
- BINDIP=${BINDIP:-}
- FTPIP=${FTPIP:-}

## 健康检查(简要)
- master/readyz=${h_master}
- es/_cluster/health=${h_es}
- grafana/api/health=${h_grafana}
- prometheus/tcp=${h_prom}
- alertmanager/api/v2/status=${h_alert}
- kibana/api/status=${h_kibana}
EOF
info "已生成报告: $RPT"

info "安装完成。可将 cluster-info.env 分发给 Client-GPU 安装方。"

# Write domain -> overlay IP files, then hot-reload Bind/Nginx.
# Each file $ETC_DIR/<domain> holds the current overlay IP of the container
# that serves that domain.
ETC_DIR="$PKG_ROOT/private/argus/etc"
mkdir -p "$ETC_DIR"
declare -A MAP=(
  [web-frontend]=web.argus.com
  [argus-grafana]=grafana.metric.argus.com
  [argus-prometheus]=prom.metric.argus.com
  [argus-kibana-sys]=kibana.log.argus.com
  [argus-alertmanager]=alertmanager.alert.argus.com
  [argus-master-sys]=master.argus.com
)
changed=0
for cname in "${!MAP[@]}"; do
  domain="${MAP[$cname]}"
  fpath="$ETC_DIR/$domain"
  ip=$(overlay_ip "$cname")
  if [[ -z "$ip" ]]; then
    echo "[DNS-FIX][WARN] $domain: container $cname no overlay IP yet"
    continue
  fi
  cur=$(cat "$fpath" 2>/dev/null || true)
  if [[ "$cur" != "$ip" ]]; then
    echo "$ip" >"$fpath"
    echo "[DNS-FIX][SET] $domain = $ip (was: ${cur:-})"
    changed=1
  else
    echo "[DNS-FIX][OK] $domain already $ip"
  fi
done

# Reload Bind only when a record changed: try the container's reload script
# first, then plain rndc; a failure of both is tolerated.
if [[ $changed -eq 1 ]]; then
  docker exec argus-bind-sys /usr/local/bin/reload-bind9.sh >/dev/null 2>&1 \
    || docker exec argus-bind-sys rndc reload >/dev/null 2>&1 \
    || true
  sleep 1
fi

# Reload Nginx only if its configuration test passes; never fail the install.
if docker exec argus-web-proxy nginx -t >/dev/null 2>&1; then
  docker exec argus-web-proxy nginx -s reload >/dev/null 2>&1 || true
fi