#!/usr/bin/env bash
set -euo pipefail

# ARM Client-CPU configuration script:
# - Reads the cluster-info.env generated on the server side (contains
#   SWARM_MANAGER_ADDR / SWARM_JOIN_TOKEN_WORKER).
# - Joins the Swarm as a worker.
# - Warms up the overlay network so the server can be reached through the
#   master.argus.com alias.
# - Creates/updates compose/.env (fills in the Swarm-related fields while
#   keeping hand-edited values such as AGENT_* and CPU_NODE_HOSTNAME).
#
# Environment overrides:
#   CLUSTER_INFO       path to cluster-info.env (default: <pkg root>/cluster-info.env)
#   ARGUS_OVERLAY_NET  docker network name      (default: argus-sys-net)
#
# Exit status: 0 when compose/.env is complete, 1 on any missing dependency,
# failed overlay check, or required value that still has to be filled in.

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
PKG_ROOT="$ROOT_DIR"
ENV_EX="$PKG_ROOT/compose/.env.example"
ENV_OUT="$PKG_ROOT/compose/.env"

# Log helpers. printf '%s' is used instead of `echo -e` so that backslashes
# inside the message (paths, values) are printed literally.
info(){ printf '\033[34m[CONFIG-ARM-CLIENT]\033[0m %s\n' "$*"; }
err(){ printf '\033[31m[ERROR]\033[0m %s\n' "$*" >&2; }

# require CMD...: report every missing command, then return non-zero if any
# was missing (which aborts the script under `set -e`).
require(){
  local ok=1 c
  for c in "$@"; do
    command -v "$c" >/dev/null 2>&1 || { err "缺少依赖: $c"; ok=0; }
  done
  [[ $ok -eq 1 ]]
}

# Compose detection: prefer `docker compose` (v2), fall back to
# `docker-compose` (v1); exit 1 when neither works.
require_compose(){
  if docker compose version >/dev/null 2>&1; then
    return 0
  fi
  if command -v docker-compose >/dev/null 2>&1 && docker-compose version >/dev/null 2>&1; then
    return 0
  fi
  err "未检测到 Docker Compose,请安装 docker compose v2 或 docker-compose v1"
  exit 1
}

require docker curl jq awk sed tar gzip
require_compose

[[ -f "$ENV_EX" ]] || { err "缺少模板文件: $ENV_EX"; exit 1; }

# Disk space check (MB). Advisory only: it prints an error line when less
# than 5120 MB is free (or the free space cannot be determined) but never
# stops the script; callers append `|| true`.
check_disk(){
  local p="$1"
  local need=5120
  local free
  free=$(df -Pm "$p" | awk 'NR==2{print $4+0}')
  if [[ -z "$free" || "$free" -lt "$need" ]]; then
    err "磁盘空间不足: $p 剩余 ${free:-0}MB (<${need}MB)"
  fi
}
check_disk "$PKG_ROOT" || true
check_disk "/var/lib/docker" || true

# Import cluster-info.env (defaults to the package root; CLUSTER_INFO may
# point somewhere else).
CI_IN="${CLUSTER_INFO:-$PKG_ROOT/cluster-info.env}"
NET_NAME="${ARGUS_OVERLAY_NET:-argus-sys-net}"
SWARM_AVAILABLE=true

if [[ -f "$CI_IN" ]]; then
  info "读取 cluster-info.env: $CI_IN"
  # shellcheck disable=SC1090 — path is only known at run time
  set -a; source "$CI_IN"; set +a

  # Try to join the Swarm. Best effort on purpose: the join fails harmlessly
  # when the node is already a member, and an empty token/address is tolerated.
  if [[ -n "${SWARM_MANAGER_ADDR:-}" && -n "${SWARM_JOIN_TOKEN_WORKER:-}" ]]; then
    info "尝试加入 Swarm(幂等):$SWARM_MANAGER_ADDR"
    docker swarm join --token "$SWARM_JOIN_TOKEN_WORKER" "$SWARM_MANAGER_ADDR":2377 >/dev/null 2>&1 || true
  fi

  # Check the Swarm state of this node. The output is captured first rather
  # than piped into `grep -q`: with pipefail, grep exiting on the first match
  # can kill the writer with SIGPIPE and make an active Swarm look inactive.
  docker_info="$(docker info 2>/dev/null || true)"
  if [[ "$docker_info" != *"Swarm: active"* ]]; then
    info "检测到当前节点 Swarm 未启用,client_arm 将退回本地 network 模式"
    SWARM_AVAILABLE=false
  fi
else
  info "未找到 cluster-info.env($CI_IN),client_arm 将以本地 network 模式运行"
  SWARM_AVAILABLE=false
fi

# Create/update .env (hand-edited entries are kept; existing keys are only
# rewritten by set_kv).
if [[ ! -f "$ENV_OUT" ]]; then
  cp "$ENV_EX" "$ENV_OUT"
fi

# append_env_line LINE: append LINE to $ENV_OUT. If the file's last line has
# no terminating newline, add one first so LINE is not glued onto it.
append_env_line(){
  if [[ -s "$ENV_OUT" && -n "$(tail -c 1 "$ENV_OUT")" ]]; then
    echo >> "$ENV_OUT"
  fi
  printf '%s\n' "$1" >> "$ENV_OUT"
}

# set_kv KEY VALUE: replace the KEY= line in $ENV_OUT, or append it when the
# key is not present yet.
set_kv(){
  local k="$1" v="$2" esc
  if grep -q "^${k}=" "$ENV_OUT"; then
    # Escape the characters that are special in the sed replacement below
    # (backslash, '&', and the '#' delimiter) so the value is written verbatim.
    esc="$(printf '%s' "$v" | sed -e 's/[\\&#]/\\&/g')"
    sed -i -E "s#^${k}=.*#${k}=${esc}#" "$ENV_OUT"
  else
    append_env_line "${k}=${v}"
  fi
}

# get_kv KEY: print the value of the first KEY= line in $ENV_OUT, or nothing
# when the key is absent. The trailing `|| true` matters: grep exits 1 on no
# match, which under pipefail + `set -e` would otherwise abort the script at
# the caller's assignment before any diagnostic is printed.
get_kv(){
  local k="$1"
  grep -E "^${k}=" "$ENV_OUT" | head -1 | cut -d= -f2- || true
}

# Set when local-network mode finds no MASTER_ENDPOINT; makes the script exit
# non-zero at the end instead of announcing success.
MASTER_ENDPOINT_MISSING=false

if [[ "$SWARM_AVAILABLE" == true ]]; then
  # ===== Swarm mode: depends on the overlay network and the DNS alias =====
  # Load busybox if needed, then warm up the overlay and test connectivity.
  if ! docker image inspect busybox:latest >/dev/null 2>&1; then
    if [[ -f "$PKG_ROOT/images/busybox.tar" ]]; then
      info "加载 busybox.tar 以预热 overlay"
      docker load -i "$PKG_ROOT/images/busybox.tar" >/dev/null
    else
      err "缺少 busybox 镜像(包内 images/busybox.tar 或本地 busybox:latest),无法预热 overlay $NET_NAME"
      exit 1
    fi
  fi

  # Remove any leftover warmup container; failure just means there was none.
  docker rm -f argus-net-warmup >/dev/null 2>&1 || true
  info "启动 warmup 容器加入 overlay: $NET_NAME"
  # A failed start is not fatal here; it is caught by the checks below.
  docker run -d --rm --name argus-net-warmup --network "$NET_NAME" busybox:latest sleep 600 >/dev/null 2>&1 || true

  # Wait up to 60 s for the overlay network to show up on this node.
  for i in {1..60}; do
    if docker network inspect "$NET_NAME" >/dev/null 2>&1; then
      info "overlay 可见 (t=${i}s)"
      break
    fi
    sleep 1
  done
  docker network inspect "$NET_NAME" >/dev/null 2>&1 || {
    err "预热后仍未看到 overlay: $NET_NAME;请确认 server 侧 overlay 已创建且可达"
    exit 1
  }

  # Resolve and ping the master alias from inside the warmup container.
  if ! docker exec argus-net-warmup sh -lc "ping -c 1 -W 2 master.argus.com >/dev/null 2>&1"; then
    err "warmup 容器内无法通过别名访问 master.argus.com;请确认 server compose 已启动并加入 overlay $NET_NAME"
    exit 1
  fi
  info "warmup 容器内可达 master.argus.com(Docker DNS + alias 正常)"

  # Record the Swarm-related fields to help later diagnosis.
  set_kv SWARM_MANAGER_ADDR "${SWARM_MANAGER_ADDR:-}"
  set_kv SWARM_JOIN_TOKEN_WORKER "${SWARM_JOIN_TOKEN_WORKER:-}"
  set_kv SWARM_JOIN_TOKEN_MANAGER "${SWARM_JOIN_TOKEN_MANAGER:-}"

  # When MASTER_ENDPOINT is not set explicitly, default to the overlay alias.
  if ! grep -q '^MASTER_ENDPOINT=' "$ENV_OUT"; then
    append_env_line "MASTER_ENDPOINT=http://master.argus.com:3000"
  fi
else
  # ===== Degraded local-network mode: no Swarm, no overlay =====
  if ! docker network inspect "$NET_NAME" >/dev/null 2>&1; then
    info "创建本地 bridge 网络: $NET_NAME(client_arm 退化模式)"
    docker network create "$NET_NAME" >/dev/null
  else
    info "本地网络已存在: $NET_NAME"
  fi

  # When MASTER_ENDPOINT is not present, add an empty entry for the user to
  # fill in by hand.
  if ! grep -q '^MASTER_ENDPOINT=' "$ENV_OUT"; then
    append_env_line "MASTER_ENDPOINT="
  fi
  ME="$(get_kv MASTER_ENDPOINT)"
  if [[ -z "$ME" ]]; then
    err "本地 network 模式下必须配置 MASTER_ENDPOINT(示例:http://<master-host>:<port>),请编辑 compose/.env 后重试"
    # Do not exit yet: let the required-variable check below report as well,
    # so the user sees everything that needs editing in one run.
    MASTER_ENDPOINT_MISSING=true
  else
    info "当前为本地 network 模式,metric-cpu-node 将通过 MASTER_ENDPOINT=${ME} 访问 master"
  fi
fi

# Check the fields the user must fill in (absent and empty both count).
REQ_VARS=(AGENT_ENV AGENT_USER AGENT_INSTANCE CPU_NODE_HOSTNAME)
missing=()
for v in "${REQ_VARS[@]}"; do
  val="$(get_kv "$v")"
  if [[ -z "$val" ]]; then
    missing+=("$v")
  fi
done
if [[ ${#missing[@]} -gt 0 ]]; then
  err "以下变量必须在 compose/.env 中填写:${missing[*]}(已保留你现有的内容,不会被覆盖)"
  exit 1
fi

# The MASTER_ENDPOINT error was already printed above; fail here rather than
# announcing a usable configuration.
if [[ "$MASTER_ENDPOINT_MISSING" == true ]]; then
  exit 1
fi

info "已生成 compose/.env;可执行 scripts/install.sh 启动 ARM Client-CPU"