141 lines
6.0 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
set -euo pipefail
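# ARM Client-CPU configuration script:
# - Reads the cluster-info.env generated on the server side (contains SWARM_MANAGER_ADDR / SWARM_JOIN_TOKEN_WORKER)
# - Joins the Swarm (as a worker)
# - Warms up the overlay network so the server is reachable via master.argus.com
# - Creates/updates compose/.env (fills in the Swarm-related fields; keeps hand-edited AGENT_*, CPU_NODE_HOSTNAME, etc.)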
# Package root: the parent of the directory that holds this script.
ROOT_DIR="$(dirname "${BASH_SOURCE[0]}")"
ROOT_DIR="$(cd "${ROOT_DIR}/.." && pwd)"
PKG_ROOT="${ROOT_DIR}"
# Template shipped with the package, and the env file this script creates/updates.
ENV_EX="${PKG_ROOT}/compose/.env.example"
ENV_OUT="${PKG_ROOT}/compose/.env"
# Print an informational message to stdout, prefixed with a blue
# [CONFIG-ARM-CLIENT] tag.
# Arguments: message words (joined with single spaces).
# The text is emitted through printf '%s', so backslashes inside the message
# (e.g. in paths or values) are printed verbatim instead of being interpreted
# as escape sequences.
info() { printf '\033[34m[CONFIG-ARM-CLIENT]\033[0m %s\n' "$*"; }
# Print an error message to stderr, prefixed with a red [ERROR] tag.
# Arguments: message words (joined with single spaces).
# The text is emitted through printf '%s', so backslashes inside the message
# are printed verbatim instead of being interpreted as escape sequences.
err() { printf '\033[31m[ERROR]\033[0m %s\n' "$*" >&2; }
# Verify that every named command can be resolved by the shell.
# Arguments: one or more command names.
# Outputs:   one error line (via err) per command that is not found.
# Returns:   0 when all commands are available, 1 when at least one is missing.
#            Every name is checked before returning, so all missing
#            dependencies are reported in a single run.
require() {
  # Both variables are local so the loop does not overwrite a caller's globals.
  local cmd ok=1
  for cmd in "$@"; do
    if ! command -v "$cmd" >/dev/null 2>&1; then
      err "缺少依赖: $cmd"
      ok=0
    fi
  done
  [[ $ok -eq 1 ]]
}
# Compose detection: prefer `docker compose` (v2), fall back to `docker-compose` (v1).
# Returns 0 as soon as one of them answers `version` successfully; otherwise
# logs an error via err and terminates the script with status 1.
require_compose() {
  # v2 plugin.
  docker compose version >/dev/null 2>&1 && return 0
  # v1 standalone binary: must both exist and run.
  { command -v docker-compose && docker-compose version; } >/dev/null 2>&1 && return 0
  err "未检测到 Docker Compose请安装 docker compose v2 或 docker-compose v1"
  exit 1
}
# Fail fast on missing prerequisites: CLI tools, Docker Compose, and the .env
# template. A non-zero status from `require` stops the script because of `set -e`.
require docker curl jq awk sed tar gzip
require_compose
if [[ ! -f "$ENV_EX" ]]; then
  err "缺少模板文件: $ENV_EX"
  exit 1
fi
# Disk-space check (MB).
# Reports when the filesystem that holds a path has too little free space.
# Arguments: $1 - path to check
#            $2 - optional minimum free space in MB (default: 5120)
# Outputs:   one error line via err when free space is below the minimum or
#            cannot be determined (reported as 0MB).
# Returns:   0 when enough space is available, 1 otherwise. Callers that only
#            want a warning append `|| true`.
check_disk() {
  local path="$1"
  local need="${2:-5120}"
  local free
  # With -P each filesystem is printed on a single line, so line 2 is the data
  # row and column 4 its available space (-m selects MB units).
  # `|| true`: a df failure (e.g. the path does not exist) must not abort the
  # script under `set -e -o pipefail`; it is reported below instead.
  free=$(df -Pm "$path" | awk 'NR==2{print $4+0}') || true
  if [[ -z "$free" || "$free" -lt "$need" ]]; then
    err "磁盘空间不足: $path 剩余 ${free:-0}MB (<${need}MB)"
    return 1
  fi
}
# Advisory checks only: whatever check_disk returns, configuration continues.
check_disk "${PKG_ROOT}" || :
check_disk /var/lib/docker || :
# Import cluster-info.env. By default it is read from the package root; set
# CLUSTER_INFO to point at a different file.
CI_IN="${CLUSTER_INFO:-$PKG_ROOT/cluster-info.env}"
# Name of the Docker network to use; override with ARGUS_OVERLAY_NET.
NET_NAME="${ARGUS_OVERLAY_NET:-argus-sys-net}"
# Cleared below when this node cannot take part in the Swarm.
SWARM_AVAILABLE=true
if [[ -f "$CI_IN" ]]; then
  info "读取 cluster-info.env: $CI_IN"
  # Source with auto-export on, so every variable the file defines is exported.
  set -a
  # shellcheck disable=SC1090
  source "$CI_IN"
  set +a
  # Try to join the Swarm. The attempt is best-effort (output discarded,
  # failure ignored) and is skipped when the address or worker token is empty.
  if [[ -n "${SWARM_MANAGER_ADDR:-}" && -n "${SWARM_JOIN_TOKEN_WORKER:-}" ]]; then
    info "尝试加入 Swarm幂等$SWARM_MANAGER_ADDR"
    docker swarm join --token "$SWARM_JOIN_TOKEN_WORKER" "$SWARM_MANAGER_ADDR":2377 >/dev/null 2>&1 || true
  fi
  # Check the Swarm state of this node. The output is captured first instead of
  # piping into `grep -q`: under `pipefail`, grep exiting on the first match can
  # make `docker info` fail with a broken pipe, which would be misread as
  # "Swarm not active". A failing `docker info` still counts as not active.
  docker_info="$(docker info 2>/dev/null)" || docker_info=""
  if [[ "$docker_info" != *"Swarm: active"* ]]; then
    info "检测到当前节点 Swarm 未启用client_arm 将退回本地 network 模式"
    SWARM_AVAILABLE=false
  fi
else
  # ${CI_IN} is braced and followed by a separator: written as $CI_INclient_arm
  # it names a different, unset variable and aborts the script under `set -u`.
  info "未找到 cluster-info.env: ${CI_IN}, client_arm 将以本地 network 模式运行"
  SWARM_AVAILABLE=false
fi
# Create compose/.env from the template only when it does not exist yet; an
# existing file is left in place so hand-edited entries are kept.
[[ -f "$ENV_OUT" ]] || cp "$ENV_EX" "$ENV_OUT"
# Set KEY=VALUE in the env file.
# Globals:   ENV_OUT (read) - path of the file that is edited in place.
# Arguments: $1 - key; expected to be a plain identifier, since it is used
#                 inside a regular expression.
#            $2 - value; may be empty.
# Behavior:  every existing line starting with "KEY=" is rewritten; when no
#            such line exists, "KEY=VALUE" is appended.
set_kv() {
  local k="$1" v="$2"
  if grep -q "^${k}=" "$ENV_OUT"; then
    # Escape the characters that are special in a sed replacement (`\`, `&`)
    # and the `#` delimiter, so the value is written back literally.
    local escaped
    escaped=$(printf '%s' "$v" | sed -e 's/[\\&#]/\\&/g')
    sed -i -E "s#^${k}=.*#${k}=${escaped}#" "$ENV_OUT"
  else
    printf '%s=%s\n' "$k" "$v" >>"$ENV_OUT"
  fi
}
if [[ "$SWARM_AVAILABLE" == true ]]; then
  # ===== Swarm mode: hard dependency on the overlay network and its DNS alias =====
  # Make sure busybox:latest is available locally, loading it from the package
  # when necessary; it is the image used by the warm-up container below.
  if ! docker image inspect busybox:latest >/dev/null 2>&1; then
    if [[ -f "$PKG_ROOT/images/busybox.tar" ]]; then
      info "加载 busybox.tar 以预热 overlay"
      docker load -i "$PKG_ROOT/images/busybox.tar" >/dev/null
    else
      err "缺少 busybox 镜像(包内 images/busybox.tar 或本地 busybox:latest无法预热 overlay $NET_NAME"
      exit 1
    fi
  fi
  # Remove any leftover warm-up container, then start a new one attached to the
  # network. Both commands are best-effort; a failure shows up in the checks
  # that follow.
  docker rm -f argus-net-warmup >/dev/null 2>&1 || true
  info "启动 warmup 容器加入 overlay: $NET_NAME"
  docker run -d --rm --name argus-net-warmup --network "$NET_NAME" busybox:latest sleep 600 >/dev/null 2>&1 || true
  # Poll up to 60 times, one second apart, until the network is visible here.
  for i in {1..60}; do
    if docker network inspect "$NET_NAME" >/dev/null 2>&1; then
      info "overlay 可见 (t=${i}s)"
      break
    fi
    sleep 1
  done
  if ! docker network inspect "$NET_NAME" >/dev/null 2>&1; then
    err "预热后仍未看到 overlay: ${NET_NAME};请确认 server 侧 overlay 已创建且可达"
    exit 1
  fi
  # Connectivity check: the alias must resolve and answer a ping from inside
  # the warm-up container.
  if ! docker exec argus-net-warmup sh -lc "ping -c 1 -W 2 master.argus.com >/dev/null 2>&1"; then
    err "warmup 容器内无法通过别名访问 master.argus.com请确认 server compose 已启动并加入 overlay $NET_NAME"
    exit 1
  fi
  info "warmup 容器内可达 master.argus.comDocker DNS + alias 正常)"
  # Record the Swarm-related fields in .env to ease later diagnostics.
  set_kv SWARM_MANAGER_ADDR "${SWARM_MANAGER_ADDR:-}"
  set_kv SWARM_JOIN_TOKEN_WORKER "${SWARM_JOIN_TOKEN_WORKER:-}"
  set_kv SWARM_JOIN_TOKEN_MANAGER "${SWARM_JOIN_TOKEN_MANAGER:-}"
  # When MASTER_ENDPOINT is not present in .env, default to the overlay alias.
  if ! grep -q '^MASTER_ENDPOINT=' "$ENV_OUT"; then
    echo "MASTER_ENDPOINT=http://master.argus.com:3000" >> "$ENV_OUT"
  fi
else
  # ===== Local-network fallback mode: no dependency on Swarm or the overlay =====
  if ! docker network inspect "$NET_NAME" >/dev/null 2>&1; then
    # ${NET_NAME} is braced and followed by a parenthesis: written as
    # $NET_NAMEclient_arm it names a different, unset variable and aborts the
    # script under `set -u`.
    info "创建本地 bridge 网络: ${NET_NAME}(client_arm 退化模式)"
    docker network create "$NET_NAME" >/dev/null
  else
    info "本地网络已存在: $NET_NAME"
  fi
  # When MASTER_ENDPOINT is not present in .env, add an empty entry so the
  # user has a line to fill in.
  if ! grep -q '^MASTER_ENDPOINT=' "$ENV_OUT"; then
    echo "MASTER_ENDPOINT=" >> "$ENV_OUT"
  fi
  # Value of the first MASTER_ENDPOINT line (everything after the first '=').
  ME=$(grep -E '^MASTER_ENDPOINT=' "$ENV_OUT" | head -1 | cut -d= -f2-)
  if [[ -z "$ME" ]]; then
    # Reported as an error, but the script carries on to the checks below.
    err "本地 network 模式下必须配置 MASTER_ENDPOINT示例http://<Server-IP>:<Port>),请编辑 compose/.env 后重试"
  else
    info "当前为本地 network 模式metric-cpu-node 将通过 MASTER_ENDPOINT=${ME} 访问 master"
  fi
fi
# Verify the fields that the user has to fill in by hand.
REQ_VARS=(AGENT_ENV AGENT_USER AGENT_INSTANCE CPU_NODE_HOSTNAME)
missing=()
for v in "${REQ_VARS[@]}"; do
  # Take the value of the first "KEY=" line (everything after the first '=').
  # A single awk call is used instead of `grep | head | cut`: when the key is
  # absent, grep exits 1, and with `set -e -o pipefail` that would terminate
  # the script here, before the list of missing variables is printed.
  val=$(awk -v k="$v" 'index($0, k "=") == 1 { print substr($0, length(k) + 2); exit }' "$ENV_OUT")
  if [[ -z "$val" ]]; then
    missing+=("$v")
  fi
done
if [[ ${#missing[@]} -gt 0 ]]; then
  err "以下变量必须在 compose/.env 中填写:${missing[*]}(已保留你现有的内容,不会被覆盖)"
  exit 1
fi
info "已生成 compose/.env可执行 scripts/install.sh 启动 ARM Client-CPU"