[#37] swarm 部署优化

This commit is contained in:
yuyr 2025-11-04 09:34:42 +08:00
parent 4ed5c64804
commit c8279997a4
7 changed files with 234 additions and 29 deletions

View File

@ -46,7 +46,11 @@ make_dir "$STAGE/private/argus"
# 2) Compose: derive from sys/tests by removing test-only services # 2) Compose: derive from sys/tests by removing test-only services
SRC_COMPOSE="$ROOT_DIR/src/sys/tests/docker-compose.yml" SRC_COMPOSE="$ROOT_DIR/src/sys/tests/docker-compose.yml"
[[ -f "$SRC_COMPOSE" ]] || { err "missing $SRC_COMPOSE"; exit 1; } [[ -f "$SRC_COMPOSE" ]] || { err "missing $SRC_COMPOSE"; exit 1; }
awk -f "$BUILD_DIR/templates/docker-compose.filter.awk" -v remove="node-a,node-b,test-node,test-gpu-node" "$SRC_COMPOSE" > "$STAGE/compose/docker-compose.yml" # 2.1 filter out test services
# 2.1 filter out test services into an intermediate file
tmp_compose1="$STAGE/compose/docker-compose.filtered.yml"
awk -f "$BUILD_DIR/templates/docker-compose.filter.awk" -v remove="node-a,node-b,test-node,test-gpu-node" "$SRC_COMPOSE" > "$tmp_compose1"
# 2.2 transform to external overlay network (remove sysnet and per-service blocks)
awk -f "$BUILD_DIR/templates/docker-compose.overlay.awk" "$tmp_compose1" > "$STAGE/compose/docker-compose.yml"
# The filtered file is only an intermediate step; drop it so it does not stay
# behind in the staged compose/ directory next to the final docker-compose.yml.
rm -f -- "$tmp_compose1"
cp "$BUILD_DIR/templates/.env.example" "$STAGE/compose/.env.example" cp "$BUILD_DIR/templates/.env.example" "$STAGE/compose/.env.example"
# fix relative private path to match package layout (compose/ and private/ are siblings) # fix relative private path to match package layout (compose/ and private/ are siblings)
sed -i 's#\./private/#../private/#g' "$STAGE/compose/docker-compose.yml" sed -i 's#\./private/#../private/#g' "$STAGE/compose/docker-compose.yml"
@ -55,17 +59,41 @@ sed -i -E 's#- \./private:/private#- ../private:/private#g' "$STAGE/compose/dock
# drop timezone file bind which may not exist on target distros (e.g. NixOS) # drop timezone file bind which may not exist on target distros (e.g. NixOS)
sed -i '/\/etc\/timezone:\/etc\/timezone:ro/d' "$STAGE/compose/docker-compose.yml" sed -i '/\/etc\/timezone:\/etc\/timezone:ro/d' "$STAGE/compose/docker-compose.yml"
# sanity-check: ensure test services are absent # sanity-check: ensure test services are absent and external network present
if grep -E '^(\s{2})(node-a|node-b|test-node|test-gpu-node):\s*$' -n "$STAGE/compose/docker-compose.yml" >/dev/null; then if grep -E '^(\s{2})(node-a|node-b|test-node|test-gpu-node):\s*$' -n "$STAGE/compose/docker-compose.yml" >/dev/null; then
err "compose filter failed: test services still present"; exit 1; err "compose filter failed: test services still present"; exit 1;
fi fi
# The transformed compose file must contain both a top-level "networks:" key
# and an "argus-sys-net:" entry; abort the build if either one is missing.
if ! { grep -q '^networks:' "$STAGE/compose/docker-compose.yml" \
       && grep -q 'argus-sys-net:' "$STAGE/compose/docker-compose.yml"; }; then
  err "compose overlay transform failed: external network missing"
  exit 1
fi
# 3) Images (reuse if already exported unless --resave-image) # 3) Images (reuse if already exported unless --resave-image)
existing_images_tar="$PKG_DIR/images/all-images.tar.gz" existing_images_tar="$PKG_DIR/images/all-images.tar.gz"
if [[ "$RESAVE_IMAGE" == false && -f "$existing_images_tar" ]]; then if [[ "$RESAVE_IMAGE" == false && -f "$existing_images_tar" ]]; then
log "Reusing existing images tar: $existing_images_tar" log "Reusing existing images tar: $existing_images_tar"
cp "$existing_images_tar" "$STAGE/images/" cp "$existing_images_tar" "$STAGE/images/"
else elif [[ "$RESAVE_IMAGE" == false ]]; then
# Try cross-version reuse: pull images/all-images.tar.gz out of the newest
# server_*.tar.gz found under $ART_ROOT/server (newest by mtime).
latest_pkg=$(ls -1t "$ART_ROOT/server"/server_*.tar.gz 2>/dev/null | head -n1 || true)
if [[ -n "$latest_pkg" ]]; then
  log "Reusing images from: $latest_pkg"
  mkdir -p "$STAGE/images"
  # Extract into a scratch directory rather than into $STAGE itself: the
  # lookup below then cannot match an already staged file, and cleanup can
  # only ever remove the scratch directory. It lives under $STAGE so the
  # final mv stays on one filesystem (a rename, not a copy of a large tar).
  if reuse_tmp=$(mktemp -d "$STAGE/.reuse-images.XXXXXX"); then
    # extract matching file regardless of top-level dir
    if tar -xzf "$latest_pkg" -C "$reuse_tmp" --wildcards '*/images/all-images.tar.gz' 2>/dev/null; then
      found=$(find "$reuse_tmp" -type f -path '*/images/all-images.tar.gz' | head -n1 || true)
      if [[ -n "$found" ]]; then
        mv "$found" "$STAGE/images/all-images.tar.gz"
      fi
    fi
    # Always remove the scratch directory, including after a failed or
    # partial extraction, so nothing extra ends up in the staged package.
    rm -rf -- "${reuse_tmp:?}"
  fi
fi
fi
# If still not present, save from local docker daemon
if [[ ! -f "$STAGE/images/all-images.tar.gz" ]]; then
require_cmd docker gzip require_cmd docker gzip
images=( images=(
argus-bind9:latest argus-bind9:latest

View File

@ -27,3 +27,6 @@ FTP_DOMAIN=ftp.metric.argus.com
# GPU profile disabled by default # GPU profile disabled by default
ENABLE_GPU=false ENABLE_GPU=false
# External overlay network (Swarm attachable)
OVERLAY_NET_NAME=argus-sys-net

View File

@ -0,0 +1,74 @@
#!/usr/bin/awk -f
# Transform docker-compose.yml to use an external overlay network for all services
# - Remove top-level networks definition
# - Remove per-service networks block (including ipv4_address and sysnet refs)
# - Insert per-service networks: [argus-sys-net]
# - Append external networks mapping at the end
#
# Layout assumption: the compose file is indented with two spaces per level,
# i.e. service names at column 2, service keys at column 4 and list items at
# column 6. All patterns and emitted lines below follow that layout.
#
# Portability: only POSIX awk features are used (two-argument match(), literal
# spaces instead of \s and {n} intervals), so the script does not depend on
# gawk extensions when invoked through plain "awk -f".

BEGIN {
  in_top_networks = 0     # currently inside the top-level "networks:" block being dropped
  in_services = 0         # currently inside the top-level "services:" section
  in_service = 0          # currently inside one service definition
  svc_indent = 2          # column at which service names are indented
  skipping_nets = 0       # currently dropping a per-service "networks:" block
  networks_inserted = 0   # replacement networks entry already emitted for this service
}

# Return the service name when line is a "  name:" header (two-space indent,
# nothing after the colon), otherwise the empty string.
function svc_name(line,    name) {
  if (line !~ /^  [A-Za-z0-9_-]+:[ ]*$/) return ""
  name = line
  sub(/^  /, "", name)
  sub(/:[ ]*$/, "", name)
  return name
}

# True when line is a service header as recognised by svc_name().
function is_service_header(line) { return svc_name(line) != "" }

# Number of leading spaces in s (0 for an empty or all-space line).
function indent_len(s,    n) {
  n = match(s, /[^ ]/) - 1
  if (n < 0) n = 0
  return n
}

{
  # A non-indented "key:" line starts a new top-level section; that also ends
  # any top-level networks block and any service we were tracking.
  if ($0 ~ /^[A-Za-z0-9_-]+:[ ]*$/) {
    in_services = ($0 ~ /^services:[ ]*$/)
    in_top_networks = 0
    in_service = 0
    skipping_nets = 0
  }

  # Drop the original top-level "networks:" block; END appends the new one.
  if ($0 ~ /^networks:[ ]*$/) { in_top_networks = 1; next }
  if (in_top_networks) next

  if (in_services) {
    # Track service boundaries
    if (is_service_header($0)) {
      in_service = 1
      networks_inserted = 0
      skipping_nets = 0
      print
      next
    }

    # A non-blank line indented no deeper than a service name ends the service.
    if (in_service && indent_len($0) <= svc_indent && $0 !~ /^[ \t]*$/) {
      in_service = 0
      skipping_nets = 0
    }

    if (in_service) {
      # Skip any existing networks block under the service: the key itself
      # and every following line indented deeper than the key.
      if ($0 ~ /^    networks:[ ]*$/) { skipping_nets = 1; next }
      if (skipping_nets) {
        if (indent_len($0) <= 4) skipping_nets = 0
        else next
      }

      # After container_name or image (whichever comes first), inject the
      # overlay network reference exactly once per service.
      if (!networks_inserted && $0 ~ /^    (container_name|image):/) {
        print
        print "    networks:"
        print "      - argus-sys-net"
        networks_inserted = 1
        next
      }
      # no host port injection; bind serves DNS inside overlay only
    }
  }

  print
}

END {
  # Declare the overlay as an external network; ${OVERLAY_NET_NAME:-...} is
  # emitted literally so it is resolved when the compose file is used.
  print ""
  print "networks:"
  print "  argus-sys-net:"
  print "    external: true"
  print "    name: ${OVERLAY_NET_NAME:-argus-sys-net}"
}

View File

@ -0,0 +1,26 @@
# Argus 多机部署(Docker Swarm + External Overlay)
- 前提:Docker ≥ 20.10;Manager/Worker 节点开放 2377/tcp、7946/tcp+udp、4789/udp。
- DNS:Bind9 为统一权威,解析 *.argus.com 至部署机固定对外主机 IP。
## 在部署机(Manager)
- 初始化 Swarm:`docker swarm init --advertise-addr <manager_ip>`
- 创建 overlay:`docker network create --driver overlay --attachable argus-sys-net`
- 解压离线包后执行:
- `cd scripts && sudo ./server-prepare-dirs.sh`
- `./server-install.sh`(会验证 Swarm 状态、确保 overlay 存在、写入 dns.conf)
- `./server-selfcheck.sh`(失败会自动触发诊断)
## 在节点机(Worker 或非 Docker 主机)
- Swarm Worker:执行 Manager 的 `docker swarm join ...`
- 运行客户端容器:
- `docker run -d --name argus-metric-node-001 --network argus-sys-net -v /data/argus/agent:/private/argus/agent argus-sys-metric-test-node:latest sleep infinity`
- 进入容器安装(先 IP 引导,后域名):
- `curl -u ftpuser:*** -fsSL ftp://<FTP_IP>:21/setup.sh -o /tmp/setup.sh && chmod +x /tmp/setup.sh`
- `AGENT_ENV=dev AGENT_USER=alice AGENT_INSTANCE=node001 MASTER_ENDPOINT=http://master.argus.com:3000 /tmp/setup.sh --server ftp.argus.com --user ftpuser --password '***' --port 21`
## 关键点
- 首次必须使用 FTP 的 IP 引导(下载 setup.sh 与 dns.conf)
- MASTER_ENDPOINT 永远使用域名:`http://master.argus.com:3000`
- docker compose 改为 external overlay;容器内不使用 Docker 服务名;web-proxy 与组件上游统一用域名

View File

@ -6,10 +6,16 @@ ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a
mkdir -p "$ROOT/logs"
ts="$(date -u +%Y%m%d-%H%M%SZ)" ts="$(date -u +%Y%m%d-%H%M%SZ)"
DETAILS="$ROOT/logs/diagnose_details_${ts}.log" LOG_DIR="$ROOT/logs"
ERRORS="$ROOT/logs/diagnose_error_${ts}.log" mkdir -p "$LOG_DIR" || true
# Fallback to /tmp when logs dir is not writable
if ! ( : > "$LOG_DIR/.w" 2>/dev/null ); then
LOG_DIR="/tmp/argus-logs"
mkdir -p "$LOG_DIR" || true
fi
DETAILS="$LOG_DIR/diagnose_details_${ts}.log"
ERRORS="$LOG_DIR/diagnose_error_${ts}.log"
: > "$DETAILS"; : > "$ERRORS" : > "$DETAILS"; : > "$ERRORS"
logd() { echo "$(date '+%F %T') $*" >> "$DETAILS"; } logd() { echo "$(date '+%F %T') $*" >> "$DETAILS"; }
@ -83,6 +89,25 @@ logd "Web-Proxy 8083: $(http_code "http://localhost:${WEB_PROXY_PORT_8083:-8083}
logd "Web-Proxy 8084 CORS: ${cors8084}" logd "Web-Proxy 8084 CORS: ${cors8084}"
logd "Web-Proxy 8085 CORS: ${cors8085}" logd "Web-Proxy 8085 CORS: ${cors8085}"
# Overlay network diagnostics: record an error when the overlay network
# cannot be inspected; otherwise log its presence and append the first 60
# lines of its inspect output to the details log.
section OVERLAY-NET
if ! docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" >/dev/null 2>&1; then
  append_err "[overlay][network] missing ${OVERLAY_NET_NAME:-argus-sys-net}"
else
  logd "overlay present: ${OVERLAY_NET_NAME:-argus-sys-net}"
  # Best effort: a failure while dumping the details must not abort diagnostics.
  docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" \
    | sed -n '1,60p' >> "$DETAILS" 2>/dev/null || true
fi
# Print the HTTP status code curl gets for URL $1 when run inside the
# argus-web-proxy container; prints 000 when no code could be obtained
# (docker exec failed or produced no output).
# The inner command is passed to "sh -lc" as ONE plainly double-quoted
# argument. Quoting starts afresh inside $(...), so backslash-escaping those
# quotes would turn them into literal quote characters and split the command
# into several words, which makes sh fail before curl ever runs.
web_proxy_http_code() {
  local code
  code=$(docker exec argus-web-proxy sh -lc "curl -s -o /dev/null -w '%{http_code}' '$1'" 2>/dev/null) || true
  printf '%s' "${code:-000}"
}

# Domain resolution & reachability from inside web-proxy (bind-backed)
section DOMAIN
for d in master.argus.com ftp.argus.com kibana.argus.com es.log.argus.com prom.argus.com grafana.argus.com; do
  logd "getent $d (web-proxy):"
  # Best effort: a missing container or failed lookup is logged, not fatal.
  docker exec argus-web-proxy sh -lc "getent hosts $d || true" >> "$DETAILS" 2>&1 || true
done
logd "HTTP (web-proxy): master.readyz=$(web_proxy_http_code http://master.argus.com:3000/readyz)"
logd "HTTP (web-proxy): es.health=$(web_proxy_http_code http://es.log.argus.com:9200/_cluster/health)"
logd "HTTP (web-proxy): kibana.status=$(web_proxy_http_code http://kibana.argus.com:5601/api/status)"
# FTP share writability (container perspective) # FTP share writability (container perspective)
section FTP-SHARE section FTP-SHARE
docker exec argus-ftp sh -lc 'ls -ld /private/argus/ftp /private/argus/ftp/share; test -w /private/argus/ftp/share && echo "write:OK" || echo "write:FAIL"' >> "$DETAILS" 2>&1 || true docker exec argus-ftp sh -lc 'ls -ld /private/argus/ftp /private/argus/ftp/share; test -w /private/argus/ftp/share && echo "write:OK" || echo "write:FAIL"' >> "$DETAILS" 2>&1 || true
@ -110,8 +135,13 @@ sort -u -o "$ERRORS" "$ERRORS"
echo "Diagnostic details -> $DETAILS" echo "Diagnostic details -> $DETAILS"
echo "Detected errors -> $ERRORS" echo "Detected errors -> $ERRORS"
# maintain latest symlinks for convenience if [[ "$LOG_DIR" == "$ROOT/logs" ]]; then
# maintain latest symlinks when writing under package logs
ln -sfn "$(basename "$DETAILS")" "$ROOT/logs/diagnose_details.log" 2>/dev/null || cp "$DETAILS" "$ROOT/logs/diagnose_details.log" 2>/dev/null || true ln -sfn "$(basename "$DETAILS")" "$ROOT/logs/diagnose_details.log" 2>/dev/null || cp "$DETAILS" "$ROOT/logs/diagnose_details.log" 2>/dev/null || true
ln -sfn "$(basename "$ERRORS")" "$ROOT/logs/diagnose_error.log" 2>/dev/null || cp "$ERRORS" "$ROOT/logs/diagnose_error.log" 2>/dev/null || true ln -sfn "$(basename "$ERRORS")" "$ROOT/logs/diagnose_error.log" 2>/dev/null || cp "$ERRORS" "$ROOT/logs/diagnose_error.log" 2>/dev/null || true
else
echo "Diagnostic details -> $DETAILS"
echo "Detected errors -> $ERRORS"
fi
exit 0 exit 0

View File

@ -30,17 +30,7 @@ prepare_env() {
if [[ -f "$ENV_FILE" ]]; then log ".env exists, keep as-is"; return; fi if [[ -f "$ENV_FILE" ]]; then log ".env exists, keep as-is"; return; fi
[[ -f "$ENV_TEMPLATE" ]] || { err "missing $ENV_TEMPLATE"; exit 1; } [[ -f "$ENV_TEMPLATE" ]] || { err "missing $ENV_TEMPLATE"; exit 1; }
cp "$ENV_TEMPLATE" "$ENV_FILE" cp "$ENV_TEMPLATE" "$ENV_FILE"
# auto-assign ports if busy # overlay 模式下,避免为不同服务分配到同一新端口;保持模板端口不做自动改写
for key in MASTER_PORT ES_HTTP_PORT KIBANA_PORT NODE_A_PORT NODE_B_PORT PROMETHEUS_PORT GRAFANA_PORT ALERTMANAGER_PORT \
WEB_PROXY_PORT_8080 WEB_PROXY_PORT_8081 WEB_PROXY_PORT_8082 WEB_PROXY_PORT_8083 WEB_PROXY_PORT_8084 WEB_PROXY_PORT_8085 \
FTP_PORT FTP_DATA_PORT; do
val=$(grep -E "^${key}=" "$ENV_FILE" | tail -1 | cut -d= -f2)
new=$(find_free_port "$val") || true
if [[ -n "${new:-}" && "$new" != "$val" ]]; then
sed -i "s/^${key}=.*/${key}=${new}/" "$ENV_FILE"
log "port ${key} busy -> ${new}"
fi
done
} }
prepare_data_dirs() { prepare_data_dirs() {
@ -63,6 +53,43 @@ prepare_data_dirs() {
"$PKG_ROOT/private/argus/metric/grafana/data/dashboards" \ "$PKG_ROOT/private/argus/metric/grafana/data/dashboards" \
"$PKG_ROOT/private/argus/alert/alertmanager" \ "$PKG_ROOT/private/argus/alert/alertmanager" \
"$PKG_ROOT/private/argus/metric/ftp/share" "$PKG_ROOT/private/argus/metric/ftp/share"
# non-root: relax permissions to avoid container UID mismatch blocking writes
chmod -R a+rwx "$PKG_ROOT/private/argus" 2>/dev/null || true
fi
}
#######################################
# Verify that this host is part of an active Docker Swarm and make sure the
# attachable overlay network used by the compose stack exists.
# Globals:   OVERLAY_NET_NAME (read; defaults to argus-sys-net)
# Arguments: none
# Outputs:   progress via log, problems via err
# Returns:   0 when the network exists or was created; exits 1 when Swarm is
#            not active or the network cannot be created
#######################################
ensure_swarm_and_overlay() {
  local net_name="${OVERLAY_NET_NAME:-argus-sys-net}"
  local state

  # Require swarm active; an unreachable daemon yields an empty state and is
  # treated the same as "not active".
  state=$(docker info --format '{{.Swarm.LocalNodeState}}' 2>/dev/null || echo "")
  if [[ "$state" != "active" ]]; then
    err "Docker Swarm is not active. On this host run:"
    err " docker swarm init --advertise-addr <this_host_ip>"
    exit 1
  fi

  # Nothing to do when the network is already there.
  if docker network inspect "$net_name" >/dev/null 2>&1; then
    return 0
  fi

  # Create attachable overlay if missing; report a failed creation explicitly
  # instead of letting the install continue without the network.
  log "creating attachable overlay network: $net_name"
  if ! docker network create --driver overlay --attachable "$net_name" >/dev/null; then
    err "failed to create overlay network: $net_name"
    exit 1
  fi
}
#######################################
# Seed private/argus/etc/dns.conf with this host's IP address when the file
# is missing or empty. An existing non-empty file is left untouched.
# The address is the source address the kernel reports for reaching 1.1.1.1,
# falling back to the first address printed by "hostname -I".
# Globals:   PKG_ROOT (read)
# Arguments: none
# Outputs:   writes dns.conf; progress via log, problems via err
#######################################
bootstrap_dns_conf() {
  local etc_dir="$PKG_ROOT/private/argus/etc"
  local dns_file="$etc_dir/dns.conf"
  local host_ip=""

  mkdir -p "$etc_dir"
  if [[ -s "$dns_file" ]]; then
    return 0
  fi

  # detect host primary IP: take the word that follows "src" rather than a
  # fixed field number. The position differs between routed output
  # ("... via <gw> dev <if> src <ip> ...") and on-link output
  # ("... dev <if> src <ip> uid <n>"), where a fixed $7 would be the uid.
  host_ip=$(ip route get 1.1.1.1 2>/dev/null \
    | awk '{ for (i = 1; i < NF; i++) if ($i == "src") { print $(i + 1); exit } }') || true
  if [[ -z "$host_ip" ]]; then
    host_ip=$(hostname -I 2>/dev/null | awk '{print $1}') || true
  fi

  if [[ -n "$host_ip" ]]; then
    echo "$host_ip" > "$dns_file"
    log "wrote initial dns.conf with host IP: $host_ip"
  else
    # Best effort: installation continues; the operator has to fill the file in.
    err "failed to determine host IP for dns.conf; please edit $dns_file manually"
  fi
}
@ -75,6 +102,8 @@ load_images() {
bring_up() { bring_up() {
log "starting services via compose" log "starting services via compose"
ensure_swarm_and_overlay
bootstrap_dns_conf
local ov="$PKG_ROOT/compose/docker-compose.os-compat.override.yml" local ov="$PKG_ROOT/compose/docker-compose.os-compat.override.yml"
if [[ ! -f "$ov" ]]; then if [[ ! -f "$ov" ]]; then
cat > "$ov" <<'YAML' cat > "$ov" <<'YAML'
@ -182,6 +211,7 @@ selfcheck() {
} }
main() { main() {
mkdir -p "$PKG_ROOT/logs"
prepare_env prepare_env
prepare_data_dirs prepare_data_dirs
load_images load_images

View File

@ -13,23 +13,31 @@ wait_http() { local url="$1"; local attempts=${2:-120}; local i=1; while ((i<=at
code_for() { curl -s -o /dev/null -w "%{http_code}" "$1" || echo 000; } code_for() { curl -s -o /dev/null -w "%{http_code}" "$1" || echo 000; }
header_val() { curl -s -D - -o /dev/null "$@" | awk -F': ' 'BEGIN{IGNORECASE=1}$1=="Access-Control-Allow-Origin"{gsub("\r","",$2);print $2}'; } header_val() { curl -s -D - -o /dev/null "$@" | awk -F': ' 'BEGIN{IGNORECASE=1}$1=="Access-Control-Allow-Origin"{gsub("\r","",$2);print $2}'; }
mkdir -p "$ROOT/logs" LOG_DIR="$ROOT/logs"
OUT_JSON="$ROOT/logs/selfcheck.json" mkdir -p "$LOG_DIR" || true
OUT_JSON="$LOG_DIR/selfcheck.json"
tmp=$(mktemp) tmp=$(mktemp)
ok=1 ok=1
log "checking Elasticsearch" log "checking overlay network"
if curl -fsS "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health?wait_for_status=yellow&timeout=1s" >/dev/null 2>&1; then es_ok=true; else es_ok=false; ok=0; fi net_ok=false
# Ask docker for the network driver directly instead of grepping the
# pretty-printed inspect JSON: one call, and independent of JSON layout.
# A missing network or unreachable daemon yields an empty driver string.
net_driver=$(docker network inspect --format '{{.Driver}}' "${OVERLAY_NET_NAME:-argus-sys-net}" 2>/dev/null || true)
if [[ "$net_driver" == "overlay" ]]; then net_ok=true; fi
[[ "$net_ok" == true ]] || ok=0
log "checking Elasticsearch (via domain inside web-proxy)"
# The URL is single-quoted for the shell inside the container. Unquoted, its
# '&' would put curl in the background and leave "timeout=1s" as a plain
# variable assignment whose exit status (0) made this check always pass.
if docker exec argus-web-proxy sh -lc "curl -fsS 'http://es.log.argus.com:9200/_cluster/health?wait_for_status=yellow&timeout=1s'" >/dev/null 2>&1; then es_ok=true; else es_ok=false; ok=0; fi
log "checking Kibana (via domain inside web-proxy)"
kb_code=$(docker exec argus-web-proxy sh -lc "curl -s -o /dev/null -w '%{http_code}' http://kibana.log.argus.com:5601/api/status" || echo 000)
kb_ok=false kb_ok=false
if [[ "$kb_code" == "200" ]]; then body=$(curl -sS "http://localhost:${KIBANA_PORT:-5601}/api/status"); echo "$body" | grep -q '"level":"available"' && kb_ok=true; fi if [[ "$kb_code" == "200" ]]; then body=$(curl -sS "http://localhost:${KIBANA_PORT:-5601}/api/status"); echo "$body" | grep -q '"level":"available"' && kb_ok=true; fi
[[ "$kb_ok" == true ]] || ok=0 [[ "$kb_ok" == true ]] || ok=0
log "checking Master" log "checking Master (via domain inside web-proxy)"
wait_http "http://localhost:${MASTER_PORT:-32300}/readyz" 60 || ok=0 if docker exec argus-web-proxy sh -lc "curl -fsS http://master.argus.com:3000/readyz" >/dev/null 2>&1; then true; else ok=0; fi
log "checking FTP" log "checking FTP"
if docker ps --format '{{.Names}}' | grep -q '^argus-ftp$'; then if docker ps --format '{{.Names}}' | grep -q '^argus-ftp$'; then
@ -71,11 +79,17 @@ cat > "$tmp" <<JSON
"grafana": $gf_ok, "grafana": $gf_ok,
"alertmanager": true, "alertmanager": true,
"web_proxy": $wp_ok, "web_proxy": $wp_ok,
"overlay_net": $net_ok,
"timestamp": "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "timestamp": "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
} }
JSON JSON
mv "$tmp" "$OUT_JSON" if ! mv "$tmp" "$OUT_JSON" 2>/dev/null; then
# fallback when logs dir not writable (no sudo allowed)
OUT_JSON="/tmp/selfcheck_$(date -u +%Y%m%d-%H%M%SZ).json"
cp "$tmp" "$OUT_JSON"
log "selfcheck.json written to $OUT_JSON (logs dir not writable)"
fi
if [[ "$ok" == 1 ]]; then if [[ "$ok" == 1 ]]; then
log "selfcheck OK" log "selfcheck OK"
exit 0 exit 0