diff --git a/deployment/build/build_server_package.sh b/deployment/build/build_server_package.sh
index 77a3249..1717f8a 100755
--- a/deployment/build/build_server_package.sh
+++ b/deployment/build/build_server_package.sh
@@ -46,7 +46,11 @@ make_dir "$STAGE/private/argus"
 # 2) Compose: derive from sys/tests by removing test-only services
 SRC_COMPOSE="$ROOT_DIR/src/sys/tests/docker-compose.yml"
 [[ -f "$SRC_COMPOSE" ]] || { err "missing $SRC_COMPOSE"; exit 1; }
-awk -f "$BUILD_DIR/templates/docker-compose.filter.awk" -v remove="node-a,node-b,test-node,test-gpu-node" "$SRC_COMPOSE" > "$STAGE/compose/docker-compose.yml"
+# 2.1 filter out test services
+tmp_compose1="$STAGE/compose/docker-compose.filtered.yml"
+awk -f "$BUILD_DIR/templates/docker-compose.filter.awk" -v remove="node-a,node-b,test-node,test-gpu-node" "$SRC_COMPOSE" > "$tmp_compose1"
+# 2.2 transform to external overlay network (remove sysnet and per-service blocks)
+awk -f "$BUILD_DIR/templates/docker-compose.overlay.awk" "$tmp_compose1" > "$STAGE/compose/docker-compose.yml"
 cp "$BUILD_DIR/templates/.env.example" "$STAGE/compose/.env.example"
 # fix relative private path to match package layout (compose/ and private/ are siblings)
 sed -i 's#\./private/#../private/#g' "$STAGE/compose/docker-compose.yml"
@@ -55,17 +59,41 @@ sed -i -E 's#- \./private:/private#- ../private:/private#g' "$STAGE/compose/dock
 # drop timezone file bind which may not exist on target distros (e.g. NixOS)
 sed -i '/\/etc\/timezone:\/etc\/timezone:ro/d' "$STAGE/compose/docker-compose.yml"
 
-# sanity-check: ensure test services are absent
+# sanity-check: ensure test services are absent and external network present
 if grep -E '^(\s{2})(node-a|node-b|test-node|test-gpu-node):\s*$' -n "$STAGE/compose/docker-compose.yml" >/dev/null; then
   err "compose filter failed: test services still present"; exit 1;
 fi
+if ! grep -q '^networks:' "$STAGE/compose/docker-compose.yml" || ! grep -q 'argus-sys-net:' "$STAGE/compose/docker-compose.yml"; then
+  err "compose overlay transform failed: external network missing"; exit 1;
+fi
 
 # 3) Images (reuse if already exported unless --resave-image)
 existing_images_tar="$PKG_DIR/images/all-images.tar.gz"
 if [[ "$RESAVE_IMAGE" == false && -f "$existing_images_tar" ]]; then
   log "Reusing existing images tar: $existing_images_tar"
   cp "$existing_images_tar" "$STAGE/images/"
-else
+elif [[ "$RESAVE_IMAGE" == false ]]; then
+  # Try cross-version reuse from latest server_*.tar.gz
+  latest_pkg=$(ls -1t "$ART_ROOT/server"/server_*.tar.gz 2>/dev/null | head -n1 || true)
+  if [[ -n "$latest_pkg" ]]; then
+    log "Reusing images from: $latest_pkg"
+    mkdir -p "$STAGE/images"
+    # extract matching file regardless of top-level dir
+    if tar -xzf "$latest_pkg" -C "$STAGE" --wildcards '*/images/all-images.tar.gz' 2>/dev/null; then
+      # locate and move; skip when the tar already sits at the destination (archive without a top-level dir), else the cleanup below would rm -rf "$STAGE" itself
+      found=$(find "$STAGE" -type f -path '*/images/all-images.tar.gz' | head -n1 || true)
+      if [[ -n "$found" && "$found" != "$STAGE/images/all-images.tar.gz" ]]; then
+        mv "$found" "$STAGE/images/all-images.tar.gz"
+        # cleanup leftover extracted dir
+        dir_to_clean=$(dirname "$found")
+        rm -rf -- "${dir_to_clean%/images}" 2>/dev/null || true
+      fi
+    fi
+  fi
+fi
+
+# If still not present, save from local docker daemon
+if [[ ! -f "$STAGE/images/all-images.tar.gz" ]]; then
   require_cmd docker gzip
   images=(
     argus-bind9:latest
diff --git a/deployment/build/templates/.env.example b/deployment/build/templates/.env.example
index 0af566c..557dda2 100644
--- a/deployment/build/templates/.env.example
+++ b/deployment/build/templates/.env.example
@@ -27,3 +27,6 @@ FTP_DOMAIN=ftp.metric.argus.com
 
 # GPU profile disabled by default
 ENABLE_GPU=false
+
+# External overlay network (Swarm attachable)
+OVERLAY_NET_NAME=argus-sys-net
diff --git a/deployment/build/templates/docker-compose.overlay.awk b/deployment/build/templates/docker-compose.overlay.awk
new file mode 100644
index 0000000..e719225
--- /dev/null
+++ b/deployment/build/templates/docker-compose.overlay.awk
@@ -0,0 +1,74 @@
+#!/usr/bin/awk -f
+# Transform docker-compose.yml to use an external overlay network for all services (POSIX awk only: no gawk 3-arg match() or \s)
+# - Remove top-level networks definition
+# - Remove per-service networks block (including ipv4_address and sysnet refs)
+# - Insert per-service networks: [argus-sys-net]
+# - Append external networks mapping at the end
+
+BEGIN{
+  in_top_networks=0; in_services=0; in_service=0; svc_indent=0; skipping_nets=0; networks_inserted=0;
+}
+
+function is_service_header(line){ return svc_name(line)!=""; }
+function svc_name(line,    s){ if (line !~ /^  [A-Za-z0-9_-]+:[ ]*$/) return ""; s=line; sub(/^  /,"",s); sub(/:[ ]*$/,"",s); return s; }
+
+function indent_len(s,    n){ n=match(s,/[^ ]/)-1; if(n<0) n=0; return n; }
+
+{
+  # Detect entry into top-level sections
+  if ($0 ~ /^[A-Za-z0-9_-]+:[ ]*$/) {
+    in_services = ($0 ~ /^services:[ ]*$/);
+    # If a new top-level section starts, stop skipping top networks
+    in_top_networks = 0;
+  }
+
+  # Handle removal of initial top-level 'networks:' block
+  if ($0 ~ /^networks:[ ]*$/) {
+    in_top_networks = 1; next;
+  }
+  if (in_top_networks) {
+    # skip until next top-level section (non-indented key)
+    next;
+  }
+
+  if (in_services) {
+    # Track service boundaries
+    if (is_service_header($0)) {
+      in_service=1; svc_indent=2; networks_inserted=0; print; next;
+    }
+    if (in_service) {
+      # If line is indented <= service indent, we've left this service
+      if (indent_len($0) <= svc_indent && $0 !~ /^[ \t]*$/) {
+        in_service=0;
+      }
+    }
+
+    if (in_service) {
+      # Skip any existing networks block under the service
+      if ($0 ~ /^    networks:[ ]*$/) { skipping_nets=1; next; }
+      if (skipping_nets) {
+        if (indent_len($0) <= 4) { skipping_nets=0; }
+        else next;
+      }
+
+      # After container_name or image, inject networks once
+      if (!networks_inserted && ($0 ~ /^    container_name:/ || $0 ~ /^    image:/)) {
+        print;
+        print "    networks:";
+        print "      - argus-sys-net";
+        networks_inserted=1; next;
+      }
+      # no host port injection; bind serves DNS inside overlay only
+    }
+  }
+
+  print;
+}
+
+END{
+  print "";
+  print "networks:";
+  print "  argus-sys-net:";
+  print "    external: true";
+  print "    name: ${OVERLAY_NET_NAME:-argus-sys-net}";
+}
diff --git a/deployment/build/templates/docs/SWARM_DEPLOY_zh.md b/deployment/build/templates/docs/SWARM_DEPLOY_zh.md
new file mode 100644
index 0000000..bad4ab6
--- /dev/null
+++ b/deployment/build/templates/docs/SWARM_DEPLOY_zh.md
@@ -0,0 +1,26 @@
+# Argus 多机部署(Docker Swarm + External Overlay)
+
+- 前提:Docker ≥ 20.10;Manager/Worker 节点开放 2377/tcp、7946/tcp+udp、4789/udp。
+- DNS:Bind9 为统一权威,解析 *.argus.com 至部署机固定对外主机 IP。
+
+## 在部署机(Manager)
+- 初始化 Swarm:`docker swarm init --advertise-addr <MANAGER_IP>`
+- 创建 overlay:`docker network create --driver overlay --attachable argus-sys-net`
+- 解压离线包后执行:
+  - `cd scripts && sudo ./server-prepare-dirs.sh`
+  - `./server-install.sh`(会验证 Swarm 状态、确保 overlay 存在、写入 dns.conf)
+  - `./server-selfcheck.sh`(失败会自动触发诊断)
+
+## 在节点机(Worker 或非 Docker 主机)
+- Swarm Worker:执行 Manager 的 `docker swarm join ...`;
+- 运行客户端容器:
+  - `docker run -d --name argus-metric-node-001 --network argus-sys-net -v /data/argus/agent:/private/argus/agent argus-sys-metric-test-node:latest sleep infinity`
+- 进入容器安装(先 IP 引导,后域名):
+  - `curl -u ftpuser:*** -fsSL ftp://<FTP_IP>:21/setup.sh -o /tmp/setup.sh && chmod +x /tmp/setup.sh`
+  - `AGENT_ENV=dev AGENT_USER=alice AGENT_INSTANCE=node001 MASTER_ENDPOINT=http://master.argus.com:3000 /tmp/setup.sh --server ftp.argus.com --user ftpuser --password '***' --port 21`
+
+## 关键点
+- 首次必须使用 FTP 的 IP 引导(下载 setup.sh 与 dns.conf)
+- MASTER_ENDPOINT 永远使用域名:`http://master.argus.com:3000`
+- docker compose 改为 external overlay;容器内不使用 Docker 服务名;web-proxy 与组件上游统一用域名
+
diff --git a/deployment/build/templates/scripts/server-diagnose.sh b/deployment/build/templates/scripts/server-diagnose.sh
index 5120d6e..27520e8 100755
--- a/deployment/build/templates/scripts/server-diagnose.sh
+++ b/deployment/build/templates/scripts/server-diagnose.sh
@@ -6,10 +6,16 @@ ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
 
 ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a
 
-mkdir -p "$ROOT/logs"
 ts="$(date -u +%Y%m%d-%H%M%SZ)"
-DETAILS="$ROOT/logs/diagnose_details_${ts}.log"
-ERRORS="$ROOT/logs/diagnose_error_${ts}.log"
+LOG_DIR="$ROOT/logs"
+mkdir -p "$LOG_DIR" || true
+# Fallback to /tmp when logs dir is not writable (probe by creating, then removing, a scratch file)
+if ! ( : > "$LOG_DIR/.w" && rm -f "$LOG_DIR/.w" ) 2>/dev/null; then
+  LOG_DIR="/tmp/argus-logs"
+  mkdir -p "$LOG_DIR" || true
+fi
+DETAILS="$LOG_DIR/diagnose_details_${ts}.log"
+ERRORS="$LOG_DIR/diagnose_error_${ts}.log"
 : > "$DETAILS"; : > "$ERRORS"
 
 logd() { echo "$(date '+%F %T') $*" >> "$DETAILS"; }
@@ -83,6 +89,25 @@ logd "Web-Proxy 8083: $(http_code "http://localhost:${WEB_PROXY_PORT_8083:-8083}
 logd "Web-Proxy 8084 CORS: ${cors8084}"
 logd "Web-Proxy 8085 CORS: ${cors8085}"
 
+# Overlay network diagnostics
+section OVERLAY-NET
+if docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" >/dev/null 2>&1; then
+  logd "overlay present: ${OVERLAY_NET_NAME:-argus-sys-net}"
+  docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" | sed -n '1,60p' >> "$DETAILS" 2>/dev/null || true
+else
+  append_err "[overlay][network] missing ${OVERLAY_NET_NAME:-argus-sys-net}"
+fi
+
+# Domain resolution & reachability from inside web-proxy (bind-backed); inner commands use plain quotes, since \" inside $(...) would be passed literally
+section DOMAIN
+for d in master.argus.com ftp.argus.com kibana.argus.com es.log.argus.com prom.argus.com grafana.argus.com; do
+  logd "getent $d (web-proxy):"
+  docker exec argus-web-proxy sh -lc "getent hosts $d || true" >> "$DETAILS" 2>&1 || true
+done
+logd "HTTP (web-proxy): master.readyz=$(docker exec argus-web-proxy sh -lc "curl -s -o /dev/null -w '%{http_code}' http://master.argus.com:3000/readyz" 2>/dev/null || echo 000)"
+logd "HTTP (web-proxy): es.health=$(docker exec argus-web-proxy sh -lc "curl -s -o /dev/null -w '%{http_code}' http://es.log.argus.com:9200/_cluster/health" 2>/dev/null || echo 000)"
+logd "HTTP (web-proxy): kibana.status=$(docker exec argus-web-proxy sh -lc "curl -s -o /dev/null -w '%{http_code}' http://kibana.argus.com:5601/api/status" 2>/dev/null || echo 000)"
+
 # FTP share writability (container perspective)
 section FTP-SHARE
 docker exec argus-ftp sh -lc 'ls -ld /private/argus/ftp /private/argus/ftp/share; test -w /private/argus/ftp/share && echo "write:OK" || echo "write:FAIL"' >> "$DETAILS" 2>&1 || true
@@ -110,8 +135,13 @@ sort -u -o "$ERRORS" "$ERRORS"
 echo "Diagnostic details -> $DETAILS"
 echo "Detected errors -> $ERRORS"
 
-# maintain latest symlinks for convenience
-ln -sfn "$(basename "$DETAILS")" "$ROOT/logs/diagnose_details.log" 2>/dev/null || cp "$DETAILS" "$ROOT/logs/diagnose_details.log" 2>/dev/null || true
-ln -sfn "$(basename "$ERRORS")" "$ROOT/logs/diagnose_error.log" 2>/dev/null || cp "$ERRORS" "$ROOT/logs/diagnose_error.log" 2>/dev/null || true
+if [[ "$LOG_DIR" == "$ROOT/logs" ]]; then
+  # maintain latest symlinks when writing under package logs
+  ln -sfn "$(basename "$DETAILS")" "$ROOT/logs/diagnose_details.log" 2>/dev/null || cp "$DETAILS" "$ROOT/logs/diagnose_details.log" 2>/dev/null || true
+  ln -sfn "$(basename "$ERRORS")" "$ROOT/logs/diagnose_error.log" 2>/dev/null || cp "$ERRORS" "$ROOT/logs/diagnose_error.log" 2>/dev/null || true
+else
+  # fallback dir in use: the paths were already printed above, so only explain why they are outside the package
+  echo "NOTE: $ROOT/logs is not writable; diagnostics were written under $LOG_DIR"
+fi
 
 exit 0
diff --git a/deployment/build/templates/scripts/server-install.sh b/deployment/build/templates/scripts/server-install.sh
index a9b9fef..0e60b17 100755
--- a/deployment/build/templates/scripts/server-install.sh
+++ b/deployment/build/templates/scripts/server-install.sh
@@ -30,17 +30,7 @@ prepare_env() {
   if [[ -f "$ENV_FILE" ]]; then log ".env exists, keep as-is"; return; fi
   [[ -f "$ENV_TEMPLATE" ]] || { err "missing $ENV_TEMPLATE"; exit 1; }
   cp "$ENV_TEMPLATE" "$ENV_FILE"
-  # auto-assign ports if busy
-  for key in MASTER_PORT ES_HTTP_PORT KIBANA_PORT NODE_A_PORT NODE_B_PORT PROMETHEUS_PORT GRAFANA_PORT ALERTMANAGER_PORT \
-    WEB_PROXY_PORT_8080 WEB_PROXY_PORT_8081 WEB_PROXY_PORT_8082 WEB_PROXY_PORT_8083 WEB_PROXY_PORT_8084 WEB_PROXY_PORT_8085 \
-    FTP_PORT FTP_DATA_PORT; do
-    val=$(grep -E "^${key}=" "$ENV_FILE" | tail -1 | cut -d= -f2)
-    new=$(find_free_port "$val") || true
-    if [[ -n "${new:-}" && "$new" != "$val" ]]; then
-      sed -i "s/^${key}=.*/${key}=${new}/" "$ENV_FILE"
-      log "port ${key} busy -> ${new}"
-    fi
-  done
+  # overlay mode: keep the template ports untouched (no automatic rewriting), which also avoids handing the same new port to different services
 }
 
 prepare_data_dirs() {
@@ -63,6 +53,43 @@ prepare_data_dirs() {
       "$PKG_ROOT/private/argus/metric/grafana/data/dashboards" \
       "$PKG_ROOT/private/argus/alert/alertmanager" \
       "$PKG_ROOT/private/argus/metric/ftp/share"
+    # non-root: relax permissions to avoid container UID mismatch blocking writes
+    chmod -R a+rwx "$PKG_ROOT/private/argus" 2>/dev/null || true
+  fi
+}
+
+ensure_swarm_and_overlay() {
+  local net_name="${OVERLAY_NET_NAME:-argus-sys-net}"
+  # Require swarm active
+  local state
+  state=$(docker info --format '{{.Swarm.LocalNodeState}}' 2>/dev/null || echo "")
+  if [[ "$state" != "active" ]]; then
+    err "Docker Swarm is not active. On this host run:"
+    err "  docker swarm init --advertise-addr <MANAGER_IP>"
+    exit 1
+  fi
+  # Create attachable overlay if missing
+  if ! docker network inspect "$net_name" >/dev/null 2>&1; then
+    log "creating attachable overlay network: $net_name"
+    docker network create --driver overlay --attachable "$net_name" >/dev/null
+  fi
+}
+
+bootstrap_dns_conf() {
+  local etc_dir="$PKG_ROOT/private/argus/etc"
+  mkdir -p "$etc_dir"
+  local dns_file="$etc_dir/dns.conf"
+  if [[ ! -s "$dns_file" ]]; then
+    # detect host primary IP: take the token after "src" (its column differs with/without a "via" hop); probes are best-effort
+    local host_ip
+    host_ip=$(ip route get 1.1.1.1 2>/dev/null | awk '{for(i=1;i<NF;i++) if($i=="src"){print $(i+1); exit}}') || true
+    [[ -n "$host_ip" ]] || host_ip=$(hostname -I 2>/dev/null | awk '{print $1}') || true
+    if [[ -n "$host_ip" ]]; then
+      echo "$host_ip" > "$dns_file"
+      log "wrote initial dns.conf with host IP: $host_ip"
+    else
+      err "failed to determine host IP for dns.conf; please edit $dns_file manually"
+    fi
   fi
 }
 
@@ -75,6 +102,8 @@ load_images() {
 
 bring_up() {
   log "starting services via compose"
+  ensure_swarm_and_overlay
+  bootstrap_dns_conf
   local ov="$PKG_ROOT/compose/docker-compose.os-compat.override.yml"
   if [[ ! -f "$ov" ]]; then
     cat > "$ov" <<'YAML'
@@ -182,6 +211,7 @@ selfcheck() {
 }
 
 main() {
+  mkdir -p "$PKG_ROOT/logs"
   prepare_env
   prepare_data_dirs
   load_images
diff --git a/deployment/build/templates/scripts/server-selfcheck.sh b/deployment/build/templates/scripts/server-selfcheck.sh
index 2d82829..204ecdc 100755
--- a/deployment/build/templates/scripts/server-selfcheck.sh
+++ b/deployment/build/templates/scripts/server-selfcheck.sh
@@ -13,23 +13,31 @@ wait_http() { local url="$1"; local attempts=${2:-120}; local i=1; while ((i<=at
 code_for() { curl -s -o /dev/null -w "%{http_code}" "$1" || echo 000; }
 header_val() { curl -s -D - -o /dev/null "$@" | awk -F': ' 'BEGIN{IGNORECASE=1}$1=="Access-Control-Allow-Origin"{gsub("\r","",$2);print $2}'; }
 
-mkdir -p "$ROOT/logs"
-OUT_JSON="$ROOT/logs/selfcheck.json"
+LOG_DIR="$ROOT/logs"
+mkdir -p "$LOG_DIR" || true
+OUT_JSON="$LOG_DIR/selfcheck.json"
 tmp=$(mktemp)
 
 ok=1
 
-log "checking Elasticsearch"
-if curl -fsS "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health?wait_for_status=yellow&timeout=1s" >/dev/null 2>&1; then es_ok=true; else es_ok=false; ok=0; fi
+log "checking overlay network"
+net_ok=false
+if docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" >/dev/null 2>&1; then
+  if [[ "$(docker network inspect --format '{{.Driver}}' "${OVERLAY_NET_NAME:-argus-sys-net}" 2>/dev/null)" == "overlay" ]]; then net_ok=true; fi
+fi
+[[ "$net_ok" == true ]] || ok=0
 
-log "checking Kibana"
-kb_code=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:${KIBANA_PORT:-5601}/api/status" || echo 000)
+log "checking Elasticsearch (via domain inside web-proxy)"
+if docker exec argus-web-proxy sh -lc "curl -fsS 'http://es.log.argus.com:9200/_cluster/health?wait_for_status=yellow&timeout=1s'" >/dev/null 2>&1; then es_ok=true; else es_ok=false; ok=0; fi  # URL is single-quoted: a bare & would background curl in the inner sh and always report success
+
+log "checking Kibana (via domain inside web-proxy)"
+kb_code=$(docker exec argus-web-proxy sh -lc "curl -s -o /dev/null -w '%{http_code}' http://kibana.log.argus.com:5601/api/status" || echo 000)
 kb_ok=false
 if [[ "$kb_code" == "200" ]]; then body=$(curl -sS "http://localhost:${KIBANA_PORT:-5601}/api/status"); echo "$body" | grep -q '"level":"available"' && kb_ok=true; fi
 [[ "$kb_ok" == true ]] || ok=0
 
-log "checking Master"
-wait_http "http://localhost:${MASTER_PORT:-32300}/readyz" 60 || ok=0
+log "checking Master (via domain inside web-proxy)"
+if docker exec argus-web-proxy sh -lc "curl -fsS http://master.argus.com:3000/readyz" >/dev/null 2>&1; then true; else ok=0; fi
 
 log "checking FTP"
 if docker ps --format '{{.Names}}' | grep -q '^argus-ftp$'; then
@@ -71,11 +79,17 @@ cat > "$tmp" </dev/null; then
+  # fallback when logs dir not writable (no sudo allowed)
+  OUT_JSON="/tmp/selfcheck_$(date -u +%Y%m%d-%H%M%SZ).json"
+  cp "$tmp" "$OUT_JSON"
+  log "selfcheck.json written to $OUT_JSON (logs dir not writable)"
+fi
 if [[ "$ok" == 1 ]]; then
   log "selfcheck OK"
   exit 0