From d1fad4a05a55cb92ff1d2d61553c25ad7a54ffad Mon Sep 17 00:00:00 2001 From: yuyr Date: Thu, 6 Nov 2025 16:43:14 +0800 Subject: [PATCH] =?UTF-8?q?[#37]=20=E5=A2=9E=E5=8A=A0sys/swarm=5Ftests(cpu?= =?UTF-8?q?)=20=EF=BC=9B=E5=8D=95=E7=8B=AC=E6=9E=84=E5=BB=BA=E7=9A=84node?= =?UTF-8?q?=20bundle=E9=95=9C=E5=83=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- deployment/build/build_images.sh | 98 ++ .../scripts/fix-prom-targets-overlay.sh | 103 ++ .../templates/scripts/server-diagnose.sh | 28 + src/agent/app/collector.py | 160 ++- src/agent/dist/argus-agent | Bin 7580232 -> 7583784 bytes src/master/app/config.py | 10 + src/master/app/scheduler.py | 117 +- src/master/app/storage.py | 28 +- .../all-in-one-full/config/VERSION | 2 +- .../plugins/argus-agent/bin/argus-agent | 4 +- .../scripts/install_artifact.sh | 28 +- .../scripts/package_artifact.sh | 13 +- .../scripts/publish_artifact.sh | 42 +- src/sys/build/node-bundle/Dockerfile | 16 + src/sys/build/node-bundle/bundle/.gitignore | 2 + src/sys/build/node-bundle/bundle/setup.sh | 1006 +++++++++++++++++ src/sys/build/node-bundle/node-bootstrap.sh | 99 ++ src/sys/swarm_tests/.env | 21 + src/sys/swarm_tests/.env.example | 21 + src/sys/swarm_tests/.env.nodes | 8 + src/sys/swarm_tests/README.md | 52 + src/sys/swarm_tests/docker-compose.nodes.yml | 34 + src/sys/swarm_tests/docker-compose.server.yml | 174 +++ src/sys/swarm_tests/scripts/00_bootstrap.sh | 101 ++ src/sys/swarm_tests/scripts/01_server_up.sh | 39 + src/sys/swarm_tests/scripts/02_wait_ready.sh | 84 ++ src/sys/swarm_tests/scripts/03_nodes_up.sh | 16 + .../swarm_tests/scripts/04_metric_verify.sh | 173 +++ src/sys/swarm_tests/scripts/99_down.sh | 21 + .../tmp/metric-verify.graf_health.json | 5 + .../tmp/metric-verify/graf_health.json | 5 + .../tmp/metric-verify/prom_targets.json | 1 + src/sys/swarm_tests/tmp/targets.json | 1 + .../proxy/start-proxy-supervised.sh | 14 + 34 files changed, 2494 insertions(+), 32 deletions(-) 
create mode 100755 deployment/build/build_images.sh create mode 100644 deployment/build/templates/scripts/fix-prom-targets-overlay.sh create mode 100644 src/sys/build/node-bundle/Dockerfile create mode 100644 src/sys/build/node-bundle/bundle/.gitignore create mode 100755 src/sys/build/node-bundle/bundle/setup.sh create mode 100644 src/sys/build/node-bundle/node-bootstrap.sh create mode 100644 src/sys/swarm_tests/.env create mode 100644 src/sys/swarm_tests/.env.example create mode 100644 src/sys/swarm_tests/.env.nodes create mode 100644 src/sys/swarm_tests/README.md create mode 100644 src/sys/swarm_tests/docker-compose.nodes.yml create mode 100644 src/sys/swarm_tests/docker-compose.server.yml create mode 100755 src/sys/swarm_tests/scripts/00_bootstrap.sh create mode 100755 src/sys/swarm_tests/scripts/01_server_up.sh create mode 100755 src/sys/swarm_tests/scripts/02_wait_ready.sh create mode 100755 src/sys/swarm_tests/scripts/03_nodes_up.sh create mode 100755 src/sys/swarm_tests/scripts/04_metric_verify.sh create mode 100755 src/sys/swarm_tests/scripts/99_down.sh create mode 100644 src/sys/swarm_tests/tmp/metric-verify.graf_health.json create mode 100644 src/sys/swarm_tests/tmp/metric-verify/graf_health.json create mode 100644 src/sys/swarm_tests/tmp/metric-verify/prom_targets.json create mode 100644 src/sys/swarm_tests/tmp/targets.json diff --git a/deployment/build/build_images.sh b/deployment/build/build_images.sh new file mode 100755 index 0000000..fbe35a5 --- /dev/null +++ b/deployment/build/build_images.sh @@ -0,0 +1,98 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +. 
"$ROOT_DIR/deployment/build/common.sh" + +usage() { +cat </dev/null + bash scripts/package_artifact.sh --force + CLIENT_VERSION=$(cat artifact/*/version.json 2>/dev/null | sed -n 's/.*"version"\s*:\s*"\([^"]\+\)".*/\1/p' | tail -n1) + popd >/dev/null + [[ -n "$CLIENT_VERSION" ]] || { err "failed to detect client version"; exit 1; } + else + if [[ "$CLIENT_VERSION" =~ ^[0-9]{8}$ ]]; then + PKG_DIR="$ROOT_DIR/deployment/artifact/client/$CLIENT_VERSION" + TAR_PKG="$PKG_DIR/argus-metric_${CLIENT_VERSION}.tar.gz" + [[ -f "$TAR_PKG" ]] || { err "client date package not found: $TAR_PKG"; exit 1; } + # 解包读取内部 version.json + tmpd=$(mktemp -d); trap 'rm -rf "$tmpd"' EXIT + tar -xzf "$TAR_PKG" -C "$tmpd" + if [[ -f "$tmpd/version.json" ]]; then + ART_VER=$(sed -n 's/.*"version"\s*:\s*"\([^"]\+\)".*/\1/p' "$tmpd/version.json" | head -n1) + [[ -n "$ART_VER" ]] || { err "failed to parse artifact version from date package"; exit 1; } + CLIENT_VERSION="$ART_VER" + # 直接使用该 tar 作为 bundle 源 + cp "$TAR_PKG" "$TMP_BUNDLE/argus-metric_$(echo "$ART_VER" | tr '.' '_').tar.gz" + # 同时尝试复制 setup.sh(若存在) + [[ -f "$PKG_DIR/setup.sh" ]] && cp "$PKG_DIR/setup.sh" "$TMP_BUNDLE/" || true + else + err "version.json missing in client date package" + exit 1 + fi + else + # 假定为 artifact 版本目录 + pushd "$PLUGIN_DIR" >/dev/null + [[ -d "artifact/$CLIENT_VERSION" ]] || bash scripts/package_artifact.sh --force + popd >/dev/null + fi + fi + + # 若未通过日期包预置 tar,则从插件 artifact 目录取 + TAR_NAME="argus-metric_$(echo "$CLIENT_VERSION" | tr '.' '_').tar.gz" + if [[ ! 
-f "$TMP_BUNDLE/$TAR_NAME" ]]; then + SRC_TAR="$PLUGIN_DIR/artifact/$CLIENT_VERSION/$TAR_NAME" + [[ -f "$SRC_TAR" ]] || { err "missing client tar: $SRC_TAR"; exit 1; } + cp "$SRC_TAR" "$TMP_BUNDLE/" + # also include setup.sh for fallback + if [[ -f "$ROOT_DIR/deployment/artifact/client/$(today_version)/setup.sh" ]]; then + cp "$ROOT_DIR/deployment/artifact/client/$(today_version)/setup.sh" "$TMP_BUNDLE/" || true + fi + fi + + log "Building node-bundle image with client version: $CLIENT_VERSION" + DOCKER_BUILDKIT=0 docker build \ + --build-arg CLIENT_VER="$CLIENT_VERSION" \ + --build-arg BASE_IMAGE="$BASE_IMAGE" \ + -t argus-sys-metric-test-node-bundle:latest \ + -f "$BUNDLE_DIR/Dockerfile" "$BUNDLE_DIR" + log "Built image: argus-sys-metric-test-node-bundle:latest" +fi + +log "Done." diff --git a/deployment/build/templates/scripts/fix-prom-targets-overlay.sh b/deployment/build/templates/scripts/fix-prom-targets-overlay.sh new file mode 100644 index 0000000..6dde5a8 --- /dev/null +++ b/deployment/build/templates/scripts/fix-prom-targets-overlay.sh @@ -0,0 +1,103 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Quick fix tool: replace 172.22/16 targets in nodes.json with overlay IPs resolved from hostname. +# Usage: run on server package host: scripts/fix-prom-targets-overlay.sh + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +NODES_JSON="$ROOT/private/argus/metric/prometheus/nodes.json" + +require_cmd() { command -v "$1" >/dev/null 2>&1 || { echo "[ERROR] missing command: $1" >&2; exit 1; }; } + +backup() { + local src="$1"; local ts; ts=$(date -u +%Y%m%d-%H%M%SZ) + cp "$src" "${src%.json}_bak_${ts}.json" +} + +prefer_overlay_ip() { + local host="$1" + # prefer 10.0/8 then 172.31/16 + getent hosts "$host" | awk '{print $1}' | while read -r ip; do + if [[ "$ip" =~ ^10\. ]]; then echo "$ip"; return; fi + done + getent hosts "$host" | awk '{print $1}' | while read -r ip; do + if [[ "$ip" =~ ^172\.31\. 
]]; then echo "$ip"; return; fi + done + # fallback: first A record + getent hosts "$host" | awk '{print $1; exit}' +} + +require_cmd awk +require_cmd sed + +if [[ ! -f "$NODES_JSON" ]]; then + echo "[WARN] nodes.json not found: $NODES_JSON" >&2 + exit 0 +fi + +backup "$NODES_JSON" + +tmp=$(mktemp) +trap 'rm -f "$tmp"' EXIT + +changed=0 +python3 - "$NODES_JSON" <<'PY' > "$tmp" || { +import ipaddress, json, sys, socket +path=sys.argv[1] +data=json.load(open(path)) if path else [] +def resolve(host): + try: + infos=socket.getaddrinfo(host,None,family=socket.AF_INET) + ips=[i[4][0] for i in infos] + # prefer 10. over 172.31. + for ip in ips: + if ip.startswith('10.'): return ip + for ip in ips: + if ip.startswith('172.31.'): return ip + return ips[0] if ips else None + except OSError: + return None +gw=ipaddress.ip_network('172.22.0.0/16') +out=[] +changed=False +for item in data: + ip=item.get('ip') + host=item.get('hostname') or '' + try: + bad = ip and ipaddress.ip_address(ip) in gw + except Exception: + bad = False + if bad and host: + new=resolve(host) + if new: + item=dict(item) + item['ip']=new + changed=True + out.append(item) +json.dump(out, sys.stdout, ensure_ascii=False) +sys.stderr.write('CHANGED' if changed else 'UNCHANGED') +PY + +status=$? +marker=$(tail -n1 /dev/stderr 2>/dev/null || true) +if [[ "$status" -ne 0 ]]; then + echo "[ERROR] failed to rewrite nodes.json" >&2 + exit 1 +fi + +if grep -q '"ip"\s*:\s*"172\.22\.' 
"$tmp"; then + echo "[WARN] some gwbridge targets remain; manual fix may be required" >&2 +fi + +mv "$tmp" "$NODES_JSON" +echo "[OK] nodes.json updated" + +# try to reload Prometheus +if docker ps --format '{{.Names}}' | grep -q '^argus-prometheus$'; then + docker exec argus-prometheus sh -lc 'pidof prometheus >/dev/null 2>&1 && kill -HUP $(pidof prometheus) || supervisorctl restart prometheus' >/dev/null 2>&1 || true + echo "[INFO] Prometheus reloaded" +fi + +exit 0 + diff --git a/deployment/build/templates/scripts/server-diagnose.sh b/deployment/build/templates/scripts/server-diagnose.sh index 3c0de7f..4f3d65b 100755 --- a/deployment/build/templates/scripts/server-diagnose.sh +++ b/deployment/build/templates/scripts/server-diagnose.sh @@ -155,6 +155,34 @@ gfcode=$(http_code "http://localhost:${GRAFANA_PORT:-3000}/api/health"); [[ "$gf # Deduplicate errors sort -u -o "$ERRORS" "$ERRORS" +# --- Prometheus targets & nodes.json checks --- +section PROMETHEUS-TARGETS +nodes_json_path="$ROOT/private/argus/metric/prometheus/nodes.json" +if [[ -f "$nodes_json_path" ]]; then + logd "nodes.json present: $nodes_json_path" + # detect gwbridge addresses (172.22/16) + if grep -E '"ip"\s*:\s*"172\.22\.' "$nodes_json_path" >/dev/null 2>&1; then + append_err "[prometheus][targets] contains gwbridge (172.22/16) addresses; prefer overlay (10.0/8)." + echo "HINT: nodes.json has 172.22.x.x targets. Ensure agent publishes overlay_ip and master prefers it." 
>&2 + fi +else + logd "nodes.json missing at $nodes_json_path" +fi + +# Query Prometheus activeTargets and list down items when possible +pt_json=$(curl -s --max-time 3 "http://localhost:${PROMETHEUS_PORT:-9090}/api/v1/targets" || true) +if command -v jq >/dev/null 2>&1; then + downs=$(printf '%s' "$pt_json" | jq -r '.data.activeTargets[] | select(.health=="down") | "[prometheus][activeTargets] down url=\(.scrapeUrl) lastError=\(.lastError)"' 2>/dev/null || true) + if [[ -n "$downs" ]]; then + printf '%s\n' "$downs" >> "$ERRORS" + fi +else + # best-effort grep when jq is unavailable + if printf '%s' "$pt_json" | grep -q '"health":"down"'; then + append_err "[prometheus][activeTargets] some targets are down (enable jq for detailed reasons)" + fi +fi + echo "Diagnostic details -> $DETAILS" echo "Detected errors -> $ERRORS" diff --git a/src/agent/app/collector.py b/src/agent/app/collector.py index 6c913df..28c0a83 100644 --- a/src/agent/app/collector.py +++ b/src/agent/app/collector.py @@ -4,6 +4,7 @@ import os import re import socket import subprocess +import ipaddress from pathlib import Path from typing import Any, Dict @@ -16,11 +17,47 @@ _HOSTNAME_PATTERN = re.compile(r"^([^-]+)-([^-]+)-([^-]+)-.*$") def collect_metadata(config: AgentConfig) -> Dict[str, Any]: - """汇总节点注册需要的静态信息。""" + """汇总节点注册需要的静态信息,带有更智能的 IP 选择。 + + 规则(从高到低): + 1) AGENT_PUBLISH_IP 指定; + 2) Hostname A 记录(若命中优先网段); + 3) 网卡扫描:排除 AGENT_EXCLUDE_IFACES,优先 AGENT_PREFER_NET_CIDRS; + 4) 默认路由回退(UDP socket 技巧)。 + + 额外发布:overlay_ip / gwbridge_ip / interfaces,便于 Master 与诊断使用。 + """ hostname = config.hostname - meta = { + + prefer_cidrs = _read_cidrs_env( + os.environ.get("AGENT_PREFER_NET_CIDRS", "10.0.0.0/8,172.31.0.0/16") + ) + exclude_ifaces = _read_csv_env( + os.environ.get("AGENT_EXCLUDE_IFACES", "docker_gwbridge,lo") + ) + + # interface inventory + interfaces = _list_global_ipv4_addrs() + if exclude_ifaces: + interfaces = [it for it in interfaces if it[0] not in set(exclude_ifaces)] + + # resolve 
hostname candidates + host_ips = _resolve_hostname_ips(hostname) + + selected_ip, overlay_ip, gwbridge_ip = _select_publish_ips( + interfaces=interfaces, + host_ips=host_ips, + prefer_cidrs=prefer_cidrs, + ) + + meta: Dict[str, Any] = { "hostname": hostname, - "ip": _detect_ip_address(), + "ip": os.environ.get("AGENT_PUBLISH_IP", selected_ip), # keep required field + "overlay_ip": overlay_ip or selected_ip, + "gwbridge_ip": gwbridge_ip, + "interfaces": [ + {"iface": name, "ip": ip} for name, ip in interfaces + ], "env": config.environment, "user": config.user, "instance": config.instance, @@ -96,7 +133,7 @@ def _detect_gpu_count() -> int: def _detect_ip_address() -> str: - """尝试通过 UDP socket 获得容器出口 IP,失败则回退解析主机名。""" + """保留旧接口,作为最终回退:默认路由源地址 → 主机名解析 → 127.0.0.1。""" try: with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock: sock.connect(("8.8.8.8", 80)) @@ -108,3 +145,118 @@ def _detect_ip_address() -> str: except OSError: LOGGER.warning("Unable to resolve hostname to IP; defaulting to 127.0.0.1") return "127.0.0.1" + + +def _read_csv_env(raw: str | None) -> list[str]: + if not raw: + return [] + return [x.strip() for x in raw.split(",") if x.strip()] + + +def _read_cidrs_env(raw: str | None) -> list[ipaddress.IPv4Network]: + cidrs: list[ipaddress.IPv4Network] = [] + for item in _read_csv_env(raw): + try: + net = ipaddress.ip_network(item, strict=False) + if isinstance(net, (ipaddress.IPv4Network,)): + cidrs.append(net) + except ValueError: + LOGGER.warning("Ignoring invalid CIDR in AGENT_PREFER_NET_CIDRS", extra={"cidr": item}) + return cidrs + + +def _list_global_ipv4_addrs() -> list[tuple[str, str]]: + """列出 (iface, ip) 形式的全局 IPv4 地址。 + 依赖 iproute2:ip -4 -o addr show scope global + """ + results: list[tuple[str, str]] = [] + try: + proc = subprocess.run( + ["sh", "-lc", "ip -4 -o addr show scope global | awk '{print $2, $4}'"], + check=False, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + timeout=3, + ) + if proc.returncode == 0: + 
for line in proc.stdout.splitlines(): + line = line.strip() + if not line: + continue + parts = line.split() + if len(parts) != 2: + continue + iface, cidr = parts + ip = cidr.split("/")[0] + try: + ipaddress.IPv4Address(ip) + except ValueError: + continue + results.append((iface, ip)) + except Exception as exc: # pragma: no cover - defensive + LOGGER.debug("Failed to list interfaces", extra={"error": str(exc)}) + return results + + +def _resolve_hostname_ips(name: str) -> list[str]: + ips: list[str] = [] + try: + infos = socket.getaddrinfo(name, None, family=socket.AF_INET) + for info in infos: + ip = info[4][0] + if ip not in ips: + ips.append(ip) + except OSError: + pass + return ips + + +def _pick_by_cidrs(candidates: list[str], prefer_cidrs: list[ipaddress.IPv4Network]) -> str | None: + for net in prefer_cidrs: + for ip in candidates: + try: + if ipaddress.ip_address(ip) in net: + return ip + except ValueError: + continue + return None + + +def _select_publish_ips( + *, + interfaces: list[tuple[str, str]], + host_ips: list[str], + prefer_cidrs: list[ipaddress.IPv4Network], +) -> tuple[str, str | None, str | None]: + """返回 (selected_ip, overlay_ip, gwbridge_ip)。 + + - overlay_ip:优先命中 prefer_cidrs(10.0/8 先于 172.31/16)。 + - gwbridge_ip:若存在 172.22/16 则记录。 + - selected_ip:优先 AGENT_PUBLISH_IP;否则 overlay_ip;否则 hostname A 记录中的 prefer;否则默认路由回退。 + """ + # detect gwbridge (172.22/16) + gwbridge_net = ipaddress.ip_network("172.22.0.0/16") + gwbridge_ip = None + for _, ip in interfaces: + try: + if ipaddress.ip_address(ip) in gwbridge_net: + gwbridge_ip = ip + break + except ValueError: + continue + + # overlay candidate from interfaces by prefer cidrs + iface_ips = [ip for _, ip in interfaces] + overlay_ip = _pick_by_cidrs(iface_ips, prefer_cidrs) + + # hostname A records filtered by prefer cidrs + host_pref = _pick_by_cidrs(host_ips, prefer_cidrs) + + env_ip = os.environ.get("AGENT_PUBLISH_IP") + if env_ip: + selected = env_ip + else: + selected = overlay_ip or host_pref 
or _detect_ip_address() + + return selected, overlay_ip, gwbridge_ip diff --git a/src/agent/dist/argus-agent b/src/agent/dist/argus-agent index 1a335c454e26c0e2fb7a0312c4c9881db2ca539f..9e71eb177d900f98545ff866f9efa9f831848a18 100755 GIT binary patch delta 11326 zcmX?ch59gRCXfSDPY}9JiVr?J)#7N?;%wF8YSrRy z)#7Q@;%(L9Yt`a!)e>mc5^U8HYSj{M)e>pd5^dEIYt<5O)skq{l5EwIYSof%)skt| zl5N$JYt@o()lz8HQf$>yYSmJ1)lzBIQf<{zYt>S3)zWCy(rnezYSq$i)zWFz(rwk! zYt_SvTW6|YSprC)v{^TvTfC}Yt^!E z)pBUna%|OdYSnUX)pBXoa&6UeYt?dZ)$(Z7@@&=e+N$NvHn0BDA`cPP2es~BW;dvA z)~RZoWWjjq!45@O%lr1K6&Z(L{C&7vvNA78{HXcqu1_mAuJJTXD9xD6 z^PB%{r-SyRn^}1`cBHP`EPO6UY{S(B^HO?NiOjxS>e=e1ExY06{ra2nUWPliNjsgc z>dZ4VQ9M=HrSNW7cVd0o-FlOz%Fls9OEpWRLu$p&{nhxPBI#tE>$JLQ8@v6Y;4c$r z-wgccxi0SwTbv-{5eeQQGxi=;@t@UP3cEq2MX zuZ_3v#rNOE2A>~h6dZRgy_Yd{!*jP4+iKM_8Qu187fs+=l)&Y`Yvogiy&2mxAF4Rl z$h_d)!**g#X`ac)xE!`Em$bW9aYi1)*_xRRlvM0=N__N=BpZT{; z)`mYNYYQI+YE9gdl`HH&p!J(;bp+oW_*Abs&!wIlmdgo{qbMKH})OF`e^s6@YeUQ;!N*oZchuiz4~bFl)SqO_J!MLyf8hmxhrsYZ*%3|${lM8 zXS1E%n>HyxK+kybUXJ%Q^L>1mUT4~RW%IQw#*0s9olk$$^K!?g>&J9bKRt{)pp(q? 
zAdcg^(+=^2dpYXg9re)-=K0R*BOA>2U1-_7l?~OJmk)#-+}jg*PN${X^74r*2lh5a z9@J^9R$cr;@r|$Xi`2c+ps_`S@ESFHEuZ9jdovi@eu&&%H5Y+|>U z&)lo`J8u6Lw*c#p0c!-?#ILOBmQJ(v6s~$KCx86fC4VQu#fR)p5@v zYxf_W>6d$zRoqhpEK`^L$=PM6v-0d8LzTrlIF$FT_qci@JjLapd=3*6HWIe4QLw999(Osr@wkm#F*-{X;eP z{m=hY50DD%UEyw0_v!zaP^tZ1|3d4&3cL*eY1;G6!mxbV1}`hmNPW%aFHcQ94kX z@9lWo*>`TAZfvf!b@hYnBZ)ziZ(hB5b?55!yNk>3AJdiHT@}=9cKhYb^T*%S{5UuF zyxp$aZ;yXS@1MoVQlDVtU-D)_ZQOH5ad-duJV)%S`PC+=9XxY8|JeOX+qzdJe?Fa0 zkhdMmWBh{sOjL;1!E!o?X z4Fp?7gVj#I*u30E-L}T{@X}2kFDI^eIWa=;thi~*_G4Y*sv;}rN=ARxxVQTL;&%BD zZLQ&Au4lQEr!Cst7UB5v=JDnAU1IKCZ!RkR4vBPLvc@p#+KhFcQ)ZsJ%E{!gKsnG; zLbq{k#rp2q{Y=u@kq3DBcg8N-IhCpUd4c)1o&ThZ&DArxygy#5|CjTcvt?!fv|Hkd zTH*F7tv@>%KQVjdMK`#veZ%-WO)lC+wf^>=idXDME}!`LEm&demhTfvX3E(X{;q%i z{_ega!7CO<+zs8YCNQk<{%9|2|MB$gaOTVVD}J7<-oLZ-=jWFnwLf0acU-ADeMzb9 zBHye>bLi8_gnKPh)>5r~A0^}wsCUT12U6d8JVeJdvU0ZrKGjA+B*v>X9)H+sk1M|cMGLsWd zWk}sETU5P8+HUWH7X1p19Y#Bf7EKJl_Q><}v(zUyL7(EyLW>#F zCM33t?)5uRh#9O%43ROR4HW7EXab2n2r z%6ZzjR=l#9oU^1gQaQ6zroKlihD#_Se)id428%u!6t+}#S@^u-IH&eh#t)%vcfPZ`47dVz^zN`6C`RB^(EyCTRRnKZT{xCkV-TV3DT>txX?Y{fhKX7c2 zcQohO7?;i*!OuMJZ=4FJw^WULE${DI-lX$v7dNbsFOl(F)wc6*sjkAGEqeWzPb;i+ zUHEJ3X8n^3k{EZ%&zQDy>38cpc@9gzTh3w6Jf~fsT;w@D`2g=DQK=0LlX;ZhYpvk3 zymMHuOuQ!1$lt8PVUy?Gvg;4>^q&{3PpzM?60~Rr1Mj7|ITIAqmu>4yi}@{kIFE0y zQ;qJdV?}{{Gb4@!H}OT6%Xm~fvaS->S&|;Gbt`w;-M3G7U2vGI^E&mW?TWagJHGLL z=l-RBG3My6`3tI+mY;Yk_+>#`#JTc{25W}JygLtmTyQ5~eUYNiZZCt5DJ+rv5+biH zbXM?N)Z6sjR@G<}&~r3x`StsP-LY*mGT(|3v)bQEjE4z8C*2J8-SV zX>nCo&rGjpqlX=^@c}hzI7Ilez zw)L-z?KHXAzxbB?=at%yihF*4f3JQe>S|h%)_JKc4)a*8XP>XRe7ffH`I-vH)O9~U zP2R{lmD6eF^&UqL$<3nkZx^IFSHvEwT&KBv(!zve%D{) z!s(U!yJv7OSbsXQ_PmFLw!l)e$w4!g6^6U))?9V>$Z|cGHD@~_?o0I?JhuBphQ-}m zX{zg@vu}w9T@Tl1`=RQq^dq3oo%`d{SKBW2MZIdf%E520(}Q&YwSxuXVjz^ebbik%9>&~d)maE*^;V@e>v0& z?s)%qOPI8xjogG;Lb~ihnN9OcrrqHA+&fz~zP0itB~hA2_NimLA>LAjYwBQJ&-?2_^Q@XCJFx_3d<6F`r3h z=I*v@+cZTl-s0Ng?)Yi%(i3Vt&y|YALhZGZI@CLh6MTE>r*)PYhn3!)tWquU?nB2R z(>LcicK)AmW$pX&$^$bKqbHjdFJhil;LF9$+psvS{DHVa#b>o0PG_FZ(w`O=`Rn(s 
z)mvXo6j{A=y2NV3=2g=7+OMoT{hZm=?2X_Q>uqmdr1UR1spxxBPIW@~<;llRX?T}e zXI<*v!eV{6wX?&XzvfZ>L&tBScly4nU+S{6R@Jzu!SvMXQHrFJwcA!pRI)RRs+ugs;PvPsGpm*|# z-ptKr#Ujd+gS0muw}{VBiaz!Gxuw)etG0lY7P%WgM1Gy!`tg68tp3JZ>2E8Nm!-+w zUHB)utVZnaB`Jrn+1Etty&o;AOh3EcD7$Xw-t{tZ%8TDA1{rV3erLJh*ER9{dv$OB zeA)TEep!62dEG1X7f&ACvfz0A^5cQG=bKk5cX3}T?tZDg@W+p(KV~!j($}BA@bbs$ z*6;tO{R&r%xxORG)Xpx0rO|$+#j?zm$M;Je0HrGldhjFGznTL zbnER`IT<;xb(M#jpV(&2;ys!cupsX=-%Mu))0a0@N?$&zU&1KDQ@49T=Id9tFWuc4 zec`*!zQXMIeR9`t%1c~cgJ3w6*_U&vsP&fxYgFM zIP=5ugY%U#qu7$~xe6|s{wCM=$Zy`9vi({y;Q~Fo2!HtC!d`6@0WS&k59d_u(`pu_|R4ltwhUC5ur|J%lXgS>t9hS=Y94(ynP~Sh^49~kW6clehV|&ZR=iQQ zWof+nrETpFaucKE9&uiDx~b{8?(xFU8T^ZjWev=DULVQMd8qn2{pkOQKZPfktA3Ek zvEKUf$ui++x9v*|H|(6Kl6~-xq+_tcM=KBI)?CdGRnJ%#&0c+~f96`pSf84y@7FFC z@AIp_an9cUpN)+h&+A12>*v%uO2(aTTo}1*_5Dqc1B$nITfa%=KNT+SS+i#sx5vZ} z4}3jtZFYVyd*t3>Wv&T7h1O2HSYOBLTsL9r4+E!@i^4_JJI(E8U#s}-S*b6Qd*-d< zkCi!@_P-*w`gor?#-S>Az{b$2EAW@-gV+_%s_YiW+^WCqXtsLS?(9h}ZB{4p^=@uh zdjE0i#51hcw;#$SWRy(f;r~3Ri`%P2u6^x$PuaLA$LVoWj{KW0D(mr1seQTFzB%PA z%hu$K6HgOwB)goFIjR)iSGm%T?T}Z!Ac@ z`u^3TFJAkyN{`$TGTnaQ-kE5T%_8%R=Na~WbchXk8@dk}XWZeLWAtqP zE4!O#r{_MNf8+4`(hs}8v+h=QOTY5sNwBy_;DIBJOexo&9-b1P9lU*Nz4p54`kB{P z>k{P*%+yrRBI`5GJTNjw#HMO#=l6yGW+7OG8IX%iWka(0cqf8VyB znl*3M$prok{T$IAc`~ENDtTewil>70WuA-rtY&xHiamI=h)1u$u+{Qo`pg;55r>{m z`lvU0D)UWw z^4Q}M&*ZRgyk8np_#X zbIN|#1-jBxc!J)P-rm~8R&bEz+s1t1&4n!AESR=({4r+oXy3G-@qzOtg{fi+UB|wO z%xL((yLpl4Dpk2Sp&L^IrmWaDdC}?IHxEC4Rj?~(lF2*$qRsHQud`_WiY{FZdGngh z3;sQ8`gNk|>BV~1)50!sak^}ZscUw;xwFyuxZm&dzdka1cx1b-Ss-Jzcj^8G_W3gA zZ6O*fY-_6C-rSn_dM58e-W5AH{o10qgsX7lp-IJi)2j|&dAVb#jc< zUdE-hi1k~i>%^vA8IH33>wgCy<^ITR#Bhu&mn|`St;42kK{e+}j@65Vwcgw~-%aQA zqG_o%`vNrfGDs@i_SE#bec-gd`2Ur^`p>=g&eM4qy-i77F4g-(v7U^?f`D+Ho1dBQ zmS1PS*}r}Lof}RyJGUh)j{G>av*lBG+TUYESHo4gLiCgackbbt8vR9f^-AfkR9Wp6 z(%K;bX@Muy%c~AvcR96arvIbg^_*#PJO43i+Oy8FW!|#b*kL9Axd5TeAn%QLCeL3NSi~87qWA(lF5;6tya|4%I6a^iv>$tg0{^=G29V`7L zd#{bWzd0WXh^kF=SZE_F>g3SIbm;KTz({pt>EH+P0@izc61*2~u-G6dd*X?2WLN!x 
zAP#eXkw1C=m&@I|B)PG~O?R$i=o*GEHcM9@QnUEXcYXRXq>`&itQ z#BN*umHd07>+-VA%O3?xNQjh^uf-MRqn9ZfsfnvY^!-x zwQt|ru3d4SOd?xdGB0hpa=nerJ{nP}3B}qS@zKB@H(wtyz^+9vy zrm60^-RdIsg10xV$i3OpzkO|kc37C;tx3OBH5M&+S$C(6#VO8{?T8MyQ&dFD9-$*! ztxPruvP5tAsdwp+dbhat{mbUrc8ar2XI@$JL}{m?+3P5;(~_TgHhRp8{66vX^29Tq z1q@;=Y0js2$jnyP%E|Ef?e(c+v)JUOqb$vdE}UxZr6mvNIM+)RD!#sS{EA3X@$(d~ zY>U`s)8ikXFRR+JsjSFvx1ZFDQ!>wOA{g>q-e2E5CEh;i!;{}zozwQ`Y4~SdNIiS? zf%%WEb*ksAgYIUxCcdjF6q8-5Kk>=MtCwazITDq*Ypw0v`O)f*jM^?XTg8jJGc${x zmxnj^S}dH_RyJjp_Oq*I^(O0Po&6@tK6wwbn{(!={j%Nt{t0hHwHapjv7hhGJTud8 zlH~H`2I_{tcCXGq;w-*!d(NSiyO?i!UNWekQlnbuB$c1{uP96Gq)lPh=FUT(S&YO4 zUOKd?&E3^CDQxRX0T!l}IiZ{K_k~^RA{#bK8~gO7PD}7vFaKk<1>@wu@=0Mm zjn&<298E&!%-4KTAZ4&4JGPvi&puMP4a+V5(s9$3Rz znbvnBYSFf72ew7j1tw&l>3W)=$fh3RSZ}n0>F4^^;AgCX#icuJlj3Ad_2)&LpB;W! zboCNvwt_I{RV`f`wk%+nzf`1hS*xUgu^7w5$R*k${RV4F9rhGxHgqgC%Ua2sGjmf| zYpL*S`Iaf z$e&W5r7XF5D$7>Kx7)XI-Mc2I$N1s-FL&jyZ!i8TTVZ;`Y41a$djY4lv)?Q6KV8e3 zEgJqMSL$l1DQ7{D=CWf-yEc00Z#r?bDQZr~cH7w7z7O7*#MHS3-P222e1frBwfAr3 zxm(hc>gvS|rJm#;{Cj1Q(Qlir)&=$(WqVCmik6?Bf4M$dLtHKD@kuSm``5SE&yY@w zt*u>Sv{jas_4T1;eJ_F&rmk}IRDai4n{=;xmbI11gO}+OHIFJioUJ&?VdvhzI$i9Y zmikgh3?3ZV;J;973Tw|~;iEFPb7pU{-1`1#hD_i51pmb3S)NuwIy#?3Zf*8_`^!<^ z_)Xc?O|2@M-X%P)t7j816Y@OP;kxI~uW1)pKj=@>e8T!c-!0+RDXYxi=_VkeOGTbEv-4Ysbihls@1$J_Uo?T**d?rXMI+$NKS`XPQ!Hd zE4@op9Zwu&4Dwyanl`zyhBb1!!V`yhk3)Uao1Y$CV(NIpaq~nLzG=<6UR^Q#ca)gj zWM3(=bFcC+kbk&t@n34&qm69mKCCkRHg zPY{f3pCB03K0z?LeS%<2`vk$*_6dS3Jo^5=_D;rgxMSt1x}JI(=P9u@qCm?dhjVilysS?lCfCstPbL zXfplcOe`qSD=tYaN!4U7Vq#!m$n<>2$dGBx4wmHx$(E%S6=&w>!Q@-MGBRX#LgYCT zi_!}ci;CfzdH*spWPV%=*36t(TmsYdZxs_m=Hp3VaUHg##Nt#l6PVmXTV{sL|9W6K zwxrDB)MU7}2RYLxlopFw2Nf|hWY$W8m2e~{=A`DOBo@K+pYLX7$lPWRmSj&(Eh@=O z%hY5p;s$x9_4M?IrN#10A!nvDl@+Tp?YuBuv#eN?>Ft&2A!WrHOwvE5*Oe9PF-;Vk zzOJlThiOmZ^gAHhJ99c$d9gOr&i3i%<;A*DL5o-zGPxPSc5o%<=ar;Zl;mV4!JH7g zn1vzpqc&KY8!BCvSX2y?_dCzR@Nx?SSe_+0zn~H(!E|BzzVc!*rZ-QgUn?(`vli!K zWyo~(1uNu8Ni0b%$;^cZZcI2UL#EI)uq1m*YI0_7Vh+rpytmW!D~iQr=DlNO$Xw|N 
zR>PW_o0thw0S*dxgXxJC#p+BM#?!khiglTUT&M4>C{|_;V`ra!zoJ-~iA85ROJ%V# zQ<~Rw&B|g0rs=`cy(^2=nLKi*msA!jGVN}gKE1M7p2@m<`VNpf>1osNRu(HUJ>4;# zv8q^}N$l=)rK)0e=GWXD(|xLn6`5*nr{`A{D==}nOrKa)EUz3fpMxROj0@~G9w14TE?;*GPXAE+)CH*Y!4$&eXo4$+^HSe$`m$K8LN44E0?U^$kI z+{9#>N4d^N@5Oih~8)oY67m`bdtyVn#eGF_WDJ*%czK6;BTH$x`BCPZUqN?sx? zAs-IoX2^UBNr0Rn3BBa}B6!MJ5y8!nIZFYojTbBnB2p4dK>ESC+NpB-6Ocuv@20cY z7OOBZeVwjbTdc@b{B?RjZLtg!>!j&WbBvWO}BX)fKBU9qgMPTUV^lbmh_XzPe%~-d(SF88TB@z)p~zZdhL|B9~U0 zmt2ycp92q8`S-jGnFraB)JzwyFBW8a@O8RceX%y%mS4OKnfs?})fY?iAqRQ!|LG0& z#VX9IY<$y~*B7fY=d<%oKVM(0&&8>A|p4wO}&!lsGdK-vN z`80ibW3jw>9=`xX<|cNq2KJo%^z_WUbXZg?hzl@e9-Ii4W67z?g~#E;71RGV7RyBV z9uZ*3e5?glz?zp@QUrIq!FvIQ%=wTs&6QVDT9ji{kXQoGGv^rv88Vrnw&s=QCZ!^Z zs7oAz(+iu5C75r37i7qENdeo!R*;#TodZv9Ns|Q`GR145QUyhsdGHdu zQegVsreawpd6(&bnu;Z4JtKq|GE*TrfvvDKzn}C6rp@Q3?{6+vVJf>k{b_TtJkz`H)7e^z6_|M#g{Nz_6e}?O zm6-0`Qmn*uP-=R9OR+MOvHbKYEyZd~lRT&IX(?7@(hHq_A0)34F`c=!ScU1_tm&E{ z`s>{3{;kCtOnc8yuWT(=V{*GQePL^{qSd}6gGE0(r(oF>kYSp%shI6yhA zyeP2%=AFIQ#Thb#mB1!*mShyACZ<4AC)jL(U(*BIizS$5MNiLeFP4_OkSxKFImZgD zn!O}HHwRJ-f>VFayy;8Yi?kJXHdUj{Jc1N+K zY#h5JLuUC~uo-NnMaV{6{W3kVqgY;9`>P~FW+)`-@<5d6p|m_2{!CxoQLMdiDi&Z84VeD3tJsWbS@3kj?qVIL{h8C_yNflL7S5dB*9t+$!_xWyFJBPOh>0q=j<(3Wm>afx5;v~8ca6Z zrZH2dN73%$ig%pS~g(^>n9wV9L!rW^GYt22F2pB~*;tjy$ZJ-wl? zScSFyw^ZhAS0YMZ`%VX;1EE`z*l5eQGezpz+}?XEdP(aHPU|1T_V iW!&DexHy50$^7p0S1XInIJGwvGl0Oa=_?l(%L4$?8{g&t delta 7473 zcmZ4S;T-dcE9aOeXfSy!Zq#bkVr?J)#7N?;%wF8YSrRy z)#7Q@;%(L9Yt`a!)e>mc5^U8HYSj{M)e>pd5^dEIYt<5O)skq{l5EwIYSof%)skt| zl5N$JYt@o()lz8HQf$>yYSmJ1)lzBIQf<{zYt>S3)zWCy(rnezYSq$i)zWFz(rwk! 
zYt_SvTW6|YSprC)v{^TvTfC}Yt^!E z)pBUna%|OdYSnUX)pBXoa&6UeYt?dZ)$(Z7@@&=e+N$NvHm`o&VmA@C2enJ~aPByp zBNmyUVUx`Kw6IyC>mXZKW!0~;``O23T{o_IQ(qfu^?&82&2QRMT{A->uW7vQG}@Bc z_rF_IWI_7%lQmhlmavMSocLs#(W=svJ3@Z9Q%*llJ=T)8_Fd=4`SmyAy{_%vCarY3 zsx$9exT2qD+oE^7yaoQBtKU`Fa_QA#mA>RBVLN{4%=s1g!PCg4ET%QAF^9j_<@gU} z^NowYOpb}XDf&z0#)3sXAL?iIJMQz+nQ8I3J$ZGf<^5IR`!0$`^<3jRR<*^UDlqjGA(bw@NBSLumZv$p+o72Q?8c0w;_ zo4o(LOW~~hV^@^=cX*pWoO;LLB7>RY64SlA_4u9_Zw%Udkn6kJd@HS3A)Uhk_s`h> zuFQO?^V%ZATh4>(0L{6g6fo+P^ja>0X|_mLR=J93qqQ8y8ejfGFPuky}GcDMo z-%QrT=PL8t1sQcI7U^ydua|tQnf@T+U19(GYxTiWMa9t{nj*3r6X#vedAs;*?lPUm z=y&T@q-36zGH;eWrRQjPDB0}%)C$WedG++;Jm%M*Zc+Unw=W{%caoX6xVEp+r^*xF zlO9*suWsBcIK@Kjy`SKfrK#-SpDL~tG?nV#vtrE&jjtd3Thp8}_p%+EEdOek+Z)}Z ze<$79zWc7`j2{^Wt<5oWrd27vZDGtTy}hE!VZ-~kybY6-9AA2dKef7WuFCw}oalPn zFK>imnRvXNB@}LIh4w#pJ7}P~bsERC88d?mxT2!Pla`=>F61?jaVNlddeN z2r0e#xq+j*alZ%sfMd1U9Npny;C`DLT}l;?;T|qUp}m@{owQU_R+dncGR_({1K9X7F-Z5?eo}V91 zZ*|`mk$w5}u2))-&jZ?od<^!i5w^If{~^$L-p398B3Tj<2L#>gAK5-k@2C|_`nl_q z>9Gr|JC_wE^$46?^1}4TeGi+)C2C9kmR(s{5o@yUYV?`7%||zdaeK!sJ#LyDae3C% ztOT7uFYh=#=sx;6U4Gu*nunWTf6n*Qi{;gctN#0Ns`<2g`}YV`{Jp*YU-84Qv;T9= znY%Rg*NWrnpUTYDv**OkeDXuM-stJ~!X_W<%J~`3W#`O!`r%jZp8K{uG2OF^t6MIc zspm`E=@!}jFzGxN8KrUT*V)Y*=WxUyefH(n8y)QeyZJdSjdPpY%MXVOdpUY(9A31u zFwmy;@Z;yMI~C4F&DAvXbt*8?%TR5NvFS5XefrF0H^27m#2FcVcVDmdpW!z5`hnyEvGHUi6441p7VsnK~sxhEq*USKx1) zxcKI67J;+pnB^7yUp`~bj7uhywWcR!y?WcpTYvd@M~IibQLz2A1k>abo;5yk%)xsn z8iX@&WC`>N8iyQJSBX{|l>et<^tG7R2evhrWFYjHfX?ok&0z0+`ZH2d8 zj|a9{KW(jle|}@nb-k`R2ejw2EpR$&@Y_0PaaYpseuajV^CI^5tp((axl z@laDErTc8Qc+DK$eQ#UL9^^IrH*%Z7Fw5Mw@P0_;`4G#?()A`++kCb*uG?U!-ehv? 
zlI5+MgEjYbM9XV}b!BTJZzaz*V1F6z^hI0fuh)$rqe35YZv)k;q)K=AO`}?@;i(_S%PJjb6;>+`}IvUj?_5ZZ@TKb?jD9o$9yJbRxLd;Ddo1rQn^-B*~?msZI@m$ zJDR3$z4WWW{6;xGUdH-sP3ymGmv@;wr=Br2?{V+c4DC;W>ZXQo6JiBt#Q)9TpVnpU zf6>dYZT-v(wzIjTHS$x{O=^BW{qjd&O)S68u>PUFM0=CgnM+SsF1^3^|NeNtbM-%# zAN~L9@1dpk`m9W!_y6M!elKd%S5+|6ukJ{5o%5R0+KU&@YbxnWeWvlIfLqcAQ zyJtn$GyKlzE3b&)iBfrHzuC$&DCFRI!5dq0c#V_FcREaw+-BWpxX9f$xukge-Iq)y zt!%9KgV+Mgr*2h$qi3&v<*>Bd=FsL7Edr){4(#N2=y_4)mf=#U;`YGd*P^dl51(AR zB{5BUSn%2TaW!%JHx+nkBokLqqvaEeb40;B5t2|%~|BLt6}}q4NF3` zrITl|tYGI1+Ax1gsry@-*N;-TL=K*e(tr2NN9gX5YW^K;Mhyioccz{pp(!@$TYccR|^;Z&jadDz0DtDqYH6 zciN`t=9_C5zPZM6NorP}EYJGY9ST~zJr#pPEcXAnw647T>=xF~ubqE>6?A!K)_Qv8 zeY^6fJ5N`ye>eBlEN#8Hd3Ddv95ps(e!tJ`>#aM#c#Y3qFSgIwX;WAFXO~3GeT}t; z%a3p@)(%!VSw5p?{WIIHj1Nzq>=!-b@}d5AYRdW<6JBzzSsAA^RjuTX?2>Ayl75l> z8#h&L+$1OAD;zz&{lVw`m+wvNe)lBoZQZ0jn`?zHte0fE+*>(U_r>eCvPUMRh&a%|D zu#2%O&wF`2-^li_z7hBP+Fd2@n96*vpVNz(75U@M-q!fPc@Ov3TiO>^zU+Q^)ZgyU zw`;k}7_FD-c(6n+-+S$OnVad>>fQW0iBlbV)|SdVJz)Q$M(Op(ZZFOXtzfQ<*RIQF zGnZYN^=91;hToh8Hs2EK#jhW`QERtf+PaZ#kIMX}{}(@rRI1k>dK}Ove?T>{Tvg_v zJjd=uc2g7HG%(zeeE%@}d;!C&ya|D+**|WZJ({~&TI+oCyN`+oCkDv~pGyfUniZBW zw(*6+E|!Z&QX+M7!eV0Y>nQHwSSq^c`7XADeOGQeU)hj6<+0*a!CQ+%`%~+Wlv>Ap zsM3qCj&*vK`eIAZ{d~c743l(h3;Z{3U$t-5zSzH&XQ!L~nAou~IA8VBysZt(Kl3NA z?+&gA`taH4>i*^Xy)8Pfw#hF2Aauy`Y=V>4OKsK~5ykYQA)JY9%U=XkTA%3nn|E;O z7VUS!)~@rXPSLdK@+n#_!xWZnv#@{X@%mL3FOTYfo$^Rdc}nDL>vYYTe=03@r|j}H zGJLq_(at~WdhdcfEVdiO1W25`_B^Gwy`iT6B#Wt|ZPdv?g{Z}6eJ8Ql{a5iQZ+XGj z`HmyHNUhIbhFx94DDRBII*WtPq8I=FPDyIZ>;MfqDPG{5?ARy)NrIox0M#%8i@nf2@=_ZZR?P z&(-*6|En_pO+NozKYo6_eR=i2L+oef{d2j;xPs4u0vsYdbo)LLgvetWU zk?z;SEL>dQU894JwSABd>X2aCZaXdf2)~=e^y9^6-uG_XcOtpsxG&T5&pZ4IJOv-T zeHniA|ACvozI@xowqu(cZ|AEuf`2-VcCUH=V%yu7udNi)dEZrktBg)R|F5rJJZ10U z!=D+JZ7Q51ym3L1(yIIBGtO`(1x)9CTk-4R<*t^(uYr~)53Wd8P2u|PZ~G+SQHn&O zNgS)*+A@YikN%ugexJYl*1zMq3w5oVZdyCvv}TOvxwc=l#n)2&+r0bMd+jndwfTgd zOliIGqpt4h-an@j{v8YWcTBQcqu!XI>tOw=FCLTL8~l{By*I&a%7@am+eBrJo<81C 
z+TgL2bIC37CBL0Z_KWN<>rYCr;Q|vcS?(tU%Pd&BkxLi4iks5>es6^oD13*oV=%=rRJaZ zgT;Igw3e4%<A_5lLlp=cI1#{c&((q1o&PMz0^7W_-N|IP)JbH~!(- zzuA6zvBk88S4S?#83ZsoH#U^l70FJpsd8hLnRRrMzk%EKdU3P%4ZNE_IBqTSkg(PX zo^)i@4%f3vM}vlhuS7k8NiQ z4Egn6(coZ0dx?Bldx?B_dx?BRdx?Bxdx?Bhdx?B>dx?BZdx?B(dx?Bpdx?B}dx<ne0qC{d`5eTd}e!zd{%pje0F<@d`^3bd~SP* zd|rEre13b0d_jAOd|`Wue9`t2`QkfCG86b17&MuFaU~WM=q2aprDdjTG8Zv1Ffe55 z?wX#{RIDP$cY={2vxXI{j2)ybCm*h+>+JM(O~ukob{D3fZYq{xN`5f?WmB;n)4b=? zIh%{6rPSXuGGu;S3)ah=SX=@#TVn2XyXIm^)8G2c44MD+!18QKnZ>Eea3#Oum>DvQ zAl5P`RT;sw=A=yTX)czQds)lOkh#qstd%`EwWuUBEmM=Zh#Tbe`Fp1yZ7!B)%G)>n zS#z-})8#|cd0UD#nb?m{w`eKWU^0C*J*lNwk7+f>^hqtnI!xDsryl^(snOHlwG?YJ zU9Oxi-CC^6lrv?zS8K5>Q~A{CnXScg#u*1$7+!8+0NcruoL^80bCk>>7KY4MqF@Q; zmANzEJj$%cX>G~WDnZ0aaxA219qL-UmT%4Gm z3UjTB+4RzmVtJ;biPNWn=oPc4Z|^8pX1cmz`kjtqIVQDZ)Bki7OEIxLoi5Q?EF+uq znS&v7u`bv|wzQIhoXjMcQT&S213QZ)nY?nQ=X4fJnP0EtWXSviNwDncsU;<;74YP8 zx|fq7^RhPB7?$*^%mSDp^L9?(+gU7S^8EuRLuQ6JL_tPwVlqrYJO>v;rmZkkqND_* z6CAp8q^EOt70WU8=}y<~Dpq9rFk!l1SFt?PIr-`NUB%)|cRi+ebQQ}m9rBvKtgBdo zDJFaR36Q$}2h(476{|4GJfAMuU99Nb`<$C0bJ85J1)Q0=1^GoK5Kn?_k#FH)$lSpT zmgR%X>cuA}!CiQL;`Gw)Vs)m{Y18L*7aK4$f8m*avAbBFX;Q}YPu<08OlEb{rFx20 znQk{sckC(F=Y4mDmm%{RJJ?M5=^J~BMW#>eDHdkGa)*~8GnHlf-JW8p=|_8t1(_@# zPQTGpY{2yE`E;({Vr{l_Z+ICp_fO~REtcj(4&L66(^GnjRhaF6@J{dUEmr02{K?CZ zISrCnWv9RIEfy6|Oi3&#NiBk>TlwF-44Fy%(|!7ir3I4nb93|aU}4_KHC?2y*pTU+ z&~*2{Vm&4ZZPr!1VnFWa{DMhKpa0lgCPdDf<7H9gnWx8j7u{@L8$>~`jy6(yJ zZV=tUIDJijv6QqGrvO9d!HHl~SaPaz6JgH&J!|^I{$d%X^lj7s_ZLerc|DviIiXnA z^5tg%hD>Hic(dn~<|ZLI`^_%_hRhd&V8ht*3zG9w;92Dg&-DBW#ZpWbiqpF%6idq1 zv5~4(+wsTOEazAG~IV%u{x9U{^?~Ci?x}~9+PAryZ;&?Uv)x=^2X0^{k(|IQqD=>+3PuHJRti*JiXL|6YVr3>@!RZx~iq)9b zSWjOtsaTQ8!+rW;ki3)E^yia`RhVA%Oy`|ktir@UVY>0;VhyJ22dBqQE>>eoI5)j@ zaPitqy6TKGGu-Thbk>8%FKh;S9ae;88TDE zz*1brV5x$lR8Wow2i+oGF^0@^HK=rHQbAFEGE(~CG?{*IO0g7^{GRD|rxZ&wMR!d9 zHKkbE>iS7>hRk3kur|(;3{bj7vQz7gI74PvB3PESBtJV9o(s14PWPW$EY0*Pbb8Lz zVkxG!3DY~K7ArBOO`N`FYOx#>@6qXJrWPwQ{XaYXJ&4b6emeiOVoBMGpArn28qv{D>0P`O+PoSSc>U|)bv-= 
zinW=(YEBoQUaZcPU^?Alda*W>Z`}0U>BY)S4*An3O)pku5-Oa&b9%8RllqS7Po@_u zF;D#|J)Lt#u@3W>pVHGUW)z$Ap8qY)khxU|>==dV8)p33!pD=?k9JpJFyVr?eg8`HIC6{|7t_#rbrWLB{%(*cI*)w7C~nJ(x~Uofj!o++Yr z`oUSnT1@xbr@sZsADc8?WOlJS)9k6!ZD$v2Foms{o<6%+l}Tg$^uF1}Dok51Pv0`T z*oZmltL*fbvx~KvY?-Eu%_&xA;*y$fKc`rkDN}cP%A8^qCU5`gT_D;xc>0Dp#ZpY( zQQObVDgGqDs4#v1+G1 int: @@ -27,6 +29,12 @@ def _get_int_env(name: str, default: int) -> int: def load_config() -> AppConfig: """读取环境变量生成配置对象,方便统一管理运行参数。""" + def _bool_env(name: str, default: bool) -> bool: + raw = os.environ.get(name) + if raw is None or raw.strip() == "": + return default + return raw.strip().lower() in ("1", "true", "yes", "on") + return AppConfig( db_path=os.environ.get("DB_PATH", "/private/argus/master/db.sqlite3"), metric_nodes_json_path=os.environ.get( @@ -37,4 +45,6 @@ def load_config() -> AppConfig: scheduler_interval_seconds=_get_int_env("SCHEDULER_INTERVAL_SECONDS", 30), node_id_prefix=os.environ.get("NODE_ID_PREFIX", "A"), auth_mode=os.environ.get("AUTH_MODE", "disabled"), + target_prefer_net_cidrs=os.environ.get("TARGET_PREFER_NET_CIDRS", "10.0.0.0/8,172.31.0.0/16"), + target_reachability_check=_bool_env("TARGET_REACHABILITY_CHECK", False), ) diff --git a/src/master/app/scheduler.py b/src/master/app/scheduler.py index 8797b25..1ba9c18 100644 --- a/src/master/app/scheduler.py +++ b/src/master/app/scheduler.py @@ -1,8 +1,10 @@ from __future__ import annotations +import ipaddress import logging +import socket import threading -from typing import Optional +from typing import Optional, Iterable, Dict, Any, List from .config import AppConfig from .storage import Storage @@ -34,10 +36,117 @@ class StatusScheduler: self._pending_nodes_json.set() def generate_nodes_json(self) -> None: + """根据在线节点生成 Prometheus 抓取目标,优先 overlay IP。 + + 候选顺序:meta.overlay_ip > hostname A 记录(命中偏好网段)> meta.ip。 + 可选 reachability 检查:TARGET_REACHABILITY_CHECK=true 时,对 9100/9400 做一次 1s TCP 连接测试, + 选择首个可达的候选;全部失败则按顺序取第一个并记录日志。 + """ with self._nodes_json_lock: - 
online_nodes = self._storage.get_online_nodes() - atomic_write_json(self._config.metric_nodes_json_path, online_nodes) - self._logger.info("nodes.json updated", extra={"count": len(online_nodes)}) + rows = self._storage.get_online_nodes_meta() + prefer_cidrs = self._parse_cidrs(self._config.target_prefer_net_cidrs) + reachability = self._config.target_reachability_check + + result: List[Dict[str, Any]] = [] + for row in rows: + meta = row.get("meta", {}) + hostname = meta.get("hostname") or row.get("name") + labels = row.get("labels") or [] + + overlay_ip = meta.get("overlay_ip") + legacy_ip = meta.get("ip") + host_candidates = self._resolve_host_ips(hostname) + host_pref = self._pick_by_cidrs(host_candidates, prefer_cidrs) + + candidates: List[str] = [] + for ip in [overlay_ip, host_pref, legacy_ip]: + if ip and ip not in candidates: + candidates.append(ip) + + chosen = None + if reachability: + ports = [9100] + try: + if int(meta.get("gpu_number", 0)) > 0: + ports.append(9400) + except Exception: + pass + for ip in candidates: + if any(self._reachable(ip, p, 1.0) for p in ports): + chosen = ip + break + if not chosen: + chosen = candidates[0] if candidates else legacy_ip + if not chosen: + # ultimate fallback: 127.0.0.1 (should not happen) + chosen = "127.0.0.1" + self._logger.warning("No candidate IPs for node; falling back", extra={"node": row.get("node_id")}) + + if chosen and ipaddress.ip_address(chosen) in ipaddress.ip_network("172.22.0.0/16"): + self._logger.warning( + "Prometheus target uses docker_gwbridge address; prefer overlay", + extra={"node": row.get("node_id"), "ip": chosen}, + ) + + result.append( + { + "node_id": row.get("node_id"), + "user_id": meta.get("user"), + "ip": chosen, + "hostname": hostname, + "labels": labels if isinstance(labels, list) else [], + } + ) + + atomic_write_json(self._config.metric_nodes_json_path, result) + self._logger.info("nodes.json updated", extra={"count": len(result)}) + + # ---------------------------- helpers 
---------------------------- + @staticmethod + def _parse_cidrs(raw: str) -> List[ipaddress.IPv4Network]: + nets: List[ipaddress.IPv4Network] = [] + for item in (x.strip() for x in (raw or "").split(",")): + if not item: + continue + try: + net = ipaddress.ip_network(item, strict=False) + if isinstance(net, ipaddress.IPv4Network): + nets.append(net) + except ValueError: + continue + return nets + + @staticmethod + def _resolve_host_ips(hostname: str) -> List[str]: + ips: List[str] = [] + try: + infos = socket.getaddrinfo(hostname, None, family=socket.AF_INET) + for info in infos: + ip = info[4][0] + if ip not in ips: + ips.append(ip) + except OSError: + pass + return ips + + @staticmethod + def _pick_by_cidrs(candidates: Iterable[str], prefer: List[ipaddress.IPv4Network]) -> str | None: + for net in prefer: + for ip in candidates: + try: + if ipaddress.ip_address(ip) in net: + return ip + except ValueError: + continue + return None + + @staticmethod + def _reachable(ip: str, port: int, timeout: float) -> bool: + try: + with socket.create_connection((ip, port), timeout=timeout): + return True + except OSError: + return False # ------------------------------------------------------------------ # internal loop diff --git a/src/master/app/storage.py b/src/master/app/storage.py index 3547066..8f154c1 100644 --- a/src/master/app/storage.py +++ b/src/master/app/storage.py @@ -324,9 +324,35 @@ class Storage: { "node_id": row["id"], "user_id": meta.get("user"), - "ip": meta.get("ip"), + "ip": meta.get("ip"), # kept for backward-compat; preferred IP selection handled in scheduler "hostname": meta.get("hostname", row["name"]), "labels": labels if isinstance(labels, list) else [], } ) return result + + def get_online_nodes_meta(self) -> List[Dict[str, Any]]: + """返回在线节点的原始 meta 与名称、标签,交由上层选择目标 IP。 + + 每项包含:{ node_id, name, meta, labels } + """ + with self._lock: + cur = self._conn.execute( + "SELECT id, name, meta_json, labels_json FROM nodes WHERE status = ? 
ORDER BY id ASC", + ("online",), + ) + rows = cur.fetchall() + + result: List[Dict[str, Any]] = [] + for row in rows: + meta = json.loads(row["meta_json"]) if row["meta_json"] else {} + labels = json.loads(row["labels_json"]) if row["labels_json"] else [] + result.append( + { + "node_id": row["id"], + "name": row["name"], + "meta": meta if isinstance(meta, dict) else {}, + "labels": labels if isinstance(labels, list) else [], + } + ) + return result diff --git a/src/metric/client-plugins/all-in-one-full/config/VERSION b/src/metric/client-plugins/all-in-one-full/config/VERSION index 32b7211..a50908c 100644 --- a/src/metric/client-plugins/all-in-one-full/config/VERSION +++ b/src/metric/client-plugins/all-in-one-full/config/VERSION @@ -1 +1 @@ -1.40.0 +1.42.0 diff --git a/src/metric/client-plugins/all-in-one-full/plugins/argus-agent/bin/argus-agent b/src/metric/client-plugins/all-in-one-full/plugins/argus-agent/bin/argus-agent index bb3f86b..cb9ff7e 100755 --- a/src/metric/client-plugins/all-in-one-full/plugins/argus-agent/bin/argus-agent +++ b/src/metric/client-plugins/all-in-one-full/plugins/argus-agent/bin/argus-agent @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1d2cf989d0089223b34a27a32d14aad83459afe25a58b1d9f4f3be9f3c5b82e1 -size 7580232 +oid sha256:e2e57a49ebf85f2a790381f73cabe22408d0f7428a5a5181724160781e73a75c +size 7583784 diff --git a/src/metric/client-plugins/all-in-one-full/scripts/install_artifact.sh b/src/metric/client-plugins/all-in-one-full/scripts/install_artifact.sh index 722f2e8..c5acba9 100755 --- a/src/metric/client-plugins/all-in-one-full/scripts/install_artifact.sh +++ b/src/metric/client-plugins/all-in-one-full/scripts/install_artifact.sh @@ -274,19 +274,33 @@ verify_checksums() { log_info "Artifact 目录: $artifact_dir" failed_verification=0 + # 尝试解析 version.json 中的 install_order,用于锁定精确文件名,避免同一目录下多份历史 tar 产生歧义 + local order_file="$TEMP_DIR/install_order.txt" if [[ -f "$TEMP_DIR/checksums.txt" ]]; then while IFS= read -r 
line; do component=$(echo "$line" | cut -d':' -f1) expected_checksum=$(echo "$line" | cut -d':' -f2-) - # 查找匹配的 tar 文件 + # 优先从 install_order 中推导精确文件名 actual_file="" - for file in "$artifact_dir/${component}-"*.tar.gz; do - if [[ -f "$file" ]]; then - actual_file="$file" - break - fi - done + if [[ -f "$order_file" ]]; then + while IFS= read -r fname; do + if [[ "$fname" == ${component}-*.tar.gz && -f "$artifact_dir/$fname" ]]; then + actual_file="$artifact_dir/$fname" + break + fi + done < "$order_file" + fi + + # 回退:按前缀匹配首个(不推荐,但保持兼容) + if [[ -z "$actual_file" ]]; then + for file in "$artifact_dir/${component}-"*.tar.gz; do + if [[ -f "$file" ]]; then + actual_file="$file" + break + fi + done + fi if [[ -z "$actual_file" ]]; then log_error "找不到组件文件: $component" diff --git a/src/metric/client-plugins/all-in-one-full/scripts/package_artifact.sh b/src/metric/client-plugins/all-in-one-full/scripts/package_artifact.sh index dd8a652..654fd82 100755 --- a/src/metric/client-plugins/all-in-one-full/scripts/package_artifact.sh +++ b/src/metric/client-plugins/all-in-one-full/scripts/package_artifact.sh @@ -59,6 +59,12 @@ ARTIFACT_DIR="artifact/$VERSION" log_info "开始打包 AIOps All-in-One 安装包 v$VERSION" +# 若强制打包且目录已存在,先清理旧产物以避免同一版本下残留多个 tar.gz 导致校验混乱 +if [[ "$FORCE_PACKAGE" == "true" && -d "$ARTIFACT_DIR" ]]; then + log_info "--force: 清理旧的 $ARTIFACT_DIR 下的 tar 与元数据" + rm -rf "$ARTIFACT_DIR" +fi + # 检查必要文件 log_info "检查必要文件..." if [[ ! -f "config/VERSION" ]]; then @@ -130,7 +136,7 @@ if [[ -d "$ARTIFACT_DIR" && "$FORCE_PACKAGE" == "false" ]]; then fi fi -# 创建 artifact 目录 +# 创建 artifact 目录(清理后重建) mkdir -p "$ARTIFACT_DIR" log_info "创建输出目录: $ARTIFACT_DIR" @@ -285,10 +291,13 @@ while IFS= read -r component; do exit 1 fi + # 清理组件目录内历史 tar 包,避免 find 误选旧文件 + rm -f ./*.tar.gz 2>/dev/null || true + # 执行组件的打包脚本 if ./package.sh; then # 查找生成的 tar 包 - tar_file=$(find . 
-name "*.tar.gz" -type f | head -1) + tar_file=$(ls -1t ./*.tar.gz 2>/dev/null | head -1) if [[ -n "$tar_file" ]]; then # 移动到 artifact 目录 mv "$tar_file" "$CURRENT_DIR/$ARTIFACT_DIR/" diff --git a/src/metric/client-plugins/all-in-one-full/scripts/publish_artifact.sh b/src/metric/client-plugins/all-in-one-full/scripts/publish_artifact.sh index b292a8d..ae6a09b 100755 --- a/src/metric/client-plugins/all-in-one-full/scripts/publish_artifact.sh +++ b/src/metric/client-plugins/all-in-one-full/scripts/publish_artifact.sh @@ -130,20 +130,40 @@ fi TEMP_PACKAGE_DIR="/tmp/argus-metric-package-$$" mkdir -p "$TEMP_PACKAGE_DIR" -# 复制所有 tar.gz 文件到临时目录 -log_info "准备 artifact 文件..." -tar_files=$(find "$ARTIFACT_DIR" -name "*.tar.gz" -type f) +# 仅复制 version.json 中 install_order 列出的 tar.gz,防止同一版本目录下历史残留文件导致校验不一致 +log_info "准备 artifact 文件(按 install_order)..." -if [[ -z "$tar_files" ]]; then - log_error "在 $ARTIFACT_DIR 中未找到 tar.gz 文件" - exit 1 +install_list_file="$TEMP_DIR/install_list.txt" +if command -v jq >/dev/null 2>&1; then + jq -r '.install_order[]' "$ARTIFACT_DIR/version.json" > "$install_list_file" 2>/dev/null || true +else + # 简易解析 + grep -A 200 '"install_order"' "$ARTIFACT_DIR/version.json" | grep -E '".*"' | sed 's/.*"\([^"]*\)".*/\1/' > "$install_list_file" 2>/dev/null || true fi -for file in $tar_files; do - filename=$(basename "$file") - log_info " 准备: $filename" - cp "$file" "$TEMP_PACKAGE_DIR/" -done +if [[ -s "$install_list_file" ]]; then + while IFS= read -r filename; do + src="$ARTIFACT_DIR/$filename" + if [[ -f "$src" ]]; then + log_info " 拷贝: $filename" + cp "$src" "$TEMP_PACKAGE_DIR/" + else + log_warning " 未找到: $filename(跳过)" + fi + done < "$install_list_file" +else + log_warning "未能解析 install_order,将回退复制全部 tar.gz(可能包含历史残留,建议安装端使用严格校验)" + tar_files=$(find "$ARTIFACT_DIR" -name "*.tar.gz" -type f) + if [[ -z "$tar_files" ]]; then + log_error "在 $ARTIFACT_DIR 中未找到 tar.gz 文件" + exit 1 + fi + for file in $tar_files; do + filename=$(basename "$file") + log_info " 
准备: $filename" + cp "$file" "$TEMP_PACKAGE_DIR/" + done +fi # 复制版本信息文件 if [[ -f "$ARTIFACT_DIR/version.json" ]]; then diff --git a/src/sys/build/node-bundle/Dockerfile b/src/sys/build/node-bundle/Dockerfile new file mode 100644 index 0000000..7f76ee9 --- /dev/null +++ b/src/sys/build/node-bundle/Dockerfile @@ -0,0 +1,16 @@ +ARG BASE_IMAGE=argus-sys-metric-test-node:latest +FROM ${BASE_IMAGE} + +ARG CLIENT_VER +LABEL org.opencontainers.image.title="argus-sys-metric-test-node-bundle" \ + org.opencontainers.image.version="${CLIENT_VER}" \ + org.opencontainers.image.description="Metric test node with embedded client package" + +WORKDIR / + +# bundle files are provided at build time into ./bundle in build context +COPY bundle/ /bundle/ +COPY node-bootstrap.sh /usr/local/bin/node-bootstrap.sh +RUN chmod +x /usr/local/bin/node-bootstrap.sh + +ENTRYPOINT ["/usr/local/bin/node-bootstrap.sh"] diff --git a/src/sys/build/node-bundle/bundle/.gitignore b/src/sys/build/node-bundle/bundle/.gitignore new file mode 100644 index 0000000..11e243e --- /dev/null +++ b/src/sys/build/node-bundle/bundle/.gitignore @@ -0,0 +1,2 @@ + +argus-metric_*.tar.gz diff --git a/src/sys/build/node-bundle/bundle/setup.sh b/src/sys/build/node-bundle/bundle/setup.sh new file mode 100755 index 0000000..006d679 --- /dev/null +++ b/src/sys/build/node-bundle/bundle/setup.sh @@ -0,0 +1,1006 @@ +#!/bin/bash + +set -e + +# 加载配置文件(仅在解压后的目录中可用) +load_config() { + # setup.sh 脚本不需要配置文件,FTP参数通过命令行参数或环境变量提供 + log_info "setup.sh 脚本使用命令行参数或环境变量获取FTP配置" +} + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +FTP_SERVER="${FTP_SERVER}" +FTP_USER="${FTP_USER}" +FTP_PASS="${FTP_PASS}" +FTP_PORT="${FTP_PORT:-21}" +BASE_URL="" # 
FTP基础URL (将在check_ftp_params中设置) +LATEST_VERSION_URL="" # 版本文件URL (将在check_ftp_params中设置) +TEMP_DIR="/tmp/argus-metric-install-$$" + +# 安装目录配置 +DEFAULT_INSTALL_DIR="/opt/argus-metric" # 默认安装目录 +INSTALL_DIR="${INSTALL_DIR:-$DEFAULT_INSTALL_DIR}" # 可通过环境变量覆盖 +VERSIONS_DIR="$INSTALL_DIR/versions" # 版本目录 +BACKUPS_DIR="$INSTALL_DIR/backups" # 备份目录 +CURRENT_LINK="$INSTALL_DIR/current" # 当前版本软链接 +LATEST_VERSION_FILE="$INSTALL_DIR/LATEST_VERSION" # 当前版本记录文件 + +# 预检查:Agent 元数据与 hostname 约束 +require_agent_metadata() { + local hn + hn="$(hostname)" + local ok=false + # 三元环境变量 + if [[ -n "${AGENT_ENV:-}" && -n "${AGENT_USER:-}" && -n "${AGENT_INSTANCE:-}" ]]; then + ok=true + fi + # host 形如 env-user-instance-xxx + if [[ "$hn" =~ ^[^-]+-[^-]+-[^-]+-.*$ ]]; then + ok=true + fi + if [[ "$ok" == false ]]; then + log_error "检测到 hostname 与 Agent 元数据不完整:" + log_error " 当前 hostname: $hn" + log_error " AGENT_ENV='${AGENT_ENV:-}' AGENT_USER='${AGENT_USER:-}' AGENT_INSTANCE='${AGENT_INSTANCE:-}'" + echo + log_info "请满足以下其一后重试:" + log_info " 方式A:设置 hostname 为 env-user-instance-任意,例如 dev-alice-node001-pod-0" + log_info " 方式B:导出环境变量:export AGENT_ENV=dev AGENT_USER=alice AGENT_INSTANCE=node001" + exit 1 + fi +} + +# 检查必需的FTP参数 +check_ftp_params() { + local missing_params=() + + if [[ -z "$FTP_SERVER" ]]; then + missing_params+=("FTP_SERVER") + fi + + if [[ -z "$FTP_USER" ]]; then + missing_params+=("FTP_USER") + fi + + if [[ -z "$FTP_PASS" ]]; then + missing_params+=("FTP_PASS") + fi + + if [[ ${#missing_params[@]} -gt 0 ]]; then + log_error "缺少必需的FTP参数: ${missing_params[*]}" + log_error "请通过以下方式之一设置FTP参数:" + log_error " 1. 命令行参数: --server <地址> --user <用户名> --password <密码>" + log_error " 2. 
环境变量: FTP_SERVER=<地址> FTP_USER=<用户名> FTP_PASS=<密码>" + log_error "" + log_error "示例:" + log_error " sudo sh setup.sh --server 10.211.55.4 --user ftpuser --password admin1234" + log_error " FTP_SERVER=10.211.55.4 FTP_USER=ftpuser FTP_PASS=admin1234 sudo sh setup.sh" + exit 1 + fi + + # 设置BASE_URL和LATEST_VERSION_URL + BASE_URL="ftp://${FTP_SERVER}:${FTP_PORT}" + LATEST_VERSION_URL="$BASE_URL/LATEST_VERSION" + + log_info "FTP配置:" + log_info " 服务器: $FTP_SERVER:$FTP_PORT" + log_info " 用户: $FTP_USER" +} + +# 获取最新版本号的函数 +get_latest_version() { + log_info "获取最新版本信息..." >&2 + log_info "尝试从URL获取: $LATEST_VERSION_URL" >&2 + + # 先测试FTP连接 + log_info "测试FTP连接..." >&2 + if ! curl -u "${FTP_USER}:${FTP_PASS}" -sfI "$LATEST_VERSION_URL" >/dev/null 2>&1; then + log_error "无法连接到FTP服务器或文件不存在" >&2 + log_error "URL: $LATEST_VERSION_URL" >&2 + log_error "请检查:" >&2 + log_error " 1. FTP服务器是否运行: $FTP_SERVER:$FTP_PORT" >&2 + log_error " 2. 用户名密码是否正确: $FTP_USER" >&2 + log_error " 3. LATEST_VERSION文件是否存在" >&2 + log_error "手动测试命令: curl -u ${FTP_USER}:${FTP_PASS} ftp://${FTP_SERVER}/LATEST_VERSION" >&2 + exit 1 + fi + + # 获取文件内容 + if ! LATEST_VERSION=$(curl -u "${FTP_USER}:${FTP_PASS}" -sfL "$LATEST_VERSION_URL" 2>/dev/null | tr -d '[:space:]'); then + log_error "下载LATEST_VERSION文件失败" >&2 + exit 1 + fi + + log_info "原始获取内容: '$LATEST_VERSION'" >&2 + + if [[ -z "$LATEST_VERSION" ]]; then + log_error "获取到的版本信息为空" >&2 + log_error "可能的原因:" >&2 + log_error " 1. LATEST_VERSION文件为空" >&2 + log_error " 2. 文件内容格式不正确" >&2 + log_error " 3. 
网络传输问题" >&2 + log_error "请检查FTP服务器上的 /srv/ftp/share/LATEST_VERSION 文件" >&2 + exit 1 + fi + + log_info "检测到最新版本: $LATEST_VERSION" >&2 + echo "$LATEST_VERSION" +} + +# 解析参数 +ARGUS_VERSION="" # 使用不同的变量名避免与系统VERSION冲突 +ACTION="install" +FORCE_INSTALL=false + +while [[ $# -gt 0 ]]; do + case $1 in + --version) + ARGUS_VERSION="$2" + shift 2 + ;; + --server) + FTP_SERVER="$2" + shift 2 + ;; + --user) + FTP_USER="$2" + shift 2 + ;; + --password) + FTP_PASS="$2" + shift 2 + ;; + --port) + FTP_PORT="$2" + shift 2 + ;; + --uninstall) + ACTION="uninstall" + shift + ;; + --install-dir) + INSTALL_DIR="$2" + shift 2 + ;; + # 简化安装逻辑:不再支持回滚和备份列表功能 + # --rollback) + # ACTION="rollback" + # shift + # ;; + # --backup-list) + # ACTION="backup-list" + # shift + # ;; + --status) + ACTION="status" + shift + ;; + --force) + FORCE_INSTALL=true + shift + ;; + --help) + echo "Argus Metric FTP在线安装脚本" + echo + echo "用法: curl -u <用户名>:<密码> ftp://<服务器>/setup.sh -o setup.sh && sh setup.sh [选项]" + echo + echo "必需参数 (必须通过命令行参数或环境变量设置):" + echo " --server SERVER FTP服务器地址 (必须)" + echo " --user USER FTP用户名 (必须)" + echo " --password PASS FTP密码 (必须)" + echo + echo "可选参数:" + echo " --version VERSION 指定版本 (默认: 自动获取最新版本)" + echo " --port PORT FTP端口 (默认: 21)" + echo " --install-dir DIR 安装目录 (默认: /opt/argus-metric)" + echo " --force 强制重新安装 (即使相同版本)" + echo " --uninstall 卸载 (自动确认)" + # echo " --rollback 回滚到上一个备份版本" + # echo " --backup-list 列出所有备份版本" + echo " --status 显示当前安装状态" + echo " --help 显示帮助" + echo + echo "环境变量:" + echo " FTP_SERVER FTP服务器地址 (必须)" + echo " FTP_USER FTP用户名 (必须)" + echo " FTP_PASS FTP密码 (必须)" + echo " FTP_PORT FTP端口 (默认: 21)" + echo + echo "示例:" + echo " # 方式1: 使用命令行参数" + echo " curl -u ftpuser:admin1234 ftp://10.211.55.4/setup.sh -o setup.sh" + echo " sudo sh setup.sh --server 10.211.55.4 --user ftpuser --password admin1234" + echo " " + echo " # 方式2: 使用环境变量" + echo " FTP_SERVER=10.211.55.4 FTP_USER=ftpuser FTP_PASS=admin1234 sudo sh setup.sh" + echo " " + echo " # 指定版本安装" + echo " sudo 
sh setup.sh --server 10.211.55.4 --user ftpuser --password admin1234 --version 1.30.0" + echo " " + echo " # 强制重新安装" + echo " sudo sh setup.sh --server 10.211.55.4 --user ftpuser --password admin1234 --force" + echo " " + echo " # 卸载" + echo " sudo sh setup.sh --server 10.211.55.4 --user ftpuser --password admin1234 --uninstall" + exit 0 + ;; + *) + log_error "未知参数: $1" + echo "使用 --help 查看帮助信息" + exit 1 + ;; + esac +done + +# 清理函数 +cleanup() { + if [[ -d "$TEMP_DIR" ]]; then + rm -rf "$TEMP_DIR" + fi +} + +trap cleanup EXIT + +# 创建安装目录结构 +create_install_directories() { + log_info "创建安装目录结构..." + + # 创建主要目录 + mkdir -p "$VERSIONS_DIR" + mkdir -p "$BACKUPS_DIR" + + log_success "安装目录结构创建完成: $INSTALL_DIR" +} + +# 获取当前安装的版本 +get_current_version() { + # 优先从LATEST_VERSION文件读取 + if [[ -f "$LATEST_VERSION_FILE" ]]; then + local version_from_file=$(cat "$LATEST_VERSION_FILE" 2>/dev/null | tr -d '[:space:]') + if [[ -n "$version_from_file" ]]; then + # 确保版本号格式一致(不带v前缀) + echo "$version_from_file" + return 0 + fi + fi + + # 如果文件不存在或为空,从软链接读取 + if [[ -L "$CURRENT_LINK" ]]; then + local current_path=$(readlink "$CURRENT_LINK") + # 从版本目录名中提取版本号(现在不带v前缀) + basename "$current_path" + else + echo "" + fi +} + +# 检查是否已安装 +check_installed() { + if [[ -L "$CURRENT_LINK" ]] && [[ -d "$CURRENT_LINK" ]]; then + local current_version=$(get_current_version) + if [[ -n "$current_version" ]]; then + log_info "检测到已安装版本: v$current_version" + return 0 + fi + fi + return 1 +} + +# 更新LATEST_VERSION文件 +update_latest_version_file() { + local version="$1" + log_info "更新LATEST_VERSION文件: $version" + + if echo "$version" > "$LATEST_VERSION_FILE"; then + log_success "LATEST_VERSION文件已更新" + else + log_error "更新LATEST_VERSION文件失败" + return 1 + fi +} + +# 初始化 DNS 配置文件到系统目录 +init_dns_config_to_system() { + log_info "初始化 DNS 配置文件到系统目录..." + + # 系统 DNS 配置文件 + local system_dns_conf="$INSTALL_DIR/dns.conf" + + # 如果系统目录中还没有 dns.conf,创建一个空的占位文件 + if [[ ! 
-f "$system_dns_conf" ]]; then + touch "$system_dns_conf" + chmod 644 "$system_dns_conf" + log_success "DNS 配置文件占位文件已创建: $system_dns_conf" + log_info "DNS 同步脚本将从 FTP 服务器下载实际的 DNS 配置" + else + log_info "DNS 配置文件已存在: $system_dns_conf" + fi +} + +# 备份当前版本 +backup_current_version() { + local current_version=$(get_current_version) + if [[ -z "$current_version" ]]; then + log_info "没有当前版本需要备份" + return 0 + fi + + # 确保备份目录存在 + mkdir -p "$BACKUPS_DIR" + + local backup_name="$current_version" + local backup_path="$BACKUPS_DIR/$backup_name" + + log_info "备份当前版本 $current_version 到: $backup_path" + + # 如果备份已存在,先删除 + if [[ -d "$backup_path" ]]; then + log_info "备份版本已存在,覆盖: $backup_path" + rm -rf "$backup_path" + fi + + # 复制当前版本目录(跟随软链接复制实际内容) + if cp -rL "$CURRENT_LINK" "$backup_path"; then + log_success "版本备份完成: $backup_name" + + else + log_error "版本备份失败" + exit 1 + fi +} + +# 回滚到备份版本 +rollback_to_backup() { + local backup_name="$1" + + # 确保备份目录存在 + mkdir -p "$BACKUPS_DIR" + + local backup_path="$BACKUPS_DIR/$backup_name" + + if [[ ! -d "$backup_path" ]]; then + log_error "备份不存在: $backup_path" + return 1 + fi + + log_info "回滚到备份版本: $backup_name" + + # 停止当前服务 + stop_services + + # 检查是否存在对应的版本目录 + local version_dir="$VERSIONS_DIR/$backup_name" + + if [[ ! -d "$version_dir" ]]; then + log_info "版本目录不存在,从备份恢复版本目录: $version_dir" + # 从备份目录恢复到版本目录 + mkdir -p "$VERSIONS_DIR" + cp -r "$backup_path" "$version_dir" + fi + + # 恢复软链接指向版本目录 + if ln -sfn "$version_dir" "$CURRENT_LINK"; then + log_success "版本回滚完成: $backup_name" + + # 更新LATEST_VERSION文件 + update_latest_version_file "$backup_name" + + return 0 + else + log_error "版本回滚失败" + return 1 + fi +} + +# 停止服务 +stop_services() { + log_info "停止当前服务..." + + # 检查服务是否正在运行 + if ! 
check_services_running; then + log_info "服务未运行,无需停止" + return 0 + fi + + # 尝试使用卸载脚本停止服务 + if [[ -f "$CURRENT_LINK/uninstall.sh" ]]; then + cd "$CURRENT_LINK" + chmod +x uninstall.sh + + # 自动确认停止服务(避免交互式确认) + echo "y" | ./uninstall.sh >/dev/null 2>&1 + local stop_exit_code=$? + + if [[ $stop_exit_code -eq 0 ]]; then + log_success "服务停止完成" + else + log_warning "停止服务时出现警告,尝试手动停止" + manual_stop_services + fi + else + log_warning "未找到卸载脚本,尝试手动停止服务" + manual_stop_services + fi +} + +# 手动停止服务 +manual_stop_services() { + log_info "手动停止服务..." + + # 停止 node_exporter + if pgrep -f "node_exporter" >/dev/null 2>&1; then + pkill -f "node_exporter" && log_info "node_exporter 已停止" + fi + + # 停止 dcgm_exporter + if pgrep -f "dcgm_exporter" >/dev/null 2>&1; then + pkill -f "dcgm_exporter" && log_info "dcgm_exporter 已停止" + fi + + # 等待进程完全停止 + sleep 2 + + # 检查是否还有残留进程 + if pgrep -f "node_exporter\|dcgm_exporter" >/dev/null 2>&1; then + log_warning "仍有服务进程运行,尝试强制停止" + pkill -9 -f "node_exporter\|dcgm_exporter" 2>/dev/null || true + fi + + log_success "手动停止服务完成" +} + +# 启动服务 +start_services() { + log_info "启动服务..." 
+ + # 检查服务是否已经在运行 + if check_services_running; then + log_info "服务已在运行,跳过启动" + return 0 + fi + + # 由于 install_artifact.sh 已经安装了所有组件并设置了健康检查定时任务 + # 这里只需要简单验证服务状态即可 + log_info "组件已安装完成,健康检查定时任务已设置" + log_info "服务将在健康检查时自动启动(每5分钟检查一次)" + + # 等待一下让服务有时间启动 + sleep 3 + + # 验证服务状态 + if check_services_running; then + log_success "服务启动成功" + else + log_info "服务可能正在启动中,健康检查机制将自动监控" + fi + + return 0 +} + +# 检查服务是否正在运行 +check_services_running() { + # 检查常见的服务端口是否在监听 + local ports=(9100 9400) # node-exporter 和 dcgm-exporter 的默认端口 + + for port in "${ports[@]}"; do + if netstat -tlnp 2>/dev/null | grep -q ":$port "; then + log_info "检测到服务正在端口 $port 上运行" + return 0 + fi + done + + # 检查相关进程 + if pgrep -f "node_exporter\|dcgm_exporter" >/dev/null 2>&1; then + log_info "检测到相关服务进程正在运行" + return 0 + fi + + return 1 +} + +# 检查是否为 root 用户 +check_root() { + if [[ $EUID -ne 0 ]]; then + log_error "此脚本需要 root 权限运行" + log_info "请使用: sudo sh setup.sh" + exit 1 + fi +} + +# 检查系统要求 +check_system() { + log_info "检查系统要求..." + + # 检查操作系统 + if [[ ! -f /etc/os-release ]]; then + log_error "无法检测操作系统版本" + exit 1 + fi + + # 读取系统信息,使用子shell避免污染当前环境变量 + local OS_INFO=$(source /etc/os-release && echo "$NAME $VERSION_ID") + log_info "检测到操作系统: $OS_INFO" + + # 检查系统架构 + arch=$(uname -m) + log_info "系统架构: $arch" + + # 检查磁盘空间 + available_space=$(df / | awk 'NR==2 {print $4}') + if [[ $available_space -lt 1024 ]]; then + log_warning "可用磁盘空间不足 1GB,当前可用: $(($available_space / 1024 / 1024))GB" + fi +} + +# 下载并安装 +install_argus_metric() { + # 如果没有指定版本,获取最新版本 + if [[ -z "$ARGUS_VERSION" ]]; then + ARGUS_VERSION=$(get_latest_version) + fi + + log_info "开始安装 Argus Metric v$ARGUS_VERSION..." 
+ log_info "安装目录: $INSTALL_DIR" + + # 创建安装目录结构(必须先创建,以便备份时目录存在) + create_install_directories + + # 检查是否已安装 + local is_upgrade=false + if check_installed; then + local current_version=$(get_current_version) + if [[ "$current_version" == "$ARGUS_VERSION" ]]; then + if [[ "$FORCE_INSTALL" == true ]]; then + log_info "检测到相同版本 v$ARGUS_VERSION,但使用了 --force 参数,将强制重新安装" + is_upgrade=true + # 简化安装逻辑:不再备份当前版本 + # backup_current_version + else + log_info "版本 v$ARGUS_VERSION 已安装,无需重复安装" + log_info "如需强制重新安装,请使用 --force 参数" + return 0 + fi + else + log_info "检测到版本升级: v$current_version -> v$ARGUS_VERSION" + is_upgrade=true + + # 简化安装逻辑:不再备份当前版本 + # backup_current_version + fi + fi + + # 创建临时目录 + mkdir -p "$TEMP_DIR" + cd "$TEMP_DIR" + + # 下载发布包,使用新的命名规范 + TAR_NAME="argus-metric_$(echo $ARGUS_VERSION | tr '.' '_').tar.gz" + log_info "下载发布包: $TAR_NAME" + log_info "从FTP服务器下载: $FTP_SERVER:$FTP_PORT, 用户: $FTP_USER" + + # 构造curl命令并显示(隐藏密码) + CURL_CMD="curl -u \"${FTP_USER}:***\" -sfL \"$BASE_URL/$TAR_NAME\" -o \"$TAR_NAME\"" + log_info "执行命令: $CURL_CMD" + + if ! curl -u "${FTP_USER}:${FTP_PASS}" -sfL "$BASE_URL/$TAR_NAME" -o "$TAR_NAME"; then + log_error "下载发布包失败: $BASE_URL/$TAR_NAME" + log_error "完整命令: curl -u \"${FTP_USER}:${FTP_PASS}\" -sfL \"$BASE_URL/$TAR_NAME\" -o \"$TAR_NAME\"" + log_error "请检查FTP服务器连接、用户名密码是否正确" + exit 1 + fi + + # 解压发布包到当前目录 + log_info "解压发布包..." + if ! tar -xzf "$TAR_NAME"; then + log_error "解压发布包失败" + exit 1 + fi + + # 显示解压后的文件结构 + log_info "解压后的文件结构:" + ls -la "$TEMP_DIR" + + # 准备版本目录 + local version_dir="$VERSIONS_DIR/$ARGUS_VERSION" + log_info "安装到版本目录: $version_dir" + + # 如果升级,先停止服务 + if [[ "$is_upgrade" == true ]]; then + stop_services + fi + + # 创建版本目录 + if [[ -d "$version_dir" ]]; then + log_info "版本目录已存在,备份后更新" + rm -rf "$version_dir" + fi + + # 创建新的版本目录 + mkdir -p "$version_dir" + + # 移动解压的文件到版本目录 + log_info "移动文件到版本目录: $TEMP_DIR/* -> $version_dir/" + + # 检查源目录是否有内容 + if [[ ! 
"$(ls -A "$TEMP_DIR" 2>/dev/null)" ]]; then + log_error "临时目录为空,无法移动文件" + exit 1 + fi + + # 检查目标目录是否存在 + if [[ ! -d "$version_dir" ]]; then + log_error "目标版本目录不存在: $version_dir" + exit 1 + fi + + # 执行文件移动 + if mv "$TEMP_DIR"/* "$version_dir" 2>/dev/null; then + log_success "文件移动到版本目录完成" + else + log_error "移动文件到版本目录失败" + log_error "源目录内容:" + ls -la "$TEMP_DIR" || true + log_error "目标目录状态:" + ls -la "$version_dir" || true + log_error "权限检查:" + ls -ld "$TEMP_DIR" "$version_dir" || true + exit 1 + fi + + # 执行安装脚本 + log_info "执行安装脚本..." + cd "$version_dir" + if [[ -f "install.sh" ]]; then + chmod +x install.sh + # 传递安装根目录给安装脚本,让install_artifact.sh安装到正确的版本目录 + if ./install.sh "$version_dir"; then + log_success "安装脚本执行完成" + else + log_error "安装脚本执行失败" + # 简化安装逻辑:不再自动回滚 + # if [[ "$is_upgrade" == true ]]; then + # log_warning "升级失败,尝试回滚到之前版本..." + # # 确保备份目录存在 + # mkdir -p "$BACKUPS_DIR" + # local latest_backup=$(ls -1t "$BACKUPS_DIR" 2>/dev/null | head -n 1) + # if [[ -n "$latest_backup" ]]; then + # rollback_to_backup "$latest_backup" + # return 1 + # fi + # fi + exit 1 + fi + else + log_error "未找到安装脚本 install.sh" + exit 1 + fi + + # 更新软链接指向新版本 + log_info "更新当前版本链接..." + + # 如果 current 已经存在且是目录,先删除它 + if [[ -d "$CURRENT_LINK" ]] && [[ ! -L "$CURRENT_LINK" ]]; then + log_warning "发现 current 是目录而不是符号链接,正在删除..." + rm -rf "$CURRENT_LINK" + fi + + if ln -sfn "$version_dir" "$CURRENT_LINK"; then + log_success "版本链接更新完成: $CURRENT_LINK -> $version_dir" + else + log_error "版本链接更新失败" + exit 1 + fi + + # 更新LATEST_VERSION文件 + update_latest_version_file "$ARGUS_VERSION" + + # 初始化 DNS 配置文件到系统目录 + init_dns_config_to_system + + # 启动服务 + # start_services + + log_success "Argus Metric v$ARGUS_VERSION 安装完成!" 
+ + # 显示安装信息 + echo + log_info "安装信息:" + log_info " 版本: $ARGUS_VERSION" + log_info " 安装目录: $INSTALL_DIR" + log_info " 版本目录: $version_dir" + log_info " 当前链接: $CURRENT_LINK" + if [[ "$is_upgrade" == true ]]; then + log_info " 升级类型: 版本升级" + else + log_info " 安装类型: 全新安装" + fi +} + +# 卸载 +uninstall_argus_metric() { + log_info "开始卸载 Argus Metric..." + log_info "安装目录: $INSTALL_DIR" + + # 检查是否已安装 + if ! check_installed; then + log_info "未检测到已安装的 Argus Metric" + return 0 + fi + + local current_version=$(get_current_version) + log_info "检测到当前版本: v$current_version" + + # 停止服务 + stop_services + + # 执行卸载脚本 + log_info "执行卸载脚本..." + if [[ -f "$CURRENT_LINK/uninstall.sh" ]]; then + cd "$CURRENT_LINK" + chmod +x uninstall.sh + + # 自动确认卸载(因为用户已经明确使用了 --uninstall 参数) + log_info "自动确认卸载操作..." + echo "y" | ./uninstall.sh + local uninstall_exit_code=$? + + if [[ $uninstall_exit_code -eq 0 ]]; then + log_success "卸载脚本执行完成" + else + log_error "卸载脚本执行失败 (退出码: $uninstall_exit_code)" + exit 1 + fi + else + log_warning "未找到卸载脚本,执行基本清理" + fi + + # 清理安装目录 + log_info "清理安装目录..." + if [[ -d "$INSTALL_DIR" ]]; then + # 询问是否完全删除安装目录 + log_warning "这将删除整个安装目录: $INSTALL_DIR" + log_warning "包括所有版本、备份和配置文件" + + # 在自动化环境中,直接删除 + if rm -rf "$INSTALL_DIR"; then + log_success "安装目录已完全清理: $INSTALL_DIR" + else + log_error "清理安装目录失败" + exit 1 + fi + else + log_info "安装目录不存在,无需清理" + fi + + log_success "Argus Metric 卸载完成!" 
+} + +# 显示状态 +show_status() { + echo "==========================================" + echo " Argus Metric 安装状态" + echo "==========================================" + echo + + if check_installed; then + local current_version=$(get_current_version) + log_info "当前版本: $current_version" + log_info "安装目录: $INSTALL_DIR" + log_info "当前链接: $CURRENT_LINK" + log_info "版本目录: $VERSIONS_DIR/$current_version" + log_info "版本文件: $LATEST_VERSION_FILE" + + # 显示LATEST_VERSION文件内容 + if [[ -f "$LATEST_VERSION_FILE" ]]; then + local file_version=$(cat "$LATEST_VERSION_FILE" 2>/dev/null | tr -d '[:space:]') + log_info "版本文件内容: $file_version" + fi + + echo + log_info "目录结构:" + if [[ -d "$INSTALL_DIR" ]]; then + tree -L 2 "$INSTALL_DIR" 2>/dev/null || ls -la "$INSTALL_DIR" + fi + + echo + log_info "可用版本:" + if [[ -d "$VERSIONS_DIR" ]]; then + ls -1 "$VERSIONS_DIR" 2>/dev/null | sed 's/^/ - /' + else + echo " 无" + fi + + # 简化安装逻辑:不再显示备份版本信息 + # echo + # log_info "备份版本:" + # if [[ -d "$BACKUPS_DIR" ]] && [[ $(ls -1 "$BACKUPS_DIR" 2>/dev/null | wc -l) -gt 0 ]]; then + # ls -1t "$BACKUPS_DIR" 2>/dev/null | sed 's/^/ - /' + # else + # echo " 无" + # fi + else + log_warning "Argus Metric 未安装" + log_info "安装目录: $INSTALL_DIR" + fi +} + +# 列出备份 +list_backups() { + echo "==========================================" + echo " Argus Metric 备份列表" + echo "==========================================" + echo + + if [[ -d "$BACKUPS_DIR" ]] && [[ $(ls -1 "$BACKUPS_DIR" 2>/dev/null | wc -l) -gt 0 ]]; then + log_info "可用备份版本:" + ls -1t "$BACKUPS_DIR" 2>/dev/null | while read backup; do + local backup_time=$(stat -c %y "$BACKUPS_DIR/$backup" 2>/dev/null | cut -d' ' -f1-2) + echo " - $backup (创建时间: $backup_time)" + done + else + log_warning "没有可用的备份版本" + fi +} + +# 回滚功能 +rollback_version() { + log_info "开始回滚操作..." + + if ! 
check_installed; then + log_error "没有检测到已安装的版本,无法回滚" + exit 1 + fi + + # 确保备份目录存在 + mkdir -p "$BACKUPS_DIR" + + # 获取最新的备份 + local latest_backup=$(ls -1t "$BACKUPS_DIR" 2>/dev/null | head -n 1) + if [[ -z "$latest_backup" ]]; then + log_error "没有找到可用的备份版本" + exit 1 + fi + + log_info "将回滚到备份版本: $latest_backup" + + if rollback_to_backup "$latest_backup"; then + log_success "回滚完成!" + + # 显示当前状态 + echo + show_status + else + log_error "回滚失败" + exit 1 + fi +} + +# 自检实现:等待 node.json 就绪且健康,并验证 last_report 持续更新 +selfcheck_post_install() { + local hn="$(hostname)" + local node_file="/private/argus/agent/${AGENT_HOSTNAME:-$hn}/node.json" + local deadline=$(( $(date +%s) + 300 )) + local t1="" t2="" + while :; do + if [[ -f "$node_file" ]]; then + if command -v jq >/dev/null 2>&1; then + local ok_health lr + ok_health=$(jq -er '(.health["metric-argus-agent"].status=="healthy") and (.health["metric-node-exporter"].status=="healthy") and (.health["metric-fluent-bit"].status=="healthy") and (.health["metric-dcgm-exporter"].status=="healthy")' "$node_file" 2>/dev/null || echo false) + lr=$(jq -r '.last_report // ""' "$node_file" 2>/dev/null) + if [[ "$ok_health" == true && -n "$lr" ]]; then + if [[ -z "$t1" ]]; then + t1="$lr" + # agent 默认 60s 上报,等待 70s 再校验一次 + sleep 70 + continue + fi + t2="$lr" + if [[ "$t2" != "$t1" ]]; then + return 0 + fi + # 若未变化,再等待一会儿直到超时 + sleep 10 + fi + else + # 无 jq 时的宽松校验 + if grep -q '"status"\s*:\s*"healthy"' "$node_file"; then + return 0 + fi + fi + fi + if (( $(date +%s) >= deadline )); then + log_error "自检超时:未在 5 分钟内确认 last_report 持续更新 或 健康状态不满足(路径:$node_file)" + return 1 + fi + sleep 5 + done +} + +# 主函数 +main() { + echo "==========================================" + echo " Argus Metric 在线安装脚本 v1.0" + echo "==========================================" + echo + + # 加载配置文件 + load_config + + # 对于状态操作,不需要FTP参数和root权限 + # 简化安装逻辑:不再支持备份列表操作 + if [[ "$ACTION" == "status" ]]; then + show_status + return 0 + fi + # if [[ "$ACTION" == "status" || "$ACTION" 
== "backup-list" ]]; then + # if [[ "$ACTION" == "status" ]]; then + # show_status + # elif [[ "$ACTION" == "backup-list" ]]; then + # list_backups + # fi + # return 0 + # fi + + check_root + + # 更新目录配置变量(在设置INSTALL_DIR后) + VERSIONS_DIR="$INSTALL_DIR/versions" + BACKUPS_DIR="$INSTALL_DIR/backups" + CURRENT_LINK="$INSTALL_DIR/current" + LATEST_VERSION_FILE="$INSTALL_DIR/LATEST_VERSION" + + # 简化安装逻辑:不再支持回滚操作 + # if [[ "$ACTION" == "rollback" ]]; then + # rollback_version + # return 0 + # fi + +check_ftp_params +check_system +require_agent_metadata + + if [[ "$ACTION" == "uninstall" ]]; then + uninstall_argus_metric + else + install_argus_metric + fi + + # 安装后自检:最多等待 5 分钟,确认 node.json 存在且健康 + echo + log_info "开始安装后自检(最多等待 5 分钟)..." + selfcheck_post_install || { + log_error "安装后自检未通过,请查看 /var/log/argus-agent.log 以及 /opt/argus-metric/versions/*/.install.log" + exit 1 + } + + echo + log_success "全部自检通过,安装完成!" +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/src/sys/build/node-bundle/node-bootstrap.sh b/src/sys/build/node-bundle/node-bootstrap.sh new file mode 100644 index 0000000..ab1a501 --- /dev/null +++ b/src/sys/build/node-bundle/node-bootstrap.sh @@ -0,0 +1,99 @@ +#!/usr/bin/env bash +set -euo pipefail + +echo "[BOOT] node bundle starting" + +INSTALL_DIR="/opt/argus-metric" +BUNDLE_DIR="/bundle" +installed_ok=0 + +# 1) already installed? +if [[ -L "$INSTALL_DIR/current" && -d "$INSTALL_DIR/current" ]]; then + echo "[BOOT] client already installed at $INSTALL_DIR/current" +else + # 2) try local bundle first (replicate setup.sh layout: move to /opt/argus-metric/versions/ and run install.sh) + tarball=$(ls -1 "$BUNDLE_DIR"/argus-metric_*.tar.gz 2>/dev/null | head -1 || true) + if [[ -n "${tarball:-}" ]]; then + echo "[BOOT] installing from local bundle: $(basename "$tarball")" + tmp=$(mktemp -d) + tar -xzf "$tarball" -C "$tmp" + # locate root containing version.json + root="$tmp" + if [[ ! 
-f "$root/version.json" ]]; then + sub=$(find "$tmp" -mindepth 1 -maxdepth 1 -type d | head -n1 || true) + [[ -n "$sub" && -f "$sub/version.json" ]] && root="$sub" + fi + if [[ ! -f "$root/version.json" ]]; then + echo "[BOOT][WARN] version.json not found in bundle; fallback to FTP" + else + ver=$(sed -n 's/.*"version"\s*:\s*"\([^"]\+\)".*/\1/p' "$root/version.json" | head -n1) + if [[ -z "$ver" ]]; then + echo "[BOOT][WARN] failed to parse version from version.json; fallback to FTP" + else + target_root="/opt/argus-metric" + version_dir="$target_root/versions/$ver" + mkdir -p "$version_dir" + # move contents into version dir + shopt -s dotglob + mv "$root"/* "$version_dir/" 2>/dev/null || true + shopt -u dotglob + # run component installer within version dir + if [[ -f "$version_dir/install.sh" ]]; then + chmod +x "$version_dir/install.sh" 2>/dev/null || true + (cd "$version_dir" && ./install.sh "$version_dir") + echo "$ver" > "$target_root/LATEST_VERSION" 2>/dev/null || true + ln -sfn "$version_dir" "$target_root/current" 2>/dev/null || true + if [[ -L "$target_root/current" && -d "$target_root/current" ]]; then + installed_ok=1 + echo "[BOOT] local bundle install OK: version=$ver" + else + echo "[BOOT][WARN] current symlink not present after install; will rely on healthcheck to confirm" + fi + else + echo "[BOOT][WARN] install.sh missing under $version_dir; fallback to FTP" + fi + fi + fi + fi + + # 3) fallback: use FTP setup if not installed + if [[ ! 
-L "$INSTALL_DIR/current" && "$installed_ok" -eq 0 ]]; then + echo "[BOOT] fallback to FTP setup" + if [[ -z "${FTPIP:-}" || -z "${FTP_USER:-}" || -z "${FTP_PASSWORD:-}" ]]; then + echo "[BOOT][ERROR] FTP variables not set (FTPIP/FTP_USER/FTP_PASSWORD)" >&2 + exit 1 + fi + curl -u "$FTP_USER:$FTP_PASSWORD" -fsSL "ftp://$FTPIP:21/setup.sh" -o /tmp/setup.sh + chmod +x /tmp/setup.sh + /tmp/setup.sh --server "$FTPIP" --user "$FTP_USER" --password "$FTP_PASSWORD" --port 21 + fi +fi + +# 4) ensure agent is running; start if needed (inherits env: MASTER_ENDPOINT/AGENT_*) +if ! pgrep -x argus-agent >/dev/null 2>&1; then + echo "[BOOT] starting argus-agent (not detected)" + setsid /usr/local/bin/argus-agent >/var/log/argus-agent.log 2>&1 < /dev/null & +fi + +# 5) post-install selfcheck (best-effort) and wait for node.json +for i in {1..30}; do + if compgen -G "$INSTALL_DIR/versions/*/check_health.sh" > /dev/null; then + bash "$INSTALL_DIR"/versions/*/check_health.sh || true + break + fi + sleep 2 +done + +host="$(hostname)" +state_dir="/private/argus/agent/${host}" +mkdir -p "$state_dir" 2>/dev/null || true +for i in {1..60}; do + if [[ -s "$state_dir/node.json" ]]; then + echo "[BOOT] node state present: $state_dir/node.json" + break + fi + sleep 2 +done + +echo "[BOOT] ready; entering sleep" +exec sleep infinity diff --git a/src/sys/swarm_tests/.env b/src/sys/swarm_tests/.env new file mode 100644 index 0000000..ca39819 --- /dev/null +++ b/src/sys/swarm_tests/.env @@ -0,0 +1,21 @@ +SERVER_PROJECT=argus-swarm-server +NODES_PROJECT=argus-swarm-nodes + +# Host ports for server compose +MASTER_PORT=32300 +ES_HTTP_PORT=9200 +KIBANA_PORT=5601 +PROMETHEUS_PORT=9090 +GRAFANA_PORT=3000 +ALERTMANAGER_PORT=9093 +WEB_PROXY_PORT_8080=8080 +WEB_PROXY_PORT_8081=8081 +WEB_PROXY_PORT_8082=8082 +WEB_PROXY_PORT_8083=8083 +WEB_PROXY_PORT_8084=8084 +WEB_PROXY_PORT_8085=8085 + +# UID/GID for volume ownership in containers +ARGUS_BUILD_UID=1000 +ARGUS_BUILD_GID=1000 + diff --git 
a/src/sys/swarm_tests/.env.example b/src/sys/swarm_tests/.env.example new file mode 100644 index 0000000..9287dda --- /dev/null +++ b/src/sys/swarm_tests/.env.example @@ -0,0 +1,21 @@ +SERVER_PROJECT=argus-swarm-server +NODES_PROJECT=argus-swarm-nodes + +# Host ports for server compose +MASTER_PORT=32300 +ES_HTTP_PORT=9200 +KIBANA_PORT=5601 +PROMETHEUS_PORT=9090 +GRAFANA_PORT=3000 +ALERTMANAGER_PORT=9093 +WEB_PROXY_PORT_8080=8080 +WEB_PROXY_PORT_8081=8081 +WEB_PROXY_PORT_8082=8082 +WEB_PROXY_PORT_8083=8083 +WEB_PROXY_PORT_8084=8084 +WEB_PROXY_PORT_8085=8085 + +# UID/GID for volume ownership in containers +ARGUS_BUILD_UID=2133 +ARGUS_BUILD_GID=2015 + diff --git a/src/sys/swarm_tests/.env.nodes b/src/sys/swarm_tests/.env.nodes new file mode 100644 index 0000000..58b8a01 --- /dev/null +++ b/src/sys/swarm_tests/.env.nodes @@ -0,0 +1,8 @@ +BINDIP=10.0.1.5 +FTPIP=10.0.1.4 +MASTER_ENDPOINT=http://master.argus.com:3000 +FTP_USER=ftpuser +FTP_PASSWORD=ZGClab1234! +AGENT_ENV=dev2 +AGENT_USER=yuyr +AGENT_INSTANCE=node001sX diff --git a/src/sys/swarm_tests/README.md b/src/sys/swarm_tests/README.md new file mode 100644 index 0000000..0d82f33 --- /dev/null +++ b/src/sys/swarm_tests/README.md @@ -0,0 +1,52 @@ +# Swarm Tests (argus-sys-net) + +快速在本机用 Docker Swarm + overlay 网络验证“服务端 + 单节点”端到端部署。保持对 `src/sys/tests` 兼容,不影响现有桥接网络测试。 + +## 先决条件 +- Docker Engine 已启用 Swarm(脚本会自动 `swarm init` 单机模式)。 +- 已构建并加载以下镜像:`argus-bind9:latest`、`argus-master:latest`、`argus-elasticsearch:latest`、`argus-kibana:latest`、`argus-metric-ftp:latest`、`argus-metric-prometheus:latest`、`argus-metric-grafana:latest`、`argus-alertmanager:latest`、`argus-web-frontend:latest`、`argus-web-proxy:latest`、以及节点镜像 `argus-sys-metric-test-node-bundle:latest`(见下文)。 +- 本地 `UID/GID` 建议通过 `configs/build_user.local.conf` 指定,脚本会读取: + - `UID=1000`\n`GID=1000`(示例)。 + +## 构建节点 bundle 镜像 + +``` +./deployment/build/build_images.sh --with-node-bundle --client-version 20251106 +``` + +说明:`--client-version` 支持 `YYYYMMDD` 日期包或 `1.xx.yy` 
组件版本。打包完成后镜像 `argus-sys-metric-test-node-bundle:latest` 会内置 `argus-metric_*.tar.gz`,容器启动时优先从本地 bundle 安装。 + +## 运行步骤 + +``` +cd src/sys/swarm_tests +cp .env.example .env + +bash scripts/00_bootstrap.sh +bash scripts/01_server_up.sh +bash scripts/02_wait_ready.sh # 输出 BINDIP/FTPIP 到 .env.nodes +bash scripts/03_nodes_up.sh +bash scripts/04_metric_verify.sh +``` + +清理: + +``` +bash scripts/99_down.sh +``` + +## 说明与注意事项 +- `00_bootstrap.sh`:先加载 `scripts/common/build_user.sh`,打印并写入 `.env` 中的 `ARGUS_BUILD_UID/GID`,再准备 `private-server/` 与 `private-nodes/` 目录,并 `chown` 到对应 UID/GID。 +- `01_server_up.sh`:启动服务端 compose。可用 `SWARM_FIX_PERMS=1` 打开“容器内 chmod + supervisor 重启”的兜底逻辑,默认关闭。 +- `02_wait_ready.sh`:等待 Master/ES/Prom/Grafana 就绪(Kibana 可延迟),随后解析 overlay IP,写入 `.env.nodes` 的 `BINDIP/FTPIP`,供节点 compose 使用。 +- `03_nodes_up.sh`:启动单节点容器(bundle 版)。容器内 `node-bootstrap.sh` 优先本地安装,成功后执行健康检查并等待 `/private/argus/agent//node.json` 出现。 +- `04_metric_verify.sh`:在本套件内执行详细校验(不再直接调用 tests 脚本): + - Grafana `/api/health`(database=ok) + - Grafana 数据源指向 `prom.metric.argus.com:` 并在容器内可解析该域名 + - Prometheus `activeTargets` 全部 up + - `nodes.json` 不包含 `172.22/16`(docker_gwbridge) + +## 常见问题 +- Grafana/Kibana 启动报权限:检查 `configs/build_user.local.conf` 与 `00_bootstrap.sh` 的输出 UID/GID 是否一致;必要时设置 `SWARM_FIX_PERMS=1` 重新 `01_server_up.sh`。 +- 节点容器 fallback 到 FTP:通常为 bundle 结构异常或健康检查失败(早期脚本在 `sh` 下执行)。当前 `node-bootstrap.sh` 已使用 `bash` 执行健康检查,并在本地安装成功后跳过 FTP。 +- 代理 502:查看容器 `argus-web-proxy` 的 `/var/log/nginx/error.log` 与启动日志中 `upstream check` 行;若后端未就绪(尤其 Kibana),等待 `02_wait_ready.sh` 通过后再访问。 diff --git a/src/sys/swarm_tests/docker-compose.nodes.yml b/src/sys/swarm_tests/docker-compose.nodes.yml new file mode 100644 index 0000000..6c42cc2 --- /dev/null +++ b/src/sys/swarm_tests/docker-compose.nodes.yml @@ -0,0 +1,34 @@ +version: "3.8" + +networks: + argus-sys-net: + external: true + +services: + metric-test-node: + image: ${NODE_BUNDLE_IMAGE_TAG:-argus-sys-metric-test-node-bundle:latest} + container_name: 
argus-metric-test-node-swarm + hostname: ${NODE_HOSTNAME:-swarm-metric-node-001} + restart: unless-stopped + environment: + - TZ=Asia/Shanghai + - DEBIAN_FRONTEND=noninteractive + - MASTER_ENDPOINT=${MASTER_ENDPOINT:-http://master.argus.com:3000} + - ES_HOST=es.log.argus.com + - ES_PORT=9200 + - FTPIP=${FTPIP} + - BINDIP=${BINDIP} + - FTP_USER=${FTP_USER:-ftpuser} + - FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!} + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + - AGENT_ENV=${AGENT_ENV:-dev2} + - AGENT_USER=${AGENT_USER:-yuyr} + - AGENT_INSTANCE=${AGENT_INSTANCE:-node001sX} + - CLIENT_VERSION=${CLIENT_VERSION:-} + dns: + - ${BINDIP} + networks: [argus-sys-net] + volumes: + - ./private-nodes/argus/agent:/private/argus/agent + command: ["sleep", "infinity"] diff --git a/src/sys/swarm_tests/docker-compose.server.yml b/src/sys/swarm_tests/docker-compose.server.yml new file mode 100644 index 0000000..a05b070 --- /dev/null +++ b/src/sys/swarm_tests/docker-compose.server.yml @@ -0,0 +1,174 @@ +version: "3.8" + +networks: + argus-sys-net: + external: true + +services: + bind: + image: ${BIND_IMAGE_TAG:-argus-bind9:latest} + container_name: argus-bind-sys + networks: [argus-sys-net] + volumes: + - ./private-server:/private + restart: unless-stopped + + master: + image: ${MASTER_IMAGE_TAG:-argus-master:latest} + container_name: argus-master-sys + depends_on: [bind] + environment: + - OFFLINE_THRESHOLD_SECONDS=6 + - ONLINE_THRESHOLD_SECONDS=2 + - SCHEDULER_INTERVAL_SECONDS=1 + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + ports: + - "${MASTER_PORT:-32300}:3000" + volumes: + - ./private-server/argus/master:/private/argus/master + - ./private-server/argus/metric/prometheus:/private/argus/metric/prometheus + - ./private-server/argus/etc:/private/argus/etc + networks: [argus-sys-net] + restart: unless-stopped + + es: + image: ${ES_IMAGE_TAG:-argus-elasticsearch:latest} + container_name: argus-es-sys 
+ environment: + - discovery.type=single-node + - xpack.security.enabled=false + - ES_JAVA_OPTS=-Xms512m -Xmx512m + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + volumes: + - ./private-server/argus/log/elasticsearch:/private/argus/log/elasticsearch + - ./private-server/argus/etc:/private/argus/etc + ports: + - "${ES_HTTP_PORT:-9200}:9200" + restart: unless-stopped + networks: [argus-sys-net] + + kibana: + image: ${KIBANA_IMAGE_TAG:-argus-kibana:latest} + container_name: argus-kibana-sys + environment: + - ELASTICSEARCH_HOSTS=http://es.log.argus.com:9200 + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + volumes: + - ./private-server/argus/log/kibana:/private/argus/log/kibana + - ./private-server/argus/etc:/private/argus/etc + depends_on: [es] + ports: + - "${KIBANA_PORT:-5601}:5601" + restart: unless-stopped + networks: [argus-sys-net] + + ftp: + image: ${FTP_IMAGE_TAG:-argus-metric-ftp:latest} + container_name: argus-ftp + restart: unless-stopped + environment: + - TZ=Asia/Shanghai + - FTP_BASE_PATH=/private/argus/ftp + - FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!} + - DOMAIN=${FTP_DOMAIN:-ftp.metric.argus.com} + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + ports: + - "${FTP_PORT:-21}:21" + - "${FTP_DATA_PORT:-20}:20" + - "${FTP_PASSIVE_HOST_RANGE:-21100-21110}:21100-21110" + volumes: + - ./private-server/argus/metric/ftp:/private/argus/ftp + - ./private-server/argus/etc:/private/argus/etc + networks: [argus-sys-net] + + prometheus: + image: ${PROM_IMAGE_TAG:-argus-metric-prometheus:latest} + container_name: argus-prometheus + restart: unless-stopped + environment: + - TZ=Asia/Shanghai + - PROMETHEUS_BASE_PATH=/private/argus/metric/prometheus + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + ports: + - "${PROMETHEUS_PORT:-9090}:9090" + volumes: + - 
./private-server/argus/metric/prometheus:/private/argus/metric/prometheus + - ./private-server/argus/etc:/private/argus/etc + networks: [argus-sys-net] + + grafana: + image: ${GRAFANA_IMAGE_TAG:-argus-metric-grafana:latest} + container_name: argus-grafana + restart: unless-stopped + environment: + - TZ=Asia/Shanghai + - GRAFANA_BASE_PATH=/private/argus/metric/grafana + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + - GF_SERVER_HTTP_PORT=3000 + - GF_LOG_LEVEL=warn + - GF_LOG_MODE=console + - GF_PATHS_PROVISIONING=/private/argus/metric/grafana/provisioning + - GF_AUTH_ANONYMOUS_ENABLED=true + - GF_AUTH_ANONYMOUS_ORG_ROLE=Viewer + ports: + - "${GRAFANA_PORT:-3000}:3000" + volumes: + - ./private-server/argus/metric/grafana:/private/argus/metric/grafana + - ./private-server/argus/etc:/private/argus/etc + depends_on: [prometheus] + networks: [argus-sys-net] + + alertmanager: + image: ${ALERT_IMAGE_TAG:-argus-alertmanager:latest} + container_name: argus-alertmanager + environment: + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + volumes: + - ./private-server/argus/etc:/private/argus/etc + - ./private-server/argus/alert/alertmanager:/private/argus/alert/alertmanager + networks: [argus-sys-net] + ports: + - "${ALERTMANAGER_PORT:-9093}:9093" + restart: unless-stopped + + web-frontend: + image: ${FRONT_IMAGE_TAG:-argus-web-frontend:latest} + container_name: argus-web-frontend + environment: + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + - EXTERNAL_MASTER_PORT=${WEB_PROXY_PORT_8085:-8085} + - EXTERNAL_ALERTMANAGER_PORT=${WEB_PROXY_PORT_8084:-8084} + - EXTERNAL_GRAFANA_PORT=${WEB_PROXY_PORT_8081:-8081} + - EXTERNAL_PROMETHEUS_PORT=${WEB_PROXY_PORT_8082:-8082} + - EXTERNAL_KIBANA_PORT=${WEB_PROXY_PORT_8083:-8083} + volumes: + - ./private-server/argus/etc:/private/argus/etc + networks: [argus-sys-net] + restart: unless-stopped + + web-proxy: + image: 
${WEB_PROXY_IMAGE_TAG:-argus-web-proxy:latest} + container_name: argus-web-proxy + depends_on: [bind, master, grafana, prometheus, kibana, alertmanager] + environment: + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + volumes: + - ./private-server/argus/etc:/private/argus/etc + networks: [argus-sys-net] + ports: + - "${WEB_PROXY_PORT_8080:-8080}:8080" + - "${WEB_PROXY_PORT_8081:-8081}:8081" + - "${WEB_PROXY_PORT_8082:-8082}:8082" + - "${WEB_PROXY_PORT_8083:-8083}:8083" + - "${WEB_PROXY_PORT_8084:-8084}:8084" + - "${WEB_PROXY_PORT_8085:-8085}:8085" + restart: unless-stopped diff --git a/src/sys/swarm_tests/scripts/00_bootstrap.sh b/src/sys/swarm_tests/scripts/00_bootstrap.sh new file mode 100755 index 0000000..27c4462 --- /dev/null +++ b/src/sys/swarm_tests/scripts/00_bootstrap.sh @@ -0,0 +1,101 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +REPO_ROOT="$(cd "$ROOT/../../.." && pwd)" + +ENV_FILE="$ROOT/.env"; [[ -f "$ENV_FILE" ]] || cp "$ROOT/.env.example" "$ENV_FILE" + +# Load build user (UID/GID) from repo config to match container runtime users +if [[ -f "$REPO_ROOT/scripts/common/build_user.sh" ]]; then + # shellcheck disable=SC1091 + source "$REPO_ROOT/scripts/common/build_user.sh" 2>/dev/null || true + if declare -f load_build_user >/dev/null 2>&1; then + load_build_user + fi +fi + +# Capture resolved UID/GID from build_user before sourcing .env +uid_resolved="${ARGUS_BUILD_UID:-2133}" +gid_resolved="${ARGUS_BUILD_GID:-2015}" +echo "[BOOT] resolved build user: UID=${uid_resolved} GID=${gid_resolved} (from scripts/common/build_user.sh or env)" + +# After resolving UID/GID, load .env for other settings; then we will overwrite UID/GID entries +set -a; source "$ENV_FILE"; set +a + +echo "[BOOT] checking Docker Swarm" +if ! 
docker info 2>/dev/null | grep -q "Swarm: active"; then + echo "[BOOT] initializing swarm (single-node)" + docker swarm init >/dev/null 2>&1 || true +fi + +NET_NAME=argus-sys-net +if docker network inspect "$NET_NAME" >/dev/null 2>&1; then + echo "[BOOT] overlay network exists: $NET_NAME" +else + echo "[BOOT] creating overlay network: $NET_NAME" + docker network create -d overlay --attachable "$NET_NAME" +fi + +echo "[BOOT] preparing private directories (server/nodes)" +# Server-side dirs (align with sys/tests 01_bootstrap.sh) +mkdir -p \ + "$ROOT/private-server/argus/etc" \ + "$ROOT/private-server/argus/bind" \ + "$ROOT/private-server/argus/master" \ + "$ROOT/private-server/argus/metric/prometheus" \ + "$ROOT/private-server/argus/metric/prometheus/data" \ + "$ROOT/private-server/argus/metric/prometheus/rules" \ + "$ROOT/private-server/argus/metric/prometheus/targets" \ + "$ROOT/private-server/argus/alert/alertmanager" \ + "$ROOT/private-server/argus/metric/ftp/share" \ + "$ROOT/private-server/argus/metric/grafana/data" \ + "$ROOT/private-server/argus/metric/grafana/logs" \ + "$ROOT/private-server/argus/metric/grafana/plugins" \ + "$ROOT/private-server/argus/metric/grafana/provisioning/datasources" \ + "$ROOT/private-server/argus/metric/grafana/provisioning/dashboards" \ + "$ROOT/private-server/argus/metric/grafana/data/sessions" \ + "$ROOT/private-server/argus/metric/grafana/data/dashboards" \ + "$ROOT/private-server/argus/metric/grafana/config" \ + "$ROOT/private-server/argus/agent" \ + "$ROOT/private-server/argus/log/elasticsearch" \ + "$ROOT/private-server/argus/log/kibana" + +mkdir -p "$ROOT/private-nodes/argus/agent" + +uid="$uid_resolved"; gid="$gid_resolved" +echo "[BOOT] chown -R ${uid}:${gid} for server core dirs (best-effort)" +chown -R "$uid":"$gid" \ + "$ROOT/private-server/argus/log/elasticsearch" \ + "$ROOT/private-server/argus/log/kibana" \ + "$ROOT/private-server/argus/metric/grafana" \ + "$ROOT/private-server/argus/metric/prometheus" \ + 
"$ROOT/private-server/argus/alert" \ + "$ROOT/private-server/argus/metric/ftp" \ + "$ROOT/private-server/argus/agent" \ + "$ROOT/private-server/argus/etc" 2>/dev/null || true + +# group-writable for etc/alert as in sys/tests +chmod -R g+w "$ROOT/private-server/argus/alert" "$ROOT/private-server/argus/etc" 2>/dev/null || true + +# ensure .env carries the resolved UID/GID for compose env interpolation +if grep -q '^ARGUS_BUILD_UID=' "$ENV_FILE"; then + sed -i "s/^ARGUS_BUILD_UID=.*/ARGUS_BUILD_UID=${uid}/" "$ENV_FILE" +else + echo "ARGUS_BUILD_UID=${uid}" >> "$ENV_FILE" +fi +if grep -q '^ARGUS_BUILD_GID=' "$ENV_FILE"; then + sed -i "s/^ARGUS_BUILD_GID=.*/ARGUS_BUILD_GID=${gid}/" "$ENV_FILE" +else + echo "ARGUS_BUILD_GID=${gid}" >> "$ENV_FILE" +fi + +# distribute update-dns.sh +BIND_UPDATE_SRC="$REPO_ROOT/src/bind/build/update-dns.sh" +BIND_UPDATE_DEST="$ROOT/private-server/argus/etc/update-dns.sh" +if [[ -f "$BIND_UPDATE_SRC" ]]; then + cp "$BIND_UPDATE_SRC" "$BIND_UPDATE_DEST" && chmod +x "$BIND_UPDATE_DEST" || true +fi + +echo "[BOOT] done" diff --git a/src/sys/swarm_tests/scripts/01_server_up.sh b/src/sys/swarm_tests/scripts/01_server_up.sh new file mode 100755 index 0000000..05895e3 --- /dev/null +++ b/src/sys/swarm_tests/scripts/01_server_up.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +REPO_ROOT="$(cd "$ROOT/../../.." 
&& pwd)" +ENV_FILE="$ROOT/.env" +# load UID/GID from repo config first (so they take precedence over any stale .env values) +if [[ -f "$REPO_ROOT/scripts/common/build_user.sh" ]]; then + # shellcheck disable=SC1091 + source "$REPO_ROOT/scripts/common/build_user.sh" 2>/dev/null || true + if declare -f load_build_user >/dev/null 2>&1; then + load_build_user + fi +fi +set -a; source "$ENV_FILE"; set +a + +PROJECT="${SERVER_PROJECT:-argus-swarm-server}" +COMPOSE_FILE="$ROOT/docker-compose.server.yml" + +echo "[SERVER] starting compose project: $PROJECT" +docker compose -p "$PROJECT" -f "$COMPOSE_FILE" up -d + +echo "[SERVER] containers:"; docker compose -p "$PROJECT" -f "$COMPOSE_FILE" ps + +# Optional post-start permission alignment (disabled by default). Enable with SWARM_FIX_PERMS=1 +if [[ "${SWARM_FIX_PERMS:-0}" == "1" ]]; then + echo "[SERVER] aligning permissions in containers (best-effort)" + for c in argus-master-sys argus-prometheus argus-grafana argus-ftp argus-es-sys argus-kibana-sys argus-web-frontend argus-web-proxy argus-alertmanager; do + docker exec "$c" sh -lc 'mkdir -p /private/argus && chmod -R 777 /private/argus' 2>/dev/null || true + done + echo "[SERVER] restarting selected supervised programs to pick up new permissions" + docker exec argus-prometheus sh -lc 'supervisorctl restart prometheus targets-updater >/dev/null 2>&1 || true' || true + docker exec argus-grafana sh -lc 'rm -f /private/argus/etc/grafana.metric.argus.com 2>/dev/null || true; supervisorctl restart grafana >/dev/null 2>&1 || true' || true + docker exec argus-es-sys sh -lc 'supervisorctl restart elasticsearch >/dev/null 2>&1 || true' || true + docker exec argus-kibana-sys sh -lc 'supervisorctl restart kibana >/dev/null 2>&1 || true' || true +fi + +echo "[SERVER] done" diff --git a/src/sys/swarm_tests/scripts/02_wait_ready.sh b/src/sys/swarm_tests/scripts/02_wait_ready.sh new file mode 100755 index 0000000..7ab0685 --- /dev/null +++ b/src/sys/swarm_tests/scripts/02_wait_ready.sh @@ 
-0,0 +1,84 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +ENV_FILE="$ROOT/.env"; set -a; source "$ENV_FILE"; set +a + +PROJECT="${SERVER_PROJECT:-argus-swarm-server}" +RETRIES=${RETRIES:-60} +SLEEP=${SLEEP:-5} + +code() { curl -4 -s -o /dev/null -w "%{http_code}" "$1" || echo 000; } +prom_ok() { + # Consider ready if TCP:9090 is accepting on localhost (host side) + (exec 3<>/dev/tcp/127.0.0.1/${PROMETHEUS_PORT:-9090}) >/dev/null 2>&1 && return 0 + return 1 +} + +echo "[READY] waiting services (max $((RETRIES*SLEEP))s)" +for i in $(seq 1 "$RETRIES"); do + e1=$(code "http://127.0.0.1:${MASTER_PORT:-32300}/readyz") + e2=$(code "http://127.0.0.1:${ES_HTTP_PORT:-9200}/_cluster/health") + e3=000 + if prom_ok; then e3=200; fi + e4=$(code "http://127.0.0.1:${GRAFANA_PORT:-3000}/api/health") + e5=$(code "http://127.0.0.1:${KIBANA_PORT:-5601}/api/status") + ok=0 + [[ "$e1" == 200 ]] && ok=$((ok+1)) + [[ "$e2" == 200 ]] && ok=$((ok+1)) + [[ "$e3" == 200 ]] && ok=$((ok+1)) + [[ "$e4" == 200 ]] && ok=$((ok+1)) + # Kibana 可放宽,等其它四项即可 + if [[ $ok -ge 4 ]]; then echo "[READY] base services OK"; break; fi + echo "[..] 
waiting ($i/$RETRIES): master=$e1 es=$e2 prom=$e3 graf=$e4 kibana=$e5"; sleep "$SLEEP" +done + +if [[ $ok -lt 4 ]]; then echo "[ERROR] services not ready" >&2; exit 1; fi + +echo "[READY] resolving overlay IPs" +BINDIP=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' argus-bind-sys) +FTPIP=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' argus-ftp) +echo "BINDIP=$BINDIP FTPIP=$FTPIP" + +ENV_NODES="$ROOT/.env.nodes" +cat > "$ENV_NODES" < actual overlay IPs and reload bind/nginx (best-effort) +echo "[READY] fixing domain records to overlay IPs" +ETC_DIR="$ROOT/private-server/argus/etc"; mkdir -p "$ETC_DIR" +declare -A MAP +MAP[web-frontend]=web.argus.com +MAP[argus-grafana]=grafana.metric.argus.com +MAP[argus-prometheus]=prom.metric.argus.com +MAP[argus-kibana-sys]=kibana.log.argus.com +MAP[argus-alertmanager]=alertmanager.alert.argus.com +MAP[argus-master-sys]=master.argus.com +changed=0 +for cname in "${!MAP[@]}"; do + domain="${MAP[$cname]}"; fpath="$ETC_DIR/$domain" + ip=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' "$cname" 2>/dev/null || true) + [[ -z "$ip" ]] && { echo "[DNS-FIX][WARN] $domain: container $cname no overlay IP yet"; continue; } + cur=$(cat "$fpath" 2>/dev/null || echo "") + if [[ "$cur" != "$ip" ]]; then + echo "$ip" > "$fpath"; echo "[DNS-FIX][SET] $domain = $ip (was: ${cur:-})"; changed=1 + else + echo "[DNS-FIX][OK] $domain already $ip" + fi +done +if [[ $changed -eq 1 ]]; then + docker exec argus-bind-sys /usr/local/bin/reload-bind9.sh >/dev/null 2>&1 || true + sleep 1 +fi +docker exec argus-web-proxy nginx -s reload >/dev/null 2>&1 || true diff --git a/src/sys/swarm_tests/scripts/03_nodes_up.sh b/src/sys/swarm_tests/scripts/03_nodes_up.sh new file mode 100755 index 0000000..8d4b4b8 --- /dev/null +++ b/src/sys/swarm_tests/scripts/03_nodes_up.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd 
"$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +ENV_FILE="$ROOT/.env"; set -a; source "$ENV_FILE"; set +a +ENV_NODES_FILE="$ROOT/.env.nodes"; set -a; source "$ENV_NODES_FILE"; set +a + +PROJECT="${NODES_PROJECT:-argus-swarm-nodes}" +COMPOSE_FILE="$ROOT/docker-compose.nodes.yml" + +echo "[NODES] starting compose project: $PROJECT" +docker compose -p "$PROJECT" --env-file "$ENV_NODES_FILE" -f "$COMPOSE_FILE" up -d +docker compose -p "$PROJECT" -f "$COMPOSE_FILE" ps +echo "[NODES] done" + diff --git a/src/sys/swarm_tests/scripts/04_metric_verify.sh b/src/sys/swarm_tests/scripts/04_metric_verify.sh new file mode 100755 index 0000000..ce2a162 --- /dev/null +++ b/src/sys/swarm_tests/scripts/04_metric_verify.sh @@ -0,0 +1,173 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +ENV_FILE="$ROOT/.env"; [[ -f "$ENV_FILE" ]] && { set -a; source "$ENV_FILE"; set +a; } + +PROM_PORT="${PROMETHEUS_PORT:-9090}" +GRAF_PORT="${GRAFANA_PORT:-3000}" +GRAF_URL="http://127.0.0.1:${GRAF_PORT}" +PROM_DOMAIN="prom.metric.argus.com:${PROM_PORT}" + +err() { echo "[ERR] $*" >&2; } +ok() { echo "[OK] $*"; } +info(){ echo "[INFO] $*"; } + +fail() { err "$*"; exit 1; } + +# Ensure fluent-bit is installed, configured and running to ship logs to ES +# Best-effort remediation for swarm_tests only (does not change repo sources) +ensure_fluentbit() { + local cname="$1" + # 1) ensure process exists or try local bundle installer + if ! 
docker exec "$cname" pgrep -x fluent-bit >/dev/null 2>&1; then + docker exec "$cname" bash -lc ' + set -e + root=/opt/argus-metric/versions + ver=$(ls -1 "$root" 2>/dev/null | sort -Vr | head -1 || true) + [[ -z "$ver" ]] && ver=1.42.0 + verdir="$root/$ver" + tb=$(ls -1 "$verdir"/fluent-bit-*.tar.gz 2>/dev/null | head -1 || true) + if [ -n "$tb" ]; then tmp=$(mktemp -d); tar -xzf "$tb" -C "$tmp"; sub=$(find "$tmp" -mindepth 1 -maxdepth 1 -type d | head -n1 || true); [ -n "$sub" ] && (cd "$sub" && ./install.sh "$verdir") || true; fi + ' >/dev/null 2>&1 || true + fi + # 2) patch configs using literal placeholders with safe delimiter + docker exec "$cname" bash -lc ' + set -e + f=/etc/fluent-bit/fluent-bit.conf + o=/etc/fluent-bit/outputs.d/10-es.conf + LCL="\${CLUSTER}"; LRA="\${RACK}"; LHN="\${HOSTNAME}"; EH="\${ES_HOST:-localhost}"; EP="\${ES_PORT:-9200}" + # record_modifier placeholders + if grep -q "Record cluster $LCL" "$f"; then sed -i "s|Record cluster $LCL|Record cluster local|" "$f"; fi + if grep -q "Record rack $LRA" "$f"; then sed -i "s|Record rack $LRA|Record rack dev|" "$f"; fi + if grep -q "Record host $LHN" "$f"; then hn=$(hostname); sed -i "s|Record host $LHN|Record host ${hn}|" "$f"; fi + # outputs placeholders + if [ -f "$o" ] && (grep -q "$EH" "$o" || grep -q "$EP" "$o"); then + sed -i "s|Host $EH|Host es.log.argus.com|g; s|Port $EP|Port 9200|g" "$o" + fi + ' >/dev/null 2>&1 || true + # 3) restart fluent-bit (best-effort) and wait + docker exec "$cname" bash -lc 'pkill -x fluent-bit >/dev/null 2>&1 || true; sleep 1; setsid su -s /bin/bash fluent-bit -c "/opt/fluent-bit/bin/fluent-bit --config=/etc/fluent-bit/fluent-bit.conf >> /var/log/fluent-bit.log 2>&1" &>/dev/null & echo ok' >/dev/null 2>&1 || true + for i in {1..10}; do if docker exec "$cname" pgrep -x fluent-bit >/dev/null 2>&1; then return 0; fi; sleep 1; done + echo "[WARN] fluent-bit not confirmed running; log pipeline may not ingest" >&2 +} + +# ---- Grafana /api/health ---- +info 
"Grafana /api/health"
+HEALTH_JSON="$ROOT/tmp/metric-verify/graf_health.json"
+mkdir -p "$(dirname "$HEALTH_JSON")"
+code=$(curl -fsS -o "$HEALTH_JSON" -w '%{http_code}' --max-time 10 "$GRAF_URL/api/health" || true)
+[[ "$code" == 200 ]] || fail "/api/health HTTP $code"
+if grep -q '"database"\s*:\s*"ok"' "$HEALTH_JSON"; then ok "grafana health database=ok"; else fail "grafana health not ok: $(cat "$HEALTH_JSON")"; fi
+
+# ---- Grafana datasource points to prom domain ----
+info "Grafana datasource URL uses domain: $PROM_DOMAIN"
+DS_FILE="/private/argus/metric/grafana/provisioning/datasources/datasources.yml"
+if ! docker exec argus-grafana sh -lc "test -f $DS_FILE" >/dev/null 2>&1; then
+  DS_FILE="/etc/grafana/provisioning/datasources/datasources.yml"
+fi
+docker exec argus-grafana sh -lc "grep -E 'url:\s*http://$PROM_DOMAIN' '$DS_FILE'" >/dev/null 2>&1 || fail "datasource not pointing to $PROM_DOMAIN"
+ok "datasource points to domain"
+
+# ---- DNS resolution inside grafana ----
+info "bind resolution inside grafana"
+tries=0
+until docker exec argus-grafana getent hosts prom.metric.argus.com >/dev/null 2>&1; do
+  tries=$((tries+1)); (( tries > 24 )) && fail "grafana cannot resolve prom.metric.argus.com"
+  echo "[..] waiting DNS propagation in grafana ($tries/24)"; sleep 5
+done
+ok "domain resolves"
+
+# ---- Prometheus activeTargets down check ----
+info "Prometheus activeTargets health"
+targets_json="$ROOT/tmp/metric-verify/prom_targets.json"
+curl -fsS "http://127.0.0.1:${PROM_PORT}/api/v1/targets" -o "$targets_json" || { echo "[WARN] fetch targets failed" >&2; }
+down_all=""
+if command -v jq >/dev/null 2>&1; then
+  down_all=$(jq -r '.data.activeTargets[] | select(.health=="down") | .scrapeUrl' "$targets_json" 2>/dev/null || true)
+else
+  # no jq: grep cannot pair each scrapeUrl with its health, so only flag that some target is down
+  grep -q '"health":"down"' "$targets_json" && down_all="(one or more targets down)"
+fi
+# ignore dcgm-exporter(9400) and tolerate node-exporter(9100) in swarm tests
+down_filtered=$(echo "$down_all" | grep -Ev ':(9400|9100)/' || true)
+if [[ -n "$down_filtered" ]]; then
+  err "prometheus down targets (filtered):"; echo "$down_filtered" >&2
+else
+  ok "prometheus targets up (ignoring :9100 and :9400)"
+fi
+
+# ---- nodes.json sanity: avoid 172.22/16 (gwbridge) ----
+nodes_json="$ROOT/private-server/argus/metric/prometheus/nodes.json"
+if [[ -f "$nodes_json" ]] && grep -q '"ip"\s*:\s*"172\.22\.' "$nodes_json"; then
+  fail "nodes.json contains 172.22/16 addresses (gwbridge)"
+fi
+ok "nodes.json IPs look fine"
+
+echo "[DONE] metric verify"
+
+# ---- Log pipeline smoke test (adapted from sys/tests 07) ----
+info "Log pipeline: send logs in node container and assert ES counts"
+
+ES_PORT="${ES_HTTP_PORT:-9200}"
+KIBANA_PORT="${KIBANA_PORT:-5601}"
+
+get_count() {
+  local idx="$1"; local tmp; tmp=$(mktemp)
+  local code
+  code=$(curl -s -o "$tmp" -w "%{http_code}" "http://127.0.0.1:${ES_PORT}/${idx}/_count?ignore_unavailable=true&allow_no_indices=true" || true)
+  if [[ "$code" == "200" ]]; then
+    local val
+    val=$(jq -r '(.count // 0) | tonumber? // 0' "$tmp" 2>/dev/null || echo 0)
+    echo "$val"
+  else
+    echo 0
+  fi
+  rm -f "$tmp"
+}
+
+train0=$(get_count "train-*")
+infer0=$(get_count "infer-*")
+base=$((train0 + infer0))
+info "initial ES counts: train=${train0} infer=${infer0} total=${base}"
+
+send_logs() {
+  local cname="$1"; local hosttag="$2"
+  docker exec "$cname" sh -lc 'mkdir -p /logs/train /logs/infer'
+  docker exec "$cname" sh -lc "ts=\$(date '+%F %T'); echo \"\$ts INFO [$hosttag] training step=1 loss=1.23 model=bert\" >> /logs/train/train-demo.log"
+  docker exec "$cname" sh -lc "ts=\$(date '+%F %T'); echo \"\$ts INFO [$hosttag] training step=2 loss=1.10 model=bert\" >> /logs/train/train-demo.log"
+  docker exec "$cname" sh -lc "ts=\$(date '+%F %T'); echo \"\$ts WARN [$hosttag] inference slow on batch=2 latency=1.9s\" >> /logs/infer/infer-demo.log"
+}
+
+NODE_CONT="${SWARM_NODE_CNAME:-argus-metric-test-node-swarm}"
+ensure_fluentbit "$NODE_CONT"
+send_logs "$NODE_CONT" "swarm-node"
+
+info "waiting for ES to ingest..."
+curl -s -X POST "http://127.0.0.1:${ES_PORT}/train-*/_refresh" >/dev/null 2>&1 || true
+curl -s -X POST "http://127.0.0.1:${ES_PORT}/infer-*/_refresh" >/dev/null 2>&1 || true
+
+final=0; threshold=3
+for attempt in {1..60}; do
+  train1=$(get_count "train-*"); infer1=$(get_count "infer-*"); final=$((train1 + infer1))
+  if (( final > base && final >= threshold )); then break; fi
+  echo "[..] 
waiting ES counts increase to >=${threshold} ($attempt/60) current=${final} base=${base}"; \
+    curl -s -X POST "http://127.0.0.1:${ES_PORT}/train-*/_refresh" >/dev/null 2>&1 || true; \
+    curl -s -X POST "http://127.0.0.1:${ES_PORT}/infer-*/_refresh" >/dev/null 2>&1 || true; \
+    sleep 2
+done
+info "final ES counts: train=${train1} infer=${infer1} total=${final}"
+
+(( final > base )) || fail "ES total did not increase (${base} -> ${final})"
+(( final >= threshold )) || fail "ES total below expected threshold: ${final} < ${threshold}"
+
+es_health=$(curl -s "http://127.0.0.1:${ES_PORT}/_cluster/health" | grep -o '"status":"[^\"]*"' | cut -d'"' -f4)
+[[ "$es_health" == green || "$es_health" == yellow ]] || fail "ES health not green/yellow: $es_health"
+
+if ! curl -fs "http://127.0.0.1:${KIBANA_PORT}/api/status" >/dev/null 2>&1; then
+  echo "[WARN] Kibana status endpoint not available" >&2
+fi
+
+ok "log pipeline verified"
diff --git a/src/sys/swarm_tests/scripts/99_down.sh b/src/sys/swarm_tests/scripts/99_down.sh
new file mode 100755
index 0000000..95d8392
--- /dev/null
+++ b/src/sys/swarm_tests/scripts/99_down.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+ENV_FILE="$ROOT/.env"; if [[ -f "$ENV_FILE" ]]; then set -a; source "$ENV_FILE"; set +a; fi  # teardown stays best-effort when .env is missing
+
+echo "[DOWN] stopping nodes compose"
+docker compose -p "${NODES_PROJECT:-argus-swarm-nodes}" -f "$ROOT/docker-compose.nodes.yml" down --remove-orphans || true
+
+echo "[DOWN] stopping server compose"
+docker compose -p "${SERVER_PROJECT:-argus-swarm-server}" -f "$ROOT/docker-compose.server.yml" down --remove-orphans || true
+
+echo "[DOWN] removing overlay network"
+docker network rm argus-sys-net >/dev/null 2>&1 || true
+
+echo "[DOWN] cleanup temp files"
+rm -rf "$ROOT/private-server/tmp" "$ROOT/private-nodes/tmp" 2>/dev/null || true
+
+echo "[DOWN] done"
+
diff --git a/src/sys/swarm_tests/tmp/metric-verify.graf_health.json b/src/sys/swarm_tests/tmp/metric-verify.graf_health.json
new file mode 100644
index 0000000..41e9747
--- /dev/null
+++ b/src/sys/swarm_tests/tmp/metric-verify.graf_health.json
@@ -0,0 +1,5 @@
+{
+  "commit": "5b85c4c2fcf5d32d4f68aaef345c53096359b2f1",
+  "database": "ok",
+  "version": "11.1.0"
+}
\ No newline at end of file
diff --git a/src/sys/swarm_tests/tmp/metric-verify/graf_health.json b/src/sys/swarm_tests/tmp/metric-verify/graf_health.json
new file mode 100644
index 0000000..41e9747
--- /dev/null
+++ b/src/sys/swarm_tests/tmp/metric-verify/graf_health.json
@@ -0,0 +1,5 @@
+{
+  "commit": "5b85c4c2fcf5d32d4f68aaef345c53096359b2f1",
+  "database": "ok",
+  "version": "11.1.0"
+}
\ No newline at end of file
diff --git a/src/sys/swarm_tests/tmp/metric-verify/prom_targets.json b/src/sys/swarm_tests/tmp/metric-verify/prom_targets.json
new file mode 100644
index 0000000..4adff0a
--- /dev/null
+++ b/src/sys/swarm_tests/tmp/metric-verify/prom_targets.json
@@ -0,0 +1 @@
+{"status":"success","data":{"activeTargets":[{"discoveredLabels":{"__address__":"10.0.1.13:9400","__meta_filepath":"/private/argus/metric/prometheus/targets/dcgm_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.13","job":"dcgm","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.13","job":"dcgm","node_id":"A1","user_id":"yuyr"},"scrapePool":"dcgm","scrapeUrl":"http://10.0.1.13:9400/metrics","globalUrl":"http://10.0.1.13:9400/metrics","lastError":"","lastScrape":"2025-11-06T16:36:25.585236213+08:00","lastScrapeDuration":0.002520163,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"},{"discoveredLabels":{"__address__":"10.0.1.13:9100","__meta_filepath":"/private/argus/metric/prometheus/targets/node_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.13","job":"node","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.13","job":"node","node_id":"A1","user_id":"yuyr"},"scrapePool":"node","scrapeUrl":"http://10.0.1.13:9100/metrics","globalUrl":"http://10.0.1.13:9100/metrics","lastError":"","lastScrape":"2025-11-06T16:36:33.694723606+08:00","lastScrapeDuration":0.021800606,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"}],"droppedTargets":[],"droppedTargetCounts":{"dcgm":0,"node":0}}} \ No newline at end of file diff --git a/src/sys/swarm_tests/tmp/targets.json b/src/sys/swarm_tests/tmp/targets.json new file mode 100644 index 0000000..7be6783 --- /dev/null +++ b/src/sys/swarm_tests/tmp/targets.json @@ -0,0 +1 @@ 
+{"status":"success","data":{"activeTargets":[{"discoveredLabels":{"__address__":"10.0.1.15:9400","__meta_filepath":"/private/argus/metric/prometheus/targets/dcgm_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.15","job":"dcgm","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.15","job":"dcgm","node_id":"A1","user_id":"yuyr"},"scrapePool":"dcgm","scrapeUrl":"http://10.0.1.15:9400/metrics","globalUrl":"http://10.0.1.15:9400/metrics","lastError":"","lastScrape":"2025-11-06T15:47:37.200098366+08:00","lastScrapeDuration":0.001361528,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"},{"discoveredLabels":{"__address__":"10.0.1.15:9100","__meta_filepath":"/private/argus/metric/prometheus/targets/node_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.15","job":"node","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.15","job":"node","node_id":"A1","user_id":"yuyr"},"scrapePool":"node","scrapeUrl":"http://10.0.1.15:9100/metrics","globalUrl":"http://10.0.1.15:9100/metrics","lastError":"","lastScrape":"2025-11-06T15:47:40.184367879+08:00","lastScrapeDuration":0.02923333,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"}],"droppedTargets":[],"droppedTargetCounts":{"dcgm":0,"node":0}}} \ No newline at end of file diff --git a/src/web/build_tools/proxy/start-proxy-supervised.sh b/src/web/build_tools/proxy/start-proxy-supervised.sh index 4b74035..95b1092 100644 --- a/src/web/build_tools/proxy/start-proxy-supervised.sh +++ b/src/web/build_tools/proxy/start-proxy-supervised.sh @@ -92,6 +92,20 @@ while :; do WAITED=$((WAITED+1)) done 
+# Quick upstream reachability snapshot (best-effort; does not block startup; 000 = unreachable)
+declare -a _UPSTREAMS=(
+  "http://web.argus.com:8080/"
+  "http://grafana.metric.argus.com:3000/api/health"
+  "http://prom.metric.argus.com:9090/-/ready"
+  "http://kibana.log.argus.com:5601/api/status"
+  "http://alertmanager.alert.argus.com:9093/api/v2/status"
+  "http://master.argus.com:3000/readyz"
+)
+for u in "${_UPSTREAMS[@]}"; do
+  code=$(curl -4 -s -o /dev/null -w "%{http_code}" "$u") || code=000
+  echo "[INFO] upstream check: $u -> $code"
+done
+
 echo "[INFO] Launching nginx..."
 
 # 启动 nginx 前台模式