[#37] Add sys/swarm_tests (CPU); separately built node-bundle image
This commit is contained in:
parent 94b3e910b3
commit d1fad4a05a
98	deployment/build/build_images.sh	Executable file
@@ -0,0 +1,98 @@
#!/usr/bin/env bash
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
. "$ROOT_DIR/deployment/build/common.sh"

usage() {
  cat <<EOF
Build Argus images (optional node-bundle)

Usage: build_images.sh [--with-node-bundle] [--client-version YYYYMMDD] [--base-image NAME[:TAG]]

Examples:
  ./deployment/build/build_images.sh --with-node-bundle --client-version 20251106
EOF
}

WITH_BUNDLE=false
CLIENT_VERSION=""
BASE_IMAGE="argus-sys-metric-test-node:latest"
while [[ $# -gt 0 ]]; do
  case "$1" in
    --with-node-bundle) WITH_BUNDLE=true; shift;;
    --client-version) CLIENT_VERSION="$2"; shift 2;;
    --base-image) BASE_IMAGE="$2"; shift 2;;
    -h|--help) usage; exit 0;;
    *) err "unknown arg: $1"; usage; exit 1;;
  esac
done

if [[ "$WITH_BUNDLE" == true ]]; then
  require_cmd docker tar gzip
  BUNDLE_DIR="$ROOT_DIR/src/sys/build/node-bundle"
  CTX_DIR="$BUNDLE_DIR"
  TMP_BUNDLE="$BUNDLE_DIR/bundle"
  rm -rf "$TMP_BUNDLE"; mkdir -p "$TMP_BUNDLE"

  # Build or locate client artifact
  PLUGIN_DIR="$ROOT_DIR/src/metric/client-plugins/all-in-one-full"
  # CLIENT_VERSION accepts two forms:
  # - an artifact version such as 1.42.0 (default)
  # - a packaging date such as YYYYMMDD; the inner artifact version is then resolved
  #   from deployment/artifact/client/
  if [[ -z "$CLIENT_VERSION" ]]; then
    pushd "$PLUGIN_DIR" >/dev/null
    bash scripts/package_artifact.sh --force
    CLIENT_VERSION=$(cat artifact/*/version.json 2>/dev/null | sed -n 's/.*"version"\s*:\s*"\([^"]\+\)".*/\1/p' | tail -n1)
    popd >/dev/null
    [[ -n "$CLIENT_VERSION" ]] || { err "failed to detect client version"; exit 1; }
  else
    if [[ "$CLIENT_VERSION" =~ ^[0-9]{8}$ ]]; then
      PKG_DIR="$ROOT_DIR/deployment/artifact/client/$CLIENT_VERSION"
      TAR_PKG="$PKG_DIR/argus-metric_${CLIENT_VERSION}.tar.gz"
      [[ -f "$TAR_PKG" ]] || { err "client date package not found: $TAR_PKG"; exit 1; }
      # Unpack to read the inner version.json
      tmpd=$(mktemp -d); trap 'rm -rf "$tmpd"' EXIT
      tar -xzf "$TAR_PKG" -C "$tmpd"
      if [[ -f "$tmpd/version.json" ]]; then
        ART_VER=$(sed -n 's/.*"version"\s*:\s*"\([^"]\+\)".*/\1/p' "$tmpd/version.json" | head -n1)
        [[ -n "$ART_VER" ]] || { err "failed to parse artifact version from date package"; exit 1; }
        CLIENT_VERSION="$ART_VER"
        # Use this tar directly as the bundle source
        cp "$TAR_PKG" "$TMP_BUNDLE/argus-metric_$(echo "$ART_VER" | tr '.' '_').tar.gz"
        # Also copy setup.sh if present
        [[ -f "$PKG_DIR/setup.sh" ]] && cp "$PKG_DIR/setup.sh" "$TMP_BUNDLE/" || true
      else
        err "version.json missing in client date package"
        exit 1
      fi
    else
      # Assume an artifact version directory
      pushd "$PLUGIN_DIR" >/dev/null
      [[ -d "artifact/$CLIENT_VERSION" ]] || bash scripts/package_artifact.sh --force
      popd >/dev/null
    fi
  fi

  # If the tar was not staged from a date package, take it from the plugin artifact directory
  TAR_NAME="argus-metric_$(echo "$CLIENT_VERSION" | tr '.' '_').tar.gz"
  if [[ ! -f "$TMP_BUNDLE/$TAR_NAME" ]]; then
    SRC_TAR="$PLUGIN_DIR/artifact/$CLIENT_VERSION/$TAR_NAME"
    [[ -f "$SRC_TAR" ]] || { err "missing client tar: $SRC_TAR"; exit 1; }
    cp "$SRC_TAR" "$TMP_BUNDLE/"
    # also include setup.sh for fallback
    if [[ -f "$ROOT_DIR/deployment/artifact/client/$(today_version)/setup.sh" ]]; then
      cp "$ROOT_DIR/deployment/artifact/client/$(today_version)/setup.sh" "$TMP_BUNDLE/" || true
    fi
  fi

  log "Building node-bundle image with client version: $CLIENT_VERSION"
  DOCKER_BUILDKIT=0 docker build \
    --build-arg CLIENT_VER="$CLIENT_VERSION" \
    --build-arg BASE_IMAGE="$BASE_IMAGE" \
    -t argus-sys-metric-test-node-bundle:latest \
    -f "$BUNDLE_DIR/Dockerfile" "$BUNDLE_DIR"
  log "Built image: argus-sys-metric-test-node-bundle:latest"
fi

log "Done."
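For reference, the two accepted `--client-version` forms are invoked like this (illustrative values; 20251106 is the example date from the usage text):

```bash
# date package: resolves the inner artifact version from deployment/artifact/client/20251106/
./deployment/build/build_images.sh --with-node-bundle --client-version 20251106

# artifact version: takes the tar from src/metric/client-plugins/all-in-one-full/artifact/1.42.0/
./deployment/build/build_images.sh --with-node-bundle --client-version 1.42.0
```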
103	deployment/build/templates/scripts/fix-prom-targets-overlay.sh	Normal file
@@ -0,0 +1,103 @@
#!/usr/bin/env bash
set -euo pipefail

# Quick fix tool: replace 172.22/16 targets in nodes.json with overlay IPs resolved from hostname.
# Usage: run on the server package host: scripts/fix-prom-targets-overlay.sh

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
NODES_JSON="$ROOT/private/argus/metric/prometheus/nodes.json"

require_cmd() { command -v "$1" >/dev/null 2>&1 || { echo "[ERROR] missing command: $1" >&2; exit 1; }; }

backup() {
  local src="$1"; local ts; ts=$(date -u +%Y%m%d-%H%M%SZ)
  cp "$src" "${src%.json}_bak_${ts}.json"
}

prefer_overlay_ip() {
  local host="$1" ip ips
  # prefer 10.0/8, then 172.31/16, then fall back to the first A record
  # (loop over a variable rather than a pipeline so "return" works in the function)
  ips=$(getent hosts "$host" | awk '{print $1}')
  for ip in $ips; do [[ "$ip" =~ ^10\. ]] && { echo "$ip"; return; }; done
  for ip in $ips; do [[ "$ip" =~ ^172\.31\. ]] && { echo "$ip"; return; }; done
  printf '%s\n' "$ips" | head -n1
}

require_cmd awk
require_cmd sed
require_cmd python3

if [[ ! -f "$NODES_JSON" ]]; then
  echo "[WARN] nodes.json not found: $NODES_JSON" >&2
  exit 0
fi

backup "$NODES_JSON"

tmp=$(mktemp)
trap 'rm -f "$tmp"' EXIT

status=0
python3 - "$NODES_JSON" <<'PY' > "$tmp" || status=$?
import ipaddress, json, sys, socket
path=sys.argv[1]
data=json.load(open(path)) if path else []
def resolve(host):
    try:
        infos=socket.getaddrinfo(host,None,family=socket.AF_INET)
        ips=[i[4][0] for i in infos]
        # prefer 10. over 172.31.
        for ip in ips:
            if ip.startswith('10.'): return ip
        for ip in ips:
            if ip.startswith('172.31.'): return ip
        return ips[0] if ips else None
    except OSError:
        return None
gw=ipaddress.ip_network('172.22.0.0/16')
out=[]
changed=False
for item in data:
    ip=item.get('ip')
    host=item.get('hostname') or ''
    try:
        bad = ip and ipaddress.ip_address(ip) in gw
    except Exception:
        bad = False
    if bad and host:
        new=resolve(host)
        if new:
            item=dict(item)
            item['ip']=new
            changed=True
    out.append(item)
json.dump(out, sys.stdout, ensure_ascii=False)
sys.stderr.write('CHANGED' if changed else 'UNCHANGED')
PY

if [[ "$status" -ne 0 ]]; then
  echo "[ERROR] failed to rewrite nodes.json" >&2
  exit 1
fi

if grep -q '"ip"\s*:\s*"172\.22\.' "$tmp"; then
  echo "[WARN] some gwbridge targets remain; manual fix may be required" >&2
fi

mv "$tmp" "$NODES_JSON"
echo "[OK] nodes.json updated"

# try to reload Prometheus
if docker ps --format '{{.Names}}' | grep -q '^argus-prometheus$'; then
  docker exec argus-prometheus sh -lc 'pidof prometheus >/dev/null 2>&1 && kill -HUP $(pidof prometheus) || supervisorctl restart prometheus' >/dev/null 2>&1 || true
  echo "[INFO] Prometheus reloaded"
fi

exit 0
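Before running the fix tool, a quick look at which targets would be rewritten (a sketch; assumes jq is installed):

```bash
jq -r '.[] | select(.ip | startswith("172.22.")) | "\(.hostname) \(.ip)"' \
  private/argus/metric/prometheus/nodes.json
```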
@@ -155,6 +155,34 @@ gfcode=$(http_code "http://localhost:${GRAFANA_PORT:-3000}/api/health"); [[ "$gf
 # Deduplicate errors
 sort -u -o "$ERRORS" "$ERRORS"
+
+# --- Prometheus targets & nodes.json checks ---
+section PROMETHEUS-TARGETS
+nodes_json_path="$ROOT/private/argus/metric/prometheus/nodes.json"
+if [[ -f "$nodes_json_path" ]]; then
+  logd "nodes.json present: $nodes_json_path"
+  # detect gwbridge addresses (172.22/16)
+  if grep -E '"ip"\s*:\s*"172\.22\.' "$nodes_json_path" >/dev/null 2>&1; then
+    append_err "[prometheus][targets] contains gwbridge (172.22/16) addresses; prefer overlay (10.0/8)."
+    echo "HINT: nodes.json has 172.22.x.x targets. Ensure agent publishes overlay_ip and master prefers it." >&2
+  fi
+else
+  logd "nodes.json missing at $nodes_json_path"
+fi
+
+# Query Prometheus activeTargets and list down items when possible
+pt_json=$(curl -s --max-time 3 "http://localhost:${PROMETHEUS_PORT:-9090}/api/v1/targets" || true)
+if command -v jq >/dev/null 2>&1; then
+  downs=$(printf '%s' "$pt_json" | jq -r '.data.activeTargets[] | select(.health=="down") | "[prometheus][activeTargets] down url=\(.scrapeUrl) lastError=\(.lastError)"' 2>/dev/null || true)
+  if [[ -n "$downs" ]]; then
+    printf '%s\n' "$downs" >> "$ERRORS"
+  fi
+else
+  # best-effort grep when jq is unavailable
+  if printf '%s' "$pt_json" | grep -q '"health":"down"'; then
+    append_err "[prometheus][activeTargets] some targets are down (enable jq for detailed reasons)"
+  fi
+fi
+
 echo "Diagnostic details -> $DETAILS"
 echo "Detected errors -> $ERRORS"
@@ -4,6 +4,7 @@ import os
 import re
 import socket
 import subprocess
+import ipaddress
 from pathlib import Path
 from typing import Any, Dict

@@ -16,11 +17,47 @@ _HOSTNAME_PATTERN = re.compile(r"^([^-]+)-([^-]+)-([^-]+)-.*$")


 def collect_metadata(config: AgentConfig) -> Dict[str, Any]:
-    """Aggregate the static information needed for node registration."""
+    """Aggregate the static information needed for node registration, with smarter IP selection.
+
+    Priority (high to low):
+    1) AGENT_PUBLISH_IP, if set;
+    2) hostname A records (when one hits a preferred subnet);
+    3) interface scan: exclude AGENT_EXCLUDE_IFACES, prefer AGENT_PREFER_NET_CIDRS;
+    4) default-route fallback (UDP socket trick).
+
+    Additionally published: overlay_ip / gwbridge_ip / interfaces, for the master and diagnostics.
+    """
     hostname = config.hostname
-    meta = {
+    prefer_cidrs = _read_cidrs_env(
+        os.environ.get("AGENT_PREFER_NET_CIDRS", "10.0.0.0/8,172.31.0.0/16")
+    )
+    exclude_ifaces = _read_csv_env(
+        os.environ.get("AGENT_EXCLUDE_IFACES", "docker_gwbridge,lo")
+    )
+
+    # interface inventory
+    interfaces = _list_global_ipv4_addrs()
+    if exclude_ifaces:
+        interfaces = [it for it in interfaces if it[0] not in set(exclude_ifaces)]
+
+    # resolve hostname candidates
+    host_ips = _resolve_hostname_ips(hostname)
+
+    selected_ip, overlay_ip, gwbridge_ip = _select_publish_ips(
+        interfaces=interfaces,
+        host_ips=host_ips,
+        prefer_cidrs=prefer_cidrs,
+    )
+
+    meta: Dict[str, Any] = {
         "hostname": hostname,
-        "ip": _detect_ip_address(),
+        "ip": os.environ.get("AGENT_PUBLISH_IP", selected_ip),  # keep required field
+        "overlay_ip": overlay_ip or selected_ip,
+        "gwbridge_ip": gwbridge_ip,
+        "interfaces": [
+            {"iface": name, "ip": ip} for name, ip in interfaces
+        ],
         "env": config.environment,
         "user": config.user,
         "instance": config.instance,
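As a manual cross-check of what the interface scan will produce on a node, a rough shell equivalent (a sketch; assumes iproute2 and the default prefer/exclude lists above):

```bash
# inventory minus excluded ifaces; first 10.0/8 address wins, else first 172.31/16
ip -4 -o addr show scope global | awk '{print $2, $4}' \
  | grep -vE '^(docker_gwbridge|lo) ' \
  | awk '{split($2, a, "/"); print a[1]}' \
  | awk '/^10\./ {print; found=1; exit} {ips[NR]=$0}
         END {if (!found) for (i = 1; i <= NR; i++) if (ips[i] ~ /^172\.31\./) {print ips[i]; exit}}'
```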
@@ -96,7 +133,7 @@ def _detect_gpu_count() -> int:


 def _detect_ip_address() -> str:
-    """Try to get the container's egress IP via a UDP socket; fall back to resolving the hostname."""
+    """Legacy interface kept as the final fallback: default-route source address → hostname resolution → 127.0.0.1."""
     try:
         with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
             sock.connect(("8.8.8.8", 80))
@@ -108,3 +145,118 @@ def _detect_ip_address() -> str:
     except OSError:
         LOGGER.warning("Unable to resolve hostname to IP; defaulting to 127.0.0.1")
         return "127.0.0.1"
+
+
+def _read_csv_env(raw: str | None) -> list[str]:
+    if not raw:
+        return []
+    return [x.strip() for x in raw.split(",") if x.strip()]
+
+
+def _read_cidrs_env(raw: str | None) -> list[ipaddress.IPv4Network]:
+    cidrs: list[ipaddress.IPv4Network] = []
+    for item in _read_csv_env(raw):
+        try:
+            net = ipaddress.ip_network(item, strict=False)
+            if isinstance(net, (ipaddress.IPv4Network,)):
+                cidrs.append(net)
+        except ValueError:
+            LOGGER.warning("Ignoring invalid CIDR in AGENT_PREFER_NET_CIDRS", extra={"cidr": item})
+    return cidrs
+
+
+def _list_global_ipv4_addrs() -> list[tuple[str, str]]:
+    """List global IPv4 addresses as (iface, ip) pairs.
+
+    Relies on iproute2: ip -4 -o addr show scope global
+    """
+    results: list[tuple[str, str]] = []
+    try:
+        proc = subprocess.run(
+            ["sh", "-lc", "ip -4 -o addr show scope global | awk '{print $2, $4}'"],
+            check=False,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            timeout=3,
+        )
+        if proc.returncode == 0:
+            for line in proc.stdout.splitlines():
+                line = line.strip()
+                if not line:
+                    continue
+                parts = line.split()
+                if len(parts) != 2:
+                    continue
+                iface, cidr = parts
+                ip = cidr.split("/")[0]
+                try:
+                    ipaddress.IPv4Address(ip)
+                except ValueError:
+                    continue
+                results.append((iface, ip))
+    except Exception as exc:  # pragma: no cover - defensive
+        LOGGER.debug("Failed to list interfaces", extra={"error": str(exc)})
+    return results
+
+
+def _resolve_hostname_ips(name: str) -> list[str]:
+    ips: list[str] = []
+    try:
+        infos = socket.getaddrinfo(name, None, family=socket.AF_INET)
+        for info in infos:
+            ip = info[4][0]
+            if ip not in ips:
+                ips.append(ip)
+    except OSError:
+        pass
+    return ips
+
+
+def _pick_by_cidrs(candidates: list[str], prefer_cidrs: list[ipaddress.IPv4Network]) -> str | None:
+    for net in prefer_cidrs:
+        for ip in candidates:
+            try:
+                if ipaddress.ip_address(ip) in net:
+                    return ip
+            except ValueError:
+                continue
+    return None
+
+
+def _select_publish_ips(
+    *,
+    interfaces: list[tuple[str, str]],
+    host_ips: list[str],
+    prefer_cidrs: list[ipaddress.IPv4Network],
+) -> tuple[str, str | None, str | None]:
+    """Return (selected_ip, overlay_ip, gwbridge_ip).
+
+    - overlay_ip: first hit in prefer_cidrs (10.0/8 before 172.31/16).
+    - gwbridge_ip: recorded when a 172.22/16 address exists.
+    - selected_ip: AGENT_PUBLISH_IP if set; else overlay_ip; else a preferred hostname A record; else the default-route fallback.
+    """
+    # detect gwbridge (172.22/16)
+    gwbridge_net = ipaddress.ip_network("172.22.0.0/16")
+    gwbridge_ip = None
+    for _, ip in interfaces:
+        try:
+            if ipaddress.ip_address(ip) in gwbridge_net:
+                gwbridge_ip = ip
+                break
+        except ValueError:
+            continue
+
+    # overlay candidate from interfaces by prefer cidrs
+    iface_ips = [ip for _, ip in interfaces]
+    overlay_ip = _pick_by_cidrs(iface_ips, prefer_cidrs)
+
+    # hostname A records filtered by prefer cidrs
+    host_pref = _pick_by_cidrs(host_ips, prefer_cidrs)
+
+    env_ip = os.environ.get("AGENT_PUBLISH_IP")
+    if env_ip:
+        selected = env_ip
+    else:
+        selected = overlay_ip or host_pref or _detect_ip_address()
+
+    return selected, overlay_ip, gwbridge_ip
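The selection can be steered per node without code changes via the environment the agent reads (values illustrative):

```bash
export AGENT_PREFER_NET_CIDRS="10.0.0.0/8,172.31.0.0/16"  # scan preference order
export AGENT_EXCLUDE_IFACES="docker_gwbridge,lo"          # ifaces dropped from the inventory
export AGENT_PUBLISH_IP="10.0.1.23"                       # hard override; wins over everything
```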
BIN	src/agent/dist/argus-agent	vendored
Binary file not shown.
@@ -13,6 +13,8 @@ class AppConfig:
     scheduler_interval_seconds: int
     node_id_prefix: str
     auth_mode: str
+    target_prefer_net_cidrs: str
+    target_reachability_check: bool


 def _get_int_env(name: str, default: int) -> int:
@@ -27,6 +29,12 @@ def _get_int_env(name: str, default: int) -> int:

 def load_config() -> AppConfig:
     """Build the config object from environment variables so runtime parameters are managed in one place."""
+    def _bool_env(name: str, default: bool) -> bool:
+        raw = os.environ.get(name)
+        if raw is None or raw.strip() == "":
+            return default
+        return raw.strip().lower() in ("1", "true", "yes", "on")
+
     return AppConfig(
         db_path=os.environ.get("DB_PATH", "/private/argus/master/db.sqlite3"),
         metric_nodes_json_path=os.environ.get(
@@ -37,4 +45,6 @@ def load_config() -> AppConfig:
         scheduler_interval_seconds=_get_int_env("SCHEDULER_INTERVAL_SECONDS", 30),
         node_id_prefix=os.environ.get("NODE_ID_PREFIX", "A"),
         auth_mode=os.environ.get("AUTH_MODE", "disabled"),
+        target_prefer_net_cidrs=os.environ.get("TARGET_PREFER_NET_CIDRS", "10.0.0.0/8,172.31.0.0/16"),
+        target_reachability_check=_bool_env("TARGET_REACHABILITY_CHECK", False),
     )
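The matching master-side knobs introduced here are set like this (the defaults shown are those in load_config):

```bash
export TARGET_PREFER_NET_CIDRS="10.0.0.0/8,172.31.0.0/16"
export TARGET_REACHABILITY_CHECK=true   # opt in to the 1s TCP probe of 9100/9400
```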
@@ -1,8 +1,10 @@
 from __future__ import annotations

+import ipaddress
 import logging
+import socket
 import threading
-from typing import Optional
+from typing import Optional, Iterable, Dict, Any, List

 from .config import AppConfig
 from .storage import Storage
@@ -34,10 +36,117 @@ class StatusScheduler:
         self._pending_nodes_json.set()

     def generate_nodes_json(self) -> None:
+        """Generate Prometheus scrape targets from online nodes, preferring the overlay IP.
+
+        Candidate order: meta.overlay_ip > hostname A records (hitting a preferred subnet) > meta.ip.
+        Optional reachability check: when TARGET_REACHABILITY_CHECK=true, run a 1s TCP connect
+        test against 9100/9400 and pick the first reachable candidate; if all fail, take the
+        first in order and log it.
+        """
         with self._nodes_json_lock:
-            online_nodes = self._storage.get_online_nodes()
-            atomic_write_json(self._config.metric_nodes_json_path, online_nodes)
-            self._logger.info("nodes.json updated", extra={"count": len(online_nodes)})
+            rows = self._storage.get_online_nodes_meta()
+            prefer_cidrs = self._parse_cidrs(self._config.target_prefer_net_cidrs)
+            reachability = self._config.target_reachability_check
+
+            result: List[Dict[str, Any]] = []
+            for row in rows:
+                meta = row.get("meta", {})
+                hostname = meta.get("hostname") or row.get("name")
+                labels = row.get("labels") or []
+
+                overlay_ip = meta.get("overlay_ip")
+                legacy_ip = meta.get("ip")
+                host_candidates = self._resolve_host_ips(hostname)
+                host_pref = self._pick_by_cidrs(host_candidates, prefer_cidrs)
+
+                candidates: List[str] = []
+                for ip in [overlay_ip, host_pref, legacy_ip]:
+                    if ip and ip not in candidates:
+                        candidates.append(ip)
+
+                chosen = None
+                if reachability:
+                    ports = [9100]
+                    try:
+                        if int(meta.get("gpu_number", 0)) > 0:
+                            ports.append(9400)
+                    except Exception:
+                        pass
+                    for ip in candidates:
+                        if any(self._reachable(ip, p, 1.0) for p in ports):
+                            chosen = ip
+                            break
+                if not chosen:
+                    chosen = candidates[0] if candidates else legacy_ip
+                if not chosen:
+                    # ultimate fallback: 127.0.0.1 (should not happen)
+                    chosen = "127.0.0.1"
+                    self._logger.warning("No candidate IPs for node; falling back", extra={"node": row.get("node_id")})
+
+                if chosen and ipaddress.ip_address(chosen) in ipaddress.ip_network("172.22.0.0/16"):
+                    self._logger.warning(
+                        "Prometheus target uses docker_gwbridge address; prefer overlay",
+                        extra={"node": row.get("node_id"), "ip": chosen},
+                    )
+
+                result.append(
+                    {
+                        "node_id": row.get("node_id"),
+                        "user_id": meta.get("user"),
+                        "ip": chosen,
+                        "hostname": hostname,
+                        "labels": labels if isinstance(labels, list) else [],
+                    }
+                )
+
+            atomic_write_json(self._config.metric_nodes_json_path, result)
+            self._logger.info("nodes.json updated", extra={"count": len(result)})
+
+    # ---------------------------- helpers ----------------------------
+    @staticmethod
+    def _parse_cidrs(raw: str) -> List[ipaddress.IPv4Network]:
+        nets: List[ipaddress.IPv4Network] = []
+        for item in (x.strip() for x in (raw or "").split(",")):
+            if not item:
+                continue
+            try:
+                net = ipaddress.ip_network(item, strict=False)
+                if isinstance(net, ipaddress.IPv4Network):
+                    nets.append(net)
+            except ValueError:
+                continue
+        return nets
+
+    @staticmethod
+    def _resolve_host_ips(hostname: str) -> List[str]:
+        ips: List[str] = []
+        try:
+            infos = socket.getaddrinfo(hostname, None, family=socket.AF_INET)
+            for info in infos:
+                ip = info[4][0]
+                if ip not in ips:
+                    ips.append(ip)
+        except OSError:
+            pass
+        return ips
+
+    @staticmethod
+    def _pick_by_cidrs(candidates: Iterable[str], prefer: List[ipaddress.IPv4Network]) -> str | None:
+        for net in prefer:
+            for ip in candidates:
+                try:
+                    if ipaddress.ip_address(ip) in net:
+                        return ip
+                except ValueError:
+                    continue
+        return None
+
+    @staticmethod
+    def _reachable(ip: str, port: int, timeout: float) -> bool:
+        try:
+            with socket.create_connection((ip, port), timeout=timeout):
+                return True
+        except OSError:
+            return False
+
     # ------------------------------------------------------------------
     # internal loop
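When a target stays down, the same probe the scheduler runs can be reproduced from a shell (a sketch using bash's /dev/tcp; the IP is illustrative):

```bash
ip=10.0.1.23
for port in 9100 9400; do
  if timeout 1 bash -c "exec 3<>/dev/tcp/$ip/$port" 2>/dev/null; then
    echo "$ip:$port reachable"
  else
    echo "$ip:$port unreachable"
  fi
done
```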
@@ -324,9 +324,35 @@ class Storage:
                 {
                     "node_id": row["id"],
                     "user_id": meta.get("user"),
-                    "ip": meta.get("ip"),
+                    "ip": meta.get("ip"),  # kept for backward-compat; preferred IP selection handled in scheduler
                     "hostname": meta.get("hostname", row["name"]),
                     "labels": labels if isinstance(labels, list) else [],
                 }
             )
         return result
+
+    def get_online_nodes_meta(self) -> List[Dict[str, Any]]:
+        """Return raw meta, name, and labels for online nodes; the caller picks the target IP.
+
+        Each item contains: { node_id, name, meta, labels }
+        """
+        with self._lock:
+            cur = self._conn.execute(
+                "SELECT id, name, meta_json, labels_json FROM nodes WHERE status = ? ORDER BY id ASC",
+                ("online",),
+            )
+            rows = cur.fetchall()
+
+        result: List[Dict[str, Any]] = []
+        for row in rows:
+            meta = json.loads(row["meta_json"]) if row["meta_json"] else {}
+            labels = json.loads(row["labels_json"]) if row["labels_json"] else []
+            result.append(
+                {
+                    "node_id": row["id"],
+                    "name": row["name"],
+                    "meta": meta if isinstance(meta, dict) else {},
+                    "labels": labels if isinstance(labels, list) else [],
+                }
+            )
+        return result
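To spot-check the rows get_online_nodes_meta returns, the master DB can be queried directly (a sketch; assumes the default DB_PATH and a sqlite3 build with JSON1):

```bash
sqlite3 /private/argus/master/db.sqlite3 \
  "SELECT id, name, json_extract(meta_json, '$.overlay_ip') FROM nodes WHERE status = 'online';"
```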
@@ -1 +1 @@
-1.40.0
+1.42.0

BIN	src/metric/client-plugins/all-in-one-full/plugins/argus-agent/bin/argus-agent	(Stored with Git LFS)
Binary file not shown.
@@ -274,19 +274,33 @@ verify_checksums() {
     log_info "Artifact directory: $artifact_dir"
     failed_verification=0

+    # Try to parse install_order from version.json to pin exact file names, avoiding ambiguity
+    # when multiple historical tars sit in the same directory
+    local order_file="$TEMP_DIR/install_order.txt"
     if [[ -f "$TEMP_DIR/checksums.txt" ]]; then
         while IFS= read -r line; do
             component=$(echo "$line" | cut -d':' -f1)
             expected_checksum=$(echo "$line" | cut -d':' -f2-)

-            # Find the matching tar file
+            # Prefer deriving the exact file name from install_order
             actual_file=""
-            for file in "$artifact_dir/${component}-"*.tar.gz; do
-                if [[ -f "$file" ]]; then
-                    actual_file="$file"
-                    break
-                fi
-            done
+            if [[ -f "$order_file" ]]; then
+                while IFS= read -r fname; do
+                    if [[ "$fname" == ${component}-*.tar.gz && -f "$artifact_dir/$fname" ]]; then
+                        actual_file="$artifact_dir/$fname"
+                        break
+                    fi
+                done < "$order_file"
+            fi
+
+            # Fallback: match the first file by prefix (not recommended, kept for compatibility)
+            if [[ -z "$actual_file" ]]; then
+                for file in "$artifact_dir/${component}-"*.tar.gz; do
+                    if [[ -f "$file" ]]; then
+                        actual_file="$file"
+                        break
+                    fi
+                done
+            fi
+
             if [[ -z "$actual_file" ]]; then
                 log_error "Component file not found: $component"
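The install_order list that pins the exact file names can be inspected with jq (a sketch; the version directory is illustrative):

```bash
jq -r '.install_order[]' artifact/1.42.0/version.json
```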
@@ -59,6 +59,12 @@ ARTIFACT_DIR="artifact/$VERSION"

 log_info "Packaging AIOps All-in-One installer v$VERSION"

+# When forcing a repackage and the directory already exists, clean old artifacts first so the
+# same version does not accumulate multiple tar.gz files and confuse checksum verification
+if [[ "$FORCE_PACKAGE" == "true" && -d "$ARTIFACT_DIR" ]]; then
+    log_info "--force: cleaning old tars and metadata under $ARTIFACT_DIR"
+    rm -rf "$ARTIFACT_DIR"
+fi
+
 # Check required files
 log_info "Checking required files..."
 if [[ ! -f "config/VERSION" ]]; then
@@ -130,7 +136,7 @@ if [[ -d "$ARTIFACT_DIR" && "$FORCE_PACKAGE" == "false" ]]; then
     fi
 fi

-# Create the artifact directory
+# Create the artifact directory (recreated after cleanup)
 mkdir -p "$ARTIFACT_DIR"
 log_info "Created output directory: $ARTIFACT_DIR"

@@ -285,10 +291,13 @@ while IFS= read -r component; do
         exit 1
     fi

+    # Remove historical tars inside the component directory so find does not pick a stale file
+    rm -f ./*.tar.gz 2>/dev/null || true
+
     # Run the component's packaging script
     if ./package.sh; then
         # Locate the generated tar
-        tar_file=$(find . -name "*.tar.gz" -type f | head -1)
+        tar_file=$(ls -1t ./*.tar.gz 2>/dev/null | head -1)
         if [[ -n "$tar_file" ]]; then
             # Move it into the artifact directory
             mv "$tar_file" "$CURRENT_DIR/$ARTIFACT_DIR/"
@@ -130,20 +130,40 @@ fi
 TEMP_PACKAGE_DIR="/tmp/argus-metric-package-$$"
 mkdir -p "$TEMP_PACKAGE_DIR"

-# Copy every tar.gz file into the temp directory
-log_info "Preparing artifact files..."
-tar_files=$(find "$ARTIFACT_DIR" -name "*.tar.gz" -type f)
-
-if [[ -z "$tar_files" ]]; then
-    log_error "No tar.gz files found in $ARTIFACT_DIR"
-    exit 1
+# Copy only the tar.gz files listed in install_order of version.json, so stale files left in the
+# same version directory cannot cause checksum mismatches
+log_info "Preparing artifact files (by install_order)..."
+
+install_list_file="$TEMP_DIR/install_list.txt"
+if command -v jq >/dev/null 2>&1; then
+    jq -r '.install_order[]' "$ARTIFACT_DIR/version.json" > "$install_list_file" 2>/dev/null || true
+else
+    # naive parse
+    grep -A 200 '"install_order"' "$ARTIFACT_DIR/version.json" | grep -E '".*"' | sed 's/.*"\([^"]*\)".*/\1/' > "$install_list_file" 2>/dev/null || true
 fi

-for file in $tar_files; do
-    filename=$(basename "$file")
-    log_info "  prepare: $filename"
-    cp "$file" "$TEMP_PACKAGE_DIR/"
-done
+if [[ -s "$install_list_file" ]]; then
+    while IFS= read -r filename; do
+        src="$ARTIFACT_DIR/$filename"
+        if [[ -f "$src" ]]; then
+            log_info "  copy: $filename"
+            cp "$src" "$TEMP_PACKAGE_DIR/"
+        else
+            log_warning "  missing: $filename (skipped)"
+        fi
+    done < "$install_list_file"
+else
+    log_warning "Could not parse install_order; falling back to copying all tar.gz files (may include stale leftovers; strict verification on the install side is recommended)"
+    tar_files=$(find "$ARTIFACT_DIR" -name "*.tar.gz" -type f)
+    if [[ -z "$tar_files" ]]; then
+        log_error "No tar.gz files found in $ARTIFACT_DIR"
+        exit 1
+    fi
+    for file in $tar_files; do
+        filename=$(basename "$file")
+        log_info "  prepare: $filename"
+        cp "$file" "$TEMP_PACKAGE_DIR/"
+    done
+fi

 # Copy the version info file
 if [[ -f "$ARTIFACT_DIR/version.json" ]]; then
16	src/sys/build/node-bundle/Dockerfile	Normal file
@@ -0,0 +1,16 @@
ARG BASE_IMAGE=argus-sys-metric-test-node:latest
FROM ${BASE_IMAGE}

ARG CLIENT_VER
LABEL org.opencontainers.image.title="argus-sys-metric-test-node-bundle" \
      org.opencontainers.image.version="${CLIENT_VER}" \
      org.opencontainers.image.description="Metric test node with embedded client package"

WORKDIR /

# bundle files are provided at build time into ./bundle in build context
COPY bundle/ /bundle/
COPY node-bootstrap.sh /usr/local/bin/node-bootstrap.sh
RUN chmod +x /usr/local/bin/node-bootstrap.sh

ENTRYPOINT ["/usr/local/bin/node-bootstrap.sh"]
2	src/sys/build/node-bundle/bundle/.gitignore	vendored	Normal file
@@ -0,0 +1,2 @@

argus-metric_*.tar.gz
1006	src/sys/build/node-bundle/bundle/setup.sh	Executable file
File diff suppressed because it is too large
99	src/sys/build/node-bundle/node-bootstrap.sh	Normal file
@@ -0,0 +1,99 @@
#!/usr/bin/env bash
set -euo pipefail

echo "[BOOT] node bundle starting"

INSTALL_DIR="/opt/argus-metric"
BUNDLE_DIR="/bundle"
installed_ok=0

# 1) already installed?
if [[ -L "$INSTALL_DIR/current" && -d "$INSTALL_DIR/current" ]]; then
  echo "[BOOT] client already installed at $INSTALL_DIR/current"
else
  # 2) try local bundle first (replicate setup.sh layout: move to /opt/argus-metric/versions/<ver> and run install.sh)
  tarball=$(ls -1 "$BUNDLE_DIR"/argus-metric_*.tar.gz 2>/dev/null | head -1 || true)
  if [[ -n "${tarball:-}" ]]; then
    echo "[BOOT] installing from local bundle: $(basename "$tarball")"
    tmp=$(mktemp -d)
    tar -xzf "$tarball" -C "$tmp"
    # locate root containing version.json
    root="$tmp"
    if [[ ! -f "$root/version.json" ]]; then
      sub=$(find "$tmp" -mindepth 1 -maxdepth 1 -type d | head -n1 || true)
      [[ -n "$sub" && -f "$sub/version.json" ]] && root="$sub"
    fi
    if [[ ! -f "$root/version.json" ]]; then
      echo "[BOOT][WARN] version.json not found in bundle; fallback to FTP"
    else
      ver=$(sed -n 's/.*"version"\s*:\s*"\([^"]\+\)".*/\1/p' "$root/version.json" | head -n1)
      if [[ -z "$ver" ]]; then
        echo "[BOOT][WARN] failed to parse version from version.json; fallback to FTP"
      else
        target_root="/opt/argus-metric"
        version_dir="$target_root/versions/$ver"
        mkdir -p "$version_dir"
        # move contents into version dir
        shopt -s dotglob
        mv "$root"/* "$version_dir/" 2>/dev/null || true
        shopt -u dotglob
        # run component installer within version dir
        if [[ -f "$version_dir/install.sh" ]]; then
          chmod +x "$version_dir/install.sh" 2>/dev/null || true
          (cd "$version_dir" && ./install.sh "$version_dir")
          echo "$ver" > "$target_root/LATEST_VERSION" 2>/dev/null || true
          ln -sfn "$version_dir" "$target_root/current" 2>/dev/null || true
          if [[ -L "$target_root/current" && -d "$target_root/current" ]]; then
            installed_ok=1
            echo "[BOOT] local bundle install OK: version=$ver"
          else
            echo "[BOOT][WARN] current symlink not present after install; will rely on healthcheck to confirm"
          fi
        else
          echo "[BOOT][WARN] install.sh missing under $version_dir; fallback to FTP"
        fi
      fi
    fi
  fi

  # 3) fallback: use FTP setup if not installed
  if [[ ! -L "$INSTALL_DIR/current" && "$installed_ok" -eq 0 ]]; then
    echo "[BOOT] fallback to FTP setup"
    if [[ -z "${FTPIP:-}" || -z "${FTP_USER:-}" || -z "${FTP_PASSWORD:-}" ]]; then
      echo "[BOOT][ERROR] FTP variables not set (FTPIP/FTP_USER/FTP_PASSWORD)" >&2
      exit 1
    fi
    curl -u "$FTP_USER:$FTP_PASSWORD" -fsSL "ftp://$FTPIP:21/setup.sh" -o /tmp/setup.sh
    chmod +x /tmp/setup.sh
    /tmp/setup.sh --server "$FTPIP" --user "$FTP_USER" --password "$FTP_PASSWORD" --port 21
  fi
fi

# 4) ensure agent is running; start if needed (inherits env: MASTER_ENDPOINT/AGENT_*)
if ! pgrep -x argus-agent >/dev/null 2>&1; then
  echo "[BOOT] starting argus-agent (not detected)"
  setsid /usr/local/bin/argus-agent >/var/log/argus-agent.log 2>&1 < /dev/null &
fi

# 5) post-install selfcheck (best-effort) and wait for node.json
for i in {1..30}; do
  if compgen -G "$INSTALL_DIR/versions/*/check_health.sh" > /dev/null; then
    bash "$INSTALL_DIR"/versions/*/check_health.sh || true
    break
  fi
  sleep 2
done

host="$(hostname)"
state_dir="/private/argus/agent/${host}"
mkdir -p "$state_dir" 2>/dev/null || true
for i in {1..60}; do
  if [[ -s "$state_dir/node.json" ]]; then
    echo "[BOOT] node state present: $state_dir/node.json"
    break
  fi
  sleep 2
done

echo "[BOOT] ready; entering sleep"
exec sleep infinity
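To exercise node-bootstrap.sh outside compose, a single container can be run by hand (a sketch; env values mirror the .env.nodes file below):

```bash
docker run -d --name argus-metric-test-node-swarm \
  --network argus-sys-net --dns 10.0.1.5 --hostname swarm-metric-node-001 \
  -e MASTER_ENDPOINT=http://master.argus.com:3000 \
  -e FTPIP=10.0.1.4 -e FTP_USER=ftpuser -e FTP_PASSWORD='ZGClab1234!' \
  argus-sys-metric-test-node-bundle:latest
```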
21	src/sys/swarm_tests/.env	Normal file
@@ -0,0 +1,21 @@
SERVER_PROJECT=argus-swarm-server
NODES_PROJECT=argus-swarm-nodes

# Host ports for server compose
MASTER_PORT=32300
ES_HTTP_PORT=9200
KIBANA_PORT=5601
PROMETHEUS_PORT=9090
GRAFANA_PORT=3000
ALERTMANAGER_PORT=9093
WEB_PROXY_PORT_8080=8080
WEB_PROXY_PORT_8081=8081
WEB_PROXY_PORT_8082=8082
WEB_PROXY_PORT_8083=8083
WEB_PROXY_PORT_8084=8084
WEB_PROXY_PORT_8085=8085

# UID/GID for volume ownership in containers
ARGUS_BUILD_UID=1000
ARGUS_BUILD_GID=1000
21	src/sys/swarm_tests/.env.example	Normal file
@@ -0,0 +1,21 @@
SERVER_PROJECT=argus-swarm-server
NODES_PROJECT=argus-swarm-nodes

# Host ports for server compose
MASTER_PORT=32300
ES_HTTP_PORT=9200
KIBANA_PORT=5601
PROMETHEUS_PORT=9090
GRAFANA_PORT=3000
ALERTMANAGER_PORT=9093
WEB_PROXY_PORT_8080=8080
WEB_PROXY_PORT_8081=8081
WEB_PROXY_PORT_8082=8082
WEB_PROXY_PORT_8083=8083
WEB_PROXY_PORT_8084=8084
WEB_PROXY_PORT_8085=8085

# UID/GID for volume ownership in containers
ARGUS_BUILD_UID=2133
ARGUS_BUILD_GID=2015
8	src/sys/swarm_tests/.env.nodes	Normal file
@@ -0,0 +1,8 @@
BINDIP=10.0.1.5
FTPIP=10.0.1.4
MASTER_ENDPOINT=http://master.argus.com:3000
FTP_USER=ftpuser
FTP_PASSWORD=ZGClab1234!
AGENT_ENV=dev2
AGENT_USER=yuyr
AGENT_INSTANCE=node001sX
52	src/sys/swarm_tests/README.md	Normal file
@@ -0,0 +1,52 @@
# Swarm Tests (argus-sys-net)

Quickly validate an end-to-end "server + single node" deployment on one machine using Docker Swarm and an overlay network. Stays compatible with `src/sys/tests` and does not affect the existing bridge-network tests.

## Prerequisites
- Docker Engine with Swarm enabled (the scripts run `swarm init` automatically for single-node mode).
- The following images built and loaded: `argus-bind9:latest`, `argus-master:latest`, `argus-elasticsearch:latest`, `argus-kibana:latest`, `argus-metric-ftp:latest`, `argus-metric-prometheus:latest`, `argus-metric-grafana:latest`, `argus-alertmanager:latest`, `argus-web-frontend:latest`, `argus-web-proxy:latest`, plus the node image `argus-sys-metric-test-node-bundle:latest` (see below).
- Local `UID/GID` should be set via `configs/build_user.local.conf`, which the scripts read:
  - e.g. `UID=1000` and `GID=1000`.

## Building the node bundle image

```
./deployment/build/build_images.sh --with-node-bundle --client-version 20251106
```

Note: `--client-version` accepts either a `YYYYMMDD` date package or a `1.xx.yy` component version. After packaging, the image `argus-sys-metric-test-node-bundle:latest` embeds `argus-metric_*.tar.gz`, and the container installs from the local bundle first at startup.

## Running

```
cd src/sys/swarm_tests
cp .env.example .env

bash scripts/00_bootstrap.sh
bash scripts/01_server_up.sh
bash scripts/02_wait_ready.sh   # writes BINDIP/FTPIP into .env.nodes
bash scripts/03_nodes_up.sh
bash scripts/04_metric_verify.sh
```

Cleanup:

```
bash scripts/99_down.sh
```

## Notes
- `00_bootstrap.sh`: loads `scripts/common/build_user.sh` first, prints and writes `ARGUS_BUILD_UID/GID` into `.env`, then prepares the `private-server/` and `private-nodes/` directories and `chown`s them to the resolved UID/GID.
- `01_server_up.sh`: starts the server compose. `SWARM_FIX_PERMS=1` enables a fallback that chmods inside containers and restarts supervised programs; off by default.
- `02_wait_ready.sh`: waits for Master/ES/Prom/Grafana to become ready (Kibana may lag), then resolves overlay IPs and writes `BINDIP/FTPIP` into `.env.nodes` for the node compose.
- `03_nodes_up.sh`: starts the single node container (bundle build). Inside the container, `node-bootstrap.sh` installs from the local bundle first; on success it runs the health check and waits for `/private/argus/agent/<hostname>/node.json` to appear.
- `04_metric_verify.sh`: runs detailed verification within this suite (no longer calls the tests scripts directly):
  - Grafana `/api/health` (database=ok)
  - the Grafana datasource points at `prom.metric.argus.com:<port>` and the domain resolves inside the container
  - all Prometheus `activeTargets` are up
  - `nodes.json` contains no `172.22/16` (docker_gwbridge) addresses

## Troubleshooting
- Grafana/Kibana permission errors on startup: check that `configs/build_user.local.conf` matches the UID/GID printed by `00_bootstrap.sh`; if needed, set `SWARM_FIX_PERMS=1` and rerun `01_server_up.sh`.
- Node container falls back to FTP: usually a malformed bundle layout or a failed health check (earlier scripts ran it under `sh`). The current `node-bootstrap.sh` runs the health check with `bash` and skips FTP once the local install succeeds.
- Proxy 502: check `/var/log/nginx/error.log` in the `argus-web-proxy` container and the `upstream check` lines in its startup log; if a backend is not ready (especially Kibana), wait for `02_wait_ready.sh` to pass before accessing.
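A hand-rolled version of the key checks in `04_metric_verify.sh` looks roughly like this (a sketch; ports are the `.env` defaults, jq assumed):

```bash
curl -fsS "http://localhost:3000/api/health" | jq -r .database   # expect "ok"
curl -fsS "http://localhost:9090/api/v1/targets" \
  | jq -r '.data.activeTargets[] | "\(.health) \(.scrapeUrl)"'
grep -c '"ip"\s*:\s*"172\.22\.' private-server/argus/metric/prometheus/nodes.json \
  && echo "gwbridge targets present" || echo "no gwbridge targets"
```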
34	src/sys/swarm_tests/docker-compose.nodes.yml	Normal file
@@ -0,0 +1,34 @@
version: "3.8"

networks:
  argus-sys-net:
    external: true

services:
  metric-test-node:
    image: ${NODE_BUNDLE_IMAGE_TAG:-argus-sys-metric-test-node-bundle:latest}
    container_name: argus-metric-test-node-swarm
    hostname: ${NODE_HOSTNAME:-swarm-metric-node-001}
    restart: unless-stopped
    environment:
      - TZ=Asia/Shanghai
      - DEBIAN_FRONTEND=noninteractive
      - MASTER_ENDPOINT=${MASTER_ENDPOINT:-http://master.argus.com:3000}
      - ES_HOST=es.log.argus.com
      - ES_PORT=9200
      - FTPIP=${FTPIP}
      - BINDIP=${BINDIP}
      - FTP_USER=${FTP_USER:-ftpuser}
      - FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!}
      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
      - AGENT_ENV=${AGENT_ENV:-dev2}
      - AGENT_USER=${AGENT_USER:-yuyr}
      - AGENT_INSTANCE=${AGENT_INSTANCE:-node001sX}
      - CLIENT_VERSION=${CLIENT_VERSION:-}
    dns:
      - ${BINDIP}
    networks: [argus-sys-net]
    volumes:
      - ./private-nodes/argus/agent:/private/argus/agent
    command: ["sleep", "infinity"]
174	src/sys/swarm_tests/docker-compose.server.yml	Normal file
@@ -0,0 +1,174 @@
version: "3.8"

networks:
  argus-sys-net:
    external: true

services:
  bind:
    image: ${BIND_IMAGE_TAG:-argus-bind9:latest}
    container_name: argus-bind-sys
    networks: [argus-sys-net]
    volumes:
      - ./private-server:/private
    restart: unless-stopped

  master:
    image: ${MASTER_IMAGE_TAG:-argus-master:latest}
    container_name: argus-master-sys
    depends_on: [bind]
    environment:
      - OFFLINE_THRESHOLD_SECONDS=6
      - ONLINE_THRESHOLD_SECONDS=2
      - SCHEDULER_INTERVAL_SECONDS=1
      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
    ports:
      - "${MASTER_PORT:-32300}:3000"
    volumes:
      - ./private-server/argus/master:/private/argus/master
      - ./private-server/argus/metric/prometheus:/private/argus/metric/prometheus
      - ./private-server/argus/etc:/private/argus/etc
    networks: [argus-sys-net]
    restart: unless-stopped

  es:
    image: ${ES_IMAGE_TAG:-argus-elasticsearch:latest}
    container_name: argus-es-sys
    environment:
      - discovery.type=single-node
      - xpack.security.enabled=false
      - ES_JAVA_OPTS=-Xms512m -Xmx512m
      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
    volumes:
      - ./private-server/argus/log/elasticsearch:/private/argus/log/elasticsearch
      - ./private-server/argus/etc:/private/argus/etc
    ports:
      - "${ES_HTTP_PORT:-9200}:9200"
    restart: unless-stopped
    networks: [argus-sys-net]

  kibana:
    image: ${KIBANA_IMAGE_TAG:-argus-kibana:latest}
    container_name: argus-kibana-sys
    environment:
      - ELASTICSEARCH_HOSTS=http://es.log.argus.com:9200
      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
    volumes:
      - ./private-server/argus/log/kibana:/private/argus/log/kibana
      - ./private-server/argus/etc:/private/argus/etc
    depends_on: [es]
    ports:
      - "${KIBANA_PORT:-5601}:5601"
    restart: unless-stopped
    networks: [argus-sys-net]

  ftp:
    image: ${FTP_IMAGE_TAG:-argus-metric-ftp:latest}
    container_name: argus-ftp
    restart: unless-stopped
    environment:
      - TZ=Asia/Shanghai
      - FTP_BASE_PATH=/private/argus/ftp
      - FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!}
      - DOMAIN=${FTP_DOMAIN:-ftp.metric.argus.com}
      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
    ports:
      - "${FTP_PORT:-21}:21"
      - "${FTP_DATA_PORT:-20}:20"
      - "${FTP_PASSIVE_HOST_RANGE:-21100-21110}:21100-21110"
    volumes:
      - ./private-server/argus/metric/ftp:/private/argus/ftp
      - ./private-server/argus/etc:/private/argus/etc
    networks: [argus-sys-net]

  prometheus:
    image: ${PROM_IMAGE_TAG:-argus-metric-prometheus:latest}
    container_name: argus-prometheus
    restart: unless-stopped
    environment:
      - TZ=Asia/Shanghai
      - PROMETHEUS_BASE_PATH=/private/argus/metric/prometheus
      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
    ports:
      - "${PROMETHEUS_PORT:-9090}:9090"
    volumes:
      - ./private-server/argus/metric/prometheus:/private/argus/metric/prometheus
      - ./private-server/argus/etc:/private/argus/etc
    networks: [argus-sys-net]

  grafana:
    image: ${GRAFANA_IMAGE_TAG:-argus-metric-grafana:latest}
    container_name: argus-grafana
    restart: unless-stopped
    environment:
      - TZ=Asia/Shanghai
      - GRAFANA_BASE_PATH=/private/argus/metric/grafana
      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
      - GF_SERVER_HTTP_PORT=3000
      - GF_LOG_LEVEL=warn
      - GF_LOG_MODE=console
      - GF_PATHS_PROVISIONING=/private/argus/metric/grafana/provisioning
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Viewer
    ports:
      - "${GRAFANA_PORT:-3000}:3000"
    volumes:
      - ./private-server/argus/metric/grafana:/private/argus/metric/grafana
      - ./private-server/argus/etc:/private/argus/etc
    depends_on: [prometheus]
    networks: [argus-sys-net]

  alertmanager:
    image: ${ALERT_IMAGE_TAG:-argus-alertmanager:latest}
    container_name: argus-alertmanager
    environment:
      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
    volumes:
      - ./private-server/argus/etc:/private/argus/etc
      - ./private-server/argus/alert/alertmanager:/private/argus/alert/alertmanager
    networks: [argus-sys-net]
    ports:
      - "${ALERTMANAGER_PORT:-9093}:9093"
    restart: unless-stopped

  web-frontend:
    image: ${FRONT_IMAGE_TAG:-argus-web-frontend:latest}
    container_name: argus-web-frontend
    environment:
      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
      - EXTERNAL_MASTER_PORT=${WEB_PROXY_PORT_8085:-8085}
      - EXTERNAL_ALERTMANAGER_PORT=${WEB_PROXY_PORT_8084:-8084}
      - EXTERNAL_GRAFANA_PORT=${WEB_PROXY_PORT_8081:-8081}
      - EXTERNAL_PROMETHEUS_PORT=${WEB_PROXY_PORT_8082:-8082}
      - EXTERNAL_KIBANA_PORT=${WEB_PROXY_PORT_8083:-8083}
    volumes:
      - ./private-server/argus/etc:/private/argus/etc
    networks: [argus-sys-net]
    restart: unless-stopped

  web-proxy:
    image: ${WEB_PROXY_IMAGE_TAG:-argus-web-proxy:latest}
    container_name: argus-web-proxy
    depends_on: [bind, master, grafana, prometheus, kibana, alertmanager]
    environment:
      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
    volumes:
      - ./private-server/argus/etc:/private/argus/etc
    networks: [argus-sys-net]
    ports:
      - "${WEB_PROXY_PORT_8080:-8080}:8080"
      - "${WEB_PROXY_PORT_8081:-8081}:8081"
      - "${WEB_PROXY_PORT_8082:-8082}:8082"
      - "${WEB_PROXY_PORT_8083:-8083}:8083"
      - "${WEB_PROXY_PORT_8084:-8084}:8084"
      - "${WEB_PROXY_PORT_8085:-8085}:8085"
    restart: unless-stopped
101
src/sys/swarm_tests/scripts/00_bootstrap.sh
Executable file
101
src/sys/swarm_tests/scripts/00_bootstrap.sh
Executable file
@ -0,0 +1,101 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||||
|
REPO_ROOT="$(cd "$ROOT/../../.." && pwd)"
|
||||||
|
|
||||||
|
ENV_FILE="$ROOT/.env"; [[ -f "$ENV_FILE" ]] || cp "$ROOT/.env.example" "$ENV_FILE"
|
||||||
|
|
||||||
|
# Load build user (UID/GID) from repo config to match container runtime users
|
||||||
|
if [[ -f "$REPO_ROOT/scripts/common/build_user.sh" ]]; then
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "$REPO_ROOT/scripts/common/build_user.sh" 2>/dev/null || true
|
||||||
|
if declare -f load_build_user >/dev/null 2>&1; then
|
||||||
|
load_build_user
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Capture resolved UID/GID from build_user before sourcing .env
|
||||||
|
uid_resolved="${ARGUS_BUILD_UID:-2133}"
|
||||||
|
gid_resolved="${ARGUS_BUILD_GID:-2015}"
|
||||||
|
echo "[BOOT] resolved build user: UID=${uid_resolved} GID=${gid_resolved} (from scripts/common/build_user.sh or env)"
|
||||||
|
|
||||||
|
# After resolving UID/GID, load .env for other settings; then we will overwrite UID/GID entries
|
||||||
|
set -a; source "$ENV_FILE"; set +a
|
||||||
|
|
||||||
|
echo "[BOOT] checking Docker Swarm"
|
||||||
|
if ! docker info 2>/dev/null | grep -q "Swarm: active"; then
|
||||||
|
echo "[BOOT] initializing swarm (single-node)"
|
||||||
|
docker swarm init >/dev/null 2>&1 || true
|
||||||
|
fi
|
||||||
|
|
||||||
|
NET_NAME=argus-sys-net
|
||||||
|
if docker network inspect "$NET_NAME" >/dev/null 2>&1; then
|
||||||
|
echo "[BOOT] overlay network exists: $NET_NAME"
|
||||||
|
else
|
||||||
|
echo "[BOOT] creating overlay network: $NET_NAME"
|
||||||
|
docker network create -d overlay --attachable "$NET_NAME"
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "[BOOT] preparing private directories (server/nodes)"
|
||||||
|
# Server-side dirs (align with sys/tests 01_bootstrap.sh)
|
||||||
|
mkdir -p \
|
||||||
|
"$ROOT/private-server/argus/etc" \
|
||||||
|
"$ROOT/private-server/argus/bind" \
|
||||||
|
"$ROOT/private-server/argus/master" \
|
||||||
|
"$ROOT/private-server/argus/metric/prometheus" \
|
||||||
|
"$ROOT/private-server/argus/metric/prometheus/data" \
|
||||||
|
"$ROOT/private-server/argus/metric/prometheus/rules" \
|
||||||
|
"$ROOT/private-server/argus/metric/prometheus/targets" \
|
||||||
|
"$ROOT/private-server/argus/alert/alertmanager" \
|
||||||
|
"$ROOT/private-server/argus/metric/ftp/share" \
|
||||||
|
"$ROOT/private-server/argus/metric/grafana/data" \
|
||||||
|
"$ROOT/private-server/argus/metric/grafana/logs" \
|
||||||
|
"$ROOT/private-server/argus/metric/grafana/plugins" \
|
||||||
|
"$ROOT/private-server/argus/metric/grafana/provisioning/datasources" \
|
||||||
|
"$ROOT/private-server/argus/metric/grafana/provisioning/dashboards" \
|
||||||
|
"$ROOT/private-server/argus/metric/grafana/data/sessions" \
|
||||||
|
"$ROOT/private-server/argus/metric/grafana/data/dashboards" \
|
||||||
|
"$ROOT/private-server/argus/metric/grafana/config" \
|
||||||
|
"$ROOT/private-server/argus/agent" \
|
||||||
|
"$ROOT/private-server/argus/log/elasticsearch" \
|
||||||
|
"$ROOT/private-server/argus/log/kibana"
|
||||||
|
|
||||||
|
mkdir -p "$ROOT/private-nodes/argus/agent"
|
||||||
|
|
||||||
|
uid="$uid_resolved"; gid="$gid_resolved"
|
||||||
|
echo "[BOOT] chown -R ${uid}:${gid} for server core dirs (best-effort)"
|
||||||
|
chown -R "$uid":"$gid" \
|
||||||
|
"$ROOT/private-server/argus/log/elasticsearch" \
|
||||||
|
"$ROOT/private-server/argus/log/kibana" \
|
||||||
|
"$ROOT/private-server/argus/metric/grafana" \
|
||||||
|
"$ROOT/private-server/argus/metric/prometheus" \
|
||||||
|
"$ROOT/private-server/argus/alert" \
|
||||||
|
"$ROOT/private-server/argus/metric/ftp" \
|
||||||
|
"$ROOT/private-server/argus/agent" \
|
||||||
|
"$ROOT/private-server/argus/etc" 2>/dev/null || true
|
||||||
|
|
||||||
|
# group-writable for etc/alert as in sys/tests
|
||||||
|
chmod -R g+w "$ROOT/private-server/argus/alert" "$ROOT/private-server/argus/etc" 2>/dev/null || true
|
||||||
|
|
||||||

# ensure .env carries the resolved UID/GID for compose env interpolation
if grep -q '^ARGUS_BUILD_UID=' "$ENV_FILE"; then
  sed -i "s/^ARGUS_BUILD_UID=.*/ARGUS_BUILD_UID=${uid}/" "$ENV_FILE"
else
  echo "ARGUS_BUILD_UID=${uid}" >> "$ENV_FILE"
fi
if grep -q '^ARGUS_BUILD_GID=' "$ENV_FILE"; then
  sed -i "s/^ARGUS_BUILD_GID=.*/ARGUS_BUILD_GID=${gid}/" "$ENV_FILE"
else
  echo "ARGUS_BUILD_GID=${gid}" >> "$ENV_FILE"
fi
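
# The grep/sed-or-append pattern above is a plain "upsert". If more keys need
# syncing later, a small helper keeps it in one place (sketch only, not used below):
#   upsert_env() { local k="$1" v="$2"
#     grep -q "^${k}=" "$ENV_FILE" \
#       && sed -i "s/^${k}=.*/${k}=${v}/" "$ENV_FILE" \
#       || echo "${k}=${v}" >> "$ENV_FILE"; }
#   upsert_env ARGUS_BUILD_UID "$uid"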

# distribute update-dns.sh
BIND_UPDATE_SRC="$REPO_ROOT/src/bind/build/update-dns.sh"
BIND_UPDATE_DEST="$ROOT/private-server/argus/etc/update-dns.sh"
if [[ -f "$BIND_UPDATE_SRC" ]]; then
  cp "$BIND_UPDATE_SRC" "$BIND_UPDATE_DEST" && chmod +x "$BIND_UPDATE_DEST" || true
fi

echo "[BOOT] done"
39
src/sys/swarm_tests/scripts/01_server_up.sh
Executable file
@ -0,0 +1,39 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
REPO_ROOT="$(cd "$ROOT/../../.." && pwd)"
ENV_FILE="$ROOT/.env"
# load UID/GID from repo config first (so they take precedence over any stale .env values)
if [[ -f "$REPO_ROOT/scripts/common/build_user.sh" ]]; then
  # shellcheck disable=SC1091
  source "$REPO_ROOT/scripts/common/build_user.sh" 2>/dev/null || true
  if declare -f load_build_user >/dev/null 2>&1; then
    load_build_user
  fi
fi
set -a; source "$ENV_FILE"; set +a

PROJECT="${SERVER_PROJECT:-argus-swarm-server}"
COMPOSE_FILE="$ROOT/docker-compose.server.yml"

echo "[SERVER] starting compose project: $PROJECT"
docker compose -p "$PROJECT" -f "$COMPOSE_FILE" up -d

echo "[SERVER] containers:"; docker compose -p "$PROJECT" -f "$COMPOSE_FILE" ps

# Optional post-start permission alignment (disabled by default). Enable with SWARM_FIX_PERMS=1
if [[ "${SWARM_FIX_PERMS:-0}" == "1" ]]; then
  echo "[SERVER] aligning permissions in containers (best-effort)"
  for c in argus-master-sys argus-prometheus argus-grafana argus-ftp argus-es-sys argus-kibana-sys argus-web-frontend argus-web-proxy argus-alertmanager; do
    docker exec "$c" sh -lc 'mkdir -p /private/argus && chmod -R 777 /private/argus' 2>/dev/null || true
  done
  echo "[SERVER] restarting selected supervised programs to pick up new permissions"
  docker exec argus-prometheus sh -lc 'supervisorctl restart prometheus targets-updater >/dev/null 2>&1 || true' || true
  docker exec argus-grafana sh -lc 'rm -f /private/argus/etc/grafana.metric.argus.com 2>/dev/null || true; supervisorctl restart grafana >/dev/null 2>&1 || true' || true
  docker exec argus-es-sys sh -lc 'supervisorctl restart elasticsearch >/dev/null 2>&1 || true' || true
  docker exec argus-kibana-sys sh -lc 'supervisorctl restart kibana >/dev/null 2>&1 || true' || true
fi

echo "[SERVER] done"
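A minimal invocation sketch, run from src/sys/swarm_tests after the bootstrap step; the permission-alignment pass is opt-in via the environment flag:

cd src/sys/swarm_tests
SWARM_FIX_PERMS=1 ./scripts/01_server_up.sh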
84
src/sys/swarm_tests/scripts/02_wait_ready.sh
Executable file
@ -0,0 +1,84 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
ENV_FILE="$ROOT/.env"; set -a; source "$ENV_FILE"; set +a

PROJECT="${SERVER_PROJECT:-argus-swarm-server}"
RETRIES=${RETRIES:-60}
SLEEP=${SLEEP:-5}

code() { curl -4 -s -o /dev/null -w "%{http_code}" "$1" || echo 000; }
prom_ok() {
  # Consider ready if TCP:9090 is accepting on localhost (host side)
  (exec 3<>"/dev/tcp/127.0.0.1/${PROMETHEUS_PORT:-9090}") >/dev/null 2>&1 && return 0
  return 1
}
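
# prom_ok probes with bash's /dev/tcp virtual file rather than curl because
# Prometheus may answer / with a redirect rather than 200 while still being up.
# An HTTP-level alternative, if preferred, is its readiness endpoint:
#   code "http://127.0.0.1:${PROMETHEUS_PORT:-9090}/-/ready"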
echo "[READY] waiting services (max $((RETRIES*SLEEP))s)"
|
||||||
|
for i in $(seq 1 "$RETRIES"); do
|
||||||
|
e1=$(code "http://127.0.0.1:${MASTER_PORT:-32300}/readyz")
|
||||||
|
e2=$(code "http://127.0.0.1:${ES_HTTP_PORT:-9200}/_cluster/health")
|
||||||
|
e3=000
|
||||||
|
if prom_ok; then e3=200; fi
|
||||||
|
e4=$(code "http://127.0.0.1:${GRAFANA_PORT:-3000}/api/health")
|
||||||
|
e5=$(code "http://127.0.0.1:${KIBANA_PORT:-5601}/api/status")
|
||||||
|
ok=0
|
||||||
|
[[ "$e1" == 200 ]] && ok=$((ok+1))
|
||||||
|
[[ "$e2" == 200 ]] && ok=$((ok+1))
|
||||||
|
[[ "$e3" == 200 ]] && ok=$((ok+1))
|
||||||
|
[[ "$e4" == 200 ]] && ok=$((ok+1))
|
||||||
|
# Kibana 可放宽,等其它四项即可
|
||||||
|
if [[ $ok -ge 4 ]]; then echo "[READY] base services OK"; break; fi
|
||||||
|
echo "[..] waiting ($i/$RETRIES): master=$e1 es=$e2 prom=$e3 graf=$e4 kibana=$e5"; sleep "$SLEEP"
|
||||||
|
done
|
||||||
|
|
||||||
|
if [[ $ok -lt 4 ]]; then echo "[ERROR] services not ready" >&2; exit 1; fi
|
||||||
|
|
||||||
|
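
# Defaults give a 300s budget (60 tries x 5s). Both knobs are env-overridable,
# e.g. for slower hosts:
#   RETRIES=120 SLEEP=5 ./scripts/02_wait_ready.sh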
echo "[READY] resolving overlay IPs"
|
||||||
|
BINDIP=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' argus-bind-sys)
|
||||||
|
FTPIP=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' argus-ftp)
|
||||||
|
echo "BINDIP=$BINDIP FTPIP=$FTPIP"
|
||||||
|
|
||||||
|
ENV_NODES="$ROOT/.env.nodes"
|
||||||
|
cat > "$ENV_NODES" <<EOF
|
||||||
|
BINDIP=$BINDIP
|
||||||
|
FTPIP=$FTPIP
|
||||||
|
MASTER_ENDPOINT=http://master.argus.com:3000
|
||||||
|
FTP_USER=ftpuser
|
||||||
|
FTP_PASSWORD=ZGClab1234!
|
||||||
|
AGENT_ENV=dev2
|
||||||
|
AGENT_USER=yuyr
|
||||||
|
AGENT_INSTANCE=node001sX
|
||||||
|
EOF
|
||||||
|
|
||||||
|
echo "[READY] wrote $ENV_NODES"
|
||||||
|
|
||||||
|
# Inline: fix domain records -> actual overlay IPs and reload bind/nginx (best-effort)
echo "[READY] fixing domain records to overlay IPs"
ETC_DIR="$ROOT/private-server/argus/etc"; mkdir -p "$ETC_DIR"
declare -A MAP
MAP[web-frontend]=web.argus.com
MAP[argus-grafana]=grafana.metric.argus.com
MAP[argus-prometheus]=prom.metric.argus.com
MAP[argus-kibana-sys]=kibana.log.argus.com
MAP[argus-alertmanager]=alertmanager.alert.argus.com
MAP[argus-master-sys]=master.argus.com
changed=0
for cname in "${!MAP[@]}"; do
  domain="${MAP[$cname]}"; fpath="$ETC_DIR/$domain"
  ip=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' "$cname" 2>/dev/null || true)
  [[ -z "$ip" ]] && { echo "[DNS-FIX][WARN] $domain: container $cname no overlay IP yet"; continue; }
  cur=$(cat "$fpath" 2>/dev/null || echo "")
  if [[ "$cur" != "$ip" ]]; then
    echo "$ip" > "$fpath"; echo "[DNS-FIX][SET] $domain = $ip (was: ${cur:-<empty>})"; changed=1
  else
    echo "[DNS-FIX][OK] $domain already $ip"
  fi
done
if [[ $changed -eq 1 ]]; then
  docker exec argus-bind-sys /usr/local/bin/reload-bind9.sh >/dev/null 2>&1 || true
  sleep 1
fi
docker exec argus-web-proxy nginx -s reload >/dev/null 2>&1 || true
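A hedged spot check that the record files and DNS agree after this script runs (assumes dig is present in the bind container):

docker exec argus-bind-sys dig +short master.argus.com @127.0.0.1
cat src/sys/swarm_tests/private-server/argus/etc/master.argus.com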
16
src/sys/swarm_tests/scripts/03_nodes_up.sh
Executable file
@ -0,0 +1,16 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
ENV_FILE="$ROOT/.env"; set -a; source "$ENV_FILE"; set +a
ENV_NODES_FILE="$ROOT/.env.nodes"; set -a; source "$ENV_NODES_FILE"; set +a

PROJECT="${NODES_PROJECT:-argus-swarm-nodes}"
COMPOSE_FILE="$ROOT/docker-compose.nodes.yml"

echo "[NODES] starting compose project: $PROJECT"
docker compose -p "$PROJECT" --env-file "$ENV_NODES_FILE" -f "$COMPOSE_FILE" up -d
docker compose -p "$PROJECT" -f "$COMPOSE_FILE" ps
echo "[NODES] done"
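The scripts are intended to run in numeric order; a full pass looks roughly like this (04_metric_verify.sh and 99_down.sh follow below):

./scripts/01_server_up.sh
./scripts/02_wait_ready.sh   # writes .env.nodes consumed here
./scripts/03_nodes_up.sh
./scripts/04_metric_verify.sh
./scripts/99_down.sh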
173
src/sys/swarm_tests/scripts/04_metric_verify.sh
Executable file
@ -0,0 +1,173 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

ENV_FILE="$ROOT/.env"; [[ -f "$ENV_FILE" ]] && { set -a; source "$ENV_FILE"; set +a; }

PROM_PORT="${PROMETHEUS_PORT:-9090}"
GRAF_PORT="${GRAFANA_PORT:-3000}"
GRAF_URL="http://127.0.0.1:${GRAF_PORT}"
PROM_DOMAIN="prom.metric.argus.com:${PROM_PORT}"

err() { echo "[ERR] $*" >&2; }
ok()  { echo "[OK] $*"; }
info(){ echo "[INFO] $*"; }

fail() { err "$*"; exit 1; }

# Ensure fluent-bit is installed, configured and running to ship logs to ES
# Best-effort remediation for swarm_tests only (does not change repo sources)
ensure_fluentbit() {
  local cname="$1"
  # 1) ensure process exists or try local bundle installer
  if ! docker exec "$cname" pgrep -x fluent-bit >/dev/null 2>&1; then
    docker exec "$cname" bash -lc '
      set -e
      root=/opt/argus-metric/versions
      ver=$(ls -1 "$root" 2>/dev/null | sort -Vr | head -1 || true)
      [[ -z "$ver" ]] && ver=1.42.0
      verdir="$root/$ver"
      tb=$(ls -1 "$verdir"/fluent-bit-*.tar.gz 2>/dev/null | head -1 || true)
      if [ -n "$tb" ]; then tmp=$(mktemp -d); tar -xzf "$tb" -C "$tmp"; sub=$(find "$tmp" -mindepth 1 -maxdepth 1 -type d | head -n1 || true); [ -n "$sub" ] && (cd "$sub" && ./install.sh "$verdir") || true; fi
    ' >/dev/null 2>&1 || true
  fi
  # 2) patch configs using literal placeholders with safe delimiter
  docker exec "$cname" bash -lc '
    set -e
    f=/etc/fluent-bit/fluent-bit.conf
    o=/etc/fluent-bit/outputs.d/10-es.conf
    LCL="\${CLUSTER}"; LRA="\${RACK}"; LHN="\${HOSTNAME}"; EH="\${ES_HOST:-localhost}"; EP="\${ES_PORT:-9200}"
    # record_modifier placeholders
    if grep -q "Record cluster $LCL" "$f"; then sed -i "s|Record cluster $LCL|Record cluster local|" "$f"; fi
    if grep -q "Record rack $LRA" "$f"; then sed -i "s|Record rack $LRA|Record rack dev|" "$f"; fi
    if grep -q "Record host $LHN" "$f"; then hn=$(hostname); sed -i "s|Record host $LHN|Record host ${hn}|" "$f"; fi
    # outputs placeholders
    if [ -f "$o" ] && (grep -q "$EH" "$o" || grep -q "$EP" "$o"); then
      sed -i "s|Host $EH|Host es.log.argus.com|g; s|Port $EP|Port 9200|g" "$o"
    fi
  ' >/dev/null 2>&1 || true
  # 3) restart fluent-bit (best-effort) and wait
  docker exec "$cname" bash -lc 'pkill -x fluent-bit >/dev/null 2>&1 || true; sleep 1; setsid su -s /bin/bash fluent-bit -c "/opt/fluent-bit/bin/fluent-bit --config=/etc/fluent-bit/fluent-bit.conf >> /var/log/fluent-bit.log 2>&1" &>/dev/null & echo ok' >/dev/null 2>&1 || true
  for i in {1..10}; do if docker exec "$cname" pgrep -x fluent-bit >/dev/null 2>&1; then return 0; fi; sleep 1; done
  echo "[WARN] fluent-bit not confirmed running; log pipeline may not ingest" >&2
}

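# To see what ensure_fluentbit actually changed, the patched files can be dumped
# from the node container (paths as configured above; sketch, run manually):
#   docker exec "$NODE_CONT" sh -lc 'cat /etc/fluent-bit/fluent-bit.conf /etc/fluent-bit/outputs.d/10-es.conf'
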
# ---- Grafana /api/health ----
info "Grafana /api/health"
HEALTH_JSON="$ROOT/tmp/metric-verify/graf_health.json"
mkdir -p "$(dirname "$HEALTH_JSON")"
code=$(curl -fsS -o "$HEALTH_JSON" -w '%{http_code}' --max-time 10 "$GRAF_URL/api/health" || true)
[[ "$code" == 200 ]] || fail "/api/health HTTP $code"
if grep -q '"database"\s*:\s*"ok"' "$HEALTH_JSON"; then ok "grafana health database=ok"; else fail "grafana health not ok: $(cat "$HEALTH_JSON")"; fi

# ---- Grafana datasource points to prom domain ----
info "Grafana datasource URL uses domain: $PROM_DOMAIN"
DS_FILE="/private/argus/metric/grafana/provisioning/datasources/datasources.yml"
if ! docker exec argus-grafana sh -lc "test -f $DS_FILE" >/dev/null 2>&1; then
  DS_FILE="/etc/grafana/provisioning/datasources/datasources.yml"
fi
docker exec argus-grafana sh -lc "grep -E 'url:\s*http://$PROM_DOMAIN' '$DS_FILE'" >/dev/null 2>&1 || fail "datasource not pointing to $PROM_DOMAIN"
ok "datasource points to domain"

# ---- DNS resolution inside grafana ----
info "bind resolution inside grafana"
tries=0
until docker exec argus-grafana getent hosts prom.metric.argus.com >/dev/null 2>&1; do
  tries=$((tries+1)); (( tries > 24 )) && fail "grafana cannot resolve prom.metric.argus.com"
  echo "[..] waiting DNS propagation in grafana ($tries/24)"; sleep 5
done
ok "domain resolves"

# ---- Prometheus activeTargets down check ----
info "Prometheus activeTargets health"
targets_json="$ROOT/tmp/metric-verify/prom_targets.json"
curl -fsS "http://127.0.0.1:${PROM_PORT}/api/v1/targets" -o "$targets_json" || { echo "[WARN] fetch targets failed" >&2; }
down_all=""
if command -v jq >/dev/null 2>&1; then
  down_all=$(jq -r '.data.activeTargets[] | select(.health=="down") | .scrapeUrl' "$targets_json" 2>/dev/null || true)
else
  # without jq we cannot reliably pair health with scrapeUrl; fall back to a coarse signal
  grep -q '"health":"down"' "$targets_json" 2>/dev/null && down_all="(one or more targets down; install jq for per-target detail)"
fi
# ignore dcgm-exporter(9400) and tolerate node-exporter(9100) in swarm tests
down_filtered=$(echo "$down_all" | grep -Ev ':(9400|9100)/' || true)
if [[ -n "$down_filtered" ]]; then
  err "prometheus down targets (filtered):"; echo "$down_filtered" >&2
else
  ok "prometheus targets up (ignoring :9100 and :9400)"
fi

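# With jq available, a one-line health summary of the same payload helps when
# debugging, e.g.:
#   jq -r '.data.activeTargets | group_by(.health) | map("\(.[0].health)=\(length)") | join(" ")' "$targets_json"
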
# ---- nodes.json sanity: avoid 172.22/16 (gwbridge) ----
nodes_json="$ROOT/private-server/argus/metric/prometheus/nodes.json"
if [[ -f "$nodes_json" ]] && grep -q '"ip"\s*:\s*"172\.22\.' "$nodes_json"; then
  fail "nodes.json contains 172.22/16 addresses (gwbridge)"
fi
ok "nodes.json IPs look fine"

echo "[DONE] metric verify"

# ---- Log pipeline smoke test (adapted from sys/tests 07) ----
info "Log pipeline: send logs in node container and assert ES counts"

ES_PORT="${ES_HTTP_PORT:-9200}"
KIBANA_PORT="${KIBANA_PORT:-5601}"

get_count() {
  local idx="$1"; local tmp; tmp=$(mktemp)
  local code
  code=$(curl -s -o "$tmp" -w "%{http_code}" "http://127.0.0.1:${ES_PORT}/${idx}/_count?ignore_unavailable=true&allow_no_indices=true" || true)
  if [[ "$code" == "200" ]]; then
    local val
    val=$(jq -r '(.count // 0) | tonumber? // 0' "$tmp" 2>/dev/null || echo 0)
    echo "$val"
  else
    echo 0
  fi
  rm -f "$tmp"
}

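# get_count wraps the ES _count API; the equivalent manual query, for reference:
#   curl -s "http://127.0.0.1:${ES_PORT}/train-*/_count?ignore_unavailable=true" | jq .count
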
train0=$(get_count "train-*")
infer0=$(get_count "infer-*")
base=$((train0 + infer0))
info "initial ES counts: train=${train0} infer=${infer0} total=${base}"

send_logs() {
  local cname="$1"; local hosttag="$2"
  docker exec "$cname" sh -lc 'mkdir -p /logs/train /logs/infer'
  docker exec "$cname" sh -lc "ts=\$(date '+%F %T'); echo \"\$ts INFO [$hosttag] training step=1 loss=1.23 model=bert\" >> /logs/train/train-demo.log"
  docker exec "$cname" sh -lc "ts=\$(date '+%F %T'); echo \"\$ts INFO [$hosttag] training step=2 loss=1.10 model=bert\" >> /logs/train/train-demo.log"
  docker exec "$cname" sh -lc "ts=\$(date '+%F %T'); echo \"\$ts WARN [$hosttag] inference slow on batch=2 latency=1.9s\" >> /logs/infer/infer-demo.log"
}

NODE_CONT="${SWARM_NODE_CNAME:-argus-metric-test-node-swarm}"
ensure_fluentbit "$NODE_CONT"
send_logs "$NODE_CONT" "swarm-node"

info "waiting for ES to ingest..."
|
||||||
|
curl -s -X POST "http://127.0.0.1:${ES_PORT}/train-*/_refresh" >/dev/null 2>&1 || true
|
||||||
|
curl -s -X POST "http://127.0.0.1:${ES_PORT}/infer-*/_refresh" >/dev/null 2>&1 || true
|
||||||
|
|
||||||
|
final=0; threshold=3
|
||||||
|
for attempt in {1..60}; do
|
||||||
|
train1=$(get_count "train-*"); infer1=$(get_count "infer-*"); final=$((train1 + infer1))
|
||||||
|
if (( final > base && final >= threshold )); then break; fi
|
||||||
|
echo "[..] waiting ES counts increase to >=${threshold} ($attempt/60) current=${final} base=${base}"; \
|
||||||
|
curl -s -X POST "http://127.0.0.1:${ES_PORT}/train-*/_refresh" >/dev/null 2>&1 || true; \
|
||||||
|
curl -s -X POST "http://127.0.0.1:${ES_PORT}/infer-*/_refresh" >/dev/null 2>&1 || true; \
|
||||||
|
sleep 2
|
||||||
|
done
|
||||||
|
info "final ES counts: train=${train1} infer=${infer1} total=${final}"
|
||||||
|
|
||||||
|
(( final > base )) || fail "ES total did not increase (${base} -> ${final})"
|
||||||
|
(( final >= threshold )) || fail "ES total below expected threshold: ${final} < ${threshold}"
|
||||||
|
|
||||||
|
es_health=$(curl -s "http://127.0.0.1:${ES_PORT}/_cluster/health" | grep -o '"status":"[^\"]*"' | cut -d'"' -f4)
|
||||||
|
[[ "$es_health" == green || "$es_health" == yellow ]] || fail "ES health not green/yellow: $es_health"
|
||||||
|
|
||||||
|
if ! curl -fs "http://127.0.0.1:${KIBANA_PORT}/api/status" >/dev/null 2>&1; then
|
||||||
|
echo "[WARN] Kibana status endpoint not available" >&2
|
||||||
|
fi
|
||||||
|
|
||||||
|
ok "log pipeline verified"
|
||||||
21
src/sys/swarm_tests/scripts/99_down.sh
Executable file
@ -0,0 +1,21 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
ENV_FILE="$ROOT/.env"; set -a; source "$ENV_FILE"; set +a

echo "[DOWN] stopping nodes compose"
docker compose -p "${NODES_PROJECT:-argus-swarm-nodes}" -f "$ROOT/docker-compose.nodes.yml" down --remove-orphans || true

echo "[DOWN] stopping server compose"
docker compose -p "${SERVER_PROJECT:-argus-swarm-server}" -f "$ROOT/docker-compose.server.yml" down --remove-orphans || true

echo "[DOWN] removing overlay network"
docker network rm argus-sys-net >/dev/null 2>&1 || true

echo "[DOWN] cleanup temp files"
rm -rf "$ROOT/private-server/tmp" "$ROOT/private-nodes/tmp" 2>/dev/null || true

echo "[DOWN] done"
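The teardown deliberately leaves the swarm itself initialized so repeated runs can skip re-init. On single-node test hosts that should fully reset, one extra destructive step does it:

docker swarm leave --force   # removes this manager from the swarm; use with care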
5
src/sys/swarm_tests/tmp/metric-verify.graf_health.json
Normal file
@ -0,0 +1,5 @@
{
  "commit": "5b85c4c2fcf5d32d4f68aaef345c53096359b2f1",
  "database": "ok",
  "version": "11.1.0"
}
5
src/sys/swarm_tests/tmp/metric-verify/graf_health.json
Normal file
@ -0,0 +1,5 @@
{
  "commit": "5b85c4c2fcf5d32d4f68aaef345c53096359b2f1",
  "database": "ok",
  "version": "11.1.0"
}
1
src/sys/swarm_tests/tmp/metric-verify/prom_targets.json
Normal file
@ -0,0 +1 @@
{"status":"success","data":{"activeTargets":[{"discoveredLabels":{"__address__":"10.0.1.13:9400","__meta_filepath":"/private/argus/metric/prometheus/targets/dcgm_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.13","job":"dcgm","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.13","job":"dcgm","node_id":"A1","user_id":"yuyr"},"scrapePool":"dcgm","scrapeUrl":"http://10.0.1.13:9400/metrics","globalUrl":"http://10.0.1.13:9400/metrics","lastError":"","lastScrape":"2025-11-06T16:36:25.585236213+08:00","lastScrapeDuration":0.002520163,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"},{"discoveredLabels":{"__address__":"10.0.1.13:9100","__meta_filepath":"/private/argus/metric/prometheus/targets/node_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.13","job":"node","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.13","job":"node","node_id":"A1","user_id":"yuyr"},"scrapePool":"node","scrapeUrl":"http://10.0.1.13:9100/metrics","globalUrl":"http://10.0.1.13:9100/metrics","lastError":"","lastScrape":"2025-11-06T16:36:33.694723606+08:00","lastScrapeDuration":0.021800606,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"}],"droppedTargets":[],"droppedTargetCounts":{"dcgm":0,"node":0}}}
1
src/sys/swarm_tests/tmp/targets.json
Normal file
@ -0,0 +1 @@
{"status":"success","data":{"activeTargets":[{"discoveredLabels":{"__address__":"10.0.1.15:9400","__meta_filepath":"/private/argus/metric/prometheus/targets/dcgm_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.15","job":"dcgm","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.15","job":"dcgm","node_id":"A1","user_id":"yuyr"},"scrapePool":"dcgm","scrapeUrl":"http://10.0.1.15:9400/metrics","globalUrl":"http://10.0.1.15:9400/metrics","lastError":"","lastScrape":"2025-11-06T15:47:37.200098366+08:00","lastScrapeDuration":0.001361528,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"},{"discoveredLabels":{"__address__":"10.0.1.15:9100","__meta_filepath":"/private/argus/metric/prometheus/targets/node_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.15","job":"node","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.15","job":"node","node_id":"A1","user_id":"yuyr"},"scrapePool":"node","scrapeUrl":"http://10.0.1.15:9100/metrics","globalUrl":"http://10.0.1.15:9100/metrics","lastError":"","lastScrape":"2025-11-06T15:47:40.184367879+08:00","lastScrapeDuration":0.02923333,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"}],"droppedTargets":[],"droppedTargetCounts":{"dcgm":0,"node":0}}}
@ -92,6 +92,20 @@ while :; do
WAITED=$((WAITED+1))
done

# Quick upstream reachability snapshot (best-effort; does not block startup)
declare -a _UPSTREAMS=(
  "http://web.argus.com:8080/"
  "http://grafana.metric.argus.com:3000/api/health"
  "http://prom.metric.argus.com:9090/-/ready"
  "http://kibana.log.argus.com:5601/api/status"
  "http://alertmanager.alert.argus.com:9093/api/v2/status"
  "http://master.argus.com:3000/readyz"
)
for u in "${_UPSTREAMS[@]}"; do
  code=$(curl -4 -s -o /dev/null -w "%{http_code}" "$u" || echo 000)
  echo "[INFO] upstream check: $u -> $code"
done

echo "[INFO] Launching nginx..."
|
echo "[INFO] Launching nginx..."
|
||||||
|
|
||||||
# 启动 nginx 前台模式
|
# 启动 nginx 前台模式
|
||||||