[#37] 增加sys/swarm_tests(cpu) ;单独构建的node bundle镜像
This commit is contained in:
parent
94b3e910b3
commit
d1fad4a05a
98
deployment/build/build_images.sh
Executable file
98
deployment/build/build_images.sh
Executable file
@ -0,0 +1,98 @@
|
||||
#!/usr/bin/env bash
# Build Argus images.
#
# With --with-node-bundle the script stages the metric client tarball (and a
# setup.sh when one is available) into src/sys/build/node-bundle/bundle and
# builds argus-sys-metric-test-node-bundle:latest on top of --base-image.
#
# --client-version accepts either an artifact version (e.g. 1.42.0) or a
# packaging date (YYYYMMDD) that is resolved through
# deployment/artifact/client/<date>/argus-metric_<date>.tar.gz.
#
# err, log, require_cmd and today_version are provided by common.sh.
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
. "$ROOT_DIR/deployment/build/common.sh"

usage() {
  cat <<EOF
Build Argus images (optional node-bundle)

Usage: build_images.sh [--with-node-bundle] [--client-version YYYYMMDD] [--base-image NAME[:TAG]]

Examples:
./deployment/build/build_images.sh --with-node-bundle --client-version 20251106
EOF
}

# Print the first "version" value found in the given version.json (GNU sed).
# Both detection paths use this helper so they agree on which key wins.
read_json_version() {
  sed -n 's/.*"version"\s*:\s*"\([^"]\+\)".*/\1/p' "$1" | head -n1
}

WITH_BUNDLE=false
CLIENT_VERSION=""
BASE_IMAGE="argus-sys-metric-test-node:latest"

while [[ $# -gt 0 ]]; do
  case "$1" in
    --with-node-bundle)
      WITH_BUNDLE=true
      shift
      ;;
    --client-version|--base-image)
      # Without this check a trailing option aborts with "unbound variable"
      # under `set -u` instead of a usable message.
      if [[ $# -lt 2 ]]; then
        err "missing value for $1"
        usage
        exit 1
      fi
      if [[ "$1" == "--client-version" ]]; then
        CLIENT_VERSION="$2"
      else
        BASE_IMAGE="$2"
      fi
      shift 2
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      err "unknown arg: $1"
      usage
      exit 1
      ;;
  esac
done

if [[ "$WITH_BUNDLE" == true ]]; then
  require_cmd docker tar gzip
  BUNDLE_DIR="$ROOT_DIR/src/sys/build/node-bundle"
  CTX_DIR="$BUNDLE_DIR"
  TMP_BUNDLE="$BUNDLE_DIR/bundle"

  # Only drop stale client tarballs. The bundle directory also holds files that
  # are committed to the repository (.gitignore, setup.sh), so wiping the whole
  # directory would delete tracked files on every build.
  mkdir -p "$TMP_BUNDLE"
  rm -f -- "$TMP_BUNDLE"/argus-metric_*.tar.gz

  # Build or locate client artifact
  PLUGIN_DIR="$ROOT_DIR/src/metric/client-plugins/all-in-one-full"

  if [[ -z "$CLIENT_VERSION" ]]; then
    # No version requested: package now and use the artifact that was written
    # most recently. Globs sort lexicographically ("1.9.0" after "1.42.0"), so
    # the newest file is selected by modification time instead.
    pushd "$PLUGIN_DIR" >/dev/null
    bash scripts/package_artifact.sh --force
    newest_json=""
    for candidate in artifact/*/version.json; do
      [[ -f "$candidate" ]] || continue
      if [[ -z "$newest_json" || "$candidate" -nt "$newest_json" ]]; then
        newest_json="$candidate"
      fi
    done
    if [[ -n "$newest_json" ]]; then
      CLIENT_VERSION=$(read_json_version "$newest_json")
    fi
    popd >/dev/null
    [[ -n "$CLIENT_VERSION" ]] || { err "failed to detect client version"; exit 1; }
  elif [[ "$CLIENT_VERSION" =~ ^[0-9]{8}$ ]]; then
    # Packaging date: read the real artifact version from inside the tarball.
    PKG_DIR="$ROOT_DIR/deployment/artifact/client/$CLIENT_VERSION"
    TAR_PKG="$PKG_DIR/argus-metric_${CLIENT_VERSION}.tar.gz"
    [[ -f "$TAR_PKG" ]] || { err "client date package not found: $TAR_PKG"; exit 1; }

    tmpd=$(mktemp -d)
    trap 'rm -rf -- "$tmpd"' EXIT
    tar -xzf "$TAR_PKG" -C "$tmpd"
    if [[ ! -f "$tmpd/version.json" ]]; then
      err "version.json missing in client date package"
      exit 1
    fi
    ART_VER=$(read_json_version "$tmpd/version.json")
    [[ -n "$ART_VER" ]] || { err "failed to parse artifact version from date package"; exit 1; }
    CLIENT_VERSION="$ART_VER"

    # The date package itself becomes the bundled tarball, renamed to the
    # artifact-version form the rest of this script looks for.
    cp "$TAR_PKG" "$TMP_BUNDLE/argus-metric_${ART_VER//./_}.tar.gz"
    # setup.sh is optional; copying it is deliberately best-effort.
    if [[ -f "$PKG_DIR/setup.sh" ]]; then
      cp "$PKG_DIR/setup.sh" "$TMP_BUNDLE/" || true
    fi
  else
    # Anything else is treated as an artifact version directory name.
    pushd "$PLUGIN_DIR" >/dev/null
    [[ -d "artifact/$CLIENT_VERSION" ]] || bash scripts/package_artifact.sh --force
    popd >/dev/null
  fi

  # If the date-package path did not already stage a tarball, take it from the
  # plugin artifact directory.
  TAR_NAME="argus-metric_${CLIENT_VERSION//./_}.tar.gz"
  if [[ ! -f "$TMP_BUNDLE/$TAR_NAME" ]]; then
    SRC_TAR="$PLUGIN_DIR/artifact/$CLIENT_VERSION/$TAR_NAME"
    [[ -f "$SRC_TAR" ]] || { err "missing client tar: $SRC_TAR"; exit 1; }
    cp "$SRC_TAR" "$TMP_BUNDLE/"
    # also include today's setup.sh for fallback (best-effort)
    TODAY_SETUP="$ROOT_DIR/deployment/artifact/client/$(today_version)/setup.sh"
    if [[ -f "$TODAY_SETUP" ]]; then
      cp "$TODAY_SETUP" "$TMP_BUNDLE/" || true
    fi
  fi

  log "Building node-bundle image with client version: $CLIENT_VERSION"
  DOCKER_BUILDKIT=0 docker build \
    --build-arg CLIENT_VER="$CLIENT_VERSION" \
    --build-arg BASE_IMAGE="$BASE_IMAGE" \
    -t argus-sys-metric-test-node-bundle:latest \
    -f "$BUNDLE_DIR/Dockerfile" "$CTX_DIR"
  log "Built image: argus-sys-metric-test-node-bundle:latest"
fi

log "Done."
|
||||
103
deployment/build/templates/scripts/fix-prom-targets-overlay.sh
Normal file
103
deployment/build/templates/scripts/fix-prom-targets-overlay.sh
Normal file
@ -0,0 +1,103 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# Quick fix tool: replace 172.22/16 targets in nodes.json with overlay IPs resolved from hostname.
|
||||
# Usage: run on server package host: scripts/fix-prom-targets-overlay.sh
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
NODES_JSON="$ROOT/private/argus/metric/prometheus/nodes.json"
|
||||
|
||||
# Exit the script with status 1 and an error on stderr unless the command
# named by $1 can be found with `command -v`.
require_cmd() {
  if ! command -v "$1" >/dev/null 2>&1; then
    echo "[ERROR] missing command: $1" >&2
    exit 1
  fi
}
|
||||
|
||||
# Copy the file given as $1 next to itself as <name>_bak_<UTC timestamp>.json
# (a trailing ".json" is stripped from the original name first).
backup() {
  local src="$1"
  local stamp dest
  stamp=$(date -u +%Y%m%d-%H%M%SZ)
  dest="${src%.json}_bak_${stamp}.json"
  cp "$src" "$dest"
}
|
||||
|
||||
# Print the preferred address for a hostname, one line on stdout:
#   1. the first address starting with "10."
#   2. otherwise the first address starting with "172.31."
#   3. otherwise the first address `getent hosts` returned
# Prints nothing when the name does not resolve.
#
# The host is resolved once and the loop reads from a process substitution.
# A `while` fed by a pipe runs in a subshell, where `return` only leaves the
# subshell and the function would go on to print further addresses.
prefer_overlay_ip() {
  local host="$1"
  local -a ips=()
  local ip

  while read -r ip _; do
    if [[ -n "$ip" ]]; then
      ips+=("$ip")
    fi
  done < <(getent hosts "$host" || true)

  if (( ${#ips[@]} == 0 )); then
    return 0
  fi

  for ip in "${ips[@]}"; do
    if [[ "$ip" == 10.* ]]; then
      printf '%s\n' "$ip"
      return 0
    fi
  done

  for ip in "${ips[@]}"; do
    if [[ "$ip" == 172.31.* ]]; then
      printf '%s\n' "$ip"
      return 0
    fi
  done

  # fallback: first address returned
  printf '%s\n' "${ips[0]}"
}
|
||||
|
||||
require_cmd awk
require_cmd sed
require_cmd python3

if [[ ! -f "$NODES_JSON" ]]; then
  echo "[WARN] nodes.json not found: $NODES_JSON" >&2
  exit 0
fi

tmp=$(mktemp)
trap 'rm -f "$tmp"' EXIT

# The embedded Python rewrites 172.22.0.0/16 targets using the addresses the
# hostname resolves to (10.x first, then 172.31.x, then the first address).
# It writes the resulting JSON to "$tmp" and prints CHANGED or UNCHANGED on
# stdout, which is captured into $marker.
if ! marker=$(python3 - "$NODES_JSON" "$tmp" <<'PY'
import ipaddress
import json
import socket
import sys

src, dst = sys.argv[1], sys.argv[2]
with open(src, encoding="utf-8") as fh:
    data = json.load(fh)


def resolve(host):
    try:
        infos = socket.getaddrinfo(host, None, family=socket.AF_INET)
    except OSError:
        return None
    ips = [info[4][0] for info in infos]
    # prefer 10. over 172.31.
    for ip in ips:
        if ip.startswith('10.'):
            return ip
    for ip in ips:
        if ip.startswith('172.31.'):
            return ip
    return ips[0] if ips else None


gw = ipaddress.ip_network('172.22.0.0/16')
out = []
changed = False
for item in data:
    if isinstance(item, dict):
        ip = item.get('ip')
        host = item.get('hostname') or ''
        try:
            bad = bool(ip) and ipaddress.ip_address(ip) in gw
        except ValueError:
            bad = False
        if bad and host:
            new = resolve(host)
            # Only accept a replacement that actually leaves the gwbridge range.
            if new and ipaddress.ip_address(new) not in gw:
                item = dict(item)
                item['ip'] = new
                changed = True
    out.append(item)

with open(dst, 'w', encoding='utf-8') as fh:
    json.dump(out, fh, ensure_ascii=False)
print('CHANGED' if changed else 'UNCHANGED')
PY
); then
  echo "[ERROR] failed to rewrite nodes.json" >&2
  exit 1
fi

if grep -q '"ip"\s*:\s*"172\.22\.' "$tmp"; then
  echo "[WARN] some gwbridge targets remain; manual fix may be required" >&2
fi

if [[ "$marker" != "CHANGED" ]]; then
  echo "[OK] nodes.json unchanged; nothing to do"
  exit 0
fi

backup "$NODES_JSON"
# Write through the existing file rather than `mv`-ing the temp file over it:
# mktemp creates the file with mode 0600 and the current owner, and replacing
# nodes.json with it would change the target file's mode and ownership.
cat "$tmp" > "$NODES_JSON"
echo "[OK] nodes.json updated"

# try to reload Prometheus
# Container names are captured first; `docker ps | grep -q` under pipefail can
# report failure when grep exits before docker finishes writing.
container_names=$(docker ps --format '{{.Names}}' 2>/dev/null || true)
if grep -qx 'argus-prometheus' <<<"$container_names"; then
  if docker exec argus-prometheus sh -lc 'pidof prometheus >/dev/null 2>&1 && kill -HUP $(pidof prometheus) || supervisorctl restart prometheus' >/dev/null 2>&1; then
    echo "[INFO] Prometheus reloaded"
  else
    echo "[WARN] could not reload Prometheus; reload it manually" >&2
  fi
fi

exit 0
|
||||
|
||||
@ -155,6 +155,34 @@ gfcode=$(http_code "http://localhost:${GRAFANA_PORT:-3000}/api/health"); [[ "$gf
|
||||
# Deduplicate errors
|
||||
sort -u -o "$ERRORS" "$ERRORS"
|
||||
|
||||
# --- Prometheus targets & nodes.json checks ---
|
||||
section PROMETHEUS-TARGETS
|
||||
nodes_json_path="$ROOT/private/argus/metric/prometheus/nodes.json"
|
||||
if [[ -f "$nodes_json_path" ]]; then
|
||||
logd "nodes.json present: $nodes_json_path"
|
||||
# detect gwbridge addresses (172.22/16)
|
||||
if grep -E '"ip"\s*:\s*"172\.22\.' "$nodes_json_path" >/dev/null 2>&1; then
|
||||
append_err "[prometheus][targets] contains gwbridge (172.22/16) addresses; prefer overlay (10.0/8)."
|
||||
echo "HINT: nodes.json has 172.22.x.x targets. Ensure agent publishes overlay_ip and master prefers it." >&2
|
||||
fi
|
||||
else
|
||||
logd "nodes.json missing at $nodes_json_path"
|
||||
fi
|
||||
|
||||
# Query Prometheus activeTargets and list down items when possible
|
||||
pt_json=$(curl -s --max-time 3 "http://localhost:${PROMETHEUS_PORT:-9090}/api/v1/targets" || true)
|
||||
if command -v jq >/dev/null 2>&1; then
|
||||
downs=$(printf '%s' "$pt_json" | jq -r '.data.activeTargets[] | select(.health=="down") | "[prometheus][activeTargets] down url=\(.scrapeUrl) lastError=\(.lastError)"' 2>/dev/null || true)
|
||||
if [[ -n "$downs" ]]; then
|
||||
printf '%s\n' "$downs" >> "$ERRORS"
|
||||
fi
|
||||
else
|
||||
# best-effort grep when jq is unavailable
|
||||
if printf '%s' "$pt_json" | grep -q '"health":"down"'; then
|
||||
append_err "[prometheus][activeTargets] some targets are down (enable jq for detailed reasons)"
|
||||
fi
|
||||
fi
|
||||
|
||||
echo "Diagnostic details -> $DETAILS"
|
||||
echo "Detected errors -> $ERRORS"
|
||||
|
||||
|
||||
@ -4,6 +4,7 @@ import os
|
||||
import re
|
||||
import socket
|
||||
import subprocess
|
||||
import ipaddress
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict
|
||||
|
||||
@ -16,11 +17,47 @@ _HOSTNAME_PATTERN = re.compile(r"^([^-]+)-([^-]+)-([^-]+)-.*$")
|
||||
|
||||
|
||||
def collect_metadata(config: AgentConfig) -> Dict[str, Any]:
|
||||
"""汇总节点注册需要的静态信息。"""
|
||||
"""汇总节点注册需要的静态信息,带有更智能的 IP 选择。
|
||||
|
||||
规则(从高到低):
|
||||
1) AGENT_PUBLISH_IP 指定;
|
||||
2) Hostname A 记录(若命中优先网段);
|
||||
3) 网卡扫描:排除 AGENT_EXCLUDE_IFACES,优先 AGENT_PREFER_NET_CIDRS;
|
||||
4) 默认路由回退(UDP socket 技巧)。
|
||||
|
||||
额外发布:overlay_ip / gwbridge_ip / interfaces,便于 Master 与诊断使用。
|
||||
"""
|
||||
hostname = config.hostname
|
||||
meta = {
|
||||
|
||||
prefer_cidrs = _read_cidrs_env(
|
||||
os.environ.get("AGENT_PREFER_NET_CIDRS", "10.0.0.0/8,172.31.0.0/16")
|
||||
)
|
||||
exclude_ifaces = _read_csv_env(
|
||||
os.environ.get("AGENT_EXCLUDE_IFACES", "docker_gwbridge,lo")
|
||||
)
|
||||
|
||||
# interface inventory
|
||||
interfaces = _list_global_ipv4_addrs()
|
||||
if exclude_ifaces:
|
||||
interfaces = [it for it in interfaces if it[0] not in set(exclude_ifaces)]
|
||||
|
||||
# resolve hostname candidates
|
||||
host_ips = _resolve_hostname_ips(hostname)
|
||||
|
||||
selected_ip, overlay_ip, gwbridge_ip = _select_publish_ips(
|
||||
interfaces=interfaces,
|
||||
host_ips=host_ips,
|
||||
prefer_cidrs=prefer_cidrs,
|
||||
)
|
||||
|
||||
meta: Dict[str, Any] = {
|
||||
"hostname": hostname,
|
||||
"ip": _detect_ip_address(),
|
||||
"ip": os.environ.get("AGENT_PUBLISH_IP", selected_ip), # keep required field
|
||||
"overlay_ip": overlay_ip or selected_ip,
|
||||
"gwbridge_ip": gwbridge_ip,
|
||||
"interfaces": [
|
||||
{"iface": name, "ip": ip} for name, ip in interfaces
|
||||
],
|
||||
"env": config.environment,
|
||||
"user": config.user,
|
||||
"instance": config.instance,
|
||||
@ -96,7 +133,7 @@ def _detect_gpu_count() -> int:
|
||||
|
||||
|
||||
def _detect_ip_address() -> str:
|
||||
"""尝试通过 UDP socket 获得容器出口 IP,失败则回退解析主机名。"""
|
||||
"""保留旧接口,作为最终回退:默认路由源地址 → 主机名解析 → 127.0.0.1。"""
|
||||
try:
|
||||
with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
|
||||
sock.connect(("8.8.8.8", 80))
|
||||
@ -108,3 +145,118 @@ def _detect_ip_address() -> str:
|
||||
except OSError:
|
||||
LOGGER.warning("Unable to resolve hostname to IP; defaulting to 127.0.0.1")
|
||||
return "127.0.0.1"
|
||||
|
||||
|
||||
def _read_csv_env(raw: str | None) -> list[str]:
|
||||
if not raw:
|
||||
return []
|
||||
return [x.strip() for x in raw.split(",") if x.strip()]
|
||||
|
||||
|
||||
def _read_cidrs_env(raw: str | None) -> list[ipaddress.IPv4Network]:
    """Parse a comma-separated CIDR list into IPv4 networks.

    Entries are parsed non-strictly (host bits are allowed). Entries that do
    not parse are logged and skipped; entries that parse to a non-IPv4 network
    are skipped without a log line.
    """
    networks: list[ipaddress.IPv4Network] = []
    for entry in _read_csv_env(raw):
        try:
            parsed = ipaddress.ip_network(entry, strict=False)
        except ValueError:
            LOGGER.warning("Ignoring invalid CIDR in AGENT_PREFER_NET_CIDRS", extra={"cidr": entry})
            continue
        if isinstance(parsed, (ipaddress.IPv4Network,)):
            networks.append(parsed)
    return networks
|
||||
|
||||
|
||||
def _list_global_ipv4_addrs() -> list[tuple[str, str]]:
|
||||
"""列出 (iface, ip) 形式的全局 IPv4 地址。
|
||||
依赖 iproute2:ip -4 -o addr show scope global
|
||||
"""
|
||||
results: list[tuple[str, str]] = []
|
||||
try:
|
||||
proc = subprocess.run(
|
||||
["sh", "-lc", "ip -4 -o addr show scope global | awk '{print $2, $4}'"],
|
||||
check=False,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
timeout=3,
|
||||
)
|
||||
if proc.returncode == 0:
|
||||
for line in proc.stdout.splitlines():
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
parts = line.split()
|
||||
if len(parts) != 2:
|
||||
continue
|
||||
iface, cidr = parts
|
||||
ip = cidr.split("/")[0]
|
||||
try:
|
||||
ipaddress.IPv4Address(ip)
|
||||
except ValueError:
|
||||
continue
|
||||
results.append((iface, ip))
|
||||
except Exception as exc: # pragma: no cover - defensive
|
||||
LOGGER.debug("Failed to list interfaces", extra={"error": str(exc)})
|
||||
return results
|
||||
|
||||
|
||||
def _resolve_hostname_ips(name: str) -> list[str]:
|
||||
ips: list[str] = []
|
||||
try:
|
||||
infos = socket.getaddrinfo(name, None, family=socket.AF_INET)
|
||||
for info in infos:
|
||||
ip = info[4][0]
|
||||
if ip not in ips:
|
||||
ips.append(ip)
|
||||
except OSError:
|
||||
pass
|
||||
return ips
|
||||
|
||||
|
||||
def _pick_by_cidrs(candidates: list[str], prefer_cidrs: list[ipaddress.IPv4Network]) -> str | None:
|
||||
for net in prefer_cidrs:
|
||||
for ip in candidates:
|
||||
try:
|
||||
if ipaddress.ip_address(ip) in net:
|
||||
return ip
|
||||
except ValueError:
|
||||
continue
|
||||
return None
|
||||
|
||||
|
||||
def _select_publish_ips(
    *,
    interfaces: list[tuple[str, str]],
    host_ips: list[str],
    prefer_cidrs: list[ipaddress.IPv4Network],
) -> tuple[str, str | None, str | None]:
    """Return ``(selected_ip, overlay_ip, gwbridge_ip)``.

    - ``overlay_ip``: first interface address inside ``prefer_cidrs`` (networks
      are tried in the order given), or ``None``.
    - ``gwbridge_ip``: first interface address inside 172.22.0.0/16, or ``None``.
    - ``selected_ip``: ``AGENT_PUBLISH_IP`` when set and non-empty; otherwise
      ``overlay_ip``; otherwise the first hostname address inside
      ``prefer_cidrs``; otherwise the result of ``_detect_ip_address()``.
    """
    iface_ips = [addr for _name, addr in interfaces]

    # docker_gwbridge addresses live in 172.22.0.0/16
    gwbridge_ip = _pick_by_cidrs(iface_ips, [ipaddress.ip_network("172.22.0.0/16")])
    overlay_ip = _pick_by_cidrs(iface_ips, prefer_cidrs)

    env_ip = os.environ.get("AGENT_PUBLISH_IP")
    if env_ip:
        return env_ip, overlay_ip, gwbridge_ip

    host_pref = _pick_by_cidrs(host_ips, prefer_cidrs)
    selected = overlay_ip or host_pref or _detect_ip_address()
    return selected, overlay_ip, gwbridge_ip
|
||||
|
||||
BIN
src/agent/dist/argus-agent
vendored
BIN
src/agent/dist/argus-agent
vendored
Binary file not shown.
@ -13,6 +13,8 @@ class AppConfig:
|
||||
scheduler_interval_seconds: int
|
||||
node_id_prefix: str
|
||||
auth_mode: str
|
||||
target_prefer_net_cidrs: str
|
||||
target_reachability_check: bool
|
||||
|
||||
|
||||
def _get_int_env(name: str, default: int) -> int:
|
||||
@ -27,6 +29,12 @@ def _get_int_env(name: str, default: int) -> int:
|
||||
|
||||
def load_config() -> AppConfig:
|
||||
"""读取环境变量生成配置对象,方便统一管理运行参数。"""
|
||||
def _bool_env(name: str, default: bool) -> bool:
|
||||
raw = os.environ.get(name)
|
||||
if raw is None or raw.strip() == "":
|
||||
return default
|
||||
return raw.strip().lower() in ("1", "true", "yes", "on")
|
||||
|
||||
return AppConfig(
|
||||
db_path=os.environ.get("DB_PATH", "/private/argus/master/db.sqlite3"),
|
||||
metric_nodes_json_path=os.environ.get(
|
||||
@ -37,4 +45,6 @@ def load_config() -> AppConfig:
|
||||
scheduler_interval_seconds=_get_int_env("SCHEDULER_INTERVAL_SECONDS", 30),
|
||||
node_id_prefix=os.environ.get("NODE_ID_PREFIX", "A"),
|
||||
auth_mode=os.environ.get("AUTH_MODE", "disabled"),
|
||||
target_prefer_net_cidrs=os.environ.get("TARGET_PREFER_NET_CIDRS", "10.0.0.0/8,172.31.0.0/16"),
|
||||
target_reachability_check=_bool_env("TARGET_REACHABILITY_CHECK", False),
|
||||
)
|
||||
|
||||
@ -1,8 +1,10 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import ipaddress
|
||||
import logging
|
||||
import socket
|
||||
import threading
|
||||
from typing import Optional
|
||||
from typing import Optional, Iterable, Dict, Any, List
|
||||
|
||||
from .config import AppConfig
|
||||
from .storage import Storage
|
||||
@ -34,10 +36,117 @@ class StatusScheduler:
|
||||
self._pending_nodes_json.set()
|
||||
|
||||
def generate_nodes_json(self) -> None:
|
||||
"""根据在线节点生成 Prometheus 抓取目标,优先 overlay IP。
|
||||
|
||||
候选顺序:meta.overlay_ip > hostname A 记录(命中偏好网段)> meta.ip。
|
||||
可选 reachability 检查:TARGET_REACHABILITY_CHECK=true 时,对 9100/9400 做一次 1s TCP 连接测试,
|
||||
选择首个可达的候选;全部失败则按顺序取第一个并记录日志。
|
||||
"""
|
||||
with self._nodes_json_lock:
|
||||
online_nodes = self._storage.get_online_nodes()
|
||||
atomic_write_json(self._config.metric_nodes_json_path, online_nodes)
|
||||
self._logger.info("nodes.json updated", extra={"count": len(online_nodes)})
|
||||
rows = self._storage.get_online_nodes_meta()
|
||||
prefer_cidrs = self._parse_cidrs(self._config.target_prefer_net_cidrs)
|
||||
reachability = self._config.target_reachability_check
|
||||
|
||||
result: List[Dict[str, Any]] = []
|
||||
for row in rows:
|
||||
meta = row.get("meta", {})
|
||||
hostname = meta.get("hostname") or row.get("name")
|
||||
labels = row.get("labels") or []
|
||||
|
||||
overlay_ip = meta.get("overlay_ip")
|
||||
legacy_ip = meta.get("ip")
|
||||
host_candidates = self._resolve_host_ips(hostname)
|
||||
host_pref = self._pick_by_cidrs(host_candidates, prefer_cidrs)
|
||||
|
||||
candidates: List[str] = []
|
||||
for ip in [overlay_ip, host_pref, legacy_ip]:
|
||||
if ip and ip not in candidates:
|
||||
candidates.append(ip)
|
||||
|
||||
chosen = None
|
||||
if reachability:
|
||||
ports = [9100]
|
||||
try:
|
||||
if int(meta.get("gpu_number", 0)) > 0:
|
||||
ports.append(9400)
|
||||
except Exception:
|
||||
pass
|
||||
for ip in candidates:
|
||||
if any(self._reachable(ip, p, 1.0) for p in ports):
|
||||
chosen = ip
|
||||
break
|
||||
if not chosen:
|
||||
chosen = candidates[0] if candidates else legacy_ip
|
||||
if not chosen:
|
||||
# ultimate fallback: 127.0.0.1 (should not happen)
|
||||
chosen = "127.0.0.1"
|
||||
self._logger.warning("No candidate IPs for node; falling back", extra={"node": row.get("node_id")})
|
||||
|
||||
if chosen and ipaddress.ip_address(chosen) in ipaddress.ip_network("172.22.0.0/16"):
|
||||
self._logger.warning(
|
||||
"Prometheus target uses docker_gwbridge address; prefer overlay",
|
||||
extra={"node": row.get("node_id"), "ip": chosen},
|
||||
)
|
||||
|
||||
result.append(
|
||||
{
|
||||
"node_id": row.get("node_id"),
|
||||
"user_id": meta.get("user"),
|
||||
"ip": chosen,
|
||||
"hostname": hostname,
|
||||
"labels": labels if isinstance(labels, list) else [],
|
||||
}
|
||||
)
|
||||
|
||||
atomic_write_json(self._config.metric_nodes_json_path, result)
|
||||
self._logger.info("nodes.json updated", extra={"count": len(result)})
|
||||
|
||||
# ---------------------------- helpers ----------------------------
|
||||
@staticmethod
|
||||
def _parse_cidrs(raw: str) -> List[ipaddress.IPv4Network]:
|
||||
nets: List[ipaddress.IPv4Network] = []
|
||||
for item in (x.strip() for x in (raw or "").split(",")):
|
||||
if not item:
|
||||
continue
|
||||
try:
|
||||
net = ipaddress.ip_network(item, strict=False)
|
||||
if isinstance(net, ipaddress.IPv4Network):
|
||||
nets.append(net)
|
||||
except ValueError:
|
||||
continue
|
||||
return nets
|
||||
|
||||
@staticmethod
|
||||
def _resolve_host_ips(hostname: str) -> List[str]:
|
||||
ips: List[str] = []
|
||||
try:
|
||||
infos = socket.getaddrinfo(hostname, None, family=socket.AF_INET)
|
||||
for info in infos:
|
||||
ip = info[4][0]
|
||||
if ip not in ips:
|
||||
ips.append(ip)
|
||||
except OSError:
|
||||
pass
|
||||
return ips
|
||||
|
||||
@staticmethod
|
||||
def _pick_by_cidrs(candidates: Iterable[str], prefer: List[ipaddress.IPv4Network]) -> str | None:
|
||||
for net in prefer:
|
||||
for ip in candidates:
|
||||
try:
|
||||
if ipaddress.ip_address(ip) in net:
|
||||
return ip
|
||||
except ValueError:
|
||||
continue
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _reachable(ip: str, port: int, timeout: float) -> bool:
|
||||
try:
|
||||
with socket.create_connection((ip, port), timeout=timeout):
|
||||
return True
|
||||
except OSError:
|
||||
return False
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# internal loop
|
||||
|
||||
@ -324,9 +324,35 @@ class Storage:
|
||||
{
|
||||
"node_id": row["id"],
|
||||
"user_id": meta.get("user"),
|
||||
"ip": meta.get("ip"),
|
||||
"ip": meta.get("ip"), # kept for backward-compat; preferred IP selection handled in scheduler
|
||||
"hostname": meta.get("hostname", row["name"]),
|
||||
"labels": labels if isinstance(labels, list) else [],
|
||||
}
|
||||
)
|
||||
return result
|
||||
|
||||
def get_online_nodes_meta(self) -> List[Dict[str, Any]]:
    """Return the raw meta, name and labels of every online node, ordered by id.

    Each item has the shape ``{node_id, name, meta, labels}``. ``meta`` falls
    back to ``{}`` and ``labels`` to ``[]`` when the stored JSON is empty or is
    not of the expected type. Choosing the target IP is left to the caller.
    """
    query = "SELECT id, name, meta_json, labels_json FROM nodes WHERE status = ? ORDER BY id ASC"
    with self._lock:
        rows = self._conn.execute(query, ("online",)).fetchall()

    def _decode(blob, empty):
        # Empty/NULL columns decode to the supplied empty value.
        return json.loads(blob) if blob else empty

    entries: List[Dict[str, Any]] = []
    for row in rows:
        meta = _decode(row["meta_json"], {})
        labels = _decode(row["labels_json"], [])
        entries.append(
            {
                "node_id": row["id"],
                "name": row["name"],
                "meta": meta if isinstance(meta, dict) else {},
                "labels": labels if isinstance(labels, list) else [],
            }
        )
    return entries
|
||||
|
||||
@ -1 +1 @@
|
||||
1.40.0
|
||||
1.42.0
|
||||
|
||||
BIN
src/metric/client-plugins/all-in-one-full/plugins/argus-agent/bin/argus-agent
(Stored with Git LFS)
BIN
src/metric/client-plugins/all-in-one-full/plugins/argus-agent/bin/argus-agent
(Stored with Git LFS)
Binary file not shown.
@ -274,19 +274,33 @@ verify_checksums() {
|
||||
log_info "Artifact 目录: $artifact_dir"
|
||||
failed_verification=0
|
||||
|
||||
# 尝试解析 version.json 中的 install_order,用于锁定精确文件名,避免同一目录下多份历史 tar 产生歧义
|
||||
local order_file="$TEMP_DIR/install_order.txt"
|
||||
if [[ -f "$TEMP_DIR/checksums.txt" ]]; then
|
||||
while IFS= read -r line; do
|
||||
component=$(echo "$line" | cut -d':' -f1)
|
||||
expected_checksum=$(echo "$line" | cut -d':' -f2-)
|
||||
|
||||
# 查找匹配的 tar 文件
|
||||
# 优先从 install_order 中推导精确文件名
|
||||
actual_file=""
|
||||
for file in "$artifact_dir/${component}-"*.tar.gz; do
|
||||
if [[ -f "$file" ]]; then
|
||||
actual_file="$file"
|
||||
break
|
||||
fi
|
||||
done
|
||||
if [[ -f "$order_file" ]]; then
|
||||
while IFS= read -r fname; do
|
||||
if [[ "$fname" == ${component}-*.tar.gz && -f "$artifact_dir/$fname" ]]; then
|
||||
actual_file="$artifact_dir/$fname"
|
||||
break
|
||||
fi
|
||||
done < "$order_file"
|
||||
fi
|
||||
|
||||
# 回退:按前缀匹配首个(不推荐,但保持兼容)
|
||||
if [[ -z "$actual_file" ]]; then
|
||||
for file in "$artifact_dir/${component}-"*.tar.gz; do
|
||||
if [[ -f "$file" ]]; then
|
||||
actual_file="$file"
|
||||
break
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
||||
if [[ -z "$actual_file" ]]; then
|
||||
log_error "找不到组件文件: $component"
|
||||
|
||||
@ -59,6 +59,12 @@ ARTIFACT_DIR="artifact/$VERSION"
|
||||
|
||||
log_info "开始打包 AIOps All-in-One 安装包 v$VERSION"
|
||||
|
||||
# 若强制打包且目录已存在,先清理旧产物以避免同一版本下残留多个 tar.gz 导致校验混乱
|
||||
if [[ "$FORCE_PACKAGE" == "true" && -d "$ARTIFACT_DIR" ]]; then
|
||||
log_info "--force: 清理旧的 $ARTIFACT_DIR 下的 tar 与元数据"
|
||||
rm -rf "$ARTIFACT_DIR"
|
||||
fi
|
||||
|
||||
# 检查必要文件
|
||||
log_info "检查必要文件..."
|
||||
if [[ ! -f "config/VERSION" ]]; then
|
||||
@ -130,7 +136,7 @@ if [[ -d "$ARTIFACT_DIR" && "$FORCE_PACKAGE" == "false" ]]; then
|
||||
fi
|
||||
fi
|
||||
|
||||
# 创建 artifact 目录
|
||||
# 创建 artifact 目录(清理后重建)
|
||||
mkdir -p "$ARTIFACT_DIR"
|
||||
log_info "创建输出目录: $ARTIFACT_DIR"
|
||||
|
||||
@ -285,10 +291,13 @@ while IFS= read -r component; do
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# 清理组件目录内历史 tar 包,避免 find 误选旧文件
|
||||
rm -f ./*.tar.gz 2>/dev/null || true
|
||||
|
||||
# 执行组件的打包脚本
|
||||
if ./package.sh; then
|
||||
# 查找生成的 tar 包
|
||||
tar_file=$(find . -name "*.tar.gz" -type f | head -1)
|
||||
tar_file=$(ls -1t ./*.tar.gz 2>/dev/null | head -1)
|
||||
if [[ -n "$tar_file" ]]; then
|
||||
# 移动到 artifact 目录
|
||||
mv "$tar_file" "$CURRENT_DIR/$ARTIFACT_DIR/"
|
||||
|
||||
@ -130,20 +130,40 @@ fi
|
||||
TEMP_PACKAGE_DIR="/tmp/argus-metric-package-$$"
|
||||
mkdir -p "$TEMP_PACKAGE_DIR"
|
||||
|
||||
# 复制所有 tar.gz 文件到临时目录
|
||||
log_info "准备 artifact 文件..."
|
||||
tar_files=$(find "$ARTIFACT_DIR" -name "*.tar.gz" -type f)
|
||||
# 仅复制 version.json 中 install_order 列出的 tar.gz,防止同一版本目录下历史残留文件导致校验不一致
|
||||
log_info "准备 artifact 文件(按 install_order)..."
|
||||
|
||||
if [[ -z "$tar_files" ]]; then
|
||||
log_error "在 $ARTIFACT_DIR 中未找到 tar.gz 文件"
|
||||
exit 1
|
||||
install_list_file="$TEMP_DIR/install_list.txt"
|
||||
if command -v jq >/dev/null 2>&1; then
|
||||
jq -r '.install_order[]' "$ARTIFACT_DIR/version.json" > "$install_list_file" 2>/dev/null || true
|
||||
else
|
||||
# 简易解析
|
||||
grep -A 200 '"install_order"' "$ARTIFACT_DIR/version.json" | grep -E '".*"' | sed 's/.*"\([^"]*\)".*/\1/' > "$install_list_file" 2>/dev/null || true
|
||||
fi
|
||||
|
||||
for file in $tar_files; do
|
||||
filename=$(basename "$file")
|
||||
log_info " 准备: $filename"
|
||||
cp "$file" "$TEMP_PACKAGE_DIR/"
|
||||
done
|
||||
if [[ -s "$install_list_file" ]]; then
|
||||
while IFS= read -r filename; do
|
||||
src="$ARTIFACT_DIR/$filename"
|
||||
if [[ -f "$src" ]]; then
|
||||
log_info " 拷贝: $filename"
|
||||
cp "$src" "$TEMP_PACKAGE_DIR/"
|
||||
else
|
||||
log_warning " 未找到: $filename(跳过)"
|
||||
fi
|
||||
done < "$install_list_file"
|
||||
else
|
||||
log_warning "未能解析 install_order,将回退复制全部 tar.gz(可能包含历史残留,建议安装端使用严格校验)"
|
||||
tar_files=$(find "$ARTIFACT_DIR" -name "*.tar.gz" -type f)
|
||||
if [[ -z "$tar_files" ]]; then
|
||||
log_error "在 $ARTIFACT_DIR 中未找到 tar.gz 文件"
|
||||
exit 1
|
||||
fi
|
||||
for file in $tar_files; do
|
||||
filename=$(basename "$file")
|
||||
log_info " 准备: $filename"
|
||||
cp "$file" "$TEMP_PACKAGE_DIR/"
|
||||
done
|
||||
fi
|
||||
|
||||
# 复制版本信息文件
|
||||
if [[ -f "$ARTIFACT_DIR/version.json" ]]; then
|
||||
|
||||
16
src/sys/build/node-bundle/Dockerfile
Normal file
16
src/sys/build/node-bundle/Dockerfile
Normal file
@ -0,0 +1,16 @@
|
||||
# Base image is overridable with --build-arg BASE_IMAGE=NAME[:TAG].
ARG BASE_IMAGE=argus-sys-metric-test-node:latest
FROM ${BASE_IMAGE}

# Client package version; only recorded in the image labels.
ARG CLIENT_VER
LABEL org.opencontainers.image.title="argus-sys-metric-test-node-bundle"
LABEL org.opencontainers.image.version="${CLIENT_VER}"
LABEL org.opencontainers.image.description="Metric test node with embedded client package"

WORKDIR /

# bundle files are provided at build time into ./bundle in build context
COPY bundle/ /bundle/
COPY node-bootstrap.sh /usr/local/bin/node-bootstrap.sh
RUN chmod +x /usr/local/bin/node-bootstrap.sh

ENTRYPOINT ["/usr/local/bin/node-bootstrap.sh"]
|
||||
2
src/sys/build/node-bundle/bundle/.gitignore
vendored
Normal file
2
src/sys/build/node-bundle/bundle/.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
|
||||
argus-metric_*.tar.gz
|
||||
1006
src/sys/build/node-bundle/bundle/setup.sh
Executable file
1006
src/sys/build/node-bundle/bundle/setup.sh
Executable file
File diff suppressed because it is too large
Load Diff
99
src/sys/build/node-bundle/node-bootstrap.sh
Normal file
99
src/sys/build/node-bundle/node-bootstrap.sh
Normal file
@ -0,0 +1,99 @@
|
||||
#!/usr/bin/env bash
# Container entrypoint for the node-bundle image.
#
# Steps:
#   1. Skip installation when $INSTALL_DIR/current already exists.
#   2. Otherwise install the client from the tarball under $BUNDLE_DIR.
#   3. If that does not produce an installation, fall back to the FTP setup.sh
#      (requires FTPIP, FTP_USER and FTP_PASSWORD).
#   4. Start argus-agent when it is not running.
#   5. Run the health check once it is available, wait for node.json, then
#      sleep forever to keep the container alive.
set -euo pipefail

echo "[BOOT] node bundle starting"

INSTALL_DIR="/opt/argus-metric"
BUNDLE_DIR="/bundle"
installed_ok=0

# Unpack tarball $1 into scratch dir $2, stage it under
# $INSTALL_DIR/versions/<ver> and run its install.sh.
# Returns 0 only when $INSTALL_DIR/current points at a directory afterwards.
# Called from a conditional context, where errexit is inactive, so every step
# is checked explicitly.
install_unpacked_bundle() {
  local tarball="$1" tmp="$2"
  local root sub ver version_dir

  if ! tar -xzf "$tarball" -C "$tmp"; then
    echo "[BOOT][WARN] failed to extract $(basename "$tarball"); fallback to FTP"
    return 1
  fi

  # locate root containing version.json (archive root or its first sub-directory)
  root="$tmp"
  if [[ ! -f "$root/version.json" ]]; then
    sub=$(find "$tmp" -mindepth 1 -maxdepth 1 -type d | head -n1 || true)
    if [[ -n "$sub" && -f "$sub/version.json" ]]; then
      root="$sub"
    fi
  fi
  if [[ ! -f "$root/version.json" ]]; then
    echo "[BOOT][WARN] version.json not found in bundle; fallback to FTP"
    return 1
  fi

  ver=$(sed -n 's/.*"version"\s*:\s*"\([^"]\+\)".*/\1/p' "$root/version.json" | head -n1) || ver=""
  if [[ -z "$ver" ]]; then
    echo "[BOOT][WARN] failed to parse version from version.json; fallback to FTP"
    return 1
  fi

  version_dir="$INSTALL_DIR/versions/$ver"
  # Copy (including dotfiles) instead of `mv ... || true`: the copy also works
  # when the version directory already has content, and a failure is reported
  # rather than silently ignored.
  if ! mkdir -p "$version_dir" || ! cp -a "$root"/. "$version_dir"/; then
    echo "[BOOT][WARN] failed to stage bundle into $version_dir; fallback to FTP"
    return 1
  fi

  if [[ ! -f "$version_dir/install.sh" ]]; then
    echo "[BOOT][WARN] install.sh missing under $version_dir; fallback to FTP"
    return 1
  fi
  chmod +x "$version_dir/install.sh" 2>/dev/null || true
  if ! (cd "$version_dir" && ./install.sh "$version_dir"); then
    echo "[BOOT][WARN] install.sh failed for version $ver; fallback to FTP"
    return 1
  fi

  # Best-effort bookkeeping, matching the layout the code above installs into.
  echo "$ver" > "$INSTALL_DIR/LATEST_VERSION" 2>/dev/null || true
  ln -sfn "$version_dir" "$INSTALL_DIR/current" 2>/dev/null || true
  if [[ -L "$INSTALL_DIR/current" && -d "$INSTALL_DIR/current" ]]; then
    echo "[BOOT] local bundle install OK: version=$ver"
    return 0
  fi
  echo "[BOOT][WARN] current symlink not present after install; will rely on healthcheck to confirm"
  return 1
}

# Install from the first argus-metric_*.tar.gz in $BUNDLE_DIR.
# Returns non-zero when there is no tarball or the install did not complete.
# The scratch directory is always removed.
install_from_bundle() {
  local tarball="" candidate tmp rc=0
  for candidate in "$BUNDLE_DIR"/argus-metric_*.tar.gz; do
    if [[ -f "$candidate" ]]; then
      tarball="$candidate"
      break
    fi
  done
  if [[ -z "$tarball" ]]; then
    return 1
  fi

  echo "[BOOT] installing from local bundle: $(basename "$tarball")"
  tmp=$(mktemp -d) || return 1
  install_unpacked_bundle "$tarball" "$tmp" || rc=$?
  rm -rf -- "$tmp"
  return "$rc"
}

# Download setup.sh from the FTP server and run it. Any failure aborts the
# script (errexit is active here).
# NOTE(review): the FTP password is passed on the command line of curl and
# setup.sh and is therefore visible in the process list.
install_from_ftp() {
  echo "[BOOT] fallback to FTP setup"
  if [[ -z "${FTPIP:-}" || -z "${FTP_USER:-}" || -z "${FTP_PASSWORD:-}" ]]; then
    echo "[BOOT][ERROR] FTP variables not set (FTPIP/FTP_USER/FTP_PASSWORD)" >&2
    exit 1
  fi
  curl -u "$FTP_USER:$FTP_PASSWORD" -fsSL "ftp://$FTPIP:21/setup.sh" -o /tmp/setup.sh
  chmod +x /tmp/setup.sh
  /tmp/setup.sh --server "$FTPIP" --user "$FTP_USER" --password "$FTP_PASSWORD" --port 21
}

# Print the path of one health-check script: the current version's when it
# exists, otherwise the first one found under versions/. Returns 1 if none.
find_health_script() {
  local script
  for script in "$INSTALL_DIR/current/check_health.sh" "$INSTALL_DIR"/versions/*/check_health.sh; do
    if [[ -f "$script" ]]; then
      printf '%s\n' "$script"
      return 0
    fi
  done
  return 1
}

# 1) already installed?
if [[ -L "$INSTALL_DIR/current" && -d "$INSTALL_DIR/current" ]]; then
  echo "[BOOT] client already installed at $INSTALL_DIR/current"
else
  # 2) try local bundle first
  if install_from_bundle; then
    installed_ok=1
  fi

  # 3) fallback: use FTP setup if not installed
  if [[ ! -L "$INSTALL_DIR/current" && "$installed_ok" -eq 0 ]]; then
    install_from_ftp
  fi
fi

# 4) ensure agent is running; start if needed (inherits env: MASTER_ENDPOINT/AGENT_*)
if ! pgrep -x argus-agent >/dev/null 2>&1; then
  echo "[BOOT] starting argus-agent (not detected)"
  setsid /usr/local/bin/argus-agent >/var/log/argus-agent.log 2>&1 < /dev/null &
fi

# 5) post-install selfcheck (best-effort) and wait for node.json
# Exactly one script is run; passing a multi-match glob to bash would hand the
# extra paths to the first script as arguments.
for _ in {1..30}; do
  if health_script=$(find_health_script); then
    bash "$health_script" || true
    break
  fi
  sleep 2
done

host="$(hostname)"
state_dir="/private/argus/agent/${host}"
mkdir -p "$state_dir" 2>/dev/null || true
for _ in {1..60}; do
  if [[ -s "$state_dir/node.json" ]]; then
    echo "[BOOT] node state present: $state_dir/node.json"
    break
  fi
  sleep 2
done

echo "[BOOT] ready; entering sleep"
exec sleep infinity
|
||||
21
src/sys/swarm_tests/.env
Normal file
21
src/sys/swarm_tests/.env
Normal file
@ -0,0 +1,21 @@
|
||||
SERVER_PROJECT=argus-swarm-server
|
||||
NODES_PROJECT=argus-swarm-nodes
|
||||
|
||||
# Host ports for server compose
|
||||
MASTER_PORT=32300
|
||||
ES_HTTP_PORT=9200
|
||||
KIBANA_PORT=5601
|
||||
PROMETHEUS_PORT=9090
|
||||
GRAFANA_PORT=3000
|
||||
ALERTMANAGER_PORT=9093
|
||||
WEB_PROXY_PORT_8080=8080
|
||||
WEB_PROXY_PORT_8081=8081
|
||||
WEB_PROXY_PORT_8082=8082
|
||||
WEB_PROXY_PORT_8083=8083
|
||||
WEB_PROXY_PORT_8084=8084
|
||||
WEB_PROXY_PORT_8085=8085
|
||||
|
||||
# UID/GID for volume ownership in containers
|
||||
ARGUS_BUILD_UID=1000
|
||||
ARGUS_BUILD_GID=1000
|
||||
|
||||
21
src/sys/swarm_tests/.env.example
Normal file
21
src/sys/swarm_tests/.env.example
Normal file
@ -0,0 +1,21 @@
|
||||
SERVER_PROJECT=argus-swarm-server
|
||||
NODES_PROJECT=argus-swarm-nodes
|
||||
|
||||
# Host ports for server compose
|
||||
MASTER_PORT=32300
|
||||
ES_HTTP_PORT=9200
|
||||
KIBANA_PORT=5601
|
||||
PROMETHEUS_PORT=9090
|
||||
GRAFANA_PORT=3000
|
||||
ALERTMANAGER_PORT=9093
|
||||
WEB_PROXY_PORT_8080=8080
|
||||
WEB_PROXY_PORT_8081=8081
|
||||
WEB_PROXY_PORT_8082=8082
|
||||
WEB_PROXY_PORT_8083=8083
|
||||
WEB_PROXY_PORT_8084=8084
|
||||
WEB_PROXY_PORT_8085=8085
|
||||
|
||||
# UID/GID for volume ownership in containers
|
||||
ARGUS_BUILD_UID=2133
|
||||
ARGUS_BUILD_GID=2015
|
||||
|
||||
8
src/sys/swarm_tests/.env.nodes
Normal file
8
src/sys/swarm_tests/.env.nodes
Normal file
@ -0,0 +1,8 @@
|
||||
BINDIP=10.0.1.5
|
||||
FTPIP=10.0.1.4
|
||||
MASTER_ENDPOINT=http://master.argus.com:3000
|
||||
FTP_USER=ftpuser
|
||||
FTP_PASSWORD=ZGClab1234!
|
||||
AGENT_ENV=dev2
|
||||
AGENT_USER=yuyr
|
||||
AGENT_INSTANCE=node001sX
|
||||
52
src/sys/swarm_tests/README.md
Normal file
52
src/sys/swarm_tests/README.md
Normal file
@ -0,0 +1,52 @@
|
||||
# Swarm Tests (argus-sys-net)
|
||||
|
||||
快速在本机用 Docker Swarm + overlay 网络验证“服务端 + 单节点”端到端部署。保持对 `src/sys/tests` 兼容,不影响现有桥接网络测试。
|
||||
|
||||
## 先决条件
|
||||
- Docker Engine 已启用 Swarm(脚本会自动 `swarm init` 单机模式)。
|
||||
- 已构建并加载以下镜像:`argus-bind9:latest`、`argus-master:latest`、`argus-elasticsearch:latest`、`argus-kibana:latest`、`argus-metric-ftp:latest`、`argus-metric-prometheus:latest`、`argus-metric-grafana:latest`、`argus-alertmanager:latest`、`argus-web-frontend:latest`、`argus-web-proxy:latest`、以及节点镜像 `argus-sys-metric-test-node-bundle:latest`(见下文)。
|
||||
- 本地 `UID/GID` 建议通过 `configs/build_user.local.conf` 指定,脚本会读取:
|
||||
- `UID=1000` 与 `GID=1000`(示例,每项单独一行)。
|
||||
|
||||
## 构建节点 bundle 镜像
|
||||
|
||||
```
|
||||
./deployment/build/build_images.sh --with-node-bundle --client-version 20251106
|
||||
```
|
||||
|
||||
说明:`--client-version` 支持 `YYYYMMDD` 日期包或 `1.xx.yy` 组件版本。打包完成后镜像 `argus-sys-metric-test-node-bundle:latest` 会内置 `argus-metric_*.tar.gz`,容器启动时优先从本地 bundle 安装。
|
||||
|
||||
## 运行步骤
|
||||
|
||||
```
|
||||
cd src/sys/swarm_tests
|
||||
cp .env.example .env
|
||||
|
||||
bash scripts/00_bootstrap.sh
|
||||
bash scripts/01_server_up.sh
|
||||
bash scripts/02_wait_ready.sh # 输出 BINDIP/FTPIP 到 .env.nodes
|
||||
bash scripts/03_nodes_up.sh
|
||||
bash scripts/04_metric_verify.sh
|
||||
```
|
||||
|
||||
清理:
|
||||
|
||||
```
|
||||
bash scripts/99_down.sh
|
||||
```
|
||||
|
||||
## 说明与注意事项
|
||||
- `00_bootstrap.sh`:先加载 `scripts/common/build_user.sh`,打印并写入 `.env` 中的 `ARGUS_BUILD_UID/GID`,再准备 `private-server/` 与 `private-nodes/` 目录,并 `chown` 到对应 UID/GID。
|
||||
- `01_server_up.sh`:启动服务端 compose。可用 `SWARM_FIX_PERMS=1` 打开“容器内 chmod + supervisor 重启”的兜底逻辑,默认关闭。
|
||||
- `02_wait_ready.sh`:等待 Master/ES/Prom/Grafana 就绪(Kibana 可延迟),随后解析 overlay IP,写入 `.env.nodes` 的 `BINDIP/FTPIP`,供节点 compose 使用。
|
||||
- `03_nodes_up.sh`:启动单节点容器(bundle 版)。容器内 `node-bootstrap.sh` 优先本地安装,成功后执行健康检查并等待 `/private/argus/agent/<hostname>/node.json` 出现。
|
||||
- `04_metric_verify.sh`:在本套件内执行详细校验(不再直接调用 tests 脚本):
|
||||
- Grafana `/api/health`(database=ok)
|
||||
- Grafana 数据源指向 `prom.metric.argus.com:<port>` 并在容器内可解析该域名
|
||||
- Prometheus `activeTargets` 全部 up(脚本会忽略 `:9100` node-exporter 与 `:9400` dcgm-exporter 的 down 状态)
|
||||
- `nodes.json` 不包含 `172.22/16`(docker_gwbridge)
|
||||
|
||||
## 常见问题
|
||||
- Grafana/Kibana 启动报权限:检查 `configs/build_user.local.conf` 与 `00_bootstrap.sh` 的输出 UID/GID 是否一致;必要时设置 `SWARM_FIX_PERMS=1` 重新 `01_server_up.sh`。
|
||||
- 节点容器 fallback 到 FTP:通常为 bundle 结构异常或健康检查失败(早期脚本在 `sh` 下执行)。当前 `node-bootstrap.sh` 已使用 `bash` 执行健康检查,并在本地安装成功后跳过 FTP。
|
||||
- 代理 502:查看容器 `argus-web-proxy` 的 `/var/log/nginx/error.log` 与启动日志中 `upstream check` 行;若后端未就绪(尤其 Kibana),等待 `02_wait_ready.sh` 通过后再访问。
|
||||
34
src/sys/swarm_tests/docker-compose.nodes.yml
Normal file
34
src/sys/swarm_tests/docker-compose.nodes.yml
Normal file
@ -0,0 +1,34 @@
|
||||
version: "3.8"
|
||||
|
||||
networks:
|
||||
argus-sys-net:
|
||||
external: true
|
||||
|
||||
services:
|
||||
metric-test-node:
|
||||
image: ${NODE_BUNDLE_IMAGE_TAG:-argus-sys-metric-test-node-bundle:latest}
|
||||
container_name: argus-metric-test-node-swarm
|
||||
hostname: ${NODE_HOSTNAME:-swarm-metric-node-001}
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- TZ=Asia/Shanghai
|
||||
- DEBIAN_FRONTEND=noninteractive
|
||||
- MASTER_ENDPOINT=${MASTER_ENDPOINT:-http://master.argus.com:3000}
|
||||
- ES_HOST=es.log.argus.com
|
||||
- ES_PORT=9200
|
||||
- FTPIP=${FTPIP}
|
||||
- BINDIP=${BINDIP}
|
||||
- FTP_USER=${FTP_USER:-ftpuser}
|
||||
- FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!}
|
||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
||||
- AGENT_ENV=${AGENT_ENV:-dev2}
|
||||
- AGENT_USER=${AGENT_USER:-yuyr}
|
||||
- AGENT_INSTANCE=${AGENT_INSTANCE:-node001sX}
|
||||
- CLIENT_VERSION=${CLIENT_VERSION:-}
|
||||
dns:
|
||||
- ${BINDIP}
|
||||
networks: [argus-sys-net]
|
||||
volumes:
|
||||
- ./private-nodes/argus/agent:/private/argus/agent
|
||||
command: ["sleep", "infinity"]
|
||||
174
src/sys/swarm_tests/docker-compose.server.yml
Normal file
174
src/sys/swarm_tests/docker-compose.server.yml
Normal file
@ -0,0 +1,174 @@
|
||||
version: "3.8"
|
||||
|
||||
networks:
|
||||
argus-sys-net:
|
||||
external: true
|
||||
|
||||
services:
|
||||
bind:
|
||||
image: ${BIND_IMAGE_TAG:-argus-bind9:latest}
|
||||
container_name: argus-bind-sys
|
||||
networks: [argus-sys-net]
|
||||
volumes:
|
||||
- ./private-server:/private
|
||||
restart: unless-stopped
|
||||
|
||||
master:
|
||||
image: ${MASTER_IMAGE_TAG:-argus-master:latest}
|
||||
container_name: argus-master-sys
|
||||
depends_on: [bind]
|
||||
environment:
|
||||
- OFFLINE_THRESHOLD_SECONDS=6
|
||||
- ONLINE_THRESHOLD_SECONDS=2
|
||||
- SCHEDULER_INTERVAL_SECONDS=1
|
||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
||||
ports:
|
||||
- "${MASTER_PORT:-32300}:3000"
|
||||
volumes:
|
||||
- ./private-server/argus/master:/private/argus/master
|
||||
- ./private-server/argus/metric/prometheus:/private/argus/metric/prometheus
|
||||
- ./private-server/argus/etc:/private/argus/etc
|
||||
networks: [argus-sys-net]
|
||||
restart: unless-stopped
|
||||
|
||||
es:
|
||||
image: ${ES_IMAGE_TAG:-argus-elasticsearch:latest}
|
||||
container_name: argus-es-sys
|
||||
environment:
|
||||
- discovery.type=single-node
|
||||
- xpack.security.enabled=false
|
||||
- ES_JAVA_OPTS=-Xms512m -Xmx512m
|
||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
||||
volumes:
|
||||
- ./private-server/argus/log/elasticsearch:/private/argus/log/elasticsearch
|
||||
- ./private-server/argus/etc:/private/argus/etc
|
||||
ports:
|
||||
- "${ES_HTTP_PORT:-9200}:9200"
|
||||
restart: unless-stopped
|
||||
networks: [argus-sys-net]
|
||||
|
||||
kibana:
|
||||
image: ${KIBANA_IMAGE_TAG:-argus-kibana:latest}
|
||||
container_name: argus-kibana-sys
|
||||
environment:
|
||||
- ELASTICSEARCH_HOSTS=http://es.log.argus.com:9200
|
||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
||||
volumes:
|
||||
- ./private-server/argus/log/kibana:/private/argus/log/kibana
|
||||
- ./private-server/argus/etc:/private/argus/etc
|
||||
depends_on: [es]
|
||||
ports:
|
||||
- "${KIBANA_PORT:-5601}:5601"
|
||||
restart: unless-stopped
|
||||
networks: [argus-sys-net]
|
||||
|
||||
ftp:
|
||||
image: ${FTP_IMAGE_TAG:-argus-metric-ftp:latest}
|
||||
container_name: argus-ftp
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- TZ=Asia/Shanghai
|
||||
- FTP_BASE_PATH=/private/argus/ftp
|
||||
- FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!}
|
||||
- DOMAIN=${FTP_DOMAIN:-ftp.metric.argus.com}
|
||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
||||
ports:
|
||||
- "${FTP_PORT:-21}:21"
|
||||
- "${FTP_DATA_PORT:-20}:20"
|
||||
- "${FTP_PASSIVE_HOST_RANGE:-21100-21110}:21100-21110"
|
||||
volumes:
|
||||
- ./private-server/argus/metric/ftp:/private/argus/ftp
|
||||
- ./private-server/argus/etc:/private/argus/etc
|
||||
networks: [argus-sys-net]
|
||||
|
||||
prometheus:
|
||||
image: ${PROM_IMAGE_TAG:-argus-metric-prometheus:latest}
|
||||
container_name: argus-prometheus
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- TZ=Asia/Shanghai
|
||||
- PROMETHEUS_BASE_PATH=/private/argus/metric/prometheus
|
||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
||||
ports:
|
||||
- "${PROMETHEUS_PORT:-9090}:9090"
|
||||
volumes:
|
||||
- ./private-server/argus/metric/prometheus:/private/argus/metric/prometheus
|
||||
- ./private-server/argus/etc:/private/argus/etc
|
||||
networks: [argus-sys-net]
|
||||
|
||||
grafana:
|
||||
image: ${GRAFANA_IMAGE_TAG:-argus-metric-grafana:latest}
|
||||
container_name: argus-grafana
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- TZ=Asia/Shanghai
|
||||
- GRAFANA_BASE_PATH=/private/argus/metric/grafana
|
||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
||||
- GF_SERVER_HTTP_PORT=3000
|
||||
- GF_LOG_LEVEL=warn
|
||||
- GF_LOG_MODE=console
|
||||
- GF_PATHS_PROVISIONING=/private/argus/metric/grafana/provisioning
|
||||
- GF_AUTH_ANONYMOUS_ENABLED=true
|
||||
- GF_AUTH_ANONYMOUS_ORG_ROLE=Viewer
|
||||
ports:
|
||||
- "${GRAFANA_PORT:-3000}:3000"
|
||||
volumes:
|
||||
- ./private-server/argus/metric/grafana:/private/argus/metric/grafana
|
||||
- ./private-server/argus/etc:/private/argus/etc
|
||||
depends_on: [prometheus]
|
||||
networks: [argus-sys-net]
|
||||
|
||||
alertmanager:
|
||||
image: ${ALERT_IMAGE_TAG:-argus-alertmanager:latest}
|
||||
container_name: argus-alertmanager
|
||||
environment:
|
||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
||||
volumes:
|
||||
- ./private-server/argus/etc:/private/argus/etc
|
||||
- ./private-server/argus/alert/alertmanager:/private/argus/alert/alertmanager
|
||||
networks: [argus-sys-net]
|
||||
ports:
|
||||
- "${ALERTMANAGER_PORT:-9093}:9093"
|
||||
restart: unless-stopped
|
||||
|
||||
web-frontend:
|
||||
image: ${FRONT_IMAGE_TAG:-argus-web-frontend:latest}
|
||||
container_name: argus-web-frontend
|
||||
environment:
|
||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
||||
- EXTERNAL_MASTER_PORT=${WEB_PROXY_PORT_8085:-8085}
|
||||
- EXTERNAL_ALERTMANAGER_PORT=${WEB_PROXY_PORT_8084:-8084}
|
||||
- EXTERNAL_GRAFANA_PORT=${WEB_PROXY_PORT_8081:-8081}
|
||||
- EXTERNAL_PROMETHEUS_PORT=${WEB_PROXY_PORT_8082:-8082}
|
||||
- EXTERNAL_KIBANA_PORT=${WEB_PROXY_PORT_8083:-8083}
|
||||
volumes:
|
||||
- ./private-server/argus/etc:/private/argus/etc
|
||||
networks: [argus-sys-net]
|
||||
restart: unless-stopped
|
||||
|
||||
web-proxy:
|
||||
image: ${WEB_PROXY_IMAGE_TAG:-argus-web-proxy:latest}
|
||||
container_name: argus-web-proxy
|
||||
depends_on: [bind, master, grafana, prometheus, kibana, alertmanager]
|
||||
environment:
|
||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
||||
volumes:
|
||||
- ./private-server/argus/etc:/private/argus/etc
|
||||
networks: [argus-sys-net]
|
||||
ports:
|
||||
- "${WEB_PROXY_PORT_8080:-8080}:8080"
|
||||
- "${WEB_PROXY_PORT_8081:-8081}:8081"
|
||||
- "${WEB_PROXY_PORT_8082:-8082}:8082"
|
||||
- "${WEB_PROXY_PORT_8083:-8083}:8083"
|
||||
- "${WEB_PROXY_PORT_8084:-8084}:8084"
|
||||
- "${WEB_PROXY_PORT_8085:-8085}:8085"
|
||||
restart: unless-stopped
|
||||
101
src/sys/swarm_tests/scripts/00_bootstrap.sh
Executable file
101
src/sys/swarm_tests/scripts/00_bootstrap.sh
Executable file
@ -0,0 +1,101 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
REPO_ROOT="$(cd "$ROOT/../../.." && pwd)"
|
||||
|
||||
ENV_FILE="$ROOT/.env"; [[ -f "$ENV_FILE" ]] || cp "$ROOT/.env.example" "$ENV_FILE"
|
||||
|
||||
# Load build user (UID/GID) from repo config to match container runtime users
|
||||
if [[ -f "$REPO_ROOT/scripts/common/build_user.sh" ]]; then
|
||||
# shellcheck disable=SC1091
|
||||
source "$REPO_ROOT/scripts/common/build_user.sh" 2>/dev/null || true
|
||||
if declare -f load_build_user >/dev/null 2>&1; then
|
||||
load_build_user
|
||||
fi
|
||||
fi
|
||||
|
||||
# Capture resolved UID/GID from build_user before sourcing .env
|
||||
uid_resolved="${ARGUS_BUILD_UID:-2133}"
|
||||
gid_resolved="${ARGUS_BUILD_GID:-2015}"
|
||||
echo "[BOOT] resolved build user: UID=${uid_resolved} GID=${gid_resolved} (from scripts/common/build_user.sh or env)"
|
||||
|
||||
# After resolving UID/GID, load .env for other settings; then we will overwrite UID/GID entries
|
||||
set -a; source "$ENV_FILE"; set +a
|
||||
|
||||
echo "[BOOT] checking Docker Swarm"
|
||||
if ! docker info 2>/dev/null | grep -q "Swarm: active"; then
|
||||
echo "[BOOT] initializing swarm (single-node)"
|
||||
docker swarm init >/dev/null 2>&1 || true
|
||||
fi
|
||||
|
||||
NET_NAME=argus-sys-net
|
||||
if docker network inspect "$NET_NAME" >/dev/null 2>&1; then
|
||||
echo "[BOOT] overlay network exists: $NET_NAME"
|
||||
else
|
||||
echo "[BOOT] creating overlay network: $NET_NAME"
|
||||
docker network create -d overlay --attachable "$NET_NAME"
|
||||
fi
|
||||
|
||||
echo "[BOOT] preparing private directories (server/nodes)"
|
||||
# Server-side dirs (align with sys/tests 01_bootstrap.sh)
|
||||
mkdir -p \
|
||||
"$ROOT/private-server/argus/etc" \
|
||||
"$ROOT/private-server/argus/bind" \
|
||||
"$ROOT/private-server/argus/master" \
|
||||
"$ROOT/private-server/argus/metric/prometheus" \
|
||||
"$ROOT/private-server/argus/metric/prometheus/data" \
|
||||
"$ROOT/private-server/argus/metric/prometheus/rules" \
|
||||
"$ROOT/private-server/argus/metric/prometheus/targets" \
|
||||
"$ROOT/private-server/argus/alert/alertmanager" \
|
||||
"$ROOT/private-server/argus/metric/ftp/share" \
|
||||
"$ROOT/private-server/argus/metric/grafana/data" \
|
||||
"$ROOT/private-server/argus/metric/grafana/logs" \
|
||||
"$ROOT/private-server/argus/metric/grafana/plugins" \
|
||||
"$ROOT/private-server/argus/metric/grafana/provisioning/datasources" \
|
||||
"$ROOT/private-server/argus/metric/grafana/provisioning/dashboards" \
|
||||
"$ROOT/private-server/argus/metric/grafana/data/sessions" \
|
||||
"$ROOT/private-server/argus/metric/grafana/data/dashboards" \
|
||||
"$ROOT/private-server/argus/metric/grafana/config" \
|
||||
"$ROOT/private-server/argus/agent" \
|
||||
"$ROOT/private-server/argus/log/elasticsearch" \
|
||||
"$ROOT/private-server/argus/log/kibana"
|
||||
|
||||
mkdir -p "$ROOT/private-nodes/argus/agent"
|
||||
|
||||
uid="$uid_resolved"; gid="$gid_resolved"
|
||||
echo "[BOOT] chown -R ${uid}:${gid} for server core dirs (best-effort)"
|
||||
chown -R "$uid":"$gid" \
|
||||
"$ROOT/private-server/argus/log/elasticsearch" \
|
||||
"$ROOT/private-server/argus/log/kibana" \
|
||||
"$ROOT/private-server/argus/metric/grafana" \
|
||||
"$ROOT/private-server/argus/metric/prometheus" \
|
||||
"$ROOT/private-server/argus/alert" \
|
||||
"$ROOT/private-server/argus/metric/ftp" \
|
||||
"$ROOT/private-server/argus/agent" \
|
||||
"$ROOT/private-server/argus/etc" 2>/dev/null || true
|
||||
|
||||
# group-writable for etc/alert as in sys/tests
|
||||
chmod -R g+w "$ROOT/private-server/argus/alert" "$ROOT/private-server/argus/etc" 2>/dev/null || true
|
||||
|
||||
# ensure .env carries the resolved UID/GID for compose env interpolation
|
||||
if grep -q '^ARGUS_BUILD_UID=' "$ENV_FILE"; then
|
||||
sed -i "s/^ARGUS_BUILD_UID=.*/ARGUS_BUILD_UID=${uid}/" "$ENV_FILE"
|
||||
else
|
||||
echo "ARGUS_BUILD_UID=${uid}" >> "$ENV_FILE"
|
||||
fi
|
||||
if grep -q '^ARGUS_BUILD_GID=' "$ENV_FILE"; then
|
||||
sed -i "s/^ARGUS_BUILD_GID=.*/ARGUS_BUILD_GID=${gid}/" "$ENV_FILE"
|
||||
else
|
||||
echo "ARGUS_BUILD_GID=${gid}" >> "$ENV_FILE"
|
||||
fi
|
||||
|
||||
# distribute update-dns.sh
|
||||
BIND_UPDATE_SRC="$REPO_ROOT/src/bind/build/update-dns.sh"
|
||||
BIND_UPDATE_DEST="$ROOT/private-server/argus/etc/update-dns.sh"
|
||||
if [[ -f "$BIND_UPDATE_SRC" ]]; then
|
||||
cp "$BIND_UPDATE_SRC" "$BIND_UPDATE_DEST" && chmod +x "$BIND_UPDATE_DEST" || true
|
||||
fi
|
||||
|
||||
echo "[BOOT] done"
|
||||
39
src/sys/swarm_tests/scripts/01_server_up.sh
Executable file
39
src/sys/swarm_tests/scripts/01_server_up.sh
Executable file
@ -0,0 +1,39 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
REPO_ROOT="$(cd "$ROOT/../../.." && pwd)"
|
||||
ENV_FILE="$ROOT/.env"
|
||||
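# load UID/GID from repo config first; NOTE: "$ENV_FILE" is sourced afterwards with `set -a`, so any ARGUS_BUILD_UID/GID defined there override these values (00_bootstrap.sh writes the resolved UID/GID into .env to keep both in sync)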
|
||||
if [[ -f "$REPO_ROOT/scripts/common/build_user.sh" ]]; then
|
||||
# shellcheck disable=SC1091
|
||||
source "$REPO_ROOT/scripts/common/build_user.sh" 2>/dev/null || true
|
||||
if declare -f load_build_user >/dev/null 2>&1; then
|
||||
load_build_user
|
||||
fi
|
||||
fi
|
||||
set -a; source "$ENV_FILE"; set +a
|
||||
|
||||
PROJECT="${SERVER_PROJECT:-argus-swarm-server}"
|
||||
COMPOSE_FILE="$ROOT/docker-compose.server.yml"
|
||||
|
||||
echo "[SERVER] starting compose project: $PROJECT"
|
||||
docker compose -p "$PROJECT" -f "$COMPOSE_FILE" up -d
|
||||
|
||||
echo "[SERVER] containers:"; docker compose -p "$PROJECT" -f "$COMPOSE_FILE" ps
|
||||
|
||||
# Optional post-start permission alignment (disabled by default). Enable with SWARM_FIX_PERMS=1
|
||||
if [[ "${SWARM_FIX_PERMS:-0}" == "1" ]]; then
|
||||
echo "[SERVER] aligning permissions in containers (best-effort)"
|
||||
for c in argus-master-sys argus-prometheus argus-grafana argus-ftp argus-es-sys argus-kibana-sys argus-web-frontend argus-web-proxy argus-alertmanager; do
|
||||
docker exec "$c" sh -lc 'mkdir -p /private/argus && chmod -R 777 /private/argus' 2>/dev/null || true
|
||||
done
|
||||
echo "[SERVER] restarting selected supervised programs to pick up new permissions"
|
||||
docker exec argus-prometheus sh -lc 'supervisorctl restart prometheus targets-updater >/dev/null 2>&1 || true' || true
|
||||
docker exec argus-grafana sh -lc 'rm -f /private/argus/etc/grafana.metric.argus.com 2>/dev/null || true; supervisorctl restart grafana >/dev/null 2>&1 || true' || true
|
||||
docker exec argus-es-sys sh -lc 'supervisorctl restart elasticsearch >/dev/null 2>&1 || true' || true
|
||||
docker exec argus-kibana-sys sh -lc 'supervisorctl restart kibana >/dev/null 2>&1 || true' || true
|
||||
fi
|
||||
|
||||
echo "[SERVER] done"
|
||||
84
src/sys/swarm_tests/scripts/02_wait_ready.sh
Executable file
84
src/sys/swarm_tests/scripts/02_wait_ready.sh
Executable file
@ -0,0 +1,84 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
ENV_FILE="$ROOT/.env"; set -a; source "$ENV_FILE"; set +a
|
||||
|
||||
PROJECT="${SERVER_PROJECT:-argus-swarm-server}"
|
||||
RETRIES=${RETRIES:-60}
|
||||
SLEEP=${SLEEP:-5}
|
||||
|
||||
# Print the HTTP status code returned for URL $1 (IPv4 only), or exactly "000"
# when the request fails.
# curl's -w "%{http_code}" already emits "000" when no response was received,
# so the fallback has to replace that output instead of appending to it
# (the previous `|| echo 000` produced "000000" in the wait log).
code() {
  local status
  status=$(curl -4 -s -o /dev/null -w "%{http_code}" "$1") || status=000
  echo "${status:-000}"
}
|
||||
# Readiness probe for Prometheus: returns 0 when a TCP connection to 127.0.0.1
# on ${PROMETHEUS_PORT:-9090} can be opened, 1 otherwise.
# The connection is attempted inside a subshell via bash's /dev/tcp redirection,
# with the subshell's stdout/stderr discarded.
prom_ok() {
|
||||
# Consider ready if TCP:9090 is accepting on localhost (host side)
|
||||
(exec 3<>/dev/tcp/127.0.0.1/${PROMETHEUS_PORT:-9090}) >/dev/null 2>&1 && return 0
|
||||
return 1
|
||||
}
|
||||
|
||||
echo "[READY] waiting services (max $((RETRIES*SLEEP))s)"
|
||||
for i in $(seq 1 "$RETRIES"); do
|
||||
e1=$(code "http://127.0.0.1:${MASTER_PORT:-32300}/readyz")
|
||||
e2=$(code "http://127.0.0.1:${ES_HTTP_PORT:-9200}/_cluster/health")
|
||||
e3=000
|
||||
if prom_ok; then e3=200; fi
|
||||
e4=$(code "http://127.0.0.1:${GRAFANA_PORT:-3000}/api/health")
|
||||
e5=$(code "http://127.0.0.1:${KIBANA_PORT:-5601}/api/status")
|
||||
ok=0
|
||||
[[ "$e1" == 200 ]] && ok=$((ok+1))
|
||||
[[ "$e2" == 200 ]] && ok=$((ok+1))
|
||||
[[ "$e3" == 200 ]] && ok=$((ok+1))
|
||||
[[ "$e4" == 200 ]] && ok=$((ok+1))
|
||||
# Kibana 可放宽,等其它四项即可
|
||||
if [[ $ok -ge 4 ]]; then echo "[READY] base services OK"; break; fi
|
||||
echo "[..] waiting ($i/$RETRIES): master=$e1 es=$e2 prom=$e3 graf=$e4 kibana=$e5"; sleep "$SLEEP"
|
||||
done
|
||||
|
||||
if [[ $ok -lt 4 ]]; then echo "[ERROR] services not ready" >&2; exit 1; fi
|
||||
|
||||
echo "[READY] resolving overlay IPs"
|
||||
BINDIP=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' argus-bind-sys)
|
||||
FTPIP=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' argus-ftp)
|
||||
echo "BINDIP=$BINDIP FTPIP=$FTPIP"
|
||||
|
||||
ENV_NODES="$ROOT/.env.nodes"
|
||||
cat > "$ENV_NODES" <<EOF
|
||||
BINDIP=$BINDIP
|
||||
FTPIP=$FTPIP
|
||||
MASTER_ENDPOINT=http://master.argus.com:3000
|
||||
FTP_USER=ftpuser
|
||||
FTP_PASSWORD=ZGClab1234!
|
||||
AGENT_ENV=dev2
|
||||
AGENT_USER=yuyr
|
||||
AGENT_INSTANCE=node001sX
|
||||
EOF
|
||||
|
||||
echo "[READY] wrote $ENV_NODES"
|
||||
|
||||
# Inline: fix domain records -> actual overlay IPs and reload bind/nginx (best-effort)
|
||||
echo "[READY] fixing domain records to overlay IPs"
|
||||
ETC_DIR="$ROOT/private-server/argus/etc"; mkdir -p "$ETC_DIR"
|
||||
# Container name -> domain record file written under $ETC_DIR.
# Keys are handed to `docker inspect`, so they must be container names
# (container_name in docker-compose.server.yml), not compose service names:
# the frontend container is "argus-web-frontend", not "web-frontend".
declare -A MAP
MAP[argus-web-frontend]=web.argus.com
MAP[argus-grafana]=grafana.metric.argus.com
MAP[argus-prometheus]=prom.metric.argus.com
MAP[argus-kibana-sys]=kibana.log.argus.com
MAP[argus-alertmanager]=alertmanager.alert.argus.com
MAP[argus-master-sys]=master.argus.com
|
||||
changed=0
|
||||
for cname in "${!MAP[@]}"; do
|
||||
domain="${MAP[$cname]}"; fpath="$ETC_DIR/$domain"
|
||||
ip=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' "$cname" 2>/dev/null || true)
|
||||
[[ -z "$ip" ]] && { echo "[DNS-FIX][WARN] $domain: container $cname no overlay IP yet"; continue; }
|
||||
cur=$(cat "$fpath" 2>/dev/null || echo "")
|
||||
if [[ "$cur" != "$ip" ]]; then
|
||||
echo "$ip" > "$fpath"; echo "[DNS-FIX][SET] $domain = $ip (was: ${cur:-<empty>})"; changed=1
|
||||
else
|
||||
echo "[DNS-FIX][OK] $domain already $ip"
|
||||
fi
|
||||
done
|
||||
if [[ $changed -eq 1 ]]; then
|
||||
docker exec argus-bind-sys /usr/local/bin/reload-bind9.sh >/dev/null 2>&1 || true
|
||||
sleep 1
|
||||
fi
|
||||
docker exec argus-web-proxy nginx -s reload >/dev/null 2>&1 || true
|
||||
16
src/sys/swarm_tests/scripts/03_nodes_up.sh
Executable file
16
src/sys/swarm_tests/scripts/03_nodes_up.sh
Executable file
@ -0,0 +1,16 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
ENV_FILE="$ROOT/.env"; set -a; source "$ENV_FILE"; set +a
|
||||
ENV_NODES_FILE="$ROOT/.env.nodes"; set -a; source "$ENV_NODES_FILE"; set +a
|
||||
|
||||
PROJECT="${NODES_PROJECT:-argus-swarm-nodes}"
|
||||
COMPOSE_FILE="$ROOT/docker-compose.nodes.yml"
|
||||
|
||||
echo "[NODES] starting compose project: $PROJECT"
|
||||
docker compose -p "$PROJECT" --env-file "$ENV_NODES_FILE" -f "$COMPOSE_FILE" up -d
|
||||
docker compose -p "$PROJECT" -f "$COMPOSE_FILE" ps
|
||||
echo "[NODES] done"
|
||||
|
||||
173
src/sys/swarm_tests/scripts/04_metric_verify.sh
Executable file
173
src/sys/swarm_tests/scripts/04_metric_verify.sh
Executable file
@ -0,0 +1,173 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
|
||||
ENV_FILE="$ROOT/.env"; [[ -f "$ENV_FILE" ]] && { set -a; source "$ENV_FILE"; set +a; }
|
||||
|
||||
PROM_PORT="${PROMETHEUS_PORT:-9090}"
|
||||
GRAF_PORT="${GRAFANA_PORT:-3000}"
|
||||
GRAF_URL="http://127.0.0.1:${GRAF_PORT}"
|
||||
PROM_DOMAIN="prom.metric.argus.com:${PROM_PORT}"
|
||||
|
||||
# Write an "[ERR]"-tagged message built from all arguments to stderr.
err() {
  printf '%s\n' "[ERR] $*" >&2
}
|
||||
# Write an "[OK]"-tagged message built from all arguments to stdout.
ok() {
  printf '%s\n' "[OK] $*"
}
|
||||
# Write an "[INFO]"-tagged message built from all arguments to stdout.
info() {
  printf '%s\n' "[INFO] $*"
}
|
||||
|
||||
# Report the message through err, then terminate the script with status 1.
fail() {
  err "$*"
  exit 1
}
|
||||
|
||||
# Ensure fluent-bit is installed, configured and running to ship logs to ES
|
||||
# Best-effort remediation for swarm_tests only (does not change repo sources)
|
||||
# Best-effort attempt to get fluent-bit running inside container $1.
# Arguments: $1 - container name passed to `docker exec`
# Steps (every docker exec has its output discarded and its failure ignored):
#   1) if `pgrep -x fluent-bit` finds no process, unpack the first
#      fluent-bit-*.tar.gz found under the highest-sorted directory of
#      /opt/argus-metric/versions (1.42.0 when none is listed) and run its install.sh
#   2) replace the literal ${CLUSTER}/${RACK}/${HOSTNAME} placeholders in
#      /etc/fluent-bit/fluent-bit.conf and the ${ES_HOST:-localhost}/${ES_PORT:-9200}
#      placeholders in /etc/fluent-bit/outputs.d/10-es.conf with fixed values
#   3) kill any running fluent-bit, start it again as user fluent-bit, then poll
#      pgrep up to 10 times, one second apart
# Returns: 0 as soon as the process is seen; if it never appears a warning is
#          written to stderr and the function falls through.
ensure_fluentbit() {
|
||||
local cname="$1"
|
||||
# 1) ensure process exists or try local bundle installer
|
||||
if ! docker exec "$cname" pgrep -x fluent-bit >/dev/null 2>&1; then
|
||||
docker exec "$cname" bash -lc '
|
||||
set -e
|
||||
root=/opt/argus-metric/versions
|
||||
ver=$(ls -1 "$root" 2>/dev/null | sort -Vr | head -1 || true)
|
||||
[[ -z "$ver" ]] && ver=1.42.0
|
||||
verdir="$root/$ver"
|
||||
tb=$(ls -1 "$verdir"/fluent-bit-*.tar.gz 2>/dev/null | head -1 || true)
|
||||
if [ -n "$tb" ]; then tmp=$(mktemp -d); tar -xzf "$tb" -C "$tmp"; sub=$(find "$tmp" -mindepth 1 -maxdepth 1 -type d | head -n1 || true); [ -n "$sub" ] && (cd "$sub" && ./install.sh "$verdir") || true; fi
|
||||
' >/dev/null 2>&1 || true
|
||||
fi
|
||||
# 2) patch configs using literal placeholders with safe delimiter
|
||||
docker exec "$cname" bash -lc '
|
||||
set -e
|
||||
f=/etc/fluent-bit/fluent-bit.conf
|
||||
o=/etc/fluent-bit/outputs.d/10-es.conf
|
||||
LCL="\${CLUSTER}"; LRA="\${RACK}"; LHN="\${HOSTNAME}"; EH="\${ES_HOST:-localhost}"; EP="\${ES_PORT:-9200}"
|
||||
# record_modifier placeholders
|
||||
if grep -q "Record cluster $LCL" "$f"; then sed -i "s|Record cluster $LCL|Record cluster local|" "$f"; fi
|
||||
if grep -q "Record rack $LRA" "$f"; then sed -i "s|Record rack $LRA|Record rack dev|" "$f"; fi
|
||||
if grep -q "Record host $LHN" "$f"; then hn=$(hostname); sed -i "s|Record host $LHN|Record host ${hn}|" "$f"; fi
|
||||
# outputs placeholders
|
||||
if [ -f "$o" ] && (grep -q "$EH" "$o" || grep -q "$EP" "$o"); then
|
||||
sed -i "s|Host $EH|Host es.log.argus.com|g; s|Port $EP|Port 9200|g" "$o"
|
||||
fi
|
||||
' >/dev/null 2>&1 || true
|
||||
# 3) restart fluent-bit (best-effort) and wait
|
||||
docker exec "$cname" bash -lc 'pkill -x fluent-bit >/dev/null 2>&1 || true; sleep 1; setsid su -s /bin/bash fluent-bit -c "/opt/fluent-bit/bin/fluent-bit --config=/etc/fluent-bit/fluent-bit.conf >> /var/log/fluent-bit.log 2>&1" &>/dev/null & echo ok' >/dev/null 2>&1 || true
|
||||
for i in {1..10}; do if docker exec "$cname" pgrep -x fluent-bit >/dev/null 2>&1; then return 0; fi; sleep 1; done
|
||||
echo "[WARN] fluent-bit not confirmed running; log pipeline may not ingest" >&2
|
||||
}
|
||||
|
||||
# ---- Grafana /api/health ----
|
||||
info "Grafana /api/health"
|
||||
HEALTH_JSON="$ROOT/tmp/metric-verify/graf_health.json"
|
||||
mkdir -p "$(dirname "$HEALTH_JSON")"
|
||||
code=$(curl -fsS -o "$HEALTH_JSON" -w '%{http_code}' --max-time 10 "$GRAF_URL/api/health" || true)
|
||||
[[ "$code" == 200 ]] || fail "/api/health HTTP $code"
|
||||
if grep -q '"database"\s*:\s*"ok"' "$HEALTH_JSON"; then ok "grafana health database=ok"; else fail "grafana health not ok: $(cat "$HEALTH_JSON")"; fi
|
||||
|
||||
# ---- Grafana datasource points to prom domain ----
|
||||
info "Grafana datasource URL uses domain: $PROM_DOMAIN"
|
||||
DS_FILE="/private/argus/metric/grafana/provisioning/datasources/datasources.yml"
|
||||
if ! docker exec argus-grafana sh -lc "test -f $DS_FILE" >/dev/null 2>&1; then
|
||||
DS_FILE="/etc/grafana/provisioning/datasources/datasources.yml"
|
||||
fi
|
||||
docker exec argus-grafana sh -lc "grep -E 'url:\s*http://$PROM_DOMAIN' '$DS_FILE'" >/dev/null 2>&1 || fail "datasource not pointing to $PROM_DOMAIN"
|
||||
ok "datasource points to domain"
|
||||
|
||||
# ---- DNS resolution inside grafana ----
|
||||
info "bind resolution inside grafana"
|
||||
tries=0
|
||||
until docker exec argus-grafana getent hosts prom.metric.argus.com >/dev/null 2>&1; do
|
||||
tries=$((tries+1)); (( tries > 24 )) && fail "grafana cannot resolve prom.metric.argus.com"
|
||||
echo "[..] waiting DNS propagation in grafana ($tries/24)"; sleep 5
|
||||
done
|
||||
ok "domain resolves"
|
||||
|
||||
# ---- Prometheus activeTargets down check ----
|
||||
info "Prometheus activeTargets health"
|
||||
targets_json="$ROOT/tmp/metric-verify/prom_targets.json"
|
||||
curl -fsS "http://127.0.0.1:${PROM_PORT}/api/v1/targets" -o "$targets_json" || { echo "[WARN] fetch targets failed" >&2; }
|
||||
down_all=""
|
||||
# Print the scrapeUrl of every activeTarget whose health is "down" in a saved
# Prometheus /api/v1/targets response, one URL per line.
# Arguments: $1 - path to the JSON file
# Outputs:   URLs on stdout; nothing when the file is unreadable or no target
#            is down; "(one or more targets down)" when a down target exists
#            but its URL could not be extracted without jq
list_down_targets() {
  local json_file="$1" urls=""
  [[ -r "$json_file" ]] || return 0
  if command -v jq >/dev/null 2>&1; then
    jq -r '.data.activeTargets[] | select(.health=="down") | .scrapeUrl' "$json_file" 2>/dev/null || true
    return 0
  fi
  # No jq: split the document at every "{" so that each target's scrapeUrl and
  # health fields land on the same line, then keep only URLs paired with
  # health=down. (The previous fallback listed every scrapeUrl, so healthy
  # targets were reported as down.)
  urls=$(tr '{' '\n' <"$json_file" \
    | sed -n 's/.*"scrapeUrl":"\([^"]*\)".*"health":"down".*/\1/p' || true)
  if [[ -z "$urls" ]] && grep -q '"health":"down"' "$json_file"; then
    urls="(one or more targets down)"
  fi
  [[ -z "$urls" ]] || printf '%s\n' "$urls"
}

down_all=$(list_down_targets "$targets_json")
|
||||
# ignore dcgm-exporter(9400) and tolerate node-exporter(9100) in swarm tests
|
||||
down_filtered=$(echo "$down_all" | grep -Ev ':(9400|9100)/' || true)
|
||||
if [[ -n "$down_filtered" ]]; then
|
||||
err "prometheus down targets (filtered):"; echo "$down_filtered" >&2
|
||||
else
|
||||
ok "prometheus targets up (ignoring :9100 and :9400)"
|
||||
fi
|
||||
|
||||
# ---- nodes.json sanity: avoid 172.22/16 (gwbridge) ----
|
||||
nodes_json="$ROOT/private-server/argus/metric/prometheus/nodes.json"
|
||||
if [[ -f "$nodes_json" ]] && grep -q '"ip"\s*:\s*"172\.22\.' "$nodes_json"; then
|
||||
fail "nodes.json contains 172.22/16 addresses (gwbridge)"
|
||||
fi
|
||||
ok "nodes.json IPs look fine"
|
||||
|
||||
echo "[DONE] metric verify"
|
||||
|
||||
# ---- Log pipeline smoke test (adapted from sys/tests 07) ----
|
||||
info "Log pipeline: send logs in node container and assert ES counts"
|
||||
|
||||
ES_PORT="${ES_HTTP_PORT:-9200}"
|
||||
KIBANA_PORT="${KIBANA_PORT:-5601}"
|
||||
|
||||
# Return the document count for an Elasticsearch index name or pattern.
# Globals:   ES_PORT (read) - host port of the ES HTTP API on 127.0.0.1
# Arguments: $1 - index name or pattern, e.g. "train-*"
# Outputs:   the count on stdout; 0 when the request fails, the HTTP status is
#            not 200, or the response carries no numeric count
get_count() {
  local idx="$1" tmp code val=""
  tmp=$(mktemp)
  code=$(curl -s -o "$tmp" -w "%{http_code}" "http://127.0.0.1:${ES_PORT}/${idx}/_count?ignore_unavailable=true&allow_no_indices=true" || true)
  if [[ "$code" == "200" ]]; then
    if command -v jq >/dev/null 2>&1; then
      val=$(jq -r '(.count // 0) | tonumber? // 0' "$tmp" 2>/dev/null || true)
    else
      # Fall back to grep when jq is not installed, so a missing tool is not
      # silently reported as "0 documents".
      val=$(grep -oE '"count"[[:space:]]*:[[:space:]]*[0-9]+' "$tmp" 2>/dev/null | head -n 1 | tr -cd '0-9' || true)
    fi
  fi
  rm -f "$tmp"
  # Anything that is not a plain non-negative integer counts as 0.
  [[ "$val" =~ ^[0-9]+$ ]] || val=0
  echo "$val"
}
|
||||
|
||||
train0=$(get_count "train-*")
|
||||
infer0=$(get_count "infer-*")
|
||||
base=$((train0 + infer0))
|
||||
info "initial ES counts: train=${train0} infer=${infer0} total=${base}"
|
||||
|
||||
# Append demo log lines inside a container so the log pipeline has data to ship.
# Arguments: $1 - container name passed to `docker exec`
#            $2 - tag embedded in square brackets in each log line
# Effect: creates /logs/train and /logs/infer in the container, then appends two
# lines to /logs/train/train-demo.log and one to /logs/infer/infer-demo.log.
# The timestamp comes from `date` run inside the container (the \$ escapes defer
# expansion to the container's shell).
send_logs() {
|
||||
local cname="$1"; local hosttag="$2"
|
||||
docker exec "$cname" sh -lc 'mkdir -p /logs/train /logs/infer'
|
||||
docker exec "$cname" sh -lc "ts=\$(date '+%F %T'); echo \"\$ts INFO [$hosttag] training step=1 loss=1.23 model=bert\" >> /logs/train/train-demo.log"
|
||||
docker exec "$cname" sh -lc "ts=\$(date '+%F %T'); echo \"\$ts INFO [$hosttag] training step=2 loss=1.10 model=bert\" >> /logs/train/train-demo.log"
|
||||
docker exec "$cname" sh -lc "ts=\$(date '+%F %T'); echo \"\$ts WARN [$hosttag] inference slow on batch=2 latency=1.9s\" >> /logs/infer/infer-demo.log"
|
||||
}
|
||||
|
||||
NODE_CONT="${SWARM_NODE_CNAME:-argus-metric-test-node-swarm}"
|
||||
ensure_fluentbit "$NODE_CONT"
|
||||
send_logs "$NODE_CONT" "swarm-node"
|
||||
|
||||
info "waiting for ES to ingest..."
|
||||
curl -s -X POST "http://127.0.0.1:${ES_PORT}/train-*/_refresh" >/dev/null 2>&1 || true
|
||||
curl -s -X POST "http://127.0.0.1:${ES_PORT}/infer-*/_refresh" >/dev/null 2>&1 || true
|
||||
|
||||
final=0; threshold=3
|
||||
for attempt in {1..60}; do
|
||||
train1=$(get_count "train-*"); infer1=$(get_count "infer-*"); final=$((train1 + infer1))
|
||||
if (( final > base && final >= threshold )); then break; fi
|
||||
echo "[..] waiting ES counts increase to >=${threshold} ($attempt/60) current=${final} base=${base}"; \
|
||||
curl -s -X POST "http://127.0.0.1:${ES_PORT}/train-*/_refresh" >/dev/null 2>&1 || true; \
|
||||
curl -s -X POST "http://127.0.0.1:${ES_PORT}/infer-*/_refresh" >/dev/null 2>&1 || true; \
|
||||
sleep 2
|
||||
done
|
||||
info "final ES counts: train=${train1} infer=${infer1} total=${final}"
|
||||
|
||||
(( final > base )) || fail "ES total did not increase (${base} -> ${final})"
|
||||
(( final >= threshold )) || fail "ES total below expected threshold: ${final} < ${threshold}"
|
||||
|
||||
es_health=$(curl -s "http://127.0.0.1:${ES_PORT}/_cluster/health" | grep -o '"status":"[^\"]*"' | cut -d'"' -f4)
|
||||
[[ "$es_health" == green || "$es_health" == yellow ]] || fail "ES health not green/yellow: $es_health"
|
||||
|
||||
if ! curl -fs "http://127.0.0.1:${KIBANA_PORT}/api/status" >/dev/null 2>&1; then
|
||||
echo "[WARN] Kibana status endpoint not available" >&2
|
||||
fi
|
||||
|
||||
ok "log pipeline verified"
|
||||
21
src/sys/swarm_tests/scripts/99_down.sh
Executable file
21
src/sys/swarm_tests/scripts/99_down.sh
Executable file
@ -0,0 +1,21 @@
|
||||
#!/usr/bin/env bash
# Tear down the swarm test environment: stop the nodes and server compose
# projects, remove the overlay network, and delete temporary directories.
# Every teardown step is best-effort so that one failure does not prevent the
# remaining cleanup from running.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
ENV_FILE="$ROOT/.env"

# Export everything defined in .env when it exists. A missing .env must not
# abort the teardown: the project names below already have defaults.
if [[ -f "$ENV_FILE" ]]; then
  set -a
  # shellcheck disable=SC1090 -- path is computed at runtime
  source "$ENV_FILE"
  set +a
else
  echo "[DOWN] $ENV_FILE not found; using default project names" >&2
fi

echo "[DOWN] stopping nodes compose"
docker compose -p "${NODES_PROJECT:-argus-swarm-nodes}" -f "$ROOT/docker-compose.nodes.yml" down --remove-orphans || true

echo "[DOWN] stopping server compose"
docker compose -p "${SERVER_PROJECT:-argus-swarm-server}" -f "$ROOT/docker-compose.server.yml" down --remove-orphans || true

echo "[DOWN] removing overlay network"
docker network rm argus-sys-net >/dev/null 2>&1 || true

echo "[DOWN] cleanup temp files"
# ${ROOT:?} aborts instead of expanding to "/private-*/tmp" if ROOT were empty.
rm -rf "${ROOT:?}/private-server/tmp" "${ROOT:?}/private-nodes/tmp" 2>/dev/null || true

echo "[DOWN] done"
|
||||
|
||||
5
src/sys/swarm_tests/tmp/metric-verify.graf_health.json
Normal file
5
src/sys/swarm_tests/tmp/metric-verify.graf_health.json
Normal file
@ -0,0 +1,5 @@
|
||||
{
|
||||
"commit": "5b85c4c2fcf5d32d4f68aaef345c53096359b2f1",
|
||||
"database": "ok",
|
||||
"version": "11.1.0"
|
||||
}
|
||||
5
src/sys/swarm_tests/tmp/metric-verify/graf_health.json
Normal file
5
src/sys/swarm_tests/tmp/metric-verify/graf_health.json
Normal file
@ -0,0 +1,5 @@
|
||||
{
|
||||
"commit": "5b85c4c2fcf5d32d4f68aaef345c53096359b2f1",
|
||||
"database": "ok",
|
||||
"version": "11.1.0"
|
||||
}
|
||||
1
src/sys/swarm_tests/tmp/metric-verify/prom_targets.json
Normal file
1
src/sys/swarm_tests/tmp/metric-verify/prom_targets.json
Normal file
@ -0,0 +1 @@
|
||||
{"status":"success","data":{"activeTargets":[{"discoveredLabels":{"__address__":"10.0.1.13:9400","__meta_filepath":"/private/argus/metric/prometheus/targets/dcgm_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.13","job":"dcgm","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.13","job":"dcgm","node_id":"A1","user_id":"yuyr"},"scrapePool":"dcgm","scrapeUrl":"http://10.0.1.13:9400/metrics","globalUrl":"http://10.0.1.13:9400/metrics","lastError":"","lastScrape":"2025-11-06T16:36:25.585236213+08:00","lastScrapeDuration":0.002520163,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"},{"discoveredLabels":{"__address__":"10.0.1.13:9100","__meta_filepath":"/private/argus/metric/prometheus/targets/node_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.13","job":"node","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.13","job":"node","node_id":"A1","user_id":"yuyr"},"scrapePool":"node","scrapeUrl":"http://10.0.1.13:9100/metrics","globalUrl":"http://10.0.1.13:9100/metrics","lastError":"","lastScrape":"2025-11-06T16:36:33.694723606+08:00","lastScrapeDuration":0.021800606,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"}],"droppedTargets":[],"droppedTargetCounts":{"dcgm":0,"node":0}}}
|
||||
1
src/sys/swarm_tests/tmp/targets.json
Normal file
1
src/sys/swarm_tests/tmp/targets.json
Normal file
@ -0,0 +1 @@
|
||||
{"status":"success","data":{"activeTargets":[{"discoveredLabels":{"__address__":"10.0.1.15:9400","__meta_filepath":"/private/argus/metric/prometheus/targets/dcgm_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.15","job":"dcgm","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.15","job":"dcgm","node_id":"A1","user_id":"yuyr"},"scrapePool":"dcgm","scrapeUrl":"http://10.0.1.15:9400/metrics","globalUrl":"http://10.0.1.15:9400/metrics","lastError":"","lastScrape":"2025-11-06T15:47:37.200098366+08:00","lastScrapeDuration":0.001361528,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"},{"discoveredLabels":{"__address__":"10.0.1.15:9100","__meta_filepath":"/private/argus/metric/prometheus/targets/node_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.15","job":"node","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.15","job":"node","node_id":"A1","user_id":"yuyr"},"scrapePool":"node","scrapeUrl":"http://10.0.1.15:9100/metrics","globalUrl":"http://10.0.1.15:9100/metrics","lastError":"","lastScrape":"2025-11-06T15:47:40.184367879+08:00","lastScrapeDuration":0.02923333,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"}],"droppedTargets":[],"droppedTargetCounts":{"dcgm":0,"node":0}}}
|
||||
@ -92,6 +92,20 @@ while :; do
|
||||
WAITED=$((WAITED+1))
|
||||
done
|
||||
|
||||
# Quick upstream reachability snapshot (best-effort; does not block startup).
# Each URL is probed once over IPv4 and the resulting HTTP status code is
# logged; nothing here affects whether nginx is launched.
declare -a _UPSTREAMS=(
  "http://web.argus.com:8080/"
  "http://grafana.metric.argus.com:3000/api/health"
  "http://prom.metric.argus.com:9090/-/ready"
  "http://kibana.log.argus.com:5601/api/status"
  "http://alertmanager.alert.argus.com:9093/api/v2/status"
  "http://master.argus.com:3000/readyz"
)
for u in "${_UPSTREAMS[@]}"; do
  # curl still prints the -w output ("000") when the transfer fails, so the
  # fallback must replace the value instead of being appended inside the
  # substitution, which would log "000000". The timeouts keep an unreachable
  # upstream from stalling startup.
  code=$(curl -4 -s -o /dev/null --connect-timeout 2 --max-time 5 -w "%{http_code}" "$u") || code=000
  echo "[INFO] upstream check: $u -> $code"
done
|
||||
|
||||
echo "[INFO] Launching nginx..."
|
||||
|
||||
# 启动 nginx 前台模式
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user