[#37] Add sys/swarm_tests (CPU); separately built node-bundle image

This commit is contained in:
yuyr 2025-11-06 16:43:14 +08:00
parent 94b3e910b3
commit d1fad4a05a
34 changed files with 2494 additions and 32 deletions

View File

@ -0,0 +1,98 @@
#!/usr/bin/env bash
set -euo pipefail
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
. "$ROOT_DIR/deployment/build/common.sh"
usage() {
cat <<EOF
Build Argus images (optional node-bundle)
Usage: build_images.sh [--with-node-bundle] [--client-version YYYYMMDD] [--base-image NAME[:TAG]]
Examples:
./deployment/build/build_images.sh --with-node-bundle --client-version 20251106
EOF
}
WITH_BUNDLE=false
CLIENT_VERSION=""
BASE_IMAGE="argus-sys-metric-test-node:latest"
while [[ $# -gt 0 ]]; do
case "$1" in
--with-node-bundle) WITH_BUNDLE=true; shift;;
--client-version) CLIENT_VERSION="$2"; shift 2;;
--base-image) BASE_IMAGE="$2"; shift 2;;
-h|--help) usage; exit 0;;
*) err "unknown arg: $1"; usage; exit 1;;
esac
done
if [[ "$WITH_BUNDLE" == true ]]; then
require_cmd docker tar gzip
BUNDLE_DIR="$ROOT_DIR/src/sys/build/node-bundle"
CTX_DIR="$BUNDLE_DIR"
TMP_BUNDLE="$BUNDLE_DIR/bundle"
rm -rf "$TMP_BUNDLE"; mkdir -p "$TMP_BUNDLE"
# Build or locate client artifact
PLUGIN_DIR="$ROOT_DIR/src/metric/client-plugins/all-in-one-full"
  # CLIENT_VERSION accepts two forms:
  # - an artifact version such as 1.42.0 (default)
  # - a YYYYMMDD package date, resolved to the inner artifact version under deployment/artifact/client/
if [[ -z "$CLIENT_VERSION" ]]; then
pushd "$PLUGIN_DIR" >/dev/null
bash scripts/package_artifact.sh --force
CLIENT_VERSION=$(cat artifact/*/version.json 2>/dev/null | sed -n 's/.*"version"\s*:\s*"\([^"]\+\)".*/\1/p' | tail -n1)
popd >/dev/null
[[ -n "$CLIENT_VERSION" ]] || { err "failed to detect client version"; exit 1; }
else
if [[ "$CLIENT_VERSION" =~ ^[0-9]{8}$ ]]; then
PKG_DIR="$ROOT_DIR/deployment/artifact/client/$CLIENT_VERSION"
TAR_PKG="$PKG_DIR/argus-metric_${CLIENT_VERSION}.tar.gz"
[[ -f "$TAR_PKG" ]] || { err "client date package not found: $TAR_PKG"; exit 1; }
# Unpack and read the inner version.json
tmpd=$(mktemp -d); trap 'rm -rf "$tmpd"' EXIT
tar -xzf "$TAR_PKG" -C "$tmpd"
if [[ -f "$tmpd/version.json" ]]; then
ART_VER=$(sed -n 's/.*"version"\s*:\s*"\([^"]\+\)".*/\1/p' "$tmpd/version.json" | head -n1)
[[ -n "$ART_VER" ]] || { err "failed to parse artifact version from date package"; exit 1; }
CLIENT_VERSION="$ART_VER"
# Use this tar directly as the bundle source
cp "$TAR_PKG" "$TMP_BUNDLE/argus-metric_$(echo "$ART_VER" | tr '.' '_').tar.gz"
# Also try to copy setup.sh (if present)
[[ -f "$PKG_DIR/setup.sh" ]] && cp "$PKG_DIR/setup.sh" "$TMP_BUNDLE/" || true
else
err "version.json missing in client date package"
exit 1
fi
else
# Assume an artifact version directory
pushd "$PLUGIN_DIR" >/dev/null
[[ -d "artifact/$CLIENT_VERSION" ]] || bash scripts/package_artifact.sh --force
popd >/dev/null
fi
fi
# If no tar was staged from a date package, take it from the plugin artifact directory
TAR_NAME="argus-metric_$(echo "$CLIENT_VERSION" | tr '.' '_').tar.gz"
if [[ ! -f "$TMP_BUNDLE/$TAR_NAME" ]]; then
SRC_TAR="$PLUGIN_DIR/artifact/$CLIENT_VERSION/$TAR_NAME"
[[ -f "$SRC_TAR" ]] || { err "missing client tar: $SRC_TAR"; exit 1; }
cp "$SRC_TAR" "$TMP_BUNDLE/"
# also include setup.sh for fallback
if [[ -f "$ROOT_DIR/deployment/artifact/client/$(today_version)/setup.sh" ]]; then
cp "$ROOT_DIR/deployment/artifact/client/$(today_version)/setup.sh" "$TMP_BUNDLE/" || true
fi
fi
log "Building node-bundle image with client version: $CLIENT_VERSION"
DOCKER_BUILDKIT=0 docker build \
--build-arg CLIENT_VER="$CLIENT_VERSION" \
--build-arg BASE_IMAGE="$BASE_IMAGE" \
-t argus-sys-metric-test-node-bundle:latest \
-f "$BUNDLE_DIR/Dockerfile" "$BUNDLE_DIR"
log "Built image: argus-sys-metric-test-node-bundle:latest"
fi
log "Done."

View File

@ -0,0 +1,103 @@
#!/usr/bin/env bash
set -euo pipefail
# Quick fix tool: replace 172.22/16 targets in nodes.json with overlay IPs resolved from hostname.
# Usage: run on server package host: scripts/fix-prom-targets-overlay.sh
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
NODES_JSON="$ROOT/private/argus/metric/prometheus/nodes.json"
require_cmd() { command -v "$1" >/dev/null 2>&1 || { echo "[ERROR] missing command: $1" >&2; exit 1; }; }
backup() {
local src="$1"; local ts; ts=$(date -u +%Y%m%d-%H%M%SZ)
cp "$src" "${src%.json}_bak_${ts}.json"
}
prefer_overlay_ip() {
  # Prefer 10.0.0.0/8, then 172.31.0.0/16, falling back to the first A record.
  # (Uses for-loops instead of piped while-loops: `return` inside a pipeline
  # subshell would not return from the function.)
  local host="$1" ip ips
  ips=$(getent hosts "$host" | awk '{print $1}')
  for ip in $ips; do
    if [[ "$ip" =~ ^10\. ]]; then echo "$ip"; return; fi
  done
  for ip in $ips; do
    if [[ "$ip" =~ ^172\.31\. ]]; then echo "$ip"; return; fi
  done
  # fallback: first A record
  printf '%s\n' "$ips" | head -n1
}
require_cmd awk
require_cmd sed
if [[ ! -f "$NODES_JSON" ]]; then
echo "[WARN] nodes.json not found: $NODES_JSON" >&2
exit 0
fi
backup "$NODES_JSON"
tmp=$(mktemp)
trap 'rm -f "$tmp"' EXIT
set +e
python3 - "$NODES_JSON" <<'PY' > "$tmp"
import ipaddress, json, sys, socket
path=sys.argv[1]
data=json.load(open(path)) if path else []
def resolve(host):
try:
infos=socket.getaddrinfo(host,None,family=socket.AF_INET)
ips=[i[4][0] for i in infos]
# prefer 10. over 172.31.
for ip in ips:
if ip.startswith('10.'): return ip
for ip in ips:
if ip.startswith('172.31.'): return ip
return ips[0] if ips else None
except OSError:
return None
gw=ipaddress.ip_network('172.22.0.0/16')
out=[]
changed=False
for item in data:
ip=item.get('ip')
host=item.get('hostname') or ''
try:
bad = ip and ipaddress.ip_address(ip) in gw
except Exception:
bad = False
if bad and host:
new=resolve(host)
if new:
item=dict(item)
item['ip']=new
changed=True
out.append(item)
json.dump(out, sys.stdout, ensure_ascii=False)
sys.stderr.write('CHANGED' if changed else 'UNCHANGED')
PY
status=$?
set -e
if [[ "$status" -ne 0 ]]; then
echo "[ERROR] failed to rewrite nodes.json" >&2
exit 1
fi
if grep -q '"ip"\s*:\s*"172\.22\.' "$tmp"; then
echo "[WARN] some gwbridge targets remain; manual fix may be required" >&2
fi
mv "$tmp" "$NODES_JSON"
echo "[OK] nodes.json updated"
# try to reload Prometheus
if docker ps --format '{{.Names}}' | grep -q '^argus-prometheus$'; then
docker exec argus-prometheus sh -lc 'pidof prometheus >/dev/null 2>&1 && kill -HUP $(pidof prometheus) || supervisorctl restart prometheus' >/dev/null 2>&1 || true
echo "[INFO] Prometheus reloaded"
fi
exit 0
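For illustration, a hypothetical before/after of a single nodes.json entry (IPs and hostname invented):

```
# Before: {"node_id":"A1","ip":"172.22.0.4","hostname":"swarm-metric-node-001"}
# After:  {"node_id":"A1","ip":"10.0.1.13","hostname":"swarm-metric-node-001"}
# Run on the host that owns the server package:
scripts/fix-prom-targets-overlay.sh
```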

View File

@ -155,6 +155,34 @@ gfcode=$(http_code "http://localhost:${GRAFANA_PORT:-3000}/api/health"); [[ "$gf
# Deduplicate errors
sort -u -o "$ERRORS" "$ERRORS"
# --- Prometheus targets & nodes.json checks ---
section PROMETHEUS-TARGETS
nodes_json_path="$ROOT/private/argus/metric/prometheus/nodes.json"
if [[ -f "$nodes_json_path" ]]; then
logd "nodes.json present: $nodes_json_path"
# detect gwbridge addresses (172.22/16)
if grep -E '"ip"\s*:\s*"172\.22\.' "$nodes_json_path" >/dev/null 2>&1; then
append_err "[prometheus][targets] contains gwbridge (172.22/16) addresses; prefer overlay (10.0/8)."
echo "HINT: nodes.json has 172.22.x.x targets. Ensure agent publishes overlay_ip and master prefers it." >&2
fi
else
logd "nodes.json missing at $nodes_json_path"
fi
# Query Prometheus activeTargets and list down items when possible
pt_json=$(curl -s --max-time 3 "http://localhost:${PROMETHEUS_PORT:-9090}/api/v1/targets" || true)
if command -v jq >/dev/null 2>&1; then
downs=$(printf '%s' "$pt_json" | jq -r '.data.activeTargets[] | select(.health=="down") | "[prometheus][activeTargets] down url=\(.scrapeUrl) lastError=\(.lastError)"' 2>/dev/null || true)
if [[ -n "$downs" ]]; then
printf '%s\n' "$downs" >> "$ERRORS"
fi
else
# best-effort grep when jq is unavailable
if printf '%s' "$pt_json" | grep -q '"health":"down"'; then
append_err "[prometheus][activeTargets] some targets are down (enable jq for detailed reasons)"
fi
fi
echo "Diagnostic details -> $DETAILS"
echo "Detected errors -> $ERRORS"

View File

@ -4,6 +4,7 @@ import os
import re
import socket
import subprocess
import ipaddress
from pathlib import Path
from typing import Any, Dict
@ -16,11 +17,47 @@ _HOSTNAME_PATTERN = re.compile(r"^([^-]+)-([^-]+)-([^-]+)-.*$")
def collect_metadata(config: AgentConfig) -> Dict[str, Any]:
"""汇总节点注册需要的静态信息。"""
"""汇总节点注册需要的静态信息,带有更智能的 IP 选择。
规则从高到低
1) AGENT_PUBLISH_IP 指定
2) Hostname A 记录若命中优先网段
3) 网卡扫描排除 AGENT_EXCLUDE_IFACES优先 AGENT_PREFER_NET_CIDRS
4) 默认路由回退UDP socket 技巧
额外发布overlay_ip / gwbridge_ip / interfaces便于 Master 与诊断使用
"""
hostname = config.hostname
meta = {
prefer_cidrs = _read_cidrs_env(
os.environ.get("AGENT_PREFER_NET_CIDRS", "10.0.0.0/8,172.31.0.0/16")
)
exclude_ifaces = _read_csv_env(
os.environ.get("AGENT_EXCLUDE_IFACES", "docker_gwbridge,lo")
)
# interface inventory
interfaces = _list_global_ipv4_addrs()
if exclude_ifaces:
interfaces = [it for it in interfaces if it[0] not in set(exclude_ifaces)]
# resolve hostname candidates
host_ips = _resolve_hostname_ips(hostname)
selected_ip, overlay_ip, gwbridge_ip = _select_publish_ips(
interfaces=interfaces,
host_ips=host_ips,
prefer_cidrs=prefer_cidrs,
)
meta: Dict[str, Any] = {
"hostname": hostname,
"ip": _detect_ip_address(),
"ip": os.environ.get("AGENT_PUBLISH_IP", selected_ip), # keep required field
"overlay_ip": overlay_ip or selected_ip,
"gwbridge_ip": gwbridge_ip,
"interfaces": [
{"iface": name, "ip": ip} for name, ip in interfaces
],
"env": config.environment,
"user": config.user,
"instance": config.instance,
@ -96,7 +133,7 @@ def _detect_gpu_count() -> int:
def _detect_ip_address() -> str:
"""尝试通过 UDP socket 获得容器出口 IP失败则回退解析主机名"""
"""保留旧接口,作为最终回退:默认路由源地址 → 主机名解析 → 127.0.0.1"""
try:
with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
sock.connect(("8.8.8.8", 80))
@ -108,3 +145,118 @@ def _detect_ip_address() -> str:
except OSError:
LOGGER.warning("Unable to resolve hostname to IP; defaulting to 127.0.0.1")
return "127.0.0.1"
def _read_csv_env(raw: str | None) -> list[str]:
if not raw:
return []
return [x.strip() for x in raw.split(",") if x.strip()]
def _read_cidrs_env(raw: str | None) -> list[ipaddress.IPv4Network]:
cidrs: list[ipaddress.IPv4Network] = []
for item in _read_csv_env(raw):
try:
net = ipaddress.ip_network(item, strict=False)
if isinstance(net, (ipaddress.IPv4Network,)):
cidrs.append(net)
except ValueError:
LOGGER.warning("Ignoring invalid CIDR in AGENT_PREFER_NET_CIDRS", extra={"cidr": item})
return cidrs
def _list_global_ipv4_addrs() -> list[tuple[str, str]]:
"""列出 (iface, ip) 形式的全局 IPv4 地址。
依赖 iproute2ip -4 -o addr show scope global
"""
results: list[tuple[str, str]] = []
try:
proc = subprocess.run(
["sh", "-lc", "ip -4 -o addr show scope global | awk '{print $2, $4}'"],
check=False,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
timeout=3,
)
if proc.returncode == 0:
for line in proc.stdout.splitlines():
line = line.strip()
if not line:
continue
parts = line.split()
if len(parts) != 2:
continue
iface, cidr = parts
ip = cidr.split("/")[0]
try:
ipaddress.IPv4Address(ip)
except ValueError:
continue
results.append((iface, ip))
except Exception as exc: # pragma: no cover - defensive
LOGGER.debug("Failed to list interfaces", extra={"error": str(exc)})
return results
def _resolve_hostname_ips(name: str) -> list[str]:
ips: list[str] = []
try:
infos = socket.getaddrinfo(name, None, family=socket.AF_INET)
for info in infos:
ip = info[4][0]
if ip not in ips:
ips.append(ip)
except OSError:
pass
return ips
def _pick_by_cidrs(candidates: list[str], prefer_cidrs: list[ipaddress.IPv4Network]) -> str | None:
for net in prefer_cidrs:
for ip in candidates:
try:
if ipaddress.ip_address(ip) in net:
return ip
except ValueError:
continue
return None
def _select_publish_ips(
*,
interfaces: list[tuple[str, str]],
host_ips: list[str],
prefer_cidrs: list[ipaddress.IPv4Network],
) -> tuple[str, str | None, str | None]:
"""返回 (selected_ip, overlay_ip, gwbridge_ip)。
- overlay_ip优先命中 prefer_cidrs10.0/8 先于 172.31/16
- gwbridge_ip若存在 172.22/16 则记录
- selected_ip优先 AGENT_PUBLISH_IP否则 overlay_ip否则 hostname A 记录中的 prefer否则默认路由回退
"""
# detect gwbridge (172.22/16)
gwbridge_net = ipaddress.ip_network("172.22.0.0/16")
gwbridge_ip = None
for _, ip in interfaces:
try:
if ipaddress.ip_address(ip) in gwbridge_net:
gwbridge_ip = ip
break
except ValueError:
continue
# overlay candidate from interfaces by prefer cidrs
iface_ips = [ip for _, ip in interfaces]
overlay_ip = _pick_by_cidrs(iface_ips, prefer_cidrs)
# hostname A records filtered by prefer cidrs
host_pref = _pick_by_cidrs(host_ips, prefer_cidrs)
env_ip = os.environ.get("AGENT_PUBLISH_IP")
if env_ip:
selected = env_ip
else:
selected = overlay_ip or host_pref or _detect_ip_address()
return selected, overlay_ip, gwbridge_ip
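A hedged sketch of the environment knobs this module consumes (variable names come from this diff; the values are illustrative):

```
# Illustrative agent environment:
export AGENT_PREFER_NET_CIDRS="10.0.0.0/8,172.31.0.0/16"  # overlay ranges first
export AGENT_EXCLUDE_IFACES="docker_gwbridge,lo"          # never publish these interfaces
# export AGENT_PUBLISH_IP=10.0.1.13                       # hard override; wins over scanning
```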

Binary file not shown.

View File

@ -13,6 +13,8 @@ class AppConfig:
scheduler_interval_seconds: int
node_id_prefix: str
auth_mode: str
target_prefer_net_cidrs: str
target_reachability_check: bool
def _get_int_env(name: str, default: int) -> int:
@ -27,6 +29,12 @@ def _get_int_env(name: str, default: int) -> int:
def load_config() -> AppConfig:
"""读取环境变量生成配置对象,方便统一管理运行参数。"""
def _bool_env(name: str, default: bool) -> bool:
raw = os.environ.get(name)
if raw is None or raw.strip() == "":
return default
return raw.strip().lower() in ("1", "true", "yes", "on")
return AppConfig(
db_path=os.environ.get("DB_PATH", "/private/argus/master/db.sqlite3"),
metric_nodes_json_path=os.environ.get(
@ -37,4 +45,6 @@ def load_config() -> AppConfig:
scheduler_interval_seconds=_get_int_env("SCHEDULER_INTERVAL_SECONDS", 30),
node_id_prefix=os.environ.get("NODE_ID_PREFIX", "A"),
auth_mode=os.environ.get("AUTH_MODE", "disabled"),
target_prefer_net_cidrs=os.environ.get("TARGET_PREFER_NET_CIDRS", "10.0.0.0/8,172.31.0.0/16"),
target_reachability_check=_bool_env("TARGET_REACHABILITY_CHECK", False),
)
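Correspondingly, a minimal sketch of the master-side settings added here (values illustrative):

```
# Illustrative master environment:
export TARGET_PREFER_NET_CIDRS="10.0.0.0/8,172.31.0.0/16"
export TARGET_REACHABILITY_CHECK=true   # 1s TCP probe of 9100/9400 per candidate
```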

View File

@ -1,8 +1,10 @@
from __future__ import annotations
import ipaddress
import logging
import socket
import threading
from typing import Optional
from typing import Optional, Iterable, Dict, Any, List
from .config import AppConfig
from .storage import Storage
@ -34,10 +36,117 @@ class StatusScheduler:
self._pending_nodes_json.set()
def generate_nodes_json(self) -> None:
"""根据在线节点生成 Prometheus 抓取目标,优先 overlay IP。
候选顺序meta.overlay_ip > hostname A 记录命中偏好网段> meta.ip
可选 reachability 检查TARGET_REACHABILITY_CHECK=true 9100/9400 做一次 1s TCP 连接测试
选择首个可达的候选全部失败则按顺序取第一个并记录日志
"""
with self._nodes_json_lock:
online_nodes = self._storage.get_online_nodes()
atomic_write_json(self._config.metric_nodes_json_path, online_nodes)
self._logger.info("nodes.json updated", extra={"count": len(online_nodes)})
rows = self._storage.get_online_nodes_meta()
prefer_cidrs = self._parse_cidrs(self._config.target_prefer_net_cidrs)
reachability = self._config.target_reachability_check
result: List[Dict[str, Any]] = []
for row in rows:
meta = row.get("meta", {})
hostname = meta.get("hostname") or row.get("name")
labels = row.get("labels") or []
overlay_ip = meta.get("overlay_ip")
legacy_ip = meta.get("ip")
host_candidates = self._resolve_host_ips(hostname)
host_pref = self._pick_by_cidrs(host_candidates, prefer_cidrs)
candidates: List[str] = []
for ip in [overlay_ip, host_pref, legacy_ip]:
if ip and ip not in candidates:
candidates.append(ip)
chosen = None
if reachability:
ports = [9100]
try:
if int(meta.get("gpu_number", 0)) > 0:
ports.append(9400)
except Exception:
pass
for ip in candidates:
if any(self._reachable(ip, p, 1.0) for p in ports):
chosen = ip
break
if not chosen:
chosen = candidates[0] if candidates else legacy_ip
if not chosen:
# ultimate fallback: 127.0.0.1 (should not happen)
chosen = "127.0.0.1"
self._logger.warning("No candidate IPs for node; falling back", extra={"node": row.get("node_id")})
if chosen and ipaddress.ip_address(chosen) in ipaddress.ip_network("172.22.0.0/16"):
self._logger.warning(
"Prometheus target uses docker_gwbridge address; prefer overlay",
extra={"node": row.get("node_id"), "ip": chosen},
)
result.append(
{
"node_id": row.get("node_id"),
"user_id": meta.get("user"),
"ip": chosen,
"hostname": hostname,
"labels": labels if isinstance(labels, list) else [],
}
)
atomic_write_json(self._config.metric_nodes_json_path, result)
self._logger.info("nodes.json updated", extra={"count": len(result)})
# ---------------------------- helpers ----------------------------
@staticmethod
def _parse_cidrs(raw: str) -> List[ipaddress.IPv4Network]:
nets: List[ipaddress.IPv4Network] = []
for item in (x.strip() for x in (raw or "").split(",")):
if not item:
continue
try:
net = ipaddress.ip_network(item, strict=False)
if isinstance(net, ipaddress.IPv4Network):
nets.append(net)
except ValueError:
continue
return nets
@staticmethod
def _resolve_host_ips(hostname: str) -> List[str]:
ips: List[str] = []
try:
infos = socket.getaddrinfo(hostname, None, family=socket.AF_INET)
for info in infos:
ip = info[4][0]
if ip not in ips:
ips.append(ip)
except OSError:
pass
return ips
@staticmethod
def _pick_by_cidrs(candidates: Iterable[str], prefer: List[ipaddress.IPv4Network]) -> str | None:
for net in prefer:
for ip in candidates:
try:
if ipaddress.ip_address(ip) in net:
return ip
except ValueError:
continue
return None
@staticmethod
def _reachable(ip: str, port: int, timeout: float) -> bool:
try:
with socket.create_connection((ip, port), timeout=timeout):
return True
except OSError:
return False
# ------------------------------------------------------------------
# internal loop
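For reference, a hypothetical entry as emitted into nodes.json by `generate_nodes_json` (field names from the code above, values invented), plus a quick gwbridge assertion:

```
# Hypothetical nodes.json entry:
#   {"node_id":"A1","user_id":"yuyr","ip":"10.0.1.13","hostname":"swarm-metric-node-001","labels":[]}
# Assert no gwbridge (172.22/16) addresses slipped through:
jq -e 'map(select(.ip | startswith("172.22."))) | length == 0' \
  /private/argus/metric/prometheus/nodes.json
```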

View File

@ -324,9 +324,35 @@ class Storage:
{
"node_id": row["id"],
"user_id": meta.get("user"),
"ip": meta.get("ip"),
"ip": meta.get("ip"), # kept for backward-compat; preferred IP selection handled in scheduler
"hostname": meta.get("hostname", row["name"]),
"labels": labels if isinstance(labels, list) else [],
}
)
return result
def get_online_nodes_meta(self) -> List[Dict[str, Any]]:
"""返回在线节点的原始 meta 与名称、标签,交由上层选择目标 IP。
每项包含{ node_id, name, meta, labels }
"""
with self._lock:
cur = self._conn.execute(
"SELECT id, name, meta_json, labels_json FROM nodes WHERE status = ? ORDER BY id ASC",
("online",),
)
rows = cur.fetchall()
result: List[Dict[str, Any]] = []
for row in rows:
meta = json.loads(row["meta_json"]) if row["meta_json"] else {}
labels = json.loads(row["labels_json"]) if row["labels_json"] else []
result.append(
{
"node_id": row["id"],
"name": row["name"],
"meta": meta if isinstance(meta, dict) else {},
"labels": labels if isinstance(labels, list) else [],
}
)
return result

View File

@ -1 +1 @@
1.40.0
1.42.0

View File

@ -274,19 +274,33 @@ verify_checksums() {
log_info "Artifact 目录: $artifact_dir"
failed_verification=0
# Try to resolve install_order from version.json to pin exact filenames, avoiding ambiguity when multiple historical tars exist in the same directory
local order_file="$TEMP_DIR/install_order.txt"
if [[ -f "$TEMP_DIR/checksums.txt" ]]; then
while IFS= read -r line; do
component=$(echo "$line" | cut -d':' -f1)
expected_checksum=$(echo "$line" | cut -d':' -f2-)
# Find the matching tar file
# Prefer deriving the exact filename from install_order
actual_file=""
if [[ -f "$order_file" ]]; then
while IFS= read -r fname; do
if [[ "$fname" == ${component}-*.tar.gz && -f "$artifact_dir/$fname" ]]; then
actual_file="$artifact_dir/$fname"
break
fi
done < "$order_file"
fi
# Fallback: first prefix match (not recommended, kept for compatibility)
if [[ -z "$actual_file" ]]; then
for file in "$artifact_dir/${component}-"*.tar.gz; do
if [[ -f "$file" ]]; then
actual_file="$file"
break
fi
done
fi
if [[ -z "$actual_file" ]]; then
log_error "找不到组件文件: $component"

View File

@ -59,6 +59,12 @@ ARTIFACT_DIR="artifact/$VERSION"
log_info "开始打包 AIOps All-in-One 安装包 v$VERSION"
# 若强制打包且目录已存在,先清理旧产物以避免同一版本下残留多个 tar.gz 导致校验混乱
if [[ "$FORCE_PACKAGE" == "true" && -d "$ARTIFACT_DIR" ]]; then
log_info "--force: 清理旧的 $ARTIFACT_DIR 下的 tar 与元数据"
rm -rf "$ARTIFACT_DIR"
fi
# Check required files
log_info "Checking required files..."
if [[ ! -f "config/VERSION" ]]; then
@ -130,7 +136,7 @@ if [[ -d "$ARTIFACT_DIR" && "$FORCE_PACKAGE" == "false" ]]; then
fi
fi
# Create the artifact directory
# Create the artifact directory (recreated after cleanup)
mkdir -p "$ARTIFACT_DIR"
log_info "Creating output directory: $ARTIFACT_DIR"
@ -285,10 +291,13 @@ while IFS= read -r component; do
exit 1
fi
# Remove historical tars in the component directory so find cannot pick a stale file
rm -f ./*.tar.gz 2>/dev/null || true
# Run the component's package script
if ./package.sh; then
# Locate the generated tar
tar_file=$(find . -name "*.tar.gz" -type f | head -1)
tar_file=$(ls -1t ./*.tar.gz 2>/dev/null | head -1)
if [[ -n "$tar_file" ]]; then
# Move it into the artifact directory
mv "$tar_file" "$CURRENT_DIR/$ARTIFACT_DIR/"

View File

@ -130,20 +130,40 @@ fi
TEMP_PACKAGE_DIR="/tmp/argus-metric-package-$$"
mkdir -p "$TEMP_PACKAGE_DIR"
# Copy all tar.gz files to the temp directory
log_info "Preparing artifact files..."
tar_files=$(find "$ARTIFACT_DIR" -name "*.tar.gz" -type f)
# Copy only the tar.gz files listed in version.json's install_order, so stale files under the same version directory cannot break checksum verification
log_info "Preparing artifact files (per install_order)..."
install_list_file="$TEMP_DIR/install_list.txt"
if command -v jq >/dev/null 2>&1; then
jq -r '.install_order[]' "$ARTIFACT_DIR/version.json" > "$install_list_file" 2>/dev/null || true
else
# Naive parsing without jq
grep -A 200 '"install_order"' "$ARTIFACT_DIR/version.json" | grep -E '".*"' | sed 's/.*"\([^"]*\)".*/\1/' > "$install_list_file" 2>/dev/null || true
fi
if [[ -s "$install_list_file" ]]; then
while IFS= read -r filename; do
src="$ARTIFACT_DIR/$filename"
if [[ -f "$src" ]]; then
log_info " 拷贝: $filename"
cp "$src" "$TEMP_PACKAGE_DIR/"
else
log_warning " 未找到: $filename(跳过)"
fi
done < "$install_list_file"
else
log_warning "未能解析 install_order将回退复制全部 tar.gz可能包含历史残留建议安装端使用严格校验"
tar_files=$(find "$ARTIFACT_DIR" -name "*.tar.gz" -type f)
if [[ -z "$tar_files" ]]; then
log_error "$ARTIFACT_DIR 中未找到 tar.gz 文件"
exit 1
fi
for file in $tar_files; do
filename=$(basename "$file")
log_info " 准备: $filename"
cp "$file" "$TEMP_PACKAGE_DIR/"
done
fi
# Copy the version metadata file
if [[ -f "$ARTIFACT_DIR/version.json" ]]; then

View File

@ -0,0 +1,16 @@
ARG BASE_IMAGE=argus-sys-metric-test-node:latest
FROM ${BASE_IMAGE}
ARG CLIENT_VER
LABEL org.opencontainers.image.title="argus-sys-metric-test-node-bundle" \
org.opencontainers.image.version="${CLIENT_VER}" \
org.opencontainers.image.description="Metric test node with embedded client package"
WORKDIR /
# bundle files are provided at build time into ./bundle in build context
COPY bundle/ /bundle/
COPY node-bootstrap.sh /usr/local/bin/node-bootstrap.sh
RUN chmod +x /usr/local/bin/node-bootstrap.sh
ENTRYPOINT ["/usr/local/bin/node-bootstrap.sh"]
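An equivalent manual build, mirroring the arguments `build_images.sh` passes (the version value is illustrative):

```
docker build --build-arg CLIENT_VER=1.42.0 \
  --build-arg BASE_IMAGE=argus-sys-metric-test-node:latest \
  -t argus-sys-metric-test-node-bundle:latest \
  -f src/sys/build/node-bundle/Dockerfile src/sys/build/node-bundle
```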

View File

@ -0,0 +1,2 @@
argus-metric_*.tar.gz

File diff suppressed because it is too large

View File

@ -0,0 +1,99 @@
#!/usr/bin/env bash
set -euo pipefail
echo "[BOOT] node bundle starting"
INSTALL_DIR="/opt/argus-metric"
BUNDLE_DIR="/bundle"
installed_ok=0
# 1) already installed?
if [[ -L "$INSTALL_DIR/current" && -d "$INSTALL_DIR/current" ]]; then
echo "[BOOT] client already installed at $INSTALL_DIR/current"
else
# 2) try local bundle first (replicate setup.sh layout: move to /opt/argus-metric/versions/<ver> and run install.sh)
tarball=$(ls -1 "$BUNDLE_DIR"/argus-metric_*.tar.gz 2>/dev/null | head -1 || true)
if [[ -n "${tarball:-}" ]]; then
echo "[BOOT] installing from local bundle: $(basename "$tarball")"
tmp=$(mktemp -d)
tar -xzf "$tarball" -C "$tmp"
# locate root containing version.json
root="$tmp"
if [[ ! -f "$root/version.json" ]]; then
sub=$(find "$tmp" -mindepth 1 -maxdepth 1 -type d | head -n1 || true)
[[ -n "$sub" && -f "$sub/version.json" ]] && root="$sub"
fi
if [[ ! -f "$root/version.json" ]]; then
echo "[BOOT][WARN] version.json not found in bundle; fallback to FTP"
else
ver=$(sed -n 's/.*"version"\s*:\s*"\([^"]\+\)".*/\1/p' "$root/version.json" | head -n1)
if [[ -z "$ver" ]]; then
echo "[BOOT][WARN] failed to parse version from version.json; fallback to FTP"
else
target_root="/opt/argus-metric"
version_dir="$target_root/versions/$ver"
mkdir -p "$version_dir"
# move contents into version dir
shopt -s dotglob
mv "$root"/* "$version_dir/" 2>/dev/null || true
shopt -u dotglob
# run component installer within version dir
if [[ -f "$version_dir/install.sh" ]]; then
chmod +x "$version_dir/install.sh" 2>/dev/null || true
(cd "$version_dir" && ./install.sh "$version_dir")
echo "$ver" > "$target_root/LATEST_VERSION" 2>/dev/null || true
ln -sfn "$version_dir" "$target_root/current" 2>/dev/null || true
if [[ -L "$target_root/current" && -d "$target_root/current" ]]; then
installed_ok=1
echo "[BOOT] local bundle install OK: version=$ver"
else
echo "[BOOT][WARN] current symlink not present after install; will rely on healthcheck to confirm"
fi
else
echo "[BOOT][WARN] install.sh missing under $version_dir; fallback to FTP"
fi
fi
fi
fi
# 3) fallback: use FTP setup if not installed
if [[ ! -L "$INSTALL_DIR/current" && "$installed_ok" -eq 0 ]]; then
echo "[BOOT] fallback to FTP setup"
if [[ -z "${FTPIP:-}" || -z "${FTP_USER:-}" || -z "${FTP_PASSWORD:-}" ]]; then
echo "[BOOT][ERROR] FTP variables not set (FTPIP/FTP_USER/FTP_PASSWORD)" >&2
exit 1
fi
curl -u "$FTP_USER:$FTP_PASSWORD" -fsSL "ftp://$FTPIP:21/setup.sh" -o /tmp/setup.sh
chmod +x /tmp/setup.sh
/tmp/setup.sh --server "$FTPIP" --user "$FTP_USER" --password "$FTP_PASSWORD" --port 21
fi
fi
# 4) ensure agent is running; start if needed (inherits env: MASTER_ENDPOINT/AGENT_*)
if ! pgrep -x argus-agent >/dev/null 2>&1; then
echo "[BOOT] starting argus-agent (not detected)"
setsid /usr/local/bin/argus-agent >/var/log/argus-agent.log 2>&1 < /dev/null &
fi
# 5) post-install selfcheck (best-effort) and wait for node.json
for i in {1..30}; do
if compgen -G "$INSTALL_DIR/versions/*/check_health.sh" > /dev/null; then
bash "$INSTALL_DIR"/versions/*/check_health.sh || true
break
fi
sleep 2
done
host="$(hostname)"
state_dir="/private/argus/agent/${host}"
mkdir -p "$state_dir" 2>/dev/null || true
for i in {1..60}; do
if [[ -s "$state_dir/node.json" ]]; then
echo "[BOOT] node state present: $state_dir/node.json"
break
fi
sleep 2
done
echo "[BOOT] ready; entering sleep"
exec sleep infinity
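For orientation, the install layout this bootstrap reproduces (version number illustrative):

```
# /opt/argus-metric/
# ├── LATEST_VERSION            # e.g. 1.42.0
# ├── current -> versions/1.42.0
# └── versions/1.42.0/          # version.json, install.sh, component tars
```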

src/sys/swarm_tests/.env (new file, 21 lines)
View File

@ -0,0 +1,21 @@
SERVER_PROJECT=argus-swarm-server
NODES_PROJECT=argus-swarm-nodes
# Host ports for server compose
MASTER_PORT=32300
ES_HTTP_PORT=9200
KIBANA_PORT=5601
PROMETHEUS_PORT=9090
GRAFANA_PORT=3000
ALERTMANAGER_PORT=9093
WEB_PROXY_PORT_8080=8080
WEB_PROXY_PORT_8081=8081
WEB_PROXY_PORT_8082=8082
WEB_PROXY_PORT_8083=8083
WEB_PROXY_PORT_8084=8084
WEB_PROXY_PORT_8085=8085
# UID/GID for volume ownership in containers
ARGUS_BUILD_UID=1000
ARGUS_BUILD_GID=1000

View File

@ -0,0 +1,21 @@
SERVER_PROJECT=argus-swarm-server
NODES_PROJECT=argus-swarm-nodes
# Host ports for server compose
MASTER_PORT=32300
ES_HTTP_PORT=9200
KIBANA_PORT=5601
PROMETHEUS_PORT=9090
GRAFANA_PORT=3000
ALERTMANAGER_PORT=9093
WEB_PROXY_PORT_8080=8080
WEB_PROXY_PORT_8081=8081
WEB_PROXY_PORT_8082=8082
WEB_PROXY_PORT_8083=8083
WEB_PROXY_PORT_8084=8084
WEB_PROXY_PORT_8085=8085
# UID/GID for volume ownership in containers
ARGUS_BUILD_UID=2133
ARGUS_BUILD_GID=2015

View File

@ -0,0 +1,8 @@
BINDIP=10.0.1.5
FTPIP=10.0.1.4
MASTER_ENDPOINT=http://master.argus.com:3000
FTP_USER=ftpuser
FTP_PASSWORD=ZGClab1234!
AGENT_ENV=dev2
AGENT_USER=yuyr
AGENT_INSTANCE=node001sX

View File

@ -0,0 +1,52 @@
# Swarm Tests (argus-sys-net)
Quickly validate an end-to-end "server + single node" deployment on one machine using Docker Swarm and an overlay network. Stays compatible with `src/sys/tests` and does not affect the existing bridge-network tests.
## Prerequisites
- Docker Engine with Swarm available (the scripts run `swarm init` automatically for single-node mode).
- The following images built and loaded: `argus-bind9:latest`, `argus-master:latest`, `argus-elasticsearch:latest`, `argus-kibana:latest`, `argus-metric-ftp:latest`, `argus-metric-prometheus:latest`, `argus-metric-grafana:latest`, `argus-alertmanager:latest`, `argus-web-frontend:latest`, `argus-web-proxy:latest`, plus the node image `argus-sys-metric-test-node-bundle:latest` (see below).
- Local `UID/GID` should be set via `configs/build_user.local.conf`, which the scripts read:
  - e.g. `UID=1000` / `GID=1000`.
## Building the node bundle image
```
./deployment/build/build_images.sh --with-node-bundle --client-version 20251106
```
Note: `--client-version` accepts either a `YYYYMMDD` date package or a `1.xx.yy` component version. After packaging, the image `argus-sys-metric-test-node-bundle:latest` embeds `argus-metric_*.tar.gz`; on startup the container installs from the local bundle first.
## Run steps
```
cd src/sys/swarm_tests
cp .env.example .env
bash scripts/00_bootstrap.sh
bash scripts/01_server_up.sh
bash scripts/02_wait_ready.sh # writes BINDIP/FTPIP to .env.nodes
bash scripts/03_nodes_up.sh
bash scripts/04_metric_verify.sh
```
Cleanup:
```
bash scripts/99_down.sh
```
## Notes and caveats
- `00_bootstrap.sh`: loads `scripts/common/build_user.sh` first, prints and writes `ARGUS_BUILD_UID/GID` into `.env`, then prepares the `private-server/` and `private-nodes/` directories and `chown`s them to the resolved UID/GID.
- `01_server_up.sh`: starts the server compose project. `SWARM_FIX_PERMS=1` enables the fallback "chmod inside containers + supervisor restart" logic; off by default.
- `02_wait_ready.sh`: waits for Master/ES/Prometheus/Grafana to become ready (Kibana may lag), then resolves overlay IPs and writes `BINDIP/FTPIP` into `.env.nodes` for the nodes compose file.
- `03_nodes_up.sh`: starts the single node container (bundle variant). Inside it, `node-bootstrap.sh` installs from the local bundle first; on success it runs the health check and waits for `/private/argus/agent/<hostname>/node.json` to appear.
- `04_metric_verify.sh`: runs detailed verification within this suite (no longer calls the tests scripts directly):
  - Grafana `/api/health` (database=ok)
  - the Grafana datasource points at `prom.metric.argus.com:<port>` and that domain resolves inside the container
  - all Prometheus `activeTargets` are up
  - `nodes.json` contains no `172.22/16` (docker_gwbridge) addresses
## Troubleshooting
- Grafana/Kibana permission errors at startup: check that `configs/build_user.local.conf` matches the UID/GID printed by `00_bootstrap.sh`; if needed, set `SWARM_FIX_PERMS=1` and rerun `01_server_up.sh`.
- The node container falls back to FTP: usually a malformed bundle layout or a failed health check (earlier scripts ran it under `sh`). The current `node-bootstrap.sh` runs the health check with `bash` and skips FTP once the local install succeeds.
- Proxy 502: inspect `/var/log/nginx/error.log` in the `argus-web-proxy` container and the `upstream check` lines in its startup log (see the sketch below); if a backend is not ready (especially Kibana), wait until `02_wait_ready.sh` passes before accessing.
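A hedged pair of commands for the 502 case (container name per the compose file in this directory):

```
docker logs argus-web-proxy 2>&1 | grep 'upstream check' | tail -n 20
docker exec argus-web-proxy tail -n 50 /var/log/nginx/error.log
```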

View File

@ -0,0 +1,34 @@
version: "3.8"
networks:
argus-sys-net:
external: true
services:
metric-test-node:
image: ${NODE_BUNDLE_IMAGE_TAG:-argus-sys-metric-test-node-bundle:latest}
container_name: argus-metric-test-node-swarm
hostname: ${NODE_HOSTNAME:-swarm-metric-node-001}
restart: unless-stopped
environment:
- TZ=Asia/Shanghai
- DEBIAN_FRONTEND=noninteractive
- MASTER_ENDPOINT=${MASTER_ENDPOINT:-http://master.argus.com:3000}
- ES_HOST=es.log.argus.com
- ES_PORT=9200
- FTPIP=${FTPIP}
- BINDIP=${BINDIP}
- FTP_USER=${FTP_USER:-ftpuser}
- FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!}
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
- AGENT_ENV=${AGENT_ENV:-dev2}
- AGENT_USER=${AGENT_USER:-yuyr}
- AGENT_INSTANCE=${AGENT_INSTANCE:-node001sX}
- CLIENT_VERSION=${CLIENT_VERSION:-}
dns:
- ${BINDIP}
networks: [argus-sys-net]
volumes:
- ./private-nodes/argus/agent:/private/argus/agent
command: ["sleep", "infinity"]

View File

@ -0,0 +1,174 @@
version: "3.8"
networks:
argus-sys-net:
external: true
services:
bind:
image: ${BIND_IMAGE_TAG:-argus-bind9:latest}
container_name: argus-bind-sys
networks: [argus-sys-net]
volumes:
- ./private-server:/private
restart: unless-stopped
master:
image: ${MASTER_IMAGE_TAG:-argus-master:latest}
container_name: argus-master-sys
depends_on: [bind]
environment:
- OFFLINE_THRESHOLD_SECONDS=6
- ONLINE_THRESHOLD_SECONDS=2
- SCHEDULER_INTERVAL_SECONDS=1
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
ports:
- "${MASTER_PORT:-32300}:3000"
volumes:
- ./private-server/argus/master:/private/argus/master
- ./private-server/argus/metric/prometheus:/private/argus/metric/prometheus
- ./private-server/argus/etc:/private/argus/etc
networks: [argus-sys-net]
restart: unless-stopped
es:
image: ${ES_IMAGE_TAG:-argus-elasticsearch:latest}
container_name: argus-es-sys
environment:
- discovery.type=single-node
- xpack.security.enabled=false
- ES_JAVA_OPTS=-Xms512m -Xmx512m
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
volumes:
- ./private-server/argus/log/elasticsearch:/private/argus/log/elasticsearch
- ./private-server/argus/etc:/private/argus/etc
ports:
- "${ES_HTTP_PORT:-9200}:9200"
restart: unless-stopped
networks: [argus-sys-net]
kibana:
image: ${KIBANA_IMAGE_TAG:-argus-kibana:latest}
container_name: argus-kibana-sys
environment:
- ELASTICSEARCH_HOSTS=http://es.log.argus.com:9200
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
volumes:
- ./private-server/argus/log/kibana:/private/argus/log/kibana
- ./private-server/argus/etc:/private/argus/etc
depends_on: [es]
ports:
- "${KIBANA_PORT:-5601}:5601"
restart: unless-stopped
networks: [argus-sys-net]
ftp:
image: ${FTP_IMAGE_TAG:-argus-metric-ftp:latest}
container_name: argus-ftp
restart: unless-stopped
environment:
- TZ=Asia/Shanghai
- FTP_BASE_PATH=/private/argus/ftp
- FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!}
- DOMAIN=${FTP_DOMAIN:-ftp.metric.argus.com}
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
ports:
- "${FTP_PORT:-21}:21"
- "${FTP_DATA_PORT:-20}:20"
- "${FTP_PASSIVE_HOST_RANGE:-21100-21110}:21100-21110"
volumes:
- ./private-server/argus/metric/ftp:/private/argus/ftp
- ./private-server/argus/etc:/private/argus/etc
networks: [argus-sys-net]
prometheus:
image: ${PROM_IMAGE_TAG:-argus-metric-prometheus:latest}
container_name: argus-prometheus
restart: unless-stopped
environment:
- TZ=Asia/Shanghai
- PROMETHEUS_BASE_PATH=/private/argus/metric/prometheus
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
ports:
- "${PROMETHEUS_PORT:-9090}:9090"
volumes:
- ./private-server/argus/metric/prometheus:/private/argus/metric/prometheus
- ./private-server/argus/etc:/private/argus/etc
networks: [argus-sys-net]
grafana:
image: ${GRAFANA_IMAGE_TAG:-argus-metric-grafana:latest}
container_name: argus-grafana
restart: unless-stopped
environment:
- TZ=Asia/Shanghai
- GRAFANA_BASE_PATH=/private/argus/metric/grafana
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
- GF_SERVER_HTTP_PORT=3000
- GF_LOG_LEVEL=warn
- GF_LOG_MODE=console
- GF_PATHS_PROVISIONING=/private/argus/metric/grafana/provisioning
- GF_AUTH_ANONYMOUS_ENABLED=true
- GF_AUTH_ANONYMOUS_ORG_ROLE=Viewer
ports:
- "${GRAFANA_PORT:-3000}:3000"
volumes:
- ./private-server/argus/metric/grafana:/private/argus/metric/grafana
- ./private-server/argus/etc:/private/argus/etc
depends_on: [prometheus]
networks: [argus-sys-net]
alertmanager:
image: ${ALERT_IMAGE_TAG:-argus-alertmanager:latest}
container_name: argus-alertmanager
environment:
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
volumes:
- ./private-server/argus/etc:/private/argus/etc
- ./private-server/argus/alert/alertmanager:/private/argus/alert/alertmanager
networks: [argus-sys-net]
ports:
- "${ALERTMANAGER_PORT:-9093}:9093"
restart: unless-stopped
web-frontend:
image: ${FRONT_IMAGE_TAG:-argus-web-frontend:latest}
container_name: argus-web-frontend
environment:
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
- EXTERNAL_MASTER_PORT=${WEB_PROXY_PORT_8085:-8085}
- EXTERNAL_ALERTMANAGER_PORT=${WEB_PROXY_PORT_8084:-8084}
- EXTERNAL_GRAFANA_PORT=${WEB_PROXY_PORT_8081:-8081}
- EXTERNAL_PROMETHEUS_PORT=${WEB_PROXY_PORT_8082:-8082}
- EXTERNAL_KIBANA_PORT=${WEB_PROXY_PORT_8083:-8083}
volumes:
- ./private-server/argus/etc:/private/argus/etc
networks: [argus-sys-net]
restart: unless-stopped
web-proxy:
image: ${WEB_PROXY_IMAGE_TAG:-argus-web-proxy:latest}
container_name: argus-web-proxy
depends_on: [bind, master, grafana, prometheus, kibana, alertmanager]
environment:
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
volumes:
- ./private-server/argus/etc:/private/argus/etc
networks: [argus-sys-net]
ports:
- "${WEB_PROXY_PORT_8080:-8080}:8080"
- "${WEB_PROXY_PORT_8081:-8081}:8081"
- "${WEB_PROXY_PORT_8082:-8082}:8082"
- "${WEB_PROXY_PORT_8083:-8083}:8083"
- "${WEB_PROXY_PORT_8084:-8084}:8084"
- "${WEB_PROXY_PORT_8085:-8085}:8085"
restart: unless-stopped

View File

@ -0,0 +1,101 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
REPO_ROOT="$(cd "$ROOT/../../.." && pwd)"
ENV_FILE="$ROOT/.env"; [[ -f "$ENV_FILE" ]] || cp "$ROOT/.env.example" "$ENV_FILE"
# Load build user (UID/GID) from repo config to match container runtime users
if [[ -f "$REPO_ROOT/scripts/common/build_user.sh" ]]; then
# shellcheck disable=SC1091
source "$REPO_ROOT/scripts/common/build_user.sh" 2>/dev/null || true
if declare -f load_build_user >/dev/null 2>&1; then
load_build_user
fi
fi
# Capture resolved UID/GID from build_user before sourcing .env
uid_resolved="${ARGUS_BUILD_UID:-2133}"
gid_resolved="${ARGUS_BUILD_GID:-2015}"
echo "[BOOT] resolved build user: UID=${uid_resolved} GID=${gid_resolved} (from scripts/common/build_user.sh or env)"
# After resolving UID/GID, load .env for other settings; then we will overwrite UID/GID entries
set -a; source "$ENV_FILE"; set +a
echo "[BOOT] checking Docker Swarm"
if ! docker info 2>/dev/null | grep -q "Swarm: active"; then
echo "[BOOT] initializing swarm (single-node)"
docker swarm init >/dev/null 2>&1 || true
fi
NET_NAME=argus-sys-net
if docker network inspect "$NET_NAME" >/dev/null 2>&1; then
echo "[BOOT] overlay network exists: $NET_NAME"
else
echo "[BOOT] creating overlay network: $NET_NAME"
docker network create -d overlay --attachable "$NET_NAME"
fi
echo "[BOOT] preparing private directories (server/nodes)"
# Server-side dirs (align with sys/tests 01_bootstrap.sh)
mkdir -p \
"$ROOT/private-server/argus/etc" \
"$ROOT/private-server/argus/bind" \
"$ROOT/private-server/argus/master" \
"$ROOT/private-server/argus/metric/prometheus" \
"$ROOT/private-server/argus/metric/prometheus/data" \
"$ROOT/private-server/argus/metric/prometheus/rules" \
"$ROOT/private-server/argus/metric/prometheus/targets" \
"$ROOT/private-server/argus/alert/alertmanager" \
"$ROOT/private-server/argus/metric/ftp/share" \
"$ROOT/private-server/argus/metric/grafana/data" \
"$ROOT/private-server/argus/metric/grafana/logs" \
"$ROOT/private-server/argus/metric/grafana/plugins" \
"$ROOT/private-server/argus/metric/grafana/provisioning/datasources" \
"$ROOT/private-server/argus/metric/grafana/provisioning/dashboards" \
"$ROOT/private-server/argus/metric/grafana/data/sessions" \
"$ROOT/private-server/argus/metric/grafana/data/dashboards" \
"$ROOT/private-server/argus/metric/grafana/config" \
"$ROOT/private-server/argus/agent" \
"$ROOT/private-server/argus/log/elasticsearch" \
"$ROOT/private-server/argus/log/kibana"
mkdir -p "$ROOT/private-nodes/argus/agent"
uid="$uid_resolved"; gid="$gid_resolved"
echo "[BOOT] chown -R ${uid}:${gid} for server core dirs (best-effort)"
chown -R "$uid":"$gid" \
"$ROOT/private-server/argus/log/elasticsearch" \
"$ROOT/private-server/argus/log/kibana" \
"$ROOT/private-server/argus/metric/grafana" \
"$ROOT/private-server/argus/metric/prometheus" \
"$ROOT/private-server/argus/alert" \
"$ROOT/private-server/argus/metric/ftp" \
"$ROOT/private-server/argus/agent" \
"$ROOT/private-server/argus/etc" 2>/dev/null || true
# group-writable for etc/alert as in sys/tests
chmod -R g+w "$ROOT/private-server/argus/alert" "$ROOT/private-server/argus/etc" 2>/dev/null || true
# ensure .env carries the resolved UID/GID for compose env interpolation
if grep -q '^ARGUS_BUILD_UID=' "$ENV_FILE"; then
sed -i "s/^ARGUS_BUILD_UID=.*/ARGUS_BUILD_UID=${uid}/" "$ENV_FILE"
else
echo "ARGUS_BUILD_UID=${uid}" >> "$ENV_FILE"
fi
if grep -q '^ARGUS_BUILD_GID=' "$ENV_FILE"; then
sed -i "s/^ARGUS_BUILD_GID=.*/ARGUS_BUILD_GID=${gid}/" "$ENV_FILE"
else
echo "ARGUS_BUILD_GID=${gid}" >> "$ENV_FILE"
fi
# distribute update-dns.sh
BIND_UPDATE_SRC="$REPO_ROOT/src/bind/build/update-dns.sh"
BIND_UPDATE_DEST="$ROOT/private-server/argus/etc/update-dns.sh"
if [[ -f "$BIND_UPDATE_SRC" ]]; then
cp "$BIND_UPDATE_SRC" "$BIND_UPDATE_DEST" && chmod +x "$BIND_UPDATE_DEST" || true
fi
echo "[BOOT] done"

View File

@ -0,0 +1,39 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
REPO_ROOT="$(cd "$ROOT/../../.." && pwd)"
ENV_FILE="$ROOT/.env"
# load UID/GID from repo config first (so they take precedence over any stale .env values)
if [[ -f "$REPO_ROOT/scripts/common/build_user.sh" ]]; then
# shellcheck disable=SC1091
source "$REPO_ROOT/scripts/common/build_user.sh" 2>/dev/null || true
if declare -f load_build_user >/dev/null 2>&1; then
load_build_user
fi
fi
set -a; source "$ENV_FILE"; set +a
PROJECT="${SERVER_PROJECT:-argus-swarm-server}"
COMPOSE_FILE="$ROOT/docker-compose.server.yml"
echo "[SERVER] starting compose project: $PROJECT"
docker compose -p "$PROJECT" -f "$COMPOSE_FILE" up -d
echo "[SERVER] containers:"; docker compose -p "$PROJECT" -f "$COMPOSE_FILE" ps
# Optional post-start permission alignment (disabled by default). Enable with SWARM_FIX_PERMS=1
if [[ "${SWARM_FIX_PERMS:-0}" == "1" ]]; then
echo "[SERVER] aligning permissions in containers (best-effort)"
for c in argus-master-sys argus-prometheus argus-grafana argus-ftp argus-es-sys argus-kibana-sys argus-web-frontend argus-web-proxy argus-alertmanager; do
docker exec "$c" sh -lc 'mkdir -p /private/argus && chmod -R 777 /private/argus' 2>/dev/null || true
done
echo "[SERVER] restarting selected supervised programs to pick up new permissions"
docker exec argus-prometheus sh -lc 'supervisorctl restart prometheus targets-updater >/dev/null 2>&1 || true' || true
docker exec argus-grafana sh -lc 'rm -f /private/argus/etc/grafana.metric.argus.com 2>/dev/null || true; supervisorctl restart grafana >/dev/null 2>&1 || true' || true
docker exec argus-es-sys sh -lc 'supervisorctl restart elasticsearch >/dev/null 2>&1 || true' || true
docker exec argus-kibana-sys sh -lc 'supervisorctl restart kibana >/dev/null 2>&1 || true' || true
fi
echo "[SERVER] done"

View File

@ -0,0 +1,84 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
ENV_FILE="$ROOT/.env"; set -a; source "$ENV_FILE"; set +a
PROJECT="${SERVER_PROJECT:-argus-swarm-server}"
RETRIES=${RETRIES:-60}
SLEEP=${SLEEP:-5}
code() { curl -4 -s -o /dev/null -w "%{http_code}" "$1" || echo 000; }
prom_ok() {
# Consider ready if TCP:9090 is accepting on localhost (host side)
(exec 3<>/dev/tcp/127.0.0.1/${PROMETHEUS_PORT:-9090}) >/dev/null 2>&1 && return 0
return 1
}
echo "[READY] waiting services (max $((RETRIES*SLEEP))s)"
for i in $(seq 1 "$RETRIES"); do
e1=$(code "http://127.0.0.1:${MASTER_PORT:-32300}/readyz")
e2=$(code "http://127.0.0.1:${ES_HTTP_PORT:-9200}/_cluster/health")
e3=000
if prom_ok; then e3=200; fi
e4=$(code "http://127.0.0.1:${GRAFANA_PORT:-3000}/api/health")
e5=$(code "http://127.0.0.1:${KIBANA_PORT:-5601}/api/status")
ok=0
[[ "$e1" == 200 ]] && ok=$((ok+1))
[[ "$e2" == 200 ]] && ok=$((ok+1))
[[ "$e3" == 200 ]] && ok=$((ok+1))
[[ "$e4" == 200 ]] && ok=$((ok+1))
# Kibana is relaxed; the other four services are sufficient
if [[ $ok -ge 4 ]]; then echo "[READY] base services OK"; break; fi
echo "[..] waiting ($i/$RETRIES): master=$e1 es=$e2 prom=$e3 graf=$e4 kibana=$e5"; sleep "$SLEEP"
done
if [[ $ok -lt 4 ]]; then echo "[ERROR] services not ready" >&2; exit 1; fi
echo "[READY] resolving overlay IPs"
BINDIP=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' argus-bind-sys)
FTPIP=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' argus-ftp)
echo "BINDIP=$BINDIP FTPIP=$FTPIP"
ENV_NODES="$ROOT/.env.nodes"
cat > "$ENV_NODES" <<EOF
BINDIP=$BINDIP
FTPIP=$FTPIP
MASTER_ENDPOINT=http://master.argus.com:3000
FTP_USER=ftpuser
FTP_PASSWORD=ZGClab1234!
AGENT_ENV=dev2
AGENT_USER=yuyr
AGENT_INSTANCE=node001sX
EOF
echo "[READY] wrote $ENV_NODES"
# Inline: fix domain records -> actual overlay IPs and reload bind/nginx (best-effort)
echo "[READY] fixing domain records to overlay IPs"
ETC_DIR="$ROOT/private-server/argus/etc"; mkdir -p "$ETC_DIR"
declare -A MAP
MAP[web-frontend]=web.argus.com
MAP[argus-grafana]=grafana.metric.argus.com
MAP[argus-prometheus]=prom.metric.argus.com
MAP[argus-kibana-sys]=kibana.log.argus.com
MAP[argus-alertmanager]=alertmanager.alert.argus.com
MAP[argus-master-sys]=master.argus.com
changed=0
for cname in "${!MAP[@]}"; do
domain="${MAP[$cname]}"; fpath="$ETC_DIR/$domain"
ip=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' "$cname" 2>/dev/null || true)
[[ -z "$ip" ]] && { echo "[DNS-FIX][WARN] $domain: container $cname no overlay IP yet"; continue; }
cur=$(cat "$fpath" 2>/dev/null || echo "")
if [[ "$cur" != "$ip" ]]; then
echo "$ip" > "$fpath"; echo "[DNS-FIX][SET] $domain = $ip (was: ${cur:-<empty>})"; changed=1
else
echo "[DNS-FIX][OK] $domain already $ip"
fi
done
if [[ $changed -eq 1 ]]; then
docker exec argus-bind-sys /usr/local/bin/reload-bind9.sh >/dev/null 2>&1 || true
sleep 1
fi
docker exec argus-web-proxy nginx -s reload >/dev/null 2>&1 || true
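A hedged spot-check that bind picked up the refreshed records (domain names per the MAP above; assumes getent is present in the bind container):

```
docker exec argus-bind-sys sh -lc 'getent hosts master.argus.com grafana.metric.argus.com'
```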

View File

@ -0,0 +1,16 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
ENV_FILE="$ROOT/.env"; set -a; source "$ENV_FILE"; set +a
ENV_NODES_FILE="$ROOT/.env.nodes"; set -a; source "$ENV_NODES_FILE"; set +a
PROJECT="${NODES_PROJECT:-argus-swarm-nodes}"
COMPOSE_FILE="$ROOT/docker-compose.nodes.yml"
echo "[NODES] starting compose project: $PROJECT"
docker compose -p "$PROJECT" --env-file "$ENV_NODES_FILE" -f "$COMPOSE_FILE" up -d
docker compose -p "$PROJECT" -f "$COMPOSE_FILE" ps
echo "[NODES] done"

View File

@ -0,0 +1,173 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
ENV_FILE="$ROOT/.env"; [[ -f "$ENV_FILE" ]] && { set -a; source "$ENV_FILE"; set +a; }
PROM_PORT="${PROMETHEUS_PORT:-9090}"
GRAF_PORT="${GRAFANA_PORT:-3000}"
GRAF_URL="http://127.0.0.1:${GRAF_PORT}"
PROM_DOMAIN="prom.metric.argus.com:${PROM_PORT}"
err() { echo "[ERR] $*" >&2; }
ok() { echo "[OK] $*"; }
info(){ echo "[INFO] $*"; }
fail() { err "$*"; exit 1; }
# Ensure fluent-bit is installed, configured and running to ship logs to ES
# Best-effort remediation for swarm_tests only (does not change repo sources)
ensure_fluentbit() {
local cname="$1"
# 1) ensure process exists or try local bundle installer
if ! docker exec "$cname" pgrep -x fluent-bit >/dev/null 2>&1; then
docker exec "$cname" bash -lc '
set -e
root=/opt/argus-metric/versions
ver=$(ls -1 "$root" 2>/dev/null | sort -Vr | head -1 || true)
[[ -z "$ver" ]] && ver=1.42.0
verdir="$root/$ver"
tb=$(ls -1 "$verdir"/fluent-bit-*.tar.gz 2>/dev/null | head -1 || true)
if [ -n "$tb" ]; then tmp=$(mktemp -d); tar -xzf "$tb" -C "$tmp"; sub=$(find "$tmp" -mindepth 1 -maxdepth 1 -type d | head -n1 || true); [ -n "$sub" ] && (cd "$sub" && ./install.sh "$verdir") || true; fi
' >/dev/null 2>&1 || true
fi
# 2) patch configs using literal placeholders with safe delimiter
docker exec "$cname" bash -lc '
set -e
f=/etc/fluent-bit/fluent-bit.conf
o=/etc/fluent-bit/outputs.d/10-es.conf
LCL="\${CLUSTER}"; LRA="\${RACK}"; LHN="\${HOSTNAME}"; EH="\${ES_HOST:-localhost}"; EP="\${ES_PORT:-9200}"
# record_modifier placeholders
if grep -q "Record cluster $LCL" "$f"; then sed -i "s|Record cluster $LCL|Record cluster local|" "$f"; fi
if grep -q "Record rack $LRA" "$f"; then sed -i "s|Record rack $LRA|Record rack dev|" "$f"; fi
if grep -q "Record host $LHN" "$f"; then hn=$(hostname); sed -i "s|Record host $LHN|Record host ${hn}|" "$f"; fi
# outputs placeholders
if [ -f "$o" ] && (grep -q "$EH" "$o" || grep -q "$EP" "$o"); then
sed -i "s|Host $EH|Host es.log.argus.com|g; s|Port $EP|Port 9200|g" "$o"
fi
' >/dev/null 2>&1 || true
# 3) restart fluent-bit (best-effort) and wait
docker exec "$cname" bash -lc 'pkill -x fluent-bit >/dev/null 2>&1 || true; sleep 1; setsid su -s /bin/bash fluent-bit -c "/opt/fluent-bit/bin/fluent-bit --config=/etc/fluent-bit/fluent-bit.conf >> /var/log/fluent-bit.log 2>&1" &>/dev/null & echo ok' >/dev/null 2>&1 || true
for i in {1..10}; do if docker exec "$cname" pgrep -x fluent-bit >/dev/null 2>&1; then return 0; fi; sleep 1; done
echo "[WARN] fluent-bit not confirmed running; log pipeline may not ingest" >&2
}
# ---- Grafana /api/health ----
info "Grafana /api/health"
HEALTH_JSON="$ROOT/tmp/metric-verify/graf_health.json"
mkdir -p "$(dirname "$HEALTH_JSON")"
code=$(curl -fsS -o "$HEALTH_JSON" -w '%{http_code}' --max-time 10 "$GRAF_URL/api/health" || true)
[[ "$code" == 200 ]] || fail "/api/health HTTP $code"
if grep -q '"database"\s*:\s*"ok"' "$HEALTH_JSON"; then ok "grafana health database=ok"; else fail "grafana health not ok: $(cat "$HEALTH_JSON")"; fi
# ---- Grafana datasource points to prom domain ----
info "Grafana datasource URL uses domain: $PROM_DOMAIN"
DS_FILE="/private/argus/metric/grafana/provisioning/datasources/datasources.yml"
if ! docker exec argus-grafana sh -lc "test -f $DS_FILE" >/dev/null 2>&1; then
DS_FILE="/etc/grafana/provisioning/datasources/datasources.yml"
fi
docker exec argus-grafana sh -lc "grep -E 'url:\s*http://$PROM_DOMAIN' '$DS_FILE'" >/dev/null 2>&1 || fail "datasource not pointing to $PROM_DOMAIN"
ok "datasource points to domain"
# ---- DNS resolution inside grafana ----
info "bind resolution inside grafana"
tries=0
until docker exec argus-grafana getent hosts prom.metric.argus.com >/dev/null 2>&1; do
tries=$((tries+1)); (( tries > 24 )) && fail "grafana cannot resolve prom.metric.argus.com"
echo "[..] waiting DNS propagation in grafana ($tries/24)"; sleep 5
done
ok "domain resolves"
# ---- Prometheus activeTargets down check ----
info "Prometheus activeTargets health"
targets_json="$ROOT/tmp/metric-verify/prom_targets.json"
curl -fsS "http://127.0.0.1:${PROM_PORT}/api/v1/targets" -o "$targets_json" || { echo "[WARN] fetch targets failed" >&2; }
down_all=""
if command -v jq >/dev/null 2>&1; then
down_all=$(jq -r '.data.activeTargets[] | select(.health=="down") | .scrapeUrl' "$targets_json" 2>/dev/null || true)
else
down_all=$(grep -o '"scrapeUrl":"[^"]\+"' "$targets_json" | sed 's/"scrapeUrl":"\(.*\)"/\1/' | paste -sd '\n' - | grep -v '^$' || true)
grep -q '"health":"down"' "$targets_json" && [ -z "$down_all" ] && down_all="(one or more targets down)"
fi
# ignore dcgm-exporter(9400) and tolerate node-exporter(9100) in swarm tests
down_filtered=$(echo "$down_all" | grep -Ev ':(9400|9100)/' || true)
if [[ -n "$down_filtered" ]]; then
err "prometheus down targets (filtered):"; echo "$down_filtered" >&2
else
ok "prometheus targets up (ignoring :9100 and :9400)"
fi
# ---- nodes.json sanity: avoid 172.22/16 (gwbridge) ----
nodes_json="$ROOT/private-server/argus/metric/prometheus/nodes.json"
if [[ -f "$nodes_json" ]] && grep -q '"ip"\s*:\s*"172\.22\.' "$nodes_json"; then
fail "nodes.json contains 172.22/16 addresses (gwbridge)"
fi
ok "nodes.json IPs look fine"
echo "[DONE] metric verify"
# ---- Log pipeline smoke test (adapted from sys/tests 07) ----
info "Log pipeline: send logs in node container and assert ES counts"
ES_PORT="${ES_HTTP_PORT:-9200}"
KIBANA_PORT="${KIBANA_PORT:-5601}"
get_count() {
local idx="$1"; local tmp; tmp=$(mktemp)
local code
code=$(curl -s -o "$tmp" -w "%{http_code}" "http://127.0.0.1:${ES_PORT}/${idx}/_count?ignore_unavailable=true&allow_no_indices=true" || true)
if [[ "$code" == "200" ]]; then
local val
val=$(jq -r '(.count // 0) | tonumber? // 0' "$tmp" 2>/dev/null || echo 0)
echo "$val"
else
echo 0
fi
rm -f "$tmp"
}
train0=$(get_count "train-*")
infer0=$(get_count "infer-*")
base=$((train0 + infer0))
info "initial ES counts: train=${train0} infer=${infer0} total=${base}"
send_logs() {
local cname="$1"; local hosttag="$2"
docker exec "$cname" sh -lc 'mkdir -p /logs/train /logs/infer'
docker exec "$cname" sh -lc "ts=\$(date '+%F %T'); echo \"\$ts INFO [$hosttag] training step=1 loss=1.23 model=bert\" >> /logs/train/train-demo.log"
docker exec "$cname" sh -lc "ts=\$(date '+%F %T'); echo \"\$ts INFO [$hosttag] training step=2 loss=1.10 model=bert\" >> /logs/train/train-demo.log"
docker exec "$cname" sh -lc "ts=\$(date '+%F %T'); echo \"\$ts WARN [$hosttag] inference slow on batch=2 latency=1.9s\" >> /logs/infer/infer-demo.log"
}
NODE_CONT="${SWARM_NODE_CNAME:-argus-metric-test-node-swarm}"
ensure_fluentbit "$NODE_CONT"
send_logs "$NODE_CONT" "swarm-node"
info "waiting for ES to ingest..."
curl -s -X POST "http://127.0.0.1:${ES_PORT}/train-*/_refresh" >/dev/null 2>&1 || true
curl -s -X POST "http://127.0.0.1:${ES_PORT}/infer-*/_refresh" >/dev/null 2>&1 || true
final=0; threshold=3
for attempt in {1..60}; do
train1=$(get_count "train-*"); infer1=$(get_count "infer-*"); final=$((train1 + infer1))
if (( final > base && final >= threshold )); then break; fi
echo "[..] waiting ES counts increase to >=${threshold} ($attempt/60) current=${final} base=${base}"; \
curl -s -X POST "http://127.0.0.1:${ES_PORT}/train-*/_refresh" >/dev/null 2>&1 || true; \
curl -s -X POST "http://127.0.0.1:${ES_PORT}/infer-*/_refresh" >/dev/null 2>&1 || true; \
sleep 2
done
info "final ES counts: train=${train1} infer=${infer1} total=${final}"
(( final > base )) || fail "ES total did not increase (${base} -> ${final})"
(( final >= threshold )) || fail "ES total below expected threshold: ${final} < ${threshold}"
es_health=$(curl -s "http://127.0.0.1:${ES_PORT}/_cluster/health" | grep -o '"status":"[^\"]*"' | cut -d'"' -f4)
[[ "$es_health" == green || "$es_health" == yellow ]] || fail "ES health not green/yellow: $es_health"
if ! curl -fs "http://127.0.0.1:${KIBANA_PORT}/api/status" >/dev/null 2>&1; then
echo "[WARN] Kibana status endpoint not available" >&2
fi
ok "log pipeline verified"

View File

@ -0,0 +1,21 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
ENV_FILE="$ROOT/.env"; set -a; source "$ENV_FILE"; set +a
echo "[DOWN] stopping nodes compose"
docker compose -p "${NODES_PROJECT:-argus-swarm-nodes}" -f "$ROOT/docker-compose.nodes.yml" down --remove-orphans || true
echo "[DOWN] stopping server compose"
docker compose -p "${SERVER_PROJECT:-argus-swarm-server}" -f "$ROOT/docker-compose.server.yml" down --remove-orphans || true
echo "[DOWN] removing overlay network"
docker network rm argus-sys-net >/dev/null 2>&1 || true
echo "[DOWN] cleanup temp files"
rm -rf "$ROOT/private-server/tmp" "$ROOT/private-nodes/tmp" 2>/dev/null || true
echo "[DOWN] done"

View File

@ -0,0 +1,5 @@
{
"commit": "5b85c4c2fcf5d32d4f68aaef345c53096359b2f1",
"database": "ok",
"version": "11.1.0"
}

View File

@ -0,0 +1,5 @@
{
"commit": "5b85c4c2fcf5d32d4f68aaef345c53096359b2f1",
"database": "ok",
"version": "11.1.0"
}

View File

@ -0,0 +1 @@
{"status":"success","data":{"activeTargets":[{"discoveredLabels":{"__address__":"10.0.1.13:9400","__meta_filepath":"/private/argus/metric/prometheus/targets/dcgm_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.13","job":"dcgm","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.13","job":"dcgm","node_id":"A1","user_id":"yuyr"},"scrapePool":"dcgm","scrapeUrl":"http://10.0.1.13:9400/metrics","globalUrl":"http://10.0.1.13:9400/metrics","lastError":"","lastScrape":"2025-11-06T16:36:25.585236213+08:00","lastScrapeDuration":0.002520163,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"},{"discoveredLabels":{"__address__":"10.0.1.13:9100","__meta_filepath":"/private/argus/metric/prometheus/targets/node_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.13","job":"node","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.13","job":"node","node_id":"A1","user_id":"yuyr"},"scrapePool":"node","scrapeUrl":"http://10.0.1.13:9100/metrics","globalUrl":"http://10.0.1.13:9100/metrics","lastError":"","lastScrape":"2025-11-06T16:36:33.694723606+08:00","lastScrapeDuration":0.021800606,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"}],"droppedTargets":[],"droppedTargetCounts":{"dcgm":0,"node":0}}}

View File

@ -0,0 +1 @@
{"status":"success","data":{"activeTargets":[{"discoveredLabels":{"__address__":"10.0.1.15:9400","__meta_filepath":"/private/argus/metric/prometheus/targets/dcgm_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.15","job":"dcgm","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.15","job":"dcgm","node_id":"A1","user_id":"yuyr"},"scrapePool":"dcgm","scrapeUrl":"http://10.0.1.15:9400/metrics","globalUrl":"http://10.0.1.15:9400/metrics","lastError":"","lastScrape":"2025-11-06T15:47:37.200098366+08:00","lastScrapeDuration":0.001361528,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"},{"discoveredLabels":{"__address__":"10.0.1.15:9100","__meta_filepath":"/private/argus/metric/prometheus/targets/node_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.15","job":"node","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.15","job":"node","node_id":"A1","user_id":"yuyr"},"scrapePool":"node","scrapeUrl":"http://10.0.1.15:9100/metrics","globalUrl":"http://10.0.1.15:9100/metrics","lastError":"","lastScrape":"2025-11-06T15:47:40.184367879+08:00","lastScrapeDuration":0.02923333,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"}],"droppedTargets":[],"droppedTargetCounts":{"dcgm":0,"node":0}}}

View File

@ -92,6 +92,20 @@ while :; do
WAITED=$((WAITED+1))
done
# Quick upstream reachability snapshot (best-effort; does not block startup)
declare -a _UPSTREAMS=(
"http://web.argus.com:8080/"
"http://grafana.metric.argus.com:3000/api/health"
"http://prom.metric.argus.com:9090/-/ready"
"http://kibana.log.argus.com:5601/api/status"
"http://alertmanager.alert.argus.com:9093/api/v2/status"
"http://master.argus.com:3000/readyz"
)
for u in "${_UPSTREAMS[@]}"; do
code=$(curl -4 -s -o /dev/null -w "%{http_code}" "$u" || echo 000)
echo "[INFO] upstream check: $u -> $code"
done
echo "[INFO] Launching nginx..."
# start nginx in the foreground