Compare commits
10 Commits
38f8ee9301
...
2ff7c55f3b
| Author | SHA1 | Date | |
|---|---|---|---|
| 2ff7c55f3b | |||
| 9858f4471e | |||
| c8279997a4 | |||
| 4ed5c64804 | |||
| 3551360687 | |||
| 3202e02b42 | |||
| 29eb75a374 | |||
| ccc141f557 | |||
| ed0d1ca904 | |||
| b6da5bc8b8 |
1
deployment/.gitignore
vendored
Normal file
1
deployment/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
artifact/
|
||||
16
deployment/build/README.md
Normal file
16
deployment/build/README.md
Normal file
@ -0,0 +1,16 @@
|
||||
# Deployment Build Toolkit
|
||||
|
||||
This folder provides scripts to produce offline server/client packages and publish the client package to FTP.
|
||||
|
||||
Commands
|
||||
- build_server_package.sh [--version YYYYMMDD]
|
||||
- build_client_package.sh [--version YYYYMMDD]
|
||||
- publish_client.sh --version YYYYMMDD --server <host> --user ftpuser --password <pass> [--port 21]
|
||||
|
||||
Outputs
|
||||
- deployment/artifact/server/<YYYYMMDD>/
|
||||
- deployment/artifact/client/<YYYYMMDD>/
|
||||
|
||||
Notes
|
||||
- Server package contains docker images (single all-images.tar.gz), compose/, scripts/, docs/, private/ skeleton.
|
||||
- Client package reuses all-in-one-full artifact, repacked as argus-metric_<YYYYMMDD>.tar.gz (compatible with setup.sh).
|
||||
90
deployment/build/build_client_package.sh
Executable file
90
deployment/build/build_client_package.sh
Executable file
@ -0,0 +1,90 @@
|
||||
#!/usr/bin/env bash
# Build the Argus client offline package from the all-in-one-full plugin
# artifact, then stage publish/setup helpers and end-user docs next to it.
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
BUILD_DIR="$ROOT_DIR/deployment/build"
ART_ROOT="$ROOT_DIR/deployment/artifact"

. "$BUILD_DIR/common.sh"

usage() { cat <<'EOF'
Build Argus Client Offline Package

Usage: build_client_package.sh [--version YYYYMMDD] [--out DIR]

Produces: deployment/artifact/client/<YYYYMMDD>/argus-metric_<YYYYMMDD>.tar.gz
EOF
}

VERSION="$(today_version)"
OUT_DIR=""
while [[ $# -gt 0 ]]; do
  case "$1" in
    --version) VERSION="$2"; shift 2;;
    --out) OUT_DIR="$2"; shift 2;;
    -h|--help) usage; exit 0;;
    *) err "unknown arg: $1"; usage; exit 1;;
  esac
done

PKG_DIR="${OUT_DIR:-$ART_ROOT/client/$VERSION}"
make_dir "$PKG_DIR"

log "Packaging client from all-in-one-full artifact"
PLUGIN_DIR="$ROOT_DIR/src/metric/client-plugins/all-in-one-full"
require_cmd bash tar gzip

(cd "$PLUGIN_DIR" && bash scripts/package_artifact.sh --force)

# pick latest artifact dir (newest mtime first)
ART_BASE="$PLUGIN_DIR/artifact"
latest_dir=$(ls -1dt "$ART_BASE"/*/ 2>/dev/null | head -n1 || true)
[[ -n "$latest_dir" ]] || { err "no client artifact found in $ART_BASE"; exit 1; }

tmpdir=$(mktemp -d)
trap 'rm -rf "$tmpdir"' EXIT
# Filter-only copy: keep install_order files + scripts + deps + version.json
mkdir -p "$tmpdir/src"
cp -f "$latest_dir/version.json" "$tmpdir/src/version.json"
if command -v jq >/dev/null 2>&1; then
  mapfile -t files < <(jq -r '.install_order[]' "$latest_dir/version.json")
else
  # Fallback without jq: scrape "*.tar.gz" entries from the install_order
  # section. Read line-by-line instead of unquoted word-splitting so the
  # list is robust even if entries ever contain spaces or glob chars.
  files=()
  while IFS= read -r entry; do
    [[ -n "$entry" ]] && files+=("$entry")
  done < <(grep -E '"install_order"' -A10 "$latest_dir/version.json" | sed -n 's/\s*"\(.*\.tar\.gz\)".*/\1/p')
fi
for f in "${files[@]}"; do
  # best-effort copy: entries missing from the artifact dir are skipped
  [[ -f "$latest_dir/$f" ]] && cp -f "$latest_dir/$f" "$tmpdir/src/$f"
done
for aux in install.sh uninstall.sh check_health.sh check_version.sh sync_dns.sh restart_unhealthy.sh config.env; do
  [[ -f "$latest_dir/$aux" ]] && cp -f "$latest_dir/$aux" "$tmpdir/src/$aux"
done
if [[ -d "$latest_dir/deps" ]]; then
  mkdir -p "$tmpdir/src/deps" && rsync -a "$latest_dir/deps/" "$tmpdir/src/deps/" >/dev/null 2>&1 || cp -r "$latest_dir/deps/." "$tmpdir/src/deps/"
fi

# dots in VERSION become underscores so the name stays argus-metric_<ver>.tar.gz
out_name="argus-metric_$(echo "$VERSION" | sed 's/\./_/g').tar.gz"

(cd "$tmpdir/src" && tar -czf "$PKG_DIR/$out_name" .)

log "Client package ready: $PKG_DIR/$out_name"
echo "$VERSION" > "$PKG_DIR/LATEST_VERSION"

# include publish helper and setup.sh for convenience (place first)
PUBLISH_TPL="$BUILD_DIR/templates/client/publish.sh"
if [[ -f "$PUBLISH_TPL" ]]; then
  cp "$PUBLISH_TPL" "$PKG_DIR/publish.sh"
fi

# also place a copy of setup.sh alongside
SETUP_SRC="$ROOT_DIR/src/metric/client-plugins/all-in-one-full/scripts/setup.sh"
[[ -f "$SETUP_SRC" ]] && cp "$SETUP_SRC" "$PKG_DIR/setup.sh" || true

# docs for end users (this may overwrite file modes), then fix execute bits
CLIENT_DOC_DIR="$BUILD_DIR/templates/client"
if [[ -d "$CLIENT_DOC_DIR" ]]; then
  rsync -a "$CLIENT_DOC_DIR/" "$PKG_DIR/" >/dev/null 2>&1 || cp -r "$CLIENT_DOC_DIR/." "$PKG_DIR/"
fi

# ensure helpers are executable (template copy above may clobber modes)
chmod +x "$PKG_DIR/publish.sh" "$PKG_DIR/setup.sh" 2>/dev/null || true

exit 0
|
||||
138
deployment/build/build_server_package.sh
Executable file
138
deployment/build/build_server_package.sh
Executable file
@ -0,0 +1,138 @@
|
||||
#!/usr/bin/env bash
# Build the Argus server offline package: filtered docker-compose file,
# docker images tarball, operator scripts/docs, manifests, and finally a
# distributable server_<VERSION>.tar.gz next to the package directory.
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
BUILD_DIR="$ROOT_DIR/deployment/build"
ART_ROOT="$ROOT_DIR/deployment/artifact"

. "$BUILD_DIR/common.sh"

usage() { cat <<'EOF'
Build Argus Server Offline Package

Usage: build_server_package.sh [--version YYYYMMDD] [--out DIR] [--resave-image]

Outputs into deployment/artifact/server/<YYYYMMDD>/ by default.
EOF
}

VERSION="$(today_version)"
OUT_DIR=""
RESAVE_IMAGE=false
while [[ $# -gt 0 ]]; do
  case "$1" in
    --version) VERSION="$2"; shift 2;;
    --out) OUT_DIR="$2"; shift 2;;
    --resave-image) RESAVE_IMAGE=true; shift;;
    -h|--help) usage; exit 0;;
    *) err "unknown arg: $1"; usage; exit 1;;
  esac
done

PKG_DIR="${OUT_DIR:-$ART_ROOT/server/$VERSION}"
STAGE="$(mktemp -d)"
trap 'rm -rf "$STAGE"' EXIT

log "Version: $VERSION"
log "Staging: $STAGE"

# 1) Layout
make_dir "$STAGE/images"
make_dir "$STAGE/compose"
make_dir "$STAGE/scripts"
make_dir "$STAGE/docs"
make_dir "$STAGE/private/argus"

# 2) Compose: derive from sys/tests by removing test-only services
SRC_COMPOSE="$ROOT_DIR/src/sys/tests/docker-compose.yml"
[[ -f "$SRC_COMPOSE" ]] || { err "missing $SRC_COMPOSE"; exit 1; }
# 2.1 filter out test services
tmp_compose1="$STAGE/compose/docker-compose.filtered.yml"
awk -f "$BUILD_DIR/templates/docker-compose.filter.awk" -v remove="node-a,node-b,test-node,test-gpu-node" "$SRC_COMPOSE" > "$tmp_compose1"
# 2.2 transform to external overlay network (remove sysnet and per-service blocks)
awk -f "$BUILD_DIR/templates/docker-compose.overlay.awk" "$tmp_compose1" > "$STAGE/compose/docker-compose.yml"
cp "$BUILD_DIR/templates/.env.example" "$STAGE/compose/.env.example"
# fix relative private path to match package layout (compose/ and private/ are siblings)
sed -i 's#\./private/#../private/#g' "$STAGE/compose/docker-compose.yml"
# also handle bind mount form without trailing slash
sed -i -E 's#- \./private:/private#- ../private:/private#g' "$STAGE/compose/docker-compose.yml"
# drop timezone file bind which may not exist on target distros (e.g. NixOS)
sed -i '/\/etc\/timezone:\/etc\/timezone:ro/d' "$STAGE/compose/docker-compose.yml"

# sanity-check: ensure test services are absent and external network present
if grep -E '^(\s{2})(node-a|node-b|test-node|test-gpu-node):\s*$' -n "$STAGE/compose/docker-compose.yml" >/dev/null; then
  err "compose filter failed: test services still present"; exit 1;
fi
if ! grep -q '^networks:' "$STAGE/compose/docker-compose.yml" || ! grep -q 'argus-sys-net:' "$STAGE/compose/docker-compose.yml"; then
  err "compose overlay transform failed: external network missing"; exit 1;
fi

# 3) Images (reuse if already exported unless --resave-image)
existing_images_tar="$PKG_DIR/images/all-images.tar.gz"
if [[ "$RESAVE_IMAGE" == false && -f "$existing_images_tar" ]]; then
  log "Reusing existing images tar: $existing_images_tar"
  cp "$existing_images_tar" "$STAGE/images/"
elif [[ "$RESAVE_IMAGE" == false ]]; then
  # Try cross-version reuse from latest server_*.tar.gz
  latest_pkg=$(ls -1t "$ART_ROOT/server"/server_*.tar.gz 2>/dev/null | head -n1 || true)
  if [[ -n "$latest_pkg" ]]; then
    log "Reusing images from: $latest_pkg"
    mkdir -p "$STAGE/images"
    # extract matching file regardless of top-level dir
    if tar -xzf "$latest_pkg" -C "$STAGE" --wildcards '*/images/all-images.tar.gz' 2>/dev/null; then
      # locate and move
      found=$(find "$STAGE" -type f -path '*/images/all-images.tar.gz' | head -n1 || true)
      if [[ -n "$found" ]]; then
        mv "$found" "$STAGE/images/all-images.tar.gz"
        # cleanup leftover extracted top-level dir. Guard so we never
        # rm -rf the staging root itself: if the tarball had no top-level
        # directory, dirname(found) is "$STAGE/images" and stripping
        # "/images" would yield "$STAGE".
        dir_to_clean=$(dirname "$found")
        leftover="${dir_to_clean%/images}"
        if [[ -n "$leftover" && "$leftover" != "$STAGE" && "$leftover" != "$dir_to_clean" ]]; then
          rm -rf "$leftover" 2>/dev/null || true
        fi
      fi
    fi
  fi
fi

# If still not present, save from local docker daemon
if [[ ! -f "$STAGE/images/all-images.tar.gz" ]]; then
  require_cmd docker gzip
  images=(
    argus-bind9:latest
    argus-master:latest
    argus-elasticsearch:latest
    argus-kibana:latest
    argus-metric-ftp:latest
    argus-metric-prometheus:latest
    argus-metric-grafana:latest
    argus-alertmanager:latest
    argus-web-frontend:latest
    argus-web-proxy:latest
  )
  log "Saving images: ${#images[@]}"
  tarfile="$STAGE/images/all-images.tar"
  docker save -o "$tarfile" "${images[@]}"
  gzip -f "$tarfile"
fi

# 4) Scripts & Docs
copy_tree "$BUILD_DIR/templates/scripts" "$STAGE/scripts"
copy_tree "$BUILD_DIR/templates/docs" "$STAGE/docs"

# 5) Manifests
gen_manifest "$STAGE" "$STAGE/manifest.txt"
checksum_dir "$STAGE" "$STAGE/checksums.txt"

# 6) Move to artifact
make_dir "$PKG_DIR"
rsync -a "$STAGE/" "$PKG_DIR/" 2>/dev/null || cp -r "$STAGE/." "$PKG_DIR/"
log "Server package ready: $PKG_DIR"

# NOTE(review): despite the .json extension this file holds a bare version
# string, not JSON; downstream tooling appears to expect plain text — confirm
# before renaming or switching to real JSON.
echo "$VERSION" > "$PKG_DIR/version.json"

# 7) Create distributable tarball
OUT_TAR_DIR="$(dirname "$PKG_DIR")"
OUT_TAR="$OUT_TAR_DIR/server_${VERSION}.tar.gz"
log "Creating tarball: $OUT_TAR"
(cd "$PKG_DIR/.." && tar -czf "$OUT_TAR" "$(basename "$PKG_DIR")")
log "Tarball ready: $OUT_TAR"

exit 0
|
||||
33
deployment/build/common.sh
Executable file
33
deployment/build/common.sh
Executable file
@ -0,0 +1,33 @@
|
||||
#!/usr/bin/env bash
# Shared helpers for the deployment build scripts: logging, command checks,
# default versioning, checksums, manifests, directory utilities.
# Sourced (not executed) by build_server_package.sh / build_client_package.sh.
set -euo pipefail

# Colored log helpers; err() writes to stderr so pipelines stay clean.
log() { echo -e "\033[0;34m[INFO]\033[0m $*"; }
warn() { echo -e "\033[1;33m[WARN]\033[0m $*"; }
err() { echo -e "\033[0;31m[ERR ]\033[0m $*" >&2; }

# require_cmd CMD... — exit 1 unless every named command is on PATH.
require_cmd() {
  for c in "$@"; do
    command -v "$c" >/dev/null 2>&1 || { err "missing command: $c"; exit 1; }
  done
}

# today_version — default package version: today's date as YYYYMMDD.
today_version() {
  date +%Y%m%d
}

# checksum_dir DIR OUT — write sha256 sums of every file under DIR into OUT,
# with paths relative to DIR and sorted for reproducible output.
checksum_dir() {
  local dir="$1"; local out="$2"; : > "$out";
  (cd "$dir" && find . -type f -print0 | sort -z | xargs -0 sha256sum) >> "$out"
}

make_dir() { mkdir -p "$1"; }

# copy_tree SRC DST — mirror SRC into DST. Prefers rsync --delete; falls back
# to cp -r when rsync is unavailable. cp (unlike rsync) cannot create a
# missing destination directory and cannot delete stale files, so DST is
# created up front; the --delete semantics are best-effort only.
copy_tree() {
  local src="$1" dst="$2"
  mkdir -p "$dst"
  rsync -a --delete "$src/" "$dst/" 2>/dev/null || cp -r "$src/." "$dst/";
}

# gen_manifest ROOT OUT — list files (depth <= 3) under ROOT into OUT, sorted.
# NOTE: find -printf is a GNU extension; this toolkit targets GNU userland.
gen_manifest() {
  local root="$1"; local out="$2"; : > "$out";
  (cd "$root" && find . -maxdepth 3 -type f -printf "%p\n" | sort) >> "$out"
}
|
||||
|
||||
32
deployment/build/templates/.env.example
Normal file
32
deployment/build/templates/.env.example
Normal file
@ -0,0 +1,32 @@
|
||||
# UID/GID for service processes (owner of bind-mounted data under private/)
ARGUS_BUILD_UID=1000
ARGUS_BUILD_GID=1000

# Host ports (adjust if occupied on the deployment host)
MASTER_PORT=32300
ES_HTTP_PORT=9200
KIBANA_PORT=5601
NODE_A_PORT=2020
NODE_B_PORT=2021
PROMETHEUS_PORT=9090
GRAFANA_PORT=3000
ALERTMANAGER_PORT=9093
WEB_PROXY_PORT_8080=8080
WEB_PROXY_PORT_8081=8081
WEB_PROXY_PORT_8082=8082
WEB_PROXY_PORT_8083=8083
WEB_PROXY_PORT_8084=8084
WEB_PROXY_PORT_8085=8085

# FTP (control, data, and passive-mode port range — open all on firewalls)
FTP_PORT=21
FTP_DATA_PORT=20
FTP_PASSIVE_HOST_RANGE=21100-21110
# SECURITY: example credential only — change before any real deployment.
FTP_PASSWORD=ZGClab1234!
FTP_DOMAIN=ftp.metric.argus.com

# GPU profile disabled by default
ENABLE_GPU=false

# External overlay network (Swarm attachable); must exist before compose up
OVERLAY_NET_NAME=argus-sys-net
|
||||
44
deployment/build/templates/client/INSTALL_CLIENT_zh.md
Normal file
44
deployment/build/templates/client/INSTALL_CLIENT_zh.md
Normal file
@ -0,0 +1,44 @@
|
||||
# Argus Metric 客户端安装指南(容器内普通用户场景)
|
||||
|
||||
## 准备与连通性检查
|
||||
- FTP 连接(需要账号密码,默认 `ftpuser / ZGClab1234!`)
|
||||
- `curl -u ftpuser:ZGClab1234! -I ftp://<FTP_IP>:21/LATEST_VERSION`
|
||||
- `curl -u ftpuser:ZGClab1234! -s ftp://<FTP_IP>:21/ | head`
|
||||
- 下载安装脚本
|
||||
- `curl -u ftpuser:ZGClab1234! -fsSL ftp://<FTP_IP>:21/setup.sh -o /tmp/setup.sh`
|
||||
- `chmod +x /tmp/setup.sh`
|
||||
|
||||
## 元数据与主机名
|
||||
- Agent 需要元数据(env/user/instance)与 Master 地址:
|
||||
- 方式A:hostname 形如 `env-user-instance-xxx`(推荐)
|
||||
- 方式B:导出环境变量:
|
||||
- `export AGENT_ENV=dev`
|
||||
- `export AGENT_USER=<your_user>`
|
||||
- `export AGENT_INSTANCE=<node_id>`
|
||||
- Master 地址:
|
||||
- `export MASTER_ENDPOINT=http://master.argus.com:3000`
|
||||
|
||||
> 安装脚本会在开头检查上述条件,缺失时会终止并给出修复提示。
|
||||
|
||||
## 执行安装
|
||||
- 以 root 运行(容器内如为非 root 用户请切换为 root):
|
||||
- `sudo /tmp/setup.sh --server <FTP_IP> --user ftpuser --password 'ZGClab1234!' --port 21`
|
||||
- 如需自定义安装根目录:`--install-dir /opt/argus-metric`
|
||||
|
||||
提示(容器接入 overlay 网络时):
|
||||
- 在执行 setup 前,先将容器内 DNS 指向 Bind9 的 overlay IP:
|
||||
- `echo "nameserver <BIND_OVERLAY_IP>" > /etc/resolv.conf`
|
||||
- 这样 `master.argus.com`、`es.log.argus.com` 等域名即可解析;首次下载 `setup.sh` 仍建议使用 FTP 的 overlay IP。
|
||||
|
||||
更多快速步骤请参考:`QUICK_NODE_DEPLOY_zh.md`。
|
||||
|
||||
## 安装后自检(setup 自动执行)
|
||||
- setup 会等待最多 5 分钟,确认以下条件后才报告完成:
|
||||
- `/private/argus/agent/<hostname>/node.json` 已生成;
|
||||
- `last_report` 在持续更新;
|
||||
- `health.metric-argus-agent|metric-node-exporter|metric-fluent-bit|metric-dcgm-exporter` 均为 `healthy` 且 `error` 为空。
|
||||
|
||||
## 手工验证(可选)
|
||||
- `cat /private/argus/agent/$(hostname)/node.json | jq '.'`
|
||||
- `curl -s -o /dev/null -w "%{http_code}\n" http://localhost:9100/metrics` 应为 200
|
||||
- 查看日志:`/var/log/argus-agent.log`、`/opt/argus-metric/versions/*/.install.log`
|
||||
57
deployment/build/templates/client/PUBLISH_CLIENT_zh.md
Normal file
57
deployment/build/templates/client/PUBLISH_CLIENT_zh.md
Normal file
@ -0,0 +1,57 @@
|
||||
# Argus Metric 客户端发布说明(FTP)
|
||||
|
||||
本说明面向“发布人员”,讲清楚如何把客户端离线包发布到 FTP,供各节点通过 `curl` 自动安装。
|
||||
|
||||
## 目录结构(构建后)
|
||||
- `client-YYYYMMDD/`
|
||||
- `argus-metric_YYYYMMDD.tar.gz` 客户端离线包
|
||||
- `setup.sh` 客户端安装入口脚本(提供给节点用 curl 下载)
|
||||
- `publish.sh` 发布脚本(将上述两项与 `LATEST_VERSION` 上传到 FTP)
|
||||
- `LATEST_VERSION` 文本(内容为 `YYYYMMDD`,或 `YYYYMMDD-rN`)
|
||||
- `INSTALL_CLIENT_zh.md` 本地安装指南(给使用者看,不会上载到 FTP)
|
||||
- `PUBLISH_CLIENT_zh.md` 本说明
|
||||
|
||||
> 注意:`publish.sh`/`setup.sh` 为可执行脚本;构建脚本已保证二者具有执行权限。
|
||||
|
||||
## 前置条件
|
||||
- FTP 服务已运行(默认容器:`argus-ftp`),并打开端口:21、20、21100–21110(被动模式)。
|
||||
- FTP 账号:默认 `ftpuser / ZGClab1234!`(如有更改,以实际为准)。
|
||||
|
||||
## 发布步骤(在 server 机器或能直连 FTP 的任意机器上)
|
||||
1) 进入发布目录:
|
||||
- `cd client-YYYYMMDD`
|
||||
|
||||
2) 执行发布:
|
||||
- `./publish.sh --server <FTP_HOST> --user <USER> --password '<PASS>' [--port 21]`
|
||||
- 例如在服务端本机:`./publish.sh --server localhost --user ftpuser --password 'ZGClab1234!' --port 21`
|
||||
|
||||
脚本会上传三类文件到 FTP 根:
|
||||
- `setup.sh`
|
||||
- `argus-metric_YYYYMMDD[-rN].tar.gz`
|
||||
- `LATEST_VERSION`(内容为当前版本号)
|
||||
|
||||
3) 发布后验证:
|
||||
- `curl -u ftpuser:****** -I ftp://<FTP_HOST>:21/LATEST_VERSION` 应返回 200
|
||||
- `curl -u ftpuser:****** -fsSL ftp://<FTP_HOST>:21/LATEST_VERSION` 内容为版本号(如 `20251104`)
|
||||
- `curl -u ftpuser:****** -I ftp://<FTP_HOST>:21/argus-metric_YYYYMMDD.tar.gz` 返回 200
|
||||
|
||||
## 节点侧使用方式(摘要)
|
||||
- 首次下载用 FTP 的“IP 地址”:
|
||||
- `curl -u ftpuser:****** -fsSL ftp://<FTP_IP>:21/setup.sh -o /tmp/setup.sh && chmod +x /tmp/setup.sh`
|
||||
- 执行安装:
|
||||
- 必需元数据:`AGENT_ENV/AGENT_USER/AGENT_INSTANCE`,以及 `MASTER_ENDPOINT=http://master.argus.com:3000`
|
||||
- `sudo /tmp/setup.sh --server <FTP_IP> --user ftpuser --password '******' --port 21`
|
||||
- overlay 容器场景:
|
||||
- 先将容器内 DNS 指向 Bind9 的 overlay IP:`echo "nameserver <BIND_OVERLAY_IP>" > /etc/resolv.conf`
|
||||
- 然后再执行上述安装;安装后约 1–2 分钟内 DNS 即可解析 `*.argus.com` 域名。
|
||||
|
||||
## 常见问题
|
||||
- `530 Access denied`:用户名/密码错误或 FTP 目录无权限;请核对账号与 FTP 容器状态。
|
||||
- `Permission denied` 执行 `publish.sh`:为脚本权限问题;`chmod +x publish.sh`。构建脚本已修复默认权限。
|
||||
- 被动端口不通导致失败:请开放 21100–21110。
|
||||
- 客户端安装后短时 `curl http://master.argus.com:3000` 为 000:服务冷启动或 DNS 同步延迟,等待 1–2 分钟再试。
|
||||
|
||||
## 版本与回滚
|
||||
- `LATEST_VERSION` 决定客户端默认安装的版本号。
|
||||
- 如需回滚:将旧版本号写回 `LATEST_VERSION` 并重新发布(或手动指定 `--version` 安装)。
|
||||
|
||||
58
deployment/build/templates/client/QUICK_NODE_DEPLOY_zh.md
Normal file
58
deployment/build/templates/client/QUICK_NODE_DEPLOY_zh.md
Normal file
@ -0,0 +1,58 @@
|
||||
# Argus Metric 节点快速部署(Overlay 网络容器)
|
||||
|
||||
本文档给出在 Docker Swarm external overlay 网络中,快速拉起一个测试节点并完成注册的最小可行步骤。
|
||||
|
||||
## 前提
|
||||
- 服务端已在 Manager 机安装完成并运行良好(`server-selfcheck` 通过)。
|
||||
- Overlay 网络名称:`argus-sys-net`(默认)。
|
||||
- 已通过 FTP 发布 `setup.sh` 与客户端包,且能从 FTP 获取 `LATEST_VERSION`。
|
||||
- 用于测试的镜像:`argus-sys-metric-test-node:latest` 已存在于目标机器。
|
||||
|
||||
## 步骤
|
||||
|
||||
- 获取 FTP 和 Bind 的 overlay IP(在 Manager 上执行)
|
||||
- `FTPIP=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' argus-ftp)`
|
||||
- `BINDIP=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' argus-bind-sys)`
|
||||
- `echo "FTP=$FTPIP BIND=$BINDIP"`
|
||||
|
||||
- 准备宿主挂载目录(以 s4 为例)
|
||||
- `mkdir -p /home2/yuyr/deploy/test-metric-node/s4`
|
||||
|
||||
- 启动测试节点容器(接入 overlay)
|
||||
- `docker run -d --name argus-metric-test-node-s4 \
|
||||
--hostname dev2-yuyr-node002s4 \
|
||||
--network argus-sys-net \
|
||||
-v /home2/yuyr/deploy/test-metric-node/s4:/private/argus/agent \
|
||||
argus-sys-metric-test-node:latest sleep infinity`
|
||||
|
||||
- 在容器内执行安装(先用 FTP IP 引导,DNS 指向 Bind)
|
||||
- `docker exec -it argus-metric-test-node-s4 bash`
|
||||
- `echo "nameserver $BINDIP" > /etc/resolv.conf`
|
||||
- `curl --ftp-method nocwd -u ftpuser:ZGClab1234! -fsSL "ftp://$FTPIP:21/setup.sh" -o /tmp/setup.sh`
|
||||
- `chmod +x /tmp/setup.sh`
|
||||
- `export AGENT_ENV=dev2 AGENT_USER=yuyr AGENT_INSTANCE=node002s4`
|
||||
- `export MASTER_ENDPOINT=http://master.argus.com:3000`
|
||||
- `/tmp/setup.sh --server "$FTPIP" --user ftpuser --password 'ZGClab1234!' --port 21`
|
||||
- 说明:setup 会自动执行安装后自检(最多 5 分钟),无需手动轮询。
|
||||
|
||||
## 验证(推荐在容器内执行,避免宿主权限问题)
|
||||
|
||||
- 查看 node.json 关键字段
|
||||
- `cat /private/argus/agent/dev2-yuyr-node002s4/node.json | jq '{last_report, health}'`
|
||||
- 期望:四个 health 全部 healthy;等待 ≥70s 再查看,`last_report` 持续更新。
|
||||
|
||||
- 指标端口
|
||||
- `curl -s -o /dev/null -w '%{http_code}\n' http://localhost:9100/metrics`(期望 200)
|
||||
- (如测试 GPU)`curl -s -o /dev/null -w '%{http_code}\n' http://localhost:9400/metrics`(有 GPU 时 200)
|
||||
|
||||
- 与服务端连通(域名经 Bind 解析)
|
||||
- `curl -s -o /dev/null -w '%{http_code}\n' http://master.argus.com:3000/readyz`(期望 200)
|
||||
- `curl -s -o /dev/null -w '%{http_code}\n' http://es.log.argus.com:9200/_cluster/health`(期望 200)
|
||||
|
||||
## (可选)在服务器主机侧观察 Prometheus 目标更新
|
||||
- `cat /home2/yuyr/deploy/versions/<VERSION>/private/argus/metric/prometheus/nodes.json | jq '.'`
|
||||
|
||||
## 常见提示
|
||||
- 初次安装后短时 `curl` 域名返回 000/超时属正常,多等待 1–2 分钟 DNS 同步/组件冷启动完成。
|
||||
- 如在宿主直接读取挂载的 node.json 报 Permission denied,请使用 `docker exec` 在容器内查看。
|
||||
- MASTER_ENDPOINT 固定使用域名 `http://master.argus.com:3000`,客户端无需固定 IP。
|
||||
54
deployment/build/templates/client/publish.sh
Normal file
54
deployment/build/templates/client/publish.sh
Normal file
@ -0,0 +1,54 @@
|
||||
#!/usr/bin/env bash
# Upload the built Argus client package (setup.sh, argus-metric_<ver>.tar.gz,
# LATEST_VERSION) to the FTP root. Must be run from inside the built client
# artifact directory produced by build_client_package.sh.
set -euo pipefail

usage() { cat <<'EOF'
Publish Argus client package to FTP

Usage:
  ./publish.sh --server HOST --user USER --password PASS [--port 21]

Notes:
  - This script expects to run inside the built client artifact directory.
  - It reads LATEST_VERSION and uploads setup.sh, argus-metric_<ver>.tar.gz, and LATEST_VERSION.
EOF
}

HOST=""; USERNAME=""; PASSWORD=""; PORT=21
while [[ $# -gt 0 ]]; do
  case "$1" in
    --server) HOST="$2"; shift 2;;
    --user) USERNAME="$2"; shift 2;;
    --password) PASSWORD="$2"; shift 2;;
    --port) PORT="$2"; shift 2;;
    -h|--help) usage; exit 0;;
    *) echo "unknown arg: $1" >&2; usage; exit 1;;
  esac
done

[[ -n "$HOST" && -n "$USERNAME" && -n "$PASSWORD" ]] || { usage; exit 1; }

here="$(pwd)"
if [[ ! -f "$here/LATEST_VERSION" ]]; then
  echo "LATEST_VERSION not found in $here" >&2; exit 1;
fi
# read the version string, stripping newlines (no useless cat)
VER=$(tr -d '\n' < "$here/LATEST_VERSION")
PKG="argus-metric_${VER}.tar.gz"

if [[ ! -f "$here/$PKG" ]]; then
  echo "client tar not found: $PKG" >&2; exit 1
fi

# locate setup.sh (prefer colocated, fallback to bundled path if provided)
SETUP="${here}/setup.sh"
if [[ ! -f "$SETUP" ]]; then
  echo "setup.sh not found in $here" >&2; exit 1
fi

echo "[PUBLISH] server=$HOST port=$PORT version=$VER"

# -T uploads a file; -f fails on FTP errors. LATEST_VERSION is uploaded
# last so clients never see a version whose tarball is not yet in place.
curl -u "$USERNAME:$PASSWORD" -sfT "$SETUP" "ftp://$HOST:$PORT/setup.sh"
curl -u "$USERNAME:$PASSWORD" -sfT "$PKG" "ftp://$HOST:$PORT/$PKG"
printf "%s" "$VER" | curl -u "$USERNAME:$PASSWORD" -sfT - "ftp://$HOST:$PORT/LATEST_VERSION"

echo "[OK] publish completed"
|
||||
|
||||
41
deployment/build/templates/docker-compose.filter.awk
Normal file
41
deployment/build/templates/docker-compose.filter.awk
Normal file
@ -0,0 +1,41 @@
|
||||
#!/usr/bin/awk -f
# Remove specific service blocks from a docker-compose.yml by service name.
# Usage: awk -f docker-compose.filter.awk -v remove="node-a,node-b,test-node,test-gpu-node" input.yml > output.yml
# Portable POSIX awk: avoids gawk-only three-argument match() and the \s
# regex escape, so it also works under mawk / BSD awk.

BEGIN{
  split(remove, rm, ",");
  for(i in rm){
    gsub(/^[ \t]+|[ \t]+$/, "", rm[i]);   # trim surrounding whitespace
    if (rm[i] != "") skipname[rm[i]] = 1;
  }
  in_services=0; skipping=0;
}

# Return the service name when the line is exactly two leading spaces
# followed by "name:" (a compose service header), else "".
function service_header(line,    s) {
  if (line ~ /^  [A-Za-z0-9_-]+:[ ]*$/) {
    s = line;
    sub(/^  /, "", s);
    sub(/:[ ]*$/, "", s);
    return s;
  }
  return "";
}

{
  # Track top-level sections (no indentation)
  if ($0 ~ /^[A-Za-z0-9_-]+:[ ]*$/ && $0 !~ /^[ \t]/) {
    in_services = ($0 ~ /^services:[ ]*$/) ? 1 : 0;
  }

  if (skipping) {
    # Stop skipping at the next service header or another top-level section
    if (service_header($0) != "" || ($0 ~ /^(networks|volumes):[ ]*$/ && $0 !~ /^[ \t]/)) {
      skipping=0;
    } else {
      next;
    }
  }

  if (in_services) {
    name = service_header($0);
    if (name != "" && (name in skipname)) { skipping=1; next; }
  }

  print;
}
|
||||
74
deployment/build/templates/docker-compose.overlay.awk
Normal file
74
deployment/build/templates/docker-compose.overlay.awk
Normal file
@ -0,0 +1,74 @@
|
||||
#!/usr/bin/awk -f
# Transform docker-compose.yml to use an external overlay network for all services
# - Remove top-level networks definition
# - Remove per-service networks block (including ipv4_address and sysnet refs)
# - Insert per-service networks: [argus-sys-net]
# - Append external networks mapping at the end
# Portable POSIX awk: avoids gawk-only three-argument match() and \s escapes
# so the /usr/bin/awk shebang works under mawk / BSD awk too.

BEGIN{
  in_top_networks=0; in_services=0; in_service=0; svc_indent=0; curr_name="";
}

function is_service_header(line){ return svc_name(line)!=""; }

# Extract "name" from a two-space-indented "  name:" service header, else "".
function svc_name(line,    s){
  if (line ~ /^  [A-Za-z0-9_-]+:[ ]*$/) {
    s = line;
    sub(/^  /, "", s);
    sub(/:[ ]*$/, "", s);
    return s;
  }
  return "";
}

# Count of leading spaces in s (0 for empty or all-space lines).
function indent_len(s, n){ n=match(s,/[^ ]/)-1; if(n<0) n=0; return n; }

{
  # Detect entry into top-level sections
  if ($0 ~ /^[A-Za-z0-9_-]+:[ ]*$/ && $0 !~ /^[ \t]/) {
    in_services = ($0 ~ /^services:[ ]*$/);
    # If a new top-level section starts, stop skipping top networks
    in_top_networks = 0;
  }

  # Handle removal of initial top-level 'networks:' block
  if ($0 ~ /^networks:[ ]*$/ && $0 !~ /^[ \t]/) {
    in_top_networks = 1; next;
  }
  if (in_top_networks) {
    # skip until next top-level section (non-indented key)
    next;
  }

  if (in_services) {
    # Track service boundaries
    if (is_service_header($0)) {
      in_service=1; svc_indent=2; networks_inserted=0; curr_name=svc_name($0); print; next;
    }
    if (in_service) {
      # If line is indented <= service indent, we've left this service
      if (indent_len($0) <= svc_indent && $0 !~ /^[ \t]*$/) {
        in_service=0;
      }
    }

    if (in_service) {
      # Skip any existing networks block under the service
      if ($0 ~ /^    networks:[ ]*$/) { skipping_nets=1; next; }
      if (skipping_nets) {
        if (indent_len($0) <= 4) { skipping_nets=0; }
        else next;
      }

      # After container_name or image, inject networks once
      if (!networks_inserted && ($0 ~ /^    container_name:/ || $0 ~ /^    image:/)) {
        print;
        print "    networks:";
        print "      - argus-sys-net";
        networks_inserted=1; next;
      }
      # no host port injection; bind serves DNS inside overlay only
    }
  }

  print;
}

END{
  print "";
  print "networks:";
  print "  argus-sys-net:";
  print "    external: true";
  print "    name: ${OVERLAY_NET_NAME:-argus-sys-net}";
}
|
||||
50
deployment/build/templates/docs/INSTALL_SERVER.md
Normal file
50
deployment/build/templates/docs/INSTALL_SERVER.md
Normal file
@ -0,0 +1,50 @@
|
||||
# Argus Server Offline Installation
|
||||
|
||||
## Prerequisites
|
||||
- Linux x86_64 (Ubuntu 22.04 recommended; see OS compatibility for NixOS)
|
||||
- Docker & Docker Compose installed
|
||||
- Open ports: 32300, 9200, 5601, 9090, 9093, 8080..8085, 21, 20, 21100–21110 (or auto-fallback to high ports)
|
||||
|
||||
## Quick Start
|
||||
1. Extract to a target dir, e.g. `/opt/argus-deploy/versions/<YYYYMMDD>`
|
||||
2. `./server-install.sh` (non‑root is supported: it will precreate minimal dirs and auto-fix Kibana/ES/Bind in containers)
|
||||
3. `./server-status.sh`
|
||||
4. `./server-selfcheck.sh` (on failure it auto-runs diagnose and writes logs under `logs/`)
|
||||
5. `./server-uninstall.sh` to tear down
|
||||
|
||||
## What the Installer Does
|
||||
- Loads local images (`images/all-images.tar.gz`)
|
||||
- Generates OS-compat override (`security_opt: ["label=disable"]`, `userns_mode: host`, bind `tmpfs:/run/named`)
|
||||
- Starts server-only services: bind/master/es/kibana/ftp/prometheus/grafana/alertmanager/web-frontend/web-proxy
|
||||
- DNS Bootstrap:
|
||||
- Ensure `/private/argus/etc/dns.conf` exists (write `172.31.0.2` if missing);
|
||||
- Run `/private/argus/etc/update-dns.sh` in dependent containers so `/etc/resolv.conf` points to bind;
|
||||
- Wait for `*.argus.com` hint files, then reload bind;
|
||||
- Restart web‑proxy to re-render nginx resolver from `dns.conf`;
|
||||
- Writes `logs/selfcheck.json` as final summary
|
||||
|
||||
## OS Compatibility
|
||||
- NixOS / non-xattr FS: containers run with `security_opt: ["label=disable"]` and `userns_mode: host`.
|
||||
- If you cannot use sudo, the installer will:
|
||||
- create minimal data dirs (incl. `private/argus/log/{elasticsearch,kibana}`) with permissive perms when possible;
|
||||
- ensure inside containers: Kibana `data` → `/private/argus/log/kibana`, Elasticsearch `data` → `/private/argus/log/elasticsearch`, and Bind `rndc.key` is generated.
|
||||
(Manual pre-creation scripts are no longer required.)
|
||||
|
||||
## Files & Layout
|
||||
- `compose/` (docker-compose.yml, .env)
|
||||
- `private/` (data mounts)
|
||||
- `scripts/` (install/uninstall/status/selfcheck/diagnose)
|
||||
- `logs/` (selfcheck + diagnose outputs)
|
||||
|
||||
## Troubleshooting (Quick)
|
||||
- Run `./server-selfcheck.sh` → see `logs/selfcheck.json`
|
||||
- Run `./server-diagnose.sh` → produces timestamped logs:
|
||||
- `logs/diagnose_details_YYYYMMDD-HHMMSSZ.log`
|
||||
- `logs/diagnose_error_YYYYMMDD-HHMMSSZ.log`
|
||||
And updates `diagnose_details.log`/`diagnose_error.log` to the latest
|
||||
- Error lines are tagged `[service][source]`, e.g. `[kibana][http] /api/status=503`
|
||||
|
||||
Common issues:
|
||||
- Kibana 503: wait cold start or fix DNS so `es.log.argus.com` resolves
|
||||
- web‑proxy 504: check nginx `resolver` includes `172.31.0.2 127.0.0.11`
|
||||
- EACCES/locks: ensure `sudo ./server-prepare-dirs.sh` ran and ownership matches UID:GID
|
||||
29
deployment/build/templates/docs/INSTALL_SERVER_zh.md
Normal file
29
deployment/build/templates/docs/INSTALL_SERVER_zh.md
Normal file
@ -0,0 +1,29 @@
|
||||
# Argus 服务端离线安装指南
|
||||
|
||||
## 先决条件
|
||||
- Linux x86_64(推荐 Ubuntu 22.04;NixOS 见“兼容说明”)
|
||||
- 已安装 Docker 与 Docker Compose
|
||||
- 端口:32300, 9200, 5601, 9090, 9093, 8080..8085, 21, 20, 21100–21110
|
||||
|
||||
## 快速开始
|
||||
1. 解压到目标目录(例如 `/opt/argus-deploy/versions/<YYYYMMDD>`)
|
||||
2. 安装:`./server-install.sh`(支持普通用户:会自动创建最小目录并在容器内修复 Kibana/ES/Bind)
|
||||
3. 状态:`./server-status.sh`
|
||||
4. 自检:`./server-selfcheck.sh`(失败会自动采集诊断)
|
||||
5. 卸载:`./server-uninstall.sh`
|
||||
|
||||
## 安装流程要点
|
||||
- 仅启动 10 个服务端组件(不包含测试节点);
|
||||
- DNS Bootstrap:补齐首次部署 DNS 依赖(生成/确认 `dns.conf`、统一容器 resolv.conf、写入 `*.argus.com`、reload bind、重启 web‑proxy);
|
||||
- 输出自检结果到 `logs/selfcheck.json`。
|
||||
|
||||
## 兼容说明(NixOS 等)
|
||||
- 使用 `security_opt: ["label=disable"]` 与 `userns_mode: host`。
|
||||
- 非 root 场景:安装器会创建最小目录(含 `private/argus/log/{elasticsearch,kibana}`),并在容器内完成:
|
||||
- Kibana 的 `data` 软链到 `/private/argus/log/kibana`
|
||||
- Elasticsearch 的 `data` 软链到 `/private/argus/log/elasticsearch`
|
||||
- Bind 生成 `/etc/bind/rndc.key`
|
||||
|
||||
## 故障排查(见下文 Troubleshooting_zh)
|
||||
- `./server-selfcheck.sh` → `logs/selfcheck.json`
|
||||
- `./server-diagnose.sh` → `logs/diagnose_error_*.log` / `logs/diagnose_details_*.log`
|
||||
50
deployment/build/templates/docs/SWARM_DEPLOY_zh.md
Normal file
50
deployment/build/templates/docs/SWARM_DEPLOY_zh.md
Normal file
@ -0,0 +1,50 @@
|
||||
# Argus 多机部署(Docker Swarm + External Overlay)
|
||||
|
||||
- 前提:Docker ≥ 20.10;Manager/Worker 节点开放 2377/tcp、7946/tcp+udp、4789/udp。
|
||||
- DNS:Bind9 为统一权威,解析 *.argus.com 至部署机固定对外主机 IP。
|
||||
|
||||
## 在部署机(Manager)
|
||||
- 初始化 Swarm:`docker swarm init --advertise-addr <manager_ip>`
|
||||
- 创建 overlay:`docker network create --driver overlay --attachable argus-sys-net`
|
||||
- 解压离线包后执行:
|
||||
- `./server-install.sh`(会验证 Swarm 状态、确保 overlay 存在、写入 dns.conf)
|
||||
- `./server-selfcheck.sh`(失败会自动触发诊断)
|
||||
|
||||
## 在节点机(Worker 或非 Docker 主机)
|
||||
- Swarm Worker:执行 Manager 的 `docker swarm join ...`;
|
||||
- 运行客户端容器:
|
||||
- `docker run -d --name argus-metric-node-001 --network argus-sys-net -v /data/argus/agent:/private/argus/agent argus-sys-metric-test-node:latest sleep infinity`
|
||||
- 进入容器安装(先 IP 引导,后域名):
|
||||
- `curl -u ftpuser:*** -fsSL ftp://<FTP_IP>:21/setup.sh -o /tmp/setup.sh && chmod +x /tmp/setup.sh`
|
||||
- `AGENT_ENV=dev AGENT_USER=alice AGENT_INSTANCE=node001 MASTER_ENDPOINT=http://master.argus.com:3000 /tmp/setup.sh --server ftp.argus.com --user ftpuser --password '***' --port 21`
|
||||
|
||||
## 关键点
|
||||
- 首次必须使用 FTP 的 IP 引导(下载 setup.sh 与 dns.conf)
|
||||
- MASTER_ENDPOINT 永远使用域名:`http://master.argus.com:3000`
|
||||
- docker compose 改为 external overlay;容器内不使用 Docker 服务名;web-proxy 与组件上游统一用域名
|
||||
|
||||
## 找回/轮换 Swarm 加入令牌与解锁密钥
|
||||
|
||||
在任意一个 Manager 节点上执行以下命令即可查看或轮换加入令牌(join token):
|
||||
|
||||
- 查看加入 Worker 的命令:
|
||||
- `docker swarm join-token worker`
|
||||
- 只打印 Worker 的 token:
|
||||
- `docker swarm join-token -q worker`
|
||||
- 查看加入 Manager 的命令:
|
||||
- `docker swarm join-token manager`
|
||||
- 只打印 Manager 的 token:
|
||||
- `docker swarm join-token -q manager`
|
||||
|
||||
在待加入节点执行(示例,替换 Manager_IP):
|
||||
- `docker swarm join --token <上面查到的token> <Manager_IP>:2377`
|
||||
|
||||
轮换 token(怀疑泄露或需要更新时):
|
||||
- 轮换 Worker:`docker swarm join-token --rotate worker`
|
||||
- 轮换 Manager:`docker swarm join-token --rotate manager`
|
||||
|
||||
如果你指的是“解锁密钥”(autolock 的 unlock key),在 Manager 上:
|
||||
- 查看:`docker swarm unlock-key`
|
||||
- 轮换:`docker swarm unlock-key --rotate`
|
||||
|
||||
提示:当看到 “This node is not a swarm manager.” 时,说明当前节点不是 Manager,需要到 Manager 节点执行,或在现有 Manager 上 `docker node promote <NODE-ID>` 将其提升为 Manager。
|
||||
20
deployment/build/templates/docs/TROUBLESHOOTING.md
Normal file
20
deployment/build/templates/docs/TROUBLESHOOTING.md
Normal file
@ -0,0 +1,20 @@
|
||||
# Troubleshooting
|
||||
|
||||
- Status: `scripts/server-status.sh`
|
||||
- Selfcheck: `scripts/server-selfcheck.sh`
|
||||
- Diagnose: `scripts/server-diagnose.sh`
|
||||
|
||||
Outputs:
|
||||
- `logs/selfcheck.json`
|
||||
- `logs/diagnose_details_*.log` (full details)
|
||||
- `logs/diagnose_error_*.log` (tagged errors)
|
||||
|
||||
Web‑Proxy:
|
||||
- 8083 expects 200/302/403; 8084/8085 must include CORS header
|
||||
- nginx resolver should be `172.31.0.2 127.0.0.11`
|
||||
|
||||
Kibana/ES:
|
||||
- Verify `es.log.argus.com` resolves inside Kibana
|
||||
|
||||
Permissions:
|
||||
- The installer auto-creates minimal dirs and applies container-side fixes (Kibana/ES/Bind). If you still see EACCES/lock errors, rerun `./server-install.sh` and review diagnose logs.
|
||||
16
deployment/build/templates/docs/TROUBLESHOOTING_zh.md
Normal file
16
deployment/build/templates/docs/TROUBLESHOOTING_zh.md
Normal file
@ -0,0 +1,16 @@
|
||||
# 故障排查
|
||||
|
||||
- 状态:`scripts/server-status.sh`
|
||||
- 自检:`scripts/server-selfcheck.sh`
|
||||
- 诊断:`scripts/server-diagnose.sh`
|
||||
|
||||
输出:
|
||||
- `logs/selfcheck.json`
|
||||
- `logs/diagnose_error_*.log`(错误摘要)
|
||||
- `logs/diagnose_details_*.log`(详细信息)
|
||||
|
||||
Web‑Proxy:8083=200/302/403;8084/8085 需包含 CORS
|
||||
Kibana:确认可解析 `es.log.argus.com`
|
||||
权限:
|
||||
- 非 root 安装时,安装器会创建最小目录并在容器内修复 Kibana/ES/Bind;
|
||||
- 如仍有 `EACCES`/锁文件报错,先重跑 `./server-install.sh`(会重复容器内修复),并查看诊断日志。
|
||||
147
deployment/build/templates/scripts/server-diagnose.sh
Executable file
147
deployment/build/templates/scripts/server-diagnose.sh
Executable file
@ -0,0 +1,147 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
|
||||
ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a
|
||||
|
||||
ts="$(date -u +%Y%m%d-%H%M%SZ)"
|
||||
LOG_DIR="$ROOT/logs"
|
||||
mkdir -p "$LOG_DIR" || true
|
||||
# Fallback to /tmp when logs dir is not writable
|
||||
if ! ( : > "$LOG_DIR/.w" 2>/dev/null ); then
|
||||
LOG_DIR="/tmp/argus-logs"
|
||||
mkdir -p "$LOG_DIR" || true
|
||||
fi
|
||||
DETAILS="$LOG_DIR/diagnose_details_${ts}.log"
|
||||
ERRORS="$LOG_DIR/diagnose_error_${ts}.log"
|
||||
: > "$DETAILS"; : > "$ERRORS"
|
||||
|
||||
logd() { echo "$(date '+%F %T') $*" >> "$DETAILS"; }
|
||||
append_err() { echo "$*" >> "$ERRORS"; }
|
||||
|
||||
# Return the HTTP status code for a URL, or "000" when unreachable.
# Bounded by --max-time so diagnosis never hangs on a stuck endpoint
# (consistent with http_body_head below). Note: on failure curl still
# prints its -w output ("000") before exiting non-zero, so the old
# `|| echo 000` form emitted "000000"; capture-then-default avoids that.
http_code() {
  local code
  code=$(curl -s --max-time 5 -o /dev/null -w "%{http_code}" "$1" 2>/dev/null) || true
  printf '%s' "${code:-000}"
}
|
||||
http_body_head() { curl -s --max-time 3 "$1" 2>/dev/null | sed -n '1,5p' || true; }
|
||||
header_val() { curl -s -D - -o /dev/null "$@" | awk -F': ' 'BEGIN{IGNORECASE=1}$1=="Access-Control-Allow-Origin"{gsub("\r","",$2);print $2}'; }
|
||||
|
||||
section() {
|
||||
local name="$1"; logd "===== [$name] ====="; }
|
||||
|
||||
svc() {
|
||||
local svc_name="$1"; local cname="$2"; shift 2
|
||||
section "$svc_name ($cname)"
|
||||
logd "docker ps:"; docker ps -a --format '{{.Names}}\t{{.Status}}\t{{.Image}}' | awk -v n="$cname" '$1==n' >> "$DETAILS" || true
|
||||
logd "docker inspect (state/restartcount):"; docker inspect -f '{{.State.Status}} rc={{.RestartCount}} started={{.State.StartedAt}}' "$cname" >> "$DETAILS" 2>&1 || true
|
||||
logd "last 200 container logs:"; docker logs --tail 200 "$cname" >> "$DETAILS" 2>&1 || true
|
||||
|
||||
# extract error lines from container logs
|
||||
docker logs --tail 200 "$cname" 2>&1 | \
|
||||
grep -Ei '\b(error|failed|fail|exception|panic|fatal|critical|unhealthy|permission denied|forbidden|refused|traceback|错误|失败)\b' | \
|
||||
sed "s/^/[${svc_name}][container] /" >> "$ERRORS" || true
|
||||
|
||||
# supervisor status and logs
|
||||
if docker exec "$cname" sh -lc 'command -v supervisorctl >/dev/null 2>&1' >/dev/null 2>&1; then
|
||||
logd "supervisorctl status:"; docker exec "$cname" sh -lc 'supervisorctl status' >> "$DETAILS" 2>&1 || true
|
||||
# iterate supervisor logs and collect tails + errors per file
|
||||
local files
|
||||
files=$(docker exec "$cname" sh -lc 'ls /var/log/supervisor/*.log 2>/dev/null' || true)
|
||||
for f in $files; do
|
||||
logd "tail -n 80 $f:"; docker exec "$cname" sh -lc "tail -n 80 $f 2>/dev/null || true" >> "$DETAILS" 2>&1 || true
|
||||
docker exec "$cname" sh -lc "tail -n 200 $f 2>/dev/null" 2>/dev/null | \
|
||||
grep -Ei '\b(error|failed|fail|exception|panic|fatal|critical|unhealthy|permission denied|forbidden|refused|traceback|错误|失败)\b' | \
|
||||
sed "s/^/[${svc_name}][supervisor:$(basename $f)] /" >> "$ERRORS" || true
|
||||
done
|
||||
fi
|
||||
}
|
||||
|
||||
# Core services
|
||||
svc bind argus-bind-sys
|
||||
svc master argus-master-sys
|
||||
svc es argus-es-sys
|
||||
svc kibana argus-kibana-sys
|
||||
svc ftp argus-ftp
|
||||
svc prometheus argus-prometheus
|
||||
svc grafana argus-grafana
|
||||
svc alertmanager argus-alertmanager
|
||||
svc web-frontend argus-web-frontend
|
||||
svc web-proxy argus-web-proxy
|
||||
|
||||
# HTTP checks (host side)
|
||||
section HTTP
|
||||
logd "ES: $(http_code "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health")"
|
||||
http_body_head "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health" >> "$DETAILS" 2>&1 || true
|
||||
|
||||
logd "Kibana: $(http_code "http://localhost:${KIBANA_PORT:-5601}/api/status")"
|
||||
http_body_head "http://localhost:${KIBANA_PORT:-5601}/api/status" >> "$DETAILS" 2>&1 || true
|
||||
|
||||
logd "Master readyz: $(http_code "http://localhost:${MASTER_PORT:-32300}/readyz")"
|
||||
|
||||
logd "Prometheus: $(http_code "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready")"
|
||||
logd "Grafana: $(http_code "http://localhost:${GRAFANA_PORT:-3000}/api/health")"
|
||||
http_body_head "http://localhost:${GRAFANA_PORT:-3000}/api/health" >> "$DETAILS" 2>&1 || true
|
||||
logd "Alertmanager: $(http_code "http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status")"
|
||||
|
||||
cors8084=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8084:-8084}/api/v2/status" || true)
|
||||
cors8085=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8085:-8085}/api/v1/master/nodes" || true)
|
||||
logd "Web-Proxy 8080: $(http_code "http://localhost:${WEB_PROXY_PORT_8080:-8080}/")"
|
||||
logd "Web-Proxy 8083: $(http_code "http://localhost:${WEB_PROXY_PORT_8083:-8083}/")"
|
||||
logd "Web-Proxy 8084 CORS: ${cors8084}"
|
||||
logd "Web-Proxy 8085 CORS: ${cors8085}"
|
||||
|
||||
# Overlay network diagnostics
|
||||
section OVERLAY-NET
|
||||
if docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" >/dev/null 2>&1; then
|
||||
logd "overlay present: ${OVERLAY_NET_NAME:-argus-sys-net}"
|
||||
docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" | sed -n '1,60p' >> "$DETAILS" 2>/dev/null || true
|
||||
else
|
||||
append_err "[overlay][network] missing ${OVERLAY_NET_NAME:-argus-sys-net}"
|
||||
fi
|
||||
|
||||
# Domain resolution & reachability from inside web-proxy (bind-backed)
|
||||
section DOMAIN
|
||||
for d in master.argus.com ftp.argus.com kibana.argus.com es.log.argus.com prom.argus.com grafana.argus.com; do
|
||||
logd "getent $d (web-proxy):"
|
||||
docker exec argus-web-proxy sh -lc "getent hosts $d || true" >> "$DETAILS" 2>&1 || true
|
||||
done
|
||||
logd "HTTP (web-proxy): master.readyz=$(docker exec argus-web-proxy sh -lc \"curl -s -o /dev/null -w '%{http_code}' http://master.argus.com:3000/readyz\" 2>/dev/null || echo 000)"
|
||||
logd "HTTP (web-proxy): es.health=$(docker exec argus-web-proxy sh -lc \"curl -s -o /dev/null -w '%{http_code}' http://es.log.argus.com:9200/_cluster/health\" 2>/dev/null || echo 000)"
|
||||
logd "HTTP (web-proxy): kibana.status=$(docker exec argus-web-proxy sh -lc \"curl -s -o /dev/null -w '%{http_code}' http://kibana.argus.com:5601/api/status\" 2>/dev/null || echo 000)"
|
||||
|
||||
# FTP share writability (container perspective)
|
||||
section FTP-SHARE
|
||||
docker exec argus-ftp sh -lc 'ls -ld /private/argus/ftp /private/argus/ftp/share; test -w /private/argus/ftp/share && echo "write:OK" || echo "write:FAIL"' >> "$DETAILS" 2>&1 || true
|
||||
|
||||
# Collect system info for context
|
||||
section SYSTEM
|
||||
logd "uname -a:"; uname -a >> "$DETAILS"
|
||||
logd "docker version:"; docker version --format '{{.Server.Version}}' >> "$DETAILS" 2>&1 || true
|
||||
logd "compose ps:"; (cd "$ROOT/compose" && if docker compose version >/dev/null 2>&1; then docker compose -p argus-sys ps; else docker-compose -p argus-sys ps; fi) >> "$DETAILS" 2>&1 || true
|
||||
|
||||
section SUMMARY
|
||||
# Add HTTP failures and CORS problems to error log with tags
|
||||
[[ $(http_code "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health") != 200 ]] && echo "[es][http] health not 200" >> "$ERRORS"
|
||||
kbcode=$(http_code "http://localhost:${KIBANA_PORT:-5601}/api/status"); [[ "$kbcode" != 200 ]] && echo "[kibana][http] /api/status=$kbcode" >> "$ERRORS"
|
||||
[[ $(http_code "http://localhost:${MASTER_PORT:-32300}/readyz") != 200 ]] && echo "[master][http] /readyz not 200" >> "$ERRORS"
|
||||
[[ $(http_code "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready") != 200 ]] && echo "[prometheus][http] /-/ready not 200" >> "$ERRORS"
|
||||
gfcode=$(http_code "http://localhost:${GRAFANA_PORT:-3000}/api/health"); [[ "$gfcode" != 200 ]] && echo "[grafana][http] /api/health=$gfcode" >> "$ERRORS"
|
||||
[[ $(http_code "http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status") != 200 ]] && echo "[alertmanager][http] /api/v2/status not 200" >> "$ERRORS"
|
||||
[[ -z "$cors8084" ]] && echo "[web-proxy][cors] 8084 missing Access-Control-Allow-Origin" >> "$ERRORS"
|
||||
[[ -z "$cors8085" ]] && echo "[web-proxy][cors] 8085 missing Access-Control-Allow-Origin" >> "$ERRORS"
|
||||
|
||||
# Deduplicate errors
|
||||
sort -u -o "$ERRORS" "$ERRORS"
|
||||
|
||||
echo "Diagnostic details -> $DETAILS"
|
||||
echo "Detected errors -> $ERRORS"
|
||||
|
||||
# Maintain 'latest' convenience links only when writing under the package logs
# directory. The /tmp fallback needs no extra work — both result paths were
# already printed unconditionally above, so the old else-branch duplicated them.
if [[ "$LOG_DIR" == "$ROOT/logs" ]]; then
  ln -sfn "$(basename "$DETAILS")" "$ROOT/logs/diagnose_details.log" 2>/dev/null || cp "$DETAILS" "$ROOT/logs/diagnose_details.log" 2>/dev/null || true
  ln -sfn "$(basename "$ERRORS")" "$ROOT/logs/diagnose_error.log" 2>/dev/null || cp "$ERRORS" "$ROOT/logs/diagnose_error.log" 2>/dev/null || true
fi
|
||||
|
||||
exit 0
|
||||
281
deployment/build/templates/scripts/server-install.sh
Executable file
281
deployment/build/templates/scripts/server-install.sh
Executable file
@ -0,0 +1,281 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
PKG_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" # version root
|
||||
|
||||
PROJECT_NAME="argus-sys"
|
||||
|
||||
log() { echo -e "\033[0;34m[INSTALL]\033[0m $*"; }
|
||||
err() { echo -e "\033[0;31m[ERROR ]\033[0m $*" >&2; }
|
||||
|
||||
require() { command -v "$1" >/dev/null 2>&1 || { err "missing command: $1"; exit 1; }; }
|
||||
|
||||
require docker
|
||||
if docker compose version >/dev/null 2>&1; then COMPOSE=(docker compose); else require docker-compose; COMPOSE=(docker-compose); fi
|
||||
|
||||
ENV_FILE="$PKG_ROOT/compose/.env"
|
||||
ENV_TEMPLATE="$PKG_ROOT/compose/.env.example"
|
||||
|
||||
find_free_port() {
|
||||
local prefer="$1"; local start=${2:-20000}; local max=${3:-65000};
|
||||
if ! ss -ltnH 2>/dev/null | awk -v pat=":"$prefer"$" '$4 ~ pat{f=1} END{exit f?0:1}'; then echo "$prefer"; return; fi
|
||||
for ((p=start; p<=max; p++)); do
|
||||
if ! ss -ltnH 2>/dev/null | awk -v pat=":"$p"$" '$4 ~ pat{f=1} END{exit f?0:1}'; then echo "$p"; return; fi
|
||||
done
|
||||
return 1
|
||||
}
|
||||
|
||||
prepare_env() {
|
||||
if [[ -f "$ENV_FILE" ]]; then log ".env exists, keep as-is"; return; fi
|
||||
[[ -f "$ENV_TEMPLATE" ]] || { err "missing $ENV_TEMPLATE"; exit 1; }
|
||||
cp "$ENV_TEMPLATE" "$ENV_FILE"
|
||||
# overlay 模式下,避免为不同服务分配到同一新端口;保持模板端口不做自动改写
|
||||
}
|
||||
|
||||
prepare_data_dirs() {
|
||||
if [[ $EUID -ne 0 ]]; then
|
||||
echo -e "\033[1;33m[WARN]\033[0m running as non-root: will not chown data dirs."
|
||||
echo -e "\033[1;33m[WARN]\033[0m If you hit Permission denied, run: sudo $SCRIPT_DIR/server-prepare-dirs.sh"
|
||||
# still ensure basic directories exist (no chown)
|
||||
mkdir -p \
|
||||
"$PKG_ROOT/private/argus/etc" \
|
||||
"$PKG_ROOT/private/argus/log/elasticsearch" \
|
||||
"$PKG_ROOT/private/argus/log/kibana" \
|
||||
"$PKG_ROOT/private/argus/metric/prometheus" \
|
||||
"$PKG_ROOT/private/argus/metric/prometheus/data" \
|
||||
"$PKG_ROOT/private/argus/metric/prometheus/rules" \
|
||||
"$PKG_ROOT/private/argus/metric/grafana" \
|
||||
"$PKG_ROOT/private/argus/metric/grafana/data" \
|
||||
"$PKG_ROOT/private/argus/metric/grafana/logs" \
|
||||
"$PKG_ROOT/private/argus/metric/grafana/plugins" \
|
||||
"$PKG_ROOT/private/argus/metric/grafana/provisioning/datasources" \
|
||||
"$PKG_ROOT/private/argus/metric/grafana/provisioning/dashboards" \
|
||||
"$PKG_ROOT/private/argus/metric/grafana/data/sessions" \
|
||||
"$PKG_ROOT/private/argus/metric/grafana/data/dashboards" \
|
||||
"$PKG_ROOT/private/argus/alert/alertmanager" \
|
||||
"$PKG_ROOT/private/argus/metric/ftp/share"
|
||||
# non-root: relax permissions to avoid container UID mismatch blocking writes
|
||||
chmod -R a+rwx "$PKG_ROOT/private/argus" 2>/dev/null || true
|
||||
fi
|
||||
}
|
||||
|
||||
ensure_swarm_and_overlay() {
|
||||
local net_name="${OVERLAY_NET_NAME:-argus-sys-net}"
|
||||
# Require swarm active
|
||||
local state
|
||||
state=$(docker info --format '{{.Swarm.LocalNodeState}}' 2>/dev/null || echo "")
|
||||
if [[ "$state" != "active" ]]; then
|
||||
err "Docker Swarm is not active. On this host run:"
|
||||
err " docker swarm init --advertise-addr <this_host_ip>"
|
||||
exit 1
|
||||
fi
|
||||
# Create attachable overlay if missing
|
||||
if ! docker network inspect "$net_name" >/dev/null 2>&1; then
|
||||
log "creating attachable overlay network: $net_name"
|
||||
docker network create --driver overlay --attachable "$net_name" >/dev/null
|
||||
fi
|
||||
}
|
||||
|
||||
bootstrap_dns_conf() {
|
||||
local etc_dir="$PKG_ROOT/private/argus/etc"
|
||||
mkdir -p "$etc_dir"
|
||||
local dns_file="$etc_dir/dns.conf"
|
||||
if [[ ! -s "$dns_file" ]]; then
|
||||
# detect host primary IP
|
||||
local host_ip
|
||||
host_ip=$(ip route get 1.1.1.1 2>/dev/null | awk '/src/ {print $7; exit}')
|
||||
[[ -z "$host_ip" ]] && host_ip=$(hostname -I 2>/dev/null | awk '{print $1}')
|
||||
if [[ -n "$host_ip" ]]; then
|
||||
echo "$host_ip" > "$dns_file"
|
||||
log "wrote initial dns.conf with host IP: $host_ip"
|
||||
else
|
||||
err "failed to determine host IP for dns.conf; please edit $dns_file manually"
|
||||
fi
|
||||
fi
|
||||
}
|
||||
|
||||
load_images() {
|
||||
local tar="$PKG_ROOT/images/all-images.tar.gz"
|
||||
[[ -f "$tar" ]] || { err "missing images tar: $tar"; exit 1; }
|
||||
log "loading images from $(basename "$tar") (may take minutes)"
|
||||
gunzip -c "$tar" | docker load >/dev/null
|
||||
}
|
||||
|
||||
bring_up() {
|
||||
log "starting services via compose"
|
||||
ensure_swarm_and_overlay
|
||||
bootstrap_dns_conf
|
||||
local ov="$PKG_ROOT/compose/docker-compose.os-compat.override.yml"
|
||||
if [[ ! -f "$ov" ]]; then
|
||||
cat > "$ov" <<'YAML'
|
||||
services:
|
||||
bind:
|
||||
security_opt: ["label=disable"]
|
||||
userns_mode: "host"
|
||||
tmpfs:
|
||||
- /run/named
|
||||
master:
|
||||
security_opt: ["label=disable"]
|
||||
userns_mode: "host"
|
||||
es:
|
||||
security_opt: ["label=disable"]
|
||||
userns_mode: "host"
|
||||
kibana:
|
||||
security_opt: ["label=disable"]
|
||||
userns_mode: "host"
|
||||
ftp:
|
||||
security_opt: ["label=disable"]
|
||||
userns_mode: "host"
|
||||
prometheus:
|
||||
security_opt: ["label=disable"]
|
||||
userns_mode: "host"
|
||||
grafana:
|
||||
security_opt: ["label=disable"]
|
||||
userns_mode: "host"
|
||||
alertmanager:
|
||||
security_opt: ["label=disable"]
|
||||
userns_mode: "host"
|
||||
# ensure runtime path matches container expectation
|
||||
volumes:
|
||||
- ../private/argus/etc:/private/argus/etc
|
||||
- ../private/argus/alert/alertmanager:/alertmanager
|
||||
web-frontend:
|
||||
security_opt: ["label=disable"]
|
||||
userns_mode: "host"
|
||||
web-proxy:
|
||||
security_opt: ["label=disable"]
|
||||
userns_mode: "host"
|
||||
YAML
|
||||
log "generated OS-compat override: $(basename "$ov")"
|
||||
fi
|
||||
# 仅启动服务端组件,避免误起测试节点(node-a/node-b/test-node/test-gpu-node)
|
||||
local services=(bind master es kibana ftp prometheus grafana alertmanager web-frontend web-proxy)
|
||||
log "services: ${services[*]}"
|
||||
(cd "$PKG_ROOT/compose" && "${COMPOSE[@]}" -p "$PROJECT_NAME" -f docker-compose.yml -f $(basename "$ov") up -d "${services[@]}")
|
||||
}
|
||||
|
||||
# Post bootstrap container-side fixes that do not require sudo on host.
|
||||
post_bootstrap_fixes() {
|
||||
# Kibana: ensure /usr/share/kibana/data is a symlink into mounted path to avoid EACCES
|
||||
if docker ps --format '{{.Names}}' | grep -q '^argus-kibana-sys$'; then
|
||||
docker exec argus-kibana-sys bash -lc '
|
||||
set -e
|
||||
mkdir -p /private/argus/log/kibana && chmod 777 /private/argus/log/kibana || true
|
||||
if [ -d /usr/share/kibana/data ] && [ ! -L /usr/share/kibana/data ]; then rm -rf /usr/share/kibana/data; fi
|
||||
if [ ! -e /usr/share/kibana/data ]; then ln -s /private/argus/log/kibana /usr/share/kibana/data; fi
|
||||
' >/dev/null 2>&1 || true
|
||||
fi
|
||||
# Elasticsearch: ensure data path points to mounted path and is writable
|
||||
if docker ps --format '{{.Names}}' | grep -q '^argus-es-sys$'; then
|
||||
docker exec argus-es-sys bash -lc '
|
||||
set -e
|
||||
mkdir -p /private/argus/log/elasticsearch && chmod 777 /private/argus/log/elasticsearch || true
|
||||
if [ -d /usr/share/elasticsearch/data ] && [ ! -L /usr/share/elasticsearch/data ]; then rm -rf /usr/share/elasticsearch/data; fi
|
||||
if [ ! -e /usr/share/elasticsearch/data ]; then ln -s /private/argus/log/elasticsearch /usr/share/elasticsearch/data; fi
|
||||
' >/dev/null 2>&1 || true
|
||||
fi
|
||||
# Bind9: ensure rndc.key exists
|
||||
if docker ps --format '{{.Names}}' | grep -q '^argus-bind-sys$'; then
|
||||
docker exec argus-bind-sys bash -lc '
|
||||
set -e
|
||||
mkdir -p /etc/bind
|
||||
if [ ! -f /etc/bind/rndc.key ]; then rndc-confgen -a -c /etc/bind/rndc.key; fi
|
||||
chmod 644 /etc/bind/rndc.key || true
|
||||
' >/dev/null 2>&1 || true
|
||||
fi
|
||||
}
|
||||
|
||||
#######################################
# First-boot DNS wiring: seed the shared dns.conf, point container
# resolv.conf at bind, wait for service A-record hint files, then reload
# bind and restart web-proxy so nginx re-renders its resolver directive.
# Globals:   PKG_ROOT (read)
# Outputs:   progress via log()
# Returns:   0 (all container-side steps are best-effort)
#######################################
dns_bootstrap() {
  log "DNS bootstrap: initializing shared dns.conf and container resolv.conf"
  local etc_dir="$PKG_ROOT/private/argus/etc"
  mkdir -p "$etc_dir"

  # 1) ensure dns.conf exists (fallback to bind's fixed IP 172.31.0.2)
  if [[ ! -s "$etc_dir/dns.conf" ]]; then
    if echo "172.31.0.2" > "$etc_dir/dns.conf" 2>/dev/null; then
      log "wrote fallback dns.conf with 172.31.0.2"
    else
      # host-side write denied (dir owned 1000:1000); write via bind container instead
      if docker ps --format '{{.Names}}' | grep -q '^argus-bind-sys$'; then
        docker exec argus-bind-sys sh -lc 'echo 172.31.0.2 > /private/argus/etc/dns.conf && chmod 644 /private/argus/etc/dns.conf' || true
        log "fallback dns.conf written via bind container"
      else
        log "bind not ready; skip writing fallback dns.conf"
      fi
    fi
  fi

  # 2) wait briefly for bind to copy update-dns.sh into the shared etc dir
  #    (bind's startup.sh does this shortly after the container starts)
  local i=0
  while [[ ! -x "$etc_dir/update-dns.sh" && $i -lt 20 ]]; do
    sleep 0.5
    # NB: '((i++))' returns status 1 while i is 0 and would abort the whole
    # installer under 'set -e'; the POSIX arithmetic assignment is safe.
    i=$((i+1))
  done
  if [[ ! -x "$etc_dir/update-dns.sh" ]]; then
    log "update-dns.sh not present yet; continuing with existing resolv.conf"
  fi

  # 3) run update-dns.sh inside key containers so /etc/resolv.conf points to bind
  local c
  for c in argus-master-sys argus-es-sys argus-kibana-sys argus-grafana argus-prometheus argus-ftp argus-web-frontend argus-web-proxy argus-alertmanager; do
    if docker ps --format '{{.Names}}' | grep -q "^${c}$"; then
      docker exec "$c" sh -lc 'test -x /private/argus/etc/update-dns.sh && /private/argus/etc/update-dns.sh || true' >/dev/null 2>&1 || true
    fi
  done

  # 4) wait (best-effort, max ~15s) for service A-record hint files
  local need=( es.log.argus.com kibana.log.argus.com master.argus.com grafana.metric.argus.com prom.metric.argus.com alertmanager.alert.argus.com )
  local waited=0 missing=1 f
  while (( waited < 15 )); do
    missing=0
    for f in "${need[@]}"; do [[ -s "$etc_dir/$f" ]] || { missing=1; break; }; done
    [[ $missing -eq 0 ]] && break
    sleep 1
    waited=$((waited+1))   # same 'set -e' hazard as above: avoid ((waited++))
  done

  # 5) reload bind zone (script restarts bind9 via supervisor)
  if docker ps --format '{{.Names}}' | grep -q '^argus-bind-sys$'; then
    docker exec argus-bind-sys sh -lc '/usr/local/bin/reload-bind9.sh' >/dev/null 2>&1 || true
  fi

  # 6) restart web-proxy once so nginx re-renders its resolver from dns.conf
  if docker ps --format '{{.Names}}' | grep -q '^argus-web-proxy$'; then
    docker restart argus-web-proxy >/dev/null 2>&1 || true
  fi
}
|
||||
|
||||
selfcheck() {
|
||||
# Initial selfcheck with retries to absorb cold starts
|
||||
local max_retries="${SELF_CHECK_RETRIES:-5}" # 重试次数(不含首次),默认 5
|
||||
local wait_seconds="${SELF_CHECK_WAIT_SECONDS:-30}" # 每次重试前等待秒数,默认 30s
|
||||
|
||||
local attempt=0
|
||||
while :; do
|
||||
attempt=$((attempt+1))
|
||||
if (( attempt == 1 )); then
|
||||
log "running selfcheck (attempt ${attempt})"
|
||||
else
|
||||
log "running selfcheck (attempt ${attempt}/${max_retries}+1)"
|
||||
fi
|
||||
|
||||
if bash "$PKG_ROOT/scripts/server-selfcheck.sh"; then
|
||||
return 0
|
||||
fi
|
||||
|
||||
# failed
|
||||
if (( attempt > max_retries )); then
|
||||
err "selfcheck failed after ${attempt} attempt(s)"
|
||||
exit 1
|
||||
fi
|
||||
log "selfcheck not ready yet; retrying in ${wait_seconds}s..."
|
||||
sleep "$wait_seconds"
|
||||
done
|
||||
}
|
||||
|
||||
main() {
|
||||
mkdir -p "$PKG_ROOT/logs"
|
||||
prepare_env
|
||||
prepare_data_dirs
|
||||
load_images
|
||||
bring_up
|
||||
post_bootstrap_fixes
|
||||
dns_bootstrap
|
||||
selfcheck
|
||||
log "install completed. See logs in $PKG_ROOT/logs/"
|
||||
}
|
||||
|
||||
main "$@"
|
||||
73
deployment/build/templates/scripts/server-prepare-dirs.sh
Executable file
73
deployment/build/templates/scripts/server-prepare-dirs.sh
Executable file
@ -0,0 +1,73 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
PKG_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
|
||||
if [[ $EUID -ne 0 ]]; then
|
||||
echo "[PREPARE] This script requires root (sudo)." >&2
|
||||
echo " Try: sudo $0" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
ENV_FILE="$PKG_ROOT/compose/.env"
|
||||
[[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a
|
||||
UIDV="${ARGUS_BUILD_UID:-1000}"; GIDV="${ARGUS_BUILD_GID:-1000}"
|
||||
|
||||
echo "[PREPARE] Using owner ${UIDV}:${GIDV}"
|
||||
|
||||
# Core etc and service data dirs (aligned with src/sys/tests/scripts/01_bootstrap.sh)
|
||||
mkdir -p \
|
||||
"$PKG_ROOT/private/argus/etc" \
|
||||
"$PKG_ROOT/private/argus/bind" \
|
||||
"$PKG_ROOT/private/argus/master" \
|
||||
"$PKG_ROOT/private/argus/agent" \
|
||||
"$PKG_ROOT/private/argus/log/elasticsearch" \
|
||||
"$PKG_ROOT/private/argus/log/kibana"
|
||||
|
||||
# Prometheus
|
||||
mkdir -p \
|
||||
"$PKG_ROOT/private/argus/metric/prometheus" \
|
||||
"$PKG_ROOT/private/argus/metric/prometheus/data" \
|
||||
"$PKG_ROOT/private/argus/metric/prometheus/rules" \
|
||||
"$PKG_ROOT/private/argus/metric/prometheus/targets"
|
||||
|
||||
# Grafana
|
||||
mkdir -p \
|
||||
"$PKG_ROOT/private/argus/metric/grafana" \
|
||||
"$PKG_ROOT/private/argus/metric/grafana/data" \
|
||||
"$PKG_ROOT/private/argus/metric/grafana/logs" \
|
||||
"$PKG_ROOT/private/argus/metric/grafana/plugins" \
|
||||
"$PKG_ROOT/private/argus/metric/grafana/provisioning/datasources" \
|
||||
"$PKG_ROOT/private/argus/metric/grafana/provisioning/dashboards" \
|
||||
"$PKG_ROOT/private/argus/metric/grafana/data/sessions" \
|
||||
"$PKG_ROOT/private/argus/metric/grafana/data/dashboards" \
|
||||
"$PKG_ROOT/private/argus/metric/grafana/config"
|
||||
|
||||
# FTP
|
||||
mkdir -p "$PKG_ROOT/private/argus/metric/ftp/share"
|
||||
|
||||
# Alertmanager
|
||||
mkdir -p "$PKG_ROOT/private/argus/alert/alertmanager"
|
||||
|
||||
chown -R "$UIDV":"$GIDV" \
|
||||
"$PKG_ROOT/private/argus/etc" \
|
||||
"$PKG_ROOT/private/argus/bind" \
|
||||
"$PKG_ROOT/private/argus/master" \
|
||||
"$PKG_ROOT/private/argus/agent" \
|
||||
"$PKG_ROOT/private/argus/log/elasticsearch" \
|
||||
"$PKG_ROOT/private/argus/log/kibana" \
|
||||
"$PKG_ROOT/private/argus/metric/prometheus" \
|
||||
"$PKG_ROOT/private/argus/metric/grafana" \
|
||||
"$PKG_ROOT/private/argus/metric/ftp" \
|
||||
"$PKG_ROOT/private/argus/alert"
|
||||
|
||||
chmod -R g+w "$PKG_ROOT/private/argus/alert" "$PKG_ROOT/private/argus/etc" || true
|
||||
|
||||
# Ensure parent directories also owned by runtime user for consistency
|
||||
chown "$UIDV":"$GIDV" \
|
||||
"$PKG_ROOT/private/argus" \
|
||||
"$PKG_ROOT/private/argus/log" \
|
||||
"$PKG_ROOT/private/argus/metric" || true
|
||||
|
||||
echo "[PREPARE] Done. You can now run server-install.sh"
|
||||
104
deployment/build/templates/scripts/server-selfcheck.sh
Executable file
104
deployment/build/templates/scripts/server-selfcheck.sh
Executable file
@ -0,0 +1,104 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
|
||||
log() { echo -e "\033[0;34m[CHECK]\033[0m $*"; }
|
||||
err() { echo -e "\033[0;31m[ERROR]\033[0m $*" >&2; }
|
||||
|
||||
ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a
|
||||
|
||||
wait_http() { local url="$1"; local attempts=${2:-120}; local i=1; while ((i<=attempts)); do curl -fsS "$url" >/dev/null 2>&1 && return 0; echo "[..] waiting $url ($i/$attempts)"; sleep 5; ((i++)); done; return 1; }
|
||||
code_for() { curl -s -o /dev/null -w "%{http_code}" "$1" || echo 000; }
|
||||
header_val() { curl -s -D - -o /dev/null "$@" | awk -F': ' 'BEGIN{IGNORECASE=1}$1=="Access-Control-Allow-Origin"{gsub("\r","",$2);print $2}'; }
|
||||
|
||||
LOG_DIR="$ROOT/logs"
|
||||
mkdir -p "$LOG_DIR" || true
|
||||
OUT_JSON="$LOG_DIR/selfcheck.json"
|
||||
tmp=$(mktemp)
|
||||
|
||||
ok=1
|
||||
|
||||
log "checking overlay network"
|
||||
net_ok=false
|
||||
if docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" >/dev/null 2>&1; then
|
||||
if docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" | grep -q '"Driver": "overlay"'; then net_ok=true; fi
|
||||
fi
|
||||
[[ "$net_ok" == true ]] || ok=0
|
||||
|
||||
log "checking Elasticsearch (via domain inside web-proxy)"
# The URL must be quoted inside the inner shell: an unquoted '&' backgrounds
# curl and executes 'timeout=1s' as a no-op assignment, so the exec always
# exited 0 and the check could never fail.
if docker exec argus-web-proxy sh -lc "curl -fsS 'http://es.log.argus.com:9200/_cluster/health?wait_for_status=yellow&timeout=1s'" >/dev/null 2>&1; then es_ok=true; else es_ok=false; ok=0; fi
|
||||
|
||||
log "checking Kibana (via domain inside web-proxy)"
|
||||
kb_code=$(docker exec argus-web-proxy sh -lc "curl -s -o /dev/null -w '%{http_code}' http://kibana.log.argus.com:5601/api/status" || echo 000)
|
||||
kb_ok=false
|
||||
if [[ "$kb_code" == "200" ]]; then body=$(curl -sS "http://localhost:${KIBANA_PORT:-5601}/api/status"); echo "$body" | grep -q '"level":"available"' && kb_ok=true; fi
|
||||
[[ "$kb_ok" == true ]] || ok=0
|
||||
|
||||
log "checking Master (via domain inside web-proxy)"
if docker exec argus-web-proxy sh -lc "curl -fsS http://master.argus.com:3000/readyz" >/dev/null 2>&1; then master_ok=true; else master_ok=false; ok=0; fi

log "checking FTP"
if docker ps --format '{{.Names}}' | grep -q '^argus-ftp$'; then
  if docker exec argus-ftp sh -lc 'test -w /private/argus/ftp/share'; then ftp_ok=true; else ftp_ok=false; ok=0; fi
else
  ftp_ok=false; ok=0
fi

log "checking Prometheus"
if wait_http "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready" 60; then prom_ok=true; else prom_ok=false; ok=0; fi

log "checking Grafana"
gf_code=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:${GRAFANA_PORT:-3000}/api/health" || echo 000)
gf_ok=false
if [[ "$gf_code" == "200" ]]; then
  body=$(curl -sS "http://localhost:${GRAFANA_PORT:-3000}/api/health")
  echo "$body" | grep -q '"database"\s*:\s*"ok"' && gf_ok=true
fi
[[ "$gf_ok" == true ]] || ok=0

log "checking Alertmanager"
if wait_http "http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status" 60; then am_ok=true; else am_ok=false; ok=0; fi

log "checking Web-Proxy"
p8080=$(code_for "http://localhost:${WEB_PROXY_PORT_8080:-8080}/")
p8083=$(code_for "http://localhost:${WEB_PROXY_PORT_8083:-8083}/")
cors8084=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8084:-8084}/api/v2/status" || true)
cors8085=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8085:-8085}/api/v1/master/nodes" || true)
wp_ok=true
# Some environments return 403 on the landing pages; accept 200/403 (and 302 for 8083).
([[ "$p8080" == 200 || "$p8080" == 403 ]]) || wp_ok=false
([[ "$p8083" == 200 || "$p8083" == 302 || "$p8083" == 403 ]]) || wp_ok=false
[[ -n "$cors8084" && -n "$cors8085" ]] || wp_ok=false
[[ "$wp_ok" == true ]] || ok=0

# Emit the actual per-service results. Previously master_readyz/prometheus/
# alertmanager were hardcoded to "true", so selfcheck.json could report a
# healthy stack even when those checks had failed.
cat > "$tmp" <<JSON
{
  "es": $es_ok,
  "kibana": $kb_ok,
  "master_readyz": $master_ok,
  "ftp_share_writable": $ftp_ok,
  "prometheus": $prom_ok,
  "grafana": $gf_ok,
  "alertmanager": $am_ok,
  "web_proxy": $wp_ok,
  "overlay_net": $net_ok,
  "timestamp": "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
}
JSON
|
||||
|
||||
if ! mv "$tmp" "$OUT_JSON" 2>/dev/null; then
|
||||
# fallback when logs dir not writable (no sudo allowed)
|
||||
OUT_JSON="/tmp/selfcheck_$(date -u +%Y%m%d-%H%M%SZ).json"
|
||||
cp "$tmp" "$OUT_JSON"
|
||||
log "selfcheck.json written to $OUT_JSON (logs dir not writable)"
|
||||
fi
|
||||
if [[ "$ok" == 1 ]]; then
|
||||
log "selfcheck OK"
|
||||
exit 0
|
||||
else
|
||||
err "selfcheck FAILED (see $OUT_JSON)"
|
||||
# If diagnose script exists, run it to collect more details
|
||||
if [[ -x "$SCRIPT_DIR/server-diagnose.sh" ]]; then
|
||||
# run diagnose; it will print the actual timestamped file paths and update 'latest' symlinks
|
||||
"$SCRIPT_DIR/server-diagnose.sh" || true
|
||||
fi
|
||||
exit 1
|
||||
fi
|
||||
28
deployment/build/templates/scripts/server-status.sh
Executable file
28
deployment/build/templates/scripts/server-status.sh
Executable file
@ -0,0 +1,28 @@
|
||||
#!/usr/bin/env bash
# server-status.sh — print container status, key service endpoints and the
# latest selfcheck result for an installed Argus server package.
set -euo pipefail

script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
pkg_root="$(cd "$script_dir/.." && pwd)"

readonly PROJECT_NAME="argus-sys"

# Prefer the compose v2 plugin; fall back to the standalone binary.
if docker compose version >/dev/null 2>&1; then
  compose_cmd=(docker compose)
else
  compose_cmd=(docker-compose)
fi

echo "== Containers =="
# Subshell keeps the working-directory change from leaking.
(cd "$pkg_root/compose" && "${compose_cmd[@]}" -p "$PROJECT_NAME" ps)

echo
echo "== Key Endpoints =="
# Export everything from .env (if present) so port overrides take effect below.
env_file="$pkg_root/compose/.env"
if [[ -f "$env_file" ]]; then
  set -a
  # shellcheck disable=SC1090
  source "$env_file"
  set +a
fi

# print_endpoint NAME PORT SUFFIX — one "<name> http://localhost:<port><suffix>" line.
print_endpoint() {
  printf "%s http://localhost:%s%s\n" "$1" "$2" "$3"
}

print_endpoint "master" "${MASTER_PORT:-32300}" "/readyz"
print_endpoint "es" "${ES_HTTP_PORT:-9200}" "/_cluster/health"
print_endpoint "kibana" "${KIBANA_PORT:-5601}" "/api/status"
print_endpoint "prom" "${PROMETHEUS_PORT:-9090}" "/-/ready"
print_endpoint "grafana" "${GRAFANA_PORT:-3000}" "/api/health"
print_endpoint "alert" "${ALERTMANAGER_PORT:-9093}" "/api/v2/status"
print_endpoint "web" "${WEB_PROXY_PORT_8080:-8080}" "/ (8080)"

echo
echo "== Selfcheck result =="
# Show the last selfcheck output; fall back to a hint when none exists yet.
cat "$pkg_root/logs/selfcheck.json" 2>/dev/null || echo "(no selfcheck yet)"
|
||||
|
||||
16
deployment/build/templates/scripts/server-uninstall.sh
Executable file
16
deployment/build/templates/scripts/server-uninstall.sh
Executable file
@ -0,0 +1,16 @@
|
||||
#!/usr/bin/env bash
# server-uninstall.sh — tear down the Argus compose stack (containers,
# network, named volumes). On-disk data under private/ is intentionally kept.
set -euo pipefail

script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
pkg_root="$(cd "$script_dir/.." && pwd)"

readonly PROJECT_NAME="argus-sys"

log() { echo -e "\033[0;34m[UNINSTALL]\033[0m $*"; }

# Prefer the compose v2 plugin; fall back to the standalone binary.
if docker compose version >/dev/null 2>&1; then
  compose_cmd=(docker compose)
else
  compose_cmd=(docker-compose)
fi

# Best effort: never fail the uninstall when the stack is already gone.
(cd "$pkg_root/compose" && "${compose_cmd[@]}" -p "$PROJECT_NAME" down -v || true)
log "compose stack removed"
log "you may remove data under $pkg_root/private if you want a clean slate"
|
||||
|
||||
BIN
doc/metric_lists.xlsx
Normal file
BIN
doc/metric_lists.xlsx
Normal file
Binary file not shown.
@ -1,5 +1,5 @@
|
||||
DATA_ROOT=/home/argus/tmp/private/argus
|
||||
ARGUS_UID=1048
|
||||
ARGUS_GID=1048
|
||||
ARGUS_BUILD_UID=1048
|
||||
ARGUS_BUILD_GID=1048
|
||||
|
||||
USE_INTRANET=false
|
||||
USE_INTRANET=false
|
||||
@ -1,19 +0,0 @@
|
||||
global:
|
||||
resolve_timeout: 5m
|
||||
|
||||
route:
|
||||
group_by: ['alertname', 'instance'] # 分组:相同 alertname + instance 的告警合并
|
||||
group_wait: 30s # 第一个告警后,等 30s 看是否有同组告警一起发
|
||||
group_interval: 5m # 同组告警变化后,至少 5 分钟再发一次
|
||||
repeat_interval: 3h # 相同告警,3 小时重复提醒一次
|
||||
receiver: 'null'
|
||||
|
||||
receivers:
|
||||
- name: 'null'
|
||||
|
||||
inhibit_rules:
|
||||
- source_match:
|
||||
severity: 'critical' # critical 告警存在时
|
||||
target_match:
|
||||
severity: 'warning' # 抑制相同 instance 的 warning 告警
|
||||
equal: ['instance']
|
||||
@ -1 +0,0 @@
|
||||
172.18.0.2
|
||||
@ -1,19 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
root="$(cd "$(dirname "${BASH_SOURCE[0]}")/../" && pwd)"
|
||||
project_root="$(cd "$root/../../.." && pwd)"
|
||||
|
||||
source "$project_root/scripts/common/build_user.sh"
|
||||
load_build_user
|
||||
|
||||
# 创建新的private目录结构 (基于argus目录结构)
|
||||
echo "[INFO] Creating private directory structure for supervisor-based containers..."
|
||||
mkdir -p "$root/private/argus/alert/alertmanager"
|
||||
mkdir -p "$root/private/argus/etc/"
|
||||
|
||||
# 设置数据目录权限
|
||||
echo "[INFO] Setting permissions for data directories..."
|
||||
chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" "$root/private/argus/alert/alertmanager" 2>/dev/null || true
|
||||
chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" "$root/private/argus/etc" 2>/dev/null || true
|
||||
|
||||
echo "[INFO] Supervisor-based containers will manage their own scripts and configurations"
|
||||
@ -1,10 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
cd "$(dirname "$0")/.."
|
||||
compose_cmd="docker compose"
|
||||
if ! $compose_cmd version >/dev/null 2>&1; then
|
||||
if command -v docker-compose >/dev/null 2>&1; then compose_cmd="docker-compose"; else
|
||||
echo "需要 Docker Compose,请安装后重试" >&2; exit 1; fi
|
||||
fi
|
||||
$compose_cmd -p alert-mvp up -d --remove-orphans
|
||||
echo "[OK] 服务已启动:Alertmanager http://localhost:9093"
|
||||
@ -1,106 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
# ==========================================================
|
||||
# Alertmanager 测试脚本
|
||||
# ==========================================================
|
||||
|
||||
ALERTMANAGER_URL="http://localhost:9093"
|
||||
TEST_ALERT_NAME_CRITICAL="NodeDown"
|
||||
TEST_ALERT_NAME_WARNING="HighCPU"
|
||||
TMP_LOG="/tmp/test-alertmanager.log"
|
||||
|
||||
# 等待参数
|
||||
am_wait_attempts=30
|
||||
am_wait_interval=2
|
||||
|
||||
GREEN="\033[1;32m"
|
||||
RED="\033[1;31m"
|
||||
YELLOW="\033[1;33m"
|
||||
RESET="\033[0m"
|
||||
|
||||
# ==========================================================
|
||||
# 函数定义
|
||||
# ==========================================================
|
||||
|
||||
wait_for_alertmanager() {
|
||||
local attempt=1
|
||||
echo "[INFO] 等待 Alertmanager 启动中..."
|
||||
while (( attempt <= am_wait_attempts )); do
|
||||
if curl -fsS "${ALERTMANAGER_URL}/api/v2/status" >/dev/null 2>&1; then
|
||||
echo -e "${GREEN}[OK] Alertmanager 已就绪 (attempt=${attempt}/${am_wait_attempts})${RESET}"
|
||||
return 0
|
||||
fi
|
||||
echo "[..] Alertmanager 尚未就绪 (${attempt}/${am_wait_attempts})"
|
||||
sleep "${am_wait_interval}"
|
||||
(( attempt++ ))
|
||||
done
|
||||
echo -e "${RED}[ERROR] Alertmanager 在 ${am_wait_attempts} 次尝试后仍未就绪${RESET}"
|
||||
return 1
|
||||
}
|
||||
|
||||
log_step() {
|
||||
echo -e "${YELLOW}==== $1 ====${RESET}"
|
||||
}
|
||||
|
||||
# ==========================================================
|
||||
# 主流程
|
||||
# ==========================================================
|
||||
|
||||
log_step "测试 Alertmanager 开始"
|
||||
echo "[INFO] Alertmanager 地址: $ALERTMANAGER_URL"
|
||||
|
||||
# Step 1: 等待 Alertmanager 启动
|
||||
wait_for_alertmanager
|
||||
|
||||
# Step 2: 触发一个critical测试告警
|
||||
echo "[INFO] 发送critical测试告警..."
|
||||
curl -fsS -X POST "${ALERTMANAGER_URL}/api/v2/alerts" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '[
|
||||
{
|
||||
"labels": {
|
||||
"alertname": "'"${TEST_ALERT_NAME_CRITICAL}"'",
|
||||
"instance": "node-1",
|
||||
"severity": "critical"
|
||||
},
|
||||
"annotations": {
|
||||
"summary": "节点 node-1 宕机"
|
||||
}
|
||||
}
|
||||
]' \
|
||||
-o "$TMP_LOG"
|
||||
|
||||
if [ $? -eq 0 ]; then
|
||||
echo -e "${GREEN}[OK] 已成功发送critical测试告警${RESET}"
|
||||
else
|
||||
echo -e "${RED}[ERROR] critical告警发送失败!${RESET}"
|
||||
cat "$TMP_LOG"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Step 3: 触发一个warning测试告警
|
||||
echo "[INFO] 发送warning测试告警..."
|
||||
curl -fsS -X POST "${ALERTMANAGER_URL}/api/v2/alerts" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '[
|
||||
{
|
||||
"labels": {
|
||||
"alertname": "'"${TEST_ALERT_NAME_WARNING}"'",
|
||||
"instance": "node-1",
|
||||
"severity": "warning"
|
||||
},
|
||||
"annotations": {
|
||||
"summary": "节点 node-1 CPU 使用率过高"
|
||||
}
|
||||
}
|
||||
]' \
|
||||
-o "$TMP_LOG"
|
||||
|
||||
if [ $? -eq 0 ]; then
|
||||
echo -e "${GREEN}[OK] 已成功发送warning测试告警${RESET}"
|
||||
else
|
||||
echo -e "${RED}[ERROR] warning告警发送失败!${RESET}"
|
||||
cat "$TMP_LOG"
|
||||
exit 1
|
||||
fi
|
||||
@ -1,71 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
# ==========================================================
|
||||
# Alertmanager 测试脚本(含启动等待)
|
||||
# ==========================================================
|
||||
|
||||
ALERTMANAGER_URL="http://localhost:9093"
|
||||
TEST_ALERT_NAME_CRITICAL="NodeDown"
|
||||
TEST_ALERT_NAME_WARNING="HighCPU"
|
||||
TMP_LOG="/tmp/test-alertmanager.log"
|
||||
|
||||
# 等待参数
|
||||
am_wait_attempts=30
|
||||
am_wait_interval=2
|
||||
|
||||
GREEN="\033[1;32m"
|
||||
RED="\033[1;31m"
|
||||
YELLOW="\033[1;33m"
|
||||
RESET="\033[0m"
|
||||
|
||||
# ==========================================================
|
||||
# 函数定义
|
||||
# ==========================================================
|
||||
|
||||
wait_for_alertmanager() {
|
||||
local attempt=1
|
||||
echo "[INFO] 等待 Alertmanager 启动中..."
|
||||
while (( attempt <= am_wait_attempts )); do
|
||||
if curl -fsS "${ALERTMANAGER_URL}/api/v2/status" >/dev/null 2>&1; then
|
||||
echo -e "${GREEN}[OK] Alertmanager 已就绪 (attempt=${attempt}/${am_wait_attempts})${RESET}"
|
||||
return 0
|
||||
fi
|
||||
echo "[..] Alertmanager 尚未就绪 (${attempt}/${am_wait_attempts})"
|
||||
sleep "${am_wait_interval}"
|
||||
(( attempt++ ))
|
||||
done
|
||||
echo -e "${RED}[ERROR] Alertmanager 在 ${am_wait_attempts} 次尝试后仍未就绪${RESET}"
|
||||
return 1
|
||||
}
|
||||
|
||||
log_step() {
|
||||
echo -e "${YELLOW}==== $1 ====${RESET}"
|
||||
}
|
||||
|
||||
# ==========================================================
|
||||
# 主流程
|
||||
# ==========================================================
|
||||
|
||||
log_step "查询 Alertmanager 当前告警列表开始"
|
||||
echo "[INFO] Alertmanager 地址: $ALERTMANAGER_URL"
|
||||
|
||||
# Step 1: 等待 Alertmanager 启动
|
||||
wait_for_alertmanager
|
||||
|
||||
# Step 2: 查询当前告警列表
|
||||
echo "[INFO] 查询当前告警..."
|
||||
sleep 1
|
||||
curl -fsS "${ALERTMANAGER_URL}/api/v2/alerts" | jq '.' || {
|
||||
echo -e "${RED}[WARN] 无法解析返回 JSON,请检查 jq 是否安装${RESET}"
|
||||
curl -s "${ALERTMANAGER_URL}/api/v2/alerts"
|
||||
}
|
||||
|
||||
# Step 3: 检查告警是否包含 NodeDown
|
||||
if curl -fsS "${ALERTMANAGER_URL}/api/v2/alerts" | grep -q "${TEST_ALERT_NAME_CRITICAL}"; then
|
||||
echo -e "${GREEN}✅ 测试通过:Alertmanager 已成功接收告警 ${TEST_ALERT_NAME_CRITICAL}${RESET}"
|
||||
else
|
||||
echo -e "${RED}❌ 测试失败:未检测到告警 ${TEST_ALERT_NAME_CRITICAL}${RESET}"
|
||||
fi
|
||||
|
||||
log_step "测试结束"
|
||||
@ -1,21 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
cd "$(dirname "$0")/.."
|
||||
compose_cmd="docker compose"
|
||||
if ! $compose_cmd version >/dev/null 2>&1; then
|
||||
if command -v docker-compose >/dev/null 2>&1; then compose_cmd="docker-compose"; else
|
||||
echo "需要 Docker Compose,请安装后重试" >&2; exit 1; fi
|
||||
fi
|
||||
$compose_cmd -p alert-mvp down
|
||||
echo "[OK] 已停止所有容器"
|
||||
|
||||
# 清理private目录内容
|
||||
echo "[INFO] 清理private目录内容..."
|
||||
cd "$(dirname "$0")/.."
|
||||
if [ -d "private" ]; then
|
||||
# 删除private目录及其所有内容
|
||||
rm -rf private
|
||||
echo "[OK] 已清理private目录"
|
||||
else
|
||||
echo "[INFO] private目录不存在,无需清理"
|
||||
fi
|
||||
@ -1,105 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
echo "======================================="
|
||||
echo "ARGUS Alert System End-to-End Test"
|
||||
echo "======================================="
|
||||
echo ""
|
||||
|
||||
# 记录测试开始时间
|
||||
test_start_time=$(date +%s)
|
||||
|
||||
# 函数:等待服务就绪
|
||||
wait_for_services() {
|
||||
echo "[INFO] Waiting for all services to be ready..."
|
||||
local max_attempts=${SERVICE_WAIT_ATTEMPTS:-120}
|
||||
local attempt=1
|
||||
|
||||
while [ $attempt -le $max_attempts ]; do
|
||||
if curl -fs http://localhost:9093/api/v2/status >/dev/null 2>&1; then
|
||||
echo "[OK] All services are ready!"
|
||||
return 0
|
||||
fi
|
||||
echo " Waiting for services... ($attempt/$max_attempts)"
|
||||
sleep 5
|
||||
((attempt++))
|
||||
done
|
||||
|
||||
echo "[ERROR] Services not ready after $max_attempts attempts"
|
||||
return 1
|
||||
}
|
||||
|
||||
# 函数:显示测试步骤
|
||||
show_step() {
|
||||
echo ""
|
||||
echo "🔄 Step $1: $2"
|
||||
echo "----------------------------------------"
|
||||
}
|
||||
|
||||
# 函数:验证步骤结果
|
||||
verify_step() {
|
||||
if [ $? -eq 0 ]; then
|
||||
echo "✅ $1 - SUCCESS"
|
||||
else
|
||||
echo "❌ $1 - FAILED"
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
# 开始端到端测试
|
||||
show_step "1" "Bootstrap - Initialize environment"
|
||||
./scripts/01_bootstrap.sh
|
||||
verify_step "Bootstrap"
|
||||
|
||||
show_step "2" "Startup - Start all services"
|
||||
./scripts/02_up.sh
|
||||
verify_step "Service startup"
|
||||
|
||||
# 等待服务完全就绪
|
||||
wait_for_services || exit 1
|
||||
|
||||
# 发送告警数据
|
||||
show_step "3" "Add alerts - Send test alerts to Alertmanager"
|
||||
./scripts/03_alertmanager_add_alert.sh
|
||||
verify_step "Send test alerts"
|
||||
|
||||
# 查询告警数据
|
||||
show_step "4" "Verify data - Query Alertmanager"
|
||||
./scripts/04_query_alerts.sh
|
||||
verify_step "Data verification"
|
||||
|
||||
|
||||
# 检查服务健康状态
|
||||
show_step "Health" "Check service health"
|
||||
echo "[INFO] Checking service health..."
|
||||
|
||||
# 检查 Alertmanager 状态
|
||||
if curl -fs "http://localhost:9093/api/v2/status" >/dev/null 2>&1; then
|
||||
am_status="available"
|
||||
echo "✅ Alertmanager status: $am_status"
|
||||
else
|
||||
am_status="unavailable"
|
||||
echo "⚠️ Alertmanager status: $am_status"
|
||||
fi
|
||||
verify_step "Service health check"
|
||||
|
||||
# 清理环境
|
||||
show_step "5" "Cleanup - Stop all services"
|
||||
./scripts/05_down.sh
|
||||
verify_step "Service cleanup"
|
||||
|
||||
# 计算总测试时间
|
||||
test_end_time=$(date +%s)
|
||||
total_time=$((test_end_time - test_start_time))
|
||||
|
||||
echo ""
|
||||
echo "======================================="
|
||||
echo "🎉 END-TO-END TEST COMPLETED SUCCESSFULLY!"
|
||||
echo "======================================="
|
||||
echo "📊 Test Summary:"
|
||||
echo " • Total time: ${total_time}s"
|
||||
echo " • Alertmanager status: $am_status"
|
||||
echo " • All services started and stopped successfully"
|
||||
echo ""
|
||||
echo "✅ The ARGUS Alert system is working correctly!"
|
||||
echo ""
|
||||
113
src/alert/tests/scripts/verify_alertmanager.sh
Normal file
113
src/alert/tests/scripts/verify_alertmanager.sh
Normal file
@ -0,0 +1,113 @@
|
||||
#!/bin/bash
# verify_alertmanager.sh
# 用于部署后验证 Prometheus 与 Alertmanager 通信链路是否正常
#
# Post-deploy verification of the Prometheus -> Alertmanager pipeline:
#   1. Alertmanager API is reachable
#   2. a manually POSTed alert is accepted
#   3. Prometheus config references an Alertmanager target
#   4. a temporary always-firing rule is installed and loaded via /-/reload
#   5. the resulting alert shows up in Alertmanager
#   6. the temporary rule is removed and Prometheus reloaded again
#
# Tunables (env): PROM_URL, ALERT_URL, DATA_ROOT

set -euo pipefail

#=============================
# 基础配置
#=============================
PROM_URL="${PROM_URL:-http://prom.metric.argus.com:9090}"
ALERT_URL="${ALERT_URL:-http://alertmanager.alert.argus.com:9093}"
# TODO: 根据实际部署环境调整规则目录
DATA_ROOT="${DATA_ROOT:-/private/argus}"
# Fix: shell assignments must not have spaces around '='. The previous
# 'RULE_DIR = "..."' form tried to execute a command named RULE_DIR and
# aborted the script immediately under 'set -e'.
RULE_DIR="$DATA_ROOT/metric/prometheus/rules"
TMP_RULE="/tmp/test_rule.yml"

#=============================
# 辅助函数
#=============================
GREEN="\033[32m"; RED="\033[31m"; YELLOW="\033[33m"; RESET="\033[0m"

log_info()    { echo -e "${YELLOW}[INFO]${RESET} $1"; }
log_success() { echo -e "${GREEN}[OK]${RESET} $1"; }
log_error()   { echo -e "${RED}[ERROR]${RESET} $1"; }

fail_exit() { log_error "$1"; exit 1; }

#=============================
# Step 1: 检查 Alertmanager 是否可访问
#=============================
log_info "检查 Alertmanager 状态..."
if curl -sSf "${ALERT_URL}/api/v2/status" >/dev/null 2>&1; then
  log_success "Alertmanager 服务正常 (${ALERT_URL})"
else
  fail_exit "无法访问 Alertmanager,请检查端口映射与容器状态。"
fi

#=============================
# Step 2: 手动发送测试告警
#=============================
log_info "发送手动测试告警..."
# -f makes curl return non-zero on HTTP errors; the command substitution is
# double-quoted so 'startsAt' stays one JSON string regardless of IFS.
if curl -fsS -XPOST "${ALERT_URL}/api/v2/alerts" -H "Content-Type: application/json" -d '[
  {
    "labels": {
      "alertname": "ManualTestAlert",
      "severity": "info"
    },
    "annotations": {
      "summary": "This is a test alert from deploy verification"
    },
    "startsAt": "'"$(date -Iseconds)"'"
  }
]' >/dev/null; then
  log_success "测试告警已成功发送到 Alertmanager"
else
  # Previously a failed POST was silently ignored (curl was followed only by
  # '&& log_success'); now the script fails loudly.
  fail_exit "测试告警发送失败,请检查 Alertmanager API。"
fi

#=============================
# Step 3: 检查 Prometheus 配置中是否包含 Alertmanager
#=============================
log_info "检查 Prometheus 是否配置了 Alertmanager..."
if curl -s "${PROM_URL}/api/v1/status/config" | grep -q "alertmanagers"; then
  log_success "Prometheus 已配置 Alertmanager 目标"
else
  fail_exit "Prometheus 未配置 Alertmanager,请检查 prometheus.yml"
fi

#=============================
# Step 4: 创建并加载测试告警规则
#=============================
log_info "创建临时测试规则 ${TMP_RULE} ..."
# Always-firing rule (expr: vector(1)) used purely for pipeline verification.
cat <<EOF > "${TMP_RULE}"
groups:
  - name: deploy-verify-group
    rules:
      - alert: DeployVerifyAlert
        expr: vector(1)
        labels:
          severity: warning
        annotations:
          summary: "Deployment verification alert"
EOF

mkdir -p "${RULE_DIR}"
cp "${TMP_RULE}" "${RULE_DIR}/test_rule.yml"

log_info "重载 Prometheus 以加载新规则..."
# -f: an HTTP-level reload failure must not be reported as success.
if curl -fsS -X POST "${PROM_URL}/-/reload" >/dev/null; then
  log_success "Prometheus 已重载规则"
else
  fail_exit "Prometheus reload 失败,请检查 API 可访问性。"
fi

#=============================
# Step 5: 等待并验证 Alertmanager 是否收到告警
#=============================
log_info "等待告警触发 (约5秒)..."
sleep 5

if curl -s "${ALERT_URL}/api/v2/alerts" | grep -q "DeployVerifyAlert"; then
  log_success "Prometheus → Alertmanager 告警链路验证成功"
else
  fail_exit "未在 Alertmanager 中检测到 DeployVerifyAlert,请检查网络或配置。"
fi

#=============================
# Step 6: 清理测试规则
#=============================
log_info "清理临时测试规则..."
rm -f "${RULE_DIR}/test_rule.yml" "${TMP_RULE}"

curl -fsS -X POST "${PROM_URL}/-/reload" >/dev/null \
  && log_success "Prometheus 已清理验证规则" \
  || log_error "Prometheus reload 清理失败,请手动确认。"

log_success "部署验证全部通过!Prometheus ↔ Alertmanager 通信正常。"
|
||||
@ -1 +1 @@
|
||||
1.35.0
|
||||
1.40.0
|
||||
|
||||
@ -48,6 +48,15 @@ if [[ ${#missing_files[@]} -gt 0 ]]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# 防御:阻止将 Git LFS 指针文件打包
|
||||
for f in bin/dcgm-exporter bin/datacenter-gpu-manager_3.3.9_amd64.deb; do
|
||||
if head -n1 "$f" 2>/dev/null | grep -q '^version https://git-lfs.github.com/spec/v1$'; then
|
||||
echo "[ERROR] $f 是 Git LFS 指针文件,未还原为真实制品"
|
||||
echo " 请在仓库根目录执行: git lfs fetch --all && git lfs checkout"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
log_success "所有必要文件检查完成"
|
||||
|
||||
# 创建临时目录
|
||||
|
||||
@ -47,6 +47,13 @@ if [[ ${#missing_files[@]} -gt 0 ]]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# 防御:阻止将 Git LFS 指针文件打包
|
||||
if head -n1 bin/node_exporter 2>/dev/null | grep -q '^version https://git-lfs.github.com/spec/v1$'; then
|
||||
echo "[ERROR] bin/node_exporter 是 Git LFS 指针文件,未还原为真实二进制"
|
||||
echo " 请在仓库根目录执行: git lfs fetch --all && git lfs checkout"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
log_success "所有必要文件检查完成"
|
||||
|
||||
# 创建临时目录
|
||||
|
||||
@ -216,6 +216,36 @@ if [[ ${#missing_components[@]} -gt 0 ]]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# 额外校验:阻止将 Git LFS 指针文件打进安装包
|
||||
# 仅检查各组件目录下的 bin/ 内文件(常见为二进制或 .deb/.tar.gz 制品)
|
||||
is_lfs_pointer() {
|
||||
local f="$1"
|
||||
# 读取首行判断是否为 LFS pointer(无需依赖 file 命令)
|
||||
head -n1 "$f" 2>/dev/null | grep -q '^version https://git-lfs.github.com/spec/v1$'
|
||||
}
|
||||
|
||||
log_info "检查组件二进制是否已从 LFS 拉取..."
|
||||
while IFS= read -r component; do
|
||||
component_path=$(grep "^$component:" "$TEMP_DIR/component_paths.txt" | cut -d':' -f2-)
|
||||
bin_dir="$component_path/bin"
|
||||
[[ -d "$bin_dir" ]] || continue
|
||||
while IFS= read -r f; do
|
||||
# 只检查常见可执行/包后缀;无后缀的也检查
|
||||
case "$f" in
|
||||
*.sh) continue;;
|
||||
*) :;;
|
||||
esac
|
||||
if is_lfs_pointer "$f"; then
|
||||
log_error "检测到 Git LFS 指针文件: $f"
|
||||
log_error "请在仓库根目录执行: git lfs fetch --all && git lfs checkout"
|
||||
log_error "或确保 CI 在打包前已还原 LFS 大文件。"
|
||||
rm -rf "$TEMP_DIR"
|
||||
exit 1
|
||||
fi
|
||||
done < <(find "$bin_dir" -maxdepth 1 -type f 2>/dev/null | sort)
|
||||
done < "$COMPONENTS_FILE"
|
||||
log_success "LFS 校验通过:未发现指针文件"
|
||||
|
||||
# 打包各个组件
|
||||
log_info "开始打包组件..."
|
||||
|
||||
@ -234,7 +264,19 @@ while IFS= read -r component; do
|
||||
|
||||
# 进入组件目录
|
||||
cd "$component_path"
|
||||
|
||||
|
||||
# 组件内二次防御:若包脚本缺失 LFS 校验,这里再次阻断
|
||||
if [[ -d bin ]]; then
|
||||
for f in bin/*; do
|
||||
[[ -f "$f" ]] || continue
|
||||
if head -n1 "$f" 2>/dev/null | grep -q '^version https://git-lfs.github.com/spec/v1$'; then
|
||||
log_error "组件 $component 含 LFS 指针文件: $f"
|
||||
log_error "请执行: git lfs fetch --all && git lfs checkout"
|
||||
cd "$CURRENT_DIR"; rm -rf "$TEMP_DIR"; exit 1
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
||||
# 检查组件是否有 package.sh
|
||||
if [[ ! -f "package.sh" ]]; then
|
||||
log_error "$component 缺少 package.sh 文件"
|
||||
|
||||
@ -48,6 +48,31 @@ BACKUPS_DIR="$INSTALL_DIR/backups" # 备份目录
|
||||
CURRENT_LINK="$INSTALL_DIR/current" # 当前版本软链接
|
||||
LATEST_VERSION_FILE="$INSTALL_DIR/LATEST_VERSION" # 当前版本记录文件
|
||||
|
||||
# Pre-flight check: agent metadata / hostname constraints.
# The installer requires an (env, user, instance) identity for the agent,
# supplied either via AGENT_ENV/AGENT_USER/AGENT_INSTANCE environment
# variables or encoded in the hostname as "env-user-instance-<anything>".
# Exits 1 with remediation hints when neither form is satisfied.
require_agent_metadata() {
  local hn
  hn="$(hostname)"
  local ok=false
  # Form A: all three environment variables are set and non-empty.
  if [[ -n "${AGENT_ENV:-}" && -n "${AGENT_USER:-}" && -n "${AGENT_INSTANCE:-}" ]]; then
    ok=true
  fi
  # Form B: hostname looks like env-user-instance-xxx (three dash-separated
  # fields followed by anything).
  if [[ "$hn" =~ ^[^-]+-[^-]+-[^-]+-.*$ ]]; then
    ok=true
  fi
  if [[ "$ok" == false ]]; then
    log_error "检测到 hostname 与 Agent 元数据不完整:"
    log_error " 当前 hostname: $hn"
    log_error " AGENT_ENV='${AGENT_ENV:-}' AGENT_USER='${AGENT_USER:-}' AGENT_INSTANCE='${AGENT_INSTANCE:-}'"
    echo
    log_info "请满足以下其一后重试:"
    log_info " 方式A:设置 hostname 为 env-user-instance-任意,例如 dev-alice-node001-pod-0"
    log_info " 方式B:导出环境变量:export AGENT_ENV=dev AGENT_USER=alice AGENT_INSTANCE=node001"
    exit 1
  fi
}
|
||||
|
||||
# 检查必需的FTP参数
|
||||
check_ftp_params() {
|
||||
local missing_params=()
|
||||
@ -873,6 +898,47 @@ rollback_version() {
|
||||
fi
|
||||
}
|
||||
|
||||
# Post-install selfcheck: wait (up to 5 minutes) for node.json to appear and
# report all four metric components healthy, then verify that last_report
# keeps advancing — i.e. the agent is actively reporting, not a stale file.
# Returns 0 on success, 1 on timeout.
selfcheck_post_install() {
  local hn="$(hostname)"
  # node.json is written by the agent under its hostname directory.
  local node_file="/private/argus/agent/${AGENT_HOSTNAME:-$hn}/node.json"
  # Absolute deadline: now + 300 seconds (5 minutes).
  local deadline=$(( $(date +%s) + 300 ))
  # t1/t2 capture two successive last_report values for comparison.
  local t1="" t2=""
  while :; do
    if [[ -f "$node_file" ]]; then
      if command -v jq >/dev/null 2>&1; then
        local ok_health lr
        # All four components must report status == "healthy"; any jq failure
        # (missing keys, malformed JSON) evaluates to "false".
        ok_health=$(jq -er '(.health["metric-argus-agent"].status=="healthy") and (.health["metric-node-exporter"].status=="healthy") and (.health["metric-fluent-bit"].status=="healthy") and (.health["metric-dcgm-exporter"].status=="healthy")' "$node_file" 2>/dev/null || echo false)
        lr=$(jq -r '.last_report // ""' "$node_file" 2>/dev/null)
        if [[ "$ok_health" == true && -n "$lr" ]]; then
          if [[ -z "$t1" ]]; then
            t1="$lr"
            # The agent reports every 60s by default; wait 70s, then re-check.
            sleep 70
            continue
          fi
          t2="$lr"
          # last_report advanced between samples => agent is alive.
          if [[ "$t2" != "$t1" ]]; then
            return 0
          fi
          # Not changed yet: keep polling until the deadline.
          sleep 10
        fi
      else
        # Lenient fallback when jq is unavailable: any "healthy" status counts.
        if grep -q '"status"\s*:\s*"healthy"' "$node_file"; then
          return 0
        fi
      fi
    fi
    if (( $(date +%s) >= deadline )); then
      log_error "自检超时:未在 5 分钟内确认 last_report 持续更新 或 健康状态不满足(路径:$node_file)"
      return 1
    fi
    sleep 5
  done
}
|
||||
|
||||
# 主函数
|
||||
main() {
|
||||
echo "=========================================="
|
||||
@ -912,17 +978,26 @@ main() {
|
||||
# return 0
|
||||
# fi
|
||||
|
||||
check_ftp_params
|
||||
check_system
|
||||
check_ftp_params
|
||||
check_system
|
||||
require_agent_metadata
|
||||
|
||||
if [[ "$ACTION" == "uninstall" ]]; then
|
||||
uninstall_argus_metric
|
||||
else
|
||||
install_argus_metric
|
||||
fi
|
||||
|
||||
|
||||
# 安装后自检:最多等待 5 分钟,确认 node.json 存在且健康
|
||||
echo
|
||||
log_info "操作完成!"
|
||||
log_info "开始安装后自检(最多等待 5 分钟)..."
|
||||
selfcheck_post_install || {
|
||||
log_error "安装后自检未通过,请查看 /var/log/argus-agent.log 以及 /opt/argus-metric/versions/*/.install.log"
|
||||
exit 1
|
||||
}
|
||||
|
||||
echo
|
||||
log_success "全部自检通过,安装完成!"
|
||||
}
|
||||
|
||||
# 脚本入口
|
||||
|
||||
@ -67,7 +67,8 @@ RUN chmod +x /usr/local/bin/start-ftp-supervised.sh
|
||||
COPY vsftpd.conf /etc/vsftpd/vsftpd.conf
|
||||
|
||||
COPY dns-monitor.sh /usr/local/bin/dns-monitor.sh
|
||||
RUN chmod +x /usr/local/bin/dns-monitor.sh
|
||||
COPY dns-publish.sh /usr/local/bin/dns-publish.sh
|
||||
RUN chmod +x /usr/local/bin/dns-monitor.sh /usr/local/bin/dns-publish.sh
|
||||
|
||||
USER root
|
||||
|
||||
|
||||
@ -66,6 +66,17 @@ ${FTP_BASE_PATH}/
|
||||
|
||||
/private/argus/etc/
|
||||
└── ${DOMAIN} # 容器IP记录文件
|
||||
|
||||
## DNS 同步到 FTP share(运行期)
|
||||
|
||||
- 运行期最新的 DNS 列表由 bind/master 写入挂载点 `/private/argus/etc/dns.conf`。
|
||||
- FTP 容器内置 `dns-publish`(Supervised):每 10s 比较并将该文件原子同步为 `${FTP_BASE_PATH}/share/dns.conf`,供客户端下载安装脚本直接读取。
|
||||
- 同步特性:
|
||||
- 原子更新:写入 `${DST}.tmp` 后 `mv -f` 覆盖,避免读到半写文件。
|
||||
- 权限:0644;属主 `${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}`。
|
||||
- 可观测:日志 `/var/log/supervisor/dns-publish.log`。
|
||||
|
||||
> 注:构建/发布阶段可能也会将静态 `config/dns.conf` 拷贝到 share;当 FTP 容器运行后,dns-publish 会用运行期最新文件覆盖该静态文件。
|
||||
```
|
||||
|
||||
## vsftpd 配置说明
|
||||
@ -156,4 +167,4 @@ curl -fsS 'ftp://ftpuser:ZGClab1234!@177.177.70.200/setup.sh' -o setup.sh
|
||||
# root用户直接执行,非root用户需要使用sudo
|
||||
chmod +x setup.sh
|
||||
bash setup.sh --server <域名> --user ftpuser --password 'ZGClab1234!'
|
||||
```
|
||||
```
|
||||
|
||||
40
src/metric/ftp/build/dns-publish.sh
Normal file
40
src/metric/ftp/build/dns-publish.sh
Normal file
@ -0,0 +1,40 @@
|
||||
#!/bin/bash
# dns-publish.sh — supervised daemon that keeps the FTP share's dns.conf in
# sync with the runtime DNS list written by bind/master.
#
# Note: '-e' is intentionally NOT set — a transient compare/copy failure must
# not kill the daemon; errors are logged and retried on the next cycle.
set -uo pipefail

# Publish latest /private/argus/etc/dns.conf to ${FTP_BASE_PATH}/share/dns.conf

SRC="/private/argus/etc/dns.conf"
FTP_BASE_PATH="${FTP_BASE_PATH:-/private/argus/ftp}"
DST_DIR="${FTP_BASE_PATH}/share"
DST="${DST_DIR}/dns.conf"
# Ownership applied to the published file (build-time UID/GID, overridable).
UID_VAL="${ARGUS_BUILD_UID:-2133}"
GID_VAL="${ARGUS_BUILD_GID:-2015}"
# Poll interval in seconds between sync checks.
INTERVAL="${DNS_PUBLISH_INTERVAL:-10}"

# Timestamped log line prefixed with the service tag.
log() { echo "$(date '+%Y-%m-%d %H:%M:%S') [DNS-Publish] $*"; }

# Best effort: the share directory normally exists already.
mkdir -p "$DST_DIR" 2>/dev/null || true

log "service start: SRC=$SRC DST=$DST interval=${INTERVAL}s"

while true; do
  if [[ -f "$SRC" ]]; then
    # Only sync when content differs
    if ! cmp -s "$SRC" "$DST" 2>/dev/null; then
      tmp="${DST}.tmp"
      # Atomic update: stage into .tmp, then rename over the destination so
      # readers never observe a half-written file.
      if cp "$SRC" "$tmp" 2>/dev/null; then
        mv -f "$tmp" "$DST"
        chown "$UID_VAL":"$GID_VAL" "$DST" 2>/dev/null || true
        chmod 0644 "$DST" 2>/dev/null || true
        # Source mtime is logged for traceability; "?" when stat fails.
        ts_src=$(date -r "$SRC" '+%Y-%m-%dT%H:%M:%S%z' 2>/dev/null || echo "?")
        log "synced dns.conf (src mtime=$ts_src) -> $DST"
      else
        log "ERROR: copy failed $SRC -> $tmp"
      fi
    fi
  else
    log "waiting for source $SRC"
  fi
  sleep "$INTERVAL"
done
|
||||
|
||||
@ -28,6 +28,18 @@ stopwaitsecs=10
|
||||
killasgroup=true
|
||||
stopasgroup=true
|
||||
|
||||
[program:dns-publish]
|
||||
command=/usr/local/bin/dns-publish.sh
|
||||
user=root
|
||||
stdout_logfile=/var/log/supervisor/dns-publish.log
|
||||
stderr_logfile=/var/log/supervisor/dns-publish_error.log
|
||||
autorestart=true
|
||||
startretries=3
|
||||
startsecs=5
|
||||
stopwaitsecs=10
|
||||
killasgroup=true
|
||||
stopasgroup=true
|
||||
|
||||
[unix_http_server]
|
||||
file=/var/run/supervisor.sock
|
||||
chmod=0700
|
||||
|
||||
7
src/sys/tests/.gitignore
vendored
Normal file
7
src/sys/tests/.gitignore
vendored
Normal file
@ -0,0 +1,7 @@
|
||||
|
||||
private/
|
||||
private-nodea/
|
||||
private-nodeb/
|
||||
tmp/
|
||||
|
||||
.env
|
||||
@ -1,13 +1,17 @@
|
||||
# ARGUS 系统级端到端测试(Sys E2E)
|
||||
|
||||
本目录包含将 log 与 agent 两线验证合并后的系统级端到端测试。依赖 bind/master/es/kibana + 两个“日志节点”(每个节点容器内同时运行 Fluent Bit 与 argus-agent)。
|
||||
本目录包含将 log、metric 与 agent 三线验证合并后的系统级端到端测试。依赖 bind/master/es/kibana/metric(ftp+prometheus+grafana+alertmanager)/web-proxy/web-frontend + 两个“计算节点”(每个节点容器内同时运行 Fluent Bit 与 argus-agent)。
|
||||
|
||||
---
|
||||
|
||||
## 一、如何运行
|
||||
|
||||
- 前置条件
|
||||
- 已构建镜像:`argus-elasticsearch:latest`、`argus-kibana:latest`、`argus-bind9:latest`、`argus-master:latest`
|
||||
- 已构建镜像:
|
||||
- 基座:`argus-elasticsearch:latest`、`argus-kibana:latest`、`argus-bind9:latest`、`argus-master:latest`
|
||||
- 节点:`argus-sys-node:latest`
|
||||
- 指标:`argus-metric-ftp:latest`、`argus-metric-prometheus:latest`、`argus-metric-grafana:latest`、`argus-alertmanager:latest`
|
||||
- 前端与代理:`argus-web-frontend:latest`、`argus-web-proxy:latest`
|
||||
- 可用根目录命令构建:`./build/build_images.sh [--intranet]`
|
||||
- 主机具备 Docker 与 Docker Compose。
|
||||
|
||||
@ -33,11 +37,12 @@
|
||||
- 一键执行
|
||||
- `cd src/sys/tests`
|
||||
- `./scripts/00_e2e_test.sh`(CPU-only)或 `./scripts/00_e2e_test.sh --enable-gpu`(启用 GPU 流程)
|
||||
- 可选:`--no-clean` 跳过清理,便于失败后现场排查
|
||||
|
||||
- 分步执行(推荐用于排查)
|
||||
- `./scripts/01_bootstrap.sh` 生成目录/拷贝 `update-dns.sh`/构建 agent 二进制/写 `.env`
|
||||
- `./scripts/02_up.sh` 启动 Compose 栈(工程名 `argus-sys`)
|
||||
- `./scripts/03_wait_ready.sh` 等待 ES/Kibana/Master/Fluent‑Bit/Bind 就绪(Kibana 必须返回 200 且 overall.level=available)
|
||||
- `./scripts/03_wait_ready.sh` 等待 ES/Kibana/Master/Fluent‑Bit/Bind/Prometheus/Grafana/Alertmanager/Web‑Proxy 就绪(Kibana 必须 200 且 overall.level=available;Web‑Proxy 8084/8085 要有 CORS 头)
|
||||
- `./scripts/04_verify_dns_routing.sh` 校验 bind 解析与节点内域名解析
|
||||
- `./scripts/05_agent_register.sh` 获取两个节点的 `node_id` 与初始 IP,检查本地 `node.json`
|
||||
- `./scripts/06_write_health_and_assert.sh` 写健康文件并断言 `nodes.json` 仅包含 2 个在线节点
|
||||
@ -47,6 +52,8 @@
|
||||
- `./scripts/11_metric_node_install.sh` 在 CPU 节点安装并验证端点
|
||||
- `./scripts/12_metric_gpu_install.sh` 在 GPU 节点安装并等待 9100/9400 就绪(仅启用 GPU 时)
|
||||
- `./scripts/13_metric_verify.sh` 对 master/Prometheus/数据面/Grafana 做综合校验(含 GPU 时校验 dcgm 指标)
|
||||
- `./scripts/15_alert_verify.sh` 对alertmanager进行校验
|
||||
- `./scripts/16_web_verify.sh` 对web页面进行校验综合校验。
|
||||
- `./scripts/14_metric_cleanup.sh` 清理 FTP 产物
|
||||
- `./scripts/09_down.sh` 回收容器、网络并清理 `private*/`、`tmp/`
|
||||
|
||||
@ -58,7 +65,7 @@
|
||||
## 二、测试部署架构(docker-compose)
|
||||
|
||||
- 网络
|
||||
- 自定义 bridge:`argus-sys-net`,子网 `172.31.0.0/16`
|
||||
- 自定义 bridge:`sysnet`(Compose 工程名为 `argus-sys` 时实际为 `argus-sys_sysnet`),子网 `172.31.0.0/16`
|
||||
- 固定地址:bind=`172.31.0.2`,master=`172.31.0.10`
|
||||
|
||||
- 服务与端口(宿主机映射端口由 `01_bootstrap.sh` 自动分配并写入 `.env`)
|
||||
@ -66,9 +73,15 @@
|
||||
- `bind`(`argus-bind9:latest`):监听 53/tcp+udp;负责同步 `*.argus.com` 记录
|
||||
- `master`(`argus-master:latest`):对外 `${MASTER_PORT}→3000`;API `http://localhost:${MASTER_PORT}`
|
||||
- `es`(`argus-elasticsearch:latest`):`${ES_HTTP_PORT}→9200`;单节点,无安全
|
||||
- `kibana`(`argus-kibana:latest`):`${KIBANA_PORT}→5601`;通过 `ELASTICSEARCH_HOSTS=http://es:9200` 访问 ES
|
||||
- `node-a`(`ubuntu:22.04`):同时运行 Fluent Bit + argus-agent,`hostname=dev-yyrshare-nbnyx10-cp2f-pod-0`,`${NODE_A_PORT}→2020`
|
||||
- `node-b`(`ubuntu:22.04`):同时运行 Fluent Bit + argus-agent,`hostname=dev-yyrshare-uuuu10-ep2f-pod-0`,`${NODE_B_PORT}→2020`
|
||||
- `kibana`(`argus-kibana:latest`):`${KIBANA_PORT}→5601`
|
||||
- `node-a`(`argus-sys-node:latest`):同时运行 Fluent Bit + argus-agent,`hostname=dev-yyrshare-nbnyx10-cp2f-pod-0`,`${NODE_A_PORT}→2020`
|
||||
- `node-b`(`argus-sys-node:latest`):同时运行 Fluent Bit + argus-agent,`hostname=dev-yyrshare-uuuu10-ep2f-pod-0`,`${NODE_B_PORT}→2020`
|
||||
- `ftp`(`argus-metric-ftp:latest`):`${FTP_PORT}→21`/`${FTP_DATA_PORT}→20`/`${FTP_PASSIVE_HOST_RANGE}` 被动端口
|
||||
- `prometheus`(`argus-metric-prometheus:latest`):`${PROMETHEUS_PORT}→9090`
|
||||
- `grafana`(`argus-metric-grafana:latest`):`${GRAFANA_PORT}→3000`
|
||||
- `alertmanager`(`argus-alertmanager:latest`):`${ALERTMANAGER_PORT}→9093`
|
||||
- `web-frontend`(`argus-web-frontend:latest`):内部访问页面,使用 `web-proxy` 暴露的对外端口渲染超链
|
||||
- `web-proxy`(`argus-web-proxy:latest`):多端口转发 8080..8085(首页、Grafana、Prometheus、Kibana、Alertmanager、Master API)
|
||||
|
||||
- 卷与目录
|
||||
- 核心服务(bind/master/es/kibana)共享宿主 `./private` 挂载到容器 `/private`
|
||||
@ -83,7 +96,7 @@
|
||||
|
||||
- 节点入口
|
||||
- `scripts/node_entrypoint.sh`:
|
||||
- 复制 `/assets/fluent-bit/*` 到容器 `/private`,后台启动 Fluent Bit(监听 2020)
|
||||
- 离线优先:将 `/assets/fluent-bit/packages` 与 `etc` 拷贝到 `/private`,执行 `/private/start-fluent-bit.sh` 安装/拉起 Fluent Bit(监听 2020)
|
||||
- 以运行用户(映射 UID/GID)前台启动 `argus-agent`
|
||||
- 节点环境变量:`MASTER_ENDPOINT=http://master.argus.com:3000`、`REPORT_INTERVAL_SECONDS=2`、`ES_HOST=es`、`ES_PORT=9200`、`CLUSTER=local`、`RACK=dev`
|
||||
|
||||
@ -106,6 +119,10 @@
|
||||
- Master `/readyz` 成功
|
||||
- Fluent Bit 指标接口 `:2020/:2021` 可访问
|
||||
- bind `named-checkconf` 通过
|
||||
- Prometheus `/-/ready` 可用
|
||||
- Grafana `GET /api/health` 返回 200 且 `database=ok`
|
||||
- Alertmanager `GET /api/v2/status` 成功
|
||||
- Web‑Proxy:8080 首页 200;8083 首页 200/302;8084/8085 对来自 8080 的请求需返回 `Access-Control-Allow-Origin`(CORS)
|
||||
|
||||
- `04_verify_dns_routing.sh`
|
||||
- 目的:验证从 bind → 节点容器的解析链路
|
||||
@ -133,6 +150,13 @@
|
||||
- `09_down.sh`
|
||||
- 目的:栈销毁与环境清理;必要时使用临时容器修正属主再删除 `private*` 目录
|
||||
|
||||
- `15_alert_verify.sh`
|
||||
- 目的:验证alertmanager的可用性、Prometheus到alertmanager的连通性。
|
||||
- 操作:在Prometheus中增加一个恒为真的告警规则,查看alertmanager是否收到该告警
|
||||
- `16_web_verify.sh`
|
||||
- 目的:验证web页面是否可用。
|
||||
- 使用playwright分别验证各个模块的页面是否可用,以及符合预期。
|
||||
|
||||
---
|
||||
|
||||
### 常见问题与排查
|
||||
@ -142,6 +166,28 @@
|
||||
|
||||
---
|
||||
|
||||
## 注意事项(2025‑10‑29 更新)
|
||||
|
||||
- 宿主 inotify 限制导致 03 卡住(Fluent Bit in_tail EMFILE)
|
||||
- 现象:`03_wait_ready.sh` 一直等待 `:2020/:2021 /api/v2/metrics`;节点日志出现 `tail_fs_inotify.c errno=24 Too many open files`,Fluent Bit 启动失败。
|
||||
- 根因:宿主 `fs.inotify.max_user_instances` 上限过低(常见默认 128),被其他进程占满;并非容器内 `ulimit -n` 过低。
|
||||
- 处理:在宿主执行(临时):
|
||||
- `sudo sysctl -w fs.inotify.max_user_instances=1024 fs.inotify.max_user_watches=1048576`
|
||||
- 建议永久:写入 `/etc/sysctl.d/99-argus-inotify.conf` 后 `sudo sysctl --system`
|
||||
- 提示:节点入口里对 sysctl 的写操作不影响宿主;需在宿主调整。
|
||||
|
||||
- Metric 安装制品包含 Git LFS 指针导致 node‑exporter 启动失败
|
||||
- 现象:第 11 步在线安装后,日志显示 `Node Exporter 服务启动失败`;容器内 `/usr/local/bin/node-exporter` 头部是文本:`version https://git-lfs.github.com/spec/v1`。
|
||||
- 根因:发布到 FTP 的安装包在打包前未执行 `git lfs fetch/checkout`,将指针文件打入制品。
|
||||
- 处理:在仓库根目录执行 `git lfs fetch --all && git lfs checkout` 后,重跑 `src/metric/tests/scripts/02_publish_artifact.sh` 再重试 `11_metric_node_install.sh`。
|
||||
- 防呆:已在 `all-in-one-full/scripts/package_artifact.sh` 与组件 `plugins/*/package.sh` 增加 LFS 指针校验,发现即失败并提示修复。
|
||||
|
||||
建议:
|
||||
- 运行前检查宿主 inotify 值(≥1024/≥1048576)与宿主端口占用(8080..8085、9200/5601/9090/9093/2020/2021/32300 等)。
|
||||
- 如需排查失败,使用 `--no-clean` 保留现场,配合 `docker logs`、`curl` 与 `tmp/*.json` 进行定位。
|
||||
|
||||
---
|
||||
|
||||
如需更严格的断言(例如 Kibana 载入具体插件、ES 文档字段校验),可在 `07_*.sh` 中追加查询与校验逻辑。
|
||||
|
||||
---
|
||||
|
||||
@ -55,6 +55,8 @@ SCRIPTS=(
|
||||
"11_metric_node_install.sh"
|
||||
"12_metric_gpu_install.sh"
|
||||
"13_metric_verify.sh"
|
||||
"15_alert_verify.sh"
|
||||
"16_web_verify.sh"
|
||||
)
|
||||
|
||||
# 如未禁用清理,则追加清理与下线步骤(保持原有顺序)
|
||||
|
||||
103
src/sys/tests/scripts/15_alert_verify.sh
Normal file
103
src/sys/tests/scripts/15_alert_verify.sh
Normal file
@ -0,0 +1,103 @@
|
||||
#!/bin/bash
|
||||
# verify_alertmanager.sh
|
||||
# Verify the communication between Prometheus and Alertmanager after deployment
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
echo "[INFO] Verifying Prometheus ↔ Alertmanager communication..."
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
TMP_DIR="$TEST_ROOT/tmp"
|
||||
mkdir -p "$TMP_DIR"
|
||||
|
||||
PRIVATE_CORE="$TEST_ROOT/private"
|
||||
|
||||
#=============================
|
||||
# Load environment variables
|
||||
#=============================
|
||||
if [[ -f "$TEST_ROOT/.env" ]]; then
|
||||
set -a; source "$TEST_ROOT/.env"; set +a
|
||||
fi
|
||||
|
||||
#=============================
|
||||
# Basic configuration
|
||||
#=============================
|
||||
PROM_URL="http://localhost:${PROMETHEUS_PORT:-9090}"
|
||||
ALERT_URL="http://localhost:${ALERTMANAGER_PORT:-9093}"
|
||||
RULE_DIR="$PRIVATE_CORE/argus/metric/prometheus/rules"
|
||||
TMP_RULE="$TMP_DIR/test_rule.yml"
|
||||
|
||||
#=============================
|
||||
# Helper functions
|
||||
#=============================
|
||||
GREEN="\033[32m"; RED="\033[31m"; YELLOW="\033[33m"; RESET="\033[0m"
|
||||
|
||||
log_info() { echo -e "${YELLOW}[INFO]${RESET} $1"; }
|
||||
log_success() { echo -e "${GREEN}[OK]${RESET} $1"; }
|
||||
log_warn() { echo -e "${YELLOW}[WARN]${RESET} $1"; }
|
||||
log_error() { echo -e "${RED}[ERROR]${RESET} $1"; }
|
||||
|
||||
fail_exit() { log_error "$1"; exit 1; }
|
||||
|
||||
#=============================
|
||||
# Step 1: Check Alertmanager accessibility
|
||||
#=============================
|
||||
log_info "Checking Alertmanager status..."
|
||||
if curl -sSf "${ALERT_URL}/api/v2/status" >/dev/null 2>&1; then
|
||||
log_success "Alertmanager is reachable at ${ALERT_URL}"
|
||||
else
|
||||
fail_exit "Alertmanager is not reachable. Please check container or port mapping."
|
||||
fi
|
||||
|
||||
#=============================
|
||||
# Step 2: Create and load a temporary test alert rule
|
||||
#=============================
|
||||
log_info "Creating temporary alert rule at ${TMP_RULE}..."
|
||||
cat <<EOF > "${TMP_RULE}"
|
||||
groups:
|
||||
- name: deploy-verify-group
|
||||
rules:
|
||||
- alert: DeployVerifyAlert
|
||||
expr: vector(1)
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Deployment verification alert"
|
||||
EOF
|
||||
|
||||
mkdir -p "${RULE_DIR}"
|
||||
cp "${TMP_RULE}" "${RULE_DIR}/test_rule.yml"
|
||||
|
||||
log_info "Reloading Prometheus to apply the test rule..."
|
||||
if curl -s -X POST "${PROM_URL}/-/reload" >/dev/null; then
|
||||
log_success "Prometheus successfully reloaded rules"
|
||||
else
|
||||
fail_exit "Failed to reload Prometheus. Check API accessibility."
|
||||
fi
|
||||
|
||||
#=============================
|
||||
# Step 3: Verify alert received by Alertmanager
|
||||
#=============================
|
||||
log_info "Waiting for alert propagation (~30 seconds)..."
|
||||
sleep 30
|
||||
|
||||
if curl -s "${ALERT_URL}/api/v2/alerts" | grep -q "DeployVerifyAlert"; then
|
||||
log_success "Prometheus → Alertmanager alert path verified successfully"
|
||||
else
|
||||
fail_exit "DeployVerifyAlert not found in Alertmanager. Check configuration or network."
|
||||
fi
|
||||
|
||||
#=============================
|
||||
# Step 4: Cleanup test rule
|
||||
#=============================
|
||||
log_info "Cleaning up temporary alert rule..."
|
||||
rm -f "${RULE_DIR}/test_rule.yml" "${TMP_RULE}"
|
||||
|
||||
if curl -s -X POST "${PROM_URL}/-/reload" >/dev/null; then
|
||||
log_success "Prometheus successfully reloaded after cleanup"
|
||||
else
|
||||
log_warn "Prometheus reload after cleanup failed. Please check manually."
|
||||
fi
|
||||
|
||||
log_success "Alertmanager verification completed successfully. Communication with Prometheus is healthy."
|
||||
115
src/sys/tests/scripts/16_web_verify.sh
Normal file
115
src/sys/tests/scripts/16_web_verify.sh
Normal file
@ -0,0 +1,115 @@
|
||||
#!/usr/bin/env bash
|
||||
# verify-web-test.sh
|
||||
# Verify frontend service availability and run Playwright end-to-end tests
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
echo '[INFO] Verifying Web frontend...'
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
REPO_ROOT="$(cd "$TEST_ROOT/../../.." && pwd)"
|
||||
WEB_DIR="$REPO_ROOT/src/web"
|
||||
|
||||
#=============================
|
||||
# Load environment variables
|
||||
#=============================
|
||||
if [[ -f "$TEST_ROOT/.env" ]]; then
|
||||
set -a; source "$TEST_ROOT/.env"; set +a
|
||||
fi
|
||||
|
||||
REPORT_DIR="$WEB_DIR/playwright-report"
|
||||
FRONTEND_URL="http://localhost:${WEB_PROXY_PORT_8080:-8080}"
|
||||
TIMEOUT=120 # max wait time (seconds) for frontend to be ready
|
||||
|
||||
#=============================
|
||||
# Helper functions
|
||||
#=============================
|
||||
GREEN="\033[32m"; RED="\033[31m"; YELLOW="\033[33m"; RESET="\033[0m"
|
||||
|
||||
log_info() { echo -e "${YELLOW}[INFO]${RESET} $1"; }
|
||||
log_success() { echo -e "${GREEN}[OK]${RESET} $1"; }
|
||||
log_warn() { echo -e "${YELLOW}[WARN]${RESET} $1"; }
|
||||
log_error() { echo -e "${RED}[ERROR]${RESET} $1"; }
|
||||
|
||||
fail_exit() { log_error "$1"; exit 1; }
|
||||
|
||||
#=============================
|
||||
# Step 1: Wait for frontend service
|
||||
#=============================
|
||||
log_info "[1/4] Checking if frontend service is up (${FRONTEND_URL})..."
|
||||
|
||||
for ((i=1; i<=TIMEOUT; i++)); do
|
||||
STATUS_CODE=$(curl -s -o /dev/null -w "%{http_code}" "$FRONTEND_URL" || true)
|
||||
if [[ "$STATUS_CODE" == "200" ]]; then
|
||||
log_success "Frontend service is accessible at ${FRONTEND_URL}"
|
||||
break
|
||||
fi
|
||||
sleep 2
|
||||
if [[ $i -eq $TIMEOUT ]]; then
|
||||
fail_exit "Timeout waiting for frontend service to become ready (${TIMEOUT}s)."
|
||||
fi
|
||||
done
|
||||
|
||||
#=============================
|
||||
# Step 2: Run Playwright tests
|
||||
#=============================
|
||||
log_info "[2/4] Running Playwright automated tests in headless mode..."
|
||||
|
||||
cd "$WEB_DIR"
|
||||
|
||||
# Ensure dependencies installed
|
||||
if [ ! -d "node_modules" ]; then
|
||||
log_warn "Dependencies not found. Installing via npm ci..."
|
||||
npm ci
|
||||
fi
|
||||
|
||||
log_info "Checking Playwright browsers..."
|
||||
if [ -d "node_modules/playwright" ]; then
|
||||
log_info "Found node_modules/playwright, checking if browsers are complete..."
|
||||
# 使用 dry-run 确认浏览器是否完整
|
||||
if npx playwright install --dry-run | grep -q "All required browsers are installed"; then
|
||||
log_info "All Playwright browsers are already installed, skipping installation."
|
||||
exit 0
|
||||
else
|
||||
log_info "Playwright browsers incomplete, installing..."
|
||||
fi
|
||||
else
|
||||
log_info "Playwright browsers not found, installing..."
|
||||
npx playwright install --with-deps > /dev/null
|
||||
fi
|
||||
|
||||
# Clean previous reports
|
||||
rm -rf "$REPORT_DIR"
|
||||
|
||||
# Run Playwright tests wrapped with xvfb-run to avoid GUI
|
||||
set +e # temporarily disable exit-on-error
|
||||
env BASE_URL="$FRONTEND_URL" xvfb-run --auto-servernum npx playwright test tests/playwright --reporter=list
|
||||
TEST_RESULT=$?
|
||||
set -e # re-enable strict mode
|
||||
|
||||
#=============================
|
||||
# Step 3: Check test results
|
||||
#=============================
|
||||
log_info "[3/4] Checking test results..."
|
||||
|
||||
if [[ $TEST_RESULT -eq 0 ]]; then
|
||||
log_success "All Playwright tests passed successfully."
|
||||
else
|
||||
log_error "Some Playwright tests failed. Please review the test report."
|
||||
fi
|
||||
|
||||
#=============================
|
||||
# Step 4: Report generation
|
||||
#=============================
|
||||
log_info "[4/4] Checking Playwright report..."
|
||||
|
||||
if [[ -d "$REPORT_DIR" ]]; then
|
||||
log_success "Test report generated at: $REPORT_DIR"
|
||||
echo "You can view it using:"
|
||||
echo " npx playwright show-report"
|
||||
else
|
||||
log_warn "Report directory not found. Check Playwright execution logs."
|
||||
fi
|
||||
|
||||
log_success "Web frontend verify finished."
|
||||
3
src/web/.gitignore
vendored
3
src/web/.gitignore
vendored
@ -1,6 +1,9 @@
|
||||
# Node modules
|
||||
node_modules/
|
||||
|
||||
# playwright report
|
||||
playwright-report/
|
||||
|
||||
# Build output
|
||||
/dist
|
||||
/build
|
||||
|
||||
@ -24,13 +24,18 @@ else
|
||||
fi
|
||||
|
||||
# ========== 读取 DNS ==========
|
||||
if [ -f "$DNS_CONF_PRIVATE" ]; then
|
||||
echo "从 $DNS_CONF_PRIVATE 读取 DNS 服务器..."
|
||||
RESOLVERS=$(awk '/^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$/ {print $1}' "$DNS_CONF_PRIVATE" | tr '\n' ' ')
|
||||
fi
|
||||
RESOLVERS=""
|
||||
# 优先等待 /private/argus/etc/dns.conf 生成并读取其中的 IP
|
||||
for i in $(seq 1 10); do
|
||||
if [ -f "$DNS_CONF_PRIVATE" ]; then
|
||||
RESOLVERS=$(awk '/^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$/{print $1}' "$DNS_CONF_PRIVATE" | tr '\n' ' ')
|
||||
fi
|
||||
[ -n "$RESOLVERS" ] && break
|
||||
sleep 1
|
||||
done
|
||||
|
||||
# 如果 /private 文件不存在则 fallback
|
||||
if [ -z "${RESOLVERS:-}" ]; then
|
||||
# 若仍为空则回退到系统 resolv.conf
|
||||
if [ -z "$RESOLVERS" ]; then
|
||||
echo "未在 $DNS_CONF_PRIVATE 中找到有效 DNS,使用系统 /etc/resolv.conf"
|
||||
RESOLVERS=$(awk '/^nameserver/ {print $2}' "$DNS_CONF_SYSTEM" | tr '\n' ' ')
|
||||
fi
|
||||
@ -47,8 +52,9 @@ echo "检测到 DNS 服务器列表: $RESOLVERS"
|
||||
if [ -f "$TEMPLATE" ]; then
|
||||
echo "从模板生成 nginx.conf ..."
|
||||
# 合并 Docker 内置 DNS 以保障解析 Compose 服务名
|
||||
# 将 127.0.0.11 放在末尾,优先使用 /private/argus/etc/dns.conf 指向的 bind
|
||||
if ! echo " $RESOLVERS " | grep -q " 127.0.0.11 "; then
|
||||
RESOLVERS="127.0.0.11 ${RESOLVERS}"
|
||||
RESOLVERS="${RESOLVERS} 127.0.0.11"
|
||||
fi
|
||||
sed "s|__RESOLVERS__|$RESOLVERS|" "$TEMPLATE" > "$TARGET"
|
||||
else
|
||||
|
||||
64
src/web/package-lock.json
generated
64
src/web/package-lock.json
generated
@ -20,6 +20,7 @@
|
||||
},
|
||||
"devDependencies": {
|
||||
"@eslint/js": "^9.33.0",
|
||||
"@playwright/test": "^1.56.1",
|
||||
"@types/react": "^19.1.10",
|
||||
"@types/react-dom": "^19.1.7",
|
||||
"@vitejs/plugin-react": "^5.0.0",
|
||||
@ -1231,6 +1232,22 @@
|
||||
"react": "^18.x || ^19.x"
|
||||
}
|
||||
},
|
||||
"node_modules/@playwright/test": {
|
||||
"version": "1.56.1",
|
||||
"resolved": "https://registry.npmjs.org/@playwright/test/-/test-1.56.1.tgz",
|
||||
"integrity": "sha512-vSMYtL/zOcFpvJCW71Q/OEGQb7KYBPAdKh35WNSkaZA75JlAO8ED8UN6GUNTm3drWomcbcqRPFqQbLae8yBTdg==",
|
||||
"dev": true,
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"playwright": "1.56.1"
|
||||
},
|
||||
"bin": {
|
||||
"playwright": "cli.js"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
}
|
||||
},
|
||||
"node_modules/@rolldown/pluginutils": {
|
||||
"version": "1.0.0-beta.34",
|
||||
"resolved": "https://registry.npmjs.org/@rolldown/pluginutils/-/pluginutils-1.0.0-beta.34.tgz",
|
||||
@ -2860,6 +2877,53 @@
|
||||
"url": "https://github.com/sponsors/jonschlinkert"
|
||||
}
|
||||
},
|
||||
"node_modules/playwright": {
|
||||
"version": "1.56.1",
|
||||
"resolved": "https://registry.npmjs.org/playwright/-/playwright-1.56.1.tgz",
|
||||
"integrity": "sha512-aFi5B0WovBHTEvpM3DzXTUaeN6eN0qWnTkKx4NQaH4Wvcmc153PdaY2UBdSYKaGYw+UyWXSVyxDUg5DoPEttjw==",
|
||||
"dev": true,
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"playwright-core": "1.56.1"
|
||||
},
|
||||
"bin": {
|
||||
"playwright": "cli.js"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"fsevents": "2.3.2"
|
||||
}
|
||||
},
|
||||
"node_modules/playwright-core": {
|
||||
"version": "1.56.1",
|
||||
"resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.56.1.tgz",
|
||||
"integrity": "sha512-hutraynyn31F+Bifme+Ps9Vq59hKuUCz7H1kDOcBs+2oGguKkWTU50bBWrtz34OUWmIwpBTWDxaRPXrIXkgvmQ==",
|
||||
"dev": true,
|
||||
"license": "Apache-2.0",
|
||||
"bin": {
|
||||
"playwright-core": "cli.js"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
}
|
||||
},
|
||||
"node_modules/playwright/node_modules/fsevents": {
|
||||
"version": "2.3.2",
|
||||
"resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
|
||||
"integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==",
|
||||
"dev": true,
|
||||
"hasInstallScript": true,
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"darwin"
|
||||
],
|
||||
"engines": {
|
||||
"node": "^8.16.0 || ^10.6.0 || >=11.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/postcss": {
|
||||
"version": "8.5.6",
|
||||
"resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.6.tgz",
|
||||
|
||||
@ -7,7 +7,9 @@
|
||||
"dev": "vite",
|
||||
"build": "vite build",
|
||||
"lint": "eslint .",
|
||||
"preview": "vite preview"
|
||||
"preview": "vite preview",
|
||||
"test:web": "playwright test",
|
||||
"test:web:report": "playwright show-report"
|
||||
},
|
||||
"dependencies": {
|
||||
"@emotion/react": "^11.14.0",
|
||||
@ -22,6 +24,7 @@
|
||||
},
|
||||
"devDependencies": {
|
||||
"@eslint/js": "^9.33.0",
|
||||
"@playwright/test": "^1.56.1",
|
||||
"@types/react": "^19.1.10",
|
||||
"@types/react-dom": "^19.1.7",
|
||||
"@vitejs/plugin-react": "^5.0.0",
|
||||
|
||||
28
src/web/playwright.config.ts
Normal file
28
src/web/playwright.config.ts
Normal file
@ -0,0 +1,28 @@
|
||||
import { defineConfig } from '@playwright/test';
|
||||
|
||||
export default defineConfig({
|
||||
testDir: './tests',
|
||||
testIgnore: ['**/src/assets/**', '**/*.png', '**/*.jpg', '**/*.svg'],
|
||||
timeout: 60 * 1000,
|
||||
retries: 1,
|
||||
use: {
|
||||
headless: true,
|
||||
viewport: { width: 1280, height: 720 },
|
||||
ignoreHTTPSErrors: true,
|
||||
screenshot: 'only-on-failure',
|
||||
video: 'retain-on-failure',
|
||||
launchOptions: {
|
||||
args: [
|
||||
'--no-sandbox',
|
||||
'--disable-gpu',
|
||||
'--disable-dev-shm-usage',
|
||||
'--disable-software-rasterizer',
|
||||
'--headless=new'
|
||||
],
|
||||
},
|
||||
},
|
||||
reporter: [
|
||||
['list'],
|
||||
['html', { open: 'never', outputFolder: 'playwright-report' }]
|
||||
]
|
||||
});
|
||||
@ -1,6 +1,5 @@
|
||||
import grafanaLogo from "../assets/grafana.png";
|
||||
import prometheusLogo from "../assets/prometheus.png";
|
||||
import esLogo from "../assets/es.png";
|
||||
import kibanaLogo from "../assets/kibana.png";
|
||||
import { EXTERNAL_HOST } from "./api";
|
||||
|
||||
|
||||
@ -1 +0,0 @@
|
||||
172.18.0.3
|
||||
87
src/web/tests/playwright/alerts.spec.ts
Normal file
87
src/web/tests/playwright/alerts.spec.ts
Normal file
@ -0,0 +1,87 @@
|
||||
import {test, expect} from "@playwright/test";
|
||||
import {BASE_URL} from './helpers/utils'
|
||||
|
||||
test.describe("Alerts 页面功能测试", () => {
|
||||
test.beforeEach(async ({page}) => {
|
||||
await page.goto(`${BASE_URL}/alerts`); // 根据你实际路由调整
|
||||
});
|
||||
|
||||
test("页面加载并显示告警统计", async ({page}) => {
|
||||
await expect(page.locator("text=告警详情").first()).toBeVisible();
|
||||
await expect(page.locator("text=总数").first()).toBeVisible();
|
||||
await expect(page.locator("text=严重").first()).toBeVisible();
|
||||
await expect(page.locator("text=警告").first()).toBeVisible();
|
||||
await expect(page.locator("text=信息").first()).toBeVisible();
|
||||
});
|
||||
|
||||
test("筛选功能验证", async ({ page }) => {
|
||||
// 等待页面加载完成
|
||||
await page.waitForSelector("table");
|
||||
|
||||
// ==========================
|
||||
// 1️⃣ 选择“严重性”= critical
|
||||
// ==========================
|
||||
const severitySelect = page.locator('label:has-text("严重性")').locator('..').locator('input');
|
||||
await severitySelect.click(); // 打开下拉菜单
|
||||
|
||||
const criticalOption = page.locator('[role="option"]:has-text("critical")');
|
||||
await criticalOption.waitFor({ state: 'visible', timeout: 5000 });
|
||||
await criticalOption.click();
|
||||
|
||||
// 验证选择已生效
|
||||
await expect(severitySelect).toHaveValue("critical");
|
||||
|
||||
// ==========================
|
||||
// 2️⃣ 选择“状态”= active
|
||||
// ==========================
|
||||
const stateSelect = page.locator('label:has-text("状态")').locator('..').locator('input');
|
||||
await stateSelect.click();
|
||||
|
||||
const activeOption = page.locator('[role="option"]:has-text("Active")');
|
||||
await activeOption.waitFor({ state: 'visible', timeout: 5000 });
|
||||
await activeOption.click();
|
||||
|
||||
await expect(stateSelect).toHaveValue("Active");
|
||||
|
||||
// ==========================
|
||||
// 4️⃣ 验证筛选结果(可选)
|
||||
// ==========================
|
||||
await page.waitForTimeout(1000);
|
||||
const rows = page.locator('table tbody tr');
|
||||
const count = await rows.count();
|
||||
expect(count).toBeGreaterThanOrEqual(0);
|
||||
});
|
||||
|
||||
|
||||
test("排序功能", async ({page}) => {
|
||||
const severityHeader = page.locator("th:has-text('严重性') button").first();
|
||||
await severityHeader.click(); // 切换升序
|
||||
await severityHeader.click(); // 切换降序
|
||||
|
||||
const instanceHeader = page.locator("th:has-text('节点') button").first();
|
||||
await instanceHeader.click();
|
||||
await instanceHeader.click();
|
||||
});
|
||||
|
||||
test("分页功能", async ({page}) => {
|
||||
const nextButton = page.locator("button:has-text('下一页')").first();
|
||||
const prevButton = page.locator("button:has-text('上一页')").first();
|
||||
|
||||
if (await nextButton.isEnabled()) {
|
||||
await nextButton.click();
|
||||
await expect(prevButton).toBeEnabled();
|
||||
}
|
||||
});
|
||||
|
||||
test("展开更多信息行", async ({page}) => {
|
||||
const infoIcons = page.locator("table tbody tr td [title='显示/隐藏更多信息']");
|
||||
if (await infoIcons.count() > 0) {
|
||||
await infoIcons.first().click();
|
||||
// 展开的详情行应出现
|
||||
const details = page.locator("table tbody tr >> text=alertname");
|
||||
const detailCount = await details.count();
|
||||
expect(detailCount).toBeGreaterThan(0);
|
||||
}
|
||||
});
|
||||
|
||||
});
|
||||
52
src/web/tests/playwright/dashboard.spec.ts
Normal file
52
src/web/tests/playwright/dashboard.spec.ts
Normal file
@ -0,0 +1,52 @@
|
||||
import {test, expect} from '@playwright/test';
|
||||
import {BASE_URL} from './helpers/utils'
|
||||
|
||||
test.describe('Dashboard 页面测试', () => {
|
||||
|
||||
test.beforeEach(async ({page}) => {
|
||||
// 打开仪表盘页面
|
||||
await page.goto(`${BASE_URL}/dashboard`, {waitUntil: 'networkidle'});
|
||||
});
|
||||
|
||||
test('应能成功加载页面并显示标题', async ({page}) => {
|
||||
await expect(page.locator('text=仪表盘').first()).toBeVisible();
|
||||
});
|
||||
|
||||
test('应显示节点健康状态卡片', async ({page}) => {
|
||||
const healthCard = page.locator('text=节点健康状态');
|
||||
await expect(healthCard).toBeVisible();
|
||||
|
||||
// 检查环形图是否渲染
|
||||
const ring = page.locator('svg'); // RingProgress 是 SVG 渲染的
|
||||
const ringCount = await ring.count();
|
||||
expect(ringCount).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
test('应显示告警统计信息', async ({page}) => {
|
||||
const alertCard = page.locator('text=告警统计');
|
||||
await expect(alertCard).toBeVisible();
|
||||
|
||||
// 检查告警类别
|
||||
const labels = ['总数', '严重', '警告', '信息'];
|
||||
for (const label of labels) {
|
||||
await expect(page.locator(`text=${label}`).first()).toBeVisible();
|
||||
}
|
||||
});
|
||||
|
||||
test('应正确渲染集群节点表格', async ({page}) => {
|
||||
const tableHeaders = ['ID', '名称', '状态', '类型', '版本'];
|
||||
for (const header of tableHeaders) {
|
||||
await expect(page.locator(`th:has-text("${header}")`).first()).toBeVisible();
|
||||
}
|
||||
|
||||
// 至少有一行节点数据
|
||||
const rows = await page.locator('tbody tr').count();
|
||||
expect(rows).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
test('页面应无加载错误提示', async ({page}) => {
|
||||
await expect(page.locator('text=加载中...')).toHaveCount(0);
|
||||
await expect(page.locator('text=数据加载失败')).toHaveCount(0);
|
||||
});
|
||||
|
||||
});
|
||||
28
src/web/tests/playwright/helpers/entrycards-helpers.ts
Normal file
28
src/web/tests/playwright/helpers/entrycards-helpers.ts
Normal file
@ -0,0 +1,28 @@
|
||||
import { Page, expect } from '@playwright/test';
|
||||
import type { metricsEntries } from '../../../src/config/entries';
|
||||
|
||||
export async function testEntryCards(
|
||||
page: Page,
|
||||
entries: typeof metricsEntries,
|
||||
checkLinkNavigation = false
|
||||
) {
|
||||
for (const entry of entries) {
|
||||
// 先根据 label 找到包含该文本的卡片
|
||||
const card = page.locator(`.mantine-Card-root:has-text("${entry.label}")`);
|
||||
await expect(card).toBeVisible({ timeout: 10000 });
|
||||
|
||||
// 检查卡片内部的链接,忽略端口号
|
||||
const link = card.locator('a');
|
||||
const href = await link.getAttribute('href');
|
||||
|
||||
// 正则:保留协议和 host,忽略端口号
|
||||
const expectedHrefPattern = entry.href.replace(/:(\d+)/, '(:\\d+)?');
|
||||
expect(href).toMatch(new RegExp(`^${expectedHrefPattern}$`));
|
||||
|
||||
// 检查图标
|
||||
const img = card.locator('img');
|
||||
await expect(img).toBeVisible();
|
||||
await expect(img).toHaveAttribute('src', /(\/assets\/.+|data:image\/png;base64,)/);
|
||||
|
||||
}
|
||||
}
|
||||
25
src/web/tests/playwright/helpers/testUtils.ts
Normal file
25
src/web/tests/playwright/helpers/testUtils.ts
Normal file
@ -0,0 +1,25 @@
|
||||
import { Page, expect } from '@playwright/test';
|
||||
import { BASE_URL } from './utils'
|
||||
/**
|
||||
* 通用函数:验证页面导航是否正确
|
||||
*/
|
||||
export async function checkPage(page: Page, path: string, title: string) {
|
||||
await page.goto(`${BASE_URL}`);
|
||||
const menu = page.getByRole('link', { name: title });
|
||||
await expect(menu).toBeVisible();
|
||||
await menu.click();
|
||||
await expect(page).toHaveURL(new RegExp(`${path}`));
|
||||
await expect(page.locator('body')).toContainText(title);
|
||||
}
|
||||
|
||||
/**
|
||||
* 检查页面是否存在 JS 错误
|
||||
*/
|
||||
export async function noConsoleError(page: Page) {
|
||||
const errors: string[] = [];
|
||||
page.on('console', msg => {
|
||||
if (msg.type() === 'error') errors.push(msg.text());
|
||||
});
|
||||
await page.waitForLoadState('networkidle');
|
||||
expect(errors, `发现 JS 错误: ${errors.join(', ')}`).toHaveLength(0);
|
||||
}
|
||||
1
src/web/tests/playwright/helpers/utils.ts
Normal file
1
src/web/tests/playwright/helpers/utils.ts
Normal file
@ -0,0 +1 @@
|
||||
export const BASE_URL = process.env.BASE_URL || "http://localhost:8080";
|
||||
17
src/web/tests/playwright/logs.spec.ts
Normal file
17
src/web/tests/playwright/logs.spec.ts
Normal file
@ -0,0 +1,17 @@
|
||||
import { test, expect } from '@playwright/test';
|
||||
import { logsEntries } from './test-entries';
|
||||
import { testEntryCards } from './helpers/entrycards-helpers';
|
||||
import { BASE_URL } from './helpers/utils';
|
||||
|
||||
test.describe('Logs Page', () => {
|
||||
test('should render all log cards', async ({ page }) => {
|
||||
await page.goto(`${BASE_URL}/logs`);
|
||||
|
||||
// 等待标题可见
|
||||
const title = page.locator('h2', { hasText: '日志详情' });
|
||||
await expect(title).toBeVisible({ timeout: 10000 });
|
||||
|
||||
// 测试所有 log card
|
||||
await testEntryCards(page, logsEntries);
|
||||
});
|
||||
});
|
||||
15
src/web/tests/playwright/metric.spec.ts
Normal file
15
src/web/tests/playwright/metric.spec.ts
Normal file
@ -0,0 +1,15 @@
|
||||
import { test, expect } from '@playwright/test';
|
||||
import { metricsEntries } from './test-entries';
|
||||
import { testEntryCards } from './helpers/entrycards-helpers';
|
||||
import { BASE_URL } from './helpers/utils';
|
||||
|
||||
test.describe('Metrics Page', () => {
|
||||
test('should render all metric cards', async ({ page }) => {
|
||||
await page.goto(`${BASE_URL}/metrics`);
|
||||
|
||||
const title = page.locator('h2', { hasText: '指标详情' });
|
||||
await expect(title).toBeVisible({ timeout: 10000 });
|
||||
|
||||
await testEntryCards(page, metricsEntries);
|
||||
});
|
||||
});
|
||||
64
src/web/tests/playwright/node-info.spec.ts
Normal file
64
src/web/tests/playwright/node-info.spec.ts
Normal file
@ -0,0 +1,64 @@
|
||||
import {test, expect} from "@playwright/test";
|
||||
import {BASE_URL} from './helpers/utils'
|
||||
|
||||
test.describe("节点信息页面 NodeInfo", () => {
|
||||
test.beforeEach(async ({page}) => {
|
||||
await page.goto(`${BASE_URL}/nodeInfo`);
|
||||
});
|
||||
|
||||
test("页面标题应该正确显示", async ({page}) => {
|
||||
const title = page.locator('h1,h2,h3:has-text("节点信息")').first();
|
||||
await title.waitFor({timeout: 10000});
|
||||
await expect(title).toBeVisible();
|
||||
});
|
||||
|
||||
test("节点表格应该加载数据", async ({page}) => {
|
||||
const rows = page.locator("table tbody tr");
|
||||
await rows.first().waitFor({timeout: 10000});
|
||||
const count = await rows.count();
|
||||
expect(count).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
test('节点详情测试', async ({page}) => {
|
||||
const firstDetailBtn = page.locator('text=查看详情').first();
|
||||
await firstDetailBtn.waitFor({timeout: 10000});
|
||||
await firstDetailBtn.scrollIntoViewIfNeeded();
|
||||
await firstDetailBtn.click({force: true});
|
||||
|
||||
const drawer = page.locator('role=dialog[name="节点详情"]');
|
||||
await drawer.waitFor({timeout: 10000});
|
||||
await expect(drawer).toBeVisible();
|
||||
|
||||
for (const label of ['注册时间', '最近上报时间', '最后更新时间', '元数据信息', '健康信息', '配置信息', '标签信息']) {
|
||||
const el = drawer.locator(`text=${label}`).first();
|
||||
await el.waitFor({timeout: 5000});
|
||||
await expect(el).toBeVisible();
|
||||
}
|
||||
|
||||
});
|
||||
test("每个节点的 Grafana 按钮链接正确", async ({ page }) => {
|
||||
await page.waitForSelector("table tbody tr", { timeout: 10000 });
|
||||
|
||||
// 查找 Grafana 链接(根据快照,它是 link 而非 button)
|
||||
const grafanaLinks = page.getByRole("link", { name: "Grafana" });
|
||||
const count = await grafanaLinks.count();
|
||||
|
||||
// 如果没找到,保存上下文方便排查
|
||||
if (count === 0) {
|
||||
const html = await page.content();
|
||||
console.error("❌ 未找到 Grafana 链接,页面 HTML 片段如下:\n", html.slice(0, 2000));
|
||||
}
|
||||
|
||||
// 至少应该有一行节点
|
||||
expect(count).toBeGreaterThan(0);
|
||||
|
||||
// 校验链接 href
|
||||
for (let i = 0; i < count; i++) {
|
||||
const link = grafanaLinks.nth(i);
|
||||
await expect(link).toHaveAttribute(
|
||||
"href",
|
||||
/\/d\/node_gpu_metrics_by_hostname\/node-and-gpu-metrics-by-hostname\?var-hostname=/
|
||||
);
|
||||
}
|
||||
});
|
||||
});
|
||||
14
src/web/tests/playwright/test-entries.ts
Normal file
14
src/web/tests/playwright/test-entries.ts
Normal file
@ -0,0 +1,14 @@
|
||||
import { EXTERNAL_HOST } from "../../src/config/api";
|
||||
|
||||
export const metricsEntries = [
|
||||
{ label: "Grafana", href: EXTERNAL_HOST.GRAFANA_DASHBOARD, icon: '' },
|
||||
{ label: "Prometheus", href: EXTERNAL_HOST.PROMETHEUS, icon: '' },
|
||||
];
|
||||
|
||||
export const logsEntries = [
|
||||
{ label: "Kibana", href: EXTERNAL_HOST.KIBANA, icon: '' },
|
||||
];
|
||||
|
||||
export const alertsEntries = [
|
||||
{ label: "Alertmanager", href: EXTERNAL_HOST.ALERTS, icon: '' },
|
||||
];
|
||||
21
src/web/tests/playwright/web-pages.spec.ts
Normal file
21
src/web/tests/playwright/web-pages.spec.ts
Normal file
@ -0,0 +1,21 @@
|
||||
import { test } from '@playwright/test';
|
||||
import { checkPage, noConsoleError } from './helpers/testUtils';
|
||||
import { BASE_URL } from './helpers/utils'
|
||||
|
||||
const pages = [
|
||||
{ path: '/dashboard', title: '仪表盘' },
|
||||
{ path: '/nodeInfo', title: '节点信息' },
|
||||
{ path: '/metrics', title: '指标详情' },
|
||||
{ path: '/logs', title: '日志详情' },
|
||||
{ path: '/alerts', title: '告警详情' }
|
||||
];
|
||||
|
||||
test.describe('Argus Web 页面可用性巡检', () => {
|
||||
for (const { path, title } of pages) {
|
||||
test(`${title} 页面加载验证`, async ({ page }) => {
|
||||
await page.goto(`${BASE_URL}${path}`);
|
||||
await checkPage(page, path, title);
|
||||
await noConsoleError(page);
|
||||
});
|
||||
}
|
||||
});
|
||||
@ -1,19 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
root="$(cd "$(dirname "${BASH_SOURCE[0]}")/../" && pwd)"
|
||||
project_root="$(cd "$root/../../.." && pwd)"
|
||||
|
||||
source "$project_root/scripts/common/build_user.sh"
|
||||
load_build_user
|
||||
|
||||
# 创建新的private目录结构 (基于argus目录结构)
|
||||
echo "[INFO] Creating private directory structure for supervisor-based containers..."
|
||||
mkdir -p "$root/private/argus/web"
|
||||
mkdir -p "$root/private/argus/etc/"
|
||||
|
||||
# 设置数据目录权限
|
||||
echo "[INFO] Setting permissions for data directories..."
|
||||
chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" "$root/private/argus/web" 2>/dev/null || true
|
||||
chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" "$root/private/argus/etc" 2>/dev/null || true
|
||||
|
||||
echo "[INFO] Supervisor-based containers will manage their own scripts and configurations"
|
||||
@ -1,10 +0,0 @@
|
||||
#!/usr/bin/env bash
# Start the alert-mvp compose stack from the project root.
set -euo pipefail

cd "$(dirname "$0")/.."

# Prefer the `docker compose` CLI plugin; fall back to the standalone
# docker-compose binary; abort if neither is available.
compose="docker compose"
if ! docker compose version >/dev/null 2>&1; then
  if command -v docker-compose >/dev/null 2>&1; then
    compose="docker-compose"
  else
    echo "需要 Docker Compose,请安装后重试" >&2
    exit 1
  fi
fi

$compose -p alert-mvp up -d --remove-orphans
echo "[OK] 服务已启动:Web Frontend http://localhost:8080"
|
||||
@ -1,93 +0,0 @@
|
||||
#!/usr/bin/env bash
# Smoke-test the Argus web frontend after deployment:
#   availability, static assets, SPA routing, API proxy, page content,
#   DNS resolution and response time.
# Env overrides: WEB_URL (frontend base URL), API_URL (proxied API endpoint).
set -euo pipefail

WEB_URL=${WEB_URL:-"http://localhost:8080"}
API_URL=${API_URL:-"http://master.argus.com/api/v1/master/nodes"}
TIMEOUT=10  # per-request curl timeout, seconds

GREEN="\033[1;32m"
RED="\033[1;31m"
YELLOW="\033[1;33m"
RESET="\033[0m"

echo "[info] 测试 Argus Web 前端启动状态..."
echo "--------------------------------------------------"

# Wait for the frontend to come up: up to 10 attempts, 3s apart (~30s).
attempt=1
while (( attempt <= 10 )); do
  if curl -fsS -m "$TIMEOUT" -o /dev/null "$WEB_URL"; then
    echo "[ok] Web 前端已启动 (${attempt}/10)"
    break
  fi
  echo "[..] 等待 Web 前端启动中 (${attempt}/10)"
  sleep 3
  # NOT `(( attempt++ ))`: that returns the pre-increment value and would
  # trip `set -e` if it were ever 0.
  attempt=$((attempt + 1))
done

if (( attempt > 10 )); then
  echo "[err] Web 前端在 30 秒内未启动"
  exit 1
fi

# 1. Homepage reachable and looks like an HTML document.
echo "[test] 检查首页访问..."
if curl -fsS "$WEB_URL" -m "$TIMEOUT" | grep -q "<title>"; then
  echo -e "[${GREEN}ok${RESET}] 首页可访问"
else
  echo -e "[${RED}err${RESET}] 首页访问失败"
  exit 1
fi

# 2. Static assets are served (an Express-style "Cannot GET" means they are not).
echo "[test] 检查静态资源..."
if curl -fsS "$WEB_URL/static/js" -m "$TIMEOUT" | grep -q "Cannot GET"; then
  echo -e "[${YELLOW}warn${RESET}] 静态资源路径可能未正确配置"
else
  echo -e "[${GREEN}ok${RESET}] 静态资源服务正常"
fi

# 3. SPA routing: deep links must fall back to index.html.
echo "[test] 检查 React Router 路由兼容..."
if curl -fsS "$WEB_URL/dashboard" -m "$TIMEOUT" | grep -q "<title>"; then
  echo -e "[${GREEN}ok${RESET}] React Router 路由兼容正常"
else
  echo -e "[${YELLOW}warn${RESET}] /dashboard 路由未正确返回 index.html"
fi

# 4. API proxy responds with the expected payload.
echo "[test] 检查 API 代理..."
if curl -fsS "$API_URL" -m "$TIMEOUT" | grep -q "nodes"; then
  echo -e "[${GREEN}ok${RESET}] API 代理成功"
else
  echo -e "[${YELLOW}warn${RESET}] API 代理请求失败,请检查 Nginx proxy_pass"
fi

# 5. Page contains the product keyword.
echo "[test] 检查关键内容..."
if curl -fsS "$WEB_URL" | grep -q "Argus"; then
  echo -e "[${GREEN}ok${RESET}] 页面包含关键字 'Argus'"
else
  echo -e "[${YELLOW}warn${RESET}] 页面内容中未找到 'Argus'"
fi

# 6. DNS resolution. `dig +short` exits 0 even when there is NO answer,
#    so the exit code is useless here — test for non-empty output instead.
echo "[test] 检查 DNS 解析..."
if [[ -n "$(dig +short web.argus.com 2>/dev/null)" ]]; then
  echo -e "[${GREEN}ok${RESET}] 域名 web.argus.com 解析正常"
else
  echo -e "[${YELLOW}warn${RESET}] 域名 web.argus.com 解析失败"
fi

# 7. Response-time check. Use awk (POSIX, always present) instead of bc,
#    which is often not installed in minimal images.
echo "[test] 检查响应时间..."
response_time=$(curl -o /dev/null -s -w "%{time_total}\n" "$WEB_URL")
echo "[info] 响应时间: ${response_time}s"
if awk -v t="$response_time" 'BEGIN { exit !(t > 2.0) }'; then
  echo -e "[${YELLOW}warn${RESET}] 响应时间较慢(>2s)"
else
  echo -e "[${GREEN}ok${RESET}] 响应时间正常"
fi

echo "--------------------------------------------------"
echo "[done] Web 前端测试完成 ✅"
|
||||
@ -1,21 +0,0 @@
|
||||
#!/usr/bin/env bash
# Stop the alert-mvp compose stack and remove the generated private/ tree.
set -euo pipefail

cd "$(dirname "$0")/.."

# Prefer the `docker compose` CLI plugin; fall back to the standalone binary.
compose_cmd="docker compose"
if ! $compose_cmd version >/dev/null 2>&1; then
  if command -v docker-compose >/dev/null 2>&1; then
    compose_cmd="docker-compose"
  else
    echo "需要 Docker Compose,请安装后重试" >&2
    exit 1
  fi
fi

$compose_cmd -p alert-mvp down
echo "[OK] 已停止所有容器"

# Clean up the generated private/ directory.
# NOTE: the original re-ran `cd "$(dirname "$0")/.."` here; after the first
# cd a relative $0 no longer resolves reliably, and we are already in the
# project root — so the second cd is dropped.
echo "[INFO] 清理private目录内容..."
if [ -d "private" ]; then
  # Remove the directory and everything beneath it.
  rm -rf private
  echo "[OK] 已清理private目录"
else
  echo "[INFO] private目录不存在,无需清理"
fi
|
||||
@ -1,85 +0,0 @@
|
||||
#!/usr/bin/env bash
# End-to-end test driver for the ARGUS web system:
#   bootstrap -> start services -> wait -> frontend health check -> teardown.
# Env overrides: SERVICE_WAIT_ATTEMPTS (readiness poll count, default 120).
set -euo pipefail

echo "======================================="
echo "ARGUS Web System End-to-End Test"
echo "======================================="
echo ""

# Record the wall-clock start so we can report total duration.
test_start_time=$(date +%s)

# Poll the frontend until it answers, up to SERVICE_WAIT_ATTEMPTS * 5s.
# Returns 0 when ready, 1 on timeout.
wait_for_services() {
  echo "[INFO] Waiting for all services to be ready..."
  local max_attempts=${SERVICE_WAIT_ATTEMPTS:-120}
  local attempt=1

  while [ $attempt -le $max_attempts ]; do
    if curl -fs http://localhost:8080 >/dev/null 2>&1; then
      echo "[OK] All services are ready!"
      return 0
    fi
    echo "  Waiting for services... ($attempt/$max_attempts)"
    sleep 5
    # Plain assignment instead of ((attempt++)) — the latter returns the
    # pre-increment value and is a classic `set -e` trap.
    attempt=$((attempt + 1))
  done

  echo "[ERROR] Services not ready after $max_attempts attempts"
  return 1
}

# Print a numbered step banner: show_step <number> <description>.
show_step() {
  echo ""
  echo "🔄 Step $1: $2"
  echo "----------------------------------------"
}

# Report the outcome of the immediately preceding command.
# NOTE(review): under `set -e` a failing step usually exits before this
# runs, so $? here is almost always 0 — kept for its success logging.
verify_step() {
  if [ $? -eq 0 ]; then
    echo "✅ $1 - SUCCESS"
  else
    echo "❌ $1 - FAILED"
    exit 1
  fi
}

# --- test flow -------------------------------------------------------------
show_step "1" "Bootstrap - Initialize environment"
./scripts/01_bootstrap.sh
verify_step "Bootstrap"

show_step "2" "Startup - Start all services"
./scripts/02_up.sh
verify_step "Service startup"

# Block until the stack actually answers HTTP.
wait_for_services || exit 1

show_step "3" "Web - Check frontend availability"
./scripts/03_web_health_check.sh
verify_step "Web frontend availability"

show_step "4" "Cleanup - Stop all services"
./scripts/04_down.sh
verify_step "Service cleanup"

# Total duration.
test_end_time=$(date +%s)
total_time=$((test_end_time - test_start_time))

echo ""
echo "======================================="
echo "🎉 END-TO-END TEST COMPLETED SUCCESSFULLY!"
echo "======================================="
echo "📊 Test Summary:"
echo "  • Total time: ${total_time}s"
# BUG FIX: $am_status was never assigned anywhere in this script, so under
# `set -u` the summary aborted with "unbound variable" AFTER every step had
# already passed. Default it so the summary always prints.
echo "  • Alertmanager status: ${am_status:-n/a}"
echo "  • All services started and stopped successfully"
echo ""
echo "✅ The ARGUS Web system is working correctly!"
echo ""
|
||||
77
src/web/tests/scripts/verify-web-frontend.sh
Normal file
77
src/web/tests/scripts/verify-web-frontend.sh
Normal file
@ -0,0 +1,77 @@
|
||||
#!/usr/bin/env bash
# -----------------------------------------
# Post-deployment verification for the web frontend:
#   1) wait for the site, 2) run the Playwright suite,
#   3) report pass/fail, 4) point at the HTML report.
# Exits with the Playwright suite's exit code.
# -----------------------------------------
set -euo pipefail

SCRIPT_DIR="$(dirname "$0")"             # directory this script lives in
WEB_DIR="$SCRIPT_DIR"                    # Playwright project root (package.json, tests/)
REPORT_DIR="$WEB_DIR/playwright-report"
FRONTEND_URL="http://web.argus.com:8080"
TIMEOUT=120                              # max seconds to wait for the frontend

echo "🔍 [1/4] 检查前端服务是否已启动 (${FRONTEND_URL}) ..."

# Poll every 2s, counting elapsed SECONDS. The original loop iterated
# TIMEOUT times with a 2s sleep, i.e. it actually waited 2*TIMEOUT (240s)
# while the timeout message claimed ${TIMEOUT}s.
elapsed=0
ready=0
while (( elapsed < TIMEOUT )); do
  STATUS_CODE=$(curl -s -o /dev/null -w "%{http_code}" "$FRONTEND_URL" || true)
  if [[ "$STATUS_CODE" == "200" ]]; then
    echo "✅ 前端服务已启动并可访问"
    ready=1
    break
  fi
  sleep 2
  elapsed=$((elapsed + 2))
done
if (( ready == 0 )); then
  echo "❌ 等待前端启动超时 (${TIMEOUT}s)"
  exit 1
fi

# -----------------------------------------
# 2. Run the Playwright suite
# -----------------------------------------
echo "[2/4] 执行 Playwright 自动化测试..."

cd "$WEB_DIR"

# Install dependencies on first run.
if [ ! -d "node_modules" ]; then
  echo "未检测到依赖,开始安装..."
  npm ci
fi

# Drop any stale report from a previous run.
rm -rf "$REPORT_DIR"

# Run the tests; temporarily disable -e so we can capture the exit code
# instead of aborting, then restore strict mode.
set +e
npx playwright test tests/playwright --reporter=list,html
TEST_RESULT=$?
set -e

# -----------------------------------------
# 3. Check the test result
# -----------------------------------------
echo "[3/4] 检查测试结果..."

if [ $TEST_RESULT -eq 0 ]; then
  echo "[✓] 所有测试通过!"
else
  echo "[X] 存在测试未通过,请查看报告。"
fi

# -----------------------------------------
# 4. Report location
# -----------------------------------------
echo "[4/4] 生成测试报告..."

if [ -d "$REPORT_DIR" ]; then
  echo "测试报告已生成:$REPORT_DIR"
  echo "可执行以下命令查看详细报告:"
  echo " npx playwright show-report"
else
  echo "未生成报告目录,请检查执行日志。"
fi

# Propagate the Playwright suite's exit status to the caller.
exit $TEST_RESULT
|
||||
Loading…
x
Reference in New Issue
Block a user