Compare commits

..

3 Commits

Author SHA1 Message Date
252461c642 [#37] 优化client构建 2025-11-03 14:53:12 +08:00
629aa88550 [#37] 优化client安装包 2025-10-31 15:23:43 +08:00
c0c90a526d [#37] 测试NixOS部署server通过 2025-10-31 14:18:58 +08:00
21 changed files with 696 additions and 117 deletions

35
deployment/build/build_client_package.sh Normal file → Executable file
View File

@ -43,7 +43,23 @@ latest_dir=$(ls -1dt "$ART_BASE"/*/ 2>/dev/null | head -n1 || true)
tmpdir=$(mktemp -d)
trap 'rm -rf "$tmpdir"' EXIT
rsync -a "$latest_dir" "$tmpdir/src" >/dev/null 2>&1 || cp -r "$latest_dir" "$tmpdir/src"
# Filter-only copy: keep install_order files + scripts + deps + version.json
mkdir -p "$tmpdir/src"
cp -f "$latest_dir/version.json" "$tmpdir/src/version.json"
if command -v jq >/dev/null 2>&1; then
mapfile -t files < <(jq -r '.install_order[]' "$latest_dir/version.json")
else
files=( $(grep -E '"install_order"' -A10 "$latest_dir/version.json" | sed -n 's/\s*"\(.*\.tar\.gz\)".*/\1/p') )
fi
for f in "${files[@]}"; do
[[ -f "$latest_dir/$f" ]] && cp -f "$latest_dir/$f" "$tmpdir/src/$f"
done
for aux in install.sh uninstall.sh check_health.sh check_version.sh sync_dns.sh restart_unhealthy.sh config.env; do
[[ -f "$latest_dir/$aux" ]] && cp -f "$latest_dir/$aux" "$tmpdir/src/$aux";
done
if [[ -d "$latest_dir/deps" ]]; then
mkdir -p "$tmpdir/src/deps" && rsync -a "$latest_dir/deps/" "$tmpdir/src/deps/" >/dev/null 2>&1 || cp -r "$latest_dir/deps/." "$tmpdir/src/deps/";
fi
out_name="argus-metric_$(echo "$VERSION" | sed 's/\./_/g').tar.gz"
@ -52,5 +68,20 @@ out_name="argus-metric_$(echo "$VERSION" | sed 's/\./_/g').tar.gz"
log "Client package ready: $PKG_DIR/$out_name"
echo "$VERSION" > "$PKG_DIR/LATEST_VERSION"
exit 0
# include publish helper and setup.sh for convenience
PUBLISH_TPL="$BUILD_DIR/templates/client/publish.sh"
if [[ -f "$PUBLISH_TPL" ]]; then
cp "$PUBLISH_TPL" "$PKG_DIR/publish.sh" && chmod +x "$PKG_DIR/publish.sh"
fi
# also place a copy of setup.sh alongside
SETUP_SRC="$ROOT_DIR/src/metric/client-plugins/all-in-one-full/scripts/setup.sh"
[[ -f "$SETUP_SRC" ]] && cp "$SETUP_SRC" "$PKG_DIR/setup.sh" || true
# docs for end users
CLIENT_DOC_DIR="$BUILD_DIR/templates/client"
if [[ -d "$CLIENT_DOC_DIR" ]]; then
rsync -a "$CLIENT_DOC_DIR/" "$PKG_DIR/" >/dev/null 2>&1 || cp -r "$CLIENT_DOC_DIR/." "$PKG_DIR/"
fi
exit 0

30
deployment/build/build_server_package.sh Normal file → Executable file
View File

@ -48,6 +48,17 @@ SRC_COMPOSE="$ROOT_DIR/src/sys/tests/docker-compose.yml"
[[ -f "$SRC_COMPOSE" ]] || { err "missing $SRC_COMPOSE"; exit 1; }
awk -f "$BUILD_DIR/templates/docker-compose.filter.awk" -v remove="node-a,node-b,test-node,test-gpu-node" "$SRC_COMPOSE" > "$STAGE/compose/docker-compose.yml"
cp "$BUILD_DIR/templates/.env.example" "$STAGE/compose/.env.example"
# fix relative private path to match package layout (compose/ and private/ are siblings)
sed -i 's#\./private/#../private/#g' "$STAGE/compose/docker-compose.yml"
# also handle bind mount form without trailing slash
sed -i -E 's#- \./private:/private#- ../private:/private#g' "$STAGE/compose/docker-compose.yml"
# drop timezone file bind which may not exist on target distros (e.g. NixOS)
sed -i '/\/etc\/timezone:\/etc\/timezone:ro/d' "$STAGE/compose/docker-compose.yml"
# sanity-check: ensure test services are absent
if grep -E '^(\s{2})(node-a|node-b|test-node|test-gpu-node):\s*$' -n "$STAGE/compose/docker-compose.yml" >/dev/null; then
err "compose filter failed: test services still present"; exit 1;
fi
# 3) Images (reuse if already exported unless --resave-image)
existing_images_tar="$PKG_DIR/images/all-images.tar.gz"
@ -76,24 +87,7 @@ fi
# 4) Scripts & Docs
copy_tree "$BUILD_DIR/templates/scripts" "$STAGE/scripts"
cat > "$STAGE/docs/INSTALL_SERVER.md" << 'MD'
# Argus Server Offline Installation
## Prerequisites
- Ubuntu 22.04 x86_64
- Docker & Docker Compose installed
- Open ports: 32300,9200,5601,9090,9093,8080..8085,21,20,21100-21110 (or auto-fallback to high ports)
## Steps
1. Extract this package to /opt/argus-deploy/versions/<YYYYMMDD>
2. cd scripts && sudo ./server-install.sh
3. Check status: ./server-status.sh
4. Uninstall: ./server-uninstall.sh
## Notes
- Selfcheck result is written to logs/selfcheck.json
- DNS will be managed by internal bind; FTP dns.conf is auto-published to share/dns.conf
MD
copy_tree "$BUILD_DIR/templates/docs" "$STAGE/docs"
# 5) Manifests
gen_manifest "$STAGE" "$STAGE/manifest.txt"

0
deployment/build/common.sh Normal file → Executable file
View File

View File

@ -1,57 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
. "$ROOT_DIR/build/common.sh"
usage() { cat <<'EOF'
Publish client package to FTP server
Usage: publish_client.sh --version YYYYMMDD --server HOST --user USER --password PASS [--port 21]
It uploads: setup.sh, argus-metric_<YYYYMMDD>.tar.gz, LATEST_VERSION to the FTP share root.
EOF
}
VERSION=""; HOST=""; USERNAME=""; PASSWORD=""; PORT=21
while [[ $# -gt 0 ]]; do
case "$1" in
--version) VERSION="$2"; shift 2;;
--server) HOST="$2"; shift 2;;
--user) USERNAME="$2"; shift 2;;
--password) PASSWORD="$2"; shift 2;;
--port) PORT="$2"; shift 2;;
-h|--help) usage; exit 0;;
*) err "unknown arg: $1"; usage; exit 1;;
esac
done
[[ -n "$VERSION" && -n "$HOST" && -n "$USERNAME" && -n "$PASSWORD" ]] || { usage; exit 1; }
CLIENT_DIR="$ROOT_DIR/artifact/client/$VERSION"
TAR_NAME="argus-metric_${VERSION}.tar.gz"
PKG="$CLIENT_DIR/$TAR_NAME"
SETUP_SRC="$ROOT_DIR/../src/sys/tests/private/argus/metric/ftp/share/setup.sh"
ALT_SETUP="$ROOT_DIR/../src/metric/client-plugins/all-in-one-full/scripts/setup.sh"
[[ -f "$PKG" ]] || { err "missing client package: $PKG"; exit 1; }
if [[ ! -f "$SETUP_SRC" ]]; then
if [[ -f "$ALT_SETUP" ]]; then
SETUP_SRC="$ALT_SETUP"
else
err "missing setup.sh (checked $SETUP_SRC and $ALT_SETUP)"; exit 1
fi
fi
log "Uploading setup.sh"
curl -u "$USERNAME:$PASSWORD" -sfT "$SETUP_SRC" "ftp://$HOST:$PORT/setup.sh"
log "Uploading client tar: $TAR_NAME"
curl -u "$USERNAME:$PASSWORD" -sfT "$PKG" "ftp://$HOST:$PORT/$TAR_NAME"
log "Updating LATEST_VERSION -> $VERSION"
printf "%s" "$VERSION" | curl -u "$USERNAME:$PASSWORD" -sfT - "ftp://$HOST:$PORT/LATEST_VERSION"
log "Publish done"
exit 0

View File

@ -0,0 +1,37 @@
# Argus Metric 客户端安装指南(容器内普通用户场景)
## 准备与连通性检查
- FTP 连接(需要账号密码,默认 `ftpuser / ZGClab1234!`)
- `curl -u ftpuser:ZGClab1234! -I ftp://<FTP_IP>:21/LATEST_VERSION`
- `curl -u ftpuser:ZGClab1234! -s ftp://<FTP_IP>:21/ | head`
- 下载安装脚本
- `curl -u ftpuser:ZGClab1234! -fsSL ftp://<FTP_IP>:21/setup.sh -o /tmp/setup.sh`
- `chmod +x /tmp/setup.sh`
## 元数据与主机名
- Agent 需要元数据env/user/instance与 Master 地址:
- 方式Ahostname 形如 `env-user-instance-xxx`(推荐)
- 方式B导出环境变量
- `export AGENT_ENV=dev`
- `export AGENT_USER=<your_user>`
- `export AGENT_INSTANCE=<node_id>`
- Master 地址:
- `export MASTER_ENDPOINT=http://master.argus.com:3000`
> 安装脚本会在开头检查上述条件,缺失时会终止并给出修复提示。
## 执行安装
- 以 root 运行(容器内如为非 root 用户请切换为 root
- `sudo /tmp/setup.sh --server <FTP_IP> --user ftpuser --password 'ZGClab1234!' --port 21`
- 如需自定义安装根目录:`--install-dir /opt/argus-metric`
## 安装后自检setup 自动执行)
- setup 会等待最多 5 分钟,确认以下条件后才报告完成:
- `/private/argus/agent/<hostname>/node.json` 已生成;
- `last_report` 在持续更新;
- `health.metric-argus-agent|metric-node-exporter|metric-fluent-bit|metric-dcgm-exporter` 均为 `healthy``error` 为空。
## 手工验证(可选)
- `cat /private/argus/agent/$(hostname)/node.json | jq '.'`
- `curl -s -o /dev/null -w "%{http_code}\n" http://localhost:9100/metrics` 应为 200
- 查看日志:`/var/log/argus-agent.log``/opt/argus-metric/versions/*/.install.log`

View File

@ -0,0 +1,54 @@
#!/usr/bin/env bash
# publish.sh — upload a built Argus client package to the FTP share root.
# Run from inside the built client artifact directory; it reads
# LATEST_VERSION and uploads setup.sh, argus-metric_<ver>.tar.gz and
# LATEST_VERSION itself.
set -euo pipefail

usage() { cat <<'EOF'
Publish Argus client package to FTP
Usage:
  ./publish.sh --server HOST --user USER --password PASS [--port 21]
Notes:
  - This script expects to run inside the built client artifact directory.
  - It reads LATEST_VERSION and uploads setup.sh, argus-metric_<ver>.tar.gz, and LATEST_VERSION.
EOF
}

HOST=""; USERNAME=""; PASSWORD=""; PORT=21
while [[ $# -gt 0 ]]; do
  case "$1" in
    --server) HOST="$2"; shift 2;;
    --user) USERNAME="$2"; shift 2;;
    --password) PASSWORD="$2"; shift 2;;
    --port) PORT="$2"; shift 2;;
    -h|--help) usage; exit 0;;
    *) echo "unknown arg: $1" >&2; usage; exit 1;;
  esac
done
[[ -n "$HOST" && -n "$USERNAME" && -n "$PASSWORD" ]] || { usage; exit 1; }

here="$(pwd)"
if [[ ! -f "$here/LATEST_VERSION" ]]; then
  echo "LATEST_VERSION not found in $(pwd)" >&2; exit 1;
fi
# read the version, dropping newlines (no useless `cat | tr` pipeline)
VER=$(tr -d '\n' < "$here/LATEST_VERSION")
PKG="argus-metric_${VER}.tar.gz"
if [[ ! -f "$here/$PKG" ]]; then
  echo "client tar not found: $PKG" >&2; exit 1
fi
# locate setup.sh (prefer colocated, fallback to bundled path if provided)
SETUP="${here}/setup.sh"
if [[ ! -f "$SETUP" ]]; then
  echo "setup.sh not found in $(pwd)" >&2; exit 1
fi
echo "[PUBLISH] server=$HOST port=$PORT version=$VER"
# NOTE(review): credentials on the curl command line are visible in `ps`;
# consider a netrc file (curl --netrc-file) on shared hosts.
curl -u "$USERNAME:$PASSWORD" -sfT "$SETUP" "ftp://$HOST:$PORT/setup.sh"
curl -u "$USERNAME:$PASSWORD" -sfT "$PKG" "ftp://$HOST:$PORT/$PKG"
printf "%s" "$VER" | curl -u "$USERNAME:$PASSWORD" -sfT - "ftp://$HOST:$PORT/LATEST_VERSION"
echo "[OK] publish completed"

View File

@ -4,38 +4,38 @@
BEGIN{
split(remove, rm, ",");
for(i in rm) skipname[rm[i]] = 1;
for(i in rm){
gsub(/^\s+|\s+$/,"",rm[i]);
if (rm[i] != "") skipname[rm[i]] = 1;
}
in_services=0; skipping=0;
}
function starts_service_line(line, name) {
if (match(line, /^\s{2}([a-zA-Z0-9_-]+):\s*$/, m)) {
name = m[1];
return name;
}
function service_header(line, m) {
# match exactly two leading spaces followed by name:
if (match(line, /^ ([A-Za-z0-9_-]+):[ ]*$/, m)) return m[1];
return "";
}
{
name = starts_service_line($0);
if (name != "") {
# detect top-level keys (networks:, services:, etc.)
if ($0 ~ /^services:\s*$/) { in_services=1; print; next; }
if ($0 ~ /^[a-zA-Z0-9_-]+:\s*$/ && $0 !~ /^\s/) {
in_services= ($0 ~ /^services:\s*$/);
}
# Track top-level sections (no indentation)
if ($0 ~ /^[A-Za-z0-9_-]+:[ ]*$/ && $0 !~ /^\s/) {
in_services = ($0 ~ /^services:[ ]*$/) ? 1 : 0;
}
if (in_services && (name in skipname)) {
skipping=1; next;
if (skipping) {
# Stop skipping at next service header or another top-level section
if (service_header($0) != "" || ($0 ~ /^(networks|volumes):[ ]*$/ && $0 !~ /^\s/)) {
skipping=0;
} else {
next;
}
}
# end skipping when next top-level service appears
if (skipping) {
if (starts_service_line($0) != "") { skipping=0; }
else if ($0 ~ /^(networks|volumes):\s*$/) { skipping=0; }
else { next; }
if (in_services) {
name = service_header($0);
if (name != "" && (name in skipname)) { skipping=1; next; }
}
print;
}

View File

@ -0,0 +1,48 @@
# Argus Server Offline Installation
## Prerequisites
- Linux x86_64 (Ubuntu 22.04 recommended; see OS compatibility for NixOS)
- Docker & Docker Compose installed
- Open ports: 32300, 9200, 5601, 9090, 9093, 8080..8085, 21, 20, 21100-21110 (or auto-fallback to high ports)
## Quick Start
1. Extract to a target dir, e.g. `/opt/argus-deploy/versions/<YYYYMMDD>`
2. `cd scripts && sudo ./server-prepare-dirs.sh`
3. `./server-install.sh`
4. `./server-status.sh`
5. `./server-selfcheck.sh` (on failure it auto-runs diagnose and writes logs under `logs/`)
6. `./server-uninstall.sh` to tear down
## What the Installer Does
- Loads local images (`images/all-images.tar.gz`)
- Generates OS-compat override (`security_opt: ["label=disable"]`, `userns_mode: host`, bind `tmpfs:/run/named`)
- Starts server-only services: bind/master/es/kibana/ftp/prometheus/grafana/alertmanager/web-frontend/web-proxy
- DNS Bootstrap:
- Ensure `/private/argus/etc/dns.conf` exists (write `172.31.0.2` if missing);
- Run `/private/argus/etc/update-dns.sh` in dependent containers so `/etc/resolv.conf` points to bind;
- Wait for `*.argus.com` hint files, then reload bind;
- Restart webproxy to re-render nginx resolver from `dns.conf`;
- Writes `logs/selfcheck.json` as final summary
## OS Compatibility
- NixOS / non-xattr FS: containers run with `security_opt: ["label=disable"]` and `userns_mode: host`; ensure data dirs are pre-created via `sudo ./server-prepare-dirs.sh` and owned by runtime UID:GID (default 1000:1000).
## Files & Layout
- `compose/` (docker-compose.yml, .env)
- `private/` (data mounts)
- `scripts/` (install/uninstall/status/selfcheck/diagnose)
- `logs/` (selfcheck + diagnose outputs)
## Troubleshooting (Quick)
- Run `./server-selfcheck.sh` → see `logs/selfcheck.json`
- Run `./server-diagnose.sh` → produces timestamped logs:
- `logs/diagnose_details_YYYYMMDD-HHMMSSZ.log`
- `logs/diagnose_error_YYYYMMDD-HHMMSSZ.log`
And updates `diagnose_details.log`/`diagnose_error.log` to the latest
- Error lines are tagged `[service][source]`, e.g. `[kibana][http] /api/status=503`
Common issues:
- Kibana 503: wait cold start or fix DNS so `es.log.argus.com` resolves
- webproxy 504: check nginx `resolver` includes `172.31.0.2 127.0.0.11`
- EACCES/locks: ensure `sudo ./server-prepare-dirs.sh` ran and ownership matches UID:GID

View File

@ -0,0 +1,28 @@
# Argus 服务端离线安装指南
## 先决条件
- Linux x86_64推荐 Ubuntu 22.04NixOS 见“兼容说明”)
- 已安装 Docker 与 Docker Compose
- 端口32300, 9200, 5601, 9090, 9093, 8080..8085, 21, 20, 2110021110
## 快速开始
1. 解压到目标目录(例如 `/opt/argus-deploy/versions/<YYYYMMDD>`
2. 进入 `scripts/``sudo ./server-prepare-dirs.sh`
3. 安装:`./server-install.sh`
4. 状态:`./server-status.sh`
5. 自检:`./server-selfcheck.sh`(失败会自动采集诊断)
6. 卸载:`./server-uninstall.sh`
## 安装流程要点
- 仅启动 10 个服务端组件(不包含测试节点);
- DNS Bootstrap补齐首次部署 DNS 依赖(生成/确认 `dns.conf`、统一容器 resolv.conf、写入 `*.argus.com`、reload bind、重启 webproxy
- 输出自检结果到 `logs/selfcheck.json`
## 兼容说明NixOS 等)
- 使用 `security_opt: ["label=disable"]``userns_mode: host`
- 先运行 `sudo ./server-prepare-dirs.sh` 创建/授权目录为 `1000:1000`
## 故障排查(见下文 Troubleshooting_zh
- `./server-selfcheck.sh``logs/selfcheck.json`
- `./server-diagnose.sh``logs/diagnose_error_*.log` / `logs/diagnose_details_*.log`

View File

@ -0,0 +1,21 @@
# Troubleshooting
- Status: `scripts/server-status.sh`
- Selfcheck: `scripts/server-selfcheck.sh`
- Diagnose: `scripts/server-diagnose.sh`
Outputs:
- `logs/selfcheck.json`
- `logs/diagnose_details_*.log` (full details)
- `logs/diagnose_error_*.log` (tagged errors)
WebProxy:
- 8083 expects 200/302/403; 8084/8085 must include CORS header
- nginx resolver should be `172.31.0.2 127.0.0.11`
Kibana/ES:
- Verify `es.log.argus.com` resolves inside Kibana
Permissions:
- Ensure `sudo ./server-prepare-dirs.sh` ran and ownership matches runtime UID:GID

View File

@ -0,0 +1,15 @@
# 故障排查
- 状态:`scripts/server-status.sh`
- 自检:`scripts/server-selfcheck.sh`
- 诊断:`scripts/server-diagnose.sh`
输出:
- `logs/selfcheck.json`
- `logs/diagnose_error_*.log`(错误摘要)
- `logs/diagnose_details_*.log`(详细信息)
WebProxy8083=200/302/4038084/8085 需包含 CORS
Kibana确认可解析 `es.log.argus.com`
权限:先运行 `sudo ./server-prepare-dirs.sh`

View File

@ -0,0 +1,117 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a
mkdir -p "$ROOT/logs"
ts="$(date -u +%Y%m%d-%H%M%SZ)"
DETAILS="$ROOT/logs/diagnose_details_${ts}.log"
ERRORS="$ROOT/logs/diagnose_error_${ts}.log"
: > "$DETAILS"; : > "$ERRORS"
logd() { echo "$(date '+%F %T') $*" >> "$DETAILS"; }
append_err() { echo "$*" >> "$ERRORS"; }
# Return the HTTP status code for URL $1 (e.g. "200"), or "000" on failure.
# The curl output is captured first: on a failed transfer curl still prints
# "000" via -w and then exits non-zero, so the old `|| echo 000` fallback
# appended a second "000", yielding "000000". A bounded --max-time keeps a
# hung endpoint from stalling the whole diagnose run.
http_code() {
  local code
  code=$(curl -s -o /dev/null --max-time 10 -w "%{http_code}" "$1" 2>/dev/null) || true
  printf '%s' "${code:-000}"
}
http_body_head() { curl -s --max-time 3 "$1" 2>/dev/null | sed -n '1,5p' || true; }
# Print the value of the Access-Control-Allow-Origin response header for a
# curl request (extra args like -H "Origin: ..." are passed through).
# Match the header name via tolower(): BEGIN{IGNORECASE=1} is gawk-specific
# (a silent no-op under mawk/busybox awk, the default on many Debian/Ubuntu
# images), and HTTP/2 responses always send lowercase header names — with
# the old code the CORS check could fail even when the header was present.
header_val() {
  curl -s -D - -o /dev/null "$@" \
    | awk -F': ' 'tolower($1)=="access-control-allow-origin"{gsub("\r","",$2);print $2}'
}
section() {
local name="$1"; logd "===== [$name] ====="; }
# Collect diagnostics for one service container:
#   - container status/inspect summary and the last 200 log lines -> $DETAILS
#   - log lines matching error keywords, tagged "[svc][container]" -> $ERRORS
#   - if supervisord runs inside, its status plus per-logfile tails and
#     tagged error lines -> $DETAILS / $ERRORS
# Globals: reads/writes $DETAILS and $ERRORS (set by the top-level script).
# Arguments: $1 = display name used in tags, $2 = docker container name.
# Every docker call is best-effort (`|| true`) so one dead container does
# not abort the whole diagnose run under `set -e`.
svc() {
  local svc_name="$1"; local cname="$2"; shift 2
  section "$svc_name ($cname)"
  logd "docker ps:"; docker ps -a --format '{{.Names}}\t{{.Status}}\t{{.Image}}' | awk -v n="$cname" '$1==n' >> "$DETAILS" || true
  logd "docker inspect (state/restartcount):"; docker inspect -f '{{.State.Status}} rc={{.RestartCount}} started={{.State.StartedAt}}' "$cname" >> "$DETAILS" 2>&1 || true
  logd "last 200 container logs:"; docker logs --tail 200 "$cname" >> "$DETAILS" 2>&1 || true
  # extract error lines from container logs (English + Chinese keywords)
  docker logs --tail 200 "$cname" 2>&1 | \
    grep -Ei '\b(error|failed|fail|exception|panic|fatal|critical|unhealthy|permission denied|forbidden|refused|traceback|错误|失败)\b' | \
    sed "s/^/[${svc_name}][container] /" >> "$ERRORS" || true
  # supervisor status and logs (only when supervisorctl exists in-container)
  if docker exec "$cname" sh -lc 'command -v supervisorctl >/dev/null 2>&1' >/dev/null 2>&1; then
    logd "supervisorctl status:"; docker exec "$cname" sh -lc 'supervisorctl status' >> "$DETAILS" 2>&1 || true
    # iterate supervisor logs and collect tails + errors per file
    # NOTE(review): $files is split on whitespace — assumes supervisor log
    # file names contain no spaces; confirm if that ever changes.
    local files
    files=$(docker exec "$cname" sh -lc 'ls /var/log/supervisor/*.log 2>/dev/null' || true)
    for f in $files; do
      logd "tail -n 80 $f:"; docker exec "$cname" sh -lc "tail -n 80 $f 2>/dev/null || true" >> "$DETAILS" 2>&1 || true
      docker exec "$cname" sh -lc "tail -n 200 $f 2>/dev/null" 2>/dev/null | \
        grep -Ei '\b(error|failed|fail|exception|panic|fatal|critical|unhealthy|permission denied|forbidden|refused|traceback|错误|失败)\b' | \
        sed "s/^/[${svc_name}][supervisor:$(basename $f)] /" >> "$ERRORS" || true
    done
  fi
}
# Core services
svc bind argus-bind-sys
svc master argus-master-sys
svc es argus-es-sys
svc kibana argus-kibana-sys
svc ftp argus-ftp
svc prometheus argus-prometheus
svc grafana argus-grafana
svc alertmanager argus-alertmanager
svc web-frontend argus-web-frontend
svc web-proxy argus-web-proxy
# HTTP checks (host side)
section HTTP
logd "ES: $(http_code "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health")"
http_body_head "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health" >> "$DETAILS" 2>&1 || true
logd "Kibana: $(http_code "http://localhost:${KIBANA_PORT:-5601}/api/status")"
http_body_head "http://localhost:${KIBANA_PORT:-5601}/api/status" >> "$DETAILS" 2>&1 || true
logd "Master readyz: $(http_code "http://localhost:${MASTER_PORT:-32300}/readyz")"
logd "Prometheus: $(http_code "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready")"
logd "Grafana: $(http_code "http://localhost:${GRAFANA_PORT:-3000}/api/health")"
http_body_head "http://localhost:${GRAFANA_PORT:-3000}/api/health" >> "$DETAILS" 2>&1 || true
logd "Alertmanager: $(http_code "http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status")"
cors8084=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8084:-8084}/api/v2/status" || true)
cors8085=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8085:-8085}/api/v1/master/nodes" || true)
logd "Web-Proxy 8080: $(http_code "http://localhost:${WEB_PROXY_PORT_8080:-8080}/")"
logd "Web-Proxy 8083: $(http_code "http://localhost:${WEB_PROXY_PORT_8083:-8083}/")"
logd "Web-Proxy 8084 CORS: ${cors8084}"
logd "Web-Proxy 8085 CORS: ${cors8085}"
# FTP share writability (container perspective)
section FTP-SHARE
docker exec argus-ftp sh -lc 'ls -ld /private/argus/ftp /private/argus/ftp/share; test -w /private/argus/ftp/share && echo "write:OK" || echo "write:FAIL"' >> "$DETAILS" 2>&1 || true
# Collect system info for context
section SYSTEM
logd "uname -a:"; uname -a >> "$DETAILS"
logd "docker version:"; docker version --format '{{.Server.Version}}' >> "$DETAILS" 2>&1 || true
logd "compose ps:"; (cd "$ROOT/compose" && if docker compose version >/dev/null 2>&1; then docker compose -p argus-sys ps; else docker-compose -p argus-sys ps; fi) >> "$DETAILS" 2>&1 || true
section SUMMARY
# Add HTTP failures and CORS problems to error log with tags
[[ $(http_code "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health") != 200 ]] && echo "[es][http] health not 200" >> "$ERRORS"
kbcode=$(http_code "http://localhost:${KIBANA_PORT:-5601}/api/status"); [[ "$kbcode" != 200 ]] && echo "[kibana][http] /api/status=$kbcode" >> "$ERRORS"
[[ $(http_code "http://localhost:${MASTER_PORT:-32300}/readyz") != 200 ]] && echo "[master][http] /readyz not 200" >> "$ERRORS"
[[ $(http_code "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready") != 200 ]] && echo "[prometheus][http] /-/ready not 200" >> "$ERRORS"
gfcode=$(http_code "http://localhost:${GRAFANA_PORT:-3000}/api/health"); [[ "$gfcode" != 200 ]] && echo "[grafana][http] /api/health=$gfcode" >> "$ERRORS"
[[ $(http_code "http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status") != 200 ]] && echo "[alertmanager][http] /api/v2/status not 200" >> "$ERRORS"
[[ -z "$cors8084" ]] && echo "[web-proxy][cors] 8084 missing Access-Control-Allow-Origin" >> "$ERRORS"
[[ -z "$cors8085" ]] && echo "[web-proxy][cors] 8085 missing Access-Control-Allow-Origin" >> "$ERRORS"
# Deduplicate errors
sort -u -o "$ERRORS" "$ERRORS"
echo "Diagnostic details -> $DETAILS"
echo "Detected errors -> $ERRORS"
# maintain latest symlinks for convenience
ln -sfn "$(basename "$DETAILS")" "$ROOT/logs/diagnose_details.log" 2>/dev/null || cp "$DETAILS" "$ROOT/logs/diagnose_details.log" 2>/dev/null || true
ln -sfn "$(basename "$ERRORS")" "$ROOT/logs/diagnose_error.log" 2>/dev/null || cp "$ERRORS" "$ROOT/logs/diagnose_error.log" 2>/dev/null || true
exit 0

126
deployment/build/templates/scripts/server-install.sh Normal file → Executable file
View File

@ -43,6 +43,29 @@ prepare_env() {
done
}
# Ensure the host-side bind-mount data directories exist before compose
# starts. As non-root we only create them (no chown) and point the operator
# at server-prepare-dirs.sh for proper ownership; as root this is a no-op
# because server-prepare-dirs.sh owns that job.
prepare_data_dirs() {
  if [[ $EUID -ne 0 ]]; then
    echo -e "\033[1;33m[WARN]\033[0m running as non-root: will not chown data dirs."
    echo -e "\033[1;33m[WARN]\033[0m If you hit Permission denied, run: sudo $SCRIPT_DIR/server-prepare-dirs.sh"
    # Paths relative to $PKG_ROOT/private/argus that must exist (no chown).
    local rel
    for rel in \
      etc \
      metric/prometheus \
      metric/prometheus/data \
      metric/prometheus/rules \
      metric/grafana \
      metric/grafana/data \
      metric/grafana/logs \
      metric/grafana/plugins \
      metric/grafana/provisioning/datasources \
      metric/grafana/provisioning/dashboards \
      metric/grafana/data/sessions \
      metric/grafana/data/dashboards \
      alert/alertmanager \
      metric/ftp/share; do
      mkdir -p "$PKG_ROOT/private/argus/$rel"
    done
  fi
}
load_images() {
local tar="$PKG_ROOT/images/all-images.tar.gz"
[[ -f "$tar" ]] || { err "missing images tar: $tar"; exit 1; }
@ -52,7 +75,105 @@ load_images() {
bring_up() {
log "starting services via compose"
(cd "$PKG_ROOT/compose" && "${COMPOSE[@]}" -p "$PROJECT_NAME" up -d)
local ov="$PKG_ROOT/compose/docker-compose.os-compat.override.yml"
if [[ ! -f "$ov" ]]; then
cat > "$ov" <<'YAML'
services:
bind:
security_opt: ["label=disable"]
userns_mode: "host"
tmpfs:
- /run/named
master:
security_opt: ["label=disable"]
userns_mode: "host"
es:
security_opt: ["label=disable"]
userns_mode: "host"
kibana:
security_opt: ["label=disable"]
userns_mode: "host"
ftp:
security_opt: ["label=disable"]
userns_mode: "host"
prometheus:
security_opt: ["label=disable"]
userns_mode: "host"
grafana:
security_opt: ["label=disable"]
userns_mode: "host"
alertmanager:
security_opt: ["label=disable"]
userns_mode: "host"
# ensure runtime path matches container expectation
volumes:
- ../private/argus/etc:/private/argus/etc
- ../private/argus/alert/alertmanager:/alertmanager
web-frontend:
security_opt: ["label=disable"]
userns_mode: "host"
web-proxy:
security_opt: ["label=disable"]
userns_mode: "host"
YAML
log "generated OS-compat override: $(basename "$ov")"
fi
# 仅启动服务端组件避免误起测试节点node-a/node-b/test-node/test-gpu-node
local services=(bind master es kibana ftp prometheus grafana alertmanager web-frontend web-proxy)
log "services: ${services[*]}"
(cd "$PKG_ROOT/compose" && "${COMPOSE[@]}" -p "$PROJECT_NAME" -f docker-compose.yml -f $(basename "$ov") up -d "${services[@]}")
}
# Bootstrap DNS for a first-time deployment:
#   1) ensure the shared dns.conf exists (fallback to the bind IP),
#   2) wait for bind to publish update-dns.sh into the shared etc dir,
#   3) re-point /etc/resolv.conf inside dependent containers at bind,
#   4) best-effort wait for per-service A-record hint files,
#   5) reload the bind zone, 6) restart web-proxy to re-render its resolver.
# BUGFIX: the wait loops used ((i++)) / ((waited++)); a post-increment of 0
# evaluates to 0, so the arithmetic command returns status 1 and, under
# `set -euo pipefail`, aborted the installer on the very first pass. Use
# i=$((i+1)) style increments, which always succeed.
dns_bootstrap() {
  log "DNS bootstrap: initializing shared dns.conf and container resolv.conf"
  local etc_dir="$PKG_ROOT/private/argus/etc"
  mkdir -p "$etc_dir"
  # 1) ensure dns.conf exists (fallback to bind IP 172.31.0.2)
  if [[ ! -s "$etc_dir/dns.conf" ]]; then
    if echo "172.31.0.2" > "$etc_dir/dns.conf" 2>/dev/null; then
      log "wrote fallback dns.conf with 172.31.0.2"
    else
      # host-side write denied (ownership 1000:1000); write via bind container instead
      if docker ps --format '{{.Names}}' | grep -q '^argus-bind-sys$'; then
        docker exec argus-bind-sys sh -lc 'echo 172.31.0.2 > /private/argus/etc/dns.conf && chmod 644 /private/argus/etc/dns.conf' || true
        log "fallback dns.conf written via bind container"
      else
        log "bind not ready; skip writing fallback dns.conf"
      fi
    fi
  fi
  # 2) wait briefly for bind to copy update-dns.sh into shared etc (bind startup.sh does this)
  local i=0
  while [[ ! -x "$etc_dir/update-dns.sh" && $i -lt 20 ]]; do
    sleep 0.5
    i=$((i+1))
  done
  if [[ ! -x "$etc_dir/update-dns.sh" ]]; then
    log "update-dns.sh not present yet; continuing with existing resolv.conf"
  fi
  # 3) run update-dns.sh inside key containers so /etc/resolv.conf points to bind
  local c
  for c in argus-master-sys argus-es-sys argus-kibana-sys argus-grafana argus-prometheus argus-ftp argus-web-frontend argus-web-proxy argus-alertmanager; do
    if docker ps --format '{{.Names}}' | grep -q "^${c}$"; then
      docker exec "$c" sh -lc 'test -x /private/argus/etc/update-dns.sh && /private/argus/etc/update-dns.sh || true' >/dev/null 2>&1 || true
    fi
  done
  # 4) wait for service A-record hint files generated by services (best-effort)
  local need=( es.log.argus.com kibana.log.argus.com master.argus.com grafana.metric.argus.com prom.metric.argus.com alertmanager.alert.argus.com )
  local waited=0; local missing=1
  while (( waited < 15 )); do
    missing=0
    for f in "${need[@]}"; do [[ -s "$etc_dir/$f" ]] || { missing=1; break; }; done
    [[ $missing -eq 0 ]] && break
    sleep 1
    waited=$((waited+1))
  done
  # 5) reload bind zone (script uses supervisor to restart bind9)
  if docker ps --format '{{.Names}}' | grep -q '^argus-bind-sys$'; then
    docker exec argus-bind-sys sh -lc '/usr/local/bin/reload-bind9.sh' >/dev/null 2>&1 || true
  fi
  # 6) restart web-proxy once to re-render nginx resolver with latest dns.conf
  if docker ps --format '{{.Names}}' | grep -q '^argus-web-proxy$'; then
    docker restart argus-web-proxy >/dev/null 2>&1 || true
  fi
}
selfcheck() {
@ -62,11 +183,12 @@ selfcheck() {
main() {
prepare_env
prepare_data_dirs
load_images
bring_up
dns_bootstrap
selfcheck
log "install completed. See logs in $PKG_ROOT/logs/"
}
main "$@"

View File

@ -0,0 +1,73 @@
#!/usr/bin/env bash
# server-prepare-dirs.sh — pre-create and chown every host-side bind-mount
# directory the Argus server stack needs. Must run as root because all data
# dirs are chowned to the runtime UID:GID (default 1000:1000, overridable
# via ARGUS_BUILD_UID / ARGUS_BUILD_GID in compose/.env).
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PKG_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
# refuse to continue without root: the chown below would fail half-way otherwise
if [[ $EUID -ne 0 ]]; then
  echo "[PREPARE] This script requires root (sudo)." >&2
  echo " Try: sudo $0" >&2
  exit 1
fi
# pick up ARGUS_BUILD_UID/GID (and friends) from the compose env file, if present
ENV_FILE="$PKG_ROOT/compose/.env"
[[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a
UIDV="${ARGUS_BUILD_UID:-1000}"; GIDV="${ARGUS_BUILD_GID:-1000}"
echo "[PREPARE] Using owner ${UIDV}:${GIDV}"
# Core etc and service data dirs (aligned with src/sys/tests/scripts/01_bootstrap.sh)
mkdir -p \
  "$PKG_ROOT/private/argus/etc" \
  "$PKG_ROOT/private/argus/bind" \
  "$PKG_ROOT/private/argus/master" \
  "$PKG_ROOT/private/argus/agent" \
  "$PKG_ROOT/private/argus/log/elasticsearch" \
  "$PKG_ROOT/private/argus/log/kibana"
# Prometheus
mkdir -p \
  "$PKG_ROOT/private/argus/metric/prometheus" \
  "$PKG_ROOT/private/argus/metric/prometheus/data" \
  "$PKG_ROOT/private/argus/metric/prometheus/rules" \
  "$PKG_ROOT/private/argus/metric/prometheus/targets"
# Grafana
mkdir -p \
  "$PKG_ROOT/private/argus/metric/grafana" \
  "$PKG_ROOT/private/argus/metric/grafana/data" \
  "$PKG_ROOT/private/argus/metric/grafana/logs" \
  "$PKG_ROOT/private/argus/metric/grafana/plugins" \
  "$PKG_ROOT/private/argus/metric/grafana/provisioning/datasources" \
  "$PKG_ROOT/private/argus/metric/grafana/provisioning/dashboards" \
  "$PKG_ROOT/private/argus/metric/grafana/data/sessions" \
  "$PKG_ROOT/private/argus/metric/grafana/data/dashboards" \
  "$PKG_ROOT/private/argus/metric/grafana/config"
# FTP
mkdir -p "$PKG_ROOT/private/argus/metric/ftp/share"
# Alertmanager
mkdir -p "$PKG_ROOT/private/argus/alert/alertmanager"
# hand all data dirs to the runtime user so containers can write them
chown -R "$UIDV":"$GIDV" \
  "$PKG_ROOT/private/argus/etc" \
  "$PKG_ROOT/private/argus/bind" \
  "$PKG_ROOT/private/argus/master" \
  "$PKG_ROOT/private/argus/agent" \
  "$PKG_ROOT/private/argus/log/elasticsearch" \
  "$PKG_ROOT/private/argus/log/kibana" \
  "$PKG_ROOT/private/argus/metric/prometheus" \
  "$PKG_ROOT/private/argus/metric/grafana" \
  "$PKG_ROOT/private/argus/metric/ftp" \
  "$PKG_ROOT/private/argus/alert"
# group-writable for dirs that several containers share; best-effort
chmod -R g+w "$PKG_ROOT/private/argus/alert" "$PKG_ROOT/private/argus/etc" || true
# Ensure parent directories also owned by runtime user for consistency
chown "$UIDV":"$GIDV" \
  "$PKG_ROOT/private/argus" \
  "$PKG_ROOT/private/argus/log" \
  "$PKG_ROOT/private/argus/metric" || true
echo "[PREPARE] Done. You can now run server-install.sh"

25
deployment/build/templates/scripts/server-selfcheck.sh Normal file → Executable file
View File

@ -32,7 +32,11 @@ log "checking Master"
wait_http "http://localhost:${MASTER_PORT:-32300}/readyz" 60 || ok=0
log "checking FTP"
ftp_root="$ROOT/private/argus/metric/ftp/share"; [[ -d "$ftp_root" && -w "$ftp_root" ]] && ftp_ok=true || { ftp_ok=false; ok=0; }
if docker ps --format '{{.Names}}' | grep -q '^argus-ftp$'; then
if docker exec argus-ftp sh -lc 'test -w /private/argus/ftp/share'; then ftp_ok=true; else ftp_ok=false; ok=0; fi
else
ftp_ok=false; ok=0;
fi
log "checking Prometheus"
wait_http "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready" 60 || ok=0
@ -51,8 +55,9 @@ p8083=$(code_for "http://localhost:${WEB_PROXY_PORT_8083:-8083}/")
cors8084=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8084:-8084}/api/v2/status" || true)
cors8085=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8085:-8085}/api/v1/master/nodes" || true)
wp_ok=true
[[ "$p8080" == 200 ]] || wp_ok=false
([[ "$p8083" == 200 || "$p8083" == 302 ]]) || wp_ok=false
# 有些环境首页可能 403此处接受 200/403
([[ "$p8080" == 200 || "$p8080" == 403 ]]) || wp_ok=false
([[ "$p8083" == 200 || "$p8083" == 302 || "$p8083" == 403 ]]) || wp_ok=false
[[ -n "$cors8084" && -n "$cors8085" ]] || wp_ok=false
[[ "$wp_ok" == true ]] || ok=0
@ -71,5 +76,15 @@ cat > "$tmp" <<JSON
JSON
mv "$tmp" "$OUT_JSON"
[[ "$ok" == 1 ]] && { log "selfcheck OK"; exit 0; } || { err "selfcheck FAILED (see $OUT_JSON)"; exit 1; }
if [[ "$ok" == 1 ]]; then
log "selfcheck OK"
exit 0
else
err "selfcheck FAILED (see $OUT_JSON)"
# If diagnose script exists, run it to collect more details
if [[ -x "$SCRIPT_DIR/server-diagnose.sh" ]]; then
# run diagnose; it will print the actual timestamped file paths and update 'latest' symlinks
"$SCRIPT_DIR/server-diagnose.sh" || true
fi
exit 1
fi

0
deployment/build/templates/scripts/server-status.sh Normal file → Executable file
View File

0
deployment/build/templates/scripts/server-uninstall.sh Normal file → Executable file
View File

BIN
doc/metric_lists.xlsx Normal file

Binary file not shown.

View File

@ -1 +1 @@
1.38.0
1.40.0

View File

@ -48,6 +48,31 @@ BACKUPS_DIR="$INSTALL_DIR/backups" # 备份目录
CURRENT_LINK="$INSTALL_DIR/current" # 当前版本软链接
LATEST_VERSION_FILE="$INSTALL_DIR/LATEST_VERSION" # 当前版本记录文件
# 预检查Agent 元数据与 hostname 约束
require_agent_metadata() {
local hn
hn="$(hostname)"
local ok=false
# 三元环境变量
if [[ -n "${AGENT_ENV:-}" && -n "${AGENT_USER:-}" && -n "${AGENT_INSTANCE:-}" ]]; then
ok=true
fi
# host 形如 env-user-instance-xxx
if [[ "$hn" =~ ^[^-]+-[^-]+-[^-]+-.*$ ]]; then
ok=true
fi
if [[ "$ok" == false ]]; then
log_error "检测到 hostname 与 Agent 元数据不完整:"
log_error " 当前 hostname: $hn"
log_error " AGENT_ENV='${AGENT_ENV:-}' AGENT_USER='${AGENT_USER:-}' AGENT_INSTANCE='${AGENT_INSTANCE:-}'"
echo
log_info "请满足以下其一后重试:"
log_info " 方式A设置 hostname 为 env-user-instance-任意,例如 dev-alice-node001-pod-0"
log_info " 方式B导出环境变量export AGENT_ENV=dev AGENT_USER=alice AGENT_INSTANCE=node001"
exit 1
fi
}
# 检查必需的FTP参数
check_ftp_params() {
local missing_params=()
@ -873,6 +898,47 @@ rollback_version() {
fi
}
# Post-install selfcheck: wait (up to 5 minutes) until the agent's node.json
# exists and reports all components healthy, then verify that last_report
# advances between two samples taken ~70s apart — i.e. the agent is not just
# healthy once but actively reporting. Returns 0 on success, 1 on timeout.
selfcheck_post_install() {
  local hn="$(hostname)"
  # node.json is keyed by hostname; AGENT_HOSTNAME overrides the path segment
  local node_file="/private/argus/agent/${AGENT_HOSTNAME:-$hn}/node.json"
  # overall deadline: now + 300s
  local deadline=$(( $(date +%s) + 300 ))
  # t1/t2 hold the first and second sampled last_report timestamps
  local t1="" t2=""
  while :; do
    if [[ -f "$node_file" ]]; then
      if command -v jq >/dev/null 2>&1; then
        local ok_health lr
        # all four component healths must be "healthy" (jq -e => exit code)
        ok_health=$(jq -er '(.health["metric-argus-agent"].status=="healthy") and (.health["metric-node-exporter"].status=="healthy") and (.health["metric-fluent-bit"].status=="healthy") and (.health["metric-dcgm-exporter"].status=="healthy")' "$node_file" 2>/dev/null || echo false)
        lr=$(jq -r '.last_report // ""' "$node_file" 2>/dev/null)
        if [[ "$ok_health" == true && -n "$lr" ]]; then
          if [[ -z "$t1" ]]; then
            t1="$lr"
            # the agent reports every 60s by default; wait 70s, then re-sample
            sleep 70
            continue
          fi
          t2="$lr"
          if [[ "$t2" != "$t1" ]]; then
            return 0
          fi
          # last_report unchanged — keep polling until the deadline
          sleep 10
        fi
      else
        # no jq available: loose check — any "healthy" status string passes
        if grep -q '"status"\s*:\s*"healthy"' "$node_file"; then
          return 0
        fi
      fi
    fi
    if (( $(date +%s) >= deadline )); then
      log_error "自检超时:未在 5 分钟内确认 last_report 持续更新 或 健康状态不满足(路径:$node_file"
      return 1
    fi
    sleep 5
  done
}
# 主函数
main() {
echo "=========================================="
@ -912,17 +978,26 @@ main() {
# return 0
# fi
check_ftp_params
check_system
check_ftp_params
check_system
require_agent_metadata
if [[ "$ACTION" == "uninstall" ]]; then
uninstall_argus_metric
else
install_argus_metric
fi
# 安装后自检:最多等待 5 分钟,确认 node.json 存在且健康
echo
log_info "操作完成!"
log_info "开始安装后自检(最多等待 5 分钟)..."
selfcheck_post_install || {
log_error "安装后自检未通过,请查看 /var/log/argus-agent.log 以及 /opt/argus-metric/versions/*/.install.log"
exit 1
}
echo
log_success "全部自检通过,安装完成!"
}
# 脚本入口

View File

@ -24,13 +24,18 @@ else
fi
# ========== 读取 DNS ==========
if [ -f "$DNS_CONF_PRIVATE" ]; then
echo "$DNS_CONF_PRIVATE 读取 DNS 服务器..."
RESOLVERS=$(awk '/^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$/ {print $1}' "$DNS_CONF_PRIVATE" | tr '\n' ' ')
fi
RESOLVERS=""
# 优先等待 /private/argus/etc/dns.conf 生成并读取其中的 IP
for i in $(seq 1 10); do
if [ -f "$DNS_CONF_PRIVATE" ]; then
RESOLVERS=$(awk '/^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$/{print $1}' "$DNS_CONF_PRIVATE" | tr '\n' ' ')
fi
[ -n "$RESOLVERS" ] && break
sleep 1
done
# 如果 /private 文件不存在则 fallback
if [ -z "${RESOLVERS:-}" ]; then
# 若仍为空则回退到系统 resolv.conf
if [ -z "$RESOLVERS" ]; then
echo "未在 $DNS_CONF_PRIVATE 中找到有效 DNS使用系统 /etc/resolv.conf"
RESOLVERS=$(awk '/^nameserver/ {print $2}' "$DNS_CONF_SYSTEM" | tr '\n' ' ')
fi
@ -47,8 +52,9 @@ echo "检测到 DNS 服务器列表: $RESOLVERS"
if [ -f "$TEMPLATE" ]; then
echo "从模板生成 nginx.conf ..."
# 合并 Docker 内置 DNS 以保障解析 Compose 服务名
# 将 127.0.0.11 放在末尾,优先使用 /private/argus/etc/dns.conf 指向的 bind
if ! echo " $RESOLVERS " | grep -q " 127.0.0.11 "; then
RESOLVERS="127.0.0.11 ${RESOLVERS}"
RESOLVERS="${RESOLVERS} 127.0.0.11"
fi
sed "s|__RESOLVERS__|$RESOLVERS|" "$TEMPLATE" > "$TARGET"
else