[#37] NixOS server deployment test passed

yuyr 2025-10-31 14:18:58 +08:00
parent 29eb75a374
commit 3202e02b42
18 changed files with 493 additions and 54 deletions

deployment/build/build_client_package.sh (0 changes, Normal file → Executable file)

deployment/build/build_server_package.sh (30 changes, Normal file → Executable file)

@@ -48,6 +48,17 @@ SRC_COMPOSE="$ROOT_DIR/src/sys/tests/docker-compose.yml"
 [[ -f "$SRC_COMPOSE" ]] || { err "missing $SRC_COMPOSE"; exit 1; }
 awk -f "$BUILD_DIR/templates/docker-compose.filter.awk" -v remove="node-a,node-b,test-node,test-gpu-node" "$SRC_COMPOSE" > "$STAGE/compose/docker-compose.yml"
 cp "$BUILD_DIR/templates/.env.example" "$STAGE/compose/.env.example"
+# fix relative private path to match package layout (compose/ and private/ are siblings)
+sed -i 's#\./private/#../private/#g' "$STAGE/compose/docker-compose.yml"
+# also handle bind mount form without trailing slash
+sed -i -E 's#- \./private:/private#- ../private:/private#g' "$STAGE/compose/docker-compose.yml"
+# drop timezone file bind which may not exist on target distros (e.g. NixOS)
+sed -i '/\/etc\/timezone:\/etc\/timezone:ro/d' "$STAGE/compose/docker-compose.yml"
+# sanity-check: ensure test services are absent
+if grep -E '^(\s{2})(node-a|node-b|test-node|test-gpu-node):\s*$' -n "$STAGE/compose/docker-compose.yml" >/dev/null; then
+  err "compose filter failed: test services still present"; exit 1;
+fi
+
 # 3) Images (reuse if already exported unless --resave-image)
 existing_images_tar="$PKG_DIR/images/all-images.tar.gz"
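For illustration, a minimal sketch of what the first path rewrite above does to a sample volume entry (the input line is hypothetical; the sed expression is the one added in this hunk):

```bash
# Hypothetical compose volume entry before packaging:
printf '%s\n' '      - ./private/argus/etc:/private/argus/etc' \
  | sed 's#\./private/#../private/#g'
# prints:      - ../private/argus/etc:/private/argus/etc
# (compose/ and private/ are siblings in the package, hence the ../ prefix)
```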
@@ -76,24 +87,7 @@ fi
 
 # 4) Scripts & Docs
 copy_tree "$BUILD_DIR/templates/scripts" "$STAGE/scripts"
-cat > "$STAGE/docs/INSTALL_SERVER.md" << 'MD'
-# Argus Server Offline Installation
-
-## Prerequisites
-- Ubuntu 22.04 x86_64
-- Docker & Docker Compose installed
-- Open ports: 32300,9200,5601,9090,9093,8080..8085,21,20,21100-21110 (or auto-fallback to high ports)
-
-## Steps
-1. Extract this package to /opt/argus-deploy/versions/<YYYYMMDD>
-2. cd scripts && sudo ./server-install.sh
-3. Check status: ./server-status.sh
-4. Uninstall: ./server-uninstall.sh
-
-## Notes
-- Selfcheck result is written to logs/selfcheck.json
-- DNS will be managed by internal bind; FTP dns.conf is auto-published to share/dns.conf
-MD
+copy_tree "$BUILD_DIR/templates/docs" "$STAGE/docs"
 
 # 5) Manifests
 gen_manifest "$STAGE" "$STAGE/manifest.txt"

deployment/build/common.sh (0 changes, Normal file → Executable file)

deployment/build/publish_client.sh (0 changes, Normal file → Executable file)

deployment/build/templates/docker-compose.filter.awk

@@ -4,38 +4,38 @@
 BEGIN{
   split(remove, rm, ",");
-  for(i in rm) skipname[rm[i]] = 1;
+  for(i in rm){
+    gsub(/^\s+|\s+$/,"",rm[i]);
+    if (rm[i] != "") skipname[rm[i]] = 1;
+  }
   in_services=0; skipping=0;
 }
 
-function starts_service_line(line, name) {
-  if (match(line, /^\s{2}([a-zA-Z0-9_-]+):\s*$/, m)) {
-    name = m[1];
-    return name;
-  }
+function service_header(line, m) {
+  # match exactly two leading spaces followed by name:
+  if (match(line, /^  ([A-Za-z0-9_-]+):[ ]*$/, m)) return m[1];
   return "";
 }
 
 {
-  name = starts_service_line($0);
-  if (name != "") {
-    # detect top-level keys (networks:, services:, etc.)
-    if ($0 ~ /^services:\s*$/) { in_services=1; print; next; }
-    if ($0 ~ /^[a-zA-Z0-9_-]+:\s*$/ && $0 !~ /^\s/) {
-      in_services= ($0 ~ /^services:\s*$/);
-    }
-    if (in_services && (name in skipname)) {
-      skipping=1; next;
-    }
+  # Track top-level sections (no indentation)
+  if ($0 ~ /^[A-Za-z0-9_-]+:[ ]*$/ && $0 !~ /^\s/) {
+    in_services = ($0 ~ /^services:[ ]*$/) ? 1 : 0;
+  }
+  if (skipping) {
+    # Stop skipping at next service header or another top-level section
+    if (service_header($0) != "" || ($0 ~ /^(networks|volumes):[ ]*$/ && $0 !~ /^\s/)) {
+      skipping=0;
+    } else {
+      next;
+    }
   }
-  # end skipping when next top-level service appears
-  if (skipping) {
-    if (starts_service_line($0) != "") { skipping=0; }
-    else if ($0 ~ /^(networks|volumes):\s*$/) { skipping=0; }
-    else { next; }
-  }
+  if (in_services) {
+    name = service_header($0);
+    if (name != "" && (name in skipname)) { skipping=1; next; }
+  }
   print;
 }
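A usage sketch of the filter, mirroring how build_server_package.sh invokes it and sanity-checks the result (file names abridged):

```bash
# Strip the test services from the source compose file:
awk -f docker-compose.filter.awk \
    -v remove="node-a,node-b,test-node,test-gpu-node" \
    docker-compose.yml > docker-compose.server.yml

# Verify no removed service header survived (two-space indent, name, colon):
if grep -E '^  (node-a|node-b|test-node|test-gpu-node):[ ]*$' docker-compose.server.yml; then
  echo "filter failed: test services still present" >&2
fi
```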

deployment/build/templates/docs/INSTALL_SERVER.md (new file)

@@ -0,0 +1,48 @@
# Argus Server Offline Installation
## Prerequisites
- Linux x86_64 (Ubuntu 22.04 recommended; see OS compatibility for NixOS)
- Docker & Docker Compose installed
- Open ports: 32300, 9200, 5601, 9090, 9093, 8080..8085, 21, 20, 21100-21110 (or auto-fallback to high ports)
## Quick Start
1. Extract to a target dir, e.g. `/opt/argus-deploy/versions/<YYYYMMDD>`
2. `cd scripts && sudo ./server-prepare-dirs.sh`
3. `./server-install.sh`
4. `./server-status.sh`
5. `./server-selfcheck.sh` (on failure it auto-runs diagnose and writes logs under `logs/`)
6. `./server-uninstall.sh` to tear down
## What the Installer Does
- Loads local images (`images/all-images.tar.gz`)
- Generates OS-compat override (`security_opt: ["label=disable"]`, `userns_mode: host`, bind `tmpfs:/run/named`)
- Starts server-only services: bind/master/es/kibana/ftp/prometheus/grafana/alertmanager/web-frontend/web-proxy
- DNS Bootstrap:
  - Ensure `/private/argus/etc/dns.conf` exists (write `172.31.0.2` if missing);
  - Run `/private/argus/etc/update-dns.sh` in dependent containers so `/etc/resolv.conf` points to bind;
  - Wait for `*.argus.com` hint files, then reload bind;
  - Restart webproxy to re-render nginx resolver from `dns.conf`;
- Writes `logs/selfcheck.json` as final summary
## OS Compatibility
- NixOS / non-xattr FS: containers run with `security_opt: ["label=disable"]` and `userns_mode: host`; ensure data dirs are pre-created via `sudo ./server-prepare-dirs.sh` and owned by runtime UID:GID (default 1000:1000).
## Files & Layout
- `compose/` (docker-compose.yml, .env)
- `private/` (data mounts)
- `scripts/` (install/uninstall/status/selfcheck/diagnose)
- `logs/` (selfcheck + diagnose outputs)
## Troubleshooting (Quick)
- Run `./server-selfcheck.sh` → see `logs/selfcheck.json`
- Run `./server-diagnose.sh` → produces timestamped logs:
  - `logs/diagnose_details_YYYYMMDD-HHMMSSZ.log`
  - `logs/diagnose_error_YYYYMMDD-HHMMSSZ.log`
  and refreshes `diagnose_details.log`/`diagnose_error.log` to point at the latest run
- Error lines are tagged `[service][source]`, e.g. `[kibana][http] /api/status=503`
Common issues:
- Kibana 503: wait cold start or fix DNS so `es.log.argus.com` resolves
- webproxy 504: check nginx `resolver` includes `172.31.0.2 127.0.0.11`
- EACCES/locks: ensure `sudo ./server-prepare-dirs.sh` ran and ownership matches UID:GID
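The DNS bootstrap described above can be spot-checked after install; a minimal sketch from the package root (the container names are the ones this package starts; availability of `getent` inside the image is an assumption):

```bash
# bind IP published to all services (the installer falls back to 172.31.0.2):
cat private/argus/etc/dns.conf

# resolv.conf inside a dependent container should point at bind:
docker exec argus-kibana-sys cat /etc/resolv.conf

# hint file written by the service and consumed by bind:
cat private/argus/etc/es.log.argus.com

# resolution from inside the container (assumes getent exists in the image):
docker exec argus-kibana-sys getent hosts es.log.argus.com
```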


@@ -0,0 +1,28 @@
# Argus Server Offline Installation Guide
## Prerequisites
- Linux x86_64 (Ubuntu 22.04 recommended; for NixOS see "Compatibility Notes")
- Docker and Docker Compose installed
- Ports: 32300, 9200, 5601, 9090, 9093, 8080..8085, 21, 20, 21100-21110
## Quick Start
1. Extract to the target directory (e.g. `/opt/argus-deploy/versions/<YYYYMMDD>`)
2. Enter `scripts/` and run `sudo ./server-prepare-dirs.sh`
3. Install: `./server-install.sh`
4. Status: `./server-status.sh`
5. Selfcheck: `./server-selfcheck.sh` (collects diagnostics automatically on failure)
6. Uninstall: `./server-uninstall.sh`
## Installation Highlights
- Starts only the 10 server-side components (no test nodes);
- DNS Bootstrap: fills in first-deployment DNS dependencies (generate/confirm `dns.conf`, unify container resolv.conf, write `*.argus.com` hint files, reload bind, restart webproxy);
- Writes the selfcheck result to `logs/selfcheck.json`
## Compatibility Notes (NixOS and similar)
- Uses `security_opt: ["label=disable"]` and `userns_mode: host`
- Run `sudo ./server-prepare-dirs.sh` first to create the data directories and chown them to `1000:1000`
## Troubleshooting (see Troubleshooting_zh below)
- `./server-selfcheck.sh` → `logs/selfcheck.json`
- `./server-diagnose.sh` → `logs/diagnose_error_*.log` / `logs/diagnose_details_*.log`


@@ -0,0 +1,21 @@
# Troubleshooting
- Status: `scripts/server-status.sh`
- Selfcheck: `scripts/server-selfcheck.sh`
- Diagnose: `scripts/server-diagnose.sh`
Outputs:
- `logs/selfcheck.json`
- `logs/diagnose_details_*.log` (full details)
- `logs/diagnose_error_*.log` (tagged errors)
WebProxy:
- 8083 expects 200/302/403; 8084/8085 must include CORS header
- nginx resolver should be `172.31.0.2 127.0.0.11`
Kibana/ES:
- Verify `es.log.argus.com` resolves inside Kibana
Permissions:
- Ensure `sudo ./server-prepare-dirs.sh` ran and ownership matches runtime UID:GID
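The CORS expectation on 8084/8085 can be verified by hand; a sketch using this package's default ports (the same header check the selfcheck and diagnose scripts perform via curl):

```bash
# Expect an Access-Control-Allow-Origin header in the response:
curl -s -D - -o /dev/null \
  -H "Origin: http://localhost:8080" \
  http://localhost:8084/api/v2/status | grep -i '^access-control-allow-origin'
```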


@@ -0,0 +1,15 @@
# Troubleshooting
- Status: `scripts/server-status.sh`
- Selfcheck: `scripts/server-selfcheck.sh`
- Diagnose: `scripts/server-diagnose.sh`
Outputs:
- `logs/selfcheck.json`
- `logs/diagnose_error_*.log` (error summary)
- `logs/diagnose_details_*.log` (full details)
WebProxy: 8083 = 200/302/403; 8084/8085 must include the CORS header
Kibana: confirm `es.log.argus.com` resolves
Permissions: run `sudo ./server-prepare-dirs.sh` first

deployment/build/templates/scripts/server-diagnose.sh (new file)

@@ -0,0 +1,117 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a
mkdir -p "$ROOT/logs"
ts="$(date -u +%Y%m%d-%H%M%SZ)"
DETAILS="$ROOT/logs/diagnose_details_${ts}.log"
ERRORS="$ROOT/logs/diagnose_error_${ts}.log"
: > "$DETAILS"; : > "$ERRORS"
logd() { echo "$(date '+%F %T') $*" >> "$DETAILS"; }
append_err() { echo "$*" >> "$ERRORS"; }
http_code() { curl -s -o /dev/null -w "%{http_code}" "$1" || echo 000; }
http_body_head() { curl -s --max-time 3 "$1" 2>/dev/null | sed -n '1,5p' || true; }
header_val() { curl -s -D - -o /dev/null "$@" | awk -F': ' 'BEGIN{IGNORECASE=1}$1=="Access-Control-Allow-Origin"{gsub("\r","",$2);print $2}'; }
section() {
local name="$1"; logd "===== [$name] ====="; }
svc() {
local svc_name="$1"; local cname="$2"; shift 2
section "$svc_name ($cname)"
logd "docker ps:"; docker ps -a --format '{{.Names}}\t{{.Status}}\t{{.Image}}' | awk -v n="$cname" '$1==n' >> "$DETAILS" || true
logd "docker inspect (state/restartcount):"; docker inspect -f '{{.State.Status}} rc={{.RestartCount}} started={{.State.StartedAt}}' "$cname" >> "$DETAILS" 2>&1 || true
logd "last 200 container logs:"; docker logs --tail 200 "$cname" >> "$DETAILS" 2>&1 || true
# extract error lines from container logs
docker logs --tail 200 "$cname" 2>&1 | \
grep -Ei '\b(error|failed|fail|exception|panic|fatal|critical|unhealthy|permission denied|forbidden|refused|traceback|错误|失败)\b' | \
sed "s/^/[${svc_name}][container] /" >> "$ERRORS" || true
# supervisor status and logs
if docker exec "$cname" sh -lc 'command -v supervisorctl >/dev/null 2>&1' >/dev/null 2>&1; then
logd "supervisorctl status:"; docker exec "$cname" sh -lc 'supervisorctl status' >> "$DETAILS" 2>&1 || true
# iterate supervisor logs and collect tails + errors per file
local files
files=$(docker exec "$cname" sh -lc 'ls /var/log/supervisor/*.log 2>/dev/null' || true)
for f in $files; do
logd "tail -n 80 $f:"; docker exec "$cname" sh -lc "tail -n 80 $f 2>/dev/null || true" >> "$DETAILS" 2>&1 || true
docker exec "$cname" sh -lc "tail -n 200 $f 2>/dev/null" 2>/dev/null | \
grep -Ei '\b(error|failed|fail|exception|panic|fatal|critical|unhealthy|permission denied|forbidden|refused|traceback|错误|失败)\b' | \
sed "s/^/[${svc_name}][supervisor:$(basename $f)] /" >> "$ERRORS" || true
done
fi
}
# Core services
svc bind argus-bind-sys
svc master argus-master-sys
svc es argus-es-sys
svc kibana argus-kibana-sys
svc ftp argus-ftp
svc prometheus argus-prometheus
svc grafana argus-grafana
svc alertmanager argus-alertmanager
svc web-frontend argus-web-frontend
svc web-proxy argus-web-proxy
# HTTP checks (host side)
section HTTP
logd "ES: $(http_code "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health")"
http_body_head "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health" >> "$DETAILS" 2>&1 || true
logd "Kibana: $(http_code "http://localhost:${KIBANA_PORT:-5601}/api/status")"
http_body_head "http://localhost:${KIBANA_PORT:-5601}/api/status" >> "$DETAILS" 2>&1 || true
logd "Master readyz: $(http_code "http://localhost:${MASTER_PORT:-32300}/readyz")"
logd "Prometheus: $(http_code "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready")"
logd "Grafana: $(http_code "http://localhost:${GRAFANA_PORT:-3000}/api/health")"
http_body_head "http://localhost:${GRAFANA_PORT:-3000}/api/health" >> "$DETAILS" 2>&1 || true
logd "Alertmanager: $(http_code "http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status")"
cors8084=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8084:-8084}/api/v2/status" || true)
cors8085=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8085:-8085}/api/v1/master/nodes" || true)
logd "Web-Proxy 8080: $(http_code "http://localhost:${WEB_PROXY_PORT_8080:-8080}/")"
logd "Web-Proxy 8083: $(http_code "http://localhost:${WEB_PROXY_PORT_8083:-8083}/")"
logd "Web-Proxy 8084 CORS: ${cors8084}"
logd "Web-Proxy 8085 CORS: ${cors8085}"
# FTP share writability (container perspective)
section FTP-SHARE
docker exec argus-ftp sh -lc 'ls -ld /private/argus/ftp /private/argus/ftp/share; test -w /private/argus/ftp/share && echo "write:OK" || echo "write:FAIL"' >> "$DETAILS" 2>&1 || true
# Collect system info for context
section SYSTEM
logd "uname -a:"; uname -a >> "$DETAILS"
logd "docker version:"; docker version --format '{{.Server.Version}}' >> "$DETAILS" 2>&1 || true
logd "compose ps:"; (cd "$ROOT/compose" && if docker compose version >/dev/null 2>&1; then docker compose -p argus-sys ps; else docker-compose -p argus-sys ps; fi) >> "$DETAILS" 2>&1 || true
section SUMMARY
# Add HTTP failures and CORS problems to error log with tags
[[ $(http_code "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health") != 200 ]] && echo "[es][http] health not 200" >> "$ERRORS"
kbcode=$(http_code "http://localhost:${KIBANA_PORT:-5601}/api/status"); [[ "$kbcode" != 200 ]] && echo "[kibana][http] /api/status=$kbcode" >> "$ERRORS"
[[ $(http_code "http://localhost:${MASTER_PORT:-32300}/readyz") != 200 ]] && echo "[master][http] /readyz not 200" >> "$ERRORS"
[[ $(http_code "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready") != 200 ]] && echo "[prometheus][http] /-/ready not 200" >> "$ERRORS"
gfcode=$(http_code "http://localhost:${GRAFANA_PORT:-3000}/api/health"); [[ "$gfcode" != 200 ]] && echo "[grafana][http] /api/health=$gfcode" >> "$ERRORS"
[[ $(http_code "http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status") != 200 ]] && echo "[alertmanager][http] /api/v2/status not 200" >> "$ERRORS"
[[ -z "$cors8084" ]] && echo "[web-proxy][cors] 8084 missing Access-Control-Allow-Origin" >> "$ERRORS"
[[ -z "$cors8085" ]] && echo "[web-proxy][cors] 8085 missing Access-Control-Allow-Origin" >> "$ERRORS"
# Deduplicate errors
sort -u -o "$ERRORS" "$ERRORS"
echo "Diagnostic details -> $DETAILS"
echo "Detected errors -> $ERRORS"
# maintain latest symlinks for convenience
ln -sfn "$(basename "$DETAILS")" "$ROOT/logs/diagnose_details.log" 2>/dev/null || cp "$DETAILS" "$ROOT/logs/diagnose_details.log" 2>/dev/null || true
ln -sfn "$(basename "$ERRORS")" "$ROOT/logs/diagnose_error.log" 2>/dev/null || cp "$ERRORS" "$ROOT/logs/diagnose_error.log" 2>/dev/null || true
exit 0

deployment/build/templates/scripts/server-install.sh (126 changes, Normal file → Executable file)

@@ -43,6 +43,29 @@ prepare_env() {
   done
 }
 
+prepare_data_dirs() {
+  if [[ $EUID -ne 0 ]]; then
+    echo -e "\033[1;33m[WARN]\033[0m running as non-root: will not chown data dirs."
+    echo -e "\033[1;33m[WARN]\033[0m If you hit Permission denied, run: sudo $SCRIPT_DIR/server-prepare-dirs.sh"
+    # still ensure basic directories exist (no chown)
+    mkdir -p \
+      "$PKG_ROOT/private/argus/etc" \
+      "$PKG_ROOT/private/argus/metric/prometheus" \
+      "$PKG_ROOT/private/argus/metric/prometheus/data" \
+      "$PKG_ROOT/private/argus/metric/prometheus/rules" \
+      "$PKG_ROOT/private/argus/metric/grafana" \
+      "$PKG_ROOT/private/argus/metric/grafana/data" \
+      "$PKG_ROOT/private/argus/metric/grafana/logs" \
+      "$PKG_ROOT/private/argus/metric/grafana/plugins" \
+      "$PKG_ROOT/private/argus/metric/grafana/provisioning/datasources" \
+      "$PKG_ROOT/private/argus/metric/grafana/provisioning/dashboards" \
+      "$PKG_ROOT/private/argus/metric/grafana/data/sessions" \
+      "$PKG_ROOT/private/argus/metric/grafana/data/dashboards" \
+      "$PKG_ROOT/private/argus/alert/alertmanager" \
+      "$PKG_ROOT/private/argus/metric/ftp/share"
+  fi
+}
 
 load_images() {
   local tar="$PKG_ROOT/images/all-images.tar.gz"
   [[ -f "$tar" ]] || { err "missing images tar: $tar"; exit 1; }
@@ -52,7 +75,105 @@
 bring_up() {
   log "starting services via compose"
-  (cd "$PKG_ROOT/compose" && "${COMPOSE[@]}" -p "$PROJECT_NAME" up -d)
+  local ov="$PKG_ROOT/compose/docker-compose.os-compat.override.yml"
+  if [[ ! -f "$ov" ]]; then
+    cat > "$ov" <<'YAML'
+services:
+  bind:
+    security_opt: ["label=disable"]
+    userns_mode: "host"
+    tmpfs:
+      - /run/named
+  master:
+    security_opt: ["label=disable"]
+    userns_mode: "host"
+  es:
+    security_opt: ["label=disable"]
+    userns_mode: "host"
+  kibana:
+    security_opt: ["label=disable"]
+    userns_mode: "host"
+  ftp:
+    security_opt: ["label=disable"]
+    userns_mode: "host"
+  prometheus:
+    security_opt: ["label=disable"]
+    userns_mode: "host"
+  grafana:
+    security_opt: ["label=disable"]
+    userns_mode: "host"
+  alertmanager:
+    security_opt: ["label=disable"]
+    userns_mode: "host"
+    # ensure runtime path matches container expectation
+    volumes:
+      - ../private/argus/etc:/private/argus/etc
+      - ../private/argus/alert/alertmanager:/alertmanager
+  web-frontend:
+    security_opt: ["label=disable"]
+    userns_mode: "host"
+  web-proxy:
+    security_opt: ["label=disable"]
+    userns_mode: "host"
+YAML
+    log "generated OS-compat override: $(basename "$ov")"
+  fi
+  # start only the server-side services; avoid bringing up test nodes (node-a/node-b/test-node/test-gpu-node)
+  local services=(bind master es kibana ftp prometheus grafana alertmanager web-frontend web-proxy)
+  log "services: ${services[*]}"
+  (cd "$PKG_ROOT/compose" && "${COMPOSE[@]}" -p "$PROJECT_NAME" -f docker-compose.yml -f "$(basename "$ov")" up -d "${services[@]}")
 }
+
+dns_bootstrap() {
+  log "DNS bootstrap: initializing shared dns.conf and container resolv.conf"
+  local etc_dir="$PKG_ROOT/private/argus/etc"
+  mkdir -p "$etc_dir"
+  # 1) ensure dns.conf exists (fallback to bind IP 172.31.0.2)
+  if [[ ! -s "$etc_dir/dns.conf" ]]; then
+    if echo "172.31.0.2" > "$etc_dir/dns.conf" 2>/dev/null; then
+      log "wrote fallback dns.conf with 172.31.0.2"
+    else
+      # host-side write denied (ownership 1000:1000); write via bind container instead
+      if docker ps --format '{{.Names}}' | grep -q '^argus-bind-sys$'; then
+        docker exec argus-bind-sys sh -lc 'echo 172.31.0.2 > /private/argus/etc/dns.conf && chmod 644 /private/argus/etc/dns.conf' || true
+        log "fallback dns.conf written via bind container"
+      else
+        log "bind not ready; skip writing fallback dns.conf"
+      fi
+    fi
+  fi
+  # 2) wait briefly for bind to copy update-dns.sh into shared etc (bind startup.sh does this)
+  local i=0
+  while [[ ! -x "$etc_dir/update-dns.sh" && $i -lt 20 ]]; do
+    sleep 0.5; ((i++));
+  done
+  if [[ ! -x "$etc_dir/update-dns.sh" ]]; then
+    log "update-dns.sh not present yet; continuing with existing resolv.conf"
+  fi
+  # 3) run update-dns.sh inside key containers so /etc/resolv.conf points to bind
+  local c
+  for c in argus-master-sys argus-es-sys argus-kibana-sys argus-grafana argus-prometheus argus-ftp argus-web-frontend argus-web-proxy argus-alertmanager; do
+    if docker ps --format '{{.Names}}' | grep -q "^${c}$"; then
+      docker exec "$c" sh -lc 'test -x /private/argus/etc/update-dns.sh && /private/argus/etc/update-dns.sh || true' >/dev/null 2>&1 || true
+    fi
+  done
+  # 4) wait for service A-record hint files generated by services (best-effort)
+  local need=( es.log.argus.com kibana.log.argus.com master.argus.com grafana.metric.argus.com prom.metric.argus.com alertmanager.alert.argus.com )
+  local waited=0; local missing=1
+  while (( waited < 15 )); do
+    missing=0
+    for f in "${need[@]}"; do [[ -s "$etc_dir/$f" ]] || { missing=1; break; }; done
+    [[ $missing -eq 0 ]] && break
+    sleep 1; ((waited++))
+  done
+  # 5) reload bind zone (script uses supervisor to restart bind9)
+  if docker ps --format '{{.Names}}' | grep -q '^argus-bind-sys$'; then
+    docker exec argus-bind-sys sh -lc '/usr/local/bin/reload-bind9.sh' >/dev/null 2>&1 || true
+  fi
+  # 6) restart web-proxy once to re-render nginx resolver with latest dns.conf
+  if docker ps --format '{{.Names}}' | grep -q '^argus-web-proxy$'; then
+    docker restart argus-web-proxy >/dev/null 2>&1 || true
+  fi
+}
 
 selfcheck() {
@@ -62,11 +183,12 @@ selfcheck() {
 main() {
   prepare_env
+  prepare_data_dirs
   load_images
   bring_up
+  dns_bootstrap
   selfcheck
   log "install completed. See logs in $PKG_ROOT/logs/"
 }
 
 main "$@"

deployment/build/templates/scripts/server-prepare-dirs.sh (new file)

@@ -0,0 +1,73 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PKG_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
if [[ $EUID -ne 0 ]]; then
echo "[PREPARE] This script requires root (sudo)." >&2
echo " Try: sudo $0" >&2
exit 1
fi
ENV_FILE="$PKG_ROOT/compose/.env"
[[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a
UIDV="${ARGUS_BUILD_UID:-1000}"; GIDV="${ARGUS_BUILD_GID:-1000}"
echo "[PREPARE] Using owner ${UIDV}:${GIDV}"
# Core etc and service data dirs (aligned with src/sys/tests/scripts/01_bootstrap.sh)
mkdir -p \
"$PKG_ROOT/private/argus/etc" \
"$PKG_ROOT/private/argus/bind" \
"$PKG_ROOT/private/argus/master" \
"$PKG_ROOT/private/argus/agent" \
"$PKG_ROOT/private/argus/log/elasticsearch" \
"$PKG_ROOT/private/argus/log/kibana"
# Prometheus
mkdir -p \
"$PKG_ROOT/private/argus/metric/prometheus" \
"$PKG_ROOT/private/argus/metric/prometheus/data" \
"$PKG_ROOT/private/argus/metric/prometheus/rules" \
"$PKG_ROOT/private/argus/metric/prometheus/targets"
# Grafana
mkdir -p \
"$PKG_ROOT/private/argus/metric/grafana" \
"$PKG_ROOT/private/argus/metric/grafana/data" \
"$PKG_ROOT/private/argus/metric/grafana/logs" \
"$PKG_ROOT/private/argus/metric/grafana/plugins" \
"$PKG_ROOT/private/argus/metric/grafana/provisioning/datasources" \
"$PKG_ROOT/private/argus/metric/grafana/provisioning/dashboards" \
"$PKG_ROOT/private/argus/metric/grafana/data/sessions" \
"$PKG_ROOT/private/argus/metric/grafana/data/dashboards" \
"$PKG_ROOT/private/argus/metric/grafana/config"
# FTP
mkdir -p "$PKG_ROOT/private/argus/metric/ftp/share"
# Alertmanager
mkdir -p "$PKG_ROOT/private/argus/alert/alertmanager"
chown -R "$UIDV":"$GIDV" \
"$PKG_ROOT/private/argus/etc" \
"$PKG_ROOT/private/argus/bind" \
"$PKG_ROOT/private/argus/master" \
"$PKG_ROOT/private/argus/agent" \
"$PKG_ROOT/private/argus/log/elasticsearch" \
"$PKG_ROOT/private/argus/log/kibana" \
"$PKG_ROOT/private/argus/metric/prometheus" \
"$PKG_ROOT/private/argus/metric/grafana" \
"$PKG_ROOT/private/argus/metric/ftp" \
"$PKG_ROOT/private/argus/alert"
chmod -R g+w "$PKG_ROOT/private/argus/alert" "$PKG_ROOT/private/argus/etc" || true
# Ensure parent directories also owned by runtime user for consistency
chown "$UIDV":"$GIDV" \
"$PKG_ROOT/private/argus" \
"$PKG_ROOT/private/argus/log" \
"$PKG_ROOT/private/argus/metric" || true
echo "[PREPARE] Done. You can now run server-install.sh"

deployment/build/templates/scripts/server-selfcheck.sh (25 changes, Normal file → Executable file)

@@ -32,7 +32,11 @@ log "checking Master"
 wait_http "http://localhost:${MASTER_PORT:-32300}/readyz" 60 || ok=0
 
 log "checking FTP"
-ftp_root="$ROOT/private/argus/metric/ftp/share"; [[ -d "$ftp_root" && -w "$ftp_root" ]] && ftp_ok=true || { ftp_ok=false; ok=0; }
+if docker ps --format '{{.Names}}' | grep -q '^argus-ftp$'; then
+  if docker exec argus-ftp sh -lc 'test -w /private/argus/ftp/share'; then ftp_ok=true; else ftp_ok=false; ok=0; fi
+else
+  ftp_ok=false; ok=0;
+fi
 
 log "checking Prometheus"
 wait_http "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready" 60 || ok=0
@@ -51,8 +55,9 @@
 p8083=$(code_for "http://localhost:${WEB_PROXY_PORT_8083:-8083}/")
 cors8084=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8084:-8084}/api/v2/status" || true)
 cors8085=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8085:-8085}/api/v1/master/nodes" || true)
 
 wp_ok=true
-[[ "$p8080" == 200 ]] || wp_ok=false
-([[ "$p8083" == 200 || "$p8083" == 302 ]]) || wp_ok=false
+# some environments may return 403 for the landing page; accept 200/403 here
+([[ "$p8080" == 200 || "$p8080" == 403 ]]) || wp_ok=false
+([[ "$p8083" == 200 || "$p8083" == 302 || "$p8083" == 403 ]]) || wp_ok=false
 [[ -n "$cors8084" && -n "$cors8085" ]] || wp_ok=false
 [[ "$wp_ok" == true ]] || ok=0
@@ -71,5 +76,15 @@ cat > "$tmp" <<JSON
 JSON
 mv "$tmp" "$OUT_JSON"
-[[ "$ok" == 1 ]] && { log "selfcheck OK"; exit 0; } || { err "selfcheck FAILED (see $OUT_JSON)"; exit 1; }
+if [[ "$ok" == 1 ]]; then
+  log "selfcheck OK"
+  exit 0
+else
+  err "selfcheck FAILED (see $OUT_JSON)"
+  # If diagnose script exists, run it to collect more details
+  if [[ -x "$SCRIPT_DIR/server-diagnose.sh" ]]; then
+    # run diagnose; it will print the actual timestamped file paths and update 'latest' symlinks
+    "$SCRIPT_DIR/server-diagnose.sh" || true
+  fi
+  exit 1
+fi
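After a run, the JSON summary can be inspected directly; a sketch assuming `jq` is installed on the host (the keys inside selfcheck.json are not shown in this diff, so the filter is just a pretty-print):

```bash
# Pretty-print the summary written by server-selfcheck.sh:
jq . logs/selfcheck.json
```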

deployment/build/templates/scripts/server-status.sh (0 changes, Normal file → Executable file)

deployment/build/templates/scripts/server-uninstall.sh (0 changes, Normal file → Executable file)

doc/metric_lists.xlsx (BIN, Normal file; binary content not shown)


@@ -1 +1 @@
-1.38.0
+1.40.0


@@ -24,13 +24,18 @@ else
 fi
 
 # ========== read DNS ==========
-if [ -f "$DNS_CONF_PRIVATE" ]; then
-  echo "reading DNS servers from $DNS_CONF_PRIVATE ..."
-  RESOLVERS=$(awk '/^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$/ {print $1}' "$DNS_CONF_PRIVATE" | tr '\n' ' ')
-fi
+RESOLVERS=""
+# prefer waiting for /private/argus/etc/dns.conf to be generated and read the IPs from it
+for i in $(seq 1 10); do
+  if [ -f "$DNS_CONF_PRIVATE" ]; then
+    RESOLVERS=$(awk '/^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$/{print $1}' "$DNS_CONF_PRIVATE" | tr '\n' ' ')
+  fi
+  [ -n "$RESOLVERS" ] && break
+  sleep 1
+done
 
-# fall back if the /private file does not exist
-if [ -z "${RESOLVERS:-}" ]; then
+# if still empty, fall back to the system resolv.conf
+if [ -z "$RESOLVERS" ]; then
   echo "no valid DNS found in $DNS_CONF_PRIVATE; using system /etc/resolv.conf"
   RESOLVERS=$(awk '/^nameserver/ {print $2}' "$DNS_CONF_SYSTEM" | tr '\n' ' ')
 fi
@@ -47,8 +52,9 @@ echo "detected DNS server list: $RESOLVERS"
 if [ -f "$TEMPLATE" ]; then
   echo "generating nginx.conf from template ..."
-  # merge Docker's built-in DNS so Compose service names keep resolving
+  # put 127.0.0.11 last so the bind server from /private/argus/etc/dns.conf is preferred
   if ! echo " $RESOLVERS " | grep -q " 127.0.0.11 "; then
-    RESOLVERS="127.0.0.11 ${RESOLVERS}"
+    RESOLVERS="${RESOLVERS} 127.0.0.11"
  fi
   sed "s|__RESOLVERS__|$RESOLVERS|" "$TEMPLATE" > "$TARGET"
 else
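Net effect: the rendered nginx `resolver` lists the bind server first and Docker's embedded DNS last. A sketch of the substitution under that assumption (the template line is hypothetical):

```bash
RESOLVERS="172.31.0.2"   # from /private/argus/etc/dns.conf
echo " $RESOLVERS " | grep -q " 127.0.0.11 " || RESOLVERS="${RESOLVERS} 127.0.0.11"

# hypothetical template line: resolver __RESOLVERS__;
echo 'resolver __RESOLVERS__;' | sed "s|__RESOLVERS__|$RESOLVERS|"
# prints: resolver 172.31.0.2 127.0.0.11;
```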