From 3202e02b4269cbbfc467c9bf04ab25535fea124c Mon Sep 17 00:00:00 2001
From: yuyr
Date: Fri, 31 Oct 2025 14:18:58 +0800
Subject: [PATCH] [#37] Verify that server deployment passes on NixOS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 deployment/build/build_client_package.sh      |   0
 deployment/build/build_server_package.sh      |  30 ++---
 deployment/build/common.sh                    |   0
 deployment/build/publish_client.sh            |   0
 .../build/templates/docker-compose.filter.awk |  42 +++---
 .../build/templates/docs/INSTALL_SERVER.md    |  48 +++++++
 .../build/templates/docs/INSTALL_SERVER_zh.md |  28 ++++
 .../build/templates/docs/TROUBLESHOOTING.md   |  21 +++
 .../templates/docs/TROUBLESHOOTING_zh.md      |  15 +++
 .../templates/scripts/server-diagnose.sh      | 117 ++++++++++++++++
 .../build/templates/scripts/server-install.sh | 126 +++++++++++++++++-
 .../templates/scripts/server-prepare-dirs.sh  |  73 ++++++++++
 .../templates/scripts/server-selfcheck.sh     |  25 +++-
 .../build/templates/scripts/server-status.sh  |   0
 .../templates/scripts/server-uninstall.sh     |   0
 doc/metric_lists.xlsx                         | Bin 0 -> 4279 bytes
 .../all-in-one-full/config/VERSION            |   2 +-
 .../proxy/start-proxy-supervised.sh           |  20 ++-
 18 files changed, 493 insertions(+), 54 deletions(-)
 mode change 100644 => 100755 deployment/build/build_client_package.sh
 mode change 100644 => 100755 deployment/build/build_server_package.sh
 mode change 100644 => 100755 deployment/build/common.sh
 mode change 100644 => 100755 deployment/build/publish_client.sh
 create mode 100644 deployment/build/templates/docs/INSTALL_SERVER.md
 create mode 100644 deployment/build/templates/docs/INSTALL_SERVER_zh.md
 create mode 100644 deployment/build/templates/docs/TROUBLESHOOTING.md
 create mode 100644 deployment/build/templates/docs/TROUBLESHOOTING_zh.md
 create mode 100755 deployment/build/templates/scripts/server-diagnose.sh
 mode change 100644 => 100755 deployment/build/templates/scripts/server-install.sh
 create mode 100755 deployment/build/templates/scripts/server-prepare-dirs.sh
 mode change 100644 => 100755 deployment/build/templates/scripts/server-selfcheck.sh
 mode change 100644 => 100755 deployment/build/templates/scripts/server-status.sh
 mode change 100644 => 100755 deployment/build/templates/scripts/server-uninstall.sh
 create mode 100644 doc/metric_lists.xlsx

diff --git a/deployment/build/build_client_package.sh b/deployment/build/build_client_package.sh
old mode 100644
new mode 100755
diff --git a/deployment/build/build_server_package.sh b/deployment/build/build_server_package.sh
old mode 100644
new mode 100755
index dc4a6a9..77a3249
--- a/deployment/build/build_server_package.sh
+++ b/deployment/build/build_server_package.sh
@@ -48,6 +48,17 @@
 SRC_COMPOSE="$ROOT_DIR/src/sys/tests/docker-compose.yml"
 [[ -f "$SRC_COMPOSE" ]] || { err "missing $SRC_COMPOSE"; exit 1; }
 awk -f "$BUILD_DIR/templates/docker-compose.filter.awk" -v remove="node-a,node-b,test-node,test-gpu-node" "$SRC_COMPOSE" > "$STAGE/compose/docker-compose.yml"
 cp "$BUILD_DIR/templates/.env.example" "$STAGE/compose/.env.example"
+# fix relative private path to match package layout (compose/ and private/ are siblings)
+sed -i 's#\./private/#../private/#g' "$STAGE/compose/docker-compose.yml"
+# also handle bind mount form without trailing slash
+sed -i -E 's#- \./private:/private#- ../private:/private#g' "$STAGE/compose/docker-compose.yml"
+# drop timezone file bind which may not exist on target distros (e.g. NixOS)
+sed -i '/\/etc\/timezone:\/etc\/timezone:ro/d' "$STAGE/compose/docker-compose.yml"
+
+# sanity-check: ensure test services are absent
+if grep -E '^(\s{2})(node-a|node-b|test-node|test-gpu-node):\s*$' -n "$STAGE/compose/docker-compose.yml" >/dev/null; then
+  err "compose filter failed: test services still present"; exit 1;
+fi
 
 # 3) Images (reuse if already exported unless --resave-image)
 existing_images_tar="$PKG_DIR/images/all-images.tar.gz"
@@ -76,24 +87,7 @@ fi
 
 # 4) Scripts & Docs
 copy_tree "$BUILD_DIR/templates/scripts" "$STAGE/scripts"
-cat > "$STAGE/docs/INSTALL_SERVER.md" << 'MD'
-# Argus Server Offline Installation
-
-## Prerequisites
-- Ubuntu 22.04 x86_64
-- Docker & Docker Compose installed
-- Open ports: 32300,9200,5601,9090,9093,8080..8085,21,20,21100-21110 (or auto-fallback to high ports)
-
-## Steps
-1. Extract this package to /opt/argus-deploy/versions/
-2. cd scripts && sudo ./server-install.sh
-3. Check status: ./server-status.sh
-4. Uninstall: ./server-uninstall.sh
-
-## Notes
-- Selfcheck result is written to logs/selfcheck.json
-- DNS will be managed by internal bind; FTP dns.conf is auto-published to share/dns.conf
-MD
+copy_tree "$BUILD_DIR/templates/docs" "$STAGE/docs"
 
 # 5) Manifests
 gen_manifest "$STAGE" "$STAGE/manifest.txt"
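The sanity check added above can also be run by hand against the staged output before the tarball is produced. A rough, illustrative sketch, assuming the same staging layout this script writes (the `STAGE` value below is a placeholder, not the script's actual variable):

```bash
# Illustrative spot-check of the staged compose file; STAGE is an assumed path.
STAGE="stage/server"
compose="$STAGE/compose/docker-compose.yml"

# test services must be gone, and private mounts should now point at ../private
! grep -E '^  (node-a|node-b|test-node|test-gpu-node):' "$compose" || echo "filter failed"
grep -n '\.\./private' "$compose" || echo "no ../private mounts found"

# optional: let compose validate the YAML if it is installed on the build host
docker compose --env-file "$STAGE/compose/.env.example" -f "$compose" config -q \
  && echo "compose file parses"
```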
diff --git a/deployment/build/common.sh b/deployment/build/common.sh
old mode 100644
new mode 100755
diff --git a/deployment/build/publish_client.sh b/deployment/build/publish_client.sh
old mode 100644
new mode 100755
diff --git a/deployment/build/templates/docker-compose.filter.awk b/deployment/build/templates/docker-compose.filter.awk
index ae2c7f4..72c6159 100644
--- a/deployment/build/templates/docker-compose.filter.awk
+++ b/deployment/build/templates/docker-compose.filter.awk
@@ -4,38 +4,38 @@
 BEGIN{
   split(remove, rm, ",");
-  for(i in rm) skipname[rm[i]] = 1;
+  for(i in rm){
+    gsub(/^\s+|\s+$/,"",rm[i]);
+    if (rm[i] != "") skipname[rm[i]] = 1;
+  }
+  in_services=0; skipping=0;
 }
 
-function starts_service_line(line, name) {
-  if (match(line, /^\s{2}([a-zA-Z0-9_-]+):\s*$/, m)) {
-    name = m[1];
-    return name;
-  }
+function service_header(line, m) {
+  # match exactly two leading spaces followed by name:
+  if (match(line, /^  ([A-Za-z0-9_-]+):[ ]*$/, m)) return m[1];
   return "";
 }
 
 {
-  name = starts_service_line($0);
-  if (name != "") {
-    # detect top-level keys (networks:, services:, etc.)
-    if ($0 ~ /^services:\s*$/) { in_services=1; print; next; }
-    if ($0 ~ /^[a-zA-Z0-9_-]+:\s*$/ && $0 !~ /^\s/) {
-      in_services= ($0 ~ /^services:\s*$/);
-    }
+  # Track top-level sections (no indentation)
+  if ($0 ~ /^[A-Za-z0-9_-]+:[ ]*$/ && $0 !~ /^\s/) {
+    in_services = ($0 ~ /^services:[ ]*$/) ? 1 : 0;
+  }
 
-    if (in_services && (name in skipname)) {
-      skipping=1; next;
+  if (skipping) {
+    # Stop skipping at next service header or another top-level section
+    if (service_header($0) != "" || ($0 ~ /^(networks|volumes):[ ]*$/ && $0 !~ /^\s/)) {
+      skipping=0;
+    } else {
+      next;
     }
   }
 
-  # end skipping when next top-level service appears
-  if (skipping) {
-    if (starts_service_line($0) != "") { skipping=0; }
-    else if ($0 ~ /^(networks|volumes):\s*$/) { skipping=0; }
-    else { next; }
+  if (in_services) {
+    name = service_header($0);
+    if (name != "" && (name in skipname)) { skipping=1; next; }
   }
 
   print;
 }
-
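One hedged observation and usage sketch: the rewritten `service_header()` uses the three-argument `match(line, regex, m)` form, which is a gawk extension, so the build host's awk is assumed to be gawk (or an awk that supports capture arrays). A minimal standalone check of the filter, with a made-up sample file:

```bash
# Hypothetical standalone test of the compose filter; assumes gawk is installed.
cat > /tmp/sample-compose.yml <<'YML'
services:
  master:
    image: argus-master
  test-node:
    image: argus-test
  web-proxy:
    image: argus-web-proxy
YML

gawk -f deployment/build/templates/docker-compose.filter.awk \
  -v remove="node-a,node-b,test-node,test-gpu-node" /tmp/sample-compose.yml
# expected: the test-node block disappears; master and web-proxy pass through unchanged
```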
diff --git a/deployment/build/templates/docs/INSTALL_SERVER.md b/deployment/build/templates/docs/INSTALL_SERVER.md
new file mode 100644
index 0000000..e72f5c8
--- /dev/null
+++ b/deployment/build/templates/docs/INSTALL_SERVER.md
@@ -0,0 +1,48 @@
+# Argus Server Offline Installation
+
+## Prerequisites
+- Linux x86_64 (Ubuntu 22.04 recommended; see OS Compatibility for NixOS)
+- Docker & Docker Compose installed
+- Open ports: 32300, 9200, 5601, 9090, 9093, 8080..8085, 21, 20, 21100-21110 (or auto-fallback to high ports)
+
+## Quick Start
+1. Extract to a target dir, e.g. `/opt/argus-deploy/versions/`
+2. `cd scripts && sudo ./server-prepare-dirs.sh`
+3. `./server-install.sh`
+4. `./server-status.sh`
+5. `./server-selfcheck.sh` (on failure it auto-runs diagnose and writes logs under `logs/`)
+6. `./server-uninstall.sh` to tear down
+
+## What the Installer Does
+- Loads local images (`images/all-images.tar.gz`)
+- Generates an OS-compat override (`security_opt: ["label=disable"]`, `userns_mode: host`, tmpfs at `/run/named` for bind)
+- Starts server-only services: bind/master/es/kibana/ftp/prometheus/grafana/alertmanager/web-frontend/web-proxy
+- DNS Bootstrap:
+  - Ensures `/private/argus/etc/dns.conf` exists (writes `172.31.0.2` if missing);
+  - Runs `/private/argus/etc/update-dns.sh` in dependent containers so `/etc/resolv.conf` points to bind;
+  - Waits for `*.argus.com` hint files, then reloads bind;
+  - Restarts web-proxy to re-render the nginx resolver from `dns.conf`;
+- Writes `logs/selfcheck.json` as the final summary
+
+## OS Compatibility
+- NixOS / non-xattr filesystems: containers run with `security_opt: ["label=disable"]` and `userns_mode: host`; ensure data dirs are pre-created via `sudo ./server-prepare-dirs.sh` and owned by the runtime UID:GID (default 1000:1000).
+
+## Files & Layout
+- `compose/` (docker-compose.yml, .env)
+- `private/` (data mounts)
+- `scripts/` (install/uninstall/status/selfcheck/diagnose)
+- `logs/` (selfcheck + diagnose outputs)
+
+## Troubleshooting (Quick)
+- Run `./server-selfcheck.sh` → see `logs/selfcheck.json`
+- Run `./server-diagnose.sh` → produces timestamped logs:
+  - `logs/diagnose_details_YYYYMMDD-HHMMSSZ.log`
+  - `logs/diagnose_error_YYYYMMDD-HHMMSSZ.log`
+  and updates `diagnose_details.log`/`diagnose_error.log` to point at the latest run
+- Error lines are tagged `[service][source]`, e.g. `[kibana][http] /api/status=503`
+
+Common issues:
+- Kibana 503: wait out the cold start, or fix DNS so `es.log.argus.com` resolves
+- web-proxy 504: check that the nginx `resolver` includes `172.31.0.2 127.0.0.11`
+- EACCES/locks: ensure `sudo ./server-prepare-dirs.sh` ran and ownership matches UID:GID
+
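For the Kibana 503 / DNS items above, a hedged manual probe looks like this (container names and the 172.31.0.2 bind address are taken from this package's compose file; `getent` is assumed to be available in the Kibana image):

```bash
# resolv.conf inside Kibana should point at the internal bind (172.31.0.2)
docker exec argus-kibana-sys cat /etc/resolv.conf

# the ES hostname should resolve from inside the Kibana container
docker exec argus-kibana-sys getent hosts es.log.argus.com

# ES itself should answer on the host-mapped port
curl -s "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health"
```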
diff --git a/deployment/build/templates/docs/INSTALL_SERVER_zh.md b/deployment/build/templates/docs/INSTALL_SERVER_zh.md
new file mode 100644
index 0000000..2f999a9
--- /dev/null
+++ b/deployment/build/templates/docs/INSTALL_SERVER_zh.md
@@ -0,0 +1,28 @@
+# Argus Server Offline Installation Guide
+
+## Prerequisites
+- Linux x86_64 (Ubuntu 22.04 recommended; for NixOS see "Compatibility Notes")
+- Docker and Docker Compose installed
+- Ports: 32300, 9200, 5601, 9090, 9093, 8080..8085, 21, 20, 21100-21110
+
+## Quick Start
+1. Extract to the target directory (e.g. `/opt/argus-deploy/versions/`)
+2. Enter `scripts/` and run `sudo ./server-prepare-dirs.sh`
+3. Install: `./server-install.sh`
+4. Status: `./server-status.sh`
+5. Selfcheck: `./server-selfcheck.sh` (diagnostics are collected automatically on failure)
+6. Uninstall: `./server-uninstall.sh`
+
+## Installation Highlights
+- Starts only the 10 server-side components (no test nodes);
+- DNS Bootstrap: fills in the first-deployment DNS prerequisites (create/confirm `dns.conf`, unify container resolv.conf, write the `*.argus.com` records, reload bind, restart web-proxy);
+- Writes the selfcheck result to `logs/selfcheck.json`.
+
+## Compatibility Notes (NixOS etc.)
+- Uses `security_opt: ["label=disable"]` and `userns_mode: host`;
+- Run `sudo ./server-prepare-dirs.sh` first to create the data directories and chown them to `1000:1000`;
+
+## Troubleshooting (see TROUBLESHOOTING_zh below)
+- `./server-selfcheck.sh` → `logs/selfcheck.json`
+- `./server-diagnose.sh` → `logs/diagnose_error_*.log` / `logs/diagnose_details_*.log`
+
diff --git a/deployment/build/templates/docs/TROUBLESHOOTING.md b/deployment/build/templates/docs/TROUBLESHOOTING.md
new file mode 100644
index 0000000..49b5375
--- /dev/null
+++ b/deployment/build/templates/docs/TROUBLESHOOTING.md
@@ -0,0 +1,21 @@
+# Troubleshooting
+
+- Status: `scripts/server-status.sh`
+- Selfcheck: `scripts/server-selfcheck.sh`
+- Diagnose: `scripts/server-diagnose.sh`
+
+Outputs:
+- `logs/selfcheck.json`
+- `logs/diagnose_details_*.log` (full details)
+- `logs/diagnose_error_*.log` (tagged errors)
+
+Web-Proxy:
+- 8083 expects 200/302/403; 8084/8085 must include the CORS header
+- nginx resolver should be `172.31.0.2 127.0.0.11`
+
+Kibana/ES:
+- Verify `es.log.argus.com` resolves inside Kibana
+
+Permissions:
+- Ensure `sudo ./server-prepare-dirs.sh` ran and ownership matches the runtime UID:GID
+
diff --git a/deployment/build/templates/docs/TROUBLESHOOTING_zh.md b/deployment/build/templates/docs/TROUBLESHOOTING_zh.md
new file mode 100644
index 0000000..e4d6b47
--- /dev/null
+++ b/deployment/build/templates/docs/TROUBLESHOOTING_zh.md
@@ -0,0 +1,15 @@
+# Troubleshooting
+
+- Status: `scripts/server-status.sh`
+- Selfcheck: `scripts/server-selfcheck.sh`
+- Diagnose: `scripts/server-diagnose.sh`
+
+Outputs:
+- `logs/selfcheck.json`
+- `logs/diagnose_error_*.log` (error summary)
+- `logs/diagnose_details_*.log` (full details)
+
+Web-Proxy: 8083 = 200/302/403; 8084/8085 must include the CORS header
+Kibana: confirm `es.log.argus.com` resolves
+Permissions: run `sudo ./server-prepare-dirs.sh` first
+
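The CORS requirement called out in both troubleshooting docs can be checked from the host with the same curl form the scripts use; the ports and Origin value below are the defaults and may differ in your `.env`:

```bash
# Each response should include an Access-Control-Allow-Origin header.
curl -s -D - -o /dev/null -H "Origin: http://localhost:8080" \
  "http://localhost:8084/api/v2/status" | grep -i '^access-control-allow-origin'
curl -s -D - -o /dev/null -H "Origin: http://localhost:8080" \
  "http://localhost:8085/api/v1/master/nodes" | grep -i '^access-control-allow-origin'
```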
&& pwd)" + +ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a + +mkdir -p "$ROOT/logs" +ts="$(date -u +%Y%m%d-%H%M%SZ)" +DETAILS="$ROOT/logs/diagnose_details_${ts}.log" +ERRORS="$ROOT/logs/diagnose_error_${ts}.log" +: > "$DETAILS"; : > "$ERRORS" + +logd() { echo "$(date '+%F %T') $*" >> "$DETAILS"; } +append_err() { echo "$*" >> "$ERRORS"; } + +http_code() { curl -s -o /dev/null -w "%{http_code}" "$1" || echo 000; } +http_body_head() { curl -s --max-time 3 "$1" 2>/dev/null | sed -n '1,5p' || true; } +header_val() { curl -s -D - -o /dev/null "$@" | awk -F': ' 'BEGIN{IGNORECASE=1}$1=="Access-Control-Allow-Origin"{gsub("\r","",$2);print $2}'; } + +section() { + local name="$1"; logd "===== [$name] ====="; } + +svc() { + local svc_name="$1"; local cname="$2"; shift 2 + section "$svc_name ($cname)" + logd "docker ps:"; docker ps -a --format '{{.Names}}\t{{.Status}}\t{{.Image}}' | awk -v n="$cname" '$1==n' >> "$DETAILS" || true + logd "docker inspect (state/restartcount):"; docker inspect -f '{{.State.Status}} rc={{.RestartCount}} started={{.State.StartedAt}}' "$cname" >> "$DETAILS" 2>&1 || true + logd "last 200 container logs:"; docker logs --tail 200 "$cname" >> "$DETAILS" 2>&1 || true + + # extract error lines from container logs + docker logs --tail 200 "$cname" 2>&1 | \ + grep -Ei '\b(error|failed|fail|exception|panic|fatal|critical|unhealthy|permission denied|forbidden|refused|traceback|错误|失败)\b' | \ + sed "s/^/[${svc_name}][container] /" >> "$ERRORS" || true + + # supervisor status and logs + if docker exec "$cname" sh -lc 'command -v supervisorctl >/dev/null 2>&1' >/dev/null 2>&1; then + logd "supervisorctl status:"; docker exec "$cname" sh -lc 'supervisorctl status' >> "$DETAILS" 2>&1 || true + # iterate supervisor logs and collect tails + errors per file + local files + files=$(docker exec "$cname" sh -lc 'ls /var/log/supervisor/*.log 2>/dev/null' || true) + for f in $files; do + logd "tail -n 80 $f:"; docker exec "$cname" sh -lc "tail -n 80 $f 2>/dev/null || true" >> "$DETAILS" 2>&1 || true + docker exec "$cname" sh -lc "tail -n 200 $f 2>/dev/null" 2>/dev/null | \ + grep -Ei '\b(error|failed|fail|exception|panic|fatal|critical|unhealthy|permission denied|forbidden|refused|traceback|错误|失败)\b' | \ + sed "s/^/[${svc_name}][supervisor:$(basename $f)] /" >> "$ERRORS" || true + done + fi +} + +# Core services +svc bind argus-bind-sys +svc master argus-master-sys +svc es argus-es-sys +svc kibana argus-kibana-sys +svc ftp argus-ftp +svc prometheus argus-prometheus +svc grafana argus-grafana +svc alertmanager argus-alertmanager +svc web-frontend argus-web-frontend +svc web-proxy argus-web-proxy + +# HTTP checks (host side) +section HTTP +logd "ES: $(http_code "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health")" +http_body_head "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health" >> "$DETAILS" 2>&1 || true + +logd "Kibana: $(http_code "http://localhost:${KIBANA_PORT:-5601}/api/status")" +http_body_head "http://localhost:${KIBANA_PORT:-5601}/api/status" >> "$DETAILS" 2>&1 || true + +logd "Master readyz: $(http_code "http://localhost:${MASTER_PORT:-32300}/readyz")" + +logd "Prometheus: $(http_code "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready")" +logd "Grafana: $(http_code "http://localhost:${GRAFANA_PORT:-3000}/api/health")" +http_body_head "http://localhost:${GRAFANA_PORT:-3000}/api/health" >> "$DETAILS" 2>&1 || true +logd "Alertmanager: $(http_code "http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status")" + +cors8084=$(header_val 
-H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8084:-8084}/api/v2/status" || true) +cors8085=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8085:-8085}/api/v1/master/nodes" || true) +logd "Web-Proxy 8080: $(http_code "http://localhost:${WEB_PROXY_PORT_8080:-8080}/")" +logd "Web-Proxy 8083: $(http_code "http://localhost:${WEB_PROXY_PORT_8083:-8083}/")" +logd "Web-Proxy 8084 CORS: ${cors8084}" +logd "Web-Proxy 8085 CORS: ${cors8085}" + +# FTP share writability (container perspective) +section FTP-SHARE +docker exec argus-ftp sh -lc 'ls -ld /private/argus/ftp /private/argus/ftp/share; test -w /private/argus/ftp/share && echo "write:OK" || echo "write:FAIL"' >> "$DETAILS" 2>&1 || true + +# Collect system info for context +section SYSTEM +logd "uname -a:"; uname -a >> "$DETAILS" +logd "docker version:"; docker version --format '{{.Server.Version}}' >> "$DETAILS" 2>&1 || true +logd "compose ps:"; (cd "$ROOT/compose" && if docker compose version >/dev/null 2>&1; then docker compose -p argus-sys ps; else docker-compose -p argus-sys ps; fi) >> "$DETAILS" 2>&1 || true + +section SUMMARY +# Add HTTP failures and CORS problems to error log with tags +[[ $(http_code "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health") != 200 ]] && echo "[es][http] health not 200" >> "$ERRORS" +kbcode=$(http_code "http://localhost:${KIBANA_PORT:-5601}/api/status"); [[ "$kbcode" != 200 ]] && echo "[kibana][http] /api/status=$kbcode" >> "$ERRORS" +[[ $(http_code "http://localhost:${MASTER_PORT:-32300}/readyz") != 200 ]] && echo "[master][http] /readyz not 200" >> "$ERRORS" +[[ $(http_code "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready") != 200 ]] && echo "[prometheus][http] /-/ready not 200" >> "$ERRORS" +gfcode=$(http_code "http://localhost:${GRAFANA_PORT:-3000}/api/health"); [[ "$gfcode" != 200 ]] && echo "[grafana][http] /api/health=$gfcode" >> "$ERRORS" +[[ $(http_code "http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status") != 200 ]] && echo "[alertmanager][http] /api/v2/status not 200" >> "$ERRORS" +[[ -z "$cors8084" ]] && echo "[web-proxy][cors] 8084 missing Access-Control-Allow-Origin" >> "$ERRORS" +[[ -z "$cors8085" ]] && echo "[web-proxy][cors] 8085 missing Access-Control-Allow-Origin" >> "$ERRORS" + +# Deduplicate errors +sort -u -o "$ERRORS" "$ERRORS" + +echo "Diagnostic details -> $DETAILS" +echo "Detected errors -> $ERRORS" + +# maintain latest symlinks for convenience +ln -sfn "$(basename "$DETAILS")" "$ROOT/logs/diagnose_details.log" 2>/dev/null || cp "$DETAILS" "$ROOT/logs/diagnose_details.log" 2>/dev/null || true +ln -sfn "$(basename "$ERRORS")" "$ROOT/logs/diagnose_error.log" 2>/dev/null || cp "$ERRORS" "$ROOT/logs/diagnose_error.log" 2>/dev/null || true + +exit 0 diff --git a/deployment/build/templates/scripts/server-install.sh b/deployment/build/templates/scripts/server-install.sh old mode 100644 new mode 100755 index 8605031..a9b9fef --- a/deployment/build/templates/scripts/server-install.sh +++ b/deployment/build/templates/scripts/server-install.sh @@ -43,6 +43,29 @@ prepare_env() { done } +prepare_data_dirs() { + if [[ $EUID -ne 0 ]]; then + echo -e "\033[1;33m[WARN]\033[0m running as non-root: will not chown data dirs." 
diff --git a/deployment/build/templates/scripts/server-install.sh b/deployment/build/templates/scripts/server-install.sh
old mode 100644
new mode 100755
index 8605031..a9b9fef
--- a/deployment/build/templates/scripts/server-install.sh
+++ b/deployment/build/templates/scripts/server-install.sh
@@ -43,6 +43,29 @@ prepare_env() {
   done
 }
 
+prepare_data_dirs() {
+  if [[ $EUID -ne 0 ]]; then
+    echo -e "\033[1;33m[WARN]\033[0m running as non-root: will not chown data dirs."
+    echo -e "\033[1;33m[WARN]\033[0m If you hit Permission denied, run: sudo $SCRIPT_DIR/server-prepare-dirs.sh"
+    # still ensure basic directories exist (no chown)
+    mkdir -p \
+      "$PKG_ROOT/private/argus/etc" \
+      "$PKG_ROOT/private/argus/metric/prometheus" \
+      "$PKG_ROOT/private/argus/metric/prometheus/data" \
+      "$PKG_ROOT/private/argus/metric/prometheus/rules" \
+      "$PKG_ROOT/private/argus/metric/grafana" \
+      "$PKG_ROOT/private/argus/metric/grafana/data" \
+      "$PKG_ROOT/private/argus/metric/grafana/logs" \
+      "$PKG_ROOT/private/argus/metric/grafana/plugins" \
+      "$PKG_ROOT/private/argus/metric/grafana/provisioning/datasources" \
+      "$PKG_ROOT/private/argus/metric/grafana/provisioning/dashboards" \
+      "$PKG_ROOT/private/argus/metric/grafana/data/sessions" \
+      "$PKG_ROOT/private/argus/metric/grafana/data/dashboards" \
+      "$PKG_ROOT/private/argus/alert/alertmanager" \
+      "$PKG_ROOT/private/argus/metric/ftp/share"
+  fi
+}
+
 load_images() {
   local tar="$PKG_ROOT/images/all-images.tar.gz"
   [[ -f "$tar" ]] || { err "missing images tar: $tar"; exit 1; }
@@ -52,7 +75,105 @@
 bring_up() {
   log "starting services via compose"
-  (cd "$PKG_ROOT/compose" && "${COMPOSE[@]}" -p "$PROJECT_NAME" up -d)
+  local ov="$PKG_ROOT/compose/docker-compose.os-compat.override.yml"
+  if [[ ! -f "$ov" ]]; then
+    cat > "$ov" <<'YAML'
+services:
+  bind:
+    security_opt: ["label=disable"]
+    userns_mode: "host"
+    tmpfs:
+      - /run/named
+  master:
+    security_opt: ["label=disable"]
+    userns_mode: "host"
+  es:
+    security_opt: ["label=disable"]
+    userns_mode: "host"
+  kibana:
+    security_opt: ["label=disable"]
+    userns_mode: "host"
+  ftp:
+    security_opt: ["label=disable"]
+    userns_mode: "host"
+  prometheus:
+    security_opt: ["label=disable"]
+    userns_mode: "host"
+  grafana:
+    security_opt: ["label=disable"]
+    userns_mode: "host"
+  alertmanager:
+    security_opt: ["label=disable"]
+    userns_mode: "host"
+    # ensure runtime path matches container expectation
+    volumes:
+      - ../private/argus/etc:/private/argus/etc
+      - ../private/argus/alert/alertmanager:/alertmanager
+  web-frontend:
+    security_opt: ["label=disable"]
+    userns_mode: "host"
+  web-proxy:
+    security_opt: ["label=disable"]
+    userns_mode: "host"
+YAML
+    log "generated OS-compat override: $(basename "$ov")"
+  fi
+  # start only the server-side components; avoid accidentally starting the test nodes (node-a/node-b/test-node/test-gpu-node)
+  local services=(bind master es kibana ftp prometheus grafana alertmanager web-frontend web-proxy)
+  log "services: ${services[*]}"
+  (cd "$PKG_ROOT/compose" && "${COMPOSE[@]}" -p "$PROJECT_NAME" -f docker-compose.yml -f $(basename "$ov") up -d "${services[@]}")
+}
-x "$etc_dir/update-dns.sh" && $i -lt 20 ]]; do + sleep 0.5; ((i++)); + done + if [[ ! -x "$etc_dir/update-dns.sh" ]]; then + log "update-dns.sh not present yet; continuing with existing resolv.conf" + fi + # 3) run update-dns.sh inside key containers so /etc/resolv.conf points to bind + local c + for c in argus-master-sys argus-es-sys argus-kibana-sys argus-grafana argus-prometheus argus-ftp argus-web-frontend argus-web-proxy argus-alertmanager; do + if docker ps --format '{{.Names}}' | grep -q "^${c}$"; then + docker exec "$c" sh -lc 'test -x /private/argus/etc/update-dns.sh && /private/argus/etc/update-dns.sh || true' >/dev/null 2>&1 || true + fi + done + # 4) wait for service A-record hint files generated by services (best-effort) + local need=( es.log.argus.com kibana.log.argus.com master.argus.com grafana.metric.argus.com prom.metric.argus.com alertmanager.alert.argus.com ) + local waited=0; local missing=1 + while (( waited < 15 )); do + missing=0 + for f in "${need[@]}"; do [[ -s "$etc_dir/$f" ]] || { missing=1; break; }; done + [[ $missing -eq 0 ]] && break + sleep 1; ((waited++)) + done + # 5) reload bind zone (script uses supervisor to restart bind9) + if docker ps --format '{{.Names}}' | grep -q '^argus-bind-sys$'; then + docker exec argus-bind-sys sh -lc '/usr/local/bin/reload-bind9.sh' >/dev/null 2>&1 || true + fi + # 6) restart web-proxy once to re-render nginx resolver with latest dns.conf + if docker ps --format '{{.Names}}' | grep -q '^argus-web-proxy$'; then + docker restart argus-web-proxy >/dev/null 2>&1 || true + fi } selfcheck() { @@ -62,11 +183,12 @@ selfcheck() { main() { prepare_env + prepare_data_dirs load_images bring_up + dns_bootstrap selfcheck log "install completed. See logs in $PKG_ROOT/logs/" } main "$@" - diff --git a/deployment/build/templates/scripts/server-prepare-dirs.sh b/deployment/build/templates/scripts/server-prepare-dirs.sh new file mode 100755 index 0000000..3be214d --- /dev/null +++ b/deployment/build/templates/scripts/server-prepare-dirs.sh @@ -0,0 +1,73 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PKG_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +if [[ $EUID -ne 0 ]]; then + echo "[PREPARE] This script requires root (sudo)." 
diff --git a/deployment/build/templates/scripts/server-prepare-dirs.sh b/deployment/build/templates/scripts/server-prepare-dirs.sh
new file mode 100755
index 0000000..3be214d
--- /dev/null
+++ b/deployment/build/templates/scripts/server-prepare-dirs.sh
@@ -0,0 +1,73 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PKG_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+
+if [[ $EUID -ne 0 ]]; then
+  echo "[PREPARE] This script requires root (sudo)." >&2
+  echo "          Try: sudo $0" >&2
+  exit 1
+fi
+
+ENV_FILE="$PKG_ROOT/compose/.env"
+[[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a
+UIDV="${ARGUS_BUILD_UID:-1000}"; GIDV="${ARGUS_BUILD_GID:-1000}"
+
+echo "[PREPARE] Using owner ${UIDV}:${GIDV}"
+
+# Core etc and service data dirs (aligned with src/sys/tests/scripts/01_bootstrap.sh)
+mkdir -p \
+  "$PKG_ROOT/private/argus/etc" \
+  "$PKG_ROOT/private/argus/bind" \
+  "$PKG_ROOT/private/argus/master" \
+  "$PKG_ROOT/private/argus/agent" \
+  "$PKG_ROOT/private/argus/log/elasticsearch" \
+  "$PKG_ROOT/private/argus/log/kibana"
+
+# Prometheus
+mkdir -p \
+  "$PKG_ROOT/private/argus/metric/prometheus" \
+  "$PKG_ROOT/private/argus/metric/prometheus/data" \
+  "$PKG_ROOT/private/argus/metric/prometheus/rules" \
+  "$PKG_ROOT/private/argus/metric/prometheus/targets"
+
+# Grafana
+mkdir -p \
+  "$PKG_ROOT/private/argus/metric/grafana" \
+  "$PKG_ROOT/private/argus/metric/grafana/data" \
+  "$PKG_ROOT/private/argus/metric/grafana/logs" \
+  "$PKG_ROOT/private/argus/metric/grafana/plugins" \
+  "$PKG_ROOT/private/argus/metric/grafana/provisioning/datasources" \
+  "$PKG_ROOT/private/argus/metric/grafana/provisioning/dashboards" \
+  "$PKG_ROOT/private/argus/metric/grafana/data/sessions" \
+  "$PKG_ROOT/private/argus/metric/grafana/data/dashboards" \
+  "$PKG_ROOT/private/argus/metric/grafana/config"
+
+# FTP
+mkdir -p "$PKG_ROOT/private/argus/metric/ftp/share"
+
+# Alertmanager
+mkdir -p "$PKG_ROOT/private/argus/alert/alertmanager"
+
+chown -R "$UIDV":"$GIDV" \
+  "$PKG_ROOT/private/argus/etc" \
+  "$PKG_ROOT/private/argus/bind" \
+  "$PKG_ROOT/private/argus/master" \
+  "$PKG_ROOT/private/argus/agent" \
+  "$PKG_ROOT/private/argus/log/elasticsearch" \
+  "$PKG_ROOT/private/argus/log/kibana" \
+  "$PKG_ROOT/private/argus/metric/prometheus" \
+  "$PKG_ROOT/private/argus/metric/grafana" \
+  "$PKG_ROOT/private/argus/metric/ftp" \
+  "$PKG_ROOT/private/argus/alert"
+
+chmod -R g+w "$PKG_ROOT/private/argus/alert" "$PKG_ROOT/private/argus/etc" || true
+
+# Ensure parent directories also owned by runtime user for consistency
+chown "$UIDV":"$GIDV" \
+  "$PKG_ROOT/private/argus" \
+  "$PKG_ROOT/private/argus/log" \
+  "$PKG_ROOT/private/argus/metric" || true
+
+echo "[PREPARE] Done. You can now run server-install.sh"
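A quick, hedged way to confirm the prepared tree has the intended owner before running the installer (UID/GID fall back to 1000:1000, matching the script's defaults):

```bash
# Print anything under private/argus not owned by the runtime UID/GID; no output is good.
find private/argus \( ! -uid "${ARGUS_BUILD_UID:-1000}" -o ! -gid "${ARGUS_BUILD_GID:-1000}" \) -print | head
```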
diff --git a/deployment/build/templates/scripts/server-selfcheck.sh b/deployment/build/templates/scripts/server-selfcheck.sh
old mode 100644
new mode 100755
index 145f709..2d82829
--- a/deployment/build/templates/scripts/server-selfcheck.sh
+++ b/deployment/build/templates/scripts/server-selfcheck.sh
@@ -32,7 +32,11 @@ log "checking Master"
 wait_http "http://localhost:${MASTER_PORT:-32300}/readyz" 60 || ok=0
 
 log "checking FTP"
-ftp_root="$ROOT/private/argus/metric/ftp/share"; [[ -d "$ftp_root" && -w "$ftp_root" ]] && ftp_ok=true || { ftp_ok=false; ok=0; }
+if docker ps --format '{{.Names}}' | grep -q '^argus-ftp$'; then
+  if docker exec argus-ftp sh -lc 'test -w /private/argus/ftp/share'; then ftp_ok=true; else ftp_ok=false; ok=0; fi
+else
+  ftp_ok=false; ok=0;
+fi
 
 log "checking Prometheus"
 wait_http "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready" 60 || ok=0
@@ -51,8 +55,9 @@ p8083=$(code_for "http://localhost:${WEB_PROXY_PORT_8083:-8083}/")
 cors8084=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8084:-8084}/api/v2/status" || true)
 cors8085=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8085:-8085}/api/v1/master/nodes" || true)
 wp_ok=true
-[[ "$p8080" == 200 ]] || wp_ok=false
-([[ "$p8083" == 200 || "$p8083" == 302 ]]) || wp_ok=false
+# some environments return 403 for the front page; accept 200/403 here
+([[ "$p8080" == 200 || "$p8080" == 403 ]]) || wp_ok=false
+([[ "$p8083" == 200 || "$p8083" == 302 || "$p8083" == 403 ]]) || wp_ok=false
 [[ -n "$cors8084" && -n "$cors8085" ]] || wp_ok=false
 [[ "$wp_ok" == true ]] || ok=0
 
@@ -71,5 +76,15 @@ cat > "$tmp" <
zSl&(X3cn1ywk+BEYs)^)^h;;N`ri(dIR5^p z$h`GJCIx4#gD0)J@GkG_lC9yF#Uy_-$klb$@Fhwb99%LnNWybF-;4#x8PV6{M3!w^ zq}%v0b;I14kD_VqR~0|leAL|CJ@@(BKmU%q|Ff=roPAk<>&U`gxv4HYtyQ^>1m*f} zS2KFFcei{wZzoe~*HX>4+=)FADj|C&zV$eNDGOtz9lkc`j4@x+I8s(#jXpX1o<>DS7-&w^j3DXu;bX#!bRJ zDruLqemq{L!=-u}%rq)*8;gsZ+$du(c1U= z%@W_4OJ^LcGxH3DADA0i3Kg(pl!PMWryGUYhg+G;Mg_+^Qf z_`Wl9^_lHkH)#EP-~I1eTJHW=cK27<{?H3p{r6|{)rFVSUN5|CQ+z-&-8ASz=%<6* zA5G^xyzO&wU(&fi^{H>()s*u&G9|pVWRd7&k#J*`Xks~VXmUc@k$ zzZCc5-C$KH#gH5q%$CE%C;24gvd5AD#WqLj#-rxbG6cn<7A7oT>7qZu!tg@UzmLxQ zcfDQv_oc1X0}+J>J%3wEX5Pk-}Zpli!Lsx<8osq-p!%Jr`sxS-qN$ZeKIe zvdh(X?$NBKs_kuRc|Q|t6DNiXuMulkyRhfF(9+(`J)%ors-|A?IkjM>%VX*8GuGR@ ziqCCHeR+*#1=F-R>y=l^!uPfwy>)Wh|B|D>CBNPObNA=Hi5Hq|L}vdBa^qdbBjscA z?~v88o&xC%1y`ND`yxxr>wA};xIcIDqyHP`6kRSpwc_Oz$IG3^{GU5!MeNwWRlM$N z<%h}9arX6lzlirXI6k@ z=|ZCyP8WY1%agdy>(o6((AfQJ{;fUh7pLy|dMWekf%?d=tF7PNyM2Ot@B3`)TMCW* zR@Q!zhVpW2&YbDLY`NP(##yuLbL{luW3ociCr?>-afNY#=z<$JvhJLk9I+_o?Z|zs-k#9qTqb_Fd|5!kqPXDGDWP6uE45V}GQ~own%x!<&8e zi~-(^Od`y<`^5}U(7*^{VCf~J8-U(B1Ze}|21aW}l)fUmM)aN+ND~M*FiJ3C^uy3K zqjyq38bP>$QHvR@8Aq1|-5m6mKgbLaZeWyU0h@!hAAoKOdbI*F0fZYE*|@Np0 "$TARGET" else