diff --git a/deployment/build/build_client_package.sh b/deployment/build/build_client_package.sh
old mode 100644
new mode 100755
diff --git a/deployment/build/build_server_package.sh b/deployment/build/build_server_package.sh
old mode 100644
new mode 100755
index dc4a6a9..77a3249
--- a/deployment/build/build_server_package.sh
+++ b/deployment/build/build_server_package.sh
@@ -48,6 +48,17 @@ SRC_COMPOSE="$ROOT_DIR/src/sys/tests/docker-compose.yml"
 [[ -f "$SRC_COMPOSE" ]] || { err "missing $SRC_COMPOSE"; exit 1; }
 awk -f "$BUILD_DIR/templates/docker-compose.filter.awk" -v remove="node-a,node-b,test-node,test-gpu-node" "$SRC_COMPOSE" > "$STAGE/compose/docker-compose.yml"
 cp "$BUILD_DIR/templates/.env.example" "$STAGE/compose/.env.example"
+# fix relative private path to match the package layout (compose/ and private/ are siblings)
+sed -i 's#\./private/#../private/#g' "$STAGE/compose/docker-compose.yml"
+# also handle the bind-mount form without a trailing slash
+sed -i -E 's#- \./private:/private#- ../private:/private#g' "$STAGE/compose/docker-compose.yml"
+# drop the timezone file bind, which may not exist on target distros (e.g. NixOS)
+sed -i '/\/etc\/timezone:\/etc\/timezone:ro/d' "$STAGE/compose/docker-compose.yml"
+
+# sanity check: ensure test services are absent
+if grep -E '^  (node-a|node-b|test-node|test-gpu-node):[[:space:]]*$' "$STAGE/compose/docker-compose.yml" >/dev/null; then
+  err "compose filter failed: test services still present"; exit 1
+fi

 # 3) Images (reuse if already exported unless --resave-image)
 existing_images_tar="$PKG_DIR/images/all-images.tar.gz"
@@ -76,24 +87,7 @@ fi

 # 4) Scripts & Docs
 copy_tree "$BUILD_DIR/templates/scripts" "$STAGE/scripts"
-cat > "$STAGE/docs/INSTALL_SERVER.md" << 'MD'
-# Argus Server Offline Installation
-
-## Prerequisites
-- Ubuntu 22.04 x86_64
-- Docker & Docker Compose installed
-- Open ports: 32300,9200,5601,9090,9093,8080..8085,21,20,21100-21110 (or auto-fallback to high ports)
-
-## Steps
-1. Extract this package to /opt/argus-deploy/versions/
-2. cd scripts && sudo ./server-install.sh
-3. Check status: ./server-status.sh
-4. Uninstall: ./server-uninstall.sh
-
-## Notes
-- Selfcheck result is written to logs/selfcheck.json
-- DNS will be managed by internal bind; FTP dns.conf is auto-published to share/dns.conf
-MD
+copy_tree "$BUILD_DIR/templates/docs" "$STAGE/docs"

 # 5) Manifests
 gen_manifest "$STAGE" "$STAGE/manifest.txt"
diff --git a/deployment/build/common.sh b/deployment/build/common.sh
old mode 100644
new mode 100755
diff --git a/deployment/build/publish_client.sh b/deployment/build/publish_client.sh
old mode 100644
new mode 100755
diff --git a/deployment/build/templates/docker-compose.filter.awk b/deployment/build/templates/docker-compose.filter.awk
index ae2c7f4..72c6159
--- a/deployment/build/templates/docker-compose.filter.awk
+++ b/deployment/build/templates/docker-compose.filter.awk
@@ -4,38 +4,38 @@
 BEGIN{
   split(remove, rm, ",");
-  for(i in rm) skipname[rm[i]] = 1;
+  for(i in rm){
+    gsub(/^\s+|\s+$/,"",rm[i]);
+    if (rm[i] != "") skipname[rm[i]] = 1;
+  }
+  in_services=0; skipping=0;
 }
-function starts_service_line(line, name) {
-  if (match(line, /^\s{2}([a-zA-Z0-9_-]+):\s*$/, m)) {
-    name = m[1];
-    return name;
-  }
+function service_header(line, m) {
+  # match exactly two leading spaces followed by "name:"
+  if (match(line, /^  ([A-Za-z0-9_-]+):[ ]*$/, m)) return m[1];
   return "";
 }
 {
-  name = starts_service_line($0);
-  if (name != "") {
-    # detect top-level keys (networks:, services:, etc.)
-    if ($0 ~ /^services:\s*$/) { in_services=1; print; next; }
-    if ($0 ~ /^[a-zA-Z0-9_-]+:\s*$/ && $0 !~ /^\s/) {
-      in_services= ($0 ~ /^services:\s*$/);
-    }
+  # Track top-level sections (no indentation)
+  if ($0 ~ /^[A-Za-z0-9_-]+:[ ]*$/ && $0 !~ /^\s/) {
+    in_services = ($0 ~ /^services:[ ]*$/) ? 1 : 0;
+  }

-    if (in_services && (name in skipname)) {
-      skipping=1; next;
+  if (skipping) {
+    # Stop skipping at the next service header or another top-level section
+    if (service_header($0) != "" || ($0 ~ /^(networks|volumes):[ ]*$/ && $0 !~ /^\s/)) {
+      skipping=0;
+    } else {
+      next;
     }
   }

-  # end skipping when next top-level service appears
-  if (skipping) {
-    if (starts_service_line($0) != "") { skipping=0; }
-    else if ($0 ~ /^(networks|volumes):\s*$/) { skipping=0; }
-    else { next; }
+  if (in_services) {
+    name = service_header($0);
+    if (name != "" && (name in skipname)) { skipping=1; next; }
   }
   print;
 }
-
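The rewritten filter can be sanity-checked in isolation. A minimal smoke-test sketch: it assumes `gawk` (the three-argument `match(line, re, m)` form is a gawk extension), and the temp path and service names are illustrative.

```bash
# Minimal smoke test for docker-compose.filter.awk (requires gawk).
cat > /tmp/compose-in.yml <<'YML'
services:
  bind:
    image: argus-bind
  node-a:
    image: argus-node
networks:
  default: {}
YML
gawk -f deployment/build/templates/docker-compose.filter.awk \
     -v remove="node-a,node-b,test-node,test-gpu-node" /tmp/compose-in.yml
# Expected: the bind service and the networks section survive; the node-a block is gone.
```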
diff --git a/deployment/build/templates/docs/INSTALL_SERVER.md b/deployment/build/templates/docs/INSTALL_SERVER.md
new file mode 100644
index 0000000..e72f5c8
--- /dev/null
+++ b/deployment/build/templates/docs/INSTALL_SERVER.md
@@ -0,0 +1,48 @@
+# Argus Server Offline Installation
+
+## Prerequisites
+- Linux x86_64 (Ubuntu 22.04 recommended; see "OS Compatibility" for NixOS)
+- Docker & Docker Compose installed
+- Open ports: 32300, 9200, 5601, 9090, 9093, 8080..8085, 21, 20, 21100-21110 (or auto-fallback to high ports)
+
+## Quick Start
+1. Extract to a target dir, e.g. `/opt/argus-deploy/versions/`
+2. `cd scripts && sudo ./server-prepare-dirs.sh`
+3. `./server-install.sh`
+4. `./server-status.sh`
+5. `./server-selfcheck.sh` (on failure it auto-runs diagnose and writes logs under `logs/`)
+6. `./server-uninstall.sh` to tear down
+
+## What the Installer Does
+- Loads local images (`images/all-images.tar.gz`)
+- Generates an OS-compat override (`security_opt: ["label=disable"]`, `userns_mode: host`, bind mounts `tmpfs:/run/named`)
+- Starts server-only services: bind/master/es/kibana/ftp/prometheus/grafana/alertmanager/web-frontend/web-proxy
+- DNS bootstrap:
+  - Ensure `/private/argus/etc/dns.conf` exists (write `172.31.0.2` if missing);
+  - Run `/private/argus/etc/update-dns.sh` in dependent containers so `/etc/resolv.conf` points to bind;
+  - Wait for `*.argus.com` hint files, then reload bind;
+  - Restart web-proxy to re-render the nginx resolver from `dns.conf`
+- Writes `logs/selfcheck.json` as the final summary
+
+## OS Compatibility
+- NixOS / non-xattr filesystems: containers run with `security_opt: ["label=disable"]` and `userns_mode: host`; ensure data dirs are pre-created via `sudo ./server-prepare-dirs.sh` and owned by the runtime UID:GID (default 1000:1000).
+
+## Files & Layout
+- `compose/` (docker-compose.yml, .env)
+- `private/` (data mounts)
+- `scripts/` (install/uninstall/status/selfcheck/diagnose)
+- `logs/` (selfcheck + diagnose outputs)
+
+## Troubleshooting (Quick)
+- Run `./server-selfcheck.sh` → see `logs/selfcheck.json`
+- Run `./server-diagnose.sh` → produces timestamped logs:
+  - `logs/diagnose_details_YYYYMMDD-HHMMSSZ.log`
+  - `logs/diagnose_error_YYYYMMDD-HHMMSSZ.log`
+  and refreshes `diagnose_details.log`/`diagnose_error.log` to point at the latest run
+- Error lines are tagged `[service][source]`, e.g. `[kibana][http] /api/status=503`
+
+Common issues:
+- Kibana 503: wait out the cold start, or fix DNS so `es.log.argus.com` resolves
+- web-proxy 504: check that the nginx `resolver` includes `172.31.0.2 127.0.0.11`
+- EACCES/lock errors: ensure `sudo ./server-prepare-dirs.sh` was run and ownership matches the runtime UID:GID
+
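Since `logs/selfcheck.json` is the machine-readable summary this doc points to, it is also a natural automation hook. A minimal sketch assuming `jq` on the host; the top-level `ok` field is an assumption inferred from the selfcheck script's `ok` flag, so check the generated file for the real key names.

```bash
# Hypothetical gate on the selfcheck summary (field name "ok" is assumed).
if jq -e '.ok == true' logs/selfcheck.json >/dev/null 2>&1; then
  echo "selfcheck passed"
else
  echo "selfcheck failed; see logs/diagnose_error.log" >&2
fi
```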
diff --git a/deployment/build/templates/docs/INSTALL_SERVER_zh.md b/deployment/build/templates/docs/INSTALL_SERVER_zh.md
new file mode 100644
index 0000000..2f999a9
--- /dev/null
+++ b/deployment/build/templates/docs/INSTALL_SERVER_zh.md
@@ -0,0 +1,28 @@
+# Argus 服务端离线安装指南
+
+## 先决条件
+- Linux x86_64(推荐 Ubuntu 22.04;NixOS 见“兼容说明”)
+- 已安装 Docker 与 Docker Compose
+- 端口:32300, 9200, 5601, 9090, 9093, 8080..8085, 21, 20, 21100-21110
+
+## 快速开始
+1. 解压到目标目录(例如 `/opt/argus-deploy/versions/`)
+2. 进入 `scripts/`:`sudo ./server-prepare-dirs.sh`
+3. 安装:`./server-install.sh`
+4. 状态:`./server-status.sh`
+5. 自检:`./server-selfcheck.sh`(失败会自动采集诊断)
+6. 卸载:`./server-uninstall.sh`
+
+## 安装流程要点
+- 仅启动 10 个服务端组件(不包含测试节点);
+- DNS Bootstrap:补齐首次部署的 DNS 依赖(生成/确认 `dns.conf`、统一容器 resolv.conf、写入 `*.argus.com`、reload bind、重启 web-proxy);
+- 自检结果输出到 `logs/selfcheck.json`。
+
+## 兼容说明(NixOS 等)
+- 使用 `security_opt: ["label=disable"]` 与 `userns_mode: host`;
+- 先运行 `sudo ./server-prepare-dirs.sh`,将目录创建/授权为 `1000:1000`。
+
+## 故障排查(详见 TROUBLESHOOTING_zh.md)
+- `./server-selfcheck.sh` → `logs/selfcheck.json`
+- `./server-diagnose.sh` → `logs/diagnose_error_*.log` / `logs/diagnose_details_*.log`
+
diff --git a/deployment/build/templates/docs/TROUBLESHOOTING.md b/deployment/build/templates/docs/TROUBLESHOOTING.md
new file mode 100644
index 0000000..49b5375
--- /dev/null
+++ b/deployment/build/templates/docs/TROUBLESHOOTING.md
@@ -0,0 +1,21 @@
+# Troubleshooting
+
+- Status: `scripts/server-status.sh`
+- Selfcheck: `scripts/server-selfcheck.sh`
+- Diagnose: `scripts/server-diagnose.sh`
+
+Outputs:
+- `logs/selfcheck.json`
+- `logs/diagnose_details_*.log` (full details)
+- `logs/diagnose_error_*.log` (tagged errors)
+
+Web-Proxy:
+- 8083 expects 200/302/403; 8084/8085 must include a CORS header
+- the nginx resolver should be `172.31.0.2 127.0.0.11`
+
+Kibana/ES:
+- Verify that `es.log.argus.com` resolves inside the Kibana container
+
+Permissions:
+- Ensure `sudo ./server-prepare-dirs.sh` was run and ownership matches the runtime UID:GID
+
diff --git a/deployment/build/templates/docs/TROUBLESHOOTING_zh.md b/deployment/build/templates/docs/TROUBLESHOOTING_zh.md
new file mode 100644
index 0000000..e4d6b47
--- /dev/null
+++ b/deployment/build/templates/docs/TROUBLESHOOTING_zh.md
@@ -0,0 +1,15 @@
+# 故障排查
+
+- 状态:`scripts/server-status.sh`
+- 自检:`scripts/server-selfcheck.sh`
+- 诊断:`scripts/server-diagnose.sh`
+
+输出:
+- `logs/selfcheck.json`
+- `logs/diagnose_error_*.log`(错误摘要)
+- `logs/diagnose_details_*.log`(详细信息)
+
+Web-Proxy:8083 预期 200/302/403;8084/8085 需包含 CORS 头
+Kibana:确认容器内可解析 `es.log.argus.com`
+权限:先运行 `sudo ./server-prepare-dirs.sh`
+
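The CORS expectation in both troubleshooting docs can be reproduced by hand before reaching for the diagnose script below; a sketch using the default ports from `.env` (any Origin value works for the check):

```bash
# Expect an Access-Control-Allow-Origin header back from the proxied APIs.
curl -s -D - -o /dev/null -H "Origin: http://localhost:8080" \
  "http://localhost:8084/api/v2/status" | grep -i '^access-control-allow-origin'
```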
&& pwd)" + +ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a + +mkdir -p "$ROOT/logs" +ts="$(date -u +%Y%m%d-%H%M%SZ)" +DETAILS="$ROOT/logs/diagnose_details_${ts}.log" +ERRORS="$ROOT/logs/diagnose_error_${ts}.log" +: > "$DETAILS"; : > "$ERRORS" + +logd() { echo "$(date '+%F %T') $*" >> "$DETAILS"; } +append_err() { echo "$*" >> "$ERRORS"; } + +http_code() { curl -s -o /dev/null -w "%{http_code}" "$1" || echo 000; } +http_body_head() { curl -s --max-time 3 "$1" 2>/dev/null | sed -n '1,5p' || true; } +header_val() { curl -s -D - -o /dev/null "$@" | awk -F': ' 'BEGIN{IGNORECASE=1}$1=="Access-Control-Allow-Origin"{gsub("\r","",$2);print $2}'; } + +section() { + local name="$1"; logd "===== [$name] ====="; } + +svc() { + local svc_name="$1"; local cname="$2"; shift 2 + section "$svc_name ($cname)" + logd "docker ps:"; docker ps -a --format '{{.Names}}\t{{.Status}}\t{{.Image}}' | awk -v n="$cname" '$1==n' >> "$DETAILS" || true + logd "docker inspect (state/restartcount):"; docker inspect -f '{{.State.Status}} rc={{.RestartCount}} started={{.State.StartedAt}}' "$cname" >> "$DETAILS" 2>&1 || true + logd "last 200 container logs:"; docker logs --tail 200 "$cname" >> "$DETAILS" 2>&1 || true + + # extract error lines from container logs + docker logs --tail 200 "$cname" 2>&1 | \ + grep -Ei '\b(error|failed|fail|exception|panic|fatal|critical|unhealthy|permission denied|forbidden|refused|traceback|错误|失败)\b' | \ + sed "s/^/[${svc_name}][container] /" >> "$ERRORS" || true + + # supervisor status and logs + if docker exec "$cname" sh -lc 'command -v supervisorctl >/dev/null 2>&1' >/dev/null 2>&1; then + logd "supervisorctl status:"; docker exec "$cname" sh -lc 'supervisorctl status' >> "$DETAILS" 2>&1 || true + # iterate supervisor logs and collect tails + errors per file + local files + files=$(docker exec "$cname" sh -lc 'ls /var/log/supervisor/*.log 2>/dev/null' || true) + for f in $files; do + logd "tail -n 80 $f:"; docker exec "$cname" sh -lc "tail -n 80 $f 2>/dev/null || true" >> "$DETAILS" 2>&1 || true + docker exec "$cname" sh -lc "tail -n 200 $f 2>/dev/null" 2>/dev/null | \ + grep -Ei '\b(error|failed|fail|exception|panic|fatal|critical|unhealthy|permission denied|forbidden|refused|traceback|错误|失败)\b' | \ + sed "s/^/[${svc_name}][supervisor:$(basename $f)] /" >> "$ERRORS" || true + done + fi +} + +# Core services +svc bind argus-bind-sys +svc master argus-master-sys +svc es argus-es-sys +svc kibana argus-kibana-sys +svc ftp argus-ftp +svc prometheus argus-prometheus +svc grafana argus-grafana +svc alertmanager argus-alertmanager +svc web-frontend argus-web-frontend +svc web-proxy argus-web-proxy + +# HTTP checks (host side) +section HTTP +logd "ES: $(http_code "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health")" +http_body_head "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health" >> "$DETAILS" 2>&1 || true + +logd "Kibana: $(http_code "http://localhost:${KIBANA_PORT:-5601}/api/status")" +http_body_head "http://localhost:${KIBANA_PORT:-5601}/api/status" >> "$DETAILS" 2>&1 || true + +logd "Master readyz: $(http_code "http://localhost:${MASTER_PORT:-32300}/readyz")" + +logd "Prometheus: $(http_code "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready")" +logd "Grafana: $(http_code "http://localhost:${GRAFANA_PORT:-3000}/api/health")" +http_body_head "http://localhost:${GRAFANA_PORT:-3000}/api/health" >> "$DETAILS" 2>&1 || true +logd "Alertmanager: $(http_code "http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status")" + +cors8084=$(header_val 
-H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8084:-8084}/api/v2/status" || true) +cors8085=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8085:-8085}/api/v1/master/nodes" || true) +logd "Web-Proxy 8080: $(http_code "http://localhost:${WEB_PROXY_PORT_8080:-8080}/")" +logd "Web-Proxy 8083: $(http_code "http://localhost:${WEB_PROXY_PORT_8083:-8083}/")" +logd "Web-Proxy 8084 CORS: ${cors8084}" +logd "Web-Proxy 8085 CORS: ${cors8085}" + +# FTP share writability (container perspective) +section FTP-SHARE +docker exec argus-ftp sh -lc 'ls -ld /private/argus/ftp /private/argus/ftp/share; test -w /private/argus/ftp/share && echo "write:OK" || echo "write:FAIL"' >> "$DETAILS" 2>&1 || true + +# Collect system info for context +section SYSTEM +logd "uname -a:"; uname -a >> "$DETAILS" +logd "docker version:"; docker version --format '{{.Server.Version}}' >> "$DETAILS" 2>&1 || true +logd "compose ps:"; (cd "$ROOT/compose" && if docker compose version >/dev/null 2>&1; then docker compose -p argus-sys ps; else docker-compose -p argus-sys ps; fi) >> "$DETAILS" 2>&1 || true + +section SUMMARY +# Add HTTP failures and CORS problems to error log with tags +[[ $(http_code "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health") != 200 ]] && echo "[es][http] health not 200" >> "$ERRORS" +kbcode=$(http_code "http://localhost:${KIBANA_PORT:-5601}/api/status"); [[ "$kbcode" != 200 ]] && echo "[kibana][http] /api/status=$kbcode" >> "$ERRORS" +[[ $(http_code "http://localhost:${MASTER_PORT:-32300}/readyz") != 200 ]] && echo "[master][http] /readyz not 200" >> "$ERRORS" +[[ $(http_code "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready") != 200 ]] && echo "[prometheus][http] /-/ready not 200" >> "$ERRORS" +gfcode=$(http_code "http://localhost:${GRAFANA_PORT:-3000}/api/health"); [[ "$gfcode" != 200 ]] && echo "[grafana][http] /api/health=$gfcode" >> "$ERRORS" +[[ $(http_code "http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status") != 200 ]] && echo "[alertmanager][http] /api/v2/status not 200" >> "$ERRORS" +[[ -z "$cors8084" ]] && echo "[web-proxy][cors] 8084 missing Access-Control-Allow-Origin" >> "$ERRORS" +[[ -z "$cors8085" ]] && echo "[web-proxy][cors] 8085 missing Access-Control-Allow-Origin" >> "$ERRORS" + +# Deduplicate errors +sort -u -o "$ERRORS" "$ERRORS" + +echo "Diagnostic details -> $DETAILS" +echo "Detected errors -> $ERRORS" + +# maintain latest symlinks for convenience +ln -sfn "$(basename "$DETAILS")" "$ROOT/logs/diagnose_details.log" 2>/dev/null || cp "$DETAILS" "$ROOT/logs/diagnose_details.log" 2>/dev/null || true +ln -sfn "$(basename "$ERRORS")" "$ROOT/logs/diagnose_error.log" 2>/dev/null || cp "$ERRORS" "$ROOT/logs/diagnose_error.log" 2>/dev/null || true + +exit 0 diff --git a/deployment/build/templates/scripts/server-install.sh b/deployment/build/templates/scripts/server-install.sh old mode 100644 new mode 100755 index 8605031..a9b9fef --- a/deployment/build/templates/scripts/server-install.sh +++ b/deployment/build/templates/scripts/server-install.sh @@ -43,6 +43,29 @@ prepare_env() { done } +prepare_data_dirs() { + if [[ $EUID -ne 0 ]]; then + echo -e "\033[1;33m[WARN]\033[0m running as non-root: will not chown data dirs." 
+ echo -e "\033[1;33m[WARN]\033[0m If you hit Permission denied, run: sudo $SCRIPT_DIR/server-prepare-dirs.sh" + # still ensure basic directories exist (no chown) + mkdir -p \ + "$PKG_ROOT/private/argus/etc" \ + "$PKG_ROOT/private/argus/metric/prometheus" \ + "$PKG_ROOT/private/argus/metric/prometheus/data" \ + "$PKG_ROOT/private/argus/metric/prometheus/rules" \ + "$PKG_ROOT/private/argus/metric/grafana" \ + "$PKG_ROOT/private/argus/metric/grafana/data" \ + "$PKG_ROOT/private/argus/metric/grafana/logs" \ + "$PKG_ROOT/private/argus/metric/grafana/plugins" \ + "$PKG_ROOT/private/argus/metric/grafana/provisioning/datasources" \ + "$PKG_ROOT/private/argus/metric/grafana/provisioning/dashboards" \ + "$PKG_ROOT/private/argus/metric/grafana/data/sessions" \ + "$PKG_ROOT/private/argus/metric/grafana/data/dashboards" \ + "$PKG_ROOT/private/argus/alert/alertmanager" \ + "$PKG_ROOT/private/argus/metric/ftp/share" + fi +} + load_images() { local tar="$PKG_ROOT/images/all-images.tar.gz" [[ -f "$tar" ]] || { err "missing images tar: $tar"; exit 1; } @@ -52,7 +75,105 @@ load_images() { bring_up() { log "starting services via compose" - (cd "$PKG_ROOT/compose" && "${COMPOSE[@]}" -p "$PROJECT_NAME" up -d) + local ov="$PKG_ROOT/compose/docker-compose.os-compat.override.yml" + if [[ ! -f "$ov" ]]; then + cat > "$ov" <<'YAML' +services: + bind: + security_opt: ["label=disable"] + userns_mode: "host" + tmpfs: + - /run/named + master: + security_opt: ["label=disable"] + userns_mode: "host" + es: + security_opt: ["label=disable"] + userns_mode: "host" + kibana: + security_opt: ["label=disable"] + userns_mode: "host" + ftp: + security_opt: ["label=disable"] + userns_mode: "host" + prometheus: + security_opt: ["label=disable"] + userns_mode: "host" + grafana: + security_opt: ["label=disable"] + userns_mode: "host" + alertmanager: + security_opt: ["label=disable"] + userns_mode: "host" + # ensure runtime path matches container expectation + volumes: + - ../private/argus/etc:/private/argus/etc + - ../private/argus/alert/alertmanager:/alertmanager + web-frontend: + security_opt: ["label=disable"] + userns_mode: "host" + web-proxy: + security_opt: ["label=disable"] + userns_mode: "host" +YAML + log "generated OS-compat override: $(basename "$ov")" + fi + # 仅启动服务端组件,避免误起测试节点(node-a/node-b/test-node/test-gpu-node) + local services=(bind master es kibana ftp prometheus grafana alertmanager web-frontend web-proxy) + log "services: ${services[*]}" + (cd "$PKG_ROOT/compose" && "${COMPOSE[@]}" -p "$PROJECT_NAME" -f docker-compose.yml -f $(basename "$ov") up -d "${services[@]}") +} + +dns_bootstrap() { + log "DNS bootstrap: initializing shared dns.conf and container resolv.conf" + local etc_dir="$PKG_ROOT/private/argus/etc" + mkdir -p "$etc_dir" + # 1) ensure dns.conf exists (fallback to bind IP 172.31.0.2) + if [[ ! -s "$etc_dir/dns.conf" ]]; then + if echo "172.31.0.2" > "$etc_dir/dns.conf" 2>/dev/null; then + log "wrote fallback dns.conf with 172.31.0.2" + else + # host-side write denied (ownership 1000:1000); write via bind container instead + if docker ps --format '{{.Names}}' | grep -q '^argus-bind-sys$'; then + docker exec argus-bind-sys sh -lc 'echo 172.31.0.2 > /private/argus/etc/dns.conf && chmod 644 /private/argus/etc/dns.conf' || true + log "fallback dns.conf written via bind container" + else + log "bind not ready; skip writing fallback dns.conf" + fi + fi + fi + # 2) wait briefly for bind to copy update-dns.sh into shared etc (bind startup.sh does this) + local i=0 + while [[ ! 
-x "$etc_dir/update-dns.sh" && $i -lt 20 ]]; do + sleep 0.5; ((i++)); + done + if [[ ! -x "$etc_dir/update-dns.sh" ]]; then + log "update-dns.sh not present yet; continuing with existing resolv.conf" + fi + # 3) run update-dns.sh inside key containers so /etc/resolv.conf points to bind + local c + for c in argus-master-sys argus-es-sys argus-kibana-sys argus-grafana argus-prometheus argus-ftp argus-web-frontend argus-web-proxy argus-alertmanager; do + if docker ps --format '{{.Names}}' | grep -q "^${c}$"; then + docker exec "$c" sh -lc 'test -x /private/argus/etc/update-dns.sh && /private/argus/etc/update-dns.sh || true' >/dev/null 2>&1 || true + fi + done + # 4) wait for service A-record hint files generated by services (best-effort) + local need=( es.log.argus.com kibana.log.argus.com master.argus.com grafana.metric.argus.com prom.metric.argus.com alertmanager.alert.argus.com ) + local waited=0; local missing=1 + while (( waited < 15 )); do + missing=0 + for f in "${need[@]}"; do [[ -s "$etc_dir/$f" ]] || { missing=1; break; }; done + [[ $missing -eq 0 ]] && break + sleep 1; ((waited++)) + done + # 5) reload bind zone (script uses supervisor to restart bind9) + if docker ps --format '{{.Names}}' | grep -q '^argus-bind-sys$'; then + docker exec argus-bind-sys sh -lc '/usr/local/bin/reload-bind9.sh' >/dev/null 2>&1 || true + fi + # 6) restart web-proxy once to re-render nginx resolver with latest dns.conf + if docker ps --format '{{.Names}}' | grep -q '^argus-web-proxy$'; then + docker restart argus-web-proxy >/dev/null 2>&1 || true + fi } selfcheck() { @@ -62,11 +183,12 @@ selfcheck() { main() { prepare_env + prepare_data_dirs load_images bring_up + dns_bootstrap selfcheck log "install completed. See logs in $PKG_ROOT/logs/" } main "$@" - diff --git a/deployment/build/templates/scripts/server-prepare-dirs.sh b/deployment/build/templates/scripts/server-prepare-dirs.sh new file mode 100755 index 0000000..3be214d --- /dev/null +++ b/deployment/build/templates/scripts/server-prepare-dirs.sh @@ -0,0 +1,73 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PKG_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +if [[ $EUID -ne 0 ]]; then + echo "[PREPARE] This script requires root (sudo)." 
diff --git a/deployment/build/templates/scripts/server-prepare-dirs.sh b/deployment/build/templates/scripts/server-prepare-dirs.sh
new file mode 100755
index 0000000..3be214d
--- /dev/null
+++ b/deployment/build/templates/scripts/server-prepare-dirs.sh
@@ -0,0 +1,73 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PKG_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+
+if [[ $EUID -ne 0 ]]; then
+  echo "[PREPARE] This script requires root (sudo)." >&2
+  echo "          Try: sudo $0" >&2
+  exit 1
+fi
+
+ENV_FILE="$PKG_ROOT/compose/.env"
+[[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a
+UIDV="${ARGUS_BUILD_UID:-1000}"; GIDV="${ARGUS_BUILD_GID:-1000}"
+
+echo "[PREPARE] Using owner ${UIDV}:${GIDV}"
+
+# Core etc and service data dirs (aligned with src/sys/tests/scripts/01_bootstrap.sh)
+mkdir -p \
+  "$PKG_ROOT/private/argus/etc" \
+  "$PKG_ROOT/private/argus/bind" \
+  "$PKG_ROOT/private/argus/master" \
+  "$PKG_ROOT/private/argus/agent" \
+  "$PKG_ROOT/private/argus/log/elasticsearch" \
+  "$PKG_ROOT/private/argus/log/kibana"
+
+# Prometheus
+mkdir -p \
+  "$PKG_ROOT/private/argus/metric/prometheus" \
+  "$PKG_ROOT/private/argus/metric/prometheus/data" \
+  "$PKG_ROOT/private/argus/metric/prometheus/rules" \
+  "$PKG_ROOT/private/argus/metric/prometheus/targets"
+
+# Grafana
+mkdir -p \
+  "$PKG_ROOT/private/argus/metric/grafana" \
+  "$PKG_ROOT/private/argus/metric/grafana/data" \
+  "$PKG_ROOT/private/argus/metric/grafana/logs" \
+  "$PKG_ROOT/private/argus/metric/grafana/plugins" \
+  "$PKG_ROOT/private/argus/metric/grafana/provisioning/datasources" \
+  "$PKG_ROOT/private/argus/metric/grafana/provisioning/dashboards" \
+  "$PKG_ROOT/private/argus/metric/grafana/data/sessions" \
+  "$PKG_ROOT/private/argus/metric/grafana/data/dashboards" \
+  "$PKG_ROOT/private/argus/metric/grafana/config"
+
+# FTP
+mkdir -p "$PKG_ROOT/private/argus/metric/ftp/share"
+
+# Alertmanager
+mkdir -p "$PKG_ROOT/private/argus/alert/alertmanager"
+
+chown -R "$UIDV":"$GIDV" \
+  "$PKG_ROOT/private/argus/etc" \
+  "$PKG_ROOT/private/argus/bind" \
+  "$PKG_ROOT/private/argus/master" \
+  "$PKG_ROOT/private/argus/agent" \
+  "$PKG_ROOT/private/argus/log/elasticsearch" \
+  "$PKG_ROOT/private/argus/log/kibana" \
+  "$PKG_ROOT/private/argus/metric/prometheus" \
+  "$PKG_ROOT/private/argus/metric/grafana" \
+  "$PKG_ROOT/private/argus/metric/ftp" \
+  "$PKG_ROOT/private/argus/alert"
+
+chmod -R g+w "$PKG_ROOT/private/argus/alert" "$PKG_ROOT/private/argus/etc" || true
+
+# Ensure parent directories are also owned by the runtime user for consistency
+chown "$UIDV":"$GIDV" \
+  "$PKG_ROOT/private/argus" \
+  "$PKG_ROOT/private/argus/log" \
+  "$PKG_ROOT/private/argus/metric" || true
+
+echo "[PREPARE] Done. You can now run server-install.sh"
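The preparation script is idempotent (plain `mkdir -p`/`chown`), and it sources `compose/.env` first, so a non-default runtime user is just an env override; a sketch with illustrative values:

```bash
# Hypothetical: prepare the data dirs for 1200:1200 instead of the 1000:1000 default.
printf 'ARGUS_BUILD_UID=1200\nARGUS_BUILD_GID=1200\n' >> compose/.env
sudo ./scripts/server-prepare-dirs.sh   # picks the values up when it sources .env
```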
diff --git a/deployment/build/templates/scripts/server-selfcheck.sh b/deployment/build/templates/scripts/server-selfcheck.sh
old mode 100644
new mode 100755
index 145f709..2d82829
--- a/deployment/build/templates/scripts/server-selfcheck.sh
+++ b/deployment/build/templates/scripts/server-selfcheck.sh
@@ -32,7 +32,11 @@ log "checking Master"
 wait_http "http://localhost:${MASTER_PORT:-32300}/readyz" 60 || ok=0
 
 log "checking FTP"
-ftp_root="$ROOT/private/argus/metric/ftp/share"; [[ -d "$ftp_root" && -w "$ftp_root" ]] && ftp_ok=true || { ftp_ok=false; ok=0; }
+if docker ps --format '{{.Names}}' | grep -q '^argus-ftp$'; then
+  if docker exec argus-ftp sh -lc 'test -w /private/argus/ftp/share'; then ftp_ok=true; else ftp_ok=false; ok=0; fi
+else
+  ftp_ok=false; ok=0
+fi
 
 log "checking Prometheus"
 wait_http "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready" 60 || ok=0
@@ -51,8 +55,9 @@ p8083=$(code_for "http://localhost:${WEB_PROXY_PORT_8083:-8083}/")
 cors8084=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8084:-8084}/api/v2/status" || true)
 cors8085=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8085:-8085}/api/v1/master/nodes" || true)
 wp_ok=true
-[[ "$p8080" == 200 ]] || wp_ok=false
-([[ "$p8083" == 200 || "$p8083" == 302 ]]) || wp_ok=false
+# some environments serve the landing page as 403; accept 200/403 here
+[[ "$p8080" == 200 || "$p8080" == 403 ]] || wp_ok=false
+[[ "$p8083" == 200 || "$p8083" == 302 || "$p8083" == 403 ]] || wp_ok=false
 [[ -n "$cors8084" && -n "$cors8085" ]] || wp_ok=false
 [[ "$wp_ok" == true ]] || ok=0
 
@@ -71,5 +76,15 @@
 cat > "$tmp" <
> "$TARGET" else