[#37] server install 增加重试自检

This commit is contained in:
yuyr 2025-11-04 11:37:27 +08:00
parent 37af47076b
commit b9611c2dd2
4 changed files with 77 additions and 13 deletions

View File

@ -7,8 +7,8 @@
## Quick Start ## Quick Start
1. Extract to a target dir, e.g. `/opt/argus-deploy/versions/<YYYYMMDD>` 1. Extract to a target dir, e.g. `/opt/argus-deploy/versions/<YYYYMMDD>`
2. `cd scripts && sudo ./server-prepare-dirs.sh` 2. `cd scripts && sudo ./server-prepare-dirs.sh` (recommended)
3. `./server-install.sh` 3. `./server-install.sh` (nonroot is supported: it will precreate minimal dirs and auto-fix Kibana/ES/Bind in containers)
4. `./server-status.sh` 4. `./server-status.sh`
5. `./server-selfcheck.sh` (on failure it auto-runs diagnose and writes logs under `logs/`) 5. `./server-selfcheck.sh` (on failure it auto-runs diagnose and writes logs under `logs/`)
6. `./server-uninstall.sh` to tear down 6. `./server-uninstall.sh` to tear down
@ -25,7 +25,11 @@
- Writes `logs/selfcheck.json` as final summary - Writes `logs/selfcheck.json` as final summary
## OS Compatibility ## OS Compatibility
- NixOS / non-xattr FS: containers run with `security_opt: ["label=disable"]` and `userns_mode: host`; ensure data dirs are pre-created via `sudo ./server-prepare-dirs.sh` and owned by runtime UID:GID (default 1000:1000). - NixOS / non-xattr FS: containers run with `security_opt: ["label=disable"]` and `userns_mode: host`.
- If you cannot use sudo, the installer will:
- create minimal data dirs (incl. `private/argus/log/{elasticsearch,kibana}`) with permissive perms when possible;
- ensure inside containers: Kibana `data` → `/private/argus/log/kibana`, Elasticsearch `data` → `/private/argus/log/elasticsearch`, and Bind `rndc.key` is generated.
You can still run `sudo ./server-prepare-dirs.sh` later to normalize ownership.
## Files & Layout ## Files & Layout
- `compose/` (docker-compose.yml, .env) - `compose/` (docker-compose.yml, .env)
@ -45,4 +49,3 @@ Common issues:
- Kibana 503: wait cold start or fix DNS so `es.log.argus.com` resolves - Kibana 503: wait cold start or fix DNS so `es.log.argus.com` resolves
- webproxy 504: check nginx `resolver` includes `172.31.0.2 127.0.0.11` - webproxy 504: check nginx `resolver` includes `172.31.0.2 127.0.0.11`
- EACCES/locks: ensure `sudo ./server-prepare-dirs.sh` ran and ownership matches UID:GID - EACCES/locks: ensure `sudo ./server-prepare-dirs.sh` ran and ownership matches UID:GID

View File

@ -7,8 +7,8 @@
## 快速开始 ## 快速开始
1. 解压到目标目录（例如 `/opt/argus-deploy/versions/<YYYYMMDD>`） 1. 解压到目标目录（例如 `/opt/argus-deploy/versions/<YYYYMMDD>`）
2. 进入 `scripts/`：`sudo ./server-prepare-dirs.sh` 2. 进入 `scripts/`：`sudo ./server-prepare-dirs.sh`（推荐）
3. 安装：`./server-install.sh` 3. 安装：`./server-install.sh`（支持普通用户：会自动创建最小目录并在容器内修复 Kibana/ES/Bind）
4. 状态:`./server-status.sh` 4. 状态:`./server-status.sh`
5. 自检:`./server-selfcheck.sh`(失败会自动采集诊断) 5. 自检:`./server-selfcheck.sh`(失败会自动采集诊断)
6. 卸载:`./server-uninstall.sh` 6. 卸载:`./server-uninstall.sh`
@ -19,10 +19,13 @@
- 输出自检结果到 `logs/selfcheck.json` - 输出自检结果到 `logs/selfcheck.json`
## 兼容说明（NixOS 等） ## 兼容说明（NixOS 等）
- 使用 `security_opt: ["label=disable"]` 与 `userns_mode: host` - 使用 `security_opt: ["label=disable"]` 与 `userns_mode: host`
- 先运行 `sudo ./server-prepare-dirs.sh` 创建/授权目录为 `1000:1000` - 若不能使用 sudo，安装器会创建最小目录（含 `private/argus/log/{elasticsearch,kibana}`），并在容器内完成：
- Kibana 的 `data` 软链到 `/private/argus/log/kibana`
- Elasticsearch 的 `data` 软链到 `/private/argus/log/elasticsearch`
- Bind 生成 `/etc/bind/rndc.key`
安装后也可再执行 `sudo ./server-prepare-dirs.sh` 统一目录属主。
## 故障排查（见下文 Troubleshooting_zh） ## 故障排查（见下文 Troubleshooting_zh）
- `./server-selfcheck.sh` → `logs/selfcheck.json` - `./server-selfcheck.sh` → `logs/selfcheck.json`
- `./server-diagnose.sh` → `logs/diagnose_error_*.log` / `logs/diagnose_details_*.log` - `./server-diagnose.sh` → `logs/diagnose_error_*.log` / `logs/diagnose_details_*.log`

View File

@ -11,5 +11,6 @@
WebProxy：8083=200/302/403；8084/8085 需包含 CORS WebProxy：8083=200/302/403；8084/8085 需包含 CORS
Kibana：确认可解析 `es.log.argus.com` Kibana：确认可解析 `es.log.argus.com`
权限:先运行 `sudo ./server-prepare-dirs.sh` 权限:
- 非 root 安装时,安装器已创建最小目录并在容器内修复 Kibana/ES/Bind
- 如仍有 `EACCES`/锁文件报错,可再运行 `sudo ./server-prepare-dirs.sh` 统一目录属主。

View File

@ -40,6 +40,8 @@ prepare_data_dirs() {
# still ensure basic directories exist (no chown) # still ensure basic directories exist (no chown)
mkdir -p \ mkdir -p \
"$PKG_ROOT/private/argus/etc" \ "$PKG_ROOT/private/argus/etc" \
"$PKG_ROOT/private/argus/log/elasticsearch" \
"$PKG_ROOT/private/argus/log/kibana" \
"$PKG_ROOT/private/argus/metric/prometheus" \ "$PKG_ROOT/private/argus/metric/prometheus" \
"$PKG_ROOT/private/argus/metric/prometheus/data" \ "$PKG_ROOT/private/argus/metric/prometheus/data" \
"$PKG_ROOT/private/argus/metric/prometheus/rules" \ "$PKG_ROOT/private/argus/metric/prometheus/rules" \
@ -153,6 +155,37 @@ YAML
(cd "$PKG_ROOT/compose" && "${COMPOSE[@]}" -p "$PROJECT_NAME" -f docker-compose.yml -f $(basename "$ov") up -d "${services[@]}") (cd "$PKG_ROOT/compose" && "${COMPOSE[@]}" -p "$PROJECT_NAME" -f docker-compose.yml -f $(basename "$ov") up -d "${services[@]}")
} }
#######################################
# Runs one best-effort fix script inside a container.
# Arguments:
#   $1 - newline-separated names of the running containers
#   $2 - container name; must equal one line of $1 exactly, else nothing runs
#   $3 - script text handed to `bash -lc` inside the container
# Outputs:   a warning on stderr when the in-container script fails
# Returns:   0 always, so a failed fix never aborts the caller
#######################################
_post_bootstrap_exec() {
  local running=$1 name=$2 script=$3
  # Whole-line fixed-string match: "argus-es-sys-old" must not match "argus-es-sys".
  grep -Fqx -- "$name" <<<"$running" || return 0
  if ! docker exec "$name" bash -lc "$script" >/dev/null 2>&1; then
    # Deliberately non-fatal, but no longer silent.
    printf 'WARN: post-bootstrap fix failed in container %s (continuing)\n' "$name" >&2
  fi
  return 0
}

#######################################
# Post-bootstrap, container-side fixes that do not require sudo on the host.
# For each of the Kibana, Elasticsearch and Bind containers that is running,
# a small script is executed inside it via `docker exec`:
#   - Kibana / Elasticsearch: replace the in-image `data` directory with a
#     symlink to /private/argus/log/<service>.
#   - Bind: generate /etc/bind/rndc.key when it does not exist.
# Containers that are not running are skipped.
# Globals:   none
# Arguments: none
# Outputs:   warnings on stderr for fixes that failed
# Returns:   0 always (best effort)
#######################################
post_bootstrap_fixes() {
  local running
  # Ask docker once and match against the captured list; piping `docker ps`
  # straight into `grep -q` can fail spuriously under `set -o pipefail` when
  # grep exits before docker has finished writing.
  running="$(docker ps --format '{{.Names}}')" || running=""

  # Kibana: ensure /usr/share/kibana/data is a symlink into the mounted path to avoid EACCES.
  # NOTE(review): the rm -rf discards whatever the in-image data directory holds — confirm acceptable.
  _post_bootstrap_exec "$running" argus-kibana-sys '
set -e
mkdir -p /private/argus/log/kibana && chmod 777 /private/argus/log/kibana || true
if [ -d /usr/share/kibana/data ] && [ ! -L /usr/share/kibana/data ]; then rm -rf /usr/share/kibana/data; fi
if [ ! -e /usr/share/kibana/data ]; then ln -s /private/argus/log/kibana /usr/share/kibana/data; fi
'

  # Elasticsearch: ensure the data path points to the mounted path and is writable.
  # NOTE(review): same rm -rf caveat as for Kibana.
  _post_bootstrap_exec "$running" argus-es-sys '
set -e
mkdir -p /private/argus/log/elasticsearch && chmod 777 /private/argus/log/elasticsearch || true
if [ -d /usr/share/elasticsearch/data ] && [ ! -L /usr/share/elasticsearch/data ]; then rm -rf /usr/share/elasticsearch/data; fi
if [ ! -e /usr/share/elasticsearch/data ]; then ln -s /private/argus/log/elasticsearch /usr/share/elasticsearch/data; fi
'

  # Bind9: ensure rndc.key exists.
  _post_bootstrap_exec "$running" argus-bind-sys '
set -e
mkdir -p /etc/bind
if [ ! -f /etc/bind/rndc.key ]; then rndc-confgen -a -c /etc/bind/rndc.key; fi
chmod 644 /etc/bind/rndc.key || true
'

  return 0
}
dns_bootstrap() { dns_bootstrap() {
log "DNS bootstrap: initializing shared dns.conf and container resolv.conf" log "DNS bootstrap: initializing shared dns.conf and container resolv.conf"
local etc_dir="$PKG_ROOT/private/argus/etc" local etc_dir="$PKG_ROOT/private/argus/etc"
@ -206,8 +239,31 @@ dns_bootstrap() {
} }
#######################################
# Runs scripts/server-selfcheck.sh, retrying to absorb cold starts.
# Globals:
#   PKG_ROOT                 (read) package root that contains scripts/
#   SELF_CHECK_RETRIES       (read) retries after the first attempt; default 5
#   SELF_CHECK_WAIT_SECONDS  (read) seconds to sleep before each retry; default 30
# Arguments: none
# Outputs:   progress via log, final failure via err
# Returns:   0 as soon as one attempt passes; otherwise exits the shell with
#            status 1 once the first attempt and all retries have failed
#######################################
selfcheck() {
  local max_retries="${SELF_CHECK_RETRIES:-5}"
  local wait_seconds="${SELF_CHECK_WAIT_SECONDS:-30}"

  # The retry count is used in arithmetic below; a non-numeric value would be
  # evaluated as an expression there, so reject it up front.
  if [[ ! "$max_retries" =~ ^[0-9]+$ ]]; then
    log "WARN: invalid SELF_CHECK_RETRIES='${max_retries}', falling back to 5"
    max_retries=5
  fi
  # Force base 10 so a value such as "08" is not parsed as (invalid) octal.
  max_retries=$(( 10#$max_retries ))

  local total_attempts=$(( max_retries + 1 ))
  local attempt=0
  while :; do
    attempt=$(( attempt + 1 ))
    log "running selfcheck (attempt ${attempt}/${total_attempts})"
    if bash "$PKG_ROOT/scripts/server-selfcheck.sh"; then
      return 0
    fi
    # This attempt failed; give up once the first try plus all retries are used.
    if (( attempt > max_retries )); then
      err "selfcheck failed after ${attempt} attempt(s)"
      exit 1
    fi
    log "selfcheck not ready yet; retrying in ${wait_seconds}s..."
    sleep "$wait_seconds"
  done
}
main() { main() {
@ -216,6 +272,7 @@ main() {
prepare_data_dirs prepare_data_dirs
load_images load_images
bring_up bring_up
post_bootstrap_fixes
dns_bootstrap dns_bootstrap
selfcheck selfcheck
log "install completed. See logs in $PKG_ROOT/logs/" log "install completed. See logs in $PKG_ROOT/logs/"