[#37] server install 增加重试自检
This commit is contained in:
parent
37af47076b
commit
b9611c2dd2
@ -7,8 +7,8 @@
|
|||||||
|
|
||||||
## Quick Start
|
## Quick Start
|
||||||
1. Extract to a target dir, e.g. `/opt/argus-deploy/versions/<YYYYMMDD>`
|
1. Extract to a target dir, e.g. `/opt/argus-deploy/versions/<YYYYMMDD>`
|
||||||
2. `cd scripts && sudo ./server-prepare-dirs.sh`
|
2. `cd scripts && sudo ./server-prepare-dirs.sh` (recommended)
|
||||||
3. `./server-install.sh`
|
3. `./server-install.sh` (non-root is supported: it pre-creates the minimal dirs and auto-fixes Kibana/ES/Bind inside the containers)
|
||||||
4. `./server-status.sh`
|
4. `./server-status.sh`
|
||||||
5. `./server-selfcheck.sh` (on failure it auto-runs diagnose and writes logs under `logs/`)
|
5. `./server-selfcheck.sh` (on failure it auto-runs diagnose and writes logs under `logs/`)
|
||||||
6. `./server-uninstall.sh` to tear down
|
6. `./server-uninstall.sh` to tear down
|
||||||
@ -25,7 +25,11 @@
|
|||||||
- Writes `logs/selfcheck.json` as final summary
|
- Writes `logs/selfcheck.json` as final summary
|
||||||
|
|
||||||
## OS Compatibility
|
## OS Compatibility
|
||||||
- NixOS / non-xattr FS: containers run with `security_opt: ["label=disable"]` and `userns_mode: host`; ensure data dirs are pre-created via `sudo ./server-prepare-dirs.sh` and owned by runtime UID:GID (default 1000:1000).
|
- NixOS / non-xattr FS: containers run with `security_opt: ["label=disable"]` and `userns_mode: host`.
|
||||||
|
- If you cannot use sudo, the installer will:
|
||||||
|
- create the minimal data dirs (incl. `private/argus/log/{elasticsearch,kibana}`) with permissive permissions when possible;
|
||||||
|
- inside the containers, ensure that Kibana `data` → `/private/argus/log/kibana`, Elasticsearch `data` → `/private/argus/log/elasticsearch`, and that Bind's `rndc.key` is generated.
|
||||||
|
You can still run `sudo ./server-prepare-dirs.sh` later to normalize ownership.
|
||||||
|
|
||||||
## Files & Layout
|
## Files & Layout
|
||||||
- `compose/` (docker-compose.yml, .env)
|
- `compose/` (docker-compose.yml, .env)
|
||||||
@ -45,4 +49,3 @@ Common issues:
|
|||||||
- Kibana 503: wait cold start or fix DNS so `es.log.argus.com` resolves
|
- Kibana 503: wait cold start or fix DNS so `es.log.argus.com` resolves
|
||||||
- web‑proxy 504: check nginx `resolver` includes `172.31.0.2 127.0.0.11`
|
- web‑proxy 504: check nginx `resolver` includes `172.31.0.2 127.0.0.11`
|
||||||
- EACCES/locks: ensure `sudo ./server-prepare-dirs.sh` ran and ownership matches UID:GID
|
- EACCES/locks: ensure `sudo ./server-prepare-dirs.sh` ran and ownership matches UID:GID
|
||||||
|
|
||||||
|
|||||||
@ -7,8 +7,8 @@
|
|||||||
|
|
||||||
## 快速开始
|
## 快速开始
|
||||||
1. 解压到目标目录(例如 `/opt/argus-deploy/versions/<YYYYMMDD>`)
|
1. 解压到目标目录(例如 `/opt/argus-deploy/versions/<YYYYMMDD>`)
|
||||||
2. 进入 `scripts/`:`sudo ./server-prepare-dirs.sh`
|
2. 进入 `scripts/`:`sudo ./server-prepare-dirs.sh`(推荐)
|
||||||
3. 安装:`./server-install.sh`
|
3. 安装:`./server-install.sh`(支持普通用户:会自动创建最小目录并在容器内修复 Kibana/ES/Bind)
|
||||||
4. 状态:`./server-status.sh`
|
4. 状态:`./server-status.sh`
|
||||||
5. 自检:`./server-selfcheck.sh`(失败会自动采集诊断)
|
5. 自检:`./server-selfcheck.sh`(失败会自动采集诊断)
|
||||||
6. 卸载:`./server-uninstall.sh`
|
6. 卸载:`./server-uninstall.sh`
|
||||||
@ -19,10 +19,13 @@
|
|||||||
- 输出自检结果到 `logs/selfcheck.json`。
|
- 输出自检结果到 `logs/selfcheck.json`。
|
||||||
|
|
||||||
## 兼容说明(NixOS 等)
|
## 兼容说明(NixOS 等)
|
||||||
- 使用 `security_opt: ["label=disable"]` 与 `userns_mode: host`;
|
- 使用 `security_opt: ["label=disable"]` 与 `userns_mode: host`。
|
||||||
- 先运行 `sudo ./server-prepare-dirs.sh` 创建/授权目录为 `1000:1000`;
|
- 若不能使用 sudo:安装器会创建最小目录(含 `private/argus/log/{elasticsearch,kibana}`),并在容器内完成:
|
||||||
|
- Kibana 的 `data` 软链到 `/private/argus/log/kibana`
|
||||||
|
- Elasticsearch 的 `data` 软链到 `/private/argus/log/elasticsearch`
|
||||||
|
- Bind 生成 `/etc/bind/rndc.key`
|
||||||
|
安装后也可再执行 `sudo ./server-prepare-dirs.sh` 统一目录属主。
|
||||||
|
|
||||||
## 故障排查(见下文 Troubleshooting_zh)
|
## 故障排查(见下文 Troubleshooting_zh)
|
||||||
- `./server-selfcheck.sh` → `logs/selfcheck.json`
|
- `./server-selfcheck.sh` → `logs/selfcheck.json`
|
||||||
- `./server-diagnose.sh` → `logs/diagnose_error_*.log` / `logs/diagnose_details_*.log`
|
- `./server-diagnose.sh` → `logs/diagnose_error_*.log` / `logs/diagnose_details_*.log`
|
||||||
|
|
||||||
|
|||||||
@ -11,5 +11,6 @@
|
|||||||
|
|
||||||
Web‑Proxy:8083=200/302/403;8084/8085 需包含 CORS
|
Web‑Proxy:8083=200/302/403;8084/8085 需包含 CORS
|
||||||
Kibana:确认可解析 `es.log.argus.com`
|
Kibana:确认可解析 `es.log.argus.com`
|
||||||
权限:先运行 `sudo ./server-prepare-dirs.sh`
|
权限:
|
||||||
|
- 非 root 安装时,安装器已创建最小目录并在容器内修复 Kibana/ES/Bind;
|
||||||
|
- 如仍有 `EACCES`/锁文件报错,可再运行 `sudo ./server-prepare-dirs.sh` 统一目录属主。
|
||||||
|
|||||||
@ -40,6 +40,8 @@ prepare_data_dirs() {
|
|||||||
# still ensure basic directories exist (no chown)
|
# still ensure basic directories exist (no chown)
|
||||||
mkdir -p \
|
mkdir -p \
|
||||||
"$PKG_ROOT/private/argus/etc" \
|
"$PKG_ROOT/private/argus/etc" \
|
||||||
|
"$PKG_ROOT/private/argus/log/elasticsearch" \
|
||||||
|
"$PKG_ROOT/private/argus/log/kibana" \
|
||||||
"$PKG_ROOT/private/argus/metric/prometheus" \
|
"$PKG_ROOT/private/argus/metric/prometheus" \
|
||||||
"$PKG_ROOT/private/argus/metric/prometheus/data" \
|
"$PKG_ROOT/private/argus/metric/prometheus/data" \
|
||||||
"$PKG_ROOT/private/argus/metric/prometheus/rules" \
|
"$PKG_ROOT/private/argus/metric/prometheus/rules" \
|
||||||
@ -153,6 +155,37 @@ YAML
|
|||||||
(cd "$PKG_ROOT/compose" && "${COMPOSE[@]}" -p "$PROJECT_NAME" -f docker-compose.yml -f $(basename "$ov") up -d "${services[@]}")
|
(cd "$PKG_ROOT/compose" && "${COMPOSE[@]}" -p "$PROJECT_NAME" -f docker-compose.yml -f $(basename "$ov") up -d "${services[@]}")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Post-bootstrap, container-side fixes that do not require sudo on the host.
# Every fix is best-effort: a failure is logged as a warning and never aborts
# the install.

# Succeeds when "$1" is exactly one line of the newline-separated list "$2".
_container_listed() {
  grep -Fqx -- "$1" <<<"$2"
}

# Best-effort: inside container $1, make $2 a symlink to the mounted dir $3.
# Arguments: $1 - container name
#            $2 - data directory path inside the container
#            $3 - mounted target directory inside the container
# Returns:   always 0; a failed fix only produces a warning via log.
_relink_data_dir() {
  local container=$1 data_dir=$2 target=$3
  # The script below is single-quoted on purpose: it is expanded by the shell
  # inside the container, with the two paths passed as positional parameters.
  if ! docker exec "$container" bash -lc '
    set -e
    data_dir=$1
    target=$2
    mkdir -p "$target" && chmod 777 "$target" || true
    if [ -d "$data_dir" ] && [ ! -L "$data_dir" ]; then rm -rf -- "${data_dir:?}"; fi
    # -e is false for a dangling symlink; -f/-n replace it instead of failing.
    if [ ! -e "$data_dir" ]; then ln -sfn "$target" "$data_dir"; fi
  ' _ "$data_dir" "$target" >/dev/null 2>&1; then
    log "WARN: could not link $data_dir -> $target in $container (continuing)"
  fi
  return 0
}

# Applies the Kibana, Elasticsearch and Bind9 fixes to whichever of those
# containers are currently running.
# Globals:   none (calls docker and log)
# Returns:   always 0
post_bootstrap_fixes() {
  # Capture the list once instead of piping `docker ps` into `grep -q`: with
  # `pipefail` enabled, grep exiting on the first match can make the pipeline
  # report failure even though the container is running.
  local running
  running=$(docker ps --format '{{.Names}}') || running=""

  # Kibana: ensure /usr/share/kibana/data is a symlink into the mounted path to avoid EACCES
  if _container_listed argus-kibana-sys "$running"; then
    _relink_data_dir argus-kibana-sys /usr/share/kibana/data /private/argus/log/kibana
  fi

  # Elasticsearch: ensure the data path points to the mounted path and is writable
  if _container_listed argus-es-sys "$running"; then
    _relink_data_dir argus-es-sys /usr/share/elasticsearch/data /private/argus/log/elasticsearch
  fi

  # Bind9: ensure rndc.key exists
  if _container_listed argus-bind-sys "$running"; then
    if ! docker exec argus-bind-sys bash -lc '
      set -e
      mkdir -p /etc/bind
      if [ ! -f /etc/bind/rndc.key ]; then rndc-confgen -a -c /etc/bind/rndc.key; fi
      chmod 644 /etc/bind/rndc.key || true
    ' >/dev/null 2>&1; then
      log "WARN: could not ensure /etc/bind/rndc.key in argus-bind-sys (continuing)"
    fi
  fi
  return 0
}
|
||||||
|
|
||||||
dns_bootstrap() {
|
dns_bootstrap() {
|
||||||
log "DNS bootstrap: initializing shared dns.conf and container resolv.conf"
|
log "DNS bootstrap: initializing shared dns.conf and container resolv.conf"
|
||||||
local etc_dir="$PKG_ROOT/private/argus/etc"
|
local etc_dir="$PKG_ROOT/private/argus/etc"
|
||||||
@ -206,8 +239,31 @@ dns_bootstrap() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
# Runs scripts/server-selfcheck.sh, retrying to absorb container cold starts.
# Globals:   PKG_ROOT (read)
#            SELF_CHECK_RETRIES (read) - retries after the first attempt,
#              default 5
#            SELF_CHECK_WAIT_SECONDS (read) - delay before each retry,
#              default 30
# Returns:   0 as soon as one attempt succeeds; exits the script with status 1
#            once the first attempt and all retries have failed.
selfcheck() {
  local max_retries="${SELF_CHECK_RETRIES:-5}"
  local wait_seconds="${SELF_CHECK_WAIT_SECONDS:-30}"
  local int_re='^[0-9]+$'
  local wait_re='^[0-9]+([.][0-9]+)?[smhd]?$'

  # Both values come from the environment and are fed to (( )) and sleep;
  # reject anything non-numeric rather than evaluating it or looping with a
  # failing sleep.
  if [[ ! "$max_retries" =~ $int_re ]]; then
    log "invalid SELF_CHECK_RETRIES='${max_retries}'; using default 5"
    max_retries=5
  fi
  if [[ ! "$wait_seconds" =~ $wait_re ]]; then
    log "invalid SELF_CHECK_WAIT_SECONDS='${wait_seconds}'; using default 30"
    wait_seconds=30
  fi

  # 10# forces base 10 so a value such as "08" is not parsed as octal.
  local total=$(( 10#$max_retries + 1 ))
  local attempt=0
  while :; do
    attempt=$((attempt+1))
    log "running selfcheck (attempt ${attempt}/${total})"

    if bash "$PKG_ROOT/scripts/server-selfcheck.sh"; then
      return 0
    fi

    # This attempt failed: give up once the retry budget is spent.
    if (( attempt >= total )); then
      err "selfcheck failed after ${attempt} attempt(s)"
      exit 1
    fi
    log "selfcheck not ready yet; retrying in ${wait_seconds}s..."
    sleep "$wait_seconds"
  done
}
|
||||||
|
|
||||||
main() {
|
main() {
|
||||||
@ -216,6 +272,7 @@ main() {
|
|||||||
prepare_data_dirs
|
prepare_data_dirs
|
||||||
load_images
|
load_images
|
||||||
bring_up
|
bring_up
|
||||||
|
post_bootstrap_fixes
|
||||||
dns_bootstrap
|
dns_bootstrap
|
||||||
selfcheck
|
selfcheck
|
||||||
log "install completed. See logs in $PKG_ROOT/logs/"
|
log "install completed. See logs in $PKG_ROOT/logs/"
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user