Compare commits
2 Commits
252461c642
...
b9611c2dd2
| Author | SHA1 | Date | |
|---|---|---|---|
| b9611c2dd2 | |||
| 37af47076b |
@ -46,7 +46,11 @@ make_dir "$STAGE/private/argus"
|
||||
# 2) Compose: derive from sys/tests by removing test-only services
|
||||
SRC_COMPOSE="$ROOT_DIR/src/sys/tests/docker-compose.yml"
|
||||
[[ -f "$SRC_COMPOSE" ]] || { err "missing $SRC_COMPOSE"; exit 1; }
|
||||
awk -f "$BUILD_DIR/templates/docker-compose.filter.awk" -v remove="node-a,node-b,test-node,test-gpu-node" "$SRC_COMPOSE" > "$STAGE/compose/docker-compose.yml"
|
||||
# 2.1 filter out test services
|
||||
tmp_compose1="$STAGE/compose/docker-compose.filtered.yml"
|
||||
awk -f "$BUILD_DIR/templates/docker-compose.filter.awk" -v remove="node-a,node-b,test-node,test-gpu-node" "$SRC_COMPOSE" > "$tmp_compose1"
|
||||
# 2.2 transform to external overlay network (remove sysnet and per-service blocks)
|
||||
awk -f "$BUILD_DIR/templates/docker-compose.overlay.awk" "$tmp_compose1" > "$STAGE/compose/docker-compose.yml"
|
||||
cp "$BUILD_DIR/templates/.env.example" "$STAGE/compose/.env.example"
|
||||
# fix relative private path to match package layout (compose/ and private/ are siblings)
|
||||
sed -i 's#\./private/#../private/#g' "$STAGE/compose/docker-compose.yml"
|
||||
@ -55,17 +59,41 @@ sed -i -E 's#- \./private:/private#- ../private:/private#g' "$STAGE/compose/dock
|
||||
# drop timezone file bind which may not exist on target distros (e.g. NixOS)
|
||||
sed -i '/\/etc\/timezone:\/etc\/timezone:ro/d' "$STAGE/compose/docker-compose.yml"
|
||||
|
||||
# sanity-check: ensure test services are absent
|
||||
# sanity-check: ensure test services are absent and external network present
|
||||
if grep -E '^(\s{2})(node-a|node-b|test-node|test-gpu-node):\s*$' -n "$STAGE/compose/docker-compose.yml" >/dev/null; then
|
||||
err "compose filter failed: test services still present"; exit 1;
|
||||
fi
|
||||
if ! grep -q '^networks:' "$STAGE/compose/docker-compose.yml" || ! grep -q 'argus-sys-net:' "$STAGE/compose/docker-compose.yml"; then
|
||||
err "compose overlay transform failed: external network missing"; exit 1;
|
||||
fi
|
||||
|
||||
# 3) Images (reuse if already exported unless --resave-image)
#
# Reuse order when RESAVE_IMAGE is false:
#   a) the tar already exported under $PKG_DIR/images
#   b) the images tar embedded in the newest $ART_ROOT/server/server_*.tar.gz
# When neither yields a file, $STAGE/images/all-images.tar.gz stays absent.
existing_images_tar="$PKG_DIR/images/all-images.tar.gz"
if [[ "$RESAVE_IMAGE" == false && -f "$existing_images_tar" ]]; then
  log "Reusing existing images tar: $existing_images_tar"
  cp "$existing_images_tar" "$STAGE/images/"
elif [[ "$RESAVE_IMAGE" == false ]]; then
  # Try cross-version reuse from latest server_*.tar.gz.
  # Pick the newest archive by mtime with a glob instead of parsing `ls`.
  latest_pkg=""
  for candidate in "$ART_ROOT/server"/server_*.tar.gz; do
    [[ -f "$candidate" ]] || continue
    if [[ -z "$latest_pkg" || "$candidate" -nt "$latest_pkg" ]]; then
      latest_pkg="$candidate"
    fi
  done
  if [[ -n "$latest_pkg" ]]; then
    log "Reusing images from: $latest_pkg"
    mkdir -p "$STAGE/images"
    # Extract into a private scratch dir so cleanup can never touch $STAGE
    # itself, whatever the archive's top-level layout is.
    reuse_tmp=$(mktemp -d "$STAGE/.reuse.XXXXXX") || { err "mktemp failed under $STAGE"; exit 1; }
    # extract matching file regardless of top-level dir
    if tar -xzf "$latest_pkg" -C "$reuse_tmp" --wildcards '*/images/all-images.tar.gz' 2>/dev/null; then
      found=$(find "$reuse_tmp" -type f -path '*/images/all-images.tar.gz' | head -n1 || true)
      if [[ -n "$found" ]]; then
        mv "$found" "$STAGE/images/all-images.tar.gz"
      fi
    fi
    # ${var:?} aborts instead of running `rm -rf` on an empty path.
    rm -rf -- "${reuse_tmp:?}"
  fi
fi
|
||||
|
||||
# If still not present, save from local docker daemon
|
||||
if [[ ! -f "$STAGE/images/all-images.tar.gz" ]]; then
|
||||
require_cmd docker gzip
|
||||
images=(
|
||||
argus-bind9:latest
|
||||
|
||||
@ -27,3 +27,6 @@ FTP_DOMAIN=ftp.metric.argus.com
|
||||
|
||||
# GPU profile disabled by default
|
||||
ENABLE_GPU=false
|
||||
|
||||
# External overlay network (Swarm attachable)
|
||||
OVERLAY_NET_NAME=argus-sys-net
|
||||
|
||||
74
deployment/build/templates/docker-compose.overlay.awk
Normal file
74
deployment/build/templates/docker-compose.overlay.awk
Normal file
@ -0,0 +1,74 @@
|
||||
#!/usr/bin/awk -f
# Transform docker-compose.yml to use an external overlay network for all services
# - Remove top-level networks definition
# - Remove per-service networks block (including ipv4_address and sysnet refs)
# - Insert per-service networks: [argus-sys-net]
# - Append external networks mapping at the end
#
# Uses POSIX awk constructs only (no 3-argument match(), no "\s", no "{n}"
# interval expressions), so the plain "awk" shebang also works with mawk or
# busybox awk.
#
# Expects 2-space YAML indentation: service names at column 2 and service
# keys at column 4. Only the block form of a per-service "networks:" key is
# removed; an inline "networks: [a, b]" value is left untouched.

BEGIN {
  in_top_networks = 0     # inside the top-level "networks:" section being dropped
  in_services = 0         # inside the top-level "services:" section
  in_service = 0          # inside one service definition
  svc_indent = 2          # column at which service names are expected
  skipping_nets = 0       # inside a per-service "networks:" block being dropped
  networks_inserted = 0   # overlay reference already emitted for this service
}

# Return the service name when `line` is a service header ("  name:"), else "".
function svc_name(line,    name) {
  if (line !~ /^  [A-Za-z0-9_-]+:[ ]*$/) return ""
  name = line
  sub(/^  /, "", name)
  sub(/:[ ]*$/, "", name)
  return name
}

function is_service_header(line) { return svc_name(line) != "" }

# Number of leading spaces; 0 for empty or all-space lines.
function indent_len(s,    n) { n = match(s, /[^ ]/) - 1; if (n < 0) n = 0; return n }

function is_blank(s) { return s ~ /^[ \t]*$/ }

{
  # A non-indented "key:" line starts a new top-level section.
  if ($0 ~ /^[A-Za-z0-9_-]+:[ ]*$/) {
    in_services = ($0 ~ /^services:[ ]*$/)
    # If a new top-level section starts, stop skipping top networks
    in_top_networks = 0
    in_service = 0
    skipping_nets = 0
  }

  # Drop the top-level "networks:" block; it is re-created in END.
  if ($0 ~ /^networks:[ ]*$/) { in_top_networks = 1; next }
  # skip until next top-level section (non-indented key)
  if (in_top_networks) next

  if (in_services) {
    # Track service boundaries
    if (is_service_header($0)) {
      in_service = 1
      networks_inserted = 0
      skipping_nets = 0   # never carry a pending skip into the next service
      print
      next
    }

    # A non-blank line indented no deeper than the service name ends the service.
    if (in_service && indent_len($0) <= svc_indent && !is_blank($0)) in_service = 0

    if (in_service) {
      # Skip any existing networks block under the service
      if ($0 ~ /^    networks:[ ]*$/) { skipping_nets = 1; next }
      if (skipping_nets) {
        if (indent_len($0) <= 4) skipping_nets = 0
        else next
      }

      # After container_name or image, inject networks once
      if (!networks_inserted && ($0 ~ /^    container_name:/ || $0 ~ /^    image:/)) {
        print
        print "    networks:"
        print "      - argus-sys-net"
        networks_inserted = 1
        next
      }
      # no host port injection; bind serves DNS inside overlay only
    }
  }

  print
}

END {
  # ${OVERLAY_NET_NAME:-...} is emitted literally into the compose file.
  print ""
  print "networks:"
  print "  argus-sys-net:"
  print "    external: true"
  print "    name: ${OVERLAY_NET_NAME:-argus-sys-net}"
}
|
||||
@ -7,8 +7,8 @@
|
||||
|
||||
## Quick Start
|
||||
1. Extract to a target dir, e.g. `/opt/argus-deploy/versions/<YYYYMMDD>`
|
||||
2. `cd scripts && sudo ./server-prepare-dirs.sh`
|
||||
3. `./server-install.sh`
|
||||
2. `cd scripts && sudo ./server-prepare-dirs.sh` (recommended)
|
||||
3. `./server-install.sh` (non‑root is supported: it will precreate minimal dirs and auto-fix Kibana/ES/Bind in containers)
|
||||
4. `./server-status.sh`
|
||||
5. `./server-selfcheck.sh` (on failure it auto-runs diagnose and writes logs under `logs/`)
|
||||
6. `./server-uninstall.sh` to tear down
|
||||
@ -25,7 +25,11 @@
|
||||
- Writes `logs/selfcheck.json` as final summary
|
||||
|
||||
## OS Compatibility
|
||||
- NixOS / non-xattr FS: containers run with `security_opt: ["label=disable"]` and `userns_mode: host`; ensure data dirs are pre-created via `sudo ./server-prepare-dirs.sh` and owned by runtime UID:GID (default 1000:1000).
|
||||
- NixOS / non-xattr FS: containers run with `security_opt: ["label=disable"]` and `userns_mode: host`.
|
||||
- If you cannot use sudo, the installer will:
|
||||
- create minimal data dirs (incl. `private/argus/log/{elasticsearch,kibana}`) with permissive perms when possible;
|
||||
- ensure inside containers: Kibana `data` → `/private/argus/log/kibana`, Elasticsearch `data` → `/private/argus/log/elasticsearch`, and Bind `rndc.key` is generated.
|
||||
You can still run `sudo ./server-prepare-dirs.sh` later to normalize ownership.
|
||||
|
||||
## Files & Layout
|
||||
- `compose/` (docker-compose.yml, .env)
|
||||
@ -45,4 +49,3 @@ Common issues:
|
||||
- Kibana 503: wait cold start or fix DNS so `es.log.argus.com` resolves
|
||||
- web‑proxy 504: check nginx `resolver` includes `172.31.0.2 127.0.0.11`
|
||||
- EACCES/locks: ensure `sudo ./server-prepare-dirs.sh` ran and ownership matches UID:GID
|
||||
|
||||
|
||||
@ -7,8 +7,8 @@
|
||||
|
||||
## 快速开始
|
||||
1. 解压到目标目录(例如 `/opt/argus-deploy/versions/<YYYYMMDD>`)
|
||||
2. 进入 `scripts/`:`sudo ./server-prepare-dirs.sh`
|
||||
3. 安装:`./server-install.sh`
|
||||
2. 进入 `scripts/`:`sudo ./server-prepare-dirs.sh`(推荐)
|
||||
3. 安装:`./server-install.sh`(支持普通用户:会自动创建最小目录并在容器内修复 Kibana/ES/Bind)
|
||||
4. 状态:`./server-status.sh`
|
||||
5. 自检:`./server-selfcheck.sh`(失败会自动采集诊断)
|
||||
6. 卸载:`./server-uninstall.sh`
|
||||
@ -19,10 +19,13 @@
|
||||
- 输出自检结果到 `logs/selfcheck.json`。
|
||||
|
||||
## 兼容说明(NixOS 等)
|
||||
- 使用 `security_opt: ["label=disable"]` 与 `userns_mode: host`;
|
||||
- 先运行 `sudo ./server-prepare-dirs.sh` 创建/授权目录为 `1000:1000`;
|
||||
- 使用 `security_opt: ["label=disable"]` 与 `userns_mode: host`。
|
||||
- 若不能使用 sudo:安装器会创建最小目录(含 `private/argus/log/{elasticsearch,kibana}`),并在容器内完成:
|
||||
- Kibana 的 `data` 软链到 `/private/argus/log/kibana`
|
||||
- Elasticsearch 的 `data` 软链到 `/private/argus/log/elasticsearch`
|
||||
- Bind 生成 `/etc/bind/rndc.key`
|
||||
安装后也可再执行 `sudo ./server-prepare-dirs.sh` 统一目录属主。
|
||||
|
||||
## 故障排查(见下文 Troubleshooting_zh)
|
||||
- `./server-selfcheck.sh` → `logs/selfcheck.json`
|
||||
- `./server-diagnose.sh` → `logs/diagnose_error_*.log` / `logs/diagnose_details_*.log`
|
||||
|
||||
|
||||
26
deployment/build/templates/docs/SWARM_DEPLOY_zh.md
Normal file
26
deployment/build/templates/docs/SWARM_DEPLOY_zh.md
Normal file
@ -0,0 +1,26 @@
|
||||
# Argus 多机部署(Docker Swarm + External Overlay)
|
||||
|
||||
- 前提:Docker ≥ 20.10;Manager/Worker 节点开放 2377/tcp、7946/tcp+udp、4789/udp。
|
||||
- DNS:Bind9 为统一权威,解析 *.argus.com 至部署机固定对外主机 IP。
|
||||
|
||||
## 在部署机(Manager)
|
||||
- 初始化 Swarm:`docker swarm init --advertise-addr <manager_ip>`
|
||||
- 创建 overlay:`docker network create --driver overlay --attachable argus-sys-net`
|
||||
- 解压离线包后执行:
|
||||
- `cd scripts && sudo ./server-prepare-dirs.sh`
|
||||
- `./server-install.sh`(会验证 Swarm 状态、确保 overlay 存在、写入 dns.conf)
|
||||
- `./server-selfcheck.sh`(失败会自动触发诊断)
|
||||
|
||||
## 在节点机(Worker 或非 Docker 主机)
|
||||
- Swarm Worker:执行 Manager 的 `docker swarm join ...`;
|
||||
- 运行客户端容器:
|
||||
- `docker run -d --name argus-metric-node-001 --network argus-sys-net -v /data/argus/agent:/private/argus/agent argus-sys-metric-test-node:latest sleep infinity`
|
||||
- 进入容器安装(先 IP 引导,后域名):
|
||||
- `curl -u ftpuser:*** -fsSL ftp://<FTP_IP>:21/setup.sh -o /tmp/setup.sh && chmod +x /tmp/setup.sh`
|
||||
- `AGENT_ENV=dev AGENT_USER=alice AGENT_INSTANCE=node001 MASTER_ENDPOINT=http://master.argus.com:3000 /tmp/setup.sh --server ftp.argus.com --user ftpuser --password '***' --port 21`
|
||||
|
||||
## 关键点
|
||||
- 首次必须使用 FTP 的 IP 引导(下载 setup.sh 与 dns.conf)
|
||||
- MASTER_ENDPOINT 永远使用域名:`http://master.argus.com:3000`
|
||||
- docker compose 改为 external overlay;容器内不使用 Docker 服务名;web-proxy 与组件上游统一用域名
|
||||
|
||||
@ -11,5 +11,6 @@
|
||||
|
||||
Web‑Proxy:8083=200/302/403;8084/8085 需包含 CORS
|
||||
Kibana:确认可解析 `es.log.argus.com`
|
||||
权限:先运行 `sudo ./server-prepare-dirs.sh`
|
||||
|
||||
权限:
|
||||
- 非 root 安装时,安装器已创建最小目录并在容器内修复 Kibana/ES/Bind;
|
||||
- 如仍有 `EACCES`/锁文件报错,可再运行 `sudo ./server-prepare-dirs.sh` 统一目录属主。
|
||||
|
||||
@ -6,10 +6,16 @@ ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
|
||||
ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a
|
||||
|
||||
# Choose where this run's diagnose logs go: the package logs dir when it is
# usable, otherwise /tmp/argus-logs. Directory creation is best-effort so an
# unwritable package tree still reaches the fallback.
ts="$(date -u +%Y%m%d-%H%M%SZ)"
LOG_DIR="$ROOT/logs"
mkdir -p "$LOG_DIR" 2>/dev/null || true
# Fallback to /tmp when logs dir is not writable
# (tested with -d/-w so no probe file is left behind in the logs dir)
if [[ ! -d "$LOG_DIR" || ! -w "$LOG_DIR" ]]; then
  LOG_DIR="/tmp/argus-logs"
  mkdir -p "$LOG_DIR" || true
fi
DETAILS="$LOG_DIR/diagnose_details_${ts}.log"
ERRORS="$LOG_DIR/diagnose_error_${ts}.log"
# Start both files empty.
: > "$DETAILS"; : > "$ERRORS"
|
||||
|
||||
logd() { echo "$(date '+%F %T') $*" >> "$DETAILS"; }
|
||||
@ -83,6 +89,25 @@ logd "Web-Proxy 8083: $(http_code "http://localhost:${WEB_PROXY_PORT_8083:-8083}
|
||||
logd "Web-Proxy 8084 CORS: ${cors8084}"
|
||||
logd "Web-Proxy 8085 CORS: ${cors8085}"
|
||||
|
||||
# Overlay network diagnostics
|
||||
section OVERLAY-NET
|
||||
if docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" >/dev/null 2>&1; then
|
||||
logd "overlay present: ${OVERLAY_NET_NAME:-argus-sys-net}"
|
||||
docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" | sed -n '1,60p' >> "$DETAILS" 2>/dev/null || true
|
||||
else
|
||||
append_err "[overlay][network] missing ${OVERLAY_NET_NAME:-argus-sys-net}"
|
||||
fi
|
||||
|
||||
# Domain resolution & reachability from inside web-proxy (bind-backed)
section DOMAIN
for d in master.argus.com ftp.argus.com kibana.argus.com es.log.argus.com prom.argus.com grafana.argus.com; do
  logd "getent $d (web-proxy):"
  docker exec argus-web-proxy sh -lc "getent hosts $d || true" >> "$DETAILS" 2>&1 || true
done

# Print the HTTP status code for URL $1 as fetched by curl inside the
# argus-web-proxy container; prints 000 when nothing could be obtained.
# The inner command is passed as ONE plainly double-quoted argument: inside
# $(...) a backslash-escaped quote is a literal character, which would hand
# `sh -lc` a broken command string.
proxy_http_code() {
  local code
  code=$(docker exec argus-web-proxy sh -lc "curl -s -o /dev/null -w '%{http_code}' '$1'" 2>/dev/null) || true
  printf '%s\n' "${code:-000}"
}

logd "HTTP (web-proxy): master.readyz=$(proxy_http_code http://master.argus.com:3000/readyz)"
logd "HTTP (web-proxy): es.health=$(proxy_http_code http://es.log.argus.com:9200/_cluster/health)"
logd "HTTP (web-proxy): kibana.status=$(proxy_http_code http://kibana.argus.com:5601/api/status)"
|
||||
|
||||
# FTP share writability (container perspective)
|
||||
section FTP-SHARE
|
||||
docker exec argus-ftp sh -lc 'ls -ld /private/argus/ftp /private/argus/ftp/share; test -w /private/argus/ftp/share && echo "write:OK" || echo "write:FAIL"' >> "$DETAILS" 2>&1 || true
|
||||
@ -110,8 +135,13 @@ sort -u -o "$ERRORS" "$ERRORS"
|
||||
echo "Diagnostic details -> $DETAILS"
|
||||
echo "Detected errors -> $ERRORS"
|
||||
|
||||
# maintain latest symlinks for convenience
|
||||
ln -sfn "$(basename "$DETAILS")" "$ROOT/logs/diagnose_details.log" 2>/dev/null || cp "$DETAILS" "$ROOT/logs/diagnose_details.log" 2>/dev/null || true
|
||||
ln -sfn "$(basename "$ERRORS")" "$ROOT/logs/diagnose_error.log" 2>/dev/null || cp "$ERRORS" "$ROOT/logs/diagnose_error.log" 2>/dev/null || true
|
||||
if [[ "$LOG_DIR" == "$ROOT/logs" ]]; then
|
||||
# maintain latest symlinks when writing under package logs
|
||||
ln -sfn "$(basename "$DETAILS")" "$ROOT/logs/diagnose_details.log" 2>/dev/null || cp "$DETAILS" "$ROOT/logs/diagnose_details.log" 2>/dev/null || true
|
||||
ln -sfn "$(basename "$ERRORS")" "$ROOT/logs/diagnose_error.log" 2>/dev/null || cp "$ERRORS" "$ROOT/logs/diagnose_error.log" 2>/dev/null || true
|
||||
else
|
||||
echo "Diagnostic details -> $DETAILS"
|
||||
echo "Detected errors -> $ERRORS"
|
||||
fi
|
||||
|
||||
exit 0
|
||||
|
||||
@ -30,17 +30,7 @@ prepare_env() {
|
||||
if [[ -f "$ENV_FILE" ]]; then log ".env exists, keep as-is"; return; fi
|
||||
[[ -f "$ENV_TEMPLATE" ]] || { err "missing $ENV_TEMPLATE"; exit 1; }
|
||||
cp "$ENV_TEMPLATE" "$ENV_FILE"
|
||||
# auto-assign ports if busy
|
||||
for key in MASTER_PORT ES_HTTP_PORT KIBANA_PORT NODE_A_PORT NODE_B_PORT PROMETHEUS_PORT GRAFANA_PORT ALERTMANAGER_PORT \
|
||||
WEB_PROXY_PORT_8080 WEB_PROXY_PORT_8081 WEB_PROXY_PORT_8082 WEB_PROXY_PORT_8083 WEB_PROXY_PORT_8084 WEB_PROXY_PORT_8085 \
|
||||
FTP_PORT FTP_DATA_PORT; do
|
||||
val=$(grep -E "^${key}=" "$ENV_FILE" | tail -1 | cut -d= -f2)
|
||||
new=$(find_free_port "$val") || true
|
||||
if [[ -n "${new:-}" && "$new" != "$val" ]]; then
|
||||
sed -i "s/^${key}=.*/${key}=${new}/" "$ENV_FILE"
|
||||
log "port ${key} busy -> ${new}"
|
||||
fi
|
||||
done
|
||||
# overlay 模式下,避免为不同服务分配到同一新端口;保持模板端口不做自动改写
|
||||
}
|
||||
|
||||
prepare_data_dirs() {
|
||||
@ -50,6 +40,8 @@ prepare_data_dirs() {
|
||||
# still ensure basic directories exist (no chown)
|
||||
mkdir -p \
|
||||
"$PKG_ROOT/private/argus/etc" \
|
||||
"$PKG_ROOT/private/argus/log/elasticsearch" \
|
||||
"$PKG_ROOT/private/argus/log/kibana" \
|
||||
"$PKG_ROOT/private/argus/metric/prometheus" \
|
||||
"$PKG_ROOT/private/argus/metric/prometheus/data" \
|
||||
"$PKG_ROOT/private/argus/metric/prometheus/rules" \
|
||||
@ -63,6 +55,43 @@ prepare_data_dirs() {
|
||||
"$PKG_ROOT/private/argus/metric/grafana/data/dashboards" \
|
||||
"$PKG_ROOT/private/argus/alert/alertmanager" \
|
||||
"$PKG_ROOT/private/argus/metric/ftp/share"
|
||||
# non-root: relax permissions to avoid container UID mismatch blocking writes
|
||||
chmod -R a+rwx "$PKG_ROOT/private/argus" 2>/dev/null || true
|
||||
fi
|
||||
}
|
||||
|
||||
#######################################
# Verify that Docker Swarm is active on this host and that the overlay
# network exists, creating it as an attachable overlay when missing.
# Globals:   OVERLAY_NET_NAME (read; defaults to argus-sys-net)
# Arguments: none
# Returns:   0 when the network is present or was created; exits 1 when
#            Swarm is not active or the network cannot be created.
#######################################
ensure_swarm_and_overlay() {
  local net_name="${OVERLAY_NET_NAME:-argus-sys-net}"
  local state

  # Require swarm active; an unreachable daemon yields an empty state.
  state=$(docker info --format '{{.Swarm.LocalNodeState}}' 2>/dev/null || echo "")
  if [[ "$state" != "active" ]]; then
    err "Docker Swarm is not active. On this host run:"
    err " docker swarm init --advertise-addr <this_host_ip>"
    exit 1
  fi

  # Nothing to do when the network already exists.
  if docker network inspect "$net_name" >/dev/null 2>&1; then
    return 0
  fi

  # Create attachable overlay if missing; fail loudly rather than letting a
  # later compose step trip over the missing external network.
  log "creating attachable overlay network: $net_name"
  if ! docker network create --driver overlay --attachable "$net_name" >/dev/null; then
    err "failed to create overlay network: $net_name"
    exit 1
  fi
}
|
||||
|
||||
#######################################
# Seed $PKG_ROOT/private/argus/etc/dns.conf with this host's primary IP
# when the file is missing or empty. An existing non-empty file is kept.
# Globals:   PKG_ROOT (read)
# Arguments: none
# Outputs:   writes dns.conf; reports via log / err
#######################################
bootstrap_dns_conf() {
  local etc_dir="$PKG_ROOT/private/argus/etc"
  local dns_file="$etc_dir/dns.conf"
  local host_ip=""

  mkdir -p "$etc_dir"
  if [[ -s "$dns_file" ]]; then
    return 0
  fi

  # detect host primary IP: take the token following "src" rather than a
  # fixed field number, because the field position shifts when the route has
  # no "via <gateway>" hop.
  host_ip=$(ip route get 1.1.1.1 2>/dev/null \
    | awk '{ for (i = 1; i < NF; i++) if ($i == "src") { print $(i + 1); exit } }') || true
  # Fall back to the first address reported by hostname -I.
  if [[ -z "$host_ip" ]]; then
    host_ip=$(hostname -I 2>/dev/null | awk '{print $1}') || true
  fi

  if [[ -n "$host_ip" ]]; then
    echo "$host_ip" > "$dns_file"
    log "wrote initial dns.conf with host IP: $host_ip"
  else
    err "failed to determine host IP for dns.conf; please edit $dns_file manually"
  fi
}
|
||||
|
||||
@ -75,6 +104,8 @@ load_images() {
|
||||
|
||||
bring_up() {
|
||||
log "starting services via compose"
|
||||
ensure_swarm_and_overlay
|
||||
bootstrap_dns_conf
|
||||
local ov="$PKG_ROOT/compose/docker-compose.os-compat.override.yml"
|
||||
if [[ ! -f "$ov" ]]; then
|
||||
cat > "$ov" <<'YAML'
|
||||
@ -124,6 +155,37 @@ YAML
|
||||
(cd "$PKG_ROOT/compose" && "${COMPOSE[@]}" -p "$PROJECT_NAME" -f docker-compose.yml -f $(basename "$ov") up -d "${services[@]}")
|
||||
}
|
||||
|
||||
# Run script $2 with `bash -lc` inside container $1 when that container is
# listed by `docker ps`. Output is discarded and failures are ignored.
_argus_fix_in_container() {
  local name=$1 script=$2
  docker ps --format '{{.Names}}' | grep -q "^${name}\$" || return 0
  docker exec "$name" bash -lc "$script" >/dev/null 2>&1 || true
}

# Post bootstrap container-side fixes that do not require sudo on host.
# Every step is best-effort: a container that is not running is skipped and
# errors inside a container do not fail the install.
post_bootstrap_fixes() {
  # Kibana: ensure /usr/share/kibana/data is a symlink into mounted path to avoid EACCES
  local kibana_fix='
set -e
mkdir -p /private/argus/log/kibana && chmod 777 /private/argus/log/kibana || true
if [ -d /usr/share/kibana/data ] && [ ! -L /usr/share/kibana/data ]; then rm -rf /usr/share/kibana/data; fi
if [ ! -e /usr/share/kibana/data ]; then ln -s /private/argus/log/kibana /usr/share/kibana/data; fi
'
  # Elasticsearch: ensure data path points to mounted path and is writable
  local es_fix='
set -e
mkdir -p /private/argus/log/elasticsearch && chmod 777 /private/argus/log/elasticsearch || true
if [ -d /usr/share/elasticsearch/data ] && [ ! -L /usr/share/elasticsearch/data ]; then rm -rf /usr/share/elasticsearch/data; fi
if [ ! -e /usr/share/elasticsearch/data ]; then ln -s /private/argus/log/elasticsearch /usr/share/elasticsearch/data; fi
'
  # Bind9: ensure rndc.key exists
  local bind_fix='
set -e
mkdir -p /etc/bind
if [ ! -f /etc/bind/rndc.key ]; then rndc-confgen -a -c /etc/bind/rndc.key; fi
chmod 644 /etc/bind/rndc.key || true
'

  _argus_fix_in_container argus-kibana-sys "$kibana_fix"
  _argus_fix_in_container argus-es-sys "$es_fix"
  _argus_fix_in_container argus-bind-sys "$bind_fix"
}
|
||||
|
||||
dns_bootstrap() {
|
||||
log "DNS bootstrap: initializing shared dns.conf and container resolv.conf"
|
||||
local etc_dir="$PKG_ROOT/private/argus/etc"
|
||||
@ -177,15 +239,40 @@ dns_bootstrap() {
|
||||
}
|
||||
|
||||
#######################################
# Run scripts/server-selfcheck.sh, retrying to absorb cold starts.
# Globals:   PKG_ROOT (read)
#            SELF_CHECK_RETRIES (read) - retries after the first attempt,
#              default 5; a non-numeric value falls back to 5
#            SELF_CHECK_WAIT_SECONDS (read) - pause before each retry,
#              default 30
# Arguments: none
# Returns:   0 as soon as one attempt succeeds; exits 1 once the first
#            attempt and all retries have failed.
#######################################
selfcheck() {
  local max_retries="${SELF_CHECK_RETRIES:-5}"
  local wait_seconds="${SELF_CHECK_WAIT_SECONDS:-30}"
  local total attempt

  # Guard the arithmetic below against a malformed override.
  [[ "$max_retries" =~ ^[0-9]+$ ]] || max_retries=5
  total=$((max_retries + 1))

  # No unconditional one-shot run before the loop: a first failure must lead
  # to a retry, not to an immediate exit.
  for ((attempt = 1; attempt <= total; attempt++)); do
    log "running selfcheck (attempt ${attempt}/${total})"
    if bash "$PKG_ROOT/scripts/server-selfcheck.sh"; then
      return 0
    fi
    if (( attempt < total )); then
      log "selfcheck not ready yet; retrying in ${wait_seconds}s..."
      sleep "$wait_seconds"
    fi
  done

  err "selfcheck failed after ${total} attempt(s)"
  exit 1
}
|
||||
|
||||
main() {
|
||||
mkdir -p "$PKG_ROOT/logs"
|
||||
prepare_env
|
||||
prepare_data_dirs
|
||||
load_images
|
||||
bring_up
|
||||
post_bootstrap_fixes
|
||||
dns_bootstrap
|
||||
selfcheck
|
||||
log "install completed. See logs in $PKG_ROOT/logs/"
|
||||
|
||||
@ -13,23 +13,31 @@ wait_http() { local url="$1"; local attempts=${2:-120}; local i=1; while ((i<=at
|
||||
code_for() { curl -s -o /dev/null -w "%{http_code}" "$1" || echo 000; }
|
||||
header_val() { curl -s -D - -o /dev/null "$@" | awk -F': ' 'BEGIN{IGNORECASE=1}$1=="Access-Control-Allow-Origin"{gsub("\r","",$2);print $2}'; }
|
||||
|
||||
mkdir -p "$ROOT/logs"
|
||||
OUT_JSON="$ROOT/logs/selfcheck.json"
|
||||
LOG_DIR="$ROOT/logs"
|
||||
mkdir -p "$LOG_DIR" || true
|
||||
OUT_JSON="$LOG_DIR/selfcheck.json"
|
||||
tmp=$(mktemp)
|
||||
|
||||
ok=1
|
||||
|
||||
log "checking Elasticsearch"
|
||||
if curl -fsS "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health?wait_for_status=yellow&timeout=1s" >/dev/null 2>&1; then es_ok=true; else es_ok=false; ok=0; fi
|
||||
log "checking overlay network"
|
||||
net_ok=false
|
||||
if docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" >/dev/null 2>&1; then
|
||||
if docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" | grep -q '"Driver": "overlay"'; then net_ok=true; fi
|
||||
fi
|
||||
[[ "$net_ok" == true ]] || ok=0
|
||||
|
||||
log "checking Kibana"
|
||||
kb_code=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:${KIBANA_PORT:-5601}/api/status" || echo 000)
|
||||
log "checking Elasticsearch (via domain inside web-proxy)"
# The URL is single-quoted for the inner shell. Unquoted, its '&' would end
# the command there: curl would run in the background and `timeout=1s` would
# be a plain assignment whose zero status makes the check pass regardless.
if docker exec argus-web-proxy sh -lc "curl -fsS 'http://es.log.argus.com:9200/_cluster/health?wait_for_status=yellow&timeout=1s'" >/dev/null 2>&1; then es_ok=true; else es_ok=false; ok=0; fi
|
||||
|
||||
log "checking Kibana (via domain inside web-proxy)"
|
||||
kb_code=$(docker exec argus-web-proxy sh -lc "curl -s -o /dev/null -w '%{http_code}' http://kibana.log.argus.com:5601/api/status" || echo 000)
|
||||
kb_ok=false
|
||||
if [[ "$kb_code" == "200" ]]; then body=$(curl -sS "http://localhost:${KIBANA_PORT:-5601}/api/status"); echo "$body" | grep -q '"level":"available"' && kb_ok=true; fi
|
||||
[[ "$kb_ok" == true ]] || ok=0
|
||||
|
||||
log "checking Master"
|
||||
wait_http "http://localhost:${MASTER_PORT:-32300}/readyz" 60 || ok=0
|
||||
log "checking Master (via domain inside web-proxy)"
|
||||
if docker exec argus-web-proxy sh -lc "curl -fsS http://master.argus.com:3000/readyz" >/dev/null 2>&1; then true; else ok=0; fi
|
||||
|
||||
log "checking FTP"
|
||||
if docker ps --format '{{.Names}}' | grep -q '^argus-ftp$'; then
|
||||
@ -71,11 +79,17 @@ cat > "$tmp" <<JSON
|
||||
"grafana": $gf_ok,
|
||||
"alertmanager": true,
|
||||
"web_proxy": $wp_ok,
|
||||
"overlay_net": $net_ok,
|
||||
"timestamp": "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
|
||||
}
|
||||
JSON
|
||||
|
||||
mv "$tmp" "$OUT_JSON"
|
||||
if ! mv "$tmp" "$OUT_JSON" 2>/dev/null; then
|
||||
# fallback when logs dir not writable (no sudo allowed)
|
||||
OUT_JSON="/tmp/selfcheck_$(date -u +%Y%m%d-%H%M%SZ).json"
|
||||
cp "$tmp" "$OUT_JSON"
|
||||
log "selfcheck.json written to $OUT_JSON (logs dir not writable)"
|
||||
fi
|
||||
if [[ "$ok" == 1 ]]; then
|
||||
log "selfcheck OK"
|
||||
exit 0
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user