Compare commits

...

2 Commits

Author SHA1 Message Date
b9611c2dd2 [#37] server install 增加重试自检 2025-11-04 11:37:27 +08:00
37af47076b [#37] swarm 部署优化 2025-11-04 09:34:42 +08:00
10 changed files with 311 additions and 42 deletions

View File

@ -46,7 +46,11 @@ make_dir "$STAGE/private/argus"
# 2) Compose: derive from sys/tests by removing test-only services
SRC_COMPOSE="$ROOT_DIR/src/sys/tests/docker-compose.yml"
[[ -f "$SRC_COMPOSE" ]] || { err "missing $SRC_COMPOSE"; exit 1; }
awk -f "$BUILD_DIR/templates/docker-compose.filter.awk" -v remove="node-a,node-b,test-node,test-gpu-node" "$SRC_COMPOSE" > "$STAGE/compose/docker-compose.yml"
# 2.1 filter out test services
tmp_compose1="$STAGE/compose/docker-compose.filtered.yml"
awk -f "$BUILD_DIR/templates/docker-compose.filter.awk" -v remove="node-a,node-b,test-node,test-gpu-node" "$SRC_COMPOSE" > "$tmp_compose1"
# 2.2 transform to external overlay network (remove sysnet and per-service blocks)
awk -f "$BUILD_DIR/templates/docker-compose.overlay.awk" "$tmp_compose1" > "$STAGE/compose/docker-compose.yml"
cp "$BUILD_DIR/templates/.env.example" "$STAGE/compose/.env.example"
# fix relative private path to match package layout (compose/ and private/ are siblings)
sed -i 's#\./private/#../private/#g' "$STAGE/compose/docker-compose.yml"
@ -55,17 +59,41 @@ sed -i -E 's#- \./private:/private#- ../private:/private#g' "$STAGE/compose/dock
# drop timezone file bind which may not exist on target distros (e.g. NixOS)
sed -i '/\/etc\/timezone:\/etc\/timezone:ro/d' "$STAGE/compose/docker-compose.yml"
# sanity-check: ensure test services are absent
# sanity-check: ensure test services are absent and external network present
if grep -E '^(\s{2})(node-a|node-b|test-node|test-gpu-node):\s*$' -n "$STAGE/compose/docker-compose.yml" >/dev/null; then
err "compose filter failed: test services still present"; exit 1;
fi
if ! grep -q '^networks:' "$STAGE/compose/docker-compose.yml" || ! grep -q 'argus-sys-net:' "$STAGE/compose/docker-compose.yml"; then
err "compose overlay transform failed: external network missing"; exit 1;
fi
# 3) Images (reuse if already exported unless --resave-image)
existing_images_tar="$PKG_DIR/images/all-images.tar.gz"
if [[ "$RESAVE_IMAGE" == false && -f "$existing_images_tar" ]]; then
log "Reusing existing images tar: $existing_images_tar"
cp "$existing_images_tar" "$STAGE/images/"
else
elif [[ "$RESAVE_IMAGE" == false ]]; then
# Try cross-version reuse from latest server_*.tar.gz
latest_pkg=$(ls -1t "$ART_ROOT/server"/server_*.tar.gz 2>/dev/null | head -n1 || true)
if [[ -n "$latest_pkg" ]]; then
log "Reusing images from: $latest_pkg"
mkdir -p "$STAGE/images"
# extract matching file regardless of top-level dir
if tar -xzf "$latest_pkg" -C "$STAGE" --wildcards '*/images/all-images.tar.gz' 2>/dev/null; then
# locate and move
found=$(find "$STAGE" -type f -path '*/images/all-images.tar.gz' | head -n1 || true)
if [[ -n "$found" ]]; then
mv "$found" "$STAGE/images/all-images.tar.gz"
# cleanup leftover extracted dir
dir_to_clean=$(dirname "$found")
rm -rf "${dir_to_clean%/images}" 2>/dev/null || true
fi
fi
fi
fi
# If still not present, save from local docker daemon
if [[ ! -f "$STAGE/images/all-images.tar.gz" ]]; then
require_cmd docker gzip
images=(
argus-bind9:latest

View File

@ -27,3 +27,6 @@ FTP_DOMAIN=ftp.metric.argus.com
# GPU profile disabled by default
ENABLE_GPU=false
# External overlay network (Swarm attachable)
OVERLAY_NET_NAME=argus-sys-net

View File

@ -0,0 +1,74 @@
#!/usr/bin/awk -f
# Transform docker-compose.yml to use an external overlay network for all services
# - Remove top-level networks definition
# - Remove per-service networks block (including ipv4_address and sysnet refs)
# - Insert per-service networks: [argus-sys-net]
# - Append external networks mapping at the end
#
# NOTE(review): relies on gawk extensions — the three-argument match() in
# svc_name() and the \s character class in several regexes — so this script
# requires gawk; it will not run under POSIX awk/mawk. Confirm the target
# environment ships gawk.
BEGIN{
# State flags. networks_inserted and skipping_nets are not listed here and
# rely on awk's default-to-0 initialization of uninitialized variables.
in_top_networks=0; in_services=0; in_service=0; svc_indent=0; curr_name="";
}
# True when the line opens a service entry (indented "name:" with no value).
function is_service_header(line){ return svc_name(line)!=""; }
# Extract the service name from a header line, or "" when it is not one
# (three-argument match() capture is a gawk extension).
function svc_name(line, m){ if (match(line, /^ ([A-Za-z0-9_-]+):[ ]*$/, m)) return m[1]; return ""; }
# Count of leading spaces; returns 0 for empty or all-space lines.
function indent_len(s, n){ n=match(s,/[^ ]/)-1; if(n<0) n=0; return n; }
{
# Detect entry into top-level sections
if ($0 ~ /^[A-Za-z0-9_-]+:[ ]*$/ && $0 !~ /^\s/) {
in_services = ($0 ~ /^services:[ ]*$/);
# If a new top-level section starts, stop skipping top networks
in_top_networks = 0;
}
# Handle removal of initial top-level 'networks:' block
if ($0 ~ /^networks:[ ]*$/ && $0 !~ /^\s/) {
in_top_networks = 1; next;
}
if (in_top_networks) {
# skip until next top-level section (non-indented key)
next;
}
if (in_services) {
# Track service boundaries
if (is_service_header($0)) {
# Reset per-service state; curr_name is recorded but not otherwise used.
in_service=1; svc_indent=2; networks_inserted=0; curr_name=svc_name($0); print; next;
}
if (in_service) {
# If line is indented <= service indent, we've left this service
if (indent_len($0) <= svc_indent && $0 !~ /^\s*$/) {
in_service=0;
}
}
if (in_service) {
# Skip any existing networks block under the service
if ($0 ~ /^\s{4}networks:[ ]*$/) { skipping_nets=1; next; }
if (skipping_nets) {
# Stop skipping once indentation returns to the service's key level.
if (indent_len($0) <= 4) { skipping_nets=0; }
else next;
}
# After container_name or image, inject networks once
if (!networks_inserted && ($0 ~ /^\s{4}container_name:/ || $0 ~ /^\s{4}image:/)) {
print;
print " networks:";
print " - argus-sys-net";
networks_inserted=1; next;
}
# no host port injection; bind serves DNS inside overlay only
}
}
print;
}
# Append the replacement top-level mapping: reference the pre-created
# attachable overlay; its real name is overridable via OVERLAY_NET_NAME
# when compose interpolates the file.
END{
print "";
print "networks:";
print " argus-sys-net:";
print " external: true";
print " name: ${OVERLAY_NET_NAME:-argus-sys-net}";
}

View File

@ -7,8 +7,8 @@
## Quick Start
1. Extract to a target dir, e.g. `/opt/argus-deploy/versions/<YYYYMMDD>`
2. `cd scripts && sudo ./server-prepare-dirs.sh`
3. `./server-install.sh`
2. `cd scripts && sudo ./server-prepare-dirs.sh` (recommended)
3. `./server-install.sh` (nonroot is supported: it will precreate minimal dirs and auto-fix Kibana/ES/Bind in containers)
4. `./server-status.sh`
5. `./server-selfcheck.sh` (on failure it auto-runs diagnose and writes logs under `logs/`)
6. `./server-uninstall.sh` to tear down
@ -25,7 +25,11 @@
- Writes `logs/selfcheck.json` as final summary
## OS Compatibility
- NixOS / non-xattr FS: containers run with `security_opt: ["label=disable"]` and `userns_mode: host`; ensure data dirs are pre-created via `sudo ./server-prepare-dirs.sh` and owned by runtime UID:GID (default 1000:1000).
- NixOS / non-xattr FS: containers run with `security_opt: ["label=disable"]` and `userns_mode: host`.
- If you cannot use sudo, the installer will:
- create minimal data dirs (incl. `private/argus/log/{elasticsearch,kibana}`) with permissive perms when possible;
- ensure inside containers: Kibana `data` → `/private/argus/log/kibana`, Elasticsearch `data` → `/private/argus/log/elasticsearch`, and Bind `rndc.key` is generated.
You can still run `sudo ./server-prepare-dirs.sh` later to normalize ownership.
## Files & Layout
- `compose/` (docker-compose.yml, .env)
@ -45,4 +49,3 @@ Common issues:
- Kibana 503: wait cold start or fix DNS so `es.log.argus.com` resolves
- webproxy 504: check nginx `resolver` includes `172.31.0.2 127.0.0.11`
- EACCES/locks: ensure `sudo ./server-prepare-dirs.sh` ran and ownership matches UID:GID

View File

@ -7,8 +7,8 @@
## 快速开始
1. 解压到目标目录(例如 `/opt/argus-deploy/versions/<YYYYMMDD>`)
2. 进入 `scripts/``sudo ./server-prepare-dirs.sh`
3. 安装:`./server-install.sh`
2. 进入 `scripts/``sudo ./server-prepare-dirs.sh`(推荐)
3. 安装:`./server-install.sh`(支持普通用户:会自动创建最小目录并在容器内修复 Kibana/ES/Bind)
4. 状态:`./server-status.sh`
5. 自检:`./server-selfcheck.sh`(失败会自动采集诊断)
6. 卸载:`./server-uninstall.sh`
@ -19,10 +19,13 @@
- 输出自检结果到 `logs/selfcheck.json`
## 兼容说明(NixOS 等)
- 使用 `security_opt: ["label=disable"]``userns_mode: host`
- 先运行 `sudo ./server-prepare-dirs.sh` 创建/授权目录为 `1000:1000`
- 使用 `security_opt: ["label=disable"]``userns_mode: host`
- 若不能使用 sudo安装器会创建最小目录`private/argus/log/{elasticsearch,kibana}`),并在容器内完成:
- Kibana 的 `data` 软链到 `/private/argus/log/kibana`
- Elasticsearch 的 `data` 软链到 `/private/argus/log/elasticsearch`
- Bind 生成 `/etc/bind/rndc.key`
安装后也可再执行 `sudo ./server-prepare-dirs.sh` 统一目录属主。
## 故障排查(见下文 Troubleshooting_zh)
- `./server-selfcheck.sh``logs/selfcheck.json`
- `./server-diagnose.sh``logs/diagnose_error_*.log` / `logs/diagnose_details_*.log`

View File

@ -0,0 +1,26 @@
# Argus 多机部署Docker Swarm + External Overlay
- 前提Docker ≥ 20.10Manager/Worker 节点开放 2377/tcp、7946/tcp+udp、4789/udp。
- DNSBind9 为统一权威,解析 *.argus.com 至部署机固定对外主机 IP。
## 在部署机(Manager)
- 初始化 Swarm`docker swarm init --advertise-addr <manager_ip>`
- 创建 overlay`docker network create --driver overlay --attachable argus-sys-net`
- 解压离线包后执行:
- `cd scripts && sudo ./server-prepare-dirs.sh`
- `./server-install.sh`(会验证 Swarm 状态、确保 overlay 存在、写入 dns.conf)
- `./server-selfcheck.sh`(失败会自动触发诊断)
## 在节点机(Worker 或非 Docker 主机)
- Swarm Worker执行 Manager 的 `docker swarm join ...`
- 运行客户端容器:
- `docker run -d --name argus-metric-node-001 --network argus-sys-net -v /data/argus/agent:/private/argus/agent argus-sys-metric-test-node:latest sleep infinity`
- 进入容器安装(先 IP 引导,后域名):
- `curl -u ftpuser:*** -fsSL ftp://<FTP_IP>:21/setup.sh -o /tmp/setup.sh && chmod +x /tmp/setup.sh`
- `AGENT_ENV=dev AGENT_USER=alice AGENT_INSTANCE=node001 MASTER_ENDPOINT=http://master.argus.com:3000 /tmp/setup.sh --server ftp.argus.com --user ftpuser --password '***' --port 21`
## 关键点
- 首次必须使用 FTP 的 IP 引导(下载 setup.sh 与 dns.conf)
- MASTER_ENDPOINT 永远使用域名:`http://master.argus.com:3000`
- docker compose 改为 external overlay容器内不使用 Docker 服务名web-proxy 与组件上游统一用域名

View File

@ -11,5 +11,6 @@
WebProxy:8083=200/302/403;8084/8085 需包含 CORS
Kibana:确认可解析 `es.log.argus.com`
权限:先运行 `sudo ./server-prepare-dirs.sh`
权限:
- 非 root 安装时,安装器已创建最小目录并在容器内修复 Kibana/ES/Bind
- 如仍有 `EACCES`/锁文件报错,可再运行 `sudo ./server-prepare-dirs.sh` 统一目录属主。

View File

@ -6,10 +6,16 @@ ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a
mkdir -p "$ROOT/logs"
ts="$(date -u +%Y%m%d-%H%M%SZ)"
DETAILS="$ROOT/logs/diagnose_details_${ts}.log"
ERRORS="$ROOT/logs/diagnose_error_${ts}.log"
LOG_DIR="$ROOT/logs"
mkdir -p "$LOG_DIR" || true
# Fallback to /tmp when logs dir is not writable
if ! ( : > "$LOG_DIR/.w" 2>/dev/null ); then
LOG_DIR="/tmp/argus-logs"
mkdir -p "$LOG_DIR" || true
fi
DETAILS="$LOG_DIR/diagnose_details_${ts}.log"
ERRORS="$LOG_DIR/diagnose_error_${ts}.log"
: > "$DETAILS"; : > "$ERRORS"
logd() { echo "$(date '+%F %T') $*" >> "$DETAILS"; }
@ -83,6 +89,25 @@ logd "Web-Proxy 8083: $(http_code "http://localhost:${WEB_PROXY_PORT_8083:-8083}
logd "Web-Proxy 8084 CORS: ${cors8084}"
logd "Web-Proxy 8085 CORS: ${cors8085}"
# Overlay network diagnostics
section OVERLAY-NET
if docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" >/dev/null 2>&1; then
logd "overlay present: ${OVERLAY_NET_NAME:-argus-sys-net}"
docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" | sed -n '1,60p' >> "$DETAILS" 2>/dev/null || true
else
append_err "[overlay][network] missing ${OVERLAY_NET_NAME:-argus-sys-net}"
fi
# Domain resolution & reachability from inside web-proxy (bind-backed)
section DOMAIN
for d in master.argus.com ftp.argus.com kibana.argus.com es.log.argus.com prom.argus.com grafana.argus.com; do
logd "getent $d (web-proxy):"
docker exec argus-web-proxy sh -lc "getent hosts $d || true" >> "$DETAILS" 2>&1 || true
done
logd "HTTP (web-proxy): master.readyz=$(docker exec argus-web-proxy sh -lc \"curl -s -o /dev/null -w '%{http_code}' http://master.argus.com:3000/readyz\" 2>/dev/null || echo 000)"
logd "HTTP (web-proxy): es.health=$(docker exec argus-web-proxy sh -lc \"curl -s -o /dev/null -w '%{http_code}' http://es.log.argus.com:9200/_cluster/health\" 2>/dev/null || echo 000)"
logd "HTTP (web-proxy): kibana.status=$(docker exec argus-web-proxy sh -lc \"curl -s -o /dev/null -w '%{http_code}' http://kibana.argus.com:5601/api/status\" 2>/dev/null || echo 000)"
# FTP share writability (container perspective)
section FTP-SHARE
docker exec argus-ftp sh -lc 'ls -ld /private/argus/ftp /private/argus/ftp/share; test -w /private/argus/ftp/share && echo "write:OK" || echo "write:FAIL"' >> "$DETAILS" 2>&1 || true
@ -110,8 +135,13 @@ sort -u -o "$ERRORS" "$ERRORS"
echo "Diagnostic details -> $DETAILS"
echo "Detected errors -> $ERRORS"
# maintain latest symlinks for convenience
ln -sfn "$(basename "$DETAILS")" "$ROOT/logs/diagnose_details.log" 2>/dev/null || cp "$DETAILS" "$ROOT/logs/diagnose_details.log" 2>/dev/null || true
ln -sfn "$(basename "$ERRORS")" "$ROOT/logs/diagnose_error.log" 2>/dev/null || cp "$ERRORS" "$ROOT/logs/diagnose_error.log" 2>/dev/null || true
if [[ "$LOG_DIR" == "$ROOT/logs" ]]; then
# maintain latest symlinks when writing under package logs
ln -sfn "$(basename "$DETAILS")" "$ROOT/logs/diagnose_details.log" 2>/dev/null || cp "$DETAILS" "$ROOT/logs/diagnose_details.log" 2>/dev/null || true
ln -sfn "$(basename "$ERRORS")" "$ROOT/logs/diagnose_error.log" 2>/dev/null || cp "$ERRORS" "$ROOT/logs/diagnose_error.log" 2>/dev/null || true
else
echo "Diagnostic details -> $DETAILS"
echo "Detected errors -> $ERRORS"
fi
exit 0

View File

@ -30,17 +30,7 @@ prepare_env() {
if [[ -f "$ENV_FILE" ]]; then log ".env exists, keep as-is"; return; fi
[[ -f "$ENV_TEMPLATE" ]] || { err "missing $ENV_TEMPLATE"; exit 1; }
cp "$ENV_TEMPLATE" "$ENV_FILE"
# auto-assign ports if busy
for key in MASTER_PORT ES_HTTP_PORT KIBANA_PORT NODE_A_PORT NODE_B_PORT PROMETHEUS_PORT GRAFANA_PORT ALERTMANAGER_PORT \
WEB_PROXY_PORT_8080 WEB_PROXY_PORT_8081 WEB_PROXY_PORT_8082 WEB_PROXY_PORT_8083 WEB_PROXY_PORT_8084 WEB_PROXY_PORT_8085 \
FTP_PORT FTP_DATA_PORT; do
val=$(grep -E "^${key}=" "$ENV_FILE" | tail -1 | cut -d= -f2)
new=$(find_free_port "$val") || true
if [[ -n "${new:-}" && "$new" != "$val" ]]; then
sed -i "s/^${key}=.*/${key}=${new}/" "$ENV_FILE"
log "port ${key} busy -> ${new}"
fi
done
# overlay 模式下,避免为不同服务分配到同一新端口;保持模板端口不做自动改写
}
prepare_data_dirs() {
@ -50,6 +40,8 @@ prepare_data_dirs() {
# still ensure basic directories exist (no chown)
mkdir -p \
"$PKG_ROOT/private/argus/etc" \
"$PKG_ROOT/private/argus/log/elasticsearch" \
"$PKG_ROOT/private/argus/log/kibana" \
"$PKG_ROOT/private/argus/metric/prometheus" \
"$PKG_ROOT/private/argus/metric/prometheus/data" \
"$PKG_ROOT/private/argus/metric/prometheus/rules" \
@ -63,6 +55,43 @@ prepare_data_dirs() {
"$PKG_ROOT/private/argus/metric/grafana/data/dashboards" \
"$PKG_ROOT/private/argus/alert/alertmanager" \
"$PKG_ROOT/private/argus/metric/ftp/share"
# non-root: relax permissions to avoid container UID mismatch blocking writes
chmod -R a+rwx "$PKG_ROOT/private/argus" 2>/dev/null || true
fi
}
ensure_swarm_and_overlay() {
  # Preconditions for overlay networking: this host must be part of an
  # active Docker Swarm, and the attachable overlay network all services
  # join must exist (created here if missing).
  local overlay="${OVERLAY_NET_NAME:-argus-sys-net}"
  local swarm_state
  swarm_state=$(docker info --format '{{.Swarm.LocalNodeState}}' 2>/dev/null || echo "")
  if [[ "$swarm_state" != "active" ]]; then
    err "Docker Swarm is not active. On this host run:"
    err " docker swarm init --advertise-addr <this_host_ip>"
    exit 1
  fi
  # Already present? Nothing to do.
  if docker network inspect "$overlay" >/dev/null 2>&1; then
    return 0
  fi
  log "creating attachable overlay network: $overlay"
  docker network create --driver overlay --attachable "$overlay" >/dev/null
}
bootstrap_dns_conf() {
  # Seed private/argus/etc/dns.conf with this host's primary IP so DNS
  # bootstrap can point clients at the deploy machine. An existing
  # non-empty dns.conf is left untouched.
  local etc_dir="$PKG_ROOT/private/argus/etc"
  local dns_file="$etc_dir/dns.conf"
  mkdir -p "$etc_dir"
  if [[ -s "$dns_file" ]]; then
    return 0
  fi
  # Detect the host's outbound IP via the default route; fall back to the
  # first address reported by hostname -I.
  local host_ip
  host_ip=$(ip route get 1.1.1.1 2>/dev/null | awk '/src/ {print $7; exit}')
  if [[ -z "$host_ip" ]]; then
    host_ip=$(hostname -I 2>/dev/null | awk '{print $1}')
  fi
  if [[ -n "$host_ip" ]]; then
    echo "$host_ip" > "$dns_file"
    log "wrote initial dns.conf with host IP: $host_ip"
  else
    # Best-effort: report but do not abort; the operator can fill it in.
    err "failed to determine host IP for dns.conf; please edit $dns_file manually"
  fi
}
@ -75,6 +104,8 @@ load_images() {
bring_up() {
log "starting services via compose"
ensure_swarm_and_overlay
bootstrap_dns_conf
local ov="$PKG_ROOT/compose/docker-compose.os-compat.override.yml"
if [[ ! -f "$ov" ]]; then
cat > "$ov" <<'YAML'
@ -124,6 +155,37 @@ YAML
(cd "$PKG_ROOT/compose" && "${COMPOSE[@]}" -p "$PROJECT_NAME" -f docker-compose.yml -f $(basename "$ov") up -d "${services[@]}")
}
# Post bootstrap container-side fixes that do not require sudo on host.
post_bootstrap_fixes() {
# Container-side repairs applied after bring-up; each step is best-effort
# (|| true) and only runs when the target container is present, so a
# missing or stopped container never fails the install.
# Kibana: ensure /usr/share/kibana/data is a symlink into mounted path to avoid EACCES
if docker ps --format '{{.Names}}' | grep -q '^argus-kibana-sys$'; then
docker exec argus-kibana-sys bash -lc '
set -e
mkdir -p /private/argus/log/kibana && chmod 777 /private/argus/log/kibana || true
if [ -d /usr/share/kibana/data ] && [ ! -L /usr/share/kibana/data ]; then rm -rf /usr/share/kibana/data; fi
if [ ! -e /usr/share/kibana/data ]; then ln -s /private/argus/log/kibana /usr/share/kibana/data; fi
' >/dev/null 2>&1 || true
fi
# Elasticsearch: ensure data path points to mounted path and is writable
# (same symlink technique as Kibana above; a real directory is removed
# first so the symlink can take its place).
if docker ps --format '{{.Names}}' | grep -q '^argus-es-sys$'; then
docker exec argus-es-sys bash -lc '
set -e
mkdir -p /private/argus/log/elasticsearch && chmod 777 /private/argus/log/elasticsearch || true
if [ -d /usr/share/elasticsearch/data ] && [ ! -L /usr/share/elasticsearch/data ]; then rm -rf /usr/share/elasticsearch/data; fi
if [ ! -e /usr/share/elasticsearch/data ]; then ln -s /private/argus/log/elasticsearch /usr/share/elasticsearch/data; fi
' >/dev/null 2>&1 || true
fi
# Bind9: ensure rndc.key exists (generated once, then made world-readable)
if docker ps --format '{{.Names}}' | grep -q '^argus-bind-sys$'; then
docker exec argus-bind-sys bash -lc '
set -e
mkdir -p /etc/bind
if [ ! -f /etc/bind/rndc.key ]; then rndc-confgen -a -c /etc/bind/rndc.key; fi
chmod 644 /etc/bind/rndc.key || true
' >/dev/null 2>&1 || true
fi
}
dns_bootstrap() {
log "DNS bootstrap: initializing shared dns.conf and container resolv.conf"
local etc_dir="$PKG_ROOT/private/argus/etc"
@ -177,15 +239,40 @@ dns_bootstrap() {
}
selfcheck() {
  # Run the packaged selfcheck, retrying to absorb service cold starts.
  # Note: the legacy single-shot invocation that previously preceded this
  # loop has been removed — it exited on the first failure and defeated
  # the retry logic below.
  local max_retries="${SELF_CHECK_RETRIES:-5}"        # retries after the first attempt (default 5)
  local wait_seconds="${SELF_CHECK_WAIT_SECONDS:-30}" # seconds to wait before each retry (default 30)
  local attempt=0
  while :; do
    attempt=$((attempt+1))
    if (( attempt == 1 )); then
      log "running selfcheck (attempt ${attempt})"
    else
      log "running selfcheck (attempt ${attempt}/${max_retries}+1)"
    fi
    if bash "$PKG_ROOT/scripts/server-selfcheck.sh"; then
      return 0
    fi
    # Failed: give up once the retry budget (first attempt excluded) is spent.
    if (( attempt > max_retries )); then
      err "selfcheck failed after ${attempt} attempt(s)"
      exit 1
    fi
    log "selfcheck not ready yet; retrying in ${wait_seconds}s..."
    sleep "$wait_seconds"
  done
}
main() {
mkdir -p "$PKG_ROOT/logs"
prepare_env
prepare_data_dirs
load_images
bring_up
post_bootstrap_fixes
dns_bootstrap
selfcheck
log "install completed. See logs in $PKG_ROOT/logs/"

View File

@ -13,23 +13,31 @@ wait_http() { local url="$1"; local attempts=${2:-120}; local i=1; while ((i<=at
code_for() { curl -s -o /dev/null -w "%{http_code}" "$1" || echo 000; }
header_val() { curl -s -D - -o /dev/null "$@" | awk -F': ' 'BEGIN{IGNORECASE=1}$1=="Access-Control-Allow-Origin"{gsub("\r","",$2);print $2}'; }
mkdir -p "$ROOT/logs"
OUT_JSON="$ROOT/logs/selfcheck.json"
LOG_DIR="$ROOT/logs"
mkdir -p "$LOG_DIR" || true
OUT_JSON="$LOG_DIR/selfcheck.json"
tmp=$(mktemp)
ok=1
log "checking Elasticsearch"
if curl -fsS "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health?wait_for_status=yellow&timeout=1s" >/dev/null 2>&1; then es_ok=true; else es_ok=false; ok=0; fi
log "checking overlay network"
net_ok=false
if docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" >/dev/null 2>&1; then
if docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" | grep -q '"Driver": "overlay"'; then net_ok=true; fi
fi
[[ "$net_ok" == true ]] || ok=0
log "checking Kibana"
kb_code=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:${KIBANA_PORT:-5601}/api/status" || echo 000)
log "checking Elasticsearch (via domain inside web-proxy)"
if docker exec argus-web-proxy sh -lc "curl -fsS http://es.log.argus.com:9200/_cluster/health?wait_for_status=yellow&timeout=1s" >/dev/null 2>&1; then es_ok=true; else es_ok=false; ok=0; fi
log "checking Kibana (via domain inside web-proxy)"
kb_code=$(docker exec argus-web-proxy sh -lc "curl -s -o /dev/null -w '%{http_code}' http://kibana.log.argus.com:5601/api/status" || echo 000)
kb_ok=false
if [[ "$kb_code" == "200" ]]; then body=$(curl -sS "http://localhost:${KIBANA_PORT:-5601}/api/status"); echo "$body" | grep -q '"level":"available"' && kb_ok=true; fi
[[ "$kb_ok" == true ]] || ok=0
log "checking Master"
wait_http "http://localhost:${MASTER_PORT:-32300}/readyz" 60 || ok=0
log "checking Master (via domain inside web-proxy)"
if docker exec argus-web-proxy sh -lc "curl -fsS http://master.argus.com:3000/readyz" >/dev/null 2>&1; then true; else ok=0; fi
log "checking FTP"
if docker ps --format '{{.Names}}' | grep -q '^argus-ftp$'; then
@ -71,11 +79,17 @@ cat > "$tmp" <<JSON
"grafana": $gf_ok,
"alertmanager": true,
"web_proxy": $wp_ok,
"overlay_net": $net_ok,
"timestamp": "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
}
JSON
mv "$tmp" "$OUT_JSON"
if ! mv "$tmp" "$OUT_JSON" 2>/dev/null; then
# fallback when logs dir not writable (no sudo allowed)
OUT_JSON="/tmp/selfcheck_$(date -u +%Y%m%d-%H%M%SZ).json"
cp "$tmp" "$OUT_JSON"
log "selfcheck.json written to $OUT_JSON (logs dir not writable)"
fi
if [[ "$ok" == 1 ]]; then
log "selfcheck OK"
exit 0