#!/usr/bin/env bash
#
# Server-side installer for the "argus-sys" compose project.
#
# Steps (see main): prepare compose/.env, create data directories, load the
# bundled image tarball, start the server services with docker compose on an
# attachable overlay network, apply in-container fixes, bootstrap DNS, and
# finally run the selfcheck script with retries.
#
# Environment overrides:
#   OVERLAY_NET_NAME         overlay network name (default: argus-sys-net)
#   SELF_CHECK_RETRIES       selfcheck retries, excluding the first attempt (default: 5)
#   SELF_CHECK_WAIT_SECONDS  seconds to wait before each retry (default: 30)
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PKG_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" # version root
PROJECT_NAME="argus-sys"

# Logging helpers. printf is used instead of `echo -e` so that backslashes in
# the message text (e.g. in paths) are never interpreted as escapes.
log() { printf '\033[0;34m[INSTALL]\033[0m %s\n' "$*"; }
err() { printf '\033[0;31m[ERROR ]\033[0m %s\n' "$*" >&2; }
warn() { printf '\033[1;33m[WARN]\033[0m %s\n' "$*"; }

# Abort the script unless command $1 is available on PATH.
require() {
  command -v "$1" >/dev/null 2>&1 || {
    err "missing command: $1"
    exit 1
  }
}

# Return 0 if a container named exactly $1 is listed by `docker ps`.
# The listing is captured before matching: piping straight into `grep -q`
# under `pipefail` can report failure when grep exits before the producer
# has finished writing.
container_running() {
  local names
  names=$(docker ps --format '{{.Names}}') || return 1
  grep -Fqx -- "$1" <<<"$names"
}

require docker
# Prefer the compose v2 plugin; fall back to the standalone docker-compose.
if docker compose version >/dev/null 2>&1; then
  COMPOSE=(docker compose)
else
  require docker-compose
  COMPOSE=(docker-compose)
fi

ENV_FILE="$PKG_ROOT/compose/.env"
ENV_TEMPLATE="$PKG_ROOT/compose/.env.example"

# Return 0 if `ss` reports a listening TCP socket whose local address ends in
# ":$1". If `ss` is unavailable or fails, the port is reported as not in use.
port_in_use() {
  ss -ltnH 2>/dev/null \
    | awk -v pat=":${1}\$" '$4 ~ pat { f = 1 } END { exit f ? 0 : 1 }'
}

# Print a free TCP port on stdout.
# Arguments: $1 - preferred port
#            $2 - first port of the fallback scan (default 20000)
#            $3 - last port of the fallback scan (default 65000)
# Returns:   0 when a port was printed, 1 when the scan found none.
find_free_port() {
  local prefer="$1"
  local start=${2:-20000}
  local max=${3:-65000}
  local p
  if ! port_in_use "$prefer"; then
    echo "$prefer"
    return
  fi
  for ((p = start; p <= max; p++)); do
    if ! port_in_use "$p"; then
      echo "$p"
      return
    fi
  done
  return 1
}

# Create compose/.env from the template unless it already exists.
prepare_env() {
  if [[ -f "$ENV_FILE" ]]; then
    log ".env exists, keep as-is"
    return
  fi
  [[ -f "$ENV_TEMPLATE" ]] || {
    err "missing $ENV_TEMPLATE"
    exit 1
  }
  cp "$ENV_TEMPLATE" "$ENV_FILE"
  # In overlay mode, avoid assigning the same new port to different services;
  # the template ports are kept as-is and not rewritten automatically.
}

# When running as non-root, create the data directory tree under
# private/argus and open up its permissions (no chown is attempted).
prepare_data_dirs() {
  # NOTE(review): nothing is created when running as root; presumably
  # server-prepare-dirs.sh covers that case - confirm.
  if ((EUID == 0)); then
    return 0
  fi

  local root="$PKG_ROOT/private/argus"
  warn "running as non-root: will not chown data dirs."
  warn "If you hit Permission denied, run: sudo $SCRIPT_DIR/server-prepare-dirs.sh"
  # still ensure basic directories exist (no chown)
  mkdir -p \
    "$root/etc" \
    "$root/log/elasticsearch" \
    "$root/log/kibana" \
    "$root/metric/prometheus" \
    "$root/metric/prometheus/data" \
    "$root/metric/prometheus/rules" \
    "$root/metric/grafana" \
    "$root/metric/grafana/data" \
    "$root/metric/grafana/logs" \
    "$root/metric/grafana/plugins" \
    "$root/metric/grafana/provisioning/datasources" \
    "$root/metric/grafana/provisioning/dashboards" \
    "$root/metric/grafana/data/sessions" \
    "$root/metric/grafana/data/dashboards" \
    "$root/alert/alertmanager" \
    "$root/metric/ftp/share"
  # non-root: relax permissions to avoid container UID mismatch blocking
  # writes; best-effort by design, failures are ignored.
  chmod -R a+rwx "$root" 2>/dev/null || true
}

# Require an active Docker Swarm and make sure the attachable overlay network
# (OVERLAY_NET_NAME, default argus-sys-net) exists. Exits when swarm is inactive.
ensure_swarm_and_overlay() {
  local net_name="${OVERLAY_NET_NAME:-argus-sys-net}"
  # Require swarm active
  local state
  state=$(docker info --format '{{.Swarm.LocalNodeState}}' 2>/dev/null || echo "")
  if [[ "$state" != "active" ]]; then
    err "Docker Swarm is not active. On this host run:"
    err " docker swarm init --advertise-addr "
    exit 1
  fi
  # Create attachable overlay if missing
  if ! docker network inspect "$net_name" >/dev/null 2>&1; then
    log "creating attachable overlay network: $net_name"
    docker network create --driver overlay --attachable "$net_name" >/dev/null
  fi
}

# Seed private/argus/etc/dns.conf with the host's primary IP when the file is
# missing or empty. Failure to detect an IP is reported but is not fatal.
bootstrap_dns_conf() {
  local etc_dir="$PKG_ROOT/private/argus/etc"
  mkdir -p "$etc_dir"
  local dns_file="$etc_dir/dns.conf"
  if [[ -s "$dns_file" ]]; then
    return 0
  fi

  # detect host primary IP: take the token following "src" rather than a fixed
  # column, since the column shifts when the route has no "via" hop.
  # `|| true` keeps a failing `ip` (pipefail + errexit) from aborting the
  # script before the hostname fallback below gets a chance to run.
  local host_ip
  host_ip=$(ip route get 1.1.1.1 2>/dev/null \
    | awk '{ for (i = 1; i < NF; i++) if ($i == "src") { print $(i + 1); exit } }') || true
  if [[ -z "$host_ip" ]]; then
    host_ip=$(hostname -I 2>/dev/null | awk '{ print $1 }') || true
  fi

  if [[ -n "$host_ip" ]]; then
    echo "$host_ip" > "$dns_file"
    log "wrote initial dns.conf with host IP: $host_ip"
  else
    err "failed to determine host IP for dns.conf; please edit $dns_file manually"
  fi
}

# Load the bundled image archive into the local docker daemon.
load_images() {
  local tar="$PKG_ROOT/images/all-images.tar.gz"
  [[ -f "$tar" ]] || {
    err "missing images tar: $tar"
    exit 1
  }
  log "loading images from $(basename "$tar") (may take minutes)"
  gunzip -c "$tar" | docker load >/dev/null
}

# Start the server-side services with docker compose, generating the
# OS-compat override file on first use.
bring_up() {
  log "starting services via compose"
  ensure_swarm_and_overlay
  bootstrap_dns_conf
  local ov="$PKG_ROOT/compose/docker-compose.os-compat.override.yml"
  if [[ ! -f "$ov" ]]; then
    cat > "$ov" <<'YAML'
services:
  bind:
    security_opt: ["label=disable"]
    userns_mode: "host"
    tmpfs:
      - /run/named
  master:
    security_opt: ["label=disable"]
    userns_mode: "host"
  es:
    security_opt: ["label=disable"]
    userns_mode: "host"
  kibana:
    security_opt: ["label=disable"]
    userns_mode: "host"
  ftp:
    security_opt: ["label=disable"]
    userns_mode: "host"
  prometheus:
    security_opt: ["label=disable"]
    userns_mode: "host"
  grafana:
    security_opt: ["label=disable"]
    userns_mode: "host"
  alertmanager:
    security_opt: ["label=disable"]
    userns_mode: "host"
    # ensure runtime path matches container expectation
    volumes:
      - ../private/argus/etc:/private/argus/etc
      - ../private/argus/alert/alertmanager:/alertmanager
  web-frontend:
    security_opt: ["label=disable"]
    userns_mode: "host"
  web-proxy:
    security_opt: ["label=disable"]
    userns_mode: "host"
YAML
    log "generated OS-compat override: $(basename "$ov")"
  fi
  # Start only the server-side components, so that the test nodes
  # (node-a/node-b/test-node/test-gpu-node) are not started by accident.
  local services=(bind master es kibana ftp prometheus grafana alertmanager web-frontend web-proxy)
  log "services: ${services[*]}"
  (
    cd "$PKG_ROOT/compose" \
      && "${COMPOSE[@]}" -p "$PROJECT_NAME" \
        -f docker-compose.yml -f "${ov##*/}" \
        up -d "${services[@]}"
  )
}

# Post bootstrap container-side fixes that do not require sudo on host.
# Every fix is best-effort: output is discarded and failures are ignored.
post_bootstrap_fixes() {
  # Kibana: ensure /usr/share/kibana/data is a symlink into mounted path to avoid EACCES
  if container_running argus-kibana-sys; then
    docker exec argus-kibana-sys bash -lc '
      set -e
      mkdir -p /private/argus/log/kibana && chmod 777 /private/argus/log/kibana || true
      if [ -d /usr/share/kibana/data ] && [ ! -L /usr/share/kibana/data ]; then rm -rf /usr/share/kibana/data; fi
      if [ ! -e /usr/share/kibana/data ]; then ln -s /private/argus/log/kibana /usr/share/kibana/data; fi
    ' >/dev/null 2>&1 || true
  fi
  # Elasticsearch: ensure data path points to mounted path and is writable
  if container_running argus-es-sys; then
    docker exec argus-es-sys bash -lc '
      set -e
      mkdir -p /private/argus/log/elasticsearch && chmod 777 /private/argus/log/elasticsearch || true
      if [ -d /usr/share/elasticsearch/data ] && [ ! -L /usr/share/elasticsearch/data ]; then rm -rf /usr/share/elasticsearch/data; fi
      if [ ! -e /usr/share/elasticsearch/data ]; then ln -s /private/argus/log/elasticsearch /usr/share/elasticsearch/data; fi
    ' >/dev/null 2>&1 || true
  fi
  # Bind9: ensure rndc.key exists
  if container_running argus-bind-sys; then
    docker exec argus-bind-sys bash -lc '
      set -e
      mkdir -p /etc/bind
      if [ ! -f /etc/bind/rndc.key ]; then rndc-confgen -a -c /etc/bind/rndc.key; fi
      chmod 644 /etc/bind/rndc.key || true
    ' >/dev/null 2>&1 || true
  fi
}

# Initialize the shared dns.conf, run update-dns.sh inside the service
# containers, wait (best-effort) for the services' record hint files, then
# reload bind and restart the web proxy.
dns_bootstrap() {
  log "DNS bootstrap: initializing shared dns.conf and container resolv.conf"
  local etc_dir="$PKG_ROOT/private/argus/etc"
  mkdir -p "$etc_dir"

  # 1) ensure dns.conf exists (fallback to bind IP 172.31.0.2)
  if [[ ! -s "$etc_dir/dns.conf" ]]; then
    # The group makes 2>/dev/null also cover the redirection's own
    # "permission denied" message.
    if { echo "172.31.0.2" > "$etc_dir/dns.conf"; } 2>/dev/null; then
      log "wrote fallback dns.conf with 172.31.0.2"
    elif container_running argus-bind-sys; then
      # host-side write denied (ownership 1000:1000); write via bind container instead
      if docker exec argus-bind-sys sh -lc \
        'echo 172.31.0.2 > /private/argus/etc/dns.conf && chmod 644 /private/argus/etc/dns.conf'; then
        log "fallback dns.conf written via bind container"
      else
        err "failed to write fallback dns.conf via bind container"
      fi
    else
      log "bind not ready; skip writing fallback dns.conf"
    fi
  fi

  # 2) wait briefly for bind to copy update-dns.sh into shared etc (bind startup.sh does this)
  # The counter uses an assignment: `((i++))` returns status 1 when i is 0,
  # which would abort the script under `set -e`.
  local i=0
  while [[ ! -x "$etc_dir/update-dns.sh" ]] && ((i < 20)); do
    sleep 0.5
    i=$((i + 1))
  done
  if [[ ! -x "$etc_dir/update-dns.sh" ]]; then
    log "update-dns.sh not present yet; continuing with existing resolv.conf"
  fi

  # 3) run update-dns.sh inside key containers so /etc/resolv.conf points to bind
  local containers=(
    argus-master-sys argus-es-sys argus-kibana-sys argus-grafana
    argus-prometheus argus-ftp argus-web-frontend argus-web-proxy
    argus-alertmanager
  )
  local c
  for c in "${containers[@]}"; do
    if container_running "$c"; then
      docker exec "$c" sh -lc \
        'test -x /private/argus/etc/update-dns.sh && /private/argus/etc/update-dns.sh || true' \
        >/dev/null 2>&1 || true
    fi
  done

  # 4) wait for service A-record hint files generated by services (best-effort)
  local need=(
    es.log.argus.com kibana.log.argus.com master.argus.com
    grafana.metric.argus.com prom.metric.argus.com
    alertmanager.alert.argus.com
  )
  local waited=0
  local missing=1
  local f
  while ((waited < 15)); do
    missing=0
    for f in "${need[@]}"; do
      if [[ ! -s "$etc_dir/$f" ]]; then
        missing=1
        break
      fi
    done
    if ((missing == 0)); then
      break
    fi
    sleep 1
    waited=$((waited + 1)) # assignment form, safe under `set -e`
  done
  if ((missing != 0)); then
    log "some service hint files are still missing; continuing"
  fi

  # 5) reload bind zone (script uses supervisor to restart bind9)
  if container_running argus-bind-sys; then
    docker exec argus-bind-sys sh -lc '/usr/local/bin/reload-bind9.sh' >/dev/null 2>&1 || true
  fi

  # 6) restart web-proxy once to re-render nginx resolver with latest dns.conf
  if container_running argus-web-proxy; then
    docker restart argus-web-proxy >/dev/null 2>&1 || true
  fi
}

# Run scripts/server-selfcheck.sh, retrying to absorb cold starts.
# Returns 0 on the first passing run; exits 1 once the retries are used up.
selfcheck() {
  local max_retries="${SELF_CHECK_RETRIES:-5}"       # retries, excluding the first attempt; default 5
  local wait_seconds="${SELF_CHECK_WAIT_SECONDS:-30}" # seconds to wait before each retry; default 30
  local attempt=0
  while :; do
    attempt=$((attempt + 1))
    if ((attempt == 1)); then
      log "running selfcheck (attempt ${attempt})"
    else
      log "running selfcheck (attempt ${attempt}/${max_retries}+1)"
    fi
    if bash "$PKG_ROOT/scripts/server-selfcheck.sh"; then
      return 0
    fi
    # failed
    if ((attempt > max_retries)); then
      err "selfcheck failed after ${attempt} attempt(s)"
      exit 1
    fi
    log "selfcheck not ready yet; retrying in ${wait_seconds}s..."
    sleep "$wait_seconds"
  done
}

# Entry point: run every installation step in order.
main() {
  mkdir -p "$PKG_ROOT/logs"
  prepare_env
  prepare_data_dirs
  load_images
  bring_up
  post_bootstrap_fixes
  dns_bootstrap
  selfcheck
  log "install completed. See logs in $PKG_ROOT/logs/"
}

main "$@"