#!/usr/bin/env bash
#
# Server-side installer for the argus-sys package.
#
# Steps (see main): prepare compose/.env, auto-assign free host ports, create
# data directories, load the bundled docker images, bring the services up via
# docker compose on a swarm overlay network, apply in-container fixes,
# bootstrap DNS, then run the selfcheck script.
#
# Environment variables read by this script:
#   AUTO_ASSIGN_PORTS        0|false|no|off disables port auto-assignment
#   OVERLAY_NET_NAME         overlay network name (default: argus-sys-net)
#   SELF_CHECK_RETRIES       selfcheck retries, not counting the first attempt
#   SELF_CHECK_WAIT_SECONDS  seconds to wait before each selfcheck retry
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PKG_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" # version root
PROJECT_NAME="argus-sys"

# Print an informational message (blue [INSTALL] tag) to stdout.
log() { echo -e "\033[0;34m[INSTALL]\033[0m $*"; }

# Print an error message (red [ERROR ] tag) to stderr.
err() { echo -e "\033[0;31m[ERROR ]\033[0m $*" >&2; }

# Exit with an error unless command $1 is available on PATH.
require() {
  command -v "$1" >/dev/null 2>&1 || { err "missing command: $1"; exit 1; }
}

require docker
# Prefer the compose v2 plugin; fall back to the standalone docker-compose.
if docker compose version >/dev/null 2>&1; then
  COMPOSE=(docker compose)
else
  require docker-compose
  COMPOSE=(docker-compose)
fi

ENV_FILE="$PKG_ROOT/compose/.env"
ENV_TEMPLATE="$PKG_ROOT/compose/.env.example"

# Return 0 when some TCP listener reported by `ss -ltnH` has a local address
# ending in ":<port>". When ss is missing or fails, the pipeline fails under
# pipefail and the port is reported as not in use.
# Arguments: $1 - port number
_port_in_use() {
  local port="$1"
  ss -ltnH 2>/dev/null \
    | awk -v pat=":${port}\$" '$4 ~ pat {f=1} END {exit f?0:1}'
}

# Echo a TCP port that currently has no listener.
# Arguments: $1 - preferred port (returned as-is when free)
#            $2 - first port of the fallback scan (default 20000)
#            $3 - last port of the fallback scan (default 65000)
# Returns:   0 with the port on stdout, 1 when nothing free was found.
find_free_port() {
  local prefer="$1"
  local start=${2:-20000}
  local max=${3:-65000}
  local p
  if ! _port_in_use "$prefer"; then
    echo "$prefer"
    return
  fi
  for ((p = start; p <= max; p++)); do
    if ! _port_in_use "$p"; then
      echo "$p"
      return
    fi
  done
  return 1
}

# Create compose/.env from the template unless it already exists.
prepare_env() {
  if [[ -f "$ENV_FILE" ]]; then
    log ".env exists, keep as-is"
    return
  fi
  [[ -f "$ENV_TEMPLATE" ]] || { err "missing $ENV_TEMPLATE"; exit 1; }
  cp "$ENV_TEMPLATE" "$ENV_FILE"
  # In overlay mode, avoid assigning the same new port to different services;
  # the template ports are left untouched here (no automatic rewriting).
}

# Read VAR from .env (simple parser): prints everything after the first "="
# of the first line whose key equals $1. Returns 1 when the file or the key
# is missing.
_read_env_var() {
  local var="$1"
  local f="$ENV_FILE"
  [[ -f "$f" ]] || return 1
  awk -F'=' -v k="$var" '
    BEGIN { found = 0 }
    $1 == k { print substr($0, index($0, "=") + 1); found = 1; exit }
    END { exit found ? 0 : 1 }
  ' "$f"
}

# Set VAR=VAL in .env: existing "VAR=" lines are rewritten through a temp file
# that is then moved over .env; otherwise the pair is appended.
# Arguments: $1 - variable name, $2 - value
_set_env_var() {
  local var="$1"
  local val="$2"
  local f="$ENV_FILE"
  local tmp="$ENV_FILE.tmp$$"
  if [[ -f "$f" ]] && grep -qE "^${var}=" "$f"; then
    # The value travels through the environment instead of being spliced into
    # a sed expression, so characters such as '#', '&' or '\' stay literal.
    _ENVK="$var" _ENVV="$val" awk '
      BEGIN { k = ENVIRON["_ENVK"]; v = ENVIRON["_ENVV"] }
      index($0, k "=") == 1 { print k "=" v; next }
      { print }
    ' "$f" >"$tmp" && mv "$tmp" "$f"
  else
    [[ -f "$f" ]] || : >"$f"
    printf "%s=%s\n" "$var" "$val" >>"$f"
  fi
}

# For each known host-port variable, keep the configured port when it is free,
# otherwise pick a free one (scanning from 20000) and write it back to .env.
# Ports chosen earlier in the same run are never handed out twice.
# Disabled when AUTO_ASSIGN_PORTS is 0|false|no|off.
auto_assign_ports() {
  local enable="${AUTO_ASSIGN_PORTS:-true}"
  case "$enable" in
    0|false|no|off)
      log "AUTO_ASSIGN_PORTS disabled"
      return
      ;;
  esac
  [[ -f "$ENV_FILE" ]] || return 0
  log "auto-assigning free host ports (with fallback)"
  cp "$ENV_FILE" "$ENV_FILE.bak.$(date +%Y%m%d-%H%M%S)" || true

  # List of VAR:default pairs to try; FTP-related ports and the passive port
  # range are not rewritten automatically.
  local pairs=(
    "MASTER_PORT:32300"
    "ES_HTTP_PORT:9200"
    "KIBANA_PORT:5601"
    "PROMETHEUS_PORT:9090"
    "ALERTMANAGER_PORT:9093"
    "GRAFANA_PORT:3000"
    "WEB_PROXY_PORT_8080:8080"
    "WEB_PROXY_PORT_8081:8081"
    "WEB_PROXY_PORT_8082:8082"
    "WEB_PROXY_PORT_8083:8083"
    "WEB_PROXY_PORT_8084:8084"
    "WEB_PROXY_PORT_8085:8085"
  )

  # Track ports reserved in this run to avoid duplicates; pre-mark the ports
  # that are currently listening so they are never chosen.
  local -A reserved=()
  local lp
  while read -r lp; do
    if [[ -n "$lp" ]]; then reserved["$lp"]=1; fi
  done < <(ss -ltnH 2>/dev/null | awk '{print $4}' | sed -n 's#.*:##p')

  local ent var def cur cand attempts start next
  for ent in "${pairs[@]}"; do
    var=${ent%%:*}
    def=${ent##*:}
    if ! cur=$(_read_env_var "$var"); then cur="$def"; fi
    # Strip a trailing carriage return (CRLF-edited .env) and any quotes.
    cur=${cur%$'\r'}
    cur=${cur//\"/}
    # An empty or non-numeric value cannot be probed or used as an array
    # subscript; fall back to the default port for this variable.
    [[ "$cur" =~ ^[0-9]+$ ]] || cur="$def"

    cand="$cur"
    # If the configured port is already in use, pick a free one.
    if _port_in_use "$cand"; then
      cand=$(find_free_port "$cand" 20000 65000) || {
        err "no free port available while assigning for $var (last tried: $cand)"
        exit 1
      }
    fi

    # Avoid duplicates chosen earlier in this loop.
    attempts=0
    while [[ -n "${reserved[$cand]:-}" ]]; do
      attempts=$((attempts + 1))
      start=$((cand + 1))
      if ((start < 20000)); then start=20000; fi
      next=$(find_free_port "$start" "$start" 65000 || true)
      if [[ -z "$next" ]]; then
        next=$(find_free_port 20000 20000 65000 || true)
      fi
      if [[ -z "$next" || "$next" == "$cand" ]]; then
        err "no free port available while assigning for $var (last tried: $cand)"
        exit 1
      fi
      cand="$next"
      if ((attempts > 1000)); then
        err "port assignment loop exceeded for $var"
        exit 1
      fi
    done
    reserved["$cand"]=1

    if [[ "$cand" != "$cur" ]]; then
      log " port reassigned: $var $cur -> $cand"
    fi
    # Always write the variable so it exists in .env for clarity.
    _set_env_var "$var" "$cand"
  done
}

# When running as non-root: warn, create the data directory tree under
# private/argus (no chown) and relax its permissions. When running as root
# this function does nothing.
prepare_data_dirs() {
  if [[ $EUID -ne 0 ]]; then
    echo -e "\033[1;33m[WARN]\033[0m running as non-root: will not chown data dirs."
    echo -e "\033[1;33m[WARN]\033[0m If you hit Permission denied, run: sudo $SCRIPT_DIR/server-prepare-dirs.sh"
    # still ensure basic directories exist (no chown)
    mkdir -p \
      "$PKG_ROOT/private/argus/etc" \
      "$PKG_ROOT/private/argus/log/elasticsearch" \
      "$PKG_ROOT/private/argus/log/kibana" \
      "$PKG_ROOT/private/argus/metric/prometheus" \
      "$PKG_ROOT/private/argus/metric/prometheus/data" \
      "$PKG_ROOT/private/argus/metric/prometheus/rules" \
      "$PKG_ROOT/private/argus/metric/grafana" \
      "$PKG_ROOT/private/argus/metric/grafana/data" \
      "$PKG_ROOT/private/argus/metric/grafana/logs" \
      "$PKG_ROOT/private/argus/metric/grafana/plugins" \
      "$PKG_ROOT/private/argus/metric/grafana/provisioning/datasources" \
      "$PKG_ROOT/private/argus/metric/grafana/provisioning/dashboards" \
      "$PKG_ROOT/private/argus/metric/grafana/data/sessions" \
      "$PKG_ROOT/private/argus/metric/grafana/data/dashboards" \
      "$PKG_ROOT/private/argus/alert/alertmanager" \
      "$PKG_ROOT/private/argus/metric/ftp/share"
    # non-root: relax permissions to avoid container UID mismatch blocking writes
    chmod -R a+rwx "$PKG_ROOT/private/argus" 2>/dev/null || true
  fi
}

# Return 0 when a running container is named exactly $1.
# The `docker ps` output is captured first so that, under pipefail, an early
# grep exit cannot turn into a false negative through SIGPIPE.
container_running() {
  local names
  names=$(docker ps --format '{{.Names}}') || return 1
  grep -Fxq -- "$1" <<<"$names"
}

# Require an active Docker Swarm and create the attachable overlay network
# (OVERLAY_NET_NAME, default argus-sys-net) when it does not exist yet.
ensure_swarm_and_overlay() {
  local net_name="${OVERLAY_NET_NAME:-argus-sys-net}"
  # Require swarm active
  local state
  state=$(docker info --format '{{.Swarm.LocalNodeState}}' 2>/dev/null || echo "")
  if [[ "$state" != "active" ]]; then
    err "Docker Swarm is not active. On this host run:"
    err " docker swarm init --advertise-addr <HOST_IP>"
    exit 1
  fi
  # Create attachable overlay if missing
  if ! docker network inspect "$net_name" >/dev/null 2>&1; then
    log "creating attachable overlay network: $net_name"
    docker network create --driver overlay --attachable "$net_name" >/dev/null
  fi
}

# Write private/argus/etc/dns.conf with the host's primary IP when the file is
# missing or empty. Failure to detect an IP is reported but not fatal.
bootstrap_dns_conf() {
  local etc_dir="$PKG_ROOT/private/argus/etc"
  mkdir -p "$etc_dir"
  local dns_file="$etc_dir/dns.conf"
  if [[ ! -s "$dns_file" ]]; then
    # detect host primary IP
    local host_ip
    # Take the token following "src" rather than a fixed column (the column
    # shifts when the route has no "via" hop). `|| true` keeps a failing
    # `ip` from aborting the script before the fallback below can run.
    host_ip=$(ip route get 1.1.1.1 2>/dev/null \
      | awk '{for (i = 1; i < NF; i++) if ($i == "src") {print $(i + 1); exit}}') || true
    if [[ -z "$host_ip" ]]; then
      host_ip=$(hostname -I 2>/dev/null | awk '{print $1}') || true
    fi
    if [[ -n "$host_ip" ]]; then
      echo "$host_ip" >"$dns_file"
      log "wrote initial dns.conf with host IP: $host_ip"
    else
      err "failed to determine host IP for dns.conf; please edit $dns_file manually"
    fi
  fi
}

# Load the bundled image archive (images/all-images.tar.gz) into docker.
load_images() {
  local tar="$PKG_ROOT/images/all-images.tar.gz"
  [[ -f "$tar" ]] || { err "missing images tar: $tar"; exit 1; }
  log "loading images from $(basename "$tar") (may take minutes)"
  gunzip -c "$tar" | docker load >/dev/null
}

# Ensure swarm/overlay and dns.conf, generate the OS-compat compose override
# when it is missing, then start the server-side services with compose.
bring_up() {
  log "starting services via compose"
  ensure_swarm_and_overlay
  bootstrap_dns_conf
  local ov="$PKG_ROOT/compose/docker-compose.os-compat.override.yml"
  if [[ ! -f "$ov" ]]; then
    cat >"$ov" <<'YAML'
services:
  bind:
    security_opt: ["label=disable"]
    userns_mode: "host"
    tmpfs:
      - /run/named
  master:
    security_opt: ["label=disable"]
    userns_mode: "host"
  es:
    security_opt: ["label=disable"]
    userns_mode: "host"
  kibana:
    security_opt: ["label=disable"]
    userns_mode: "host"
  ftp:
    security_opt: ["label=disable"]
    userns_mode: "host"
  prometheus:
    security_opt: ["label=disable"]
    userns_mode: "host"
  grafana:
    security_opt: ["label=disable"]
    userns_mode: "host"
  alertmanager:
    security_opt: ["label=disable"]
    userns_mode: "host"
    # ensure runtime path matches container expectation
    volumes:
      - ../private/argus/etc:/private/argus/etc
      - ../private/argus/alert/alertmanager:/alertmanager
  web-frontend:
    security_opt: ["label=disable"]
    userns_mode: "host"
  web-proxy:
    security_opt: ["label=disable"]
    userns_mode: "host"
YAML
    log "generated OS-compat override: $(basename "$ov")"
  fi
  # Start only the server-side components so that the test nodes
  # (node-a/node-b/test-node/test-gpu-node) are not started by accident.
  local services=(bind master es kibana ftp prometheus grafana alertmanager web-frontend web-proxy)
  log "services: ${services[*]}"
  (cd "$PKG_ROOT/compose" \
    && "${COMPOSE[@]}" -p "$PROJECT_NAME" -f docker-compose.yml \
      -f "$(basename "$ov")" up -d "${services[@]}")
}

# Post bootstrap container-side fixes that do not require sudo on host.
# Every fix is best-effort: failures are ignored.
post_bootstrap_fixes() {
  # Kibana: ensure /usr/share/kibana/data is a symlink into mounted path to avoid EACCES
  if container_running argus-kibana-sys; then
    docker exec argus-kibana-sys bash -lc '
      set -e
      mkdir -p /private/argus/log/kibana && chmod 777 /private/argus/log/kibana || true
      if [ -d /usr/share/kibana/data ] && [ ! -L /usr/share/kibana/data ]; then rm -rf /usr/share/kibana/data; fi
      if [ ! -e /usr/share/kibana/data ]; then ln -s /private/argus/log/kibana /usr/share/kibana/data; fi
    ' >/dev/null 2>&1 || true
  fi
  # Elasticsearch: ensure data path points to mounted path and is writable
  if container_running argus-es-sys; then
    docker exec argus-es-sys bash -lc '
      set -e
      mkdir -p /private/argus/log/elasticsearch && chmod 777 /private/argus/log/elasticsearch || true
      if [ -d /usr/share/elasticsearch/data ] && [ ! -L /usr/share/elasticsearch/data ]; then rm -rf /usr/share/elasticsearch/data; fi
      if [ ! -e /usr/share/elasticsearch/data ]; then ln -s /private/argus/log/elasticsearch /usr/share/elasticsearch/data; fi
    ' >/dev/null 2>&1 || true
  fi
  # Bind9: ensure rndc.key exists
  if container_running argus-bind-sys; then
    docker exec argus-bind-sys bash -lc '
      set -e
      mkdir -p /etc/bind
      if [ ! -f /etc/bind/rndc.key ]; then rndc-confgen -a -c /etc/bind/rndc.key; fi
      chmod 644 /etc/bind/rndc.key || true
    ' >/dev/null 2>&1 || true
  fi
}

# Initialize the shared dns.conf, run update-dns.sh inside the service
# containers, wait (best-effort) for the services' record hint files, reload
# bind and restart web-proxy once.
dns_bootstrap() {
  log "DNS bootstrap: initializing shared dns.conf and container resolv.conf"
  local etc_dir="$PKG_ROOT/private/argus/etc"
  mkdir -p "$etc_dir"

  # 1) ensure dns.conf exists (fallback to bind IP 172.31.0.2)
  if [[ ! -s "$etc_dir/dns.conf" ]]; then
    if echo "172.31.0.2" >"$etc_dir/dns.conf" 2>/dev/null; then
      log "wrote fallback dns.conf with 172.31.0.2"
    else
      # host-side write denied (ownership 1000:1000); write via bind container instead
      if container_running argus-bind-sys; then
        if docker exec argus-bind-sys sh -lc 'echo 172.31.0.2 > /private/argus/etc/dns.conf && chmod 644 /private/argus/etc/dns.conf'; then
          log "fallback dns.conf written via bind container"
        else
          err "failed to write fallback dns.conf via bind container"
        fi
      else
        log "bind not ready; skip writing fallback dns.conf"
      fi
    fi
  fi

  # 2) wait briefly for bind to copy update-dns.sh into shared etc (bind startup.sh does this)
  local i=0
  while [[ ! -x "$etc_dir/update-dns.sh" && $i -lt 20 ]]; do
    sleep 0.5
    # Not ((i++)): that returns status 1 when i is 0 and would abort the
    # script under `set -e`.
    i=$((i + 1))
  done
  if [[ ! -x "$etc_dir/update-dns.sh" ]]; then
    log "update-dns.sh not present yet; continuing with existing resolv.conf"
  fi

  # 3) run update-dns.sh inside key containers so /etc/resolv.conf points to bind
  local c
  for c in argus-master-sys argus-es-sys argus-kibana-sys argus-grafana \
    argus-prometheus argus-ftp argus-web-frontend argus-web-proxy \
    argus-alertmanager; do
    if container_running "$c"; then
      docker exec "$c" sh -lc 'test -x /private/argus/etc/update-dns.sh && /private/argus/etc/update-dns.sh || true' >/dev/null 2>&1 || true
    fi
  done

  # 4) wait for service A-record hint files generated by services (best-effort)
  local need=(
    es.log.argus.com
    kibana.log.argus.com
    master.argus.com
    grafana.metric.argus.com
    prom.metric.argus.com
    alertmanager.alert.argus.com
  )
  local waited=0
  local missing=1
  local f
  while ((waited < 15)); do
    missing=0
    for f in "${need[@]}"; do
      [[ -s "$etc_dir/$f" ]] || { missing=1; break; }
    done
    if [[ $missing -eq 0 ]]; then break; fi
    sleep 1
    # Same `set -e` hazard as above: avoid ((waited++)) when waited is 0.
    waited=$((waited + 1))
  done

  # 5) reload bind zone (script uses supervisor to restart bind9)
  if container_running argus-bind-sys; then
    docker exec argus-bind-sys sh -lc '/usr/local/bin/reload-bind9.sh' >/dev/null 2>&1 || true
  fi

  # 6) restart web-proxy once to re-render nginx resolver with latest dns.conf
  if container_running argus-web-proxy; then
    docker restart argus-web-proxy >/dev/null 2>&1 || true
  fi
}

# Run scripts/server-selfcheck.sh, retrying to absorb cold starts.
# Returns 0 on the first success; exits 1 once all attempts have failed.
selfcheck() {
  # number of retries (not counting the first attempt), default 5
  local max_retries="${SELF_CHECK_RETRIES:-5}"
  # seconds to wait before each retry, default 30
  local wait_seconds="${SELF_CHECK_WAIT_SECONDS:-30}"
  local attempt=0
  while :; do
    attempt=$((attempt + 1))
    if ((attempt == 1)); then
      log "running selfcheck (attempt ${attempt})"
    else
      log "running selfcheck (attempt ${attempt}/${max_retries}+1)"
    fi
    if bash "$PKG_ROOT/scripts/server-selfcheck.sh"; then
      return 0
    fi
    # failed
    if ((attempt > max_retries)); then
      err "selfcheck failed after ${attempt} attempt(s)"
      exit 1
    fi
    log "selfcheck not ready yet; retrying in ${wait_seconds}s..."
    sleep "$wait_seconds"
  done
}

# Entry point: run every installation step in order.
main() {
  mkdir -p "$PKG_ROOT/logs"
  prepare_env
  auto_assign_ports
  prepare_data_dirs
  load_images
  bring_up
  post_bootstrap_fixes
  dns_bootstrap
  selfcheck
  log "install completed. See logs in $PKG_ROOT/logs/"
}

main "$@"