From 94b3e910b32a125153f17b46b82f95ef23292f54 Mon Sep 17 00:00:00 2001
From: yuyr
Date: Wed, 5 Nov 2025 16:21:34 +0800
Subject: [PATCH] =?UTF-8?q?[#37]=20=E5=A2=9E=E5=8A=A0=E9=83=A8=E7=BD=B2?=
 =?UTF-8?q?=E6=97=B6=E8=87=AA=E5=8A=A8=E6=A3=80=E6=B5=8B=E7=A9=BA=E9=97=B2?=
 =?UTF-8?q?=E7=AB=AF=E5=8F=A3=EF=BC=9B=E5=A2=9E=E5=8A=A0es=20=E6=B0=B4?=
 =?UTF-8?q?=E4=BD=8D=E6=A3=80=E6=B5=8B=E5=92=8C=E4=B8=B4=E6=97=B6=E5=BA=94?=
 =?UTF-8?q?=E6=80=A5=E5=A4=84=E7=90=86?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 deployment/build/build_server_package.sh      |   1 +
 .../templates/scripts/es-watermark-relax.sh   |  97 +++++++++++++++++
 .../templates/scripts/es-watermark-restore.sh |  49 +++++++++
 .../templates/scripts/server-diagnose.sh      |  26 +++++
 .../build/templates/scripts/server-install.sh | 109 +++++++++++++++++++
 5 files changed, 282 insertions(+)
 create mode 100644 deployment/build/templates/scripts/es-watermark-relax.sh
 create mode 100644 deployment/build/templates/scripts/es-watermark-restore.sh

diff --git a/deployment/build/build_server_package.sh b/deployment/build/build_server_package.sh
index 1717f8a..4d2486f 100755
--- a/deployment/build/build_server_package.sh
+++ b/deployment/build/build_server_package.sh
@@ -116,6 +116,7 @@ fi
 # 4) Scripts & Docs
 copy_tree "$BUILD_DIR/templates/scripts" "$STAGE/scripts"
 copy_tree "$BUILD_DIR/templates/docs" "$STAGE/docs"
+find "$STAGE/scripts" -type f -name '*.sh' -exec chmod +x {} + 2>/dev/null || true
 
 # 5) Manifests
 gen_manifest "$STAGE" "$STAGE/manifest.txt"
diff --git a/deployment/build/templates/scripts/es-watermark-relax.sh b/deployment/build/templates/scripts/es-watermark-relax.sh
new file mode 100644
index 0000000..d3eb867
--- /dev/null
+++ b/deployment/build/templates/scripts/es-watermark-relax.sh
@@ -0,0 +1,97 @@
+#!/usr/bin/env bash
+# Emergency helper: temporarily relax Elasticsearch disk watermarks so shards
+# can be allocated again on a nearly-full disk. All cluster settings are
+# written as "transient"; undo them with scripts/es-watermark-restore.sh.
+#
+# Env overrides: ES_HTTP_PORT, RELAX_WM_LOW, RELAX_WM_HIGH, RELAX_WM_FLOOD,
+#   DISABLE_WATERMARK, SET_KIBANA_REPLICAS_ZERO, CLEAR_READONLY_BLOCKS.
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a
+
+ES_URL="http://localhost:${ES_HTTP_PORT:-9200}"
+
+# Tunables (env overrides)
+RELAX_WM_LOW="${RELAX_WM_LOW:-99%}"
+RELAX_WM_HIGH="${RELAX_WM_HIGH:-99%}"
+RELAX_WM_FLOOD="${RELAX_WM_FLOOD:-99%}"
+DISABLE_WATERMARK="${DISABLE_WATERMARK:-1}"
+SET_KIBANA_REPLICAS_ZERO="${SET_KIBANA_REPLICAS_ZERO:-1}"
+CLEAR_READONLY_BLOCKS="${CLEAR_READONLY_BLOCKS:-1}"
+
+# Print the value of key $2 from the flat-settings JSON in $1 (empty if absent).
+# The trailing "|| true" keeps a missing key from aborting the script under
+# "set -e -o pipefail" (grep exits 1 when nothing matches).
+flat_setting() {
+  printf '%s' "$1" | grep -o "\"$2\"[^,}]*" | awk -F: '{gsub(/["} ]/,"",$2);print $2}' | tail -n1 || true
+}
+
+echo "[RELAX] Checking Elasticsearch at $ES_URL"
+code=$(curl -s -o /dev/null -w '%{http_code}' "$ES_URL/_cluster/health" || true)
+if [[ "$code" != "200" ]]; then
+  echo "[RELAX][ERROR] ES not reachable (code=$code). Ensure argus-es-sys is running." >&2
+  exit 1
+fi
+
+echo "[RELAX] Applying transient cluster settings (watermarks)"
+th_enabled=$([[ "$DISABLE_WATERMARK" == "1" ]] && echo false || echo true)
+curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_cluster/settings" -d "{
+  \"transient\": {
+    \"cluster.routing.allocation.disk.threshold_enabled\": $th_enabled,
+    \"cluster.routing.allocation.disk.watermark.low\": \"$RELAX_WM_LOW\",
+    \"cluster.routing.allocation.disk.watermark.high\": \"$RELAX_WM_HIGH\",
+    \"cluster.routing.allocation.disk.watermark.flood_stage\": \"$RELAX_WM_FLOOD\"
+  }
+}" | sed -n '1,5p'
+
+if [[ "$CLEAR_READONLY_BLOCKS" == "1" ]]; then
+  echo "[RELAX] Clearing read_only/read_only_allow_delete blocks on all indices (best-effort)"
+  curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_all/_settings" -d '{
+    "index.blocks.read_only": false,
+    "index.blocks.read_only_allow_delete": false
+  }' >/dev/null || true
+fi
+
+if [[ "${SET_KIBANA_REPLICAS_ZERO:-1}" != "0" ]]; then
+  echo "[RELAX] Ensure .kibana* use replicas=0 via index template and per-index settings (best-effort)"
+  # high priority template for .kibana* only, avoid impacting other indices
+  curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_index_template/kibana-replicas-0" -d '{
+    "index_patterns": [".kibana*"],
+    "priority": 200,
+    "template": { "settings": { "number_of_replicas": 0 } }
+  }' >/dev/null || true
+  # set existing .kibana* to replicas=0; "|| true" keeps this step best-effort
+  idxs=$(curl -sS "$ES_URL/_cat/indices/.kibana*?h=index" | awk '{print $1}' || true)
+  for i in $idxs; do
+    [[ -n "$i" ]] || continue
+    curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/$i/_settings" -d '{"index":{"number_of_replicas":0}}' >/dev/null || true
+  done
+fi
+
+# Retry failed shard allocations (best-effort)
+curl -sS -H 'Content-Type: application/json' -X POST "$ES_URL/_cluster/reroute?retry_failed=true" -d '{}' >/dev/null || true
+
+echo "[RELAX] Cluster health (post):"
+curl -sS "$ES_URL/_cluster/health?pretty" | sed -n '1,80p'
+
+# Simple current status summary
+# Without ?pretty the health JSON arrives as one compact line, so values are
+# extracted by key rather than by field position.
+ch=$(curl -sS "$ES_URL/_cluster/health" || true)
+status=$(printf '%s' "$ch" | sed -n 's/.*"status"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' | head -n1)
+unassigned=$(printf '%s' "$ch" | sed -n 's/.*"unassigned_shards"[[:space:]]*:[[:space:]]*\([0-9][0-9]*\).*/\1/p' | head -n1)
+duse=$(docker exec argus-es-sys sh -lc 'df -P /usr/share/elasticsearch/data | awk "NR==2{print \$5}"' 2>/dev/null || true)
+settings=$(curl -sS "$ES_URL/_cluster/settings?flat_settings=true" || true)
+th=$(flat_setting "$settings" 'cluster.routing.allocation.disk.threshold_enabled')
+low=$(flat_setting "$settings" 'cluster.routing.allocation.disk.watermark.low')
+high=$(flat_setting "$settings" 'cluster.routing.allocation.disk.watermark.high')
+flood=$(flat_setting "$settings" 'cluster.routing.allocation.disk.watermark.flood_stage')
+ks=$(curl -sS "$ES_URL/_cat/shards/.kibana*?h=state" || true)
+total=$(printf '%s' "$ks" | awk 'NF{c++} END{print c+0}')
+started=$(printf '%s' "$ks" | awk '/STARTED/{c++} END{print c+0}')
+unass=$(printf '%s' "$ks" | awk '/UNASSIGNED/{c++} END{print c+0}')
+echo "[RELAX][SUMMARY] status=${status:-?} unassigned=${unassigned:-?} es.data.use=${duse:-?} watermarks(threshold=${th:-?} low=${low:-?} high=${high:-?} flood=${flood:-?}) kibana_shards(total=${total},started=${started},unassigned=${unass})"
+
+echo "[RELAX] Done. Remember to run scripts/es-watermark-restore.sh after freeing disk space and cluster becomes stable."
diff --git a/deployment/build/templates/scripts/es-watermark-restore.sh b/deployment/build/templates/scripts/es-watermark-restore.sh
new file mode 100644
index 0000000..a20383e
--- /dev/null
+++ b/deployment/build/templates/scripts/es-watermark-restore.sh
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+# Undo scripts/es-watermark-relax.sh: re-enable the disk threshold and clear
+# the transient watermark overrides written by the relax script.
+# Set RESTORE_DEFAULT_REPLICAS=1 to also put .kibana* indices back on one
+# replica (off by default; single-node deployments should keep replicas=0).
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a
+
+ES_URL="http://localhost:${ES_HTTP_PORT:-9200}"
+
+echo "[RESTORE] Checking Elasticsearch at $ES_URL"
+code=$(curl -s -o /dev/null -w '%{http_code}' "$ES_URL/_cluster/health" || true)
+if [[ "$code" != "200" ]]; then
+  echo "[RESTORE][ERROR] ES not reachable (code=$code). Ensure argus-es-sys is running." >&2
+  exit 1
+fi
+
+echo "[RESTORE] Re-enabling disk threshold and clearing relaxed watermarks (transient)"
+curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_cluster/settings" -d '{
+  "transient": {
+    "cluster.routing.allocation.disk.threshold_enabled": true,
+    "cluster.routing.allocation.disk.watermark.low": null,
+    "cluster.routing.allocation.disk.watermark.high": null,
+    "cluster.routing.allocation.disk.watermark.flood_stage": null
+  }
+}' | sed -n '1,5p'
+
+# Optionally restore .kibana* replicas to 1 (set RESTORE_DEFAULT_REPLICAS=1 to enable).
+# index.number_of_replicas is an index-level setting and is rejected by
+# /_cluster/settings, so undo the relax step itself instead: remove its index
+# template and update the existing indices.
+if [[ "${RESTORE_DEFAULT_REPLICAS:-0}" == "1" ]]; then
+  echo "[RESTORE] Removing kibana-replicas-0 template and setting .kibana* number_of_replicas=1 (best-effort)"
+  curl -sS -X DELETE "$ES_URL/_index_template/kibana-replicas-0" >/dev/null || true
+  idxs=$(curl -sS "$ES_URL/_cat/indices/.kibana*?h=index" | awk '{print $1}' || true)
+  for i in $idxs; do
+    [[ -n "$i" ]] || continue
+    curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/$i/_settings" -d '{"index":{"number_of_replicas":1}}' >/dev/null || true
+  done
+fi
+
+echo "[RESTORE] Cluster health:"
+curl -sS "$ES_URL/_cluster/health?pretty" | sed -n '1,80p'
+
+echo "[RESTORE] Done. Verify shards and consider keeping replicas=0 for single-node deployments."
+
diff --git a/deployment/build/templates/scripts/server-diagnose.sh b/deployment/build/templates/scripts/server-diagnose.sh
index 27520e8..3c0de7f 100755
--- a/deployment/build/templates/scripts/server-diagnose.sh
+++ b/deployment/build/templates/scripts/server-diagnose.sh
@@ -89,6 +89,32 @@ logd "Web-Proxy 8083: $(http_code "http://localhost:${WEB_PROXY_PORT_8083:-8083}
 logd "Web-Proxy 8084 CORS: ${cors8084}"
 logd "Web-Proxy 8085 CORS: ${cors8085}"
 
+# Elasticsearch deep checks: disk watermark and Kibana index status
+section ES-CHECKS
+ch=$(curl -s --max-time 3 "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health" || true)
+# Without ?pretty the health JSON is one compact line, so extract "status" by key, not by field position.
+status=$(printf '%s' "$ch" | sed -n 's/.*"status"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' | head -n1)
+if [[ -n "$status" ]]; then logd "cluster.status=$status"; fi
+if [[ "$status" != "green" ]]; then append_err "[es][cluster] status=$status"; fi
+
+if docker ps --format '{{.Names}}' | grep -q '^argus-es-sys$'; then
+  duse=$(docker exec argus-es-sys sh -lc 'df -P /usr/share/elasticsearch/data | awk "NR==2{print \$5}"' 2>/dev/null || true)
+  logd "es.data.df_use=$duse"
+  usep=${duse%%%}
+  # only compare when df produced a plain integer percentage
+  if [[ "$usep" =~ ^[0-9]+$ ]] && (( usep >= 90 )); then
+    append_err "[es][disk] data path usage ${duse} (>=90%) likely hit high/flood watermarks"
+    echo "HINT: High ES disk usage. Free space or run scripts/es-watermark-relax.sh (then scripts/es-watermark-restore.sh)." >&2
+  fi
+fi
+
+ks=$(curl -s --max-time 3 "http://localhost:${ES_HTTP_PORT:-9200}/_cat/shards/.kibana*?h=index,shard,prirep,state,unassigned.reason" || true)
+# -w matches the state names as whole words; a single-quoted '\\b' would be a literal backslash + b
+if printf '%s' "$ks" | grep -Eiqw 'UNASSIGNED|INITIALIZING|RELOCATING'; then
+  append_err "[kibana][index] .kibana* shards not green"; logd "$ks"
+  echo "HINT: .kibana* shards not green. On single-node, set replicas=0 and ensure ES disk watermarks are not exceeded." >&2
+fi
+
 # Overlay network diagnostics
 section OVERLAY-NET
 if docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" >/dev/null 2>&1; then
diff --git a/deployment/build/templates/scripts/server-install.sh b/deployment/build/templates/scripts/server-install.sh
index 365b02c..a48216b 100755
--- a/deployment/build/templates/scripts/server-install.sh
+++ b/deployment/build/templates/scripts/server-install.sh
@@ -33,6 +33,114 @@ prepare_env() {
   # overlay 模式下,避免为不同服务分配到同一新端口;保持模板端口不做自动改写
 }
 
+# read VAR from .env (simple parser)
+# Prints the text after the first "=" of the first line whose key equals $1;
+# returns 1 when $ENV_FILE is missing or the key is not present.
+_read_env_var() {
+  local var="$1"; local f="$ENV_FILE"
+  [[ -f "$f" ]] || return 1
+  awk -F'=' -v k="$var" 'BEGIN{found=0} $1==k{print substr($0,index($0,"=")+1); found=1; exit} END{exit found?0:1}' "$f"
+}
+
+# set or append VAR=VAL in .env
+# An existing assignment is rewritten through a temp file that is renamed over
+# .env; the temp file starts as "cp -p" of .env so the file mode is kept.
+_set_env_var() {
+  local var="$1"; local val="$2"; local f="$ENV_FILE"; local tmp="$ENV_FILE.tmp$$"
+  if [[ -f "$f" ]] && grep -qE "^${var}=" "$f"; then
+    # escape what is special in the sed replacement: "\", "&" and the "#" delimiter
+    local esc
+    esc=$(printf '%s' "$val" | sed 's/[\\&#]/\\&/g')
+    cp -p "$f" "$tmp" && sed -E "s#^(${var}=).*#\\1${esc}#" "$f" >"$tmp" && mv "$tmp" "$f"
+  else
+    [[ -f "$f" ]] || : >"$f"
+    printf "%s=%s\n" "$var" "$val" >>"$f"
+  fi
+}
+
+# Make every host port named in .env usable before the stack is brought up.
+# For each VAR:default pair the current value is read from $ENV_FILE (falling
+# back to the default); if that port is already listening, or was already
+# handed out earlier in this run, a replacement is requested from
+# find_free_port and the result is written back to .env.
+# Set AUTO_ASSIGN_PORTS to 0/false/no/off to skip. A timestamped backup of .env
+# is taken first. Exits 1 when no free port can be found.
+auto_assign_ports() {
+  local enable="${AUTO_ASSIGN_PORTS:-true}"
+  case "$enable" in
+    0|false|no|off) log "AUTO_ASSIGN_PORTS disabled"; return;;
+  esac
+  [[ -f "$ENV_FILE" ]] || return 0
+  log "auto-assigning free host ports (with fallback)"
+  cp "$ENV_FILE" "$ENV_FILE.bak.$(date +%Y%m%d-%H%M%S)" || true
+
+  # list of VAR:default pairs to try; FTP-related ports and the passive port range are not rewritten automatically
+  local pairs=(
+    "MASTER_PORT:32300"
+    "ES_HTTP_PORT:9200"
+    "KIBANA_PORT:5601"
+    "PROMETHEUS_PORT:9090"
+    "ALERTMANAGER_PORT:9093"
+    "GRAFANA_PORT:3000"
+    "WEB_PROXY_PORT_8080:8080"
+    "WEB_PROXY_PORT_8081:8081"
+    "WEB_PROXY_PORT_8082:8082"
+    "WEB_PROXY_PORT_8083:8083"
+    "WEB_PROXY_PORT_8084:8084"
+    "WEB_PROXY_PORT_8085:8085"
+  )
+
+  # track ports reserved in this run to avoid duplicates
+  declare -A reserved=()
+  # pre-mark currently listening ports to avoid choosing them twice within the same run
+  local lp
+  while read -r lp; do
+    [[ -n "$lp" ]] || continue  # an empty key is an invalid array subscript
+    reserved["$lp"]=1
+  done < <(ss -ltnH 2>/dev/null | awk '{print $4}' | sed -n 's#.*:##p')
+
+  local ent
+  for ent in "${pairs[@]}"; do
+    local var=${ent%%:*}; local def=${ent##*:}
+    local cur
+    if ! cur=$(_read_env_var "$var"); then cur="$def"; fi
+    # strip a trailing carriage return (CRLF .env) and any double quotes
+    cur=${cur%$'\r'}; cur=${cur//\"/}
+    # an empty or non-numeric value cannot be probed; start from the default instead
+    [[ "$cur" =~ ^[0-9]+$ ]] || cur="$def"
+    local cand="$cur"
+    # if already in use, pick a free one
+    if ss -ltnH 2>/dev/null | awk -v pat=":${cand}\$" '$4 ~ pat{f=1} END{exit f?0:1}'; then
+      cand=$(find_free_port "$cand" 20000 65000 || true)
+      if [[ -z "$cand" ]]; then
+        err "no free port available while assigning for $var (last tried: $cur)"; exit 1
+      fi
+    fi
+    # avoid duplicates chosen in this loop
+    local attempts=0
+    while [[ -n "${reserved[$cand]:-}" ]]; do
+      attempts=$((attempts+1))
+      local start=$((cand+1)); [[ $start -lt 20000 ]] && start=20000
+      local next
+      next=$(find_free_port "$start" "$start" 65000 || true)
+      if [[ -z "$next" ]]; then
+        next=$(find_free_port 20000 20000 65000 || true)
+      fi
+      if [[ -z "$next" || "$next" == "$cand" ]]; then
+        err "no free port available while assigning for $var (last tried: $cand)"; exit 1
+      fi
+      cand="$next"
+      if (( attempts > 1000 )); then err "port assignment loop exceeded for $var"; exit 1; fi
+    done
+    reserved["$cand"]=1
+    if [[ "$cand" != "$cur" ]]; then
+      log " port reassigned: $var $cur -> $cand"
+    fi
+    # always write the value so the var exists in .env for clarity
+    _set_env_var "$var" "$cand"
+  done
+}
+
 prepare_data_dirs() {
   if [[ $EUID -ne 0 ]]; then
     echo -e "\033[1;33m[WARN]\033[0m running as non-root: will not chown data dirs."
@@ -269,6 +377,7 @@ selfcheck() {
 main() {
   mkdir -p "$PKG_ROOT/logs"
   prepare_env
+  auto_assign_ports
   prepare_data_dirs
   load_images
   bring_up