[#37] 增加部署时自动检测空闲端口;增加es 水位检测和临时应急处理

This commit is contained in:
yuyr 2025-11-05 16:21:34 +08:00
parent 2ff7c55f3b
commit 94b3e910b3
5 changed files with 227 additions and 0 deletions

View File

@ -116,6 +116,7 @@ fi
# 4) Scripts & Docs # 4) Scripts & Docs
copy_tree "$BUILD_DIR/templates/scripts" "$STAGE/scripts" copy_tree "$BUILD_DIR/templates/scripts" "$STAGE/scripts"
copy_tree "$BUILD_DIR/templates/docs" "$STAGE/docs" copy_tree "$BUILD_DIR/templates/docs" "$STAGE/docs"
find "$STAGE/scripts" -type f -name '*.sh' -exec chmod +x {} + 2>/dev/null || true
# 5) Manifests # 5) Manifests
gen_manifest "$STAGE" "$STAGE/manifest.txt" gen_manifest "$STAGE" "$STAGE/manifest.txt"

View File

@ -0,0 +1,82 @@
#!/usr/bin/env bash
# Emergency helper: relax Elasticsearch disk watermarks so a nearly-full node
# keeps allocating shards, optionally clear read-only index blocks and force
# .kibana* indices to replicas=0, then print a short status summary.
#
# Environment (all optional; compose/.env is sourced first when present):
#   ES_HTTP_PORT               host port of Elasticsearch (default 9200)
#   RELAX_WM_LOW/HIGH/FLOOD    transient watermark values (default 99%)
#   DISABLE_WATERMARK          1 = set disk.threshold_enabled=false (default 1)
#   CLEAR_READONLY_BLOCKS      1 = clear read_only* blocks on all indices (default 1)
#   SET_KIBANA_REPLICAS_ZERO   anything but 0 = force .kibana* replicas to 0 (default 1)
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a
ES_URL="http://localhost:${ES_HTTP_PORT:-9200}"

# Tunables (env overrides)
RELAX_WM_LOW="${RELAX_WM_LOW:-99%}"
RELAX_WM_HIGH="${RELAX_WM_HIGH:-99%}"
RELAX_WM_FLOOD="${RELAX_WM_FLOOD:-99%}"
DISABLE_WATERMARK="${DISABLE_WATERMARK:-1}"
SET_KIBANA_REPLICAS_ZERO="${SET_KIBANA_REPLICAS_ZERO:-1}"
CLEAR_READONLY_BLOCKS="${CLEAR_READONLY_BLOCKS:-1}"

# Print the string value of JSON key $1 from the document on stdin.
# Matches on the key itself, so it works for compact single-line JSON as well
# as pretty-printed output (positional field splitting does not).
json_str_field() {
  sed -n "s/.*\"$1\"[[:space:]]*:[[:space:]]*\"\([^\"]*\)\".*/\1/p" | sed -n '1p'
}

# Print the integer value of JSON key $1 from the document on stdin.
json_num_field() {
  sed -n "s/.*\"$1\"[[:space:]]*:[[:space:]]*\([0-9][0-9]*\).*/\1/p" | sed -n '1p'
}

# Print the last value found for flat setting key $1 inside $settings.
flat_setting() {
  printf '%s' "$settings" \
    | grep -o "\"$1\"[^,}]*" \
    | awk -F: '{gsub(/["} ]/,"",$2);print $2}' \
    | tail -n1
}

echo "[RELAX] Checking Elasticsearch at $ES_URL"
code=$(curl -s -o /dev/null -w '%{http_code}' "$ES_URL/_cluster/health" || true)
if [[ "$code" != "200" ]]; then
  echo "[RELAX][ERROR] ES not reachable (code=$code). Ensure argus-es-sys is running." >&2
  exit 1
fi

echo "[RELAX] Applying transient cluster settings (watermarks)"
if [[ "$DISABLE_WATERMARK" == "1" ]]; then
  th_enabled=false
else
  th_enabled=true
fi
curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_cluster/settings" -d "{
\"transient\": {
\"cluster.routing.allocation.disk.threshold_enabled\": $th_enabled,
\"cluster.routing.allocation.disk.watermark.low\": \"$RELAX_WM_LOW\",
\"cluster.routing.allocation.disk.watermark.high\": \"$RELAX_WM_HIGH\",
\"cluster.routing.allocation.disk.watermark.flood_stage\": \"$RELAX_WM_FLOOD\"
}
}" | sed -n '1,5p'

if [[ "$CLEAR_READONLY_BLOCKS" == "1" ]]; then
  echo "[RELAX] Clearing read_only/read_only_allow_delete blocks on all indices (best-effort)"
  curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_all/_settings" -d '{
"index.blocks.read_only": false,
"index.blocks.read_only_allow_delete": false
}' >/dev/null || true
fi

if [[ "${SET_KIBANA_REPLICAS_ZERO:-1}" != "0" ]]; then
  echo "[RELAX] Ensure .kibana* use replicas=0 via index template and per-index settings (best-effort)"
  # high priority template for .kibana* only, avoid impacting other indices
  curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_index_template/kibana-replicas-0" -d '{
"index_patterns": [".kibana*"],
"priority": 200,
"template": { "settings": { "number_of_replicas": 0 } }
}' >/dev/null || true
  # set existing .kibana* to replicas=0
  # "|| true": this step is best-effort; a failed listing must not abort the
  # script under set -e / pipefail.
  idxs=$(curl -sS "$ES_URL/_cat/indices/.kibana*?h=index" | awk '{print $1}' || true)
  while IFS= read -r i; do
    [[ -n "$i" ]] || continue
    curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/$i/_settings" -d '{"index":{"number_of_replicas":0}}' >/dev/null || true
  done <<<"$idxs"
fi

# Retry failed shard allocations (best-effort)
curl -sS -H 'Content-Type: application/json' -X POST "$ES_URL/_cluster/reroute?retry_failed=true" -d '{}' >/dev/null || true

echo "[RELAX] Cluster health (post):"
curl -sS "$ES_URL/_cluster/health?pretty" | sed -n '1,80p'

# Simple current status summary. Every lookup is guarded with "|| true" so a
# missing key or a failed request degrades to "?" instead of aborting.
ch=$(curl -sS "$ES_URL/_cluster/health" || true)
status=$(printf '%s' "$ch" | json_str_field status || true)
unassigned=$(printf '%s' "$ch" | json_num_field unassigned_shards || true)
duse=$(docker exec argus-es-sys sh -lc 'df -P /usr/share/elasticsearch/data | awk "NR==2{print \$5}"' 2>/dev/null || true)
settings=$(curl -sS "$ES_URL/_cluster/settings?flat_settings=true" || true)
th=$(flat_setting 'cluster.routing.allocation.disk.threshold_enabled' || true)
low=$(flat_setting 'cluster.routing.allocation.disk.watermark.low' || true)
high=$(flat_setting 'cluster.routing.allocation.disk.watermark.high' || true)
flood=$(flat_setting 'cluster.routing.allocation.disk.watermark.flood_stage' || true)
ks=$(curl -sS "$ES_URL/_cat/shards/.kibana*?h=state" || true)
total=$(printf '%s' "$ks" | awk 'NF{c++} END{print c+0}')
started=$(printf '%s' "$ks" | awk '/STARTED/{c++} END{print c+0}')
unass=$(printf '%s' "$ks" | awk '/UNASSIGNED/{c++} END{print c+0}')
echo "[RELAX][SUMMARY] status=${status:-?} unassigned=${unassigned:-?} es.data.use=${duse:-?} watermarks(threshold=${th:-?} low=${low:-?} high=${high:-?} flood=${flood:-?}) kibana_shards(total=${total},started=${started},unassigned=${unass})"
echo "[RELAX] Done. Remember to run scripts/es-watermark-restore.sh after freeing disk space and cluster becomes stable."

View File

@ -0,0 +1,37 @@
#!/usr/bin/env bash
# Undo the emergency watermark relaxation: re-enable the disk threshold and
# reset the transient low/high/flood_stage watermarks to null. Set
# RESTORE_DEFAULT_REPLICAS=1 to additionally send a replicas=1 cluster setting.
# compose/.env is sourced (exported) first when it exists; ES_HTTP_PORT
# selects the Elasticsearch host port (default 9200).
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
ENV_FILE="$ROOT/compose/.env"
[[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a
ES_URL="http://localhost:${ES_HTTP_PORT:-9200}"

# PUT the JSON document $1 to the cluster settings endpoint; the response
# body is written to stdout.
put_cluster_settings() {
  curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_cluster/settings" -d "$1"
}

# Progress message with the script's log prefix.
say() {
  printf '[RESTORE] %s\n' "$*"
}

say "Checking Elasticsearch at $ES_URL"
http_status=$(curl -s -o /dev/null -w '%{http_code}' "$ES_URL/_cluster/health" || true)
if [[ "$http_status" != "200" ]]; then
  printf '%s\n' "[RESTORE][ERROR] ES not reachable (code=$http_status). Ensure argus-es-sys is running." >&2
  exit 1
fi

say "Re-enabling disk threshold and clearing relaxed watermarks (transient)"
# Only the first five lines of the response are shown.
put_cluster_settings '{
"transient": {
"cluster.routing.allocation.disk.threshold_enabled": true,
"cluster.routing.allocation.disk.watermark.low": null,
"cluster.routing.allocation.disk.watermark.high": null,
"cluster.routing.allocation.disk.watermark.flood_stage": null
}
}' | sed -n '1,5p'

# Optionally restore default replicas to 1 (set RESTORE_DEFAULT_REPLICAS=1 to enable)
# NOTE(review): index.number_of_replicas is an index-level setting; the cluster
# settings API may reject it, and the response is discarded here — confirm
# this step has any effect.
case "${RESTORE_DEFAULT_REPLICAS:-0}" in
  1)
    say "Setting transient default index.number_of_replicas=1"
    put_cluster_settings '{"transient":{"index.number_of_replicas":"1"}}' >/dev/null || true
    ;;
esac

say "Cluster health:"
curl -sS "$ES_URL/_cluster/health?pretty" | sed -n '1,80p'
say "Done. Verify shards and consider keeping replicas=0 for single-node deployments."

View File

@ -89,6 +89,29 @@ logd "Web-Proxy 8083: $(http_code "http://localhost:${WEB_PROXY_PORT_8083:-8083}
logd "Web-Proxy 8084 CORS: ${cors8084}" logd "Web-Proxy 8084 CORS: ${cors8084}"
logd "Web-Proxy 8085 CORS: ${cors8085}" logd "Web-Proxy 8085 CORS: ${cors8085}"
# Elasticsearch deep checks: disk watermark and Kibana index status
section ES-CHECKS
ch=$(curl -s --max-time 3 "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health" || true)
# The health response is requested without ?pretty, i.e. compact single-line
# JSON, so extract "status" by key rather than by quote-delimited position
# (position 4 on a single line is the cluster name, not the status).
status=$(printf '%s' "$ch" | sed -n 's/.*"status"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' | sed -n '1p')
if [[ -n "$status" ]]; then logd "cluster.status=$status"; fi
if [[ "$status" != "green" ]]; then append_err "[es][cluster] status=$status"; fi

if docker ps --format '{{.Names}}' | grep -q '^argus-es-sys$'; then
  duse=$(docker exec argus-es-sys sh -lc 'df -P /usr/share/elasticsearch/data | awk "NR==2{print \$5}"' 2>/dev/null || true)
  logd "es.data.df_use=$duse"
  usep=${duse%%%}
  # Compare only when df produced a plain integer percentage.
  if [[ "$usep" =~ ^[0-9]+$ ]] && (( usep >= 90 )); then
    append_err "[es][disk] data path usage ${duse} (>=90%) likely hit high/flood watermarks"
    echo "HINT: High ES disk usage. Free space or run scripts/es-watermark-relax.sh (then scripts/es-watermark-restore.sh)." >&2
  fi
fi

ks=$(curl -s --max-time 3 "http://localhost:${ES_HTTP_PORT:-9200}/_cat/shards/.kibana*?h=index,shard,prirep,state,unassigned.reason" || true)
# -w gives whole-word matching; a single-quoted '\\b' would instead look for
# a literal backslash followed by "b" and never match a shard state.
if printf '%s' "$ks" | grep -Eiqw 'UNASSIGNED|INITIALIZING|RELOCATING'; then
  append_err "[kibana][index] .kibana* shards not green"; logd "$ks"
  echo "HINT: .kibana* shards not green. On single-node, set replicas=0 and ensure ES disk watermarks are not exceeded." >&2
fi
# Overlay network diagnostics # Overlay network diagnostics
section OVERLAY-NET section OVERLAY-NET
if docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" >/dev/null 2>&1; then if docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" >/dev/null 2>&1; then

View File

@ -33,6 +33,89 @@ prepare_env() {
# overlay 模式下,避免为不同服务分配到同一新端口;保持模板端口不做自动改写 # overlay 模式下,避免为不同服务分配到同一新端口;保持模板端口不做自动改写
} }
# Look up a variable in $ENV_FILE (simple KEY=VALUE parser).
# Arguments: $1 - variable name
# Outputs:   everything after the first "=" of the first line whose key equals $1
# Returns:   0 when the key was found; non-zero when $ENV_FILE is missing or
#            the key is absent
_read_env_var() {
  local key="$1"
  local env_path="$ENV_FILE"
  if [[ ! -f "$env_path" ]]; then
    return 1
  fi
  awk -F'=' -v k="$key" '
    $1 == k {
      print substr($0, index($0, "=") + 1)
      hit = 1
      exit
    }
    END { exit !hit }
  ' "$env_path"
}
# Set VAR=VAL in $ENV_FILE.
# An existing "VAR=" line is rewritten through a temp file that is then moved
# over the original; otherwise the entry is appended (creating the file if
# needed).
# Arguments: $1 - variable name (used as a regex prefix, as before)
#            $2 - value, written verbatim (may contain '#', '&', '\', '/')
# Globals:   ENV_FILE (read)
_set_env_var() {
  local var="$1" val="$2" f="$ENV_FILE" tmp="$ENV_FILE.tmp$$"
  local line
  if [[ -f "$f" ]] && grep -qE "^${var}=" "$f"; then
    # Rewrite line by line with printf so the value is never interpreted as a
    # sed replacement ('#', '&' and backslashes stay literal).
    while IFS= read -r line || [[ -n "$line" ]]; do
      if [[ "$line" =~ ^${var}= ]]; then
        printf '%s=%s\n' "$var" "$val"
      else
        printf '%s\n' "$line"
      fi
    done <"$f" >"$tmp" && mv "$tmp" "$f"
  else
    [[ -f "$f" ]] || : >"$f"
    # If the last line has no terminating newline, add one first so the new
    # entry is not glued onto it.
    if [[ -s "$f" && -n "$(tail -c 1 "$f")" ]]; then
      printf '\n' >>"$f"
    fi
    printf "%s=%s\n" "$var" "$val" >>"$f"
  fi
}
# Make sure every published host port in $ENV_FILE is free and unique.
# For each VAR:default pair the current value is read from .env (falling back
# to the default); if that port is already listening, or was already handed
# out earlier in this run, a free replacement is requested from
# find_free_port and written back with _set_env_var.
# Globals:  ENV_FILE (read/written), AUTO_ASSIGN_PORTS (read; 0/false/no/off disables)
# Requires: log, err, find_free_port, _read_env_var, _set_env_var, ss
# Exits the script with status 1 when no free port can be found.
auto_assign_ports() {
  local enable="${AUTO_ASSIGN_PORTS:-true}"
  case "$enable" in
    0|false|no|off) log "AUTO_ASSIGN_PORTS disabled"; return;;
  esac
  [[ -f "$ENV_FILE" ]] || return 0
  log "auto-assigning free host ports (with fallback)"
  cp "$ENV_FILE" "$ENV_FILE.bak.$(date +%Y%m%d-%H%M%S)" || true

  # list of VAR:default pairs to try; FTP ports and the passive port range are
  # intentionally not rewritten automatically
  local pairs=(
    "MASTER_PORT:32300"
    "ES_HTTP_PORT:9200"
    "KIBANA_PORT:5601"
    "PROMETHEUS_PORT:9090"
    "ALERTMANAGER_PORT:9093"
    "GRAFANA_PORT:3000"
    "WEB_PROXY_PORT_8080:8080"
    "WEB_PROXY_PORT_8081:8081"
    "WEB_PROXY_PORT_8082:8082"
    "WEB_PROXY_PORT_8083:8083"
    "WEB_PROXY_PORT_8084:8084"
    "WEB_PROXY_PORT_8085:8085"
  )

  # track ports reserved in this run to avoid duplicates
  declare -A reserved=()
  # pre-mark currently listening ports to avoid choosing them twice within the same run
  local lp
  while read -r lp; do
    # an empty key is an invalid associative-array subscript
    [[ -n "$lp" ]] || continue
    reserved["$lp"]=1
  done < <(ss -ltnH 2>/dev/null | awk '{print $4}' | sed -n 's#.*:##p')

  local ent var def cur cand next start attempts
  for ent in "${pairs[@]}"; do
    var=${ent%%:*}; def=${ent##*:}
    if ! cur=$(_read_env_var "$var"); then cur="$def"; fi
    # strip a trailing carriage return (CRLF .env files) and any quotes;
    # $'\r' is required here: a bare \r in the pattern means the letter "r"
    cur=${cur%$'\r'}; cur=${cur//\"/}; cur=${cur//\'/}
    if [[ ! "$cur" =~ ^[0-9]+$ ]]; then
      log "  $var has no usable port value; falling back to default $def"
      cur="$def"
    fi

    cand="$cur"
    # if already listening, pick a free one
    if ss -ltnH 2>/dev/null | awk -v port="$cand" '$4 ~ (":" port "$") {f=1} END{exit f?0:1}'; then
      cand=$(find_free_port "$cand" 20000 65000 || true)
      if [[ -z "$cand" ]]; then
        err "no free port available while assigning for $var (last tried: $cur)"; exit 1
      fi
    fi

    # avoid duplicates chosen in this loop
    attempts=0
    while [[ -n "${reserved[$cand]:-}" ]]; do
      attempts=$((attempts+1))
      start=$((cand+1))
      if (( start < 20000 )); then start=20000; fi
      next=$(find_free_port "$start" "$start" 65000 || true)
      if [[ -z "$next" ]]; then
        next=$(find_free_port 20000 20000 65000 || true)
      fi
      if [[ -z "$next" || "$next" == "$cand" ]]; then
        err "no free port available while assigning for $var (last tried: $cand)"; exit 1
      fi
      cand="$next"
      if (( attempts > 1000 )); then err "port assignment loop exceeded for $var"; exit 1; fi
    done
    reserved["$cand"]=1

    if [[ "$cand" != "$cur" ]]; then
      log " port reassigned: $var $cur -> $cand"
    fi
    # always write the value so the var exists in .env for clarity
    _set_env_var "$var" "$cand"
  done
}
prepare_data_dirs() { prepare_data_dirs() {
if [[ $EUID -ne 0 ]]; then if [[ $EUID -ne 0 ]]; then
echo -e "\033[1;33m[WARN]\033[0m running as non-root: will not chown data dirs." echo -e "\033[1;33m[WARN]\033[0m running as non-root: will not chown data dirs."
@ -269,6 +352,7 @@ selfcheck() {
main() { main() {
mkdir -p "$PKG_ROOT/logs" mkdir -p "$PKG_ROOT/logs"
prepare_env prepare_env
auto_assign_ports
prepare_data_dirs prepare_data_dirs
load_images load_images
bring_up bring_up