[#37] 增加部署时自动检测空闲端口;增加es 水位检测和临时应急处理
This commit is contained in:
parent
2ff7c55f3b
commit
94b3e910b3
@ -116,6 +116,7 @@ fi
|
||||
# 4) Scripts & Docs
# Stage both template trees (scripts, docs) into the package staging area.
for tpl_sub in scripts docs; do
  copy_tree "$BUILD_DIR/templates/$tpl_sub" "$STAGE/$tpl_sub"
done
# Mark every staged shell script executable. Best-effort: errors are silenced
# and never fail the build.
find "$STAGE/scripts" -type f -name '*.sh' -exec chmod +x {} + 2>/dev/null || true

# 5) Manifests
gen_manifest "$STAGE" "$STAGE/manifest.txt"
|
||||
|
||||
82
deployment/build/templates/scripts/es-watermark-relax.sh
Normal file
82
deployment/build/templates/scripts/es-watermark-relax.sh
Normal file
@ -0,0 +1,82 @@
|
||||
#!/usr/bin/env bash
# es-watermark-relax.sh - temporary emergency relief for an Elasticsearch node
# that is blocked by its disk watermarks.
#
# Steps:
#   1. Load compose/.env (when present) and verify ES answers on localhost.
#   2. PUT *transient* cluster settings that relax/disable the disk watermarks.
#   3. Optionally clear read_only / read_only_allow_delete blocks on all indices.
#   4. Optionally force .kibana* to replicas=0 (index template + live indices).
#   5. Retry failed shard allocations, then print health and a one-line summary.
#
# Environment overrides:
#   ES_HTTP_PORT               host port of Elasticsearch        (default 9200)
#   RELAX_WM_LOW/HIGH/FLOOD    watermark values to apply         (default 99%)
#   DISABLE_WATERMARK          1 => threshold_enabled=false      (default 1)
#   SET_KIBANA_REPLICAS_ZERO   0 => skip the .kibana* step       (default 1)
#   CLEAR_READONLY_BLOCKS      1 => clear index read-only blocks (default 1)
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
ENV_FILE="$ROOT/compose/.env"
if [[ -f "$ENV_FILE" ]]; then
  set -a
  # shellcheck disable=SC1090
  source "$ENV_FILE" || echo "[RELAX][WARN] could not fully load $ENV_FILE" >&2
  # Always switch allexport back off, even when sourcing reported a failure.
  set +a
fi

ES_URL="http://localhost:${ES_HTTP_PORT:-9200}"

# Tunables (env overrides)
RELAX_WM_LOW="${RELAX_WM_LOW:-99%}"
RELAX_WM_HIGH="${RELAX_WM_HIGH:-99%}"
RELAX_WM_FLOOD="${RELAX_WM_FLOOD:-99%}"
DISABLE_WATERMARK="${DISABLE_WATERMARK:-1}"
SET_KIBANA_REPLICAS_ZERO="${SET_KIBANA_REPLICAS_ZERO:-1}"
CLEAR_READONLY_BLOCKS="${CLEAR_READONLY_BLOCKS:-1}"

#######################################
# Print the value of the LAST "key": value pair found in JSON read from stdin.
# Works on compact single-line JSON as well as pretty-printed JSON; quotes
# around string values are removed. Prints nothing (and still returns 0) when
# the key is absent, so it is safe under `set -e` / `pipefail`.
# Arguments: $1 - JSON key (used as a regex fragment; dots match any char)
#######################################
json_value() {
  awk -v k="$1" '
    BEGIN { re = "\"" k "\"[ \t]*:[ \t]*(\"[^\"]*\"|[^,}\" \t]+)" }
    {
      rest = $0
      while (match(rest, re)) {
        val = substr(rest, RSTART, RLENGTH)
        rest = substr(rest, RSTART + RLENGTH)
        sub(/^"[^"]*"[ \t]*:[ \t]*/, "", val)
        gsub(/"/, "", val)
        last = val
        seen = 1
      }
    }
    END { if (seen) print last }
  '
}

echo "[RELAX] Checking Elasticsearch at $ES_URL"
code=$(curl -s -o /dev/null -w '%{http_code}' "$ES_URL/_cluster/health" || true)
if [[ "$code" != "200" ]]; then
  echo "[RELAX][ERROR] ES not reachable (code=$code). Ensure argus-es-sys is running." >&2
  exit 1
fi

echo "[RELAX] Applying transient cluster settings (watermarks)"
if [[ "$DISABLE_WATERMARK" == "1" ]]; then
  th_enabled=false
else
  th_enabled=true
fi
settings_payload=$(cat <<JSON
{
  "transient": {
    "cluster.routing.allocation.disk.threshold_enabled": $th_enabled,
    "cluster.routing.allocation.disk.watermark.low": "$RELAX_WM_LOW",
    "cluster.routing.allocation.disk.watermark.high": "$RELAX_WM_HIGH",
    "cluster.routing.allocation.disk.watermark.flood_stage": "$RELAX_WM_FLOOD"
  }
}
JSON
)
# This is the primary action: a transport failure here aborts the script.
curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_cluster/settings" \
  -d "$settings_payload" | sed -n '1,5p'

if [[ "$CLEAR_READONLY_BLOCKS" == "1" ]]; then
  echo "[RELAX] Clearing read_only/read_only_allow_delete blocks on all indices (best-effort)"
  curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_all/_settings" -d '{
    "index.blocks.read_only": false,
    "index.blocks.read_only_allow_delete": false
  }' >/dev/null || true
fi

if [[ "${SET_KIBANA_REPLICAS_ZERO:-1}" != "0" ]]; then
  echo "[RELAX] Ensure .kibana* use replicas=0 via index template and per-index settings (best-effort)"
  # high priority template for .kibana* only, avoid impacting other indices
  curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_index_template/kibana-replicas-0" -d '{
    "index_patterns": [".kibana*"],
    "priority": 200,
    "template": { "settings": { "number_of_replicas": 0 } }
  }' >/dev/null || true
  # set existing .kibana* to replicas=0
  # The listing is best-effort too: with pipefail a curl failure would
  # otherwise abort the whole script at this assignment.
  kibana_indices=$(curl -sS "$ES_URL/_cat/indices/.kibana*?h=index" | awk '{print $1}') || kibana_indices=""
  while IFS= read -r idx; do
    [[ -n "$idx" ]] || continue
    curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/$idx/_settings" \
      -d '{"index":{"number_of_replicas":0}}' >/dev/null || true
  done <<<"$kibana_indices"
fi

# Retry failed shard allocations (best-effort)
curl -sS -H 'Content-Type: application/json' -X POST "$ES_URL/_cluster/reroute?retry_failed=true" -d '{}' >/dev/null || true

echo "[RELAX] Cluster health (post):"
curl -sS "$ES_URL/_cluster/health?pretty" | sed -n '1,80p'

# Simple current status summary
ch=$(curl -sS "$ES_URL/_cluster/health" || true)
# Without ?pretty the health document is one compact line, so values must be
# located by key; splitting the whole line on quotes and taking a fixed field
# returns the cluster name instead of the status.
status=$(printf '%s' "$ch" | json_value status)
unassigned=$(printf '%s' "$ch" | json_value unassigned_shards)
duse=$(docker exec argus-es-sys sh -lc 'df -P /usr/share/elasticsearch/data | awk "NR==2{print \$5}"' 2>/dev/null || true)
settings=$(curl -sS "$ES_URL/_cluster/settings?flat_settings=true" || true)
# json_value never fails on a missing key, so an absent setting shows up as
# "?" in the summary instead of killing the script via pipefail.
th=$(printf '%s' "$settings" | json_value 'cluster.routing.allocation.disk.threshold_enabled')
low=$(printf '%s' "$settings" | json_value 'cluster.routing.allocation.disk.watermark.low')
high=$(printf '%s' "$settings" | json_value 'cluster.routing.allocation.disk.watermark.high')
flood=$(printf '%s' "$settings" | json_value 'cluster.routing.allocation.disk.watermark.flood_stage')
ks=$(curl -sS "$ES_URL/_cat/shards/.kibana*?h=state" || true)
total=$(printf '%s' "$ks" | awk 'NF{c++} END{print c+0}')
started=$(printf '%s' "$ks" | awk '/STARTED/{c++} END{print c+0}')
unass=$(printf '%s' "$ks" | awk '/UNASSIGNED/{c++} END{print c+0}')
echo "[RELAX][SUMMARY] status=${status:-?} unassigned=${unassigned:-?} es.data.use=${duse:-?} watermarks(threshold=${th:-?} low=${low:-?} high=${high:-?} flood=${flood:-?}) kibana_shards(total=${total},started=${started},unassigned=${unass})"

echo "[RELAX] Done. Remember to run scripts/es-watermark-restore.sh after freeing disk space and cluster becomes stable."
|
||||
37
deployment/build/templates/scripts/es-watermark-restore.sh
Normal file
37
deployment/build/templates/scripts/es-watermark-restore.sh
Normal file
@ -0,0 +1,37 @@
|
||||
#!/usr/bin/env bash
# es-watermark-restore.sh - re-enable the disk threshold and drop the transient
# watermark values once disk space has been freed.
#
# Environment:
#   ES_HTTP_PORT              host port of Elasticsearch (default 9200)
#   RESTORE_DEFAULT_REPLICAS  1 => also send index.number_of_replicas=1 (default 0)
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
ENV_FILE="$ROOT/compose/.env"
# Export everything defined in compose/.env when the file exists.
[[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a

ES_URL="http://localhost:${ES_HTTP_PORT:-9200}"

echo "[RESTORE] Checking Elasticsearch at $ES_URL"
http_status=$(curl -s -o /dev/null -w '%{http_code}' "$ES_URL/_cluster/health" || true)
[[ "$http_status" == "200" ]] || {
  echo "[RESTORE][ERROR] ES not reachable (code=$http_status). Ensure argus-es-sys is running." >&2
  exit 1
}

echo "[RESTORE] Re-enabling disk threshold and clearing relaxed watermarks (transient)"
# The three watermark keys are sent as null; the threshold is switched back on.
reset_payload='{
  "transient": {
    "cluster.routing.allocation.disk.threshold_enabled": true,
    "cluster.routing.allocation.disk.watermark.low": null,
    "cluster.routing.allocation.disk.watermark.high": null,
    "cluster.routing.allocation.disk.watermark.flood_stage": null
  }
}'
curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_cluster/settings" \
  -d "$reset_payload" | sed -n '1,5p'

# Optionally restore default replicas to 1 (set RESTORE_DEFAULT_REPLICAS=1 to enable)
# NOTE(review): index.number_of_replicas is an index-level setting; the cluster
# settings endpoint may reject it, and the response is discarded here - confirm
# this request has the intended effect.
case "${RESTORE_DEFAULT_REPLICAS:-0}" in
  1)
    echo "[RESTORE] Setting transient default index.number_of_replicas=1"
    curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_cluster/settings" \
      -d '{"transient":{"index.number_of_replicas":"1"}}' >/dev/null || true
    ;;
esac

echo "[RESTORE] Cluster health:"
curl -sS "$ES_URL/_cluster/health?pretty" | sed -n '1,80p'

echo "[RESTORE] Done. Verify shards and consider keeping replicas=0 for single-node deployments."
|
||||
|
||||
@ -89,6 +89,29 @@ logd "Web-Proxy 8083: $(http_code "http://localhost:${WEB_PROXY_PORT_8083:-8083}
|
||||
logd "Web-Proxy 8084 CORS: ${cors8084}"
|
||||
logd "Web-Proxy 8085 CORS: ${cors8085}"
|
||||
|
||||
# Elasticsearch deep checks: disk watermark and Kibana index status
section ES-CHECKS
ch=$(curl -s --max-time 3 "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health" || true)
# Without ?pretty the health document is one compact line, so splitting the
# whole line on quotes and taking field 4 yields the cluster name rather than
# the status. Locate the "status":"..." pair itself instead.
status=$(printf '%s' "$ch" | awk '
  match($0, /"status"[ \t]*:[ \t]*"[^"]*"/) {
    split(substr($0, RSTART, RLENGTH), part, "\"")
    print part[4]
    exit
  }')
if [[ -n "$status" ]]; then logd "cluster.status=$status"; fi
if [[ "$status" != "green" ]]; then append_err "[es][cluster] status=$status"; fi

if docker ps --format '{{.Names}}' | grep -q '^argus-es-sys$'; then
  duse=$(docker exec argus-es-sys sh -lc 'df -P /usr/share/elasticsearch/data | awk "NR==2{print \$5}"' 2>/dev/null || true)
  logd "es.data.df_use=$duse"
  # Drop the trailing percent sign(s) so the value can be compared numerically.
  usep=${duse%%%}
  # Only do arithmetic on a plain integer; df output may be empty or unexpected.
  if [[ "$usep" =~ ^[0-9]+$ ]] && (( usep >= 90 )); then
    append_err "[es][disk] data path usage ${duse} (>=90%) likely hit high/flood watermarks"
    echo "HINT: High ES disk usage. Free space or run scripts/es-watermark-relax.sh (then scripts/es-watermark-restore.sh)." >&2
  fi
fi

ks=$(curl -s --max-time 3 "http://localhost:${ES_HTTP_PORT:-9200}/_cat/shards/.kibana*?h=index,shard,prirep,state,unassigned.reason" || true)
# Whole-word match via -w. The previous single-quoted '\\b...\\b' pattern
# demanded a literal backslash followed by "b", so it could never match.
if printf '%s' "$ks" | grep -Eiqw '(UNASSIGNED|INITIALIZING|RELOCATING)'; then
  append_err "[kibana][index] .kibana* shards not green"; logd "$ks"
  echo "HINT: .kibana* shards not green. On single-node, set replicas=0 and ensure ES disk watermarks are not exceeded." >&2
fi
|
||||
|
||||
# Overlay network diagnostics
|
||||
section OVERLAY-NET
|
||||
if docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" >/dev/null 2>&1; then
|
||||
|
||||
@ -33,6 +33,89 @@ prepare_env() {
|
||||
# overlay 模式下,避免为不同服务分配到同一新端口;保持模板端口不做自动改写
|
||||
}
|
||||
|
||||
# read VAR from .env (simple parser)
|
||||
_read_env_var() {
  # Print the value of the first "<name>=" line in $ENV_FILE (everything after
  # the first "="). Returns 1 when the file is missing or the name is absent.
  local key="$1"
  local env_path="$ENV_FILE"
  if [[ ! -f "$env_path" ]]; then
    return 1
  fi
  awk -F'=' -v k="$key" '
    BEGIN { found = 0 }
    $1 == k {
      print substr($0, index($0, "=") + 1)
      found = 1
      exit
    }
    END { exit found ? 0 : 1 }
  ' "$env_path"
}
|
||||
|
||||
# Set VAR=VAL in $ENV_FILE: replace every existing "VAR=" line, or append one.
# Arguments: $1 - variable name, $2 - value (written literally)
# Returns:   0 on success, non-zero when the file could not be rewritten
# The replace path writes a sibling temp file and renames it over the original;
# the append path writes to the file in place.
_set_env_var() {
  local var="$1" val="$2" f="$ENV_FILE" tmp
  if [[ -f "$f" ]] && grep -qE "^${var}=" "$f"; then
    tmp=$(mktemp "${f}.tmp.XXXXXX") || return 1
    # cp -p first so the temp file carries the original mode, then rewrite it.
    # The value travels through the environment and is printed verbatim:
    # interpolating it into a sed replacement breaks on '#', '&' and '\'.
    if cp -p -- "$f" "$tmp" \
      && _SEV_KEY="$var" _SEV_VAL="$val" awk '
           BEGIN {
             prefix = ENVIRON["_SEV_KEY"] "="
             plen = length(prefix)
             repl = prefix ENVIRON["_SEV_VAL"]
           }
           substr($0, 1, plen) == prefix { print repl; next }
           { print }
         ' "$f" >"$tmp" \
      && mv -f -- "$tmp" "$f"; then
      return 0
    fi
    rm -f -- "$tmp"
    return 1
  fi
  [[ -f "$f" ]] || : >"$f"
  # If the last line lacks a newline, add one so the new entry starts on its
  # own line instead of being glued to the previous value.
  if [[ -s "$f" && -n "$(tail -c 1 "$f")" ]]; then
    printf '\n' >>"$f"
  fi
  printf '%s=%s\n' "$var" "$val" >>"$f"
}
|
||||
|
||||
#######################################
# Make sure every published host port listed below is free, rewriting
# $ENV_FILE where a configured port is already listening or was already
# handed out earlier in this run.
# Globals:   AUTO_ASSIGN_PORTS (read; 0|false|no|off disables), ENV_FILE (read,
#            file rewritten via _set_env_var)
# Uses:      log, err, find_free_port, _read_env_var, _set_env_var, ss
# Returns:   0 normally; exits 1 when no free port can be found
#######################################
auto_assign_ports() {
  local enable="${AUTO_ASSIGN_PORTS:-true}"
  case "$enable" in
    0|false|no|off) log "AUTO_ASSIGN_PORTS disabled"; return;;
  esac
  [[ -f "$ENV_FILE" ]] || return 0
  log "auto-assigning free host ports (with fallback)"
  # Keep a timestamped backup of .env before touching it (best-effort).
  cp "$ENV_FILE" "$ENV_FILE.bak.$(date +%Y%m%d-%H%M%S)" || true

  # list of VAR:default pairs to try; FTP-related ports and the passive port
  # range are not rewritten automatically
  local pairs=(
    "MASTER_PORT:32300"
    "ES_HTTP_PORT:9200"
    "KIBANA_PORT:5601"
    "PROMETHEUS_PORT:9090"
    "ALERTMANAGER_PORT:9093"
    "GRAFANA_PORT:3000"
    "WEB_PROXY_PORT_8080:8080"
    "WEB_PROXY_PORT_8081:8081"
    "WEB_PROXY_PORT_8082:8082"
    "WEB_PROXY_PORT_8083:8083"
    "WEB_PROXY_PORT_8084:8084"
    "WEB_PROXY_PORT_8085:8085"
  )

  # track ports reserved in this run to avoid duplicates
  declare -A reserved=()
  # pre-mark currently listening ports to avoid choosing them twice within the same run
  local lp
  while read -r lp; do
    # An empty subscript is an error for associative arrays; skip blanks.
    if [[ -n "$lp" ]]; then reserved["$lp"]=1; fi
  done < <(ss -ltnH 2>/dev/null | awk '{print $4}' | sed -n 's#.*:##p')

  local ent var def cur cand attempts start next
  for ent in "${pairs[@]}"; do
    var=${ent%%:*}
    def=${ent##*:}
    if ! cur=$(_read_env_var "$var"); then cur="$def"; fi
    # Strip a trailing carriage return (CRLF .env files) and double quotes.
    # $'\r' is required: an unquoted \r in the pattern is just the letter "r".
    cur=${cur%$'\r'}
    cur=${cur//\"/}
    # An empty or non-numeric value cannot be probed or used as a subscript.
    if [[ ! "$cur" =~ ^[0-9]+$ ]]; then
      log "  $var has non-numeric value '$cur'; falling back to default $def"
      cur="$def"
    fi

    cand="$cur"
    # if already in use, pick a free one
    if ss -ltnH 2>/dev/null | awk -v pat=":${cand}\$" '$4 ~ pat{f=1} END{exit f?0:1}'; then
      # NOTE(review): this call passes (port, 20000, 65000) while the calls
      # below pass (start, start, 65000); confirm against find_free_port.
      cand=$(find_free_port "$cand" 20000 65000)
    fi

    # avoid duplicates chosen in this loop
    attempts=0
    while [[ -n "${reserved[$cand]:-}" ]]; do
      attempts=$((attempts+1))
      start=$((cand+1))
      if (( start < 20000 )); then start=20000; fi
      next=$(find_free_port "$start" "$start" 65000 || true)
      if [[ -z "$next" ]]; then
        # Nothing above the candidate: retry from the bottom of the range.
        next=$(find_free_port 20000 20000 65000 || true)
      fi
      if [[ -z "$next" || "$next" == "$cand" ]]; then
        err "no free port available while assigning for $var (last tried: $cand)"; exit 1
      fi
      cand="$next"
      if (( attempts > 1000 )); then err "port assignment loop exceeded for $var"; exit 1; fi
    done
    reserved["$cand"]=1

    if [[ "$cand" != "$cur" ]]; then
      log "  port reassigned: $var $cur -> $cand"
    fi
    # Always write the value, so the var exists in .env even when unchanged.
    _set_env_var "$var" "$cand"
  done
}
|
||||
|
||||
prepare_data_dirs() {
|
||||
if [[ $EUID -ne 0 ]]; then
|
||||
echo -e "\033[1;33m[WARN]\033[0m running as non-root: will not chown data dirs."
|
||||
@ -269,6 +352,7 @@ selfcheck() {
|
||||
main() {
|
||||
mkdir -p "$PKG_ROOT/logs"
|
||||
prepare_env
|
||||
auto_assign_ports
|
||||
prepare_data_dirs
|
||||
load_images
|
||||
bring_up
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user