[#37] 增加部署时自动检测空闲端口;增加es 水位检测和临时应急处理
This commit is contained in:
parent
2ff7c55f3b
commit
94b3e910b3
@ -116,6 +116,7 @@ fi
|
|||||||
# 4) Scripts & Docs
|
# 4) Scripts & Docs
|
||||||
copy_tree "$BUILD_DIR/templates/scripts" "$STAGE/scripts"
|
copy_tree "$BUILD_DIR/templates/scripts" "$STAGE/scripts"
|
||||||
copy_tree "$BUILD_DIR/templates/docs" "$STAGE/docs"
|
copy_tree "$BUILD_DIR/templates/docs" "$STAGE/docs"
|
||||||
|
# Mark every staged *.sh under $STAGE/scripts as executable. Errors from find
# are silenced and the step never fails the build (best-effort).
find "$STAGE/scripts" -type f -name '*.sh' -exec chmod +x {} + 2>/dev/null || true
|
||||||
|
|
||||||
# 5) Manifests
|
# 5) Manifests
|
||||||
gen_manifest "$STAGE" "$STAGE/manifest.txt"
|
gen_manifest "$STAGE" "$STAGE/manifest.txt"
|
||||||
|
|||||||
82
deployment/build/templates/scripts/es-watermark-relax.sh
Normal file
82
deployment/build/templates/scripts/es-watermark-relax.sh
Normal file
@ -0,0 +1,82 @@
|
|||||||
|
#!/usr/bin/env bash
# es-watermark-relax.sh - temporary emergency relief when Elasticsearch has hit
# its disk watermarks.
#
# Steps:
#   1. relax (or disable) the disk-allocation watermarks via transient cluster
#      settings,
#   2. optionally clear read_only / read_only_allow_delete index blocks,
#   3. optionally force .kibana* indices to number_of_replicas=0,
#   4. retry failed shard allocations and print a one-line summary.
#
# Environment overrides (defaults in parentheses):
#   ES_HTTP_PORT (9200), RELAX_WM_LOW / RELAX_WM_HIGH / RELAX_WM_FLOOD (99%),
#   DISABLE_WATERMARK (1), SET_KIBANA_REPLICAS_ZERO (1),
#   CLEAR_READONLY_BLOCKS (1).
# Values are also loaded from ../compose/.env when that file exists.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
ENV_FILE="$ROOT/compose/.env"
if [[ -f "$ENV_FILE" ]]; then
  # Export everything the .env file defines.
  set -a
  # shellcheck disable=SC1090
  source "$ENV_FILE"
  set +a
fi

ES_URL="http://localhost:${ES_HTTP_PORT:-9200}"

# Tunables (env overrides)
RELAX_WM_LOW="${RELAX_WM_LOW:-99%}"
RELAX_WM_HIGH="${RELAX_WM_HIGH:-99%}"
RELAX_WM_FLOOD="${RELAX_WM_FLOOD:-99%}"
DISABLE_WATERMARK="${DISABLE_WATERMARK:-1}"
SET_KIBANA_REPLICAS_ZERO="${SET_KIBANA_REPLICAS_ZERO:-1}"
CLEAR_READONLY_BLOCKS="${CLEAR_READONLY_BLOCKS:-1}"

# Read JSON on stdin and print the scalar value stored under key $1 (last
# occurrence wins). The lookup is by key name, not by field position, so it
# works on the single-line JSON these endpoints return. Prints nothing and
# still succeeds when the key is absent, so it is safe under `set -e`.
json_field() {
  local key=${1//./\\.}
  sed -n 's/.*"'"$key"'"[[:space:]]*:[[:space:]]*"\{0,1\}\([^",}]*\).*/\1/p' | tail -n1
}

echo "[RELAX] Checking Elasticsearch at $ES_URL"
code=$(curl -s -o /dev/null -w '%{http_code}' "$ES_URL/_cluster/health" || true)
if [[ "$code" != "200" ]]; then
  echo "[RELAX][ERROR] ES not reachable (code=$code). Ensure argus-es-sys is running." >&2
  exit 1
fi

echo "[RELAX] Applying transient cluster settings (watermarks)"
if [[ "$DISABLE_WATERMARK" == "1" ]]; then
  th_enabled=false
else
  th_enabled=true
fi
settings_body=$(cat <<EOF
{
  "transient": {
    "cluster.routing.allocation.disk.threshold_enabled": $th_enabled,
    "cluster.routing.allocation.disk.watermark.low": "$RELAX_WM_LOW",
    "cluster.routing.allocation.disk.watermark.high": "$RELAX_WM_HIGH",
    "cluster.routing.allocation.disk.watermark.flood_stage": "$RELAX_WM_FLOOD"
  }
}
EOF
)
# This is the essential step: a transport failure here aborts the script.
curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_cluster/settings" \
  -d "$settings_body" | sed -n '1,5p'

if [[ "$CLEAR_READONLY_BLOCKS" == "1" ]]; then
  echo "[RELAX] Clearing read_only/read_only_allow_delete blocks on all indices (best-effort)"
  curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_all/_settings" -d '{
    "index.blocks.read_only": false,
    "index.blocks.read_only_allow_delete": false
  }' >/dev/null || true
fi

if [[ "$SET_KIBANA_REPLICAS_ZERO" != "0" ]]; then
  echo "[RELAX] Ensure .kibana* use replicas=0 via index template and per-index settings (best-effort)"
  # high priority template for .kibana* only, avoid impacting other indices
  curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_index_template/kibana-replicas-0" -d '{
    "index_patterns": [".kibana*"],
    "priority": 200,
    "template": { "settings": { "number_of_replicas": 0 } }
  }' >/dev/null || true
  # set existing .kibana* to replicas=0; `|| true` keeps a failed listing from
  # aborting the script under pipefail, since this whole section is best-effort
  idxs=$(curl -sS "$ES_URL/_cat/indices/.kibana*?h=index" | awk '{print $1}' || true)
  while IFS= read -r idx; do
    [[ -n "$idx" ]] || continue
    curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/$idx/_settings" \
      -d '{"index":{"number_of_replicas":0}}' >/dev/null || true
  done <<<"$idxs"
fi

# Retry failed shard allocations (best-effort)
curl -sS -H 'Content-Type: application/json' -X POST "$ES_URL/_cluster/reroute?retry_failed=true" -d '{}' >/dev/null || true

echo "[RELAX] Cluster health (post):"
# Informational only: do not abort before the summary if this request fails.
curl -sS "$ES_URL/_cluster/health?pretty" | sed -n '1,80p' || true

# Simple current status summary
ch=$(curl -sS "$ES_URL/_cluster/health" || true)
status=$(printf '%s\n' "$ch" | json_field status)
unassigned=$(printf '%s\n' "$ch" | json_field unassigned_shards)
duse=$(docker exec argus-es-sys sh -lc 'df -P /usr/share/elasticsearch/data | awk "NR==2{print \$5}"' 2>/dev/null || true)
settings=$(curl -sS "$ES_URL/_cluster/settings?flat_settings=true" || true)
th=$(printf '%s\n' "$settings" | json_field cluster.routing.allocation.disk.threshold_enabled)
low=$(printf '%s\n' "$settings" | json_field cluster.routing.allocation.disk.watermark.low)
high=$(printf '%s\n' "$settings" | json_field cluster.routing.allocation.disk.watermark.high)
flood=$(printf '%s\n' "$settings" | json_field cluster.routing.allocation.disk.watermark.flood_stage)
ks=$(curl -sS "$ES_URL/_cat/shards/.kibana*?h=state" || true)
total=$(printf '%s' "$ks" | awk 'NF{c++} END{print c+0}')
started=$(printf '%s' "$ks" | awk '/STARTED/{c++} END{print c+0}')
unass=$(printf '%s' "$ks" | awk '/UNASSIGNED/{c++} END{print c+0}')
echo "[RELAX][SUMMARY] status=${status:-?} unassigned=${unassigned:-?} es.data.use=${duse:-?} watermarks(threshold=${th:-?} low=${low:-?} high=${high:-?} flood=${flood:-?}) kibana_shards(total=${total},started=${started},unassigned=${unass})"

echo "[RELAX] Done. Remember to run scripts/es-watermark-restore.sh after freeing disk space and cluster becomes stable."
|
||||||
37
deployment/build/templates/scripts/es-watermark-restore.sh
Normal file
37
deployment/build/templates/scripts/es-watermark-restore.sh
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
#!/usr/bin/env bash
# es-watermark-restore.sh - undo the transient changes made by
# es-watermark-relax.sh: re-enable the disk threshold and reset the three
# watermark settings to their defaults (by setting them to null).
#
# Environment:
#   ES_HTTP_PORT (9200)           - host port of Elasticsearch
#   RESTORE_DEFAULT_REPLICAS (0)  - set to 1 to also request replicas=1
# Values are also loaded from ../compose/.env when that file exists.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
ENV_FILE="$ROOT/compose/.env"
if [[ -f "$ENV_FILE" ]]; then
  # Export everything the .env file defines.
  set -a
  # shellcheck disable=SC1090
  source "$ENV_FILE"
  set +a
fi

ES_URL="http://localhost:${ES_HTTP_PORT:-9200}"

echo "[RESTORE] Checking Elasticsearch at $ES_URL"
code=$(curl -s -o /dev/null -w '%{http_code}' "$ES_URL/_cluster/health" || true)
if [[ "$code" != "200" ]]; then
  echo "[RESTORE][ERROR] ES not reachable (code=$code). Ensure argus-es-sys is running." >&2
  exit 1
fi

echo "[RESTORE] Re-enabling disk threshold and clearing relaxed watermarks (transient)"
# curl exits 0 even when ES answers with HTTP 4xx/5xx, so append the status
# code on its own last line and check it explicitly.
resp=$(curl -sS -w '\n%{http_code}' -H 'Content-Type: application/json' -X PUT "$ES_URL/_cluster/settings" -d '{
  "transient": {
    "cluster.routing.allocation.disk.threshold_enabled": true,
    "cluster.routing.allocation.disk.watermark.low": null,
    "cluster.routing.allocation.disk.watermark.high": null,
    "cluster.routing.allocation.disk.watermark.flood_stage": null
  }
}')
put_code=${resp##*$'\n'}
printf '%s\n' "${resp%$'\n'*}" | sed -n '1,5p'
if [[ "$put_code" != "200" ]]; then
  echo "[RESTORE][ERROR] ES rejected the settings update (code=$put_code); watermarks are NOT restored." >&2
  exit 1
fi

# Optionally restore default replicas to 1 (set RESTORE_DEFAULT_REPLICAS=1 to enable)
if [[ "${RESTORE_DEFAULT_REPLICAS:-0}" == "1" ]]; then
  echo "[RESTORE] Setting transient default index.number_of_replicas=1"
  # NOTE(review): index-level settings may not be accepted by
  # _cluster/settings on current Elasticsearch versions - confirm. The status
  # code is reported so a rejection is visible instead of silently ignored.
  rep_code=$(curl -sS -o /dev/null -w '%{http_code}' -H 'Content-Type: application/json' -X PUT "$ES_URL/_cluster/settings" -d '{"transient":{"index.number_of_replicas":"1"}}' || true)
  if [[ "$rep_code" != "200" ]]; then
    echo "[RESTORE][WARN] replicas setting was not applied (code=${rep_code:-?}); continuing." >&2
  fi
fi

echo "[RESTORE] Cluster health:"
curl -sS "$ES_URL/_cluster/health?pretty" | sed -n '1,80p'

echo "[RESTORE] Done. Verify shards and consider keeping replicas=0 for single-node deployments."
|
||||||
|
|
||||||
@ -89,6 +89,29 @@ logd "Web-Proxy 8083: $(http_code "http://localhost:${WEB_PROXY_PORT_8083:-8083}
|
|||||||
logd "Web-Proxy 8084 CORS: ${cors8084}"
|
logd "Web-Proxy 8084 CORS: ${cors8084}"
|
||||||
logd "Web-Proxy 8085 CORS: ${cors8085}"
|
logd "Web-Proxy 8085 CORS: ${cors8085}"
|
||||||
|
|
||||||
|
# Elasticsearch deep checks: disk watermark and Kibana index status
section ES-CHECKS
ch=$(curl -s --max-time 3 "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health" || true)
# Look the "status" value up by key. The response is single-line JSON whose
# first string value is the cluster name, so a positional field split would
# report the cluster name instead of the health colour.
status=$(printf '%s\n' "$ch" | sed -n 's/.*"status"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' | tail -n1)
if [[ -n "$status" ]]; then logd "cluster.status=$status"; fi
if [[ "$status" != "green" ]]; then append_err "[es][cluster] status=$status"; fi

if docker ps --format '{{.Names}}' | grep -q '^argus-es-sys$'; then
  duse=$(docker exec argus-es-sys sh -lc 'df -P /usr/share/elasticsearch/data | awk "NR==2{print \$5}"' 2>/dev/null || true)
  logd "es.data.df_use=$duse"
  usep=${duse%\%}
  # Compare only when df produced a plain integer; 10# forces base 10.
  if [[ "$usep" =~ ^[0-9]+$ ]] && (( 10#$usep >= 90 )); then
    append_err "[es][disk] data path usage ${duse} (>=90%) likely hit high/flood watermarks"
    echo "HINT: High ES disk usage. Free space or run scripts/es-watermark-relax.sh (then scripts/es-watermark-restore.sh)." >&2
  fi
fi

ks=$(curl -s --max-time 3 "http://localhost:${ES_HTTP_PORT:-9200}/_cat/shards/.kibana*?h=index,shard,prirep,state,unassigned.reason" || true)
# -w gives whole-word matching; a doubled backslash inside single quotes
# would make grep look for a literal backslash and never match.
if printf '%s' "$ks" | grep -Eiqw 'UNASSIGNED|INITIALIZING|RELOCATING'; then
  append_err "[kibana][index] .kibana* shards not green"; logd "$ks"
  echo "HINT: .kibana* shards not green. On single-node, set replicas=0 and ensure ES disk watermarks are not exceeded." >&2
fi
|
||||||
|
|
||||||
# Overlay network diagnostics
|
# Overlay network diagnostics
|
||||||
section OVERLAY-NET
|
section OVERLAY-NET
|
||||||
if docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" >/dev/null 2>&1; then
|
if docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" >/dev/null 2>&1; then
|
||||||
|
|||||||
@ -33,6 +33,89 @@ prepare_env() {
|
|||||||
# overlay 模式下,避免为不同服务分配到同一新端口;保持模板端口不做自动改写
|
# overlay 模式下,避免为不同服务分配到同一新端口;保持模板端口不做自动改写
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Look up a variable in the .env file (simple KEY=VALUE parser).
# Globals:   ENV_FILE (read) - path of the .env file
# Arguments: $1 - variable name to look up
# Outputs:   everything after the first "=" of the first matching line
# Returns:   0 when the key was found, 1 when the file or the key is missing
_read_env_var() {
  local wanted="$1"
  local env_path="$ENV_FILE"
  if [[ ! -f "$env_path" ]]; then
    return 1
  fi
  awk -F'=' -v k="$wanted" '
    $1 == k {
      print substr($0, index($0, "=") + 1)
      hit = 1
      exit
    }
    END { exit(hit ? 0 : 1) }
  ' "$env_path"
}
|
||||||
|
|
||||||
|
# Set VAR=VAL in the .env file: every existing "VAR=" line is replaced, or a
# new line is appended when the key is absent (the file is created if needed).
# Globals:   ENV_FILE (read) - path of the .env file to update
# Arguments: $1 - variable name
#            $2 - value, written verbatim (no quoting or escaping applied)
# Returns:   0 on success, non-zero when the file could not be rewritten
_set_env_var() {
  local var="$1" val="$2" f="$ENV_FILE"
  local tmp="$ENV_FILE.tmp$$"
  if [[ -f "$f" ]] && grep -q -- "^${var}=" "$f"; then
    # cp -p seeds the temp file with the original's mode, so the final mv does
    # not reset the permissions of .env. The value reaches awk through the
    # environment, so characters such as "&", "#" or "\" are copied literally
    # rather than being interpreted as replacement syntax.
    if cp -p -- "$f" "$tmp" \
      && _SEV_KEY="$var" _SEV_VAL="$val" awk '
          BEGIN { k = ENVIRON["_SEV_KEY"] "="; v = ENVIRON["_SEV_VAL"] }
          index($0, k) == 1 { print k v; next }
          { print }
        ' "$f" >"$tmp" \
      && mv -- "$tmp" "$f"; then
      return 0
    fi
    rm -f -- "$tmp"
    return 1
  fi
  [[ -f "$f" ]] || : >"$f"
  # If the file does not end with a newline, add one first so the new entry
  # is not glued onto the last existing line.
  if [[ -s "$f" && -n "$(tail -c 1 -- "$f")" ]]; then
    printf '\n' >>"$f"
  fi
  printf '%s=%s\n' "$var" "$val" >>"$f"
}
|
||||||
|
|
||||||
|
# Pick a free host port for every published service port and persist the
# result in the .env file.
#
# For each VAR the current value (or the built-in default) is kept when the
# port is neither listening on the host nor already handed out during this
# run; otherwise a replacement is requested from find_free_port.
#
# Globals:
#   AUTO_ASSIGN_PORTS (read) - 0|false|no|off disables the whole step
#   ENV_FILE (read)          - .env file to inspect and rewrite
# Uses helpers defined elsewhere in this file: log, err, find_free_port,
# _read_env_var, _set_env_var.
# Side effects: copies $ENV_FILE to a timestamped .bak file, rewrites
# $ENV_FILE, and calls `exit 1` when no free port can be found.
auto_assign_ports() {
  local enable="${AUTO_ASSIGN_PORTS:-true}"
  case "$enable" in
    0|false|no|off) log "AUTO_ASSIGN_PORTS disabled"; return;;
  esac
  [[ -f "$ENV_FILE" ]] || return 0
  log "auto-assigning free host ports (with fallback)"
  cp "$ENV_FILE" "$ENV_FILE.bak.$(date +%Y%m%d-%H%M%S)" || true

  # list of VAR:default pairs to try; FTP ports and the passive port range
  # are deliberately not rewritten automatically
  local pairs=(
    "MASTER_PORT:32300"
    "ES_HTTP_PORT:9200"
    "KIBANA_PORT:5601"
    "PROMETHEUS_PORT:9090"
    "ALERTMANAGER_PORT:9093"
    "GRAFANA_PORT:3000"
    "WEB_PROXY_PORT_8080:8080"
    "WEB_PROXY_PORT_8081:8081"
    "WEB_PROXY_PORT_8082:8082"
    "WEB_PROXY_PORT_8083:8083"
    "WEB_PROXY_PORT_8084:8084"
    "WEB_PROXY_PORT_8085:8085"
  )

  # listening: TCP ports in LISTEN state on the host (snapshot taken once).
  # reserved:  listening ports plus every port handed out during this run.
  declare -A listening=()
  declare -A reserved=()
  local lp
  while read -r lp; do
    if [[ -n "$lp" ]]; then
      listening["$lp"]=1
      reserved["$lp"]=1
    fi
  done < <(ss -ltnH 2>/dev/null | awk '{print $4}' | sed -n 's#.*:##p')

  local ent var def cur cand attempts start next
  for ent in "${pairs[@]}"; do
    var=${ent%%:*}
    def=${ent##*:}
    if ! cur=$(_read_env_var "$var"); then cur="$def"; fi
    # strip carriage returns (CRLF-edited .env), quotes and whitespace
    cur=${cur//$'\r'/}
    cur=${cur//\"/}
    cur=${cur//\'/}
    cur=${cur//[[:space:]]/}
    # an empty or non-numeric value cannot be used as a port
    if [[ ! "$cur" =~ ^[0-9]+$ ]]; then
      log "  $var has no usable port value ('$cur'); using default $def"
      cur="$def"
    fi

    cand="$cur"
    # if the port is already in use on the host, ask for a free one
    if [[ -n "${listening[$cand]:-}" ]]; then
      cand=$(find_free_port "$cand" 20000 65000) || cand=""
      if [[ -z "$cand" ]]; then
        err "no free port available while assigning for $var (last tried: $cur)"; exit 1
      fi
    fi

    # avoid duplicates chosen earlier in this loop
    attempts=0
    while [[ -n "${reserved[$cand]:-}" ]]; do
      attempts=$((attempts+1))
      start=$((cand+1))
      if (( start < 20000 )); then start=20000; fi
      next=$(find_free_port "$start" "$start" 65000 || true)
      if [[ -z "$next" ]]; then
        next=$(find_free_port 20000 20000 65000 || true)
      fi
      if [[ -z "$next" || "$next" == "$cand" ]]; then
        err "no free port available while assigning for $var (last tried: $cand)"; exit 1
      fi
      cand="$next"
      if (( attempts > 1000 )); then err "port assignment loop exceeded for $var"; exit 1; fi
    done
    reserved["$cand"]=1

    if [[ "$cand" != "$cur" ]]; then
      log "  port reassigned: $var $cur -> $cand"
    fi
    # always write the value so the variable exists in .env for clarity
    _set_env_var "$var" "$cand"
  done
}
|
||||||
|
|
||||||
prepare_data_dirs() {
|
prepare_data_dirs() {
|
||||||
if [[ $EUID -ne 0 ]]; then
|
if [[ $EUID -ne 0 ]]; then
|
||||||
echo -e "\033[1;33m[WARN]\033[0m running as non-root: will not chown data dirs."
|
echo -e "\033[1;33m[WARN]\033[0m running as non-root: will not chown data dirs."
|
||||||
@ -269,6 +352,7 @@ selfcheck() {
|
|||||||
main() {
|
main() {
|
||||||
mkdir -p "$PKG_ROOT/logs"
|
mkdir -p "$PKG_ROOT/logs"
|
||||||
prepare_env
|
prepare_env
|
||||||
|
auto_assign_ports
|
||||||
prepare_data_dirs
|
prepare_data_dirs
|
||||||
load_images
|
load_images
|
||||||
bring_up
|
bring_up
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user