[#37] 修复alert镜像用户

This commit is contained in:
yuyr 2025-11-07 12:21:41 +08:00
parent 7548e46d1f
commit 1819fb9c46
5 changed files with 91 additions and 24 deletions

View File

@ -6,7 +6,7 @@ user=root
[program:alertmanager] [program:alertmanager]
command=/usr/local/bin/start-am-supervised.sh command=/usr/local/bin/start-am-supervised.sh
user=ubuntu user=alertmanager
stdout_logfile=/var/log/supervisor/alertmanager.log stdout_logfile=/var/log/supervisor/alertmanager.log
stderr_logfile=/var/log/supervisor/alertmanager_error.log stderr_logfile=/var/log/supervisor/alertmanager_error.log
autorestart=true autorestart=true

View File

@ -1,21 +0,0 @@
SERVER_PROJECT=argus-swarm-server
NODES_PROJECT=argus-swarm-nodes
# Host ports for server compose
MASTER_PORT=32300
ES_HTTP_PORT=9200
KIBANA_PORT=5601
PROMETHEUS_PORT=9090
GRAFANA_PORT=3000
ALERTMANAGER_PORT=9093
WEB_PROXY_PORT_8080=8080
WEB_PROXY_PORT_8081=8081
WEB_PROXY_PORT_8082=8082
WEB_PROXY_PORT_8083=8083
WEB_PROXY_PORT_8084=8084
WEB_PROXY_PORT_8085=8085
# UID/GID for volume ownership in containers
ARGUS_BUILD_UID=1000
ARGUS_BUILD_GID=1000

View File

@ -1,5 +1,5 @@
BINDIP=10.0.1.5 BINDIP=10.0.4.25
FTPIP=10.0.1.4 FTPIP=10.0.4.29
MASTER_ENDPOINT=http://master.argus.com:3000 MASTER_ENDPOINT=http://master.argus.com:3000
FTP_USER=ftpuser FTP_USER=ftpuser
FTP_PASSWORD=ZGClab1234! FTP_PASSWORD=ZGClab1234!

View File

@ -1,2 +1,7 @@
private-*/ private-*/
tmp/
.env
.env.nodes

View File

@ -0,0 +1,83 @@
#!/usr/bin/env bash
# Bootstrap: strict mode, path discovery, optional compose env file, tunables.
set -euo pipefail

# Directory containing this script, and the directory one level above it.
self_path="${BASH_SOURCE[0]}"
SCRIPT_DIR="$(cd "$(dirname "$self_path")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

# Source compose/.env when it exists. allexport (set -a) is switched on for the
# sourcing so that every variable the file assigns is exported.
ENV_FILE="$ROOT/compose/.env"
if [[ -f "$ENV_FILE" ]]; then
  set -a && source "$ENV_FILE" && set +a
fi

# Elasticsearch is addressed on localhost; port from ES_HTTP_PORT (default 9200).
printf -v ES_URL 'http://localhost:%s' "${ES_HTTP_PORT:-9200}"

# Tunables (env overrides): each keeps a non-empty value from the environment
# and otherwise falls back to the default shown.
: "${RELAX_WM_LOW:=99%}"
: "${RELAX_WM_HIGH:=99%}"
: "${RELAX_WM_FLOOD:=99%}"
: "${DISABLE_WATERMARK:=1}"
: "${SET_KIBANA_REPLICAS_ZERO:=1}"
: "${CLEAR_READONLY_BLOCKS:=1}"
# Preflight: request the cluster health endpoint and keep only the HTTP status
# code. A curl failure is tolerated here so that the check below reports it.
echo "[RELAX] Checking Elasticsearch at $ES_URL"
code=$(curl -s -o /dev/null -w '%{http_code}' "$ES_URL/_cluster/health" || true)
case "$code" in
  200)
    ;;
  *)
    # Anything other than HTTP 200 is treated as "not reachable".
    echo "[RELAX][ERROR] ES not reachable (code=$code). Ensure argus-es-sys is running." >&2
    exit 1
    ;;
esac
echo "[RELAX] Applying transient cluster settings (watermarks)"

# threshold_enabled is false exactly when DISABLE_WATERMARK is "1".
if [[ "$DISABLE_WATERMARK" == "1" ]]; then
  th_enabled=false
else
  th_enabled=true
fi

# Request body for the transient cluster settings update.
wm_payload=$(cat <<EOF
{
"transient": {
"cluster.routing.allocation.disk.threshold_enabled": $th_enabled,
"cluster.routing.allocation.disk.watermark.low": "$RELAX_WM_LOW",
"cluster.routing.allocation.disk.watermark.high": "$RELAX_WM_HIGH",
"cluster.routing.allocation.disk.watermark.flood_stage": "$RELAX_WM_FLOOD"
}
}
EOF
)

# Send the update and show at most the first five lines of the response.
curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_cluster/settings" -d "$wm_payload" | sed -n '1,5p'
# Optional step, active only when CLEAR_READONLY_BLOCKS is exactly "1".
case "$CLEAR_READONLY_BLOCKS" in
  1)
    echo "[RELAX] Clearing read_only/read_only_allow_delete blocks on all indices (best-effort)"
    unblock_payload='{
"index.blocks.read_only": false,
"index.blocks.read_only_allow_delete": false
}'
    # Best-effort: response is discarded and a curl failure is ignored.
    curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_all/_settings" -d "$unblock_payload" >/dev/null || true
    ;;
esac
# Print the names of existing indices matching .kibana*, one per line.
# Best-effort: never returns non-zero, even under `set -e -o pipefail`, so a
# failed listing cannot abort the script. curl -f suppresses the body of an
# HTTP error response so an error document is never mistaken for index names.
_relax_list_kibana_indices() {
  curl -sS -f "$ES_URL/_cat/indices/.kibana*?h=index" | awk '{print $1}' || true
}

# Set number_of_replicas=0 for .kibana* indices: first through an index
# template for that pattern, then on every index that already exists.
# Every request is best-effort; failures are ignored.
_relax_kibana_replicas_zero() {
  local idx
  echo "[RELAX] Ensure .kibana* use replicas=0 via index template and per-index settings (best-effort)"
  # high priority template for .kibana* only, avoid impacting other indices
  curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_index_template/kibana-replicas-0" -d '{
"index_patterns": [".kibana*"],
"priority": 200,
"template": { "settings": { "number_of_replicas": 0 } }
}' >/dev/null || true
  # set existing .kibana* to replicas=0 (names are read on fd 3 so the loop
  # body cannot consume the list through stdin)
  while IFS= read -r idx <&3; do
    [[ -n "$idx" ]] || continue
    curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/$idx/_settings" -d '{"index":{"number_of_replicas":0}}' >/dev/null || true
  done 3< <(_relax_list_kibana_indices)
}

# Runs unless SET_KIBANA_REPLICAS_ZERO is exactly "0" (unset/empty counts as 1).
if [[ "${SET_KIBANA_REPLICAS_ZERO:-1}" != "0" ]]; then
  _relax_kibana_replicas_zero
fi
# Best-effort: ask the cluster to retry shard allocations that failed earlier.
reroute_opts=(-sS -H 'Content-Type: application/json' -X POST -d '{}')
curl "${reroute_opts[@]}" "$ES_URL/_cluster/reroute?retry_failed=true" >/dev/null || true

# Print at most the first 80 lines of the pretty-printed cluster health.
echo "[RELAX] Cluster health (post):"
health_url="$ES_URL/_cluster/health?pretty"
curl -sS "$health_url" | sed -n '1,80p'
# Print the scalar value stored under an exact JSON key.
#   $1 - key name, without the surrounding quotes
#   $2 - JSON text, compact (single line) or pretty-printed
# Output: the value with surrounding double quotes removed; if the key occurs
#         more than once, the last occurrence is printed.
# Returns 0 always: a missing key or empty input prints nothing instead of
# failing, so callers running under `set -e -o pipefail` are not aborted and
# can apply their own `${var:-?}` fallback.
# This is a deliberately small text match (no new tool dependency); it only
# handles scalar values that contain no ',' or '}'.
_relax_json_value() {
  local key=$1 json=$2 key_re
  key_re=${key//./\\.} # match dots in the key literally
  printf '%s' "$json" \
    | grep -o "\"${key_re}\"[[:space:]]*:[[:space:]]*[^,}]*" \
    | tail -n1 \
    | sed -e 's/^[^:]*:[[:space:]]*//' -e 's/[[:space:]]*$//' -e 's/^"//' -e 's/"$//' \
    || true
}

# Simple current status summary
# The health document is fetched in compact form (one line), so fields are
# extracted by key rather than by their position on the line.
ch=$(curl -sS "$ES_URL/_cluster/health" || true)
status=$(_relax_json_value status "$ch")
unassigned=$(_relax_json_value unassigned_shards "$ch")

# Use% column of df for the ES data directory inside the argus-es-sys container.
duse=$(docker exec argus-es-sys sh -lc 'df -P /usr/share/elasticsearch/data | awk "NR==2{print \$5}"' 2>/dev/null || true)

# Disk-allocation settings as currently stored in the cluster (flat key form).
settings=$(curl -sS "$ES_URL/_cluster/settings?flat_settings=true" || true)
th=$(_relax_json_value cluster.routing.allocation.disk.threshold_enabled "$settings")
low=$(_relax_json_value cluster.routing.allocation.disk.watermark.low "$settings")
high=$(_relax_json_value cluster.routing.allocation.disk.watermark.high "$settings")
flood=$(_relax_json_value cluster.routing.allocation.disk.watermark.flood_stage "$settings")

# One state per line for every .kibana* shard; count all / STARTED / UNASSIGNED.
ks=$(curl -sS "$ES_URL/_cat/shards/.kibana*?h=state" || true)
total=$(printf '%s' "$ks" | awk 'NF{c++} END{print c+0}')
started=$(printf '%s' "$ks" | awk '/STARTED/{c++} END{print c+0}')
unass=$(printf '%s' "$ks" | awk '/UNASSIGNED/{c++} END{print c+0}')

echo "[RELAX][SUMMARY] status=${status:-?} unassigned=${unassigned:-?} es.data.use=${duse:-?} watermarks(threshold=${th:-?} low=${low:-?} high=${high:-?} flood=${flood:-?}) kibana_shards(total=${total},started=${started},unassigned=${unass})"
echo "[RELAX] Done. Remember to run scripts/es-watermark-restore.sh after freeing disk space and cluster becomes stable."