[#37] 修复alert镜像用户

2025-11-07 12:21:41 +08:00 · 2025-11-07 12:21:41 +08:00 · 1819fb9c46
commit 1819fb9c46
parent 7548e46d1f
5 changed files with 91 additions and 24 deletions
--- a/src/alert/alertmanager/build/supervisord.conf
+++ b/src/alert/alertmanager/build/supervisord.conf
@@ -6,7 +6,7 @@ user=root
 [program:alertmanager]
 command=/usr/local/bin/start-am-supervised.sh
-user=ubuntu
+user=alertmanager
 stdout_logfile=/var/log/supervisor/alertmanager.log
 stderr_logfile=/var/log/supervisor/alertmanager_error.log
 autorestart=true
--- a/src/sys/swarm_tests/.env
+++ b/src/sys/swarm_tests/.env
@@ -1,21 +0,0 @@
 SERVER_PROJECT=argus-swarm-server
 NODES_PROJECT=argus-swarm-nodes
 # Host ports for server compose
 MASTER_PORT=32300
 ES_HTTP_PORT=9200
 KIBANA_PORT=5601
 PROMETHEUS_PORT=9090
 GRAFANA_PORT=3000
 ALERTMANAGER_PORT=9093
 WEB_PROXY_PORT_8080=8080
 WEB_PROXY_PORT_8081=8081
 WEB_PROXY_PORT_8082=8082
 WEB_PROXY_PORT_8083=8083
 WEB_PROXY_PORT_8084=8084
 WEB_PROXY_PORT_8085=8085
 # UID/GID for volume ownership in containers
 ARGUS_BUILD_UID=1000
 ARGUS_BUILD_GID=1000
--- a/src/sys/swarm_tests/.env.nodes.template
+++ b/src/sys/swarm_tests/.env.nodes.template
@@ -1,5 +1,5 @@
-BINDIP=10.0.1.5
+BINDIP=10.0.4.25
-FTPIP=10.0.1.4
+FTPIP=10.0.4.29
 MASTER_ENDPOINT=http://master.argus.com:3000
 FTP_USER=ftpuser
 FTP_PASSWORD=ZGClab1234!
--- a/src/sys/swarm_tests/.gitignore
+++ b/src/sys/swarm_tests/.gitignore
@@ -1,2 +1,7 @@
 private-*/
 tmp/
 .env
 .env.nodes
--- a/src/sys/swarm_tests/scripts/es-relax.sh
+++ b/src/sys/swarm_tests/scripts/es-relax.sh
@@ -0,0 +1,83 @@
 #!/usr/bin/env bash
 # Relaxes Elasticsearch disk-watermark related settings for the local test stack.
 #
 # Optional environment overrides read by this script:
 #   ES_HTTP_PORT              port used to build the localhost ES URL (default 9200)
 #   RELAX_WM_LOW/HIGH/FLOOD   watermark values to apply (default 99% each)
 #   DISABLE_WATERMARK         "1" sets threshold_enabled to false (default 1)
 #   SET_KIBANA_REPLICAS_ZERO  any value other than "0" sets .kibana* replicas to 0 (default 1)
 #   CLEAR_READONLY_BLOCKS     "1" clears read-only index blocks (default 1)
 set -euo pipefail
 SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
 ROOT=$(cd "$SCRIPT_DIR/.." && pwd)
 # When compose/.env exists, source it with allexport on so its variables are exported.
 ENV_FILE="$ROOT/compose/.env"
 [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a
 ES_URL="http://localhost:${ES_HTTP_PORT:-9200}"
 # Tunables (env overrides): a non-empty value from the environment wins over the default.
 : "${RELAX_WM_LOW:=99%}"
 : "${RELAX_WM_HIGH:=99%}"
 : "${RELAX_WM_FLOOD:=99%}"
 : "${DISABLE_WATERMARK:=1}"
 : "${SET_KIBANA_REPLICAS_ZERO:=1}"
 : "${CLEAR_READONLY_BLOCKS:=1}"
 # Abort early unless the cluster health endpoint answers with HTTP 200.
 echo "[RELAX] Checking Elasticsearch at $ES_URL"
 http_status=$(curl -s -o /dev/null -w '%{http_code}' "$ES_URL/_cluster/health" || true)
 [[ "$http_status" == "200" ]] || {
  echo "[RELAX][ERROR] ES not reachable (code=$http_status). Ensure argus-es-sys is running." >&2
  exit 1
 }
 # Apply the relaxed disk-allocation settings as transient cluster settings and
 # show the first lines of the response.
 echo "[RELAX] Applying transient cluster settings (watermarks)"
 # threshold_enabled is the inverse of DISABLE_WATERMARK: "1" means the check is switched off.
 if [[ "$DISABLE_WATERMARK" == "1" ]]; then
  th_enabled=false
 else
  th_enabled=true
 fi
 watermark_settings="{
  \"transient\": {
    \"cluster.routing.allocation.disk.threshold_enabled\": $th_enabled,
    \"cluster.routing.allocation.disk.watermark.low\": \"$RELAX_WM_LOW\",
    \"cluster.routing.allocation.disk.watermark.high\": \"$RELAX_WM_HIGH\",
    \"cluster.routing.allocation.disk.watermark.flood_stage\": \"$RELAX_WM_FLOOD\"
  }
 }"
 curl --silent --show-error --request PUT --header 'Content-Type: application/json' \
  --data "$watermark_settings" "$ES_URL/_cluster/settings" | sed -n '1,5p'
 # Optional step: reset both read-only block flags on every index.
 # The response is discarded and a curl failure is tolerated (best-effort).
 if [[ "$CLEAR_READONLY_BLOCKS" == "1" ]]; then
  echo "[RELAX] Clearing read_only/read_only_allow_delete blocks on all indices (best-effort)"
  curl --silent --show-error --request PUT --header 'Content-Type: application/json' "$ES_URL/_all/_settings" --data '{
    "index.blocks.read_only": false,
    "index.blocks.read_only_allow_delete": false
  }' >/dev/null || true
 fi
 # Force .kibana* indices to zero replicas, both for future indices (index
 # template) and for the ones that already exist. Runs unless
 # SET_KIBANA_REPLICAS_ZERO is "0". Every request here is best-effort.
 if [[ "${SET_KIBANA_REPLICAS_ZERO:-1}" != "0" ]]; then
  echo "[RELAX] Ensure .kibana* use replicas=0 via index template and per-index settings (best-effort)"
  # high priority template for .kibana* only, avoid impacting other indices
  curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_index_template/kibana-replicas-0" -d '{
    "index_patterns": [".kibana*"],
    "priority": 200,
    "template": { "settings": { "number_of_replicas": 0 } }
  }' >/dev/null || true
  # List existing .kibana* indices, one name per line.
  # -f makes curl fail (with no body) on an HTTP error instead of handing an
  # error document to awk as if it were index names; "|| true" keeps a failed
  # listing from aborting the script under "set -e -o pipefail".
  kibana_indices=$(curl -fsS "$ES_URL/_cat/indices/.kibana*?h=index" | awk '{print $1}') || true
  # set existing .kibana* to replicas=0
  # Read line by line rather than word-splitting an unquoted variable, so the
  # names are never subject to pathname expansion.
  while IFS= read -r idx; do
    [[ -n "$idx" ]] || continue
    curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/$idx/_settings" -d '{"index":{"number_of_replicas":0}}' >/dev/null || true
  done <<< "${kibana_indices:-}"
 fi
 # Ask the cluster to retry shard allocations that failed earlier; the response
 # is discarded and a curl failure is tolerated (best-effort).
 curl --silent --show-error --request POST --header 'Content-Type: application/json' \
  --data '{}' "$ES_URL/_cluster/reroute?retry_failed=true" >/dev/null || true
 # Print the pretty-printed cluster health, capped at the first 80 lines.
 echo "[RELAX] Cluster health (post):"
 curl --silent --show-error "$ES_URL/_cluster/health?pretty" | sed -n '1,80p'
 # Simple current status summary
 #
 # relax_json_value KEY [first|last]
 #   Reads JSON text on stdin and prints the scalar value stored under the
 #   quoted key KEY, with quotes and surrounding blanks removed.
 #   Works on compact single-line JSON as well as pretty-printed JSON.
 #   When KEY occurs more than once, "first" (default) or "last" selects which
 #   occurrence is printed. Prints nothing and still returns 0 when KEY is
 #   absent, so a missing key cannot abort the script under "set -e -o pipefail".
 #   Values are cut at the first "," or "}", so only simple scalars are supported.
 relax_json_value() {
  awk -v key="$1" -v pick="${2:-first}" '
    BEGIN { needle = "\"" key "\"" }
    {
      rest = $0
      while ((pos = index(rest, needle)) > 0) {
        rest = substr(rest, pos + length(needle))
        # Only a quoted string that is followed by ":" is a key.
        if (match(rest, /^[ \t]*:[ \t]*/)) {
          val = substr(rest, RLENGTH + 1)
          sub(/[,}].*/, "", val)
          gsub(/[" \t\r]/, "", val)
          if (found == 0 || pick == "last") result = val
          found = 1
        }
      }
    }
    END { if (found) print result }
  '
 }
 # The health document is requested without ?pretty, i.e. as one compact line,
 # so values are looked up by key rather than by field position.
 ch=$(curl -sS "$ES_URL/_cluster/health" || true)
 status=$(printf '%s' "$ch" | relax_json_value status)
 unassigned=$(printf '%s' "$ch" | relax_json_value unassigned_shards)
 # Disk usage percentage of the ES data directory, read inside the argus-es-sys container.
 duse=$(docker exec argus-es-sys sh -lc 'df -P /usr/share/elasticsearch/data | awk "NR==2{print \$5}"' 2>/dev/null || true)
 settings=$(curl -sS "$ES_URL/_cluster/settings?flat_settings=true" || true)
 # "last" keeps the previous tail -n1 behaviour: the final occurrence of a key wins.
 th=$(printf '%s' "$settings" | relax_json_value cluster.routing.allocation.disk.threshold_enabled last)
 low=$(printf '%s' "$settings" | relax_json_value cluster.routing.allocation.disk.watermark.low last)
 high=$(printf '%s' "$settings" | relax_json_value cluster.routing.allocation.disk.watermark.high last)
 flood=$(printf '%s' "$settings" | relax_json_value cluster.routing.allocation.disk.watermark.flood_stage last)
 # Count .kibana* shards overall and by state (one state per output line).
 ks=$(curl -sS "$ES_URL/_cat/shards/.kibana*?h=state" || true)
 total=$(printf '%s' "$ks" | awk 'NF{c++} END{print c+0}')
 started=$(printf '%s' "$ks" | awk '/STARTED/{c++} END{print c+0}')
 unass=$(printf '%s' "$ks" | awk '/UNASSIGNED/{c++} END{print c+0}')
 echo "[RELAX][SUMMARY] status=${status:-?} unassigned=${unassigned:-?} es.data.use=${duse:-?} watermarks(threshold=${th:-?} low=${low:-?} high=${high:-?} flood=${flood:-?}) kibana_shards(total=${total},started=${started},unassigned=${unass})"
 echo "[RELAX] Done. Remember to run scripts/es-watermark-restore.sh after freeing disk space and cluster becomes stable."