diff --git a/src/alert/alertmanager/build/supervisord.conf b/src/alert/alertmanager/build/supervisord.conf
index da05ac7..d284547 100644
--- a/src/alert/alertmanager/build/supervisord.conf
+++ b/src/alert/alertmanager/build/supervisord.conf
@@ -6,7 +6,7 @@ user=root
 
 [program:alertmanager]
 command=/usr/local/bin/start-am-supervised.sh
-user=ubuntu
+user=alertmanager
 stdout_logfile=/var/log/supervisor/alertmanager.log
 stderr_logfile=/var/log/supervisor/alertmanager_error.log
 autorestart=true
diff --git a/src/sys/swarm_tests/.env b/src/sys/swarm_tests/.env
deleted file mode 100644
index ca39819..0000000
--- a/src/sys/swarm_tests/.env
+++ /dev/null
@@ -1,21 +0,0 @@
-SERVER_PROJECT=argus-swarm-server
-NODES_PROJECT=argus-swarm-nodes
-
-# Host ports for server compose
-MASTER_PORT=32300
-ES_HTTP_PORT=9200
-KIBANA_PORT=5601
-PROMETHEUS_PORT=9090
-GRAFANA_PORT=3000
-ALERTMANAGER_PORT=9093
-WEB_PROXY_PORT_8080=8080
-WEB_PROXY_PORT_8081=8081
-WEB_PROXY_PORT_8082=8082
-WEB_PROXY_PORT_8083=8083
-WEB_PROXY_PORT_8084=8084
-WEB_PROXY_PORT_8085=8085
-
-# UID/GID for volume ownership in containers
-ARGUS_BUILD_UID=1000
-ARGUS_BUILD_GID=1000
-
diff --git a/src/sys/swarm_tests/.env.nodes b/src/sys/swarm_tests/.env.nodes.template
similarity index 81%
rename from src/sys/swarm_tests/.env.nodes
rename to src/sys/swarm_tests/.env.nodes.template
index 58b8a01..7004b30 100644
--- a/src/sys/swarm_tests/.env.nodes
+++ b/src/sys/swarm_tests/.env.nodes.template
@@ -1,5 +1,5 @@
-BINDIP=10.0.1.5
-FTPIP=10.0.1.4
+BINDIP=10.0.4.25
+FTPIP=10.0.4.29
 MASTER_ENDPOINT=http://master.argus.com:3000
 FTP_USER=ftpuser
 FTP_PASSWORD=ZGClab1234!
diff --git a/src/sys/swarm_tests/.gitignore b/src/sys/swarm_tests/.gitignore
index c333a3f..3ae67f6 100644
--- a/src/sys/swarm_tests/.gitignore
+++ b/src/sys/swarm_tests/.gitignore
@@ -1,2 +1,7 @@
 private-*/
 
+
+tmp/
+
+.env
+.env.nodes
diff --git a/src/sys/swarm_tests/scripts/es-relax.sh b/src/sys/swarm_tests/scripts/es-relax.sh
new file mode 100755
index 0000000..3b0910f
--- /dev/null
+++ b/src/sys/swarm_tests/scripts/es-relax.sh
@@ -0,0 +1,134 @@
+#!/usr/bin/env bash
+#
+# es-relax.sh - relax Elasticsearch disk-watermark settings via localhost.
+#
+# What it does, in order (all requests go to http://localhost:$ES_HTTP_PORT):
+#   1. Exits 1 unless GET /_cluster/health answers HTTP 200.
+#   2. PUTs transient cluster settings for the disk threshold and watermarks.
+#   3. Clears read_only / read_only_allow_delete blocks on all indices.
+#   4. Forces number_of_replicas=0 on .kibana* (index template + live indices).
+#   5. Retries failed shard allocations, then prints health and a summary.
+# Steps 3-5 are best-effort: their failures never abort the script.
+#
+# Environment overrides (may also come from the sourced env file):
+#   ES_HTTP_PORT              Elasticsearch host port        (default 9200)
+#   RELAX_WM_LOW              disk.watermark.low             (default 99%)
+#   RELAX_WM_HIGH             disk.watermark.high            (default 99%)
+#   RELAX_WM_FLOOD            disk.watermark.flood_stage     (default 99%)
+#   DISABLE_WATERMARK         1 => threshold_enabled=false   (default 1)
+#   SET_KIBANA_REPLICAS_ZERO  0 => skip step 4               (default 1)
+#   CLEAR_READONLY_BLOCKS     1 => run step 3                (default 1)
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+
+# Prefer compose/.env; fall back to $ROOT/.env for layouts that keep the env
+# file at the suite root. Loading is optional: a missing file is not an error.
+ENV_FILE="$ROOT/compose/.env"
+[[ -f "$ENV_FILE" ]] || ENV_FILE="$ROOT/.env"
+if [[ -f "$ENV_FILE" ]]; then
+  set -a
+  # shellcheck source=/dev/null
+  source "$ENV_FILE" || echo "[RELAX][WARN] Failed to load $ENV_FILE" >&2
+  set +a
+fi
+
+ES_URL="http://localhost:${ES_HTTP_PORT:-9200}"
+
+# Tunables (env overrides)
+RELAX_WM_LOW="${RELAX_WM_LOW:-99%}"
+RELAX_WM_HIGH="${RELAX_WM_HIGH:-99%}"
+RELAX_WM_FLOOD="${RELAX_WM_FLOOD:-99%}"
+DISABLE_WATERMARK="${DISABLE_WATERMARK:-1}"
+SET_KIBANA_REPLICAS_ZERO="${SET_KIBANA_REPLICAS_ZERO:-1}"
+CLEAR_READONLY_BLOCKS="${CLEAR_READONLY_BLOCKS:-1}"
+
+#######################################
+# Print the scalar stored under a key in a JSON document.
+# Layout-agnostic: handles compact one-line and pretty-printed responses.
+# When the key occurs several times the last occurrence wins.
+# Arguments: $1 - JSON text, $2 - key name (used as a grep pattern)
+# Outputs:   the value without quotes/whitespace; nothing if the key is absent
+# Returns:   always 0, so callers are safe under `set -e -o pipefail`
+#######################################
+json_value() {
+  local raw
+  # grep exits 1 when the key is missing; treat that as "no value".
+  raw=$(printf '%s\n' "$1" \
+    | grep -o "\"$2\"[[:space:]]*:[[:space:]]*[^,}]*" \
+    | tail -n1) || true
+  printf '%s' "${raw#*:}" | tr -d '"[:space:]'
+}
+
+echo "[RELAX] Checking Elasticsearch at $ES_URL"
+code=$(curl -s -o /dev/null -w '%{http_code}' "$ES_URL/_cluster/health" || true)
+if [[ "$code" != "200" ]]; then
+  echo "[RELAX][ERROR] ES not reachable (code=$code). Ensure argus-es-sys is running." >&2
+  exit 1
+fi
+
+echo "[RELAX] Applying transient cluster settings (watermarks)"
+if [[ "$DISABLE_WATERMARK" == "1" ]]; then
+  th_enabled=false
+else
+  th_enabled=true
+fi
+# Only the first lines of the response are echoed to keep the output short.
+curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_cluster/settings" -d "{
+  \"transient\": {
+    \"cluster.routing.allocation.disk.threshold_enabled\": $th_enabled,
+    \"cluster.routing.allocation.disk.watermark.low\": \"$RELAX_WM_LOW\",
+    \"cluster.routing.allocation.disk.watermark.high\": \"$RELAX_WM_HIGH\",
+    \"cluster.routing.allocation.disk.watermark.flood_stage\": \"$RELAX_WM_FLOOD\"
+  }
+}" | sed -n '1,5p'
+
+if [[ "$CLEAR_READONLY_BLOCKS" == "1" ]]; then
+  echo "[RELAX] Clearing read_only/read_only_allow_delete blocks on all indices (best-effort)"
+  curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_all/_settings" -d '{
+    "index.blocks.read_only": false,
+    "index.blocks.read_only_allow_delete": false
+  }' >/dev/null || true
+fi
+
+if [[ "$SET_KIBANA_REPLICAS_ZERO" != "0" ]]; then
+  echo "[RELAX] Ensure .kibana* use replicas=0 via index template and per-index settings (best-effort)"
+  # high priority template for .kibana* only, avoid impacting other indices
+  curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_index_template/kibana-replicas-0" -d '{
+    "index_patterns": [".kibana*"],
+    "priority": 200,
+    "template": { "settings": { "number_of_replicas": 0 } }
+  }' >/dev/null || true
+  # set existing .kibana* to replicas=0; a failed listing just means no work
+  idxs=$(curl -sS "$ES_URL/_cat/indices/.kibana*?h=index" | awk '{print $1}') || idxs=""
+  while IFS= read -r idx; do
+    [[ -n "$idx" ]] || continue
+    curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/$idx/_settings" -d '{"index":{"number_of_replicas":0}}' >/dev/null || true
+  done <<<"$idxs"
+fi
+
+# Retry failed shard allocations (best-effort)
+curl -sS -H 'Content-Type: application/json' -X POST "$ES_URL/_cluster/reroute?retry_failed=true" -d '{}' >/dev/null || true
+
+echo "[RELAX] Cluster health (post):"
+curl -sS "$ES_URL/_cluster/health?pretty" | sed -n '1,80p' || true
+
+# Simple current status summary
+ch=$(curl -sS "$ES_URL/_cluster/health" || true)
+status=$(json_value "$ch" status)
+unassigned=$(json_value "$ch" unassigned_shards)
+duse=$(docker exec argus-es-sys sh -lc 'df -P /usr/share/elasticsearch/data | awk "NR==2{print \$5}"' 2>/dev/null || true)
+settings=$(curl -sS "$ES_URL/_cluster/settings?flat_settings=true" || true)
+th=$(json_value "$settings" cluster.routing.allocation.disk.threshold_enabled)
+low=$(json_value "$settings" cluster.routing.allocation.disk.watermark.low)
+high=$(json_value "$settings" cluster.routing.allocation.disk.watermark.high)
+flood=$(json_value "$settings" cluster.routing.allocation.disk.watermark.flood_stage)
+ks=$(curl -sS "$ES_URL/_cat/shards/.kibana*?h=state" || true)
+total=$(printf '%s' "$ks" | awk 'NF{c++} END{print c+0}')
+started=$(printf '%s' "$ks" | awk '/STARTED/{c++} END{print c+0}')
+unass=$(printf '%s' "$ks" | awk '/UNASSIGNED/{c++} END{print c+0}')
+echo "[RELAX][SUMMARY] status=${status:-?} unassigned=${unassigned:-?} es.data.use=${duse:-?} watermarks(threshold=${th:-?} low=${low:-?} high=${high:-?} flood=${flood:-?}) kibana_shards(total=${total},started=${started},unassigned=${unass})"
+
+echo "[RELAX] Done. Remember to run scripts/es-watermark-restore.sh after freeing disk space and cluster becomes stable."
+