#!/usr/bin/env bash
# Argus deployment diagnostics: snapshots container/service state into a
# timestamped details log and collects matching error lines into an error log.

set -euo pipefail

# Directory containing this script, resolved to an absolute path.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Package root (parent of the script directory).
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

# Source compose/.env when present; `set -a` exports everything it defines
# (service port variables such as ES_HTTP_PORT are consumed below).
ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a

# UTC timestamp embedded in this run's log file names.
ts="$(date -u +%Y%m%d-%H%M%SZ)"

LOG_DIR="$ROOT/logs"

# Best effort: `|| true` keeps set -e from aborting when the dir is read-only.
mkdir -p "$LOG_DIR" || true

# Fallback to /tmp when logs dir is not writable
if ! ( : > "$LOG_DIR/.w" 2>/dev/null ); then

LOG_DIR="/tmp/argus-logs"

mkdir -p "$LOG_DIR" || true

fi

# Per-run output files: human-readable details and extracted error lines.
DETAILS="$LOG_DIR/diagnose_details_${ts}.log"

ERRORS="$LOG_DIR/diagnose_error_${ts}.log"

# Truncate/create both files up front.
: > "$DETAILS"; : > "$ERRORS"
# Append one timestamped line to the details log ($DETAILS).
logd() { printf '%s %s\n' "$(date '+%F %T')" "$*" >> "$DETAILS"; }
# Append one raw line to the error log ($ERRORS).
append_err() { printf '%s\n' "$*" >> "$ERRORS"; }
# Print the HTTP status code for a GET of $1, or "000" on failure.
# Fix: the old `curl ... || echo 000` emitted "000000" on connection errors,
# because curl's -w writes "000" even when curl exits non-zero and the
# fallback echo then appended a second "000". Capture first, then print once.
# A 5s cap keeps a hung endpoint from stalling the whole diagnostic run.
http_code() {
  local code
  code=$(curl -s -o /dev/null --max-time 5 -w '%{http_code}' "$1" 2>/dev/null) || code="000"
  printf '%s' "$code"
}
# Print at most the first five lines of the response body for URL $1
# (best effort, 3 second budget; never fails the caller).
http_body_head() { curl -s --max-time 3 "$1" 2>/dev/null | head -n 5 || true; }
# Print the value of the Access-Control-Allow-Origin response header, if any.
# Extra curl arguments (e.g. -H "Origin: ...") are forwarded via "$@".
# Fix: BEGIN{IGNORECASE=1} is a gawk-only extension — under mawk/busybox awk
# it is silently ignored and the match became case-sensitive. tolower() is
# portable POSIX awk. Also cap the request at 5s like the other helpers.
header_val() {
  curl -s --max-time 5 -D - -o /dev/null "$@" | \
    awk -F': ' 'tolower($1)=="access-control-allow-origin"{gsub("\r","",$2); print $2}'
}
# Write a named banner line into the details log.
section() {
  local title="$1"
  logd "===== [$title] ====="
}
# Collect diagnostics for one service.
#   $1 - short service name, used as the tag in the error log
#   $2 - docker container name
# Appends human-readable detail to $DETAILS and tagged error lines to
# $ERRORS. Every docker call is best-effort (`|| true`) so a missing or
# stopped container never aborts the whole diagnostic run.
svc() {
  local svc_name="$1"
  local cname="$2"
  shift 2
  # Case-insensitive pattern marking a log line as an error; hoisted into one
  # local so the container-log and supervisor-log scans cannot drift apart.
  local err_re='\b(error|failed|fail|exception|panic|fatal|critical|unhealthy|permission denied|forbidden|refused|traceback|错误|失败)\b'

  section "$svc_name ($cname)"

  logd "docker ps:"; docker ps -a --format '{{.Names}}\t{{.Status}}\t{{.Image}}' | awk -v n="$cname" '$1==n' >> "$DETAILS" || true
  logd "docker inspect (state/restartcount):"; docker inspect -f '{{.State.Status}} rc={{.RestartCount}} started={{.State.StartedAt}}' "$cname" >> "$DETAILS" 2>&1 || true
  logd "last 200 container logs:"; docker logs --tail 200 "$cname" >> "$DETAILS" 2>&1 || true

  # Extract error lines from container logs, tagged [name][container].
  docker logs --tail 200 "$cname" 2>&1 | \
    grep -Ei "$err_re" | \
    sed "s/^/[${svc_name}][container] /" >> "$ERRORS" || true

  # When the container runs supervisord, also capture per-program logs.
  if docker exec "$cname" sh -lc 'command -v supervisorctl >/dev/null 2>&1' >/dev/null 2>&1; then
    logd "supervisorctl status:"; docker exec "$cname" sh -lc 'supervisorctl status' >> "$DETAILS" 2>&1 || true
    local files f
    # Glob expands inside the container; word-splitting $files below is
    # intentional (supervisor log names contain no whitespace).
    files=$(docker exec "$cname" sh -lc 'ls /var/log/supervisor/*.log 2>/dev/null' || true)
    for f in $files; do
      logd "tail -n 80 $f:"; docker exec "$cname" sh -lc "tail -n 80 '$f' 2>/dev/null || true" >> "$DETAILS" 2>&1 || true
      docker exec "$cname" sh -lc "tail -n 200 '$f' 2>/dev/null" 2>/dev/null | \
        grep -Ei "$err_re" | \
        sed "s/^/[${svc_name}][supervisor:$(basename "$f")] /" >> "$ERRORS" || true
    done
  fi
}
# Core services to diagnose, as "<short-name> <container-name>" pairs,
# probed in dependency-ish order (DNS, master, logging, metrics, web).
services=(
  "bind argus-bind-sys"
  "master argus-master-sys"
  "es argus-es-sys"
  "kibana argus-kibana-sys"
  "ftp argus-ftp"
  "prometheus argus-prometheus"
  "grafana argus-grafana"
  "alertmanager argus-alertmanager"
  "web-frontend argus-web-frontend"
  "web-proxy argus-web-proxy"
)
for entry in "${services[@]}"; do
  # shellcheck disable=SC2086 -- intentional split into "name container".
  svc $entry
done
# HTTP checks (host side)
section HTTP

# Elasticsearch cluster health; ports default to the standard values when
# compose/.env did not define them.
logd "ES: $(http_code "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health")"
http_body_head "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health" >> "$DETAILS" 2>&1 || true

logd "Kibana: $(http_code "http://localhost:${KIBANA_PORT:-5601}/api/status")"
http_body_head "http://localhost:${KIBANA_PORT:-5601}/api/status" >> "$DETAILS" 2>&1 || true

logd "Master readyz: $(http_code "http://localhost:${MASTER_PORT:-32300}/readyz")"

logd "Prometheus: $(http_code "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready")"
logd "Grafana: $(http_code "http://localhost:${GRAFANA_PORT:-3000}/api/health")"
http_body_head "http://localhost:${GRAFANA_PORT:-3000}/api/health" >> "$DETAILS" 2>&1 || true
logd "Alertmanager: $(http_code "http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status")"

# CORS probes through the web proxy: capture the Access-Control-Allow-Origin
# header returned for a browser-style Origin. Empty capture means the header
# is missing; that is flagged later in SUMMARY.
cors8084=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8084:-8084}/api/v2/status" || true)
cors8085=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8085:-8085}/api/v1/master/nodes" || true)
logd "Web-Proxy 8080: $(http_code "http://localhost:${WEB_PROXY_PORT_8080:-8080}/")"
logd "Web-Proxy 8083: $(http_code "http://localhost:${WEB_PROXY_PORT_8083:-8083}/")"
logd "Web-Proxy 8084 CORS: ${cors8084}"
logd "Web-Proxy 8085 CORS: ${cors8085}"
# Elasticsearch deep checks: disk watermark and Kibana index status
section ES-CHECKS
ch=$(curl -s --max-time 3 "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health" || true)
# Pull the bare "status" value (green/yellow/red) out of the JSON without jq:
# split on double quotes and take the 4th field of the line containing it.
status=$(printf '%s' "$ch" | awk -F'\"' '/"status"/{print $4; exit}')
if [[ -n "$status" ]]; then logd "cluster.status=$status"; fi
# Anything but green is reported (an empty status, i.e. ES unreachable, too).
if [[ "$status" != "green" ]]; then append_err "[es][cluster] status=$status"; fi

if docker ps --format '{{.Names}}' | grep -q '^argus-es-sys$'; then
# Disk usage of the ES data path inside the container, e.g. "42%".
duse=$(docker exec argus-es-sys sh -lc 'df -P /usr/share/elasticsearch/data | awk "NR==2{print \$5}"' 2>/dev/null || true)
logd "es.data.df_use=$duse"
# Strip the trailing '%' so the value can be compared numerically.
usep=${duse%%%}
# >=90% usage typically trips ES high/flood-stage disk watermarks.
if [[ -n "$usep" ]] && (( usep >= 90 )); then
append_err "[es][disk] data path usage ${duse} (>=90%) likely hit high/flood watermarks"
echo "HINT: High ES disk usage. Free space or run scripts/es-watermark-relax.sh (then scripts/es-watermark-restore.sh)." >&2
fi
fi
# Kibana saved-objects indices: any shard in UNASSIGNED/INITIALIZING/
# RELOCATING state means Kibana is degraded.
ks=$(curl -s --max-time 3 "http://localhost:${ES_HTTP_PORT:-9200}/_cat/shards/.kibana*?h=index,shard,prirep,state,unassigned.reason" || true)
# Fix: the previous pattern used '\\b...\\b' inside single quotes, which made
# grep -E search for a literal backslash and therefore never matched.
if printf '%s' "$ks" | grep -Eiq '\b(UNASSIGNED|INITIALIZING|RELOCATING)\b'; then
append_err "[kibana][index] .kibana* shards not green"; logd "$ks"
echo "HINT: .kibana* shards not green. On single-node, set replicas=0 and ensure ES disk watermarks are not exceeded." >&2
fi
# Overlay network diagnostics
section OVERLAY-NET
# OVERLAY_NET_NAME comes from compose/.env; default matches the compose stack.
if docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" >/dev/null 2>&1; then
logd "overlay present: ${OVERLAY_NET_NAME:-argus-sys-net}"
# The first 60 lines of inspect output carry the subnet/attachment info.
docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" | sed -n '1,60p' >> "$DETAILS" 2>/dev/null || true
else
append_err "[overlay][network] missing ${OVERLAY_NET_NAME:-argus-sys-net}"
fi
# Domain resolution & reachability from inside web-proxy (bind-backed)
section DOMAIN
# Resolve each internal domain from inside the web-proxy container, which
# uses the bind service for DNS.
for d in master.argus.com ftp.argus.com kibana.argus.com es.log.argus.com prom.argus.com grafana.argus.com; do
logd "getent $d (web-proxy):"
docker exec argus-web-proxy sh -lc "getent hosts $d || true" >> "$DETAILS" 2>&1 || true
done
# In-container HTTP reachability via domain names; "000" means unreachable.
logd "HTTP (web-proxy): master.readyz=$(docker exec argus-web-proxy sh -lc \"curl -s -o /dev/null -w '%{http_code}' http://master.argus.com:3000/readyz\" 2>/dev/null || echo 000)"
logd "HTTP (web-proxy): es.health=$(docker exec argus-web-proxy sh -lc \"curl -s -o /dev/null -w '%{http_code}' http://es.log.argus.com:9200/_cluster/health\" 2>/dev/null || echo 000)"
logd "HTTP (web-proxy): kibana.status=$(docker exec argus-web-proxy sh -lc \"curl -s -o /dev/null -w '%{http_code}' http://kibana.argus.com:5601/api/status\" 2>/dev/null || echo 000)"

# FTP share writability (container perspective)
section FTP-SHARE
docker exec argus-ftp sh -lc 'ls -ld /private/argus/ftp /private/argus/ftp/share; test -w /private/argus/ftp/share && echo "write:OK" || echo "write:FAIL"' >> "$DETAILS" 2>&1 || true
# Collect system info for context
section SYSTEM
logd "uname -a:"; uname -a >> "$DETAILS"
logd "docker version:"; docker version --format '{{.Server.Version}}' >> "$DETAILS" 2>&1 || true
# Prefer the compose v2 plugin; fall back to the legacy docker-compose binary.
logd "compose ps:"; (cd "$ROOT/compose" && if docker compose version >/dev/null 2>&1; then docker compose -p argus-sys ps; else docker-compose -p argus-sys ps; fi) >> "$DETAILS" 2>&1 || true
section SUMMARY
# Add HTTP failures and CORS problems to error log with tags
# (the `[[ ... ]] && ...` form is safe under set -e: a false test inside an
# && list does not abort the script).
[[ $(http_code "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health") != 200 ]] && echo "[es][http] health not 200" >> "$ERRORS"
kbcode=$(http_code "http://localhost:${KIBANA_PORT:-5601}/api/status"); [[ "$kbcode" != 200 ]] && echo "[kibana][http] /api/status=$kbcode" >> "$ERRORS"
[[ $(http_code "http://localhost:${MASTER_PORT:-32300}/readyz") != 200 ]] && echo "[master][http] /readyz not 200" >> "$ERRORS"
[[ $(http_code "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready") != 200 ]] && echo "[prometheus][http] /-/ready not 200" >> "$ERRORS"
gfcode=$(http_code "http://localhost:${GRAFANA_PORT:-3000}/api/health"); [[ "$gfcode" != 200 ]] && echo "[grafana][http] /api/health=$gfcode" >> "$ERRORS"
[[ $(http_code "http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status") != 200 ]] && echo "[alertmanager][http] /api/v2/status not 200" >> "$ERRORS"
# An empty CORS capture means the proxy did not return the header at all.
[[ -z "$cors8084" ]] && echo "[web-proxy][cors] 8084 missing Access-Control-Allow-Origin" >> "$ERRORS"
[[ -z "$cors8085" ]] && echo "[web-proxy][cors] 8085 missing Access-Control-Allow-Origin" >> "$ERRORS"

# Deduplicate errors (in place; sort -o may write back to its input file).
sort -u -o "$ERRORS" "$ERRORS"
# --- Prometheus targets & nodes.json checks ---
section PROMETHEUS-TARGETS
# File published by the master listing scrape targets for Prometheus.
nodes_json_path="$ROOT/private/argus/metric/prometheus/nodes.json"
if [[ -f "$nodes_json_path" ]]; then
logd "nodes.json present: $nodes_json_path"
# detect gwbridge addresses (172.22/16)
if grep -E '"ip"\s*:\s*"172\.22\.' "$nodes_json_path" >/dev/null 2>&1; then
append_err "[prometheus][targets] contains gwbridge (172.22/16) addresses; prefer overlay (10.0/8)."
echo "HINT: nodes.json has 172.22.x.x targets. Ensure agent publishes overlay_ip and master prefers it." >&2
fi
else
logd "nodes.json missing at $nodes_json_path"
fi
# Query Prometheus activeTargets and list down items when possible
pt_json=$(curl -s --max-time 3 "http://localhost:${PROMETHEUS_PORT:-9090}/api/v1/targets" || true)
if command -v jq >/dev/null 2>&1; then
# With jq: emit one tagged error line per down target, including the scrape
# URL and the last error Prometheus recorded for it.
downs=$(printf '%s' "$pt_json" | jq -r '.data.activeTargets[] | select(.health=="down") | "[prometheus][activeTargets] down url=\(.scrapeUrl) lastError=\(.lastError)"' 2>/dev/null || true)
if [[ -n "$downs" ]]; then
printf '%s\n' "$downs" >> "$ERRORS"
fi
else
# best-effort grep when jq is unavailable
if printf '%s' "$pt_json" | grep -q '"health":"down"'; then
append_err "[prometheus][activeTargets] some targets are down (enable jq for detailed reasons)"
fi
fi
# Tell the operator where this run's logs were written.
# Fix: these two lines were previously printed unconditionally AND repeated
# inside the else branch below, so the /tmp-fallback path printed them twice.
echo "Diagnostic details -> $DETAILS"
echo "Detected errors -> $ERRORS"

if [[ "$LOG_DIR" == "$ROOT/logs" ]]; then
# maintain latest symlinks when writing under package logs; fall back to a
# plain copy on filesystems without symlink support.
ln -sfn "$(basename "$DETAILS")" "$ROOT/logs/diagnose_details.log" 2>/dev/null || cp "$DETAILS" "$ROOT/logs/diagnose_details.log" 2>/dev/null || true
ln -sfn "$(basename "$ERRORS")" "$ROOT/logs/diagnose_error.log" 2>/dev/null || cp "$ERRORS" "$ROOT/logs/diagnose_error.log" 2>/dev/null || true
fi

# Diagnostics always exit 0; findings live in the error log, not the status.
exit 0