#!/usr/bin/env bash set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a ts="$(date -u +%Y%m%d-%H%M%SZ)" LOG_DIR="$ROOT/logs"; mkdir -p "$LOG_DIR" || true if ! ( : > "$LOG_DIR/.w" 2>/dev/null ); then LOG_DIR="/tmp/argus-logs"; mkdir -p "$LOG_DIR" || true; fi # load compose project for accurate ps output ENV_FILE="$ROOT/compose/.env" if [[ -f "$ENV_FILE" ]]; then set -a; source "$ENV_FILE"; set +a; fi PROJECT="${COMPOSE_PROJECT_NAME:-argus-server}" DETAILS="$LOG_DIR/diagnose_details_${ts}.log"; ERRORS="$LOG_DIR/diagnose_error_${ts}.log"; : > "$DETAILS"; : > "$ERRORS" logd() { echo "$(date '+%F %T') $*" >> "$DETAILS"; } append_err() { echo "$*" >> "$ERRORS"; } http_code() { curl -s -o /dev/null -w "%{http_code}" "$1" || echo 000; } http_body_head() { curl -s --max-time 3 "$1" 2>/dev/null | sed -n '1,5p' || true; } header_val() { curl -s -D - -o /dev/null "$@" | awk -F': ' 'BEGIN{IGNORECASE=1}$1=="Access-Control-Allow-Origin"{gsub("\r","",$2);print $2}'; } section() { local name="$1"; logd "===== [$name] ====="; } svc() { local svc_name="$1"; local cname="$2"; shift 2 section "$svc_name ($cname)" logd "docker ps:"; docker ps -a --format '{{.Names}}\t{{.Status}}\t{{.Image}}' | awk -v n="$cname" '$1==n' >> "$DETAILS" || true logd "docker inspect:"; docker inspect -f '{{.State.Status}} rc={{.RestartCount}} started={{.State.StartedAt}}' "$cname" >> "$DETAILS" 2>&1 || true logd "last 200 container logs:"; docker logs --tail 200 "$cname" >> "$DETAILS" 2>&1 || true docker logs --tail 200 "$cname" 2>&1 | grep -Ei '\\b(error|failed|fail|exception|panic|fatal|critical|unhealthy|permission denied|forbidden|refused|traceback|错误|失败)\\b' | sed "s/^/[${svc_name}][container] /" >> "$ERRORS" || true if docker exec "$cname" sh -lc 'command -v supervisorctl >/dev/null 2>&1' >/dev/null 2>&1; then logd "supervisorctl status:"; docker exec "$cname" sh -lc 'supervisorctl status' >> "$DETAILS" 2>&1 || true local files; files=$(docker exec "$cname" sh -lc 'ls /var/log/supervisor/*.log 2>/dev/null' || true) for f in $files; do logd "tail -n 80 $f:"; docker exec "$cname" sh -lc "tail -n 80 $f 2>/dev/null || true" >> "$DETAILS" 2>&1 || true docker exec "$cname" sh -lc "tail -n 200 $f 2>/dev/null" 2>/dev/null | grep -Ei '\\b(error|failed|fail|exception|panic|fatal|critical|unhealthy|permission denied|forbidden|refused|traceback|错误|失败)\\b' | sed "s/^/[${svc_name}][supervisor:$(basename $f)] /" >> "$ERRORS" || true done fi } svc master argus-master-sys svc es argus-es-sys svc kibana argus-kibana-sys svc prometheus argus-prometheus svc grafana argus-grafana svc alertmanager argus-alertmanager svc web-frontend argus-web-frontend svc web-proxy argus-web-proxy section HTTP logd "ES: $(http_code \"http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health\")"; http_body_head "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health" >> "$DETAILS" 2>&1 || true logd "Kibana: $(http_code \"http://localhost:${KIBANA_PORT:-5601}/api/status\")"; http_body_head "http://localhost:${KIBANA_PORT:-5601}/api/status" >> "$DETAILS" 2>&1 || true logd "Master readyz: $(http_code \"http://localhost:${MASTER_PORT:-32300}/readyz\")" logd "Prometheus: $(http_code \"http://localhost:${PROMETHEUS_PORT:-9090}/-/ready\")" logd "Grafana: $(http_code \"http://localhost:${GRAFANA_PORT:-3000}/api/health\")"; http_body_head "http://localhost:${GRAFANA_PORT:-3000}/api/health" >> "$DETAILS" 2>&1 || true logd "Alertmanager: $(http_code \"http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status\")" cors8084=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8084:-8084}/api/v2/status" || true) cors8085=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8085:-8085}/api/v1/master/nodes" || true) logd "Web-Proxy 8080: $(http_code \"http://localhost:${WEB_PROXY_PORT_8080:-8080}/\")" logd "Web-Proxy 8083: $(http_code \"http://localhost:${WEB_PROXY_PORT_8083:-8083}/\")" logd "Web-Proxy 8084 CORS: ${cors8084}" logd "Web-Proxy 8085 CORS: ${cors8085}" section ES-CHECKS ch=$(curl -s --max-time 3 "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health" || true) status=$(printf '%s' "$ch" | awk -F'\"' '/"status"/{print $4; exit}') if [[ -n "$status" ]]; then logd "cluster.status=$status"; fi if [[ "$status" != "green" ]]; then append_err "[es][cluster] status=$status"; fi if docker ps --format '{{.Names}}' | grep -q '^argus-es-sys$'; then duse=$(docker exec argus-es-sys sh -lc 'df -P /usr/share/elasticsearch/data | awk "NR==2{print \$5}"' 2>/dev/null || true) logd "es.data.df_use=$duse"; usep=${duse%%%} if [[ -n "$usep" ]] && (( usep >= 90 )); then append_err "[es][disk] data path usage=${usep}%"; fi fi section DNS-IN-PROXY for d in master.argus.com es.log.argus.com kibana.log.argus.com grafana.metric.argus.com prom.metric.argus.com alertmanager.alert.argus.com; do docker exec argus-web-proxy sh -lc "getent hosts $d || nslookup $d 2>/dev/null | tail -n+1" >> "$DETAILS" 2>&1 || true done logd "HTTP (web-proxy): master.readyz=$(docker exec argus-web-proxy sh -lc \"curl -s -o /dev/null -w '%{http_code}' http://master.argus.com:3000/readyz\" 2>/dev/null || echo 000)" logd "HTTP (web-proxy): es.health=$(docker exec argus-web-proxy sh -lc \"curl -s -o /dev/null -w '%{http_code}' http://es.log.argus.com:9200/_cluster/health\" 2>/dev/null || echo 000)" logd "HTTP (web-proxy): kibana.status=$(docker exec argus-web-proxy sh -lc \"curl -s -o /dev/null -w '%{http_code}' http://kibana.log.argus.com:5601/api/status\" 2>/dev/null || echo 000)" section SYSTEM logd "uname -a:"; uname -a >> "$DETAILS" logd "docker version:"; docker version --format '{{.Server.Version}}' >> "$DETAILS" 2>&1 || true logd "compose ps (project=$PROJECT):"; (cd "$ROOT/compose" && docker compose -p "$PROJECT" --env-file "$ENV_FILE" -f docker-compose.yml ps) >> "$DETAILS" 2>&1 || true section SUMMARY [[ $(http_code "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health") != 200 ]] && echo "[es][http] health not 200" >> "$ERRORS" kbcode=$(http_code "http://localhost:${KIBANA_PORT:-5601}/api/status"); [[ "$kbcode" != 200 ]] && echo "[kibana][http] /api/status=$kbcode" >> "$ERRORS" [[ $(http_code "http://localhost:${MASTER_PORT:-32300}/readyz") != 200 ]] && echo "[master][http] /readyz not 200" >> "$ERRORS" [[ $(http_code "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready") != 200 ]] && echo "[prometheus][http] /-/ready not 200" >> "$ERRORS" gfcode=$(http_code "http://localhost:${GRAFANA_PORT:-3000}/api/health"); [[ "$gfcode" != 200 ]] && echo "[grafana][http] /api/health=$gfcode" >> "$ERRORS" [[ $(http_code "http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status") != 200 ]] && echo "[alertmanager][http] /api/v2/status not 200" >> "$ERRORS" [[ -z "$cors8084" ]] && echo "[web-proxy][cors] 8084 missing Access-Control-Allow-Origin" >> "$ERRORS" [[ -z "$cors8085" ]] && echo "[web-proxy][cors] 8085 missing Access-Control-Allow-Origin" >> "$ERRORS" sort -u -o "$ERRORS" "$ERRORS" echo "Diagnostic details -> $DETAILS" echo "Detected errors -> $ERRORS" if [[ "$LOG_DIR" == "$ROOT/logs" ]]; then ln -sfn "$(basename "$DETAILS")" "$ROOT/logs/diagnose_details.log" 2>/dev/null || cp "$DETAILS" "$ROOT/logs/diagnose_details.log" 2>/dev/null || true ln -sfn "$(basename "$ERRORS")" "$ROOT/logs/diagnose_error.log" 2>/dev/null || cp "$ERRORS" "$ROOT/logs/diagnose_error.log" 2>/dev/null || true fi exit 0