diff --git a/build/build_images.sh b/build/build_images.sh index d56b79e..5503f52 100755 --- a/build/build_images.sh +++ b/build/build_images.sh @@ -12,6 +12,7 @@ Options: --master-offline Build master offline image (requires src/master/offline_wheels.tar.gz) --metric Build metric module images (ftp, prometheus, grafana, test nodes) --no-cache Build all images without using Docker layer cache + --only LIST Comma-separated targets to build: core,master,metric,web,alert,all -h, --help Show this help message Examples: @@ -24,9 +25,12 @@ EOF } use_intranet=false +build_core=true build_master=true build_master_offline=false build_metric=true +build_web=true +build_alert=true no_cache=false while [[ $# -gt 0 ]]; do @@ -52,6 +56,26 @@ while [[ $# -gt 0 ]]; do no_cache=true shift ;; + --only) + if [[ -z ${2:-} ]]; then + echo "--only requires a target list" >&2; exit 1 + fi + sel="$2"; shift 2 + # reset all, then enable selected + build_core=false; build_master=false; build_metric=false; build_web=false; build_alert=false + IFS=',' read -ra parts <<< "$sel" + for p in "${parts[@]}"; do + case "$p" in + core) build_core=true ;; + master) build_master=true ;; + metric) build_metric=true ;; + web) build_web=true ;; + alert) build_alert=true ;; + all) build_core=true; build_master=true; build_metric=true; build_web=true; build_alert=true ;; + *) echo "Unknown --only target: $p" >&2; exit 1 ;; + esac + done + ;; -h|--help) show_help exit 0 @@ -177,26 +201,28 @@ pull_base_image() { images_built=() build_failed=false -if build_image "Elasticsearch" "src/log/elasticsearch/build/Dockerfile" "argus-elasticsearch:latest"; then - images_built+=("argus-elasticsearch:latest") -else - build_failed=true -fi +if [[ "$build_core" == true ]]; then + if build_image "Elasticsearch" "src/log/elasticsearch/build/Dockerfile" "argus-elasticsearch:latest"; then + images_built+=("argus-elasticsearch:latest") + else + build_failed=true + fi -echo "" + echo "" -if build_image "Kibana" "src/log/kibana/build/Dockerfile" "argus-kibana:latest"; then - images_built+=("argus-kibana:latest") -else - build_failed=true -fi + if build_image "Kibana" "src/log/kibana/build/Dockerfile" "argus-kibana:latest"; then + images_built+=("argus-kibana:latest") + else + build_failed=true + fi -echo "" + echo "" -if build_image "BIND9" "src/bind/build/Dockerfile" "argus-bind9:latest"; then - images_built+=("argus-bind9:latest") -else - build_failed=true + if build_image "BIND9" "src/bind/build/Dockerfile" "argus-bind9:latest"; then + images_built+=("argus-bind9:latest") + else + build_failed=true + fi fi echo "" @@ -264,49 +290,53 @@ fi # Web & Alert module images # ======================================= -echo "" -echo "Building Web and Alert module images..." - -# Pre-pull commonly used base images for stability -web_alert_base_images=( - "node:20" - "ubuntu:24.04" -) - -for base_image in "${web_alert_base_images[@]}"; do - if ! pull_base_image "$base_image"; then - build_failed=true - fi -done - -web_builds=( - "Web Frontend|src/web/build_tools/frontend/Dockerfile|argus-web-frontend:latest|." - "Web Proxy|src/web/build_tools/proxy/Dockerfile|argus-web-proxy:latest|." -) - -for build_spec in "${web_builds[@]}"; do - IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec" - if build_image "$image_label" "$dockerfile_path" "$image_tag" "$build_context"; then - images_built+=("$image_tag") - else - build_failed=true - fi +if [[ "$build_web" == true || "$build_alert" == true ]]; then echo "" -done + echo "Building Web and Alert module images..." -alert_builds=( - "Alertmanager|src/alert/alertmanager/build/Dockerfile|argus-alertmanager:latest|." -) + # Pre-pull commonly used base images for stability + web_alert_base_images=( + "node:20" + "ubuntu:24.04" + ) -for build_spec in "${alert_builds[@]}"; do - IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec" - if build_image "$image_label" "$dockerfile_path" "$image_tag" "$build_context"; then - images_built+=("$image_tag") - else - build_failed=true + for base_image in "${web_alert_base_images[@]}"; do + if ! pull_base_image "$base_image"; then + build_failed=true + fi + done + + if [[ "$build_web" == true ]]; then + web_builds=( + "Web Frontend|src/web/build_tools/frontend/Dockerfile|argus-web-frontend:latest|." + "Web Proxy|src/web/build_tools/proxy/Dockerfile|argus-web-proxy:latest|." + ) + for build_spec in "${web_builds[@]}"; do + IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec" + if build_image "$image_label" "$dockerfile_path" "$image_tag" "$build_context"; then + images_built+=("$image_tag") + else + build_failed=true + fi + echo "" + done fi - echo "" -done + + if [[ "$build_alert" == true ]]; then + alert_builds=( + "Alertmanager|src/alert/alertmanager/build/Dockerfile|argus-alertmanager:latest|." + ) + for build_spec in "${alert_builds[@]}"; do + IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec" + if build_image "$image_label" "$dockerfile_path" "$image_tag" "$build_context"; then + images_built+=("$image_tag") + else + build_failed=true + fi + echo "" + done + fi +fi echo "=======================================" echo "📦 Build Summary" diff --git a/src/agent/scripts/build_binary.sh b/src/agent/scripts/build_binary.sh index 7e5a720..bb19ed4 100755 --- a/src/agent/scripts/build_binary.sh +++ b/src/agent/scripts/build_binary.sh @@ -12,6 +12,8 @@ VENV_DIR="$BUILD_ROOT/venv" AGENT_BUILD_IMAGE="${AGENT_BUILD_IMAGE:-python:3.11-slim-bullseye}" AGENT_BUILD_USE_DOCKER="${AGENT_BUILD_USE_DOCKER:-1}" +# 默认在容器内忽略代理以避免公司内网代理在 Docker 网络不可达导致 pip 失败(可用 0 关闭) +AGENT_BUILD_IGNORE_PROXY="${AGENT_BUILD_IGNORE_PROXY:-1}" USED_DOCKER=0 run_host_build() { @@ -71,6 +73,7 @@ run_docker_build() { pass_env_if_set http_proxy pass_env_if_set https_proxy pass_env_if_set no_proxy + pass_env_if_set AGENT_BUILD_IGNORE_PROXY build_script=$(cat <<'INNER' set -euo pipefail @@ -82,6 +85,10 @@ rm -rf build dist mkdir -p build/pyinstaller dist python3 -m venv --copies build/venv source build/venv/bin/activate +# 若指定忽略代理,则清空常见代理与 pip 镜像环境变量,避免容器内代理不可达 +if [ "${AGENT_BUILD_IGNORE_PROXY:-1}" = "1" ]; then + unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY PIP_INDEX_URL PIP_EXTRA_INDEX_URL PIP_TRUSTED_HOST +fi pip install --upgrade pip pip install . pip install pyinstaller==6.6.0 diff --git a/src/alert/alertmanager/build/Dockerfile b/src/alert/alertmanager/build/Dockerfile index 781714a..2045db9 100644 --- a/src/alert/alertmanager/build/Dockerfile +++ b/src/alert/alertmanager/build/Dockerfile @@ -9,14 +9,14 @@ RUN apt-get update && \ apt-get install -y wget supervisor net-tools inetutils-ping vim ca-certificates passwd && \ apt-get clean && rm -rf /var/lib/apt/lists/* -# 设置 Alertmanager 版本 +# 设置 Alertmanager 版本(与本地离线包保持一致) ARG ALERTMANAGER_VERSION=0.28.1 -# 下载并解压 Alertmanager 二进制 -RUN wget https://github.com/prometheus/alertmanager/releases/download/v${ALERTMANAGER_VERSION}/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz && \ - tar xvf alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz && \ - mv alertmanager-${ALERTMANAGER_VERSION}.linux-amd64 /usr/local/alertmanager && \ - rm alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz +# 使用仓库内预置的离线包构建(无需联网) +COPY src/alert/alertmanager/build/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz /tmp/ +RUN tar xvf /tmp/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz -C /tmp && \ + mv /tmp/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64 /usr/local/alertmanager && \ + rm -f /tmp/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz ENV ALERTMANAGER_BASE_PATH=/private/argus/alert/alertmanager @@ -34,21 +34,20 @@ RUN mkdir -p /usr/share/alertmanager && \ # 创建 alertmanager 用户(可自定义 UID/GID) # 创建 alertmanager 用户组 RUN set -eux; \ + # 确保目标 GID 存在;若已被占用,直接使用该 GID(组名不限)\ if ! getent group "${ARGUS_BUILD_GID}" >/dev/null; then \ groupadd -g "${ARGUS_BUILD_GID}" alertmanager || true; \ fi; \ - if id alertmanager >/dev/null 2>&1; then \ - current_uid="$(id -u alertmanager)"; \ - if [ "$current_uid" != "${ARGUS_BUILD_UID}" ] && ! getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \ - usermod -u "${ARGUS_BUILD_UID}" alertmanager; \ - fi; \ - usermod -g "${ARGUS_BUILD_GID}" alertmanager || true; \ - else \ - if ! getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \ - useradd -M -s /usr/sbin/nologin -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" alertmanager; \ + # 确保存在 alertmanager 用户;若 UID 已被占用,跳过并继续使用现有 UID 的用户 + if ! id alertmanager >/dev/null 2>&1; then \ + if getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \ + # UID 已占用,则创建同名用户但不指定 UID(避免冲突),仅保证 user 存在 + useradd -M -s /usr/sbin/nologin -g "${ARGUS_BUILD_GID}" alertmanager || true; \ else \ - echo "UID ${ARGUS_BUILD_UID} already exists; skip creating user 'alertmanager'"; \ + useradd -M -s /usr/sbin/nologin -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" alertmanager || true; \ fi; \ + else \ + usermod -g "${ARGUS_BUILD_GID}" alertmanager || true; \ fi RUN chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" /usr/share/alertmanager /alertmanager ${ALERTMANAGER_BASE_PATH} /private/argus/etc /usr/local/bin || true diff --git a/src/alert/alertmanager/build/alertmanager-0.28.1.linux-amd64.tar.gz b/src/alert/alertmanager/build/alertmanager-0.28.1.linux-amd64.tar.gz new file mode 100644 index 0000000..8c0ca37 Binary files /dev/null and b/src/alert/alertmanager/build/alertmanager-0.28.1.linux-amd64.tar.gz differ diff --git a/src/alert/alertmanager/build/fetch-dist.sh b/src/alert/alertmanager/build/fetch-dist.sh new file mode 100644 index 0000000..9f4140f --- /dev/null +++ b/src/alert/alertmanager/build/fetch-dist.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -euo pipefail + +# 下载 Alertmanager 离线安装包到本目录,用于 Docker 构建时 COPY +# 用法: +# ./fetch-dist.sh [version] +# 示例: +# ./fetch-dist.sh 0.28.1 + +VER="${1:-0.28.1}" +OUT="alertmanager-${VER}.linux-amd64.tar.gz" +URL="https://github.com/prometheus/alertmanager/releases/download/v${VER}/${OUT}" + +if [[ -f "$OUT" ]]; then + echo "[INFO] $OUT already exists, skip download" + exit 0 +fi + +echo "[INFO] Downloading $URL" +curl -fL --retry 3 --connect-timeout 10 -o "$OUT" "$URL" +echo "[OK] Saved to $(pwd)/$OUT" + diff --git a/src/alert/alertmanager/build/start-am-supervised.sh b/src/alert/alertmanager/build/start-am-supervised.sh index 76bbb8a..3d64ec4 100644 --- a/src/alert/alertmanager/build/start-am-supervised.sh +++ b/src/alert/alertmanager/build/start-am-supervised.sh @@ -7,10 +7,8 @@ ALERTMANAGER_BASE_PATH=${ALERTMANAGER_BASE_PATH:-/private/argus/alert/alertmanag echo "[INFO] Alertmanager base path: ${ALERTMANAGER_BASE_PATH}" -# 生成配置文件 -echo "[INFO] Generating Alertmanager configuration file..." -sed "s|\${ALERTMANAGER_BASE_PATH}|${ALERTMANAGER_BASE_PATH}|g" \ - /etc/alertmanager/alertmanager.yml > ${ALERTMANAGER_BASE_PATH}/alertmanager.yml +# 使用容器内的 /etc/alertmanager/alertmanager.yml 作为配置文件,避免写入挂载卷导致的权限问题 +echo "[INFO] Using /etc/alertmanager/alertmanager.yml as configuration" # 记录容器 IP 地址 diff --git a/src/sys/tests/README.md b/src/sys/tests/README.md index 77435a5..8dbb262 100644 --- a/src/sys/tests/README.md +++ b/src/sys/tests/README.md @@ -42,7 +42,7 @@ - `./scripts/05_agent_register.sh` 获取两个节点的 `node_id` 与初始 IP,检查本地 `node.json` - `./scripts/06_write_health_and_assert.sh` 写健康文件并断言 `nodes.json` 仅包含 2 个在线节点 - `./scripts/07_logs_send_and_assert.sh` 向两个节点写日志,断言 ES `train-*`/`infer-*` 计数增长 - - `./scripts/08_restart_agent_reregister.sh` `node-b` 改为固定 IP `172.29.0.200`,验证保持同一节点 ID 且 IP/时间戳更新 + - `./scripts/08_restart_agent_reregister.sh` `node-b` 改为固定 IP `172.31.0.200`,验证保持同一节点 ID 且 IP/时间戳更新 - `./scripts/09_down.sh` 回收容器、网络并清理 `private*/`、`tmp/` - 重置环境 @@ -53,8 +53,8 @@ ## 二、测试部署架构(docker-compose) - 网络 - - 自定义 bridge:`argus-sys-net`,子网 `172.29.0.0/16` - - 固定地址:bind=`172.29.0.2`,master=`172.29.0.10` + - 自定义 bridge:`argus-sys-net`,子网 `172.31.0.0/16` + - 固定地址:bind=`172.31.0.2`,master=`172.31.0.10` - 服务与端口 - `bind`(`argus-bind9:latest`):监听 53/tcp+udp;负责同步 `*.argus.com` 记录 @@ -72,7 +72,7 @@ - 节点容器的 Fluent Bit/agent 资产以只读方式挂载到 `/assets`/`/usr/local/bin/argus-agent` - DNS 配置 - - 节点容器通过 compose 配置 `dns: [172.29.0.2]` 指向 bind,不挂载 `/etc/resolv.conf`,也不依赖 `update-dns.sh` + - 节点容器通过 compose 配置 `dns: [172.31.0.2]` 指向 bind,不挂载 `/etc/resolv.conf`,也不依赖 `update-dns.sh` - master/es/kibana 仍共享 `./private`,master 启动会写 `/private/argus/etc/master.argus.com` 供 bind 同步 A 记录 - 节点入口 diff --git a/src/sys/tests/docker-compose.yml b/src/sys/tests/docker-compose.yml index 135cb03..fbec5cb 100644 --- a/src/sys/tests/docker-compose.yml +++ b/src/sys/tests/docker-compose.yml @@ -4,7 +4,7 @@ networks: ipam: driver: default config: - - subnet: 172.29.0.0/16 + - subnet: 172.31.0.0/16 services: bind: @@ -12,7 +12,7 @@ services: container_name: argus-bind-sys networks: sysnet: - ipv4_address: 172.29.0.2 + ipv4_address: 172.31.0.2 volumes: - ./private:/private restart: unless-stopped @@ -36,7 +36,7 @@ services: - ./private/argus/etc:/private/argus/etc networks: sysnet: - ipv4_address: 172.29.0.10 + ipv4_address: 172.31.0.10 restart: unless-stopped es: @@ -56,7 +56,7 @@ services: restart: unless-stopped networks: sysnet: - ipv4_address: 172.29.0.3 + ipv4_address: 172.31.0.3 kibana: image: argus-kibana:latest @@ -75,7 +75,7 @@ services: restart: unless-stopped networks: sysnet: - ipv4_address: 172.29.0.4 + ipv4_address: 172.31.0.4 node-a: image: ubuntu:22.04 @@ -104,7 +104,7 @@ services: entrypoint: - /usr/local/bin/node-entrypoint.sh dns: - - 172.29.0.2 + - 172.31.0.2 ports: - "2020:2020" restart: unless-stopped @@ -138,7 +138,7 @@ services: entrypoint: - /usr/local/bin/node-entrypoint.sh dns: - - 172.29.0.2 + - 172.31.0.2 ports: - "2021:2020" restart: unless-stopped @@ -167,7 +167,7 @@ services: - /etc/timezone:/etc/timezone:ro networks: sysnet: - ipv4_address: 172.29.0.40 + ipv4_address: 172.31.0.40 logging: driver: "json-file" options: @@ -192,7 +192,7 @@ services: - /etc/timezone:/etc/timezone:ro networks: sysnet: - ipv4_address: 172.29.0.41 + ipv4_address: 172.31.0.41 logging: driver: "json-file" options: @@ -223,7 +223,7 @@ services: - /etc/timezone:/etc/timezone:ro networks: sysnet: - ipv4_address: 172.29.0.42 + ipv4_address: 172.31.0.42 depends_on: - prometheus logging: @@ -232,6 +232,25 @@ services: max-size: "10m" max-file: "3" + # --- Added: Web Frontend (no host port; resolved by DNS as web.argus.com) --- + web-frontend: + image: argus-web-frontend:latest + container_name: argus-web-frontend + environment: + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + volumes: + - ./private/argus/etc:/private/argus/etc + networks: + sysnet: + ipv4_address: 172.31.0.80 + restart: unless-stopped + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" + test-node: image: ubuntu:22.04 container_name: argus-metric-test-node @@ -245,7 +264,7 @@ services: - TZ=Asia/Shanghai - DEBIAN_FRONTEND=noninteractive - FTP_DOMAIN=${FTP_DOMAIN:-ftp.metric.argus.com} - - FTP_SERVER=${FTP_SERVER:-172.29.0.40} + - FTP_SERVER=${FTP_SERVER:-172.31.0.40} - FTP_USER=${FTP_USER:-ftpuser} - FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!} - FTP_PORT=${FTP_PORT:-21} @@ -264,7 +283,7 @@ services: - infinity networks: sysnet: - ipv4_address: 172.29.0.50 + ipv4_address: 172.31.0.50 logging: driver: "json-file" options: @@ -311,7 +330,62 @@ services: - infinity networks: sysnet: - ipv4_address: 172.29.0.51 + ipv4_address: 172.31.0.51 + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" + + # --- Added: Alertmanager --- + alertmanager: + image: argus-alertmanager:latest + container_name: argus-alertmanager + environment: + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + volumes: + - ./private/argus/etc:/private/argus/etc + - ./private/argus/alert/alertmanager:/private/argus/alert/alertmanager + networks: + sysnet: + ipv4_address: 172.31.0.82 + ports: + - "9093:9093" + restart: unless-stopped + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" + + # --- Added: Web Proxy (multi-port gateway) --- + web-proxy: + image: argus-web-proxy:latest + container_name: argus-web-proxy + depends_on: + - bind + - master + - grafana + - prometheus + - kibana + - alertmanager + environment: + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + volumes: + - ./private/argus/etc:/private/argus/etc + networks: + sysnet: + ipv4_address: 172.31.0.81 + ports: + - "8080:8080" + - "8081:8081" + - "8082:8082" + - "8083:8083" + - "8084:8084" + - "8085:8085" + restart: unless-stopped logging: driver: "json-file" options: diff --git a/src/sys/tests/scripts/01_bootstrap.sh b/src/sys/tests/scripts/01_bootstrap.sh index 7b20969..a2fd8a4 100755 --- a/src/sys/tests/scripts/01_bootstrap.sh +++ b/src/sys/tests/scripts/01_bootstrap.sh @@ -45,6 +45,7 @@ mkdir -p \ "$PRIVATE_CORE/argus/bind" \ "$PRIVATE_CORE/argus/master" \ "$PRIVATE_CORE/argus/metric/prometheus" \ + "$PRIVATE_CORE/argus/alert/alertmanager" \ "$PRIVATE_CORE/argus/metric/ftp/share" \ "$PRIVATE_CORE/argus/metric/grafana/data" \ "$PRIVATE_CORE/argus/metric/grafana/logs" \ @@ -71,10 +72,14 @@ chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" \ "$PRIVATE_CORE/argus/log/kibana" \ "$PRIVATE_CORE/argus/metric/grafana" \ "$PRIVATE_CORE/argus/metric/prometheus" \ + "$PRIVATE_CORE/argus/alert" \ "$PRIVATE_CORE/argus/metric/ftp" \ "$PRIVATE_CORE/argus/agent" \ "$PRIVATE_CORE/argus/etc" 2>/dev/null || true +# 确保 alert 与 etc 目录组可写,便于非 root 且仅匹配 GID 的服务写入运行文件 +chmod -R g+w "$PRIVATE_CORE/argus/alert" "$PRIVATE_CORE/argus/etc" 2>/dev/null || true + echo "[INFO] Using compose-managed network (auto-created by docker compose)" echo "[INFO] Distributing update-dns.sh for core services (bind/master/es/kibana)" @@ -95,6 +100,9 @@ ensure_image "argus-master:latest" ensure_image "argus-metric-ftp:latest" ensure_image "argus-metric-prometheus:latest" ensure_image "argus-metric-grafana:latest" +ensure_image "argus-web-frontend:latest" +ensure_image "argus-web-proxy:latest" +ensure_image "argus-alertmanager:latest" echo "[INFO] Building agent binary..." pushd "$REPO_ROOT/src/agent" >/dev/null diff --git a/src/sys/tests/scripts/02_up.sh b/src/sys/tests/scripts/02_up.sh index e6f7a1c..3603621 100755 --- a/src/sys/tests/scripts/02_up.sh +++ b/src/sys/tests/scripts/02_up.sh @@ -47,19 +47,40 @@ for name in argus-node-b; do fi done +# 预检:检查多端口网关所需宿主端口是否空闲 +check_port_free() { + local p="$1" + if ss -ltnp 2>/dev/null | grep -q ":${p} "; then + echo "[ERR] Host port ${p} is already in use. Please free it before running 02_up.sh" >&2 + ss -ltnp | awk -v p=":${p} " '$0 ~ p {print " " $0}' || true + return 1 + fi + return 0 +} + +for port in 8080 8081 8082 8083 8084 8085; do + check_port_free "$port" || { echo "[ERR] Required port busy: $port"; exit 1; } +done + # 根据GPU可用性决定启动的服务 if [[ "$GPU_AVAILABLE" == true ]]; then echo "[INFO] 启动所有服务(包含 gpu profile)..." - compose -p argus-sys --profile gpu up -d + compose -p argus-sys --profile gpu up -d || true else echo "[INFO] 启动基础服务(不含 gpu profile)..." - compose -p argus-sys up -d + compose -p argus-sys up -d || true +fi + +# 若 web-proxy 处于 Created 状态,尝试单独启动一次(处理偶发 Address already in use 后端已释放的场景) +if docker ps -a --format '{{.Names}}\t{{.Status}}' | grep -q '^argus-web-proxy\s\+Created'; then + echo "[WARN] web-proxy in Created state; retry starting it..." + docker start argus-web-proxy || true fi popd >/dev/null if [[ "$GPU_AVAILABLE" == true ]]; then - echo "[OK] Services started: master:32300 es:9200 kibana:5601 node-a:2020 node-b:2021 test-gpu-node:172.29.0.51" + echo "[OK] Services started: master:32300 es:9200 kibana:5601 node-a:2020 node-b:2021 test-gpu-node:172.31.0.51" else echo "[OK] Services started: master:32300 es:9200 kibana:5601 node-a:2020 node-b:2021 (gpu skipped)" fi diff --git a/src/sys/tests/scripts/03_wait_ready.sh b/src/sys/tests/scripts/03_wait_ready.sh index 4887181..a4f92fb 100755 --- a/src/sys/tests/scripts/03_wait_ready.sh +++ b/src/sys/tests/scripts/03_wait_ready.sh @@ -29,6 +29,7 @@ echo "[INFO] Waiting for ES/Kibana/Master/Fluent Bit/Bind..." # ES (>= yellow) attempt=1; max=120 +ES_T0=$(date +%s) while (( attempt <= max )); do if curl -fsS "http://localhost:9200/_cluster/health?wait_for_status=yellow&timeout=1s" >/dev/null 2>&1; then break @@ -36,16 +37,19 @@ while (( attempt <= max )); do echo "[..] waiting ES ($attempt/$max)"; sleep 5; ((attempt++)) done [[ $attempt -le $max ]] || { echo "[ERR] ES not ready" >&2; exit 1; } +ES_T1=$(date +%s); echo "[TIME] ES ready in $((ES_T1-ES_T0))s" # Kibana: must be HTTP 200 and overall.level=available echo "[INFO] Waiting for Kibana to be available (HTTP 200)..." kb_attempt=1; kb_max=180 +KB_T0=$(date +%s) while (( kb_attempt <= kb_max )); do body=$(curl -sS "http://localhost:5601/api/status" 2>/dev/null || true) code=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:5601/api/status" || echo 000) if [[ "$code" == "200" ]]; then if echo "$body" | grep -q '"level":"available"'; then - echo "[OK] Kibana available (HTTP 200)" + KB_T1=$(date +%s) + echo "[OK] Kibana available (HTTP 200) in $((KB_T1-KB_T0))s" break fi fi @@ -58,11 +62,13 @@ if (( kb_attempt > kb_max )); then fi # Master +MASTER_T0=$(date +%s) wait_http "http://localhost:32300/readyz" 120 +MASTER_T1=$(date +%s); echo "[TIME] Master readyz in $((MASTER_T1-MASTER_T0))s" # Fluent Bit (host metrics on host ports) -wait_http "http://localhost:2020/api/v2/metrics" 120 -wait_http "http://localhost:2021/api/v2/metrics" 120 +FB1_T0=$(date +%s); wait_http "http://localhost:2020/api/v2/metrics" 120; FB1_T1=$(date +%s); echo "[TIME] FluentBit:2020 in $((FB1_T1-FB1_T0))s" +FB2_T0=$(date +%s); wait_http "http://localhost:2021/api/v2/metrics" 120; FB2_T1=$(date +%s); echo "[TIME] FluentBit:2021 in $((FB2_T1-FB2_T0))s" # Bind config check BIND_ID="$(service_id bind)" @@ -72,4 +78,63 @@ else echo "[WARN] bind container id not found" fi +# ========== Additional module readiness checks ========== + +# Prometheus +PROM_T0=$(date +%s); wait_http "http://localhost:9090/-/ready" 120; PROM_T1=$(date +%s); echo "[TIME] Prometheus ready in $((PROM_T1-PROM_T0))s" + +# Grafana health (database: ok) +echo "[INFO] Waiting for Grafana health..." +gf_attempt=1; gf_max=120 +while (( gf_attempt <= gf_max )); do + gf_body=$(curl -sS "http://localhost:3000/api/health" 2>/dev/null || true) + gf_code=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:3000/api/health" || echo 000) + if [[ "$gf_code" == "200" ]] && echo "$gf_body" | grep -q '"database"\s*:\s*"ok"'; then + echo "[OK] Grafana health database=ok" + break + fi + echo "[..] waiting grafana health ($gf_attempt/$gf_max), last_code=$gf_code" + sleep 3; ((gf_attempt++)) +done +if (( gf_attempt > gf_max )); then + echo "[ERR] Grafana /api/health not ready" >&2; exit 1 +fi + +# Alertmanager +wait_http "http://localhost:9093/api/v2/status" 120 + +# Web proxy checks(按端口细化) +code_for() { curl -s -o /dev/null -w "%{http_code}" "$1" || echo 000; } +header_val() { curl -s -D - -o /dev/null "$@" | awk -F': ' 'BEGIN{IGNORECASE=1}$1=="Access-Control-Allow-Origin"{gsub("\r","",$2);print $2}'; } + +echo "[INFO] Checking web-proxy ports..." + +# 8080 首页必须 200 +tries=1; max=60; P8080_T0=$(date +%s) +while (( tries <= max )); do + c=$(code_for "http://localhost:8080/") + if [[ "$c" == "200" ]]; then P8080_T1=$(date +%s); echo "[OK] 8080 / ($c) in $((P8080_T1-P8080_T0))s"; break; fi + echo "[..] waiting 8080/ ($tries/$max), code=$c"; sleep 3; ((tries++)) +done +(( tries <= max )) || { echo "[ERR] 8080/ not ready" >&2; exit 1; } + +# 8083 Kibana 允许 200/302(上面已就绪,端口侧再快速确认) +tries=1; max=40; P8083_T0=$(date +%s) +while (( tries <= max )); do + c=$(code_for "http://localhost:8083/") + if [[ "$c" == "200" || "$c" == "302" ]]; then P8083_T1=$(date +%s); echo "[OK] 8083 / ($c) in $((P8083_T1-P8083_T0))s"; break; fi + echo "[..] waiting 8083/ ($tries/$max), code=$c"; sleep 3; ((tries++)) +done +(( tries <= max )) || { echo "[ERR] 8083/ not ready" >&2; exit 1; } + +# 8084 Alertmanager + CORS +P8084_T0=$(date +%s); wait_http "http://localhost:8084/api/v2/status" 60; P8084_T1=$(date +%s) +cors=$(header_val -H "Origin: http://localhost:8080" "http://localhost:8084/api/v2/status" || true) +if [[ -z "$cors" ]]; then echo "[ERR] 8084 CORS missing" >&2; exit 1; else echo "[OK] 8084 CORS: $cors in $((P8084_T1-P8084_T0))s"; fi + +# 8085 Master /readyz + CORS(API 走 8085 才需跨域) +P8085_T0=$(date +%s); wait_http "http://localhost:8085/readyz" 60; P8085_T1=$(date +%s) +cors=$(header_val -H "Origin: http://localhost:8080" "http://localhost:8085/api/v1/master/nodes" || true) +if [[ -z "$cors" ]]; then echo "[ERR] 8085 CORS missing" >&2; exit 1; else echo "[OK] 8085 CORS: $cors in $((P8085_T1-P8085_T0))s"; fi + echo "[OK] All services are ready" diff --git a/src/sys/tests/scripts/08_restart_agent_reregister.sh b/src/sys/tests/scripts/08_restart_agent_reregister.sh index baa763d..6af9a0d 100755 --- a/src/sys/tests/scripts/08_restart_agent_reregister.sh +++ b/src/sys/tests/scripts/08_restart_agent_reregister.sh @@ -49,7 +49,7 @@ compose() { fi } -echo "[INFO] Recreating node-b with static IP 172.29.0.200..." +echo "[INFO] Recreating node-b with static IP 172.31.0.200..." pushd "$TEST_ROOT" >/dev/null compose -p argus-sys rm -sf node-b || true popd >/dev/null @@ -77,8 +77,8 @@ docker run -d \ --name argus-node-b \ --hostname dev-yyrshare-uuuu10-ep2f-pod-0 \ --network "$SYSNET_NAME" \ - --ip 172.29.0.200 \ - --dns 172.29.0.2 \ + --ip 172.31.0.200 \ + --dns 172.31.0.2 \ -e MASTER_ENDPOINT=http://master.argus.com:3000 \ -e REPORT_INTERVAL_SECONDS=2 \ -e ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} \ @@ -105,15 +105,15 @@ node=json.load(open(sys.argv[1])) last0=sys.argv[2] ip=node.get("meta_data",{}).get("ip") lu=node.get("last_updated") -assert ip=="172.29.0.200" +assert ip=="172.31.0.200" assert lu and lu!=last0 PY then - echo "[OK] node-b re-registered with new IP 172.29.0.200" + echo "[OK] node-b re-registered with new IP 172.31.0.200" exit 0 fi fi done -echo "[ERR] node-b did not update to IP 172.29.0.200 in time" >&2 +echo "[ERR] node-b did not update to IP 172.31.0.200 in time" >&2 exit 1 diff --git a/src/sys/tests/scripts/11_metric_node_install.sh b/src/sys/tests/scripts/11_metric_node_install.sh index 11a6104..63ff81b 100755 --- a/src/sys/tests/scripts/11_metric_node_install.sh +++ b/src/sys/tests/scripts/11_metric_node_install.sh @@ -16,7 +16,7 @@ if ! docker ps --format '{{.Names}}' | grep -q "^${CONTAINER}$"; then exit 1 fi -FTP_HOST="${FTP_SERVER:-172.29.0.40}" +FTP_HOST="${FTP_SERVER:-172.31.0.40}" FTP_USER="${FTP_USER:-ftpuser}" FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}" FTP_PORT="${FTP_PORT:-21}" diff --git a/src/sys/tests/scripts/12_metric_gpu_install.sh b/src/sys/tests/scripts/12_metric_gpu_install.sh index ba3b875..917221a 100755 --- a/src/sys/tests/scripts/12_metric_gpu_install.sh +++ b/src/sys/tests/scripts/12_metric_gpu_install.sh @@ -23,7 +23,7 @@ if ! docker ps --format '{{.Names}}' | grep -q "^${CONTAINER}$"; then exit 1 fi -FTP_HOST="${FTP_SERVER:-172.29.0.40}" +FTP_HOST="${FTP_SERVER:-172.31.0.40}" FTP_USER="${FTP_USER:-ftpuser}" FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}" FTP_PORT="${FTP_PORT:-21}" diff --git a/src/sys/tests/scripts/13_metric_verify_dataplane.sh b/src/sys/tests/scripts/13_metric_verify_dataplane.sh index 527aae8..4c22faf 100755 --- a/src/sys/tests/scripts/13_metric_verify_dataplane.sh +++ b/src/sys/tests/scripts/13_metric_verify_dataplane.sh @@ -5,7 +5,7 @@ TMP_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")"/.. && pwd)/tmp/metric-verify" mkdir -p "$TMP_DIR" PROM_BASE="http://localhost:9090/api/v1" -INSTANCE="${METRIC_TEST_INSTANCE:-172.29.0.50:9100}" +INSTANCE="${METRIC_TEST_INSTANCE:-172.31.0.50:9100}" IP_ONLY="${INSTANCE%%:*}" echo "[VERIFY:DATA] node exporter metrics present in container" diff --git a/src/web/build_tools/frontend/nginx.conf b/src/web/build_tools/frontend/nginx.conf index 93491ae..7addad2 100644 --- a/src/web/build_tools/frontend/nginx.conf +++ b/src/web/build_tools/frontend/nginx.conf @@ -1,4 +1,4 @@ -user web; +user root; worker_processes auto; events { diff --git a/src/web/build_tools/frontend/supervisord.conf b/src/web/build_tools/frontend/supervisord.conf index ee7c3b3..36244aa 100644 --- a/src/web/build_tools/frontend/supervisord.conf +++ b/src/web/build_tools/frontend/supervisord.conf @@ -18,7 +18,7 @@ stopasgroup=true [program:web-health] command=/usr/local/bin/health-check.sh -user=web +user=root stdout_logfile=/var/log/supervisor/web-health.log stderr_logfile=/var/log/supervisor/web-health_error.log autorestart=true diff --git a/src/web/build_tools/proxy/Dockerfile b/src/web/build_tools/proxy/Dockerfile index 748b384..870afef 100644 --- a/src/web/build_tools/proxy/Dockerfile +++ b/src/web/build_tools/proxy/Dockerfile @@ -66,13 +66,16 @@ RUN mkdir -p /var/log/supervisor # 复制启动脚本 COPY src/web/build_tools/proxy/start-proxy-supervised.sh /usr/local/bin/start-proxy-supervised.sh RUN chmod +x /usr/local/bin/start-proxy-supervised.sh +COPY src/web/build_tools/proxy/start-proxy-retry.sh /usr/local/bin/start-proxy-retry.sh +RUN chmod +x /usr/local/bin/start-proxy-retry.sh # 复制 DNS 监控脚本 -COPY src/web/build_tools/proxy/dns-monitor.sh /usr/local/bin/dns-monitor.sh +# 统一复用 bind 模块的 dns-monitor 脚本,保持行为一致 +COPY src/bind/build/dns-monitor.sh /usr/local/bin/dns-monitor.sh RUN chmod +x /usr/local/bin/dns-monitor.sh # 暴露端口 -EXPOSE 80 +EXPOSE 80 8080 8081 8082 8083 8084 8085 # 保持 root 用户,由 supervisor 控制 user 切换 USER root diff --git a/src/web/build_tools/proxy/conf.d/alert.conf b/src/web/build_tools/proxy/conf.d/alert.conf deleted file mode 100644 index 1aa9224..0000000 --- a/src/web/build_tools/proxy/conf.d/alert.conf +++ /dev/null @@ -1,9 +0,0 @@ -server { - listen 80; - server_name alertmanager.alert.argus.com; - - location / { - set $alert_backend http://alertmanager.alert.argus.com:9093; - proxy_pass $alert_backend; - } -} diff --git a/src/web/build_tools/proxy/conf.d/log.conf b/src/web/build_tools/proxy/conf.d/log.conf deleted file mode 100644 index 0441bb5..0000000 --- a/src/web/build_tools/proxy/conf.d/log.conf +++ /dev/null @@ -1,21 +0,0 @@ -# Elasticsearch -server { - listen 80; - server_name es.log.argus.com; - - location / { - set $es_backend http://es.log.argus.com:9200; - proxy_pass $es_backend; - } -} - -# Kibana -server { - listen 80; - server_name kibana.log.argus.com; - - location / { - set $kibana_backend http://kibana.log.argus.com:5601; - proxy_pass $kibana_backend; - } -} diff --git a/src/web/build_tools/proxy/conf.d/master.conf b/src/web/build_tools/proxy/conf.d/master.conf deleted file mode 100644 index a85a99f..0000000 --- a/src/web/build_tools/proxy/conf.d/master.conf +++ /dev/null @@ -1,27 +0,0 @@ -server { - listen 80; - server_name master.argus.com; - - location / { - set $master_backend http://master.argus.com:3000; - proxy_pass $master_backend; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - - # CORS 支持 - add_header 'Access-Control-Allow-Origin' 'http://web.argus.com' always; - add_header 'Access-Control-Allow-Methods' 'GET, POST, PUT, DELETE, OPTIONS' always; - add_header 'Access-Control-Allow-Headers' 'Origin, Content-Type, Accept, Authorization' always; - - if ($request_method = OPTIONS) { - add_header 'Access-Control-Allow-Origin' 'http://web.argus.com' always; - add_header 'Access-Control-Allow-Methods' 'GET, POST, PUT, DELETE, OPTIONS' always; - add_header 'Access-Control-Allow-Headers' 'Origin, Content-Type, Accept, Authorization' always; - add_header 'Content-Length' 0; - add_header 'Content-Type' 'text/plain'; - return 204; - } - } -} diff --git a/src/web/build_tools/proxy/conf.d/metric.conf b/src/web/build_tools/proxy/conf.d/metric.conf deleted file mode 100644 index 81d68c2..0000000 --- a/src/web/build_tools/proxy/conf.d/metric.conf +++ /dev/null @@ -1,21 +0,0 @@ -# Prometheus -server { - listen 80; - server_name prometheus.metric.argus.com; - - location / { - set $prom_backend http://prom.metric.argus.com:9090; - proxy_pass $prom_backend; - } -} - -# Grafana -server { - listen 80; - server_name grafana.metric.argus.com; - - location / { - set $grafana_backend http://grafana.metric.argus.com:3000; - proxy_pass $grafana_backend; - } -} diff --git a/src/web/build_tools/proxy/conf.d/ports.conf b/src/web/build_tools/proxy/conf.d/ports.conf new file mode 100644 index 0000000..c592518 --- /dev/null +++ b/src/web/build_tools/proxy/conf.d/ports.conf @@ -0,0 +1,94 @@ +map $http_upgrade $connection_upgrade { default upgrade; "" close; } + +# 允许的跨域来源(仅用于 8084/8085) +map $http_origin $cors_allow { + default ""; + "http://localhost:8080" "http://localhost:8080"; + "http://127.0.0.1:8080" "http://127.0.0.1:8080"; +} + +# 8080 - Portal +server { + listen 8080; + server_name _; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection $connection_upgrade; + proxy_http_version 1.1; + location / { proxy_pass http://web.argus.com:8080/; } +} + +# 8081 - Grafana +server { + listen 8081; + server_name _; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection $connection_upgrade; + proxy_http_version 1.1; + location / { proxy_pass http://grafana.metric.argus.com:3000/; } +} + +# 8082 - Prometheus +server { + listen 8082; + server_name _; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_http_version 1.1; + location / { proxy_pass http://prom.metric.argus.com:9090/; } +} + +# 8083 - Kibana +server { + listen 8083; + server_name _; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection $connection_upgrade; + proxy_http_version 1.1; + location / { proxy_pass http://kibana.log.argus.com:5601/; } +} + +# 8084 - Alertmanager(含 CORS) +server { + listen 8084; + server_name _; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + add_header 'Access-Control-Allow-Origin' $cors_allow always; + add_header 'Access-Control-Allow-Methods' 'GET, POST, PUT, DELETE, OPTIONS' always; + add_header 'Access-Control-Allow-Headers' 'Origin, Content-Type, Accept, Authorization' always; + if ($request_method = OPTIONS) { return 204; } + proxy_http_version 1.1; + location / { proxy_pass http://alertmanager.alert.argus.com:9093/; } +} + +# 8085 - Master(新增,含 CORS) +server { + listen 8085; + server_name _; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + add_header 'Access-Control-Allow-Origin' $cors_allow always; + add_header 'Access-Control-Allow-Methods' 'GET, POST, PUT, DELETE, OPTIONS' always; + add_header 'Access-Control-Allow-Headers' 'Origin, Content-Type, Accept, Authorization' always; + if ($request_method = OPTIONS) { return 204; } + proxy_http_version 1.1; + location / { proxy_pass http://master.argus.com:3000/; } +} diff --git a/src/web/build_tools/proxy/conf.d/web.conf b/src/web/build_tools/proxy/conf.d/web.conf deleted file mode 100644 index 27397d0..0000000 --- a/src/web/build_tools/proxy/conf.d/web.conf +++ /dev/null @@ -1,9 +0,0 @@ -server { - listen 80; - server_name web.argus.com; - - location / { - set $web_backend http://web.argus.com:8080; - proxy_pass $web_backend; - } -} diff --git a/src/web/build_tools/proxy/dns-monitor.sh b/src/web/build_tools/proxy/dns-monitor.sh deleted file mode 100644 index 2890b47..0000000 --- a/src/web/build_tools/proxy/dns-monitor.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/bin/bash - -# DNS监控脚本 - 每10秒检查dns.conf是否有变化 -# 如果有变化则执行update-dns.sh脚本 - -DNS_CONF="/private/argus/etc/dns.conf" -DNS_BACKUP="/tmp/dns.conf.backup" -UPDATE_SCRIPT="/private/argus/etc/update-dns.sh" -LOG_FILE="/var/log/supervisor/dns-monitor.log" - -# 确保日志文件存在 -touch "$LOG_FILE" - -log_message() { - echo "$(date '+%Y-%m-%d %H:%M:%S') [DNS-Monitor] $1" >> "$LOG_FILE" -} - -log_message "DNS监控脚本启动" - -while true; do - if [ -f "$DNS_CONF" ]; then - if [ -f "$DNS_BACKUP" ]; then - # 比较文件内容 - if ! cmp -s "$DNS_CONF" "$DNS_BACKUP"; then - log_message "检测到DNS配置变化" - - # 更新备份文件 - cp "$DNS_CONF" "$DNS_BACKUP" - - # 执行更新脚本 - if [ -x "$UPDATE_SCRIPT" ]; then - log_message "执行DNS更新脚本: $UPDATE_SCRIPT" - "$UPDATE_SCRIPT" >> "$LOG_FILE" 2>&1 - if [ $? -eq 0 ]; then - log_message "DNS更新脚本执行成功" - else - log_message "DNS更新脚本执行失败" - fi - else - log_message "警告: 更新脚本不存在或不可执行: $UPDATE_SCRIPT" - fi - fi - else - - # 第一次检测到配置文件,执行更新脚本 - if [ -x "$UPDATE_SCRIPT" ]; then - log_message "执行DNS更新脚本: $UPDATE_SCRIPT" - "$UPDATE_SCRIPT" >> "$LOG_FILE" 2>&1 - if [ $? -eq 0 ]; then - log_message "DNS更新脚本执行成功" - - # 第一次运行,创建备份并执行更新 - cp "$DNS_CONF" "$DNS_BACKUP" - log_message "创建DNS配置备份文件" - - else - log_message "DNS更新脚本执行失败" - fi - else - log_message "警告: 更新脚本不存在或不可执行: $UPDATE_SCRIPT" - fi - fi - else - log_message "警告: DNS配置文件不存在: $DNS_CONF" - fi - - sleep 10 -done diff --git a/src/web/build_tools/proxy/nginx.conf.template b/src/web/build_tools/proxy/nginx.conf.template index 41e29ec..5fb04ba 100644 --- a/src/web/build_tools/proxy/nginx.conf.template +++ b/src/web/build_tools/proxy/nginx.conf.template @@ -1,4 +1,4 @@ -user web_proxy; +user root; worker_processes auto; events { @@ -13,6 +13,7 @@ http { # 使用系统 resolv.conf(由 update-dns.sh 动态更新) resolver __RESOLVERS__ valid=30s ipv6=off; + resolver_timeout 5s; # 启用访问日志 access_log /var/log/nginx/access.log; diff --git a/src/web/build_tools/proxy/start-proxy-retry.sh b/src/web/build_tools/proxy/start-proxy-retry.sh new file mode 100644 index 0000000..73d3baa --- /dev/null +++ b/src/web/build_tools/proxy/start-proxy-retry.sh @@ -0,0 +1,20 @@ +#!/bin/sh +set -eu + +MAX=${RETRY_MAX:-10} +DELAY=${RETRY_DELAY:-10} +ATTEMPT=1 + +echo "[INFO] proxy retry wrapper: max=${MAX}, delay=${DELAY}s" + +while [ "$ATTEMPT" -le "$MAX" ]; do + echo "[INFO] starting proxy attempt ${ATTEMPT}/${MAX}" + /usr/local/bin/start-proxy-supervised.sh && exit 0 || true + echo "[WARN] proxy exited (attempt ${ATTEMPT}/${MAX}); sleeping ${DELAY}s before retry" + sleep "$DELAY" + ATTEMPT=$((ATTEMPT+1)) +done + +echo "[ERROR] proxy failed after ${MAX} attempts" +exit 1 + diff --git a/src/web/build_tools/proxy/start-proxy-supervised.sh b/src/web/build_tools/proxy/start-proxy-supervised.sh index ac276dd..d8dba07 100644 --- a/src/web/build_tools/proxy/start-proxy-supervised.sh +++ b/src/web/build_tools/proxy/start-proxy-supervised.sh @@ -46,6 +46,10 @@ echo "检测到 DNS 服务器列表: $RESOLVERS" # ========== 生成 nginx.conf ========== if [ -f "$TEMPLATE" ]; then echo "从模板生成 nginx.conf ..." + # 合并 Docker 内置 DNS 以保障解析 Compose 服务名 + if ! echo " $RESOLVERS " | grep -q " 127.0.0.11 "; then + RESOLVERS="127.0.0.11 ${RESOLVERS}" + fi sed "s|__RESOLVERS__|$RESOLVERS|" "$TEMPLATE" > "$TARGET" else echo "错误: 找不到 nginx.conf.template ($TEMPLATE)" @@ -55,6 +59,33 @@ fi # 打印生成结果供排查 grep resolver "$TARGET" || true +# ========== 等待上游域名准备(避免启动即解析失败) ========== +UPSTREAM_DOMAINS=( + web.argus.com + grafana.metric.argus.com + prom.metric.argus.com + kibana.log.argus.com + alertmanager.alert.argus.com + master.argus.com +) +WAIT_MAX=15 +WAITED=0 +MISSING=() +while :; do + MISSING=() + for d in "${UPSTREAM_DOMAINS[@]}"; do + if [ ! -s "/private/argus/etc/${d}" ]; then + MISSING+=("$d") + fi + done + if [ ${#MISSING[@]} -eq 0 ] || [ "$WAITED" -ge "$WAIT_MAX" ]; then + break + fi + echo "[INFO] 等待上游域名记录生成(${WAITED}/${WAIT_MAX}) 缺失: ${MISSING[*]}" + sleep 1 + WAITED=$((WAITED+1)) +done + echo "[INFO] Launching nginx..." # 启动 nginx 前台模式 diff --git a/src/web/build_tools/proxy/supervisord.conf b/src/web/build_tools/proxy/supervisord.conf index 57bdfc5..3f668ab 100644 --- a/src/web/build_tools/proxy/supervisord.conf +++ b/src/web/build_tools/proxy/supervisord.conf @@ -5,12 +5,12 @@ pidfile=/var/run/supervisord.pid user=root [program:proxy] -command=/usr/local/bin/start-proxy-supervised.sh +command=/usr/local/bin/start-proxy-retry.sh user=root stdout_logfile=/var/log/supervisor/web-proxy.log stderr_logfile=/var/log/supervisor/web-proxy_error.log autorestart=true -startretries=3 +startretries=10 startsecs=5 stopwaitsecs=10 killasgroup=true diff --git a/src/web/src/config/api.js b/src/web/src/config/api.js index ef8a71b..44ce73f 100644 --- a/src/web/src/config/api.js +++ b/src/web/src/config/api.js @@ -1,30 +1,42 @@ // config/api.js -// Master 节点相关 API +// 运行时解析主机名,统一按端口访问多服务 +const HOST = (typeof window !== 'undefined' && (window.__ARGUS_PUBLIC_HOST__ || window.location.hostname)) || 'localhost'; + +const PORTS = { + MASTER: 8085, // 经网关(含 CORS) + ALERTMANAGER: 8084, + GRAFANA: 8081, + PROMETHEUS: 8082, + KIBANA: 8083, +}; + +const BASE = { + MASTER: `http://${HOST}:${PORTS.MASTER}`, + ALERT: `http://${HOST}:${PORTS.ALERTMANAGER}`, + GRAFANA: `http://${HOST}:${PORTS.GRAFANA}`, + PROM: `http://${HOST}:${PORTS.PROMETHEUS}`, + KIBANA: `http://${HOST}:${PORTS.KIBANA}`, +}; + +// Master 节点相关 API(统一走 8085) export const MASTER_API = { - // 节点列表 - LIST: "http://master.argus.com/api/v1/master/nodes", - - // 节点详情(需要 nodeId) - DETAIL: (nodeId) => `http://master.argus.com/api/v1/master/nodes/${nodeId}`, - - // 节点配置(需要 nodeId) - CONFIG: (nodeId) => `http://master.argus.com/api/v1/master/nodes/${nodeId}/config`, - - // 节点统计信息 - STATISTICS: "http://master.argus.com/api/v1/master/nodes/statistics", + LIST: `${BASE.MASTER}/api/v1/master/nodes`, + DETAIL: (nodeId) => `${BASE.MASTER}/api/v1/master/nodes/${nodeId}`, + CONFIG: (nodeId) => `${BASE.MASTER}/api/v1/master/nodes/${nodeId}/config`, + STATISTICS: `${BASE.MASTER}/api/v1/master/nodes/statistics`, }; -// 其他外部 API +// 其他外部 API(8084) export const EXTERNAL_API = { - ALERTS_INFOS: "http://alertmanager.alert.argus.com/api/v2/alerts", + ALERTS_INFOS: `${BASE.ALERT}/api/v2/alerts`, }; -// 外部服务 Host +// 外部服务 Host(端口化) export const EXTERNAL_HOST = { - ALERTS: "http://alertmanager.alert.argus.com", - GRAFANA: "http://grafana.metric.argus.com", - GRAFANA_DASHBOARD: "http://grafana.metric.argus.com/d/cluster-dashboard/cluster-dashboard", - PROMETHEUS: "http://prometheus.metric.argus.com", - KIBANA: "http://kibana.log.argus.com/app/discover", + ALERTS: `${BASE.ALERT}`, + GRAFANA: `${BASE.GRAFANA}`, + GRAFANA_DASHBOARD: `${BASE.GRAFANA}/d/cluster-dashboard/cluster-dashboard`, + PROMETHEUS: `${BASE.PROM}`, + KIBANA: `${BASE.KIBANA}/app/discover`, };