[#29] 完成proxy到web/grafana/prom/alert,遗留web 列表页信息获取不到

This commit is contained in:
yuyr 2025-10-24 12:21:33 +08:00
parent 1d4208ed3c
commit 2c799f2c1e
30 changed files with 519 additions and 289 deletions

View File

@ -12,6 +12,7 @@ Options:
--master-offline Build master offline image (requires src/master/offline_wheels.tar.gz)
--metric Build metric module images (ftp, prometheus, grafana, test nodes)
--no-cache Build all images without using Docker layer cache
--only LIST Comma-separated targets to build: core,master,metric,web,alert,all
-h, --help Show this help message
Examples:
@ -24,9 +25,12 @@ EOF
}
use_intranet=false
build_core=true
build_master=true
build_master_offline=false
build_metric=true
build_web=true
build_alert=true
no_cache=false
while [[ $# -gt 0 ]]; do
@ -52,6 +56,26 @@ while [[ $# -gt 0 ]]; do
no_cache=true
shift
;;
--only)
if [[ -z ${2:-} ]]; then
echo "--only requires a target list" >&2; exit 1
fi
sel="$2"; shift 2
# reset all, then enable selected
build_core=false; build_master=false; build_metric=false; build_web=false; build_alert=false
IFS=',' read -ra parts <<< "$sel"
for p in "${parts[@]}"; do
case "$p" in
core) build_core=true ;;
master) build_master=true ;;
metric) build_metric=true ;;
web) build_web=true ;;
alert) build_alert=true ;;
all) build_core=true; build_master=true; build_metric=true; build_web=true; build_alert=true ;;
*) echo "Unknown --only target: $p" >&2; exit 1 ;;
esac
done
;;
-h|--help)
show_help
exit 0
@ -177,26 +201,28 @@ pull_base_image() {
images_built=()
build_failed=false
if build_image "Elasticsearch" "src/log/elasticsearch/build/Dockerfile" "argus-elasticsearch:latest"; then
if [[ "$build_core" == true ]]; then
if build_image "Elasticsearch" "src/log/elasticsearch/build/Dockerfile" "argus-elasticsearch:latest"; then
images_built+=("argus-elasticsearch:latest")
else
else
build_failed=true
fi
fi
echo ""
echo ""
if build_image "Kibana" "src/log/kibana/build/Dockerfile" "argus-kibana:latest"; then
if build_image "Kibana" "src/log/kibana/build/Dockerfile" "argus-kibana:latest"; then
images_built+=("argus-kibana:latest")
else
else
build_failed=true
fi
fi
echo ""
echo ""
if build_image "BIND9" "src/bind/build/Dockerfile" "argus-bind9:latest"; then
if build_image "BIND9" "src/bind/build/Dockerfile" "argus-bind9:latest"; then
images_built+=("argus-bind9:latest")
else
else
build_failed=true
fi
fi
echo ""
@ -264,27 +290,28 @@ fi
# Web & Alert module images
# =======================================
echo ""
echo "Building Web and Alert module images..."
if [[ "$build_web" == true || "$build_alert" == true ]]; then
echo ""
echo "Building Web and Alert module images..."
# Pre-pull commonly used base images for stability
web_alert_base_images=(
# Pre-pull commonly used base images for stability
web_alert_base_images=(
"node:20"
"ubuntu:24.04"
)
)
for base_image in "${web_alert_base_images[@]}"; do
for base_image in "${web_alert_base_images[@]}"; do
if ! pull_base_image "$base_image"; then
build_failed=true
fi
done
done
web_builds=(
if [[ "$build_web" == true ]]; then
web_builds=(
"Web Frontend|src/web/build_tools/frontend/Dockerfile|argus-web-frontend:latest|."
"Web Proxy|src/web/build_tools/proxy/Dockerfile|argus-web-proxy:latest|."
)
for build_spec in "${web_builds[@]}"; do
)
for build_spec in "${web_builds[@]}"; do
IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec"
if build_image "$image_label" "$dockerfile_path" "$image_tag" "$build_context"; then
images_built+=("$image_tag")
@ -292,13 +319,14 @@ for build_spec in "${web_builds[@]}"; do
build_failed=true
fi
echo ""
done
done
fi
alert_builds=(
if [[ "$build_alert" == true ]]; then
alert_builds=(
"Alertmanager|src/alert/alertmanager/build/Dockerfile|argus-alertmanager:latest|."
)
for build_spec in "${alert_builds[@]}"; do
)
for build_spec in "${alert_builds[@]}"; do
IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec"
if build_image "$image_label" "$dockerfile_path" "$image_tag" "$build_context"; then
images_built+=("$image_tag")
@ -306,7 +334,9 @@ for build_spec in "${alert_builds[@]}"; do
build_failed=true
fi
echo ""
done
done
fi
fi
echo "======================================="
echo "📦 Build Summary"

View File

@ -12,6 +12,8 @@ VENV_DIR="$BUILD_ROOT/venv"
AGENT_BUILD_IMAGE="${AGENT_BUILD_IMAGE:-python:3.11-slim-bullseye}"
AGENT_BUILD_USE_DOCKER="${AGENT_BUILD_USE_DOCKER:-1}"
# 默认在容器内忽略代理以避免公司内网代理在 Docker 网络不可达导致 pip 失败(可用 0 关闭)
AGENT_BUILD_IGNORE_PROXY="${AGENT_BUILD_IGNORE_PROXY:-1}"
USED_DOCKER=0
run_host_build() {
@ -71,6 +73,7 @@ run_docker_build() {
pass_env_if_set http_proxy
pass_env_if_set https_proxy
pass_env_if_set no_proxy
pass_env_if_set AGENT_BUILD_IGNORE_PROXY
build_script=$(cat <<'INNER'
set -euo pipefail
@ -82,6 +85,10 @@ rm -rf build dist
mkdir -p build/pyinstaller dist
python3 -m venv --copies build/venv
source build/venv/bin/activate
# 若指定忽略代理,则清空常见代理与 pip 镜像环境变量,避免容器内代理不可达
if [ "${AGENT_BUILD_IGNORE_PROXY:-1}" = "1" ]; then
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY PIP_INDEX_URL PIP_EXTRA_INDEX_URL PIP_TRUSTED_HOST
fi
pip install --upgrade pip
pip install .
pip install pyinstaller==6.6.0

View File

@ -9,14 +9,14 @@ RUN apt-get update && \
apt-get install -y wget supervisor net-tools inetutils-ping vim ca-certificates passwd && \
apt-get clean && rm -rf /var/lib/apt/lists/*
# 设置 Alertmanager 版本
# 设置 Alertmanager 版本(与本地离线包保持一致)
ARG ALERTMANAGER_VERSION=0.28.1
# 下载并解压 Alertmanager 二进制
RUN wget https://github.com/prometheus/alertmanager/releases/download/v${ALERTMANAGER_VERSION}/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz && \
tar xvf alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz && \
mv alertmanager-${ALERTMANAGER_VERSION}.linux-amd64 /usr/local/alertmanager && \
rm alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz
# 使用仓库内预置的离线包构建(无需联网)
COPY src/alert/alertmanager/build/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz /tmp/
RUN tar xvf /tmp/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz -C /tmp && \
mv /tmp/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64 /usr/local/alertmanager && \
rm -f /tmp/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz
ENV ALERTMANAGER_BASE_PATH=/private/argus/alert/alertmanager
@ -34,21 +34,20 @@ RUN mkdir -p /usr/share/alertmanager && \
# 创建 alertmanager 用户(可自定义 UID/GID
# 创建 alertmanager 用户组
RUN set -eux; \
# 确保目标 GID 存在;若已被占用,直接使用该 GID组名不限\
if ! getent group "${ARGUS_BUILD_GID}" >/dev/null; then \
groupadd -g "${ARGUS_BUILD_GID}" alertmanager || true; \
fi; \
if id alertmanager >/dev/null 2>&1; then \
current_uid="$(id -u alertmanager)"; \
if [ "$current_uid" != "${ARGUS_BUILD_UID}" ] && ! getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \
usermod -u "${ARGUS_BUILD_UID}" alertmanager; \
# 确保存在 alertmanager 用户;若 UID 已被占用,跳过并继续使用现有 UID 的用户
if ! id alertmanager >/dev/null 2>&1; then \
if getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \
# UID 已占用,则创建同名用户但不指定 UID避免冲突仅保证 user 存在
useradd -M -s /usr/sbin/nologin -g "${ARGUS_BUILD_GID}" alertmanager || true; \
else \
useradd -M -s /usr/sbin/nologin -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" alertmanager || true; \
fi; \
else \
usermod -g "${ARGUS_BUILD_GID}" alertmanager || true; \
else \
if ! getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \
useradd -M -s /usr/sbin/nologin -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" alertmanager; \
else \
echo "UID ${ARGUS_BUILD_UID} already exists; skip creating user 'alertmanager'"; \
fi; \
fi
RUN chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" /usr/share/alertmanager /alertmanager ${ALERTMANAGER_BASE_PATH} /private/argus/etc /usr/local/bin || true

View File

@ -0,0 +1,22 @@
#!/usr/bin/env bash
set -euo pipefail

# Download the Alertmanager offline tarball into this directory so the Docker
# build can COPY it without network access at image-build time.
# Usage:
#   ./fetch-dist.sh [version]
# Example:
#   ./fetch-dist.sh 0.28.1

VER="${1:-0.28.1}"
OUT="alertmanager-${VER}.linux-amd64.tar.gz"
URL="https://github.com/prometheus/alertmanager/releases/download/v${VER}/${OUT}"

if [[ -f "$OUT" ]]; then
  echo "[INFO] $OUT already exists, skip download"
  exit 0
fi

echo "[INFO] Downloading $URL"
# Download to a temp file and rename on success: an interrupted transfer must
# not leave a partial $OUT behind, otherwise the next run would see the file
# and skip the re-download, shipping a corrupt tarball into the image build.
TMP="${OUT}.part"
trap 'rm -f "$TMP"' EXIT
curl -fL --retry 3 --connect-timeout 10 -o "$TMP" "$URL"
mv "$TMP" "$OUT"
trap - EXIT
echo "[OK] Saved to $(pwd)/$OUT"

View File

@ -7,10 +7,8 @@ ALERTMANAGER_BASE_PATH=${ALERTMANAGER_BASE_PATH:-/private/argus/alert/alertmanag
echo "[INFO] Alertmanager base path: ${ALERTMANAGER_BASE_PATH}"
# 生成配置文件
echo "[INFO] Generating Alertmanager configuration file..."
sed "s|\${ALERTMANAGER_BASE_PATH}|${ALERTMANAGER_BASE_PATH}|g" \
/etc/alertmanager/alertmanager.yml > ${ALERTMANAGER_BASE_PATH}/alertmanager.yml
# 使用容器内的 /etc/alertmanager/alertmanager.yml 作为配置文件,避免写入挂载卷导致的权限问题
echo "[INFO] Using /etc/alertmanager/alertmanager.yml as configuration"
# 记录容器 IP 地址

View File

@ -42,7 +42,7 @@
- `./scripts/05_agent_register.sh` 获取两个节点的 `node_id` 与初始 IP检查本地 `node.json`
- `./scripts/06_write_health_and_assert.sh` 写健康文件并断言 `nodes.json` 仅包含 2 个在线节点
- `./scripts/07_logs_send_and_assert.sh` 向两个节点写日志,断言 ES `train-*`/`infer-*` 计数增长
- `./scripts/08_restart_agent_reregister.sh` `node-b` 改为固定 IP `172.29.0.200`,验证保持同一节点 ID 且 IP/时间戳更新
- `./scripts/08_restart_agent_reregister.sh` `node-b` 改为固定 IP `172.31.0.200`,验证保持同一节点 ID 且 IP/时间戳更新
- `./scripts/09_down.sh` 回收容器、网络并清理 `private*/``tmp/`
- 重置环境
@ -53,8 +53,8 @@
## 二、测试部署架构docker-compose
- 网络
- 自定义 bridge`argus-sys-net`,子网 `172.29.0.0/16`
- 固定地址bind=`172.29.0.2`master=`172.29.0.10`
- 自定义 bridge`argus-sys-net`,子网 `172.31.0.0/16`
- 固定地址bind=`172.31.0.2`master=`172.31.0.10`
- 服务与端口
- `bind``argus-bind9:latest`):监听 53/tcp+udp负责同步 `*.argus.com` 记录
@ -72,7 +72,7 @@
- 节点容器的 Fluent Bit/agent 资产以只读方式挂载到 `/assets`/`/usr/local/bin/argus-agent`
- DNS 配置
- 节点容器通过 compose 配置 `dns: [172.29.0.2]` 指向 bind不挂载 `/etc/resolv.conf`,也不依赖 `update-dns.sh`
- 节点容器通过 compose 配置 `dns: [172.31.0.2]` 指向 bind不挂载 `/etc/resolv.conf`,也不依赖 `update-dns.sh`
- master/es/kibana 仍共享 `./private`master 启动会写 `/private/argus/etc/master.argus.com` 供 bind 同步 A 记录
- 节点入口

View File

@ -4,7 +4,7 @@ networks:
ipam:
driver: default
config:
- subnet: 172.29.0.0/16
- subnet: 172.31.0.0/16
services:
bind:
@ -12,7 +12,7 @@ services:
container_name: argus-bind-sys
networks:
sysnet:
ipv4_address: 172.29.0.2
ipv4_address: 172.31.0.2
volumes:
- ./private:/private
restart: unless-stopped
@ -36,7 +36,7 @@ services:
- ./private/argus/etc:/private/argus/etc
networks:
sysnet:
ipv4_address: 172.29.0.10
ipv4_address: 172.31.0.10
restart: unless-stopped
es:
@ -56,7 +56,7 @@ services:
restart: unless-stopped
networks:
sysnet:
ipv4_address: 172.29.0.3
ipv4_address: 172.31.0.3
kibana:
image: argus-kibana:latest
@ -75,7 +75,7 @@ services:
restart: unless-stopped
networks:
sysnet:
ipv4_address: 172.29.0.4
ipv4_address: 172.31.0.4
node-a:
image: ubuntu:22.04
@ -104,7 +104,7 @@ services:
entrypoint:
- /usr/local/bin/node-entrypoint.sh
dns:
- 172.29.0.2
- 172.31.0.2
ports:
- "2020:2020"
restart: unless-stopped
@ -138,7 +138,7 @@ services:
entrypoint:
- /usr/local/bin/node-entrypoint.sh
dns:
- 172.29.0.2
- 172.31.0.2
ports:
- "2021:2020"
restart: unless-stopped
@ -167,7 +167,7 @@ services:
- /etc/timezone:/etc/timezone:ro
networks:
sysnet:
ipv4_address: 172.29.0.40
ipv4_address: 172.31.0.40
logging:
driver: "json-file"
options:
@ -192,7 +192,7 @@ services:
- /etc/timezone:/etc/timezone:ro
networks:
sysnet:
ipv4_address: 172.29.0.41
ipv4_address: 172.31.0.41
logging:
driver: "json-file"
options:
@ -223,7 +223,7 @@ services:
- /etc/timezone:/etc/timezone:ro
networks:
sysnet:
ipv4_address: 172.29.0.42
ipv4_address: 172.31.0.42
depends_on:
- prometheus
logging:
@ -232,6 +232,25 @@ services:
max-size: "10m"
max-file: "3"
# --- Added: Web Frontend (no host port; resolved by DNS as web.argus.com) ---
web-frontend:
image: argus-web-frontend:latest
container_name: argus-web-frontend
environment:
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
volumes:
- ./private/argus/etc:/private/argus/etc
networks:
sysnet:
ipv4_address: 172.31.0.80
restart: unless-stopped
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"
test-node:
image: ubuntu:22.04
container_name: argus-metric-test-node
@ -245,7 +264,7 @@ services:
- TZ=Asia/Shanghai
- DEBIAN_FRONTEND=noninteractive
- FTP_DOMAIN=${FTP_DOMAIN:-ftp.metric.argus.com}
- FTP_SERVER=${FTP_SERVER:-172.29.0.40}
- FTP_SERVER=${FTP_SERVER:-172.31.0.40}
- FTP_USER=${FTP_USER:-ftpuser}
- FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!}
- FTP_PORT=${FTP_PORT:-21}
@ -264,7 +283,7 @@ services:
- infinity
networks:
sysnet:
ipv4_address: 172.29.0.50
ipv4_address: 172.31.0.50
logging:
driver: "json-file"
options:
@ -311,7 +330,62 @@ services:
- infinity
networks:
sysnet:
ipv4_address: 172.29.0.51
ipv4_address: 172.31.0.51
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"
# --- Added: Alertmanager ---
alertmanager:
image: argus-alertmanager:latest
container_name: argus-alertmanager
environment:
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
volumes:
- ./private/argus/etc:/private/argus/etc
- ./private/argus/alert/alertmanager:/private/argus/alert/alertmanager
networks:
sysnet:
ipv4_address: 172.31.0.82
ports:
- "9093:9093"
restart: unless-stopped
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"
# --- Added: Web Proxy (multi-port gateway) ---
web-proxy:
image: argus-web-proxy:latest
container_name: argus-web-proxy
depends_on:
- bind
- master
- grafana
- prometheus
- kibana
- alertmanager
environment:
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
volumes:
- ./private/argus/etc:/private/argus/etc
networks:
sysnet:
ipv4_address: 172.31.0.81
ports:
- "8080:8080"
- "8081:8081"
- "8082:8082"
- "8083:8083"
- "8084:8084"
- "8085:8085"
restart: unless-stopped
logging:
driver: "json-file"
options:

View File

@ -45,6 +45,7 @@ mkdir -p \
"$PRIVATE_CORE/argus/bind" \
"$PRIVATE_CORE/argus/master" \
"$PRIVATE_CORE/argus/metric/prometheus" \
"$PRIVATE_CORE/argus/alert/alertmanager" \
"$PRIVATE_CORE/argus/metric/ftp/share" \
"$PRIVATE_CORE/argus/metric/grafana/data" \
"$PRIVATE_CORE/argus/metric/grafana/logs" \
@ -71,10 +72,14 @@ chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" \
"$PRIVATE_CORE/argus/log/kibana" \
"$PRIVATE_CORE/argus/metric/grafana" \
"$PRIVATE_CORE/argus/metric/prometheus" \
"$PRIVATE_CORE/argus/alert" \
"$PRIVATE_CORE/argus/metric/ftp" \
"$PRIVATE_CORE/argus/agent" \
"$PRIVATE_CORE/argus/etc" 2>/dev/null || true
# 确保 alert 与 etc 目录组可写,便于非 root 且仅匹配 GID 的服务写入运行文件
chmod -R g+w "$PRIVATE_CORE/argus/alert" "$PRIVATE_CORE/argus/etc" 2>/dev/null || true
echo "[INFO] Using compose-managed network (auto-created by docker compose)"
echo "[INFO] Distributing update-dns.sh for core services (bind/master/es/kibana)"
@ -95,6 +100,9 @@ ensure_image "argus-master:latest"
ensure_image "argus-metric-ftp:latest"
ensure_image "argus-metric-prometheus:latest"
ensure_image "argus-metric-grafana:latest"
ensure_image "argus-web-frontend:latest"
ensure_image "argus-web-proxy:latest"
ensure_image "argus-alertmanager:latest"
echo "[INFO] Building agent binary..."
pushd "$REPO_ROOT/src/agent" >/dev/null

View File

@ -47,19 +47,40 @@ for name in argus-node-b; do
fi
done
# 预检:检查多端口网关所需宿主端口是否空闲
# Pre-flight guard: return 0 if TCP port $1 is free on the host, 1 otherwise.
# Inspect only the local-address column of `ss -ltn` (anchored ":PORT$") so a
# matching number elsewhere on the line — e.g. in the PID/program column of
# `-p` output — cannot cause a false positive, and a port at end-of-line
# (no trailing space) is no longer missed.
check_port_free() {
  local p="$1"
  if ss -ltn 2>/dev/null | awk 'NR>1 {print $4}' | grep -q ":${p}\$"; then
    echo "[ERR] Host port ${p} is already in use. Please free it before running 02_up.sh" >&2
    # Best-effort diagnostics: show the offending listener(s) with process info.
    ss -ltnp | awk -v p=":${p} " '$0 ~ p {print " " $0}' || true
    return 1
  fi
  return 0
}
for port in 8080 8081 8082 8083 8084 8085; do
check_port_free "$port" || { echo "[ERR] Required port busy: $port"; exit 1; }
done
# 根据GPU可用性决定启动的服务
if [[ "$GPU_AVAILABLE" == true ]]; then
echo "[INFO] 启动所有服务(包含 gpu profile..."
compose -p argus-sys --profile gpu up -d
compose -p argus-sys --profile gpu up -d || true
else
echo "[INFO] 启动基础服务(不含 gpu profile..."
compose -p argus-sys up -d
compose -p argus-sys up -d || true
fi
# 若 web-proxy 处于 Created 状态,尝试单独启动一次(处理偶发 Address already in use 后端已释放的场景)
if docker ps -a --format '{{.Names}}\t{{.Status}}' | grep -q '^argus-web-proxy\s\+Created'; then
echo "[WARN] web-proxy in Created state; retry starting it..."
docker start argus-web-proxy || true
fi
popd >/dev/null
if [[ "$GPU_AVAILABLE" == true ]]; then
echo "[OK] Services started: master:32300 es:9200 kibana:5601 node-a:2020 node-b:2021 test-gpu-node:172.29.0.51"
echo "[OK] Services started: master:32300 es:9200 kibana:5601 node-a:2020 node-b:2021 test-gpu-node:172.31.0.51"
else
echo "[OK] Services started: master:32300 es:9200 kibana:5601 node-a:2020 node-b:2021 (gpu skipped)"
fi

View File

@ -29,6 +29,7 @@ echo "[INFO] Waiting for ES/Kibana/Master/Fluent Bit/Bind..."
# ES (>= yellow)
attempt=1; max=120
ES_T0=$(date +%s)
while (( attempt <= max )); do
if curl -fsS "http://localhost:9200/_cluster/health?wait_for_status=yellow&timeout=1s" >/dev/null 2>&1; then
break
@ -36,16 +37,19 @@ while (( attempt <= max )); do
echo "[..] waiting ES ($attempt/$max)"; sleep 5; ((attempt++))
done
[[ $attempt -le $max ]] || { echo "[ERR] ES not ready" >&2; exit 1; }
ES_T1=$(date +%s); echo "[TIME] ES ready in $((ES_T1-ES_T0))s"
# Kibana: must be HTTP 200 and overall.level=available
echo "[INFO] Waiting for Kibana to be available (HTTP 200)..."
kb_attempt=1; kb_max=180
KB_T0=$(date +%s)
while (( kb_attempt <= kb_max )); do
body=$(curl -sS "http://localhost:5601/api/status" 2>/dev/null || true)
code=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:5601/api/status" || echo 000)
if [[ "$code" == "200" ]]; then
if echo "$body" | grep -q '"level":"available"'; then
echo "[OK] Kibana available (HTTP 200)"
KB_T1=$(date +%s)
echo "[OK] Kibana available (HTTP 200) in $((KB_T1-KB_T0))s"
break
fi
fi
@ -58,11 +62,13 @@ if (( kb_attempt > kb_max )); then
fi
# Master
MASTER_T0=$(date +%s)
wait_http "http://localhost:32300/readyz" 120
MASTER_T1=$(date +%s); echo "[TIME] Master readyz in $((MASTER_T1-MASTER_T0))s"
# Fluent Bit (host metrics on host ports)
wait_http "http://localhost:2020/api/v2/metrics" 120
wait_http "http://localhost:2021/api/v2/metrics" 120
FB1_T0=$(date +%s); wait_http "http://localhost:2020/api/v2/metrics" 120; FB1_T1=$(date +%s); echo "[TIME] FluentBit:2020 in $((FB1_T1-FB1_T0))s"
FB2_T0=$(date +%s); wait_http "http://localhost:2021/api/v2/metrics" 120; FB2_T1=$(date +%s); echo "[TIME] FluentBit:2021 in $((FB2_T1-FB2_T0))s"
# Bind config check
BIND_ID="$(service_id bind)"
@ -72,4 +78,63 @@ else
echo "[WARN] bind container id not found"
fi
# ========== Additional module readiness checks ==========
# Prometheus
PROM_T0=$(date +%s); wait_http "http://localhost:9090/-/ready" 120; PROM_T1=$(date +%s); echo "[TIME] Prometheus ready in $((PROM_T1-PROM_T0))s"
# Grafana health (database: ok)
echo "[INFO] Waiting for Grafana health..."
gf_attempt=1; gf_max=120
while (( gf_attempt <= gf_max )); do
gf_body=$(curl -sS "http://localhost:3000/api/health" 2>/dev/null || true)
gf_code=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:3000/api/health" || echo 000)
if [[ "$gf_code" == "200" ]] && echo "$gf_body" | grep -q '"database"\s*:\s*"ok"'; then
echo "[OK] Grafana health database=ok"
break
fi
echo "[..] waiting grafana health ($gf_attempt/$gf_max), last_code=$gf_code"
sleep 3; ((gf_attempt++))
done
if (( gf_attempt > gf_max )); then
echo "[ERR] Grafana /api/health not ready" >&2; exit 1
fi
# Alertmanager
wait_http "http://localhost:9093/api/v2/status" 120
# Web proxy checks按端口细化
code_for() { curl -s -o /dev/null -w "%{http_code}" "$1" || echo 000; }
header_val() { curl -s -D - -o /dev/null "$@" | awk -F': ' 'BEGIN{IGNORECASE=1}$1=="Access-Control-Allow-Origin"{gsub("\r","",$2);print $2}'; }
echo "[INFO] Checking web-proxy ports..."
# 8080 首页必须 200
tries=1; max=60; P8080_T0=$(date +%s)
while (( tries <= max )); do
c=$(code_for "http://localhost:8080/")
if [[ "$c" == "200" ]]; then P8080_T1=$(date +%s); echo "[OK] 8080 / ($c) in $((P8080_T1-P8080_T0))s"; break; fi
echo "[..] waiting 8080/ ($tries/$max), code=$c"; sleep 3; ((tries++))
done
(( tries <= max )) || { echo "[ERR] 8080/ not ready" >&2; exit 1; }
# 8083 Kibana 允许 200/302上面已就绪端口侧再快速确认
tries=1; max=40; P8083_T0=$(date +%s)
while (( tries <= max )); do
c=$(code_for "http://localhost:8083/")
if [[ "$c" == "200" || "$c" == "302" ]]; then P8083_T1=$(date +%s); echo "[OK] 8083 / ($c) in $((P8083_T1-P8083_T0))s"; break; fi
echo "[..] waiting 8083/ ($tries/$max), code=$c"; sleep 3; ((tries++))
done
(( tries <= max )) || { echo "[ERR] 8083/ not ready" >&2; exit 1; }
# 8084 Alertmanager + CORS
P8084_T0=$(date +%s); wait_http "http://localhost:8084/api/v2/status" 60; P8084_T1=$(date +%s)
cors=$(header_val -H "Origin: http://localhost:8080" "http://localhost:8084/api/v2/status" || true)
if [[ -z "$cors" ]]; then echo "[ERR] 8084 CORS missing" >&2; exit 1; else echo "[OK] 8084 CORS: $cors in $((P8084_T1-P8084_T0))s"; fi
# 8085 Master /readyz + CORSAPI 走 8085 才需跨域)
P8085_T0=$(date +%s); wait_http "http://localhost:8085/readyz" 60; P8085_T1=$(date +%s)
cors=$(header_val -H "Origin: http://localhost:8080" "http://localhost:8085/api/v1/master/nodes" || true)
if [[ -z "$cors" ]]; then echo "[ERR] 8085 CORS missing" >&2; exit 1; else echo "[OK] 8085 CORS: $cors in $((P8085_T1-P8085_T0))s"; fi
echo "[OK] All services are ready"

View File

@ -49,7 +49,7 @@ compose() {
fi
}
echo "[INFO] Recreating node-b with static IP 172.29.0.200..."
echo "[INFO] Recreating node-b with static IP 172.31.0.200..."
pushd "$TEST_ROOT" >/dev/null
compose -p argus-sys rm -sf node-b || true
popd >/dev/null
@ -77,8 +77,8 @@ docker run -d \
--name argus-node-b \
--hostname dev-yyrshare-uuuu10-ep2f-pod-0 \
--network "$SYSNET_NAME" \
--ip 172.29.0.200 \
--dns 172.29.0.2 \
--ip 172.31.0.200 \
--dns 172.31.0.2 \
-e MASTER_ENDPOINT=http://master.argus.com:3000 \
-e REPORT_INTERVAL_SECONDS=2 \
-e ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} \
@ -105,15 +105,15 @@ node=json.load(open(sys.argv[1]))
last0=sys.argv[2]
ip=node.get("meta_data",{}).get("ip")
lu=node.get("last_updated")
assert ip=="172.29.0.200"
assert ip=="172.31.0.200"
assert lu and lu!=last0
PY
then
echo "[OK] node-b re-registered with new IP 172.29.0.200"
echo "[OK] node-b re-registered with new IP 172.31.0.200"
exit 0
fi
fi
done
echo "[ERR] node-b did not update to IP 172.29.0.200 in time" >&2
echo "[ERR] node-b did not update to IP 172.31.0.200 in time" >&2
exit 1

View File

@ -16,7 +16,7 @@ if ! docker ps --format '{{.Names}}' | grep -q "^${CONTAINER}$"; then
exit 1
fi
FTP_HOST="${FTP_SERVER:-172.29.0.40}"
FTP_HOST="${FTP_SERVER:-172.31.0.40}"
FTP_USER="${FTP_USER:-ftpuser}"
FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}"
FTP_PORT="${FTP_PORT:-21}"

View File

@ -23,7 +23,7 @@ if ! docker ps --format '{{.Names}}' | grep -q "^${CONTAINER}$"; then
exit 1
fi
FTP_HOST="${FTP_SERVER:-172.29.0.40}"
FTP_HOST="${FTP_SERVER:-172.31.0.40}"
FTP_USER="${FTP_USER:-ftpuser}"
FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}"
FTP_PORT="${FTP_PORT:-21}"

View File

@ -5,7 +5,7 @@ TMP_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")"/.. && pwd)/tmp/metric-verify"
mkdir -p "$TMP_DIR"
PROM_BASE="http://localhost:9090/api/v1"
INSTANCE="${METRIC_TEST_INSTANCE:-172.29.0.50:9100}"
INSTANCE="${METRIC_TEST_INSTANCE:-172.31.0.50:9100}"
IP_ONLY="${INSTANCE%%:*}"
echo "[VERIFY:DATA] node exporter metrics present in container"

View File

@ -1,4 +1,4 @@
user web;
user root;
worker_processes auto;
events {

View File

@ -18,7 +18,7 @@ stopasgroup=true
[program:web-health]
command=/usr/local/bin/health-check.sh
user=web
user=root
stdout_logfile=/var/log/supervisor/web-health.log
stderr_logfile=/var/log/supervisor/web-health_error.log
autorestart=true

View File

@ -66,13 +66,16 @@ RUN mkdir -p /var/log/supervisor
# 复制启动脚本
COPY src/web/build_tools/proxy/start-proxy-supervised.sh /usr/local/bin/start-proxy-supervised.sh
RUN chmod +x /usr/local/bin/start-proxy-supervised.sh
COPY src/web/build_tools/proxy/start-proxy-retry.sh /usr/local/bin/start-proxy-retry.sh
RUN chmod +x /usr/local/bin/start-proxy-retry.sh
# 复制 DNS 监控脚本
COPY src/web/build_tools/proxy/dns-monitor.sh /usr/local/bin/dns-monitor.sh
# 统一复用 bind 模块的 dns-monitor 脚本,保持行为一致
COPY src/bind/build/dns-monitor.sh /usr/local/bin/dns-monitor.sh
RUN chmod +x /usr/local/bin/dns-monitor.sh
# 暴露端口
EXPOSE 80
EXPOSE 80 8080 8081 8082 8083 8084 8085
# 保持 root 用户,由 supervisor 控制 user 切换
USER root

View File

@ -1,9 +0,0 @@
server {
listen 80;
server_name alertmanager.alert.argus.com;
location / {
set $alert_backend http://alertmanager.alert.argus.com:9093;
proxy_pass $alert_backend;
}
}

View File

@ -1,21 +0,0 @@
# Elasticsearch
server {
listen 80;
server_name es.log.argus.com;
location / {
set $es_backend http://es.log.argus.com:9200;
proxy_pass $es_backend;
}
}
# Kibana
server {
listen 80;
server_name kibana.log.argus.com;
location / {
set $kibana_backend http://kibana.log.argus.com:5601;
proxy_pass $kibana_backend;
}
}

View File

@ -1,27 +0,0 @@
server {
listen 80;
server_name master.argus.com;
location / {
set $master_backend http://master.argus.com:3000;
proxy_pass $master_backend;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
# CORS 支持
add_header 'Access-Control-Allow-Origin' 'http://web.argus.com' always;
add_header 'Access-Control-Allow-Methods' 'GET, POST, PUT, DELETE, OPTIONS' always;
add_header 'Access-Control-Allow-Headers' 'Origin, Content-Type, Accept, Authorization' always;
if ($request_method = OPTIONS) {
add_header 'Access-Control-Allow-Origin' 'http://web.argus.com' always;
add_header 'Access-Control-Allow-Methods' 'GET, POST, PUT, DELETE, OPTIONS' always;
add_header 'Access-Control-Allow-Headers' 'Origin, Content-Type, Accept, Authorization' always;
add_header 'Content-Length' 0;
add_header 'Content-Type' 'text/plain';
return 204;
}
}
}

View File

@ -1,21 +0,0 @@
# Prometheus
server {
listen 80;
server_name prometheus.metric.argus.com;
location / {
set $prom_backend http://prom.metric.argus.com:9090;
proxy_pass $prom_backend;
}
}
# Grafana
server {
listen 80;
server_name grafana.metric.argus.com;
location / {
set $grafana_backend http://grafana.metric.argus.com:3000;
proxy_pass $grafana_backend;
}
}

View File

@ -0,0 +1,94 @@
# Multi-port reverse-proxy gateway: one listen port per backing service,
# upstream addresses resolved via *.argus.com DNS names.
# Send "Connection: upgrade" to the upstream only when the client actually
# requested a WebSocket upgrade; otherwise close the upstream connection.
map $http_upgrade $connection_upgrade { default upgrade; "" close; }
# Allowed cross-origin sources (used by the 8084/8085 servers only).
# Origins not listed map to the empty string; nginx omits add_header
# directives whose value is empty, so disallowed origins get no CORS header.
map $http_origin $cors_allow {
default "";
"http://localhost:8080" "http://localhost:8080";
"http://127.0.0.1:8080" "http://127.0.0.1:8080";
}
# 8080 - Portal (web frontend; WebSocket-capable)
server {
listen 8080;
server_name _;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection $connection_upgrade;
proxy_http_version 1.1;
location / { proxy_pass http://web.argus.com:8080/; }
}
# 8081 - Grafana (WebSocket-capable, needed for live/streaming features)
server {
listen 8081;
server_name _;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection $connection_upgrade;
proxy_http_version 1.1;
location / { proxy_pass http://grafana.metric.argus.com:3000/; }
}
# 8082 - Prometheus (plain HTTP; no upgrade headers needed)
server {
listen 8082;
server_name _;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
proxy_http_version 1.1;
location / { proxy_pass http://prom.metric.argus.com:9090/; }
}
# 8083 - Kibana (WebSocket-capable)
server {
listen 8083;
server_name _;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection $connection_upgrade;
proxy_http_version 1.1;
location / { proxy_pass http://kibana.log.argus.com:5601/; }
}
# 8084 - Alertmanager (adds CORS headers; origin gated by the $cors_allow map)
server {
listen 8084;
server_name _;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
add_header 'Access-Control-Allow-Origin' $cors_allow always;
add_header 'Access-Control-Allow-Methods' 'GET, POST, PUT, DELETE, OPTIONS' always;
add_header 'Access-Control-Allow-Headers' 'Origin, Content-Type, Accept, Authorization' always;
# Short-circuit CORS preflight; the server-level add_header set above still
# applies to the 204 response.
if ($request_method = OPTIONS) { return 204; }
proxy_http_version 1.1;
location / { proxy_pass http://alertmanager.alert.argus.com:9093/; }
}
# 8085 - Master API (CORS-enabled so the portal served on 8080 can call it)
server {
listen 8085;
server_name _;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
add_header 'Access-Control-Allow-Origin' $cors_allow always;
add_header 'Access-Control-Allow-Methods' 'GET, POST, PUT, DELETE, OPTIONS' always;
add_header 'Access-Control-Allow-Headers' 'Origin, Content-Type, Accept, Authorization' always;
# Short-circuit CORS preflight; headers above still attach to the 204.
if ($request_method = OPTIONS) { return 204; }
proxy_http_version 1.1;
location / { proxy_pass http://master.argus.com:3000/; }
}

View File

@ -1,9 +0,0 @@
server {
listen 80;
server_name web.argus.com;
location / {
set $web_backend http://web.argus.com:8080;
proxy_pass $web_backend;
}
}

View File

@ -1,68 +0,0 @@
#!/bin/bash
# DNS monitor daemon: every 10 seconds, check whether dns.conf has changed;
# if so, run update-dns.sh to apply the new resolver configuration.
# (Log messages below are intentionally kept in Chinese — other tooling may
# grep for them.)
DNS_CONF="/private/argus/etc/dns.conf"
DNS_BACKUP="/tmp/dns.conf.backup"
UPDATE_SCRIPT="/private/argus/etc/update-dns.sh"
LOG_FILE="/var/log/supervisor/dns-monitor.log"
# Make sure the log file exists before the first append.
touch "$LOG_FILE"
# Append a timestamped line to the monitor log.
log_message() {
echo "$(date '+%Y-%m-%d %H:%M:%S') [DNS-Monitor] $1" >> "$LOG_FILE"
}
log_message "DNS监控脚本启动"
while true; do
if [ -f "$DNS_CONF" ]; then
if [ -f "$DNS_BACKUP" ]; then
# Compare current config against the last-seen backup copy.
if ! cmp -s "$DNS_CONF" "$DNS_BACKUP"; then
log_message "检测到DNS配置变化"
# Refresh the backup BEFORE running the update script: a change is
# acted on at most once, even if the update script then fails.
cp "$DNS_CONF" "$DNS_BACKUP"
# Run the update script, capturing its output into the monitor log.
if [ -x "$UPDATE_SCRIPT" ]; then
log_message "执行DNS更新脚本: $UPDATE_SCRIPT"
"$UPDATE_SCRIPT" >> "$LOG_FILE" 2>&1
if [ $? -eq 0 ]; then
log_message "DNS更新脚本执行成功"
else
log_message "DNS更新脚本执行失败"
fi
else
log_message "警告: 更新脚本不存在或不可执行: $UPDATE_SCRIPT"
fi
fi
else
# First time the config file is seen: run the update script once.
if [ -x "$UPDATE_SCRIPT" ]; then
log_message "执行DNS更新脚本: $UPDATE_SCRIPT"
"$UPDATE_SCRIPT" >> "$LOG_FILE" 2>&1
if [ $? -eq 0 ]; then
# First run: create the backup only AFTER a successful update, so a
# failed first apply is retried on the next 10s tick (asymmetric with
# the change branch above — this appears deliberate).
cp "$DNS_CONF" "$DNS_BACKUP"
log_message "创建DNS配置备份文件"
else
log_message "DNS更新脚本执行失败"
fi
else
log_message "警告: 更新脚本不存在或不可执行: $UPDATE_SCRIPT"
fi
fi
else
log_message "警告: DNS配置文件不存在: $DNS_CONF"
fi
sleep 10
done

View File

@ -1,4 +1,4 @@
user web_proxy;
user root;
worker_processes auto;
events {
@ -13,6 +13,7 @@ http {
# 使用系统 resolv.conf由 update-dns.sh 动态更新)
resolver __RESOLVERS__ valid=30s ipv6=off;
resolver_timeout 5s;
# 启用访问日志
access_log /var/log/nginx/access.log;

View File

@ -0,0 +1,20 @@
#!/bin/sh
set -eu

# Retry wrapper around the supervised proxy starter: re-invoke it up to
# RETRY_MAX times, sleeping RETRY_DELAY seconds between attempts. Exits 0 on
# the first successful run, 1 once every attempt has been exhausted.
MAX=${RETRY_MAX:-10}
DELAY=${RETRY_DELAY:-10}

echo "[INFO] proxy retry wrapper: max=${MAX}, delay=${DELAY}s"
ATTEMPT=1
while [ "$ATTEMPT" -le "$MAX" ]; do
  echo "[INFO] starting proxy attempt ${ATTEMPT}/${MAX}"
  # Explicit if-guard keeps a non-zero exit from tripping `set -e`.
  if /usr/local/bin/start-proxy-supervised.sh; then
    exit 0
  fi
  echo "[WARN] proxy exited (attempt ${ATTEMPT}/${MAX}); sleeping ${DELAY}s before retry"
  sleep "$DELAY"
  ATTEMPT=$((ATTEMPT+1))
done
echo "[ERROR] proxy failed after ${MAX} attempts"
exit 1

View File

@ -46,6 +46,10 @@ echo "检测到 DNS 服务器列表: $RESOLVERS"
# ========== 生成 nginx.conf ==========
if [ -f "$TEMPLATE" ]; then
echo "从模板生成 nginx.conf ..."
# 合并 Docker 内置 DNS 以保障解析 Compose 服务名
if ! echo " $RESOLVERS " | grep -q " 127.0.0.11 "; then
RESOLVERS="127.0.0.11 ${RESOLVERS}"
fi
sed "s|__RESOLVERS__|$RESOLVERS|" "$TEMPLATE" > "$TARGET"
else
echo "错误: 找不到 nginx.conf.template ($TEMPLATE)"
@ -55,6 +59,33 @@ fi
# 打印生成结果供排查
grep resolver "$TARGET" || true
# ========== Wait for upstream domain records (avoid resolve failures right at nginx startup) ==========
# NOTE(review): assumes each upstream service drops a non-empty record file at
# /private/argus/etc/<domain> when it starts (master is documented to do so;
# confirm for the others).
UPSTREAM_DOMAINS=(
web.argus.com
grafana.metric.argus.com
prom.metric.argus.com
kibana.log.argus.com
alertmanager.alert.argus.com
master.argus.com
)
WAIT_MAX=15
WAITED=0
MISSING=()
while :; do
MISSING=()
for d in "${UPSTREAM_DOMAINS[@]}"; do
# -s: the record file must exist AND be non-empty to count as ready.
if [ ! -s "/private/argus/etc/${d}" ]; then
MISSING+=("$d")
fi
done
# Proceed once all records exist, or give up (best-effort) after WAIT_MAX
# one-second ticks so nginx can still come up with partial upstreams.
if [ ${#MISSING[@]} -eq 0 ] || [ "$WAITED" -ge "$WAIT_MAX" ]; then
break
fi
echo "[INFO] 等待上游域名记录生成(${WAITED}/${WAIT_MAX}) 缺失: ${MISSING[*]}"
sleep 1
WAITED=$((WAITED+1))
done
echo "[INFO] Launching nginx..."
# 启动 nginx 前台模式

View File

@ -5,12 +5,12 @@ pidfile=/var/run/supervisord.pid
user=root
[program:proxy]
command=/usr/local/bin/start-proxy-supervised.sh
command=/usr/local/bin/start-proxy-retry.sh
user=root
stdout_logfile=/var/log/supervisor/web-proxy.log
stderr_logfile=/var/log/supervisor/web-proxy_error.log
autorestart=true
startretries=3
startretries=10
startsecs=5
stopwaitsecs=10
killasgroup=true

View File

@ -1,30 +1,42 @@
// config/api.js
// Master 节点相关 API
// Resolve the public host at runtime; every backing service is reached on a
// dedicated gateway port of that same host. Falls back to 'localhost' when
// not running in a browser (or when no hostname/override is available).
const HOST =
  (typeof window !== 'undefined' &&
    (window.__ARGUS_PUBLIC_HOST__ || window.location.hostname)) ||
  'localhost';

// Gateway port per service (MASTER goes through the CORS-enabled proxy).
const PORTS = {
  MASTER: 8085,
  ALERTMANAGER: 8084,
  GRAFANA: 8081,
  PROMETHEUS: 8082,
  KIBANA: 8083,
};

// Base URL per service, derived from HOST plus its gateway port.
const toBase = (port) => `http://${HOST}:${port}`;
const BASE = {
  MASTER: toBase(PORTS.MASTER),
  ALERT: toBase(PORTS.ALERTMANAGER),
  GRAFANA: toBase(PORTS.GRAFANA),
  PROM: toBase(PORTS.PROMETHEUS),
  KIBANA: toBase(PORTS.KIBANA),
};
// Master 节点相关 API统一走 8085
export const MASTER_API = {
// 节点列表
LIST: "http://master.argus.com/api/v1/master/nodes",
// 节点详情(需要 nodeId
DETAIL: (nodeId) => `http://master.argus.com/api/v1/master/nodes/${nodeId}`,
// 节点配置(需要 nodeId
CONFIG: (nodeId) => `http://master.argus.com/api/v1/master/nodes/${nodeId}/config`,
// 节点统计信息
STATISTICS: "http://master.argus.com/api/v1/master/nodes/statistics",
LIST: `${BASE.MASTER}/api/v1/master/nodes`,
DETAIL: (nodeId) => `${BASE.MASTER}/api/v1/master/nodes/${nodeId}`,
CONFIG: (nodeId) => `${BASE.MASTER}/api/v1/master/nodes/${nodeId}/config`,
STATISTICS: `${BASE.MASTER}/api/v1/master/nodes/statistics`,
};
// 其他外部 API
// 其他外部 API8084
export const EXTERNAL_API = {
ALERTS_INFOS: "http://alertmanager.alert.argus.com/api/v2/alerts",
ALERTS_INFOS: `${BASE.ALERT}/api/v2/alerts`,
};
// 外部服务 Host
// 外部服务 Host(端口化)
export const EXTERNAL_HOST = {
ALERTS: "http://alertmanager.alert.argus.com",
GRAFANA: "http://grafana.metric.argus.com",
GRAFANA_DASHBOARD: "http://grafana.metric.argus.com/d/cluster-dashboard/cluster-dashboard",
PROMETHEUS: "http://prometheus.metric.argus.com",
KIBANA: "http://kibana.log.argus.com/app/discover",
ALERTS: `${BASE.ALERT}`,
GRAFANA: `${BASE.GRAFANA}`,
GRAFANA_DASHBOARD: `${BASE.GRAFANA}/d/cluster-dashboard/cluster-dashboard`,
PROMETHEUS: `${BASE.PROM}`,
KIBANA: `${BASE.KIBANA}/app/discover`,
};