From a1cdd05950ba13cb478f2a3af3323cb52a4f20cc Mon Sep 17 00:00:00 2001 From: yuyr Date: Wed, 22 Oct 2025 12:08:34 +0800 Subject: [PATCH] =?UTF-8?q?[#29]=20=E6=95=B4=E5=90=88metric=E6=A8=A1?= =?UTF-8?q?=E5=9D=97=E7=AB=AF=E5=88=B0=E7=AB=AF=E6=B5=8B=E8=AF=95=E8=B7=91?= =?UTF-8?q?=E9=80=9A=EF=BC=9B=E6=95=B4=E5=90=88web/alert=E6=A8=A1=E5=9D=97?= =?UTF-8?q?=E9=95=9C=E5=83=8F=E6=9E=84=E5=BB=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- build/build_images.sh | 48 ++++++ build/save_images.sh | 3 + src/alert/alertmanager/build/Dockerfile | 36 +++-- src/alert/alertmanager/build/build.sh | 6 +- src/alert/tests/.env | 4 +- src/alert/tests/docker-compose.yml | 8 +- src/bind/build/Dockerfile | 1 + .../scripts/publish_artifact.sh | 48 ++++-- src/metric/grafana/build/Dockerfile | 2 + .../grafana/build/datasources/datasources.yml | 2 +- .../grafana/build/start-grafana-supervised.sh | 50 ++++-- src/sys/tests/docker-compose.yml | 58 +++++-- src/sys/tests/scripts/00_e2e_test.sh | 38 ++++- src/sys/tests/scripts/01_bootstrap.sh | 46 ++++-- src/sys/tests/scripts/02_up.sh | 48 +++--- .../tests/scripts/04_verify_dns_routing.sh | 24 ++- src/sys/tests/scripts/05_agent_register.sh | 29 +++- .../tests/scripts/07_logs_send_and_assert.sh | 38 ++++- .../scripts/08_restart_agent_reregister.sh | 17 ++- src/sys/tests/scripts/09_down.sh | 25 ++- src/sys/tests/scripts/10_metric_publish.sh | 66 ++++++++ .../tests/scripts/11_metric_node_install.sh | 50 ++++++ .../tests/scripts/12_metric_gpu_install.sh | 64 ++++++++ src/sys/tests/scripts/13_metric_verify.sh | 40 +++++ .../scripts/13_metric_verify_dataplane.sh | 47 ++++++ .../tests/scripts/13_metric_verify_grafana.sh | 39 +++++ .../13_metric_verify_grafana_panels.sh | 70 +++++++++ .../tests/scripts/13_metric_verify_master.sh | 105 +++++++++++++ .../scripts/13_metric_verify_prometheus.sh | 142 ++++++++++++++++++ src/sys/tests/scripts/14_metric_cleanup.sh | 18 +++ .../scripts/metric/test-node-entrypoint.sh 
| 45 ++++++ src/sys/tests/scripts/node_entrypoint.sh | 5 +- src/web/build_tools/frontend/Dockerfile | 37 +++-- src/web/build_tools/frontend/build.sh | 4 +- .../frontend/start-web-supervised.sh | 4 +- src/web/build_tools/proxy/Dockerfile | 34 +++-- src/web/build_tools/proxy/build.sh | 4 +- .../proxy/start-proxy-supervised.sh | 4 +- src/web/tests/docker-compose.yml | 16 +- 39 files changed, 1167 insertions(+), 158 deletions(-) create mode 100755 src/sys/tests/scripts/10_metric_publish.sh create mode 100755 src/sys/tests/scripts/11_metric_node_install.sh create mode 100755 src/sys/tests/scripts/12_metric_gpu_install.sh create mode 100755 src/sys/tests/scripts/13_metric_verify.sh create mode 100755 src/sys/tests/scripts/13_metric_verify_dataplane.sh create mode 100755 src/sys/tests/scripts/13_metric_verify_grafana.sh create mode 100755 src/sys/tests/scripts/13_metric_verify_grafana_panels.sh create mode 100755 src/sys/tests/scripts/13_metric_verify_master.sh create mode 100755 src/sys/tests/scripts/13_metric_verify_prometheus.sh create mode 100755 src/sys/tests/scripts/14_metric_cleanup.sh create mode 100755 src/sys/tests/scripts/metric/test-node-entrypoint.sh diff --git a/build/build_images.sh b/build/build_images.sh index 603dc66..b4a023c 100755 --- a/build/build_images.sh +++ b/build/build_images.sh @@ -254,6 +254,54 @@ if [[ "$build_metric" == true ]]; then done fi +# ======================================= +# Web & Alert module images +# ======================================= + +echo "" +echo "Building Web and Alert module images..." + +# Pre-pull commonly used base images for stability +web_alert_base_images=( + "node:20" + "ubuntu:24.04" +) + +for base_image in "${web_alert_base_images[@]}"; do + if ! pull_base_image "$base_image"; then + build_failed=true + fi +done + +web_builds=( + "Web Frontend|src/web/build_tools/frontend/Dockerfile|argus-web-frontend:latest|." + "Web Proxy|src/web/build_tools/proxy/Dockerfile|argus-web-proxy:latest|." 
+) + +for build_spec in "${web_builds[@]}"; do + IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec" + if build_image "$image_label" "$dockerfile_path" "$image_tag" "$build_context"; then + images_built+=("$image_tag") + else + build_failed=true + fi + echo "" +done + +alert_builds=( + "Alertmanager|src/alert/alertmanager/build/Dockerfile|argus-alertmanager:latest|." +) + +for build_spec in "${alert_builds[@]}"; do + IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec" + if build_image "$image_label" "$dockerfile_path" "$image_tag" "$build_context"; then + images_built+=("$image_tag") + else + build_failed=true + fi + echo "" +done + echo "=======================================" echo "📦 Build Summary" echo "=======================================" diff --git a/build/save_images.sh b/build/save_images.sh index 236e32e..083d587 100755 --- a/build/save_images.sh +++ b/build/save_images.sh @@ -71,6 +71,9 @@ declare -A images=( ["argus-metric-ftp:latest"]="argus-metric-ftp-latest.tar" ["argus-metric-prometheus:latest"]="argus-metric-prometheus-latest.tar" ["argus-metric-grafana:latest"]="argus-metric-grafana-latest.tar" + ["argus-web-frontend:latest"]="argus-web-frontend-latest.tar" + ["argus-web-proxy:latest"]="argus-web-proxy-latest.tar" + ["argus-alertmanager:latest"]="argus-alertmanager-latest.tar" ) # 函数:检查镜像是否存在 diff --git a/src/alert/alertmanager/build/Dockerfile b/src/alert/alertmanager/build/Dockerfile index a606569..781714a 100644 --- a/src/alert/alertmanager/build/Dockerfile +++ b/src/alert/alertmanager/build/Dockerfile @@ -20,10 +20,10 @@ RUN wget https://github.com/prometheus/alertmanager/releases/download/v${ALERTMA ENV ALERTMANAGER_BASE_PATH=/private/argus/alert/alertmanager -ARG ARGUS_UID=2133 -ARG ARGUS_GID=2015 -ENV ARGUS_UID=${ARGUS_UID} -ENV ARGUS_GID=${ARGUS_GID} +ARG ARGUS_BUILD_UID=2133 +ARG ARGUS_BUILD_GID=2015 +ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} +ENV 
ARGUS_BUILD_GID=${ARGUS_BUILD_GID} RUN mkdir -p /usr/share/alertmanager && \ mkdir -p ${ALERTMANAGER_BASE_PATH} && \ @@ -33,16 +33,25 @@ RUN mkdir -p /usr/share/alertmanager && \ # 创建 alertmanager 用户(可自定义 UID/GID) # 创建 alertmanager 用户组 -RUN groupadd -g ${ARGUS_GID} alertmanager +RUN set -eux; \ + if ! getent group "${ARGUS_BUILD_GID}" >/dev/null; then \ + groupadd -g "${ARGUS_BUILD_GID}" alertmanager || true; \ + fi; \ + if id alertmanager >/dev/null 2>&1; then \ + current_uid="$(id -u alertmanager)"; \ + if [ "$current_uid" != "${ARGUS_BUILD_UID}" ] && ! getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \ + usermod -u "${ARGUS_BUILD_UID}" alertmanager; \ + fi; \ + usermod -g "${ARGUS_BUILD_GID}" alertmanager || true; \ + else \ + if ! getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \ + useradd -M -s /usr/sbin/nologin -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" alertmanager; \ + else \ + echo "UID ${ARGUS_BUILD_UID} already exists; skip creating user 'alertmanager'"; \ + fi; \ + fi -# 创建 alertmanager 用户并指定组 -RUN useradd -M -s /usr/sbin/nologin -u ${ARGUS_UID} -g ${ARGUS_GID} alertmanager - -RUN chown -R alertmanager:alertmanager /usr/share/alertmanager && \ - chown -R alertmanager:alertmanager /alertmanager && \ - chown -R alertmanager:alertmanager ${ALERTMANAGER_BASE_PATH} && \ - chown -R alertmanager:alertmanager /private/argus/etc && \ - chown -R alertmanager:alertmanager /usr/local/bin +RUN chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" /usr/share/alertmanager /alertmanager ${ALERTMANAGER_BASE_PATH} /private/argus/etc /usr/local/bin || true # 配置内网 apt 源 (如果指定了内网选项) RUN if [ "$USE_INTRANET" = "true" ]; then \ @@ -86,4 +95,3 @@ EXPOSE 9093 # 使用 supervisor 作为入口点 CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"] - diff --git a/src/alert/alertmanager/build/build.sh b/src/alert/alertmanager/build/build.sh index c7520e7..2640042 100644 --- a/src/alert/alertmanager/build/build.sh +++ b/src/alert/alertmanager/build/build.sh @@ 
-5,9 +5,9 @@ docker pull ubuntu:24.04 source src/alert/tests/.env docker build \ - --build-arg ARGUS_UID=${ARGUS_UID} \ - --build-arg ARGUS_GID=${ARGUS_GID} \ + --build-arg ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \ + --build-arg ARGUS_BUILD_GID=${ARGUS_BUILD_GID} \ -f src/alert/alertmanager/build/Dockerfile \ -t argus-alertmanager:latest . -docker save -o argus-alertmanager-latest.tar argus-alertmanager:latest \ No newline at end of file +docker save -o argus-alertmanager-latest.tar argus-alertmanager:latest diff --git a/src/alert/tests/.env b/src/alert/tests/.env index 00f4b76..b9d89f5 100644 --- a/src/alert/tests/.env +++ b/src/alert/tests/.env @@ -1,5 +1,5 @@ DATA_ROOT=/home/argus/tmp/private/argus -ARGUS_UID=1048 -ARGUS_GID=1048 +ARGUS_BUILD_UID=1048 +ARGUS_BUILD_GID=1048 USE_INTRANET=false diff --git a/src/alert/tests/docker-compose.yml b/src/alert/tests/docker-compose.yml index 63b9f40..c399df8 100644 --- a/src/alert/tests/docker-compose.yml +++ b/src/alert/tests/docker-compose.yml @@ -4,15 +4,15 @@ services: context: ../../../ dockerfile: src/alert/alertmanager/build/Dockerfile args: - ARGUS_UID: ${ARGUS_UID:-2133} - ARGUS_GID: ${ARGUS_GID:-2015} + ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133} + ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015} USE_INTRANET: ${USE_INTRANET:-false} image: argus-alertmanager:latest container_name: argus-alertmanager environment: - ALERTMANAGER_BASE_PATH=/private/argus/alert/alertmanager - - ARGUS_UID=${ARGUS_UID:-2133} - - ARGUS_GID=${ARGUS_GID:-2015} + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} ports: - "${ARGUS_PORT:-9093}:9093" volumes: diff --git a/src/bind/build/Dockerfile b/src/bind/build/Dockerfile index c6293d3..637e227 100644 --- a/src/bind/build/Dockerfile +++ b/src/bind/build/Dockerfile @@ -26,6 +26,7 @@ RUN apt-get update && \ apt-get install -y \ bind9 \ bind9utils \ + dnsutils \ bind9-doc \ supervisor \ net-tools \ diff --git 
a/src/metric/client-plugins/all-in-one-demo/scripts/publish_artifact.sh b/src/metric/client-plugins/all-in-one-demo/scripts/publish_artifact.sh index 2f16b19..5441cf1 100755 --- a/src/metric/client-plugins/all-in-one-demo/scripts/publish_artifact.sh +++ b/src/metric/client-plugins/all-in-one-demo/scripts/publish_artifact.sh @@ -104,7 +104,26 @@ log_info "文件所有者: $OWNER" # 确保发布目录存在 log_info "确保发布目录存在: $PUBLISH_DIR" -sudo mkdir -p "$PUBLISH_DIR" +mkdir -p "$PUBLISH_DIR" + +IFS=':' read -r OWNER_UID OWNER_GID <<< "$OWNER" +if [[ -z "$OWNER_UID" || -z "$OWNER_GID" ]]; then + log_error "--owner 格式不正确,应为 uid:gid" + exit 1 +fi + +CURRENT_UID=$(id -u) +CURRENT_GID=$(id -g) +if [[ "$OWNER_UID" != "$CURRENT_UID" || "$OWNER_GID" != "$CURRENT_GID" ]]; then + if [[ "$CURRENT_UID" -ne 0 ]]; then + log_error "当前用户 (${CURRENT_UID}:${CURRENT_GID}) 无法设置所有者为 ${OWNER_UID}:${OWNER_GID}" + log_error "请以目标用户运行脚本或预先调整目录权限" + exit 1 + fi + NEED_CHOWN=true +else + NEED_CHOWN=false +fi # 创建临时目录用于打包 TEMP_PACKAGE_DIR="/tmp/argus-metric-package-$$" @@ -208,26 +227,31 @@ fi TAR_NAME="argus-metric_$(echo $VERSION | tr '.' '_').tar.gz" log_info "创建发布包: $TAR_NAME" cd "$TEMP_PACKAGE_DIR" -sudo tar -czf "$PUBLISH_DIR/$TAR_NAME" * +tar -czf "$PUBLISH_DIR/$TAR_NAME" * cd - > /dev/null -# 设置文件所有者 -log_info "设置文件所有者为: $OWNER" -sudo chown "$OWNER" "$PUBLISH_DIR/$TAR_NAME" +if [[ "$NEED_CHOWN" == true ]]; then + log_info "设置文件所有者为: $OWNER" + chown "$OWNER" "$PUBLISH_DIR/$TAR_NAME" +fi # 清理临时目录 rm -rf "$TEMP_PACKAGE_DIR" # 更新 LATEST_VERSION 文件 log_info "更新 LATEST_VERSION 文件..." -echo "$VERSION" | sudo tee "$PUBLISH_DIR/LATEST_VERSION" > /dev/null -sudo chown "$OWNER" "$PUBLISH_DIR/LATEST_VERSION" +echo "$VERSION" > "$PUBLISH_DIR/LATEST_VERSION" +if [[ "$NEED_CHOWN" == true ]]; then + chown "$OWNER" "$PUBLISH_DIR/LATEST_VERSION" +fi # 复制 DNS 配置文件到发布目录根目录(直接从 config 目录复制) if [[ -f "config/dns.conf" ]]; then log_info "复制 DNS 配置文件到发布目录根目录..." 
- sudo cp "config/dns.conf" "$PUBLISH_DIR/" - sudo chown "$OWNER" "$PUBLISH_DIR/dns.conf" + cp "config/dns.conf" "$PUBLISH_DIR/" + if [[ "$NEED_CHOWN" == true ]]; then + chown "$OWNER" "$PUBLISH_DIR/dns.conf" + fi log_success "DNS 配置文件复制完成: $PUBLISH_DIR/dns.conf" else log_warning "未找到 config/dns.conf 文件,跳过 DNS 配置文件复制" @@ -236,8 +260,10 @@ fi # 复制 setup.sh 到发布目录 if [[ -f "scripts/setup.sh" ]]; then log_info "复制 setup.sh 到发布目录..." - sudo cp "scripts/setup.sh" "$PUBLISH_DIR/" - sudo chown "$OWNER" "$PUBLISH_DIR/setup.sh" + cp "scripts/setup.sh" "$PUBLISH_DIR/" + if [[ "$NEED_CHOWN" == true ]]; then + chown "$OWNER" "$PUBLISH_DIR/setup.sh" + fi fi # 显示发布结果 diff --git a/src/metric/grafana/build/Dockerfile b/src/metric/grafana/build/Dockerfile index 0615d08..2c121cb 100644 --- a/src/metric/grafana/build/Dockerfile +++ b/src/metric/grafana/build/Dockerfile @@ -65,6 +65,8 @@ COPY grafana.ini /tmp/grafana.ini COPY datasources/datasources.yml /tmp/datasources.yml COPY dashboards/dashboards.yml /tmp/dashboards.yml COPY dashboards/default_dashboard_by_hostname.json /tmp/default_dashboard.json +COPY dashboards/default_cluster_dashboard.json /tmp/default_cluster_dashboard.json +COPY dashboards/default_dashboard_by_instance.json /tmp/default_dashboard_by_instance.json # supervisor 配置 COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf diff --git a/src/metric/grafana/build/datasources/datasources.yml b/src/metric/grafana/build/datasources/datasources.yml index fb277cc..752d0f3 100644 --- a/src/metric/grafana/build/datasources/datasources.yml +++ b/src/metric/grafana/build/datasources/datasources.yml @@ -8,7 +8,7 @@ datasources: type: prometheus access: proxy uid: eezk1zvkie4g0a - url: http://10.211.55.5:9090 + url: http://prom.metric.argus.com:9090 isDefault: true editable: true jsonData: diff --git a/src/metric/grafana/build/start-grafana-supervised.sh b/src/metric/grafana/build/start-grafana-supervised.sh index 95bb267..46ece73 100644 --- 
a/src/metric/grafana/build/start-grafana-supervised.sh +++ b/src/metric/grafana/build/start-grafana-supervised.sh @@ -44,12 +44,18 @@ else fi # 复制数据源配置文件到挂载目录 -if [ -f "/tmp/datasources.yml" ]; then - echo "[INFO] Copying datasource configuration to /private/argus/metric/grafana/provisioning/datasources/" - cp /tmp/datasources.yml /private/argus/metric/grafana/provisioning/datasources/datasources.yml - echo "[INFO] Datasource configuration copied successfully" -elif [ -d "/private/argus/metric/grafana/provisioning/datasources" ] && [ "$(ls -A /private/argus/metric/grafana/provisioning/datasources)" ]; then - echo "[INFO] Found existing datasource provisioning files in /private/argus/metric/grafana/provisioning/datasources" +DS_OUT="/private/argus/metric/grafana/provisioning/datasources/datasources.yml" +PROM_DOMAIN="prom.metric.argus.com:9090" + +if [ -f "/tmp/datasources.yml" ] && [ ! -f "$DS_OUT" ]; then + echo "[INFO] Initializing datasource provisioning file from /tmp" + cp /tmp/datasources.yml "$DS_OUT" +fi + +# 统一将数据源 URL 规范为 prom.metric.argus.com:9090 +if [ -f "$DS_OUT" ]; then + sed -i -E "s#^\s*url:\s*http://[^[:space:]]+# url: http://$PROM_DOMAIN#g" "$DS_OUT" || true + echo "[INFO] Datasource URL normalized to http://$PROM_DOMAIN" elif [ -d "/etc/grafana/provisioning/datasources" ] && [ "$(ls -A /etc/grafana/provisioning/datasources)" ]; then echo "[INFO] Found datasource provisioning files in /etc/grafana/provisioning/datasources" # 确保数据源配置目录权限正确 @@ -65,11 +71,33 @@ if [ -f "/tmp/dashboards.yml" ]; then echo "[INFO] Dashboard configuration copied successfully" fi -# 复制默认仪表板到挂载目录 -if [ -f "/tmp/default_dashboard.json" ]; then - echo "[INFO] Copying default dashboard to /private/argus/metric/grafana/provisioning/dashboards/" - cp /tmp/default_dashboard.json /private/argus/metric/grafana/provisioning/dashboards/default_dashboard.json - echo "[INFO] Default dashboard copied successfully" +# 复制默认仪表板到挂载目录(按需,不覆盖已存在文件) +copy_dashboard_if_missing() { + local 
src="$1"; local dst_name="$2" + local dst_dir="/private/argus/metric/grafana/provisioning/dashboards" + local dst="$dst_dir/$dst_name" + if [ -f "$src" ]; then + if [ ! -f "$dst" ]; then + echo "[INFO] Installing dashboard: $dst_name" + cp "$src" "$dst" + else + echo "[INFO] Dashboard exists, skip: $dst_name" + fi + fi +} + +copy_dashboard_if_missing "/tmp/default_dashboard.json" "default_dashboard.json" +copy_dashboard_if_missing "/tmp/default_cluster_dashboard.json" "default_cluster_dashboard.json" +copy_dashboard_if_missing "/tmp/default_dashboard_by_instance.json" "default_dashboard_by_instance.json" + +# 规范面板中的数据源字段:将字符串 "prometheus" 替换为 null(使用默认数据源) +DB_DIR="/private/argus/metric/grafana/provisioning/dashboards" +if [ -d "$DB_DIR" ]; then + for f in "$DB_DIR"/*.json; do + [ -f "$f" ] || continue + sed -i -E 's/"datasource"\s*:\s*"prometheus"/"datasource": null/g' "$f" || true + done + echo "[INFO] Normalized dashboard datasource to default (null)" fi # 启动 Grafana diff --git a/src/sys/tests/docker-compose.yml b/src/sys/tests/docker-compose.yml index badf4ec..135cb03 100644 --- a/src/sys/tests/docker-compose.yml +++ b/src/sys/tests/docker-compose.yml @@ -1,9 +1,5 @@ --version: "3.8" - networks: - default: - external: true - name: argus-sys-net + sysnet: driver: bridge ipam: driver: default @@ -15,7 +11,7 @@ services: image: ${BIND_IMAGE_TAG:-argus-bind9:latest} container_name: argus-bind-sys networks: - default: + sysnet: ipv4_address: 172.29.0.2 volumes: - ./private:/private @@ -39,7 +35,7 @@ services: - ./private/argus/metric/prometheus:/private/argus/metric/prometheus - ./private/argus/etc:/private/argus/etc networks: - default: + sysnet: ipv4_address: 172.29.0.10 restart: unless-stopped @@ -58,6 +54,9 @@ services: ports: - "9200:9200" restart: unless-stopped + networks: + sysnet: + ipv4_address: 172.29.0.3 kibana: image: argus-kibana:latest @@ -74,6 +73,9 @@ services: ports: - "5601:5601" restart: unless-stopped + networks: + sysnet: + ipv4_address: 
172.29.0.4 node-a: image: ubuntu:22.04 @@ -106,6 +108,8 @@ services: ports: - "2020:2020" restart: unless-stopped + networks: + - sysnet node-b: image: ubuntu:22.04 @@ -138,6 +142,8 @@ services: ports: - "2021:2020" restart: unless-stopped + networks: + - sysnet ftp: image: argus-metric-ftp:latest @@ -160,7 +166,7 @@ services: - /etc/localtime:/etc/localtime:ro - /etc/timezone:/etc/timezone:ro networks: - default: + sysnet: ipv4_address: 172.29.0.40 logging: driver: "json-file" @@ -185,7 +191,7 @@ services: - /etc/localtime:/etc/localtime:ro - /etc/timezone:/etc/timezone:ro networks: - default: + sysnet: ipv4_address: 172.29.0.41 logging: driver: "json-file" @@ -205,6 +211,9 @@ services: - GF_SERVER_HTTP_PORT=3000 - GF_LOG_LEVEL=warn - GF_LOG_MODE=console + - GF_PATHS_PROVISIONING=/private/argus/metric/grafana/provisioning + - GF_AUTH_ANONYMOUS_ENABLED=true + - GF_AUTH_ANONYMOUS_ORG_ROLE=Viewer ports: - "${GRAFANA_PORT:-3000}:3000" volumes: @@ -213,7 +222,7 @@ services: - /etc/localtime:/etc/localtime:ro - /etc/timezone:/etc/timezone:ro networks: - default: + sysnet: ipv4_address: 172.29.0.42 depends_on: - prometheus @@ -224,7 +233,7 @@ services: max-file: "3" test-node: - image: argus-metric-test-node:latest + image: ubuntu:22.04 container_name: argus-metric-test-node hostname: test-metric-node-001 restart: unless-stopped @@ -240,13 +249,21 @@ services: - FTP_USER=${FTP_USER:-ftpuser} - FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!} - FTP_PORT=${FTP_PORT:-21} + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + - METRIC_NODE_ROLE=cpu volumes: - ./private/argus/agent:/private/argus/agent + - ./scripts/metric/test-node-entrypoint.sh:/usr/local/bin/metric-test-node-entrypoint.sh:ro - /etc/localtime:/etc/localtime:ro - /etc/timezone:/etc/timezone:ro - command: sleep infinity + entrypoint: + - /usr/local/bin/metric-test-node-entrypoint.sh + command: + - sleep + - infinity networks: - default: + sysnet: ipv4_address: 172.29.0.50 
logging: driver: "json-file" @@ -255,7 +272,8 @@ services: max-file: "3" test-gpu-node: - image: argus-metric-test-gpu-node:latest + profiles: ["gpu"] + image: nvidia/cuda:12.2.2-runtime-ubuntu22.04 container_name: argus-metric-test-gpu-node hostname: test-metric-gpu-node-001 restart: unless-stopped @@ -278,13 +296,21 @@ services: - NVIDIA_VISIBLE_DEVICES=all - NVIDIA_DRIVER_CAPABILITIES=compute,utility - GPU_MODE=gpu + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + - METRIC_NODE_ROLE=gpu volumes: - ./private/argus/agent:/private/argus/agent + - ./scripts/metric/test-node-entrypoint.sh:/usr/local/bin/metric-test-node-entrypoint.sh:ro - /etc/localtime:/etc/localtime:ro - /etc/timezone:/etc/timezone:ro - command: sleep infinity + entrypoint: + - /usr/local/bin/metric-test-node-entrypoint.sh + command: + - sleep + - infinity networks: - default: + sysnet: ipv4_address: 172.29.0.51 logging: driver: "json-file" diff --git a/src/sys/tests/scripts/00_e2e_test.sh b/src/sys/tests/scripts/00_e2e_test.sh index 2079c4f..d6f80c1 100755 --- a/src/sys/tests/scripts/00_e2e_test.sh +++ b/src/sys/tests/scripts/00_e2e_test.sh @@ -3,6 +3,38 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ENABLE_GPU=false + +usage() { + cat <<'EOF' +Usage: 00_e2e_test.sh [options] + +Options: + --enable-gpu 启用 GPU 相关拓扑与测试流程 + -h, --help 显示帮助信息 +EOF +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --enable-gpu) + ENABLE_GPU=true + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown argument: $1" >&2 + usage + exit 1 + ;; + esac +done + +export ARGUS_SYS_ENABLE_GPU=$ENABLE_GPU + SCRIPTS=( "01_bootstrap.sh" "02_up.sh" @@ -12,6 +44,11 @@ SCRIPTS=( "06_write_health_and_assert.sh" "07_logs_send_and_assert.sh" "08_restart_agent_reregister.sh" + "10_metric_publish.sh" + "11_metric_node_install.sh" + "12_metric_gpu_install.sh" + "13_metric_verify.sh" + "14_metric_cleanup.sh" "09_down.sh" ) @@ -23,4 +60,3 @@ for script in 
"${SCRIPTS[@]}"; do done echo "[SYS-E2E] All tests completed" - diff --git a/src/sys/tests/scripts/01_bootstrap.sh b/src/sys/tests/scripts/01_bootstrap.sh index 130eb63..7b20969 100755 --- a/src/sys/tests/scripts/01_bootstrap.sh +++ b/src/sys/tests/scripts/01_bootstrap.sh @@ -22,6 +22,24 @@ ensure_image() { } echo "[INFO] Preparing directories..." +ensure_writable_dir() { + local path="$1" + local parent + parent="$(dirname "$path")" + mkdir -p "$parent" 2>/dev/null || true + mkdir -p "$path" 2>/dev/null || true + if [[ ! -w "$path" ]]; then + docker run --rm -v "$parent:/target" ubuntu:24.04 bash -lc "chown -R $(id -u):$(id -g) /target" >/dev/null 2>&1 || true + fi + mkdir -p "$path" +} + +# preflight: make base dirs writable if inherited from root-owned mounts +ensure_writable_dir "$PRIVATE_CORE/argus" +ensure_writable_dir "$PRIVATE_CORE/argus/metric" +ensure_writable_dir "$PRIVATE_CORE/argus/metric/grafana" +ensure_writable_dir "$PRIVATE_CORE/argus/metric/prometheus" + mkdir -p \ "$PRIVATE_CORE/argus/etc" \ "$PRIVATE_CORE/argus/bind" \ @@ -57,6 +75,8 @@ chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" \ "$PRIVATE_CORE/argus/agent" \ "$PRIVATE_CORE/argus/etc" 2>/dev/null || true +echo "[INFO] Using compose-managed network (auto-created by docker compose)" + echo "[INFO] Distributing update-dns.sh for core services (bind/master/es/kibana)" BIND_UPDATE_SRC="$REPO_ROOT/src/bind/build/update-dns.sh" BIND_UPDATE_DEST="$PRIVATE_CORE/argus/etc/update-dns.sh" @@ -75,8 +95,6 @@ ensure_image "argus-master:latest" ensure_image "argus-metric-ftp:latest" ensure_image "argus-metric-prometheus:latest" ensure_image "argus-metric-grafana:latest" -ensure_image "argus-metric-test-node:latest" -ensure_image "argus-metric-test-gpu-node:latest" echo "[INFO] Building agent binary..." pushd "$REPO_ROOT/src/agent" >/dev/null @@ -91,19 +109,25 @@ fi echo "$AGENT_BIN" > "$TMP_DIR/agent_binary_path" # 检测GPU环境 -echo "[INFO] 检测GPU环境..." 
+REQUEST_GPU=${ARGUS_SYS_ENABLE_GPU:-false} GPU_CHECK_SCRIPT="$REPO_ROOT/src/metric/tests/scripts/common/check-gpu.sh" -if [ -f "$GPU_CHECK_SCRIPT" ]; then +if [[ "$REQUEST_GPU" == "true" ]]; then + echo "[INFO] --enable-gpu 已启用,开始检测GPU环境..." + if [[ -f "$GPU_CHECK_SCRIPT" ]]; then if bash "$GPU_CHECK_SCRIPT" >/dev/null 2>&1; then - echo "[INFO] GPU环境可用,将启动test-gpu-node容器" - GPU_AVAILABLE=true + echo "[INFO] GPU环境可用,将在 compose 中启用 test-gpu-node" + GPU_AVAILABLE=true else - echo "[INFO] GPU环境不可用,跳过test-gpu-node容器" - GPU_AVAILABLE=false + echo "[ERROR] 未检测到可用 GPU,但指定了 --enable-gpu" >&2 + exit 1 fi + else + echo "[ERROR] 未找到 GPU 检测脚本: $GPU_CHECK_SCRIPT" >&2 + exit 1 + fi else - echo "[WARN] 未找到GPU检测脚本: $GPU_CHECK_SCRIPT,跳过GPU检测" - GPU_AVAILABLE=false + GPU_AVAILABLE=false + echo "[INFO] GPU 支持未启用,跳过 GPU 检测" fi echo "[INFO] Writing .env with UID/GID and metric configuration" @@ -112,7 +136,7 @@ ARGUS_BUILD_UID=$ARGUS_BUILD_UID ARGUS_BUILD_GID=$ARGUS_BUILD_GID # GPU 配置 -GPU_AVAILABLE=$GPU_AVAILABLE +ENABLE_GPU=$GPU_AVAILABLE # FTP 配置 FTP_PORT=21 diff --git a/src/sys/tests/scripts/02_up.sh b/src/sys/tests/scripts/02_up.sh index 30df6b9..e6f7a1c 100755 --- a/src/sys/tests/scripts/02_up.sh +++ b/src/sys/tests/scripts/02_up.sh @@ -15,39 +15,51 @@ compose() { echo "[INFO] Bringing up system stack..." -# 检测GPU环境 -echo "[INFO] 检测GPU环境..." +REQUEST_GPU=${ARGUS_SYS_ENABLE_GPU:-false} +GPU_AVAILABLE=false GPU_CHECK_SCRIPT="$REPO_ROOT/src/metric/tests/scripts/common/check-gpu.sh" -if [ -f "$GPU_CHECK_SCRIPT" ]; then + +if [[ "$REQUEST_GPU" == "true" ]]; then + echo "[INFO] --enable-gpu 生效,验证主机 GPU..." 
+ if [[ -f "$GPU_CHECK_SCRIPT" ]]; then if bash "$GPU_CHECK_SCRIPT" >/dev/null 2>&1; then - echo "[INFO] GPU环境可用,将启动GPU测试节点" - GPU_AVAILABLE=true + GPU_AVAILABLE=true + echo "[INFO] GPU 检测通过,将启动 gpu profile" else - echo "[INFO] GPU环境不可用,将跳过GPU测试节点" - GPU_AVAILABLE=false + echo "[ERROR] 主机缺少可用 GPU,无法继续 --enable-gpu 流程" >&2 + exit 1 fi + else + echo "[ERROR] 未找到 GPU 检测脚本: $GPU_CHECK_SCRIPT" >&2 + exit 1 + fi else - echo "[WARN] 未找到GPU检测脚本: $GPU_CHECK_SCRIPT,跳过GPU检测" - GPU_AVAILABLE=false + echo "[INFO] 未启用 GPU 流程" fi pushd "$TEST_ROOT" >/dev/null compose -p argus-sys down --remove-orphans || true +# 清理可能由 08 脚本创建的同名容器,避免 compose up 冲突 +for name in argus-node-b; do + if docker ps -aqf "name=^${name}$" >/dev/null 2>&1 && [[ -n "$(docker ps -aqf "name=^${name}$")" ]]; then + docker rm -f "$name" >/dev/null 2>&1 || true + fi +done + # 根据GPU可用性决定启动的服务 -if [ "$GPU_AVAILABLE" = true ]; then - echo "[INFO] 启动所有服务(包括test-gpu-node)..." - compose -p argus-sys up -d +if [[ "$GPU_AVAILABLE" == true ]]; then + echo "[INFO] 启动所有服务(包含 gpu profile)..." + compose -p argus-sys --profile gpu up -d else - echo "[INFO] 启动基础服务(跳过test-gpu-node)..." - compose -p argus-sys up -d --scale test-gpu-node=0 + echo "[INFO] 启动基础服务(不含 gpu profile)..." 
+ compose -p argus-sys up -d fi popd >/dev/null -if [ "$GPU_AVAILABLE" = true ]; then - echo "[OK] Services started: master:32300 es:9200 kibana:5601 node-a:2020 node-b:2021 test-gpu-node:172.29.0.51" +if [[ "$GPU_AVAILABLE" == true ]]; then + echo "[OK] Services started: master:32300 es:9200 kibana:5601 node-a:2020 node-b:2021 test-gpu-node:172.29.0.51" else - echo "[OK] Services started: master:32300 es:9200 kibana:5601 node-a:2020 node-b:2021 (test-gpu-node skipped)" + echo "[OK] Services started: master:32300 es:9200 kibana:5601 node-a:2020 node-b:2021 (gpu skipped)" fi - diff --git a/src/sys/tests/scripts/04_verify_dns_routing.sh b/src/sys/tests/scripts/04_verify_dns_routing.sh index 635c4fe..3b389d7 100755 --- a/src/sys/tests/scripts/04_verify_dns_routing.sh +++ b/src/sys/tests/scripts/04_verify_dns_routing.sh @@ -4,20 +4,15 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" -compose() { - if docker compose version >/dev/null 2>&1; then - docker compose "$@" - else - docker-compose "$@" - fi -} - -service_id() { - compose -p argus-sys ps -q "$1" +# 直接根据 container_name 获取容器ID,避免 compose project 名称不一致导致查找失败 +cid_by_name() { + docker ps -aqf "name=^$1$" } echo "[INFO] Verifying DNS routing via bind..." +pushd "$TEST_ROOT" >/dev/null + # Check master IP file exists in shared private MASTER_FILE="$TEST_ROOT/private/argus/etc/master.argus.com" if [[ ! 
-f "$MASTER_FILE" ]]; then @@ -28,7 +23,7 @@ MASTER_IP_HOST="$(cat "$MASTER_FILE" | tr -d '\r\n' || true)" echo "[INFO] master.argus.com file content: ${MASTER_IP_HOST}" # dig inside bind container -BIN_ID="$(service_id bind)" +BIN_ID="$(cid_by_name argus-bind-sys)" if [[ -n "$BIN_ID" ]]; then DIG_IP="$(docker exec "$BIN_ID" dig +short master.argus.com A | tail -n1 || true)" echo "[INFO] dig(master.argus.com) from bind container -> $DIG_IP" @@ -39,8 +34,8 @@ else echo "[WARN] bind container not found; skip dig" fi -for node in node-a node-b; do - CID="$(service_id "$node")" +for node in argus-node-a argus-node-b; do + CID="$(cid_by_name "$node")" echo "[INFO] Checking resolution inside $node..." if ! docker exec "$CID" getent hosts master.argus.com >/dev/null 2>&1; then echo "[ERR] $node cannot resolve master.argus.com" >&2 @@ -50,5 +45,6 @@ for node in node-a node-b; do echo "[OK] $node resolved master.argus.com -> $RES" done -echo "[OK] DNS routing verified" +popd >/dev/null +echo "[OK] DNS routing verified" diff --git a/src/sys/tests/scripts/05_agent_register.sh b/src/sys/tests/scripts/05_agent_register.sh index 073d949..d310a3f 100755 --- a/src/sys/tests/scripts/05_agent_register.sh +++ b/src/sys/tests/scripts/05_agent_register.sh @@ -49,8 +49,35 @@ for _ in {1..60}; do fi done +# 若仍未全部注册,尝试重启 node-b 并再等待一轮(兼容 DNS/启动时序抖动) if [[ ! -s "$TMP_DIR/node_id_a" || ! -s "$TMP_DIR/node_id_b" ]]; then - echo "[ERR] Agents did not register in time" >&2 + echo "[WARN] node-a or node-b not registered in first window; restarting node-b and retrying..." >&2 + # 仅重启 node-b,避免影响 es/kibana/master + if docker ps --format '{{.Names}}' | grep -q '^argus-node-b$'; then + docker restart argus-node-b >/dev/null 2>&1 || true + fi + # 再等待一轮(最多 120 秒) + > "$TMP_DIR/node_id_b" + for _ in {1..60}; do + sleep 2 + resp=$(curl -fsS "$API_BASE/nodes" 2>/dev/null || true) + [[ -z "$resp" ]] && continue + if ! 
echo "$resp" | head -c1 | grep -q '\['; then + continue + fi + echo "$resp" > "$TMP_DIR/nodes_list.json" + ID_A=$(extract_node "$HOST_A" "$TMP_DIR/node_id_a" "$TMP_DIR/nodes_list.json" 2>/dev/null || true) + ID_B=$(extract_node "$HOST_B" "$TMP_DIR/node_id_b" "$TMP_DIR/nodes_list.json" 2>/dev/null || true) + if [[ -s "$TMP_DIR/node_id_a" && -s "$TMP_DIR/node_id_b" ]]; then + break + fi + done +fi + +if [[ ! -s "$TMP_DIR/node_id_a" || ! -s "$TMP_DIR/node_id_b" ]]; then + echo "[ERR] Agents did not register in time (after retry)" >&2 + echo "[HINT] Current /nodes response:" >&2 + sed -n '1,200p' "$TMP_DIR/nodes_list.json" >&2 || true exit 1 fi diff --git a/src/sys/tests/scripts/07_logs_send_and_assert.sh b/src/sys/tests/scripts/07_logs_send_and_assert.sh index 0363ebf..a7d9e24 100755 --- a/src/sys/tests/scripts/07_logs_send_and_assert.sh +++ b/src/sys/tests/scripts/07_logs_send_and_assert.sh @@ -3,9 +3,19 @@ set -euo pipefail echo "[INFO] Sending logs via node-a/node-b and asserting ES counts..." +# Robust count helper: tolerates 404/503 and non-JSON responses, returns integer >=0 get_count() { - local idx="$1" - curl -s "http://localhost:9200/${idx}/_count?ignore_unavailable=true&allow_no_indices=true" | sed -E 's/.*"count":([0-9]+).*/\1/' | awk 'NF{print $0;exit} END{if(NR==0)print 0}' + local idx="$1"; local tmp; tmp=$(mktemp) + local code + code=$(curl -s -o "$tmp" -w "%{http_code}" "http://localhost:9200/${idx}/_count?ignore_unavailable=true&allow_no_indices=true" || true) + if [[ "$code" == "200" ]]; then + local val + val=$(jq -r '(.count // 0) | tonumber? // 0' "$tmp" 2>/dev/null || echo 0) + echo "$val" + else + echo 0 + fi + rm -f "$tmp" } train0=$(get_count "train-*") @@ -32,11 +42,26 @@ send_logs "$node_a" "host01" send_logs "$node_b" "host02" echo "[INFO] Waiting for ES to ingest..." 
-sleep 10 +# Proactively refresh indices (ignore errors if not created yet) +curl -s -X POST "http://localhost:9200/train-*/_refresh" >/dev/null 2>&1 || true +curl -s -X POST "http://localhost:9200/infer-*/_refresh" >/dev/null 2>&1 || true -train1=$(get_count "train-*") -infer1=$(get_count "infer-*") -final=$((train1 + infer1)) +# Retry up to 120s for counts to increase and reach threshold (>=4) +final=0 +threshold=4 +for attempt in {1..60}; do + train1=$(get_count "train-*") + infer1=$(get_count "infer-*") + final=$((train1 + infer1)) + if (( final > base && final >= threshold )); then + break + fi + echo "[..] waiting ES counts increase to >=${threshold} ($attempt/60) current=${final} base=${base}" + # refresh indices again to speed up visibility + curl -s -X POST "http://localhost:9200/train-*/_refresh" >/dev/null 2>&1 || true + curl -s -X POST "http://localhost:9200/infer-*/_refresh" >/dev/null 2>&1 || true + sleep 2 +done echo "[INFO] final counts: train=${train1} infer=${infer1} total=${final}" if (( final <= base )); then @@ -44,6 +69,7 @@ if (( final <= base )); then exit 1 fi +# Minimal threshold to be tolerant: expect at least 4 documents (2 train + 1 infer per node) if (( final < 4 )); then echo "[ERR] ES total below expected threshold: ${final} < 4" >&2 exit 1 diff --git a/src/sys/tests/scripts/08_restart_agent_reregister.sh b/src/sys/tests/scripts/08_restart_agent_reregister.sh index d9bf43a..baa763d 100755 --- a/src/sys/tests/scripts/08_restart_agent_reregister.sh +++ b/src/sys/tests/scripts/08_restart_agent_reregister.sh @@ -58,10 +58,25 @@ docker rm -f argus-node-b >/dev/null 2>&1 || true AGENT_BIN_PATH="$(cat "$TMP_DIR/agent_binary_path")" +# 选择 compose 管理的网络名(默认 argus-sys_sysnet)。 +detect_sysnet() { + if docker network inspect argus-sys_sysnet >/dev/null 2>&1; then + echo argus-sys_sysnet; return + fi + # 回退:从 master 容器推断所连网络(取第一个) + local n + n=$(docker inspect -f '{{range $k, $_ := .NetworkSettings.Networks}}{{println $k}}{{end}}' 
argus-master-sys 2>/dev/null | head -n1 || true) + if [[ -n "$n" ]]; then echo "$n"; return; fi + # 最后兜底:尝试项目默认网络(不保证有 IPAM) + echo argus-sys_default +} +SYSNET_NAME=$(detect_sysnet) +echo "[INFO] Using docker network: $SYSNET_NAME" + docker run -d \ --name argus-node-b \ --hostname dev-yyrshare-uuuu10-ep2f-pod-0 \ - --network argus-sys-net \ + --network "$SYSNET_NAME" \ --ip 172.29.0.200 \ --dns 172.29.0.2 \ -e MASTER_ENDPOINT=http://master.argus.com:3000 \ diff --git a/src/sys/tests/scripts/09_down.sh b/src/sys/tests/scripts/09_down.sh index d200540..ceb297d 100755 --- a/src/sys/tests/scripts/09_down.sh +++ b/src/sys/tests/scripts/09_down.sh @@ -12,12 +12,33 @@ compose() { fi } -docker rm -f argus-node-b >/dev/null 2>&1 || true - pushd "$TEST_ROOT" >/dev/null compose -p argus-sys down --remove-orphans || true +compose down --remove-orphans || true popd >/dev/null +echo "[INFO] Force removing containers by name (if any)..." +containers=( + argus-node-a + argus-node-b + argus-metric-test-node + argus-grafana + argus-kibana-sys + argus-master-sys + argus-bind-sys + argus-ftp + argus-es-sys + argus-prometheus +) +for c in "${containers[@]}"; do + id=$(docker ps -aqf "name=^${c}$" || true) + if [[ -n "$id" ]]; then + docker rm -f "$id" >/dev/null 2>&1 || true + fi +done + +echo "[INFO] Removing compose networks (handled by compose down)" + echo "[INFO] Cleaning private directories..." if [[ -d "$TEST_ROOT/private" ]]; then docker run --rm -v "$TEST_ROOT/private:/target" ubuntu:24.04 chown -R "$(id -u):$(id -g)" /target >/dev/null 2>&1 || true diff --git a/src/sys/tests/scripts/10_metric_publish.sh b/src/sys/tests/scripts/10_metric_publish.sh new file mode 100755 index 0000000..d7f31b1 --- /dev/null +++ b/src/sys/tests/scripts/10_metric_publish.sh @@ -0,0 +1,66 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +REPO_ROOT="$(cd "$TEST_ROOT/../../.." 
&& pwd)" + +PLUGIN_DIR="$REPO_ROOT/src/metric/client-plugins/all-in-one-full" +FTP_CONTAINER="argus-ftp" + +if [[ ! -d "$PLUGIN_DIR" ]]; then + echo "[SYS-METRIC] Metric client plugin directory not found: $PLUGIN_DIR" >&2 + exit 1 +fi + +if [[ -f "$TEST_ROOT/.env" ]]; then + # shellcheck source=/dev/null + source "$TEST_ROOT/.env" +fi + +OWNER="${ARGUS_BUILD_UID:-2133}:${ARGUS_BUILD_GID:-2015}" + +resolve_output_dir() { + local host_mount + if docker ps --format '{{.Names}}' | grep -q "^${FTP_CONTAINER}$"; then + host_mount=$(docker inspect "$FTP_CONTAINER" --format '{{range .Mounts}}{{if eq .Destination "/private/argus/ftp"}}{{.Source}}{{end}}{{end}}' 2>/dev/null || true) + if [[ -n "$host_mount" ]]; then + echo "$host_mount/share" + return 0 + fi + fi + echo "$TEST_ROOT/private/argus/metric/ftp/share" +} + +OUTPUT_DIR="$(resolve_output_dir)" +mkdir -p "$OUTPUT_DIR" + +if [[ ! -w "$OUTPUT_DIR" ]]; then + echo "[SYS-METRIC] 无法写入 FTP 输出目录: $OUTPUT_DIR" >&2 + echo " 请确认目录权限与 ARGUS_BUILD_UID/GID 一致" >&2 + exit 1 +fi + +pushd "$PLUGIN_DIR" >/dev/null + +echo "[SYS-METRIC] Bumping metric artifact version..." +bash scripts/version-manager.sh bump minor + +VERSION_FILE="config/VERSION" +if [[ ! -f "$VERSION_FILE" ]]; then + echo "[SYS-METRIC] VERSION 文件缺失: $VERSION_FILE" >&2 + exit 1 +fi + +VERSION=$(tr -d '\n' < "$VERSION_FILE") +echo "[SYS-METRIC] 当前版本: $VERSION" + +echo "[SYS-METRIC] Packaging metric artifact..." +bash scripts/package_artifact.sh --force + +echo "[SYS-METRIC] Publishing artifact to FTP share..." 
+bash scripts/publish_artifact.sh "$VERSION" --output-dir "$OUTPUT_DIR" --owner "$OWNER" + +popd >/dev/null + +echo "[SYS-METRIC] Metric artifact published to $OUTPUT_DIR" diff --git a/src/sys/tests/scripts/11_metric_node_install.sh b/src/sys/tests/scripts/11_metric_node_install.sh new file mode 100755 index 0000000..11a6104 --- /dev/null +++ b/src/sys/tests/scripts/11_metric_node_install.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +if [[ -f "$TEST_ROOT/.env" ]]; then + # shellcheck source=/dev/null + source "$TEST_ROOT/.env" +fi + +CONTAINER="argus-metric-test-node" + +if ! docker ps --format '{{.Names}}' | grep -q "^${CONTAINER}$"; then + echo "[SYS-METRIC] 容器 ${CONTAINER} 未运行,无法执行安装" >&2 + exit 1 +fi + +FTP_HOST="${FTP_SERVER:-172.29.0.40}" +FTP_USER="${FTP_USER:-ftpuser}" +FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}" +FTP_PORT="${FTP_PORT:-21}" + +echo "[SYS-METRIC] 在 ${CONTAINER} 内执行安装 (FTP: ${FTP_HOST}:${FTP_PORT})" + +docker exec \ + -e FTP_HOST="$FTP_HOST" \ + -e FTP_USER="$FTP_USER" \ + -e FTP_PASSWORD="$FTP_PASSWORD" \ + -e FTP_PORT="$FTP_PORT" \ + "$CONTAINER" bash -c ' +set -e + +if ! command -v curl &>/dev/null; then + echo "[SYS-METRIC] curl 未安装,开始安装依赖..." + apt-get update >/dev/null && apt-get install -y curl >/dev/null +fi + +cd /tmp +echo "[SYS-METRIC] 下载 setup.sh..." +curl -u "${FTP_USER}:${FTP_PASSWORD}" "ftp://${FTP_HOST}:${FTP_PORT}/setup.sh" -o setup.sh + +echo "[SYS-METRIC] 执行安装..." 
+chmod +x setup.sh +bash setup.sh --server "${FTP_HOST}" --user "${FTP_USER}" --password "${FTP_PASSWORD}" --port "${FTP_PORT}" + +echo "[SYS-METRIC] 安装完成" +' + +echo "[SYS-METRIC] Metric test node 安装流程完成" diff --git a/src/sys/tests/scripts/12_metric_gpu_install.sh b/src/sys/tests/scripts/12_metric_gpu_install.sh new file mode 100755 index 0000000..ba3b875 --- /dev/null +++ b/src/sys/tests/scripts/12_metric_gpu_install.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +ENABLE_GPU=${ARGUS_SYS_ENABLE_GPU:-false} + +if [[ "$ENABLE_GPU" != "true" ]]; then + echo "[SYS-METRIC] 未启用 GPU 流程,跳过 GPU 节点安装" + exit 0 +fi + +if [[ -f "$TEST_ROOT/.env" ]]; then + # shellcheck source=/dev/null + source "$TEST_ROOT/.env" +fi + +CONTAINER="argus-metric-test-gpu-node" + +if ! docker ps --format '{{.Names}}' | grep -q "^${CONTAINER}$"; then + echo "[SYS-METRIC] 预期启动的 ${CONTAINER} 未运行" >&2 + exit 1 +fi + +FTP_HOST="${FTP_SERVER:-172.29.0.40}" +FTP_USER="${FTP_USER:-ftpuser}" +FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}" +FTP_PORT="${FTP_PORT:-21}" + +echo "[SYS-METRIC] 在 GPU 节点执行安装 (FTP: ${FTP_HOST}:${FTP_PORT})" + +docker exec \ + -e FTP_HOST="$FTP_HOST" \ + -e FTP_USER="$FTP_USER" \ + -e FTP_PASSWORD="$FTP_PASSWORD" \ + -e FTP_PORT="$FTP_PORT" \ + "$CONTAINER" bash -c ' +set -e + +if ! command -v nvidia-smi &>/dev/null; then + echo "[SYS-METRIC] GPU 节点缺少 nvidia-smi" >&2 + exit 1 +fi + +nvidia-smi >/dev/null || true + +if ! command -v curl &>/dev/null; then + echo "[SYS-METRIC] curl 未安装,开始安装依赖..." + apt-get update >/dev/null && apt-get install -y curl >/dev/null +fi + +cd /tmp +echo "[SYS-METRIC] 下载 setup.sh..." +curl -u "${FTP_USER}:${FTP_PASSWORD}" "ftp://${FTP_HOST}:${FTP_PORT}/setup.sh" -o setup.sh + +echo "[SYS-METRIC] 执行安装..." 
+chmod +x setup.sh +bash setup.sh --server "${FTP_HOST}" --user "${FTP_USER}" --password "${FTP_PASSWORD}" --port "${FTP_PORT}" + +echo "[SYS-METRIC] GPU 节点安装完成" +' + +echo "[SYS-METRIC] Metric GPU 节点安装流程完成" diff --git a/src/sys/tests/scripts/13_metric_verify.sh b/src/sys/tests/scripts/13_metric_verify.sh new file mode 100755 index 0000000..f60b1b5 --- /dev/null +++ b/src/sys/tests/scripts/13_metric_verify.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +echo "[SYS-METRIC] Verify: master" +"$SCRIPT_DIR/13_metric_verify_master.sh" +echo + +echo "[SYS-METRIC] Verify: prometheus" +PROM_RETRIES=${PROM_VERIFY_RETRIES:-2} +PROM_BACKOFF=${PROM_VERIFY_BACKOFF_SECONDS:-30} +attempt=0 +while true; do + if "$SCRIPT_DIR/13_metric_verify_prometheus.sh"; then + break + fi + attempt=$((attempt+1)) + if (( attempt > PROM_RETRIES )); then + echo "[ERR] prometheus verify failed after $PROM_RETRIES retries" >&2 + exit 1 + fi + echo "[WARN] prometheus verify failed; retry $attempt/$PROM_RETRIES after ${PROM_BACKOFF}s" + sleep "$PROM_BACKOFF" +done +echo + +echo "[SYS-METRIC] Verify: dataplane" +"$SCRIPT_DIR/13_metric_verify_dataplane.sh" +echo + +echo "[SYS-METRIC] Verify: grafana" +"$SCRIPT_DIR/13_metric_verify_grafana.sh" +echo + +echo "[SYS-METRIC] Verify: grafana panels" +"$SCRIPT_DIR/13_metric_verify_grafana_panels.sh" +echo + +echo "[SYS-METRIC] Metric verification completed" diff --git a/src/sys/tests/scripts/13_metric_verify_dataplane.sh b/src/sys/tests/scripts/13_metric_verify_dataplane.sh new file mode 100755 index 0000000..527aae8 --- /dev/null +++ b/src/sys/tests/scripts/13_metric_verify_dataplane.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +set -euo pipefail + +TMP_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")"/.. 
&& pwd)/tmp/metric-verify"
+mkdir -p "$TMP_DIR"
+
+PROM_BASE="http://localhost:9090/api/v1"
+INSTANCE="${METRIC_TEST_INSTANCE:-172.29.0.50:9100}"
+IP_ONLY="${INSTANCE%%:*}"  # host part only; used as the ip="..." label matcher below
+
+echo "[VERIFY:DATA] node exporter metrics present in container"
+docker exec argus-metric-test-node bash -lc "curl -fsS --max-time 5 http://localhost:9100/metrics | head -n 5" > "$TMP_DIR/node_metrics_head.txt" || { echo "[ERR] cannot fetch node exporter metrics" >&2; exit 1; }
+if ! grep -E "node_(exporter_build_info|time_seconds)" -q "$TMP_DIR/node_metrics_head.txt"; then
+  echo "[WARN] head did not show expected lines; continuing (exporter may output later lines)"
+fi
+echo "[OK] node exporter endpoint reachable"
+
+echo "[VERIFY:DATA] Prometheus has recent sample for build_info"
+curl -fsS --max-time 5 --get "$PROM_BASE/query" --data-urlencode "query=timestamp(node_exporter_build_info{job=\"node\",ip=\"$IP_ONLY\"})" > "$TMP_DIR/prom_ne_build_info_1.json"
+
+python3 - "$TMP_DIR/prom_ne_build_info_1.json" <<'PY'
+import json,sys,time
+j=json.load(open(sys.argv[1]))
+res=j.get('data',{}).get('result',[])
+assert res, 'no result for node_exporter_build_info'
+ts=float(res[0]['value'][1])  # timestamp() puts the sample's own time in value[1]; value[0] is only the query evaluation time
+now=time.time()
+assert now-ts<180, f"sample too old: now={now} ts={ts}"
+print(int(ts))
+PY
+# A failed recency assertion above already aborts the script via set -e, so no exit-status capture is needed here.
+sleep 30
+curl -fsS --max-time 5 --get "$PROM_BASE/query" --data-urlencode "query=timestamp(node_exporter_build_info{job=\"node\",ip=\"$IP_ONLY\"})" > "$TMP_DIR/prom_ne_build_info_2.json"
+
+TS1=$(python3 - "$TMP_DIR/prom_ne_build_info_1.json" <<'PY'
+import json,sys
+print(float(json.load(open(sys.argv[1]))['data']['result'][0]['value'][1]))
+PY
+)
+TS2=$(python3 - "$TMP_DIR/prom_ne_build_info_2.json" <<'PY'
+import json,sys
+print(float(json.load(open(sys.argv[1]))['data']['result'][0]['value'][1]))
+PY
+)
+awk -v a="$TS1" -v b="$TS2" 'BEGIN{ if (b>=a) exit 0; else exit 1 }' || { echo "[ERR] sample timestamp did not advance" >&2; exit 1; }  # >= tolerates no new scrape landing within the 30s sleep
+echo "[OK] sample timestamp advanced"
+echo "[DONE] dataplane verify"
diff --git a/src/sys/tests/scripts/13_metric_verify_grafana.sh b/src/sys/tests/scripts/13_metric_verify_grafana.sh
new file mode 100755
index 0000000..baa9bd3
--- /dev/null
+++ b/src/sys/tests/scripts/13_metric_verify_grafana.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+PROM_DOMAIN="prom.metric.argus.com:9090"
+GRAF="http://localhost:3000"
+
+echo "[VERIFY:GRAFANA] /api/health"
+TMP_FILE="$(cd "$(dirname "$0")"/.. && pwd)/tmp/metric-verify/graf_health.json"
+mkdir -p "$(dirname "$TMP_FILE")"
+curl -fsS --max-time 10 "$GRAF/api/health" -o "$TMP_FILE" || { echo "[ERR] failed to GET /api/health" >&2; exit 1; }
+python3 - "$TMP_FILE" <<'PY'
+import sys,json
+with open(sys.argv[1],'r',encoding='utf-8') as f:
+    j=json.load(f)
+assert j.get('database')=='ok', f"health not ok: {j}"
+print('OK')
+PY
+
+echo "[VERIFY:GRAFANA] datasource URL uses domain: $PROM_DOMAIN"
+DS_FILE="/private/argus/metric/grafana/provisioning/datasources/datasources.yml"
+if !
docker exec argus-grafana sh -lc "test -f $DS_FILE"; then + DS_FILE="/etc/grafana/provisioning/datasources/datasources.yml" +fi +docker exec argus-grafana sh -lc "grep -E 'url:\s*http://$PROM_DOMAIN' '$DS_FILE'" >/dev/null 2>&1 || { echo "[ERR] datasource not pointing to $PROM_DOMAIN" >&2; exit 1; } +echo "[OK] datasource points to domain" + +echo "[VERIFY:GRAFANA] bind resolution inside grafana" +tries=0 +until docker exec argus-grafana getent hosts prom.metric.argus.com >/dev/null 2>&1; do + tries=$((tries+1)) + if (( tries > 24 )); then + echo "[ERR] grafana cannot resolve prom.metric.argus.com" >&2 + exit 1 + fi + echo "[..] waiting DNS propagation in grafana ($tries/24)"; sleep 5 +done +echo "[OK] domain resolves" + +echo "[DONE] grafana verify" diff --git a/src/sys/tests/scripts/13_metric_verify_grafana_panels.sh b/src/sys/tests/scripts/13_metric_verify_grafana_panels.sh new file mode 100755 index 0000000..962e75b --- /dev/null +++ b/src/sys/tests/scripts/13_metric_verify_grafana_panels.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." 
&& pwd)" +TMP_DIR="$TEST_ROOT/tmp/metric-verify" +mkdir -p "$TMP_DIR" + +GRAF="http://localhost:3000" +HOSTNAME="${METRIC_TEST_HOSTNAME:-test-metric-node-001}" + +echo "[VERIFY:GRAF-PANELS] resolve Prometheus datasource UID via Grafana" +DS_JSON="$TMP_DIR/graf_ds.json" +curl -fsS --max-time 10 "$GRAF/api/datasources" >"$DS_JSON" +DS_UID=$(python3 - "$DS_JSON" <<'PY' +import json,sys +arr=json.load(open(sys.argv[1])) +for ds in arr: + if (ds.get('type')=='prometheus'): + print(ds.get('uid','')) + break +PY +) +if [[ -z "$DS_UID" ]]; then echo "[ERR] no prometheus datasource found in grafana" >&2; exit 1; fi +echo "[OK] Prometheus DS UID=$DS_UID" + +proxy_query() { + local q="$1"; local out="$2" + curl -fsS --max-time 10 --get "$GRAF/api/datasources/proxy/uid/$DS_UID/api/v1/query" \ + --data-urlencode "query=$q" >"$out" +} + +assert_vector_recent_nonempty() { + local json="$1"; local max_age_sec="${2:-180}" + python3 - <<'PY' "$json" "$max_age_sec" +import json,sys,time +doc=json.load(open(sys.argv[1])) +if doc.get('status')!='success': + raise SystemExit('prom status != success') +res=doc.get('data',{}).get('result',[]) +assert res, 'empty result' +ts=float(res[0]['value'][0]) +assert time.time()-ts < float(sys.argv[2]), f'timestamp too old: {ts}' +print(int(ts)) +PY +} + +echo "[VERIFY:GRAF-PANELS] Dashboard: Node and GPU Metrics — System Load" +Q_NODE_LOAD="node_load1{hostname=\"$HOSTNAME\"}" +proxy_query "$Q_NODE_LOAD" "$TMP_DIR/graf_panel_node_load.json" +assert_vector_recent_nonempty "$TMP_DIR/graf_panel_node_load.json" 300 >/dev/null +echo "[OK] node_load1 has recent sample via Grafana proxy" + +echo "[VERIFY:GRAF-PANELS] Dashboard: Cluster Dashboard — Node online count" +Q_NODE_ONLINE='count(count by(hostname) (up{job="node"} == 1))' +proxy_query "$Q_NODE_ONLINE" "$TMP_DIR/graf_panel_node_online.json" +python3 - "$TMP_DIR/graf_panel_node_online.json" <<'PY' +import json,sys +doc=json.load(open(sys.argv[1])) +assert doc.get('status')=='success', 'prom status 
not success' +res=doc.get('data',{}).get('result',[]) +assert res, 'no series for node online count' +val=float(res[0]['value'][1]) +assert val>=1, f'node online < 1: {val}' +print('OK',val) +PY +echo "[OK] cluster node online count >= 1 via Grafana proxy" + +echo "[DONE] grafana panels verify" + diff --git a/src/sys/tests/scripts/13_metric_verify_master.sh b/src/sys/tests/scripts/13_metric_verify_master.sh new file mode 100755 index 0000000..8e4032f --- /dev/null +++ b/src/sys/tests/scripts/13_metric_verify_master.sh @@ -0,0 +1,105 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +TMP_DIR="$TEST_ROOT/tmp/metric-verify" +mkdir -p "$TMP_DIR" + +MASTER_BASE="http://localhost:32300/api/v1/master" +HOSTNAME="${METRIC_TEST_HOSTNAME:-test-metric-node-001}" + +curl_json() { curl -fsS --max-time 5 "$1"; } + +echo "[VERIFY:MASTER] list nodes and locate target hostname=$HOSTNAME" +ALL_NODES_JSON="$TMP_DIR/master_nodes.json" + +# 重试等待节点出现在 /nodes 列表(最多 120s) +NODE_ID="" +for attempt in {1..24}; do + curl_json "$MASTER_BASE/nodes" > "$ALL_NODES_JSON" || true + NODE_ID=$(python3 - "$ALL_NODES_JSON" "$HOSTNAME" <<'PY' +import json,sys +try: + nodes=json.load(open(sys.argv[1])) +except Exception: + nodes=[] +name=sys.argv[2] +for n in nodes: + if n.get('name')==name: + print(n.get('id','')) + break +PY + ) + if [[ -n "$NODE_ID" ]]; then break; fi + echo "[..] 
waiting node to appear in /nodes ($attempt/24)"; sleep 5 +done + +if [[ -z "$NODE_ID" ]]; then + echo "[ERR] master /nodes 中未找到 $HOSTNAME(等待超时)" >&2 + echo "[HINT] 当前 /nodes 列表如下:" >&2 + sed -n '1,160p' "$ALL_NODES_JSON" >&2 || true + exit 1 +fi +echo "[OK] node id=$NODE_ID" + +echo "[VERIFY:MASTER] get node detail and assert fields" +DETAIL1_JSON="$TMP_DIR/master_node_${NODE_ID}_detail_1.json" +curl_json "$MASTER_BASE/nodes/$NODE_ID" > "$DETAIL1_JSON" + +# 基础字段与健康项检查(不强制立即 online) +python3 - "$DETAIL1_JSON" "$HOSTNAME" <<'PY' +import json,sys,datetime +j=json.load(open(sys.argv[1])) +host=sys.argv[2] +assert j.get('name')==host, f"name mismatch: {j.get('name')} != {host}" +status=j.get('status') +assert status in ('initialized','online','offline'), f"unexpected status: {status}" +md=j.get('meta_data',{}) +assert md.get('hostname',j.get('name'))==host, 'meta_data.hostname mismatch' +assert 'last_report' in j and j['last_report'], 'last_report missing' +h=j.get('health',{}) +for key in ('metric-node-exporter','metric-fluent-bit','metric-argus-agent'): + if key in h: + assert h[key].get('status')=='healthy', f"{key} not healthy: {h[key]}" +print('OK') +PY + +# 轮询等待 last_report 前进并最终转为 online(最多 90s),容忍短暂 5xx/网络错误 +attempt=0 +T_PRE=0 +until [[ $attempt -ge 18 ]]; do + sleep 5 + DETAIL_CUR="$TMP_DIR/master_node_${NODE_ID}_detail_cur.json" + if ! curl_json "$MASTER_BASE/nodes/$NODE_ID" > "$DETAIL_CUR" 2>/dev/null; then + echo "[..] 
retrying node detail fetch ($attempt/18)"; attempt=$((attempt+1)); continue  # not ((attempt++)): it returns status 1 when attempt is 0, which aborts under set -e
+  fi
+  { read -r STATUS_CUR; read -r T_CUR; } < <(python3 - "$DETAIL_CUR" <<'PY'
+import json,sys,datetime
+j=json.load(open(sys.argv[1]))
+st=j.get('status','')
+ts=j.get('last_report') or ''  # a JSON null would otherwise break .endswith() below
+if ts.endswith('Z'): ts=ts.replace('Z','+00:00')  # normalise a trailing 'Z' to an explicit UTC offset before parsing
+try:
+    t=float(datetime.datetime.fromisoformat(ts).timestamp())
+except Exception:
+    t=0.0  # empty or unparsable last_report is reported as 0
+print(st)
+print(t)
+PY
+  )  # the helper prints status and timestamp on two separate lines, so each needs its own read
+  if awk -v a="$T_PRE" -v b="$T_CUR" 'BEGIN{exit !(b>a)}'; then
+    T_PRE="$T_CUR"
+  fi
+  if [[ "$STATUS_CUR" == "online" ]]; then
+    echo "[OK] status online and last_report progressed"
+    break
+  fi
+  attempt=$((attempt+1))
+done
+if (( attempt >= 18 )) && [[ "${STATUS_CUR:-}" != "online" ]]; then  # STATUS_CUR stays unset when every fetch failed
+  echo "[WARN] status did not reach online within timeout; continuing"
+fi
+
+echo "$NODE_ID" > "$TMP_DIR/node_id_metric"
+echo "[DONE] master verify"
diff --git a/src/sys/tests/scripts/13_metric_verify_prometheus.sh b/src/sys/tests/scripts/13_metric_verify_prometheus.sh
new file mode 100755
index 0000000..b0d45e3
--- /dev/null
+++ b/src/sys/tests/scripts/13_metric_verify_prometheus.sh
@@ -0,0 +1,142 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+TEST_ROOT="$(cd "$SCRIPT_DIR/.."
&& pwd)" +TMP_DIR="$TEST_ROOT/tmp/metric-verify" +mkdir -p "$TMP_DIR" + +PROM_BASE="http://localhost:9090/api/v1" +HOSTNAME="${METRIC_TEST_HOSTNAME:-test-metric-node-001}" + +nodes_json="$TEST_ROOT/private/argus/metric/prometheus/nodes.json" +targets_json="$TEST_ROOT/private/argus/metric/prometheus/targets/node_exporter.json" + +echo "[VERIFY:PROM] nodes.json present and contains hostname=$HOSTNAME" +[[ -f "$nodes_json" ]] || { echo "[ERR] $nodes_json missing" >&2; exit 1; } +python3 - "$nodes_json" "$HOSTNAME" <<'PY' +import json,sys +arr=json.load(open(sys.argv[1])) +host=sys.argv[2] +assert any((i.get('hostname')==host) for i in arr), f"{host} not found in nodes.json" +PY +echo "[OK] nodes.json contains target" + +echo "[VERIFY:PROM] file_sd targets exist for nodes.json entries" +[[ -f "$targets_json" ]] || { echo "[ERR] $targets_json missing" >&2; exit 1; } +python3 - "$nodes_json" "$targets_json" "$HOSTNAME" >"$TMP_DIR/prom_targets_ip_inst.txt" <<'PY' +import json,sys +nodes=json.load(open(sys.argv[1])) +file_sd=json.load(open(sys.argv[2])) +host=sys.argv[3] +targets=set() +for item in file_sd: + for t in item.get('targets',[]): targets.add(t) +# choose node matching hostname; fallback to first metric user node; otherwise first +sel = None +for n in nodes: + if n.get('hostname') == host: + sel = n + break +if not sel: + for n in nodes: + if n.get('user_id') == 'metric': + sel = n + break +if not sel and nodes: + sel = nodes[0] +if not sel: + raise SystemExit('nodes.json empty or no suitable node found') +ip = sel['ip'] +inst = f"{ip}:9100" +print(ip) +print(inst) +PY +IP_FIRST=$(sed -n '1p' "$TMP_DIR/prom_targets_ip_inst.txt") +INSTANCE=$(sed -n '2p' "$TMP_DIR/prom_targets_ip_inst.txt") +echo "[INFO] expecting instance in file_sd: $INSTANCE" + +# 尝试在 Prometheus 容器内主动刷新 targets(可选加速) +if docker ps --format '{{.Names}}' | grep -q '^argus-prometheus$'; then + echo "[..] 
triggering update_targets inside argus-prometheus"
+  docker exec argus-prometheus bash -lc \
+    'python3 /usr/local/bin/update_targets.py --config /private/argus/metric/prometheus/nodes.json --targets-dir /private/argus/metric/prometheus/targets >/dev/null 2>&1 || true'
+fi
+
+# Give Prometheus one initial scrape cycle
+sleep 10
+
+# If the target has not been generated yet, retry (up to 180s), re-triggering the refresh several times meanwhile
+retry=0
+until jq -r '.[].targets[]' "$targets_json" 2>/dev/null | grep -qxF -- "${IP_FIRST}:9100"; do
+  if (( retry >= 36 )); then
+    echo "[ERR] ${IP_FIRST}:9100 not present in file_sd after timeout" >&2
+    echo "[HINT] current targets file content:" >&2
+    sed -n '1,200p' "$targets_json" >&2 || true
+    exit 1
+  fi
+  if (( retry % 3 == 0 )) && docker ps --format '{{.Names}}' | grep -q '^argus-prometheus$'; then
+    docker exec argus-prometheus bash -lc \
+      'python3 /usr/local/bin/update_targets.py --config /private/argus/metric/prometheus/nodes.json --targets-dir /private/argus/metric/prometheus/targets >/dev/null 2>&1 || true'
+  fi
+  echo "[..] waiting file_sd refresh ($retry/36)"; sleep 5; retry=$((retry+1))  # not ((retry++)): it returns status 1 when retry is 0, which aborts under set -e
+done
+
+# Use the PromQL up metric as the health signal instead, to avoid flapping of the targets page status
+echo "[VERIFY:PROM] up{job=\"node\",ip=\"$IP_FIRST\"} > 0"
+attempt=0
+until (( attempt >= 60 )); do
+  curl -fsS --max-time 5 --get "$PROM_BASE/query" --data-urlencode "query=up{job=\"node\",ip=\"$IP_FIRST\"}" > "$TMP_DIR/prom_up_inst_active.json" || true
+  if python3 - "$TMP_DIR/prom_up_inst_active.json" <<'PY'
+import json,sys
+try:
+    j=json.load(open(sys.argv[1]))
+except Exception:
+    raise SystemExit(1)
+res=j.get('data',{}).get('result',[])
+if res:
+    try:
+        val=float(res[0]['value'][1])
+        if val>0: raise SystemExit(0)
+    except Exception:
+        pass
+raise SystemExit(1)
+PY
+  then
+    echo "[OK] up > 0 (control-plane scrape works)"; break
+  fi
+  if (( attempt % 6 == 0 )) && docker ps --format '{{.Names}}' | grep -q '^argus-prometheus$'; then
+    docker exec argus-prometheus bash -lc \
+      'python3 /usr/local/bin/update_targets.py --config
/private/argus/metric/prometheus/nodes.json --targets-dir /private/argus/metric/prometheus/targets >/dev/null 2>&1 || true'
+  fi
+  echo "[..] waiting up{job=\"node\",ip=\"$IP_FIRST\"} > 0 ($attempt/60)"; sleep 5; attempt=$((attempt+1))  # not ((attempt++)): it returns status 1 when attempt is 0, which aborts under set -e
+done
+if (( attempt >= 60 )); then
+  echo "[ERR] up{job=\"node\",ip=\"$IP_FIRST\"} did not become > 0" >&2
+  exit 1
+fi
+
+echo "[VERIFY:PROM] instant up query > 0"
+curl -fsS --max-time 5 --get "$PROM_BASE/query" --data-urlencode "query=up{job=\"node\",ip=\"$IP_FIRST\"}" > "$TMP_DIR/prom_up_inst.json"
+python3 - "$TMP_DIR/prom_up_inst.json" <<'PY'
+import json,sys
+j=json.load(open(sys.argv[1]))
+res=j.get('data',{}).get('result',[])
+assert res, 'empty result for up{job="node",instance=...}'
+val=float(res[0]['value'][1])
+assert val>0, f"up value not > 0: {val}"
+PY
+echo "[OK] up > 0"
+
+echo "[VERIFY:PROM] count(up{job=\"node\"}==1) >= 1"
+curl -fsS --max-time 5 --get "$PROM_BASE/query" --data-urlencode "query=count(up{job=\"node\"}==1)" > "$TMP_DIR/prom_up_count.json"
+python3 - "$TMP_DIR/prom_up_count.json" <<'PY'
+import json,sys
+j=json.load(open(sys.argv[1]))
+res=j.get('data',{}).get('result',[])
+assert res, 'empty result for count(up{job="node"}==1)'
+val=float(res[0]['value'][1])
+assert val>=1, f"count < 1: {val}"
+PY
+echo "[OK] up count satisfied"
+echo "[DONE] prometheus verify"
diff --git a/src/sys/tests/scripts/14_metric_cleanup.sh b/src/sys/tests/scripts/14_metric_cleanup.sh
new file mode 100755
index 0000000..5c4f3b6
--- /dev/null
+++ b/src/sys/tests/scripts/14_metric_cleanup.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+
+FTP_SHARE="$TEST_ROOT/private/argus/metric/ftp/share"
+
+if [[ -d "$FTP_SHARE" ]]; then
+  echo "[SYS-METRIC] 清理 FTP 发布产物..."
+ rm -f "$FTP_SHARE"/argus-metric_*.tar.gz 2>/dev/null || true + rm -f "$FTP_SHARE"/LATEST_VERSION 2>/dev/null || true + rm -f "$FTP_SHARE"/dns.conf "$FTP_SHARE"/setup.sh 2>/dev/null || true +else + echo "[SYS-METRIC] FTP 目录不存在,跳过清理" +fi + +echo "[SYS-METRIC] Metric 清理完成" diff --git a/src/sys/tests/scripts/metric/test-node-entrypoint.sh b/src/sys/tests/scripts/metric/test-node-entrypoint.sh new file mode 100755 index 0000000..1f1c5c4 --- /dev/null +++ b/src/sys/tests/scripts/metric/test-node-entrypoint.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash +set -euo pipefail + +ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} +ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} +AGENT_ROOT=${AGENT_ROOT:-/private/argus/agent} +PREPARED_FLAG="/tmp/.metric_node_prepared" + +export DEBIAN_FRONTEND=${DEBIAN_FRONTEND:-noninteractive} + +if [[ ! -f "$PREPARED_FLAG" ]]; then + apt-get update -qq + apt-get install -y -qq \ + curl \ + net-tools \ + iproute2 \ + lsof \ + procps \ + ca-certificates \ + gnupg2 || { + echo "[metric-node] Failed to install base packages" >&2 + exit 1 + } + + mkdir -p "$(dirname "$PREPARED_FLAG")" + touch "$PREPARED_FLAG" +fi + +if [[ -n "${TZ:-}" ]]; then + ln -snf "/usr/share/zoneinfo/${TZ}" /etc/localtime 2>/dev/null || true + echo "$TZ" > /etc/timezone 2>/dev/null || true +fi + +mkdir -p "$AGENT_ROOT" +chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" "$AGENT_ROOT" 2>/dev/null || true + +if [[ "${METRIC_NODE_ROLE:-cpu}" == "gpu" ]]; then + if ! 
command -v nvidia-smi >/dev/null 2>&1; then + echo "[metric-node] nvidia-smi not available but GPU role requested" >&2 + exit 1 + fi + nvidia-smi || true +fi + +exec "$@" diff --git a/src/sys/tests/scripts/node_entrypoint.sh b/src/sys/tests/scripts/node_entrypoint.sh index e1ed888..b313506 100755 --- a/src/sys/tests/scripts/node_entrypoint.sh +++ b/src/sys/tests/scripts/node_entrypoint.sh @@ -46,7 +46,9 @@ fi # Start Fluent Bit in background (will block, so run via bash -lc &) if [[ -x /private/start-fluent-bit.sh ]]; then log "starting fluent-bit" - bash -lc '/private/start-fluent-bit.sh' & + sysctl -w fs.inotify.max_user_instances=512 >/dev/null 2>&1 || true + sysctl -w fs.inotify.max_user_watches=524288 >/dev/null 2>&1 || true + bash -lc 'ulimit -n 65536 || true; exec /private/start-fluent-bit.sh' & else log "missing /private/start-fluent-bit.sh; fluent-bit will not start" fi @@ -54,4 +56,3 @@ fi # Start agent in foreground as runtime user log "starting argus-agent" exec su -s /bin/bash -c /usr/local/bin/argus-agent "$RUNTIME_USER" - diff --git a/src/web/build_tools/frontend/Dockerfile b/src/web/build_tools/frontend/Dockerfile index 3c87684..94aa7da 100644 --- a/src/web/build_tools/frontend/Dockerfile +++ b/src/web/build_tools/frontend/Dockerfile @@ -24,24 +24,37 @@ RUN apt-get update && \ apt-get clean && rm -rf /var/lib/apt/lists/* ENV FRONTEND_BASE_PATH=/private/argus/web/frontend -ARG ARGUS_UID=2133 -ARG ARGUS_GID=2015 -ENV ARGUS_UID=${ARGUS_UID} -ENV ARGUS_GID=${ARGUS_GID} +ARG ARGUS_BUILD_UID=2133 +ARG ARGUS_BUILD_GID=2015 +ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} +ENV ARGUS_BUILD_GID=${ARGUS_BUILD_GID} RUN mkdir -p ${FRONTEND_BASE_PATH} && \ mkdir -p /private/argus/etc # 创建 web 用户(可自定义 UID/GID) # 创建 web 用户组 -RUN groupadd -g ${ARGUS_GID} web - -# 创建 web 用户并指定组 -RUN useradd -M -s /usr/sbin/nologin -u ${ARGUS_UID} -g ${ARGUS_GID} web - -RUN chown -R web:web ${FRONTEND_BASE_PATH} && \ - chown -R web:web /private/argus/etc && \ - chown -R web:web /usr/local/bin 
+RUN set -eux; \ + # 确保目标 GID 存在(组名可不固定)\ + if ! getent group "${ARGUS_BUILD_GID}" >/dev/null; then \ + groupadd -g "${ARGUS_BUILD_GID}" web || true; \ + fi; \ + # 若存在 web 用户则尽量对齐 UID/GID;否则仅在 UID 未被占用时创建 + if id web >/dev/null 2>&1; then \ + current_uid="$(id -u web)"; \ + if [ "$current_uid" != "${ARGUS_BUILD_UID}" ] && ! getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \ + usermod -u "${ARGUS_BUILD_UID}" web; \ + fi; \ + usermod -g "${ARGUS_BUILD_GID}" web || true; \ + else \ + if ! getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \ + useradd -M -s /usr/sbin/nologin -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" web; \ + else \ + echo "UID ${ARGUS_BUILD_UID} already exists; skip creating user 'web'"; \ + fi; \ + fi; \ + # 用数值 UID:GID 赋权,避免依赖用户名/组名 + chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" ${FRONTEND_BASE_PATH} /private/argus/etc /usr/local/bin || true # 配置内网 apt 源 (如果指定了内网选项) RUN if [ "$USE_INTRANET" = "true" ]; then \ diff --git a/src/web/build_tools/frontend/build.sh b/src/web/build_tools/frontend/build.sh index 972e0d0..33e29c0 100644 --- a/src/web/build_tools/frontend/build.sh +++ b/src/web/build_tools/frontend/build.sh @@ -4,7 +4,7 @@ docker pull ubuntu:24.04 source src/web/tests/.env docker build \ - --build-arg ARGUS_UID=${ARGUS_UID} \ - --build-arg ARGUS_GID=${ARGUS_GID} \ + --build-arg ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \ + --build-arg ARGUS_BUILD_GID=${ARGUS_BUILD_GID} \ -f src/web/build_tools/frontend/Dockerfile -t argus-web-frontend:latest . 
docker save -o argus-web-frontend-latest.tar argus-web-frontend:latest diff --git a/src/web/build_tools/frontend/start-web-supervised.sh b/src/web/build_tools/frontend/start-web-supervised.sh index 84382a1..b5cfc76 100644 --- a/src/web/build_tools/frontend/start-web-supervised.sh +++ b/src/web/build_tools/frontend/start-web-supervised.sh @@ -8,8 +8,8 @@ DNS_SCRIPT="${DNS_DIR}/update-dns.sh" DOMAIN=web.argus.com WEB_DOMAIN_FILE="${DNS_DIR}/${DOMAIN}" RUNTIME_USER="${ARGUS_RUNTIME_USER:-argus}" -RUNTIME_UID="${ARGUS_UID:-2133}" -RUNTIME_GID="${ARGUS_GID:-2015}" +RUNTIME_UID="${ARGUS_BUILD_UID:-2133}" +RUNTIME_GID="${ARGUS_BUILD_GID:-2015}" mkdir -p "$DNS_DIR" chown -R "$RUNTIME_UID:$RUNTIME_GID" "$DNS_DIR" 2>/dev/null || true diff --git a/src/web/build_tools/proxy/Dockerfile b/src/web/build_tools/proxy/Dockerfile index e43e36f..748b384 100644 --- a/src/web/build_tools/proxy/Dockerfile +++ b/src/web/build_tools/proxy/Dockerfile @@ -8,24 +8,34 @@ RUN apt-get update && \ apt-get clean && rm -rf /var/lib/apt/lists/* ENV FRONTEND_BASE_PATH=/private/argus/web/proxy -ARG ARGUS_UID=2133 -ARG ARGUS_GID=2015 -ENV ARGUS_UID=${ARGUS_UID} -ENV ARGUS_GID=${ARGUS_GID} +ARG ARGUS_BUILD_UID=2133 +ARG ARGUS_BUILD_GID=2015 +ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} +ENV ARGUS_BUILD_GID=${ARGUS_BUILD_GID} RUN mkdir -p ${FRONTEND_BASE_PATH} && \ mkdir -p /private/argus/etc # 创建 proxy 用户(可自定义 UID/GID) # 创建 proxy 用户组 -RUN groupadd -g ${ARGUS_GID} web_proxy - -# 创建 proxy 用户并指定组 -RUN useradd -M -s /usr/sbin/nologin -u ${ARGUS_UID} -g ${ARGUS_GID} web_proxy - -RUN chown -R web_proxy:web_proxy ${FRONTEND_BASE_PATH} && \ - chown -R web_proxy:web_proxy /private/argus/etc && \ - chown -R web_proxy:web_proxy /usr/local/bin +RUN set -eux; \ + if ! getent group "${ARGUS_BUILD_GID}" >/dev/null; then \ + groupadd -g "${ARGUS_BUILD_GID}" web_proxy || true; \ + fi; \ + if id web_proxy >/dev/null 2>&1; then \ + current_uid="$(id -u web_proxy)"; \ + if [ "$current_uid" != "${ARGUS_BUILD_UID}" ] && ! 
getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \ + usermod -u "${ARGUS_BUILD_UID}" web_proxy; \ + fi; \ + usermod -g "${ARGUS_BUILD_GID}" web_proxy || true; \ + else \ + if ! getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \ + useradd -M -s /usr/sbin/nologin -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" web_proxy; \ + else \ + echo "UID ${ARGUS_BUILD_UID} already exists; skip creating user 'web_proxy'"; \ + fi; \ + fi; \ + chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" ${FRONTEND_BASE_PATH} /private/argus/etc /usr/local/bin || true # 配置内网 apt 源 (如果指定了内网选项) RUN if [ "$USE_INTRANET" = "true" ]; then \ diff --git a/src/web/build_tools/proxy/build.sh b/src/web/build_tools/proxy/build.sh index 063e378..98c4f65 100644 --- a/src/web/build_tools/proxy/build.sh +++ b/src/web/build_tools/proxy/build.sh @@ -3,7 +3,7 @@ docker pull ubuntu:24.04 source src/web/tests/.env docker build \ - --build-arg ARGUS_UID=${ARGUS_UID} \ - --build-arg ARGUS_GID=${ARGUS_GID} \ + --build-arg ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \ + --build-arg ARGUS_BUILD_GID=${ARGUS_BUILD_GID} \ -f src/web/build_tools/proxy/Dockerfile -t argus-web-proxy:latest . 
docker save -o argus-web-proxy-latest.tar argus-web-proxy:latest diff --git a/src/web/build_tools/proxy/start-proxy-supervised.sh b/src/web/build_tools/proxy/start-proxy-supervised.sh index 51c2b7b..ac276dd 100644 --- a/src/web/build_tools/proxy/start-proxy-supervised.sh +++ b/src/web/build_tools/proxy/start-proxy-supervised.sh @@ -9,8 +9,8 @@ DNS_CONF_PRIVATE="/private/argus/etc/dns.conf" DNS_CONF_SYSTEM="/etc/resolv.conf" DNS_DIR="/private/argus/etc" DNS_SCRIPT="${DNS_DIR}/update-dns.sh" -RUNTIME_UID="${ARGUS_UID:-2133}" -RUNTIME_GID="${ARGUS_GID:-2015}" +RUNTIME_UID="${ARGUS_BUILD_UID:-2133}" +RUNTIME_GID="${ARGUS_BUILD_GID:-2015}" mkdir -p "$DNS_DIR" chown -R "$RUNTIME_UID:$RUNTIME_GID" "$DNS_DIR" 2>/dev/null || true diff --git a/src/web/tests/docker-compose.yml b/src/web/tests/docker-compose.yml index 985e22c..7be6106 100644 --- a/src/web/tests/docker-compose.yml +++ b/src/web/tests/docker-compose.yml @@ -4,15 +4,15 @@ services: context: ../../../ dockerfile: src/web/build_tools/frontend/Dockerfile args: - ARGUS_UID: ${ARGUS_UID:-2133} - ARGUS_GID: ${ARGUS_GID:-2015} + ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133} + ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015} USE_INTRANET: ${USE_INTRANET:-false} image: argus-web-frontend:latest container_name: argus-web-frontend environment: - ALERTMANAGER_BASE_PATH=/private/argus/web/frontend - - ARGUS_UID=${ARGUS_UID:-2133} - - ARGUS_GID=${ARGUS_GID:-2015} + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} ports: - "${ARGUS_WEB_PORT:-8080}:80" volumes: @@ -31,14 +31,14 @@ services: context: ../../../ dockerfile: src/web/build_tools/proxy/Dockerfile args: - ARGUS_UID: ${ARGUS_UID:-2133} - ARGUS_GID: ${ARGUS_GID:-2015} + ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133} + ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015} USE_INTRANET: ${USE_INTRANET:-false} image: argus-web-proxy:latest container_name: argus-web-proxy environment: - - ARGUS_UID=${ARGUS_UID:-2133} - - ARGUS_GID=${ARGUS_GID:-2015} + - 
ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} ports: - "8088:80" volumes: