Complete a6000 test system build, deployment, and test integration #35

Merged
yuyr merged 18 commits from dev_1.0.0_yuyr_5 into dev_1.0.0 2025-10-29 10:04:29 +08:00
39 changed files with 1167 additions and 158 deletions
Showing only changes of commit a1cdd05950

View File

@@ -254,6 +254,54 @@ if [[ "$build_metric" == true ]]; then
done
fi
# =======================================
# Web & Alert module images
# =======================================
echo ""
echo "Building Web and Alert module images..."
# Pre-pull commonly used base images for stability
web_alert_base_images=(
"node:20"
"ubuntu:24.04"
)
for base_image in "${web_alert_base_images[@]}"; do
if ! pull_base_image "$base_image"; then
build_failed=true
fi
done
web_builds=(
"Web Frontend|src/web/build_tools/frontend/Dockerfile|argus-web-frontend:latest|."
"Web Proxy|src/web/build_tools/proxy/Dockerfile|argus-web-proxy:latest|."
)
for build_spec in "${web_builds[@]}"; do
IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec"
if build_image "$image_label" "$dockerfile_path" "$image_tag" "$build_context"; then
images_built+=("$image_tag")
else
build_failed=true
fi
echo ""
done
alert_builds=(
"Alertmanager|src/alert/alertmanager/build/Dockerfile|argus-alertmanager:latest|."
)
for build_spec in "${alert_builds[@]}"; do
IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec"
if build_image "$image_label" "$dockerfile_path" "$image_tag" "$build_context"; then
images_built+=("$image_tag")
else
build_failed=true
fi
echo ""
done
echo "=======================================" echo "======================================="
echo "📦 Build Summary" echo "📦 Build Summary"
echo "=======================================" echo "======================================="

View File

@@ -71,6 +71,9 @@ declare -A images=(
["argus-metric-ftp:latest"]="argus-metric-ftp-latest.tar"
["argus-metric-prometheus:latest"]="argus-metric-prometheus-latest.tar"
["argus-metric-grafana:latest"]="argus-metric-grafana-latest.tar"
["argus-web-frontend:latest"]="argus-web-frontend-latest.tar"
["argus-web-proxy:latest"]="argus-web-proxy-latest.tar"
["argus-alertmanager:latest"]="argus-alertmanager-latest.tar"
)
# 函数:检查镜像是否存在

View File

@@ -20,10 +20,10 @@ RUN wget https://github.com/prometheus/alertmanager/releases/download/v${ALERTMA
ENV ALERTMANAGER_BASE_PATH=/private/argus/alert/alertmanager
-ARG ARGUS_UID=2133
-ARG ARGUS_GID=2015
-ENV ARGUS_UID=${ARGUS_UID}
-ENV ARGUS_GID=${ARGUS_GID}
+ARG ARGUS_BUILD_UID=2133
+ARG ARGUS_BUILD_GID=2015
+ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID}
+ENV ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
RUN mkdir -p /usr/share/alertmanager && \
    mkdir -p ${ALERTMANAGER_BASE_PATH} && \
@@ -33,16 +33,25 @@ RUN mkdir -p /usr/share/alertmanager && \
# 创建 alertmanager 用户(可自定义 UID/GID
# 创建 alertmanager 用户组
-RUN groupadd -g ${ARGUS_GID} alertmanager
+RUN set -eux; \
if ! getent group "${ARGUS_BUILD_GID}" >/dev/null; then \
groupadd -g "${ARGUS_BUILD_GID}" alertmanager || true; \
fi; \
if id alertmanager >/dev/null 2>&1; then \
current_uid="$(id -u alertmanager)"; \
if [ "$current_uid" != "${ARGUS_BUILD_UID}" ] && ! getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \
usermod -u "${ARGUS_BUILD_UID}" alertmanager; \
fi; \
usermod -g "${ARGUS_BUILD_GID}" alertmanager || true; \
else \
if ! getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \
useradd -M -s /usr/sbin/nologin -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" alertmanager; \
else \
echo "UID ${ARGUS_BUILD_UID} already exists; skip creating user 'alertmanager'"; \
fi; \
fi
-# 创建 alertmanager 用户并指定组
-RUN useradd -M -s /usr/sbin/nologin -u ${ARGUS_UID} -g ${ARGUS_GID} alertmanager
-RUN chown -R alertmanager:alertmanager /usr/share/alertmanager && \
-    chown -R alertmanager:alertmanager /alertmanager && \
-    chown -R alertmanager:alertmanager ${ALERTMANAGER_BASE_PATH} && \
-    chown -R alertmanager:alertmanager /private/argus/etc && \
-    chown -R alertmanager:alertmanager /usr/local/bin
+RUN chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" /usr/share/alertmanager /alertmanager ${ALERTMANAGER_BASE_PATH} /private/argus/etc /usr/local/bin || true
# 配置内网 apt 源 (如果指定了内网选项)
RUN if [ "$USE_INTRANET" = "true" ]; then \
@@ -86,4 +95,3 @@ EXPOSE 9093
# 使用 supervisor 作为入口点
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]

View File

@@ -5,8 +5,8 @@ docker pull ubuntu:24.04
source src/alert/tests/.env
docker build \
-  --build-arg ARGUS_UID=${ARGUS_UID} \
-  --build-arg ARGUS_GID=${ARGUS_GID} \
+  --build-arg ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
+  --build-arg ARGUS_BUILD_GID=${ARGUS_BUILD_GID} \
  -f src/alert/alertmanager/build/Dockerfile \
  -t argus-alertmanager:latest .

View File

@@ -1,5 +1,5 @@
DATA_ROOT=/home/argus/tmp/private/argus
-ARGUS_UID=1048
-ARGUS_GID=1048
+ARGUS_BUILD_UID=1048
+ARGUS_BUILD_GID=1048
USE_INTRANET=false

View File

@@ -4,15 +4,15 @@ services:
      context: ../../../
      dockerfile: src/alert/alertmanager/build/Dockerfile
      args:
-        ARGUS_UID: ${ARGUS_UID:-2133}
-        ARGUS_GID: ${ARGUS_GID:-2015}
+        ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133}
+        ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015}
        USE_INTRANET: ${USE_INTRANET:-false}
    image: argus-alertmanager:latest
    container_name: argus-alertmanager
    environment:
      - ALERTMANAGER_BASE_PATH=/private/argus/alert/alertmanager
-      - ARGUS_UID=${ARGUS_UID:-2133}
-      - ARGUS_GID=${ARGUS_GID:-2015}
+      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
+      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
    ports:
      - "${ARGUS_PORT:-9093}:9093"
    volumes:

View File

@@ -26,6 +26,7 @@ RUN apt-get update && \
    apt-get install -y \
    bind9 \
    bind9utils \
+    dnsutils \
    bind9-doc \
    supervisor \
    net-tools \

View File

@@ -104,7 +104,26 @@ log_info "文件所有者: $OWNER"
# 确保发布目录存在
log_info "确保发布目录存在: $PUBLISH_DIR"
-sudo mkdir -p "$PUBLISH_DIR"
+mkdir -p "$PUBLISH_DIR"
IFS=':' read -r OWNER_UID OWNER_GID <<< "$OWNER"
if [[ -z "$OWNER_UID" || -z "$OWNER_GID" ]]; then
log_error "--owner 格式不正确,应为 uid:gid"
exit 1
fi
CURRENT_UID=$(id -u)
CURRENT_GID=$(id -g)
if [[ "$OWNER_UID" != "$CURRENT_UID" || "$OWNER_GID" != "$CURRENT_GID" ]]; then
if [[ "$CURRENT_UID" -ne 0 ]]; then
log_error "当前用户 (${CURRENT_UID}:${CURRENT_GID}) 无法设置所有者为 ${OWNER_UID}:${OWNER_GID}"
log_error "请以目标用户运行脚本或预先调整目录权限"
exit 1
fi
NEED_CHOWN=true
else
NEED_CHOWN=false
fi
# 创建临时目录用于打包
TEMP_PACKAGE_DIR="/tmp/argus-metric-package-$$"
@@ -208,26 +227,31 @@ fi
TAR_NAME="argus-metric_$(echo $VERSION | tr '.' '_').tar.gz"
log_info "创建发布包: $TAR_NAME"
cd "$TEMP_PACKAGE_DIR"
-sudo tar -czf "$PUBLISH_DIR/$TAR_NAME" *
+tar -czf "$PUBLISH_DIR/$TAR_NAME" *
cd - > /dev/null
-# 设置文件所有者
-log_info "设置文件所有者为: $OWNER"
-sudo chown "$OWNER" "$PUBLISH_DIR/$TAR_NAME"
+if [[ "$NEED_CHOWN" == true ]]; then
+log_info "设置文件所有者为: $OWNER"
+chown "$OWNER" "$PUBLISH_DIR/$TAR_NAME"
+fi
# 清理临时目录
rm -rf "$TEMP_PACKAGE_DIR"
# 更新 LATEST_VERSION 文件
log_info "更新 LATEST_VERSION 文件..."
-echo "$VERSION" | sudo tee "$PUBLISH_DIR/LATEST_VERSION" > /dev/null
-sudo chown "$OWNER" "$PUBLISH_DIR/LATEST_VERSION"
+echo "$VERSION" > "$PUBLISH_DIR/LATEST_VERSION"
+if [[ "$NEED_CHOWN" == true ]]; then
+chown "$OWNER" "$PUBLISH_DIR/LATEST_VERSION"
+fi
# 复制 DNS 配置文件到发布目录根目录(直接从 config 目录复制)
if [[ -f "config/dns.conf" ]]; then
log_info "复制 DNS 配置文件到发布目录根目录..."
-sudo cp "config/dns.conf" "$PUBLISH_DIR/"
-sudo chown "$OWNER" "$PUBLISH_DIR/dns.conf"
+cp "config/dns.conf" "$PUBLISH_DIR/"
+if [[ "$NEED_CHOWN" == true ]]; then
+chown "$OWNER" "$PUBLISH_DIR/dns.conf"
+fi
log_success "DNS 配置文件复制完成: $PUBLISH_DIR/dns.conf"
else
log_warning "未找到 config/dns.conf 文件,跳过 DNS 配置文件复制"
@@ -236,8 +260,10 @@ fi
# 复制 setup.sh 到发布目录
if [[ -f "scripts/setup.sh" ]]; then
log_info "复制 setup.sh 到发布目录..."
-sudo cp "scripts/setup.sh" "$PUBLISH_DIR/"
-sudo chown "$OWNER" "$PUBLISH_DIR/setup.sh"
+cp "scripts/setup.sh" "$PUBLISH_DIR/"
+if [[ "$NEED_CHOWN" == true ]]; then
+chown "$OWNER" "$PUBLISH_DIR/setup.sh"
+fi
fi
# 显示发布结果
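With sudo removed, the script now only chowns when it actually runs as root on behalf of a different owner. A minimal sketch of the two supported invocations (the version number and output path below are illustrative, not taken from this PR):

# run as the target user itself: NEED_CHOWN stays false and no chown is attempted
bash scripts/publish_artifact.sh 1.2.0 --output-dir /srv/ftp/share --owner "$(id -u):$(id -g)"

# run as root to publish for another uid:gid: NEED_CHOWN becomes true and chown runs
sudo bash scripts/publish_artifact.sh 1.2.0 --output-dir /srv/ftp/share --owner 2133:2015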

View File

@@ -65,6 +65,8 @@ COPY grafana.ini /tmp/grafana.ini
COPY datasources/datasources.yml /tmp/datasources.yml
COPY dashboards/dashboards.yml /tmp/dashboards.yml
COPY dashboards/default_dashboard_by_hostname.json /tmp/default_dashboard.json
+COPY dashboards/default_cluster_dashboard.json /tmp/default_cluster_dashboard.json
+COPY dashboards/default_dashboard_by_instance.json /tmp/default_dashboard_by_instance.json
# supervisor 配置
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf

View File

@@ -8,7 +8,7 @@ datasources:
    type: prometheus
    access: proxy
    uid: eezk1zvkie4g0a
-    url: http://10.211.55.5:9090
+    url: http://prom.metric.argus.com:9090
    isDefault: true
    editable: true
    jsonData:

View File

@@ -44,12 +44,18 @@ else
fi
# 复制数据源配置文件到挂载目录
-if [ -f "/tmp/datasources.yml" ]; then
-echo "[INFO] Copying datasource configuration to /private/argus/metric/grafana/provisioning/datasources/"
-cp /tmp/datasources.yml /private/argus/metric/grafana/provisioning/datasources/datasources.yml
-echo "[INFO] Datasource configuration copied successfully"
-elif [ -d "/private/argus/metric/grafana/provisioning/datasources" ] && [ "$(ls -A /private/argus/metric/grafana/provisioning/datasources)" ]; then
-echo "[INFO] Found existing datasource provisioning files in /private/argus/metric/grafana/provisioning/datasources"
+DS_OUT="/private/argus/metric/grafana/provisioning/datasources/datasources.yml"
+PROM_DOMAIN="prom.metric.argus.com:9090"
+if [ -f "/tmp/datasources.yml" ] && [ ! -f "$DS_OUT" ]; then
+echo "[INFO] Initializing datasource provisioning file from /tmp"
+cp /tmp/datasources.yml "$DS_OUT"
+fi
+# 统一将数据源 URL 规范为 prom.metric.argus.com:9090
+if [ -f "$DS_OUT" ]; then
+sed -i -E "s#^\s*url:\s*http://[^[:space:]]+# url: http://$PROM_DOMAIN#g" "$DS_OUT" || true
+echo "[INFO] Datasource URL normalized to http://$PROM_DOMAIN"
elif [ -d "/etc/grafana/provisioning/datasources" ] && [ "$(ls -A /etc/grafana/provisioning/datasources)" ]; then elif [ -d "/etc/grafana/provisioning/datasources" ] && [ "$(ls -A /etc/grafana/provisioning/datasources)" ]; then
echo "[INFO] Found datasource provisioning files in /etc/grafana/provisioning/datasources" echo "[INFO] Found datasource provisioning files in /etc/grafana/provisioning/datasources"
# 确保数据源配置目录权限正确 # 确保数据源配置目录权限正确
@ -65,11 +71,33 @@ if [ -f "/tmp/dashboards.yml" ]; then
echo "[INFO] Dashboard configuration copied successfully" echo "[INFO] Dashboard configuration copied successfully"
fi fi
# 复制默认仪表板到挂载目录 # 复制默认仪表板到挂载目录(按需,不覆盖已存在文件)
if [ -f "/tmp/default_dashboard.json" ]; then copy_dashboard_if_missing() {
echo "[INFO] Copying default dashboard to /private/argus/metric/grafana/provisioning/dashboards/" local src="$1"; local dst_name="$2"
cp /tmp/default_dashboard.json /private/argus/metric/grafana/provisioning/dashboards/default_dashboard.json local dst_dir="/private/argus/metric/grafana/provisioning/dashboards"
echo "[INFO] Default dashboard copied successfully" local dst="$dst_dir/$dst_name"
if [ -f "$src" ]; then
if [ ! -f "$dst" ]; then
echo "[INFO] Installing dashboard: $dst_name"
cp "$src" "$dst"
else
echo "[INFO] Dashboard exists, skip: $dst_name"
fi
fi
}
copy_dashboard_if_missing "/tmp/default_dashboard.json" "default_dashboard.json"
copy_dashboard_if_missing "/tmp/default_cluster_dashboard.json" "default_cluster_dashboard.json"
copy_dashboard_if_missing "/tmp/default_dashboard_by_instance.json" "default_dashboard_by_instance.json"
# 规范面板中的数据源字段:将字符串 "prometheus" 替换为 null使用默认数据源
DB_DIR="/private/argus/metric/grafana/provisioning/dashboards"
if [ -d "$DB_DIR" ]; then
for f in "$DB_DIR"/*.json; do
[ -f "$f" ] || continue
sed -i -E 's/"datasource"\s*:\s*"prometheus"/"datasource": null/g' "$f" || true
done
echo "[INFO] Normalized dashboard datasource to default (null)"
fi
# 启动 Grafana

View File

@@ -1,9 +1,5 @@
-version: "3.8"
networks:
-  default:
-    external: true
-    name: argus-sys-net
+  sysnet:
    driver: bridge
    ipam:
      driver: default
@@ -15,7 +11,7 @@ services:
    image: ${BIND_IMAGE_TAG:-argus-bind9:latest}
    container_name: argus-bind-sys
    networks:
-      default:
+      sysnet:
        ipv4_address: 172.29.0.2
    volumes:
      - ./private:/private
@@ -39,7 +35,7 @@ services:
      - ./private/argus/metric/prometheus:/private/argus/metric/prometheus
      - ./private/argus/etc:/private/argus/etc
    networks:
-      default:
+      sysnet:
        ipv4_address: 172.29.0.10
    restart: unless-stopped
@@ -58,6 +54,9 @@ services:
    ports:
      - "9200:9200"
    restart: unless-stopped
+    networks:
+      sysnet:
+        ipv4_address: 172.29.0.3
  kibana:
    image: argus-kibana:latest
@@ -74,6 +73,9 @@
    ports:
      - "5601:5601"
    restart: unless-stopped
+    networks:
+      sysnet:
+        ipv4_address: 172.29.0.4
  node-a:
    image: ubuntu:22.04
@@ -106,6 +108,8 @@
    ports:
      - "2020:2020"
    restart: unless-stopped
+    networks:
+      - sysnet
  node-b:
    image: ubuntu:22.04
@@ -138,6 +142,8 @@
    ports:
      - "2021:2020"
    restart: unless-stopped
+    networks:
+      - sysnet
  ftp:
    image: argus-metric-ftp:latest
@@ -160,7 +166,7 @@
      - /etc/localtime:/etc/localtime:ro
      - /etc/timezone:/etc/timezone:ro
    networks:
-      default:
+      sysnet:
        ipv4_address: 172.29.0.40
    logging:
      driver: "json-file"
@@ -185,7 +191,7 @@
      - /etc/localtime:/etc/localtime:ro
      - /etc/timezone:/etc/timezone:ro
    networks:
-      default:
+      sysnet:
        ipv4_address: 172.29.0.41
    logging:
      driver: "json-file"
@@ -205,6 +211,9 @@
      - GF_SERVER_HTTP_PORT=3000
      - GF_LOG_LEVEL=warn
      - GF_LOG_MODE=console
+      - GF_PATHS_PROVISIONING=/private/argus/metric/grafana/provisioning
+      - GF_AUTH_ANONYMOUS_ENABLED=true
+      - GF_AUTH_ANONYMOUS_ORG_ROLE=Viewer
    ports:
      - "${GRAFANA_PORT:-3000}:3000"
    volumes:
@@ -213,7 +222,7 @@
      - /etc/localtime:/etc/localtime:ro
      - /etc/timezone:/etc/timezone:ro
    networks:
-      default:
+      sysnet:
        ipv4_address: 172.29.0.42
    depends_on:
      - prometheus
@@ -224,7 +233,7 @@
      max-file: "3"
  test-node:
-    image: argus-metric-test-node:latest
+    image: ubuntu:22.04
    container_name: argus-metric-test-node
    hostname: test-metric-node-001
    restart: unless-stopped
@@ -240,13 +249,21 @@
      - FTP_USER=${FTP_USER:-ftpuser}
      - FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!}
      - FTP_PORT=${FTP_PORT:-21}
+      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
+      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
+      - METRIC_NODE_ROLE=cpu
    volumes:
      - ./private/argus/agent:/private/argus/agent
+      - ./scripts/metric/test-node-entrypoint.sh:/usr/local/bin/metric-test-node-entrypoint.sh:ro
      - /etc/localtime:/etc/localtime:ro
      - /etc/timezone:/etc/timezone:ro
-    command: sleep infinity
+    entrypoint:
+      - /usr/local/bin/metric-test-node-entrypoint.sh
+    command:
+      - sleep
+      - infinity
    networks:
-      default:
+      sysnet:
        ipv4_address: 172.29.0.50
    logging:
      driver: "json-file"
@@ -255,7 +272,8 @@
      max-file: "3"
  test-gpu-node:
-    image: argus-metric-test-gpu-node:latest
+    profiles: ["gpu"]
+    image: nvidia/cuda:12.2.2-runtime-ubuntu22.04
    container_name: argus-metric-test-gpu-node
    hostname: test-metric-gpu-node-001
    restart: unless-stopped
@@ -278,13 +296,21 @@
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
      - GPU_MODE=gpu
+      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
+      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
+      - METRIC_NODE_ROLE=gpu
    volumes:
      - ./private/argus/agent:/private/argus/agent
+      - ./scripts/metric/test-node-entrypoint.sh:/usr/local/bin/metric-test-node-entrypoint.sh:ro
      - /etc/localtime:/etc/localtime:ro
      - /etc/timezone:/etc/timezone:ro
-    command: sleep infinity
+    entrypoint:
+      - /usr/local/bin/metric-test-node-entrypoint.sh
+    command:
+      - sleep
+      - infinity
    networks:
-      default:
+      sysnet:
        ipv4_address: 172.29.0.51
    logging:
      driver: "json-file"

View File

@@ -3,6 +3,38 @@ set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ENABLE_GPU=false
usage() {
cat <<'EOF'
Usage: 00_e2e_test.sh [options]
Options:
--enable-gpu 启用 GPU 相关拓扑与测试流程
-h, --help 显示帮助信息
EOF
}
while [[ $# -gt 0 ]]; do
case "$1" in
--enable-gpu)
ENABLE_GPU=true
shift
;;
-h|--help)
usage
exit 0
;;
*)
echo "Unknown argument: $1" >&2
usage
exit 1
;;
esac
done
export ARGUS_SYS_ENABLE_GPU=$ENABLE_GPU
SCRIPTS=(
"01_bootstrap.sh"
"02_up.sh"
@@ -12,6 +44,11 @@ SCRIPTS=(
"06_write_health_and_assert.sh"
"07_logs_send_and_assert.sh"
"08_restart_agent_reregister.sh"
"10_metric_publish.sh"
"11_metric_node_install.sh"
"12_metric_gpu_install.sh"
"13_metric_verify.sh"
"14_metric_cleanup.sh"
"09_down.sh" "09_down.sh"
) )
@ -23,4 +60,3 @@ for script in "${SCRIPTS[@]}"; do
done done
echo "[SYS-E2E] All tests completed" echo "[SYS-E2E] All tests completed"

View File

@@ -22,6 +22,24 @@ ensure_image() {
}
echo "[INFO] Preparing directories..."
ensure_writable_dir() {
local path="$1"
local parent
parent="$(dirname "$path")"
mkdir -p "$parent" 2>/dev/null || true
mkdir -p "$path" 2>/dev/null || true
if [[ ! -w "$path" ]]; then
docker run --rm -v "$parent:/target" ubuntu:24.04 bash -lc "chown -R $(id -u):$(id -g) /target" >/dev/null 2>&1 || true
fi
mkdir -p "$path"
}
# preflight: make base dirs writable if inherited from root-owned mounts
ensure_writable_dir "$PRIVATE_CORE/argus"
ensure_writable_dir "$PRIVATE_CORE/argus/metric"
ensure_writable_dir "$PRIVATE_CORE/argus/metric/grafana"
ensure_writable_dir "$PRIVATE_CORE/argus/metric/prometheus"
mkdir -p \
"$PRIVATE_CORE/argus/etc" \
"$PRIVATE_CORE/argus/bind" \
@@ -57,6 +75,8 @@ chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" \
"$PRIVATE_CORE/argus/agent" \
"$PRIVATE_CORE/argus/etc" 2>/dev/null || true
+echo "[INFO] Using compose-managed network (auto-created by docker compose)"
echo "[INFO] Distributing update-dns.sh for core services (bind/master/es/kibana)"
BIND_UPDATE_SRC="$REPO_ROOT/src/bind/build/update-dns.sh"
BIND_UPDATE_DEST="$PRIVATE_CORE/argus/etc/update-dns.sh"
@@ -75,8 +95,6 @@ ensure_image "argus-master:latest"
ensure_image "argus-metric-ftp:latest"
ensure_image "argus-metric-prometheus:latest"
ensure_image "argus-metric-grafana:latest"
-ensure_image "argus-metric-test-node:latest"
-ensure_image "argus-metric-test-gpu-node:latest"
echo "[INFO] Building agent binary..."
pushd "$REPO_ROOT/src/agent" >/dev/null
@@ -91,19 +109,25 @@ fi
echo "$AGENT_BIN" > "$TMP_DIR/agent_binary_path"
# 检测GPU环境
-echo "[INFO] 检测GPU环境..."
+REQUEST_GPU=${ARGUS_SYS_ENABLE_GPU:-false}
GPU_CHECK_SCRIPT="$REPO_ROOT/src/metric/tests/scripts/common/check-gpu.sh"
-if [ -f "$GPU_CHECK_SCRIPT" ]; then
+if [[ "$REQUEST_GPU" == "true" ]]; then
echo "[INFO] --enable-gpu 已启用开始检测GPU环境..."
if [[ -f "$GPU_CHECK_SCRIPT" ]]; then
if bash "$GPU_CHECK_SCRIPT" >/dev/null 2>&1; then if bash "$GPU_CHECK_SCRIPT" >/dev/null 2>&1; then
echo "[INFO] GPU环境可用将启动test-gpu-node容器" echo "[INFO] GPU环境可用将在 compose 中启用 test-gpu-node"
GPU_AVAILABLE=true GPU_AVAILABLE=true
else else
echo "[INFO] GPU环境不可用跳过test-gpu-node容器" echo "[ERROR] 未检测到可用 GPU但指定了 --enable-gpu" >&2
GPU_AVAILABLE=false exit 1
fi
else
echo "[ERROR] 未找到 GPU 检测脚本: $GPU_CHECK_SCRIPT" >&2
exit 1
fi
else
-echo "[WARN] 未找到GPU检测脚本: $GPU_CHECK_SCRIPT跳过GPU检测"
GPU_AVAILABLE=false
+echo "[INFO] GPU 支持未启用,跳过 GPU 检测"
fi
echo "[INFO] Writing .env with UID/GID and metric configuration"
@@ -112,7 +136,7 @@ ARGUS_BUILD_UID=$ARGUS_BUILD_UID
ARGUS_BUILD_GID=$ARGUS_BUILD_GID
# GPU 配置
-GPU_AVAILABLE=$GPU_AVAILABLE
+ENABLE_GPU=$GPU_AVAILABLE
# FTP 配置
FTP_PORT=21

View File

@@ -15,39 +15,51 @@ compose() {
echo "[INFO] Bringing up system stack..."
-# 检测GPU环境
-echo "[INFO] 检测GPU环境..."
+REQUEST_GPU=${ARGUS_SYS_ENABLE_GPU:-false}
+GPU_AVAILABLE=false
GPU_CHECK_SCRIPT="$REPO_ROOT/src/metric/tests/scripts/common/check-gpu.sh"
-if [ -f "$GPU_CHECK_SCRIPT" ]; then
+if [[ "$REQUEST_GPU" == "true" ]]; then
+echo "[INFO] --enable-gpu 生效,验证主机 GPU..."
+if [[ -f "$GPU_CHECK_SCRIPT" ]]; then
if bash "$GPU_CHECK_SCRIPT" >/dev/null 2>&1; then
-echo "[INFO] GPU环境可用将启动GPU测试节点"
GPU_AVAILABLE=true
+echo "[INFO] GPU 检测通过,将启动 gpu profile"
else
-echo "[INFO] GPU环境不可用将跳过GPU测试节点"
-GPU_AVAILABLE=false
+echo "[ERROR] 主机缺少可用 GPU无法继续 --enable-gpu 流程" >&2
+exit 1
+fi
+else
+echo "[ERROR] 未找到 GPU 检测脚本: $GPU_CHECK_SCRIPT" >&2
+exit 1
fi
else
-echo "[WARN] 未找到GPU检测脚本: $GPU_CHECK_SCRIPT跳过GPU检测"
-GPU_AVAILABLE=false
+echo "[INFO] 未启用 GPU 流程"
fi
pushd "$TEST_ROOT" >/dev/null
compose -p argus-sys down --remove-orphans || true
+# 清理可能由 08 脚本创建的同名容器,避免 compose up 冲突
+for name in argus-node-b; do
+if docker ps -aqf "name=^${name}$" >/dev/null 2>&1 && [[ -n "$(docker ps -aqf "name=^${name}$")" ]]; then
+docker rm -f "$name" >/dev/null 2>&1 || true
+fi
+done
# 根据GPU可用性决定启动的服务
-if [ "$GPU_AVAILABLE" = true ]; then
-echo "[INFO] 启动所有服务包括test-gpu-node..."
-compose -p argus-sys up -d
+if [[ "$GPU_AVAILABLE" == true ]]; then
+echo "[INFO] 启动所有服务(包含 gpu profile..."
+compose -p argus-sys --profile gpu up -d
else
-echo "[INFO] 启动基础服务跳过test-gpu-node..."
-compose -p argus-sys up -d --scale test-gpu-node=0
+echo "[INFO] 启动基础服务(不含 gpu profile..."
+compose -p argus-sys up -d
fi
popd >/dev/null
-if [ "$GPU_AVAILABLE" = true ]; then
+if [[ "$GPU_AVAILABLE" == true ]]; then
echo "[OK] Services started: master:32300 es:9200 kibana:5601 node-a:2020 node-b:2021 test-gpu-node:172.29.0.51"
else
-echo "[OK] Services started: master:32300 es:9200 kibana:5601 node-a:2020 node-b:2021 (test-gpu-node skipped)"
+echo "[OK] Services started: master:32300 es:9200 kibana:5601 node-a:2020 node-b:2021 (gpu skipped)"
fi

View File

@@ -4,20 +4,15 @@ set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
-compose() {
-if docker compose version >/dev/null 2>&1; then
-docker compose "$@"
-else
-docker-compose "$@"
-fi
-}
-service_id() {
-compose -p argus-sys ps -q "$1"
+# 直接根据 container_name 获取容器ID避免 compose project 名称不一致导致查找失败
+cid_by_name() {
+docker ps -aqf "name=^$1$"
}
echo "[INFO] Verifying DNS routing via bind..."
+pushd "$TEST_ROOT" >/dev/null
# Check master IP file exists in shared private
MASTER_FILE="$TEST_ROOT/private/argus/etc/master.argus.com"
if [[ ! -f "$MASTER_FILE" ]]; then
@@ -28,7 +23,7 @@ MASTER_IP_HOST="$(cat "$MASTER_FILE" | tr -d '\r\n' || true)"
echo "[INFO] master.argus.com file content: ${MASTER_IP_HOST}"
# dig inside bind container
-BIN_ID="$(service_id bind)"
+BIN_ID="$(cid_by_name argus-bind-sys)"
if [[ -n "$BIN_ID" ]]; then
DIG_IP="$(docker exec "$BIN_ID" dig +short master.argus.com A | tail -n1 || true)"
echo "[INFO] dig(master.argus.com) from bind container -> $DIG_IP"
@@ -39,8 +34,8 @@ else
echo "[WARN] bind container not found; skip dig"
fi
-for node in node-a node-b; do
-CID="$(service_id "$node")"
+for node in argus-node-a argus-node-b; do
+CID="$(cid_by_name "$node")"
echo "[INFO] Checking resolution inside $node..."
if ! docker exec "$CID" getent hosts master.argus.com >/dev/null 2>&1; then
echo "[ERR] $node cannot resolve master.argus.com" >&2
@@ -50,5 +45,6 @@ for node in node-a node-b; do
echo "[OK] $node resolved master.argus.com -> $RES"
done
-echo "[OK] DNS routing verified"
+popd >/dev/null
+echo "[OK] DNS routing verified"

View File

@@ -49,8 +49,35 @@ for _ in {1..60}; do
fi
done
+# 若仍未全部注册,尝试重启 node-b 并再等待一轮(兼容 DNS/启动时序抖动)
if [[ ! -s "$TMP_DIR/node_id_a" || ! -s "$TMP_DIR/node_id_b" ]]; then
-echo "[ERR] Agents did not register in time" >&2
+echo "[WARN] node-a or node-b not registered in first window; restarting node-b and retrying..." >&2
# 仅重启 node-b避免影响 es/kibana/master
if docker ps --format '{{.Names}}' | grep -q '^argus-node-b$'; then
docker restart argus-node-b >/dev/null 2>&1 || true
fi
# 再等待一轮(最多 120 秒)
> "$TMP_DIR/node_id_b"
for _ in {1..60}; do
sleep 2
resp=$(curl -fsS "$API_BASE/nodes" 2>/dev/null || true)
[[ -z "$resp" ]] && continue
if ! echo "$resp" | head -c1 | grep -q '\['; then
continue
fi
echo "$resp" > "$TMP_DIR/nodes_list.json"
ID_A=$(extract_node "$HOST_A" "$TMP_DIR/node_id_a" "$TMP_DIR/nodes_list.json" 2>/dev/null || true)
ID_B=$(extract_node "$HOST_B" "$TMP_DIR/node_id_b" "$TMP_DIR/nodes_list.json" 2>/dev/null || true)
if [[ -s "$TMP_DIR/node_id_a" && -s "$TMP_DIR/node_id_b" ]]; then
break
fi
done
fi
if [[ ! -s "$TMP_DIR/node_id_a" || ! -s "$TMP_DIR/node_id_b" ]]; then
echo "[ERR] Agents did not register in time (after retry)" >&2
echo "[HINT] Current /nodes response:" >&2
sed -n '1,200p' "$TMP_DIR/nodes_list.json" >&2 || true
exit 1
fi

View File

@@ -3,9 +3,19 @@ set -euo pipefail
echo "[INFO] Sending logs via node-a/node-b and asserting ES counts..."
+# Robust count helper: tolerates 404/503 and non-JSON responses, returns integer >=0
get_count() {
-local idx="$1"
-curl -s "http://localhost:9200/${idx}/_count?ignore_unavailable=true&allow_no_indices=true" | sed -E 's/.*"count":([0-9]+).*/\1/' | awk 'NF{print $0;exit} END{if(NR==0)print 0}'
+local idx="$1"; local tmp; tmp=$(mktemp)
+local code
code=$(curl -s -o "$tmp" -w "%{http_code}" "http://localhost:9200/${idx}/_count?ignore_unavailable=true&allow_no_indices=true" || true)
if [[ "$code" == "200" ]]; then
local val
val=$(jq -r '(.count // 0) | tonumber? // 0' "$tmp" 2>/dev/null || echo 0)
echo "$val"
else
echo 0
fi
rm -f "$tmp"
}
train0=$(get_count "train-*")
@@ -32,11 +42,26 @@ send_logs "$node_a" "host01"
send_logs "$node_b" "host02"
echo "[INFO] Waiting for ES to ingest..."
-sleep 10
+# Proactively refresh indices (ignore errors if not created yet)
+curl -s -X POST "http://localhost:9200/train-*/_refresh" >/dev/null 2>&1 || true
+curl -s -X POST "http://localhost:9200/infer-*/_refresh" >/dev/null 2>&1 || true
-train1=$(get_count "train-*")
-infer1=$(get_count "infer-*")
-final=$((train1 + infer1))
+# Retry up to 120s for counts to increase and reach threshold (>=4)
+final=0
+threshold=4
for attempt in {1..60}; do
train1=$(get_count "train-*")
infer1=$(get_count "infer-*")
final=$((train1 + infer1))
if (( final > base && final >= threshold )); then
break
fi
echo "[..] waiting ES counts increase to >=${threshold} ($attempt/60) current=${final} base=${base}"
# refresh indices again to speed up visibility
curl -s -X POST "http://localhost:9200/train-*/_refresh" >/dev/null 2>&1 || true
curl -s -X POST "http://localhost:9200/infer-*/_refresh" >/dev/null 2>&1 || true
sleep 2
done
echo "[INFO] final counts: train=${train1} infer=${infer1} total=${final}" echo "[INFO] final counts: train=${train1} infer=${infer1} total=${final}"
if (( final <= base )); then if (( final <= base )); then
@ -44,6 +69,7 @@ if (( final <= base )); then
exit 1 exit 1
fi fi
# Minimal threshold to be tolerant: expect at least 4 documents (2 train + 1 infer per node)
if (( final < 4 )); then if (( final < 4 )); then
echo "[ERR] ES total below expected threshold: ${final} < 4" >&2 echo "[ERR] ES total below expected threshold: ${final} < 4" >&2
exit 1 exit 1
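For a manual spot check of what get_count() wraps, the same ES endpoint can be queried directly (a sketch; jq is used the same way as in the new helper):

curl -s "http://localhost:9200/train-*/_count?ignore_unavailable=true&allow_no_indices=true" | jq -r '.count // 0'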

View File

@@ -58,10 +58,25 @@ docker rm -f argus-node-b >/dev/null 2>&1 || true
AGENT_BIN_PATH="$(cat "$TMP_DIR/agent_binary_path")"
# 选择 compose 管理的网络名(默认 argus-sys_sysnet
detect_sysnet() {
if docker network inspect argus-sys_sysnet >/dev/null 2>&1; then
echo argus-sys_sysnet; return
fi
# 回退:从 master 容器推断所连网络(取第一个)
local n
n=$(docker inspect -f '{{range $k, $_ := .NetworkSettings.Networks}}{{println $k}}{{end}}' argus-master-sys 2>/dev/null | head -n1 || true)
if [[ -n "$n" ]]; then echo "$n"; return; fi
# 最后兜底:尝试项目默认网络(不保证有 IPAM
echo argus-sys_default
}
SYSNET_NAME=$(detect_sysnet)
echo "[INFO] Using docker network: $SYSNET_NAME"
docker run -d \
  --name argus-node-b \
  --hostname dev-yyrshare-uuuu10-ep2f-pod-0 \
-  --network argus-sys-net \
+  --network "$SYSNET_NAME" \
  --ip 172.29.0.200 \
  --dns 172.29.0.2 \
  -e MASTER_ENDPOINT=http://master.argus.com:3000 \

View File

@@ -12,12 +12,33 @@ compose() {
fi
}
+docker rm -f argus-node-b >/dev/null 2>&1 || true
pushd "$TEST_ROOT" >/dev/null
compose -p argus-sys down --remove-orphans || true
+compose down --remove-orphans || true
popd >/dev/null
echo "[INFO] Force removing containers by name (if any)..."
containers=(
argus-node-a
argus-node-b
argus-metric-test-node
argus-grafana
argus-kibana-sys
argus-master-sys
argus-bind-sys
argus-ftp
argus-es-sys
argus-prometheus
)
for c in "${containers[@]}"; do
id=$(docker ps -aqf "name=^${c}$" || true)
if [[ -n "$id" ]]; then
docker rm -f "$id" >/dev/null 2>&1 || true
fi
done
echo "[INFO] Removing compose networks (handled by compose down)"
echo "[INFO] Cleaning private directories..." echo "[INFO] Cleaning private directories..."
if [[ -d "$TEST_ROOT/private" ]]; then if [[ -d "$TEST_ROOT/private" ]]; then
docker run --rm -v "$TEST_ROOT/private:/target" ubuntu:24.04 chown -R "$(id -u):$(id -g)" /target >/dev/null 2>&1 || true docker run --rm -v "$TEST_ROOT/private:/target" ubuntu:24.04 chown -R "$(id -u):$(id -g)" /target >/dev/null 2>&1 || true

View File

@@ -0,0 +1,66 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
REPO_ROOT="$(cd "$TEST_ROOT/../../.." && pwd)"
PLUGIN_DIR="$REPO_ROOT/src/metric/client-plugins/all-in-one-full"
FTP_CONTAINER="argus-ftp"
if [[ ! -d "$PLUGIN_DIR" ]]; then
echo "[SYS-METRIC] Metric client plugin directory not found: $PLUGIN_DIR" >&2
exit 1
fi
if [[ -f "$TEST_ROOT/.env" ]]; then
# shellcheck source=/dev/null
source "$TEST_ROOT/.env"
fi
OWNER="${ARGUS_BUILD_UID:-2133}:${ARGUS_BUILD_GID:-2015}"
resolve_output_dir() {
local host_mount
if docker ps --format '{{.Names}}' | grep -q "^${FTP_CONTAINER}$"; then
host_mount=$(docker inspect "$FTP_CONTAINER" --format '{{range .Mounts}}{{if eq .Destination "/private/argus/ftp"}}{{.Source}}{{end}}{{end}}' 2>/dev/null || true)
if [[ -n "$host_mount" ]]; then
echo "$host_mount/share"
return 0
fi
fi
echo "$TEST_ROOT/private/argus/metric/ftp/share"
}
OUTPUT_DIR="$(resolve_output_dir)"
mkdir -p "$OUTPUT_DIR"
if [[ ! -w "$OUTPUT_DIR" ]]; then
echo "[SYS-METRIC] 无法写入 FTP 输出目录: $OUTPUT_DIR" >&2
echo " 请确认目录权限与 ARGUS_BUILD_UID/GID 一致" >&2
exit 1
fi
pushd "$PLUGIN_DIR" >/dev/null
echo "[SYS-METRIC] Bumping metric artifact version..."
bash scripts/version-manager.sh bump minor
VERSION_FILE="config/VERSION"
if [[ ! -f "$VERSION_FILE" ]]; then
echo "[SYS-METRIC] VERSION 文件缺失: $VERSION_FILE" >&2
exit 1
fi
VERSION=$(tr -d '\n' < "$VERSION_FILE")
echo "[SYS-METRIC] 当前版本: $VERSION"
echo "[SYS-METRIC] Packaging metric artifact..."
bash scripts/package_artifact.sh --force
echo "[SYS-METRIC] Publishing artifact to FTP share..."
bash scripts/publish_artifact.sh "$VERSION" --output-dir "$OUTPUT_DIR" --owner "$OWNER"
popd >/dev/null
echo "[SYS-METRIC] Metric artifact published to $OUTPUT_DIR"

View File

@@ -0,0 +1,50 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
if [[ -f "$TEST_ROOT/.env" ]]; then
# shellcheck source=/dev/null
source "$TEST_ROOT/.env"
fi
CONTAINER="argus-metric-test-node"
if ! docker ps --format '{{.Names}}' | grep -q "^${CONTAINER}$"; then
echo "[SYS-METRIC] 容器 ${CONTAINER} 未运行,无法执行安装" >&2
exit 1
fi
FTP_HOST="${FTP_SERVER:-172.29.0.40}"
FTP_USER="${FTP_USER:-ftpuser}"
FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}"
FTP_PORT="${FTP_PORT:-21}"
echo "[SYS-METRIC] 在 ${CONTAINER} 内执行安装 (FTP: ${FTP_HOST}:${FTP_PORT})"
docker exec \
-e FTP_HOST="$FTP_HOST" \
-e FTP_USER="$FTP_USER" \
-e FTP_PASSWORD="$FTP_PASSWORD" \
-e FTP_PORT="$FTP_PORT" \
"$CONTAINER" bash -c '
set -e
if ! command -v curl &>/dev/null; then
echo "[SYS-METRIC] curl 未安装,开始安装依赖..."
apt-get update >/dev/null && apt-get install -y curl >/dev/null
fi
cd /tmp
echo "[SYS-METRIC] 下载 setup.sh..."
curl -u "${FTP_USER}:${FTP_PASSWORD}" "ftp://${FTP_HOST}:${FTP_PORT}/setup.sh" -o setup.sh
echo "[SYS-METRIC] 执行安装..."
chmod +x setup.sh
bash setup.sh --server "${FTP_HOST}" --user "${FTP_USER}" --password "${FTP_PASSWORD}" --port "${FTP_PORT}"
echo "[SYS-METRIC] 安装完成"
'
echo "[SYS-METRIC] Metric test node 安装流程完成"

View File

@@ -0,0 +1,64 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
ENABLE_GPU=${ARGUS_SYS_ENABLE_GPU:-false}
if [[ "$ENABLE_GPU" != "true" ]]; then
echo "[SYS-METRIC] 未启用 GPU 流程,跳过 GPU 节点安装"
exit 0
fi
if [[ -f "$TEST_ROOT/.env" ]]; then
# shellcheck source=/dev/null
source "$TEST_ROOT/.env"
fi
CONTAINER="argus-metric-test-gpu-node"
if ! docker ps --format '{{.Names}}' | grep -q "^${CONTAINER}$"; then
echo "[SYS-METRIC] 预期启动的 ${CONTAINER} 未运行" >&2
exit 1
fi
FTP_HOST="${FTP_SERVER:-172.29.0.40}"
FTP_USER="${FTP_USER:-ftpuser}"
FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}"
FTP_PORT="${FTP_PORT:-21}"
echo "[SYS-METRIC] 在 GPU 节点执行安装 (FTP: ${FTP_HOST}:${FTP_PORT})"
docker exec \
-e FTP_HOST="$FTP_HOST" \
-e FTP_USER="$FTP_USER" \
-e FTP_PASSWORD="$FTP_PASSWORD" \
-e FTP_PORT="$FTP_PORT" \
"$CONTAINER" bash -c '
set -e
if ! command -v nvidia-smi &>/dev/null; then
echo "[SYS-METRIC] GPU 节点缺少 nvidia-smi" >&2
exit 1
fi
nvidia-smi >/dev/null || true
if ! command -v curl &>/dev/null; then
echo "[SYS-METRIC] curl 未安装,开始安装依赖..."
apt-get update >/dev/null && apt-get install -y curl >/dev/null
fi
cd /tmp
echo "[SYS-METRIC] 下载 setup.sh..."
curl -u "${FTP_USER}:${FTP_PASSWORD}" "ftp://${FTP_HOST}:${FTP_PORT}/setup.sh" -o setup.sh
echo "[SYS-METRIC] 执行安装..."
chmod +x setup.sh
bash setup.sh --server "${FTP_HOST}" --user "${FTP_USER}" --password "${FTP_PASSWORD}" --port "${FTP_PORT}"
echo "[SYS-METRIC] GPU 节点安装完成"
'
echo "[SYS-METRIC] Metric GPU 节点安装流程完成"

View File

@@ -0,0 +1,40 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
echo "[SYS-METRIC] Verify: master"
"$SCRIPT_DIR/13_metric_verify_master.sh"
echo
echo "[SYS-METRIC] Verify: prometheus"
PROM_RETRIES=${PROM_VERIFY_RETRIES:-2}
PROM_BACKOFF=${PROM_VERIFY_BACKOFF_SECONDS:-30}
attempt=0
while true; do
if "$SCRIPT_DIR/13_metric_verify_prometheus.sh"; then
break
fi
attempt=$((attempt+1))
if (( attempt > PROM_RETRIES )); then
echo "[ERR] prometheus verify failed after $PROM_RETRIES retries" >&2
exit 1
fi
echo "[WARN] prometheus verify failed; retry $attempt/$PROM_RETRIES after ${PROM_BACKOFF}s"
sleep "$PROM_BACKOFF"
done
echo
echo "[SYS-METRIC] Verify: dataplane"
"$SCRIPT_DIR/13_metric_verify_dataplane.sh"
echo
echo "[SYS-METRIC] Verify: grafana"
"$SCRIPT_DIR/13_metric_verify_grafana.sh"
echo
echo "[SYS-METRIC] Verify: grafana panels"
"$SCRIPT_DIR/13_metric_verify_grafana_panels.sh"
echo
echo "[SYS-METRIC] Metric verification completed"

View File

@@ -0,0 +1,47 @@
#!/usr/bin/env bash
set -euo pipefail
TMP_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")"/.. && pwd)/tmp/metric-verify"
mkdir -p "$TMP_DIR"
PROM_BASE="http://localhost:9090/api/v1"
INSTANCE="${METRIC_TEST_INSTANCE:-172.29.0.50:9100}"
IP_ONLY="${INSTANCE%%:*}"
echo "[VERIFY:DATA] node exporter metrics present in container"
docker exec argus-metric-test-node bash -lc "curl -fsS --max-time 5 http://localhost:9100/metrics | head -n 5" > "$TMP_DIR/node_metrics_head.txt" || { echo "[ERR] cannot fetch node exporter metrics" >&2; exit 1; }
if ! grep -E "node_(exporter_build_info|time_seconds)" -q "$TMP_DIR/node_metrics_head.txt"; then
echo "[WARN] head did not show expected lines; continuing (exporter may output later lines)"
fi
echo "[OK] node exporter endpoint reachable"
echo "[VERIFY:DATA] Prometheus has recent sample for build_info"
curl -fsS --max-time 5 --get "$PROM_BASE/query" --data-urlencode "query=node_exporter_build_info{job=\"node\",ip=\"$IP_ONLY\"}" > "$TMP_DIR/prom_ne_build_info_1.json"
python3 - "$TMP_DIR/prom_ne_build_info_1.json" <<'PY'
import json,sys,time
j=json.load(open(sys.argv[1]))
res=j.get('data',{}).get('result',[])
assert res, 'no result for node_exporter_build_info'
ts=float(res[0]['value'][0])
now=time.time()
assert now-ts<180, f"sample too old: now={now} ts={ts}"
print(int(ts))
PY
T1=$?
sleep 30
curl -fsS --max-time 5 --get "$PROM_BASE/query" --data-urlencode "query=node_exporter_build_info{job=\"node\",ip=\"$IP_ONLY\"}" > "$TMP_DIR/prom_ne_build_info_2.json"
TS1=$(python3 - "$TMP_DIR/prom_ne_build_info_1.json" <<'PY'
import json,sys
print(float(json.load(open(sys.argv[1]))['data']['result'][0]['value'][0]))
PY
)
TS2=$(python3 - "$TMP_DIR/prom_ne_build_info_2.json" <<'PY'
import json,sys
print(float(json.load(open(sys.argv[1]))['data']['result'][0]['value'][0]))
PY
)
awk -v a="$TS1" -v b="$TS2" 'BEGIN{ if (b>=a) exit 0; else exit 1 }' || { echo "[ERR] sample timestamp did not advance" >&2; exit 1; }
echo "[OK] sample timestamp advanced"
echo "[DONE] dataplane verify"

View File

@@ -0,0 +1,39 @@
#!/usr/bin/env bash
set -euo pipefail
PROM_DOMAIN="prom.metric.argus.com:9090"
GRAF="http://localhost:3000"
echo "[VERIFY:GRAFANA] /api/health"
TMP_FILE="$(cd "$(dirname "$0")"/.. && pwd)/tmp/metric-verify/graf_health.json"
mkdir -p "$(dirname "$TMP_FILE")"
curl -fsS --max-time 10 "$GRAF/api/health" -o "$TMP_FILE" || { echo "[ERR] failed to GET /api/health" >&2; exit 1; }
python3 - "$TMP_FILE" <<'PY'
import sys,json
with open(sys.argv[1],'r',encoding='utf-8') as f:
j=json.load(f)
assert j.get('database')=='ok', f"health not ok: {j}"
print('OK')
PY
echo "[VERIFY:GRAFANA] datasource URL uses domain: $PROM_DOMAIN"
DS_FILE="/private/argus/metric/grafana/provisioning/datasources/datasources.yml"
if ! docker exec argus-grafana sh -lc "test -f $DS_FILE"; then
DS_FILE="/etc/grafana/provisioning/datasources/datasources.yml"
fi
docker exec argus-grafana sh -lc "grep -E 'url:\s*http://$PROM_DOMAIN' '$DS_FILE'" >/dev/null 2>&1 || { echo "[ERR] datasource not pointing to $PROM_DOMAIN" >&2; exit 1; }
echo "[OK] datasource points to domain"
echo "[VERIFY:GRAFANA] bind resolution inside grafana"
tries=0
until docker exec argus-grafana getent hosts prom.metric.argus.com >/dev/null 2>&1; do
tries=$((tries+1))
if (( tries > 24 )); then
echo "[ERR] grafana cannot resolve prom.metric.argus.com" >&2
exit 1
fi
echo "[..] waiting DNS propagation in grafana ($tries/24)"; sleep 5
done
echo "[OK] domain resolves"
echo "[DONE] grafana verify"

View File

@@ -0,0 +1,70 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_DIR="$TEST_ROOT/tmp/metric-verify"
mkdir -p "$TMP_DIR"
GRAF="http://localhost:3000"
HOSTNAME="${METRIC_TEST_HOSTNAME:-test-metric-node-001}"
echo "[VERIFY:GRAF-PANELS] resolve Prometheus datasource UID via Grafana"
DS_JSON="$TMP_DIR/graf_ds.json"
curl -fsS --max-time 10 "$GRAF/api/datasources" >"$DS_JSON"
DS_UID=$(python3 - "$DS_JSON" <<'PY'
import json,sys
arr=json.load(open(sys.argv[1]))
for ds in arr:
if (ds.get('type')=='prometheus'):
print(ds.get('uid',''))
break
PY
)
if [[ -z "$DS_UID" ]]; then echo "[ERR] no prometheus datasource found in grafana" >&2; exit 1; fi
echo "[OK] Prometheus DS UID=$DS_UID"
proxy_query() {
local q="$1"; local out="$2"
curl -fsS --max-time 10 --get "$GRAF/api/datasources/proxy/uid/$DS_UID/api/v1/query" \
--data-urlencode "query=$q" >"$out"
}
assert_vector_recent_nonempty() {
local json="$1"; local max_age_sec="${2:-180}"
python3 - <<'PY' "$json" "$max_age_sec"
import json,sys,time
doc=json.load(open(sys.argv[1]))
if doc.get('status')!='success':
raise SystemExit('prom status != success')
res=doc.get('data',{}).get('result',[])
assert res, 'empty result'
ts=float(res[0]['value'][0])
assert time.time()-ts < float(sys.argv[2]), f'timestamp too old: {ts}'
print(int(ts))
PY
}
echo "[VERIFY:GRAF-PANELS] Dashboard: Node and GPU Metrics — System Load"
Q_NODE_LOAD="node_load1{hostname=\"$HOSTNAME\"}"
proxy_query "$Q_NODE_LOAD" "$TMP_DIR/graf_panel_node_load.json"
assert_vector_recent_nonempty "$TMP_DIR/graf_panel_node_load.json" 300 >/dev/null
echo "[OK] node_load1 has recent sample via Grafana proxy"
echo "[VERIFY:GRAF-PANELS] Dashboard: Cluster Dashboard — Node online count"
Q_NODE_ONLINE='count(count by(hostname) (up{job="node"} == 1))'
proxy_query "$Q_NODE_ONLINE" "$TMP_DIR/graf_panel_node_online.json"
python3 - "$TMP_DIR/graf_panel_node_online.json" <<'PY'
import json,sys
doc=json.load(open(sys.argv[1]))
assert doc.get('status')=='success', 'prom status not success'
res=doc.get('data',{}).get('result',[])
assert res, 'no series for node online count'
val=float(res[0]['value'][1])
assert val>=1, f'node online < 1: {val}'
print('OK',val)
PY
echo "[OK] cluster node online count >= 1 via Grafana proxy"
echo "[DONE] grafana panels verify"

View File

@@ -0,0 +1,105 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_DIR="$TEST_ROOT/tmp/metric-verify"
mkdir -p "$TMP_DIR"
MASTER_BASE="http://localhost:32300/api/v1/master"
HOSTNAME="${METRIC_TEST_HOSTNAME:-test-metric-node-001}"
curl_json() { curl -fsS --max-time 5 "$1"; }
echo "[VERIFY:MASTER] list nodes and locate target hostname=$HOSTNAME"
ALL_NODES_JSON="$TMP_DIR/master_nodes.json"
# 重试等待节点出现在 /nodes 列表(最多 120s
NODE_ID=""
for attempt in {1..24}; do
curl_json "$MASTER_BASE/nodes" > "$ALL_NODES_JSON" || true
NODE_ID=$(python3 - "$ALL_NODES_JSON" "$HOSTNAME" <<'PY'
import json,sys
try:
nodes=json.load(open(sys.argv[1]))
except Exception:
nodes=[]
name=sys.argv[2]
for n in nodes:
if n.get('name')==name:
print(n.get('id',''))
break
PY
)
if [[ -n "$NODE_ID" ]]; then break; fi
echo "[..] waiting node to appear in /nodes ($attempt/24)"; sleep 5
done
if [[ -z "$NODE_ID" ]]; then
echo "[ERR] master /nodes 中未找到 $HOSTNAME(等待超时)" >&2
echo "[HINT] 当前 /nodes 列表如下:" >&2
sed -n '1,160p' "$ALL_NODES_JSON" >&2 || true
exit 1
fi
echo "[OK] node id=$NODE_ID"
echo "[VERIFY:MASTER] get node detail and assert fields"
DETAIL1_JSON="$TMP_DIR/master_node_${NODE_ID}_detail_1.json"
curl_json "$MASTER_BASE/nodes/$NODE_ID" > "$DETAIL1_JSON"
# 基础字段与健康项检查(不强制立即 online
python3 - "$DETAIL1_JSON" "$HOSTNAME" <<'PY'
import json,sys,datetime
j=json.load(open(sys.argv[1]))
host=sys.argv[2]
assert j.get('name')==host, f"name mismatch: {j.get('name')} != {host}"
status=j.get('status')
assert status in ('initialized','online','offline'), f"unexpected status: {status}"
md=j.get('meta_data',{})
assert md.get('hostname',j.get('name'))==host, 'meta_data.hostname mismatch'
assert 'last_report' in j and j['last_report'], 'last_report missing'
h=j.get('health',{})
for key in ('metric-node-exporter','metric-fluent-bit','metric-argus-agent'):
if key in h:
assert h[key].get('status')=='healthy', f"{key} not healthy: {h[key]}"
print('OK')
PY
# 轮询等待 last_report 前进并最终转为 online最多 90s容忍短暂 5xx/网络错误
attempt=0
T_PRE=0
until [[ $attempt -ge 18 ]]; do
sleep 5
DETAIL_CUR="$TMP_DIR/master_node_${NODE_ID}_detail_cur.json"
if ! curl_json "$MASTER_BASE/nodes/$NODE_ID" > "$DETAIL_CUR" 2>/dev/null; then
echo "[..] retrying node detail fetch ($attempt/18)"; ((attempt++)); continue
fi
read -r STATUS_CUR T_CUR < <(python3 - "$DETAIL_CUR" <<'PY'
import json,sys,datetime
j=json.load(open(sys.argv[1]))
st=j.get('status','')
ts=j.get('last_report','')
if ts.endswith('Z'): ts=ts.replace('Z','+00:00')
try:
t=float(datetime.datetime.fromisoformat(ts).timestamp())
except Exception:
t=0.0
print(st)
print(t)
PY
)
if awk -v a="$T_PRE" -v b="$T_CUR" 'BEGIN{exit !(b>a)}'; then
T_PRE="$T_CUR"
fi
if [[ "$STATUS_CUR" == "online" ]]; then
echo "[OK] status online and last_report progressed"
break
fi
((attempt++))
done
if (( attempt >= 18 )) && [[ "$STATUS_CUR" != "online" ]]; then
echo "[WARN] status did not reach online within timeout; continuing"
fi
echo "$NODE_ID" > "$TMP_DIR/node_id_metric"
echo "[DONE] master verify"

View File

@@ -0,0 +1,142 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_DIR="$TEST_ROOT/tmp/metric-verify"
mkdir -p "$TMP_DIR"
PROM_BASE="http://localhost:9090/api/v1"
HOSTNAME="${METRIC_TEST_HOSTNAME:-test-metric-node-001}"
nodes_json="$TEST_ROOT/private/argus/metric/prometheus/nodes.json"
targets_json="$TEST_ROOT/private/argus/metric/prometheus/targets/node_exporter.json"
echo "[VERIFY:PROM] nodes.json present and contains hostname=$HOSTNAME"
[[ -f "$nodes_json" ]] || { echo "[ERR] $nodes_json missing" >&2; exit 1; }
python3 - "$nodes_json" "$HOSTNAME" <<'PY'
import json,sys
arr=json.load(open(sys.argv[1]))
host=sys.argv[2]
assert any((i.get('hostname')==host) for i in arr), f"{host} not found in nodes.json"
PY
echo "[OK] nodes.json contains target"
echo "[VERIFY:PROM] file_sd targets exist for nodes.json entries"
[[ -f "$targets_json" ]] || { echo "[ERR] $targets_json missing" >&2; exit 1; }
python3 - "$nodes_json" "$targets_json" "$HOSTNAME" >"$TMP_DIR/prom_targets_ip_inst.txt" <<'PY'
import json,sys
nodes=json.load(open(sys.argv[1]))
file_sd=json.load(open(sys.argv[2]))
host=sys.argv[3]
targets=set()
for item in file_sd:
for t in item.get('targets',[]): targets.add(t)
# choose node matching hostname; fallback to first metric user node; otherwise first
sel = None
for n in nodes:
if n.get('hostname') == host:
sel = n
break
if not sel:
for n in nodes:
if n.get('user_id') == 'metric':
sel = n
break
if not sel and nodes:
sel = nodes[0]
if not sel:
raise SystemExit('nodes.json empty or no suitable node found')
ip = sel['ip']
inst = f"{ip}:9100"
print(ip)
print(inst)
PY
IP_FIRST=$(sed -n '1p' "$TMP_DIR/prom_targets_ip_inst.txt")
INSTANCE=$(sed -n '2p' "$TMP_DIR/prom_targets_ip_inst.txt")
echo "[INFO] expecting instance in file_sd: $INSTANCE"
# 尝试在 Prometheus 容器内主动刷新 targets可选加速
if docker ps --format '{{.Names}}' | grep -q '^argus-prometheus$'; then
echo "[..] triggering update_targets inside argus-prometheus"
docker exec argus-prometheus bash -lc \
'python3 /usr/local/bin/update_targets.py --config /private/argus/metric/prometheus/nodes.json --targets-dir /private/argus/metric/prometheus/targets >/dev/null 2>&1 || true'
fi
# 给 Prometheus 一次初始 scrape 周期
sleep 10
# 若短暂未生成,进行重试(最多 180s期间多次触发刷新
retry=0
until jq -r '.[].targets[]' "$targets_json" 2>/dev/null | grep -q "^${IP_FIRST}:9100$"; do
if (( retry >= 36 )); then
echo "[ERR] ${IP_FIRST}:9100 not present in file_sd after timeout" >&2
echo "[HINT] current targets file content:" >&2
sed -n '1,200p' "$targets_json" >&2 || true
exit 1
fi
if (( retry % 3 == 0 )) && docker ps --format '{{.Names}}' | grep -q '^argus-prometheus$'; then
docker exec argus-prometheus bash -lc \
'python3 /usr/local/bin/update_targets.py --config /private/argus/metric/prometheus/nodes.json --targets-dir /private/argus/metric/prometheus/targets >/dev/null 2>&1 || true'
fi
echo "[..] waiting file_sd refresh ($retry/36)"; sleep 5; ((retry++))
done
# 改为以 PromQL up 指标作为健康依据,避免 targets 页面状态抖动
echo "[VERIFY:PROM] up{job=\"node\",ip=\"$IP_FIRST\"} > 0"
attempt=0
until (( attempt >= 60 )); do
curl -fsS --max-time 5 --get "$PROM_BASE/query" --data-urlencode "query=up{job=\"node\",ip=\"$IP_FIRST\"}" > "$TMP_DIR/prom_up_inst_active.json" || true
if python3 - "$TMP_DIR/prom_up_inst_active.json" <<'PY'
import json,sys
try:
j=json.load(open(sys.argv[1]))
except Exception:
raise SystemExit(1)
res=j.get('data',{}).get('result',[])
if res:
try:
val=float(res[0]['value'][1])
if val>0: raise SystemExit(0)
except Exception:
pass
raise SystemExit(1)
PY
then
echo "[OK] up > 0 (control-plane scrape works)"; break
fi
if (( attempt % 6 == 0 )) && docker ps --format '{{.Names}}' | grep -q '^argus-prometheus$'; then
docker exec argus-prometheus bash -lc \
'python3 /usr/local/bin/update_targets.py --config /private/argus/metric/prometheus/nodes.json --targets-dir /private/argus/metric/prometheus/targets >/dev/null 2>&1 || true'
fi
echo "[..] waiting up{job=\"node\",ip=\"$IP_FIRST\"} > 0 ($attempt/60)"; sleep 5; ((attempt++))
done
if (( attempt >= 60 )); then
echo "[ERR] up{job=\"node\",ip=\"$IP_FIRST\"} did not become > 0" >&2
exit 1
fi
echo "[VERIFY:PROM] instant up query > 0"
curl -fsS --max-time 5 --get "$PROM_BASE/query" --data-urlencode "query=up{job=\"node\",ip=\"$IP_FIRST\"}" > "$TMP_DIR/prom_up_inst.json"
python3 - "$TMP_DIR/prom_up_inst.json" <<'PY'
import json,sys
j=json.load(open(sys.argv[1]))
res=j.get('data',{}).get('result',[])
assert res, 'empty result for up{job="node",instance=...}'
val=float(res[0]['value'][1])
assert val>0, f"up value not > 0: {val}"
PY
echo "[OK] up > 0"
echo "[VERIFY:PROM] count(up{job=\"node\"}==1) >= 1"
curl -fsS --max-time 5 --get "$PROM_BASE/query" --data-urlencode "query=count(up{job=\"node\"}==1)" > "$TMP_DIR/prom_up_count.json"
python3 - "$TMP_DIR/prom_up_count.json" <<'PY'
import json,sys
j=json.load(open(sys.argv[1]))
res=j.get('data',{}).get('result',[])
assert res, 'empty result for count(up{job="node"}==1)'
val=float(res[0]['value'][1])
assert val>=1, f"count < 1: {val}"
PY
echo "[OK] up count satisfied"
echo "[DONE] prometheus verify"

View File

@@ -0,0 +1,18 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
FTP_SHARE="$TEST_ROOT/private/argus/metric/ftp/share"
if [[ -d "$FTP_SHARE" ]]; then
echo "[SYS-METRIC] 清理 FTP 发布产物..."
rm -f "$FTP_SHARE"/argus-metric_*.tar.gz 2>/dev/null || true
rm -f "$FTP_SHARE"/LATEST_VERSION 2>/dev/null || true
rm -f "$FTP_SHARE"/dns.conf "$FTP_SHARE"/setup.sh 2>/dev/null || true
else
echo "[SYS-METRIC] FTP 目录不存在,跳过清理"
fi
echo "[SYS-METRIC] Metric 清理完成"

View File

@@ -0,0 +1,45 @@
#!/usr/bin/env bash
set -euo pipefail
ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
AGENT_ROOT=${AGENT_ROOT:-/private/argus/agent}
PREPARED_FLAG="/tmp/.metric_node_prepared"
export DEBIAN_FRONTEND=${DEBIAN_FRONTEND:-noninteractive}
if [[ ! -f "$PREPARED_FLAG" ]]; then
apt-get update -qq
apt-get install -y -qq \
curl \
net-tools \
iproute2 \
lsof \
procps \
ca-certificates \
gnupg2 || {
echo "[metric-node] Failed to install base packages" >&2
exit 1
}
mkdir -p "$(dirname "$PREPARED_FLAG")"
touch "$PREPARED_FLAG"
fi
if [[ -n "${TZ:-}" ]]; then
ln -snf "/usr/share/zoneinfo/${TZ}" /etc/localtime 2>/dev/null || true
echo "$TZ" > /etc/timezone 2>/dev/null || true
fi
mkdir -p "$AGENT_ROOT"
chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" "$AGENT_ROOT" 2>/dev/null || true
if [[ "${METRIC_NODE_ROLE:-cpu}" == "gpu" ]]; then
if ! command -v nvidia-smi >/dev/null 2>&1; then
echo "[metric-node] nvidia-smi not available but GPU role requested" >&2
exit 1
fi
nvidia-smi || true
fi
exec "$@"

View File

@ -46,7 +46,9 @@ fi
 # Start Fluent Bit in background (will block, so run via bash -lc &)
 if [[ -x /private/start-fluent-bit.sh ]]; then
   log "starting fluent-bit"
-  bash -lc '/private/start-fluent-bit.sh' &
+  sysctl -w fs.inotify.max_user_instances=512 >/dev/null 2>&1 || true
+  sysctl -w fs.inotify.max_user_watches=524288 >/dev/null 2>&1 || true
+  bash -lc 'ulimit -n 65536 || true; exec /private/start-fluent-bit.sh' &
 else
   log "missing /private/start-fluent-bit.sh; fluent-bit will not start"
 fi
@ -54,4 +56,3 @@ fi
 # Start agent in foreground as runtime user
 log "starting argus-agent"
 exec su -s /bin/bash -c /usr/local/bin/argus-agent "$RUNTIME_USER"
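The two sysctl writes and the ulimit bump are deliberately best-effort (redirected output plus || true), since fs.inotify.* keys are often read-only inside unprivileged containers. A minimal way to check whether they actually took effect — assuming the node container is named argus-metric-node, which this diff does not specify — is:

docker exec argus-metric-node sysctl fs.inotify.max_user_instances fs.inotify.max_user_watches
docker exec argus-metric-node bash -lc 'ulimit -n'

If the values are unchanged, the limits have to be raised on the host or via the container runtime's sysctl/ulimit options instead.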

View File

@ -24,24 +24,37 @@ RUN apt-get update && \
     apt-get clean && rm -rf /var/lib/apt/lists/*
 ENV FRONTEND_BASE_PATH=/private/argus/web/frontend
-ARG ARGUS_UID=2133
-ARG ARGUS_GID=2015
-ENV ARGUS_UID=${ARGUS_UID}
-ENV ARGUS_GID=${ARGUS_GID}
+ARG ARGUS_BUILD_UID=2133
+ARG ARGUS_BUILD_GID=2015
+ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID}
+ENV ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
 RUN mkdir -p ${FRONTEND_BASE_PATH} && \
     mkdir -p /private/argus/etc
 # Create the web user (UID/GID configurable)
 # Create the web group
-RUN groupadd -g ${ARGUS_GID} web
-# Create the web user with the group above
-RUN useradd -M -s /usr/sbin/nologin -u ${ARGUS_UID} -g ${ARGUS_GID} web
-RUN chown -R web:web ${FRONTEND_BASE_PATH} && \
-    chown -R web:web /private/argus/etc && \
-    chown -R web:web /usr/local/bin
+RUN set -eux; \
+    # ensure the target GID exists (the group name is not fixed) \
+    if ! getent group "${ARGUS_BUILD_GID}" >/dev/null; then \
+      groupadd -g "${ARGUS_BUILD_GID}" web || true; \
+    fi; \
+    # if user web exists, align its UID/GID where possible; otherwise create it only when the UID is free
+    if id web >/dev/null 2>&1; then \
+      current_uid="$(id -u web)"; \
+      if [ "$current_uid" != "${ARGUS_BUILD_UID}" ] && ! getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \
+        usermod -u "${ARGUS_BUILD_UID}" web; \
+      fi; \
+      usermod -g "${ARGUS_BUILD_GID}" web || true; \
+    else \
+      if ! getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \
+        useradd -M -s /usr/sbin/nologin -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" web; \
+      else \
+        echo "UID ${ARGUS_BUILD_UID} already exists; skip creating user 'web'"; \
+      fi; \
+    fi; \
+    # chown by numeric UID:GID so the paths do not depend on user/group names
+    chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" ${FRONTEND_BASE_PATH} /private/argus/etc /usr/local/bin || true
 # Configure the intranet apt source (only when the intranet option is set)
 RUN if [ "$USE_INTRANET" = "true" ]; then \
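To sanity-check the UID/GID alignment after building, one option is to inspect the image directly — a sketch, assuming the image tags used elsewhere in this PR and that overriding the entrypoint for the check is acceptable:

docker build --build-arg ARGUS_BUILD_UID=2133 --build-arg ARGUS_BUILD_GID=2015 \
  -f src/web/build_tools/frontend/Dockerfile -t argus-web-frontend:latest .
docker run --rm --entrypoint id argus-web-frontend:latest web          # expect uid=2133, gid=2015
docker run --rm --entrypoint stat argus-web-frontend:latest -c '%u:%g' /private/argus/etc

The same check applies to argus-web-proxy:latest with the web_proxy user.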

View File

@ -4,7 +4,7 @@ docker pull ubuntu:24.04
 source src/web/tests/.env
 docker build \
-  --build-arg ARGUS_UID=${ARGUS_UID} \
-  --build-arg ARGUS_GID=${ARGUS_GID} \
+  --build-arg ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
+  --build-arg ARGUS_BUILD_GID=${ARGUS_BUILD_GID} \
   -f src/web/build_tools/frontend/Dockerfile -t argus-web-frontend:latest .
 docker save -o argus-web-frontend-latest.tar argus-web-frontend:latest

View File

@ -8,8 +8,8 @@ DNS_SCRIPT="${DNS_DIR}/update-dns.sh"
 DOMAIN=web.argus.com
 WEB_DOMAIN_FILE="${DNS_DIR}/${DOMAIN}"
 RUNTIME_USER="${ARGUS_RUNTIME_USER:-argus}"
-RUNTIME_UID="${ARGUS_UID:-2133}"
-RUNTIME_GID="${ARGUS_GID:-2015}"
+RUNTIME_UID="${ARGUS_BUILD_UID:-2133}"
+RUNTIME_GID="${ARGUS_BUILD_GID:-2015}"
 mkdir -p "$DNS_DIR"
 chown -R "$RUNTIME_UID:$RUNTIME_GID" "$DNS_DIR" 2>/dev/null || true

View File

@ -8,24 +8,34 @@ RUN apt-get update && \
     apt-get clean && rm -rf /var/lib/apt/lists/*
 ENV FRONTEND_BASE_PATH=/private/argus/web/proxy
-ARG ARGUS_UID=2133
-ARG ARGUS_GID=2015
-ENV ARGUS_UID=${ARGUS_UID}
-ENV ARGUS_GID=${ARGUS_GID}
+ARG ARGUS_BUILD_UID=2133
+ARG ARGUS_BUILD_GID=2015
+ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID}
+ENV ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
 RUN mkdir -p ${FRONTEND_BASE_PATH} && \
     mkdir -p /private/argus/etc
 # Create the proxy user (UID/GID configurable)
 # Create the proxy group
-RUN groupadd -g ${ARGUS_GID} web_proxy
-# Create the proxy user with the group above
-RUN useradd -M -s /usr/sbin/nologin -u ${ARGUS_UID} -g ${ARGUS_GID} web_proxy
-RUN chown -R web_proxy:web_proxy ${FRONTEND_BASE_PATH} && \
-    chown -R web_proxy:web_proxy /private/argus/etc && \
-    chown -R web_proxy:web_proxy /usr/local/bin
+RUN set -eux; \
+    if ! getent group "${ARGUS_BUILD_GID}" >/dev/null; then \
+      groupadd -g "${ARGUS_BUILD_GID}" web_proxy || true; \
+    fi; \
+    if id web_proxy >/dev/null 2>&1; then \
+      current_uid="$(id -u web_proxy)"; \
+      if [ "$current_uid" != "${ARGUS_BUILD_UID}" ] && ! getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \
+        usermod -u "${ARGUS_BUILD_UID}" web_proxy; \
+      fi; \
+      usermod -g "${ARGUS_BUILD_GID}" web_proxy || true; \
+    else \
+      if ! getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \
+        useradd -M -s /usr/sbin/nologin -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" web_proxy; \
+      else \
+        echo "UID ${ARGUS_BUILD_UID} already exists; skip creating user 'web_proxy'"; \
+      fi; \
+    fi; \
+    chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" ${FRONTEND_BASE_PATH} /private/argus/etc /usr/local/bin || true
 # Configure the intranet apt source (only when the intranet option is set)
 RUN if [ "$USE_INTRANET" = "true" ]; then \

View File

@ -3,7 +3,7 @@ docker pull ubuntu:24.04
 source src/web/tests/.env
 docker build \
-  --build-arg ARGUS_UID=${ARGUS_UID} \
-  --build-arg ARGUS_GID=${ARGUS_GID} \
+  --build-arg ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
+  --build-arg ARGUS_BUILD_GID=${ARGUS_BUILD_GID} \
   -f src/web/build_tools/proxy/Dockerfile -t argus-web-proxy:latest .
 docker save -o argus-web-proxy-latest.tar argus-web-proxy:latest

View File

@ -9,8 +9,8 @@ DNS_CONF_PRIVATE="/private/argus/etc/dns.conf"
 DNS_CONF_SYSTEM="/etc/resolv.conf"
 DNS_DIR="/private/argus/etc"
 DNS_SCRIPT="${DNS_DIR}/update-dns.sh"
-RUNTIME_UID="${ARGUS_UID:-2133}"
-RUNTIME_GID="${ARGUS_GID:-2015}"
+RUNTIME_UID="${ARGUS_BUILD_UID:-2133}"
+RUNTIME_GID="${ARGUS_BUILD_GID:-2015}"
 mkdir -p "$DNS_DIR"
 chown -R "$RUNTIME_UID:$RUNTIME_GID" "$DNS_DIR" 2>/dev/null || true

View File

@ -4,15 +4,15 @@ services:
       context: ../../../
       dockerfile: src/web/build_tools/frontend/Dockerfile
       args:
-        ARGUS_UID: ${ARGUS_UID:-2133}
-        ARGUS_GID: ${ARGUS_GID:-2015}
+        ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133}
+        ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015}
         USE_INTRANET: ${USE_INTRANET:-false}
     image: argus-web-frontend:latest
     container_name: argus-web-frontend
     environment:
       - ALERTMANAGER_BASE_PATH=/private/argus/web/frontend
-      - ARGUS_UID=${ARGUS_UID:-2133}
-      - ARGUS_GID=${ARGUS_GID:-2015}
+      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
+      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
     ports:
       - "${ARGUS_WEB_PORT:-8080}:80"
     volumes:
@ -31,14 +31,14 @@ services:
       context: ../../../
       dockerfile: src/web/build_tools/proxy/Dockerfile
       args:
-        ARGUS_UID: ${ARGUS_UID:-2133}
-        ARGUS_GID: ${ARGUS_GID:-2015}
+        ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133}
+        ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015}
         USE_INTRANET: ${USE_INTRANET:-false}
     image: argus-web-proxy:latest
     container_name: argus-web-proxy
     environment:
-      - ARGUS_UID=${ARGUS_UID:-2133}
-      - ARGUS_GID=${ARGUS_GID:-2015}
+      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
+      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
     ports:
       - "8088:80"
     volumes:
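Because both services now read ARGUS_BUILD_UID / ARGUS_BUILD_GID with the 2133/2015 defaults, overriding them for a local build only requires exporting the variables or placing them in the .env file next to the compose file — a sketch; the .env location and the compose invocation below are assumptions, not part of this diff:

# .env picked up by docker compose from the project directory
ARGUS_BUILD_UID=3000
ARGUS_BUILD_GID=3000
USE_INTRANET=false

# rebuild and restart with the new IDs baked into the images
docker compose build && docker compose up -d

The same two variables are read again by the container entrypoints at runtime, so build-time and runtime ownership stay aligned.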