Compare commits
No commits in common. "a1cdd05950ba13cb478f2a3af3323cb52a4f20cc" and "1d9a8ec6956246fbf8189d50448579500ac0c0f0" have entirely different histories.
a1cdd05950
...
1d9a8ec695
@ -254,54 +254,6 @@ if [[ "$build_metric" == true ]]; then
|
|||||||
done
|
done
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# =======================================
|
|
||||||
# Web & Alert module images
|
|
||||||
# =======================================
|
|
||||||
|
|
||||||
echo ""
|
|
||||||
echo "Building Web and Alert module images..."
|
|
||||||
|
|
||||||
# Pre-pull commonly used base images for stability
|
|
||||||
web_alert_base_images=(
|
|
||||||
"node:20"
|
|
||||||
"ubuntu:24.04"
|
|
||||||
)
|
|
||||||
|
|
||||||
for base_image in "${web_alert_base_images[@]}"; do
|
|
||||||
if ! pull_base_image "$base_image"; then
|
|
||||||
build_failed=true
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
web_builds=(
|
|
||||||
"Web Frontend|src/web/build_tools/frontend/Dockerfile|argus-web-frontend:latest|."
|
|
||||||
"Web Proxy|src/web/build_tools/proxy/Dockerfile|argus-web-proxy:latest|."
|
|
||||||
)
|
|
||||||
|
|
||||||
for build_spec in "${web_builds[@]}"; do
|
|
||||||
IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec"
|
|
||||||
if build_image "$image_label" "$dockerfile_path" "$image_tag" "$build_context"; then
|
|
||||||
images_built+=("$image_tag")
|
|
||||||
else
|
|
||||||
build_failed=true
|
|
||||||
fi
|
|
||||||
echo ""
|
|
||||||
done
|
|
||||||
|
|
||||||
alert_builds=(
|
|
||||||
"Alertmanager|src/alert/alertmanager/build/Dockerfile|argus-alertmanager:latest|."
|
|
||||||
)
|
|
||||||
|
|
||||||
for build_spec in "${alert_builds[@]}"; do
|
|
||||||
IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec"
|
|
||||||
if build_image "$image_label" "$dockerfile_path" "$image_tag" "$build_context"; then
|
|
||||||
images_built+=("$image_tag")
|
|
||||||
else
|
|
||||||
build_failed=true
|
|
||||||
fi
|
|
||||||
echo ""
|
|
||||||
done
|
|
||||||
|
|
||||||
echo "======================================="
|
echo "======================================="
|
||||||
echo "📦 Build Summary"
|
echo "📦 Build Summary"
|
||||||
echo "======================================="
|
echo "======================================="
|
||||||
|
|||||||
@ -71,9 +71,6 @@ declare -A images=(
|
|||||||
["argus-metric-ftp:latest"]="argus-metric-ftp-latest.tar"
|
["argus-metric-ftp:latest"]="argus-metric-ftp-latest.tar"
|
||||||
["argus-metric-prometheus:latest"]="argus-metric-prometheus-latest.tar"
|
["argus-metric-prometheus:latest"]="argus-metric-prometheus-latest.tar"
|
||||||
["argus-metric-grafana:latest"]="argus-metric-grafana-latest.tar"
|
["argus-metric-grafana:latest"]="argus-metric-grafana-latest.tar"
|
||||||
["argus-web-frontend:latest"]="argus-web-frontend-latest.tar"
|
|
||||||
["argus-web-proxy:latest"]="argus-web-proxy-latest.tar"
|
|
||||||
["argus-alertmanager:latest"]="argus-alertmanager-latest.tar"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# 函数:检查镜像是否存在
|
# 函数:检查镜像是否存在
|
||||||
|
|||||||
@ -20,10 +20,10 @@ RUN wget https://github.com/prometheus/alertmanager/releases/download/v${ALERTMA
|
|||||||
|
|
||||||
ENV ALERTMANAGER_BASE_PATH=/private/argus/alert/alertmanager
|
ENV ALERTMANAGER_BASE_PATH=/private/argus/alert/alertmanager
|
||||||
|
|
||||||
ARG ARGUS_BUILD_UID=2133
|
ARG ARGUS_UID=2133
|
||||||
ARG ARGUS_BUILD_GID=2015
|
ARG ARGUS_GID=2015
|
||||||
ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID}
|
ENV ARGUS_UID=${ARGUS_UID}
|
||||||
ENV ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
|
ENV ARGUS_GID=${ARGUS_GID}
|
||||||
|
|
||||||
RUN mkdir -p /usr/share/alertmanager && \
|
RUN mkdir -p /usr/share/alertmanager && \
|
||||||
mkdir -p ${ALERTMANAGER_BASE_PATH} && \
|
mkdir -p ${ALERTMANAGER_BASE_PATH} && \
|
||||||
@ -33,25 +33,16 @@ RUN mkdir -p /usr/share/alertmanager && \
|
|||||||
|
|
||||||
# 创建 alertmanager 用户(可自定义 UID/GID)
|
# 创建 alertmanager 用户(可自定义 UID/GID)
|
||||||
# 创建 alertmanager 用户组
|
# 创建 alertmanager 用户组
|
||||||
RUN set -eux; \
|
RUN groupadd -g ${ARGUS_GID} alertmanager
|
||||||
if ! getent group "${ARGUS_BUILD_GID}" >/dev/null; then \
|
|
||||||
groupadd -g "${ARGUS_BUILD_GID}" alertmanager || true; \
|
|
||||||
fi; \
|
|
||||||
if id alertmanager >/dev/null 2>&1; then \
|
|
||||||
current_uid="$(id -u alertmanager)"; \
|
|
||||||
if [ "$current_uid" != "${ARGUS_BUILD_UID}" ] && ! getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \
|
|
||||||
usermod -u "${ARGUS_BUILD_UID}" alertmanager; \
|
|
||||||
fi; \
|
|
||||||
usermod -g "${ARGUS_BUILD_GID}" alertmanager || true; \
|
|
||||||
else \
|
|
||||||
if ! getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \
|
|
||||||
useradd -M -s /usr/sbin/nologin -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" alertmanager; \
|
|
||||||
else \
|
|
||||||
echo "UID ${ARGUS_BUILD_UID} already exists; skip creating user 'alertmanager'"; \
|
|
||||||
fi; \
|
|
||||||
fi
|
|
||||||
|
|
||||||
RUN chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" /usr/share/alertmanager /alertmanager ${ALERTMANAGER_BASE_PATH} /private/argus/etc /usr/local/bin || true
|
# 创建 alertmanager 用户并指定组
|
||||||
|
RUN useradd -M -s /usr/sbin/nologin -u ${ARGUS_UID} -g ${ARGUS_GID} alertmanager
|
||||||
|
|
||||||
|
RUN chown -R alertmanager:alertmanager /usr/share/alertmanager && \
|
||||||
|
chown -R alertmanager:alertmanager /alertmanager && \
|
||||||
|
chown -R alertmanager:alertmanager ${ALERTMANAGER_BASE_PATH} && \
|
||||||
|
chown -R alertmanager:alertmanager /private/argus/etc && \
|
||||||
|
chown -R alertmanager:alertmanager /usr/local/bin
|
||||||
|
|
||||||
# 配置内网 apt 源 (如果指定了内网选项)
|
# 配置内网 apt 源 (如果指定了内网选项)
|
||||||
RUN if [ "$USE_INTRANET" = "true" ]; then \
|
RUN if [ "$USE_INTRANET" = "true" ]; then \
|
||||||
@ -95,3 +86,4 @@ EXPOSE 9093
|
|||||||
|
|
||||||
# 使用 supervisor 作为入口点
|
# 使用 supervisor 作为入口点
|
||||||
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
|
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
|
||||||
|
|
||||||
|
|||||||
@ -5,8 +5,8 @@ docker pull ubuntu:24.04
|
|||||||
source src/alert/tests/.env
|
source src/alert/tests/.env
|
||||||
|
|
||||||
docker build \
|
docker build \
|
||||||
--build-arg ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
|
--build-arg ARGUS_UID=${ARGUS_UID} \
|
||||||
--build-arg ARGUS_BUILD_GID=${ARGUS_BUILD_GID} \
|
--build-arg ARGUS_GID=${ARGUS_GID} \
|
||||||
-f src/alert/alertmanager/build/Dockerfile \
|
-f src/alert/alertmanager/build/Dockerfile \
|
||||||
-t argus-alertmanager:latest .
|
-t argus-alertmanager:latest .
|
||||||
|
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
DATA_ROOT=/home/argus/tmp/private/argus
|
DATA_ROOT=/home/argus/tmp/private/argus
|
||||||
ARGUS_BUILD_UID=1048
|
ARGUS_UID=1048
|
||||||
ARGUS_BUILD_GID=1048
|
ARGUS_GID=1048
|
||||||
|
|
||||||
USE_INTRANET=false
|
USE_INTRANET=false
|
||||||
|
|||||||
@ -4,15 +4,15 @@ services:
|
|||||||
context: ../../../
|
context: ../../../
|
||||||
dockerfile: src/alert/alertmanager/build/Dockerfile
|
dockerfile: src/alert/alertmanager/build/Dockerfile
|
||||||
args:
|
args:
|
||||||
ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133}
|
ARGUS_UID: ${ARGUS_UID:-2133}
|
||||||
ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015}
|
ARGUS_GID: ${ARGUS_GID:-2015}
|
||||||
USE_INTRANET: ${USE_INTRANET:-false}
|
USE_INTRANET: ${USE_INTRANET:-false}
|
||||||
image: argus-alertmanager:latest
|
image: argus-alertmanager:latest
|
||||||
container_name: argus-alertmanager
|
container_name: argus-alertmanager
|
||||||
environment:
|
environment:
|
||||||
- ALERTMANAGER_BASE_PATH=/private/argus/alert/alertmanager
|
- ALERTMANAGER_BASE_PATH=/private/argus/alert/alertmanager
|
||||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
- ARGUS_UID=${ARGUS_UID:-2133}
|
||||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
- ARGUS_GID=${ARGUS_GID:-2015}
|
||||||
ports:
|
ports:
|
||||||
- "${ARGUS_PORT:-9093}:9093"
|
- "${ARGUS_PORT:-9093}:9093"
|
||||||
volumes:
|
volumes:
|
||||||
|
|||||||
@ -26,7 +26,6 @@ RUN apt-get update && \
|
|||||||
apt-get install -y \
|
apt-get install -y \
|
||||||
bind9 \
|
bind9 \
|
||||||
bind9utils \
|
bind9utils \
|
||||||
dnsutils \
|
|
||||||
bind9-doc \
|
bind9-doc \
|
||||||
supervisor \
|
supervisor \
|
||||||
net-tools \
|
net-tools \
|
||||||
|
|||||||
@ -104,26 +104,7 @@ log_info "文件所有者: $OWNER"
|
|||||||
|
|
||||||
# 确保发布目录存在
|
# 确保发布目录存在
|
||||||
log_info "确保发布目录存在: $PUBLISH_DIR"
|
log_info "确保发布目录存在: $PUBLISH_DIR"
|
||||||
mkdir -p "$PUBLISH_DIR"
|
sudo mkdir -p "$PUBLISH_DIR"
|
||||||
|
|
||||||
IFS=':' read -r OWNER_UID OWNER_GID <<< "$OWNER"
|
|
||||||
if [[ -z "$OWNER_UID" || -z "$OWNER_GID" ]]; then
|
|
||||||
log_error "--owner 格式不正确,应为 uid:gid"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
CURRENT_UID=$(id -u)
|
|
||||||
CURRENT_GID=$(id -g)
|
|
||||||
if [[ "$OWNER_UID" != "$CURRENT_UID" || "$OWNER_GID" != "$CURRENT_GID" ]]; then
|
|
||||||
if [[ "$CURRENT_UID" -ne 0 ]]; then
|
|
||||||
log_error "当前用户 (${CURRENT_UID}:${CURRENT_GID}) 无法设置所有者为 ${OWNER_UID}:${OWNER_GID}"
|
|
||||||
log_error "请以目标用户运行脚本或预先调整目录权限"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
NEED_CHOWN=true
|
|
||||||
else
|
|
||||||
NEED_CHOWN=false
|
|
||||||
fi
|
|
||||||
|
|
||||||
# 创建临时目录用于打包
|
# 创建临时目录用于打包
|
||||||
TEMP_PACKAGE_DIR="/tmp/argus-metric-package-$$"
|
TEMP_PACKAGE_DIR="/tmp/argus-metric-package-$$"
|
||||||
@ -227,31 +208,26 @@ fi
|
|||||||
TAR_NAME="argus-metric_$(echo $VERSION | tr '.' '_').tar.gz"
|
TAR_NAME="argus-metric_$(echo $VERSION | tr '.' '_').tar.gz"
|
||||||
log_info "创建发布包: $TAR_NAME"
|
log_info "创建发布包: $TAR_NAME"
|
||||||
cd "$TEMP_PACKAGE_DIR"
|
cd "$TEMP_PACKAGE_DIR"
|
||||||
tar -czf "$PUBLISH_DIR/$TAR_NAME" *
|
sudo tar -czf "$PUBLISH_DIR/$TAR_NAME" *
|
||||||
cd - > /dev/null
|
cd - > /dev/null
|
||||||
|
|
||||||
if [[ "$NEED_CHOWN" == true ]]; then
|
# 设置文件所有者
|
||||||
log_info "设置文件所有者为: $OWNER"
|
log_info "设置文件所有者为: $OWNER"
|
||||||
chown "$OWNER" "$PUBLISH_DIR/$TAR_NAME"
|
sudo chown "$OWNER" "$PUBLISH_DIR/$TAR_NAME"
|
||||||
fi
|
|
||||||
|
|
||||||
# 清理临时目录
|
# 清理临时目录
|
||||||
rm -rf "$TEMP_PACKAGE_DIR"
|
rm -rf "$TEMP_PACKAGE_DIR"
|
||||||
|
|
||||||
# 更新 LATEST_VERSION 文件
|
# 更新 LATEST_VERSION 文件
|
||||||
log_info "更新 LATEST_VERSION 文件..."
|
log_info "更新 LATEST_VERSION 文件..."
|
||||||
echo "$VERSION" > "$PUBLISH_DIR/LATEST_VERSION"
|
echo "$VERSION" | sudo tee "$PUBLISH_DIR/LATEST_VERSION" > /dev/null
|
||||||
if [[ "$NEED_CHOWN" == true ]]; then
|
sudo chown "$OWNER" "$PUBLISH_DIR/LATEST_VERSION"
|
||||||
chown "$OWNER" "$PUBLISH_DIR/LATEST_VERSION"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# 复制 DNS 配置文件到发布目录根目录(直接从 config 目录复制)
|
# 复制 DNS 配置文件到发布目录根目录(直接从 config 目录复制)
|
||||||
if [[ -f "config/dns.conf" ]]; then
|
if [[ -f "config/dns.conf" ]]; then
|
||||||
log_info "复制 DNS 配置文件到发布目录根目录..."
|
log_info "复制 DNS 配置文件到发布目录根目录..."
|
||||||
cp "config/dns.conf" "$PUBLISH_DIR/"
|
sudo cp "config/dns.conf" "$PUBLISH_DIR/"
|
||||||
if [[ "$NEED_CHOWN" == true ]]; then
|
sudo chown "$OWNER" "$PUBLISH_DIR/dns.conf"
|
||||||
chown "$OWNER" "$PUBLISH_DIR/dns.conf"
|
|
||||||
fi
|
|
||||||
log_success "DNS 配置文件复制完成: $PUBLISH_DIR/dns.conf"
|
log_success "DNS 配置文件复制完成: $PUBLISH_DIR/dns.conf"
|
||||||
else
|
else
|
||||||
log_warning "未找到 config/dns.conf 文件,跳过 DNS 配置文件复制"
|
log_warning "未找到 config/dns.conf 文件,跳过 DNS 配置文件复制"
|
||||||
@ -260,10 +236,8 @@ fi
|
|||||||
# 复制 setup.sh 到发布目录
|
# 复制 setup.sh 到发布目录
|
||||||
if [[ -f "scripts/setup.sh" ]]; then
|
if [[ -f "scripts/setup.sh" ]]; then
|
||||||
log_info "复制 setup.sh 到发布目录..."
|
log_info "复制 setup.sh 到发布目录..."
|
||||||
cp "scripts/setup.sh" "$PUBLISH_DIR/"
|
sudo cp "scripts/setup.sh" "$PUBLISH_DIR/"
|
||||||
if [[ "$NEED_CHOWN" == true ]]; then
|
sudo chown "$OWNER" "$PUBLISH_DIR/setup.sh"
|
||||||
chown "$OWNER" "$PUBLISH_DIR/setup.sh"
|
|
||||||
fi
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# 显示发布结果
|
# 显示发布结果
|
||||||
|
|||||||
@ -65,8 +65,6 @@ COPY grafana.ini /tmp/grafana.ini
|
|||||||
COPY datasources/datasources.yml /tmp/datasources.yml
|
COPY datasources/datasources.yml /tmp/datasources.yml
|
||||||
COPY dashboards/dashboards.yml /tmp/dashboards.yml
|
COPY dashboards/dashboards.yml /tmp/dashboards.yml
|
||||||
COPY dashboards/default_dashboard_by_hostname.json /tmp/default_dashboard.json
|
COPY dashboards/default_dashboard_by_hostname.json /tmp/default_dashboard.json
|
||||||
COPY dashboards/default_cluster_dashboard.json /tmp/default_cluster_dashboard.json
|
|
||||||
COPY dashboards/default_dashboard_by_instance.json /tmp/default_dashboard_by_instance.json
|
|
||||||
|
|
||||||
# supervisor 配置
|
# supervisor 配置
|
||||||
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
|
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
|
||||||
|
|||||||
@ -8,7 +8,7 @@ datasources:
|
|||||||
type: prometheus
|
type: prometheus
|
||||||
access: proxy
|
access: proxy
|
||||||
uid: eezk1zvkie4g0a
|
uid: eezk1zvkie4g0a
|
||||||
url: http://prom.metric.argus.com:9090
|
url: http://10.211.55.5:9090
|
||||||
isDefault: true
|
isDefault: true
|
||||||
editable: true
|
editable: true
|
||||||
jsonData:
|
jsonData:
|
||||||
|
|||||||
@ -44,18 +44,12 @@ else
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# 复制数据源配置文件到挂载目录
|
# 复制数据源配置文件到挂载目录
|
||||||
DS_OUT="/private/argus/metric/grafana/provisioning/datasources/datasources.yml"
|
if [ -f "/tmp/datasources.yml" ]; then
|
||||||
PROM_DOMAIN="prom.metric.argus.com:9090"
|
echo "[INFO] Copying datasource configuration to /private/argus/metric/grafana/provisioning/datasources/"
|
||||||
|
cp /tmp/datasources.yml /private/argus/metric/grafana/provisioning/datasources/datasources.yml
|
||||||
if [ -f "/tmp/datasources.yml" ] && [ ! -f "$DS_OUT" ]; then
|
echo "[INFO] Datasource configuration copied successfully"
|
||||||
echo "[INFO] Initializing datasource provisioning file from /tmp"
|
elif [ -d "/private/argus/metric/grafana/provisioning/datasources" ] && [ "$(ls -A /private/argus/metric/grafana/provisioning/datasources)" ]; then
|
||||||
cp /tmp/datasources.yml "$DS_OUT"
|
echo "[INFO] Found existing datasource provisioning files in /private/argus/metric/grafana/provisioning/datasources"
|
||||||
fi
|
|
||||||
|
|
||||||
# 统一将数据源 URL 规范为 prom.metric.argus.com:9090
|
|
||||||
if [ -f "$DS_OUT" ]; then
|
|
||||||
sed -i -E "s#^\s*url:\s*http://[^[:space:]]+# url: http://$PROM_DOMAIN#g" "$DS_OUT" || true
|
|
||||||
echo "[INFO] Datasource URL normalized to http://$PROM_DOMAIN"
|
|
||||||
elif [ -d "/etc/grafana/provisioning/datasources" ] && [ "$(ls -A /etc/grafana/provisioning/datasources)" ]; then
|
elif [ -d "/etc/grafana/provisioning/datasources" ] && [ "$(ls -A /etc/grafana/provisioning/datasources)" ]; then
|
||||||
echo "[INFO] Found datasource provisioning files in /etc/grafana/provisioning/datasources"
|
echo "[INFO] Found datasource provisioning files in /etc/grafana/provisioning/datasources"
|
||||||
# 确保数据源配置目录权限正确
|
# 确保数据源配置目录权限正确
|
||||||
@ -71,33 +65,11 @@ if [ -f "/tmp/dashboards.yml" ]; then
|
|||||||
echo "[INFO] Dashboard configuration copied successfully"
|
echo "[INFO] Dashboard configuration copied successfully"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# 复制默认仪表板到挂载目录(按需,不覆盖已存在文件)
|
# 复制默认仪表板到挂载目录
|
||||||
copy_dashboard_if_missing() {
|
if [ -f "/tmp/default_dashboard.json" ]; then
|
||||||
local src="$1"; local dst_name="$2"
|
echo "[INFO] Copying default dashboard to /private/argus/metric/grafana/provisioning/dashboards/"
|
||||||
local dst_dir="/private/argus/metric/grafana/provisioning/dashboards"
|
cp /tmp/default_dashboard.json /private/argus/metric/grafana/provisioning/dashboards/default_dashboard.json
|
||||||
local dst="$dst_dir/$dst_name"
|
echo "[INFO] Default dashboard copied successfully"
|
||||||
if [ -f "$src" ]; then
|
|
||||||
if [ ! -f "$dst" ]; then
|
|
||||||
echo "[INFO] Installing dashboard: $dst_name"
|
|
||||||
cp "$src" "$dst"
|
|
||||||
else
|
|
||||||
echo "[INFO] Dashboard exists, skip: $dst_name"
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
copy_dashboard_if_missing "/tmp/default_dashboard.json" "default_dashboard.json"
|
|
||||||
copy_dashboard_if_missing "/tmp/default_cluster_dashboard.json" "default_cluster_dashboard.json"
|
|
||||||
copy_dashboard_if_missing "/tmp/default_dashboard_by_instance.json" "default_dashboard_by_instance.json"
|
|
||||||
|
|
||||||
# 规范面板中的数据源字段:将字符串 "prometheus" 替换为 null(使用默认数据源)
|
|
||||||
DB_DIR="/private/argus/metric/grafana/provisioning/dashboards"
|
|
||||||
if [ -d "$DB_DIR" ]; then
|
|
||||||
for f in "$DB_DIR"/*.json; do
|
|
||||||
[ -f "$f" ] || continue
|
|
||||||
sed -i -E 's/"datasource"\s*:\s*"prometheus"/"datasource": null/g' "$f" || true
|
|
||||||
done
|
|
||||||
echo "[INFO] Normalized dashboard datasource to default (null)"
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# 启动 Grafana
|
# 启动 Grafana
|
||||||
|
|||||||
@ -1,5 +1,9 @@
|
|||||||
|
-version: "3.8"
|
||||||
|
|
||||||
networks:
|
networks:
|
||||||
sysnet:
|
default:
|
||||||
|
external: true
|
||||||
|
name: argus-sys-net
|
||||||
driver: bridge
|
driver: bridge
|
||||||
ipam:
|
ipam:
|
||||||
driver: default
|
driver: default
|
||||||
@ -11,7 +15,7 @@ services:
|
|||||||
image: ${BIND_IMAGE_TAG:-argus-bind9:latest}
|
image: ${BIND_IMAGE_TAG:-argus-bind9:latest}
|
||||||
container_name: argus-bind-sys
|
container_name: argus-bind-sys
|
||||||
networks:
|
networks:
|
||||||
sysnet:
|
default:
|
||||||
ipv4_address: 172.29.0.2
|
ipv4_address: 172.29.0.2
|
||||||
volumes:
|
volumes:
|
||||||
- ./private:/private
|
- ./private:/private
|
||||||
@ -35,7 +39,7 @@ services:
|
|||||||
- ./private/argus/metric/prometheus:/private/argus/metric/prometheus
|
- ./private/argus/metric/prometheus:/private/argus/metric/prometheus
|
||||||
- ./private/argus/etc:/private/argus/etc
|
- ./private/argus/etc:/private/argus/etc
|
||||||
networks:
|
networks:
|
||||||
sysnet:
|
default:
|
||||||
ipv4_address: 172.29.0.10
|
ipv4_address: 172.29.0.10
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
|
||||||
@ -54,9 +58,6 @@ services:
|
|||||||
ports:
|
ports:
|
||||||
- "9200:9200"
|
- "9200:9200"
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
networks:
|
|
||||||
sysnet:
|
|
||||||
ipv4_address: 172.29.0.3
|
|
||||||
|
|
||||||
kibana:
|
kibana:
|
||||||
image: argus-kibana:latest
|
image: argus-kibana:latest
|
||||||
@ -73,9 +74,6 @@ services:
|
|||||||
ports:
|
ports:
|
||||||
- "5601:5601"
|
- "5601:5601"
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
networks:
|
|
||||||
sysnet:
|
|
||||||
ipv4_address: 172.29.0.4
|
|
||||||
|
|
||||||
node-a:
|
node-a:
|
||||||
image: ubuntu:22.04
|
image: ubuntu:22.04
|
||||||
@ -108,8 +106,6 @@ services:
|
|||||||
ports:
|
ports:
|
||||||
- "2020:2020"
|
- "2020:2020"
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
networks:
|
|
||||||
- sysnet
|
|
||||||
|
|
||||||
node-b:
|
node-b:
|
||||||
image: ubuntu:22.04
|
image: ubuntu:22.04
|
||||||
@ -142,8 +138,6 @@ services:
|
|||||||
ports:
|
ports:
|
||||||
- "2021:2020"
|
- "2021:2020"
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
networks:
|
|
||||||
- sysnet
|
|
||||||
|
|
||||||
ftp:
|
ftp:
|
||||||
image: argus-metric-ftp:latest
|
image: argus-metric-ftp:latest
|
||||||
@ -166,7 +160,7 @@ services:
|
|||||||
- /etc/localtime:/etc/localtime:ro
|
- /etc/localtime:/etc/localtime:ro
|
||||||
- /etc/timezone:/etc/timezone:ro
|
- /etc/timezone:/etc/timezone:ro
|
||||||
networks:
|
networks:
|
||||||
sysnet:
|
default:
|
||||||
ipv4_address: 172.29.0.40
|
ipv4_address: 172.29.0.40
|
||||||
logging:
|
logging:
|
||||||
driver: "json-file"
|
driver: "json-file"
|
||||||
@ -191,7 +185,7 @@ services:
|
|||||||
- /etc/localtime:/etc/localtime:ro
|
- /etc/localtime:/etc/localtime:ro
|
||||||
- /etc/timezone:/etc/timezone:ro
|
- /etc/timezone:/etc/timezone:ro
|
||||||
networks:
|
networks:
|
||||||
sysnet:
|
default:
|
||||||
ipv4_address: 172.29.0.41
|
ipv4_address: 172.29.0.41
|
||||||
logging:
|
logging:
|
||||||
driver: "json-file"
|
driver: "json-file"
|
||||||
@ -211,9 +205,6 @@ services:
|
|||||||
- GF_SERVER_HTTP_PORT=3000
|
- GF_SERVER_HTTP_PORT=3000
|
||||||
- GF_LOG_LEVEL=warn
|
- GF_LOG_LEVEL=warn
|
||||||
- GF_LOG_MODE=console
|
- GF_LOG_MODE=console
|
||||||
- GF_PATHS_PROVISIONING=/private/argus/metric/grafana/provisioning
|
|
||||||
- GF_AUTH_ANONYMOUS_ENABLED=true
|
|
||||||
- GF_AUTH_ANONYMOUS_ORG_ROLE=Viewer
|
|
||||||
ports:
|
ports:
|
||||||
- "${GRAFANA_PORT:-3000}:3000"
|
- "${GRAFANA_PORT:-3000}:3000"
|
||||||
volumes:
|
volumes:
|
||||||
@ -222,7 +213,7 @@ services:
|
|||||||
- /etc/localtime:/etc/localtime:ro
|
- /etc/localtime:/etc/localtime:ro
|
||||||
- /etc/timezone:/etc/timezone:ro
|
- /etc/timezone:/etc/timezone:ro
|
||||||
networks:
|
networks:
|
||||||
sysnet:
|
default:
|
||||||
ipv4_address: 172.29.0.42
|
ipv4_address: 172.29.0.42
|
||||||
depends_on:
|
depends_on:
|
||||||
- prometheus
|
- prometheus
|
||||||
@ -233,7 +224,7 @@ services:
|
|||||||
max-file: "3"
|
max-file: "3"
|
||||||
|
|
||||||
test-node:
|
test-node:
|
||||||
image: ubuntu:22.04
|
image: argus-metric-test-node:latest
|
||||||
container_name: argus-metric-test-node
|
container_name: argus-metric-test-node
|
||||||
hostname: test-metric-node-001
|
hostname: test-metric-node-001
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
@ -249,21 +240,13 @@ services:
|
|||||||
- FTP_USER=${FTP_USER:-ftpuser}
|
- FTP_USER=${FTP_USER:-ftpuser}
|
||||||
- FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!}
|
- FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!}
|
||||||
- FTP_PORT=${FTP_PORT:-21}
|
- FTP_PORT=${FTP_PORT:-21}
|
||||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
|
||||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
|
||||||
- METRIC_NODE_ROLE=cpu
|
|
||||||
volumes:
|
volumes:
|
||||||
- ./private/argus/agent:/private/argus/agent
|
- ./private/argus/agent:/private/argus/agent
|
||||||
- ./scripts/metric/test-node-entrypoint.sh:/usr/local/bin/metric-test-node-entrypoint.sh:ro
|
|
||||||
- /etc/localtime:/etc/localtime:ro
|
- /etc/localtime:/etc/localtime:ro
|
||||||
- /etc/timezone:/etc/timezone:ro
|
- /etc/timezone:/etc/timezone:ro
|
||||||
entrypoint:
|
command: sleep infinity
|
||||||
- /usr/local/bin/metric-test-node-entrypoint.sh
|
|
||||||
command:
|
|
||||||
- sleep
|
|
||||||
- infinity
|
|
||||||
networks:
|
networks:
|
||||||
sysnet:
|
default:
|
||||||
ipv4_address: 172.29.0.50
|
ipv4_address: 172.29.0.50
|
||||||
logging:
|
logging:
|
||||||
driver: "json-file"
|
driver: "json-file"
|
||||||
@ -272,8 +255,7 @@ services:
|
|||||||
max-file: "3"
|
max-file: "3"
|
||||||
|
|
||||||
test-gpu-node:
|
test-gpu-node:
|
||||||
profiles: ["gpu"]
|
image: argus-metric-test-gpu-node:latest
|
||||||
image: nvidia/cuda:12.2.2-runtime-ubuntu22.04
|
|
||||||
container_name: argus-metric-test-gpu-node
|
container_name: argus-metric-test-gpu-node
|
||||||
hostname: test-metric-gpu-node-001
|
hostname: test-metric-gpu-node-001
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
@ -296,21 +278,13 @@ services:
|
|||||||
- NVIDIA_VISIBLE_DEVICES=all
|
- NVIDIA_VISIBLE_DEVICES=all
|
||||||
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
|
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
|
||||||
- GPU_MODE=gpu
|
- GPU_MODE=gpu
|
||||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
|
||||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
|
||||||
- METRIC_NODE_ROLE=gpu
|
|
||||||
volumes:
|
volumes:
|
||||||
- ./private/argus/agent:/private/argus/agent
|
- ./private/argus/agent:/private/argus/agent
|
||||||
- ./scripts/metric/test-node-entrypoint.sh:/usr/local/bin/metric-test-node-entrypoint.sh:ro
|
|
||||||
- /etc/localtime:/etc/localtime:ro
|
- /etc/localtime:/etc/localtime:ro
|
||||||
- /etc/timezone:/etc/timezone:ro
|
- /etc/timezone:/etc/timezone:ro
|
||||||
entrypoint:
|
command: sleep infinity
|
||||||
- /usr/local/bin/metric-test-node-entrypoint.sh
|
|
||||||
command:
|
|
||||||
- sleep
|
|
||||||
- infinity
|
|
||||||
networks:
|
networks:
|
||||||
sysnet:
|
default:
|
||||||
ipv4_address: 172.29.0.51
|
ipv4_address: 172.29.0.51
|
||||||
logging:
|
logging:
|
||||||
driver: "json-file"
|
driver: "json-file"
|
||||||
|
|||||||
@ -3,38 +3,6 @@ set -euo pipefail
|
|||||||
|
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
|
||||||
ENABLE_GPU=false
|
|
||||||
|
|
||||||
usage() {
|
|
||||||
cat <<'EOF'
|
|
||||||
Usage: 00_e2e_test.sh [options]
|
|
||||||
|
|
||||||
Options:
|
|
||||||
--enable-gpu 启用 GPU 相关拓扑与测试流程
|
|
||||||
-h, --help 显示帮助信息
|
|
||||||
EOF
|
|
||||||
}
|
|
||||||
|
|
||||||
while [[ $# -gt 0 ]]; do
|
|
||||||
case "$1" in
|
|
||||||
--enable-gpu)
|
|
||||||
ENABLE_GPU=true
|
|
||||||
shift
|
|
||||||
;;
|
|
||||||
-h|--help)
|
|
||||||
usage
|
|
||||||
exit 0
|
|
||||||
;;
|
|
||||||
*)
|
|
||||||
echo "Unknown argument: $1" >&2
|
|
||||||
usage
|
|
||||||
exit 1
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
done
|
|
||||||
|
|
||||||
export ARGUS_SYS_ENABLE_GPU=$ENABLE_GPU
|
|
||||||
|
|
||||||
SCRIPTS=(
|
SCRIPTS=(
|
||||||
"01_bootstrap.sh"
|
"01_bootstrap.sh"
|
||||||
"02_up.sh"
|
"02_up.sh"
|
||||||
@ -44,11 +12,6 @@ SCRIPTS=(
|
|||||||
"06_write_health_and_assert.sh"
|
"06_write_health_and_assert.sh"
|
||||||
"07_logs_send_and_assert.sh"
|
"07_logs_send_and_assert.sh"
|
||||||
"08_restart_agent_reregister.sh"
|
"08_restart_agent_reregister.sh"
|
||||||
"10_metric_publish.sh"
|
|
||||||
"11_metric_node_install.sh"
|
|
||||||
"12_metric_gpu_install.sh"
|
|
||||||
"13_metric_verify.sh"
|
|
||||||
"14_metric_cleanup.sh"
|
|
||||||
"09_down.sh"
|
"09_down.sh"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -60,3 +23,4 @@ for script in "${SCRIPTS[@]}"; do
|
|||||||
done
|
done
|
||||||
|
|
||||||
echo "[SYS-E2E] All tests completed"
|
echo "[SYS-E2E] All tests completed"
|
||||||
|
|
||||||
|
|||||||
@ -22,24 +22,6 @@ ensure_image() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
echo "[INFO] Preparing directories..."
|
echo "[INFO] Preparing directories..."
|
||||||
ensure_writable_dir() {
|
|
||||||
local path="$1"
|
|
||||||
local parent
|
|
||||||
parent="$(dirname "$path")"
|
|
||||||
mkdir -p "$parent" 2>/dev/null || true
|
|
||||||
mkdir -p "$path" 2>/dev/null || true
|
|
||||||
if [[ ! -w "$path" ]]; then
|
|
||||||
docker run --rm -v "$parent:/target" ubuntu:24.04 bash -lc "chown -R $(id -u):$(id -g) /target" >/dev/null 2>&1 || true
|
|
||||||
fi
|
|
||||||
mkdir -p "$path"
|
|
||||||
}
|
|
||||||
|
|
||||||
# preflight: make base dirs writable if inherited from root-owned mounts
|
|
||||||
ensure_writable_dir "$PRIVATE_CORE/argus"
|
|
||||||
ensure_writable_dir "$PRIVATE_CORE/argus/metric"
|
|
||||||
ensure_writable_dir "$PRIVATE_CORE/argus/metric/grafana"
|
|
||||||
ensure_writable_dir "$PRIVATE_CORE/argus/metric/prometheus"
|
|
||||||
|
|
||||||
mkdir -p \
|
mkdir -p \
|
||||||
"$PRIVATE_CORE/argus/etc" \
|
"$PRIVATE_CORE/argus/etc" \
|
||||||
"$PRIVATE_CORE/argus/bind" \
|
"$PRIVATE_CORE/argus/bind" \
|
||||||
@ -75,8 +57,6 @@ chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" \
|
|||||||
"$PRIVATE_CORE/argus/agent" \
|
"$PRIVATE_CORE/argus/agent" \
|
||||||
"$PRIVATE_CORE/argus/etc" 2>/dev/null || true
|
"$PRIVATE_CORE/argus/etc" 2>/dev/null || true
|
||||||
|
|
||||||
echo "[INFO] Using compose-managed network (auto-created by docker compose)"
|
|
||||||
|
|
||||||
echo "[INFO] Distributing update-dns.sh for core services (bind/master/es/kibana)"
|
echo "[INFO] Distributing update-dns.sh for core services (bind/master/es/kibana)"
|
||||||
BIND_UPDATE_SRC="$REPO_ROOT/src/bind/build/update-dns.sh"
|
BIND_UPDATE_SRC="$REPO_ROOT/src/bind/build/update-dns.sh"
|
||||||
BIND_UPDATE_DEST="$PRIVATE_CORE/argus/etc/update-dns.sh"
|
BIND_UPDATE_DEST="$PRIVATE_CORE/argus/etc/update-dns.sh"
|
||||||
@ -95,6 +75,8 @@ ensure_image "argus-master:latest"
|
|||||||
ensure_image "argus-metric-ftp:latest"
|
ensure_image "argus-metric-ftp:latest"
|
||||||
ensure_image "argus-metric-prometheus:latest"
|
ensure_image "argus-metric-prometheus:latest"
|
||||||
ensure_image "argus-metric-grafana:latest"
|
ensure_image "argus-metric-grafana:latest"
|
||||||
|
ensure_image "argus-metric-test-node:latest"
|
||||||
|
ensure_image "argus-metric-test-gpu-node:latest"
|
||||||
|
|
||||||
echo "[INFO] Building agent binary..."
|
echo "[INFO] Building agent binary..."
|
||||||
pushd "$REPO_ROOT/src/agent" >/dev/null
|
pushd "$REPO_ROOT/src/agent" >/dev/null
|
||||||
@ -109,25 +91,19 @@ fi
|
|||||||
echo "$AGENT_BIN" > "$TMP_DIR/agent_binary_path"
|
echo "$AGENT_BIN" > "$TMP_DIR/agent_binary_path"
|
||||||
|
|
||||||
# 检测GPU环境
|
# 检测GPU环境
|
||||||
REQUEST_GPU=${ARGUS_SYS_ENABLE_GPU:-false}
|
echo "[INFO] 检测GPU环境..."
|
||||||
GPU_CHECK_SCRIPT="$REPO_ROOT/src/metric/tests/scripts/common/check-gpu.sh"
|
GPU_CHECK_SCRIPT="$REPO_ROOT/src/metric/tests/scripts/common/check-gpu.sh"
|
||||||
if [[ "$REQUEST_GPU" == "true" ]]; then
|
if [ -f "$GPU_CHECK_SCRIPT" ]; then
|
||||||
echo "[INFO] --enable-gpu 已启用,开始检测GPU环境..."
|
|
||||||
if [[ -f "$GPU_CHECK_SCRIPT" ]]; then
|
|
||||||
if bash "$GPU_CHECK_SCRIPT" >/dev/null 2>&1; then
|
if bash "$GPU_CHECK_SCRIPT" >/dev/null 2>&1; then
|
||||||
echo "[INFO] GPU环境可用,将在 compose 中启用 test-gpu-node"
|
echo "[INFO] GPU环境可用,将启动test-gpu-node容器"
|
||||||
GPU_AVAILABLE=true
|
GPU_AVAILABLE=true
|
||||||
else
|
else
|
||||||
echo "[ERROR] 未检测到可用 GPU,但指定了 --enable-gpu" >&2
|
echo "[INFO] GPU环境不可用,跳过test-gpu-node容器"
|
||||||
exit 1
|
GPU_AVAILABLE=false
|
||||||
fi
|
|
||||||
else
|
|
||||||
echo "[ERROR] 未找到 GPU 检测脚本: $GPU_CHECK_SCRIPT" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
|
echo "[WARN] 未找到GPU检测脚本: $GPU_CHECK_SCRIPT,跳过GPU检测"
|
||||||
GPU_AVAILABLE=false
|
GPU_AVAILABLE=false
|
||||||
echo "[INFO] GPU 支持未启用,跳过 GPU 检测"
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "[INFO] Writing .env with UID/GID and metric configuration"
|
echo "[INFO] Writing .env with UID/GID and metric configuration"
|
||||||
@ -136,7 +112,7 @@ ARGUS_BUILD_UID=$ARGUS_BUILD_UID
|
|||||||
ARGUS_BUILD_GID=$ARGUS_BUILD_GID
|
ARGUS_BUILD_GID=$ARGUS_BUILD_GID
|
||||||
|
|
||||||
# GPU 配置
|
# GPU 配置
|
||||||
ENABLE_GPU=$GPU_AVAILABLE
|
GPU_AVAILABLE=$GPU_AVAILABLE
|
||||||
|
|
||||||
# FTP 配置
|
# FTP 配置
|
||||||
FTP_PORT=21
|
FTP_PORT=21
|
||||||
|
|||||||
@ -15,51 +15,39 @@ compose() {
|
|||||||
|
|
||||||
echo "[INFO] Bringing up system stack..."
|
echo "[INFO] Bringing up system stack..."
|
||||||
|
|
||||||
REQUEST_GPU=${ARGUS_SYS_ENABLE_GPU:-false}
|
# 检测GPU环境
|
||||||
GPU_AVAILABLE=false
|
echo "[INFO] 检测GPU环境..."
|
||||||
GPU_CHECK_SCRIPT="$REPO_ROOT/src/metric/tests/scripts/common/check-gpu.sh"
|
GPU_CHECK_SCRIPT="$REPO_ROOT/src/metric/tests/scripts/common/check-gpu.sh"
|
||||||
|
if [ -f "$GPU_CHECK_SCRIPT" ]; then
|
||||||
if [[ "$REQUEST_GPU" == "true" ]]; then
|
|
||||||
echo "[INFO] --enable-gpu 生效,验证主机 GPU..."
|
|
||||||
if [[ -f "$GPU_CHECK_SCRIPT" ]]; then
|
|
||||||
if bash "$GPU_CHECK_SCRIPT" >/dev/null 2>&1; then
|
if bash "$GPU_CHECK_SCRIPT" >/dev/null 2>&1; then
|
||||||
|
echo "[INFO] GPU环境可用,将启动GPU测试节点"
|
||||||
GPU_AVAILABLE=true
|
GPU_AVAILABLE=true
|
||||||
echo "[INFO] GPU 检测通过,将启动 gpu profile"
|
|
||||||
else
|
else
|
||||||
echo "[ERROR] 主机缺少可用 GPU,无法继续 --enable-gpu 流程" >&2
|
echo "[INFO] GPU环境不可用,将跳过GPU测试节点"
|
||||||
exit 1
|
GPU_AVAILABLE=false
|
||||||
fi
|
|
||||||
else
|
|
||||||
echo "[ERROR] 未找到 GPU 检测脚本: $GPU_CHECK_SCRIPT" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
echo "[INFO] 未启用 GPU 流程"
|
echo "[WARN] 未找到GPU检测脚本: $GPU_CHECK_SCRIPT,跳过GPU检测"
|
||||||
|
GPU_AVAILABLE=false
|
||||||
fi
|
fi
|
||||||
|
|
||||||
pushd "$TEST_ROOT" >/dev/null
|
pushd "$TEST_ROOT" >/dev/null
|
||||||
compose -p argus-sys down --remove-orphans || true
|
compose -p argus-sys down --remove-orphans || true
|
||||||
|
|
||||||
# 清理可能由 08 脚本创建的同名容器,避免 compose up 冲突
|
|
||||||
for name in argus-node-b; do
|
|
||||||
if docker ps -aqf "name=^${name}$" >/dev/null 2>&1 && [[ -n "$(docker ps -aqf "name=^${name}$")" ]]; then
|
|
||||||
docker rm -f "$name" >/dev/null 2>&1 || true
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
# 根据GPU可用性决定启动的服务
|
# 根据GPU可用性决定启动的服务
|
||||||
if [[ "$GPU_AVAILABLE" == true ]]; then
|
if [ "$GPU_AVAILABLE" = true ]; then
|
||||||
echo "[INFO] 启动所有服务(包含 gpu profile)..."
|
echo "[INFO] 启动所有服务(包括test-gpu-node)..."
|
||||||
compose -p argus-sys --profile gpu up -d
|
|
||||||
else
|
|
||||||
echo "[INFO] 启动基础服务(不含 gpu profile)..."
|
|
||||||
compose -p argus-sys up -d
|
compose -p argus-sys up -d
|
||||||
|
else
|
||||||
|
echo "[INFO] 启动基础服务(跳过test-gpu-node)..."
|
||||||
|
compose -p argus-sys up -d --scale test-gpu-node=0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
popd >/dev/null
|
popd >/dev/null
|
||||||
|
|
||||||
if [[ "$GPU_AVAILABLE" == true ]]; then
|
if [ "$GPU_AVAILABLE" = true ]; then
|
||||||
echo "[OK] Services started: master:32300 es:9200 kibana:5601 node-a:2020 node-b:2021 test-gpu-node:172.29.0.51"
|
echo "[OK] Services started: master:32300 es:9200 kibana:5601 node-a:2020 node-b:2021 test-gpu-node:172.29.0.51"
|
||||||
else
|
else
|
||||||
echo "[OK] Services started: master:32300 es:9200 kibana:5601 node-a:2020 node-b:2021 (gpu skipped)"
|
echo "[OK] Services started: master:32300 es:9200 kibana:5601 node-a:2020 node-b:2021 (test-gpu-node skipped)"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|||||||
@ -4,15 +4,20 @@ set -euo pipefail
|
|||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||||
|
|
||||||
# 直接根据 container_name 获取容器ID,避免 compose project 名称不一致导致查找失败
|
compose() {
|
||||||
cid_by_name() {
|
if docker compose version >/dev/null 2>&1; then
|
||||||
docker ps -aqf "name=^$1$"
|
docker compose "$@"
|
||||||
|
else
|
||||||
|
docker-compose "$@"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
service_id() {
|
||||||
|
compose -p argus-sys ps -q "$1"
|
||||||
}
|
}
|
||||||
|
|
||||||
echo "[INFO] Verifying DNS routing via bind..."
|
echo "[INFO] Verifying DNS routing via bind..."
|
||||||
|
|
||||||
pushd "$TEST_ROOT" >/dev/null
|
|
||||||
|
|
||||||
# Check master IP file exists in shared private
|
# Check master IP file exists in shared private
|
||||||
MASTER_FILE="$TEST_ROOT/private/argus/etc/master.argus.com"
|
MASTER_FILE="$TEST_ROOT/private/argus/etc/master.argus.com"
|
||||||
if [[ ! -f "$MASTER_FILE" ]]; then
|
if [[ ! -f "$MASTER_FILE" ]]; then
|
||||||
@ -23,7 +28,7 @@ MASTER_IP_HOST="$(cat "$MASTER_FILE" | tr -d '\r\n' || true)"
|
|||||||
echo "[INFO] master.argus.com file content: ${MASTER_IP_HOST}"
|
echo "[INFO] master.argus.com file content: ${MASTER_IP_HOST}"
|
||||||
|
|
||||||
# dig inside bind container
|
# dig inside bind container
|
||||||
BIN_ID="$(cid_by_name argus-bind-sys)"
|
BIN_ID="$(service_id bind)"
|
||||||
if [[ -n "$BIN_ID" ]]; then
|
if [[ -n "$BIN_ID" ]]; then
|
||||||
DIG_IP="$(docker exec "$BIN_ID" dig +short master.argus.com A | tail -n1 || true)"
|
DIG_IP="$(docker exec "$BIN_ID" dig +short master.argus.com A | tail -n1 || true)"
|
||||||
echo "[INFO] dig(master.argus.com) from bind container -> $DIG_IP"
|
echo "[INFO] dig(master.argus.com) from bind container -> $DIG_IP"
|
||||||
@ -34,8 +39,8 @@ else
|
|||||||
echo "[WARN] bind container not found; skip dig"
|
echo "[WARN] bind container not found; skip dig"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
for node in argus-node-a argus-node-b; do
|
for node in node-a node-b; do
|
||||||
CID="$(cid_by_name "$node")"
|
CID="$(service_id "$node")"
|
||||||
echo "[INFO] Checking resolution inside $node..."
|
echo "[INFO] Checking resolution inside $node..."
|
||||||
if ! docker exec "$CID" getent hosts master.argus.com >/dev/null 2>&1; then
|
if ! docker exec "$CID" getent hosts master.argus.com >/dev/null 2>&1; then
|
||||||
echo "[ERR] $node cannot resolve master.argus.com" >&2
|
echo "[ERR] $node cannot resolve master.argus.com" >&2
|
||||||
@ -45,6 +50,5 @@ for node in argus-node-a argus-node-b; do
|
|||||||
echo "[OK] $node resolved master.argus.com -> $RES"
|
echo "[OK] $node resolved master.argus.com -> $RES"
|
||||||
done
|
done
|
||||||
|
|
||||||
popd >/dev/null
|
|
||||||
|
|
||||||
echo "[OK] DNS routing verified"
|
echo "[OK] DNS routing verified"
|
||||||
|
|
||||||
|
|||||||
@ -49,35 +49,8 @@ for _ in {1..60}; do
|
|||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
# 若仍未全部注册,尝试重启 node-b 并再等待一轮(兼容 DNS/启动时序抖动)
|
|
||||||
if [[ ! -s "$TMP_DIR/node_id_a" || ! -s "$TMP_DIR/node_id_b" ]]; then
|
if [[ ! -s "$TMP_DIR/node_id_a" || ! -s "$TMP_DIR/node_id_b" ]]; then
|
||||||
echo "[WARN] node-a or node-b not registered in first window; restarting node-b and retrying..." >&2
|
echo "[ERR] Agents did not register in time" >&2
|
||||||
# 仅重启 node-b,避免影响 es/kibana/master
|
|
||||||
if docker ps --format '{{.Names}}' | grep -q '^argus-node-b$'; then
|
|
||||||
docker restart argus-node-b >/dev/null 2>&1 || true
|
|
||||||
fi
|
|
||||||
# 再等待一轮(最多 120 秒)
|
|
||||||
> "$TMP_DIR/node_id_b"
|
|
||||||
for _ in {1..60}; do
|
|
||||||
sleep 2
|
|
||||||
resp=$(curl -fsS "$API_BASE/nodes" 2>/dev/null || true)
|
|
||||||
[[ -z "$resp" ]] && continue
|
|
||||||
if ! echo "$resp" | head -c1 | grep -q '\['; then
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
echo "$resp" > "$TMP_DIR/nodes_list.json"
|
|
||||||
ID_A=$(extract_node "$HOST_A" "$TMP_DIR/node_id_a" "$TMP_DIR/nodes_list.json" 2>/dev/null || true)
|
|
||||||
ID_B=$(extract_node "$HOST_B" "$TMP_DIR/node_id_b" "$TMP_DIR/nodes_list.json" 2>/dev/null || true)
|
|
||||||
if [[ -s "$TMP_DIR/node_id_a" && -s "$TMP_DIR/node_id_b" ]]; then
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ ! -s "$TMP_DIR/node_id_a" || ! -s "$TMP_DIR/node_id_b" ]]; then
|
|
||||||
echo "[ERR] Agents did not register in time (after retry)" >&2
|
|
||||||
echo "[HINT] Current /nodes response:" >&2
|
|
||||||
sed -n '1,200p' "$TMP_DIR/nodes_list.json" >&2 || true
|
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|||||||
61
src/sys/tests/scripts/05_publish_artifact.sh
Executable file
61
src/sys/tests/scripts/05_publish_artifact.sh
Executable file
@ -0,0 +1,61 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
set -e
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||||
|
TEST_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||||
|
REPO_ROOT="$(cd "$TEST_DIR/../../.." && pwd)"
|
||||||
|
PLUGIN_DIR="$REPO_ROOT/src/metric/client-plugins/all-in-one-full"
|
||||||
|
|
||||||
|
# 加载 .env
|
||||||
|
if [ -f "$TEST_DIR/.env" ]; then
|
||||||
|
source "$TEST_DIR/.env"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 检测容器挂载目录
|
||||||
|
if docker ps --format '{{.Names}}' | grep -q '^argus-ftp$'; then
|
||||||
|
FTP_MOUNT=$(docker inspect argus-ftp --format '{{range .Mounts}}{{if eq .Destination "/private/argus/ftp"}}{{.Source}}{{end}}{{end}}')
|
||||||
|
OUTPUT_DIR="${FTP_MOUNT}/share"
|
||||||
|
echo "[02] 容器挂载: $OUTPUT_DIR"
|
||||||
|
else
|
||||||
|
OUTPUT_DIR="${DATA_ROOT:-$TEST_DIR/private}/ftp/share"
|
||||||
|
echo "[02] 默认路径: $OUTPUT_DIR"
|
||||||
|
fi
|
||||||
|
|
||||||
|
OWNER="${ARGUS_BUILD_UID:-2133}:${ARGUS_BUILD_GID:-2015}"
|
||||||
|
|
||||||
|
cd "$PLUGIN_DIR"
|
||||||
|
|
||||||
|
echo "[02] 递增版本号..."
|
||||||
|
bash scripts/version-manager.sh bump minor
|
||||||
|
|
||||||
|
VERSION_FILE="config/VERSION"
|
||||||
|
if [ ! -f "$VERSION_FILE" ]; then
|
||||||
|
echo "[02] 错误: 未找到 $VERSION_FILE"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
VERSION=$(cat "$VERSION_FILE" | tr -d '[:space:]')
|
||||||
|
echo "[02] 新版本: $VERSION"
|
||||||
|
|
||||||
|
echo "[02] 构建安装包..."
|
||||||
|
bash scripts/package_artifact.sh --force
|
||||||
|
|
||||||
|
echo "[02] 发布到 FTP: $OUTPUT_DIR"
|
||||||
|
sudo bash scripts/publish_artifact.sh "$VERSION" --output-dir "$OUTPUT_DIR" --owner "$OWNER"
|
||||||
|
|
||||||
|
echo "[02] 设置文件权限..."
|
||||||
|
# 设置所有者
|
||||||
|
sudo chown -R "$OWNER" "$OUTPUT_DIR"
|
||||||
|
# 设置目录权限为 755 (rwxr-xr-x)
|
||||||
|
sudo find "$OUTPUT_DIR" -type d -exec chmod 755 {} \;
|
||||||
|
# 设置文件权限为 644 (rw-r--r--)
|
||||||
|
sudo find "$OUTPUT_DIR" -type f -exec chmod 644 {} \;
|
||||||
|
# 特别处理 .sh 文件,给予执行权限 755
|
||||||
|
sudo find "$OUTPUT_DIR" -type f -name "*.sh" -exec chmod 755 {} \;
|
||||||
|
echo "[02] 权限设置完成 (UID:GID=$OWNER, dirs=755, files=644, scripts=755)"
|
||||||
|
|
||||||
|
echo "[02] 发布完成,验证文件..."
|
||||||
|
ls -lh "$OUTPUT_DIR"
|
||||||
|
|
||||||
|
echo "[02] 完成"
|
||||||
|
|
||||||
33
src/sys/tests/scripts/06_test_node_install.sh
Executable file
33
src/sys/tests/scripts/06_test_node_install.sh
Executable file
@ -0,0 +1,33 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
set -e
|
||||||
|
|
||||||
|
FTP_SERVER="${FTP_SERVER:-172.30.0.40}"
|
||||||
|
FTP_USER="${FTP_USER:-ftpuser}"
|
||||||
|
FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}"
|
||||||
|
FTP_PORT="${FTP_PORT:-21}"
|
||||||
|
|
||||||
|
FTP_HOST="${FTP_SERVER}"
|
||||||
|
|
||||||
|
echo "[03] 进入测试节点执行安装..."
|
||||||
|
echo "[03] 使用 FTP 地址: ${FTP_HOST}:${FTP_PORT}"
|
||||||
|
|
||||||
|
docker exec argus-metric-test-node bash -c "
|
||||||
|
set -e
|
||||||
|
|
||||||
|
if ! command -v curl &>/dev/null; then
|
||||||
|
echo '[03] curl 未安装,正在安装...'
|
||||||
|
apt-get update && apt-get install -y curl
|
||||||
|
fi
|
||||||
|
|
||||||
|
cd /tmp
|
||||||
|
echo '[03] 下载 setup.sh...'
|
||||||
|
curl -u ${FTP_USER}:${FTP_PASSWORD} ftp://${FTP_HOST}:${FTP_PORT}/setup.sh -o setup.sh
|
||||||
|
|
||||||
|
echo '[03] 执行安装...'
|
||||||
|
chmod +x setup.sh
|
||||||
|
bash setup.sh --server ${FTP_HOST} --user ${FTP_USER} --password '${FTP_PASSWORD}' --port ${FTP_PORT}
|
||||||
|
|
||||||
|
echo '[03] 安装完成'
|
||||||
|
"
|
||||||
|
|
||||||
|
echo "[03] 完成"
|
||||||
55
src/sys/tests/scripts/07_test_gpu_node_install.sh
Executable file
55
src/sys/tests/scripts/07_test_gpu_node_install.sh
Executable file
@ -0,0 +1,55 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
set -e
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||||
|
REPO_ROOT="$(cd "$TEST_ROOT/../../.." && pwd)"
|
||||||
|
|
||||||
|
FTP_SERVER="${FTP_SERVER:-172.29.0.40}"
|
||||||
|
FTP_USER="${FTP_USER:-ftpuser}"
|
||||||
|
FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}"
|
||||||
|
FTP_PORT="${FTP_PORT:-21}"
|
||||||
|
|
||||||
|
FTP_HOST="${FTP_SERVER}"
|
||||||
|
|
||||||
|
echo "[04] 检测GPU环境..."
|
||||||
|
# 检测GPU环境
|
||||||
|
GPU_CHECK_SCRIPT="$REPO_ROOT/metric/tests/scripts/common/check-gpu.sh"
|
||||||
|
if [ -f "$GPU_CHECK_SCRIPT" ]; then
|
||||||
|
if bash "$GPU_CHECK_SCRIPT"; then
|
||||||
|
echo "[04] GPU环境可用,继续执行GPU节点安装"
|
||||||
|
GPU_AVAILABLE=true
|
||||||
|
else
|
||||||
|
echo "[04] GPU环境不可用,跳过GPU节点安装"
|
||||||
|
GPU_AVAILABLE=false
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
echo "[04] 未找到GPU检测脚本: $GPU_CHECK_SCRIPT,跳过GPU节点安装"
|
||||||
|
GPU_AVAILABLE=false
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "[04] 进入测试节点执行安装..."
|
||||||
|
echo "[04] 使用 FTP 地址: ${FTP_HOST}:${FTP_PORT}"
|
||||||
|
|
||||||
|
docker exec argus-metric-test-gpu-node bash -c "
|
||||||
|
set -e
|
||||||
|
|
||||||
|
if ! command -v curl &>/dev/null; then
|
||||||
|
echo '[04] curl 未安装,正在安装...'
|
||||||
|
apt-get update && apt-get install -y curl
|
||||||
|
fi
|
||||||
|
|
||||||
|
cd /tmp
|
||||||
|
echo '[04] 下载 setup.sh...'
|
||||||
|
curl -u ${FTP_USER}:${FTP_PASSWORD} ftp://${FTP_HOST}:${FTP_PORT}/setup.sh -o setup.sh
|
||||||
|
|
||||||
|
echo '[04] 执行安装...'
|
||||||
|
chmod +x setup.sh
|
||||||
|
bash setup.sh --server ${FTP_HOST} --user ${FTP_USER} --password '${FTP_PASSWORD}' --port ${FTP_PORT}
|
||||||
|
|
||||||
|
echo '[04] 安装完成'
|
||||||
|
"
|
||||||
|
|
||||||
|
echo "[04] 完成"
|
||||||
96
src/sys/tests/scripts/08_check_services_installed.sh
Executable file
96
src/sys/tests/scripts/08_check_services_installed.sh
Executable file
@ -0,0 +1,96 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
set -e
|
||||||
|
|
||||||
|
echo "[04] 验证安装结果 - 检查监控端口..."
|
||||||
|
echo "=========================================="
|
||||||
|
|
||||||
|
# 检查容器是否运行
|
||||||
|
if ! docker ps --format '{{.Names}}' | grep -q '^argus-metric-test-node$'; then
|
||||||
|
echo "错误: 容器 argus-metric-test-node 未运行"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
ERRORS=0
|
||||||
|
|
||||||
|
# ==================== 检查监听端口 ====================
|
||||||
|
echo ""
|
||||||
|
echo "[1] 检查监听端口..."
|
||||||
|
echo "----------------------------------------"
|
||||||
|
CHECK_RESULT=$(docker exec argus-metric-test-node bash -c '
|
||||||
|
if command -v netstat >/dev/null 2>&1; then
|
||||||
|
echo "使用 netstat 检查端口:"
|
||||||
|
if netstat -tlnp 2>/dev/null | grep -E ":(9100|9400|2020)"; then
|
||||||
|
echo "✓ 找到监控端口"
|
||||||
|
exit 0
|
||||||
|
else
|
||||||
|
echo "✗ 未找到监控端口 (9100/9400/2020)"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
elif command -v ss >/dev/null 2>&1; then
|
||||||
|
echo "使用 ss 检查端口:"
|
||||||
|
if ss -tlnp 2>/dev/null | grep -E ":(9100|9400|2020)"; then
|
||||||
|
echo "✓ 找到监控端口"
|
||||||
|
exit 0
|
||||||
|
else
|
||||||
|
echo "✗ 未找到监控端口 (9100/9400/2020)"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
elif command -v lsof >/dev/null 2>&1; then
|
||||||
|
echo "使用 lsof 检查端口:"
|
||||||
|
if lsof -i :9100 -i :9400 -i :2020 2>/dev/null | grep LISTEN; then
|
||||||
|
echo "✓ 找到监控端口"
|
||||||
|
exit 0
|
||||||
|
else
|
||||||
|
echo "✗ 未找到监控端口 (9100/9400/2020)"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
echo "? 没有可用的端口检查工具 (netstat/ss/lsof),跳过此检查"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
')
|
||||||
|
echo "$CHECK_RESULT"
|
||||||
|
# 只有在明确失败时才计入错误(exit 1),没有工具(exit 0)不算错误
|
||||||
|
if echo "$CHECK_RESULT" | grep -q "✗ 未找到监控端口"; then
|
||||||
|
ERRORS=$((ERRORS + 1))
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ==================== 测试端口连通性 ====================
|
||||||
|
echo ""
|
||||||
|
echo "[2] 测试端口连通性..."
|
||||||
|
echo "----------------------------------------"
|
||||||
|
docker exec argus-metric-test-node bash -c '
|
||||||
|
if command -v curl >/dev/null 2>&1; then
|
||||||
|
FAILED=0
|
||||||
|
for port in 9100 9400 2020; do
|
||||||
|
echo -n "端口 $port: "
|
||||||
|
if curl -s --connect-timeout 2 "http://localhost:$port/metrics" > /dev/null 2>&1; then
|
||||||
|
echo "✓ 可访问 (/metrics)"
|
||||||
|
elif curl -s --connect-timeout 2 "http://localhost:$port/" > /dev/null 2>&1; then
|
||||||
|
echo "✓ 可访问 (根路径)"
|
||||||
|
else
|
||||||
|
echo "✗ 不可访问"
|
||||||
|
FAILED=$((FAILED + 1))
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
exit $FAILED
|
||||||
|
else
|
||||||
|
echo "? curl 不可用,跳过连通性测试"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
' || ERRORS=$((ERRORS + 1))
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "=========================================="
|
||||||
|
if [ $ERRORS -eq 0 ]; then
|
||||||
|
echo "✓ [04] 验证完成 - 所有端口检查通过"
|
||||||
|
else
|
||||||
|
echo "✗ [04] 验证失败 - 发现 $ERRORS 个问题"
|
||||||
|
echo ""
|
||||||
|
echo "调试建议:"
|
||||||
|
echo " 1. 进入容器检查: docker exec -it argus-metric-test-node bash"
|
||||||
|
echo " 2. 查看进程: docker exec argus-metric-test-node ps aux"
|
||||||
|
echo " 3. 查看日志: docker exec argus-metric-test-node cat /tmp/argus_install.log"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "=========================================="
|
||||||
@ -3,19 +3,9 @@ set -euo pipefail
|
|||||||
|
|
||||||
echo "[INFO] Sending logs via node-a/node-b and asserting ES counts..."
|
echo "[INFO] Sending logs via node-a/node-b and asserting ES counts..."
|
||||||
|
|
||||||
# Robust count helper: tolerates 404/503 and non-JSON responses, returns integer >=0
|
|
||||||
get_count() {
|
get_count() {
|
||||||
local idx="$1"; local tmp; tmp=$(mktemp)
|
local idx="$1"
|
||||||
local code
|
curl -s "http://localhost:9200/${idx}/_count?ignore_unavailable=true&allow_no_indices=true" | sed -E 's/.*"count":([0-9]+).*/\1/' | awk 'NF{print $0;exit} END{if(NR==0)print 0}'
|
||||||
code=$(curl -s -o "$tmp" -w "%{http_code}" "http://localhost:9200/${idx}/_count?ignore_unavailable=true&allow_no_indices=true" || true)
|
|
||||||
if [[ "$code" == "200" ]]; then
|
|
||||||
local val
|
|
||||||
val=$(jq -r '(.count // 0) | tonumber? // 0' "$tmp" 2>/dev/null || echo 0)
|
|
||||||
echo "$val"
|
|
||||||
else
|
|
||||||
echo 0
|
|
||||||
fi
|
|
||||||
rm -f "$tmp"
|
|
||||||
}
|
}
|
||||||
|
|
||||||
train0=$(get_count "train-*")
|
train0=$(get_count "train-*")
|
||||||
@ -42,26 +32,11 @@ send_logs "$node_a" "host01"
|
|||||||
send_logs "$node_b" "host02"
|
send_logs "$node_b" "host02"
|
||||||
|
|
||||||
echo "[INFO] Waiting for ES to ingest..."
|
echo "[INFO] Waiting for ES to ingest..."
|
||||||
# Proactively refresh indices (ignore errors if not created yet)
|
sleep 10
|
||||||
curl -s -X POST "http://localhost:9200/train-*/_refresh" >/dev/null 2>&1 || true
|
|
||||||
curl -s -X POST "http://localhost:9200/infer-*/_refresh" >/dev/null 2>&1 || true
|
|
||||||
|
|
||||||
# Retry up to 120s for counts to increase and reach threshold (>=4)
|
train1=$(get_count "train-*")
|
||||||
final=0
|
infer1=$(get_count "infer-*")
|
||||||
threshold=4
|
final=$((train1 + infer1))
|
||||||
for attempt in {1..60}; do
|
|
||||||
train1=$(get_count "train-*")
|
|
||||||
infer1=$(get_count "infer-*")
|
|
||||||
final=$((train1 + infer1))
|
|
||||||
if (( final > base && final >= threshold )); then
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
echo "[..] waiting ES counts increase to >=${threshold} ($attempt/60) current=${final} base=${base}"
|
|
||||||
# refresh indices again to speed up visibility
|
|
||||||
curl -s -X POST "http://localhost:9200/train-*/_refresh" >/dev/null 2>&1 || true
|
|
||||||
curl -s -X POST "http://localhost:9200/infer-*/_refresh" >/dev/null 2>&1 || true
|
|
||||||
sleep 2
|
|
||||||
done
|
|
||||||
echo "[INFO] final counts: train=${train1} infer=${infer1} total=${final}"
|
echo "[INFO] final counts: train=${train1} infer=${infer1} total=${final}"
|
||||||
|
|
||||||
if (( final <= base )); then
|
if (( final <= base )); then
|
||||||
@ -69,7 +44,6 @@ if (( final <= base )); then
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Minimal threshold to be tolerant: expect at least 4 documents (2 train + 1 infer per node)
|
|
||||||
if (( final < 4 )); then
|
if (( final < 4 )); then
|
||||||
echo "[ERR] ES total below expected threshold: ${final} < 4" >&2
|
echo "[ERR] ES total below expected threshold: ${final} < 4" >&2
|
||||||
exit 1
|
exit 1
|
||||||
@ -1,66 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
||||||
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
|
||||||
REPO_ROOT="$(cd "$TEST_ROOT/../../.." && pwd)"
|
|
||||||
|
|
||||||
PLUGIN_DIR="$REPO_ROOT/src/metric/client-plugins/all-in-one-full"
|
|
||||||
FTP_CONTAINER="argus-ftp"
|
|
||||||
|
|
||||||
if [[ ! -d "$PLUGIN_DIR" ]]; then
|
|
||||||
echo "[SYS-METRIC] Metric client plugin directory not found: $PLUGIN_DIR" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ -f "$TEST_ROOT/.env" ]]; then
|
|
||||||
# shellcheck source=/dev/null
|
|
||||||
source "$TEST_ROOT/.env"
|
|
||||||
fi
|
|
||||||
|
|
||||||
OWNER="${ARGUS_BUILD_UID:-2133}:${ARGUS_BUILD_GID:-2015}"
|
|
||||||
|
|
||||||
resolve_output_dir() {
|
|
||||||
local host_mount
|
|
||||||
if docker ps --format '{{.Names}}' | grep -q "^${FTP_CONTAINER}$"; then
|
|
||||||
host_mount=$(docker inspect "$FTP_CONTAINER" --format '{{range .Mounts}}{{if eq .Destination "/private/argus/ftp"}}{{.Source}}{{end}}{{end}}' 2>/dev/null || true)
|
|
||||||
if [[ -n "$host_mount" ]]; then
|
|
||||||
echo "$host_mount/share"
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
echo "$TEST_ROOT/private/argus/metric/ftp/share"
|
|
||||||
}
|
|
||||||
|
|
||||||
OUTPUT_DIR="$(resolve_output_dir)"
|
|
||||||
mkdir -p "$OUTPUT_DIR"
|
|
||||||
|
|
||||||
if [[ ! -w "$OUTPUT_DIR" ]]; then
|
|
||||||
echo "[SYS-METRIC] 无法写入 FTP 输出目录: $OUTPUT_DIR" >&2
|
|
||||||
echo " 请确认目录权限与 ARGUS_BUILD_UID/GID 一致" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
pushd "$PLUGIN_DIR" >/dev/null
|
|
||||||
|
|
||||||
echo "[SYS-METRIC] Bumping metric artifact version..."
|
|
||||||
bash scripts/version-manager.sh bump minor
|
|
||||||
|
|
||||||
VERSION_FILE="config/VERSION"
|
|
||||||
if [[ ! -f "$VERSION_FILE" ]]; then
|
|
||||||
echo "[SYS-METRIC] VERSION 文件缺失: $VERSION_FILE" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
VERSION=$(tr -d '\n' < "$VERSION_FILE")
|
|
||||||
echo "[SYS-METRIC] 当前版本: $VERSION"
|
|
||||||
|
|
||||||
echo "[SYS-METRIC] Packaging metric artifact..."
|
|
||||||
bash scripts/package_artifact.sh --force
|
|
||||||
|
|
||||||
echo "[SYS-METRIC] Publishing artifact to FTP share..."
|
|
||||||
bash scripts/publish_artifact.sh "$VERSION" --output-dir "$OUTPUT_DIR" --owner "$OWNER"
|
|
||||||
|
|
||||||
popd >/dev/null
|
|
||||||
|
|
||||||
echo "[SYS-METRIC] Metric artifact published to $OUTPUT_DIR"
|
|
||||||
@ -1,50 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
||||||
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
|
||||||
|
|
||||||
if [[ -f "$TEST_ROOT/.env" ]]; then
|
|
||||||
# shellcheck source=/dev/null
|
|
||||||
source "$TEST_ROOT/.env"
|
|
||||||
fi
|
|
||||||
|
|
||||||
CONTAINER="argus-metric-test-node"
|
|
||||||
|
|
||||||
if ! docker ps --format '{{.Names}}' | grep -q "^${CONTAINER}$"; then
|
|
||||||
echo "[SYS-METRIC] 容器 ${CONTAINER} 未运行,无法执行安装" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
FTP_HOST="${FTP_SERVER:-172.29.0.40}"
|
|
||||||
FTP_USER="${FTP_USER:-ftpuser}"
|
|
||||||
FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}"
|
|
||||||
FTP_PORT="${FTP_PORT:-21}"
|
|
||||||
|
|
||||||
echo "[SYS-METRIC] 在 ${CONTAINER} 内执行安装 (FTP: ${FTP_HOST}:${FTP_PORT})"
|
|
||||||
|
|
||||||
docker exec \
|
|
||||||
-e FTP_HOST="$FTP_HOST" \
|
|
||||||
-e FTP_USER="$FTP_USER" \
|
|
||||||
-e FTP_PASSWORD="$FTP_PASSWORD" \
|
|
||||||
-e FTP_PORT="$FTP_PORT" \
|
|
||||||
"$CONTAINER" bash -c '
|
|
||||||
set -e
|
|
||||||
|
|
||||||
if ! command -v curl &>/dev/null; then
|
|
||||||
echo "[SYS-METRIC] curl 未安装,开始安装依赖..."
|
|
||||||
apt-get update >/dev/null && apt-get install -y curl >/dev/null
|
|
||||||
fi
|
|
||||||
|
|
||||||
cd /tmp
|
|
||||||
echo "[SYS-METRIC] 下载 setup.sh..."
|
|
||||||
curl -u "${FTP_USER}:${FTP_PASSWORD}" "ftp://${FTP_HOST}:${FTP_PORT}/setup.sh" -o setup.sh
|
|
||||||
|
|
||||||
echo "[SYS-METRIC] 执行安装..."
|
|
||||||
chmod +x setup.sh
|
|
||||||
bash setup.sh --server "${FTP_HOST}" --user "${FTP_USER}" --password "${FTP_PASSWORD}" --port "${FTP_PORT}"
|
|
||||||
|
|
||||||
echo "[SYS-METRIC] 安装完成"
|
|
||||||
'
|
|
||||||
|
|
||||||
echo "[SYS-METRIC] Metric test node 安装流程完成"
|
|
||||||
@ -58,25 +58,10 @@ docker rm -f argus-node-b >/dev/null 2>&1 || true
|
|||||||
|
|
||||||
AGENT_BIN_PATH="$(cat "$TMP_DIR/agent_binary_path")"
|
AGENT_BIN_PATH="$(cat "$TMP_DIR/agent_binary_path")"
|
||||||
|
|
||||||
# 选择 compose 管理的网络名(默认 argus-sys_sysnet)。
|
|
||||||
detect_sysnet() {
|
|
||||||
if docker network inspect argus-sys_sysnet >/dev/null 2>&1; then
|
|
||||||
echo argus-sys_sysnet; return
|
|
||||||
fi
|
|
||||||
# 回退:从 master 容器推断所连网络(取第一个)
|
|
||||||
local n
|
|
||||||
n=$(docker inspect -f '{{range $k, $_ := .NetworkSettings.Networks}}{{println $k}}{{end}}' argus-master-sys 2>/dev/null | head -n1 || true)
|
|
||||||
if [[ -n "$n" ]]; then echo "$n"; return; fi
|
|
||||||
# 最后兜底:尝试项目默认网络(不保证有 IPAM)
|
|
||||||
echo argus-sys_default
|
|
||||||
}
|
|
||||||
SYSNET_NAME=$(detect_sysnet)
|
|
||||||
echo "[INFO] Using docker network: $SYSNET_NAME"
|
|
||||||
|
|
||||||
docker run -d \
|
docker run -d \
|
||||||
--name argus-node-b \
|
--name argus-node-b \
|
||||||
--hostname dev-yyrshare-uuuu10-ep2f-pod-0 \
|
--hostname dev-yyrshare-uuuu10-ep2f-pod-0 \
|
||||||
--network "$SYSNET_NAME" \
|
--network argus-sys-net \
|
||||||
--ip 172.29.0.200 \
|
--ip 172.29.0.200 \
|
||||||
--dns 172.29.0.2 \
|
--dns 172.29.0.2 \
|
||||||
-e MASTER_ENDPOINT=http://master.argus.com:3000 \
|
-e MASTER_ENDPOINT=http://master.argus.com:3000 \
|
||||||
@ -12,33 +12,12 @@ compose() {
|
|||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
|
docker rm -f argus-node-b >/dev/null 2>&1 || true
|
||||||
|
|
||||||
pushd "$TEST_ROOT" >/dev/null
|
pushd "$TEST_ROOT" >/dev/null
|
||||||
compose -p argus-sys down --remove-orphans || true
|
compose -p argus-sys down --remove-orphans || true
|
||||||
compose down --remove-orphans || true
|
|
||||||
popd >/dev/null
|
popd >/dev/null
|
||||||
|
|
||||||
echo "[INFO] Force removing containers by name (if any)..."
|
|
||||||
containers=(
|
|
||||||
argus-node-a
|
|
||||||
argus-node-b
|
|
||||||
argus-metric-test-node
|
|
||||||
argus-grafana
|
|
||||||
argus-kibana-sys
|
|
||||||
argus-master-sys
|
|
||||||
argus-bind-sys
|
|
||||||
argus-ftp
|
|
||||||
argus-es-sys
|
|
||||||
argus-prometheus
|
|
||||||
)
|
|
||||||
for c in "${containers[@]}"; do
|
|
||||||
id=$(docker ps -aqf "name=^${c}$" || true)
|
|
||||||
if [[ -n "$id" ]]; then
|
|
||||||
docker rm -f "$id" >/dev/null 2>&1 || true
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
echo "[INFO] Removing compose networks (handled by compose down)"
|
|
||||||
|
|
||||||
echo "[INFO] Cleaning private directories..."
|
echo "[INFO] Cleaning private directories..."
|
||||||
if [[ -d "$TEST_ROOT/private" ]]; then
|
if [[ -d "$TEST_ROOT/private" ]]; then
|
||||||
docker run --rm -v "$TEST_ROOT/private:/target" ubuntu:24.04 chown -R "$(id -u):$(id -g)" /target >/dev/null 2>&1 || true
|
docker run --rm -v "$TEST_ROOT/private:/target" ubuntu:24.04 chown -R "$(id -u):$(id -g)" /target >/dev/null 2>&1 || true
|
||||||
@ -1,64 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
||||||
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
|
||||||
|
|
||||||
ENABLE_GPU=${ARGUS_SYS_ENABLE_GPU:-false}
|
|
||||||
|
|
||||||
if [[ "$ENABLE_GPU" != "true" ]]; then
|
|
||||||
echo "[SYS-METRIC] 未启用 GPU 流程,跳过 GPU 节点安装"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ -f "$TEST_ROOT/.env" ]]; then
|
|
||||||
# shellcheck source=/dev/null
|
|
||||||
source "$TEST_ROOT/.env"
|
|
||||||
fi
|
|
||||||
|
|
||||||
CONTAINER="argus-metric-test-gpu-node"
|
|
||||||
|
|
||||||
if ! docker ps --format '{{.Names}}' | grep -q "^${CONTAINER}$"; then
|
|
||||||
echo "[SYS-METRIC] 预期启动的 ${CONTAINER} 未运行" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
FTP_HOST="${FTP_SERVER:-172.29.0.40}"
|
|
||||||
FTP_USER="${FTP_USER:-ftpuser}"
|
|
||||||
FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}"
|
|
||||||
FTP_PORT="${FTP_PORT:-21}"
|
|
||||||
|
|
||||||
echo "[SYS-METRIC] 在 GPU 节点执行安装 (FTP: ${FTP_HOST}:${FTP_PORT})"
|
|
||||||
|
|
||||||
docker exec \
|
|
||||||
-e FTP_HOST="$FTP_HOST" \
|
|
||||||
-e FTP_USER="$FTP_USER" \
|
|
||||||
-e FTP_PASSWORD="$FTP_PASSWORD" \
|
|
||||||
-e FTP_PORT="$FTP_PORT" \
|
|
||||||
"$CONTAINER" bash -c '
|
|
||||||
set -e
|
|
||||||
|
|
||||||
if ! command -v nvidia-smi &>/dev/null; then
|
|
||||||
echo "[SYS-METRIC] GPU 节点缺少 nvidia-smi" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
nvidia-smi >/dev/null || true
|
|
||||||
|
|
||||||
if ! command -v curl &>/dev/null; then
|
|
||||||
echo "[SYS-METRIC] curl 未安装,开始安装依赖..."
|
|
||||||
apt-get update >/dev/null && apt-get install -y curl >/dev/null
|
|
||||||
fi
|
|
||||||
|
|
||||||
cd /tmp
|
|
||||||
echo "[SYS-METRIC] 下载 setup.sh..."
|
|
||||||
curl -u "${FTP_USER}:${FTP_PASSWORD}" "ftp://${FTP_HOST}:${FTP_PORT}/setup.sh" -o setup.sh
|
|
||||||
|
|
||||||
echo "[SYS-METRIC] 执行安装..."
|
|
||||||
chmod +x setup.sh
|
|
||||||
bash setup.sh --server "${FTP_HOST}" --user "${FTP_USER}" --password "${FTP_PASSWORD}" --port "${FTP_PORT}"
|
|
||||||
|
|
||||||
echo "[SYS-METRIC] GPU 节点安装完成"
|
|
||||||
'
|
|
||||||
|
|
||||||
echo "[SYS-METRIC] Metric GPU 节点安装流程完成"
|
|
||||||
@ -1,40 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
||||||
|
|
||||||
echo "[SYS-METRIC] Verify: master"
|
|
||||||
"$SCRIPT_DIR/13_metric_verify_master.sh"
|
|
||||||
echo
|
|
||||||
|
|
||||||
echo "[SYS-METRIC] Verify: prometheus"
|
|
||||||
PROM_RETRIES=${PROM_VERIFY_RETRIES:-2}
|
|
||||||
PROM_BACKOFF=${PROM_VERIFY_BACKOFF_SECONDS:-30}
|
|
||||||
attempt=0
|
|
||||||
while true; do
|
|
||||||
if "$SCRIPT_DIR/13_metric_verify_prometheus.sh"; then
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
attempt=$((attempt+1))
|
|
||||||
if (( attempt > PROM_RETRIES )); then
|
|
||||||
echo "[ERR] prometheus verify failed after $PROM_RETRIES retries" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "[WARN] prometheus verify failed; retry $attempt/$PROM_RETRIES after ${PROM_BACKOFF}s"
|
|
||||||
sleep "$PROM_BACKOFF"
|
|
||||||
done
|
|
||||||
echo
|
|
||||||
|
|
||||||
echo "[SYS-METRIC] Verify: dataplane"
|
|
||||||
"$SCRIPT_DIR/13_metric_verify_dataplane.sh"
|
|
||||||
echo
|
|
||||||
|
|
||||||
echo "[SYS-METRIC] Verify: grafana"
|
|
||||||
"$SCRIPT_DIR/13_metric_verify_grafana.sh"
|
|
||||||
echo
|
|
||||||
|
|
||||||
echo "[SYS-METRIC] Verify: grafana panels"
|
|
||||||
"$SCRIPT_DIR/13_metric_verify_grafana_panels.sh"
|
|
||||||
echo
|
|
||||||
|
|
||||||
echo "[SYS-METRIC] Metric verification completed"
|
|
||||||
@ -1,47 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
TMP_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")"/.. && pwd)/tmp/metric-verify"
|
|
||||||
mkdir -p "$TMP_DIR"
|
|
||||||
|
|
||||||
PROM_BASE="http://localhost:9090/api/v1"
|
|
||||||
INSTANCE="${METRIC_TEST_INSTANCE:-172.29.0.50:9100}"
|
|
||||||
IP_ONLY="${INSTANCE%%:*}"
|
|
||||||
|
|
||||||
echo "[VERIFY:DATA] node exporter metrics present in container"
|
|
||||||
docker exec argus-metric-test-node bash -lc "curl -fsS --max-time 5 http://localhost:9100/metrics | head -n 5" > "$TMP_DIR/node_metrics_head.txt" || { echo "[ERR] cannot fetch node exporter metrics" >&2; exit 1; }
|
|
||||||
if ! grep -E "node_(exporter_build_info|time_seconds)" -q "$TMP_DIR/node_metrics_head.txt"; then
|
|
||||||
echo "[WARN] head did not show expected lines; continuing (exporter may output later lines)"
|
|
||||||
fi
|
|
||||||
echo "[OK] node exporter endpoint reachable"
|
|
||||||
|
|
||||||
echo "[VERIFY:DATA] Prometheus has recent sample for build_info"
|
|
||||||
curl -fsS --max-time 5 --get "$PROM_BASE/query" --data-urlencode "query=node_exporter_build_info{job=\"node\",ip=\"$IP_ONLY\"}" > "$TMP_DIR/prom_ne_build_info_1.json"
|
|
||||||
|
|
||||||
python3 - "$TMP_DIR/prom_ne_build_info_1.json" <<'PY'
|
|
||||||
import json,sys,time
|
|
||||||
j=json.load(open(sys.argv[1]))
|
|
||||||
res=j.get('data',{}).get('result',[])
|
|
||||||
assert res, 'no result for node_exporter_build_info'
|
|
||||||
ts=float(res[0]['value'][0])
|
|
||||||
now=time.time()
|
|
||||||
assert now-ts<180, f"sample too old: now={now} ts={ts}"
|
|
||||||
print(int(ts))
|
|
||||||
PY
|
|
||||||
T1=$?
|
|
||||||
sleep 30
|
|
||||||
curl -fsS --max-time 5 --get "$PROM_BASE/query" --data-urlencode "query=node_exporter_build_info{job=\"node\",ip=\"$IP_ONLY\"}" > "$TMP_DIR/prom_ne_build_info_2.json"
|
|
||||||
|
|
||||||
TS1=$(python3 - "$TMP_DIR/prom_ne_build_info_1.json" <<'PY'
|
|
||||||
import json,sys
|
|
||||||
print(float(json.load(open(sys.argv[1]))['data']['result'][0]['value'][0]))
|
|
||||||
PY
|
|
||||||
)
|
|
||||||
TS2=$(python3 - "$TMP_DIR/prom_ne_build_info_2.json" <<'PY'
|
|
||||||
import json,sys
|
|
||||||
print(float(json.load(open(sys.argv[1]))['data']['result'][0]['value'][0]))
|
|
||||||
PY
|
|
||||||
)
|
|
||||||
awk -v a="$TS1" -v b="$TS2" 'BEGIN{ if (b>=a) exit 0; else exit 1 }' || { echo "[ERR] sample timestamp did not advance" >&2; exit 1; }
|
|
||||||
echo "[OK] sample timestamp advanced"
|
|
||||||
echo "[DONE] dataplane verify"
|
|
||||||
@ -1,39 +0,0 @@
|
|||||||
#!/usr/bin/env bash
# Verify the Grafana side of the metric stack:
#   1. GET $GRAF/api/health reports database == "ok";
#   2. the provisioned datasource file inside the argus-grafana container
#      points at http://$PROM_DOMAIN;
#   3. the Prometheus host name resolves inside the argus-grafana container.
# Requires: curl, python3, docker and a running "argus-grafana" container.
# Exits non-zero on the first failed check.
set -euo pipefail

PROM_DOMAIN="prom.metric.argus.com:9090"
# Host part only (port stripped); used for the in-container DNS lookup.
PROM_HOST="${PROM_DOMAIN%%:*}"
# Copy of the domain with every dot wrapped as [.] so grep -E matches it literally.
PROM_DOMAIN_RE="${PROM_DOMAIN//./[.]}"
GRAF="http://localhost:3000"

echo "[VERIFY:GRAFANA] /api/health"
TMP_FILE="$(cd "$(dirname "${BASH_SOURCE[0]}")"/.. && pwd)/tmp/metric-verify/graf_health.json"
mkdir -p "$(dirname "$TMP_FILE")"
curl -fsS --max-time 10 "$GRAF/api/health" -o "$TMP_FILE" || {
  echo "[ERR] failed to GET /api/health" >&2
  exit 1
}
# Explicit check instead of `assert`, which is skipped under python -O / PYTHONOPTIMIZE.
python3 - "$TMP_FILE" <<'PY'
import json
import sys

with open(sys.argv[1], 'r', encoding='utf-8') as f:
    j = json.load(f)
if j.get('database') != 'ok':
    raise SystemExit(f"health not ok: {j}")
print('OK')
PY

echo "[VERIFY:GRAFANA] datasource URL uses domain: $PROM_DOMAIN"
# Prefer the file under /private; fall back to the stock Grafana location.
DS_FILE="/private/argus/metric/grafana/provisioning/datasources/datasources.yml"
if ! docker exec argus-grafana sh -lc "test -f $DS_FILE"; then
  DS_FILE="/etc/grafana/provisioning/datasources/datasources.yml"
fi
# [[:space:]] is the POSIX spelling of \s and works with any grep implementation.
docker exec argus-grafana sh -lc "grep -E 'url:[[:space:]]*http://$PROM_DOMAIN_RE' '$DS_FILE'" >/dev/null 2>&1 || {
  echo "[ERR] datasource not pointing to $PROM_DOMAIN" >&2
  exit 1
}
echo "[OK] datasource points to domain"

echo "[VERIFY:GRAFANA] bind resolution inside grafana"
# Poll every 5 s, giving up after 24 failed lookups.
tries=0
until docker exec argus-grafana getent hosts "$PROM_HOST" >/dev/null 2>&1; do
  tries=$((tries + 1))
  if (( tries > 24 )); then
    echo "[ERR] grafana cannot resolve $PROM_HOST" >&2
    exit 1
  fi
  echo "[..] waiting DNS propagation in grafana ($tries/24)"
  sleep 5
done
echo "[OK] domain resolves"

echo "[DONE] grafana verify"
|
|
||||||
@ -1,70 +0,0 @@
|
|||||||
#!/usr/bin/env bash
# Grafana panel verification, part 1: set up paths and look up the UID of the
# first Prometheus datasource that Grafana reports.
set -euo pipefail

# Test root is the parent of the directory holding this script; scratch files
# go under <test root>/tmp/metric-verify.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(dirname "$SCRIPT_DIR")"
TMP_DIR="${TEST_ROOT}/tmp/metric-verify"
mkdir -p "$TMP_DIR"

GRAF="http://localhost:3000"
HOSTNAME="${METRIC_TEST_HOSTNAME:-test-metric-node-001}"

echo "[VERIFY:GRAF-PANELS] resolve Prometheus datasource UID via Grafana"
DS_JSON="${TMP_DIR}/graf_ds.json"
curl -fsS --max-time 10 "$GRAF/api/datasources" >"$DS_JSON"

# Print the uid of the first datasource whose type is "prometheus"; print
# nothing when there is none, which leaves DS_UID empty.
DS_UID=$(python3 - "$DS_JSON" <<'PY'
import json
import sys

with open(sys.argv[1]) as fh:
    datasources = json.load(fh)
first = next((ds for ds in datasources if ds.get('type') == 'prometheus'), None)
if first is not None:
    print(first.get('uid', ''))
PY
)

if [[ -z "$DS_UID" ]]; then
  echo "[ERR] no prometheus datasource found in grafana" >&2
  exit 1
fi
echo "[OK] Prometheus DS UID=$DS_UID"
|
|
||||||
|
|
||||||
#######################################
# Run an instant PromQL query through Grafana's datasource proxy.
# Globals:   GRAF (read), DS_UID (read)
# Arguments: $1 - PromQL expression
#            $2 - file that receives the raw JSON response
# Returns:   curl's exit status (non-zero on HTTP errors because of -f)
#######################################
proxy_query() {
  local promql="$1"
  local dest="$2"
  local endpoint="$GRAF/api/datasources/proxy/uid/$DS_UID/api/v1/query"
  local -a curl_args=(-fsS --max-time 10 --get "$endpoint" --data-urlencode "query=$promql")
  curl "${curl_args[@]}" >"$dest"
}
|
|
||||||
|
|
||||||
#######################################
# Check that a Prometheus instant-query response holds at least one sample
# and that the first sample's timestamp is recent.
# Arguments: $1 - path to the JSON response file
#            $2 - maximum allowed sample age in seconds (default 180)
# Outputs:   the first sample's timestamp (integer seconds) on stdout
# Returns:   0 when the checks pass; non-zero (message on stderr) otherwise
#######################################
assert_vector_recent_nonempty() {
  local json="$1"
  local max_age_sec="${2:-180}"
  # Checks are explicit `if ... raise SystemExit` rather than `assert`, so they
  # still run when Python is started with -O or PYTHONOPTIMIZE is set.
  python3 - "$json" "$max_age_sec" <<'PY'
import json
import sys
import time

with open(sys.argv[1]) as fh:
    doc = json.load(fh)
if doc.get('status') != 'success':
    raise SystemExit('prom status != success')
res = doc.get('data', {}).get('result', [])
if not res:
    raise SystemExit('empty result')
ts = float(res[0]['value'][0])
if not (time.time() - ts < float(sys.argv[2])):
    raise SystemExit(f'timestamp too old: {ts}')
print(int(ts))
PY
}
|
|
||||||
|
|
||||||
# Panel check 1: the node_load1 series for the target host must have a sample
# no older than 300 s when queried through the Grafana proxy.
echo "[VERIFY:GRAF-PANELS] Dashboard: Node and GPU Metrics — System Load"
Q_NODE_LOAD="node_load1{hostname=\"$HOSTNAME\"}"
proxy_query "$Q_NODE_LOAD" "$TMP_DIR/graf_panel_node_load.json"
assert_vector_recent_nonempty "$TMP_DIR/graf_panel_node_load.json" 300 >/dev/null
echo "[OK] node_load1 has recent sample via Grafana proxy"

# Panel check 2: at least one distinct hostname must have up{job="node"} == 1.
echo "[VERIFY:GRAF-PANELS] Dashboard: Cluster Dashboard — Node online count"
Q_NODE_ONLINE='count(count by(hostname) (up{job="node"} == 1))'
proxy_query "$Q_NODE_ONLINE" "$TMP_DIR/graf_panel_node_online.json"
# Explicit checks instead of `assert`, which is skipped under python -O / PYTHONOPTIMIZE.
python3 - "$TMP_DIR/graf_panel_node_online.json" <<'PY'
import json
import sys

with open(sys.argv[1]) as fh:
    doc = json.load(fh)
if doc.get('status') != 'success':
    raise SystemExit('prom status not success')
res = doc.get('data', {}).get('result', [])
if not res:
    raise SystemExit('no series for node online count')
val = float(res[0]['value'][1])
# `not val >= 1` (rather than `val < 1`) also rejects NaN.
if not val >= 1:
    raise SystemExit(f'node online < 1: {val}')
print('OK', val)
PY
echo "[OK] cluster node online count >= 1 via Grafana proxy"

echo "[DONE] grafana panels verify"
|
|
||||||
|
|
||||||
@ -1,105 +0,0 @@
|
|||||||
#!/usr/bin/env bash
# Master verification, setup: strict mode, scratch directory, and the master
# API base URL / target host name used by the checks that follow.
set -euo pipefail

# Test root is the parent of the directory holding this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(dirname "$SCRIPT_DIR")"
TMP_DIR="${TEST_ROOT}/tmp/metric-verify"
mkdir -p "$TMP_DIR"

MASTER_BASE="http://localhost:32300/api/v1/master"
# Target node name; override with METRIC_TEST_HOSTNAME.
HOSTNAME="${METRIC_TEST_HOSTNAME:-test-metric-node-001}"
|
|
||||||
|
|
||||||
# GET the URL in $1 and write the response body to stdout.
# Fails on HTTP errors (-f), stays quiet except for errors (-sS), 5 s timeout.
curl_json() {
  local url="$1"
  curl -fsS --max-time 5 "$url"
}
|
|
||||||
|
|
||||||
echo "[VERIFY:MASTER] list nodes and locate target hostname=$HOSTNAME"
ALL_NODES_JSON="$TMP_DIR/master_nodes.json"

# Retry until the node shows up in the /nodes list (24 attempts, 5 s apart,
# about 120 s in total). A failed fetch or an unusable response body counts as
# "not found yet" rather than aborting the script.
NODE_ID=""
for attempt in {1..24}; do
  curl_json "$MASTER_BASE/nodes" > "$ALL_NODES_JSON" || true
  # Prints the id of the first node whose name matches; prints nothing otherwise.
  # Non-list bodies and non-dict entries are ignored so that a transient error
  # object does not raise and kill the script through `set -e`.
  NODE_ID=$(python3 - "$ALL_NODES_JSON" "$HOSTNAME" <<'PY'
import json
import sys

try:
    with open(sys.argv[1]) as fh:
        nodes = json.load(fh)
except Exception:
    nodes = []
if not isinstance(nodes, list):
    nodes = []
name = sys.argv[2]
for n in nodes:
    if isinstance(n, dict) and n.get('name') == name:
        print(n.get('id', ''))
        break
PY
)
  if [[ -n "$NODE_ID" ]]; then
    break
  fi
  echo "[..] waiting node to appear in /nodes ($attempt/24)"
  sleep 5
done

if [[ -z "$NODE_ID" ]]; then
  echo "[ERR] master /nodes 中未找到 $HOSTNAME(等待超时)" >&2
  echo "[HINT] 当前 /nodes 列表如下:" >&2
  # Dump at most the first 160 lines of the last response as a debugging aid.
  sed -n '1,160p' "$ALL_NODES_JSON" >&2 || true
  exit 1
fi
echo "[OK] node id=$NODE_ID"
|
|
||||||
|
|
||||||
echo "[VERIFY:MASTER] get node detail and assert fields"
DETAIL1_JSON="$TMP_DIR/master_node_${NODE_ID}_detail_1.json"
curl_json "$MASTER_BASE/nodes/$NODE_ID" > "$DETAIL1_JSON"

# Basic field and health checks (the node is not required to be online yet).
# Checks are explicit `raise SystemExit` instead of `assert`, which is skipped
# under python -O / PYTHONOPTIMIZE.
python3 - "$DETAIL1_JSON" "$HOSTNAME" <<'PY'
import json
import sys

def fail(msg):
    raise SystemExit(msg)

with open(sys.argv[1]) as fh:
    j = json.load(fh)
host = sys.argv[2]
if j.get('name') != host:
    fail(f"name mismatch: {j.get('name')} != {host}")
status = j.get('status')
if status not in ('initialized', 'online', 'offline'):
    fail(f"unexpected status: {status}")
# `or {}` tolerates an explicit null for these objects.
md = j.get('meta_data') or {}
if md.get('hostname', j.get('name')) != host:
    fail('meta_data.hostname mismatch')
if not j.get('last_report'):
    fail('last_report missing')
h = j.get('health') or {}
# Health entries are optional; any that are present must be healthy.
for key in ('metric-node-exporter', 'metric-fluent-bit', 'metric-argus-agent'):
    if key in h and h[key].get('status') != 'healthy':
        fail(f"{key} not healthy: {h[key]}")
print('OK')
PY
|
|
||||||
|
|
||||||
# Poll the node detail until status becomes "online" (18 polls, 5 s apart,
# about 90 s in total), tolerating short-lived 5xx / network errors and
# unparsable bodies. T_PRE tracks the newest last_report timestamp seen.
#
# Fixes compared to the previous version:
#   - the counter uses attempt=$((attempt + 1)); `((attempt++))` returns
#     status 1 when attempt is 0 and terminated the script under `set -e`;
#   - the Python helper prints "<timestamp> <status>" on ONE line, because
#     `read` only consumes a single line (T_CUR used to be always empty);
#   - STATUS_CUR / T_CUR are initialised so `set -u` cannot trip when every
#     fetch fails.
attempt=0
T_PRE=0
T_CUR=0
STATUS_CUR=""
DETAIL_CUR="$TMP_DIR/master_node_${NODE_ID}_detail_cur.json"
until (( attempt >= 18 )); do
  sleep 5
  if ! curl_json "$MASTER_BASE/nodes/$NODE_ID" > "$DETAIL_CUR" 2>/dev/null; then
    echo "[..] retrying node detail fetch ($attempt/18)"
    attempt=$((attempt + 1))
    continue
  fi
  # The timestamp comes first because it is never empty; the status may be.
  if parsed=$(python3 - "$DETAIL_CUR" <<'PY'
import datetime
import json
import sys

with open(sys.argv[1]) as fh:
    j = json.load(fh)
st = j.get('status') or ''
ts = str(j.get('last_report') or '')
# fromisoformat() on older Pythons rejects a trailing "Z".
if ts.endswith('Z'):
    ts = ts[:-1] + '+00:00'
try:
    t = float(datetime.datetime.fromisoformat(ts).timestamp())
except Exception:
    t = 0.0
print(t, st)
PY
  ); then
    read -r T_CUR STATUS_CUR <<<"$parsed" || true
  else
    # Unparsable body: treat it like a failed fetch and try again.
    echo "[..] retrying node detail fetch ($attempt/18)"
    attempt=$((attempt + 1))
    continue
  fi
  # Remember the newest last_report timestamp (numeric comparison via awk).
  if awk -v a="$T_PRE" -v b="$T_CUR" 'BEGIN{exit !(b>a)}'; then
    T_PRE="$T_CUR"
  fi
  if [[ "$STATUS_CUR" == "online" ]]; then
    echo "[OK] status online and last_report progressed"
    break
  fi
  attempt=$((attempt + 1))
done
# Not reaching "online" is only a warning; the remaining steps still run.
if (( attempt >= 18 )) && [[ "$STATUS_CUR" != "online" ]]; then
  echo "[WARN] status did not reach online within timeout; continuing"
fi

# Persist the node id in the scratch directory.
echo "$NODE_ID" > "$TMP_DIR/node_id_metric"
echo "[DONE] master verify"
|
|
||||||
@ -1,142 +0,0 @@
|
|||||||
#!/usr/bin/env bash
# Prometheus verification, setup: strict mode, scratch directory, the
# Prometheus API base URL, target host name, and the nodes.json / file_sd
# paths under the test root.
set -euo pipefail

# Test root is the parent of the directory holding this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(dirname "$SCRIPT_DIR")"
TMP_DIR="${TEST_ROOT}/tmp/metric-verify"
mkdir -p "$TMP_DIR"

PROM_BASE="http://localhost:9090/api/v1"
# Target node name; override with METRIC_TEST_HOSTNAME.
HOSTNAME="${METRIC_TEST_HOSTNAME:-test-metric-node-001}"

nodes_json="${TEST_ROOT}/private/argus/metric/prometheus/nodes.json"
targets_json="${TEST_ROOT}/private/argus/metric/prometheus/targets/node_exporter.json"
|
|
||||||
|
|
||||||
echo "[VERIFY:PROM] nodes.json present and contains hostname=$HOSTNAME"
[[ -f "$nodes_json" ]] || { echo "[ERR] $nodes_json missing" >&2; exit 1; }
# Explicit check instead of `assert`, which is skipped under python -O /
# PYTHONOPTIMIZE and would let a missing host pass silently.
python3 - "$nodes_json" "$HOSTNAME" <<'PY'
import json
import sys

with open(sys.argv[1]) as fh:
    arr = json.load(fh)
host = sys.argv[2]
if not any(i.get('hostname') == host for i in arr):
    raise SystemExit(f"{host} not found in nodes.json")
PY
echo "[OK] nodes.json contains target"
|
|
||||||
|
|
||||||
echo "[VERIFY:PROM] file_sd targets exist for nodes.json entries"
[[ -f "$targets_json" ]] || { echo "[ERR] $targets_json missing" >&2; exit 1; }

# Pick the node to verify and write two lines to the scratch file:
# its IP, then "<ip>:9100".
python3 - "$nodes_json" "$targets_json" "$HOSTNAME" >"$TMP_DIR/prom_targets_ip_inst.txt" <<'PY'
import json
import sys

nodes = json.load(open(sys.argv[1]))
file_sd = json.load(open(sys.argv[2]))
host = sys.argv[3]

# All targets listed in the file_sd document.
targets = {t for item in file_sd for t in item.get('targets', [])}

# Selection order: node matching the hostname, else the first node whose
# user_id is "metric", else simply the first node.
chosen = (
    next((n for n in nodes if n.get('hostname') == host), None)
    or next((n for n in nodes if n.get('user_id') == 'metric'), None)
    or (nodes[0] if nodes else None)
)
if not chosen:
    raise SystemExit('nodes.json empty or no suitable node found')

ip = chosen['ip']
print(ip)
print(f"{ip}:9100")
PY

IP_FIRST="$(sed -n '1p' "$TMP_DIR/prom_targets_ip_inst.txt")"
INSTANCE="$(sed -n '2p' "$TMP_DIR/prom_targets_ip_inst.txt")"
echo "[INFO] expecting instance in file_sd: $INSTANCE"
|
|
||||||
|
|
||||||
# Try to refresh the targets from inside the Prometheus container right away
# (optional speed-up; skipped when the container is not listed by docker ps).
if docker ps --format '{{.Names}}' | grep -q '^argus-prometheus$'; then
  echo "[..] triggering update_targets inside argus-prometheus"
  docker exec argus-prometheus bash -lc \
    'python3 /usr/local/bin/update_targets.py --config /private/argus/metric/prometheus/nodes.json --targets-dir /private/argus/metric/prometheus/targets >/dev/null 2>&1 || true'
fi

# Give Prometheus one initial scrape cycle.
sleep 10

# Wait for "<ip>:9100" to appear in the file_sd targets (36 retries, 5 s
# apart, about 180 s), re-triggering the refresh on every third retry.
# grep -Fx compares whole lines literally, so the dots in the IP are not
# treated as regex wildcards.
retry=0
until jq -r '.[].targets[]' "$targets_json" 2>/dev/null | grep -Fxq -- "${IP_FIRST}:9100"; do
  if (( retry >= 36 )); then
    echo "[ERR] ${IP_FIRST}:9100 not present in file_sd after timeout" >&2
    echo "[HINT] current targets file content:" >&2
    sed -n '1,200p' "$targets_json" >&2 || true
    exit 1
  fi
  if (( retry % 3 == 0 )) && docker ps --format '{{.Names}}' | grep -q '^argus-prometheus$'; then
    docker exec argus-prometheus bash -lc \
      'python3 /usr/local/bin/update_targets.py --config /private/argus/metric/prometheus/nodes.json --targets-dir /private/argus/metric/prometheus/targets >/dev/null 2>&1 || true'
  fi
  echo "[..] waiting file_sd refresh ($retry/36)"
  sleep 5
  # Not `((retry++))`: that returns status 1 when retry is 0 and would
  # terminate the script under `set -e` after the very first wait.
  retry=$((retry + 1))
done
|
|
||||||
|
|
||||||
# Health is judged by the PromQL `up` metric rather than by the targets page,
# whose state can flap.
echo "[VERIFY:PROM] up{job=\"node\",ip=\"$IP_FIRST\"} > 0"
# Poll up to 60 times, 5 s apart; re-trigger the targets refresh on every
# sixth attempt while the Prometheus container is listed by docker ps.
attempt=0
until (( attempt >= 60 )); do
  curl -fsS --max-time 5 --get "$PROM_BASE/query" --data-urlencode "query=up{job=\"node\",ip=\"$IP_FIRST\"}" > "$TMP_DIR/prom_up_inst_active.json" || true
  # Exit 0 only when the first returned series has a value > 0; an unreadable
  # or malformed response counts as "not up yet".
  if python3 - "$TMP_DIR/prom_up_inst_active.json" <<'PY'
import json
import sys

try:
    with open(sys.argv[1]) as fh:
        j = json.load(fh)
except Exception:
    raise SystemExit(1)
res = j.get('data', {}).get('result', [])
if res:
    try:
        val = float(res[0]['value'][1])
        # SystemExit is not an Exception subclass, so it escapes the except below.
        if val > 0:
            raise SystemExit(0)
    except Exception:
        pass
raise SystemExit(1)
PY
  then
    echo "[OK] up > 0 (control-plane scrape works)"
    break
  fi
  if (( attempt % 6 == 0 )) && docker ps --format '{{.Names}}' | grep -q '^argus-prometheus$'; then
    docker exec argus-prometheus bash -lc \
      'python3 /usr/local/bin/update_targets.py --config /private/argus/metric/prometheus/nodes.json --targets-dir /private/argus/metric/prometheus/targets >/dev/null 2>&1 || true'
  fi
  echo "[..] waiting up{job=\"node\",ip=\"$IP_FIRST\"} > 0 ($attempt/60)"
  sleep 5
  # Not `((attempt++))`: that returns status 1 when attempt is 0 and would
  # terminate the script under `set -e` after the very first wait.
  attempt=$((attempt + 1))
done
# The counter only reaches 60 when the loop ran out without a successful check.
if (( attempt >= 60 )); then
  echo "[ERR] up{job=\"node\",ip=\"$IP_FIRST\"} did not become > 0" >&2
  exit 1
fi
|
|
||||||
|
|
||||||
# Final checks use explicit `raise SystemExit` instead of `assert`, which is
# skipped under python -O / PYTHONOPTIMIZE; `not val > x` also rejects NaN.
echo "[VERIFY:PROM] instant up query > 0"
curl -fsS --max-time 5 --get "$PROM_BASE/query" --data-urlencode "query=up{job=\"node\",ip=\"$IP_FIRST\"}" > "$TMP_DIR/prom_up_inst.json"
python3 - "$TMP_DIR/prom_up_inst.json" <<'PY'
import json
import sys

with open(sys.argv[1]) as fh:
    j = json.load(fh)
res = j.get('data', {}).get('result', [])
if not res:
    # The query filters on the ip label, so the message names ip (not instance).
    raise SystemExit('empty result for up{job="node",ip=...}')
val = float(res[0]['value'][1])
if not val > 0:
    raise SystemExit(f"up value not > 0: {val}")
PY
echo "[OK] up > 0"

echo "[VERIFY:PROM] count(up{job=\"node\"}==1) >= 1"
curl -fsS --max-time 5 --get "$PROM_BASE/query" --data-urlencode "query=count(up{job=\"node\"}==1)" > "$TMP_DIR/prom_up_count.json"
python3 - "$TMP_DIR/prom_up_count.json" <<'PY'
import json
import sys

with open(sys.argv[1]) as fh:
    j = json.load(fh)
res = j.get('data', {}).get('result', [])
if not res:
    raise SystemExit('empty result for count(up{job="node"}==1)')
val = float(res[0]['value'][1])
if not val >= 1:
    raise SystemExit(f"count < 1: {val}")
PY
echo "[OK] up count satisfied"
echo "[DONE] prometheus verify"
|
|
||||||
@ -1,18 +0,0 @@
|
|||||||
#!/usr/bin/env bash
# Remove the metric artifacts published into the FTP share of the test tree:
# release tarballs, LATEST_VERSION, dns.conf and setup.sh. Missing files and
# removal errors are ignored on purpose (best-effort cleanup).
set -euo pipefail

# Test root is the parent of the directory holding this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(dirname "$SCRIPT_DIR")"

FTP_SHARE="${TEST_ROOT}/private/argus/metric/ftp/share"

if [[ ! -d "$FTP_SHARE" ]]; then
  echo "[SYS-METRIC] FTP 目录不存在,跳过清理"
else
  echo "[SYS-METRIC] 清理 FTP 发布产物..."
  # The tarball glob is left unquoted so it expands; rm -f ignores a pattern
  # that matched nothing.
  stale_files=(
    "$FTP_SHARE"/argus-metric_*.tar.gz
    "$FTP_SHARE"/LATEST_VERSION
    "$FTP_SHARE"/dns.conf
    "$FTP_SHARE"/setup.sh
  )
  rm -f "${stale_files[@]}" 2>/dev/null || true
fi

echo "[SYS-METRIC] Metric 清理完成"
|
|
||||||
@ -1,45 +0,0 @@
|
|||||||
#!/usr/bin/env bash
# Metric test node entrypoint: install base packages once, apply the optional
# time zone, prepare the agent directory, check for nvidia-smi when the GPU
# role is requested, then exec the command given as arguments.
set -euo pipefail

ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
AGENT_ROOT=${AGENT_ROOT:-/private/argus/agent}
# Marker file: once it exists the package installation step is skipped.
PREPARED_FLAG="/tmp/.metric_node_prepared"

export DEBIAN_FRONTEND=${DEBIAN_FRONTEND:-noninteractive}

if [[ ! -f "$PREPARED_FLAG" ]]; then
  base_packages=(
    curl
    net-tools
    iproute2
    lsof
    procps
    ca-certificates
    gnupg2
  )
  apt-get update -qq
  if ! apt-get install -y -qq "${base_packages[@]}"; then
    echo "[metric-node] Failed to install base packages" >&2
    exit 1
  fi

  mkdir -p "$(dirname "$PREPARED_FLAG")"
  touch "$PREPARED_FLAG"
fi

# Apply the time zone when TZ is set; failures are ignored (best effort).
if [[ -n "${TZ:-}" ]]; then
  ln -snf "/usr/share/zoneinfo/${TZ}" /etc/localtime 2>/dev/null || true
  echo "$TZ" > /etc/timezone 2>/dev/null || true
fi

# Make sure the agent directory exists and hand it to the build UID:GID;
# a failing chown is ignored.
mkdir -p "$AGENT_ROOT"
chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" "$AGENT_ROOT" 2>/dev/null || true

# The GPU role requires nvidia-smi on PATH; its own exit status is not fatal.
role="${METRIC_NODE_ROLE:-cpu}"
if [[ "$role" == "gpu" ]]; then
  command -v nvidia-smi >/dev/null 2>&1 || {
    echo "[metric-node] nvidia-smi not available but GPU role requested" >&2
    exit 1
  }
  nvidia-smi || true
fi

# Replace this shell with the requested command.
exec "$@"
|
|
||||||
@ -46,9 +46,7 @@ fi
|
|||||||
# Start Fluent Bit in background (will block, so run via bash -lc &)
|
# Start Fluent Bit in background (will block, so run via bash -lc &)
|
||||||
if [[ -x /private/start-fluent-bit.sh ]]; then
|
if [[ -x /private/start-fluent-bit.sh ]]; then
|
||||||
log "starting fluent-bit"
|
log "starting fluent-bit"
|
||||||
sysctl -w fs.inotify.max_user_instances=512 >/dev/null 2>&1 || true
|
bash -lc '/private/start-fluent-bit.sh' &
|
||||||
sysctl -w fs.inotify.max_user_watches=524288 >/dev/null 2>&1 || true
|
|
||||||
bash -lc 'ulimit -n 65536 || true; exec /private/start-fluent-bit.sh' &
|
|
||||||
else
|
else
|
||||||
log "missing /private/start-fluent-bit.sh; fluent-bit will not start"
|
log "missing /private/start-fluent-bit.sh; fluent-bit will not start"
|
||||||
fi
|
fi
|
||||||
@ -56,3 +54,4 @@ fi
|
|||||||
# Start agent in foreground as runtime user
|
# Start agent in foreground as runtime user
|
||||||
log "starting argus-agent"
|
log "starting argus-agent"
|
||||||
exec su -s /bin/bash -c /usr/local/bin/argus-agent "$RUNTIME_USER"
|
exec su -s /bin/bash -c /usr/local/bin/argus-agent "$RUNTIME_USER"
|
||||||
|
|
||||||
|
|||||||
@ -24,37 +24,24 @@ RUN apt-get update && \
|
|||||||
apt-get clean && rm -rf /var/lib/apt/lists/*
|
apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
ENV FRONTEND_BASE_PATH=/private/argus/web/frontend
|
ENV FRONTEND_BASE_PATH=/private/argus/web/frontend
|
||||||
ARG ARGUS_BUILD_UID=2133
|
ARG ARGUS_UID=2133
|
||||||
ARG ARGUS_BUILD_GID=2015
|
ARG ARGUS_GID=2015
|
||||||
ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID}
|
ENV ARGUS_UID=${ARGUS_UID}
|
||||||
ENV ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
|
ENV ARGUS_GID=${ARGUS_GID}
|
||||||
|
|
||||||
RUN mkdir -p ${FRONTEND_BASE_PATH} && \
|
RUN mkdir -p ${FRONTEND_BASE_PATH} && \
|
||||||
mkdir -p /private/argus/etc
|
mkdir -p /private/argus/etc
|
||||||
|
|
||||||
# 创建 web 用户(可自定义 UID/GID)
|
# 创建 web 用户(可自定义 UID/GID)
|
||||||
# 创建 web 用户组
|
# 创建 web 用户组
|
||||||
RUN set -eux; \
|
RUN groupadd -g ${ARGUS_GID} web
|
||||||
# 确保目标 GID 存在(组名可不固定)\
|
|
||||||
if ! getent group "${ARGUS_BUILD_GID}" >/dev/null; then \
|
# 创建 web 用户并指定组
|
||||||
groupadd -g "${ARGUS_BUILD_GID}" web || true; \
|
RUN useradd -M -s /usr/sbin/nologin -u ${ARGUS_UID} -g ${ARGUS_GID} web
|
||||||
fi; \
|
|
||||||
# 若存在 web 用户则尽量对齐 UID/GID;否则仅在 UID 未被占用时创建
|
RUN chown -R web:web ${FRONTEND_BASE_PATH} && \
|
||||||
if id web >/dev/null 2>&1; then \
|
chown -R web:web /private/argus/etc && \
|
||||||
current_uid="$(id -u web)"; \
|
chown -R web:web /usr/local/bin
|
||||||
if [ "$current_uid" != "${ARGUS_BUILD_UID}" ] && ! getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \
|
|
||||||
usermod -u "${ARGUS_BUILD_UID}" web; \
|
|
||||||
fi; \
|
|
||||||
usermod -g "${ARGUS_BUILD_GID}" web || true; \
|
|
||||||
else \
|
|
||||||
if ! getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \
|
|
||||||
useradd -M -s /usr/sbin/nologin -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" web; \
|
|
||||||
else \
|
|
||||||
echo "UID ${ARGUS_BUILD_UID} already exists; skip creating user 'web'"; \
|
|
||||||
fi; \
|
|
||||||
fi; \
|
|
||||||
# 用数值 UID:GID 赋权,避免依赖用户名/组名
|
|
||||||
chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" ${FRONTEND_BASE_PATH} /private/argus/etc /usr/local/bin || true
|
|
||||||
|
|
||||||
# 配置内网 apt 源 (如果指定了内网选项)
|
# 配置内网 apt 源 (如果指定了内网选项)
|
||||||
RUN if [ "$USE_INTRANET" = "true" ]; then \
|
RUN if [ "$USE_INTRANET" = "true" ]; then \
|
||||||
|
|||||||
@ -4,7 +4,7 @@ docker pull ubuntu:24.04
|
|||||||
source src/web/tests/.env
|
source src/web/tests/.env
|
||||||
|
|
||||||
docker build \
|
docker build \
|
||||||
--build-arg ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
|
--build-arg ARGUS_UID=${ARGUS_UID} \
|
||||||
--build-arg ARGUS_BUILD_GID=${ARGUS_BUILD_GID} \
|
--build-arg ARGUS_GID=${ARGUS_GID} \
|
||||||
-f src/web/build_tools/frontend/Dockerfile -t argus-web-frontend:latest .
|
-f src/web/build_tools/frontend/Dockerfile -t argus-web-frontend:latest .
|
||||||
docker save -o argus-web-frontend-latest.tar argus-web-frontend:latest
|
docker save -o argus-web-frontend-latest.tar argus-web-frontend:latest
|
||||||
|
|||||||
@ -8,8 +8,8 @@ DNS_SCRIPT="${DNS_DIR}/update-dns.sh"
|
|||||||
DOMAIN=web.argus.com
|
DOMAIN=web.argus.com
|
||||||
WEB_DOMAIN_FILE="${DNS_DIR}/${DOMAIN}"
|
WEB_DOMAIN_FILE="${DNS_DIR}/${DOMAIN}"
|
||||||
RUNTIME_USER="${ARGUS_RUNTIME_USER:-argus}"
|
RUNTIME_USER="${ARGUS_RUNTIME_USER:-argus}"
|
||||||
RUNTIME_UID="${ARGUS_BUILD_UID:-2133}"
|
RUNTIME_UID="${ARGUS_UID:-2133}"
|
||||||
RUNTIME_GID="${ARGUS_BUILD_GID:-2015}"
|
RUNTIME_GID="${ARGUS_GID:-2015}"
|
||||||
|
|
||||||
mkdir -p "$DNS_DIR"
|
mkdir -p "$DNS_DIR"
|
||||||
chown -R "$RUNTIME_UID:$RUNTIME_GID" "$DNS_DIR" 2>/dev/null || true
|
chown -R "$RUNTIME_UID:$RUNTIME_GID" "$DNS_DIR" 2>/dev/null || true
|
||||||
|
|||||||
@ -8,34 +8,24 @@ RUN apt-get update && \
|
|||||||
apt-get clean && rm -rf /var/lib/apt/lists/*
|
apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
ENV FRONTEND_BASE_PATH=/private/argus/web/proxy
|
ENV FRONTEND_BASE_PATH=/private/argus/web/proxy
|
||||||
ARG ARGUS_BUILD_UID=2133
|
ARG ARGUS_UID=2133
|
||||||
ARG ARGUS_BUILD_GID=2015
|
ARG ARGUS_GID=2015
|
||||||
ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID}
|
ENV ARGUS_UID=${ARGUS_UID}
|
||||||
ENV ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
|
ENV ARGUS_GID=${ARGUS_GID}
|
||||||
|
|
||||||
RUN mkdir -p ${FRONTEND_BASE_PATH} && \
|
RUN mkdir -p ${FRONTEND_BASE_PATH} && \
|
||||||
mkdir -p /private/argus/etc
|
mkdir -p /private/argus/etc
|
||||||
|
|
||||||
# 创建 proxy 用户(可自定义 UID/GID)
|
# 创建 proxy 用户(可自定义 UID/GID)
|
||||||
# 创建 proxy 用户组
|
# 创建 proxy 用户组
|
||||||
RUN set -eux; \
|
RUN groupadd -g ${ARGUS_GID} web_proxy
|
||||||
if ! getent group "${ARGUS_BUILD_GID}" >/dev/null; then \
|
|
||||||
groupadd -g "${ARGUS_BUILD_GID}" web_proxy || true; \
|
# 创建 proxy 用户并指定组
|
||||||
fi; \
|
RUN useradd -M -s /usr/sbin/nologin -u ${ARGUS_UID} -g ${ARGUS_GID} web_proxy
|
||||||
if id web_proxy >/dev/null 2>&1; then \
|
|
||||||
current_uid="$(id -u web_proxy)"; \
|
RUN chown -R web_proxy:web_proxy ${FRONTEND_BASE_PATH} && \
|
||||||
if [ "$current_uid" != "${ARGUS_BUILD_UID}" ] && ! getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \
|
chown -R web_proxy:web_proxy /private/argus/etc && \
|
||||||
usermod -u "${ARGUS_BUILD_UID}" web_proxy; \
|
chown -R web_proxy:web_proxy /usr/local/bin
|
||||||
fi; \
|
|
||||||
usermod -g "${ARGUS_BUILD_GID}" web_proxy || true; \
|
|
||||||
else \
|
|
||||||
if ! getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \
|
|
||||||
useradd -M -s /usr/sbin/nologin -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" web_proxy; \
|
|
||||||
else \
|
|
||||||
echo "UID ${ARGUS_BUILD_UID} already exists; skip creating user 'web_proxy'"; \
|
|
||||||
fi; \
|
|
||||||
fi; \
|
|
||||||
chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" ${FRONTEND_BASE_PATH} /private/argus/etc /usr/local/bin || true
|
|
||||||
|
|
||||||
# 配置内网 apt 源 (如果指定了内网选项)
|
# 配置内网 apt 源 (如果指定了内网选项)
|
||||||
RUN if [ "$USE_INTRANET" = "true" ]; then \
|
RUN if [ "$USE_INTRANET" = "true" ]; then \
|
||||||
|
|||||||
@ -3,7 +3,7 @@ docker pull ubuntu:24.04
|
|||||||
source src/web/tests/.env
|
source src/web/tests/.env
|
||||||
|
|
||||||
docker build \
|
docker build \
|
||||||
--build-arg ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
|
--build-arg ARGUS_UID=${ARGUS_UID} \
|
||||||
--build-arg ARGUS_BUILD_GID=${ARGUS_BUILD_GID} \
|
--build-arg ARGUS_GID=${ARGUS_GID} \
|
||||||
-f src/web/build_tools/proxy/Dockerfile -t argus-web-proxy:latest .
|
-f src/web/build_tools/proxy/Dockerfile -t argus-web-proxy:latest .
|
||||||
docker save -o argus-web-proxy-latest.tar argus-web-proxy:latest
|
docker save -o argus-web-proxy-latest.tar argus-web-proxy:latest
|
||||||
|
|||||||
@ -9,8 +9,8 @@ DNS_CONF_PRIVATE="/private/argus/etc/dns.conf"
|
|||||||
DNS_CONF_SYSTEM="/etc/resolv.conf"
|
DNS_CONF_SYSTEM="/etc/resolv.conf"
|
||||||
DNS_DIR="/private/argus/etc"
|
DNS_DIR="/private/argus/etc"
|
||||||
DNS_SCRIPT="${DNS_DIR}/update-dns.sh"
|
DNS_SCRIPT="${DNS_DIR}/update-dns.sh"
|
||||||
RUNTIME_UID="${ARGUS_BUILD_UID:-2133}"
|
RUNTIME_UID="${ARGUS_UID:-2133}"
|
||||||
RUNTIME_GID="${ARGUS_BUILD_GID:-2015}"
|
RUNTIME_GID="${ARGUS_GID:-2015}"
|
||||||
|
|
||||||
mkdir -p "$DNS_DIR"
|
mkdir -p "$DNS_DIR"
|
||||||
chown -R "$RUNTIME_UID:$RUNTIME_GID" "$DNS_DIR" 2>/dev/null || true
|
chown -R "$RUNTIME_UID:$RUNTIME_GID" "$DNS_DIR" 2>/dev/null || true
|
||||||
|
|||||||
@ -4,15 +4,15 @@ services:
|
|||||||
context: ../../../
|
context: ../../../
|
||||||
dockerfile: src/web/build_tools/frontend/Dockerfile
|
dockerfile: src/web/build_tools/frontend/Dockerfile
|
||||||
args:
|
args:
|
||||||
ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133}
|
ARGUS_UID: ${ARGUS_UID:-2133}
|
||||||
ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015}
|
ARGUS_GID: ${ARGUS_GID:-2015}
|
||||||
USE_INTRANET: ${USE_INTRANET:-false}
|
USE_INTRANET: ${USE_INTRANET:-false}
|
||||||
image: argus-web-frontend:latest
|
image: argus-web-frontend:latest
|
||||||
container_name: argus-web-frontend
|
container_name: argus-web-frontend
|
||||||
environment:
|
environment:
|
||||||
- ALERTMANAGER_BASE_PATH=/private/argus/web/frontend
|
- ALERTMANAGER_BASE_PATH=/private/argus/web/frontend
|
||||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
- ARGUS_UID=${ARGUS_UID:-2133}
|
||||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
- ARGUS_GID=${ARGUS_GID:-2015}
|
||||||
ports:
|
ports:
|
||||||
- "${ARGUS_WEB_PORT:-8080}:80"
|
- "${ARGUS_WEB_PORT:-8080}:80"
|
||||||
volumes:
|
volumes:
|
||||||
@ -31,14 +31,14 @@ services:
|
|||||||
context: ../../../
|
context: ../../../
|
||||||
dockerfile: src/web/build_tools/proxy/Dockerfile
|
dockerfile: src/web/build_tools/proxy/Dockerfile
|
||||||
args:
|
args:
|
||||||
ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133}
|
ARGUS_UID: ${ARGUS_UID:-2133}
|
||||||
ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015}
|
ARGUS_GID: ${ARGUS_GID:-2015}
|
||||||
USE_INTRANET: ${USE_INTRANET:-false}
|
USE_INTRANET: ${USE_INTRANET:-false}
|
||||||
image: argus-web-proxy:latest
|
image: argus-web-proxy:latest
|
||||||
container_name: argus-web-proxy
|
container_name: argus-web-proxy
|
||||||
environment:
|
environment:
|
||||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
- ARGUS_UID=${ARGUS_UID:-2133}
|
||||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
- ARGUS_GID=${ARGUS_GID:-2015}
|
||||||
ports:
|
ports:
|
||||||
- "8088:80"
|
- "8088:80"
|
||||||
volumes:
|
volumes:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user