diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..15e6b91 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +src/metric/client-plugins/all-in-one-full/plugins/*/bin/* filter=lfs diff=lfs merge=lfs -text diff --git a/build/build_images.sh b/build/build_images.sh index 562c964..e32908c 100755 --- a/build/build_images.sh +++ b/build/build_images.sh @@ -10,20 +10,28 @@ Usage: $0 [OPTIONS] Options: --intranet Use intranet mirror for log/bind builds --master-offline Build master offline image (requires src/master/offline_wheels.tar.gz) + --metric Build metric module images (ftp, prometheus, grafana, test nodes) --no-cache Build all images without using Docker layer cache + --only LIST Comma-separated targets to build: core,master,metric,web,alert,sys,all -h, --help Show this help message Examples: $0 # Build with default sources $0 --intranet # Build with intranet mirror $0 --master-offline # Additionally build argus-master:offline - $0 --intranet --master-offline + $0 --metric # Additionally build metric module images + $0 --intranet --master-offline --metric EOF } use_intranet=false +build_core=true build_master=true build_master_offline=false +build_metric=true +build_web=true +build_alert=true +build_sys=true no_cache=false while [[ $# -gt 0 ]]; do @@ -41,10 +49,35 @@ while [[ $# -gt 0 ]]; do build_master_offline=true shift ;; + --metric) + build_metric=true + shift + ;; --no-cache) no_cache=true shift ;; + --only) + if [[ -z ${2:-} ]]; then + echo "--only requires a target list" >&2; exit 1 + fi + sel="$2"; shift 2 + # reset all, then enable selected + build_core=false; build_master=false; build_metric=false; build_web=false; build_alert=false; build_sys=false + IFS=',' read -ra parts <<< "$sel" + for p in "${parts[@]}"; do + case "$p" in + core) build_core=true ;; + master) build_master=true ;; + metric) build_metric=true ;; + web) build_web=true ;; + alert) build_alert=true ;; + sys) build_sys=true ;; + all) build_core=true; 
build_master=true; build_metric=true; build_web=true; build_alert=true; build_sys=true ;; + *) echo "Unknown --only target: $p" >&2; exit 1 ;; + esac + done + ;; -h|--help) show_help exit 0 @@ -115,14 +148,22 @@ build_image() { local image_name=$1 local dockerfile_path=$2 local tag=$3 + local context="." shift 3 + + if [[ $# -gt 0 ]]; then + context=$1 + shift + fi + local extra_args=("$@") echo "🔄 Building $image_name image..." echo " Dockerfile: $dockerfile_path" echo " Tag: $tag" + echo " Context: $context" - if docker build "${build_args[@]}" "${extra_args[@]}" -f "$dockerfile_path" -t "$tag" .; then + if docker build "${build_args[@]}" "${extra_args[@]}" -f "$dockerfile_path" -t "$tag" "$context"; then echo "✅ $image_name image built successfully" return 0 else @@ -131,29 +172,59 @@ build_image() { fi } +pull_base_image() { + local image_ref=$1 + local attempts=${2:-3} + local delay=${3:-5} + + # If the image already exists locally, skip pulling. + if docker image inspect "$image_ref" >/dev/null 2>&1; then + echo " Local image present; skip pull: $image_ref" + return 0 + fi + + for ((i=1; i<=attempts; i++)); do + echo " Pulling base image ($i/$attempts): $image_ref" + if docker pull "$image_ref" >/dev/null; then + echo " Base image ready: $image_ref" + return 0 + fi + echo " Pull failed: $image_ref" + if (( i < attempts )); then + echo " Retrying in ${delay}s..." 
+ sleep "$delay" + fi + done + + echo "❌ Unable to pull base image after ${attempts} attempts: $image_ref" + return 1 +} + images_built=() build_failed=false -if build_image "Elasticsearch" "src/log/elasticsearch/build/Dockerfile" "argus-elasticsearch:latest"; then - images_built+=("argus-elasticsearch:latest") -else - build_failed=true -fi +if [[ "$build_core" == true ]]; then + if build_image "Elasticsearch" "src/log/elasticsearch/build/Dockerfile" "argus-elasticsearch:latest"; then + images_built+=("argus-elasticsearch:latest") + else + build_failed=true + fi -echo "" + echo "" -if build_image "Kibana" "src/log/kibana/build/Dockerfile" "argus-kibana:latest"; then - images_built+=("argus-kibana:latest") -else - build_failed=true -fi + if build_image "Kibana" "src/log/kibana/build/Dockerfile" "argus-kibana:latest"; then + images_built+=("argus-kibana:latest") + else + build_failed=true + fi -echo "" + echo "" -if build_image "BIND9" "src/bind/build/Dockerfile" "argus-bind9:latest"; then - images_built+=("argus-bind9:latest") -else - build_failed=true + if build_image "BIND9" "src/bind/build/Dockerfile" "argus-bind9:latest"; then + images_built+=("argus-bind9:latest") + else + build_failed=true + fi fi echo "" @@ -184,6 +255,127 @@ if [[ "$build_master" == true ]]; then popd >/dev/null fi +if [[ "$build_metric" == true ]]; then + echo "" + echo "Building Metric module images..." + + metric_base_images=( + "ubuntu:22.04" + "ubuntu/prometheus:3-24.04_stable" + "grafana/grafana:11.1.0" + ) + + for base_image in "${metric_base_images[@]}"; do + if ! 
pull_base_image "$base_image"; then + build_failed=true + fi + done + + metric_builds=( + "Metric FTP|src/metric/ftp/build/Dockerfile|argus-metric-ftp:latest|src/metric/ftp/build" + "Metric Prometheus|src/metric/prometheus/build/Dockerfile|argus-metric-prometheus:latest|src/metric/prometheus/build" + "Metric Grafana|src/metric/grafana/build/Dockerfile|argus-metric-grafana:latest|src/metric/grafana/build" + ) + + for build_spec in "${metric_builds[@]}"; do + IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec" + if build_image "$image_label" "$dockerfile_path" "$image_tag" "$build_context"; then + images_built+=("$image_tag") + else + build_failed=true + fi + echo "" + done +fi + +# ======================================= +# Sys (system tests) node images +# ======================================= + +if [[ "$build_sys" == true ]]; then + echo "" + echo "Building Sys node images..." + + sys_base_images=( + "ubuntu:22.04" + "nvidia/cuda:12.2.2-runtime-ubuntu22.04" + ) + + for base_image in "${sys_base_images[@]}"; do + if ! pull_base_image "$base_image"; then + build_failed=true + fi + done + + sys_builds=( + "Sys Node|src/sys/build/node/Dockerfile|argus-sys-node:latest|." + "Sys Metric Test Node|src/sys/build/test-node/Dockerfile|argus-sys-metric-test-node:latest|." + "Sys Metric Test GPU Node|src/sys/build/test-gpu-node/Dockerfile|argus-sys-metric-test-gpu-node:latest|." + ) + + for build_spec in "${sys_builds[@]}"; do + IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec" + if build_image "$image_label" "$dockerfile_path" "$image_tag" "$build_context"; then + images_built+=("$image_tag") + else + build_failed=true + fi + echo "" + done +fi + +# ======================================= +# Web & Alert module images +# ======================================= + +if [[ "$build_web" == true || "$build_alert" == true ]]; then + echo "" + echo "Building Web and Alert module images..." 
+ + # Pre-pull commonly used base images for stability + web_alert_base_images=( + "node:20" + "ubuntu:24.04" + ) + + for base_image in "${web_alert_base_images[@]}"; do + if ! pull_base_image "$base_image"; then + build_failed=true + fi + done + + if [[ "$build_web" == true ]]; then + web_builds=( + "Web Frontend|src/web/build_tools/frontend/Dockerfile|argus-web-frontend:latest|." + "Web Proxy|src/web/build_tools/proxy/Dockerfile|argus-web-proxy:latest|." + ) + for build_spec in "${web_builds[@]}"; do + IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec" + if build_image "$image_label" "$dockerfile_path" "$image_tag" "$build_context"; then + images_built+=("$image_tag") + else + build_failed=true + fi + echo "" + done + fi + + if [[ "$build_alert" == true ]]; then + alert_builds=( + "Alertmanager|src/alert/alertmanager/build/Dockerfile|argus-alertmanager:latest|." + ) + for build_spec in "${alert_builds[@]}"; do + IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec" + if build_image "$image_label" "$dockerfile_path" "$image_tag" "$build_context"; then + images_built+=("$image_tag") + else + build_failed=true + fi + echo "" + done + fi +fi + echo "=======================================" echo "📦 Build Summary" echo "=======================================" @@ -210,7 +402,6 @@ if [[ "$build_master_offline" == true ]]; then echo "" echo "🧳 Master offline wheels 已解压到 $master_offline_dir" fi - echo "" echo "🚀 Next steps:" echo " ./build/save_images.sh --compress # 导出镜像" diff --git a/build/save_images.sh b/build/save_images.sh index 20d9c1b..083d587 100755 --- a/build/save_images.sh +++ b/build/save_images.sh @@ -68,6 +68,12 @@ declare -A images=( ["argus-kibana:latest"]="argus-kibana-latest.tar" ["argus-bind9:latest"]="argus-bind9-latest.tar" ["argus-master:offline"]="argus-master-offline.tar" + ["argus-metric-ftp:latest"]="argus-metric-ftp-latest.tar" + 
["argus-metric-prometheus:latest"]="argus-metric-prometheus-latest.tar" + ["argus-metric-grafana:latest"]="argus-metric-grafana-latest.tar" + ["argus-web-frontend:latest"]="argus-web-frontend-latest.tar" + ["argus-web-proxy:latest"]="argus-web-proxy-latest.tar" + ["argus-alertmanager:latest"]="argus-alertmanager-latest.tar" ) # 函数:检查镜像是否存在 @@ -220,4 +226,4 @@ fi echo "" echo "✅ Image export completed successfully!" -echo "" \ No newline at end of file +echo "" diff --git a/src/agent/scripts/build_binary.sh b/src/agent/scripts/build_binary.sh index 7e5a720..bb19ed4 100755 --- a/src/agent/scripts/build_binary.sh +++ b/src/agent/scripts/build_binary.sh @@ -12,6 +12,8 @@ VENV_DIR="$BUILD_ROOT/venv" AGENT_BUILD_IMAGE="${AGENT_BUILD_IMAGE:-python:3.11-slim-bullseye}" AGENT_BUILD_USE_DOCKER="${AGENT_BUILD_USE_DOCKER:-1}" +# 默认在容器内忽略代理以避免公司内网代理在 Docker 网络不可达导致 pip 失败(可用 0 关闭) +AGENT_BUILD_IGNORE_PROXY="${AGENT_BUILD_IGNORE_PROXY:-1}" USED_DOCKER=0 run_host_build() { @@ -71,6 +73,7 @@ run_docker_build() { pass_env_if_set http_proxy pass_env_if_set https_proxy pass_env_if_set no_proxy + pass_env_if_set AGENT_BUILD_IGNORE_PROXY build_script=$(cat <<'INNER' set -euo pipefail @@ -82,6 +85,10 @@ rm -rf build dist mkdir -p build/pyinstaller dist python3 -m venv --copies build/venv source build/venv/bin/activate +# 若指定忽略代理,则清空常见代理与 pip 镜像环境变量,避免容器内代理不可达 +if [ "${AGENT_BUILD_IGNORE_PROXY:-1}" = "1" ]; then + unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY PIP_INDEX_URL PIP_EXTRA_INDEX_URL PIP_TRUSTED_HOST +fi pip install --upgrade pip pip install . 
pip install pyinstaller==6.6.0 diff --git a/src/alert/alertmanager/build/Dockerfile b/src/alert/alertmanager/build/Dockerfile index a606569..2045db9 100644 --- a/src/alert/alertmanager/build/Dockerfile +++ b/src/alert/alertmanager/build/Dockerfile @@ -9,21 +9,21 @@ RUN apt-get update && \ apt-get install -y wget supervisor net-tools inetutils-ping vim ca-certificates passwd && \ apt-get clean && rm -rf /var/lib/apt/lists/* -# 设置 Alertmanager 版本 +# 设置 Alertmanager 版本(与本地离线包保持一致) ARG ALERTMANAGER_VERSION=0.28.1 -# 下载并解压 Alertmanager 二进制 -RUN wget https://github.com/prometheus/alertmanager/releases/download/v${ALERTMANAGER_VERSION}/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz && \ - tar xvf alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz && \ - mv alertmanager-${ALERTMANAGER_VERSION}.linux-amd64 /usr/local/alertmanager && \ - rm alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz +# 使用仓库内预置的离线包构建(无需联网) +COPY src/alert/alertmanager/build/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz /tmp/ +RUN tar xvf /tmp/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz -C /tmp && \ + mv /tmp/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64 /usr/local/alertmanager && \ + rm -f /tmp/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz ENV ALERTMANAGER_BASE_PATH=/private/argus/alert/alertmanager -ARG ARGUS_UID=2133 -ARG ARGUS_GID=2015 -ENV ARGUS_UID=${ARGUS_UID} -ENV ARGUS_GID=${ARGUS_GID} +ARG ARGUS_BUILD_UID=2133 +ARG ARGUS_BUILD_GID=2015 +ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} +ENV ARGUS_BUILD_GID=${ARGUS_BUILD_GID} RUN mkdir -p /usr/share/alertmanager && \ mkdir -p ${ALERTMANAGER_BASE_PATH} && \ @@ -33,16 +33,24 @@ RUN mkdir -p /usr/share/alertmanager && \ # 创建 alertmanager 用户(可自定义 UID/GID) # 创建 alertmanager 用户组 -RUN groupadd -g ${ARGUS_GID} alertmanager +RUN set -eux; \ + # 确保目标 GID 存在;若已被占用,直接使用该 GID(组名不限)\ + if ! 
getent group "${ARGUS_BUILD_GID}" >/dev/null; then \ + groupadd -g "${ARGUS_BUILD_GID}" alertmanager || true; \ + fi; \ + # 确保存在 alertmanager 用户;若 UID 已被占用,跳过并继续使用现有 UID 的用户 + if ! id alertmanager >/dev/null 2>&1; then \ + if getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \ + # UID 已占用,则创建同名用户但不指定 UID(避免冲突),仅保证 user 存在 + useradd -M -s /usr/sbin/nologin -g "${ARGUS_BUILD_GID}" alertmanager || true; \ + else \ + useradd -M -s /usr/sbin/nologin -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" alertmanager || true; \ + fi; \ + else \ + usermod -g "${ARGUS_BUILD_GID}" alertmanager || true; \ + fi -# 创建 alertmanager 用户并指定组 -RUN useradd -M -s /usr/sbin/nologin -u ${ARGUS_UID} -g ${ARGUS_GID} alertmanager - -RUN chown -R alertmanager:alertmanager /usr/share/alertmanager && \ - chown -R alertmanager:alertmanager /alertmanager && \ - chown -R alertmanager:alertmanager ${ALERTMANAGER_BASE_PATH} && \ - chown -R alertmanager:alertmanager /private/argus/etc && \ - chown -R alertmanager:alertmanager /usr/local/bin +RUN chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" /usr/share/alertmanager /alertmanager ${ALERTMANAGER_BASE_PATH} /private/argus/etc /usr/local/bin || true # 配置内网 apt 源 (如果指定了内网选项) RUN if [ "$USE_INTRANET" = "true" ]; then \ @@ -86,4 +94,3 @@ EXPOSE 9093 # 使用 supervisor 作为入口点 CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"] - diff --git a/src/alert/alertmanager/build/alertmanager-0.28.1.linux-amd64.tar.gz b/src/alert/alertmanager/build/alertmanager-0.28.1.linux-amd64.tar.gz new file mode 100644 index 0000000..8c0ca37 Binary files /dev/null and b/src/alert/alertmanager/build/alertmanager-0.28.1.linux-amd64.tar.gz differ diff --git a/src/alert/alertmanager/build/build.sh b/src/alert/alertmanager/build/build.sh index c7520e7..2640042 100644 --- a/src/alert/alertmanager/build/build.sh +++ b/src/alert/alertmanager/build/build.sh @@ -5,9 +5,9 @@ docker pull ubuntu:24.04 source src/alert/tests/.env docker build \ - --build-arg 
ARGUS_UID=${ARGUS_UID} \ - --build-arg ARGUS_GID=${ARGUS_GID} \ + --build-arg ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \ + --build-arg ARGUS_BUILD_GID=${ARGUS_BUILD_GID} \ -f src/alert/alertmanager/build/Dockerfile \ -t argus-alertmanager:latest . -docker save -o argus-alertmanager-latest.tar argus-alertmanager:latest \ No newline at end of file +docker save -o argus-alertmanager-latest.tar argus-alertmanager:latest diff --git a/src/alert/alertmanager/build/fetch-dist.sh b/src/alert/alertmanager/build/fetch-dist.sh new file mode 100644 index 0000000..9f4140f --- /dev/null +++ b/src/alert/alertmanager/build/fetch-dist.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -euo pipefail + +# 下载 Alertmanager 离线安装包到本目录,用于 Docker 构建时 COPY +# 用法: +# ./fetch-dist.sh [version] +# 示例: +# ./fetch-dist.sh 0.28.1 + +VER="${1:-0.28.1}" +OUT="alertmanager-${VER}.linux-amd64.tar.gz" +URL="https://github.com/prometheus/alertmanager/releases/download/v${VER}/${OUT}" + +if [[ -f "$OUT" ]]; then + echo "[INFO] $OUT already exists, skip download" + exit 0 +fi + +echo "[INFO] Downloading $URL" +curl -fL --retry 3 --connect-timeout 10 -o "$OUT" "$URL" +echo "[OK] Saved to $(pwd)/$OUT" + diff --git a/src/alert/alertmanager/build/start-am-supervised.sh b/src/alert/alertmanager/build/start-am-supervised.sh index 76bbb8a..3d64ec4 100644 --- a/src/alert/alertmanager/build/start-am-supervised.sh +++ b/src/alert/alertmanager/build/start-am-supervised.sh @@ -7,10 +7,8 @@ ALERTMANAGER_BASE_PATH=${ALERTMANAGER_BASE_PATH:-/private/argus/alert/alertmanag echo "[INFO] Alertmanager base path: ${ALERTMANAGER_BASE_PATH}" -# 生成配置文件 -echo "[INFO] Generating Alertmanager configuration file..." 
-sed "s|\${ALERTMANAGER_BASE_PATH}|${ALERTMANAGER_BASE_PATH}|g" \ - /etc/alertmanager/alertmanager.yml > ${ALERTMANAGER_BASE_PATH}/alertmanager.yml +# 使用容器内的 /etc/alertmanager/alertmanager.yml 作为配置文件,避免写入挂载卷导致的权限问题 +echo "[INFO] Using /etc/alertmanager/alertmanager.yml as configuration" # 记录容器 IP 地址 diff --git a/src/alert/alertmanager/build/supervisord.conf b/src/alert/alertmanager/build/supervisord.conf index d284547..da05ac7 100644 --- a/src/alert/alertmanager/build/supervisord.conf +++ b/src/alert/alertmanager/build/supervisord.conf @@ -6,7 +6,7 @@ user=root [program:alertmanager] command=/usr/local/bin/start-am-supervised.sh -user=alertmanager +user=ubuntu stdout_logfile=/var/log/supervisor/alertmanager.log stderr_logfile=/var/log/supervisor/alertmanager_error.log autorestart=true diff --git a/src/alert/tests/.env b/src/alert/tests/.env index 00f4b76..b9d89f5 100644 --- a/src/alert/tests/.env +++ b/src/alert/tests/.env @@ -1,5 +1,5 @@ DATA_ROOT=/home/argus/tmp/private/argus -ARGUS_UID=1048 -ARGUS_GID=1048 +ARGUS_BUILD_UID=1048 +ARGUS_BUILD_GID=1048 USE_INTRANET=false diff --git a/src/alert/tests/docker-compose.yml b/src/alert/tests/docker-compose.yml index 63b9f40..c399df8 100644 --- a/src/alert/tests/docker-compose.yml +++ b/src/alert/tests/docker-compose.yml @@ -4,15 +4,15 @@ services: context: ../../../ dockerfile: src/alert/alertmanager/build/Dockerfile args: - ARGUS_UID: ${ARGUS_UID:-2133} - ARGUS_GID: ${ARGUS_GID:-2015} + ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133} + ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015} USE_INTRANET: ${USE_INTRANET:-false} image: argus-alertmanager:latest container_name: argus-alertmanager environment: - ALERTMANAGER_BASE_PATH=/private/argus/alert/alertmanager - - ARGUS_UID=${ARGUS_UID:-2133} - - ARGUS_GID=${ARGUS_GID:-2015} + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} ports: - "${ARGUS_PORT:-9093}:9093" volumes: diff --git a/src/bind/build/Dockerfile b/src/bind/build/Dockerfile index 
c6293d3..637e227 100644 --- a/src/bind/build/Dockerfile +++ b/src/bind/build/Dockerfile @@ -26,6 +26,7 @@ RUN apt-get update && \ apt-get install -y \ bind9 \ bind9utils \ + dnsutils \ bind9-doc \ supervisor \ net-tools \ diff --git a/src/log/fluent-bit/build/packages/libbrotli1_1.0.9-2build6_amd64.deb b/src/log/fluent-bit/build/packages/libbrotli1_1.0.9-2build6_amd64.deb new file mode 100644 index 0000000..ab0e6d8 Binary files /dev/null and b/src/log/fluent-bit/build/packages/libbrotli1_1.0.9-2build6_amd64.deb differ diff --git a/src/log/fluent-bit/build/packages/libidn2-0_2.3.2-2build1_amd64.deb b/src/log/fluent-bit/build/packages/libidn2-0_2.3.2-2build1_amd64.deb new file mode 100644 index 0000000..017d14f Binary files /dev/null and b/src/log/fluent-bit/build/packages/libidn2-0_2.3.2-2build1_amd64.deb differ diff --git a/src/log/fluent-bit/build/packages/libldap-2.5-0_2.5.19+dfsg-0ubuntu0.22.04.1_amd64.deb b/src/log/fluent-bit/build/packages/libldap-2.5-0_2.5.19+dfsg-0ubuntu0.22.04.1_amd64.deb new file mode 100644 index 0000000..375f621 Binary files /dev/null and b/src/log/fluent-bit/build/packages/libldap-2.5-0_2.5.19+dfsg-0ubuntu0.22.04.1_amd64.deb differ diff --git a/src/log/fluent-bit/build/packages/libpq5_14.19-0ubuntu0.22.04.1_amd64.deb b/src/log/fluent-bit/build/packages/libpq5_14.19-0ubuntu0.22.04.1_amd64.deb new file mode 100644 index 0000000..9832c54 Binary files /dev/null and b/src/log/fluent-bit/build/packages/libpq5_14.19-0ubuntu0.22.04.1_amd64.deb differ diff --git a/src/log/fluent-bit/build/packages/libsasl2-2_2.1.27+dfsg2-3ubuntu1.2_amd64.deb b/src/log/fluent-bit/build/packages/libsasl2-2_2.1.27+dfsg2-3ubuntu1.2_amd64.deb new file mode 100644 index 0000000..a5a960c Binary files /dev/null and b/src/log/fluent-bit/build/packages/libsasl2-2_2.1.27+dfsg2-3ubuntu1.2_amd64.deb differ diff --git a/src/log/fluent-bit/build/packages/libsasl2-modules-db_2.1.27+dfsg2-3ubuntu1.2_amd64.deb 
b/src/log/fluent-bit/build/packages/libsasl2-modules-db_2.1.27+dfsg2-3ubuntu1.2_amd64.deb new file mode 100644 index 0000000..fb1d510 Binary files /dev/null and b/src/log/fluent-bit/build/packages/libsasl2-modules-db_2.1.27+dfsg2-3ubuntu1.2_amd64.deb differ diff --git a/src/log/fluent-bit/build/packages/libssl3_3.0.2-0ubuntu1.20_amd64.deb b/src/log/fluent-bit/build/packages/libssl3_3.0.2-0ubuntu1.20_amd64.deb new file mode 100644 index 0000000..cfc883f Binary files /dev/null and b/src/log/fluent-bit/build/packages/libssl3_3.0.2-0ubuntu1.20_amd64.deb differ diff --git a/src/log/fluent-bit/build/packages/libyaml-0-2_0.2.2-1build2_amd64.deb b/src/log/fluent-bit/build/packages/libyaml-0-2_0.2.2-1build2_amd64.deb new file mode 100644 index 0000000..a995886 Binary files /dev/null and b/src/log/fluent-bit/build/packages/libyaml-0-2_0.2.2-1build2_amd64.deb differ diff --git a/src/log/fluent-bit/build/start-fluent-bit.sh b/src/log/fluent-bit/build/start-fluent-bit.sh index 5db6aa7..5b4cd35 100755 --- a/src/log/fluent-bit/build/start-fluent-bit.sh +++ b/src/log/fluent-bit/build/start-fluent-bit.sh @@ -1,47 +1,96 @@ #!/bin/bash set -euo pipefail -echo "[INFO] Starting Fluent Bit setup in Ubuntu container..." +echo "[INFO] Starting Fluent Bit setup in Ubuntu container (offline-first)..." -# 安装必要的工具 -echo "[INFO] Installing required packages..." export DEBIAN_FRONTEND=noninteractive -apt-get update -qq -apt-get install -y -qq curl -# 解压bundle到/tmp -echo "[INFO] Extracting fluent-bit bundle..." -cp -r /private/etc /tmp -cp -r /private/packages /tmp -cd /tmp +# Stage bundle to /tmp (read-only mount under /private) +echo "[INFO] Staging fluent-bit bundle..." +rm -rf /tmp/flb && mkdir -p /tmp/flb +cp -r /private/etc /tmp/flb/ +mkdir -p /tmp/flb/packages +cp -r /private/packages/* /tmp/flb/packages/ 2>/dev/null || true -# 安装 Fluent Bit 从 deb 包 -echo "[INFO] Installing Fluent Bit from deb package..." 
-dpkg -i /tmp/packages/fluent-bit_3.1.9_amd64.deb || true -apt-get install -f -y -qq # 解决依赖问题 +# Helper: check and install a local deb if not already satisfied +ensure_lib() { + local soname="$1"; shift + local pattern="$1"; shift + if ldconfig -p 2>/dev/null | grep -q "$soname"; then + echo "[OK] $soname already present" + return 0 + fi + local deb="$(ls /tmp/flb/packages/$pattern 2>/dev/null | head -n1 || true)" + if [[ -n "$deb" ]]; then + echo "[INFO] Installing local dependency: $(basename "$deb")" + dpkg -i "$deb" >/dev/null 2>&1 || true + else + echo "[WARN] Local deb for $soname not found (pattern=$pattern)" + fi + if ! ldconfig -p 2>/dev/null | grep -q "$soname"; then + echo "[WARN] $soname still missing after local install; attempting apt fallback" + apt-get update -qq || true + case "$soname" in + libpq.so.5) apt-get install -y -qq libpq5 || true ;; + libyaml-0.so.2) apt-get install -y -qq libyaml-0-2 || true ;; + esac + fi + ldconfig 2>/dev/null || true +} + +# Offline-first: satisfy runtime deps from local debs, fallback to apt only if necessary +ensure_lib "libpq.so.5" "libpq5_*_amd64.deb" +ensure_lib "libyaml-0.so.2" "libyaml-0-2_*_amd64.deb" +ensure_lib "libsasl2.so.2" "libsasl2-2_*_amd64.deb" +ensure_lib "libldap-2.5.so.0" "libldap-2.5-0_*_amd64.deb" + +# Install fluent-bit main package from local bundle +FLB_DEB="$(ls /tmp/flb/packages/fluent-bit_*_amd64.deb 2>/dev/null | head -n1 || true)" +if [[ -z "$FLB_DEB" ]]; then + echo "[ERROR] fluent-bit deb not found under /private/packages" >&2 + exit 1 +fi +echo "[INFO] Installing Fluent Bit: $(basename "$FLB_DEB")" +dpkg -i "$FLB_DEB" >/dev/null 2>&1 || true + +# If dpkg reported unresolved dependencies, try apt -f only as last resort +if ! 
command -v /opt/fluent-bit/bin/fluent-bit >/dev/null 2>&1; then + echo "[WARN] fluent-bit binary missing after dpkg; attempting apt --fix-broken" + apt-get install -f -y -qq || true +fi + +# Ensure runtime library dependencies are satisfied (libsasl2, libldap are required via libpq/curl) +MISSING=$(ldd /opt/fluent-bit/bin/fluent-bit 2>/dev/null | awk '/not found/{print $1}' | xargs -r echo || true) +if [[ -n "$MISSING" ]]; then + echo "[WARN] missing shared libs: $MISSING" + apt-get update -qq || true + apt-get install -y -qq libsasl2-2 libldap-2.5-0 || true + apt-get install -f -y -qq || true +fi -# 验证 Fluent Bit 可以运行 echo "[INFO] Fluent Bit version:" -/opt/fluent-bit/bin/fluent-bit --version +/opt/fluent-bit/bin/fluent-bit --version || { echo "[ERROR] fluent-bit not installed or libraries missing" >&2; exit 1; } -# 创建配置目录 +# Place configuration mkdir -p /etc/fluent-bit -cp -r /tmp/etc/* /etc/fluent-bit/ +cp -r /tmp/flb/etc/* /etc/fluent-bit/ -# 创建日志和缓冲区目录 +# Create logs/buffers dirs mkdir -p /logs/train /logs/infer /buffers chmod 755 /logs/train /logs/infer /buffers -# 等待 Elasticsearch 就绪 -echo "[INFO] Waiting for Elasticsearch to be ready..." -while ! curl -fs http://${ES_HOST}:${ES_PORT}/_cluster/health >/dev/null 2>&1; do - echo " Waiting for ES at ${ES_HOST}:${ES_PORT}..." - sleep 5 +# Wait for Elasticsearch via bash /dev/tcp to avoid curl dependency +echo "[INFO] Waiting for Elasticsearch to be ready (tcp ${ES_HOST}:${ES_PORT})..." 
+for i in $(seq 1 120); do + if exec 3<>/dev/tcp/${ES_HOST}/${ES_PORT}; then + exec 3<&- 3>&- + echo "[INFO] Elasticsearch is ready" + break + fi + [[ $i -eq 120 ]] && { echo "[ERROR] ES not reachable" >&2; exit 1; } + sleep 1 done -echo "[INFO] Elasticsearch is ready" -# 启动 Fluent Bit echo "[INFO] Starting Fluent Bit with configuration from /etc/fluent-bit/" echo "[INFO] Command: /opt/fluent-bit/bin/fluent-bit --config=/etc/fluent-bit/fluent-bit.conf" -exec /opt/fluent-bit/bin/fluent-bit \ - --config=/etc/fluent-bit/fluent-bit.conf +exec /opt/fluent-bit/bin/fluent-bit --config=/etc/fluent-bit/fluent-bit.conf diff --git a/src/log/tests/scripts/01_bootstrap.sh b/src/log/tests/scripts/01_bootstrap.sh index 93898e0..fb322ab 100755 --- a/src/log/tests/scripts/01_bootstrap.sh +++ b/src/log/tests/scripts/01_bootstrap.sh @@ -32,3 +32,42 @@ fi echo "[OK] 初始化完成: private/argus/log/{elasticsearch,kibana}" echo "[INFO] Fluent-bit files should be in fluent-bit/ directory" + +# 准备 Fluent Bit 离线依赖(从 metric all-in-one-full 复制 deb 到 ../fluent-bit/build/packages) +FLB_BUILD_PACKAGES_DIR="$root/../fluent-bit/build/packages" +mkdir -p "$FLB_BUILD_PACKAGES_DIR" +for deb in \ + "$project_root/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libyaml-0-2_"*_amd64.deb \ + "$project_root/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libpq5_"*_amd64.deb ; do + if ls $deb >/dev/null 2>&1; then + for f in $deb; do + base="$(basename "$f")" + if [[ ! 
-f "$FLB_BUILD_PACKAGES_DIR/$base" ]]; then + cp "$f" "$FLB_BUILD_PACKAGES_DIR/" + echo " [+] copied $base" + fi + done + fi +done + +# 额外:从 all-in-one-full 的 ubuntu22/curl.tar.gz 解包必要依赖(libsasl2/ldap),便于离线安装 +CURLOPT_TAR="$project_root/src/metric/client-plugins/all-in-one-full/deps/ubuntu22/curl.tar.gz" +if [[ -f "$CURLOPT_TAR" ]]; then + tmpdir=$(mktemp -d) + if tar -xzf "$CURLOPT_TAR" -C "$tmpdir" 2>/dev/null; then + for p in \ + libsasl2-2_*_amd64.deb \ + libsasl2-modules-db_*_amd64.deb \ + libldap-2.5-0_*_amd64.deb \ + libidn2-0_*_amd64.deb \ + libbrotli1_*_amd64.deb \ + libssl3_*_amd64.deb ; do + src=$(ls "$tmpdir"/curl/$p 2>/dev/null | head -n1 || true) + if [[ -n "$src" ]]; then + base="$(basename "$src")" + [[ -f "$FLB_BUILD_PACKAGES_DIR/$base" ]] || cp "$src" "$FLB_BUILD_PACKAGES_DIR/" && echo " [+] staged $base" + fi + done + fi + rm -rf "$tmpdir" +fi diff --git a/src/metric/.gitignore b/src/metric/.gitignore index 43f5e6d..50cf728 100644 --- a/src/metric/.gitignore +++ b/src/metric/.gitignore @@ -4,4 +4,4 @@ /client-plugins/demo-all-in-one/publish/ /client-plugins/demo-all-in-one/checklist /client-plugins/demo-all-in-one/VERSION -/client-plugins/all-in-one-full/ +/client-plugins/all-in-one-full/artifact/ diff --git a/src/metric/client-plugins/all-in-one-demo/scripts/publish_artifact.sh b/src/metric/client-plugins/all-in-one-demo/scripts/publish_artifact.sh index 2f16b19..5441cf1 100755 --- a/src/metric/client-plugins/all-in-one-demo/scripts/publish_artifact.sh +++ b/src/metric/client-plugins/all-in-one-demo/scripts/publish_artifact.sh @@ -104,7 +104,26 @@ log_info "文件所有者: $OWNER" # 确保发布目录存在 log_info "确保发布目录存在: $PUBLISH_DIR" -sudo mkdir -p "$PUBLISH_DIR" +mkdir -p "$PUBLISH_DIR" + +IFS=':' read -r OWNER_UID OWNER_GID <<< "$OWNER" +if [[ -z "$OWNER_UID" || -z "$OWNER_GID" ]]; then + log_error "--owner 格式不正确,应为 uid:gid" + exit 1 +fi + +CURRENT_UID=$(id -u) +CURRENT_GID=$(id -g) +if [[ "$OWNER_UID" != "$CURRENT_UID" || "$OWNER_GID" != "$CURRENT_GID" ]]; 
then + if [[ "$CURRENT_UID" -ne 0 ]]; then + log_error "当前用户 (${CURRENT_UID}:${CURRENT_GID}) 无法设置所有者为 ${OWNER_UID}:${OWNER_GID}" + log_error "请以目标用户运行脚本或预先调整目录权限" + exit 1 + fi + NEED_CHOWN=true +else + NEED_CHOWN=false +fi # 创建临时目录用于打包 TEMP_PACKAGE_DIR="/tmp/argus-metric-package-$$" @@ -208,26 +227,31 @@ fi TAR_NAME="argus-metric_$(echo $VERSION | tr '.' '_').tar.gz" log_info "创建发布包: $TAR_NAME" cd "$TEMP_PACKAGE_DIR" -sudo tar -czf "$PUBLISH_DIR/$TAR_NAME" * +tar -czf "$PUBLISH_DIR/$TAR_NAME" * cd - > /dev/null -# 设置文件所有者 -log_info "设置文件所有者为: $OWNER" -sudo chown "$OWNER" "$PUBLISH_DIR/$TAR_NAME" +if [[ "$NEED_CHOWN" == true ]]; then + log_info "设置文件所有者为: $OWNER" + chown "$OWNER" "$PUBLISH_DIR/$TAR_NAME" +fi # 清理临时目录 rm -rf "$TEMP_PACKAGE_DIR" # 更新 LATEST_VERSION 文件 log_info "更新 LATEST_VERSION 文件..." -echo "$VERSION" | sudo tee "$PUBLISH_DIR/LATEST_VERSION" > /dev/null -sudo chown "$OWNER" "$PUBLISH_DIR/LATEST_VERSION" +echo "$VERSION" > "$PUBLISH_DIR/LATEST_VERSION" +if [[ "$NEED_CHOWN" == true ]]; then + chown "$OWNER" "$PUBLISH_DIR/LATEST_VERSION" +fi # 复制 DNS 配置文件到发布目录根目录(直接从 config 目录复制) if [[ -f "config/dns.conf" ]]; then log_info "复制 DNS 配置文件到发布目录根目录..." - sudo cp "config/dns.conf" "$PUBLISH_DIR/" - sudo chown "$OWNER" "$PUBLISH_DIR/dns.conf" + cp "config/dns.conf" "$PUBLISH_DIR/" + if [[ "$NEED_CHOWN" == true ]]; then + chown "$OWNER" "$PUBLISH_DIR/dns.conf" + fi log_success "DNS 配置文件复制完成: $PUBLISH_DIR/dns.conf" else log_warning "未找到 config/dns.conf 文件,跳过 DNS 配置文件复制" @@ -236,8 +260,10 @@ fi # 复制 setup.sh 到发布目录 if [[ -f "scripts/setup.sh" ]]; then log_info "复制 setup.sh 到发布目录..." 
- sudo cp "scripts/setup.sh" "$PUBLISH_DIR/" - sudo chown "$OWNER" "$PUBLISH_DIR/setup.sh" + cp "scripts/setup.sh" "$PUBLISH_DIR/" + if [[ "$NEED_CHOWN" == true ]]; then + chown "$OWNER" "$PUBLISH_DIR/setup.sh" + fi fi # 显示发布结果 diff --git a/src/metric/client-plugins/all-in-one-full/README.md b/src/metric/client-plugins/all-in-one-full/README.md new file mode 100644 index 0000000..da8f84e --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/README.md @@ -0,0 +1,59 @@ +# 客户侧组件安装包构建、发布流程 + +## 第一步:配置版本和组件 + +首先搞定配置文件: + +1. 把 `.checklist.example` 重命名成 `checklist` +2. 把 `.VERSION.example` 重命名成 `VERSION` + +### checklist 文件格式 +``` +# 组件名称 目录路径 版本号 [依赖组件] [安装顺序] +dcgm-exporter-installer /path/to/dcgm-exporter-installer 1.1.0 +node-exporter-installer /path/to/node-exporter-installer 1.1.0 +``` + +### VERSION 文件 +设置需要发布的版本号,比如 `1.29.0` + +> 建议用 `version-manager.sh` 来管理版本 + +## 第二步:构建安装包 + +直接跑脚本: +```bash +./package_artifact.sh +``` + +构建完的东西会放在 `artifact/` 目录下,按版本分文件夹。 + +如果版本已经存在了,想要覆盖重新构建: +```bash +./package_artifact.sh --force +``` + +构建完可以手工测试安装包。 + +## 第三步:发布安装包 + +用这个脚本发布: +```bash +./publish_artifact.sh +``` + +发布后的内容在 `publish/` 目录里,包含: +- 压缩版本的安装包 +- 一键安装的bash脚本 + +## 第四步:部署到FTP服务器 + +把发布的内容上传到FTP服务器,客户端就可以通过一键命令安装: + +```bash +curl -fsSL http://your-ftp-server/install.sh | sh - + +curl -fsSL "ftp://ftpuser:{PASSWD}!@10.211.55.4/share/setup.sh" | sudo bash -s -- --server 10.211.55.4 --user ftpuser --password {PASSWD} +``` + +这样客户就能直接从FTP服务器下载并安装组件了。 \ No newline at end of file diff --git a/src/metric/client-plugins/all-in-one-full/config/.VERSION.example b/src/metric/client-plugins/all-in-one-full/config/.VERSION.example new file mode 100644 index 0000000..5e57fb8 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/config/.VERSION.example @@ -0,0 +1 @@ +1.29.0 diff --git a/src/metric/client-plugins/all-in-one-full/config/.checklist.example b/src/metric/client-plugins/all-in-one-full/config/.checklist.example new file mode 100644 index 
0000000..89cf322 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/config/.checklist.example @@ -0,0 +1,3 @@ +# 组件名称 目录路径 版本号 [依赖组件] [安装顺序] +dcgm-exporter-installer /Users/sundapeng/Project/nlp/aiops/client-plugins/dcgm-exporter-installer 1.1.0 +node-exporter-installer /Users/sundapeng/Project/nlp/aiops/client-plugins/node-exporter-installer 1.1.0 diff --git a/src/metric/client-plugins/all-in-one-full/config/VERSION b/src/metric/client-plugins/all-in-one-full/config/VERSION new file mode 100644 index 0000000..2aeaa11 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/config/VERSION @@ -0,0 +1 @@ +1.35.0 diff --git a/src/metric/client-plugins/all-in-one-full/config/checklist b/src/metric/client-plugins/all-in-one-full/config/checklist new file mode 100644 index 0000000..e97d45e --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/config/checklist @@ -0,0 +1,5 @@ +# 组件名称 目录路径 版本号 [依赖组件] [安装顺序] +argus-agent plugins/argus-agent 1.0.0 +node-exporter plugins/node-exporter 1.0.0 +dcgm-exporter plugins/dcgm-exporter 1.0.0 +fluent-bit plugins/fluent-bit 1.0.0 diff --git a/src/metric/client-plugins/all-in-one-full/config/config.env b/src/metric/client-plugins/all-in-one-full/config/config.env new file mode 100644 index 0000000..b5bea3c --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/config/config.env @@ -0,0 +1,14 @@ +# Elasticsearch +ES_HOST=es.log.argus.com +ES_PORT=9200 + +# Argus-Agent +# 连接master服务 +MASTER_ENDPOINT=master.argus.com:3000 +# 上报状态间隔(秒) +REPORT_INTERVAL_SECONDS=5 + +# FTP +FTP_SERVER=172.31.0.40 +FTP_USER=ftpuser +FTP_PASSWORD=ZGClab1234!
diff --git a/src/metric/client-plugins/all-in-one-full/config/config.env.example b/src/metric/client-plugins/all-in-one-full/config/config.env.example new file mode 100644 index 0000000..8871dfe --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/config/config.env.example @@ -0,0 +1,8 @@ +# Argus Metric 配置文件示例 +# 复制此文件为 config.env 并根据需要修改配置 + +# 连接master服务 +MASTER_ENDPOINT=master.argus.com:3000 + +# 上报状态间隔描述(秒) +REPORT_INTERVAL_SECONDS=60 diff --git a/src/metric/client-plugins/all-in-one-full/config/dns.conf b/src/metric/client-plugins/all-in-one-full/config/dns.conf new file mode 100644 index 0000000..5a9c316 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/config/dns.conf @@ -0,0 +1 @@ +172.31.0.2 diff --git a/src/metric/client-plugins/all-in-one-full/deps/cron-offline.tar.gz b/src/metric/client-plugins/all-in-one-full/deps/cron-offline.tar.gz new file mode 100644 index 0000000..77104f7 Binary files /dev/null and b/src/metric/client-plugins/all-in-one-full/deps/cron-offline.tar.gz differ diff --git a/src/metric/client-plugins/all-in-one-full/deps/jq-curl.tar.gz b/src/metric/client-plugins/all-in-one-full/deps/jq-curl.tar.gz new file mode 100644 index 0000000..27f4ccc Binary files /dev/null and b/src/metric/client-plugins/all-in-one-full/deps/jq-curl.tar.gz differ diff --git a/src/metric/client-plugins/all-in-one-full/deps/ubuntu20/cron.tar.gz b/src/metric/client-plugins/all-in-one-full/deps/ubuntu20/cron.tar.gz new file mode 100755 index 0000000..376a089 Binary files /dev/null and b/src/metric/client-plugins/all-in-one-full/deps/ubuntu20/cron.tar.gz differ diff --git a/src/metric/client-plugins/all-in-one-full/deps/ubuntu20/curl.tar.gz b/src/metric/client-plugins/all-in-one-full/deps/ubuntu20/curl.tar.gz new file mode 100755 index 0000000..5c4fcc8 Binary files /dev/null and b/src/metric/client-plugins/all-in-one-full/deps/ubuntu20/curl.tar.gz differ diff --git a/src/metric/client-plugins/all-in-one-full/deps/ubuntu20/jq.tar.gz 
b/src/metric/client-plugins/all-in-one-full/deps/ubuntu20/jq.tar.gz new file mode 100755 index 0000000..a322155 Binary files /dev/null and b/src/metric/client-plugins/all-in-one-full/deps/ubuntu20/jq.tar.gz differ diff --git a/src/metric/client-plugins/all-in-one-full/deps/ubuntu22/cron.tar.gz b/src/metric/client-plugins/all-in-one-full/deps/ubuntu22/cron.tar.gz new file mode 100755 index 0000000..702f63f Binary files /dev/null and b/src/metric/client-plugins/all-in-one-full/deps/ubuntu22/cron.tar.gz differ diff --git a/src/metric/client-plugins/all-in-one-full/deps/ubuntu22/curl.tar.gz b/src/metric/client-plugins/all-in-one-full/deps/ubuntu22/curl.tar.gz new file mode 100755 index 0000000..3237287 Binary files /dev/null and b/src/metric/client-plugins/all-in-one-full/deps/ubuntu22/curl.tar.gz differ diff --git a/src/metric/client-plugins/all-in-one-full/deps/ubuntu22/jq.tar.gz b/src/metric/client-plugins/all-in-one-full/deps/ubuntu22/jq.tar.gz new file mode 100755 index 0000000..b50273f Binary files /dev/null and b/src/metric/client-plugins/all-in-one-full/deps/ubuntu22/jq.tar.gz differ diff --git a/src/metric/client-plugins/all-in-one-full/plugins/argus-agent/README.md b/src/metric/client-plugins/all-in-one-full/plugins/argus-agent/README.md new file mode 100644 index 0000000..4e9e690 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/argus-agent/README.md @@ -0,0 +1,94 @@ +# Argus Agent 插件 + +这是 Argus Agent 的安装和管理插件,提供了完整的安装、卸载、健康检查功能。 + +## 文件结构 + +``` +argus-agent/ +├── bin/ +│ └── argus-agent # Argus Agent 二进制文件 +├── config/ # 配置文件目录 +├── install.sh # 安装脚本 +├── uninstall.sh # 卸载脚本 +├── check_health.sh # 健康检查脚本 +├── package.sh # 打包脚本 +└── README.md # 说明文档 +``` + +## 使用方法 + +### 安装 + +```bash +sudo ./install.sh +``` + +安装脚本会: +- 检查系统要求 +- 停止可能运行的服务 +- 安装二进制文件到 `/usr/local/bin/argus-agent` +- 创建 `argus-agent` 用户 +- 创建配置和数据目录 +- 启动服务并记录 PID + +### 卸载 + +```bash +sudo ./uninstall.sh +``` + +卸载脚本会: +- 停止所有 argus-agent 进程 +- 删除二进制文件 +- 删除配置和数据目录 
+- 清理日志文件 +- 更新安装记录 + +### 健康检查 + +```bash +./check_health.sh +``` + +健康检查脚本会: +- 检查安装记录中的 PID +- 验证进程是否正在运行 +- 输出 JSON 格式的健康状态 + +### 打包 + +```bash +./package.sh +``` + +打包脚本会: +- 检查所有必要文件 +- 创建时间戳命名的压缩包 +- 输出安装包信息 + +## 安装后的文件位置 + +- 二进制文件: `/usr/local/bin/argus-agent` +- 配置目录: `/etc/argus-agent/` +- 数据目录: `/var/lib/argus-agent/` +- 日志文件: `/var/log/argus-agent.log` +- PID 文件: `/var/run/argus-agent.pid` +- 安装记录: `/opt/argus-metric/current/.install_record` + +## 健康检查输出格式 + +```json +{ + "name": "argus-agent", + "status": "health|unhealth", + "reason": "状态说明" +} +``` + +## 注意事项 + +1. 安装和卸载脚本需要 root 权限 +2. 健康检查脚本使用安装记录中的 PID 来验证进程状态 +3. 如果 jq 命令不可用,健康检查会使用简单的文本解析 +4. 卸载时会保留 `argus-agent` 用户,避免影响其他服务 diff --git a/src/metric/client-plugins/all-in-one-full/plugins/argus-agent/bin/argus-agent b/src/metric/client-plugins/all-in-one-full/plugins/argus-agent/bin/argus-agent new file mode 100755 index 0000000..bb3f86b --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/argus-agent/bin/argus-agent @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d2cf989d0089223b34a27a32d14aad83459afe25a58b1d9f4f3be9f3c5b82e1 +size 7580232 diff --git a/src/metric/client-plugins/all-in-one-full/plugins/argus-agent/check_health.sh b/src/metric/client-plugins/all-in-one-full/plugins/argus-agent/check_health.sh new file mode 100755 index 0000000..3bd9a99 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/argus-agent/check_health.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +# Argus Agent 健康检查脚本 +# 输出 JSON 格式结果 + +set -e + +# 检查 Argus Agent 健康状态 +check_health() { + local name="argus-agent" + local status="unhealth" + local reason="" + local install_record="/opt/argus-metric/current/.install_record" + + # 首先尝试通过安装记录文件检查进程 + if [[ -f "$install_record" ]]; then + # 尝试使用jq解析JSON格式的安装记录文件 + local pid="" + if command -v jq &> /dev/null; then + pid=$(jq -r '.components."argus-agent".pid // empty' "$install_record" 2>/dev/null || echo "") + else + # 
如果没有jq,使用简单的文本解析方法 + pid=$(grep -A 10 '"argus-agent"' "$install_record" | grep '"pid"' | cut -d'"' -f4 | head -1) + fi + + if [[ -n "$pid" && "$pid" =~ ^[0-9]+$ ]]; then + if kill -0 "$pid" 2>/dev/null; then + # 进程存在且运行正常 + status="health" + reason="进程运行正常 (PID: $pid)" + echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}" + exit 0 + else + reason="安装记录中的 PID $pid 进程不存在" + echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}" + exit 1 + fi + else + reason="安装记录文件中未找到有效的 argus-agent PID" + echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}" + exit 1 + fi + else + # 如果安装记录文件不存在,尝试查找 argus-agent 进程 + local pids=$(pgrep -f "argus-agent" 2>/dev/null || true) + if [[ -n "$pids" ]]; then + # 取第一个找到的 PID + local pid=$(echo "$pids" | head -1) + status="health" + reason="发现 argus-agent 进程运行 (PID: $pid),但未找到安装记录" + echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}" + exit 0 + else + reason="未找到 argus-agent 进程,且安装记录文件不存在" + echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}" + exit 1 + fi + fi +} + +# 主函数 +main() { + check_health +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/src/metric/client-plugins/all-in-one-full/plugins/argus-agent/install.sh b/src/metric/client-plugins/all-in-one-full/plugins/argus-agent/install.sh new file mode 100755 index 0000000..7c085ec --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/argus-agent/install.sh @@ -0,0 +1,289 @@ +#!/bin/bash + +set -e + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 显示帮助信息 +show_help() { + echo "Argus Agent 安装脚本" + echo + echo 
"用法: $0 [选项]" + echo + echo "选项:" + echo " --help 显示此帮助信息" + echo + echo "示例:" + echo " $0 # 安装 Argus Agent" + echo +} + +# 解析命令行参数 +INSTALL_DIR="" +for arg in "$@"; do + case $arg in + --help|-h) + show_help + exit 0 + ;; + *) + # 如果参数不是以--开头,则认为是安装目录 + if [[ ! "$arg" =~ ^-- ]]; then + INSTALL_DIR="$arg" + else + log_error "未知参数: $arg" + show_help + exit 1 + fi + ;; + esac +done + +# 检查是否为 root 用户 +check_root() { + if [[ $EUID -ne 0 ]]; then + log_error "此脚本需要 root 权限运行" + log_info "请使用: sudo $0" + exit 1 + fi +} + +# 检查系统要求 +check_system() { + log_info "检查系统要求..." + + # 检查操作系统 + if [[ ! -f /etc/os-release ]]; then + log_error "无法检测操作系统版本" + exit 1 + fi + + source /etc/os-release + log_info "检测到操作系统: $NAME $VERSION" + + # 检查是否为 Linux 系统 + if [[ "$ID" != "ubuntu" && "$ID" != "debian" && "$ID" != "centos" && "$ID" != "rhel" && "$ID" != "fedora" ]]; then + log_warning "此脚本主要针对常见 Linux 发行版,其他系统可能需要调整" + fi + + # 检查系统架构 + local arch=$(uname -m) + log_info "系统架构: $arch" + + if [[ "$arch" != "x86_64" && "$arch" != "amd64" ]]; then + log_warning "当前架构为 $arch,argus-agent 主要支持 x86_64/amd64" + fi +} + +# 停止可能运行的服务 +stop_existing_service() { + log_info "检查并停止可能运行的服务..." 
+ local pid_file="/var/run/argus-agent.pid" + + if [[ -f "$pid_file" ]]; then + local pid=$(cat "$pid_file") + if ps -p "$pid" -o comm= | grep -q "^argus-agent$"; then + kill "$pid" 2>/dev/null || true + sleep 2 + kill -9 "$pid" 2>/dev/null || true + log_success "服务已停止" + fi + rm -f "$pid_file" + fi + + local pids=$(pgrep -x argus-agent 2>/dev/null || true) + if [[ -n "$pids" ]]; then + for pid in $pids; do kill -9 "$pid" 2>/dev/null || true; done + fi + + # 检查僵尸进程 + local zombies=$(ps -eo pid,stat,comm | grep '[a]rgus-agent' | awk '$2 ~ /Z/ {print $1}') + if [[ -n "$zombies" ]]; then + for pid in $zombies; do + local ppid=$(ps -o ppid= -p $pid) + log_warning "检测到僵尸 argus-agent (PID=$pid, PPID=$ppid),尝试清理" + [[ "$ppid" -ne 1 ]] && kill -9 "$ppid" 2>/dev/null || true + done + fi +} + + +# 安装 Argus Agent 二进制文件 +install_argus_agent() { + log_info "安装 Argus Agent..." + local binary_file="bin/argus-agent" + local install_dir="/usr/local/bin" + local target_file="$install_dir/argus-agent" + + [[ ! -f "$binary_file" ]] && log_error "找不到 Argus Agent 二进制文件: $binary_file" && exit 1 + + stop_existing_service + + local timeout=10 + while [[ $timeout -gt 0 ]]; do + remaining_pids=$(pgrep -x argus-agent | grep -vw $$ || true) + [[ -z "$remaining_pids" ]] && break + if ps -eo pid,stat,comm | grep -E 'argus-agent' | grep -q 'Z'; then + log_warning "检测到僵尸 argus-agent,跳过等待" + break + fi + log_warning "等待 argus-agent 完全退出... ($timeout)" + sleep 1 + ((timeout--)) + done + + cp "$binary_file" "${target_file}.new" + chmod +x "${target_file}.new" + mv -f "${target_file}.new" "$target_file" + log_success "Argus Agent 二进制文件安装完成" +} + + +# 创建用户和组 +create_user() { + log_info "创建 argus-agent 用户..." + + # 检查用户是否已存在 + if id "argus-agent" &>/dev/null; then + log_info "用户 argus-agent 已存在" + else + useradd --no-create-home --shell /bin/false argus-agent + log_success "用户 argus-agent 创建完成" + fi +} + +# 安装配置文件 +install_config() { + log_info "安装配置文件..." 
+ + local config_dir="/etc/argus-agent" + + # 创建配置目录 + mkdir -p "$config_dir" + + # 创建健康检查目录 + mkdir -p "/var/lib/argus-agent/health" + chown argus-agent:argus-agent "/var/lib/argus-agent/health" +} + +# 启动 Argus Agent 服务 +start_argus_agent() { + log_info "启动 Argus Agent 服务..." + local binary_path="/usr/local/bin/argus-agent" + local log_file="/var/log/argus-agent.log" + local pid_file="/var/run/argus-agent.pid" + + [[ -f "$pid_file" ]] && rm -f "$pid_file" + + log_info "正在启动 Argus Agent..." + setsid "$binary_path" > "$log_file" 2>&1 < /dev/null & + local pid=$! + echo "$pid" > "$pid_file" + sleep 2 + + if kill -0 "$pid" 2>/dev/null; then + log_success "Argus Agent 服务启动成功 (PID: $pid)" + else + log_error "Argus Agent 启动失败" + [[ -f "$log_file" ]] && tail -n 10 "$log_file" + rm -f "$pid_file" + fi +} + + +# 更新安装记录 +update_install_record() { + local pid="$1" + # 使用传入的安装目录参数,如果没有则使用默认值 + local install_base_dir="${2:-/opt/argus-metric/current}" + local install_record="$install_base_dir/.install_record" + + # 如果安装记录文件不存在,说明是首次安装,由主安装脚本统一创建 + if [[ ! -f "$install_record" ]]; then + log_info "安装记录文件不存在,将由主安装脚本创建" + return 0 + fi + + # 如果文件存在,说明是重启场景,只更新 PID 字段 + if command -v jq &> /dev/null; then + # 读取当前 PID + local current_pid=$(jq -r '.components."argus-agent".pid // ""' "$install_record" 2>/dev/null) + + if [[ -z "$current_pid" ]]; then + log_warning "无法读取当前 PID,跳过更新" + return 1 + fi + + # 使用 jq 只更新 pid 字段,保持字符串类型,保留其他字段 + jq --arg new_pid "$pid" '.components."argus-agent".pid = $new_pid' "$install_record" > "$install_record.tmp" && mv "$install_record.tmp" "$install_record" + log_info "PID 已更新: $current_pid -> $pid" + else + log_warning "jq 命令不可用,无法更新安装记录文件" + fi +} + +# 显示安装信息 +show_install_info() { + log_success "Argus Agent 安装完成!" 
+ echo + echo "安装信息:" + echo " 二进制文件: /usr/local/bin/argus-agent" + echo " 运行用户: argus-agent" + echo " 配置目录: /etc/argus-agent/" + echo " 健康检查目录: /var/lib/argus-agent/health" + echo + echo "使用方法:" + echo " 手动启动: /usr/local/bin/argus-agent" + echo " 后台启动: nohup /usr/local/bin/argus-agent &" + echo + echo "健康检查:" + echo " ./check_health.sh" + echo +} + +# 主函数 +main() { + echo "==========================================" + echo " Argus Agent 安装脚本 v1.0" + echo "==========================================" + echo + + check_root + check_system + + log_info "开始安装 Argus Agent..." + + install_argus_agent + create_user + install_config + start_argus_agent + + show_install_info +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/src/metric/client-plugins/all-in-one-full/plugins/argus-agent/package.sh b/src/metric/client-plugins/all-in-one-full/plugins/argus-agent/package.sh new file mode 100755 index 0000000..a1d6394 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/argus-agent/package.sh @@ -0,0 +1,87 @@ +#!/bin/bash + +set -e + +# 颜色定义 +GREEN='\033[0;32m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +# 获取当前目录 +CURRENT_DIR=$(pwd) +PACKAGE_NAME="argus-agent-$(date +%Y%m%d-%H%M%S)" +PACKAGE_FILE="${PACKAGE_NAME}.tar.gz" + +log_info "开始打包 Argus Agent 安装包..." + +# 检查必要文件 +log_info "检查必要文件..." + +required_files=( + "install.sh" + "uninstall.sh" + "bin/argus-agent" + "check_health.sh" +) + +missing_files=() +for file in "${required_files[@]}"; do + if [[ ! -f "$file" ]]; then + missing_files+=("$file") + fi +done + +if [[ ${#missing_files[@]} -gt 0 ]]; then + echo "缺少以下文件:" + for file in "${missing_files[@]}"; do + echo " - $file" + done + exit 1 +fi + +log_success "所有必要文件检查完成" + +# 创建临时目录 +TEMP_DIR=$(mktemp -d) +log_info "创建临时目录: $TEMP_DIR" + +# 复制文件到临时目录 +cp -r . 
"$TEMP_DIR/$PACKAGE_NAME" + +# 进入临时目录 +cd "$TEMP_DIR" + +# 创建压缩包 +log_info "创建压缩包: $PACKAGE_FILE" +tar -czf "$PACKAGE_FILE" "$PACKAGE_NAME" + +# 移动压缩包到原目录 +mv "$PACKAGE_FILE" "$CURRENT_DIR/" + +# 清理临时目录 +rm -rf "$TEMP_DIR" + +# 返回原目录 +cd "$CURRENT_DIR" + +# 显示结果 +log_success "打包完成!" +echo +echo "安装包文件: $PACKAGE_FILE" +echo "文件大小: $(du -h "$PACKAGE_FILE" | cut -f1)" +echo +echo "使用方法:" +echo "1. 将 $PACKAGE_FILE 传输到目标服务器" +echo "2. 解压: tar -xzf $PACKAGE_FILE" +echo "3. 进入目录: cd $PACKAGE_NAME" +echo "4. 运行安装: sudo ./install.sh" +echo +echo "注意: 请确保所有必要文件都存在" diff --git a/src/metric/client-plugins/all-in-one-full/plugins/argus-agent/uninstall.sh b/src/metric/client-plugins/all-in-one-full/plugins/argus-agent/uninstall.sh new file mode 100755 index 0000000..d64a370 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/argus-agent/uninstall.sh @@ -0,0 +1,255 @@ +#!/bin/bash + +# Argus Agent 卸载脚本 +# 版本: 1.0 +# 作者: AIOps Team +# 日期: $(date +%Y-%m-%d) + +set -e + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 检查是否为 root 用户 +check_root() { + if [[ $EUID -ne 0 ]]; then + log_error "此脚本需要 root 权限运行" + log_info "请使用: sudo $0" + exit 1 + fi +} + +# 停止运行中的进程 +stop_processes() { + log_info "停止 Argus Agent 进程..." + + local pid_file="/var/run/argus-agent.pid" + local stopped=false + + # 首先尝试通过 PID 文件停止服务 + if [[ -f "$pid_file" ]]; then + local pid=$(cat "$pid_file") + if kill -0 "$pid" 2>/dev/null; then + log_info "通过 PID 文件停止服务 (PID: $pid)..." + kill "$pid" + sleep 3 + + # 检查进程是否已停止 + if kill -0 "$pid" 2>/dev/null; then + log_warning "进程未响应,强制终止..." 
+ kill -9 "$pid" 2>/dev/null || true + fi + log_success "Argus Agent 进程已停止" + stopped=true + else + log_warning "PID 文件存在但进程已不存在,清理 PID 文件" + rm -f "$pid_file" + fi + fi + + # 查找并杀死所有 argus-agent 进程 + local pids=$(pgrep -f "argus-agent" 2>/dev/null || true) + if [[ -n "$pids" ]]; then + log_info "发现 argus-agent 进程,正在停止..." + for pid in $pids; do + log_info "停止进程 PID: $pid" + kill "$pid" 2>/dev/null || true + done + sleep 2 + + # 检查是否还有进程在运行,如果有则强制终止 + local remaining_pids=$(pgrep -f "argus-agent" 2>/dev/null || true) + if [[ -n "$remaining_pids" ]]; then + log_warning "进程未响应,强制终止..." + for pid in $remaining_pids; do + log_info "强制终止进程 PID: $pid" + kill -9 "$pid" 2>/dev/null || true + done + sleep 1 + fi + + # 最终检查 + if pgrep -f "argus-agent" > /dev/null; then + log_error "无法停止所有 argus-agent 进程" + else + log_success "所有 Argus Agent 进程已停止" + stopped=true + fi + else + log_info "Argus Agent 进程未运行" + fi + + # 清理 PID 文件 + rm -f "$pid_file" + + if [[ "$stopped" == "false" ]]; then + log_warning "未发现需要停止的 Argus Agent 进程" + fi +} + +# 删除二进制文件 +remove_binary() { + log_info "删除 Argus Agent 二进制文件..." + + local binary_files=( + "/usr/local/bin/argus-agent" + ) + + local deleted=false + for binary_file in "${binary_files[@]}"; do + if [[ -f "$binary_file" ]]; then + rm -f "$binary_file" + log_success "二进制文件已删除: $binary_file" + deleted=true + fi + done + + if [[ "$deleted" == "false" ]]; then + log_info "二进制文件不存在" + fi +} + +# 删除配置文件 +remove_config() { + log_info "删除配置文件..." + + local config_dir="/etc/argus-agent" + + if [[ -d "$config_dir" ]]; then + rm -rf "$config_dir" + log_success "配置目录已删除" + else + log_info "配置目录不存在" + fi +} + +# 删除数据目录 +remove_data_dir() { + log_info "删除数据目录..." + + local data_dir="/var/lib/argus-agent" + + if [[ -d "$data_dir" ]]; then + rm -rf "$data_dir" + log_success "数据目录已删除" + else + log_info "数据目录不存在" + fi +} + +# 检查用户状态(可选) +check_user_status() { + log_info "检查 argus-agent 用户状态..." 
+ + if id "argus-agent" &>/dev/null; then + log_info "检测到 argus-agent 用户存在" + log_warning "argus-agent 是系统用户,可能被其他服务使用" + log_info "为了系统稳定性,将保留 argus-agent 用户" + log_info "如需手动删除,请运行: sudo userdel argus-agent" + else + log_info "argus-agent 用户不存在" + fi +} + +# 清理日志文件 +cleanup_logs() { + log_info "清理日志文件..." + + # 删除安装脚本创建的日志文件 + rm -f /var/log/argus-agent.log + + log_success "日志文件已清理" +} + +# 清理安装记录 +cleanup_install_record() { + log_info "清理安装记录..." + + local install_record="/opt/argus-metric/current/.install_record" + + if [[ -f "$install_record" ]]; then + if command -v jq &> /dev/null; then + # 使用 jq 删除 argus-agent 记录 + jq 'del(.components."argus-agent")' "$install_record" > "$install_record.tmp" && mv "$install_record.tmp" "$install_record" + log_success "安装记录已更新" + else + log_warning "jq 命令不可用,无法清理安装记录" + fi + else + log_info "安装记录文件不存在" + fi +} + +# 显示卸载信息 +show_uninstall_info() { + log_success "Argus Agent 卸载完成!" + echo + echo "已删除的内容:" + echo " - 二进制文件: /usr/local/bin/argus-agent" + echo " - 配置目录: /etc/argus-agent" + echo " - 数据目录: /var/lib/argus-agent" + echo " - 相关日志文件" + echo + echo "注意:" + echo " - argus-agent 用户已保留(系统用户,可能被其他服务使用)" + echo " - 如需完全清理,请手动检查并删除相关文件" + echo +} + +# 主函数 +main() { + echo "==========================================" + echo " Argus Agent 卸载脚本 v1.0" + echo "==========================================" + echo + + check_root + + log_warning "此操作将完全卸载 Argus Agent" + read -p "确认继续?(y/N): " confirm + + if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then + log_info "取消卸载操作" + exit 0 + fi + + log_info "开始卸载 Argus Agent..." 
+ + stop_processes + remove_binary + remove_config + remove_data_dir + cleanup_logs + cleanup_install_record + + # 检查用户状态 + check_user_status + + show_uninstall_info +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/bin/datacenter-gpu-manager_3.3.9_amd64.deb b/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/bin/datacenter-gpu-manager_3.3.9_amd64.deb new file mode 100644 index 0000000..683d8cf --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/bin/datacenter-gpu-manager_3.3.9_amd64.deb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bf3a081e24603bc995a8aa041ff7819df60563da3e1f7887dae366baed6d45c +size 911205922 diff --git a/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/bin/dcgm-exporter b/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/bin/dcgm-exporter new file mode 100755 index 0000000..5b374f1 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/bin/dcgm-exporter @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8159d5eb6617ff7a06dd0166d14cf17186dd2a578b7b5413026395a0b123c4c7 +size 58360760 diff --git a/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/check_health.sh b/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/check_health.sh new file mode 100755 index 0000000..b7ec881 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/check_health.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +# DCGM Exporter 健康检查脚本 +# 输出 JSON 格式结果 + +set -e + +# 检查 DCGM Exporter 健康状态 +check_health() { + local url="http://localhost:9400" + local metrics_url="$url/metrics" + local name="dcgm-exporter" + local status="unhealth" + local reason="" + + # 检查 curl 是否可用 + if ! 
command -v curl &> /dev/null; then + reason="curl 命令不可用,无法进行健康检查" + echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}" + exit 1 + fi + + # 测试根路径连接 + local http_code=$(curl -s -o /dev/null -w "%{http_code}" "$url" 2>/dev/null || echo "000") + + if [[ "$http_code" == "200" ]]; then + # 测试 metrics 端点 + local metrics_code=$(curl -s -o /dev/null -w "%{http_code}" "$metrics_url" 2>/dev/null || echo "000") + + if [[ "$metrics_code" == "200" ]]; then + status="health" + reason="success" + echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}" + exit 0 + else + reason="Metrics 端点异常 (HTTP $metrics_code)" + echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}" + exit 1 + fi + else + reason="HTTP 服务异常 (HTTP $http_code),请检查 DCGM Exporter 是否正在运行在端口 9400" + echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}" + exit 1 + fi +} + +# 主函数 +main() { + check_health +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/config/default-counters.csv b/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/config/default-counters.csv new file mode 100644 index 0000000..ad949dd --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/config/default-counters.csv @@ -0,0 +1,77 @@ +# Format +# If line starts with a '#' it is considered a comment +# DCGM FIELD, Prometheus metric type, help message + +# Clocks +DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz). +DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz). +# DCGM_EXP_CLOCK_EVENTS_COUNT, gauge, Count of clock events within the user-specified time window (see clock-events-count-window-size param). + +# Temperature +DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C). +DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C). + +# Power +DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W). 
+DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ). + +# PCIE +DCGM_FI_PROF_PCIE_TX_BYTES, counter, Total number of bytes transmitted through PCIe TX via NVML. +DCGM_FI_PROF_PCIE_RX_BYTES, counter, Total number of bytes received through PCIe RX via NVML. +DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries. + +# Utilization (the sample period varies depending on the product) +DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %). +DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %). +DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %). +DCGM_FI_DEV_DEC_UTIL , gauge, Decoder utilization (in %). + +# Errors and violations +DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered. +# DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us). +# DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us). +# DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us). +# DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us). +# DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us). +# DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us). +# DCGM_EXP_XID_ERRORS_COUNT, gauge, Count of XID Errors within user-specified time window (see xid-count-window-size param). +# Memory usage +DCGM_FI_DEV_FB_FREE, gauge, Frame buffer memory free (in MB). +DCGM_FI_DEV_FB_USED, gauge, Frame buffer memory used (in MB). + +# ECC +# DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors. +# DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors. +# DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors. 
+# DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors. + +# Retired pages +# DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors. +# DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors. +# DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement. + +# NVLink +# DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors. +# DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors. +# DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries. +# DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors. +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes + +# VGPU License status +DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status + +# Remapped rows +DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors +DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors +DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed + +# Static configuration information. 
These appear as labels on the other metrics +DCGM_FI_DRIVER_VERSION, label, Driver Version +# DCGM_FI_NVML_VERSION, label, NVML Version +# DCGM_FI_DEV_BRAND, label, Device Brand +# DCGM_FI_DEV_SERIAL, label, Device Serial Number +# DCGM_FI_DEV_OEM_INFOROM_VER, label, OEM inforom version +# DCGM_FI_DEV_ECC_INFOROM_VER, label, ECC inforom version +# DCGM_FI_DEV_POWER_INFOROM_VER, label, Power management object inforom version +# DCGM_FI_DEV_INFOROM_IMAGE_VER, label, Inforom image version +# DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device diff --git a/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/install.sh b/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/install.sh new file mode 100755 index 0000000..7c97d6b --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/install.sh @@ -0,0 +1,365 @@ +#!/bin/bash + +set -e + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 更新安装记录 +update_install_record() { + local pid="$1" + # 使用传入的安装目录参数,如果没有则使用默认值 + local install_base_dir="${2:-/opt/argus-metric/current}" + local install_record="$install_base_dir/.install_record" + + # 如果安装记录文件不存在,说明是首次安装,由主安装脚本统一创建 + if [[ ! 
-f "$install_record" ]]; then + log_info "安装记录文件不存在,将由主安装脚本创建" + return 0 + fi + + # 如果文件存在,说明是重启场景,只更新 PID 字段 + if command -v jq &> /dev/null; then + # 读取当前 PID + local current_pid=$(jq -r '.components."dcgm-exporter".pid // ""' "$install_record" 2>/dev/null) + + if [[ -z "$current_pid" ]]; then + log_warning "无法读取当前 PID,跳过更新" + return 1 + fi + + # 使用 jq 只更新 pid 字段,保持字符串类型,保留其他字段 + jq --arg new_pid "$pid" '.components."dcgm-exporter".pid = $new_pid' "$install_record" > "$install_record.tmp" && mv "$install_record.tmp" "$install_record" + log_info "PID 已更新: $current_pid -> $pid" + else + log_warning "jq 命令不可用,无法更新安装记录文件" + fi +} + +# 显示帮助信息 +show_help() { + echo "DCGM Exporter 安装脚本" + echo + echo "用法: $0 [选项]" + echo + echo "选项:" + echo " --help 显示此帮助信息" + echo + echo "示例:" + echo " $0 # 安装 DCGM Exporter" + echo +} + +# 解析命令行参数 +INSTALL_DIR="" +for arg in "$@"; do + case $arg in + --help|-h) + show_help + exit 0 + ;; + *) + # 如果参数不是以--开头,则认为是安装目录 + if [[ ! "$arg" =~ ^-- ]]; then + INSTALL_DIR="$arg" + else + log_error "未知参数: $arg" + show_help + exit 1 + fi + ;; + esac +done + +# 检查是否为 root 用户 +check_root() { + if [[ $EUID -ne 0 ]]; then + log_error "此脚本需要 root 权限运行" + log_info "请使用: sudo $0" + exit 1 + fi +} + +# 检查系统要求 +check_system() { + log_info "检查系统要求..." + + # 检查操作系统 + if [[ ! -f /etc/os-release ]]; then + log_error "无法检测操作系统版本" + exit 1 + fi + + source /etc/os-release + log_info "检测到操作系统: $NAME $VERSION" + + # 检查是否为 Ubuntu/Debian + if [[ "$ID" != "ubuntu" && "$ID" != "debian" ]]; then + log_warning "此脚本主要针对 Ubuntu/Debian 系统,其他系统可能需要调整" + fi + + # 检查 NVIDIA GPU + if ! command -v nvidia-smi &> /dev/null; then + log_warning "未检测到 nvidia-smi,请确保已安装 NVIDIA 驱动" + else + log_success "检测到 NVIDIA GPU" + nvidia-smi --query-gpu=name --format=csv,noheader,nounits | head -1 + fi +} + +# 安装 DCGM 依赖 +install_dcgm_dependency() { + log_info "安装 DCGM 依赖..." + + local deb_file="bin/datacenter-gpu-manager_3.3.9_amd64.deb" + + if [[ ! 
-f "$deb_file" ]]; then + log_error "找不到 DCGM 依赖文件: $deb_file" + exit 1 + fi + + # 安装 deb 包 + dpkg -i "$deb_file" || { + log_warning "dpkg 安装失败,尝试使用 apt 修复依赖..." + apt-get update + apt-get install -f -y + dpkg -i "$deb_file" + } + + log_success "DCGM 依赖安装完成" +} + +# 检查 DCGM 服务状态 +check_dcgm_service() { + log_info "检查 DCGM 服务状态..." + + # 检查 DCGM 服务是否在运行 + if systemctl is-active --quiet dcgm 2>/dev/null; then + log_success "DCGM 服务已在运行" + elif pgrep -f nv-hostengine > /dev/null; then + log_success "nv-hostengine 进程已在运行" + else + log_warning "DCGM 服务未运行,需要手动启动" + log_info "启动 DCGM 服务的方法:" + log_info " 1. 使用 systemd: sudo systemctl start dcgm" + log_info " 2. 手动启动: nohup nv-hostengine > /var/log/nv-hostengine.log 2>&1 &" + fi + + # 测试 DCGM 连接 + if systemctl is-active --quiet dcgm 2>/dev/null || pgrep -f nv-hostengine > /dev/null; then + log_info "测试 DCGM 连接..." + if dcgmi discovery -l > /dev/null 2>&1; then + log_success "DCGM 连接测试成功" + else + log_warning "DCGM 连接测试失败,请检查服务状态" + fi + fi +} + +# 停止可能运行的服务 +stop_existing_service() { + log_info "检查并停止可能运行的服务..." + + local pid_file="/var/run/dcgm-exporter.pid" + + # 检查并停止通过 PID 文件管理的服务 + if [[ -f "$pid_file" ]]; then + local pid=$(cat "$pid_file") + if kill -0 "$pid" 2>/dev/null; then + log_info "发现正在运行的 DCGM Exporter 服务 (PID: $pid),正在停止..." + kill "$pid" > /dev/null 2>&1 || true + sleep 2 + if kill -0 "$pid" 2>/dev/null; then + log_warning "进程未响应,强制终止..." + kill -9 "$pid" > /dev/null 2>&1 || true + fi + rm -f "$pid_file" + log_success "服务已停止" + else + log_warning "发现过期的 PID 文件,正在清理..." + rm -f "$pid_file" + fi + fi + + # 查找并停止所有 dcgm-exporter 进程(排除脚本自身) + local exporter_bin="/usr/local/bin/dcgm-exporter" + local pids=$(pgrep -f "$exporter_bin") + + if [[ -n "$pids" ]]; then + log_info "发现其他 dcgm-exporter 进程,正在停止..." + for pid in $pids; do + if [[ "$pid" != "$$" ]]; then + kill "$pid" > /dev/null 2>&1 || true + sleep 1 + if kill -0 "$pid" 2>/dev/null; then + log_warning "进程 $pid 未响应,强制终止..." 
+ kill -9 "$pid" > /dev/null 2>&1 || true + fi + fi + done + log_success "所有 dcgm-exporter 进程已停止" + fi +} + +# 安装 DCGM Exporter 二进制文件 +install_dcgm_exporter() { + log_info "安装 DCGM Exporter..." + + local binary_file="bin/dcgm-exporter" + local install_dir="/usr/local/bin" + + if [[ ! -f "$binary_file" ]]; then + log_error "找不到 DCGM Exporter 二进制文件: $binary_file" + exit 1 + fi + + # 停止可能运行的服务 + stop_existing_service + + # 复制二进制文件 + cp "$binary_file" "$install_dir/" + chmod +x "$install_dir/dcgm-exporter" + + log_success "DCGM Exporter 二进制文件安装完成" +} + +# 安装配置文件 +install_config() { + log_info "安装配置文件..." + + local config_dir="/etc/dcgm-exporter" + local config_file="config/default-counters.csv" + + # 创建配置目录 + mkdir -p "$config_dir" + + if [[ -f "$config_file" ]]; then + cp "$config_file" "$config_dir/" + log_success "配置文件安装完成" + else + log_warning "未找到配置文件,使用默认配置" + fi +} + +# 启动 DCGM Exporter 服务 +start_dcgm_exporter() { + log_info "启动 DCGM Exporter 服务..." + + local binary_path="/usr/local/bin/dcgm-exporter" + local log_file="/var/log/dcgm-exporter.log" + local pid_file="/var/run/dcgm-exporter.pid" + + # 检查服务是否已经在运行 + if [[ -f "$pid_file" ]]; then + local pid=$(cat "$pid_file") + if kill -0 "$pid" 2>/dev/null; then + log_info "DCGM Exporter 服务已在运行 (PID: $pid)" + return 0 + else + log_warning "发现过期的 PID 文件,正在清理..." + rm -f "$pid_file" + fi + fi + + # 检查端口是否被占用 + if netstat -tuln 2>/dev/null | grep -q ":9400 "; then + log_warning "端口 9400 已被占用,请检查是否有其他服务在运行" + return 1 + fi + + # 启动服务 + log_info "正在启动 DCGM Exporter..." + nohup "$binary_path" --address=:9400 > "$log_file" 2>&1 & + local pid=$! 
+ + # 保存 PID + echo "$pid" > "$pid_file" + + # 等待服务启动 + sleep 2 + + # 检查服务是否成功启动 + if kill -0 "$pid" 2>/dev/null; then + log_success "DCGM Exporter 服务启动成功 (PID: $pid)" + log_info "日志文件: $log_file" + log_info "PID 文件: $pid_file" + + # 更新安装记录 + update_install_record "$pid" "$INSTALL_DIR" + else + log_error "DCGM Exporter 服务启动失败" + rm -f "$pid_file" + return 1 + fi +} + + + +# 显示安装信息 +show_install_info() { + log_success "DCGM Exporter 安装完成!" + echo + echo "安装信息:" + echo " 二进制文件: /usr/local/bin/dcgm-exporter" + echo " 配置文件: /etc/dcgm-exporter/default-counters.csv" + echo " 默认端口: 9400" + echo + echo "使用方法:" + echo " 1. 启动 DCGM 服务:" + echo " sudo systemctl start dcgm" + echo " 或: nohup nv-hostengine > /var/log/nv-hostengine.log 2>&1 &" + echo " 2. 启动 DCGM Exporter:" + echo " /usr/local/bin/dcgm-exporter --address=:9400" + echo " 或: nohup /usr/local/bin/dcgm-exporter --address=:9400 &" + echo + echo "测试连接:" + echo " curl http://localhost:9400/metrics" + echo +} + +# 主函数 +main() { + echo "==========================================" + echo " DCGM Exporter 安装脚本 v1.0" + echo "==========================================" + echo + + check_root + check_system + + log_info "开始安装 DCGM Exporter..." 
+ + install_dcgm_dependency + check_dcgm_service + install_dcgm_exporter + install_config + start_dcgm_exporter + + show_install_info +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/package.sh b/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/package.sh new file mode 100755 index 0000000..103913f --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/package.sh @@ -0,0 +1,88 @@ +#!/bin/bash + +set -e + +# 颜色定义 +GREEN='\033[0;32m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +# 获取当前目录 +CURRENT_DIR=$(pwd) +PACKAGE_NAME="dcgm-exporter-$(date +%Y%m%d-%H%M%S)" +PACKAGE_FILE="${PACKAGE_NAME}.tar.gz" + +log_info "开始打包 DCGM Exporter 安装包..." + +# 检查必要文件 +log_info "检查必要文件..." + +required_files=( + "install.sh" + "uninstall.sh" + "bin/dcgm-exporter" + "bin/datacenter-gpu-manager_3.3.9_amd64.deb" + "check_health.sh" +) + +missing_files=() +for file in "${required_files[@]}"; do + if [[ ! -f "$file" ]]; then + missing_files+=("$file") + fi +done + +if [[ ${#missing_files[@]} -gt 0 ]]; then + echo "缺少以下文件:" + for file in "${missing_files[@]}"; do + echo " - $file" + done + exit 1 +fi + +log_success "所有必要文件检查完成" + +# 创建临时目录 +TEMP_DIR=$(mktemp -d) +log_info "创建临时目录: $TEMP_DIR" + +# 复制文件到临时目录 +cp -r . "$TEMP_DIR/$PACKAGE_NAME" + +# 进入临时目录 +cd "$TEMP_DIR" + +# 创建压缩包 +log_info "创建压缩包: $PACKAGE_FILE" +tar -czf "$PACKAGE_FILE" "$PACKAGE_NAME" + +# 移动压缩包到原目录 +mv "$PACKAGE_FILE" "$CURRENT_DIR/" + +# 清理临时目录 +rm -rf "$TEMP_DIR" + +# 返回原目录 +cd "$CURRENT_DIR" + +# 显示结果 +log_success "打包完成!" +echo +echo "安装包文件: $PACKAGE_FILE" +echo "文件大小: $(du -h "$PACKAGE_FILE" | cut -f1)" +echo +echo "使用方法:" +echo "1. 将 $PACKAGE_FILE 传输到目标服务器" +echo "2. 解压: tar -xzf $PACKAGE_FILE" +echo "3. 进入目录: cd $PACKAGE_NAME" +echo "4. 
运行安装: sudo ./install.sh" +echo +echo "注意: 请确保 config/default-counters.csv 文件存在" diff --git a/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/uninstall.sh b/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/uninstall.sh new file mode 100755 index 0000000..816a8ae --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/uninstall.sh @@ -0,0 +1,216 @@ +#!/bin/bash + +# DCGM Exporter 卸载脚本 +# 版本: 1.0 +# 作者: AIOps Team +# 日期: $(date +%Y-%m-%d) + +set -e + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 检查是否为 root 用户 +check_root() { + if [[ $EUID -ne 0 ]]; then + log_error "此脚本需要 root 权限运行" + log_info "请使用: sudo $0" + exit 1 + fi +} + +# 停止运行中的进程 +stop_processes() { + log_info "停止 DCGM Exporter 进程..." + + local pid_file="/var/run/dcgm-exporter.pid" + local stopped=false + + # 首先尝试通过 PID 文件停止服务 + if [[ -f "$pid_file" ]]; then + local pid=$(cat "$pid_file") + if kill -0 "$pid" 2>/dev/null; then + log_info "通过 PID 文件停止服务 (PID: $pid)..." + kill "$pid" + sleep 3 + + # 检查进程是否已停止 + if kill -0 "$pid" 2>/dev/null; then + log_warning "进程未响应,强制终止..." + kill -9 "$pid" 2>/dev/null || true + fi + log_success "DCGM Exporter 进程已停止" + stopped=true + else + log_warning "PID 文件存在但进程已不存在,清理 PID 文件" + rm -f "$pid_file" + fi + fi + + # 查找并杀死所有 dcgm-exporter 进程 + local pids=$(pgrep -f "dcgm-exporter" 2>/dev/null || true) + if [[ -n "$pids" ]]; then + log_info "发现 dcgm-exporter 进程,正在停止..." 
+ for pid in $pids; do + log_info "停止进程 PID: $pid" + kill "$pid" 2>/dev/null || true + done + sleep 2 + + # 检查是否还有进程在运行,如果有则强制终止 + local remaining_pids=$(pgrep -f "dcgm-exporter" 2>/dev/null || true) + if [[ -n "$remaining_pids" ]]; then + log_warning "进程未响应,强制终止..." + for pid in $remaining_pids; do + log_info "强制终止进程 PID: $pid" + kill -9 "$pid" 2>/dev/null || true + done + sleep 1 + fi + + # 最终检查 + if pgrep -f "dcgm-exporter" > /dev/null; then + log_error "无法停止所有 dcgm-exporter 进程" + else + log_success "所有 DCGM Exporter 进程已停止" + stopped=true + fi + else + log_info "DCGM Exporter 进程未运行" + fi + + # 清理 PID 文件 + rm -f "$pid_file" + + if [[ "$stopped" == "false" ]]; then + log_warning "未发现需要停止的 DCGM Exporter 进程" + fi +} + +# 删除二进制文件 +remove_binary() { + log_info "删除 DCGM Exporter 二进制文件..." + + local binary_file="/usr/local/bin/dcgm-exporter" + + if [[ -f "$binary_file" ]]; then + rm -f "$binary_file" + log_success "二进制文件已删除" + else + log_info "二进制文件不存在" + fi +} + +# 删除配置文件 +remove_config() { + log_info "删除配置文件..." + + local config_dir="/etc/dcgm-exporter" + + if [[ -d "$config_dir" ]]; then + rm -rf "$config_dir" + log_success "配置目录已删除" + else + log_info "配置目录不存在" + fi +} + +# 卸载 DCGM 依赖(可选) +remove_dcgm_dependency() { + log_info "检查 DCGM 依赖状态..." + + # 检查是否安装了 DCGM 包 + if dpkg -l | grep -q datacenter-gpu-manager; then + log_info "检测到 DCGM 依赖包已安装" + log_warning "DCGM 是系统级依赖,可能被其他应用程序使用" + log_info "为了系统稳定性,将保留 DCGM 依赖包" + log_info "如需手动卸载,请运行: sudo apt-get remove --purge datacenter-gpu-manager" + else + log_info "DCGM 依赖包未安装" + fi +} + +# 清理日志文件 +cleanup_logs() { + log_info "清理日志文件..." + + # 清理 journal 日志 + journalctl --vacuum-time=1s --quiet || true + + # 删除可能的日志文件 + rm -f /var/log/nv-hostengine.log + rm -f /var/log/dcgm-exporter.log + + log_success "日志文件已清理" +} + +# 显示卸载信息 +show_uninstall_info() { + log_success "DCGM Exporter 卸载完成!" 
+ echo + echo "已删除的内容:" + echo "  - 二进制文件: /usr/local/bin/dcgm-exporter" + echo "  - 配置目录: /etc/dcgm-exporter" + echo "  - 相关日志文件" + echo + echo "注意:" + echo "  - DCGM 依赖包可能仍然存在" + echo "  - 如需完全清理,请手动检查并删除相关文件" + echo +} + +# 主函数 +main() { + echo "==========================================" + echo "      DCGM Exporter 卸载脚本 v1.0" + echo "==========================================" + echo + + check_root + + log_warning "此操作将完全卸载 DCGM Exporter" + read -p "确认继续?(y/N): " confirm + + if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then + log_info "取消卸载操作" + exit 0 + fi + + log_info "开始卸载 DCGM Exporter..." + + stop_processes + remove_binary + remove_config + cleanup_logs + + # 询问是否卸载 DCGM 依赖 + remove_dcgm_dependency + + show_uninstall_info +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/README.md b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/README.md new file mode 100644 index 0000000..ca8ce92 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/README.md @@ -0,0 +1,181 @@ +# Fluent Bit 安装包 + +这是一个 Fluent Bit 的自动化安装包,提供了完整的安装、卸载和健康检查功能。 + +## 目录结构 + +``` +fluent-bit/ +├── install.sh # 安装脚本 +├── uninstall.sh # 卸载脚本 +├── package.sh # 打包脚本 +├── check_health.sh # 健康检查脚本 +├── bin/ +│ └── fluent-bit_3.1.9_amd64.deb # Fluent Bit 安装包 +└── config/ + ├── fluent-bit.conf # 主配置文件 + ├── inject_labels.lua # Lua 脚本 + ├── parsers.conf # 解析器配置 + ├── inputs.d/ # 输入配置目录 + │ ├── 10-train.conf + │ └── 20-infer.conf + └── outputs.d/ # 输出配置目录 + └── 10-es.conf +``` + +## 功能特性 + +- **自动化安装**: 一键安装 Fluent Bit 及其依赖 +- **配置管理**: 自动部署预配置的配置文件 +- **服务管理**: 自动启动和停止 Fluent Bit 服务 +- **健康检查**: 提供 JSON 格式的健康状态检查 +- **完整卸载**: 彻底清理所有相关文件和配置 +- **用户管理**: 自动创建专用的 fluent-bit 用户 + +## 使用方法 + +### 1. 打包安装包 + +```bash +./package.sh +``` + +这将创建一个带时间戳的压缩包,例如:`fluent-bit-20250924-160954.tar.gz` + +### 2. 
安装 Fluent Bit + +```bash +# 解压安装包 +tar -xzf fluent-bit-*.tar.gz +cd fluent-bit-* + +# 运行安装脚本(需要 root 权限) +sudo ./install.sh +``` + +### 3. 健康检查 + +```bash +./check_health.sh +``` + +输出示例: +```json +{"name": "fluent-bit", "status": "health", "reason": "success"} +``` + +### 4. 卸载 Fluent Bit + +```bash +sudo ./uninstall.sh +``` + +## 安装后的文件位置 + +- **二进制文件**: `/opt/fluent-bit/bin/fluent-bit` +- **配置文件**: `/etc/fluent-bit/` +- **日志文件**: `/var/log/fluent-bit.log` +- **缓冲区目录**: `/buffers/` +- **运行用户**: `fluent-bit` +- **HTTP 端口**: `2020` + +## 配置说明 + +### 主配置文件 + +主配置文件位于 `/etc/fluent-bit/fluent-bit.conf`,包含以下主要部分: + +- **SERVICE**: 服务配置,包括 HTTP 服务器设置 +- **INPUT**: 输入配置,通过 `inputs.d/` 目录管理 +- **FILTER**: 过滤器配置,包括解析器和标签注入 +- **OUTPUT**: 输出配置,通过 `outputs.d/` 目录管理 + +### 输入配置 + +- `10-train.conf`: 训练日志输入配置 +- `20-infer.conf`: 推理日志输入配置 + +### 输出配置 + +- `10-es.conf`: Elasticsearch 输出配置 + +## 服务管理 + +### 手动启动 + +```bash +/opt/fluent-bit/bin/fluent-bit --config=/etc/fluent-bit/fluent-bit.conf +``` + +### 后台启动 + +```bash +nohup /opt/fluent-bit/bin/fluent-bit --config=/etc/fluent-bit/fluent-bit.conf & +``` + +### 检查服务状态 + +```bash +# 检查进程 +ps aux | grep fluent-bit + +# 检查端口 +netstat -tuln | grep 2020 + +# 检查日志 +tail -f /var/log/fluent-bit.log +``` + +## API 接口 + +Fluent Bit 提供 HTTP API 用于监控和管理: + +- **根路径**: `http://localhost:2020` +- **状态接口**: `http://localhost:2020/api/v1/status` +- **指标接口**: `http://localhost:2020/api/v1/metrics` + +## 故障排除 + +### 常见问题 + +1. **端口被占用** + - 检查端口 2020 是否被其他服务占用 + - 修改配置文件中的端口设置 + +2. **权限问题** + - 确保 fluent-bit 用户有足够的权限访问日志文件 + - 检查目录权限设置 + +3. 
**配置文件错误** + - 检查配置文件语法 + - 查看日志文件中的错误信息 + +### 日志查看 + +```bash +# 查看服务日志 +tail -f /var/log/fluent-bit/fluent-bit.log + +# 查看系统日志 +journalctl -u fluent-bit -f +``` + +## 系统要求 + +- **操作系统**: Ubuntu/Debian/CentOS/RHEL/Fedora +- **架构**: x86_64/amd64 +- **权限**: root 权限(用于安装和卸载) +- **依赖**: curl(用于健康检查) + +## 版本信息 + +- **Fluent Bit 版本**: 3.1.9 +- **安装包版本**: 1.0 +- **支持架构**: amd64 + +## 注意事项 + +1. 安装前请确保系统已更新 +2. 卸载时会保留 fluent-bit 用户(系统用户,可能被其他服务使用) +3. 配置文件包含环境变量,请根据实际环境调整 +4. 建议在生产环境使用前进行充分测试 diff --git a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/fluent-bit_3.1.9_amd64.deb b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/fluent-bit_3.1.9_amd64.deb new file mode 100644 index 0000000..f52cb53 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/fluent-bit_3.1.9_amd64.deb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bdc163534a062c3addd705a65326800b4e362a0f54a891ed0bb8776556e2361 +size 42047204 diff --git a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libpq5_14.19-0ubuntu0.22.04.1_amd64.deb b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libpq5_14.19-0ubuntu0.22.04.1_amd64.deb new file mode 100644 index 0000000..e731f32 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libpq5_14.19-0ubuntu0.22.04.1_amd64.deb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4610f6aae2b19dcc326458aaa596d06f965d0a00abb36ea3317c7157a60fd1ce +size 152282 diff --git a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libyaml-0-2_0.2.2-1build2_amd64.deb b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libyaml-0-2_0.2.2-1build2_amd64.deb new file mode 100644 index 0000000..474abdc --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libyaml-0-2_0.2.2-1build2_amd64.deb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:b137d89a463b671383b6eaec404a494c8bd630a4adb79fc059c3aa48af170dcb +size 51622 diff --git a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/check_health.sh b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/check_health.sh new file mode 100755 index 0000000..37f4090 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/check_health.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +# Fluent Bit 健康检查脚本 +# 输出 JSON 格式结果 + +set -e + +# 检查 Fluent Bit 健康状态 +check_health() { + local name="fluent-bit" + local status="unhealth" + local reason="" + local install_record="/opt/argus-metric/current/.install_record" + + # 首先尝试通过安装记录文件检查进程 + if [[ -f "$install_record" ]]; then + # 尝试使用jq解析JSON格式的安装记录文件 + local pid="" + if command -v jq &> /dev/null; then + pid=$(jq -r '.components."fluent-bit".pid // empty' "$install_record" 2>/dev/null || echo "") + else + # 如果没有jq,使用简单的文本解析方法 + pid=$(grep -A 10 '"fluent-bit"' "$install_record" | grep '"pid"' | cut -d'"' -f4 | head -1) + fi + + if [[ -n "$pid" && "$pid" =~ ^[0-9]+$ ]]; then + if kill -0 "$pid" 2>/dev/null; then + # 进程存在且运行正常 + status="health" + reason="进程运行正常 (PID: $pid)" + echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}" + exit 0 + else + reason="安装记录中的 PID $pid 进程不存在" + echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}" + exit 1 + fi + else + reason="安装记录文件中未找到有效的 fluent-bit PID" + echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}" + exit 1 + fi + else + # 如果安装记录文件不存在,尝试查找 fluent-bit 进程 + local pids=$(pgrep -f "fluent-bit" 2>/dev/null || true) + if [[ -n "$pids" ]]; then + # 取第一个找到的 PID + local pid=$(echo "$pids" | head -1) + status="health" + reason="发现 fluent-bit 进程运行 (PID: $pid),但未找到安装记录" + echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}" + exit 0 + else + reason="未找到 fluent-bit 进程,且安装记录文件不存在" + echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": 
\"$reason\"}" + exit 1 + fi + fi +} + +# 主函数 +main() { + check_health +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/fluent-bit.conf b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/fluent-bit.conf new file mode 100644 index 0000000..95ed374 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/fluent-bit.conf @@ -0,0 +1,37 @@ +[SERVICE] + Daemon Off + Parsers_File parsers.conf + HTTP_Server On + HTTP_Listen 0.0.0.0 + HTTP_Port 2020 + storage.path /buffers + storage.sync normal + storage.checksum on + storage.backlog.mem_limit 128M + # 备注:该镜像默认未开启 Hot Reload,修改配置后请重启容器。 + +@INCLUDE inputs.d/*.conf + +[FILTER] + Name parser + Match app.* + Key_Name log + Parser timestamp_parser + Reserve_Data On + Preserve_Key On + Unescape_Key On + +[FILTER] + Name record_modifier + Match * + Record cluster ${CLUSTER} + Record rack ${RACK} + Record host ${HOSTNAME} + +[FILTER] + Name lua + Match app.* + script inject_labels.lua + call add_labels + +@INCLUDE outputs.d/*.conf diff --git a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/inject_labels.lua b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/inject_labels.lua new file mode 100644 index 0000000..0d87f7a --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/inject_labels.lua @@ -0,0 +1,15 @@ +function add_labels(tag, ts, record) + record["job_id"] = os.getenv("FB_JOB_ID") or record["job_id"] or "unknown" + record["user"] = os.getenv("FB_USER") or record["user"] or "unknown" + record["model"] = os.getenv("FB_MODEL") or record["model"] or "unknown" + record["gpu_id"] = os.getenv("FB_GPU_ID") or record["gpu_id"] or "na" + local p = record["log_path"] or "" + if string.find(p, "/logs/infer/") then + record["role"] = "infer" + elseif string.find(p, "/logs/train/") then + record["role"] = "train" 
+ else + record["role"] = record["role"] or "app" + end + return 1, ts, record +end diff --git a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/inputs.d/10-train.conf b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/inputs.d/10-train.conf new file mode 100644 index 0000000..3ea9e25 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/inputs.d/10-train.conf @@ -0,0 +1,10 @@ +[INPUT] + Name tail + Path /logs/train/*.log + Tag app.train + Path_Key log_path + Refresh_Interval 5 + DB /buffers/train.db + Skip_Long_Lines On + storage.type filesystem + multiline.parser python,go,java diff --git a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/inputs.d/20-infer.conf b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/inputs.d/20-infer.conf new file mode 100644 index 0000000..793e203 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/inputs.d/20-infer.conf @@ -0,0 +1,10 @@ +[INPUT] + Name tail + Path /logs/infer/*.log + Tag app.infer + Path_Key log_path + Refresh_Interval 5 + DB /buffers/infer.db + Skip_Long_Lines On + storage.type filesystem + multiline.parser python,go,java diff --git a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/outputs.d/10-es.conf b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/outputs.d/10-es.conf new file mode 100644 index 0000000..f273270 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/outputs.d/10-es.conf @@ -0,0 +1,24 @@ +# 重要:使用 Logstash_Format + Logstash_Prefix,生成 train-*/infer-* 索引 +[OUTPUT] + Name es + Match app.train + Host ${ES_HOST:-localhost} + Port ${ES_PORT:-9200} + Logstash_Format On + Logstash_Prefix train + Replace_Dots On + Generate_ID On + Retry_Limit False + Suppress_Type_Name On + +[OUTPUT] + Name es + Match app.infer + Host ${ES_HOST:-localhost} + Port ${ES_PORT:-9200} + Logstash_Format On + 
Logstash_Prefix infer + Replace_Dots On + Generate_ID On + Retry_Limit False + Suppress_Type_Name On diff --git a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/parsers.conf b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/parsers.conf new file mode 100644 index 0000000..d86fa06 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/parsers.conf @@ -0,0 +1,27 @@ +[MULTILINE_PARSER] + Name python + Type regex + Flush 2 + Rule "start_state" "/^\d{4}-\d{2}-\d{2}[\sT]/" "cont" + Rule "cont" "/^\s+|^Traceback|^\tat\s+/" "cont" + +[MULTILINE_PARSER] + Name go + Type regex + Flush 2 + Rule "start_state" "/^[0-9]{4}\/[0-9]{2}\/[0-9]{2}/" "cont" + Rule "cont" "/^\s+|^\t/" "cont" + +[MULTILINE_PARSER] + Name java + Type regex + Flush 2 + Rule "start_state" "/^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}/" "cont" + Rule "cont" "/^\s+at\s+|^\t.../" "cont" + +[PARSER] + Name timestamp_parser + Format regex + Regex ^(?<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s+(?<level>\w+)\s+(?<message>.*)$ + Time_Key timestamp + Time_Format %Y-%m-%d %H:%M:%S diff --git a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/install.sh b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/install.sh new file mode 100755 index 0000000..aef6e34 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/install.sh @@ -0,0 +1,291 @@ +#!/bin/bash + +set -e + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +log_info "Starting Fluent Bit installation..." 
+ +# 解析命令行参数 +INSTALL_DIR="${1:-/opt/argus-metric/current}" + +# 更新安装记录 +update_install_record() { + local pid="$1" + # 使用传入的安装目录参数,如果没有则使用默认值 + local install_base_dir="${2:-/opt/argus-metric/current}" + local install_record="$install_base_dir/.install_record" + + # 如果安装记录文件不存在,说明是首次安装,由主安装脚本统一创建 + if [[ ! -f "$install_record" ]]; then + log_info "安装记录文件不存在,将由主安装脚本创建" + return 0 + fi + + # 如果文件存在,说明是重启场景,只更新 PID 字段 + if command -v jq &> /dev/null; then + # 读取当前 PID + local current_pid=$(jq -r '.components."fluent-bit".pid // ""' "$install_record" 2>/dev/null) + + if [[ -z "$current_pid" ]]; then + log_warning "无法读取当前 PID,跳过更新" + return 1 + fi + + # 使用 jq 只更新 pid 字段,保持字符串类型,保留其他字段 + jq --arg new_pid "$pid" '.components."fluent-bit".pid = $new_pid' "$install_record" > "$install_record.tmp" && mv "$install_record.tmp" "$install_record" + log_info "PID updated: $current_pid -> $pid" + else + log_warning "jq 命令不可用,无法更新安装记录文件" + fi +} + +# 检查是否为 root 用户 +if [[ $EUID -ne 0 ]]; then + log_error "This script requires root privileges" + log_info "Please use: sudo $0" + exit 1 +fi + +# 停止可能运行的服务 +log_info "Stopping existing fluent-bit processes..." + +# 只匹配进程名为 fluent-bit 的进程 +pids=$(pgrep -x fluent-bit 2>/dev/null || true) + +if [[ -n "$pids" ]]; then + for pid in $pids; do + log_info "Stopping process PID: $pid" + kill "$pid" 2>/dev/null || true + done + sleep 2 + + # 检查是否还有残留进程 + remaining_pids=$(pgrep -x fluent-bit 2>/dev/null || true) + if [[ -n "$remaining_pids" ]]; then + log_warning "Force killing unresponsive processes..." + for pid in $remaining_pids; do + kill -9 "$pid" 2>/dev/null || true + done + fi +fi + +# 安装 Fluent Bit 依赖库 libpq5(离线模式) +log_info "Checking Fluent Bit dependency: libpq5 ..." +if ! 
ldconfig -p | grep -q libpq.so.5; then + if ls bin/libpq5_*.deb >/dev/null 2>&1; then + log_info "Installing local dependency package: libpq5" + DEBIAN_FRONTEND=noninteractive dpkg -i bin/libpq5_*.deb >/dev/null 2>&1 || { + log_error "Failed to install libpq5 from bin/, please check package validity" + exit 1 + } + else + log_error "Missing dependency: libpq5 (libpq.so.5). Please put bin/libpq5_*.deb in the bin/ directory." + exit 1 + fi +else + log_info "libpq.so.5 already present on system" +fi + +# 安装 Fluent Bit 依赖库 libyaml-0-2(离线模式) +log_info "Checking Fluent Bit dependency: libyaml-0.so.2 ..." +if ! ldconfig -p | grep -q libyaml-0.so.2; then + if ls bin/libyaml-0-2_*.deb >/dev/null 2>&1; then + log_info "Installing local dependency package: libyaml-0-2" + DEBIAN_FRONTEND=noninteractive dpkg -i bin/libyaml-0-2_*.deb >/dev/null 2>&1 || { + log_error "Failed to install libyaml-0-2 from bin/, please check package validity" + exit 1 + } + else + log_error "Missing dependency: libyaml-0-2 (libyaml-0.so.2). Please put bin/libyaml-0-2_*.deb in the bin/ directory." + exit 1 + fi +else + log_info "libyaml-0.so.2 already present on system" +fi + +# 清理可能存在的旧 fluent-bit 安装(避免配置文件冲突) +log_info "Cleaning up old fluent-bit installation if exists..." +if dpkg -l | grep -q "^ii.*fluent-bit"; then + log_info "Found existing fluent-bit package, removing..." + dpkg --purge fluent-bit 2>/dev/null || true + apt-get remove --purge -y fluent-bit 2>/dev/null || true +fi + +# 确保清理残留的配置文件 +if [[ -d "/etc/fluent-bit" ]]; then + log_info "Removing old fluent-bit configuration directory..." + rm -rf /etc/fluent-bit +fi + +# 安装 Fluent Bit 主包 +log_info "Installing Fluent Bit from deb package..." +deb_file="bin/fluent-bit_3.1.9_amd64.deb" +if [[ ! 
-f "$deb_file" ]]; then + log_error "Fluent Bit package not found: $deb_file" + exit 1 +fi + +DEBIAN_FRONTEND=noninteractive dpkg -i "$deb_file" >/dev/null 2>&1 || true + +# 验证 Fluent Bit 可以运行 +fb_version=$(/opt/fluent-bit/bin/fluent-bit --version 2>&1 | head -1) +log_info "Fluent Bit version: $fb_version" + +# 创建 fluent-bit 用户 +log_info "Creating fluent-bit user..." +if ! id "fluent-bit" &>/dev/null; then + useradd --no-create-home --shell /bin/false fluent-bit +fi + +# 创建配置目录 +log_info "Installing configuration files..." +mkdir -p /etc/fluent-bit +if [[ -d "config" ]]; then + cp -r config/* /etc/fluent-bit/ + chown -R fluent-bit:fluent-bit /etc/fluent-bit +fi + +# 创建日志和缓冲区目录 +log_info "Creating log and buffer directories..." +mkdir -p /logs/train /logs/infer /buffers +chmod 755 /logs/train /logs/infer +chmod 770 /buffers +chown -R fluent-bit:fluent-bit /logs /buffers + +# 启动 Fluent Bit +log_info "Starting Fluent Bit with configuration from /etc/fluent-bit/" +config_path="/etc/fluent-bit/fluent-bit.conf" + +if [[ ! -f "$config_path" ]]; then + log_error "Configuration file not found: $config_path" + exit 1 +fi + +# 设置环境变量 +log_info "Setting environment variables..." + +# 获取非 127.0.0.1 的 IP 地址作为 HOSTNAME +if [[ -z "${HOSTNAME:-}" ]]; then + # 获取 177.x.x.x 段的 IP 地址 + HOSTNAME=$(ip route get 8.8.8.8 2>/dev/null | grep -oP 'src \K\S+' | grep '^177\.' | head -1) + + # 如果没有找到 177.x.x.x 段的 IP,则获取第一个非 127.0.0.1 的 IP + if [[ -z "$HOSTNAME" ]]; then + HOSTNAME=$(ip route get 8.8.8.8 2>/dev/null | grep -oP 'src \K\S+' | grep -v '^127\.' 
| head -1) + fi + + # 如果还是没有找到,使用 hostname 命令 + if [[ -z "$HOSTNAME" ]]; then + HOSTNAME=$(hostname) + fi +fi +export HOSTNAME + +export CLUSTER="${CLUSTER:-local}" +export RACK="${RACK:-dev}" +export ES_HOST="${ES_HOST:-localhost}" +export ES_PORT="${ES_PORT:-9200}" + +log_info "Environment variables:" +log_info " CLUSTER=$CLUSTER" +log_info " RACK=$RACK" +log_info " HOSTNAME=$HOSTNAME" +log_info " ES_HOST=$ES_HOST" +log_info " ES_PORT=$ES_PORT" + +# 检查 fluent-bit 二进制文件 +log_info "[DEBUG] Checking fluent-bit binary..." +if [[ ! -f "/opt/fluent-bit/bin/fluent-bit" ]]; then + log_error "fluent-bit binary not found at /opt/fluent-bit/bin/fluent-bit" + exit 1 +fi +log_info "[DEBUG] fluent-bit binary exists and is executable: $(ls -lh /opt/fluent-bit/bin/fluent-bit)" + +# 检查配置文件 +log_info "[DEBUG] Checking configuration file: $config_path" +if [[ ! -f "$config_path" ]]; then + log_error "Configuration file not found: $config_path" + exit 1 +fi +log_info "[DEBUG] Configuration file exists: $(ls -lh $config_path)" + +# 显示完整的启动命令 +log_info "[DEBUG] Full command to execute:" +log_info "[DEBUG] su -s /bin/bash fluent-bit -c 'env CLUSTER=\"$CLUSTER\" RACK=\"$RACK\" HOSTNAME=\"$HOSTNAME\" ES_HOST=\"$ES_HOST\" ES_PORT=\"$ES_PORT\" /opt/fluent-bit/bin/fluent-bit --config=\"$config_path\"'" + +# 清空或创建日志文件 +log_info "[DEBUG] Preparing log file: /var/log/fluent-bit.log" +: > /var/log/fluent-bit.log +chmod 666 /var/log/fluent-bit.log + +log_info "Command: /opt/fluent-bit/bin/fluent-bit --config=$config_path" +log_info "[DEBUG] Starting fluent-bit process as fluent-bit user (using su)..." +nohup su -s /bin/bash fluent-bit -c "env CLUSTER='$CLUSTER' RACK='$RACK' HOSTNAME='$HOSTNAME' ES_HOST='$ES_HOST' ES_PORT='$ES_PORT' /opt/fluent-bit/bin/fluent-bit --config='$config_path' >> /var/log/fluent-bit.log 2>&1" & + +bg_pid=$! +log_info "[DEBUG] Background process started with PID: $bg_pid" + +# 等待服务启动 +log_info "[DEBUG] Waiting 3 seconds for service to start..." 
+sleep 3 + +# 查找实际的 fluent-bit 进程 PID +log_info "[DEBUG] Searching for fluent-bit process..." +log_info "[DEBUG] Running: pgrep -u fluent-bit -x fluent-bit" +actual_pid=$(pgrep -u fluent-bit -x fluent-bit | head -1) + +# 显示所有 fluent-bit 相关进程 +log_info "[DEBUG] All fluent-bit related processes:" +ps aux | grep fluent-bit | grep -v grep || log_warning "No fluent-bit processes found in ps output" + +if [[ -n "$actual_pid" ]]; then + log_success "Fluent Bit started successfully (PID: $actual_pid)" + log_info "[DEBUG] Process details: $(ps -p $actual_pid -o pid,user,cmd --no-headers)" + + # 更新安装记录 + update_install_record "$actual_pid" "$INSTALL_DIR" +else + log_error "Fluent Bit failed to start - no fluent-bit process found" + log_info "[DEBUG] Checking if background process $bg_pid still exists..." + if ps -p $bg_pid > /dev/null 2>&1; then + log_warning "Background shell process $bg_pid still exists" + else + log_warning "Background shell process $bg_pid has exited" + fi + + log_info "[DEBUG] Last 20 lines of /var/log/fluent-bit.log:" + if [[ -f "/var/log/fluent-bit.log" ]]; then + tail -20 /var/log/fluent-bit.log | while IFS= read -r line; do + log_info "[LOG] $line" + done + else + log_error "Log file /var/log/fluent-bit.log does not exist" + fi + + exit 1 +fi + +log_success "Fluent Bit installation completed!" 
diff --git a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/package.sh b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/package.sh new file mode 100755 index 0000000..faf702b --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/package.sh @@ -0,0 +1,87 @@ +#!/bin/bash + +set -e + +# 颜色定义 +GREEN='\033[0;32m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +# 获取当前目录 +CURRENT_DIR=$(pwd) +PACKAGE_NAME="fluent-bit-$(date +%Y%m%d-%H%M%S)" +PACKAGE_FILE="${PACKAGE_NAME}.tar.gz" + +log_info "开始打包 Fluent Bit 安装包..." + +# 检查必要文件 +log_info "检查必要文件..." + +required_files=( + "install.sh" + "uninstall.sh" + "bin/fluent-bit_3.1.9_amd64.deb" + "check_health.sh" +) + +missing_files=() +for file in "${required_files[@]}"; do + if [[ ! -f "$file" ]]; then + missing_files+=("$file") + fi +done + +if [[ ${#missing_files[@]} -gt 0 ]]; then + echo "缺少以下文件:" + for file in "${missing_files[@]}"; do + echo " - $file" + done + exit 1 +fi + +log_success "所有必要文件检查完成" + +# 创建临时目录 +TEMP_DIR=$(mktemp -d) +log_info "创建临时目录: $TEMP_DIR" + +# 复制文件到临时目录 +cp -r . "$TEMP_DIR/$PACKAGE_NAME" + +# 进入临时目录 +cd "$TEMP_DIR" + +# 创建压缩包 +log_info "创建压缩包: $PACKAGE_FILE" +tar -czf "$PACKAGE_FILE" "$PACKAGE_NAME" + +# 移动压缩包到原目录 +mv "$PACKAGE_FILE" "$CURRENT_DIR/" + +# 清理临时目录 +rm -rf "$TEMP_DIR" + +# 返回原目录 +cd "$CURRENT_DIR" + +# 显示结果 +log_success "打包完成!" +echo +echo "安装包文件: $PACKAGE_FILE" +echo "文件大小: $(du -h "$PACKAGE_FILE" | cut -f1)" +echo +echo "使用方法:" +echo "1. 将 $PACKAGE_FILE 传输到目标服务器" +echo "2. 解压: tar -xzf $PACKAGE_FILE" +echo "3. 进入目录: cd $PACKAGE_NAME" +echo "4. 
运行安装: sudo ./install.sh" +echo +echo "注意: 请确保所有必要文件都存在" diff --git a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/uninstall.sh b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/uninstall.sh new file mode 100755 index 0000000..ceba076 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/uninstall.sh @@ -0,0 +1,169 @@ +#!/bin/bash +set -euo pipefail + +echo "[INFO] Starting Fluent Bit uninstallation..." + +# 检查是否为 root 用户 +if [[ $EUID -ne 0 ]]; then + echo "[ERROR] This script requires root privileges" + echo "[INFO] Please use: sudo $0" + exit 1 +fi + +echo "[WARNING] This operation will completely uninstall Fluent Bit" +read -p "Confirm to continue? (y/N): " confirm + +if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then + echo "[INFO] Uninstallation cancelled" + exit 0 +fi + +# 停止运行中的进程 +echo "[INFO] Stopping Fluent Bit processes..." +install_record="/opt/argus-metric/current/.install_record" +stopped=false + +# 首先尝试通过安装记录文件停止服务 +if [[ -f "$install_record" ]]; then + # 尝试使用jq解析JSON格式的安装记录文件 + pid="" + if command -v jq &> /dev/null; then + pid=$(jq -r '.components."fluent-bit".pid // empty' "$install_record" 2>/dev/null || echo "") + else + # 如果没有jq,使用简单的文本解析方法 + pid=$(grep -A 10 '"fluent-bit"' "$install_record" | grep '"pid"' | cut -d'"' -f4 | head -1) + fi + + if [[ -n "$pid" && "$pid" =~ ^[0-9]+$ ]]; then + if kill -0 "$pid" 2>/dev/null; then + echo "[INFO] Stopping service via installation record (PID: $pid)..." + kill "$pid" + sleep 3 + + # 检查进程是否已停止 + if kill -0 "$pid" 2>/dev/null; then + echo "[WARNING] Process unresponsive, force killing..." + kill -9 "$pid" 2>/dev/null || true + fi + echo "[SUCCESS] Fluent Bit process stopped" + stopped=true + else + echo "[WARNING] PID in installation record no longer exists" + fi + fi +fi + +# 查找并杀死所有 fluent-bit 进程 +pids=$(pgrep -f "fluent-bit" 2>/dev/null || true) +if [[ -n "$pids" ]]; then + echo "[INFO] Found fluent-bit processes, stopping..." 
+ for pid in $pids; do + echo "[INFO] Stopping process PID: $pid" + kill "$pid" 2>/dev/null || true + done + sleep 2 + + # 检查是否还有进程在运行,如果有则强制终止 + remaining_pids=$(pgrep -f "fluent-bit" 2>/dev/null || true) + if [[ -n "$remaining_pids" ]]; then + echo "[WARNING] Processes unresponsive, force killing..." + for pid in $remaining_pids; do + echo "[INFO] Force killing process PID: $pid" + kill -9 "$pid" 2>/dev/null || true + done + sleep 1 + fi + + # 最终检查 + if pgrep -f "fluent-bit" > /dev/null; then + echo "[ERROR] Unable to stop all fluent-bit processes" + else + echo "[SUCCESS] All Fluent Bit processes stopped" + stopped=true + fi +else + echo "[INFO] No Fluent Bit processes running" +fi + +if [[ "$stopped" == "false" ]]; then + echo "[WARNING] No Fluent Bit processes found to stop" +fi + +# 卸载 Fluent Bit 包 +echo "[INFO] Uninstalling Fluent Bit package..." +if dpkg -l | grep -q "fluent-bit"; then + echo "[INFO] Found fluent-bit package installed via dpkg, uninstalling..." + dpkg --remove --force-remove-reinstreq fluent-bit || true + echo "[SUCCESS] Fluent Bit package uninstalled" +else + echo "[INFO] No fluent-bit package found via package manager" +fi + +# 删除二进制文件 +echo "[INFO] Removing Fluent Bit binary files..." +binary_dir="/opt/fluent-bit" +if [[ -d "$binary_dir" ]]; then + rm -rf "$binary_dir" + echo "[SUCCESS] Binary directory removed: $binary_dir" +else + echo "[INFO] Binary directory does not exist" +fi + +# 删除配置文件 +echo "[INFO] Removing configuration files..." +config_dir="/etc/fluent-bit" +if [[ -d "$config_dir" ]]; then + rm -rf "$config_dir" + echo "[SUCCESS] Configuration directory removed" +else + echo "[INFO] Configuration directory does not exist" +fi + +# 删除数据目录 +echo "[INFO] Removing data directories..." 
+data_dirs=("/logs" "/buffers") +deleted=false +for data_dir in "${data_dirs[@]}"; do + if [[ -d "$data_dir" ]]; then + rm -rf "$data_dir" + echo "[SUCCESS] Data directory removed: $data_dir" + deleted=true + fi +done + +if [[ "$deleted" == "false" ]]; then + echo "[INFO] No data directories found" +fi + +# 清理安装记录 +echo "[INFO] Cleaning up installation record..." +if [[ -f "$install_record" ]]; then + # 从安装记录中移除 fluent-bit 条目 + sed -i '/^fluent-bit:/d' "$install_record" + echo "[SUCCESS] Installation record cleaned" +else + echo "[INFO] Installation record file does not exist" +fi + +# 检查用户状态 +echo "[INFO] Checking fluent-bit user status..." +if id "fluent-bit" &>/dev/null; then + echo "[INFO] fluent-bit user exists" + echo "[WARNING] fluent-bit is a system user, may be used by other services" + echo "[INFO] fluent-bit user will be preserved for system stability" + echo "[INFO] To manually remove, run: sudo userdel fluent-bit" +else + echo "[INFO] fluent-bit user does not exist" +fi + +echo "[SUCCESS] Fluent Bit uninstallation completed!" 
+echo +echo "Removed content:" +echo " - Binary directory: /opt/fluent-bit" +echo " - Configuration directory: /etc/fluent-bit" +echo " - Application log directory: /logs" +echo " - Buffer directory: /buffers" +echo +echo "Note:" +echo " - fluent-bit user preserved (system user, may be used by other services)" +echo " - For complete cleanup, manually check and remove related files" diff --git a/src/metric/client-plugins/all-in-one-full/plugins/node-exporter/bin/node_exporter b/src/metric/client-plugins/all-in-one-full/plugins/node-exporter/bin/node_exporter new file mode 100755 index 0000000..bccf467 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/node-exporter/bin/node_exporter @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d548f65fe29db403603c0f0c6a5d15e3ac74b6ed69ec445258e8fff4bc88601 +size 19925095 diff --git a/src/metric/client-plugins/all-in-one-full/plugins/node-exporter/check_health.sh b/src/metric/client-plugins/all-in-one-full/plugins/node-exporter/check_health.sh new file mode 100755 index 0000000..ed168e3 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/node-exporter/check_health.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +# Node Exporter 健康检查脚本 +# 输出 JSON 格式结果 + +set -e + +# 检查 Node Exporter 健康状态 +check_health() { + local url="http://localhost:9100" + local metrics_url="$url/metrics" + local name="node-exporter" + local status="unhealth" + local reason="" + + # 检查 curl 是否可用 + if ! 
command -v curl &> /dev/null; then + reason="curl 命令不可用,无法进行健康检查" + echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}" + exit 1 + fi + + # 测试根路径连接 + local http_code=$(curl -s -o /dev/null -w "%{http_code}" "$url" 2>/dev/null || echo "000") + + if [[ "$http_code" == "200" ]]; then + # 测试 metrics 端点 + local metrics_code=$(curl -s -o /dev/null -w "%{http_code}" "$metrics_url" 2>/dev/null || echo "000") + + if [[ "$metrics_code" == "200" ]]; then + status="health" + reason="success" + echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}" + exit 0 + else + reason="Metrics 端点异常 (HTTP $metrics_code)" + echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}" + exit 1 + fi + else + reason="HTTP 服务异常 (HTTP $http_code),请检查 Node Exporter 是否正在运行在端口 9100" + echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}" + exit 1 + fi +} + +# 主函数 +main() { + check_health +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/src/metric/client-plugins/all-in-one-full/plugins/node-exporter/install.sh b/src/metric/client-plugins/all-in-one-full/plugins/node-exporter/install.sh new file mode 100755 index 0000000..28ba2d1 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/node-exporter/install.sh @@ -0,0 +1,343 @@ +#!/bin/bash + +set -e + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 更新安装记录 +update_install_record() { + local pid="$1" + # 使用传入的安装目录参数,如果没有则使用默认值 + local install_base_dir="${2:-/opt/argus-metric/current}" + local install_record="$install_base_dir/.install_record" + + # 如果安装记录文件不存在,说明是首次安装,由主安装脚本统一创建 + if [[ ! 
-f "$install_record" ]]; then + log_info "安装记录文件不存在,将由主安装脚本创建" + return 0 + fi + + # 如果文件存在,说明是重启场景,只更新 PID 字段 + if command -v jq &> /dev/null; then + # 读取当前 PID + local current_pid=$(jq -r '.components."node-exporter".pid // ""' "$install_record" 2>/dev/null) + + if [[ -z "$current_pid" ]]; then + log_warning "无法读取当前 PID,跳过更新" + return 1 + fi + + # 使用 jq 只更新 pid 字段,保持字符串类型,保留其他字段 + jq --arg new_pid "$pid" '.components."node-exporter".pid = $new_pid' "$install_record" > "$install_record.tmp" && mv "$install_record.tmp" "$install_record" + log_info "PID 已更新: $current_pid -> $pid" + else + log_warning "jq 命令不可用,无法更新安装记录文件" + fi +} + +# 显示帮助信息 +show_help() { + echo "Node Exporter 安装脚本" + echo + echo "用法: $0 [选项]" + echo + echo "选项:" + echo " --help 显示此帮助信息" + echo + echo "示例:" + echo " $0 # 安装 Node Exporter" + echo +} + +# 解析命令行参数 +INSTALL_DIR="" +for arg in "$@"; do + case $arg in + --help|-h) + show_help + exit 0 + ;; + *) + # 如果参数不是以--开头,则认为是安装目录 + if [[ ! "$arg" =~ ^-- ]]; then + INSTALL_DIR="$arg" + else + log_error "未知参数: $arg" + show_help + exit 1 + fi + ;; + esac +done + +# 检查是否为 root 用户 +check_root() { + if [[ $EUID -ne 0 ]]; then + log_error "此脚本需要 root 权限运行" + log_info "请使用: sudo $0" + exit 1 + fi +} + +# 检查系统要求 +check_system() { + log_info "检查系统要求..." + + # 检查操作系统 + if [[ ! -f /etc/os-release ]]; then + log_error "无法检测操作系统版本" + exit 1 + fi + + source /etc/os-release + log_info "检测到操作系统: $NAME $VERSION" + + # 检查是否为 Linux 系统 + if [[ "$ID" != "ubuntu" && "$ID" != "debian" && "$ID" != "centos" && "$ID" != "rhel" && "$ID" != "fedora" ]]; then + log_warning "此脚本主要针对常见 Linux 发行版,其他系统可能需要调整" + fi + + # 检查系统架构 + local arch=$(uname -m) + log_info "系统架构: $arch" + + if [[ "$arch" != "x86_64" && "$arch" != "amd64" ]]; then + log_warning "当前架构为 $arch,node_exporter 主要支持 x86_64/amd64" + fi +} + +stop_existing_service() { + log_info "检查并停止可能运行的 Node Exporter 服务..." + + # 当前脚本 PID,防止误杀 + SELF_PID=$$ + + # 1. 
停止 systemd 服务(如果存在) + if systemctl list-units --full -all | grep -q "node_exporter.service"; then + log_info "检测到 systemd 服务 node_exporter,正在停止..." + systemctl stop node_exporter || true + systemctl disable node_exporter || true + fi + + # 2. 清理可能存在的 PID 文件 + for pid_file in /var/run/node-exporter.pid /var/run/node_exporter.pid /tmp/node_exporter.pid; do + if [[ -f "$pid_file" ]]; then + local pid=$(cat "$pid_file") + if kill -0 "$pid" 2>/dev/null; then + log_info "发现 Node Exporter (PID: $pid),正在停止..." + kill "$pid" + sleep 2 + kill -0 "$pid" 2>/dev/null && kill -9 "$pid" + fi + rm -f "$pid_file" + fi + done + + # 3. 用 pgrep 查找进程,排除当前脚本 + local pids=$(pgrep -f "node_exporter|node-exporter|/usr/local/bin/node-exporter" | grep -vw "$SELF_PID" || true) + if [[ -n "$pids" ]]; then + log_info "发现 Node Exporter 进程 (PID: $pids),正在停止..." + for pid in $pids; do + if kill -0 "$pid" 2>/dev/null; then + kill "$pid" 2>/dev/null || true + sleep 1 + kill -0 "$pid" 2>/dev/null && kill -9 "$pid" 2>/dev/null || true + fi + done + fi + + # 4. 兜底:检查是否有进程占用 9100 端口 + local listen_pids=$(lsof -ti:9100 2>/dev/null || true) + if [[ -n "$listen_pids" ]]; then + log_warning "发现占用 9100 端口的进程 (PID: $listen_pids),强制终止..." + for pid in $listen_pids; do + kill -9 "$pid" 2>/dev/null || true + done + sleep 1 + fi + + # 5. 最终验证 + if netstat -tuln 2>/dev/null | grep -q ":9100 "; then + log_error "端口 9100 仍被占用,请手动检查" + return 1 + else + log_success "旧的 Node Exporter 已完全停止" + fi +} + + +# 安装 Node Exporter 二进制文件 +install_node_exporter() { + log_info "安装 Node Exporter..." + + local binary_file="bin/node_exporter" + local install_dir="/usr/local/bin" + + if [[ ! 
-f "$binary_file" ]]; then + log_error "找不到 Node Exporter 二进制文件: $binary_file" + exit 1 + fi + + # 停止可能运行的服务 + stop_existing_service + + # 复制二进制文件并重命名为统一格式 + cp "$binary_file" "$install_dir/node-exporter" + chmod +x "$install_dir/node-exporter" + + log_success "Node Exporter 二进制文件安装完成" +} + +# 创建用户和组 +create_user() { + log_info "创建 node_exporter 用户..." + + # 检查用户是否已存在 + if id "node_exporter" &>/dev/null; then + log_info "用户 node_exporter 已存在" + else + useradd --no-create-home --shell /bin/false node_exporter + log_success "用户 node_exporter 创建完成" + fi +} + +# 安装配置文件 +install_config() { + log_info "安装配置文件..." + + local config_dir="/etc/node_exporter" + + # 创建配置目录 + mkdir -p "$config_dir" + + # 创建文本文件收集器目录 + mkdir -p "/var/lib/node_exporter/textfile_collector" + chown node_exporter:node_exporter "/var/lib/node_exporter/textfile_collector" +} + +# 启动 Node Exporter 服务 +start_node_exporter() { + log_info "启动 Node Exporter 服务..." + + local binary_path="/usr/local/bin/node-exporter" + local log_file="/var/log/node-exporter.log" + local pid_file="/var/run/node-exporter.pid" + + # 检查服务是否已经在运行 + if [[ -f "$pid_file" ]]; then + local pid=$(cat "$pid_file") + if kill -0 "$pid" 2>/dev/null; then + log_info "Node Exporter 服务已在运行 (PID: $pid)" + return 0 + else + log_warning "发现过期的 PID 文件,正在清理..." + rm -f "$pid_file" + fi + fi + + # 检查端口是否被占用 + if netstat -tuln 2>/dev/null | grep -q ":9100 "; then + log_warning "端口 9100 已被占用,请检查是否有其他服务在运行" + return 1 + fi + + # 启动服务 + log_info "正在启动 Node Exporter..." + nohup "$binary_path" --web.listen-address=:9100 > "$log_file" 2>&1 & + local pid=$! 
+ + # 保存 PID + echo "$pid" > "$pid_file" + + # 等待服务启动 + sleep 2 + + # 检查服务是否成功启动 + if kill -0 "$pid" 2>/dev/null; then + log_success "Node Exporter 服务启动成功 (PID: $pid)" + log_info "日志文件: $log_file" + log_info "PID 文件: $pid_file" + + # 更新安装记录 + update_install_record "$pid" "$INSTALL_DIR" + else + log_error "Node Exporter 服务启动失败" + rm -f "$pid_file" + return 1 + fi +} + + + +# 显示安装信息 +show_install_info() { + log_success "Node Exporter 安装完成!" + echo + echo "安装信息:" + echo " 二进制文件: /usr/local/bin/node-exporter" + echo " 运行用户: node_exporter" + echo " 配置目录: /etc/node_exporter/" + echo " 默认端口: 9100" + echo + echo "使用方法:" + echo " 手动启动: /usr/local/bin/node-exporter --web.listen-address=:9100" + echo " 后台启动: nohup /usr/local/bin/node-exporter --web.listen-address=:9100 &" + echo + echo "测试连接:" + echo " curl http://localhost:9100/metrics" + echo " curl http://localhost:9100" + echo + echo "Prometheus 配置示例:" + echo " - job_name: 'node_exporter'" + echo " static_configs:" + echo " - targets: ['localhost:9100']" + echo +} + +# 主函数 +main() { + echo "==========================================" + echo " Node Exporter 安装脚本 v1.0" + echo "==========================================" + echo + + check_root + check_system + + log_info "开始安装 Node Exporter..." 
+ + install_node_exporter + create_user + install_config + start_node_exporter + + show_install_info +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi + diff --git a/src/metric/client-plugins/all-in-one-full/plugins/node-exporter/package.sh b/src/metric/client-plugins/all-in-one-full/plugins/node-exporter/package.sh new file mode 100755 index 0000000..b38c733 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/node-exporter/package.sh @@ -0,0 +1,87 @@ +#!/bin/bash + +set -e + +# 颜色定义 +GREEN='\033[0;32m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +# 获取当前目录 +CURRENT_DIR=$(pwd) +PACKAGE_NAME="node-exporter-$(date +%Y%m%d-%H%M%S)" +PACKAGE_FILE="${PACKAGE_NAME}.tar.gz" + +log_info "开始打包 Node Exporter 安装包..." + +# 检查必要文件 +log_info "检查必要文件..." + +required_files=( + "install.sh" + "uninstall.sh" + "bin/node_exporter" + "check_health.sh" +) + +missing_files=() +for file in "${required_files[@]}"; do + if [[ ! -f "$file" ]]; then + missing_files+=("$file") + fi +done + +if [[ ${#missing_files[@]} -gt 0 ]]; then + echo "缺少以下文件:" + for file in "${missing_files[@]}"; do + echo " - $file" + done + exit 1 +fi + +log_success "所有必要文件检查完成" + +# 创建临时目录 +TEMP_DIR=$(mktemp -d) +log_info "创建临时目录: $TEMP_DIR" + +# 复制文件到临时目录 +cp -r . "$TEMP_DIR/$PACKAGE_NAME" + +# 进入临时目录 +cd "$TEMP_DIR" + +# 创建压缩包 +log_info "创建压缩包: $PACKAGE_FILE" +tar -czf "$PACKAGE_FILE" "$PACKAGE_NAME" + +# 移动压缩包到原目录 +mv "$PACKAGE_FILE" "$CURRENT_DIR/" + +# 清理临时目录 +rm -rf "$TEMP_DIR" + +# 返回原目录 +cd "$CURRENT_DIR" + +# 显示结果 +log_success "打包完成!" +echo +echo "安装包文件: $PACKAGE_FILE" +echo "文件大小: $(du -h "$PACKAGE_FILE" | cut -f1)" +echo +echo "使用方法:" +echo "1. 将 $PACKAGE_FILE 传输到目标服务器" +echo "2. 解压: tar -xzf $PACKAGE_FILE" +echo "3. 进入目录: cd $PACKAGE_NAME" +echo "4. 
运行安装: sudo ./install.sh" +echo +echo "注意: 请确保所有必要文件都存在" diff --git a/src/metric/client-plugins/all-in-one-full/plugins/node-exporter/uninstall.sh b/src/metric/client-plugins/all-in-one-full/plugins/node-exporter/uninstall.sh new file mode 100755 index 0000000..14801c1 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/plugins/node-exporter/uninstall.sh @@ -0,0 +1,239 @@ +#!/bin/bash + +# Node Exporter 卸载脚本 +# 版本: 1.0 +# 作者: AIOps Team +# 日期: $(date +%Y-%m-%d) + +set -e + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 检查是否为 root 用户 +check_root() { + if [[ $EUID -ne 0 ]]; then + log_error "此脚本需要 root 权限运行" + log_info "请使用: sudo $0" + exit 1 + fi +} + +# 停止运行中的进程 +stop_processes() { + log_info "停止 Node Exporter 进程..." + + local pid_file="/var/run/node-exporter.pid" + local stopped=false + + # 首先尝试通过 PID 文件停止服务 + if [[ -f "$pid_file" ]]; then + local pid=$(cat "$pid_file") + if kill -0 "$pid" 2>/dev/null; then + log_info "通过 PID 文件停止服务 (PID: $pid)..." + kill "$pid" + sleep 3 + + # 检查进程是否已停止 + if kill -0 "$pid" 2>/dev/null; then + log_warning "进程未响应,强制终止..." + kill -9 "$pid" 2>/dev/null || true + fi + log_success "Node Exporter 进程已停止" + stopped=true + else + log_warning "PID 文件存在但进程已不存在,清理 PID 文件" + rm -f "$pid_file" + fi + fi + + # 查找并杀死所有 node_exporter 和 node-exporter 进程 + local pids=$(pgrep -f "node_exporter\|node-exporter" 2>/dev/null || true) + if [[ -n "$pids" ]]; then + log_info "发现 node_exporter 或 node-exporter 进程,正在停止..." 
+ for pid in $pids; do + log_info "停止进程 PID: $pid" + kill "$pid" 2>/dev/null || true + done + sleep 2 + + # 检查是否还有进程在运行,如果有则强制终止 + local remaining_pids=$(pgrep -f "node_exporter\|node-exporter" 2>/dev/null || true) + if [[ -n "$remaining_pids" ]]; then + log_warning "进程未响应,强制终止..." + for pid in $remaining_pids; do + log_info "强制终止进程 PID: $pid" + kill -9 "$pid" 2>/dev/null || true + done + sleep 1 + fi + + # 最终检查 + if pgrep -f "node_exporter\|node-exporter" > /dev/null; then + log_error "无法停止所有 node_exporter 进程" + else + log_success "所有 Node Exporter 进程已停止" + stopped=true + fi + else + log_info "Node Exporter 进程未运行" + fi + + # 清理 PID 文件 + rm -f "$pid_file" + + if [[ "$stopped" == "false" ]]; then + log_warning "未发现需要停止的 Node Exporter 进程" + fi +} + +# 删除二进制文件 +remove_binary() { + log_info "删除 Node Exporter 二进制文件..." + + local binary_files=( + "/usr/local/bin/node-exporter" + "/usr/local/bin/node_exporter" + ) + + local deleted=false + for binary_file in "${binary_files[@]}"; do + if [[ -f "$binary_file" ]]; then + rm -f "$binary_file" + log_success "二进制文件已删除: $binary_file" + deleted=true + fi + done + + if [[ "$deleted" == "false" ]]; then + log_info "二进制文件不存在" + fi +} + +# 删除配置文件 +remove_config() { + log_info "删除配置文件..." + + local config_dir="/etc/node_exporter" + + if [[ -d "$config_dir" ]]; then + rm -rf "$config_dir" + log_success "配置目录已删除" + else + log_info "配置目录不存在" + fi +} + +# 删除数据目录 +remove_data_dir() { + log_info "删除数据目录..." + + local data_dir="/var/lib/node_exporter" + + if [[ -d "$data_dir" ]]; then + rm -rf "$data_dir" + log_success "数据目录已删除" + else + log_info "数据目录不存在" + fi +} + +# 检查用户状态(可选) +check_user_status() { + log_info "检查 node_exporter 用户状态..." 
+ + if id "node_exporter" &>/dev/null; then + log_info "检测到 node_exporter 用户存在" + log_warning "node_exporter 是系统用户,可能被其他服务使用" + log_info "为了系统稳定性,将保留 node_exporter 用户" + log_info "如需手动删除,请运行: sudo userdel node_exporter" + else + log_info "node_exporter 用户不存在" + fi +} + +# 清理日志文件 +cleanup_logs() { + log_info "清理日志文件..." + + # 清理 journal 日志 + journalctl --vacuum-time=1s --quiet || true + + # 删除安装脚本创建的日志文件 + rm -f /var/log/node-exporter.log + + log_success "日志文件已清理" +} + +# 显示卸载信息 +show_uninstall_info() { + log_success "Node Exporter 卸载完成!" + echo + echo "已删除的内容:" + echo " - 二进制文件: /usr/local/bin/node-exporter" + echo " - 配置目录: /etc/node_exporter" + echo " - 数据目录: /var/lib/node_exporter" + echo " - 相关日志文件" + echo + echo "注意:" + echo " - node_exporter 用户已保留(系统用户,可能被其他服务使用)" + echo " - 如需完全清理,请手动检查并删除相关文件" + echo +} + +# 主函数 +main() { + echo "==========================================" + echo " Node Exporter 卸载脚本 v1.0" + echo "==========================================" + echo + + check_root + + log_warning "此操作将完全卸载 Node Exporter" + read -p "确认继续?(y/N): " confirm + + if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then + log_info "取消卸载操作" + exit 0 + fi + + log_info "开始卸载 Node Exporter..." 
+ + stop_processes + remove_binary + remove_config + remove_data_dir + cleanup_logs + + # 检查用户状态 + check_user_status + + show_uninstall_info +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/src/metric/client-plugins/all-in-one-full/scripts/check_health.sh b/src/metric/client-plugins/all-in-one-full/scripts/check_health.sh new file mode 100755 index 0000000..6b3c866 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/scripts/check_health.sh @@ -0,0 +1,286 @@ +#!/bin/bash + +# 整体健康检查脚本,调用各个组件的健康检查并将结果写入 .health_log 文件 + +set -e + +# PID 文件检测,防止重复执行 +PIDFILE="/var/run/check_health.pid" +if [ -f "$PIDFILE" ] && kill -0 $(cat "$PIDFILE") 2>/dev/null; then + echo "健康检查脚本已在运行中,跳过本次执行" >&2 + exit 0 +fi +echo $$ > "$PIDFILE" +trap "rm -f $PIDFILE" EXIT + +# 获取脚本所在目录 +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +HEALTH_LOG_FILE="$SCRIPT_DIR/.health_log" +INSTALL_RECORD_FILE="$SCRIPT_DIR/.install_record" + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 - 输出到 stderr 避免影响 JSON 结果 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" >&2 +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" >&2 +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" >&2 +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" >&2 +} + +# 检查单个组件健康状态 +check_component() { + local component_name="$1" + local check_script_path="$2" + + log_info "检查 $component_name 健康状态..." + + if [[ ! -f "$check_script_path" ]]; then + log_error "健康检查脚本不存在: $check_script_path" + echo "{\"name\": \"$component_name\", \"status\": \"unhealth\", \"reason\": \"健康检查脚本不存在: $check_script_path\"}" + return 1 + fi + + if [[ ! 
-x "$check_script_path" ]]; then + log_error "健康检查脚本无执行权限: $check_script_path" + echo "{\"name\": \"$component_name\", \"status\": \"unhealth\", \"reason\": \"健康检查脚本无执行权限: $check_script_path\"}" + return 1 + fi + + # 执行健康检查脚本,只捕获 stdout,stderr 输出到终端 + local result + if result=$("$check_script_path" 2>/dev/null); then + log_success "$component_name 健康检查通过" + echo "$result" + return 0 + else + log_warning "$component_name 健康检查失败" + echo "$result" + return 1 + fi +} + +# 生成时间戳 +get_timestamp() { + date '+%Y-%m-%d %H:%M:%S' +} + +# 生成UTC时间戳 +get_utc_timestamp() { + date -u '+%Y-%m-%dT%H:%M:%SZ' +} + +# 获取主机名 +get_hostname() { + echo "${HOSTNAME:-$(hostname)}" +} + +# 创建健康状态目录 +create_health_dir() { + local hostname=$(get_hostname) + local health_dir="/private/argus/agent/$hostname/health" + + if [[ ! -d "$health_dir" ]]; then + log_info "创建健康状态目录: $health_dir" + mkdir -p "$health_dir" + fi + + echo "$health_dir" +} + +# 写入单个模块的健康状态JSON文件 +write_component_health_json() { + local component_name="$1" + local status="$2" + local error_msg="$3" + local health_dir="$4" + + # 生成模块名前缀-xxx.json格式的文件名 + local module_prefix="metric" + local filename="${module_prefix}-${component_name}.json" + local filepath="$health_dir/$filename" + + # 生成UTC时间戳 + local timestamp=$(get_utc_timestamp) + + # 构建JSON内容 + local json_content=$(cat << EOF +{ + "status": "$status", + "error": "$error_msg", + "timestamp": "$timestamp" +} +EOF +) + + # 写入文件 + echo "$json_content" > "$filepath" + log_info "已写入模块健康状态文件: $filepath" +} + +# 从安装记录文件中读取组件安装目录 +read_install_record() { + local install_record_file="$1" + + if [[ ! 
-f "$install_record_file" ]]; then + log_error "安装记录文件不存在: $install_record_file" + return 1 + fi + + # 检查是否有 jq 命令来解析 JSON + if command -v jq &> /dev/null; then + # 使用 jq 解析 JSON + local components_json + if components_json=$(jq -r '.components | to_entries[] | "\(.key):\(.value.install_dir)"' "$install_record_file" 2>/dev/null); then + echo "$components_json" + return 0 + else + log_error "无法解析安装记录文件 JSON 格式: $install_record_file" + return 1 + fi + else + # 如果没有 jq,尝试简单的文本解析 + log_warning "jq 命令不可用,尝试简单文本解析" + + # 查找所有 install_dir 行 + local components=() + while IFS= read -r line; do + if [[ "$line" =~ \"install_dir\":[[:space:]]*\"([^\"]+)\" ]]; then + local install_dir="${BASH_REMATCH[1]}" + # 从路径中提取组件名称 + local component_name=$(basename "$install_dir") + components+=("$component_name:$install_dir") + fi + done < "$install_record_file" + + if [[ ${#components[@]} -gt 0 ]]; then + printf '%s\n' "${components[@]}" + return 0 + else + log_error "无法从安装记录文件中提取组件信息" + return 1 + fi + fi +} + +# 主函数 +main() { + echo "==========================================" >&2 + echo " 整体健康检查脚本" >&2 + echo "==========================================" >&2 + echo >&2 + + # 记录健康检查开始时间 + local start_time=$(get_timestamp) + log_info "健康检查开始时间: $start_time" + + # 创建健康状态目录 + local health_dir + health_dir=$(create_health_dir) + + # 从安装记录文件中读取组件信息 + log_info "从安装记录文件读取组件信息: $INSTALL_RECORD_FILE" + local components_info + if ! 
components_info=$(read_install_record "$INSTALL_RECORD_FILE"); then + log_error "无法读取安装记录文件,健康检查终止" + exit 1 + fi + + # 存储所有检查结果 + local all_results=() + local overall_status="health" + + # 逐个检查组件 + while IFS= read -r component_info; do + if [[ -n "$component_info" ]]; then + IFS=':' read -r component_name install_dir <<< "$component_info" + local check_script_path="$install_dir/check_health.sh" + + local result + local component_status="healthy" + local error_msg="" + + if result=$(check_component "$component_name" "$check_script_path"); then + all_results+=("$result") + else + all_results+=("$result") + overall_status="unhealth" + component_status="unhealthy" + # 从结果中提取错误信息 + if command -v jq &> /dev/null; then + error_msg=$(echo "$result" | jq -r '.reason // ""' 2>/dev/null || echo "") + else + # 简单的文本解析提取错误信息 + if [[ "$result" =~ \"reason\":[[:space:]]*\"([^\"]+)\" ]]; then + error_msg="${BASH_REMATCH[1]}" + fi + fi + fi + + # 写入单个模块的健康状态JSON文件 + write_component_health_json "$component_name" "$component_status" "$error_msg" "$health_dir" + fi + done <<< "$components_info" + + # 记录健康检查结束时间 + local end_time=$(get_timestamp) + log_info "健康检查结束时间: $end_time" + + # 构建完整的健康检查结果 JSON + local health_check_result=$(cat << EOF +{ + "start_time": "$start_time", + "end_time": "$end_time", + "overall_status": "$overall_status", + "components": [ +$(printf '%s,\n' "${all_results[@]}" | sed '$s/,$//') + ] +} +EOF +) + + # 写入健康日志文件 + log_info "将健康检查结果写入日志文件: $HEALTH_LOG_FILE" + echo "$health_check_result" >> "$HEALTH_LOG_FILE" + + # 输出 JSON 结果到 stdout + echo "$health_check_result" + + # 显示总结到 stderr + echo >&2 + echo "==========================================" >&2 + echo " 健康检查总结" >&2 + echo "==========================================" >&2 + echo "开始时间: $start_time" >&2 + echo "结束时间: $end_time" >&2 + echo "整体状态: $overall_status" >&2 + echo "日志文件: $HEALTH_LOG_FILE" >&2 + echo >&2 + + if [[ "$overall_status" == "health" ]]; then + log_success "所有组件健康检查通过!" 
+ exit 0 + else + log_error "部分组件健康检查失败,请查看上述详细信息" + exit 1 + fi +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi \ No newline at end of file diff --git a/src/metric/client-plugins/all-in-one-full/scripts/check_version.sh b/src/metric/client-plugins/all-in-one-full/scripts/check_version.sh new file mode 100755 index 0000000..fce49f3 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/scripts/check_version.sh @@ -0,0 +1,240 @@ +#!/bin/bash + +# 版本校验脚本 +# 比较本地 LATEST_VERSION 与 FTP 的 VERSION 版本,如果不一致则更新对应版本 + +set -e + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 - 输出到 stderr 避免影响函数返回值 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" >&2 +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" >&2 +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" >&2 +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" >&2 +} + +# 获取脚本所在目录 +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# 动态获取当前版本目录 +get_current_version_dir() { + # 查找 /opt/argus-metric/versions/ 下的最新版本目录 + local versions_dir="/opt/argus-metric/versions" + if [[ -d "$versions_dir" ]]; then + # 按版本号排序,获取最新的版本目录 + local latest_version_dir=$(ls -1 "$versions_dir" 2>/dev/null | sort -V | tail -1) + if [[ -n "$latest_version_dir" ]]; then + echo "$versions_dir/$latest_version_dir" + else + echo "/opt/argus-metric" + fi + else + echo "/opt/argus-metric" + fi +} + +# 获取当前版本目录 +CURRENT_VERSION_DIR=$(get_current_version_dir) +# LATEST_VERSION 文件在根目录 +LOCAL_VERSION_FILE="/opt/argus-metric/LATEST_VERSION" +REMOTE_VERSION_URL="" +LOG_FILE="$CURRENT_VERSION_DIR/.version_check.log" + +# 从环境变量或配置文件获取 FTP 服务器信息 +get_ftp_config() { + # 优先从环境变量获取配置 + log_info "获取 FTP 配置信息..." 
+ + # 如果环境变量中没有设置,则尝试从配置文件读取 + if [[ -z "$FTP_SERVER" || -z "$FTP_USER" || -z "$FTP_PASSWORD" ]]; then + local config_file="$SCRIPT_DIR/../config/config.env" + if [[ -f "$config_file" ]]; then + log_info "从配置文件读取 FTP 配置: $config_file" + source "$config_file" + fi + else + log_info "使用环境变量中的 FTP 配置" + fi + + # 设置默认值(如果环境变量和配置文件都没有设置) + FTP_SERVER="${FTP_SERVER:-localhost}" + FTP_USER="${FTP_USER:-ftpuser}" + FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}" + + # 构建远程版本文件 URL + REMOTE_VERSION_URL="ftp://${FTP_USER}:${FTP_PASSWORD}@${FTP_SERVER}/LATEST_VERSION" + + log_info "FTP 配置来源: ${FTP_CONFIG_SOURCE:-环境变量/配置文件}" +} + +# 获取远程版本号 +get_remote_version() { + log_info "从 FTP 服务器获取远程版本号..." + log_info "远程地址: $REMOTE_VERSION_URL" + + # 先测试 FTP 连接 + log_info "测试 FTP 连接..." + if curl -u "${FTP_USER}:${FTP_PASSWORD}" -sfI "ftp://${FTP_SERVER}/" >/dev/null 2>&1; then + log_success "FTP 服务器连接成功" + else + log_error "无法连接到 FTP 服务器: $FTP_SERVER" + return 1 + fi + + # 测试 LATEST_VERSION 文件是否存在 + log_info "检查远程 LATEST_VERSION 文件是否存在..." 
+ if curl -u "${FTP_USER}:${FTP_PASSWORD}" -sfI "ftp://${FTP_SERVER}/LATEST_VERSION" >/dev/null 2>&1; then + log_success "远程 LATEST_VERSION 文件存在" + else + log_error "远程 LATEST_VERSION 文件不存在或无法访问" + return 1 + fi + + # 获取远程版本号 + local remote_version + if remote_version=$(curl -u "${FTP_USER}:${FTP_PASSWORD}" -sfL "ftp://${FTP_SERVER}/LATEST_VERSION" 2>/dev/null | tr -d '[:space:]'); then + if [[ -n "$remote_version" ]]; then + log_success "获取到远程版本号: $remote_version" + echo "$remote_version" + else + log_error "远程版本号为空" + return 1 + fi + else + log_error "获取远程版本号失败" + return 1 + fi +} + +# 获取本地版本号 +get_local_version() { + if [[ -f "$LOCAL_VERSION_FILE" ]]; then + local local_version=$(cat "$LOCAL_VERSION_FILE" 2>/dev/null | tr -d '[:space:]') + if [[ -n "$local_version" ]]; then + log_info "本地版本号: $local_version" + echo "$local_version" + else + log_warning "本地版本文件为空" + echo "" + fi + else + log_warning "本地版本文件不存在: $LOCAL_VERSION_FILE" + echo "" + fi +} + +# 更新到新版本 +update_to_version() { + local new_version="$1" + local temp_dir="/tmp/argus-update-$$" + local setup_script="$temp_dir/setup.sh" + + log_info "开始更新到版本: $new_version" + + # 创建临时目录 + mkdir -p "$temp_dir" + + # 下载最新的 setup.sh + log_info "从 FTP 服务器下载最新的安装脚本..." + local setup_url="ftp://${FTP_USER}:${FTP_PASSWORD}@${FTP_SERVER}/setup.sh" + + if curl -fsS "$setup_url" -o "$setup_script"; then + log_success "安装脚本下载完成" + else + log_error "下载安装脚本失败: $setup_url" + rm -rf "$temp_dir" + return 1 + fi + + # 添加执行权限 + chmod +x "$setup_script" + + # 执行安装脚本 + log_info "执行安装脚本进行版本更新..." 
+ if "$setup_script" --server "$FTP_SERVER" --user "$FTP_USER" --password "$FTP_PASSWORD" --version "$new_version"; then + log_success "版本更新完成: $new_version" + rm -rf "$temp_dir" + return 0 + else + log_error "版本更新失败: $new_version" + rm -rf "$temp_dir" + return 1 + fi +} + +# 记录检查日志 +log_check() { + local message="$1" + local timestamp=$(date '+%Y-%m-%d %H:%M:%S') + echo "[$timestamp] $message" >> "$LOG_FILE" +} + +# 主函数 +main() { + log_info "开始版本校验检查..." + log_check "版本校验检查开始" + + # 确保系统目录存在 + mkdir -p "/opt/argus-metric" + mkdir -p "$CURRENT_VERSION_DIR" + + log_info "当前版本目录: $CURRENT_VERSION_DIR" + + # 获取 FTP 配置 + get_ftp_config + + # 获取本地版本号 + local local_version + local_version=$(get_local_version) + + # 获取远程版本号 + local remote_version + if ! remote_version=$(get_remote_version); then + log_error "无法获取远程版本号,跳过本次检查" + log_check "版本校验失败:无法获取远程版本号" + exit 1 + fi + + # 比较版本号 + if [[ "$local_version" == "$remote_version" ]]; then + log_info "版本一致,无需更新 (本地: $local_version, 远程: $remote_version)" + log_check "版本校验完成:版本一致 ($local_version)" + else + log_info "检测到版本不一致 (本地: $local_version, 远程: $remote_version)" + log_check "检测到版本不一致:本地($local_version) -> 远程($remote_version)" + + # 更新到新版本 + if update_to_version "$remote_version"; then + log_success "版本更新成功: $local_version -> $remote_version" + log_check "版本更新成功:$local_version -> $remote_version" + else + log_error "版本更新失败" + log_check "版本更新失败:$local_version -> $remote_version" + exit 1 + fi + fi + + log_success "版本校验检查完成" + log_check "版本校验检查完成" +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/src/metric/client-plugins/all-in-one-full/scripts/install_artifact.sh b/src/metric/client-plugins/all-in-one-full/scripts/install_artifact.sh new file mode 100755 index 0000000..722f2e8 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/scripts/install_artifact.sh @@ -0,0 +1,991 @@ +#!/bin/bash + +set -e + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' 
+NC='\033[0m' + +log_info() { + local message="[INFO] $1" + echo -e "${BLUE}${message}${NC}" + echo "$(date '+%Y-%m-%d %H:%M:%S') $message" >> "$LOG_FILE" +} + +log_success() { + local message="[SUCCESS] $1" + echo -e "${GREEN}${message}${NC}" + echo "$(date '+%Y-%m-%d %H:%M:%S') $message" >> "$LOG_FILE" +} + +log_warning() { + local message="[WARNING] $1" + echo -e "${YELLOW}${message}${NC}" + echo "$(date '+%Y-%m-%d %H:%M:%S') $message" >> "$LOG_FILE" +} + +log_error() { + local message="[ERROR] $1" + echo -e "${RED}${message}${NC}" + echo "$(date '+%Y-%m-%d %H:%M:%S') $message" >> "$LOG_FILE" +} + +# 配置变量 +INSTALL_DIR="${1:-$(pwd)}" # 使用第一个参数作为安装目录,如果没有参数则使用当前目录 +TEMP_DIR="/tmp/metrics-install-$$" +VERSION_FILE="version.json" +LOG_FILE="${INSTALL_DIR}/.install.log" # 安装日志文件 + + +# 加载配置文件 +load_config() { + local script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + local config_file="$script_dir/config.env" + + if [[ -f "$config_file" ]]; then + log_info "加载配置文件: $config_file" + # 导出配置文件中的环境变量 + set -a # 自动导出所有变量 + source "$config_file" + set +a # 关闭自动导出 + log_success "配置文件加载完成" + else + log_warning "配置文件不存在: $config_file,使用默认配置" + fi +} + +# 复制配置文件到安装目录 +copy_config_files() { + log_info "复制配置文件到安装目录..." + + local script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + local source_config="$script_dir/../config/config.env" + local target_config="$INSTALL_DIR/config.env" + + if [[ -f "$source_config" ]]; then + # 检查源文件和目标文件是否是同一个文件 + if [[ "$source_config" == "$target_config" ]]; then + log_info "配置文件已在目标位置,跳过复制" + log_success "配置文件已存在: $target_config" + else + if cp "$source_config" "$target_config"; then + log_success "配置文件复制完成: $target_config" + else + log_error "配置文件复制失败" + return 1 + fi + fi + else + log_warning "源配置文件不存在: $source_config" + fi + + # 复制版本校验脚本 + log_info "复制版本校验脚本到安装目录..." 
+ local target_check_version="$INSTALL_DIR/check_version.sh" + + # 检查目标文件是否已存在(从 artifact 包中解压出来的) + if [[ -f "$target_check_version" ]]; then + log_info "版本校验脚本已存在,设置执行权限..." + chmod +x "$target_check_version" + log_success "版本校验脚本权限设置完成: $target_check_version" + else + log_warning "版本校验脚本不存在: $target_check_version" + log_info "请确保 check_version.sh 已包含在 artifact 包中" + fi +} + +check_root() { + if [[ $EUID -ne 0 ]]; then + log_error "此脚本需要 root 权限运行" + log_info "请使用: sudo $0 [安装目录]" + log_info "如果不指定安装目录,将使用当前目录: $(pwd)" + exit 1 + fi +} + +# 检查系统要求 +check_system() { + log_info "检查系统要求..." + + # 检查操作系统 + if [[ ! -f /etc/os-release ]]; then + log_error "无法检测操作系统版本" + exit 1 + fi + + source /etc/os-release + log_info "检测到操作系统: $NAME $VERSION" + + # 检查系统架构 + arch=$(uname -m) + log_info "系统架构: $arch" + + # 检查磁盘空间 + available_space=$(df / | awk 'NR==2 {print $4}') + if [[ $available_space -lt 10485760 ]]; then # 10GB in KB + log_warning "可用磁盘空间不足 10GB,当前可用: $(($available_space / 1024 / 1024))GB" + fi + + # 检查内存 + total_mem=$(free -m | awk 'NR==2{print $2}') + if [[ $total_mem -lt 4096 ]]; then # 4GB + log_warning "系统内存不足 4GB,当前: ${total_mem}MB" + fi +} + +# 查找版本文件 +find_version_file() { + log_info "查找版本信息文件..." + + # 在当前目录查找 + if [[ -f "$VERSION_FILE" ]]; then + VERSION_FILE_PATH="$(pwd)/$VERSION_FILE" + log_success "找到版本文件: $VERSION_FILE" + return 0 + fi + + # 在 artifact 目录查找 + for version_dir in artifact/*/; do + if [[ -f "${version_dir}${VERSION_FILE}" ]]; then + VERSION_FILE_PATH="$(cd "$(dirname "${version_dir}${VERSION_FILE}")" && pwd)/$(basename "${version_dir}${VERSION_FILE}")" + log_success "找到版本文件: $VERSION_FILE_PATH" + return 0 + fi + done + + log_error "未找到版本信息文件 $VERSION_FILE" + exit 1 +} + +# 解析版本信息 +parse_version_info() { + log_info "解析版本信息..." + + if [[ ! -f "$VERSION_FILE_PATH" ]]; then + log_error "版本文件不存在: $VERSION_FILE_PATH" + exit 1 + fi + + # 使用 jq 解析 JSON(如果可用) + if command -v jq &> /dev/null; then + # 验证JSON文件格式 + if ! 
jq empty "$VERSION_FILE_PATH" 2>/dev/null; then + log_error "JSON文件格式错误,请检查 $VERSION_FILE_PATH" + exit 1 + fi + + VERSION=$(jq -r '.version' "$VERSION_FILE_PATH") + BUILD_TIME=$(jq -r '.build_time' "$VERSION_FILE_PATH") + + # 解析 artifact_list + if jq -e '.artifact_list' "$VERSION_FILE_PATH" > /dev/null 2>&1; then + jq -r '.artifact_list | to_entries[] | "\(.key):\(.value)"' "$VERSION_FILE_PATH" > "$TEMP_DIR/components.txt" + else + log_error "version.json 中缺少 artifact_list 字段" + exit 1 + fi + + # 解析 checksums + if jq -e '.checksums' "$VERSION_FILE_PATH" > /dev/null 2>&1; then + jq -r '.checksums | to_entries[] | "\(.key):\(.value)"' "$VERSION_FILE_PATH" > "$TEMP_DIR/checksums.txt" + else + log_error "version.json 中缺少 checksums 字段" + exit 1 + fi + + # 解析 install_order(现在包含完整的文件名) + if jq -e '.install_order' "$VERSION_FILE_PATH" > /dev/null 2>&1; then + jq -r '.install_order[]' "$VERSION_FILE_PATH" > "$TEMP_DIR/install_order.txt" + else + log_error "version.json 中缺少 install_order 字段" + exit 1 + fi + + else + log_warning "jq 未安装,使用简单的 JSON 解析" + # 简单的 JSON 解析 + VERSION=$(grep '"version"' "$VERSION_FILE_PATH" | sed 's/.*"version": *"\([^"]*\)".*/\1/') + BUILD_TIME=$(grep '"build_time"' "$VERSION_FILE_PATH" | sed 's/.*"build_time": *"\([^"]*\)".*/\1/') + + # 解析 artifact_list(跳过字段名本身) + grep -A 100 '"artifact_list"' "$VERSION_FILE_PATH" | grep -v '"artifact_list"' | grep -E '^\s*"[^"]+":\s*"[^"]+"' | while read line; do + component=$(echo "$line" | sed 's/.*"\([^"]*\)":\s*"[^"]*".*/\1/') + version=$(echo "$line" | sed 's/.*"[^"]*":\s*"\([^"]*\)".*/\1/') + echo "$component:$version" >> "$TEMP_DIR/components.txt" + done + + # 解析 checksums(跳过字段名本身) + grep -A 100 '"checksums"' "$VERSION_FILE_PATH" | grep -v '"checksums"' | grep -E '^\s*"[^"]+":\s*"[^"]+"' | while read line; do + component=$(echo "$line" | sed 's/.*"\([^"]*\)":\s*"[^"]*".*/\1/') + checksum=$(echo "$line" | sed 's/.*"[^"]*":\s*"\([^"]*\)".*/\1/') + echo "$component:$checksum" >> "$TEMP_DIR/checksums.txt" + 
done + + # 解析 install_order(跳过字段名本身,只取数组元素) + grep -A 100 '"install_order"' "$VERSION_FILE_PATH" | grep -v '"install_order"' | grep -E '^\s*"[^"]+"' | while read line; do + component=$(echo "$line" | sed 's/.*"\([^"]*\)".*/\1/') + echo "$component" >> "$TEMP_DIR/install_order.txt" + done + + # 验证解析结果 + if [[ ! -f "$TEMP_DIR/components.txt" || ! -s "$TEMP_DIR/components.txt" ]]; then + log_error "无法解析 artifact_list,请检查 version.json 格式" + exit 1 + fi + + if [[ ! -f "$TEMP_DIR/checksums.txt" || ! -s "$TEMP_DIR/checksums.txt" ]]; then + log_error "无法解析 checksums,请检查 version.json 格式" + exit 1 + fi + + if [[ ! -f "$TEMP_DIR/install_order.txt" || ! -s "$TEMP_DIR/install_order.txt" ]]; then + log_error "无法解析 install_order,请检查 version.json 格式" + exit 1 + fi + fi + + log_success "版本信息解析完成" + log_info " 版本: $VERSION" + log_info " 构建时间: $BUILD_TIME" + + component_count=0 + if [[ -f "$TEMP_DIR/components.txt" ]]; then + component_count=$(wc -l < "$TEMP_DIR/components.txt") + log_info " 组件数量: $component_count" + log_info " 组件列表:" + while IFS= read -r line; do + component=$(echo "$line" | cut -d':' -f1) + version=$(echo "$line" | cut -d':' -f2) + log_info " - $component v$version" + done < "$TEMP_DIR/components.txt" + else + log_error "components.txt 文件不存在" + exit 1 + fi +} + +# 验证文件完整性 +verify_checksums() { + log_info "验证文件完整性..." 
+ + artifact_dir=$(dirname "$VERSION_FILE_PATH") + log_info "Artifact 目录: $artifact_dir" + failed_verification=0 + + if [[ -f "$TEMP_DIR/checksums.txt" ]]; then + while IFS= read -r line; do + component=$(echo "$line" | cut -d':' -f1) + expected_checksum=$(echo "$line" | cut -d':' -f2-) + + # 查找匹配的 tar 文件 + actual_file="" + for file in "$artifact_dir/${component}-"*.tar.gz; do + if [[ -f "$file" ]]; then + actual_file="$file" + break + fi + done + + if [[ -z "$actual_file" ]]; then + log_error "找不到组件文件: $component" + failed_verification=1 + continue + fi + + # 计算实际校验和 + actual_checksum="sha256:$(sha256sum "$actual_file" | cut -d' ' -f1)" + + if [[ "$actual_checksum" == "$expected_checksum" ]]; then + log_success " $component: 校验通过" + else + log_error " $component: 校验失败" + log_error " 期望: $expected_checksum" + log_error " 实际: $actual_checksum" + failed_verification=1 + fi + done < "$TEMP_DIR/checksums.txt" + fi + + if [[ $failed_verification -eq 1 ]]; then + log_error "文件完整性验证失败" + exit 1 + fi + + log_success "所有文件校验通过" +} + +# 创建安装目录 +create_install_dirs() { + log_info "创建安装目录..." + + mkdir -p "$INSTALL_DIR" + mkdir -p "$TEMP_DIR" + + log_success "安装目录创建完成: $INSTALL_DIR" +} + +# 获取系统版本 +get_system_version() { + if [[ ! -f /etc/os-release ]]; then + log_error "无法检测操作系统版本" + return 1 + fi + + source /etc/os-release + + # 提取主版本号 + case "$VERSION_ID" in + "20.04") + echo "ubuntu20" + ;; + "22.04") + echo "ubuntu22" + ;; + *) + log_warning "未识别的Ubuntu版本: $VERSION_ID,尝试使用ubuntu22" + echo "ubuntu22" + ;; + esac +} + +# 安装系统依赖包 +install_system_deps() { + log_info "开始安装系统依赖包(离线模式)..." + + local artifact_dir + artifact_dir=$(dirname "$VERSION_FILE_PATH") + local deps_dir="$artifact_dir/deps" + local system_version + system_version=$(get_system_version) + local version_deps_dir="$deps_dir/$system_version" + + if [[ ! 
-d "$version_deps_dir" ]]; then + log_warning "未找到 $system_version 版本的依赖目录: $version_deps_dir,跳过安装" + return 0 + fi + + log_info "找到系统版本依赖目录: $version_deps_dir" + + local deps_temp_dir="/tmp/argus_deps" + mkdir -p "$deps_temp_dir" + rm -rf "$deps_temp_dir"/* + + local FAILED_DEPS=() + local CORE_DEPS=(jq cron curl) # 核心依赖列表 + + # 遍历每个 tar.gz + for tar_file in "$version_deps_dir"/*.tar.gz; do + [[ -f "$tar_file" ]] || continue + + local tar_basename + tar_basename=$(basename "$tar_file") + log_info "处理依赖包: $tar_basename" + + local extract_dir="$deps_temp_dir/${tar_basename%.tar.gz}" + mkdir -p "$extract_dir" + + if tar -xzf "$tar_file" -C "$extract_dir"; then + log_success " $tar_basename 解压完成" + else + log_error " $tar_basename 解压失败" + FAILED_DEPS+=("$tar_basename") + continue + fi + + # 递归查找所有 deb 文件,一次性安装 + mapfile -t deb_files < <(find "$extract_dir" -type f -name "*.deb") + if [[ ${#deb_files[@]} -eq 0 ]]; then + log_warning " 没有找到 deb 包,跳过" + continue + fi + + log_info " 安装 ${#deb_files[@]} 个 deb 包..." + if dpkg -i "${deb_files[@]}" &>/tmp/dpkg_install.log; then + log_success " 所有 deb 包安装成功" + else + dpkg --configure -a || true + if dpkg -l | grep -q '^ii'; then + log_success " dpkg --configure 修复后安装成功" + else + log_error " 部分 deb 包安装失败,请手动安装" + for deb in "${deb_files[@]}"; do + pkg_name=$(dpkg-deb -f "$deb" Package 2>/dev/null || true) + FAILED_DEPS+=("${pkg_name:-$deb}") + done + fi + fi + done + + # 启动 cron 服务或其它必要服务 + start_cron_service + + # 检查核心依赖是否都已安装 + local missing_core=() + for dep in "${CORE_DEPS[@]}"; do + if ! 
dpkg -s "$dep" &>/dev/null; then + missing_core+=("$dep") + fi + done + + if [[ ${#missing_core[@]} -gt 0 ]]; then + log_error "核心依赖安装失败,请手动安装以下组件:" + for d in "${missing_core[@]}"; do + echo " - $d" + done + exit 1 + fi + + # 最终处理其他安装失败的包 + if [[ ${#FAILED_DEPS[@]} -gt 0 ]]; then + log_error "以下系统依赖安装失败,请手动安装后重试:" + for f in "${FAILED_DEPS[@]}"; do + echo " - $f" + done + exit 1 + fi + + log_success "系统依赖安装完成,全部就绪" +} + +# 启动 cron 服务 +start_cron_service() { + log_info "检查并启动 cron 服务..." + + # 检查 cron 是否已经在运行 + if pgrep -x "cron" > /dev/null; then + log_success "cron 服务已在运行" + return 0 + fi + + # 检查 /usr/sbin/cron 是否存在 + if [[ ! -f "/usr/sbin/cron" ]]; then + log_warning "cron 可执行文件不存在,跳过启动" + return 1 + fi + + # 启动 cron 服务 + log_info "启动 cron 服务..." + if /usr/sbin/cron start 2>/dev/null || /usr/sbin/cron 2>/dev/null; then + log_success "cron 服务启动成功" + + sleep 2 + + if pgrep -x "cron" > /dev/null; then + log_success "cron 服务运行正常" + else + log_warning "cron 服务可能未正常启动" + fi + else + log_error "cron 服务启动失败" + return 1 + fi +} + +# 安装组件 +install_components() { + log_info "开始安装组件..." + + artifact_dir=$(dirname "$VERSION_FILE_PATH") + log_info "Artifact 目录: $artifact_dir" + install_count=0 + total_count=0 + + if [[ -f "$TEMP_DIR/install_order.txt" ]]; then + total_count=$(wc -l < "$TEMP_DIR/install_order.txt") + fi + + if [[ -f "$TEMP_DIR/install_order.txt" ]]; then + while IFS= read -r filename; do + install_count=$((install_count + 1)) + + # 从文件名中提取组件名(去掉时间戳后缀) + component=$(echo "$filename" | sed 's/-[0-9]\{8\}-[0-9]\{6\}\.tar\.gz$//') + + log_info "[$install_count/$total_count] 安装 $component..." + log_info " 文件名: $filename" + + # 直接使用完整的文件名 + tar_file="$artifact_dir/$filename" + + if [[ ! 
-f "$tar_file" ]]; then + log_error "找不到组件文件: $filename" + log_info " 期望路径: $tar_file" + log_info " 当前目录: $(pwd)" + log_info " 目录内容:" + ls -la "$artifact_dir" | while read line; do + log_info " $line" + done + exit 1 + fi + + log_info " 找到文件: $tar_file" + + # 解压到临时目录 + component_temp_dir="$TEMP_DIR/$component" + mkdir -p "$component_temp_dir" + + if tar -xzf "$tar_file" -C "$component_temp_dir" 2>/dev/null; then + log_success " $component 解压完成" + else + log_error " $component 解压失败" + exit 1 + fi + + # 查找解压后的目录 + extracted_dir="" + for dir in "$component_temp_dir"/*; do + if [[ -d "$dir" ]]; then + extracted_dir="$dir" + break + fi + done + + if [[ -z "$extracted_dir" ]]; then + log_error " $component 解压后未找到目录" + exit 1 + fi + + # 执行安装脚本 + if [[ -f "$extracted_dir/install.sh" ]]; then + log_info " 执行 $component 安装脚本..." + if (cd "$extracted_dir" && ./install.sh "$INSTALL_DIR"); then + log_success " $component 安装完成" + else + log_error " $component 安装失败" + exit 1 + fi + else + log_error " $component 缺少 install.sh 文件" + exit 1 + fi + + # 将解压后的目录移动到安装目录,保留组件目录 + component_install_dir="$INSTALL_DIR/$component" + # 简化安装逻辑:直接删除旧目录,不进行备份 + if [[ -d "$component_install_dir" ]]; then + log_info " 组件目录已存在,删除旧版本: $component_install_dir" + rm -rf "$component_install_dir" + # log_info " 组件目录已存在,备份后更新: $component_install_dir" + # mv "$component_install_dir" "${component_install_dir}.backup.$(date +%Y%m%d_%H%M%S)" + fi + mv "$extracted_dir" "$component_install_dir" + log_success " 组件目录已保存: $component_install_dir" + + # 清理临时文件 + rm -rf "$component_temp_dir" + done < "$TEMP_DIR/install_order.txt" + fi + + log_success "所有组件安装完成" +} + +# 创建安装记录 +create_install_record() { + log_info "创建安装记录..." + + # 等待一段时间确保所有进程都已启动 + log_info "等待进程启动..." 
+ sleep 3 + + local install_time=$(date -u +"%Y-%m-%dT%H:%M:%SZ") + local install_record_file="$INSTALL_DIR/.install_record" + + # 创建 JSON 格式的安装记录 + cat > "$install_record_file" << EOF +{ + "version": "$VERSION", + "build_time": "$BUILD_TIME", + "install_time": "$install_time", + "install_dir": "$INSTALL_DIR", + "install_pid": $$, + "components": { +EOF + + # 添加组件信息 + local first_component=true + if [[ -f "$TEMP_DIR/components.txt" ]]; then + while IFS= read -r line; do + component=$(echo "$line" | cut -d':' -f1) + version=$(echo "$line" | cut -d':' -f2) + + # 获取组件的进程信息 + local component_pid="" + + # 根据组件名查找进程,使用多种方法确保能找到PID + case "$component" in + "node-exporter") + # 尝试多种方式查找node_exporter进程 + component_pid=$(pgrep -f "node_exporter" | head -1) + if [[ -z "$component_pid" ]]; then + component_pid=$(pgrep -f "node-exporter" | head -1) + fi + if [[ -z "$component_pid" ]]; then + component_pid=$(ps aux | grep -v grep | grep "node_exporter" | awk '{print $2}' | head -1) + fi + ;; + "dcgm-exporter") + # 查找dcgm-exporter进程 + component_pid=$(pgrep -f "dcgm-exporter" | head -1) + if [[ -z "$component_pid" ]]; then + component_pid=$(pgrep -f "dcgm_exporter" | head -1) + fi + if [[ -z "$component_pid" ]]; then + component_pid=$(ps aux | grep -v grep | grep "dcgm-exporter" | awk '{print $2}' | head -1) + fi + ;; + "fluent-bit") + # 查找fluent-bit进程 + component_pid=$(pgrep -f "fluent-bit" | head -1) + if [[ -z "$component_pid" ]]; then + component_pid=$(pgrep -f "fluent_bit" | head -1) + fi + if [[ -z "$component_pid" ]]; then + component_pid=$(ps aux | grep -v grep | grep "fluent-bit" | awk '{print $2}' | head -1) + fi + ;; + "argus-agent") + # 查找argus-agent进程 + component_pid=$(pgrep -f "argus-agent" | head -1) + if [[ -z "$component_pid" ]]; then + component_pid=$(ps aux | grep -v grep | grep "argus-agent" | awk '{print $2}' | head -1) + fi + ;; + esac + + # 记录找到的PID信息 + if [[ -n "$component_pid" ]]; then + log_info " 找到 $component 进程 PID: $component_pid" + else + log_warning 
" 未找到 $component 进程" + fi + + # 添加逗号分隔符 + if [[ "$first_component" == "true" ]]; then + first_component=false + else + echo "," >> "$install_record_file" + fi + + # 添加组件信息 + cat >> "$install_record_file" << EOF + "$component": { + "version": "$version", + "pid": "$component_pid", + "install_dir": "$INSTALL_DIR/$component" + } +EOF + done < "$TEMP_DIR/components.txt" + fi + + # 结束 JSON + cat >> "$install_record_file" << EOF + } +} +EOF + + log_success "安装记录已创建: $install_record_file" +} + +# 检查cron任务是否已存在 +check_cron_task_exists() { + local task_pattern="$1" + local temp_cron="$2" + + if grep -q "$task_pattern" "$temp_cron"; then + return 0 # 任务已存在 + else + return 1 # 任务不存在 + fi +} + +# 设置健康检查定时任务 +setup_health_check_cron() { + log_info "设置健康检查定时任务..." + + # 直接使用当前安装目录,不依赖current软链接 + # INSTALL_DIR 是 /opt/argus-metric/versions/1.34.0 + local check_health_script="$INSTALL_DIR/check_health.sh" + + # 检查健康检查脚本是否存在 + if [[ ! -f "$check_health_script" ]]; then + log_error "健康检查脚本不存在: $check_health_script" + return 1 + fi + + # 确保脚本有执行权限 + chmod +x "$check_health_script" + + # 创建临时crontab文件 + local temp_cron="/tmp/crontab_$$" + + # 获取当前用户的crontab(如果存在) + crontab -l 2>/dev/null > "$temp_cron" || touch "$temp_cron" + + # 检查并删除旧的健康检查任务 + if check_cron_task_exists "check_health.sh" "$temp_cron"; then + log_info "发现旧的健康检查定时任务,正在更新..." 
+ # 删除所有包含check_health.sh的行 + grep -v "check_health.sh" "$temp_cron" > "$temp_cron.new" + mv "$temp_cron.new" "$temp_cron" + log_info "旧的健康检查定时任务已删除" + fi + + # 添加新的定时任务(每5分钟执行一次) + echo "# Argus-Metrics 健康检查定时任务" >> "$temp_cron" + echo "*/5 * * * * $check_health_script >> $INSTALL_DIR/.health_cron.log 2>&1" >> "$temp_cron" + + # 安装新的crontab + if crontab "$temp_cron"; then + log_success "健康检查定时任务设置成功" + log_info " 执行频率: 每5分钟" + log_info " 日志文件: $INSTALL_DIR/.health_cron.log" + log_info " 查看定时任务: crontab -l" + log_info " 删除定时任务: crontab -e" + else + log_error "健康检查定时任务设置失败" + rm -f "$temp_cron" + return 1 + fi + + # 清理临时文件 + rm -f "$temp_cron" + + log_info "健康检查通过crontab自动执行" +} + +# 设置 DNS 同步定时任务 +setup_dns_sync_cron() { + log_info "设置 DNS 同步定时任务..." + + # 使用当前版本目录中的 DNS 同步脚本 + local sync_dns_script="$INSTALL_DIR/sync_dns.sh" + + # 检查 DNS 同步脚本是否存在 + if [[ ! -f "$sync_dns_script" ]]; then + log_warning "DNS 同步脚本不存在: $sync_dns_script" + log_warning "跳过 DNS 同步定时任务设置" + return 0 + fi + + # 确保脚本有执行权限 + chmod +x "$sync_dns_script" + + # 创建临时crontab文件 + local temp_cron="/tmp/crontab_$$" + + # 获取当前用户的crontab(如果存在) + crontab -l 2>/dev/null > "$temp_cron" || touch "$temp_cron" + + # 检查并删除旧的 DNS 同步任务 + if check_cron_task_exists "sync_dns.sh" "$temp_cron"; then + log_info "发现旧的 DNS 同步定时任务,正在更新..." 
+ # 删除所有包含sync_dns.sh的行 + grep -v "sync_dns.sh" "$temp_cron" > "$temp_cron.new" + mv "$temp_cron.new" "$temp_cron" + log_info "旧的 DNS 同步定时任务已删除" + fi + + # 添加新的定时任务(每1分钟执行一次) + # 直接使用版本目录中的 DNS 同步脚本 + echo "# Argus-Metrics DNS 同步定时任务" >> "$temp_cron" + echo "* * * * * $sync_dns_script >> $INSTALL_DIR/.dns_sync.log 2>&1" >> "$temp_cron" + + # 安装新的crontab + if crontab "$temp_cron"; then + log_success "DNS 同步定时任务设置成功" + log_info " 执行频率: 每1分钟" + log_info " 日志文件: $INSTALL_DIR/.dns_sync.log" + log_info " 查看定时任务: crontab -l" + log_info " 删除定时任务: crontab -e" + else + log_error "DNS 同步定时任务设置失败" + rm -f "$temp_cron" + return 1 + fi + + # 清理临时文件 + rm -f "$temp_cron" + + log_info "DNS 同步通过crontab自动执行" +} + +# 设置版本校验定时任务 +setup_version_check_cron() { + log_info "设置版本校验定时任务..." + + # 使用当前版本目录中的版本校验脚本 + local check_version_script="$INSTALL_DIR/check_version.sh" + + # 检查脚本是否存在 + if [[ ! -f "$check_version_script" ]]; then + log_warning "版本校验脚本不存在: $check_version_script" + log_info "跳过版本校验定时任务设置" + return 0 + fi + + # 确保脚本可执行 + chmod +x "$check_version_script" + + # 创建临时crontab文件 + local temp_cron="/tmp/crontab_$$" + crontab -l > "$temp_cron" 2>/dev/null || touch "$temp_cron" + + # 检查是否已存在版本校验定时任务 + if check_cron_task_exists "check_version.sh" "$temp_cron"; then + log_info "发现旧的版本校验定时任务,正在更新..." 
+ # 删除所有包含check_version.sh的行
+ grep -v "check_version.sh" "$temp_cron" > "$temp_cron.new"
+ mv "$temp_cron.new" "$temp_cron"
+ log_info "旧的版本校验定时任务已删除"
+ fi
+
+ # 添加新的定时任务(每1分钟执行一次)
+ echo "# Argus-Metrics 版本校验定时任务" >> "$temp_cron"
+ echo "*/1 * * * * $check_version_script >> $INSTALL_DIR/.version_check.log 2>&1" >> "$temp_cron"
+
+ # 安装新的crontab
+ if crontab "$temp_cron"; then
+ log_success "版本校验定时任务设置成功"
+ log_info " 执行频率: 每1分钟"
+ log_info " 日志文件: $INSTALL_DIR/.version_check.log"
+ log_info " 查看定时任务: crontab -l"
+ log_info " 删除定时任务: crontab -e"
+ else
+ log_error "版本校验定时任务设置失败"
+ rm -f "$temp_cron"
+ return 1
+ fi
+
+ # 清理临时文件
+ rm -f "$temp_cron"
+
+ log_info "版本校验通过crontab自动执行"
+}
+
+# 设置自动重启定时任务
+setup_restart_cron() {
+ log_info "设置自动重启定时任务..."
+
+ # 使用当前版本目录中的重启脚本
+ local restart_script="$INSTALL_DIR/restart_unhealthy.sh"
+
+ # 检查脚本是否存在
+ if [[ ! -f "$restart_script" ]]; then
+ log_warning "重启脚本不存在: $restart_script"
+ log_info "跳过自动重启定时任务设置"
+ return 0
+ fi
+
+ # 确保脚本可执行
+ chmod +x "$restart_script"
+
+ # 创建临时crontab文件
+ local temp_cron="/tmp/crontab_$$"
+ crontab -l > "$temp_cron" 2>/dev/null || touch "$temp_cron"
+
+ # 检查是否已存在自动重启定时任务
+ if check_cron_task_exists "restart_unhealthy.sh" "$temp_cron"; then
+ log_info "发现旧的自动重启定时任务,正在更新..." 
+ # 删除所有包含restart_unhealthy.sh的行 + grep -v "restart_unhealthy.sh" "$temp_cron" > "$temp_cron.new" + mv "$temp_cron.new" "$temp_cron" + log_info "旧的自动重启定时任务已删除" + fi + + # 添加新的定时任务(每2分钟执行一次) + echo "# Argus-Metrics 自动重启定时任务" >> "$temp_cron" + echo "*/2 * * * * $restart_script >> $INSTALL_DIR/.restart.log 2>&1" >> "$temp_cron" + + # 安装新的crontab + if crontab "$temp_cron"; then + log_success "自动重启定时任务设置成功" + log_info " 执行频率: 每2分钟" + log_info " 日志文件: $INSTALL_DIR/.restart.log" + log_info " 查看定时任务: crontab -l" + log_info " 删除定时任务: crontab -e" + else + log_error "自动重启定时任务设置失败" + rm -f "$temp_cron" + return 1 + fi + + # 清理临时文件 + rm -f "$temp_cron" + + log_info "自动重启检查通过crontab自动执行" +} + +# 显示安装信息 +show_install_info() { + log_success "Argus-Metrics All-in-One 安装完成!" + echo + log_info "安装日志已保存到: $LOG_FILE" + log_info "如需查看详细日志,请执行: cat $LOG_FILE" + echo +} + +cleanup() { + if [[ -d "$TEMP_DIR" ]]; then + rm -rf "$TEMP_DIR" + fi +} + +trap cleanup EXIT + +# 主函数 +main() { + echo "==========================================" + echo " Argus-Metrics All-in-One 安装脚本 v1.0" + echo "==========================================" + echo + + # 初始化日志文件 + mkdir -p "$INSTALL_DIR" + echo "==========================================" > "$LOG_FILE" + echo " Argus-Metrics All-in-One 安装日志" >> "$LOG_FILE" + echo " 开始时间: $(date '+%Y-%m-%d %H:%M:%S')" >> "$LOG_FILE" + echo "==========================================" >> "$LOG_FILE" + + # 加载配置文件 + load_config + + log_info "安装目录: $INSTALL_DIR" + log_info "日志文件: $LOG_FILE" + echo + + check_root + check_system + find_version_file + create_install_dirs + install_system_deps + parse_version_info + verify_checksums + install_components + copy_config_files + create_install_record + setup_health_check_cron + setup_dns_sync_cron + setup_version_check_cron + setup_restart_cron + + # 注释掉立即执行健康检查,避免与cron任务重复执行 + # log_info "立即执行一次健康检查..." 
+ # local check_health_script="$INSTALL_DIR/check_health.sh" + # if [[ -f "$check_health_script" ]]; then + # if "$check_health_script" >> "$INSTALL_DIR/.health_check.log" 2>&1; then + # log_success "健康检查执行完成" + # else + # log_warning "健康检查执行失败,请检查日志: $INSTALL_DIR/.health_check.log" + # fi + # else + # log_warning "健康检查脚本不存在: $check_health_script" + # fi + + show_install_info +} + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/src/metric/client-plugins/all-in-one-full/scripts/package_artifact.sh b/src/metric/client-plugins/all-in-one-full/scripts/package_artifact.sh new file mode 100755 index 0000000..2c4bb6b --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/scripts/package_artifact.sh @@ -0,0 +1,474 @@ +#!/bin/bash + +set -e + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 显示帮助信息 +show_help() { + echo "AIOps All-in-One 打包脚本" + echo + echo "用法: $0 [选项]" + echo + echo "选项:" + echo " --force 强制重新打包,即使版本已存在" + echo " --help 显示此帮助信息" + echo + echo "示例:" + echo " $0 # 正常打包,跳过已存在的版本" + echo " $0 --force # 强制重新打包" + echo +} + +# 解析命令行参数 +FORCE_PACKAGE=false +if [[ "$1" == "--force" ]]; then + FORCE_PACKAGE=true + log_info "强制重新打包模式" +elif [[ "$1" == "--help" || "$1" == "-h" ]]; then + show_help + exit 0 +fi + +# 获取当前目录和版本 +CURRENT_DIR=$(pwd) +VERSION=$(cat config/VERSION 2>/dev/null || echo "1.0.0") +ARTIFACT_DIR="artifact/$VERSION" + +log_info "开始打包 AIOps All-in-One 安装包 v$VERSION" + +# 检查必要文件 +log_info "检查必要文件..." +if [[ ! -f "config/VERSION" ]]; then + log_error "VERSION 文件不存在" + exit 1 +fi + +if [[ ! 
-f "config/checklist" ]]; then + log_error "checklist 文件不存在" + exit 1 +fi + +# 检查是否已存在该版本 +if [[ -d "$ARTIFACT_DIR" && "$FORCE_PACKAGE" == "false" ]]; then + log_info "检查版本 $VERSION 是否已存在..." + + # 检查 version.json 是否存在 + if [[ -f "$ARTIFACT_DIR/version.json" ]]; then + log_info "找到已存在的版本信息文件" + + # 检查是否所有组件文件都存在 + missing_files=0 + existing_components=0 + + # 解析已存在的 version.json 来检查文件 + if command -v jq &> /dev/null; then + # 使用 jq 解析 + while IFS= read -r component; do + existing_components=$((existing_components + 1)) + # 查找对应的 tar 文件 + found_file=false + for file in "$ARTIFACT_DIR/${component}-"*.tar.gz; do + if [[ -f "$file" ]]; then + found_file=true + break + fi + done + if [[ "$found_file" == "false" ]]; then + missing_files=$((missing_files + 1)) + log_warning " 缺少文件: $component" + fi + done < <(jq -r '.artifact_list | keys[]' "$ARTIFACT_DIR/version.json" 2>/dev/null) + else + # 简单的文件检查 + for file in "$ARTIFACT_DIR"/*.tar.gz; do + if [[ -f "$file" ]]; then + existing_components=$((existing_components + 1)) + fi + done + fi + + # 如果所有文件都存在,则跳过打包 + if [[ $missing_files -eq 0 && $existing_components -gt 0 ]]; then + log_success "版本 $VERSION 已完整打包,跳过重复打包" + echo + echo "现有文件:" + ls -la "$ARTIFACT_DIR" + echo + echo "如需强制重新打包,请删除目录: rm -rf $ARTIFACT_DIR" + echo "或使用: ./package.sh --force" + exit 0 + else + log_warning "版本 $VERSION 存在但不完整,将重新打包" + log_info " 现有组件: $existing_components" + log_info " 缺少文件: $missing_files" + fi + else + log_warning "版本目录存在但缺少 version.json,将重新打包" + fi +fi + +# 创建 artifact 目录 +mkdir -p "$ARTIFACT_DIR" +log_info "创建输出目录: $ARTIFACT_DIR" + +# 创建临时文件存储数据 +TEMP_DIR=$(mktemp -d) +COMPONENTS_FILE="$TEMP_DIR/components.txt" +VERSIONS_FILE="$TEMP_DIR/versions.txt" +DEPENDENCIES_FILE="$TEMP_DIR/dependencies.txt" +INSTALL_ORDER_FILE="$TEMP_DIR/install_order.txt" +CHECKSUMS_FILE="$TEMP_DIR/checksums.txt" +ARTIFACT_LIST_FILE="$TEMP_DIR/artifact_list.txt" + +# 解析 checklist 文件 +log_info "解析组件清单..." 
+line_num=0 +component_count=0 + +while IFS= read -r line; do + [[ -z "$line" || "$line" =~ ^[[:space:]]*# ]] && continue + + line_num=$((line_num + 1)) + + # 解析行: 组件名 目录路径 版本 [依赖组件] [安装顺序] + read -r component component_path version dep_component order <<< "$line" + + if [[ -z "$component" || -z "$component_path" || -z "$version" ]]; then + log_warning "跳过无效行 $line_num: $line" + continue + fi + + # 存储组件信息 + echo "$component" >> "$COMPONENTS_FILE" + echo "$component:$version" >> "$VERSIONS_FILE" + echo "$component:$component_path" >> "$TEMP_DIR/component_paths.txt" + + if [[ -n "$dep_component" && "$dep_component" != "$component" ]]; then + echo "$component:$dep_component" >> "$DEPENDENCIES_FILE" + fi + + if [[ -n "$order" && "$order" =~ ^[0-9]+$ ]]; then + echo "$order:$component" >> "$INSTALL_ORDER_FILE" + else + # 如果没有指定顺序,按解析顺序分配 + echo "$line_num:$component" >> "$INSTALL_ORDER_FILE" + fi + + component_count=$((component_count + 1)) + log_info " - $component v$version" +done < config/checklist + +if [[ $component_count -eq 0 ]]; then + log_error "没有找到有效的组件" + rm -rf "$TEMP_DIR" + exit 1 +fi + +log_success "找到 $component_count 个组件" + +# 检查组件目录是否存在 +log_info "检查组件目录..." +missing_components=() + +while IFS= read -r component; do + # 获取组件路径 + component_path=$(grep "^$component:" "$TEMP_DIR/component_paths.txt" | cut -d':' -f2-) + if [[ -z "$component_path" ]]; then + log_error "未找到组件 $component 的路径配置" + log_info "请检查 component_paths.txt 文件或添加路径配置" + exit 1 + fi + + if [[ ! -d "$component_path" ]]; then + missing_components+=("$component:$component_path") + fi +done < "$COMPONENTS_FILE" + +if [[ ${#missing_components[@]} -gt 0 ]]; then + log_error "以下组件目录不存在:" + for component_path in "${missing_components[@]}"; do + echo " - $component_path" + done + rm -rf "$TEMP_DIR" + exit 1 +fi + +# 打包各个组件 +log_info "开始打包组件..." 
+ +while IFS= read -r component; do + # 获取组件版本和路径 + version=$(grep "^$component:" "$VERSIONS_FILE" | cut -d':' -f2) + component_path=$(grep "^$component:" "$TEMP_DIR/component_paths.txt" | cut -d':' -f2-) + if [[ -z "$component_path" ]]; then + log_error "未找到组件 $component 的路径配置" + log_info "请检查 component_paths.txt 文件或添加路径配置" + exit 1 + fi + + log_info "打包 $component v$version..." + log_info " 组件路径: $component_path" + + # 进入组件目录 + cd "$component_path" + + # 检查组件是否有 package.sh + if [[ ! -f "package.sh" ]]; then + log_error "$component 缺少 package.sh 文件" + cd "$CURRENT_DIR" + rm -rf "$TEMP_DIR" + exit 1 + fi + + # 执行组件的打包脚本 + if ./package.sh; then + # 查找生成的 tar 包 + tar_file=$(find . -name "*.tar.gz" -type f | head -1) + if [[ -n "$tar_file" ]]; then + # 移动到 artifact 目录 + mv "$tar_file" "$CURRENT_DIR/$ARTIFACT_DIR/" + tar_filename=$(basename "$tar_file") + + # 计算校验和 + checksum=$(sha256sum "$CURRENT_DIR/$ARTIFACT_DIR/$tar_filename" | cut -d' ' -f1) + echo "$component:sha256:$checksum" >> "$CHECKSUMS_FILE" + echo "$component:$version" >> "$ARTIFACT_LIST_FILE" + + # 将完整的文件名存储到安装顺序文件中 + echo "$tar_filename" >> "$TEMP_DIR/install_order_files.txt" + + log_success " $component 打包完成: $tar_filename" + else + log_error "$component 打包失败,未找到生成的 tar 包" + cd "$CURRENT_DIR" + rm -rf "$TEMP_DIR" + exit 1 + fi + else + log_error "$component 打包失败" + cd "$CURRENT_DIR" + rm -rf "$TEMP_DIR" + exit 1 + fi + + # 返回主目录 + cd "$CURRENT_DIR" +done < "$COMPONENTS_FILE" + +# 生成 version.json +log_info "生成版本信息文件..." 
+version_json="$ARTIFACT_DIR/version.json" + +# 构建依赖关系 JSON +deps_json="" +if [[ -f "$DEPENDENCIES_FILE" ]]; then + first=true + while IFS= read -r line; do + component=$(echo "$line" | cut -d':' -f1) + dep=$(echo "$line" | cut -d':' -f2) + if [[ "$first" == "true" ]]; then + deps_json="\"$component\":[\"$dep\"]" + first=false + else + deps_json="$deps_json,\"$component\":[\"$dep\"]" + fi + done < "$DEPENDENCIES_FILE" +fi + +# 构建安装顺序数组 +order_array="" +if [[ -f "$TEMP_DIR/install_order_files.txt" ]]; then + first=true + while IFS= read -r filename; do + if [[ "$first" == "true" ]]; then + order_array="\"$filename\"" + first=false + else + order_array="$order_array,\"$filename\"" + fi + done < "$TEMP_DIR/install_order_files.txt" +fi + +# 构建 artifact_list JSON +artifact_json="" +if [[ -f "$ARTIFACT_LIST_FILE" ]]; then + first=true + while IFS= read -r line; do + component=$(echo "$line" | cut -d':' -f1) + version=$(echo "$line" | cut -d':' -f2) + if [[ "$first" == "true" ]]; then + artifact_json="\"$component\":\"$version\"" + first=false + else + artifact_json="$artifact_json,\"$component\":\"$version\"" + fi + done < "$ARTIFACT_LIST_FILE" +fi + +# 构建 checksums JSON +checksums_json="" +if [[ -f "$CHECKSUMS_FILE" ]]; then + first=true + while IFS= read -r line; do + component=$(echo "$line" | cut -d':' -f1) + checksum=$(echo "$line" | cut -d':' -f2-) + if [[ "$first" == "true" ]]; then + checksums_json="\"$component\":\"$checksum\"" + first=false + else + checksums_json="$checksums_json,\"$component\":\"$checksum\"" + fi + done < "$CHECKSUMS_FILE" +fi + +# 生成完整的 version.json +cat > "$version_json" << EOF +{ + "version": "$VERSION", + "build_time": "$(date -u +%Y-%m-%dT%H:%M:%SZ)", + "artifact_list": { + $artifact_json + }, + "checksums": { + $checksums_json + }, + "dependencies": { + $deps_json + }, + "install_order": [ + $order_array + ] +} +EOF + +log_success "版本信息文件生成完成: $version_json" + +# 复制`安装`脚本到 artifact 目录 +log_info "复制安装脚本..." 
+if [[ -f "scripts/install_artifact.sh" ]]; then + cp "scripts/install_artifact.sh" "$ARTIFACT_DIR/install.sh" + chmod +x "$ARTIFACT_DIR/install.sh" + log_success "安装脚本复制完成: $ARTIFACT_DIR/install.sh" +else + log_warning "scripts/install_artifact.sh 文件不存在" +fi + +# 复制`卸载`脚本到 artifact 目录 +log_info "复制卸载脚本..." +if [[ -f "scripts/uninstall_artifact.sh" ]]; then + cp "scripts/uninstall_artifact.sh" "$ARTIFACT_DIR/uninstall.sh" + chmod +x "$ARTIFACT_DIR/uninstall.sh" + log_success "卸载脚本复制完成: $ARTIFACT_DIR/uninstall.sh" +else + log_warning "scripts/uninstall_artifact.sh 文件不存在" +fi + +# 复制`健康检查`脚本到 artifact 目录 +log_info "复制健康检查脚本..." +if [[ -f "scripts/check_health.sh" ]]; then + cp "scripts/check_health.sh" "$ARTIFACT_DIR/check_health.sh" + chmod +x "$ARTIFACT_DIR/check_health.sh" + log_success "健康检查脚本复制完成: $ARTIFACT_DIR/check_health.sh" +else + log_warning "scripts/check_health.sh 文件不存在" +fi + +# 复制`DNS 同步`脚本到 artifact 目录 +log_info "复制 DNS 同步脚本..." +if [[ -f "scripts/sync_dns.sh" ]]; then + cp "scripts/sync_dns.sh" "$ARTIFACT_DIR/sync_dns.sh" + chmod +x "$ARTIFACT_DIR/sync_dns.sh" + log_success "DNS 同步脚本复制完成: $ARTIFACT_DIR/sync_dns.sh" +else + log_warning "scripts/sync_dns.sh 文件不存在" +fi + +# 复制`版本校验`脚本到 artifact 目录 +log_info "复制版本校验脚本..." +if [[ -f "scripts/check_version.sh" ]]; then + cp "scripts/check_version.sh" "$ARTIFACT_DIR/check_version.sh" + chmod +x "$ARTIFACT_DIR/check_version.sh" + log_success "版本校验脚本复制完成: $ARTIFACT_DIR/check_version.sh" +else + log_warning "scripts/check_version.sh 文件不存在" +fi + +# 复制`自动重启`脚本到 artifact 目录 +log_info "复制自动重启脚本..." +if [[ -f "scripts/restart_unhealthy.sh" ]]; then + cp "scripts/restart_unhealthy.sh" "$ARTIFACT_DIR/restart_unhealthy.sh" + chmod +x "$ARTIFACT_DIR/restart_unhealthy.sh" + log_success "自动重启脚本复制完成: $ARTIFACT_DIR/restart_unhealthy.sh" +else + log_warning "scripts/restart_unhealthy.sh 文件不存在" +fi + +# 复制配置文件到 artifact 目录 +log_info "复制配置文件..." 
+if [[ -f "config/config.env" ]]; then + cp "config/config.env" "$ARTIFACT_DIR/" + log_success "配置文件复制完成: $ARTIFACT_DIR/config.env" +else + log_warning "config 目录不存在,跳过配置文件复制" +fi + +# DNS 配置文件不需要复制到版本目录,直接从 FTP 服务器根目录获取 + +# 复制 deps 目录到 artifact 目录 +log_info "复制系统依赖包..." +if [[ -d "deps" ]]; then + cp -r "deps" "$ARTIFACT_DIR/" + log_success "系统依赖包复制完成: $ARTIFACT_DIR/deps" + + # 显示deps目录内容 + log_info " 依赖包列表:" + find "$ARTIFACT_DIR/deps" -name "*.tar.gz" -exec basename {} \; | while read dep_file; do + log_info " - $dep_file" + done +else + log_warning "deps 目录不存在,跳过依赖包复制" +fi + +# 显示打包结果 +log_success "打包完成!" +echo +echo "版本: $VERSION" +echo "输出目录: $ARTIFACT_DIR" +echo "包含组件:" +if [[ -f "$ARTIFACT_LIST_FILE" ]]; then + while IFS= read -r line; do + component=$(echo "$line" | cut -d':' -f1) + version=$(echo "$line" | cut -d':' -f2) + echo " - $component v$version" + done < "$ARTIFACT_LIST_FILE" +fi +echo +echo "文件列表:" +ls -la "$ARTIFACT_DIR" +echo + +# 清理临时文件 +rm -rf "$TEMP_DIR" diff --git a/src/metric/client-plugins/all-in-one-full/scripts/publish_artifact.sh b/src/metric/client-plugins/all-in-one-full/scripts/publish_artifact.sh new file mode 100755 index 0000000..b292a8d --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/scripts/publish_artifact.sh @@ -0,0 +1,293 @@ +#!/bin/bash + +set -e + +# 颜色定义 +GREEN='\033[0;32m' +BLUE='\033[0;34m' +RED='\033[0;31m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 显示帮助信息 +show_help() { + echo "Argus-Metric Artifact 发布脚本" + echo + echo "用法: $0 <版本号> [选项]" + echo + echo "参数:" + echo " <版本号> 要发布的版本号,对应 artifact 目录中的版本" + echo + echo "选项:" + echo " --output-dir <路径> 指定输出目录 (默认: /private/argus/ftp/share/)" + echo " --owner 指定文件所有者 (默认: 2133:2015)" + echo " -h, --help 显示此帮助信息" + echo + echo 
"示例:" + echo " $0 1.20.0 # 使用默认配置发布" + echo " $0 1.20.0 --output-dir /tmp/publish # 指定输出目录" + echo " $0 1.20.0 --owner 1000:1000 # 指定文件所有者" + echo " $0 1.20.0 --output-dir /srv/ftp --owner root:root # 同时指定两者" + echo +} + +# 默认配置 +DEFAULT_PUBLISH_DIR="/private/argus/ftp/share/" +DEFAULT_OWNER="2133:2015" + +# 解析参数 +VERSION="" +PUBLISH_DIR="$DEFAULT_PUBLISH_DIR" +OWNER="$DEFAULT_OWNER" + +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + show_help + exit 0 + ;; + --output-dir) + PUBLISH_DIR="$2" + shift 2 + ;; + --owner) + OWNER="$2" + shift 2 + ;; + *) + if [[ -z "$VERSION" ]]; then + VERSION="$1" + shift + else + log_error "未知参数: $1" + show_help + exit 1 + fi + ;; + esac +done + +# 检查版本号是否提供 +if [[ -z "$VERSION" ]]; then + log_error "请提供版本号参数" + show_help + exit 1 +fi + +ARTIFACT_DIR="artifact/$VERSION" + +# 检查版本目录是否存在 +if [[ ! -d "$ARTIFACT_DIR" ]]; then + log_error "版本目录不存在: $ARTIFACT_DIR" + exit 1 +fi + +log_info "开始发布版本: $VERSION" +log_info "输出目录: $PUBLISH_DIR" +log_info "文件所有者: $OWNER" + +# 确保发布目录存在 +log_info "确保发布目录存在: $PUBLISH_DIR" +mkdir -p "$PUBLISH_DIR" + +# 解析并校验所有者(仅在需要时 chown) +IFS=':' read -r OWNER_UID OWNER_GID <<< "$OWNER" +if [[ -z "$OWNER_UID" || -z "$OWNER_GID" ]]; then + log_error "--owner 格式不正确,应为 uid:gid" + exit 1 +fi + +CURRENT_UID=$(id -u) +CURRENT_GID=$(id -g) +if [[ "$OWNER_UID" != "$CURRENT_UID" || "$OWNER_GID" != "$CURRENT_GID" ]]; then + if [[ "$CURRENT_UID" -ne 0 ]]; then + log_error "当前用户 (${CURRENT_UID}:${CURRENT_GID}) 无法设置所有者为 ${OWNER_UID}:${OWNER_GID}" + log_error "请以目标用户运行脚本或预先调整目录权限" + exit 1 + fi + NEED_CHOWN=true +else + NEED_CHOWN=false +fi + +# 创建临时目录用于打包 +TEMP_PACKAGE_DIR="/tmp/argus-metric-package-$$" +mkdir -p "$TEMP_PACKAGE_DIR" + +# 复制所有 tar.gz 文件到临时目录 +log_info "准备 artifact 文件..." 
+tar_files=$(find "$ARTIFACT_DIR" -name "*.tar.gz" -type f) + +if [[ -z "$tar_files" ]]; then + log_error "在 $ARTIFACT_DIR 中未找到 tar.gz 文件" + exit 1 +fi + +for file in $tar_files; do + filename=$(basename "$file") + log_info " 准备: $filename" + cp "$file" "$TEMP_PACKAGE_DIR/" +done + +# 复制版本信息文件 +if [[ -f "$ARTIFACT_DIR/version.json" ]]; then + log_info "复制版本信息文件..." + cp "$ARTIFACT_DIR/version.json" "$TEMP_PACKAGE_DIR/" +fi + +# 复制健康检查脚本 +if [[ -f "$ARTIFACT_DIR/check_health.sh" ]]; then + log_info "复制健康检查脚本..." + cp "$ARTIFACT_DIR/check_health.sh" "$TEMP_PACKAGE_DIR/" +elif [[ -f "scripts/check_health.sh" ]]; then + log_info "复制健康检查脚本 (从当前目录)..." + cp "scripts/check_health.sh" "$TEMP_PACKAGE_DIR/" +else + log_warning "未找到 check_health.sh 文件" +fi + +# 复制 DNS 同步脚本 +if [[ -f "$ARTIFACT_DIR/sync_dns.sh" ]]; then + log_info "复制 DNS 同步脚本..." + cp "$ARTIFACT_DIR/sync_dns.sh" "$TEMP_PACKAGE_DIR/" +elif [[ -f "scripts/sync_dns.sh" ]]; then + log_info "复制 DNS 同步脚本 (从当前目录)..." + cp "scripts/sync_dns.sh" "$TEMP_PACKAGE_DIR/" +else + log_warning "未找到 sync_dns.sh 文件" +fi + +# 复制版本校验脚本 +if [[ -f "$ARTIFACT_DIR/check_version.sh" ]]; then + log_info "复制版本校验脚本..." + cp "$ARTIFACT_DIR/check_version.sh" "$TEMP_PACKAGE_DIR/" +elif [[ -f "scripts/check_version.sh" ]]; then + log_info "复制版本校验脚本 (从当前目录)..." + cp "scripts/check_version.sh" "$TEMP_PACKAGE_DIR/" +else + log_warning "未找到 check_version.sh 文件" +fi + +# 复制重启失败脚本 +if [[ -f "$ARTIFACT_DIR/restart_unhealthy.sh" ]]; then + log_info "复制重启失败脚本..." + cp "$ARTIFACT_DIR/restart_unhealthy.sh" "$TEMP_PACKAGE_DIR/" +elif [[ -f "scripts/restart_unhealthy.sh" ]]; then + log_info "复制重启失败脚本 (从当前目录)..." + cp "scripts/restart_unhealthy.sh" "$TEMP_PACKAGE_DIR/" +else + log_warning "未找到 restart_unhealthy.sh 文件" +fi + +# 复制安装脚本并重命名为 install.sh +if [[ -f "scripts/install_artifact.sh" ]]; then + log_info "复制安装脚本..." 
+ cp "scripts/install_artifact.sh" "$TEMP_PACKAGE_DIR/install.sh" +fi + +if [[ -f "scripts/uninstall_artifact.sh" ]]; then + log_info "复制卸载脚本..." + cp "scripts/uninstall_artifact.sh" "$TEMP_PACKAGE_DIR/uninstall.sh" +fi + +# 复制配置文件 +if [[ -f "$ARTIFACT_DIR/config.env" ]]; then + log_info "复制配置文件..." + cp "$ARTIFACT_DIR/config.env" "$TEMP_PACKAGE_DIR/" + log_success "配置文件复制完成" +else + log_warning "未找到 config.env 文件" +fi + +# DNS 配置文件将在后面直接复制到发布目录根目录,不包含在 tar.gz 中 + +# 复制 deps 目录 +if [[ -d "$ARTIFACT_DIR/deps" ]]; then + log_info "复制系统依赖包..." + cp -r "$ARTIFACT_DIR/deps" "$TEMP_PACKAGE_DIR/" + log_success "系统依赖包复制完成" +fi + +# 创建tar包,使用新的命名规范 +TAR_NAME="argus-metric_$(echo $VERSION | tr '.' '_').tar.gz" +log_info "创建发布包: $TAR_NAME" +cd "$TEMP_PACKAGE_DIR" +tar -czf "$PUBLISH_DIR/$TAR_NAME" . +cd - > /dev/null + +# 设置文件所有者 +log_info "设置文件所有者为: $OWNER" +if [[ "$NEED_CHOWN" == true ]]; then + chown "$OWNER" "$PUBLISH_DIR/$TAR_NAME" +fi + +# 清理临时目录 +rm -rf "$TEMP_PACKAGE_DIR" + +# 更新 LATEST_VERSION 文件 +log_info "更新 LATEST_VERSION 文件..." +echo "$VERSION" > "$PUBLISH_DIR/LATEST_VERSION" +if [[ "$NEED_CHOWN" == true ]]; then + chown "$OWNER" "$PUBLISH_DIR/LATEST_VERSION" +fi + +# 复制 DNS 配置文件到发布目录根目录(直接从 config 目录复制) +if [[ -f "config/dns.conf" ]]; then + log_info "复制 DNS 配置文件到发布目录根目录..." + cp "config/dns.conf" "$PUBLISH_DIR/" + if [[ "$NEED_CHOWN" == true ]]; then + chown "$OWNER" "$PUBLISH_DIR/dns.conf" + fi + log_success "DNS 配置文件复制完成: $PUBLISH_DIR/dns.conf" +else + log_warning "未找到 config/dns.conf 文件,跳过 DNS 配置文件复制" +fi + +# 复制 setup.sh 到发布目录 +if [[ -f "scripts/setup.sh" ]]; then + log_info "复制 setup.sh 到发布目录..." + cp "scripts/setup.sh" "$PUBLISH_DIR/" + if [[ "$NEED_CHOWN" == true ]]; then + chown "$OWNER" "$PUBLISH_DIR/setup.sh" + fi +fi + +# 显示发布结果 +log_success "版本 $VERSION 发布完成!" 
+echo +echo "发布目录: $PUBLISH_DIR" +echo "发布包: $PUBLISH_DIR/$TAR_NAME" +echo "包大小: $(du -h "$PUBLISH_DIR/$TAR_NAME" | cut -f1)" +echo "最新版本: $(cat "$PUBLISH_DIR/LATEST_VERSION")" +echo +echo "发布目录中的文件:" +ls -la "$PUBLISH_DIR" | while read line; do + echo " $line" +done +echo +echo "使用方法:" +echo " 1. 确保 /srv/ftp/share 目录可通过 FTP 访问" +echo " 2. 用户首先下载安装脚本:" +echo " curl -u ftpuser:admin1234 ftp://10.211.55.4/setup.sh -o setup.sh" +echo " 3. 然后执行安装 (自动获取最新版本):" +echo " sudo sh setup.sh" +echo " 4. 或者指定版本安装:" +echo " sudo sh setup.sh --version $VERSION" +echo " 5. 或者指定不同的FTP服务器:" +echo " sudo sh setup.sh --server 192.168.1.100 --user myuser --password mypass" diff --git a/src/metric/client-plugins/all-in-one-full/scripts/restart_unhealthy.sh b/src/metric/client-plugins/all-in-one-full/scripts/restart_unhealthy.sh new file mode 100755 index 0000000..cd2065b --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/scripts/restart_unhealthy.sh @@ -0,0 +1,337 @@ +#!/bin/bash + +# 此脚本会检查各组件的健康状态,并重启不健康的组件 + +# PID 文件检测,防止重复执行 +PIDFILE="/var/run/restart_unhealthy.pid" +if [ -f "$PIDFILE" ] && kill -0 $(cat "$PIDFILE") 2>/dev/null; then + echo "自动重启脚本已在运行中,跳过本次执行" >&2 + exit 0 +fi +echo $$ > "$PIDFILE" +trap "rm -f $PIDFILE" EXIT + +# 获取脚本所在目录 +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +INSTALL_RECORD_FILE="$SCRIPT_DIR/.install_record" + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $(date '+%Y-%m-%d %H:%M:%S') - $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $(date '+%Y-%m-%d %H:%M:%S') - $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $(date '+%Y-%m-%d %H:%M:%S') - $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $(date '+%Y-%m-%d %H:%M:%S') - $1" +} + +# 加载配置文件 +load_config() { + local config_file="$SCRIPT_DIR/config.env" + + if [[ -f "$config_file" ]]; then + log_info "加载配置文件: $config_file" + set -a + source 
"$config_file" + set +a + log_success "配置文件加载完成" + else + log_warning "配置文件不存在: $config_file,使用默认配置" + fi +} + +# 检查单个组件健康状态 +check_component_health() { + local component_name="$1" + local check_script_path="$2" + + if [[ ! -f "$check_script_path" ]]; then + log_error "$component_name: 健康检查脚本不存在: $check_script_path" + return 1 + fi + + if [[ ! -x "$check_script_path" ]]; then + chmod +x "$check_script_path" 2>/dev/null || true + fi + + # 执行健康检查,捕获退出码 + if "$check_script_path" > /dev/null 2>&1; then + return 0 + else + return 1 + fi +} + +# 重启单个组件 +restart_component() { + local component_name="$1" + local install_dir="$2" + + log_warning "正在重启组件: $component_name" + + # 先执行卸载脚本 + local uninstall_script="$install_dir/uninstall.sh" + if [[ -f "$uninstall_script" ]]; then + log_info "$component_name: 执行卸载脚本..." + chmod +x "$uninstall_script" 2>/dev/null || true + # 使用 yes 命令自动回答所有确认提示 + yes 2>/dev/null | (cd "$install_dir" && "$uninstall_script") || true + log_info "$component_name: 卸载完成" + fi + + # 执行安装脚本 + local install_script="$install_dir/install.sh" + if [[ ! -f "$install_script" ]]; then + log_error "$component_name: 安装脚本不存在: $install_script" + return 1 + fi + + chmod +x "$install_script" 2>/dev/null || true + log_info "$component_name: 执行安装脚本..." 
+ + # 使用 yes 命令自动回答所有确认提示,传递 SCRIPT_DIR 作为参数 + yes 2>/dev/null | (cd "$install_dir" && "$install_script" "$SCRIPT_DIR") || true + + log_info "$component_name: 安装脚本执行完成" + return 0 +} + +# 查找组件进程 PID +find_component_pid() { + local component_name="$1" + local component_pid="" + + case "$component_name" in + "node-exporter") + component_pid=$(pgrep -f "node_exporter" | head -1) + if [[ -z "$component_pid" ]]; then + component_pid=$(pgrep -f "node-exporter" | head -1) + fi + if [[ -z "$component_pid" ]]; then + component_pid=$(ps aux | grep -v grep | grep "node_exporter" | awk '{print $2}' | head -1) + fi + ;; + "dcgm-exporter") + component_pid=$(pgrep -f "dcgm-exporter" | head -1) + if [[ -z "$component_pid" ]]; then + component_pid=$(pgrep -f "dcgm_exporter" | head -1) + fi + if [[ -z "$component_pid" ]]; then + component_pid=$(ps aux | grep -v grep | grep "dcgm-exporter" | awk '{print $2}' | head -1) + fi + ;; + "fluent-bit") + component_pid=$(pgrep -f "fluent-bit" | head -1) + if [[ -z "$component_pid" ]]; then + component_pid=$(pgrep -f "fluent_bit" | head -1) + fi + if [[ -z "$component_pid" ]]; then + component_pid=$(ps aux | grep -v grep | grep "fluent-bit" | awk '{print $2}' | head -1) + fi + ;; + "argus-agent") + component_pid=$(pgrep -f "argus-agent" | head -1) + if [[ -z "$component_pid" ]]; then + component_pid=$(ps aux | grep -v grep | grep "argus-agent" | awk '{print $2}' | head -1) + fi + ;; + esac + + echo "$component_pid" +} + +# 更新安装记录文件中的 PID +update_install_record_pid() { + local component_name="$1" + local new_pid="$2" + + if [[ ! 
-f "$INSTALL_RECORD_FILE" ]]; then + log_error "安装记录文件不存在: $INSTALL_RECORD_FILE" + return 1 + fi + + # 读取当前 PID + local current_pid="" + if command -v jq &> /dev/null; then + current_pid=$(jq -r --arg comp "$component_name" '.components[$comp].pid // ""' "$INSTALL_RECORD_FILE" 2>/dev/null) + fi + + if [[ -z "$current_pid" ]]; then + log_warning "$component_name: 无法读取当前 PID,跳过更新" + return 1 + fi + + # 使用 sed 精确替换 PID,保持原有格式不变 + # 只替换指定组件块中的 pid 字段 + local temp_file="${INSTALL_RECORD_FILE}.tmp" + local in_component=0 + local updated=0 + + while IFS= read -r line; do + if [[ "$line" =~ \"$component_name\":[[:space:]]*\{ ]]; then + in_component=1 + echo "$line" + elif [[ $in_component -eq 1 && "$line" =~ \"pid\":[[:space:]]*\"$current_pid\" ]]; then + echo "$line" | sed "s/\"pid\": \"$current_pid\"/\"pid\": \"$new_pid\"/" + updated=1 + in_component=0 + else + echo "$line" + if [[ "$line" =~ ^[[:space:]]*\}[[:space:]]*$ ]]; then + in_component=0 + fi + fi + done < "$INSTALL_RECORD_FILE" > "$temp_file" + + # 验证替换是否成功 + if [[ $updated -eq 1 ]]; then + mv "$temp_file" "$INSTALL_RECORD_FILE" + log_success "$component_name: PID 已更新为 $new_pid(原值: $current_pid)" + return 0 + else + log_error "$component_name: PID 替换失败" + rm -f "$temp_file" + return 1 + fi +} + +# 从安装记录文件中读取组件信息 +read_install_record() { + local install_record_file="$1" + + if [[ ! 
-f "$install_record_file" ]]; then + log_error "安装记录文件不存在: $install_record_file" + return 1 + fi + + # 检查是否有 jq 命令来解析 JSON + if command -v jq &> /dev/null; then + # 使用 jq 解析 JSON + local components_json + if components_json=$(jq -r '.components | to_entries[] | "\(.key):\(.value.install_dir)"' "$install_record_file" 2>/dev/null); then + echo "$components_json" + return 0 + else + log_error "无法解析安装记录文件 JSON 格式: $install_record_file" + return 1 + fi + else + # 如果没有 jq,尝试简单的文本解析 + log_warning "jq 命令不可用,尝试简单文本解析" + + # 查找所有 install_dir 行 + local components=() + while IFS= read -r line; do + if [[ "$line" =~ \"install_dir\":[[:space:]]*\"([^\"]+)\" ]]; then + local install_dir="${BASH_REMATCH[1]}" + # 从路径中提取组件名称 + local component_name=$(basename "$install_dir") + components+=("$component_name:$install_dir") + fi + done < "$install_record_file" + + if [[ ${#components[@]} -gt 0 ]]; then + printf '%s\n' "${components[@]}" + return 0 + else + log_error "无法从安装记录文件中提取组件信息" + return 1 + fi + fi +} + +# 主函数 +main() { + log_info "==========================================" + log_info " 组件自动重启检查" + log_info "==========================================" + + # 检查是否是root用户 + if [[ $EUID -ne 0 ]]; then + log_error "此脚本需要 root 权限运行" + exit 1 + fi + + # 加载配置文件 + load_config + + # 从安装记录文件中读取组件信息 + log_info "从安装记录文件读取组件信息: $INSTALL_RECORD_FILE" + local components_info + if ! 
components_info=$(read_install_record "$INSTALL_RECORD_FILE"); then + log_error "无法读取安装记录文件,自动重启检查终止" + exit 1 + fi + + local restart_count=0 + local check_count=0 + + # 逐个检查组件 + while IFS= read -r component_info; do + if [[ -n "$component_info" ]]; then + IFS=':' read -r component_name install_dir <<< "$component_info" + check_count=$((check_count + 1)) + + local check_script_path="$install_dir/check_health.sh" + + log_info "检查组件: $component_name" + + # 检查健康状态 + if check_component_health "$component_name" "$check_script_path"; then + log_success "$component_name: 运行正常" + else + log_warning "$component_name: 健康检查失败,尝试重启" + restart_count=$((restart_count + 1)) + + # 执行重启 + restart_component "$component_name" "$install_dir" + + # 等待服务启动 + log_info "$component_name: 等待进程启动..." + sleep 10 + + # 查找新的进程 PID + local new_pid=$(find_component_pid "$component_name") + if [[ -n "$new_pid" ]]; then + log_info "$component_name: 找到新进程 PID: $new_pid" + update_install_record_pid "$component_name" "$new_pid" + else + log_warning "$component_name: 未找到新进程 PID" + fi + + # 再次检查健康状态 + if check_component_health "$component_name" "$check_script_path"; then + log_success "$component_name: 重启成功" + else + log_warning "$component_name: 重启后仍不健康,可能需要手动检查" + fi + fi + fi + done <<< "$components_info" + + log_info "==========================================" + log_info "检查完成: 共检查 $check_count 个组件,尝试重启 $restart_count 个" + log_info "==========================================" + + exit 0 +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi + diff --git a/src/metric/client-plugins/all-in-one-full/scripts/setup.sh b/src/metric/client-plugins/all-in-one-full/scripts/setup.sh new file mode 100755 index 0000000..0c36bce --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/scripts/setup.sh @@ -0,0 +1,931 @@ +#!/bin/bash + +set -e + +# 加载配置文件(仅在解压后的目录中可用) +load_config() { + # setup.sh 脚本不需要配置文件,FTP参数通过命令行参数或环境变量提供 + log_info "setup.sh 脚本使用命令行参数或环境变量获取FTP配置" +} + +# 颜色定义 
+RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +FTP_SERVER="${FTP_SERVER}" +FTP_USER="${FTP_USER}" +FTP_PASS="${FTP_PASS}" +FTP_PORT="${FTP_PORT:-21}" +BASE_URL="" # FTP基础URL (将在check_ftp_params中设置) +LATEST_VERSION_URL="" # 版本文件URL (将在check_ftp_params中设置) +TEMP_DIR="/tmp/argus-metric-install-$$" + +# 安装目录配置 +DEFAULT_INSTALL_DIR="/opt/argus-metric" # 默认安装目录 +INSTALL_DIR="${INSTALL_DIR:-$DEFAULT_INSTALL_DIR}" # 可通过环境变量覆盖 +VERSIONS_DIR="$INSTALL_DIR/versions" # 版本目录 +BACKUPS_DIR="$INSTALL_DIR/backups" # 备份目录 +CURRENT_LINK="$INSTALL_DIR/current" # 当前版本软链接 +LATEST_VERSION_FILE="$INSTALL_DIR/LATEST_VERSION" # 当前版本记录文件 + +# 检查必需的FTP参数 +check_ftp_params() { + local missing_params=() + + if [[ -z "$FTP_SERVER" ]]; then + missing_params+=("FTP_SERVER") + fi + + if [[ -z "$FTP_USER" ]]; then + missing_params+=("FTP_USER") + fi + + if [[ -z "$FTP_PASS" ]]; then + missing_params+=("FTP_PASS") + fi + + if [[ ${#missing_params[@]} -gt 0 ]]; then + log_error "缺少必需的FTP参数: ${missing_params[*]}" + log_error "请通过以下方式之一设置FTP参数:" + log_error " 1. 命令行参数: --server <地址> --user <用户名> --password <密码>" + log_error " 2. 环境变量: FTP_SERVER=<地址> FTP_USER=<用户名> FTP_PASS=<密码>" + log_error "" + log_error "示例:" + log_error " sudo sh setup.sh --server 10.211.55.4 --user ftpuser --password admin1234" + log_error " FTP_SERVER=10.211.55.4 FTP_USER=ftpuser FTP_PASS=admin1234 sudo sh setup.sh" + exit 1 + fi + + # 设置BASE_URL和LATEST_VERSION_URL + BASE_URL="ftp://${FTP_SERVER}:${FTP_PORT}" + LATEST_VERSION_URL="$BASE_URL/LATEST_VERSION" + + log_info "FTP配置:" + log_info " 服务器: $FTP_SERVER:$FTP_PORT" + log_info " 用户: $FTP_USER" +} + +# 获取最新版本号的函数 +get_latest_version() { + log_info "获取最新版本信息..." 
>&2 + log_info "尝试从URL获取: $LATEST_VERSION_URL" >&2 + + # 先测试FTP连接 + log_info "测试FTP连接..." >&2 + if ! curl -u "${FTP_USER}:${FTP_PASS}" -sfI "$LATEST_VERSION_URL" >/dev/null 2>&1; then + log_error "无法连接到FTP服务器或文件不存在" >&2 + log_error "URL: $LATEST_VERSION_URL" >&2 + log_error "请检查:" >&2 + log_error " 1. FTP服务器是否运行: $FTP_SERVER:$FTP_PORT" >&2 + log_error " 2. 用户名密码是否正确: $FTP_USER" >&2 + log_error " 3. LATEST_VERSION文件是否存在" >&2 + log_error "手动测试命令: curl -u ${FTP_USER}:${FTP_PASS} ftp://${FTP_SERVER}/LATEST_VERSION" >&2 + exit 1 + fi + + # 获取文件内容 + if ! LATEST_VERSION=$(curl -u "${FTP_USER}:${FTP_PASS}" -sfL "$LATEST_VERSION_URL" 2>/dev/null | tr -d '[:space:]'); then + log_error "下载LATEST_VERSION文件失败" >&2 + exit 1 + fi + + log_info "原始获取内容: '$LATEST_VERSION'" >&2 + + if [[ -z "$LATEST_VERSION" ]]; then + log_error "获取到的版本信息为空" >&2 + log_error "可能的原因:" >&2 + log_error " 1. LATEST_VERSION文件为空" >&2 + log_error " 2. 文件内容格式不正确" >&2 + log_error " 3. 网络传输问题" >&2 + log_error "请检查FTP服务器上的 /srv/ftp/share/LATEST_VERSION 文件" >&2 + exit 1 + fi + + log_info "检测到最新版本: $LATEST_VERSION" >&2 + echo "$LATEST_VERSION" +} + +# 解析参数 +ARGUS_VERSION="" # 使用不同的变量名避免与系统VERSION冲突 +ACTION="install" +FORCE_INSTALL=false + +while [[ $# -gt 0 ]]; do + case $1 in + --version) + ARGUS_VERSION="$2" + shift 2 + ;; + --server) + FTP_SERVER="$2" + shift 2 + ;; + --user) + FTP_USER="$2" + shift 2 + ;; + --password) + FTP_PASS="$2" + shift 2 + ;; + --port) + FTP_PORT="$2" + shift 2 + ;; + --uninstall) + ACTION="uninstall" + shift + ;; + --install-dir) + INSTALL_DIR="$2" + shift 2 + ;; + # 简化安装逻辑:不再支持回滚和备份列表功能 + # --rollback) + # ACTION="rollback" + # shift + # ;; + # --backup-list) + # ACTION="backup-list" + # shift + # ;; + --status) + ACTION="status" + shift + ;; + --force) + FORCE_INSTALL=true + shift + ;; + --help) + echo "Argus Metric FTP在线安装脚本" + echo + echo "用法: curl -u <用户名>:<密码> ftp://<服务器>/setup.sh -o setup.sh && sh setup.sh [选项]" + echo + echo "必需参数 (必须通过命令行参数或环境变量设置):" + echo " --server SERVER 
FTP服务器地址 (必须)" + echo " --user USER FTP用户名 (必须)" + echo " --password PASS FTP密码 (必须)" + echo + echo "可选参数:" + echo " --version VERSION 指定版本 (默认: 自动获取最新版本)" + echo " --port PORT FTP端口 (默认: 21)" + echo " --install-dir DIR 安装目录 (默认: /opt/argus-metric)" + echo " --force 强制重新安装 (即使相同版本)" + echo " --uninstall 卸载 (自动确认)" + # echo " --rollback 回滚到上一个备份版本" + # echo " --backup-list 列出所有备份版本" + echo " --status 显示当前安装状态" + echo " --help 显示帮助" + echo + echo "环境变量:" + echo " FTP_SERVER FTP服务器地址 (必须)" + echo " FTP_USER FTP用户名 (必须)" + echo " FTP_PASS FTP密码 (必须)" + echo " FTP_PORT FTP端口 (默认: 21)" + echo + echo "示例:" + echo " # 方式1: 使用命令行参数" + echo " curl -u ftpuser:admin1234 ftp://10.211.55.4/setup.sh -o setup.sh" + echo " sudo sh setup.sh --server 10.211.55.4 --user ftpuser --password admin1234" + echo " " + echo " # 方式2: 使用环境变量" + echo " FTP_SERVER=10.211.55.4 FTP_USER=ftpuser FTP_PASS=admin1234 sudo sh setup.sh" + echo " " + echo " # 指定版本安装" + echo " sudo sh setup.sh --server 10.211.55.4 --user ftpuser --password admin1234 --version 1.30.0" + echo " " + echo " # 强制重新安装" + echo " sudo sh setup.sh --server 10.211.55.4 --user ftpuser --password admin1234 --force" + echo " " + echo " # 卸载" + echo " sudo sh setup.sh --server 10.211.55.4 --user ftpuser --password admin1234 --uninstall" + exit 0 + ;; + *) + log_error "未知参数: $1" + echo "使用 --help 查看帮助信息" + exit 1 + ;; + esac +done + +# 清理函数 +cleanup() { + if [[ -d "$TEMP_DIR" ]]; then + rm -rf "$TEMP_DIR" + fi +} + +trap cleanup EXIT + +# 创建安装目录结构 +create_install_directories() { + log_info "创建安装目录结构..." 
+ + # 创建主要目录 + mkdir -p "$VERSIONS_DIR" + mkdir -p "$BACKUPS_DIR" + + log_success "安装目录结构创建完成: $INSTALL_DIR" +} + +# 获取当前安装的版本 +get_current_version() { + # 优先从LATEST_VERSION文件读取 + if [[ -f "$LATEST_VERSION_FILE" ]]; then + local version_from_file=$(cat "$LATEST_VERSION_FILE" 2>/dev/null | tr -d '[:space:]') + if [[ -n "$version_from_file" ]]; then + # 确保版本号格式一致(不带v前缀) + echo "$version_from_file" + return 0 + fi + fi + + # 如果文件不存在或为空,从软链接读取 + if [[ -L "$CURRENT_LINK" ]]; then + local current_path=$(readlink "$CURRENT_LINK") + # 从版本目录名中提取版本号(现在不带v前缀) + basename "$current_path" + else + echo "" + fi +} + +# 检查是否已安装 +check_installed() { + if [[ -L "$CURRENT_LINK" ]] && [[ -d "$CURRENT_LINK" ]]; then + local current_version=$(get_current_version) + if [[ -n "$current_version" ]]; then + log_info "检测到已安装版本: v$current_version" + return 0 + fi + fi + return 1 +} + +# 更新LATEST_VERSION文件 +update_latest_version_file() { + local version="$1" + log_info "更新LATEST_VERSION文件: $version" + + if echo "$version" > "$LATEST_VERSION_FILE"; then + log_success "LATEST_VERSION文件已更新" + else + log_error "更新LATEST_VERSION文件失败" + return 1 + fi +} + +# 初始化 DNS 配置文件到系统目录 +init_dns_config_to_system() { + log_info "初始化 DNS 配置文件到系统目录..." + + # 系统 DNS 配置文件 + local system_dns_conf="$INSTALL_DIR/dns.conf" + + # 如果系统目录中还没有 dns.conf,创建一个空的占位文件 + if [[ ! 
-f "$system_dns_conf" ]]; then + touch "$system_dns_conf" + chmod 644 "$system_dns_conf" + log_success "DNS 配置文件占位文件已创建: $system_dns_conf" + log_info "DNS 同步脚本将从 FTP 服务器下载实际的 DNS 配置" + else + log_info "DNS 配置文件已存在: $system_dns_conf" + fi +} + +# 备份当前版本 +backup_current_version() { + local current_version=$(get_current_version) + if [[ -z "$current_version" ]]; then + log_info "没有当前版本需要备份" + return 0 + fi + + # 确保备份目录存在 + mkdir -p "$BACKUPS_DIR" + + local backup_name="$current_version" + local backup_path="$BACKUPS_DIR/$backup_name" + + log_info "备份当前版本 $current_version 到: $backup_path" + + # 如果备份已存在,先删除 + if [[ -d "$backup_path" ]]; then + log_info "备份版本已存在,覆盖: $backup_path" + rm -rf "$backup_path" + fi + + # 复制当前版本目录(跟随软链接复制实际内容) + if cp -rL "$CURRENT_LINK" "$backup_path"; then + log_success "版本备份完成: $backup_name" + + else + log_error "版本备份失败" + exit 1 + fi +} + +# 回滚到备份版本 +rollback_to_backup() { + local backup_name="$1" + + # 确保备份目录存在 + mkdir -p "$BACKUPS_DIR" + + local backup_path="$BACKUPS_DIR/$backup_name" + + if [[ ! -d "$backup_path" ]]; then + log_error "备份不存在: $backup_path" + return 1 + fi + + log_info "回滚到备份版本: $backup_name" + + # 停止当前服务 + stop_services + + # 检查是否存在对应的版本目录 + local version_dir="$VERSIONS_DIR/$backup_name" + + if [[ ! -d "$version_dir" ]]; then + log_info "版本目录不存在,从备份恢复版本目录: $version_dir" + # 从备份目录恢复到版本目录 + mkdir -p "$VERSIONS_DIR" + cp -r "$backup_path" "$version_dir" + fi + + # 恢复软链接指向版本目录 + if ln -sfn "$version_dir" "$CURRENT_LINK"; then + log_success "版本回滚完成: $backup_name" + + # 更新LATEST_VERSION文件 + update_latest_version_file "$backup_name" + + return 0 + else + log_error "版本回滚失败" + return 1 + fi +} + +# 停止服务 +stop_services() { + log_info "停止当前服务..." + + # 检查服务是否正在运行 + if ! 
check_services_running; then + log_info "服务未运行,无需停止" + return 0 + fi + + # 尝试使用卸载脚本停止服务 + if [[ -f "$CURRENT_LINK/uninstall.sh" ]]; then + cd "$CURRENT_LINK" + chmod +x uninstall.sh + + # 自动确认停止服务(避免交互式确认) + echo "y" | ./uninstall.sh >/dev/null 2>&1 + local stop_exit_code=$? + + if [[ $stop_exit_code -eq 0 ]]; then + log_success "服务停止完成" + else + log_warning "停止服务时出现警告,尝试手动停止" + manual_stop_services + fi + else + log_warning "未找到卸载脚本,尝试手动停止服务" + manual_stop_services + fi +} + +# 手动停止服务 +manual_stop_services() { + log_info "手动停止服务..." + + # 停止 node_exporter + if pgrep -f "node_exporter" >/dev/null 2>&1; then + pkill -f "node_exporter" && log_info "node_exporter 已停止" + fi + + # 停止 dcgm_exporter + if pgrep -f "dcgm_exporter" >/dev/null 2>&1; then + pkill -f "dcgm_exporter" && log_info "dcgm_exporter 已停止" + fi + + # 等待进程完全停止 + sleep 2 + + # 检查是否还有残留进程 + if pgrep -f "node_exporter\|dcgm_exporter" >/dev/null 2>&1; then + log_warning "仍有服务进程运行,尝试强制停止" + pkill -9 -f "node_exporter\|dcgm_exporter" 2>/dev/null || true + fi + + log_success "手动停止服务完成" +} + +# 启动服务 +start_services() { + log_info "启动服务..." 
+ + # 检查服务是否已经在运行 + if check_services_running; then + log_info "服务已在运行,跳过启动" + return 0 + fi + + # 由于 install_artifact.sh 已经安装了所有组件并设置了健康检查定时任务 + # 这里只需要简单验证服务状态即可 + log_info "组件已安装完成,健康检查定时任务已设置" + log_info "服务将在健康检查时自动启动(每5分钟检查一次)" + + # 等待一下让服务有时间启动 + sleep 3 + + # 验证服务状态 + if check_services_running; then + log_success "服务启动成功" + else + log_info "服务可能正在启动中,健康检查机制将自动监控" + fi + + return 0 +} + +# 检查服务是否正在运行 +check_services_running() { + # 检查常见的服务端口是否在监听 + local ports=(9100 9400) # node-exporter 和 dcgm-exporter 的默认端口 + + for port in "${ports[@]}"; do + if netstat -tlnp 2>/dev/null | grep -q ":$port "; then + log_info "检测到服务正在端口 $port 上运行" + return 0 + fi + done + + # 检查相关进程 + if pgrep -f "node_exporter\|dcgm_exporter" >/dev/null 2>&1; then + log_info "检测到相关服务进程正在运行" + return 0 + fi + + return 1 +} + +# 检查是否为 root 用户 +check_root() { + if [[ $EUID -ne 0 ]]; then + log_error "此脚本需要 root 权限运行" + log_info "请使用: sudo sh setup.sh" + exit 1 + fi +} + +# 检查系统要求 +check_system() { + log_info "检查系统要求..." + + # 检查操作系统 + if [[ ! -f /etc/os-release ]]; then + log_error "无法检测操作系统版本" + exit 1 + fi + + # 读取系统信息,使用子shell避免污染当前环境变量 + local OS_INFO=$(source /etc/os-release && echo "$NAME $VERSION_ID") + log_info "检测到操作系统: $OS_INFO" + + # 检查系统架构 + arch=$(uname -m) + log_info "系统架构: $arch" + + # 检查磁盘空间 + available_space=$(df / | awk 'NR==2 {print $4}') + if [[ $available_space -lt 1024 ]]; then + log_warning "可用磁盘空间不足 1GB,当前可用: $(($available_space / 1024 / 1024))GB" + fi +} + +# 下载并安装 +install_argus_metric() { + # 如果没有指定版本,获取最新版本 + if [[ -z "$ARGUS_VERSION" ]]; then + ARGUS_VERSION=$(get_latest_version) + fi + + log_info "开始安装 Argus Metric v$ARGUS_VERSION..." 
+    log_info "安装目录: $INSTALL_DIR"
+    
+    # 创建安装目录结构(必须先创建,以便备份时目录存在)
+    create_install_directories
+    
+    # 检查是否已安装
+    local is_upgrade=false
+    if check_installed; then
+        local current_version=$(get_current_version)
+        if [[ "$current_version" == "$ARGUS_VERSION" ]]; then
+            if [[ "$FORCE_INSTALL" == true ]]; then
+                log_info "检测到相同版本 v$ARGUS_VERSION,但使用了 --force 参数,将强制重新安装"
+                is_upgrade=true
+                # 简化安装逻辑:不再备份当前版本
+                # backup_current_version
+            else
+                log_info "版本 v$ARGUS_VERSION 已安装,无需重复安装"
+                log_info "如需强制重新安装,请使用 --force 参数"
+                return 0
+            fi
+        else
+            log_info "检测到版本升级: v$current_version -> v$ARGUS_VERSION"
+            is_upgrade=true
+            
+            # 简化安装逻辑:不再备份当前版本
+            # backup_current_version
+        fi
+    fi
+    
+    # 创建临时目录
+    mkdir -p "$TEMP_DIR"
+    cd "$TEMP_DIR"
+    
+    # 下载发布包,使用新的命名规范
+    TAR_NAME="argus-metric_$(echo $ARGUS_VERSION | tr '.' '_').tar.gz"
+    log_info "下载发布包: $TAR_NAME"
+    log_info "从FTP服务器下载: $FTP_SERVER:$FTP_PORT, 用户: $FTP_USER"
+    
+    # 构造curl命令并显示(隐藏密码)
+    CURL_CMD="curl -u \"${FTP_USER}:***\" -sfL \"$BASE_URL/$TAR_NAME\" -o \"$TAR_NAME\""
+    log_info "执行命令: $CURL_CMD"
+    
+    if ! curl -u "${FTP_USER}:${FTP_PASS}" -sfL "$BASE_URL/$TAR_NAME" -o "$TAR_NAME"; then
+        log_error "下载发布包失败: $BASE_URL/$TAR_NAME"
+        # 安全修复:错误日志同样隐藏密码,避免将明文凭据写入日志
+        log_error "完整命令: curl -u \"${FTP_USER}:***\" -sfL \"$BASE_URL/$TAR_NAME\" -o \"$TAR_NAME\""
+        log_error "请检查FTP服务器连接、用户名密码是否正确"
+        exit 1
+    fi
+    
+    # 解压发布包到当前目录
+    log_info "解压发布包..."
+    if ! tar -xzf "$TAR_NAME"; then
+        log_error "解压发布包失败"
+        exit 1
+    fi
+    
+    # 显示解压后的文件结构
+    log_info "解压后的文件结构:"
+    ls -la "$TEMP_DIR"
+    
+    # 准备版本目录
+    local version_dir="$VERSIONS_DIR/$ARGUS_VERSION"
+    log_info "安装到版本目录: $version_dir"
+    
+    # 如果升级,先停止服务
+    if [[ "$is_upgrade" == true ]]; then
+        stop_services
+    fi
+    
+    # 创建版本目录
+    if [[ -d "$version_dir" ]]; then
+        log_info "版本目录已存在,备份后更新"
+        rm -rf "$version_dir"
+    fi
+    
+    # 创建新的版本目录
+    mkdir -p "$version_dir"
+    
+    # 移动解压的文件到版本目录
+    log_info "移动文件到版本目录: $TEMP_DIR/* -> $version_dir/"
+    
+    # 检查源目录是否有内容
+    if [[ ! 
"$(ls -A "$TEMP_DIR" 2>/dev/null)" ]]; then + log_error "临时目录为空,无法移动文件" + exit 1 + fi + + # 检查目标目录是否存在 + if [[ ! -d "$version_dir" ]]; then + log_error "目标版本目录不存在: $version_dir" + exit 1 + fi + + # 执行文件移动 + if mv "$TEMP_DIR"/* "$version_dir" 2>/dev/null; then + log_success "文件移动到版本目录完成" + else + log_error "移动文件到版本目录失败" + log_error "源目录内容:" + ls -la "$TEMP_DIR" || true + log_error "目标目录状态:" + ls -la "$version_dir" || true + log_error "权限检查:" + ls -ld "$TEMP_DIR" "$version_dir" || true + exit 1 + fi + + # 执行安装脚本 + log_info "执行安装脚本..." + cd "$version_dir" + if [[ -f "install.sh" ]]; then + chmod +x install.sh + # 传递安装根目录给安装脚本,让install_artifact.sh安装到正确的版本目录 + if ./install.sh "$version_dir"; then + log_success "安装脚本执行完成" + else + log_error "安装脚本执行失败" + # 简化安装逻辑:不再自动回滚 + # if [[ "$is_upgrade" == true ]]; then + # log_warning "升级失败,尝试回滚到之前版本..." + # # 确保备份目录存在 + # mkdir -p "$BACKUPS_DIR" + # local latest_backup=$(ls -1t "$BACKUPS_DIR" 2>/dev/null | head -n 1) + # if [[ -n "$latest_backup" ]]; then + # rollback_to_backup "$latest_backup" + # return 1 + # fi + # fi + exit 1 + fi + else + log_error "未找到安装脚本 install.sh" + exit 1 + fi + + # 更新软链接指向新版本 + log_info "更新当前版本链接..." + + # 如果 current 已经存在且是目录,先删除它 + if [[ -d "$CURRENT_LINK" ]] && [[ ! -L "$CURRENT_LINK" ]]; then + log_warning "发现 current 是目录而不是符号链接,正在删除..." + rm -rf "$CURRENT_LINK" + fi + + if ln -sfn "$version_dir" "$CURRENT_LINK"; then + log_success "版本链接更新完成: $CURRENT_LINK -> $version_dir" + else + log_error "版本链接更新失败" + exit 1 + fi + + # 更新LATEST_VERSION文件 + update_latest_version_file "$ARGUS_VERSION" + + # 初始化 DNS 配置文件到系统目录 + init_dns_config_to_system + + # 启动服务 + # start_services + + log_success "Argus Metric v$ARGUS_VERSION 安装完成!" 
+ + # 显示安装信息 + echo + log_info "安装信息:" + log_info " 版本: $ARGUS_VERSION" + log_info " 安装目录: $INSTALL_DIR" + log_info " 版本目录: $version_dir" + log_info " 当前链接: $CURRENT_LINK" + if [[ "$is_upgrade" == true ]]; then + log_info " 升级类型: 版本升级" + else + log_info " 安装类型: 全新安装" + fi +} + +# 卸载 +uninstall_argus_metric() { + log_info "开始卸载 Argus Metric..." + log_info "安装目录: $INSTALL_DIR" + + # 检查是否已安装 + if ! check_installed; then + log_info "未检测到已安装的 Argus Metric" + return 0 + fi + + local current_version=$(get_current_version) + log_info "检测到当前版本: v$current_version" + + # 停止服务 + stop_services + + # 执行卸载脚本 + log_info "执行卸载脚本..." + if [[ -f "$CURRENT_LINK/uninstall.sh" ]]; then + cd "$CURRENT_LINK" + chmod +x uninstall.sh + + # 自动确认卸载(因为用户已经明确使用了 --uninstall 参数) + log_info "自动确认卸载操作..." + echo "y" | ./uninstall.sh + local uninstall_exit_code=$? + + if [[ $uninstall_exit_code -eq 0 ]]; then + log_success "卸载脚本执行完成" + else + log_error "卸载脚本执行失败 (退出码: $uninstall_exit_code)" + exit 1 + fi + else + log_warning "未找到卸载脚本,执行基本清理" + fi + + # 清理安装目录 + log_info "清理安装目录..." + if [[ -d "$INSTALL_DIR" ]]; then + # 询问是否完全删除安装目录 + log_warning "这将删除整个安装目录: $INSTALL_DIR" + log_warning "包括所有版本、备份和配置文件" + + # 在自动化环境中,直接删除 + if rm -rf "$INSTALL_DIR"; then + log_success "安装目录已完全清理: $INSTALL_DIR" + else + log_error "清理安装目录失败" + exit 1 + fi + else + log_info "安装目录不存在,无需清理" + fi + + log_success "Argus Metric 卸载完成!" 
+} + +# 显示状态 +show_status() { + echo "==========================================" + echo " Argus Metric 安装状态" + echo "==========================================" + echo + + if check_installed; then + local current_version=$(get_current_version) + log_info "当前版本: $current_version" + log_info "安装目录: $INSTALL_DIR" + log_info "当前链接: $CURRENT_LINK" + log_info "版本目录: $VERSIONS_DIR/$current_version" + log_info "版本文件: $LATEST_VERSION_FILE" + + # 显示LATEST_VERSION文件内容 + if [[ -f "$LATEST_VERSION_FILE" ]]; then + local file_version=$(cat "$LATEST_VERSION_FILE" 2>/dev/null | tr -d '[:space:]') + log_info "版本文件内容: $file_version" + fi + + echo + log_info "目录结构:" + if [[ -d "$INSTALL_DIR" ]]; then + tree -L 2 "$INSTALL_DIR" 2>/dev/null || ls -la "$INSTALL_DIR" + fi + + echo + log_info "可用版本:" + if [[ -d "$VERSIONS_DIR" ]]; then + ls -1 "$VERSIONS_DIR" 2>/dev/null | sed 's/^/ - /' + else + echo " 无" + fi + + # 简化安装逻辑:不再显示备份版本信息 + # echo + # log_info "备份版本:" + # if [[ -d "$BACKUPS_DIR" ]] && [[ $(ls -1 "$BACKUPS_DIR" 2>/dev/null | wc -l) -gt 0 ]]; then + # ls -1t "$BACKUPS_DIR" 2>/dev/null | sed 's/^/ - /' + # else + # echo " 无" + # fi + else + log_warning "Argus Metric 未安装" + log_info "安装目录: $INSTALL_DIR" + fi +} + +# 列出备份 +list_backups() { + echo "==========================================" + echo " Argus Metric 备份列表" + echo "==========================================" + echo + + if [[ -d "$BACKUPS_DIR" ]] && [[ $(ls -1 "$BACKUPS_DIR" 2>/dev/null | wc -l) -gt 0 ]]; then + log_info "可用备份版本:" + ls -1t "$BACKUPS_DIR" 2>/dev/null | while read backup; do + local backup_time=$(stat -c %y "$BACKUPS_DIR/$backup" 2>/dev/null | cut -d' ' -f1-2) + echo " - $backup (创建时间: $backup_time)" + done + else + log_warning "没有可用的备份版本" + fi +} + +# 回滚功能 +rollback_version() { + log_info "开始回滚操作..." + + if ! 
check_installed; then + log_error "没有检测到已安装的版本,无法回滚" + exit 1 + fi + + # 确保备份目录存在 + mkdir -p "$BACKUPS_DIR" + + # 获取最新的备份 + local latest_backup=$(ls -1t "$BACKUPS_DIR" 2>/dev/null | head -n 1) + if [[ -z "$latest_backup" ]]; then + log_error "没有找到可用的备份版本" + exit 1 + fi + + log_info "将回滚到备份版本: $latest_backup" + + if rollback_to_backup "$latest_backup"; then + log_success "回滚完成!" + + # 显示当前状态 + echo + show_status + else + log_error "回滚失败" + exit 1 + fi +} + +# 主函数 +main() { + echo "==========================================" + echo " Argus Metric 在线安装脚本 v1.0" + echo "==========================================" + echo + + # 加载配置文件 + load_config + + # 对于状态操作,不需要FTP参数和root权限 + # 简化安装逻辑:不再支持备份列表操作 + if [[ "$ACTION" == "status" ]]; then + show_status + return 0 + fi + # if [[ "$ACTION" == "status" || "$ACTION" == "backup-list" ]]; then + # if [[ "$ACTION" == "status" ]]; then + # show_status + # elif [[ "$ACTION" == "backup-list" ]]; then + # list_backups + # fi + # return 0 + # fi + + check_root + + # 更新目录配置变量(在设置INSTALL_DIR后) + VERSIONS_DIR="$INSTALL_DIR/versions" + BACKUPS_DIR="$INSTALL_DIR/backups" + CURRENT_LINK="$INSTALL_DIR/current" + LATEST_VERSION_FILE="$INSTALL_DIR/LATEST_VERSION" + + # 简化安装逻辑:不再支持回滚操作 + # if [[ "$ACTION" == "rollback" ]]; then + # rollback_version + # return 0 + # fi + + check_ftp_params + check_system + + if [[ "$ACTION" == "uninstall" ]]; then + uninstall_argus_metric + else + install_argus_metric + fi + + echo + log_info "操作完成!" 
+} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/src/metric/client-plugins/all-in-one-full/scripts/sync_dns.sh b/src/metric/client-plugins/all-in-one-full/scripts/sync_dns.sh new file mode 100755 index 0000000..ba8a84c --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/scripts/sync_dns.sh @@ -0,0 +1,143 @@ +#!/bin/bash +set -e + +# 颜色 +RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m' + +# 日志函数 +log_info() { echo -e "${BLUE}[INFO]${NC} $1" >&2; } +log_success() { echo -e "${GREEN}[SUCCESS]${NC} $1" >&2; } +log_warning() { echo -e "${YELLOW}[WARNING]${NC} $1" >&2; } +log_error() { echo -e "${RED}[ERROR]${NC} $1" >&2; } + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +LOCAL_DNS_CONF="/opt/argus-metric/dns.conf" +RESOLV_CONF="/etc/resolv.conf" +ALT_RESOLV_CONF="/run/resolv.conf" +LOG_FILE="/opt/argus-metric/.dns_sync.log" +REMOTE_DNS_CONF_URL="" + +# 获取 FTP 配置 +get_ftp_config() { + log_info "获取 FTP 配置信息..." + if [[ -z "$FTP_SERVER" || -z "$FTP_USER" || -z "$FTP_PASSWORD" ]]; then + [[ -f "$SCRIPT_DIR/config.env" ]] && source "$SCRIPT_DIR/config.env" + fi + FTP_SERVER="${FTP_SERVER:-localhost}" + FTP_USER="${FTP_USER:-ftpuser}" + FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}" + REMOTE_DNS_CONF_URL="ftp://${FTP_USER}:${FTP_PASSWORD}@${FTP_SERVER}/dns.conf" +} + +# 下载远程 dns.conf +download_remote_dns_conf() { + local tmp="/tmp/dns.remote.$$" + log_info "测试 FTP 连接..." + if ! curl -u "${FTP_USER}:${FTP_PASSWORD}" -sfI "ftp://${FTP_SERVER}/" >/dev/null; then + log_error "无法连接到 FTP 服务器: $FTP_SERVER"; return 1 + fi + if ! 
curl -u "${FTP_USER}:${FTP_PASSWORD}" -sf "ftp://${FTP_SERVER}/dns.conf" -o "$tmp" 2>/dev/null; then + log_error "下载 dns.conf 失败"; rm -f "$tmp"; return 1 + fi + echo "$tmp" +} + +# 文件比较 +compare_files() { diff -q "$1" "$2" >/dev/null 2>&1; } + +# 从 dns.conf 提取有效 IP +get_dns_ips() { + grep -Eo '^[0-9]{1,3}(\.[0-9]{1,3}){3}$' "$1" | sort -u +} + +# 安全更新 resolv.conf(保留符号链接) +update_resolv_conf() { + local dns_conf="$1" + local dns_ips + mapfile -t dns_ips < <(get_dns_ips "$dns_conf") + [[ ${#dns_ips[@]} -eq 0 ]] && { log_warning "未检测到有效 DNS"; return; } + + local target_file="$RESOLV_CONF" + if [[ ! -w "$RESOLV_CONF" ]]; then + log_warning "/etc/resolv.conf 不可写,使用兜底路径 $ALT_RESOLV_CONF" + target_file="$ALT_RESOLV_CONF" + fi + + local temp="/tmp/resolv.new.$$" + cp "$target_file" "${target_file}.backup.$(date +%Y%m%d_%H%M%S)" 2>/dev/null || true + log_info "更新 DNS 配置文件: $target_file" + + # 写入新的 nameserver 行 + for ip in "${dns_ips[@]}"; do + echo "nameserver $ip" + done >"$temp" + + # 追加原内容(去掉重复 nameserver) + grep -v '^nameserver' "$target_file" >>"$temp" 2>/dev/null || true + awk '!a[$0]++' "$temp" >"${temp}.uniq" + + # ⚙️ 使用 cat 原地覆盖,避免 mv 引发 “设备忙” + if cat "${temp}.uniq" >"$target_file" 2>/dev/null; then + chmod 644 "$target_file" + log_success "DNS 更新完成: ${dns_ips[*]}" + else + log_error "无法写入 $target_file,可能被系统锁定" + fi + + rm -f "$temp" "${temp}.uniq" +} + +# 检查 resolv.conf 是否包含 dns.conf 内容 +ensure_dns_in_resolv() { + local dns_conf="$1" + local dns_ips + mapfile -t dns_ips < <(get_dns_ips "$dns_conf") + [[ ${#dns_ips[@]} -eq 0 ]] && return + + for ip in "${dns_ips[@]}"; do + if ! grep -q "nameserver $ip" "$RESOLV_CONF" 2>/dev/null; then + log_warning "检测到 /etc/resolv.conf 缺少 $ip,执行兜底修复" + update_resolv_conf "$dns_conf" + return + fi + done + log_info "/etc/resolv.conf 已包含所有 DNS" +} + +log_sync() { echo "[$(date '+%F %T')] $1" >>"$LOG_FILE"; } + +main() { + log_info "开始 DNS 同步检查..." + mkdir -p /opt/argus-metric + + get_ftp_config + local remote_file + if ! 
remote_file=$(download_remote_dns_conf); then + log_error "下载失败"; log_sync "同步失败"; exit 1 + fi + + if [[ ! -f "$LOCAL_DNS_CONF" ]]; then + log_info "本地 dns.conf 不存在,初始化..." + cp "$remote_file" "$LOCAL_DNS_CONF" + update_resolv_conf "$LOCAL_DNS_CONF" + log_sync "首次同步完成" + else + if compare_files "$LOCAL_DNS_CONF" "$remote_file"; then + log_info "dns.conf 无变化" + ensure_dns_in_resolv "$LOCAL_DNS_CONF" + log_sync "dns.conf 无变化,执行兜底检查" + else + log_info "检测到 DNS 配置更新" + cp "$remote_file" "$LOCAL_DNS_CONF" + update_resolv_conf "$LOCAL_DNS_CONF" + log_sync "DNS 配置同步完成" + fi + fi + + rm -f "$remote_file" + log_success "DNS 同步流程完成" +} + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/src/metric/client-plugins/all-in-one-full/scripts/uninstall_artifact.sh b/src/metric/client-plugins/all-in-one-full/scripts/uninstall_artifact.sh new file mode 100755 index 0000000..ca137a7 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/scripts/uninstall_artifact.sh @@ -0,0 +1,274 @@ +#!/bin/bash + +set -e + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 配置变量 +INSTALL_DIR="/opt/argus-metric" +TEMP_DIR="/tmp/argus-metric-uninstall-$$" +VERSION_FILE="version.json" + +# 检查是否为 root 用户 +check_root() { + if [[ $EUID -ne 0 ]]; then + log_error "此脚本需要 root 权限运行" + log_info "请使用: sudo $0" + exit 1 + fi +} + +# 查找版本文件 +find_version_file() { + log_info "查找版本信息文件..." 
+ + # 在当前目录查找 + if [[ -f "$VERSION_FILE" ]]; then + VERSION_FILE_PATH="$VERSION_FILE" + log_success "找到版本文件: $VERSION_FILE" + return 0 + fi + + # 在 artifact 目录查找 + for version_dir in artifact/*/; do + if [[ -f "${version_dir}${VERSION_FILE}" ]]; then + VERSION_FILE_PATH="${version_dir}${VERSION_FILE}" + log_success "找到版本文件: $VERSION_FILE_PATH" + return 0 + fi + done + + log_error "未找到版本信息文件 $VERSION_FILE" + log_info "请确保在正确的目录下运行此脚本" + exit 1 +} + +# 解析版本信息 +parse_version_info() { + log_info "解析版本信息..." + + if [[ ! -f "$VERSION_FILE_PATH" ]]; then + log_error "版本文件不存在: $VERSION_FILE_PATH" + exit 1 + fi + + # 使用 jq 解析 JSON(如果可用) + if command -v jq &> /dev/null; then + VERSION=$(jq -r '.version' "$VERSION_FILE_PATH") + BUILD_TIME=$(jq -r '.build_time' "$VERSION_FILE_PATH") + + # 解析 install_order(现在包含完整的文件名) + if jq -e '.install_order' "$VERSION_FILE_PATH" > /dev/null 2>&1; then + jq -r '.install_order[]' "$VERSION_FILE_PATH" > "$TEMP_DIR/install_order.txt" + else + log_error "version.json 中缺少 install_order 字段" + exit 1 + fi + else + log_warning "jq 未安装,使用简单的 JSON 解析" + VERSION=$(grep '"version"' "$VERSION_FILE_PATH" | sed 's/.*"version": *"\([^"]*\)".*/\1/') + BUILD_TIME=$(grep '"build_time"' "$VERSION_FILE_PATH" | sed 's/.*"build_time": *"\([^"]*\)".*/\1/') + + # 解析 install_order + grep -A 100 '"install_order"' "$VERSION_FILE_PATH" | grep -E '^\s*"[^"]+"' | while read line; do + component=$(echo "$line" | sed 's/.*"\([^"]*\)".*/\1/') + echo "$component" >> "$TEMP_DIR/install_order.txt" + done + fi + + log_success "版本信息解析完成" + log_info " 版本: $VERSION" + log_info " 构建时间: $BUILD_TIME" +} + +# 创建临时目录 +create_temp_dirs() { + log_info "创建临时目录..." + mkdir -p "$TEMP_DIR" + log_success "临时目录创建完成: $TEMP_DIR" +} + +# 卸载组件 +uninstall_components() { + log_info "开始卸载组件..." 
+ + artifact_dir=$(dirname "$VERSION_FILE_PATH") + uninstall_count=0 + total_count=0 + + if [[ -f "$TEMP_DIR/install_order.txt" ]]; then + total_count=$(wc -l < "$TEMP_DIR/install_order.txt") + fi + + if [[ -f "$TEMP_DIR/install_order.txt" ]]; then + while IFS= read -r filename; do + uninstall_count=$((uninstall_count + 1)) + + # 从文件名中提取组件名(去掉时间戳后缀) + component=$(echo "$filename" | sed 's/-[0-9]\{8\}-[0-9]\{6\}\.tar\.gz$//') + + log_info "[$uninstall_count/$total_count] 卸载 $component..." + + # 直接使用完整的文件名 + tar_file="$artifact_dir/$filename" + + if [[ ! -f "$tar_file" ]]; then + log_error "找不到组件文件: $filename" + exit 1 + fi + + # 解压到临时目录 + component_temp_dir="$TEMP_DIR/$component" + mkdir -p "$component_temp_dir" + + if tar -xzf "$tar_file" -C "$component_temp_dir"; then + log_success " $component 解压完成" + else + log_error " $component 解压失败" + exit 1 + fi + + # 查找解压后的目录 + extracted_dir="" + for dir in "$component_temp_dir"/*; do + if [[ -d "$dir" ]]; then + extracted_dir="$dir" + break + fi + done + + if [[ -z "$extracted_dir" ]]; then + log_error " $component 解压后未找到目录" + exit 1 + fi + + # 执行卸载脚本 + if [[ -f "$extracted_dir/uninstall.sh" ]]; then + log_info " 执行 $component 卸载脚本..." + # 所有组件都只需要一个确认 + if (cd "$extracted_dir" && echo "y" | ./uninstall.sh); then + log_success " $component 卸载完成" + else + log_error " $component 卸载失败" + exit 1 + fi + else + log_warning " $component 缺少 uninstall.sh 文件,跳过卸载" + fi + + # 清理临时文件 + rm -rf "$component_temp_dir" + done < "$TEMP_DIR/install_order.txt" + fi + + log_success "所有组件卸载完成" +} + +# 清理全局文件 +cleanup_global_files() { + log_info "清理全局文件..." 
+ + # 清理安装目录 + if [[ -d "$INSTALL_DIR" ]]; then + rm -rf "$INSTALL_DIR" + log_success "安装目录已清理: $INSTALL_DIR" + else + log_info "安装目录不存在: $INSTALL_DIR" + fi + + # 清理可能的全局配置文件 + local global_configs=( + "/etc/argus-metric" + "/var/log/argus-metric" + ) + + for config in "${global_configs[@]}"; do + if [[ -d "$config" ]]; then + rm -rf "$config" + log_success "全局配置已清理: $config" + fi + done +} + +# 显示卸载信息 +show_uninstall_info() { + log_success "Argus-Metrics All-in-One 卸载完成!" + echo + echo "卸载信息:" + echo " 版本: $VERSION" + echo " 构建时间: $BUILD_TIME" + echo + echo "清理内容:" + echo " - 二进制文件" + echo " - 配置文件" + echo " - 数据目录" + echo " - 进程和服务" + echo " - 全局安装目录" + echo + echo "注意:" + echo " - 系统依赖包可能仍然存在" + echo " - 如需完全清理,请手动检查并删除相关文件" + echo +} + +# 清理函数 +cleanup() { + if [[ -d "$TEMP_DIR" ]]; then + rm -rf "$TEMP_DIR" + fi +} + +# 设置清理陷阱 +trap cleanup EXIT + +# 主函数 +main() { + echo "==========================================" + echo " Argus-Metrics All-in-One 卸载脚本" + echo "==========================================" + echo + + check_root + find_version_file + create_temp_dirs + parse_version_info + + log_warning "此操作将完全卸载 Argus-Metrics All-in-One" + read -p "确认继续?(y/N): " confirm + + if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then + log_info "取消卸载操作" + exit 0 + fi + + uninstall_components + cleanup_global_files + show_uninstall_info +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi \ No newline at end of file diff --git a/src/metric/client-plugins/all-in-one-full/scripts/version-manager.sh b/src/metric/client-plugins/all-in-one-full/scripts/version-manager.sh new file mode 100755 index 0000000..65e566c --- /dev/null +++ b/src/metric/client-plugins/all-in-one-full/scripts/version-manager.sh @@ -0,0 +1,350 @@ +#!/bin/bash + +set -e + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e 
"${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 显示帮助信息 +show_help() { + echo "AIOps 版本管理工具" + echo + echo "用法: $0 [options]" + echo + echo "命令:" + echo " bump - 升级版本号 (major|minor|patch)" + echo " set - 设置指定版本号" + echo " show - 显示当前版本信息" + echo " list - 列出所有版本" + echo " clean - 清理旧版本" + echo " validate - 验证版本配置" + echo + echo "示例:" + echo " $0 bump minor # 升级次版本号 1.0.0 -> 1.1.0" + echo " $0 set 2.0.0 # 设置版本为 2.0.0" + echo " $0 show # 显示当前版本" + echo " $0 list # 列出所有版本" +} + +# 获取当前版本 +get_current_version() { + if [[ -f "config/VERSION" ]]; then + cat config/VERSION + else + echo "0.0.0" + fi +} + +# 设置版本号 +set_version() { + local new_version="$1" + + # 验证版本号格式 + if [[ ! "$new_version" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + log_error "无效的版本号格式: $new_version" + log_info "版本号格式应为: major.minor.patch (如: 1.2.3)" + exit 1 + fi + + echo "$new_version" > config/VERSION + log_success "版本号已设置为: $new_version" +} + +# 升级版本号 +bump_version() { + local bump_type="$1" + local current_version=$(get_current_version) + + # 解析当前版本号 + IFS='.' 
read -r major minor patch <<< "$current_version" + + case "$bump_type" in + "major") + major=$((major + 1)) + minor=0 + patch=0 + ;; + "minor") + minor=$((minor + 1)) + patch=0 + ;; + "patch") + patch=$((patch + 1)) + ;; + *) + log_error "无效的升级类型: $bump_type" + log_info "支持的类型: major, minor, patch" + exit 1 + ;; + esac + + local new_version="$major.$minor.$patch" + set_version "$new_version" + log_success "版本号已从 $current_version 升级到 $new_version" +} + +# 显示当前版本信息 +show_version() { + local current_version=$(get_current_version) + log_info "当前版本: $current_version" + + if [[ -f "config/checklist" ]]; then + echo + echo "组件清单:" + while IFS= read -r line; do + [[ -z "$line" || "$line" =~ ^[[:space:]]*# ]] && continue + read -r component version dep order <<< "$line" + if [[ -n "$component" && -n "$version" ]]; then + echo " - $component v$version" + fi + done < config/checklist + fi + + # 检查是否有对应的 artifact + local artifact_dir="artifact/$current_version" + if [[ -d "$artifact_dir" ]]; then + echo + echo "已构建的组件:" + for file in "$artifact_dir"/*.tar.gz; do + if [[ -f "$file" ]]; then + local filename=$(basename "$file") + local size=$(du -h "$file" | cut -f1) + echo " - $filename ($size)" + fi + done + + if [[ -f "$artifact_dir/version.json" ]]; then + echo + echo "版本信息文件: $artifact_dir/version.json" + fi + else + echo + log_warning "未找到对应的构建目录: $artifact_dir" + log_info "运行 ./package.sh 进行构建" + fi +} + +# 列出所有版本 +list_versions() { + log_info "所有版本列表:" + echo + + if [[ ! 
-d "artifact" ]]; then + log_warning "artifact 目录不存在" + return + fi + + for version_dir in artifact/*/; do + if [[ -d "$version_dir" ]]; then + local version=$(basename "$version_dir") + local current_version=$(get_current_version) + + if [[ "$version" == "$current_version" ]]; then + echo " * $version (当前版本)" + else + echo " $version" + fi + + # 显示该版本的组件 + local component_count=0 + for file in "$version_dir"/*.tar.gz; do + if [[ -f "$file" ]]; then + component_count=$((component_count + 1)) + fi + done + + if [[ $component_count -gt 0 ]]; then + echo " 包含 $component_count 个组件" + fi + fi + done +} + +# 清理旧版本 +clean_versions() { + local current_version=$(get_current_version) + local keep_versions=5 # 保留最近5个版本 + + log_info "清理旧版本 (保留最近 $keep_versions 个版本)..." + + if [[ ! -d "artifact" ]]; then + log_warning "artifact 目录不存在" + return + fi + + # 获取所有版本目录,按修改时间排序 + local versions=() + while IFS= read -r -d '' version_dir; do + versions+=("$(basename "$version_dir")") + done < <(find artifact -maxdepth 1 -type d -name "[0-9]*" -print0 | sort -z) + + local total_versions=${#versions[@]} + local versions_to_remove=$((total_versions - keep_versions)) + + if [[ $versions_to_remove -le 0 ]]; then + log_info "无需清理,当前只有 $total_versions 个版本" + return + fi + + log_info "将删除 $versions_to_remove 个旧版本..." + + for ((i=0; i /etc/apt/sources.list && \ echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \ echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \ - else \ - echo "Configuring fast apt sources for external network..." 
&& \ - find /etc/apt -name "sources.list*" -exec sed -i 's/archive.ubuntu.com/mirrors.aliyun.com/g' {} \; && \ - find /etc/apt -name "sources.list*" -exec sed -i 's/security.ubuntu.com/mirrors.aliyun.com/g' {} \; && \ - echo "deb http://mirrors.aliyun.com/ubuntu/ jammy main restricted universe multiverse" > /etc/apt/sources.list && \ - echo "deb http://mirrors.aliyun.com/ubuntu/ jammy-updates main restricted universe multiverse" >> /etc/apt/sources.list && \ - echo "deb http://mirrors.aliyun.com/ubuntu/ jammy-security main restricted universe multiverse" >> /etc/apt/sources.list; \ fi # 验证源配置并安装常用工具 @@ -61,10 +54,25 @@ RUN mkdir -p ${PROMETHEUS_BASE_PATH}/rules \ && ln -s ${PROMETHEUS_BASE_PATH} /prometheus # 修改 Prometheus 用户 UID/GID 并授权 -RUN usermod -u ${ARGUS_BUILD_UID} nobody && \ - groupmod -g ${ARGUS_BUILD_GID} nogroup && \ - chown -h nobody:nogroup /prometheus && \ - chown -R nobody:nogroup ${PROMETHEUS_BASE_PATH} && \ +RUN set -eux; \ + existing_user=""; \ + if getent passwd "${ARGUS_BUILD_UID}" >/dev/null 2>&1; then \ + existing_user="$(getent passwd "${ARGUS_BUILD_UID}" | cut -d: -f1)"; \ + fi; \ + if [ -n "$existing_user" ] && [ "$existing_user" != "nobody" ]; then \ + userdel -r "$existing_user" || true; \ + fi; \ + existing_group=""; \ + if getent group "${ARGUS_BUILD_GID}" >/dev/null 2>&1; then \ + existing_group="$(getent group "${ARGUS_BUILD_GID}" | cut -d: -f1)"; \ + fi; \ + if [ -n "$existing_group" ] && [ "$existing_group" != "nogroup" ]; then \ + groupdel "$existing_group" || true; \ + fi; \ + usermod -u ${ARGUS_BUILD_UID} nobody; \ + groupmod -g ${ARGUS_BUILD_GID} nogroup; \ + chown -h nobody:nogroup /prometheus; \ + chown -R nobody:nogroup ${PROMETHEUS_BASE_PATH}; \ chown -R nobody:nogroup /etc/prometheus # supervisor 配置 diff --git a/src/metric/tests/docker-compose.yml b/src/metric/tests/docker-compose.yml index d05853b..f14603e 100644 --- a/src/metric/tests/docker-compose.yml +++ b/src/metric/tests/docker-compose.yml @@ -5,13 +5,6 @@ networks: 
services: ftp: - build: - context: ../ftp/build - dockerfile: Dockerfile - args: - ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133} - ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015} - USE_INTRANET: ${USE_INTRANET:-false} image: argus-metric-ftp:latest container_name: argus-ftp restart: unless-stopped @@ -41,13 +34,6 @@ services: max-file: "3" prometheus: - build: - context: ../prometheus/build - dockerfile: Dockerfile - args: - ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133} - ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015} - USE_INTRANET: ${USE_INTRANET:-false} image: argus-metric-prometheus:latest container_name: argus-prometheus restart: unless-stopped @@ -73,12 +59,6 @@ services: max-file: "3" grafana: - build: - context: ../grafana/build - dockerfile: Dockerfile - args: - ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133} - ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015} image: argus-metric-grafana:latest container_name: argus-grafana restart: unless-stopped @@ -109,9 +89,6 @@ services: max-file: "3" test-node: - build: - context: ./client-test-node/build - dockerfile: Dockerfile image: argus-metric-test-node:latest container_name: argus-metric-test-node hostname: test-metric-node-001 @@ -143,9 +120,6 @@ services: max-file: "3" test-gpu-node: - build: - context: ./client-test-gpu-node/build - dockerfile: Dockerfile image: argus-metric-test-gpu-node:latest container_name: argus-metric-test-gpu-node hostname: test-metric-gpu-node-001 diff --git a/src/metric/tests/scripts/01_start_services.sh b/src/metric/tests/scripts/01_start_services.sh index 01e587f..7faa6c4 100755 --- a/src/metric/tests/scripts/01_start_services.sh +++ b/src/metric/tests/scripts/01_start_services.sh @@ -3,15 +3,8 @@ set -e SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" -# 解析参数 -REBUILD_FLAG="" -if [[ "$1" == "--rebuild" || "$1" == "-r" ]]; then - REBUILD_FLAG="--rebuild" - echo "[01] 启用强制重新构建模式" -fi - echo "[01] 启动所有服务..." 
-bash "$SCRIPT_DIR/common/start-all.sh" $REBUILD_FLAG +bash "$SCRIPT_DIR/common/start-all.sh" echo "[01] 等待服务就绪..." sleep 5 diff --git a/src/metric/tests/scripts/04_test_gpu_node_install.sh b/src/metric/tests/scripts/04_test_gpu_node_install.sh index ce1d19a..b0e2355 100755 --- a/src/metric/tests/scripts/04_test_gpu_node_install.sh +++ b/src/metric/tests/scripts/04_test_gpu_node_install.sh @@ -1,6 +1,9 @@ #!/bin/bash set -e +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +COMMON_DIR="$SCRIPT_DIR/common" + FTP_SERVER="${FTP_SERVER:-172.30.0.40}" FTP_USER="${FTP_USER:-ftpuser}" FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}" @@ -8,26 +11,37 @@ FTP_PORT="${FTP_PORT:-21}" FTP_HOST="${FTP_SERVER}" -echo "[03] 进入测试节点执行安装..." -echo "[03] 使用 FTP 地址: ${FTP_HOST}:${FTP_PORT}" +echo "[04] 检测GPU环境..." +# 检测GPU环境 +if bash "$COMMON_DIR/check-gpu.sh"; then + echo "[04] GPU环境可用,继续执行GPU节点安装" + GPU_AVAILABLE=true +else + echo "[04] GPU环境不可用,跳过GPU节点安装" + GPU_AVAILABLE=false + exit 0 +fi + +echo "[04] 进入测试节点执行安装..." +echo "[04] 使用 FTP 地址: ${FTP_HOST}:${FTP_PORT}" docker exec argus-metric-test-gpu-node bash -c " set -e if ! command -v curl &>/dev/null; then - echo '[03] curl 未安装,正在安装...' + echo '[04] curl 未安装,正在安装...' apt-get update && apt-get install -y curl fi cd /tmp -echo '[03] 下载 setup.sh...' +echo '[04] 下载 setup.sh...' curl -u ${FTP_USER}:${FTP_PASSWORD} ftp://${FTP_HOST}:${FTP_PORT}/setup.sh -o setup.sh -echo '[03] 执行安装...' +echo '[04] 执行安装...' chmod +x setup.sh bash setup.sh --server ${FTP_HOST} --user ${FTP_USER} --password '${FTP_PASSWORD}' --port ${FTP_PORT} -echo '[03] 安装完成' +echo '[04] 安装完成' " -echo "[03] 完成" +echo "[04] 完成" diff --git a/src/metric/tests/scripts/common/check-gpu.sh b/src/metric/tests/scripts/common/check-gpu.sh new file mode 100755 index 0000000..c602304 --- /dev/null +++ b/src/metric/tests/scripts/common/check-gpu.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +# GPU环境检测脚本 +# 检测系统是否有NVIDIA GPU硬件 + +set -e + +# 检测函数 +check_gpu_support() { + echo "检测GPU环境..." 
+ + # 方法1: 检测GPU设备文件 + if ls /dev/nvidia* &>/dev/null; then + echo "✓ 检测到NVIDIA GPU设备文件" + return 0 + fi + + # 方法2: 检测lspci中的NVIDIA设备(Linux) + if command -v lspci &> /dev/null; then + if lspci | grep -i nvidia &> /dev/null; then + echo "✓ 检测到NVIDIA GPU硬件" + return 0 + fi + fi + + # 方法3: 检测nvidia-smi + if command -v nvidia-smi &> /dev/null; then + if nvidia-smi &> /dev/null; then + echo "✓ 检测到NVIDIA GPU硬件" + return 0 + fi + fi + + echo "✗ 未检测到NVIDIA GPU硬件" + return 1 +} + +# 主函数 +main() { + echo "==========================================" + echo " GPU环境检测" + echo "==========================================" + echo "" + + if check_gpu_support; then + echo "" + echo "结果: GPU环境可用" + exit 0 + else + echo "" + echo "结果: GPU环境不可用,将跳过GPU相关服务" + exit 1 + fi +} + +# 如果直接运行此脚本 +if [ "${BASH_SOURCE[0]}" = "${0}" ]; then + main "$@" +fi diff --git a/src/metric/tests/scripts/common/start-all.sh b/src/metric/tests/scripts/common/start-all.sh index 5521367..7f0e7d5 100755 --- a/src/metric/tests/scripts/common/start-all.sh +++ b/src/metric/tests/scripts/common/start-all.sh @@ -1,7 +1,8 @@ #!/bin/bash # 一键启动脚本 -# 用于初始化目录、构建镜像并启动所有服务 +# 用于初始化目录并启动所有服务 +# 镜像构建已移至 build/build_images.sh set -e @@ -9,12 +10,6 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" TEST_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)" cd "$TEST_DIR" -# 解析参数 -FORCE_REBUILD=false -if [[ "$1" == "--rebuild" ]]; then - FORCE_REBUILD=true -fi - echo "==========================================" echo " Argus Metrics 一键启动脚本" echo "==========================================" @@ -37,26 +32,6 @@ echo "使用: docker compose" echo "Compose 文件: $TEST_DIR/docker-compose.yml" echo "" -# 检查必要的构建目录 -echo "检查构建目录..." -BUILD_DIRS=( - "../ftp/build" - "../prometheus/build" - "../grafana/build" - "client-test-node/build" - "client-test-gpu-node/build" -) - -for dir in "${BUILD_DIRS[@]}"; do - if [ ! 
-d "$dir" ]; then - echo "错误: 构建目录不存在: $dir" - echo "完整路径: $(cd "$(dirname "$dir")" 2>/dev/null && pwd)/$(basename "$dir")" - exit 1 - else - echo " ✓ 找到: $dir" - fi -done -echo "" # 检查并创建 .env 文件 if [ ! -f .env ]; then @@ -84,118 +59,65 @@ echo "1. 初始化目录结构..." bash "$SCRIPT_DIR/init-directories.sh" echo "" -echo "2. 准备 Docker 镜像..." - -# 检查镜像是否存在 -IMAGE_CACHE_DIR="$TEST_DIR/images-cache" -IMAGES=("argus-metric-ftp:latest" "argus-metric-prometheus:latest" "argus-metric-grafana:latest" "argus-metric-test-node:latest" "argus-metric-test-gpu-node:latest") -all_images_exist=true - -for image in "${IMAGES[@]}"; do - if ! docker images --format "{{.Repository}}:{{.Tag}}" | grep -q "^${image}$"; then - all_images_exist=false - break - fi -done - -if $FORCE_REBUILD; then - echo "强制重新构建镜像(--rebuild 模式)..." - cd "$TEST_DIR" - docker compose build --no-cache - echo "镜像重新构建完成" -elif $all_images_exist; then - echo "所有镜像已存在,跳过构建" +echo "2. 检测GPU环境..." +# 检测GPU环境 +if bash "$SCRIPT_DIR/check-gpu.sh"; then + echo "GPU环境可用,将启动GPU节点" + GPU_AVAILABLE=true else - echo "检测到缺失镜像,尝试从缓存加载..." - - # 尝试从缓存加载 - loaded_from_cache=false - if [ -d "$IMAGE_CACHE_DIR" ]; then - for image in "${IMAGES[@]}"; do - if ! 
docker images --format "{{.Repository}}:{{.Tag}}" | grep -q "^${image}$"; then - # 镜像不存在,尝试加载 - case "$image" in - "argus-metric-ftp:latest") - cache_file="${IMAGE_CACHE_DIR}/argus-ftp.tar" - ;; - "argus-metric-prometheus:latest") - cache_file="${IMAGE_CACHE_DIR}/argus-prometheus.tar" - ;; - "argus-metric-grafana:latest") - cache_file="${IMAGE_CACHE_DIR}/argus-grafana.tar" - ;; - "argus-metric-test-node:latest") - cache_file="${IMAGE_CACHE_DIR}/argus-test-node.tar" - ;; - "argus-metric-test-gpu-node:latest") - cache_file="${IMAGE_CACHE_DIR}/argus-test-gpu-node.tar" - ;; - esac - - if [ -f "$cache_file" ]; then - echo " 从缓存加载: $image" - docker load -i "$cache_file" - loaded_from_cache=true - fi - fi - done - fi - - # 检查加载后是否还有缺失的镜像 - need_build=false - for image in "${IMAGES[@]}"; do - if ! docker images --format "{{.Repository}}:{{.Tag}}" | grep -q "^${image}$"; then - need_build=true - break - fi - done - - if $need_build; then - echo "" - echo "部分镜像缺失,开始构建..." - echo "工作目录: $(pwd)" - cd "$TEST_DIR" - docker compose build --no-cache - - # 询问是否保存镜像 - echo "" - read -p "是否保存镜像到缓存以便下次快速启动? (Y/n): " -n 1 -r - echo - if [[ ! $REPLY =~ ^[Nn]$ ]]; then - mkdir -p "$IMAGE_CACHE_DIR" - echo "保存镜像到缓存..." 
- for image in "${IMAGES[@]}"; do - case "$image" in - "argus-metric-ftp:latest") - docker save -o "${IMAGE_CACHE_DIR}/argus-ftp.tar" "$image" && echo " 已保存: argus-ftp.tar" - ;; - "argus-metric-prometheus:latest") - docker save -o "${IMAGE_CACHE_DIR}/argus-prometheus.tar" "$image" && echo " 已保存: argus-prometheus.tar" - ;; - "argus-metric-grafana:latest") - docker save -o "${IMAGE_CACHE_DIR}/argus-grafana.tar" "$image" && echo " 已保存: argus-grafana.tar" - ;; - "argus-metric-test-node:latest") - docker save -o "${IMAGE_CACHE_DIR}/argus-test-node.tar" "$image" && echo " 已保存: argus-test-node.tar" - ;; - "argus-metric-test-gpu-node:latest") - docker save -o "${IMAGE_CACHE_DIR}/argus-test-gpu-node.tar" "$image" && echo " 已保存: argus-test-gpu-node.tar" - ;; - esac - done - echo "镜像已保存到: $IMAGE_CACHE_DIR/" - fi - elif $loaded_from_cache; then - echo "" - echo "所有镜像已从缓存加载完成!" - fi + echo "GPU环境不可用,跳过GPU节点" + GPU_AVAILABLE=false fi echo "" -echo "3. 启动基础服务..." +echo "3. 检查 Docker 镜像..." + +# 检查必要的镜像是否存在 +BASE_IMAGES=("argus-metric-ftp:latest" "argus-metric-prometheus:latest" "argus-metric-grafana:latest" "argus-metric-test-node:latest") +GPU_IMAGES=("argus-metric-test-gpu-node:latest") + +# 先检查基础镜像 +missing_images=() +for image in "${BASE_IMAGES[@]}"; do + if ! docker images --format "{{.Repository}}:{{.Tag}}" | grep -q "^${image}$"; then + missing_images+=("$image") + fi +done + +# 检查GPU镜像(如果GPU环境可用) +if [ "$GPU_AVAILABLE" = true ]; then + for image in "${GPU_IMAGES[@]}"; do + if ! docker images --format "{{.Repository}}:{{.Tag}}" | grep -q "^${image}$"; then + missing_images+=("$image") + fi + done +fi + +if [ ${#missing_images[@]} -gt 0 ]; then + echo "以下镜像缺失,请先运行 build/build_images.sh 构建镜像:" + for image in "${missing_images[@]}"; do + echo " • $image" + done + echo "" + echo "构建命令:" + echo " ./build/build_images.sh --metric" + exit 1 +else + echo "所有必要镜像已存在" +fi + +echo "" +echo "4. 启动基础服务..." 
cd "$TEST_DIR" -# 启动除GPU节点外的所有服务 -docker compose up -d ftp prometheus grafana test-node test-gpu-node + +# 根据GPU环境决定启动的服务 +if [ "$GPU_AVAILABLE" = true ]; then + echo "启动所有服务(包括GPU节点)..." + docker compose up -d ftp prometheus grafana test-node test-gpu-node +else + echo "启动基础服务(跳过GPU节点)..." + docker compose up -d ftp prometheus grafana test-node +fi echo "" echo "4. 等待服务启动..." diff --git a/src/sys/build/node/Dockerfile b/src/sys/build/node/Dockerfile new file mode 100644 index 0000000..d47d71f --- /dev/null +++ b/src/sys/build/node/Dockerfile @@ -0,0 +1,36 @@ +FROM ubuntu:22.04 + +ENV DEBIAN_FRONTEND=noninteractive \ + TZ=Asia/Shanghai + +ARG USE_INTRANET=false +ARG ARGUS_BUILD_UID=2133 +ARG ARGUS_BUILD_GID=2015 + +ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \ + ARGUS_BUILD_GID=${ARGUS_BUILD_GID} + +# Optional: switch to intranet apt mirrors during build +RUN if [ "$USE_INTRANET" = "true" ]; then \ + echo "Configuring intranet apt sources..." && \ + cp /etc/apt/sources.list /etc/apt/sources.list.bak && \ + echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \ + echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \ + echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \ + fi + +# Install base tools and all libs that Fluent Bit may require at runtime +# so that start-fluent-bit.sh will NOT fallback to apt during container start. 
+RUN set -eux; \ + apt-get update; \ + apt-get install -y --no-install-recommends \ + ca-certificates tzdata \ + procps iproute2 net-tools lsof \ + libpq5 libyaml-0-2 libsasl2-2 libldap-2.5-0; \ + rm -rf /var/lib/apt/lists/* + +# Keep root; compose provides entrypoint via bind mount +USER root + +CMD ["bash", "-lc", "sleep infinity"] + diff --git a/src/sys/build/test-gpu-node/Dockerfile b/src/sys/build/test-gpu-node/Dockerfile new file mode 100644 index 0000000..a2ac383 --- /dev/null +++ b/src/sys/build/test-gpu-node/Dockerfile @@ -0,0 +1,34 @@ +FROM nvidia/cuda:12.2.2-runtime-ubuntu22.04 + +ENV DEBIAN_FRONTEND=noninteractive \ + TZ=Asia/Shanghai \ + NVIDIA_VISIBLE_DEVICES=all \ + NVIDIA_DRIVER_CAPABILITIES=compute,utility + +ARG USE_INTRANET=false +ARG ARGUS_BUILD_UID=2133 +ARG ARGUS_BUILD_GID=2015 + +ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \ + ARGUS_BUILD_GID=${ARGUS_BUILD_GID} + +# Optional intranet mirror for build-time apt +RUN if [ "$USE_INTRANET" = "true" ]; then \ + echo "Configuring intranet apt sources..." 
&& \ + cp /etc/apt/sources.list /etc/apt/sources.list.bak && \ + echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \ + echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \ + echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \ + fi + +# Pre-install curl and diagnostics to avoid runtime apt installs in GPU test node +RUN set -eux; \ + apt-get update; \ + apt-get install -y --no-install-recommends \ + curl ca-certificates tzdata \ + procps iproute2 net-tools lsof; \ + rm -rf /var/lib/apt/lists/* + +USER root +CMD ["bash", "-lc", "sleep infinity"] + diff --git a/src/sys/build/test-node/Dockerfile b/src/sys/build/test-node/Dockerfile new file mode 100644 index 0000000..6c2c277 --- /dev/null +++ b/src/sys/build/test-node/Dockerfile @@ -0,0 +1,32 @@ +FROM ubuntu:22.04 + +ENV DEBIAN_FRONTEND=noninteractive \ + TZ=Asia/Shanghai + +ARG USE_INTRANET=false +ARG ARGUS_BUILD_UID=2133 +ARG ARGUS_BUILD_GID=2015 + +ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \ + ARGUS_BUILD_GID=${ARGUS_BUILD_GID} + +# Optional intranet mirror for build-time apt +RUN if [ "$USE_INTRANET" = "true" ]; then \ + echo "Configuring intranet apt sources..." 
&& \ + cp /etc/apt/sources.list /etc/apt/sources.list.bak && \ + echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \ + echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \ + echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \ + fi + +# Pre-install curl and common diagnostics to avoid runtime apt installs +RUN set -eux; \ + apt-get update; \ + apt-get install -y --no-install-recommends \ + curl ca-certificates tzdata \ + procps iproute2 net-tools lsof; \ + rm -rf /var/lib/apt/lists/* + +USER root +CMD ["bash", "-lc", "sleep infinity"] + diff --git a/src/sys/tests/README.md b/src/sys/tests/README.md index 77435a5..c166625 100644 --- a/src/sys/tests/README.md +++ b/src/sys/tests/README.md @@ -32,7 +32,7 @@ - 一键执行 - `cd src/sys/tests` - - `./scripts/00_e2e_test.sh` + - `./scripts/00_e2e_test.sh`(CPU-only)或 `./scripts/00_e2e_test.sh --enable-gpu`(启用 GPU 流程) - 分步执行(推荐用于排查) - `./scripts/01_bootstrap.sh` 生成目录/拷贝 `update-dns.sh`/构建 agent 二进制/写 `.env` @@ -42,7 +42,12 @@ - `./scripts/05_agent_register.sh` 获取两个节点的 `node_id` 与初始 IP,检查本地 `node.json` - `./scripts/06_write_health_and_assert.sh` 写健康文件并断言 `nodes.json` 仅包含 2 个在线节点 - `./scripts/07_logs_send_and_assert.sh` 向两个节点写日志,断言 ES `train-*`/`infer-*` 计数增长 - - `./scripts/08_restart_agent_reregister.sh` `node-b` 改为固定 IP `172.29.0.200`,验证保持同一节点 ID 且 IP/时间戳更新 + - `./scripts/08_restart_agent_reregister.sh` `node-b` 改为固定 IP `172.31.0.200`,验证保持同一节点 ID 且 IP/时间戳更新 + - `./scripts/10_metric_publish.sh` 发布 metric 客户端包到 FTP + - `./scripts/11_metric_node_install.sh` 在 CPU 节点安装并验证端点 + - `./scripts/12_metric_gpu_install.sh` 在 GPU 节点安装并等待 9100/9400 就绪(仅启用 GPU 时) + - `./scripts/13_metric_verify.sh` 对 master/Prometheus/数据面/Grafana 做综合校验(含 GPU 时校验 dcgm 指标) + - `./scripts/14_metric_cleanup.sh` 清理 FTP 产物 - `./scripts/09_down.sh` 回收容器、网络并清理 `private*/`、`tmp/` - 重置环境 @@ -53,16 +58,17 @@ ## 二、测试部署架构(docker-compose) - 网络 - - 自定义 
bridge:`argus-sys-net`,子网 `172.29.0.0/16` - - 固定地址:bind=`172.29.0.2`,master=`172.29.0.10` + - 自定义 bridge:`argus-sys-net`,子网 `172.31.0.0/16` + - 固定地址:bind=`172.31.0.2`,master=`172.31.0.10` -- 服务与端口 +- 服务与端口(宿主机映射端口由 `01_bootstrap.sh` 自动分配并写入 `.env`) + - 关键变量:`MASTER_PORT`、`ES_HTTP_PORT`、`KIBANA_PORT`、`NODE_A_PORT`、`NODE_B_PORT`、`PROMETHEUS_PORT`、`GRAFANA_PORT`、`ALERTMANAGER_PORT`、`WEB_PROXY_PORT_8080..8085`、`FTP_PORT`、`FTP_DATA_PORT`、`FTP_PASSIVE_HOST_RANGE` - `bind`(`argus-bind9:latest`):监听 53/tcp+udp;负责同步 `*.argus.com` 记录 - - `master`(`argus-master:latest`):对外 `32300→3000`;API `http://localhost:32300` - - `es`(`argus-elasticsearch:latest`):`9200→9200`;单节点,无安全 - - `kibana`(`argus-kibana:latest`):`5601→5601`;通过 `ELASTICSEARCH_HOSTS=http://es:9200` 访问 ES - - `node-a`(`ubuntu:22.04`):同时运行 Fluent Bit + argus-agent,`hostname=dev-yyrshare-nbnyx10-cp2f-pod-0`,`2020→2020` - - `node-b`(`ubuntu:22.04`):同时运行 Fluent Bit + argus-agent,`hostname=dev-yyrshare-uuuu10-ep2f-pod-0`,`2021→2020` + - `master`(`argus-master:latest`):对外 `${MASTER_PORT}→3000`;API `http://localhost:${MASTER_PORT}` + - `es`(`argus-elasticsearch:latest`):`${ES_HTTP_PORT}→9200`;单节点,无安全 + - `kibana`(`argus-kibana:latest`):`${KIBANA_PORT}→5601`;通过 `ELASTICSEARCH_HOSTS=http://es:9200` 访问 ES + - `node-a`(`ubuntu:22.04`):同时运行 Fluent Bit + argus-agent,`hostname=dev-yyrshare-nbnyx10-cp2f-pod-0`,`${NODE_A_PORT}→2020` + - `node-b`(`ubuntu:22.04`):同时运行 Fluent Bit + argus-agent,`hostname=dev-yyrshare-uuuu10-ep2f-pod-0`,`${NODE_B_PORT}→2020` - 卷与目录 - 核心服务(bind/master/es/kibana)共享宿主 `./private` 挂载到容器 `/private` @@ -72,7 +78,7 @@ - 节点容器的 Fluent Bit/agent 资产以只读方式挂载到 `/assets`/`/usr/local/bin/argus-agent` - DNS 配置 - - 节点容器通过 compose 配置 `dns: [172.29.0.2]` 指向 bind,不挂载 `/etc/resolv.conf`,也不依赖 `update-dns.sh` + - 节点容器通过 compose 配置 `dns: [172.31.0.2]` 指向 bind,不挂载 `/etc/resolv.conf`,也不依赖 `update-dns.sh` - master/es/kibana 仍共享 `./private`,master 启动会写 `/private/argus/etc/master.argus.com` 供 bind 同步 A 记录 - 节点入口 @@ -106,6 +112,7 @@ - 
判定: - `private/argus/etc/master.argus.com` 存在且为 master IP - 在 node-a/node-b 内 `getent hosts master.argus.com` 成功解析到 master IP + - 在 metric CPU/GPU 节点内可解析 `master.argus.com` 与 `prom.metric.argus.com` - `05_agent_register.sh` - 目的:确认两个节点注册到 master 并持久化 `node.json` @@ -136,3 +143,16 @@ --- 如需更严格的断言(例如 Kibana 载入具体插件、ES 文档字段校验),可在 `07_*.sh` 中追加查询与校验逻辑。 + +--- + +## 可选:GPU 流程说明 +- 前置条件:宿主安装 NVIDIA 驱动与 `nvidia-container-toolkit`,`nvidia-smi` 在宿主可用。 +- 启用方式: + - 一键:`./scripts/00_e2e_test.sh --enable-gpu` + - 分步:设置 `ARGUS_SYS_ENABLE_GPU=true` 后执行 `01_bootstrap.sh`、`02_up.sh`;或直接在 `.env` 中将 `ENABLE_GPU=true` 后单独运行 `02_up.sh`。 +- `01_bootstrap.sh` 会写入: + - `METRIC_TEST_HOSTNAME_GPU=test-metric-gpu-node-001` + - `METRIC_TEST_INSTANCE_GPU=172.31.0.51:9100` + - `METRIC_TEST_DCGM_GPU=172.31.0.51:9400` +- 验证点:`04_verify_dns_routing.sh` 增加对 metric 节点的域名解析;`12_metric_gpu_install.sh` 等待 9100/9400;`13_metric_verify_*` 校验 dcgm 指标与 Grafana 面板。 diff --git a/src/sys/tests/docker-compose.yml b/src/sys/tests/docker-compose.yml index 03b9f76..ba06411 100644 --- a/src/sys/tests/docker-compose.yml +++ b/src/sys/tests/docker-compose.yml @@ -1,21 +1,18 @@ -version: "3.8" - networks: - default: - name: argus-sys-net + sysnet: driver: bridge ipam: driver: default config: - - subnet: 172.29.0.0/16 + - subnet: 172.31.0.0/16 services: bind: image: ${BIND_IMAGE_TAG:-argus-bind9:latest} container_name: argus-bind-sys networks: - default: - ipv4_address: 172.29.0.2 + sysnet: + ipv4_address: 172.31.0.2 volumes: - ./private:/private restart: unless-stopped @@ -32,14 +29,14 @@ services: - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} ports: - - "32300:3000" + - "${MASTER_PORT:-32300}:3000" volumes: - ./private/argus/master:/private/argus/master - ./private/argus/metric/prometheus:/private/argus/metric/prometheus - ./private/argus/etc:/private/argus/etc networks: - default: - ipv4_address: 172.29.0.10 + sysnet: + ipv4_address: 172.31.0.10 restart: unless-stopped es: @@ 
-55,8 +52,11 @@ services: - ./private/argus/log/elasticsearch:/private/argus/log/elasticsearch - ./private/argus/etc:/private/argus/etc ports: - - "9200:9200" + - "${ES_HTTP_PORT:-9200}:9200" restart: unless-stopped + networks: + sysnet: + ipv4_address: 172.31.0.3 kibana: image: argus-kibana:latest @@ -71,11 +71,14 @@ services: depends_on: - es ports: - - "5601:5601" + - "${KIBANA_PORT:-5601}:5601" restart: unless-stopped + networks: + sysnet: + ipv4_address: 172.31.0.4 node-a: - image: ubuntu:22.04 + image: argus-sys-node:latest container_name: argus-node-a hostname: dev-yyrshare-nbnyx10-cp2f-pod-0 depends_on: @@ -101,13 +104,16 @@ services: entrypoint: - /usr/local/bin/node-entrypoint.sh dns: - - 172.29.0.2 + - 172.31.0.2 # internal bind for *.argus.com + - 8.8.8.8 # external fallback for apt/external domains ports: - - "2020:2020" + - "${NODE_A_PORT:-2020}:2020" restart: unless-stopped + networks: + - sysnet node-b: - image: ubuntu:22.04 + image: argus-sys-node:latest container_name: argus-node-b hostname: dev-yyrshare-uuuu10-ep2f-pod-0 depends_on: @@ -133,7 +139,269 @@ services: entrypoint: - /usr/local/bin/node-entrypoint.sh dns: - - 172.29.0.2 + - 172.31.0.2 + - 8.8.8.8 ports: - - "2021:2020" + - "${NODE_B_PORT:-2021}:2020" restart: unless-stopped + networks: + - sysnet + + ftp: + image: argus-metric-ftp:latest + container_name: argus-ftp + restart: unless-stopped + environment: + - TZ=Asia/Shanghai + - FTP_BASE_PATH=/private/argus/ftp + - FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!} + - DOMAIN=${FTP_DOMAIN:-ftp.metric.argus.com} + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + ports: + - "${FTP_PORT:-21}:21" + - "${FTP_DATA_PORT:-20}:20" + - "${FTP_PASSIVE_HOST_RANGE:-21100-21110}:21100-21110" + volumes: + - ./private/argus/metric/ftp:/private/argus/ftp + - ./private/argus/etc:/private/argus/etc + - /etc/localtime:/etc/localtime:ro + - /etc/timezone:/etc/timezone:ro + networks: + sysnet: + ipv4_address: 172.31.0.40 + 
logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" + + prometheus: + image: argus-metric-prometheus:latest + container_name: argus-prometheus + restart: unless-stopped + environment: + - TZ=Asia/Shanghai + - PROMETHEUS_BASE_PATH=/private/argus/metric/prometheus + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + ports: + - "${PROMETHEUS_PORT:-9090}:9090" + volumes: + - ./private/argus/metric/prometheus:/private/argus/metric/prometheus + - ./private/argus/etc:/private/argus/etc + - /etc/localtime:/etc/localtime:ro + - /etc/timezone:/etc/timezone:ro + networks: + sysnet: + ipv4_address: 172.31.0.41 + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" + + grafana: + image: argus-metric-grafana:latest + container_name: argus-grafana + restart: unless-stopped + environment: + - TZ=Asia/Shanghai + - GRAFANA_BASE_PATH=/private/argus/metric/grafana + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + - GF_SERVER_HTTP_PORT=3000 + - GF_LOG_LEVEL=warn + - GF_LOG_MODE=console + - GF_PATHS_PROVISIONING=/private/argus/metric/grafana/provisioning + - GF_AUTH_ANONYMOUS_ENABLED=true + - GF_AUTH_ANONYMOUS_ORG_ROLE=Viewer + ports: + - "${GRAFANA_PORT:-3000}:3000" + volumes: + - ./private/argus/metric/grafana:/private/argus/metric/grafana + - ./private/argus/etc:/private/argus/etc + - /etc/localtime:/etc/localtime:ro + - /etc/timezone:/etc/timezone:ro + networks: + sysnet: + ipv4_address: 172.31.0.42 + depends_on: + - prometheus + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" + + # --- Added: Web Frontend (no host port; resolved by DNS as web.argus.com) --- + web-frontend: + image: argus-web-frontend:latest + container_name: argus-web-frontend + environment: + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + # Frontend runtime-injected external ports (used to render hyperlinks) + - 
EXTERNAL_MASTER_PORT=${WEB_PROXY_PORT_8085:-8085} + - EXTERNAL_ALERTMANAGER_PORT=${WEB_PROXY_PORT_8084:-8084} + - EXTERNAL_GRAFANA_PORT=${WEB_PROXY_PORT_8081:-8081} + - EXTERNAL_PROMETHEUS_PORT=${WEB_PROXY_PORT_8082:-8082} + - EXTERNAL_KIBANA_PORT=${WEB_PROXY_PORT_8083:-8083} + volumes: + - ./private/argus/etc:/private/argus/etc + networks: + sysnet: + ipv4_address: 172.31.0.80 + restart: unless-stopped + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" + + test-node: + image: argus-sys-metric-test-node:latest + container_name: argus-metric-test-node + hostname: test-metric-node-001 + restart: unless-stopped + privileged: true + depends_on: + - ftp + - prometheus + environment: + - TZ=Asia/Shanghai + - DEBIAN_FRONTEND=noninteractive + - FTP_DOMAIN=${FTP_DOMAIN:-ftp.metric.argus.com} + - FTP_SERVER=${FTP_SERVER:-172.31.0.40} + - FTP_USER=${FTP_USER:-ftpuser} + - FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!} + - FTP_PORT=${FTP_PORT:-21} + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + - METRIC_NODE_ROLE=cpu + volumes: + - ./private/argus/agent:/private/argus/agent + - ./scripts/metric/test-node-entrypoint.sh:/usr/local/bin/metric-test-node-entrypoint.sh:ro + - /etc/localtime:/etc/localtime:ro + - /etc/timezone:/etc/timezone:ro + entrypoint: + - /usr/local/bin/metric-test-node-entrypoint.sh + command: + - sleep + - infinity + dns: + - 172.31.0.2 + - 8.8.8.8 + networks: + sysnet: + ipv4_address: 172.31.0.50 + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" + + test-gpu-node: + profiles: ["gpu"] + image: argus-sys-metric-test-gpu-node:latest + container_name: argus-metric-test-gpu-node + hostname: test-metric-gpu-node-001 + restart: unless-stopped + privileged: true + runtime: nvidia + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: + - gpu + depends_on: + - ftp + - prometheus + environment: + - TZ=Asia/Shanghai + - 
DEBIAN_FRONTEND=noninteractive + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + - GPU_MODE=gpu + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + - METRIC_NODE_ROLE=gpu + volumes: + - ./private/argus/agent:/private/argus/agent + - ./scripts/metric/test-node-entrypoint.sh:/usr/local/bin/metric-test-node-entrypoint.sh:ro + - /etc/localtime:/etc/localtime:ro + - /etc/timezone:/etc/timezone:ro + entrypoint: + - /usr/local/bin/metric-test-node-entrypoint.sh + command: + - sleep + - infinity + dns: + - 172.31.0.2 + - 8.8.8.8 + networks: + sysnet: + ipv4_address: 172.31.0.51 + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" + + # --- Added: Alertmanager --- + alertmanager: + image: argus-alertmanager:latest + container_name: argus-alertmanager + environment: + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + volumes: + - ./private/argus/etc:/private/argus/etc + - ./private/argus/alert/alertmanager:/private/argus/alert/alertmanager + networks: + sysnet: + ipv4_address: 172.31.0.82 + ports: + - "${ALERTMANAGER_PORT:-9093}:9093" + restart: unless-stopped + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" + + # --- Added: Web Proxy (multi-port gateway) --- + web-proxy: + image: argus-web-proxy:latest + container_name: argus-web-proxy + depends_on: + - bind + - master + - grafana + - prometheus + - kibana + - alertmanager + environment: + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + volumes: + - ./private/argus/etc:/private/argus/etc + networks: + sysnet: + ipv4_address: 172.31.0.81 + ports: + - "${WEB_PROXY_PORT_8080:-8080}:8080" + - "${WEB_PROXY_PORT_8081:-8081}:8081" + - "${WEB_PROXY_PORT_8082:-8082}:8082" + - "${WEB_PROXY_PORT_8083:-8083}:8083" + - "${WEB_PROXY_PORT_8084:-8084}:8084" + - "${WEB_PROXY_PORT_8085:-8085}:8085" + restart: unless-stopped + logging: + 
driver: "json-file" + options: + max-size: "10m" + max-file: "3" diff --git a/src/sys/tests/scripts/00_e2e_test.sh b/src/sys/tests/scripts/00_e2e_test.sh index 2079c4f..bbc9507 100755 --- a/src/sys/tests/scripts/00_e2e_test.sh +++ b/src/sys/tests/scripts/00_e2e_test.sh @@ -3,6 +3,45 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ENABLE_GPU=false +CLEANUP=true + +usage() { + cat <<'EOF' +Usage: 00_e2e_test.sh [options] + +Options: + --enable-gpu 启用 GPU 相关拓扑与测试流程 + --no-clean 跳过清理流程(不执行 14 和 09) + -h, --help 显示帮助信息 +EOF +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --enable-gpu) + ENABLE_GPU=true + shift + ;; + --no-clean) + CLEANUP=false + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown argument: $1" >&2 + usage + exit 1 + ;; + esac +done + +export ARGUS_SYS_ENABLE_GPU=$ENABLE_GPU + +# 基础步骤(不包含清理与下线) SCRIPTS=( "01_bootstrap.sh" "02_up.sh" @@ -12,9 +51,20 @@ SCRIPTS=( "06_write_health_and_assert.sh" "07_logs_send_and_assert.sh" "08_restart_agent_reregister.sh" - "09_down.sh" + "10_metric_publish.sh" + "11_metric_node_install.sh" + "12_metric_gpu_install.sh" + "13_metric_verify.sh" ) +# 如未禁用清理,则追加清理与下线步骤(保持原有顺序) +if [[ "$CLEANUP" == "true" ]]; then + SCRIPTS+=( + "14_metric_cleanup.sh" + "09_down.sh" + ) +fi + for script in "${SCRIPTS[@]}"; do echo "[SYS-E2E] Running $script" "$SCRIPT_DIR/$script" @@ -22,5 +72,8 @@ for script in "${SCRIPTS[@]}"; do echo done -echo "[SYS-E2E] All tests completed" - +if [[ "$CLEANUP" == "true" ]]; then + echo "[SYS-E2E] All tests completed" +else + echo "[SYS-E2E] All tests completed (cleanup skipped)" +fi diff --git a/src/sys/tests/scripts/01_bootstrap.sh b/src/sys/tests/scripts/01_bootstrap.sh index e550a43..a4dd69e 100755 --- a/src/sys/tests/scripts/01_bootstrap.sh +++ b/src/sys/tests/scripts/01_bootstrap.sh @@ -22,24 +22,66 @@ ensure_image() { } echo "[INFO] Preparing directories..." 
+ensure_writable_dir() { + local path="$1" + local parent + parent="$(dirname "$path")" + mkdir -p "$parent" 2>/dev/null || true + mkdir -p "$path" 2>/dev/null || true + if [[ ! -w "$path" ]]; then + docker run --rm -v "$parent:/target" ubuntu:24.04 bash -lc "chown -R $(id -u):$(id -g) /target" >/dev/null 2>&1 || true + fi + mkdir -p "$path" +} + +# preflight: make base dirs writable if inherited from root-owned mounts +ensure_writable_dir "$PRIVATE_CORE/argus" +ensure_writable_dir "$PRIVATE_CORE/argus/metric" +ensure_writable_dir "$PRIVATE_CORE/argus/metric/grafana" +ensure_writable_dir "$PRIVATE_CORE/argus/metric/prometheus" + mkdir -p \ "$PRIVATE_CORE/argus/etc" \ "$PRIVATE_CORE/argus/bind" \ "$PRIVATE_CORE/argus/master" \ "$PRIVATE_CORE/argus/metric/prometheus" \ + "$PRIVATE_CORE/argus/alert/alertmanager" \ + "$PRIVATE_CORE/argus/metric/ftp/share" \ + "$PRIVATE_CORE/argus/metric/grafana/data" \ + "$PRIVATE_CORE/argus/metric/grafana/logs" \ + "$PRIVATE_CORE/argus/metric/grafana/plugins" \ + "$PRIVATE_CORE/argus/metric/grafana/provisioning/datasources" \ + "$PRIVATE_CORE/argus/metric/grafana/provisioning/dashboards" \ + "$PRIVATE_CORE/argus/metric/grafana/data/sessions" \ + "$PRIVATE_CORE/argus/metric/grafana/data/dashboards" \ + "$PRIVATE_CORE/argus/metric/grafana/config" \ + "$PRIVATE_CORE/argus/metric/prometheus/data" \ + "$PRIVATE_CORE/argus/metric/prometheus/rules" \ + "$PRIVATE_CORE/argus/metric/prometheus/targets" \ + "$PRIVATE_CORE/argus/agent" \ "$PRIVATE_CORE/argus/log/elasticsearch" \ "$PRIVATE_CORE/argus/log/kibana" \ "$PRIVATE_NODEA/argus/agent/dev-yyrshare-nbnyx10-cp2f-pod-0/health" \ "$PRIVATE_NODEB/argus/agent/dev-yyrshare-uuuu10-ep2f-pod-0/health" \ "$TMP_DIR" -# Align ownership for supervisor-managed services (ES/Kibana expect UID/GID inside container) +# Align ownership for supervisor-managed services (ES/Kibana/Grafana expect UID/GID inside container) echo "[INFO] Fixing ownership for core private directories..." 
chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" \ "$PRIVATE_CORE/argus/log/elasticsearch" \ "$PRIVATE_CORE/argus/log/kibana" \ + "$PRIVATE_CORE/argus/metric/grafana" \ + "$PRIVATE_CORE/argus/metric/prometheus" \ + "$PRIVATE_CORE/argus/alert" \ + "$PRIVATE_CORE/argus/metric/ftp" \ + "$PRIVATE_CORE/argus/agent" \ "$PRIVATE_CORE/argus/etc" 2>/dev/null || true +# 确保 alert 与 etc 目录组可写,便于非 root 且仅匹配 GID 的服务写入运行文件 +chmod -R g+w "$PRIVATE_CORE/argus/alert" "$PRIVATE_CORE/argus/etc" 2>/dev/null || true + +echo "[INFO] Using compose-managed network (auto-created by docker compose)" + echo "[INFO] Distributing update-dns.sh for core services (bind/master/es/kibana)" BIND_UPDATE_SRC="$REPO_ROOT/src/bind/build/update-dns.sh" BIND_UPDATE_DEST="$PRIVATE_CORE/argus/etc/update-dns.sh" @@ -55,6 +97,51 @@ ensure_image "argus-elasticsearch:latest" ensure_image "argus-kibana:latest" ensure_image "argus-bind9:latest" ensure_image "argus-master:latest" +ensure_image "argus-metric-ftp:latest" +ensure_image "argus-metric-prometheus:latest" +ensure_image "argus-metric-grafana:latest" +ensure_image "argus-web-frontend:latest" +ensure_image "argus-web-proxy:latest" +ensure_image "argus-alertmanager:latest" + +echo "[INFO] Preparing Fluent Bit local dependency packages..." +FLB_BUILD_PACKAGES_DIR="$REPO_ROOT/src/log/fluent-bit/build/packages" +mkdir -p "$FLB_BUILD_PACKAGES_DIR" +for deb in \ + "$REPO_ROOT/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libyaml-0-2_"*_amd64.deb \ + "$REPO_ROOT/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libpq5_"*_amd64.deb ; do + if ls $deb >/dev/null 2>&1; then + for f in $deb; do + base="$(basename "$f")" + if [[ ! 
-f "$FLB_BUILD_PACKAGES_DIR/$base" ]]; then + cp "$f" "$FLB_BUILD_PACKAGES_DIR/" + echo " [+] copied $base" + fi + done + fi +done + +# 额外:从 all-in-one-full 的 ubuntu22/curl.tar.gz 解包必要依赖(libsasl2/ldap),便于离线安装 +CURLOPT_TAR="$REPO_ROOT/src/metric/client-plugins/all-in-one-full/deps/ubuntu22/curl.tar.gz" +if [[ -f "$CURLOPT_TAR" ]]; then + tmpdir=$(mktemp -d) + if tar -xzf "$CURLOPT_TAR" -C "$tmpdir" 2>/dev/null; then + for p in \ + libsasl2-2_*_amd64.deb \ + libsasl2-modules-db_*_amd64.deb \ + libldap-2.5-0_*_amd64.deb \ + libidn2-0_*_amd64.deb \ + libbrotli1_*_amd64.deb \ + libssl3_*_amd64.deb ; do + src=$(ls "$tmpdir"/curl/$p 2>/dev/null | head -n1 || true) + if [[ -n "$src" ]]; then + base="$(basename "$src")" + [[ -f "$FLB_BUILD_PACKAGES_DIR/$base" ]] || cp "$src" "$FLB_BUILD_PACKAGES_DIR/" && echo " [+] staged $base" + fi + done + fi + rm -rf "$tmpdir" +fi echo "[INFO] Building agent binary..." pushd "$REPO_ROOT/src/agent" >/dev/null @@ -68,10 +155,139 @@ if [[ ! -x "$AGENT_BIN" ]]; then fi echo "$AGENT_BIN" > "$TMP_DIR/agent_binary_path" -echo "[INFO] Writing .env with UID/GID" +# 检测GPU环境 +REQUEST_GPU=${ARGUS_SYS_ENABLE_GPU:-false} +GPU_CHECK_SCRIPT="$REPO_ROOT/src/metric/tests/scripts/common/check-gpu.sh" +if [[ "$REQUEST_GPU" == "true" ]]; then + echo "[INFO] --enable-gpu 已启用,开始检测GPU环境..." 
+ if [[ -f "$GPU_CHECK_SCRIPT" ]]; then + if bash "$GPU_CHECK_SCRIPT" >/dev/null 2>&1; then + echo "[INFO] GPU环境可用,将在 compose 中启用 test-gpu-node" + GPU_AVAILABLE=true + else + echo "[ERROR] 未检测到可用 GPU,但指定了 --enable-gpu" >&2 + exit 1 + fi + else + echo "[ERROR] 未找到 GPU 检测脚本: $GPU_CHECK_SCRIPT" >&2 + exit 1 + fi +else + GPU_AVAILABLE=false + echo "[INFO] GPU 支持未启用,跳过 GPU 检测" +fi + +echo "[INFO] Writing .env with UID/GID and metric configuration" +############################################# +# 动态分配宿主机端口并写入 .env +############################################# + +# 读取现有 .env(若存在),用于保留密码/域名等 +EXIST_DOTENV="$TEST_ROOT/.env" +if [[ -f "$EXIST_DOTENV" ]]; then + EXISTING_FTP_PASSWORD="$(grep -E '^FTP_PASSWORD=' "$EXIST_DOTENV" | tail -n1 | sed 's/^FTP_PASSWORD=//')" + EXISTING_FTP_DOMAIN="$(grep -E '^FTP_DOMAIN=' "$EXIST_DOTENV" | tail -n1 | sed 's/^FTP_DOMAIN=//')" + EXISTING_USE_INTRANET="$(grep -E '^USE_INTRANET=' "$EXIST_DOTENV" | tail -n1 | sed 's/^USE_INTRANET=//')" +else + EXISTING_FTP_PASSWORD="" + EXISTING_FTP_DOMAIN="" + EXISTING_USE_INTRANET="" +fi + +is_port_free() { + local p="$1" + ss -ltnH 2>/dev/null | awk -v pat=":${p}$" '$4 ~ pat{f=1} END{exit f?1:0}' +} + +find_free_port() { + local prefer="$1"; local start_scan="${2:-20000}"; local max="${3:-65000}" + if is_port_free "$prefer"; then echo "$prefer"; return 0; fi + local p + for (( p=start_scan; p<=max; p++ )); do + if is_port_free "$p"; then echo "$p"; return 0; fi + done + return 1 +} + +find_free_range() { + local begin="$1"; local end="$2"; local need_count=$((end-begin+1)) + local try_start="$begin" + while (( try_start + need_count - 1 <= 65000 )); do + local ok=1 + for (( p=try_start; p "$TEST_ROOT/.env" </dev/null 2>&1; then @@ -13,10 +14,92 @@ compose() { } echo "[INFO] Bringing up system stack..." 
+ +# 加载 .env 以获取端口(由 01_bootstrap 生成) +if [[ -f "$TEST_ROOT/.env" ]]; then + set -a; source "$TEST_ROOT/.env"; set +a +fi + +# GPU 开关优先级:显式环境变量 > .env 中的 ENABLE_GPU > 默认 false +if [[ "${ARGUS_SYS_ENABLE_GPU:-}" == "true" ]]; then + REQUEST_GPU=true +elif [[ "${ARGUS_SYS_ENABLE_GPU:-}" == "false" ]]; then + REQUEST_GPU=false +else + REQUEST_GPU=${ENABLE_GPU:-false} +fi + +GPU_AVAILABLE=false +GPU_CHECK_SCRIPT="$REPO_ROOT/src/metric/tests/scripts/common/check-gpu.sh" + +if [[ "$REQUEST_GPU" == "true" ]]; then + echo "[INFO] --enable-gpu 生效,验证主机 GPU..." + if [[ -f "$GPU_CHECK_SCRIPT" ]]; then + if bash "$GPU_CHECK_SCRIPT" >/dev/null 2>&1; then + GPU_AVAILABLE=true + echo "[INFO] GPU 检测通过,将启动 gpu profile" + else + echo "[ERROR] 主机缺少可用 GPU,无法继续 --enable-gpu 流程" >&2 + exit 1 + fi + else + echo "[ERROR] 未找到 GPU 检测脚本: $GPU_CHECK_SCRIPT" >&2 + exit 1 + fi +else + echo "[INFO] 未启用 GPU 流程" +fi + pushd "$TEST_ROOT" >/dev/null compose -p argus-sys down --remove-orphans || true -compose -p argus-sys up -d + +# 清理可能由 08 脚本创建的同名容器,避免 compose up 冲突 +for name in argus-node-b; do + if docker ps -aqf "name=^${name}$" >/dev/null 2>&1 && [[ -n "$(docker ps -aqf "name=^${name}$")" ]]; then + docker rm -f "$name" >/dev/null 2>&1 || true + fi +done + +# 预检:检查多端口网关所需宿主端口是否空闲 +check_port_free() { + local p="$1" + if ss -ltnp 2>/dev/null | grep -q ":${p} "; then + echo "[ERR] Host port ${p} is already in use. Please free it before running 02_up.sh" >&2 + ss -ltnp | awk -v p=":${p} " '$0 ~ p {print " " $0}' || true + return 1 + fi + return 0 +} + +for port in \ + "${WEB_PROXY_PORT_8080:-8080}" \ + "${WEB_PROXY_PORT_8081:-8081}" \ + "${WEB_PROXY_PORT_8082:-8082}" \ + "${WEB_PROXY_PORT_8083:-8083}" \ + "${WEB_PROXY_PORT_8084:-8084}" \ + "${WEB_PROXY_PORT_8085:-8085}"; do + check_port_free "$port" || { echo "[ERR] Required port busy: $port"; exit 1; } +done + +# 根据GPU可用性决定启动的服务 +if [[ "$GPU_AVAILABLE" == true ]]; then + echo "[INFO] 启动所有服务(包含 gpu profile)..." 
+ compose -p argus-sys --profile gpu up -d || true +else + echo "[INFO] 启动基础服务(不含 gpu profile)..." + compose -p argus-sys up -d || true +fi + +# 若 web-proxy 处于 Created 状态,尝试单独启动一次(处理偶发 Address already in use 后端已释放的场景) +if docker ps -a --format '{{.Names}}\t{{.Status}}' | grep -q '^argus-web-proxy\s\+Created'; then + echo "[WARN] web-proxy in Created state; retry starting it..." + docker start argus-web-proxy || true +fi + popd >/dev/null -echo "[OK] Services started: master:32300 es:9200 kibana:5601 node-a:2020 node-b:2021" - +if [[ "$GPU_AVAILABLE" == true ]]; then + echo "[OK] Services started: master:${MASTER_PORT:-32300} es:${ES_HTTP_PORT:-9200} kibana:${KIBANA_PORT:-5601} node-a:${NODE_A_PORT:-2020} node-b:${NODE_B_PORT:-2021} test-gpu-node:172.31.0.51" +else + echo "[OK] Services started: master:${MASTER_PORT:-32300} es:${ES_HTTP_PORT:-9200} kibana:${KIBANA_PORT:-5601} node-a:${NODE_A_PORT:-2020} node-b:${NODE_B_PORT:-2021} (gpu skipped)" +fi diff --git a/src/sys/tests/scripts/03_wait_ready.sh b/src/sys/tests/scripts/03_wait_ready.sh index 4887181..07cd4c2 100755 --- a/src/sys/tests/scripts/03_wait_ready.sh +++ b/src/sys/tests/scripts/03_wait_ready.sh @@ -27,25 +27,34 @@ wait_http() { echo "[INFO] Waiting for ES/Kibana/Master/Fluent Bit/Bind..." +# 载入端口变量 +if [[ -f "$TEST_ROOT/.env" ]]; then + set -a; source "$TEST_ROOT/.env"; set +a +fi + # ES (>= yellow) attempt=1; max=120 +ES_T0=$(date +%s) while (( attempt <= max )); do - if curl -fsS "http://localhost:9200/_cluster/health?wait_for_status=yellow&timeout=1s" >/dev/null 2>&1; then + if curl -fsS "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health?wait_for_status=yellow&timeout=1s" >/dev/null 2>&1; then break fi echo "[..] 
waiting ES ($attempt/$max)"; sleep 5; ((attempt++)) done [[ $attempt -le $max ]] || { echo "[ERR] ES not ready" >&2; exit 1; } +ES_T1=$(date +%s); echo "[TIME] ES ready in $((ES_T1-ES_T0))s" # Kibana: must be HTTP 200 and overall.level=available echo "[INFO] Waiting for Kibana to be available (HTTP 200)..." kb_attempt=1; kb_max=180 +KB_T0=$(date +%s) while (( kb_attempt <= kb_max )); do - body=$(curl -sS "http://localhost:5601/api/status" 2>/dev/null || true) - code=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:5601/api/status" || echo 000) + body=$(curl -sS "http://localhost:${KIBANA_PORT:-5601}/api/status" 2>/dev/null || true) + code=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:${KIBANA_PORT:-5601}/api/status" || echo 000) if [[ "$code" == "200" ]]; then if echo "$body" | grep -q '"level":"available"'; then - echo "[OK] Kibana available (HTTP 200)" + KB_T1=$(date +%s) + echo "[OK] Kibana available (HTTP 200) in $((KB_T1-KB_T0))s" break fi fi @@ -58,11 +67,13 @@ if (( kb_attempt > kb_max )); then fi # Master -wait_http "http://localhost:32300/readyz" 120 +MASTER_T0=$(date +%s) +wait_http "http://localhost:${MASTER_PORT:-32300}/readyz" 120 +MASTER_T1=$(date +%s); echo "[TIME] Master readyz in $((MASTER_T1-MASTER_T0))s" # Fluent Bit (host metrics on host ports) -wait_http "http://localhost:2020/api/v2/metrics" 120 -wait_http "http://localhost:2021/api/v2/metrics" 120 +FB1_T0=$(date +%s); wait_http "http://localhost:${NODE_A_PORT:-2020}/api/v2/metrics" 120; FB1_T1=$(date +%s); echo "[TIME] FluentBit:${NODE_A_PORT:-2020} in $((FB1_T1-FB1_T0))s" +FB2_T0=$(date +%s); wait_http "http://localhost:${NODE_B_PORT:-2021}/api/v2/metrics" 120; FB2_T1=$(date +%s); echo "[TIME] FluentBit:${NODE_B_PORT:-2021} in $((FB2_T1-FB2_T0))s" # Bind config check BIND_ID="$(service_id bind)" @@ -72,4 +83,63 @@ else echo "[WARN] bind container id not found" fi +# ========== Additional module readiness checks ========== + +# Prometheus +PROM_T0=$(date +%s); wait_http 
"http://localhost:${PROMETHEUS_PORT:-9090}/-/ready" 120; PROM_T1=$(date +%s); echo "[TIME] Prometheus ready in $((PROM_T1-PROM_T0))s" + +# Grafana health (database: ok) +echo "[INFO] Waiting for Grafana health..." +gf_attempt=1; gf_max=120 +while (( gf_attempt <= gf_max )); do + gf_body=$(curl -sS "http://localhost:${GRAFANA_PORT:-3000}/api/health" 2>/dev/null || true) + gf_code=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:${GRAFANA_PORT:-3000}/api/health" || echo 000) + if [[ "$gf_code" == "200" ]] && echo "$gf_body" | grep -q '"database"\s*:\s*"ok"'; then + echo "[OK] Grafana health database=ok" + break + fi + echo "[..] waiting grafana health ($gf_attempt/$gf_max), last_code=$gf_code" + sleep 3; ((gf_attempt++)) +done +if (( gf_attempt > gf_max )); then + echo "[ERR] Grafana /api/health not ready" >&2; exit 1 +fi + +# Alertmanager +wait_http "http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status" 120 + +# Web proxy checks(按端口细化) +code_for() { curl -s -o /dev/null -w "%{http_code}" "$1" || echo 000; } +header_val() { curl -s -D - -o /dev/null "$@" | awk -F': ' 'BEGIN{IGNORECASE=1}$1=="Access-Control-Allow-Origin"{gsub("\r","",$2);print $2}'; } + +echo "[INFO] Checking web-proxy ports..." + +# 8080 首页必须 200 +tries=1; max=60; P8080_T0=$(date +%s) +while (( tries <= max )); do + c=$(code_for "http://localhost:${WEB_PROXY_PORT_8080:-8080}/") + if [[ "$c" == "200" ]]; then P8080_T1=$(date +%s); echo "[OK] 8080 / ($c) in $((P8080_T1-P8080_T0))s"; break; fi + echo "[..] waiting 8080/ ($tries/$max), code=$c"; sleep 3; ((tries++)) +done +(( tries <= max )) || { echo "[ERR] 8080/ not ready" >&2; exit 1; } + +# 8083 Kibana 允许 200/302(上面已就绪,端口侧再快速确认) +tries=1; max=40; P8083_T0=$(date +%s) +while (( tries <= max )); do + c=$(code_for "http://localhost:${WEB_PROXY_PORT_8083:-8083}/") + if [[ "$c" == "200" || "$c" == "302" ]]; then P8083_T1=$(date +%s); echo "[OK] 8083 / ($c) in $((P8083_T1-P8083_T0))s"; break; fi + echo "[..] 
waiting 8083/ ($tries/$max), code=$c"; sleep 3; ((tries++)) +done +(( tries <= max )) || { echo "[ERR] 8083/ not ready" >&2; exit 1; } + +# 8084 Alertmanager + CORS +P8084_T0=$(date +%s); wait_http "http://localhost:${WEB_PROXY_PORT_8084:-8084}/api/v2/status" 60; P8084_T1=$(date +%s) +cors=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8084:-8084}/api/v2/status" || true) +if [[ -z "$cors" ]]; then echo "[ERR] 8084 CORS missing" >&2; exit 1; else echo "[OK] 8084 CORS: $cors in $((P8084_T1-P8084_T0))s"; fi + +# 8085 Master /readyz + CORS(API 走 8085 才需跨域) +P8085_T0=$(date +%s); wait_http "http://localhost:${WEB_PROXY_PORT_8085:-8085}/readyz" 60; P8085_T1=$(date +%s) +cors=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8085:-8085}/api/v1/master/nodes" || true) +if [[ -z "$cors" ]]; then echo "[ERR] 8085 CORS missing" >&2; exit 1; else echo "[OK] 8085 CORS: $cors in $((P8085_T1-P8085_T0))s"; fi + echo "[OK] All services are ready" diff --git a/src/sys/tests/scripts/04_verify_dns_routing.sh b/src/sys/tests/scripts/04_verify_dns_routing.sh index 635c4fe..1895131 100755 --- a/src/sys/tests/scripts/04_verify_dns_routing.sh +++ b/src/sys/tests/scripts/04_verify_dns_routing.sh @@ -4,20 +4,15 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" -compose() { - if docker compose version >/dev/null 2>&1; then - docker compose "$@" - else - docker-compose "$@" - fi -} - -service_id() { - compose -p argus-sys ps -q "$1" +# 直接根据 container_name 获取容器ID,避免 compose project 名称不一致导致查找失败 +cid_by_name() { + docker ps -aqf "name=^$1$" } echo "[INFO] Verifying DNS routing via bind..." +pushd "$TEST_ROOT" >/dev/null + # Check master IP file exists in shared private MASTER_FILE="$TEST_ROOT/private/argus/etc/master.argus.com" if [[ ! 
-f "$MASTER_FILE" ]]; then @@ -28,7 +23,7 @@ MASTER_IP_HOST="$(cat "$MASTER_FILE" | tr -d '\r\n' || true)" echo "[INFO] master.argus.com file content: ${MASTER_IP_HOST}" # dig inside bind container -BIN_ID="$(service_id bind)" +BIN_ID="$(cid_by_name argus-bind-sys)" if [[ -n "$BIN_ID" ]]; then DIG_IP="$(docker exec "$BIN_ID" dig +short master.argus.com A | tail -n1 || true)" echo "[INFO] dig(master.argus.com) from bind container -> $DIG_IP" @@ -39,8 +34,27 @@ else echo "[WARN] bind container not found; skip dig" fi -for node in node-a node-b; do - CID="$(service_id "$node")" +check_inside() { + local cname="$1"; shift + local domains=("$@") + CID="$(cid_by_name "$cname")" + if [[ -z "$CID" ]]; then + echo "[WARN] container $cname not found; skip" + return 0 + fi + for d in "${domains[@]}"; do + echo "[INFO] Checking resolution inside $cname for $d..." + if ! docker exec "$CID" getent hosts "$d" >/dev/null 2>&1; then + echo "[ERR] $cname cannot resolve $d" >&2 + return 1 + fi + RES="$(docker exec "$CID" getent hosts "$d" | awk '{print $1}' | head -n1)" + echo "[OK] $cname resolved $d -> $RES" + done +} + +for node in argus-node-a argus-node-b; do + CID="$(cid_by_name "$node")" echo "[INFO] Checking resolution inside $node..." if ! 
docker exec "$CID" getent hosts master.argus.com >/dev/null 2>&1; then echo "[ERR] $node cannot resolve master.argus.com" >&2 @@ -50,5 +64,10 @@ for node in node-a node-b; do echo "[OK] $node resolved master.argus.com -> $RES" done -echo "[OK] DNS routing verified" +popd >/dev/null +# 追加:在 metric 节点中验证 master 与 prom 域名解析 +check_inside argus-metric-test-node master.argus.com prom.metric.argus.com || exit 1 +check_inside argus-metric-test-gpu-node master.argus.com prom.metric.argus.com || exit 1 + +echo "[OK] DNS routing verified" diff --git a/src/sys/tests/scripts/05_agent_register.sh b/src/sys/tests/scripts/05_agent_register.sh index 073d949..40079d5 100755 --- a/src/sys/tests/scripts/05_agent_register.sh +++ b/src/sys/tests/scripts/05_agent_register.sh @@ -5,7 +5,12 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" TMP_DIR="$TEST_ROOT/tmp" -API_BASE="http://localhost:32300/api/v1/master" +# 载入端口变量 +if [[ -f "$TEST_ROOT/.env" ]]; then + set -a; source "$TEST_ROOT/.env"; set +a +fi + +API_BASE="http://localhost:${MASTER_PORT:-32300}/api/v1/master" HOST_A="dev-yyrshare-nbnyx10-cp2f-pod-0" HOST_B="dev-yyrshare-uuuu10-ep2f-pod-0" @@ -49,8 +54,35 @@ for _ in {1..60}; do fi done +# 若仍未全部注册,尝试重启 node-b 并再等待一轮(兼容 DNS/启动时序抖动) if [[ ! -s "$TMP_DIR/node_id_a" || ! -s "$TMP_DIR/node_id_b" ]]; then - echo "[ERR] Agents did not register in time" >&2 + echo "[WARN] node-a or node-b not registered in first window; restarting node-b and retrying..." >&2 + # 仅重启 node-b,避免影响 es/kibana/master + if docker ps --format '{{.Names}}' | grep -q '^argus-node-b$'; then + docker restart argus-node-b >/dev/null 2>&1 || true + fi + # 再等待一轮(最多 120 秒) + > "$TMP_DIR/node_id_b" + for _ in {1..60}; do + sleep 2 + resp=$(curl -fsS "$API_BASE/nodes" 2>/dev/null || true) + [[ -z "$resp" ]] && continue + if ! 
echo "$resp" | head -c1 | grep -q '\['; then + continue + fi + echo "$resp" > "$TMP_DIR/nodes_list.json" + ID_A=$(extract_node "$HOST_A" "$TMP_DIR/node_id_a" "$TMP_DIR/nodes_list.json" 2>/dev/null || true) + ID_B=$(extract_node "$HOST_B" "$TMP_DIR/node_id_b" "$TMP_DIR/nodes_list.json" 2>/dev/null || true) + if [[ -s "$TMP_DIR/node_id_a" && -s "$TMP_DIR/node_id_b" ]]; then + break + fi + done +fi + +if [[ ! -s "$TMP_DIR/node_id_a" || ! -s "$TMP_DIR/node_id_b" ]]; then + echo "[ERR] Agents did not register in time (after retry)" >&2 + echo "[HINT] Current /nodes response:" >&2 + sed -n '1,200p' "$TMP_DIR/nodes_list.json" >&2 || true exit 1 fi diff --git a/src/sys/tests/scripts/06_write_health_and_assert.sh b/src/sys/tests/scripts/06_write_health_and_assert.sh index 6f888e6..dd9d538 100755 --- a/src/sys/tests/scripts/06_write_health_and_assert.sh +++ b/src/sys/tests/scripts/06_write_health_and_assert.sh @@ -5,7 +5,12 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" TMP_DIR="$TEST_ROOT/tmp" -API_BASE="http://localhost:32300/api/v1/master" +# 载入端口变量 +if [[ -f "$TEST_ROOT/.env" ]]; then + set -a; source "$TEST_ROOT/.env"; set +a +fi + +API_BASE="http://localhost:${MASTER_PORT:-32300}/api/v1/master" HOST_A="dev-yyrshare-nbnyx10-cp2f-pod-0" HOST_B="dev-yyrshare-uuuu10-ep2f-pod-0" diff --git a/src/sys/tests/scripts/07_logs_send_and_assert.sh b/src/sys/tests/scripts/07_logs_send_and_assert.sh index 0363ebf..7c58319 100755 --- a/src/sys/tests/scripts/07_logs_send_and_assert.sh +++ b/src/sys/tests/scripts/07_logs_send_and_assert.sh @@ -3,9 +3,25 @@ set -euo pipefail echo "[INFO] Sending logs via node-a/node-b and asserting ES counts..." +# 载入端口变量 +TEST_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")"/.. 
&& pwd)" +if [[ -f "$TEST_ROOT/.env" ]]; then + set -a; source "$TEST_ROOT/.env"; set +a +fi + +# Robust count helper: tolerates 404/503 and non-JSON responses, returns integer >=0 get_count() { - local idx="$1" - curl -s "http://localhost:9200/${idx}/_count?ignore_unavailable=true&allow_no_indices=true" | sed -E 's/.*"count":([0-9]+).*/\1/' | awk 'NF{print $0;exit} END{if(NR==0)print 0}' + local idx="$1"; local tmp; tmp=$(mktemp) + local code + code=$(curl -s -o "$tmp" -w "%{http_code}" "http://localhost:${ES_HTTP_PORT:-9200}/${idx}/_count?ignore_unavailable=true&allow_no_indices=true" || true) + if [[ "$code" == "200" ]]; then + local val + val=$(jq -r '(.count // 0) | tonumber? // 0' "$tmp" 2>/dev/null || echo 0) + echo "$val" + else + echo 0 + fi + rm -f "$tmp" } train0=$(get_count "train-*") @@ -32,11 +48,26 @@ send_logs "$node_a" "host01" send_logs "$node_b" "host02" echo "[INFO] Waiting for ES to ingest..." -sleep 10 +# Proactively refresh indices (ignore errors if not created yet) +curl -s -X POST "http://localhost:${ES_HTTP_PORT:-9200}/train-*/_refresh" >/dev/null 2>&1 || true +curl -s -X POST "http://localhost:${ES_HTTP_PORT:-9200}/infer-*/_refresh" >/dev/null 2>&1 || true -train1=$(get_count "train-*") -infer1=$(get_count "infer-*") -final=$((train1 + infer1)) +# Retry up to 120s for counts to increase and reach threshold (>=4) +final=0 +threshold=4 +for attempt in {1..60}; do + train1=$(get_count "train-*") + infer1=$(get_count "infer-*") + final=$((train1 + infer1)) + if (( final > base && final >= threshold )); then + break + fi + echo "[..] 
waiting ES counts increase to >=${threshold} ($attempt/60) current=${final} base=${base}" + # refresh indices again to speed up visibility + curl -s -X POST "http://localhost:${ES_HTTP_PORT:-9200}/train-*/_refresh" >/dev/null 2>&1 || true + curl -s -X POST "http://localhost:${ES_HTTP_PORT:-9200}/infer-*/_refresh" >/dev/null 2>&1 || true + sleep 2 +done echo "[INFO] final counts: train=${train1} infer=${infer1} total=${final}" if (( final <= base )); then @@ -44,19 +75,20 @@ if (( final <= base )); then exit 1 fi +# Minimal threshold to be tolerant: expect at least 4 documents (2 train + 1 infer per node) if (( final < 4 )); then echo "[ERR] ES total below expected threshold: ${final} < 4" >&2 exit 1 fi # Health endpoints -es_health=$(curl -s "http://localhost:9200/_cluster/health" | grep -o '"status":"[^"]*"' | cut -d'"' -f4) +es_health=$(curl -s "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health" | grep -o '"status":"[^"]*"' | cut -d'"' -f4) if [[ "$es_health" != "green" && "$es_health" != "yellow" ]]; then echo "[ERR] ES health not green/yellow: $es_health" >&2 exit 1 fi -if ! curl -fs "http://localhost:5601/api/status" >/dev/null 2>&1; then +if ! curl -fs "http://localhost:${KIBANA_PORT:-5601}/api/status" >/dev/null 2>&1; then echo "[WARN] Kibana status endpoint not available" fi diff --git a/src/sys/tests/scripts/08_restart_agent_reregister.sh b/src/sys/tests/scripts/08_restart_agent_reregister.sh index d9bf43a..b91031f 100755 --- a/src/sys/tests/scripts/08_restart_agent_reregister.sh +++ b/src/sys/tests/scripts/08_restart_agent_reregister.sh @@ -6,7 +6,12 @@ TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" TMP_DIR="$TEST_ROOT/tmp" REPO_ROOT="$(cd "$TEST_ROOT/../../.." 
&& pwd)" -API_BASE="http://localhost:32300/api/v1/master" +# 载入端口变量 +if [[ -f "$TEST_ROOT/.env" ]]; then + set -a; source "$TEST_ROOT/.env"; set +a +fi + +API_BASE="http://localhost:${MASTER_PORT:-32300}/api/v1/master" if [[ -f "$TEST_ROOT/.env" ]]; then set -a @@ -49,7 +54,7 @@ compose() { fi } -echo "[INFO] Recreating node-b with static IP 172.29.0.200..." +echo "[INFO] Recreating node-b with static IP 172.31.0.200..." pushd "$TEST_ROOT" >/dev/null compose -p argus-sys rm -sf node-b || true popd >/dev/null @@ -58,19 +63,34 @@ docker rm -f argus-node-b >/dev/null 2>&1 || true AGENT_BIN_PATH="$(cat "$TMP_DIR/agent_binary_path")" +# 选择 compose 管理的网络名(默认 argus-sys_sysnet)。 +detect_sysnet() { + if docker network inspect argus-sys_sysnet >/dev/null 2>&1; then + echo argus-sys_sysnet; return + fi + # 回退:从 master 容器推断所连网络(取第一个) + local n + n=$(docker inspect -f '{{range $k, $_ := .NetworkSettings.Networks}}{{println $k}}{{end}}' argus-master-sys 2>/dev/null | head -n1 || true) + if [[ -n "$n" ]]; then echo "$n"; return; fi + # 最后兜底:尝试项目默认网络(不保证有 IPAM) + echo argus-sys_default +} +SYSNET_NAME=$(detect_sysnet) +echo "[INFO] Using docker network: $SYSNET_NAME" + docker run -d \ --name argus-node-b \ --hostname dev-yyrshare-uuuu10-ep2f-pod-0 \ - --network argus-sys-net \ - --ip 172.29.0.200 \ - --dns 172.29.0.2 \ + --network "$SYSNET_NAME" \ + --ip 172.31.0.200 \ + --dns 172.31.0.2 \ -e MASTER_ENDPOINT=http://master.argus.com:3000 \ -e REPORT_INTERVAL_SECONDS=2 \ -e ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} \ -e ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} \ -e ES_HOST=es \ -e ES_PORT=9200 \ - -p 2021:2020 \ + -p ${NODE_B_PORT:-2021}:2020 \ -v "$TEST_ROOT/private-nodeb/argus/agent/dev-yyrshare-uuuu10-ep2f-pod-0:/private/argus/agent/dev-yyrshare-uuuu10-ep2f-pod-0" \ -v "$AGENT_BIN_PATH:/usr/local/bin/argus-agent:ro" \ -v "$SCRIPT_DIR/node_entrypoint.sh:/usr/local/bin/node-entrypoint.sh:ro" \ @@ -90,15 +110,15 @@ node=json.load(open(sys.argv[1])) last0=sys.argv[2] 
ip=node.get("meta_data",{}).get("ip") lu=node.get("last_updated") -assert ip=="172.29.0.200" +assert ip=="172.31.0.200" assert lu and lu!=last0 PY then - echo "[OK] node-b re-registered with new IP 172.29.0.200" + echo "[OK] node-b re-registered with new IP 172.31.0.200" exit 0 fi fi done -echo "[ERR] node-b did not update to IP 172.29.0.200 in time" >&2 +echo "[ERR] node-b did not update to IP 172.31.0.200 in time" >&2 exit 1 diff --git a/src/sys/tests/scripts/09_down.sh b/src/sys/tests/scripts/09_down.sh index d200540..ceb297d 100755 --- a/src/sys/tests/scripts/09_down.sh +++ b/src/sys/tests/scripts/09_down.sh @@ -12,12 +12,33 @@ compose() { fi } -docker rm -f argus-node-b >/dev/null 2>&1 || true - pushd "$TEST_ROOT" >/dev/null compose -p argus-sys down --remove-orphans || true +compose down --remove-orphans || true popd >/dev/null +echo "[INFO] Force removing containers by name (if any)..." +containers=( + argus-node-a + argus-node-b + argus-metric-test-node + argus-grafana + argus-kibana-sys + argus-master-sys + argus-bind-sys + argus-ftp + argus-es-sys + argus-prometheus +) +for c in "${containers[@]}"; do + id=$(docker ps -aqf "name=^${c}$" || true) + if [[ -n "$id" ]]; then + docker rm -f "$id" >/dev/null 2>&1 || true + fi +done + +echo "[INFO] Removing compose networks (handled by compose down)" + echo "[INFO] Cleaning private directories..." if [[ -d "$TEST_ROOT/private" ]]; then docker run --rm -v "$TEST_ROOT/private:/target" ubuntu:24.04 chown -R "$(id -u):$(id -g)" /target >/dev/null 2>&1 || true diff --git a/src/sys/tests/scripts/10_metric_publish.sh b/src/sys/tests/scripts/10_metric_publish.sh new file mode 100755 index 0000000..1768720 --- /dev/null +++ b/src/sys/tests/scripts/10_metric_publish.sh @@ -0,0 +1,89 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +REPO_ROOT="$(cd "$TEST_ROOT/../../.." 
&& pwd)" + +PLUGIN_DIR="$REPO_ROOT/src/metric/client-plugins/all-in-one-full" +FTP_CONTAINER="argus-ftp" + +if [[ ! -d "$PLUGIN_DIR" ]]; then + echo "[SYS-METRIC] Metric client plugin directory not found: $PLUGIN_DIR" >&2 + exit 1 +fi + +if [[ -f "$TEST_ROOT/.env" ]]; then + # shellcheck source=/dev/null + source "$TEST_ROOT/.env" +fi + +OWNER="${ARGUS_BUILD_UID:-2133}:${ARGUS_BUILD_GID:-2015}" + +resolve_output_dir() { + local host_mount + if docker ps --format '{{.Names}}' | grep -q "^${FTP_CONTAINER}$"; then + host_mount=$(docker inspect "$FTP_CONTAINER" --format '{{range .Mounts}}{{if eq .Destination "/private/argus/ftp"}}{{.Source}}{{end}}{{end}}' 2>/dev/null || true) + if [[ -n "$host_mount" ]]; then + echo "$host_mount/share" + return 0 + fi + fi + echo "$TEST_ROOT/private/argus/metric/ftp/share" +} + +OUTPUT_DIR="$(resolve_output_dir)" +mkdir -p "$OUTPUT_DIR" + +if [[ ! -w "$OUTPUT_DIR" ]]; then + echo "[SYS-METRIC] 无法写入 FTP 输出目录: $OUTPUT_DIR" >&2 + echo " 请确认目录权限与 ARGUS_BUILD_UID/GID 一致" >&2 + exit 1 +fi + +pushd "$PLUGIN_DIR" >/dev/null + +# --- Inject agent binary built in 01_bootstrap (if present) --- +AGENT_PATH_FILE="$TEST_ROOT/tmp/agent_binary_path" +AGENT_BIN_CANDIDATE="$REPO_ROOT/src/agent/dist/argus-agent" +if [[ -f "$AGENT_PATH_FILE" ]]; then + AGENT_BIN="$(tr -d '\n' < "$AGENT_PATH_FILE")" +else + AGENT_BIN="$AGENT_BIN_CANDIDATE" +fi + +if [[ -x "$AGENT_BIN" ]]; then + echo "[SYS-METRIC] 使用 01 阶段构建的 agent: $AGENT_BIN" + TARGET_BIN="plugins/argus-agent/bin/argus-agent" + if [[ -f "$TARGET_BIN" ]]; then + cp -f "$AGENT_BIN" "$TARGET_BIN" + else + mkdir -p "$(dirname "$TARGET_BIN")" + cp "$AGENT_BIN" "$TARGET_BIN" + fi + chmod +x "$TARGET_BIN" +else + echo "[SYS-METRIC] 未找到可执行的 agent 二进制(预期: $AGENT_BIN),继续使用插件目录内置版本" +fi + +echo "[SYS-METRIC] Bumping metric artifact version..." +bash scripts/version-manager.sh bump minor + +VERSION_FILE="config/VERSION" +if [[ ! 
-f "$VERSION_FILE" ]]; then + echo "[SYS-METRIC] VERSION 文件缺失: $VERSION_FILE" >&2 + exit 1 +fi + +VERSION=$(tr -d '\n' < "$VERSION_FILE") +echo "[SYS-METRIC] 当前版本: $VERSION" + +echo "[SYS-METRIC] Packaging metric artifact..." +bash scripts/package_artifact.sh --force + +echo "[SYS-METRIC] Publishing artifact to FTP share..." +bash scripts/publish_artifact.sh "$VERSION" --output-dir "$OUTPUT_DIR" --owner "$OWNER" + +popd >/dev/null + +echo "[SYS-METRIC] Metric artifact published to $OUTPUT_DIR" diff --git a/src/sys/tests/scripts/11_metric_node_install.sh b/src/sys/tests/scripts/11_metric_node_install.sh new file mode 100755 index 0000000..63ff81b --- /dev/null +++ b/src/sys/tests/scripts/11_metric_node_install.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +if [[ -f "$TEST_ROOT/.env" ]]; then + # shellcheck source=/dev/null + source "$TEST_ROOT/.env" +fi + +CONTAINER="argus-metric-test-node" + +if ! docker ps --format '{{.Names}}' | grep -q "^${CONTAINER}$"; then + echo "[SYS-METRIC] 容器 ${CONTAINER} 未运行,无法执行安装" >&2 + exit 1 +fi + +FTP_HOST="${FTP_SERVER:-172.31.0.40}" +FTP_USER="${FTP_USER:-ftpuser}" +FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}" +FTP_PORT="${FTP_PORT:-21}" + +echo "[SYS-METRIC] 在 ${CONTAINER} 内执行安装 (FTP: ${FTP_HOST}:${FTP_PORT})" + +docker exec \ + -e FTP_HOST="$FTP_HOST" \ + -e FTP_USER="$FTP_USER" \ + -e FTP_PASSWORD="$FTP_PASSWORD" \ + -e FTP_PORT="$FTP_PORT" \ + "$CONTAINER" bash -c ' +set -e + +if ! command -v curl &>/dev/null; then + echo "[SYS-METRIC] curl 未安装,开始安装依赖..." + apt-get update >/dev/null && apt-get install -y curl >/dev/null +fi + +cd /tmp +echo "[SYS-METRIC] 下载 setup.sh..." +curl -u "${FTP_USER}:${FTP_PASSWORD}" "ftp://${FTP_HOST}:${FTP_PORT}/setup.sh" -o setup.sh + +echo "[SYS-METRIC] 执行安装..." 
+chmod +x setup.sh +bash setup.sh --server "${FTP_HOST}" --user "${FTP_USER}" --password "${FTP_PASSWORD}" --port "${FTP_PORT}" + +echo "[SYS-METRIC] 安装完成" +' + +echo "[SYS-METRIC] Metric test node 安装流程完成" diff --git a/src/sys/tests/scripts/12_metric_gpu_install.sh b/src/sys/tests/scripts/12_metric_gpu_install.sh new file mode 100755 index 0000000..c92bf4f --- /dev/null +++ b/src/sys/tests/scripts/12_metric_gpu_install.sh @@ -0,0 +1,82 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +ENABLE_GPU=${ARGUS_SYS_ENABLE_GPU:-false} + +if [[ "$ENABLE_GPU" != "true" ]]; then + echo "[SYS-METRIC] 未启用 GPU 流程,跳过 GPU 节点安装" + exit 0 +fi + +if [[ -f "$TEST_ROOT/.env" ]]; then + # shellcheck source=/dev/null + source "$TEST_ROOT/.env" +fi + +CONTAINER="argus-metric-test-gpu-node" + +if ! docker ps --format '{{.Names}}' | grep -q "^${CONTAINER}$"; then + echo "[SYS-METRIC] 预期启动的 ${CONTAINER} 未运行" >&2 + exit 1 +fi + +FTP_HOST="${FTP_SERVER:-172.31.0.40}" +FTP_USER="${FTP_USER:-ftpuser}" +FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}" +FTP_PORT="${FTP_PORT:-21}" + +echo "[SYS-METRIC] 在 GPU 节点执行安装 (FTP: ${FTP_HOST}:${FTP_PORT})" + +docker exec \ + -e FTP_HOST="$FTP_HOST" \ + -e FTP_USER="$FTP_USER" \ + -e FTP_PASSWORD="$FTP_PASSWORD" \ + -e FTP_PORT="$FTP_PORT" \ + "$CONTAINER" bash -c ' +set -e + +if ! command -v nvidia-smi &>/dev/null; then + echo "[SYS-METRIC] GPU 节点缺少 nvidia-smi" >&2 + exit 1 +fi + +nvidia-smi >/dev/null || true + +if ! command -v curl &>/dev/null; then + echo "[SYS-METRIC] curl 未安装,开始安装依赖..." + apt-get update >/dev/null && apt-get install -y curl >/dev/null +fi + +cd /tmp +echo "[SYS-METRIC] 下载 setup.sh..." +curl -u "${FTP_USER}:${FTP_PASSWORD}" "ftp://${FTP_HOST}:${FTP_PORT}/setup.sh" -o setup.sh + +echo "[SYS-METRIC] 执行安装..." 
+chmod +x setup.sh +bash setup.sh --server "${FTP_HOST}" --user "${FTP_USER}" --password "${FTP_PASSWORD}" --port "${FTP_PORT}" + +echo "[SYS-METRIC] GPU 节点安装完成" +' + +echo "[SYS-METRIC] Metric GPU 节点安装流程完成" + +# 就绪性检测:9400(dcgm) 与 9100(node) 端口 +echo "[SYS-METRIC] 等待 dcgm-exporter(9400) 与 node-exporter(9100) 就绪..." +retries=30 +until docker exec "$CONTAINER" bash -lc "curl -fsS --max-time 2 http://localhost:9400/metrics >/dev/null"; do + ((retries--)) || { echo "[ERR] dcgm-exporter 9400 未就绪" >&2; exit 1; } + sleep 2 +done +echo "[OK] dcgm-exporter 端点可访问" + +retries=30 +until docker exec "$CONTAINER" bash -lc "curl -fsS --max-time 2 http://localhost:9100/metrics >/dev/null"; do + ((retries--)) || { echo "[ERR] node-exporter 9100 未就绪" >&2; exit 1; } + sleep 2 +done +echo "[OK] node-exporter 端点可访问" + +mkdir -p "$TEST_ROOT/tmp" && touch "$TEST_ROOT/tmp/gpu_install_ready" diff --git a/src/sys/tests/scripts/13_metric_verify.sh b/src/sys/tests/scripts/13_metric_verify.sh new file mode 100755 index 0000000..f60b1b5 --- /dev/null +++ b/src/sys/tests/scripts/13_metric_verify.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +echo "[SYS-METRIC] Verify: master" +"$SCRIPT_DIR/13_metric_verify_master.sh" +echo + +echo "[SYS-METRIC] Verify: prometheus" +PROM_RETRIES=${PROM_VERIFY_RETRIES:-2} +PROM_BACKOFF=${PROM_VERIFY_BACKOFF_SECONDS:-30} +attempt=0 +while true; do + if "$SCRIPT_DIR/13_metric_verify_prometheus.sh"; then + break + fi + attempt=$((attempt+1)) + if (( attempt > PROM_RETRIES )); then + echo "[ERR] prometheus verify failed after $PROM_RETRIES retries" >&2 + exit 1 + fi + echo "[WARN] prometheus verify failed; retry $attempt/$PROM_RETRIES after ${PROM_BACKOFF}s" + sleep "$PROM_BACKOFF" +done +echo + +echo "[SYS-METRIC] Verify: dataplane" +"$SCRIPT_DIR/13_metric_verify_dataplane.sh" +echo + +echo "[SYS-METRIC] Verify: grafana" +"$SCRIPT_DIR/13_metric_verify_grafana.sh" +echo + +echo 
"[SYS-METRIC] Verify: grafana panels" +"$SCRIPT_DIR/13_metric_verify_grafana_panels.sh" +echo + +echo "[SYS-METRIC] Metric verification completed" diff --git a/src/sys/tests/scripts/13_metric_verify_dataplane.sh b/src/sys/tests/scripts/13_metric_verify_dataplane.sh new file mode 100755 index 0000000..12342ec --- /dev/null +++ b/src/sys/tests/scripts/13_metric_verify_dataplane.sh @@ -0,0 +1,66 @@ +#!/usr/bin/env bash +set -euo pipefail + +TMP_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")"/.. && pwd)/tmp/metric-verify" +mkdir -p "$TMP_DIR" + +# 载入端口变量(由 .env 提供) +TEST_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")"/.. && pwd)" +if [[ -f "$TEST_ROOT/.env" ]]; then + set -a; source "$TEST_ROOT/.env"; set +a +fi + +PROM_BASE="http://localhost:${PROMETHEUS_PORT:-9090}/api/v1" +INSTANCE="${METRIC_TEST_INSTANCE:-172.31.0.50:9100}" +IP_ONLY="${INSTANCE%%:*}" + +echo "[VERIFY:DATA] node exporter metrics present in container" +docker exec argus-metric-test-node bash -lc "curl -fsS --max-time 5 http://localhost:9100/metrics | head -n 5" > "$TMP_DIR/node_metrics_head.txt" || { echo "[ERR] cannot fetch node exporter metrics" >&2; exit 1; } +if ! grep -E "node_(exporter_build_info|time_seconds)" -q "$TMP_DIR/node_metrics_head.txt"; then + echo "[WARN] head did not show expected lines; continuing (exporter may output later lines)" +fi +echo "[OK] node exporter endpoint reachable" + +echo "[VERIFY:DATA] Prometheus has recent sample for build_info" +curl -fsS --max-time 5 --get "$PROM_BASE/query" --data-urlencode "query=node_exporter_build_info{job=\"node\",ip=\"$IP_ONLY\"}" > "$TMP_DIR/prom_ne_build_info_1.json" + +python3 - "$TMP_DIR/prom_ne_build_info_1.json" <<'PY' +import json,sys,time +j=json.load(open(sys.argv[1])) +res=j.get('data',{}).get('result',[]) +assert res, 'no result for node_exporter_build_info' +ts=float(res[0]['value'][0]) +now=time.time() +assert now-ts<180, f"sample too old: now={now} ts={ts}" +print(int(ts)) +PY +T1=$? 
+sleep 30 +curl -fsS --max-time 5 --get "$PROM_BASE/query" --data-urlencode "query=node_exporter_build_info{job=\"node\",ip=\"$IP_ONLY\"}" > "$TMP_DIR/prom_ne_build_info_2.json" + +TS1=$(python3 - "$TMP_DIR/prom_ne_build_info_1.json" <<'PY' +import json,sys +print(float(json.load(open(sys.argv[1]))['data']['result'][0]['value'][0])) +PY +) +TS2=$(python3 - "$TMP_DIR/prom_ne_build_info_2.json" <<'PY' +import json,sys +print(float(json.load(open(sys.argv[1]))['data']['result'][0]['value'][0])) +PY +) +awk -v a="$TS1" -v b="$TS2" 'BEGIN{ if (b>=a) exit 0; else exit 1 }' || { echo "[ERR] sample timestamp did not advance" >&2; exit 1; } +echo "[OK] sample timestamp advanced" +echo "[DONE] dataplane verify" + +# 追加:GPU 节点端点连通性检查(启用 GPU 时) +if [[ "${ENABLE_GPU:-false}" == "true" ]]; then + echo + echo "[VERIFY:DATA][GPU] curl endpoints on gpu node" + if ! docker exec argus-metric-test-gpu-node bash -lc 'curl -fsS --max-time 5 http://localhost:9100/metrics >/dev/null'; then + echo "[ERR] gpu node 9100 not reachable" >&2; exit 1 + fi + if ! docker exec argus-metric-test-gpu-node bash -lc 'curl -fsS --max-time 5 http://localhost:9400/metrics >/dev/null'; then + echo "[ERR] gpu node 9400 not reachable" >&2; exit 1 + fi + echo "[OK] gpu node endpoints reachable" +fi diff --git a/src/sys/tests/scripts/13_metric_verify_grafana.sh b/src/sys/tests/scripts/13_metric_verify_grafana.sh new file mode 100755 index 0000000..c639019 --- /dev/null +++ b/src/sys/tests/scripts/13_metric_verify_grafana.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash +set -euo pipefail + +TEST_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")"/.. && pwd)" +if [[ -f "$TEST_ROOT/.env" ]]; then + set -a; source "$TEST_ROOT/.env"; set +a +fi + +PROM_DOMAIN="prom.metric.argus.com:${PROMETHEUS_PORT:-9090}" +GRAF="http://localhost:${GRAFANA_PORT:-3000}" + +echo "[VERIFY:GRAFANA] /api/health" +TMP_FILE="$(cd "$(dirname "$0")"/.. 
&& pwd)/tmp/metric-verify/graf_health.json" +mkdir -p "$(dirname "$TMP_FILE")" +curl -fsS --max-time 10 "$GRAF/api/health" -o "$TMP_FILE" || { echo "[ERR] failed to GET /api/health" >&2; exit 1; } +python3 - "$TMP_FILE" <<'PY' +import sys,json +with open(sys.argv[1],'r',encoding='utf-8') as f: + j=json.load(f) +assert j.get('database')=='ok', f"health not ok: {j}" +print('OK') +PY + +echo "[VERIFY:GRAFANA] datasource URL uses domain: $PROM_DOMAIN" +DS_FILE="/private/argus/metric/grafana/provisioning/datasources/datasources.yml" +if ! docker exec argus-grafana sh -lc "test -f $DS_FILE"; then + DS_FILE="/etc/grafana/provisioning/datasources/datasources.yml" +fi +docker exec argus-grafana sh -lc "grep -E 'url:\s*http://$PROM_DOMAIN' '$DS_FILE'" >/dev/null 2>&1 || { echo "[ERR] datasource not pointing to $PROM_DOMAIN" >&2; exit 1; } +echo "[OK] datasource points to domain" + +echo "[VERIFY:GRAFANA] bind resolution inside grafana" +tries=0 +until docker exec argus-grafana getent hosts prom.metric.argus.com >/dev/null 2>&1; do + tries=$((tries+1)) + if (( tries > 24 )); then + echo "[ERR] grafana cannot resolve prom.metric.argus.com" >&2 + exit 1 + fi + echo "[..] waiting DNS propagation in grafana ($tries/24)"; sleep 5 +done +echo "[OK] domain resolves" + +echo "[DONE] grafana verify" diff --git a/src/sys/tests/scripts/13_metric_verify_grafana_panels.sh b/src/sys/tests/scripts/13_metric_verify_grafana_panels.sh new file mode 100755 index 0000000..0b5b242 --- /dev/null +++ b/src/sys/tests/scripts/13_metric_verify_grafana_panels.sh @@ -0,0 +1,87 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." 
&& pwd)" +TMP_DIR="$TEST_ROOT/tmp/metric-verify" +mkdir -p "$TMP_DIR" + +# 载入端口变量 +if [[ -f "$TEST_ROOT/.env" ]]; then + set -a; source "$TEST_ROOT/.env"; set +a +fi + +GRAF="http://localhost:${GRAFANA_PORT:-3000}" +HOSTNAME="${METRIC_TEST_HOSTNAME:-test-metric-node-001}" + +echo "[VERIFY:GRAF-PANELS] resolve Prometheus datasource UID via Grafana" +DS_JSON="$TMP_DIR/graf_ds.json" +curl -fsS --max-time 10 "$GRAF/api/datasources" >"$DS_JSON" +DS_UID=$(python3 - "$DS_JSON" <<'PY' +import json,sys +arr=json.load(open(sys.argv[1])) +for ds in arr: + if (ds.get('type')=='prometheus'): + print(ds.get('uid','')) + break +PY +) +if [[ -z "$DS_UID" ]]; then echo "[ERR] no prometheus datasource found in grafana" >&2; exit 1; fi +echo "[OK] Prometheus DS UID=$DS_UID" + +proxy_query() { + local q="$1"; local out="$2" + curl -fsS --max-time 10 --get "$GRAF/api/datasources/proxy/uid/$DS_UID/api/v1/query" \ + --data-urlencode "query=$q" >"$out" +} + +assert_vector_recent_nonempty() { + local json="$1"; local max_age_sec="${2:-180}" + python3 - <<'PY' "$json" "$max_age_sec" +import json,sys,time +doc=json.load(open(sys.argv[1])) +if doc.get('status')!='success': + raise SystemExit('prom status != success') +res=doc.get('data',{}).get('result',[]) +assert res, 'empty result' +ts=float(res[0]['value'][0]) +assert time.time()-ts < float(sys.argv[2]), f'timestamp too old: {ts}' +print(int(ts)) +PY +} + +echo "[VERIFY:GRAF-PANELS] Dashboard: Node and GPU Metrics — System Load" +Q_NODE_LOAD="node_load1{hostname=\"$HOSTNAME\"}" +proxy_query "$Q_NODE_LOAD" "$TMP_DIR/graf_panel_node_load.json" +assert_vector_recent_nonempty "$TMP_DIR/graf_panel_node_load.json" 300 >/dev/null +echo "[OK] node_load1 has recent sample via Grafana proxy" + +echo "[VERIFY:GRAF-PANELS] Dashboard: Cluster Dashboard — Node online count" +Q_NODE_ONLINE='count(count by(hostname) (up{job="node"} == 1))' +proxy_query "$Q_NODE_ONLINE" "$TMP_DIR/graf_panel_node_online.json" +python3 - 
"$TMP_DIR/graf_panel_node_online.json" <<'PY' +import json,sys +doc=json.load(open(sys.argv[1])) +assert doc.get('status')=='success', 'prom status not success' +res=doc.get('data',{}).get('result',[]) +assert res, 'no series for node online count' +val=float(res[0]['value'][1]) +assert val>=1, f'node online < 1: {val}' +print('OK',val) +PY +echo "[OK] cluster node online count >= 1 via Grafana proxy" + +if [[ -f "$TEST_ROOT/.env" ]]; then + set -a; source "$TEST_ROOT/.env"; set +a +fi + +# 可选:GPU 面板查询(当启用 GPU 时) +if [[ "${ENABLE_GPU:-false}" == "true" ]]; then + echo "[VERIFY:GRAF-PANELS] GPU Panels — DCGM GPU UTIL" + Q_GPU_UTIL='DCGM_FI_DEV_GPU_UTIL' + proxy_query "$Q_GPU_UTIL" "$TMP_DIR/graf_panel_dcgm_util.json" + assert_vector_recent_nonempty "$TMP_DIR/graf_panel_dcgm_util.json" 300 >/dev/null || { echo "[ERR] dcgm gpu util no recent sample via Grafana proxy" >&2; exit 1; } + echo "[OK] dcgm gpu util has recent samples via Grafana proxy" +fi + +echo "[DONE] grafana panels verify" diff --git a/src/sys/tests/scripts/13_metric_verify_master.sh b/src/sys/tests/scripts/13_metric_verify_master.sh new file mode 100755 index 0000000..32b6ca1 --- /dev/null +++ b/src/sys/tests/scripts/13_metric_verify_master.sh @@ -0,0 +1,110 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." 
&& pwd)" +TMP_DIR="$TEST_ROOT/tmp/metric-verify" +mkdir -p "$TMP_DIR" + +# 载入端口变量 +if [[ -f "$TEST_ROOT/.env" ]]; then + set -a; source "$TEST_ROOT/.env"; set +a +fi + +MASTER_BASE="http://localhost:${MASTER_PORT:-32300}/api/v1/master" +HOSTNAME="${METRIC_TEST_HOSTNAME:-test-metric-node-001}" + +curl_json() { curl -fsS --max-time 5 "$1"; } + +echo "[VERIFY:MASTER] list nodes and locate target hostname=$HOSTNAME" +ALL_NODES_JSON="$TMP_DIR/master_nodes.json" + +# 重试等待节点出现在 /nodes 列表(最多 120s) +NODE_ID="" +for attempt in {1..24}; do + curl_json "$MASTER_BASE/nodes" > "$ALL_NODES_JSON" || true + NODE_ID=$(python3 - "$ALL_NODES_JSON" "$HOSTNAME" <<'PY' +import json,sys +try: + nodes=json.load(open(sys.argv[1])) +except Exception: + nodes=[] +name=sys.argv[2] +for n in nodes: + if n.get('name')==name: + print(n.get('id','')) + break +PY + ) + if [[ -n "$NODE_ID" ]]; then break; fi + echo "[..] waiting node to appear in /nodes ($attempt/24)"; sleep 5 +done + +if [[ -z "$NODE_ID" ]]; then + echo "[ERR] master /nodes 中未找到 $HOSTNAME(等待超时)" >&2 + echo "[HINT] 当前 /nodes 列表如下:" >&2 + sed -n '1,160p' "$ALL_NODES_JSON" >&2 || true + exit 1 +fi +echo "[OK] node id=$NODE_ID" + +echo "[VERIFY:MASTER] get node detail and assert fields" +DETAIL1_JSON="$TMP_DIR/master_node_${NODE_ID}_detail_1.json" +curl_json "$MASTER_BASE/nodes/$NODE_ID" > "$DETAIL1_JSON" + +# 基础字段与健康项检查(不强制立即 online) +python3 - "$DETAIL1_JSON" "$HOSTNAME" <<'PY' +import json,sys,datetime +j=json.load(open(sys.argv[1])) +host=sys.argv[2] +assert j.get('name')==host, f"name mismatch: {j.get('name')} != {host}" +status=j.get('status') +assert status in ('initialized','online','offline'), f"unexpected status: {status}" +md=j.get('meta_data',{}) +assert md.get('hostname',j.get('name'))==host, 'meta_data.hostname mismatch' +assert 'last_report' in j and j['last_report'], 'last_report missing' +h=j.get('health',{}) +for key in ('metric-node-exporter','metric-fluent-bit','metric-argus-agent'): + if key in h: + assert 
h[key].get('status')=='healthy', f"{key} not healthy: {h[key]}"
+print('OK')
+PY
+
+# 轮询等待 last_report 前进并最终转为 online(最多 90s),容忍短暂 5xx/网络错误
+attempt=0
+T_PRE=0
+until [[ $attempt -ge 18 ]]; do
+  sleep 5
+  DETAIL_CUR="$TMP_DIR/master_node_${NODE_ID}_detail_cur.json"
+  if ! curl_json "$MASTER_BASE/nodes/$NODE_ID" > "$DETAIL_CUR" 2>/dev/null; then
+    echo "[..] retrying node detail fetch ($attempt/18)"; attempt=$((attempt+1)); continue
+  fi
+  read -r STATUS_CUR T_CUR < <(python3 - "$DETAIL_CUR" <<'PY'
+import json,sys,datetime
+j=json.load(open(sys.argv[1]))
+st=j.get('status','')
+ts=j.get('last_report','')
+if ts.endswith('Z'): ts=ts.replace('Z','+00:00')
+try:
+  t=float(datetime.datetime.fromisoformat(ts).timestamp())
+except Exception:
+  t=0.0
+print(st or 'unknown', end=' ')
+print(t)
+PY
+  ) || true
+  if awk -v a="$T_PRE" -v b="$T_CUR" 'BEGIN{exit !(b>a)}'; then
+    T_PRE="$T_CUR"
+  fi
+  if [[ "$STATUS_CUR" == "online" ]]; then
+    echo "[OK] status online and last_report progressed"
+    break
+  fi
+  attempt=$((attempt+1))
+done
+if (( attempt >= 18 )) && [[ "${STATUS_CUR:-}" != "online" ]]; then
+  echo "[WARN] status did not reach online within timeout; continuing"
+fi
+
+echo "$NODE_ID" > "$TMP_DIR/node_id_metric"
+echo "[DONE] master verify"
diff --git a/src/sys/tests/scripts/13_metric_verify_prometheus.sh b/src/sys/tests/scripts/13_metric_verify_prometheus.sh
new file mode 100755
index 0000000..b5bd781
--- /dev/null
+++ b/src/sys/tests/scripts/13_metric_verify_prometheus.sh
@@ -0,0 +1,198 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+TEST_ROOT="$(cd "$SCRIPT_DIR/.." 
&& pwd)" +TMP_DIR="$TEST_ROOT/tmp/metric-verify" +mkdir -p "$TMP_DIR" + +# 载入端口变量 +if [[ -f "$TEST_ROOT/.env" ]]; then + set -a; source "$TEST_ROOT/.env"; set +a +fi + +PROM_BASE="http://localhost:${PROMETHEUS_PORT:-9090}/api/v1" +HOSTNAME="${METRIC_TEST_HOSTNAME:-${METRIC_TEST_HOSTNAME_CPU:-test-metric-node-001}}" + +nodes_json="$TEST_ROOT/private/argus/metric/prometheus/nodes.json" +targets_json="$TEST_ROOT/private/argus/metric/prometheus/targets/node_exporter.json" + +echo "[VERIFY:PROM] nodes.json present and contains hostname=$HOSTNAME" +[[ -f "$nodes_json" ]] || { echo "[ERR] $nodes_json missing" >&2; exit 1; } +python3 - "$nodes_json" "$HOSTNAME" <<'PY' +import json,sys +arr=json.load(open(sys.argv[1])) +host=sys.argv[2] +assert any((i.get('hostname')==host) for i in arr), f"{host} not found in nodes.json" +PY +echo "[OK] nodes.json contains target" + +echo "[VERIFY:PROM] file_sd targets exist for nodes.json entries" +[[ -f "$targets_json" ]] || { echo "[ERR] $targets_json missing" >&2; exit 1; } +python3 - "$nodes_json" "$targets_json" "$HOSTNAME" >"$TMP_DIR/prom_targets_ip_inst.txt" <<'PY' +import json,sys +nodes=json.load(open(sys.argv[1])) +file_sd=json.load(open(sys.argv[2])) +host=sys.argv[3] +targets=set() +for item in file_sd: + for t in item.get('targets',[]): targets.add(t) +# choose node matching hostname; fallback to first metric user node; otherwise first +sel = None +for n in nodes: + if n.get('hostname') == host: + sel = n + break +if not sel: + for n in nodes: + if n.get('user_id') == 'metric': + sel = n + break +if not sel and nodes: + sel = nodes[0] +if not sel: + raise SystemExit('nodes.json empty or no suitable node found') +ip = sel['ip'] +inst = f"{ip}:9100" +print(ip) +print(inst) +PY +IP_FIRST=$(sed -n '1p' "$TMP_DIR/prom_targets_ip_inst.txt") +INSTANCE=$(sed -n '2p' "$TMP_DIR/prom_targets_ip_inst.txt") +echo "[INFO] expecting instance in file_sd: $INSTANCE" + +# 尝试在 Prometheus 容器内主动刷新 targets(可选加速) +if docker ps --format '{{.Names}}' 
| grep -q '^argus-prometheus$'; then + echo "[..] triggering update_targets inside argus-prometheus" + docker exec argus-prometheus bash -lc \ + 'python3 /usr/local/bin/update_targets.py --config /private/argus/metric/prometheus/nodes.json --targets-dir /private/argus/metric/prometheus/targets >/dev/null 2>&1 || true' +fi + +# 给 Prometheus 一次初始 scrape 周期 +sleep 10 + +# 若短暂未生成,进行重试(最多 180s),期间多次触发刷新 +retry=0 +until jq -r '.[].targets[]' "$targets_json" 2>/dev/null | grep -q "^${IP_FIRST}:9100$"; do + if (( retry >= 36 )); then + echo "[ERR] ${IP_FIRST}:9100 not present in file_sd after timeout" >&2 + echo "[HINT] current targets file content:" >&2 + sed -n '1,200p' "$targets_json" >&2 || true + exit 1 + fi + if (( retry % 3 == 0 )) && docker ps --format '{{.Names}}' | grep -q '^argus-prometheus$'; then + docker exec argus-prometheus bash -lc \ + 'python3 /usr/local/bin/update_targets.py --config /private/argus/metric/prometheus/nodes.json --targets-dir /private/argus/metric/prometheus/targets >/dev/null 2>&1 || true' + fi + echo "[..] 
 waiting file_sd refresh ($retry/36)"; sleep 5; retry=$((retry+1))
+done
+
+# 改为以 PromQL up 指标作为健康依据,避免 targets 页面状态抖动
+echo "[VERIFY:PROM] up{job=\"node\",ip=\"$IP_FIRST\"} > 0"
+attempt=0
+until (( attempt >= 60 )); do
+  curl -fsS --max-time 5 --get "$PROM_BASE/query" --data-urlencode "query=up{job=\"node\",ip=\"$IP_FIRST\"}" > "$TMP_DIR/prom_up_inst_active.json" || true
+  if python3 - "$TMP_DIR/prom_up_inst_active.json" <<'PY'
+import json,sys
+try:
+  j=json.load(open(sys.argv[1]))
+except Exception:
+  raise SystemExit(1)
+res=j.get('data',{}).get('result',[])
+if res:
+  try:
+    val=float(res[0]['value'][1])
+    if val>0: raise SystemExit(0)
+  except Exception:
+    pass
+raise SystemExit(1)
+PY
+  then
+    echo "[OK] up > 0 (control-plane scrape works)"; break
+  fi
+  if (( attempt % 6 == 0 )) && docker ps --format '{{.Names}}' | grep -q '^argus-prometheus$'; then
+    docker exec argus-prometheus bash -lc \
+      'python3 /usr/local/bin/update_targets.py --config /private/argus/metric/prometheus/nodes.json --targets-dir /private/argus/metric/prometheus/targets >/dev/null 2>&1 || true'
+  fi
+  echo "[..] 
 waiting up{job=\"node\",ip=\"$IP_FIRST\"} > 0 ($attempt/60)"; sleep 5; attempt=$((attempt+1))
+done
+if (( attempt >= 60 )); then
+  echo "[ERR] up{job=\"node\",ip=\"$IP_FIRST\"} did not become > 0" >&2
+  exit 1
+fi
+
+echo "[VERIFY:PROM] instant up query > 0"
+curl -fsS --max-time 5 --get "$PROM_BASE/query" --data-urlencode "query=up{job=\"node\",ip=\"$IP_FIRST\"}" > "$TMP_DIR/prom_up_inst.json"
+python3 - "$TMP_DIR/prom_up_inst.json" <<'PY'
+import json,sys
+j=json.load(open(sys.argv[1]))
+res=j.get('data',{}).get('result',[])
+assert res, 'empty result for up{job="node",instance=...}'
+val=float(res[0]['value'][1])
+assert val>0, f"up value not > 0: {val}"
+PY
+echo "[OK] up > 0"
+
+echo "[VERIFY:PROM] count(up{job=\"node\"}==1) >= 1"
+curl -fsS --max-time 5 --get "$PROM_BASE/query" --data-urlencode "query=count(up{job=\"node\"}==1)" > "$TMP_DIR/prom_up_count.json"
+python3 - "$TMP_DIR/prom_up_count.json" <<'PY'
+import json,sys
+j=json.load(open(sys.argv[1]))
+res=j.get('data',{}).get('result',[])
+assert res, 'empty result for count(up{job="node"}==1)'
+val=float(res[0]['value'][1])
+assert val>=1, f"count < 1: {val}"
+PY
+echo "[OK] up count satisfied"
+echo "[DONE] prometheus verify"
+
+# ========== GPU 验证(可选) ==========
+if [[ "${ENABLE_GPU:-false}" == "true" ]]; then
+  echo
+  echo "[VERIFY:PROM][GPU] dcgm targets & up metric"
+  GPU_IP_PORT="${METRIC_TEST_DCGM_GPU:-172.31.0.51:9400}"
+  GPU_IP="${GPU_IP_PORT%%:*}"
+
+  # 1) file_sd 目标存在(在 Prometheus 容器内生成的 targets 文件)
+  TARGETS_FILE="$TEST_ROOT/private/argus/metric/prometheus/targets/dcgm_exporter.json"
+  if [[ ! -f "$TARGETS_FILE" ]]; then
+    echo "[ERR] $TARGETS_FILE missing" >&2; exit 1
+  fi
+  if ! 
jq -r '.[].targets[]' "$TARGETS_FILE" 2>/dev/null | grep -q "^${GPU_IP}:9400$"; then + echo "[ERR] dcgm target not found for ${GPU_IP}:9400" >&2 + exit 1 + fi + echo "[OK] dcgm target present in file_sd" + + # 2) up{job="dcgm", ip=GPU_IP} == 1 + curl -fsS --max-time 5 --get "$PROM_BASE/query" --data-urlencode "query=up{job=\"dcgm\",ip=\"$GPU_IP\"}==1" > "$TMP_DIR/prom_dcgm_up.json" + python3 - "$TMP_DIR/prom_dcgm_up.json" <<'PY' +import json,sys +j=json.load(open(sys.argv[1])) +res=j.get('data',{}).get('result',[]) +assert res, 'up==1 empty for dcgm' +val=float(res[0]['value'][1]) +assert val==1.0, f'up not 1: {val}' +print('OK') +PY + echo "[OK] up{job=dcgm,ip=$GPU_IP} == 1" + + # 3) 至少一个 GPU 指标存在(优先 DCGM_FI_DEV_GPU_UTIL,若无则尝试 DCGM_FI_DEV_FB_USED) + query_one() { + local q="$1"; local out="$2" + curl -fsS --max-time 5 --get "$PROM_BASE/query" --data-urlencode "query=$q" > "$out" + python3 - "$out" <<'PY' +import json,sys +j=json.load(open(sys.argv[1])) +ok=(j.get('status')=='success' and len(j.get('data',{}).get('result',[]))>0) +raise SystemExit(0 if ok else 1) +PY + } + if query_one 'DCGM_FI_DEV_GPU_UTIL' "$TMP_DIR/prom_dcgm_util.json" || query_one 'DCGM_FI_DEV_FB_USED' "$TMP_DIR/prom_dcgm_fb.json"; then + echo "[OK] dcgm metrics present" + else + echo "[ERR] no dcgm metrics found" >&2; exit 1 + fi + + echo "[DONE] prometheus gpu verify" +fi diff --git a/src/sys/tests/scripts/14_metric_cleanup.sh b/src/sys/tests/scripts/14_metric_cleanup.sh new file mode 100755 index 0000000..5c4f3b6 --- /dev/null +++ b/src/sys/tests/scripts/14_metric_cleanup.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +FTP_SHARE="$TEST_ROOT/private/argus/metric/ftp/share" + +if [[ -d "$FTP_SHARE" ]]; then + echo "[SYS-METRIC] 清理 FTP 发布产物..." 
+ rm -f "$FTP_SHARE"/argus-metric_*.tar.gz 2>/dev/null || true + rm -f "$FTP_SHARE"/LATEST_VERSION 2>/dev/null || true + rm -f "$FTP_SHARE"/dns.conf "$FTP_SHARE"/setup.sh 2>/dev/null || true +else + echo "[SYS-METRIC] FTP 目录不存在,跳过清理" +fi + +echo "[SYS-METRIC] Metric 清理完成" diff --git a/src/sys/tests/scripts/metric/test-node-entrypoint.sh b/src/sys/tests/scripts/metric/test-node-entrypoint.sh new file mode 100755 index 0000000..1f1c5c4 --- /dev/null +++ b/src/sys/tests/scripts/metric/test-node-entrypoint.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash +set -euo pipefail + +ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} +ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} +AGENT_ROOT=${AGENT_ROOT:-/private/argus/agent} +PREPARED_FLAG="/tmp/.metric_node_prepared" + +export DEBIAN_FRONTEND=${DEBIAN_FRONTEND:-noninteractive} + +if [[ ! -f "$PREPARED_FLAG" ]]; then + apt-get update -qq + apt-get install -y -qq \ + curl \ + net-tools \ + iproute2 \ + lsof \ + procps \ + ca-certificates \ + gnupg2 || { + echo "[metric-node] Failed to install base packages" >&2 + exit 1 + } + + mkdir -p "$(dirname "$PREPARED_FLAG")" + touch "$PREPARED_FLAG" +fi + +if [[ -n "${TZ:-}" ]]; then + ln -snf "/usr/share/zoneinfo/${TZ}" /etc/localtime 2>/dev/null || true + echo "$TZ" > /etc/timezone 2>/dev/null || true +fi + +mkdir -p "$AGENT_ROOT" +chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" "$AGENT_ROOT" 2>/dev/null || true + +if [[ "${METRIC_NODE_ROLE:-cpu}" == "gpu" ]]; then + if ! 
command -v nvidia-smi >/dev/null 2>&1; then + echo "[metric-node] nvidia-smi not available but GPU role requested" >&2 + exit 1 + fi + nvidia-smi || true +fi + +exec "$@" diff --git a/src/sys/tests/scripts/node_entrypoint.sh b/src/sys/tests/scripts/node_entrypoint.sh index e1ed888..b313506 100755 --- a/src/sys/tests/scripts/node_entrypoint.sh +++ b/src/sys/tests/scripts/node_entrypoint.sh @@ -46,7 +46,9 @@ fi # Start Fluent Bit in background (will block, so run via bash -lc &) if [[ -x /private/start-fluent-bit.sh ]]; then log "starting fluent-bit" - bash -lc '/private/start-fluent-bit.sh' & + sysctl -w fs.inotify.max_user_instances=512 >/dev/null 2>&1 || true + sysctl -w fs.inotify.max_user_watches=524288 >/dev/null 2>&1 || true + bash -lc 'ulimit -n 65536 || true; exec /private/start-fluent-bit.sh' & else log "missing /private/start-fluent-bit.sh; fluent-bit will not start" fi @@ -54,4 +56,3 @@ fi # Start agent in foreground as runtime user log "starting argus-agent" exec su -s /bin/bash -c /usr/local/bin/argus-agent "$RUNTIME_USER" - diff --git a/src/web/build_tools/frontend/Dockerfile b/src/web/build_tools/frontend/Dockerfile index 3c87684..94aa7da 100644 --- a/src/web/build_tools/frontend/Dockerfile +++ b/src/web/build_tools/frontend/Dockerfile @@ -24,24 +24,37 @@ RUN apt-get update && \ apt-get clean && rm -rf /var/lib/apt/lists/* ENV FRONTEND_BASE_PATH=/private/argus/web/frontend -ARG ARGUS_UID=2133 -ARG ARGUS_GID=2015 -ENV ARGUS_UID=${ARGUS_UID} -ENV ARGUS_GID=${ARGUS_GID} +ARG ARGUS_BUILD_UID=2133 +ARG ARGUS_BUILD_GID=2015 +ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} +ENV ARGUS_BUILD_GID=${ARGUS_BUILD_GID} RUN mkdir -p ${FRONTEND_BASE_PATH} && \ mkdir -p /private/argus/etc # 创建 web 用户(可自定义 UID/GID) # 创建 web 用户组 -RUN groupadd -g ${ARGUS_GID} web - -# 创建 web 用户并指定组 -RUN useradd -M -s /usr/sbin/nologin -u ${ARGUS_UID} -g ${ARGUS_GID} web - -RUN chown -R web:web ${FRONTEND_BASE_PATH} && \ - chown -R web:web /private/argus/etc && \ - chown -R web:web /usr/local/bin 
+RUN set -eux; \ + # 确保目标 GID 存在(组名可不固定)\ + if ! getent group "${ARGUS_BUILD_GID}" >/dev/null; then \ + groupadd -g "${ARGUS_BUILD_GID}" web || true; \ + fi; \ + # 若存在 web 用户则尽量对齐 UID/GID;否则仅在 UID 未被占用时创建 + if id web >/dev/null 2>&1; then \ + current_uid="$(id -u web)"; \ + if [ "$current_uid" != "${ARGUS_BUILD_UID}" ] && ! getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \ + usermod -u "${ARGUS_BUILD_UID}" web; \ + fi; \ + usermod -g "${ARGUS_BUILD_GID}" web || true; \ + else \ + if ! getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \ + useradd -M -s /usr/sbin/nologin -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" web; \ + else \ + echo "UID ${ARGUS_BUILD_UID} already exists; skip creating user 'web'"; \ + fi; \ + fi; \ + # 用数值 UID:GID 赋权,避免依赖用户名/组名 + chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" ${FRONTEND_BASE_PATH} /private/argus/etc /usr/local/bin || true # 配置内网 apt 源 (如果指定了内网选项) RUN if [ "$USE_INTRANET" = "true" ]; then \ diff --git a/src/web/build_tools/frontend/build.sh b/src/web/build_tools/frontend/build.sh index 972e0d0..33e29c0 100644 --- a/src/web/build_tools/frontend/build.sh +++ b/src/web/build_tools/frontend/build.sh @@ -4,7 +4,7 @@ docker pull ubuntu:24.04 source src/web/tests/.env docker build \ - --build-arg ARGUS_UID=${ARGUS_UID} \ - --build-arg ARGUS_GID=${ARGUS_GID} \ + --build-arg ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \ + --build-arg ARGUS_BUILD_GID=${ARGUS_BUILD_GID} \ -f src/web/build_tools/frontend/Dockerfile -t argus-web-frontend:latest . 
docker save -o argus-web-frontend-latest.tar argus-web-frontend:latest diff --git a/src/web/build_tools/frontend/nginx.conf b/src/web/build_tools/frontend/nginx.conf index 93491ae..7addad2 100644 --- a/src/web/build_tools/frontend/nginx.conf +++ b/src/web/build_tools/frontend/nginx.conf @@ -1,4 +1,4 @@ -user web; +user root; worker_processes auto; events { diff --git a/src/web/build_tools/frontend/start-web-supervised.sh b/src/web/build_tools/frontend/start-web-supervised.sh index 84382a1..a7e5429 100644 --- a/src/web/build_tools/frontend/start-web-supervised.sh +++ b/src/web/build_tools/frontend/start-web-supervised.sh @@ -8,8 +8,8 @@ DNS_SCRIPT="${DNS_DIR}/update-dns.sh" DOMAIN=web.argus.com WEB_DOMAIN_FILE="${DNS_DIR}/${DOMAIN}" RUNTIME_USER="${ARGUS_RUNTIME_USER:-argus}" -RUNTIME_UID="${ARGUS_UID:-2133}" -RUNTIME_GID="${ARGUS_GID:-2015}" +RUNTIME_UID="${ARGUS_BUILD_UID:-2133}" +RUNTIME_GID="${ARGUS_BUILD_GID:-2015}" mkdir -p "$DNS_DIR" chown -R "$RUNTIME_UID:$RUNTIME_GID" "$DNS_DIR" 2>/dev/null || true @@ -28,5 +28,26 @@ chmod 755 "$WEB_DOMAIN_FILE" echo "[INFO] Launching nginx..." 
+# ========== 生成运行期前端配置 (/usr/share/nginx/html/argus-config.js) ========== +CFG_JS="/usr/share/nginx/html/argus-config.js" +MASTER_PORT="${EXTERNAL_MASTER_PORT:-8085}" +ALERT_PORT="${EXTERNAL_ALERTMANAGER_PORT:-8084}" +GRAFANA_PORT="${EXTERNAL_GRAFANA_PORT:-8081}" +PROM_PORT="${EXTERNAL_PROMETHEUS_PORT:-8082}" +KIBANA_PORT="${EXTERNAL_KIBANA_PORT:-8083}" +{ + echo "// generated at runtime by start-web-supervised.sh" + echo "window.__ARGUS_PORTS__ = {" + echo " MASTER: ${MASTER_PORT}," + echo " ALERTMANAGER: ${ALERT_PORT}," + echo " GRAFANA: ${GRAFANA_PORT}," + echo " PROMETHEUS: ${PROM_PORT}," + echo " KIBANA: ${KIBANA_PORT}," + echo "};" + if [[ -n "${ARGUS_PUBLIC_HOST:-}" ]]; then + printf "window.__ARGUS_PUBLIC_HOST__ = '%s';\n" "$ARGUS_PUBLIC_HOST" + fi +} > "$CFG_JS" + # 启动 nginx 前台模式 exec /usr/sbin/nginx -g "daemon off;" diff --git a/src/web/build_tools/frontend/supervisord.conf b/src/web/build_tools/frontend/supervisord.conf index ee7c3b3..36244aa 100644 --- a/src/web/build_tools/frontend/supervisord.conf +++ b/src/web/build_tools/frontend/supervisord.conf @@ -18,7 +18,7 @@ stopasgroup=true [program:web-health] command=/usr/local/bin/health-check.sh -user=web +user=root stdout_logfile=/var/log/supervisor/web-health.log stderr_logfile=/var/log/supervisor/web-health_error.log autorestart=true diff --git a/src/web/build_tools/proxy/Dockerfile b/src/web/build_tools/proxy/Dockerfile index e43e36f..870afef 100644 --- a/src/web/build_tools/proxy/Dockerfile +++ b/src/web/build_tools/proxy/Dockerfile @@ -8,24 +8,34 @@ RUN apt-get update && \ apt-get clean && rm -rf /var/lib/apt/lists/* ENV FRONTEND_BASE_PATH=/private/argus/web/proxy -ARG ARGUS_UID=2133 -ARG ARGUS_GID=2015 -ENV ARGUS_UID=${ARGUS_UID} -ENV ARGUS_GID=${ARGUS_GID} +ARG ARGUS_BUILD_UID=2133 +ARG ARGUS_BUILD_GID=2015 +ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} +ENV ARGUS_BUILD_GID=${ARGUS_BUILD_GID} RUN mkdir -p ${FRONTEND_BASE_PATH} && \ mkdir -p /private/argus/etc # 创建 proxy 用户(可自定义 UID/GID) # 创建 proxy 用户组 
-RUN groupadd -g ${ARGUS_GID} web_proxy - -# 创建 proxy 用户并指定组 -RUN useradd -M -s /usr/sbin/nologin -u ${ARGUS_UID} -g ${ARGUS_GID} web_proxy - -RUN chown -R web_proxy:web_proxy ${FRONTEND_BASE_PATH} && \ - chown -R web_proxy:web_proxy /private/argus/etc && \ - chown -R web_proxy:web_proxy /usr/local/bin +RUN set -eux; \ + if ! getent group "${ARGUS_BUILD_GID}" >/dev/null; then \ + groupadd -g "${ARGUS_BUILD_GID}" web_proxy || true; \ + fi; \ + if id web_proxy >/dev/null 2>&1; then \ + current_uid="$(id -u web_proxy)"; \ + if [ "$current_uid" != "${ARGUS_BUILD_UID}" ] && ! getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \ + usermod -u "${ARGUS_BUILD_UID}" web_proxy; \ + fi; \ + usermod -g "${ARGUS_BUILD_GID}" web_proxy || true; \ + else \ + if ! getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \ + useradd -M -s /usr/sbin/nologin -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" web_proxy; \ + else \ + echo "UID ${ARGUS_BUILD_UID} already exists; skip creating user 'web_proxy'"; \ + fi; \ + fi; \ + chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" ${FRONTEND_BASE_PATH} /private/argus/etc /usr/local/bin || true # 配置内网 apt 源 (如果指定了内网选项) RUN if [ "$USE_INTRANET" = "true" ]; then \ @@ -56,13 +66,16 @@ RUN mkdir -p /var/log/supervisor # 复制启动脚本 COPY src/web/build_tools/proxy/start-proxy-supervised.sh /usr/local/bin/start-proxy-supervised.sh RUN chmod +x /usr/local/bin/start-proxy-supervised.sh +COPY src/web/build_tools/proxy/start-proxy-retry.sh /usr/local/bin/start-proxy-retry.sh +RUN chmod +x /usr/local/bin/start-proxy-retry.sh # 复制 DNS 监控脚本 -COPY src/web/build_tools/proxy/dns-monitor.sh /usr/local/bin/dns-monitor.sh +# 统一复用 bind 模块的 dns-monitor 脚本,保持行为一致 +COPY src/bind/build/dns-monitor.sh /usr/local/bin/dns-monitor.sh RUN chmod +x /usr/local/bin/dns-monitor.sh # 暴露端口 -EXPOSE 80 +EXPOSE 80 8080 8081 8082 8083 8084 8085 # 保持 root 用户,由 supervisor 控制 user 切换 USER root diff --git a/src/web/build_tools/proxy/build.sh b/src/web/build_tools/proxy/build.sh index 
063e378..98c4f65 100644 --- a/src/web/build_tools/proxy/build.sh +++ b/src/web/build_tools/proxy/build.sh @@ -3,7 +3,7 @@ docker pull ubuntu:24.04 source src/web/tests/.env docker build \ - --build-arg ARGUS_UID=${ARGUS_UID} \ - --build-arg ARGUS_GID=${ARGUS_GID} \ + --build-arg ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \ + --build-arg ARGUS_BUILD_GID=${ARGUS_BUILD_GID} \ -f src/web/build_tools/proxy/Dockerfile -t argus-web-proxy:latest . docker save -o argus-web-proxy-latest.tar argus-web-proxy:latest diff --git a/src/web/build_tools/proxy/conf.d/alert.conf b/src/web/build_tools/proxy/conf.d/alert.conf deleted file mode 100644 index 1aa9224..0000000 --- a/src/web/build_tools/proxy/conf.d/alert.conf +++ /dev/null @@ -1,9 +0,0 @@ -server { - listen 80; - server_name alertmanager.alert.argus.com; - - location / { - set $alert_backend http://alertmanager.alert.argus.com:9093; - proxy_pass $alert_backend; - } -} diff --git a/src/web/build_tools/proxy/conf.d/log.conf b/src/web/build_tools/proxy/conf.d/log.conf deleted file mode 100644 index 0441bb5..0000000 --- a/src/web/build_tools/proxy/conf.d/log.conf +++ /dev/null @@ -1,21 +0,0 @@ -# Elasticsearch -server { - listen 80; - server_name es.log.argus.com; - - location / { - set $es_backend http://es.log.argus.com:9200; - proxy_pass $es_backend; - } -} - -# Kibana -server { - listen 80; - server_name kibana.log.argus.com; - - location / { - set $kibana_backend http://kibana.log.argus.com:5601; - proxy_pass $kibana_backend; - } -} diff --git a/src/web/build_tools/proxy/conf.d/master.conf b/src/web/build_tools/proxy/conf.d/master.conf deleted file mode 100644 index a85a99f..0000000 --- a/src/web/build_tools/proxy/conf.d/master.conf +++ /dev/null @@ -1,27 +0,0 @@ -server { - listen 80; - server_name master.argus.com; - - location / { - set $master_backend http://master.argus.com:3000; - proxy_pass $master_backend; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For 
$proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - - # CORS 支持 - add_header 'Access-Control-Allow-Origin' 'http://web.argus.com' always; - add_header 'Access-Control-Allow-Methods' 'GET, POST, PUT, DELETE, OPTIONS' always; - add_header 'Access-Control-Allow-Headers' 'Origin, Content-Type, Accept, Authorization' always; - - if ($request_method = OPTIONS) { - add_header 'Access-Control-Allow-Origin' 'http://web.argus.com' always; - add_header 'Access-Control-Allow-Methods' 'GET, POST, PUT, DELETE, OPTIONS' always; - add_header 'Access-Control-Allow-Headers' 'Origin, Content-Type, Accept, Authorization' always; - add_header 'Content-Length' 0; - add_header 'Content-Type' 'text/plain'; - return 204; - } - } -} diff --git a/src/web/build_tools/proxy/conf.d/metric.conf b/src/web/build_tools/proxy/conf.d/metric.conf deleted file mode 100644 index 81d68c2..0000000 --- a/src/web/build_tools/proxy/conf.d/metric.conf +++ /dev/null @@ -1,21 +0,0 @@ -# Prometheus -server { - listen 80; - server_name prometheus.metric.argus.com; - - location / { - set $prom_backend http://prom.metric.argus.com:9090; - proxy_pass $prom_backend; - } -} - -# Grafana -server { - listen 80; - server_name grafana.metric.argus.com; - - location / { - set $grafana_backend http://grafana.metric.argus.com:3000; - proxy_pass $grafana_backend; - } -} diff --git a/src/web/build_tools/proxy/conf.d/ports.conf b/src/web/build_tools/proxy/conf.d/ports.conf new file mode 100644 index 0000000..d528dad --- /dev/null +++ b/src/web/build_tools/proxy/conf.d/ports.conf @@ -0,0 +1,95 @@ +map $http_upgrade $connection_upgrade { default upgrade; "" close; } + +# 允许的跨域来源(仅用于 8084/8085) +# 放开为任意来源:将来端口/域名变更均无需调整。 +# 注意:若前端需要携带凭证(cookies/Authorization),这种“回显 Origin”的方式比 "*" 更通用。 +map $http_origin $cors_allow { + default $http_origin; +} + +# 8080 - Portal +server { + listen 8080; + server_name _; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header 
X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection $connection_upgrade; + proxy_http_version 1.1; + location / { proxy_pass http://web.argus.com:8080/; } +} + +# 8081 - Grafana +server { + listen 8081; + server_name _; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection $connection_upgrade; + proxy_http_version 1.1; + location / { proxy_pass http://grafana.metric.argus.com:3000/; } +} + +# 8082 - Prometheus +server { + listen 8082; + server_name _; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_http_version 1.1; + location / { proxy_pass http://prom.metric.argus.com:9090/; } +} + +# 8083 - Kibana +server { + listen 8083; + server_name _; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection $connection_upgrade; + proxy_http_version 1.1; + location / { proxy_pass http://kibana.log.argus.com:5601/; } +} + +# 8084 - Alertmanager(含 CORS) +server { + listen 8084; + server_name _; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_hide_header Access-Control-Allow-Origin; + add_header 'Access-Control-Allow-Origin' $cors_allow always; + add_header 'Access-Control-Allow-Methods' 'GET, POST, PUT, DELETE, OPTIONS' always; + add_header 'Access-Control-Allow-Headers' 'Origin, Content-Type, 
Accept, Authorization' always; + if ($request_method = OPTIONS) { return 204; } + proxy_http_version 1.1; + location / { proxy_pass http://alertmanager.alert.argus.com:9093/; } +} + +# 8085 - Master(新增,含 CORS) +server { + listen 8085; + server_name _; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + add_header 'Access-Control-Allow-Origin' $cors_allow always; + add_header 'Access-Control-Allow-Methods' 'GET, POST, PUT, DELETE, OPTIONS' always; + add_header 'Access-Control-Allow-Headers' 'Origin, Content-Type, Accept, Authorization' always; + if ($request_method = OPTIONS) { return 204; } + proxy_http_version 1.1; + location / { proxy_pass http://master.argus.com:3000/; } +} diff --git a/src/web/build_tools/proxy/conf.d/web.conf b/src/web/build_tools/proxy/conf.d/web.conf deleted file mode 100644 index 27397d0..0000000 --- a/src/web/build_tools/proxy/conf.d/web.conf +++ /dev/null @@ -1,9 +0,0 @@ -server { - listen 80; - server_name web.argus.com; - - location / { - set $web_backend http://web.argus.com:8080; - proxy_pass $web_backend; - } -} diff --git a/src/web/build_tools/proxy/dns-monitor.sh b/src/web/build_tools/proxy/dns-monitor.sh deleted file mode 100644 index 2890b47..0000000 --- a/src/web/build_tools/proxy/dns-monitor.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/bin/bash - -# DNS监控脚本 - 每10秒检查dns.conf是否有变化 -# 如果有变化则执行update-dns.sh脚本 - -DNS_CONF="/private/argus/etc/dns.conf" -DNS_BACKUP="/tmp/dns.conf.backup" -UPDATE_SCRIPT="/private/argus/etc/update-dns.sh" -LOG_FILE="/var/log/supervisor/dns-monitor.log" - -# 确保日志文件存在 -touch "$LOG_FILE" - -log_message() { - echo "$(date '+%Y-%m-%d %H:%M:%S') [DNS-Monitor] $1" >> "$LOG_FILE" -} - -log_message "DNS监控脚本启动" - -while true; do - if [ -f "$DNS_CONF" ]; then - if [ -f "$DNS_BACKUP" ]; then - # 比较文件内容 - if ! 
cmp -s "$DNS_CONF" "$DNS_BACKUP"; then - log_message "检测到DNS配置变化" - - # 更新备份文件 - cp "$DNS_CONF" "$DNS_BACKUP" - - # 执行更新脚本 - if [ -x "$UPDATE_SCRIPT" ]; then - log_message "执行DNS更新脚本: $UPDATE_SCRIPT" - "$UPDATE_SCRIPT" >> "$LOG_FILE" 2>&1 - if [ $? -eq 0 ]; then - log_message "DNS更新脚本执行成功" - else - log_message "DNS更新脚本执行失败" - fi - else - log_message "警告: 更新脚本不存在或不可执行: $UPDATE_SCRIPT" - fi - fi - else - - # 第一次检测到配置文件,执行更新脚本 - if [ -x "$UPDATE_SCRIPT" ]; then - log_message "执行DNS更新脚本: $UPDATE_SCRIPT" - "$UPDATE_SCRIPT" >> "$LOG_FILE" 2>&1 - if [ $? -eq 0 ]; then - log_message "DNS更新脚本执行成功" - - # 第一次运行,创建备份并执行更新 - cp "$DNS_CONF" "$DNS_BACKUP" - log_message "创建DNS配置备份文件" - - else - log_message "DNS更新脚本执行失败" - fi - else - log_message "警告: 更新脚本不存在或不可执行: $UPDATE_SCRIPT" - fi - fi - else - log_message "警告: DNS配置文件不存在: $DNS_CONF" - fi - - sleep 10 -done diff --git a/src/web/build_tools/proxy/nginx.conf.template b/src/web/build_tools/proxy/nginx.conf.template index 41e29ec..5fb04ba 100644 --- a/src/web/build_tools/proxy/nginx.conf.template +++ b/src/web/build_tools/proxy/nginx.conf.template @@ -1,4 +1,4 @@ -user web_proxy; +user root; worker_processes auto; events { @@ -13,6 +13,7 @@ http { # 使用系统 resolv.conf(由 update-dns.sh 动态更新) resolver __RESOLVERS__ valid=30s ipv6=off; + resolver_timeout 5s; # 启用访问日志 access_log /var/log/nginx/access.log; diff --git a/src/web/build_tools/proxy/start-proxy-retry.sh b/src/web/build_tools/proxy/start-proxy-retry.sh new file mode 100644 index 0000000..73d3baa --- /dev/null +++ b/src/web/build_tools/proxy/start-proxy-retry.sh @@ -0,0 +1,20 @@ +#!/bin/sh +set -eu + +MAX=${RETRY_MAX:-10} +DELAY=${RETRY_DELAY:-10} +ATTEMPT=1 + +echo "[INFO] proxy retry wrapper: max=${MAX}, delay=${DELAY}s" + +while [ "$ATTEMPT" -le "$MAX" ]; do + echo "[INFO] starting proxy attempt ${ATTEMPT}/${MAX}" + /usr/local/bin/start-proxy-supervised.sh && exit 0 || true + echo "[WARN] proxy exited (attempt ${ATTEMPT}/${MAX}); sleeping ${DELAY}s before retry" + sleep 
"$DELAY" + ATTEMPT=$((ATTEMPT+1)) +done + +echo "[ERROR] proxy failed after ${MAX} attempts" +exit 1 + diff --git a/src/web/build_tools/proxy/start-proxy-supervised.sh b/src/web/build_tools/proxy/start-proxy-supervised.sh index 51c2b7b..d8dba07 100644 --- a/src/web/build_tools/proxy/start-proxy-supervised.sh +++ b/src/web/build_tools/proxy/start-proxy-supervised.sh @@ -9,8 +9,8 @@ DNS_CONF_PRIVATE="/private/argus/etc/dns.conf" DNS_CONF_SYSTEM="/etc/resolv.conf" DNS_DIR="/private/argus/etc" DNS_SCRIPT="${DNS_DIR}/update-dns.sh" -RUNTIME_UID="${ARGUS_UID:-2133}" -RUNTIME_GID="${ARGUS_GID:-2015}" +RUNTIME_UID="${ARGUS_BUILD_UID:-2133}" +RUNTIME_GID="${ARGUS_BUILD_GID:-2015}" mkdir -p "$DNS_DIR" chown -R "$RUNTIME_UID:$RUNTIME_GID" "$DNS_DIR" 2>/dev/null || true @@ -46,6 +46,10 @@ echo "检测到 DNS 服务器列表: $RESOLVERS" # ========== 生成 nginx.conf ========== if [ -f "$TEMPLATE" ]; then echo "从模板生成 nginx.conf ..." + # 合并 Docker 内置 DNS 以保障解析 Compose 服务名 + if ! echo " $RESOLVERS " | grep -q " 127.0.0.11 "; then + RESOLVERS="127.0.0.11 ${RESOLVERS}" + fi sed "s|__RESOLVERS__|$RESOLVERS|" "$TEMPLATE" > "$TARGET" else echo "错误: 找不到 nginx.conf.template ($TEMPLATE)" @@ -55,6 +59,33 @@ fi # 打印生成结果供排查 grep resolver "$TARGET" || true +# ========== 等待上游域名准备(避免启动即解析失败) ========== +UPSTREAM_DOMAINS=( + web.argus.com + grafana.metric.argus.com + prom.metric.argus.com + kibana.log.argus.com + alertmanager.alert.argus.com + master.argus.com +) +WAIT_MAX=15 +WAITED=0 +MISSING=() +while :; do + MISSING=() + for d in "${UPSTREAM_DOMAINS[@]}"; do + if [ ! -s "/private/argus/etc/${d}" ]; then + MISSING+=("$d") + fi + done + if [ ${#MISSING[@]} -eq 0 ] || [ "$WAITED" -ge "$WAIT_MAX" ]; then + break + fi + echo "[INFO] 等待上游域名记录生成(${WAITED}/${WAIT_MAX}) 缺失: ${MISSING[*]}" + sleep 1 + WAITED=$((WAITED+1)) +done + echo "[INFO] Launching nginx..." 
# 启动 nginx 前台模式 diff --git a/src/web/build_tools/proxy/supervisord.conf b/src/web/build_tools/proxy/supervisord.conf index 57bdfc5..3f668ab 100644 --- a/src/web/build_tools/proxy/supervisord.conf +++ b/src/web/build_tools/proxy/supervisord.conf @@ -5,12 +5,12 @@ pidfile=/var/run/supervisord.pid user=root [program:proxy] -command=/usr/local/bin/start-proxy-supervised.sh +command=/usr/local/bin/start-proxy-retry.sh user=root stdout_logfile=/var/log/supervisor/web-proxy.log stderr_logfile=/var/log/supervisor/web-proxy_error.log autorestart=true -startretries=3 +startretries=10 startsecs=5 stopwaitsecs=10 killasgroup=true diff --git a/src/web/index.html b/src/web/index.html index 86f56fa..9c8f5a4 100644 --- a/src/web/index.html +++ b/src/web/index.html @@ -8,6 +8,8 @@
+ + diff --git a/src/web/src/components/NodeTable.jsx b/src/web/src/components/NodeTable.jsx index dc89443..3f03a02 100644 --- a/src/web/src/components/NodeTable.jsx +++ b/src/web/src/components/NodeTable.jsx @@ -71,7 +71,7 @@ export function NodeTable({ size="xs" variant="outline" component="a" - href={`${EXTERNAL_HOST.GRAFANA}/d/node_gpu_metrics/node-and-gpu-metrics?var-hostname=${encodeURIComponent(node.name)}`} + href={`${EXTERNAL_HOST.GRAFANA}/d/node_gpu_metrics_by_hostname/node-and-gpu-metrics-by-hostname?var-hostname=${encodeURIComponent(node.name)}`} target="_blank" rel="noopener noreferrer" > diff --git a/src/web/src/config/api.js b/src/web/src/config/api.js index ef8a71b..479e755 100644 --- a/src/web/src/config/api.js +++ b/src/web/src/config/api.js @@ -1,30 +1,53 @@ // config/api.js -// Master 节点相关 API +// 运行时解析主机名,统一按端口访问多服务 +const HOST = (typeof window !== 'undefined' && (window.__ARGUS_PUBLIC_HOST__ || window.location.hostname)) || 'localhost'; + +// 默认端口常量(作为兜底值) +const DEFAULT_PORTS = { + MASTER: 8085, // 经网关(含 CORS) + ALERTMANAGER: 8084, + GRAFANA: 8081, + PROMETHEUS: 8082, + KIBANA: 8083, +}; + +// 运行期注入:/argus-config.js 会在 window.__ARGUS_PORTS__ 写入外部端口 +const RUNTIME_PORTS = (typeof window !== 'undefined' && window.__ARGUS_PORTS__) || {}; +const PORTS = { + MASTER: Number(RUNTIME_PORTS.MASTER) || DEFAULT_PORTS.MASTER, + ALERTMANAGER: Number(RUNTIME_PORTS.ALERTMANAGER) || DEFAULT_PORTS.ALERTMANAGER, + GRAFANA: Number(RUNTIME_PORTS.GRAFANA) || DEFAULT_PORTS.GRAFANA, + PROMETHEUS: Number(RUNTIME_PORTS.PROMETHEUS) || DEFAULT_PORTS.PROMETHEUS, + KIBANA: Number(RUNTIME_PORTS.KIBANA) || DEFAULT_PORTS.KIBANA, +}; + +const BASE = { + MASTER: `http://${HOST}:${PORTS.MASTER}`, + ALERT: `http://${HOST}:${PORTS.ALERTMANAGER}`, + GRAFANA: `http://${HOST}:${PORTS.GRAFANA}`, + PROM: `http://${HOST}:${PORTS.PROMETHEUS}`, + KIBANA: `http://${HOST}:${PORTS.KIBANA}`, +}; + +// Master 节点相关 API(统一走 8085) export const MASTER_API = { - // 节点列表 - LIST: 
"http://master.argus.com/api/v1/master/nodes", - - // 节点详情(需要 nodeId) - DETAIL: (nodeId) => `http://master.argus.com/api/v1/master/nodes/${nodeId}`, - - // 节点配置(需要 nodeId) - CONFIG: (nodeId) => `http://master.argus.com/api/v1/master/nodes/${nodeId}/config`, - - // 节点统计信息 - STATISTICS: "http://master.argus.com/api/v1/master/nodes/statistics", + LIST: `${BASE.MASTER}/api/v1/master/nodes`, + DETAIL: (nodeId) => `${BASE.MASTER}/api/v1/master/nodes/${nodeId}`, + CONFIG: (nodeId) => `${BASE.MASTER}/api/v1/master/nodes/${nodeId}/config`, + STATISTICS: `${BASE.MASTER}/api/v1/master/nodes/statistics`, }; -// 其他外部 API +// 其他外部 API(8084) export const EXTERNAL_API = { - ALERTS_INFOS: "http://alertmanager.alert.argus.com/api/v2/alerts", + ALERTS_INFOS: `${BASE.ALERT}/api/v2/alerts`, }; -// 外部服务 Host +// 外部服务 Host(端口化) export const EXTERNAL_HOST = { - ALERTS: "http://alertmanager.alert.argus.com", - GRAFANA: "http://grafana.metric.argus.com", - GRAFANA_DASHBOARD: "http://grafana.metric.argus.com/d/cluster-dashboard/cluster-dashboard", - PROMETHEUS: "http://prometheus.metric.argus.com", - KIBANA: "http://kibana.log.argus.com/app/discover", + ALERTS: `${BASE.ALERT}`, + GRAFANA: `${BASE.GRAFANA}`, + GRAFANA_DASHBOARD: `${BASE.GRAFANA}/d/cluster-dashboard/cluster-dashboard`, + PROMETHEUS: `${BASE.PROM}`, + KIBANA: `${BASE.KIBANA}/app/discover`, }; diff --git a/src/web/tests/docker-compose.yml b/src/web/tests/docker-compose.yml index 985e22c..7be6106 100644 --- a/src/web/tests/docker-compose.yml +++ b/src/web/tests/docker-compose.yml @@ -4,15 +4,15 @@ services: context: ../../../ dockerfile: src/web/build_tools/frontend/Dockerfile args: - ARGUS_UID: ${ARGUS_UID:-2133} - ARGUS_GID: ${ARGUS_GID:-2015} + ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133} + ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015} USE_INTRANET: ${USE_INTRANET:-false} image: argus-web-frontend:latest container_name: argus-web-frontend environment: - ALERTMANAGER_BASE_PATH=/private/argus/web/frontend - - 
ARGUS_UID=${ARGUS_UID:-2133} - - ARGUS_GID=${ARGUS_GID:-2015} + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} ports: - "${ARGUS_WEB_PORT:-8080}:80" volumes: @@ -31,14 +31,14 @@ services: context: ../../../ dockerfile: src/web/build_tools/proxy/Dockerfile args: - ARGUS_UID: ${ARGUS_UID:-2133} - ARGUS_GID: ${ARGUS_GID:-2015} + ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133} + ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015} USE_INTRANET: ${USE_INTRANET:-false} image: argus-web-proxy:latest container_name: argus-web-proxy environment: - - ARGUS_UID=${ARGUS_UID:-2133} - - ARGUS_GID=${ARGUS_GID:-2015} + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} ports: - "8088:80" volumes: