From 69e7a3e2b8e55afa7961c75a89e5b72484aa4e18 Mon Sep 17 00:00:00 2001 From: yuyr Date: Fri, 14 Nov 2025 16:43:34 +0800 Subject: [PATCH] =?UTF-8?q?[#47]=20=E5=A2=9E=E5=8A=A0cpu=20bundle=E9=95=9C?= =?UTF-8?q?=E5=83=8F=E6=9E=84=E5=BB=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- build/README.md | 150 ++++++++++++++++++ build/build_images.sh | 127 ++++++++++++++- src/bundle/cpu-node-bundle/.gitignore | 1 + src/bundle/cpu-node-bundle/Dockerfile | 33 ++++ src/bundle/cpu-node-bundle/node-bootstrap.sh | 124 +++++++++++++++ .../tmp/metric-verify.graf_health.json | 5 - .../tmp/metric-verify/prom_targets.json | 2 +- src/sys/swarm_tests/tmp/targets.json | 1 - 8 files changed, 434 insertions(+), 9 deletions(-) create mode 100644 build/README.md create mode 100644 src/bundle/cpu-node-bundle/.gitignore create mode 100644 src/bundle/cpu-node-bundle/Dockerfile create mode 100644 src/bundle/cpu-node-bundle/node-bootstrap.sh delete mode 100644 src/sys/swarm_tests/tmp/metric-verify.graf_health.json delete mode 100644 src/sys/swarm_tests/tmp/targets.json diff --git a/build/README.md b/build/README.md new file mode 100644 index 0000000..088a64a --- /dev/null +++ b/build/README.md @@ -0,0 +1,150 @@ +# ARGUS 统一构建脚本使用说明(build/build_images.sh) + +本目录提供单一入口脚本 `build/build_images.sh`,覆盖常见三类场景: +- 系统集成测试(src/sys/tests) +- Swarm 系统集成测试(src/sys/swarm_tests) +- 构建离线安装包(deployment_new:Server/Client‑GPU) + +文档还说明 UID/GID 取值规则、镜像 tag 策略、常用参数与重试机制。 + +## 环境前置 +- Docker Engine ≥ 20.10(建议 ≥ 23.x/24.x) +- Docker Compose v2(`docker compose` 子命令) +- 可选:内网构建镜像源(`--intranet`) + +## UID/GID 规则(用于容器内用户/卷属主) +- 非 pkg 构建(core/master/metric/web/alert/sys/gpu_bundle/cpu_bundle): + - 读取 `configs/build_user.local.conf` → `configs/build_user.conf`; + - 可被环境变量覆盖:`ARGUS_BUILD_UID`、`ARGUS_BUILD_GID`; +- pkg 构建(`--only server_pkg`、`--only client_pkg`): + - 读取 `configs/build_user.pkg.conf`(优先)→ `build_user.local.conf` → `build_user.conf`; + - 可被环境变量覆盖; +- CPU bundle 
明确走“非 pkg”链(不读取 `build_user.pkg.conf`)。 +- 说明:仅依赖 UID/GID 的 Docker 层会因参数变动而自动重建,不同构建剖面不会“打错包”。 + +## 镜像 tag 策略 +- 非 pkg 构建:默认输出 `:latest`。 +- `--only server_pkg`:所有镜像直接输出为 `:<VERSION>`(不覆盖 `:latest`)。 +- `--only client_pkg`:GPU bundle 仅输出 `:<VERSION>`(不覆盖 `:latest`)。 +- `--only cpu_bundle`:默认仅输出 `:<VERSION>`;可加 `--tag-latest` 同时打 `:latest` 以兼容 swarm_tests 默认 compose。 + +## 不加 --only 的默认构建目标 +不指定 `--only` 时,脚本会构建“基础镜像集合”(不含 bundle 与安装包): +- core:`argus-elasticsearch:latest`、`argus-kibana:latest`、`argus-bind9:latest` +- master:`argus-master:latest`(非 offline) +- metric:`argus-metric-ftp:latest`、`argus-metric-prometheus:latest`、`argus-metric-grafana:latest` +- web:`argus-web-frontend:latest`、`argus-web-proxy:latest` +- alert:`argus-alertmanager:latest` +- sys:`argus-sys-node:latest`、`argus-sys-metric-test-node:latest`、`argus-sys-metric-test-gpu-node:latest` + +说明:默认 tag 为 `:latest`;UID/GID 走“非 pkg”链(`build_user.local.conf → build_user.conf`,可被环境变量覆盖)。 + +## 通用参数 +- `--intranet`:使用内网构建参数(各 Dockerfile 中按需启用)。 +- `--no-cache`:禁用 Docker 层缓存。 +- `--only <LIST>`:逗号分隔目标,例:`--only core,master,metric,web,alert`。 +- `--version YYYYMMDD`:bundle/pkg 的日期标签(必填于 cpu_bundle/gpu_bundle/server_pkg/client_pkg),例:`20251114`。 +- `--client-semver X.Y.Z`:all‑in‑one‑full 客户端语义化版本(可选)。 +- `--cuda VER`:GPU bundle CUDA 基镜版本(默认 12.2.2)。 +- `--tag-latest`:CPU bundle 构建时同时打 `:latest`。 + +## 自动重试 +- 构建单镜像失败会自动重试(默认 3 次,间隔 5s)。 +- 最后一次自动使用 `DOCKER_BUILDKIT=0` 再试,缓解 “failed to receive status: context canceled”。 +- 可调:`ARGUS_BUILD_RETRIES`、`ARGUS_BUILD_RETRY_DELAY` 环境变量。 + +--- + +## 场景一:系统集成测试(src/sys/tests) +构建用于系统级端到端测试的镜像(默认 `:latest`)。 + +示例: +``` +# 构建核心与周边 +./build/build_images.sh --only core,master,metric,web,alert,sys +``` +产出: +- 本地镜像:`argus-elasticsearch:latest`、`argus-kibana:latest`、`argus-master:latest`、`argus-metric-ftp:latest`、`argus-metric-prometheus:latest`、`argus-metric-grafana:latest`、`argus-alertmanager:latest`、`argus-web-frontend:latest`、`argus-web-proxy:latest`、`argus-sys-node:latest` 等。 + +说明: +- UID/GID 读取 
`build_user.local.conf → build_user.conf`(或环境变量覆盖)。 +- sys/tests 的执行见 `src/sys/tests/README.md`。 + +--- + +## 场景二:Swarm 系统集成测试(src/sys/swarm_tests) +需要服务端镜像 + CPU 节点 bundle 镜像。 + +步骤: +1) 构建服务端镜像(默认 `:latest`) +``` +./build/build_images.sh --only core,master,metric,web,alert +``` +2) 构建 CPU bundle(直接 FROM ubuntu:22.04) +``` +# 仅版本 tag 输出 +./build/build_images.sh --only cpu_bundle --version 20251114 +# 若要兼容 swarm_tests 默认 latest: +./build/build_images.sh --only cpu_bundle --version 20251114 --tag-latest +``` +3) 运行 Swarm 测试 +``` +cd src/sys/swarm_tests +# 如未打 latest,可先指定: +export NODE_BUNDLE_IMAGE_TAG=argus-sys-metric-test-node-bundle:20251114 +./scripts/01_server_up.sh +./scripts/02_wait_ready.sh +./scripts/03_nodes_up.sh +./scripts/04_metric_verify.sh # 验证 Prometheus/Grafana/nodes.json 与日志通路 +./scripts/99_down.sh # 结束 +``` +产出: +- 本地镜像:`argus-*:latest` 与 `argus-sys-metric-test-node-bundle:20251114`(或 latest)。 +- `swarm_tests/private-*`:运行态持久化文件。 + +说明: +- CPU bundle 构建用户走“非 pkg”链(local.conf → conf)。 +- `04_metric_verify.sh` 已内置 Fluent Bit 启动与配置修正逻辑,偶发未就绪可重跑一次即通过。 + +--- + +## 场景三:构建离线安装包(deployment_new) +Server 与 Client‑GPU 安装包均采用“版本直出”,只输出 `:<VERSION>` 标签,不改动 `:latest`。 + +1) Server 包 +``` +./build/build_images.sh --only server_pkg --version 20251114 +``` +产出: +- 本地镜像:`argus-<模块>:20251114`(不触碰 latest)。 +- 安装包:`deployment_new/artifact/server/20251114/` 与 `server_20251114.tar.gz` +- 包内包含:逐镜像 tar.gz、compose/.env.example、scripts(config/install/selfcheck/diagnose 等)、docs、manifest/checksums。 + +2) Client‑GPU 包 +``` +# 同步构建 GPU bundle(仅 :<VERSION>,不触碰 latest),并生成客户端包 +./build/build_images.sh --only client_pkg --version 20251114 \ + --client-semver 1.44.0 --cuda 12.2.2 +``` +产出: +- 本地镜像:`argus-sys-metric-test-node-bundle-gpu:20251114` +- 安装包:`deployment_new/artifact/client_gpu/20251114/` 与 `client_gpu_20251114.tar.gz` +- 包内包含:GPU bundle 镜像 tar.gz、busybox.tar、compose/.env.example、scripts(config/install/uninstall)、docs、manifest/checksums。 + +说明: +- pkg 构建使用 `configs/build_user.pkg.conf` 的 
UID/GID(可被环境覆盖)。 +- 包内 `.env.example` 的 `PKG_VERSION=<VERSION>` 与镜像 tag 严格一致。 + +--- + +## 常见问题(FAQ) +- 构建报 `failed to receive status: context canceled`? + - 已内置单镜像多次重试,最后一次禁用 BuildKit;建议加 `--intranet` 与 `--no-cache` 重试,或 `docker builder prune -f` 后再试。 +- 先跑非 pkg(latest),再跑 pkg(version)会不会“打错包”? + - 不会。涉及 UID/GID 的层因参数变化会重建,其它层按缓存命中复用,最终 pkg 产物的属主与运行账户按 `build_user.pkg.conf` 生效。 +- swarm_tests 默认拉取 `:latest`,我只构建了 `:<VERSION>` 的 CPU bundle 怎么办? + - 在运行前 `export NODE_BUNDLE_IMAGE_TAG=argus-sys-metric-test-node-bundle:<VERSION>`,或在构建时加 `--tag-latest`。 + +--- + +如需进一步自动化(例如生成 BUILD_SUMMARY.txt 汇总镜像 digest 与构建参数),可在 pkg 产出阶段追加相应步骤。 diff --git a/build/build_images.sh b/build/build_images.sh index 8f88f83..030a281 100755 --- a/build/build_images.sh +++ b/build/build_images.sh @@ -12,10 +12,11 @@ Options: --master-offline Build master offline image (requires src/master/offline_wheels.tar.gz) --metric Build metric module images (ftp, prometheus, grafana, test nodes) --no-cache Build all images without using Docker layer cache - --only LIST Comma-separated targets to build: core,master,metric,web,alert,sys,gpu_bundle,server_pkg,client_pkg,all + --only LIST Comma-separated targets to build: core,master,metric,web,alert,sys,gpu_bundle,cpu_bundle,server_pkg,client_pkg,all --version DATE Date tag used by gpu_bundle/server_pkg/client_pkg (e.g. 
20251112) --client-semver X.Y.Z Override client semver used in all-in-one-full artifact (optional) --cuda VER CUDA runtime version for NVIDIA base (default: 12.2.2) + --tag-latest Also tag bundle image as :latest (for cpu_bundle only; default off) -h, --help Show this help message Examples: @@ -36,6 +37,7 @@ build_web=true build_alert=true build_sys=true build_gpu_bundle=false +build_cpu_bundle=false build_server_pkg=false build_client_pkg=false no_cache=false @@ -44,6 +46,7 @@ bundle_date="" client_semver="" cuda_ver="12.2.2" DEFAULT_IMAGE_TAG="latest" +tag_latest=false while [[ $# -gt 0 ]]; do case $1 in @@ -74,7 +77,7 @@ while [[ $# -gt 0 ]]; do fi sel="$2"; shift 2 # reset all, then enable selected - build_core=false; build_master=false; build_metric=false; build_web=false; build_alert=false; build_sys=false; build_gpu_bundle=false; build_server_pkg=false; build_client_pkg=false + build_core=false; build_master=false; build_metric=false; build_web=false; build_alert=false; build_sys=false; build_gpu_bundle=false; build_cpu_bundle=false; build_server_pkg=false; build_client_pkg=false IFS=',' read -ra parts <<< "$sel" for p in "${parts[@]}"; do case "$p" in @@ -85,6 +88,7 @@ while [[ $# -gt 0 ]]; do alert) build_alert=true ;; sys) build_sys=true ;; gpu_bundle) build_gpu_bundle=true ;; + cpu_bundle) build_cpu_bundle=true ;; server_pkg) build_server_pkg=true; build_core=true; build_master=true; build_metric=true; build_web=true; build_alert=true ;; client_pkg) build_client_pkg=true ;; all) build_core=true; build_master=true; build_metric=true; build_web=true; build_alert=true; build_sys=true ;; @@ -104,6 +108,10 @@ while [[ $# -gt 0 ]]; do if [[ -z ${2:-} ]]; then echo "--cuda requires a value like 12.2.2" >&2; exit 1; fi cuda_ver="$2"; shift 2 ;; + --tag-latest) + tag_latest=true + shift + ;; -h|--help) show_help exit 0 @@ -507,6 +515,110 @@ build_client_pkg_bundle() { return 0 } +# Build CPU bundle image directly FROM ubuntu:22.04 (no intermediate base) 
+# Args: $1 date tag (--version, required); $2 client semver (may be empty); $3 "true" to also tag :latest.
+# Returns 0 on success, 1 on failure (a failed :latest tag only warns).
+build_cpu_bundle_image() {
+  local date_tag="$1" # e.g. 20251113
+  local client_ver_in="$2" # semver like 1.43.0 (optional)
+  local want_tag_latest="$3" # true/false
+
+  if [[ -z "$date_tag" ]]; then
+    echo "❌ cpu_bundle requires --version YYYYMMDD" >&2
+    return 1
+  fi
+
+  # printf, not echo: bash's builtin echo prints "\n" literally unless given -e
+  printf '\n%s\n' "🔧 Preparing one-click CPU bundle build"
+  echo " Base: ubuntu:22.04"
+  echo " Bundle tag: ${date_tag}"
+
+  # 1) Build latest argus-agent from source
+  printf '\n%s\n' "🛠 Building argus-agent from src/agent"
+  pushd "$root/src/agent" >/dev/null || return 1
+  if ! bash scripts/build_binary.sh; then
+    echo "❌ argus-agent build failed" >&2
+    popd >/dev/null
+    return 1
+  fi
+  if [[ ! -f "dist/argus-agent" ]]; then
+    echo "❌ argus-agent binary missing after build" >&2
+    popd >/dev/null
+    return 1
+  fi
+  popd >/dev/null
+
+  # 2) Inject agent into all-in-one-full plugin and package artifact
+  local aio_root="$root/src/metric/client-plugins/all-in-one-full"
+  local agent_bin_src="$root/src/agent/dist/argus-agent"
+  local agent_bin_dst="$aio_root/plugins/argus-agent/bin/argus-agent"
+  printf '\n%s\n' "📦 Updating all-in-one-full agent binary → $agent_bin_dst"
+  cp -f "$agent_bin_src" "$agent_bin_dst"
+  chmod +x "$agent_bin_dst" || true
+
+  pushd "$aio_root" >/dev/null || return 1
+  local prev_version use_version
+  prev_version="$(cat config/VERSION 2>/dev/null || echo "1.0.0")"
+  use_version="$prev_version"
+  if [[ -n "$client_ver_in" ]]; then
+    echo "$client_ver_in" > config/VERSION
+    use_version="$client_ver_in"
+  fi
+  echo " Packaging all-in-one-full artifact: version=$use_version"
+  if ! bash scripts/package_artifact.sh --force; then
+    echo "❌ package_artifact.sh failed" >&2
+    [[ -n "$client_ver_in" ]] && echo "$prev_version" > config/VERSION
+    popd >/dev/null
+    return 1
+  fi
+  local artifact_tar artifact_dir="$aio_root/artifact/$use_version"
+  artifact_tar="$(ls -1 "$artifact_dir"/argus-metric_*.tar.gz 2>/dev/null | head -n1 || true)"
+  if [[ -z "$artifact_tar" ]]; then
+    echo " No argus-metric_*.tar.gz found; invoking publish_artifact.sh ..."
+    if ! bash scripts/publish_artifact.sh "$use_version" --output-dir "$artifact_dir" --owner "$(id -u):$(id -g)"; then
+      echo "❌ publish_artifact.sh failed" >&2
+      [[ -n "$client_ver_in" ]] && echo "$prev_version" > config/VERSION
+      popd >/dev/null
+      return 1
+    fi
+    artifact_tar="$(ls -1 "$artifact_dir"/argus-metric_*.tar.gz 2>/dev/null | head -n1 || true)"
+  fi
+  [[ -n "$client_ver_in" ]] && echo "$prev_version" > config/VERSION
+  popd >/dev/null
+  if [[ -z "$artifact_tar" ]]; then
+    echo "❌ no argus-metric_*.tar.gz under $artifact_dir after packaging" >&2
+    return 1
+  fi
+
+  # 3) Stage docker build context
+  local bundle_ctx="$root/src/bundle/cpu-node-bundle/.build-$date_tag"
+  printf '\n%s\n' "🧰 Staging docker build context: $bundle_ctx"
+  rm -rf "$bundle_ctx"
+  mkdir -p "$bundle_ctx/bundle" "$bundle_ctx/private"
+  cp "$root/src/bundle/cpu-node-bundle/Dockerfile" "$bundle_ctx/"
+  cp "$root/src/bundle/cpu-node-bundle/node-bootstrap.sh" "$bundle_ctx/"
+  cp "$artifact_tar" "$bundle_ctx/bundle/"
+  # Offline fluent-bit assets: the bundle Dockerfile COPYs all three, so fail early if one is absent
+  local asset fb_build="$root/src/log/fluent-bit/build"
+  for asset in etc packages start-fluent-bit.sh; do
+    if [[ ! -e "$fb_build/$asset" ]]; then
+      echo "❌ missing fluent-bit asset: $fb_build/$asset" >&2
+      return 1
+    fi
+    cp -r "$fb_build/$asset" "$bundle_ctx/private/"
+  done
+
+  # 4) Build final bundle image
+  local image_tag="argus-sys-metric-test-node-bundle:${date_tag}"
+  printf '\n%s\n' "🔄 Building CPU Bundle image"
+  build_image "CPU Bundle" "$bundle_ctx/Dockerfile" "$image_tag" "$bundle_ctx" || return 1
+  images_built+=("$image_tag")
+  if [[ "$want_tag_latest" == "true" ]] && ! docker tag "$image_tag" argus-sys-metric-test-node-bundle:latest >/dev/null; then
+    echo "⚠️ failed to tag $image_tag as argus-sys-metric-test-node-bundle:latest" >&2
+  fi
+  return 0
+}
+
 if [[ "$build_core" == true ]]; then
   if build_image "Elasticsearch" "src/log/elasticsearch/build/Dockerfile" "argus-elasticsearch:${DEFAULT_IMAGE_TAG}"; then
     images_built+=("argus-elasticsearch:${DEFAULT_IMAGE_TAG}")
@@ -692,6 +804,17 @@ if [[ "$build_gpu_bundle" == true ]]; then
   fi
 fi

+# =======================================
+# One-click CPU bundle (from ubuntu:22.04)
+# =======================================
+if [[ "$build_cpu_bundle" == true ]]; then
+  echo ""
+  echo "Building one-click CPU bundle image..."
+  if ! build_cpu_bundle_image "${bundle_date}" "${client_semver}" "${tag_latest}"; then
+    build_failed=true
+  fi
+fi
+
 # =======================================
 # One-click Server/Client packaging
 # =======================================
diff --git a/src/bundle/cpu-node-bundle/.gitignore b/src/bundle/cpu-node-bundle/.gitignore
new file mode 100644
index 0000000..759168e
--- /dev/null
+++ b/src/bundle/cpu-node-bundle/.gitignore
@@ -0,0 +1 @@
+.build*/
diff --git a/src/bundle/cpu-node-bundle/Dockerfile b/src/bundle/cpu-node-bundle/Dockerfile
new file mode 100644
index 0000000..9afb200
--- /dev/null
+++ b/src/bundle/cpu-node-bundle/Dockerfile
@@ -0,0 +1,33 @@
+FROM ubuntu:22.04
+
+ARG ARGUS_BUILD_UID=2133
+ARG ARGUS_BUILD_GID=2015
+
+ENV DEBIAN_FRONTEND=noninteractive \
+    TZ=Asia/Shanghai \
+    ARGUS_LOGS_WORLD_WRITABLE=1
+
+RUN set -eux; \
+    apt-get update; \
+    apt-get install -y --no-install-recommends \
+      ca-certificates curl wget iproute2 iputils-ping net-tools jq tzdata \
+      cron procps supervisor vim less tar gzip python3; \
+    rm -rf /var/lib/apt/lists/*; \
+    ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
+
+WORKDIR /
+
+# Offline fluent-bit assets 
and bundle tarball are staged by the build script
+COPY node-bootstrap.sh /usr/local/bin/node-bootstrap.sh
+COPY private/start-fluent-bit.sh /private/start-fluent-bit.sh
+COPY private/etc /private/etc
+COPY private/packages /private/packages
+COPY bundle/ /bundle/
+
+RUN chmod +x /usr/local/bin/node-bootstrap.sh /private/start-fluent-bit.sh || true; \
+    mkdir -p /logs/train /logs/infer /buffers /opt/argus-metric; \
+    if [ "${ARGUS_LOGS_WORLD_WRITABLE}" = "1" ]; then chmod 1777 /logs/train /logs/infer || true; else chmod 755 /logs/train /logs/infer || true; fi; \
+    chmod 770 /buffers || true
+
+ENTRYPOINT ["/usr/local/bin/node-bootstrap.sh"]
+
diff --git a/src/bundle/cpu-node-bundle/node-bootstrap.sh b/src/bundle/cpu-node-bundle/node-bootstrap.sh
new file mode 100644
index 0000000..faf86d2
--- /dev/null
+++ b/src/bundle/cpu-node-bundle/node-bootstrap.sh
@@ -0,0 +1,150 @@
+#!/usr/bin/env bash
+# Entrypoint of the CPU node bundle container.
+#
+# Flow: 1) reuse an existing install at $INSTALL_ROOT/current; otherwise
+#       2) install from the argus-metric_*.tar.gz found in $BUNDLE_DIR; otherwise
+#       3) fall back to setup.sh fetched over FTP (needs FTPIP/FTP_USER/FTP_PASSWORD);
+#       4) start argus-agent if it is not running (best-effort);
+#       5) run the initial health check, wait for node.json, then sleep forever.
+set -euo pipefail
+
+echo "[BOOT] CPU node bundle starting"
+
+INSTALL_ROOT="/opt/argus-metric"
+BUNDLE_DIR="/bundle"
+STATE_DIR_BASE="/private/argus/agent"
+
+mkdir -p "$INSTALL_ROOT" "$STATE_DIR_BASE" /logs/train /logs/infer /buffers || true
+
+# Ensure world-writable logs dir with sticky bit (align with deployment_new policy)
+if [[ "${ARGUS_LOGS_WORLD_WRITABLE:-1}" == "1" ]]; then
+  chmod 1777 /logs/train /logs/infer || true
+else
+  chmod 755 /logs/train /logs/infer || true
+fi
+chmod 770 /buffers || true
+
+installed_ok=0
+
+# 1) already installed?
+if [[ -L "$INSTALL_ROOT/current" && -d "$INSTALL_ROOT/current" ]]; then
+  echo "[BOOT] client already installed at $INSTALL_ROOT/current"
+else
+  # 2) try local bundle first (argus-metric_*.tar.gz)
+  tarball=$(ls -1 "$BUNDLE_DIR"/argus-metric_*.tar.gz 2>/dev/null | head -1 || true)
+  if [[ -n "${tarball:-}" ]]; then
+    echo "[BOOT] installing from local bundle: $(basename "$tarball")"
+    tmp=$(mktemp -d)
+    root=""
+    # A corrupt tarball must not abort the script under set -e; treat it like
+    # any other unusable bundle so the FTP fallback below can take over.
+    if tar -xzf "$tarball" -C "$tmp"; then
+      # locate root containing version.json
+      root="$tmp"
+      if [[ ! -f "$root/version.json" ]]; then
+        sub=$(find "$tmp" -mindepth 1 -maxdepth 1 -type d | head -n1 || true)
+        [[ -n "$sub" && -f "$sub/version.json" ]] && root="$sub"
+      fi
+    else
+      echo "[BOOT][WARN] failed to extract $(basename "$tarball"); fallback to FTP"
+    fi
+    if [[ -z "$root" ]]; then
+      : # extraction failed; already reported above
+    elif [[ ! -f "$root/version.json" ]]; then
+      echo "[BOOT][WARN] version.json not found in bundle; fallback to FTP"
+    else
+      ver=$(sed -n 's/.*"version"\s*:\s*"\([^"]\+\)".*/\1/p' "$root/version.json" | head -n1)
+      if [[ -z "$ver" ]]; then
+        echo "[BOOT][WARN] failed to parse version from version.json; fallback to FTP"
+      else
+        target_root="$INSTALL_ROOT"
+        version_dir="$target_root/versions/$ver"
+        mkdir -p "$version_dir"
+        shopt -s dotglob
+        mv "$root"/* "$version_dir/" 2>/dev/null || true
+        shopt -u dotglob
+        if [[ -f "$version_dir/install.sh" ]]; then
+          chmod +x "$version_dir/install.sh" 2>/dev/null || true
+          (
+            export AUTO_START_DCGM="0" # N/A on CPU
+            cd "$version_dir" && ./install.sh "$version_dir"
+          )
+          echo "$ver" > "$target_root/LATEST_VERSION" 2>/dev/null || true
+          ln -sfn "$version_dir" "$target_root/current" 2>/dev/null || true
+          if [[ -L "$target_root/current" && -d "$target_root/current" ]]; then
+            installed_ok=1
+            echo "[BOOT] local bundle install OK: version=$ver"
+          else
+            echo "[BOOT][WARN] current symlink not present after install; will rely on healthcheck to confirm"
+          fi
+        else
+          echo "[BOOT][WARN] install.sh missing under $version_dir; fallback to FTP"
+        fi
+      fi
+    fi
+    # The payload (if any) was moved into $version_dir; drop the scratch dir
+    rm -rf "$tmp"
+  fi
+
+  # 3) fallback: use FTP setup if not installed
+  if [[ ! -L "$INSTALL_ROOT/current" && "$installed_ok" -eq 0 ]]; then
+    echo "[BOOT] fallback to FTP setup"
+    if [[ -z "${FTPIP:-}" || -z "${FTP_USER:-}" || -z "${FTP_PASSWORD:-}" ]]; then
+      echo "[BOOT][ERROR] FTP variables not set (FTPIP/FTP_USER/FTP_PASSWORD)" >&2
+      exit 1
+    fi
+    curl -u "$FTP_USER:$FTP_PASSWORD" -fsSL "ftp://$FTPIP:21/setup.sh" -o /tmp/setup.sh
+    chmod +x /tmp/setup.sh
+    /tmp/setup.sh --server "$FTPIP" --user "$FTP_USER" --password "$FTP_PASSWORD" --port 21
+  fi
+fi
+
+# 4) ensure argus-agent is running (best-effort)
+if ! pgrep -x argus-agent >/dev/null 2>&1; then
+  if [[ -x /usr/local/bin/argus-agent ]]; then
+    echo "[BOOT] starting argus-agent (not detected)"
+    setsid /usr/local/bin/argus-agent >/var/log/argus-agent.log 2>&1 < /dev/null &
+  else
+    echo "[BOOT][WARN] argus-agent not running and /usr/local/bin/argus-agent is not executable; skipping start"
+  fi
+fi
+
+# 5) post-install selfcheck and state
+ver_dir=""
+if [[ -L "$INSTALL_ROOT/current" ]]; then
+  ver_dir="$(readlink -f "$INSTALL_ROOT/current" 2>/dev/null || true)"
+fi
+if [[ -z "$ver_dir" ]]; then
+  ver_dir="$(ls -d "$INSTALL_ROOT"/versions/* 2>/dev/null | sort -V | tail -n1 || true)"
+fi
+
+if [[ -n "$ver_dir" && -x "$ver_dir/check_health.sh" ]]; then
+  echo "[BOOT] running initial health check: $ver_dir/check_health.sh"
+  if "$ver_dir/check_health.sh" >> "$ver_dir/.health_check.init.log" 2>&1; then
+    echo "[BOOT] initial health check completed (see $ver_dir/.health_check.init.log)"
+  else
+    echo "[BOOT][WARN] initial health check reported issues (see $ver_dir/.health_check.init.log)"
+  fi
+else
+  echo "[BOOT][WARN] initial health check skipped (script missing: $ver_dir/check_health.sh)"
+fi
+
+host="$(hostname)"
+state_dir="$STATE_DIR_BASE/${host}"
+mkdir -p "$state_dir" 2>/dev/null || true
+node_state_ready=0
+for _ in {1..60}; do
+  if [[ -s "$state_dir/node.json" ]]; then
+    echo "[BOOT] node state present: $state_dir/node.json"
+    node_state_ready=1
+    break
+  fi
+  sleep 2
+done
+if [[ "$node_state_ready" -eq 0 ]]; then
+  echo "[BOOT][WARN] node state still missing after ~120s: $state_dir/node.json"
+fi
+
+echo "[BOOT] ready; entering sleep"
+exec sleep infinity
+
diff --git a/src/sys/swarm_tests/tmp/metric-verify.graf_health.json b/src/sys/swarm_tests/tmp/metric-verify.graf_health.json
deleted file mode 100644
index 41e9747..0000000
--- a/src/sys/swarm_tests/tmp/metric-verify.graf_health.json
+++ /dev/null
@@ -1,5 +0,0 @@
-{
-  "commit": "5b85c4c2fcf5d32d4f68aaef345c53096359b2f1",
-  "database": "ok",
-  "version": "11.1.0"
-}
\ No newline at end of file
diff --git a/src/sys/swarm_tests/tmp/metric-verify/prom_targets.json b/src/sys/swarm_tests/tmp/metric-verify/prom_targets.json
index 88b3bf2..3ca7fca 100644
--- a/src/sys/swarm_tests/tmp/metric-verify/prom_targets.json
+++ b/src/sys/swarm_tests/tmp/metric-verify/prom_targets.json
@@ -1 +1 @@
-{"status":"success","data":{"activeTargets":[{"discoveredLabels":{"__address__":"10.0.1.13:9400","__meta_filepath":"/private/argus/metric/prometheus/targets/dcgm_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.13","job":"dcgm","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.13","job":"dcgm","node_id":"A1","user_id":"yuyr"},"scrapePool":"dcgm","scrapeUrl":"http://10.0.1.13:9400/metrics","globalUrl":"http://10.0.1.13:9400/metrics","lastError":"","lastScrape":"2025-11-06T17:13:58.44079249+08:00","lastScrapeDuration":0.001229132,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"},{"discoveredLabels":{"__address__":"10.0.1.13:9100","__meta_filepath":"/private/argus/metric/prometheus/targets/node_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.13","job":"node","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.13","job":"node","node_id":"A1","user_id":"yuyr"},"scrapePool":"node","scrapeUrl":"http://10.0.1.13:9100/metrics","globalUrl":"http://10.0.1.13:9100/metrics","lastError":"","lastScrape":"2025-11-06T17:13:54.277705211+08:00","lastScrapeDuration":0.024348657,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"}],"droppedTargets":[],"droppedTargetCounts":{"dcgm":0,"node":0}}} \ No newline at end of file 
+{"status":"success","data":{"activeTargets":[{"discoveredLabels":{"__address__":"10.0.1.13:9400","__meta_filepath":"/private/argus/metric/prometheus/targets/dcgm_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.13","job":"dcgm","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.13","job":"dcgm","node_id":"A1","user_id":"yuyr"},"scrapePool":"dcgm","scrapeUrl":"http://10.0.1.13:9400/metrics","globalUrl":"http://10.0.1.13:9400/metrics","lastError":"","lastScrape":"2025-11-14T16:20:36.702023128+08:00","lastScrapeDuration":0.001054193,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"},{"discoveredLabels":{"__address__":"10.0.1.13:9100","__meta_filepath":"/private/argus/metric/prometheus/targets/node_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.13","job":"node","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.13","job":"node","node_id":"A1","user_id":"yuyr"},"scrapePool":"node","scrapeUrl":"http://10.0.1.13:9100/metrics","globalUrl":"http://10.0.1.13:9100/metrics","lastError":"","lastScrape":"2025-11-14T16:20:34.338081675+08:00","lastScrapeDuration":0.019183536,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"}],"droppedTargets":[],"droppedTargetCounts":{"dcgm":0,"node":0}}} \ No newline at end of file diff --git a/src/sys/swarm_tests/tmp/targets.json b/src/sys/swarm_tests/tmp/targets.json deleted file mode 100644 index 7be6783..0000000 --- a/src/sys/swarm_tests/tmp/targets.json +++ /dev/null @@ -1 +0,0 @@ 
-{"status":"success","data":{"activeTargets":[{"discoveredLabels":{"__address__":"10.0.1.15:9400","__meta_filepath":"/private/argus/metric/prometheus/targets/dcgm_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.15","job":"dcgm","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.15","job":"dcgm","node_id":"A1","user_id":"yuyr"},"scrapePool":"dcgm","scrapeUrl":"http://10.0.1.15:9400/metrics","globalUrl":"http://10.0.1.15:9400/metrics","lastError":"","lastScrape":"2025-11-06T15:47:37.200098366+08:00","lastScrapeDuration":0.001361528,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"},{"discoveredLabels":{"__address__":"10.0.1.15:9100","__meta_filepath":"/private/argus/metric/prometheus/targets/node_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.15","job":"node","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.15","job":"node","node_id":"A1","user_id":"yuyr"},"scrapePool":"node","scrapeUrl":"http://10.0.1.15:9100/metrics","globalUrl":"http://10.0.1.15:9100/metrics","lastError":"","lastScrape":"2025-11-06T15:47:40.184367879+08:00","lastScrapeDuration":0.02923333,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"}],"droppedTargets":[],"droppedTargetCounts":{"dcgm":0,"node":0}}} \ No newline at end of file