#!/usr/bin/env bash
#
# Node bundle boot script.
#
# Steps performed, in order:
#   1) Detect an existing install ($INSTALL_DIR/current symlink to a directory).
#   2) Otherwise install from a local tarball in $BUNDLE_DIR.
#   3) Otherwise fall back to the FTP-hosted setup.sh.
#   4) Start argus-agent if no such process is found.
#   5) Start dcgm-exporter without profiling counters if nothing is listening
#      on its port.
#   6) Run check_health.sh (best-effort) and wait for the agent's node.json.
#   7) Spawn the health watcher, then sleep forever.
#
# Environment read by this script:
#   FTPIP, FTP_USER, FTP_PASSWORD     required only for the FTP fallback
#   AUTO_START_DCGM                   default 1 (exported to install.sh)
#   DCGM_EXPORTER_DISABLE_PROFILING   default 1 (exported to install.sh)
#   DCGM_EXPORTER_LISTEN              default :9400
set -euo pipefail

echo "[BOOT] node bundle starting"

INSTALL_DIR="/opt/argus-metric"
BUNDLE_DIR="/bundle"
installed_ok=0

# 1) already installed?
if [[ -L "$INSTALL_DIR/current" && -d "$INSTALL_DIR/current" ]]; then
  echo "[BOOT] client already installed at $INSTALL_DIR/current"
else
  # 2) try local bundle first (replicate setup.sh layout: move to
  #    /opt/argus-metric/versions/ and run install.sh)
  # compgen -G prints nothing and returns non-zero when the glob has no match;
  # `|| true` keeps that from tripping `set -e` / pipefail.
  tarball=$(compgen -G "$BUNDLE_DIR/argus-metric_*.tar.gz" | head -n1 || true)
  if [[ -n "${tarball:-}" ]]; then
    echo "[BOOT] installing from local bundle: $(basename "$tarball")"
    tmp=$(mktemp -d)
    tar -xzf "$tarball" -C "$tmp"

    # locate root containing version.json (either the archive root or its
    # first top-level directory)
    root="$tmp"
    if [[ ! -f "$root/version.json" ]]; then
      sub=$(find "$tmp" -mindepth 1 -maxdepth 1 -type d | head -n1 || true)
      if [[ -n "$sub" && -f "$sub/version.json" ]]; then
        root="$sub"
      fi
    fi

    if [[ ! -f "$root/version.json" ]]; then
      echo "[BOOT][WARN] version.json not found in bundle; fallback to FTP"
    else
      ver=$(sed -n 's/.*"version"\s*:\s*"\([^"]\+\)".*/\1/p' "$root/version.json" | head -n1)
      if [[ -z "$ver" ]]; then
        echo "[BOOT][WARN] failed to parse version from version.json; fallback to FTP"
      else
        target_root="$INSTALL_DIR"
        version_dir="$target_root/versions/$ver"
        mkdir -p "$version_dir"

        # move contents (dotfiles included) into version dir; best-effort
        shopt -s dotglob
        mv "$root"/* "$version_dir/" 2>/dev/null || true
        shopt -u dotglob

        # run component installer within version dir
        if [[ -f "$version_dir/install.sh" ]]; then
          chmod +x "$version_dir/install.sh" 2>/dev/null || true
          # Pass runtime switches: by default enable AUTO_START_DCGM=1 and
          # disable profiling inside the container (both can be overridden
          # through the environment).
          # Note: `VAR=.. VAR2=.. (cmd)` cannot be used here; bash does not
          # allow environment assignments to prefix a `(` compound command.
          # Hence the variables are exported inside a subshell before running.
          (
            export AUTO_START_DCGM="${AUTO_START_DCGM:-1}"
            export DCGM_EXPORTER_DISABLE_PROFILING="${DCGM_EXPORTER_DISABLE_PROFILING:-1}"
            export DCGM_EXPORTER_LISTEN="${DCGM_EXPORTER_LISTEN:-:9400}"
            cd "$version_dir" && ./install.sh "$version_dir"
          )
          echo "$ver" > "$target_root/LATEST_VERSION" 2>/dev/null || true
          ln -sfn "$version_dir" "$target_root/current" 2>/dev/null || true
          if [[ -L "$target_root/current" && -d "$target_root/current" ]]; then
            installed_ok=1
            echo "[BOOT] local bundle install OK: version=$ver"
          else
            echo "[BOOT][WARN] current symlink not present after install; will rely on healthcheck to confirm"
          fi
        else
          echo "[BOOT][WARN] install.sh missing under $version_dir; fallback to FTP"
        fi
      fi
    fi

    # The extraction directory is no longer needed. Remove it explicitly:
    # the script ends with `exec`, so an EXIT trap would never run.
    rm -rf -- "${tmp:?}"
  fi

  # 3) fallback: use FTP setup if not installed
  if [[ ! -L "$INSTALL_DIR/current" && "$installed_ok" -eq 0 ]]; then
    echo "[BOOT] fallback to FTP setup"
    if [[ -z "${FTPIP:-}" || -z "${FTP_USER:-}" || -z "${FTP_PASSWORD:-}" ]]; then
      echo "[BOOT][ERROR] FTP variables not set (FTPIP/FTP_USER/FTP_PASSWORD)" >&2
      exit 1
    fi
    # NOTE(review): the password is passed on the command line to both curl
    # and setup.sh and is therefore visible in the process list; kept as-is
    # because setup.sh's interface is not visible from here.
    curl -u "$FTP_USER:$FTP_PASSWORD" -fsSL "ftp://$FTPIP:21/setup.sh" -o /tmp/setup.sh
    chmod +x /tmp/setup.sh
    /tmp/setup.sh --server "$FTPIP" --user "$FTP_USER" --password "$FTP_PASSWORD" --port 21
  fi
fi

# 4) ensure agent is running; start if needed (inherits env: MASTER_ENDPOINT/AGENT_*)
if ! pgrep -x argus-agent >/dev/null 2>&1; then
  echo "[BOOT] starting argus-agent (not detected)"
  setsid /usr/local/bin/argus-agent >/var/log/argus-agent.log 2>&1 < /dev/null &
fi

# 5) If dcgm-exporter is not listening (possibly crashed because of
#    profiling), try a fallback start with a counter list that has the
#    profiling metrics removed.
listen_addr="${DCGM_EXPORTER_LISTEN:-:9400}"
# Check the port the exporter is actually configured to use rather than a
# hard-coded 9400; fall back to 9400 if the address has no numeric port.
exporter_port="${listen_addr##*:}"
[[ "$exporter_port" =~ ^[0-9]+$ ]] || exporter_port=9400
# Capture the socket list first: with pipefail, `ss | grep -q` can report
# failure when grep exits early and ss is killed by SIGPIPE.
listening_sockets="$(ss -tlnp 2>/dev/null || true)"
if ! grep -qF -- ":${exporter_port} " <<<"$listening_sockets"; then
  echo "[BOOT] dcgm-exporter not listening; trying no-prof fallback"
  if ! pgrep -f nv-hostengine >/dev/null; then
    nohup nv-hostengine >/var/log/nv-hostengine.log 2>&1 &
    sleep 2
  fi
  cfg_dir="/etc/dcgm-exporter"
  default_cfg="$cfg_dir/default-counters.csv"
  no_prof_cfg="$cfg_dir/no-prof.csv"
  if [[ -f "$default_cfg" ]]; then
    # grep -v returns non-zero when every line is filtered out; tolerate it.
    grep -v 'DCGM_FI_PROF_' "$default_cfg" > "$no_prof_cfg" || true
    pkill -f dcgm-exporter >/dev/null 2>&1 || true
    nohup /usr/local/bin/dcgm-exporter --address="$listen_addr" --collectors "$no_prof_cfg" >/var/log/dcgm-exporter.log 2>&1 &
  fi
fi

# 6) post-install selfcheck (best-effort) and wait for node.json
for _ in {1..30}; do
  # Run exactly one health script: the one of the version `current` points
  # at, otherwise the newest installed version. Passing an unquoted
  # multi-match glob to bash would run the first match with the remaining
  # paths as its arguments.
  health_script=""
  if [[ -L "$INSTALL_DIR/current" ]]; then
    current_dir="$(readlink -f "$INSTALL_DIR/current" 2>/dev/null || true)"
    if [[ -n "$current_dir" && -f "$current_dir/check_health.sh" ]]; then
      health_script="$current_dir/check_health.sh"
    fi
  fi
  if [[ -z "$health_script" ]]; then
    health_script="$(compgen -G "$INSTALL_DIR/versions/*/check_health.sh" | sort -V | tail -n1 || true)"
  fi
  if [[ -n "$health_script" ]]; then
    bash "$health_script" || true
    break
  fi
  sleep 2
done

host="$(hostname)"
state_dir="/private/argus/agent/${host}"
mkdir -p "$state_dir" 2>/dev/null || true
# Poll (up to 60 times, 2 s apart) for a non-empty node.json.
for _ in {1..60}; do
  if [[ -s "$state_dir/node.json" ]]; then
    echo "[BOOT] node state present: $state_dir/node.json"
    break
  fi
  sleep 2
done

# 7) spawn health watcher (best-effort, non-blocking)
ver_dir=""
if [[ -L "$INSTALL_DIR/current" ]]; then
  ver_dir="$(readlink -f "$INSTALL_DIR/current" 2>/dev/null || true)"
fi
if [[ -z "$ver_dir" ]]; then
  # No usable `current` symlink: fall back to the highest version directory.
  ver_dir="$(compgen -G "$INSTALL_DIR/versions/*" | sort -V | tail -n1 || true)"
fi

if command -v /usr/local/bin/health-watcher.sh >/dev/null 2>&1; then
  echo "[BOOT] starting health watcher for $ver_dir"
  setsid /usr/local/bin/health-watcher.sh "${ver_dir:-}" >/var/log/health-watcher.log 2>&1 < /dev/null || true &
else
  echo "[BOOT][WARN] health-watcher.sh not found; skip health watcher"
fi

echo "[BOOT] ready; entering sleep"
exec sleep infinity