argus/src/sys/build/node-bundle/node-bootstrap.sh

136 lines
5.6 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# Node bundle bootstrap: installs the argus-metric client (local bundle first,
# FTP setup as fallback), starts the supporting processes, then idles.
#
# Strict mode: abort on unhandled errors, on unset variables, and on a failure
# in any stage of a pipeline.
set -o errexit -o nounset -o pipefail
printf '%s\n' "[BOOT] node bundle starting"
# Where the client is installed (versions/<ver> plus a "current" symlink).
INSTALL_DIR="/opt/argus-metric"
# Directory searched for a pre-shipped argus-metric_*.tar.gz bundle.
BUNDLE_DIR="/bundle"
# Flips to 1 once the local-bundle install has produced a valid "current" symlink.
installed_ok=0
# 1) Nothing to install when a previous run already left a usable "current" symlink.
if [[ -L "$INSTALL_DIR/current" && -d "$INSTALL_DIR/current" ]]; then
  echo "[BOOT] client already installed at $INSTALL_DIR/current"
else
  # 2) Try the local bundle first: unpack it into $INSTALL_DIR/versions/<ver> and
  #    run its install.sh there (meant to replicate the layout setup.sh produces).
  #    The bundle is picked by globbing into an array rather than parsing `ls`;
  #    the first match in glob order wins.
  shopt -s nullglob
  bundle_candidates=("$BUNDLE_DIR"/argus-metric_*.tar.gz)
  shopt -u nullglob
  tarball="${bundle_candidates[0]:-}"
  if [[ -n "$tarball" ]]; then
    echo "[BOOT] installing from local bundle: $(basename "$tarball")"
    tmp=$(mktemp -d)
    tar -xzf "$tarball" -C "$tmp"
    # Locate the directory containing version.json: either the archive root or
    # the first top-level subdirectory.
    root="$tmp"
    if [[ ! -f "$root/version.json" ]]; then
      sub=$(find "$tmp" -mindepth 1 -maxdepth 1 -type d | head -n1 || true)
      if [[ -n "$sub" && -f "$sub/version.json" ]]; then
        root="$sub"
      fi
    fi
    if [[ ! -f "$root/version.json" ]]; then
      echo "[BOOT][WARN] version.json not found in bundle; fallback to FTP"
    else
      # Extract the first "version": "<value>" occurrence (GNU sed syntax).
      ver=$(sed -n 's/.*"version"\s*:\s*"\([^"]\+\)".*/\1/p' "$root/version.json" | head -n1)
      if [[ -z "$ver" ]]; then
        echo "[BOOT][WARN] failed to parse version from version.json; fallback to FTP"
      else
        # Same location as INSTALL_DIR; kept as its own name, but no longer a
        # second copy of the literal path.
        target_root="$INSTALL_DIR"
        version_dir="$target_root/versions/$ver"
        mkdir -p "$version_dir"
        # Move the unpacked contents (dotfiles included) into the version dir.
        # Best-effort: move errors are deliberately ignored.
        shopt -s dotglob
        mv "$root"/* "$version_dir/" 2>/dev/null || true
        shopt -u dotglob
        # Run the component installer from inside the version dir.
        if [[ -f "$version_dir/install.sh" ]]; then
          chmod +x "$version_dir/install.sh" 2>/dev/null || true
          # Pass runtime switches: by default enable AUTO_START_DCGM=1 and disable
          # profiling; each can be overridden through the environment.
          # Note: bash does not allow `VAR=.. VAR2=.. ( cmd )` assignment prefixes
          # on a `(` compound command, so the variables are exported inside the
          # subshell instead. With errexit active, a failing install.sh aborts
          # the whole script here.
          (
            export AUTO_START_DCGM="${AUTO_START_DCGM:-1}"
            export DCGM_EXPORTER_DISABLE_PROFILING="${DCGM_EXPORTER_DISABLE_PROFILING:-1}"
            export DCGM_EXPORTER_LISTEN="${DCGM_EXPORTER_LISTEN:-:9400}"
            cd "$version_dir" && ./install.sh "$version_dir"
          )
          echo "$ver" > "$target_root/LATEST_VERSION" 2>/dev/null || true
          ln -sfn "$version_dir" "$target_root/current" 2>/dev/null || true
          if [[ -L "$target_root/current" && -d "$target_root/current" ]]; then
            installed_ok=1
            echo "[BOOT] local bundle install OK: version=$ver"
          else
            echo "[BOOT][WARN] current symlink not present after install; will rely on healthcheck to confirm"
          fi
        else
          echo "[BOOT][WARN] install.sh missing under $version_dir; fallback to FTP"
        fi
      fi
    fi
    # The scratch extraction directory is no longer needed on any branch.
    rm -rf -- "${tmp:?}"
  fi
  # 3) Fallback: run the FTP-hosted setup script when nothing got installed.
  if [[ ! -L "$INSTALL_DIR/current" && "$installed_ok" -eq 0 ]]; then
    echo "[BOOT] fallback to FTP setup"
    if [[ -z "${FTPIP:-}" || -z "${FTP_USER:-}" || -z "${FTP_PASSWORD:-}" ]]; then
      echo "[BOOT][ERROR] FTP variables not set (FTPIP/FTP_USER/FTP_PASSWORD)" >&2
      exit 1
    fi
    # Download into a private mktemp directory instead of the predictable
    # /tmp/setup.sh; the file keeps the name setup.sh.
    # NOTE(review): assumes setup.sh does not depend on living at /tmp/setup.sh.
    # NOTE(review): the FTP password is still passed on the command line of curl
    # and setup.sh, as before.
    setup_dir=$(mktemp -d)
    curl -u "$FTP_USER:$FTP_PASSWORD" -fsSL "ftp://$FTPIP:21/setup.sh" -o "$setup_dir/setup.sh"
    chmod +x "$setup_dir/setup.sh"
    "$setup_dir/setup.sh" --server "$FTPIP" --user "$FTP_USER" --password "$FTP_PASSWORD" --port 21
    rm -rf -- "${setup_dir:?}"
  fi
fi
# 4) Make sure the agent is up: when no process named exactly "argus-agent" is
#    found, launch it in its own session, in the background, with stdin detached
#    and all output sent to its log file. The child inherits this script's
#    environment (MASTER_ENDPOINT / AGENT_*).
if pgrep -x argus-agent >/dev/null 2>&1; then
  : # already running, nothing to do
else
  echo "[BOOT] starting argus-agent (not detected)"
  setsid /usr/local/bin/argus-agent </dev/null >/var/log/argus-agent.log 2>&1 &
fi
# 5) If dcgm-exporter is not listening (possibly because profiling crashed it),
#    try restarting it with a counter list that has the profiling fields removed.
#    The port probed is taken from the same DCGM_EXPORTER_LISTEN value used for
#    the restart, so a non-default listen address is not mistaken for "down".
dcgm_listen="${DCGM_EXPORTER_LISTEN:-:9400}"
dcgm_port="${dcgm_listen##*:}"
# Fall back to the default port when the address does not end in a numeric port.
[[ "$dcgm_port" =~ ^[0-9]+$ ]] || dcgm_port=9400
# Capture the socket list first and match in-shell: piping into `grep -q` under
# pipefail can report failure when grep exits early and ss receives SIGPIPE.
# If ss is missing or fails the list is empty and the fallback below runs.
listening_sockets="$(ss -tln 2>/dev/null || true)"
if [[ "$listening_sockets" != *":${dcgm_port} "* ]]; then
  echo "[BOOT] dcgm-exporter not listening; trying no-prof fallback"
  # Start the host engine if no matching process exists, then give it 2 s.
  pgrep -f nv-hostengine >/dev/null || (nohup nv-hostengine >/var/log/nv-hostengine.log 2>&1 & sleep 2)
  cfg_dir="/etc/dcgm-exporter"
  default_cfg="$cfg_dir/default-counters.csv"
  no_prof_cfg="$cfg_dir/no-prof.csv"
  if [[ -f "$default_cfg" ]]; then
    # Derive the no-profiling list by dropping every DCGM_FI_PROF_ line.
    grep -v 'DCGM_FI_PROF_' "$default_cfg" > "$no_prof_cfg" || true
    pkill -f dcgm-exporter >/dev/null 2>&1 || true
    nohup /usr/local/bin/dcgm-exporter --address="$dcgm_listen" --collectors "$no_prof_cfg" >/var/log/dcgm-exporter.log 2>&1 &
  fi
fi
# 6) Post-install self-check (best-effort): poll up to 30 times, 2 s apart, until
#    a check_health.sh is available, then run exactly one and ignore its status.
#    The script under "current" is preferred; otherwise the one from the highest
#    version directory (sort -V) is used. Passing the raw glob to bash would run
#    the first match with every other match as an argument when several
#    versions are installed.
for _ in {1..30}; do
  health_script=""
  if [[ -f "$INSTALL_DIR/current/check_health.sh" ]]; then
    health_script="$INSTALL_DIR/current/check_health.sh"
  elif compgen -G "$INSTALL_DIR/versions/*/check_health.sh" > /dev/null; then
    health_script="$(compgen -G "$INSTALL_DIR/versions/*/check_health.sh" | sort -V | tail -n1)"
  fi
  if [[ -n "$health_script" ]]; then
    bash "$health_script" || true
    break
  fi
  sleep 2
done
# Wait for a non-empty node.json under /private/argus/agent/<hostname>: up to
# 60 checks, sleeping 2 s after each miss. Creating the directory is best-effort,
# and running out of attempts is not an error.
host="$(hostname)"
state_dir="/private/argus/agent/${host}"
node_state="$state_dir/node.json"
mkdir -p "$state_dir" 2>/dev/null || true
remaining=60
until (( remaining == 0 )); do
  if [[ -s "$node_state" ]]; then
    echo "[BOOT] node state present: $state_dir/node.json"
    break
  fi
  sleep 2
  remaining=$(( remaining - 1 ))
done
# 7) Spawn the health watcher (best-effort, non-blocking).
#    The version directory handed to it is the resolved target of "current"; if
#    that is unavailable, the highest entry under versions/ by version sort.
ver_dir=""
if [[ -L "$INSTALL_DIR/current" ]]; then
  ver_dir="$(readlink -f "$INSTALL_DIR/current" 2>/dev/null || true)"
fi
if [[ -z "$ver_dir" ]]; then
  # Glob into an array rather than parsing `ls` output; no match leaves it empty.
  shopt -s nullglob
  installed_versions=("$INSTALL_DIR"/versions/*)
  shopt -u nullglob
  if (( ${#installed_versions[@]} > 0 )); then
    ver_dir="$(printf '%s\n' "${installed_versions[@]}" | sort -V | tail -n1)"
  fi
fi
if command -v /usr/local/bin/health-watcher.sh >/dev/null 2>&1; then
  echo "[BOOT] starting health watcher for $ver_dir"
  # Detached in its own session; ver_dir may be empty if nothing was found.
  setsid /usr/local/bin/health-watcher.sh "${ver_dir:-}" >/var/log/health-watcher.log 2>&1 < /dev/null || true &
else
  echo "[BOOT][WARN] health-watcher.sh not found; skip health watcher"
fi
# Bootstrap is done: replace this shell with a never-ending sleep so the
# script's process stays alive.
printf '%s\n' "[BOOT] ready; entering sleep"
exec sleep infinity