# Co-authored-by: sundapeng <sundp@mail.zgclab.edu.cn>
# Co-authored-by: xuxt <xuxt@zgclab.edu.cn>
# Reviewed-on: #52
# 136 lines · 5.6 KiB · Bash
#!/usr/bin/env bash
#
# Node bundle boot script: strict mode plus the constants shared by every
# later step of the script.
set -euo pipefail

echo "[BOOT] node bundle starting"

# Installation root; holds versions/<ver> directories and the "current" symlink.
readonly INSTALL_DIR="/opt/argus-metric"
# Directory searched for a local argus-metric_*.tar.gz bundle.
readonly BUNDLE_DIR="/bundle"
# Set to 1 once the local-bundle install has produced a valid "current" symlink.
installed_ok=0

# 1-3) Install the client unless "$INSTALL_DIR/current" is already a symlink
#      to a directory. A tarball found in "$BUNDLE_DIR" is tried first; the
#      FTP setup script is the fallback.

#######################################
# Print the first "version" string found in a version.json file.
# Arguments: $1 - path to version.json
# Outputs:   the version on stdout; nothing when the pattern does not match
#######################################
bundle_version() {
  sed -n 's/.*"version"\s*:\s*"\([^"]\+\)".*/\1/p' "$1" | head -n1
}

#######################################
# 2) Try the local bundle (replicates the setup.sh layout: move the payload to
#    $INSTALL_DIR/versions/<ver> and run install.sh there).
# Globals:   INSTALL_DIR, BUNDLE_DIR (read); installed_ok (set to 1 on success)
# Returns:   0 on success and on every "fallback to FTP" path, so that the
#            function can be called as a plain command and `set -e` stays
#            active inside it. Exits the script if install.sh itself fails.
#######################################
install_from_local_bundle() {
  local tarball tmp root sub ver version_dir rc
  local -a candidates contents

  # Glob instead of parsing `ls`; the first match in glob order is used.
  shopt -s nullglob
  candidates=("$BUNDLE_DIR"/argus-metric_*.tar.gz)
  shopt -u nullglob
  if (( ${#candidates[@]} == 0 )); then
    return 0
  fi
  tarball="${candidates[0]}"

  echo "[BOOT] installing from local bundle: $(basename "$tarball")"
  tmp=$(mktemp -d)
  tar -xzf "$tarball" -C "$tmp"

  # locate root containing version.json (top level or a single sub-directory)
  root="$tmp"
  if [[ ! -f "$root/version.json" ]]; then
    sub=$(find "$tmp" -mindepth 1 -maxdepth 1 -type d | head -n1 || true)
    if [[ -n "$sub" && -f "$sub/version.json" ]]; then
      root="$sub"
    fi
  fi
  if [[ ! -f "$root/version.json" ]]; then
    echo "[BOOT][WARN] version.json not found in bundle; fallback to FTP"
    rm -rf -- "$tmp"
    return 0
  fi

  ver=$(bundle_version "$root/version.json")
  if [[ -z "$ver" ]]; then
    echo "[BOOT][WARN] failed to parse version from version.json; fallback to FTP"
    rm -rf -- "$tmp"
    return 0
  fi

  version_dir="$INSTALL_DIR/versions/$ver"
  mkdir -p "$version_dir"

  # move contents (dotfiles included) into version dir; best-effort, but a
  # failure is reported instead of being hidden
  shopt -s dotglob
  contents=("$root"/*)
  shopt -u dotglob
  mv -- "${contents[@]}" "$version_dir/" \
    || echo "[BOOT][WARN] could not move every bundle entry into $version_dir"
  # the extraction directory is no longer needed once the payload has moved
  rm -rf -- "$tmp"

  # run component installer within version dir
  if [[ ! -f "$version_dir/install.sh" ]]; then
    echo "[BOOT][WARN] install.sh missing under $version_dir; fallback to FTP"
    return 0
  fi
  chmod +x "$version_dir/install.sh" 2>/dev/null || true

  # Pass runtime switches: AUTO_START_DCGM=1 is enabled and profiling is
  # disabled by default inside the container (both overridable through the
  # environment).
  # Note: `VAR=.. VAR2=.. (cmd)` is not valid; bash does not allow environment
  # assignments to prefix a `(` compound command, so the variables are exported
  # inside the subshell before install.sh is executed.
  rc=0
  (
    export AUTO_START_DCGM="${AUTO_START_DCGM:-1}"
    export DCGM_EXPORTER_DISABLE_PROFILING="${DCGM_EXPORTER_DISABLE_PROFILING:-1}"
    export DCGM_EXPORTER_LISTEN="${DCGM_EXPORTER_LISTEN:-:9400}"
    cd "$version_dir" && ./install.sh "$version_dir"
  ) || rc=$?
  if (( rc != 0 )); then
    # Same outcome as the implicit `set -e` abort, but with a diagnostic.
    echo "[BOOT][ERROR] install.sh failed under $version_dir (exit $rc)" >&2
    exit "$rc"
  fi

  echo "$ver" > "$INSTALL_DIR/LATEST_VERSION" 2>/dev/null || true
  ln -sfn "$version_dir" "$INSTALL_DIR/current" 2>/dev/null || true
  if [[ -L "$INSTALL_DIR/current" && -d "$INSTALL_DIR/current" ]]; then
    installed_ok=1
    echo "[BOOT] local bundle install OK: version=$ver"
  else
    echo "[BOOT][WARN] current symlink not present after install; will rely on healthcheck to confirm"
  fi
}

#######################################
# 3) Fallback: download setup.sh from the FTP server and run it.
# Globals:   FTPIP, FTP_USER, FTP_PASSWORD (read; all three are required)
# Exits:     1 when any of the FTP variables is unset or empty
#######################################
install_via_ftp() {
  echo "[BOOT] fallback to FTP setup"
  if [[ -z "${FTPIP:-}" || -z "${FTP_USER:-}" || -z "${FTP_PASSWORD:-}" ]]; then
    echo "[BOOT][ERROR] FTP variables not set (FTPIP/FTP_USER/FTP_PASSWORD)" >&2
    exit 1
  fi
  # NOTE(review): the credentials are passed on the command line of both curl
  # and setup.sh, so they are visible in the process list while these run.
  curl -u "$FTP_USER:$FTP_PASSWORD" -fsSL "ftp://$FTPIP:21/setup.sh" -o /tmp/setup.sh
  chmod +x /tmp/setup.sh
  /tmp/setup.sh --server "$FTPIP" --user "$FTP_USER" --password "$FTP_PASSWORD" --port 21
}

# 1) already installed?
if [[ -L "$INSTALL_DIR/current" && -d "$INSTALL_DIR/current" ]]; then
  echo "[BOOT] client already installed at $INSTALL_DIR/current"
else
  install_from_local_bundle
  if [[ ! -L "$INSTALL_DIR/current" && "$installed_ok" -eq 0 ]]; then
    install_via_ftp
  fi
fi

# 4) Make sure argus-agent is running. When no process with that exact name
#    exists, launch it detached in its own session; it inherits this script's
#    environment (MASTER_ENDPOINT/AGENT_*).
pgrep -x argus-agent >/dev/null 2>&1 || {
  echo "[BOOT] starting argus-agent (not detected)"
  setsid /usr/local/bin/argus-agent >/var/log/argus-agent.log 2>&1 < /dev/null &
}

# 5) If dcgm-exporter is not listening (possibly because it crashed with
#    profiling enabled), restart it with a counters list that has every
#    DCGM_FI_PROF_ line removed.

#######################################
# Print the TCP port of a listen address such as ":9400" or "0.0.0.0:9400".
# Arguments: $1 - listen address (optional; defaults to
#                 $DCGM_EXPORTER_LISTEN, then ":9400")
# Outputs:   the port on stdout; 9400 when no numeric port can be extracted
#######################################
dcgm_listen_port() {
  local listen="${1:-${DCGM_EXPORTER_LISTEN:-:9400}}"
  local port="${listen##*:}"
  if [[ "$port" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "$port"
  else
    printf '9400\n'
  fi
}

#######################################
# Succeed when the given `ss` listing contains a socket bound to the port.
# Arguments: $1 - port number, $2 - captured output of `ss -tln`
#######################################
port_in_ss_output() {
  local port=$1 listing=$2
  grep -Eq ":${port}[[:space:]]" <<<"$listing"
}

# Check the port the exporter is actually configured to use, and capture the
# listing first so that `grep -q` exiting early cannot fail the pipeline under
# `pipefail`.
dcgm_port="$(dcgm_listen_port)"
ss_listing="$(ss -tln 2>/dev/null || true)"
if ! port_in_ss_output "$dcgm_port" "$ss_listing"; then
  echo "[BOOT] dcgm-exporter not listening; trying no-prof fallback"
  # Start nv-hostengine only when it is installed and not already running;
  # the short sleep gives it time to come up before the exporter starts.
  if ! pgrep -f nv-hostengine >/dev/null && command -v nv-hostengine >/dev/null 2>&1; then
    (nohup nv-hostengine >/var/log/nv-hostengine.log 2>&1 & sleep 2)
  fi
  cfg_dir="/etc/dcgm-exporter"
  default_cfg="$cfg_dir/default-counters.csv"
  no_prof_cfg="$cfg_dir/no-prof.csv"
  if [[ -f "$default_cfg" ]]; then
    # grep -v exits 1 when it prints no lines; that is not an error here
    grep -v 'DCGM_FI_PROF_' "$default_cfg" > "$no_prof_cfg" || true
    pkill -f dcgm-exporter >/dev/null 2>&1 || true
    nohup /usr/local/bin/dcgm-exporter --address="${DCGM_EXPORTER_LISTEN:-:9400}" --collectors "$no_prof_cfg" >/var/log/dcgm-exporter.log 2>&1 &
  fi
fi

# 6) post-install selfcheck (best-effort) and wait for node.json
#
# Poll (30 tries, 2 s apart) until a check_health.sh is available, then run
# exactly one of them: the one under "current" when present, otherwise the
# last match in glob order. The script's exit status is ignored.
health_script=""
for _ in {1..30}; do
  if [[ -f "$INSTALL_DIR/current/check_health.sh" ]]; then
    health_script="$INSTALL_DIR/current/check_health.sh"
  else
    for candidate in "$INSTALL_DIR"/versions/*/check_health.sh; do
      # an unmatched glob stays literal and fails the -f test
      if [[ -f "$candidate" ]]; then
        health_script="$candidate"
      fi
    done
  fi
  if [[ -n "$health_script" ]]; then
    bash "$health_script" || true
    break
  fi
  sleep 2
done
if [[ -z "$health_script" ]]; then
  echo "[BOOT][WARN] check_health.sh not found under $INSTALL_DIR/versions; selfcheck skipped"
fi

# Wait (60 tries, 2 s apart) for a non-empty node.json in the per-host state
# directory, then continue regardless of the outcome.
host="$(hostname)"
state_dir="/private/argus/agent/${host}"
mkdir -p "$state_dir" 2>/dev/null || true
node_state_seen=0
for _ in {1..60}; do
  if [[ -s "$state_dir/node.json" ]]; then
    echo "[BOOT] node state present: $state_dir/node.json"
    node_state_seen=1
    break
  fi
  sleep 2
done
if (( node_state_seen == 0 )); then
  echo "[BOOT][WARN] node state not present after waiting: $state_dir/node.json"
fi

# 7) spawn health watcher (best-effort, non-blocking)

#######################################
# Print the highest-versioned directory under <base>/versions.
# Arguments: $1 - installation root
# Outputs:   the directory path on stdout (ordered with `sort -V`); nothing
#            when <base>/versions contains no directory
#######################################
latest_version_dir() {
  local base=$1 entry
  local -a dirs=()
  for entry in "$base"/versions/*; do
    # skips plain files and the literal pattern left by an unmatched glob
    if [[ -d "$entry" ]]; then
      dirs+=("$entry")
    fi
  done
  if (( ${#dirs[@]} == 0 )); then
    return 0
  fi
  printf '%s\n' "${dirs[@]}" | sort -V | tail -n1
}

# Prefer the directory the "current" symlink resolves to; otherwise fall back
# to the newest directory under versions/.
ver_dir=""
if [[ -L "$INSTALL_DIR/current" ]]; then
  ver_dir="$(readlink -f "$INSTALL_DIR/current" 2>/dev/null || true)"
fi
if [[ -z "$ver_dir" ]]; then
  ver_dir="$(latest_version_dir "$INSTALL_DIR" || true)"
fi

if command -v /usr/local/bin/health-watcher.sh >/dev/null 2>&1; then
  echo "[BOOT] starting health watcher for $ver_dir"
  # detached in its own session; a failure of the watcher never affects boot
  setsid /usr/local/bin/health-watcher.sh "${ver_dir:-}" >/var/log/health-watcher.log 2>&1 < /dev/null || true &
else
  echo "[BOOT][WARN] health-watcher.sh not found; skip health watcher"
fi

# Boot sequence finished: replace this shell with a never-ending sleep
# (presumably so the container's main process stays alive — confirm).
printf '%s\n' "[BOOT] ready; entering sleep"
exec sleep infinity