#!/usr/bin/env bash
#
# Boot script for a GPU node container.
#
# Steps:
#   1. Create the install/state/log/buffer directories (best-effort).
#   2. If $INSTALL_ROOT/current is not a symlink to a directory, install the
#      client from a local bundle ($BUNDLE_DIR/argus-metric_*.tar.gz).
#   3. If that did not produce an install, download and run setup.sh via FTP.
#   4. Start argus-agent if no such process is detected (best-effort).
#   5. Run the installed check_health.sh once and wait up to ~120s for the
#      node state file, then sleep forever.
#
# Environment:
#   FTPIP, FTP_USER, FTP_PASSWORD    required only when the FTP fallback runs
#   ARGUS_LOGS_WORLD_WRITABLE        "1" (default) -> chmod 1777 on log dirs,
#                                    anything else -> chmod 755
#   AUTO_START_DCGM, DCGM_EXPORTER_DISABLE_PROFILING, DCGM_EXPORTER_LISTEN
#                                    exported to install.sh with defaults
#                                    1, 1 and :9400 respectively
set -euo pipefail

readonly INSTALL_ROOT="/opt/argus-metric"
readonly BUNDLE_DIR="/bundle"
readonly STATE_DIR_BASE="/private/argus/agent"

# Set to 1 by install_from_bundle once the "current" symlink is verified.
installed_ok=0
# Scratch directory used while unpacking a local bundle ("" when none).
bundle_tmp=""

#######################################
# Remove the bundle extraction directory, if one exists.
# Globals: bundle_tmp (read, reset to "")
#######################################
cleanup_bundle_tmp() {
  if [[ -n "$bundle_tmp" ]]; then
    rm -rf -- "$bundle_tmp" || true
    bundle_tmp=""
  fi
}
# Covers abort paths (set -e, exit 1). The final `exec` replaces the shell
# and never fires this trap, so the normal path cleans up explicitly.
trap cleanup_bundle_tmp EXIT

#######################################
# Succeeds when $INSTALL_ROOT/current is a symlink that resolves to a
# directory. Only call this from a condition (if / && / ||).
#######################################
has_current_install() {
  [[ -L "$INSTALL_ROOT/current" && -d "$INSTALL_ROOT/current" ]]
}

#######################################
# Create the directories this script relies on and set their modes.
# Every step is best-effort: failures are tolerated.
#######################################
prepare_dirs() {
  mkdir -p "$INSTALL_ROOT" "$STATE_DIR_BASE" /logs/train /logs/infer /buffers || true
  # World-writable log dirs with sticky bit (align with deployment_new policy).
  if [[ "${ARGUS_LOGS_WORLD_WRITABLE:-1}" == "1" ]]; then
    chmod 1777 /logs/train /logs/infer || true
  else
    chmod 755 /logs/train /logs/infer || true
  fi
  chmod 770 /buffers || true
}

#######################################
# Print the first argus-metric_*.tar.gz under $BUNDLE_DIR (glob order).
# Prints nothing when there is no match.
#######################################
find_local_bundle() {
  local candidates=()
  # nullglob: an unmatched pattern expands to nothing instead of itself.
  shopt -s nullglob
  candidates=("$BUNDLE_DIR"/argus-metric_*.tar.gz)
  shopt -u nullglob
  if (( ${#candidates[@]} > 0 )); then
    printf '%s\n' "${candidates[0]}"
  fi
}

#######################################
# Unpack a bundle tarball, move it to $INSTALL_ROOT/versions/<ver>, run its
# install.sh and point $INSTALL_ROOT/current at it.
# Must be called as a plain statement (not inside a condition) so that
# set -e stays active: a failing tar or install.sh aborts the script.
# Globals:   installed_ok (set to 1 on verified success), bundle_tmp (written)
# Arguments: $1 - path to the bundle tarball
# Returns:   0 also when the bundle is unusable; the caller then falls back
#            to FTP because installed_ok is still 0.
#######################################
install_from_bundle() {
  local tarball=$1
  local root sub ver version_dir

  echo "[BOOT] installing from local bundle: ${tarball##*/}"
  bundle_tmp=$(mktemp -d)
  tar -xzf "$tarball" -C "$bundle_tmp"

  # version.json is either at the archive root or inside the first
  # top-level directory.
  root="$bundle_tmp"
  if [[ ! -f "$root/version.json" ]]; then
    sub=$(find "$bundle_tmp" -mindepth 1 -maxdepth 1 -type d | head -n1 || true)
    if [[ -n "$sub" && -f "$sub/version.json" ]]; then
      root="$sub"
    fi
  fi
  if [[ ! -f "$root/version.json" ]]; then
    echo "[BOOT][WARN] version.json not found in bundle; fallback to FTP"
    return 0
  fi

  # POSIX character classes instead of the GNU-only \s and \+.
  ver=$(sed -n 's/.*"version"[[:space:]]*:[[:space:]]*"\([^"][^"]*\)".*/\1/p' \
    "$root/version.json" | head -n1)
  if [[ -z "$ver" ]]; then
    echo "[BOOT][WARN] failed to parse version from version.json; fallback to FTP"
    return 0
  fi
  # The version becomes a path component; refuse values that would leave
  # $INSTALL_ROOT/versions.
  if [[ "$ver" == */* || "$ver" == "." || "$ver" == ".." ]]; then
    echo "[BOOT][WARN] unsafe version string in version.json: $ver; fallback to FTP"
    return 0
  fi

  version_dir="$INSTALL_ROOT/versions/$ver"
  mkdir -p "$version_dir"
  # dotglob so hidden files in the bundle are moved as well.
  shopt -s dotglob
  mv -- "$root"/* "$version_dir/" \
    || echo "[BOOT][WARN] could not move all bundle files into $version_dir"
  shopt -u dotglob

  if [[ ! -f "$version_dir/install.sh" ]]; then
    echo "[BOOT][WARN] install.sh missing under $version_dir; fallback to FTP"
    return 0
  fi
  chmod +x "$version_dir/install.sh" 2>/dev/null || true
  # Subshell keeps the exports and the cd local to the installer run.
  (
    export AUTO_START_DCGM="${AUTO_START_DCGM:-1}"
    export DCGM_EXPORTER_DISABLE_PROFILING="${DCGM_EXPORTER_DISABLE_PROFILING:-1}"
    export DCGM_EXPORTER_LISTEN="${DCGM_EXPORTER_LISTEN:-:9400}"
    cd "$version_dir" && ./install.sh "$version_dir"
  )

  echo "$ver" > "$INSTALL_ROOT/LATEST_VERSION" 2>/dev/null || true
  ln -sfn "$version_dir" "$INSTALL_ROOT/current" 2>/dev/null || true
  if has_current_install; then
    installed_ok=1
    echo "[BOOT] local bundle install OK: version=$ver"
  else
    echo "[BOOT][WARN] current symlink not present after install; will rely on healthcheck to confirm"
  fi
}

#######################################
# Download setup.sh from the FTP server and run it.
# Exits 1 when FTPIP / FTP_USER / FTP_PASSWORD are not all set.
#######################################
install_from_ftp() {
  echo "[BOOT] fallback to FTP setup"
  if [[ -z "${FTPIP:-}" || -z "${FTP_USER:-}" || -z "${FTP_PASSWORD:-}" ]]; then
    echo "[BOOT][ERROR] FTP variables not set (FTPIP/FTP_USER/FTP_PASSWORD)" >&2
    exit 1
  fi
  # NOTE(review): the password is visible in the process list (curl -u and
  # setup.sh --password) and /tmp/setup.sh is a predictable path. Both are
  # kept because they are the existing contract with setup.sh.
  curl -u "$FTP_USER:$FTP_PASSWORD" -fsSL "ftp://$FTPIP:21/setup.sh" -o /tmp/setup.sh
  chmod +x /tmp/setup.sh
  /tmp/setup.sh --server "$FTPIP" --user "$FTP_USER" --password "$FTP_PASSWORD" --port 21
}

#######################################
# Start argus-agent detached in the background when no process with that
# exact name is running. Best-effort: a failed start does not abort boot.
#######################################
start_agent_if_needed() {
  if ! pgrep -x argus-agent >/dev/null 2>&1; then
    echo "[BOOT] starting argus-agent (not detected)"
    setsid /usr/local/bin/argus-agent >/var/log/argus-agent.log 2>&1 < /dev/null &
  fi
}

#######################################
# Print the version directory to health-check: the target of
# $INSTALL_ROOT/current, else the highest entry (sort -V) under
# $INSTALL_ROOT/versions. Prints an empty line when neither exists.
#######################################
resolve_version_dir() {
  local dir=""
  local versions=()

  if [[ -L "$INSTALL_ROOT/current" ]]; then
    dir=$(readlink -f "$INSTALL_ROOT/current" 2>/dev/null || true)
  fi
  if [[ -z "$dir" ]]; then
    shopt -s nullglob
    versions=("$INSTALL_ROOT"/versions/*)
    shopt -u nullglob
    if (( ${#versions[@]} > 0 )); then
      # Latest by name (semver-like); best-effort.
      dir=$(printf '%s\n' "${versions[@]}" | sort -V | tail -n1 || true)
    fi
  fi
  printf '%s\n' "$dir"
}

#######################################
# Run check_health.sh from the given version directory once, appending its
# output to <dir>/.health_check.init.log. Never fails the boot.
# Arguments: $1 - version directory (may be empty)
#######################################
run_initial_health_check() {
  local ver_dir=$1

  if [[ -n "$ver_dir" && -x "$ver_dir/check_health.sh" ]]; then
    echo "[BOOT] running initial health check: $ver_dir/check_health.sh"
    if "$ver_dir/check_health.sh" >> "$ver_dir/.health_check.init.log" 2>&1; then
      echo "[BOOT] initial health check completed (see $ver_dir/.health_check.init.log)"
    else
      echo "[BOOT][WARN] initial health check reported issues (see $ver_dir/.health_check.init.log)"
    fi
  elif [[ -z "$ver_dir" ]]; then
    echo "[BOOT][WARN] initial health check skipped (no installed version directory found)"
  else
    echo "[BOOT][WARN] initial health check skipped (script missing: $ver_dir/check_health.sh)"
  fi
}

#######################################
# Poll for a non-empty $STATE_DIR_BASE/<hostname>/node.json, up to 60 tries
# 2 seconds apart. Logs whether the file showed up; never fails the boot.
#######################################
wait_for_node_state() {
  local host state_dir
  local found=0

  host="$(hostname)"
  state_dir="$STATE_DIR_BASE/${host}"
  mkdir -p "$state_dir" 2>/dev/null || true
  for _ in {1..60}; do
    if [[ -s "$state_dir/node.json" ]]; then
      echo "[BOOT] node state present: $state_dir/node.json"
      found=1
      break
    fi
    sleep 2
  done
  if (( found == 0 )); then
    echo "[BOOT][WARN] node state not present after 120s: $state_dir/node.json"
  fi
}

main() {
  local tarball ver_dir

  echo "[BOOT] GPU node bundle starting"
  prepare_dirs

  # 1) already installed?
  if has_current_install; then
    echo "[BOOT] client already installed at $INSTALL_ROOT/current"
  else
    # 2) try local bundle first (argus-metric_*.tar.gz)
    tarball=$(find_local_bundle)
    if [[ -n "$tarball" ]]; then
      install_from_bundle "$tarball"
      cleanup_bundle_tmp
    fi

    # 3) fallback: FTP setup when there is still no usable install. Uses the
    # same check as step 1 so a dangling "current" symlink does not count.
    if (( installed_ok == 0 )) && ! has_current_install; then
      install_from_ftp
    fi
  fi

  # 4) ensure argus-agent is running (best-effort)
  start_agent_if_needed

  # 5) post-install selfcheck (run once) and state
  ver_dir=$(resolve_version_dir)
  run_initial_health_check "$ver_dir"
  wait_for_node_state

  echo "[BOOT] ready; entering sleep"
  exec sleep infinity
}

main "$@"