diff --git a/build/build_images.sh b/build/build_images.sh
index e32908c..6da1b7e 100755
--- a/build/build_images.sh
+++ b/build/build_images.sh
@@ -12,7 +12,10 @@ Options:
   --master-offline        Build master offline image (requires src/master/offline_wheels.tar.gz)
   --metric                Build metric module images (ftp, prometheus, grafana, test nodes)
   --no-cache              Build all images without using Docker layer cache
-  --only LIST             Comma-separated targets to build: core,master,metric,web,alert,sys,all
+  --only LIST             Comma-separated targets to build: core,master,metric,web,alert,sys,gpu_bundle,all
+  --version DATE          Bundle date tag used by gpu_bundle (e.g. 20251112)
+  --client-semver X.Y.Z   Override client semver used in all-in-one-full artifact (optional)
+  --cuda VER              CUDA runtime version for NVIDIA base (default: 12.2.2)
   -h, --help              Show this help message
 
 Examples:
@@ -32,8 +35,13 @@ build_metric=true
 build_web=true
 build_alert=true
 build_sys=true
+build_gpu_bundle=false
 no_cache=false
 
+bundle_date=""
+client_semver=""
+cuda_ver="12.2.2"
+
 while [[ $# -gt 0 ]]; do
   case $1 in
     --intranet)
@@ -63,7 +71,7 @@ while [[ $# -gt 0 ]]; do
       fi
       sel="$2"; shift 2
       # reset all, then enable selected
-      build_core=false; build_master=false; build_metric=false; build_web=false; build_alert=false; build_sys=false
+      build_core=false; build_master=false; build_metric=false; build_web=false; build_alert=false; build_sys=false; build_gpu_bundle=false
       IFS=',' read -ra parts <<< "$sel"
       for p in "${parts[@]}"; do
         case "$p" in
@@ -73,11 +81,24 @@ while [[ $# -gt 0 ]]; do
           web) build_web=true ;;
           alert) build_alert=true ;;
           sys) build_sys=true ;;
+          gpu_bundle) build_gpu_bundle=true ;;
           all) build_core=true; build_master=true; build_metric=true; build_web=true; build_alert=true; build_sys=true ;;
           *) echo "Unknown --only target: $p" >&2; exit 1 ;;
         esac
       done
       ;;
+    --version)
+      if [[ -z ${2:-} ]]; then echo "--version requires a value like 20251112" >&2; exit 1; fi
+      bundle_date="$2"; shift 2
+      ;;
+    --client-semver)
+      if [[ -z ${2:-} ]]; then echo "--client-semver requires a value like 1.43.0" >&2; exit 1; fi
+      client_semver="$2"; shift 2
+      ;;
+    --cuda)
+      if [[ -z ${2:-} ]]; then echo "--cuda requires a value like 12.2.2" >&2; exit 1; fi
+      cuda_ver="$2"; shift 2
+      ;;
     -h|--help)
       show_help
       exit 0
@@ -203,6 +224,191 @@ pull_base_image() {
 images_built=()
 build_failed=false
 
+#######################################
+# Build the one-click GPU node bundle image on top of an NVIDIA CUDA runtime base.
+# Steps: resolve/pull the CUDA base, build argus-agent, package the
+# all-in-one-full artifact, stage a docker build context, build the image.
+# Globals:   root (read), images_built (appended on success)
+# Arguments: $1 - bundle date tag, e.g. 20251112 (required)
+#            $2 - CUDA version: 12, 12.2 or 12.2.2 (anything else falls back to 12.2.2)
+#            $3 - client semver override; empty keeps the value in config/VERSION
+# Returns:   0 on success, 1 on any failed step
+#######################################
+build_gpu_bundle_image() {
+  local date_tag="$1"       # e.g. 20251112
+  local cuda_ver_local="$2" # e.g. 12.2.2
+  local client_ver="${3:-}" # semver like 1.43.0
+
+  if [[ -z "$date_tag" ]]; then
+    echo "❌ gpu_bundle requires --version YYYYMMDD (e.g. 20251112)" >&2
+    return 1
+  fi
+
+  # sanitize cuda version (trim trailing dots like '12.2.')
+  while [[ "$cuda_ver_local" == *"." ]]; do cuda_ver_local="${cuda_ver_local%.}"; done
+
+  # Resolve effective CUDA base tag
+  # NOTE: bash functions are never function-local; this helper is global once defined.
+  resolve_cuda_base_tag() {
+    local want="$1" # can be 12, 12.2 or 12.2.2
+    local major minor patch
+    if [[ "$want" =~ ^([0-9]+)\.([0-9]+)\.([0-9]+)$ ]]; then
+      major="${BASH_REMATCH[1]}"; minor="${BASH_REMATCH[2]}"; patch="${BASH_REMATCH[3]}"
+      echo "nvidia/cuda:${major}.${minor}.${patch}-runtime-ubuntu22.04"; return 0
+    elif [[ "$want" =~ ^([0-9]+)\.([0-9]+)$ ]]; then
+      major="${BASH_REMATCH[1]}"; minor="${BASH_REMATCH[2]}"
+      # try to find best local patch for major.minor
+      local best
+      best=$(docker images --format '{{.Repository}}:{{.Tag}}' nvidia/cuda 2>/dev/null | \
+        grep -E "^nvidia/cuda:${major}\.${minor}\\.[0-9]+-runtime-ubuntu22\.04$" | \
+        sed -E 's#^nvidia/cuda:([0-9]+\.[0-9]+\.)([0-9]+)-runtime-ubuntu22\.04$#\1\2#g' | \
+        sort -V | tail -n1 || true)
+      if [[ -n "$best" ]]; then
+        echo "nvidia/cuda:${best}-runtime-ubuntu22.04"; return 0
+      fi
+      # fallback patch if none local
+      echo "nvidia/cuda:${major}.${minor}.2-runtime-ubuntu22.04"; return 0
+    elif [[ "$want" =~ ^([0-9]+)$ ]]; then
+      major="${BASH_REMATCH[1]}"
+      # try to find best local for this major
+      local best
+      best=$(docker images --format '{{.Repository}}:{{.Tag}}' nvidia/cuda 2>/dev/null | \
+        grep -E "^nvidia/cuda:${major}\\.[0-9]+\\.[0-9]+-runtime-ubuntu22\.04$" | \
+        sed -E 's#^nvidia/cuda:([0-9]+\.[0-9]+\.[0-9]+)-runtime-ubuntu22\.04$#\1#g' | \
+        sort -V | tail -n1 || true)
+      if [[ -n "$best" ]]; then
+        echo "nvidia/cuda:${best}-runtime-ubuntu22.04"; return 0
+      fi
+      echo "nvidia/cuda:${major}.2.2-runtime-ubuntu22.04"; return 0
+    else
+      # invalid format, fallback to default
+      echo "nvidia/cuda:12.2.2-runtime-ubuntu22.04"; return 0
+    fi
+  }
+
+  local base_image
+  base_image=$(resolve_cuda_base_tag "$cuda_ver_local")
+
+  echo
+  echo "🔧 Preparing one-click GPU bundle build"
+  echo "   CUDA runtime base: ${base_image}"
+  echo "   Bundle tag       : ${date_tag}"
+
+  # 1) Ensure NVIDIA base image (skip pull if local)
+  if ! pull_base_image "$base_image"; then
+    # try once more with default if resolution failed
+    if ! pull_base_image "nvidia/cuda:12.2.2-runtime-ubuntu22.04"; then
+      return 1
+    else
+      base_image="nvidia/cuda:12.2.2-runtime-ubuntu22.04"
+    fi
+  fi
+
+  # 2) Build latest argus-agent from source
+  printf '\n🛠 Building argus-agent from src/agent\n'
+  pushd "$root/src/agent" >/dev/null || return 1
+  if ! bash scripts/build_binary.sh; then
+    echo "❌ argus-agent build failed" >&2
+    popd >/dev/null
+    return 1
+  fi
+  if [[ ! -f "dist/argus-agent" ]]; then
+    echo "❌ argus-agent binary missing after build" >&2
+    popd >/dev/null
+    return 1
+  fi
+  popd >/dev/null
+
+  # 3) Inject agent into all-in-one-full plugin and package artifact
+  local aio_root="$root/src/metric/client-plugins/all-in-one-full"
+  local agent_bin_src="$root/src/agent/dist/argus-agent"
+  local agent_bin_dst="$aio_root/plugins/argus-agent/bin/argus-agent"
+  printf '\n📦 Updating all-in-one-full agent binary → %s\n' "$agent_bin_dst"
+  # bin/ is git-ignored, so it may not exist on a fresh checkout
+  mkdir -p "$(dirname "$agent_bin_dst")"
+  if ! cp -f "$agent_bin_src" "$agent_bin_dst"; then
+    echo "❌ failed to copy argus-agent to $agent_bin_dst" >&2
+    return 1
+  fi
+  chmod +x "$agent_bin_dst" || true
+
+  pushd "$aio_root" >/dev/null || return 1
+  local prev_version
+  prev_version="$(cat config/VERSION 2>/dev/null || echo "1.0.0")"
+  local use_version="$prev_version"
+  if [[ -n "$client_ver" ]]; then
+    echo "${client_ver}" > config/VERSION
+    use_version="$client_ver"
+  fi
+  echo "   Packaging all-in-one-full artifact version: $use_version"
+  if ! bash scripts/package_artifact.sh --force; then
+    echo "❌ package_artifact.sh failed" >&2
+    # restore VERSION if changed
+    if [[ -n "$client_ver" ]]; then echo "$prev_version" > config/VERSION; fi
+    popd >/dev/null
+    return 1
+  fi
+
+  local artifact_dir="$aio_root/artifact/$use_version"
+  local artifact_tar
+  artifact_tar="$(ls -1 "$artifact_dir"/argus-metric_*.tar.gz 2>/dev/null | head -n1 || true)"
+  if [[ -z "$artifact_tar" ]]; then
+    echo "   No argus-metric_*.tar.gz found; invoking publish_artifact.sh to assemble..."
+    local owner; owner="$(id -u):$(id -g)"
+    if ! bash scripts/publish_artifact.sh "$use_version" --output-dir "$artifact_dir" --owner "$owner"; then
+      echo "❌ publish_artifact.sh failed" >&2
+      if [[ -n "$client_ver" ]]; then echo "$prev_version" > config/VERSION; fi
+      popd >/dev/null
+      return 1
+    fi
+    artifact_tar="$(ls -1 "$artifact_dir"/argus-metric_*.tar.gz 2>/dev/null | head -n1 || true)"
+  fi
+  if [[ -z "$artifact_tar" ]]; then
+    echo "❌ artifact tar not found under $artifact_dir" >&2
+    if [[ -n "$client_ver" ]]; then echo "$prev_version" > config/VERSION; fi
+    popd >/dev/null
+    return 1
+  fi
+  # restore VERSION if changed (keep filesystem clean)
+  if [[ -n "$client_ver" ]]; then echo "$prev_version" > config/VERSION; fi
+  popd >/dev/null
+
+  # 4) Stage docker build context
+  local bundle_ctx="$root/src/bundle/gpu-node-bundle/.build-$date_tag"
+  printf '\n🧰 Staging docker build context: %s\n' "$bundle_ctx"
+  rm -rf "$bundle_ctx"
+  mkdir -p "$bundle_ctx/bundle" "$bundle_ctx/private"
+  cp "$root/src/bundle/gpu-node-bundle/Dockerfile" "$bundle_ctx/"
+  cp "$root/src/bundle/gpu-node-bundle/node-bootstrap.sh" "$bundle_ctx/"
+  # bundle tar
+  cp "$artifact_tar" "$bundle_ctx/bundle/"
+  # offline fluent-bit assets (optional but useful)
+  if [[ -d "$root/src/log/fluent-bit/build/etc" ]]; then
+    cp -r "$root/src/log/fluent-bit/build/etc" "$bundle_ctx/private/"
+  fi
+  if [[ -d "$root/src/log/fluent-bit/build/packages" ]]; then
+    cp -r "$root/src/log/fluent-bit/build/packages" "$bundle_ctx/private/"
+  fi
+  if [[ -f "$root/src/log/fluent-bit/build/start-fluent-bit.sh" ]]; then
+    cp "$root/src/log/fluent-bit/build/start-fluent-bit.sh" "$bundle_ctx/private/"
+  fi
+
+  # 5) Build the final bundle image (directly from NVIDIA base)
+  local image_tag="argus-sys-metric-test-node-bundle-gpu:${date_tag}"
+  printf '\n🔄 Building GPU Bundle image\n'
+  if build_image "GPU Bundle" "$bundle_ctx/Dockerfile" "$image_tag" "$bundle_ctx" \
+    --build-arg CUDA_VER="$(echo "$base_image" | sed -E 's#^nvidia/cuda:([0-9]+\.[0-9]+\.[0-9]+)-runtime-ubuntu22\.04$#\1#')" \
+    --build-arg CLIENT_VER="$use_version" \
+    --build-arg BUNDLE_DATE="$date_tag"; then
+    images_built+=("$image_tag")
+    # also tag latest for convenience
+    docker tag "$image_tag" argus-sys-metric-test-node-bundle-gpu:latest >/dev/null 2>&1 || true
+    return 0
+  else
+    return 1
+  fi
+}
+
 if [[ "$build_core" == true ]]; then
   if build_image "Elasticsearch" "src/log/elasticsearch/build/Dockerfile" "argus-elasticsearch:latest"; then
     images_built+=("argus-elasticsearch:latest")
@@ -376,6 +582,18 @@ if [[ "$build_web" == true || "$build_alert" == true ]]; then
   fi
 fi
 
+# =======================================
+# One-click GPU bundle (direct NVIDIA base)
+# =======================================
+
+if [[ "$build_gpu_bundle" == true ]]; then
+  echo ""
+  echo "Building one-click GPU bundle image..."
+  if ! build_gpu_bundle_image "$bundle_date" "$cuda_ver" "$client_semver"; then
+    build_failed=true
+  fi
+fi
+
 echo "======================================="
 echo "šŸ“¦ Build Summary"
 echo "======================================="
diff --git a/src/agent/.gitignore b/src/agent/.gitignore
index 60fe090..d10b76a 100644
--- a/src/agent/.gitignore
+++ b/src/agent/.gitignore
@@ -3,3 +3,4 @@
 build/
 __pycache__/
 .env
+dist/
diff --git a/src/agent/dist/argus-agent b/src/agent/dist/argus-agent
deleted file mode 100755
index 9e71eb1..0000000
Binary files a/src/agent/dist/argus-agent and /dev/null differ
diff --git a/src/bundle/gpu-node-bundle/.gitignore b/src/bundle/gpu-node-bundle/.gitignore
new file mode 100644
index 0000000..759168e
--- /dev/null
+++ b/src/bundle/gpu-node-bundle/.gitignore
@@ -0,0 +1 @@
+.build*/
diff --git a/src/bundle/gpu-node-bundle/Dockerfile b/src/bundle/gpu-node-bundle/Dockerfile
new file mode 100644
index 0000000..006a7c9
--- /dev/null
+++ b/src/bundle/gpu-node-bundle/Dockerfile
@@ -0,0 +1,42 @@
+ARG CUDA_VER=12.2.2
+FROM nvidia/cuda:${CUDA_VER}-runtime-ubuntu22.04
+
+ARG CLIENT_VER=0.0.0
+ARG BUNDLE_DATE=00000000
+
+LABEL org.opencontainers.image.title="argus-sys-metric-test-node-bundle-gpu" \
+      org.opencontainers.image.description="GPU node bundle with embedded Argus client artifact" \
+      org.opencontainers.image.version="${CLIENT_VER}" \
+      org.opencontainers.image.revision_date="${BUNDLE_DATE}" \
+      maintainer="Argus"
+
+ENV DEBIAN_FRONTEND=noninteractive \
+    TZ=Asia/Shanghai \
+    ARGUS_LOGS_WORLD_WRITABLE=1 \
+    ES_HOST=es.log.argus.com \
+    ES_PORT=9200 \
+    CLUSTER=local \
+    RACK=dev
+
+RUN set -eux; \
+    apt-get update; \
+    apt-get install -y --no-install-recommends \
+      ca-certificates curl wget iproute2 iputils-ping net-tools jq tzdata cron procps vim less \
+      tar gzip; \
+    rm -rf /var/lib/apt/lists/*; \
+    ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
+
+WORKDIR /
+
+# Expect staged build context to provide these directories/files
+COPY bundle/ /bundle/
+COPY node-bootstrap.sh /usr/local/bin/node-bootstrap.sh
+# fluent-bit assets are staged only when present, so copy the whole (possibly empty) directory
+COPY private/ /private/
+
+RUN chmod +x /usr/local/bin/node-bootstrap.sh /private/start-fluent-bit.sh || true; \
+    mkdir -p /logs/train /logs/infer /buffers /opt/argus-metric; \
+    chmod 1777 /logs/train /logs/infer || true; \
+    chmod 770 /buffers || true
+
+ENTRYPOINT ["/usr/local/bin/node-bootstrap.sh"]
diff --git a/src/bundle/gpu-node-bundle/node-bootstrap.sh b/src/bundle/gpu-node-bundle/node-bootstrap.sh
new file mode 100644
index 0000000..603d4eb
--- /dev/null
+++ b/src/bundle/gpu-node-bundle/node-bootstrap.sh
@@ -0,0 +1,132 @@
+#!/usr/bin/env bash
+# Entrypoint of the GPU node bundle image.
+# Installs the Argus client from /bundle (FTP setup as fallback), starts
+# argus-agent if it is not running, runs one health check, then sleeps forever.
+# Env: FTPIP / FTP_USER / FTP_PASSWORD are required only for the FTP fallback.
+set -euo pipefail
+
+echo "[BOOT] GPU node bundle starting"
+
+INSTALL_ROOT="/opt/argus-metric"
+BUNDLE_DIR="/bundle"
+STATE_DIR_BASE="/private/argus/agent"
+
+mkdir -p "$INSTALL_ROOT" "$STATE_DIR_BASE" /logs/train /logs/infer /buffers || true
+
+# Ensure world-writable logs dir with sticky bit (align with deployment_new policy)
+if [[ "${ARGUS_LOGS_WORLD_WRITABLE:-1}" == "1" ]]; then
+  chmod 1777 /logs/train /logs/infer || true
+else
+  chmod 755 /logs/train /logs/infer || true
+fi
+chmod 770 /buffers || true
+
+installed_ok=0
+
+# 1) already installed?
+if [[ -L "$INSTALL_ROOT/current" && -d "$INSTALL_ROOT/current" ]]; then
+  echo "[BOOT] client already installed at $INSTALL_ROOT/current"
+else
+  # 2) try local bundle first (argus-metric_*.tar.gz)
+  tarball=$(ls -1 "$BUNDLE_DIR"/argus-metric_*.tar.gz 2>/dev/null | head -1 || true)
+  if [[ -n "${tarball:-}" ]]; then
+    echo "[BOOT] installing from local bundle: $(basename "$tarball")"
+    tmp=$(mktemp -d)
+    tar -xzf "$tarball" -C "$tmp"
+    # locate root containing version.json
+    root="$tmp"
+    if [[ ! -f "$root/version.json" ]]; then
+      sub=$(find "$tmp" -mindepth 1 -maxdepth 1 -type d | head -n1 || true)
+      [[ -n "$sub" && -f "$sub/version.json" ]] && root="$sub"
+    fi
+    if [[ ! -f "$root/version.json" ]]; then
+      echo "[BOOT][WARN] version.json not found in bundle; fallback to FTP"
+    else
+      ver=$(sed -n 's/.*"version"\s*:\s*"\([^"]\+\)".*/\1/p' "$root/version.json" | head -n1)
+      if [[ -z "$ver" ]]; then
+        echo "[BOOT][WARN] failed to parse version from version.json; fallback to FTP"
+      else
+        target_root="$INSTALL_ROOT"
+        version_dir="$target_root/versions/$ver"
+        mkdir -p "$version_dir"
+        shopt -s dotglob
+        mv "$root"/* "$version_dir/" 2>/dev/null || true
+        shopt -u dotglob
+        if [[ -f "$version_dir/install.sh" ]]; then
+          chmod +x "$version_dir/install.sh" 2>/dev/null || true
+          (
+            export AUTO_START_DCGM="${AUTO_START_DCGM:-1}"
+            export DCGM_EXPORTER_DISABLE_PROFILING="${DCGM_EXPORTER_DISABLE_PROFILING:-1}"
+            export DCGM_EXPORTER_LISTEN="${DCGM_EXPORTER_LISTEN:-:9400}"
+            cd "$version_dir" && ./install.sh "$version_dir"
+          )
+          echo "$ver" > "$target_root/LATEST_VERSION" 2>/dev/null || true
+          ln -sfn "$version_dir" "$target_root/current" 2>/dev/null || true
+          if [[ -L "$target_root/current" && -d "$target_root/current" ]]; then
+            installed_ok=1
+            echo "[BOOT] local bundle install OK: version=$ver"
+          else
+            echo "[BOOT][WARN] current symlink not present after install; will rely on healthcheck to confirm"
+          fi
+        else
+          echo "[BOOT][WARN] install.sh missing under $version_dir; fallback to FTP"
+        fi
+      fi
+    fi
+    rm -rf -- "$tmp" # scratch extraction dir is no longer needed
+  fi
+
+  # 3) fallback: use FTP setup if not installed
+  if [[ ! -L "$INSTALL_ROOT/current" && "$installed_ok" -eq 0 ]]; then
+    echo "[BOOT] fallback to FTP setup"
+    if [[ -z "${FTPIP:-}" || -z "${FTP_USER:-}" || -z "${FTP_PASSWORD:-}" ]]; then
+      echo "[BOOT][ERROR] FTP variables not set (FTPIP/FTP_USER/FTP_PASSWORD)" >&2
+      exit 1
+    fi
+    curl -u "$FTP_USER:$FTP_PASSWORD" -fsSL "ftp://$FTPIP:21/setup.sh" -o /tmp/setup.sh
+    chmod +x /tmp/setup.sh
+    /tmp/setup.sh --server "$FTPIP" --user "$FTP_USER" --password "$FTP_PASSWORD" --port 21
+  fi
+fi
+
+# 4) ensure argus-agent is running (best-effort)
+if ! pgrep -x argus-agent >/dev/null 2>&1; then
+  echo "[BOOT] starting argus-agent (not detected)"
+  setsid /usr/local/bin/argus-agent >/var/log/argus-agent.log 2>&1 < /dev/null &
+fi
+
+# 5) post-install selfcheck (run once) and state
+# prefer current version dir; fallback to first version under /opt/argus-metric/versions
+ver_dir=""
+if [[ -L "$INSTALL_ROOT/current" ]]; then
+  ver_dir="$(readlink -f "$INSTALL_ROOT/current" 2>/dev/null || true)"
+fi
+if [[ -z "$ver_dir" ]]; then
+  # pick the latest by name (semver-like); best-effort
+  ver_dir="$(ls -d "$INSTALL_ROOT"/versions/* 2>/dev/null | sort -V | tail -n1 || true)"
+fi
+
+if [[ -n "$ver_dir" && -x "$ver_dir/check_health.sh" ]]; then
+  echo "[BOOT] running initial health check: $ver_dir/check_health.sh"
+  if "$ver_dir/check_health.sh" >> "$ver_dir/.health_check.init.log" 2>&1; then
+    echo "[BOOT] initial health check completed (see $ver_dir/.health_check.init.log)"
+  else
+    echo "[BOOT][WARN] initial health check reported issues (see $ver_dir/.health_check.init.log)"
+  fi
+else
+  echo "[BOOT][WARN] initial health check skipped (script missing: $ver_dir/check_health.sh)"
+fi
+
+host="$(hostname)"
+state_dir="$STATE_DIR_BASE/${host}"
+mkdir -p "$state_dir" 2>/dev/null || true
+for i in {1..60}; do
+  if [[ -s "$state_dir/node.json" ]]; then
+    echo "[BOOT] node state present: $state_dir/node.json"
+    break
+  fi
+  sleep 2
+done
+
+echo "[BOOT] ready; entering sleep"
+exec sleep infinity
diff --git a/src/metric/client-plugins/all-in-one-full/plugins/argus-agent/.gitignore b/src/metric/client-plugins/all-in-one-full/plugins/argus-agent/.gitignore
new file mode 100644
index 0000000..e660fd9
--- /dev/null
+++ b/src/metric/client-plugins/all-in-one-full/plugins/argus-agent/.gitignore
@@ -0,0 +1 @@
+bin/
diff --git a/src/metric/client-plugins/all-in-one-full/plugins/argus-agent/bin/argus-agent b/src/metric/client-plugins/all-in-one-full/plugins/argus-agent/bin/argus-agent
deleted file mode 100755
index cb9ff7e..0000000
--- a/src/metric/client-plugins/all-in-one-full/plugins/argus-agent/bin/argus-agent
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e2e57a49ebf85f2a790381f73cabe22408d0f7428a5a5181724160781e73a75c
-size 7583784