完成H20服务器部署及重启测试 #51
@ -12,7 +12,10 @@ Options:
|
|||||||
--master-offline Build master offline image (requires src/master/offline_wheels.tar.gz)
|
--master-offline Build master offline image (requires src/master/offline_wheels.tar.gz)
|
||||||
--metric Build metric module images (ftp, prometheus, grafana, test nodes)
|
--metric Build metric module images (ftp, prometheus, grafana, test nodes)
|
||||||
--no-cache Build all images without using Docker layer cache
|
--no-cache Build all images without using Docker layer cache
|
||||||
--only LIST Comma-separated targets to build: core,master,metric,web,alert,sys,all
|
--only LIST Comma-separated targets to build: core,master,metric,web,alert,sys,gpu_bundle,all
|
||||||
|
--version DATE Bundle date tag used by gpu_bundle (e.g. 20251112)
|
||||||
|
--client-semver X.Y.Z Override client semver used in all-in-one-full artifact (optional)
|
||||||
|
--cuda VER CUDA runtime version for NVIDIA base (default: 12.2.2)
|
||||||
-h, --help Show this help message
|
-h, --help Show this help message
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
@ -32,8 +35,13 @@ build_metric=true
|
|||||||
build_web=true
|
build_web=true
|
||||||
build_alert=true
|
build_alert=true
|
||||||
build_sys=true
|
build_sys=true
|
||||||
|
build_gpu_bundle=false
|
||||||
no_cache=false
|
no_cache=false
|
||||||
|
|
||||||
|
bundle_date=""
|
||||||
|
client_semver=""
|
||||||
|
cuda_ver="12.2.2"
|
||||||
|
|
||||||
while [[ $# -gt 0 ]]; do
|
while [[ $# -gt 0 ]]; do
|
||||||
case $1 in
|
case $1 in
|
||||||
--intranet)
|
--intranet)
|
||||||
@ -63,7 +71,7 @@ while [[ $# -gt 0 ]]; do
|
|||||||
fi
|
fi
|
||||||
sel="$2"; shift 2
|
sel="$2"; shift 2
|
||||||
# reset all, then enable selected
|
# reset all, then enable selected
|
||||||
build_core=false; build_master=false; build_metric=false; build_web=false; build_alert=false; build_sys=false
|
build_core=false; build_master=false; build_metric=false; build_web=false; build_alert=false; build_sys=false; build_gpu_bundle=false
|
||||||
IFS=',' read -ra parts <<< "$sel"
|
IFS=',' read -ra parts <<< "$sel"
|
||||||
for p in "${parts[@]}"; do
|
for p in "${parts[@]}"; do
|
||||||
case "$p" in
|
case "$p" in
|
||||||
@ -73,11 +81,24 @@ while [[ $# -gt 0 ]]; do
|
|||||||
web) build_web=true ;;
|
web) build_web=true ;;
|
||||||
alert) build_alert=true ;;
|
alert) build_alert=true ;;
|
||||||
sys) build_sys=true ;;
|
sys) build_sys=true ;;
|
||||||
|
gpu_bundle) build_gpu_bundle=true ;;
|
||||||
all) build_core=true; build_master=true; build_metric=true; build_web=true; build_alert=true; build_sys=true ;;
|
all) build_core=true; build_master=true; build_metric=true; build_web=true; build_alert=true; build_sys=true ;;
|
||||||
*) echo "Unknown --only target: $p" >&2; exit 1 ;;
|
*) echo "Unknown --only target: $p" >&2; exit 1 ;;
|
||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
;;
|
;;
|
||||||
|
--version)
|
||||||
|
if [[ -z ${2:-} ]]; then echo "--version requires a value like 20251112" >&2; exit 1; fi
|
||||||
|
bundle_date="$2"; shift 2
|
||||||
|
;;
|
||||||
|
--client-semver)
|
||||||
|
if [[ -z ${2:-} ]]; then echo "--client-semver requires a value like 1.43.0" >&2; exit 1; fi
|
||||||
|
client_semver="$2"; shift 2
|
||||||
|
;;
|
||||||
|
--cuda)
|
||||||
|
if [[ -z ${2:-} ]]; then echo "--cuda requires a value like 12.2.2" >&2; exit 1; fi
|
||||||
|
cuda_ver="$2"; shift 2
|
||||||
|
;;
|
||||||
-h|--help)
|
-h|--help)
|
||||||
show_help
|
show_help
|
||||||
exit 0
|
exit 0
|
||||||
@ -203,6 +224,176 @@ pull_base_image() {
|
|||||||
images_built=()
|
images_built=()
|
||||||
build_failed=false
|
build_failed=false
|
||||||
|
|
||||||
|
#######################################
# Print the newest locally present nvidia/cuda runtime version matching a pattern.
# Arguments: $1 - ERE (without groups) matching the wanted X.Y.Z version,
#                 e.g. '12\.2\.[0-9]+'
# Outputs:   highest matching version according to `sort -V`, or nothing
# Returns:   always 0; a missing docker binary or no match yields empty output
#######################################
_latest_local_cuda_version() {
  local version_re="$1"
  docker images --format '{{.Repository}}:{{.Tag}}' nvidia/cuda 2>/dev/null \
    | sed -nE "s#^nvidia/cuda:(${version_re})-runtime-ubuntu22\\.04\$#\\1#p" \
    | sort -V | tail -n1 || true
}

#######################################
# Map a requested CUDA version to a full nvidia/cuda runtime image reference.
# Arguments: $1 - version as MAJOR, MAJOR.MINOR or MAJOR.MINOR.PATCH
# Outputs:   image reference "nvidia/cuda:X.Y.Z-runtime-ubuntu22.04" on stdout;
#            a warning on stderr when the format is not recognised
# Returns:   always 0
#######################################
resolve_cuda_base_tag() {
  local want="${1:-}"
  local suffix="-runtime-ubuntu22.04"
  local major minor best

  if [[ "$want" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
    # Fully qualified: use as-is.
    echo "nvidia/cuda:${want}${suffix}"
  elif [[ "$want" =~ ^([0-9]+)\.([0-9]+)$ ]]; then
    major="${BASH_REMATCH[1]}"
    minor="${BASH_REMATCH[2]}"
    # Prefer the best patch level already pulled locally; otherwise assume ".2".
    best="$(_latest_local_cuda_version "${major}\\.${minor}\\.[0-9]+")"
    echo "nvidia/cuda:${best:-${major}.${minor}.2}${suffix}"
  elif [[ "$want" =~ ^[0-9]+$ ]]; then
    # Prefer the best local image for this major; otherwise assume ".2.2".
    best="$(_latest_local_cuda_version "${want}\\.[0-9]+\\.[0-9]+")"
    echo "nvidia/cuda:${best:-${want}.2.2}${suffix}"
  else
    echo "⚠️  Unrecognised CUDA version '${want}', falling back to 12.2.2" >&2
    echo "nvidia/cuda:12.2.2${suffix}"
  fi
  return 0
}

#######################################
# Print the first argus-metric_*.tar.gz inside a directory (glob order).
# Arguments: $1 - directory to search
# Outputs:   path of the first match on stdout, or nothing
# Returns:   always 0
#######################################
_first_artifact_tar() {
  local dir="$1"
  local candidate
  for candidate in "$dir"/argus-metric_*.tar.gz; do
    # An unmatched glob stays literal; the -f test filters that case out.
    if [[ -f "$candidate" ]]; then
      printf '%s\n' "$candidate"
      return 0
    fi
  done
  return 0
}

#######################################
# Build the one-click GPU node bundle image directly on an NVIDIA CUDA base.
# Steps: resolve/pull the base image, build argus-agent, inject it into the
# all-in-one-full plugin and package the artifact, stage a docker build
# context, then build and tag the image.
# Globals:   root (read), images_built (appended on success)
# Calls:     pull_base_image, build_image (defined elsewhere in this script)
# Arguments: $1 - bundle date tag, used as the image tag (e.g. 20251112)
#            $2 - CUDA version (MAJOR[.MINOR[.PATCH]], e.g. 12.2.2)
#            $3 - optional client semver written to config/VERSION while packaging
# Returns:   0 on success, 1 on any failure
#######################################
build_gpu_bundle_image() {
  local date_tag="${1:-}"        # e.g. 20251112
  local cuda_ver_local="${2:-}"  # e.g. 12.2.2
  local client_ver="${3:-}"      # semver like 1.43.0

  if [[ -z "$date_tag" ]]; then
    echo "❌ gpu_bundle requires --version YYYYMMDD (e.g. 20251112)" >&2
    return 1
  fi
  # The value becomes both a docker tag and part of a path passed to `rm -rf`,
  # so restrict it to the docker tag grammar (which excludes '/').
  if [[ ! "$date_tag" =~ ^[A-Za-z0-9_][A-Za-z0-9_.-]{0,127}$ ]]; then
    echo "❌ invalid --version value '$date_tag' (must be a valid docker tag, e.g. 20251112)" >&2
    return 1
  fi

  # sanitize cuda version (trim trailing dots like '12.2.')
  while [[ "$cuda_ver_local" == *"." ]]; do
    cuda_ver_local="${cuda_ver_local%.}"
  done

  local default_base="nvidia/cuda:12.2.2-runtime-ubuntu22.04"
  local base_image
  base_image="$(resolve_cuda_base_tag "$cuda_ver_local")"

  echo
  echo "🔧 Preparing one-click GPU bundle build"
  echo " CUDA runtime base: ${base_image}"
  echo " Bundle tag : ${date_tag}"

  # 1) Ensure NVIDIA base image; retry once with the default if that fails.
  if ! pull_base_image "$base_image"; then
    if ! pull_base_image "$default_base"; then
      return 1
    fi
    base_image="$default_base"
  fi

  # 2) Build latest argus-agent from source (subshell keeps our cwd untouched).
  local agent_bin_src="$root/src/agent/dist/argus-agent"
  printf '\n%s\n' "🛠 Building argus-agent from src/agent"
  if ! ( cd "$root/src/agent" && bash scripts/build_binary.sh ); then
    echo "❌ argus-agent build failed" >&2
    return 1
  fi
  if [[ ! -f "$agent_bin_src" ]]; then
    echo "❌ argus-agent binary missing after build" >&2
    return 1
  fi

  # 3) Inject agent into all-in-one-full plugin and package artifact
  local aio_root="$root/src/metric/client-plugins/all-in-one-full"
  local agent_bin_dst="$aio_root/plugins/argus-agent/bin/argus-agent"
  if [[ ! -d "$aio_root" ]]; then
    echo "❌ all-in-one-full plugin directory not found: $aio_root" >&2
    return 1
  fi
  printf '\n%s\n' "📦 Updating all-in-one-full agent binary → $agent_bin_dst"
  # The plugin's bin/ directory is git-ignored, so it may not exist yet.
  if ! mkdir -p "$(dirname "$agent_bin_dst")" \
    || ! cp -f "$agent_bin_src" "$agent_bin_dst"; then
    echo "❌ failed to copy argus-agent into $agent_bin_dst" >&2
    return 1
  fi
  chmod +x "$agent_bin_dst" || true

  local version_file="$aio_root/config/VERSION"
  local prev_version use_version
  prev_version="$(cat "$version_file" 2>/dev/null || echo "1.0.0")"
  use_version="$prev_version"
  if [[ -n "$client_ver" ]]; then
    # Temporary override; restored below on every exit path.
    echo "${client_ver}" > "$version_file"
    use_version="$client_ver"
  fi
  echo " Packaging all-in-one-full artifact version: $use_version"

  local artifact_dir="$aio_root/artifact/$use_version"
  local artifact_tar=""
  local packaged=true
  if ! ( cd "$aio_root" && bash scripts/package_artifact.sh --force ); then
    echo "❌ package_artifact.sh failed" >&2
    packaged=false
  fi
  if [[ "$packaged" == true ]]; then
    artifact_tar="$(_first_artifact_tar "$artifact_dir")"
    if [[ -z "$artifact_tar" ]]; then
      echo " No argus-metric_*.tar.gz found; invoking publish_artifact.sh to assemble..."
      local owner
      owner="$(id -u):$(id -g)"
      if ( cd "$aio_root" \
        && bash scripts/publish_artifact.sh "$use_version" --output-dir "$artifact_dir" --owner "$owner" ); then
        artifact_tar="$(_first_artifact_tar "$artifact_dir")"
        if [[ -z "$artifact_tar" ]]; then
          echo "❌ artifact tar not found under $artifact_dir" >&2
          packaged=false
        fi
      else
        echo "❌ publish_artifact.sh failed" >&2
        packaged=false
      fi
    fi
  fi
  # restore VERSION if changed (keep filesystem clean) — single restore point
  if [[ -n "$client_ver" ]]; then
    echo "$prev_version" > "$version_file"
  fi
  if [[ "$packaged" != true ]]; then
    return 1
  fi

  # 4) Stage docker build context
  local bundle_src="$root/src/bundle/gpu-node-bundle"
  local bundle_ctx="$bundle_src/.build-$date_tag"
  local fluent_src="$root/src/log/fluent-bit/build"
  printf '\n%s\n' "🧰 Staging docker build context: $bundle_ctx"
  rm -rf -- "$bundle_ctx"
  if ! mkdir -p "$bundle_ctx/bundle" "$bundle_ctx/private" \
    || ! cp "$bundle_src/Dockerfile" "$bundle_src/node-bootstrap.sh" "$bundle_ctx/" \
    || ! cp "$artifact_tar" "$bundle_ctx/bundle/"; then
    echo "❌ failed to stage docker build context under $bundle_ctx" >&2
    return 1
  fi
  # Offline fluent-bit assets. The bundle Dockerfile COPYs private/etc,
  # private/packages and private/start-fluent-bit.sh unconditionally, so
  # missing directories are staged empty to keep the build context valid.
  local asset_dir
  for asset_dir in etc packages; do
    if [[ -d "$fluent_src/$asset_dir" ]]; then
      cp -r "$fluent_src/$asset_dir" "$bundle_ctx/private/"
    else
      mkdir -p "$bundle_ctx/private/$asset_dir"
    fi
  done
  if [[ -f "$fluent_src/start-fluent-bit.sh" ]]; then
    cp "$fluent_src/start-fluent-bit.sh" "$bundle_ctx/private/"
  else
    echo "⚠️  $fluent_src/start-fluent-bit.sh not found; the Dockerfile COPY of it will fail" >&2
  fi

  # 5) Build the final bundle image (directly from NVIDIA base)
  local image_tag="argus-sys-metric-test-node-bundle-gpu:${date_tag}"
  # base_image always has the form nvidia/cuda:X.Y.Z-runtime-ubuntu22.04 here.
  local cuda_arg="${base_image#nvidia/cuda:}"
  cuda_arg="${cuda_arg%-runtime-ubuntu22.04}"
  printf '\n%s\n' "🔄 Building GPU Bundle image"
  if build_image "GPU Bundle" "$bundle_ctx/Dockerfile" "$image_tag" "$bundle_ctx" \
    --build-arg CUDA_VER="$cuda_arg" \
    --build-arg CLIENT_VER="$use_version" \
    --build-arg BUNDLE_DATE="$date_tag"; then
    images_built+=("$image_tag")
    # also tag latest for convenience
    docker tag "$image_tag" argus-sys-metric-test-node-bundle-gpu:latest >/dev/null 2>&1 || true
    return 0
  fi
  return 1
}
|
||||||
|
|
||||||
if [[ "$build_core" == true ]]; then
|
if [[ "$build_core" == true ]]; then
|
||||||
if build_image "Elasticsearch" "src/log/elasticsearch/build/Dockerfile" "argus-elasticsearch:latest"; then
|
if build_image "Elasticsearch" "src/log/elasticsearch/build/Dockerfile" "argus-elasticsearch:latest"; then
|
||||||
images_built+=("argus-elasticsearch:latest")
|
images_built+=("argus-elasticsearch:latest")
|
||||||
@ -376,6 +567,18 @@ if [[ "$build_web" == true || "$build_alert" == true ]]; then
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------
# One-click GPU bundle (direct NVIDIA base)
# Runs only when the gpu_bundle target was selected; a failure is
# recorded in build_failed rather than aborting the script here.
# ---------------------------------------------------------------
if [[ "$build_gpu_bundle" == true ]]; then
  printf '\n%s\n' "Building one-click GPU bundle image..."
  build_gpu_bundle_image "$bundle_date" "$cuda_ver" "$client_semver" || build_failed=true
fi
|
||||||
|
|
||||||
echo "======================================="
|
echo "======================================="
|
||||||
echo "📦 Build Summary"
|
echo "📦 Build Summary"
|
||||||
echo "======================================="
|
echo "======================================="
|
||||||
|
|||||||
1
src/agent/.gitignore
vendored
1
src/agent/.gitignore
vendored
@ -3,3 +3,4 @@ build/
|
|||||||
__pycache__/
|
__pycache__/
|
||||||
|
|
||||||
.env
|
.env
|
||||||
|
dist/
|
||||||
|
|||||||
BIN
src/agent/dist/argus-agent
vendored
BIN
src/agent/dist/argus-agent
vendored
Binary file not shown.
1
src/bundle/gpu-node-bundle/.gitignore
vendored
Normal file
1
src/bundle/gpu-node-bundle/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
.build*/
|
||||||
43
src/bundle/gpu-node-bundle/Dockerfile
Normal file
43
src/bundle/gpu-node-bundle/Dockerfile
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
# GPU node bundle image: NVIDIA CUDA runtime base plus the embedded Argus
# client artifact and a bootstrap entrypoint.
#
# Build args:
#   CUDA_VER     full X.Y.Z CUDA version selecting the base image tag
#   CLIENT_VER   client version recorded in the image labels
#   BUNDLE_DATE  bundle date recorded in the image labels
ARG CUDA_VER=12.2.2
FROM nvidia/cuda:${CUDA_VER}-runtime-ubuntu22.04

# ARGs declared after FROM are scoped to this build stage.
ARG CLIENT_VER=0.0.0
ARG BUNDLE_DATE=00000000

LABEL org.opencontainers.image.title="argus-sys-metric-test-node-bundle-gpu"
LABEL org.opencontainers.image.description="GPU node bundle with embedded Argus client artifact"
LABEL org.opencontainers.image.version="${CLIENT_VER}"
LABEL org.opencontainers.image.revision_date="${BUNDLE_DATE}"
LABEL maintainer="Argus"

# Runtime defaults; all of these can be overridden at `docker run` time.
ENV DEBIAN_FRONTEND=noninteractive \
    TZ=Asia/Shanghai \
    ARGUS_LOGS_WORLD_WRITABLE=1 \
    ES_HOST=es.log.argus.com \
    ES_PORT=9200 \
    CLUSTER=local \
    RACK=dev

# Base tooling, then point the system timezone at $TZ.
RUN set -eux; \
    apt-get update; \
    apt-get install -y --no-install-recommends \
        ca-certificates \
        cron \
        curl \
        gzip \
        iproute2 \
        iputils-ping \
        jq \
        less \
        net-tools \
        procps \
        tar \
        tzdata \
        vim \
        wget \
    ; \
    rm -rf /var/lib/apt/lists/*; \
    ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

WORKDIR /

# Expect staged build context to provide these directories/files
COPY bundle/ /bundle/
COPY node-bootstrap.sh /usr/local/bin/node-bootstrap.sh
COPY private/start-fluent-bit.sh /private/start-fluent-bit.sh
COPY private/etc /private/etc
COPY private/packages /private/packages

# Make the scripts executable and pre-create the log/buffer/install directories.
# No `set -e` here: each step is best-effort, as in the chmod `|| true` guards.
RUN chmod +x /usr/local/bin/node-bootstrap.sh /private/start-fluent-bit.sh || true; \
    mkdir -p /logs/train /logs/infer /buffers /opt/argus-metric; \
    chmod 1777 /logs/train /logs/infer || true; \
    chmod 770 /buffers || true

ENTRYPOINT ["/usr/local/bin/node-bootstrap.sh"]
|
||||||
127
src/bundle/gpu-node-bundle/node-bootstrap.sh
Normal file
127
src/bundle/gpu-node-bundle/node-bootstrap.sh
Normal file
@ -0,0 +1,127 @@
|
|||||||
|
#!/usr/bin/env bash
#
# Entrypoint of the GPU node bundle container.
#
# Flow:
#   1. Skip installation when $INSTALL_ROOT/current already points at a directory.
#   2. Otherwise install the client from /bundle/argus-metric_*.tar.gz.
#   3. If that did not produce an install, fall back to the FTP setup script
#      (requires FTPIP, FTP_USER and FTP_PASSWORD).
#   4. Start argus-agent if it is not running (best-effort).
#   5. Run one initial health check, wait up to 120s for the agent's
#      node.json state file, then sleep forever to keep the container alive.
#
# Optional env: ARGUS_LOGS_WORLD_WRITABLE (default 1), AUTO_START_DCGM,
#               DCGM_EXPORTER_DISABLE_PROFILING, DCGM_EXPORTER_LISTEN.
set -euo pipefail

echo "[BOOT] GPU node bundle starting"

INSTALL_ROOT="/opt/argus-metric"
BUNDLE_DIR="/bundle"
STATE_DIR_BASE="/private/argus/agent"

mkdir -p "$INSTALL_ROOT" "$STATE_DIR_BASE" /logs/train /logs/infer /buffers || true

# Ensure world-writable logs dir with sticky bit (align with deployment_new policy)
if [[ "${ARGUS_LOGS_WORLD_WRITABLE:-1}" == "1" ]]; then
  chmod 1777 /logs/train /logs/infer || true
else
  chmod 755 /logs/train /logs/infer || true
fi
chmod 770 /buffers || true

installed_ok=0

# 1) already installed?
if [[ -L "$INSTALL_ROOT/current" && -d "$INSTALL_ROOT/current" ]]; then
  echo "[BOOT] client already installed at $INSTALL_ROOT/current"
else
  # 2) try local bundle first (argus-metric_*.tar.gz)
  tarball=$(ls -1 "$BUNDLE_DIR"/argus-metric_*.tar.gz 2>/dev/null | head -1 || true)
  if [[ -n "${tarball:-}" ]]; then
    echo "[BOOT] installing from local bundle: $(basename "$tarball")"
    tmp=$(mktemp -d)
    tar -xzf "$tarball" -C "$tmp"

    # locate root containing version.json (top level, or a single nested directory)
    root="$tmp"
    if [[ ! -f "$root/version.json" ]]; then
      sub=$(find "$tmp" -mindepth 1 -maxdepth 1 -type d | head -n1 || true)
      if [[ -n "$sub" && -f "$sub/version.json" ]]; then
        root="$sub"
      fi
    fi

    if [[ ! -f "$root/version.json" ]]; then
      echo "[BOOT][WARN] version.json not found in bundle; fallback to FTP"
    else
      ver=$(sed -n 's/.*"version"\s*:\s*"\([^"]\+\)".*/\1/p' "$root/version.json" | head -n1)
      if [[ -z "$ver" ]]; then
        echo "[BOOT][WARN] failed to parse version from version.json; fallback to FTP"
      else
        target_root="$INSTALL_ROOT"
        version_dir="$target_root/versions/$ver"
        mkdir -p "$version_dir"
        # dotglob so hidden files in the bundle are moved as well
        shopt -s dotglob
        mv "$root"/* "$version_dir/" 2>/dev/null || true
        shopt -u dotglob

        if [[ -f "$version_dir/install.sh" ]]; then
          chmod +x "$version_dir/install.sh" 2>/dev/null || true
          # Subshell: the exports and the cd stay local to the installer run.
          # A failing install.sh aborts the bootstrap (set -e).
          (
            export AUTO_START_DCGM="${AUTO_START_DCGM:-1}"
            export DCGM_EXPORTER_DISABLE_PROFILING="${DCGM_EXPORTER_DISABLE_PROFILING:-1}"
            export DCGM_EXPORTER_LISTEN="${DCGM_EXPORTER_LISTEN:-:9400}"
            cd "$version_dir" && ./install.sh "$version_dir"
          )
          echo "$ver" > "$target_root/LATEST_VERSION" 2>/dev/null || true
          ln -sfn "$version_dir" "$target_root/current" 2>/dev/null || true
          if [[ -L "$target_root/current" && -d "$target_root/current" ]]; then
            installed_ok=1
            echo "[BOOT] local bundle install OK: version=$ver"
          else
            echo "[BOOT][WARN] current symlink not present after install; will rely on healthcheck to confirm"
          fi
        else
          echo "[BOOT][WARN] install.sh missing under $version_dir; fallback to FTP"
        fi
      fi
    fi

    # The extraction directory is no longer needed on any of the paths above.
    rm -rf -- "${tmp:?}" || true
  fi

  # 3) fallback: use FTP setup if not installed
  if [[ ! -L "$INSTALL_ROOT/current" && "$installed_ok" -eq 0 ]]; then
    echo "[BOOT] fallback to FTP setup"
    if [[ -z "${FTPIP:-}" || -z "${FTP_USER:-}" || -z "${FTP_PASSWORD:-}" ]]; then
      echo "[BOOT][ERROR] FTP variables not set (FTPIP/FTP_USER/FTP_PASSWORD)" >&2
      exit 1
    fi
    # NOTE(review): credentials are passed on the command line here and are
    # therefore visible in the process list while these commands run.
    curl -u "$FTP_USER:$FTP_PASSWORD" -fsSL "ftp://$FTPIP:21/setup.sh" -o /tmp/setup.sh
    chmod +x /tmp/setup.sh
    /tmp/setup.sh --server "$FTPIP" --user "$FTP_USER" --password "$FTP_PASSWORD" --port 21
  fi
fi

# 4) ensure argus-agent is running (best-effort)
if ! pgrep -x argus-agent >/dev/null 2>&1; then
  echo "[BOOT] starting argus-agent (not detected)"
  setsid /usr/local/bin/argus-agent >/var/log/argus-agent.log 2>&1 < /dev/null &
fi

# 5) post-install selfcheck (run once) and state
# prefer current version dir; fallback to the highest version under $INSTALL_ROOT/versions
ver_dir=""
if [[ -L "$INSTALL_ROOT/current" ]]; then
  ver_dir="$(readlink -f "$INSTALL_ROOT/current" 2>/dev/null || true)"
fi
if [[ -z "$ver_dir" ]]; then
  # pick the latest by name (semver-like); best-effort
  ver_dir="$(ls -d "$INSTALL_ROOT"/versions/* 2>/dev/null | sort -V | tail -n1 || true)"
fi

if [[ -n "$ver_dir" && -x "$ver_dir/check_health.sh" ]]; then
  echo "[BOOT] running initial health check: $ver_dir/check_health.sh"
  if "$ver_dir/check_health.sh" >> "$ver_dir/.health_check.init.log" 2>&1; then
    echo "[BOOT] initial health check completed (see $ver_dir/.health_check.init.log)"
  else
    echo "[BOOT][WARN] initial health check reported issues (see $ver_dir/.health_check.init.log)"
  fi
elif [[ -z "$ver_dir" ]]; then
  echo "[BOOT][WARN] initial health check skipped (no installed version found under $INSTALL_ROOT)"
else
  echo "[BOOT][WARN] initial health check skipped (script missing: $ver_dir/check_health.sh)"
fi

host="$(hostname)"
state_dir="$STATE_DIR_BASE/${host}"
mkdir -p "$state_dir" 2>/dev/null || true

# Poll for the node state file: 60 attempts, 2 seconds apart.
state_ready=0
for _ in {1..60}; do
  if [[ -s "$state_dir/node.json" ]]; then
    echo "[BOOT] node state present: $state_dir/node.json"
    state_ready=1
    break
  fi
  sleep 2
done
if [[ "$state_ready" -eq 0 ]]; then
  echo "[BOOT][WARN] node state not present after 120s: $state_dir/node.json"
fi

echo "[BOOT] ready; entering sleep"
exec sleep infinity
|
||||||
1
src/metric/client-plugins/all-in-one-full/plugins/argus-agent/.gitignore
vendored
Normal file
1
src/metric/client-plugins/all-in-one-full/plugins/argus-agent/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
bin/
|
||||||
BIN
src/metric/client-plugins/all-in-one-full/plugins/argus-agent/bin/argus-agent
(Stored with Git LFS)
BIN
src/metric/client-plugins/all-in-one-full/plugins/argus-agent/bin/argus-agent
(Stored with Git LFS)
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user