diff --git a/build/build_images.sh b/build/build_images.sh index 384053f..8f88f83 100755 --- a/build/build_images.sh +++ b/build/build_images.sh @@ -43,6 +43,7 @@ no_cache=false bundle_date="" client_semver="" cuda_ver="12.2.2" +DEFAULT_IMAGE_TAG="latest" while [[ $# -gt 0 ]]; do case $1 in @@ -126,6 +127,16 @@ fi cd "$root" +# Set default image tag policy before building +if [[ "$build_server_pkg" == true ]]; then + DEFAULT_IMAGE_TAG="${bundle_date:-latest}" +fi + +# Select build user profile for pkg vs default +if [[ "$build_server_pkg" == true || "$build_client_pkg" == true ]]; then + export ARGUS_BUILD_PROFILE=pkg +fi + load_build_user build_args+=("--build-arg" "ARGUS_BUILD_UID=${ARGUS_BUILD_UID}" "--build-arg" "ARGUS_BUILD_GID=${ARGUS_BUILD_GID}") @@ -188,13 +199,31 @@ build_image() { echo " Tag: $tag" echo " Context: $context" - if docker build "${build_args[@]}" "${extra_args[@]}" -f "$dockerfile_path" -t "$tag" "$context"; then - echo "✅ $image_name image built successfully" - return 0 - else - echo "❌ Failed to build $image_name image" - return 1 - fi + local tries=${ARGUS_BUILD_RETRIES:-3} + local delay=${ARGUS_BUILD_RETRY_DELAY:-5} + local attempt=1 + while (( attempt <= tries )); do + local prefix="" + if (( attempt == tries )); then + # final attempt: disable BuildKit to avoid docker/dockerfile front-end pulls + prefix="DOCKER_BUILDKIT=0" + echo " Attempt ${attempt}/${tries} (fallback: DOCKER_BUILDKIT=0)" + else + echo " Attempt ${attempt}/${tries}" + fi + if eval $prefix docker build "${build_args[@]}" "${extra_args[@]}" -f "$dockerfile_path" -t "$tag" "$context"; then + echo "✅ $image_name image built successfully" + return 0 + fi + echo "⚠️ Build failed for $image_name (attempt ${attempt}/${tries})." + if (( attempt < tries )); then + echo " Retrying in ${delay}s..." + sleep "$delay" + fi + attempt=$((attempt+1)) + done + echo "❌ Failed to build $image_name image after ${tries} attempts" + return 1 } pull_base_image() { @@ -390,8 +419,10 @@ build_gpu_bundle_image() { --build-arg CLIENT_VER="$use_version" \ --build-arg BUNDLE_DATE="$date_tag"; then images_built+=("$image_tag") - # also tag latest for convenience - docker tag "$image_tag" argus-sys-metric-test-node-bundle-gpu:latest >/dev/null 2>&1 || true + # In non-pkg mode, also tag latest for convenience + if [[ "${ARGUS_PKG_BUILD:-0}" != "1" ]]; then + docker tag "$image_tag" argus-sys-metric-test-node-bundle-gpu:latest >/dev/null 2>&1 || true + fi return 0 else return 1 @@ -427,10 +458,13 @@ build_server_pkg_bundle() { argus-metric-ftp argus-metric-prometheus argus-metric-grafana \ argus-alertmanager argus-web-frontend argus-web-proxy ) - echo "\n🔖 Tagging server images with :$date_tag and collecting digests" - if ! ensure_version_tags "$date_tag" "${repos[@]}"; then - return 1 - fi + echo "\n🔖 Verifying server images with :$date_tag and collecting digests" + for repo in "${repos[@]}"; do + if ! docker image inspect "$repo:$date_tag" >/dev/null 2>&1; then + echo "❌ required image missing: $repo:$date_tag (build phase should have produced it)" >&2 + return 1 + fi + done # Optional: show digests for repo in "${repos[@]}"; do local digest @@ -457,6 +491,8 @@ build_client_pkg_bundle() { local bundle_tag="argus-sys-metric-test-node-bundle-gpu:${date_tag}" if ! docker image inspect "$bundle_tag" >/dev/null 2>&1; then echo "\n🧩 GPU bundle image $bundle_tag missing; building it first..." + ARGUS_PKG_BUILD=1 + export ARGUS_PKG_BUILD if ! build_gpu_bundle_image "$date_tag" "$cuda" "$semver"; then return 1 fi @@ -472,24 +508,24 @@ build_client_pkg_bundle() { } if [[ "$build_core" == true ]]; then - if build_image "Elasticsearch" "src/log/elasticsearch/build/Dockerfile" "argus-elasticsearch:latest"; then - images_built+=("argus-elasticsearch:latest") + if build_image "Elasticsearch" "src/log/elasticsearch/build/Dockerfile" "argus-elasticsearch:${DEFAULT_IMAGE_TAG}"; then + images_built+=("argus-elasticsearch:${DEFAULT_IMAGE_TAG}") else build_failed=true fi echo "" - if build_image "Kibana" "src/log/kibana/build/Dockerfile" "argus-kibana:latest"; then - images_built+=("argus-kibana:latest") + if build_image "Kibana" "src/log/kibana/build/Dockerfile" "argus-kibana:${DEFAULT_IMAGE_TAG}"; then + images_built+=("argus-kibana:${DEFAULT_IMAGE_TAG}") else build_failed=true fi echo "" - if build_image "BIND9" "src/bind/build/Dockerfile" "argus-bind9:latest"; then - images_built+=("argus-bind9:latest") + if build_image "BIND9" "src/bind/build/Dockerfile" "argus-bind9:${DEFAULT_IMAGE_TAG}"; then + images_built+=("argus-bind9:${DEFAULT_IMAGE_TAG}") else build_failed=true fi @@ -501,7 +537,7 @@ if [[ "$build_master" == true ]]; then echo "" echo "🔄 Building Master image..." pushd "$master_root" >/dev/null - master_args=("--tag" "argus-master:latest") + master_args=("--tag" "argus-master:${DEFAULT_IMAGE_TAG}") if [[ "$use_intranet" == true ]]; then master_args+=("--intranet") fi @@ -515,7 +551,7 @@ if [[ "$build_master" == true ]]; then if [[ "$build_master_offline" == true ]]; then images_built+=("argus-master:offline") else - images_built+=("argus-master:latest") + images_built+=("argus-master:${DEFAULT_IMAGE_TAG}") fi else build_failed=true @@ -540,9 +576,9 @@ if [[ "$build_metric" == true ]]; then done metric_builds=( - "Metric FTP|src/metric/ftp/build/Dockerfile|argus-metric-ftp:latest|src/metric/ftp/build" - "Metric Prometheus|src/metric/prometheus/build/Dockerfile|argus-metric-prometheus:latest|src/metric/prometheus/build" - "Metric Grafana|src/metric/grafana/build/Dockerfile|argus-metric-grafana:latest|src/metric/grafana/build" + "Metric FTP|src/metric/ftp/build/Dockerfile|argus-metric-ftp:${DEFAULT_IMAGE_TAG}|src/metric/ftp/build" + "Metric Prometheus|src/metric/prometheus/build/Dockerfile|argus-metric-prometheus:${DEFAULT_IMAGE_TAG}|src/metric/prometheus/build" + "Metric Grafana|src/metric/grafana/build/Dockerfile|argus-metric-grafana:${DEFAULT_IMAGE_TAG}|src/metric/grafana/build" ) for build_spec in "${metric_builds[@]}"; do @@ -614,8 +650,8 @@ if [[ "$build_web" == true || "$build_alert" == true ]]; then if [[ "$build_web" == true ]]; then web_builds=( - "Web Frontend|src/web/build_tools/frontend/Dockerfile|argus-web-frontend:latest|." - "Web Proxy|src/web/build_tools/proxy/Dockerfile|argus-web-proxy:latest|." + "Web Frontend|src/web/build_tools/frontend/Dockerfile|argus-web-frontend:${DEFAULT_IMAGE_TAG}|." + "Web Proxy|src/web/build_tools/proxy/Dockerfile|argus-web-proxy:${DEFAULT_IMAGE_TAG}|." ) for build_spec in "${web_builds[@]}"; do IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec" @@ -630,7 +666,7 @@ if [[ "$build_web" == true || "$build_alert" == true ]]; then if [[ "$build_alert" == true ]]; then alert_builds=( - "Alertmanager|src/alert/alertmanager/build/Dockerfile|argus-alertmanager:latest|." + "Alertmanager|src/alert/alertmanager/build/Dockerfile|argus-alertmanager:${DEFAULT_IMAGE_TAG}|." ) for build_spec in "${alert_builds[@]}"; do IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec" diff --git a/configs/build_user.pkg.conf b/configs/build_user.pkg.conf new file mode 100644 index 0000000..e4df5be --- /dev/null +++ b/configs/build_user.pkg.conf @@ -0,0 +1,6 @@ +# Default build-time UID/GID for Argus images +# Override by creating configs/build_user.local.conf with the same format. +# Syntax: KEY=VALUE, supports UID/GID only. Whitespace and lines starting with # are ignored. + +UID=2133 +GID=2015 diff --git a/deployment/.gitignore b/deployment/.gitignore deleted file mode 100644 index a319647..0000000 --- a/deployment/.gitignore +++ /dev/null @@ -1 +0,0 @@ -artifact/ diff --git a/deployment/build/README.md b/deployment/build/README.md deleted file mode 100644 index 4ecac62..0000000 --- a/deployment/build/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# Deployment Build Toolkit - -This folder provides scripts to produce offline server/client packages and publish the client package to FTP. - -Commands -- build_server_package.sh [--version YYYYMMDD] -- build_client_package.sh [--version YYYYMMDD] -- publish_client.sh --version YYYYMMDD --server --user ftpuser --password [--port 21] - -Outputs -- deployment/artifact/server// -- deployment/artifact/client// - -Notes -- Server package contains docker images (single all-images.tar.gz), compose/, scripts/, docs/, private/ skeleton. -- Client package reuses all-in-one-full artifact, repacked as argus-metric_.tar.gz (compatible with setup.sh). diff --git a/deployment/build/build_client_package.sh b/deployment/build/build_client_package.sh deleted file mode 100755 index 8c67cd7..0000000 --- a/deployment/build/build_client_package.sh +++ /dev/null @@ -1,90 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" -BUILD_DIR="$ROOT_DIR/deployment/build" -ART_ROOT="$ROOT_DIR/deployment/artifact" - -. "$BUILD_DIR/common.sh" - -usage() { cat <<'EOF' -Build Argus Client Offline Package - -Usage: build_client_package.sh [--version YYYYMMDD] [--out DIR] - -Produces: deployment/artifact/client//argus-metric_.tar.gz -EOF -} - -VERSION="$(today_version)" -OUT_DIR="" -while [[ $# -gt 0 ]]; do - case "$1" in - --version) VERSION="$2"; shift 2;; - --out) OUT_DIR="$2"; shift 2;; - -h|--help) usage; exit 0;; - *) err "unknown arg: $1"; usage; exit 1;; - esac -done - -PKG_DIR="${OUT_DIR:-$ART_ROOT/client/$VERSION}" -make_dir "$PKG_DIR" - -log "Packaging client from all-in-one-full artifact" -PLUGIN_DIR="$ROOT_DIR/src/metric/client-plugins/all-in-one-full" -require_cmd bash tar gzip - -(cd "$PLUGIN_DIR" && bash scripts/package_artifact.sh --force) - -# pick latest artifact dir -ART_BASE="$PLUGIN_DIR/artifact" -latest_dir=$(ls -1dt "$ART_BASE"/*/ 2>/dev/null | head -n1 || true) -[[ -n "$latest_dir" ]] || { err "no client artifact found in $ART_BASE"; exit 1; } - -tmpdir=$(mktemp -d) -trap 'rm -rf "$tmpdir"' EXIT -# Filter-only copy: keep install_order files + scripts + deps + version.json -mkdir -p "$tmpdir/src" -cp -f "$latest_dir/version.json" "$tmpdir/src/version.json" -if command -v jq >/dev/null 2>&1; then - mapfile -t files < <(jq -r '.install_order[]' "$latest_dir/version.json") -else - files=( $(grep -E '"install_order"' -A10 "$latest_dir/version.json" | sed -n 's/\s*"\(.*\.tar\.gz\)".*/\1/p') ) -fi -for f in "${files[@]}"; do - [[ -f "$latest_dir/$f" ]] && cp -f "$latest_dir/$f" "$tmpdir/src/$f" -done -for aux in install.sh uninstall.sh check_health.sh check_version.sh sync_dns.sh restart_unhealthy.sh config.env; do - [[ -f "$latest_dir/$aux" ]] && cp -f "$latest_dir/$aux" "$tmpdir/src/$aux"; -done -if [[ -d "$latest_dir/deps" ]]; then - mkdir -p "$tmpdir/src/deps" && rsync -a "$latest_dir/deps/" "$tmpdir/src/deps/" >/dev/null 2>&1 || cp -r "$latest_dir/deps/." "$tmpdir/src/deps/"; -fi - -out_name="argus-metric_$(echo "$VERSION" | sed 's/\./_/g').tar.gz" - -(cd "$tmpdir/src" && tar -czf "$PKG_DIR/$out_name" .) - -log "Client package ready: $PKG_DIR/$out_name" -echo "$VERSION" > "$PKG_DIR/LATEST_VERSION" - -# include publish helper and setup.sh for convenience (place first) -PUBLISH_TPL="$BUILD_DIR/templates/client/publish.sh" -if [[ -f "$PUBLISH_TPL" ]]; then - cp "$PUBLISH_TPL" "$PKG_DIR/publish.sh" -fi - -# also place a copy of setup.sh alongside -SETUP_SRC="$ROOT_DIR/src/metric/client-plugins/all-in-one-full/scripts/setup.sh" -[[ -f "$SETUP_SRC" ]] && cp "$SETUP_SRC" "$PKG_DIR/setup.sh" || true - -# docs for end users (this may overwrite file modes), then fix execute bits -CLIENT_DOC_DIR="$BUILD_DIR/templates/client" -if [[ -d "$CLIENT_DOC_DIR" ]]; then - rsync -a "$CLIENT_DOC_DIR/" "$PKG_DIR/" >/dev/null 2>&1 || cp -r "$CLIENT_DOC_DIR/." "$PKG_DIR/" -fi - -# ensure helpers are executable -chmod +x "$PKG_DIR/publish.sh" "$PKG_DIR/setup.sh" 2>/dev/null || true - -exit 0 diff --git a/deployment/build/build_cpu_node_image.sh b/deployment/build/build_cpu_node_image.sh deleted file mode 100755 index 276c9c0..0000000 --- a/deployment/build/build_cpu_node_image.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" -cd "$ROOT_DIR" - -usage() { - cat <&2; usage; exit 1;; - esac -done - -CMD=("./deployment/build/build_images.sh" "--with-node-bundle") -if [[ -n "$VERSION" ]]; then CMD+=("--client-version" "$VERSION"); fi - -echo "[CPU-BUNDLE] invoking: ${CMD[*]}" -"${CMD[@]}" - -echo "[CPU-BUNDLE] built image: argus-sys-metric-test-node-bundle:latest" -docker image inspect argus-sys-metric-test-node-bundle:latest >/dev/null 2>&1 || { - echo "[ERR] expected image not found" >&2; exit 1; } - -echo "[CPU-BUNDLE] done" - diff --git a/deployment/build/build_gpu_node_image.sh b/deployment/build/build_gpu_node_image.sh deleted file mode 100755 index d8414aa..0000000 --- a/deployment/build/build_gpu_node_image.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" -cd "$ROOT_DIR" - -usage() { - cat <&2; usage; exit 1;; - esac -done - -BASE_IMAGE="argus-sys-metric-test-gpu-node:latest" - -CMD=("./deployment/build/build_images.sh" "--with-node-bundle" "--base-image" "$BASE_IMAGE") -if [[ -n "$VERSION" ]]; then CMD+=("--client-version" "$VERSION"); fi - -echo "[GPU-BUNDLE] invoking: ${CMD[*]}" -"${CMD[@]}" - -echo "[GPU-BUNDLE] re-tagging to $OUT_TAG" -docker image inspect argus-sys-metric-test-node-bundle:latest >/dev/null 2>&1 || { - echo "[ERR] base bundle image missing: argus-sys-metric-test-node-bundle:latest" >&2; exit 1; } -docker tag argus-sys-metric-test-node-bundle:latest "$OUT_TAG" -docker image inspect "$OUT_TAG" >/dev/null 2>&1 || { echo "[ERR] re-tag failed" >&2; exit 1; } - -echo "[GPU-BUNDLE] built image: $OUT_TAG (base=$BASE_IMAGE)" - diff --git a/deployment/build/build_images.sh b/deployment/build/build_images.sh deleted file mode 100755 index fbe35a5..0000000 --- a/deployment/build/build_images.sh +++ /dev/null @@ -1,98 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" -. "$ROOT_DIR/deployment/build/common.sh" - -usage() { -cat </dev/null - bash scripts/package_artifact.sh --force - CLIENT_VERSION=$(cat artifact/*/version.json 2>/dev/null | sed -n 's/.*"version"\s*:\s*"\([^"]\+\)".*/\1/p' | tail -n1) - popd >/dev/null - [[ -n "$CLIENT_VERSION" ]] || { err "failed to detect client version"; exit 1; } - else - if [[ "$CLIENT_VERSION" =~ ^[0-9]{8}$ ]]; then - PKG_DIR="$ROOT_DIR/deployment/artifact/client/$CLIENT_VERSION" - TAR_PKG="$PKG_DIR/argus-metric_${CLIENT_VERSION}.tar.gz" - [[ -f "$TAR_PKG" ]] || { err "client date package not found: $TAR_PKG"; exit 1; } - # 解包读取内部 version.json - tmpd=$(mktemp -d); trap 'rm -rf "$tmpd"' EXIT - tar -xzf "$TAR_PKG" -C "$tmpd" - if [[ -f "$tmpd/version.json" ]]; then - ART_VER=$(sed -n 's/.*"version"\s*:\s*"\([^"]\+\)".*/\1/p' "$tmpd/version.json" | head -n1) - [[ -n "$ART_VER" ]] || { err "failed to parse artifact version from date package"; exit 1; } - CLIENT_VERSION="$ART_VER" - # 直接使用该 tar 作为 bundle 源 - cp "$TAR_PKG" "$TMP_BUNDLE/argus-metric_$(echo "$ART_VER" | tr '.' '_').tar.gz" - # 同时尝试复制 setup.sh(若存在) - [[ -f "$PKG_DIR/setup.sh" ]] && cp "$PKG_DIR/setup.sh" "$TMP_BUNDLE/" || true - else - err "version.json missing in client date package" - exit 1 - fi - else - # 假定为 artifact 版本目录 - pushd "$PLUGIN_DIR" >/dev/null - [[ -d "artifact/$CLIENT_VERSION" ]] || bash scripts/package_artifact.sh --force - popd >/dev/null - fi - fi - - # 若未通过日期包预置 tar,则从插件 artifact 目录取 - TAR_NAME="argus-metric_$(echo "$CLIENT_VERSION" | tr '.' '_').tar.gz" - if [[ ! -f "$TMP_BUNDLE/$TAR_NAME" ]]; then - SRC_TAR="$PLUGIN_DIR/artifact/$CLIENT_VERSION/$TAR_NAME" - [[ -f "$SRC_TAR" ]] || { err "missing client tar: $SRC_TAR"; exit 1; } - cp "$SRC_TAR" "$TMP_BUNDLE/" - # also include setup.sh for fallback - if [[ -f "$ROOT_DIR/deployment/artifact/client/$(today_version)/setup.sh" ]]; then - cp "$ROOT_DIR/deployment/artifact/client/$(today_version)/setup.sh" "$TMP_BUNDLE/" || true - fi - fi - - log "Building node-bundle image with client version: $CLIENT_VERSION" - DOCKER_BUILDKIT=0 docker build \ - --build-arg CLIENT_VER="$CLIENT_VERSION" \ - --build-arg BASE_IMAGE="$BASE_IMAGE" \ - -t argus-sys-metric-test-node-bundle:latest \ - -f "$BUNDLE_DIR/Dockerfile" "$BUNDLE_DIR" - log "Built image: argus-sys-metric-test-node-bundle:latest" -fi - -log "Done." diff --git a/deployment/build/build_server_package.sh b/deployment/build/build_server_package.sh deleted file mode 100755 index 4d2486f..0000000 --- a/deployment/build/build_server_package.sh +++ /dev/null @@ -1,139 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" -BUILD_DIR="$ROOT_DIR/deployment/build" -ART_ROOT="$ROOT_DIR/deployment/artifact" - -. "$BUILD_DIR/common.sh" - -usage() { cat <<'EOF' -Build Argus Server Offline Package - -Usage: build_server_package.sh [--version YYYYMMDD] [--out DIR] [--resave-image] - -Outputs into deployment/artifact/server// by default. -EOF -} - -VERSION="$(today_version)" -OUT_DIR="" -RESAVE_IMAGE=false -while [[ $# -gt 0 ]]; do - case "$1" in - --version) VERSION="$2"; shift 2;; - --out) OUT_DIR="$2"; shift 2;; - --resave-image) RESAVE_IMAGE=true; shift;; - -h|--help) usage; exit 0;; - *) err "unknown arg: $1"; usage; exit 1;; - esac -done - -PKG_DIR="${OUT_DIR:-$ART_ROOT/server/$VERSION}" -STAGE="$(mktemp -d)" -trap 'rm -rf "$STAGE"' EXIT - -log "Version: $VERSION" -log "Staging: $STAGE" - -# 1) Layout -make_dir "$STAGE/images" -make_dir "$STAGE/compose" -make_dir "$STAGE/scripts" -make_dir "$STAGE/docs" -make_dir "$STAGE/private/argus" - -# 2) Compose: derive from sys/tests by removing test-only services -SRC_COMPOSE="$ROOT_DIR/src/sys/tests/docker-compose.yml" -[[ -f "$SRC_COMPOSE" ]] || { err "missing $SRC_COMPOSE"; exit 1; } -# 2.1 filter out test services -tmp_compose1="$STAGE/compose/docker-compose.filtered.yml" -awk -f "$BUILD_DIR/templates/docker-compose.filter.awk" -v remove="node-a,node-b,test-node,test-gpu-node" "$SRC_COMPOSE" > "$tmp_compose1" -# 2.2 transform to external overlay network (remove sysnet and per-service blocks) -awk -f "$BUILD_DIR/templates/docker-compose.overlay.awk" "$tmp_compose1" > "$STAGE/compose/docker-compose.yml" -cp "$BUILD_DIR/templates/.env.example" "$STAGE/compose/.env.example" -# fix relative private path to match package layout (compose/ and private/ are siblings) -sed -i 's#\./private/#../private/#g' "$STAGE/compose/docker-compose.yml" -# also handle bind mount form without trailing slash -sed -i -E 's#- \./private:/private#- ../private:/private#g' "$STAGE/compose/docker-compose.yml" -# drop timezone file bind which may not exist on target distros (e.g. NixOS) -sed -i '/\/etc\/timezone:\/etc\/timezone:ro/d' "$STAGE/compose/docker-compose.yml" - -# sanity-check: ensure test services are absent and external network present -if grep -E '^(\s{2})(node-a|node-b|test-node|test-gpu-node):\s*$' -n "$STAGE/compose/docker-compose.yml" >/dev/null; then - err "compose filter failed: test services still present"; exit 1; -fi -if ! grep -q '^networks:' "$STAGE/compose/docker-compose.yml" || ! grep -q 'argus-sys-net:' "$STAGE/compose/docker-compose.yml"; then - err "compose overlay transform failed: external network missing"; exit 1; -fi - -# 3) Images (reuse if already exported unless --resave-image) -existing_images_tar="$PKG_DIR/images/all-images.tar.gz" -if [[ "$RESAVE_IMAGE" == false && -f "$existing_images_tar" ]]; then - log "Reusing existing images tar: $existing_images_tar" - cp "$existing_images_tar" "$STAGE/images/" -elif [[ "$RESAVE_IMAGE" == false ]]; then - # Try cross-version reuse from latest server_*.tar.gz - latest_pkg=$(ls -1t "$ART_ROOT/server"/server_*.tar.gz 2>/dev/null | head -n1 || true) - if [[ -n "$latest_pkg" ]]; then - log "Reusing images from: $latest_pkg" - mkdir -p "$STAGE/images" - # extract matching file regardless of top-level dir - if tar -xzf "$latest_pkg" -C "$STAGE" --wildcards '*/images/all-images.tar.gz' 2>/dev/null; then - # locate and move - found=$(find "$STAGE" -type f -path '*/images/all-images.tar.gz' | head -n1 || true) - if [[ -n "$found" ]]; then - mv "$found" "$STAGE/images/all-images.tar.gz" - # cleanup leftover extracted dir - dir_to_clean=$(dirname "$found") - rm -rf "${dir_to_clean%/images}" 2>/dev/null || true - fi - fi - fi -fi - -# If still not present, save from local docker daemon -if [[ ! -f "$STAGE/images/all-images.tar.gz" ]]; then - require_cmd docker gzip - images=( - argus-bind9:latest - argus-master:latest - argus-elasticsearch:latest - argus-kibana:latest - argus-metric-ftp:latest - argus-metric-prometheus:latest - argus-metric-grafana:latest - argus-alertmanager:latest - argus-web-frontend:latest - argus-web-proxy:latest - ) - log "Saving images: ${#images[@]}" - tarfile="$STAGE/images/all-images.tar" - docker save -o "$tarfile" "${images[@]}" - gzip -f "$tarfile" -fi - -# 4) Scripts & Docs -copy_tree "$BUILD_DIR/templates/scripts" "$STAGE/scripts" -copy_tree "$BUILD_DIR/templates/docs" "$STAGE/docs" -find "$STAGE/scripts" -type f -name '*.sh' -exec chmod +x {} + 2>/dev/null || true - -# 5) Manifests -gen_manifest "$STAGE" "$STAGE/manifest.txt" -checksum_dir "$STAGE" "$STAGE/checksums.txt" - -# 6) Move to artifact -make_dir "$PKG_DIR" -rsync -a "$STAGE/" "$PKG_DIR/" 2>/dev/null || cp -r "$STAGE/." "$PKG_DIR/" -log "Server package ready: $PKG_DIR" - -echo "$VERSION" > "$PKG_DIR/version.json" - -# 7) Create distributable tarball -OUT_TAR_DIR="$(dirname "$PKG_DIR")" -OUT_TAR="$OUT_TAR_DIR/server_${VERSION}.tar.gz" -log "Creating tarball: $OUT_TAR" -(cd "$PKG_DIR/.." && tar -czf "$OUT_TAR" "$(basename "$PKG_DIR")") -log "Tarball ready: $OUT_TAR" - -exit 0 diff --git a/deployment/build/common.sh b/deployment/build/common.sh deleted file mode 100755 index 7bb3fb0..0000000 --- a/deployment/build/common.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -log() { echo -e "\033[0;34m[INFO]\033[0m $*"; } -warn() { echo -e "\033[1;33m[WARN]\033[0m $*"; } -err() { echo -e "\033[0;31m[ERR ]\033[0m $*" >&2; } - -require_cmd() { - for c in "$@"; do - command -v "$c" >/dev/null 2>&1 || { err "missing command: $c"; exit 1; } - done -} - -today_version() { - date +%Y%m%d -} - -checksum_dir() { - local dir="$1"; local out="$2"; : > "$out"; - (cd "$dir" && find . -type f -print0 | sort -z | xargs -0 sha256sum) >> "$out" -} - -make_dir() { mkdir -p "$1"; } - -copy_tree() { - local src="$1" dst="$2"; rsync -a --delete "$src/" "$dst/" 2>/dev/null || cp -r "$src/." "$dst/"; -} - -gen_manifest() { - local root="$1"; local out="$2"; : > "$out"; - (cd "$root" && find . -maxdepth 3 -type f -printf "%p\n" | sort) >> "$out" -} - diff --git a/deployment/build/templates/.env.example b/deployment/build/templates/.env.example deleted file mode 100644 index 557dda2..0000000 --- a/deployment/build/templates/.env.example +++ /dev/null @@ -1,32 +0,0 @@ -# UID/GID for service processes -ARGUS_BUILD_UID=1000 -ARGUS_BUILD_GID=1000 - -# Host ports (adjust if occupied) -MASTER_PORT=32300 -ES_HTTP_PORT=9200 -KIBANA_PORT=5601 -NODE_A_PORT=2020 -NODE_B_PORT=2021 -PROMETHEUS_PORT=9090 -GRAFANA_PORT=3000 -ALERTMANAGER_PORT=9093 -WEB_PROXY_PORT_8080=8080 -WEB_PROXY_PORT_8081=8081 -WEB_PROXY_PORT_8082=8082 -WEB_PROXY_PORT_8083=8083 -WEB_PROXY_PORT_8084=8084 -WEB_PROXY_PORT_8085=8085 - -# FTP -FTP_PORT=21 -FTP_DATA_PORT=20 -FTP_PASSIVE_HOST_RANGE=21100-21110 -FTP_PASSWORD=ZGClab1234! -FTP_DOMAIN=ftp.metric.argus.com - -# GPU profile disabled by default -ENABLE_GPU=false - -# External overlay network (Swarm attachable) -OVERLAY_NET_NAME=argus-sys-net diff --git a/deployment/build/templates/client/INSTALL_CLIENT_zh.md b/deployment/build/templates/client/INSTALL_CLIENT_zh.md deleted file mode 100644 index cb01060..0000000 --- a/deployment/build/templates/client/INSTALL_CLIENT_zh.md +++ /dev/null @@ -1,44 +0,0 @@ -# Argus Metric 客户端安装指南(容器内普通用户场景) - -## 准备与连通性检查 -- FTP 连接(需要账号密码,默认 `ftpuser / ZGClab1234!`) - - `curl -u ftpuser:ZGClab1234! -I ftp://:21/LATEST_VERSION` - - `curl -u ftpuser:ZGClab1234! -s ftp://:21/ | head` -- 下载安装脚本 - - `curl -u ftpuser:ZGClab1234! -fsSL ftp://:21/setup.sh -o /tmp/setup.sh` - - `chmod +x /tmp/setup.sh` - -## 元数据与主机名 -- Agent 需要元数据(env/user/instance)与 Master 地址: - - 方式A:hostname 形如 `env-user-instance-xxx`(推荐) - - 方式B:导出环境变量: - - `export AGENT_ENV=dev` - - `export AGENT_USER=` - - `export AGENT_INSTANCE=` -- Master 地址: - - `export MASTER_ENDPOINT=http://master.argus.com:3000` - -> 安装脚本会在开头检查上述条件,缺失时会终止并给出修复提示。 - -## 执行安装 -- 以 root 运行(容器内如为非 root 用户请切换为 root): - - `sudo /tmp/setup.sh --server --user ftpuser --password 'ZGClab1234!' --port 21` -- 如需自定义安装根目录:`--install-dir /opt/argus-metric` - -提示(容器接入 overlay 网络时): -- 在执行 setup 前,先将容器内 DNS 指向 Bind9 的 overlay IP: - - `echo "nameserver " > /etc/resolv.conf` - - 这样 `master.argus.com`、`es.log.argus.com` 等域名即可解析;首次下载 `setup.sh` 仍建议使用 FTP 的 overlay IP。 - -更多快速步骤请参考:`QUICK_NODE_DEPLOY_zh.md`。 - -## 安装后自检(setup 自动执行) -- setup 会等待最多 5 分钟,确认以下条件后才报告完成: - - `/private/argus/agent//node.json` 已生成; - - `last_report` 在持续更新; - - `health.metric-argus-agent|metric-node-exporter|metric-fluent-bit|metric-dcgm-exporter` 均为 `healthy` 且 `error` 为空。 - -## 手工验证(可选) -- `cat /private/argus/agent/$(hostname)/node.json | jq '.'` -- `curl -s -o /dev/null -w "%{http_code}\n" http://localhost:9100/metrics` 应为 200 -- 查看日志:`/var/log/argus-agent.log`、`/opt/argus-metric/versions/*/.install.log` diff --git a/deployment/build/templates/client/PUBLISH_CLIENT_zh.md b/deployment/build/templates/client/PUBLISH_CLIENT_zh.md deleted file mode 100644 index 94ca70a..0000000 --- a/deployment/build/templates/client/PUBLISH_CLIENT_zh.md +++ /dev/null @@ -1,57 +0,0 @@ -# Argus Metric 客户端发布说明(FTP) - -本说明面向“发布人员”,讲清楚如何把客户端离线包发布到 FTP,供各节点通过 `curl` 自动安装。 - -## 目录结构(构建后) -- `client-YYYYMMDD/` - - `argus-metric_YYYYMMDD.tar.gz` 客户端离线包 - - `setup.sh` 客户端安装入口脚本(提供给节点用 curl 下载) - - `publish.sh` 发布脚本(将上述两项与 `LATEST_VERSION` 上传到 FTP) - - `LATEST_VERSION` 文本(内容为 `YYYYMMDD`,或 `YYYYMMDD-rN`) - - `INSTALL_CLIENT_zh.md` 本地安装指南(给使用者看,不会上载到 FTP) - - `PUBLISH_CLIENT_zh.md` 本说明 - -> 注意:`publish.sh`/`setup.sh` 为可执行脚本;构建脚本已保证二者具有执行权限。 - -## 前置条件 -- FTP 服务已运行(默认容器:`argus-ftp`),并打开端口:21、20、21100–21110(被动模式)。 -- FTP 账号:默认 `ftpuser / ZGClab1234!`(如有更改,以实际为准)。 - -## 发布步骤(在 server 机器或能直连 FTP 的任意机器上) -1) 进入发布目录: -- `cd client-YYYYMMDD` - -2) 执行发布: -- `./publish.sh --server --user --password '' [--port 21]` -- 例如在服务端本机:`./publish.sh --server localhost --user ftpuser --password 'ZGClab1234!' --port 21` - -脚本会上传三类文件到 FTP 根: -- `setup.sh` -- `argus-metric_YYYYMMDD[ -rN ].tar.gz` -- `LATEST_VERSION`(内容为当前版本号) - -3) 发布后验证: -- `curl -u ftpuser:****** -I ftp://:21/LATEST_VERSION` 应返回 200 -- `curl -u ftpuser:****** -fsSL ftp://:21/LATEST_VERSION` 内容为版本号(如 `20251104`) -- `curl -u ftpuser:****** -I ftp://:21/argus-metric_YYYYMMDD.tar.gz` 返回 200 - -## 节点侧使用方式(摘要) -- 首次下载用 FTP 的“IP 地址”: - - `curl -u ftpuser:****** -fsSL ftp://:21/setup.sh -o /tmp/setup.sh && chmod +x /tmp/setup.sh` -- 执行安装: - - 必需元数据:`AGENT_ENV/AGENT_USER/AGENT_INSTANCE`,以及 `MASTER_ENDPOINT=http://master.argus.com:3000` - - `sudo /tmp/setup.sh --server --user ftpuser --password '******' --port 21` -- overlay 容器场景: - - 先将容器内 DNS 指向 Bind9 的 overlay IP:`echo "nameserver " > /etc/resolv.conf` - - 然后再执行上述安装;安装后约 1–2 分钟内 DNS 即可解析 `*.argus.com` 域名。 - -## 常见问题 -- `530 Access denied`:用户名/密码错误或 FTP 目录无权限;请核对账号与 FTP 容器状态。 -- `Permission denied` 执行 `publish.sh`:为脚本权限问题;`chmod +x publish.sh`。构建脚本已修复默认权限。 -- 被动端口不通导致失败:请开放 21100–21110。 -- 客户端安装后短时 `curl http://master.argus.com:3000` 为 000:服务冷启动或 DNS 同步延迟,等待 1–2 分钟再试。 - -## 版本与回滚 -- `LATEST_VERSION` 决定客户端默认安装的版本号。 -- 如需回滚:将旧版本号写回 `LATEST_VERSION` 并重新发布(或手动指定 `--version` 安装)。 - diff --git a/deployment/build/templates/client/QUICK_NODE_DEPLOY_zh.md b/deployment/build/templates/client/QUICK_NODE_DEPLOY_zh.md deleted file mode 100644 index 45f47fe..0000000 --- a/deployment/build/templates/client/QUICK_NODE_DEPLOY_zh.md +++ /dev/null @@ -1,58 +0,0 @@ -# Argus Metric 节点快速部署(Overlay 网络容器) - -本文档给出在 Docker Swarm external overlay 网络中,快速拉起一个测试节点并完成注册的最小可行步骤。 - -## 前提 -- 服务端已在 Manager 机安装完成并运行良好(`server-selfcheck` 通过)。 -- Overlay 网络名称:`argus-sys-net`(默认)。 -- 已通过 FTP 发布 `setup.sh` 与客户端包,且能从 FTP 获取 `LATEST_VERSION`。 -- 用于测试的镜像:`argus-sys-metric-test-node:latest` 已存在于目标机器。 - -## 步骤 - -- 获取 FTP 和 Bind 的 overlay IP(在 Manager 上执行) - - `FTPIP=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' argus-ftp)` - - `BINDIP=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' argus-bind-sys)` - - `echo "FTP=$FTPIP BIND=$BINDIP"` - -- 准备宿主挂载目录(以 s4 为例) - - `mkdir -p /home2/yuyr/deploy/test-metric-node/s4` - -- 启动测试节点容器(接入 overlay) - - `docker run -d --name argus-metric-test-node-s4 \ - --hostname dev2-yuyr-node002s4 \ - --network argus-sys-net \ - -v /home2/yuyr/deploy/test-metric-node/s4:/private/argus/agent \ - argus-sys-metric-test-node:latest sleep infinity` - -- 在容器内执行安装(先用 FTP IP 引导,DNS 指向 Bind) - - `docker exec -it argus-metric-test-node-s4 bash` - - `echo "nameserver $BINDIP" > /etc/resolv.conf` - - `curl --ftp-method nocwd -u ftpuser:ZGClab1234! -fsSL "ftp://$FTPIP:21/setup.sh" -o /tmp/setup.sh` - - `chmod +x /tmp/setup.sh` - - `export AGENT_ENV=dev2 AGENT_USER=yuyr AGENT_INSTANCE=node002s4` - - `export MASTER_ENDPOINT=http://master.argus.com:3000` - - `/tmp/setup.sh --server "$FTPIP" --user ftpuser --password 'ZGClab1234!' --port 21` - - 说明:setup 会自动执行安装后自检(最多 5 分钟),无需手动轮询。 - -## 验证(推荐在容器内执行,避免宿主权限问题) - -- 查看 node.json 关键字段 - - `cat /private/argus/agent/dev2-yuyr-node002s4/node.json | jq '{last_report, health}'` - - 期望:四个 health 全部 healthy;等待 ≥70s 再查看,`last_report` 持续更新。 - -- 指标端口 - - `curl -s -o /dev/null -w '%{http_code}\n' http://localhost:9100/metrics`(期望 200) - - (如测试 GPU)`curl -s -o /dev/null -w '%{http_code}\n' http://localhost:9400/metrics`(有 GPU 时 200) - -- 与服务端连通(域名经 Bind 解析) - - `curl -s -o /dev/null -w '%{http_code}\n' http://master.argus.com:3000/readyz`(期望 200) - - `curl -s -o /dev/null -w '%{http_code}\n' http://es.log.argus.com:9200/_cluster/health`(期望 200) - -## (可选)在服务器主机侧观察 Prometheus 目标更新 -- `cat /home2/yuyr/deploy/versions//private/argus/metric/prometheus/nodes.json | jq '.'` - -## 常见提示 -- 初次安装后短时 `curl` 域名返回 000/超时属正常,多等待 1–2 分钟 DNS 同步/组件冷启动完成。 -- 如在宿主直接读取挂载的 node.json 报 Permission denied,请使用 `docker exec` 在容器内查看。 -- MASTER_ENDPOINT 固定使用域名 `http://master.argus.com:3000`,客户端无需固定 IP。 diff --git a/deployment/build/templates/client/busybox.tar b/deployment/build/templates/client/busybox.tar deleted file mode 100644 index 0840f71..0000000 Binary files a/deployment/build/templates/client/busybox.tar and /dev/null differ diff --git a/deployment/build/templates/client/publish.sh b/deployment/build/templates/client/publish.sh deleted file mode 100644 index c1d080e..0000000 --- a/deployment/build/templates/client/publish.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -usage() { cat <<'EOF' -Publish Argus client package to FTP - -Usage: - ./publish.sh --server HOST --user USER --password PASS [--port 21] - -Notes: -- This script expects to run inside the built client artifact directory. -- It reads LATEST_VERSION and uploads setup.sh, argus-metric_.tar.gz, and LATEST_VERSION. -EOF -} - -HOST=""; USERNAME=""; PASSWORD=""; PORT=21 -while [[ $# -gt 0 ]]; do - case "$1" in - --server) HOST="$2"; shift 2;; - --user) USERNAME="$2"; shift 2;; - --password) PASSWORD="$2"; shift 2;; - --port) PORT="$2"; shift 2;; - -h|--help) usage; exit 0;; - *) echo "unknown arg: $1" >&2; usage; exit 1;; - esac -done - -[[ -n "$HOST" && -n "$USERNAME" && -n "$PASSWORD" ]] || { usage; exit 1; } - -here="$(pwd)" -if [[ ! -f "$here/LATEST_VERSION" ]]; then - echo "LATEST_VERSION not found in $(pwd)" >&2; exit 1; -fi -VER=$(cat "$here/LATEST_VERSION" | tr -d '\n') -PKG="argus-metric_${VER}.tar.gz" - -if [[ ! -f "$here/$PKG" ]]; then - echo "client tar not found: $PKG" >&2; exit 1 -fi - -# locate setup.sh (prefer colocated, fallback to bundled path if provided) -SETUP="${here}/setup.sh" -if [[ ! -f "$SETUP" ]]; then - echo "setup.sh not found in $(pwd)" >&2; exit 1 -fi - -echo "[PUBLISH] server=$HOST port=$PORT version=$VER" - -curl -u "$USERNAME:$PASSWORD" -sfT "$SETUP" "ftp://$HOST:$PORT/setup.sh" -curl -u "$USERNAME:$PASSWORD" -sfT "$PKG" "ftp://$HOST:$PORT/$PKG" -printf "%s" "$VER" | curl -u "$USERNAME:$PASSWORD" -sfT - "ftp://$HOST:$PORT/LATEST_VERSION" - -echo "[OK] publish completed" - diff --git a/deployment/build/templates/docker-compose.filter.awk b/deployment/build/templates/docker-compose.filter.awk deleted file mode 100644 index 72c6159..0000000 --- a/deployment/build/templates/docker-compose.filter.awk +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/awk -f -# Remove specific service blocks from a docker-compose.yml by service name. -# Usage: awk -f docker-compose.filter.awk -v remove="node-a,node-b,test-node,test-gpu-node" input.yml > output.yml - -BEGIN{ - split(remove, rm, ","); - for(i in rm){ - gsub(/^\s+|\s+$/,"",rm[i]); - if (rm[i] != "") skipname[rm[i]] = 1; - } - in_services=0; skipping=0; -} - -function service_header(line, m) { - # match exactly two leading spaces followed by name: - if (match(line, /^ ([A-Za-z0-9_-]+):[ ]*$/, m)) return m[1]; - return ""; -} - -{ - # Track top-level sections (no indentation) - if ($0 ~ /^[A-Za-z0-9_-]+:[ ]*$/ && $0 !~ /^\s/) { - in_services = ($0 ~ /^services:[ ]*$/) ? 1 : 0; - } - - if (skipping) { - # Stop skipping at next service header or another top-level section - if (service_header($0) != "" || ($0 ~ /^(networks|volumes):[ ]*$/ && $0 !~ /^\s/)) { - skipping=0; - } else { - next; - } - } - - if (in_services) { - name = service_header($0); - if (name != "" && (name in skipname)) { skipping=1; next; } - } - - print; -} diff --git a/deployment/build/templates/docker-compose.overlay.awk b/deployment/build/templates/docker-compose.overlay.awk deleted file mode 100644 index e719225..0000000 --- a/deployment/build/templates/docker-compose.overlay.awk +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/awk -f -# Transform docker-compose.yml to use an external overlay network for all services -# - Remove top-level networks definition -# - Remove per-service networks block (including ipv4_address and sysnet refs) -# - Insert per-service networks: [argus-sys-net] -# - Append external networks mapping at the end - -BEGIN{ - in_top_networks=0; in_services=0; in_service=0; svc_indent=0; curr_name=""; -} - -function is_service_header(line){ return svc_name(line)!=""; } -function svc_name(line, m){ if (match(line, /^ ([A-Za-z0-9_-]+):[ ]*$/, m)) return m[1]; return ""; } - -function indent_len(s, n){ n=match(s,/[^ ]/)-1; if(n<0) n=0; return n; } - -{ - # Detect entry into top-level sections - if ($0 ~ /^[A-Za-z0-9_-]+:[ ]*$/ && $0 !~ /^\s/) { - in_services = ($0 ~ /^services:[ ]*$/); - # If a new top-level section starts, stop skipping top networks - in_top_networks = 0; - } - - # Handle removal of initial top-level 'networks:' block - if ($0 ~ /^networks:[ ]*$/ && $0 !~ /^\s/) { - in_top_networks = 1; next; - } - if (in_top_networks) { - # skip until next top-level section (non-indented key) - next; - } - - if (in_services) { - # Track service boundaries - if (is_service_header($0)) { - in_service=1; svc_indent=2; networks_inserted=0; curr_name=svc_name($0); print; next; - } - if (in_service) { - # If line is indented <= service indent, we've left this service - if (indent_len($0) <= svc_indent && $0 !~ /^\s*$/) { - in_service=0; - } - } - - if (in_service) { - # Skip any existing networks block under the service - if ($0 ~ /^\s{4}networks:[ ]*$/) { skipping_nets=1; next; } - if (skipping_nets) { - if (indent_len($0) <= 4) { skipping_nets=0; } - else next; - } - - # After container_name or image, inject networks once - if (!networks_inserted && ($0 ~ /^\s{4}container_name:/ || $0 ~ /^\s{4}image:/)) { - print; - print " networks:"; - print " - argus-sys-net"; - networks_inserted=1; next; - } - # no host port injection; bind serves DNS inside overlay only - } - } - - print; -} - -END{ - print ""; - print "networks:"; - print " argus-sys-net:"; - print " external: true"; - print " name: ${OVERLAY_NET_NAME:-argus-sys-net}"; -} diff --git a/deployment/build/templates/docs/INSTALL_SERVER.md b/deployment/build/templates/docs/INSTALL_SERVER.md deleted file mode 100644 index b511a14..0000000 --- a/deployment/build/templates/docs/INSTALL_SERVER.md +++ /dev/null @@ -1,50 +0,0 @@ -# Argus Server Offline Installation - -## Prerequisites -- Linux x86_64 (Ubuntu 22.04 recommended; see OS compatibility for NixOS) -- Docker & Docker Compose installed -- Open ports: 32300, 9200, 5601, 9090, 9093, 8080..8085, 21, 20, 21100–21110 (or auto-fallback to high ports) - -## Quick Start -1. Extract to a target dir, e.g. `/opt/argus-deploy/versions/` -2. `./server-install.sh` (non‑root is supported: it will precreate minimal dirs and auto-fix Kibana/ES/Bind in containers) -3. `./server-status.sh` -4. `./server-selfcheck.sh` (on failure it auto-runs diagnose and writes logs under `logs/`) -5. `./server-uninstall.sh` to tear down - -## What the Installer Does -- Loads local images (`images/all-images.tar.gz`) -- Generates OS-compat override (`security_opt: ["label=disable"]`, `userns_mode: host`, bind `tmpfs:/run/named`) -- Starts server-only services: bind/master/es/kibana/ftp/prometheus/grafana/alertmanager/web-frontend/web-proxy -- DNS Bootstrap: - - Ensure `/private/argus/etc/dns.conf` exists (write `172.31.0.2` if missing); - - Run `/private/argus/etc/update-dns.sh` in dependent containers so `/etc/resolv.conf` points to bind; - - Wait for `*.argus.com` hint files, then reload bind; - - Restart web‑proxy to re-render nginx resolver from `dns.conf`; -- Writes `logs/selfcheck.json` as final summary - -## OS Compatibility -- NixOS / non-xattr FS: containers run with `security_opt: ["label=disable"]` and `userns_mode: host`. -- If you cannot use sudo, the installer will: - - create minimal data dirs (incl. `private/argus/log/{elasticsearch,kibana}`) with permissive perms when possible; - - ensure inside containers: Kibana `data` → `/private/argus/log/kibana`, Elasticsearch `data` → `/private/argus/log/elasticsearch`, and Bind `rndc.key` is generated. - (Manual pre-creation scripts are no longer required.) - -## Files & Layout -- `compose/` (docker-compose.yml, .env) -- `private/` (data mounts) -- `scripts/` (install/uninstall/status/selfcheck/diagnose) -- `logs/` (selfcheck + diagnose outputs) - -## Troubleshooting (Quick) -- Run `./server-selfcheck.sh` → see `logs/selfcheck.json` -- Run `./server-diagnose.sh` → produces timestamped logs: - - `logs/diagnose_details_YYYYMMDD-HHMMSSZ.log` - - `logs/diagnose_error_YYYYMMDD-HHMMSSZ.log` - And updates `diagnose_details.log`/`diagnose_error.log` to the latest -- Error lines are tagged `[service][source]`, e.g. `[kibana][http] /api/status=503` - -Common issues: -- Kibana 503: wait cold start or fix DNS so `es.log.argus.com` resolves -- web‑proxy 504: check nginx `resolver` includes `172.31.0.2 127.0.0.11` -- EACCES/locks: ensure `sudo ./server-prepare-dirs.sh` ran and ownership matches UID:GID diff --git a/deployment/build/templates/docs/INSTALL_SERVER_zh.md b/deployment/build/templates/docs/INSTALL_SERVER_zh.md deleted file mode 100644 index eb1fd7a..0000000 --- a/deployment/build/templates/docs/INSTALL_SERVER_zh.md +++ /dev/null @@ -1,29 +0,0 @@ -# Argus 服务端离线安装指南 - -## 先决条件 -- Linux x86_64(推荐 Ubuntu 22.04;NixOS 见“兼容说明”) -- 已安装 Docker 与 Docker Compose -- 端口:32300, 9200, 5601, 9090, 9093, 8080..8085, 21, 20, 21100–21110 - -## 快速开始 -1. 解压到目标目录(例如 `/opt/argus-deploy/versions/`) -2. 安装:`./server-install.sh`(支持普通用户:会自动创建最小目录并在容器内修复 Kibana/ES/Bind) -3. 状态:`./server-status.sh` -4. 自检:`./server-selfcheck.sh`(失败会自动采集诊断) -5. 卸载:`./server-uninstall.sh` - -## 安装流程要点 -- 仅启动 10 个服务端组件(不包含测试节点); -- DNS Bootstrap:补齐首次部署 DNS 依赖(生成/确认 `dns.conf`、统一容器 resolv.conf、写入 `*.argus.com`、reload bind、重启 web‑proxy); -- 输出自检结果到 `logs/selfcheck.json`。 - -## 兼容说明(NixOS 等) -- 使用 `security_opt: ["label=disable"]` 与 `userns_mode: host`。 -- 非 root 场景:安装器会创建最小目录(含 `private/argus/log/{elasticsearch,kibana}`),并在容器内完成: - - Kibana 的 `data` 软链到 `/private/argus/log/kibana` - - Elasticsearch 的 `data` 软链到 `/private/argus/log/elasticsearch` - - Bind 生成 `/etc/bind/rndc.key` - -## 故障排查(见下文 Troubleshooting_zh) -- `./server-selfcheck.sh` → `logs/selfcheck.json` -- `./server-diagnose.sh` → `logs/diagnose_error_*.log` / `logs/diagnose_details_*.log` diff --git a/deployment/build/templates/docs/SWARM_DEPLOY_zh.md b/deployment/build/templates/docs/SWARM_DEPLOY_zh.md deleted file mode 100644 index 9f5b680..0000000 --- a/deployment/build/templates/docs/SWARM_DEPLOY_zh.md +++ /dev/null @@ -1,50 +0,0 @@ -# Argus 多机部署(Docker Swarm + External Overlay) - -- 前提:Docker ≥ 20.10;Manager/Worker 节点开放 2377/tcp、7946/tcp+udp、4789/udp。 -- DNS:Bind9 为统一权威,解析 *.argus.com 至部署机固定对外主机 IP。 - -## 在部署机(Manager) -- 初始化 Swarm:`docker swarm init --advertise-addr ` -- 创建 overlay:`docker network create --driver overlay --attachable argus-sys-net` -- 解压离线包后执行: - - `./server-install.sh`(会验证 Swarm 状态、确保 overlay 存在、写入 dns.conf) - - `./server-selfcheck.sh`(失败会自动触发诊断) - -## 在节点机(Worker 或非 Docker 主机) -- Swarm Worker:执行 Manager 的 `docker swarm join ...`; -- 运行客户端容器: - - `docker run -d --name argus-metric-node-001 --network argus-sys-net -v /data/argus/agent:/private/argus/agent argus-sys-metric-test-node:latest sleep infinity` -- 进入容器安装(先 IP 引导,后域名): - - `curl -u ftpuser:*** -fsSL ftp://:21/setup.sh -o /tmp/setup.sh && chmod +x /tmp/setup.sh` - - `AGENT_ENV=dev AGENT_USER=alice AGENT_INSTANCE=node001 MASTER_ENDPOINT=http://master.argus.com:3000 /tmp/setup.sh --server ftp.argus.com --user ftpuser --password '***' --port 21` - -## 关键点 -- 首次必须使用 FTP 的 IP 引导(下载 setup.sh 与 dns.conf) -- MASTER_ENDPOINT 永远使用域名:`http://master.argus.com:3000` -- docker compose 改为 external overlay;容器内不使用 Docker 服务名;web-proxy 与组件上游统一用域名 - -## 找回/轮换 Swarm 加入令牌与解锁密钥 - -在任意一个 Manager 节点上执行以下命令即可查看或轮换加入令牌(join token): - -- 查看加入 Worker 的命令: - - `docker swarm join-token worker` -- 只打印 Worker 的 token: - - `docker swarm join-token -q worker` -- 查看加入 Manager 的命令: - - `docker swarm join-token manager` -- 只打印 Manager 的 token: - - `docker swarm join-token -q manager` - -在待加入节点执行(示例,替换 Manager_IP): -- `docker swarm join --token <上面查到的token> :2377` - -轮换 token(怀疑泄露或需要更新时): -- 轮换 Worker:`docker swarm join-token --rotate worker` -- 轮换 Manager:`docker swarm join-token --rotate manager` - -如果你指的是“解锁密钥”(autolock 的 unlock key),在 Manager 上: -- 查看:`docker swarm unlock-key` -- 轮换:`docker swarm unlock-key --rotate` - -提示:当看到 “This node is not a swarm manager.” 时,说明当前节点不是 Manager,需要到 Manager 节点执行,或在现有 Manager 上 `docker node promote ` 将其提升为 Manager。 diff --git a/deployment/build/templates/docs/TROUBLESHOOTING.md b/deployment/build/templates/docs/TROUBLESHOOTING.md deleted file mode 100644 index 87f30e8..0000000 --- a/deployment/build/templates/docs/TROUBLESHOOTING.md +++ /dev/null @@ -1,20 +0,0 @@ -# Troubleshooting - -- Status: `scripts/server-status.sh` -- Selfcheck: `scripts/server-selfcheck.sh` -- Diagnose: `scripts/server-diagnose.sh` - -Outputs: -- `logs/selfcheck.json` -- `logs/diagnose_details_*.log` (full details) -- `logs/diagnose_error_*.log` (tagged errors) - -Web‑Proxy: -- 8083 expects 200/302/403; 8084/8085 must include CORS header -- nginx resolver should be `172.31.0.2 127.0.0.11` - -Kibana/ES: -- Verify `es.log.argus.com` resolves inside Kibana - -Permissions: -- The installer auto-creates minimal dirs and applies container-side fixes (Kibana/ES/Bind). If you still see EACCES/lock errors, rerun `./server-install.sh` and review diagnose logs. diff --git a/deployment/build/templates/docs/TROUBLESHOOTING_zh.md b/deployment/build/templates/docs/TROUBLESHOOTING_zh.md deleted file mode 100644 index 2d19607..0000000 --- a/deployment/build/templates/docs/TROUBLESHOOTING_zh.md +++ /dev/null @@ -1,16 +0,0 @@ -# 故障排查 - -- 状态:`scripts/server-status.sh` -- 自检:`scripts/server-selfcheck.sh` -- 诊断:`scripts/server-diagnose.sh` - -输出: -- `logs/selfcheck.json` -- `logs/diagnose_error_*.log`(错误摘要) -- `logs/diagnose_details_*.log`(详细信息) - -Web‑Proxy:8083=200/302/403;8084/8085 需包含 CORS -Kibana:确认可解析 `es.log.argus.com` -权限: -- 非 root 安装时,安装器会创建最小目录并在容器内修复 Kibana/ES/Bind; -- 如仍有 `EACCES`/锁文件报错,先重跑 `./server-install.sh`(会重复容器内修复),并查看诊断日志。 diff --git a/deployment/build/templates/scripts/es-watermark-relax.sh b/deployment/build/templates/scripts/es-watermark-relax.sh deleted file mode 100644 index d3eb867..0000000 --- a/deployment/build/templates/scripts/es-watermark-relax.sh +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" -ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a - -ES_URL="http://localhost:${ES_HTTP_PORT:-9200}" - -# Tunables (env overrides) -RELAX_WM_LOW="${RELAX_WM_LOW:-99%}" -RELAX_WM_HIGH="${RELAX_WM_HIGH:-99%}" -RELAX_WM_FLOOD="${RELAX_WM_FLOOD:-99%}" -DISABLE_WATERMARK="${DISABLE_WATERMARK:-1}" -SET_KIBANA_REPLICAS_ZERO="${SET_KIBANA_REPLICAS_ZERO:-1}" -CLEAR_READONLY_BLOCKS="${CLEAR_READONLY_BLOCKS:-1}" - -echo "[RELAX] Checking Elasticsearch at $ES_URL" -code=$(curl -s -o /dev/null -w '%{http_code}' "$ES_URL/_cluster/health" || true) -if [[ "$code" != "200" ]]; then - echo "[RELAX][ERROR] ES not reachable (code=$code). Ensure argus-es-sys is running." >&2 - exit 1 -fi - -echo "[RELAX] Applying transient cluster settings (watermarks)" -th_enabled=$([[ "$DISABLE_WATERMARK" == "1" ]] && echo false || echo true) -curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_cluster/settings" -d "{ - \"transient\": { - \"cluster.routing.allocation.disk.threshold_enabled\": $th_enabled, - \"cluster.routing.allocation.disk.watermark.low\": \"$RELAX_WM_LOW\", - \"cluster.routing.allocation.disk.watermark.high\": \"$RELAX_WM_HIGH\", - \"cluster.routing.allocation.disk.watermark.flood_stage\": \"$RELAX_WM_FLOOD\" - } -}" | sed -n '1,5p' - -if [[ "$CLEAR_READONLY_BLOCKS" == "1" ]]; then - echo "[RELAX] Clearing read_only/read_only_allow_delete blocks on all indices (best-effort)" - curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_all/_settings" -d '{ - "index.blocks.read_only": false, - "index.blocks.read_only_allow_delete": false - }' >/dev/null || true -fi - -if [[ "${SET_KIBANA_REPLICAS_ZERO:-1}" != "0" ]]; then - echo "[RELAX] Ensure .kibana* use replicas=0 via index template and per-index settings (best-effort)" - # high priority template for .kibana* only, avoid impacting other indices - curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_index_template/kibana-replicas-0" -d '{ - "index_patterns": [".kibana*"], - "priority": 200, - "template": { "settings": { "number_of_replicas": 0 } } - }' >/dev/null || true - # set existing .kibana* to replicas=0 - idxs=$(curl -sS "$ES_URL/_cat/indices/.kibana*?h=index" | awk '{print $1}') - for i in $idxs; do - [[ -n "$i" ]] || continue - curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/$i/_settings" -d '{"index":{"number_of_replicas":0}}' >/dev/null || true - done -fi - -# Retry failed shard allocations (best-effort) -curl -sS -H 'Content-Type: application/json' -X POST "$ES_URL/_cluster/reroute?retry_failed=true" -d '{}' >/dev/null || true - -echo "[RELAX] Cluster health (post):" -curl -sS "$ES_URL/_cluster/health?pretty" | sed -n '1,80p' - -# Simple current status summary -ch=$(curl -sS "$ES_URL/_cluster/health" || true) -status=$(printf '%s' "$ch" | awk -F'"' '/"status"/{print $4; exit}') -unassigned=$(printf '%s' "$ch" | awk -F'[,: ]+' '/"unassigned_shards"/{print $3; exit}') -duse=$(docker exec argus-es-sys sh -lc 'df -P /usr/share/elasticsearch/data | awk "NR==2{print \$5}"' 2>/dev/null || true) -settings=$(curl -sS "$ES_URL/_cluster/settings?flat_settings=true" || true) -th=$(printf '%s' "$settings" | grep -o '"cluster.routing.allocation.disk.threshold_enabled"[^,}]*' | awk -F: '{gsub(/["} ]/,"",$2);print $2}' | tail -n1) -low=$(printf '%s' "$settings" | grep -o '"cluster.routing.allocation.disk.watermark.low"[^,}]*' | awk -F: '{gsub(/["} ]/,"",$2);print $2}' | tail -n1) -high=$(printf '%s' "$settings" | grep -o '"cluster.routing.allocation.disk.watermark.high"[^,}]*' | awk -F: '{gsub(/["} ]/,"",$2);print $2}' | tail -n1) -flood=$(printf '%s' "$settings" | grep -o '"cluster.routing.allocation.disk.watermark.flood_stage"[^,}]*' | awk -F: '{gsub(/["} ]/,"",$2);print $2}' | tail -n1) -ks=$(curl -sS "$ES_URL/_cat/shards/.kibana*?h=state" || true) -total=$(printf '%s' "$ks" | awk 'NF{c++} END{print c+0}') -started=$(printf '%s' "$ks" | awk '/STARTED/{c++} END{print c+0}') -unass=$(printf '%s' "$ks" | awk '/UNASSIGNED/{c++} END{print c+0}') -echo "[RELAX][SUMMARY] status=${status:-?} unassigned=${unassigned:-?} es.data.use=${duse:-?} watermarks(threshold=${th:-?} low=${low:-?} high=${high:-?} flood=${flood:-?}) kibana_shards(total=${total},started=${started},unassigned=${unass})" - -echo "[RELAX] Done. Remember to run scripts/es-watermark-restore.sh after freeing disk space and cluster becomes stable." diff --git a/deployment/build/templates/scripts/es-watermark-restore.sh b/deployment/build/templates/scripts/es-watermark-restore.sh deleted file mode 100644 index a20383e..0000000 --- a/deployment/build/templates/scripts/es-watermark-restore.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" -ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a - -ES_URL="http://localhost:${ES_HTTP_PORT:-9200}" - -echo "[RESTORE] Checking Elasticsearch at $ES_URL" -code=$(curl -s -o /dev/null -w '%{http_code}' "$ES_URL/_cluster/health" || true) -if [[ "$code" != "200" ]]; then - echo "[RESTORE][ERROR] ES not reachable (code=$code). Ensure argus-es-sys is running." >&2 - exit 1 -fi - -echo "[RESTORE] Re-enabling disk threshold and clearing relaxed watermarks (transient)" -curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_cluster/settings" -d '{ - "transient": { - "cluster.routing.allocation.disk.threshold_enabled": true, - "cluster.routing.allocation.disk.watermark.low": null, - "cluster.routing.allocation.disk.watermark.high": null, - "cluster.routing.allocation.disk.watermark.flood_stage": null - } -}' | sed -n '1,5p' - -# Optionally restore default replicas to 1 (set RESTORE_DEFAULT_REPLICAS=1 to enable) -if [[ "${RESTORE_DEFAULT_REPLICAS:-0}" == "1" ]]; then - echo "[RESTORE] Setting transient default index.number_of_replicas=1" - curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_cluster/settings" -d '{"transient":{"index.number_of_replicas":"1"}}' >/dev/null || true -fi - -echo "[RESTORE] Cluster health:" -curl -sS "$ES_URL/_cluster/health?pretty" | sed -n '1,80p' - -echo "[RESTORE] Done. Verify shards and consider keeping replicas=0 for single-node deployments." - diff --git a/deployment/build/templates/scripts/fix-prom-targets-overlay.sh b/deployment/build/templates/scripts/fix-prom-targets-overlay.sh deleted file mode 100644 index 6dde5a8..0000000 --- a/deployment/build/templates/scripts/fix-prom-targets-overlay.sh +++ /dev/null @@ -1,103 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -# Quick fix tool: replace 172.22/16 targets in nodes.json with overlay IPs resolved from hostname. -# Usage: run on server package host: scripts/fix-prom-targets-overlay.sh - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" -NODES_JSON="$ROOT/private/argus/metric/prometheus/nodes.json" - -require_cmd() { command -v "$1" >/dev/null 2>&1 || { echo "[ERROR] missing command: $1" >&2; exit 1; }; } - -backup() { - local src="$1"; local ts; ts=$(date -u +%Y%m%d-%H%M%SZ) - cp "$src" "${src%.json}_bak_${ts}.json" -} - -prefer_overlay_ip() { - local host="$1" - # prefer 10.0/8 then 172.31/16 - getent hosts "$host" | awk '{print $1}' | while read -r ip; do - if [[ "$ip" =~ ^10\. ]]; then echo "$ip"; return; fi - done - getent hosts "$host" | awk '{print $1}' | while read -r ip; do - if [[ "$ip" =~ ^172\.31\. ]]; then echo "$ip"; return; fi - done - # fallback: first A record - getent hosts "$host" | awk '{print $1; exit}' -} - -require_cmd awk -require_cmd sed - -if [[ ! -f "$NODES_JSON" ]]; then - echo "[WARN] nodes.json not found: $NODES_JSON" >&2 - exit 0 -fi - -backup "$NODES_JSON" - -tmp=$(mktemp) -trap 'rm -f "$tmp"' EXIT - -changed=0 -python3 - "$NODES_JSON" <<'PY' > "$tmp" || { -import ipaddress, json, sys, socket -path=sys.argv[1] -data=json.load(open(path)) if path else [] -def resolve(host): - try: - infos=socket.getaddrinfo(host,None,family=socket.AF_INET) - ips=[i[4][0] for i in infos] - # prefer 10. over 172.31. - for ip in ips: - if ip.startswith('10.'): return ip - for ip in ips: - if ip.startswith('172.31.'): return ip - return ips[0] if ips else None - except OSError: - return None -gw=ipaddress.ip_network('172.22.0.0/16') -out=[] -changed=False -for item in data: - ip=item.get('ip') - host=item.get('hostname') or '' - try: - bad = ip and ipaddress.ip_address(ip) in gw - except Exception: - bad = False - if bad and host: - new=resolve(host) - if new: - item=dict(item) - item['ip']=new - changed=True - out.append(item) -json.dump(out, sys.stdout, ensure_ascii=False) -sys.stderr.write('CHANGED' if changed else 'UNCHANGED') -PY - -status=$? -marker=$(tail -n1 /dev/stderr 2>/dev/null || true) -if [[ "$status" -ne 0 ]]; then - echo "[ERROR] failed to rewrite nodes.json" >&2 - exit 1 -fi - -if grep -q '"ip"\s*:\s*"172\.22\.' "$tmp"; then - echo "[WARN] some gwbridge targets remain; manual fix may be required" >&2 -fi - -mv "$tmp" "$NODES_JSON" -echo "[OK] nodes.json updated" - -# try to reload Prometheus -if docker ps --format '{{.Names}}' | grep -q '^argus-prometheus$'; then - docker exec argus-prometheus sh -lc 'pidof prometheus >/dev/null 2>&1 && kill -HUP $(pidof prometheus) || supervisorctl restart prometheus' >/dev/null 2>&1 || true - echo "[INFO] Prometheus reloaded" -fi - -exit 0 - diff --git a/deployment/build/templates/scripts/server-diagnose.sh b/deployment/build/templates/scripts/server-diagnose.sh deleted file mode 100755 index 4f3d65b..0000000 --- a/deployment/build/templates/scripts/server-diagnose.sh +++ /dev/null @@ -1,198 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" - -ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a - -ts="$(date -u +%Y%m%d-%H%M%SZ)" -LOG_DIR="$ROOT/logs" -mkdir -p "$LOG_DIR" || true -# Fallback to /tmp when logs dir is not writable -if ! ( : > "$LOG_DIR/.w" 2>/dev/null ); then - LOG_DIR="/tmp/argus-logs" - mkdir -p "$LOG_DIR" || true -fi -DETAILS="$LOG_DIR/diagnose_details_${ts}.log" -ERRORS="$LOG_DIR/diagnose_error_${ts}.log" -: > "$DETAILS"; : > "$ERRORS" - -logd() { echo "$(date '+%F %T') $*" >> "$DETAILS"; } -append_err() { echo "$*" >> "$ERRORS"; } - -http_code() { curl -s -o /dev/null -w "%{http_code}" "$1" || echo 000; } -http_body_head() { curl -s --max-time 3 "$1" 2>/dev/null | sed -n '1,5p' || true; } -header_val() { curl -s -D - -o /dev/null "$@" | awk -F': ' 'BEGIN{IGNORECASE=1}$1=="Access-Control-Allow-Origin"{gsub("\r","",$2);print $2}'; } - -section() { - local name="$1"; logd "===== [$name] ====="; } - -svc() { - local svc_name="$1"; local cname="$2"; shift 2 - section "$svc_name ($cname)" - logd "docker ps:"; docker ps -a --format '{{.Names}}\t{{.Status}}\t{{.Image}}' | awk -v n="$cname" '$1==n' >> "$DETAILS" || true - logd "docker inspect (state/restartcount):"; docker inspect -f '{{.State.Status}} rc={{.RestartCount}} started={{.State.StartedAt}}' "$cname" >> "$DETAILS" 2>&1 || true - logd "last 200 container logs:"; docker logs --tail 200 "$cname" >> "$DETAILS" 2>&1 || true - - # extract error lines from container logs - docker logs --tail 200 "$cname" 2>&1 | \ - grep -Ei '\b(error|failed|fail|exception|panic|fatal|critical|unhealthy|permission denied|forbidden|refused|traceback|错误|失败)\b' | \ - sed "s/^/[${svc_name}][container] /" >> "$ERRORS" || true - - # supervisor status and logs - if docker exec "$cname" sh -lc 'command -v supervisorctl >/dev/null 2>&1' >/dev/null 2>&1; then - logd "supervisorctl status:"; docker exec "$cname" sh -lc 'supervisorctl status' >> "$DETAILS" 2>&1 || true - # iterate supervisor logs and collect tails + errors per file - local files - files=$(docker exec "$cname" sh -lc 'ls /var/log/supervisor/*.log 2>/dev/null' || true) - for f in $files; do - logd "tail -n 80 $f:"; docker exec "$cname" sh -lc "tail -n 80 $f 2>/dev/null || true" >> "$DETAILS" 2>&1 || true - docker exec "$cname" sh -lc "tail -n 200 $f 2>/dev/null" 2>/dev/null | \ - grep -Ei '\b(error|failed|fail|exception|panic|fatal|critical|unhealthy|permission denied|forbidden|refused|traceback|错误|失败)\b' | \ - sed "s/^/[${svc_name}][supervisor:$(basename $f)] /" >> "$ERRORS" || true - done - fi -} - -# Core services -svc bind argus-bind-sys -svc master argus-master-sys -svc es argus-es-sys -svc kibana argus-kibana-sys -svc ftp argus-ftp -svc prometheus argus-prometheus -svc grafana argus-grafana -svc alertmanager argus-alertmanager -svc web-frontend argus-web-frontend -svc web-proxy argus-web-proxy - -# HTTP checks (host side) -section HTTP -logd "ES: $(http_code "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health")" -http_body_head "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health" >> "$DETAILS" 2>&1 || true - -logd "Kibana: $(http_code "http://localhost:${KIBANA_PORT:-5601}/api/status")" -http_body_head "http://localhost:${KIBANA_PORT:-5601}/api/status" >> "$DETAILS" 2>&1 || true - -logd "Master readyz: $(http_code "http://localhost:${MASTER_PORT:-32300}/readyz")" - -logd "Prometheus: $(http_code "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready")" -logd "Grafana: $(http_code "http://localhost:${GRAFANA_PORT:-3000}/api/health")" -http_body_head "http://localhost:${GRAFANA_PORT:-3000}/api/health" >> "$DETAILS" 2>&1 || true -logd "Alertmanager: $(http_code "http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status")" - -cors8084=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8084:-8084}/api/v2/status" || true) -cors8085=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8085:-8085}/api/v1/master/nodes" || true) -logd "Web-Proxy 8080: $(http_code "http://localhost:${WEB_PROXY_PORT_8080:-8080}/")" -logd "Web-Proxy 8083: $(http_code "http://localhost:${WEB_PROXY_PORT_8083:-8083}/")" -logd "Web-Proxy 8084 CORS: ${cors8084}" -logd "Web-Proxy 8085 CORS: ${cors8085}" - -# Elasticsearch deep checks: disk watermark and Kibana index status -section ES-CHECKS -ch=$(curl -s --max-time 3 "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health" || true) -status=$(printf '%s' "$ch" | awk -F'\"' '/"status"/{print $4; exit}') -if [[ -n "$status" ]]; then logd "cluster.status=$status"; fi -if [[ "$status" != "green" ]]; then append_err "[es][cluster] status=$status"; fi - -if docker ps --format '{{.Names}}' | grep -q '^argus-es-sys$'; then - duse=$(docker exec argus-es-sys sh -lc 'df -P /usr/share/elasticsearch/data | awk "NR==2{print \$5}"' 2>/dev/null || true) - logd "es.data.df_use=$duse" - usep=${duse%%%} - if [[ -n "$usep" ]] && (( usep >= 90 )); then - append_err "[es][disk] data path usage ${duse} (>=90%) likely hit high/flood watermarks" - echo "HINT: High ES disk usage. Free space or run scripts/es-watermark-relax.sh (then scripts/es-watermark-restore.sh)." >&2 - fi -fi - -ks=$(curl -s --max-time 3 "http://localhost:${ES_HTTP_PORT:-9200}/_cat/shards/.kibana*?h=index,shard,prirep,state,unassigned.reason" || true) -if printf '%s' "$ks" | grep -Eiq '\\b(UNASSIGNED|INITIALIZING|RELOCATING)\\b'; then - append_err "[kibana][index] .kibana* shards not green"; logd "$ks" - echo "HINT: .kibana* shards not green. On single-node, set replicas=0 and ensure ES disk watermarks are not exceeded." >&2 -fi - -# Overlay network diagnostics -section OVERLAY-NET -if docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" >/dev/null 2>&1; then - logd "overlay present: ${OVERLAY_NET_NAME:-argus-sys-net}" - docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" | sed -n '1,60p' >> "$DETAILS" 2>/dev/null || true -else - append_err "[overlay][network] missing ${OVERLAY_NET_NAME:-argus-sys-net}" -fi - -# Domain resolution & reachability from inside web-proxy (bind-backed) -section DOMAIN -for d in master.argus.com ftp.argus.com kibana.argus.com es.log.argus.com prom.argus.com grafana.argus.com; do - logd "getent $d (web-proxy):" - docker exec argus-web-proxy sh -lc "getent hosts $d || true" >> "$DETAILS" 2>&1 || true -done -logd "HTTP (web-proxy): master.readyz=$(docker exec argus-web-proxy sh -lc \"curl -s -o /dev/null -w '%{http_code}' http://master.argus.com:3000/readyz\" 2>/dev/null || echo 000)" -logd "HTTP (web-proxy): es.health=$(docker exec argus-web-proxy sh -lc \"curl -s -o /dev/null -w '%{http_code}' http://es.log.argus.com:9200/_cluster/health\" 2>/dev/null || echo 000)" -logd "HTTP (web-proxy): kibana.status=$(docker exec argus-web-proxy sh -lc \"curl -s -o /dev/null -w '%{http_code}' http://kibana.argus.com:5601/api/status\" 2>/dev/null || echo 000)" - -# FTP share writability (container perspective) -section FTP-SHARE -docker exec argus-ftp sh -lc 'ls -ld /private/argus/ftp /private/argus/ftp/share; test -w /private/argus/ftp/share && echo "write:OK" || echo "write:FAIL"' >> "$DETAILS" 2>&1 || true - -# Collect system info for context -section SYSTEM -logd "uname -a:"; uname -a >> "$DETAILS" -logd "docker version:"; docker version --format '{{.Server.Version}}' >> "$DETAILS" 2>&1 || true -logd "compose ps:"; (cd "$ROOT/compose" && if docker compose version >/dev/null 2>&1; then docker compose -p argus-sys ps; else docker-compose -p argus-sys ps; fi) >> "$DETAILS" 2>&1 || true - -section SUMMARY -# Add HTTP failures and CORS problems to error log with tags -[[ $(http_code "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health") != 200 ]] && echo "[es][http] health not 200" >> "$ERRORS" -kbcode=$(http_code "http://localhost:${KIBANA_PORT:-5601}/api/status"); [[ "$kbcode" != 200 ]] && echo "[kibana][http] /api/status=$kbcode" >> "$ERRORS" -[[ $(http_code "http://localhost:${MASTER_PORT:-32300}/readyz") != 200 ]] && echo "[master][http] /readyz not 200" >> "$ERRORS" -[[ $(http_code "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready") != 200 ]] && echo "[prometheus][http] /-/ready not 200" >> "$ERRORS" -gfcode=$(http_code "http://localhost:${GRAFANA_PORT:-3000}/api/health"); [[ "$gfcode" != 200 ]] && echo "[grafana][http] /api/health=$gfcode" >> "$ERRORS" -[[ $(http_code "http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status") != 200 ]] && echo "[alertmanager][http] /api/v2/status not 200" >> "$ERRORS" -[[ -z "$cors8084" ]] && echo "[web-proxy][cors] 8084 missing Access-Control-Allow-Origin" >> "$ERRORS" -[[ -z "$cors8085" ]] && echo "[web-proxy][cors] 8085 missing Access-Control-Allow-Origin" >> "$ERRORS" - -# Deduplicate errors -sort -u -o "$ERRORS" "$ERRORS" - -# --- Prometheus targets & nodes.json checks --- -section PROMETHEUS-TARGETS -nodes_json_path="$ROOT/private/argus/metric/prometheus/nodes.json" -if [[ -f "$nodes_json_path" ]]; then - logd "nodes.json present: $nodes_json_path" - # detect gwbridge addresses (172.22/16) - if grep -E '"ip"\s*:\s*"172\.22\.' "$nodes_json_path" >/dev/null 2>&1; then - append_err "[prometheus][targets] contains gwbridge (172.22/16) addresses; prefer overlay (10.0/8)." - echo "HINT: nodes.json has 172.22.x.x targets. Ensure agent publishes overlay_ip and master prefers it." >&2 - fi -else - logd "nodes.json missing at $nodes_json_path" -fi - -# Query Prometheus activeTargets and list down items when possible -pt_json=$(curl -s --max-time 3 "http://localhost:${PROMETHEUS_PORT:-9090}/api/v1/targets" || true) -if command -v jq >/dev/null 2>&1; then - downs=$(printf '%s' "$pt_json" | jq -r '.data.activeTargets[] | select(.health=="down") | "[prometheus][activeTargets] down url=\(.scrapeUrl) lastError=\(.lastError)"' 2>/dev/null || true) - if [[ -n "$downs" ]]; then - printf '%s\n' "$downs" >> "$ERRORS" - fi -else - # best-effort grep when jq is unavailable - if printf '%s' "$pt_json" | grep -q '"health":"down"'; then - append_err "[prometheus][activeTargets] some targets are down (enable jq for detailed reasons)" - fi -fi - -echo "Diagnostic details -> $DETAILS" -echo "Detected errors -> $ERRORS" - -if [[ "$LOG_DIR" == "$ROOT/logs" ]]; then - # maintain latest symlinks when writing under package logs - ln -sfn "$(basename "$DETAILS")" "$ROOT/logs/diagnose_details.log" 2>/dev/null || cp "$DETAILS" "$ROOT/logs/diagnose_details.log" 2>/dev/null || true - ln -sfn "$(basename "$ERRORS")" "$ROOT/logs/diagnose_error.log" 2>/dev/null || cp "$ERRORS" "$ROOT/logs/diagnose_error.log" 2>/dev/null || true -else - echo "Diagnostic details -> $DETAILS" - echo "Detected errors -> $ERRORS" -fi - -exit 0 diff --git a/deployment/build/templates/scripts/server-install.sh b/deployment/build/templates/scripts/server-install.sh deleted file mode 100755 index a48216b..0000000 --- a/deployment/build/templates/scripts/server-install.sh +++ /dev/null @@ -1,365 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PKG_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" # version root - -PROJECT_NAME="argus-sys" - -log() { echo -e "\033[0;34m[INSTALL]\033[0m $*"; } -err() { echo -e "\033[0;31m[ERROR ]\033[0m $*" >&2; } - -require() { command -v "$1" >/dev/null 2>&1 || { err "missing command: $1"; exit 1; }; } - -require docker -if docker compose version >/dev/null 2>&1; then COMPOSE=(docker compose); else require docker-compose; COMPOSE=(docker-compose); fi - -ENV_FILE="$PKG_ROOT/compose/.env" -ENV_TEMPLATE="$PKG_ROOT/compose/.env.example" - -find_free_port() { - local prefer="$1"; local start=${2:-20000}; local max=${3:-65000}; - if ! ss -ltnH 2>/dev/null | awk -v pat=":"$prefer"$" '$4 ~ pat{f=1} END{exit f?0:1}'; then echo "$prefer"; return; fi - for ((p=start; p<=max; p++)); do - if ! ss -ltnH 2>/dev/null | awk -v pat=":"$p"$" '$4 ~ pat{f=1} END{exit f?0:1}'; then echo "$p"; return; fi - done - return 1 -} - -prepare_env() { - if [[ -f "$ENV_FILE" ]]; then log ".env exists, keep as-is"; return; fi - [[ -f "$ENV_TEMPLATE" ]] || { err "missing $ENV_TEMPLATE"; exit 1; } - cp "$ENV_TEMPLATE" "$ENV_FILE" - # overlay 模式下,避免为不同服务分配到同一新端口;保持模板端口不做自动改写 -} - -# read VAR from .env (simple parser) -_read_env_var() { local var="$1"; local f="$ENV_FILE"; [[ -f "$f" ]] || return 1; awk -F'=' -v k="$var" 'BEGIN{found=0} $1==k{print substr($0,index($0,"=")+1); found=1; exit} END{exit found?0:1}' "$f"; } - -# set or append VAR=VAL in .env atomically -_set_env_var() { - local var="$1"; local val="$2"; local f="$ENV_FILE"; local tmp="$ENV_FILE.tmp$$" - if [[ -f "$f" ]] && grep -qE "^${var}=" "$f"; then - sed -E "s#^(${var}=).*#\\1${val}#" "$f" >"$tmp" && mv "$tmp" "$f" - else - [[ -f "$f" ]] || : >"$f" - printf "%s=%s\n" "$var" "$val" >>"$f" - fi -} - -auto_assign_ports() { - local enable="${AUTO_ASSIGN_PORTS:-true}" - case "$enable" in - 0|false|no|off) log "AUTO_ASSIGN_PORTS disabled"; return;; - esac - [[ -f "$ENV_FILE" ]] || return 0 - log "auto-assigning free host ports (with fallback)" - cp "$ENV_FILE" "$ENV_FILE.bak.$(date +%Y%m%d-%H%M%S)" || true - - # list of VAR:default pairs to try; FTP 相关端口与被动端口范围不自动改写 - local pairs=( - "MASTER_PORT:32300" - "ES_HTTP_PORT:9200" - "KIBANA_PORT:5601" - "PROMETHEUS_PORT:9090" - "ALERTMANAGER_PORT:9093" - "GRAFANA_PORT:3000" - "WEB_PROXY_PORT_8080:8080" - "WEB_PROXY_PORT_8081:8081" - "WEB_PROXY_PORT_8082:8082" - "WEB_PROXY_PORT_8083:8083" - "WEB_PROXY_PORT_8084:8084" - "WEB_PROXY_PORT_8085:8085" - ) - - # track ports reserved in this run to avoid duplicates - declare -A reserved=() - # pre-mark currently listening ports to avoid choosing them twice within the same run - while read -r lp; do reserved["$lp"]=1; done < <(ss -ltnH 2>/dev/null | awk '{print $4}' | sed -n 's#.*:##p') - - for ent in "${pairs[@]}"; do - local var=${ent%%:*}; local def=${ent##*:} - local cur - if ! cur=$(_read_env_var "$var"); then cur="$def"; fi - # strip quotes if any - cur=${cur%\r}; cur=${cur%\n}; cur=${cur//\"/} - # find a free port, avoiding ones we already reserved in this loop - local cand="$cur" - # if already in use or reserved, pick a free one - if ss -ltnH 2>/dev/null | awk -v pat=":"$cand"$" '$4 ~ pat{f=1} END{exit f?0:1}'; then - cand=$(find_free_port "$cand" 20000 65000) - fi - # avoid duplicates chosen in this loop - local attempts=0 - while [[ -n "${reserved[$cand]:-}" ]]; do - attempts=$((attempts+1)) - local start=$((cand+1)); [[ $start -lt 20000 ]] && start=20000 - local next - next=$(find_free_port "$start" "$start" 65000 || true) - if [[ -z "$next" ]]; then - next=$(find_free_port 20000 20000 65000 || true) - fi - if [[ -z "$next" || "$next" == "$cand" ]]; then - err "no free port available while assigning for $var (last tried: $cand)"; exit 1 - fi - cand="$next" - if (( attempts > 1000 )); then err "port assignment loop exceeded for $var"; exit 1; fi - done - reserved["$cand"]=1 - if [[ "$cand" != "$cur" ]]; then - log " port reassigned: $var $cur -> $cand" - _set_env_var "$var" "$cand" - else - # ensure the var exists in .env for clarity - _set_env_var "$var" "$cand" - fi - done -} - -prepare_data_dirs() { - if [[ $EUID -ne 0 ]]; then - echo -e "\033[1;33m[WARN]\033[0m running as non-root: will not chown data dirs." - echo -e "\033[1;33m[WARN]\033[0m If you hit Permission denied, run: sudo $SCRIPT_DIR/server-prepare-dirs.sh" - # still ensure basic directories exist (no chown) - mkdir -p \ - "$PKG_ROOT/private/argus/etc" \ - "$PKG_ROOT/private/argus/log/elasticsearch" \ - "$PKG_ROOT/private/argus/log/kibana" \ - "$PKG_ROOT/private/argus/metric/prometheus" \ - "$PKG_ROOT/private/argus/metric/prometheus/data" \ - "$PKG_ROOT/private/argus/metric/prometheus/rules" \ - "$PKG_ROOT/private/argus/metric/grafana" \ - "$PKG_ROOT/private/argus/metric/grafana/data" \ - "$PKG_ROOT/private/argus/metric/grafana/logs" \ - "$PKG_ROOT/private/argus/metric/grafana/plugins" \ - "$PKG_ROOT/private/argus/metric/grafana/provisioning/datasources" \ - "$PKG_ROOT/private/argus/metric/grafana/provisioning/dashboards" \ - "$PKG_ROOT/private/argus/metric/grafana/data/sessions" \ - "$PKG_ROOT/private/argus/metric/grafana/data/dashboards" \ - "$PKG_ROOT/private/argus/alert/alertmanager" \ - "$PKG_ROOT/private/argus/metric/ftp/share" - # non-root: relax permissions to avoid container UID mismatch blocking writes - chmod -R a+rwx "$PKG_ROOT/private/argus" 2>/dev/null || true - fi -} - -ensure_swarm_and_overlay() { - local net_name="${OVERLAY_NET_NAME:-argus-sys-net}" - # Require swarm active - local state - state=$(docker info --format '{{.Swarm.LocalNodeState}}' 2>/dev/null || echo "") - if [[ "$state" != "active" ]]; then - err "Docker Swarm is not active. On this host run:" - err " docker swarm init --advertise-addr " - exit 1 - fi - # Create attachable overlay if missing - if ! docker network inspect "$net_name" >/dev/null 2>&1; then - log "creating attachable overlay network: $net_name" - docker network create --driver overlay --attachable "$net_name" >/dev/null - fi -} - -bootstrap_dns_conf() { - local etc_dir="$PKG_ROOT/private/argus/etc" - mkdir -p "$etc_dir" - local dns_file="$etc_dir/dns.conf" - if [[ ! -s "$dns_file" ]]; then - # detect host primary IP - local host_ip - host_ip=$(ip route get 1.1.1.1 2>/dev/null | awk '/src/ {print $7; exit}') - [[ -z "$host_ip" ]] && host_ip=$(hostname -I 2>/dev/null | awk '{print $1}') - if [[ -n "$host_ip" ]]; then - echo "$host_ip" > "$dns_file" - log "wrote initial dns.conf with host IP: $host_ip" - else - err "failed to determine host IP for dns.conf; please edit $dns_file manually" - fi - fi -} - -load_images() { - local tar="$PKG_ROOT/images/all-images.tar.gz" - [[ -f "$tar" ]] || { err "missing images tar: $tar"; exit 1; } - log "loading images from $(basename "$tar") (may take minutes)" - gunzip -c "$tar" | docker load >/dev/null -} - -bring_up() { - log "starting services via compose" - ensure_swarm_and_overlay - bootstrap_dns_conf - local ov="$PKG_ROOT/compose/docker-compose.os-compat.override.yml" - if [[ ! -f "$ov" ]]; then - cat > "$ov" <<'YAML' -services: - bind: - security_opt: ["label=disable"] - userns_mode: "host" - tmpfs: - - /run/named - master: - security_opt: ["label=disable"] - userns_mode: "host" - es: - security_opt: ["label=disable"] - userns_mode: "host" - kibana: - security_opt: ["label=disable"] - userns_mode: "host" - ftp: - security_opt: ["label=disable"] - userns_mode: "host" - prometheus: - security_opt: ["label=disable"] - userns_mode: "host" - grafana: - security_opt: ["label=disable"] - userns_mode: "host" - alertmanager: - security_opt: ["label=disable"] - userns_mode: "host" - # ensure runtime path matches container expectation - volumes: - - ../private/argus/etc:/private/argus/etc - - ../private/argus/alert/alertmanager:/alertmanager - web-frontend: - security_opt: ["label=disable"] - userns_mode: "host" - web-proxy: - security_opt: ["label=disable"] - userns_mode: "host" -YAML - log "generated OS-compat override: $(basename "$ov")" - fi - # 仅启动服务端组件,避免误起测试节点(node-a/node-b/test-node/test-gpu-node) - local services=(bind master es kibana ftp prometheus grafana alertmanager web-frontend web-proxy) - log "services: ${services[*]}" - (cd "$PKG_ROOT/compose" && "${COMPOSE[@]}" -p "$PROJECT_NAME" -f docker-compose.yml -f $(basename "$ov") up -d "${services[@]}") -} - -# Post bootstrap container-side fixes that do not require sudo on host. -post_bootstrap_fixes() { - # Kibana: ensure /usr/share/kibana/data is a symlink into mounted path to avoid EACCES - if docker ps --format '{{.Names}}' | grep -q '^argus-kibana-sys$'; then - docker exec argus-kibana-sys bash -lc ' - set -e - mkdir -p /private/argus/log/kibana && chmod 777 /private/argus/log/kibana || true - if [ -d /usr/share/kibana/data ] && [ ! -L /usr/share/kibana/data ]; then rm -rf /usr/share/kibana/data; fi - if [ ! -e /usr/share/kibana/data ]; then ln -s /private/argus/log/kibana /usr/share/kibana/data; fi - ' >/dev/null 2>&1 || true - fi - # Elasticsearch: ensure data path points to mounted path and is writable - if docker ps --format '{{.Names}}' | grep -q '^argus-es-sys$'; then - docker exec argus-es-sys bash -lc ' - set -e - mkdir -p /private/argus/log/elasticsearch && chmod 777 /private/argus/log/elasticsearch || true - if [ -d /usr/share/elasticsearch/data ] && [ ! -L /usr/share/elasticsearch/data ]; then rm -rf /usr/share/elasticsearch/data; fi - if [ ! -e /usr/share/elasticsearch/data ]; then ln -s /private/argus/log/elasticsearch /usr/share/elasticsearch/data; fi - ' >/dev/null 2>&1 || true - fi - # Bind9: ensure rndc.key exists - if docker ps --format '{{.Names}}' | grep -q '^argus-bind-sys$'; then - docker exec argus-bind-sys bash -lc ' - set -e - mkdir -p /etc/bind - if [ ! -f /etc/bind/rndc.key ]; then rndc-confgen -a -c /etc/bind/rndc.key; fi - chmod 644 /etc/bind/rndc.key || true - ' >/dev/null 2>&1 || true - fi -} - -dns_bootstrap() { - log "DNS bootstrap: initializing shared dns.conf and container resolv.conf" - local etc_dir="$PKG_ROOT/private/argus/etc" - mkdir -p "$etc_dir" - # 1) ensure dns.conf exists (fallback to bind IP 172.31.0.2) - if [[ ! -s "$etc_dir/dns.conf" ]]; then - if echo "172.31.0.2" > "$etc_dir/dns.conf" 2>/dev/null; then - log "wrote fallback dns.conf with 172.31.0.2" - else - # host-side write denied (ownership 1000:1000); write via bind container instead - if docker ps --format '{{.Names}}' | grep -q '^argus-bind-sys$'; then - docker exec argus-bind-sys sh -lc 'echo 172.31.0.2 > /private/argus/etc/dns.conf && chmod 644 /private/argus/etc/dns.conf' || true - log "fallback dns.conf written via bind container" - else - log "bind not ready; skip writing fallback dns.conf" - fi - fi - fi - # 2) wait briefly for bind to copy update-dns.sh into shared etc (bind startup.sh does this) - local i=0 - while [[ ! -x "$etc_dir/update-dns.sh" && $i -lt 20 ]]; do - sleep 0.5; ((i++)); - done - if [[ ! -x "$etc_dir/update-dns.sh" ]]; then - log "update-dns.sh not present yet; continuing with existing resolv.conf" - fi - # 3) run update-dns.sh inside key containers so /etc/resolv.conf points to bind - local c - for c in argus-master-sys argus-es-sys argus-kibana-sys argus-grafana argus-prometheus argus-ftp argus-web-frontend argus-web-proxy argus-alertmanager; do - if docker ps --format '{{.Names}}' | grep -q "^${c}$"; then - docker exec "$c" sh -lc 'test -x /private/argus/etc/update-dns.sh && /private/argus/etc/update-dns.sh || true' >/dev/null 2>&1 || true - fi - done - # 4) wait for service A-record hint files generated by services (best-effort) - local need=( es.log.argus.com kibana.log.argus.com master.argus.com grafana.metric.argus.com prom.metric.argus.com alertmanager.alert.argus.com ) - local waited=0; local missing=1 - while (( waited < 15 )); do - missing=0 - for f in "${need[@]}"; do [[ -s "$etc_dir/$f" ]] || { missing=1; break; }; done - [[ $missing -eq 0 ]] && break - sleep 1; ((waited++)) - done - # 5) reload bind zone (script uses supervisor to restart bind9) - if docker ps --format '{{.Names}}' | grep -q '^argus-bind-sys$'; then - docker exec argus-bind-sys sh -lc '/usr/local/bin/reload-bind9.sh' >/dev/null 2>&1 || true - fi - # 6) restart web-proxy once to re-render nginx resolver with latest dns.conf - if docker ps --format '{{.Names}}' | grep -q '^argus-web-proxy$'; then - docker restart argus-web-proxy >/dev/null 2>&1 || true - fi -} - -selfcheck() { - # Initial selfcheck with retries to absorb cold starts - local max_retries="${SELF_CHECK_RETRIES:-5}" # 重试次数(不含首次),默认 5 - local wait_seconds="${SELF_CHECK_WAIT_SECONDS:-30}" # 每次重试前等待秒数,默认 30s - - local attempt=0 - while :; do - attempt=$((attempt+1)) - if (( attempt == 1 )); then - log "running selfcheck (attempt ${attempt})" - else - log "running selfcheck (attempt ${attempt}/${max_retries}+1)" - fi - - if bash "$PKG_ROOT/scripts/server-selfcheck.sh"; then - return 0 - fi - - # failed - if (( attempt > max_retries )); then - err "selfcheck failed after ${attempt} attempt(s)" - exit 1 - fi - log "selfcheck not ready yet; retrying in ${wait_seconds}s..." - sleep "$wait_seconds" - done -} - -main() { - mkdir -p "$PKG_ROOT/logs" - prepare_env - auto_assign_ports - prepare_data_dirs - load_images - bring_up - post_bootstrap_fixes - dns_bootstrap - selfcheck - log "install completed. See logs in $PKG_ROOT/logs/" -} - -main "$@" diff --git a/deployment/build/templates/scripts/server-prepare-dirs.sh b/deployment/build/templates/scripts/server-prepare-dirs.sh deleted file mode 100755 index 3be214d..0000000 --- a/deployment/build/templates/scripts/server-prepare-dirs.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PKG_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" - -if [[ $EUID -ne 0 ]]; then - echo "[PREPARE] This script requires root (sudo)." >&2 - echo " Try: sudo $0" >&2 - exit 1 -fi - -ENV_FILE="$PKG_ROOT/compose/.env" -[[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a -UIDV="${ARGUS_BUILD_UID:-1000}"; GIDV="${ARGUS_BUILD_GID:-1000}" - -echo "[PREPARE] Using owner ${UIDV}:${GIDV}" - -# Core etc and service data dirs (aligned with src/sys/tests/scripts/01_bootstrap.sh) -mkdir -p \ - "$PKG_ROOT/private/argus/etc" \ - "$PKG_ROOT/private/argus/bind" \ - "$PKG_ROOT/private/argus/master" \ - "$PKG_ROOT/private/argus/agent" \ - "$PKG_ROOT/private/argus/log/elasticsearch" \ - "$PKG_ROOT/private/argus/log/kibana" - -# Prometheus -mkdir -p \ - "$PKG_ROOT/private/argus/metric/prometheus" \ - "$PKG_ROOT/private/argus/metric/prometheus/data" \ - "$PKG_ROOT/private/argus/metric/prometheus/rules" \ - "$PKG_ROOT/private/argus/metric/prometheus/targets" - -# Grafana -mkdir -p \ - "$PKG_ROOT/private/argus/metric/grafana" \ - "$PKG_ROOT/private/argus/metric/grafana/data" \ - "$PKG_ROOT/private/argus/metric/grafana/logs" \ - "$PKG_ROOT/private/argus/metric/grafana/plugins" \ - "$PKG_ROOT/private/argus/metric/grafana/provisioning/datasources" \ - "$PKG_ROOT/private/argus/metric/grafana/provisioning/dashboards" \ - "$PKG_ROOT/private/argus/metric/grafana/data/sessions" \ - "$PKG_ROOT/private/argus/metric/grafana/data/dashboards" \ - "$PKG_ROOT/private/argus/metric/grafana/config" - -# FTP -mkdir -p "$PKG_ROOT/private/argus/metric/ftp/share" - -# Alertmanager -mkdir -p "$PKG_ROOT/private/argus/alert/alertmanager" - -chown -R "$UIDV":"$GIDV" \ - "$PKG_ROOT/private/argus/etc" \ - "$PKG_ROOT/private/argus/bind" \ - "$PKG_ROOT/private/argus/master" \ - "$PKG_ROOT/private/argus/agent" \ - "$PKG_ROOT/private/argus/log/elasticsearch" \ - "$PKG_ROOT/private/argus/log/kibana" \ - "$PKG_ROOT/private/argus/metric/prometheus" \ - "$PKG_ROOT/private/argus/metric/grafana" \ - "$PKG_ROOT/private/argus/metric/ftp" \ - "$PKG_ROOT/private/argus/alert" - -chmod -R g+w "$PKG_ROOT/private/argus/alert" "$PKG_ROOT/private/argus/etc" || true - -# Ensure parent directories also owned by runtime user for consistency -chown "$UIDV":"$GIDV" \ - "$PKG_ROOT/private/argus" \ - "$PKG_ROOT/private/argus/log" \ - "$PKG_ROOT/private/argus/metric" || true - -echo "[PREPARE] Done. You can now run server-install.sh" diff --git a/deployment/build/templates/scripts/server-selfcheck.sh b/deployment/build/templates/scripts/server-selfcheck.sh deleted file mode 100755 index 204ecdc..0000000 --- a/deployment/build/templates/scripts/server-selfcheck.sh +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" - -log() { echo -e "\033[0;34m[CHECK]\033[0m $*"; } -err() { echo -e "\033[0;31m[ERROR]\033[0m $*" >&2; } - -ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a - -wait_http() { local url="$1"; local attempts=${2:-120}; local i=1; while ((i<=attempts)); do curl -fsS "$url" >/dev/null 2>&1 && return 0; echo "[..] waiting $url ($i/$attempts)"; sleep 5; ((i++)); done; return 1; } -code_for() { curl -s -o /dev/null -w "%{http_code}" "$1" || echo 000; } -header_val() { curl -s -D - -o /dev/null "$@" | awk -F': ' 'BEGIN{IGNORECASE=1}$1=="Access-Control-Allow-Origin"{gsub("\r","",$2);print $2}'; } - -LOG_DIR="$ROOT/logs" -mkdir -p "$LOG_DIR" || true -OUT_JSON="$LOG_DIR/selfcheck.json" -tmp=$(mktemp) - -ok=1 - -log "checking overlay network" -net_ok=false -if docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" >/dev/null 2>&1; then - if docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" | grep -q '"Driver": "overlay"'; then net_ok=true; fi -fi -[[ "$net_ok" == true ]] || ok=0 - -log "checking Elasticsearch (via domain inside web-proxy)" -if docker exec argus-web-proxy sh -lc "curl -fsS http://es.log.argus.com:9200/_cluster/health?wait_for_status=yellow&timeout=1s" >/dev/null 2>&1; then es_ok=true; else es_ok=false; ok=0; fi - -log "checking Kibana (via domain inside web-proxy)" -kb_code=$(docker exec argus-web-proxy sh -lc "curl -s -o /dev/null -w '%{http_code}' http://kibana.log.argus.com:5601/api/status" || echo 000) -kb_ok=false -if [[ "$kb_code" == "200" ]]; then body=$(curl -sS "http://localhost:${KIBANA_PORT:-5601}/api/status"); echo "$body" | grep -q '"level":"available"' && kb_ok=true; fi -[[ "$kb_ok" == true ]] || ok=0 - -log "checking Master (via domain inside web-proxy)" -if docker exec argus-web-proxy sh -lc "curl -fsS http://master.argus.com:3000/readyz" >/dev/null 2>&1; then true; else ok=0; fi - -log "checking FTP" -if docker ps --format '{{.Names}}' | grep -q '^argus-ftp$'; then - if docker exec argus-ftp sh -lc 'test -w /private/argus/ftp/share'; then ftp_ok=true; else ftp_ok=false; ok=0; fi -else - ftp_ok=false; ok=0; -fi - -log "checking Prometheus" -wait_http "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready" 60 || ok=0 - -log "checking Grafana" -gf_code=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:${GRAFANA_PORT:-3000}/api/health" || echo 000) -gf_ok=false; if [[ "$gf_code" == "200" ]]; then body=$(curl -sS "http://localhost:${GRAFANA_PORT:-3000}/api/health"); echo "$body" | grep -q '"database"\s*:\s*"ok"' && gf_ok=true; fi -[[ "$gf_ok" == true ]] || ok=0 - -log "checking Alertmanager" -wait_http "http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status" 60 || ok=0 - -log "checking Web-Proxy" -p8080=$(code_for "http://localhost:${WEB_PROXY_PORT_8080:-8080}/") -p8083=$(code_for "http://localhost:${WEB_PROXY_PORT_8083:-8083}/") -cors8084=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8084:-8084}/api/v2/status" || true) -cors8085=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8085:-8085}/api/v1/master/nodes" || true) -wp_ok=true -# 有些环境首页可能 403,此处接受 200/403 -([[ "$p8080" == 200 || "$p8080" == 403 ]]) || wp_ok=false -([[ "$p8083" == 200 || "$p8083" == 302 || "$p8083" == 403 ]]) || wp_ok=false -[[ -n "$cors8084" && -n "$cors8085" ]] || wp_ok=false -[[ "$wp_ok" == true ]] || ok=0 - -cat > "$tmp" </dev/null; then - # fallback when logs dir not writable (no sudo allowed) - OUT_JSON="/tmp/selfcheck_$(date -u +%Y%m%d-%H%M%SZ).json" - cp "$tmp" "$OUT_JSON" - log "selfcheck.json written to $OUT_JSON (logs dir not writable)" -fi -if [[ "$ok" == 1 ]]; then - log "selfcheck OK" - exit 0 -else - err "selfcheck FAILED (see $OUT_JSON)" - # If diagnose script exists, run it to collect more details - if [[ -x "$SCRIPT_DIR/server-diagnose.sh" ]]; then - # run diagnose; it will print the actual timestamped file paths and update 'latest' symlinks - "$SCRIPT_DIR/server-diagnose.sh" || true - fi - exit 1 -fi diff --git a/deployment/build/templates/scripts/server-status.sh b/deployment/build/templates/scripts/server-status.sh deleted file mode 100755 index d9886db..0000000 --- a/deployment/build/templates/scripts/server-status.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" - -PROJECT_NAME="argus-sys" - -if docker compose version >/dev/null 2>&1; then COMPOSE=(docker compose); else COMPOSE=(docker-compose); fi - -echo "== Containers ==" -(cd "$ROOT/compose" && "${COMPOSE[@]}" -p "$PROJECT_NAME" ps) - -echo -echo "== Key Endpoints ==" -ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a -printf "master http://localhost:%s/readyz\n" "${MASTER_PORT:-32300}" -printf "es http://localhost:%s/_cluster/health\n" "${ES_HTTP_PORT:-9200}" -printf "kibana http://localhost:%s/api/status\n" "${KIBANA_PORT:-5601}" -printf "prom http://localhost:%s/-/ready\n" "${PROMETHEUS_PORT:-9090}" -printf "grafana http://localhost:%s/api/health\n" "${GRAFANA_PORT:-3000}" -printf "alert http://localhost:%s/api/v2/status\n" "${ALERTMANAGER_PORT:-9093}" -printf "web http://localhost:%s/ (8080)\n" "${WEB_PROXY_PORT_8080:-8080}" - -echo -echo "== Selfcheck result ==" -cat "$ROOT/logs/selfcheck.json" 2>/dev/null || echo "(no selfcheck yet)" - diff --git a/deployment/build/templates/scripts/server-uninstall.sh b/deployment/build/templates/scripts/server-uninstall.sh deleted file mode 100755 index 86c7688..0000000 --- a/deployment/build/templates/scripts/server-uninstall.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PKG_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" - -PROJECT_NAME="argus-sys" - -log() { echo -e "\033[0;34m[UNINSTALL]\033[0m $*"; } - -if docker compose version >/dev/null 2>&1; then COMPOSE=(docker compose); else COMPOSE=(docker-compose); fi - -(cd "$PKG_ROOT/compose" && "${COMPOSE[@]}" -p "$PROJECT_NAME" down -v || true) -log "compose stack removed" -log "you may remove data under $PKG_ROOT/private if you want a clean slate" - diff --git a/scripts/common/build_user.sh b/scripts/common/build_user.sh index c8f5c08..bbea2c6 100644 --- a/scripts/common/build_user.sh +++ b/scripts/common/build_user.sh @@ -37,22 +37,11 @@ _argus_is_number() { [[ "$1" =~ ^[0-9]+$ ]] } -load_build_user() { - if [[ "$_ARGUS_BUILD_USER_LOADED" == "1" ]]; then - return 0 - fi - - local project_root config_files config uid gid - project_root="$(argus_project_root)" - config_files=( - "$project_root/configs/build_user.local.conf" - "$project_root/configs/build_user.conf" - ) - - uid="$ARGUS_BUILD_UID_DEFAULT" - gid="$ARGUS_BUILD_GID_DEFAULT" - - for config in "${config_files[@]}"; do +_argus_read_user_from_files() { + local uid_out_var="$1" gid_out_var="$2"; shift 2 + local uid_val="$ARGUS_BUILD_UID_DEFAULT" gid_val="$ARGUS_BUILD_GID_DEFAULT" + local config + for config in "$@"; do if [[ -f "$config" ]]; then while IFS= read -r raw_line || [[ -n "$raw_line" ]]; do local line key value @@ -68,42 +57,58 @@ load_build_user() { key="$(_argus_trim "$key")" value="$(_argus_trim "$value")" case "$key" in - UID) - uid="$value" - ;; - GID) - gid="$value" - ;; - *) - echo "[ARGUS build_user] Unknown key '$key' in $config" >&2 - ;; + UID) uid_val="$value" ;; + GID) gid_val="$value" ;; + *) echo "[ARGUS build_user] Unknown key '$key' in $config" >&2 ;; esac done < "$config" break fi done + printf -v "$uid_out_var" '%s' "$uid_val" + printf -v "$gid_out_var" '%s' "$gid_val" +} - if [[ -n "${ARGUS_BUILD_UID:-}" ]]; then - uid="$ARGUS_BUILD_UID" - fi - if [[ -n "${ARGUS_BUILD_GID:-}" ]]; then - gid="$ARGUS_BUILD_GID" +load_build_user_profile() { + local profile="${1:-default}" + if [[ "$_ARGUS_BUILD_USER_LOADED" == "1" ]]; then + return 0 fi + local project_root uid gid + project_root="$(argus_project_root)" + case "$profile" in + pkg) + _argus_read_user_from_files uid gid \ + "$project_root/configs/build_user.pkg.conf" \ + "$project_root/configs/build_user.local.conf" \ + "$project_root/configs/build_user.conf" + ;; + default|*) + _argus_read_user_from_files uid gid \ + "$project_root/configs/build_user.local.conf" \ + "$project_root/configs/build_user.conf" + ;; + esac + + if [[ -n "${ARGUS_BUILD_UID:-}" ]]; then uid="$ARGUS_BUILD_UID"; fi + if [[ -n "${ARGUS_BUILD_GID:-}" ]]; then gid="$ARGUS_BUILD_GID"; fi if ! _argus_is_number "$uid"; then - echo "[ARGUS build_user] Invalid UID '$uid'" >&2 - return 1 + echo "[ARGUS build_user] Invalid UID '$uid'" >&2; return 1 fi if ! _argus_is_number "$gid"; then - echo "[ARGUS build_user] Invalid GID '$gid'" >&2 - return 1 + echo "[ARGUS build_user] Invalid GID '$gid'" >&2; return 1 fi - export ARGUS_BUILD_UID="$uid" export ARGUS_BUILD_GID="$gid" _ARGUS_BUILD_USER_LOADED=1 } +load_build_user() { + local profile="${ARGUS_BUILD_PROFILE:-default}" + load_build_user_profile "$profile" +} + argus_build_user_args() { load_build_user printf '%s' "--build-arg ARGUS_BUILD_UID=${ARGUS_BUILD_UID} --build-arg ARGUS_BUILD_GID=${ARGUS_BUILD_GID}" diff --git a/src/metric/client-plugins/all-in-one-full/config/VERSION b/src/metric/client-plugins/all-in-one-full/config/VERSION index b978278..372cf40 100644 --- a/src/metric/client-plugins/all-in-one-full/config/VERSION +++ b/src/metric/client-plugins/all-in-one-full/config/VERSION @@ -1 +1 @@ -1.43.0 +1.44.0 diff --git a/src/sys/tests/scripts/16_web_verify.sh b/src/sys/tests/scripts/16_web_verify.sh old mode 100644 new mode 100755 diff --git a/src/web/.gitignore b/src/web/.gitignore index c3702b0..ceca42e 100644 --- a/src/web/.gitignore +++ b/src/web/.gitignore @@ -7,6 +7,7 @@ playwright-report/ # Build output /dist /build +/test-results # Dependency directories jspm_packages/