完成H20服务器部署及重启测试 #51

Merged
yuyr merged 27 commits from dev_1.1.0_yuyr_nobind into dev_1.0.0 2025-11-25 15:54:30 +08:00
36 changed files with 111 additions and 2059 deletions
Showing only changes of commit b402fdf960 - Show all commits

View File

@ -43,6 +43,7 @@ no_cache=false
bundle_date="" bundle_date=""
client_semver="" client_semver=""
cuda_ver="12.2.2" cuda_ver="12.2.2"
DEFAULT_IMAGE_TAG="latest"
while [[ $# -gt 0 ]]; do while [[ $# -gt 0 ]]; do
case $1 in case $1 in
@ -126,6 +127,16 @@ fi
cd "$root" cd "$root"
# Set default image tag policy before building
if [[ "$build_server_pkg" == true ]]; then
DEFAULT_IMAGE_TAG="${bundle_date:-latest}"
fi
# Select build user profile for pkg vs default
if [[ "$build_server_pkg" == true || "$build_client_pkg" == true ]]; then
export ARGUS_BUILD_PROFILE=pkg
fi
load_build_user load_build_user
build_args+=("--build-arg" "ARGUS_BUILD_UID=${ARGUS_BUILD_UID}" "--build-arg" "ARGUS_BUILD_GID=${ARGUS_BUILD_GID}") build_args+=("--build-arg" "ARGUS_BUILD_UID=${ARGUS_BUILD_UID}" "--build-arg" "ARGUS_BUILD_GID=${ARGUS_BUILD_GID}")
@ -188,13 +199,31 @@ build_image() {
echo " Tag: $tag" echo " Tag: $tag"
echo " Context: $context" echo " Context: $context"
if docker build "${build_args[@]}" "${extra_args[@]}" -f "$dockerfile_path" -t "$tag" "$context"; then local tries=${ARGUS_BUILD_RETRIES:-3}
local delay=${ARGUS_BUILD_RETRY_DELAY:-5}
local attempt=1
while (( attempt <= tries )); do
local prefix=""
if (( attempt == tries )); then
# final attempt: disable BuildKit to avoid docker/dockerfile front-end pulls
prefix="DOCKER_BUILDKIT=0"
echo " Attempt ${attempt}/${tries} (fallback: DOCKER_BUILDKIT=0)"
else
echo " Attempt ${attempt}/${tries}"
fi
if eval $prefix docker build "${build_args[@]}" "${extra_args[@]}" -f "$dockerfile_path" -t "$tag" "$context"; then
echo "$image_name image built successfully" echo "$image_name image built successfully"
return 0 return 0
else
echo "❌ Failed to build $image_name image"
return 1
fi fi
echo "⚠️ Build failed for $image_name (attempt ${attempt}/${tries})."
if (( attempt < tries )); then
echo " Retrying in ${delay}s..."
sleep "$delay"
fi
attempt=$((attempt+1))
done
echo "❌ Failed to build $image_name image after ${tries} attempts"
return 1
} }
pull_base_image() { pull_base_image() {
@ -390,8 +419,10 @@ build_gpu_bundle_image() {
--build-arg CLIENT_VER="$use_version" \ --build-arg CLIENT_VER="$use_version" \
--build-arg BUNDLE_DATE="$date_tag"; then --build-arg BUNDLE_DATE="$date_tag"; then
images_built+=("$image_tag") images_built+=("$image_tag")
# also tag latest for convenience # In non-pkg mode, also tag latest for convenience
if [[ "${ARGUS_PKG_BUILD:-0}" != "1" ]]; then
docker tag "$image_tag" argus-sys-metric-test-node-bundle-gpu:latest >/dev/null 2>&1 || true docker tag "$image_tag" argus-sys-metric-test-node-bundle-gpu:latest >/dev/null 2>&1 || true
fi
return 0 return 0
else else
return 1 return 1
@ -427,10 +458,13 @@ build_server_pkg_bundle() {
argus-metric-ftp argus-metric-prometheus argus-metric-grafana \ argus-metric-ftp argus-metric-prometheus argus-metric-grafana \
argus-alertmanager argus-web-frontend argus-web-proxy argus-alertmanager argus-web-frontend argus-web-proxy
) )
echo "\n🔖 Tagging server images with :$date_tag and collecting digests" echo "\n🔖 Verifying server images with :$date_tag and collecting digests"
if ! ensure_version_tags "$date_tag" "${repos[@]}"; then for repo in "${repos[@]}"; do
if ! docker image inspect "$repo:$date_tag" >/dev/null 2>&1; then
echo "❌ required image missing: $repo:$date_tag (build phase should have produced it)" >&2
return 1 return 1
fi fi
done
# Optional: show digests # Optional: show digests
for repo in "${repos[@]}"; do for repo in "${repos[@]}"; do
local digest local digest
@ -457,6 +491,8 @@ build_client_pkg_bundle() {
local bundle_tag="argus-sys-metric-test-node-bundle-gpu:${date_tag}" local bundle_tag="argus-sys-metric-test-node-bundle-gpu:${date_tag}"
if ! docker image inspect "$bundle_tag" >/dev/null 2>&1; then if ! docker image inspect "$bundle_tag" >/dev/null 2>&1; then
echo "\n🧩 GPU bundle image $bundle_tag missing; building it first..." echo "\n🧩 GPU bundle image $bundle_tag missing; building it first..."
ARGUS_PKG_BUILD=1
export ARGUS_PKG_BUILD
if ! build_gpu_bundle_image "$date_tag" "$cuda" "$semver"; then if ! build_gpu_bundle_image "$date_tag" "$cuda" "$semver"; then
return 1 return 1
fi fi
@ -472,24 +508,24 @@ build_client_pkg_bundle() {
} }
if [[ "$build_core" == true ]]; then if [[ "$build_core" == true ]]; then
if build_image "Elasticsearch" "src/log/elasticsearch/build/Dockerfile" "argus-elasticsearch:latest"; then if build_image "Elasticsearch" "src/log/elasticsearch/build/Dockerfile" "argus-elasticsearch:${DEFAULT_IMAGE_TAG}"; then
images_built+=("argus-elasticsearch:latest") images_built+=("argus-elasticsearch:${DEFAULT_IMAGE_TAG}")
else else
build_failed=true build_failed=true
fi fi
echo "" echo ""
if build_image "Kibana" "src/log/kibana/build/Dockerfile" "argus-kibana:latest"; then if build_image "Kibana" "src/log/kibana/build/Dockerfile" "argus-kibana:${DEFAULT_IMAGE_TAG}"; then
images_built+=("argus-kibana:latest") images_built+=("argus-kibana:${DEFAULT_IMAGE_TAG}")
else else
build_failed=true build_failed=true
fi fi
echo "" echo ""
if build_image "BIND9" "src/bind/build/Dockerfile" "argus-bind9:latest"; then if build_image "BIND9" "src/bind/build/Dockerfile" "argus-bind9:${DEFAULT_IMAGE_TAG}"; then
images_built+=("argus-bind9:latest") images_built+=("argus-bind9:${DEFAULT_IMAGE_TAG}")
else else
build_failed=true build_failed=true
fi fi
@ -501,7 +537,7 @@ if [[ "$build_master" == true ]]; then
echo "" echo ""
echo "🔄 Building Master image..." echo "🔄 Building Master image..."
pushd "$master_root" >/dev/null pushd "$master_root" >/dev/null
master_args=("--tag" "argus-master:latest") master_args=("--tag" "argus-master:${DEFAULT_IMAGE_TAG}")
if [[ "$use_intranet" == true ]]; then if [[ "$use_intranet" == true ]]; then
master_args+=("--intranet") master_args+=("--intranet")
fi fi
@ -515,7 +551,7 @@ if [[ "$build_master" == true ]]; then
if [[ "$build_master_offline" == true ]]; then if [[ "$build_master_offline" == true ]]; then
images_built+=("argus-master:offline") images_built+=("argus-master:offline")
else else
images_built+=("argus-master:latest") images_built+=("argus-master:${DEFAULT_IMAGE_TAG}")
fi fi
else else
build_failed=true build_failed=true
@ -540,9 +576,9 @@ if [[ "$build_metric" == true ]]; then
done done
metric_builds=( metric_builds=(
"Metric FTP|src/metric/ftp/build/Dockerfile|argus-metric-ftp:latest|src/metric/ftp/build" "Metric FTP|src/metric/ftp/build/Dockerfile|argus-metric-ftp:${DEFAULT_IMAGE_TAG}|src/metric/ftp/build"
"Metric Prometheus|src/metric/prometheus/build/Dockerfile|argus-metric-prometheus:latest|src/metric/prometheus/build" "Metric Prometheus|src/metric/prometheus/build/Dockerfile|argus-metric-prometheus:${DEFAULT_IMAGE_TAG}|src/metric/prometheus/build"
"Metric Grafana|src/metric/grafana/build/Dockerfile|argus-metric-grafana:latest|src/metric/grafana/build" "Metric Grafana|src/metric/grafana/build/Dockerfile|argus-metric-grafana:${DEFAULT_IMAGE_TAG}|src/metric/grafana/build"
) )
for build_spec in "${metric_builds[@]}"; do for build_spec in "${metric_builds[@]}"; do
@ -614,8 +650,8 @@ if [[ "$build_web" == true || "$build_alert" == true ]]; then
if [[ "$build_web" == true ]]; then if [[ "$build_web" == true ]]; then
web_builds=( web_builds=(
"Web Frontend|src/web/build_tools/frontend/Dockerfile|argus-web-frontend:latest|." "Web Frontend|src/web/build_tools/frontend/Dockerfile|argus-web-frontend:${DEFAULT_IMAGE_TAG}|."
"Web Proxy|src/web/build_tools/proxy/Dockerfile|argus-web-proxy:latest|." "Web Proxy|src/web/build_tools/proxy/Dockerfile|argus-web-proxy:${DEFAULT_IMAGE_TAG}|."
) )
for build_spec in "${web_builds[@]}"; do for build_spec in "${web_builds[@]}"; do
IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec" IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec"
@ -630,7 +666,7 @@ if [[ "$build_web" == true || "$build_alert" == true ]]; then
if [[ "$build_alert" == true ]]; then if [[ "$build_alert" == true ]]; then
alert_builds=( alert_builds=(
"Alertmanager|src/alert/alertmanager/build/Dockerfile|argus-alertmanager:latest|." "Alertmanager|src/alert/alertmanager/build/Dockerfile|argus-alertmanager:${DEFAULT_IMAGE_TAG}|."
) )
for build_spec in "${alert_builds[@]}"; do for build_spec in "${alert_builds[@]}"; do
IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec" IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec"

View File

@ -0,0 +1,6 @@
# Default build-time UID/GID for Argus images
# Override by creating configs/build_user.local.conf with the same format.
# Syntax: KEY=VALUE, supports UID/GID only. Whitespace and lines starting with # are ignored.
UID=2133
GID=2015

View File

@ -1 +0,0 @@
artifact/

View File

@ -1,16 +0,0 @@
# Deployment Build Toolkit
This folder provides scripts to produce offline server/client packages and publish the client package to FTP.
Commands
- build_server_package.sh [--version YYYYMMDD]
- build_client_package.sh [--version YYYYMMDD]
- publish_client.sh --version YYYYMMDD --server <host> --user ftpuser --password <pass> [--port 21]
Outputs
- deployment/artifact/server/<YYYYMMDD>/
- deployment/artifact/client/<YYYYMMDD>/
Notes
- Server package contains docker images (single all-images.tar.gz), compose/, scripts/, docs/, private/ skeleton.
- Client package reuses all-in-one-full artifact, repacked as argus-metric_<YYYYMMDD>.tar.gz (compatible with setup.sh).

View File

@ -1,90 +0,0 @@
#!/usr/bin/env bash
# Build the Argus client offline package by repacking the all-in-one-full
# plugin artifact into argus-metric_<version>.tar.gz plus publish helpers.
# Output: deployment/artifact/client/<YYYYMMDD>/
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
BUILD_DIR="$ROOT_DIR/deployment/build"
ART_ROOT="$ROOT_DIR/deployment/artifact"
. "$BUILD_DIR/common.sh"

usage() { cat <<'EOF'
Build Argus Client Offline Package
Usage: build_client_package.sh [--version YYYYMMDD] [--out DIR]
Produces: deployment/artifact/client/<YYYYMMDD>/argus-metric_<YYYYMMDD>.tar.gz
EOF
}

VERSION="$(today_version)"
OUT_DIR=""
while [[ $# -gt 0 ]]; do
  case "$1" in
    --version) VERSION="$2"; shift 2;;
    --out) OUT_DIR="$2"; shift 2;;
    -h|--help) usage; exit 0;;
    *) err "unknown arg: $1"; usage; exit 1;;
  esac
done

PKG_DIR="${OUT_DIR:-$ART_ROOT/client/$VERSION}"
make_dir "$PKG_DIR"

log "Packaging client from all-in-one-full artifact"
PLUGIN_DIR="$ROOT_DIR/src/metric/client-plugins/all-in-one-full"
require_cmd bash tar gzip
(cd "$PLUGIN_DIR" && bash scripts/package_artifact.sh --force)

# Pick the most recently modified artifact directory.
ART_BASE="$PLUGIN_DIR/artifact"
latest_dir=$(ls -1dt "$ART_BASE"/*/ 2>/dev/null | head -n1 || true)
[[ -n "$latest_dir" ]] || { err "no client artifact found in $ART_BASE"; exit 1; }

tmpdir=$(mktemp -d)
trap 'rm -rf "$tmpdir"' EXIT

# Filter-only copy: keep install_order files + scripts + deps + version.json
mkdir -p "$tmpdir/src"
cp -f "$latest_dir/version.json" "$tmpdir/src/version.json"
if command -v jq >/dev/null 2>&1; then
  mapfile -t files < <(jq -r '.install_order[]' "$latest_dir/version.json")
else
  # Fallback without jq: scrape *.tar.gz entries following "install_order".
  # mapfile keeps each name a single element even if it contains spaces
  # (the previous `files=( $(grep …) )` form word-split the output).
  mapfile -t files < <(grep -E '"install_order"' -A10 "$latest_dir/version.json" \
    | sed -n 's/\s*"\(.*\.tar\.gz\)".*/\1/p')
fi
for f in "${files[@]}"; do
  # Use a full `if` here: a bare `[[ … ]] && cp` that is false on the last
  # iteration makes the loop exit non-zero and aborts the script under set -e.
  if [[ -f "$latest_dir/$f" ]]; then
    cp -f "$latest_dir/$f" "$tmpdir/src/$f"
  fi
done
for aux in install.sh uninstall.sh check_health.sh check_version.sh sync_dns.sh restart_unhealthy.sh config.env; do
  if [[ -f "$latest_dir/$aux" ]]; then
    cp -f "$latest_dir/$aux" "$tmpdir/src/$aux"
  fi
done
if [[ -d "$latest_dir/deps" ]]; then
  mkdir -p "$tmpdir/src/deps"
  rsync -a "$latest_dir/deps/" "$tmpdir/src/deps/" >/dev/null 2>&1 || cp -r "$latest_dir/deps/." "$tmpdir/src/deps/"
fi

out_name="argus-metric_$(echo "$VERSION" | sed 's/\./_/g').tar.gz"
(cd "$tmpdir/src" && tar -czf "$PKG_DIR/$out_name" .)
log "Client package ready: $PKG_DIR/$out_name"
echo "$VERSION" > "$PKG_DIR/LATEST_VERSION"

# include publish helper and setup.sh for convenience (place first)
PUBLISH_TPL="$BUILD_DIR/templates/client/publish.sh"
if [[ -f "$PUBLISH_TPL" ]]; then
  cp "$PUBLISH_TPL" "$PKG_DIR/publish.sh"
fi
# also place a copy of setup.sh alongside
SETUP_SRC="$ROOT_DIR/src/metric/client-plugins/all-in-one-full/scripts/setup.sh"
[[ -f "$SETUP_SRC" ]] && cp "$SETUP_SRC" "$PKG_DIR/setup.sh" || true

# docs for end users (this may overwrite file modes), then fix execute bits
CLIENT_DOC_DIR="$BUILD_DIR/templates/client"
if [[ -d "$CLIENT_DOC_DIR" ]]; then
  rsync -a "$CLIENT_DOC_DIR/" "$PKG_DIR/" >/dev/null 2>&1 || cp -r "$CLIENT_DOC_DIR/." "$PKG_DIR/"
fi
# ensure helpers are executable
chmod +x "$PKG_DIR/publish.sh" "$PKG_DIR/setup.sh" 2>/dev/null || true
exit 0

View File

@ -1,39 +0,0 @@
#!/usr/bin/env bash
# Thin wrapper around deployment/build/build_images.sh that produces the
# CPU node-bundle image argus-sys-metric-test-node-bundle:latest.
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
cd "$ROOT_DIR"

usage() {
  cat <<EOF
Build CPU node-bundle image (wrapper)
Usage: $(basename "$0") [--client-version YYYYMMDD]
Examples:
  $(basename "$0") --client-version 20251106
  $(basename "$0") # auto-detect artifact version via packaging
EOF
}

client_version=""
while [[ $# -gt 0 ]]; do
  case "$1" in
    --client-version) client_version="${2:-}"; shift 2;;
    -h|--help) usage; exit 0;;
    *) echo "Unknown arg: $1" >&2; usage; exit 1;;
  esac
done

# Delegate to the central build script; forward the version only when given.
build_cmd=("./deployment/build/build_images.sh" "--with-node-bundle")
if [[ -n "$client_version" ]]; then
  build_cmd+=("--client-version" "$client_version")
fi

echo "[CPU-BUNDLE] invoking: ${build_cmd[*]}"
"${build_cmd[@]}"

echo "[CPU-BUNDLE] built image: argus-sys-metric-test-node-bundle:latest"
if ! docker image inspect argus-sys-metric-test-node-bundle:latest >/dev/null 2>&1; then
  echo "[ERR] expected image not found" >&2
  exit 1
fi
echo "[CPU-BUNDLE] done"

View File

@ -1,49 +0,0 @@
#!/usr/bin/env bash
# Wrapper that builds the node-bundle image on top of the GPU test base
# image and re-tags the generic bundle to the requested output tag.
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
cd "$ROOT_DIR"

usage() {
  cat <<EOF
Build GPU node-bundle image (wrapper)
Usage: $(basename "$0") [--client-version YYYYMMDD] [--tag IMAGE:TAG]
Defaults:
  base-image = argus-sys-metric-test-gpu-node:latest
  output tag = argus-sys-metric-test-node-bundle-gpu:latest
Examples:
  $(basename "$0") --client-version 20251106
  $(basename "$0") --client-version 20251106 --tag myrepo/node-bundle-gpu:20251106
EOF
}

client_version=""
out_tag="argus-sys-metric-test-node-bundle-gpu:latest"
while [[ $# -gt 0 ]]; do
  case "$1" in
    --client-version) client_version="${2:-}"; shift 2;;
    --tag) out_tag="${2:-}"; shift 2;;
    -h|--help) usage; exit 0;;
    *) echo "Unknown arg: $1" >&2; usage; exit 1;;
  esac
done

base_image="argus-sys-metric-test-gpu-node:latest"
build_cmd=("./deployment/build/build_images.sh" "--with-node-bundle" "--base-image" "$base_image")
if [[ -n "$client_version" ]]; then
  build_cmd+=("--client-version" "$client_version")
fi

echo "[GPU-BUNDLE] invoking: ${build_cmd[*]}"
"${build_cmd[@]}"

# The central script always tags the result as the generic bundle name;
# verify it exists, then re-tag to the requested GPU output tag.
echo "[GPU-BUNDLE] re-tagging to $out_tag"
if ! docker image inspect argus-sys-metric-test-node-bundle:latest >/dev/null 2>&1; then
  echo "[ERR] base bundle image missing: argus-sys-metric-test-node-bundle:latest" >&2
  exit 1
fi
docker tag argus-sys-metric-test-node-bundle:latest "$out_tag"
docker image inspect "$out_tag" >/dev/null 2>&1 || { echo "[ERR] re-tag failed" >&2; exit 1; }
echo "[GPU-BUNDLE] built image: $out_tag (base=$base_image)"

View File

@ -1,98 +0,0 @@
#!/usr/bin/env bash
# Build Argus images. With --with-node-bundle, bake the client package
# (argus-metric tarball) into a node-bundle image on top of BASE_IMAGE.
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
. "$ROOT_DIR/deployment/build/common.sh"

usage() {
  cat <<EOF
Build Argus images (optional node-bundle)
Usage: build_images.sh [--with-node-bundle] [--client-version YYYYMMDD] [--base-image NAME[:TAG]]
Examples:
  ./deployment/build/build_images.sh --with-node-bundle --client-version 20251106
EOF
}

WITH_BUNDLE=false
CLIENT_VERSION=""
BASE_IMAGE="argus-sys-metric-test-node:latest"
while [[ $# -gt 0 ]]; do
  case "$1" in
    --with-node-bundle) WITH_BUNDLE=true; shift;;
    --client-version) CLIENT_VERSION="$2"; shift 2;;
    --base-image) BASE_IMAGE="$2"; shift 2;;
    -h|--help) usage; exit 0;;
    *) err "unknown arg: $1"; usage; exit 1;;
  esac
done

if [[ "$WITH_BUNDLE" == true ]]; then
  require_cmd docker tar gzip
  BUNDLE_DIR="$ROOT_DIR/src/sys/build/node-bundle"
  CTX_DIR="$BUNDLE_DIR"
  TMP_BUNDLE="$BUNDLE_DIR/bundle"
  rm -rf "$TMP_BUNDLE"; mkdir -p "$TMP_BUNDLE"
  # Build or locate client artifact
  PLUGIN_DIR="$ROOT_DIR/src/metric/client-plugins/all-in-one-full"
  # CLIENT_VERSION accepts two forms:
  # - an artifact version such as 1.42.0 (the default form)
  # - a packaging date YYYYMMDD, resolved to the embedded artifact version
  #   by unpacking deployment/artifact/client/<date>/argus-metric_<date>.tar.gz
  if [[ -z "$CLIENT_VERSION" ]]; then
    # No version given: package the plugin now and read back the version it produced.
    pushd "$PLUGIN_DIR" >/dev/null
    bash scripts/package_artifact.sh --force
    CLIENT_VERSION=$(cat artifact/*/version.json 2>/dev/null | sed -n 's/.*"version"\s*:\s*"\([^"]\+\)".*/\1/p' | tail -n1)
    popd >/dev/null
    [[ -n "$CLIENT_VERSION" ]] || { err "failed to detect client version"; exit 1; }
  else
    if [[ "$CLIENT_VERSION" =~ ^[0-9]{8}$ ]]; then
      # Date form: locate the dated client package under deployment/artifact/client/.
      PKG_DIR="$ROOT_DIR/deployment/artifact/client/$CLIENT_VERSION"
      TAR_PKG="$PKG_DIR/argus-metric_${CLIENT_VERSION}.tar.gz"
      [[ -f "$TAR_PKG" ]] || { err "client date package not found: $TAR_PKG"; exit 1; }
      # Unpack to read the embedded version.json
      tmpd=$(mktemp -d); trap 'rm -rf "$tmpd"' EXIT
      tar -xzf "$TAR_PKG" -C "$tmpd"
      if [[ -f "$tmpd/version.json" ]]; then
        ART_VER=$(sed -n 's/.*"version"\s*:\s*"\([^"]\+\)".*/\1/p' "$tmpd/version.json" | head -n1)
        [[ -n "$ART_VER" ]] || { err "failed to parse artifact version from date package"; exit 1; }
        CLIENT_VERSION="$ART_VER"
        # Use the dated tar directly as the bundle source.
        cp "$TAR_PKG" "$TMP_BUNDLE/argus-metric_$(echo "$ART_VER" | tr '.' '_').tar.gz"
        # Also copy setup.sh alongside when present.
        [[ -f "$PKG_DIR/setup.sh" ]] && cp "$PKG_DIR/setup.sh" "$TMP_BUNDLE/" || true
      else
        err "version.json missing in client date package"
        exit 1
      fi
    else
      # Assume an artifact version directory; package the plugin if it is absent.
      pushd "$PLUGIN_DIR" >/dev/null
      [[ -d "artifact/$CLIENT_VERSION" ]] || bash scripts/package_artifact.sh --force
      popd >/dev/null
    fi
  fi
  # If the tar was not staged from a date package above, take it from the
  # plugin's artifact directory for the resolved version.
  TAR_NAME="argus-metric_$(echo "$CLIENT_VERSION" | tr '.' '_').tar.gz"
  if [[ ! -f "$TMP_BUNDLE/$TAR_NAME" ]]; then
    SRC_TAR="$PLUGIN_DIR/artifact/$CLIENT_VERSION/$TAR_NAME"
    [[ -f "$SRC_TAR" ]] || { err "missing client tar: $SRC_TAR"; exit 1; }
    cp "$SRC_TAR" "$TMP_BUNDLE/"
    # also include setup.sh for fallback
    # NOTE(review): this looks only in *today's* date directory regardless of
    # CLIENT_VERSION — confirm that is intended when building older versions.
    if [[ -f "$ROOT_DIR/deployment/artifact/client/$(today_version)/setup.sh" ]]; then
      cp "$ROOT_DIR/deployment/artifact/client/$(today_version)/setup.sh" "$TMP_BUNDLE/" || true
    fi
  fi
  log "Building node-bundle image with client version: $CLIENT_VERSION"
  # BuildKit disabled here; presumably the bundle Dockerfile relies on
  # classic-builder behavior — TODO confirm.
  DOCKER_BUILDKIT=0 docker build \
    --build-arg CLIENT_VER="$CLIENT_VERSION" \
    --build-arg BASE_IMAGE="$BASE_IMAGE" \
    -t argus-sys-metric-test-node-bundle:latest \
    -f "$BUNDLE_DIR/Dockerfile" "$BUNDLE_DIR"
  log "Built image: argus-sys-metric-test-node-bundle:latest"
fi
log "Done."

View File

@ -1,139 +0,0 @@
#!/usr/bin/env bash
# Build the Argus server offline package: compose files, docker images,
# scripts, docs, checksums. Everything is staged in a temp dir, then synced
# to deployment/artifact/server/<version>/ and tarred for distribution.
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
BUILD_DIR="$ROOT_DIR/deployment/build"
ART_ROOT="$ROOT_DIR/deployment/artifact"
. "$BUILD_DIR/common.sh"

usage() { cat <<'EOF'
Build Argus Server Offline Package
Usage: build_server_package.sh [--version YYYYMMDD] [--out DIR] [--resave-image]
Outputs into deployment/artifact/server/<YYYYMMDD>/ by default.
EOF
}

VERSION="$(today_version)"
OUT_DIR=""
RESAVE_IMAGE=false
while [[ $# -gt 0 ]]; do
  case "$1" in
    --version) VERSION="$2"; shift 2;;
    --out) OUT_DIR="$2"; shift 2;;
    --resave-image) RESAVE_IMAGE=true; shift;;
    -h|--help) usage; exit 0;;
    *) err "unknown arg: $1"; usage; exit 1;;
  esac
done

PKG_DIR="${OUT_DIR:-$ART_ROOT/server/$VERSION}"
STAGE="$(mktemp -d)"
trap 'rm -rf "$STAGE"' EXIT
log "Version: $VERSION"
log "Staging: $STAGE"

# 1) Layout
make_dir "$STAGE/images"
make_dir "$STAGE/compose"
make_dir "$STAGE/scripts"
make_dir "$STAGE/docs"
make_dir "$STAGE/private/argus"

# 2) Compose: derive from sys/tests by removing test-only services
SRC_COMPOSE="$ROOT_DIR/src/sys/tests/docker-compose.yml"
[[ -f "$SRC_COMPOSE" ]] || { err "missing $SRC_COMPOSE"; exit 1; }
# 2.1 filter out test services
tmp_compose1="$STAGE/compose/docker-compose.filtered.yml"
awk -f "$BUILD_DIR/templates/docker-compose.filter.awk" -v remove="node-a,node-b,test-node,test-gpu-node" "$SRC_COMPOSE" > "$tmp_compose1"
# 2.2 transform to external overlay network (remove sysnet and per-service blocks)
awk -f "$BUILD_DIR/templates/docker-compose.overlay.awk" "$tmp_compose1" > "$STAGE/compose/docker-compose.yml"
cp "$BUILD_DIR/templates/.env.example" "$STAGE/compose/.env.example"
# fix relative private path to match package layout (compose/ and private/ are siblings)
sed -i 's#\./private/#../private/#g' "$STAGE/compose/docker-compose.yml"
# also handle bind mount form without trailing slash
sed -i -E 's#- \./private:/private#- ../private:/private#g' "$STAGE/compose/docker-compose.yml"
# drop timezone file bind which may not exist on target distros (e.g. NixOS)
sed -i '/\/etc\/timezone:\/etc\/timezone:ro/d' "$STAGE/compose/docker-compose.yml"
# sanity-check: ensure test services are absent and external network present
if grep -E '^(\s{2})(node-a|node-b|test-node|test-gpu-node):\s*$' -n "$STAGE/compose/docker-compose.yml" >/dev/null; then
  err "compose filter failed: test services still present"; exit 1;
fi
if ! grep -q '^networks:' "$STAGE/compose/docker-compose.yml" || ! grep -q 'argus-sys-net:' "$STAGE/compose/docker-compose.yml"; then
  err "compose overlay transform failed: external network missing"; exit 1;
fi

# 3) Images (reuse if already exported unless --resave-image)
existing_images_tar="$PKG_DIR/images/all-images.tar.gz"
if [[ "$RESAVE_IMAGE" == false && -f "$existing_images_tar" ]]; then
  log "Reusing existing images tar: $existing_images_tar"
  cp "$existing_images_tar" "$STAGE/images/"
elif [[ "$RESAVE_IMAGE" == false ]]; then
  # Try cross-version reuse from latest server_*.tar.gz
  latest_pkg=$(ls -1t "$ART_ROOT/server"/server_*.tar.gz 2>/dev/null | head -n1 || true)
  if [[ -n "$latest_pkg" ]]; then
    log "Reusing images from: $latest_pkg"
    mkdir -p "$STAGE/images"
    # extract matching file regardless of top-level dir
    if tar -xzf "$latest_pkg" -C "$STAGE" --wildcards '*/images/all-images.tar.gz' 2>/dev/null; then
      # locate and move
      found=$(find "$STAGE" -type f -path '*/images/all-images.tar.gz' | head -n1 || true)
      if [[ -n "$found" ]]; then
        mv "$found" "$STAGE/images/all-images.tar.gz"
        # cleanup leftover extracted dir
        dir_to_clean=$(dirname "$found")
        rm -rf "${dir_to_clean%/images}" 2>/dev/null || true
      fi
    fi
  fi
fi
# If still not present, save from local docker daemon
if [[ ! -f "$STAGE/images/all-images.tar.gz" ]]; then
  require_cmd docker gzip
  images=(
    argus-bind9:latest
    argus-master:latest
    argus-elasticsearch:latest
    argus-kibana:latest
    argus-metric-ftp:latest
    argus-metric-prometheus:latest
    argus-metric-grafana:latest
    argus-alertmanager:latest
    argus-web-frontend:latest
    argus-web-proxy:latest
  )
  log "Saving images: ${#images[@]}"
  tarfile="$STAGE/images/all-images.tar"
  docker save -o "$tarfile" "${images[@]}"
  gzip -f "$tarfile"
fi

# 4) Scripts & Docs
copy_tree "$BUILD_DIR/templates/scripts" "$STAGE/scripts"
copy_tree "$BUILD_DIR/templates/docs" "$STAGE/docs"
find "$STAGE/scripts" -type f -name '*.sh' -exec chmod +x {} + 2>/dev/null || true

# 5) Manifests
# NOTE(review): manifest/checksums are generated before version.json is
# written below, so version.json is not covered by either — confirm intended.
gen_manifest "$STAGE" "$STAGE/manifest.txt"
checksum_dir "$STAGE" "$STAGE/checksums.txt"

# 6) Move to artifact
make_dir "$PKG_DIR"
rsync -a "$STAGE/" "$PKG_DIR/" 2>/dev/null || cp -r "$STAGE/." "$PKG_DIR/"
log "Server package ready: $PKG_DIR"
# NOTE(review): writes the bare version string, not JSON, into a file named
# version.json — confirm downstream consumers expect plain text.
echo "$VERSION" > "$PKG_DIR/version.json"

# 7) Create distributable tarball
OUT_TAR_DIR="$(dirname "$PKG_DIR")"
OUT_TAR="$OUT_TAR_DIR/server_${VERSION}.tar.gz"
log "Creating tarball: $OUT_TAR"
(cd "$PKG_DIR/.." && tar -czf "$OUT_TAR" "$(basename "$PKG_DIR")")
log "Tarball ready: $OUT_TAR"
exit 0

View File

@ -1,33 +0,0 @@
#!/usr/bin/env bash
# Shared helpers for the deployment build scripts: logging, command checks,
# version stamps, checksums, manifests, and tree copies.
set -euo pipefail

# Colored log helpers; err writes to stderr.
log()  { echo -e "\033[0;34m[INFO]\033[0m $*"; }
warn() { echo -e "\033[1;33m[WARN]\033[0m $*"; }
err()  { echo -e "\033[0;31m[ERR ]\033[0m $*" >&2; }

# Abort the script unless every named command is available on PATH.
require_cmd() {
  for c in "$@"; do
    command -v "$c" >/dev/null 2>&1 || { err "missing command: $c"; exit 1; }
  done
}

# Today's date as YYYYMMDD — the default package version.
today_version() { date +%Y%m%d; }

# Write sha256 sums of every file under $1 into $2, in stable sorted order.
checksum_dir() {
  local dir="$1" out="$2"
  : > "$out"
  (cd "$dir" && find . -type f -print0 | sort -z | xargs -0 sha256sum) >> "$out"
}

make_dir() { mkdir -p "$1"; }

# Mirror $1 into $2; fall back to plain cp -r when rsync is unavailable.
copy_tree() {
  local src="$1" dst="$2"
  rsync -a --delete "$src/" "$dst/" 2>/dev/null || cp -r "$src/." "$dst/"
}

# List files up to depth 3 under $1 into $2, one relative path per line, sorted.
gen_manifest() {
  local root="$1" out="$2"
  : > "$out"
  (cd "$root" && find . -maxdepth 3 -type f -printf "%p\n" | sort) >> "$out"
}

View File

@ -1,32 +0,0 @@
# UID/GID for service processes
ARGUS_BUILD_UID=1000
ARGUS_BUILD_GID=1000
# Host ports (adjust if occupied)
MASTER_PORT=32300
ES_HTTP_PORT=9200
KIBANA_PORT=5601
NODE_A_PORT=2020
NODE_B_PORT=2021
PROMETHEUS_PORT=9090
GRAFANA_PORT=3000
ALERTMANAGER_PORT=9093
WEB_PROXY_PORT_8080=8080
WEB_PROXY_PORT_8081=8081
WEB_PROXY_PORT_8082=8082
WEB_PROXY_PORT_8083=8083
WEB_PROXY_PORT_8084=8084
WEB_PROXY_PORT_8085=8085
# FTP
FTP_PORT=21
FTP_DATA_PORT=20
FTP_PASSIVE_HOST_RANGE=21100-21110
FTP_PASSWORD=ZGClab1234!
FTP_DOMAIN=ftp.metric.argus.com
# GPU profile disabled by default
ENABLE_GPU=false
# External overlay network (Swarm attachable)
OVERLAY_NET_NAME=argus-sys-net

View File

@ -1,44 +0,0 @@
# Argus Metric 客户端安装指南(容器内普通用户场景)
## 准备与连通性检查
- FTP 连接(需要账号密码,默认 `ftpuser / ZGClab1234!`)
- `curl -u ftpuser:ZGClab1234! -I ftp://<FTP_IP>:21/LATEST_VERSION`
- `curl -u ftpuser:ZGClab1234! -s ftp://<FTP_IP>:21/ | head`
- 下载安装脚本
- `curl -u ftpuser:ZGClab1234! -fsSL ftp://<FTP_IP>:21/setup.sh -o /tmp/setup.sh`
- `chmod +x /tmp/setup.sh`
## 元数据与主机名
- Agent 需要元数据env/user/instance与 Master 地址:
- 方式Ahostname 形如 `env-user-instance-xxx`(推荐)
- 方式B导出环境变量
- `export AGENT_ENV=dev`
- `export AGENT_USER=<your_user>`
- `export AGENT_INSTANCE=<node_id>`
- Master 地址:
- `export MASTER_ENDPOINT=http://master.argus.com:3000`
> 安装脚本会在开头检查上述条件,缺失时会终止并给出修复提示。
## 执行安装
- 以 root 运行(容器内如为非 root 用户请切换为 root)
- `sudo /tmp/setup.sh --server <FTP_IP> --user ftpuser --password 'ZGClab1234!' --port 21`
- 如需自定义安装根目录:`--install-dir /opt/argus-metric`
提示(容器接入 overlay 网络时):
- 在执行 setup 前,先将容器内 DNS 指向 Bind9 的 overlay IP
- `echo "nameserver <BIND_OVERLAY_IP>" > /etc/resolv.conf`
- 这样 `master.argus.com`、`es.log.argus.com` 等域名即可解析;首次下载 `setup.sh` 仍建议使用 FTP 的 overlay IP。
更多快速步骤请参考:`QUICK_NODE_DEPLOY_zh.md`
## 安装后自检setup 自动执行)
- setup 会等待最多 5 分钟,确认以下条件后才报告完成:
- `/private/argus/agent/<hostname>/node.json` 已生成;
- `last_report` 在持续更新;
- `health.metric-argus-agent|metric-node-exporter|metric-fluent-bit|metric-dcgm-exporter` 均为 `healthy``error` 为空。
## 手工验证(可选)
- `cat /private/argus/agent/$(hostname)/node.json | jq '.'`
- `curl -s -o /dev/null -w "%{http_code}\n" http://localhost:9100/metrics` 应为 200
- 查看日志:`/var/log/argus-agent.log``/opt/argus-metric/versions/*/.install.log`

View File

@ -1,57 +0,0 @@
# Argus Metric 客户端发布说明FTP
本说明面向“发布人员”,讲清楚如何把客户端离线包发布到 FTP供各节点通过 `curl` 自动安装。
## 目录结构(构建后)
- `client-YYYYMMDD/`
- `argus-metric_YYYYMMDD.tar.gz` 客户端离线包
- `setup.sh` 客户端安装入口脚本(提供给节点用 curl 下载)
- `publish.sh` 发布脚本(将上述两项与 `LATEST_VERSION` 上传到 FTP
- `LATEST_VERSION` 文本(内容为 `YYYYMMDD`,或 `YYYYMMDD-rN`
- `INSTALL_CLIENT_zh.md` 本地安装指南(给使用者看,不会上载到 FTP
- `PUBLISH_CLIENT_zh.md` 本说明
> 注意:`publish.sh`/`setup.sh` 为可执行脚本;构建脚本已保证二者具有执行权限。
## 前置条件
- FTP 服务已运行(默认容器:`argus-ftp`),并打开端口 21、20、21100~21110(被动模式)。
- FTP 账号:默认 `ftpuser / ZGClab1234!`(如有更改,以实际为准)。
## 发布步骤(在 server 机器或能直连 FTP 的任意机器上)
1) 进入发布目录:
- `cd client-YYYYMMDD`
2) 执行发布:
- `./publish.sh --server <FTP_HOST> --user <USER> --password '<PASS>' [--port 21]`
- 例如在服务端本机:`./publish.sh --server localhost --user ftpuser --password 'ZGClab1234!' --port 21`
脚本会上传三类文件到 FTP 根:
- `setup.sh`
- `argus-metric_YYYYMMDD[ -rN ].tar.gz`
- `LATEST_VERSION`(内容为当前版本号)
3) 发布后验证:
- `curl -u ftpuser:****** -I ftp://<FTP_HOST>:21/LATEST_VERSION` 应返回 200
- `curl -u ftpuser:****** -fsSL ftp://<FTP_HOST>:21/LATEST_VERSION` 内容为版本号(如 `20251104`
- `curl -u ftpuser:****** -I ftp://<FTP_HOST>:21/argus-metric_YYYYMMDD.tar.gz` 返回 200
## 节点侧使用方式(摘要)
- 首次下载用 FTP 的“IP 地址”:
- `curl -u ftpuser:****** -fsSL ftp://<FTP_IP>:21/setup.sh -o /tmp/setup.sh && chmod +x /tmp/setup.sh`
- 执行安装:
- 必需元数据:`AGENT_ENV/AGENT_USER/AGENT_INSTANCE`,以及 `MASTER_ENDPOINT=http://master.argus.com:3000`
- `sudo /tmp/setup.sh --server <FTP_IP> --user ftpuser --password '******' --port 21`
- overlay 容器场景:
- 先将容器内 DNS 指向 Bind9 的 overlay IP`echo "nameserver <BIND_OVERLAY_IP>" > /etc/resolv.conf`
- 然后再执行上述安装;安装后约 1~2 分钟内 DNS 即可解析 `*.argus.com` 域名。
## 常见问题
- `530 Access denied`:用户名/密码错误或 FTP 目录无权限;请核对账号与 FTP 容器状态。
- `Permission denied` 执行 `publish.sh`:为脚本权限问题;`chmod +x publish.sh`。构建脚本已修复默认权限。
- 被动端口不通导致失败:请开放 2110021110。
- 客户端安装后短时 `curl http://master.argus.com:3000` 为 000:服务冷启动或 DNS 同步延迟,等待 1~2 分钟再试。
## 版本与回滚
- `LATEST_VERSION` 决定客户端默认安装的版本号。
- 如需回滚:将旧版本号写回 `LATEST_VERSION` 并重新发布(或手动指定 `--version` 安装)。

View File

@ -1,58 +0,0 @@
# Argus Metric 节点快速部署Overlay 网络容器)
本文档给出在 Docker Swarm external overlay 网络中,快速拉起一个测试节点并完成注册的最小可行步骤。
## 前提
- 服务端已在 Manager 机安装完成并运行良好(`server-selfcheck` 通过)。
- Overlay 网络名称:`argus-sys-net`(默认)。
- 已通过 FTP 发布 `setup.sh` 与客户端包,且能从 FTP 获取 `LATEST_VERSION`
- 用于测试的镜像:`argus-sys-metric-test-node:latest` 已存在于目标机器。
## 步骤
- 获取 FTP 和 Bind 的 overlay IP在 Manager 上执行)
- `FTPIP=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' argus-ftp)`
- `BINDIP=$(docker inspect -f '{{ (index .NetworkSettings.Networks "argus-sys-net").IPAddress }}' argus-bind-sys)`
- `echo "FTP=$FTPIP BIND=$BINDIP"`
- 准备宿主挂载目录(以 s4 为例)
- `mkdir -p /home2/yuyr/deploy/test-metric-node/s4`
- 启动测试节点容器(接入 overlay
- `docker run -d --name argus-metric-test-node-s4 \
--hostname dev2-yuyr-node002s4 \
--network argus-sys-net \
-v /home2/yuyr/deploy/test-metric-node/s4:/private/argus/agent \
argus-sys-metric-test-node:latest sleep infinity`
- 在容器内执行安装(先用 FTP IP 引导DNS 指向 Bind
- `docker exec -it argus-metric-test-node-s4 bash`
- `echo "nameserver $BINDIP" > /etc/resolv.conf`
- `curl --ftp-method nocwd -u ftpuser:ZGClab1234! -fsSL "ftp://$FTPIP:21/setup.sh" -o /tmp/setup.sh`
- `chmod +x /tmp/setup.sh`
- `export AGENT_ENV=dev2 AGENT_USER=yuyr AGENT_INSTANCE=node002s4`
- `export MASTER_ENDPOINT=http://master.argus.com:3000`
- `/tmp/setup.sh --server "$FTPIP" --user ftpuser --password 'ZGClab1234!' --port 21`
- 说明setup 会自动执行安装后自检(最多 5 分钟),无需手动轮询。
## 验证(推荐在容器内执行,避免宿主权限问题)
- 查看 node.json 关键字段
- `cat /private/argus/agent/dev2-yuyr-node002s4/node.json | jq '{last_report, health}'`
- 期望:四个 health 全部 healthy等待 ≥70s 再查看,`last_report` 持续更新。
- 指标端口
- `curl -s -o /dev/null -w '%{http_code}\n' http://localhost:9100/metrics`(期望 200)
- (如测试 GPU)`curl -s -o /dev/null -w '%{http_code}\n' http://localhost:9400/metrics`(有 GPU 时 200)
- 与服务端连通(域名经 Bind 解析)
- `curl -s -o /dev/null -w '%{http_code}\n' http://master.argus.com:3000/readyz`(期望 200)
- `curl -s -o /dev/null -w '%{http_code}\n' http://es.log.argus.com:9200/_cluster/health`(期望 200)
## (可选)在服务器主机侧观察 Prometheus 目标更新
- `cat /home2/yuyr/deploy/versions/<VERSION>/private/argus/metric/prometheus/nodes.json | jq '.'`
## 常见提示
- 初次安装后短时 `curl` 域名返回 000/超时属正常,多等待 1~2 分钟 DNS 同步/组件冷启动完成。
- 如在宿主直接读取挂载的 node.json 报 Permission denied请使用 `docker exec` 在容器内查看。
- MASTER_ENDPOINT 固定使用域名 `http://master.argus.com:3000`,客户端无需固定 IP。

View File

@ -1,54 +0,0 @@
#!/usr/bin/env bash
# Upload the built client package (setup.sh, argus-metric tarball,
# LATEST_VERSION) to the FTP root, run from inside the artifact directory.
set -euo pipefail

usage() { cat <<'EOF'
Publish Argus client package to FTP
Usage:
  ./publish.sh --server HOST --user USER --password PASS [--port 21]
Notes:
  - This script expects to run inside the built client artifact directory.
  - It reads LATEST_VERSION and uploads setup.sh, argus-metric_<ver>.tar.gz, and LATEST_VERSION.
EOF
}

HOST=""; USERNAME=""; PASSWORD=""; PORT=21
while [[ $# -gt 0 ]]; do
  case "$1" in
    --server) HOST="$2"; shift 2;;
    --user) USERNAME="$2"; shift 2;;
    --password) PASSWORD="$2"; shift 2;;
    --port) PORT="$2"; shift 2;;
    -h|--help) usage; exit 0;;
    *) echo "unknown arg: $1" >&2; usage; exit 1;;
  esac
done
if [[ -z "$HOST" || -z "$USERNAME" || -z "$PASSWORD" ]]; then
  usage; exit 1
fi

here="$(pwd)"
[[ -f "$here/LATEST_VERSION" ]] || { echo "LATEST_VERSION not found in $(pwd)" >&2; exit 1; }
VER=$(tr -d '\n' < "$here/LATEST_VERSION")
PKG="argus-metric_${VER}.tar.gz"
[[ -f "$here/$PKG" ]] || { echo "client tar not found: $PKG" >&2; exit 1; }

# locate setup.sh (prefer colocated, fallback to bundled path if provided)
SETUP="${here}/setup.sh"
[[ -f "$SETUP" ]] || { echo "setup.sh not found in $(pwd)" >&2; exit 1; }

echo "[PUBLISH] server=$HOST port=$PORT version=$VER"
curl -u "$USERNAME:$PASSWORD" -sfT "$SETUP" "ftp://$HOST:$PORT/setup.sh"
curl -u "$USERNAME:$PASSWORD" -sfT "$PKG" "ftp://$HOST:$PORT/$PKG"
printf "%s" "$VER" | curl -u "$USERNAME:$PASSWORD" -sfT - "ftp://$HOST:$PORT/LATEST_VERSION"
echo "[OK] publish completed"

View File

@ -1,41 +0,0 @@
#!/usr/bin/awk -f
# Remove specific service blocks from a docker-compose.yml by service name.
# Usage: awk -f docker-compose.filter.awk -v remove="node-a,node-b,test-node,test-gpu-node" input.yml > output.yml
# NOTE(review): relies on gawk extensions (3-argument match() and the \s
# regex class) — run with GNU awk, not mawk/BSD awk.
BEGIN{
  split(remove, rm, ",");
  # Trim surrounding whitespace from each requested name and index the
  # survivors in skipname[] for O(1) membership tests.
  for(i in rm){
    gsub(/^\s+|\s+$/,"",rm[i]);
    if (rm[i] != "") skipname[rm[i]] = 1;
  }
  in_services=0; skipping=0;
}
# Return the service name when the line is a service header under
# "services:", else return "".
function service_header(line, m) {
  # match exactly two leading spaces followed by name:
  if (match(line, /^ ([A-Za-z0-9_-]+):[ ]*$/, m)) return m[1];
  return "";
}
{
  # Track top-level sections (no indentation)
  if ($0 ~ /^[A-Za-z0-9_-]+:[ ]*$/ && $0 !~ /^\s/) {
    in_services = ($0 ~ /^services:[ ]*$/) ? 1 : 0;
  }
  if (skipping) {
    # Stop skipping at next service header or another top-level section
    if (service_header($0) != "" || ($0 ~ /^(networks|volumes):[ ]*$/ && $0 !~ /^\s/)) {
      skipping=0;
    } else {
      next;
    }
  }
  if (in_services) {
    # Start skipping when this line opens one of the services to remove.
    name = service_header($0);
    if (name != "" && (name in skipname)) { skipping=1; next; }
  }
  print;
}

View File

@ -1,74 +0,0 @@
#!/usr/bin/awk -f
# Transform docker-compose.yml to use an external overlay network for all services
# - Remove top-level networks definition
# - Remove per-service networks block (including ipv4_address and sysnet refs)
# - Insert per-service networks: [argus-sys-net]
# - Append external networks mapping at the end
# NOTE(review): relies on gawk's 3-argument match(); run with GNU awk.
BEGIN{
  in_top_networks=0; in_services=0; in_service=0; svc_indent=0; curr_name="";
}
# True when the line opens a service ("  name:" under services:).
function is_service_header(line){ return svc_name(line)!=""; }
# Extract the service name from a service-header line, else "".
function svc_name(line, m){ if (match(line, /^ ([A-Za-z0-9_-]+):[ ]*$/, m)) return m[1]; return ""; }
# Number of leading spaces of s (0 for an all-space or empty line).
function indent_len(s, n){ n=match(s,/[^ ]/)-1; if(n<0) n=0; return n; }
{
  # Detect entry into top-level sections
  if ($0 ~ /^[A-Za-z0-9_-]+:[ ]*$/ && $0 !~ /^\s/) {
    in_services = ($0 ~ /^services:[ ]*$/);
    # If a new top-level section starts, stop skipping top networks
    in_top_networks = 0;
  }
  # Handle removal of initial top-level 'networks:' block
  if ($0 ~ /^networks:[ ]*$/ && $0 !~ /^\s/) {
    in_top_networks = 1; next;
  }
  if (in_top_networks) {
    # skip until next top-level section (non-indented key)
    next;
  }
  if (in_services) {
    # Track service boundaries
    if (is_service_header($0)) {
      in_service=1; svc_indent=2; networks_inserted=0; curr_name=svc_name($0); print; next;
    }
    if (in_service) {
      # If line is indented <= service indent, we've left this service
      if (indent_len($0) <= svc_indent && $0 !~ /^\s*$/) {
        in_service=0;
      }
    }
    if (in_service) {
      # Skip any existing networks block under the service
      if ($0 ~ /^\s{4}networks:[ ]*$/) { skipping_nets=1; next; }
      if (skipping_nets) {
        if (indent_len($0) <= 4) { skipping_nets=0; }
        else next;
      }
      # After container_name or image, inject networks once
      if (!networks_inserted && ($0 ~ /^\s{4}container_name:/ || $0 ~ /^\s{4}image:/)) {
        print;
        print "    networks:";
        print "      - argus-sys-net";
        networks_inserted=1; next;
      }
      # no host port injection; bind serves DNS inside overlay only
    }
  }
  print;
}
END{
  # Append the external overlay network definition; the name can be
  # overridden via the OVERLAY_NET_NAME environment variable at compose time.
  print "";
  print "networks:";
  print "  argus-sys-net:";
  print "    external: true";
  print "    name: ${OVERLAY_NET_NAME:-argus-sys-net}";
}

View File

@ -1,50 +0,0 @@
# Argus Server Offline Installation
## Prerequisites
- Linux x86_64 (Ubuntu 22.04 recommended; see OS compatibility for NixOS)
- Docker & Docker Compose installed
- Open ports: 32300, 9200, 5601, 9090, 9093, 8080..8085, 21, 20, 21100–21110 (or auto-fallback to high ports)
## Quick Start
1. Extract to a target dir, e.g. `/opt/argus-deploy/versions/<YYYYMMDD>`
2. `./server-install.sh` (nonroot is supported: it will precreate minimal dirs and auto-fix Kibana/ES/Bind in containers)
3. `./server-status.sh`
4. `./server-selfcheck.sh` (on failure it auto-runs diagnose and writes logs under `logs/`)
5. `./server-uninstall.sh` to tear down
## What the Installer Does
- Loads local images (`images/all-images.tar.gz`)
- Generates OS-compat override (`security_opt: ["label=disable"]`, `userns_mode: host`, bind `tmpfs:/run/named`)
- Starts server-only services: bind/master/es/kibana/ftp/prometheus/grafana/alertmanager/web-frontend/web-proxy
- DNS Bootstrap:
- Ensure `/private/argus/etc/dns.conf` exists (write `172.31.0.2` if missing);
- Run `/private/argus/etc/update-dns.sh` in dependent containers so `/etc/resolv.conf` points to bind;
- Wait for `*.argus.com` hint files, then reload bind;
- Restart webproxy to re-render nginx resolver from `dns.conf`;
- Writes `logs/selfcheck.json` as final summary
## OS Compatibility
- NixOS / non-xattr FS: containers run with `security_opt: ["label=disable"]` and `userns_mode: host`.
- If you cannot use sudo, the installer will:
- create minimal data dirs (incl. `private/argus/log/{elasticsearch,kibana}`) with permissive perms when possible;
- ensure inside containers: Kibana `data``/private/argus/log/kibana`, Elasticsearch `data``/private/argus/log/elasticsearch`, and Bind `rndc.key` is generated.
(Manual pre-creation scripts are no longer required.)
## Files & Layout
- `compose/` (docker-compose.yml, .env)
- `private/` (data mounts)
- `scripts/` (install/uninstall/status/selfcheck/diagnose)
- `logs/` (selfcheck + diagnose outputs)
## Troubleshooting (Quick)
- Run `./server-selfcheck.sh` → see `logs/selfcheck.json`
- Run `./server-diagnose.sh` → produces timestamped logs:
- `logs/diagnose_details_YYYYMMDD-HHMMSSZ.log`
- `logs/diagnose_error_YYYYMMDD-HHMMSSZ.log`
And updates `diagnose_details.log`/`diagnose_error.log` to the latest
- Error lines are tagged `[service][source]`, e.g. `[kibana][http] /api/status=503`
Common issues:
- Kibana 503: wait cold start or fix DNS so `es.log.argus.com` resolves
- webproxy 504: check nginx `resolver` includes `172.31.0.2 127.0.0.11`
- EACCES/locks: ensure `sudo ./server-prepare-dirs.sh` ran and ownership matches UID:GID

View File

@ -1,29 +0,0 @@
# Argus 服务端离线安装指南
## 先决条件
- Linux x86_64推荐 Ubuntu 22.04NixOS 见“兼容说明”)
- 已安装 Docker 与 Docker Compose
- 端口:32300, 9200, 5601, 9090, 9093, 8080..8085, 21, 20, 21100–21110
## 快速开始
1. 解压到目标目录(例如 `/opt/argus-deploy/versions/<YYYYMMDD>`
2. 安装:`./server-install.sh`(支持普通用户:会自动创建最小目录并在容器内修复 Kibana/ES/Bind
3. 状态:`./server-status.sh`
4. 自检:`./server-selfcheck.sh`(失败会自动采集诊断)
5. 卸载:`./server-uninstall.sh`
## 安装流程要点
- 仅启动 10 个服务端组件(不包含测试节点);
- DNS Bootstrap补齐首次部署 DNS 依赖(生成/确认 `dns.conf`、统一容器 resolv.conf、写入 `*.argus.com`、reload bind、重启 webproxy
- 输出自检结果到 `logs/selfcheck.json`
## 兼容说明NixOS 等)
- 使用 `security_opt: ["label=disable"]``userns_mode: host`
- 非 root 场景:安装器会创建最小目录(含 `private/argus/log/{elasticsearch,kibana}`),并在容器内完成:
- Kibana 的 `data` 软链到 `/private/argus/log/kibana`
- Elasticsearch 的 `data` 软链到 `/private/argus/log/elasticsearch`
- Bind 生成 `/etc/bind/rndc.key`
## 故障排查(见下文 Troubleshooting_zh
- `./server-selfcheck.sh``logs/selfcheck.json`
- `./server-diagnose.sh``logs/diagnose_error_*.log` / `logs/diagnose_details_*.log`

View File

@ -1,50 +0,0 @@
# Argus 多机部署Docker Swarm + External Overlay
- 前提Docker ≥ 20.10Manager/Worker 节点开放 2377/tcp、7946/tcp+udp、4789/udp。
- DNSBind9 为统一权威,解析 *.argus.com 至部署机固定对外主机 IP。
## 在部署机Manager
- 初始化 Swarm`docker swarm init --advertise-addr <manager_ip>`
- 创建 overlay`docker network create --driver overlay --attachable argus-sys-net`
- 解压离线包后执行:
- `./server-install.sh`(会验证 Swarm 状态、确保 overlay 存在、写入 dns.conf
- `./server-selfcheck.sh`(失败会自动触发诊断)
## 在节点机Worker 或非 Docker 主机)
- Swarm Worker执行 Manager 的 `docker swarm join ...`
- 运行客户端容器:
- `docker run -d --name argus-metric-node-001 --network argus-sys-net -v /data/argus/agent:/private/argus/agent argus-sys-metric-test-node:latest sleep infinity`
- 进入容器安装(先 IP 引导,后域名):
- `curl -u ftpuser:*** -fsSL ftp://<FTP_IP>:21/setup.sh -o /tmp/setup.sh && chmod +x /tmp/setup.sh`
- `AGENT_ENV=dev AGENT_USER=alice AGENT_INSTANCE=node001 MASTER_ENDPOINT=http://master.argus.com:3000 /tmp/setup.sh --server ftp.argus.com --user ftpuser --password '***' --port 21`
## 关键点
- 首次必须使用 FTP 的 IP 引导(下载 setup.sh 与 dns.conf
- MASTER_ENDPOINT 永远使用域名:`http://master.argus.com:3000`
- docker compose 改为 external overlay容器内不使用 Docker 服务名web-proxy 与组件上游统一用域名
## 找回/轮换 Swarm 加入令牌与解锁密钥
在任意一个 Manager 节点上执行以下命令即可查看或轮换加入令牌join token
- 查看加入 Worker 的命令:
- `docker swarm join-token worker`
- 只打印 Worker 的 token
- `docker swarm join-token -q worker`
- 查看加入 Manager 的命令:
- `docker swarm join-token manager`
- 只打印 Manager 的 token
- `docker swarm join-token -q manager`
在待加入节点执行(示例,替换 Manager_IP
- `docker swarm join --token <上面查到的token> <Manager_IP>:2377`
轮换 token怀疑泄露或需要更新时
- 轮换 Worker`docker swarm join-token --rotate worker`
- 轮换 Manager`docker swarm join-token --rotate manager`
如果你指的是“解锁密钥”autolock 的 unlock key在 Manager 上:
- 查看:`docker swarm unlock-key`
- 轮换:`docker swarm unlock-key --rotate`
提示:当看到 “This node is not a swarm manager.” 时,说明当前节点不是 Manager需要到 Manager 节点执行,或在现有 Manager 上 `docker node promote <NODE-ID>` 将其提升为 Manager。

View File

@ -1,20 +0,0 @@
# Troubleshooting
- Status: `scripts/server-status.sh`
- Selfcheck: `scripts/server-selfcheck.sh`
- Diagnose: `scripts/server-diagnose.sh`
Outputs:
- `logs/selfcheck.json`
- `logs/diagnose_details_*.log` (full details)
- `logs/diagnose_error_*.log` (tagged errors)
WebProxy:
- 8083 expects 200/302/403; 8084/8085 must include CORS header
- nginx resolver should be `172.31.0.2 127.0.0.11`
Kibana/ES:
- Verify `es.log.argus.com` resolves inside Kibana
Permissions:
- The installer auto-creates minimal dirs and applies container-side fixes (Kibana/ES/Bind). If you still see EACCES/lock errors, rerun `./server-install.sh` and review diagnose logs.

View File

@ -1,16 +0,0 @@
# 故障排查
- 状态:`scripts/server-status.sh`
- 自检:`scripts/server-selfcheck.sh`
- 诊断:`scripts/server-diagnose.sh`
输出:
- `logs/selfcheck.json`
- `logs/diagnose_error_*.log`(错误摘要)
- `logs/diagnose_details_*.log`(详细信息)
WebProxy8083=200/302/4038084/8085 需包含 CORS
Kibana确认可解析 `es.log.argus.com`
权限:
- 非 root 安装时,安装器会创建最小目录并在容器内修复 Kibana/ES/Bind
- 如仍有 `EACCES`/锁文件报错,先重跑 `./server-install.sh`(会重复容器内修复),并查看诊断日志。

View File

@ -1,82 +0,0 @@
#!/usr/bin/env bash
# Temporarily relax Elasticsearch disk-watermark protections so a nearly-full
# single-node cluster can recover (e.g. reassign .kibana* shards). Pair with
# scripts/es-watermark-restore.sh once disk space has been freed.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
# Load compose/.env (port overrides) when present.
ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a

ES_URL="http://localhost:${ES_HTTP_PORT:-9200}"

# Tunables (env overrides)
RELAX_WM_LOW="${RELAX_WM_LOW:-99%}"
RELAX_WM_HIGH="${RELAX_WM_HIGH:-99%}"
RELAX_WM_FLOOD="${RELAX_WM_FLOOD:-99%}"
DISABLE_WATERMARK="${DISABLE_WATERMARK:-1}"
SET_KIBANA_REPLICAS_ZERO="${SET_KIBANA_REPLICAS_ZERO:-1}"
CLEAR_READONLY_BLOCKS="${CLEAR_READONLY_BLOCKS:-1}"

echo "[RELAX] Checking Elasticsearch at $ES_URL"
code=$(curl -s -o /dev/null -w '%{http_code}' "$ES_URL/_cluster/health" || true)
if [[ "$code" != "200" ]]; then
  echo "[RELAX][ERROR] ES not reachable (code=$code). Ensure argus-es-sys is running." >&2
  exit 1
fi

echo "[RELAX] Applying transient cluster settings (watermarks)"
# threshold_enabled=false fully disables the disk allocation decider.
th_enabled=$([[ "$DISABLE_WATERMARK" == "1" ]] && echo false || echo true)
curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_cluster/settings" -d "{
  \"transient\": {
    \"cluster.routing.allocation.disk.threshold_enabled\": $th_enabled,
    \"cluster.routing.allocation.disk.watermark.low\": \"$RELAX_WM_LOW\",
    \"cluster.routing.allocation.disk.watermark.high\": \"$RELAX_WM_HIGH\",
    \"cluster.routing.allocation.disk.watermark.flood_stage\": \"$RELAX_WM_FLOOD\"
  }
}" | sed -n '1,5p'

if [[ "$CLEAR_READONLY_BLOCKS" == "1" ]]; then
  # ES sets read_only_allow_delete when the flood-stage watermark is hit;
  # clear it so writes can resume.
  echo "[RELAX] Clearing read_only/read_only_allow_delete blocks on all indices (best-effort)"
  curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_all/_settings" -d '{
    "index.blocks.read_only": false,
    "index.blocks.read_only_allow_delete": false
  }' >/dev/null || true
fi

if [[ "${SET_KIBANA_REPLICAS_ZERO:-1}" != "0" ]]; then
  echo "[RELAX] Ensure .kibana* use replicas=0 via index template and per-index settings (best-effort)"
  # high priority template for .kibana* only, avoid impacting other indices
  curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/_index_template/kibana-replicas-0" -d '{
    "index_patterns": [".kibana*"],
    "priority": 200,
    "template": { "settings": { "number_of_replicas": 0 } }
  }' >/dev/null || true
  # set existing .kibana* to replicas=0
  idxs=$(curl -sS "$ES_URL/_cat/indices/.kibana*?h=index" | awk '{print $1}')
  for i in $idxs; do
    [[ -n "$i" ]] || continue
    curl -sS -H 'Content-Type: application/json' -X PUT "$ES_URL/$i/_settings" -d '{"index":{"number_of_replicas":0}}' >/dev/null || true
  done
fi

# Retry failed shard allocations (best-effort)
curl -sS -H 'Content-Type: application/json' -X POST "$ES_URL/_cluster/reroute?retry_failed=true" -d '{}' >/dev/null || true

echo "[RELAX] Cluster health (post):"
curl -sS "$ES_URL/_cluster/health?pretty" | sed -n '1,80p'

# Simple current status summary: scrape health, data-disk usage (inside the
# argus-es-sys container), effective watermark settings and .kibana* shard
# states into one log line.
ch=$(curl -sS "$ES_URL/_cluster/health" || true)
status=$(printf '%s' "$ch" | awk -F'"' '/"status"/{print $4; exit}')
unassigned=$(printf '%s' "$ch" | awk -F'[,: ]+' '/"unassigned_shards"/{print $3; exit}')
duse=$(docker exec argus-es-sys sh -lc 'df -P /usr/share/elasticsearch/data | awk "NR==2{print \$5}"' 2>/dev/null || true)
settings=$(curl -sS "$ES_URL/_cluster/settings?flat_settings=true" || true)
th=$(printf '%s' "$settings" | grep -o '"cluster.routing.allocation.disk.threshold_enabled"[^,}]*' | awk -F: '{gsub(/["} ]/,"",$2);print $2}' | tail -n1)
low=$(printf '%s' "$settings" | grep -o '"cluster.routing.allocation.disk.watermark.low"[^,}]*' | awk -F: '{gsub(/["} ]/,"",$2);print $2}' | tail -n1)
high=$(printf '%s' "$settings" | grep -o '"cluster.routing.allocation.disk.watermark.high"[^,}]*' | awk -F: '{gsub(/["} ]/,"",$2);print $2}' | tail -n1)
flood=$(printf '%s' "$settings" | grep -o '"cluster.routing.allocation.disk.watermark.flood_stage"[^,}]*' | awk -F: '{gsub(/["} ]/,"",$2);print $2}' | tail -n1)
ks=$(curl -sS "$ES_URL/_cat/shards/.kibana*?h=state" || true)
total=$(printf '%s' "$ks" | awk 'NF{c++} END{print c+0}')
started=$(printf '%s' "$ks" | awk '/STARTED/{c++} END{print c+0}')
unass=$(printf '%s' "$ks" | awk '/UNASSIGNED/{c++} END{print c+0}')
echo "[RELAX][SUMMARY] status=${status:-?} unassigned=${unassigned:-?} es.data.use=${duse:-?} watermarks(threshold=${th:-?} low=${low:-?} high=${high:-?} flood=${flood:-?}) kibana_shards(total=${total},started=${started},unassigned=${unass})"
echo "[RELAX] Done. Remember to run scripts/es-watermark-restore.sh after freeing disk space and cluster becomes stable."

View File

@ -1,37 +0,0 @@
#!/usr/bin/env bash
# Undo the relaxations applied by scripts/es-watermark-relax.sh: re-enable the
# disk allocation decider and clear the transient watermark overrides.
set -euo pipefail

script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
pkg_root="$(cd "$script_dir/.." && pwd)"

# Pick up port overrides from compose/.env when it exists.
env_file="$pkg_root/compose/.env"
if [[ -f "$env_file" ]]; then
  set -a
  source "$env_file"
  set +a
fi

es_url="http://localhost:${ES_HTTP_PORT:-9200}"

echo "[RESTORE] Checking Elasticsearch at $es_url"
probe=$(curl -s -o /dev/null -w '%{http_code}' "$es_url/_cluster/health" || true)
if [[ "$probe" != "200" ]]; then
  echo "[RESTORE][ERROR] ES not reachable (code=$probe). Ensure argus-es-sys is running." >&2
  exit 1
fi

echo "[RESTORE] Re-enabling disk threshold and clearing relaxed watermarks (transient)"
# Setting a transient key to null removes the override, falling back to defaults.
curl -sS -H 'Content-Type: application/json' -X PUT "$es_url/_cluster/settings" -d '{
  "transient": {
    "cluster.routing.allocation.disk.threshold_enabled": true,
    "cluster.routing.allocation.disk.watermark.low": null,
    "cluster.routing.allocation.disk.watermark.high": null,
    "cluster.routing.allocation.disk.watermark.flood_stage": null
  }
}' | sed -n '1,5p'

# Optionally restore default replicas to 1 (set RESTORE_DEFAULT_REPLICAS=1 to enable)
case "${RESTORE_DEFAULT_REPLICAS:-0}" in
  1)
    echo "[RESTORE] Setting transient default index.number_of_replicas=1"
    curl -sS -H 'Content-Type: application/json' -X PUT "$es_url/_cluster/settings" -d '{"transient":{"index.number_of_replicas":"1"}}' >/dev/null || true
    ;;
esac

echo "[RESTORE] Cluster health:"
curl -sS "$es_url/_cluster/health?pretty" | sed -n '1,80p'
echo "[RESTORE] Done. Verify shards and consider keeping replicas=0 for single-node deployments."

View File

@ -1,103 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail

# Quick fix tool: replace 172.22/16 (docker gwbridge) targets in nodes.json
# with overlay IPs resolved from each node's hostname.
# Usage: run on server package host: scripts/fix-prom-targets-overlay.sh
#
# Fixes over the original version:
# - the `python3 … || {` error group was never closed (syntax error) and the
#   follow-up `status=$?` / `tail /dev/stderr` logic could not work; replaced
#   with a plain `if ! python3 …` check;
# - prefer_overlay_ip used `return` inside a piped while-loop, which only
#   exits the pipeline's subshell and could print several IPs.

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
NODES_JSON="$ROOT/private/argus/metric/prometheus/nodes.json"

require_cmd() { command -v "$1" >/dev/null 2>&1 || { echo "[ERROR] missing command: $1" >&2; exit 1; }; }

# Copy FILE to FILE_bak_<UTC timestamp>.json next to the original.
backup() {
  local src="$1"; local ts; ts=$(date -u +%Y%m%d-%H%M%SZ)
  cp "$src" "${src%.json}_bak_${ts}.json"
}

# Print the preferred IPv4 for HOST: 10.0.0.0/8 first, then 172.31.0.0/16,
# else the first A record. Prints nothing when the host does not resolve.
prefer_overlay_ip() {
  local host="$1" ip ips
  ips=$(getent hosts "$host" | awk '{print $1}')
  for ip in $ips; do
    if [[ "$ip" =~ ^10\. ]]; then echo "$ip"; return; fi
  done
  for ip in $ips; do
    if [[ "$ip" =~ ^172\.31\. ]]; then echo "$ip"; return; fi
  done
  # fallback: first A record
  printf '%s\n' "$ips" | awk 'NF{print; exit}'
}

require_cmd awk
require_cmd sed
require_cmd python3

if [[ ! -f "$NODES_JSON" ]]; then
  echo "[WARN] nodes.json not found: $NODES_JSON" >&2
  exit 0
fi

backup "$NODES_JSON"
tmp=$(mktemp)
trap 'rm -f "$tmp"' EXIT

# Rewrite entries whose ip lies in 172.22.0.0/16 by re-resolving the hostname
# (preferring overlay ranges 10/8 then 172.31/16). Output goes to $tmp.
if ! python3 - "$NODES_JSON" > "$tmp" <<'PY'
import ipaddress, json, sys, socket
path=sys.argv[1]
data=json.load(open(path)) if path else []
def resolve(host):
    try:
        infos=socket.getaddrinfo(host,None,family=socket.AF_INET)
        ips=[i[4][0] for i in infos]
        # prefer 10. over 172.31.
        for ip in ips:
            if ip.startswith('10.'): return ip
        for ip in ips:
            if ip.startswith('172.31.'): return ip
        return ips[0] if ips else None
    except OSError:
        return None
gw=ipaddress.ip_network('172.22.0.0/16')
out=[]
for item in data:
    ip=item.get('ip')
    host=item.get('hostname') or ''
    try:
        bad = ip and ipaddress.ip_address(ip) in gw
    except Exception:
        bad = False
    if bad and host:
        new=resolve(host)
        if new:
            item=dict(item)
            item['ip']=new
    out.append(item)
json.dump(out, sys.stdout, ensure_ascii=False)
PY
then
  echo "[ERROR] failed to rewrite nodes.json" >&2
  exit 1
fi

if grep -q '"ip"\s*:\s*"172\.22\.' "$tmp"; then
  echo "[WARN] some gwbridge targets remain; manual fix may be required" >&2
fi

mv "$tmp" "$NODES_JSON"
trap - EXIT
echo "[OK] nodes.json updated"

# try to reload Prometheus (best-effort; only when the container is running)
if docker ps --format '{{.Names}}' | grep -q '^argus-prometheus$'; then
  docker exec argus-prometheus sh -lc 'pidof prometheus >/dev/null 2>&1 && kill -HUP $(pidof prometheus) || supervisorctl restart prometheus' >/dev/null 2>&1 || true
  echo "[INFO] Prometheus reloaded"
fi
exit 0

View File

@ -1,198 +0,0 @@
#!/usr/bin/env bash
# Collect diagnostics for the Argus server stack: container logs, supervisor
# logs, HTTP/DNS checks. Writes a details log plus a tagged error summary.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
# Load compose/.env (port overrides) when present.
ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a

ts="$(date -u +%Y%m%d-%H%M%SZ)"
LOG_DIR="$ROOT/logs"
mkdir -p "$LOG_DIR" || true
# Fallback to /tmp when logs dir is not writable
if ! ( : > "$LOG_DIR/.w" 2>/dev/null ); then
  LOG_DIR="/tmp/argus-logs"
  mkdir -p "$LOG_DIR" || true
fi
# Timestamped output files; truncated up front so reruns start clean.
DETAILS="$LOG_DIR/diagnose_details_${ts}.log"
ERRORS="$LOG_DIR/diagnose_error_${ts}.log"
: > "$DETAILS"; : > "$ERRORS"

# Append a timestamped line to the details log.
logd() { echo "$(date '+%F %T') $*" >> "$DETAILS"; }
# Append a tagged error line to the error summary.
append_err() { echo "$*" >> "$ERRORS"; }
# Print the HTTP status code for URL ("000" on connection failure).
http_code() { curl -s -o /dev/null -w "%{http_code}" "$1" || echo 000; }
# Print the first 5 lines of the response body (3s timeout, best-effort).
http_body_head() { curl -s --max-time 3 "$1" 2>/dev/null | sed -n '1,5p' || true; }
# Print the Access-Control-Allow-Origin response header value, if any.
header_val() { curl -s -D - -o /dev/null "$@" | awk -F': ' 'BEGIN{IGNORECASE=1}$1=="Access-Control-Allow-Origin"{gsub("\r","",$2);print $2}'; }
# Write a "===== [name] =====" divider into the details log.
section() {
  local name="$1"; logd "===== [$name] ====="; }
# Collect diagnostics for one service.
#   $1 - short service name used to tag error lines (e.g. "kibana")
#   $2 - container name (e.g. "argus-kibana-sys")
# Appends docker state + last logs to $DETAILS; greps error-looking lines
# (English and Chinese keywords) into $ERRORS tagged "[svc][source]".
svc() {
  local svc_name="$1"; local cname="$2"; shift 2
  section "$svc_name ($cname)"
  logd "docker ps:"; docker ps -a --format '{{.Names}}\t{{.Status}}\t{{.Image}}' | awk -v n="$cname" '$1==n' >> "$DETAILS" || true
  logd "docker inspect (state/restartcount):"; docker inspect -f '{{.State.Status}} rc={{.RestartCount}} started={{.State.StartedAt}}' "$cname" >> "$DETAILS" 2>&1 || true
  logd "last 200 container logs:"; docker logs --tail 200 "$cname" >> "$DETAILS" 2>&1 || true
  # extract error lines from container logs
  docker logs --tail 200 "$cname" 2>&1 | \
    grep -Ei '\b(error|failed|fail|exception|panic|fatal|critical|unhealthy|permission denied|forbidden|refused|traceback|错误|失败)\b' | \
    sed "s/^/[${svc_name}][container] /" >> "$ERRORS" || true
  # supervisor status and logs (only when supervisorctl exists in the container)
  if docker exec "$cname" sh -lc 'command -v supervisorctl >/dev/null 2>&1' >/dev/null 2>&1; then
    logd "supervisorctl status:"; docker exec "$cname" sh -lc 'supervisorctl status' >> "$DETAILS" 2>&1 || true
    # iterate supervisor logs and collect tails + errors per file
    local files
    files=$(docker exec "$cname" sh -lc 'ls /var/log/supervisor/*.log 2>/dev/null' || true)
    for f in $files; do
      logd "tail -n 80 $f:"; docker exec "$cname" sh -lc "tail -n 80 $f 2>/dev/null || true" >> "$DETAILS" 2>&1 || true
      docker exec "$cname" sh -lc "tail -n 200 $f 2>/dev/null" 2>/dev/null | \
        grep -Ei '\b(error|failed|fail|exception|panic|fatal|critical|unhealthy|permission denied|forbidden|refused|traceback|错误|失败)\b' | \
        sed "s/^/[${svc_name}][supervisor:$(basename $f)] /" >> "$ERRORS" || true
    done
  fi
}
# Core services: collect per-container diagnostics for every server component.
svc bind argus-bind-sys
svc master argus-master-sys
svc es argus-es-sys
svc kibana argus-kibana-sys
svc ftp argus-ftp
svc prometheus argus-prometheus
svc grafana argus-grafana
svc alertmanager argus-alertmanager
svc web-frontend argus-web-frontend
svc web-proxy argus-web-proxy

# HTTP checks (host side) — status codes and first body lines per endpoint.
section HTTP
logd "ES: $(http_code "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health")"
http_body_head "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health" >> "$DETAILS" 2>&1 || true
logd "Kibana: $(http_code "http://localhost:${KIBANA_PORT:-5601}/api/status")"
http_body_head "http://localhost:${KIBANA_PORT:-5601}/api/status" >> "$DETAILS" 2>&1 || true
logd "Master readyz: $(http_code "http://localhost:${MASTER_PORT:-32300}/readyz")"
logd "Prometheus: $(http_code "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready")"
logd "Grafana: $(http_code "http://localhost:${GRAFANA_PORT:-3000}/api/health")"
http_body_head "http://localhost:${GRAFANA_PORT:-3000}/api/health" >> "$DETAILS" 2>&1 || true
logd "Alertmanager: $(http_code "http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status")"
# CORS preflight-style checks against the web-proxy API ports.
cors8084=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8084:-8084}/api/v2/status" || true)
cors8085=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8085:-8085}/api/v1/master/nodes" || true)
logd "Web-Proxy 8080: $(http_code "http://localhost:${WEB_PROXY_PORT_8080:-8080}/")"
logd "Web-Proxy 8083: $(http_code "http://localhost:${WEB_PROXY_PORT_8083:-8083}/")"
logd "Web-Proxy 8084 CORS: ${cors8084}"
logd "Web-Proxy 8085 CORS: ${cors8085}"

# Elasticsearch deep checks: disk watermark and Kibana index status
section ES-CHECKS
ch=$(curl -s --max-time 3 "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health" || true)
status=$(printf '%s' "$ch" | awk -F'\"' '/"status"/{print $4; exit}')
if [[ -n "$status" ]]; then logd "cluster.status=$status"; fi
if [[ "$status" != "green" ]]; then append_err "[es][cluster] status=$status"; fi
if docker ps --format '{{.Names}}' | grep -q '^argus-es-sys$'; then
  # Data-path disk usage as seen inside the ES container (e.g. "93%").
  duse=$(docker exec argus-es-sys sh -lc 'df -P /usr/share/elasticsearch/data | awk "NR==2{print \$5}"' 2>/dev/null || true)
  logd "es.data.df_use=$duse"
  usep=${duse%%%}
  if [[ -n "$usep" ]] && (( usep >= 90 )); then
    append_err "[es][disk] data path usage ${duse} (>=90%) likely hit high/flood watermarks"
    echo "HINT: High ES disk usage. Free space or run scripts/es-watermark-relax.sh (then scripts/es-watermark-restore.sh)." >&2
  fi
fi
ks=$(curl -s --max-time 3 "http://localhost:${ES_HTTP_PORT:-9200}/_cat/shards/.kibana*?h=index,shard,prirep,state,unassigned.reason" || true)
# NOTE(review): '\\b' inside single quotes reaches grep as an escaped
# backslash, so this pattern may never match word boundaries — confirm
# against the original repository file (possible diff-display artifact).
if printf '%s' "$ks" | grep -Eiq '\\b(UNASSIGNED|INITIALIZING|RELOCATING)\\b'; then
  append_err "[kibana][index] .kibana* shards not green"; logd "$ks"
  echo "HINT: .kibana* shards not green. On single-node, set replicas=0 and ensure ES disk watermarks are not exceeded." >&2
fi

# Overlay network diagnostics
section OVERLAY-NET
if docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" >/dev/null 2>&1; then
  logd "overlay present: ${OVERLAY_NET_NAME:-argus-sys-net}"
  docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" | sed -n '1,60p' >> "$DETAILS" 2>/dev/null || true
else
  append_err "[overlay][network] missing ${OVERLAY_NET_NAME:-argus-sys-net}"
fi

# Domain resolution & reachability from inside web-proxy (bind-backed)
section DOMAIN
for d in master.argus.com ftp.argus.com kibana.argus.com es.log.argus.com prom.argus.com grafana.argus.com; do
  logd "getent $d (web-proxy):"
  docker exec argus-web-proxy sh -lc "getent hosts $d || true" >> "$DETAILS" 2>&1 || true
done
logd "HTTP (web-proxy): master.readyz=$(docker exec argus-web-proxy sh -lc \"curl -s -o /dev/null -w '%{http_code}' http://master.argus.com:3000/readyz\" 2>/dev/null || echo 000)"
logd "HTTP (web-proxy): es.health=$(docker exec argus-web-proxy sh -lc \"curl -s -o /dev/null -w '%{http_code}' http://es.log.argus.com:9200/_cluster/health\" 2>/dev/null || echo 000)"
logd "HTTP (web-proxy): kibana.status=$(docker exec argus-web-proxy sh -lc \"curl -s -o /dev/null -w '%{http_code}' http://kibana.argus.com:5601/api/status\" 2>/dev/null || echo 000)"

# FTP share writability (container perspective)
section FTP-SHARE
docker exec argus-ftp sh -lc 'ls -ld /private/argus/ftp /private/argus/ftp/share; test -w /private/argus/ftp/share && echo "write:OK" || echo "write:FAIL"' >> "$DETAILS" 2>&1 || true

# Collect system info for context
section SYSTEM
logd "uname -a:"; uname -a >> "$DETAILS"
logd "docker version:"; docker version --format '{{.Server.Version}}' >> "$DETAILS" 2>&1 || true
logd "compose ps:"; (cd "$ROOT/compose" && if docker compose version >/dev/null 2>&1; then docker compose -p argus-sys ps; else docker-compose -p argus-sys ps; fi) >> "$DETAILS" 2>&1 || true

section SUMMARY
# Add HTTP failures and CORS problems to error log with tags
[[ $(http_code "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health") != 200 ]] && echo "[es][http] health not 200" >> "$ERRORS"
kbcode=$(http_code "http://localhost:${KIBANA_PORT:-5601}/api/status"); [[ "$kbcode" != 200 ]] && echo "[kibana][http] /api/status=$kbcode" >> "$ERRORS"
[[ $(http_code "http://localhost:${MASTER_PORT:-32300}/readyz") != 200 ]] && echo "[master][http] /readyz not 200" >> "$ERRORS"
[[ $(http_code "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready") != 200 ]] && echo "[prometheus][http] /-/ready not 200" >> "$ERRORS"
gfcode=$(http_code "http://localhost:${GRAFANA_PORT:-3000}/api/health"); [[ "$gfcode" != 200 ]] && echo "[grafana][http] /api/health=$gfcode" >> "$ERRORS"
[[ $(http_code "http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status") != 200 ]] && echo "[alertmanager][http] /api/v2/status not 200" >> "$ERRORS"
[[ -z "$cors8084" ]] && echo "[web-proxy][cors] 8084 missing Access-Control-Allow-Origin" >> "$ERRORS"
[[ -z "$cors8085" ]] && echo "[web-proxy][cors] 8085 missing Access-Control-Allow-Origin" >> "$ERRORS"

# Deduplicate errors
sort -u -o "$ERRORS" "$ERRORS"

# --- Prometheus targets & nodes.json checks ---
section PROMETHEUS-TARGETS
nodes_json_path="$ROOT/private/argus/metric/prometheus/nodes.json"
if [[ -f "$nodes_json_path" ]]; then
  logd "nodes.json present: $nodes_json_path"
  # detect gwbridge addresses (172.22/16)
  if grep -E '"ip"\s*:\s*"172\.22\.' "$nodes_json_path" >/dev/null 2>&1; then
    append_err "[prometheus][targets] contains gwbridge (172.22/16) addresses; prefer overlay (10.0/8)."
    echo "HINT: nodes.json has 172.22.x.x targets. Ensure agent publishes overlay_ip and master prefers it." >&2
  fi
else
  logd "nodes.json missing at $nodes_json_path"
fi

# Query Prometheus activeTargets and list down items when possible
pt_json=$(curl -s --max-time 3 "http://localhost:${PROMETHEUS_PORT:-9090}/api/v1/targets" || true)
if command -v jq >/dev/null 2>&1; then
  downs=$(printf '%s' "$pt_json" | jq -r '.data.activeTargets[] | select(.health=="down") | "[prometheus][activeTargets] down url=\(.scrapeUrl) lastError=\(.lastError)"' 2>/dev/null || true)
  if [[ -n "$downs" ]]; then
    printf '%s\n' "$downs" >> "$ERRORS"
  fi
else
  # best-effort grep when jq is unavailable
  if printf '%s' "$pt_json" | grep -q '"health":"down"'; then
    append_err "[prometheus][activeTargets] some targets are down (enable jq for detailed reasons)"
  fi
fi

echo "Diagnostic details -> $DETAILS"
echo "Detected errors    -> $ERRORS"
# NOTE(review): the else branch repeats the two echoes above, so the /tmp
# fallback prints the paths twice — harmless but redundant.
if [[ "$LOG_DIR" == "$ROOT/logs" ]]; then
  # maintain latest symlinks when writing under package logs
  ln -sfn "$(basename "$DETAILS")" "$ROOT/logs/diagnose_details.log" 2>/dev/null || cp "$DETAILS" "$ROOT/logs/diagnose_details.log" 2>/dev/null || true
  ln -sfn "$(basename "$ERRORS")" "$ROOT/logs/diagnose_error.log" 2>/dev/null || cp "$ERRORS" "$ROOT/logs/diagnose_error.log" 2>/dev/null || true
else
  echo "Diagnostic details -> $DETAILS"
  echo "Detected errors    -> $ERRORS"
fi
exit 0

View File

@ -1,365 +0,0 @@
#!/usr/bin/env bash
# server-install.sh — one-shot installer for the Argus server stack.
# Pipeline: seed compose/.env, auto-assign host ports, create data dirs,
# docker-load bundled images, compose up on an attachable overlay network,
# bootstrap DNS, then run a retried selfcheck.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PKG_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" # version root
PROJECT_NAME="argus-sys"
# Colored log helpers; err writes to stderr.
log() { echo -e "\033[0;34m[INSTALL]\033[0m $*"; }
err() { echo -e "\033[0;31m[ERROR ]\033[0m $*" >&2; }
require() { command -v "$1" >/dev/null 2>&1 || { err "missing command: $1"; exit 1; }; }
require docker
# Prefer the Docker Compose v2 plugin; fall back to standalone docker-compose.
if docker compose version >/dev/null 2>&1; then COMPOSE=(docker compose); else require docker-compose; COMPOSE=(docker-compose); fi
ENV_FILE="$PKG_ROOT/compose/.env"
ENV_TEMPLATE="$PKG_ROOT/compose/.env.example"
# find_free_port PREFER [START] [MAX]
# Echo PREFER when no TCP listener is bound to it; otherwise echo the first
# free port in START..MAX (defaults 20000..65000). Returns 1 when nothing is
# free. When `ss` is unavailable, every port is treated as free (best effort).
find_free_port() {
  local prefer="$1"
  local start=${2:-20000}
  local max=${3:-65000}
  # Snapshot the listening ports once (one line per port number) instead of
  # invoking `ss` per candidate; also fixes the unquoted awk pattern (SC2086).
  local listening
  listening=$(ss -ltnH 2>/dev/null | awk '{print $4}' | sed -n 's#.*:##p') || true
  if ! printf '%s\n' "$listening" | grep -qx -- "$prefer"; then
    echo "$prefer"
    return
  fi
  local p
  for ((p = start; p <= max; p++)); do
    if ! printf '%s\n' "$listening" | grep -qx -- "$p"; then
      echo "$p"
      return
    fi
  done
  return 1
}
# Seed compose/.env from the packaged template on first run.
# An existing .env is left untouched so operator edits survive reinstalls.
# Template ports are intentionally NOT rewritten here; in overlay mode any
# reassignment is deferred to auto_assign_ports so two services never end up
# sharing one freshly-picked port.
prepare_env() {
  if [[ -f "$ENV_FILE" ]]; then
    log ".env exists, keep as-is"
    return
  fi
  if [[ ! -f "$ENV_TEMPLATE" ]]; then
    err "missing $ENV_TEMPLATE"
    exit 1
  fi
  cp "$ENV_TEMPLATE" "$ENV_FILE"
}
# _read_env_var VAR
# Print VAR's value from $ENV_FILE (everything after the first '=', so values
# may themselves contain '='). Only the first matching line counts. Returns 1
# when the file is missing or the key is absent.
_read_env_var() {
  local key="$1"
  local env_path="$ENV_FILE"
  [[ -f "$env_path" ]] || return 1
  awk -F'=' -v k="$key" '
    BEGIN { found = 0 }
    $1 == k { print substr($0, index($0, "=") + 1); found = 1; exit }
    END { exit found ? 0 : 1 }
  ' "$env_path"
}
# _set_env_var VAR VAL
# Set or append VAR=VAL in $ENV_FILE atomically (rewrite to a temp file, then
# mv into place). Creates the file when absent.
# BUGFIX: the previous sed-based rewrite interpolated $VAL raw into a
# '#'-delimited sed program, so values containing '#', '&' or '\' either broke
# sed or corrupted the replacement. awk + ENVIRON passes the value verbatim.
_set_env_var() {
  local var="$1" val="$2"
  local f="$ENV_FILE" tmp="$ENV_FILE.tmp$$"
  if [[ -f "$f" ]] && grep -q "^${var}=" "$f"; then
    # Replace every line that starts exactly with "VAR=".
    NEWVAL="$val" awk -v k="$var" '
      index($0, k "=") == 1 { print k "=" ENVIRON["NEWVAL"]; next }
      { print }
    ' "$f" >"$tmp" && mv "$tmp" "$f"
  else
    [[ -f "$f" ]] || : >"$f"
    printf '%s=%s\n' "$var" "$val" >>"$f"
  fi
}
# auto_assign_ports — rewrite the known *_PORT variables in .env to free host
# ports. Skipped when AUTO_ASSIGN_PORTS is 0/false/no/off. A timestamped
# backup of .env is written first. Ports picked in this run are tracked in
# `reserved` so two variables never receive the same new port.
auto_assign_ports() {
  local enable="${AUTO_ASSIGN_PORTS:-true}"
  case "$enable" in
    0|false|no|off) log "AUTO_ASSIGN_PORTS disabled"; return;;
  esac
  [[ -f "$ENV_FILE" ]] || return 0
  log "auto-assigning free host ports (with fallback)"
  cp "$ENV_FILE" "$ENV_FILE.bak.$(date +%Y%m%d-%H%M%S)" || true
  # list of VAR:default pairs to try; FTP-related ports and the passive port
  # range are intentionally not rewritten automatically
  local pairs=(
    "MASTER_PORT:32300"
    "ES_HTTP_PORT:9200"
    "KIBANA_PORT:5601"
    "PROMETHEUS_PORT:9090"
    "ALERTMANAGER_PORT:9093"
    "GRAFANA_PORT:3000"
    "WEB_PROXY_PORT_8080:8080"
    "WEB_PROXY_PORT_8081:8081"
    "WEB_PROXY_PORT_8082:8082"
    "WEB_PROXY_PORT_8083:8083"
    "WEB_PROXY_PORT_8084:8084"
    "WEB_PROXY_PORT_8085:8085"
  )
  # track ports reserved in this run to avoid duplicates
  declare -A reserved=()
  # pre-mark currently listening ports to avoid choosing them twice within the same run
  while read -r lp; do reserved["$lp"]=1; done < <(ss -ltnH 2>/dev/null | awk '{print $4}' | sed -n 's#.*:##p')
  for ent in "${pairs[@]}"; do
    local var=${ent%%:*}; local def=${ent##*:}
    local cur
    if ! cur=$(_read_env_var "$var"); then cur="$def"; fi
    # strip quotes if any
    # NOTE(review): ${cur%\r}/${cur%\n} strip a trailing literal 'r'/'n', not
    # CR/LF (that would need ${cur%$'\r'}); harmless for numeric port values —
    # confirm the original intent.
    cur=${cur%\r}; cur=${cur%\n}; cur=${cur//\"/}
    # find a free port, avoiding ones we already reserved in this loop
    local cand="$cur"
    # if already in use or reserved, pick a free one
    if ss -ltnH 2>/dev/null | awk -v pat=":"$cand"$" '$4 ~ pat{f=1} END{exit f?0:1}'; then
      cand=$(find_free_port "$cand" 20000 65000)
    fi
    # avoid duplicates chosen in this loop
    local attempts=0
    while [[ -n "${reserved[$cand]:-}" ]]; do
      attempts=$((attempts+1))
      local start=$((cand+1)); [[ $start -lt 20000 ]] && start=20000
      local next
      next=$(find_free_port "$start" "$start" 65000 || true)
      if [[ -z "$next" ]]; then
        next=$(find_free_port 20000 20000 65000 || true)
      fi
      if [[ -z "$next" || "$next" == "$cand" ]]; then
        err "no free port available while assigning for $var (last tried: $cand)"; exit 1
      fi
      cand="$next"
      if (( attempts > 1000 )); then err "port assignment loop exceeded for $var"; exit 1; fi
    done
    reserved["$cand"]=1
    if [[ "$cand" != "$cur" ]]; then
      log " port reassigned: $var $cur -> $cand"
      _set_env_var "$var" "$cand"
    else
      # ensure the var exists in .env for clarity
      _set_env_var "$var" "$cand"
    fi
  done
}
# prepare_data_dirs — non-root fallback for creating the private/ data tree.
# Without root we cannot chown to the runtime UID/GID, so we only mkdir and
# then relax permissions (a+rwx) so container users can still write. The
# proper root path is scripts/server-prepare-dirs.sh (run via sudo).
prepare_data_dirs() {
  if [[ $EUID -ne 0 ]]; then
    echo -e "\033[1;33m[WARN]\033[0m running as non-root: will not chown data dirs."
    echo -e "\033[1;33m[WARN]\033[0m If you hit Permission denied, run: sudo $SCRIPT_DIR/server-prepare-dirs.sh"
    # still ensure basic directories exist (no chown)
    mkdir -p \
      "$PKG_ROOT/private/argus/etc" \
      "$PKG_ROOT/private/argus/log/elasticsearch" \
      "$PKG_ROOT/private/argus/log/kibana" \
      "$PKG_ROOT/private/argus/metric/prometheus" \
      "$PKG_ROOT/private/argus/metric/prometheus/data" \
      "$PKG_ROOT/private/argus/metric/prometheus/rules" \
      "$PKG_ROOT/private/argus/metric/grafana" \
      "$PKG_ROOT/private/argus/metric/grafana/data" \
      "$PKG_ROOT/private/argus/metric/grafana/logs" \
      "$PKG_ROOT/private/argus/metric/grafana/plugins" \
      "$PKG_ROOT/private/argus/metric/grafana/provisioning/datasources" \
      "$PKG_ROOT/private/argus/metric/grafana/provisioning/dashboards" \
      "$PKG_ROOT/private/argus/metric/grafana/data/sessions" \
      "$PKG_ROOT/private/argus/metric/grafana/data/dashboards" \
      "$PKG_ROOT/private/argus/alert/alertmanager" \
      "$PKG_ROOT/private/argus/metric/ftp/share"
    # non-root: relax permissions to avoid container UID mismatch blocking writes
    chmod -R a+rwx "$PKG_ROOT/private/argus" 2>/dev/null || true
  fi
}
# ensure_swarm_and_overlay — abort unless Docker Swarm is active on this host,
# then create the attachable overlay network (default: argus-sys-net) if it
# does not exist yet. Compose services attach to this network by name.
ensure_swarm_and_overlay() {
  local net_name="${OVERLAY_NET_NAME:-argus-sys-net}"
  # Require swarm active
  local state
  state=$(docker info --format '{{.Swarm.LocalNodeState}}' 2>/dev/null || echo "")
  if [[ "$state" != "active" ]]; then
    err "Docker Swarm is not active. On this host run:"
    err " docker swarm init --advertise-addr <this_host_ip>"
    exit 1
  fi
  # Create attachable overlay if missing
  if ! docker network inspect "$net_name" >/dev/null 2>&1; then
    log "creating attachable overlay network: $net_name"
    docker network create --driver overlay --attachable "$net_name" >/dev/null
  fi
}
# bootstrap_dns_conf — write private/argus/etc/dns.conf with this host's
# primary IP when the file is missing or empty. Detection order: the source
# address of a route to 1.1.1.1, then the first address from `hostname -I`.
bootstrap_dns_conf() {
  local etc_dir="$PKG_ROOT/private/argus/etc"
  mkdir -p "$etc_dir"
  local dns_file="$etc_dir/dns.conf"
  if [[ ! -s "$dns_file" ]]; then
    # detect host primary IP
    local host_ip
    host_ip=$(ip route get 1.1.1.1 2>/dev/null | awk '/src/ {print $7; exit}')
    [[ -z "$host_ip" ]] && host_ip=$(hostname -I 2>/dev/null | awk '{print $1}')
    if [[ -n "$host_ip" ]]; then
      echo "$host_ip" > "$dns_file"
      log "wrote initial dns.conf with host IP: $host_ip"
    else
      err "failed to determine host IP for dns.conf; please edit $dns_file manually"
    fi
  fi
}
# load_images — docker-load the bundled image tarball (images/all-images.tar.gz).
# Aborts when the tarball is missing; loading can take several minutes.
load_images() {
  local tar="$PKG_ROOT/images/all-images.tar.gz"
  [[ -f "$tar" ]] || { err "missing images tar: $tar"; exit 1; }
  log "loading images from $(basename "$tar") (may take minutes)"
  gunzip -c "$tar" | docker load >/dev/null
}
# bring_up — start the server-side services via docker compose.
# Generates a one-time OS-compat override (disables SELinux labels, forces
# host userns, tmpfs for bind's /run/named) when it does not exist, then
# brings up only the server services so test nodes are never started.
bring_up() {
  log "starting services via compose"
  ensure_swarm_and_overlay
  bootstrap_dns_conf
  local ov="$PKG_ROOT/compose/docker-compose.os-compat.override.yml"
  if [[ ! -f "$ov" ]]; then
    cat > "$ov" <<'YAML'
services:
  bind:
    security_opt: ["label=disable"]
    userns_mode: "host"
    tmpfs:
      - /run/named
  master:
    security_opt: ["label=disable"]
    userns_mode: "host"
  es:
    security_opt: ["label=disable"]
    userns_mode: "host"
  kibana:
    security_opt: ["label=disable"]
    userns_mode: "host"
  ftp:
    security_opt: ["label=disable"]
    userns_mode: "host"
  prometheus:
    security_opt: ["label=disable"]
    userns_mode: "host"
  grafana:
    security_opt: ["label=disable"]
    userns_mode: "host"
  alertmanager:
    security_opt: ["label=disable"]
    userns_mode: "host"
    # ensure runtime path matches container expectation
    volumes:
      - ../private/argus/etc:/private/argus/etc
      - ../private/argus/alert/alertmanager:/alertmanager
  web-frontend:
    security_opt: ["label=disable"]
    userns_mode: "host"
  web-proxy:
    security_opt: ["label=disable"]
    userns_mode: "host"
YAML
    log "generated OS-compat override: $(basename "$ov")"
  fi
  # start server-side components only, so test nodes
  # (node-a/node-b/test-node/test-gpu-node) are not brought up by mistake
  local services=(bind master es kibana ftp prometheus grafana alertmanager web-frontend web-proxy)
  log "services: ${services[*]}"
  # NOTE(review): $(basename "$ov") below is unquoted — safe while the
  # override filename contains no whitespace; confirm before renaming it.
  (cd "$PKG_ROOT/compose" && "${COMPOSE[@]}" -p "$PROJECT_NAME" -f docker-compose.yml -f $(basename "$ov") up -d "${services[@]}")
}
# Post bootstrap container-side fixes that do not require sudo on host.
# Each fix runs inside the target container (if present) and is best-effort:
# failures are swallowed with `|| true` so the install continues.
post_bootstrap_fixes() {
  # Kibana: ensure /usr/share/kibana/data is a symlink into mounted path to avoid EACCES
  if docker ps --format '{{.Names}}' | grep -q '^argus-kibana-sys$'; then
    docker exec argus-kibana-sys bash -lc '
      set -e
      mkdir -p /private/argus/log/kibana && chmod 777 /private/argus/log/kibana || true
      if [ -d /usr/share/kibana/data ] && [ ! -L /usr/share/kibana/data ]; then rm -rf /usr/share/kibana/data; fi
      if [ ! -e /usr/share/kibana/data ]; then ln -s /private/argus/log/kibana /usr/share/kibana/data; fi
    ' >/dev/null 2>&1 || true
  fi
  # Elasticsearch: ensure data path points to mounted path and is writable
  if docker ps --format '{{.Names}}' | grep -q '^argus-es-sys$'; then
    docker exec argus-es-sys bash -lc '
      set -e
      mkdir -p /private/argus/log/elasticsearch && chmod 777 /private/argus/log/elasticsearch || true
      if [ -d /usr/share/elasticsearch/data ] && [ ! -L /usr/share/elasticsearch/data ]; then rm -rf /usr/share/elasticsearch/data; fi
      if [ ! -e /usr/share/elasticsearch/data ]; then ln -s /private/argus/log/elasticsearch /usr/share/elasticsearch/data; fi
    ' >/dev/null 2>&1 || true
  fi
  # Bind9: ensure rndc.key exists
  if docker ps --format '{{.Names}}' | grep -q '^argus-bind-sys$'; then
    docker exec argus-bind-sys bash -lc '
      set -e
      mkdir -p /etc/bind
      if [ ! -f /etc/bind/rndc.key ]; then rndc-confgen -a -c /etc/bind/rndc.key; fi
      chmod 644 /etc/bind/rndc.key || true
    ' >/dev/null 2>&1 || true
  fi
}
# dns_bootstrap — initialize the shared dns.conf and point key containers at
# the bind service. All steps are best-effort; DNS problems should degrade,
# not abort, the install. Steps:
#   1) seed dns.conf (fallback 172.31.0.2, bind's address)
#   2) wait for bind to publish update-dns.sh into the shared etc dir
#   3) run update-dns.sh in each service container (rewrites /etc/resolv.conf)
#   4) wait for the per-service A-record hint files
#   5) reload the bind zone   6) restart web-proxy to re-render its resolver
dns_bootstrap() {
  log "DNS bootstrap: initializing shared dns.conf and container resolv.conf"
  local etc_dir="$PKG_ROOT/private/argus/etc"
  mkdir -p "$etc_dir"
  # 1) ensure dns.conf exists (fallback to bind IP 172.31.0.2)
  if [[ ! -s "$etc_dir/dns.conf" ]]; then
    if echo "172.31.0.2" > "$etc_dir/dns.conf" 2>/dev/null; then
      log "wrote fallback dns.conf with 172.31.0.2"
    else
      # host-side write denied (ownership 1000:1000); write via bind container instead
      if docker ps --format '{{.Names}}' | grep -q '^argus-bind-sys$'; then
        docker exec argus-bind-sys sh -lc 'echo 172.31.0.2 > /private/argus/etc/dns.conf && chmod 644 /private/argus/etc/dns.conf' || true
        log "fallback dns.conf written via bind container"
      else
        log "bind not ready; skip writing fallback dns.conf"
      fi
    fi
  fi
  # 2) wait briefly for bind to copy update-dns.sh into shared etc (bind startup.sh does this)
  local i=0
  while [[ ! -x "$etc_dir/update-dns.sh" && $i -lt 20 ]]; do
    sleep 0.5
    # BUGFIX: was ((i++)), which returns status 1 when i is 0 and kills the
    # whole installer under `set -e` on the very first wait iteration.
    i=$((i+1))
  done
  if [[ ! -x "$etc_dir/update-dns.sh" ]]; then
    log "update-dns.sh not present yet; continuing with existing resolv.conf"
  fi
  # 3) run update-dns.sh inside key containers so /etc/resolv.conf points to bind
  local c
  for c in argus-master-sys argus-es-sys argus-kibana-sys argus-grafana argus-prometheus argus-ftp argus-web-frontend argus-web-proxy argus-alertmanager; do
    if docker ps --format '{{.Names}}' | grep -q "^${c}$"; then
      docker exec "$c" sh -lc 'test -x /private/argus/etc/update-dns.sh && /private/argus/etc/update-dns.sh || true' >/dev/null 2>&1 || true
    fi
  done
  # 4) wait for service A-record hint files generated by services (best-effort)
  local need=( es.log.argus.com kibana.log.argus.com master.argus.com grafana.metric.argus.com prom.metric.argus.com alertmanager.alert.argus.com )
  local waited=0; local missing=1
  local f
  while (( waited < 15 )); do
    missing=0
    for f in "${need[@]}"; do [[ -s "$etc_dir/$f" ]] || { missing=1; break; }; done
    [[ $missing -eq 0 ]] && break
    sleep 1
    # BUGFIX: was ((waited++)) — same `set -e` trap as above when waited is 0.
    waited=$((waited+1))
  done
  # 5) reload bind zone (script uses supervisor to restart bind9)
  if docker ps --format '{{.Names}}' | grep -q '^argus-bind-sys$'; then
    docker exec argus-bind-sys sh -lc '/usr/local/bin/reload-bind9.sh' >/dev/null 2>&1 || true
  fi
  # 6) restart web-proxy once to re-render nginx resolver with latest dns.conf
  if docker ps --format '{{.Names}}' | grep -q '^argus-web-proxy$'; then
    docker restart argus-web-proxy >/dev/null 2>&1 || true
  fi
}
# selfcheck — run scripts/server-selfcheck.sh, retrying to absorb cold starts.
# Tunables: SELF_CHECK_RETRIES (retries beyond the first attempt, default 5)
# and SELF_CHECK_WAIT_SECONDS (delay before each retry, default 30).
# Exits the installer with status 1 when all attempts fail.
selfcheck() {
  # Initial selfcheck with retries to absorb cold starts
  local max_retries="${SELF_CHECK_RETRIES:-5}" # number of retries (excluding the first attempt), default 5
  local wait_seconds="${SELF_CHECK_WAIT_SECONDS:-30}" # seconds to wait before each retry, default 30s
  local attempt=0
  while :; do
    attempt=$((attempt+1))
    if (( attempt == 1 )); then
      log "running selfcheck (attempt ${attempt})"
    else
      log "running selfcheck (attempt ${attempt}/${max_retries}+1)"
    fi
    if bash "$PKG_ROOT/scripts/server-selfcheck.sh"; then
      return 0
    fi
    # failed
    if (( attempt > max_retries )); then
      err "selfcheck failed after ${attempt} attempt(s)"
      exit 1
    fi
    log "selfcheck not ready yet; retrying in ${wait_seconds}s..."
    sleep "$wait_seconds"
  done
}
# main — full install pipeline; each step is defined above.
main() {
  mkdir -p "$PKG_ROOT/logs"
  prepare_env          # seed compose/.env from the template
  auto_assign_ports    # move busy host ports in .env to free ones
  prepare_data_dirs    # best-effort data dirs (non-root path)
  load_images          # docker load the bundled image tarball
  bring_up             # compose up server-side services on the overlay
  post_bootstrap_fixes # in-container permission/symlink fixes
  dns_bootstrap        # seed dns.conf and refresh container resolv.conf
  selfcheck            # retry until healthy or give up
  log "install completed. See logs in $PKG_ROOT/logs/"
}
main "$@"

View File

@ -1,73 +0,0 @@
#!/usr/bin/env bash
# server-prepare-dirs.sh — create the private/ data tree and chown it to the
# runtime build user (ARGUS_BUILD_UID/GID from compose/.env, default
# 1000:1000). Must run as root; the non-root fallback lives in
# server-install.sh (prepare_data_dirs).
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PKG_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
if [[ $EUID -ne 0 ]]; then
  echo "[PREPARE] This script requires root (sudo)." >&2
  echo " Try: sudo $0" >&2
  exit 1
fi
ENV_FILE="$PKG_ROOT/compose/.env"
# Pull ARGUS_BUILD_UID/GID (and any other settings) from .env when present.
[[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a
UIDV="${ARGUS_BUILD_UID:-1000}"; GIDV="${ARGUS_BUILD_GID:-1000}"
echo "[PREPARE] Using owner ${UIDV}:${GIDV}"
# Core etc and service data dirs (aligned with src/sys/tests/scripts/01_bootstrap.sh)
mkdir -p \
  "$PKG_ROOT/private/argus/etc" \
  "$PKG_ROOT/private/argus/bind" \
  "$PKG_ROOT/private/argus/master" \
  "$PKG_ROOT/private/argus/agent" \
  "$PKG_ROOT/private/argus/log/elasticsearch" \
  "$PKG_ROOT/private/argus/log/kibana"
# Prometheus
mkdir -p \
  "$PKG_ROOT/private/argus/metric/prometheus" \
  "$PKG_ROOT/private/argus/metric/prometheus/data" \
  "$PKG_ROOT/private/argus/metric/prometheus/rules" \
  "$PKG_ROOT/private/argus/metric/prometheus/targets"
# Grafana
mkdir -p \
  "$PKG_ROOT/private/argus/metric/grafana" \
  "$PKG_ROOT/private/argus/metric/grafana/data" \
  "$PKG_ROOT/private/argus/metric/grafana/logs" \
  "$PKG_ROOT/private/argus/metric/grafana/plugins" \
  "$PKG_ROOT/private/argus/metric/grafana/provisioning/datasources" \
  "$PKG_ROOT/private/argus/metric/grafana/provisioning/dashboards" \
  "$PKG_ROOT/private/argus/metric/grafana/data/sessions" \
  "$PKG_ROOT/private/argus/metric/grafana/data/dashboards" \
  "$PKG_ROOT/private/argus/metric/grafana/config"
# FTP
mkdir -p "$PKG_ROOT/private/argus/metric/ftp/share"
# Alertmanager
mkdir -p "$PKG_ROOT/private/argus/alert/alertmanager"
chown -R "$UIDV":"$GIDV" \
  "$PKG_ROOT/private/argus/etc" \
  "$PKG_ROOT/private/argus/bind" \
  "$PKG_ROOT/private/argus/master" \
  "$PKG_ROOT/private/argus/agent" \
  "$PKG_ROOT/private/argus/log/elasticsearch" \
  "$PKG_ROOT/private/argus/log/kibana" \
  "$PKG_ROOT/private/argus/metric/prometheus" \
  "$PKG_ROOT/private/argus/metric/grafana" \
  "$PKG_ROOT/private/argus/metric/ftp" \
  "$PKG_ROOT/private/argus/alert"
chmod -R g+w "$PKG_ROOT/private/argus/alert" "$PKG_ROOT/private/argus/etc" || true
# Ensure parent directories also owned by runtime user for consistency
chown "$UIDV":"$GIDV" \
  "$PKG_ROOT/private/argus" \
  "$PKG_ROOT/private/argus/log" \
  "$PKG_ROOT/private/argus/metric" || true
echo "[PREPARE] Done. You can now run server-install.sh"

View File

@ -1,104 +0,0 @@
#!/usr/bin/env bash
# server-selfcheck.sh — post-install health probe for the Argus server stack.
# Checks the overlay network and every server-side service, writes a JSON
# summary to logs/selfcheck.json (falling back to /tmp when logs/ is not
# writable) and exits 0 only when all checks pass. On failure it invokes
# server-diagnose.sh when that helper ships with the package.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
log() { echo -e "\033[0;34m[CHECK]\033[0m $*"; }
err() { echo -e "\033[0;31m[ERROR]\033[0m $*" >&2; }
# Source .env (if present) so port checks match the deployed configuration.
ENV_FILE="$ROOT/compose/.env"
if [[ -f "$ENV_FILE" ]]; then set -a; source "$ENV_FILE"; set +a; fi
# wait_http URL [ATTEMPTS] — poll URL every 5s until it answers (default 120 tries).
wait_http() {
  local url="$1" attempts=${2:-120} i=1
  while (( i <= attempts )); do
    curl -fsS "$url" >/dev/null 2>&1 && return 0
    echo "[..] waiting $url ($i/$attempts)"
    sleep 5
    i=$((i+1))
  done
  return 1
}
# code_for URL — print the HTTP status code, or 000 when the request fails.
code_for() { curl -s -o /dev/null -w "%{http_code}" "$1" || echo 000; }
# header_val [curl opts…] URL — print the Access-Control-Allow-Origin header value.
header_val() { curl -s -D - -o /dev/null "$@" | awk -F': ' 'BEGIN{IGNORECASE=1}$1=="Access-Control-Allow-Origin"{gsub("\r","",$2);print $2}'; }
LOG_DIR="$ROOT/logs"
mkdir -p "$LOG_DIR" || true
OUT_JSON="$LOG_DIR/selfcheck.json"
tmp=$(mktemp)
ok=1
log "checking overlay network"
net_ok=false
if docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" >/dev/null 2>&1; then
  if docker network inspect "${OVERLAY_NET_NAME:-argus-sys-net}" | grep -q '"Driver": "overlay"'; then net_ok=true; fi
fi
[[ "$net_ok" == true ]] || ok=0
log "checking Elasticsearch (via domain inside web-proxy)"
# BUGFIX: the URL must be quoted for the container shell — an unquoted '&'
# backgrounds curl inside `sh -lc`, making this check always report healthy.
if docker exec argus-web-proxy sh -lc "curl -fsS 'http://es.log.argus.com:9200/_cluster/health?wait_for_status=yellow&timeout=1s'" >/dev/null 2>&1; then es_ok=true; else es_ok=false; ok=0; fi
log "checking Kibana (via domain inside web-proxy)"
kb_code=$(docker exec argus-web-proxy sh -lc "curl -s -o /dev/null -w '%{http_code}' http://kibana.log.argus.com:5601/api/status" || echo 000)
kb_ok=false
# The status code comes from inside the proxy; the body is fetched via the host port.
if [[ "$kb_code" == "200" ]]; then body=$(curl -sS "http://localhost:${KIBANA_PORT:-5601}/api/status"); echo "$body" | grep -q '"level":"available"' && kb_ok=true; fi
[[ "$kb_ok" == true ]] || ok=0
log "checking Master (via domain inside web-proxy)"
if docker exec argus-web-proxy sh -lc "curl -fsS http://master.argus.com:3000/readyz" >/dev/null 2>&1; then master_ok=true; else master_ok=false; ok=0; fi
log "checking FTP"
if docker ps --format '{{.Names}}' | grep -q '^argus-ftp$'; then
  if docker exec argus-ftp sh -lc 'test -w /private/argus/ftp/share'; then ftp_ok=true; else ftp_ok=false; ok=0; fi
else
  ftp_ok=false; ok=0;
fi
log "checking Prometheus"
if wait_http "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready" 60; then prom_ok=true; else prom_ok=false; ok=0; fi
log "checking Grafana"
gf_code=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:${GRAFANA_PORT:-3000}/api/health" || echo 000)
gf_ok=false; if [[ "$gf_code" == "200" ]]; then body=$(curl -sS "http://localhost:${GRAFANA_PORT:-3000}/api/health"); echo "$body" | grep -q '"database"\s*:\s*"ok"' && gf_ok=true; fi
[[ "$gf_ok" == true ]] || ok=0
log "checking Alertmanager"
if wait_http "http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status" 60; then alert_ok=true; else alert_ok=false; ok=0; fi
log "checking Web-Proxy"
p8080=$(code_for "http://localhost:${WEB_PROXY_PORT_8080:-8080}/")
p8083=$(code_for "http://localhost:${WEB_PROXY_PORT_8083:-8083}/")
cors8084=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8084:-8084}/api/v2/status" || true)
cors8085=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8085:-8085}/api/v1/master/nodes" || true)
wp_ok=true
# Some environments return 403 for the landing pages; accept 200/403 (plus 302 on 8083).
([[ "$p8080" == 200 || "$p8080" == 403 ]]) || wp_ok=false
([[ "$p8083" == 200 || "$p8083" == 302 || "$p8083" == 403 ]]) || wp_ok=false
[[ -n "$cors8084" && -n "$cors8085" ]] || wp_ok=false
[[ "$wp_ok" == true ]] || ok=0
# BUGFIX: master_readyz/prometheus/alertmanager were hardcoded `true` here;
# report the measured results so the JSON reflects reality.
cat > "$tmp" <<JSON
{
  "es": $es_ok,
  "kibana": $kb_ok,
  "master_readyz": $master_ok,
  "ftp_share_writable": $ftp_ok,
  "prometheus": $prom_ok,
  "grafana": $gf_ok,
  "alertmanager": $alert_ok,
  "web_proxy": $wp_ok,
  "overlay_net": $net_ok,
  "timestamp": "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
}
JSON
if ! mv "$tmp" "$OUT_JSON" 2>/dev/null; then
  # fallback when logs dir not writable (no sudo allowed)
  OUT_JSON="/tmp/selfcheck_$(date -u +%Y%m%d-%H%M%SZ).json"
  cp "$tmp" "$OUT_JSON"
  log "selfcheck.json written to $OUT_JSON (logs dir not writable)"
fi
if [[ "$ok" == 1 ]]; then
  log "selfcheck OK"
  exit 0
else
  err "selfcheck FAILED (see $OUT_JSON)"
  # If diagnose script exists, run it to collect more details
  if [[ -x "$SCRIPT_DIR/server-diagnose.sh" ]]; then
    # run diagnose; it will print the actual timestamped file paths and update 'latest' symlinks
    "$SCRIPT_DIR/server-diagnose.sh" || true
  fi
  exit 1
fi

View File

@ -1,28 +0,0 @@
#!/usr/bin/env bash
# server-status.sh — quick operator view: compose container status, the key
# service endpoints (ports resolved from compose/.env), and the most recent
# selfcheck result.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
PROJECT_NAME="argus-sys"
# Prefer the Docker Compose v2 plugin; fall back to standalone docker-compose.
if docker compose version >/dev/null 2>&1; then COMPOSE=(docker compose); else COMPOSE=(docker-compose); fi
echo "== Containers =="
(cd "$ROOT/compose" && "${COMPOSE[@]}" -p "$PROJECT_NAME" ps)
echo
echo "== Key Endpoints =="
# Source .env (if present) so the printed ports match the deployed ones.
ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a
printf "master http://localhost:%s/readyz\n" "${MASTER_PORT:-32300}"
printf "es http://localhost:%s/_cluster/health\n" "${ES_HTTP_PORT:-9200}"
printf "kibana http://localhost:%s/api/status\n" "${KIBANA_PORT:-5601}"
printf "prom http://localhost:%s/-/ready\n" "${PROMETHEUS_PORT:-9090}"
printf "grafana http://localhost:%s/api/health\n" "${GRAFANA_PORT:-3000}"
printf "alert http://localhost:%s/api/v2/status\n" "${ALERTMANAGER_PORT:-9093}"
printf "web http://localhost:%s/ (8080)\n" "${WEB_PROXY_PORT_8080:-8080}"
echo
echo "== Selfcheck result =="
# `|| echo` keeps the script exiting 0 before the first selfcheck has run.
cat "$ROOT/logs/selfcheck.json" 2>/dev/null || echo "(no selfcheck yet)"

View File

@ -1,16 +0,0 @@
#!/usr/bin/env bash
# server-uninstall.sh — tear down the compose stack (containers + named
# volumes). Data under private/ is intentionally left in place; the operator
# removes it manually for a clean slate.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PKG_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
PROJECT_NAME="argus-sys"
log() { echo -e "\033[0;34m[UNINSTALL]\033[0m $*"; }
# Prefer the Docker Compose v2 plugin; fall back to standalone docker-compose.
if docker compose version >/dev/null 2>&1; then COMPOSE=(docker compose); else COMPOSE=(docker-compose); fi
# `|| true`: uninstall should succeed even when the stack is already gone.
(cd "$PKG_ROOT/compose" && "${COMPOSE[@]}" -p "$PROJECT_NAME" down -v || true)
log "compose stack removed"
log "you may remove data under $PKG_ROOT/private if you want a clean slate"

View File

@ -37,22 +37,11 @@ _argus_is_number() {
[[ "$1" =~ ^[0-9]+$ ]] [[ "$1" =~ ^[0-9]+$ ]]
} }
load_build_user() { _argus_read_user_from_files() {
if [[ "$_ARGUS_BUILD_USER_LOADED" == "1" ]]; then local uid_out_var="$1" gid_out_var="$2"; shift 2
return 0 local uid_val="$ARGUS_BUILD_UID_DEFAULT" gid_val="$ARGUS_BUILD_GID_DEFAULT"
fi local config
for config in "$@"; do
local project_root config_files config uid gid
project_root="$(argus_project_root)"
config_files=(
"$project_root/configs/build_user.local.conf"
"$project_root/configs/build_user.conf"
)
uid="$ARGUS_BUILD_UID_DEFAULT"
gid="$ARGUS_BUILD_GID_DEFAULT"
for config in "${config_files[@]}"; do
if [[ -f "$config" ]]; then if [[ -f "$config" ]]; then
while IFS= read -r raw_line || [[ -n "$raw_line" ]]; do while IFS= read -r raw_line || [[ -n "$raw_line" ]]; do
local line key value local line key value
@ -68,42 +57,58 @@ load_build_user() {
key="$(_argus_trim "$key")" key="$(_argus_trim "$key")"
value="$(_argus_trim "$value")" value="$(_argus_trim "$value")"
case "$key" in case "$key" in
UID) UID) uid_val="$value" ;;
uid="$value" GID) gid_val="$value" ;;
;; *) echo "[ARGUS build_user] Unknown key '$key' in $config" >&2 ;;
GID)
gid="$value"
;;
*)
echo "[ARGUS build_user] Unknown key '$key' in $config" >&2
;;
esac esac
done < "$config" done < "$config"
break break
fi fi
done done
printf -v "$uid_out_var" '%s' "$uid_val"
printf -v "$gid_out_var" '%s' "$gid_val"
}
if [[ -n "${ARGUS_BUILD_UID:-}" ]]; then load_build_user_profile() {
uid="$ARGUS_BUILD_UID" local profile="${1:-default}"
fi if [[ "$_ARGUS_BUILD_USER_LOADED" == "1" ]]; then
if [[ -n "${ARGUS_BUILD_GID:-}" ]]; then return 0
gid="$ARGUS_BUILD_GID"
fi fi
local project_root uid gid
project_root="$(argus_project_root)"
case "$profile" in
pkg)
_argus_read_user_from_files uid gid \
"$project_root/configs/build_user.pkg.conf" \
"$project_root/configs/build_user.local.conf" \
"$project_root/configs/build_user.conf"
;;
default|*)
_argus_read_user_from_files uid gid \
"$project_root/configs/build_user.local.conf" \
"$project_root/configs/build_user.conf"
;;
esac
if [[ -n "${ARGUS_BUILD_UID:-}" ]]; then uid="$ARGUS_BUILD_UID"; fi
if [[ -n "${ARGUS_BUILD_GID:-}" ]]; then gid="$ARGUS_BUILD_GID"; fi
if ! _argus_is_number "$uid"; then if ! _argus_is_number "$uid"; then
echo "[ARGUS build_user] Invalid UID '$uid'" >&2 echo "[ARGUS build_user] Invalid UID '$uid'" >&2; return 1
return 1
fi fi
if ! _argus_is_number "$gid"; then if ! _argus_is_number "$gid"; then
echo "[ARGUS build_user] Invalid GID '$gid'" >&2 echo "[ARGUS build_user] Invalid GID '$gid'" >&2; return 1
return 1
fi fi
export ARGUS_BUILD_UID="$uid" export ARGUS_BUILD_UID="$uid"
export ARGUS_BUILD_GID="$gid" export ARGUS_BUILD_GID="$gid"
_ARGUS_BUILD_USER_LOADED=1 _ARGUS_BUILD_USER_LOADED=1
} }
load_build_user() {
local profile="${ARGUS_BUILD_PROFILE:-default}"
load_build_user_profile "$profile"
}
argus_build_user_args() { argus_build_user_args() {
load_build_user load_build_user
printf '%s' "--build-arg ARGUS_BUILD_UID=${ARGUS_BUILD_UID} --build-arg ARGUS_BUILD_GID=${ARGUS_BUILD_GID}" printf '%s' "--build-arg ARGUS_BUILD_UID=${ARGUS_BUILD_UID} --build-arg ARGUS_BUILD_GID=${ARGUS_BUILD_GID}"

View File

@ -1 +1 @@
1.43.0 1.44.0

0
src/sys/tests/scripts/16_web_verify.sh Normal file → Executable file
View File

1
src/web/.gitignore vendored
View File

@ -7,6 +7,7 @@ playwright-report/
# Build output # Build output
/dist /dist
/build /build
/test-results
# Dependency directories # Dependency directories
jspm_packages/ jspm_packages/