完成H20服务器部署及重启测试 #51
1
deployment_new/.gitignore
vendored
Normal file
1
deployment_new/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
artifact/
|
||||||
14
deployment_new/README.md
Normal file
14
deployment_new/README.md
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
# deployment_new

本目录用于新的部署打包与交付实现(不影响既有 `deployment/`)。

里程碑 M1(当前实现)
- `build/make_server_package.sh`:生成 Server 包(逐服务镜像 tar.gz、compose、.env.example、docs、private 骨架、manifest/checksums、打包 tar.gz)。
- `build/make_client_gpu_package.sh`:生成 Client-GPU 包(GPU bundle 镜像 tar.gz、busybox.tar、compose、.env.example、docs、private 骨架、manifest/checksums、打包 tar.gz)。

模板
- `templates/server/compose/docker-compose.yml`:部署专用,镜像默认使用 `:${PKG_VERSION}` 版本 tag,可通过 `.env` 覆盖。
- `templates/client_gpu/compose/docker-compose.yml`:GPU 节点专用,使用 `:${PKG_VERSION}` 版本 tag。

注意:M1 仅产出安装包,不包含安装脚本落地;安装/运维脚本将在 M2 落地并纳入包内。
33
deployment_new/build/common.sh
Normal file
33
deployment_new/build/common.sh
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
#!/usr/bin/env bash
# Shared helpers for deployment_new build scripts: colored logging,
# dependency checks, deterministic checksums and manifests.
set -euo pipefail

# Colored log helpers; err() writes to stderr so it survives stdout capture.
log()  { echo -e "\033[0;34m[INFO]\033[0m $*"; }
warn() { echo -e "\033[1;33m[WARN]\033[0m $*"; }
err()  { echo -e "\033[0;31m[ERR ]\033[0m $*" >&2; }

# require_cmd CMD...: report every missing command, return non-zero if any.
require_cmd() {
  local miss=0 c
  for c in "$@"; do
    if ! command -v "$c" >/dev/null 2>&1; then err "missing command: $c"; miss=1; fi
  done
  [[ $miss -eq 0 ]]
}

# Default package version: today's date as YYYYMMDD.
today_version() { date +%Y%m%d; }

# checksum_dir DIR OUT: write sha256 sums of every file under DIR to OUT,
# sorted by path for a deterministic, reproducible result.
checksum_dir() {
  local dir="$1" out="$2"
  : > "$out"
  # -r (GNU xargs): skip running sha256sum entirely when DIR has no files.
  # Without it, xargs invokes sha256sum with no arguments and a bogus
  # checksum-of-empty-stdin line ("e3b0c442... -") ends up in OUT.
  (cd "$dir" && find . -type f -print0 | sort -z | xargs -0 -r sha256sum) >> "$out"
}

make_dir() { mkdir -p "$1"; }

# copy_tree SRC DST: mirror SRC into DST. Prefers rsync --delete; the cp
# fallback does NOT delete files already present in DST (best-effort only).
copy_tree() {
  local src="$1" dst="$2"
  rsync -a --delete "$src/" "$dst/" 2>/dev/null || cp -r "$src/." "$dst/"
}

# gen_manifest ROOT OUT: sorted listing of files (max 4 levels deep) under
# ROOT. NOTE: find -printf is GNU-specific, matching the rest of the tooling.
gen_manifest() {
  local root="$1" out="$2"
  : > "$out"
  (cd "$root" && find . -maxdepth 4 -type f -printf "%p\n" | sort) >> "$out"
}
131
deployment_new/build/make_client_gpu_package.sh
Executable file
131
deployment_new/build/make_client_gpu_package.sh
Executable file
@ -0,0 +1,131 @@
|
|||||||
|
#!/usr/bin/env bash
# Build the Client-GPU delivery package: the GPU bundle image saved under a
# version tag, a busybox warmup image, compose file, .env template, docs,
# scripts, private skeleton, manifest/checksums, and a final tar.gz.
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
TEMPL_DIR="$ROOT_DIR/deployment_new/templates/client_gpu"
ART_ROOT="$ROOT_DIR/deployment_new/artifact/client_gpu"

# Shared helpers: log/warn/err, require_cmd, today_version, gen_manifest, ...
COMMON_SH="$ROOT_DIR/deployment_new/build/common.sh"
. "$COMMON_SH"

usage(){ cat <<EOF
Build Client-GPU Package (deployment_new)

Usage: $(basename "$0") --version YYYYMMDD [--image IMAGE[:TAG]]

Defaults:
  image = argus-sys-metric-test-node-bundle-gpu:latest

Outputs: deployment_new/artifact/client_gpu/<YYYYMMDD>/ and client_gpu_YYYYMMDD.tar.gz
EOF
}

VERSION=""
IMAGE="argus-sys-metric-test-node-bundle-gpu:latest"
while [[ $# -gt 0 ]]; do
  case "$1" in
    --version) VERSION="$2"; shift 2;;
    --image)   IMAGE="$2"; shift 2;;
    -h|--help) usage; exit 0;;
    *) err "unknown arg: $1"; usage; exit 1;;
  esac
done
[[ -n "$VERSION" ]] || VERSION="$(today_version)"

require_cmd docker tar gzip

STAGE="$(mktemp -d)"; trap 'rm -rf "$STAGE"' EXIT
PKG_DIR="$ART_ROOT/$VERSION"
mkdir -p "$PKG_DIR" "$STAGE/images" "$STAGE/compose" "$STAGE/docs" "$STAGE/scripts" "$STAGE/private/argus"

# 1) Save the GPU bundle image under a version tag
docker image inspect "$IMAGE" >/dev/null 2>&1 || { err "missing image: $IMAGE"; exit 1; }

REPO="${IMAGE%%:*}"
TAG_VER="$REPO:$VERSION"
docker tag "$IMAGE" "$TAG_VER"
bundle_tar="$STAGE/images/${REPO//\//-}-$VERSION.tar"
docker save -o "$bundle_tar" "$TAG_VER"
gzip -f "$bundle_tar"

# 2) Busybox tar for connectivity/overlay warmup
#    (prefer the template copy; fall back to the local docker daemon)
BB_SRC="$TEMPL_DIR/images/busybox.tar"
if [[ -f "$BB_SRC" ]]; then
  cp "$BB_SRC" "$STAGE/images/busybox.tar"
elif docker image inspect busybox:latest >/dev/null 2>&1 || docker pull busybox:latest >/dev/null 2>&1; then
  docker save -o "$STAGE/images/busybox.tar" busybox:latest
  log "Included busybox from local docker daemon"
else
  warn "busybox image not found and cannot pull; skipping busybox.tar"
fi

# 3) Compose + env template
cp "$TEMPL_DIR/compose/docker-compose.yml" "$STAGE/compose/docker-compose.yml"
ENV_EX="$STAGE/compose/.env.example"
cat >"$ENV_EX" <<EOF
# Generated by make_client_gpu_package.sh
PKG_VERSION=$VERSION

NODE_GPU_BUNDLE_IMAGE_TAG=${REPO}:${VERSION}

# Required (no defaults). Must be filled before install.
AGENT_ENV=
AGENT_USER=
AGENT_INSTANCE=
GPU_NODE_HOSTNAME=

# From cluster-info.env (server package output)
BINDIP=
FTPIP=
SWARM_MANAGER_ADDR=
SWARM_JOIN_TOKEN_WORKER=
SWARM_JOIN_TOKEN_MANAGER=

# FTP defaults
FTP_USER=ftpuser
FTP_PASSWORD=NASPlab1234!
EOF

# 4) Docs from deployment_new templates
CLIENT_DOC_SRC="$TEMPL_DIR/docs"
if [[ -d "$CLIENT_DOC_SRC" ]]; then
  rsync -a "$CLIENT_DOC_SRC/" "$STAGE/docs/" >/dev/null 2>&1 || cp -r "$CLIENT_DOC_SRC/." "$STAGE/docs/"
fi

# Placeholder scripts README (real scripts land in M2); template scripts
# copied below may coexist with it.
cat >"$STAGE/scripts/README.md" <<'EOF'
# Client-GPU Scripts (Placeholder)

本目录将在 M2 引入:
- config.sh / install.sh

当前为占位,便于包结构审阅。
EOF

# 5) Scripts (from deployment_new templates) and private skeleton
SCRIPTS_SRC="$TEMPL_DIR/scripts"
if [[ -d "$SCRIPTS_SRC" ]]; then
  rsync -a "$SCRIPTS_SRC/" "$STAGE/scripts/" >/dev/null 2>&1 || cp -r "$SCRIPTS_SRC/." "$STAGE/scripts/"
  find "$STAGE/scripts" -type f -name '*.sh' -exec chmod +x {} + 2>/dev/null || true
fi
mkdir -p "$STAGE/private/argus/agent"

# 6) Manifest & checksums (deterministic; see common.sh)
gen_manifest "$STAGE" "$STAGE/manifest.txt"
checksum_dir "$STAGE" "$STAGE/checksums.txt"

# 7) Move to artifact dir and pack
mkdir -p "$PKG_DIR"
rsync -a "$STAGE/" "$PKG_DIR/" >/dev/null 2>&1 || cp -r "$STAGE/." "$PKG_DIR/"

OUT_TAR_DIR="$(dirname "$PKG_DIR")"
OUT_TAR="$OUT_TAR_DIR/client_gpu_${VERSION}.tar.gz"
log "Creating tarball: $OUT_TAR"
(cd "$PKG_DIR/.." && tar -czf "$OUT_TAR" "$(basename "$PKG_DIR")")
log "Client-GPU package ready: $PKG_DIR"
echo "$OUT_TAR"
166
deployment_new/build/make_server_package.sh
Executable file
166
deployment_new/build/make_server_package.sh
Executable file
@ -0,0 +1,166 @@
|
|||||||
|
#!/usr/bin/env bash
# Build the Server deployment package: per-service images saved under a
# version tag, compose file, .env template, docs/scripts, private skeleton,
# manifest/checksums, and the final tar.gz.
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
TEMPL_DIR="$ROOT_DIR/deployment_new/templates/server"
ART_ROOT="$ROOT_DIR/deployment_new/artifact/server"

# Shared helpers: log/warn/err, require_cmd, today_version, gen_manifest, ...
COMMON_SH="$ROOT_DIR/deployment_new/build/common.sh"
. "$COMMON_SH"

usage(){ cat <<EOF
Build Server Deployment Package (deployment_new)

Usage: $(basename "$0") --version YYYYMMDD

Outputs: deployment_new/artifact/server/<YYYYMMDD>/ and server_YYYYMMDD.tar.gz
EOF
}

VERSION=""
while [[ $# -gt 0 ]]; do
  case "$1" in
    --version) VERSION="$2"; shift 2;;
    -h|--help) usage; exit 0;;
    *) err "unknown arg: $1"; usage; exit 1;;
  esac
done
if [[ -z "$VERSION" ]]; then VERSION="$(today_version)"; fi

# awk/sed are no longer required: image tags are rendered directly into the
# heredoc below instead of being patched in afterwards.
require_cmd docker tar gzip

# Services whose images are shipped inside the package.
IMAGES=(
  argus-bind9
  argus-master
  argus-elasticsearch
  argus-kibana
  argus-metric-ftp
  argus-metric-prometheus
  argus-metric-grafana
  argus-alertmanager
  argus-web-frontend
  argus-web-proxy
)

STAGE="$(mktemp -d)"; trap 'rm -rf "$STAGE"' EXIT
PKG_DIR="$ART_ROOT/$VERSION"
mkdir -p "$PKG_DIR" "$STAGE/images" "$STAGE/compose" "$STAGE/docs" "$STAGE/scripts" "$STAGE/private/argus"

# 1) Save per-image tars with the version tag
log "Tagging and saving images (version=$VERSION)"
for repo in "${IMAGES[@]}"; do
  if ! docker image inspect "$repo:latest" >/dev/null 2>&1 && ! docker image inspect "$repo:$VERSION" >/dev/null 2>&1; then
    err "missing image: $repo (need :latest or :$VERSION)"; exit 1
  fi
  # Reuse an existing versioned tag; otherwise derive it from :latest.
  if ! docker image inspect "$repo:$VERSION" >/dev/null 2>&1; then
    docker tag "$repo:latest" "$repo:$VERSION"
  fi
  tag="$repo:$VERSION"
  out_tar="$STAGE/images/${repo//\//-}-$VERSION.tar"
  docker save -o "$out_tar" "$tag"
  gzip -f "$out_tar"
done

# 2) Compose + env template
cp "$TEMPL_DIR/compose/docker-compose.yml" "$STAGE/compose/docker-compose.yml"
ENV_EX="$STAGE/compose/.env.example"
# Versioned tags are expanded directly here; this replaces the previous
# fragile post-processing step (`sed -i "s#:\$#:${VERSION}#g"`) that appended
# the version to every line ending in ':'.
cat >"$ENV_EX" <<EOF
# Generated by make_server_package.sh
PKG_VERSION=$VERSION

# Image tags (can be overridden). Default to versioned tags
BIND_IMAGE_TAG=argus-bind9:$VERSION
MASTER_IMAGE_TAG=argus-master:$VERSION
ES_IMAGE_TAG=argus-elasticsearch:$VERSION
KIBANA_IMAGE_TAG=argus-kibana:$VERSION
FTP_IMAGE_TAG=argus-metric-ftp:$VERSION
PROM_IMAGE_TAG=argus-metric-prometheus:$VERSION
GRAFANA_IMAGE_TAG=argus-metric-grafana:$VERSION
ALERT_IMAGE_TAG=argus-alertmanager:$VERSION
FRONT_IMAGE_TAG=argus-web-frontend:$VERSION
WEB_PROXY_IMAGE_TAG=argus-web-proxy:$VERSION
EOF

# Ports and defaults (based on swarm_tests .env.example)
cat >>"$ENV_EX" <<'EOF'

# Host ports for server compose
MASTER_PORT=32300
ES_HTTP_PORT=9200
KIBANA_PORT=5601
PROMETHEUS_PORT=9090
GRAFANA_PORT=3000
ALERTMANAGER_PORT=9093
WEB_PROXY_PORT_8080=8080
WEB_PROXY_PORT_8081=8081
WEB_PROXY_PORT_8082=8082
WEB_PROXY_PORT_8083=8083
WEB_PROXY_PORT_8084=8084
WEB_PROXY_PORT_8085=8085

# Overlay network name
ARGUS_OVERLAY_NET=argus-sys-net

# FTP defaults
FTP_USER=ftpuser
FTP_PASSWORD=NASPlab1234!

# UID/GID for volume ownership
ARGUS_BUILD_UID=2133
ARGUS_BUILD_GID=2015
EOF

# 3) Docs (from deployment_new templates)
DOCS_SRC="$TEMPL_DIR/docs"
if [[ -d "$DOCS_SRC" ]]; then
  rsync -a "$DOCS_SRC/" "$STAGE/docs/" >/dev/null 2>&1 || cp -r "$DOCS_SRC/." "$STAGE/docs/"
fi

# 4) Scripts (from deployment_new templates)
SCRIPTS_SRC="$TEMPL_DIR/scripts"
if [[ -d "$SCRIPTS_SRC" ]]; then
  rsync -a "$SCRIPTS_SRC/" "$STAGE/scripts/" >/dev/null 2>&1 || cp -r "$SCRIPTS_SRC/." "$STAGE/scripts/"
  find "$STAGE/scripts" -type f -name '*.sh' -exec chmod +x {} + 2>/dev/null || true
fi

# 5) Private skeleton: minimum directory layout the services mount/expect
mkdir -p \
  "$STAGE/private/argus/etc" \
  "$STAGE/private/argus/master" \
  "$STAGE/private/argus/metric/prometheus" \
  "$STAGE/private/argus/metric/prometheus/data" \
  "$STAGE/private/argus/metric/prometheus/rules" \
  "$STAGE/private/argus/metric/prometheus/targets" \
  "$STAGE/private/argus/metric/grafana" \
  "$STAGE/private/argus/metric/grafana/data" \
  "$STAGE/private/argus/metric/grafana/logs" \
  "$STAGE/private/argus/metric/grafana/plugins" \
  "$STAGE/private/argus/metric/grafana/provisioning/datasources" \
  "$STAGE/private/argus/metric/grafana/provisioning/dashboards" \
  "$STAGE/private/argus/metric/grafana/data/sessions" \
  "$STAGE/private/argus/metric/grafana/data/dashboards" \
  "$STAGE/private/argus/metric/grafana/config" \
  "$STAGE/private/argus/metric/ftp" \
  "$STAGE/private/argus/alert/alertmanager" \
  "$STAGE/private/argus/log/elasticsearch" \
  "$STAGE/private/argus/log/kibana"

# 6) Manifest & checksums (deterministic; see common.sh)
gen_manifest "$STAGE" "$STAGE/manifest.txt"
checksum_dir "$STAGE" "$STAGE/checksums.txt"

# 7) Move to artifact dir and pack
mkdir -p "$PKG_DIR"
rsync -a "$STAGE/" "$PKG_DIR/" >/dev/null 2>&1 || cp -r "$STAGE/." "$PKG_DIR/"

OUT_TAR_DIR="$(dirname "$PKG_DIR")"
OUT_TAR="$OUT_TAR_DIR/server_${VERSION}.tar.gz"
log "Creating tarball: $OUT_TAR"
(cd "$PKG_DIR/.." && tar -czf "$OUT_TAR" "$(basename "$PKG_DIR")")
log "Server package ready: $PKG_DIR"
echo "$OUT_TAR"
@ -0,0 +1,41 @@
|
|||||||
|
version: "3.8"

networks:
  # Shared Swarm overlay; must already exist on the manager (external).
  argus-sys-net:
    external: true

services:
  metric-gpu-node:
    # Versioned GPU bundle image; override via NODE_GPU_BUNDLE_IMAGE_TAG in .env.
    image: ${NODE_GPU_BUNDLE_IMAGE_TAG:-argus-sys-metric-test-node-bundle-gpu:${PKG_VERSION}}
    container_name: argus-metric-gpu-node-swarm
    hostname: ${GPU_NODE_HOSTNAME}
    restart: unless-stopped
    privileged: true
    # NVIDIA container runtime exposes the host GPUs to the container.
    runtime: nvidia
    environment:
      - TZ=Asia/Shanghai
      - DEBIAN_FRONTEND=noninteractive
      - MASTER_ENDPOINT=${MASTER_ENDPOINT:-http://master.argus.com:3000}
      # Fluent Bit / log upload targets (fixed domain names, resolved via bind9)
      - ES_HOST=es.log.argus.com
      - ES_PORT=9200
      - FTPIP=${FTPIP}
      - BINDIP=${BINDIP}
      - FTP_USER=${FTP_USER:-ftpuser}
      - FTP_PASSWORD=${FTP_PASSWORD:-NASPlab1234!}
      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
      - AGENT_ENV=${AGENT_ENV}
      - AGENT_USER=${AGENT_USER}
      - AGENT_INSTANCE=${AGENT_INSTANCE}
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
      - GPU_MODE=gpu
    # Resolve *.argus.com through the cluster's bind9 instance.
    dns:
      - ${BINDIP}
    networks: [argus-sys-net]
    volumes:
      - ../private/argus/agent:/private/argus/agent
      - ../logs/infer:/logs/infer
      - ../logs/train:/logs/train
    command: ["sleep", "infinity"]
@ -0,0 +1,73 @@
|
|||||||
|
# Argus Client-GPU 安装指南(deployment_new)
|
||||||
|
|
||||||
|
## 一、准备条件(开始前确认)
|
||||||
|
- GPU 节点安装了 NVIDIA 驱动,`nvidia-smi` 正常;
|
||||||
|
- Docker & Docker Compose v2 已安装;
|
||||||
|
- 使用统一账户 `argus`(UID=2133,GID=2015)执行安装,并加入 `docker` 组(如已创建可跳过):
|
||||||
|
```bash
|
||||||
|
sudo groupadd --gid 2015 argus || true
|
||||||
|
sudo useradd --uid 2133 --gid 2015 --create-home --shell /bin/bash argus || true
|
||||||
|
sudo passwd argus
|
||||||
|
sudo usermod -aG docker argus
|
||||||
|
su - argus -c 'id; docker ps >/dev/null && echo OK || echo NO_DOCKER_PERMISSION'
|
||||||
|
```
|
||||||
|
后续解压与执行(config/install/uninstall)均使用 `argus` 账户进行。
|
||||||
|
- 从 Server 安装方拿到 `cluster-info.env`(包含 `SWARM_MANAGER_ADDR/BINDIP/FTPIP/SWARM_JOIN_TOKEN_*`)。
|
||||||
|
|
||||||
|
## 二、解包
|
||||||
|
- `tar -xzf client_gpu_YYYYMMDD.tar.gz`
|
||||||
|
- 进入目录:`cd client_gpu_YYYYMMDD/`
|
||||||
|
- 你应当看到:`images/`(GPU bundle、busybox)、`compose/`、`scripts/`、`docs/`。
|
||||||
|
|
||||||
|
## 三、配置 config(预热 overlay + 生成 .env)
|
||||||
|
命令:
|
||||||
|
```
|
||||||
|
cp /path/to/cluster-info.env ./ # 或 export CLUSTER_INFO=/abs/path/cluster-info.env
|
||||||
|
./scripts/config.sh
|
||||||
|
```
|
||||||
|
脚本做了什么:
|
||||||
|
- 读取 `cluster-info.env` 并 `docker swarm join`(幂等);
|
||||||
|
- 自动用 busybox 预热 external overlay `argus-sys-net`,等待最多 60s 直到本机可见;
|
||||||
|
- 生成/更新 `compose/.env`:填入 `BINDIP/FTPIP/SWARM_*`,并“保留你已填写的 AGENT_* 与 GPU_NODE_HOSTNAME”(不会覆盖)。
|
||||||
|
|
||||||
|
看到什么才算成功:
|
||||||
|
- 终端输出类似:`已预热 overlay=argus-sys-net 并生成 compose/.env;可执行 scripts/install.sh`;
|
||||||
|
- `compose/.env` 至少包含:
|
||||||
|
- `AGENT_ENV/AGENT_USER/AGENT_INSTANCE/GPU_NODE_HOSTNAME`(需要你提前填写);
|
||||||
|
- `BINDIP/FTPIP/SWARM_MANAGER_ADDR/SWARM_JOIN_TOKEN_*`;
|
||||||
|
- `NODE_GPU_BUNDLE_IMAGE_TAG=...:YYYYMMDD`。
|
||||||
|
|
||||||
|
### 日志映射(重要)
|
||||||
|
- 容器内 `/logs/infer` 与 `/logs/train` 已映射到包根 `./logs/infer` 与 `./logs/train`:
|
||||||
|
- 你可以直接在宿主机查看推理/训练日志:`tail -f logs/infer/*.log`、`tail -f logs/train/*.log`;
|
||||||
|
- install 脚本会自动创建这两个目录。
|
||||||
|
|
||||||
|
若提示缺少必填项:
|
||||||
|
- 打开 `compose/.env` 按提示补齐 `AGENT_*` 与 `GPU_NODE_HOSTNAME`,再次执行 `./scripts/config.sh`(脚本不会覆盖你已填的值)。
|
||||||
|
|
||||||
|
## 四、安装 install(加载镜像 + 起容器 + 跟日志)
|
||||||
|
命令:
|
||||||
|
```
|
||||||
|
./scripts/install.sh
|
||||||
|
```
|
||||||
|
脚本做了什么:
|
||||||
|
- 如有必要,先自动预热 overlay;
|
||||||
|
- 从 `images/` 导入 `argus-sys-metric-test-node-bundle-gpu-*.tar.gz` 到本地 Docker;
|
||||||
|
- `docker compose up -d` 启动 GPU 节点容器,并自动执行 `docker logs -f argus-metric-gpu-node-swarm` 跟踪安装过程。
|
||||||
|
|
||||||
|
看到什么才算成功:
|
||||||
|
- 日志中出现:`[BOOT] local bundle install OK: version=...` / `dcgm-exporter ... listening` / `node state present: /private/argus/agent/<hostname>/node.json`;
|
||||||
|
- `docker exec argus-metric-gpu-node-swarm nvidia-smi -L` 能列出 GPU;
|
||||||
|
- 在 Server 侧 Prometheus `/api/v1/targets` 中,GPU 节点 9100(node-exporter)与 9400(dcgm-exporter)至少其一 up。
|
||||||
|
|
||||||
|
## 五、卸载 uninstall
|
||||||
|
命令:
|
||||||
|
```
|
||||||
|
./scripts/uninstall.sh
|
||||||
|
```
|
||||||
|
行为:Compose down(如有 .env),并删除 warmup 容器与节点容器。
|
||||||
|
|
||||||
|
## 六、常见问题
|
||||||
|
- `本机未看到 overlay`:config/install 已自动预热;若仍失败,请检查与 manager 的网络连通性以及 manager 上是否已创建 `argus-sys-net`。
|
||||||
|
- `busybox 缺失`:确保包根 `images/busybox.tar` 在,或主机已有 `busybox:latest`。
|
||||||
|
- `加入 Swarm 失败`:确认 `cluster-info.env` 的 `SWARM_MANAGER_ADDR` 与 `SWARM_JOIN_TOKEN_WORKER` 正确,或在 manager 上重新 `docker swarm join-token -q worker` 后更新该文件。
|
||||||
BIN
deployment_new/templates/client_gpu/images/busybox.tar
Normal file
BIN
deployment_new/templates/client_gpu/images/busybox.tar
Normal file
Binary file not shown.
81
deployment_new/templates/client_gpu/scripts/config.sh
Normal file
81
deployment_new/templates/client_gpu/scripts/config.sh
Normal file
@ -0,0 +1,81 @@
|
|||||||
|
#!/usr/bin/env bash
# Client-GPU config step: join the Docker Swarm (idempotent), warm up the
# external overlay network with a busybox container, verify in-container
# connectivity to BINDIP/FTPIP, then render compose/.env. Keys the operator
# filled in by hand (AGENT_*, GPU_NODE_HOSTNAME) are never overwritten.
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
PKG_ROOT="$ROOT_DIR"
ENV_EX="$PKG_ROOT/compose/.env.example"
ENV_OUT="$PKG_ROOT/compose/.env"

info(){ echo -e "\033[34m[CONFIG-GPU]\033[0m $*"; }
err(){ echo -e "\033[31m[ERROR]\033[0m $*" >&2; }
require(){ local ok=1; for c in "$@"; do command -v "$c" >/dev/null 2>&1 || { err "缺少依赖: $c"; ok=0; }; done; [[ $ok -eq 1 ]]; }
# Only require commands this script actually invokes. The previous list also
# demanded curl/jq/tar/gzip, which are never used here and made the script
# fail on hosts that lack them.
require docker df awk sed grep nvidia-smi

# 磁盘空间检查(MB):need=10240 即至少 10GB 剩余
check_disk(){ local p="$1"; local need=10240; local free
  free=$(df -Pm "$p" | awk 'NR==2{print $4+0}')
  if [[ -z "$free" || "$free" -lt "$need" ]]; then err "磁盘空间不足: $p 剩余 ${free:-0}MB (<${need}MB)"; return 1; fi
}
# /var/lib/docker check is best-effort (may be a separate mount or missing).
check_disk "$PKG_ROOT"; check_disk "/var/lib/docker" || true

# 导入 cluster-info.env(默认取当前包根,也可用 CLUSTER_INFO 指定路径)
CI_IN="${CLUSTER_INFO:-$PKG_ROOT/cluster-info.env}"
info "读取 cluster-info.env: $CI_IN"
[[ -f "$CI_IN" ]] || { err "找不到 cluster-info.env(默认当前包根,或设置环境变量 CLUSTER_INFO 指定绝对路径)"; exit 1; }
set -a; source "$CI_IN"; set +a
[[ -n "${SWARM_MANAGER_ADDR:-}" && -n "${SWARM_JOIN_TOKEN_WORKER:-}" ]] || { err "cluster-info.env 缺少 SWARM 信息(SWARM_MANAGER_ADDR/SWARM_JOIN_TOKEN_WORKER)"; exit 1; }

# 加入 Swarm(幂等;已加入时 join 会失败,忽略其退出码)
info "加入 Swarm(幂等):$SWARM_MANAGER_ADDR"
docker swarm join --token "$SWARM_JOIN_TOKEN_WORKER" "$SWARM_MANAGER_ADDR":2377 >/dev/null 2>&1 || true

# 导入 busybox 并做 overlay 预热与连通性(总是执行)
NET_NAME="${ARGUS_OVERLAY_NET:-argus-sys-net}"
# 准备 busybox:优先本地镜像,其次包内 images/busybox.tar
if ! docker image inspect busybox:latest >/dev/null 2>&1; then
  if [[ -f "$PKG_ROOT/images/busybox.tar" ]]; then
    info "加载 busybox.tar 以预热 overlay"
    docker load -i "$PKG_ROOT/images/busybox.tar" >/dev/null
  else
    err "缺少 busybox 镜像(包内 images/busybox.tar 或本地 busybox:latest),无法预热 overlay $NET_NAME"; exit 1
  fi
fi
# 预热容器(worker 侧加入 overlay 以便本地可见);等待最多 60s
docker rm -f argus-net-warmup >/dev/null 2>&1 || true
info "启动 warmup 容器加入 overlay: $NET_NAME"
docker run -d --rm --name argus-net-warmup --network "$NET_NAME" ${BINDIP:+--dns "$BINDIP"} busybox:latest sleep 600 >/dev/null 2>&1 || true
for i in {1..60}; do docker network inspect "$NET_NAME" >/dev/null 2>&1 && { info "overlay 可见 (t=${i}s)"; break; }; sleep 1; done
docker network inspect "$NET_NAME" >/dev/null 2>&1 || { err "预热后仍未看到 overlay: $NET_NAME;请确认 manager 已创建并网络可达"; exit 1; }

# 从 warmup 容器内测试连通性(必须能 ping 通 BINDIP 与 FTPIP)
ping_ok(){ docker exec argus-net-warmup sh -lc "ping -c 1 -W 2 $1 >/dev/null 2>&1"; }
if [[ -n "${BINDIP:-}" ]]; then
  ping_ok "$BINDIP" || { err "容器内无法 ping 通 BINDIP=$BINDIP;请检查 overlay 与 Bind9 容器状态"; exit 1; }
fi
if [[ -n "${FTPIP:-}" ]]; then
  ping_ok "$FTPIP" || { err "容器内无法 ping 通 FTPIP=$FTPIP;请检查 overlay 与 FTP 容器状态"; exit 1; }
fi

# 生成/更新 .env(保留人工填写项,不覆盖已有键)
if [[ ! -f "$ENV_OUT" ]]; then
  cp "$ENV_EX" "$ENV_OUT"
fi

# set_kv KEY VALUE: replace the KEY= line in-place if present, else append.
set_kv(){ local k="$1" v="$2"; if grep -q "^${k}=" "$ENV_OUT"; then sed -i -E "s#^${k}=.*#${k}=${v}#" "$ENV_OUT"; else echo "${k}=${v}" >> "$ENV_OUT"; fi }

set_kv BINDIP "${BINDIP:-}"
set_kv FTPIP "${FTPIP:-}"
set_kv SWARM_MANAGER_ADDR "${SWARM_MANAGER_ADDR:-}"
set_kv SWARM_JOIN_TOKEN_WORKER "${SWARM_JOIN_TOKEN_WORKER:-}"
set_kv SWARM_JOIN_TOKEN_MANAGER "${SWARM_JOIN_TOKEN_MANAGER:-}"

# 人工必填项校验:缺失则报错退出,已填写的值不会被覆盖
REQ_VARS=(AGENT_ENV AGENT_USER AGENT_INSTANCE GPU_NODE_HOSTNAME)
missing=()
for v in "${REQ_VARS[@]}"; do
  val=$(grep -E "^$v=" "$ENV_OUT" | head -1 | cut -d= -f2-)
  if [[ -z "$val" ]]; then missing+=("$v"); fi
done
if [[ ${#missing[@]} -gt 0 ]]; then
  err "以下变量必须在 compose/.env 中填写:${missing[*]}(已保留你现有的内容,不会被覆盖)"; exit 1; fi

info "已生成 compose/.env;可执行 scripts/install.sh"
57
deployment_new/templates/client_gpu/scripts/install.sh
Normal file
57
deployment_new/templates/client_gpu/scripts/install.sh
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
#!/usr/bin/env bash
# Client-GPU install step: ensure the overlay network is visible (warming it
# up with busybox if needed), verify in-container connectivity, load the GPU
# bundle image, start the node via docker compose and follow its logs.
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
PKG_ROOT="$ROOT_DIR"
ENV_FILE="$PKG_ROOT/compose/.env"
COMPOSE_FILE="$PKG_ROOT/compose/docker-compose.yml"

info(){ echo -e "\033[34m[INSTALL-GPU]\033[0m $*"; }
err(){ echo -e "\033[31m[ERROR]\033[0m $*" >&2; }
require(){ local ok=1; for c in "$@"; do command -v "$c" >/dev/null 2>&1 || { err "缺少依赖: $c"; ok=0; }; done; [[ $ok -eq 1 ]]; }
require docker nvidia-smi
# BUGFIX: `docker compose` is a CLI plugin subcommand, not an executable named
# `compose`. The old `require docker docker compose nvidia-smi` ran
# `command -v compose`, which fails on every host without a standalone
# `compose` binary and aborted the install under `set -e`. Probe the plugin
# explicitly instead.
docker compose version >/dev/null 2>&1 || { err "缺少依赖: docker compose"; exit 1; }

[[ -f "$ENV_FILE" ]] || { err "缺少 compose/.env,请先运行 scripts/config.sh"; exit 1; }
info "使用环境文件: $ENV_FILE"

# 预热 overlay(当 config 执行很久之前或容器已被清理时,warmup 可能不存在)
set -a; source "$ENV_FILE"; set +a
NET_NAME="${ARGUS_OVERLAY_NET:-argus-sys-net}"
info "检查 overlay 网络可见性: $NET_NAME"
if ! docker network inspect "$NET_NAME" >/dev/null 2>&1; then
  # 如 Overlay 不可见,尝试用 busybox 预热
  if ! docker image inspect busybox:latest >/dev/null 2>&1; then
    if [[ -f "$PKG_ROOT/images/busybox.tar" ]]; then docker load -i "$PKG_ROOT/images/busybox.tar"; else err "缺少 busybox 镜像(images/busybox.tar 或本地 busybox:latest)"; exit 1; fi
  fi
  docker rm -f argus-net-warmup >/dev/null 2>&1 || true
  docker run -d --rm --name argus-net-warmup --network "$NET_NAME" ${BINDIP:+--dns "$BINDIP"} busybox:latest sleep 600 >/dev/null 2>&1 || true
  # 最多等待 60s 直到 overlay 在本机可见
  for i in {1..60}; do docker network inspect "$NET_NAME" >/dev/null 2>&1 && break; sleep 1; done
  docker network inspect "$NET_NAME" >/dev/null 2>&1 || { err "预热后仍未看到 overlay: $NET_NAME;请确认 manager 已创建并网络可达"; exit 1; }
  info "overlay 已可见(warmup=argus-net-warmup)"
fi

# 容器内连通性检查:BINDIP 与 FTPIP 可达
ping_ok(){ docker exec argus-net-warmup sh -lc "ping -c 1 -W 2 $1 >/dev/null 2>&1"; }
if [[ -n "${BINDIP:-}" ]]; then
  if ping_ok "$BINDIP"; then info "warmup 内可达 BINDIP=$BINDIP"; else err "容器内无法 ping 通 BINDIP=$BINDIP"; exit 1; fi
fi
if [[ -n "${FTPIP:-}" ]]; then
  if ping_ok "$FTPIP"; then info "warmup 内可达 FTPIP=$FTPIP"; else err "容器内无法 ping 通 FTPIP=$FTPIP"; exit 1; fi
fi

# 导入 GPU bundle 镜像(取 images/ 下第一个匹配的 tar.gz)
IMG_TGZ=$(ls -1 "$PKG_ROOT"/images/argus-sys-metric-test-node-bundle-gpu-*.tar.gz 2>/dev/null | head -1 || true)
[[ -n "$IMG_TGZ" ]] || { err "找不到 GPU bundle 镜像 tar.gz"; exit 1; }
info "导入 GPU bundle 镜像: $(basename "$IMG_TGZ")"
tmp=$(mktemp); gunzip -c "$IMG_TGZ" > "$tmp"; docker load -i "$tmp" >/dev/null; rm -f "$tmp"

# 确保日志目录存在(宿主侧,用于映射 /logs/infer 与 /logs/train)
mkdir -p "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train"
info "日志目录已准备: logs/infer logs/train"

# 启动 compose 并跟踪日志
info "启动 GPU 节点 (docker compose up -d)"
docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" up -d
docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" ps
info "跟踪节点容器日志(按 Ctrl+C 退出)"
docker logs -f argus-metric-gpu-node-swarm || true
25
deployment_new/templates/client_gpu/scripts/uninstall.sh
Normal file
25
deployment_new/templates/client_gpu/scripts/uninstall.sh
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
#!/usr/bin/env bash
# Tear down the Client-GPU node: bring the compose project down when a .env
# exists, then force-remove the warmup and node containers by name.
# Idempotent: safe to run repeatedly, every step tolerates "already gone".
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
PKG_ROOT="$ROOT_DIR"
ENV_FILE="$PKG_ROOT/compose/.env"
COMPOSE_FILE="$PKG_ROOT/compose/docker-compose.yml"

info(){ echo -e "\033[34m[UNINSTALL-GPU]\033[0m $*"; }

# Best-effort container removal by name.
rm_container(){ docker rm -f "$1" >/dev/null 2>&1 || true; }

if [[ -f "$ENV_FILE" ]]; then
  info "stopping compose project"
  docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" down --remove-orphans || true
else
  info "compose/.env not found; attempting to remove container by name"
fi

# remove warmup container if still running
rm_container argus-net-warmup

# remove node container if present
rm_container argus-metric-gpu-node-swarm

info "uninstall completed"
175
deployment_new/templates/server/compose/docker-compose.yml
Normal file
175
deployment_new/templates/server/compose/docker-compose.yml
Normal file
@ -0,0 +1,175 @@
|
|||||||
|
# Argus server deployment compose file (deployment_new template).
# Every image defaults to the package version tag (:${PKG_VERSION}); each can
# be overridden individually via the *_IMAGE_TAG variables in compose/.env.
# Host ports all come from .env (filled in by scripts/config.sh).
version: "3.8"

networks:
  # Pre-existing attachable Swarm overlay network; created by scripts/install.sh
  # (docker network create -d overlay --attachable argus-sys-net).
  argus-sys-net:
    external: true

services:
  # Authoritative DNS (bind9) for the internal *.argus.com service domains.
  bind:
    image: ${BIND_IMAGE_TAG:-argus-bind9:${PKG_VERSION}}
    container_name: argus-bind-sys
    networks: [argus-sys-net]
    volumes:
      - ../private:/private
    restart: unless-stopped

  # Argus master API; node liveness thresholds are tuned aggressively
  # (offline after 6s, online within 2s, scheduler ticks every 1s).
  master:
    image: ${MASTER_IMAGE_TAG:-argus-master:${PKG_VERSION}}
    container_name: argus-master-sys
    depends_on: [bind]
    environment:
      - OFFLINE_THRESHOLD_SECONDS=6
      - ONLINE_THRESHOLD_SECONDS=2
      - SCHEDULER_INTERVAL_SECONDS=1
      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
    ports:
      - "${MASTER_PORT:-32300}:3000"
    volumes:
      - ../private/argus/master:/private/argus/master
      - ../private/argus/metric/prometheus:/private/argus/metric/prometheus
      - ../private/argus/etc:/private/argus/etc
    networks: [argus-sys-net]
    restart: unless-stopped

  # Elasticsearch, single-node mode with security disabled (internal overlay
  # traffic only) and a fixed 512 MB heap.
  es:
    image: ${ES_IMAGE_TAG:-argus-elasticsearch:${PKG_VERSION}}
    container_name: argus-es-sys
    environment:
      - discovery.type=single-node
      - xpack.security.enabled=false
      - ES_JAVA_OPTS=-Xms512m -Xmx512m
      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
    volumes:
      - ../private/argus/log/elasticsearch:/private/argus/log/elasticsearch
      - ../private/argus/etc:/private/argus/etc
    ports:
      - "${ES_HTTP_PORT:-9200}:9200"
    restart: unless-stopped
    networks: [argus-sys-net]

  # Kibana, pointed at ES through the bind-managed service domain.
  kibana:
    image: ${KIBANA_IMAGE_TAG:-argus-kibana:${PKG_VERSION}}
    container_name: argus-kibana-sys
    environment:
      - ELASTICSEARCH_HOSTS=http://es.log.argus.com:9200
      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
    volumes:
      - ../private/argus/log/kibana:/private/argus/log/kibana
      - ../private/argus/etc:/private/argus/etc
    depends_on: [es]
    ports:
      - "${KIBANA_PORT:-5601}:5601"
    restart: unless-stopped
    networks: [argus-sys-net]

  # FTP distribution point for metric-agent artifacts.
  # NOTE(review): a default FTP_PASSWORD ships in this template; override it
  # in compose/.env for any non-lab deployment.
  ftp:
    image: ${FTP_IMAGE_TAG:-argus-metric-ftp:${PKG_VERSION}}
    container_name: argus-ftp
    restart: unless-stopped
    environment:
      - TZ=Asia/Shanghai
      - FTP_BASE_PATH=/private/argus/ftp
      - FTP_PASSWORD=${FTP_PASSWORD:-NASPlab1234!}
      - DOMAIN=${FTP_DOMAIN:-ftp.metric.argus.com}
      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
    ports:
      - "${FTP_PORT:-21}:21"
      - "${FTP_DATA_PORT:-20}:20"
      # Passive-mode data ports; host range must match the container range.
      - "${FTP_PASSIVE_HOST_RANGE:-21100-21110}:21100-21110"
    volumes:
      - ../private/argus/metric/ftp:/private/argus/ftp
      - ../private/argus/etc:/private/argus/etc
    networks: [argus-sys-net]

  # Prometheus; scrape targets are maintained under the shared private tree
  # (written by master, see master's prometheus volume above).
  prometheus:
    image: ${PROM_IMAGE_TAG:-argus-metric-prometheus:${PKG_VERSION}}
    container_name: argus-prometheus
    restart: unless-stopped
    environment:
      - TZ=Asia/Shanghai
      - PROMETHEUS_BASE_PATH=/private/argus/metric/prometheus
      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
    ports:
      - "${PROMETHEUS_PORT:-9090}:9090"
    volumes:
      - ../private/argus/metric/prometheus:/private/argus/metric/prometheus
      - ../private/argus/etc:/private/argus/etc
    networks: [argus-sys-net]

  # Grafana with anonymous viewer access enabled; provisioning (datasources,
  # dashboards) is loaded from the shared private tree.
  grafana:
    image: ${GRAFANA_IMAGE_TAG:-argus-metric-grafana:${PKG_VERSION}}
    container_name: argus-grafana
    restart: unless-stopped
    environment:
      - TZ=Asia/Shanghai
      - GRAFANA_BASE_PATH=/private/argus/metric/grafana
      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
      - GF_SERVER_HTTP_PORT=3000
      - GF_LOG_LEVEL=warn
      - GF_LOG_MODE=console
      - GF_PATHS_PROVISIONING=/private/argus/metric/grafana/provisioning
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Viewer
    ports:
      - "${GRAFANA_PORT:-3000}:3000"
    volumes:
      - ../private/argus/metric/grafana:/private/argus/metric/grafana
      - ../private/argus/etc:/private/argus/etc
    depends_on: [prometheus]
    networks: [argus-sys-net]

  # Alertmanager; configuration lives under the shared private tree.
  alertmanager:
    image: ${ALERT_IMAGE_TAG:-argus-alertmanager:${PKG_VERSION}}
    container_name: argus-alertmanager
    environment:
      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
    volumes:
      - ../private/argus/etc:/private/argus/etc
      - ../private/argus/alert/alertmanager:/private/argus/alert/alertmanager
    networks: [argus-sys-net]
    ports:
      - "${ALERTMANAGER_PORT:-9093}:9093"
    restart: unless-stopped

  # Web UI; EXTERNAL_* tell the frontend which host ports the proxy exposes.
  web-frontend:
    image: ${FRONT_IMAGE_TAG:-argus-web-frontend:${PKG_VERSION}}
    container_name: argus-web-frontend
    environment:
      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
      - EXTERNAL_MASTER_PORT=${WEB_PROXY_PORT_8085:-8085}
      - EXTERNAL_ALERTMANAGER_PORT=${WEB_PROXY_PORT_8084:-8084}
      - EXTERNAL_GRAFANA_PORT=${WEB_PROXY_PORT_8081:-8081}
      - EXTERNAL_PROMETHEUS_PORT=${WEB_PROXY_PORT_8082:-8082}
      - EXTERNAL_KIBANA_PORT=${WEB_PROXY_PORT_8083:-8083}
    volumes:
      - ../private/argus/etc:/private/argus/etc
    networks: [argus-sys-net]
    restart: unless-stopped

  # Nginx reverse proxy fronting all services; one host port per upstream
  # (8080 frontend, 8081 grafana, 8082 prometheus, 8083 kibana,
  #  8084 alertmanager, 8085 master).
  web-proxy:
    image: ${WEB_PROXY_IMAGE_TAG:-argus-web-proxy:${PKG_VERSION}}
    container_name: argus-web-proxy
    depends_on: [bind, master, grafana, prometheus, kibana, alertmanager]
    environment:
      - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
      - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
    volumes:
      - ../private/argus/etc:/private/argus/etc
    networks: [argus-sys-net]
    ports:
      - "${WEB_PROXY_PORT_8080:-8080}:8080"
      - "${WEB_PROXY_PORT_8081:-8081}:8081"
      - "${WEB_PROXY_PORT_8082:-8082}:8082"
      - "${WEB_PROXY_PORT_8083:-8083}:8083"
      - "${WEB_PROXY_PORT_8084:-8084}:8084"
      - "${WEB_PROXY_PORT_8085:-8085}:8085"
    restart: unless-stopped
|
||||||
|
|
||||||
102
deployment_new/templates/server/docs/INSTALL_SERVER_zh.md
Normal file
102
deployment_new/templates/server/docs/INSTALL_SERVER_zh.md
Normal file
@ -0,0 +1,102 @@
|
|||||||
|
# Argus Server 安装指南(deployment_new)
|
||||||
|
|
||||||
|
适用:通过 Server 安装包在 Docker Swarm + external overlay 网络一体化部署 Argus 服务端组件。
|
||||||
|
|
||||||
|
—— 本文强调“怎么做、看什么、符合了才继续”。
|
||||||
|
|
||||||
|
## 一、准备条件(开始前确认)
|
||||||
|
- Docker 与 Docker Compose v2 已安装;`docker info` 正常;`docker compose version` 可执行。
|
||||||
|
- 具备 root/sudo 权限;磁盘可用空间 ≥ 10GB(包根与 `/var/lib/docker`)。
|
||||||
|
- 你知道本机管理地址(SWARM_MANAGER_ADDR),该 IP 属于本机某网卡,可被其他节点访问。
|
||||||
|
- 很重要:以统一账户 `argus`(UID=2133,GID=2015)执行后续安装与运维,并将其加入 `docker` 组;示例命令如下(如需不同 UID/GID,请替换为贵方标准):
|
||||||
|
```bash
|
||||||
|
# 1) 创建主组(GID=2015,组名 argus;若已存在可跳过)
|
||||||
|
sudo groupadd --gid 2015 argus || true
|
||||||
|
|
||||||
|
# 2) 创建用户 argus(UID=2133、主组 GID=2015,创建家目录并用 bash 作为默认 shell;若已存在可用 usermod 调整)
|
||||||
|
sudo useradd --uid 2133 --gid 2015 --create-home --shell /bin/bash argus || true
|
||||||
|
sudo passwd argus
|
||||||
|
|
||||||
|
# 3) 将 argus 加入 docker 组,使其能调用 Docker Daemon(新登录后生效)
|
||||||
|
sudo usermod -aG docker argus
|
||||||
|
|
||||||
|
# 4) 验证(重新登录或执行 newgrp docker 使组生效)
|
||||||
|
su - argus -c 'id; docker ps >/dev/null && echo OK || echo NO_DOCKER_PERMISSION'
|
||||||
|
```
|
||||||
|
后续的解压与执行(config/install/selfcheck 等)均使用该 `argus` 账户进行。
|
||||||
|
|
||||||
|
## 二、解包与目录结构
|
||||||
|
- 解压:`tar -xzf server_YYYYMMDD.tar.gz`。
|
||||||
|
- 进入:`cd server_YYYYMMDD/`
|
||||||
|
- 你应当能看到:
|
||||||
|
- `images/`(逐服务镜像 tar.gz,如 `argus-master-YYYYMMDD.tar.gz`)
|
||||||
|
- `compose/`(`docker-compose.yml` 与 `.env.example`)
|
||||||
|
- `scripts/`(安装/运维脚本)
|
||||||
|
- `private/argus/`(数据与配置骨架)
|
||||||
|
- `docs/`(中文文档)
|
||||||
|
|
||||||
|
## 三、配置 config(生成 .env 与 SWARM_MANAGER_ADDR)
|
||||||
|
命令:
|
||||||
|
```
|
||||||
|
export SWARM_MANAGER_ADDR=<本机管理IP>
|
||||||
|
./scripts/config.sh
|
||||||
|
```
|
||||||
|
脚本做了什么:
|
||||||
|
- 检查依赖与磁盘空间;
|
||||||
|
- 自动从“端口 20000 起”分配所有服务端口,确保“系统未占用”且“彼此不冲突”;
|
||||||
|
- 写入 `compose/.env`(包含端口、镜像 tag、FTP 账号、overlay 名称等);
|
||||||
|
- 将当前执行账户的 UID/GID 写入 `ARGUS_BUILD_UID/GID`(若主组名是 docker,会改用“与用户名同名的组”的 GID,避免拿到 docker 组 999);
|
||||||
|
- 更新/追加 `cluster-info.env` 中的 `SWARM_MANAGER_ADDR`(不会覆盖其他键)。
|
||||||
|
|
||||||
|
看到什么才算成功:
|
||||||
|
- 终端输出:`已生成 compose/.env 并更新 cluster-info.env 的 SWARM_MANAGER_ADDR。`
|
||||||
|
- `compose/.env` 打开应当看到:
|
||||||
|
- 端口均 ≥20000 且没有重复;
|
||||||
|
- `ARGUS_BUILD_UID/GID` 与 `id -u/-g` 一致;
|
||||||
|
- `SWARM_MANAGER_ADDR=<你的IP>`。
|
||||||
|
|
||||||
|
遇到问题:
|
||||||
|
- 端口被异常占用:可删去 `.env` 后再次执行 `config.sh`,或手工编辑端口再执行 `install.sh`。
|
||||||
|
|
||||||
|
## 四、安装 install(一次到位)
|
||||||
|
命令:
|
||||||
|
```
|
||||||
|
./scripts/install.sh
|
||||||
|
```
|
||||||
|
脚本做了什么:
|
||||||
|
- 若 Swarm 未激活:执行 `docker swarm init --advertise-addr $SWARM_MANAGER_ADDR`;
|
||||||
|
- 确保 external overlay `argus-sys-net` 存在;
|
||||||
|
- 导入 `images/*.tar.gz` 到本机 Docker;
|
||||||
|
- `docker compose up -d` 启动服务;
|
||||||
|
- 等待“六项就绪”:
|
||||||
|
- Master `/readyz`=200、ES `/_cluster/health`=200、Prometheus TCP 可达、Grafana `/api/health`=200、Alertmanager `/api/v2/status`=200、Kibana `/api/status` level=available;
|
||||||
|
- 将各服务 overlay IP 写入 `private/argus/etc/<域名>`,Reload Bind9 与 Nginx;
|
||||||
|
- 写出 `cluster-info.env`(含 `BINDIP/FTPIP/SWARM_JOIN_TOKEN_{WORKER,MANAGER}/SWARM_MANAGER_ADDR`);
|
||||||
|
- 生成 `安装报告_YYYYMMDD-HHMMSS.md`(端口、健康检查摘要与提示)。
|
||||||
|
|
||||||
|
看到什么才算成功:
|
||||||
|
- `docker compose ps` 全部是 Up;
|
||||||
|
- `安装报告_…md` 中各项 HTTP 检查为 200/available;
|
||||||
|
- `cluster-info.env` 包含五个关键键:
|
||||||
|
- `SWARM_MANAGER_ADDR=...`
|
||||||
|
- `BINDIP=10.x.x.x` `FTPIP=10.x.x.x`
|
||||||
|
- `SWARM_JOIN_TOKEN_WORKER=SWMTKN-...`
|
||||||
|
- `SWARM_JOIN_TOKEN_MANAGER=SWMTKN-...`
|
||||||
|
|
||||||
|
## 五、健康自检与常用操作
|
||||||
|
- 健康自检:`./scripts/selfcheck.sh`
|
||||||
|
- 期望输出:`selfcheck OK -> logs/selfcheck.json`
|
||||||
|
- 文件 `logs/selfcheck.json` 中 `overlay_net/es/kibana/master_readyz/ftp_share_writable/prometheus/grafana/alertmanager/web_proxy_cors` 为 true。
|
||||||
|
- 状态:`./scripts/status.sh`(相当于 `docker compose ps`)。
|
||||||
|
- 诊断:`./scripts/diagnose.sh`(收集容器/HTTP/CORS/ES 细节,输出到 `logs/diagnose_*.log`)。
|
||||||
|
- 卸载:`./scripts/uninstall.sh`(Compose down)。
|
||||||
|
- ES 磁盘水位临时放宽/还原:`./scripts/es-watermark-relax.sh` / `./scripts/es-watermark-restore.sh`。
|
||||||
|
|
||||||
|
## 六、下一步:分发 cluster-info.env 给 Client
|
||||||
|
- 将 `cluster-info.env` 拷贝给安装 Client 的同事;
|
||||||
|
- 对方在 Client 机器的包根放置该文件(或设置 `CLUSTER_INFO=/绝对路径`)即可。
|
||||||
|
|
||||||
|
## 七、故障排查快览
|
||||||
|
- Proxy 502 或 8080 连接复位:多因 Bind 域名未更新到 overlay IP;重跑 `install.sh`(会写入私有域名文件并 reload)或查看 `logs/diagnose_error.log`。
|
||||||
|
- Kibana 不 available:等待 1–2 分钟、查看 `argus-kibana-sys` 日志;
|
||||||
|
- cluster-info.env 的 SWARM_MANAGER_ADDR 为空:重新 `export SWARM_MANAGER_ADDR=<IP>; ./scripts/config.sh` 或 `./scripts/install.sh`(会回读 `.env` 补写)。
|
||||||
7
deployment_new/templates/server/docs/SWARM_DEPLOY_zh.md
Normal file
7
deployment_new/templates/server/docs/SWARM_DEPLOY_zh.md
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
# Docker Swarm 部署要点
|
||||||
|
|
||||||
|
- 初始化 Swarm:`docker swarm init --advertise-addr <SWARM_MANAGER_ADDR>`
|
||||||
|
- 创建 overlay:`docker network create --driver overlay --attachable argus-sys-net`
|
||||||
|
- Server 包 `install.sh` 自动完成上述操作;如需手动执行,确保 `argus-sys-net` 存在且 attachable。
|
||||||
|
- Worker 节点加入:`docker swarm join --token <worker_token> <SWARM_MANAGER_ADDR>:2377`。
|
||||||
|
|
||||||
11
deployment_new/templates/server/docs/TROUBLESHOOTING_zh.md
Normal file
11
deployment_new/templates/server/docs/TROUBLESHOOTING_zh.md
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
# 故障排查(Server)
|
||||||
|
|
||||||
|
- 端口占用:查看 `安装报告_*.md` 中端口表;如需修改,编辑 `compose/.env` 后执行 `docker compose ... up -d`。
|
||||||
|
- 组件未就绪:
|
||||||
|
- Master: `curl http://127.0.0.1:${MASTER_PORT}/readyz -I`
|
||||||
|
- ES: `curl http://127.0.0.1:${ES_HTTP_PORT}/_cluster/health`
|
||||||
|
- Grafana: `curl http://127.0.0.1:${GRAFANA_PORT}/api/health`
|
||||||
|
- Prometheus TCP: `exec 3<>/dev/tcp/127.0.0.1/${PROMETHEUS_PORT}`(成功即端口可达;检查后执行 `exec 3>&-` 关闭文件描述符)
|
||||||
|
- 域名解析:进入 `argus-web-proxy` 或 `argus-master-sys` 容器:`getent hosts master.argus.com`。
|
||||||
|
- Swarm/Overlay:检查 `docker network ls | grep argus-sys-net`,或 `docker node ls`。
|
||||||
|
|
||||||
104
deployment_new/templates/server/scripts/config.sh
Normal file
104
deployment_new/templates/server/scripts/config.sh
Normal file
@ -0,0 +1,104 @@
|
|||||||
|
#!/usr/bin/env bash
# config.sh — generate compose/.env for the Argus server package.
# Steps: dependency/disk checks, SWARM_MANAGER_ADDR validation, host-port
# allocation starting at 20000 (skipping ports in use or already chosen),
# build UID/GID recording, and persisting SWARM_MANAGER_ADDR into both
# compose/.env and cluster-info.env for later distribution to clients.
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
PKG_ROOT="$ROOT_DIR"
ENV_EX="$PKG_ROOT/compose/.env.example"
ENV_OUT="$PKG_ROOT/compose/.env"

info(){ echo -e "\033[34m[CONFIG]\033[0m $*"; }
err(){ echo -e "\033[31m[ERROR]\033[0m $*" >&2; }
require(){ local ok=1; for c in "$@"; do command -v "$c" >/dev/null 2>&1 || { err "缺少依赖: $c"; ok=0; }; done; [[ $ok -eq 1 ]]; }

# `ip` and `ss` are used below (address validation, port probing); if `ss`
# were missing, every port would silently look free, so fail fast here too.
require docker curl jq awk sed tar gzip ip ss

# Without the template there is nothing to configure; fail early, not at cp.
[[ -f "$ENV_EX" ]] || { err "缺少模板文件: $ENV_EX"; exit 1; }

# Disk space check (MB): require at least 10 GiB free on the given path.
check_disk(){ local p="$1"; local need=10240; local free
  free=$(df -Pm "$p" | awk 'NR==2{print $4+0}')
  if [[ -z "$free" || "$free" -lt "$need" ]]; then err "磁盘空间不足: $p 剩余 ${free:-0}MB (<${need}MB)"; return 1; fi
}

# /var/lib/docker check is advisory only (it may not exist / be a mount yet).
check_disk "$PKG_ROOT"; check_disk "/var/lib/docker" || true

# Take SWARM_MANAGER_ADDR from the environment, or prompt interactively.
SWARM_MANAGER_ADDR=${SWARM_MANAGER_ADDR:-}
if [[ -z "${SWARM_MANAGER_ADDR}" ]]; then
  read -rp "请输入本机管理地址 SWARM_MANAGER_ADDR: " SWARM_MANAGER_ADDR
fi
info "SWARM_MANAGER_ADDR=$SWARM_MANAGER_ADDR"

# The address must belong to a local interface (install.sh passes it to
# `docker swarm init --advertise-addr`).
if ! ip -o addr | awk '{print $4}' | cut -d'/' -f1 | grep -qx "$SWARM_MANAGER_ADDR"; then
  err "SWARM_MANAGER_ADDR 非本机地址: $SWARM_MANAGER_ADDR"; exit 1; fi

info "开始分配服务端口(起始=20000,避免系统占用与相互冲突)"
# A port counts as used when any listening TCP/UDP socket matches it exactly.
is_port_used(){ local p="$1"; ss -tulnH 2>/dev/null | awk '{print $5}' | sed 's/.*://g' | grep -qx "$p"; }
declare -A PRESENT=() CHOSEN=() USED=()
START_PORT="${START_PORT:-20000}"; cur=$START_PORT
ORDER=(MASTER_PORT ES_HTTP_PORT KIBANA_PORT PROMETHEUS_PORT GRAFANA_PORT ALERTMANAGER_PORT \
  WEB_PROXY_PORT_8080 WEB_PROXY_PORT_8081 WEB_PROXY_PORT_8082 WEB_PROXY_PORT_8083 WEB_PROXY_PORT_8084 WEB_PROXY_PORT_8085 \
  FTP_PORT FTP_DATA_PORT)

# Only allocate ports for keys that actually appear in .env.example.
for key in "${ORDER[@]}"; do
  if grep -q "^${key}=" "$ENV_EX"; then PRESENT[$key]=1; fi
done

# Smallest port >= $1 that is neither already chosen nor in use on the host.
next_free(){ local p="$1"; while :; do if [[ -n "${USED[$p]:-}" ]] || is_port_used "$p"; then p=$((p+1)); else echo "$p"; return; fi; done; }

for key in "${ORDER[@]}"; do
  [[ -z "${PRESENT[$key]:-}" ]] && continue
  p=$(next_free "$cur"); CHOSEN[$key]="$p"; USED[$p]=1; cur=$((p+1))
done

info "端口分配结果:MASTER=${CHOSEN[MASTER_PORT]:-} ES=${CHOSEN[ES_HTTP_PORT]:-} KIBANA=${CHOSEN[KIBANA_PORT]:-} PROM=${CHOSEN[PROMETHEUS_PORT]:-} GRAFANA=${CHOSEN[GRAFANA_PORT]:-} ALERT=${CHOSEN[ALERTMANAGER_PORT]:-} WEB_PROXY(8080..8085)=${CHOSEN[WEB_PROXY_PORT_8080]:-}/${CHOSEN[WEB_PROXY_PORT_8081]:-}/${CHOSEN[WEB_PROXY_PORT_8082]:-}/${CHOSEN[WEB_PROXY_PORT_8083]:-}/${CHOSEN[WEB_PROXY_PORT_8084]:-}/${CHOSEN[WEB_PROXY_PORT_8085]:-}"

cp "$ENV_EX" "$ENV_OUT"
# Write the unique port choices back into .env.
for key in "${ORDER[@]}"; do
  val="${CHOSEN[$key]:-}"
  [[ -z "$val" ]] && continue
  sed -i -E "s#^$key=.*#$key=${val}#" "$ENV_OUT"
done
info "已写入 compose/.env 的端口配置"
# Ensure overlay network name and FTP defaults exist.
grep -q '^ARGUS_OVERLAY_NET=' "$ENV_OUT" || echo 'ARGUS_OVERLAY_NET=argus-sys-net' >> "$ENV_OUT"
grep -q '^FTP_USER=' "$ENV_OUT" || echo 'FTP_USER=ftpuser' >> "$ENV_OUT"
grep -q '^FTP_PASSWORD=' "$ENV_OUT" || echo 'FTP_PASSWORD=NASPlab1234!' >> "$ENV_OUT"
# Persist SWARM_MANAGER_ADDR into .env as well, so install.sh does not have
# to fall back to cluster-info.env (docs promise it appears in .env).
if grep -q '^SWARM_MANAGER_ADDR=' "$ENV_OUT"; then
  sed -i -E "s#^SWARM_MANAGER_ADDR=.*#SWARM_MANAGER_ADDR=${SWARM_MANAGER_ADDR}#" "$ENV_OUT"
else
  echo "SWARM_MANAGER_ADDR=${SWARM_MANAGER_ADDR}" >> "$ENV_OUT"
fi
# Record the invoking account's UID/GID, avoiding the docker group GID:
# when the primary group resolves to "docker", prefer the GID of the group
# named after the user; otherwise fall back to the primary GID.
RUID=$(id -u)
PRIMARY_GID=$(id -g)
PRIMARY_GRP=$(id -gn)
USER_NAME=$(id -un)
if [[ "$PRIMARY_GRP" == "docker" ]]; then
  RGID=$(getent group "$USER_NAME" | awk -F: '{print $3}' 2>/dev/null || true)
  [[ -z "$RGID" ]] && RGID="$PRIMARY_GID"
else
  RGID="$PRIMARY_GID"
fi
info "使用构建账户 UID:GID=${RUID}:${RGID} (user=$USER_NAME primary_group=$PRIMARY_GRP)"
if grep -q '^ARGUS_BUILD_UID=' "$ENV_OUT"; then
  sed -i -E "s#^ARGUS_BUILD_UID=.*#ARGUS_BUILD_UID=${RUID}#" "$ENV_OUT"
else
  echo "ARGUS_BUILD_UID=${RUID}" >> "$ENV_OUT"
fi
if grep -q '^ARGUS_BUILD_GID=' "$ENV_OUT"; then
  sed -i -E "s#^ARGUS_BUILD_GID=.*#ARGUS_BUILD_GID=${RGID}#" "$ENV_OUT"
else
  echo "ARGUS_BUILD_GID=${RGID}" >> "$ENV_OUT"
fi

# Update (or create) cluster-info.env without clobbering its other keys.
CI="$PKG_ROOT/cluster-info.env"
if [[ -f "$CI" ]]; then
  if grep -q '^SWARM_MANAGER_ADDR=' "$CI"; then
    sed -i -E "s#^SWARM_MANAGER_ADDR=.*#SWARM_MANAGER_ADDR=${SWARM_MANAGER_ADDR}#" "$CI"
  else
    echo "SWARM_MANAGER_ADDR=${SWARM_MANAGER_ADDR}" >> "$CI"
  fi
else
  echo "SWARM_MANAGER_ADDR=${SWARM_MANAGER_ADDR}" > "$CI"
fi
info "已生成 compose/.env 并更新 cluster-info.env 的 SWARM_MANAGER_ADDR。"
info "下一步可执行: scripts/install.sh"
|
||||||
109
deployment_new/templates/server/scripts/diagnose.sh
Normal file
109
deployment_new/templates/server/scripts/diagnose.sh
Normal file
@ -0,0 +1,109 @@
|
|||||||
|
#!/usr/bin/env bash
# diagnose.sh — collect container/HTTP/CORS/DNS/ES diagnostics for the Argus
# server stack. Detailed output goes to logs/diagnose_details_<ts>.log and
# error-looking lines are summarized (sorted, de-duplicated) in
# logs/diagnose_error_<ts>.log. Always exits 0 so it is safe from cron/CI.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

# Load port variables (MASTER_PORT, ES_HTTP_PORT, ...) from compose/.env.
ENV_FILE="$ROOT/compose/.env"
if [[ -f "$ENV_FILE" ]]; then set -a; source "$ENV_FILE"; set +a; fi

ts="$(date -u +%Y%m%d-%H%M%SZ)"
# Prefer the package-local logs dir; fall back to /tmp when not writable.
LOG_DIR="$ROOT/logs"; mkdir -p "$LOG_DIR" || true
if ! ( : > "$LOG_DIR/.w" 2>/dev/null ); then LOG_DIR="/tmp/argus-logs"; mkdir -p "$LOG_DIR" || true; fi
DETAILS="$LOG_DIR/diagnose_details_${ts}.log"; ERRORS="$LOG_DIR/diagnose_error_${ts}.log"
: > "$DETAILS"; : > "$ERRORS"

logd() { echo "$(date '+%F %T') $*" >> "$DETAILS"; }
append_err() { echo "$*" >> "$ERRORS"; }
# HTTP status of a URL ("000" on any curl failure); --max-time keeps a hung
# service from stalling the whole diagnosis.
http_code() { curl -s --max-time 5 -o /dev/null -w "%{http_code}" "$1" || echo 000; }
http_body_head() { curl -s --max-time 3 "$1" 2>/dev/null | sed -n '1,5p' || true; }
# Extract the Access-Control-Allow-Origin response header.
# NOTE(review): IGNORECASE is a gawk extension; on mawk the match is
# case-sensitive — confirm gawk is available on target hosts.
header_val() { curl -s -D - -o /dev/null "$@" | awk -F': ' 'BEGIN{IGNORECASE=1}$1=="Access-Control-Allow-Origin"{gsub("\r","",$2);print $2}'; }

# Error keywords for log harvesting. Fix: use \b (word boundary), not \\b —
# inside single quotes \\b reached grep as a literal backslash + "b" and the
# pattern never matched anything.
ERR_RE='\b(error|failed|fail|exception|panic|fatal|critical|unhealthy|permission denied|forbidden|refused|traceback|错误|失败)\b'

section() { local name="$1"; logd "===== [$name] ====="; }

# svc <label> <container> — dump status and last logs for one container into
# DETAILS, and harvest error-looking lines (container + supervisor logs when
# supervisorctl exists inside the container) into ERRORS.
svc() {
  local svc_name="$1"; local cname="$2"; shift 2
  section "$svc_name ($cname)"
  logd "docker ps:"; docker ps -a --format '{{.Names}}\t{{.Status}}\t{{.Image}}' | awk -v n="$cname" '$1==n' >> "$DETAILS" || true
  logd "docker inspect:"; docker inspect -f '{{.State.Status}} rc={{.RestartCount}} started={{.State.StartedAt}}' "$cname" >> "$DETAILS" 2>&1 || true
  logd "last 200 container logs:"; docker logs --tail 200 "$cname" >> "$DETAILS" 2>&1 || true
  docker logs --tail 200 "$cname" 2>&1 | grep -Ei "$ERR_RE" | sed "s/^/[${svc_name}][container] /" >> "$ERRORS" || true
  if docker exec "$cname" sh -lc 'command -v supervisorctl >/dev/null 2>&1' >/dev/null 2>&1; then
    logd "supervisorctl status:"; docker exec "$cname" sh -lc 'supervisorctl status' >> "$DETAILS" 2>&1 || true
    local files; files=$(docker exec "$cname" sh -lc 'ls /var/log/supervisor/*.log 2>/dev/null' || true)
    for f in $files; do
      logd "tail -n 80 $f:"; docker exec "$cname" sh -lc "tail -n 80 $f 2>/dev/null || true" >> "$DETAILS" 2>&1 || true
      docker exec "$cname" sh -lc "tail -n 200 $f 2>/dev/null" 2>/dev/null | grep -Ei "$ERR_RE" | sed "s/^/[${svc_name}][supervisor:$(basename $f)] /" >> "$ERRORS" || true
    done
  fi
}

svc bind argus-bind-sys
svc master argus-master-sys
svc es argus-es-sys
svc kibana argus-kibana-sys
svc ftp argus-ftp
svc prometheus argus-prometheus
svc grafana argus-grafana
svc alertmanager argus-alertmanager
svc web-frontend argus-web-frontend
svc web-proxy argus-web-proxy

section HTTP
# Fix: URLs are quoted normally. The previous \"...\" escapes inside $( )
# passed literal quote characters to curl, so every probe reported 000.
logd "ES: $(http_code "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health")"; http_body_head "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health" >> "$DETAILS" 2>&1 || true
logd "Kibana: $(http_code "http://localhost:${KIBANA_PORT:-5601}/api/status")"; http_body_head "http://localhost:${KIBANA_PORT:-5601}/api/status" >> "$DETAILS" 2>&1 || true
logd "Master readyz: $(http_code "http://localhost:${MASTER_PORT:-32300}/readyz")"
logd "Prometheus: $(http_code "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready")"
logd "Grafana: $(http_code "http://localhost:${GRAFANA_PORT:-3000}/api/health")"; http_body_head "http://localhost:${GRAFANA_PORT:-3000}/api/health" >> "$DETAILS" 2>&1 || true
logd "Alertmanager: $(http_code "http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status")"
cors8084=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8084:-8084}/api/v2/status" || true)
cors8085=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8085:-8085}/api/v1/master/nodes" || true)
logd "Web-Proxy 8080: $(http_code "http://localhost:${WEB_PROXY_PORT_8080:-8080}/")"
logd "Web-Proxy 8083: $(http_code "http://localhost:${WEB_PROXY_PORT_8083:-8083}/")"
logd "Web-Proxy 8084 CORS: ${cors8084}"
logd "Web-Proxy 8085 CORS: ${cors8085}"

section ES-CHECKS
ch=$(curl -s --max-time 3 "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health" || true)
# Pull the bare "status" value out of the health JSON (jq-free on purpose).
status=$(printf '%s' "$ch" | awk -F'"' '/"status"/{print $4; exit}')
if [[ -n "$status" ]]; then logd "cluster.status=$status"; fi
if [[ "$status" != "green" ]]; then append_err "[es][cluster] status=$status"; fi
if docker ps --format '{{.Names}}' | grep -q '^argus-es-sys$'; then
  duse=$(docker exec argus-es-sys sh -lc 'df -P /usr/share/elasticsearch/data | awk "NR==2{print \$5}"' 2>/dev/null || true)
  logd "es.data.df_use=$duse"; usep=${duse%%%}   # strip trailing '%'
  if [[ -n "$usep" ]] && (( usep >= 90 )); then append_err "[es][disk] data path usage=${usep}%"; fi
fi

section DNS-IN-PROXY
# Verify the bind-managed service domains resolve from inside the proxy.
for d in master.argus.com es.log.argus.com kibana.log.argus.com grafana.metric.argus.com prom.metric.argus.com alertmanager.alert.argus.com; do
  docker exec argus-web-proxy sh -lc "getent hosts $d || nslookup $d 2>/dev/null | tail -n+1" >> "$DETAILS" 2>&1 || true
done
# Fix: inner shell commands are double-quoted normally (the previous \"
# escapes broke word splitting for `sh -lc`).
logd "HTTP (web-proxy): master.readyz=$(docker exec argus-web-proxy sh -lc "curl -s -o /dev/null -w '%{http_code}' http://master.argus.com:3000/readyz" 2>/dev/null || echo 000)"
logd "HTTP (web-proxy): es.health=$(docker exec argus-web-proxy sh -lc "curl -s -o /dev/null -w '%{http_code}' http://es.log.argus.com:9200/_cluster/health" 2>/dev/null || echo 000)"
logd "HTTP (web-proxy): kibana.status=$(docker exec argus-web-proxy sh -lc "curl -s -o /dev/null -w '%{http_code}' http://kibana.log.argus.com:5601/api/status" 2>/dev/null || echo 000)"

section FTP-SHARE
docker exec argus-ftp sh -lc 'ls -ld /private/argus/ftp /private/argus/ftp/share; test -w /private/argus/ftp/share && echo "write:OK" || echo "write:FAIL"' >> "$DETAILS" 2>&1 || true

section SYSTEM
logd "uname -a:"; uname -a >> "$DETAILS"
logd "docker version:"; docker version --format '{{.Server.Version}}' >> "$DETAILS" 2>&1 || true
logd "compose ps:"; (cd "$ROOT/compose" && docker compose ps) >> "$DETAILS" 2>&1 || true

section SUMMARY
# `[[ … ]] && append` lists are safe under set -e (non-final commands in a
# && list are exempt from errexit).
[[ $(http_code "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health") != 200 ]] && echo "[es][http] health not 200" >> "$ERRORS"
kbcode=$(http_code "http://localhost:${KIBANA_PORT:-5601}/api/status"); [[ "$kbcode" != 200 ]] && echo "[kibana][http] /api/status=$kbcode" >> "$ERRORS"
[[ $(http_code "http://localhost:${MASTER_PORT:-32300}/readyz") != 200 ]] && echo "[master][http] /readyz not 200" >> "$ERRORS"
[[ $(http_code "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready") != 200 ]] && echo "[prometheus][http] /-/ready not 200" >> "$ERRORS"
gfcode=$(http_code "http://localhost:${GRAFANA_PORT:-3000}/api/health"); [[ "$gfcode" != 200 ]] && echo "[grafana][http] /api/health=$gfcode" >> "$ERRORS"
[[ $(http_code "http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status") != 200 ]] && echo "[alertmanager][http] /api/v2/status not 200" >> "$ERRORS"
[[ -z "$cors8084" ]] && echo "[web-proxy][cors] 8084 missing Access-Control-Allow-Origin" >> "$ERRORS"
[[ -z "$cors8085" ]] && echo "[web-proxy][cors] 8085 missing Access-Control-Allow-Origin" >> "$ERRORS"
sort -u -o "$ERRORS" "$ERRORS"

echo "Diagnostic details -> $DETAILS"
echo "Detected errors -> $ERRORS"

# Maintain stable "latest run" names when writing to the package logs dir.
if [[ "$LOG_DIR" == "$ROOT/logs" ]]; then
  ln -sfn "$(basename "$DETAILS")" "$ROOT/logs/diagnose_details.log" 2>/dev/null || cp "$DETAILS" "$ROOT/logs/diagnose_details.log" 2>/dev/null || true
  ln -sfn "$(basename "$ERRORS")" "$ROOT/logs/diagnose_error.log" 2>/dev/null || cp "$ERRORS" "$ROOT/logs/diagnose_error.log" 2>/dev/null || true
fi

exit 0
|
||||||
@ -0,0 +1,11 @@
|
|||||||
|
#!/usr/bin/env bash
# Temporarily relax Elasticsearch disk watermarks to 95%/96%/97% so shards can
# allocate on nearly-full disks. Settings are transient (reset on cluster
# restart); use es-watermark-restore.sh to revert explicitly.
# Usage: es-watermark-relax.sh [ES_URL]   (default http://127.0.0.1:9200)
set -euo pipefail
HOST="${1:-http://127.0.0.1:9200}"
echo "设置 ES watermark 为 95%/96%/97%: $HOST"
curl -fsS -XPUT "$HOST/_cluster/settings" -H 'Content-Type: application/json' -d '{
  "transient": {
    "cluster.routing.allocation.disk.watermark.low": "95%",
    "cluster.routing.allocation.disk.watermark.high": "96%",
    "cluster.routing.allocation.disk.watermark.flood_stage": "97%"
  }
}' && printf '\nOK\n'
# Fix: bash `echo "\nOK"` printed the literal characters \nOK; printf emits
# a real newline before the confirmation.
|
||||||
@ -0,0 +1,11 @@
|
|||||||
|
#!/usr/bin/env bash
# Restore Elasticsearch disk watermarks to their defaults by nulling out the
# transient overrides set by es-watermark-relax.sh.
# Usage: es-watermark-restore.sh [ES_URL]   (default http://127.0.0.1:9200)
set -euo pipefail
HOST="${1:-http://127.0.0.1:9200}"
echo "恢复 ES watermark 为默认值: $HOST"
curl -fsS -XPUT "$HOST/_cluster/settings" -H 'Content-Type: application/json' -d '{
  "transient": {
    "cluster.routing.allocation.disk.watermark.low": null,
    "cluster.routing.allocation.disk.watermark.high": null,
    "cluster.routing.allocation.disk.watermark.flood_stage": null
  }
}' && printf '\nOK\n'
# Fix: bash `echo "\nOK"` printed the literal characters \nOK; printf emits
# a real newline before the confirmation.
|
||||||
167
deployment_new/templates/server/scripts/install.sh
Normal file
167
deployment_new/templates/server/scripts/install.sh
Normal file
@ -0,0 +1,167 @@
|
|||||||
|
#!/usr/bin/env bash
# Argus Server 安装脚本:
#   1. 初始化 Docker Swarm 与 attachable overlay 网络
#   2. 导入 images/ 下的逐服务镜像 tar.gz
#   3. docker compose up -d 启动服务栈
#   4. 轮询等待基础服务就绪(best-effort)
#   5. 输出 cluster-info.env(供 Client-GPU 安装方使用)与安装报告
#   6. 将各服务域名写入 private/argus/etc 并热更新 Bind/Nginx
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
PKG_ROOT="$ROOT_DIR"
ENV_FILE="$PKG_ROOT/compose/.env"
COMPOSE_FILE="$PKG_ROOT/compose/docker-compose.yml"

info(){ echo -e "\033[34m[INSTALL]\033[0m $*"; }
err(){ echo -e "\033[31m[ERROR]\033[0m $*" >&2; }
# 检查依赖命令是否齐全;缺失时逐个报告并整体失败
require(){ local ok=1; for c in "$@"; do command -v "$c" >/dev/null 2>&1 || { err "缺少依赖: $c"; ok=0; }; done; [[ $ok -eq 1 ]]; }
require docker curl jq awk sed tar gzip

[[ -f "$ENV_FILE" ]] || { err "缺少 compose/.env,请先运行 scripts/config.sh"; exit 1; }
info "使用环境文件: $ENV_FILE"
set -a; source "$ENV_FILE"; set +a

# 兼容:若 .env 未包含 SWARM_MANAGER_ADDR,则从已存在的 cluster-info.env 读取以避免写空
SMADDR="${SWARM_MANAGER_ADDR:-}"
CI_FILE="$PKG_ROOT/cluster-info.env"
if [[ -z "$SMADDR" && -f "$CI_FILE" ]]; then
  SMADDR=$(sed -n 's/^SWARM_MANAGER_ADDR=\(.*\)$/\1/p' "$CI_FILE" | head -n1)
fi
SWARM_MANAGER_ADDR="$SMADDR"

# ---- Swarm init & overlay 网络 ----
if ! docker info 2>/dev/null | grep -q "Swarm: active"; then
  [[ -n "${SWARM_MANAGER_ADDR:-}" ]] || { err "SWARM_MANAGER_ADDR 未设置,请在 scripts/config.sh 中配置"; exit 1; }
  info "初始化 Swarm (--advertise-addr $SWARM_MANAGER_ADDR)"
  docker swarm init --advertise-addr "$SWARM_MANAGER_ADDR" >/dev/null 2>&1 || true
else
  info "Swarm 已激活"
fi
NET_NAME="${ARGUS_OVERLAY_NET:-argus-sys-net}"
if ! docker network inspect "$NET_NAME" >/dev/null 2>&1; then
  info "创建 overlay 网络: $NET_NAME"
  docker network create -d overlay --attachable "$NET_NAME" >/dev/null
else
  info "overlay 网络已存在: $NET_NAME"
fi

# 查询容器在 overlay 网络上的 IP;容器不存在或未加入网络时输出空串。
# 使用双引号格式串注入 NET_NAME,替代原先脆弱的单引号拼接('{{ ... "'$VAR'" ... }}')。
overlay_ip(){
  docker inspect -f "{{ (index .NetworkSettings.Networks \"$NET_NAME\").IPAddress }}" "$1" 2>/dev/null || true
}

# ---- 导入镜像 ----
IMAGES_DIR="$PKG_ROOT/images"
shopt -s nullglob
tars=("$IMAGES_DIR"/*.tar.gz)
if [[ ${#tars[@]} -eq 0 ]]; then err "images 目录为空,缺少镜像 tar.gz"; exit 1; fi
total=${#tars[@]}; idx=0
for tgz in "${tars[@]}"; do
  idx=$((idx+1))
  info "导入镜像 ($idx/$total): $(basename "$tgz")"
  tmp=$(mktemp); gunzip -c "$tgz" > "$tmp"; docker load -i "$tmp" >/dev/null; rm -f "$tmp"
done
shopt -u nullglob

# ---- Compose up ----
info "启动服务栈 (docker compose up -d)"
docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" up -d
docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" ps

# ---- 等待基础服务就绪(best-effort)----
code(){ curl -4 -s -o /dev/null -w "%{http_code}" "$1" || echo 000; }
# Prometheus 仅探测 TCP 端口连通(/dev/tcp)
prom_ok(){ (exec 3<>/dev/tcp/127.0.0.1/"${PROMETHEUS_PORT:-9090}") >/dev/null 2>&1 && return 0 || return 1; }
# Kibana 需要 body 中 level=available 才视为就绪
kb_ok(){ local body; body=$(curl -s "http://127.0.0.1:${KIBANA_PORT:-5601}/api/status" || true); echo "$body" | grep -q '"level"\s*:\s*"available"'; }
RETRIES=${RETRIES:-60}; SLEEP=${SLEEP:-5}; ok=0
info "等待基础服务就绪 (<= $((RETRIES*SLEEP))s)"
for i in $(seq 1 "$RETRIES"); do
  e1=$(code "http://127.0.0.1:${MASTER_PORT:-32300}/readyz")
  e2=$(code "http://127.0.0.1:${ES_HTTP_PORT:-9200}/_cluster/health")
  e3=000; prom_ok && e3=200
  e4=$(code "http://127.0.0.1:${GRAFANA_PORT:-3000}/api/health")
  e5=$(code "http://127.0.0.1:${ALERTMANAGER_PORT:-9093}/api/v2/status")
  e6=$(kb_ok && echo 200 || echo 000)
  info "[ready] t=$((i*SLEEP))s master=$e1 es=$e2 prom=$e3 graf=$e4 alert=$e5 kibana=$e6"
  [[ "$e1" == 200 ]] && ok=$((ok+1))
  [[ "$e2" == 200 ]] && ok=$((ok+1))
  [[ "$e3" == 200 ]] && ok=$((ok+1))
  [[ "$e4" == 200 ]] && ok=$((ok+1))
  [[ "$e5" == 200 ]] && ok=$((ok+1))
  [[ "$e6" == 200 ]] && ok=$((ok+1))
  # 六项全部就绪才退出;否则清零重试
  if [[ $ok -ge 6 ]]; then break; fi; ok=0; sleep "$SLEEP"
done
[[ $ok -ge 6 ]] || err "部分服务未就绪(可稍后重试 selfcheck)"

# ---- 解析 overlay IP ----
bind_c=argus-bind-sys; ftp_c=argus-ftp
BINDIP=$(overlay_ip "$bind_c")
FTPIP=$(overlay_ip "$ftp_c")
info "解析 overlay IP: BINDIP=${BINDIP:-<empty>} FTPIP=${FTPIP:-<empty>}"

# ---- Swarm join tokens ----
TOKEN_WORKER=$(docker swarm join-token -q worker 2>/dev/null || echo "")
TOKEN_MANAGER=$(docker swarm join-token -q manager 2>/dev/null || echo "")

# ---- cluster-info.env ----
CI="$PKG_ROOT/cluster-info.env"
info "写入 cluster-info.env (manager/token/IP)"
{
  echo "SWARM_MANAGER_ADDR=${SWARM_MANAGER_ADDR:-}"
  echo "BINDIP=${BINDIP:-}"
  echo "FTPIP=${FTPIP:-}"
  echo "SWARM_JOIN_TOKEN_WORKER=${TOKEN_WORKER:-}"
  echo "SWARM_JOIN_TOKEN_MANAGER=${TOKEN_MANAGER:-}"
} > "$CI"
info "已输出 $CI"

# ---- 安装报告 ----
# 注意:所有端口变量统一带默认值(:-),避免 .env 缺项时在 set -u 下中断报告生成。
ts=$(date +%Y%m%d-%H%M%S)
RPT="$PKG_ROOT/安装报告_${ts}.md"
{
  echo "# Argus Server 安装报告 (${ts})"
  echo
  echo "## 端口映射"
  echo "- MASTER_PORT=${MASTER_PORT:-32300}"
  echo "- ES_HTTP_PORT=${ES_HTTP_PORT:-9200}"
  echo "- KIBANA_PORT=${KIBANA_PORT:-5601}"
  echo "- PROMETHEUS_PORT=${PROMETHEUS_PORT:-9090}"
  echo "- GRAFANA_PORT=${GRAFANA_PORT:-3000}"
  echo "- ALERTMANAGER_PORT=${ALERTMANAGER_PORT:-9093}"
  echo "- WEB_PROXY_PORT_8080=${WEB_PROXY_PORT_8080:-8080} ... 8085=${WEB_PROXY_PORT_8085:-8085}"
  echo
  echo "## Swarm/Overlay"
  echo "- SWARM_MANAGER_ADDR=${SWARM_MANAGER_ADDR:-}"
  echo "- NET=${NET_NAME}"
  echo "- JOIN_TOKEN_WORKER=${TOKEN_WORKER:-}"
  echo "- JOIN_TOKEN_MANAGER=${TOKEN_MANAGER:-}"
  echo
  echo "## Overlay IPs"
  echo "- BINDIP=${BINDIP:-}"
  echo "- FTPIP=${FTPIP:-}"
  echo
  echo "## 健康检查(简要)"
  echo "- master/readyz=$(code "http://127.0.0.1:${MASTER_PORT:-32300}/readyz")"
  echo "- es/_cluster/health=$(code "http://127.0.0.1:${ES_HTTP_PORT:-9200}/_cluster/health")"
  echo "- grafana/api/health=$(code "http://127.0.0.1:${GRAFANA_PORT:-3000}/api/health")"
  echo "- prometheus/tcp=$([[ $(prom_ok; echo $?) == 0 ]] && echo 200 || echo 000)"
  echo "- alertmanager/api/v2/status=$(code "http://127.0.0.1:${ALERTMANAGER_PORT:-9093}/api/v2/status")"
  echo "- kibana/api/status=$([[ $(kb_ok; echo $?) == 0 ]] && echo available || echo not-ready)"
} > "$RPT"
info "已生成报告: $RPT"

info "安装完成。可将 cluster-info.env 分发给 Client-GPU 安装方。"

# ---- 写入域名→overlay IP 并热更新 Bind/Nginx ----
ETC_DIR="$PKG_ROOT/private/argus/etc"; mkdir -p "$ETC_DIR"
declare -A MAP
MAP[web-frontend]=web.argus.com
MAP[argus-grafana]=grafana.metric.argus.com
MAP[argus-prometheus]=prom.metric.argus.com
MAP[argus-kibana-sys]=kibana.log.argus.com
MAP[argus-alertmanager]=alertmanager.alert.argus.com
MAP[argus-master-sys]=master.argus.com
changed=0
for cname in "${!MAP[@]}"; do
  domain="${MAP[$cname]}"; fpath="$ETC_DIR/$domain"
  ip=$(overlay_ip "$cname")
  [[ -z "$ip" ]] && { echo "[DNS-FIX][WARN] $domain: container $cname no overlay IP yet"; continue; }
  cur=$(cat "$fpath" 2>/dev/null || echo "")
  if [[ "$cur" != "$ip" ]]; then
    echo "$ip" > "$fpath"; echo "[DNS-FIX][SET] $domain = $ip (was: ${cur:-<empty>})"; changed=1
  else
    echo "[DNS-FIX][OK] $domain already $ip"
  fi
done
# 仅当有域名映射变化时才重载 Bind;Nginx 总是尝试校验并热加载(失败不致命)
if [[ $changed -eq 1 ]]; then
  docker exec argus-bind-sys /usr/local/bin/reload-bind9.sh >/dev/null 2>&1 || docker exec argus-bind-sys rndc reload >/dev/null 2>&1 || true
  sleep 1
fi
docker exec argus-web-proxy nginx -t >/dev/null 2>&1 && docker exec argus-web-proxy nginx -s reload >/dev/null 2>&1 || true
89
deployment_new/templates/server/scripts/selfcheck.sh
Normal file
89
deployment_new/templates/server/scripts/selfcheck.sh
Normal file
@ -0,0 +1,89 @@
|
|||||||
|
#!/usr/bin/env bash
# Argus Server 自检脚本:检查 overlay 网络与各基础服务(ES/Kibana/Master/FTP/
# Prometheus/Grafana/Alertmanager/Web-Proxy CORS),并把逐项结果写入
# logs/selfcheck.json。退出码:0 全部通过,1 任意一项失败。
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

log() { echo -e "\033[0;34m[CHECK]\033[0m $*"; }
err() { echo -e "\033[0;31m[ERROR]\033[0m $*" >&2; }

# 读取 .env(若存在)。使用 if 而非 `[[ -f ]] && ...`:
# 后者在文件缺失时整体返回非零,set -e 会让脚本在此静默退出。
ENV_FILE="$ROOT/compose/.env"
if [[ -f "$ENV_FILE" ]]; then set -a; source "$ENV_FILE"; set +a; fi

# 轮询 URL 直到 2xx 或耗尽次数(默认 120 次 × 5s)
wait_http() { local url="$1"; local attempts=${2:-120}; local i=1; while ((i<=attempts)); do curl -fsS "$url" >/dev/null 2>&1 && return 0; echo "[..] waiting $url ($i/$attempts)"; sleep 5; ((i++)); done; return 1; }
code_for() { curl -s -o /dev/null -w "%{http_code}" "$1" || echo 000; }
# 提取响应头 Access-Control-Allow-Origin 的值(大小写不敏感)
header_val() { curl -s -D - -o /dev/null "$@" | awk -F': ' 'BEGIN{IGNORECASE=1}$1=="Access-Control-Allow-Origin"{gsub("\r","",$2);print $2}'; }

LOG_DIR="$ROOT/logs"; mkdir -p "$LOG_DIR" || true
OUT_JSON="$LOG_DIR/selfcheck.json"
tmp=$(mktemp)
trap 'rm -f "$tmp"' EXIT   # 确保临时文件总被清理(原实现 cp 分支会遗留)

ok=1

log "checking overlay network"
net_ok=false
if docker network inspect "${ARGUS_OVERLAY_NET:-argus-sys-net}" >/dev/null 2>&1; then
  if docker network inspect "${ARGUS_OVERLAY_NET:-argus-sys-net}" | grep -q '"Driver": "overlay"'; then net_ok=true; fi
fi
[[ "$net_ok" == true ]] || ok=0

log "checking Elasticsearch"
es_ok=true
wait_http "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health" 60 || { es_ok=false; ok=0; }

log "checking Kibana"
kb_code=$(code_for "http://localhost:${KIBANA_PORT:-5601}/api/status")
kb_ok=false
if [[ "$kb_code" == 200 ]]; then
  body=$(curl -sS "http://localhost:${KIBANA_PORT:-5601}/api/status" || true)
  echo "$body" | grep -q '"level"\s*:\s*"available"' && kb_ok=true
fi
[[ "$kb_ok" == true ]] || ok=0

log "checking Master"
master_ok=true
[[ $(code_for "http://localhost:${MASTER_PORT:-32300}/readyz") == 200 ]] || { master_ok=false; ok=0; }

log "checking FTP"
ftp_ok=true
if docker ps --format '{{.Names}}' | grep -q '^argus-ftp$'; then
  docker exec argus-ftp sh -lc 'test -w /private/argus/ftp/share' >/dev/null 2>&1 || { ftp_ok=false; ok=0; }
else
  ftp_ok=false; ok=0
fi

log "checking Prometheus"
prom_ready=true
wait_http "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready" 60 || { prom_ready=false; ok=0; }

log "checking Grafana"
gf_code=$(code_for "http://localhost:${GRAFANA_PORT:-3000}/api/health")
gf_ok=false
if [[ "$gf_code" == 200 ]]; then body=$(curl -sS "http://localhost:${GRAFANA_PORT:-3000}/api/health" || true); echo "$body" | grep -q '"database"\s*:\s*"ok"' && gf_ok=true; fi
[[ "$gf_ok" == true ]] || ok=0

log "checking Alertmanager"
am_ok=true
wait_http "http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status" 60 || { am_ok=false; ok=0; }

log "checking Web-Proxy (CORS)"
cors8084=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8084:-8084}/api/v2/status" || true)
cors8085=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8085:-8085}/api/v1/master/nodes" || true)
wp_ok=true
[[ -n "$cors8084" && -n "$cors8085" ]] || wp_ok=false
[[ "$wp_ok" == true ]] || ok=0

# 修复:原实现将 es/master/ftp/prometheus/alertmanager 硬编码为 true,
# 即使检查失败 selfcheck.json 也显示通过;现逐项写入真实结果。
cat > "$tmp" <<JSON
{
  "overlay_net": $net_ok,
  "es": $es_ok,
  "kibana": $kb_ok,
  "master_readyz": $master_ok,
  "ftp_share_writable": $ftp_ok,
  "prometheus": $prom_ready,
  "grafana": $gf_ok,
  "alertmanager": $am_ok,
  "web_proxy_cors": $wp_ok,
  "timestamp": "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
}
JSON

mv "$tmp" "$OUT_JSON" 2>/dev/null || cp "$tmp" "$OUT_JSON"

if [[ "$ok" == 1 ]]; then
  log "selfcheck OK -> $OUT_JSON"
  exit 0
else
  err "selfcheck FAILED -> $OUT_JSON"
  exit 1
fi
7
deployment_new/templates/server/scripts/status.sh
Normal file
7
deployment_new/templates/server/scripts/status.sh
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
#!/usr/bin/env bash
# Show the current state of the Argus server stack via `docker compose ps`,
# using the packaged compose file and its .env.
set -euo pipefail

pkg_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
env_file="$pkg_root/compose/.env"
compose_file="$pkg_root/compose/docker-compose.yml"

docker compose --env-file "$env_file" -f "$compose_file" ps
9
deployment_new/templates/server/scripts/uninstall.sh
Normal file
9
deployment_new/templates/server/scripts/uninstall.sh
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
#!/usr/bin/env bash
# Tear down the Argus server stack: stop and remove all compose services
# (including orphans). Never fails the script if the stack is already gone.
set -euo pipefail

pkg_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
env_file="$pkg_root/compose/.env"
compose_file="$pkg_root/compose/docker-compose.yml"

echo "[UNINSTALL] stopping compose"
docker compose --env-file "$env_file" -f "$compose_file" down --remove-orphans || true
echo "[UNINSTALL] done"
@ -31,26 +31,31 @@ RUN mkdir -p /usr/share/alertmanager && \
|
|||||||
rm -rf /alertmanager && \
|
rm -rf /alertmanager && \
|
||||||
ln -s ${ALERTMANAGER_BASE_PATH} /alertmanager
|
ln -s ${ALERTMANAGER_BASE_PATH} /alertmanager
|
||||||
|
|
||||||
# 创建 alertmanager 用户(可自定义 UID/GID)
|
# 确保 ubuntu 账户存在并使用 ARGUS_BUILD_UID/GID
|
||||||
# 创建 alertmanager 用户组
|
|
||||||
RUN set -eux; \
|
RUN set -eux; \
|
||||||
# 确保目标 GID 存在;若已被占用,直接使用该 GID(组名不限)\
|
# 确保存在目标 GID 的组;若不存在则优先尝试将 ubuntu 组改为该 GID,否则创建新组
|
||||||
if ! getent group "${ARGUS_BUILD_GID}" >/dev/null; then \
|
if getent group "${ARGUS_BUILD_GID}" >/dev/null; then \
|
||||||
groupadd -g "${ARGUS_BUILD_GID}" alertmanager || true; \
|
:; \
|
||||||
fi; \
|
|
||||||
# 确保存在 alertmanager 用户;若 UID 已被占用,跳过并继续使用现有 UID 的用户
|
|
||||||
if ! id alertmanager >/dev/null 2>&1; then \
|
|
||||||
if getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \
|
|
||||||
# UID 已占用,则创建同名用户但不指定 UID(避免冲突),仅保证 user 存在
|
|
||||||
useradd -M -s /usr/sbin/nologin -g "${ARGUS_BUILD_GID}" alertmanager || true; \
|
|
||||||
else \
|
|
||||||
useradd -M -s /usr/sbin/nologin -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" alertmanager || true; \
|
|
||||||
fi; \
|
|
||||||
else \
|
else \
|
||||||
usermod -g "${ARGUS_BUILD_GID}" alertmanager || true; \
|
if getent group ubuntu >/dev/null; then \
|
||||||
fi
|
groupmod -g "${ARGUS_BUILD_GID}" ubuntu || true; \
|
||||||
|
else \
|
||||||
RUN chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" /usr/share/alertmanager /alertmanager ${ALERTMANAGER_BASE_PATH} /private/argus/etc /usr/local/bin || true
|
groupadd -g "${ARGUS_BUILD_GID}" ubuntu || groupadd -g "${ARGUS_BUILD_GID}" argus || true; \
|
||||||
|
fi; \
|
||||||
|
fi; \
|
||||||
|
# 创建或调整 ubuntu 用户
|
||||||
|
if id ubuntu >/dev/null 2>&1; then \
|
||||||
|
# 设置主组为目标 GID(可用 GID 数字指定)
|
||||||
|
usermod -g "${ARGUS_BUILD_GID}" ubuntu || true; \
|
||||||
|
# 若目标 UID 未被占用,则更新 ubuntu 的 UID
|
||||||
|
if [ "$(id -u ubuntu)" != "${ARGUS_BUILD_UID}" ] && ! getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \
|
||||||
|
usermod -u "${ARGUS_BUILD_UID}" ubuntu || true; \
|
||||||
|
fi; \
|
||||||
|
else \
|
||||||
|
useradd -m -s /bin/bash -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" ubuntu || true; \
|
||||||
|
fi; \
|
||||||
|
# 调整关键目录属主为 ubuntu UID/GID
|
||||||
|
chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" /usr/share/alertmanager /alertmanager ${ALERTMANAGER_BASE_PATH} /private/argus/etc /usr/local/bin || true
|
||||||
|
|
||||||
# 配置内网 apt 源 (如果指定了内网选项)
|
# 配置内网 apt 源 (如果指定了内网选项)
|
||||||
RUN if [ "$USE_INTRANET" = "true" ]; then \
|
RUN if [ "$USE_INTRANET" = "true" ]; then \
|
||||||
|
|||||||
@ -6,7 +6,7 @@ user=root
|
|||||||
|
|
||||||
[program:alertmanager]
|
[program:alertmanager]
|
||||||
command=/usr/local/bin/start-am-supervised.sh
|
command=/usr/local/bin/start-am-supervised.sh
|
||||||
user=alertmanager
|
user=ubuntu
|
||||||
stdout_logfile=/var/log/supervisor/alertmanager.log
|
stdout_logfile=/var/log/supervisor/alertmanager.log
|
||||||
stderr_logfile=/var/log/supervisor/alertmanager_error.log
|
stderr_logfile=/var/log/supervisor/alertmanager_error.log
|
||||||
autorestart=true
|
autorestart=true
|
||||||
|
|||||||
@ -1,9 +1,11 @@
|
|||||||
# 重要:使用 Logstash_Format + Logstash_Prefix,生成 train-*/infer-* 索引
|
# 重要:使用 Logstash_Format + Logstash_Prefix,生成 train-*/infer-* 索引
|
||||||
|
# 说明:Fluent Bit 配置仅支持 ${VAR} 占位符,不支持 Bash 的 ${VAR:-default}
|
||||||
|
# 固定域名要求:使用 es.log.argus.com 与端口 9200
|
||||||
[OUTPUT]
|
[OUTPUT]
|
||||||
Name es
|
Name es
|
||||||
Match app.train
|
Match app.train
|
||||||
Host ${ES_HOST:-localhost}
|
Host es.log.argus.com
|
||||||
Port ${ES_PORT:-9200}
|
Port 9200
|
||||||
Logstash_Format On
|
Logstash_Format On
|
||||||
Logstash_Prefix train
|
Logstash_Prefix train
|
||||||
Replace_Dots On
|
Replace_Dots On
|
||||||
@ -14,8 +16,8 @@
|
|||||||
[OUTPUT]
|
[OUTPUT]
|
||||||
Name es
|
Name es
|
||||||
Match app.infer
|
Match app.infer
|
||||||
Host ${ES_HOST:-localhost}
|
Host es.log.argus.com
|
||||||
Port ${ES_PORT:-9200}
|
Port 9200
|
||||||
Logstash_Format On
|
Logstash_Format On
|
||||||
Logstash_Prefix infer
|
Logstash_Prefix infer
|
||||||
Replace_Dots On
|
Replace_Dots On
|
||||||
|
|||||||
@ -206,7 +206,8 @@ export HOSTNAME
|
|||||||
|
|
||||||
export CLUSTER="${CLUSTER:-local}"
|
export CLUSTER="${CLUSTER:-local}"
|
||||||
export RACK="${RACK:-dev}"
|
export RACK="${RACK:-dev}"
|
||||||
export ES_HOST="${ES_HOST:-localhost}"
|
# 默认使用固定域名(满足“固定域名”需求);若外部传入覆盖,则使用外部值
|
||||||
|
export ES_HOST="${ES_HOST:-es.log.argus.com}"
|
||||||
export ES_PORT="${ES_PORT:-9200}"
|
export ES_PORT="${ES_PORT:-9200}"
|
||||||
|
|
||||||
log_info "Environment variables:"
|
log_info "Environment variables:"
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user