diff --git a/deployment_new/.gitignore b/deployment_new/.gitignore new file mode 100644 index 0000000..a319647 --- /dev/null +++ b/deployment_new/.gitignore @@ -0,0 +1 @@ +artifact/ diff --git a/deployment_new/README.md b/deployment_new/README.md new file mode 100644 index 0000000..f433c34 --- /dev/null +++ b/deployment_new/README.md @@ -0,0 +1,14 @@ +# deployment_new + +本目录用于新的部署打包与交付实现(不影响既有 `deployment/`)。 + +里程碑 M1(当前实现) +- `build/make_server_package.sh`:生成 Server 包(逐服务镜像 tar.gz、compose、.env.example、docs、private 骨架、manifest/checksums、打包 tar.gz)。 +- `build/make_client_gpu_package.sh`:生成 Client‑GPU 包(GPU bundle 镜像 tar.gz、busybox.tar、compose、.env.example、docs、private 骨架、manifest/checksums、打包 tar.gz)。 + +模板 +- `templates/server/compose/docker-compose.yml`:部署专用,镜像默认使用 `:${PKG_VERSION}` 版本 tag,可通过 `.env` 覆盖。 +- `templates/client_gpu/compose/docker-compose.yml`:GPU 节点专用,使用 `:${PKG_VERSION}` 版本 tag。 + +注意:M1 仅产出安装包,不包含安装脚本落地;安装/运维脚本将在 M2 落地并纳入包内。 + diff --git a/deployment_new/build/common.sh b/deployment_new/build/common.sh new file mode 100644 index 0000000..9db255b --- /dev/null +++ b/deployment_new/build/common.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash +set -euo pipefail + +log() { echo -e "\033[0;34m[INFO]\033[0m $*"; } +warn() { echo -e "\033[1;33m[WARN]\033[0m $*"; } +err() { echo -e "\033[0;31m[ERR ]\033[0m $*" >&2; } + +require_cmd() { + local miss=0 + for c in "$@"; do + if ! command -v "$c" >/dev/null 2>&1; then err "missing command: $c"; miss=1; fi + done + [[ $miss -eq 0 ]] +} + +today_version() { date +%Y%m%d; } + +checksum_dir() { + local dir="$1"; local out="$2"; : > "$out"; + (cd "$dir" && find . -type f -print0 | sort -z | xargs -0 sha256sum) >> "$out" +} + +make_dir() { mkdir -p "$1"; } + +copy_tree() { + local src="$1" dst="$2"; rsync -a --delete "$src/" "$dst/" 2>/dev/null || cp -r "$src/." "$dst/"; +} + +gen_manifest() { + local root="$1"; local out="$2"; : > "$out"; + (cd "$root" && find . 
-maxdepth 4 -type f -printf "%p\n" | sort) >> "$out" +} + diff --git a/deployment_new/build/make_client_gpu_package.sh b/deployment_new/build/make_client_gpu_package.sh new file mode 100755 index 0000000..9e2d5ac --- /dev/null +++ b/deployment_new/build/make_client_gpu_package.sh @@ -0,0 +1,131 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Make client GPU package (versioned gpu bundle image, compose, env, docs, busybox) + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +TEMPL_DIR="$ROOT_DIR/deployment_new/templates/client_gpu" +ART_ROOT="$ROOT_DIR/deployment_new/artifact/client_gpu" + +# Use deployment_new local common helpers +COMMON_SH="$ROOT_DIR/deployment_new/build/common.sh" +. "$COMMON_SH" + +usage(){ cat </ and client_gpu_YYYYMMDD.tar.gz +EOF +} + +VERSION="" +IMAGE="argus-sys-metric-test-node-bundle-gpu:latest" +while [[ $# -gt 0 ]]; do + case "$1" in + --version) VERSION="$2"; shift 2;; + --image) IMAGE="$2"; shift 2;; + -h|--help) usage; exit 0;; + *) err "unknown arg: $1"; usage; exit 1;; + esac +done +if [[ -z "$VERSION" ]]; then VERSION="$(today_version)"; fi + +require_cmd docker tar gzip + +STAGE="$(mktemp -d)"; trap 'rm -rf "$STAGE"' EXIT +PKG_DIR="$ART_ROOT/$VERSION" +mkdir -p "$PKG_DIR" "$STAGE/images" "$STAGE/compose" "$STAGE/docs" "$STAGE/scripts" "$STAGE/private/argus" + +# 1) Save GPU bundle image with version tag +if ! 
docker image inspect "$IMAGE" >/dev/null 2>&1; then + err "missing image: $IMAGE"; exit 1; fi + +REPO="${IMAGE%%:*}"; TAG_VER="$REPO:$VERSION" +docker tag "$IMAGE" "$TAG_VER" +out_tar="$STAGE/images/${REPO//\//-}-$VERSION.tar" +docker save -o "$out_tar" "$TAG_VER" +gzip -f "$out_tar" + +# 2) Busybox tar for connectivity/overlay warmup (prefer local template; fallback to docker save) +BB_SRC="$TEMPL_DIR/images/busybox.tar" +if [[ -f "$BB_SRC" ]]; then + cp "$BB_SRC" "$STAGE/images/busybox.tar" +else + if docker image inspect busybox:latest >/dev/null 2>&1 || docker pull busybox:latest >/dev/null 2>&1; then + docker save -o "$STAGE/images/busybox.tar" busybox:latest + log "Included busybox from local docker daemon" + else + warn "busybox image not found and cannot pull; skipping busybox.tar" + fi +fi + +# 3) Compose + env template and docs/scripts from templates +cp "$TEMPL_DIR/compose/docker-compose.yml" "$STAGE/compose/docker-compose.yml" +ENV_EX="$STAGE/compose/.env.example" +cat >"$ENV_EX" </dev/null 2>&1 || cp -r "$CLIENT_DOC_SRC/." "$STAGE/docs/" +fi + +# Placeholder scripts (will be implemented in M2) +cat >"$STAGE/scripts/README.md" <<'EOF' +# Client-GPU Scripts (Placeholder) + +本目录将在 M2 引入: +- config.sh / install.sh + +当前为占位,便于包结构审阅。 +EOF + +# 5) Scripts (from deployment_new templates) and Private skeleton +SCRIPTS_SRC="$TEMPL_DIR/scripts" +if [[ -d "$SCRIPTS_SRC" ]]; then + rsync -a "$SCRIPTS_SRC/" "$STAGE/scripts/" >/dev/null 2>&1 || cp -r "$SCRIPTS_SRC/." "$STAGE/scripts/" + find "$STAGE/scripts" -type f -name '*.sh' -exec chmod +x {} + 2>/dev/null || true +fi +mkdir -p "$STAGE/private/argus/agent" + +# 6) Manifest & checksums +gen_manifest "$STAGE" "$STAGE/manifest.txt" +checksum_dir "$STAGE" "$STAGE/checksums.txt" + +# 7) Move to artifact dir and pack +mkdir -p "$PKG_DIR" +rsync -a "$STAGE/" "$PKG_DIR/" >/dev/null 2>&1 || cp -r "$STAGE/." 
"$PKG_DIR/" + +OUT_TAR_DIR="$(dirname "$PKG_DIR")" +OUT_TAR="$OUT_TAR_DIR/client_gpu_${VERSION}.tar.gz" +log "Creating tarball: $OUT_TAR" +(cd "$PKG_DIR/.." && tar -czf "$OUT_TAR" "$(basename "$PKG_DIR")") +log "Client-GPU package ready: $PKG_DIR" +echo "$OUT_TAR" diff --git a/deployment_new/build/make_server_package.sh b/deployment_new/build/make_server_package.sh new file mode 100755 index 0000000..16d81f9 --- /dev/null +++ b/deployment_new/build/make_server_package.sh @@ -0,0 +1,166 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Make server deployment package (versioned, per-image tars, full compose, docs, skeleton) + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +TEMPL_DIR="$ROOT_DIR/deployment_new/templates/server" +ART_ROOT="$ROOT_DIR/deployment_new/artifact/server" + +# Use deployment_new local common helpers +COMMON_SH="$ROOT_DIR/deployment_new/build/common.sh" +. "$COMMON_SH" + +usage(){ cat </ and server_YYYYMMDD.tar.gz +EOF +} + +VERSION="" +while [[ $# -gt 0 ]]; do + case "$1" in + --version) VERSION="$2"; shift 2;; + -h|--help) usage; exit 0;; + *) err "unknown arg: $1"; usage; exit 1;; + esac +done +if [[ -z "$VERSION" ]]; then VERSION="$(today_version)"; fi + +require_cmd docker tar gzip awk sed + +IMAGES=( + argus-bind9 + argus-master + argus-elasticsearch + argus-kibana + argus-metric-ftp + argus-metric-prometheus + argus-metric-grafana + argus-alertmanager + argus-web-frontend + argus-web-proxy +) + +STAGE="$(mktemp -d)"; trap 'rm -rf "$STAGE"' EXIT +PKG_DIR="$ART_ROOT/$VERSION" +mkdir -p "$PKG_DIR" "$STAGE/images" "$STAGE/compose" "$STAGE/docs" "$STAGE/scripts" "$STAGE/private/argus" + +# 1) Save per-image tars with version tag +log "Tagging and saving images (version=$VERSION)" +for repo in "${IMAGES[@]}"; do + if ! docker image inspect "$repo:latest" >/dev/null 2>&1 && ! 
docker image inspect "$repo:$VERSION" >/dev/null 2>&1; then + err "missing image: $repo (need :latest or :$VERSION)"; exit 1; fi + if docker image inspect "$repo:$VERSION" >/dev/null 2>&1; then + tag="$repo:$VERSION" + else + docker tag "$repo:latest" "$repo:$VERSION" + tag="$repo:$VERSION" + fi + out_tar="$STAGE/images/${repo//\//-}-$VERSION.tar" + docker save -o "$out_tar" "$tag" + gzip -f "$out_tar" +done + +# 2) Compose + env template +cp "$TEMPL_DIR/compose/docker-compose.yml" "$STAGE/compose/docker-compose.yml" +ENV_EX="$STAGE/compose/.env.example" +cat >"$ENV_EX" <>"$ENV_EX" <<'EOF' + +# Host ports for server compose +MASTER_PORT=32300 +ES_HTTP_PORT=9200 +KIBANA_PORT=5601 +PROMETHEUS_PORT=9090 +GRAFANA_PORT=3000 +ALERTMANAGER_PORT=9093 +WEB_PROXY_PORT_8080=8080 +WEB_PROXY_PORT_8081=8081 +WEB_PROXY_PORT_8082=8082 +WEB_PROXY_PORT_8083=8083 +WEB_PROXY_PORT_8084=8084 +WEB_PROXY_PORT_8085=8085 + +# Overlay network name +ARGUS_OVERLAY_NET=argus-sys-net + +# FTP defaults +FTP_USER=ftpuser +FTP_PASSWORD=NASPlab1234! + +# UID/GID for volume ownership +ARGUS_BUILD_UID=2133 +ARGUS_BUILD_GID=2015 +EOF + +# 3) Docs (from deployment_new templates) +DOCS_SRC="$TEMPL_DIR/docs" +if [[ -d "$DOCS_SRC" ]]; then + rsync -a "$DOCS_SRC/" "$STAGE/docs/" >/dev/null 2>&1 || cp -r "$DOCS_SRC/." "$STAGE/docs/" +fi + +# 6) Scripts (from deployment_new templates) +SCRIPTS_SRC="$TEMPL_DIR/scripts" +if [[ -d "$SCRIPTS_SRC" ]]; then + rsync -a "$SCRIPTS_SRC/" "$STAGE/scripts/" >/dev/null 2>&1 || cp -r "$SCRIPTS_SRC/." 
"$STAGE/scripts/" + find "$STAGE/scripts" -type f -name '*.sh' -exec chmod +x {} + 2>/dev/null || true +fi + +# 4) Private skeleton (minimum) +mkdir -p \ + "$STAGE/private/argus/etc" \ + "$STAGE/private/argus/master" \ + "$STAGE/private/argus/metric/prometheus" \ + "$STAGE/private/argus/metric/prometheus/data" \ + "$STAGE/private/argus/metric/prometheus/rules" \ + "$STAGE/private/argus/metric/prometheus/targets" \ + "$STAGE/private/argus/metric/grafana" \ + "$STAGE/private/argus/metric/grafana/data" \ + "$STAGE/private/argus/metric/grafana/logs" \ + "$STAGE/private/argus/metric/grafana/plugins" \ + "$STAGE/private/argus/metric/grafana/provisioning/datasources" \ + "$STAGE/private/argus/metric/grafana/provisioning/dashboards" \ + "$STAGE/private/argus/metric/grafana/data/sessions" \ + "$STAGE/private/argus/metric/grafana/data/dashboards" \ + "$STAGE/private/argus/metric/grafana/config" \ + "$STAGE/private/argus/metric/ftp" \ + "$STAGE/private/argus/alert/alertmanager" \ + "$STAGE/private/argus/log/elasticsearch" \ + "$STAGE/private/argus/log/kibana" + +# 7) Manifest & checksums +gen_manifest "$STAGE" "$STAGE/manifest.txt" +checksum_dir "$STAGE" "$STAGE/checksums.txt" + +# 8) Move to artifact dir and pack +mkdir -p "$PKG_DIR" +rsync -a "$STAGE/" "$PKG_DIR/" >/dev/null 2>&1 || cp -r "$STAGE/." "$PKG_DIR/" + +OUT_TAR_DIR="$(dirname "$PKG_DIR")" +OUT_TAR="$OUT_TAR_DIR/server_${VERSION}.tar.gz" +log "Creating tarball: $OUT_TAR" +(cd "$PKG_DIR/.." 
&& tar -czf "$OUT_TAR" "$(basename "$PKG_DIR")") +log "Server package ready: $PKG_DIR" +echo "$OUT_TAR" diff --git a/deployment_new/templates/client_gpu/compose/docker-compose.yml b/deployment_new/templates/client_gpu/compose/docker-compose.yml new file mode 100644 index 0000000..1e3a19f --- /dev/null +++ b/deployment_new/templates/client_gpu/compose/docker-compose.yml @@ -0,0 +1,41 @@ +version: "3.8" + +networks: + argus-sys-net: + external: true + +services: + metric-gpu-node: + image: ${NODE_GPU_BUNDLE_IMAGE_TAG:-argus-sys-metric-test-node-bundle-gpu:${PKG_VERSION}} + container_name: argus-metric-gpu-node-swarm + hostname: ${GPU_NODE_HOSTNAME} + restart: unless-stopped + privileged: true + runtime: nvidia + environment: + - TZ=Asia/Shanghai + - DEBIAN_FRONTEND=noninteractive + - MASTER_ENDPOINT=${MASTER_ENDPOINT:-http://master.argus.com:3000} + # Fluent Bit / 日志上报目标(固定域名) + - ES_HOST=es.log.argus.com + - ES_PORT=9200 + - FTPIP=${FTPIP} + - BINDIP=${BINDIP} + - FTP_USER=${FTP_USER:-ftpuser} + - FTP_PASSWORD=${FTP_PASSWORD:-NASPlab1234!} + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + - AGENT_ENV=${AGENT_ENV} + - AGENT_USER=${AGENT_USER} + - AGENT_INSTANCE=${AGENT_INSTANCE} + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + - GPU_MODE=gpu + dns: + - ${BINDIP} + networks: [argus-sys-net] + volumes: + - ../private/argus/agent:/private/argus/agent + - ../logs/infer:/logs/infer + - ../logs/train:/logs/train + command: ["sleep", "infinity"] diff --git a/deployment_new/templates/client_gpu/docs/INSTALL_CLIENT_zh.md b/deployment_new/templates/client_gpu/docs/INSTALL_CLIENT_zh.md new file mode 100644 index 0000000..8915b5c --- /dev/null +++ b/deployment_new/templates/client_gpu/docs/INSTALL_CLIENT_zh.md @@ -0,0 +1,73 @@ +# Argus Client‑GPU 安装指南(deployment_new) + +## 一、准备条件(开始前确认) +- GPU 节点安装了 NVIDIA 驱动,`nvidia-smi` 正常; +- Docker & Docker Compose v2 已安装; +- 使用统一账户 `argus`(UID=2133,GID=2015)执行安装,并加入 
`docker` 组(如已创建可跳过): + ```bash + sudo groupadd --gid 2015 argus || true + sudo useradd --uid 2133 --gid 2015 --create-home --shell /bin/bash argus || true + sudo passwd argus + sudo usermod -aG docker argus + su - argus -c 'id; docker ps >/dev/null && echo OK || echo NO_DOCKER_PERMISSION' + ``` + 后续解压与执行(config/install/uninstall)均使用 `argus` 账户进行。 +- 从 Server 安装方拿到 `cluster-info.env`(包含 `SWARM_MANAGER_ADDR/BINDIP/FTPIP/SWARM_JOIN_TOKEN_*`)。 + +## 二、解包 +- `tar -xzf client_gpu_YYYYMMDD.tar.gz` +- 进入目录:`cd client_gpu_YYYYMMDD/` +- 你应当看到:`images/`(GPU bundle、busybox)、`compose/`、`scripts/`、`docs/`。 + +## 三、配置 config(预热 overlay + 生成 .env) +命令: +``` +cp /path/to/cluster-info.env ./ # 或 export CLUSTER_INFO=/abs/path/cluster-info.env +./scripts/config.sh +``` +脚本做了什么: +- 读取 `cluster-info.env` 并 `docker swarm join`(幂等); +- 自动用 busybox 预热 external overlay `argus-sys-net`,等待最多 60s 直到本机可见; +- 生成/更新 `compose/.env`:填入 `BINDIP/FTPIP/SWARM_*`,并“保留你已填写的 AGENT_* 与 GPU_NODE_HOSTNAME”(不会覆盖)。 + +看到什么才算成功: +- 终端输出类似:`已预热 overlay=argus-sys-net 并生成 compose/.env;可执行 scripts/install.sh`; +- `compose/.env` 至少包含: + - `AGENT_ENV/AGENT_USER/AGENT_INSTANCE/GPU_NODE_HOSTNAME`(需要你提前填写); + - `BINDIP/FTPIP/SWARM_MANAGER_ADDR/SWARM_JOIN_TOKEN_*`; + - `NODE_GPU_BUNDLE_IMAGE_TAG=...:YYYYMMDD`。 + +### 日志映射(重要) +- 容器内 `/logs/infer` 与 `/logs/train` 已映射到包根 `./logs/infer` 与 `./logs/train`: + - 你可以直接在宿主机查看推理/训练日志:`tail -f logs/infer/*.log`、`tail -f logs/train/*.log`; + - install 脚本会自动创建这两个目录。 + +若提示缺少必填项: +- 打开 `compose/.env` 按提示补齐 `AGENT_*` 与 `GPU_NODE_HOSTNAME`,再次执行 `./scripts/config.sh`(脚本不会覆盖你已填的值)。 + +## 四、安装 install(加载镜像 + 起容器 + 跟日志) +命令: +``` +./scripts/install.sh +``` +脚本做了什么: +- 如有必要,先自动预热 overlay; +- 从 `images/` 导入 `argus-sys-metric-test-node-bundle-gpu-*.tar.gz` 到本地 Docker; +- `docker compose up -d` 启动 GPU 节点容器,并自动执行 `docker logs -f argus-metric-gpu-node-swarm` 跟踪安装过程。 + +看到什么才算成功: +- 日志中出现:`[BOOT] local bundle install OK: version=...` / `dcgm-exporter ... 
listening` / `node state present: /private/argus/agent/<hostname>/node.json`; +- `docker exec argus-metric-gpu-node-swarm nvidia-smi -L` 能列出 GPU; +- 在 Server 侧 Prometheus `/api/v1/targets` 中,GPU 节点 9100(node-exporter)与 9400(dcgm-exporter)至少其一 up。 + +## 五、卸载 uninstall +命令: +``` +./scripts/uninstall.sh +``` +行为:Compose down(如有 .env),并删除 warmup 容器与节点容器。 + +## 六、常见问题 +- `本机未看到 overlay`:config/install 已自动预热;若仍失败,请检查与 manager 的网络连通性以及 manager 上是否已创建 `argus-sys-net`。 +- `busybox 缺失`:确保包根 `images/busybox.tar` 在,或主机已有 `busybox:latest`。 +- `加入 Swarm 失败`:确认 `cluster-info.env` 的 `SWARM_MANAGER_ADDR` 与 `SWARM_JOIN_TOKEN_WORKER` 正确,或在 manager 上重新 `docker swarm join-token -q worker` 后更新该文件。 diff --git a/deployment_new/templates/client_gpu/images/busybox.tar b/deployment_new/templates/client_gpu/images/busybox.tar new file mode 100644 index 0000000..0840f71 Binary files /dev/null and b/deployment_new/templates/client_gpu/images/busybox.tar differ diff --git a/deployment_new/templates/client_gpu/scripts/config.sh b/deployment_new/templates/client_gpu/scripts/config.sh new file mode 100644 index 0000000..1f7b1fe --- /dev/null +++ b/deployment_new/templates/client_gpu/scripts/config.sh @@ -0,0 +1,81 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +PKG_ROOT="$ROOT_DIR" +ENV_EX="$PKG_ROOT/compose/.env.example" +ENV_OUT="$PKG_ROOT/compose/.env" + +info(){ echo -e "\033[34m[CONFIG-GPU]\033[0m $*"; } +err(){ echo -e "\033[31m[ERROR]\033[0m $*" >&2; } +require(){ local ok=1; for c in "$@"; do command -v "$c" >/dev/null 2>&1 || { err "缺少依赖: $c"; ok=0; }; done; [[ $ok -eq 1 ]]; } +require docker curl jq awk sed tar gzip nvidia-smi + +# 磁盘空间检查(MB) +check_disk(){ local p="$1"; local need=10240; local free + free=$(df -Pm "$p" | awk 'NR==2{print $4+0}') + if [[ -z "$free" || "$free" -lt "$need" ]]; then err "磁盘空间不足: $p 剩余 ${free:-0}MB (<${need}MB)"; return 1; fi +} +check_disk "$PKG_ROOT"; check_disk "/var/lib/docker" || true + +# 导入 cluster-info.env(默认取当前包根,也可用 CLUSTER_INFO 指定路径) +CI_IN="${CLUSTER_INFO:-$PKG_ROOT/cluster-info.env}" +info "读取 cluster-info.env: $CI_IN" +[[ -f "$CI_IN" ]] || { err "找不到 cluster-info.env(默认当前包根,或设置环境变量 CLUSTER_INFO 指定绝对路径)"; exit 1; } +set -a; source "$CI_IN"; set +a +[[ -n "${SWARM_MANAGER_ADDR:-}" && -n "${SWARM_JOIN_TOKEN_WORKER:-}" ]] || { err "cluster-info.env 缺少 SWARM 信息(SWARM_MANAGER_ADDR/SWARM_JOIN_TOKEN_WORKER)"; exit 1; } + +# 加入 Swarm(幂等) +info "加入 Swarm(幂等):$SWARM_MANAGER_ADDR" +docker swarm join --token "$SWARM_JOIN_TOKEN_WORKER" "$SWARM_MANAGER_ADDR":2377 >/dev/null 2>&1 || true + +# 导入 busybox 并做 overlay 预热与连通性(总是执行) +NET_NAME="${ARGUS_OVERLAY_NET:-argus-sys-net}" +# 准备 busybox +if ! 
docker image inspect busybox:latest >/dev/null 2>&1; then + if [[ -f "$PKG_ROOT/images/busybox.tar" ]]; then + info "加载 busybox.tar 以预热 overlay" + docker load -i "$PKG_ROOT/images/busybox.tar" >/dev/null + else + err "缺少 busybox 镜像(包内 images/busybox.tar 或本地 busybox:latest),无法预热 overlay $NET_NAME"; exit 1 + fi +fi +# 预热容器(worker 侧加入 overlay 以便本地可见) +docker rm -f argus-net-warmup >/dev/null 2>&1 || true +info "启动 warmup 容器加入 overlay: $NET_NAME" +docker run -d --rm --name argus-net-warmup --network "$NET_NAME" ${BINDIP:+--dns "$BINDIP"} busybox:latest sleep 600 >/dev/null 2>&1 || true +for i in {1..60}; do docker network inspect "$NET_NAME" >/dev/null 2>&1 && { info "overlay 可见 (t=${i}s)"; break; }; sleep 1; done +docker network inspect "$NET_NAME" >/dev/null 2>&1 || { err "预热后仍未看到 overlay: $NET_NAME;请确认 manager 已创建并网络可达"; exit 1; } + +# 从 warmup 容器内测试连通性(必须能 ping 通 BINDIP 与 FTPIP) +ping_ok(){ docker exec argus-net-warmup sh -lc "ping -c 1 -W 2 $1 >/dev/null 2>&1"; } +if [[ -n "${BINDIP:-}" ]]; then + ping_ok "$BINDIP" || { err "容器内无法 ping 通 BINDIP=$BINDIP;请检查 overlay 与 Bind9 容器状态"; exit 1; } +fi +if [[ -n "${FTPIP:-}" ]]; then + ping_ok "$FTPIP" || { err "容器内无法 ping 通 FTPIP=$FTPIP;请检查 overlay 与 FTP 容器状态"; exit 1; } +fi + +# 生成/更新 .env(保留人工填写项,不覆盖已有键) +if [[ ! 
-f "$ENV_OUT" ]]; then + cp "$ENV_EX" "$ENV_OUT" +fi + +set_kv(){ local k="$1" v="$2"; if grep -q "^${k}=" "$ENV_OUT"; then sed -i -E "s#^${k}=.*#${k}=${v}#" "$ENV_OUT"; else echo "${k}=${v}" >> "$ENV_OUT"; fi } + +set_kv BINDIP "${BINDIP:-}" +set_kv FTPIP "${FTPIP:-}" +set_kv SWARM_MANAGER_ADDR "${SWARM_MANAGER_ADDR:-}" +set_kv SWARM_JOIN_TOKEN_WORKER "${SWARM_JOIN_TOKEN_WORKER:-}" +set_kv SWARM_JOIN_TOKEN_MANAGER "${SWARM_JOIN_TOKEN_MANAGER:-}" + +REQ_VARS=(AGENT_ENV AGENT_USER AGENT_INSTANCE GPU_NODE_HOSTNAME) +missing=() +for v in "${REQ_VARS[@]}"; do + val=$(grep -E "^$v=" "$ENV_OUT" | head -1 | cut -d= -f2-) + if [[ -z "$val" ]]; then missing+=("$v"); fi +done +if [[ ${#missing[@]} -gt 0 ]]; then + err "以下变量必须在 compose/.env 中填写:${missing[*]}(已保留你现有的内容,不会被覆盖)"; exit 1; fi + +info "已生成 compose/.env;可执行 scripts/install.sh" diff --git a/deployment_new/templates/client_gpu/scripts/install.sh b/deployment_new/templates/client_gpu/scripts/install.sh new file mode 100644 index 0000000..27950c0 --- /dev/null +++ b/deployment_new/templates/client_gpu/scripts/install.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +PKG_ROOT="$ROOT_DIR" +ENV_FILE="$PKG_ROOT/compose/.env" +COMPOSE_FILE="$PKG_ROOT/compose/docker-compose.yml" + +info(){ echo -e "\033[34m[INSTALL-GPU]\033[0m $*"; } +err(){ echo -e "\033[31m[ERROR]\033[0m $*" >&2; } +require(){ local ok=1; for c in "$@"; do command -v "$c" >/dev/null 2>&1 || { err "缺少依赖: $c"; ok=0; }; done; [[ $ok -eq 1 ]]; } +require docker docker compose nvidia-smi + +[[ -f "$ENV_FILE" ]] || { err "缺少 compose/.env,请先运行 scripts/config.sh"; exit 1; } +info "使用环境文件: $ENV_FILE" + +# 预热 overlay(当 config 执行很久之前或容器已被清理时,warmup 可能不存在) +set -a; source "$ENV_FILE"; set +a +NET_NAME="${ARGUS_OVERLAY_NET:-argus-sys-net}" +info "检查 overlay 网络可见性: $NET_NAME" +if ! docker network inspect "$NET_NAME" >/dev/null 2>&1; then + # 如 Overlay 不可见,尝试用 busybox 预热 + if ! 
docker image inspect busybox:latest >/dev/null 2>&1; then + if [[ -f "$PKG_ROOT/images/busybox.tar" ]]; then docker load -i "$PKG_ROOT/images/busybox.tar"; else err "缺少 busybox 镜像(images/busybox.tar 或本地 busybox:latest)"; exit 1; fi + fi + docker rm -f argus-net-warmup >/dev/null 2>&1 || true + docker run -d --rm --name argus-net-warmup --network "$NET_NAME" ${BINDIP:+--dns "$BINDIP"} busybox:latest sleep 600 >/dev/null 2>&1 || true + for i in {1..60}; do docker network inspect "$NET_NAME" >/dev/null 2>&1 && break; sleep 1; done + docker network inspect "$NET_NAME" >/dev/null 2>&1 || { err "预热后仍未看到 overlay: $NET_NAME;请确认 manager 已创建并网络可达"; exit 1; } + info "overlay 已可见(warmup=argus-net-warmup)" +fi + +# 容器内连通性检查:BINDIP 与 FTPIP 可达 +ping_ok(){ docker exec argus-net-warmup sh -lc "ping -c 1 -W 2 $1 >/dev/null 2>&1"; } +if [[ -n "${BINDIP:-}" ]]; then + if ping_ok "$BINDIP"; then info "warmup 内可达 BINDIP=$BINDIP"; else err "容器内无法 ping 通 BINDIP=$BINDIP"; exit 1; fi +fi +if [[ -n "${FTPIP:-}" ]]; then + if ping_ok "$FTPIP"; then info "warmup 内可达 FTPIP=$FTPIP"; else err "容器内无法 ping 通 FTPIP=$FTPIP"; exit 1; fi +fi + +# 导入 GPU bundle 镜像 +IMG_TGZ=$(ls -1 "$PKG_ROOT"/images/argus-sys-metric-test-node-bundle-gpu-*.tar.gz 2>/dev/null | head -1 || true) +[[ -n "$IMG_TGZ" ]] || { err "找不到 GPU bundle 镜像 tar.gz"; exit 1; } +info "导入 GPU bundle 镜像: $(basename "$IMG_TGZ")" +tmp=$(mktemp); gunzip -c "$IMG_TGZ" > "$tmp"; docker load -i "$tmp" >/dev/null; rm -f "$tmp" + +# 确保日志目录存在(宿主侧,用于映射 /logs/infer 与 /logs/train) +mkdir -p "$PKG_ROOT/logs/infer" "$PKG_ROOT/logs/train" +info "日志目录已准备: logs/infer logs/train" + +# 启动 compose 并跟踪日志 +info "启动 GPU 节点 (docker compose up -d)" +docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" up -d +docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" ps +info "跟踪节点容器日志(按 Ctrl+C 退出)" +docker logs -f argus-metric-gpu-node-swarm || true diff --git a/deployment_new/templates/client_gpu/scripts/uninstall.sh 
b/deployment_new/templates/client_gpu/scripts/uninstall.sh new file mode 100644 index 0000000..2be7d6d --- /dev/null +++ b/deployment_new/templates/client_gpu/scripts/uninstall.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +PKG_ROOT="$ROOT_DIR" +ENV_FILE="$PKG_ROOT/compose/.env" +COMPOSE_FILE="$PKG_ROOT/compose/docker-compose.yml" + +info(){ echo -e "\033[34m[UNINSTALL-GPU]\033[0m $*"; } + +if [[ -f "$ENV_FILE" ]]; then + info "stopping compose project" + docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" down --remove-orphans || true +else + info "compose/.env not found; attempting to remove container by name" +fi + +# remove warmup container if still running +docker rm -f argus-net-warmup >/dev/null 2>&1 || true + +# remove node container if present +docker rm -f argus-metric-gpu-node-swarm >/dev/null 2>&1 || true + +info "uninstall completed" + diff --git a/deployment_new/templates/server/compose/docker-compose.yml b/deployment_new/templates/server/compose/docker-compose.yml new file mode 100644 index 0000000..1350d58 --- /dev/null +++ b/deployment_new/templates/server/compose/docker-compose.yml @@ -0,0 +1,175 @@ +version: "3.8" + +networks: + argus-sys-net: + external: true + +services: + bind: + image: ${BIND_IMAGE_TAG:-argus-bind9:${PKG_VERSION}} + container_name: argus-bind-sys + networks: [argus-sys-net] + volumes: + - ../private:/private + restart: unless-stopped + + master: + image: ${MASTER_IMAGE_TAG:-argus-master:${PKG_VERSION}} + container_name: argus-master-sys + depends_on: [bind] + environment: + - OFFLINE_THRESHOLD_SECONDS=6 + - ONLINE_THRESHOLD_SECONDS=2 + - SCHEDULER_INTERVAL_SECONDS=1 + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + ports: + - "${MASTER_PORT:-32300}:3000" + volumes: + - ../private/argus/master:/private/argus/master + - ../private/argus/metric/prometheus:/private/argus/metric/prometheus + - 
../private/argus/etc:/private/argus/etc + networks: [argus-sys-net] + restart: unless-stopped + + es: + image: ${ES_IMAGE_TAG:-argus-elasticsearch:${PKG_VERSION}} + container_name: argus-es-sys + environment: + - discovery.type=single-node + - xpack.security.enabled=false + - ES_JAVA_OPTS=-Xms512m -Xmx512m + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + volumes: + - ../private/argus/log/elasticsearch:/private/argus/log/elasticsearch + - ../private/argus/etc:/private/argus/etc + ports: + - "${ES_HTTP_PORT:-9200}:9200" + restart: unless-stopped + networks: [argus-sys-net] + + kibana: + image: ${KIBANA_IMAGE_TAG:-argus-kibana:${PKG_VERSION}} + container_name: argus-kibana-sys + environment: + - ELASTICSEARCH_HOSTS=http://es.log.argus.com:9200 + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + volumes: + - ../private/argus/log/kibana:/private/argus/log/kibana + - ../private/argus/etc:/private/argus/etc + depends_on: [es] + ports: + - "${KIBANA_PORT:-5601}:5601" + restart: unless-stopped + networks: [argus-sys-net] + + ftp: + image: ${FTP_IMAGE_TAG:-argus-metric-ftp:${PKG_VERSION}} + container_name: argus-ftp + restart: unless-stopped + environment: + - TZ=Asia/Shanghai + - FTP_BASE_PATH=/private/argus/ftp + - FTP_PASSWORD=${FTP_PASSWORD:-NASPlab1234!} + - DOMAIN=${FTP_DOMAIN:-ftp.metric.argus.com} + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + ports: + - "${FTP_PORT:-21}:21" + - "${FTP_DATA_PORT:-20}:20" + - "${FTP_PASSIVE_HOST_RANGE:-21100-21110}:21100-21110" + volumes: + - ../private/argus/metric/ftp:/private/argus/ftp + - ../private/argus/etc:/private/argus/etc + networks: [argus-sys-net] + + prometheus: + image: ${PROM_IMAGE_TAG:-argus-metric-prometheus:${PKG_VERSION}} + container_name: argus-prometheus + restart: unless-stopped + environment: + - TZ=Asia/Shanghai + - PROMETHEUS_BASE_PATH=/private/argus/metric/prometheus + - 
ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + ports: + - "${PROMETHEUS_PORT:-9090}:9090" + volumes: + - ../private/argus/metric/prometheus:/private/argus/metric/prometheus + - ../private/argus/etc:/private/argus/etc + networks: [argus-sys-net] + + grafana: + image: ${GRAFANA_IMAGE_TAG:-argus-metric-grafana:${PKG_VERSION}} + container_name: argus-grafana + restart: unless-stopped + environment: + - TZ=Asia/Shanghai + - GRAFANA_BASE_PATH=/private/argus/metric/grafana + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + - GF_SERVER_HTTP_PORT=3000 + - GF_LOG_LEVEL=warn + - GF_LOG_MODE=console + - GF_PATHS_PROVISIONING=/private/argus/metric/grafana/provisioning + - GF_AUTH_ANONYMOUS_ENABLED=true + - GF_AUTH_ANONYMOUS_ORG_ROLE=Viewer + ports: + - "${GRAFANA_PORT:-3000}:3000" + volumes: + - ../private/argus/metric/grafana:/private/argus/metric/grafana + - ../private/argus/etc:/private/argus/etc + depends_on: [prometheus] + networks: [argus-sys-net] + + alertmanager: + image: ${ALERT_IMAGE_TAG:-argus-alertmanager:${PKG_VERSION}} + container_name: argus-alertmanager + environment: + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + volumes: + - ../private/argus/etc:/private/argus/etc + - ../private/argus/alert/alertmanager:/private/argus/alert/alertmanager + networks: [argus-sys-net] + ports: + - "${ALERTMANAGER_PORT:-9093}:9093" + restart: unless-stopped + + web-frontend: + image: ${FRONT_IMAGE_TAG:-argus-web-frontend:${PKG_VERSION}} + container_name: argus-web-frontend + environment: + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + - EXTERNAL_MASTER_PORT=${WEB_PROXY_PORT_8085:-8085} + - EXTERNAL_ALERTMANAGER_PORT=${WEB_PROXY_PORT_8084:-8084} + - EXTERNAL_GRAFANA_PORT=${WEB_PROXY_PORT_8081:-8081} + - EXTERNAL_PROMETHEUS_PORT=${WEB_PROXY_PORT_8082:-8082} + - EXTERNAL_KIBANA_PORT=${WEB_PROXY_PORT_8083:-8083} + 
volumes: + - ../private/argus/etc:/private/argus/etc + networks: [argus-sys-net] + restart: unless-stopped + + web-proxy: + image: ${WEB_PROXY_IMAGE_TAG:-argus-web-proxy:${PKG_VERSION}} + container_name: argus-web-proxy + depends_on: [bind, master, grafana, prometheus, kibana, alertmanager] + environment: + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + volumes: + - ../private/argus/etc:/private/argus/etc + networks: [argus-sys-net] + ports: + - "${WEB_PROXY_PORT_8080:-8080}:8080" + - "${WEB_PROXY_PORT_8081:-8081}:8081" + - "${WEB_PROXY_PORT_8082:-8082}:8082" + - "${WEB_PROXY_PORT_8083:-8083}:8083" + - "${WEB_PROXY_PORT_8084:-8084}:8084" + - "${WEB_PROXY_PORT_8085:-8085}:8085" + restart: unless-stopped + diff --git a/deployment_new/templates/server/docs/INSTALL_SERVER_zh.md b/deployment_new/templates/server/docs/INSTALL_SERVER_zh.md new file mode 100644 index 0000000..31fecfe --- /dev/null +++ b/deployment_new/templates/server/docs/INSTALL_SERVER_zh.md @@ -0,0 +1,102 @@ +# Argus Server 安装指南(deployment_new) + +适用:通过 Server 安装包在 Docker Swarm + external overlay 网络一体化部署 Argus 服务端组件。 + +—— 本文强调“怎么做、看什么、符合了才继续”。 + +## 一、准备条件(开始前确认) +- Docker 与 Docker Compose v2 已安装;`docker info` 正常;`docker compose version` 可执行。 +- 具备 root/sudo 权限;磁盘可用空间 ≥ 10GB(包根与 `/var/lib/docker`)。 +- 你知道本机管理地址(SWARM_MANAGER_ADDR),该 IP 属于本机某网卡,可被其他节点访问。 +- 很重要:以统一账户 `argus`(UID=2133,GID=2015)执行后续安装与运维,并将其加入 `docker` 组;示例命令如下(如需不同 UID/GID,请替换为贵方标准): + ```bash + # 1) 创建主组(GID=2015,组名 argus;若已存在可跳过) + sudo groupadd --gid 2015 argus || true + + # 2) 创建用户 argus(UID=2133、主组 GID=2015,创建家目录并用 bash 作为默认 shell;若已存在可用 usermod 调整) + sudo useradd --uid 2133 --gid 2015 --create-home --shell /bin/bash argus || true + sudo passwd argus + + # 3) 将 argus 加入 docker 组,使其能调用 Docker Daemon(新登录后生效) + sudo usermod -aG docker argus + + # 4) 验证(重新登录或执行 newgrp docker 使组生效) + su - argus -c 'id; docker ps >/dev/null && echo OK || echo NO_DOCKER_PERMISSION' + ``` + 
后续的解压与执行(config/install/selfcheck 等)均使用该 `argus` 账户进行。 + +## 二、解包与目录结构 +- 解压:`tar -xzf server_YYYYMMDD.tar.gz`。 +- 进入:`cd server_YYYYMMDD/` +- 你应当能看到: + - `images/`(逐服务镜像 tar.gz,如 `argus-master-YYYYMMDD.tar.gz`) + - `compose/`(`docker-compose.yml` 与 `.env.example`) + - `scripts/`(安装/运维脚本) + - `private/argus/`(数据与配置骨架) + - `docs/`(中文文档) + +## 三、配置 config(生成 .env 与 SWARM_MANAGER_ADDR) +命令: +``` +export SWARM_MANAGER_ADDR=<本机管理IP> +./scripts/config.sh +``` +脚本做了什么: +- 检查依赖与磁盘空间; +- 自动从“端口 20000 起”分配所有服务端口,确保“系统未占用”且“彼此不冲突”; +- 写入 `compose/.env`(包含端口、镜像 tag、FTP 账号、overlay 名称等); +- 将当前执行账户的 UID/GID 写入 `ARGUS_BUILD_UID/GID`(若主组名是 docker,会改用“与用户名同名的组”的 GID,避免拿到 docker 组 999); +- 更新/追加 `cluster-info.env` 中的 `SWARM_MANAGER_ADDR`(不会覆盖其他键)。 + +看到什么才算成功: +- 终端输出:`已生成 compose/.env 并更新 cluster-info.env 的 SWARM_MANAGER_ADDR。` +- `compose/.env` 打开应当看到: + - 端口均 ≥20000 且没有重复; + - `ARGUS_BUILD_UID/GID` 与 `id -u/-g` 一致; + - `SWARM_MANAGER_ADDR=<你的IP>`。 + +遇到问题: +- 端口被异常占用:可删去 `.env` 后再次执行 `config.sh`,或手工编辑端口再执行 `install.sh`。 + +## 四、安装 install(一次到位) +命令: +``` +./scripts/install.sh +``` +脚本做了什么: +- 若 Swarm 未激活:执行 `docker swarm init --advertise-addr $SWARM_MANAGER_ADDR`; +- 确保 external overlay `argus-sys-net` 存在; +- 导入 `images/*.tar.gz` 到本机 Docker; +- `docker compose up -d` 启动服务; +- 等待“六项就绪”: + - Master `/readyz`=200、ES `/_cluster/health`=200、Prometheus TCP 可达、Grafana `/api/health`=200、Alertmanager `/api/v2/status`=200、Kibana `/api/status` level=available; +- 将各服务 overlay IP 写入 `private/argus/etc/<域名>`,Reload Bind9 与 Nginx; +- 写出 `cluster-info.env`(含 `BINDIP/FTPIP/SWARM_JOIN_TOKEN_{WORKER,MANAGER}/SWARM_MANAGER_ADDR`); +- 生成 `安装报告_YYYYMMDD-HHMMSS.md`(端口、健康检查摘要与提示)。 + +看到什么才算成功: +- `docker compose ps` 全部是 Up; +- `安装报告_…md` 中各项 HTTP 检查为 200/available; +- `cluster-info.env` 包含五个关键键: + - `SWARM_MANAGER_ADDR=...` + - `BINDIP=10.x.x.x` `FTPIP=10.x.x.x` + - `SWARM_JOIN_TOKEN_WORKER=SWMTKN-...` + - `SWARM_JOIN_TOKEN_MANAGER=SWMTKN-...` + +## 五、健康自检与常用操作 +- 健康自检:`./scripts/selfcheck.sh` + - 
期望输出:`selfcheck OK -> logs/selfcheck.json` + - 文件 `logs/selfcheck.json` 中 `overlay_net/es/kibana/master_readyz/ftp_share_writable/prometheus/grafana/alertmanager/web_proxy_cors` 为 true。 +- 状态:`./scripts/status.sh`(相当于 `docker compose ps`)。 +- 诊断:`./scripts/diagnose.sh`(收集容器/HTTP/CORS/ES 细节,输出到 `logs/diagnose_*.log`)。 +- 卸载:`./scripts/uninstall.sh`(Compose down)。 +- ES 磁盘水位临时放宽/还原:`./scripts/es-watermark-relax.sh` / `./scripts/es-watermark-restore.sh`。 + +## 六、下一步:分发 cluster-info.env 给 Client +- 将 `cluster-info.env` 拷贝给安装 Client 的同事; +- 对方在 Client 机器的包根放置该文件(或设置 `CLUSTER_INFO=/绝对路径`)即可。 + +## 七、故障排查快览 +- Proxy 502 或 8080 连接复位:多因 Bind 域名未更新到 overlay IP;重跑 `install.sh`(会写入私有域名文件并 reload)或查看 `logs/diagnose_error.log`。 +- Kibana 不 available:等待 1–2 分钟、查看 `argus-kibana-sys` 日志; +- cluster-info.env 的 SWARM_MANAGER_ADDR 为空:重新 `export SWARM_MANAGER_ADDR=<本机管理IP>; ./scripts/config.sh` 或 `./scripts/install.sh`(会回读 `.env` 补写)。 diff --git a/deployment_new/templates/server/docs/SWARM_DEPLOY_zh.md b/deployment_new/templates/server/docs/SWARM_DEPLOY_zh.md new file mode 100644 index 0000000..c2ee8d0 --- /dev/null +++ b/deployment_new/templates/server/docs/SWARM_DEPLOY_zh.md @@ -0,0 +1,7 @@ +# Docker Swarm 部署要点 + +- 初始化 Swarm:`docker swarm init --advertise-addr <manager-ip>` +- 创建 overlay:`docker network create --driver overlay --attachable argus-sys-net` +- Server 包 `install.sh` 自动完成上述操作;如需手动执行,确保 `argus-sys-net` 存在且 attachable。 +- Worker 节点加入:`docker swarm join --token <worker-token> <manager-ip>:2377`。 + diff --git a/deployment_new/templates/server/docs/TROUBLESHOOTING_zh.md b/deployment_new/templates/server/docs/TROUBLESHOOTING_zh.md new file mode 100644 index 0000000..c188ae0 --- /dev/null +++ b/deployment_new/templates/server/docs/TROUBLESHOOTING_zh.md @@ -0,0 +1,11 @@ +# 故障排查(Server) + +- 端口占用:查看 `安装报告_*.md` 中端口表;如需修改,编辑 `compose/.env` 后执行 `docker compose ... 
up -d`。 +- 组件未就绪: + - Master: `curl http://127.0.0.1:${MASTER_PORT}/readyz -I` + - ES: `curl http://127.0.0.1:${ES_HTTP_PORT}/_cluster/health` + - Grafana: `curl http://127.0.0.1:${GRAFANA_PORT}/api/health` + - Prometheus TCP: `exec 3<>/dev/tcp/127.0.0.1/${PROMETHEUS_PORT}` +- 域名解析:进入 `argus-web-proxy` 或 `argus-master-sys` 容器:`getent hosts master.argus.com`。 +- Swarm/Overlay:检查 `docker network ls | grep argus-sys-net`,或 `docker node ls`。 + diff --git a/deployment_new/templates/server/scripts/config.sh b/deployment_new/templates/server/scripts/config.sh new file mode 100644 index 0000000..d8d0339 --- /dev/null +++ b/deployment_new/templates/server/scripts/config.sh @@ -0,0 +1,104 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +PKG_ROOT="$ROOT_DIR" +ENV_EX="$PKG_ROOT/compose/.env.example" +ENV_OUT="$PKG_ROOT/compose/.env" + +info(){ echo -e "\033[34m[CONFIG]\033[0m $*"; } +err(){ echo -e "\033[31m[ERROR]\033[0m $*" >&2; } +require(){ local ok=1; for c in "$@"; do command -v "$c" >/dev/null 2>&1 || { err "缺少依赖: $c"; ok=0; }; done; [[ $ok -eq 1 ]]; } + +require docker curl jq awk sed tar gzip + +# 磁盘空间检查(MB) +check_disk(){ local p="$1"; local need=10240; local free + free=$(df -Pm "$p" | awk 'NR==2{print $4+0}') + if [[ -z "$free" || "$free" -lt "$need" ]]; then err "磁盘空间不足: $p 剩余 ${free:-0}MB (<${need}MB)"; return 1; fi +} + +check_disk "$PKG_ROOT"; check_disk "/var/lib/docker" || true + +# 读取/生成 SWARM_MANAGER_ADDR +SWARM_MANAGER_ADDR=${SWARM_MANAGER_ADDR:-} +if [[ -z "${SWARM_MANAGER_ADDR}" ]]; then + read -rp "请输入本机管理地址 SWARM_MANAGER_ADDR: " SWARM_MANAGER_ADDR +fi +info "SWARM_MANAGER_ADDR=$SWARM_MANAGER_ADDR" + +# 校验 IP 属于本机网卡 +if ! 
ip -o addr | awk '{print $4}' | cut -d'/' -f1 | grep -qx "$SWARM_MANAGER_ADDR"; then + err "SWARM_MANAGER_ADDR 非本机地址: $SWARM_MANAGER_ADDR"; exit 1; fi + +info "开始分配服务端口(起始=20000,避免系统占用与相互冲突)" +is_port_used(){ local p="$1"; ss -tulnH 2>/dev/null | awk '{print $5}' | sed 's/.*://g' | grep -qx "$p"; } +declare -A PRESENT=() CHOSEN=() USED=() +START_PORT="${START_PORT:-20000}"; cur=$START_PORT +ORDER=(MASTER_PORT ES_HTTP_PORT KIBANA_PORT PROMETHEUS_PORT GRAFANA_PORT ALERTMANAGER_PORT \ + WEB_PROXY_PORT_8080 WEB_PROXY_PORT_8081 WEB_PROXY_PORT_8082 WEB_PROXY_PORT_8083 WEB_PROXY_PORT_8084 WEB_PROXY_PORT_8085 \ + FTP_PORT FTP_DATA_PORT) + +# 标记 .env.example 中实际存在的键 +for key in "${ORDER[@]}"; do + if grep -q "^${key}=" "$ENV_EX"; then PRESENT[$key]=1; fi +done + +next_free(){ local p="$1"; while :; do if [[ -n "${USED[$p]:-}" ]] || is_port_used "$p"; then p=$((p+1)); else echo "$p"; return; fi; done; } + +for key in "${ORDER[@]}"; do + [[ -z "${PRESENT[$key]:-}" ]] && continue + p=$(next_free "$cur"); CHOSEN[$key]="$p"; USED[$p]=1; cur=$((p+1)) +done + +info "端口分配结果:MASTER=${CHOSEN[MASTER_PORT]:-} ES=${CHOSEN[ES_HTTP_PORT]:-} KIBANA=${CHOSEN[KIBANA_PORT]:-} PROM=${CHOSEN[PROMETHEUS_PORT]:-} GRAFANA=${CHOSEN[GRAFANA_PORT]:-} ALERT=${CHOSEN[ALERTMANAGER_PORT]:-} WEB_PROXY(8080..8085)=${CHOSEN[WEB_PROXY_PORT_8080]:-}/${CHOSEN[WEB_PROXY_PORT_8081]:-}/${CHOSEN[WEB_PROXY_PORT_8082]:-}/${CHOSEN[WEB_PROXY_PORT_8083]:-}/${CHOSEN[WEB_PROXY_PORT_8084]:-}/${CHOSEN[WEB_PROXY_PORT_8085]:-}" + +cp "$ENV_EX" "$ENV_OUT" +# 覆盖端口(按唯一化结果写回) +for key in "${ORDER[@]}"; do + val="${CHOSEN[$key]:-}" + [[ -z "$val" ]] && continue + sed -i -E "s#^$key=.*#$key=${val}#" "$ENV_OUT" +done +info "已写入 compose/.env 的端口配置" +# 覆盖/补充 Overlay 名称 +grep -q '^ARGUS_OVERLAY_NET=' "$ENV_OUT" || echo 'ARGUS_OVERLAY_NET=argus-sys-net' >> "$ENV_OUT" +# FTP 默认 +grep -q '^FTP_USER=' "$ENV_OUT" || echo 'FTP_USER=ftpuser' >> "$ENV_OUT" +grep -q '^FTP_PASSWORD=' "$ENV_OUT" || echo 'FTP_PASSWORD=NASPlab1234!' 
>> "$ENV_OUT" +# 以当前执行账户 UID/GID 写入(避免误选 docker 组) +RUID=$(id -u) +PRIMARY_GID=$(id -g) +PRIMARY_GRP=$(id -gn) +USER_NAME=$(id -un) +# 若主组名被解析为 docker,尝试用与用户名同名的组的 GID;否则回退主 GID +if [[ "$PRIMARY_GRP" == "docker" ]]; then + RGID=$(getent group "$USER_NAME" | awk -F: '{print $3}' 2>/dev/null || true) + [[ -z "$RGID" ]] && RGID="$PRIMARY_GID" +else + RGID="$PRIMARY_GID" +fi +info "使用构建账户 UID:GID=${RUID}:${RGID} (user=$USER_NAME primary_group=$PRIMARY_GRP)" +if grep -q '^ARGUS_BUILD_UID=' "$ENV_OUT"; then + sed -i -E "s#^ARGUS_BUILD_UID=.*#ARGUS_BUILD_UID=${RUID}#" "$ENV_OUT" +else + echo "ARGUS_BUILD_UID=${RUID}" >> "$ENV_OUT" +fi +if grep -q '^ARGUS_BUILD_GID=' "$ENV_OUT"; then + sed -i -E "s#^ARGUS_BUILD_GID=.*#ARGUS_BUILD_GID=${RGID}#" "$ENV_OUT" +else + echo "ARGUS_BUILD_GID=${RGID}" >> "$ENV_OUT" +fi + +CI="$PKG_ROOT/cluster-info.env" +if [[ -f "$CI" ]]; then + if grep -q '^SWARM_MANAGER_ADDR=' "$CI"; then + sed -i -E "s#^SWARM_MANAGER_ADDR=.*#SWARM_MANAGER_ADDR=${SWARM_MANAGER_ADDR}#" "$CI" + else + echo "SWARM_MANAGER_ADDR=${SWARM_MANAGER_ADDR}" >> "$CI" + fi +else + echo "SWARM_MANAGER_ADDR=${SWARM_MANAGER_ADDR}" > "$CI" +fi +info "已生成 compose/.env 并更新 cluster-info.env 的 SWARM_MANAGER_ADDR。" +info "下一步可执行: scripts/install.sh" diff --git a/deployment_new/templates/server/scripts/diagnose.sh b/deployment_new/templates/server/scripts/diagnose.sh new file mode 100644 index 0000000..e93d69d --- /dev/null +++ b/deployment_new/templates/server/scripts/diagnose.sh @@ -0,0 +1,109 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a + +ts="$(date -u +%Y%m%d-%H%M%SZ)" +LOG_DIR="$ROOT/logs"; mkdir -p "$LOG_DIR" || true +if ! 
( : > "$LOG_DIR/.w" 2>/dev/null ); then LOG_DIR="/tmp/argus-logs"; mkdir -p "$LOG_DIR" || true; fi +DETAILS="$LOG_DIR/diagnose_details_${ts}.log"; ERRORS="$LOG_DIR/diagnose_error_${ts}.log"; : > "$DETAILS"; : > "$ERRORS" + +logd() { echo "$(date '+%F %T') $*" >> "$DETAILS"; } +append_err() { echo "$*" >> "$ERRORS"; } +http_code() { curl -s -o /dev/null -w "%{http_code}" "$1" || echo 000; } +http_body_head() { curl -s --max-time 3 "$1" 2>/dev/null | sed -n '1,5p' || true; } +header_val() { curl -s -D - -o /dev/null "$@" | awk -F': ' 'BEGIN{IGNORECASE=1}$1=="Access-Control-Allow-Origin"{gsub("\r","",$2);print $2}'; } + +section() { local name="$1"; logd "===== [$name] ====="; } +svc() { + local svc_name="$1"; local cname="$2"; shift 2 + section "$svc_name ($cname)" + logd "docker ps:"; docker ps -a --format '{{.Names}}\t{{.Status}}\t{{.Image}}' | awk -v n="$cname" '$1==n' >> "$DETAILS" || true + logd "docker inspect:"; docker inspect -f '{{.State.Status}} rc={{.RestartCount}} started={{.State.StartedAt}}' "$cname" >> "$DETAILS" 2>&1 || true + logd "last 200 container logs:"; docker logs --tail 200 "$cname" >> "$DETAILS" 2>&1 || true + docker logs --tail 200 "$cname" 2>&1 | grep -Ei '\\b(error|failed|fail|exception|panic|fatal|critical|unhealthy|permission denied|forbidden|refused|traceback|错误|失败)\\b' | sed "s/^/[${svc_name}][container] /" >> "$ERRORS" || true + if docker exec "$cname" sh -lc 'command -v supervisorctl >/dev/null 2>&1' >/dev/null 2>&1; then + logd "supervisorctl status:"; docker exec "$cname" sh -lc 'supervisorctl status' >> "$DETAILS" 2>&1 || true + local files; files=$(docker exec "$cname" sh -lc 'ls /var/log/supervisor/*.log 2>/dev/null' || true) + for f in $files; do + logd "tail -n 80 $f:"; docker exec "$cname" sh -lc "tail -n 80 $f 2>/dev/null || true" >> "$DETAILS" 2>&1 || true + docker exec "$cname" sh -lc "tail -n 200 $f 2>/dev/null" 2>/dev/null | grep -Ei '\\b(error|failed|fail|exception|panic|fatal|critical|unhealthy|permission 
denied|forbidden|refused|traceback|错误|失败)\\b' | sed "s/^/[${svc_name}][supervisor:$(basename $f)] /" >> "$ERRORS" || true
+    done
+  fi
+}
+
+svc bind argus-bind-sys
+svc master argus-master-sys
+svc es argus-es-sys
+svc kibana argus-kibana-sys
+svc ftp argus-ftp
+svc prometheus argus-prometheus
+svc grafana argus-grafana
+svc alertmanager argus-alertmanager
+svc web-frontend argus-web-frontend
+svc web-proxy argus-web-proxy
+
+section HTTP
+logd "ES: $(http_code "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health")"; http_body_head "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health" >> "$DETAILS" 2>&1 || true
+logd "Kibana: $(http_code "http://localhost:${KIBANA_PORT:-5601}/api/status")"; http_body_head "http://localhost:${KIBANA_PORT:-5601}/api/status" >> "$DETAILS" 2>&1 || true
+logd "Master readyz: $(http_code "http://localhost:${MASTER_PORT:-32300}/readyz")"
+logd "Prometheus: $(http_code "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready")"
+logd "Grafana: $(http_code "http://localhost:${GRAFANA_PORT:-3000}/api/health")"; http_body_head "http://localhost:${GRAFANA_PORT:-3000}/api/health" >> "$DETAILS" 2>&1 || true
+logd "Alertmanager: $(http_code "http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status")"
+cors8084=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8084:-8084}/api/v2/status" || true)
+cors8085=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8085:-8085}/api/v1/master/nodes" || true)
+logd "Web-Proxy 8080: $(http_code "http://localhost:${WEB_PROXY_PORT_8080:-8080}/")"
+logd "Web-Proxy 8083: $(http_code "http://localhost:${WEB_PROXY_PORT_8083:-8083}/")"
+logd "Web-Proxy 8084 CORS: ${cors8084}"
+logd "Web-Proxy 8085 CORS: ${cors8085}"
+
+section ES-CHECKS
+ch=$(curl -s --max-time 3 "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health" || true)
+status=$(printf '%s' "$ch" | awk -F'\"' '/"status"/{print $4;
exit}')
+if [[ -n "$status" ]]; then logd "cluster.status=$status"; fi
+if [[ "$status" != "green" ]]; then append_err "[es][cluster] status=$status"; fi
+if docker ps --format '{{.Names}}' | grep -q '^argus-es-sys$'; then
+  duse=$(docker exec argus-es-sys sh -lc 'df -P /usr/share/elasticsearch/data | awk "NR==2{print \$5}"' 2>/dev/null || true)
+  logd "es.data.df_use=$duse"; usep=${duse%%%}
+  if [[ -n "$usep" ]] && (( usep >= 90 )); then append_err "[es][disk] data path usage=${usep}%"; fi
+fi
+
+section DNS-IN-PROXY
+for d in master.argus.com es.log.argus.com kibana.log.argus.com grafana.metric.argus.com prom.metric.argus.com alertmanager.alert.argus.com; do
+  docker exec argus-web-proxy sh -lc "getent hosts $d || nslookup $d 2>/dev/null | tail -n+1" >> "$DETAILS" 2>&1 || true
+done
+logd "HTTP (web-proxy): master.readyz=$(docker exec argus-web-proxy sh -lc "curl -s -o /dev/null -w '%{http_code}' http://master.argus.com:3000/readyz" 2>/dev/null || echo 000)"
+logd "HTTP (web-proxy): es.health=$(docker exec argus-web-proxy sh -lc "curl -s -o /dev/null -w '%{http_code}' http://es.log.argus.com:9200/_cluster/health" 2>/dev/null || echo 000)"
+logd "HTTP (web-proxy): kibana.status=$(docker exec argus-web-proxy sh -lc "curl -s -o /dev/null -w '%{http_code}' http://kibana.log.argus.com:5601/api/status" 2>/dev/null || echo 000)"
+
+section FTP-SHARE
+docker exec argus-ftp sh -lc 'ls -ld /private/argus/ftp /private/argus/ftp/share; test -w /private/argus/ftp/share && echo "write:OK" || echo "write:FAIL"' >> "$DETAILS" 2>&1 || true
+
+section SYSTEM
+logd "uname -a:"; uname -a >> "$DETAILS"
+logd "docker version:"; docker version --format '{{.Server.Version}}' >> "$DETAILS" 2>&1 || true
+logd "compose ps:"; (cd "$ROOT/compose" && docker compose ps) >> "$DETAILS" 2>&1 || true
+
+section SUMMARY
+[[ $(http_code "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health") != 200 ]] && echo "[es][http] health not 200" >> "$ERRORS"
+kbcode=$(http_code
"http://localhost:${KIBANA_PORT:-5601}/api/status"); [[ "$kbcode" != 200 ]] && echo "[kibana][http] /api/status=$kbcode" >> "$ERRORS" +[[ $(http_code "http://localhost:${MASTER_PORT:-32300}/readyz") != 200 ]] && echo "[master][http] /readyz not 200" >> "$ERRORS" +[[ $(http_code "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready") != 200 ]] && echo "[prometheus][http] /-/ready not 200" >> "$ERRORS" +gfcode=$(http_code "http://localhost:${GRAFANA_PORT:-3000}/api/health"); [[ "$gfcode" != 200 ]] && echo "[grafana][http] /api/health=$gfcode" >> "$ERRORS" +[[ $(http_code "http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status") != 200 ]] && echo "[alertmanager][http] /api/v2/status not 200" >> "$ERRORS" +[[ -z "$cors8084" ]] && echo "[web-proxy][cors] 8084 missing Access-Control-Allow-Origin" >> "$ERRORS" +[[ -z "$cors8085" ]] && echo "[web-proxy][cors] 8085 missing Access-Control-Allow-Origin" >> "$ERRORS" +sort -u -o "$ERRORS" "$ERRORS" + +echo "Diagnostic details -> $DETAILS" +echo "Detected errors -> $ERRORS" + +if [[ "$LOG_DIR" == "$ROOT/logs" ]]; then + ln -sfn "$(basename "$DETAILS")" "$ROOT/logs/diagnose_details.log" 2>/dev/null || cp "$DETAILS" "$ROOT/logs/diagnose_details.log" 2>/dev/null || true + ln -sfn "$(basename "$ERRORS")" "$ROOT/logs/diagnose_error.log" 2>/dev/null || cp "$ERRORS" "$ROOT/logs/diagnose_error.log" 2>/dev/null || true +fi + +exit 0 diff --git a/deployment_new/templates/server/scripts/es-watermark-relax.sh b/deployment_new/templates/server/scripts/es-watermark-relax.sh new file mode 100644 index 0000000..f1fa222 --- /dev/null +++ b/deployment_new/templates/server/scripts/es-watermark-relax.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +set -euo pipefail +HOST="${1:-http://127.0.0.1:9200}" +echo "设置 ES watermark 为 95%/96%/97%: $HOST" +curl -fsS -XPUT "$HOST/_cluster/settings" -H 'Content-Type: application/json' -d '{ + "transient": { + "cluster.routing.allocation.disk.watermark.low": "95%", + "cluster.routing.allocation.disk.watermark.high": 
"96%", + "cluster.routing.allocation.disk.watermark.flood_stage": "97%" + } +}' && echo "\nOK" diff --git a/deployment_new/templates/server/scripts/es-watermark-restore.sh b/deployment_new/templates/server/scripts/es-watermark-restore.sh new file mode 100644 index 0000000..67cd690 --- /dev/null +++ b/deployment_new/templates/server/scripts/es-watermark-restore.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +set -euo pipefail +HOST="${1:-http://127.0.0.1:9200}" +echo "恢复 ES watermark 为默认值: $HOST" +curl -fsS -XPUT "$HOST/_cluster/settings" -H 'Content-Type: application/json' -d '{ + "transient": { + "cluster.routing.allocation.disk.watermark.low": null, + "cluster.routing.allocation.disk.watermark.high": null, + "cluster.routing.allocation.disk.watermark.flood_stage": null + } +}' && echo "\nOK" diff --git a/deployment_new/templates/server/scripts/install.sh b/deployment_new/templates/server/scripts/install.sh new file mode 100644 index 0000000..81e2258 --- /dev/null +++ b/deployment_new/templates/server/scripts/install.sh @@ -0,0 +1,167 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +PKG_ROOT="$ROOT_DIR" +ENV_FILE="$PKG_ROOT/compose/.env" +COMPOSE_FILE="$PKG_ROOT/compose/docker-compose.yml" + +info(){ echo -e "\033[34m[INSTALL]\033[0m $*"; } +err(){ echo -e "\033[31m[ERROR]\033[0m $*" >&2; } +require(){ local ok=1; for c in "$@"; do command -v "$c" >/dev/null 2>&1 || { err "缺少依赖: $c"; ok=0; }; done; [[ $ok -eq 1 ]]; } +require docker curl jq awk sed tar gzip + +[[ -f "$ENV_FILE" ]] || { err "缺少 compose/.env,请先运行 scripts/config.sh"; exit 1; } +info "使用环境文件: $ENV_FILE" +set -a; source "$ENV_FILE"; set +a +# 兼容:若 .env 未包含 SWARM_MANAGER_ADDR,则从已存在的 cluster-info.env 读取以避免写空 +SMADDR="${SWARM_MANAGER_ADDR:-}" +CI_FILE="$PKG_ROOT/cluster-info.env" +if [[ -z "$SMADDR" && -f "$CI_FILE" ]]; then + SMADDR=$(sed -n 's/^SWARM_MANAGER_ADDR=\(.*\)$/\1/p' "$CI_FILE" | head -n1) +fi +SWARM_MANAGER_ADDR="$SMADDR" + +# Swarm init & overlay +if ! docker info 2>/dev/null | grep -q "Swarm: active"; then + [[ -n "${SWARM_MANAGER_ADDR:-}" ]] || { err "SWARM_MANAGER_ADDR 未设置,请在 scripts/config.sh 中配置"; exit 1; } + info "初始化 Swarm (--advertise-addr $SWARM_MANAGER_ADDR)" + docker swarm init --advertise-addr "$SWARM_MANAGER_ADDR" >/dev/null 2>&1 || true +else + info "Swarm 已激活" +fi +NET_NAME="${ARGUS_OVERLAY_NET:-argus-sys-net}" +if ! 
docker network inspect "$NET_NAME" >/dev/null 2>&1; then + info "创建 overlay 网络: $NET_NAME" + docker network create -d overlay --attachable "$NET_NAME" >/dev/null +else + info "overlay 网络已存在: $NET_NAME" +fi + +# Load images +IMAGES_DIR="$PKG_ROOT/images" +shopt -s nullglob +tars=("$IMAGES_DIR"/*.tar.gz) +if [[ ${#tars[@]} -eq 0 ]]; then err "images 目录为空,缺少镜像 tar.gz"; exit 1; fi +total=${#tars[@]}; idx=0 +for tgz in "${tars[@]}"; do + idx=$((idx+1)) + info "导入镜像 ($idx/$total): $(basename "$tgz")" + tmp=$(mktemp); gunzip -c "$tgz" > "$tmp"; docker load -i "$tmp" >/dev/null; rm -f "$tmp" +done +shopt -u nullglob + +# Compose up +info "启动服务栈 (docker compose up -d)" +docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" up -d +docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" ps + +# Wait readiness (best-effort) +code(){ curl -4 -s -o /dev/null -w "%{http_code}" "$1" || echo 000; } +prom_ok(){ (exec 3<>/dev/tcp/127.0.0.1/${PROMETHEUS_PORT:-9090}) >/dev/null 2>&1 && return 0 || return 1; } +kb_ok(){ local body; body=$(curl -s "http://127.0.0.1:${KIBANA_PORT:-5601}/api/status" || true); echo "$body" | grep -q '"level"\s*:\s*"available"'; } +RETRIES=${RETRIES:-60}; SLEEP=${SLEEP:-5}; ok=0 +info "等待基础服务就绪 (<= $((RETRIES*SLEEP))s)" +for i in $(seq 1 "$RETRIES"); do + e1=$(code "http://127.0.0.1:${MASTER_PORT:-32300}/readyz") + e2=$(code "http://127.0.0.1:${ES_HTTP_PORT:-9200}/_cluster/health") + e3=000; prom_ok && e3=200 + e4=$(code "http://127.0.0.1:${GRAFANA_PORT:-3000}/api/health") + e5=$(code "http://127.0.0.1:${ALERTMANAGER_PORT:-9093}/api/v2/status") + e6=$(kb_ok && echo 200 || echo 000) + info "[ready] t=$((i*SLEEP))s master=$e1 es=$e2 prom=$e3 graf=$e4 alert=$e5 kibana=$e6" + [[ "$e1" == 200 ]] && ok=$((ok+1)) + [[ "$e2" == 200 ]] && ok=$((ok+1)) + [[ "$e3" == 200 ]] && ok=$((ok+1)) + [[ "$e4" == 200 ]] && ok=$((ok+1)) + [[ "$e5" == 200 ]] && ok=$((ok+1)) + [[ "$e6" == 200 ]] && ok=$((ok+1)) + if [[ $ok -ge 6 ]]; then break; fi; ok=0; sleep "$SLEEP" +done 
+[[ $ok -ge 6 ]] || err "部分服务未就绪(可稍后重试 selfcheck)" + +# Resolve overlay IPs +bind_c=argus-bind-sys; ftp_c=argus-ftp +BINDIP=$(docker inspect -f '{{ (index .NetworkSettings.Networks "'$NET_NAME'").IPAddress }}' "$bind_c" 2>/dev/null || true) +FTPIP=$(docker inspect -f '{{ (index .NetworkSettings.Networks "'$NET_NAME'").IPAddress }}' "$ftp_c" 2>/dev/null || true) +info "解析 overlay IP: BINDIP=${BINDIP:-} FTPIP=${FTPIP:-}" + +# Swarm join tokens +TOKEN_WORKER=$(docker swarm join-token -q worker 2>/dev/null || echo "") +TOKEN_MANAGER=$(docker swarm join-token -q manager 2>/dev/null || echo "") + +# cluster-info.env +CI="$PKG_ROOT/cluster-info.env" +info "写入 cluster-info.env (manager/token/IP)" +{ + echo "SWARM_MANAGER_ADDR=${SWARM_MANAGER_ADDR:-}" + echo "BINDIP=${BINDIP:-}" + echo "FTPIP=${FTPIP:-}" + echo "SWARM_JOIN_TOKEN_WORKER=${TOKEN_WORKER:-}" + echo "SWARM_JOIN_TOKEN_MANAGER=${TOKEN_MANAGER:-}" +} > "$CI" +info "已输出 $CI" + +# 安装报告 +ts=$(date +%Y%m%d-%H%M%S) +RPT="$PKG_ROOT/安装报告_${ts}.md" +{ + echo "# Argus Server 安装报告 (${ts})" + echo + echo "## 端口映射" + echo "- MASTER_PORT=${MASTER_PORT}" + echo "- ES_HTTP_PORT=${ES_HTTP_PORT}" + echo "- KIBANA_PORT=${KIBANA_PORT}" + echo "- PROMETHEUS_PORT=${PROMETHEUS_PORT}" + echo "- GRAFANA_PORT=${GRAFANA_PORT}" + echo "- ALERTMANAGER_PORT=${ALERTMANAGER_PORT}" + echo "- WEB_PROXY_PORT_8080=${WEB_PROXY_PORT_8080} ... 
8085=${WEB_PROXY_PORT_8085}" + echo + echo "## Swarm/Overlay" + echo "- SWARM_MANAGER_ADDR=${SWARM_MANAGER_ADDR:-}" + echo "- NET=${NET_NAME}" + echo "- JOIN_TOKEN_WORKER=${TOKEN_WORKER:-}" + echo "- JOIN_TOKEN_MANAGER=${TOKEN_MANAGER:-}" + echo + echo "## Overlay IPs" + echo "- BINDIP=${BINDIP:-}" + echo "- FTPIP=${FTPIP:-}" + echo + echo "## 健康检查(简要)" + echo "- master/readyz=$(code http://127.0.0.1:${MASTER_PORT:-32300}/readyz)" + echo "- es/_cluster/health=$(code http://127.0.0.1:${ES_HTTP_PORT:-9200}/_cluster/health)" + echo "- grafana/api/health=$(code http://127.0.0.1:${GRAFANA_PORT:-3000}/api/health)" + echo "- prometheus/tcp=$([[ $(prom_ok; echo $?) == 0 ]] && echo 200 || echo 000)" + echo "- alertmanager/api/v2/status=$(code http://127.0.0.1:${ALERTMANAGER_PORT:-9093}/api/v2/status)" + echo "- kibana/api/status=$([[ $(kb_ok; echo $?) == 0 ]] && echo available || echo not-ready)" +} > "$RPT" +info "已生成报告: $RPT" + +info "安装完成。可将 cluster-info.env 分发给 Client-GPU 安装方。" + +# 写入域名→overlay IP 并热更新 Bind/Nginx +ETC_DIR="$PKG_ROOT/private/argus/etc"; mkdir -p "$ETC_DIR" +declare -A MAP +MAP[web-frontend]=web.argus.com +MAP[argus-grafana]=grafana.metric.argus.com +MAP[argus-prometheus]=prom.metric.argus.com +MAP[argus-kibana-sys]=kibana.log.argus.com +MAP[argus-alertmanager]=alertmanager.alert.argus.com +MAP[argus-master-sys]=master.argus.com +changed=0 +for cname in "${!MAP[@]}"; do + domain="${MAP[$cname]}"; fpath="$ETC_DIR/$domain" + ip=$(docker inspect -f '{{ (index .NetworkSettings.Networks "'$NET_NAME'").IPAddress }}' "$cname" 2>/dev/null || true) + [[ -z "$ip" ]] && { echo "[DNS-FIX][WARN] $domain: container $cname no overlay IP yet"; continue; } + cur=$(cat "$fpath" 2>/dev/null || echo "") + if [[ "$cur" != "$ip" ]]; then + echo "$ip" > "$fpath"; echo "[DNS-FIX][SET] $domain = $ip (was: ${cur:-})"; changed=1 + else + echo "[DNS-FIX][OK] $domain already $ip" + fi +done +if [[ $changed -eq 1 ]]; then + docker exec argus-bind-sys /usr/local/bin/reload-bind9.sh 
>/dev/null 2>&1 || docker exec argus-bind-sys rndc reload >/dev/null 2>&1 || true + sleep 1 +fi +docker exec argus-web-proxy nginx -t >/dev/null 2>&1 && docker exec argus-web-proxy nginx -s reload >/dev/null 2>&1 || true diff --git a/deployment_new/templates/server/scripts/selfcheck.sh b/deployment_new/templates/server/scripts/selfcheck.sh new file mode 100644 index 0000000..96d9ce5 --- /dev/null +++ b/deployment_new/templates/server/scripts/selfcheck.sh @@ -0,0 +1,89 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +log() { echo -e "\033[0;34m[CHECK]\033[0m $*"; } +err() { echo -e "\033[0;31m[ERROR]\033[0m $*" >&2; } + +ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a + +wait_http() { local url="$1"; local attempts=${2:-120}; local i=1; while ((i<=attempts)); do curl -fsS "$url" >/dev/null 2>&1 && return 0; echo "[..] waiting $url ($i/$attempts)"; sleep 5; ((i++)); done; return 1; } +code_for() { curl -s -o /dev/null -w "%{http_code}" "$1" || echo 000; } +header_val() { curl -s -D - -o /dev/null "$@" | awk -F': ' 'BEGIN{IGNORECASE=1}$1=="Access-Control-Allow-Origin"{gsub("\r","",$2);print $2}'; } + +LOG_DIR="$ROOT/logs"; mkdir -p "$LOG_DIR" || true +OUT_JSON="$LOG_DIR/selfcheck.json"; tmp=$(mktemp) + +ok=1 + +log "checking overlay network" +net_ok=false +if docker network inspect "${ARGUS_OVERLAY_NET:-argus-sys-net}" >/dev/null 2>&1; then + if docker network inspect "${ARGUS_OVERLAY_NET:-argus-sys-net}" | grep -q '"Driver": "overlay"'; then net_ok=true; fi +fi +[[ "$net_ok" == true ]] || ok=0 + +log "checking Elasticsearch" +wait_http "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health" 60 || ok=0 + +log "checking Kibana" +kb_code=$(code_for "http://localhost:${KIBANA_PORT:-5601}/api/status") +kb_ok=false +if [[ "$kb_code" == 200 ]]; then + body=$(curl -sS "http://localhost:${KIBANA_PORT:-5601}/api/status" || true) + echo 
"$body" | grep -q '"level"\s*:\s*"available"' && kb_ok=true +fi +[[ "$kb_ok" == true ]] || ok=0 + +log "checking Master" +[[ $(code_for "http://localhost:${MASTER_PORT:-32300}/readyz") == 200 ]] || ok=0 + +log "checking FTP" +if docker ps --format '{{.Names}}' | grep -q '^argus-ftp$'; then + docker exec argus-ftp sh -lc 'test -w /private/argus/ftp/share' >/dev/null 2>&1 || ok=0 +else ok=0; fi + +log "checking Prometheus" +wait_http "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready" 60 || ok=0 + +log "checking Grafana" +gf_code=$(code_for "http://localhost:${GRAFANA_PORT:-3000}/api/health") +gf_ok=false; if [[ "$gf_code" == 200 ]]; then body=$(curl -sS "http://localhost:${GRAFANA_PORT:-3000}/api/health" || true); echo "$body" | grep -q '"database"\s*:\s*"ok"' && gf_ok=true; fi +[[ "$gf_ok" == true ]] || ok=0 + +log "checking Alertmanager" +wait_http "http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status" 60 || ok=0 + +log "checking Web-Proxy (CORS)" +cors8084=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8084:-8084}/api/v2/status" || true) +cors8085=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8085:-8085}/api/v1/master/nodes" || true) +wp_ok=true +[[ -n "$cors8084" && -n "$cors8085" ]] || wp_ok=false +[[ "$wp_ok" == true ]] || ok=0 + +cat > "$tmp" </dev/null || cp "$tmp" "$OUT_JSON" + +if [[ "$ok" == 1 ]]; then + log "selfcheck OK -> $OUT_JSON" + exit 0 +else + err "selfcheck FAILED -> $OUT_JSON" + exit 1 +fi diff --git a/deployment_new/templates/server/scripts/status.sh b/deployment_new/templates/server/scripts/status.sh new file mode 100644 index 0000000..c555cb8 --- /dev/null +++ b/deployment_new/templates/server/scripts/status.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -euo pipefail +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +PKG_ROOT="$ROOT_DIR" +ENV_FILE="$PKG_ROOT/compose/.env" +COMPOSE_FILE="$PKG_ROOT/compose/docker-compose.yml" +docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" ps diff --git a/deployment_new/templates/server/scripts/uninstall.sh b/deployment_new/templates/server/scripts/uninstall.sh new file mode 100644 index 0000000..c63bb24 --- /dev/null +++ b/deployment_new/templates/server/scripts/uninstall.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +set -euo pipefail +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +PKG_ROOT="$ROOT_DIR" +ENV_FILE="$PKG_ROOT/compose/.env" +COMPOSE_FILE="$PKG_ROOT/compose/docker-compose.yml" +echo "[UNINSTALL] stopping compose" +docker compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" down --remove-orphans || true +echo "[UNINSTALL] done" diff --git a/src/alert/alertmanager/build/Dockerfile b/src/alert/alertmanager/build/Dockerfile index 2045db9..f0c82c8 100644 --- a/src/alert/alertmanager/build/Dockerfile +++ b/src/alert/alertmanager/build/Dockerfile @@ -31,26 +31,31 @@ RUN mkdir -p /usr/share/alertmanager && \ rm -rf /alertmanager && \ ln -s ${ALERTMANAGER_BASE_PATH} /alertmanager -# 创建 alertmanager 用户(可自定义 UID/GID) -# 创建 alertmanager 用户组 +# 确保 ubuntu 账户存在并使用 ARGUS_BUILD_UID/GID RUN set -eux; \ - # 确保目标 GID 存在;若已被占用,直接使用该 GID(组名不限)\ - if ! getent group "${ARGUS_BUILD_GID}" >/dev/null; then \ - groupadd -g "${ARGUS_BUILD_GID}" alertmanager || true; \ - fi; \ - # 确保存在 alertmanager 用户;若 UID 已被占用,跳过并继续使用现有 UID 的用户 - if ! 
id alertmanager >/dev/null 2>&1; then \ - if getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \ - # UID 已占用,则创建同名用户但不指定 UID(避免冲突),仅保证 user 存在 - useradd -M -s /usr/sbin/nologin -g "${ARGUS_BUILD_GID}" alertmanager || true; \ - else \ - useradd -M -s /usr/sbin/nologin -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" alertmanager || true; \ - fi; \ + # 确保存在目标 GID 的组;若不存在则优先尝试将 ubuntu 组改为该 GID,否则创建新组 + if getent group "${ARGUS_BUILD_GID}" >/dev/null; then \ + :; \ else \ - usermod -g "${ARGUS_BUILD_GID}" alertmanager || true; \ - fi - -RUN chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" /usr/share/alertmanager /alertmanager ${ALERTMANAGER_BASE_PATH} /private/argus/etc /usr/local/bin || true + if getent group ubuntu >/dev/null; then \ + groupmod -g "${ARGUS_BUILD_GID}" ubuntu || true; \ + else \ + groupadd -g "${ARGUS_BUILD_GID}" ubuntu || groupadd -g "${ARGUS_BUILD_GID}" argus || true; \ + fi; \ + fi; \ + # 创建或调整 ubuntu 用户 + if id ubuntu >/dev/null 2>&1; then \ + # 设置主组为目标 GID(可用 GID 数字指定) + usermod -g "${ARGUS_BUILD_GID}" ubuntu || true; \ + # 若目标 UID 未被占用,则更新 ubuntu 的 UID + if [ "$(id -u ubuntu)" != "${ARGUS_BUILD_UID}" ] && ! 
getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \ + usermod -u "${ARGUS_BUILD_UID}" ubuntu || true; \ + fi; \ + else \ + useradd -m -s /bin/bash -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" ubuntu || true; \ + fi; \ + # 调整关键目录属主为 ubuntu UID/GID + chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" /usr/share/alertmanager /alertmanager ${ALERTMANAGER_BASE_PATH} /private/argus/etc /usr/local/bin || true # 配置内网 apt 源 (如果指定了内网选项) RUN if [ "$USE_INTRANET" = "true" ]; then \ diff --git a/src/alert/alertmanager/build/supervisord.conf b/src/alert/alertmanager/build/supervisord.conf index d284547..da05ac7 100644 --- a/src/alert/alertmanager/build/supervisord.conf +++ b/src/alert/alertmanager/build/supervisord.conf @@ -6,7 +6,7 @@ user=root [program:alertmanager] command=/usr/local/bin/start-am-supervised.sh -user=alertmanager +user=ubuntu stdout_logfile=/var/log/supervisor/alertmanager.log stderr_logfile=/var/log/supervisor/alertmanager_error.log autorestart=true diff --git a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/outputs.d/10-es.conf b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/outputs.d/10-es.conf index f273270..a828428 100644 --- a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/outputs.d/10-es.conf +++ b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/outputs.d/10-es.conf @@ -1,9 +1,11 @@ # 重要:使用 Logstash_Format + Logstash_Prefix,生成 train-*/infer-* 索引 +# 说明:Fluent Bit 配置仅支持 ${VAR} 占位符,不支持 Bash 的 ${VAR:-default} +# 固定域名要求:使用 es.log.argus.com 与端口 9200 [OUTPUT] Name es Match app.train - Host ${ES_HOST:-localhost} - Port ${ES_PORT:-9200} + Host es.log.argus.com + Port 9200 Logstash_Format On Logstash_Prefix train Replace_Dots On @@ -14,8 +16,8 @@ [OUTPUT] Name es Match app.infer - Host ${ES_HOST:-localhost} - Port ${ES_PORT:-9200} + Host es.log.argus.com + Port 9200 Logstash_Format On Logstash_Prefix infer Replace_Dots On diff --git 
a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/install.sh b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/install.sh index aef6e34..1d5f371 100755 --- a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/install.sh +++ b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/install.sh @@ -206,7 +206,8 @@ export HOSTNAME export CLUSTER="${CLUSTER:-local}" export RACK="${RACK:-dev}" -export ES_HOST="${ES_HOST:-localhost}" +# 默认使用固定域名(满足“固定域名”需求);若外部传入覆盖,则使用外部值 +export ES_HOST="${ES_HOST:-es.log.argus.com}" export ES_PORT="${ES_PORT:-9200}" log_info "Environment variables:"