Compare commits
No commits in common. "252461c6426a01d8608b92a84ffe8b1b1810f827" and "af7cfbe74b186fbae258cecb696b33a33a080936" have entirely different histories.
252461c642
...
af7cfbe74b
35
deployment/build/build_client_package.sh
Executable file → Normal file
35
deployment/build/build_client_package.sh
Executable file → Normal file
@ -43,23 +43,7 @@ latest_dir=$(ls -1dt "$ART_BASE"/*/ 2>/dev/null | head -n1 || true)
|
|||||||
|
|
||||||
tmpdir=$(mktemp -d)
|
tmpdir=$(mktemp -d)
|
||||||
trap 'rm -rf "$tmpdir"' EXIT
|
trap 'rm -rf "$tmpdir"' EXIT
|
||||||
# Filter-only copy: keep install_order files + scripts + deps + version.json
|
rsync -a "$latest_dir" "$tmpdir/src" >/dev/null 2>&1 || cp -r "$latest_dir" "$tmpdir/src"
|
||||||
mkdir -p "$tmpdir/src"
|
|
||||||
cp -f "$latest_dir/version.json" "$tmpdir/src/version.json"
|
|
||||||
if command -v jq >/dev/null 2>&1; then
|
|
||||||
mapfile -t files < <(jq -r '.install_order[]' "$latest_dir/version.json")
|
|
||||||
else
|
|
||||||
files=( $(grep -E '"install_order"' -A10 "$latest_dir/version.json" | sed -n 's/\s*"\(.*\.tar\.gz\)".*/\1/p') )
|
|
||||||
fi
|
|
||||||
for f in "${files[@]}"; do
|
|
||||||
[[ -f "$latest_dir/$f" ]] && cp -f "$latest_dir/$f" "$tmpdir/src/$f"
|
|
||||||
done
|
|
||||||
for aux in install.sh uninstall.sh check_health.sh check_version.sh sync_dns.sh restart_unhealthy.sh config.env; do
|
|
||||||
[[ -f "$latest_dir/$aux" ]] && cp -f "$latest_dir/$aux" "$tmpdir/src/$aux";
|
|
||||||
done
|
|
||||||
if [[ -d "$latest_dir/deps" ]]; then
|
|
||||||
mkdir -p "$tmpdir/src/deps" && rsync -a "$latest_dir/deps/" "$tmpdir/src/deps/" >/dev/null 2>&1 || cp -r "$latest_dir/deps/." "$tmpdir/src/deps/";
|
|
||||||
fi
|
|
||||||
|
|
||||||
out_name="argus-metric_$(echo "$VERSION" | sed 's/\./_/g').tar.gz"
|
out_name="argus-metric_$(echo "$VERSION" | sed 's/\./_/g').tar.gz"
|
||||||
|
|
||||||
@ -68,20 +52,5 @@ out_name="argus-metric_$(echo "$VERSION" | sed 's/\./_/g').tar.gz"
|
|||||||
log "Client package ready: $PKG_DIR/$out_name"
|
log "Client package ready: $PKG_DIR/$out_name"
|
||||||
echo "$VERSION" > "$PKG_DIR/LATEST_VERSION"
|
echo "$VERSION" > "$PKG_DIR/LATEST_VERSION"
|
||||||
|
|
||||||
# include publish helper and setup.sh for convenience
|
|
||||||
PUBLISH_TPL="$BUILD_DIR/templates/client/publish.sh"
|
|
||||||
if [[ -f "$PUBLISH_TPL" ]]; then
|
|
||||||
cp "$PUBLISH_TPL" "$PKG_DIR/publish.sh" && chmod +x "$PKG_DIR/publish.sh"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# also place a copy of setup.sh alongside
|
|
||||||
SETUP_SRC="$ROOT_DIR/src/metric/client-plugins/all-in-one-full/scripts/setup.sh"
|
|
||||||
[[ -f "$SETUP_SRC" ]] && cp "$SETUP_SRC" "$PKG_DIR/setup.sh" || true
|
|
||||||
|
|
||||||
# docs for end users
|
|
||||||
CLIENT_DOC_DIR="$BUILD_DIR/templates/client"
|
|
||||||
if [[ -d "$CLIENT_DOC_DIR" ]]; then
|
|
||||||
rsync -a "$CLIENT_DOC_DIR/" "$PKG_DIR/" >/dev/null 2>&1 || cp -r "$CLIENT_DOC_DIR/." "$PKG_DIR/"
|
|
||||||
fi
|
|
||||||
|
|
||||||
exit 0
|
exit 0
|
||||||
|
|
||||||
|
|||||||
30
deployment/build/build_server_package.sh
Executable file → Normal file
30
deployment/build/build_server_package.sh
Executable file → Normal file
@ -48,17 +48,6 @@ SRC_COMPOSE="$ROOT_DIR/src/sys/tests/docker-compose.yml"
|
|||||||
[[ -f "$SRC_COMPOSE" ]] || { err "missing $SRC_COMPOSE"; exit 1; }
|
[[ -f "$SRC_COMPOSE" ]] || { err "missing $SRC_COMPOSE"; exit 1; }
|
||||||
awk -f "$BUILD_DIR/templates/docker-compose.filter.awk" -v remove="node-a,node-b,test-node,test-gpu-node" "$SRC_COMPOSE" > "$STAGE/compose/docker-compose.yml"
|
awk -f "$BUILD_DIR/templates/docker-compose.filter.awk" -v remove="node-a,node-b,test-node,test-gpu-node" "$SRC_COMPOSE" > "$STAGE/compose/docker-compose.yml"
|
||||||
cp "$BUILD_DIR/templates/.env.example" "$STAGE/compose/.env.example"
|
cp "$BUILD_DIR/templates/.env.example" "$STAGE/compose/.env.example"
|
||||||
# fix relative private path to match package layout (compose/ and private/ are siblings)
|
|
||||||
sed -i 's#\./private/#../private/#g' "$STAGE/compose/docker-compose.yml"
|
|
||||||
# also handle bind mount form without trailing slash
|
|
||||||
sed -i -E 's#- \./private:/private#- ../private:/private#g' "$STAGE/compose/docker-compose.yml"
|
|
||||||
# drop timezone file bind which may not exist on target distros (e.g. NixOS)
|
|
||||||
sed -i '/\/etc\/timezone:\/etc\/timezone:ro/d' "$STAGE/compose/docker-compose.yml"
|
|
||||||
|
|
||||||
# sanity-check: ensure test services are absent
|
|
||||||
if grep -E '^(\s{2})(node-a|node-b|test-node|test-gpu-node):\s*$' -n "$STAGE/compose/docker-compose.yml" >/dev/null; then
|
|
||||||
err "compose filter failed: test services still present"; exit 1;
|
|
||||||
fi
|
|
||||||
|
|
||||||
# 3) Images (reuse if already exported unless --resave-image)
|
# 3) Images (reuse if already exported unless --resave-image)
|
||||||
existing_images_tar="$PKG_DIR/images/all-images.tar.gz"
|
existing_images_tar="$PKG_DIR/images/all-images.tar.gz"
|
||||||
@ -87,7 +76,24 @@ fi
|
|||||||
|
|
||||||
# 4) Scripts & Docs
|
# 4) Scripts & Docs
|
||||||
copy_tree "$BUILD_DIR/templates/scripts" "$STAGE/scripts"
|
copy_tree "$BUILD_DIR/templates/scripts" "$STAGE/scripts"
|
||||||
copy_tree "$BUILD_DIR/templates/docs" "$STAGE/docs"
|
cat > "$STAGE/docs/INSTALL_SERVER.md" << 'MD'
|
||||||
|
# Argus Server Offline Installation
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
- Ubuntu 22.04 x86_64
|
||||||
|
- Docker & Docker Compose installed
|
||||||
|
- Open ports: 32300,9200,5601,9090,9093,8080..8085,21,20,21100-21110 (or auto-fallback to high ports)
|
||||||
|
|
||||||
|
## Steps
|
||||||
|
1. Extract this package to /opt/argus-deploy/versions/<YYYYMMDD>
|
||||||
|
2. cd scripts && sudo ./server-install.sh
|
||||||
|
3. Check status: ./server-status.sh
|
||||||
|
4. Uninstall: ./server-uninstall.sh
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
- Selfcheck result is written to logs/selfcheck.json
|
||||||
|
- DNS will be managed by internal bind; FTP dns.conf is auto-published to share/dns.conf
|
||||||
|
MD
|
||||||
|
|
||||||
# 5) Manifests
|
# 5) Manifests
|
||||||
gen_manifest "$STAGE" "$STAGE/manifest.txt"
|
gen_manifest "$STAGE" "$STAGE/manifest.txt"
|
||||||
|
|||||||
0
deployment/build/common.sh
Executable file → Normal file
0
deployment/build/common.sh
Executable file → Normal file
57
deployment/build/publish_client.sh
Normal file
57
deployment/build/publish_client.sh
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
|
||||||
|
. "$ROOT_DIR/build/common.sh"
|
||||||
|
|
||||||
|
usage() { cat <<'EOF'
|
||||||
|
Publish client package to FTP server
|
||||||
|
|
||||||
|
Usage: publish_client.sh --version YYYYMMDD --server HOST --user USER --password PASS [--port 21]
|
||||||
|
|
||||||
|
It uploads: setup.sh, argus-metric_<YYYYMMDD>.tar.gz, LATEST_VERSION to the FTP share root.
|
||||||
|
EOF
|
||||||
|
}
|
||||||
|
|
||||||
|
VERSION=""; HOST=""; USERNAME=""; PASSWORD=""; PORT=21
|
||||||
|
while [[ $# -gt 0 ]]; do
|
||||||
|
case "$1" in
|
||||||
|
--version) VERSION="$2"; shift 2;;
|
||||||
|
--server) HOST="$2"; shift 2;;
|
||||||
|
--user) USERNAME="$2"; shift 2;;
|
||||||
|
--password) PASSWORD="$2"; shift 2;;
|
||||||
|
--port) PORT="$2"; shift 2;;
|
||||||
|
-h|--help) usage; exit 0;;
|
||||||
|
*) err "unknown arg: $1"; usage; exit 1;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
|
||||||
|
[[ -n "$VERSION" && -n "$HOST" && -n "$USERNAME" && -n "$PASSWORD" ]] || { usage; exit 1; }
|
||||||
|
|
||||||
|
CLIENT_DIR="$ROOT_DIR/artifact/client/$VERSION"
|
||||||
|
TAR_NAME="argus-metric_${VERSION}.tar.gz"
|
||||||
|
PKG="$CLIENT_DIR/$TAR_NAME"
|
||||||
|
SETUP_SRC="$ROOT_DIR/../src/sys/tests/private/argus/metric/ftp/share/setup.sh"
|
||||||
|
ALT_SETUP="$ROOT_DIR/../src/metric/client-plugins/all-in-one-full/scripts/setup.sh"
|
||||||
|
|
||||||
|
[[ -f "$PKG" ]] || { err "missing client package: $PKG"; exit 1; }
|
||||||
|
if [[ ! -f "$SETUP_SRC" ]]; then
|
||||||
|
if [[ -f "$ALT_SETUP" ]]; then
|
||||||
|
SETUP_SRC="$ALT_SETUP"
|
||||||
|
else
|
||||||
|
err "missing setup.sh (checked $SETUP_SRC and $ALT_SETUP)"; exit 1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
log "Uploading setup.sh"
|
||||||
|
curl -u "$USERNAME:$PASSWORD" -sfT "$SETUP_SRC" "ftp://$HOST:$PORT/setup.sh"
|
||||||
|
|
||||||
|
log "Uploading client tar: $TAR_NAME"
|
||||||
|
curl -u "$USERNAME:$PASSWORD" -sfT "$PKG" "ftp://$HOST:$PORT/$TAR_NAME"
|
||||||
|
|
||||||
|
log "Updating LATEST_VERSION -> $VERSION"
|
||||||
|
printf "%s" "$VERSION" | curl -u "$USERNAME:$PASSWORD" -sfT - "ftp://$HOST:$PORT/LATEST_VERSION"
|
||||||
|
|
||||||
|
log "Publish done"
|
||||||
|
|
||||||
|
exit 0
|
||||||
@ -1,37 +0,0 @@
|
|||||||
# Argus Metric 客户端安装指南(容器内普通用户场景)
|
|
||||||
|
|
||||||
## 准备与连通性检查
|
|
||||||
- FTP 连接(需要账号密码,默认 `ftpuser / ZGClab1234!`)
|
|
||||||
- `curl -u ftpuser:ZGClab1234! -I ftp://<FTP_IP>:21/LATEST_VERSION`
|
|
||||||
- `curl -u ftpuser:ZGClab1234! -s ftp://<FTP_IP>:21/ | head`
|
|
||||||
- 下载安装脚本
|
|
||||||
- `curl -u ftpuser:ZGClab1234! -fsSL ftp://<FTP_IP>:21/setup.sh -o /tmp/setup.sh`
|
|
||||||
- `chmod +x /tmp/setup.sh`
|
|
||||||
|
|
||||||
## 元数据与主机名
|
|
||||||
- Agent 需要元数据(env/user/instance)与 Master 地址:
|
|
||||||
- 方式A:hostname 形如 `env-user-instance-xxx`(推荐)
|
|
||||||
- 方式B:导出环境变量:
|
|
||||||
- `export AGENT_ENV=dev`
|
|
||||||
- `export AGENT_USER=<your_user>`
|
|
||||||
- `export AGENT_INSTANCE=<node_id>`
|
|
||||||
- Master 地址:
|
|
||||||
- `export MASTER_ENDPOINT=http://master.argus.com:3000`
|
|
||||||
|
|
||||||
> 安装脚本会在开头检查上述条件,缺失时会终止并给出修复提示。
|
|
||||||
|
|
||||||
## 执行安装
|
|
||||||
- 以 root 运行(容器内如为非 root 用户请切换为 root):
|
|
||||||
- `sudo /tmp/setup.sh --server <FTP_IP> --user ftpuser --password 'ZGClab1234!' --port 21`
|
|
||||||
- 如需自定义安装根目录:`--install-dir /opt/argus-metric`
|
|
||||||
|
|
||||||
## 安装后自检(setup 自动执行)
|
|
||||||
- setup 会等待最多 5 分钟,确认以下条件后才报告完成:
|
|
||||||
- `/private/argus/agent/<hostname>/node.json` 已生成;
|
|
||||||
- `last_report` 在持续更新;
|
|
||||||
- `health.metric-argus-agent|metric-node-exporter|metric-fluent-bit|metric-dcgm-exporter` 均为 `healthy` 且 `error` 为空。
|
|
||||||
|
|
||||||
## 手工验证(可选)
|
|
||||||
- `cat /private/argus/agent/$(hostname)/node.json | jq '.'`
|
|
||||||
- `curl -s -o /dev/null -w "%{http_code}\n" http://localhost:9100/metrics` 应为 200
|
|
||||||
- 查看日志:`/var/log/argus-agent.log`、`/opt/argus-metric/versions/*/.install.log`
|
|
||||||
@ -1,54 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
usage() { cat <<'EOF'
|
|
||||||
Publish Argus client package to FTP
|
|
||||||
|
|
||||||
Usage:
|
|
||||||
./publish.sh --server HOST --user USER --password PASS [--port 21]
|
|
||||||
|
|
||||||
Notes:
|
|
||||||
- This script expects to run inside the built client artifact directory.
|
|
||||||
- It reads LATEST_VERSION and uploads setup.sh, argus-metric_<ver>.tar.gz, and LATEST_VERSION.
|
|
||||||
EOF
|
|
||||||
}
|
|
||||||
|
|
||||||
HOST=""; USERNAME=""; PASSWORD=""; PORT=21
|
|
||||||
while [[ $# -gt 0 ]]; do
|
|
||||||
case "$1" in
|
|
||||||
--server) HOST="$2"; shift 2;;
|
|
||||||
--user) USERNAME="$2"; shift 2;;
|
|
||||||
--password) PASSWORD="$2"; shift 2;;
|
|
||||||
--port) PORT="$2"; shift 2;;
|
|
||||||
-h|--help) usage; exit 0;;
|
|
||||||
*) echo "unknown arg: $1" >&2; usage; exit 1;;
|
|
||||||
esac
|
|
||||||
done
|
|
||||||
|
|
||||||
[[ -n "$HOST" && -n "$USERNAME" && -n "$PASSWORD" ]] || { usage; exit 1; }
|
|
||||||
|
|
||||||
here="$(pwd)"
|
|
||||||
if [[ ! -f "$here/LATEST_VERSION" ]]; then
|
|
||||||
echo "LATEST_VERSION not found in $(pwd)" >&2; exit 1;
|
|
||||||
fi
|
|
||||||
VER=$(cat "$here/LATEST_VERSION" | tr -d '\n')
|
|
||||||
PKG="argus-metric_${VER}.tar.gz"
|
|
||||||
|
|
||||||
if [[ ! -f "$here/$PKG" ]]; then
|
|
||||||
echo "client tar not found: $PKG" >&2; exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# locate setup.sh (must be present in the current directory; no fallback path is tried)
|
|
||||||
SETUP="${here}/setup.sh"
|
|
||||||
if [[ ! -f "$SETUP" ]]; then
|
|
||||||
echo "setup.sh not found in $(pwd)" >&2; exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "[PUBLISH] server=$HOST port=$PORT version=$VER"
|
|
||||||
|
|
||||||
curl -u "$USERNAME:$PASSWORD" -sfT "$SETUP" "ftp://$HOST:$PORT/setup.sh"
|
|
||||||
curl -u "$USERNAME:$PASSWORD" -sfT "$PKG" "ftp://$HOST:$PORT/$PKG"
|
|
||||||
printf "%s" "$VER" | curl -u "$USERNAME:$PASSWORD" -sfT - "ftp://$HOST:$PORT/LATEST_VERSION"
|
|
||||||
|
|
||||||
echo "[OK] publish completed"
|
|
||||||
|
|
||||||
@ -4,38 +4,38 @@
|
|||||||
|
|
||||||
BEGIN{
|
BEGIN{
|
||||||
split(remove, rm, ",");
|
split(remove, rm, ",");
|
||||||
for(i in rm){
|
for(i in rm) skipname[rm[i]] = 1;
|
||||||
gsub(/^\s+|\s+$/,"",rm[i]);
|
|
||||||
if (rm[i] != "") skipname[rm[i]] = 1;
|
|
||||||
}
|
|
||||||
in_services=0; skipping=0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function service_header(line, m) {
|
function starts_service_line(line, name) {
|
||||||
# match exactly two leading spaces followed by name:
|
if (match(line, /^\s{2}([a-zA-Z0-9_-]+):\s*$/, m)) {
|
||||||
if (match(line, /^ ([A-Za-z0-9_-]+):[ ]*$/, m)) return m[1];
|
name = m[1];
|
||||||
|
return name;
|
||||||
|
}
|
||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
# Track top-level sections (no indentation)
|
name = starts_service_line($0);
|
||||||
if ($0 ~ /^[A-Za-z0-9_-]+:[ ]*$/ && $0 !~ /^\s/) {
|
if (name != "") {
|
||||||
in_services = ($0 ~ /^services:[ ]*$/) ? 1 : 0;
|
# detect top-level keys (networks:, services:, etc.)
|
||||||
}
|
if ($0 ~ /^services:\s*$/) { in_services=1; print; next; }
|
||||||
|
if ($0 ~ /^[a-zA-Z0-9_-]+:\s*$/ && $0 !~ /^\s/) {
|
||||||
|
in_services= ($0 ~ /^services:\s*$/);
|
||||||
|
}
|
||||||
|
|
||||||
if (skipping) {
|
if (in_services && (name in skipname)) {
|
||||||
# Stop skipping at next service header or another top-level section
|
skipping=1; next;
|
||||||
if (service_header($0) != "" || ($0 ~ /^(networks|volumes):[ ]*$/ && $0 !~ /^\s/)) {
|
|
||||||
skipping=0;
|
|
||||||
} else {
|
|
||||||
next;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (in_services) {
|
# end skipping when the next service header or a top-level networks:/volumes: section appears
|
||||||
name = service_header($0);
|
if (skipping) {
|
||||||
if (name != "" && (name in skipname)) { skipping=1; next; }
|
if (starts_service_line($0) != "") { skipping=0; }
|
||||||
|
else if ($0 ~ /^(networks|volumes):\s*$/) { skipping=0; }
|
||||||
|
else { next; }
|
||||||
}
|
}
|
||||||
|
|
||||||
print;
|
print;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1,48 +0,0 @@
|
|||||||
# Argus Server Offline Installation
|
|
||||||
|
|
||||||
## Prerequisites
|
|
||||||
- Linux x86_64 (Ubuntu 22.04 recommended; see OS compatibility for NixOS)
|
|
||||||
- Docker & Docker Compose installed
|
|
||||||
- Open ports: 32300, 9200, 5601, 9090, 9093, 8080..8085, 21, 20, 21100–21110 (or auto-fallback to high ports)
|
|
||||||
|
|
||||||
## Quick Start
|
|
||||||
1. Extract to a target dir, e.g. `/opt/argus-deploy/versions/<YYYYMMDD>`
|
|
||||||
2. `cd scripts && sudo ./server-prepare-dirs.sh`
|
|
||||||
3. `./server-install.sh`
|
|
||||||
4. `./server-status.sh`
|
|
||||||
5. `./server-selfcheck.sh` (on failure it auto-runs diagnose and writes logs under `logs/`)
|
|
||||||
6. `./server-uninstall.sh` to tear down
|
|
||||||
|
|
||||||
## What the Installer Does
|
|
||||||
- Loads local images (`images/all-images.tar.gz`)
|
|
||||||
- Generates OS-compat override (`security_opt: ["label=disable"]`, `userns_mode: host`, bind `tmpfs:/run/named`)
|
|
||||||
- Starts server-only services: bind/master/es/kibana/ftp/prometheus/grafana/alertmanager/web-frontend/web-proxy
|
|
||||||
- DNS Bootstrap:
|
|
||||||
- Ensure `/private/argus/etc/dns.conf` exists (write `172.31.0.2` if missing);
|
|
||||||
- Run `/private/argus/etc/update-dns.sh` in dependent containers so `/etc/resolv.conf` points to bind;
|
|
||||||
- Wait for `*.argus.com` hint files, then reload bind;
|
|
||||||
- Restart web‑proxy to re-render nginx resolver from `dns.conf`;
|
|
||||||
- Writes `logs/selfcheck.json` as final summary
|
|
||||||
|
|
||||||
## OS Compatibility
|
|
||||||
- NixOS / non-xattr FS: containers run with `security_opt: ["label=disable"]` and `userns_mode: host`; ensure data dirs are pre-created via `sudo ./server-prepare-dirs.sh` and owned by runtime UID:GID (default 1000:1000).
|
|
||||||
|
|
||||||
## Files & Layout
|
|
||||||
- `compose/` (docker-compose.yml, .env)
|
|
||||||
- `private/` (data mounts)
|
|
||||||
- `scripts/` (install/uninstall/status/selfcheck/diagnose)
|
|
||||||
- `logs/` (selfcheck + diagnose outputs)
|
|
||||||
|
|
||||||
## Troubleshooting (Quick)
|
|
||||||
- Run `./server-selfcheck.sh` → see `logs/selfcheck.json`
|
|
||||||
- Run `./server-diagnose.sh` → produces timestamped logs:
|
|
||||||
- `logs/diagnose_details_YYYYMMDD-HHMMSSZ.log`
|
|
||||||
- `logs/diagnose_error_YYYYMMDD-HHMMSSZ.log`
|
|
||||||
And updates `diagnose_details.log`/`diagnose_error.log` to the latest
|
|
||||||
- Error lines are tagged `[service][source]`, e.g. `[kibana][http] /api/status=503`
|
|
||||||
|
|
||||||
Common issues:
|
|
||||||
- Kibana 503: wait cold start or fix DNS so `es.log.argus.com` resolves
|
|
||||||
- web‑proxy 504: check nginx `resolver` includes `172.31.0.2 127.0.0.11`
|
|
||||||
- EACCES/locks: ensure `sudo ./server-prepare-dirs.sh` ran and ownership matches UID:GID
|
|
||||||
|
|
||||||
@ -1,28 +0,0 @@
|
|||||||
# Argus 服务端离线安装指南
|
|
||||||
|
|
||||||
## 先决条件
|
|
||||||
- Linux x86_64(推荐 Ubuntu 22.04;NixOS 见“兼容说明”)
|
|
||||||
- 已安装 Docker 与 Docker Compose
|
|
||||||
- 端口:32300, 9200, 5601, 9090, 9093, 8080..8085, 21, 20, 21100–21110
|
|
||||||
|
|
||||||
## 快速开始
|
|
||||||
1. 解压到目标目录(例如 `/opt/argus-deploy/versions/<YYYYMMDD>`)
|
|
||||||
2. 进入 `scripts/`:`sudo ./server-prepare-dirs.sh`
|
|
||||||
3. 安装:`./server-install.sh`
|
|
||||||
4. 状态:`./server-status.sh`
|
|
||||||
5. 自检:`./server-selfcheck.sh`(失败会自动采集诊断)
|
|
||||||
6. 卸载:`./server-uninstall.sh`
|
|
||||||
|
|
||||||
## 安装流程要点
|
|
||||||
- 仅启动 10 个服务端组件(不包含测试节点);
|
|
||||||
- DNS Bootstrap:补齐首次部署 DNS 依赖(生成/确认 `dns.conf`、统一容器 resolv.conf、写入 `*.argus.com`、reload bind、重启 web‑proxy);
|
|
||||||
- 输出自检结果到 `logs/selfcheck.json`。
|
|
||||||
|
|
||||||
## 兼容说明(NixOS 等)
|
|
||||||
- 使用 `security_opt: ["label=disable"]` 与 `userns_mode: host`;
|
|
||||||
- 先运行 `sudo ./server-prepare-dirs.sh` 创建/授权目录为 `1000:1000`;
|
|
||||||
|
|
||||||
## 故障排查(见下文 Troubleshooting_zh)
|
|
||||||
- `./server-selfcheck.sh` → `logs/selfcheck.json`
|
|
||||||
- `./server-diagnose.sh` → `logs/diagnose_error_*.log` / `logs/diagnose_details_*.log`
|
|
||||||
|
|
||||||
@ -1,21 +0,0 @@
|
|||||||
# Troubleshooting
|
|
||||||
|
|
||||||
- Status: `scripts/server-status.sh`
|
|
||||||
- Selfcheck: `scripts/server-selfcheck.sh`
|
|
||||||
- Diagnose: `scripts/server-diagnose.sh`
|
|
||||||
|
|
||||||
Outputs:
|
|
||||||
- `logs/selfcheck.json`
|
|
||||||
- `logs/diagnose_details_*.log` (full details)
|
|
||||||
- `logs/diagnose_error_*.log` (tagged errors)
|
|
||||||
|
|
||||||
Web‑Proxy:
|
|
||||||
- 8083 expects 200/302/403; 8084/8085 must include CORS header
|
|
||||||
- nginx resolver should be `172.31.0.2 127.0.0.11`
|
|
||||||
|
|
||||||
Kibana/ES:
|
|
||||||
- Verify `es.log.argus.com` resolves inside Kibana
|
|
||||||
|
|
||||||
Permissions:
|
|
||||||
- Ensure `sudo ./server-prepare-dirs.sh` ran and ownership matches runtime UID:GID
|
|
||||||
|
|
||||||
@ -1,15 +0,0 @@
|
|||||||
# 故障排查
|
|
||||||
|
|
||||||
- 状态:`scripts/server-status.sh`
|
|
||||||
- 自检:`scripts/server-selfcheck.sh`
|
|
||||||
- 诊断:`scripts/server-diagnose.sh`
|
|
||||||
|
|
||||||
输出:
|
|
||||||
- `logs/selfcheck.json`
|
|
||||||
- `logs/diagnose_error_*.log`(错误摘要)
|
|
||||||
- `logs/diagnose_details_*.log`(详细信息)
|
|
||||||
|
|
||||||
Web‑Proxy:8083=200/302/403;8084/8085 需包含 CORS
|
|
||||||
Kibana:确认可解析 `es.log.argus.com`
|
|
||||||
权限:先运行 `sudo ./server-prepare-dirs.sh`
|
|
||||||
|
|
||||||
@ -1,117 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
||||||
ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
|
||||||
|
|
||||||
ENV_FILE="$ROOT/compose/.env"; [[ -f "$ENV_FILE" ]] && set -a && source "$ENV_FILE" && set +a
|
|
||||||
|
|
||||||
mkdir -p "$ROOT/logs"
|
|
||||||
ts="$(date -u +%Y%m%d-%H%M%SZ)"
|
|
||||||
DETAILS="$ROOT/logs/diagnose_details_${ts}.log"
|
|
||||||
ERRORS="$ROOT/logs/diagnose_error_${ts}.log"
|
|
||||||
: > "$DETAILS"; : > "$ERRORS"
|
|
||||||
|
|
||||||
# Append one timestamped line ("YYYY-MM-DD HH:MM:SS <message>") to the details log.
# Arguments: $* - message words, joined with single spaces
# Globals:   DETAILS (read) - path of the details log file
logd() {
  printf '%s %s\n' "$(date '+%F %T')" "$*" >> "$DETAILS"
}
|
|
||||||
# Append one line containing all arguments (joined by spaces) to the error log.
# printf is used instead of echo so that a message consisting solely of an
# echo option such as "-n" or "-e" is written literally instead of being
# swallowed as a flag.
# Arguments: $* - message words
# Globals:   ERRORS (read) - path of the error log file
append_err() {
  printf '%s\n' "$*" >> "$ERRORS"
}
|
|
||||||
|
|
||||||
# Print the HTTP status code returned for URL $1, or 000 when the request fails.
# curl's -w output already contains "000" when no response was received, so the
# fallback is applied to the captured value rather than echoed afterwards; the
# former `|| echo 000` appended a second "000" and callers saw "000000".
# Arguments: $1 - URL to probe
# Outputs:   exactly one status code on stdout (no trailing newline)
# Returns:   0 even when curl fails, so callers under `set -e` are not aborted
http_code() {
  local code
  # Any curl failure (connection refused, timeout, curl missing) is reported as 000.
  code=$(curl -s -o /dev/null -w "%{http_code}" "$1") || code=000
  printf '%s' "${code:-000}"
}
|
|
||||||
# Print at most the first five lines of the response body for URL $1.
# The request is capped at 3 seconds and curl's stderr is discarded.
# Arguments: $1 - URL to fetch
# Returns:   0 always; a failed fetch simply yields no output (best effort)
http_body_head() {
  if ! curl -s --max-time 3 "$1" 2>/dev/null | sed -n '1,5p'; then
    return 0
  fi
}
|
|
||||||
# Print the value of the Access-Control-Allow-Origin response header.
# All arguments are forwarded to curl (extra -H options plus the URL); response
# headers are dumped to stdout (-D -) and the body is discarded.
# The header name is lower-cased before comparison: the former IGNORECASE=1
# switch is a GNU awk extension, so other awk implementations compared
# case-sensitively and missed e.g. "access-control-allow-origin".
# Arguments: $@ - curl arguments
# Outputs:   the header value without its trailing CR; nothing when absent
header_val() {
  curl -s -D - -o /dev/null "$@" \
    | awk -F': ' 'tolower($1) == "access-control-allow-origin" {
        gsub("\r", "", $2)
        print $2
      }'
}
|
|
||||||
|
|
||||||
section() {
|
|
||||||
local name="$1"; logd "===== [$name] ====="; }
|
|
||||||
|
|
||||||
svc() {
|
|
||||||
local svc_name="$1"; local cname="$2"; shift 2
|
|
||||||
section "$svc_name ($cname)"
|
|
||||||
logd "docker ps:"; docker ps -a --format '{{.Names}}\t{{.Status}}\t{{.Image}}' | awk -v n="$cname" '$1==n' >> "$DETAILS" || true
|
|
||||||
logd "docker inspect (state/restartcount):"; docker inspect -f '{{.State.Status}} rc={{.RestartCount}} started={{.State.StartedAt}}' "$cname" >> "$DETAILS" 2>&1 || true
|
|
||||||
logd "last 200 container logs:"; docker logs --tail 200 "$cname" >> "$DETAILS" 2>&1 || true
|
|
||||||
|
|
||||||
# extract error lines from container logs
|
|
||||||
docker logs --tail 200 "$cname" 2>&1 | \
|
|
||||||
grep -Ei '\b(error|failed|fail|exception|panic|fatal|critical|unhealthy|permission denied|forbidden|refused|traceback|错误|失败)\b' | \
|
|
||||||
sed "s/^/[${svc_name}][container] /" >> "$ERRORS" || true
|
|
||||||
|
|
||||||
# supervisor status and logs
|
|
||||||
if docker exec "$cname" sh -lc 'command -v supervisorctl >/dev/null 2>&1' >/dev/null 2>&1; then
|
|
||||||
logd "supervisorctl status:"; docker exec "$cname" sh -lc 'supervisorctl status' >> "$DETAILS" 2>&1 || true
|
|
||||||
# iterate supervisor logs and collect tails + errors per file
|
|
||||||
local files
|
|
||||||
files=$(docker exec "$cname" sh -lc 'ls /var/log/supervisor/*.log 2>/dev/null' || true)
|
|
||||||
for f in $files; do
|
|
||||||
logd "tail -n 80 $f:"; docker exec "$cname" sh -lc "tail -n 80 $f 2>/dev/null || true" >> "$DETAILS" 2>&1 || true
|
|
||||||
docker exec "$cname" sh -lc "tail -n 200 $f 2>/dev/null" 2>/dev/null | \
|
|
||||||
grep -Ei '\b(error|failed|fail|exception|panic|fatal|critical|unhealthy|permission denied|forbidden|refused|traceback|错误|失败)\b' | \
|
|
||||||
sed "s/^/[${svc_name}][supervisor:$(basename $f)] /" >> "$ERRORS" || true
|
|
||||||
done
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
# Core services
|
|
||||||
svc bind argus-bind-sys
|
|
||||||
svc master argus-master-sys
|
|
||||||
svc es argus-es-sys
|
|
||||||
svc kibana argus-kibana-sys
|
|
||||||
svc ftp argus-ftp
|
|
||||||
svc prometheus argus-prometheus
|
|
||||||
svc grafana argus-grafana
|
|
||||||
svc alertmanager argus-alertmanager
|
|
||||||
svc web-frontend argus-web-frontend
|
|
||||||
svc web-proxy argus-web-proxy
|
|
||||||
|
|
||||||
# HTTP checks (host side)
|
|
||||||
section HTTP
|
|
||||||
logd "ES: $(http_code "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health")"
|
|
||||||
http_body_head "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health" >> "$DETAILS" 2>&1 || true
|
|
||||||
|
|
||||||
logd "Kibana: $(http_code "http://localhost:${KIBANA_PORT:-5601}/api/status")"
|
|
||||||
http_body_head "http://localhost:${KIBANA_PORT:-5601}/api/status" >> "$DETAILS" 2>&1 || true
|
|
||||||
|
|
||||||
logd "Master readyz: $(http_code "http://localhost:${MASTER_PORT:-32300}/readyz")"
|
|
||||||
|
|
||||||
logd "Prometheus: $(http_code "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready")"
|
|
||||||
logd "Grafana: $(http_code "http://localhost:${GRAFANA_PORT:-3000}/api/health")"
|
|
||||||
http_body_head "http://localhost:${GRAFANA_PORT:-3000}/api/health" >> "$DETAILS" 2>&1 || true
|
|
||||||
logd "Alertmanager: $(http_code "http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status")"
|
|
||||||
|
|
||||||
cors8084=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8084:-8084}/api/v2/status" || true)
|
|
||||||
cors8085=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8085:-8085}/api/v1/master/nodes" || true)
|
|
||||||
logd "Web-Proxy 8080: $(http_code "http://localhost:${WEB_PROXY_PORT_8080:-8080}/")"
|
|
||||||
logd "Web-Proxy 8083: $(http_code "http://localhost:${WEB_PROXY_PORT_8083:-8083}/")"
|
|
||||||
logd "Web-Proxy 8084 CORS: ${cors8084}"
|
|
||||||
logd "Web-Proxy 8085 CORS: ${cors8085}"
|
|
||||||
|
|
||||||
# FTP share writability (container perspective)
|
|
||||||
section FTP-SHARE
|
|
||||||
docker exec argus-ftp sh -lc 'ls -ld /private/argus/ftp /private/argus/ftp/share; test -w /private/argus/ftp/share && echo "write:OK" || echo "write:FAIL"' >> "$DETAILS" 2>&1 || true
|
|
||||||
|
|
||||||
# Collect system info for context
|
|
||||||
section SYSTEM
|
|
||||||
logd "uname -a:"; uname -a >> "$DETAILS"
|
|
||||||
logd "docker version:"; docker version --format '{{.Server.Version}}' >> "$DETAILS" 2>&1 || true
|
|
||||||
logd "compose ps:"; (cd "$ROOT/compose" && if docker compose version >/dev/null 2>&1; then docker compose -p argus-sys ps; else docker-compose -p argus-sys ps; fi) >> "$DETAILS" 2>&1 || true
|
|
||||||
|
|
||||||
section SUMMARY
|
|
||||||
# Add HTTP failures and CORS problems to error log with tags
|
|
||||||
[[ $(http_code "http://localhost:${ES_HTTP_PORT:-9200}/_cluster/health") != 200 ]] && echo "[es][http] health not 200" >> "$ERRORS"
|
|
||||||
kbcode=$(http_code "http://localhost:${KIBANA_PORT:-5601}/api/status"); [[ "$kbcode" != 200 ]] && echo "[kibana][http] /api/status=$kbcode" >> "$ERRORS"
|
|
||||||
[[ $(http_code "http://localhost:${MASTER_PORT:-32300}/readyz") != 200 ]] && echo "[master][http] /readyz not 200" >> "$ERRORS"
|
|
||||||
[[ $(http_code "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready") != 200 ]] && echo "[prometheus][http] /-/ready not 200" >> "$ERRORS"
|
|
||||||
gfcode=$(http_code "http://localhost:${GRAFANA_PORT:-3000}/api/health"); [[ "$gfcode" != 200 ]] && echo "[grafana][http] /api/health=$gfcode" >> "$ERRORS"
|
|
||||||
[[ $(http_code "http://localhost:${ALERTMANAGER_PORT:-9093}/api/v2/status") != 200 ]] && echo "[alertmanager][http] /api/v2/status not 200" >> "$ERRORS"
|
|
||||||
[[ -z "$cors8084" ]] && echo "[web-proxy][cors] 8084 missing Access-Control-Allow-Origin" >> "$ERRORS"
|
|
||||||
[[ -z "$cors8085" ]] && echo "[web-proxy][cors] 8085 missing Access-Control-Allow-Origin" >> "$ERRORS"
|
|
||||||
|
|
||||||
# Deduplicate errors
|
|
||||||
sort -u -o "$ERRORS" "$ERRORS"
|
|
||||||
|
|
||||||
echo "Diagnostic details -> $DETAILS"
|
|
||||||
echo "Detected errors -> $ERRORS"
|
|
||||||
|
|
||||||
# maintain latest symlinks for convenience
|
|
||||||
ln -sfn "$(basename "$DETAILS")" "$ROOT/logs/diagnose_details.log" 2>/dev/null || cp "$DETAILS" "$ROOT/logs/diagnose_details.log" 2>/dev/null || true
|
|
||||||
ln -sfn "$(basename "$ERRORS")" "$ROOT/logs/diagnose_error.log" 2>/dev/null || cp "$ERRORS" "$ROOT/logs/diagnose_error.log" 2>/dev/null || true
|
|
||||||
|
|
||||||
exit 0
|
|
||||||
126
deployment/build/templates/scripts/server-install.sh
Executable file → Normal file
126
deployment/build/templates/scripts/server-install.sh
Executable file → Normal file
@ -43,29 +43,6 @@ prepare_env() {
|
|||||||
done
|
done
|
||||||
}
|
}
|
||||||
|
|
||||||
prepare_data_dirs() {
|
|
||||||
if [[ $EUID -ne 0 ]]; then
|
|
||||||
echo -e "\033[1;33m[WARN]\033[0m running as non-root: will not chown data dirs."
|
|
||||||
echo -e "\033[1;33m[WARN]\033[0m If you hit Permission denied, run: sudo $SCRIPT_DIR/server-prepare-dirs.sh"
|
|
||||||
# still ensure basic directories exist (no chown)
|
|
||||||
mkdir -p \
|
|
||||||
"$PKG_ROOT/private/argus/etc" \
|
|
||||||
"$PKG_ROOT/private/argus/metric/prometheus" \
|
|
||||||
"$PKG_ROOT/private/argus/metric/prometheus/data" \
|
|
||||||
"$PKG_ROOT/private/argus/metric/prometheus/rules" \
|
|
||||||
"$PKG_ROOT/private/argus/metric/grafana" \
|
|
||||||
"$PKG_ROOT/private/argus/metric/grafana/data" \
|
|
||||||
"$PKG_ROOT/private/argus/metric/grafana/logs" \
|
|
||||||
"$PKG_ROOT/private/argus/metric/grafana/plugins" \
|
|
||||||
"$PKG_ROOT/private/argus/metric/grafana/provisioning/datasources" \
|
|
||||||
"$PKG_ROOT/private/argus/metric/grafana/provisioning/dashboards" \
|
|
||||||
"$PKG_ROOT/private/argus/metric/grafana/data/sessions" \
|
|
||||||
"$PKG_ROOT/private/argus/metric/grafana/data/dashboards" \
|
|
||||||
"$PKG_ROOT/private/argus/alert/alertmanager" \
|
|
||||||
"$PKG_ROOT/private/argus/metric/ftp/share"
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
load_images() {
|
load_images() {
|
||||||
local tar="$PKG_ROOT/images/all-images.tar.gz"
|
local tar="$PKG_ROOT/images/all-images.tar.gz"
|
||||||
[[ -f "$tar" ]] || { err "missing images tar: $tar"; exit 1; }
|
[[ -f "$tar" ]] || { err "missing images tar: $tar"; exit 1; }
|
||||||
@ -75,105 +52,7 @@ load_images() {
|
|||||||
|
|
||||||
bring_up() {
|
bring_up() {
|
||||||
log "starting services via compose"
|
log "starting services via compose"
|
||||||
local ov="$PKG_ROOT/compose/docker-compose.os-compat.override.yml"
|
(cd "$PKG_ROOT/compose" && "${COMPOSE[@]}" -p "$PROJECT_NAME" up -d)
|
||||||
if [[ ! -f "$ov" ]]; then
|
|
||||||
cat > "$ov" <<'YAML'
|
|
||||||
services:
|
|
||||||
bind:
|
|
||||||
security_opt: ["label=disable"]
|
|
||||||
userns_mode: "host"
|
|
||||||
tmpfs:
|
|
||||||
- /run/named
|
|
||||||
master:
|
|
||||||
security_opt: ["label=disable"]
|
|
||||||
userns_mode: "host"
|
|
||||||
es:
|
|
||||||
security_opt: ["label=disable"]
|
|
||||||
userns_mode: "host"
|
|
||||||
kibana:
|
|
||||||
security_opt: ["label=disable"]
|
|
||||||
userns_mode: "host"
|
|
||||||
ftp:
|
|
||||||
security_opt: ["label=disable"]
|
|
||||||
userns_mode: "host"
|
|
||||||
prometheus:
|
|
||||||
security_opt: ["label=disable"]
|
|
||||||
userns_mode: "host"
|
|
||||||
grafana:
|
|
||||||
security_opt: ["label=disable"]
|
|
||||||
userns_mode: "host"
|
|
||||||
alertmanager:
|
|
||||||
security_opt: ["label=disable"]
|
|
||||||
userns_mode: "host"
|
|
||||||
# ensure runtime path matches container expectation
|
|
||||||
volumes:
|
|
||||||
- ../private/argus/etc:/private/argus/etc
|
|
||||||
- ../private/argus/alert/alertmanager:/alertmanager
|
|
||||||
web-frontend:
|
|
||||||
security_opt: ["label=disable"]
|
|
||||||
userns_mode: "host"
|
|
||||||
web-proxy:
|
|
||||||
security_opt: ["label=disable"]
|
|
||||||
userns_mode: "host"
|
|
||||||
YAML
|
|
||||||
log "generated OS-compat override: $(basename "$ov")"
|
|
||||||
fi
|
|
||||||
# 仅启动服务端组件,避免误起测试节点(node-a/node-b/test-node/test-gpu-node)
|
|
||||||
local services=(bind master es kibana ftp prometheus grafana alertmanager web-frontend web-proxy)
|
|
||||||
log "services: ${services[*]}"
|
|
||||||
(cd "$PKG_ROOT/compose" && "${COMPOSE[@]}" -p "$PROJECT_NAME" -f docker-compose.yml -f $(basename "$ov") up -d "${services[@]}")
|
|
||||||
}
|
|
||||||
|
|
||||||
dns_bootstrap() {
|
|
||||||
log "DNS bootstrap: initializing shared dns.conf and container resolv.conf"
|
|
||||||
local etc_dir="$PKG_ROOT/private/argus/etc"
|
|
||||||
mkdir -p "$etc_dir"
|
|
||||||
# 1) ensure dns.conf exists (fallback to bind IP 172.31.0.2)
|
|
||||||
if [[ ! -s "$etc_dir/dns.conf" ]]; then
|
|
||||||
if echo "172.31.0.2" > "$etc_dir/dns.conf" 2>/dev/null; then
|
|
||||||
log "wrote fallback dns.conf with 172.31.0.2"
|
|
||||||
else
|
|
||||||
# host-side write denied (ownership 1000:1000); write via bind container instead
|
|
||||||
if docker ps --format '{{.Names}}' | grep -q '^argus-bind-sys$'; then
|
|
||||||
docker exec argus-bind-sys sh -lc 'echo 172.31.0.2 > /private/argus/etc/dns.conf && chmod 644 /private/argus/etc/dns.conf' || true
|
|
||||||
log "fallback dns.conf written via bind container"
|
|
||||||
else
|
|
||||||
log "bind not ready; skip writing fallback dns.conf"
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
# 2) wait briefly for bind to copy update-dns.sh into shared etc (bind startup.sh does this)
|
|
||||||
local i=0
|
|
||||||
while [[ ! -x "$etc_dir/update-dns.sh" && $i -lt 20 ]]; do
|
|
||||||
sleep 0.5; ((i++));
|
|
||||||
done
|
|
||||||
if [[ ! -x "$etc_dir/update-dns.sh" ]]; then
|
|
||||||
log "update-dns.sh not present yet; continuing with existing resolv.conf"
|
|
||||||
fi
|
|
||||||
# 3) run update-dns.sh inside key containers so /etc/resolv.conf points to bind
|
|
||||||
local c
|
|
||||||
for c in argus-master-sys argus-es-sys argus-kibana-sys argus-grafana argus-prometheus argus-ftp argus-web-frontend argus-web-proxy argus-alertmanager; do
|
|
||||||
if docker ps --format '{{.Names}}' | grep -q "^${c}$"; then
|
|
||||||
docker exec "$c" sh -lc 'test -x /private/argus/etc/update-dns.sh && /private/argus/etc/update-dns.sh || true' >/dev/null 2>&1 || true
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
# 4) wait for service A-record hint files generated by services (best-effort)
|
|
||||||
local need=( es.log.argus.com kibana.log.argus.com master.argus.com grafana.metric.argus.com prom.metric.argus.com alertmanager.alert.argus.com )
|
|
||||||
local waited=0; local missing=1
|
|
||||||
while (( waited < 15 )); do
|
|
||||||
missing=0
|
|
||||||
for f in "${need[@]}"; do [[ -s "$etc_dir/$f" ]] || { missing=1; break; }; done
|
|
||||||
[[ $missing -eq 0 ]] && break
|
|
||||||
sleep 1; ((waited++))
|
|
||||||
done
|
|
||||||
# 5) reload bind zone (script uses supervisor to restart bind9)
|
|
||||||
if docker ps --format '{{.Names}}' | grep -q '^argus-bind-sys$'; then
|
|
||||||
docker exec argus-bind-sys sh -lc '/usr/local/bin/reload-bind9.sh' >/dev/null 2>&1 || true
|
|
||||||
fi
|
|
||||||
# 6) restart web-proxy once to re-render nginx resolver with latest dns.conf
|
|
||||||
if docker ps --format '{{.Names}}' | grep -q '^argus-web-proxy$'; then
|
|
||||||
docker restart argus-web-proxy >/dev/null 2>&1 || true
|
|
||||||
fi
|
|
||||||
}
|
}
|
||||||
|
|
||||||
selfcheck() {
|
selfcheck() {
|
||||||
@ -183,12 +62,11 @@ selfcheck() {
|
|||||||
|
|
||||||
main() {
|
main() {
|
||||||
prepare_env
|
prepare_env
|
||||||
prepare_data_dirs
|
|
||||||
load_images
|
load_images
|
||||||
bring_up
|
bring_up
|
||||||
dns_bootstrap
|
|
||||||
selfcheck
|
selfcheck
|
||||||
log "install completed. See logs in $PKG_ROOT/logs/"
|
log "install completed. See logs in $PKG_ROOT/logs/"
|
||||||
}
|
}
|
||||||
|
|
||||||
main "$@"
|
main "$@"
|
||||||
|
|
||||||
|
|||||||
@ -1,73 +0,0 @@
|
|||||||
#!/usr/bin/env bash
# Create the host-side data directories under <package root>/private/argus and
# hand them over to the runtime UID/GID.
#
# Must be run as root. The owner defaults to 1000:1000 and can be overridden
# through ARGUS_BUILD_UID / ARGUS_BUILD_GID, either from the environment or
# from <package root>/compose/.env.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PKG_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

if [[ $EUID -ne 0 ]]; then
  echo "[PREPARE] This script requires root (sudo)." >&2
  echo " Try: sudo $0" >&2
  exit 1
fi

ENV_FILE="$PKG_ROOT/compose/.env"
if [[ -f "$ENV_FILE" ]]; then
  # Export everything the env file defines, but always switch allexport back
  # off afterwards, even when sourcing reports an error.
  set -a
  # shellcheck disable=SC1090
  source "$ENV_FILE" \
    || echo "[PREPARE] Warning: failed to load $ENV_FILE; continuing with defaults" >&2
  set +a
fi
UIDV="${ARGUS_BUILD_UID:-1000}"
GIDV="${ARGUS_BUILD_GID:-1000}"

echo "[PREPARE] Using owner ${UIDV}:${GIDV}"

DATA_ROOT="$PKG_ROOT/private/argus"

# Every directory that has to exist before the services start.
data_dirs=(
  # Core etc and service data dirs (aligned with src/sys/tests/scripts/01_bootstrap.sh)
  "$DATA_ROOT/etc"
  "$DATA_ROOT/bind"
  "$DATA_ROOT/master"
  "$DATA_ROOT/agent"
  "$DATA_ROOT/log/elasticsearch"
  "$DATA_ROOT/log/kibana"
  # Prometheus
  "$DATA_ROOT/metric/prometheus"
  "$DATA_ROOT/metric/prometheus/data"
  "$DATA_ROOT/metric/prometheus/rules"
  "$DATA_ROOT/metric/prometheus/targets"
  # Grafana
  "$DATA_ROOT/metric/grafana"
  "$DATA_ROOT/metric/grafana/data"
  "$DATA_ROOT/metric/grafana/logs"
  "$DATA_ROOT/metric/grafana/plugins"
  "$DATA_ROOT/metric/grafana/provisioning/datasources"
  "$DATA_ROOT/metric/grafana/provisioning/dashboards"
  "$DATA_ROOT/metric/grafana/data/sessions"
  "$DATA_ROOT/metric/grafana/data/dashboards"
  "$DATA_ROOT/metric/grafana/config"
  # FTP
  "$DATA_ROOT/metric/ftp/share"
  # Alertmanager
  "$DATA_ROOT/alert/alertmanager"
)

# Trees whose ownership is changed recursively.
owned_trees=(
  "$DATA_ROOT/etc"
  "$DATA_ROOT/bind"
  "$DATA_ROOT/master"
  "$DATA_ROOT/agent"
  "$DATA_ROOT/log/elasticsearch"
  "$DATA_ROOT/log/kibana"
  "$DATA_ROOT/metric/prometheus"
  "$DATA_ROOT/metric/grafana"
  "$DATA_ROOT/metric/ftp"
  "$DATA_ROOT/alert"
)

mkdir -p "${data_dirs[@]}"

chown -R "$UIDV":"$GIDV" "${owned_trees[@]}"

# Best effort: add group write permission to the alert and etc trees.
chmod -R g+w "$DATA_ROOT/alert" "$DATA_ROOT/etc" || true

# Ensure parent directories are also owned by the runtime user for consistency
# (non-recursive, best effort).
chown "$UIDV":"$GIDV" \
  "$DATA_ROOT" \
  "$DATA_ROOT/log" \
  "$DATA_ROOT/metric" || true

echo "[PREPARE] Done. You can now run server-install.sh"
|
|
||||||
25
deployment/build/templates/scripts/server-selfcheck.sh
Executable file → Normal file
25
deployment/build/templates/scripts/server-selfcheck.sh
Executable file → Normal file
@ -32,11 +32,7 @@ log "checking Master"
|
|||||||
wait_http "http://localhost:${MASTER_PORT:-32300}/readyz" 60 || ok=0
|
wait_http "http://localhost:${MASTER_PORT:-32300}/readyz" 60 || ok=0
|
||||||
|
|
||||||
log "checking FTP"
|
log "checking FTP"
|
||||||
if docker ps --format '{{.Names}}' | grep -q '^argus-ftp$'; then
|
ftp_root="$ROOT/private/argus/metric/ftp/share"; [[ -d "$ftp_root" && -w "$ftp_root" ]] && ftp_ok=true || { ftp_ok=false; ok=0; }
|
||||||
if docker exec argus-ftp sh -lc 'test -w /private/argus/ftp/share'; then ftp_ok=true; else ftp_ok=false; ok=0; fi
|
|
||||||
else
|
|
||||||
ftp_ok=false; ok=0;
|
|
||||||
fi
|
|
||||||
|
|
||||||
log "checking Prometheus"
|
log "checking Prometheus"
|
||||||
wait_http "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready" 60 || ok=0
|
wait_http "http://localhost:${PROMETHEUS_PORT:-9090}/-/ready" 60 || ok=0
|
||||||
@ -55,9 +51,8 @@ p8083=$(code_for "http://localhost:${WEB_PROXY_PORT_8083:-8083}/")
|
|||||||
cors8084=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8084:-8084}/api/v2/status" || true)
|
cors8084=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8084:-8084}/api/v2/status" || true)
|
||||||
cors8085=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8085:-8085}/api/v1/master/nodes" || true)
|
cors8085=$(header_val -H "Origin: http://localhost:${WEB_PROXY_PORT_8080:-8080}" "http://localhost:${WEB_PROXY_PORT_8085:-8085}/api/v1/master/nodes" || true)
|
||||||
wp_ok=true
|
wp_ok=true
|
||||||
# 有些环境首页可能 403,此处接受 200/403
|
[[ "$p8080" == 200 ]] || wp_ok=false
|
||||||
([[ "$p8080" == 200 || "$p8080" == 403 ]]) || wp_ok=false
|
([[ "$p8083" == 200 || "$p8083" == 302 ]]) || wp_ok=false
|
||||||
([[ "$p8083" == 200 || "$p8083" == 302 || "$p8083" == 403 ]]) || wp_ok=false
|
|
||||||
[[ -n "$cors8084" && -n "$cors8085" ]] || wp_ok=false
|
[[ -n "$cors8084" && -n "$cors8085" ]] || wp_ok=false
|
||||||
[[ "$wp_ok" == true ]] || ok=0
|
[[ "$wp_ok" == true ]] || ok=0
|
||||||
|
|
||||||
@ -76,15 +71,5 @@ cat > "$tmp" <<JSON
|
|||||||
JSON
|
JSON
|
||||||
|
|
||||||
mv "$tmp" "$OUT_JSON"
|
mv "$tmp" "$OUT_JSON"
|
||||||
if [[ "$ok" == 1 ]]; then
|
[[ "$ok" == 1 ]] && { log "selfcheck OK"; exit 0; } || { err "selfcheck FAILED (see $OUT_JSON)"; exit 1; }
|
||||||
log "selfcheck OK"
|
|
||||||
exit 0
|
|
||||||
else
|
|
||||||
err "selfcheck FAILED (see $OUT_JSON)"
|
|
||||||
# If diagnose script exists, run it to collect more details
|
|
||||||
if [[ -x "$SCRIPT_DIR/server-diagnose.sh" ]]; then
|
|
||||||
# run diagnose; it will print the actual timestamped file paths and update 'latest' symlinks
|
|
||||||
"$SCRIPT_DIR/server-diagnose.sh" || true
|
|
||||||
fi
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|||||||
0
deployment/build/templates/scripts/server-status.sh
Executable file → Normal file
0
deployment/build/templates/scripts/server-status.sh
Executable file → Normal file
0
deployment/build/templates/scripts/server-uninstall.sh
Executable file → Normal file
0
deployment/build/templates/scripts/server-uninstall.sh
Executable file → Normal file
Binary file not shown.
@ -1 +1 @@
|
|||||||
1.40.0
|
1.38.0
|
||||||
|
|||||||
@ -48,31 +48,6 @@ BACKUPS_DIR="$INSTALL_DIR/backups" # 备份目录
|
|||||||
CURRENT_LINK="$INSTALL_DIR/current" # 当前版本软链接
|
CURRENT_LINK="$INSTALL_DIR/current" # 当前版本软链接
|
||||||
LATEST_VERSION_FILE="$INSTALL_DIR/LATEST_VERSION" # 当前版本记录文件
|
LATEST_VERSION_FILE="$INSTALL_DIR/LATEST_VERSION" # 当前版本记录文件
|
||||||
|
|
||||||
# Pre-check: Agent metadata and hostname constraints.
#
# Passes when either
#   * AGENT_ENV, AGENT_USER and AGENT_INSTANCE are all set and non-empty, or
#   * the hostname has the form env-user-instance-<anything>.
# Otherwise it logs what was detected together with the two ways to fix it and
# exits the script with status 1.
#
# Uses the log_error / log_info helpers.
require_agent_metadata() {
    local hn=""
    # `hostname` may be missing on minimal images; fall back to uname / $HOSTNAME
    # instead of aborting the precheck with "command not found".
    hn="$(hostname 2>/dev/null)" \
        || hn="$(uname -n 2>/dev/null)" \
        || hn="${HOSTNAME:-}"

    local ok=false
    # The environment-variable triple.
    if [[ -n "${AGENT_ENV:-}" && -n "${AGENT_USER:-}" && -n "${AGENT_INSTANCE:-}" ]]; then
        ok=true
    fi
    # Hostname shaped like env-user-instance-xxx.
    if [[ "$hn" =~ ^[^-]+-[^-]+-[^-]+-.*$ ]]; then
        ok=true
    fi

    if [[ "$ok" == false ]]; then
        log_error "检测到 hostname 与 Agent 元数据不完整:"
        log_error " 当前 hostname: $hn"
        log_error " AGENT_ENV='${AGENT_ENV:-}' AGENT_USER='${AGENT_USER:-}' AGENT_INSTANCE='${AGENT_INSTANCE:-}'"
        echo
        log_info "请满足以下其一后重试:"
        log_info " 方式A:设置 hostname 为 env-user-instance-任意,例如 dev-alice-node001-pod-0"
        log_info " 方式B:导出环境变量:export AGENT_ENV=dev AGENT_USER=alice AGENT_INSTANCE=node001"
        exit 1
    fi
}
|
|
||||||
|
|
||||||
# 检查必需的FTP参数
|
# 检查必需的FTP参数
|
||||||
check_ftp_params() {
|
check_ftp_params() {
|
||||||
local missing_params=()
|
local missing_params=()
|
||||||
@ -898,47 +873,6 @@ rollback_version() {
|
|||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
# Post-install self-check: wait for node.json to appear and report healthy, and
# verify that last_report keeps being refreshed.
#
# Reads /private/argus/agent/<AGENT_HOSTNAME or hostname>/node.json.
#   * With jq: all four components (metric-argus-agent, metric-node-exporter,
#     metric-fluent-bit, metric-dcgm-exporter) must be "healthy" and
#     last_report must change between two observations.
#   * Without jq: relaxed check, any "status": "healthy" entry is accepted.
#
# Returns 0 on success, 1 (after logging via log_error) once the 300 second
# deadline has passed.
selfcheck_post_install() {
    local hn=""
    # Declared separately so a failing lookup is not masked by `local`;
    # fall back to uname / $HOSTNAME when the hostname binary is unavailable.
    hn="$(hostname 2>/dev/null)" \
        || hn="$(uname -n 2>/dev/null)" \
        || hn="${HOSTNAME:-}"

    local node_file="/private/argus/agent/${AGENT_HOSTNAME:-$hn}/node.json"
    local deadline=$(( $(date +%s) + 300 ))
    local t1="" t2=""
    local ok_health lr

    while :; do
        if [[ -f "$node_file" ]]; then
            if command -v jq >/dev/null 2>&1; then
                # jq -e exits 0 only when the expression evaluates to true, so
                # rely on the exit status instead of parsing its output.
                if jq -e '(.health["metric-argus-agent"].status=="healthy") and (.health["metric-node-exporter"].status=="healthy") and (.health["metric-fluent-bit"].status=="healthy") and (.health["metric-dcgm-exporter"].status=="healthy")' "$node_file" >/dev/null 2>&1; then
                    ok_health=true
                else
                    ok_health=false
                fi
                # A half-written node.json must not abort the whole check.
                lr=$(jq -r '.last_report // ""' "$node_file" 2>/dev/null) || lr=""

                if [[ "$ok_health" == true && -n "$lr" ]]; then
                    if [[ -z "$t1" ]]; then
                        t1="$lr"
                        # The agent reports every 60s by default; wait 70s
                        # before checking again.
                        sleep 70
                        continue
                    fi
                    t2="$lr"
                    if [[ "$t2" != "$t1" ]]; then
                        return 0
                    fi
                    # Unchanged so far: keep waiting until the deadline.
                    sleep 10
                fi
            else
                # Relaxed check when jq is not available. POSIX character
                # class instead of the GNU-only \s.
                if grep -q '"status"[[:space:]]*:[[:space:]]*"healthy"' "$node_file"; then
                    return 0
                fi
            fi
        fi

        if (( $(date +%s) >= deadline )); then
            log_error "自检超时:未在 5 分钟内确认 last_report 持续更新 或 健康状态不满足(路径:${node_file})"
            return 1
        fi
        sleep 5
    done
}
|
|
||||||
|
|
||||||
# 主函数
|
# 主函数
|
||||||
main() {
|
main() {
|
||||||
echo "=========================================="
|
echo "=========================================="
|
||||||
@ -978,9 +912,8 @@ main() {
|
|||||||
# return 0
|
# return 0
|
||||||
# fi
|
# fi
|
||||||
|
|
||||||
check_ftp_params
|
check_ftp_params
|
||||||
check_system
|
check_system
|
||||||
require_agent_metadata
|
|
||||||
|
|
||||||
if [[ "$ACTION" == "uninstall" ]]; then
|
if [[ "$ACTION" == "uninstall" ]]; then
|
||||||
uninstall_argus_metric
|
uninstall_argus_metric
|
||||||
@ -988,16 +921,8 @@ require_agent_metadata
|
|||||||
install_argus_metric
|
install_argus_metric
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# 安装后自检:最多等待 5 分钟,确认 node.json 存在且健康
|
|
||||||
echo
|
echo
|
||||||
log_info "开始安装后自检(最多等待 5 分钟)..."
|
log_info "操作完成!"
|
||||||
selfcheck_post_install || {
|
|
||||||
log_error "安装后自检未通过,请查看 /var/log/argus-agent.log 以及 /opt/argus-metric/versions/*/.install.log"
|
|
||||||
exit 1
|
|
||||||
}
|
|
||||||
|
|
||||||
echo
|
|
||||||
log_success "全部自检通过,安装完成!"
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# 脚本入口
|
# 脚本入口
|
||||||
|
|||||||
@ -24,18 +24,13 @@ else
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# ========== 读取 DNS ==========
|
# ========== 读取 DNS ==========
|
||||||
RESOLVERS=""
|
if [ -f "$DNS_CONF_PRIVATE" ]; then
|
||||||
# 优先等待 /private/argus/etc/dns.conf 生成并读取其中的 IP
|
echo "从 $DNS_CONF_PRIVATE 读取 DNS 服务器..."
|
||||||
for i in $(seq 1 10); do
|
RESOLVERS=$(awk '/^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$/ {print $1}' "$DNS_CONF_PRIVATE" | tr '\n' ' ')
|
||||||
if [ -f "$DNS_CONF_PRIVATE" ]; then
|
fi
|
||||||
RESOLVERS=$(awk '/^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$/{print $1}' "$DNS_CONF_PRIVATE" | tr '\n' ' ')
|
|
||||||
fi
|
|
||||||
[ -n "$RESOLVERS" ] && break
|
|
||||||
sleep 1
|
|
||||||
done
|
|
||||||
|
|
||||||
# 若仍为空则回退到系统 resolv.conf
|
# 如果 /private 文件不存在则 fallback
|
||||||
if [ -z "$RESOLVERS" ]; then
|
if [ -z "${RESOLVERS:-}" ]; then
|
||||||
echo "未在 $DNS_CONF_PRIVATE 中找到有效 DNS,使用系统 /etc/resolv.conf"
|
echo "未在 $DNS_CONF_PRIVATE 中找到有效 DNS,使用系统 /etc/resolv.conf"
|
||||||
RESOLVERS=$(awk '/^nameserver/ {print $2}' "$DNS_CONF_SYSTEM" | tr '\n' ' ')
|
RESOLVERS=$(awk '/^nameserver/ {print $2}' "$DNS_CONF_SYSTEM" | tr '\n' ' ')
|
||||||
fi
|
fi
|
||||||
@ -52,9 +47,8 @@ echo "检测到 DNS 服务器列表: $RESOLVERS"
|
|||||||
if [ -f "$TEMPLATE" ]; then
|
if [ -f "$TEMPLATE" ]; then
|
||||||
echo "从模板生成 nginx.conf ..."
|
echo "从模板生成 nginx.conf ..."
|
||||||
# 合并 Docker 内置 DNS 以保障解析 Compose 服务名
|
# 合并 Docker 内置 DNS 以保障解析 Compose 服务名
|
||||||
# 将 127.0.0.11 放在末尾,优先使用 /private/argus/etc/dns.conf 指向的 bind
|
|
||||||
if ! echo " $RESOLVERS " | grep -q " 127.0.0.11 "; then
|
if ! echo " $RESOLVERS " | grep -q " 127.0.0.11 "; then
|
||||||
RESOLVERS="${RESOLVERS} 127.0.0.11"
|
RESOLVERS="127.0.0.11 ${RESOLVERS}"
|
||||||
fi
|
fi
|
||||||
sed "s|__RESOLVERS__|$RESOLVERS|" "$TEMPLATE" > "$TARGET"
|
sed "s|__RESOLVERS__|$RESOLVERS|" "$TEMPLATE" > "$TARGET"
|
||||||
else
|
else
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user