From 4ed5c648041b40c736492ce8fda2c0a71b05944e Mon Sep 17 00:00:00 2001
From: yuyr
Date: Mon, 3 Nov 2025 14:53:12 +0800
Subject: [PATCH] =?UTF-8?q?[#37]=20=E4=BC=98=E5=8C=96client=E6=9E=84?=
 =?UTF-8?q?=E5=BB=BA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review note: the hunks below were revised after the commit was created, so the
post-image blob ids on the "index" lines still refer to the original commit.
Blank context lines and context indentation were reconstructed from the hunk
line counts.

 deployment/build/build_client_package.sh      | 39 +++++++-
 .../templates/client/INSTALL_CLIENT_zh.md     | 38 ++++++++
 .../all-in-one-full/scripts/setup.sh          | 95 ++++++++++++++++++-
 3 files changed, 167 insertions(+), 5 deletions(-)
 create mode 100644 deployment/build/templates/client/INSTALL_CLIENT_zh.md

diff --git a/deployment/build/build_client_package.sh b/deployment/build/build_client_package.sh
index e91944b..2134e12 100755
--- a/deployment/build/build_client_package.sh
+++ b/deployment/build/build_client_package.sh
@@ -43,7 +43,38 @@ latest_dir=$(ls -1dt "$ART_BASE"/*/ 2>/dev/null | head -n1 || true)
 
 tmpdir=$(mktemp -d)
 trap 'rm -rf "$tmpdir"' EXIT
-rsync -a "$latest_dir" "$tmpdir/src" >/dev/null 2>&1 || cp -r "$latest_dir" "$tmpdir/src"
+# Filter-only copy: keep install_order files + scripts + deps + version.json
+mkdir -p "$tmpdir/src"
+cp -f "$latest_dir/version.json" "$tmpdir/src/version.json"
+if command -v jq >/dev/null 2>&1; then
+  mapfile -t files < <(jq -r '.install_order[]' "$latest_dir/version.json")
+else
+  # No jq: take every *.tar.gz name between "install_order" and its closing
+  # bracket (not just a fixed 10-line window), one array element per name so
+  # nothing is word-split or glob-expanded.
+  mapfile -t files < <(awk '/"install_order"/ {on=1} on {print} on && /\]/ {exit}' "$latest_dir/version.json" | grep -oE '"[^"]+\.tar\.gz"' | tr -d '"')
+fi
+if (( ${#files[@]} == 0 )); then
+  echo "[WARN] no install_order entries found in $latest_dir/version.json" >&2
+fi
+# ${files[@]+...} keeps an empty list safe under 'set -u' on bash < 4.4
+for f in ${files[@]+"${files[@]}"}; do
+  if [[ -f "$latest_dir/$f" ]]; then
+    cp -f "$latest_dir/$f" "$tmpdir/src/$f"
+  else
+    # Surface the gap instead of silently shipping a package that lacks a listed component.
+    echo "[WARN] install_order entry not found, skipped: $latest_dir/$f" >&2
+  fi
+done
+for aux in install.sh uninstall.sh check_health.sh check_version.sh sync_dns.sh restart_unhealthy.sh config.env; do
+  if [[ -f "$latest_dir/$aux" ]]; then
+    cp -f "$latest_dir/$aux" "$tmpdir/src/$aux"
+  fi
+done
+if [[ -d "$latest_dir/deps" ]]; then
+  mkdir -p "$tmpdir/src/deps"
+  rsync -a "$latest_dir/deps/" "$tmpdir/src/deps/" >/dev/null 2>&1 || cp -r "$latest_dir/deps/." "$tmpdir/src/deps/"
+fi
 
 out_name="argus-metric_$(echo "$VERSION" | sed 's/\./_/g').tar.gz"
 
@@ -62,4 +93,10 @@ fi
 SETUP_SRC="$ROOT_DIR/src/metric/client-plugins/all-in-one-full/scripts/setup.sh"
 [[ -f "$SETUP_SRC" ]] && cp "$SETUP_SRC" "$PKG_DIR/setup.sh" || true
 
+# docs for end users
+CLIENT_DOC_DIR="$BUILD_DIR/templates/client"
+if [[ -d "$CLIENT_DOC_DIR" ]]; then
+  rsync -a "$CLIENT_DOC_DIR/" "$PKG_DIR/" >/dev/null 2>&1 || cp -r "$CLIENT_DOC_DIR/." "$PKG_DIR/"
+fi
+
 exit 0
diff --git a/deployment/build/templates/client/INSTALL_CLIENT_zh.md b/deployment/build/templates/client/INSTALL_CLIENT_zh.md
new file mode 100644
index 0000000..4d0933b
--- /dev/null
+++ b/deployment/build/templates/client/INSTALL_CLIENT_zh.md
@@ -0,0 +1,38 @@
+# Argus Metric 客户端安装指南(容器内普通用户场景)
+
+## 准备与连通性检查
+- FTP 连接(需要账号密码,默认 `ftpuser / ZGClab1234!`)
+  - `curl -u ftpuser:ZGClab1234! -I ftp://<FTP_SERVER>:21/LATEST_VERSION`
+  - `curl -u ftpuser:ZGClab1234! -s ftp://<FTP_SERVER>:21/ | head`
+- 下载安装脚本
+  - `curl -u ftpuser:ZGClab1234! -fsSL ftp://<FTP_SERVER>:21/setup.sh -o /tmp/setup.sh`
+  - `chmod +x /tmp/setup.sh`
+
+## 元数据与主机名
+- Agent 需要元数据(env/user/instance)与 Master 地址:
+  - 方式A:hostname 形如 `env-user-instance-xxx`(推荐)
+  - 方式B:导出环境变量:
+    - `export AGENT_ENV=dev`
+    - `export AGENT_USER=<your_user>`
+    - `export AGENT_INSTANCE=<your_instance>`
+- Master 地址:
+  - `export MASTER_ENDPOINT=http://master.argus.com:3000`
+
+> 安装脚本会在开头检查上述条件,缺失时会终止并给出修复提示。
+
+## 执行安装
+- 以 root 运行(容器内如为非 root 用户请切换为 root):
+  - `sudo -E /tmp/setup.sh --server <FTP_SERVER> --user ftpuser --password 'ZGClab1234!' --port 21`
+  - `sudo` 默认会重置环境变量,`-E` 用于保留上面导出的 `AGENT_*` 与 `MASTER_ENDPOINT`
+- 如需自定义安装根目录:`--install-dir /opt/argus-metric`
+
+## 安装后自检(setup 自动执行)
+- setup 会等待最多 5 分钟,确认以下条件后才报告完成:
+  - `/private/argus/agent/<hostname>/node.json` 已生成;
+  - `last_report` 在持续更新;
+  - `health.metric-argus-agent|metric-node-exporter|metric-fluent-bit|metric-dcgm-exporter` 均为 `healthy` 且 `error` 为空。
+
+## 手工验证(可选)
+- `cat /private/argus/agent/$(hostname)/node.json | jq '.'`
+- `curl -s -o /dev/null -w "%{http_code}\n" http://localhost:9100/metrics` 应为 200
+- 查看日志:`/var/log/argus-agent.log`、`/opt/argus-metric/versions/*/.install.log`
diff --git a/src/metric/client-plugins/all-in-one-full/scripts/setup.sh b/src/metric/client-plugins/all-in-one-full/scripts/setup.sh
index 0c36bce..006d679 100755
--- a/src/metric/client-plugins/all-in-one-full/scripts/setup.sh
+++ b/src/metric/client-plugins/all-in-one-full/scripts/setup.sh
@@ -48,6 +48,34 @@ BACKUPS_DIR="$INSTALL_DIR/backups" # 备份目录
 CURRENT_LINK="$INSTALL_DIR/current" # 当前版本软链接
 LATEST_VERSION_FILE="$INSTALL_DIR/LATEST_VERSION" # 当前版本记录文件
 
+# Pre-check: the Agent needs env/user/instance metadata.
+# Passes when AGENT_ENV, AGENT_USER and AGENT_INSTANCE are all non-empty, or when
+# the hostname starts with three non-empty dash-separated fields followed by
+# another dash (env-user-instance-xxx). Otherwise logs a fix hint and exits 1.
+require_agent_metadata() {
+    local hn
+    hn="$(hostname)"
+    local ok=false
+    # all three metadata environment variables are set
+    if [[ -n "${AGENT_ENV:-}" && -n "${AGENT_USER:-}" && -n "${AGENT_INSTANCE:-}" ]]; then
+        ok=true
+    fi
+    # hostname looks like env-user-instance-xxx
+    if [[ "$hn" =~ ^[^-]+-[^-]+-[^-]+-.*$ ]]; then
+        ok=true
+    fi
+    if [[ "$ok" == false ]]; then
+        log_error "检测到 hostname 与 Agent 元数据不完整:"
+        log_error " 当前 hostname: $hn"
+        log_error " AGENT_ENV='${AGENT_ENV:-}' AGENT_USER='${AGENT_USER:-}' AGENT_INSTANCE='${AGENT_INSTANCE:-}'"
+        echo
+        log_info "请满足以下其一后重试:"
+        log_info " 方式A:设置 hostname 为 env-user-instance-任意,例如 dev-alice-node001-pod-0"
+        log_info " 方式B:导出环境变量:export AGENT_ENV=dev AGENT_USER=alice AGENT_INSTANCE=node001"
+        exit 1
+    fi
+}
+
 # 检查必需的FTP参数
 check_ftp_params() {
     local missing_params=()
@@ -873,6 +901,53 @@ rollback_version() {
     fi
 }
 
+# Post-install self-check: wait up to 5 minutes for node.json to exist, report
+# all four components healthy, and show a changing last_report.
+# Locates node.json via AGENT_HOSTNAME, falling back to the output of `hostname`.
+# Returns 0 on success, 1 (after logging) on timeout.
+selfcheck_post_install() {
+    local hn
+    hn="$(hostname)"
+    local node_file="/private/argus/agent/${AGENT_HOSTNAME:-$hn}/node.json"
+    local deadline=$(( $(date +%s) + 300 ))
+    local t1="" t2=""
+    # jq filter: each component must be "healthy" with an empty or absent error
+    local health_filter='def comp_ok(n): (.health[n].status == "healthy") and ((.health[n].error // "") == ""); comp_ok("metric-argus-agent") and comp_ok("metric-node-exporter") and comp_ok("metric-fluent-bit") and comp_ok("metric-dcgm-exporter")'
+    while :; do
+        if [[ -f "$node_file" ]]; then
+            if command -v jq >/dev/null 2>&1; then
+                local ok_health lr
+                ok_health=$(jq -er "$health_filter" "$node_file" 2>/dev/null || echo false)
+                lr=$(jq -r '.last_report // ""' "$node_file" 2>/dev/null) || true
+                if [[ "$ok_health" == true && -n "$lr" ]]; then
+                    if [[ -z "$t1" ]]; then
+                        t1="$lr"
+                        # agent reports every 60s by default; wait 70s before checking again
+                        sleep 70
+                        continue
+                    fi
+                    t2="$lr"
+                    if [[ "$t2" != "$t1" ]]; then
+                        return 0
+                    fi
+                    # unchanged so far: keep polling until the deadline
+                    sleep 10
+                fi
+            else
+                # lenient check when jq is unavailable
+                if grep -q '"status"[[:space:]]*:[[:space:]]*"healthy"' "$node_file"; then
+                    return 0
+                fi
+            fi
+        fi
+        if (( $(date +%s) >= deadline )); then
+            log_error "自检超时:未在 5 分钟内确认 last_report 持续更新 或 健康状态不满足(路径:$node_file)"
+            return 1
+        fi
+        sleep 5
+    done
+}
+
 # 主函数
 main() {
     echo "=========================================="
@@ -912,17 +987,29 @@ main() {
 #     return 0
 # fi
 
     check_ftp_params
     check_system
 
     if [[ "$ACTION" == "uninstall" ]]; then
         uninstall_argus_metric
+        echo
+        log_info "操作完成!"
     else
+        # Install path only: uninstall needs neither Agent metadata nor the
+        # post-install self-check.
+        require_agent_metadata
         install_argus_metric
-    fi
-    
-    echo
-    log_info "操作完成!"
+
+        echo
+        log_info "开始安装后自检(最多等待 5 分钟)..."
+        selfcheck_post_install || {
+            log_error "安装后自检未通过,请查看 /var/log/argus-agent.log 以及 /opt/argus-metric/versions/*/.install.log"
+            exit 1
+        }
+
+        echo
+        log_success "全部自检通过,安装完成!"
+    fi
 }
 
 # 脚本入口