[#37] 优化client构建
This commit is contained in:
parent
3551360687
commit
4ed5c64804
@ -43,7 +43,23 @@ latest_dir=$(ls -1dt "$ART_BASE"/*/ 2>/dev/null | head -n1 || true)
|
||||
|
||||
# Stage the package contents in a throw-away directory that is removed on
# any exit path.
tmpdir=$(mktemp -d)
trap 'rm -rf "$tmpdir"' EXIT

# Filter-only copy: stage just what the client package needs (the archives
# listed in install_order, the helper scripts, deps/ and version.json)
# rather than the whole artifact directory.
mkdir -p "$tmpdir/src"
cp -f "$latest_dir/version.json" "$tmpdir/src/version.json"

# Read the archive names from version.json. jq is preferred; without it,
# fall back to a best-effort scan of the lines following "install_order".
# mapfile keeps one entry per line, so names are neither word-split nor
# glob-expanded.
files=()
if command -v jq >/dev/null 2>&1; then
  mapfile -t files < <(jq -r '.install_order[]' "$latest_dir/version.json")
else
  mapfile -t files < <(
    grep -E '"install_order"' -A10 "$latest_dir/version.json" \
      | sed -n 's/\s*"\(.*\.tar\.gz\)".*/\1/p'
  )
fi

# Copy only the archives that actually exist in the artifact directory.
for f in "${files[@]}"; do
  if [[ -f "$latest_dir/$f" ]]; then
    cp -f "$latest_dir/$f" "$tmpdir/src/$f"
  fi
done

# Optional helper scripts and configuration; absent ones are skipped.
for aux in install.sh uninstall.sh check_health.sh check_version.sh sync_dns.sh restart_unhealthy.sh config.env; do
  if [[ -f "$latest_dir/$aux" ]]; then
    cp -f "$latest_dir/$aux" "$tmpdir/src/$aux"
  fi
done

# Bundled dependencies: prefer rsync, fall back to cp when rsync is missing
# or fails.
if [[ -d "$latest_dir/deps" ]]; then
  mkdir -p "$tmpdir/src/deps"
  rsync -a "$latest_dir/deps/" "$tmpdir/src/deps/" >/dev/null 2>&1 \
    || cp -r "$latest_dir/deps/." "$tmpdir/src/deps/"
fi
|
||||
|
||||
out_name="argus-metric_$(echo "$VERSION" | sed 's/\./_/g').tar.gz"
|
||||
|
||||
@ -62,4 +78,10 @@ fi
|
||||
# Ship setup.sh alongside the package when the client plugin tree has it.
# This stays best-effort (a failed copy does not abort the build), but the
# failure is reported on stderr instead of being silently swallowed.
SETUP_SRC="$ROOT_DIR/src/metric/client-plugins/all-in-one-full/scripts/setup.sh"
if [[ -f "$SETUP_SRC" ]]; then
  cp "$SETUP_SRC" "$PKG_DIR/setup.sh" \
    || printf 'warning: failed to copy %s into %s\n' "$SETUP_SRC" "$PKG_DIR" >&2
fi

# docs for end users: copy the client templates into the package root,
# preferring rsync and falling back to cp when rsync is missing or fails.
CLIENT_DOC_DIR="$BUILD_DIR/templates/client"
if [[ -d "$CLIENT_DOC_DIR" ]]; then
  if ! rsync -a "$CLIENT_DOC_DIR/" "$PKG_DIR/" >/dev/null 2>&1; then
    cp -r "$CLIENT_DOC_DIR/." "$PKG_DIR/"
  fi
fi
|
||||
|
||||
exit 0
|
||||
|
||||
37
deployment/build/templates/client/INSTALL_CLIENT_zh.md
Normal file
37
deployment/build/templates/client/INSTALL_CLIENT_zh.md
Normal file
@ -0,0 +1,37 @@
|
||||
# Argus Metric 客户端安装指南(容器内普通用户场景)
|
||||
|
||||
## 准备与连通性检查
|
||||
- FTP 连接(需要账号密码,默认 `ftpuser / ZGClab1234!`)
|
||||
- `curl -u ftpuser:ZGClab1234! -I ftp://<FTP_IP>:21/LATEST_VERSION`
|
||||
- `curl -u ftpuser:ZGClab1234! -s ftp://<FTP_IP>:21/ | head`
|
||||
- 下载安装脚本
|
||||
- `curl -u ftpuser:ZGClab1234! -fsSL ftp://<FTP_IP>:21/setup.sh -o /tmp/setup.sh`
|
||||
- `chmod +x /tmp/setup.sh`
|
||||
|
||||
## 元数据与主机名
|
||||
- Agent 需要元数据(env/user/instance)与 Master 地址:
|
||||
- 方式A:hostname 形如 `env-user-instance-xxx`(推荐)
|
||||
- 方式B:导出环境变量:
|
||||
- `export AGENT_ENV=dev`
|
||||
- `export AGENT_USER=<your_user>`
|
||||
- `export AGENT_INSTANCE=<node_id>`
|
||||
- Master 地址:
|
||||
- `export MASTER_ENDPOINT=http://master.argus.com:3000`
|
||||
|
||||
> 安装脚本会在开头检查上述条件,缺失时会终止并给出修复提示。
|
||||
|
||||
## 执行安装
|
||||
- 以 root 运行(容器内如为非 root 用户请切换为 root):
|
||||
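- `sudo -E /tmp/setup.sh --server <FTP_IP> --user ftpuser --password 'ZGClab1234!' --port 21`(`sudo` 默认会重置环境变量;使用方式B时需加 `-E`,以保留已导出的 `AGENT_ENV`/`AGENT_USER`/`AGENT_INSTANCE` 与 `MASTER_ENDPOINT`)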
|
||||
- 如需自定义安装根目录:`--install-dir /opt/argus-metric`
|
||||
|
||||
## 安装后自检(setup 自动执行)
|
||||
- setup 会等待最多 5 分钟,确认以下条件后才报告完成:
|
||||
- `/private/argus/agent/<hostname>/node.json` 已生成;
|
||||
- `last_report` 在持续更新;
|
||||
- `health.metric-argus-agent|metric-node-exporter|metric-fluent-bit|metric-dcgm-exporter` 均为 `healthy` 且 `error` 为空。
|
||||
|
||||
## 手工验证(可选)
|
||||
- `cat /private/argus/agent/$(hostname)/node.json | jq '.'`
|
||||
- `curl -s -o /dev/null -w "%{http_code}\n" http://localhost:9100/metrics` 应为 200
|
||||
- 查看日志:`/var/log/argus-agent.log`、`/opt/argus-metric/versions/*/.install.log`
|
||||
@ -48,6 +48,31 @@ BACKUPS_DIR="$INSTALL_DIR/backups" # 备份目录
|
||||
CURRENT_LINK="$INSTALL_DIR/current" # 当前版本软链接
|
||||
LATEST_VERSION_FILE="$INSTALL_DIR/LATEST_VERSION" # 当前版本记录文件
|
||||
|
||||
#######################################
# Pre-flight check: make sure the agent identity metadata can be derived.
# Passes when either
#   - AGENT_ENV, AGENT_USER and AGENT_INSTANCE are all set and non-empty, or
#   - the hostname has the form env-user-instance-<anything>.
# Otherwise logs what was found plus remediation hints and terminates the
# script.
# Globals:   AGENT_ENV, AGENT_USER, AGENT_INSTANCE, HOSTNAME (read)
# Arguments: none
# Outputs:   diagnostics through log_error / log_info on failure
# Returns:   0 when the metadata is sufficient; calls `exit 1` otherwise
#######################################
require_agent_metadata() {
    local hn
    local -r hn_pattern='^[^-]+-[^-]+-[^-]+-.*$'

    # Minimal container images may lack the hostname binary; fall back to
    # bash's HOSTNAME so the check still runs and the hints below are shown.
    hn="$(hostname 2>/dev/null)" || hn="${HOSTNAME:-}"

    # Option B: the three environment variables are all present.
    if [[ -n "${AGENT_ENV:-}" && -n "${AGENT_USER:-}" && -n "${AGENT_INSTANCE:-}" ]]; then
        return 0
    fi

    # Option A: hostname shaped like env-user-instance-xxx.
    if [[ "$hn" =~ $hn_pattern ]]; then
        return 0
    fi

    log_error "检测到 hostname 与 Agent 元数据不完整:"
    log_error "  当前 hostname: $hn"
    log_error "  AGENT_ENV='${AGENT_ENV:-}' AGENT_USER='${AGENT_USER:-}' AGENT_INSTANCE='${AGENT_INSTANCE:-}'"
    echo
    log_info "请满足以下其一后重试:"
    log_info "  方式A:设置 hostname 为 env-user-instance-任意,例如 dev-alice-node001-pod-0"
    log_info "  方式B:导出环境变量:export AGENT_ENV=dev AGENT_USER=alice AGENT_INSTANCE=node001"
    exit 1
}
|
||||
|
||||
# 检查必需的FTP参数
|
||||
check_ftp_params() {
|
||||
local missing_params=()
|
||||
@ -873,6 +898,47 @@ rollback_version() {
|
||||
fi
|
||||
}
|
||||
|
||||
#######################################
# Post-install self-check: poll the agent's node.json until it reports all
# four metric components healthy and its last_report value is seen to
# change between two samples, or until the timeout expires.
# Without jq, a relaxed check is used instead: any "status": "healthy"
# entry in the file counts as success.
# Globals:   AGENT_HOSTNAME (read, optional; defaults to the hostname),
#            HOSTNAME (read, fallback when the hostname command fails)
# Arguments: $1 - overall timeout in seconds (optional, default 300)
#            $2 - seconds to wait between the first and second
#                 last_report samples (optional, default 70)
# Outputs:   an error message through log_error on timeout
# Returns:   0 when the check passes, 1 on timeout
#######################################
selfcheck_post_install() {
    local timeout="${1:-300}"
    local report_wait="${2:-70}"
    # Ignore non-numeric overrides; force base 10 so "08" is not read as octal.
    [[ "$timeout" =~ ^[0-9]+$ ]] || timeout=300
    [[ "$report_wait" =~ ^[0-9]+$ ]] || report_wait=70
    timeout=$(( 10#$timeout ))
    report_wait=$(( 10#$report_wait ))

    local hn
    hn="$(hostname 2>/dev/null)" || hn="${HOSTNAME:-}"
    local node_file="/private/argus/agent/${AGENT_HOSTNAME:-$hn}/node.json"

    local deadline
    deadline=$(( $(date +%s) + timeout ))

    # Keep the default message wording; describe custom timeouts in seconds.
    local timeout_desc="5 分钟"
    (( timeout == 300 )) || timeout_desc="${timeout} 秒"

    local first_report="" ok_health last_report

    while :; do
        if [[ -f "$node_file" ]]; then
            if command -v jq >/dev/null 2>&1; then
                # A half-written or invalid node.json makes jq fail; treat
                # that as "not ready yet" instead of letting the failed
                # assignment abort the caller.
                ok_health=$(jq -r '(.health["metric-argus-agent"].status=="healthy") and (.health["metric-node-exporter"].status=="healthy") and (.health["metric-fluent-bit"].status=="healthy") and (.health["metric-dcgm-exporter"].status=="healthy")' "$node_file" 2>/dev/null) || ok_health=false
                last_report=$(jq -r '.last_report // ""' "$node_file" 2>/dev/null) || last_report=""

                if [[ "$ok_health" == true && -n "$last_report" ]]; then
                    if [[ -z "$first_report" ]]; then
                        # First healthy sample: remember it, then wait
                        # longer than the agent's reporting interval (the
                        # agent reports every 60s by default) and re-sample.
                        first_report="$last_report"
                        sleep "$report_wait"
                        continue
                    fi
                    if [[ "$last_report" != "$first_report" ]]; then
                        return 0
                    fi
                    # Unchanged so far: keep polling until the deadline.
                    sleep 10
                fi
            else
                # Relaxed check when jq is unavailable.
                if grep -Eq '"status"[[:space:]]*:[[:space:]]*"healthy"' "$node_file"; then
                    return 0
                fi
            fi
        fi

        if (( $(date +%s) >= deadline )); then
            log_error "自检超时:未在 ${timeout_desc}内确认 last_report 持续更新 或 健康状态不满足(路径:${node_file})"
            return 1
        fi
        sleep 5
    done
}
|
||||
|
||||
# 主函数
|
||||
main() {
|
||||
echo "=========================================="
|
||||
@ -912,8 +978,9 @@ main() {
|
||||
# return 0
|
||||
# fi
|
||||
|
||||
check_ftp_params
|
||||
check_system
|
||||
check_ftp_params
|
||||
check_system
|
||||
require_agent_metadata
|
||||
|
||||
if [[ "$ACTION" == "uninstall" ]]; then
|
||||
uninstall_argus_metric
|
||||
@ -921,8 +988,16 @@ main() {
|
||||
install_argus_metric
|
||||
fi
|
||||
|
||||
# 安装后自检:最多等待 5 分钟,确认 node.json 存在且健康
|
||||
echo
|
||||
log_info "操作完成!"
|
||||
log_info "开始安装后自检(最多等待 5 分钟)..."
|
||||
selfcheck_post_install || {
|
||||
log_error "安装后自检未通过,请查看 /var/log/argus-agent.log 以及 /opt/argus-metric/versions/*/.install.log"
|
||||
exit 1
|
||||
}
|
||||
|
||||
echo
|
||||
log_success "全部自检通过,安装完成!"
|
||||
}
|
||||
|
||||
# 脚本入口
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user