完成a6000测试系统构建、部署、测试整合 #35

Merged
yuyr merged 18 commits from dev_1.0.0_yuyr_5 into dev_1.0.0 2025-10-29 10:04:29 +08:00
143 changed files with 10840 additions and 609 deletions

1
.gitattributes vendored Normal file
View File

@ -0,0 +1 @@
src/metric/client-plugins/all-in-one-full/plugins/*/bin/* filter=lfs diff=lfs merge=lfs -text

View File

@ -10,20 +10,28 @@ Usage: $0 [OPTIONS]
Options:
--intranet Use intranet mirror for log/bind builds
--master-offline Build master offline image (requires src/master/offline_wheels.tar.gz)
--metric Build metric module images (ftp, prometheus, grafana, test nodes)
--no-cache Build all images without using Docker layer cache
--only LIST Comma-separated targets to build: core,master,metric,web,alert,sys,all
-h, --help Show this help message
Examples:
$0 # Build with default sources
$0 --intranet # Build with intranet mirror
$0 --master-offline # Additionally build argus-master:offline
$0 --intranet --master-offline
$0 --metric # Additionally build metric module images
$0 --intranet --master-offline --metric
EOF
}
use_intranet=false
build_core=true
build_master=true
build_master_offline=false
build_metric=true
build_web=true
build_alert=true
build_sys=true
no_cache=false
while [[ $# -gt 0 ]]; do
@ -41,10 +49,35 @@ while [[ $# -gt 0 ]]; do
build_master_offline=true
shift
;;
--metric)
build_metric=true
shift
;;
--no-cache)
no_cache=true
shift
;;
--only)
if [[ -z ${2:-} ]]; then
echo "--only requires a target list" >&2; exit 1
fi
sel="$2"; shift 2
# reset all, then enable selected
build_core=false; build_master=false; build_metric=false; build_web=false; build_alert=false; build_sys=false
IFS=',' read -ra parts <<< "$sel"
for p in "${parts[@]}"; do
case "$p" in
core) build_core=true ;;
master) build_master=true ;;
metric) build_metric=true ;;
web) build_web=true ;;
alert) build_alert=true ;;
sys) build_sys=true ;;
all) build_core=true; build_master=true; build_metric=true; build_web=true; build_alert=true; build_sys=true ;;
*) echo "Unknown --only target: $p" >&2; exit 1 ;;
esac
done
;;
-h|--help)
show_help
exit 0
@ -115,14 +148,22 @@ build_image() {
local image_name=$1
local dockerfile_path=$2
local tag=$3
local context="."
shift 3
if [[ $# -gt 0 ]]; then
context=$1
shift
fi
local extra_args=("$@")
echo "🔄 Building $image_name image..."
echo " Dockerfile: $dockerfile_path"
echo " Tag: $tag"
echo " Context: $context"
if docker build "${build_args[@]}" "${extra_args[@]}" -f "$dockerfile_path" -t "$tag" .; then
if docker build "${build_args[@]}" "${extra_args[@]}" -f "$dockerfile_path" -t "$tag" "$context"; then
echo "$image_name image built successfully"
return 0
else
@ -131,29 +172,59 @@ build_image() {
fi
}
# Make a Docker base image available locally, pulling it with retries.
# Arguments:
#   $1 - image reference
#   $2 - maximum number of pull attempts (default 3)
#   $3 - seconds to wait between attempts (default 5)
# Outputs: progress messages on stdout
# Returns: 0 if the image is already present or was pulled, 1 otherwise
pull_base_image() {
  local ref=$1
  local max_tries=${2:-3}
  local pause=${3:-5}
  local try=1

  # Nothing to do when the image is already in the local store.
  if docker image inspect "$ref" >/dev/null 2>&1; then
    echo " Local image present; skip pull: $ref"
    return 0
  fi

  while (( try <= max_tries )); do
    echo " Pulling base image ($try/$max_tries): $ref"
    if docker pull "$ref" >/dev/null; then
      echo " Base image ready: $ref"
      return 0
    fi
    echo " Pull failed: $ref"
    # Only wait when another attempt will follow.
    if (( try < max_tries )); then
      echo " Retrying in ${pause}s..."
      sleep "$pause"
    fi
    try=$(( try + 1 ))
  done

  echo "❌ Unable to pull base image after ${max_tries} attempts: $ref"
  return 1
}
images_built=()
build_failed=false
if build_image "Elasticsearch" "src/log/elasticsearch/build/Dockerfile" "argus-elasticsearch:latest"; then
images_built+=("argus-elasticsearch:latest")
else
build_failed=true
fi
if [[ "$build_core" == true ]]; then
if build_image "Elasticsearch" "src/log/elasticsearch/build/Dockerfile" "argus-elasticsearch:latest"; then
images_built+=("argus-elasticsearch:latest")
else
build_failed=true
fi
echo ""
echo ""
if build_image "Kibana" "src/log/kibana/build/Dockerfile" "argus-kibana:latest"; then
images_built+=("argus-kibana:latest")
else
build_failed=true
fi
if build_image "Kibana" "src/log/kibana/build/Dockerfile" "argus-kibana:latest"; then
images_built+=("argus-kibana:latest")
else
build_failed=true
fi
echo ""
echo ""
if build_image "BIND9" "src/bind/build/Dockerfile" "argus-bind9:latest"; then
images_built+=("argus-bind9:latest")
else
build_failed=true
if build_image "BIND9" "src/bind/build/Dockerfile" "argus-bind9:latest"; then
images_built+=("argus-bind9:latest")
else
build_failed=true
fi
fi
echo ""
@ -184,6 +255,127 @@ if [[ "$build_master" == true ]]; then
popd >/dev/null
fi
# Metric module: make sure the base images are available, then build every
# image described by a "label|dockerfile|tag|build context" spec.
if [[ "$build_metric" == true ]]; then
  printf '\n%s\n' "Building Metric module images..."

  metric_base_images=(
    "ubuntu:22.04"
    "ubuntu/prometheus:3-24.04_stable"
    "grafana/grafana:11.1.0"
  )
  for base_image in "${metric_base_images[@]}"; do
    # A failed pull is recorded; the remaining work still runs.
    pull_base_image "$base_image" || build_failed=true
  done

  metric_builds=(
    "Metric FTP|src/metric/ftp/build/Dockerfile|argus-metric-ftp:latest|src/metric/ftp/build"
    "Metric Prometheus|src/metric/prometheus/build/Dockerfile|argus-metric-prometheus:latest|src/metric/prometheus/build"
    "Metric Grafana|src/metric/grafana/build/Dockerfile|argus-metric-grafana:latest|src/metric/grafana/build"
  )
  for build_spec in "${metric_builds[@]}"; do
    IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec"
    if ! build_image "$image_label" "$dockerfile_path" "$image_tag" "$build_context"; then
      build_failed=true
    else
      images_built+=("$image_tag")
    fi
    echo ""
  done
fi
# =======================================
# Sys (system tests) node images
# =======================================
# Sys (system test) node images: make sure the base images are available,
# then build every image described by a "label|dockerfile|tag|context" spec.
if [[ "$build_sys" == true ]]; then
  printf '\n%s\n' "Building Sys node images..."

  sys_base_images=(
    "ubuntu:22.04"
    "nvidia/cuda:12.2.2-runtime-ubuntu22.04"
  )
  for base_image in "${sys_base_images[@]}"; do
    # A failed pull is recorded; the remaining work still runs.
    pull_base_image "$base_image" || build_failed=true
  done

  sys_builds=(
    "Sys Node|src/sys/build/node/Dockerfile|argus-sys-node:latest|."
    "Sys Metric Test Node|src/sys/build/test-node/Dockerfile|argus-sys-metric-test-node:latest|."
    "Sys Metric Test GPU Node|src/sys/build/test-gpu-node/Dockerfile|argus-sys-metric-test-gpu-node:latest|."
  )
  for build_spec in "${sys_builds[@]}"; do
    IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec"
    if ! build_image "$image_label" "$dockerfile_path" "$image_tag" "$build_context"; then
      build_failed=true
    else
      images_built+=("$image_tag")
    fi
    echo ""
  done
fi
# =======================================
# Web & Alert module images
# =======================================
# Web and Alert module images. The base images are pulled once for both
# modules; each module is then built only when it was selected.
# Build specs use the form "label|dockerfile|tag|build context".
if [[ "$build_web" == true || "$build_alert" == true ]]; then
  printf '\n%s\n' "Building Web and Alert module images..."

  # Pre-pull commonly used base images for stability.
  web_alert_base_images=(
    "node:20"
    "ubuntu:24.04"
  )
  for base_image in "${web_alert_base_images[@]}"; do
    pull_base_image "$base_image" || build_failed=true
  done

  if [[ "$build_web" == true ]]; then
    web_builds=(
      "Web Frontend|src/web/build_tools/frontend/Dockerfile|argus-web-frontend:latest|."
      "Web Proxy|src/web/build_tools/proxy/Dockerfile|argus-web-proxy:latest|."
    )
    for build_spec in "${web_builds[@]}"; do
      IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec"
      if ! build_image "$image_label" "$dockerfile_path" "$image_tag" "$build_context"; then
        build_failed=true
      else
        images_built+=("$image_tag")
      fi
      echo ""
    done
  fi

  if [[ "$build_alert" == true ]]; then
    alert_builds=(
      "Alertmanager|src/alert/alertmanager/build/Dockerfile|argus-alertmanager:latest|."
    )
    for build_spec in "${alert_builds[@]}"; do
      IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec"
      if ! build_image "$image_label" "$dockerfile_path" "$image_tag" "$build_context"; then
        build_failed=true
      else
        images_built+=("$image_tag")
      fi
      echo ""
    done
  fi
fi
echo "======================================="
echo "📦 Build Summary"
echo "======================================="
@ -210,7 +402,6 @@ if [[ "$build_master_offline" == true ]]; then
echo ""
echo "🧳 Master offline wheels 已解压到 $master_offline_dir"
fi
echo ""
echo "🚀 Next steps:"
echo " ./build/save_images.sh --compress # 导出镜像"

View File

@ -68,6 +68,12 @@ declare -A images=(
["argus-kibana:latest"]="argus-kibana-latest.tar"
["argus-bind9:latest"]="argus-bind9-latest.tar"
["argus-master:offline"]="argus-master-offline.tar"
["argus-metric-ftp:latest"]="argus-metric-ftp-latest.tar"
["argus-metric-prometheus:latest"]="argus-metric-prometheus-latest.tar"
["argus-metric-grafana:latest"]="argus-metric-grafana-latest.tar"
["argus-web-frontend:latest"]="argus-web-frontend-latest.tar"
["argus-web-proxy:latest"]="argus-web-proxy-latest.tar"
["argus-alertmanager:latest"]="argus-alertmanager-latest.tar"
)
# 函数:检查镜像是否存在

View File

@ -12,6 +12,8 @@ VENV_DIR="$BUILD_ROOT/venv"
AGENT_BUILD_IMAGE="${AGENT_BUILD_IMAGE:-python:3.11-slim-bullseye}"
AGENT_BUILD_USE_DOCKER="${AGENT_BUILD_USE_DOCKER:-1}"
# 默认在容器内忽略代理以避免公司内网代理在 Docker 网络不可达导致 pip 失败(可用 0 关闭)
AGENT_BUILD_IGNORE_PROXY="${AGENT_BUILD_IGNORE_PROXY:-1}"
USED_DOCKER=0
run_host_build() {
@ -71,6 +73,7 @@ run_docker_build() {
pass_env_if_set http_proxy
pass_env_if_set https_proxy
pass_env_if_set no_proxy
pass_env_if_set AGENT_BUILD_IGNORE_PROXY
build_script=$(cat <<'INNER'
set -euo pipefail
@ -82,6 +85,10 @@ rm -rf build dist
mkdir -p build/pyinstaller dist
python3 -m venv --copies build/venv
source build/venv/bin/activate
# 若指定忽略代理,则清空常见代理与 pip 镜像环境变量,避免容器内代理不可达
if [ "${AGENT_BUILD_IGNORE_PROXY:-1}" = "1" ]; then
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY PIP_INDEX_URL PIP_EXTRA_INDEX_URL PIP_TRUSTED_HOST
fi
pip install --upgrade pip
pip install .
pip install pyinstaller==6.6.0

View File

@ -9,21 +9,21 @@ RUN apt-get update && \
apt-get install -y wget supervisor net-tools inetutils-ping vim ca-certificates passwd && \
apt-get clean && rm -rf /var/lib/apt/lists/*
# 设置 Alertmanager 版本
# 设置 Alertmanager 版本(与本地离线包保持一致)
ARG ALERTMANAGER_VERSION=0.28.1
# 下载并解压 Alertmanager 二进制
RUN wget https://github.com/prometheus/alertmanager/releases/download/v${ALERTMANAGER_VERSION}/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz && \
tar xvf alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz && \
mv alertmanager-${ALERTMANAGER_VERSION}.linux-amd64 /usr/local/alertmanager && \
rm alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz
# 使用仓库内预置的离线包构建(无需联网)
COPY src/alert/alertmanager/build/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz /tmp/
RUN tar xvf /tmp/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz -C /tmp && \
mv /tmp/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64 /usr/local/alertmanager && \
rm -f /tmp/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz
ENV ALERTMANAGER_BASE_PATH=/private/argus/alert/alertmanager
ARG ARGUS_UID=2133
ARG ARGUS_GID=2015
ENV ARGUS_UID=${ARGUS_UID}
ENV ARGUS_GID=${ARGUS_GID}
ARG ARGUS_BUILD_UID=2133
ARG ARGUS_BUILD_GID=2015
ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID}
ENV ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
RUN mkdir -p /usr/share/alertmanager && \
mkdir -p ${ALERTMANAGER_BASE_PATH} && \
@ -33,16 +33,24 @@ RUN mkdir -p /usr/share/alertmanager && \
# 创建 alertmanager 用户(可自定义 UID/GID
# 创建 alertmanager 用户组
RUN groupadd -g ${ARGUS_GID} alertmanager
RUN set -eux; \
# 确保目标 GID 存在;若已被占用,直接使用该 GID组名不限\
if ! getent group "${ARGUS_BUILD_GID}" >/dev/null; then \
groupadd -g "${ARGUS_BUILD_GID}" alertmanager || true; \
fi; \
# 确保存在 alertmanager 用户;若 UID 已被占用,跳过并继续使用现有 UID 的用户
if ! id alertmanager >/dev/null 2>&1; then \
if getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \
# UID 已占用,则创建同名用户但不指定 UID避免冲突仅保证 user 存在
useradd -M -s /usr/sbin/nologin -g "${ARGUS_BUILD_GID}" alertmanager || true; \
else \
useradd -M -s /usr/sbin/nologin -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" alertmanager || true; \
fi; \
else \
usermod -g "${ARGUS_BUILD_GID}" alertmanager || true; \
fi
# 创建 alertmanager 用户并指定组
RUN useradd -M -s /usr/sbin/nologin -u ${ARGUS_UID} -g ${ARGUS_GID} alertmanager
RUN chown -R alertmanager:alertmanager /usr/share/alertmanager && \
chown -R alertmanager:alertmanager /alertmanager && \
chown -R alertmanager:alertmanager ${ALERTMANAGER_BASE_PATH} && \
chown -R alertmanager:alertmanager /private/argus/etc && \
chown -R alertmanager:alertmanager /usr/local/bin
RUN chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" /usr/share/alertmanager /alertmanager ${ALERTMANAGER_BASE_PATH} /private/argus/etc /usr/local/bin || true
# 配置内网 apt 源 (如果指定了内网选项)
RUN if [ "$USE_INTRANET" = "true" ]; then \
@ -86,4 +94,3 @@ EXPOSE 9093
# 使用 supervisor 作为入口点
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]

View File

@ -5,8 +5,8 @@ docker pull ubuntu:24.04
source src/alert/tests/.env
docker build \
--build-arg ARGUS_UID=${ARGUS_UID} \
--build-arg ARGUS_GID=${ARGUS_GID} \
--build-arg ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
--build-arg ARGUS_BUILD_GID=${ARGUS_BUILD_GID} \
-f src/alert/alertmanager/build/Dockerfile \
-t argus-alertmanager:latest .

View File

@ -0,0 +1,22 @@
#!/usr/bin/env bash
set -euo pipefail

# Download the Alertmanager offline release tarball into the current
# directory so that the Docker build can COPY it.
# Usage:
#   ./fetch-dist.sh [version]
# Example:
#   ./fetch-dist.sh 0.28.1

VER="${1:-0.28.1}"
OUT="alertmanager-${VER}.linux-amd64.tar.gz"
URL="https://github.com/prometheus/alertmanager/releases/download/v${VER}/${OUT}"

if [[ -f "$OUT" ]]; then
  echo "[INFO] $OUT already exists, skip download"
  exit 0
fi

# Download under a temporary name and rename only after curl succeeded.
# Otherwise an interrupted transfer would leave a truncated "$OUT" behind and
# the existence check above would treat it as a finished download next time.
TMP_OUT="${OUT}.part"
trap 'rm -f -- "$TMP_OUT"' EXIT

echo "[INFO] Downloading $URL"
curl -fL --retry 3 --connect-timeout 10 -o "$TMP_OUT" "$URL"
mv -f -- "$TMP_OUT" "$OUT"
echo "[OK] Saved to $(pwd)/$OUT"

View File

@ -7,10 +7,8 @@ ALERTMANAGER_BASE_PATH=${ALERTMANAGER_BASE_PATH:-/private/argus/alert/alertmanag
echo "[INFO] Alertmanager base path: ${ALERTMANAGER_BASE_PATH}"
# 生成配置文件
echo "[INFO] Generating Alertmanager configuration file..."
sed "s|\${ALERTMANAGER_BASE_PATH}|${ALERTMANAGER_BASE_PATH}|g" \
/etc/alertmanager/alertmanager.yml > ${ALERTMANAGER_BASE_PATH}/alertmanager.yml
# 使用容器内的 /etc/alertmanager/alertmanager.yml 作为配置文件,避免写入挂载卷导致的权限问题
echo "[INFO] Using /etc/alertmanager/alertmanager.yml as configuration"
# 记录容器 IP 地址

View File

@ -6,7 +6,7 @@ user=root
[program:alertmanager]
command=/usr/local/bin/start-am-supervised.sh
user=alertmanager
user=ubuntu
stdout_logfile=/var/log/supervisor/alertmanager.log
stderr_logfile=/var/log/supervisor/alertmanager_error.log
autorestart=true

View File

@ -1,5 +1,5 @@
DATA_ROOT=/home/argus/tmp/private/argus
ARGUS_UID=1048
ARGUS_GID=1048
ARGUS_BUILD_UID=1048
ARGUS_BUILD_GID=1048
USE_INTRANET=false

View File

@ -4,15 +4,15 @@ services:
context: ../../../
dockerfile: src/alert/alertmanager/build/Dockerfile
args:
ARGUS_UID: ${ARGUS_UID:-2133}
ARGUS_GID: ${ARGUS_GID:-2015}
ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133}
ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015}
USE_INTRANET: ${USE_INTRANET:-false}
image: argus-alertmanager:latest
container_name: argus-alertmanager
environment:
- ALERTMANAGER_BASE_PATH=/private/argus/alert/alertmanager
- ARGUS_UID=${ARGUS_UID:-2133}
- ARGUS_GID=${ARGUS_GID:-2015}
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
ports:
- "${ARGUS_PORT:-9093}:9093"
volumes:

View File

@ -26,6 +26,7 @@ RUN apt-get update && \
apt-get install -y \
bind9 \
bind9utils \
dnsutils \
bind9-doc \
supervisor \
net-tools \

View File

@ -1,47 +1,96 @@
#!/bin/bash
set -euo pipefail
echo "[INFO] Starting Fluent Bit setup in Ubuntu container..."
echo "[INFO] Starting Fluent Bit setup in Ubuntu container (offline-first)..."
# 安装必要的工具
echo "[INFO] Installing required packages..."
export DEBIAN_FRONTEND=noninteractive
apt-get update -qq
apt-get install -y -qq curl
# 解压bundle到/tmp
echo "[INFO] Extracting fluent-bit bundle..."
cp -r /private/etc /tmp
cp -r /private/packages /tmp
cd /tmp
# Stage bundle to /tmp (read-only mount under /private)
echo "[INFO] Staging fluent-bit bundle..."
rm -rf /tmp/flb && mkdir -p /tmp/flb
cp -r /private/etc /tmp/flb/
mkdir -p /tmp/flb/packages
cp -r /private/packages/* /tmp/flb/packages/ 2>/dev/null || true
# 安装 Fluent Bit 从 deb 包
echo "[INFO] Installing Fluent Bit from deb package..."
dpkg -i /tmp/packages/fluent-bit_3.1.9_amd64.deb || true
apt-get install -f -y -qq # 解决依赖问题
# Helper: make sure a shared library is known to the dynamic linker,
# preferring the local (offline) packages over the network.
# Arguments:
#   $1 - soname to look for in the ldconfig cache (e.g. libpq.so.5)
#   $2 - glob pattern of the matching .deb below /tmp/flb/packages
# Behaviour:
#   - returns immediately when the soname is already in the cache;
#   - otherwise installs the first local .deb matching the pattern;
#   - if the soname is still missing, falls back to apt for known sonames.
# Every install step is best-effort: failures only produce warnings.
ensure_lib() {
  local soname="$1"; shift
  local pattern="$1"; shift
  local cache=""
  local deb=""
  local apt_pkg=""
  local -a candidates=()

  # Capture the cache first instead of piping into `grep -q`: under
  # `pipefail`, grep exiting early can make the pipeline report failure.
  # -F because the soname contains dots that must match literally.
  cache="$(ldconfig -p 2>/dev/null || true)"
  if grep -qF -- "$soname" <<< "$cache"; then
    echo "[OK] $soname already present"
    return 0
  fi

  # Let the shell expand the pattern (intentionally unquoted) rather than
  # parsing ls output; an unmatched pattern stays literal and fails the -f test.
  candidates=(/tmp/flb/packages/$pattern)
  if [[ -f "${candidates[0]:-}" ]]; then
    deb="${candidates[0]}"
  fi

  if [[ -n "$deb" ]]; then
    echo "[INFO] Installing local dependency: $(basename "$deb")"
    dpkg -i "$deb" >/dev/null 2>&1 || true
  else
    echo "[WARN] Local deb for $soname not found (pattern=$pattern)"
  fi

  cache="$(ldconfig -p 2>/dev/null || true)"
  if ! grep -qF -- "$soname" <<< "$cache"; then
    # Map each soname this script asks for to the package providing it.
    case "$soname" in
      libpq.so.5) apt_pkg="libpq5" ;;
      libyaml-0.so.2) apt_pkg="libyaml-0-2" ;;
      libsasl2.so.2) apt_pkg="libsasl2-2" ;;
      libldap-2.5.so.0) apt_pkg="libldap-2.5-0" ;;
    esac
    if [[ -n "$apt_pkg" ]]; then
      echo "[WARN] $soname still missing after local install; attempting apt fallback"
      apt-get update -qq || true
      apt-get install -y -qq "$apt_pkg" || true
    else
      echo "[WARN] $soname still missing after local install; no apt package known for it"
    fi
  fi

  # Refresh the linker cache so newly installed libraries become visible.
  ldconfig 2>/dev/null || true
}
# Offline-first: satisfy runtime deps from local debs, fallback to apt only if necessary
ensure_lib "libpq.so.5" "libpq5_*_amd64.deb"
ensure_lib "libyaml-0.so.2" "libyaml-0-2_*_amd64.deb"
ensure_lib "libsasl2.so.2" "libsasl2-2_*_amd64.deb"
ensure_lib "libldap-2.5.so.0" "libldap-2.5-0_*_amd64.deb"
# Install fluent-bit main package from local bundle
FLB_DEB="$(ls /tmp/flb/packages/fluent-bit_*_amd64.deb 2>/dev/null | head -n1 || true)"
if [[ -z "$FLB_DEB" ]]; then
echo "[ERROR] fluent-bit deb not found under /private/packages" >&2
exit 1
fi
echo "[INFO] Installing Fluent Bit: $(basename "$FLB_DEB")"
dpkg -i "$FLB_DEB" >/dev/null 2>&1 || true
# If dpkg reported unresolved dependencies, try apt -f only as last resort
if ! command -v /opt/fluent-bit/bin/fluent-bit >/dev/null 2>&1; then
echo "[WARN] fluent-bit binary missing after dpkg; attempting apt --fix-broken"
apt-get install -f -y -qq || true
fi
# Ensure runtime library dependencies are satisfied (libsasl2, libldap are required via libpq/curl)
MISSING=$(ldd /opt/fluent-bit/bin/fluent-bit 2>/dev/null | awk '/not found/{print $1}' | xargs -r echo || true)
if [[ -n "$MISSING" ]]; then
echo "[WARN] missing shared libs: $MISSING"
apt-get update -qq || true
apt-get install -y -qq libsasl2-2 libldap-2.5-0 || true
apt-get install -f -y -qq || true
fi
# 验证 Fluent Bit 可以运行
echo "[INFO] Fluent Bit version:"
/opt/fluent-bit/bin/fluent-bit --version
/opt/fluent-bit/bin/fluent-bit --version || { echo "[ERROR] fluent-bit not installed or libraries missing" >&2; exit 1; }
# 创建配置目录
# Place configuration
mkdir -p /etc/fluent-bit
cp -r /tmp/etc/* /etc/fluent-bit/
cp -r /tmp/flb/etc/* /etc/fluent-bit/
# 创建日志和缓冲区目录
# Create logs/buffers dirs
mkdir -p /logs/train /logs/infer /buffers
chmod 755 /logs/train /logs/infer /buffers
# 等待 Elasticsearch 就绪
echo "[INFO] Waiting for Elasticsearch to be ready..."
while ! curl -fs http://${ES_HOST}:${ES_PORT}/_cluster/health >/dev/null 2>&1; do
echo " Waiting for ES at ${ES_HOST}:${ES_PORT}..."
sleep 5
# Wait for Elasticsearch via bash /dev/tcp to avoid curl dependency
echo "[INFO] Waiting for Elasticsearch to be ready (tcp ${ES_HOST}:${ES_PORT})..."
for i in $(seq 1 120); do
if exec 3<>/dev/tcp/${ES_HOST}/${ES_PORT}; then
exec 3<&- 3>&-
echo "[INFO] Elasticsearch is ready"
break
fi
[[ $i -eq 120 ]] && { echo "[ERROR] ES not reachable" >&2; exit 1; }
sleep 1
done
echo "[INFO] Elasticsearch is ready"
# 启动 Fluent Bit
echo "[INFO] Starting Fluent Bit with configuration from /etc/fluent-bit/"
echo "[INFO] Command: /opt/fluent-bit/bin/fluent-bit --config=/etc/fluent-bit/fluent-bit.conf"
exec /opt/fluent-bit/bin/fluent-bit \
--config=/etc/fluent-bit/fluent-bit.conf
exec /opt/fluent-bit/bin/fluent-bit --config=/etc/fluent-bit/fluent-bit.conf

View File

@ -32,3 +32,42 @@ fi
echo "[OK] 初始化完成: private/argus/log/{elasticsearch,kibana}"
echo "[INFO] Fluent-bit files should be in fluent-bit/ directory"
# Stage the Fluent Bit offline dependencies: copy the .deb files shipped with
# the metric all-in-one-full bundle into ../fluent-bit/build/packages.
FLB_BUILD_PACKAGES_DIR="$root/../fluent-bit/build/packages"
mkdir -p "$FLB_BUILD_PACKAGES_DIR"
flb_plugin_bin="$project_root/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin"
for f in \
  "$flb_plugin_bin"/libyaml-0-2_*_amd64.deb \
  "$flb_plugin_bin"/libpq5_*_amd64.deb; do
  # An unmatched glob stays literal, so skip anything that is not a file.
  [[ -f "$f" ]] || continue
  base="$(basename "$f")"
  if [[ ! -f "$FLB_BUILD_PACKAGES_DIR/$base" ]]; then
    cp "$f" "$FLB_BUILD_PACKAGES_DIR/"
    echo " [+] copied $base"
  fi
done

# Additionally unpack the dependencies needed for offline installation
# (libsasl2, libldap, ...) from the all-in-one-full ubuntu22/curl.tar.gz.
CURLOPT_TAR="$project_root/src/metric/client-plugins/all-in-one-full/deps/ubuntu22/curl.tar.gz"
if [[ -f "$CURLOPT_TAR" ]]; then
  tmpdir=$(mktemp -d)
  if tar -xzf "$CURLOPT_TAR" -C "$tmpdir" 2>/dev/null; then
    # The patterns are quoted here so they cannot expand against files in
    # the current directory; they are expanded below, inside the unpacked tree.
    for p in \
      'libsasl2-2_*_amd64.deb' \
      'libsasl2-modules-db_*_amd64.deb' \
      'libldap-2.5-0_*_amd64.deb' \
      'libidn2-0_*_amd64.deb' \
      'libbrotli1_*_amd64.deb' \
      'libssl3_*_amd64.deb'; do
      # $p is intentionally unquoted so that it globs; take the first match.
      matches=("$tmpdir"/curl/$p)
      src="${matches[0]:-}"
      [[ -f "$src" ]] || continue
      base="$(basename "$src")"
      # Report "staged" only when a copy was actually made.
      if [[ ! -f "$FLB_BUILD_PACKAGES_DIR/$base" ]]; then
        cp "$src" "$FLB_BUILD_PACKAGES_DIR/"
        echo " [+] staged $base"
      fi
    done
  fi
  rm -rf "$tmpdir"
fi

View File

@ -4,4 +4,4 @@
/client-plugins/demo-all-in-one/publish/
/client-plugins/demo-all-in-one/checklist
/client-plugins/demo-all-in-one/VERSION
/client-plugins/all-in-one-full/
/client-plugins/all-in-one-full/artifact/

View File

@ -104,7 +104,26 @@ log_info "文件所有者: $OWNER"
# 确保发布目录存在
log_info "确保发布目录存在: $PUBLISH_DIR"
sudo mkdir -p "$PUBLISH_DIR"
mkdir -p "$PUBLISH_DIR"
IFS=':' read -r OWNER_UID OWNER_GID <<< "$OWNER"
if [[ -z "$OWNER_UID" || -z "$OWNER_GID" ]]; then
log_error "--owner 格式不正确,应为 uid:gid"
exit 1
fi
CURRENT_UID=$(id -u)
CURRENT_GID=$(id -g)
if [[ "$OWNER_UID" != "$CURRENT_UID" || "$OWNER_GID" != "$CURRENT_GID" ]]; then
if [[ "$CURRENT_UID" -ne 0 ]]; then
log_error "当前用户 (${CURRENT_UID}:${CURRENT_GID}) 无法设置所有者为 ${OWNER_UID}:${OWNER_GID}"
log_error "请以目标用户运行脚本或预先调整目录权限"
exit 1
fi
NEED_CHOWN=true
else
NEED_CHOWN=false
fi
# 创建临时目录用于打包
TEMP_PACKAGE_DIR="/tmp/argus-metric-package-$$"
@ -208,26 +227,31 @@ fi
TAR_NAME="argus-metric_$(echo $VERSION | tr '.' '_').tar.gz"
log_info "创建发布包: $TAR_NAME"
cd "$TEMP_PACKAGE_DIR"
sudo tar -czf "$PUBLISH_DIR/$TAR_NAME" *
tar -czf "$PUBLISH_DIR/$TAR_NAME" *
cd - > /dev/null
# 设置文件所有者
log_info "设置文件所有者为: $OWNER"
sudo chown "$OWNER" "$PUBLISH_DIR/$TAR_NAME"
if [[ "$NEED_CHOWN" == true ]]; then
log_info "设置文件所有者为: $OWNER"
chown "$OWNER" "$PUBLISH_DIR/$TAR_NAME"
fi
# 清理临时目录
rm -rf "$TEMP_PACKAGE_DIR"
# 更新 LATEST_VERSION 文件
log_info "更新 LATEST_VERSION 文件..."
echo "$VERSION" | sudo tee "$PUBLISH_DIR/LATEST_VERSION" > /dev/null
sudo chown "$OWNER" "$PUBLISH_DIR/LATEST_VERSION"
echo "$VERSION" > "$PUBLISH_DIR/LATEST_VERSION"
if [[ "$NEED_CHOWN" == true ]]; then
chown "$OWNER" "$PUBLISH_DIR/LATEST_VERSION"
fi
# 复制 DNS 配置文件到发布目录根目录(直接从 config 目录复制)
if [[ -f "config/dns.conf" ]]; then
log_info "复制 DNS 配置文件到发布目录根目录..."
sudo cp "config/dns.conf" "$PUBLISH_DIR/"
sudo chown "$OWNER" "$PUBLISH_DIR/dns.conf"
cp "config/dns.conf" "$PUBLISH_DIR/"
if [[ "$NEED_CHOWN" == true ]]; then
chown "$OWNER" "$PUBLISH_DIR/dns.conf"
fi
log_success "DNS 配置文件复制完成: $PUBLISH_DIR/dns.conf"
else
log_warning "未找到 config/dns.conf 文件,跳过 DNS 配置文件复制"
@ -236,8 +260,10 @@ fi
# 复制 setup.sh 到发布目录
if [[ -f "scripts/setup.sh" ]]; then
log_info "复制 setup.sh 到发布目录..."
sudo cp "scripts/setup.sh" "$PUBLISH_DIR/"
sudo chown "$OWNER" "$PUBLISH_DIR/setup.sh"
cp "scripts/setup.sh" "$PUBLISH_DIR/"
if [[ "$NEED_CHOWN" == true ]]; then
chown "$OWNER" "$PUBLISH_DIR/setup.sh"
fi
fi
# 显示发布结果

View File

@ -0,0 +1,59 @@
# 客户侧组件安装包构建、发布流程
## 第一步:配置版本和组件
首先搞定配置文件:
1. 把 `.checklist.example` 重命名成 `checklist`
2. 把 `.VERSION.example` 重命名成 `VERSION`
### checklist 文件格式
```
# 组件名称 目录路径 版本号 [依赖组件] [安装顺序]
dcgm-exporter-installer /path/to/dcgm-exporter-installer 1.1.0
node-exporter-installer /path/to/node-exporter-installer 1.1.0
```
### VERSION 文件
设置需要发布的版本号,比如 `1.29.0`
> 建议用 `version-manager.sh` 来管理版本
## 第二步:构建安装包
直接跑脚本:
```bash
./package_artifact.sh
```
构建完的东西会放在 `artifact/` 目录下,按版本分文件夹。
如果版本已经存在了,想要覆盖重新构建:
```bash
./package_artifact.sh --force
```
构建完可以手工测试安装包。
## 第三步:发布安装包
用这个脚本发布:
```bash
./publish_artifact.sh
```
发布后的内容在 `publish/` 目录里,包含:
- 压缩版本的安装包
- 一键安装的bash脚本
## 第四步:部署到FTP服务器
把发布的内容上传到FTP服务器,客户端就可以通过一键命令安装:
```bash
curl -fsSL http://your-ftp-server/install.sh | sh -
curl -fsSL "ftp://ftpuser:{PASSWD}@10.211.55.4/share/setup.sh" | sudo bash -s -- --server 10.211.55.4 --user ftpuser --password {PASSWD}
```
这样客户就能直接从FTP服务器下载并安装组件了。

View File

@ -0,0 +1 @@
1.29.0

View File

@ -0,0 +1,3 @@
# 组件名称 目录路径 版本号 [依赖组件] [安装顺序]
dcgm-exporter-installer /Users/sundapeng/Project/nlp/aiops/client-plugins/dcgm-exporter-installer 1.1.0
node-exporter-installer /Users/sundapeng/Project/nlp/aiops/client-plugins/node-exporter-installer 1.1.0

View File

@ -0,0 +1 @@
1.35.0

View File

@ -0,0 +1,5 @@
# 组件名称 目录路径 版本号 [依赖组件] [安装顺序]
argus-agent plugins/argus-agent 1.0.0
node-exporter plugins/node-exporter 1.0.0
dcgm-exporter plugins/dcgm-exporter 1.0.0
fluent-bit plugins/fluent-bit 1.0.0

View File

@ -0,0 +1,14 @@
# Elasticsearch
ES_HOST=es.log.argus.com
ES_PORT=9200
# Argus-Agent
# 连接master服务
MASTER_ENDPOINT=master.argus.com:3000
# 上报状态间隔(秒)
REPORT_INTERVAL_SECONDS=5
# FTP
FTP_SERVER=172.31.0.40
FTP_USER=ftpuser
FTP_PASSWORD=ZGClab1234!

View File

@ -0,0 +1,8 @@
# Argus Metric 配置文件示例
# 复制此文件为 config.env 并根据需要修改配置
# 连接master服务
MASTER_ENDPOINT=master.argus.com:3000
# 上报状态间隔(秒)
REPORT_INTERVAL_SECONDS=60

View File

@ -0,0 +1 @@
172.31.0.2

View File

@ -0,0 +1,94 @@
# Argus Agent 插件
这是 Argus Agent 的安装和管理插件,提供了完整的安装、卸载、健康检查功能。
## 文件结构
```
argus-agent/
├── bin/
│ └── argus-agent # Argus Agent 二进制文件
├── config/ # 配置文件目录
├── install.sh # 安装脚本
├── uninstall.sh # 卸载脚本
├── check_health.sh # 健康检查脚本
├── package.sh # 打包脚本
└── README.md # 说明文档
```
## 使用方法
### 安装
```bash
sudo ./install.sh
```
安装脚本会:
- 检查系统要求
- 停止可能运行的服务
- 安装二进制文件到 `/usr/local/bin/argus-agent`
- 创建 `argus-agent` 用户
- 创建配置和数据目录
- 启动服务并记录 PID
### 卸载
```bash
sudo ./uninstall.sh
```
卸载脚本会:
- 停止所有 argus-agent 进程
- 删除二进制文件
- 删除配置和数据目录
- 清理日志文件
- 更新安装记录
### 健康检查
```bash
./check_health.sh
```
健康检查脚本会:
- 检查安装记录中的 PID
- 验证进程是否正在运行
- 输出 JSON 格式的健康状态
### 打包
```bash
./package.sh
```
打包脚本会:
- 检查所有必要文件
- 创建时间戳命名的压缩包
- 输出安装包信息
## 安装后的文件位置
- 二进制文件: `/usr/local/bin/argus-agent`
- 配置目录: `/etc/argus-agent/`
- 数据目录: `/var/lib/argus-agent/`
- 日志文件: `/var/log/argus-agent.log`
- PID 文件: `/var/run/argus-agent.pid`
- 安装记录: `/opt/argus-metric/current/.install_record`
## 健康检查输出格式
```json
{
"name": "argus-agent",
"status": "health|unhealth",
"reason": "状态说明"
}
```
## 注意事项
1. 安装和卸载脚本需要 root 权限
2. 健康检查脚本使用安装记录中的 PID 来验证进程状态
3. 如果 jq 命令不可用,健康检查会使用简单的文本解析
4. 卸载时会保留 `argus-agent` 用户,避免影响其他服务

Binary file not shown.

View File

@ -0,0 +1,69 @@
#!/bin/bash
# Argus Agent 健康检查脚本
# 输出 JSON 格式结果
set -e
# Check whether the Argus Agent is alive and print the verdict as one JSON line.
# Lookup order:
#   1. If the install record exists, read the PID stored for "argus-agent"
#      (with jq, or with grep/cut when jq is unavailable) and check that a
#      process with that PID exists.
#   2. If there is no install record, look for a running process whose name
#      is exactly "argus-agent".
# Outputs: {"name": ..., "status": "health"|"unhealth", "reason": ...} on stdout
# Exits:   0 when healthy, 1 otherwise (this function exits, it never returns)
check_health() {
  local name="argus-agent"
  local status="unhealth"
  local reason=""
  local install_record="/opt/argus-metric/current/.install_record"
  local exit_code=1
  local pid=""
  local pids=""

  if [[ -f "$install_record" ]]; then
    if command -v jq &> /dev/null; then
      # Parse the JSON install record with jq.
      pid=$(jq -r '.components."argus-agent".pid // empty' "$install_record" 2>/dev/null || echo "")
    else
      # Without jq, fall back to simple text parsing.
      pid=$(grep -A 10 '"argus-agent"' "$install_record" | grep '"pid"' | cut -d'"' -f4 | head -1)
    fi

    if [[ ! "$pid" =~ ^[0-9]+$ ]]; then
      reason="安装记录文件中未找到有效的 argus-agent PID"
    elif kill -0 "$pid" 2>/dev/null || ps -p "$pid" > /dev/null 2>&1; then
      # `kill -0` alone fails with EPERM when an unprivileged user checks a
      # process owned by another user, so `ps -p` confirms existence too.
      status="health"
      exit_code=0
      reason="进程运行正常 (PID: $pid)"
    else
      reason="安装记录中的 PID $pid 进程不存在"
    fi
  else
    # -x matches the exact process name. -f would match any command line that
    # merely contains "argus-agent", e.g. this script run from the plugin dir.
    pids=$(pgrep -x "argus-agent" 2>/dev/null || true)
    if [[ -n "$pids" ]]; then
      # Report the first PID found.
      pid=${pids%%$'\n'*}
      status="health"
      exit_code=0
      reason="发现 argus-agent 进程运行 (PID: $pid),但未找到安装记录"
    else
      reason="未找到 argus-agent 进程,且安装记录文件不存在"
    fi
  fi

  echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}"
  exit "$exit_code"
}
# Main function: runs the health check, which prints the JSON result and
# terminates the script with the matching exit code.
main() {
check_health
}
# Script entry point: call main only when this file is executed directly,
# not when it is sourced by another script.
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
main "$@"
fi

View File

@ -0,0 +1,289 @@
#!/bin/bash
set -e
# 颜色定义
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Logging helpers: print "$1" on stdout behind a colored level tag.
# Backslash escapes in the message are interpreted, as with `echo -e`.
log_info() {
  printf '%b\n' "${BLUE}[INFO]${NC} $1"
}
log_success() {
  printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
}
log_warning() {
  printf '%b\n' "${YELLOW}[WARNING]${NC} $1"
}
log_error() {
  printf '%b\n' "${RED}[ERROR]${NC} $1"
}
# Print the usage text on stdout ($0 is shown as the script name).
show_help() {
  cat <<EOF
Argus Agent 安装脚本

用法: $0 [选项]

选项:
 --help 显示此帮助信息

示例:
 $0 # 安装 Argus Agent

EOF
}
# Parse the command-line arguments:
#   --help | -h       print the usage text and exit 0
#   other --option    rejected as unknown (usage text, exit 1)
#   anything else     taken as the install directory (the last one wins)
INSTALL_DIR=""
for arg in "$@"; do
  if [[ "$arg" == "--help" || "$arg" == "-h" ]]; then
    show_help
    exit 0
  fi
  if [[ "$arg" == --* ]]; then
    log_error "未知参数: $arg"
    show_help
    exit 1
  fi
  INSTALL_DIR="$arg"
done
# Abort with exit code 1 unless the script runs as root (EUID 0).
check_root() {
  (( EUID == 0 )) && return 0

  log_error "此脚本需要 root 权限运行"
  log_info "请使用: sudo $0"
  exit 1
}
# Check the host: report the operating system and CPU architecture, and warn
# when either is outside the set this installer mainly targets.
# Exits 1 when /etc/os-release is missing. Sourcing that file defines NAME,
# VERSION, ID (and its other keys) in the calling shell.
check_system() {
  local arch

  log_info "检查系统要求..."

  # The OS is identified through /etc/os-release.
  if [[ ! -f /etc/os-release ]]; then
    log_error "无法检测操作系统版本"
    exit 1
  fi
  source /etc/os-release
  log_info "检测到操作系统: ${NAME:-} ${VERSION:-}"

  # Only warn for distributions outside the list of common ones.
  if [[ "$ID" != "ubuntu" && "$ID" != "debian" && "$ID" != "centos" && "$ID" != "rhel" && "$ID" != "fedora" ]]; then
    log_warning "此脚本主要针对常见 Linux 发行版,其他系统可能需要调整"
  fi

  arch=$(uname -m)
  log_info "系统架构: $arch"
  if [[ "$arch" != "x86_64" && "$arch" != "amd64" ]]; then
    # Braces are required: "$archargus" would name a different, unset variable.
    log_warning "当前架构为 ${arch}, argus-agent 主要支持 x86_64/amd64"
  fi
}
# Stop any argus-agent instance that may still be running.
# Works in three passes: the PID recorded in the PID file, any remaining
# process named exactly argus-agent, and finally zombie argus-agent entries.
# Kill failures are ignored throughout (best effort).
stop_existing_service() {
log_info "检查并停止可能运行的服务..."
local pid_file="/var/run/argus-agent.pid"
# Pass 1: the PID from the PID file, but only if that PID currently belongs
# to a process whose command name is exactly argus-agent.
if [[ -f "$pid_file" ]]; then
local pid=$(cat "$pid_file")
if ps -p "$pid" -o comm= | grep -q "^argus-agent$"; then
# Send SIGTERM first, wait two seconds, then send SIGKILL.
kill "$pid" 2>/dev/null || true
sleep 2
kill -9 "$pid" 2>/dev/null || true
log_success "服务已停止"
fi
# The PID file is removed whether or not a process was stopped.
rm -f "$pid_file"
fi
# Pass 2: SIGKILL every remaining process named exactly argus-agent.
local pids=$(pgrep -x argus-agent 2>/dev/null || true)
if [[ -n "$pids" ]]; then
for pid in $pids; do kill -9 "$pid" 2>/dev/null || true; done
fi
# Pass 3: look for zombie processes (state column contains Z).
local zombies=$(ps -eo pid,stat,comm | grep '[a]rgus-agent' | awk '$2 ~ /Z/ {print $1}')
if [[ -n "$zombies" ]]; then
for pid in $zombies; do
local ppid=$(ps -o ppid= -p $pid)
log_warning "检测到僵尸 argus-agent (PID=$pid, PPID=$ppid),尝试清理"
# A zombie cannot be killed itself, so its parent is sent SIGKILL unless
# the parent is PID 1.
# NOTE(review): this kills whatever process is the parent - confirm that
# this is acceptable when the agent was started by a supervisor.
[[ "$ppid" -ne 1 ]] && kill -9 "$ppid" 2>/dev/null || true
done
fi
}
# Install the Argus Agent binary from ./bin/argus-agent to
# /usr/local/bin/argus-agent.
# Exits 1 when the source binary is missing. Running instances are stopped
# first, and the function waits up to 10 seconds for them to disappear.
install_argus_agent() {
  log_info "安装 Argus Agent..."

  local src_bin="bin/argus-agent"
  local dest_dir="/usr/local/bin"
  local dest_bin="$dest_dir/argus-agent"
  local staged_bin="${dest_bin}.new"
  local timeout

  if [[ ! -f "$src_bin" ]]; then
    log_error "找不到 Argus Agent 二进制文件: $src_bin"
    exit 1
  fi

  stop_existing_service

  # Poll once per second until no argus-agent process is left; stop waiting
  # early when the remaining entry is a zombie.
  for (( timeout = 10; timeout > 0; timeout-- )); do
    remaining_pids=$(pgrep -x argus-agent | grep -vw $$ || true)
    if [[ -z "$remaining_pids" ]]; then
      break
    fi
    if ps -eo pid,stat,comm | grep -E 'argus-agent' | grep -q 'Z'; then
      log_warning "检测到僵尸 argus-agent跳过等待"
      break
    fi
    log_warning "等待 argus-agent 完全退出... ($timeout)"
    sleep 1
  done

  # Copy next to the target and rename it into place.
  cp "$src_bin" "$staged_bin"
  chmod +x "$staged_bin"
  mv -f "$staged_bin" "$dest_bin"

  log_success "Argus Agent 二进制文件安装完成"
}
# Create the argus-agent system user (no home directory, /bin/false shell)
# unless it already exists.
create_user() {
  log_info "创建 argus-agent 用户..."

  if ! id "argus-agent" &>/dev/null; then
    useradd --no-create-home --shell /bin/false argus-agent
    log_success "用户 argus-agent 创建完成"
    return
  fi

  log_info "用户 argus-agent 已存在"
}
# Create the configuration directory and the health-check directory, and
# hand the health-check directory to the argus-agent user.
install_config() {
  local etc_dir="/etc/argus-agent"
  local health_dir="/var/lib/argus-agent/health"

  log_info "安装配置文件..."

  mkdir -p "$etc_dir"
  mkdir -p "$health_dir"
  chown argus-agent:argus-agent "$health_dir"
}
# Start the Argus Agent in the background in its own session and record its PID.
# Files:   output goes to /var/log/argus-agent.log,
#          the PID is written to /var/run/argus-agent.pid
# Returns: 0 when the process is still alive two seconds after launch;
#          1 otherwise, so the caller no longer reports a successful install
#          for an agent that failed to start.
start_argus_agent() {
  log_info "启动 Argus Agent 服务..."

  local binary_path="/usr/local/bin/argus-agent"
  local log_file="/var/log/argus-agent.log"
  local pid_file="/var/run/argus-agent.pid"
  local pid

  # Drop any stale PID file before launching.
  rm -f "$pid_file"

  log_info "正在启动 Argus Agent..."
  setsid "$binary_path" > "$log_file" 2>&1 < /dev/null &
  pid=$!
  echo "$pid" > "$pid_file"

  # Give the process a moment; an immediate crash is detected below.
  sleep 2
  if kill -0 "$pid" 2>/dev/null; then
    log_success "Argus Agent 服务启动成功 (PID: $pid)"
    return 0
  fi

  log_error "Argus Agent 启动失败"
  # Show the end of the log to help diagnose the failure (best effort).
  if [[ -f "$log_file" ]]; then
    tail -n 10 "$log_file" || true
  fi
  rm -f "$pid_file"
  return 1
}
# Update the argus-agent PID stored in the install record.
# Arguments:
#   $1 - new PID to store
#   $2 - install base directory (default /opt/argus-metric/current)
# Returns: 0 when the record is absent, when jq is unavailable, or after the
#          update attempt; 1 when the current PID cannot be read from the record.
update_install_record() {
  local new_pid="$1"
  local base_dir="${2:-/opt/argus-metric/current}"
  local record="$base_dir/.install_record"
  local old_pid=""

  # No record yet: first install, the main installer creates the file.
  if [[ ! -f "$record" ]]; then
    log_info "安装记录文件不存在,将由主安装脚本创建"
    return 0
  fi

  # The record exists (restart scenario); only the pid field is updated.
  if ! command -v jq &> /dev/null; then
    log_warning "jq 命令不可用,无法更新安装记录文件"
    return
  fi

  # A jq failure simply leaves the value empty.
  old_pid=$(jq -r '.components."argus-agent".pid // ""' "$record" 2>/dev/null) || true
  if [[ -z "$old_pid" ]]; then
    log_warning "无法读取当前 PID跳过更新"
    return 1
  fi

  # Rewrite only the pid field, as a string; all other fields are preserved.
  jq --arg new_pid "$new_pid" '.components."argus-agent".pid = $new_pid' "$record" > "$record.tmp" && mv "$record.tmp" "$record"
  log_info "PID 已更新: $old_pid -> $new_pid"
}
# Print the post-install summary: installed paths, how to start the agent
# manually, and how to run the health check.
show_install_info() {
  log_success "Argus Agent 安装完成!"
  cat <<'EOF'

安装信息:
 二进制文件: /usr/local/bin/argus-agent
 运行用户: argus-agent
 配置目录: /etc/argus-agent/
 健康检查目录: /var/lib/argus-agent/health

使用方法:
 手动启动: /usr/local/bin/argus-agent
 后台启动: nohup /usr/local/bin/argus-agent &

健康检查:
 ./check_health.sh

EOF
}
# Main function: print the banner, then run the installation steps in order.
main() {
  local rule="=========================================="

  printf '%s\n' "$rule" " Argus Agent 安装脚本 v1.0" "$rule" ""

  check_root
  check_system

  log_info "开始安装 Argus Agent..."

  install_argus_agent
  create_user
  install_config
  start_argus_agent
  show_install_info
}
# Script entry point: call main only when this file is executed directly,
# not when it is sourced by another script.
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
main "$@"
fi

View File

@ -0,0 +1,87 @@
#!/bin/bash
# Build a timestamped Argus Agent installer tarball from the current directory.
# Run it from the directory that holds install.sh, uninstall.sh,
# check_health.sh and bin/argus-agent; the tarball is written to that directory.
set -e

# Colour definitions
GREEN='\033[0;32m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

log_info() {
  echo -e "${BLUE}[INFO]${NC} $1"
}

log_success() {
  echo -e "${GREEN}[SUCCESS]${NC} $1"
}

# The tarball is moved back into the directory the script was started from.
CURRENT_DIR=$(pwd)
PACKAGE_NAME="argus-agent-$(date +%Y%m%d-%H%M%S)"
PACKAGE_FILE="${PACKAGE_NAME}.tar.gz"

log_info "开始打包 Argus Agent 安装包..."

# Verify that every file the installer needs is present before packaging.
log_info "检查必要文件..."
required_files=(
  "install.sh"
  "uninstall.sh"
  "bin/argus-agent"
  "check_health.sh"
)

missing_files=()
for file in "${required_files[@]}"; do
  if [[ ! -f "$file" ]]; then
    missing_files+=("$file")
  fi
done

if [[ ${#missing_files[@]} -gt 0 ]]; then
  echo "缺少以下文件:"
  for file in "${missing_files[@]}"; do
    echo " - $file"
  done
  exit 1
fi

log_success "所有必要文件检查完成"

# Stage a copy in a private temp directory. The EXIT trap removes it on every
# exit path, including a set -e abort in cp/tar/mv.
TEMP_DIR=$(mktemp -d)
trap 'rm -rf -- "$TEMP_DIR"' EXIT
log_info "创建临时目录: $TEMP_DIR"

cp -r . "$TEMP_DIR/$PACKAGE_NAME"

# tar -C archives relative to the staging directory, so the working directory
# never changes and the relative paths used below stay valid.
log_info "创建压缩包: $PACKAGE_FILE"
tar -czf "$TEMP_DIR/$PACKAGE_FILE" -C "$TEMP_DIR" "$PACKAGE_NAME"

mv "$TEMP_DIR/$PACKAGE_FILE" "$CURRENT_DIR/"

# Report the result and the follow-up steps.
log_success "打包完成!"
echo
echo "安装包文件: $PACKAGE_FILE"
echo "文件大小: $(du -h "$PACKAGE_FILE" | cut -f1)"
echo
echo "使用方法:"
echo "1. 将 $PACKAGE_FILE 传输到目标服务器"
echo "2. 解压: tar -xzf $PACKAGE_FILE"
echo "3. 进入目录: cd $PACKAGE_NAME"
echo "4. 运行安装: sudo ./install.sh"
echo
echo "注意: 请确保所有必要文件都存在"

View File

@ -0,0 +1,255 @@
#!/bin/bash
# Argus Agent 卸载脚本
# 版本: 1.0
# 作者: AIOps Team
# 日期: $(date +%Y-%m-%d)
set -e
# 颜色定义
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Logging helpers: each prints its first argument behind a coloured level tag.
# The colour variables (RED, GREEN, YELLOW, BLUE, NC) are read at call time.

# $1 - colour escape, $2 - level tag, $3 - message
_log_line() {
  printf '%b\n' "${1}[${2}]${NC} ${3}"
}

log_info() {
  _log_line "$BLUE" "INFO" "$1"
}

log_success() {
  _log_line "$GREEN" "SUCCESS" "$1"
}

log_warning() {
  _log_line "$YELLOW" "WARNING" "$1"
}

log_error() {
  _log_line "$RED" "ERROR" "$1"
}
# Abort with exit status 1 unless the script runs as root (EUID 0).
check_root() {
  (( EUID == 0 )) && return 0

  log_error "此脚本需要 root 权限运行"
  log_info "请使用: sudo $0"
  exit 1
}
# List live PIDs whose command line mentions argus-agent, one per line.
# pgrep -f matches full command lines, so this script (and the sudo/shell that
# launched it) also match when the script path contains "argus-agent"; those
# PIDs and the short-lived subshells spawned for the lookup are filtered out.
_argus_agent_pids() {
  local candidate
  for candidate in $(pgrep -f "argus-agent" 2>/dev/null || true); do
    case "$candidate" in
      "$$" | "$PPID" | "$BASHPID") continue ;;
    esac
    # Drop PIDs that are already gone (e.g. the subshell that ran pgrep).
    kill -0 "$candidate" 2>/dev/null || continue
    echo "$candidate"
  done
}

# Stop every running Argus Agent process: first the one named in the PID file,
# then any remaining process matching argus-agent. TERM is sent first, KILL
# only to processes that are still alive after a grace period. The PID file is
# removed in all cases.
stop_processes() {
  log_info "停止 Argus Agent 进程..."
  local pid_file="/var/run/argus-agent.pid"
  local stopped=false
  local pid pids remaining_pids

  # Step 1: stop the service recorded in the PID file.
  if [[ -f "$pid_file" ]]; then
    pid=$(cat "$pid_file" 2>/dev/null) || pid=""
    if kill -0 "$pid" 2>/dev/null; then
      log_info "通过 PID 文件停止服务 (PID: $pid)..."
      # The process may exit between the probe and the signal; that must not
      # abort the uninstall under set -e.
      kill "$pid" 2>/dev/null || true
      sleep 3
      if kill -0 "$pid" 2>/dev/null; then
        log_warning "进程未响应,强制终止..."
        kill -9 "$pid" 2>/dev/null || true
      fi
      log_success "Argus Agent 进程已停止"
      stopped=true
    else
      log_warning "PID 文件存在但进程已不存在,清理 PID 文件"
      rm -f "$pid_file"
    fi
  fi

  # Step 2: stop any other argus-agent process (never this script itself).
  pids=$(_argus_agent_pids)
  if [[ -n "$pids" ]]; then
    log_info "发现 argus-agent 进程,正在停止..."
    for pid in $pids; do
      log_info "停止进程 PID: $pid"
      kill "$pid" 2>/dev/null || true
    done
    sleep 2

    remaining_pids=$(_argus_agent_pids)
    if [[ -n "$remaining_pids" ]]; then
      log_warning "进程未响应,强制终止..."
      for pid in $remaining_pids; do
        log_info "强制终止进程 PID: $pid"
        kill -9 "$pid" 2>/dev/null || true
      done
      sleep 1
    fi

    # Final verification.
    if [[ -n "$(_argus_agent_pids)" ]]; then
      log_error "无法停止所有 argus-agent 进程"
    else
      log_success "所有 Argus Agent 进程已停止"
      stopped=true
    fi
  else
    log_info "Argus Agent 进程未运行"
  fi

  rm -f "$pid_file"

  if [[ "$stopped" == "false" ]]; then
    log_warning "未发现需要停止的 Argus Agent 进程"
  fi
}
# Delete the installed Argus Agent binary, reporting each file removed, or
# that nothing was found.
remove_binary() {
  log_info "删除 Argus Agent 二进制文件..."

  local removed_count=0
  local target
  for target in "/usr/local/bin/argus-agent"; do
    [[ -f "$target" ]] || continue
    rm -f "$target"
    log_success "二进制文件已删除: $target"
    removed_count=$((removed_count + 1))
  done

  if (( removed_count == 0 )); then
    log_info "二进制文件不存在"
  fi
}
# Remove the Argus Agent configuration directory if it exists.
remove_config() {
  log_info "删除配置文件..."
  local config_dir="/etc/argus-agent"

  if [[ ! -d "$config_dir" ]]; then
    log_info "配置目录不存在"
    return
  fi

  rm -rf "$config_dir"
  log_success "配置目录已删除"
}
# Remove the Argus Agent data directory if it exists.
remove_data_dir() {
  log_info "删除数据目录..."
  local data_dir="/var/lib/argus-agent"

  if [[ ! -d "$data_dir" ]]; then
    log_info "数据目录不存在"
    return
  fi

  rm -rf "$data_dir"
  log_success "数据目录已删除"
}
# Report whether the argus-agent user exists. The user is never deleted here;
# the operator is only told how to remove it manually.
check_user_status() {
  log_info "检查 argus-agent 用户状态..."

  if ! id "argus-agent" &>/dev/null; then
    log_info "argus-agent 用户不存在"
    return
  fi

  log_info "检测到 argus-agent 用户存在"
  log_warning "argus-agent 是系统用户,可能被其他服务使用"
  log_info "为了系统稳定性,将保留 argus-agent 用户"
  log_info "如需手动删除,请运行: sudo userdel argus-agent"
}
# Delete the log file written by the install script.
cleanup_logs() {
  local agent_log="/var/log/argus-agent.log"

  log_info "清理日志文件..."
  rm -f "$agent_log"
  log_success "日志文件已清理"
}
# Remove the argus-agent entry from the install record.
# Arguments:
#   $1 - install base directory (optional; defaults to
#        /opt/argus-metric/current, the location used when called without
#        arguments)
# Best effort: a missing record, a missing jq or a failed rewrite only produce
# a log line; the existing record is never truncated or half-written.
cleanup_install_record() {
  log_info "清理安装记录..."
  local install_base_dir="${1:-/opt/argus-metric/current}"
  local install_record="$install_base_dir/.install_record"
  local tmp_record="$install_record.tmp"

  if [[ ! -f "$install_record" ]]; then
    log_info "安装记录文件不存在"
    return 0
  fi

  if ! command -v jq &> /dev/null; then
    log_warning "jq 命令不可用,无法清理安装记录"
    return 0
  fi

  # Only replace the record, and only report success, when jq actually
  # produced the new content; otherwise discard the temp file.
  if jq 'del(.components."argus-agent")' "$install_record" > "$tmp_record" \
    && mv "$tmp_record" "$install_record"; then
    log_success "安装记录已更新"
  else
    rm -f "$tmp_record"
    log_warning "安装记录清理失败,保留原有记录: $install_record"
  fi
  return 0
}
# Print the post-uninstall summary: what was removed and what was kept.
show_uninstall_info() {
  log_success "Argus Agent 卸载完成!"
  # Static text, so a quoted here-document keeps it literal.
  cat <<'SUMMARY'

已删除的内容:
 - 二进制文件: /usr/local/bin/argus-agent
 - 配置目录: /etc/argus-agent
 - 数据目录: /var/lib/argus-agent
 - 相关日志文件

注意:
 - argus-agent 用户已保留(系统用户,可能被其他服务使用)
 - 如需完全清理,请手动检查并删除相关文件

SUMMARY
}
# Uninstaller entry point: print the banner, require root, ask for
# confirmation, then run the removal steps in order.
main() {
  local rule="=========================================="
  printf '%s\n' "$rule" " Argus Agent 卸载脚本 v1.0" "$rule" ""

  check_root

  log_warning "此操作将完全卸载 Argus Agent"
  read -p "确认继续?(y/N): " confirm
  case "$confirm" in
    y | Y) ;;
    *)
      log_info "取消卸载操作"
      exit 0
      ;;
  esac

  log_info "开始卸载 Argus Agent..."
  stop_processes
  remove_binary
  remove_config
  remove_data_dir
  cleanup_logs
  cleanup_install_record

  # The user account is only inspected, never removed.
  check_user_status

  show_uninstall_info
}
# 脚本入口
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
main "$@"
fi

Binary file not shown.

View File

@ -0,0 +1,55 @@
#!/bin/bash
# DCGM Exporter 健康检查脚本
# 输出 JSON 格式结果
set -e
# Fetch a URL and print its HTTP status code as exactly three digits.
# Prints "000" when curl fails or returns anything unexpected. curl itself
# already prints 000 on connection failure, so no second fallback value may be
# appended to its output.
# Arguments: $1 - URL to probe
_dcgm_http_code() {
  local code
  code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 10 "$1" 2>/dev/null) || true
  [[ "$code" =~ ^[0-9]{3}$ ]] || code="000"
  echo "$code"
}

# Check DCGM Exporter health and print a single-line JSON result
# ({"name", "status", "reason"}). Exits 0 when both the root path and the
# /metrics endpoint answer HTTP 200, exits 1 otherwise.
check_health() {
  local url="http://localhost:9400"
  local metrics_url="$url/metrics"
  local name="dcgm-exporter"
  local status="unhealth"
  local reason=""
  local http_code metrics_code

  # curl is required for the probes.
  if ! command -v curl &> /dev/null; then
    reason="curl 命令不可用,无法进行健康检查"
    echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}"
    exit 1
  fi

  # Probe the root path first.
  http_code=$(_dcgm_http_code "$url")
  if [[ "$http_code" != "200" ]]; then
    reason="HTTP 服务异常 (HTTP $http_code),请检查 DCGM Exporter 是否正在运行在端口 9400"
    echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}"
    exit 1
  fi

  # Then the metrics endpoint.
  metrics_code=$(_dcgm_http_code "$metrics_url")
  if [[ "$metrics_code" != "200" ]]; then
    reason="Metrics 端点异常 (HTTP $metrics_code)"
    echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}"
    exit 1
  fi

  status="health"
  reason="success"
  echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}"
  exit 0
}
# Entry point: delegates to check_health, which prints the JSON result and
# terminates the script with the matching exit status.
main() {
check_health
}
# 脚本入口
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
main "$@"
fi

View File

@ -0,0 +1,77 @@
# Format
# If line starts with a '#' it is considered a comment
# DCGM FIELD, Prometheus metric type, help message
# Clocks
DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz).
DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz).
# DCGM_EXP_CLOCK_EVENTS_COUNT, gauge, Count of clock events within the user-specified time window (see clock-events-count-window-size param).
# Temperature
DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C).
DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C).
# Power
DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W).
DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ).
# PCIE
DCGM_FI_PROF_PCIE_TX_BYTES, counter, Total number of bytes transmitted through PCIe TX via NVML.
DCGM_FI_PROF_PCIE_RX_BYTES, counter, Total number of bytes received through PCIe RX via NVML.
DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries.
# Utilization (the sample period varies depending on the product)
DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %).
DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %).
DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %).
DCGM_FI_DEV_DEC_UTIL , gauge, Decoder utilization (in %).
# Errors and violations
DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered.
# DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us).
# DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us).
# DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us).
# DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us).
# DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us).
# DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us).
# DCGM_EXP_XID_ERRORS_COUNT, gauge, Count of XID Errors within user-specified time window (see xid-count-window-size param).
# Memory usage
DCGM_FI_DEV_FB_FREE, gauge, Frame buffer memory free (in MB).
DCGM_FI_DEV_FB_USED, gauge, Frame buffer memory used (in MB).
# ECC
# DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors.
# DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors.
# DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors.
# DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors.
# Retired pages
# DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors.
# DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors.
# DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement.
# NVLink
# DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors.
# DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors.
# DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries.
# DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors.
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes
# VGPU License status
DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status
# Remapped rows
DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors
DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors
DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed
# Static configuration information. These appear as labels on the other metrics
DCGM_FI_DRIVER_VERSION, label, Driver Version
# DCGM_FI_NVML_VERSION, label, NVML Version
# DCGM_FI_DEV_BRAND, label, Device Brand
# DCGM_FI_DEV_SERIAL, label, Device Serial Number
# DCGM_FI_DEV_OEM_INFOROM_VER, label, OEM inforom version
# DCGM_FI_DEV_ECC_INFOROM_VER, label, ECC inforom version
# DCGM_FI_DEV_POWER_INFOROM_VER, label, Power management object inforom version
# DCGM_FI_DEV_INFOROM_IMAGE_VER, label, Inforom image version
# DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device
1 # Format
2 # If line starts with a '#' it is considered a comment
3 # DCGM FIELD, Prometheus metric type, help message
4 # Clocks
5 DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz).
6 DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz).
7 # DCGM_EXP_CLOCK_EVENTS_COUNT, gauge, Count of clock events within the user-specified time window (see clock-events-count-window-size param).
8 # Temperature
9 DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C).
10 DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C).
11 # Power
12 DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W).
13 DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ).
14 # PCIE
15 DCGM_FI_PROF_PCIE_TX_BYTES, counter, Total number of bytes transmitted through PCIe TX via NVML.
16 DCGM_FI_PROF_PCIE_RX_BYTES, counter, Total number of bytes received through PCIe RX via NVML.
17 DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries.
18 # Utilization (the sample period varies depending on the product)
19 DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %).
20 DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %).
21 DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %).
22 DCGM_FI_DEV_DEC_UTIL , gauge, Decoder utilization (in %).
23 # Errors and violations
24 DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered.
25 # DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us).
26 # DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us).
27 # DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us).
28 # DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us).
29 # DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us).
30 # DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us).
31 # DCGM_EXP_XID_ERRORS_COUNT, gauge, Count of XID Errors within user-specified time window (see xid-count-window-size param).
32 # Memory usage
33 DCGM_FI_DEV_FB_FREE, gauge, Frame buffer memory free (in MB).
34 DCGM_FI_DEV_FB_USED, gauge, Frame buffer memory used (in MB).
35 # ECC
36 # DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors.
37 # DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors.
38 # DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors.
39 # DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors.
40 # Retired pages
41 # DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors.
42 # DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors.
43 # DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement.
44 # NVLink
45 # DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors.
46 # DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors.
47 # DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries.
48 # DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors.
49 DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes
50 # VGPU License status
51 DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status
52 # Remapped rows
53 DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors
54 DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors
55 DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed
56 # Static configuration information. These appear as labels on the other metrics
57 DCGM_FI_DRIVER_VERSION, label, Driver Version
58 # DCGM_FI_NVML_VERSION, label, NVML Version
59 # DCGM_FI_DEV_BRAND, label, Device Brand
60 # DCGM_FI_DEV_SERIAL, label, Device Serial Number
61 # DCGM_FI_DEV_OEM_INFOROM_VER, label, OEM inforom version
62 # DCGM_FI_DEV_ECC_INFOROM_VER, label, ECC inforom version
63 # DCGM_FI_DEV_POWER_INFOROM_VER, label, Power management object inforom version
64 # DCGM_FI_DEV_INFOROM_IMAGE_VER, label, Inforom image version
65 # DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device

View File

@ -0,0 +1,365 @@
#!/bin/bash
set -e
# 颜色定义
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Logging helpers: each prints its first argument behind a coloured level tag.
# The colour variables (RED, GREEN, YELLOW, BLUE, NC) are read at call time.

# $1 - colour escape, $2 - level tag, $3 - message
_log_line() {
  printf '%b\n' "${1}[${2}]${NC} ${3}"
}

log_info() {
  _log_line "$BLUE" "INFO" "$1"
}

log_success() {
  _log_line "$GREEN" "SUCCESS" "$1"
}

log_warning() {
  _log_line "$YELLOW" "WARNING" "$1"
}

log_error() {
  _log_line "$RED" "ERROR" "$1"
}
# Update the dcgm-exporter PID stored in the install record (restart scenario).
# Arguments:
#   $1 - PID of the newly started dcgm-exporter process
#   $2 - install base directory; defaults to /opt/argus-metric/current
# Returns:
#   1 when the record exists but no current dcgm-exporter PID can be read from
#   it, 0 otherwise (including the best-effort cases where nothing is updated).
update_install_record() {
  local pid="$1"
  local install_base_dir="${2:-/opt/argus-metric/current}"
  local install_record="$install_base_dir/.install_record"
  local tmp_record="$install_record.tmp"
  local current_pid

  # No record yet: this is a first install and the main installer creates it.
  if [[ ! -f "$install_record" ]]; then
    log_info "安装记录文件不存在,将由主安装脚本创建"
    return 0
  fi

  if ! command -v jq &> /dev/null; then
    log_warning "jq 命令不可用,无法更新安装记录文件"
    return 0
  fi

  # Record exists, so this is a restart: only the pid field is rewritten.
  current_pid=$(jq -r '.components."dcgm-exporter".pid // ""' "$install_record" 2>/dev/null) || current_pid=""
  if [[ -z "$current_pid" ]]; then
    log_warning "无法读取当前 PID跳过更新"
    return 1
  fi

  # Write to a temp file first. The record is replaced, and success is
  # reported, only when jq really produced the new content; otherwise the
  # temp file is removed and the existing record is left untouched.
  if jq --arg new_pid "$pid" '.components."dcgm-exporter".pid = $new_pid' "$install_record" > "$tmp_record" \
    && mv "$tmp_record" "$install_record"; then
    log_info "PID 已更新: $current_pid -> $pid"
  else
    rm -f "$tmp_record"
    log_warning "安装记录更新失败,保留原有记录: $install_record"
  fi
  return 0
}
# Print usage information for the installer.
show_help() {
  # Unquoted delimiter on purpose: $0 must expand to the script name.
  cat <<USAGE
DCGM Exporter 安装脚本

用法: $0 [选项]

选项:
 --help 显示此帮助信息

示例:
 $0 # 安装 DCGM Exporter

USAGE
}
# Parse command-line arguments.
# --help/-h prints usage and exits; any other argument starting with "--" is
# rejected; every remaining argument is taken as the install directory (the
# last one wins).
INSTALL_DIR=""
for arg in "$@"; do
  case "$arg" in
    --help | -h)
      show_help
      exit 0
      ;;
    --*)
      log_error "未知参数: $arg"
      show_help
      exit 1
      ;;
    *)
      INSTALL_DIR="$arg"
      ;;
  esac
done
# Abort with exit status 1 unless the script runs as root (EUID 0).
check_root() {
  (( EUID == 0 )) && return 0

  log_error "此脚本需要 root 权限运行"
  log_info "请使用: sudo $0"
  exit 1
}
# Check system prerequisites: /etc/os-release must exist (exit 1 otherwise);
# a non-Ubuntu/Debian system and a missing nvidia-smi only produce warnings.
check_system() {
  log_info "检查系统要求..."

  # Operating system detection.
  if [[ ! -f /etc/os-release ]]; then
    log_error "无法检测操作系统版本"
    exit 1
  fi
  source /etc/os-release
  log_info "检测到操作系统: $NAME $VERSION"

  case "$ID" in
    ubuntu | debian) ;;
    *) log_warning "此脚本主要针对 Ubuntu/Debian 系统,其他系统可能需要调整" ;;
  esac

  # NVIDIA driver / GPU detection.
  if command -v nvidia-smi &> /dev/null; then
    log_success "检测到 NVIDIA GPU"
    nvidia-smi --query-gpu=name --format=csv,noheader,nounits | head -1
  else
    log_warning "未检测到 nvidia-smi请确保已安装 NVIDIA 驱动"
  fi
}
# Install the bundled DCGM .deb package. If the first dpkg run fails, repair
# dependencies through apt and install the package again.
install_dcgm_dependency() {
  log_info "安装 DCGM 依赖..."
  local deb_file="bin/datacenter-gpu-manager_3.3.9_amd64.deb"

  if [[ ! -f "$deb_file" ]]; then
    log_error "找不到 DCGM 依赖文件: $deb_file"
    exit 1
  fi

  if ! dpkg -i "$deb_file"; then
    log_warning "dpkg 安装失败,尝试使用 apt 修复依赖..."
    apt-get update
    apt-get install -f -y
    dpkg -i "$deb_file"
  fi

  log_success "DCGM 依赖安装完成"
}
# Report whether DCGM is running (systemd unit "dcgm" or an nv-hostengine
# process) and, if so, test the connection with dcgmi. Nothing is started
# here; when DCGM is down only start-up hints are printed.
check_dcgm_service() {
  log_info "检查 DCGM 服务状态..."
  local dcgm_running=true

  if systemctl is-active --quiet dcgm 2>/dev/null; then
    log_success "DCGM 服务已在运行"
  elif pgrep -f nv-hostengine > /dev/null; then
    log_success "nv-hostengine 进程已在运行"
  else
    dcgm_running=false
    log_warning "DCGM 服务未运行,需要手动启动"
    log_info "启动 DCGM 服务的方法:"
    log_info " 1. 使用 systemd: sudo systemctl start dcgm"
    log_info " 2. 手动启动: nohup nv-hostengine > /var/log/nv-hostengine.log 2>&1 &"
  fi

  # Connection test only makes sense when DCGM is up.
  if [[ "$dcgm_running" == "true" ]]; then
    log_info "测试 DCGM 连接..."
    if dcgmi discovery -l > /dev/null 2>&1; then
      log_success "DCGM 连接测试成功"
    else
      log_warning "DCGM 连接测试失败,请检查服务状态"
    fi
  fi
}
# Stop any DCGM Exporter instance that is already running: first the process
# named in the PID file, then every process whose command line matches the
# installed binary path (this script's own PID is skipped). TERM is sent
# first, KILL only if the process is still alive afterwards.
stop_existing_service() {
  log_info "检查并停止可能运行的服务..."
  local pid_file="/var/run/dcgm-exporter.pid"
  local exporter_bin="/usr/local/bin/dcgm-exporter"

  # Service managed through the PID file.
  if [[ -f "$pid_file" ]]; then
    local pid=$(cat "$pid_file")
    if ! kill -0 "$pid" 2>/dev/null; then
      log_warning "发现过期的 PID 文件,正在清理..."
      rm -f "$pid_file"
    else
      log_info "发现正在运行的 DCGM Exporter 服务 (PID: $pid),正在停止..."
      kill "$pid" > /dev/null 2>&1 || true
      sleep 2
      if kill -0 "$pid" 2>/dev/null; then
        log_warning "进程未响应,强制终止..."
        kill -9 "$pid" > /dev/null 2>&1 || true
      fi
      rm -f "$pid_file"
      log_success "服务已停止"
    fi
  fi

  # Any other dcgm-exporter process.
  local pids=$(pgrep -f "$exporter_bin")
  [[ -n "$pids" ]] || return 0

  log_info "发现其他 dcgm-exporter 进程,正在停止..."
  for pid in $pids; do
    [[ "$pid" == "$$" ]] && continue
    kill "$pid" > /dev/null 2>&1 || true
    sleep 1
    if kill -0 "$pid" 2>/dev/null; then
      log_warning "进程 $pid 未响应,强制终止..."
      kill -9 "$pid" > /dev/null 2>&1 || true
    fi
  done
  log_success "所有 dcgm-exporter 进程已停止"
}
# Install the bundled dcgm-exporter binary into /usr/local/bin, stopping any
# running instance first. Exits 1 when the bundled binary is missing.
install_dcgm_exporter() {
  log_info "安装 DCGM Exporter..."
  local binary_file="bin/dcgm-exporter"
  local install_dir="/usr/local/bin"

  [[ -f "$binary_file" ]] || {
    log_error "找不到 DCGM Exporter 二进制文件: $binary_file"
    exit 1
  }

  # A running exporter must be stopped before its binary is replaced.
  stop_existing_service

  cp "$binary_file" "$install_dir/"
  chmod +x "$install_dir/dcgm-exporter"
  log_success "DCGM Exporter 二进制文件安装完成"
}
# Create /etc/dcgm-exporter and copy the bundled counters file into it when
# the file is present; otherwise only warn.
install_config() {
  log_info "安装配置文件..."
  local config_dir="/etc/dcgm-exporter"
  local config_file="config/default-counters.csv"

  mkdir -p "$config_dir"

  if [[ ! -f "$config_file" ]]; then
    log_warning "未找到配置文件,使用默认配置"
    return
  fi

  cp "$config_file" "$config_dir/"
  log_success "配置文件安装完成"
}
# Start dcgm-exporter in the background on port 9400, record its PID in the
# PID file and in the install record.
# Returns 0 when the service is already running or was started, 1 when the
# port is already in use or the process died within the 2-second check.
start_dcgm_exporter() {
  log_info "启动 DCGM Exporter 服务..."
  local binary_path="/usr/local/bin/dcgm-exporter"
  local log_file="/var/log/dcgm-exporter.log"
  local pid_file="/var/run/dcgm-exporter.pid"

  # Nothing to do when the recorded process is still alive.
  if [[ -f "$pid_file" ]]; then
    local pid=$(cat "$pid_file")
    if kill -0 "$pid" 2>/dev/null; then
      log_info "DCGM Exporter 服务已在运行 (PID: $pid)"
      return 0
    fi
    log_warning "发现过期的 PID 文件,正在清理..."
    rm -f "$pid_file"
  fi

  # Refuse to start when something already listens on 9400.
  if netstat -tuln 2>/dev/null | grep -q ":9400 "; then
    log_warning "端口 9400 已被占用,请检查是否有其他服务在运行"
    return 1
  fi

  log_info "正在启动 DCGM Exporter..."
  nohup "$binary_path" --address=:9400 > "$log_file" 2>&1 &
  local pid=$!
  echo "$pid" > "$pid_file"

  # Give the process a moment, then confirm it is still alive.
  sleep 2
  if ! kill -0 "$pid" 2>/dev/null; then
    log_error "DCGM Exporter 服务启动失败"
    rm -f "$pid_file"
    return 1
  fi

  log_success "DCGM Exporter 服务启动成功 (PID: $pid)"
  log_info "日志文件: $log_file"
  log_info "PID 文件: $pid_file"
  update_install_record "$pid" "$INSTALL_DIR"
}
# Print the post-install summary: installed paths, default port, manual
# start commands and a connectivity test.
show_install_info() {
  log_success "DCGM Exporter 安装完成!"
  # Static text, so a quoted here-document keeps it literal.
  cat <<'SUMMARY'

安装信息:
 二进制文件: /usr/local/bin/dcgm-exporter
 配置文件: /etc/dcgm-exporter/default-counters.csv
 默认端口: 9400

使用方法:
 1. 启动 DCGM 服务:
 sudo systemctl start dcgm
 或: nohup nv-hostengine > /var/log/nv-hostengine.log 2>&1 &
 2. 启动 DCGM Exporter:
 /usr/local/bin/dcgm-exporter --address=:9400
 或: nohup /usr/local/bin/dcgm-exporter --address=:9400 &

测试连接:
 curl http://localhost:9400/metrics

SUMMARY
}
# Installer entry point: print the banner, verify prerequisites, then run the
# install steps in order and finish with the summary.
main() {
  local rule="=========================================="
  printf '%s\n' "$rule" " DCGM Exporter 安装脚本 v1.0" "$rule" ""

  check_root
  check_system

  log_info "开始安装 DCGM Exporter..."
  install_dcgm_dependency
  check_dcgm_service
  install_dcgm_exporter
  install_config
  start_dcgm_exporter
  show_install_info
}
# 脚本入口
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
main "$@"
fi

View File

@ -0,0 +1,88 @@
#!/bin/bash
# Build a timestamped DCGM Exporter installer tarball from the current
# directory. Run it from the directory that holds install.sh, uninstall.sh,
# check_health.sh and the bin/ payload; the tarball is written to that
# directory.
set -e

# Colour definitions
GREEN='\033[0;32m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

log_info() {
  echo -e "${BLUE}[INFO]${NC} $1"
}

log_success() {
  echo -e "${GREEN}[SUCCESS]${NC} $1"
}

# The tarball is moved back into the directory the script was started from.
CURRENT_DIR=$(pwd)
PACKAGE_NAME="dcgm-exporter-$(date +%Y%m%d-%H%M%S)"
PACKAGE_FILE="${PACKAGE_NAME}.tar.gz"

log_info "开始打包 DCGM Exporter 安装包..."

# Verify that every file the installer needs is present before packaging.
log_info "检查必要文件..."
required_files=(
  "install.sh"
  "uninstall.sh"
  "bin/dcgm-exporter"
  "bin/datacenter-gpu-manager_3.3.9_amd64.deb"
  "check_health.sh"
)

missing_files=()
for file in "${required_files[@]}"; do
  if [[ ! -f "$file" ]]; then
    missing_files+=("$file")
  fi
done

if [[ ${#missing_files[@]} -gt 0 ]]; then
  echo "缺少以下文件:"
  for file in "${missing_files[@]}"; do
    echo " - $file"
  done
  exit 1
fi

log_success "所有必要文件检查完成"

# Stage a copy in a private temp directory. The EXIT trap removes it on every
# exit path, including a set -e abort in cp/tar/mv.
TEMP_DIR=$(mktemp -d)
trap 'rm -rf -- "$TEMP_DIR"' EXIT
log_info "创建临时目录: $TEMP_DIR"

cp -r . "$TEMP_DIR/$PACKAGE_NAME"

# tar -C archives relative to the staging directory, so the working directory
# never changes and the relative paths used below stay valid.
log_info "创建压缩包: $PACKAGE_FILE"
tar -czf "$TEMP_DIR/$PACKAGE_FILE" -C "$TEMP_DIR" "$PACKAGE_NAME"

mv "$TEMP_DIR/$PACKAGE_FILE" "$CURRENT_DIR/"

# Report the result and the follow-up steps.
log_success "打包完成!"
echo
echo "安装包文件: $PACKAGE_FILE"
echo "文件大小: $(du -h "$PACKAGE_FILE" | cut -f1)"
echo
echo "使用方法:"
echo "1. 将 $PACKAGE_FILE 传输到目标服务器"
echo "2. 解压: tar -xzf $PACKAGE_FILE"
echo "3. 进入目录: cd $PACKAGE_NAME"
echo "4. 运行安装: sudo ./install.sh"
echo
echo "注意: 请确保 config/default-counters.csv 文件存在"

View File

@ -0,0 +1,216 @@
#!/bin/bash
# DCGM Exporter 卸载脚本
# 版本: 1.0
# 作者: AIOps Team
# 日期: $(date +%Y-%m-%d)
set -e
# 颜色定义
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Logging helpers: each prints its first argument behind a coloured level tag.
# The colour variables (RED, GREEN, YELLOW, BLUE, NC) are read at call time.

# $1 - colour escape, $2 - level tag, $3 - message
_log_line() {
  printf '%b\n' "${1}[${2}]${NC} ${3}"
}

log_info() {
  _log_line "$BLUE" "INFO" "$1"
}

log_success() {
  _log_line "$GREEN" "SUCCESS" "$1"
}

log_warning() {
  _log_line "$YELLOW" "WARNING" "$1"
}

log_error() {
  _log_line "$RED" "ERROR" "$1"
}
# Abort with exit status 1 unless the script runs as root (EUID 0).
check_root() {
  (( EUID == 0 )) && return 0

  log_error "此脚本需要 root 权限运行"
  log_info "请使用: sudo $0"
  exit 1
}
# List live PIDs whose command line mentions dcgm-exporter, one per line.
# pgrep -f matches full command lines, so this script (and the sudo/shell that
# launched it) also match when the script path contains "dcgm-exporter"; those
# PIDs and the short-lived subshells spawned for the lookup are filtered out.
_dcgm_exporter_pids() {
  local candidate
  for candidate in $(pgrep -f "dcgm-exporter" 2>/dev/null || true); do
    case "$candidate" in
      "$$" | "$PPID" | "$BASHPID") continue ;;
    esac
    # Drop PIDs that are already gone (e.g. the subshell that ran pgrep).
    kill -0 "$candidate" 2>/dev/null || continue
    echo "$candidate"
  done
}

# Stop every running DCGM Exporter process: first the one named in the PID
# file, then any remaining process matching dcgm-exporter. TERM is sent first,
# KILL only to processes that are still alive after a grace period. The PID
# file is removed in all cases.
stop_processes() {
  log_info "停止 DCGM Exporter 进程..."
  local pid_file="/var/run/dcgm-exporter.pid"
  local stopped=false
  local pid pids remaining_pids

  # Step 1: stop the service recorded in the PID file.
  if [[ -f "$pid_file" ]]; then
    pid=$(cat "$pid_file" 2>/dev/null) || pid=""
    if kill -0 "$pid" 2>/dev/null; then
      log_info "通过 PID 文件停止服务 (PID: $pid)..."
      # The process may exit between the probe and the signal; that must not
      # abort the uninstall under set -e.
      kill "$pid" 2>/dev/null || true
      sleep 3
      if kill -0 "$pid" 2>/dev/null; then
        log_warning "进程未响应,强制终止..."
        kill -9 "$pid" 2>/dev/null || true
      fi
      log_success "DCGM Exporter 进程已停止"
      stopped=true
    else
      log_warning "PID 文件存在但进程已不存在,清理 PID 文件"
      rm -f "$pid_file"
    fi
  fi

  # Step 2: stop any other dcgm-exporter process (never this script itself).
  pids=$(_dcgm_exporter_pids)
  if [[ -n "$pids" ]]; then
    log_info "发现 dcgm-exporter 进程,正在停止..."
    for pid in $pids; do
      log_info "停止进程 PID: $pid"
      kill "$pid" 2>/dev/null || true
    done
    sleep 2

    remaining_pids=$(_dcgm_exporter_pids)
    if [[ -n "$remaining_pids" ]]; then
      log_warning "进程未响应,强制终止..."
      for pid in $remaining_pids; do
        log_info "强制终止进程 PID: $pid"
        kill -9 "$pid" 2>/dev/null || true
      done
      sleep 1
    fi

    # Final verification.
    if [[ -n "$(_dcgm_exporter_pids)" ]]; then
      log_error "无法停止所有 dcgm-exporter 进程"
    else
      log_success "所有 DCGM Exporter 进程已停止"
      stopped=true
    fi
  else
    log_info "DCGM Exporter 进程未运行"
  fi

  rm -f "$pid_file"

  if [[ "$stopped" == "false" ]]; then
    log_warning "未发现需要停止的 DCGM Exporter 进程"
  fi
}
# Delete the installed dcgm-exporter binary if it exists.
remove_binary() {
  log_info "删除 DCGM Exporter 二进制文件..."
  local binary_file="/usr/local/bin/dcgm-exporter"

  if [[ ! -f "$binary_file" ]]; then
    log_info "二进制文件不存在"
    return
  fi

  rm -f "$binary_file"
  log_success "二进制文件已删除"
}
# Remove the DCGM Exporter configuration directory if it exists.
remove_config() {
  log_info "删除配置文件..."
  local config_dir="/etc/dcgm-exporter"

  if [[ ! -d "$config_dir" ]]; then
    log_info "配置目录不存在"
    return
  fi

  rm -rf "$config_dir"
  log_success "配置目录已删除"
}
# Report whether the DCGM package is installed. Despite the name, nothing is
# removed: the package is kept and the manual removal command is printed.
remove_dcgm_dependency() {
  log_info "检查 DCGM 依赖状态..."

  if ! dpkg -l | grep -q datacenter-gpu-manager; then
    log_info "DCGM 依赖包未安装"
    return
  fi

  log_info "检测到 DCGM 依赖包已安装"
  log_warning "DCGM 是系统级依赖,可能被其他应用程序使用"
  log_info "为了系统稳定性,将保留 DCGM 依赖包"
  log_info "如需手动卸载,请运行: sudo apt-get remove --purge datacenter-gpu-manager"
}
# Delete the log files created for DCGM Exporter and nv-hostengine.
# The systemd journal is deliberately left alone: a global
# `journalctl --vacuum-time` is not scoped to this service and would discard
# archived journal data for every unit on the host.
cleanup_logs() {
  log_info "清理日志文件..."

  rm -f /var/log/nv-hostengine.log
  rm -f /var/log/dcgm-exporter.log

  log_success "日志文件已清理"
}
# Print the post-uninstall summary: what was removed and what may remain.
show_uninstall_info() {
  log_success "DCGM Exporter 卸载完成!"
  # Static text, so a quoted here-document keeps it literal.
  cat <<'SUMMARY'

已删除的内容:
 - 二进制文件: /usr/local/bin/dcgm-exporter
 - 配置目录: /etc/dcgm-exporter
 - 相关日志文件

注意:
 - DCGM 依赖包可能仍然存在
 - 如需完全清理,请手动检查并删除相关文件

SUMMARY
}
# Uninstaller entry point: print the banner, require root, ask for
# confirmation, then run the removal steps in order.
main() {
  local rule="=========================================="
  printf '%s\n' "$rule" " DCGM Exporter 卸载脚本 v1.0" "$rule" ""

  check_root

  log_warning "此操作将完全卸载 DCGM Exporter"
  read -p "确认继续?(y/N): " confirm
  case "$confirm" in
    y | Y) ;;
    *)
      log_info "取消卸载操作"
      exit 0
      ;;
  esac

  log_info "开始卸载 DCGM Exporter..."
  stop_processes
  remove_binary
  remove_config
  cleanup_logs

  # Only reports the DCGM package state; the package itself is kept.
  remove_dcgm_dependency

  show_uninstall_info
}
# 脚本入口
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
main "$@"
fi

View File

@ -0,0 +1,181 @@
# Fluent Bit 安装包
这是一个 Fluent Bit 的自动化安装包,提供了完整的安装、卸载和健康检查功能。
## 目录结构
```
fluent-bit-installer/
├── install.sh # 安装脚本
├── uninstall.sh # 卸载脚本
├── package.sh # 打包脚本
├── check_health.sh # 健康检查脚本
├── bin/
│ └── fluent-bit_3.1.9_amd64.deb # Fluent Bit 安装包
└── config/
├── fluent-bit.conf # 主配置文件
├── inject_labels.lua # Lua 脚本
├── parsers.conf # 解析器配置
├── inputs.d/ # 输入配置目录
│ ├── 10-train.conf
│ └── 20-infer.conf
└── outputs.d/ # 输出配置目录
└── 10-es.conf
```
## 功能特性
- **自动化安装**: 一键安装 Fluent Bit 及其依赖
- **配置管理**: 自动部署预配置的配置文件
- **服务管理**: 自动启动和停止 Fluent Bit 服务
- **健康检查**: 提供 JSON 格式的健康状态检查
- **完整卸载**: 彻底清理所有相关文件和配置
- **用户管理**: 自动创建专用的 fluent-bit 用户
## 使用方法
### 1. 打包安装包
```bash
./package.sh
```
这将创建一个带时间戳的压缩包,例如:`fluent-bit-installer-20250924-160954.tar.gz`
### 2. 安装 Fluent Bit
```bash
# 解压安装包
tar -xzf fluent-bit-installer-*.tar.gz
cd fluent-bit-installer-*
# 运行安装脚本(需要 root 权限)
sudo ./install.sh
```
### 3. 健康检查
```bash
./check_health.sh
```
输出示例:
```json
{"name": "fluent-bit", "status": "health", "reason": "进程运行正常 (PID: 12345)"}
```
### 4. 卸载 Fluent Bit
```bash
sudo ./uninstall.sh
```
## 安装后的文件位置
- **二进制文件**: `/opt/fluent-bit/bin/fluent-bit`
- **配置文件**: `/etc/fluent-bit/`
- **日志文件**: `/var/log/fluent-bit/`
- **缓冲区目录**: `/var/lib/fluent-bit/buffers/`
- **运行用户**: `fluent-bit`
- **HTTP 端口**: `2020`
## 配置说明
### 主配置文件
主配置文件位于 `/etc/fluent-bit/fluent-bit.conf`,包含以下主要部分:
- **SERVICE**: 服务配置,包括 HTTP 服务器设置
- **INPUT**: 输入配置,通过 `inputs.d/` 目录管理
- **FILTER**: 过滤器配置,包括解析器和标签注入
- **OUTPUT**: 输出配置,通过 `outputs.d/` 目录管理
### 输入配置
- `10-train.conf`: 训练日志输入配置
- `20-infer.conf`: 推理日志输入配置
### 输出配置
- `10-es.conf`: Elasticsearch 输出配置
## 服务管理
### 手动启动
```bash
/opt/fluent-bit/bin/fluent-bit --config=/etc/fluent-bit/fluent-bit.conf
```
### 后台启动
```bash
nohup /opt/fluent-bit/bin/fluent-bit --config=/etc/fluent-bit/fluent-bit.conf &
```
### 检查服务状态
```bash
# 检查进程
ps aux | grep fluent-bit
# 检查端口
netstat -tuln | grep 2020
# 检查日志
tail -f /var/log/fluent-bit/fluent-bit.log
```
## API 接口
Fluent Bit 提供 HTTP API 用于监控和管理:
- **根路径**: `http://localhost:2020`
- **状态接口**: `http://localhost:2020/api/v1/status`
- **指标接口**: `http://localhost:2020/api/v1/metrics`
## 故障排除
### 常见问题
1. **端口被占用**
- 检查端口 2020 是否被其他服务占用
- 修改配置文件中的端口设置
2. **权限问题**
- 确保 fluent-bit 用户有足够的权限访问日志文件
- 检查目录权限设置
3. **配置文件错误**
- 检查配置文件语法
- 查看日志文件中的错误信息
### 日志查看
```bash
# 查看服务日志
tail -f /var/log/fluent-bit/fluent-bit.log
# 查看系统日志
journalctl -u fluent-bit -f
```
## 系统要求
- **操作系统**: Ubuntu/Debian/CentOS/RHEL/Fedora
- **架构**: x86_64/amd64
- **权限**: root 权限(用于安装和卸载)
- **依赖**: jq(可选,健康检查用它解析安装记录;缺失时退回到文本解析)、pgrep(安装记录不存在时用于查找进程)
## 版本信息
- **Fluent Bit 版本**: 3.1.9
- **安装包版本**: 1.0
- **支持架构**: amd64
## 注意事项
1. 安装前请确保系统已更新
2. 卸载时会保留 fluent-bit 用户(系统用户,可能被其他服务使用)
3. 配置文件包含环境变量,请根据实际环境调整
4. 建议在生产环境使用前进行充分测试

View File

@ -0,0 +1,69 @@
#!/bin/bash
# Fluent Bit 健康检查脚本
# 输出 JSON 格式结果
set -e
# List live PIDs whose command line mentions fluent-bit, one per line.
# pgrep -f matches full command lines, so this script (and the process that
# launched it) also match when the script lives under a path containing
# "fluent-bit"; those PIDs and the short-lived subshells spawned for the
# lookup are filtered out so they cannot fake a healthy result.
_fluent_bit_pids() {
  local candidate
  for candidate in $(pgrep -f "fluent-bit" 2>/dev/null || true); do
    case "$candidate" in
      "$$" | "$PPID" | "$BASHPID") continue ;;
    esac
    # Drop PIDs that are already gone (e.g. the subshell that ran pgrep).
    ps -p "$candidate" > /dev/null 2>&1 || continue
    echo "$candidate"
  done
}

# Check Fluent Bit health and print a single-line JSON result
# ({"name", "status", "reason"}).
# With an install record: healthy (exit 0) only when the recorded PID is a
# number and that process is alive. Without one: healthy when some other
# fluent-bit process is found. Every other case exits 1.
check_health() {
  local name="fluent-bit"
  local status="unhealth"
  local reason=""
  local install_record="/opt/argus-metric/current/.install_record"
  local pid=""
  local pids

  if [[ -f "$install_record" ]]; then
    # Prefer jq for the JSON record; fall back to plain text parsing.
    if command -v jq &> /dev/null; then
      pid=$(jq -r '.components."fluent-bit".pid // empty' "$install_record" 2>/dev/null || echo "")
    else
      pid=$(grep -A 10 '"fluent-bit"' "$install_record" | grep '"pid"' | cut -d'"' -f4 | head -1)
    fi

    if [[ -n "$pid" && "$pid" =~ ^[0-9]+$ ]]; then
      if kill -0 "$pid" 2>/dev/null; then
        status="health"
        reason="进程运行正常 (PID: $pid)"
        echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}"
        exit 0
      else
        reason="安装记录中的 PID $pid 进程不存在"
        echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}"
        exit 1
      fi
    else
      reason="安装记录文件中未找到有效的 fluent-bit PID"
      echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}"
      exit 1
    fi
  else
    # No install record: look for a fluent-bit process other than ourselves.
    pids=$(_fluent_bit_pids)
    if [[ -n "$pids" ]]; then
      # Report the first PID found.
      pid=$(echo "$pids" | head -1)
      status="health"
      reason="发现 fluent-bit 进程运行 (PID: $pid),但未找到安装记录"
      echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}"
      exit 0
    else
      reason="未找到 fluent-bit 进程,且安装记录文件不存在"
      echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}"
      exit 1
    fi
  fi
}
# Entry point: delegates to check_health, which prints the JSON result and
# terminates the script with the matching exit status.
main() {
check_health
}
# 脚本入口
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
main "$@"
fi

View File

@ -0,0 +1,37 @@
[SERVICE]
Daemon Off
Parsers_File parsers.conf
HTTP_Server On
HTTP_Listen 0.0.0.0
HTTP_Port 2020
storage.path /buffers
storage.sync normal
storage.checksum on
storage.backlog.mem_limit 128M
# Note: Hot Reload is not enabled by default in this image; restart the container after changing the configuration.
@INCLUDE inputs.d/*.conf
[FILTER]
Name parser
Match app.*
Key_Name log
Parser timestamp_parser
Reserve_Data On
Preserve_Key On
Unescape_Key On
[FILTER]
Name record_modifier
Match *
Record cluster ${CLUSTER}
Record rack ${RACK}
Record host ${HOSTNAME}
[FILTER]
Name lua
Match app.*
script inject_labels.lua
call add_labels
@INCLUDE outputs.d/*.conf

View File

@ -0,0 +1,15 @@
-- Lua filter callback: fills in job/user/model/gpu labels and derives the
-- "role" label from the record's log_path.
-- Each label is taken from its FB_* environment variable first, then from the
-- value already on the record, then from a fixed fallback.
-- Returns 1, the unchanged timestamp and the updated record.
function add_labels(tag, ts, record)
  -- { record key, environment variable, fallback value }
  local label_specs = {
    { "job_id", "FB_JOB_ID", "unknown" },
    { "user", "FB_USER", "unknown" },
    { "model", "FB_MODEL", "unknown" },
    { "gpu_id", "FB_GPU_ID", "na" },
  }
  for _, spec in ipairs(label_specs) do
    local key, env_name, fallback = spec[1], spec[2], spec[3]
    record[key] = os.getenv(env_name) or record[key] or fallback
  end

  -- The first matching log directory decides the role ("infer" is tried first).
  local path = record["log_path"] or ""
  local role = nil
  for _, candidate in ipairs({ "infer", "train" }) do
    if role == nil and string.find(path, "/logs/" .. candidate .. "/") then
      role = candidate
    end
  end
  record["role"] = role or record["role"] or "app"

  return 1, ts, record
end

View File

@ -0,0 +1,10 @@
[INPUT]
Name tail
Path /logs/train/*.log
Tag app.train
Path_Key log_path
Refresh_Interval 5
DB /buffers/train.db
Skip_Long_Lines On
storage.type filesystem
multiline.parser python,go,java

View File

@ -0,0 +1,10 @@
[INPUT]
Name tail
Path /logs/infer/*.log
Tag app.infer
Path_Key log_path
Refresh_Interval 5
DB /buffers/infer.db
Skip_Long_Lines On
storage.type filesystem
multiline.parser python,go,java

View File

@ -0,0 +1,24 @@
# Important: Logstash_Format + Logstash_Prefix are used so that records land in
# train-* / infer-* indices.
# NOTE(review): Host/Port use the shell-style ${VAR:-default} form. Fluent Bit's
# variable substitution may not implement the ":-" fallback (the whole text
# could be looked up as a variable name) — confirm against the Fluent Bit
# version in use. install.sh exports ES_HOST and ES_PORT with defaults before
# launching, so plain ${ES_HOST} / ${ES_PORT} would probably be sufficient.
[OUTPUT]
Name es
Match app.train
Host ${ES_HOST:-localhost}
Port ${ES_PORT:-9200}
Logstash_Format On
Logstash_Prefix train
Replace_Dots On
Generate_ID On
Retry_Limit False
Suppress_Type_Name On
[OUTPUT]
Name es
Match app.infer
Host ${ES_HOST:-localhost}
Port ${ES_PORT:-9200}
Logstash_Format On
Logstash_Prefix infer
Replace_Dots On
Generate_ID On
Retry_Limit False
Suppress_Type_Name On

View File

@ -0,0 +1,27 @@
[MULTILINE_PARSER]
Name python
Type regex
Flush 2
Rule "start_state" "/^\d{4}-\d{2}-\d{2}[\sT]/" "cont"
Rule "cont" "/^\s+|^Traceback|^\tat\s+/" "cont"
[MULTILINE_PARSER]
Name go
Type regex
Flush 2
Rule "start_state" "/^[0-9]{4}\/[0-9]{2}\/[0-9]{2}/" "cont"
Rule "cont" "/^\s+|^\t/" "cont"
[MULTILINE_PARSER]
Name java
Type regex
Flush 2
Rule "start_state" "/^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}/" "cont"
Rule "cont" "/^\s+at\s+|^\t.../" "cont"
[PARSER]
Name timestamp_parser
Format regex
Regex ^(?<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s+(?<level>\w+)\s+(?<message>.*)$
Time_Key timestamp
Time_Format %Y-%m-%d %H:%M:%S

View File

@ -0,0 +1,291 @@
#!/bin/bash
set -e
# ANSI color escape sequences used by the log helpers.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Shared formatter: prints "<color>[LEVEL]<reset> message" on stdout.
#   $1 - color escape, $2 - level tag, $3 - message
_log_line() {
  echo -e "${1}[${2}]${NC} ${3}"
}

# Level-specific wrappers; each takes the message as its first argument.
log_info() { _log_line "$BLUE" "INFO" "$1"; }
log_success() { _log_line "$GREEN" "SUCCESS" "$1"; }
log_warning() { _log_line "$YELLOW" "WARNING" "$1"; }
log_error() { _log_line "$RED" "ERROR" "$1"; }
log_info "Starting Fluent Bit installation..."
# Command-line arguments: the first positional argument is the install base
# directory; /opt/argus-metric/current is used when it is missing or empty.
INSTALL_DIR="${1:-/opt/argus-metric/current}"
# Update the fluent-bit PID stored in the JSON install record.
#
# Arguments:
#   $1 - PID of the running fluent-bit process
#   $2 - install base directory (optional, defaults to /opt/argus-metric/current)
# Returns:
#   0 when the record file is absent (first install: the main installer creates
#     it), when jq is unavailable, or after an update attempt (a failed write is
#     reported as a warning)
#   1 when the record holds no readable current PID
update_install_record() {
  local pid="$1"
  local install_base_dir="${2:-/opt/argus-metric/current}"
  local install_record="$install_base_dir/.install_record"
  local tmp_record="$install_record.tmp"
  local current_pid=""

  # No record yet: first install, nothing to update here.
  if [[ ! -f "$install_record" ]]; then
    log_info "安装记录文件不存在,将由主安装脚本创建"
    return 0
  fi

  # Record exists: restart scenario, only the pid field is rewritten.
  if ! command -v jq &> /dev/null; then
    log_warning "jq 命令不可用,无法更新安装记录文件"
    return 0
  fi

  current_pid=$(jq -r '.components."fluent-bit".pid // ""' "$install_record" 2>/dev/null) || current_pid=""
  if [[ -z "$current_pid" ]]; then
    log_warning "无法读取当前 PID跳过更新"
    return 1
  fi

  # Write to a temp file and move it into place; the pid stays a JSON string
  # and every other field is preserved. Success is only logged when both steps
  # worked, and the temp file is removed on failure.
  if jq --arg new_pid "$pid" '.components."fluent-bit".pid = $new_pid' "$install_record" > "$tmp_record" \
    && mv "$tmp_record" "$install_record"; then
    log_info "PID updated: $current_pid -> $pid"
  else
    rm -f "$tmp_record"
    log_warning "Failed to update PID in install record: $install_record"
  fi
}
# Abort unless running as root (EUID 0)
if [[ $EUID -ne 0 ]]; then
log_error "This script requires root privileges"
log_info "Please use: sudo $0"
exit 1
fi
# Stop any fluent-bit processes that are already running
log_info "Stopping existing fluent-bit processes..."
# -x: only match processes whose name is exactly "fluent-bit";
# "|| true" keeps set -e from aborting when nothing matches
pids=$(pgrep -x fluent-bit 2>/dev/null || true)
if [[ -n "$pids" ]]; then
# $pids is intentionally unquoted so each PID becomes one loop item
for pid in $pids; do
log_info "Stopping process PID: $pid"
kill "$pid" 2>/dev/null || true
done
sleep 2
# Anything still alive after the 2 second grace period gets SIGKILL
remaining_pids=$(pgrep -x fluent-bit 2>/dev/null || true)
if [[ -n "$remaining_pids" ]]; then
log_warning "Force killing unresponsive processes..."
for pid in $remaining_pids; do
kill -9 "$pid" 2>/dev/null || true
done
fi
fi
# Make sure a shared library Fluent Bit needs is present, installing the .deb
# bundled under bin/ when it is not (offline mode). Exits the script with
# status 1 when the package is missing from bin/ or fails to install.
#   $1 - Debian package name (bundled as bin/<name>_*.deb)
#   $2 - soname searched for in the `ldconfig -p` listing
#   $3 - label printed in the "Checking ..." log line
ensure_bundled_dependency() {
  local pkg="$1"
  local soname="$2"
  local label="$3"

  log_info "Checking Fluent Bit dependency: ${label} ..."
  if ldconfig -p | grep -q "$soname"; then
    log_info "${soname} already present on system"
    return 0
  fi

  if ! ls bin/"${pkg}"_*.deb >/dev/null 2>&1; then
    log_error "Missing dependency: ${pkg} (${soname}). Please put bin/${pkg}_*.deb in the bin/ directory."
    exit 1
  fi

  log_info "Installing local dependency package: ${pkg}"
  DEBIAN_FRONTEND=noninteractive dpkg -i bin/"${pkg}"_*.deb >/dev/null 2>&1 || {
    log_error "Failed to install ${pkg} from bin/, please check package validity"
    exit 1
  }
}

ensure_bundled_dependency "libpq5" "libpq.so.5" "libpq5"
ensure_bundled_dependency "libyaml-0-2" "libyaml-0.so.2" "libyaml-0.so.2"
# Remove a previously installed fluent-bit package so its configuration files
# cannot conflict with the ones installed below.
log_info "Cleaning up old fluent-bit installation if exists..."
if dpkg -l | grep -q "^ii.*fluent-bit"; then
  log_info "Found existing fluent-bit package, removing..."
  dpkg --purge fluent-bit 2>/dev/null || true
  apt-get remove --purge -y fluent-bit 2>/dev/null || true
fi

# Also drop any configuration directory left behind.
if [[ -d "/etc/fluent-bit" ]]; then
  log_info "Removing old fluent-bit configuration directory..."
  rm -rf /etc/fluent-bit
fi

# Install the bundled Fluent Bit package.
log_info "Installing Fluent Bit from deb package..."
deb_file="bin/fluent-bit_3.1.9_amd64.deb"
if [[ ! -f "$deb_file" ]]; then
  log_error "Fluent Bit package not found: $deb_file"
  exit 1
fi

# A non-zero dpkg status is tolerated as before, but it is now surfaced as a
# warning instead of being discarded silently.
if ! DEBIAN_FRONTEND=noninteractive dpkg -i "$deb_file" >/dev/null 2>&1; then
  log_warning "dpkg reported an error while installing $deb_file; verifying the installed binary"
fi

# Fail right here when the package did not deliver the binary, rather than
# logging a shell error message as the "version".
fb_bin="/opt/fluent-bit/bin/fluent-bit"
if [[ ! -f "$fb_bin" ]]; then
  log_error "fluent-bit binary not found at $fb_bin after package installation"
  exit 1
fi
fb_version=$("$fb_bin" --version 2>&1 | head -1)
log_info "Fluent Bit version: $fb_version"
# Create the fluent-bit account when it does not exist yet
# (no home directory, /bin/false as login shell)
log_info "Creating fluent-bit user..."
if ! id "fluent-bit" &>/dev/null; then
useradd --no-create-home --shell /bin/false fluent-bit
fi
# Install the bundled configuration: everything under ./config (relative to
# the current working directory) is copied to /etc/fluent-bit
log_info "Installing configuration files..."
mkdir -p /etc/fluent-bit
if [[ -d "config" ]]; then
cp -r config/* /etc/fluent-bit/
chown -R fluent-bit:fluent-bit /etc/fluent-bit
fi
# Create the log input directories and the buffer directory.
# Note: the recursive chown below also covers anything already under /logs.
log_info "Creating log and buffer directories..."
mkdir -p /logs/train /logs/infer /buffers
chmod 755 /logs/train /logs/infer
chmod 770 /buffers
chown -R fluent-bit:fluent-bit /logs /buffers
# Locate the main configuration file that Fluent Bit will be started with
log_info "Starting Fluent Bit with configuration from /etc/fluent-bit/"
config_path="/etc/fluent-bit/fluent-bit.conf"
if [[ ! -f "$config_path" ]]; then
log_error "Configuration file not found: $config_path"
exit 1
fi
# Prepare the environment variables handed to Fluent Bit
log_info "Setting environment variables..."
# Use a non-loopback IP address as HOSTNAME when HOSTNAME is empty.
# NOTE(review): Bash normally sets HOSTNAME automatically, so this branch
# probably only runs when HOSTNAME was explicitly set to empty — confirm that
# the IP-based value is not expected in the common case.
if [[ -z "${HOSTNAME:-}" ]]; then
# Prefer the source address of the default route if it is in 177.x.x.x
# (presumably the deployment network — confirm)
HOSTNAME=$(ip route get 8.8.8.8 2>/dev/null | grep -oP 'src \K\S+' | grep '^177\.' | head -1)
# Otherwise accept that source address as long as it is not 127.x.x.x
if [[ -z "$HOSTNAME" ]]; then
HOSTNAME=$(ip route get 8.8.8.8 2>/dev/null | grep -oP 'src \K\S+' | grep -v '^127\.' | head -1)
fi
# Last resort: the output of the hostname command
if [[ -z "$HOSTNAME" ]]; then
HOSTNAME=$(hostname)
fi
fi
export HOSTNAME
# Defaults apply only when the caller did not provide a value
export CLUSTER="${CLUSTER:-local}"
export RACK="${RACK:-dev}"
export ES_HOST="${ES_HOST:-localhost}"
export ES_PORT="${ES_PORT:-9200}"
log_info "Environment variables:"
log_info " CLUSTER=$CLUSTER"
log_info " RACK=$RACK"
log_info " HOSTNAME=$HOSTNAME"
log_info " ES_HOST=$ES_HOST"
log_info " ES_PORT=$ES_PORT"
# Pre-flight check: the fluent-bit binary must exist.
log_info "[DEBUG] Checking fluent-bit binary..."
if [[ ! -f "/opt/fluent-bit/bin/fluent-bit" ]]; then
  log_error "fluent-bit binary not found at /opt/fluent-bit/bin/fluent-bit"
  exit 1
fi
log_info "[DEBUG] fluent-bit binary exists and is executable: $(ls -lh /opt/fluent-bit/bin/fluent-bit)"

# Pre-flight check: the configuration file must exist.
log_info "[DEBUG] Checking configuration file: $config_path"
if [[ ! -f "$config_path" ]]; then
  log_error "Configuration file not found: $config_path"
  exit 1
fi
log_info "[DEBUG] Configuration file exists: $(ls -lh "$config_path")"

# Show the command that is about to be executed.
log_info "[DEBUG] Full command to execute:"
log_info "[DEBUG] su -s /bin/bash fluent-bit -c 'env CLUSTER=\"$CLUSTER\" RACK=\"$RACK\" HOSTNAME=\"$HOSTNAME\" ES_HOST=\"$ES_HOST\" ES_PORT=\"$ES_PORT\" /opt/fluent-bit/bin/fluent-bit --config=\"$config_path\"'"

# Truncate (or create) the log file. The shell started by su appends to it as
# the fluent-bit user, so that user owns the file; a world-writable (666) log
# is not needed for that and is avoided.
log_info "[DEBUG] Preparing log file: /var/log/fluent-bit.log"
: > /var/log/fluent-bit.log
chown fluent-bit:fluent-bit /var/log/fluent-bit.log
chmod 644 /var/log/fluent-bit.log

log_info "Command: /opt/fluent-bit/bin/fluent-bit --config=$config_path"
log_info "[DEBUG] Starting fluent-bit process as fluent-bit user (using su)..."
# Launched in the background; $! is the PID of the nohup/su wrapper, not of
# the fluent-bit process itself.
nohup su -s /bin/bash fluent-bit -c "env CLUSTER='$CLUSTER' RACK='$RACK' HOSTNAME='$HOSTNAME' ES_HOST='$ES_HOST' ES_PORT='$ES_PORT' /opt/fluent-bit/bin/fluent-bit --config='$config_path' >> /var/log/fluent-bit.log 2>&1" &
bg_pid=$!
log_info "[DEBUG] Background process started with PID: $bg_pid"
# Give the service a moment to come up.
log_info "[DEBUG] Waiting 3 seconds for service to start..."
sleep 3

# Find the real fluent-bit process: owned by the fluent-bit user and named
# exactly "fluent-bit"; the first match is used.
log_info "[DEBUG] Searching for fluent-bit process..."
log_info "[DEBUG] Running: pgrep -u fluent-bit -x fluent-bit"
actual_pid=$(pgrep -u fluent-bit -x fluent-bit | head -1)

# List every fluent-bit related process for diagnostics.
log_info "[DEBUG] All fluent-bit related processes:"
ps aux | grep fluent-bit | grep -v grep || log_warning "No fluent-bit processes found in ps output"

if [[ -n "$actual_pid" ]]; then
  log_success "Fluent Bit started successfully (PID: $actual_pid)"
  log_info "[DEBUG] Process details: $(ps -p "$actual_pid" -o pid,user,cmd --no-headers)"
  # Record the new PID. update_install_record describes its own failure as
  # "skipping the update", so a non-zero return is reported as a warning here
  # instead of letting set -e abort an installation that actually succeeded.
  if ! update_install_record "$actual_pid" "$INSTALL_DIR"; then
    log_warning "Install record was not updated; Fluent Bit is running (PID: $actual_pid)"
  fi
else
  log_error "Fluent Bit failed to start - no fluent-bit process found"
  log_info "[DEBUG] Checking if background process $bg_pid still exists..."
  if ps -p "$bg_pid" > /dev/null 2>&1; then
    log_warning "Background shell process $bg_pid still exists"
  else
    log_warning "Background shell process $bg_pid has exited"
  fi
  # Replay the tail of the service log to help diagnose the failure.
  log_info "[DEBUG] Last 20 lines of /var/log/fluent-bit.log:"
  if [[ -f "/var/log/fluent-bit.log" ]]; then
    tail -20 /var/log/fluent-bit.log | while IFS= read -r line; do
      log_info "[LOG] $line"
    done
  else
    log_error "Log file /var/log/fluent-bit.log does not exist"
  fi
  exit 1
fi
log_success "Fluent Bit installation completed!"

View File

@ -0,0 +1,87 @@
#!/bin/bash
set -e
# ANSI color escape sequences used by the log helpers.
GREEN='\033[0;32m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Shared formatter: prints "<color>[LEVEL]<reset> message" on stdout.
#   $1 - color escape, $2 - level tag, $3 - message
_log_line() {
  echo -e "${1}[${2}]${NC} ${3}"
}

log_info() { _log_line "$BLUE" "INFO" "$1"; }
log_success() { _log_line "$GREEN" "SUCCESS" "$1"; }
# The package is built from, and written to, the current working directory.
CURRENT_DIR=$(pwd)
PACKAGE_NAME="fluent-bit-$(date +%Y%m%d-%H%M%S)"
PACKAGE_FILE="${PACKAGE_NAME}.tar.gz"
log_info "开始打包 Fluent Bit 安装包..."

# Verify that every file the installer needs is present.
log_info "检查必要文件..."
required_files=(
  "install.sh"
  "uninstall.sh"
  "bin/fluent-bit_3.1.9_amd64.deb"
  "check_health.sh"
)
missing_files=()
for file in "${required_files[@]}"; do
  if [[ ! -f "$file" ]]; then
    missing_files+=("$file")
  fi
done
if [[ ${#missing_files[@]} -gt 0 ]]; then
  echo "缺少以下文件:"
  for file in "${missing_files[@]}"; do
    echo " - $file"
  done
  exit 1
fi
log_success "所有必要文件检查完成"

# Stage a copy of the directory under a temporary location. The EXIT trap
# removes the staging directory on every exit path, so a failing cp/tar/mv
# (which aborts the script under set -e) no longer leaves it behind.
TEMP_DIR=$(mktemp -d)
trap 'rm -rf -- "$TEMP_DIR"' EXIT
log_info "创建临时目录: $TEMP_DIR"
cp -r . "$TEMP_DIR/$PACKAGE_NAME"

# Create the archive with tar -C instead of changing directory, so the working
# directory never moves and no "cd back" step is needed.
log_info "创建压缩包: $PACKAGE_FILE"
tar -czf "$TEMP_DIR/$PACKAGE_FILE" -C "$TEMP_DIR" "$PACKAGE_NAME"

# Move the archive next to the sources and drop the staging copy.
mv "$TEMP_DIR/$PACKAGE_FILE" "$CURRENT_DIR/"
rm -rf -- "$TEMP_DIR"
cd "$CURRENT_DIR"
# Report the result and print usage instructions for the generated archive.
log_success "打包完成!"
cat <<EOF

安装包文件: $PACKAGE_FILE
文件大小: $(du -h "$PACKAGE_FILE" | cut -f1)

使用方法:
1. 将 $PACKAGE_FILE 传输到目标服务器
2. 解压: tar -xzf $PACKAGE_FILE
3. 进入目录: cd $PACKAGE_NAME
4. 运行安装: sudo ./install.sh

注意: 请确保所有必要文件都存在
EOF

View File

@ -0,0 +1,169 @@
#!/bin/bash
set -euo pipefail
echo "[INFO] Starting Fluent Bit uninstallation..."

# Root privileges are required.
if (( EUID != 0 )); then
  printf '%s\n' \
    "[ERROR] This script requires root privileges" \
    "[INFO] Please use: sudo $0"
  exit 1
fi

# Ask for confirmation; anything other than y/Y cancels.
echo "[WARNING] This operation will completely uninstall Fluent Bit"
read -p "Confirm to continue? (y/N): " confirm
case "$confirm" in
  y | Y) ;;
  *)
    echo "[INFO] Uninstallation cancelled"
    exit 0
    ;;
esac
# Stop running Fluent Bit processes.
echo "[INFO] Stopping Fluent Bit processes..."
install_record="/opt/argus-metric/current/.install_record"
stopped=false

# First try the PID stored in the install record.
if [[ -f "$install_record" ]]; then
  pid=""
  if command -v jq &> /dev/null; then
    pid=$(jq -r '.components."fluent-bit".pid // empty' "$install_record" 2>/dev/null || echo "")
  else
    # Plain-text fallback when jq is unavailable. "|| true": with pipefail a
    # grep that matches nothing would otherwise abort the whole script.
    pid=$(grep -A 10 '"fluent-bit"' "$install_record" | grep '"pid"' | cut -d'"' -f4 | head -1 || true)
  fi
  if [[ "$pid" =~ ^[0-9]+$ ]]; then
    if kill -0 "$pid" 2>/dev/null; then
      echo "[INFO] Stopping service via installation record (PID: $pid)..."
      # The process may exit between the check and the signal; not fatal.
      kill "$pid" 2>/dev/null || true
      sleep 3
      # Escalate to SIGKILL if it is still alive.
      if kill -0 "$pid" 2>/dev/null; then
        echo "[WARNING] Process unresponsive, force killing..."
        kill -9 "$pid" 2>/dev/null || true
      fi
      echo "[SUCCESS] Fluent Bit process stopped"
      stopped=true
    else
      echo "[WARNING] PID in installation record no longer exists"
    fi
  fi
fi

# Then stop every remaining process named exactly "fluent-bit". Matching on
# the full command line (pgrep -f) would also select this script, or the
# sudo/bash wrapper running it, whenever its path contains "fluent-bit", and
# the uninstaller would kill itself.
pids=$(pgrep -x fluent-bit 2>/dev/null || true)
if [[ -n "$pids" ]]; then
  echo "[INFO] Found fluent-bit processes, stopping..."
  for pid in $pids; do
    echo "[INFO] Stopping process PID: $pid"
    kill "$pid" 2>/dev/null || true
  done
  sleep 2

  # Force-kill whatever survived SIGTERM.
  remaining_pids=$(pgrep -x fluent-bit 2>/dev/null || true)
  if [[ -n "$remaining_pids" ]]; then
    echo "[WARNING] Processes unresponsive, force killing..."
    for pid in $remaining_pids; do
      echo "[INFO] Force killing process PID: $pid"
      kill -9 "$pid" 2>/dev/null || true
    done
    sleep 1
  fi

  # Final check.
  if pgrep -x fluent-bit > /dev/null; then
    echo "[ERROR] Unable to stop all fluent-bit processes"
  else
    echo "[SUCCESS] All Fluent Bit processes stopped"
    stopped=true
  fi
else
  echo "[INFO] No Fluent Bit processes running"
fi

if [[ "$stopped" == "false" ]]; then
  echo "[WARNING] No Fluent Bit processes found to stop"
fi
# Uninstall the Fluent Bit package.
# The package database is queried directly: with pipefail enabled,
# `dpkg -l | grep -q` can fail even when the package is listed, because grep
# exits on the first match and dpkg may then be terminated by SIGPIPE.
echo "[INFO] Uninstalling Fluent Bit package..."
if dpkg -s fluent-bit >/dev/null 2>&1; then
  echo "[INFO] Found fluent-bit package installed via dpkg, uninstalling..."
  dpkg --remove --force-remove-reinstreq fluent-bit || true
  echo "[SUCCESS] Fluent Bit package uninstalled"
else
  echo "[INFO] No fluent-bit package found via package manager"
fi
# Remove the binary directory.
echo "[INFO] Removing Fluent Bit binary files..."
binary_dir="/opt/fluent-bit"
if [[ ! -d "$binary_dir" ]]; then
  echo "[INFO] Binary directory does not exist"
else
  rm -rf "$binary_dir"
  echo "[SUCCESS] Binary directory removed: $binary_dir"
fi

# Remove the configuration directory.
echo "[INFO] Removing configuration files..."
config_dir="/etc/fluent-bit"
if [[ ! -d "$config_dir" ]]; then
  echo "[INFO] Configuration directory does not exist"
else
  rm -rf "$config_dir"
  echo "[SUCCESS] Configuration directory removed"
fi

# Remove the data directories (application logs and buffers).
echo "[INFO] Removing data directories..."
data_dirs=("/logs" "/buffers")
deleted=false
for data_dir in "${data_dirs[@]}"; do
  [[ -d "$data_dir" ]] || continue
  rm -rf "$data_dir"
  echo "[SUCCESS] Data directory removed: $data_dir"
  deleted=true
done
if [[ "$deleted" != "true" ]]; then
  echo "[INFO] No data directories found"
fi
# Remove the fluent-bit entry from the install record.
# The record is read elsewhere in this script as JSON
# (.components."fluent-bit"), so the line-oriented sed delete never matched
# anything while still reporting success. The entry is now removed with jq;
# sed remains as a fallback when jq is unavailable or the file is not JSON.
echo "[INFO] Cleaning up installation record..."
if [[ -f "$install_record" ]]; then
  if command -v jq &> /dev/null \
    && jq 'del(.components."fluent-bit")' "$install_record" > "$install_record.tmp" 2>/dev/null \
    && mv "$install_record.tmp" "$install_record"; then
    echo "[SUCCESS] Installation record cleaned"
  else
    rm -f "$install_record.tmp"
    sed -i '/^fluent-bit:/d' "$install_record"
    echo "[WARNING] Could not edit installation record as JSON; applied line-based cleanup only"
  fi
else
  echo "[INFO] Installation record file does not exist"
fi
# Report on the fluent-bit account; the account itself is left untouched.
echo "[INFO] Checking fluent-bit user status..."
if id "fluent-bit" &>/dev/null; then
  cat <<'EOF'
[INFO] fluent-bit user exists
[WARNING] fluent-bit is a system user, may be used by other services
[INFO] fluent-bit user will be preserved for system stability
[INFO] To manually remove, run: sudo userdel fluent-bit
EOF
else
  echo "[INFO] fluent-bit user does not exist"
fi

# Final summary of what was removed.
cat <<'EOF'
[SUCCESS] Fluent Bit uninstallation completed!

Removed content:
 - Binary directory: /opt/fluent-bit
 - Configuration directory: /etc/fluent-bit
 - Application log directory: /logs
 - Buffer directory: /buffers

Note:
 - fluent-bit user preserved (system user, may be used by other services)
 - For complete cleanup, manually check and remove related files
EOF

Binary file not shown.

View File

@ -0,0 +1,55 @@
#!/bin/bash
# Node Exporter 健康检查脚本
# 输出 JSON 格式结果
set -e
# Print the health result as one JSON object on stdout and terminate the
# script with the given exit code.
#   $1 - status string, $2 - reason, $3 - exit code
_node_exporter_health_report() {
  printf '{"name": "%s", "status": "%s", "reason": "%s"}\n' "node-exporter" "$1" "$2"
  exit "$3"
}

# Fetch a URL and print only its HTTP status code ("000" when the request
# failed). curl already prints 000 through -w on a connection failure, so its
# exit status is ignored rather than appending a second "000". Timeouts keep a
# hung endpoint from blocking the health check indefinitely.
#   $1 - URL to probe
_node_exporter_http_code() {
  local code=""
  code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 5 "$1" 2>/dev/null) || true
  [[ "$code" =~ ^[0-9]{3}$ ]] || code="000"
  printf '%s' "$code"
}

# Check Node Exporter health over HTTP on localhost:9100.
# Both the root path and /metrics must answer 200. Outputs one JSON object on
# stdout and exits 0 (health) or 1 (unhealth).
check_health() {
  local url="http://localhost:9100"
  local metrics_url="$url/metrics"
  local http_code=""
  local metrics_code=""

  if ! command -v curl &> /dev/null; then
    _node_exporter_health_report "unhealth" "curl 命令不可用,无法进行健康检查" 1
  fi

  # Probe the root path first.
  http_code=$(_node_exporter_http_code "$url")
  if [[ "$http_code" != "200" ]]; then
    _node_exporter_health_report "unhealth" "HTTP 服务异常 (HTTP $http_code),请检查 Node Exporter 是否正在运行在端口 9100" 1
  fi

  # Then the metrics endpoint.
  metrics_code=$(_node_exporter_http_code "$metrics_url")
  if [[ "$metrics_code" != "200" ]]; then
    _node_exporter_health_report "unhealth" "Metrics 端点异常 (HTTP $metrics_code)" 1
  fi

  _node_exporter_health_report "health" "success" 0
}
# Entry point: run the Node Exporter health probe.
main() {
  check_health
}

# Run main only when executed directly; do nothing when sourced.
[[ "${BASH_SOURCE[0]}" != "${0}" ]] || main "$@"

View File

@ -0,0 +1,343 @@
#!/bin/bash
set -e
# ANSI color escape sequences used by the log helpers.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Shared formatter: prints "<color>[LEVEL]<reset> message" on stdout.
#   $1 - color escape, $2 - level tag, $3 - message
_log_line() {
  echo -e "${1}[${2}]${NC} ${3}"
}

# Level-specific wrappers; each takes the message as its first argument.
log_info() { _log_line "$BLUE" "INFO" "$1"; }
log_success() { _log_line "$GREEN" "SUCCESS" "$1"; }
log_warning() { _log_line "$YELLOW" "WARNING" "$1"; }
log_error() { _log_line "$RED" "ERROR" "$1"; }
# Update the node-exporter PID stored in the JSON install record.
#
# Arguments:
#   $1 - PID of the running node-exporter process
#   $2 - install base directory (optional, defaults to /opt/argus-metric/current)
# Returns:
#   0 when the record file is absent (first install: the main installer creates
#     it), when jq is unavailable, or after an update attempt (a failed write is
#     reported as a warning)
#   1 when the record holds no readable current PID
update_install_record() {
  local pid="$1"
  local install_base_dir="${2:-/opt/argus-metric/current}"
  local install_record="$install_base_dir/.install_record"
  local tmp_record="$install_record.tmp"
  local current_pid=""

  # No record yet: first install, nothing to update here.
  if [[ ! -f "$install_record" ]]; then
    log_info "安装记录文件不存在,将由主安装脚本创建"
    return 0
  fi

  # Record exists: restart scenario, only the pid field is rewritten.
  if ! command -v jq &> /dev/null; then
    log_warning "jq 命令不可用,无法更新安装记录文件"
    return 0
  fi

  current_pid=$(jq -r '.components."node-exporter".pid // ""' "$install_record" 2>/dev/null) || current_pid=""
  if [[ -z "$current_pid" ]]; then
    log_warning "无法读取当前 PID跳过更新"
    return 1
  fi

  # Write to a temp file and move it into place; the pid stays a JSON string
  # and every other field is preserved. Success is only logged when both steps
  # worked, and the temp file is removed on failure.
  if jq --arg new_pid "$pid" '.components."node-exporter".pid = $new_pid' "$install_record" > "$tmp_record" \
    && mv "$tmp_record" "$install_record"; then
    log_info "PID 已更新: $current_pid -> $pid"
  else
    rm -f "$tmp_record"
    log_warning "更新安装记录中的 PID 失败: $install_record"
  fi
}
# Print usage information for this installer on stdout.
show_help() {
  cat <<EOF
Node Exporter 安装脚本

用法: $0 [选项]

选项:
 --help 显示此帮助信息

示例:
 $0 # 安装 Node Exporter

EOF
}
# Parse command-line arguments.
#   --help | -h       print usage and exit 0
#   --<anything else> unknown option: print an error plus usage and exit 1
#   any other value   taken as the install directory (the last one wins)
INSTALL_DIR=""
for arg in "$@"; do
  case "$arg" in
    --help | -h)
      show_help
      exit 0
      ;;
    --*)
      log_error "未知参数: $arg"
      show_help
      exit 1
      ;;
    *)
      INSTALL_DIR="$arg"
      ;;
  esac
done
# Exit with status 1 unless the script runs as root (EUID 0).
check_root() {
  if (( EUID == 0 )); then
    return 0
  fi
  log_error "此脚本需要 root 权限运行"
  log_info "请使用: sudo $0"
  exit 1
}
# Check basic system requirements and log the detected OS and architecture.
# Exits with status 1 when /etc/os-release is missing; an unrecognised
# distribution or architecture only produces a warning.
check_system() {
  local arch

  log_info "检查系统要求..."

  # /etc/os-release supplies NAME, VERSION and ID.
  if [[ ! -f /etc/os-release ]]; then
    log_error "无法检测操作系统版本"
    exit 1
  fi
  source /etc/os-release
  log_info "检测到操作系统: ${NAME:-} ${VERSION:-}"

  # Warn on distributions outside the known list.
  case "${ID:-}" in
    ubuntu | debian | centos | rhel | fedora) ;;
    *) log_warning "此脚本主要针对常见 Linux 发行版,其他系统可能需要调整" ;;
  esac

  # Warn on architectures other than x86_64/amd64.
  arch=$(uname -m)
  log_info "系统架构: $arch"
  if [[ "$arch" != "x86_64" && "$arch" != "amd64" ]]; then
    # Braces are required: "$archnode_exporter" would expand a different,
    # unset variable and drop the architecture from the message.
    log_warning "当前架构为 ${arch},node_exporter 主要支持 x86_64/amd64"
  fi
}
# Stop any Node Exporter that is already running, in five steps: systemd unit,
# known PID files, processes by name, whatever listens on port 9100, and a
# final port check.
# Returns 1 when port 9100 is still in use afterwards, 0 otherwise.
stop_existing_service() {
  local pid_file pid pids listen_pids

  log_info "检查并停止可能运行的 Node Exporter 服务..."

  # 1. Stop and disable the systemd unit when one exists.
  if systemctl list-units --full -all 2>/dev/null | grep -q "node_exporter.service"; then
    log_info "检测到 systemd 服务 node_exporter正在停止..."
    systemctl stop node_exporter || true
    systemctl disable node_exporter || true
  fi

  # 2. Stop processes referenced by known PID files, then remove the files.
  for pid_file in /var/run/node-exporter.pid /var/run/node_exporter.pid /tmp/node_exporter.pid; do
    if [[ -f "$pid_file" ]]; then
      pid=$(cat "$pid_file" 2>/dev/null) || pid=""
      if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then
        log_info "发现 Node Exporter (PID: $pid),正在停止..."
        # The process may exit between the check and the signal; not fatal.
        kill "$pid" 2>/dev/null || true
        sleep 2
        if kill -0 "$pid" 2>/dev/null; then
          kill -9 "$pid" 2>/dev/null || true
        fi
      fi
      rm -f "$pid_file"
    fi
  done

  # 3. Stop processes by exact process name. A command-line match (pgrep -f)
  #    would also select this installer's parent or wrapper (sudo, a calling
  #    script) whenever its path contains "node-exporter"; excluding only $$
  #    does not protect those. Differently named binaries are still caught by
  #    the port check in step 4.
  pids=$(pgrep -x "node_exporter|node-exporter" 2>/dev/null || true)
  if [[ -n "$pids" ]]; then
    log_info "发现 Node Exporter 进程 (PID: $pids),正在停止..."
    for pid in $pids; do
      if kill -0 "$pid" 2>/dev/null; then
        kill "$pid" 2>/dev/null || true
        sleep 1
        if kill -0 "$pid" 2>/dev/null; then
          kill -9 "$pid" 2>/dev/null || true
        fi
      fi
    done
  fi

  # 4. Fallback: force-kill whatever still listens on port 9100.
  listen_pids=$(lsof -ti:9100 2>/dev/null || true)
  if [[ -n "$listen_pids" ]]; then
    log_warning "发现占用 9100 端口的进程 (PID: $listen_pids),强制终止..."
    for pid in $listen_pids; do
      kill -9 "$pid" 2>/dev/null || true
    done
    sleep 1
  fi

  # 5. Final verification of the port.
  if netstat -tuln 2>/dev/null | grep -q ":9100 "; then
    log_error "端口 9100 仍被占用,请手动检查"
    return 1
  fi
  log_success "旧的 Node Exporter 已完全停止"
}
# Install the bundled Node Exporter binary as /usr/local/bin/node-exporter.
# Exits with status 1 when bin/node_exporter is missing from the current
# working directory.
install_node_exporter() {
  log_info "安装 Node Exporter..."
  local src="bin/node_exporter"
  local dest="/usr/local/bin/node-exporter"

  [[ -f "$src" ]] || {
    log_error "找不到 Node Exporter 二进制文件: $src"
    exit 1
  }

  # Make sure no previous instance is running before the file is replaced.
  stop_existing_service

  # Copy under the unified name and mark it executable.
  cp "$src" "$dest"
  chmod +x "$dest"
  log_success "Node Exporter 二进制文件安装完成"
}
# Create the node_exporter account (no home directory, /bin/false shell)
# unless it already exists.
create_user() {
  log_info "创建 node_exporter 用户..."
  if ! id "node_exporter" &>/dev/null; then
    useradd --no-create-home --shell /bin/false node_exporter
    log_success "用户 node_exporter 创建完成"
  else
    log_info "用户 node_exporter 已存在"
  fi
}
# Create the configuration directory and the textfile-collector directory;
# the collector directory is handed to the node_exporter user.
install_config() {
  log_info "安装配置文件..."
  local collector_dir="/var/lib/node_exporter/textfile_collector"

  mkdir -p "/etc/node_exporter"
  mkdir -p "$collector_dir"
  chown node_exporter:node_exporter "$collector_dir"
}
# Start Node Exporter in the background on port 9100.
# Writes the PID to /var/run/node-exporter.pid and output to
# /var/log/node-exporter.log, then records the PID in the install record.
# Returns 0 when the service is (already) running, 1 when port 9100 is taken
# or the process is gone two seconds after launch.
# NOTE(review): the binary is launched by the invoking user (root, given
# check_root), while create_user prepares a node_exporter account and
# show_install_info reports it as the run user — confirm which is intended.
start_node_exporter() {
log_info "启动 Node Exporter 服务..."
local binary_path="/usr/local/bin/node-exporter"
local log_file="/var/log/node-exporter.log"
local pid_file="/var/run/node-exporter.pid"
# Nothing to do when the PID file points at a live process
if [[ -f "$pid_file" ]]; then
local pid=$(cat "$pid_file")
if kill -0 "$pid" 2>/dev/null; then
log_info "Node Exporter 服务已在运行 (PID: $pid)"
return 0
else
# Stale PID file: the recorded process no longer exists
log_warning "发现过期的 PID 文件,正在清理..."
rm -f "$pid_file"
fi
fi
# Refuse to start when something already listens on port 9100
if netstat -tuln 2>/dev/null | grep -q ":9100 "; then
log_warning "端口 9100 已被占用,请检查是否有其他服务在运行"
return 1
fi
# Launch detached from the terminal; stdout and stderr go to the log file
log_info "正在启动 Node Exporter..."
nohup "$binary_path" --web.listen-address=:9100 > "$log_file" 2>&1 &
local pid=$!
# Persist the PID of the background job
echo "$pid" > "$pid_file"
# Give the process a moment to start (or fail)
sleep 2
# The start counts as successful when the process is still alive
if kill -0 "$pid" 2>/dev/null; then
log_success "Node Exporter 服务启动成功 (PID: $pid)"
log_info "日志文件: $log_file"
log_info "PID 文件: $pid_file"
# Store the new PID in the install record
update_install_record "$pid" "$INSTALL_DIR"
else
log_error "Node Exporter 服务启动失败"
rm -f "$pid_file"
return 1
fi
}
# Print the post-install summary: locations, usage hints and a sample
# Prometheus scrape configuration.
show_install_info() {
  log_success "Node Exporter 安装完成!"
  cat <<'EOF'

安装信息:
 二进制文件: /usr/local/bin/node-exporter
 运行用户: node_exporter
 配置目录: /etc/node_exporter/
 默认端口: 9100

使用方法:
 手动启动: /usr/local/bin/node-exporter --web.listen-address=:9100
 后台启动: nohup /usr/local/bin/node-exporter --web.listen-address=:9100 &

测试连接:
 curl http://localhost:9100/metrics
 curl http://localhost:9100

Prometheus 配置示例:
 - job_name: 'node_exporter'
 static_configs:
 - targets: ['localhost:9100']

EOF
}
# Entry point: print the banner, run the pre-flight checks, then execute the
# installation steps in order.
main() {
  local step

  cat <<'EOF'
==========================================
 Node Exporter 安装脚本 v1.0
==========================================

EOF
  check_root
  check_system
  log_info "开始安装 Node Exporter..."
  for step in install_node_exporter create_user install_config start_node_exporter show_install_info; do
    "$step"
  done
}

# Run main only when executed directly; do nothing when sourced.
[[ "${BASH_SOURCE[0]}" != "${0}" ]] || main "$@"

View File

@ -0,0 +1,87 @@
#!/bin/bash
set -e
# ANSI color escape sequences used by the log helpers.
GREEN='\033[0;32m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Shared formatter: prints "<color>[LEVEL]<reset> message" on stdout.
#   $1 - color escape, $2 - level tag, $3 - message
_log_line() {
  echo -e "${1}[${2}]${NC} ${3}"
}

log_info() { _log_line "$BLUE" "INFO" "$1"; }
log_success() { _log_line "$GREEN" "SUCCESS" "$1"; }
# The package is built from, and written to, the current working directory.
CURRENT_DIR=$(pwd)
PACKAGE_NAME="node-exporter-$(date +%Y%m%d-%H%M%S)"
PACKAGE_FILE="${PACKAGE_NAME}.tar.gz"
log_info "开始打包 Node Exporter 安装包..."

# Verify that every file the installer needs is present.
log_info "检查必要文件..."
required_files=(
  "install.sh"
  "uninstall.sh"
  "bin/node_exporter"
  "check_health.sh"
)
missing_files=()
for file in "${required_files[@]}"; do
  if [[ ! -f "$file" ]]; then
    missing_files+=("$file")
  fi
done
if [[ ${#missing_files[@]} -gt 0 ]]; then
  echo "缺少以下文件:"
  for file in "${missing_files[@]}"; do
    echo " - $file"
  done
  exit 1
fi
log_success "所有必要文件检查完成"

# Stage a copy of the directory under a temporary location. The EXIT trap
# removes the staging directory on every exit path, so a failing cp/tar/mv
# (which aborts the script under set -e) no longer leaves it behind.
TEMP_DIR=$(mktemp -d)
trap 'rm -rf -- "$TEMP_DIR"' EXIT
log_info "创建临时目录: $TEMP_DIR"
cp -r . "$TEMP_DIR/$PACKAGE_NAME"

# Create the archive with tar -C instead of changing directory, so the working
# directory never moves and no "cd back" step is needed.
log_info "创建压缩包: $PACKAGE_FILE"
tar -czf "$TEMP_DIR/$PACKAGE_FILE" -C "$TEMP_DIR" "$PACKAGE_NAME"

# Move the archive next to the sources and drop the staging copy.
mv "$TEMP_DIR/$PACKAGE_FILE" "$CURRENT_DIR/"
rm -rf -- "$TEMP_DIR"
cd "$CURRENT_DIR"
# Report the result and print usage instructions for the generated archive.
log_success "打包完成!"
cat <<EOF

安装包文件: $PACKAGE_FILE
文件大小: $(du -h "$PACKAGE_FILE" | cut -f1)

使用方法:
1. 将 $PACKAGE_FILE 传输到目标服务器
2. 解压: tar -xzf $PACKAGE_FILE
3. 进入目录: cd $PACKAGE_NAME
4. 运行安装: sudo ./install.sh

注意: 请确保所有必要文件都存在
EOF

View File

@ -0,0 +1,239 @@
#!/bin/bash
# Node Exporter 卸载脚本
# 版本: 1.0
# 作者: AIOps Team
# 日期: $(date +%Y-%m-%d)
set -e
# ANSI color escape sequences used by the log helpers.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Shared formatter: prints "<color>[LEVEL]<reset> message" on stdout.
#   $1 - color escape, $2 - level tag, $3 - message
_log_line() {
  echo -e "${1}[${2}]${NC} ${3}"
}

# Level-specific wrappers; each takes the message as its first argument.
log_info() { _log_line "$BLUE" "INFO" "$1"; }
log_success() { _log_line "$GREEN" "SUCCESS" "$1"; }
log_warning() { _log_line "$YELLOW" "WARNING" "$1"; }
log_error() { _log_line "$RED" "ERROR" "$1"; }
# Exit with status 1 unless the script runs as root (EUID 0).
check_root() {
  if (( EUID == 0 )); then
    return 0
  fi
  log_error "此脚本需要 root 权限运行"
  log_info "请使用: sudo $0"
  exit 1
}
# Stop running Node Exporter processes: first the one referenced by the PID
# file, then every process named node_exporter or node-exporter. The PID file
# is removed in all cases.
stop_processes() {
  local pid_file="/var/run/node-exporter.pid"
  # pgrep takes an extended regular expression, where alternation is a bare
  # "|" ("\|" would match a literal bar and never find a process). -x matches
  # the exact process name, so this script — whose path may itself contain
  # "node-exporter" — is not selected the way a command-line match would.
  local name_pattern="node_exporter|node-exporter"
  local stopped=false
  local pid pids remaining_pids

  log_info "停止 Node Exporter 进程..."

  # First try the PID file.
  if [[ -f "$pid_file" ]]; then
    pid=$(cat "$pid_file" 2>/dev/null) || pid=""
    if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then
      log_info "通过 PID 文件停止服务 (PID: $pid)..."
      # The process may exit between the check and the signal; not fatal.
      kill "$pid" 2>/dev/null || true
      sleep 3
      # Escalate to SIGKILL if it is still alive.
      if kill -0 "$pid" 2>/dev/null; then
        log_warning "进程未响应,强制终止..."
        kill -9 "$pid" 2>/dev/null || true
      fi
      log_success "Node Exporter 进程已停止"
      stopped=true
    else
      log_warning "PID 文件存在但进程已不存在,清理 PID 文件"
      rm -f "$pid_file"
    fi
  fi

  # Then stop every remaining process with a matching name.
  pids=$(pgrep -x "$name_pattern" 2>/dev/null || true)
  if [[ -n "$pids" ]]; then
    log_info "发现 node_exporter 或 node-exporter 进程,正在停止..."
    for pid in $pids; do
      log_info "停止进程 PID: $pid"
      kill "$pid" 2>/dev/null || true
    done
    sleep 2

    # Force-kill whatever survived SIGTERM.
    remaining_pids=$(pgrep -x "$name_pattern" 2>/dev/null || true)
    if [[ -n "$remaining_pids" ]]; then
      log_warning "进程未响应,强制终止..."
      for pid in $remaining_pids; do
        log_info "强制终止进程 PID: $pid"
        kill -9 "$pid" 2>/dev/null || true
      done
      sleep 1
    fi

    # Final check.
    if pgrep -x "$name_pattern" > /dev/null; then
      log_error "无法停止所有 node_exporter 进程"
    else
      log_success "所有 Node Exporter 进程已停止"
      stopped=true
    fi
  else
    log_info "Node Exporter 进程未运行"
  fi

  # Always remove the PID file.
  rm -f "$pid_file"

  if [[ "$stopped" == "false" ]]; then
    log_warning "未发现需要停止的 Node Exporter 进程"
  fi
}
# Delete the installed Node Exporter binary under either of its two known
# names; logs a note when neither file exists.
remove_binary() {
  log_info "删除 Node Exporter 二进制文件..."
  local removed_any=false

  for binary_file in "/usr/local/bin/node-exporter" "/usr/local/bin/node_exporter"; do
    [[ -f "$binary_file" ]] || continue
    rm -f "$binary_file"
    log_success "二进制文件已删除: $binary_file"
    removed_any=true
  done

  if [[ "$removed_any" != "true" ]]; then
    log_info "二进制文件不存在"
  fi
}
# Delete the configuration directory /etc/node_exporter when it exists.
remove_config() {
  log_info "删除配置文件..."
  local target="/etc/node_exporter"

  if [[ ! -d "$target" ]]; then
    log_info "配置目录不存在"
    return 0
  fi
  rm -rf "$target"
  log_success "配置目录已删除"
}
# Delete the data directory /var/lib/node_exporter when it exists.
remove_data_dir() {
  log_info "删除数据目录..."
  local target="/var/lib/node_exporter"

  if [[ ! -d "$target" ]]; then
    log_info "数据目录不存在"
    return 0
  fi
  rm -rf "$target"
  log_success "数据目录已删除"
}
# Report whether the node_exporter account exists. The account is never
# removed here; only a hint for manual removal is printed.
check_user_status() {
  log_info "检查 node_exporter 用户状态..."
  if ! id "node_exporter" &>/dev/null; then
    log_info "node_exporter 用户不存在"
    return 0
  fi
  log_info "检测到 node_exporter 用户存在"
  log_warning "node_exporter 是系统用户,可能被其他服务使用"
  log_info "为了系统稳定性,将保留 node_exporter 用户"
  log_info "如需手动删除,请运行: sudo userdel node_exporter"
}
# Remove the log file written by the install script.
# The system journal is deliberately left alone: `journalctl --vacuum-time=1s`
# is not scoped to one service and would discard archived journal data for
# the whole machine, while this installer logs to a plain file.
cleanup_logs() {
  log_info "清理日志文件..."
  rm -f /var/log/node-exporter.log
  log_success "日志文件已清理"
}
# Print the post-uninstall summary: what was removed and what was kept.
show_uninstall_info() {
  log_success "Node Exporter 卸载完成!"
  cat <<'EOF'

已删除的内容:
 - 二进制文件: /usr/local/bin/node-exporter
 - 配置目录: /etc/node_exporter
 - 数据目录: /var/lib/node_exporter
 - 相关日志文件

注意:
 - node_exporter 用户已保留(系统用户,可能被其他服务使用)
 - 如需完全清理,请手动检查并删除相关文件

EOF
}
# Entry point: print the banner, require root, ask for confirmation, then run
# the uninstall steps in order. Any answer other than y/Y cancels with exit 0.
main() {
  cat <<'EOF'
==========================================
 Node Exporter 卸载脚本 v1.0
==========================================

EOF
  check_root

  log_warning "此操作将完全卸载 Node Exporter"
  read -p "确认继续?(y/N): " confirm
  case "$confirm" in
    y | Y) ;;
    *)
      log_info "取消卸载操作"
      exit 0
      ;;
  esac

  log_info "开始卸载 Node Exporter..."
  stop_processes
  remove_binary
  remove_config
  remove_data_dir
  cleanup_logs
  # Report on the service account last; it is kept, not removed.
  check_user_status
  show_uninstall_info
}
# Script entry: run main only when this file is executed directly, not when
# it is sourced by another script
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
main "$@"
fi

View File

@ -0,0 +1,286 @@
#!/bin/bash
# Overall health check: calls every component's health check and writes the
# aggregated result to the .health_log file.
set -e

# Single-instance guard based on a PID file: if the recorded PID is still
# alive, skip this run.
PIDFILE="/var/run/check_health.pid"
if [[ -f "$PIDFILE" ]]; then
  if kill -0 $(cat "$PIDFILE") 2>/dev/null; then
    echo "健康检查脚本已在运行中,跳过本次执行" >&2
    exit 0
  fi
fi
echo $$ > "$PIDFILE"
trap "rm -f $PIDFILE" EXIT

# Directory that holds this script; the log and install record live next to it.
SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"
SCRIPT_DIR="$(cd "$SCRIPT_DIR" && pwd)"
INSTALL_RECORD_FILE="$SCRIPT_DIR/.install_record"
HEALTH_LOG_FILE="$SCRIPT_DIR/.health_log"

# ANSI colour escape sequences (interpreted by `echo -e`).
NC="\033[0m" # No Color
BLUE="\033[0;34m"
YELLOW="\033[1;33m"
GREEN="\033[0;32m"
RED="\033[0;31m"
# Logging helpers. Everything goes to stderr so that stdout stays reserved
# for the JSON result.
#   $1 - message text
log_info() { >&2 echo -e "${BLUE}[INFO]${NC} ${1}"; }

log_success() { >&2 echo -e "${GREEN}[SUCCESS]${NC} ${1}"; }

log_warning() { >&2 echo -e "${YELLOW}[WARNING]${NC} ${1}"; }

log_error() { >&2 echo -e "${RED}[ERROR]${NC} ${1}"; }
# Run the health check script of a single component.
# Arguments:
#   $1 - component name
#   $2 - path of the component's health check script
# Outputs:
#   stdout - the script's stdout, or a synthetic "unhealth" JSON object when
#            the script is missing or not executable
# Returns: 0 when the script exits successfully, 1 otherwise.
check_component() {
  local name="$1"
  local script="$2"
  local problem=""

  log_info "检查 $name 健康状态..."

  if [[ ! -f "$script" ]]; then
    problem="健康检查脚本不存在: $script"
  elif [[ ! -x "$script" ]]; then
    problem="健康检查脚本无执行权限: $script"
  fi
  if [[ -n "$problem" ]]; then
    log_error "$problem"
    printf '{"name": "%s", "status": "unhealth", "reason": "%s"}\n' "$name" "$problem"
    return 1
  fi

  # Only stdout of the check script is captured; its stderr is discarded.
  local output
  if ! output=$("$script" 2>/dev/null); then
    log_warning "$name 健康检查失败"
    echo "$output"
    return 1
  fi
  log_success "$name 健康检查通过"
  echo "$output"
  return 0
}
# Print the current local time as "YYYY-MM-DD HH:MM:SS".
get_timestamp() {
  date '+%F %T'
}

# Print the current UTC time as "YYYY-MM-DDTHH:MM:SSZ".
get_utc_timestamp() {
  date -u '+%FT%TZ'
}

# Print the host name: $HOSTNAME when it is set and non-empty, otherwise the
# output of `hostname`.
get_hostname() {
  if [[ -n "${HOSTNAME:-}" ]]; then
    echo "$HOSTNAME"
  else
    echo "$(hostname)"
  fi
}
# Make sure the per-host health directory exists and print its path.
# Outputs: /private/argus/agent/<hostname>/health on stdout.
create_health_dir() {
  local node_name=$(get_hostname)
  local target_dir="/private/argus/agent/${node_name}/health"

  if [[ ! -d "$target_dir" ]]; then
    log_info "创建健康状态目录: $target_dir"
    mkdir -p "$target_dir"
  fi

  echo "$target_dir"
}
# Escape a string so it can be embedded between double quotes in JSON.
# Handles backslash, double quote, newline, carriage return and tab.
#   $1 - raw string; the escaped form is printed on stdout
_health_json_escape() {
  local text="$1"
  text=${text//\\/\\\\}
  text=${text//\"/\\\"}
  text=${text//$'\n'/\\n}
  text=${text//$'\r'/\\r}
  text=${text//$'\t'/\\t}
  printf '%s' "$text"
}

# Write the health status file of a single module.
# Arguments:
#   $1 - component name (used in the file name metric-<name>.json)
#   $2 - status string
#   $3 - error message (may be empty)
#   $4 - directory that receives the file
# The status and error values are JSON-escaped first: the error text comes
# from a component's own check output and may contain quotes or backslashes,
# which would otherwise produce an invalid JSON file.
write_component_health_json() {
  local component_name="$1"
  local status="$2"
  local error_msg="$3"
  local health_dir="$4"

  # File name format: <module prefix>-<component>.json
  local module_prefix="metric"
  local filepath="$health_dir/${module_prefix}-${component_name}.json"

  local timestamp
  timestamp=$(get_utc_timestamp)

  printf '{\n"status": "%s",\n"error": "%s",\n"timestamp": "%s"\n}\n' \
    "$(_health_json_escape "$status")" \
    "$(_health_json_escape "$error_msg")" \
    "$timestamp" > "$filepath"

  log_info "已写入模块健康状态文件: $filepath"
}
# Read the component install directories from the install record file.
# Arguments:
#   $1 - path of the install record (JSON)
# Outputs: one "name:install_dir" line per component on stdout.
# Returns: 0 on success, 1 when the file is missing or cannot be parsed.
#
# With jq the ".components" object is read directly. Without jq a line-based
# fallback is used; it only looks at "install_dir" entries that appear after
# the "components" key, so the record's top-level "install_dir" (the install
# root) is not mistaken for a component. The fallback derives the component
# name from the last path element and sees at most one entry per line.
read_install_record() {
  local install_record_file="$1"

  if [[ ! -f "$install_record_file" ]]; then
    log_error "安装记录文件不存在: $install_record_file"
    return 1
  fi

  if command -v jq &> /dev/null; then
    local components_json
    if components_json=$(jq -r '.components | to_entries[] | "\(.key):\(.value.install_dir)"' "$install_record_file" 2>/dev/null); then
      echo "$components_json"
      return 0
    fi
    log_error "无法解析安装记录文件 JSON 格式: $install_record_file"
    return 1
  fi

  log_warning "jq 命令不可用,尝试简单文本解析"

  local components=()
  local line install_dir
  local in_components=false
  while IFS= read -r line || [[ -n "$line" ]]; do
    if [[ "$in_components" == "false" ]]; then
      if [[ "$line" != *'"components"'* ]]; then
        continue
      fi
      in_components=true
      # Keep whatever follows the key on the same line.
      line=${line#*'"components"'}
    fi
    if [[ "$line" =~ \"install_dir\":[[:space:]]*\"([^\"]+)\" ]]; then
      install_dir="${BASH_REMATCH[1]}"
      components+=("$(basename "$install_dir"):$install_dir")
    fi
  done < "$install_record_file"

  if [[ ${#components[@]} -gt 0 ]]; then
    printf '%s\n' "${components[@]}"
    return 0
  fi
  log_error "无法从安装记录文件中提取组件信息"
  return 1
}
# Main routine.
# Reads the installed components from INSTALL_RECORD_FILE, runs each
# component's check_health.sh through check_component, writes one status JSON
# per component into the health directory, appends an aggregated JSON report
# to HEALTH_LOG_FILE and prints the same report on stdout.
# Exits 0 when every component check passed, 1 otherwise (also when the
# install record cannot be read).
main() {
echo "==========================================" >&2
echo " 整体健康检查脚本" >&2
echo "==========================================" >&2
echo >&2
# Remember when the run started.
local start_time=$(get_timestamp)
log_info "健康检查开始时间: $start_time"
# Directory that receives the per-component status files.
local health_dir
health_dir=$(create_health_dir)
# Load the "name:install_dir" pairs from the install record.
log_info "从安装记录文件读取组件信息: $INSTALL_RECORD_FILE"
local components_info
if ! components_info=$(read_install_record "$INSTALL_RECORD_FILE"); then
log_error "无法读取安装记录文件,健康检查终止"
exit 1
fi
# Raw stdout of every component check, in check order.
local all_results=()
# Note: the aggregate uses "health"/"unhealth" while the per-component files
# written below use "healthy"/"unhealthy".
local overall_status="health"
# Check the components one by one.
while IFS= read -r component_info; do
if [[ -n "$component_info" ]]; then
# Split at the first ':'; everything after it is the install directory.
IFS=':' read -r component_name install_dir <<< "$component_info"
local check_script_path="$install_dir/check_health.sh"
local result
local component_status="healthy"
local error_msg=""
if result=$(check_component "$component_name" "$check_script_path"); then
all_results+=("$result")
else
all_results+=("$result")
overall_status="unhealth"
component_status="unhealthy"
# Extract the failure reason from the check's output.
if command -v jq &> /dev/null; then
error_msg=$(echo "$result" | jq -r '.reason // ""' 2>/dev/null || echo "")
else
# No jq available: fall back to a regex on the raw text.
if [[ "$result" =~ \"reason\":[[:space:]]*\"([^\"]+)\" ]]; then
error_msg="${BASH_REMATCH[1]}"
fi
fi
fi
# Write this component's status JSON file.
write_component_health_json "$component_name" "$component_status" "$error_msg" "$health_dir"
fi
done <<< "$components_info"
# Remember when the run finished.
local end_time=$(get_timestamp)
log_info "健康检查结束时间: $end_time"
# Build the aggregated report; the component results are joined with commas
# and sed strips the comma after the last one.
local health_check_result=$(cat << EOF
{
"start_time": "$start_time",
"end_time": "$end_time",
"overall_status": "$overall_status",
"components": [
$(printf '%s,\n' "${all_results[@]}" | sed '$s/,$//')
]
}
EOF
)
# Append the report to the health log file.
log_info "将健康检查结果写入日志文件: $HEALTH_LOG_FILE"
echo "$health_check_result" >> "$HEALTH_LOG_FILE"
# Emit the JSON report on stdout.
echo "$health_check_result"
# Human-readable summary on stderr.
echo >&2
echo "==========================================" >&2
echo " 健康检查总结" >&2
echo "==========================================" >&2
echo "开始时间: $start_time" >&2
echo "结束时间: $end_time" >&2
echo "整体状态: $overall_status" >&2
echo "日志文件: $HEALTH_LOG_FILE" >&2
echo >&2
if [[ "$overall_status" == "health" ]]; then
log_success "所有组件健康检查通过!"
exit 0
else
log_error "部分组件健康检查失败,请查看上述详细信息"
exit 1
fi
}
# Script entry: run main only when the file is executed, not when sourced.
if [[ "${0}" == "${BASH_SOURCE[0]}" ]]; then
  main "$@"
fi

View File

@ -0,0 +1,240 @@
#!/bin/bash
# Version check script.
# Compares the local LATEST_VERSION with the VERSION published on the FTP
# server and updates to the remote version when they differ.
set -e

# ANSI colour escape sequences (interpreted by `echo -e`).
NC="\033[0m" # No Color
BLUE="\033[0;34m"
YELLOW="\033[1;33m"
GREEN="\033[0;32m"
RED="\033[0;31m"

# Logging helpers. They write to stderr so that they never end up in a value
# a function returns through stdout.
#   $1 - message text
log_info() { >&2 echo -e "${BLUE}[INFO]${NC} ${1}"; }

log_success() { >&2 echo -e "${GREEN}[SUCCESS]${NC} ${1}"; }

log_warning() { >&2 echo -e "${YELLOW}[WARNING]${NC} ${1}"; }

log_error() { >&2 echo -e "${RED}[ERROR]${NC} ${1}"; }
# Directory that holds this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Print the version directory to work with.
# Picks the entry of /opt/argus-metric/versions that sorts last under
# `sort -V`; falls back to /opt/argus-metric when the versions directory is
# missing or empty.
get_current_version_dir() {
  local versions_dir="/opt/argus-metric/versions"
  local fallback_dir="/opt/argus-metric"

  if [[ ! -d "$versions_dir" ]]; then
    echo "$fallback_dir"
    return
  fi

  local newest=$(ls -1 "$versions_dir" 2>/dev/null | sort -V | tail -1)
  if [[ -z "$newest" ]]; then
    echo "$fallback_dir"
  else
    echo "$versions_dir/$newest"
  fi
}
# Version directory selected by get_current_version_dir.
CURRENT_VERSION_DIR=$(get_current_version_dir)
# Log of the version checks, kept inside the current version directory.
LOG_FILE="${CURRENT_VERSION_DIR}/.version_check.log"
# The LATEST_VERSION file lives in the install root.
LOCAL_VERSION_FILE="/opt/argus-metric/LATEST_VERSION"
# Filled in by get_ftp_config.
REMOTE_VERSION_URL=""
# Resolve the FTP connection settings.
# Globals written: FTP_SERVER, FTP_USER, FTP_PASSWORD, REMOTE_VERSION_URL.
# Environment variables win; when any of the three is empty,
# $SCRIPT_DIR/../config/config.env is sourced if present. Values that are
# still empty afterwards get built-in defaults.
# NOTE(review): the default password is hard-coded here and the resulting URL
# embeds the credentials — callers should avoid logging REMOTE_VERSION_URL.
get_ftp_config() {
  log_info "获取 FTP 配置信息..."

  if [[ -n "$FTP_SERVER" && -n "$FTP_USER" && -n "$FTP_PASSWORD" ]]; then
    log_info "使用环境变量中的 FTP 配置"
  else
    local config_file="$SCRIPT_DIR/../config/config.env"
    if [[ -f "$config_file" ]]; then
      log_info "从配置文件读取 FTP 配置: $config_file"
      source "$config_file"
    fi
  fi

  # Defaults for anything neither the environment nor the file provided.
  : "${FTP_SERVER:=localhost}"
  : "${FTP_USER:=ftpuser}"
  : "${FTP_PASSWORD:=ZGClab1234!}"

  REMOTE_VERSION_URL="ftp://${FTP_USER}:${FTP_PASSWORD}@${FTP_SERVER}/LATEST_VERSION"
  log_info "FTP 配置来源: ${FTP_CONFIG_SOURCE:-环境变量/配置文件}"
}
# Fetch the version number published on the FTP server.
# Globals read: FTP_SERVER, FTP_USER, FTP_PASSWORD.
# Outputs: the remote version (all whitespace removed) on stdout.
# Returns: 0 on success; 1 when the server or the LATEST_VERSION file cannot
#          be reached, the download fails, or the version is empty.
#
# The logged address deliberately omits the credentials (this output ends up
# in the cron log), and the download status of curl is checked directly
# instead of being hidden behind a pipeline.
get_remote_version() {
  local server_url="ftp://${FTP_SERVER}"
  local version_url="${server_url}/LATEST_VERSION"
  local credentials="${FTP_USER}:${FTP_PASSWORD}"

  log_info "从 FTP 服务器获取远程版本号..."
  log_info "远程地址: $version_url"

  # Probe the server first.
  log_info "测试 FTP 连接..."
  if curl -u "$credentials" -sfI "${server_url}/" >/dev/null 2>&1; then
    log_success "FTP 服务器连接成功"
  else
    log_error "无法连接到 FTP 服务器: $FTP_SERVER"
    return 1
  fi

  # Then make sure the version file is there.
  log_info "检查远程 LATEST_VERSION 文件是否存在..."
  if curl -u "$credentials" -sfI "$version_url" >/dev/null 2>&1; then
    log_success "远程 LATEST_VERSION 文件存在"
  else
    log_error "远程 LATEST_VERSION 文件不存在或无法访问"
    return 1
  fi

  # Download the version and strip every whitespace character.
  local raw_version remote_version
  if ! raw_version=$(curl -u "$credentials" -sfL "$version_url" 2>/dev/null); then
    log_error "获取远程版本号失败"
    return 1
  fi
  remote_version=${raw_version//[[:space:]]/}
  if [[ -z "$remote_version" ]]; then
    log_error "远程版本号为空"
    return 1
  fi

  log_success "获取到远程版本号: $remote_version"
  echo "$remote_version"
}
# Print the locally recorded version (content of LOCAL_VERSION_FILE with all
# whitespace removed). Prints an empty line when the file is missing or
# empty; a warning is logged in both cases.
get_local_version() {
  if [[ ! -f "$LOCAL_VERSION_FILE" ]]; then
    log_warning "本地版本文件不存在: $LOCAL_VERSION_FILE"
    echo ""
    return
  fi

  local current=$(cat "$LOCAL_VERSION_FILE" 2>/dev/null | tr -d '[:space:]')
  if [[ -z "$current" ]]; then
    log_warning "本地版本文件为空"
    echo ""
    return
  fi

  log_info "本地版本号: $current"
  echo "$current"
}
# Update the installation to a given version.
# Arguments:
#   $1 - version to install
# Globals read: FTP_SERVER, FTP_USER, FTP_PASSWORD.
# Returns: 0 when setup.sh was downloaded and ran successfully, 1 otherwise.
#
# setup.sh is downloaded into a private directory created with mktemp (the
# script is executed right afterwards, so a guessable path in /tmp must be
# avoided), and the credentials are passed with -u so the URL that shows up
# in error logs does not contain the password.
update_to_version() {
  local new_version="$1"
  local temp_dir

  log_info "开始更新到版本: $new_version"

  if ! temp_dir=$(mktemp -d "${TMPDIR:-/tmp}/argus-update-XXXXXX"); then
    log_error "版本更新失败: $new_version"
    return 1
  fi
  local setup_script="$temp_dir/setup.sh"

  # Download the latest setup.sh.
  log_info "从 FTP 服务器下载最新的安装脚本..."
  local setup_url="ftp://${FTP_SERVER}/setup.sh"
  if ! curl -fsS -u "${FTP_USER}:${FTP_PASSWORD}" "$setup_url" -o "$setup_script"; then
    log_error "下载安装脚本失败: $setup_url"
    rm -rf -- "$temp_dir"
    return 1
  fi
  log_success "安装脚本下载完成"

  if ! chmod +x "$setup_script"; then
    log_error "版本更新失败: $new_version"
    rm -rf -- "$temp_dir"
    return 1
  fi

  # Run the installer for the requested version.
  log_info "执行安装脚本进行版本更新..."
  local status=0
  if "$setup_script" --server "$FTP_SERVER" --user "$FTP_USER" --password "$FTP_PASSWORD" --version "$new_version"; then
    log_success "版本更新完成: $new_version"
  else
    log_error "版本更新失败: $new_version"
    status=1
  fi

  rm -rf -- "$temp_dir"
  return "$status"
}
# Append a timestamped line to the version check log (LOG_FILE).
#   $1 - message text
log_check() {
  local message="$1"
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$message" >> "$LOG_FILE"
}
# Main routine of the version check.
# Compares the local version with the one published on the FTP server and
# calls update_to_version when they differ.
# Exits 1 when the remote version cannot be fetched or the update fails.
#
# The directories are created before the first log_check call: log_check
# appends to LOG_FILE inside CURRENT_VERSION_DIR, and under `set -e` a failed
# append (directory missing) would abort the whole check.
main() {
  log_info "开始版本校验检查..."

  # Make sure the system directories exist.
  mkdir -p "/opt/argus-metric"
  mkdir -p "$CURRENT_VERSION_DIR"

  log_check "版本校验检查开始"
  log_info "当前版本目录: $CURRENT_VERSION_DIR"

  get_ftp_config

  local local_version
  local_version=$(get_local_version)

  local remote_version
  if ! remote_version=$(get_remote_version); then
    log_error "无法获取远程版本号,跳过本次检查"
    log_check "版本校验失败:无法获取远程版本号"
    exit 1
  fi

  if [[ "$local_version" == "$remote_version" ]]; then
    log_info "版本一致,无需更新 (本地: $local_version, 远程: $remote_version)"
    log_check "版本校验完成:版本一致 ($local_version)"
  else
    log_info "检测到版本不一致 (本地: $local_version, 远程: $remote_version)"
    log_check "检测到版本不一致:本地($local_version) -> 远程($remote_version)"
    if update_to_version "$remote_version"; then
      log_success "版本更新成功: $local_version -> $remote_version"
      log_check "版本更新成功:$local_version -> $remote_version"
    else
      log_error "版本更新失败"
      log_check "版本更新失败:$local_version -> $remote_version"
      exit 1
    fi
  fi

  log_success "版本校验检查完成"
  log_check "版本校验检查完成"
}
# Script entry: run main only when the file is executed, not when sourced.
if [[ "${0}" == "${BASH_SOURCE[0]}" ]]; then
  main "$@"
fi

View File

@ -0,0 +1,991 @@
#!/bin/bash
set -e

# ANSI colour escape sequences (interpreted by `echo -e`).
NC="\033[0m"
BLUE="\033[0;34m"
YELLOW="\033[1;33m"
GREEN="\033[0;32m"
RED="\033[0;31m"

# Shared implementation of the log_* helpers: print a coloured line on stdout
# and append a timestamped, uncoloured copy to LOG_FILE.
#   $1 - colour sequence, $2 - level label, $3 - message text
_log_emit() {
  local color="$1"
  local message="[$2] $3"
  echo -e "${color}${message}${NC}"
  echo "$(date '+%Y-%m-%d %H:%M:%S') $message" >> "$LOG_FILE"
}

# Logging helpers; $1 is the message text. Note that they print on stdout.
log_info() { _log_emit "$BLUE" "INFO" "$1"; }

log_success() { _log_emit "$GREEN" "SUCCESS" "$1"; }

log_warning() { _log_emit "$YELLOW" "WARNING" "$1"; }

log_error() { _log_emit "$RED" "ERROR" "$1"; }
# Configuration variables.
# The install directory is the first argument; without one the current
# directory is used.
if [[ -n "${1:-}" ]]; then
  INSTALL_DIR="$1"
else
  INSTALL_DIR="$(pwd)"
fi
VERSION_FILE="version.json"
TEMP_DIR="/tmp/metrics-install-$$"
# Install log file
LOG_FILE="${INSTALL_DIR}/.install.log"
# Load config.env from the directory of this script, if present.
# Every variable assigned by the file is exported (set -a) while it is being
# sourced. When the file is missing only a warning is logged.
load_config() {
  local script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  local config_file="$script_dir/config.env"

  if [[ ! -f "$config_file" ]]; then
    log_warning "配置文件不存在: $config_file,使用默认配置"
    return
  fi

  log_info "加载配置文件: $config_file"
  set -a # auto-export everything the file defines
  source "$config_file"
  set +a # stop auto-exporting
  log_success "配置文件加载完成"
}
# Copy config.env into the install directory and make check_version.sh
# executable.
# Returns 1 only when copying config.env fails; a missing source config or a
# missing check_version.sh is reported with a warning.
copy_config_files() {
  log_info "复制配置文件到安装目录..."
  local script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  local source_config="$script_dir/../config/config.env"
  local target_config="$INSTALL_DIR/config.env"

  if [[ ! -f "$source_config" ]]; then
    log_warning "源配置文件不存在: $source_config"
  elif [[ "$source_config" == "$target_config" ]]; then
    # Source and target are the same path: nothing to copy.
    log_info "配置文件已在目标位置,跳过复制"
    log_success "配置文件已存在: $target_config"
  elif cp "$source_config" "$target_config"; then
    log_success "配置文件复制完成: $target_config"
  else
    log_error "配置文件复制失败"
    return 1
  fi

  # The version check script is expected to come from the artifact package.
  log_info "复制版本校验脚本到安装目录..."
  local target_check_version="$INSTALL_DIR/check_version.sh"
  if [[ ! -f "$target_check_version" ]]; then
    log_warning "版本校验脚本不存在: $target_check_version"
    log_info "请确保 check_version.sh 已包含在 artifact 包中"
    return
  fi
  log_info "版本校验脚本已存在,设置执行权限..."
  chmod +x "$target_check_version"
  log_success "版本校验脚本权限设置完成: $target_check_version"
}
# Abort with exit status 1 unless the script runs as root (EUID 0).
check_root() {
  if (( EUID == 0 )); then
    return
  fi
  log_error "此脚本需要 root 权限运行"
  log_info "请使用: sudo $0 [安装目录]"
  log_info "如果不指定安装目录,将使用当前目录: $(pwd)"
  exit 1
}
# Check the basic system requirements.
# Exits 1 when /etc/os-release is missing. Low disk space on / (below 10 GB)
# and low memory (below 4 GB) only produce warnings.
# Sourcing /etc/os-release sets its variables (NAME, VERSION, ...) in the
# current shell; arch, available_space and total_mem are left as globals.
check_system() {
  log_info "检查系统要求..."

  # Operating system
  if [[ ! -f /etc/os-release ]]; then
    log_error "无法检测操作系统版本"
    exit 1
  fi
  source /etc/os-release
  log_info "检测到操作系统: $NAME $VERSION"

  # CPU architecture
  arch=$(uname -m)
  log_info "系统架构: $arch"

  # Free space on /, reported by df in KB (10485760 KB = 10 GB)
  available_space=$(df / | awk 'NR==2 {print $4}')
  if (( available_space < 10485760 )); then
    log_warning "可用磁盘空间不足 10GB当前可用: $(($available_space / 1024 / 1024))GB"
  fi

  # Total memory in MB (4096 MB = 4 GB)
  total_mem=$(free -m | awk 'NR==2{print $2}')
  if (( total_mem < 4096 )); then
    log_warning "系统内存不足 4GB当前: ${total_mem}MB"
  fi
}
# Locate the version information file and store its absolute path in
# VERSION_FILE_PATH.
# Looks in the current directory first, then in every artifact/*/ directory.
# Exits 1 when no file is found.
find_version_file() {
  log_info "查找版本信息文件..."

  # Current directory
  if [[ -f "$VERSION_FILE" ]]; then
    VERSION_FILE_PATH="$(pwd)/$VERSION_FILE"
    log_success "找到版本文件: $VERSION_FILE"
    return 0
  fi

  # artifact/<version>/ directories
  local candidate
  for version_dir in artifact/*/; do
    candidate="${version_dir}${VERSION_FILE}"
    [[ -f "$candidate" ]] || continue
    VERSION_FILE_PATH="$(cd "$(dirname "$candidate")" && pwd)/$(basename "$candidate")"
    log_success "找到版本文件: $VERSION_FILE_PATH"
    return 0
  done

  log_error "未找到版本信息文件 $VERSION_FILE"
  exit 1
}
# Parse the version information file (VERSION_FILE_PATH).
# Globals written: VERSION, BUILD_TIME, component_count.
# Files written in TEMP_DIR:
#   components.txt    - "name:version" lines taken from .artifact_list
#   checksums.txt     - "name:checksum" lines taken from .checksums
#   install_order.txt - one entry of .install_order per line
# Exits 1 when the file is missing, is not valid JSON (jq path) or any of the
# three sections cannot be extracted.
parse_version_info() {
log_info "解析版本信息..."
if [[ ! -f "$VERSION_FILE_PATH" ]]; then
log_error "版本文件不存在: $VERSION_FILE_PATH"
exit 1
fi
# Parse the JSON with jq when it is available.
if command -v jq &> /dev/null; then
# Validate the JSON syntax first.
if ! jq empty "$VERSION_FILE_PATH" 2>/dev/null; then
log_error "JSON文件格式错误请检查 $VERSION_FILE_PATH"
exit 1
fi
VERSION=$(jq -r '.version' "$VERSION_FILE_PATH")
BUILD_TIME=$(jq -r '.build_time' "$VERSION_FILE_PATH")
# artifact_list section
if jq -e '.artifact_list' "$VERSION_FILE_PATH" > /dev/null 2>&1; then
jq -r '.artifact_list | to_entries[] | "\(.key):\(.value)"' "$VERSION_FILE_PATH" > "$TEMP_DIR/components.txt"
else
log_error "version.json 中缺少 artifact_list 字段"
exit 1
fi
# checksums section
if jq -e '.checksums' "$VERSION_FILE_PATH" > /dev/null 2>&1; then
jq -r '.checksums | to_entries[] | "\(.key):\(.value)"' "$VERSION_FILE_PATH" > "$TEMP_DIR/checksums.txt"
else
log_error "version.json 中缺少 checksums 字段"
exit 1
fi
# install_order section (its entries are complete file names)
if jq -e '.install_order' "$VERSION_FILE_PATH" > /dev/null 2>&1; then
jq -r '.install_order[]' "$VERSION_FILE_PATH" > "$TEMP_DIR/install_order.txt"
else
log_error "version.json 中缺少 install_order 字段"
exit 1
fi
else
log_warning "jq 未安装,使用简单的 JSON 解析"
# Minimal text-based JSON parsing.
# NOTE(review): each section below takes the 100 lines that follow its key
# (grep -A 100) and keeps everything matching the pattern, so lines of later
# sections with the same shape look like they are picked up too — confirm.
VERSION=$(grep '"version"' "$VERSION_FILE_PATH" | sed 's/.*"version": *"\([^"]*\)".*/\1/')
BUILD_TIME=$(grep '"build_time"' "$VERSION_FILE_PATH" | sed 's/.*"build_time": *"\([^"]*\)".*/\1/')
# artifact_list: skip the line holding the key itself
grep -A 100 '"artifact_list"' "$VERSION_FILE_PATH" | grep -v '"artifact_list"' | grep -E '^\s*"[^"]+":\s*"[^"]+"' | while read line; do
component=$(echo "$line" | sed 's/.*"\([^"]*\)":\s*"[^"]*".*/\1/')
version=$(echo "$line" | sed 's/.*"[^"]*":\s*"\([^"]*\)".*/\1/')
echo "$component:$version" >> "$TEMP_DIR/components.txt"
done
# checksums: skip the line holding the key itself
grep -A 100 '"checksums"' "$VERSION_FILE_PATH" | grep -v '"checksums"' | grep -E '^\s*"[^"]+":\s*"[^"]+"' | while read line; do
component=$(echo "$line" | sed 's/.*"\([^"]*\)":\s*"[^"]*".*/\1/')
checksum=$(echo "$line" | sed 's/.*"[^"]*":\s*"\([^"]*\)".*/\1/')
echo "$component:$checksum" >> "$TEMP_DIR/checksums.txt"
done
# install_order: skip the key line and keep the array elements only
grep -A 100 '"install_order"' "$VERSION_FILE_PATH" | grep -v '"install_order"' | grep -E '^\s*"[^"]+"' | while read line; do
component=$(echo "$line" | sed 's/.*"\([^"]*\)".*/\1/')
echo "$component" >> "$TEMP_DIR/install_order.txt"
done
# Make sure every section produced a non-empty file.
if [[ ! -f "$TEMP_DIR/components.txt" || ! -s "$TEMP_DIR/components.txt" ]]; then
log_error "无法解析 artifact_list请检查 version.json 格式"
exit 1
fi
if [[ ! -f "$TEMP_DIR/checksums.txt" || ! -s "$TEMP_DIR/checksums.txt" ]]; then
log_error "无法解析 checksums请检查 version.json 格式"
exit 1
fi
if [[ ! -f "$TEMP_DIR/install_order.txt" || ! -s "$TEMP_DIR/install_order.txt" ]]; then
log_error "无法解析 install_order请检查 version.json 格式"
exit 1
fi
fi
log_success "版本信息解析完成"
log_info " 版本: $VERSION"
log_info " 构建时间: $BUILD_TIME"
# Log the number of components and list each one with its version.
component_count=0
if [[ -f "$TEMP_DIR/components.txt" ]]; then
component_count=$(wc -l < "$TEMP_DIR/components.txt")
log_info " 组件数量: $component_count"
log_info " 组件列表:"
while IFS= read -r line; do
component=$(echo "$line" | cut -d':' -f1)
version=$(echo "$line" | cut -d':' -f2)
log_info " - $component v$version"
done < "$TEMP_DIR/components.txt"
else
log_error "components.txt 文件不存在"
exit 1
fi
}
# Verify the integrity of the component archives.
# For every "name:checksum" line of TEMP_DIR/checksums.txt the first
# <artifact dir>/<name>-*.tar.gz file is hashed with sha256sum and compared
# with the expected "sha256:<hex>" value.
# Exits 1 when an archive is missing or a checksum differs.
verify_checksums() {
  log_info "验证文件完整性..."
  artifact_dir=$(dirname "$VERSION_FILE_PATH")
  log_info "Artifact 目录: $artifact_dir"
  failed_verification=0

  if [[ -f "$TEMP_DIR/checksums.txt" ]]; then
    while IFS= read -r line; do
      component=$(echo "$line" | cut -d':' -f1)
      # Everything after the first ':' — the checksum itself contains one.
      expected_checksum=$(echo "$line" | cut -d':' -f2-)

      # Find the matching archive.
      actual_file=""
      for file in "$artifact_dir/${component}-"*.tar.gz; do
        [[ -f "$file" ]] || continue
        actual_file="$file"
        break
      done
      if [[ -z "$actual_file" ]]; then
        log_error "找不到组件文件: $component"
        failed_verification=1
        continue
      fi

      # Compute and compare the actual checksum.
      actual_checksum="sha256:$(sha256sum "$actual_file" | cut -d' ' -f1)"
      if [[ "$actual_checksum" != "$expected_checksum" ]]; then
        log_error " $component: 校验失败"
        log_error " 期望: $expected_checksum"
        log_error " 实际: $actual_checksum"
        failed_verification=1
        continue
      fi
      log_success " $component: 校验通过"
    done < "$TEMP_DIR/checksums.txt"
  fi

  if (( failed_verification == 1 )); then
    log_error "文件完整性验证失败"
    exit 1
  fi
  log_success "所有文件校验通过"
}
# Create the install directory and the temporary working directory.
create_install_dirs() {
  log_info "创建安装目录..."
  local dir
  for dir in "$INSTALL_DIR" "$TEMP_DIR"; do
    mkdir -p "$dir"
  done
  log_success "安装目录创建完成: $INSTALL_DIR"
}
# Print the dependency-set name that matches the running Ubuntu release.
# Arguments:
#   $1 - os-release file to read (optional, default /etc/os-release)
# Outputs: "ubuntu20" or "ubuntu22" on stdout; unknown releases fall back to
#          "ubuntu22".
# Returns: 1 when the os-release file does not exist.
#
# Callers capture stdout with $(...), and the log_* helpers of this script
# print on stdout as well, so every log call here is redirected to stderr.
# Without that, the warning for an unknown release would become part of the
# returned value. The file is sourced in a subshell so that its variables
# (NAME, VERSION, ...) do not overwrite globals of this script.
get_system_version() {
  local os_release_file="${1:-/etc/os-release}"

  if [[ ! -f "$os_release_file" ]]; then
    log_error "无法检测操作系统版本" >&2
    return 1
  fi

  local version_id
  version_id=$(
    source "$os_release_file" >/dev/null 2>&1
    printf '%s' "${VERSION_ID:-}"
  )

  case "$version_id" in
    "20.04")
      echo "ubuntu20"
      ;;
    "22.04")
      echo "ubuntu22"
      ;;
    *)
      log_warning "未识别的Ubuntu版本: ${version_id}尝试使用ubuntu22" >&2
      echo "ubuntu22"
      ;;
  esac
}
# Install the bundled system dependencies (offline mode).
# Every *.tar.gz below <artifact dir>/deps/<system version>/ is unpacked into
# /tmp/argus_deps and all .deb files found in it are installed with dpkg.
# Afterwards the cron service is started and the core dependencies
# (jq, cron, curl) are verified.
# Returns 0 when there is no dependency directory for this system version.
# Exits 1 when a core dependency is missing or any package failed to install.
#
# When `dpkg -i` fails, `dpkg --configure -a` is tried and then each package
# of the archive is checked individually with `dpkg -s`. (Checking whether
# `dpkg -l` lists any installed package is not enough: that is true on every
# Debian system and would hide real failures.)
install_system_deps() {
  log_info "开始安装系统依赖包(离线模式)..."

  local artifact_dir
  artifact_dir=$(dirname "$VERSION_FILE_PATH")
  local deps_dir="$artifact_dir/deps"
  local system_version
  system_version=$(get_system_version)
  local version_deps_dir="$deps_dir/$system_version"

  if [[ ! -d "$version_deps_dir" ]]; then
    log_warning "未找到 $system_version 版本的依赖目录: $version_deps_dir,跳过安装"
    return 0
  fi
  log_info "找到系统版本依赖目录: $version_deps_dir"

  local deps_temp_dir="/tmp/argus_deps"
  mkdir -p "$deps_temp_dir"
  rm -rf "${deps_temp_dir:?}"/*

  local FAILED_DEPS=()
  local CORE_DEPS=(jq cron curl) # core dependencies
  local tar_file tar_basename extract_dir deb pkg_name
  local -a deb_files=()
  local -a unresolved=()

  for tar_file in "$version_deps_dir"/*.tar.gz; do
    [[ -f "$tar_file" ]] || continue
    tar_basename=$(basename "$tar_file")
    log_info "处理依赖包: $tar_basename"

    extract_dir="$deps_temp_dir/${tar_basename%.tar.gz}"
    mkdir -p "$extract_dir"
    if tar -xzf "$tar_file" -C "$extract_dir"; then
      log_success " $tar_basename 解压完成"
    else
      log_error " $tar_basename 解压失败"
      FAILED_DEPS+=("$tar_basename")
      continue
    fi

    # Collect every .deb of the archive and install them in one dpkg call.
    mapfile -t deb_files < <(find "$extract_dir" -type f -name "*.deb")
    if [[ ${#deb_files[@]} -eq 0 ]]; then
      log_warning " 没有找到 deb 包,跳过"
      continue
    fi

    log_info " 安装 ${#deb_files[@]} 个 deb 包..."
    if dpkg -i "${deb_files[@]}" &>/tmp/dpkg_install.log; then
      log_success " 所有 deb 包安装成功"
      continue
    fi

    # Try to finish half-installed packages, then verify each one.
    dpkg --configure -a || true
    unresolved=()
    for deb in "${deb_files[@]}"; do
      pkg_name=$(dpkg-deb -f "$deb" Package 2>/dev/null || true)
      if [[ -z "$pkg_name" ]] \
        || ! dpkg -s "$pkg_name" 2>/dev/null | grep -q '^Status: install ok installed'; then
        unresolved+=("${pkg_name:-$deb}")
      fi
    done
    if [[ ${#unresolved[@]} -eq 0 ]]; then
      log_success " dpkg --configure 修复后安装成功"
    else
      log_error " 部分 deb 包安装失败,请手动安装"
      FAILED_DEPS+=("${unresolved[@]}")
    fi
  done

  # Start cron (needed by the scheduled jobs set up later).
  start_cron_service

  # All core dependencies must be installed.
  local missing_core=()
  local dep
  for dep in "${CORE_DEPS[@]}"; do
    if ! dpkg -s "$dep" &>/dev/null; then
      missing_core+=("$dep")
    fi
  done
  if [[ ${#missing_core[@]} -gt 0 ]]; then
    log_error "核心依赖安装失败,请手动安装以下组件:"
    local d
    for d in "${missing_core[@]}"; do
      echo " - $d"
    done
    exit 1
  fi

  # Report everything else that failed.
  if [[ ${#FAILED_DEPS[@]} -gt 0 ]]; then
    log_error "以下系统依赖安装失败,请手动安装后重试:"
    local f
    for f in "${FAILED_DEPS[@]}"; do
      echo " - $f"
    done
    exit 1
  fi

  log_success "系统依赖安装完成,全部就绪"
}
# Make sure the cron daemon is running.
# Returns 0 when cron already runs or was started; 1 when /usr/sbin/cron does
# not exist or could not be started.
start_cron_service() {
  local cron_bin="/usr/sbin/cron"

  log_info "检查并启动 cron 服务..."

  # Already running?
  if pgrep -x "cron" > /dev/null; then
    log_success "cron 服务已在运行"
    return 0
  fi

  if [[ ! -f "$cron_bin" ]]; then
    log_warning "cron 可执行文件不存在,跳过启动"
    return 1
  fi

  log_info "启动 cron 服务..."
  if ! { "$cron_bin" start 2>/dev/null || "$cron_bin" 2>/dev/null; }; then
    log_error "cron 服务启动失败"
    return 1
  fi
  log_success "cron 服务启动成功"

  # Give the daemon a moment, then confirm that it is up.
  sleep 2
  if pgrep -x "cron" > /dev/null; then
    log_success "cron 服务运行正常"
  else
    log_warning "cron 服务可能未正常启动"
  fi
}
# Install every component listed in TEMP_DIR/install_order.txt, in order.
# For each archive: unpack it into TEMP_DIR/<component>, run the install.sh
# found in the first extracted directory (with INSTALL_DIR as its argument),
# then move that directory to INSTALL_DIR/<component>, replacing an existing
# one.
# Exits 1 when an archive is missing, cannot be unpacked, contains no
# directory, has no install.sh, or its install.sh fails.
install_components() {
log_info "开始安装组件..."
artifact_dir=$(dirname "$VERSION_FILE_PATH")
log_info "Artifact 目录: $artifact_dir"
install_count=0
total_count=0
# Number of entries, used only for the progress output.
if [[ -f "$TEMP_DIR/install_order.txt" ]]; then
total_count=$(wc -l < "$TEMP_DIR/install_order.txt")
fi
if [[ -f "$TEMP_DIR/install_order.txt" ]]; then
while IFS= read -r filename; do
install_count=$((install_count + 1))
# Component name = file name without the -YYYYMMDD-HHMMSS.tar.gz suffix.
component=$(echo "$filename" | sed 's/-[0-9]\{8\}-[0-9]\{6\}\.tar\.gz$//')
log_info "[$install_count/$total_count] 安装 $component..."
log_info " 文件名: $filename"
# install_order holds complete file names, so use the name as is.
tar_file="$artifact_dir/$filename"
if [[ ! -f "$tar_file" ]]; then
log_error "找不到组件文件: $filename"
log_info " 期望路径: $tar_file"
log_info " 当前目录: $(pwd)"
log_info " 目录内容:"
# Log a listing of the artifact directory to help diagnose the failure.
ls -la "$artifact_dir" | while read line; do
log_info " $line"
done
exit 1
fi
log_info " 找到文件: $tar_file"
# Unpack into a per-component temporary directory.
component_temp_dir="$TEMP_DIR/$component"
mkdir -p "$component_temp_dir"
if tar -xzf "$tar_file" -C "$component_temp_dir" 2>/dev/null; then
log_success " $component 解压完成"
else
log_error " $component 解压失败"
exit 1
fi
# Use the first directory found in the unpacked archive.
extracted_dir=""
for dir in "$component_temp_dir"/*; do
if [[ -d "$dir" ]]; then
extracted_dir="$dir"
break
fi
done
if [[ -z "$extracted_dir" ]]; then
log_error " $component 解压后未找到目录"
exit 1
fi
# Run the component's own install script from inside its directory.
if [[ -f "$extracted_dir/install.sh" ]]; then
log_info " 执行 $component 安装脚本..."
if (cd "$extracted_dir" && ./install.sh "$INSTALL_DIR"); then
log_success " $component 安装完成"
else
log_error " $component 安装失败"
exit 1
fi
else
log_error " $component 缺少 install.sh 文件"
exit 1
fi
# Keep the unpacked directory as INSTALL_DIR/<component>.
component_install_dir="$INSTALL_DIR/$component"
# Simplified logic: an existing directory is deleted, no backup is made.
if [[ -d "$component_install_dir" ]]; then
log_info " 组件目录已存在,删除旧版本: $component_install_dir"
rm -rf "$component_install_dir"
# log_info " 组件目录已存在,备份后更新: $component_install_dir"
# mv "$component_install_dir" "${component_install_dir}.backup.$(date +%Y%m%d_%H%M%S)"
fi
mv "$extracted_dir" "$component_install_dir"
log_success " 组件目录已保存: $component_install_dir"
# Remove the temporary files of this component.
rm -rf "$component_temp_dir"
done < "$TEMP_DIR/install_order.txt"
fi
log_success "所有组件安装完成"
}
# Write the install record ($INSTALL_DIR/.install_record) as JSON.
# The record holds the version, build time, install time (UTC), install
# directory, the PID of this installer and, for every line of
# TEMP_DIR/components.txt, the component's version, a PID found for it (may
# be empty) and its install directory.
create_install_record() {
log_info "创建安装记录..."
# Wait a moment so that the component processes have had time to start.
log_info "等待进程启动..."
sleep 3
local install_time=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
local install_record_file="$INSTALL_DIR/.install_record"
# Start the JSON document; the "components" object is left open here.
cat > "$install_record_file" << EOF
{
"version": "$VERSION",
"build_time": "$BUILD_TIME",
"install_time": "$install_time",
"install_dir": "$INSTALL_DIR",
"install_pid": $$,
"components": {
EOF
# Append one entry per component.
local first_component=true
if [[ -f "$TEMP_DIR/components.txt" ]]; then
while IFS= read -r line; do
component=$(echo "$line" | cut -d':' -f1)
version=$(echo "$line" | cut -d':' -f2)
# PID of the component's process, if one can be found.
local component_pid=""
# Look the process up by name; several lookups are tried in turn and the
# first PID found is used. Components not listed here get an empty PID.
case "$component" in
"node-exporter")
# node_exporter process (underscore and hyphen spellings)
component_pid=$(pgrep -f "node_exporter" | head -1)
if [[ -z "$component_pid" ]]; then
component_pid=$(pgrep -f "node-exporter" | head -1)
fi
if [[ -z "$component_pid" ]]; then
component_pid=$(ps aux | grep -v grep | grep "node_exporter" | awk '{print $2}' | head -1)
fi
;;
"dcgm-exporter")
# dcgm-exporter process
component_pid=$(pgrep -f "dcgm-exporter" | head -1)
if [[ -z "$component_pid" ]]; then
component_pid=$(pgrep -f "dcgm_exporter" | head -1)
fi
if [[ -z "$component_pid" ]]; then
component_pid=$(ps aux | grep -v grep | grep "dcgm-exporter" | awk '{print $2}' | head -1)
fi
;;
"fluent-bit")
# fluent-bit process
component_pid=$(pgrep -f "fluent-bit" | head -1)
if [[ -z "$component_pid" ]]; then
component_pid=$(pgrep -f "fluent_bit" | head -1)
fi
if [[ -z "$component_pid" ]]; then
component_pid=$(ps aux | grep -v grep | grep "fluent-bit" | awk '{print $2}' | head -1)
fi
;;
"argus-agent")
# argus-agent process
component_pid=$(pgrep -f "argus-agent" | head -1)
if [[ -z "$component_pid" ]]; then
component_pid=$(ps aux | grep -v grep | grep "argus-agent" | awk '{print $2}' | head -1)
fi
;;
esac
# Log what was found.
if [[ -n "$component_pid" ]]; then
log_info " 找到 $component 进程 PID: $component_pid"
else
log_warning " 未找到 $component 进程"
fi
# Separate entries with a comma (none before the first one).
if [[ "$first_component" == "true" ]]; then
first_component=false
else
echo "," >> "$install_record_file"
fi
# Append this component's entry.
cat >> "$install_record_file" << EOF
"$component": {
"version": "$version",
"pid": "$component_pid",
"install_dir": "$INSTALL_DIR/$component"
}
EOF
done < "$TEMP_DIR/components.txt"
fi
# Close the "components" object and the document.
cat >> "$install_record_file" << EOF
}
}
EOF
log_success "安装记录已创建: $install_record_file"
}
# Tell whether a crontab file already contains a given task.
# Arguments:
#   $1 - pattern to look for (grep basic regular expression)
#   $2 - crontab file to search
# Returns: 0 when a line matches, 1 otherwise.
check_cron_task_exists() {
  local task_pattern="$1"
  local temp_cron="$2"

  grep -q "$task_pattern" "$temp_cron" && return 0
  return 1
}
# Replace one job in the current user's crontab.
# Arguments:
#   $1 - fixed string that identifies the job's lines (script file name)
#   $2 - header comment line written above the entry
#   $3 - complete crontab entry
#   $4 - message logged when an older entry is found
#   $5 - message logged after the older entry was dropped
# Returns: 0 when the new crontab was installed, 1 otherwise.
#
# Older lines of the job AND older copies of its header comment are removed
# before the new pair is appended, so repeated installs do not pile up
# comment lines. The result of the filtering grep is ignored on purpose: it
# exits 1 when no line is left, which must not abort the installer under
# `set -e`. The working file is created with mktemp.
_replace_cron_job() {
  local task_pattern="$1"
  local header="$2"
  local entry="$3"
  local found_msg="$4"
  local removed_msg="$5"

  local temp_cron
  if ! temp_cron=$(mktemp "${TMPDIR:-/tmp}/crontab_XXXXXX"); then
    return 1
  fi

  # `crontab -l` fails when the user has no crontab yet; start empty then.
  crontab -l 2>/dev/null > "$temp_cron" || : > "$temp_cron"

  local had_old=false
  if check_cron_task_exists "$task_pattern" "$temp_cron"; then
    had_old=true
    log_info "$found_msg"
  fi

  grep -vF -e "$task_pattern" -e "$header" "$temp_cron" > "$temp_cron.new" || true
  mv "$temp_cron.new" "$temp_cron"
  if [[ "$had_old" == "true" ]]; then
    log_info "$removed_msg"
  fi

  printf '%s\n' "$header" "$entry" >> "$temp_cron"

  local status=0
  crontab "$temp_cron" || status=1
  rm -f "$temp_cron"
  return "$status"
}

# Schedule the health check (every 5 minutes).
# Uses the script of the current install directory, not a "current" symlink.
# Returns 1 when check_health.sh is missing or the crontab cannot be updated.
setup_health_check_cron() {
  log_info "设置健康检查定时任务..."
  local check_health_script="$INSTALL_DIR/check_health.sh"

  if [[ ! -f "$check_health_script" ]]; then
    log_error "健康检查脚本不存在: $check_health_script"
    return 1
  fi
  chmod +x "$check_health_script"

  if ! _replace_cron_job "check_health.sh" \
    "# Argus-Metrics 健康检查定时任务" \
    "*/5 * * * * $check_health_script >> $INSTALL_DIR/.health_cron.log 2>&1" \
    "发现旧的健康检查定时任务,正在更新..." \
    "旧的健康检查定时任务已删除"; then
    log_error "健康检查定时任务设置失败"
    return 1
  fi

  log_success "健康检查定时任务设置成功"
  log_info " 执行频率: 每5分钟"
  log_info " 日志文件: $INSTALL_DIR/.health_cron.log"
  log_info " 查看定时任务: crontab -l"
  log_info " 删除定时任务: crontab -e"
  log_info "健康检查通过crontab自动执行"
}

# Schedule the DNS sync (every minute).
# A missing sync_dns.sh is not an error: the job is skipped with a warning.
# Returns 1 when the crontab cannot be updated.
setup_dns_sync_cron() {
  log_info "设置 DNS 同步定时任务..."
  local sync_dns_script="$INSTALL_DIR/sync_dns.sh"

  if [[ ! -f "$sync_dns_script" ]]; then
    log_warning "DNS 同步脚本不存在: $sync_dns_script"
    log_warning "跳过 DNS 同步定时任务设置"
    return 0
  fi
  chmod +x "$sync_dns_script"

  if ! _replace_cron_job "sync_dns.sh" \
    "# Argus-Metrics DNS 同步定时任务" \
    "* * * * * $sync_dns_script >> $INSTALL_DIR/.dns_sync.log 2>&1" \
    "发现旧的 DNS 同步定时任务,正在更新..." \
    "旧的 DNS 同步定时任务已删除"; then
    log_error "DNS 同步定时任务设置失败"
    return 1
  fi

  log_success "DNS 同步定时任务设置成功"
  log_info " 执行频率: 每1分钟"
  log_info " 日志文件: $INSTALL_DIR/.dns_sync.log"
  log_info " 查看定时任务: crontab -l"
  log_info " 删除定时任务: crontab -e"
  log_info "DNS 同步通过crontab自动执行"
}

# Schedule the version check. The entry runs every minute ("*/1"), which is
# also the frequency reported in the log output.
# A missing check_version.sh is not an error: the job is skipped.
# Returns 1 when the crontab cannot be updated.
setup_version_check_cron() {
  log_info "设置版本校验定时任务..."
  local check_version_script="$INSTALL_DIR/check_version.sh"

  if [[ ! -f "$check_version_script" ]]; then
    log_warning "版本校验脚本不存在: $check_version_script"
    log_info "跳过版本校验定时任务设置"
    return 0
  fi
  chmod +x "$check_version_script"

  if ! _replace_cron_job "check_version.sh" \
    "# Argus-Metrics 版本校验定时任务" \
    "*/1 * * * * $check_version_script >> $INSTALL_DIR/.version_check.log 2>&1" \
    "发现旧的版本校验定时任务,正在更新..." \
    "旧的版本校验定时任务已删除"; then
    log_error "版本校验定时任务设置失败"
    return 1
  fi

  log_success "版本校验定时任务设置成功"
  log_info " 执行频率: 每1分钟"
  log_info " 日志文件: $INSTALL_DIR/.version_check.log"
  log_info " 查看定时任务: crontab -l"
  log_info " 删除定时任务: crontab -e"
  log_info "版本校验通过crontab自动执行"
}

# Schedule the automatic restart of unhealthy components (every 2 minutes).
# A missing restart_unhealthy.sh is not an error: the job is skipped.
# Returns 1 when the crontab cannot be updated.
setup_restart_cron() {
  log_info "设置自动重启定时任务..."
  local restart_script="$INSTALL_DIR/restart_unhealthy.sh"

  if [[ ! -f "$restart_script" ]]; then
    log_warning "重启脚本不存在: $restart_script"
    log_info "跳过自动重启定时任务设置"
    return 0
  fi
  chmod +x "$restart_script"

  if ! _replace_cron_job "restart_unhealthy.sh" \
    "# Argus-Metrics 自动重启定时任务" \
    "*/2 * * * * $restart_script >> $INSTALL_DIR/.restart.log 2>&1" \
    "发现旧的自动重启定时任务,正在更新..." \
    "旧的自动重启定时任务已删除"; then
    log_error "自动重启定时任务设置失败"
    return 1
  fi

  log_success "自动重启定时任务设置成功"
  log_info " 执行频率: 每2分钟"
  log_info " 日志文件: $INSTALL_DIR/.restart.log"
  log_info " 查看定时任务: crontab -l"
  log_info " 删除定时任务: crontab -e"
  log_info "自动重启检查通过crontab自动执行"
}
# Print the final installation summary, including where the log was saved.
show_install_info() {
  local log_path="$LOG_FILE"

  log_success "Argus-Metrics All-in-One 安装完成!"
  printf '\n'
  log_info "安装日志已保存到: $log_path"
  log_info "如需查看详细日志,请执行: cat $log_path"
  printf '\n'
}
# Remove the temporary working directory; registered to run on every exit.
cleanup() {
  [[ ! -d "$TEMP_DIR" ]] || rm -rf "$TEMP_DIR"
}
trap cleanup EXIT
# Main routine of the installer.
# Prints the banner, starts a fresh install log, loads the configuration and
# then runs every installation step in order.
main() {
  local rule="=========================================="

  echo "$rule"
  echo " Argus-Metrics All-in-One 安装脚本 v1.0"
  echo "$rule"
  echo

  # Start a fresh log file with a small header.
  mkdir -p "$INSTALL_DIR"
  {
    echo "$rule"
    echo " Argus-Metrics All-in-One 安装日志"
    echo " 开始时间: $(date '+%Y-%m-%d %H:%M:%S')"
    echo "$rule"
  } > "$LOG_FILE"

  load_config
  log_info "安装目录: $INSTALL_DIR"
  log_info "日志文件: $LOG_FILE"
  echo

  local step
  for step in check_root check_system find_version_file create_install_dirs \
    install_system_deps parse_version_info verify_checksums \
    install_components copy_config_files create_install_record \
    setup_health_check_cron setup_dns_sync_cron setup_version_check_cron \
    setup_restart_cron; do
    "$step"
  done

  # No immediate health check is run here: the first run is left to the cron
  # job, so the check is not executed twice.
  show_install_info
}
# Script entry: run main only when the file is executed, not when sourced.
if [[ "${0}" == "${BASH_SOURCE[0]}" ]]; then
  main "$@"
fi

View File

@ -0,0 +1,474 @@
#!/bin/bash
set -e
# ANSI color escape sequences; expanded by `echo -e` in the helpers below.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # reset to the terminal default

# _emit_log COLOR LEVEL MESSAGE
# Print "<COLOR>[LEVEL]<reset> MESSAGE" to stdout.
_emit_log() {
  local color="$1" level="$2" message="$3"
  echo -e "${color}[${level}]${NC} ${message}"
}

# Level-specific logging helpers. Arguments: $1 - message text.
log_info() { _emit_log "$BLUE" "INFO" "$1"; }
log_success() { _emit_log "$GREEN" "SUCCESS" "$1"; }
log_warning() { _emit_log "$YELLOW" "WARNING" "$1"; }
log_error() { _emit_log "$RED" "ERROR" "$1"; }
# Print the usage text of the packaging script to stdout.
# $0 is expanded so the text shows the name the script was invoked with.
show_help() {
  cat <<EOF
AIOps All-in-One 打包脚本

用法: $0 [选项]

选项:
 --force 强制重新打包,即使版本已存在
 --help 显示此帮助信息

示例:
 $0 # 正常打包,跳过已存在的版本
 $0 --force # 强制重新打包

EOF
}
# Parse the command line; only the first argument is inspected.
#   --force       repackage even if this version already exists
#   --help | -h   print usage and exit
FORCE_PACKAGE=false
case "$1" in
  --force)
    FORCE_PACKAGE=true
    log_info "强制重新打包模式"
    ;;
  --help | -h)
    show_help
    exit 0
    ;;
esac
# Record the starting directory and the version being packaged.
# VERSION falls back to "1.0.0" when config/VERSION cannot be read, but the
# existence check below still aborts in that case.
CURRENT_DIR=$(pwd)
VERSION=$(cat config/VERSION 2>/dev/null || echo "1.0.0")
ARTIFACT_DIR="artifact/$VERSION"
log_info "开始打包 AIOps All-in-One 安装包 v$VERSION"
# Verify that the required input files exist (paths are relative to the
# current working directory)
log_info "检查必要文件..."
if [[ ! -f "config/VERSION" ]]; then
log_error "VERSION 文件不存在"
exit 1
fi
if [[ ! -f "config/checklist" ]]; then
log_error "checklist 文件不存在"
exit 1
fi
# Skip packaging when this version was already packaged completely.
# Only runs when the artifact directory exists and --force was not given.
if [[ -d "$ARTIFACT_DIR" && "$FORCE_PACKAGE" == "false" ]]; then
log_info "检查版本 $VERSION 是否已存在..."
# A version.json must be present for the version to count as packaged
if [[ -f "$ARTIFACT_DIR/version.json" ]]; then
log_info "找到已存在的版本信息文件"
# Counters: components seen, and components whose tarball is missing
missing_files=0
existing_components=0
# Use the existing version.json to decide which files to look for
if command -v jq &> /dev/null; then
# With jq: iterate over the keys of .artifact_list
while IFS= read -r component; do
existing_components=$((existing_components + 1))
# Look for a "<component>-*.tar.gz" file in the artifact directory
found_file=false
for file in "$ARTIFACT_DIR/${component}-"*.tar.gz; do
if [[ -f "$file" ]]; then
found_file=true
break
fi
done
if [[ "$found_file" == "false" ]]; then
missing_files=$((missing_files + 1))
log_warning " 缺少文件: $component"
fi
done < <(jq -r '.artifact_list | keys[]' "$ARTIFACT_DIR/version.json" 2>/dev/null)
else
# Without jq: just count the *.tar.gz files that are present
# (missing_files stays 0 on this path)
for file in "$ARTIFACT_DIR"/*.tar.gz; do
if [[ -f "$file" ]]; then
existing_components=$((existing_components + 1))
fi
done
fi
# Nothing missing and at least one component found: skip repackaging
if [[ $missing_files -eq 0 && $existing_components -gt 0 ]]; then
log_success "版本 $VERSION 已完整打包,跳过重复打包"
echo
echo "现有文件:"
ls -la "$ARTIFACT_DIR"
echo
echo "如需强制重新打包,请删除目录: rm -rf $ARTIFACT_DIR"
echo "或使用: ./package.sh --force"
exit 0
else
log_warning "版本 $VERSION 存在但不完整,将重新打包"
log_info " 现有组件: $existing_components"
log_info " 缺少文件: $missing_files"
fi
else
log_warning "版本目录存在但缺少 version.json将重新打包"
fi
fi
# Create the artifact output directory.
mkdir -p "$ARTIFACT_DIR"
log_info "创建输出目录: $ARTIFACT_DIR"

# Scratch directory holding the intermediate data files used while packaging.
TEMP_DIR=$(mktemp -d)
# Remove the scratch directory on every exit path. Several later error paths
# (and any `set -e` abort) leave the script without deleting it explicitly.
trap 'rm -rf -- "$TEMP_DIR"' EXIT
COMPONENTS_FILE="$TEMP_DIR/components.txt"
VERSIONS_FILE="$TEMP_DIR/versions.txt"
DEPENDENCIES_FILE="$TEMP_DIR/dependencies.txt"
INSTALL_ORDER_FILE="$TEMP_DIR/install_order.txt"
CHECKSUMS_FILE="$TEMP_DIR/checksums.txt"
ARTIFACT_LIST_FILE="$TEMP_DIR/artifact_list.txt"
# Parse config/checklist into the intermediate data files.
# Each non-comment line has the form:
#   <component> <directory> <version> [dependency] [install order]
log_info "解析组件清单..."
line_num=0
component_count=0
# `|| [[ -n "$line" ]]` keeps the final line when the file does not end with
# a newline; a plain `read` loop would silently drop that component.
while IFS= read -r line || [[ -n "$line" ]]; do
  # Skip empty lines and comment lines.
  [[ -z "$line" || "$line" =~ ^[[:space:]]*# ]] && continue
  line_num=$((line_num + 1))

  read -r component component_path version dep_component order <<< "$line"
  if [[ -z "$component" || -z "$component_path" || -z "$version" ]]; then
    log_warning "跳过无效行 $line_num: $line"
    continue
  fi

  # Record the component, its version and its source directory.
  echo "$component" >> "$COMPONENTS_FILE"
  echo "$component:$version" >> "$VERSIONS_FILE"
  echo "$component:$component_path" >> "$TEMP_DIR/component_paths.txt"

  # A component listed as its own dependency is ignored.
  if [[ -n "$dep_component" && "$dep_component" != "$component" ]]; then
    echo "$component:$dep_component" >> "$DEPENDENCIES_FILE"
  fi

  if [[ -n "$order" && "$order" =~ ^[0-9]+$ ]]; then
    echo "$order:$component" >> "$INSTALL_ORDER_FILE"
  else
    # No explicit order given: fall back to the parse order.
    echo "$line_num:$component" >> "$INSTALL_ORDER_FILE"
  fi

  component_count=$((component_count + 1))
  log_info " - $component v$version"
done < config/checklist

if [[ $component_count -eq 0 ]]; then
  log_error "没有找到有效的组件"
  rm -rf "$TEMP_DIR"
  exit 1
fi
log_success "找到 $component_count 个组件"
# Verify that every component's source directory exists.
# Missing directories are collected first so they can all be reported at once.
log_info "检查组件目录..."
missing_components=()
while IFS= read -r component; do
# Look up the component's path ("<component>:<path>" lines written while
# parsing the checklist); the component name is used as a grep pattern here
component_path=$(grep "^$component:" "$TEMP_DIR/component_paths.txt" | cut -d':' -f2-)
if [[ -z "$component_path" ]]; then
log_error "未找到组件 $component 的路径配置"
log_info "请检查 component_paths.txt 文件或添加路径配置"
exit 1
fi
if [[ ! -d "$component_path" ]]; then
missing_components+=("$component:$component_path")
fi
done < "$COMPONENTS_FILE"
# Report all missing directories, clean up and abort
if [[ ${#missing_components[@]} -gt 0 ]]; then
log_error "以下组件目录不存在:"
for component_path in "${missing_components[@]}"; do
echo " - $component_path"
done
rm -rf "$TEMP_DIR"
exit 1
fi
# Package every component listed in COMPONENTS_FILE, in file order.
# For each component: run its own package.sh, move the produced tarball into
# the artifact directory, and record its checksum, version and file name.
log_info "开始打包组件..."
while IFS= read -r component; do
# Look up the component's version and source path
version=$(grep "^$component:" "$VERSIONS_FILE" | cut -d':' -f2)
component_path=$(grep "^$component:" "$TEMP_DIR/component_paths.txt" | cut -d':' -f2-)
if [[ -z "$component_path" ]]; then
log_error "未找到组件 $component 的路径配置"
log_info "请检查 component_paths.txt 文件或添加路径配置"
exit 1
fi
log_info "打包 $component v$version..."
log_info " 组件路径: $component_path"
# Enter the component directory
cd "$component_path"
# The component must ship its own package.sh
if [[ ! -f "package.sh" ]]; then
log_error "$component 缺少 package.sh 文件"
cd "$CURRENT_DIR"
rm -rf "$TEMP_DIR"
exit 1
fi
# Run the component's packaging script
if ./package.sh; then
# Pick up the generated tarball.
# NOTE(review): this takes the first *.tar.gz that find returns anywhere under
# the component directory, so a stale or unrelated tarball could be selected -
# confirm each component leaves exactly one.
tar_file=$(find . -name "*.tar.gz" -type f | head -1)
if [[ -n "$tar_file" ]]; then
# Move it into the artifact directory
mv "$tar_file" "$CURRENT_DIR/$ARTIFACT_DIR/"
tar_filename=$(basename "$tar_file")
# Compute the SHA-256 checksum of the moved file
checksum=$(sha256sum "$CURRENT_DIR/$ARTIFACT_DIR/$tar_filename" | cut -d' ' -f1)
echo "$component:sha256:$checksum" >> "$CHECKSUMS_FILE"
echo "$component:$version" >> "$ARTIFACT_LIST_FILE"
# Record the tarball file name; this list becomes "install_order" in version.json
echo "$tar_filename" >> "$TEMP_DIR/install_order_files.txt"
log_success " $component 打包完成: $tar_filename"
else
log_error "$component 打包失败,未找到生成的 tar 包"
cd "$CURRENT_DIR"
rm -rf "$TEMP_DIR"
exit 1
fi
else
log_error "$component 打包失败"
cd "$CURRENT_DIR"
rm -rf "$TEMP_DIR"
exit 1
fi
# Return to the starting directory
cd "$CURRENT_DIR"
done < "$COMPONENTS_FILE"
# Generate version.json describing this build: version, build time, packaged
# components, checksums, dependencies and install order.
log_info "生成版本信息文件..."
version_json="$ARTIFACT_DIR/version.json"

# join_by_comma ITEM...
# Print the arguments joined with "," (prints nothing without arguments).
join_by_comma() {
  local IFS=','
  printf '%s' "$*"
}

# "dependencies": one "component":["dependency"] pair per recorded line.
dep_pairs=()
if [[ -f "$DEPENDENCIES_FILE" ]]; then
  while IFS= read -r line; do
    component=$(cut -d':' -f1 <<< "$line")
    dep=$(cut -d':' -f2 <<< "$line")
    dep_pairs+=("\"$component\":[\"$dep\"]")
  done < "$DEPENDENCIES_FILE"
fi
deps_json=$(join_by_comma "${dep_pairs[@]}")

# "install_order": the tarball file names in the order they were packaged.
order_items=()
if [[ -f "$TEMP_DIR/install_order_files.txt" ]]; then
  while IFS= read -r filename; do
    order_items+=("\"$filename\"")
  done < "$TEMP_DIR/install_order_files.txt"
fi
order_array=$(join_by_comma "${order_items[@]}")

# "artifact_list": "component":"version" pairs.
artifact_pairs=()
if [[ -f "$ARTIFACT_LIST_FILE" ]]; then
  while IFS= read -r line; do
    component=$(cut -d':' -f1 <<< "$line")
    version=$(cut -d':' -f2 <<< "$line")
    artifact_pairs+=("\"$component\":\"$version\"")
  done < "$ARTIFACT_LIST_FILE"
fi
artifact_json=$(join_by_comma "${artifact_pairs[@]}")

# "checksums": "component":"sha256:<digest>" pairs (everything after the
# first colon is kept as the value).
checksum_pairs=()
if [[ -f "$CHECKSUMS_FILE" ]]; then
  while IFS= read -r line; do
    component=$(cut -d':' -f1 <<< "$line")
    checksum=$(cut -d':' -f2- <<< "$line")
    checksum_pairs+=("\"$component\":\"$checksum\"")
  done < "$CHECKSUMS_FILE"
fi
checksums_json=$(join_by_comma "${checksum_pairs[@]}")

# Write the complete version.json.
cat > "$version_json" << EOF
{
"version": "$VERSION",
"build_time": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
"artifact_list": {
$artifact_json
},
"checksums": {
$checksums_json
},
"dependencies": {
$deps_json
},
"install_order": [
$order_array
]
}
EOF
log_success "版本信息文件生成完成: $version_json"
# copy_packaged_script SRC DEST START_MSG DONE_LABEL
# Log START_MSG, then copy SRC to DEST and mark it executable. A missing SRC
# only produces a warning.
copy_packaged_script() {
  local src="$1" dest="$2" start_msg="$3" done_label="$4"

  log_info "$start_msg"
  if [[ ! -f "$src" ]]; then
    log_warning "$src 文件不存在"
    return 0
  fi
  cp "$src" "$dest"
  chmod +x "$dest"
  log_success "${done_label}复制完成: $dest"
}

# Helper scripts shipped alongside the component tarballs: install,
# uninstall, health check, DNS sync, version check and auto-restart.
copy_packaged_script "scripts/install_artifact.sh" "$ARTIFACT_DIR/install.sh" \
  "复制安装脚本..." "安装脚本"
copy_packaged_script "scripts/uninstall_artifact.sh" "$ARTIFACT_DIR/uninstall.sh" \
  "复制卸载脚本..." "卸载脚本"
copy_packaged_script "scripts/check_health.sh" "$ARTIFACT_DIR/check_health.sh" \
  "复制健康检查脚本..." "健康检查脚本"
copy_packaged_script "scripts/sync_dns.sh" "$ARTIFACT_DIR/sync_dns.sh" \
  "复制 DNS 同步脚本..." "DNS 同步脚本"
copy_packaged_script "scripts/check_version.sh" "$ARTIFACT_DIR/check_version.sh" \
  "复制版本校验脚本..." "版本校验脚本"
copy_packaged_script "scripts/restart_unhealthy.sh" "$ARTIFACT_DIR/restart_unhealthy.sh" \
  "复制自动重启脚本..." "自动重启脚本"

# Configuration file: copied as-is, without the executable bit.
log_info "复制配置文件..."
if [[ -f "config/config.env" ]]; then
  cp "config/config.env" "$ARTIFACT_DIR/"
  log_success "配置文件复制完成: $ARTIFACT_DIR/config.env"
else
  log_warning "config 目录不存在,跳过配置文件复制"
fi
# The DNS configuration file is not copied into the version directory; it is
# fetched directly from the FTP server root.
# Copy the deps directory (system dependency packages) into the artifact directory
log_info "复制系统依赖包..."
if [[ -d "deps" ]]; then
cp -r "deps" "$ARTIFACT_DIR/"
log_success "系统依赖包复制完成: $ARTIFACT_DIR/deps"
# List the dependency tarballs that were copied
log_info " 依赖包列表:"
find "$ARTIFACT_DIR/deps" -name "*.tar.gz" -exec basename {} \; | while read dep_file; do
log_info " - $dep_file"
done
else
log_warning "deps 目录不存在,跳过依赖包复制"
fi
# Print the packaging summary
log_success "打包完成!"
echo
echo "版本: $VERSION"
echo "输出目录: $ARTIFACT_DIR"
echo "包含组件:"
if [[ -f "$ARTIFACT_LIST_FILE" ]]; then
# One "<component>:<version>" line per packaged component
while IFS= read -r line; do
component=$(echo "$line" | cut -d':' -f1)
version=$(echo "$line" | cut -d':' -f2)
echo " - $component v$version"
done < "$ARTIFACT_LIST_FILE"
fi
echo
echo "文件列表:"
ls -la "$ARTIFACT_DIR"
echo
# Remove the temporary data files
rm -rf "$TEMP_DIR"

View File

@ -0,0 +1,293 @@
#!/bin/bash
set -e
# ANSI color escape sequences; expanded by `echo -e` in the helpers below.
GREEN='\033[0;32m'
BLUE='\033[0;34m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
NC='\033[0m' # reset to the terminal default

# _emit_log COLOR LEVEL MESSAGE
# Print "<COLOR>[LEVEL]<reset> MESSAGE" to stdout.
_emit_log() {
  local color="$1" level="$2" message="$3"
  echo -e "${color}[${level}]${NC} ${message}"
}

# Level-specific logging helpers. Arguments: $1 - message text.
log_info() { _emit_log "$BLUE" "INFO" "$1"; }
log_success() { _emit_log "$GREEN" "SUCCESS" "$1"; }
log_warning() { _emit_log "$YELLOW" "WARNING" "$1"; }
log_error() { _emit_log "$RED" "ERROR" "$1"; }
# Print the usage text of the publish script to stdout.
# $0 is expanded so the text shows the name the script was invoked with.
show_help() {
  cat <<EOF
Argus-Metric Artifact 发布脚本

用法: $0 <版本号> [选项]

参数:
 <版本号> 要发布的版本号,对应 artifact 目录中的版本

选项:
 --output-dir <路径> 指定输出目录 (默认: /private/argus/ftp/share/)
 --owner <uid:gid> 指定文件所有者 (默认: 2133:2015)
 -h, --help 显示此帮助信息

示例:
 $0 1.20.0 # 使用默认配置发布
 $0 1.20.0 --output-dir /tmp/publish # 指定输出目录
 $0 1.20.0 --owner 1000:1000 # 指定文件所有者
 $0 1.20.0 --output-dir /srv/ftp --owner root:root # 同时指定两者

EOF
}
# Default configuration.
DEFAULT_PUBLISH_DIR="/private/argus/ftp/share/"
DEFAULT_OWNER="2133:2015"

# Parse the command line:
#   <version>            first non-option argument (required)
#   --output-dir <path>  publish directory (default: DEFAULT_PUBLISH_DIR)
#   --owner <uid:gid>    owner of the published files (default: DEFAULT_OWNER)
#   -h, --help           print usage and exit
VERSION=""
PUBLISH_DIR="$DEFAULT_PUBLISH_DIR"
OWNER="$DEFAULT_OWNER"
while [[ $# -gt 0 ]]; do
  case "$1" in
    -h | --help)
      show_help
      exit 0
      ;;
    --output-dir | --owner)
      # Without this check a missing value makes `shift 2` fail, and under
      # `set -e` the script would exit without any message.
      if [[ $# -lt 2 || -z "$2" ]]; then
        log_error "选项 $1 需要一个参数值"
        show_help
        exit 1
      fi
      if [[ "$1" == "--output-dir" ]]; then
        PUBLISH_DIR="$2"
      else
        OWNER="$2"
      fi
      shift 2
      ;;
    *)
      # The first free-standing argument is the version; any further one is
      # an error.
      if [[ -z "$VERSION" ]]; then
        VERSION="$1"
        shift
      else
        log_error "未知参数: $1"
        show_help
        exit 1
      fi
      ;;
  esac
done

# The version argument is mandatory.
if [[ -z "$VERSION" ]]; then
  log_error "请提供版本号参数"
  show_help
  exit 1
fi
# Directory holding the packaged artifacts of the requested version
# (relative to the current working directory)
ARTIFACT_DIR="artifact/$VERSION"
# The version must have been packaged already
if [[ ! -d "$ARTIFACT_DIR" ]]; then
log_error "版本目录不存在: $ARTIFACT_DIR"
exit 1
fi
log_info "开始发布版本: $VERSION"
log_info "输出目录: $PUBLISH_DIR"
log_info "文件所有者: $OWNER"
# Make sure the publish directory exists
log_info "确保发布目录存在: $PUBLISH_DIR"
mkdir -p "$PUBLISH_DIR"
# Parse and validate the owner; chown is only performed when it is needed
IFS=':' read -r OWNER_UID OWNER_GID <<< "$OWNER"
if [[ -z "$OWNER_UID" || -z "$OWNER_GID" ]]; then
log_error "--owner 格式不正确,应为 uid:gid"
exit 1
fi
CURRENT_UID=$(id -u)
CURRENT_GID=$(id -g)
# The comparison is textual against the numeric ids of the current user, so a
# symbolic owner such as root:root never equals them
if [[ "$OWNER_UID" != "$CURRENT_UID" || "$OWNER_GID" != "$CURRENT_GID" ]]; then
# Changing ownership to a different user is only attempted as root (uid 0)
if [[ "$CURRENT_UID" -ne 0 ]]; then
log_error "当前用户 (${CURRENT_UID}:${CURRENT_GID}) 无法设置所有者为 ${OWNER_UID}:${OWNER_GID}"
log_error "请以目标用户运行脚本或预先调整目录权限"
exit 1
fi
NEED_CHOWN=true
else
NEED_CHOWN=false
fi
# Create a private staging directory for the release package. mktemp gives an
# unpredictable name (the old "/tmp/...-$$" name could be pre-created by
# another user), and the EXIT trap removes it on error paths as well.
TEMP_PACKAGE_DIR=$(mktemp -d "${TMPDIR:-/tmp}/argus-metric-package-XXXXXX")
trap 'rm -rf -- "$TEMP_PACKAGE_DIR"' EXIT

# Copy every *.tar.gz found under the artifact directory into the staging
# directory. NUL-delimited find output keeps paths with whitespace intact.
log_info "准备 artifact 文件..."
tar_file_count=0
while IFS= read -r -d '' file; do
  filename=$(basename "$file")
  log_info " 准备: $filename"
  cp "$file" "$TEMP_PACKAGE_DIR/"
  tar_file_count=$((tar_file_count + 1))
done < <(find "$ARTIFACT_DIR" -name "*.tar.gz" -type f -print0)

if [[ $tar_file_count -eq 0 ]]; then
  log_error "$ARTIFACT_DIR 中未找到 tar.gz 文件"
  exit 1
fi
# stage_helper_script NAME LABEL
# Copy helper script NAME into the staging directory, preferring the copy in
# the artifact directory and falling back to scripts/NAME. Warns when neither
# exists. LABEL is the description used in the log messages.
stage_helper_script() {
  local name="$1" label="$2"

  if [[ -f "$ARTIFACT_DIR/$name" ]]; then
    log_info "复制${label}..."
    cp "$ARTIFACT_DIR/$name" "$TEMP_PACKAGE_DIR/"
  elif [[ -f "scripts/$name" ]]; then
    log_info "复制${label} (从当前目录)..."
    cp "scripts/$name" "$TEMP_PACKAGE_DIR/"
  else
    log_warning "未找到 $name 文件"
  fi
}

# Version metadata.
if [[ -f "$ARTIFACT_DIR/version.json" ]]; then
  log_info "复制版本信息文件..."
  cp "$ARTIFACT_DIR/version.json" "$TEMP_PACKAGE_DIR/"
fi

# Helper scripts: health check, DNS sync, version check, auto-restart.
stage_helper_script "check_health.sh" "健康检查脚本"
stage_helper_script "sync_dns.sh" " DNS 同步脚本"
stage_helper_script "check_version.sh" "版本校验脚本"
stage_helper_script "restart_unhealthy.sh" "重启失败脚本"

# Install / uninstall scripts always come from scripts/ and are renamed.
if [[ -f "scripts/install_artifact.sh" ]]; then
  log_info "复制安装脚本..."
  cp "scripts/install_artifact.sh" "$TEMP_PACKAGE_DIR/install.sh"
fi
if [[ -f "scripts/uninstall_artifact.sh" ]]; then
  log_info "复制卸载脚本..."
  cp "scripts/uninstall_artifact.sh" "$TEMP_PACKAGE_DIR/uninstall.sh"
fi

# Configuration file.
if [[ -f "$ARTIFACT_DIR/config.env" ]]; then
  log_info "复制配置文件..."
  cp "$ARTIFACT_DIR/config.env" "$TEMP_PACKAGE_DIR/"
  log_success "配置文件复制完成"
else
  log_warning "未找到 config.env 文件"
fi

# The DNS configuration file is copied straight into the publish directory
# later on; it is not part of the tar.gz.

# System dependency packages.
if [[ -d "$ARTIFACT_DIR/deps" ]]; then
  log_info "复制系统依赖包..."
  cp -r "$ARTIFACT_DIR/deps" "$TEMP_PACKAGE_DIR/"
  log_success "系统依赖包复制完成"
fi
# Build the release tarball; dots in the version become underscores in the
# file name (e.g. 1.20.0 -> argus-metric_1_20_0.tar.gz).
TAR_NAME="argus-metric_${VERSION//./_}.tar.gz"
log_info "创建发布包: $TAR_NAME"
# Use `tar -C` instead of cd-ing into the staging directory: the archive path
# is then resolved against the caller's working directory, so a relative
# --output-dir no longer ends up inside the staging directory (which is
# deleted a few lines below).
tar -czf "$PUBLISH_DIR/$TAR_NAME" -C "$TEMP_PACKAGE_DIR" .

# Set the file owner (only when it differs from the current user).
log_info "设置文件所有者为: $OWNER"
if [[ "$NEED_CHOWN" == true ]]; then
  chown "$OWNER" "$PUBLISH_DIR/$TAR_NAME"
fi

# Remove the staging directory.
rm -rf "$TEMP_PACKAGE_DIR"

# Record this version as the latest published one.
log_info "更新 LATEST_VERSION 文件..."
echo "$VERSION" > "$PUBLISH_DIR/LATEST_VERSION"
if [[ "$NEED_CHOWN" == true ]]; then
  chown "$OWNER" "$PUBLISH_DIR/LATEST_VERSION"
fi
# Copy the DNS configuration file into the root of the publish directory
# (taken directly from the config directory)
if [[ -f "config/dns.conf" ]]; then
log_info "复制 DNS 配置文件到发布目录根目录..."
cp "config/dns.conf" "$PUBLISH_DIR/"
if [[ "$NEED_CHOWN" == true ]]; then
chown "$OWNER" "$PUBLISH_DIR/dns.conf"
fi
log_success "DNS 配置文件复制完成: $PUBLISH_DIR/dns.conf"
else
log_warning "未找到 config/dns.conf 文件,跳过 DNS 配置文件复制"
fi
# Copy setup.sh into the publish directory (silently skipped when missing)
if [[ -f "scripts/setup.sh" ]]; then
log_info "复制 setup.sh 到发布目录..."
cp "scripts/setup.sh" "$PUBLISH_DIR/"
if [[ "$NEED_CHOWN" == true ]]; then
chown "$OWNER" "$PUBLISH_DIR/setup.sh"
fi
fi
# Print the publish summary
log_success "版本 $VERSION 发布完成!"
echo
echo "发布目录: $PUBLISH_DIR"
echo "发布包: $PUBLISH_DIR/$TAR_NAME"
echo "包大小: $(du -h "$PUBLISH_DIR/$TAR_NAME" | cut -f1)"
echo "最新版本: $(cat "$PUBLISH_DIR/LATEST_VERSION")"
echo
echo "发布目录中的文件:"
ls -la "$PUBLISH_DIR" | while read line; do
echo " $line"
done
echo
# Usage instructions for end users.
# NOTE(review): the server address, credentials and the /srv/ftp/share path
# below are hard-coded example values and do not follow --output-dir - confirm
# they are meant to be printed as-is.
echo "使用方法:"
echo " 1. 确保 /srv/ftp/share 目录可通过 FTP 访问"
echo " 2. 用户首先下载安装脚本:"
echo " curl -u ftpuser:admin1234 ftp://10.211.55.4/setup.sh -o setup.sh"
echo " 3. 然后执行安装 (自动获取最新版本):"
echo " sudo sh setup.sh"
echo " 4. 或者指定版本安装:"
echo " sudo sh setup.sh --version $VERSION"
echo " 5. 或者指定不同的FTP服务器:"
echo " sudo sh setup.sh --server 192.168.1.100 --user myuser --password mypass"

View File

@ -0,0 +1,337 @@
#!/bin/bash
# This script checks the health of every installed component and restarts the
# unhealthy ones.

# PID-file guard: skip this run when a previous run is still alive.
PIDFILE="/var/run/restart_unhealthy.pid"
if [ -f "$PIDFILE" ]; then
  existing_pid=$(cat "$PIDFILE" 2>/dev/null)
  # Only a purely numeric PID is probed. An empty or corrupt file (for example
  # "-1", which `kill -0` would accept) must not be mistaken for a running
  # instance.
  if [[ "$existing_pid" =~ ^[0-9]+$ ]] && kill -0 "$existing_pid" 2>/dev/null; then
    echo "自动重启脚本已在运行中,跳过本次执行" >&2
    exit 0
  fi
fi
echo $$ > "$PIDFILE"
# Single quotes defer expansion until the trap fires and keep the path quoted.
trap 'rm -f "$PIDFILE"' EXIT

# Directory containing this script; the install record lives next to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
INSTALL_RECORD_FILE="$SCRIPT_DIR/.install_record"
# ANSI color escape sequences; expanded by `echo -e` in the helpers below.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# _log_stamped COLOR LEVEL MESSAGE
# Print "<COLOR>[LEVEL]<reset> <YYYY-MM-DD HH:MM:SS> - MESSAGE" to stdout.
_log_stamped() {
  local color="$1" level="$2" message="$3"
  echo -e "${color}[${level}]${NC} $(date '+%Y-%m-%d %H:%M:%S') - ${message}"
}

# Level-specific logging helpers. Arguments: $1 - message text.
log_info() { _log_stamped "$BLUE" "INFO" "$1"; }
log_success() { _log_stamped "$GREEN" "SUCCESS" "$1"; }
log_warning() { _log_stamped "$YELLOW" "WARNING" "$1"; }
log_error() { _log_stamped "$RED" "ERROR" "$1"; }
# Source config.env from the script directory, exporting every variable it
# defines. A missing file only produces a warning.
# Globals: SCRIPT_DIR (read)
load_config() {
  local config_file="$SCRIPT_DIR/config.env"

  if [[ ! -f "$config_file" ]]; then
    log_warning "配置文件不存在: $config_file,使用默认配置"
    return
  fi

  log_info "加载配置文件: $config_file"
  # allexport: every assignment made while sourcing is exported
  set -a
  source "$config_file"
  set +a
  log_success "配置文件加载完成"
}
# Run a component's health-check script.
# Arguments: $1 - component name (used in log messages)
#            $2 - path to the health-check script
# Returns:   0 when the script exits successfully; 1 when it is missing or fails
check_component_health() {
  local component_name="$1"
  local check_script_path="$2"

  if [[ ! -f "$check_script_path" ]]; then
    log_error "$component_name: 健康检查脚本不存在: $check_script_path"
    return 1
  fi

  # Best effort: make the script executable if it is not yet.
  [[ -x "$check_script_path" ]] || chmod +x "$check_script_path" 2>/dev/null || true

  # Output is discarded; only the exit status matters.
  "$check_script_path" > /dev/null 2>&1 || return 1
  return 0
}
# Restart one component by running its uninstall.sh (when present) followed by
# its install.sh, both from inside the component's install directory.
# Arguments: $1 - component name (used in log messages)
#            $2 - component install directory
# Globals:   SCRIPT_DIR (read; passed to install.sh as its first argument)
# Returns:   1 when install.sh does not exist, otherwise 0 - failures of the
#            scripts themselves are deliberately ignored.
restart_component() {
  local component_name="$1"
  local install_dir="$2"
  local uninstall_script="$install_dir/uninstall.sh"
  local install_script="$install_dir/install.sh"

  log_warning "正在重启组件: $component_name"

  # Step 1: uninstall, if the component ships an uninstall script.
  if [[ -f "$uninstall_script" ]]; then
    log_info "$component_name: 执行卸载脚本..."
    chmod +x "$uninstall_script" 2>/dev/null || true
    # `yes` answers every confirmation prompt automatically.
    yes 2>/dev/null | (cd "$install_dir" && "$uninstall_script") || true
    log_info "$component_name: 卸载完成"
  fi

  # Step 2: install.
  if [[ ! -f "$install_script" ]]; then
    log_error "$component_name: 安装脚本不存在: $install_script"
    return 1
  fi
  chmod +x "$install_script" 2>/dev/null || true
  log_info "$component_name: 执行安装脚本..."
  yes 2>/dev/null | (cd "$install_dir" && "$install_script" "$SCRIPT_DIR") || true
  log_info "$component_name: 安装脚本执行完成"
  return 0
}
# Print the PID of a component's process (empty when none is found).
# Known components are looked up with `pgrep -f` under each spelling of their
# name, then with a `ps aux | grep` fallback. Unknown component names print an
# empty line.
# Arguments: $1 - component name
find_component_pid() {
  local component_name="$1"
  local -a pgrep_patterns=()
  local ps_pattern=""

  case "$component_name" in
    "node-exporter")
      pgrep_patterns=("node_exporter" "node-exporter")
      ps_pattern="node_exporter"
      ;;
    "dcgm-exporter")
      pgrep_patterns=("dcgm-exporter" "dcgm_exporter")
      ps_pattern="dcgm-exporter"
      ;;
    "fluent-bit")
      pgrep_patterns=("fluent-bit" "fluent_bit")
      ps_pattern="fluent-bit"
      ;;
    "argus-agent")
      pgrep_patterns=("argus-agent")
      ps_pattern="argus-agent"
      ;;
  esac

  local component_pid=""
  local pattern
  # Try each pgrep pattern in order until one yields a PID.
  for pattern in "${pgrep_patterns[@]}"; do
    component_pid=$(pgrep -f "$pattern" | head -1)
    if [[ -n "$component_pid" ]]; then
      break
    fi
  done

  # Fallback: scan the ps listing.
  if [[ -z "$component_pid" && -n "$ps_pattern" ]]; then
    component_pid=$(ps aux | grep -v grep | grep "$ps_pattern" | awk '{print $2}' | head -1)
  fi

  echo "$component_pid"
}
# Update the recorded PID of one component in the install record file.
# The file is rewritten line by line so its formatting is preserved; only the
# "pid" line inside the named component's block is changed.
# Arguments: $1 - component name
#            $2 - new PID
# Globals:   INSTALL_RECORD_FILE (read; file is rewritten)
# Returns:   0 when the PID line was replaced; 1 when the record file is
#            missing, the current PID cannot be read (jq is required for
#            that), or no matching "pid" line was found.
update_install_record_pid() {
  local component_name="$1"
  local new_pid="$2"

  if [[ ! -f "$INSTALL_RECORD_FILE" ]]; then
    log_error "安装记录文件不存在: $INSTALL_RECORD_FILE"
    return 1
  fi

  # Read the PID currently recorded for this component.
  local current_pid=""
  if command -v jq &> /dev/null; then
    current_pid=$(jq -r --arg comp "$component_name" '.components[$comp].pid // ""' "$INSTALL_RECORD_FILE" 2>/dev/null)
  fi
  if [[ -z "$current_pid" ]]; then
    log_warning "$component_name: 无法读取当前 PID跳过更新"
    return 1
  fi

  local temp_file="${INSTALL_RECORD_FILE}.tmp"
  local in_component=0
  local updated=0
  local line
  # Regexes are kept in variables so quoting stays predictable.
  local header_re="\"${component_name}\":[[:space:]]*[{]"
  # Capture the text before and after the PID value. The replacement is built
  # from these captures, so any spacing after "pid": that the match accepts is
  # also rewritten (the previous sed required exactly one space and could
  # report success without changing the file).
  local pid_re="^(.*\"pid\":[[:space:]]*\")${current_pid}(\".*)\$"
  local close_re='^[[:space:]]*[}][[:space:]]*$'

  # `|| [[ -n "$line" ]]` keeps a final line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    if [[ "$line" =~ $header_re ]]; then
      in_component=1
      printf '%s\n' "$line"
    elif [[ $in_component -eq 1 && "$line" =~ $pid_re ]]; then
      printf '%s\n' "${BASH_REMATCH[1]}${new_pid}${BASH_REMATCH[2]}"
      updated=1
      in_component=0
    else
      printf '%s\n' "$line"
      # A line holding only "}" ends the current component block.
      if [[ "$line" =~ $close_re ]]; then
        in_component=0
      fi
    fi
  done < "$INSTALL_RECORD_FILE" > "$temp_file"

  # Replace the record only when a PID line was actually rewritten.
  if [[ $updated -eq 1 ]]; then
    mv "$temp_file" "$INSTALL_RECORD_FILE"
    log_success "$component_name: PID 已更新为 $new_pid(原值: $current_pid"
    return 0
  else
    log_error "$component_name: PID 替换失败"
    rm -f "$temp_file"
    return 1
  fi
}
# List the installed components recorded in an install record file.
# Arguments: $1 - path to the install record (JSON)
# Outputs:   one "<component>:<install_dir>" line per component on stdout
# Returns:   0 on success; 1 when the file is missing or cannot be parsed
read_install_record() {
  local install_record_file="$1"

  if [[ ! -f "$install_record_file" ]]; then
    log_error "安装记录文件不存在: $install_record_file"
    return 1
  fi

  # Preferred path: let jq walk .components.
  if command -v jq &> /dev/null; then
    local components_json
    if ! components_json=$(jq -r '.components | to_entries[] | "\(.key):\(.value.install_dir)"' "$install_record_file" 2>/dev/null); then
      log_error "无法解析安装记录文件 JSON 格式: $install_record_file"
      return 1
    fi
    echo "$components_json"
    return 0
  fi

  # Fallback without jq: scan for "install_dir" lines and use the directory's
  # base name as the component name.
  log_warning "jq 命令不可用,尝试简单文本解析"
  local components=()
  while IFS= read -r line; do
    if [[ "$line" =~ \"install_dir\":[[:space:]]*\"([^\"]+)\" ]]; then
      local install_dir="${BASH_REMATCH[1]}"
      local component_name=$(basename "$install_dir")
      components+=("$component_name:$install_dir")
    fi
  done < "$install_record_file"

  if [[ ${#components[@]} -eq 0 ]]; then
    log_error "无法从安装记录文件中提取组件信息"
    return 1
  fi
  printf '%s\n' "${components[@]}"
  return 0
}
# Main entry point: check every recorded component and restart the unhealthy
# ones.
# Requires root. Reads the component list from INSTALL_RECORD_FILE, runs each
# component's check_health.sh and, on failure, reinstalls the component,
# records its new PID and checks its health once more.
# Always exits 0 once the component list could be read.
main() {
log_info "=========================================="
log_info " 组件自动重启检查"
log_info "=========================================="
# Must run as root
if [[ $EUID -ne 0 ]]; then
log_error "此脚本需要 root 权限运行"
exit 1
fi
# Load the configuration file
load_config
# Read the component list from the install record
log_info "从安装记录文件读取组件信息: $INSTALL_RECORD_FILE"
local components_info
if ! components_info=$(read_install_record "$INSTALL_RECORD_FILE"); then
log_error "无法读取安装记录文件,自动重启检查终止"
exit 1
fi
local restart_count=0
local check_count=0
# Check the components one by one; each entry is "<name>:<install_dir>"
while IFS= read -r component_info; do
if [[ -n "$component_info" ]]; then
IFS=':' read -r component_name install_dir <<< "$component_info"
check_count=$((check_count + 1))
local check_script_path="$install_dir/check_health.sh"
log_info "检查组件: $component_name"
# Health check
if check_component_health "$component_name" "$check_script_path"; then
log_success "$component_name: 运行正常"
else
log_warning "$component_name: 健康检查失败,尝试重启"
restart_count=$((restart_count + 1))
# Restart (uninstall + install)
restart_component "$component_name" "$install_dir"
# Give the service time to start
log_info "$component_name: 等待进程启动..."
sleep 10
# Look up the PID of the new process and record it
local new_pid=$(find_component_pid "$component_name")
if [[ -n "$new_pid" ]]; then
log_info "$component_name: 找到新进程 PID: $new_pid"
update_install_record_pid "$component_name" "$new_pid"
else
log_warning "$component_name: 未找到新进程 PID"
fi
# Check the health again after the restart
if check_component_health "$component_name" "$check_script_path"; then
log_success "$component_name: 重启成功"
else
log_warning "$component_name: 重启后仍不健康,可能需要手动检查"
fi
fi
fi
done <<< "$components_info"
log_info "=========================================="
log_info "检查完成: 共检查 $check_count 个组件,尝试重启 $restart_count"
log_info "=========================================="
exit 0
}
# Script entry: run main only when executed directly, not when sourced
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
main "$@"
fi

View File

@ -0,0 +1,931 @@
#!/bin/bash
set -e
# Placeholder configuration loader. setup.sh reads no configuration file: the
# FTP parameters come from command-line options or environment variables, so
# this function only logs that fact.
load_config() {
  local notice="setup.sh 脚本使用命令行参数或环境变量获取FTP配置"
  log_info "$notice"
}
# ANSI color escape sequences; expanded by `echo -e` in the helper below.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# _log_line LEVEL MESSAGE
# Print "<color>[LEVEL]<reset> MESSAGE" to stdout, using the color that
# belongs to LEVEL.
_log_line() {
  local level="$1" message="$2" color=""
  case "$level" in
    INFO) color="$BLUE" ;;
    SUCCESS) color="$GREEN" ;;
    WARNING) color="$YELLOW" ;;
    ERROR) color="$RED" ;;
  esac
  echo -e "${color}[${level}]${NC} ${message}"
}

# Level-specific logging helpers. Arguments: $1 - message text.
log_info() { _log_line INFO "$1"; }
log_success() { _log_line SUCCESS "$1"; }
log_warning() { _log_line WARNING "$1"; }
log_error() { _log_line ERROR "$1"; }
# FTP connection parameters, taken from the environment; command-line options
# parsed later may overwrite them. Only the port has a default.
FTP_SERVER="${FTP_SERVER}"
FTP_USER="${FTP_USER}"
FTP_PASS="${FTP_PASS}"
FTP_PORT="${FTP_PORT:-21}"
BASE_URL="" # FTP base URL (set in check_ftp_params)
LATEST_VERSION_URL="" # URL of the version file (set in check_ftp_params)
# NOTE(review): the temp directory name is predictable ("$$" is the shell
# PID) - consider mktemp -d where the directory is actually created.
TEMP_DIR="/tmp/argus-metric-install-$$"
# Installation directory layout
DEFAULT_INSTALL_DIR="/opt/argus-metric" # default installation directory
INSTALL_DIR="${INSTALL_DIR:-$DEFAULT_INSTALL_DIR}" # can be overridden through the environment
VERSIONS_DIR="$INSTALL_DIR/versions" # per-version directories
BACKUPS_DIR="$INSTALL_DIR/backups" # backup directory
CURRENT_LINK="$INSTALL_DIR/current" # symlink to the current version
LATEST_VERSION_FILE="$INSTALL_DIR/LATEST_VERSION" # file recording the current version
# Validate the required FTP parameters and derive the FTP URLs from them.
# Exits with status 1 (after printing usage hints) when FTP_SERVER, FTP_USER
# or FTP_PASS is empty.
# Globals: FTP_SERVER, FTP_USER, FTP_PASS, FTP_PORT (read)
#          BASE_URL, LATEST_VERSION_URL (written)
check_ftp_params() {
  local missing_params=()
  local param_name

  # Collect the names of all required parameters that are empty.
  for param_name in FTP_SERVER FTP_USER FTP_PASS; do
    if [[ -z "${!param_name}" ]]; then
      missing_params+=("$param_name")
    fi
  done

  if [[ ${#missing_params[@]} -gt 0 ]]; then
    log_error "缺少必需的FTP参数: ${missing_params[*]}"
    log_error "请通过以下方式之一设置FTP参数:"
    log_error " 1. 命令行参数: --server <地址> --user <用户名> --password <密码>"
    log_error " 2. 环境变量: FTP_SERVER=<地址> FTP_USER=<用户名> FTP_PASS=<密码>"
    log_error ""
    log_error "示例:"
    log_error " sudo sh setup.sh --server 10.211.55.4 --user ftpuser --password admin1234"
    log_error " FTP_SERVER=10.211.55.4 FTP_USER=ftpuser FTP_PASS=admin1234 sudo sh setup.sh"
    exit 1
  fi

  # Derive the base URL and the URL of the LATEST_VERSION file.
  BASE_URL="ftp://${FTP_SERVER}:${FTP_PORT}"
  LATEST_VERSION_URL="$BASE_URL/LATEST_VERSION"

  log_info "FTP配置:"
  log_info " 服务器: $FTP_SERVER:$FTP_PORT"
  log_info " 用户: $FTP_USER"
}
# Fetch the latest published version number from the FTP server.
# All log output goes to stderr so that stdout carries only the version.
# Globals: FTP_USER, FTP_PASS, FTP_SERVER, FTP_PORT, LATEST_VERSION_URL (read)
#          LATEST_VERSION (written)
# Outputs: the version string (all whitespace removed) on stdout
# Exits:   1 when the server is unreachable, the download fails, or the
#          version file is empty
get_latest_version() {
  log_info "获取最新版本信息..." >&2
  log_info "尝试从URL获取: $LATEST_VERSION_URL" >&2

  # Probe the FTP connection first.
  log_info "测试FTP连接..." >&2
  if ! curl -u "${FTP_USER}:${FTP_PASS}" -sfI "$LATEST_VERSION_URL" >/dev/null 2>&1; then
    log_error "无法连接到FTP服务器或文件不存在" >&2
    log_error "URL: $LATEST_VERSION_URL" >&2
    log_error "请检查:" >&2
    log_error " 1. FTP服务器是否运行: $FTP_SERVER:$FTP_PORT" >&2
    log_error " 2. 用户名密码是否正确: $FTP_USER" >&2
    log_error " 3. LATEST_VERSION文件是否存在" >&2
    # The password is masked here: error output ends up in terminals and logs.
    log_error "手动测试命令: curl -u ${FTP_USER}:<密码> ${LATEST_VERSION_URL}" >&2
    exit 1
  fi

  # Download the file. curl is run on its own (not piped into tr) so that its
  # exit status is actually checked; in a pipeline only tr's status counts.
  local raw_version
  if ! raw_version=$(curl -u "${FTP_USER}:${FTP_PASS}" -sfL "$LATEST_VERSION_URL" 2>/dev/null); then
    log_error "下载LATEST_VERSION文件失败" >&2
    exit 1
  fi
  LATEST_VERSION=$(printf '%s' "$raw_version" | tr -d '[:space:]')

  log_info "原始获取内容: '$LATEST_VERSION'" >&2
  if [[ -z "$LATEST_VERSION" ]]; then
    log_error "获取到的版本信息为空" >&2
    log_error "可能的原因:" >&2
    log_error " 1. LATEST_VERSION文件为空" >&2
    log_error " 2. 文件内容格式不正确" >&2
    log_error " 3. 网络传输问题" >&2
    log_error "请检查FTP服务器上的 /srv/ftp/share/LATEST_VERSION 文件" >&2
    exit 1
  fi

  log_info "检测到最新版本: $LATEST_VERSION" >&2
  echo "$LATEST_VERSION"
}
# Command-line state with its defaults.
ARGUS_VERSION="" # named differently from the system VERSION variable to avoid a clash
ACTION="install"
FORCE_INSTALL=false

# require_option_value OPTION [VALUE...]
# Abort with a clear message when OPTION is not followed by a non-empty value.
# Without this check `shift 2` fails and, under `set -e`, the script would
# exit without any message.
require_option_value() {
  if [[ $# -lt 2 || -z "$2" ]]; then
    log_error "选项 $1 需要一个参数值"
    echo "使用 --help 查看帮助信息"
    exit 1
  fi
}

# parse_cli_args ARG...
# Parse the command-line options into the global configuration variables.
# Globals written: ARGUS_VERSION, FTP_SERVER, FTP_USER, FTP_PASS, FTP_PORT,
#                  ACTION, FORCE_INSTALL, INSTALL_DIR, VERSIONS_DIR,
#                  BACKUPS_DIR, CURRENT_LINK, LATEST_VERSION_FILE
# Exits: 0 after printing --help; 1 on an unknown option or a missing value
parse_cli_args() {
  while [[ $# -gt 0 ]]; do
    case $1 in
      --version)
        require_option_value "$@"
        ARGUS_VERSION="$2"
        shift 2
        ;;
      --server)
        require_option_value "$@"
        FTP_SERVER="$2"
        shift 2
        ;;
      --user)
        require_option_value "$@"
        FTP_USER="$2"
        shift 2
        ;;
      --password)
        require_option_value "$@"
        FTP_PASS="$2"
        shift 2
        ;;
      --port)
        require_option_value "$@"
        FTP_PORT="$2"
        shift 2
        ;;
      --uninstall)
        ACTION="uninstall"
        shift
        ;;
      --install-dir)
        require_option_value "$@"
        INSTALL_DIR="$2"
        # The derived paths were computed from the previous INSTALL_DIR when
        # the script started; recompute them so they follow the override.
        VERSIONS_DIR="$INSTALL_DIR/versions"
        BACKUPS_DIR="$INSTALL_DIR/backups"
        CURRENT_LINK="$INSTALL_DIR/current"
        LATEST_VERSION_FILE="$INSTALL_DIR/LATEST_VERSION"
        shift 2
        ;;
      # Simplified install logic: rollback and backup listing are no longer
      # supported.
      # --rollback)
      # ACTION="rollback"
      # shift
      # ;;
      # --backup-list)
      # ACTION="backup-list"
      # shift
      # ;;
      --status)
        ACTION="status"
        shift
        ;;
      --force)
        FORCE_INSTALL=true
        shift
        ;;
      --help)
        echo "Argus Metric FTP在线安装脚本"
        echo
        echo "用法: curl -u <用户名>:<密码> ftp://<服务器>/setup.sh -o setup.sh && sh setup.sh [选项]"
        echo
        echo "必需参数 (必须通过命令行参数或环境变量设置):"
        echo " --server SERVER FTP服务器地址 (必须)"
        echo " --user USER FTP用户名 (必须)"
        echo " --password PASS FTP密码 (必须)"
        echo
        echo "可选参数:"
        echo " --version VERSION 指定版本 (默认: 自动获取最新版本)"
        echo " --port PORT FTP端口 (默认: 21)"
        echo " --install-dir DIR 安装目录 (默认: /opt/argus-metric)"
        echo " --force 强制重新安装 (即使相同版本)"
        echo " --uninstall 卸载 (自动确认)"
        # echo " --rollback 回滚到上一个备份版本"
        # echo " --backup-list 列出所有备份版本"
        echo " --status 显示当前安装状态"
        echo " --help 显示帮助"
        echo
        echo "环境变量:"
        echo " FTP_SERVER FTP服务器地址 (必须)"
        echo " FTP_USER FTP用户名 (必须)"
        echo " FTP_PASS FTP密码 (必须)"
        echo " FTP_PORT FTP端口 (默认: 21)"
        echo
        echo "示例:"
        echo " # 方式1: 使用命令行参数"
        echo " curl -u ftpuser:admin1234 ftp://10.211.55.4/setup.sh -o setup.sh"
        echo " sudo sh setup.sh --server 10.211.55.4 --user ftpuser --password admin1234"
        echo " "
        echo " # 方式2: 使用环境变量"
        echo " FTP_SERVER=10.211.55.4 FTP_USER=ftpuser FTP_PASS=admin1234 sudo sh setup.sh"
        echo " "
        echo " # 指定版本安装"
        echo " sudo sh setup.sh --server 10.211.55.4 --user ftpuser --password admin1234 --version 1.30.0"
        echo " "
        echo " # 强制重新安装"
        echo " sudo sh setup.sh --server 10.211.55.4 --user ftpuser --password admin1234 --force"
        echo " "
        echo " # 卸载"
        echo " sudo sh setup.sh --server 10.211.55.4 --user ftpuser --password admin1234 --uninstall"
        exit 0
        ;;
      *)
        log_error "未知参数: $1"
        echo "使用 --help 查看帮助信息"
        exit 1
        ;;
    esac
  done
}

parse_cli_args "$@"
# The former inline loop consumed every positional parameter; clear them so
# later code sees the same state as before.
set --
# Remove the temporary download directory when it exists.
# Globals: TEMP_DIR (read)
cleanup() {
  [[ -d "$TEMP_DIR" ]] || return 0
  rm -rf "$TEMP_DIR"
}
# Run cleanup on every exit path of the script.
trap cleanup EXIT
# Create the installation directory layout (versions and backups directories).
# Globals: VERSIONS_DIR, BACKUPS_DIR, INSTALL_DIR (read)
create_install_directories() {
  local dir

  log_info "创建安装目录结构..."
  for dir in "$VERSIONS_DIR" "$BACKUPS_DIR"; do
    mkdir -p "$dir"
  done
  log_success "安装目录结构创建完成: $INSTALL_DIR"
}
# Print the currently installed version (an empty line when none is known).
# The LATEST_VERSION file is consulted first; when it is missing or empty the
# version is taken from the name of the directory the "current" symlink
# points to.
# Globals: LATEST_VERSION_FILE, CURRENT_LINK (read)
get_current_version() {
  # Source 1: the version record file (whitespace stripped, no "v" prefix).
  if [[ -f "$LATEST_VERSION_FILE" ]]; then
    local version_from_file=$(cat "$LATEST_VERSION_FILE" 2>/dev/null | tr -d '[:space:]')
    if [[ -n "$version_from_file" ]]; then
      echo "$version_from_file"
      return 0
    fi
  fi

  # Source 2: the symlink target's directory name.
  if [[ ! -L "$CURRENT_LINK" ]]; then
    echo ""
    return
  fi
  local current_path=$(readlink "$CURRENT_LINK")
  basename "$current_path"
}
# Report whether a version is currently installed.
# Globals: CURRENT_LINK (read)
# Returns: 0 when the "current" symlink resolves to a directory and a version
#          can be determined; 1 otherwise
check_installed() {
  # The link must exist and point at a directory.
  if [[ ! -L "$CURRENT_LINK" || ! -d "$CURRENT_LINK" ]]; then
    return 1
  fi

  local current_version=$(get_current_version)
  if [[ -z "$current_version" ]]; then
    return 1
  fi

  log_info "检测到已安装版本: v$current_version"
  return 0
}
# Write the given version into the local LATEST_VERSION file.
# Arguments: $1 - version string
# Globals:   LATEST_VERSION_FILE (read; file is overwritten)
# Returns:   1 when the file cannot be written
update_latest_version_file() {
  local version="$1"

  log_info "更新LATEST_VERSION文件: $version"
  if ! echo "$version" > "$LATEST_VERSION_FILE"; then
    log_error "更新LATEST_VERSION文件失败"
    return 1
  fi
  log_success "LATEST_VERSION文件已更新"
}
# Make sure a dns.conf exists in the installation directory.
# When the file is missing an empty placeholder with mode 644 is created; the
# log messages state that the DNS sync script later downloads the real
# configuration from the FTP server. An existing file is left untouched.
# Globals: INSTALL_DIR (read)
init_dns_config_to_system() {
  local system_dns_conf="$INSTALL_DIR/dns.conf"

  log_info "初始化 DNS 配置文件到系统目录..."
  if [[ -f "$system_dns_conf" ]]; then
    log_info "DNS 配置文件已存在: $system_dns_conf"
    return
  fi

  touch "$system_dns_conf"
  chmod 644 "$system_dns_conf"
  log_success "DNS 配置文件占位文件已创建: $system_dns_conf"
  log_info "DNS 同步脚本将从 FTP 服务器下载实际的 DNS 配置"
}
# Back up the currently installed version into BACKUPS_DIR/<version>.
# An existing backup of the same version is replaced. Nothing happens when no
# current version can be determined.
# Globals: BACKUPS_DIR, CURRENT_LINK (read)
# Exits:   1 when the copy fails
backup_current_version() {
  local current_version=$(get_current_version)

  if [[ -z "$current_version" ]]; then
    log_info "没有当前版本需要备份"
    return 0
  fi

  # Make sure the backup directory exists.
  mkdir -p "$BACKUPS_DIR"
  local backup_name="$current_version"
  local backup_path="$BACKUPS_DIR/$backup_name"
  log_info "备份当前版本 $current_version 到: $backup_path"

  # Replace a previous backup of the same version.
  if [[ -d "$backup_path" ]]; then
    log_info "备份版本已存在,覆盖: $backup_path"
    rm -rf "$backup_path"
  fi

  # -L follows the symlink so the real contents are copied.
  if ! cp -rL "$CURRENT_LINK" "$backup_path"; then
    log_error "版本备份失败"
    exit 1
  fi
  log_success "版本备份完成: $backup_name"
}
# Roll the installation back to a previously backed-up version.
# Stops the running services, restores the version directory from the backup
# when it no longer exists, repoints the "current" symlink and updates the
# LATEST_VERSION file.
# Arguments: $1 - backup name (the version string)
# Globals:   BACKUPS_DIR, VERSIONS_DIR, CURRENT_LINK (read)
# Returns:   1 when the backup does not exist or the symlink cannot be updated
rollback_to_backup() {
  local backup_name="$1"

  # Make sure the backup directory exists.
  mkdir -p "$BACKUPS_DIR"
  local backup_path="$BACKUPS_DIR/$backup_name"
  if [[ ! -d "$backup_path" ]]; then
    log_error "备份不存在: $backup_path"
    return 1
  fi

  log_info "回滚到备份版本: $backup_name"
  stop_services

  # Restore the version directory from the backup when it is gone.
  local version_dir="$VERSIONS_DIR/$backup_name"
  if [[ ! -d "$version_dir" ]]; then
    log_info "版本目录不存在,从备份恢复版本目录: $version_dir"
    mkdir -p "$VERSIONS_DIR"
    cp -r "$backup_path" "$version_dir"
  fi

  # Point the "current" symlink at the restored version.
  if ! ln -sfn "$version_dir" "$CURRENT_LINK"; then
    log_error "版本回滚失败"
    return 1
  fi
  log_success "版本回滚完成: $backup_name"
  update_latest_version_file "$backup_name"
  return 0
}
# Stop the running exporter services.
# Prefers the uninstall.sh shipped in $CURRENT_LINK (answering its
# confirmation prompt with "y"); falls back to manual_stop_services when the
# script is missing or exits non-zero. Returns 0 immediately when
# check_services_running reports nothing running.
stop_services() {
  log_info "停止当前服务..."

  if ! check_services_running; then
    log_info "服务未运行,无需停止"
    return 0
  fi

  if [[ ! -f "$CURRENT_LINK/uninstall.sh" ]]; then
    log_warning "未找到卸载脚本,尝试手动停止服务"
    manual_stop_services
    return
  fi

  # Run in a subshell so the caller's working directory is left alone, and
  # inside `if` so a non-zero exit reaches the fallback branch even when the
  # surrounding script runs with `set -e`.
  if (
    cd "$CURRENT_LINK" || exit 1
    chmod +x uninstall.sh
    echo "y" | ./uninstall.sh >/dev/null 2>&1
  ); then
    log_success "服务停止完成"
  else
    log_warning "停止服务时出现警告,尝试手动停止"
    manual_stop_services
  fi
}
# Stop node_exporter and dcgm_exporter by process name.
# Sends the default signal first, waits two seconds, then force-kills (-9)
# anything still matching either name.
manual_stop_services() {
  log_info "手动停止服务..."

  local service
  for service in node_exporter dcgm_exporter; do
    if pgrep -f "$service" >/dev/null 2>&1; then
      pkill -f "$service" && log_info "$service 已停止"
    fi
  done

  # Give the processes a moment to exit.
  sleep 2

  # pgrep/pkill patterns are extended regular expressions: alternation is a
  # bare `|` (a backslash-escaped `\|` would match a literal bar character).
  local leftovers="node_exporter|dcgm_exporter"
  if pgrep -f "$leftovers" >/dev/null 2>&1; then
    log_warning "仍有服务进程运行,尝试强制停止"
    pkill -9 -f "$leftovers" 2>/dev/null || true
  fi

  log_success "手动停止服务完成"
}
# Report on service start-up after an install.
# Nothing is launched here: the function only checks whether the services
# are already up, waits three seconds and checks again. Always returns 0.
start_services() {
  log_info "启动服务..."

  if check_services_running; then
    log_info "服务已在运行,跳过启动"
    return 0
  fi

  # Per these messages, start-up is left to the periodic health check.
  log_info "组件已安装完成,健康检查定时任务已设置"
  log_info "服务将在健康检查时自动启动每5分钟检查一次"

  sleep 3

  if ! check_services_running; then
    log_info "服务可能正在启动中,健康检查机制将自动监控"
  else
    log_success "服务启动成功"
  fi
  return 0
}
# Detect whether node_exporter / dcgm_exporter appear to be running.
# Returns 0 when port 9100 or 9400 is listening, or when a process whose
# command line matches either exporter name exists; returns 1 otherwise.
check_services_running() {
  local -a ports=(9100 9400) # default node-exporter / dcgm-exporter ports
  local port
  local listeners=""

  # Take one snapshot of listening TCP sockets; fall back to `ss` on hosts
  # that do not ship net-tools.
  if command -v netstat >/dev/null 2>&1; then
    listeners=$(netstat -tlnp 2>/dev/null) || true
  elif command -v ss >/dev/null 2>&1; then
    listeners=$(ss -tln 2>/dev/null) || true
  fi

  for port in "${ports[@]}"; do
    if grep -q ":$port " <<< "$listeners"; then
      log_info "检测到服务正在端口 $port 上运行"
      return 0
    fi
  done

  # pgrep takes an extended regular expression, so alternation is a bare `|`.
  if pgrep -f "node_exporter|dcgm_exporter" >/dev/null 2>&1; then
    log_info "检测到相关服务进程正在运行"
    return 0
  fi

  return 1
}
# Abort with exit status 1 unless the script runs with effective UID 0.
check_root() {
  [[ $EUID -eq 0 ]] && return 0

  log_error "此脚本需要 root 权限运行"
  log_info "请使用: sudo sh setup.sh"
  exit 1
}
# Log basic facts about the host and warn when the root filesystem is low
# on space.
# Exits with status 1 when /etc/os-release is missing. Sets the globals
# `arch` and `available_space` (KiB free on /), as the original did.
check_system() {
  log_info "检查系统要求..."

  if [[ ! -f /etc/os-release ]]; then
    log_error "无法检测操作系统版本"
    exit 1
  fi

  # Source os-release inside the command substitution so its variables do
  # not leak into this shell.
  local os_info
  os_info=$(source /etc/os-release && echo "$NAME $VERSION_ID") || true
  log_info "检测到操作系统: $os_info"

  arch=$(uname -m)
  log_info "系统架构: $arch"

  # `df -Pk` prints one line per filesystem in 1 KiB units, so 1 GiB is
  # 1048576; the previous threshold of 1024 only fired below 1 MiB.
  local min_free_kb=1048576
  available_space=$(df -Pk / | awk 'NR==2 {print $4}')
  if [[ "$available_space" =~ ^[0-9]+$ ]] && (( available_space < min_free_kb )); then
    log_warning "可用磁盘空间不足 1GB,当前可用: $((available_space / 1024))MB"
  fi
}
# Download the requested release from the FTP server and install it.
#
# Globals read:    ARGUS_VERSION, FORCE_INSTALL, INSTALL_DIR, TEMP_DIR,
#                  VERSIONS_DIR, CURRENT_LINK, FTP_SERVER, FTP_PORT,
#                  FTP_USER, FTP_PASS, BASE_URL
# Globals written: ARGUS_VERSION (when empty), TAR_NAME, CURL_CMD
# Flow: resolve version -> detect upgrade / same version -> download and
#       unpack into $TEMP_DIR -> stop services on upgrade -> move files into
#       $VERSIONS_DIR/<version> -> run install.sh -> repoint $CURRENT_LINK ->
#       update LATEST_VERSION -> create the dns.conf placeholder.
# Returns 0 without doing anything when the same version is already
# installed and --force was not given. Exits with status 1 on any failure.
# Automatic backup before upgrade and rollback on failure are intentionally
# disabled ("simplified install logic").
install_argus_metric() {
  # Fall back to the latest published version when none was requested.
  if [[ -z "$ARGUS_VERSION" ]]; then
    ARGUS_VERSION=$(get_latest_version)
  fi

  log_info "开始安装 Argus Metric v$ARGUS_VERSION..."
  log_info "安装目录: $INSTALL_DIR"

  # The directory layout must exist before anything else touches it.
  create_install_directories

  local is_upgrade=false
  if check_installed; then
    local current_version
    current_version=$(get_current_version) || true
    if [[ "$current_version" == "$ARGUS_VERSION" ]]; then
      if [[ "$FORCE_INSTALL" == true ]]; then
        log_info "检测到相同版本 v$ARGUS_VERSION,但使用了 --force 参数,将强制重新安装"
        is_upgrade=true
      else
        log_info "版本 v$ARGUS_VERSION 已安装,无需重复安装"
        log_info "如需强制重新安装,请使用 --force 参数"
        return 0
      fi
    else
      log_info "检测到版本升级: v$current_version -> v$ARGUS_VERSION"
      is_upgrade=true
    fi
  fi

  mkdir -p "$TEMP_DIR"
  if ! cd "$TEMP_DIR"; then
    log_error "无法进入临时目录: $TEMP_DIR"
    exit 1
  fi

  # Release archives are named argus-metric_<version with dots as underscores>.tar.gz
  TAR_NAME="argus-metric_${ARGUS_VERSION//./_}.tar.gz"
  log_info "下载发布包: $TAR_NAME"
  log_info "从FTP服务器下载: $FTP_SERVER:$FTP_PORT, 用户: $FTP_USER"

  # Printable form of the curl command with the password masked. This is the
  # only form that may be logged, including on failure.
  CURL_CMD="curl -u \"${FTP_USER}:***\" -sfL \"$BASE_URL/$TAR_NAME\" -o \"$TAR_NAME\""
  log_info "执行命令: $CURL_CMD"
  if ! curl -u "${FTP_USER}:${FTP_PASS}" -sfL "$BASE_URL/$TAR_NAME" -o "$TAR_NAME"; then
    log_error "下载发布包失败: $BASE_URL/$TAR_NAME"
    log_error "完整命令: $CURL_CMD"
    log_error "请检查FTP服务器连接、用户名密码是否正确"
    exit 1
  fi

  log_info "解压发布包..."
  if ! tar -xzf "$TAR_NAME"; then
    log_error "解压发布包失败"
    exit 1
  fi

  log_info "解压后的文件结构:"
  ls -la "$TEMP_DIR"

  local version_dir="$VERSIONS_DIR/$ARGUS_VERSION"
  log_info "安装到版本目录: $version_dir"

  # Services of the old version must be down before its files are replaced.
  if [[ "$is_upgrade" == true ]]; then
    stop_services
  fi

  # An existing directory for this version is discarded and recreated.
  if [[ -d "$version_dir" ]]; then
    log_info "版本目录已存在,备份后更新"
    rm -rf "$version_dir"
  fi
  mkdir -p "$version_dir"

  log_info "移动文件到版本目录: $TEMP_DIR/* -> $version_dir/"
  if [[ -z "$(ls -A "$TEMP_DIR" 2>/dev/null)" ]]; then
    log_error "临时目录为空,无法移动文件"
    exit 1
  fi
  if [[ ! -d "$version_dir" ]]; then
    log_error "目标版本目录不存在: $version_dir"
    exit 1
  fi

  if mv "$TEMP_DIR"/* "$version_dir" 2>/dev/null; then
    log_success "文件移动到版本目录完成"
  else
    log_error "移动文件到版本目录失败"
    log_error "源目录内容:"
    ls -la "$TEMP_DIR" || true
    log_error "目标目录状态:"
    ls -la "$version_dir" || true
    log_error "权限检查:"
    ls -ld "$TEMP_DIR" "$version_dir" || true
    exit 1
  fi

  log_info "执行安装脚本..."
  if ! cd "$version_dir"; then
    log_error "目标版本目录不存在: $version_dir"
    exit 1
  fi
  if [[ ! -f "install.sh" ]]; then
    log_error "未找到安装脚本 install.sh"
    exit 1
  fi
  chmod +x install.sh
  # The version directory is passed as the install root for install.sh.
  if ./install.sh "$version_dir"; then
    log_success "安装脚本执行完成"
  else
    log_error "安装脚本执行失败"
    exit 1
  fi

  log_info "更新当前版本链接..."
  # A real directory named `current` would make ln create the link inside it.
  if [[ -d "$CURRENT_LINK" ]] && [[ ! -L "$CURRENT_LINK" ]]; then
    log_warning "发现 current 是目录而不是符号链接,正在删除..."
    rm -rf "$CURRENT_LINK"
  fi
  if ln -sfn "$version_dir" "$CURRENT_LINK"; then
    log_success "版本链接更新完成: $CURRENT_LINK -> $version_dir"
  else
    log_error "版本链接更新失败"
    exit 1
  fi

  update_latest_version_file "$ARGUS_VERSION"
  init_dns_config_to_system

  # start_services is deliberately not called here.
  log_success "Argus Metric v$ARGUS_VERSION 安装完成!"

  echo
  log_info "安装信息:"
  log_info " 版本: $ARGUS_VERSION"
  log_info " 安装目录: $INSTALL_DIR"
  log_info " 版本目录: $version_dir"
  log_info " 当前链接: $CURRENT_LINK"
  if [[ "$is_upgrade" == true ]]; then
    log_info " 升级类型: 版本升级"
  else
    log_info " 安装类型: 全新安装"
  fi
}
# Remove the installed Argus Metric release and its whole install tree.
# Returns 0 without changes when nothing is installed. Otherwise stops the
# services, runs the release's uninstall.sh (auto-confirmed with "y") when
# present, then deletes $INSTALL_DIR. Exits 1 when the uninstall script or
# the directory removal fails.
uninstall_argus_metric() {
  log_info "开始卸载 Argus Metric..."
  log_info "安装目录: $INSTALL_DIR"

  if ! check_installed; then
    log_info "未检测到已安装的 Argus Metric"
    return 0
  fi

  local installed_version
  installed_version=$(get_current_version) || true
  log_info "检测到当前版本: v$installed_version"

  stop_services

  log_info "执行卸载脚本..."
  if [[ ! -f "$CURRENT_LINK/uninstall.sh" ]]; then
    log_warning "未找到卸载脚本,执行基本清理"
  else
    cd "$CURRENT_LINK"
    chmod +x uninstall.sh
    # The caller already asked for uninstall, so answer the prompt for them.
    log_info "自动确认卸载操作..."
    echo "y" | ./uninstall.sh
    local script_status=$?
    if [[ $script_status -ne 0 ]]; then
      log_error "卸载脚本执行失败 (退出码: $script_status)"
      exit 1
    fi
    log_success "卸载脚本执行完成"
  fi

  log_info "清理安装目录..."
  if [[ ! -d "$INSTALL_DIR" ]]; then
    log_info "安装目录不存在,无需清理"
  else
    log_warning "这将删除整个安装目录: $INSTALL_DIR"
    log_warning "包括所有版本、备份和配置文件"
    # Non-interactive: delete without asking.
    if ! rm -rf "$INSTALL_DIR"; then
      log_error "清理安装目录失败"
      exit 1
    fi
    log_success "安装目录已完全清理: $INSTALL_DIR"
  fi

  log_success "Argus Metric 卸载完成!"
}
# Print a human-readable summary of the installation: active version, key
# paths, LATEST_VERSION file content, a two-level directory listing and the
# versions present under $VERSIONS_DIR. Prints a "not installed" notice when
# check_installed fails.
show_status() {
  printf '%s\n' \
    "==========================================" \
    " Argus Metric 安装状态" \
    "==========================================" \
    ""

  if ! check_installed; then
    log_warning "Argus Metric 未安装"
    log_info "安装目录: $INSTALL_DIR"
    return
  fi

  local installed_version
  installed_version=$(get_current_version) || true
  log_info "当前版本: $installed_version"
  log_info "安装目录: $INSTALL_DIR"
  log_info "当前链接: $CURRENT_LINK"
  log_info "版本目录: $VERSIONS_DIR/$installed_version"
  log_info "版本文件: $LATEST_VERSION_FILE"

  if [[ -f "$LATEST_VERSION_FILE" ]]; then
    local recorded_version
    recorded_version=$(cat "$LATEST_VERSION_FILE" 2>/dev/null | tr -d '[:space:]') || true
    log_info "版本文件内容: $recorded_version"
  fi

  echo
  log_info "目录结构:"
  if [[ -d "$INSTALL_DIR" ]]; then
    # Prefer tree; fall back to a plain listing when it is unavailable.
    tree -L 2 "$INSTALL_DIR" 2>/dev/null || ls -la "$INSTALL_DIR"
  fi

  echo
  log_info "可用版本:"
  if [[ ! -d "$VERSIONS_DIR" ]]; then
    echo " 无"
  else
    ls -1 "$VERSIONS_DIR" 2>/dev/null | sed 's/^/ - /'
  fi
  # Backup listing is intentionally not shown (simplified install logic).
}
# Print the backups found in $BACKUPS_DIR, newest first, each with the
# modification timestamp reported by stat. Warns when there are none.
list_backups() {
  printf '%s\n' \
    "==========================================" \
    " Argus Metric 备份列表" \
    "==========================================" \
    ""

  local backup_total=0
  if [[ -d "$BACKUPS_DIR" ]]; then
    backup_total=$(ls -1 "$BACKUPS_DIR" 2>/dev/null | wc -l)
  fi

  if [[ $backup_total -le 0 ]]; then
    log_warning "没有可用的备份版本"
    return
  fi

  log_info "可用备份版本:"
  ls -1t "$BACKUPS_DIR" 2>/dev/null | while read entry; do
    local stamp=$(stat -c %y "$BACKUPS_DIR/$entry" 2>/dev/null | cut -d' ' -f1-2)
    echo " - $entry (创建时间: $stamp)"
  done
}
# Roll back to the most recently modified backup in $BACKUPS_DIR and print
# the resulting status. Exits 1 when nothing is installed, no backup exists,
# or rollback_to_backup fails.
rollback_version() {
  log_info "开始回滚操作..."

  check_installed || {
    log_error "没有检测到已安装的版本,无法回滚"
    exit 1
  }

  mkdir -p "$BACKUPS_DIR"

  # Newest entry by modification time.
  local newest_backup
  newest_backup=$(ls -1t "$BACKUPS_DIR" 2>/dev/null | head -n 1) || true
  if [[ -z "$newest_backup" ]]; then
    log_error "没有找到可用的备份版本"
    exit 1
  fi

  log_info "将回滚到备份版本: $newest_backup"
  if ! rollback_to_backup "$newest_backup"; then
    log_error "回滚失败"
    exit 1
  fi

  log_success "回滚完成!"
  echo
  show_status
}
# Entry point of the online installer.
# Reads the global ACTION: "status" only prints the installation status
# (no root or FTP parameters needed); "uninstall" removes the installation;
# anything else installs. The backup-list and rollback actions are
# intentionally disabled (simplified install logic).
main() {
  echo "=========================================="
  echo " Argus Metric 在线安装脚本 v1.0"
  echo "=========================================="
  echo

  load_config

  # Derive every path from the final INSTALL_DIR before any action runs,
  # so that `status` inspects the same locations as install and uninstall.
  VERSIONS_DIR="$INSTALL_DIR/versions"
  BACKUPS_DIR="$INSTALL_DIR/backups"
  CURRENT_LINK="$INSTALL_DIR/current"
  LATEST_VERSION_FILE="$INSTALL_DIR/LATEST_VERSION"

  if [[ "$ACTION" == "status" ]]; then
    show_status
    return 0
  fi

  check_root
  check_ftp_params
  check_system

  if [[ "$ACTION" == "uninstall" ]]; then
    uninstall_argus_metric
  else
    install_argus_metric
  fi

  echo
  log_info "操作完成!"
}
# Script entry point: run main only when this file is executed directly,
# not when it is sourced by another script.
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
main "$@"
fi

View File

@ -0,0 +1,143 @@
#!/bin/bash
set -e
# 颜色
RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m'
# Logging helpers. Every message goes to stderr so that functions can hand
# results back on stdout without log lines mixed in.
_dns_sync_log() {
  local color="$1" tag="$2" text="$3"
  echo -e "${color}[${tag}]${NC} ${text}" >&2
}
log_info() { _dns_sync_log "$BLUE" "INFO" "$1"; }
log_success() { _dns_sync_log "$GREEN" "SUCCESS" "$1"; }
log_warning() { _dns_sync_log "$YELLOW" "WARNING" "$1"; }
log_error() { _dns_sync_log "$RED" "ERROR" "$1"; }
# Absolute directory of this script; get_ftp_config looks for config.env here.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Local copy of dns.conf that main compares against the downloaded one.
LOCAL_DNS_CONF="/opt/argus-metric/dns.conf"
# Resolver configuration file that update_resolv_conf rewrites.
RESOLV_CONF="/etc/resolv.conf"
# Fallback target used when RESOLV_CONF is not writable.
ALT_RESOLV_CONF="/run/resolv.conf"
# File that log_sync appends one timestamped line to per run.
LOG_FILE="/opt/argus-metric/.dns_sync.log"
# Filled in by get_ftp_config.
REMOTE_DNS_CONF_URL=""
# Resolve the FTP connection settings.
# If any of FTP_SERVER / FTP_USER / FTP_PASSWORD is empty, config.env next to
# the script is sourced when present; values still empty afterwards get the
# built-in defaults. Also sets REMOTE_DNS_CONF_URL.
get_ftp_config() {
  log_info "获取 FTP 配置信息..."

  if [[ -z "$FTP_SERVER" || -z "$FTP_USER" || -z "$FTP_PASSWORD" ]]; then
    if [[ -f "$SCRIPT_DIR/config.env" ]]; then
      source "$SCRIPT_DIR/config.env"
    fi
  fi

  : "${FTP_SERVER:=localhost}"
  : "${FTP_USER:=ftpuser}"
  : "${FTP_PASSWORD:=ZGClab1234!}"

  REMOTE_DNS_CONF_URL="ftp://${FTP_USER}:${FTP_PASSWORD}@${FTP_SERVER}/dns.conf"
}
# Download dns.conf from the FTP server into a fresh temporary file.
# Outputs: path of the downloaded file on stdout (the caller removes it).
# Returns: 1 when the server is unreachable, the temp file cannot be
#          created, or the download fails.
download_remote_dns_conf() {
  local tmp

  log_info "测试 FTP 连接..."
  if ! curl -u "${FTP_USER}:${FTP_PASSWORD}" -sfI "ftp://${FTP_SERVER}/" >/dev/null; then
    log_error "无法连接到 FTP 服务器: $FTP_SERVER"
    return 1
  fi

  # mktemp creates the file atomically with an unpredictable name; a fixed
  # /tmp/<name>.$$ path can be pre-created or symlinked by another local user.
  if ! tmp=$(mktemp /tmp/dns.remote.XXXXXX); then
    log_error "无法创建临时文件"
    return 1
  fi

  if ! curl -u "${FTP_USER}:${FTP_PASSWORD}" -sf "ftp://${FTP_SERVER}/dns.conf" -o "$tmp" 2>/dev/null; then
    log_error "下载 dns.conf 失败"
    rm -f "$tmp"
    return 1
  fi

  echo "$tmp"
}
# Succeed (exit status 0) when the two given files have identical content.
# Arguments: $1, $2 - paths to compare. Produces no output.
compare_files() {
  local left="$1" right="$2"
  diff -q "$left" "$right" &>/dev/null
}
# Print the unique IPv4 addresses listed one per line in a dns.conf file.
# Arguments: $1 - path to the dns.conf file.
# Lines may carry surrounding blanks or a trailing carriage return (a file
# saved with CRLF line endings); anything that is not a dotted quad with
# every octet in 0-255 is ignored. Output is sorted and de-duplicated.
get_dns_ips() {
  awk '
    {
      gsub(/\r/, "")
      gsub(/^[ \t]+|[ \t]+$/, "")
      if ($0 !~ /^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$/) next
      split($0, octet, ".")
      for (i = 1; i <= 4; i++) {
        if (length(octet[i]) > 3 || octet[i] + 0 > 255) next
      }
      print
    }
  ' "$1" | sort -u
}
# Rewrite the resolver configuration so that it lists exactly the servers
# from the given dns.conf, followed by the file's previous non-nameserver
# lines (duplicate lines removed).
# Arguments: $1 - path to the dns.conf file.
# Writes to $RESOLV_CONF, or to $ALT_RESOLV_CONF when the former is not
# writable. A timestamped backup copy is attempted first. The target is
# overwritten in place so that a symlinked or bind-mounted file keeps its
# identity. Returns 1 only when no temporary file can be created.
update_resolv_conf() {
  local dns_conf="$1"
  local -a dns_ips
  mapfile -t dns_ips < <(get_dns_ips "$dns_conf")
  if [[ ${#dns_ips[@]} -eq 0 ]]; then
    log_warning "未检测到有效 DNS"
    return
  fi

  local target_file="$RESOLV_CONF"
  if [[ ! -w "$RESOLV_CONF" ]]; then
    log_warning "/etc/resolv.conf 不可写,使用兜底路径 $ALT_RESOLV_CONF"
    target_file="$ALT_RESOLV_CONF"
  fi

  # Unpredictable staging file instead of a guessable /tmp/<name>.$$ path.
  local staged
  if ! staged=$(mktemp /tmp/resolv.new.XXXXXX); then
    log_error "无法创建临时文件"
    return 1
  fi

  cp "$target_file" "${target_file}.backup.$(date +%Y%m%d_%H%M%S)" 2>/dev/null || true
  log_info "更新 DNS 配置文件: $target_file"

  # New nameserver lines first, then everything else from the old file;
  # awk drops repeated lines while keeping first-seen order.
  {
    printf 'nameserver %s\n' "${dns_ips[@]}"
    grep -v '^nameserver' "$target_file" 2>/dev/null || true
  } | awk '!seen[$0]++' >"$staged"

  # Overwrite with cat rather than mv: replacing a bind-mounted file fails
  # with "device or resource busy".
  if cat "$staged" >"$target_file" 2>/dev/null; then
    chmod 644 "$target_file"
    log_success "DNS 更新完成: ${dns_ips[*]}"
  else
    log_error "无法写入 $target_file,可能被系统锁定"
  fi
  rm -f "$staged"
}
# Verify that every server from dns.conf has its own `nameserver` line in
# $RESOLV_CONF and rewrite the file via update_resolv_conf when one is
# missing.
# Arguments: $1 - path to the dns.conf file.
ensure_dns_in_resolv() {
  local dns_conf="$1"
  local -a dns_ips
  mapfile -t dns_ips < <(get_dns_ips "$dns_conf")
  if [[ ${#dns_ips[@]} -eq 0 ]]; then
    return
  fi

  local ip
  for ip in "${dns_ips[@]}"; do
    # Compare whole fields: a regex/substring match would accept
    # "nameserver 1.1.1.10" as proof that 1.1.1.1 is configured.
    if ! awk -v ip="$ip" '
      $1 == "nameserver" && $2 == ip { found = 1; exit }
      END { exit (found ? 0 : 1) }
    ' "$RESOLV_CONF" 2>/dev/null; then
      log_warning "检测到 /etc/resolv.conf 缺少 $ip,执行兜底修复"
      update_resolv_conf "$dns_conf"
      return
    fi
  done

  log_info "/etc/resolv.conf 已包含所有 DNS"
}
# Append one timestamped line ("[YYYY-MM-DD HH:MM:SS] message") to $LOG_FILE.
log_sync() {
  local stamp
  stamp=$(date '+%F %T')
  printf '[%s] %s\n' "$stamp" "$1" >>"$LOG_FILE"
}
# One DNS synchronisation pass: download the remote dns.conf, then either
# adopt it (first run or changed content) and rewrite the resolver file, or,
# when unchanged, only verify that the resolver file still lists every
# server. Exits 1 when the download fails.
main() {
  log_info "开始 DNS 同步检查..."
  mkdir -p /opt/argus-metric
  get_ftp_config

  local fetched
  if ! fetched=$(download_remote_dns_conf); then
    log_error "下载失败"
    log_sync "同步失败"
    exit 1
  fi

  if [[ -f "$LOCAL_DNS_CONF" ]] && compare_files "$LOCAL_DNS_CONF" "$fetched"; then
    log_info "dns.conf 无变化"
    ensure_dns_in_resolv "$LOCAL_DNS_CONF"
    log_sync "dns.conf 无变化,执行兜底检查"
  else
    # Both "first run" and "content changed" install the fetched file; they
    # differ only in what is logged.
    local summary
    if [[ -f "$LOCAL_DNS_CONF" ]]; then
      log_info "检测到 DNS 配置更新"
      summary="DNS 配置同步完成"
    else
      log_info "本地 dns.conf 不存在,初始化..."
      summary="首次同步完成"
    fi
    cp "$fetched" "$LOCAL_DNS_CONF"
    update_resolv_conf "$LOCAL_DNS_CONF"
    log_sync "$summary"
  fi

  rm -f "$fetched"
  log_success "DNS 同步流程完成"
}
# Run main only when this file is executed directly, not when it is sourced.
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
main "$@"
fi

View File

@ -0,0 +1,274 @@
#!/bin/bash
set -e
# 颜色定义
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Logging helpers: print "[LEVEL] message" to stdout, with the level tag
# wrapped in the matching colour variable.
_print_log_line() {
  local color="$1" tag="$2" text="$3"
  echo -e "${color}[${tag}]${NC} ${text}"
}
log_info() { _print_log_line "$BLUE" "INFO" "$1"; }
log_success() { _print_log_line "$GREEN" "SUCCESS" "$1"; }
log_warning() { _print_log_line "$YELLOW" "WARNING" "$1"; }
log_error() { _print_log_line "$RED" "ERROR" "$1"; }
# Configuration
# Install tree removed by cleanup_global_files.
INSTALL_DIR="/opt/argus-metric"
# Per-run scratch directory ($$ is this shell's PID); removed by cleanup.
TEMP_DIR="/tmp/argus-metric-uninstall-$$"
# File name that find_version_file searches for.
VERSION_FILE="version.json"
# Abort with exit status 1 unless the script runs with effective UID 0.
check_root() {
  [[ $EUID -eq 0 ]] && return 0

  log_error "此脚本需要 root 权限运行"
  log_info "请使用: sudo $0"
  exit 1
}
# Locate the version file and store its path in VERSION_FILE_PATH.
# Looks in the current directory first, then in each artifact/*/ directory.
# Exits with status 1 when no file is found.
find_version_file() {
  log_info "查找版本信息文件..."

  if [[ -f "$VERSION_FILE" ]]; then
    VERSION_FILE_PATH="$VERSION_FILE"
    log_success "找到版本文件: $VERSION_FILE"
    return 0
  fi

  local release_dir candidate
  for release_dir in artifact/*/; do
    candidate="${release_dir}${VERSION_FILE}"
    [[ -f "$candidate" ]] || continue
    VERSION_FILE_PATH="$candidate"
    log_success "找到版本文件: $VERSION_FILE_PATH"
    return 0
  done

  log_error "未找到版本信息文件 $VERSION_FILE"
  log_info "请确保在正确的目录下运行此脚本"
  exit 1
}
# Read version metadata from $VERSION_FILE_PATH.
# Sets the globals VERSION and BUILD_TIME and writes the install_order
# entries, one per line, to $TEMP_DIR/install_order.txt.
# Uses jq when available, otherwise a small text-based fallback.
# Exits with status 1 when the file or the install_order field is missing.
parse_version_info() {
  log_info "解析版本信息..."

  if [[ ! -f "$VERSION_FILE_PATH" ]]; then
    log_error "版本文件不存在: $VERSION_FILE_PATH"
    exit 1
  fi

  if command -v jq &> /dev/null; then
    VERSION=$(jq -r '.version' "$VERSION_FILE_PATH")
    BUILD_TIME=$(jq -r '.build_time' "$VERSION_FILE_PATH")

    if jq -e '.install_order' "$VERSION_FILE_PATH" > /dev/null 2>&1; then
      jq -r '.install_order[]' "$VERSION_FILE_PATH" > "$TEMP_DIR/install_order.txt"
    else
      log_error "version.json 中缺少 install_order 字段"
      exit 1
    fi
  else
    log_warning "jq 未安装,使用简单的 JSON 解析"
    VERSION=$(grep '"version"' "$VERSION_FILE_PATH" | sed 's/.*"version": *"\([^"]*\)".*/\1/')
    BUILD_TIME=$(grep '"build_time"' "$VERSION_FILE_PATH" | sed 's/.*"build_time": *"\([^"]*\)".*/\1/')

    # Collect only the quoted strings between `"install_order"` and the
    # closing `]`. The key itself and any fields after the array must not be
    # treated as component file names. Works for single- and multi-line arrays.
    if ! awk '
      !in_list && /"install_order"/ {
        in_list = 1
        seen = 1
        sub(/^.*"install_order"[^[]*/, "")
      }
      in_list {
        rest = $0
        close_at = index(rest, "]")
        if (close_at > 0) rest = substr(rest, 1, close_at - 1)
        while (match(rest, /"[^"]*"/)) {
          print substr(rest, RSTART + 1, RLENGTH - 2)
          rest = substr(rest, RSTART + RLENGTH)
        }
        if (close_at > 0) exit
      }
      END { exit (seen ? 0 : 1) }
    ' "$VERSION_FILE_PATH" > "$TEMP_DIR/install_order.txt"; then
      log_error "version.json 中缺少 install_order 字段"
      exit 1
    fi
  fi

  log_success "版本信息解析完成"
  log_info " 版本: $VERSION"
  log_info " 构建时间: $BUILD_TIME"
}
# Create the scratch directory $TEMP_DIR (including missing parents).
create_temp_dirs() {
  local workdir="$TEMP_DIR"

  log_info "创建临时目录..."
  mkdir -p "$workdir"
  log_success "临时目录创建完成: $workdir"
}
# Uninstall every component listed in $TEMP_DIR/install_order.txt.
# For each archive name: unpack it from the directory of $VERSION_FILE_PATH
# into a scratch directory, run the uninstall.sh found in the first
# extracted directory (answering its prompt with "y"), then remove the
# scratch directory. A component without uninstall.sh is skipped with a
# warning. Exits 1 when an archive is missing, cannot be unpacked, contains
# no directory, or its uninstall script fails.
uninstall_components() {
  log_info "开始卸载组件..."

  local order_file="$TEMP_DIR/install_order.txt"
  local artifact_dir
  artifact_dir=$(dirname "$VERSION_FILE_PATH")
  local uninstall_count=0
  local total_count=0
  # Archive names look like <component>-YYYYMMDD-HHMMSS.tar.gz
  local stamped_archive='^(.*)-[0-9]{8}-[0-9]{6}\.tar\.gz$'

  if [[ -f "$order_file" ]]; then
    total_count=$(wc -l < "$order_file")

    local filename component tar_file work_dir extracted_dir entry
    while IFS= read -r filename; do
      uninstall_count=$((uninstall_count + 1))

      if [[ "$filename" =~ $stamped_archive ]]; then
        component="${BASH_REMATCH[1]}"
      else
        component="$filename"
      fi
      log_info "[$uninstall_count/$total_count] 卸载 $component..."

      tar_file="$artifact_dir/$filename"
      if [[ ! -f "$tar_file" ]]; then
        log_error "找不到组件文件: $filename"
        exit 1
      fi

      work_dir="$TEMP_DIR/$component"
      mkdir -p "$work_dir"
      if ! tar -xzf "$tar_file" -C "$work_dir"; then
        log_error " $component 解压失败"
        exit 1
      fi
      log_success " $component 解压完成"

      # First directory inside the unpacked archive.
      extracted_dir=""
      for entry in "$work_dir"/*; do
        if [[ -d "$entry" ]]; then
          extracted_dir="$entry"
          break
        fi
      done
      if [[ -z "$extracted_dir" ]]; then
        log_error " $component 解压后未找到目录"
        exit 1
      fi

      if [[ ! -f "$extracted_dir/uninstall.sh" ]]; then
        log_warning " $component 缺少 uninstall.sh 文件,跳过卸载"
      else
        log_info " 执行 $component 卸载脚本..."
        # Each component script asks for a single confirmation.
        if (cd "$extracted_dir" && echo "y" | ./uninstall.sh); then
          log_success " $component 卸载完成"
        else
          log_error " $component 卸载失败"
          exit 1
        fi
      fi

      rm -rf "$work_dir"
    done < "$order_file"
  fi

  log_success "所有组件卸载完成"
}
# Delete the install tree ($INSTALL_DIR) and the global directories
# /etc/argus-metric and /var/log/argus-metric when they exist.
cleanup_global_files() {
  log_info "清理全局文件..."

  if [[ ! -d "$INSTALL_DIR" ]]; then
    log_info "安装目录不存在: $INSTALL_DIR"
  else
    rm -rf "$INSTALL_DIR"
    log_success "安装目录已清理: $INSTALL_DIR"
  fi

  local leftover
  for leftover in "/etc/argus-metric" "/var/log/argus-metric"; do
    [[ -d "$leftover" ]] || continue
    rm -rf "$leftover"
    log_success "全局配置已清理: $leftover"
  done
}
# Print the post-uninstall summary (version, build time, what was removed
# and what may still need manual cleanup).
show_uninstall_info() {
  log_success "Argus-Metrics All-in-One 卸载完成!"
  cat <<EOF

卸载信息:
 版本: $VERSION
 构建时间: $BUILD_TIME

清理内容:
 - 二进制文件
 - 配置文件
 - 数据目录
 - 进程和服务
 - 全局安装目录

注意:
 - 系统依赖包可能仍然存在
 - 如需完全清理,请手动检查并删除相关文件

EOF
}
# Remove the scratch directory $TEMP_DIR if it exists.
cleanup() {
  [[ -d "$TEMP_DIR" ]] || return 0
  rm -rf "$TEMP_DIR"
}
# Run cleanup on every exit path of the script.
trap cleanup EXIT
# Entry point of the all-in-one uninstaller: verify root, locate and parse
# the version file, ask for confirmation, then uninstall every component and
# remove the global files. Any answer other than y/Y cancels with exit 0.
main() {
  printf '%s\n' \
    "==========================================" \
    " Argus-Metrics All-in-One 卸载脚本" \
    "==========================================" \
    ""

  check_root
  find_version_file
  create_temp_dirs
  parse_version_info

  log_warning "此操作将完全卸载 Argus-Metrics All-in-One"
  local answer
  read -p "确认继续?(y/N): " answer
  case "$answer" in
    y | Y) ;;
    *)
      log_info "取消卸载操作"
      exit 0
      ;;
  esac

  uninstall_components
  cleanup_global_files
  show_uninstall_info
}
# Script entry point: run main only when this file is executed directly,
# not when it is sourced by another script.
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
main "$@"
fi

View File

@ -0,0 +1,350 @@
#!/bin/bash
set -e
# 颜色定义
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Logging helpers: print "[LEVEL] message" to stdout, with the level tag
# wrapped in the matching colour variable.
_print_log_line() {
  local color="$1" tag="$2" text="$3"
  echo -e "${color}[${tag}]${NC} ${text}"
}
log_info() { _print_log_line "$BLUE" "INFO" "$1"; }
log_success() { _print_log_line "$GREEN" "SUCCESS" "$1"; }
log_warning() { _print_log_line "$YELLOW" "WARNING" "$1"; }
log_error() { _print_log_line "$RED" "ERROR" "$1"; }
# Print the usage text of the version management tool.
show_help() {
  cat <<EOF
AIOps 版本管理工具

用法: $0 <command> [options]

命令:
 bump <type> - 升级版本号 (major|minor|patch)
 set <version> - 设置指定版本号
 show - 显示当前版本信息
 list - 列出所有版本
 clean - 清理旧版本
 validate - 验证版本配置

示例:
 $0 bump minor # 升级次版本号 1.0.0 -> 1.1.0
 $0 set 2.0.0 # 设置版本为 2.0.0
 $0 show # 显示当前版本
 $0 list # 列出所有版本
EOF
}
# Print the content of config/VERSION, or "0.0.0" when the file is missing.
get_current_version() {
  local version_file="config/VERSION"

  if [[ ! -f "$version_file" ]]; then
    echo "0.0.0"
    return
  fi
  cat "$version_file"
}
# Write the given version to config/VERSION.
# Arguments: $1 - version in major.minor.patch form.
# Exits with status 1 when the value is not three dot-separated numbers.
set_version() {
  local requested="$1"
  local semver_pattern='^[0-9]+\.[0-9]+\.[0-9]+$'

  if ! [[ "$requested" =~ $semver_pattern ]]; then
    log_error "无效的版本号格式: $requested"
    log_info "版本号格式应为: major.minor.patch (如: 1.2.3)"
    exit 1
  fi

  echo "$requested" > config/VERSION
  log_success "版本号已设置为: $requested"
}
# Increase one component of the current version and store the result.
# Arguments: $1 - component to bump: major | minor | patch.
# Bumping major resets minor and patch to 0; bumping minor resets patch.
# Exits with status 1 for an unknown bump type or when a component of the
# current version is not numeric.
bump_version() {
  local bump_type="$1"
  local current_version
  current_version=$(get_current_version) || true

  local major minor patch
  IFS='.' read -r major minor patch <<< "$current_version"

  local part
  for part in "$major" "$minor" "$patch"; do
    if [[ -n "$part" && ! "$part" =~ ^[0-9]+$ ]]; then
      log_error "当前版本号格式无效: $current_version"
      exit 1
    fi
  done

  # `10#` forces base 10: a component such as "09" would otherwise be read
  # as an invalid octal number by the arithmetic expansion.
  case "$bump_type" in
    "major")
      major=$((10#${major:-0} + 1))
      minor=0
      patch=0
      ;;
    "minor")
      minor=$((10#${minor:-0} + 1))
      patch=0
      ;;
    "patch")
      patch=$((10#${patch:-0} + 1))
      ;;
    *)
      log_error "无效的升级类型: $bump_type"
      log_info "支持的类型: major, minor, patch"
      exit 1
      ;;
  esac

  local new_version="$major.$minor.$patch"
  set_version "$new_version"
  log_success "版本号已从 $current_version 升级到 $new_version"
}
# Print the current version, the components listed in config/checklist and
# the archives already built under artifact/<version>.
show_version() {
  local active_version
  active_version=$(get_current_version) || true
  log_info "当前版本: $active_version"

  if [[ -f "config/checklist" ]]; then
    echo
    echo "组件清单:"
    local entry comp_name comp_version
    while IFS= read -r entry; do
      # Skip blank lines and comments.
      if [[ -z "$entry" || "$entry" =~ ^[[:space:]]*# ]]; then
        continue
      fi
      # Only the first two whitespace-separated fields are displayed.
      read -r comp_name comp_version _ <<< "$entry"
      if [[ -n "$comp_name" && -n "$comp_version" ]]; then
        echo " - $comp_name v$comp_version"
      fi
    done < config/checklist
  fi

  local artifact_dir="artifact/$active_version"
  if [[ ! -d "$artifact_dir" ]]; then
    echo
    log_warning "未找到对应的构建目录: $artifact_dir"
    log_info "运行 ./package.sh 进行构建"
    return
  fi

  echo
  echo "已构建的组件:"
  local archive archive_name archive_size
  for archive in "$artifact_dir"/*.tar.gz; do
    if [[ -f "$archive" ]]; then
      archive_name=$(basename "$archive")
      archive_size=$(du -h "$archive" | cut -f1)
      echo " - $archive_name ($archive_size)"
    fi
  done

  if [[ -f "$artifact_dir/version.json" ]]; then
    echo
    echo "版本信息文件: $artifact_dir/version.json"
  fi
}
# List every version directory under artifact/, marking the current one and
# showing how many .tar.gz archives each contains.
list_versions() {
  log_info "所有版本列表:"
  echo

  if [[ ! -d "artifact" ]]; then
    log_warning "artifact 目录不存在"
    return
  fi

  local version_dir version active_version archive archive_total
  for version_dir in artifact/*/; do
    [[ -d "$version_dir" ]] || continue

    version=$(basename "$version_dir")
    active_version=$(get_current_version) || true
    if [[ "$version" != "$active_version" ]]; then
      echo " $version"
    else
      echo " * $version (当前版本)"
    fi

    archive_total=0
    for archive in "$version_dir"/*.tar.gz; do
      if [[ -f "$archive" ]]; then
        archive_total=$((archive_total + 1))
      fi
    done
    if [[ $archive_total -gt 0 ]]; then
      echo " 包含 $archive_total 个组件"
    fi
  done
}
# Delete old build directories under artifact/, keeping the newest five.
# "Newest" is decided by version number, not by modification time. The
# directory of the current version is never deleted, even when it falls
# among the oldest.
clean_versions() {
  local current_version
  current_version=$(get_current_version) || true
  local keep_versions=5

  log_info "清理旧版本 (保留最近 $keep_versions 个版本)..."

  if [[ ! -d "artifact" ]]; then
    log_warning "artifact 目录不存在"
    return
  fi

  # Collect version directories, oldest first. `sort -V` compares version
  # numbers component-wise; a plain byte-wise sort would put 1.10.0 before
  # 1.2.0 and delete the newer release.
  local versions=()
  local version_dir
  while IFS= read -r -d '' version_dir; do
    versions+=("$(basename "$version_dir")")
  done < <(find artifact -maxdepth 1 -type d -name "[0-9]*" -print0 | sort -zV)

  local total_versions=${#versions[@]}
  local versions_to_remove=$((total_versions - keep_versions))

  if [[ $versions_to_remove -le 0 ]]; then
    log_info "无需清理,当前只有 $total_versions 个版本"
    return
  fi

  log_info "将删除 $versions_to_remove 个旧版本..."

  local i version
  for ((i = 0; i < versions_to_remove; i++)); do
    version="${versions[i]}"
    if [[ "$version" != "$current_version" ]]; then
      log_info "删除版本: $version"
      rm -rf "artifact/$version"
    fi
  done

  log_success "旧版本清理完成"
}
# Check the version configuration relative to the current directory:
#   - config/VERSION exists and holds a major.minor.patch value
#   - config/checklist exists, lists at least one component, and each listed
#     component has a plugins/<component> directory
#   - scripts/package_artifact.sh and scripts/install_artifact.sh exist
#     (a missing executable bit only produces a warning)
# Exits with status 1 when at least one problem was found.
validate_version() {
  log_info "验证版本配置..."

  local errors=0
  local semver_pattern='^[0-9]+\.[0-9]+\.[0-9]+$'

  # VERSION file
  if [[ -f "config/VERSION" ]]; then
    local version
    version=$(get_current_version) || true
    if [[ "$version" =~ $semver_pattern ]]; then
      log_success "VERSION 文件格式正确: $version"
    else
      log_error "VERSION 文件格式无效: $version"
      errors=$((errors + 1))
    fi
  else
    log_error "VERSION 文件不存在"
    errors=$((errors + 1))
  fi

  # checklist file
  if [[ -f "config/checklist" ]]; then
    local component_count=0
    local entry comp_name comp_version
    while IFS= read -r entry; do
      # Skip blank lines and comments.
      if [[ -z "$entry" || "$entry" =~ ^[[:space:]]*# ]]; then
        continue
      fi
      read -r comp_name comp_version _ <<< "$entry"
      if [[ -z "$comp_name" || -z "$comp_version" ]]; then
        continue
      fi
      component_count=$((component_count + 1))
      if [[ ! -d "plugins/$comp_name" ]]; then
        log_error "组件目录不存在: plugins/$comp_name"
        errors=$((errors + 1))
      fi
    done < config/checklist

    if [[ $component_count -gt 0 ]]; then
      log_success "checklist 包含 $component_count 个组件"
    else
      log_error "checklist 中没有有效组件"
      errors=$((errors + 1))
    fi
  else
    log_error "checklist 文件不存在"
    errors=$((errors + 1))
  fi

  # packaging and install scripts
  local script_name
  for script_name in package_artifact.sh install_artifact.sh; do
    if [[ ! -f "scripts/$script_name" ]]; then
      log_error "$script_name 文件不存在"
      errors=$((errors + 1))
    elif [[ -x "scripts/$script_name" ]]; then
      log_success "$script_name 可执行"
    else
      log_warning "$script_name 不可执行,请运行: chmod +x scripts/$script_name"
    fi
  done

  if [[ $errors -ne 0 ]]; then
    log_error "发现 $errors 个配置问题"
    exit 1
  fi
  log_success "版本配置验证通过"
}
# Dispatch the sub-command given as $1 (bump, set, show, list, clean,
# validate, help). No argument prints the help text; an unknown command
# prints an error plus the help text and exits 1. `bump` and `set` need a
# second argument and exit 1 without one.
main() {
  local subcommand="${1:-}"

  case "$subcommand" in
    bump)
      if [[ -z "${2:-}" ]]; then
        log_error "请指定升级类型: major, minor, patch"
        exit 1
      fi
      bump_version "$2"
      ;;
    set)
      if [[ -z "${2:-}" ]]; then
        log_error "请指定版本号"
        exit 1
      fi
      set_version "$2"
      ;;
    show) show_version ;;
    list) list_versions ;;
    clean) clean_versions ;;
    validate) validate_version ;;
    "" | help | -h | --help) show_help ;;
    *)
      log_error "未知命令: $1"
      echo
      show_help
      exit 1
      ;;
  esac
}
# Script entry point: run main only when this file is executed directly,
# not when it is sourced by another script.
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
main "$@"
fi

View File

@ -2,6 +2,18 @@ FROM grafana/grafana:11.1.0
USER root
# Build argument: set to "true" to build against the intranet mirror.
ARG USE_INTRANET=false
# Rewrite the scheme and host of every URL in /etc/apk/repositories:
# to http://10.68.64.1 for intranet builds, otherwise to
# https://mirrors.aliyun.com.
RUN if [ "$USE_INTRANET" = "true" ]; then \
echo "Configuring intranet apk repositories..." && \
sed -i 's#https\?://[^/]\+#http://10.68.64.1#g' /etc/apk/repositories; \
else \
echo "Configuring public apk repositories..." && \
sed -i 's#https\?://[^/]\+#https://mirrors.aliyun.com#g' /etc/apk/repositories; \
fi
# 安装必要的工具
RUN apk add --no-cache \
supervisor \
@ -10,6 +22,11 @@ RUN apk add --no-cache \
vim \
bash
# For intranet builds, rewrite the apk repository URLs once more so the
# final image points at the mirror used on the deployment side
# (https://10.92.132.52/mirrors). Other builds are left unchanged.
RUN if [ "$USE_INTRANET" = "true" ]; then \
sed -i 's#https\?://[^/]\+#https://10.92.132.52/mirrors#g' /etc/apk/repositories; \
fi
# supervisor 日志目录
RUN mkdir -p /var/log/supervisor
@ -48,6 +65,8 @@ COPY grafana.ini /tmp/grafana.ini
COPY datasources/datasources.yml /tmp/datasources.yml
COPY dashboards/dashboards.yml /tmp/dashboards.yml
COPY dashboards/default_dashboard_by_hostname.json /tmp/default_dashboard.json
COPY dashboards/default_cluster_dashboard.json /tmp/default_cluster_dashboard.json
COPY dashboards/default_dashboard_by_instance.json /tmp/default_dashboard_by_instance.json
# supervisor 配置
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf

View File

@ -581,6 +581,372 @@
],
"title": "Node Process Count",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "GPU Utilization (%)",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "orange",
"value": 80
},
{
"color": "red",
"value": 95
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 24
},
"id": 301,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"expr": "DCGM_FI_DEV_GPU_UTIL{hostname=~\"$hostname\"}",
"legendFormat": "{{hostname}} GPU{{gpu}}",
"refId": "A"
}
],
"title": "GPU 利用率 (单卡)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": true,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "Memory Used (%)",
"axisPlacement": "left",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
}
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "orange",
"value": 80
},
{
"color": "red",
"value": 95
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 24
},
"id": 403,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"expr": "round(DCGM_FI_DEV_FB_USED{hostname=~\"$hostname\"} / (DCGM_FI_DEV_FB_USED{hostname=~\"$hostname\"} + DCGM_FI_DEV_FB_FREE{hostname=~\"$hostname\"}) * 100)",
"legendFormat": "{{hostname}} GPU{{gpu}}",
"refId": "A"
}
],
"title": "GPU 显存使用率 (单卡)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": true,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "Temperature (℃)",
"axisPlacement": "left",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 85
}
]
},
"unit": "celsius"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 32
},
"id": 501,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"expr": "DCGM_FI_DEV_GPU_TEMP{hostname=~\"$hostname\"}",
"legendFormat": "{{hostname}} GPU{{gpu}}",
"refId": "A"
}
],
"title": "GPU 温度(单卡)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": true,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "Power (W)",
"axisPlacement": "left",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"max": 300,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "orange",
"value": 200
},
{
"color": "red",
"value": 300
}
]
},
"unit": "watt"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 32
},
"id": 502,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"expr": "DCGM_FI_DEV_POWER_USAGE{hostname=~\"$hostname\"}",
"legendFormat": "{{hostname}} GPU{{gpu}}",
"refId": "A"
}
],
"title": "GPU 功率 (单卡)",
"type": "timeseries"
}
],
"refresh": "15s",
@ -589,11 +955,6 @@
"templating": {
"list": [
{
"current": {
"selected": true,
"text": "node-exporter-A1",
"value": "node-exporter-A1"
},
"datasource": {
"type": "prometheus"
},
@ -623,7 +984,7 @@
},
"timepicker": {},
"timezone": "",
"title": "Node and GPU Metrics",
"uid": "node_gpu_metrics",
"title": "Node and GPU Metrics (by hostname)",
"uid": "node_gpu_metrics_by_hostname",
"weekStart": ""
}

View File

@ -622,7 +622,7 @@
},
"timepicker": {},
"timezone": "",
"title": "Node and GPU Metrics",
"uid": "node_gpu_metrics",
"title": "Node and GPU Metrics (by instance)",
"uid": "node_gpu_metrics_by_instance",
"weekStart": ""
}

View File

@ -8,7 +8,7 @@ datasources:
type: prometheus
access: proxy
uid: eezk1zvkie4g0a
url: http://10.211.55.5:9090
url: http://prom.metric.argus.com:9090
isDefault: true
editable: true
jsonData:

View File

@ -44,12 +44,18 @@ else
fi
# 复制数据源配置文件到挂载目录
if [ -f "/tmp/datasources.yml" ]; then
echo "[INFO] Copying datasource configuration to /private/argus/metric/grafana/provisioning/datasources/"
cp /tmp/datasources.yml /private/argus/metric/grafana/provisioning/datasources/datasources.yml
echo "[INFO] Datasource configuration copied successfully"
elif [ -d "/private/argus/metric/grafana/provisioning/datasources" ] && [ "$(ls -A /private/argus/metric/grafana/provisioning/datasources)" ]; then
echo "[INFO] Found existing datasource provisioning files in /private/argus/metric/grafana/provisioning/datasources"
DS_OUT="/private/argus/metric/grafana/provisioning/datasources/datasources.yml"
PROM_DOMAIN="prom.metric.argus.com:9090"
if [ -f "/tmp/datasources.yml" ] && [ ! -f "$DS_OUT" ]; then
echo "[INFO] Initializing datasource provisioning file from /tmp"
cp /tmp/datasources.yml "$DS_OUT"
fi
# 统一将数据源 URL 规范为 prom.metric.argus.com:9090
if [ -f "$DS_OUT" ]; then
sed -i -E "s#^\s*url:\s*http://[^[:space:]]+# url: http://$PROM_DOMAIN#g" "$DS_OUT" || true
echo "[INFO] Datasource URL normalized to http://$PROM_DOMAIN"
elif [ -d "/etc/grafana/provisioning/datasources" ] && [ "$(ls -A /etc/grafana/provisioning/datasources)" ]; then
echo "[INFO] Found datasource provisioning files in /etc/grafana/provisioning/datasources"
# 确保数据源配置目录权限正确
@ -65,11 +71,33 @@ if [ -f "/tmp/dashboards.yml" ]; then
echo "[INFO] Dashboard configuration copied successfully"
fi
# 复制默认仪表板到挂载目录
if [ -f "/tmp/default_dashboard.json" ]; then
echo "[INFO] Copying default dashboard to /private/argus/metric/grafana/provisioning/dashboards/"
cp /tmp/default_dashboard.json /private/argus/metric/grafana/provisioning/dashboards/default_dashboard.json
echo "[INFO] Default dashboard copied successfully"
# 复制默认仪表板到挂载目录(按需,不覆盖已存在文件)
# Install one dashboard JSON file into the Grafana provisioning directory,
# never overwriting a file that is already present there.
# Arguments:
#   $1 - path of the source dashboard file
#   $2 - file name to use inside the provisioning dashboards directory
# Outputs:
#   One [INFO] line on stdout when a dashboard is installed or skipped;
#   nothing when the source file does not exist.
# Returns:
#   0 when nothing had to be copied, otherwise the exit status of cp.
copy_dashboard_if_missing() {
  local src="$1" dst_name="$2"
  local dst="/private/argus/metric/grafana/provisioning/dashboards/$dst_name"

  # A missing source is not an error: there is simply nothing to install.
  [ -f "$src" ] || return 0

  # Keep whatever is already in the mounted directory.
  if [ -f "$dst" ]; then
    echo "[INFO] Dashboard exists, skip: $dst_name"
    return 0
  fi

  echo "[INFO] Installing dashboard: $dst_name"
  cp "$src" "$dst"
}
copy_dashboard_if_missing "/tmp/default_dashboard.json" "default_dashboard.json"
copy_dashboard_if_missing "/tmp/default_cluster_dashboard.json" "default_cluster_dashboard.json"
copy_dashboard_if_missing "/tmp/default_dashboard_by_instance.json" "default_dashboard_by_instance.json"
# 规范面板中的数据源字段:将字符串 "prometheus" 替换为 null使用默认数据源
DB_DIR="/private/argus/metric/grafana/provisioning/dashboards"
if [ -d "$DB_DIR" ]; then
for f in "$DB_DIR"/*.json; do
[ -f "$f" ] || continue
sed -i -E 's/"datasource"\s*:\s*"prometheus"/"datasource": null/g' "$f" || true
done
echo "[INFO] Normalized dashboard datasource to default (null)"
fi
# 启动 Grafana

View File

@ -11,13 +11,6 @@ RUN if [ "$USE_INTRANET" = "true" ]; then \
echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \
echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \
else \
echo "Configuring fast apt sources for external network..." && \
find /etc/apt -name "sources.list*" -exec sed -i 's/archive.ubuntu.com/mirrors.aliyun.com/g' {} \; && \
find /etc/apt -name "sources.list*" -exec sed -i 's/security.ubuntu.com/mirrors.aliyun.com/g' {} \; && \
echo "deb http://mirrors.aliyun.com/ubuntu/ jammy main restricted universe multiverse" > /etc/apt/sources.list && \
echo "deb http://mirrors.aliyun.com/ubuntu/ jammy-updates main restricted universe multiverse" >> /etc/apt/sources.list && \
echo "deb http://mirrors.aliyun.com/ubuntu/ jammy-security main restricted universe multiverse" >> /etc/apt/sources.list; \
fi
# 验证源配置并安装常用工具
@ -61,10 +54,25 @@ RUN mkdir -p ${PROMETHEUS_BASE_PATH}/rules \
&& ln -s ${PROMETHEUS_BASE_PATH} /prometheus
# 修改 Prometheus 用户 UID/GID 并授权
RUN usermod -u ${ARGUS_BUILD_UID} nobody && \
groupmod -g ${ARGUS_BUILD_GID} nogroup && \
chown -h nobody:nogroup /prometheus && \
chown -R nobody:nogroup ${PROMETHEUS_BASE_PATH} && \
RUN set -eux; \
existing_user=""; \
if getent passwd "${ARGUS_BUILD_UID}" >/dev/null 2>&1; then \
existing_user="$(getent passwd "${ARGUS_BUILD_UID}" | cut -d: -f1)"; \
fi; \
if [ -n "$existing_user" ] && [ "$existing_user" != "nobody" ]; then \
userdel -r "$existing_user" || true; \
fi; \
existing_group=""; \
if getent group "${ARGUS_BUILD_GID}" >/dev/null 2>&1; then \
existing_group="$(getent group "${ARGUS_BUILD_GID}" | cut -d: -f1)"; \
fi; \
if [ -n "$existing_group" ] && [ "$existing_group" != "nogroup" ]; then \
groupdel "$existing_group" || true; \
fi; \
usermod -u ${ARGUS_BUILD_UID} nobody; \
groupmod -g ${ARGUS_BUILD_GID} nogroup; \
chown -h nobody:nogroup /prometheus; \
chown -R nobody:nogroup ${PROMETHEUS_BASE_PATH}; \
chown -R nobody:nogroup /etc/prometheus
# supervisor 配置

View File

@ -5,13 +5,6 @@ networks:
services:
ftp:
build:
context: ../ftp/build
dockerfile: Dockerfile
args:
ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133}
ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015}
USE_INTRANET: ${USE_INTRANET:-false}
image: argus-metric-ftp:latest
container_name: argus-ftp
restart: unless-stopped
@ -41,13 +34,6 @@ services:
max-file: "3"
prometheus:
build:
context: ../prometheus/build
dockerfile: Dockerfile
args:
ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133}
ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015}
USE_INTRANET: ${USE_INTRANET:-false}
image: argus-metric-prometheus:latest
container_name: argus-prometheus
restart: unless-stopped
@ -73,12 +59,6 @@ services:
max-file: "3"
grafana:
build:
context: ../grafana/build
dockerfile: Dockerfile
args:
ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133}
ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015}
image: argus-metric-grafana:latest
container_name: argus-grafana
restart: unless-stopped
@ -109,9 +89,6 @@ services:
max-file: "3"
test-node:
build:
context: ./client-test-node/build
dockerfile: Dockerfile
image: argus-metric-test-node:latest
container_name: argus-metric-test-node
hostname: test-metric-node-001
@ -143,9 +120,6 @@ services:
max-file: "3"
test-gpu-node:
build:
context: ./client-test-gpu-node/build
dockerfile: Dockerfile
image: argus-metric-test-gpu-node:latest
container_name: argus-metric-test-gpu-node
hostname: test-metric-gpu-node-001

View File

@ -3,15 +3,8 @@ set -e
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# 解析参数
REBUILD_FLAG=""
if [[ "$1" == "--rebuild" || "$1" == "-r" ]]; then
REBUILD_FLAG="--rebuild"
echo "[01] 启用强制重新构建模式"
fi
echo "[01] 启动所有服务..."
bash "$SCRIPT_DIR/common/start-all.sh" $REBUILD_FLAG
bash "$SCRIPT_DIR/common/start-all.sh"
echo "[01] 等待服务就绪..."
sleep 5

View File

@ -1,6 +1,9 @@
#!/bin/bash
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
COMMON_DIR="$SCRIPT_DIR/common"
FTP_SERVER="${FTP_SERVER:-172.30.0.40}"
FTP_USER="${FTP_USER:-ftpuser}"
FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}"
@ -8,26 +11,37 @@ FTP_PORT="${FTP_PORT:-21}"
FTP_HOST="${FTP_SERVER}"
echo "[03] 进入测试节点执行安装..."
echo "[03] 使用 FTP 地址: ${FTP_HOST}:${FTP_PORT}"
echo "[04] 检测GPU环境..."
# 检测GPU环境
if bash "$COMMON_DIR/check-gpu.sh"; then
echo "[04] GPU环境可用继续执行GPU节点安装"
GPU_AVAILABLE=true
else
echo "[04] GPU环境不可用跳过GPU节点安装"
GPU_AVAILABLE=false
exit 0
fi
echo "[04] 进入测试节点执行安装..."
echo "[04] 使用 FTP 地址: ${FTP_HOST}:${FTP_PORT}"
docker exec argus-metric-test-gpu-node bash -c "
set -e
if ! command -v curl &>/dev/null; then
echo '[03] curl 未安装,正在安装...'
echo '[04] curl 未安装,正在安装...'
apt-get update && apt-get install -y curl
fi
cd /tmp
echo '[03] 下载 setup.sh...'
echo '[04] 下载 setup.sh...'
curl -u ${FTP_USER}:${FTP_PASSWORD} ftp://${FTP_HOST}:${FTP_PORT}/setup.sh -o setup.sh
echo '[03] 执行安装...'
echo '[04] 执行安装...'
chmod +x setup.sh
bash setup.sh --server ${FTP_HOST} --user ${FTP_USER} --password '${FTP_PASSWORD}' --port ${FTP_PORT}
echo '[03] 安装完成'
echo '[04] 安装完成'
"
echo "[03] 完成"
echo "[04] 完成"

View File

@ -0,0 +1,59 @@
#!/bin/bash
# GPU环境检测脚本
# 检测系统是否有NVIDIA GPU硬件
set -e
# 检测函数
# Probe the host for an NVIDIA GPU using three checks, in order, and stop at
# the first one that succeeds:
#   1. any /dev/nvidia* entry can be listed
#   2. lspci is installed and its output mentions "nvidia" (case-insensitive)
#   3. nvidia-smi is installed and exits successfully
# Outputs:
#   Progress and result messages on stdout.
# Returns:
#   0 when any probe succeeds, 1 when all of them fail.
check_gpu_support() {
  echo "检测GPU环境..."

  # Probe 1: NVIDIA device files.
  if ls /dev/nvidia* >/dev/null 2>&1; then
    echo "✓ 检测到NVIDIA GPU设备文件"
    return 0
  fi

  # Probe 2: PCI device listing (only when lspci is available).
  if command -v lspci >/dev/null 2>&1 && lspci | grep -i nvidia >/dev/null 2>&1; then
    echo "✓ 检测到NVIDIA GPU硬件"
    return 0
  fi

  # Probe 3: nvidia-smi must both exist and run without error.
  if command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi >/dev/null 2>&1; then
    echo "✓ 检测到NVIDIA GPU硬件"
    return 0
  fi

  echo "✗ 未检测到NVIDIA GPU硬件"
  return 1
}
# 主函数
# Script entry point: print a banner, run check_gpu_support, print the verdict
# and terminate the process.
# Outputs:
#   Banner, the messages of check_gpu_support, and a result line on stdout.
# Exits:
#   0 when check_gpu_support succeeds, 1 otherwise (this function never returns).
main() {
  local banner="=========================================="
  echo "$banner"
  echo " GPU环境检测"
  echo "$banner"
  echo ""

  # Collapse any non-zero status of the probe into exit code 1.
  local status=0
  check_gpu_support || status=1

  echo ""
  if [ "$status" -eq 0 ]; then
    echo "结果: GPU环境可用"
  else
    echo "结果: GPU环境不可用将跳过GPU相关服务"
  fi
  exit "$status"
}
# 如果直接运行此脚本
if [ "${BASH_SOURCE[0]}" = "${0}" ]; then
main "$@"
fi

View File

@ -1,7 +1,8 @@
#!/bin/bash
# 一键启动脚本
# 用于初始化目录、构建镜像并启动所有服务
# 用于初始化目录并启动所有服务
# 镜像构建已移至 build/build_images.sh
set -e
@ -9,12 +10,6 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
cd "$TEST_DIR"
# 解析参数
FORCE_REBUILD=false
if [[ "$1" == "--rebuild" ]]; then
FORCE_REBUILD=true
fi
echo "=========================================="
echo " Argus Metrics 一键启动脚本"
echo "=========================================="
@ -37,26 +32,6 @@ echo "使用: docker compose"
echo "Compose 文件: $TEST_DIR/docker-compose.yml"
echo ""
# 检查必要的构建目录
echo "检查构建目录..."
BUILD_DIRS=(
"../ftp/build"
"../prometheus/build"
"../grafana/build"
"client-test-node/build"
"client-test-gpu-node/build"
)
for dir in "${BUILD_DIRS[@]}"; do
if [ ! -d "$dir" ]; then
echo "错误: 构建目录不存在: $dir"
echo "完整路径: $(cd "$(dirname "$dir")" 2>/dev/null && pwd)/$(basename "$dir")"
exit 1
else
echo " ✓ 找到: $dir"
fi
done
echo ""
# 检查并创建 .env 文件
if [ ! -f .env ]; then
@ -84,118 +59,65 @@ echo "1. 初始化目录结构..."
bash "$SCRIPT_DIR/init-directories.sh"
echo ""
echo "2. 准备 Docker 镜像..."
# 检查镜像是否存在
IMAGE_CACHE_DIR="$TEST_DIR/images-cache"
IMAGES=("argus-metric-ftp:latest" "argus-metric-prometheus:latest" "argus-metric-grafana:latest" "argus-metric-test-node:latest" "argus-metric-test-gpu-node:latest")
all_images_exist=true
for image in "${IMAGES[@]}"; do
if ! docker images --format "{{.Repository}}:{{.Tag}}" | grep -q "^${image}$"; then
all_images_exist=false
break
fi
done
if $FORCE_REBUILD; then
echo "强制重新构建镜像(--rebuild 模式)..."
cd "$TEST_DIR"
docker compose build --no-cache
echo "镜像重新构建完成"
elif $all_images_exist; then
echo "所有镜像已存在,跳过构建"
echo "2. 检测GPU环境..."
# 检测GPU环境
if bash "$SCRIPT_DIR/check-gpu.sh"; then
echo "GPU环境可用将启动GPU节点"
GPU_AVAILABLE=true
else
echo "检测到缺失镜像,尝试从缓存加载..."
# 尝试从缓存加载
loaded_from_cache=false
if [ -d "$IMAGE_CACHE_DIR" ]; then
for image in "${IMAGES[@]}"; do
if ! docker images --format "{{.Repository}}:{{.Tag}}" | grep -q "^${image}$"; then
# 镜像不存在,尝试加载
case "$image" in
"argus-metric-ftp:latest")
cache_file="${IMAGE_CACHE_DIR}/argus-ftp.tar"
;;
"argus-metric-prometheus:latest")
cache_file="${IMAGE_CACHE_DIR}/argus-prometheus.tar"
;;
"argus-metric-grafana:latest")
cache_file="${IMAGE_CACHE_DIR}/argus-grafana.tar"
;;
"argus-metric-test-node:latest")
cache_file="${IMAGE_CACHE_DIR}/argus-test-node.tar"
;;
"argus-metric-test-gpu-node:latest")
cache_file="${IMAGE_CACHE_DIR}/argus-test-gpu-node.tar"
;;
esac
if [ -f "$cache_file" ]; then
echo " 从缓存加载: $image"
docker load -i "$cache_file"
loaded_from_cache=true
fi
fi
done
fi
# 检查加载后是否还有缺失的镜像
need_build=false
for image in "${IMAGES[@]}"; do
if ! docker images --format "{{.Repository}}:{{.Tag}}" | grep -q "^${image}$"; then
need_build=true
break
fi
done
if $need_build; then
echo ""
echo "部分镜像缺失,开始构建..."
echo "工作目录: $(pwd)"
cd "$TEST_DIR"
docker compose build --no-cache
# 询问是否保存镜像
echo ""
read -p "是否保存镜像到缓存以便下次快速启动? (Y/n): " -n 1 -r
echo
if [[ ! $REPLY =~ ^[Nn]$ ]]; then
mkdir -p "$IMAGE_CACHE_DIR"
echo "保存镜像到缓存..."
for image in "${IMAGES[@]}"; do
case "$image" in
"argus-metric-ftp:latest")
docker save -o "${IMAGE_CACHE_DIR}/argus-ftp.tar" "$image" && echo " 已保存: argus-ftp.tar"
;;
"argus-metric-prometheus:latest")
docker save -o "${IMAGE_CACHE_DIR}/argus-prometheus.tar" "$image" && echo " 已保存: argus-prometheus.tar"
;;
"argus-metric-grafana:latest")
docker save -o "${IMAGE_CACHE_DIR}/argus-grafana.tar" "$image" && echo " 已保存: argus-grafana.tar"
;;
"argus-metric-test-node:latest")
docker save -o "${IMAGE_CACHE_DIR}/argus-test-node.tar" "$image" && echo " 已保存: argus-test-node.tar"
;;
"argus-metric-test-gpu-node:latest")
docker save -o "${IMAGE_CACHE_DIR}/argus-test-gpu-node.tar" "$image" && echo " 已保存: argus-test-gpu-node.tar"
;;
esac
done
echo "镜像已保存到: $IMAGE_CACHE_DIR/"
fi
elif $loaded_from_cache; then
echo ""
echo "所有镜像已从缓存加载完成!"
fi
echo "GPU环境不可用跳过GPU节点"
GPU_AVAILABLE=false
fi
echo ""
echo "3. 启动基础服务..."
echo "3. 检查 Docker 镜像..."
# 检查必要的镜像是否存在
BASE_IMAGES=("argus-metric-ftp:latest" "argus-metric-prometheus:latest" "argus-metric-grafana:latest" "argus-metric-test-node:latest")
GPU_IMAGES=("argus-metric-test-gpu-node:latest")
# 先检查基础镜像
missing_images=()
for image in "${BASE_IMAGES[@]}"; do
if ! docker images --format "{{.Repository}}:{{.Tag}}" | grep -q "^${image}$"; then
missing_images+=("$image")
fi
done
# 检查GPU镜像如果GPU环境可用
if [ "$GPU_AVAILABLE" = true ]; then
for image in "${GPU_IMAGES[@]}"; do
if ! docker images --format "{{.Repository}}:{{.Tag}}" | grep -q "^${image}$"; then
missing_images+=("$image")
fi
done
fi
if [ ${#missing_images[@]} -gt 0 ]; then
echo "以下镜像缺失,请先运行 build/build_images.sh 构建镜像:"
for image in "${missing_images[@]}"; do
echo "$image"
done
echo ""
echo "构建命令:"
echo " ./build/build_images.sh --metric"
exit 1
else
echo "所有必要镜像已存在"
fi
echo ""
echo "4. 启动基础服务..."
cd "$TEST_DIR"
# 启动除GPU节点外的所有服务
docker compose up -d ftp prometheus grafana test-node test-gpu-node
# 根据GPU环境决定启动的服务
if [ "$GPU_AVAILABLE" = true ]; then
echo "启动所有服务包括GPU节点..."
docker compose up -d ftp prometheus grafana test-node test-gpu-node
else
echo "启动基础服务跳过GPU节点..."
docker compose up -d ftp prometheus grafana test-node
fi
echo ""
echo "4. 等待服务启动..."

View File

@ -0,0 +1,36 @@
# Test-node image on Ubuntu 22.04 with the shared libraries Fluent Bit may
# need at runtime pre-installed. No entrypoint is defined here.
# NOTE(review): presumably this is the image tagged argus-sys-node — confirm.
FROM ubuntu:22.04
# Non-interactive apt front end and container timezone.
ENV DEBIAN_FRONTEND=noninteractive \
TZ=Asia/Shanghai
# Build arguments: intranet mirror switch and the build UID/GID.
ARG USE_INTRANET=false
ARG ARGUS_BUILD_UID=2133
ARG ARGUS_BUILD_GID=2015
# Re-export the build UID/GID as environment variables of the image.
ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
# Optional: switch to intranet apt mirrors during build.
# When USE_INTRANET=true this backs up sources.list, replaces it with a single
# trusted intranet mirror entry, and turns off HTTPS peer/host verification
# for apt. With any other value the step does nothing.
RUN if [ "$USE_INTRANET" = "true" ]; then \
echo "Configuring intranet apt sources..." && \
cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \
echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \
fi
# Install base tools and all libs that Fluent Bit may require at runtime
# so that start-fluent-bit.sh will NOT fallback to apt during container start.
# The apt package lists are removed afterwards to keep the layer small.
RUN set -eux; \
apt-get update; \
apt-get install -y --no-install-recommends \
ca-certificates tzdata \
procps iproute2 net-tools lsof \
libpq5 libyaml-0-2 libsasl2-2 libldap-2.5-0; \
rm -rf /var/lib/apt/lists/*
# Keep root; compose provides entrypoint via bind mount
USER root
# Default command only keeps the container alive.
CMD ["bash", "-lc", "sleep infinity"]

View File

@ -0,0 +1,34 @@
# GPU test-node image based on the NVIDIA CUDA 12.2.2 runtime (Ubuntu 22.04),
# with curl and basic diagnostics pre-installed.
FROM nvidia/cuda:12.2.2-runtime-ubuntu22.04
# Non-interactive apt, container timezone, and the NVIDIA container-runtime
# variables (all devices visible; compute and utility capabilities).
ENV DEBIAN_FRONTEND=noninteractive \
TZ=Asia/Shanghai \
NVIDIA_VISIBLE_DEVICES=all \
NVIDIA_DRIVER_CAPABILITIES=compute,utility
# Build arguments: intranet mirror switch and the build UID/GID.
ARG USE_INTRANET=false
ARG ARGUS_BUILD_UID=2133
ARG ARGUS_BUILD_GID=2015
# Re-export the build UID/GID as environment variables of the image.
ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
# Optional intranet mirror for build-time apt.
# When USE_INTRANET=true this backs up sources.list, replaces it with a single
# trusted intranet mirror entry, and turns off HTTPS peer/host verification
# for apt. With any other value the step does nothing.
RUN if [ "$USE_INTRANET" = "true" ]; then \
echo "Configuring intranet apt sources..." && \
cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \
echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \
fi
# Pre-install curl and diagnostics to avoid runtime apt installs in GPU test node.
# The apt package lists are removed afterwards to keep the layer small.
RUN set -eux; \
apt-get update; \
apt-get install -y --no-install-recommends \
curl ca-certificates tzdata \
procps iproute2 net-tools lsof; \
rm -rf /var/lib/apt/lists/*
# Run as root; the default command only keeps the container alive.
USER root
CMD ["bash", "-lc", "sleep infinity"]

View File

@ -0,0 +1,32 @@
# CPU test-node image on Ubuntu 22.04 with curl and basic diagnostics
# pre-installed.
FROM ubuntu:22.04
# Non-interactive apt front end and container timezone.
ENV DEBIAN_FRONTEND=noninteractive \
TZ=Asia/Shanghai
# Build arguments: intranet mirror switch and the build UID/GID.
ARG USE_INTRANET=false
ARG ARGUS_BUILD_UID=2133
ARG ARGUS_BUILD_GID=2015
# Re-export the build UID/GID as environment variables of the image.
ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
# Optional intranet mirror for build-time apt.
# When USE_INTRANET=true this backs up sources.list, replaces it with a single
# trusted intranet mirror entry, and turns off HTTPS peer/host verification
# for apt. With any other value the step does nothing.
RUN if [ "$USE_INTRANET" = "true" ]; then \
echo "Configuring intranet apt sources..." && \
cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \
echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \
fi
# Pre-install curl and common diagnostics to avoid runtime apt installs.
# The apt package lists are removed afterwards to keep the layer small.
RUN set -eux; \
apt-get update; \
apt-get install -y --no-install-recommends \
curl ca-certificates tzdata \
procps iproute2 net-tools lsof; \
rm -rf /var/lib/apt/lists/*
# Run as root; the default command only keeps the container alive.
USER root
CMD ["bash", "-lc", "sleep infinity"]

View File

@ -32,7 +32,7 @@
- 一键执行
- `cd src/sys/tests`
- `./scripts/00_e2e_test.sh`
- `./scripts/00_e2e_test.sh`(CPU-only);`./scripts/00_e2e_test.sh --enable-gpu`(启用 GPU 流程)
- 分步执行(推荐用于排查)
- `./scripts/01_bootstrap.sh` 生成目录/拷贝 `update-dns.sh`/构建 agent 二进制/写 `.env`
@ -42,7 +42,12 @@
- `./scripts/05_agent_register.sh` 获取两个节点的 `node_id` 与初始 IP检查本地 `node.json`
- `./scripts/06_write_health_and_assert.sh` 写健康文件并断言 `nodes.json` 仅包含 2 个在线节点
- `./scripts/07_logs_send_and_assert.sh` 向两个节点写日志,断言 ES `train-*`/`infer-*` 计数增长
- `./scripts/08_restart_agent_reregister.sh` `node-b` 改为固定 IP `172.29.0.200`,验证保持同一节点 ID 且 IP/时间戳更新
- `./scripts/08_restart_agent_reregister.sh` `node-b` 改为固定 IP `172.31.0.200`,验证保持同一节点 ID 且 IP/时间戳更新
- `./scripts/10_metric_publish.sh` 发布 metric 客户端包到 FTP
- `./scripts/11_metric_node_install.sh` 在 CPU 节点安装并验证端点
- `./scripts/12_metric_gpu_install.sh` 在 GPU 节点安装并等待 9100/9400 就绪(仅启用 GPU 时)
- `./scripts/13_metric_verify.sh` 对 master/Prometheus/数据面/Grafana 做综合校验(含 GPU 时校验 dcgm 指标)
- `./scripts/14_metric_cleanup.sh` 清理 FTP 产物
- `./scripts/09_down.sh` 回收容器、网络并清理 `private*/`、`tmp/`
- 重置环境
@ -53,16 +58,17 @@
## 二、测试部署架构docker-compose
- 网络
- 自定义 bridge`argus-sys-net`,子网 `172.29.0.0/16`
- 固定地址bind=`172.29.0.2`master=`172.29.0.10`
- 自定义 bridge`argus-sys-net`,子网 `172.31.0.0/16`
- 固定地址bind=`172.31.0.2`master=`172.31.0.10`
- 服务与端口
- 服务与端口(宿主机映射端口由 `01_bootstrap.sh` 自动分配并写入 `.env`
- 关键变量:`MASTER_PORT`、`ES_HTTP_PORT`、`KIBANA_PORT`、`NODE_A_PORT`、`NODE_B_PORT`、`PROMETHEUS_PORT`、`GRAFANA_PORT`、`ALERTMANAGER_PORT`、`WEB_PROXY_PORT_8080..8085`、`FTP_PORT`、`FTP_DATA_PORT`、`FTP_PASSIVE_HOST_RANGE`
- `bind``argus-bind9:latest`):监听 53/tcp+udp负责同步 `*.argus.com` 记录
- `master``argus-master:latest`):对外 `32300→3000`API `http://localhost:32300`
- `es``argus-elasticsearch:latest``9200→9200`;单节点,无安全
- `kibana``argus-kibana:latest``5601→5601`;通过 `ELASTICSEARCH_HOSTS=http://es:9200` 访问 ES
- `node-a``ubuntu:22.04`):同时运行 Fluent Bit + argus-agent`hostname=dev-yyrshare-nbnyx10-cp2f-pod-0``2020→2020`
- `node-b``ubuntu:22.04`):同时运行 Fluent Bit + argus-agent`hostname=dev-yyrshare-uuuu10-ep2f-pod-0``2021→2020`
- `master``argus-master:latest`):对外 `${MASTER_PORT}→3000`API `http://localhost:${MASTER_PORT}`
- `es``argus-elasticsearch:latest``${ES_HTTP_PORT}→9200`;单节点,无安全
- `kibana``argus-kibana:latest``${KIBANA_PORT}→5601`;通过 `ELASTICSEARCH_HOSTS=http://es:9200` 访问 ES
- `node-a``ubuntu:22.04`):同时运行 Fluent Bit + argus-agent`hostname=dev-yyrshare-nbnyx10-cp2f-pod-0``${NODE_A_PORT}→2020`
- `node-b``ubuntu:22.04`):同时运行 Fluent Bit + argus-agent`hostname=dev-yyrshare-uuuu10-ep2f-pod-0``${NODE_B_PORT}→2020`
- 卷与目录
- 核心服务bind/master/es/kibana共享宿主 `./private` 挂载到容器 `/private`
@ -72,7 +78,7 @@
- 节点容器的 Fluent Bit/agent 资产以只读方式挂载到 `/assets`/`/usr/local/bin/argus-agent`
- DNS 配置
- 节点容器通过 compose 配置 `dns: [172.29.0.2]` 指向 bind不挂载 `/etc/resolv.conf`,也不依赖 `update-dns.sh`
- 节点容器通过 compose 配置 `dns: [172.31.0.2]` 指向 bind不挂载 `/etc/resolv.conf`,也不依赖 `update-dns.sh`
- master/es/kibana 仍共享 `./private`master 启动会写 `/private/argus/etc/master.argus.com` 供 bind 同步 A 记录
- 节点入口
@ -106,6 +112,7 @@
- 判定:
- `private/argus/etc/master.argus.com` 存在且为 master IP
- 在 node-a/node-b 内 `getent hosts master.argus.com` 成功解析到 master IP
- 在 metric CPU/GPU 节点内可解析 `master.argus.com` 与 `prom.metric.argus.com`
- `05_agent_register.sh`
- 目的:确认两个节点注册到 master 并持久化 `node.json`
@ -136,3 +143,16 @@
---
如需更严格的断言(例如 Kibana 载入具体插件、ES 文档字段校验),可在 `07_*.sh` 中追加查询与校验逻辑。
---
## 可选GPU 流程说明
- 前置条件:宿主安装 NVIDIA 驱动与 `nvidia-container-toolkit`,`nvidia-smi` 在宿主可用。
- 启用方式:
- 一键:`./scripts/00_e2e_test.sh --enable-gpu`
- 分步:设置 `ARGUS_SYS_ENABLE_GPU=true` 后执行 `01_bootstrap.sh`、`02_up.sh`;或直接在 `.env` 中将 `ENABLE_GPU=true` 后单独运行 `02_up.sh`
- `01_bootstrap.sh` 会写入:
- `METRIC_TEST_HOSTNAME_GPU=test-metric-gpu-node-001`
- `METRIC_TEST_INSTANCE_GPU=172.31.0.51:9100`
- `METRIC_TEST_DCGM_GPU=172.31.0.51:9400`
- 验证点:`04_verify_dns_routing.sh` 增加对 metric 节点的域名解析;`12_metric_gpu_install.sh` 等待 9100/9400;`13_metric_verify_*` 校验 dcgm 指标与 Grafana 面板。

View File

@ -1,21 +1,18 @@
version: "3.8"
networks:
default:
name: argus-sys-net
sysnet:
driver: bridge
ipam:
driver: default
config:
- subnet: 172.29.0.0/16
- subnet: 172.31.0.0/16
services:
bind:
image: ${BIND_IMAGE_TAG:-argus-bind9:latest}
container_name: argus-bind-sys
networks:
default:
ipv4_address: 172.29.0.2
sysnet:
ipv4_address: 172.31.0.2
volumes:
- ./private:/private
restart: unless-stopped
@ -32,14 +29,14 @@ services:
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
ports:
- "32300:3000"
- "${MASTER_PORT:-32300}:3000"
volumes:
- ./private/argus/master:/private/argus/master
- ./private/argus/metric/prometheus:/private/argus/metric/prometheus
- ./private/argus/etc:/private/argus/etc
networks:
default:
ipv4_address: 172.29.0.10
sysnet:
ipv4_address: 172.31.0.10
restart: unless-stopped
es:
@ -55,8 +52,11 @@ services:
- ./private/argus/log/elasticsearch:/private/argus/log/elasticsearch
- ./private/argus/etc:/private/argus/etc
ports:
- "9200:9200"
- "${ES_HTTP_PORT:-9200}:9200"
restart: unless-stopped
networks:
sysnet:
ipv4_address: 172.31.0.3
kibana:
image: argus-kibana:latest
@ -71,11 +71,14 @@ services:
depends_on:
- es
ports:
- "5601:5601"
- "${KIBANA_PORT:-5601}:5601"
restart: unless-stopped
networks:
sysnet:
ipv4_address: 172.31.0.4
node-a:
image: ubuntu:22.04
image: argus-sys-node:latest
container_name: argus-node-a
hostname: dev-yyrshare-nbnyx10-cp2f-pod-0
depends_on:
@ -101,13 +104,16 @@ services:
entrypoint:
- /usr/local/bin/node-entrypoint.sh
dns:
- 172.29.0.2
- 172.31.0.2 # internal bind for *.argus.com
- 8.8.8.8 # external fallback for apt/external domains
ports:
- "2020:2020"
- "${NODE_A_PORT:-2020}:2020"
restart: unless-stopped
networks:
- sysnet
node-b:
image: ubuntu:22.04
image: argus-sys-node:latest
container_name: argus-node-b
hostname: dev-yyrshare-uuuu10-ep2f-pod-0
depends_on:
@ -133,7 +139,269 @@ services:
entrypoint:
- /usr/local/bin/node-entrypoint.sh
dns:
- 172.29.0.2
- 172.31.0.2
- 8.8.8.8
ports:
- "2021:2020"
- "${NODE_B_PORT:-2021}:2020"
restart: unless-stopped
networks:
- sysnet
ftp:
image: argus-metric-ftp:latest
container_name: argus-ftp
restart: unless-stopped
environment:
- TZ=Asia/Shanghai
- FTP_BASE_PATH=/private/argus/ftp
- FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!}
- DOMAIN=${FTP_DOMAIN:-ftp.metric.argus.com}
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
ports:
- "${FTP_PORT:-21}:21"
- "${FTP_DATA_PORT:-20}:20"
- "${FTP_PASSIVE_HOST_RANGE:-21100-21110}:21100-21110"
volumes:
- ./private/argus/metric/ftp:/private/argus/ftp
- ./private/argus/etc:/private/argus/etc
- /etc/localtime:/etc/localtime:ro
- /etc/timezone:/etc/timezone:ro
networks:
sysnet:
ipv4_address: 172.31.0.40
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"
prometheus:
image: argus-metric-prometheus:latest
container_name: argus-prometheus
restart: unless-stopped
environment:
- TZ=Asia/Shanghai
- PROMETHEUS_BASE_PATH=/private/argus/metric/prometheus
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
ports:
- "${PROMETHEUS_PORT:-9090}:9090"
volumes:
- ./private/argus/metric/prometheus:/private/argus/metric/prometheus
- ./private/argus/etc:/private/argus/etc
- /etc/localtime:/etc/localtime:ro
- /etc/timezone:/etc/timezone:ro
networks:
sysnet:
ipv4_address: 172.31.0.41
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"
grafana:
image: argus-metric-grafana:latest
container_name: argus-grafana
restart: unless-stopped
environment:
- TZ=Asia/Shanghai
- GRAFANA_BASE_PATH=/private/argus/metric/grafana
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
- GF_SERVER_HTTP_PORT=3000
- GF_LOG_LEVEL=warn
- GF_LOG_MODE=console
- GF_PATHS_PROVISIONING=/private/argus/metric/grafana/provisioning
- GF_AUTH_ANONYMOUS_ENABLED=true
- GF_AUTH_ANONYMOUS_ORG_ROLE=Viewer
ports:
- "${GRAFANA_PORT:-3000}:3000"
volumes:
- ./private/argus/metric/grafana:/private/argus/metric/grafana
- ./private/argus/etc:/private/argus/etc
- /etc/localtime:/etc/localtime:ro
- /etc/timezone:/etc/timezone:ro
networks:
sysnet:
ipv4_address: 172.31.0.42
depends_on:
- prometheus
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"
# --- Added: Web Frontend (no host port; resolved by DNS as web.argus.com) ---
web-frontend:
image: argus-web-frontend:latest
container_name: argus-web-frontend
environment:
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
# Frontend runtime-injected external ports (used to render hyperlinks)
- EXTERNAL_MASTER_PORT=${WEB_PROXY_PORT_8085:-8085}
- EXTERNAL_ALERTMANAGER_PORT=${WEB_PROXY_PORT_8084:-8084}
- EXTERNAL_GRAFANA_PORT=${WEB_PROXY_PORT_8081:-8081}
- EXTERNAL_PROMETHEUS_PORT=${WEB_PROXY_PORT_8082:-8082}
- EXTERNAL_KIBANA_PORT=${WEB_PROXY_PORT_8083:-8083}
volumes:
- ./private/argus/etc:/private/argus/etc
networks:
sysnet:
ipv4_address: 172.31.0.80
restart: unless-stopped
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"
test-node:
image: argus-sys-metric-test-node:latest
container_name: argus-metric-test-node
hostname: test-metric-node-001
restart: unless-stopped
privileged: true
depends_on:
- ftp
- prometheus
environment:
- TZ=Asia/Shanghai
- DEBIAN_FRONTEND=noninteractive
- FTP_DOMAIN=${FTP_DOMAIN:-ftp.metric.argus.com}
- FTP_SERVER=${FTP_SERVER:-172.31.0.40}
- FTP_USER=${FTP_USER:-ftpuser}
- FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!}
- FTP_PORT=${FTP_PORT:-21}
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
- METRIC_NODE_ROLE=cpu
volumes:
- ./private/argus/agent:/private/argus/agent
- ./scripts/metric/test-node-entrypoint.sh:/usr/local/bin/metric-test-node-entrypoint.sh:ro
- /etc/localtime:/etc/localtime:ro
- /etc/timezone:/etc/timezone:ro
entrypoint:
- /usr/local/bin/metric-test-node-entrypoint.sh
command:
- sleep
- infinity
dns:
- 172.31.0.2
- 8.8.8.8
networks:
sysnet:
ipv4_address: 172.31.0.50
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"
test-gpu-node:
profiles: ["gpu"]
image: argus-sys-metric-test-gpu-node:latest
container_name: argus-metric-test-gpu-node
hostname: test-metric-gpu-node-001
restart: unless-stopped
privileged: true
runtime: nvidia
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities:
- gpu
depends_on:
- ftp
- prometheus
environment:
- TZ=Asia/Shanghai
- DEBIAN_FRONTEND=noninteractive
- NVIDIA_VISIBLE_DEVICES=all
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
- GPU_MODE=gpu
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
- METRIC_NODE_ROLE=gpu
volumes:
- ./private/argus/agent:/private/argus/agent
- ./scripts/metric/test-node-entrypoint.sh:/usr/local/bin/metric-test-node-entrypoint.sh:ro
- /etc/localtime:/etc/localtime:ro
- /etc/timezone:/etc/timezone:ro
entrypoint:
- /usr/local/bin/metric-test-node-entrypoint.sh
command:
- sleep
- infinity
dns:
- 172.31.0.2
- 8.8.8.8
networks:
sysnet:
ipv4_address: 172.31.0.51
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"
# --- Added: Alertmanager ---
alertmanager:
image: argus-alertmanager:latest
container_name: argus-alertmanager
environment:
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
volumes:
- ./private/argus/etc:/private/argus/etc
- ./private/argus/alert/alertmanager:/private/argus/alert/alertmanager
networks:
sysnet:
ipv4_address: 172.31.0.82
ports:
- "${ALERTMANAGER_PORT:-9093}:9093"
restart: unless-stopped
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"
# --- Added: Web Proxy (multi-port gateway) ---
web-proxy:
image: argus-web-proxy:latest
container_name: argus-web-proxy
depends_on:
- bind
- master
- grafana
- prometheus
- kibana
- alertmanager
environment:
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
volumes:
- ./private/argus/etc:/private/argus/etc
networks:
sysnet:
ipv4_address: 172.31.0.81
ports:
- "${WEB_PROXY_PORT_8080:-8080}:8080"
- "${WEB_PROXY_PORT_8081:-8081}:8081"
- "${WEB_PROXY_PORT_8082:-8082}:8082"
- "${WEB_PROXY_PORT_8083:-8083}:8083"
- "${WEB_PROXY_PORT_8084:-8084}:8084"
- "${WEB_PROXY_PORT_8085:-8085}:8085"
restart: unless-stopped
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"

View File

@ -3,6 +3,45 @@ set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ENABLE_GPU=false
CLEANUP=true
# Print the command-line help of 00_e2e_test.sh to stdout.
# The heredoc delimiter is quoted, so the text is emitted literally
# (no variable or command expansion).
# Fix: the --no-clean description was missing its closing parenthesis.
usage() {
cat <<'EOF'
Usage: 00_e2e_test.sh [options]
Options:
--enable-gpu 启用 GPU 相关拓扑与测试流程
--no-clean 跳过清理流程(不执行 14 和 09)
-h, --help 显示帮助信息
EOF
}
while [[ $# -gt 0 ]]; do
case "$1" in
--enable-gpu)
ENABLE_GPU=true
shift
;;
--no-clean)
CLEANUP=false
shift
;;
-h|--help)
usage
exit 0
;;
*)
echo "Unknown argument: $1" >&2
usage
exit 1
;;
esac
done
export ARGUS_SYS_ENABLE_GPU=$ENABLE_GPU
# 基础步骤(不包含清理与下线)
SCRIPTS=(
"01_bootstrap.sh"
"02_up.sh"
@ -12,9 +51,20 @@ SCRIPTS=(
"06_write_health_and_assert.sh"
"07_logs_send_and_assert.sh"
"08_restart_agent_reregister.sh"
"09_down.sh"
"10_metric_publish.sh"
"11_metric_node_install.sh"
"12_metric_gpu_install.sh"
"13_metric_verify.sh"
)
# 如未禁用清理,则追加清理与下线步骤(保持原有顺序)
if [[ "$CLEANUP" == "true" ]]; then
SCRIPTS+=(
"14_metric_cleanup.sh"
"09_down.sh"
)
fi
for script in "${SCRIPTS[@]}"; do
echo "[SYS-E2E] Running $script"
"$SCRIPT_DIR/$script"
@ -22,5 +72,8 @@ for script in "${SCRIPTS[@]}"; do
echo
done
echo "[SYS-E2E] All tests completed"
if [[ "$CLEANUP" == "true" ]]; then
echo "[SYS-E2E] All tests completed"
else
echo "[SYS-E2E] All tests completed (cleanup skipped)"
fi

Some files were not shown because too many files have changed in this diff Show More