完成a6000测试系统构建、部署、测试整合 #35
1
.gitattributes
vendored
Normal file
1
.gitattributes
vendored
Normal file
@ -0,0 +1 @@
|
||||
src/metric/client-plugins/all-in-one-full/plugins/*/bin/* filter=lfs diff=lfs merge=lfs -text
|
||||
@ -10,20 +10,28 @@ Usage: $0 [OPTIONS]
|
||||
Options:
|
||||
--intranet Use intranet mirror for log/bind builds
|
||||
--master-offline Build master offline image (requires src/master/offline_wheels.tar.gz)
|
||||
--metric Build metric module images (ftp, prometheus, grafana, test nodes)
|
||||
--no-cache Build all images without using Docker layer cache
|
||||
--only LIST Comma-separated targets to build: core,master,metric,web,alert,sys,all
|
||||
-h, --help Show this help message
|
||||
|
||||
Examples:
|
||||
$0 # Build with default sources
|
||||
$0 --intranet # Build with intranet mirror
|
||||
$0 --master-offline # Additionally build argus-master:offline
|
||||
$0 --intranet --master-offline
|
||||
$0 --metric # Additionally build metric module images
|
||||
$0 --intranet --master-offline --metric
|
||||
EOF
|
||||
}
|
||||
|
||||
use_intranet=false
|
||||
build_core=true
|
||||
build_master=true
|
||||
build_master_offline=false
|
||||
build_metric=true
|
||||
build_web=true
|
||||
build_alert=true
|
||||
build_sys=true
|
||||
no_cache=false
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
@ -41,10 +49,35 @@ while [[ $# -gt 0 ]]; do
|
||||
build_master_offline=true
|
||||
shift
|
||||
;;
|
||||
--metric)
|
||||
build_metric=true
|
||||
shift
|
||||
;;
|
||||
--no-cache)
|
||||
no_cache=true
|
||||
shift
|
||||
;;
|
||||
--only)
|
||||
if [[ -z ${2:-} ]]; then
|
||||
echo "--only requires a target list" >&2; exit 1
|
||||
fi
|
||||
sel="$2"; shift 2
|
||||
# reset all, then enable selected
|
||||
build_core=false; build_master=false; build_metric=false; build_web=false; build_alert=false; build_sys=false
|
||||
IFS=',' read -ra parts <<< "$sel"
|
||||
for p in "${parts[@]}"; do
|
||||
case "$p" in
|
||||
core) build_core=true ;;
|
||||
master) build_master=true ;;
|
||||
metric) build_metric=true ;;
|
||||
web) build_web=true ;;
|
||||
alert) build_alert=true ;;
|
||||
sys) build_sys=true ;;
|
||||
all) build_core=true; build_master=true; build_metric=true; build_web=true; build_alert=true; build_sys=true ;;
|
||||
*) echo "Unknown --only target: $p" >&2; exit 1 ;;
|
||||
esac
|
||||
done
|
||||
;;
|
||||
-h|--help)
|
||||
show_help
|
||||
exit 0
|
||||
@ -115,14 +148,22 @@ build_image() {
|
||||
local image_name=$1
|
||||
local dockerfile_path=$2
|
||||
local tag=$3
|
||||
local context="."
|
||||
shift 3
|
||||
|
||||
if [[ $# -gt 0 ]]; then
|
||||
context=$1
|
||||
shift
|
||||
fi
|
||||
|
||||
local extra_args=("$@")
|
||||
|
||||
echo "🔄 Building $image_name image..."
|
||||
echo " Dockerfile: $dockerfile_path"
|
||||
echo " Tag: $tag"
|
||||
echo " Context: $context"
|
||||
|
||||
if docker build "${build_args[@]}" "${extra_args[@]}" -f "$dockerfile_path" -t "$tag" .; then
|
||||
if docker build "${build_args[@]}" "${extra_args[@]}" -f "$dockerfile_path" -t "$tag" "$context"; then
|
||||
echo "✅ $image_name image built successfully"
|
||||
return 0
|
||||
else
|
||||
@ -131,29 +172,59 @@ build_image() {
|
||||
fi
|
||||
}
|
||||
|
||||
#######################################
# Ensure a Docker base image is available locally, pulling it with retries.
# Arguments:
#   $1 - image reference to check/pull (e.g. "ubuntu:22.04")
#   $2 - maximum number of pull attempts (default: 3)
#   $3 - seconds to sleep between failed attempts (default: 5)
# Outputs:
#   Progress messages on stdout.
# Returns:
#   0 if the image is already present or was pulled, 1 if every attempt failed.
#######################################
pull_base_image() {
  local image_ref=$1
  local attempts=${2:-3}
  local delay=${3:-5}
  # Keep the loop counter local so a caller's own "i" is never clobbered.
  local i

  # If the image already exists locally, skip pulling.
  if docker image inspect "$image_ref" >/dev/null 2>&1; then
    echo " Local image present; skip pull: $image_ref"
    return 0
  fi

  for ((i = 1; i <= attempts; i++)); do
    echo " Pulling base image ($i/$attempts): $image_ref"
    if docker pull "$image_ref" >/dev/null; then
      echo " Base image ready: $image_ref"
      return 0
    fi
    echo " Pull failed: $image_ref"
    # No point in sleeping after the final attempt.
    if ((i < attempts)); then
      echo " Retrying in ${delay}s..."
      sleep "$delay"
    fi
  done

  echo "❌ Unable to pull base image after ${attempts} attempts: $image_ref"
  return 1
}
|
||||
|
||||
images_built=()
|
||||
build_failed=false
|
||||
|
||||
if build_image "Elasticsearch" "src/log/elasticsearch/build/Dockerfile" "argus-elasticsearch:latest"; then
|
||||
images_built+=("argus-elasticsearch:latest")
|
||||
else
|
||||
build_failed=true
|
||||
fi
|
||||
if [[ "$build_core" == true ]]; then
|
||||
if build_image "Elasticsearch" "src/log/elasticsearch/build/Dockerfile" "argus-elasticsearch:latest"; then
|
||||
images_built+=("argus-elasticsearch:latest")
|
||||
else
|
||||
build_failed=true
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo ""
|
||||
|
||||
if build_image "Kibana" "src/log/kibana/build/Dockerfile" "argus-kibana:latest"; then
|
||||
images_built+=("argus-kibana:latest")
|
||||
else
|
||||
build_failed=true
|
||||
fi
|
||||
if build_image "Kibana" "src/log/kibana/build/Dockerfile" "argus-kibana:latest"; then
|
||||
images_built+=("argus-kibana:latest")
|
||||
else
|
||||
build_failed=true
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo ""
|
||||
|
||||
if build_image "BIND9" "src/bind/build/Dockerfile" "argus-bind9:latest"; then
|
||||
images_built+=("argus-bind9:latest")
|
||||
else
|
||||
build_failed=true
|
||||
if build_image "BIND9" "src/bind/build/Dockerfile" "argus-bind9:latest"; then
|
||||
images_built+=("argus-bind9:latest")
|
||||
else
|
||||
build_failed=true
|
||||
fi
|
||||
fi
|
||||
|
||||
echo ""
|
||||
@ -184,6 +255,127 @@ if [[ "$build_master" == true ]]; then
|
||||
popd >/dev/null
|
||||
fi
|
||||
|
||||
# Metric module: make sure the base images are available, then build every
# metric image listed as "label|dockerfile|tag|context".
if [[ $build_metric == true ]]; then
  printf '\n%s\n' "Building Metric module images..."

  metric_base_images=("ubuntu:22.04" "ubuntu/prometheus:3-24.04_stable" "grafana/grafana:11.1.0")
  for base_image in "${metric_base_images[@]}"; do
    # A failed pull only marks the run as failed; the builds are still attempted.
    pull_base_image "$base_image" || build_failed=true
  done

  metric_builds=(
    "Metric FTP|src/metric/ftp/build/Dockerfile|argus-metric-ftp:latest|src/metric/ftp/build"
    "Metric Prometheus|src/metric/prometheus/build/Dockerfile|argus-metric-prometheus:latest|src/metric/prometheus/build"
    "Metric Grafana|src/metric/grafana/build/Dockerfile|argus-metric-grafana:latest|src/metric/grafana/build"
  )
  for build_spec in "${metric_builds[@]}"; do
    # Split the spec into its four '|'-separated fields.
    IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec"
    if ! build_image "$image_label" "$dockerfile_path" "$image_tag" "$build_context"; then
      build_failed=true
    else
      images_built+=("$image_tag")
    fi
    echo ""
  done
fi
|
||||
|
||||
# =======================================
|
||||
# Sys (system tests) node images
|
||||
# =======================================
|
||||
|
||||
# Sys (system test) nodes: make sure the base images are available, then build
# every node image listed as "label|dockerfile|tag|context".
if [[ $build_sys == true ]]; then
  printf '\n%s\n' "Building Sys node images..."

  sys_base_images=("ubuntu:22.04" "nvidia/cuda:12.2.2-runtime-ubuntu22.04")
  for base_image in "${sys_base_images[@]}"; do
    # A failed pull only marks the run as failed; the builds are still attempted.
    pull_base_image "$base_image" || build_failed=true
  done

  sys_builds=(
    "Sys Node|src/sys/build/node/Dockerfile|argus-sys-node:latest|."
    "Sys Metric Test Node|src/sys/build/test-node/Dockerfile|argus-sys-metric-test-node:latest|."
    "Sys Metric Test GPU Node|src/sys/build/test-gpu-node/Dockerfile|argus-sys-metric-test-gpu-node:latest|."
  )
  for build_spec in "${sys_builds[@]}"; do
    # Split the spec into its four '|'-separated fields.
    IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec"
    if ! build_image "$image_label" "$dockerfile_path" "$image_tag" "$build_context"; then
      build_failed=true
    else
      images_built+=("$image_tag")
    fi
    echo ""
  done
fi
|
||||
|
||||
# =======================================
|
||||
# Web & Alert module images
|
||||
# =======================================
|
||||
|
||||
# Web and Alert modules share one pre-pull step; each module's images are then
# built only if that module was selected. Specs are "label|dockerfile|tag|context".
if [[ $build_web == true || $build_alert == true ]]; then
  printf '\n%s\n' "Building Web and Alert module images..."

  # Pre-pull commonly used base images for stability
  web_alert_base_images=("node:20" "ubuntu:24.04")
  for base_image in "${web_alert_base_images[@]}"; do
    # A failed pull only marks the run as failed; the builds are still attempted.
    pull_base_image "$base_image" || build_failed=true
  done

  if [[ $build_web == true ]]; then
    web_builds=(
      "Web Frontend|src/web/build_tools/frontend/Dockerfile|argus-web-frontend:latest|."
      "Web Proxy|src/web/build_tools/proxy/Dockerfile|argus-web-proxy:latest|."
    )
    for build_spec in "${web_builds[@]}"; do
      IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec"
      if ! build_image "$image_label" "$dockerfile_path" "$image_tag" "$build_context"; then
        build_failed=true
      else
        images_built+=("$image_tag")
      fi
      echo ""
    done
  fi

  if [[ $build_alert == true ]]; then
    alert_builds=(
      "Alertmanager|src/alert/alertmanager/build/Dockerfile|argus-alertmanager:latest|."
    )
    for build_spec in "${alert_builds[@]}"; do
      IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec"
      if ! build_image "$image_label" "$dockerfile_path" "$image_tag" "$build_context"; then
        build_failed=true
      else
        images_built+=("$image_tag")
      fi
      echo ""
    done
  fi
fi
|
||||
|
||||
echo "======================================="
|
||||
echo "📦 Build Summary"
|
||||
echo "======================================="
|
||||
@ -210,7 +402,6 @@ if [[ "$build_master_offline" == true ]]; then
|
||||
echo ""
|
||||
echo "🧳 Master offline wheels 已解压到 $master_offline_dir"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "🚀 Next steps:"
|
||||
echo " ./build/save_images.sh --compress # 导出镜像"
|
||||
|
||||
@ -68,6 +68,12 @@ declare -A images=(
|
||||
["argus-kibana:latest"]="argus-kibana-latest.tar"
|
||||
["argus-bind9:latest"]="argus-bind9-latest.tar"
|
||||
["argus-master:offline"]="argus-master-offline.tar"
|
||||
["argus-metric-ftp:latest"]="argus-metric-ftp-latest.tar"
|
||||
["argus-metric-prometheus:latest"]="argus-metric-prometheus-latest.tar"
|
||||
["argus-metric-grafana:latest"]="argus-metric-grafana-latest.tar"
|
||||
["argus-web-frontend:latest"]="argus-web-frontend-latest.tar"
|
||||
["argus-web-proxy:latest"]="argus-web-proxy-latest.tar"
|
||||
["argus-alertmanager:latest"]="argus-alertmanager-latest.tar"
|
||||
)
|
||||
|
||||
# 函数:检查镜像是否存在
|
||||
@ -220,4 +226,4 @@ fi
|
||||
|
||||
echo ""
|
||||
echo "✅ Image export completed successfully!"
|
||||
echo ""
|
||||
echo ""
|
||||
|
||||
@ -12,6 +12,8 @@ VENV_DIR="$BUILD_ROOT/venv"
|
||||
|
||||
AGENT_BUILD_IMAGE="${AGENT_BUILD_IMAGE:-python:3.11-slim-bullseye}"
|
||||
AGENT_BUILD_USE_DOCKER="${AGENT_BUILD_USE_DOCKER:-1}"
|
||||
# 默认在容器内忽略代理以避免公司内网代理在 Docker 网络不可达导致 pip 失败(可用 0 关闭)
|
||||
AGENT_BUILD_IGNORE_PROXY="${AGENT_BUILD_IGNORE_PROXY:-1}"
|
||||
USED_DOCKER=0
|
||||
|
||||
run_host_build() {
|
||||
@ -71,6 +73,7 @@ run_docker_build() {
|
||||
pass_env_if_set http_proxy
|
||||
pass_env_if_set https_proxy
|
||||
pass_env_if_set no_proxy
|
||||
pass_env_if_set AGENT_BUILD_IGNORE_PROXY
|
||||
|
||||
build_script=$(cat <<'INNER'
|
||||
set -euo pipefail
|
||||
@ -82,6 +85,10 @@ rm -rf build dist
|
||||
mkdir -p build/pyinstaller dist
|
||||
python3 -m venv --copies build/venv
|
||||
source build/venv/bin/activate
|
||||
# 若指定忽略代理,则清空常见代理与 pip 镜像环境变量,避免容器内代理不可达
|
||||
if [ "${AGENT_BUILD_IGNORE_PROXY:-1}" = "1" ]; then
|
||||
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY PIP_INDEX_URL PIP_EXTRA_INDEX_URL PIP_TRUSTED_HOST
|
||||
fi
|
||||
pip install --upgrade pip
|
||||
pip install .
|
||||
pip install pyinstaller==6.6.0
|
||||
|
||||
@ -9,21 +9,21 @@ RUN apt-get update && \
|
||||
apt-get install -y wget supervisor net-tools inetutils-ping vim ca-certificates passwd && \
|
||||
apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# 设置 Alertmanager 版本
|
||||
# 设置 Alertmanager 版本(与本地离线包保持一致)
|
||||
ARG ALERTMANAGER_VERSION=0.28.1
|
||||
|
||||
# 下载并解压 Alertmanager 二进制
|
||||
RUN wget https://github.com/prometheus/alertmanager/releases/download/v${ALERTMANAGER_VERSION}/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz && \
|
||||
tar xvf alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz && \
|
||||
mv alertmanager-${ALERTMANAGER_VERSION}.linux-amd64 /usr/local/alertmanager && \
|
||||
rm alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz
|
||||
# 使用仓库内预置的离线包构建(无需联网)
|
||||
COPY src/alert/alertmanager/build/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz /tmp/
|
||||
RUN tar xvf /tmp/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz -C /tmp && \
|
||||
mv /tmp/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64 /usr/local/alertmanager && \
|
||||
rm -f /tmp/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz
|
||||
|
||||
ENV ALERTMANAGER_BASE_PATH=/private/argus/alert/alertmanager
|
||||
|
||||
ARG ARGUS_UID=2133
|
||||
ARG ARGUS_GID=2015
|
||||
ENV ARGUS_UID=${ARGUS_UID}
|
||||
ENV ARGUS_GID=${ARGUS_GID}
|
||||
ARG ARGUS_BUILD_UID=2133
|
||||
ARG ARGUS_BUILD_GID=2015
|
||||
ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID}
|
||||
ENV ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
|
||||
|
||||
RUN mkdir -p /usr/share/alertmanager && \
|
||||
mkdir -p ${ALERTMANAGER_BASE_PATH} && \
|
||||
@ -33,16 +33,24 @@ RUN mkdir -p /usr/share/alertmanager && \
|
||||
|
||||
# 创建 alertmanager 用户(可自定义 UID/GID)
|
||||
# 创建 alertmanager 用户组
|
||||
RUN groupadd -g ${ARGUS_GID} alertmanager
|
||||
RUN set -eux; \
|
||||
# 确保目标 GID 存在;若已被占用,直接使用该 GID(组名不限)\
|
||||
if ! getent group "${ARGUS_BUILD_GID}" >/dev/null; then \
|
||||
groupadd -g "${ARGUS_BUILD_GID}" alertmanager || true; \
|
||||
fi; \
|
||||
# 确保存在 alertmanager 用户;若 UID 已被占用,跳过并继续使用现有 UID 的用户
|
||||
if ! id alertmanager >/dev/null 2>&1; then \
|
||||
if getent passwd "${ARGUS_BUILD_UID}" >/dev/null; then \
|
||||
# UID 已占用,则创建同名用户但不指定 UID(避免冲突),仅保证 user 存在
|
||||
useradd -M -s /usr/sbin/nologin -g "${ARGUS_BUILD_GID}" alertmanager || true; \
|
||||
else \
|
||||
useradd -M -s /usr/sbin/nologin -u "${ARGUS_BUILD_UID}" -g "${ARGUS_BUILD_GID}" alertmanager || true; \
|
||||
fi; \
|
||||
else \
|
||||
usermod -g "${ARGUS_BUILD_GID}" alertmanager || true; \
|
||||
fi
|
||||
|
||||
# 创建 alertmanager 用户并指定组
|
||||
RUN useradd -M -s /usr/sbin/nologin -u ${ARGUS_UID} -g ${ARGUS_GID} alertmanager
|
||||
|
||||
RUN chown -R alertmanager:alertmanager /usr/share/alertmanager && \
|
||||
chown -R alertmanager:alertmanager /alertmanager && \
|
||||
chown -R alertmanager:alertmanager ${ALERTMANAGER_BASE_PATH} && \
|
||||
chown -R alertmanager:alertmanager /private/argus/etc && \
|
||||
chown -R alertmanager:alertmanager /usr/local/bin
|
||||
RUN chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" /usr/share/alertmanager /alertmanager ${ALERTMANAGER_BASE_PATH} /private/argus/etc /usr/local/bin || true
|
||||
|
||||
# 配置内网 apt 源 (如果指定了内网选项)
|
||||
RUN if [ "$USE_INTRANET" = "true" ]; then \
|
||||
@ -86,4 +94,3 @@ EXPOSE 9093
|
||||
|
||||
# 使用 supervisor 作为入口点
|
||||
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
|
||||
|
||||
|
||||
Binary file not shown.
@ -5,9 +5,9 @@ docker pull ubuntu:24.04
|
||||
source src/alert/tests/.env
|
||||
|
||||
docker build \
|
||||
--build-arg ARGUS_UID=${ARGUS_UID} \
|
||||
--build-arg ARGUS_GID=${ARGUS_GID} \
|
||||
--build-arg ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
|
||||
--build-arg ARGUS_BUILD_GID=${ARGUS_BUILD_GID} \
|
||||
-f src/alert/alertmanager/build/Dockerfile \
|
||||
-t argus-alertmanager:latest .
|
||||
|
||||
docker save -o argus-alertmanager-latest.tar argus-alertmanager:latest
|
||||
docker save -o argus-alertmanager-latest.tar argus-alertmanager:latest
|
||||
|
||||
22
src/alert/alertmanager/build/fetch-dist.sh
Normal file
22
src/alert/alertmanager/build/fetch-dist.sh
Normal file
@ -0,0 +1,22 @@
|
||||
#!/usr/bin/env bash
set -euo pipefail

# Download the Alertmanager release tarball into the current working directory
# so the Docker build can COPY it instead of fetching it from the network.
# Usage:
#   ./fetch-dist.sh [version]
# Example:
#   ./fetch-dist.sh 0.28.1

VER="${1:-0.28.1}"
OUT="alertmanager-${VER}.linux-amd64.tar.gz"
URL="https://github.com/prometheus/alertmanager/releases/download/v${VER}/${OUT}"

if [[ -f "$OUT" ]]; then
  echo "[INFO] $OUT already exists, skip download"
  exit 0
fi

# Download to a temporary name and rename only on success. Otherwise a failed
# or interrupted transfer would leave a truncated "$OUT" behind, and the
# existence check above would accept it as complete on the next run.
TMP_OUT="${OUT}.part.$$"
trap 'rm -f -- "$TMP_OUT"' EXIT

echo "[INFO] Downloading $URL"
curl -fL --retry 3 --connect-timeout 10 -o "$TMP_OUT" "$URL"
mv -f -- "$TMP_OUT" "$OUT"
echo "[OK] Saved to $(pwd)/$OUT"
|
||||
|
||||
@ -7,10 +7,8 @@ ALERTMANAGER_BASE_PATH=${ALERTMANAGER_BASE_PATH:-/private/argus/alert/alertmanag
|
||||
|
||||
echo "[INFO] Alertmanager base path: ${ALERTMANAGER_BASE_PATH}"
|
||||
|
||||
# 生成配置文件
|
||||
echo "[INFO] Generating Alertmanager configuration file..."
|
||||
sed "s|\${ALERTMANAGER_BASE_PATH}|${ALERTMANAGER_BASE_PATH}|g" \
|
||||
/etc/alertmanager/alertmanager.yml > ${ALERTMANAGER_BASE_PATH}/alertmanager.yml
|
||||
# 使用容器内的 /etc/alertmanager/alertmanager.yml 作为配置文件,避免写入挂载卷导致的权限问题
|
||||
echo "[INFO] Using /etc/alertmanager/alertmanager.yml as configuration"
|
||||
|
||||
|
||||
# 记录容器 IP 地址
|
||||
|
||||
@ -6,7 +6,7 @@ user=root
|
||||
|
||||
[program:alertmanager]
|
||||
command=/usr/local/bin/start-am-supervised.sh
|
||||
user=alertmanager
|
||||
user=ubuntu
|
||||
stdout_logfile=/var/log/supervisor/alertmanager.log
|
||||
stderr_logfile=/var/log/supervisor/alertmanager_error.log
|
||||
autorestart=true
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
DATA_ROOT=/home/argus/tmp/private/argus
|
||||
ARGUS_UID=1048
|
||||
ARGUS_GID=1048
|
||||
ARGUS_BUILD_UID=1048
|
||||
ARGUS_BUILD_GID=1048
|
||||
|
||||
USE_INTRANET=false
|
||||
|
||||
@ -4,15 +4,15 @@ services:
|
||||
context: ../../../
|
||||
dockerfile: src/alert/alertmanager/build/Dockerfile
|
||||
args:
|
||||
ARGUS_UID: ${ARGUS_UID:-2133}
|
||||
ARGUS_GID: ${ARGUS_GID:-2015}
|
||||
ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133}
|
||||
ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015}
|
||||
USE_INTRANET: ${USE_INTRANET:-false}
|
||||
image: argus-alertmanager:latest
|
||||
container_name: argus-alertmanager
|
||||
environment:
|
||||
- ALERTMANAGER_BASE_PATH=/private/argus/alert/alertmanager
|
||||
- ARGUS_UID=${ARGUS_UID:-2133}
|
||||
- ARGUS_GID=${ARGUS_GID:-2015}
|
||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
||||
ports:
|
||||
- "${ARGUS_PORT:-9093}:9093"
|
||||
volumes:
|
||||
|
||||
@ -26,6 +26,7 @@ RUN apt-get update && \
|
||||
apt-get install -y \
|
||||
bind9 \
|
||||
bind9utils \
|
||||
dnsutils \
|
||||
bind9-doc \
|
||||
supervisor \
|
||||
net-tools \
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,47 +1,96 @@
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
echo "[INFO] Starting Fluent Bit setup in Ubuntu container..."
|
||||
echo "[INFO] Starting Fluent Bit setup in Ubuntu container (offline-first)..."
|
||||
|
||||
# 安装必要的工具
|
||||
echo "[INFO] Installing required packages..."
|
||||
export DEBIAN_FRONTEND=noninteractive
|
||||
apt-get update -qq
|
||||
apt-get install -y -qq curl
|
||||
|
||||
# 解压bundle到/tmp
|
||||
echo "[INFO] Extracting fluent-bit bundle..."
|
||||
cp -r /private/etc /tmp
|
||||
cp -r /private/packages /tmp
|
||||
cd /tmp
|
||||
# Stage bundle to /tmp (read-only mount under /private)
|
||||
echo "[INFO] Staging fluent-bit bundle..."
|
||||
rm -rf /tmp/flb && mkdir -p /tmp/flb
|
||||
cp -r /private/etc /tmp/flb/
|
||||
mkdir -p /tmp/flb/packages
|
||||
cp -r /private/packages/* /tmp/flb/packages/ 2>/dev/null || true
|
||||
|
||||
# 安装 Fluent Bit 从 deb 包
|
||||
echo "[INFO] Installing Fluent Bit from deb package..."
|
||||
dpkg -i /tmp/packages/fluent-bit_3.1.9_amd64.deb || true
|
||||
apt-get install -f -y -qq # 解决依赖问题
|
||||
# Helper: check and install a local deb if not already satisfied
|
||||
# Helper: succeed when the dynamic linker cache lists the given soname ($1).
_lib_in_ldcache() {
  local cache
  # Capture the whole listing first: piping into `grep -q` under
  # `set -o pipefail` can report failure when grep exits on the first match
  # and ldconfig is then terminated by SIGPIPE.
  cache=$(ldconfig -p 2>/dev/null) || true
  # Literal substring match, so the dots in a soname are not regex wildcards.
  [[ "$cache" == *"$1"* ]]
}

#######################################
# Make sure a shared library is visible in the linker cache, offline-first:
# try the matching .deb from the staged bundle, then fall back to apt.
# Arguments:
#   $1 - soname to look for in `ldconfig -p` output (e.g. "libpq.so.5")
#   $2 - glob, relative to /tmp/flb/packages, of the .deb that provides it
# Outputs:
#   [OK]/[INFO]/[WARN] progress lines on stdout.
# Returns:
#   0 always; every install step is best-effort and tolerates failure.
#######################################
ensure_lib() {
  local soname=$1
  local pattern=$2
  local deb=""
  local -a candidates

  if _lib_in_ldcache "$soname"; then
    echo "[OK] $soname already present"
    return 0
  fi

  # Let the shell expand the pattern instead of parsing `ls`; an unmatched
  # glob stays literal (or expands to nothing), which the -f test rejects.
  # shellcheck disable=SC2206 — pattern is an intentional glob
  candidates=(/tmp/flb/packages/$pattern)
  if [[ -f "${candidates[0]:-}" ]]; then
    deb=${candidates[0]}
  fi

  if [[ -n "$deb" ]]; then
    echo "[INFO] Installing local dependency: $(basename "$deb")"
    dpkg -i "$deb" >/dev/null 2>&1 || true
  else
    echo "[WARN] Local deb for $soname not found (pattern=$pattern)"
  fi

  if ! _lib_in_ldcache "$soname"; then
    echo "[WARN] $soname still missing after local install; attempting apt fallback"
    apt-get update -qq || true
    # Map each soname to the apt package that ships it.
    case "$soname" in
      libpq.so.5) apt-get install -y -qq libpq5 || true ;;
      libyaml-0.so.2) apt-get install -y -qq libyaml-0-2 || true ;;
      libsasl2.so.2) apt-get install -y -qq libsasl2-2 || true ;;
      libldap-2.5.so.0) apt-get install -y -qq libldap-2.5-0 || true ;;
      *) echo "[WARN] No apt package mapping for $soname; skipping apt fallback" ;;
    esac
  fi

  # Refresh the linker cache so newly installed libraries are picked up.
  ldconfig 2>/dev/null || true
}
|
||||
|
||||
# Offline-first: satisfy runtime deps from local debs, fallback to apt only if necessary
|
||||
ensure_lib "libpq.so.5" "libpq5_*_amd64.deb"
|
||||
ensure_lib "libyaml-0.so.2" "libyaml-0-2_*_amd64.deb"
|
||||
ensure_lib "libsasl2.so.2" "libsasl2-2_*_amd64.deb"
|
||||
ensure_lib "libldap-2.5.so.0" "libldap-2.5-0_*_amd64.deb"
|
||||
|
||||
# Install fluent-bit main package from local bundle
|
||||
FLB_DEB="$(ls /tmp/flb/packages/fluent-bit_*_amd64.deb 2>/dev/null | head -n1 || true)"
|
||||
if [[ -z "$FLB_DEB" ]]; then
|
||||
echo "[ERROR] fluent-bit deb not found under /private/packages" >&2
|
||||
exit 1
|
||||
fi
|
||||
echo "[INFO] Installing Fluent Bit: $(basename "$FLB_DEB")"
|
||||
dpkg -i "$FLB_DEB" >/dev/null 2>&1 || true
|
||||
|
||||
# If dpkg reported unresolved dependencies, try apt -f only as last resort
|
||||
if ! command -v /opt/fluent-bit/bin/fluent-bit >/dev/null 2>&1; then
|
||||
echo "[WARN] fluent-bit binary missing after dpkg; attempting apt --fix-broken"
|
||||
apt-get install -f -y -qq || true
|
||||
fi
|
||||
|
||||
# Ensure runtime library dependencies are satisfied (libsasl2, libldap are required via libpq/curl)
|
||||
MISSING=$(ldd /opt/fluent-bit/bin/fluent-bit 2>/dev/null | awk '/not found/{print $1}' | xargs -r echo || true)
|
||||
if [[ -n "$MISSING" ]]; then
|
||||
echo "[WARN] missing shared libs: $MISSING"
|
||||
apt-get update -qq || true
|
||||
apt-get install -y -qq libsasl2-2 libldap-2.5-0 || true
|
||||
apt-get install -f -y -qq || true
|
||||
fi
|
||||
|
||||
# 验证 Fluent Bit 可以运行
|
||||
echo "[INFO] Fluent Bit version:"
|
||||
/opt/fluent-bit/bin/fluent-bit --version
|
||||
/opt/fluent-bit/bin/fluent-bit --version || { echo "[ERROR] fluent-bit not installed or libraries missing" >&2; exit 1; }
|
||||
|
||||
# 创建配置目录
|
||||
# Place configuration
|
||||
mkdir -p /etc/fluent-bit
|
||||
cp -r /tmp/etc/* /etc/fluent-bit/
|
||||
cp -r /tmp/flb/etc/* /etc/fluent-bit/
|
||||
|
||||
# 创建日志和缓冲区目录
|
||||
# Create logs/buffers dirs
|
||||
mkdir -p /logs/train /logs/infer /buffers
|
||||
chmod 755 /logs/train /logs/infer /buffers
|
||||
|
||||
# 等待 Elasticsearch 就绪
|
||||
echo "[INFO] Waiting for Elasticsearch to be ready..."
|
||||
while ! curl -fs http://${ES_HOST}:${ES_PORT}/_cluster/health >/dev/null 2>&1; do
|
||||
echo " Waiting for ES at ${ES_HOST}:${ES_PORT}..."
|
||||
sleep 5
|
||||
# Wait for Elasticsearch via bash /dev/tcp to avoid curl dependency
|
||||
echo "[INFO] Waiting for Elasticsearch to be ready (tcp ${ES_HOST}:${ES_PORT})..."
|
||||
for i in $(seq 1 120); do
|
||||
if exec 3<>/dev/tcp/${ES_HOST}/${ES_PORT}; then
|
||||
exec 3<&- 3>&-
|
||||
echo "[INFO] Elasticsearch is ready"
|
||||
break
|
||||
fi
|
||||
[[ $i -eq 120 ]] && { echo "[ERROR] ES not reachable" >&2; exit 1; }
|
||||
sleep 1
|
||||
done
|
||||
echo "[INFO] Elasticsearch is ready"
|
||||
|
||||
# 启动 Fluent Bit
|
||||
echo "[INFO] Starting Fluent Bit with configuration from /etc/fluent-bit/"
|
||||
echo "[INFO] Command: /opt/fluent-bit/bin/fluent-bit --config=/etc/fluent-bit/fluent-bit.conf"
|
||||
exec /opt/fluent-bit/bin/fluent-bit \
|
||||
--config=/etc/fluent-bit/fluent-bit.conf
|
||||
exec /opt/fluent-bit/bin/fluent-bit --config=/etc/fluent-bit/fluent-bit.conf
|
||||
|
||||
@ -32,3 +32,42 @@ fi
|
||||
|
||||
echo "[OK] 初始化完成: private/argus/log/{elasticsearch,kibana}"
|
||||
echo "[INFO] Fluent-bit files should be in fluent-bit/ directory"
|
||||
|
||||
# Stage Fluent Bit offline dependencies: copy the libyaml/libpq .deb files from
# the metric all-in-one-full bundle into ../fluent-bit/build/packages.
FLB_BUILD_PACKAGES_DIR="$root/../fluent-bit/build/packages"
mkdir -p "$FLB_BUILD_PACKAGES_DIR"
for deb in \
  "$project_root/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libyaml-0-2_"*_amd64.deb \
  "$project_root/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libpq5_"*_amd64.deb; do
  # The glob is already expanded by the for list; an unmatched pattern stays
  # literal, so skip anything that is not a real file. Keeping "$deb" quoted
  # avoids re-splitting paths that contain spaces.
  [[ -f "$deb" ]] || continue
  base="$(basename "$deb")"
  # Never overwrite a package that has already been staged.
  if [[ ! -f "$FLB_BUILD_PACKAGES_DIR/$base" ]]; then
    cp "$deb" "$FLB_BUILD_PACKAGES_DIR/"
    echo " [+] copied $base"
  fi
done
|
||||
|
||||
# Additionally unpack the runtime dependencies (libsasl2/ldap and friends) from
# all-in-one-full's ubuntu22/curl.tar.gz so they can be installed offline.
CURLOPT_TAR="$project_root/src/metric/client-plugins/all-in-one-full/deps/ubuntu22/curl.tar.gz"
if [[ -f "$CURLOPT_TAR" ]]; then
  tmpdir=$(mktemp -d)
  if tar -xzf "$CURLOPT_TAR" -C "$tmpdir" 2>/dev/null; then
    # Patterns are quoted here so they are expanded only inside the extracted
    # tree below, never against files in the current directory.
    for p in \
      'libsasl2-2_*_amd64.deb' \
      'libsasl2-modules-db_*_amd64.deb' \
      'libldap-2.5-0_*_amd64.deb' \
      'libidn2-0_*_amd64.deb' \
      'libbrotli1_*_amd64.deb' \
      'libssl3_*_amd64.deb'; do
      # shellcheck disable=SC2206 — $p is an intentional glob
      matches=("$tmpdir"/curl/$p)
      src="${matches[0]:-}"
      # An unmatched glob stays literal (or expands to nothing); skip it.
      [[ -f "$src" ]] || continue
      base="$(basename "$src")"
      # Report "staged" only when a copy actually happened.
      if [[ ! -f "$FLB_BUILD_PACKAGES_DIR/$base" ]]; then
        cp "$src" "$FLB_BUILD_PACKAGES_DIR/"
        echo " [+] staged $base"
      fi
    done
  else
    # Staging is best-effort; report the failure instead of hiding it.
    echo "[WARN] Failed to extract $CURLOPT_TAR; skipping extra dependencies" >&2
  fi
  rm -rf "$tmpdir"
fi
|
||||
|
||||
2
src/metric/.gitignore
vendored
2
src/metric/.gitignore
vendored
@ -4,4 +4,4 @@
|
||||
/client-plugins/demo-all-in-one/publish/
|
||||
/client-plugins/demo-all-in-one/checklist
|
||||
/client-plugins/demo-all-in-one/VERSION
|
||||
/client-plugins/all-in-one-full/
|
||||
/client-plugins/all-in-one-full/artifact/
|
||||
|
||||
@ -104,7 +104,26 @@ log_info "文件所有者: $OWNER"
|
||||
|
||||
# 确保发布目录存在
|
||||
log_info "确保发布目录存在: $PUBLISH_DIR"
|
||||
sudo mkdir -p "$PUBLISH_DIR"
|
||||
mkdir -p "$PUBLISH_DIR"
|
||||
|
||||
IFS=':' read -r OWNER_UID OWNER_GID <<< "$OWNER"
|
||||
if [[ -z "$OWNER_UID" || -z "$OWNER_GID" ]]; then
|
||||
log_error "--owner 格式不正确,应为 uid:gid"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
CURRENT_UID=$(id -u)
|
||||
CURRENT_GID=$(id -g)
|
||||
if [[ "$OWNER_UID" != "$CURRENT_UID" || "$OWNER_GID" != "$CURRENT_GID" ]]; then
|
||||
if [[ "$CURRENT_UID" -ne 0 ]]; then
|
||||
log_error "当前用户 (${CURRENT_UID}:${CURRENT_GID}) 无法设置所有者为 ${OWNER_UID}:${OWNER_GID}"
|
||||
log_error "请以目标用户运行脚本或预先调整目录权限"
|
||||
exit 1
|
||||
fi
|
||||
NEED_CHOWN=true
|
||||
else
|
||||
NEED_CHOWN=false
|
||||
fi
|
||||
|
||||
# 创建临时目录用于打包
|
||||
TEMP_PACKAGE_DIR="/tmp/argus-metric-package-$$"
|
||||
@ -208,26 +227,31 @@ fi
|
||||
TAR_NAME="argus-metric_$(echo $VERSION | tr '.' '_').tar.gz"
|
||||
log_info "创建发布包: $TAR_NAME"
|
||||
cd "$TEMP_PACKAGE_DIR"
|
||||
sudo tar -czf "$PUBLISH_DIR/$TAR_NAME" *
|
||||
tar -czf "$PUBLISH_DIR/$TAR_NAME" *
|
||||
cd - > /dev/null
|
||||
|
||||
# 设置文件所有者
|
||||
log_info "设置文件所有者为: $OWNER"
|
||||
sudo chown "$OWNER" "$PUBLISH_DIR/$TAR_NAME"
|
||||
if [[ "$NEED_CHOWN" == true ]]; then
|
||||
log_info "设置文件所有者为: $OWNER"
|
||||
chown "$OWNER" "$PUBLISH_DIR/$TAR_NAME"
|
||||
fi
|
||||
|
||||
# 清理临时目录
|
||||
rm -rf "$TEMP_PACKAGE_DIR"
|
||||
|
||||
# 更新 LATEST_VERSION 文件
|
||||
log_info "更新 LATEST_VERSION 文件..."
|
||||
echo "$VERSION" | sudo tee "$PUBLISH_DIR/LATEST_VERSION" > /dev/null
|
||||
sudo chown "$OWNER" "$PUBLISH_DIR/LATEST_VERSION"
|
||||
echo "$VERSION" > "$PUBLISH_DIR/LATEST_VERSION"
|
||||
if [[ "$NEED_CHOWN" == true ]]; then
|
||||
chown "$OWNER" "$PUBLISH_DIR/LATEST_VERSION"
|
||||
fi
|
||||
|
||||
# 复制 DNS 配置文件到发布目录根目录(直接从 config 目录复制)
|
||||
if [[ -f "config/dns.conf" ]]; then
|
||||
log_info "复制 DNS 配置文件到发布目录根目录..."
|
||||
sudo cp "config/dns.conf" "$PUBLISH_DIR/"
|
||||
sudo chown "$OWNER" "$PUBLISH_DIR/dns.conf"
|
||||
cp "config/dns.conf" "$PUBLISH_DIR/"
|
||||
if [[ "$NEED_CHOWN" == true ]]; then
|
||||
chown "$OWNER" "$PUBLISH_DIR/dns.conf"
|
||||
fi
|
||||
log_success "DNS 配置文件复制完成: $PUBLISH_DIR/dns.conf"
|
||||
else
|
||||
log_warning "未找到 config/dns.conf 文件,跳过 DNS 配置文件复制"
|
||||
@ -236,8 +260,10 @@ fi
|
||||
# 复制 setup.sh 到发布目录
|
||||
if [[ -f "scripts/setup.sh" ]]; then
|
||||
log_info "复制 setup.sh 到发布目录..."
|
||||
sudo cp "scripts/setup.sh" "$PUBLISH_DIR/"
|
||||
sudo chown "$OWNER" "$PUBLISH_DIR/setup.sh"
|
||||
cp "scripts/setup.sh" "$PUBLISH_DIR/"
|
||||
if [[ "$NEED_CHOWN" == true ]]; then
|
||||
chown "$OWNER" "$PUBLISH_DIR/setup.sh"
|
||||
fi
|
||||
fi
|
||||
|
||||
# 显示发布结果
|
||||
|
||||
59
src/metric/client-plugins/all-in-one-full/README.md
Normal file
59
src/metric/client-plugins/all-in-one-full/README.md
Normal file
@ -0,0 +1,59 @@
|
||||
# 客户侧组件安装包构建、发布流程
|
||||
|
||||
## 第一步:配置版本和组件
|
||||
|
||||
首先搞定配置文件:
|
||||
|
||||
1. 把 `.checklist.example` 重命名成 `checklist`
|
||||
2. 把 `.VERSION.example` 重命名成 `VERSION`
|
||||
|
||||
### checklist 文件格式
|
||||
```
|
||||
# 组件名称 目录路径 版本号 [依赖组件] [安装顺序]
|
||||
dcgm-exporter-installer /path/to/dcgm-exporter-installer 1.1.0
|
||||
node-exporter-installer /path/to/node-exporter-installer 1.1.0
|
||||
```
|
||||
|
||||
### VERSION 文件
|
||||
设置需要发布的版本号,比如 `1.29.0`
|
||||
|
||||
> 建议用 `version-manager.sh` 来管理版本
|
||||
|
||||
## 第二步:构建安装包
|
||||
|
||||
直接跑脚本:
|
||||
```bash
|
||||
./package_artifact.sh
|
||||
```
|
||||
|
||||
构建完的东西会放在 `artifact/` 目录下,按版本分文件夹。
|
||||
|
||||
如果版本已经存在了,想要覆盖重新构建:
|
||||
```bash
|
||||
./package_artifact.sh --force
|
||||
```
|
||||
|
||||
构建完可以手工测试安装包。
|
||||
|
||||
## 第三步:发布安装包
|
||||
|
||||
用这个脚本发布:
|
||||
```bash
|
||||
./publish_artifact.sh
|
||||
```
|
||||
|
||||
发布后的内容在 `publish/` 目录里,包含:
|
||||
- 压缩版本的安装包
|
||||
- 一键安装的bash脚本
|
||||
|
||||
## 第四步:部署到FTP服务器
|
||||
|
||||
把发布的内容上传到FTP服务器,客户端就可以通过一键命令安装:
|
||||
|
||||
```bash
|
||||
curl -fsSL http://your-ftp-server/install.sh | sh -
|
||||
|
||||
curl -fsSL "ftp://ftpuser:{PASSWD}!@10.211.55.4/share/setup.sh" | sudo bash -s -- --server 10.211.55.4 --user ftpuser --password {PASSWD}
|
||||
```
|
||||
|
||||
这样客户就能直接从FTP服务器下载并安装组件了。
|
||||
@ -0,0 +1 @@
|
||||
1.29.0
|
||||
@ -0,0 +1,3 @@
|
||||
# 组件名称 目录路径 版本号 [依赖组件] [安装顺序]
|
||||
dcgm-exporter-installer /Users/sundapeng/Project/nlp/aiops/client-plugins/dcgm-exporter-installer 1.1.0
|
||||
node-exporter-installer /Users/sundapeng/Project/nlp/aiops/client-plugins/node-exporter-installer 1.1.0
|
||||
1
src/metric/client-plugins/all-in-one-full/config/VERSION
Normal file
1
src/metric/client-plugins/all-in-one-full/config/VERSION
Normal file
@ -0,0 +1 @@
|
||||
1.35.0
|
||||
@ -0,0 +1,5 @@
|
||||
# 组件名称 目录路径 版本号 [依赖组件] [安装顺序]
|
||||
argus-agent plugins/argus-agent 1.0.0
|
||||
node-exporter plugins/node-exporter 1.0.0
|
||||
dcgm-exporter plugins/dcgm-exporter 1.0.0
|
||||
fluent-bit plugins/fluent-bit 1.0.0
|
||||
14
src/metric/client-plugins/all-in-one-full/config/config.env
Normal file
14
src/metric/client-plugins/all-in-one-full/config/config.env
Normal file
@ -0,0 +1,14 @@
|
||||
# Elasticsearch
|
||||
ES_HOST=es.log.argus.com
|
||||
ES_PORT=9200
|
||||
|
||||
# Argus-Agent
|
||||
# 连接master服务
|
||||
MASTER_ENDPOINT=master.argus.com:3000
|
||||
# 上报状态间隔秒数
|
||||
REPORT_INTERVAL_SECONDS=5
|
||||
|
||||
# FTP
|
||||
FTP_SERVER=172.31.0.40
|
||||
FTP_USER=ftpuser
|
||||
FTP_PASSWORD=ZGClab1234!
|
||||
@ -0,0 +1,8 @@
|
||||
# Argus Metric 配置文件示例
|
||||
# 复制此文件为 config.env 并根据需要修改配置
|
||||
|
||||
# 连接master服务
|
||||
MASTER_ENDPOINT=master.argus.com:3000
|
||||
|
||||
# 上报状态间隔(秒)
|
||||
REPORT_INTERVAL_SECONDS=60
|
||||
@ -0,0 +1 @@
|
||||
172.31.0.2
|
||||
Binary file not shown.
BIN
src/metric/client-plugins/all-in-one-full/deps/jq-curl.tar.gz
Normal file
BIN
src/metric/client-plugins/all-in-one-full/deps/jq-curl.tar.gz
Normal file
Binary file not shown.
BIN
src/metric/client-plugins/all-in-one-full/deps/ubuntu20/cron.tar.gz
Executable file
BIN
src/metric/client-plugins/all-in-one-full/deps/ubuntu20/cron.tar.gz
Executable file
Binary file not shown.
BIN
src/metric/client-plugins/all-in-one-full/deps/ubuntu20/curl.tar.gz
Executable file
BIN
src/metric/client-plugins/all-in-one-full/deps/ubuntu20/curl.tar.gz
Executable file
Binary file not shown.
BIN
src/metric/client-plugins/all-in-one-full/deps/ubuntu20/jq.tar.gz
Executable file
BIN
src/metric/client-plugins/all-in-one-full/deps/ubuntu20/jq.tar.gz
Executable file
Binary file not shown.
BIN
src/metric/client-plugins/all-in-one-full/deps/ubuntu22/cron.tar.gz
Executable file
BIN
src/metric/client-plugins/all-in-one-full/deps/ubuntu22/cron.tar.gz
Executable file
Binary file not shown.
BIN
src/metric/client-plugins/all-in-one-full/deps/ubuntu22/curl.tar.gz
Executable file
BIN
src/metric/client-plugins/all-in-one-full/deps/ubuntu22/curl.tar.gz
Executable file
Binary file not shown.
BIN
src/metric/client-plugins/all-in-one-full/deps/ubuntu22/jq.tar.gz
Executable file
BIN
src/metric/client-plugins/all-in-one-full/deps/ubuntu22/jq.tar.gz
Executable file
Binary file not shown.
@ -0,0 +1,94 @@
|
||||
# Argus Agent 插件
|
||||
|
||||
这是 Argus Agent 的安装和管理插件,提供了完整的安装、卸载、健康检查功能。
|
||||
|
||||
## 文件结构
|
||||
|
||||
```
|
||||
argus-agent/
|
||||
├── bin/
|
||||
│ └── argus-agent # Argus Agent 二进制文件
|
||||
├── config/ # 配置文件目录
|
||||
├── install.sh # 安装脚本
|
||||
├── uninstall.sh # 卸载脚本
|
||||
├── check_health.sh # 健康检查脚本
|
||||
├── package.sh # 打包脚本
|
||||
└── README.md # 说明文档
|
||||
```
|
||||
|
||||
## 使用方法
|
||||
|
||||
### 安装
|
||||
|
||||
```bash
|
||||
sudo ./install.sh
|
||||
```
|
||||
|
||||
安装脚本会:
|
||||
- 检查系统要求
|
||||
- 停止可能运行的服务
|
||||
- 安装二进制文件到 `/usr/local/bin/argus-agent`
|
||||
- 创建 `argus-agent` 用户
|
||||
- 创建配置和数据目录
|
||||
- 启动服务并记录 PID
|
||||
|
||||
### 卸载
|
||||
|
||||
```bash
|
||||
sudo ./uninstall.sh
|
||||
```
|
||||
|
||||
卸载脚本会:
|
||||
- 停止所有 argus-agent 进程
|
||||
- 删除二进制文件
|
||||
- 删除配置和数据目录
|
||||
- 清理日志文件
|
||||
- 更新安装记录
|
||||
|
||||
### 健康检查
|
||||
|
||||
```bash
|
||||
./check_health.sh
|
||||
```
|
||||
|
||||
健康检查脚本会:
|
||||
- 检查安装记录中的 PID
|
||||
- 验证进程是否正在运行
|
||||
- 输出 JSON 格式的健康状态
|
||||
|
||||
### 打包
|
||||
|
||||
```bash
|
||||
./package.sh
|
||||
```
|
||||
|
||||
打包脚本会:
|
||||
- 检查所有必要文件
|
||||
- 创建时间戳命名的压缩包
|
||||
- 输出安装包信息
|
||||
|
||||
## 安装后的文件位置
|
||||
|
||||
- 二进制文件: `/usr/local/bin/argus-agent`
|
||||
- 配置目录: `/etc/argus-agent/`
|
||||
- 数据目录: `/var/lib/argus-agent/`
|
||||
- 日志文件: `/var/log/argus-agent.log`
|
||||
- PID 文件: `/var/run/argus-agent.pid`
|
||||
- 安装记录: `/opt/argus-metric/current/.install_record`
|
||||
|
||||
## 健康检查输出格式
|
||||
|
||||
```json
|
||||
{
|
||||
"name": "argus-agent",
|
||||
"status": "health|unhealth",
|
||||
"reason": "状态说明"
|
||||
}
|
||||
```
|
||||
|
||||
## 注意事项
|
||||
|
||||
1. 安装和卸载脚本需要 root 权限
|
||||
2. 健康检查脚本使用安装记录中的 PID 来验证进程状态
|
||||
3. 如果 jq 命令不可用,健康检查会使用简单的文本解析
|
||||
4. 卸载时会保留 `argus-agent` 用户,避免影响其他服务
|
||||
BIN
src/metric/client-plugins/all-in-one-full/plugins/argus-agent/bin/argus-agent
(Stored with Git LFS)
Executable file
BIN
src/metric/client-plugins/all-in-one-full/plugins/argus-agent/bin/argus-agent
(Stored with Git LFS)
Executable file
Binary file not shown.
@ -0,0 +1,69 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Argus Agent 健康检查脚本
|
||||
# 输出 JSON 格式结果
|
||||
|
||||
set -e
|
||||
|
||||
# Report the health of argus-agent as one JSON object on stdout.
#
# Arguments:
#   $1 - optional install base directory that holds .install_record
#        (defaults to /opt/argus-metric/current)
# Outputs:
#   {"name": "...", "status": "health"|"unhealth", "reason": "..."}
# Exit status (the function exits, it does not return):
#   0 when the agent is considered healthy, 1 otherwise.
check_health() {
    local name="argus-agent"
    local status="unhealth"
    local reason=""
    local install_base_dir="${1:-/opt/argus-metric/current}"
    local install_record="$install_base_dir/.install_record"
    local pid=""
    local pids=""
    local rc=1

    if [[ -f "$install_record" ]]; then
        # Prefer jq for parsing the JSON install record.
        if command -v jq &> /dev/null; then
            pid=$(jq -r '.components."argus-agent".pid // empty' "$install_record" 2>/dev/null || echo "")
        else
            # Fallback: plain-text extraction of the "pid" value that follows
            # the "argus-agent" key (expects one key per line).
            pid=$(grep -A 10 '"argus-agent"' "$install_record" | grep '"pid"' | cut -d'"' -f4 | head -1)
        fi

        if [[ -n "$pid" && "$pid" =~ ^[0-9]+$ ]]; then
            # kill -0 fails with EPERM when an unprivileged user probes a
            # process owned by another user, so also accept a /proc entry.
            if kill -0 "$pid" 2>/dev/null || [[ -d "/proc/$pid" ]]; then
                status="health"
                reason="进程运行正常 (PID: $pid)"
                rc=0
            else
                reason="安装记录中的 PID $pid 进程不存在"
            fi
        else
            reason="安装记录文件中未找到有效的 argus-agent PID"
        fi
    else
        # No install record: look for a running agent by exact process name.
        # -x (instead of -f) keeps this script from matching itself when it
        # is invoked through a path that contains "argus-agent".
        pids=$(pgrep -x "argus-agent" 2>/dev/null || true)
        if [[ -n "$pids" ]]; then
            pid=$(head -1 <<< "$pids")
            status="health"
            reason="发现 argus-agent 进程运行 (PID: $pid),但未找到安装记录"
            rc=0
        else
            reason="未找到 argus-agent 进程,且安装记录文件不存在"
        fi
    fi

    printf '{"name": "%s", "status": "%s", "reason": "%s"}\n' "$name" "$status" "$reason"
    exit "$rc"
}
|
||||
|
||||
# Entry point: hands control to check_health.
main() {
    check_health
}

# Run main only when the file is executed directly; do nothing when sourced.
[[ "${BASH_SOURCE[0]}" != "${0}" ]] || main "$@"
|
||||
289
src/metric/client-plugins/all-in-one-full/plugins/argus-agent/install.sh
Executable file
289
src/metric/client-plugins/all-in-one-full/plugins/argus-agent/install.sh
Executable file
@ -0,0 +1,289 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
# 颜色定义
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
BLUE='\033[0;34m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
# Logging helpers. Each prints "<color>[LEVEL]<reset> <message>" to stdout,
# expanding backslash escapes in the whole line (same as echo -e).
# Arguments: $1 - message text
log_info() {
    printf '%b\n' "${BLUE}[INFO]${NC} $1"
}

log_success() {
    printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
}

log_warning() {
    printf '%b\n' "${YELLOW}[WARNING]${NC} $1"
}

log_error() {
    printf '%b\n' "${RED}[ERROR]${NC} $1"
}
|
||||
|
||||
# Print usage text for the installer to stdout ($0 is expanded in the text).
show_help() {
    cat <<EOF
Argus Agent 安装脚本

用法: $0 [选项]

选项:
 --help 显示此帮助信息

示例:
 $0 # 安装 Argus Agent

EOF
}
|
||||
|
||||
# Command-line handling:
#   --help / -h          print usage and exit 0
#   other "--something"  report an unknown option, print usage, exit 1
#   anything else        taken as the install directory (last one wins)
INSTALL_DIR=""
for arg in "$@"; do
    if [[ "$arg" == "--help" || "$arg" == "-h" ]]; then
        show_help
        exit 0
    elif [[ "$arg" == --* ]]; then
        log_error "未知参数: $arg"
        show_help
        exit 1
    else
        INSTALL_DIR="$arg"
    fi
done
|
||||
|
||||
# Abort with exit status 1 unless the script runs with effective UID 0.
check_root() {
    [[ $EUID -eq 0 ]] && return 0

    log_error "此脚本需要 root 权限运行"
    log_info "请使用: sudo $0"
    exit 1
}
|
||||
|
||||
# Log the detected OS and CPU architecture. Exits 1 when /etc/os-release is
# missing; otherwise it only warns about unexpected distributions or
# architectures.
# Note: sourcing /etc/os-release defines NAME, VERSION, ID, ... in this shell.
check_system() {
    log_info "检查系统要求..."

    if ! [[ -f /etc/os-release ]]; then
        log_error "无法检测操作系统版本"
        exit 1
    fi

    source /etc/os-release
    log_info "检测到操作系统: $NAME $VERSION"

    # Distributions the script has been written for; others only get a warning.
    case "$ID" in
        ubuntu|debian|centos|rhel|fedora) ;;
        *) log_warning "此脚本主要针对常见 Linux 发行版,其他系统可能需要调整" ;;
    esac

    local arch
    arch=$(uname -m)
    log_info "系统架构: $arch"

    case "$arch" in
        x86_64|amd64) ;;
        *) log_warning "当前架构为 $arch,argus-agent 主要支持 x86_64/amd64" ;;
    esac
}
|
||||
|
||||
# Stop any argus-agent instance that is still running before (re)installing.
#
# Steps:
#   1. If /var/run/argus-agent.pid names a live argus-agent process, send
#      TERM, wait 2 seconds, then KILL; the PID file is removed either way.
#   2. Force-kill every remaining process whose name is exactly argus-agent.
#   3. For zombie argus-agent entries, kill the parent that is not reaping
#      them, but never init (PID 1), this shell, or this shell's parent.
# All signals are best-effort; the function does not fail when a process is
# already gone.
stop_existing_service() {
    log_info "检查并停止可能运行的服务..."
    local pid_file="/var/run/argus-agent.pid"
    # Declared up front so the loops below cannot leak or clobber globals.
    local pid="" pids="" zombies="" zpid="" zppid=""

    if [[ -f "$pid_file" ]]; then
        pid=$(cat "$pid_file" 2>/dev/null || true)
        # Only act on a numeric PID that really belongs to argus-agent.
        if [[ "$pid" =~ ^[0-9]+$ ]] \
            && ps -p "$pid" -o comm= 2>/dev/null | grep -q "^argus-agent$"; then
            kill "$pid" 2>/dev/null || true
            sleep 2
            kill -9 "$pid" 2>/dev/null || true
            log_success "服务已停止"
        fi
        rm -f "$pid_file"
    fi

    pids=$(pgrep -x argus-agent 2>/dev/null || true)
    if [[ -n "$pids" ]]; then
        for pid in $pids; do
            kill -9 "$pid" 2>/dev/null || true
        done
    fi

    # Zombie check: columns are pid, ppid, state, command name.
    zombies=$(ps -eo pid=,ppid=,stat=,comm= 2>/dev/null \
        | awk '$3 ~ /^Z/ && $4 == "argus-agent" {print $1, $2}' || true)
    while read -r zpid zppid; do
        [[ -n "$zpid" && -n "$zppid" ]] || continue
        log_warning "检测到僵尸 argus-agent (PID=$zpid, PPID=$zppid),尝试清理"
        # Killing the parent lets init reap the zombie; never shoot init or
        # the process chain that is running this installer.
        if [[ "$zppid" -ne 1 && "$zppid" -ne $$ && "$zppid" -ne $PPID ]]; then
            kill -9 "$zppid" 2>/dev/null || true
        fi
    done <<< "$zombies"
}
|
||||
|
||||
|
||||
# Install the argus-agent binary into /usr/local/bin.
#
# The binary is looked up as bin/argus-agent relative to the current
# directory first and, failing that, relative to the directory of this
# script. Any running agent is stopped, then the function waits up to 10
# seconds for it to exit before swapping the file in place (copy to
# "<target>.new", then rename) so a half-written binary is never visible.
# Exits 1 when the binary cannot be found.
install_argus_agent() {
    log_info "安装 Argus Agent..."
    local binary_file="bin/argus-agent"
    local install_dir="/usr/local/bin"
    local target_file="$install_dir/argus-agent"
    local script_dir=""
    local remaining_pids=""
    local timeout=10

    if [[ ! -f "$binary_file" ]]; then
        # Fall back to the script's own directory so the installer also works
        # when it is started from somewhere else.
        script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" 2>/dev/null && pwd) || script_dir=""
        if [[ -n "$script_dir" && -f "$script_dir/bin/argus-agent" ]]; then
            binary_file="$script_dir/bin/argus-agent"
        fi
    fi

    if [[ ! -f "$binary_file" ]]; then
        log_error "找不到 Argus Agent 二进制文件: $binary_file"
        exit 1
    fi

    stop_existing_service

    while (( timeout > 0 )); do
        remaining_pids=$(pgrep -x argus-agent | grep -vw "$$" || true)
        [[ -z "$remaining_pids" ]] && break
        # A zombie never goes away by waiting, so do not burn the timeout on it.
        # Only the process-state column is inspected for "Z".
        if ps -eo stat=,comm= | awk '$1 ~ /^Z/ && $2 == "argus-agent" {found = 1} END {exit !found}'; then
            log_warning "检测到僵尸 argus-agent,跳过等待"
            break
        fi
        log_warning "等待 argus-agent 完全退出... ($timeout)"
        sleep 1
        timeout=$((timeout - 1))
    done

    mkdir -p "$install_dir"
    cp "$binary_file" "${target_file}.new"
    chmod +x "${target_file}.new"
    mv -f "${target_file}.new" "$target_file"
    log_success "Argus Agent 二进制文件安装完成"
}
|
||||
|
||||
|
||||
# Ensure the argus-agent account exists (no home directory, shell
# /bin/false). An existing account is left untouched.
create_user() {
    log_info "创建 argus-agent 用户..."

    if ! id "argus-agent" &>/dev/null; then
        useradd --no-create-home --shell /bin/false argus-agent
        log_success "用户 argus-agent 创建完成"
    else
        log_info "用户 argus-agent 已存在"
    fi
}
|
||||
|
||||
# Create the configuration directory and the health-check directory, and
# hand the health-check directory to the argus-agent user and group.
install_config() {
    local config_dir="/etc/argus-agent"
    local health_dir="/var/lib/argus-agent/health"

    log_info "安装配置文件..."

    mkdir -p "$config_dir"

    mkdir -p "$health_dir"
    chown argus-agent:argus-agent "$health_dir"
}
|
||||
|
||||
# Start argus-agent detached from the terminal and record its PID.
#
# The agent runs in a new session (setsid) with stdout/stderr going to
# /var/log/argus-agent.log; its PID is written to /var/run/argus-agent.pid.
# After a 2 second grace period the process is probed:
#   - alive: the PID in the install record is refreshed (best effort;
#     update_install_record does nothing when no record exists yet) and
#     the function returns 0;
#   - dead: the last log lines are shown, the PID file is removed and the
#     function returns 1 so the caller does not report a successful install.
# Globals: INSTALL_DIR (read, optional install base directory)
start_argus_agent() {
    log_info "启动 Argus Agent 服务..."
    local binary_path="/usr/local/bin/argus-agent"
    local log_file="/var/log/argus-agent.log"
    local pid_file="/var/run/argus-agent.pid"
    local pid

    rm -f "$pid_file"

    log_info "正在启动 Argus Agent..."
    setsid "$binary_path" > "$log_file" 2>&1 < /dev/null &
    pid=$!
    echo "$pid" > "$pid_file"
    sleep 2

    if kill -0 "$pid" 2>/dev/null; then
        log_success "Argus Agent 服务启动成功 (PID: $pid)"
        # Keep the install record in sync so health checks that read the PID
        # from it do not look at a stale process after a reinstall.
        update_install_record "$pid" "${INSTALL_DIR:-}" || true
        return 0
    fi

    log_error "Argus Agent 启动失败"
    if [[ -f "$log_file" ]]; then
        tail -n 10 "$log_file"
    fi
    rm -f "$pid_file"
    return 1
}
|
||||
|
||||
|
||||
# Refresh the argus-agent PID stored in the install record.
#
# Arguments:
#   $1 - new PID (stored as a JSON string)
#   $2 - optional install base directory (default /opt/argus-metric/current)
# Returns:
#   0 when the record was updated, when it does not exist yet (first install,
#     the main installer creates it), or when jq is unavailable;
#   1 when the current PID cannot be read or the update fails.
# Only the pid field is touched; every other field in the record is kept.
update_install_record() {
    local pid="$1"
    local install_base_dir="${2:-/opt/argus-metric/current}"
    local install_record="$install_base_dir/.install_record"
    local tmp_record="$install_record.tmp"
    local current_pid=""

    if [[ ! -f "$install_record" ]]; then
        log_info "安装记录文件不存在,将由主安装脚本创建"
        return 0
    fi

    if ! command -v jq &> /dev/null; then
        log_warning "jq 命令不可用,无法更新安装记录文件"
        return 0
    fi

    current_pid=$(jq -r '.components."argus-agent".pid // ""' "$install_record" 2>/dev/null) || current_pid=""
    if [[ -z "$current_pid" ]]; then
        log_warning "无法读取当前 PID,跳过更新"
        return 1
    fi

    # Write to a temp file first and only replace the record when jq
    # succeeded, so a failed update never reports success or leaves a
    # stray temp file behind.
    if jq --arg new_pid "$pid" '.components."argus-agent".pid = $new_pid' "$install_record" > "$tmp_record" \
        && mv "$tmp_record" "$install_record"; then
        log_info "PID 已更新: $current_pid -> $pid"
        return 0
    fi

    rm -f "$tmp_record"
    log_warning "更新安装记录失败,保留原文件"
    return 1
}
|
||||
|
||||
# Print the post-install summary: installed paths, how to start the agent
# manually, and how to run the health check.
show_install_info() {
    log_success "Argus Agent 安装完成!"
    cat <<'EOF'

安装信息:
 二进制文件: /usr/local/bin/argus-agent
 运行用户: argus-agent
 配置目录: /etc/argus-agent/
 健康检查目录: /var/lib/argus-agent/health

使用方法:
 手动启动: /usr/local/bin/argus-agent
 后台启动: nohup /usr/local/bin/argus-agent &

健康检查:
 ./check_health.sh

EOF
}
|
||||
|
||||
# Installer entry point: print the banner, verify privileges and platform,
# then run the install steps in order and print the summary.
main() {
    printf '%s\n' \
        "==========================================" \
        " Argus Agent 安装脚本 v1.0" \
        "==========================================" \
        ""

    check_root
    check_system

    log_info "开始安装 Argus Agent..."

    local step
    for step in install_argus_agent create_user install_config start_argus_agent; do
        "$step"
    done

    show_install_info
}

# Run main only when the file is executed directly; do nothing when sourced.
[[ "${BASH_SOURCE[0]}" != "${0}" ]] || main "$@"
|
||||
87
src/metric/client-plugins/all-in-one-full/plugins/argus-agent/package.sh
Executable file
87
src/metric/client-plugins/all-in-one-full/plugins/argus-agent/package.sh
Executable file
@ -0,0 +1,87 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e

# Build a timestamped argus-agent-<YYYYmmdd-HHMMSS>.tar.gz from the current
# directory. Must be run from the plugin directory (the one that contains
# install.sh, uninstall.sh, check_health.sh and bin/argus-agent).

# Color definitions
GREEN='\033[0;32m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

log_info() {
    echo -e "${BLUE}[INFO]${NC} $1"
}

log_success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1"
}

# Directory the package is built from and written to
CURRENT_DIR=$(pwd)
PACKAGE_NAME="argus-agent-$(date +%Y%m%d-%H%M%S)"
PACKAGE_FILE="${PACKAGE_NAME}.tar.gz"

log_info "开始打包 Argus Agent 安装包..."

# Verify that everything the installer needs is present
log_info "检查必要文件..."

required_files=(
    "install.sh"
    "uninstall.sh"
    "bin/argus-agent"
    "check_health.sh"
)

missing_files=()
for file in "${required_files[@]}"; do
    if [[ ! -f "$file" ]]; then
        missing_files+=("$file")
    fi
done

if [[ ${#missing_files[@]} -gt 0 ]]; then
    echo "缺少以下文件:"
    for file in "${missing_files[@]}"; do
        echo " - $file"
    done
    exit 1
fi

log_success "所有必要文件检查完成"

# Stage the files in a temp directory; the trap removes it on every exit
# path, including failures under set -e.
TEMP_DIR=$(mktemp -d)
trap 'rm -rf -- "$TEMP_DIR"' EXIT
log_info "创建临时目录: $TEMP_DIR"

cp -r . "$TEMP_DIR/$PACKAGE_NAME"

# Archives from earlier runs live in the source directory; drop them from
# the staged copy so they are not nested into the new package.
rm -f -- "$TEMP_DIR/$PACKAGE_NAME"/argus-agent-*.tar.gz

# Create the archive (-C avoids changing the working directory)
log_info "创建压缩包: $PACKAGE_FILE"
tar -czf "$TEMP_DIR/$PACKAGE_FILE" -C "$TEMP_DIR" "$PACKAGE_NAME"

# Move the archive next to the sources
mv "$TEMP_DIR/$PACKAGE_FILE" "$CURRENT_DIR/"

# Clean up the staging directory
rm -rf "$TEMP_DIR"

# Report the result
log_success "打包完成!"
echo
echo "安装包文件: $PACKAGE_FILE"
echo "文件大小: $(du -h "$PACKAGE_FILE" | cut -f1)"
echo
echo "使用方法:"
echo "1. 将 $PACKAGE_FILE 传输到目标服务器"
echo "2. 解压: tar -xzf $PACKAGE_FILE"
echo "3. 进入目录: cd $PACKAGE_NAME"
echo "4. 运行安装: sudo ./install.sh"
echo
echo "注意: 请确保所有必要文件都存在"
|
||||
255
src/metric/client-plugins/all-in-one-full/plugins/argus-agent/uninstall.sh
Executable file
255
src/metric/client-plugins/all-in-one-full/plugins/argus-agent/uninstall.sh
Executable file
@ -0,0 +1,255 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Argus Agent 卸载脚本
|
||||
# 版本: 1.0
|
||||
# 作者: AIOps Team
|
||||
# 日期: $(date +%Y-%m-%d)
|
||||
|
||||
set -e
|
||||
|
||||
# 颜色定义
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
BLUE='\033[0;34m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
# Logging helpers: print "<color>[LEVEL]<reset> <message>" to stdout with
# backslash escapes expanded in every part (same as echo -e).
# Arguments: $1 - message text
log_info() {
    printf '%b[INFO]%b %b\n' "${BLUE}" "${NC}" "$1"
}

log_success() {
    printf '%b[SUCCESS]%b %b\n' "${GREEN}" "${NC}" "$1"
}

log_warning() {
    printf '%b[WARNING]%b %b\n' "${YELLOW}" "${NC}" "$1"
}

log_error() {
    printf '%b[ERROR]%b %b\n' "${RED}" "${NC}" "$1"
}
|
||||
|
||||
# Abort with exit status 1 unless the script runs with effective UID 0.
check_root() {
    if (( EUID == 0 )); then
        return 0
    fi

    log_error "此脚本需要 root 权限运行"
    log_info "请使用: sudo $0"
    exit 1
}
|
||||
|
||||
# Stop every running argus-agent process.
#
# Steps:
#   1. If the PID file names a live process, send TERM, wait 3 seconds and
#      KILL it if it is still there.
#   2. Find remaining processes by exact name, TERM them, wait 2 seconds,
#      then KILL whatever is left.
#   3. Remove the PID file.
# Processes are matched with pgrep -x (exact process name) rather than -f,
# so this script is not matched and killed when its own path contains
# "argus-agent". Signals are best-effort: a process that exits between the
# probe and the kill does not abort the uninstall.
stop_processes() {
    log_info "停止 Argus Agent 进程..."

    local pid_file="/var/run/argus-agent.pid"
    local stopped=false
    # Declared up front so the loops below do not leak variables.
    local pid="" pids="" remaining_pids=""

    # First try the PID recorded by the installer.
    if [[ -f "$pid_file" ]]; then
        pid=$(cat "$pid_file" 2>/dev/null || true)
        if [[ "$pid" =~ ^[0-9]+$ ]] && kill -0 "$pid" 2>/dev/null; then
            log_info "通过 PID 文件停止服务 (PID: $pid)..."
            kill "$pid" 2>/dev/null || true
            sleep 3

            if kill -0 "$pid" 2>/dev/null; then
                log_warning "进程未响应,强制终止..."
                kill -9 "$pid" 2>/dev/null || true
            fi
            log_success "Argus Agent 进程已停止"
            stopped=true
        else
            log_warning "PID 文件存在但进程已不存在,清理 PID 文件"
            rm -f "$pid_file"
        fi
    fi

    # Then sweep up anything still running under the agent's name.
    pids=$(pgrep -x "argus-agent" 2>/dev/null || true)
    if [[ -n "$pids" ]]; then
        log_info "发现 argus-agent 进程,正在停止..."
        for pid in $pids; do
            log_info "停止进程 PID: $pid"
            kill "$pid" 2>/dev/null || true
        done
        sleep 2

        remaining_pids=$(pgrep -x "argus-agent" 2>/dev/null || true)
        if [[ -n "$remaining_pids" ]]; then
            log_warning "进程未响应,强制终止..."
            for pid in $remaining_pids; do
                log_info "强制终止进程 PID: $pid"
                kill -9 "$pid" 2>/dev/null || true
            done
            sleep 1
        fi

        if pgrep -x "argus-agent" > /dev/null 2>&1; then
            log_error "无法停止所有 argus-agent 进程"
        else
            log_success "所有 Argus Agent 进程已停止"
            stopped=true
        fi
    else
        log_info "Argus Agent 进程未运行"
    fi

    rm -f "$pid_file"

    if [[ "$stopped" == "false" ]]; then
        log_warning "未发现需要停止的 Argus Agent 进程"
    fi
}
|
||||
|
||||
# Delete the installed argus-agent binary and report whether anything was
# actually removed.
remove_binary() {
    log_info "删除 Argus Agent 二进制文件..."

    local -a targets=("/usr/local/bin/argus-agent")
    local removed=0
    local path

    for path in "${targets[@]}"; do
        [[ -f "$path" ]] || continue
        rm -f "$path"
        log_success "二进制文件已删除: $path"
        removed=$((removed + 1))
    done

    if (( removed == 0 )); then
        log_info "二进制文件不存在"
    fi
}
|
||||
|
||||
# Remove the /etc/argus-agent configuration directory if it exists.
remove_config() {
    local config_dir="/etc/argus-agent"

    log_info "删除配置文件..."

    if [[ ! -d "$config_dir" ]]; then
        log_info "配置目录不存在"
        return
    fi

    rm -rf "$config_dir"
    log_success "配置目录已删除"
}
|
||||
|
||||
# Remove the /var/lib/argus-agent data directory if it exists.
remove_data_dir() {
    local data_dir="/var/lib/argus-agent"

    log_info "删除数据目录..."

    if [[ ! -d "$data_dir" ]]; then
        log_info "数据目录不存在"
        return
    fi

    rm -rf "$data_dir"
    log_success "数据目录已删除"
}
|
||||
|
||||
# Report whether the argus-agent account exists. The account is never
# deleted here; the user is only told how to remove it manually.
check_user_status() {
    log_info "检查 argus-agent 用户状态..."

    if ! id "argus-agent" &>/dev/null; then
        log_info "argus-agent 用户不存在"
        return
    fi

    log_info "检测到 argus-agent 用户存在"
    log_warning "argus-agent 是系统用户,可能被其他服务使用"
    log_info "为了系统稳定性,将保留 argus-agent 用户"
    log_info "如需手动删除,请运行: sudo userdel argus-agent"
}
|
||||
|
||||
# Delete the log file written by the install script.
cleanup_logs() {
    local log_file=/var/log/argus-agent.log

    log_info "清理日志文件..."
    rm -f "$log_file"
    log_success "日志文件已清理"
}
|
||||
|
||||
# Remove the argus-agent entry from the install record.
#
# Arguments:
#   $1 - optional install base directory that holds .install_record
#        (default /opt/argus-metric/current)
# Best effort: a missing record, a missing jq, or a failed rewrite only
# produces a log message; the function always returns 0 so the rest of the
# uninstall continues. The record is replaced only after jq succeeded, and
# the temp file is removed on failure.
cleanup_install_record() {
    log_info "清理安装记录..."

    local install_base_dir="${1:-/opt/argus-metric/current}"
    local install_record="$install_base_dir/.install_record"
    local tmp_record="$install_record.tmp"

    if [[ ! -f "$install_record" ]]; then
        log_info "安装记录文件不存在"
        return 0
    fi

    if ! command -v jq &> /dev/null; then
        log_warning "jq 命令不可用,无法清理安装记录"
        return 0
    fi

    if jq 'del(.components."argus-agent")' "$install_record" > "$tmp_record" 2>/dev/null \
        && mv "$tmp_record" "$install_record"; then
        log_success "安装记录已更新"
    else
        rm -f "$tmp_record"
        log_warning "清理安装记录失败,保留原文件"
    fi
    return 0
}
|
||||
|
||||
# Print the post-uninstall summary: what was removed and what was kept.
show_uninstall_info() {
    log_success "Argus Agent 卸载完成!"
    cat <<'EOF'

已删除的内容:
 - 二进制文件: /usr/local/bin/argus-agent
 - 配置目录: /etc/argus-agent
 - 数据目录: /var/lib/argus-agent
 - 相关日志文件

注意:
 - argus-agent 用户已保留(系统用户,可能被其他服务使用)
 - 如需完全清理,请手动检查并删除相关文件

EOF
}
|
||||
|
||||
# Uninstaller entry point.
#
# Prints the banner, requires root, asks for confirmation on stdin and, only
# when the answer is "y" or "Y", runs the removal steps in order. Any other
# answer cancels with exit status 0. When no answer can be read at all
# (stdin closed or empty), the uninstall is refused with an explicit error
# and exit status 1 instead of aborting silently under set -e.
main() {
    echo "=========================================="
    echo " Argus Agent 卸载脚本 v1.0"
    echo "=========================================="
    echo

    check_root

    log_warning "此操作将完全卸载 Argus Agent"

    local confirm=""
    # -r keeps backslashes literal; a final line without a trailing newline
    # still counts as an answer even though read reports EOF.
    if ! read -r -p "确认继续?(y/N): " confirm && [[ -z "$confirm" ]]; then
        log_error "无法读取确认输入,取消卸载操作"
        exit 1
    fi

    if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then
        log_info "取消卸载操作"
        exit 0
    fi

    log_info "开始卸载 Argus Agent..."

    stop_processes
    remove_binary
    remove_config
    remove_data_dir
    cleanup_logs
    cleanup_install_record

    # The account is kept; this step only reports its status.
    check_user_status

    show_uninstall_info
}
|
||||
|
||||
# Run main only when the file is executed directly; do nothing when sourced.
[[ "${BASH_SOURCE[0]}" != "${0}" ]] || main "$@"
|
||||
BIN
src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/bin/datacenter-gpu-manager_3.3.9_amd64.deb
(Stored with Git LFS)
Normal file
BIN
src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/bin/datacenter-gpu-manager_3.3.9_amd64.deb
(Stored with Git LFS)
Normal file
Binary file not shown.
BIN
src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/bin/dcgm-exporter
(Stored with Git LFS)
Executable file
BIN
src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/bin/dcgm-exporter
(Stored with Git LFS)
Executable file
Binary file not shown.
@ -0,0 +1,55 @@
|
||||
#!/bin/bash
|
||||
|
||||
# DCGM Exporter 健康检查脚本
|
||||
# 输出 JSON 格式结果
|
||||
|
||||
set -e
|
||||
|
||||
# Report the health of dcgm-exporter as one JSON object on stdout.
#
# Probes http://localhost:9400 and then its /metrics endpoint with curl;
# both must answer HTTP 200.
# Outputs:
#   {"name": "dcgm-exporter", "status": "health"|"unhealth", "reason": "..."}
# Exit status (the function exits, it does not return):
#   0 when healthy, 1 otherwise (including when curl is not installed).
check_health() {
    local url="http://localhost:9400"
    local metrics_url="$url/metrics"
    local name="dcgm-exporter"
    local status="unhealth"
    local reason=""
    local http_code=""
    local metrics_code=""
    local rc=1

    if ! command -v curl &> /dev/null; then
        reason="curl 命令不可用,无法进行健康检查"
    else
        # curl -w already prints "000" when the request fails, so the failure
        # is only swallowed here; "000" is substituted solely when curl
        # printed nothing at all. --max-time keeps a hung exporter from
        # blocking the health check forever.
        http_code=$(curl -s -o /dev/null --max-time 10 -w "%{http_code}" "$url" 2>/dev/null) || true
        http_code=${http_code:-000}

        if [[ "$http_code" != "200" ]]; then
            reason="HTTP 服务异常 (HTTP $http_code),请检查 DCGM Exporter 是否正在运行在端口 9400"
        else
            metrics_code=$(curl -s -o /dev/null --max-time 10 -w "%{http_code}" "$metrics_url" 2>/dev/null) || true
            metrics_code=${metrics_code:-000}

            if [[ "$metrics_code" == "200" ]]; then
                status="health"
                reason="success"
                rc=0
            else
                reason="Metrics 端点异常 (HTTP $metrics_code)"
            fi
        fi
    fi

    printf '{"name": "%s", "status": "%s", "reason": "%s"}\n' "$name" "$status" "$reason"
    exit "$rc"
}
|
||||
|
||||
# Entry point: hands control to check_health.
main() {
    check_health
}

# Run main only when the file is executed directly; do nothing when sourced.
[[ "${BASH_SOURCE[0]}" != "${0}" ]] || main "$@"
|
||||
@ -0,0 +1,77 @@
|
||||
# Format
|
||||
# If line starts with a '#' it is considered a comment
|
||||
# DCGM FIELD, Prometheus metric type, help message
|
||||
|
||||
# Clocks
|
||||
DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz).
|
||||
DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz).
|
||||
# DCGM_EXP_CLOCK_EVENTS_COUNT, gauge, Count of clock events within the user-specified time window (see clock-events-count-window-size param).
|
||||
|
||||
# Temperature
|
||||
DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C).
|
||||
DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C).
|
||||
|
||||
# Power
|
||||
DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W).
|
||||
DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ).
|
||||
|
||||
# PCIE
|
||||
DCGM_FI_PROF_PCIE_TX_BYTES, counter, Total number of bytes transmitted through PCIe TX via NVML.
|
||||
DCGM_FI_PROF_PCIE_RX_BYTES, counter, Total number of bytes received through PCIe RX via NVML.
|
||||
DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries.
|
||||
|
||||
# Utilization (the sample period varies depending on the product)
|
||||
DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %).
|
||||
DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %).
|
||||
DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %).
|
||||
DCGM_FI_DEV_DEC_UTIL , gauge, Decoder utilization (in %).
|
||||
|
||||
# Errors and violations
|
||||
DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered.
|
||||
# DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us).
|
||||
# DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us).
|
||||
# DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us).
|
||||
# DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us).
|
||||
# DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us).
|
||||
# DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us).
|
||||
# DCGM_EXP_XID_ERRORS_COUNT, gauge, Count of XID Errors within user-specified time window (see xid-count-window-size param).
|
||||
# Memory usage
|
||||
DCGM_FI_DEV_FB_FREE, gauge, Frame buffer memory free (in MB).
|
||||
DCGM_FI_DEV_FB_USED, gauge, Frame buffer memory used (in MB).
|
||||
|
||||
# ECC
|
||||
# DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors.
|
||||
# DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors.
|
||||
# DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors.
|
||||
# DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors.
|
||||
|
||||
# Retired pages
|
||||
# DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors.
|
||||
# DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors.
|
||||
# DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement.
|
||||
|
||||
# NVLink
|
||||
# DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors.
|
||||
# DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors.
|
||||
# DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries.
|
||||
# DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors.
|
||||
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes
|
||||
|
||||
# VGPU License status
|
||||
DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status
|
||||
|
||||
# Remapped rows
|
||||
DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors
|
||||
DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors
|
||||
DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed
|
||||
|
||||
# Static configuration information. These appear as labels on the other metrics
|
||||
DCGM_FI_DRIVER_VERSION, label, Driver Version
|
||||
# DCGM_FI_NVML_VERSION, label, NVML Version
|
||||
# DCGM_FI_DEV_BRAND, label, Device Brand
|
||||
# DCGM_FI_DEV_SERIAL, label, Device Serial Number
|
||||
# DCGM_FI_DEV_OEM_INFOROM_VER, label, OEM inforom version
|
||||
# DCGM_FI_DEV_ECC_INFOROM_VER, label, ECC inforom version
|
||||
# DCGM_FI_DEV_POWER_INFOROM_VER, label, Power management object inforom version
|
||||
# DCGM_FI_DEV_INFOROM_IMAGE_VER, label, Inforom image version
|
||||
# DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device
|
||||
|
365
src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/install.sh
Executable file
365
src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/install.sh
Executable file
@ -0,0 +1,365 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
# 颜色定义
|
||||
# ANSI colour escape sequences consumed by the log helpers below.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # resets the colour

# Shared formatter: prints "<colour>[<TAG>]<reset> <message>" on stdout.
#   $1 - colour escape, $2 - tag text, $3 - message
_log_line() {
    echo -e "${1}[${2}]${NC} ${3}"
}

# Informational message.
log_info() {
    _log_line "$BLUE" "INFO" "$1"
}

# Message reporting a completed step.
log_success() {
    _log_line "$GREEN" "SUCCESS" "$1"
}

# Warning message.
log_warning() {
    _log_line "$YELLOW" "WARNING" "$1"
}

# Error message (printing only; the caller decides whether to exit).
log_error() {
    _log_line "$RED" "ERROR" "$1"
}
|
||||
|
||||
# 更新安装记录
|
||||
# Update the PID stored for the "dcgm-exporter" component in the install record.
#
# Arguments:
#   $1 - PID of the freshly started dcgm-exporter process
#   $2 - install base directory; /opt/argus-metric/current is used when this
#        argument is missing or empty
# Returns:
#   0 - record updated, or nothing to do (record file absent / jq unavailable)
#   1 - the existing PID could not be read, or rewriting the record failed
update_install_record() {
    local pid="$1"
    local install_base_dir="${2:-/opt/argus-metric/current}"
    local install_record="$install_base_dir/.install_record"
    local tmp_record="$install_record.tmp"
    local current_pid

    # No record file yet: first installation, the main installer creates it.
    if [[ ! -f "$install_record" ]]; then
        log_info "安装记录文件不存在,将由主安装脚本创建"
        return 0
    fi

    # The record exists (restart scenario); only the pid field is touched.
    if ! command -v jq &> /dev/null; then
        log_warning "jq 命令不可用,无法更新安装记录文件"
        return 0
    fi

    # Declaration and assignment are split so a jq failure is not masked.
    current_pid=$(jq -r '.components."dcgm-exporter".pid // ""' "$install_record" 2>/dev/null) || current_pid=""
    if [[ -z "$current_pid" ]]; then
        log_warning "无法读取当前 PID,跳过更新"
        return 1
    fi

    # Write to a temp file first and only replace the record when jq succeeded;
    # the pid stays a JSON string and all other fields are preserved.
    if jq --arg new_pid "$pid" '.components."dcgm-exporter".pid = $new_pid' "$install_record" > "$tmp_record" \
        && mv "$tmp_record" "$install_record"; then
        log_info "PID 已更新: $current_pid -> $pid"
        return 0
    fi

    rm -f "$tmp_record"
    log_warning "安装记录文件更新失败: $install_record"
    return 1
}
|
||||
|
||||
# 显示帮助信息
|
||||
# Print the usage text to stdout; $0 is expanded into the usage and example lines.
show_help() {
    cat <<EOF
DCGM Exporter 安装脚本

用法: $0 [选项]

选项:
 --help 显示此帮助信息

示例:
 $0 # 安装 DCGM Exporter

EOF
}
|
||||
|
||||
# 解析命令行参数
|
||||
# Command-line parsing.
#   --help / -h       print usage and exit 0
#   any other --xxx   unknown option: report, print usage, exit 1
#   anything else     taken as the install directory (the last one wins)
INSTALL_DIR=""
for arg in "$@"; do
    case "$arg" in
        --help|-h)
            show_help
            exit 0
            ;;
        --*)
            log_error "未知参数: $arg"
            show_help
            exit 1
            ;;
        *)
            INSTALL_DIR="$arg"
            ;;
    esac
done
|
||||
|
||||
# 检查是否为 root 用户
|
||||
# Exit with status 1 unless the script is running with effective UID 0.
check_root() {
    [[ $EUID -eq 0 ]] && return 0

    log_error "此脚本需要 root 权限运行"
    log_info "请使用: sudo $0"
    exit 1
}
|
||||
|
||||
# 检查系统要求
|
||||
# Report the host environment before installing.
# Exits 1 when /etc/os-release is missing; a non Ubuntu/Debian ID or a missing
# nvidia-smi only produce warnings. Sourcing os-release defines NAME, VERSION
# and ID (among others) in the script's global scope.
check_system() {
    log_info "检查系统要求..."

    # Operating system detection
    [[ -f /etc/os-release ]] || {
        log_error "无法检测操作系统版本"
        exit 1
    }

    source /etc/os-release
    log_info "检测到操作系统: $NAME $VERSION"

    # The script targets Ubuntu/Debian; anything else is only warned about.
    case "$ID" in
        ubuntu|debian) ;;
        *) log_warning "此脚本主要针对 Ubuntu/Debian 系统,其他系统可能需要调整" ;;
    esac

    # NVIDIA driver presence; prints the name of the first GPU when available.
    if command -v nvidia-smi &> /dev/null; then
        log_success "检测到 NVIDIA GPU"
        nvidia-smi --query-gpu=name --format=csv,noheader,nounits | head -1
    else
        log_warning "未检测到 nvidia-smi,请确保已安装 NVIDIA 驱动"
    fi
}
|
||||
|
||||
# 安装 DCGM 依赖
|
||||
# Install the bundled DCGM .deb (path is relative to the current directory).
#
# When the first dpkg run fails, apt is asked to repair dependencies and the
# package is installed again. Refreshing the package index is best-effort so
# that an offline host can still try the repair with its cached index.
# Exits 1 when the package file is missing or the installation cannot be
# completed.
install_dcgm_dependency() {
    log_info "安装 DCGM 依赖..."

    local deb_file="bin/datacenter-gpu-manager_3.3.9_amd64.deb"

    if [[ ! -f "$deb_file" ]]; then
        log_error "找不到 DCGM 依赖文件: $deb_file"
        exit 1
    fi

    if ! dpkg -i "$deb_file"; then
        log_warning "dpkg 安装失败,尝试使用 apt 修复依赖..."
        # A failed index refresh must not abort the repair attempt.
        apt-get update || log_warning "apt-get update 失败,继续使用本地缓存修复依赖"
        # Fail explicitly instead of relying on `set -e` being active.
        if ! apt-get install -f -y || ! dpkg -i "$deb_file"; then
            log_error "DCGM 依赖安装失败"
            exit 1
        fi
    fi

    log_success "DCGM 依赖安装完成"
}
|
||||
|
||||
# 检查 DCGM 服务状态
|
||||
# Report whether the DCGM host engine is running and, if so, whether
# `dcgmi discovery -l` succeeds. Never fails: every finding is only logged.
check_dcgm_service() {
    log_info "检查 DCGM 服务状态..."

    local running=false

    # Either the systemd unit is active or an nv-hostengine process exists.
    # -x matches the exact process name, so unrelated commands that merely
    # mention "nv-hostengine" in their arguments are not counted.
    if systemctl is-active --quiet dcgm 2>/dev/null; then
        log_success "DCGM 服务已在运行"
        running=true
    elif pgrep -x nv-hostengine > /dev/null; then
        log_success "nv-hostengine 进程已在运行"
        running=true
    else
        log_warning "DCGM 服务未运行,需要手动启动"
        log_info "启动 DCGM 服务的方法:"
        log_info " 1. 使用 systemd: sudo systemctl start dcgm"
        log_info " 2. 手动启动: nohup nv-hostengine > /var/log/nv-hostengine.log 2>&1 &"
    fi

    # Connection test, only meaningful when the engine is up.
    if [[ "$running" == true ]]; then
        log_info "测试 DCGM 连接..."
        if dcgmi discovery -l > /dev/null 2>&1; then
            log_success "DCGM 连接测试成功"
        else
            log_warning "DCGM 连接测试失败,请检查服务状态"
        fi
    fi
}
|
||||
|
||||
# 停止可能运行的服务
|
||||
# Stop any dcgm-exporter instance before its binary is replaced.
#
# Two passes: first the process named in /var/run/dcgm-exporter.pid, then
# every process whose command line matches /usr/local/bin/dcgm-exporter.
# Each process gets SIGTERM first and SIGKILL only if it is still alive
# after a short wait. All kill failures are ignored.
stop_existing_service() {
    log_info "检查并停止可能运行的服务..."

    local pid_file="/var/run/dcgm-exporter.pid"
    local pid pids

    # Pass 1: service tracked through the PID file.
    if [[ -f "$pid_file" ]]; then
        pid=$(cat "$pid_file" 2>/dev/null) || pid=""
        # Only a positive integer may reach kill: "0" or a negative value
        # would address a whole process group instead of one process.
        if [[ "$pid" =~ ^[1-9][0-9]*$ ]] && kill -0 "$pid" 2>/dev/null; then
            log_info "发现正在运行的 DCGM Exporter 服务 (PID: $pid),正在停止..."
            kill "$pid" > /dev/null 2>&1 || true
            sleep 2
            if kill -0 "$pid" 2>/dev/null; then
                log_warning "进程未响应,强制终止..."
                kill -9 "$pid" > /dev/null 2>&1 || true
            fi
            rm -f "$pid_file"
            log_success "服务已停止"
        else
            log_warning "发现过期的 PID 文件,正在清理..."
            rm -f "$pid_file"
        fi
    fi

    # Pass 2: any remaining exporter processes (never this script itself).
    local exporter_bin="/usr/local/bin/dcgm-exporter"
    pids=$(pgrep -f "$exporter_bin") || pids=""

    if [[ -n "$pids" ]]; then
        log_info "发现其他 dcgm-exporter 进程,正在停止..."
        for pid in $pids; do
            if [[ "$pid" != "$$" ]]; then
                kill "$pid" > /dev/null 2>&1 || true
                sleep 1
                if kill -0 "$pid" 2>/dev/null; then
                    log_warning "进程 $pid 未响应,强制终止..."
                    kill -9 "$pid" > /dev/null 2>&1 || true
                fi
            fi
        done
        log_success "所有 dcgm-exporter 进程已停止"
    fi
}
|
||||
|
||||
# 安装 DCGM Exporter 二进制文件
|
||||
# Copy bin/dcgm-exporter (relative to the current directory) into
# /usr/local/bin and mark it executable. Running instances are stopped first.
# Exits 1 when the bundled binary is missing.
install_dcgm_exporter() {
    log_info "安装 DCGM Exporter..."

    local src_binary="bin/dcgm-exporter"
    local dest_dir="/usr/local/bin"

    [[ -f "$src_binary" ]] || {
        log_error "找不到 DCGM Exporter 二进制文件: $src_binary"
        exit 1
    }

    # The old binary may be in use; stop it before overwriting.
    stop_existing_service

    cp "$src_binary" "$dest_dir/"
    chmod +x "$dest_dir/dcgm-exporter"

    log_success "DCGM Exporter 二进制文件安装完成"
}
|
||||
|
||||
# 安装配置文件
|
||||
# Create /etc/dcgm-exporter and copy config/default-counters.csv into it.
# A missing CSV is not an error: only a warning is logged.
install_config() {
    log_info "安装配置文件..."

    local target_dir="/etc/dcgm-exporter"
    local counters_csv="config/default-counters.csv"

    mkdir -p "$target_dir"

    if [[ ! -f "$counters_csv" ]]; then
        log_warning "未找到配置文件,使用默认配置"
        return
    fi

    cp "$counters_csv" "$target_dir/"
    log_success "配置文件安装完成"
}
|
||||
|
||||
# 启动 DCGM Exporter 服务
|
||||
# Start /usr/local/bin/dcgm-exporter in the background on port 9400.
#
# Output goes to /var/log/dcgm-exporter.log and the PID is written to
# /var/run/dcgm-exporter.pid. Reads the global INSTALL_DIR to locate the
# install record.
# Returns:
#   0 - already running, or started successfully
#   1 - port 9400 is already in use, or the process died within 2 seconds
start_dcgm_exporter() {
    log_info "启动 DCGM Exporter 服务..."

    local binary_path="/usr/local/bin/dcgm-exporter"
    local log_file="/var/log/dcgm-exporter.log"
    local pid_file="/var/run/dcgm-exporter.pid"
    local pid=""
    local listeners=""

    # Already running? Only a positive integer from the PID file is trusted.
    if [[ -f "$pid_file" ]]; then
        pid=$(cat "$pid_file" 2>/dev/null) || pid=""
        if [[ "$pid" =~ ^[1-9][0-9]*$ ]] && kill -0 "$pid" 2>/dev/null; then
            log_info "DCGM Exporter 服务已在运行 (PID: $pid)"
            return 0
        fi
        log_warning "发现过期的 PID 文件,正在清理..."
        rm -f "$pid_file"
    fi

    # Port check. netstat is not installed everywhere, so fall back to ss
    # rather than silently skipping the check.
    if command -v netstat &> /dev/null; then
        listeners=$(netstat -tuln 2>/dev/null) || listeners=""
    elif command -v ss &> /dev/null; then
        listeners=$(ss -tuln 2>/dev/null) || listeners=""
    fi
    if grep -q ":9400 " <<< "$listeners"; then
        log_warning "端口 9400 已被占用,请检查是否有其他服务在运行"
        return 1
    fi

    log_info "正在启动 DCGM Exporter..."
    nohup "$binary_path" --address=:9400 > "$log_file" 2>&1 &
    pid=$!

    echo "$pid" > "$pid_file"

    # Give the process a moment, then verify that it is still alive.
    sleep 2

    if kill -0 "$pid" 2>/dev/null; then
        log_success "DCGM Exporter 服务启动成功 (PID: $pid)"
        log_info "日志文件: $log_file"
        log_info "PID 文件: $pid_file"

        # The exporter is already running at this point; a failed record
        # update must not abort the installation under `set -e`.
        update_install_record "$pid" "$INSTALL_DIR" || log_warning "安装记录更新失败,请手动检查"
    else
        log_error "DCGM Exporter 服务启动失败"
        rm -f "$pid_file"
        return 1
    fi
}
|
||||
|
||||
|
||||
|
||||
# 显示安装信息
|
||||
# Print the post-install summary: installed paths, how to start the services
# and how to test the metrics endpoint.
show_install_info() {
    log_success "DCGM Exporter 安装完成!"
    cat <<'EOF'

安装信息:
 二进制文件: /usr/local/bin/dcgm-exporter
 配置文件: /etc/dcgm-exporter/default-counters.csv
 默认端口: 9400

使用方法:
 1. 启动 DCGM 服务:
 sudo systemctl start dcgm
 或: nohup nv-hostengine > /var/log/nv-hostengine.log 2>&1 &
 2. 启动 DCGM Exporter:
 /usr/local/bin/dcgm-exporter --address=:9400
 或: nohup /usr/local/bin/dcgm-exporter --address=:9400 &

测试连接:
 curl http://localhost:9400/metrics

EOF
}
|
||||
|
||||
# 主函数
|
||||
# Installer entry point: banner, pre-flight checks, then the install steps
# in a fixed order.
main() {
    local step

    printf '%s\n' \
        "==========================================" \
        " DCGM Exporter 安装脚本 v1.0" \
        "==========================================" \
        ""

    check_root
    check_system

    log_info "开始安装 DCGM Exporter..."

    # Order matters: the dependency must be in place before the exporter is
    # installed and started.
    for step in \
        install_dcgm_dependency \
        check_dcgm_service \
        install_dcgm_exporter \
        install_config \
        start_dcgm_exporter \
        show_install_info; do
        "$step"
    done
}
|
||||
|
||||
# 脚本入口
|
||||
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
|
||||
main "$@"
|
||||
fi
|
||||
88
src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/package.sh
Executable file
88
src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/package.sh
Executable file
@ -0,0 +1,88 @@
|
||||
#!/bin/bash
#
# Build a timestamped installer archive (dcgm-exporter-YYYYmmdd-HHMMSS.tar.gz)
# from the current directory. Must be run from the plugin directory that
# contains install.sh, uninstall.sh, check_health.sh and bin/.

set -e

# ANSI colours used by the log helpers
GREEN='\033[0;32m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

log_info() {
    echo -e "${BLUE}[INFO]${NC} $1"
}

log_success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1"
}

# The archive is created next to the sources, named after the current time.
CURRENT_DIR=$(pwd)
PACKAGE_NAME="dcgm-exporter-$(date +%Y%m%d-%H%M%S)"
PACKAGE_FILE="${PACKAGE_NAME}.tar.gz"

log_info "开始打包 DCGM Exporter 安装包..."

# Verify that everything the installer needs is present.
log_info "检查必要文件..."

required_files=(
    "install.sh"
    "uninstall.sh"
    "bin/dcgm-exporter"
    "bin/datacenter-gpu-manager_3.3.9_amd64.deb"
    "check_health.sh"
)

missing_files=()
for file in "${required_files[@]}"; do
    if [[ ! -f "$file" ]]; then
        missing_files+=("$file")
    fi
done

if [[ ${#missing_files[@]} -gt 0 ]]; then
    echo "缺少以下文件:"
    for file in "${missing_files[@]}"; do
        echo " - $file"
    done
    exit 1
fi

log_success "所有必要文件检查完成"

# Stage the files in a temp dir that is removed on every exit path,
# including failures of cp/tar/mv under `set -e`.
TEMP_DIR=$(mktemp -d)
trap 'rm -rf -- "$TEMP_DIR"' EXIT
log_info "创建临时目录: $TEMP_DIR"

cp -r . "$TEMP_DIR/$PACKAGE_NAME"

# Archives left over from earlier runs live in this directory too; keep them
# out of the new package so it does not grow with every run.
rm -f -- "$TEMP_DIR/$PACKAGE_NAME"/dcgm-exporter-*.tar.gz

# Create the archive inside the temp dir, then move it next to the sources.
log_info "创建压缩包: $PACKAGE_FILE"
tar -czf "$TEMP_DIR/$PACKAGE_FILE" -C "$TEMP_DIR" "$PACKAGE_NAME"
mv "$TEMP_DIR/$PACKAGE_FILE" "$CURRENT_DIR/"

# Summary
log_success "打包完成!"
echo
echo "安装包文件: $PACKAGE_FILE"
echo "文件大小: $(du -h "$PACKAGE_FILE" | cut -f1)"
echo
echo "使用方法:"
echo "1. 将 $PACKAGE_FILE 传输到目标服务器"
echo "2. 解压: tar -xzf $PACKAGE_FILE"
echo "3. 进入目录: cd $PACKAGE_NAME"
echo "4. 运行安装: sudo ./install.sh"
echo
echo "注意: 请确保 config/default-counters.csv 文件存在"
||||
216
src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/uninstall.sh
Executable file
216
src/metric/client-plugins/all-in-one-full/plugins/dcgm-exporter/uninstall.sh
Executable file
@ -0,0 +1,216 @@
|
||||
#!/bin/bash
|
||||
|
||||
# DCGM Exporter 卸载脚本
|
||||
# 版本: 1.0
|
||||
# 作者: AIOps Team
|
||||
# 日期: $(date +%Y-%m-%d)
|
||||
|
||||
set -e
|
||||
|
||||
# 颜色定义
|
||||
# ANSI colour escape sequences consumed by the log helpers below.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # resets the colour

# Shared formatter: prints "<colour>[<TAG>]<reset> <message>" on stdout.
#   $1 - colour escape, $2 - tag text, $3 - message
_log_line() {
    echo -e "${1}[${2}]${NC} ${3}"
}

# Informational message.
log_info() {
    _log_line "$BLUE" "INFO" "$1"
}

# Message reporting a completed step.
log_success() {
    _log_line "$GREEN" "SUCCESS" "$1"
}

# Warning message.
log_warning() {
    _log_line "$YELLOW" "WARNING" "$1"
}

# Error message (printing only; the caller decides whether to exit).
log_error() {
    _log_line "$RED" "ERROR" "$1"
}
|
||||
|
||||
# 检查是否为 root 用户
|
||||
# Exit with status 1 unless the script is running with effective UID 0.
check_root() {
    [[ $EUID -eq 0 ]] && return 0

    log_error "此脚本需要 root 权限运行"
    log_info "请使用: sudo $0"
    exit 1
}
|
||||
|
||||
# 停止运行中的进程
|
||||
# Stop every running dcgm-exporter process and remove the PID file.
#
# Pass 1 stops the process named in /var/run/dcgm-exporter.pid; pass 2 stops
# all processes whose *name* is exactly dcgm-exporter. Processes get SIGTERM
# first and SIGKILL only if they are still alive after a short wait.
#
# Matching is done with `pgrep -x` on purpose: this script normally lives in
# a directory called "dcgm-exporter...", so a full-command-line match
# (`pgrep -f`) could select the uninstall script itself and kill it.
stop_processes() {
    log_info "停止 DCGM Exporter 进程..."

    local pid_file="/var/run/dcgm-exporter.pid"
    local stopped=false
    local pid pids remaining_pids

    # Pass 1: the process recorded in the PID file.
    if [[ -f "$pid_file" ]]; then
        pid=$(cat "$pid_file" 2>/dev/null) || pid=""
        # Only a positive integer may reach kill ("0"/negative values would
        # address a whole process group).
        if [[ "$pid" =~ ^[1-9][0-9]*$ ]] && kill -0 "$pid" 2>/dev/null; then
            log_info "通过 PID 文件停止服务 (PID: $pid)..."
            # The process may exit between the check and the signal; that
            # race must not abort the script under `set -e`.
            kill "$pid" 2>/dev/null || true
            sleep 3

            if kill -0 "$pid" 2>/dev/null; then
                log_warning "进程未响应,强制终止..."
                kill -9 "$pid" 2>/dev/null || true
            fi
            log_success "DCGM Exporter 进程已停止"
            stopped=true
        else
            log_warning "PID 文件存在但进程已不存在,清理 PID 文件"
            rm -f "$pid_file"
        fi
    fi

    # Pass 2: any remaining exporter processes, matched by exact name.
    pids=$(pgrep -x "dcgm-exporter" 2>/dev/null || true)
    if [[ -n "$pids" ]]; then
        log_info "发现 dcgm-exporter 进程,正在停止..."
        for pid in $pids; do
            log_info "停止进程 PID: $pid"
            kill "$pid" 2>/dev/null || true
        done
        sleep 2

        # Escalate to SIGKILL for whatever survived SIGTERM.
        remaining_pids=$(pgrep -x "dcgm-exporter" 2>/dev/null || true)
        if [[ -n "$remaining_pids" ]]; then
            log_warning "进程未响应,强制终止..."
            for pid in $remaining_pids; do
                log_info "强制终止进程 PID: $pid"
                kill -9 "$pid" 2>/dev/null || true
            done
            sleep 1
        fi

        # Final verification
        if pgrep -x "dcgm-exporter" > /dev/null; then
            log_error "无法停止所有 dcgm-exporter 进程"
        else
            log_success "所有 DCGM Exporter 进程已停止"
            stopped=true
        fi
    else
        log_info "DCGM Exporter 进程未运行"
    fi

    rm -f "$pid_file"

    if [[ "$stopped" == "false" ]]; then
        log_warning "未发现需要停止的 DCGM Exporter 进程"
    fi
}
|
||||
|
||||
# 删除二进制文件
|
||||
# Delete /usr/local/bin/dcgm-exporter if it exists.
remove_binary() {
    log_info "删除 DCGM Exporter 二进制文件..."

    local exporter_path="/usr/local/bin/dcgm-exporter"

    if [[ ! -f "$exporter_path" ]]; then
        log_info "二进制文件不存在"
        return
    fi

    rm -f "$exporter_path"
    log_success "二进制文件已删除"
}
|
||||
|
||||
# 删除配置文件
|
||||
# Delete the /etc/dcgm-exporter configuration directory if it exists.
remove_config() {
    log_info "删除配置文件..."

    local exporter_conf_dir="/etc/dcgm-exporter"

    if [[ ! -d "$exporter_conf_dir" ]]; then
        log_info "配置目录不存在"
        return
    fi

    rm -rf "$exporter_conf_dir"
    log_success "配置目录已删除"
}
|
||||
|
||||
# 卸载 DCGM 依赖(可选)
|
||||
# Report whether the DCGM package is installed. Nothing is removed: the
# package is a system-level dependency and is deliberately kept.
#
# Only packages in dpkg state "ii" (installed) whose name starts with
# datacenter-gpu-manager count; removed packages with leftover config ("rc")
# and unrelated packages that merely mention the name in their description
# are not reported as installed.
remove_dcgm_dependency() {
    log_info "检查 DCGM 依赖状态..."

    if dpkg -l 2>/dev/null | grep -Eq '^ii[[:space:]]+datacenter-gpu-manager'; then
        log_info "检测到 DCGM 依赖包已安装"
        log_warning "DCGM 是系统级依赖,可能被其他应用程序使用"
        log_info "为了系统稳定性,将保留 DCGM 依赖包"
        log_info "如需手动卸载,请运行: sudo apt-get remove --purge datacenter-gpu-manager"
    else
        log_info "DCGM 依赖包未安装"
    fi
}
|
||||
|
||||
# 清理日志文件
|
||||
# Remove the log files written by the exporter and by a manually started
# nv-hostengine.
#
# The systemd journal is intentionally left alone: `journalctl --vacuum-time`
# is not scoped to one service, it discards archived journal data of the
# whole host, which an uninstall of a single component must never do.
cleanup_logs() {
    log_info "清理日志文件..."

    rm -f /var/log/nv-hostengine.log
    rm -f /var/log/dcgm-exporter.log

    log_success "日志文件已清理"
}
|
||||
|
||||
# 显示卸载信息
|
||||
# Print the post-uninstall summary: what was removed and what may remain.
show_uninstall_info() {
    log_success "DCGM Exporter 卸载完成!"
    cat <<'EOF'

已删除的内容:
 - 二进制文件: /usr/local/bin/dcgm-exporter
 - 配置目录: /etc/dcgm-exporter
 - 相关日志文件

注意:
 - DCGM 依赖包可能仍然存在
 - 如需完全清理,请手动检查并删除相关文件

EOF
}
|
||||
|
||||
# 主函数
|
||||
# Uninstaller entry point: banner, root check, interactive confirmation,
# then the removal steps in a fixed order.
#
# Anything other than "y"/"Y" (including no input at all, e.g. stdin closed
# in a non-interactive run) cancels the uninstall with exit status 0.
main() {
    local confirm=""

    echo "=========================================="
    echo " DCGM Exporter 卸载脚本 v1.0"
    echo "=========================================="
    echo

    check_root

    log_warning "此操作将完全卸载 DCGM Exporter"
    # -r keeps backslashes literal. read returns non-zero at EOF; without the
    # guard `set -e` would terminate the script silently at this point.
    read -r -p "确认继续?(y/N): " confirm || true

    if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then
        log_info "取消卸载操作"
        exit 0
    fi

    log_info "开始卸载 DCGM Exporter..."

    stop_processes
    remove_binary
    remove_config
    cleanup_logs

    # Only reports the state of the DCGM package; it is never removed here.
    remove_dcgm_dependency

    show_uninstall_info
}
|
||||
|
||||
# 脚本入口
|
||||
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
|
||||
main "$@"
|
||||
fi
|
||||
@ -0,0 +1,181 @@
|
||||
# Fluent Bit 安装包
|
||||
|
||||
这是一个 Fluent Bit 的自动化安装包,提供了完整的安装、卸载和健康检查功能。
|
||||
|
||||
## 目录结构
|
||||
|
||||
```
|
||||
fluent-bit-installer/
|
||||
├── install.sh # 安装脚本
|
||||
├── uninstall.sh # 卸载脚本
|
||||
├── package.sh # 打包脚本
|
||||
├── check_health.sh # 健康检查脚本
|
||||
├── bin/
|
||||
│ └── fluent-bit_3.1.9_amd64.deb # Fluent Bit 安装包
|
||||
└── config/
|
||||
├── fluent-bit.conf # 主配置文件
|
||||
├── inject_labels.lua # Lua 脚本
|
||||
├── parsers.conf # 解析器配置
|
||||
├── inputs.d/ # 输入配置目录
|
||||
│ ├── 10-train.conf
|
||||
│ └── 20-infer.conf
|
||||
└── outputs.d/ # 输出配置目录
|
||||
└── 10-es.conf
|
||||
```
|
||||
|
||||
## 功能特性
|
||||
|
||||
- **自动化安装**: 一键安装 Fluent Bit 及其依赖
|
||||
- **配置管理**: 自动部署预配置的配置文件
|
||||
- **服务管理**: 自动启动和停止 Fluent Bit 服务
|
||||
- **健康检查**: 提供 JSON 格式的健康状态检查
|
||||
- **完整卸载**: 彻底清理所有相关文件和配置
|
||||
- **用户管理**: 自动创建专用的 fluent-bit 用户
|
||||
|
||||
## 使用方法
|
||||
|
||||
### 1. 打包安装包
|
||||
|
||||
```bash
|
||||
./package.sh
|
||||
```
|
||||
|
||||
这将创建一个带时间戳的压缩包,例如:`fluent-bit-installer-20250924-160954.tar.gz`
|
||||
|
||||
### 2. 安装 Fluent Bit
|
||||
|
||||
```bash
|
||||
# 解压安装包
|
||||
tar -xzf fluent-bit-installer-*.tar.gz
|
||||
cd fluent-bit-installer-*
|
||||
|
||||
# 运行安装脚本(需要 root 权限)
|
||||
sudo ./install.sh
|
||||
```
|
||||
|
||||
### 3. 健康检查
|
||||
|
||||
```bash
|
||||
./check_health.sh
|
||||
```
|
||||
|
||||
输出示例:
|
||||
```json
|
||||
{"name": "fluent-bit", "status": "health", "reason": "success"}
|
||||
```
|
||||
|
||||
### 4. 卸载 Fluent Bit
|
||||
|
||||
```bash
|
||||
sudo ./uninstall.sh
|
||||
```
|
||||
|
||||
## 安装后的文件位置
|
||||
|
||||
- **二进制文件**: `/opt/fluent-bit/bin/fluent-bit`
|
||||
- **配置文件**: `/etc/fluent-bit/`
|
||||
- **日志文件**: `/var/log/fluent-bit.log`
|
||||
- **缓冲区目录**: `/buffers/`
|
||||
- **运行用户**: `fluent-bit`
|
||||
- **HTTP 端口**: `2020`
|
||||
|
||||
## 配置说明
|
||||
|
||||
### 主配置文件
|
||||
|
||||
主配置文件位于 `/etc/fluent-bit/fluent-bit.conf`,包含以下主要部分:
|
||||
|
||||
- **SERVICE**: 服务配置,包括 HTTP 服务器设置
|
||||
- **INPUT**: 输入配置,通过 `inputs.d/` 目录管理
|
||||
- **FILTER**: 过滤器配置,包括解析器和标签注入
|
||||
- **OUTPUT**: 输出配置,通过 `outputs.d/` 目录管理
|
||||
|
||||
### 输入配置
|
||||
|
||||
- `10-train.conf`: 训练日志输入配置
|
||||
- `20-infer.conf`: 推理日志输入配置
|
||||
|
||||
### 输出配置
|
||||
|
||||
- `10-es.conf`: Elasticsearch 输出配置
|
||||
|
||||
## 服务管理
|
||||
|
||||
### 手动启动
|
||||
|
||||
```bash
|
||||
/opt/fluent-bit/bin/fluent-bit --config=/etc/fluent-bit/fluent-bit.conf
|
||||
```
|
||||
|
||||
### 后台启动
|
||||
|
||||
```bash
|
||||
nohup /opt/fluent-bit/bin/fluent-bit --config=/etc/fluent-bit/fluent-bit.conf &
|
||||
```
|
||||
|
||||
### 检查服务状态
|
||||
|
||||
```bash
|
||||
# 检查进程
|
||||
ps aux | grep fluent-bit
|
||||
|
||||
# 检查端口
|
||||
netstat -tuln | grep 2020
|
||||
|
||||
# 检查日志
|
||||
tail -f /var/log/fluent-bit.log
|
||||
```
|
||||
|
||||
## API 接口
|
||||
|
||||
Fluent Bit 提供 HTTP API 用于监控和管理:
|
||||
|
||||
- **根路径**: `http://localhost:2020`
|
||||
- **状态接口**: `http://localhost:2020/api/v1/status`
|
||||
- **指标接口**: `http://localhost:2020/api/v1/metrics`
|
||||
|
||||
## 故障排除
|
||||
|
||||
### 常见问题
|
||||
|
||||
1. **端口被占用**
|
||||
- 检查端口 2020 是否被其他服务占用
|
||||
- 修改配置文件中的端口设置
|
||||
|
||||
2. **权限问题**
|
||||
- 确保 fluent-bit 用户有足够的权限访问日志文件
|
||||
- 检查目录权限设置
|
||||
|
||||
3. **配置文件错误**
|
||||
- 检查配置文件语法
|
||||
- 查看日志文件中的错误信息
|
||||
|
||||
### 日志查看
|
||||
|
||||
```bash
|
||||
# 查看服务日志
|
||||
tail -f /var/log/fluent-bit.log
|
||||
|
||||
# 查看系统日志
|
||||
journalctl -u fluent-bit -f
|
||||
```
|
||||
|
||||
## 系统要求
|
||||
|
||||
- **操作系统**: Ubuntu/Debian/CentOS/RHEL/Fedora
|
||||
- **架构**: x86_64/amd64
|
||||
- **权限**: root 权限(用于安装和卸载)
|
||||
- **依赖**: curl(用于健康检查)
|
||||
|
||||
## 版本信息
|
||||
|
||||
- **Fluent Bit 版本**: 3.1.9
|
||||
- **安装包版本**: 1.0
|
||||
- **支持架构**: amd64
|
||||
|
||||
## 注意事项
|
||||
|
||||
1. 安装前请确保系统已更新
|
||||
2. 卸载时会保留 fluent-bit 用户(系统用户,可能被其他服务使用)
|
||||
3. 配置文件包含环境变量,请根据实际环境调整
|
||||
4. 建议在生产环境使用前进行充分测试
|
||||
BIN
src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/fluent-bit_3.1.9_amd64.deb
(Stored with Git LFS)
Normal file
BIN
src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/fluent-bit_3.1.9_amd64.deb
(Stored with Git LFS)
Normal file
Binary file not shown.
BIN
src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libpq5_14.19-0ubuntu0.22.04.1_amd64.deb
(Stored with Git LFS)
Normal file
BIN
src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libpq5_14.19-0ubuntu0.22.04.1_amd64.deb
(Stored with Git LFS)
Normal file
Binary file not shown.
BIN
src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libyaml-0-2_0.2.2-1build2_amd64.deb
(Stored with Git LFS)
Normal file
BIN
src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/bin/libyaml-0-2_0.2.2-1build2_amd64.deb
(Stored with Git LFS)
Normal file
Binary file not shown.
69
src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/check_health.sh
Executable file
69
src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/check_health.sh
Executable file
@ -0,0 +1,69 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Fluent Bit 健康检查脚本
|
||||
# 输出 JSON 格式结果
|
||||
|
||||
set -e
|
||||
|
||||
# 检查 Fluent Bit 健康状态
|
||||
# Print one health result as a single-line JSON object on stdout.
#   $1 - component name, $2 - status, $3 - reason
_fb_health_report() {
    echo "{\"name\": \"$1\", \"status\": \"$2\", \"reason\": \"$3\"}"
}

# Check whether Fluent Bit is running and print the result as JSON.
#
# When /opt/argus-metric/current/.install_record exists, the PID recorded
# for the "fluent-bit" component is probed with `kill -0`. Without a record
# file the process table is searched instead.
# Exits 0 with status "health", or 1 with status "unhealth".
check_health() {
    local name="fluent-bit"
    local status="unhealth"
    local reason=""
    local install_record="/opt/argus-metric/current/.install_record"
    local pid=""
    local pids=""

    if [[ -f "$install_record" ]]; then
        # Prefer jq; fall back to a plain-text scan of the JSON record.
        if command -v jq &> /dev/null; then
            pid=$(jq -r '.components."fluent-bit".pid // empty' "$install_record" 2>/dev/null || echo "")
        else
            pid=$(grep -A 10 '"fluent-bit"' "$install_record" | grep '"pid"' | cut -d'"' -f4 | head -1)
        fi

        if [[ -n "$pid" && "$pid" =~ ^[0-9]+$ ]]; then
            if kill -0 "$pid" 2>/dev/null; then
                status="health"
                reason="进程运行正常 (PID: $pid)"
                _fb_health_report "$name" "$status" "$reason"
                exit 0
            fi
            reason="安装记录中的 PID $pid 进程不存在"
        else
            reason="安装记录文件中未找到有效的 fluent-bit PID"
        fi
        _fb_health_report "$name" "$status" "$reason"
        exit 1
    fi

    # No install record: look for a process whose name is exactly fluent-bit.
    # A command-line match (pgrep -f) would also hit this script, because
    # its own path contains "fluent-bit", and always report "health".
    pids=$(pgrep -x "fluent-bit" 2>/dev/null || true)
    if [[ -n "$pids" ]]; then
        # Report the first PID found.
        pid=$(echo "$pids" | head -1)
        status="health"
        reason="发现 fluent-bit 进程运行 (PID: $pid),但未找到安装记录"
        _fb_health_report "$name" "$status" "$reason"
        exit 0
    fi

    reason="未找到 fluent-bit 进程,且安装记录文件不存在"
    _fb_health_report "$name" "$status" "$reason"
    exit 1
}
|
||||
|
||||
# 主函数
|
||||
main() {
|
||||
check_health
|
||||
}
|
||||
|
||||
# 脚本入口
|
||||
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
|
||||
main "$@"
|
||||
fi
|
||||
@ -0,0 +1,37 @@
|
||||
[SERVICE]
|
||||
Daemon Off
|
||||
Parsers_File parsers.conf
|
||||
HTTP_Server On
|
||||
HTTP_Listen 0.0.0.0
|
||||
HTTP_Port 2020
|
||||
storage.path /buffers
|
||||
storage.sync normal
|
||||
storage.checksum on
|
||||
storage.backlog.mem_limit 128M
|
||||
# 备注:该镜像默认未开启 Hot Reload,修改配置后请重启容器。
|
||||
|
||||
@INCLUDE inputs.d/*.conf
|
||||
|
||||
[FILTER]
|
||||
Name parser
|
||||
Match app.*
|
||||
Key_Name log
|
||||
Parser timestamp_parser
|
||||
Reserve_Data On
|
||||
Preserve_Key On
|
||||
Unescape_Key On
|
||||
|
||||
[FILTER]
|
||||
Name record_modifier
|
||||
Match *
|
||||
Record cluster ${CLUSTER}
|
||||
Record rack ${RACK}
|
||||
Record host ${HOSTNAME}
|
||||
|
||||
[FILTER]
|
||||
Name lua
|
||||
Match app.*
|
||||
script inject_labels.lua
|
||||
call add_labels
|
||||
|
||||
@INCLUDE outputs.d/*.conf
|
||||
@ -0,0 +1,15 @@
|
||||
-- Pick the first usable value: environment variable, then the value already
-- on the record, then the fallback.
local function pick_label(env_name, current, fallback)
    return os.getenv(env_name) or current or fallback
end

-- Lua filter callback: adds job/user/model/GPU labels and a role to a record.
--
-- job_id, user, model and gpu_id come from the FB_JOB_ID, FB_USER, FB_MODEL
-- and FB_GPU_ID environment variables when set, otherwise from the record,
-- otherwise a placeholder. The role is derived from the record's log_path.
-- Returns 1 together with the timestamp it was given and the updated record.
function add_labels(tag, ts, record)
    record["job_id"] = pick_label("FB_JOB_ID", record["job_id"], "unknown")
    record["user"] = pick_label("FB_USER", record["user"], "unknown")
    record["model"] = pick_label("FB_MODEL", record["model"], "unknown")
    record["gpu_id"] = pick_label("FB_GPU_ID", record["gpu_id"], "na")

    local path = record["log_path"] or ""
    local role
    if string.find(path, "/logs/infer/") then
        role = "infer"
    elseif string.find(path, "/logs/train/") then
        role = "train"
    else
        -- keep an existing role; default to "app"
        role = record["role"] or "app"
    end
    record["role"] = role

    return 1, ts, record
end
|
||||
@ -0,0 +1,10 @@
|
||||
[INPUT]
|
||||
Name tail
|
||||
Path /logs/train/*.log
|
||||
Tag app.train
|
||||
Path_Key log_path
|
||||
Refresh_Interval 5
|
||||
DB /buffers/train.db
|
||||
Skip_Long_Lines On
|
||||
storage.type filesystem
|
||||
multiline.parser python,go,java
|
||||
@ -0,0 +1,10 @@
|
||||
[INPUT]
|
||||
Name tail
|
||||
Path /logs/infer/*.log
|
||||
Tag app.infer
|
||||
Path_Key log_path
|
||||
Refresh_Interval 5
|
||||
DB /buffers/infer.db
|
||||
Skip_Long_Lines On
|
||||
storage.type filesystem
|
||||
multiline.parser python,go,java
|
||||
@ -0,0 +1,24 @@
|
||||
# Important: Logstash_Format + Logstash_Prefix produce the train-*/infer-* indices.
#
# ES_HOST and ES_PORT must be present in Fluent Bit's environment. Fluent Bit
# only expands the plain ${VAR} form; the shell-style ${VAR:-default} syntax
# is not understood, so defaults have to be applied by whatever launches
# Fluent Bit, before it starts.
[OUTPUT]
    Name               es
    Match              app.train
    Host               ${ES_HOST}
    Port               ${ES_PORT}
    Logstash_Format    On
    Logstash_Prefix    train
    Replace_Dots       On
    Generate_ID        On
    Retry_Limit        False
    Suppress_Type_Name On

[OUTPUT]
    Name               es
    Match              app.infer
    Host               ${ES_HOST}
    Port               ${ES_PORT}
    Logstash_Format    On
    Logstash_Prefix    infer
    Replace_Dots       On
    Generate_ID        On
    Retry_Limit        False
    Suppress_Type_Name On
|
||||
@ -0,0 +1,27 @@
|
||||
[MULTILINE_PARSER]
|
||||
Name python
|
||||
Type regex
|
||||
Flush 2
|
||||
Rule "start_state" "/^\d{4}-\d{2}-\d{2}[\sT]/" "cont"
|
||||
Rule "cont" "/^\s+|^Traceback|^\tat\s+/" "cont"
|
||||
|
||||
[MULTILINE_PARSER]
|
||||
Name go
|
||||
Type regex
|
||||
Flush 2
|
||||
Rule "start_state" "/^[0-9]{4}\/[0-9]{2}\/[0-9]{2}/" "cont"
|
||||
Rule "cont" "/^\s+|^\t/" "cont"
|
||||
|
||||
[MULTILINE_PARSER]
|
||||
Name java
|
||||
Type regex
|
||||
Flush 2
|
||||
Rule "start_state" "/^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}/" "cont"
|
||||
Rule "cont" "/^\s+at\s+|^\t.../" "cont"
|
||||
|
||||
[PARSER]
|
||||
Name timestamp_parser
|
||||
Format regex
|
||||
Regex ^(?<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s+(?<level>\w+)\s+(?<message>.*)$
|
||||
Time_Key timestamp
|
||||
Time_Format %Y-%m-%d %H:%M:%S
|
||||
291
src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/install.sh
Executable file
291
src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/install.sh
Executable file
@ -0,0 +1,291 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
# 颜色定义
|
||||
# ANSI colour escape sequences consumed by the log helpers below.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # resets the colour

# Shared formatter: prints "<colour>[<TAG>]<reset> <message>" on stdout.
#   $1 - colour escape, $2 - tag text, $3 - message
_log_line() {
    echo -e "${1}[${2}]${NC} ${3}"
}

# Informational message.
log_info() {
    _log_line "$BLUE" "INFO" "$1"
}

# Message reporting a completed step.
log_success() {
    _log_line "$GREEN" "SUCCESS" "$1"
}

# Warning message.
log_warning() {
    _log_line "$YELLOW" "WARNING" "$1"
}

# Error message (printing only; the caller decides whether to exit).
log_error() {
    _log_line "$RED" "ERROR" "$1"
}
|
||||
|
||||
log_info "Starting Fluent Bit installation..."
|
||||
|
||||
# 解析命令行参数
|
||||
INSTALL_DIR="${1:-/opt/argus-metric/current}"
|
||||
|
||||
# 更新安装记录
|
||||
# Update the PID stored for the "fluent-bit" component in the install record.
#
# Arguments:
#   $1 - PID of the freshly started fluent-bit process
#   $2 - install base directory; /opt/argus-metric/current is used when this
#        argument is missing or empty
# Returns:
#   0 - record updated, or nothing to do (record file absent / jq unavailable)
#   1 - the existing PID could not be read, or rewriting the record failed
update_install_record() {
    local pid="$1"
    local install_base_dir="${2:-/opt/argus-metric/current}"
    local install_record="$install_base_dir/.install_record"
    local tmp_record="$install_record.tmp"
    local current_pid

    # No record file yet: first installation, the main installer creates it.
    if [[ ! -f "$install_record" ]]; then
        log_info "安装记录文件不存在,将由主安装脚本创建"
        return 0
    fi

    # The record exists (restart scenario); only the pid field is touched.
    if ! command -v jq &> /dev/null; then
        log_warning "jq 命令不可用,无法更新安装记录文件"
        return 0
    fi

    # Declaration and assignment are split so a jq failure is not masked.
    current_pid=$(jq -r '.components."fluent-bit".pid // ""' "$install_record" 2>/dev/null) || current_pid=""
    if [[ -z "$current_pid" ]]; then
        log_warning "无法读取当前 PID,跳过更新"
        return 1
    fi

    # Write to a temp file first and only replace the record when jq succeeded;
    # the pid stays a JSON string and all other fields are preserved.
    if jq --arg new_pid "$pid" '.components."fluent-bit".pid = $new_pid' "$install_record" > "$tmp_record" \
        && mv "$tmp_record" "$install_record"; then
        log_info "PID updated: $current_pid -> $pid"
        return 0
    fi

    rm -f "$tmp_record"
    log_warning "Failed to update install record: $install_record"
    return 1
}
|
||||
|
||||
# 检查是否为 root 用户
|
||||
if [[ $EUID -ne 0 ]]; then
|
||||
log_error "This script requires root privileges"
|
||||
log_info "Please use: sudo $0"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# 停止可能运行的服务
|
||||
log_info "Stopping existing fluent-bit processes..."
|
||||
|
||||
# 只匹配进程名为 fluent-bit 的进程
|
||||
pids=$(pgrep -x fluent-bit 2>/dev/null || true)
|
||||
|
||||
if [[ -n "$pids" ]]; then
|
||||
for pid in $pids; do
|
||||
log_info "Stopping process PID: $pid"
|
||||
kill "$pid" 2>/dev/null || true
|
||||
done
|
||||
sleep 2
|
||||
|
||||
# 检查是否还有残留进程
|
||||
remaining_pids=$(pgrep -x fluent-bit 2>/dev/null || true)
|
||||
if [[ -n "$remaining_pids" ]]; then
|
||||
log_warning "Force killing unresponsive processes..."
|
||||
for pid in $remaining_pids; do
|
||||
kill -9 "$pid" 2>/dev/null || true
|
||||
done
|
||||
fi
|
||||
fi
|
||||
|
||||
# 安装 Fluent Bit 依赖库 libpq5(离线模式)
|
||||
log_info "Checking Fluent Bit dependency: libpq5 ..."
|
||||
if ! ldconfig -p | grep -q libpq.so.5; then
|
||||
if ls bin/libpq5_*.deb >/dev/null 2>&1; then
|
||||
log_info "Installing local dependency package: libpq5"
|
||||
DEBIAN_FRONTEND=noninteractive dpkg -i bin/libpq5_*.deb >/dev/null 2>&1 || {
|
||||
log_error "Failed to install libpq5 from bin/, please check package validity"
|
||||
exit 1
|
||||
}
|
||||
else
|
||||
log_error "Missing dependency: libpq5 (libpq.so.5). Please put bin/libpq5_*.deb in the bin/ directory."
|
||||
exit 1
|
||||
fi
|
||||
else
|
||||
log_info "libpq.so.5 already present on system"
|
||||
fi
|
||||
|
||||
# 安装 Fluent Bit 依赖库 libyaml-0-2(离线模式)
|
||||
log_info "Checking Fluent Bit dependency: libyaml-0.so.2 ..."
|
||||
if ! ldconfig -p | grep -q libyaml-0.so.2; then
|
||||
if ls bin/libyaml-0-2_*.deb >/dev/null 2>&1; then
|
||||
log_info "Installing local dependency package: libyaml-0-2"
|
||||
DEBIAN_FRONTEND=noninteractive dpkg -i bin/libyaml-0-2_*.deb >/dev/null 2>&1 || {
|
||||
log_error "Failed to install libyaml-0-2 from bin/, please check package validity"
|
||||
exit 1
|
||||
}
|
||||
else
|
||||
log_error "Missing dependency: libyaml-0-2 (libyaml-0.so.2). Please put bin/libyaml-0-2_*.deb in the bin/ directory."
|
||||
exit 1
|
||||
fi
|
||||
else
|
||||
log_info "libyaml-0.so.2 already present on system"
|
||||
fi
|
||||
|
||||
# 清理可能存在的旧 fluent-bit 安装(避免配置文件冲突)
|
||||
log_info "Cleaning up old fluent-bit installation if exists..."
|
||||
if dpkg -l | grep -q "^ii.*fluent-bit"; then
|
||||
log_info "Found existing fluent-bit package, removing..."
|
||||
dpkg --purge fluent-bit 2>/dev/null || true
|
||||
apt-get remove --purge -y fluent-bit 2>/dev/null || true
|
||||
fi
|
||||
|
||||
# 确保清理残留的配置文件
|
||||
if [[ -d "/etc/fluent-bit" ]]; then
|
||||
log_info "Removing old fluent-bit configuration directory..."
|
||||
rm -rf /etc/fluent-bit
|
||||
fi
|
||||
|
||||
# 安装 Fluent Bit 主包
|
||||
log_info "Installing Fluent Bit from deb package..."
|
||||
deb_file="bin/fluent-bit_3.1.9_amd64.deb"
|
||||
if [[ ! -f "$deb_file" ]]; then
|
||||
log_error "Fluent Bit package not found: $deb_file"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
DEBIAN_FRONTEND=noninteractive dpkg -i "$deb_file" >/dev/null 2>&1 || true
|
||||
|
||||
# 验证 Fluent Bit 可以运行
|
||||
fb_version=$(/opt/fluent-bit/bin/fluent-bit --version 2>&1 | head -1)
|
||||
log_info "Fluent Bit version: $fb_version"
|
||||
|
||||
# 创建 fluent-bit 用户
|
||||
log_info "Creating fluent-bit user..."
|
||||
if ! id "fluent-bit" &>/dev/null; then
|
||||
useradd --no-create-home --shell /bin/false fluent-bit
|
||||
fi
|
||||
|
||||
# 创建配置目录
|
||||
log_info "Installing configuration files..."
|
||||
mkdir -p /etc/fluent-bit
|
||||
if [[ -d "config" ]]; then
|
||||
cp -r config/* /etc/fluent-bit/
|
||||
chown -R fluent-bit:fluent-bit /etc/fluent-bit
|
||||
fi
|
||||
|
||||
# 创建日志和缓冲区目录
|
||||
log_info "Creating log and buffer directories..."
|
||||
mkdir -p /logs/train /logs/infer /buffers
|
||||
chmod 755 /logs/train /logs/infer
|
||||
chmod 770 /buffers
|
||||
chown -R fluent-bit:fluent-bit /logs /buffers
|
||||
|
||||
# 启动 Fluent Bit
|
||||
log_info "Starting Fluent Bit with configuration from /etc/fluent-bit/"
|
||||
config_path="/etc/fluent-bit/fluent-bit.conf"
|
||||
|
||||
if [[ ! -f "$config_path" ]]; then
|
||||
log_error "Configuration file not found: $config_path"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# 设置环境变量
|
||||
log_info "Setting environment variables..."
|
||||
|
||||
# Resolve HOSTNAME when the caller did not provide one.
# Preference order: a source address on the 177.x.x.x network, then the first
# non-loopback source address, then the output of `hostname`.
# NOTE(review): bash normally pre-sets HOSTNAME, so this detection only runs
# when the variable is empty or unset — confirm that is the intent.
if [[ -z "${HOSTNAME:-}" ]]; then
  # Query the routing table once. `|| true` keeps a missing `ip` binary or an
  # empty match from aborting the script if errexit/pipefail are enabled.
  route_src_ips=$(ip route get 8.8.8.8 2>/dev/null | grep -oP 'src \K\S+' || true)

  # Prefer an address in the 177.x.x.x range.
  HOSTNAME=$(grep '^177\.' <<<"$route_src_ips" | head -1 || true)

  # Otherwise take the first address that is not in 127.0.0.0/8.
  if [[ -z "$HOSTNAME" ]]; then
    HOSTNAME=$(grep -v '^127\.' <<<"$route_src_ips" | head -1 || true)
  fi

  # Last resort: the system host name.
  if [[ -z "$HOSTNAME" ]]; then
    HOSTNAME=$(hostname)
  fi
fi
export HOSTNAME
|
||||
|
||||
export CLUSTER="${CLUSTER:-local}"
|
||||
export RACK="${RACK:-dev}"
|
||||
export ES_HOST="${ES_HOST:-localhost}"
|
||||
export ES_PORT="${ES_PORT:-9200}"
|
||||
|
||||
log_info "Environment variables:"
|
||||
log_info " CLUSTER=$CLUSTER"
|
||||
log_info " RACK=$RACK"
|
||||
log_info " HOSTNAME=$HOSTNAME"
|
||||
log_info " ES_HOST=$ES_HOST"
|
||||
log_info " ES_PORT=$ES_PORT"
|
||||
|
||||
# 检查 fluent-bit 二进制文件
|
||||
log_info "[DEBUG] Checking fluent-bit binary..."
|
||||
if [[ ! -f "/opt/fluent-bit/bin/fluent-bit" ]]; then
|
||||
log_error "fluent-bit binary not found at /opt/fluent-bit/bin/fluent-bit"
|
||||
exit 1
|
||||
fi
|
||||
log_info "[DEBUG] fluent-bit binary exists and is executable: $(ls -lh /opt/fluent-bit/bin/fluent-bit)"
|
||||
|
||||
# 检查配置文件
|
||||
log_info "[DEBUG] Checking configuration file: $config_path"
|
||||
if [[ ! -f "$config_path" ]]; then
|
||||
log_error "Configuration file not found: $config_path"
|
||||
exit 1
|
||||
fi
|
||||
log_info "[DEBUG] Configuration file exists: $(ls -lh $config_path)"
|
||||
|
||||
# 显示完整的启动命令
|
||||
log_info "[DEBUG] Full command to execute:"
|
||||
log_info "[DEBUG] su -s /bin/bash fluent-bit -c 'env CLUSTER=\"$CLUSTER\" RACK=\"$RACK\" HOSTNAME=\"$HOSTNAME\" ES_HOST=\"$ES_HOST\" ES_PORT=\"$ES_PORT\" /opt/fluent-bit/bin/fluent-bit --config=\"$config_path\"'"
|
||||
|
||||
# 清空或创建日志文件
|
||||
log_info "[DEBUG] Preparing log file: /var/log/fluent-bit.log"
|
||||
: > /var/log/fluent-bit.log
|
||||
chmod 666 /var/log/fluent-bit.log
|
||||
|
||||
log_info "Command: /opt/fluent-bit/bin/fluent-bit --config=$config_path"
|
||||
log_info "[DEBUG] Starting fluent-bit process as fluent-bit user (using su)..."
|
||||
nohup su -s /bin/bash fluent-bit -c "env CLUSTER='$CLUSTER' RACK='$RACK' HOSTNAME='$HOSTNAME' ES_HOST='$ES_HOST' ES_PORT='$ES_PORT' /opt/fluent-bit/bin/fluent-bit --config='$config_path' >> /var/log/fluent-bit.log 2>&1" &
|
||||
|
||||
bg_pid=$!
|
||||
log_info "[DEBUG] Background process started with PID: $bg_pid"
|
||||
|
||||
# 等待服务启动
|
||||
log_info "[DEBUG] Waiting 3 seconds for service to start..."
|
||||
sleep 3
|
||||
|
||||
# 查找实际的 fluent-bit 进程 PID
|
||||
log_info "[DEBUG] Searching for fluent-bit process..."
|
||||
log_info "[DEBUG] Running: pgrep -u fluent-bit -x fluent-bit"
|
||||
actual_pid=$(pgrep -u fluent-bit -x fluent-bit | head -1)
|
||||
|
||||
# 显示所有 fluent-bit 相关进程
|
||||
log_info "[DEBUG] All fluent-bit related processes:"
|
||||
ps aux | grep fluent-bit | grep -v grep || log_warning "No fluent-bit processes found in ps output"
|
||||
|
||||
if [[ -n "$actual_pid" ]]; then
|
||||
log_success "Fluent Bit started successfully (PID: $actual_pid)"
|
||||
log_info "[DEBUG] Process details: $(ps -p $actual_pid -o pid,user,cmd --no-headers)"
|
||||
|
||||
# 更新安装记录
|
||||
update_install_record "$actual_pid" "$INSTALL_DIR"
|
||||
else
|
||||
log_error "Fluent Bit failed to start - no fluent-bit process found"
|
||||
log_info "[DEBUG] Checking if background process $bg_pid still exists..."
|
||||
if ps -p $bg_pid > /dev/null 2>&1; then
|
||||
log_warning "Background shell process $bg_pid still exists"
|
||||
else
|
||||
log_warning "Background shell process $bg_pid has exited"
|
||||
fi
|
||||
|
||||
log_info "[DEBUG] Last 20 lines of /var/log/fluent-bit.log:"
|
||||
if [[ -f "/var/log/fluent-bit.log" ]]; then
|
||||
tail -20 /var/log/fluent-bit.log | while IFS= read -r line; do
|
||||
log_info "[LOG] $line"
|
||||
done
|
||||
else
|
||||
log_error "Log file /var/log/fluent-bit.log does not exist"
|
||||
fi
|
||||
|
||||
exit 1
|
||||
fi
|
||||
|
||||
log_success "Fluent Bit installation completed!"
|
||||
87
src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/package.sh
Executable file
87
src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/package.sh
Executable file
@ -0,0 +1,87 @@
|
||||
#!/bin/bash
#
# Package the current directory into fluent-bit-<timestamp>.tar.gz.
# Run it from the directory that contains the files listed in required_files;
# the archive is written next to them.

set -e

# ANSI colors used by the log helpers.
GREEN='\033[0;32m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Print an informational message ($1) to stdout.
log_info() {
  echo -e "${BLUE}[INFO]${NC} $1"
}

# Print a success message ($1) to stdout.
log_success() {
  echo -e "${GREEN}[SUCCESS]${NC} $1"
}

# Directory the script was started from; the archive ends up here.
CURRENT_DIR=$(pwd)
PACKAGE_NAME="fluent-bit-$(date +%Y%m%d-%H%M%S)"
PACKAGE_FILE="${PACKAGE_NAME}.tar.gz"

log_info "开始打包 Fluent Bit 安装包..."

# Verify that every file the installer needs is present.
log_info "检查必要文件..."

required_files=(
  "install.sh"
  "uninstall.sh"
  "bin/fluent-bit_3.1.9_amd64.deb"
  "check_health.sh"
)

missing_files=()
for file in "${required_files[@]}"; do
  if [[ ! -f "$file" ]]; then
    missing_files+=("$file")
  fi
done

if [[ ${#missing_files[@]} -gt 0 ]]; then
  echo "缺少以下文件:"
  for file in "${missing_files[@]}"; do
    echo " - $file"
  done
  exit 1
fi

log_success "所有必要文件检查完成"

# Stage the package in a private temporary directory.
TEMP_DIR=$(mktemp -d)
# Remove the staging directory on every exit path, including a failing
# cp/tar/mv under errexit.
trap 'rm -rf -- "$TEMP_DIR"' EXIT
log_info "创建临时目录: $TEMP_DIR"

cp -r . "$TEMP_DIR/$PACKAGE_NAME"
# Archives produced by earlier runs live in this directory too; drop them from
# the staged copy so packages do not nest inside each other.
rm -f -- "$TEMP_DIR/$PACKAGE_NAME"/fluent-bit-[0-9]*-[0-9]*.tar.gz

# Build the archive with -C instead of changing directory, so the working
# directory is never inside the tree that gets deleted.
log_info "创建压缩包: $PACKAGE_FILE"
tar -czf "$TEMP_DIR/$PACKAGE_FILE" -C "$TEMP_DIR" "$PACKAGE_NAME"

# Move the finished archive back to the starting directory.
mv "$TEMP_DIR/$PACKAGE_FILE" "$CURRENT_DIR/"

# Report the result and the installation steps.
log_success "打包完成!"
echo
echo "安装包文件: $PACKAGE_FILE"
echo "文件大小: $(du -h "$PACKAGE_FILE" | cut -f1)"
echo
echo "使用方法:"
echo "1. 将 $PACKAGE_FILE 传输到目标服务器"
echo "2. 解压: tar -xzf $PACKAGE_FILE"
echo "3. 进入目录: cd $PACKAGE_NAME"
echo "4. 运行安装: sudo ./install.sh"
echo
echo "注意: 请确保所有必要文件都存在"
|
||||
169
src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/uninstall.sh
Executable file
169
src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/uninstall.sh
Executable file
@ -0,0 +1,169 @@
|
||||
#!/bin/bash
# Fluent Bit uninstaller: stops the service, removes the package, binaries,
# configuration and data directories, and cleans the installation record.
set -euo pipefail

echo "[INFO] Starting Fluent Bit uninstallation..."

# Every later step modifies system paths, so refuse to run without root.
if [[ $EUID -ne 0 ]]; then
  echo "[ERROR] This script requires root privileges"
  echo "[INFO] Please use: sudo $0"
  exit 1
fi

echo "[WARNING] This operation will completely uninstall Fluent Bit"
confirm=""
# -r keeps backslashes literal. `|| true` prevents errexit from aborting when
# stdin hits EOF (non-interactive run); an unanswered prompt then counts as
# "no" and takes the normal cancellation path below.
read -r -p "Confirm to continue? (y/N): " confirm || true

if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then
  echo "[INFO] Uninstallation cancelled"
  exit 0
fi
|
||||
|
||||
# Stop running Fluent Bit processes.
echo "[INFO] Stopping Fluent Bit processes..."
install_record="/opt/argus-metric/current/.install_record"
stopped=false

# First try the PID stored in the installation record.
if [[ -f "$install_record" ]]; then
  pid=""
  if command -v jq &> /dev/null; then
    # The record is JSON; read .components."fluent-bit".pid.
    pid=$(jq -r '.components."fluent-bit".pid // empty' "$install_record" 2>/dev/null || echo "")
  else
    # Text fallback when jq is unavailable. `|| true` keeps a non-matching
    # grep from aborting the script under errexit + pipefail.
    pid=$(grep -A 10 '"fluent-bit"' "$install_record" | grep '"pid"' | cut -d'"' -f4 | head -1 || true)
  fi

  if [[ -n "$pid" && "$pid" =~ ^[0-9]+$ ]]; then
    if kill -0 "$pid" 2>/dev/null; then
      echo "[INFO] Stopping service via installation record (PID: $pid)..."
      # The process may exit between the probe and the signal; do not abort.
      kill "$pid" 2>/dev/null || true
      sleep 3

      # Escalate to SIGKILL only if SIGTERM was not enough.
      if kill -0 "$pid" 2>/dev/null; then
        echo "[WARNING] Process unresponsive, force killing..."
        kill -9 "$pid" 2>/dev/null || true
      fi
      echo "[SUCCESS] Fluent Bit process stopped"
      stopped=true
    else
      echo "[WARNING] PID in installation record no longer exists"
    fi
  fi
fi

# Then sweep for remaining fluent-bit processes.
# Match the exact process name (-x) rather than the full command line (-f):
# this script is stored under a directory named "fluent-bit", so an -f match
# would also select the uninstaller itself (and a parent sudo) and kill them.
pids=$(pgrep -x "fluent-bit" 2>/dev/null || true)
if [[ -n "$pids" ]]; then
  echo "[INFO] Found fluent-bit processes, stopping..."
  for pid in $pids; do
    echo "[INFO] Stopping process PID: $pid"
    kill "$pid" 2>/dev/null || true
  done
  sleep 2

  # Force-kill whatever survived SIGTERM.
  remaining_pids=$(pgrep -x "fluent-bit" 2>/dev/null || true)
  if [[ -n "$remaining_pids" ]]; then
    echo "[WARNING] Processes unresponsive, force killing..."
    for pid in $remaining_pids; do
      echo "[INFO] Force killing process PID: $pid"
      kill -9 "$pid" 2>/dev/null || true
    done
    sleep 1
  fi

  # Final verification.
  if pgrep -x "fluent-bit" > /dev/null; then
    echo "[ERROR] Unable to stop all fluent-bit processes"
  else
    echo "[SUCCESS] All Fluent Bit processes stopped"
    stopped=true
  fi
else
  echo "[INFO] No Fluent Bit processes running"
fi

if [[ "$stopped" == "false" ]]; then
  echo "[WARNING] No Fluent Bit processes found to stop"
fi
|
||||
|
||||
# Remove the Fluent Bit package through dpkg.
echo "[INFO] Uninstalling Fluent Bit package..."
# Capture the listing first instead of piping `dpkg -l` into `grep -q`: grep
# exits on the first match, dpkg can then die from SIGPIPE, and with pipefail
# the pipeline is reported as failed although the package is installed.
# `|| true` also covers hosts without dpkg.
dpkg_listing=$(dpkg -l 2>/dev/null || true)
if grep -q "fluent-bit" <<<"$dpkg_listing"; then
  echo "[INFO] Found fluent-bit package installed via dpkg, uninstalling..."
  # Removal stays best-effort (the script continues either way), but success
  # is only reported when dpkg actually succeeded.
  if dpkg --remove --force-remove-reinstreq fluent-bit; then
    echo "[SUCCESS] Fluent Bit package uninstalled"
  else
    echo "[WARNING] dpkg could not remove the fluent-bit package, continuing with file cleanup"
  fi
else
  echo "[INFO] No fluent-bit package found via package manager"
fi
|
||||
|
||||
# Remove the binary directory.
echo "[INFO] Removing Fluent Bit binary files..."
binary_dir="/opt/fluent-bit"
if [[ ! -d "$binary_dir" ]]; then
  echo "[INFO] Binary directory does not exist"
else
  rm -rf "$binary_dir"
  echo "[SUCCESS] Binary directory removed: $binary_dir"
fi

# Remove the configuration directory.
echo "[INFO] Removing configuration files..."
config_dir="/etc/fluent-bit"
if [[ ! -d "$config_dir" ]]; then
  echo "[INFO] Configuration directory does not exist"
else
  rm -rf "$config_dir"
  echo "[SUCCESS] Configuration directory removed"
fi

# Remove the log and buffer directories.
echo "[INFO] Removing data directories..."
data_dirs=("/logs" "/buffers")
deleted=false
for data_dir in "${data_dirs[@]}"; do
  # Skip directories that are already gone.
  if [[ ! -d "$data_dir" ]]; then
    continue
  fi
  rm -rf "$data_dir"
  echo "[SUCCESS] Data directory removed: $data_dir"
  deleted=true
done

# Report when neither data directory existed.
if [[ "$deleted" == "false" ]]; then
  echo "[INFO] No data directories found"
fi
|
||||
|
||||
# Remove the fluent-bit entry from the installation record.
echo "[INFO] Cleaning up installation record..."
if [[ -f "$install_record" ]]; then
  record_cleaned=false
  # This script reads the record with jq as JSON (.components."fluent-bit"),
  # and a line-oriented sed delete cannot match anything in such a file, so
  # delete the JSON key when jq can parse the record.
  if command -v jq &> /dev/null; then
    record_tmp=$(mktemp)
    if jq 'del(.components."fluent-bit")' "$install_record" > "$record_tmp" 2>/dev/null; then
      # Rewrite in place (not mv) so the record keeps its owner and mode.
      cat "$record_tmp" > "$install_record"
      record_cleaned=true
    fi
    rm -f -- "$record_tmp"
  fi

  # Fallback for a non-JSON, line-based record or a host without jq.
  if [[ "$record_cleaned" == "false" ]]; then
    sed -i '/^fluent-bit:/d' "$install_record"
  fi
  echo "[SUCCESS] Installation record cleaned"
else
  echo "[INFO] Installation record file does not exist"
fi
|
||||
|
||||
# Report on the fluent-bit account; it is never deleted by this script.
echo "[INFO] Checking fluent-bit user status..."
if ! id "fluent-bit" &>/dev/null; then
  echo "[INFO] fluent-bit user does not exist"
else
  cat <<'EOF'
[INFO] fluent-bit user exists
[WARNING] fluent-bit is a system user, may be used by other services
[INFO] fluent-bit user will be preserved for system stability
[INFO] To manually remove, run: sudo userdel fluent-bit
EOF
fi

# Final summary of what was removed and what was kept.
cat <<'EOF'
[SUCCESS] Fluent Bit uninstallation completed!

Removed content:
 - Binary directory: /opt/fluent-bit
 - Configuration directory: /etc/fluent-bit
 - Application log directory: /logs
 - Buffer directory: /buffers

Note:
 - fluent-bit user preserved (system user, may be used by other services)
 - For complete cleanup, manually check and remove related files
EOF
|
||||
BIN
src/metric/client-plugins/all-in-one-full/plugins/node-exporter/bin/node_exporter
(Stored with Git LFS)
Executable file
BIN
src/metric/client-plugins/all-in-one-full/plugins/node-exporter/bin/node_exporter
(Stored with Git LFS)
Executable file
Binary file not shown.
@ -0,0 +1,55 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Node Exporter 健康检查脚本
|
||||
# 输出 JSON 格式结果
|
||||
|
||||
set -e
|
||||
|
||||
# 检查 Node Exporter 健康状态
|
||||
# Probe the local Node Exporter and print one JSON object describing the result.
# Outputs: {"name": ..., "status": "health"|"unhealth", "reason": ...} on stdout.
# Exits:   0 when both the root path and /metrics answer HTTP 200, otherwise 1
#          (the function terminates the script via `exit`).
check_health() {
  local url="http://localhost:9100"
  local metrics_url="$url/metrics"
  local name="node-exporter"
  local status="unhealth"
  local reason=""
  local http_code metrics_code

  # Without curl no probe is possible.
  if ! command -v curl &> /dev/null; then
    reason="curl 命令不可用,无法进行健康检查"
    echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}"
    exit 1
  fi

  # Probe the root path. On a connection failure curl already prints "000" via
  # -w and then exits non-zero, so the fallback must replace the value instead
  # of appending a second "000". The timeouts keep a hung endpoint from
  # blocking the health check indefinitely.
  http_code=$(curl -s -o /dev/null --connect-timeout 3 --max-time 10 \
    -w "%{http_code}" "$url" 2>/dev/null) || http_code="000"

  if [[ "$http_code" != "200" ]]; then
    reason="HTTP 服务异常 (HTTP $http_code),请检查 Node Exporter 是否正在运行在端口 9100"
    echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}"
    exit 1
  fi

  # Probe the metrics endpoint the same way.
  metrics_code=$(curl -s -o /dev/null --connect-timeout 3 --max-time 10 \
    -w "%{http_code}" "$metrics_url" 2>/dev/null) || metrics_code="000"

  if [[ "$metrics_code" != "200" ]]; then
    reason="Metrics 端点异常 (HTTP $metrics_code)"
    echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}"
    exit 1
  fi

  status="health"
  reason="success"
  echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}"
  exit 0
}
|
||||
|
||||
# 主函数
|
||||
# Entry point: run the single health probe (which exits with the result code).
main() {
  check_health
}

# Run main only when the file is executed directly, not when it is sourced.
if [[ "${0}" == "${BASH_SOURCE[0]}" ]]; then
  main "$@"
fi
|
||||
343
src/metric/client-plugins/all-in-one-full/plugins/node-exporter/install.sh
Executable file
343
src/metric/client-plugins/all-in-one-full/plugins/node-exporter/install.sh
Executable file
@ -0,0 +1,343 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
# ANSI color escape sequences used by the log helpers.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Log helpers: each prints a colored level tag followed by the message ($1) to
# stdout. Backslash escapes in the tag and the message are interpreted.
log_info() {
  printf '%b\n' "${BLUE}[INFO]${NC} $1"
}

log_success() {
  printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
}

log_warning() {
  printf '%b\n' "${YELLOW}[WARNING]${NC} $1"
}

log_error() {
  printf '%b\n' "${RED}[ERROR]${NC} $1"
}
|
||||
|
||||
# 更新安装记录
|
||||
# Update the node-exporter PID stored in the JSON installation record.
# Arguments:
#   $1 - PID of the running node-exporter process
#   $2 - installation base directory (default: /opt/argus-metric/current)
# Returns:
#   0 when the record does not exist yet, jq is unavailable, the record was
#     updated, or the rewrite failed (a warning is logged in that case);
#   1 when the current PID cannot be read from the record.
update_install_record() {
  local pid="$1"
  local install_base_dir="${2:-/opt/argus-metric/current}"
  local install_record="$install_base_dir/.install_record"
  local current_pid tmp_record

  # No record yet means a first installation; the main installer creates it.
  if [[ ! -f "$install_record" ]]; then
    log_info "安装记录文件不存在,将由主安装脚本创建"
    return 0
  fi

  # An existing record means a restart: only the pid field is rewritten.
  if ! command -v jq &> /dev/null; then
    log_warning "jq 命令不可用,无法更新安装记录文件"
    return 0
  fi

  current_pid=$(jq -r '.components."node-exporter".pid // ""' "$install_record" 2>/dev/null) || current_pid=""
  if [[ -z "$current_pid" ]]; then
    log_warning "无法读取当前 PID,跳过更新"
    return 1
  fi

  # Rewrite through a temporary file. The pid is stored as a string and every
  # other field is kept. Success is only logged when both jq and mv worked; on
  # failure the partial temporary file is removed and the record is untouched.
  tmp_record="$install_record.tmp"
  if jq --arg new_pid "$pid" '.components."node-exporter".pid = $new_pid' "$install_record" > "$tmp_record" \
    && mv "$tmp_record" "$install_record"; then
    log_info "PID 已更新: $current_pid -> $pid"
  else
    rm -f "$tmp_record"
    log_warning "安装记录更新失败,PID 未写入: $install_record"
  fi
  return 0
}
|
||||
|
||||
# 显示帮助信息
|
||||
# Print the usage text for this installer to stdout.
show_help() {
  cat <<EOF
Node Exporter 安装脚本

用法: $0 [选项]

选项:
 --help 显示此帮助信息

示例:
 $0 # 安装 Node Exporter

EOF
}
|
||||
|
||||
# 解析命令行参数
|
||||
# Parse the command line: --help/-h prints usage, any other argument starting
# with "--" is rejected, and every remaining argument is taken as the
# installation directory (the last one wins).
INSTALL_DIR=""
for arg in "$@"; do
  case "$arg" in
    --help|-h)
      show_help
      exit 0
      ;;
    --*)
      log_error "未知参数: $arg"
      show_help
      exit 1
      ;;
    *)
      INSTALL_DIR="$arg"
      ;;
  esac
done
|
||||
|
||||
# 检查是否为 root 用户
|
||||
# Abort with exit status 1 unless the script runs as root (EUID 0).
check_root() {
  if [[ $EUID -eq 0 ]]; then
    return 0
  fi

  log_error "此脚本需要 root 权限运行"
  log_info "请使用: sudo $0"
  exit 1
}
|
||||
|
||||
# 检查系统要求
|
||||
# Log the detected distribution and CPU architecture, warning about values the
# installer is not primarily written for. Exits 1 when /etc/os-release is
# missing. Sourcing os-release sets its variables (NAME, VERSION, ID, ...) in
# the calling shell.
check_system() {
  log_info "检查系统要求..."

  # The distribution is read from /etc/os-release.
  if [[ ! -f /etc/os-release ]]; then
    log_error "无法检测操作系统版本"
    exit 1
  fi

  source /etc/os-release
  log_info "检测到操作系统: $NAME $VERSION"

  # Warn for distributions outside the known list.
  case "$ID" in
    ubuntu|debian|centos|rhel|fedora) ;;
    *) log_warning "此脚本主要针对常见 Linux 发行版,其他系统可能需要调整" ;;
  esac

  # Warn for architectures other than x86_64/amd64.
  local machine=$(uname -m)
  log_info "系统架构: ${machine}"

  case "$machine" in
    x86_64|amd64) ;;
    *) log_warning "当前架构为 ${machine},node_exporter 主要支持 x86_64/amd64" ;;
  esac
}
|
||||
|
||||
# Stop any Node Exporter that is already running before a new binary is
# installed: systemd unit, PID files, processes by name, and finally whatever
# still listens on port 9100.
# Returns: 1 when port 9100 is still in use afterwards, 0 otherwise.
stop_existing_service() {
  log_info "检查并停止可能运行的 Node Exporter 服务..."

  local pid pid_file pids listen_pids

  # 1. Stop and disable the systemd unit, if one exists.
  if systemctl list-units --full -all 2>/dev/null | grep -q "node_exporter.service"; then
    log_info "检测到 systemd 服务 node_exporter,正在停止..."
    systemctl stop node_exporter || true
    systemctl disable node_exporter || true
  fi

  # 2. Stop processes referenced by known PID files, then remove the files.
  for pid_file in /var/run/node-exporter.pid /var/run/node_exporter.pid /tmp/node_exporter.pid; do
    [[ -f "$pid_file" ]] || continue
    pid=$(cat "$pid_file" 2>/dev/null || true)
    # Only signal a purely numeric PID; an empty or corrupt file must never
    # turn into `kill` with an unexpected argument.
    if [[ "$pid" =~ ^[0-9]+$ ]] && kill -0 "$pid" 2>/dev/null; then
      log_info "发现 Node Exporter (PID: $pid),正在停止..."
      # The process may exit between the probe and the signal; under errexit
      # an unguarded kill would abort the whole installation.
      kill "$pid" 2>/dev/null || true
      sleep 2
      if kill -0 "$pid" 2>/dev/null; then
        kill -9 "$pid" 2>/dev/null || true
      fi
    fi
    rm -f "$pid_file"
  done

  # 3. Find remaining processes by exact process name (-x). A full
  #    command-line match (-f) also selects unrelated processes whose
  #    arguments merely contain the name, e.g. a parent `sudo` running this
  #    script from a "node-exporter" directory or a `tail` on the log file.
  pids=$(pgrep -x 'node_exporter|node-exporter' 2>/dev/null || true)
  if [[ -n "$pids" ]]; then
    log_info "发现 Node Exporter 进程 (PID: $pids),正在停止..."
    for pid in $pids; do
      # Never signal this script itself.
      [[ "$pid" != "$$" ]] || continue
      if kill -0 "$pid" 2>/dev/null; then
        kill "$pid" 2>/dev/null || true
        sleep 1
        if kill -0 "$pid" 2>/dev/null; then
          kill -9 "$pid" 2>/dev/null || true
        fi
      fi
    done
  fi

  # 4. Safety net: force-kill whatever still holds port 9100.
  listen_pids=$(lsof -ti:9100 2>/dev/null || true)
  if [[ -n "$listen_pids" ]]; then
    log_warning "发现占用 9100 端口的进程 (PID: $listen_pids),强制终止..."
    for pid in $listen_pids; do
      kill -9 "$pid" 2>/dev/null || true
    done
    sleep 1
  fi

  # 5. Final verification of the port.
  if netstat -tuln 2>/dev/null | grep -q ":9100 "; then
    log_error "端口 9100 仍被占用,请手动检查"
    return 1
  else
    log_success "旧的 Node Exporter 已完全停止"
  fi
}
|
||||
|
||||
|
||||
# 安装 Node Exporter 二进制文件
|
||||
# Copy the bundled binary (bin/node_exporter, relative to the current
# directory) to /usr/local/bin/node-exporter after stopping any running
# instance. Exits 1 when the bundled binary is missing.
install_node_exporter() {
  log_info "安装 Node Exporter..."

  local bundled_binary="bin/node_exporter"
  local target_binary="/usr/local/bin/node-exporter"

  if [[ ! -f "$bundled_binary" ]]; then
    log_error "找不到 Node Exporter 二进制文件: $bundled_binary"
    exit 1
  fi

  # Stop a running instance before its binary is replaced.
  stop_existing_service

  # Install under the unified name "node-exporter".
  cp "$bundled_binary" "$target_binary"
  chmod +x "$target_binary"

  log_success "Node Exporter 二进制文件安装完成"
}
|
||||
|
||||
# 创建用户和组
|
||||
# Create the node_exporter account (no home directory, /bin/false shell)
# unless it already exists.
create_user() {
  log_info "创建 node_exporter 用户..."

  if ! id "node_exporter" &>/dev/null; then
    useradd --no-create-home --shell /bin/false node_exporter
    log_success "用户 node_exporter 创建完成"
  else
    log_info "用户 node_exporter 已存在"
  fi
}
|
||||
|
||||
# 安装配置文件
|
||||
# Create the configuration directory and the textfile-collector directory;
# the latter is handed over to the node_exporter account.
install_config() {
  log_info "安装配置文件..."

  local config_dir="/etc/node_exporter"
  local textfile_dir="/var/lib/node_exporter/textfile_collector"

  # Configuration directory.
  mkdir -p "$config_dir"

  # Textfile collector directory, owned by node_exporter.
  mkdir -p "$textfile_dir"
  chown node_exporter:node_exporter "$textfile_dir"
}
|
||||
|
||||
# 启动 Node Exporter 服务
|
||||
# Start Node Exporter in the background on port 9100, write its PID to
# /var/run/node-exporter.pid and pass it to update_install_record.
# Returns: 0 when an instance is already running; 1 when port 9100 is busy or
#          the new process is gone after two seconds; otherwise the status of
#          update_install_record.
start_node_exporter() {
  log_info "启动 Node Exporter 服务..."

  local binary_path="/usr/local/bin/node-exporter"
  local log_file="/var/log/node-exporter.log"
  local pid_file="/var/run/node-exporter.pid"

  # Reuse a live instance recorded in the PID file; drop a stale file.
  if [[ -f "$pid_file" ]]; then
    local recorded_pid=$(cat "$pid_file")
    if ! kill -0 "$recorded_pid" 2>/dev/null; then
      log_warning "发现过期的 PID 文件,正在清理..."
      rm -f "$pid_file"
    else
      log_info "Node Exporter 服务已在运行 (PID: $recorded_pid)"
      return 0
    fi
  fi

  # Refuse to start while something else listens on the port.
  if netstat -tuln 2>/dev/null | grep -q ":9100 "; then
    log_warning "端口 9100 已被占用,请检查是否有其他服务在运行"
    return 1
  fi

  # Launch detached; stdout and stderr go to the log file.
  log_info "正在启动 Node Exporter..."
  nohup "$binary_path" --web.listen-address=:9100 > "$log_file" 2>&1 &
  local new_pid=$!

  # Record the PID, then give the process a moment to settle.
  echo "$new_pid" > "$pid_file"
  sleep 2

  # A process that is already gone counts as a failed start.
  if ! kill -0 "$new_pid" 2>/dev/null; then
    log_error "Node Exporter 服务启动失败"
    rm -f "$pid_file"
    return 1
  fi

  log_success "Node Exporter 服务启动成功 (PID: $new_pid)"
  log_info "日志文件: $log_file"
  log_info "PID 文件: $pid_file"

  # Propagate the new PID to the installation record.
  update_install_record "$new_pid" "$INSTALL_DIR"
}
|
||||
|
||||
|
||||
|
||||
# 显示安装信息
|
||||
# Print the post-install summary: installed paths, usage, test commands and a
# sample Prometheus scrape configuration.
show_install_info() {
  log_success "Node Exporter 安装完成!"
  cat <<'EOF'

安装信息:
 二进制文件: /usr/local/bin/node-exporter
 运行用户: node_exporter
 配置目录: /etc/node_exporter/
 默认端口: 9100

使用方法:
 手动启动: /usr/local/bin/node-exporter --web.listen-address=:9100
 后台启动: nohup /usr/local/bin/node-exporter --web.listen-address=:9100 &

测试连接:
 curl http://localhost:9100/metrics
 curl http://localhost:9100

Prometheus 配置示例:
 - job_name: 'node_exporter'
 static_configs:
 - targets: ['localhost:9100']

EOF
}
|
||||
|
||||
# 主函数
|
||||
# Installer entry point: print the banner, validate the environment, then run
# the installation steps in order and show the summary.
main() {
  printf '%s\n' \
    "==========================================" \
    " Node Exporter 安装脚本 v1.0" \
    "==========================================" \
    ""

  # Preconditions.
  check_root
  check_system

  log_info "开始安装 Node Exporter..."

  # Installation steps, in dependency order.
  install_node_exporter
  create_user
  install_config
  start_node_exporter

  show_install_info
}

# Run main only when the file is executed directly, not when it is sourced.
if [[ "${0}" == "${BASH_SOURCE[0]}" ]]; then
  main "$@"
fi
|
||||
|
||||
87
src/metric/client-plugins/all-in-one-full/plugins/node-exporter/package.sh
Executable file
87
src/metric/client-plugins/all-in-one-full/plugins/node-exporter/package.sh
Executable file
@ -0,0 +1,87 @@
|
||||
#!/bin/bash
#
# Package the current directory into node-exporter-<timestamp>.tar.gz.
# Run it from the directory that contains the files listed in required_files;
# the archive is written next to them.

set -e

# ANSI colors used by the log helpers.
GREEN='\033[0;32m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Print an informational message ($1) to stdout.
log_info() {
  echo -e "${BLUE}[INFO]${NC} $1"
}

# Print a success message ($1) to stdout.
log_success() {
  echo -e "${GREEN}[SUCCESS]${NC} $1"
}

# Directory the script was started from; the archive ends up here.
CURRENT_DIR=$(pwd)
PACKAGE_NAME="node-exporter-$(date +%Y%m%d-%H%M%S)"
PACKAGE_FILE="${PACKAGE_NAME}.tar.gz"

log_info "开始打包 Node Exporter 安装包..."

# Verify that every file the installer needs is present.
log_info "检查必要文件..."

required_files=(
  "install.sh"
  "uninstall.sh"
  "bin/node_exporter"
  "check_health.sh"
)

missing_files=()
for file in "${required_files[@]}"; do
  if [[ ! -f "$file" ]]; then
    missing_files+=("$file")
  fi
done

if [[ ${#missing_files[@]} -gt 0 ]]; then
  echo "缺少以下文件:"
  for file in "${missing_files[@]}"; do
    echo " - $file"
  done
  exit 1
fi

log_success "所有必要文件检查完成"

# Stage the package in a private temporary directory.
TEMP_DIR=$(mktemp -d)
# Remove the staging directory on every exit path, including a failing
# cp/tar/mv under errexit.
trap 'rm -rf -- "$TEMP_DIR"' EXIT
log_info "创建临时目录: $TEMP_DIR"

cp -r . "$TEMP_DIR/$PACKAGE_NAME"
# Archives produced by earlier runs live in this directory too; drop them from
# the staged copy so packages do not nest inside each other.
rm -f -- "$TEMP_DIR/$PACKAGE_NAME"/node-exporter-[0-9]*-[0-9]*.tar.gz

# Build the archive with -C instead of changing directory, so the working
# directory is never inside the tree that gets deleted.
log_info "创建压缩包: $PACKAGE_FILE"
tar -czf "$TEMP_DIR/$PACKAGE_FILE" -C "$TEMP_DIR" "$PACKAGE_NAME"

# Move the finished archive back to the starting directory.
mv "$TEMP_DIR/$PACKAGE_FILE" "$CURRENT_DIR/"

# Report the result and the installation steps.
log_success "打包完成!"
echo
echo "安装包文件: $PACKAGE_FILE"
echo "文件大小: $(du -h "$PACKAGE_FILE" | cut -f1)"
echo
echo "使用方法:"
echo "1. 将 $PACKAGE_FILE 传输到目标服务器"
echo "2. 解压: tar -xzf $PACKAGE_FILE"
echo "3. 进入目录: cd $PACKAGE_NAME"
echo "4. 运行安装: sudo ./install.sh"
echo
echo "注意: 请确保所有必要文件都存在"
|
||||
239
src/metric/client-plugins/all-in-one-full/plugins/node-exporter/uninstall.sh
Executable file
239
src/metric/client-plugins/all-in-one-full/plugins/node-exporter/uninstall.sh
Executable file
@ -0,0 +1,239 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Node Exporter uninstall script
# Version: 1.0
# Author: AIOps Team
# Date: not recorded (the original header held an unexpanded `date` placeholder)

set -e

# ANSI color escape sequences used by the log helpers.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Log helpers: each prints a colored level tag followed by the message ($1) to
# stdout. Backslash escapes in the tag and the message are interpreted.
log_info() {
  printf '%b\n' "${BLUE}[INFO]${NC} $1"
}

log_success() {
  printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
}

log_warning() {
  printf '%b\n' "${YELLOW}[WARNING]${NC} $1"
}

log_error() {
  printf '%b\n' "${RED}[ERROR]${NC} $1"
}
|
||||
|
||||
# 检查是否为 root 用户
|
||||
# Abort with exit status 1 unless the script runs as root (EUID 0).
check_root() {
  if [[ $EUID -eq 0 ]]; then
    return 0
  fi

  log_error "此脚本需要 root 权限运行"
  log_info "请使用: sudo $0"
  exit 1
}
|
||||
|
||||
# 停止运行中的进程
|
||||
# Stop every running Node Exporter process: first the one recorded in the PID
# file, then any remaining process named node_exporter or node-exporter.
# SIGTERM is tried first, SIGKILL only for survivors. The PID file is removed
# in all cases.
stop_processes() {
  log_info "停止 Node Exporter 进程..."

  local pid_file="/var/run/node-exporter.pid"
  # pgrep patterns are extended regular expressions, so alternation is a bare
  # "|"; the escaped form "\|" only matches a literal bar and never finds a
  # process.
  local process_pattern='node_exporter|node-exporter'
  local stopped=false
  local pid pids remaining_pids

  # First try the PID file.
  if [[ -f "$pid_file" ]]; then
    pid=$(cat "$pid_file" 2>/dev/null || true)
    # Only signal a purely numeric PID that is still alive.
    if [[ "$pid" =~ ^[0-9]+$ ]] && kill -0 "$pid" 2>/dev/null; then
      log_info "通过 PID 文件停止服务 (PID: $pid)..."
      # The process may exit between the probe and the signal; do not abort.
      kill "$pid" 2>/dev/null || true
      sleep 3

      # Escalate only if SIGTERM was not enough.
      if kill -0 "$pid" 2>/dev/null; then
        log_warning "进程未响应,强制终止..."
        kill -9 "$pid" 2>/dev/null || true
      fi
      log_success "Node Exporter 进程已停止"
      stopped=true
    else
      log_warning "PID 文件存在但进程已不存在,清理 PID 文件"
      rm -f "$pid_file"
    fi
  fi

  # Then sweep by exact process name (-x). A full command-line match (-f)
  # would also select this script, which is stored in a "node-exporter"
  # directory, and kill the uninstaller itself.
  pids=$(pgrep -x "$process_pattern" 2>/dev/null || true)
  if [[ -n "$pids" ]]; then
    log_info "发现 node_exporter 或 node-exporter 进程,正在停止..."
    for pid in $pids; do
      log_info "停止进程 PID: $pid"
      kill "$pid" 2>/dev/null || true
    done
    sleep 2

    # Force-kill whatever survived SIGTERM.
    remaining_pids=$(pgrep -x "$process_pattern" 2>/dev/null || true)
    if [[ -n "$remaining_pids" ]]; then
      log_warning "进程未响应,强制终止..."
      for pid in $remaining_pids; do
        log_info "强制终止进程 PID: $pid"
        kill -9 "$pid" 2>/dev/null || true
      done
      sleep 1
    fi

    # Final verification.
    if pgrep -x "$process_pattern" > /dev/null; then
      log_error "无法停止所有 node_exporter 进程"
    else
      log_success "所有 Node Exporter 进程已停止"
      stopped=true
    fi
  else
    log_info "Node Exporter 进程未运行"
  fi

  # Always remove the PID file.
  rm -f "$pid_file"

  if [[ "$stopped" == "false" ]]; then
    log_warning "未发现需要停止的 Node Exporter 进程"
  fi
}
|
||||
|
||||
# 删除二进制文件
|
||||
# Delete the installed Node Exporter binaries (both naming variants) and
# report when none of them existed.
remove_binary() {
  log_info "删除 Node Exporter 二进制文件..."

  local removed_any=false
  local candidate

  for candidate in "/usr/local/bin/node-exporter" "/usr/local/bin/node_exporter"; do
    # Skip names that are not installed.
    [[ -f "$candidate" ]] || continue
    rm -f "$candidate"
    log_success "二进制文件已删除: $candidate"
    removed_any=true
  done

  if [[ "$removed_any" == "false" ]]; then
    log_info "二进制文件不存在"
  fi
}
|
||||
|
||||
# 删除配置文件
|
||||
# Delete the configuration directory /etc/node_exporter if it exists.
remove_config() {
  log_info "删除配置文件..."

  local config_dir="/etc/node_exporter"

  if [[ ! -d "$config_dir" ]]; then
    log_info "配置目录不存在"
  else
    rm -rf "$config_dir"
    log_success "配置目录已删除"
  fi
}
|
||||
|
||||
# 删除数据目录
|
||||
# Delete the data directory /var/lib/node_exporter if it exists.
remove_data_dir() {
  log_info "删除数据目录..."

  local data_dir="/var/lib/node_exporter"

  if [[ ! -d "$data_dir" ]]; then
    log_info "数据目录不存在"
  else
    rm -rf "$data_dir"
    log_success "数据目录已删除"
  fi
}
|
||||
|
||||
# 检查用户状态(可选)
|
||||
# Report whether the node_exporter account exists. The account is never
# deleted here; the messages explain how to remove it manually.
check_user_status() {
  log_info "检查 node_exporter 用户状态..."

  if ! id "node_exporter" &>/dev/null; then
    log_info "node_exporter 用户不存在"
    return
  fi

  log_info "检测到 node_exporter 用户存在"
  log_warning "node_exporter 是系统用户,可能被其他服务使用"
  log_info "为了系统稳定性,将保留 node_exporter 用户"
  log_info "如需手动删除,请运行: sudo userdel node_exporter"
}
|
||||
|
||||
# 清理日志文件
|
||||
# Remove the log file written by the installer's background launch.
# The system journal is deliberately left alone: a global
# `journalctl --vacuum-time=1s` discards archived journal data of every
# service on the host, not just Node Exporter's.
cleanup_logs() {
  log_info "清理日志文件..."

  # Delete the log file created by the install script.
  rm -f /var/log/node-exporter.log

  log_success "日志文件已清理"
}
|
||||
|
||||
# 显示卸载信息
|
||||
# Print the post-uninstall summary: what was removed and what was kept.
show_uninstall_info() {
  log_success "Node Exporter 卸载完成!"
  cat <<'EOF'

已删除的内容:
 - 二进制文件: /usr/local/bin/node-exporter
 - 配置目录: /etc/node_exporter
 - 数据目录: /var/lib/node_exporter
 - 相关日志文件

注意:
 - node_exporter 用户已保留(系统用户,可能被其他服务使用)
 - 如需完全清理,请手动检查并删除相关文件

EOF
}
|
||||
|
||||
# 主函数
|
||||
# Uninstaller entry point: print the banner, require root, ask for
# confirmation, then run the removal steps in order and show the summary.
main() {
  printf '%s\n' \
    "==========================================" \
    " Node Exporter 卸载脚本 v1.0" \
    "==========================================" \
    ""

  check_root

  # Anything other than y/Y cancels the uninstall.
  log_warning "此操作将完全卸载 Node Exporter"
  read -p "确认继续?(y/N): " confirm

  case "$confirm" in
    y|Y) ;;
    *)
      log_info "取消卸载操作"
      exit 0
      ;;
  esac

  log_info "开始卸载 Node Exporter..."

  # Removal steps.
  stop_processes
  remove_binary
  remove_config
  remove_data_dir
  cleanup_logs

  # Report on the service account, which is kept.
  check_user_status

  show_uninstall_info
}

# Run main only when the file is executed directly, not when it is sourced.
if [[ "${0}" == "${BASH_SOURCE[0]}" ]]; then
  main "$@"
fi
|
||||
286
src/metric/client-plugins/all-in-one-full/scripts/check_health.sh
Executable file
286
src/metric/client-plugins/all-in-one-full/scripts/check_health.sh
Executable file
@ -0,0 +1,286 @@
|
||||
#!/bin/bash

# Overall health check: runs the health check of every installed component
# and appends the combined result to the .health_log file.

set -e

# PID-file guard so that overlapping runs are skipped.
PIDFILE="/var/run/check_health.pid"
# The command substitution is quoted so that an empty or malformed PID file
# yields a single (invalid) argument instead of being word-split.
if [ -f "$PIDFILE" ] && kill -0 "$(cat "$PIDFILE")" 2>/dev/null; then
    echo "健康检查脚本已在运行中,跳过本次执行" >&2
    exit 0
fi
echo $$ > "$PIDFILE"
# Single quotes defer expansion until the trap fires and keep the path quoted.
trap 'rm -f "$PIDFILE"' EXIT

# Directory containing this script; the log and install record live next to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
HEALTH_LOG_FILE="$SCRIPT_DIR/.health_log"
INSTALL_RECORD_FILE="$SCRIPT_DIR/.install_record"

# ANSI color sequences (interpreted later by the log helpers)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
|
||||
|
||||
# Logging helpers. Everything is written to stderr so that stdout carries
# only the JSON result. `printf '%b'` expands backslash escapes in its
# argument the same way `echo -e` does.
log_info() {
    printf '%b\n' "${BLUE}[INFO]${NC} $1" >&2
}

log_success() {
    printf '%b\n' "${GREEN}[SUCCESS]${NC} $1" >&2
}

log_warning() {
    printf '%b\n' "${YELLOW}[WARNING]${NC} $1" >&2
}

log_error() {
    printf '%b\n' "${RED}[ERROR]${NC} $1" >&2
}
|
||||
|
||||
# Run the health check script of a single component.
#
# Arguments: $1 - component name
#            $2 - path to the component's check_health.sh
# Outputs:   one JSON object on stdout: the script's own stdout, or a
#            synthesized {"name", "status", "reason"} object when the script
#            is missing, not executable, or fails without printing anything.
# Returns:   0 when the check script exits 0, 1 otherwise.
check_component() {
    local component_name="$1"
    local check_script_path="$2"
    local result=""
    local rc=0

    log_info "检查 $component_name 健康状态..."

    if [[ ! -f "$check_script_path" ]]; then
        log_error "健康检查脚本不存在: $check_script_path"
        echo "{\"name\": \"$component_name\", \"status\": \"unhealth\", \"reason\": \"健康检查脚本不存在: $check_script_path\"}"
        return 1
    fi

    if [[ ! -x "$check_script_path" ]]; then
        log_error "健康检查脚本无执行权限: $check_script_path"
        echo "{\"name\": \"$component_name\", \"status\": \"unhealth\", \"reason\": \"健康检查脚本无执行权限: $check_script_path\"}"
        return 1
    fi

    # Only stdout is captured; the check script's stderr is discarded.
    result=$("$check_script_path" 2>/dev/null) || rc=$?

    if (( rc == 0 )); then
        log_success "$component_name 健康检查通过"
        echo "$result"
        return 0
    fi

    log_warning "$component_name 健康检查失败"
    # A script that fails silently would otherwise contribute an empty entry
    # and break the JSON assembled by the caller.
    if [[ -z "$result" ]]; then
        result="{\"name\": \"$component_name\", \"status\": \"unhealth\", \"reason\": \"健康检查脚本执行失败,退出码: $rc\"}"
    fi
    echo "$result"
    return 1
}
|
||||
|
||||
# Print the current local time as "YYYY-MM-DD HH:MM:SS".
get_timestamp() {
    # %F is %Y-%m-%d and %T is %H:%M:%S
    date '+%F %T'
}
|
||||
|
||||
# Print the current UTC time in ISO-8601 form, e.g. 2025-01-31T08:15:00Z.
get_utc_timestamp() {
    # %F is %Y-%m-%d and %T is %H:%M:%S
    date -u '+%FT%TZ'
}
|
||||
|
||||
# Print the host name: $HOSTNAME when it is set and non-empty, otherwise the
# output of `hostname`.
get_hostname() {
    if [[ -n "$HOSTNAME" ]]; then
        echo "$HOSTNAME"
    else
        echo "$(hostname)"
    fi
}
|
||||
|
||||
# Make sure the per-host health directory exists and print its path.
# Outputs: /private/argus/agent/<hostname>/health on stdout.
create_health_dir() {
    local host_name=$(get_hostname)
    local target_dir="/private/argus/agent/${host_name}/health"

    if ! [[ -d "$target_dir" ]]; then
        log_info "创建健康状态目录: $target_dir"
        mkdir -p "$target_dir"
    fi

    echo "$target_dir"
}
|
||||
|
||||
# Escape a string so it can be embedded inside a JSON string literal.
_health_json_escape() {
    local s="$1"
    s=${s//\\/\\\\}      # backslash first, so later escapes are not doubled
    s=${s//\"/\\\"}
    s=${s//$'\n'/\\n}
    s=${s//$'\r'/\\r}
    s=${s//$'\t'/\\t}
    printf '%s' "$s"
}

# Write the health status file of one component.
#
# Arguments: $1 - component name (file becomes metric-<name>.json)
#            $2 - status string
#            $3 - error message (may be empty)
#            $4 - directory to write into
# The status and error values are JSON-escaped, so quotes, backslashes or
# newlines in an error message cannot produce an invalid file.
write_component_health_json() {
    local component_name="$1"
    local status="$2"
    local error_msg="$3"
    local health_dir="$4"

    # File name format: <module prefix>-<component>.json
    local module_prefix="metric"
    local filename="${module_prefix}-${component_name}.json"
    local filepath="$health_dir/$filename"

    local timestamp
    timestamp=$(get_utc_timestamp)

    cat > "$filepath" << EOF
{
    "status": "$(_health_json_escape "$status")",
    "error": "$(_health_json_escape "$error_msg")",
    "timestamp": "$timestamp"
}
EOF

    log_info "已写入模块健康状态文件: $filepath"
}
|
||||
|
||||
# Read the component install directories from the install record.
#
# Arguments: $1 - path to the install record (JSON)
# Outputs:   one "<component>:<install_dir>" line per component on stdout
# Returns:   0 on success, 1 when the file is missing or cannot be parsed.
#
# With jq the `.components` object is read directly. Without jq a line-based
# fallback is used: the component name is taken from the basename of each
# install_dir. The record also carries a top-level "install_dir", so when a
# "components" key is present only the text after it is scanned.
read_install_record() {
    local install_record_file="$1"

    if [[ ! -f "$install_record_file" ]]; then
        log_error "安装记录文件不存在: $install_record_file"
        return 1
    fi

    if command -v jq &> /dev/null; then
        local components_json
        if components_json=$(jq -r '.components | to_entries[] | "\(.key):\(.value.install_dir)"' "$install_record_file" 2>/dev/null); then
            echo "$components_json"
            return 0
        fi
        log_error "无法解析安装记录文件 JSON 格式: $install_record_file"
        return 1
    fi

    # No jq available: fall back to simple text parsing.
    log_warning "jq 命令不可用,尝试简单文本解析"

    local dir_re='"install_dir":[[:space:]]*"([^"]+)"'
    local components=()
    local line install_dir component_name
    local in_components=true

    # Records without a "components" key keep the old behavior (scan it all).
    if grep -q '"components"' "$install_record_file"; then
        in_components=false
    fi

    # `|| [[ -n "$line" ]]` also processes a last line without newline.
    while IFS= read -r line || [[ -n "$line" ]]; do
        if [[ "$in_components" == false ]]; then
            if [[ "$line" != *'"components"'* ]]; then
                continue
            fi
            in_components=true
            # Keep only what follows the key, in case entries share the line.
            line=${line#*\"components\"}
        fi

        if [[ "$line" =~ $dir_re ]]; then
            install_dir="${BASH_REMATCH[1]}"
            component_name=$(basename "$install_dir")
            components+=("$component_name:$install_dir")
        fi
    done < "$install_record_file"

    if [[ ${#components[@]} -gt 0 ]]; then
        printf '%s\n' "${components[@]}"
        return 0
    fi

    log_error "无法从安装记录文件中提取组件信息"
    return 1
}
|
||||
|
||||
# Entry point of the overall health check.
#
# Reads the component list from $INSTALL_RECORD_FILE, runs every component's
# check_health.sh, writes one status file per component into the health
# directory, appends the combined JSON to $HEALTH_LOG_FILE and prints it on
# stdout. All human-readable output goes to stderr.
# Exits 0 when every component passed, 1 otherwise or when the install
# record cannot be read.
main() {
    echo "==========================================" >&2
    echo " 整体健康检查脚本" >&2
    echo "==========================================" >&2
    echo >&2

    local start_time
    start_time=$(get_timestamp)
    log_info "健康检查开始时间: $start_time"

    local health_dir
    health_dir=$(create_health_dir)

    log_info "从安装记录文件读取组件信息: $INSTALL_RECORD_FILE"
    local components_info
    if ! components_info=$(read_install_record "$INSTALL_RECORD_FILE"); then
        log_error "无法读取安装记录文件,健康检查终止"
        exit 1
    fi

    local all_results=()
    local overall_status="health"
    local component_info component_name install_dir check_script_path
    local result component_status error_msg

    # Each line has the form "<component>:<install_dir>".
    while IFS= read -r component_info; do
        if [[ -z "$component_info" ]]; then
            continue
        fi

        IFS=':' read -r component_name install_dir <<< "$component_info"
        check_script_path="$install_dir/check_health.sh"
        component_status="healthy"
        error_msg=""

        if result=$(check_component "$component_name" "$check_script_path"); then
            all_results+=("$result")
        else
            all_results+=("$result")
            overall_status="unhealth"
            component_status="unhealthy"
            # Pull the failure reason out of the component's JSON result.
            if command -v jq &> /dev/null; then
                error_msg=$(echo "$result" | jq -r '.reason // ""' 2>/dev/null || echo "")
            elif [[ "$result" =~ \"reason\":[[:space:]]*\"([^\"]+)\" ]]; then
                error_msg="${BASH_REMATCH[1]}"
            fi
        fi

        write_component_health_json "$component_name" "$component_status" "$error_msg" "$health_dir"
    done <<< "$components_info"

    local end_time
    end_time=$(get_timestamp)
    log_info "健康检查结束时间: $end_time"

    # Join the per-component results with commas. Empty results (a check that
    # printed nothing) are skipped so the array never has a dangling comma.
    local components_json=""
    local entry
    for entry in "${all_results[@]}"; do
        if [[ -z "$entry" ]]; then
            continue
        fi
        if [[ -n "$components_json" ]]; then
            components_json+=","$'\n'
        fi
        components_json+="$entry"
    done

    local health_check_result
    health_check_result=$(cat << EOF
{
    "start_time": "$start_time",
    "end_time": "$end_time",
    "overall_status": "$overall_status",
    "components": [
$components_json
    ]
}
EOF
)

    log_info "将健康检查结果写入日志文件: $HEALTH_LOG_FILE"
    echo "$health_check_result" >> "$HEALTH_LOG_FILE"

    # The JSON result is the only thing written to stdout.
    echo "$health_check_result"

    echo >&2
    echo "==========================================" >&2
    echo " 健康检查总结" >&2
    echo "==========================================" >&2
    echo "开始时间: $start_time" >&2
    echo "结束时间: $end_time" >&2
    echo "整体状态: $overall_status" >&2
    echo "日志文件: $HEALTH_LOG_FILE" >&2
    echo >&2

    if [[ "$overall_status" == "health" ]]; then
        log_success "所有组件健康检查通过!"
        exit 0
    else
        log_error "部分组件健康检查失败,请查看上述详细信息"
        exit 1
    fi
}
|
||||
|
||||
# Script entry point: run main only when the file is executed directly,
# not when it is sourced by another script.
if [[ "${0}" == "${BASH_SOURCE[0]}" ]]; then
    main "$@"
fi
|
||||
240
src/metric/client-plugins/all-in-one-full/scripts/check_version.sh
Executable file
240
src/metric/client-plugins/all-in-one-full/scripts/check_version.sh
Executable file
@ -0,0 +1,240 @@
|
||||
#!/bin/bash

# Version check script.
# Compares the local LATEST_VERSION with the version published on the FTP
# server and updates to that version when the two differ.

set -o errexit

# ANSI color sequences (interpreted later by `echo -e` in the log helpers)
NC='\033[0m' # No Color
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
|
||||
|
||||
# Logging helpers. Everything is written to stderr so that the stdout of
# functions returning values through command substitution stays clean.
# `printf '%b'` expands backslash escapes the same way `echo -e` does.
log_info() {
    printf '%b\n' "${BLUE}[INFO]${NC} $1" >&2
}

log_success() {
    printf '%b\n' "${GREEN}[SUCCESS]${NC} $1" >&2
}

log_warning() {
    printf '%b\n' "${YELLOW}[WARNING]${NC} $1" >&2
}

log_error() {
    printf '%b\n' "${RED}[ERROR]${NC} $1" >&2
}
|
||||
|
||||
# Absolute path of the directory containing this script
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
|
||||
# Print the directory of the newest installed version.
#
# Arguments: $1 - directory holding one sub-directory per version
#                 (optional, default /opt/argus-metric/versions)
# Outputs:   "<versions_dir>/<highest version>" where "highest" is decided by
#            `sort -V`; when the directory is missing or contains no
#            sub-directory, its parent (/opt/argus-metric by default).
#
# Only sub-directories count as versions; the listing is built with a glob
# rather than by parsing `ls`, so stray files are never picked.
get_current_version_dir() {
    local versions_dir="${1:-/opt/argus-metric/versions}"
    local fallback_dir
    fallback_dir=$(dirname "$versions_dir")

    local -a version_names=()
    local entry latest_version_dir=""

    if [[ -d "$versions_dir" ]]; then
        for entry in "$versions_dir"/*/; do
            # An unmatched glob stays literal; skip it.
            [[ -d "$entry" ]] || continue
            entry=${entry%/}
            version_names+=("${entry##*/}")
        done

        if [[ ${#version_names[@]} -gt 0 ]]; then
            latest_version_dir=$(printf '%s\n' "${version_names[@]}" | sort -V | tail -n 1)
        fi
    fi

    if [[ -n "$latest_version_dir" ]]; then
        echo "$versions_dir/$latest_version_dir"
    else
        echo "$fallback_dir"
    fi
}
|
||||
|
||||
# The LATEST_VERSION file lives in the installation root
LOCAL_VERSION_FILE="/opt/argus-metric/LATEST_VERSION"
# Filled in by get_ftp_config
REMOTE_VERSION_URL=""
# Directory of the currently installed version; the check log is kept there
CURRENT_VERSION_DIR="$(get_current_version_dir)"
LOG_FILE="${CURRENT_VERSION_DIR}/.version_check.log"
|
||||
|
||||
# Resolve the FTP connection settings.
#
# Uses FTP_SERVER / FTP_USER / FTP_PASSWORD from the environment when all
# three are set; otherwise sources $SCRIPT_DIR/../config/config.env if it
# exists. Anything still unset afterwards falls back to a built-in default.
# Globals written: FTP_SERVER, FTP_USER, FTP_PASSWORD, REMOTE_VERSION_URL.
get_ftp_config() {
    log_info "获取 FTP 配置信息..."

    if [[ -n "$FTP_SERVER" && -n "$FTP_USER" && -n "$FTP_PASSWORD" ]]; then
        log_info "使用环境变量中的 FTP 配置"
    else
        # At least one value is missing from the environment: try the file.
        local config_file="$SCRIPT_DIR/../config/config.env"
        if [[ -f "$config_file" ]]; then
            log_info "从配置文件读取 FTP 配置: $config_file"
            source "$config_file"
        fi
    fi

    # Defaults for whatever neither the environment nor the file provided.
    # NOTE(review): the default password is hard-coded in the script.
    : "${FTP_SERVER:=localhost}"
    : "${FTP_USER:=ftpuser}"
    : "${FTP_PASSWORD:=ZGClab1234!}"

    # URL of the remote version file (embeds the credentials)
    REMOTE_VERSION_URL="ftp://${FTP_USER}:${FTP_PASSWORD}@${FTP_SERVER}/LATEST_VERSION"

    log_info "FTP 配置来源: ${FTP_CONFIG_SOURCE:-环境变量/配置文件}"
}
|
||||
|
||||
# Fetch the version string published in LATEST_VERSION on the FTP server.
#
# Globals read: FTP_SERVER, FTP_USER, FTP_PASSWORD.
# Outputs:      the version with all whitespace removed, on stdout.
# Returns:      0 on success; 1 when the server or the file is unreachable,
#               the download fails, or the file is empty.
#
# The logged address never contains the credentials (REMOTE_VERSION_URL
# embeds the password), and the download status is checked on its own so a
# curl failure is not hidden behind a pipeline.
get_remote_version() {
    local auth="${FTP_USER}:${FTP_PASSWORD}"
    local base_url="ftp://${FTP_SERVER}"
    local raw_version remote_version

    log_info "从 FTP 服务器获取远程版本号..."
    log_info "远程地址: ${base_url}/LATEST_VERSION"

    # Check that the server answers at all
    log_info "测试 FTP 连接..."
    if curl -u "$auth" -sfI "${base_url}/" >/dev/null 2>&1; then
        log_success "FTP 服务器连接成功"
    else
        log_error "无法连接到 FTP 服务器: $FTP_SERVER"
        return 1
    fi

    # Check that the version file exists
    log_info "检查远程 LATEST_VERSION 文件是否存在..."
    if curl -u "$auth" -sfI "${base_url}/LATEST_VERSION" >/dev/null 2>&1; then
        log_success "远程 LATEST_VERSION 文件存在"
    else
        log_error "远程 LATEST_VERSION 文件不存在或无法访问"
        return 1
    fi

    # Download the file
    if ! raw_version=$(curl -u "$auth" -sfL "${base_url}/LATEST_VERSION" 2>/dev/null); then
        log_error "获取远程版本号失败"
        return 1
    fi

    # Strip every whitespace character (spaces, newlines, CR, tabs)
    remote_version=${raw_version//[[:space:]]/}
    if [[ -z "$remote_version" ]]; then
        log_error "远程版本号为空"
        return 1
    fi

    log_success "获取到远程版本号: $remote_version"
    echo "$remote_version"
}
|
||||
|
||||
# Print the locally installed version read from $LOCAL_VERSION_FILE with all
# whitespace removed. Prints an empty line (and logs a warning) when the file
# is missing or empty.
get_local_version() {
    if [[ ! -f "$LOCAL_VERSION_FILE" ]]; then
        log_warning "本地版本文件不存在: $LOCAL_VERSION_FILE"
        echo ""
        return
    fi

    local installed_version
    installed_version=$(cat "$LOCAL_VERSION_FILE" 2>/dev/null | tr -d '[:space:]')

    if [[ -z "$installed_version" ]]; then
        log_warning "本地版本文件为空"
        echo ""
        return
    fi

    log_info "本地版本号: $installed_version"
    echo "$installed_version"
}
|
||||
|
||||
# Download setup.sh from the FTP server and run it to install a version.
#
# Arguments:    $1 - version to install
# Globals read: FTP_SERVER, FTP_USER, FTP_PASSWORD
# Returns:      0 when setup.sh succeeds, 1 on any failure.
#
# The working directory comes from mktemp instead of a predictable /tmp
# name. Credentials are passed to curl with -u rather than being embedded in
# the URL, so they are not written to the log and characters such as '@' or
# ':' in the password cannot break the URL.
# NOTE(review): setup.sh still receives the password on its command line
# because that is the interface used here.
update_to_version() {
    local new_version="$1"
    local setup_url="ftp://${FTP_SERVER}/setup.sh"
    local temp_dir setup_script
    local rc=0

    log_info "开始更新到版本: $new_version"

    # Private temporary directory
    if ! temp_dir=$(mktemp -d "${TMPDIR:-/tmp}/argus-update-XXXXXX"); then
        log_error "无法创建临时目录"
        return 1
    fi
    setup_script="$temp_dir/setup.sh"

    # Download the latest setup.sh
    log_info "从 FTP 服务器下载最新的安装脚本..."
    if curl -fsS -u "${FTP_USER}:${FTP_PASSWORD}" "$setup_url" -o "$setup_script"; then
        log_success "安装脚本下载完成"
    else
        log_error "下载安装脚本失败: $setup_url"
        rm -rf -- "$temp_dir"
        return 1
    fi

    chmod +x "$setup_script"

    # Run the installer; remember its status so cleanup happens exactly once.
    log_info "执行安装脚本进行版本更新..."
    "$setup_script" --server "$FTP_SERVER" --user "$FTP_USER" --password "$FTP_PASSWORD" --version "$new_version" || rc=$?
    rm -rf -- "$temp_dir"

    if [[ $rc -eq 0 ]]; then
        log_success "版本更新完成: $new_version"
        return 0
    fi

    log_error "版本更新失败: $new_version"
    return 1
}
|
||||
|
||||
# Append a timestamped line ("[YYYY-MM-DD HH:MM:SS] message") to $LOG_FILE.
log_check() {
    local entry="$1"
    local now=$(date '+%Y-%m-%d %H:%M:%S')
    printf '[%s] %s\n' "$now" "$entry" >> "$LOG_FILE"
}
|
||||
|
||||
# Entry point of the version check.
#
# Compares the local version with the one published on the FTP server and
# runs the update when they differ. Exits 1 when the remote version cannot
# be fetched or the update fails.
main() {
    log_info "开始版本校验检查..."

    # Create the directories before the first log_check call: $LOG_FILE lives
    # inside $CURRENT_VERSION_DIR, and appending to it on a fresh host would
    # otherwise fail and abort the script under `set -e`.
    mkdir -p "/opt/argus-metric"
    mkdir -p "$CURRENT_VERSION_DIR"

    log_check "版本校验检查开始"

    log_info "当前版本目录: $CURRENT_VERSION_DIR"

    # Resolve the FTP settings
    get_ftp_config

    # Local version (may be empty)
    local local_version
    local_version=$(get_local_version)

    # Remote version
    local remote_version
    if ! remote_version=$(get_remote_version); then
        log_error "无法获取远程版本号,跳过本次检查"
        log_check "版本校验失败:无法获取远程版本号"
        exit 1
    fi

    # Compare and update when needed
    if [[ "$local_version" == "$remote_version" ]]; then
        log_info "版本一致,无需更新 (本地: $local_version, 远程: $remote_version)"
        log_check "版本校验完成:版本一致 ($local_version)"
    else
        log_info "检测到版本不一致 (本地: $local_version, 远程: $remote_version)"
        log_check "检测到版本不一致:本地($local_version) -> 远程($remote_version)"

        if update_to_version "$remote_version"; then
            log_success "版本更新成功: $local_version -> $remote_version"
            log_check "版本更新成功:$local_version -> $remote_version"
        else
            log_error "版本更新失败"
            log_check "版本更新失败:$local_version -> $remote_version"
            exit 1
        fi
    fi

    log_success "版本校验检查完成"
    log_check "版本校验检查完成"
}
|
||||
|
||||
# Script entry point: run main only when the file is executed directly,
# not when it is sourced by another script.
if [[ "${0}" == "${BASH_SOURCE[0]}" ]]; then
    main "$@"
fi
|
||||
991
src/metric/client-plugins/all-in-one-full/scripts/install_artifact.sh
Executable file
991
src/metric/client-plugins/all-in-one-full/scripts/install_artifact.sh
Executable file
@ -0,0 +1,991 @@
|
||||
#!/bin/bash

# Abort on the first unhandled command failure
set -o errexit

# ANSI color sequences (interpreted later by `echo -e` in the log helpers)
NC='\033[0m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
|
||||
|
||||
# Shared implementation of the log helpers.
# Arguments: $1 - color sequence, $2 - level tag, $3 - message text
# Prints the colored message on stdout and appends a timestamped copy to
# $LOG_FILE. The file write is best-effort: when the log directory does not
# exist yet (or is not writable) the helper still returns 0, so logging can
# never abort the installer under `set -e`.
_install_log() {
    local message="[$2] $3"
    echo -e "${1}${message}${NC}"
    { echo "$(date '+%Y-%m-%d %H:%M:%S') $message" >> "$LOG_FILE"; } 2>/dev/null || true
}

log_info() {
    _install_log "$BLUE" "INFO" "$1"
}

log_success() {
    _install_log "$GREEN" "SUCCESS" "$1"
}

log_warning() {
    _install_log "$YELLOW" "WARNING" "$1"
}

log_error() {
    _install_log "$RED" "ERROR" "$1"
}
|
||||
|
||||
# Configuration variables
# Install directory: first argument, or the current directory when omitted.
INSTALL_DIR="${1:-$(pwd)}"
# Private working directory. mktemp gives an unpredictable name, which
# matters because the installer runs as root; the old PID-based name is only
# used when mktemp itself fails.
TEMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/metrics-install-XXXXXX" 2>/dev/null) || TEMP_DIR="/tmp/metrics-install-$$"
VERSION_FILE="version.json"
# Install log file
LOG_FILE="${INSTALL_DIR}/.install.log"
|
||||
|
||||
|
||||
# Source config.env from the directory of this script and export every
# variable it defines. A missing file only produces a warning.
load_config() {
    local here="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
    local env_file="$here/config.env"

    if [[ ! -f "$env_file" ]]; then
        log_warning "配置文件不存在: $env_file,使用默认配置"
        return
    fi

    log_info "加载配置文件: $env_file"
    # allexport: every assignment made while sourcing is exported
    set -a
    source "$env_file"
    set +a
    log_success "配置文件加载完成"
}
|
||||
|
||||
# Copy config.env into the install directory and make sure the version check
# script there is executable.
#
# Globals read: INSTALL_DIR
# Returns:      1 when copying the configuration file fails.
#
# The "already in place" test uses -ef (same device and inode) because the
# source path contains "/../" and would never equal the target as a string;
# copying a file onto itself makes cp fail.
copy_config_files() {
    log_info "复制配置文件到安装目录..."

    local script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
    local source_config="$script_dir/../config/config.env"
    local target_config="$INSTALL_DIR/config.env"

    if [[ -f "$source_config" ]]; then
        if [[ "$source_config" -ef "$target_config" ]]; then
            log_info "配置文件已在目标位置,跳过复制"
            log_success "配置文件已存在: $target_config"
        elif cp "$source_config" "$target_config"; then
            log_success "配置文件复制完成: $target_config"
        else
            log_error "配置文件复制失败"
            return 1
        fi
    else
        log_warning "源配置文件不存在: $source_config"
    fi

    # The version check script ships inside the artifact package; only its
    # permissions are adjusted here.
    log_info "复制版本校验脚本到安装目录..."
    local target_check_version="$INSTALL_DIR/check_version.sh"

    if [[ -f "$target_check_version" ]]; then
        log_info "版本校验脚本已存在,设置执行权限..."
        chmod +x "$target_check_version"
        log_success "版本校验脚本权限设置完成: $target_check_version"
    else
        log_warning "版本校验脚本不存在: $target_check_version"
        log_info "请确保 check_version.sh 已包含在 artifact 包中"
    fi
}
|
||||
|
||||
# Abort with usage hints unless the script runs as root.
check_root() {
    if (( EUID == 0 )); then
        return 0
    fi

    log_error "此脚本需要 root 权限运行"
    log_info "请使用: sudo $0 [安装目录]"
    log_info "如果不指定安装目录,将使用当前目录: $(pwd)"
    exit 1
}
|
||||
|
||||
# Report OS, architecture, free disk space and memory; warn when disk or
# memory is below the recommended minimum. Exits 1 when /etc/os-release is
# missing.
#
# /etc/os-release is read in a subshell: it defines NAME, VERSION, ID, ...
# and sourcing it in the main shell would overwrite the global VERSION that
# parse_version_info fills with the artifact version.
check_system() {
    local os_name os_version

    log_info "检查系统要求..."

    # Operating system
    if [[ ! -f /etc/os-release ]]; then
        log_error "无法检测操作系统版本"
        exit 1
    fi

    os_name=$(. /etc/os-release && printf '%s' "${NAME:-}")
    os_version=$(. /etc/os-release && printf '%s' "${VERSION:-}")
    log_info "检测到操作系统: $os_name $os_version"

    # Architecture
    arch=$(uname -m)
    log_info "系统架构: $arch"

    # Free space on /, in KB. -P keeps each filesystem on a single line and
    # -k pins the unit to 1024-byte blocks regardless of the environment.
    available_space=$(df -Pk / | awk 'NR==2 {print $4}')
    if [[ $available_space -lt 10485760 ]]; then # 10GB in KB
        log_warning "可用磁盘空间不足 10GB,当前可用: $(($available_space / 1024 / 1024))GB"
    fi

    # Total memory in MB
    total_mem=$(free -m | awk 'NR==2{print $2}')
    if [[ $total_mem -lt 4096 ]]; then # 4GB
        log_warning "系统内存不足 4GB,当前: ${total_mem}MB"
    fi
}
|
||||
|
||||
# Locate the version file and store its absolute path in VERSION_FILE_PATH.
# Looks in the current directory first, then in every artifact/*/ directory.
# Exits 1 when the file cannot be found.
find_version_file() {
    log_info "查找版本信息文件..."

    # 1) current working directory
    if [[ -f "$VERSION_FILE" ]]; then
        VERSION_FILE_PATH="$PWD/$VERSION_FILE"
        log_success "找到版本文件: $VERSION_FILE"
        return 0
    fi

    # 2) first artifact/<version>/ directory containing the file
    local candidate
    for version_dir in artifact/*/; do
        candidate="${version_dir}${VERSION_FILE}"
        [[ -f "$candidate" ]] || continue

        VERSION_FILE_PATH="$(cd "$(dirname "$candidate")" && pwd)/$(basename "$candidate")"
        log_success "找到版本文件: $VERSION_FILE_PATH"
        return 0
    done

    log_error "未找到版本信息文件 $VERSION_FILE"
    exit 1
}
|
||||
|
||||
# Print the body lines of the JSON object or array stored under key $2 in
# file $1: everything after the line containing "key" up to (not including)
# the first line that starts with a closing brace or bracket.
# Assumes a pretty-printed file with one entry per line.
_version_json_section() {
    awk -v key="\"$2\"" '
        !inside { if (index($0, key)) inside = 1; next }
        /^[[:space:]]*[]}]/ { exit }
        { print }
    ' "$1"
}

# Parse the version file.
#
# Globals read:    VERSION_FILE_PATH, TEMP_DIR
# Globals written: VERSION, BUILD_TIME, component_count
# Files written:   $TEMP_DIR/components.txt    "<component>:<version>"
#                  $TEMP_DIR/checksums.txt     "<component>:<checksum>"
#                  $TEMP_DIR/install_order.txt one file name per line
# Exits 1 when the file is missing, malformed, or lacks a required section.
#
# jq is used when available. The fallback parser reads each section only up
# to its closing bracket, so entries of later sections (for example the
# checksums) cannot leak into components.txt, and it overwrites the output
# files instead of appending to leftovers.
parse_version_info() {
    log_info "解析版本信息..."

    if [[ ! -f "$VERSION_FILE_PATH" ]]; then
        log_error "版本文件不存在: $VERSION_FILE_PATH"
        exit 1
    fi

    local pair_filter='s/^[[:space:]]*"\([^"]*\)":[[:space:]]*"\([^"]*\)".*/\1:\2/p'
    local item_filter='s/^[[:space:]]*"\([^"]*\)".*/\1/p'

    if command -v jq &> /dev/null; then
        # Validate the JSON syntax first
        if ! jq empty "$VERSION_FILE_PATH" 2>/dev/null; then
            log_error "JSON文件格式错误,请检查 $VERSION_FILE_PATH"
            exit 1
        fi

        VERSION=$(jq -r '.version' "$VERSION_FILE_PATH")
        BUILD_TIME=$(jq -r '.build_time' "$VERSION_FILE_PATH")

        # artifact_list
        if jq -e '.artifact_list' "$VERSION_FILE_PATH" > /dev/null 2>&1; then
            jq -r '.artifact_list | to_entries[] | "\(.key):\(.value)"' "$VERSION_FILE_PATH" > "$TEMP_DIR/components.txt"
        else
            log_error "version.json 中缺少 artifact_list 字段"
            exit 1
        fi

        # checksums
        if jq -e '.checksums' "$VERSION_FILE_PATH" > /dev/null 2>&1; then
            jq -r '.checksums | to_entries[] | "\(.key):\(.value)"' "$VERSION_FILE_PATH" > "$TEMP_DIR/checksums.txt"
        else
            log_error "version.json 中缺少 checksums 字段"
            exit 1
        fi

        # install_order (entries are complete file names)
        if jq -e '.install_order' "$VERSION_FILE_PATH" > /dev/null 2>&1; then
            jq -r '.install_order[]' "$VERSION_FILE_PATH" > "$TEMP_DIR/install_order.txt"
        else
            log_error "version.json 中缺少 install_order 字段"
            exit 1
        fi
    else
        log_warning "jq 未安装,使用简单的 JSON 解析"

        # Scalar fields: first matching line only
        VERSION=$(sed -n 's/.*"version": *"\([^"]*\)".*/\1/p' "$VERSION_FILE_PATH" | head -n 1)
        BUILD_TIME=$(sed -n 's/.*"build_time": *"\([^"]*\)".*/\1/p' "$VERSION_FILE_PATH" | head -n 1)

        _version_json_section "$VERSION_FILE_PATH" "artifact_list" \
            | sed -n "$pair_filter" > "$TEMP_DIR/components.txt"
        _version_json_section "$VERSION_FILE_PATH" "checksums" \
            | sed -n "$pair_filter" > "$TEMP_DIR/checksums.txt"
        _version_json_section "$VERSION_FILE_PATH" "install_order" \
            | sed -n "$item_filter" > "$TEMP_DIR/install_order.txt"

        # Every section must have produced at least one entry
        if [[ ! -s "$TEMP_DIR/components.txt" ]]; then
            log_error "无法解析 artifact_list,请检查 version.json 格式"
            exit 1
        fi

        if [[ ! -s "$TEMP_DIR/checksums.txt" ]]; then
            log_error "无法解析 checksums,请检查 version.json 格式"
            exit 1
        fi

        if [[ ! -s "$TEMP_DIR/install_order.txt" ]]; then
            log_error "无法解析 install_order,请检查 version.json 格式"
            exit 1
        fi
    fi

    log_success "版本信息解析完成"
    log_info " 版本: $VERSION"
    log_info " 构建时间: $BUILD_TIME"

    component_count=0
    if [[ -f "$TEMP_DIR/components.txt" ]]; then
        component_count=$(wc -l < "$TEMP_DIR/components.txt")
        log_info " 组件数量: $component_count"
        log_info " 组件列表:"
        while IFS= read -r line; do
            component=$(echo "$line" | cut -d':' -f1)
            version=$(echo "$line" | cut -d':' -f2)
            log_info " - $component v$version"
        done < "$TEMP_DIR/components.txt"
    else
        log_error "components.txt 文件不存在"
        exit 1
    fi
}
|
||||
|
||||
# Verify every artifact against the checksums parsed from the version file.
#
# Reads "<component>:<checksum>" lines from $TEMP_DIR/checksums.txt, takes
# the first "<component>-*.tar.gz" next to the version file and compares its
# "sha256:<hex>" digest with the expected value. Exits 1 when a file is
# missing or a digest differs.
verify_checksums() {
    log_info "验证文件完整性..."

    artifact_dir=$(dirname "$VERSION_FILE_PATH")
    log_info "Artifact 目录: $artifact_dir"
    failed_verification=0

    if [[ -f "$TEMP_DIR/checksums.txt" ]]; then
        while IFS= read -r line; do
            # Component name is the text before the first ':'; the expected
            # checksum is everything after it (it contains a ':' itself).
            component=${line%%:*}
            expected_checksum=${line#*:}

            # First existing tarball for this component
            actual_file=""
            for file in "$artifact_dir/${component}-"*.tar.gz; do
                if [[ -f "$file" ]]; then
                    actual_file="$file"
                    break
                fi
            done

            if [[ -z "$actual_file" ]]; then
                log_error "找不到组件文件: $component"
                failed_verification=1
                continue
            fi

            actual_checksum="sha256:$(sha256sum "$actual_file" | cut -d' ' -f1)"

            if [[ "$actual_checksum" != "$expected_checksum" ]]; then
                log_error " $component: 校验失败"
                log_error " 期望: $expected_checksum"
                log_error " 实际: $actual_checksum"
                failed_verification=1
            else
                log_success " $component: 校验通过"
            fi
        done < "$TEMP_DIR/checksums.txt"
    fi

    if [[ $failed_verification -eq 1 ]]; then
        log_error "文件完整性验证失败"
        exit 1
    fi

    log_success "所有文件校验通过"
}
|
||||
|
||||
# Create the install directory and the temporary working directory.
create_install_dirs() {
    log_info "创建安装目录..."

    local wanted
    for wanted in "$INSTALL_DIR" "$TEMP_DIR"; do
        mkdir -p "$wanted"
    done

    log_success "安装目录创建完成: $INSTALL_DIR"
}
|
||||
|
||||
# Map the OS release to the name of the matching dependency directory.
#
# Arguments: $1 - os-release file to read (optional, default /etc/os-release)
# Outputs:   exactly one line on stdout: "ubuntu20" for VERSION_ID 20.04,
#            "ubuntu22" for 22.04 and for any unrecognized version.
# Returns:   1 when the os-release file does not exist.
#
# Callers capture stdout, while this script's log helpers also print to
# stdout; diagnostics are therefore redirected to stderr here so they cannot
# end up in the returned value. The file is read in a subshell so its
# variables (NAME, VERSION, ...) do not overwrite the caller's.
get_system_version() {
    local os_release_file="${1:-/etc/os-release}"
    local version_id

    if [[ ! -f "$os_release_file" ]]; then
        log_error "无法检测操作系统版本" >&2
        return 1
    fi

    version_id=$(. "$os_release_file" && printf '%s' "${VERSION_ID:-}")

    case "$version_id" in
        "20.04")
            echo "ubuntu20"
            ;;
        "22.04")
            echo "ubuntu22"
            ;;
        *)
            log_warning "未识别的Ubuntu版本: $version_id,尝试使用ubuntu22" >&2
            echo "ubuntu22"
            ;;
    esac
}
|
||||
|
||||
# Install the bundled system dependencies (offline mode).
#
# Globals read: VERSION_FILE_PATH
# Each deps/<system version>/*.tar.gz next to the version file is unpacked
# into a private temporary directory and all .deb files inside it are
# installed with a single dpkg call. Exits 1 when a core dependency
# (jq, cron, curl) is still missing afterwards or a package failed to
# install; returns 0 when there is no dependency directory for this system.
#
# Differences from a plain "dpkg -i and hope":
#  - after a failed dpkg -i, every package of the bundle is verified with
#    `dpkg -s` (the former `dpkg -l | grep '^ii'` test was true on any
#    system that has at least one package installed);
#  - the working directory comes from mktemp and the dpkg output is kept
#    inside it instead of a fixed /tmp file name;
#  - a cron start failure no longer aborts the function before the
#    dependency report is printed.
install_system_deps() {
    log_info "开始安装系统依赖包(离线模式)..."

    local artifact_dir
    artifact_dir=$(dirname "$VERSION_FILE_PATH")
    local deps_dir="$artifact_dir/deps"
    local system_version
    system_version=$(get_system_version)
    # Keep only the last line: the log helpers print to stdout, so a warning
    # from get_system_version may precede the actual value.
    system_version=${system_version##*$'\n'}
    local version_deps_dir="$deps_dir/$system_version"

    if [[ ! -d "$version_deps_dir" ]]; then
        log_warning "未找到 $system_version 版本的依赖目录: $version_deps_dir,跳过安装"
        return 0
    fi

    log_info "找到系统版本依赖目录: $version_deps_dir"

    local deps_temp_dir
    if ! deps_temp_dir=$(mktemp -d "${TMPDIR:-/tmp}/argus_deps.XXXXXX"); then
        log_error "无法创建临时目录"
        exit 1
    fi
    local dpkg_log="$deps_temp_dir/dpkg_install.log"

    local FAILED_DEPS=()
    local CORE_DEPS=(jq cron curl) # core dependency list
    local tar_file tar_basename extract_dir deb pkg_name unresolved
    local -a deb_files

    # Process every tar.gz bundle
    for tar_file in "$version_deps_dir"/*.tar.gz; do
        [[ -f "$tar_file" ]] || continue

        tar_basename=$(basename "$tar_file")
        log_info "处理依赖包: $tar_basename"

        extract_dir="$deps_temp_dir/${tar_basename%.tar.gz}"
        mkdir -p "$extract_dir"

        if tar -xzf "$tar_file" -C "$extract_dir"; then
            log_success " $tar_basename 解压完成"
        else
            log_error " $tar_basename 解压失败"
            FAILED_DEPS+=("$tar_basename")
            continue
        fi

        # Collect all .deb files and install them in one dpkg call so that
        # dependencies between them are resolved.
        mapfile -t deb_files < <(find "$extract_dir" -type f -name "*.deb")
        if [[ ${#deb_files[@]} -eq 0 ]]; then
            log_warning " 没有找到 deb 包,跳过"
            continue
        fi

        log_info " 安装 ${#deb_files[@]} 个 deb 包..."
        if dpkg -i "${deb_files[@]}" >> "$dpkg_log" 2>&1; then
            log_success " 所有 deb 包安装成功"
            continue
        fi

        # Let dpkg finish half-configured packages, then verify each package
        # of this bundle individually.
        dpkg --configure -a >> "$dpkg_log" 2>&1 || true
        unresolved=0
        for deb in "${deb_files[@]}"; do
            pkg_name=$(dpkg-deb -f "$deb" Package 2>/dev/null || true)
            if [[ -z "$pkg_name" ]] ||
               ! dpkg -s "$pkg_name" 2>/dev/null | grep -q '^Status: install ok installed'; then
                FAILED_DEPS+=("${pkg_name:-$deb}")
                unresolved=1
            fi
        done

        if [[ $unresolved -eq 0 ]]; then
            log_success " dpkg --configure 修复后安装成功"
        else
            log_error " 部分 deb 包安装失败,请手动安装"
        fi
    done

    # Start cron; a failure is reported by the core dependency check below
    # rather than aborting here.
    start_cron_service || log_warning "cron 服务未能启动"

    # Check that every core dependency is installed
    local missing_core=()
    local dep
    for dep in "${CORE_DEPS[@]}"; do
        if ! dpkg -s "$dep" &>/dev/null; then
            missing_core+=("$dep")
        fi
    done

    if [[ ${#missing_core[@]} -gt 0 ]]; then
        log_error "核心依赖安装失败,请手动安装以下组件:"
        for dep in "${missing_core[@]}"; do
            echo " - $dep"
        done
        log_info "dpkg 输出: $dpkg_log"
        exit 1
    fi

    # Report the remaining packages that failed to install
    if [[ ${#FAILED_DEPS[@]} -gt 0 ]]; then
        log_error "以下系统依赖安装失败,请手动安装后重试:"
        for dep in "${FAILED_DEPS[@]}"; do
            echo " - $dep"
        done
        log_info "dpkg 输出: $dpkg_log"
        exit 1
    fi

    # Everything installed: the unpacked bundles are no longer needed.
    rm -rf -- "$deps_temp_dir"

    log_success "系统依赖安装完成,全部就绪"
}
|
||||
|
||||
# Make sure the cron daemon is running.
# Returns: 0 when cron is already running or was started;
#          1 when /usr/sbin/cron is missing or cannot be started.
start_cron_service() {
    local cron_bin="/usr/sbin/cron"

    log_info "检查并启动 cron 服务..."

    # Already running?
    if pgrep -x "cron" > /dev/null; then
        log_success "cron 服务已在运行"
        return 0
    fi

    # Is the daemon binary present?
    if ! [[ -f "$cron_bin" ]]; then
        log_warning "cron 可执行文件不存在,跳过启动"
        return 1
    fi

    # Start it: first with the "start" argument, then without arguments.
    log_info "启动 cron 服务..."
    if ! { "$cron_bin" start 2>/dev/null || "$cron_bin" 2>/dev/null; }; then
        log_error "cron 服务启动失败"
        return 1
    fi

    log_success "cron 服务启动成功"

    # Give the daemon a moment, then confirm that the process exists.
    sleep 2

    if pgrep -x "cron" > /dev/null; then
        log_success "cron 服务运行正常"
    else
        log_warning "cron 服务可能未正常启动"
    fi
}
|
||||
|
||||
# Install every component listed in $TEMP_DIR/install_order.txt, in order.
#
# Globals read:    VERSION_FILE_PATH, TEMP_DIR, INSTALL_DIR
# Globals written: artifact_dir, install_count, total_count
# For each "<component>-<YYYYMMDD>-<HHMMSS>.tar.gz" the archive is unpacked
# into a temporary directory, its install.sh is run with the install
# directory as argument, and the unpacked directory is then moved to
# $INSTALL_DIR/<component>, replacing an existing one without backup.
# Exits 1 on the first missing archive, unpack error or installer failure.
#
# The order file is read through file descriptor 3, not stdin: install.sh
# inherits the loop's stdin, and a script that reads from it would
# otherwise swallow the remaining entries of the list.
install_components() {
    log_info "开始安装组件..."

    artifact_dir=$(dirname "$VERSION_FILE_PATH")
    log_info "Artifact 目录: $artifact_dir"
    install_count=0
    total_count=0

    local order_file="$TEMP_DIR/install_order.txt"
    local filename component tar_file component_temp_dir extracted_dir dir
    local component_install_dir listing_line

    if [[ -f "$order_file" ]]; then
        total_count=$(wc -l < "$order_file")

        while IFS= read -r -u 3 filename || [[ -n "$filename" ]]; do
            # Ignore blank lines in the order file
            [[ -n "$filename" ]] || continue
            install_count=$((install_count + 1))

            # Component name = file name without the timestamp suffix
            component=$(echo "$filename" | sed 's/-[0-9]\{8\}-[0-9]\{6\}\.tar\.gz$//')

            log_info "[$install_count/$total_count] 安装 $component..."
            log_info " 文件名: $filename"

            tar_file="$artifact_dir/$filename"

            if [[ ! -f "$tar_file" ]]; then
                log_error "找不到组件文件: $filename"
                log_info " 期望路径: $tar_file"
                log_info " 当前目录: $(pwd)"
                log_info " 目录内容:"
                ls -la "$artifact_dir" | while IFS= read -r listing_line; do
                    log_info " $listing_line"
                done
                exit 1
            fi

            log_info " 找到文件: $tar_file"

            # Unpack into a per-component temporary directory
            component_temp_dir="$TEMP_DIR/$component"
            mkdir -p "$component_temp_dir"

            if tar -xzf "$tar_file" -C "$component_temp_dir" 2>/dev/null; then
                log_success " $component 解压完成"
            else
                log_error " $component 解压失败"
                exit 1
            fi

            # The archive is expected to contain one top-level directory
            extracted_dir=""
            for dir in "$component_temp_dir"/*; do
                if [[ -d "$dir" ]]; then
                    extracted_dir="$dir"
                    break
                fi
            done

            if [[ -z "$extracted_dir" ]]; then
                log_error " $component 解压后未找到目录"
                exit 1
            fi

            # Run the component's own installer
            if [[ -f "$extracted_dir/install.sh" ]]; then
                log_info " 执行 $component 安装脚本..."
                if (cd "$extracted_dir" && ./install.sh "$INSTALL_DIR"); then
                    log_success " $component 安装完成"
                else
                    log_error " $component 安装失败"
                    exit 1
                fi
            else
                log_error " $component 缺少 install.sh 文件"
                exit 1
            fi

            # Keep the unpacked directory under the install directory. An
            # existing one is deleted, not backed up; ${...:?} refuses to
            # run rm -rf with an empty path.
            component_install_dir="$INSTALL_DIR/$component"
            if [[ -d "$component_install_dir" ]]; then
                log_info " 组件目录已存在,删除旧版本: $component_install_dir"
                rm -rf -- "${component_install_dir:?}"
            fi
            mv "$extracted_dir" "$component_install_dir"
            log_success " 组件目录已保存: $component_install_dir"

            # Remove the temporary files of this component
            rm -rf -- "${component_temp_dir:?}"
        done 3< "$order_file"
    fi

    log_success "所有组件安装完成"
}
|
||||
|
||||
# Write the installation record as JSON to "$INSTALL_DIR/.install_record".
#
# Globals read: INSTALL_DIR, TEMP_DIR, VERSION, BUILD_TIME.
# Input:  "$TEMP_DIR/components.txt" (optional), one "name:version" per line.
# Output: overwrites the record file; progress is reported through log_*.
# For every listed component the PID of a matching running process is looked
# up (empty string when none is found) and stored next to its version.
create_install_record() {
    log_info "创建安装记录..."

    # Give the component processes a moment to come up before probing PIDs.
    log_info "等待进程启动..."
    sleep 3

    # Declared separately so a failing `date` is not masked by `local`.
    local install_time
    install_time=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
    local install_record_file="$INSTALL_DIR/.install_record"

    # JSON header; the "components" object is filled in below.
    cat > "$install_record_file" << EOF
{
"version": "$VERSION",
"build_time": "$BUILD_TIME",
"install_time": "$install_time",
"install_dir": "$INSTALL_DIR",
"install_pid": $$,
"components": {
EOF

    # All loop variables are local so they cannot clobber same-named globals.
    local first_component=true
    local line rest component version component_pid pattern
    local -a patterns

    if [[ -f "$TEMP_DIR/components.txt" ]]; then
        # `|| [[ -n "$line" ]]` keeps a last line that has no trailing newline.
        while IFS= read -r line || [[ -n "$line" ]]; do
            [[ -n "$line" ]] || continue

            # Fields 1 and 2 of the colon-separated entry.
            component=${line%%:*}
            rest=${line#*:}
            version=${rest%%:*}

            # Command-line patterns to try for each known component.
            case "$component" in
                "node-exporter") patterns=("node_exporter" "node-exporter") ;;
                "dcgm-exporter") patterns=("dcgm-exporter" "dcgm_exporter") ;;
                "fluent-bit")    patterns=("fluent-bit" "fluent_bit") ;;
                "argus-agent")   patterns=("argus-agent") ;;
                *)               patterns=() ;;
            esac

            component_pid=""
            if [[ ${#patterns[@]} -gt 0 ]]; then
                for pattern in "${patterns[@]}"; do
                    component_pid=$(pgrep -f "$pattern" | head -1)
                    if [[ -n "$component_pid" ]]; then
                        break
                    fi
                done
                # Last resort: scan the ps listing for the primary pattern.
                if [[ -z "$component_pid" ]]; then
                    component_pid=$(ps aux | grep -v grep | grep "${patterns[0]}" | awk '{print $2}' | head -1)
                fi
            fi

            if [[ -n "$component_pid" ]]; then
                log_info " 找到 $component 进程 PID: $component_pid"
            else
                log_warning " 未找到 $component 进程"
            fi

            # Entries after the first are preceded by a comma line.
            if [[ "$first_component" == "true" ]]; then
                first_component=false
            else
                echo "," >> "$install_record_file"
            fi

            cat >> "$install_record_file" << EOF
"$component": {
"version": "$version",
"pid": "$component_pid",
"install_dir": "$INSTALL_DIR/$component"
}
EOF
        done < "$TEMP_DIR/components.txt"
    fi

    # Close "components" and the top-level object.
    cat >> "$install_record_file" << EOF
}
}
EOF

    log_success "安装记录已创建: $install_record_file"
}
|
||||
|
||||
# Report whether a crontab dump already mentions a task.
#   $1  literal text to look for (e.g. a script file name)
#   $2  path of the file holding the crontab contents
# Returns 0 when the text occurs in the file, 1 otherwise.
check_cron_task_exists() {
    local task_pattern="$1"
    local temp_cron="$2"

    # -F: match the text literally, so "." in a file name is not a regex
    # wildcard; "--" protects patterns that start with a dash.
    if grep -qF -- "$task_pattern" "$temp_cron"; then
        return 0
    fi
    return 1
}
|
||||
|
||||
# Install (or refresh) the crontab entry that runs check_health.sh every
# five minutes, logging to "$INSTALL_DIR/.health_cron.log".
#
# Globals read: INSTALL_DIR. Returns 1 when the script is missing or the
# crontab cannot be installed, 0 otherwise.
setup_health_check_cron() {
    log_info "设置健康检查定时任务..."

    # The script is taken from the install directory itself.
    local check_health_script="$INSTALL_DIR/check_health.sh"
    local cron_comment="# Argus-Metrics 健康检查定时任务"

    if [[ ! -f "$check_health_script" ]]; then
        log_error "健康检查脚本不存在: $check_health_script"
        return 1
    fi

    chmod +x "$check_health_script"

    # Unpredictable scratch file instead of a guessable /tmp/crontab_<pid>.
    local temp_cron
    if ! temp_cron=$(mktemp "${TMPDIR:-/tmp}/crontab.XXXXXX"); then
        log_error "健康检查定时任务设置失败"
        return 1
    fi

    # Start from the current crontab, or from an empty file if there is none.
    crontab -l 2>/dev/null > "$temp_cron" || touch "$temp_cron"

    if check_cron_task_exists "check_health.sh" "$temp_cron"; then
        log_info "发现旧的健康检查定时任务,正在更新..."
        # Drop the old job line AND its header comment, so the comment does
        # not accumulate on every reinstall. grep exits 1 when nothing is
        # left over; only a status above 1 is a real error.
        local rc=0
        grep -vF -e "check_health.sh" -e "$cron_comment" -- "$temp_cron" > "$temp_cron.new" || rc=$?
        if [[ $rc -gt 1 ]]; then
            log_error "健康检查定时任务设置失败"
            rm -f "$temp_cron" "$temp_cron.new"
            return 1
        fi
        mv "$temp_cron.new" "$temp_cron"
        log_info "旧的健康检查定时任务已删除"
    fi

    # New entry: every 5 minutes.
    echo "$cron_comment" >> "$temp_cron"
    echo "*/5 * * * * $check_health_script >> $INSTALL_DIR/.health_cron.log 2>&1" >> "$temp_cron"

    if crontab "$temp_cron"; then
        log_success "健康检查定时任务设置成功"
        log_info " 执行频率: 每5分钟"
        log_info " 日志文件: $INSTALL_DIR/.health_cron.log"
        log_info " 查看定时任务: crontab -l"
        log_info " 删除定时任务: crontab -e"
    else
        log_error "健康检查定时任务设置失败"
        rm -f "$temp_cron"
        return 1
    fi

    rm -f "$temp_cron"

    log_info "健康检查通过crontab自动执行"
}
|
||||
|
||||
# Install (or refresh) the crontab entry that runs sync_dns.sh every minute,
# logging to "$INSTALL_DIR/.dns_sync.log".
#
# Globals read: INSTALL_DIR. A missing sync script is not an error: the
# function warns and returns 0. Returns 1 when the crontab cannot be installed.
setup_dns_sync_cron() {
    log_info "设置 DNS 同步定时任务..."

    # The script is taken from the current version directory.
    local sync_dns_script="$INSTALL_DIR/sync_dns.sh"
    local cron_comment="# Argus-Metrics DNS 同步定时任务"

    if [[ ! -f "$sync_dns_script" ]]; then
        log_warning "DNS 同步脚本不存在: $sync_dns_script"
        log_warning "跳过 DNS 同步定时任务设置"
        return 0
    fi

    chmod +x "$sync_dns_script"

    # Unpredictable scratch file instead of a guessable /tmp/crontab_<pid>.
    local temp_cron
    if ! temp_cron=$(mktemp "${TMPDIR:-/tmp}/crontab.XXXXXX"); then
        log_error "DNS 同步定时任务设置失败"
        return 1
    fi

    # Start from the current crontab, or from an empty file if there is none.
    crontab -l 2>/dev/null > "$temp_cron" || touch "$temp_cron"

    if check_cron_task_exists "sync_dns.sh" "$temp_cron"; then
        log_info "发现旧的 DNS 同步定时任务,正在更新..."
        # Drop the old job line AND its header comment, so the comment does
        # not accumulate on every reinstall. grep exits 1 when nothing is
        # left over; only a status above 1 is a real error.
        local rc=0
        grep -vF -e "sync_dns.sh" -e "$cron_comment" -- "$temp_cron" > "$temp_cron.new" || rc=$?
        if [[ $rc -gt 1 ]]; then
            log_error "DNS 同步定时任务设置失败"
            rm -f "$temp_cron" "$temp_cron.new"
            return 1
        fi
        mv "$temp_cron.new" "$temp_cron"
        log_info "旧的 DNS 同步定时任务已删除"
    fi

    # New entry: every minute.
    echo "$cron_comment" >> "$temp_cron"
    echo "* * * * * $sync_dns_script >> $INSTALL_DIR/.dns_sync.log 2>&1" >> "$temp_cron"

    if crontab "$temp_cron"; then
        log_success "DNS 同步定时任务设置成功"
        log_info " 执行频率: 每1分钟"
        log_info " 日志文件: $INSTALL_DIR/.dns_sync.log"
        log_info " 查看定时任务: crontab -l"
        log_info " 删除定时任务: crontab -e"
    else
        log_error "DNS 同步定时任务设置失败"
        rm -f "$temp_cron"
        return 1
    fi

    rm -f "$temp_cron"

    log_info "DNS 同步通过crontab自动执行"
}
|
||||
|
||||
# Install (or refresh) the crontab entry that runs check_version.sh every
# minute, logging to "$INSTALL_DIR/.version_check.log".
#
# Globals read: INSTALL_DIR. A missing script is not an error: the function
# warns and returns 0. Returns 1 when the crontab cannot be installed.
setup_version_check_cron() {
    log_info "设置版本校验定时任务..."

    # The script is taken from the current version directory.
    local check_version_script="$INSTALL_DIR/check_version.sh"
    local cron_comment="# Argus-Metrics 版本校验定时任务"

    if [[ ! -f "$check_version_script" ]]; then
        log_warning "版本校验脚本不存在: $check_version_script"
        log_info "跳过版本校验定时任务设置"
        return 0
    fi

    chmod +x "$check_version_script"

    # Unpredictable scratch file instead of a guessable /tmp/crontab_<pid>.
    local temp_cron
    if ! temp_cron=$(mktemp "${TMPDIR:-/tmp}/crontab.XXXXXX"); then
        log_error "版本校验定时任务设置失败"
        return 1
    fi

    # Start from the current crontab, or from an empty file if there is none.
    crontab -l > "$temp_cron" 2>/dev/null || touch "$temp_cron"

    if check_cron_task_exists "check_version.sh" "$temp_cron"; then
        log_info "发现旧的版本校验定时任务,正在更新..."
        # Drop the old job line AND its header comment, so the comment does
        # not accumulate on every reinstall. grep exits 1 when nothing is
        # left over; only a status above 1 is a real error.
        local rc=0
        grep -vF -e "check_version.sh" -e "$cron_comment" -- "$temp_cron" > "$temp_cron.new" || rc=$?
        if [[ $rc -gt 1 ]]; then
            log_error "版本校验定时任务设置失败"
            rm -f "$temp_cron" "$temp_cron.new"
            return 1
        fi
        mv "$temp_cron.new" "$temp_cron"
        log_info "旧的版本校验定时任务已删除"
    fi

    # New entry: every minute ("*/1"), matching the frequency logged below.
    echo "$cron_comment" >> "$temp_cron"
    echo "*/1 * * * * $check_version_script >> $INSTALL_DIR/.version_check.log 2>&1" >> "$temp_cron"

    if crontab "$temp_cron"; then
        log_success "版本校验定时任务设置成功"
        log_info " 执行频率: 每1分钟"
        log_info " 日志文件: $INSTALL_DIR/.version_check.log"
        log_info " 查看定时任务: crontab -l"
        log_info " 删除定时任务: crontab -e"
    else
        log_error "版本校验定时任务设置失败"
        rm -f "$temp_cron"
        return 1
    fi

    rm -f "$temp_cron"

    log_info "版本校验通过crontab自动执行"
}
|
||||
|
||||
# Install (or refresh) the crontab entry that runs restart_unhealthy.sh every
# two minutes, logging to "$INSTALL_DIR/.restart.log".
#
# Globals read: INSTALL_DIR. A missing script is not an error: the function
# warns and returns 0. Returns 1 when the crontab cannot be installed.
setup_restart_cron() {
    log_info "设置自动重启定时任务..."

    # The script is taken from the current version directory.
    local restart_script="$INSTALL_DIR/restart_unhealthy.sh"
    local cron_comment="# Argus-Metrics 自动重启定时任务"

    if [[ ! -f "$restart_script" ]]; then
        log_warning "重启脚本不存在: $restart_script"
        log_info "跳过自动重启定时任务设置"
        return 0
    fi

    chmod +x "$restart_script"

    # Unpredictable scratch file instead of a guessable /tmp/crontab_<pid>.
    local temp_cron
    if ! temp_cron=$(mktemp "${TMPDIR:-/tmp}/crontab.XXXXXX"); then
        log_error "自动重启定时任务设置失败"
        return 1
    fi

    # Start from the current crontab, or from an empty file if there is none.
    crontab -l > "$temp_cron" 2>/dev/null || touch "$temp_cron"

    if check_cron_task_exists "restart_unhealthy.sh" "$temp_cron"; then
        log_info "发现旧的自动重启定时任务,正在更新..."
        # Drop the old job line AND its header comment, so the comment does
        # not accumulate on every reinstall. grep exits 1 when nothing is
        # left over; only a status above 1 is a real error.
        local rc=0
        grep -vF -e "restart_unhealthy.sh" -e "$cron_comment" -- "$temp_cron" > "$temp_cron.new" || rc=$?
        if [[ $rc -gt 1 ]]; then
            log_error "自动重启定时任务设置失败"
            rm -f "$temp_cron" "$temp_cron.new"
            return 1
        fi
        mv "$temp_cron.new" "$temp_cron"
        log_info "旧的自动重启定时任务已删除"
    fi

    # New entry: every 2 minutes.
    echo "$cron_comment" >> "$temp_cron"
    echo "*/2 * * * * $restart_script >> $INSTALL_DIR/.restart.log 2>&1" >> "$temp_cron"

    if crontab "$temp_cron"; then
        log_success "自动重启定时任务设置成功"
        log_info " 执行频率: 每2分钟"
        log_info " 日志文件: $INSTALL_DIR/.restart.log"
        log_info " 查看定时任务: crontab -l"
        log_info " 删除定时任务: crontab -e"
    else
        log_error "自动重启定时任务设置失败"
        rm -f "$temp_cron"
        return 1
    fi

    rm -f "$temp_cron"

    log_info "自动重启检查通过crontab自动执行"
}
|
||||
|
||||
# Print the closing summary: success banner plus where the install log lives.
# Globals read: LOG_FILE.
show_install_info() {
    log_success "Argus-Metrics All-in-One 安装完成!"
    printf '\n'
    log_info "安装日志已保存到: $LOG_FILE"
    log_info "如需查看详细日志,请执行: cat $LOG_FILE"
    printf '\n'
}
|
||||
|
||||
# Remove the scratch directory named by TEMP_DIR, if there is one.
# Runs from the EXIT trap below, so it must tolerate being called before
# TEMP_DIR has been assigned (also under `set -u`).
cleanup() {
    local dir="${TEMP_DIR:-}"
    if [[ -n "$dir" && -d "$dir" ]]; then
        # "--" keeps a path starting with "-" from being read as an option.
        rm -rf -- "$dir"
    fi
}

trap cleanup EXIT
|
||||
|
||||
# Entry point: print the banner, start a fresh install log, load the
# configuration and then run every installation step in order.
# Globals read: INSTALL_DIR, LOG_FILE.
main() {
    local rule="=========================================="

    echo "$rule"
    echo " Argus-Metrics All-in-One 安装脚本 v1.0"
    echo "$rule"
    echo

    # Start the log file from scratch (any previous content is truncated).
    mkdir -p "$INSTALL_DIR"
    {
        echo "$rule"
        echo " Argus-Metrics All-in-One 安装日志"
        echo " 开始时间: $(date '+%Y-%m-%d %H:%M:%S')"
        echo "$rule"
    } > "$LOG_FILE"

    load_config

    log_info "安装目录: $INSTALL_DIR"
    log_info "日志文件: $LOG_FILE"
    echo

    # Pre-flight checks and preparation
    check_root
    check_system
    find_version_file
    create_install_dirs
    install_system_deps
    parse_version_info
    verify_checksums

    # Installation proper
    install_components
    copy_config_files
    create_install_record

    # Periodic jobs
    setup_health_check_cron
    setup_dns_sync_cron
    setup_version_check_cron
    setup_restart_cron

    # No immediate health check is run here: it was disabled so that it does
    # not overlap with the cron-driven check installed above.

    show_install_info
}
|
||||
|
||||
# Run main only when this file is executed directly; sourcing it merely
# defines the functions.
if [[ "${0}" == "${BASH_SOURCE[0]}" ]]; then
    main "$@"
fi
|
||||
474
src/metric/client-plugins/all-in-one-full/scripts/package_artifact.sh
Executable file
474
src/metric/client-plugins/all-in-one-full/scripts/package_artifact.sh
Executable file
@ -0,0 +1,474 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
# ANSI color escapes used by the log helpers (expanded at print time)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # reset

# Log helpers: print "$1" on stdout behind a colored severity tag.
# Backslash escapes in the message are interpreted.
log_info() {
    printf '%b\n' "${BLUE}[INFO]${NC} $1"
}

log_success() {
    printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
}

log_warning() {
    printf '%b\n' "${YELLOW}[WARNING]${NC} $1"
}

log_error() {
    printf '%b\n' "${RED}[ERROR]${NC} $1"
}
|
||||
|
||||
# Print usage information for the packaging script on stdout.
show_help() {
    cat << EOF
AIOps All-in-One 打包脚本

用法: $0 [选项]

选项:
 --force 强制重新打包,即使版本已存在
 --help 显示此帮助信息

示例:
 $0 # 正常打包,跳过已存在的版本
 $0 --force # 强制重新打包

EOF
}
|
||||
|
||||
# Command-line handling: only the first argument is inspected.
#   --force      repackage even if the version already exists
#   --help | -h  print usage and exit
FORCE_PACKAGE=false
case "$1" in
    --force)
        FORCE_PACKAGE=true
        log_info "强制重新打包模式"
        ;;
    --help|-h)
        show_help
        exit 0
        ;;
esac
|
||||
|
||||
# Remember where we started; component packaging changes directory later.
CURRENT_DIR=$(pwd)

# Both input files are mandatory, so verify them before reading the version.
# (A fallback version would never be used: a missing VERSION file is fatal.)
log_info "检查必要文件..."
if [[ ! -f "config/VERSION" ]]; then
    log_error "VERSION 文件不存在"
    exit 1
fi

if [[ ! -f "config/checklist" ]]; then
    log_error "checklist 文件不存在"
    exit 1
fi

VERSION=$(cat config/VERSION)
# An empty or whitespace-only version would make the output directory
# collapse to "artifact/"; refuse to continue in that case.
if [[ -z "${VERSION//[[:space:]]/}" ]]; then
    log_error "VERSION 文件为空"
    exit 1
fi
ARTIFACT_DIR="artifact/$VERSION"

log_info "开始打包 AIOps All-in-One 安装包 v$VERSION"
|
||||
|
||||
# Skip packaging when this version already has a complete artifact set
# (unless --force was given).
if [[ -d "$ARTIFACT_DIR" && "$FORCE_PACKAGE" == "false" ]]; then
    log_info "检查版本 $VERSION 是否已存在..."

    if [[ -f "$ARTIFACT_DIR/version.json" ]]; then
        log_info "找到已存在的版本信息文件"

        missing_files=0
        existing_components=0

        if command -v jq &> /dev/null; then
            # With jq: every key of .artifact_list needs a "<name>-*.tar.gz".
            while IFS= read -r component; do
                existing_components=$((existing_components + 1))
                found_file=false
                for file in "$ARTIFACT_DIR/${component}-"*.tar.gz; do
                    if [[ -f "$file" ]]; then
                        found_file=true
                        break
                    fi
                done
                if [[ "$found_file" == "false" ]]; then
                    missing_files=$((missing_files + 1))
                    log_warning " 缺少文件: $component"
                fi
            done < <(jq -r '.artifact_list | keys[]' "$ARTIFACT_DIR/version.json" 2>/dev/null)
        else
            # Without jq: just count the tarballs that are present.
            for file in "$ARTIFACT_DIR"/*.tar.gz; do
                if [[ -f "$file" ]]; then
                    existing_components=$((existing_components + 1))
                fi
            done
        fi

        # Complete set found: nothing to do.
        if [[ $missing_files -eq 0 && $existing_components -gt 0 ]]; then
            log_success "版本 $VERSION 已完整打包,跳过重复打包"
            echo
            echo "现有文件:"
            ls -la "$ARTIFACT_DIR"
            echo
            echo "如需强制重新打包,请删除目录: rm -rf $ARTIFACT_DIR"
            # Point at this script itself, as the help text does.
            echo "或使用: $0 --force"
            exit 0
        else
            log_warning "版本 $VERSION 存在但不完整,将重新打包"
            log_info " 现有组件: $existing_components"
            log_info " 缺少文件: $missing_files"
        fi
    else
        log_warning "版本目录存在但缺少 version.json,将重新打包"
    fi
fi
|
||||
|
||||
# Create the output directory
mkdir -p "$ARTIFACT_DIR"
log_info "创建输出目录: $ARTIFACT_DIR"

# Scratch files that carry the parsed checklist between the steps below
TEMP_DIR=$(mktemp -d)
# Remove the scratch directory on every exit path, including `set -e` aborts.
trap 'rm -rf -- "$TEMP_DIR"' EXIT
COMPONENTS_FILE="$TEMP_DIR/components.txt"
VERSIONS_FILE="$TEMP_DIR/versions.txt"
DEPENDENCIES_FILE="$TEMP_DIR/dependencies.txt"
# NOTE(review): this file is written below but not read anywhere in the
# visible part of this script — confirm whether the order column is honored.
INSTALL_ORDER_FILE="$TEMP_DIR/install_order.txt"
CHECKSUMS_FILE="$TEMP_DIR/checksums.txt"
ARTIFACT_LIST_FILE="$TEMP_DIR/artifact_list.txt"

# Parse config/checklist
log_info "解析组件清单..."
line_num=0
component_count=0

# `|| [[ -n "$line" ]]` keeps a final line that lacks a trailing newline.
while IFS= read -r line || [[ -n "$line" ]]; do
    # Skip empty lines and comments
    [[ -z "$line" || "$line" =~ ^[[:space:]]*# ]] && continue

    # Counts parsed (non-blank, non-comment) lines only
    line_num=$((line_num + 1))

    # Line format: name  directory  version  [dependency]  [install order]
    read -r component component_path version dep_component order <<< "$line"

    if [[ -z "$component" || -z "$component_path" || -z "$version" ]]; then
        log_warning "跳过无效行 $line_num: $line"
        continue
    fi

    echo "$component" >> "$COMPONENTS_FILE"
    echo "$component:$version" >> "$VERSIONS_FILE"
    echo "$component:$component_path" >> "$TEMP_DIR/component_paths.txt"

    # A component naming itself as dependency is ignored
    if [[ -n "$dep_component" && "$dep_component" != "$component" ]]; then
        echo "$component:$dep_component" >> "$DEPENDENCIES_FILE"
    fi

    if [[ -n "$order" && "$order" =~ ^[0-9]+$ ]]; then
        echo "$order:$component" >> "$INSTALL_ORDER_FILE"
    else
        # No explicit order: fall back to the parse position
        echo "$line_num:$component" >> "$INSTALL_ORDER_FILE"
    fi

    component_count=$((component_count + 1))
    log_info " - $component v$version"
done < config/checklist

if [[ $component_count -eq 0 ]]; then
    log_error "没有找到有效的组件"
    rm -rf "$TEMP_DIR"
    exit 1
fi

log_success "找到 $component_count 个组件"
|
||||
|
||||
# Make sure every component has a configured path that exists on disk
log_info "检查组件目录..."
missing_components=()

while IFS= read -r component; do
    # Path recorded for this component while parsing the checklist
    component_path=$(grep "^$component:" "$TEMP_DIR/component_paths.txt" | cut -d':' -f2-)
    [[ -n "$component_path" ]] || {
        log_error "未找到组件 $component 的路径配置"
        log_info "请检查 component_paths.txt 文件或添加路径配置"
        exit 1
    }

    # Collect all missing directories so they can be reported together
    [[ -d "$component_path" ]] || missing_components+=("$component:$component_path")
done < "$COMPONENTS_FILE"

if (( ${#missing_components[@]} > 0 )); then
    log_error "以下组件目录不存在:"
    printf ' - %s\n' "${missing_components[@]}"
    rm -rf "$TEMP_DIR"
    exit 1
fi
|
||||
|
||||
# Package each component by running its own package.sh
log_info "开始打包组件..."

# The component list is read on fd 3, not stdin: otherwise a package.sh (or
# any tool it starts) that reads stdin would swallow the rest of the list and
# the remaining components would silently be skipped.
while IFS= read -r component <&3; do
    # Version and path recorded for this component
    version=$(grep "^$component:" "$VERSIONS_FILE" | cut -d':' -f2)
    component_path=$(grep "^$component:" "$TEMP_DIR/component_paths.txt" | cut -d':' -f2-)
    if [[ -z "$component_path" ]]; then
        log_error "未找到组件 $component 的路径配置"
        log_info "请检查 component_paths.txt 文件或添加路径配置"
        exit 1
    fi

    log_info "打包 $component v$version..."
    log_info " 组件路径: $component_path"

    cd "$component_path"

    if [[ ! -f "package.sh" ]]; then
        log_error "$component 缺少 package.sh 文件"
        cd "$CURRENT_DIR"
        rm -rf "$TEMP_DIR"
        exit 1
    fi

    if ./package.sh; then
        # Pick up the tarball the component script produced.
        # NOTE(review): this takes the first *.tar.gz found anywhere below the
        # component directory — confirm a stale or nested tarball cannot win.
        tar_file=$(find . -name "*.tar.gz" -type f | head -1)
        if [[ -n "$tar_file" ]]; then
            mv "$tar_file" "$CURRENT_DIR/$ARTIFACT_DIR/"
            tar_filename=$(basename "$tar_file")

            # Record checksum and version for version.json
            checksum=$(sha256sum "$CURRENT_DIR/$ARTIFACT_DIR/$tar_filename" | cut -d' ' -f1)
            echo "$component:sha256:$checksum" >> "$CHECKSUMS_FILE"
            echo "$component:$version" >> "$ARTIFACT_LIST_FILE"

            # Tarball names, in packaging order, become "install_order"
            echo "$tar_filename" >> "$TEMP_DIR/install_order_files.txt"

            log_success " $component 打包完成: $tar_filename"
        else
            log_error "$component 打包失败,未找到生成的 tar 包"
            cd "$CURRENT_DIR"
            rm -rf "$TEMP_DIR"
            exit 1
        fi
    else
        log_error "$component 打包失败"
        cd "$CURRENT_DIR"
        rm -rf "$TEMP_DIR"
        exit 1
    fi

    # Back to the top-level directory for the next component
    cd "$CURRENT_DIR"
done 3< "$COMPONENTS_FILE"
|
||||
|
||||
# Generate version.json from the scratch files written while packaging
log_info "生成版本信息文件..."
version_json="$ARTIFACT_DIR/version.json"

# "component":["dependency"] pairs, comma separated
deps_json=""
if [[ -f "$DEPENDENCIES_FILE" ]]; then
    while IFS=':' read -r entry_name entry_value _; do
        deps_json+="${deps_json:+,}\"$entry_name\":[\"$entry_value\"]"
    done < "$DEPENDENCIES_FILE"
fi

# Quoted tarball names in packaging order
order_array=""
if [[ -f "$TEMP_DIR/install_order_files.txt" ]]; then
    while IFS= read -r entry_name; do
        order_array+="${order_array:+,}\"$entry_name\""
    done < "$TEMP_DIR/install_order_files.txt"
fi

# "component":"version" pairs
artifact_json=""
if [[ -f "$ARTIFACT_LIST_FILE" ]]; then
    while IFS=':' read -r entry_name entry_value _; do
        artifact_json+="${artifact_json:+,}\"$entry_name\":\"$entry_value\""
    done < "$ARTIFACT_LIST_FILE"
fi

# "component":"sha256:<digest>" pairs; the value keeps everything after the
# first colon
checksums_json=""
if [[ -f "$CHECKSUMS_FILE" ]]; then
    while IFS=':' read -r entry_name entry_value; do
        checksums_json+="${checksums_json:+,}\"$entry_name\":\"$entry_value\""
    done < "$CHECKSUMS_FILE"
fi

# Assemble the final document
cat > "$version_json" << EOF
{
"version": "$VERSION",
"build_time": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
"artifact_list": {
$artifact_json
},
"checksums": {
$checksums_json
},
"dependencies": {
$deps_json
},
"install_order": [
$order_array
]
}
EOF

log_success "版本信息文件生成完成: $version_json"
|
||||
|
||||
# Copy one helper script into the artifact directory and mark it executable.
#   $1  label used in the log messages (a leading space is dropped in the
#       completion message)
#   $2  source path
#   $3  file name to use inside $ARTIFACT_DIR
# A missing source only produces a warning.
copy_artifact_script() {
    local label="$1" src="$2" dest="$ARTIFACT_DIR/$3"

    log_info "复制${label}脚本..."
    if [[ -f "$src" ]]; then
        cp "$src" "$dest"
        chmod +x "$dest"
        log_success "${label# }脚本复制完成: $dest"
    else
        log_warning "$src 文件不存在"
    fi
}

# install / uninstall are renamed on the way; the others keep their names
copy_artifact_script "安装" "scripts/install_artifact.sh" "install.sh"
copy_artifact_script "卸载" "scripts/uninstall_artifact.sh" "uninstall.sh"
copy_artifact_script "健康检查" "scripts/check_health.sh" "check_health.sh"
copy_artifact_script " DNS 同步" "scripts/sync_dns.sh" "sync_dns.sh"
copy_artifact_script "版本校验" "scripts/check_version.sh" "check_version.sh"
copy_artifact_script "自动重启" "scripts/restart_unhealthy.sh" "restart_unhealthy.sh"
|
||||
|
||||
# Copy the configuration file into the artifact directory
log_info "复制配置文件..."
if [[ -f "config/config.env" ]]; then
    cp "config/config.env" "$ARTIFACT_DIR/"
    log_success "配置文件复制完成: $ARTIFACT_DIR/config.env"
else
    # The test above is on the file, so name the file in the warning.
    log_warning "config/config.env 文件不存在,跳过配置文件复制"
fi

# The DNS configuration is not copied into the version directory; it is
# fetched from the FTP server root instead.

# Copy the bundled system dependencies
log_info "复制系统依赖包..."
if [[ -d "deps" ]]; then
    cp -r "deps" "$ARTIFACT_DIR/"
    log_success "系统依赖包复制完成: $ARTIFACT_DIR/deps"

    # List the dependency tarballs; NUL-delimited so unusual file names
    # survive unchanged.
    log_info " 依赖包列表:"
    while IFS= read -r -d '' dep_file; do
        log_info " - ${dep_file##*/}"
    done < <(find "$ARTIFACT_DIR/deps" -name "*.tar.gz" -print0)
else
    log_warning "deps 目录不存在,跳过依赖包复制"
fi
|
||||
|
||||
# Print the packaging summary
log_success "打包完成!"
printf '\n'
printf '版本: %s\n' "$VERSION"
printf '输出目录: %s\n' "$ARTIFACT_DIR"
printf '包含组件:\n'
if [[ -f "$ARTIFACT_LIST_FILE" ]]; then
    while IFS=':' read -r component version _; do
        printf ' - %s v%s\n' "$component" "$version"
    done < "$ARTIFACT_LIST_FILE"
fi
printf '\n'
printf '文件列表:\n'
ls -la "$ARTIFACT_DIR"
printf '\n'

# Remove the scratch directory
rm -rf "$TEMP_DIR"
|
||||
293
src/metric/client-plugins/all-in-one-full/scripts/publish_artifact.sh
Executable file
293
src/metric/client-plugins/all-in-one-full/scripts/publish_artifact.sh
Executable file
@ -0,0 +1,293 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
# ANSI color escapes used by the log helpers (expanded at print time)
GREEN='\033[0;32m'
BLUE='\033[0;34m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
NC='\033[0m' # reset

# Log helpers: print "$1" on stdout behind a colored severity tag.
# Backslash escapes in the message are interpreted.
log_info() {
    printf '%b\n' "${BLUE}[INFO]${NC} $1"
}

log_success() {
    printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
}

log_warning() {
    printf '%b\n' "${YELLOW}[WARNING]${NC} $1"
}

log_error() {
    printf '%b\n' "${RED}[ERROR]${NC} $1"
}
|
||||
|
||||
# Print usage information for the publish script on stdout.
show_help() {
    cat << EOF
Argus-Metric Artifact 发布脚本

用法: $0 <版本号> [选项]

参数:
 <版本号> 要发布的版本号,对应 artifact 目录中的版本

选项:
 --output-dir <路径> 指定输出目录 (默认: /private/argus/ftp/share/)
 --owner <uid:gid> 指定文件所有者 (默认: 2133:2015)
 -h, --help 显示此帮助信息

示例:
 $0 1.20.0 # 使用默认配置发布
 $0 1.20.0 --output-dir /tmp/publish # 指定输出目录
 $0 1.20.0 --owner 1000:1000 # 指定文件所有者
 $0 1.20.0 --output-dir /srv/ftp --owner root:root # 同时指定两者

EOF
}
|
||||
|
||||
# Defaults
DEFAULT_PUBLISH_DIR="/private/argus/ftp/share/"
DEFAULT_OWNER="2133:2015"

# Command-line parsing: one positional <version> plus optional flags
VERSION=""
PUBLISH_DIR="$DEFAULT_PUBLISH_DIR"
OWNER="$DEFAULT_OWNER"

while [[ $# -gt 0 ]]; do
    case $1 in
        -h|--help)
            show_help
            exit 0
            ;;
        --output-dir|--owner)
            # Without this check a trailing flag makes `shift 2` fail and the
            # script dies under `set -e` without any message.
            if [[ $# -lt 2 || -z "$2" ]]; then
                log_error "选项 $1 缺少参数值"
                show_help
                exit 1
            fi
            if [[ "$1" == "--output-dir" ]]; then
                PUBLISH_DIR="$2"
            else
                OWNER="$2"
            fi
            shift 2
            ;;
        *)
            # First bare word is the version; anything further is an error
            if [[ -z "$VERSION" ]]; then
                VERSION="$1"
                shift
            else
                log_error "未知参数: $1"
                show_help
                exit 1
            fi
            ;;
    esac
done

# The version is mandatory
if [[ -z "$VERSION" ]]; then
    log_error "请提供版本号参数"
    show_help
    exit 1
fi
|
||||
|
||||
ARTIFACT_DIR="artifact/$VERSION"

# The version must have been packaged already
[[ -d "$ARTIFACT_DIR" ]] || {
    log_error "版本目录不存在: $ARTIFACT_DIR"
    exit 1
}

log_info "开始发布版本: $VERSION"
log_info "输出目录: $PUBLISH_DIR"
log_info "文件所有者: $OWNER"

# Make sure the publish directory exists
log_info "确保发布目录存在: $PUBLISH_DIR"
mkdir -p "$PUBLISH_DIR"

# Split and validate the owner; chown is only needed when it differs from
# the user running the script
IFS=':' read -r OWNER_UID OWNER_GID <<< "$OWNER"
if [[ -z "$OWNER_UID" || -z "$OWNER_GID" ]]; then
    log_error "--owner 格式不正确,应为 uid:gid"
    exit 1
fi

CURRENT_UID=$(id -u)
CURRENT_GID=$(id -g)
if [[ "$OWNER_UID" == "$CURRENT_UID" && "$OWNER_GID" == "$CURRENT_GID" ]]; then
    NEED_CHOWN=false
else
    # Only root may hand files to a different owner
    if [[ "$CURRENT_UID" -ne 0 ]]; then
        log_error "当前用户 (${CURRENT_UID}:${CURRENT_GID}) 无法设置所有者为 ${OWNER_UID}:${OWNER_GID}"
        log_error "请以目标用户运行脚本或预先调整目录权限"
        exit 1
    fi
    NEED_CHOWN=true
fi
|
||||
|
||||
# Staging directory for the release bundle: created with an unpredictable
# name and removed on every exit path.
TEMP_PACKAGE_DIR=$(mktemp -d "${TMPDIR:-/tmp}/argus-metric-package.XXXXXX")
trap 'rm -rf -- "$TEMP_PACKAGE_DIR"' EXIT

# Copy every tar.gz of this version into the staging directory.
# NUL-delimited iteration keeps paths containing spaces intact.
# NOTE(review): the search is recursive, so tarballs below
# "$ARTIFACT_DIR/deps" are also copied flat into the staging root — confirm
# that this is intended.
log_info "准备 artifact 文件..."
tar_count=0
while IFS= read -r -d '' file; do
    filename=$(basename "$file")
    log_info " 准备: $filename"
    cp "$file" "$TEMP_PACKAGE_DIR/"
    tar_count=$((tar_count + 1))
done < <(find "$ARTIFACT_DIR" -name "*.tar.gz" -type f -print0)

if [[ $tar_count -eq 0 ]]; then
    log_error "在 $ARTIFACT_DIR 中未找到 tar.gz 文件"
    exit 1
fi
|
||||
|
||||
# Stage one helper script: prefer the copy inside the artifact directory,
# fall back to scripts/ in the current directory, warn when neither exists.
#   $1  label used in the log messages
#   $2  script file name
stage_helper_script() {
    local label="$1" script_name="$2"

    if [[ -f "$ARTIFACT_DIR/$script_name" ]]; then
        log_info "复制${label}脚本..."
        cp "$ARTIFACT_DIR/$script_name" "$TEMP_PACKAGE_DIR/"
    elif [[ -f "scripts/$script_name" ]]; then
        log_info "复制${label}脚本 (从当前目录)..."
        cp "scripts/$script_name" "$TEMP_PACKAGE_DIR/"
    else
        log_warning "未找到 $script_name 文件"
    fi
}

# Version metadata
if [[ -f "$ARTIFACT_DIR/version.json" ]]; then
    log_info "复制版本信息文件..."
    cp "$ARTIFACT_DIR/version.json" "$TEMP_PACKAGE_DIR/"
fi

# Runtime helper scripts
stage_helper_script "健康检查" "check_health.sh"
stage_helper_script " DNS 同步" "sync_dns.sh"
stage_helper_script "版本校验" "check_version.sh"
stage_helper_script "重启失败" "restart_unhealthy.sh"

# Installer and uninstaller are renamed inside the bundle
if [[ -f "scripts/install_artifact.sh" ]]; then
    log_info "复制安装脚本..."
    cp "scripts/install_artifact.sh" "$TEMP_PACKAGE_DIR/install.sh"
fi

if [[ -f "scripts/uninstall_artifact.sh" ]]; then
    log_info "复制卸载脚本..."
    cp "scripts/uninstall_artifact.sh" "$TEMP_PACKAGE_DIR/uninstall.sh"
fi

# Configuration file
if [[ -f "$ARTIFACT_DIR/config.env" ]]; then
    log_info "复制配置文件..."
    cp "$ARTIFACT_DIR/config.env" "$TEMP_PACKAGE_DIR/"
    log_success "配置文件复制完成"
else
    log_warning "未找到 config.env 文件"
fi

# The DNS configuration is copied to the publish root further down and is
# not part of the tar.gz.

# Bundled system dependencies
if [[ -d "$ARTIFACT_DIR/deps" ]]; then
    log_info "复制系统依赖包..."
    cp -r "$ARTIFACT_DIR/deps" "$TEMP_PACKAGE_DIR/"
    log_success "系统依赖包复制完成"
fi
|
||||
|
||||
# Build the release tarball; dots in the version become underscores
TAR_NAME="argus-metric_${VERSION//./_}.tar.gz"
log_info "创建发布包: $TAR_NAME"
# -C makes tar enter the staging directory itself, so the shell's working
# directory never changes and a relative $PUBLISH_DIR (e.g. --output-dir out)
# still resolves against the directory the script was started from.
tar -czf "$PUBLISH_DIR/$TAR_NAME" -C "$TEMP_PACKAGE_DIR" .

# Hand the tarball to the requested owner when that differs from the caller
log_info "设置文件所有者为: $OWNER"
if [[ "$NEED_CHOWN" == true ]]; then
    chown "$OWNER" "$PUBLISH_DIR/$TAR_NAME"
fi

# Remove the staging directory
rm -rf "$TEMP_PACKAGE_DIR"
|
||||
|
||||
# chown a published file, but only when the ownership check asked for it.
#   $1  path of the file
chown_if_needed() {
    if [[ "$NEED_CHOWN" == true ]]; then
        chown "$OWNER" "$1"
    fi
}

# Record this version as the latest one
log_info "更新 LATEST_VERSION 文件..."
echo "$VERSION" > "$PUBLISH_DIR/LATEST_VERSION"
chown_if_needed "$PUBLISH_DIR/LATEST_VERSION"

# DNS configuration goes to the publish root, straight from config/
if [[ -f "config/dns.conf" ]]; then
    log_info "复制 DNS 配置文件到发布目录根目录..."
    cp "config/dns.conf" "$PUBLISH_DIR/"
    chown_if_needed "$PUBLISH_DIR/dns.conf"
    log_success "DNS 配置文件复制完成: $PUBLISH_DIR/dns.conf"
else
    log_warning "未找到 config/dns.conf 文件,跳过 DNS 配置文件复制"
fi

# setup.sh is published next to the tarball
if [[ -f "scripts/setup.sh" ]]; then
    log_info "复制 setup.sh 到发布目录..."
    cp "scripts/setup.sh" "$PUBLISH_DIR/"
    chown_if_needed "$PUBLISH_DIR/setup.sh"
fi
|
||||
|
||||
# Print the publish summary
log_success "版本 $VERSION 发布完成!"
echo
echo "发布目录: $PUBLISH_DIR"
echo "发布包: $PUBLISH_DIR/$TAR_NAME"
echo "包大小: $(du -h "$PUBLISH_DIR/$TAR_NAME" | cut -f1)"
echo "最新版本: $(cat "$PUBLISH_DIR/LATEST_VERSION")"
echo
echo "发布目录中的文件:"
# Prefix each listing line verbatim; a `read` loop without -r would eat
# backslashes and trim whitespace in file names.
ls -la "$PUBLISH_DIR" | sed 's/^/ /'
echo
# NOTE(review): the path, server address and credentials below are fixed
# sample values and do not follow --output-dir — confirm they are meant as
# examples only.
cat << EOF
使用方法:
 1. 确保 /srv/ftp/share 目录可通过 FTP 访问
 2. 用户首先下载安装脚本:
 curl -u ftpuser:admin1234 ftp://10.211.55.4/setup.sh -o setup.sh
 3. 然后执行安装 (自动获取最新版本):
 sudo sh setup.sh
 4. 或者指定版本安装:
 sudo sh setup.sh --version $VERSION
 5. 或者指定不同的FTP服务器:
 sudo sh setup.sh --server 192.168.1.100 --user myuser --password mypass
EOF
|
||||
337
src/metric/client-plugins/all-in-one-full/scripts/restart_unhealthy.sh
Executable file
337
src/metric/client-plugins/all-in-one-full/scripts/restart_unhealthy.sh
Executable file
@ -0,0 +1,337 @@
|
||||
#!/bin/bash
|
||||
|
||||
# 此脚本会检查各组件的健康状态,并重启不健康的组件
|
||||
|
||||
# Single-instance guard: skip this run when a previous invocation recorded in
# the PID file is still alive.
PIDFILE="/var/run/restart_unhealthy.pid"
if [[ -f "$PIDFILE" ]]; then
  existing_pid=$(cat "$PIDFILE" 2>/dev/null)
  # Only accept a positive integer: values such as "0" or "-1" make `kill -0`
  # probe a whole process group / every process and would always look "alive".
  if [[ "$existing_pid" =~ ^[1-9][0-9]*$ ]] && kill -0 "$existing_pid" 2>/dev/null; then
    echo "自动重启脚本已在运行中,跳过本次执行" >&2
    exit 0
  fi
fi
echo $$ > "$PIDFILE"
# Single quotes defer expansion until the trap fires and keep the path quoted.
trap 'rm -f -- "$PIDFILE"' EXIT
|
||||
|
||||
# 获取脚本所在目录
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
INSTALL_RECORD_FILE="$SCRIPT_DIR/.install_record"
|
||||
|
||||
# 颜色定义
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
BLUE='\033[0;34m'
|
||||
NC='\033[0m'
|
||||
|
||||
# Logging helpers: print "<colored level tag> <timestamp> - <message>" to stdout.
# Backslash escapes in the color variables and in the message are interpreted.
log_info() {
  printf '%b\n' "${BLUE}[INFO]${NC} $(date '+%Y-%m-%d %H:%M:%S') - $1"
}

log_success() {
  printf '%b\n' "${GREEN}[SUCCESS]${NC} $(date '+%Y-%m-%d %H:%M:%S') - $1"
}

log_warning() {
  printf '%b\n' "${YELLOW}[WARNING]${NC} $(date '+%Y-%m-%d %H:%M:%S') - $1"
}

log_error() {
  printf '%b\n' "${RED}[ERROR]${NC} $(date '+%Y-%m-%d %H:%M:%S') - $1"
}
|
||||
|
||||
# Source $SCRIPT_DIR/config.env (when present) with auto-export enabled, so
# every variable it defines is exported to child processes.
load_config() {
  local config_file="$SCRIPT_DIR/config.env"

  if [[ ! -f "$config_file" ]]; then
    log_warning "配置文件不存在: $config_file,使用默认配置"
    return
  fi

  log_info "加载配置文件: $config_file"
  set -a
  source "$config_file"
  set +a
  log_success "配置文件加载完成"
}
|
||||
|
||||
# Run a component's health-check script.
# Arguments: $1 - component name (used in log messages only)
#            $2 - path of the health-check script
# Returns:   0 when the script exits successfully; 1 when it is missing or fails.
check_component_health() {
  local component_name="$1"
  local check_script_path="$2"

  if [[ ! -f "$check_script_path" ]]; then
    log_error "$component_name: 健康检查脚本不存在: $check_script_path"
    return 1
  fi

  # Best effort: make the script executable if it is not already.
  [[ -x "$check_script_path" ]] || chmod +x "$check_script_path" 2>/dev/null || true

  # The script's output is discarded; only its exit status matters.
  "$check_script_path" > /dev/null 2>&1 && return 0
  return 1
}
|
||||
|
||||
# Restart a component by running its uninstall.sh (if present) and then its
# install.sh from inside the component's install directory.
# Globals:   SCRIPT_DIR (read) - passed to install.sh as its first argument
# Arguments: $1 - component name (for logging)
#            $2 - component install directory
# Returns:   0 when install.sh succeeded; 1 when install.sh is missing or
#            exited non-zero (previously a failed install was reported as done).
restart_component() {
  local component_name="$1"
  local install_dir="$2"
  local uninstall_script="$install_dir/uninstall.sh"
  local install_script="$install_dir/install.sh"
  local install_status=0

  log_warning "正在重启组件: $component_name"

  # Uninstall first; failures here are deliberately tolerated (best effort).
  if [[ -f "$uninstall_script" ]]; then
    log_info "$component_name: 执行卸载脚本..."
    chmod +x "$uninstall_script" 2>/dev/null || true
    # `yes` auto-answers every confirmation prompt.
    yes 2>/dev/null | (cd "$install_dir" && "$uninstall_script") || true
    log_info "$component_name: 卸载完成"
  fi

  if [[ ! -f "$install_script" ]]; then
    log_error "$component_name: 安装脚本不存在: $install_script"
    return 1
  fi

  chmod +x "$install_script" 2>/dev/null || true
  log_info "$component_name: 执行安装脚本..."

  # The pipeline status is that of the subshell (no pipefail in this script),
  # i.e. the install script's own exit code.
  yes 2>/dev/null | (cd "$install_dir" && "$install_script" "$SCRIPT_DIR") || install_status=$?

  if (( install_status != 0 )); then
    log_error "$component_name: 安装脚本执行失败 (退出码: $install_status)"
    return 1
  fi

  log_info "$component_name: 安装脚本执行完成"
  return 0
}
|
||||
|
||||
# Print the PID of the first process matching a known component (empty line
# when nothing matches or the component is unknown).
# Lookup order per component: each pgrep -f pattern in turn, then a
# `ps aux | grep` fallback.
find_component_pid() {
  local component_name="$1"
  local component_pid=""
  local -a pgrep_patterns=()
  local ps_pattern=""
  local pattern

  case "$component_name" in
    "node-exporter")
      pgrep_patterns=("node_exporter" "node-exporter")
      ps_pattern="node_exporter"
      ;;
    "dcgm-exporter")
      pgrep_patterns=("dcgm-exporter" "dcgm_exporter")
      ps_pattern="dcgm-exporter"
      ;;
    "fluent-bit")
      pgrep_patterns=("fluent-bit" "fluent_bit")
      ps_pattern="fluent-bit"
      ;;
    "argus-agent")
      pgrep_patterns=("argus-agent")
      ps_pattern="argus-agent"
      ;;
  esac

  for pattern in "${pgrep_patterns[@]}"; do
    component_pid=$(pgrep -f "$pattern" | head -1)
    [[ -n "$component_pid" ]] && break
  done

  if [[ -z "$component_pid" && -n "$ps_pattern" ]]; then
    component_pid=$(ps aux | grep -v grep | grep "$ps_pattern" | awk '{print $2}' | head -1)
  fi

  echo "$component_pid"
}
|
||||
|
||||
# Rewrite the "pid" field of one component inside the install record file,
# leaving every other line (and the file's formatting) untouched.
# Globals:   INSTALL_RECORD_FILE (read, rewritten in place via a temp file)
# Arguments: $1 - component name (matched literally)
#            $2 - new PID
# Returns:   0 when the PID line was replaced; 1 when the file is missing,
#            the current PID cannot be read (jq is required for that), or no
#            matching line was found.
update_install_record_pid() {
  local component_name="$1"
  local new_pid="$2"

  if [[ ! -f "$INSTALL_RECORD_FILE" ]]; then
    log_error "安装记录文件不存在: $INSTALL_RECORD_FILE"
    return 1
  fi

  # Read the currently recorded PID.
  local current_pid=""
  if command -v jq &> /dev/null; then
    current_pid=$(jq -r --arg comp "$component_name" '.components[$comp].pid // ""' "$INSTALL_RECORD_FILE" 2>/dev/null)
  fi

  if [[ -z "$current_pid" ]]; then
    log_warning "$component_name: 无法读取当前 PID,跳过更新"
    return 1
  fi

  local temp_file="${INSTALL_RECORD_FILE}.tmp"
  local in_component=0
  local updated=0
  local line

  # `|| [[ -n "$line" ]]` keeps a final line that has no trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    if [[ "$line" =~ \""$component_name"\":[[:space:]]*\{ ]]; then
      # Start of the target component's object (name is quoted => literal).
      in_component=1
    elif (( in_component == 1 )) &&
      [[ "$line" =~ ^(.*\"pid\":[[:space:]]*\"?)"$current_pid"(\"?[[:space:]]*,?[[:space:]]*)$ ]]; then
      # Replace through the captured prefix/suffix so any spacing after the
      # colon (and quoted or bare numbers) is preserved exactly.
      line="${BASH_REMATCH[1]}${new_pid}${BASH_REMATCH[2]}"
      updated=1
      in_component=0
    elif [[ "$line" =~ ^[[:space:]]*\},?[[:space:]]*$ ]]; then
      # Closing brace (optionally followed by a comma) ends the component.
      in_component=0
    fi
    printf '%s\n' "$line"
  done < "$INSTALL_RECORD_FILE" > "$temp_file"

  if [[ $updated -eq 1 ]]; then
    mv "$temp_file" "$INSTALL_RECORD_FILE"
    log_success "$component_name: PID 已更新为 $new_pid(原值: $current_pid)"
    return 0
  else
    log_error "$component_name: PID 替换失败"
    rm -f "$temp_file"
    return 1
  fi
}
|
||||
|
||||
# Print one "<component>:<install_dir>" line per component found in the
# install record file.
# Arguments: $1 - path of the install record (JSON) file
# Outputs:   component lines on stdout; all diagnostics go to stderr so that
#            callers capturing stdout never parse a log line as a component.
# Returns:   0 on success; 1 when the file is missing or cannot be parsed.
read_install_record() {
  local install_record_file="$1"

  if [[ ! -f "$install_record_file" ]]; then
    log_error "安装记录文件不存在: $install_record_file" >&2
    return 1
  fi

  # Preferred path: let jq parse the JSON.
  if command -v jq &> /dev/null; then
    local components_json
    if components_json=$(jq -r '.components | to_entries[] | "\(.key):\(.value.install_dir)"' "$install_record_file" 2>/dev/null); then
      echo "$components_json"
      return 0
    fi
    log_error "无法解析安装记录文件 JSON 格式: $install_record_file" >&2
    return 1
  fi

  # Fallback without jq: scan for install_dir lines and derive the component
  # name from the last path element.
  log_warning "jq 命令不可用,尝试简单文本解析" >&2

  local components=()
  local line install_dir component_name
  while IFS= read -r line || [[ -n "$line" ]]; do
    if [[ "$line" =~ \"install_dir\":[[:space:]]*\"([^\"]+)\" ]]; then
      install_dir="${BASH_REMATCH[1]}"
      # basename equivalent: drop one trailing slash, keep the last element.
      component_name="${install_dir%/}"
      component_name="${component_name##*/}"
      components+=("$component_name:$install_dir")
    fi
  done < "$install_record_file"

  if [[ ${#components[@]} -gt 0 ]]; then
    printf '%s\n' "${components[@]}"
    return 0
  fi

  log_error "无法从安装记录文件中提取组件信息" >&2
  return 1
}
|
||||
|
||||
# Entry point: health-check every component listed in the install record and
# reinstall the ones whose check fails. Requires root; always ends via `exit`.
main() {
  log_info "=========================================="
  log_info " 组件自动重启检查"
  log_info "=========================================="

  if (( EUID != 0 )); then
    log_error "此脚本需要 root 权限运行"
    exit 1
  fi

  load_config

  log_info "从安装记录文件读取组件信息: $INSTALL_RECORD_FILE"
  local components_info
  if ! components_info=$(read_install_record "$INSTALL_RECORD_FILE"); then
    log_error "无法读取安装记录文件,自动重启检查终止"
    exit 1
  fi

  local restart_count=0
  local check_count=0
  local component_info component_name install_dir check_script_path new_pid

  # One "<name>:<install_dir>" entry per line.
  while IFS= read -r component_info; do
    [[ -n "$component_info" ]] || continue

    IFS=':' read -r component_name install_dir <<< "$component_info"
    check_count=$((check_count + 1))
    check_script_path="$install_dir/check_health.sh"

    log_info "检查组件: $component_name"

    if check_component_health "$component_name" "$check_script_path"; then
      log_success "$component_name: 运行正常"
      continue
    fi

    log_warning "$component_name: 健康检查失败,尝试重启"
    restart_count=$((restart_count + 1))

    restart_component "$component_name" "$install_dir"

    # Give the freshly installed service time to come up.
    log_info "$component_name: 等待进程启动..."
    sleep 10

    new_pid=$(find_component_pid "$component_name")
    if [[ -z "$new_pid" ]]; then
      log_warning "$component_name: 未找到新进程 PID"
    else
      log_info "$component_name: 找到新进程 PID: $new_pid"
      update_install_record_pid "$component_name" "$new_pid"
    fi

    # Re-check after the restart.
    if check_component_health "$component_name" "$check_script_path"; then
      log_success "$component_name: 重启成功"
    else
      log_warning "$component_name: 重启后仍不健康,可能需要手动检查"
    fi
  done <<< "$components_info"

  log_info "=========================================="
  log_info "检查完成: 共检查 $check_count 个组件,尝试重启 $restart_count 个"
  log_info "=========================================="

  exit 0
}
|
||||
|
||||
# 脚本入口
|
||||
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
|
||||
main "$@"
|
||||
fi
|
||||
|
||||
931
src/metric/client-plugins/all-in-one-full/scripts/setup.sh
Executable file
931
src/metric/client-plugins/all-in-one-full/scripts/setup.sh
Executable file
@ -0,0 +1,931 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
# Configuration hook. setup.sh reads no config file: FTP settings come from
# command-line options or environment variables, so this only logs that fact.
load_config() { log_info "setup.sh 脚本使用命令行参数或环境变量获取FTP配置"; }
|
||||
|
||||
# 颜色定义
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
BLUE='\033[0;34m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
# Logging helpers: print "<colored level tag> <message>" to stdout.
# Backslash escapes in the color variables and in the message are interpreted.
log_info() {
  printf '%b\n' "${BLUE}[INFO]${NC} $1"
}

log_success() {
  printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
}

log_warning() {
  printf '%b\n' "${YELLOW}[WARNING]${NC} $1"
}

log_error() {
  printf '%b\n' "${RED}[ERROR]${NC} $1"
}
|
||||
|
||||
# FTP connection settings: taken from the environment, may be overridden by
# command-line options later. Only the port has a default.
FTP_SERVER="${FTP_SERVER:-}"
FTP_USER="${FTP_USER:-}"
FTP_PASS="${FTP_PASS:-}"
FTP_PORT="${FTP_PORT:-21}"
# Both URLs are filled in by check_ftp_params.
BASE_URL=""
LATEST_VERSION_URL=""
TEMP_DIR="/tmp/argus-metric-install-$$"

# Installation layout; INSTALL_DIR can be overridden through the environment.
DEFAULT_INSTALL_DIR="/opt/argus-metric"
INSTALL_DIR="${INSTALL_DIR:-$DEFAULT_INSTALL_DIR}"
VERSIONS_DIR="$INSTALL_DIR/versions"             # one sub-directory per version
BACKUPS_DIR="$INSTALL_DIR/backups"               # backup copies
CURRENT_LINK="$INSTALL_DIR/current"              # symlink to the active version
LATEST_VERSION_FILE="$INSTALL_DIR/LATEST_VERSION" # records the installed version
|
||||
|
||||
# Validate the required FTP settings and derive the download URLs.
# Globals:   FTP_SERVER, FTP_USER, FTP_PASS, FTP_PORT (read)
#            BASE_URL, LATEST_VERSION_URL (written)
# Exits with status 1, after printing usage hints, when any required setting
# is empty.
check_ftp_params() {
  local missing_params=()
  local required

  for required in FTP_SERVER FTP_USER FTP_PASS; do
    [[ -n "${!required}" ]] || missing_params+=("$required")
  done

  if (( ${#missing_params[@]} > 0 )); then
    log_error "缺少必需的FTP参数: ${missing_params[*]}"
    log_error "请通过以下方式之一设置FTP参数:"
    log_error " 1. 命令行参数: --server <地址> --user <用户名> --password <密码>"
    log_error " 2. 环境变量: FTP_SERVER=<地址> FTP_USER=<用户名> FTP_PASS=<密码>"
    log_error ""
    log_error "示例:"
    log_error " sudo sh setup.sh --server 10.211.55.4 --user ftpuser --password admin1234"
    log_error " FTP_SERVER=10.211.55.4 FTP_USER=ftpuser FTP_PASS=admin1234 sudo sh setup.sh"
    exit 1
  fi

  BASE_URL="ftp://${FTP_SERVER}:${FTP_PORT}"
  LATEST_VERSION_URL="$BASE_URL/LATEST_VERSION"

  log_info "FTP配置:"
  log_info " 服务器: $FTP_SERVER:$FTP_PORT"
  log_info " 用户: $FTP_USER"
}
|
||||
|
||||
# Fetch the latest published version string from the FTP server.
# Globals:   FTP_USER, FTP_PASS, FTP_SERVER, FTP_PORT, LATEST_VERSION_URL (read)
#            LATEST_VERSION (written)
# Outputs:   the version (all whitespace removed) on stdout; every log line
#            goes to stderr so callers can capture stdout safely.
# Exits with status 1 when the server is unreachable, the download fails or
# the file is empty.
get_latest_version() {
  log_info "获取最新版本信息..." >&2
  log_info "尝试从URL获取: $LATEST_VERSION_URL" >&2

  # Probe the connection first so connectivity problems get a dedicated hint.
  log_info "测试FTP连接..." >&2
  if ! curl -u "${FTP_USER}:${FTP_PASS}" -sfI "$LATEST_VERSION_URL" >/dev/null 2>&1; then
    log_error "无法连接到FTP服务器或文件不存在" >&2
    log_error "URL: $LATEST_VERSION_URL" >&2
    log_error "请检查:" >&2
    log_error " 1. FTP服务器是否运行: $FTP_SERVER:$FTP_PORT" >&2
    log_error " 2. 用户名密码是否正确: $FTP_USER" >&2
    log_error " 3. LATEST_VERSION文件是否存在" >&2
    # The password is masked: log output must never contain the credential.
    log_error "手动测试命令: curl -u ${FTP_USER}:<密码> $LATEST_VERSION_URL" >&2
    exit 1
  fi

  # Capture curl's output on its own so that curl's exit status (not that of
  # a trailing filter) decides whether the download failed.
  local raw_version
  if ! raw_version=$(curl -u "${FTP_USER}:${FTP_PASS}" -sfL "$LATEST_VERSION_URL" 2>/dev/null); then
    log_error "下载LATEST_VERSION文件失败" >&2
    exit 1
  fi
  LATEST_VERSION="${raw_version//[[:space:]]/}"

  log_info "原始获取内容: '$LATEST_VERSION'" >&2

  if [[ -z "$LATEST_VERSION" ]]; then
    log_error "获取到的版本信息为空" >&2
    log_error "可能的原因:" >&2
    log_error " 1. LATEST_VERSION文件为空" >&2
    log_error " 2. 文件内容格式不正确" >&2
    log_error " 3. 网络传输问题" >&2
    log_error "请检查FTP服务器上的 /srv/ftp/share/LATEST_VERSION 文件" >&2
    exit 1
  fi

  log_info "检测到最新版本: $LATEST_VERSION" >&2
  echo "$LATEST_VERSION"
}
|
||||
|
||||
# Command-line parsing.
# ARGUS_VERSION is used (rather than VERSION) to avoid clashing with the
# VERSION variable defined by /etc/os-release.
ARGUS_VERSION=""
ACTION="install"
FORCE_INSTALL=false

while [[ $# -gt 0 ]]; do
  case $1 in
    --version | --server | --user | --password | --port | --install-dir)
      # Without this check a trailing option with no value makes `shift 2`
      # fail and `set -e` terminates the script without any message.
      if [[ $# -lt 2 ]]; then
        log_error "参数 $1 需要一个值"
        echo "使用 --help 查看帮助信息"
        exit 1
      fi
      case $1 in
        --version) ARGUS_VERSION="$2" ;;
        --server) FTP_SERVER="$2" ;;
        --user) FTP_USER="$2" ;;
        --password) FTP_PASS="$2" ;;
        --port) FTP_PORT="$2" ;;
        --install-dir) INSTALL_DIR="$2" ;;
      esac
      shift 2
      ;;
    --uninstall)
      ACTION="uninstall"
      shift
      ;;
    # Rollback and backup listing (--rollback / --backup-list) are no longer
    # offered: the install flow was simplified.
    --status)
      ACTION="status"
      shift
      ;;
    --force)
      FORCE_INSTALL=true
      shift
      ;;
    --help)
      echo "Argus Metric FTP在线安装脚本"
      echo
      echo "用法: curl -u <用户名>:<密码> ftp://<服务器>/setup.sh -o setup.sh && sh setup.sh [选项]"
      echo
      echo "必需参数 (必须通过命令行参数或环境变量设置):"
      echo " --server SERVER FTP服务器地址 (必须)"
      echo " --user USER FTP用户名 (必须)"
      echo " --password PASS FTP密码 (必须)"
      echo
      echo "可选参数:"
      echo " --version VERSION 指定版本 (默认: 自动获取最新版本)"
      echo " --port PORT FTP端口 (默认: 21)"
      echo " --install-dir DIR 安装目录 (默认: /opt/argus-metric)"
      echo " --force 强制重新安装 (即使相同版本)"
      echo " --uninstall 卸载 (自动确认)"
      echo " --status 显示当前安装状态"
      echo " --help 显示帮助"
      echo
      echo "环境变量:"
      echo " FTP_SERVER FTP服务器地址 (必须)"
      echo " FTP_USER FTP用户名 (必须)"
      echo " FTP_PASS FTP密码 (必须)"
      echo " FTP_PORT FTP端口 (默认: 21)"
      echo
      echo "示例:"
      echo " # 方式1: 使用命令行参数"
      echo " curl -u ftpuser:admin1234 ftp://10.211.55.4/setup.sh -o setup.sh"
      echo " sudo sh setup.sh --server 10.211.55.4 --user ftpuser --password admin1234"
      echo " "
      echo " # 方式2: 使用环境变量"
      echo " FTP_SERVER=10.211.55.4 FTP_USER=ftpuser FTP_PASS=admin1234 sudo sh setup.sh"
      echo " "
      echo " # 指定版本安装"
      echo " sudo sh setup.sh --server 10.211.55.4 --user ftpuser --password admin1234 --version 1.30.0"
      echo " "
      echo " # 强制重新安装"
      echo " sudo sh setup.sh --server 10.211.55.4 --user ftpuser --password admin1234 --force"
      echo " "
      echo " # 卸载"
      echo " sudo sh setup.sh --server 10.211.55.4 --user ftpuser --password admin1234 --uninstall"
      exit 0
      ;;
    *)
      log_error "未知参数: $1"
      echo "使用 --help 查看帮助信息"
      exit 1
      ;;
  esac
done
|
||||
|
||||
# Remove the temporary working directory, if it was created. Registered as
# the EXIT handler so it runs on every exit path.
cleanup() {
  [[ ! -d "$TEMP_DIR" ]] || rm -rf "$TEMP_DIR"
}

trap cleanup EXIT
|
||||
|
||||
# Create the versions/ and backups/ directories under the install root.
# Globals: VERSIONS_DIR, BACKUPS_DIR, INSTALL_DIR (read)
create_install_directories() {
  log_info "创建安装目录结构..."

  local dir
  for dir in "$VERSIONS_DIR" "$BACKUPS_DIR"; do
    mkdir -p "$dir"
  done

  log_success "安装目录结构创建完成: $INSTALL_DIR"
}
|
||||
|
||||
# Print the currently installed version (empty line when unknown).
# The LATEST_VERSION file wins when it has content; otherwise the version is
# the last path element of the `current` symlink target.
# Globals: LATEST_VERSION_FILE, CURRENT_LINK (read)
get_current_version() {
  local recorded_version=""

  if [[ -f "$LATEST_VERSION_FILE" ]]; then
    recorded_version=$(cat "$LATEST_VERSION_FILE" 2>/dev/null | tr -d '[:space:]')
  fi

  if [[ -n "$recorded_version" ]]; then
    # Version strings are stored without a leading "v".
    echo "$recorded_version"
    return 0
  fi

  if [[ ! -L "$CURRENT_LINK" ]]; then
    echo ""
  else
    basename "$(readlink "$CURRENT_LINK")"
  fi
}
|
||||
|
||||
# Succeed when an installation exists: `current` must be a symlink that
# resolves to a directory and a non-empty version must be determinable.
# Globals: CURRENT_LINK (read)
# Returns: 0 when installed (the version is logged), 1 otherwise.
check_installed() {
  [[ -L "$CURRENT_LINK" && -d "$CURRENT_LINK" ]] || return 1

  local installed_version
  installed_version=$(get_current_version) || true
  [[ -n "$installed_version" ]] || return 1

  log_info "检测到已安装版本: v$installed_version"
  return 0
}
|
||||
|
||||
# Record the given version in the LATEST_VERSION file.
# Globals:   LATEST_VERSION_FILE (written)
# Arguments: $1 - version string
# Returns:   1 when the file cannot be written.
update_latest_version_file() {
  local version="$1"
  log_info "更新LATEST_VERSION文件: $version"

  if ! echo "$version" > "$LATEST_VERSION_FILE"; then
    log_error "更新LATEST_VERSION文件失败"
    return 1
  fi
  log_success "LATEST_VERSION文件已更新"
}
|
||||
|
||||
# Ensure $INSTALL_DIR/dns.conf exists: an empty placeholder (mode 644) is
# created when the file is missing; an existing file is left untouched.
# Globals: INSTALL_DIR (read)
init_dns_config_to_system() {
  log_info "初始化 DNS 配置文件到系统目录..."

  local system_dns_conf="$INSTALL_DIR/dns.conf"

  if [[ -f "$system_dns_conf" ]]; then
    log_info "DNS 配置文件已存在: $system_dns_conf"
    return
  fi

  touch "$system_dns_conf"
  chmod 644 "$system_dns_conf"
  log_success "DNS 配置文件占位文件已创建: $system_dns_conf"
  log_info "DNS 同步脚本将从 FTP 服务器下载实际的 DNS 配置"
}
|
||||
|
||||
# Copy the active version (following the `current` symlink) into
# $BACKUPS_DIR/<version>, replacing an existing backup of the same version.
# Globals: BACKUPS_DIR, CURRENT_LINK (read)
# Returns 0 without doing anything when no version is installed; exits with
# status 1 when the copy fails.
backup_current_version() {
  local current_version
  current_version=$(get_current_version) || true

  if [[ -z "$current_version" ]]; then
    log_info "没有当前版本需要备份"
    return 0
  fi

  mkdir -p "$BACKUPS_DIR"

  local backup_name="$current_version"
  local backup_path="$BACKUPS_DIR/$backup_name"

  log_info "备份当前版本 $current_version 到: $backup_path"

  # Overwrite semantics: drop a stale backup first.
  if [[ -d "$backup_path" ]]; then
    log_info "备份版本已存在,覆盖: $backup_path"
    rm -rf "$backup_path"
  fi

  # -L dereferences the symlink so real content is copied.
  if ! cp -rL "$CURRENT_LINK" "$backup_path"; then
    log_error "版本备份失败"
    exit 1
  fi
  log_success "版本备份完成: $backup_name"
}
|
||||
|
||||
# Switch the installation back to a named backup.
# Globals:   BACKUPS_DIR, VERSIONS_DIR, CURRENT_LINK (read)
# Arguments: $1 - backup name (a directory under $BACKUPS_DIR)
# Returns:   0 on success; 1 when the backup is missing or the symlink cannot
#            be updated.
rollback_to_backup() {
  local backup_name="$1"

  mkdir -p "$BACKUPS_DIR"

  local backup_path="$BACKUPS_DIR/$backup_name"
  local version_dir="$VERSIONS_DIR/$backup_name"

  if [[ ! -d "$backup_path" ]]; then
    log_error "备份不存在: $backup_path"
    return 1
  fi

  log_info "回滚到备份版本: $backup_name"

  stop_services

  # Recreate the version directory from the backup when it no longer exists.
  if [[ ! -d "$version_dir" ]]; then
    log_info "版本目录不存在,从备份恢复版本目录: $version_dir"
    mkdir -p "$VERSIONS_DIR"
    cp -r "$backup_path" "$version_dir"
  fi

  # Point `current` at the restored version.
  if ! ln -sfn "$version_dir" "$CURRENT_LINK"; then
    log_error "版本回滚失败"
    return 1
  fi

  log_success "版本回滚完成: $backup_name"
  update_latest_version_file "$backup_name"
  return 0
}
|
||||
|
||||
# Stop the running services, preferring the installed uninstall.sh and
# falling back to manual_stop_services when that script is missing or fails.
# Globals: CURRENT_LINK (read)
# Returns: 0 when nothing is running; otherwise the status of the last step.
stop_services() {
  log_info "停止当前服务..."

  if ! check_services_running; then
    log_info "服务未运行,无需停止"
    return 0
  fi

  if [[ ! -f "$CURRENT_LINK/uninstall.sh" ]]; then
    log_warning "未找到卸载脚本,尝试手动停止服务"
    manual_stop_services
    return
  fi

  chmod +x "$CURRENT_LINK/uninstall.sh"

  # Run the script as an `if` condition: under `set -e` a bare failing
  # pipeline would abort the whole script before the fallback could run.
  # The subshell keeps the caller's working directory unchanged. "y" answers
  # the script's confirmation prompt.
  if echo "y" | (cd "$CURRENT_LINK" && ./uninstall.sh) >/dev/null 2>&1; then
    log_success "服务停止完成"
  else
    log_warning "停止服务时出现警告,尝试手动停止"
    manual_stop_services
  fi
}
|
||||
|
||||
# Stop node_exporter and dcgm_exporter directly: send the default signal
# first, wait two seconds, then force-kill whatever is still running.
# NOTE(review): only the underscore spelling "dcgm_exporter" is matched; a
# binary named "dcgm-exporter" would not be found - confirm the process name.
manual_stop_services() {
  log_info "手动停止服务..."

  if pgrep -f "node_exporter" >/dev/null 2>&1; then
    pkill -f "node_exporter" && log_info "node_exporter 已停止"
  fi

  if pgrep -f "dcgm_exporter" >/dev/null 2>&1; then
    pkill -f "dcgm_exporter" && log_info "dcgm_exporter 已停止"
  fi

  # Give the processes time to terminate.
  sleep 2

  # pgrep/pkill take an extended regular expression, where alternation is a
  # bare "|". The previous "\|" matched a literal pipe character, so this
  # residual check never fired.
  local leftover_pattern="node_exporter|dcgm_exporter"
  if pgrep -f "$leftover_pattern" >/dev/null 2>&1; then
    log_warning "仍有服务进程运行,尝试强制停止"
    pkill -9 -f "$leftover_pattern" 2>/dev/null || true
  fi

  log_success "手动停止服务完成"
}
|
||||
|
||||
# Report whether services are up after installation. Nothing is started
# here: the function only logs, waits three seconds and re-checks.
# Always returns 0.
start_services() {
  log_info "启动服务..."

  if check_services_running; then
    log_info "服务已在运行,跳过启动"
    return 0
  fi

  # install_artifact.sh has already installed the components and scheduled
  # the periodic health check, so only the state is verified here.
  log_info "组件已安装完成,健康检查定时任务已设置"
  log_info "服务将在健康检查时自动启动(每5分钟检查一次)"

  sleep 3

  if ! check_services_running; then
    log_info "服务可能正在启动中,健康检查机制将自动监控"
  else
    log_success "服务启动成功"
  fi

  return 0
}
|
||||
|
||||
# Detect whether node_exporter / dcgm_exporter appear to be running.
# A service counts as running when one of the default ports (9100, 9400) has
# a TCP listener or when a matching process exists.
# Returns: 0 when running, 1 otherwise.
check_services_running() {
  local ports=(9100 9400) # default node-exporter and dcgm-exporter ports
  local port
  local listeners=""

  # netstat is missing on many minimal installations; fall back to ss.
  if command -v netstat >/dev/null 2>&1; then
    listeners=$(netstat -tlnp 2>/dev/null) || true
  elif command -v ss >/dev/null 2>&1; then
    listeners=$(ss -tln 2>/dev/null) || true
  fi

  for port in "${ports[@]}"; do
    if grep -q ":$port " <<< "$listeners"; then
      log_info "检测到服务正在端口 $port 上运行"
      return 0
    fi
  done

  # pgrep takes an extended regular expression: alternation is a bare "|"
  # (the previous "\|" matched a literal pipe and therefore nothing).
  if pgrep -f "node_exporter|dcgm_exporter" >/dev/null 2>&1; then
    log_info "检测到相关服务进程正在运行"
    return 0
  fi

  return 1
}
|
||||
|
||||
# Abort with status 1 unless the script runs with root privileges.
check_root() {
  [[ $EUID -eq 0 ]] && return 0

  log_error "此脚本需要 root 权限运行"
  log_info "请使用: sudo sh setup.sh"
  exit 1
}
|
||||
|
||||
# Log basic system facts (OS, architecture) and warn when the root
# filesystem has less than 1 GiB available.
# Exits with status 1 when /etc/os-release is missing.
check_system() {
  log_info "检查系统要求..."

  if [[ ! -f /etc/os-release ]]; then
    log_error "无法检测操作系统版本"
    exit 1
  fi

  # Source os-release inside the command-substitution subshell so its
  # variables (NAME, VERSION, ...) do not leak into this script.
  local os_info
  os_info=$(source /etc/os-release && echo "$NAME $VERSION_ID")
  log_info "检测到操作系统: $os_info"

  local arch
  arch=$(uname -m)
  log_info "系统架构: $arch"

  # df -Pk prints one POSIX-format line per filesystem in 1024-byte blocks,
  # so 1 GiB is 1048576 blocks (the previous limit of 1024 meant 1 MiB).
  local available_kb
  available_kb=$(df -Pk / | awk 'NR==2 {print $4}')
  local min_kb=1048576
  if [[ "$available_kb" =~ ^[0-9]+$ ]] && (( available_kb < min_kb )); then
    log_warning "可用磁盘空间不足 1GB,当前可用: $((available_kb / 1024))MB"
  fi
}
|
||||
|
||||
# Download, unpack and install the requested (or latest) release, then point
# the `current` symlink at it.
# Globals: ARGUS_VERSION (read; filled from get_latest_version when empty),
#          FORCE_INSTALL, INSTALL_DIR, VERSIONS_DIR, CURRENT_LINK, TEMP_DIR,
#          BASE_URL, FTP_SERVER, FTP_PORT, FTP_USER, FTP_PASS (read),
#          TAR_NAME, CURL_CMD (written)
# Returns 0 when the requested version is already installed and --force was
# not given; exits with status 1 on any failure. No backup is taken and no
# rollback is attempted (the install flow was simplified).
install_argus_metric() {
  # Resolve the version to install.
  if [[ -z "$ARGUS_VERSION" ]]; then
    ARGUS_VERSION=$(get_latest_version)
  fi

  log_info "开始安装 Argus Metric v$ARGUS_VERSION..."
  log_info "安装目录: $INSTALL_DIR"

  create_install_directories

  # Decide between fresh install, upgrade and forced reinstall.
  local is_upgrade=false
  if check_installed; then
    local current_version
    current_version=$(get_current_version) || true
    if [[ "$current_version" == "$ARGUS_VERSION" ]]; then
      if [[ "$FORCE_INSTALL" == true ]]; then
        log_info "检测到相同版本 v$ARGUS_VERSION,但使用了 --force 参数,将强制重新安装"
        is_upgrade=true
      else
        log_info "版本 v$ARGUS_VERSION 已安装,无需重复安装"
        log_info "如需强制重新安装,请使用 --force 参数"
        return 0
      fi
    else
      log_info "检测到版本升级: v$current_version -> v$ARGUS_VERSION"
      is_upgrade=true
    fi
  fi

  mkdir -p "$TEMP_DIR"
  cd "$TEMP_DIR"

  # Package name convention: dots in the version become underscores.
  TAR_NAME="argus-metric_${ARGUS_VERSION//./_}.tar.gz"
  log_info "下载发布包: $TAR_NAME"
  log_info "从FTP服务器下载: $FTP_SERVER:$FTP_PORT, 用户: $FTP_USER"

  # Printable form of the download command with the password masked; it is
  # the only form that may ever be written to the log.
  CURL_CMD="curl -u \"${FTP_USER}:***\" -sfL \"$BASE_URL/$TAR_NAME\" -o \"$TAR_NAME\""
  log_info "执行命令: $CURL_CMD"

  if ! curl -u "${FTP_USER}:${FTP_PASS}" -sfL "$BASE_URL/$TAR_NAME" -o "$TAR_NAME"; then
    log_error "下载发布包失败: $BASE_URL/$TAR_NAME"
    log_error "完整命令: $CURL_CMD"
    log_error "请检查FTP服务器连接、用户名密码是否正确"
    exit 1
  fi

  log_info "解压发布包..."
  if ! tar -xzf "$TAR_NAME"; then
    log_error "解压发布包失败"
    exit 1
  fi

  log_info "解压后的文件结构:"
  ls -la "$TEMP_DIR"

  local version_dir="$VERSIONS_DIR/$ARGUS_VERSION"
  log_info "安装到版本目录: $version_dir"

  # Stop the running services before replacing files.
  if [[ "$is_upgrade" == true ]]; then
    stop_services
  fi

  # An existing directory for this version is removed and recreated.
  if [[ -d "$version_dir" ]]; then
    log_info "版本目录已存在,备份后更新"
    rm -rf "$version_dir"
  fi
  mkdir -p "$version_dir"

  log_info "移动文件到版本目录: $TEMP_DIR/* -> $version_dir/"

  if [[ ! "$(ls -A "$TEMP_DIR" 2>/dev/null)" ]]; then
    log_error "临时目录为空,无法移动文件"
    exit 1
  fi

  if [[ ! -d "$version_dir" ]]; then
    log_error "目标版本目录不存在: $version_dir"
    exit 1
  fi

  if mv "$TEMP_DIR"/* "$version_dir" 2>/dev/null; then
    log_success "文件移动到版本目录完成"
  else
    log_error "移动文件到版本目录失败"
    log_error "源目录内容:"
    ls -la "$TEMP_DIR" || true
    log_error "目标目录状态:"
    ls -la "$version_dir" || true
    log_error "权限检查:"
    ls -ld "$TEMP_DIR" "$version_dir" || true
    exit 1
  fi

  # Run the packaged installer from inside the version directory, passing
  # that directory as the install root.
  log_info "执行安装脚本..."
  cd "$version_dir"
  if [[ ! -f "install.sh" ]]; then
    log_error "未找到安装脚本 install.sh"
    exit 1
  fi
  chmod +x install.sh
  if ./install.sh "$version_dir"; then
    log_success "安装脚本执行完成"
  else
    log_error "安装脚本执行失败"
    exit 1
  fi

  log_info "更新当前版本链接..."

  # A real directory named `current` would block the symlink; remove it.
  if [[ -d "$CURRENT_LINK" ]] && [[ ! -L "$CURRENT_LINK" ]]; then
    log_warning "发现 current 是目录而不是符号链接,正在删除..."
    rm -rf "$CURRENT_LINK"
  fi

  if ln -sfn "$version_dir" "$CURRENT_LINK"; then
    log_success "版本链接更新完成: $CURRENT_LINK -> $version_dir"
  else
    log_error "版本链接更新失败"
    exit 1
  fi

  update_latest_version_file "$ARGUS_VERSION"
  init_dns_config_to_system

  # Services are not started here (start_services is intentionally unused).

  log_success "Argus Metric v$ARGUS_VERSION 安装完成!"

  echo
  log_info "安装信息:"
  log_info " 版本: $ARGUS_VERSION"
  log_info " 安装目录: $INSTALL_DIR"
  log_info " 版本目录: $version_dir"
  log_info " 当前链接: $CURRENT_LINK"
  if [[ "$is_upgrade" == true ]]; then
    log_info " 升级类型: 版本升级"
  else
    log_info " 安装类型: 全新安装"
  fi
}
|
||||
|
||||
# Uninstall: stop services, run the installed uninstall.sh (auto-confirmed)
# and delete the whole install directory.
# Globals: INSTALL_DIR, CURRENT_LINK (read)
# Returns 0 when nothing is installed; exits with status 1 when uninstall.sh
# fails or the directory cannot be removed.
uninstall_argus_metric() {
  log_info "开始卸载 Argus Metric..."
  log_info "安装目录: $INSTALL_DIR"

  if ! check_installed; then
    log_info "未检测到已安装的 Argus Metric"
    return 0
  fi

  local current_version
  current_version=$(get_current_version) || true
  log_info "检测到当前版本: v$current_version"

  stop_services

  log_info "执行卸载脚本..."
  if [[ -f "$CURRENT_LINK/uninstall.sh" ]]; then
    chmod +x "$CURRENT_LINK/uninstall.sh"

    # --uninstall is an explicit request, so the prompt is auto-confirmed.
    log_info "自动确认卸载操作..."
    # `|| code=$?` captures the status without letting `set -e` abort the
    # script before the failure can be reported. The subshell keeps the
    # working directory out of the tree that is deleted below.
    local uninstall_exit_code=0
    echo "y" | (cd "$CURRENT_LINK" && ./uninstall.sh) || uninstall_exit_code=$?

    if [[ $uninstall_exit_code -eq 0 ]]; then
      log_success "卸载脚本执行完成"
    else
      log_error "卸载脚本执行失败 (退出码: $uninstall_exit_code)"
      exit 1
    fi
  else
    log_warning "未找到卸载脚本,执行基本清理"
  fi

  log_info "清理安装目录..."
  if [[ -d "$INSTALL_DIR" ]]; then
    log_warning "这将删除整个安装目录: $INSTALL_DIR"
    log_warning "包括所有版本、备份和配置文件"

    # Non-interactive by design. ${INSTALL_DIR:?} aborts instead of running
    # rm -rf on an empty path.
    if rm -rf "${INSTALL_DIR:?}"; then
      log_success "安装目录已完全清理: $INSTALL_DIR"
    else
      log_error "清理安装目录失败"
      exit 1
    fi
  else
    log_info "安装目录不存在,无需清理"
  fi

  log_success "Argus Metric 卸载完成!"
}
|
||||
|
||||
# Print the installation status: version, paths, directory tree and the
# list of installed versions. Backup information is no longer shown.
# Globals: INSTALL_DIR, CURRENT_LINK, VERSIONS_DIR, LATEST_VERSION_FILE (read)
show_status() {
  echo "=========================================="
  echo " Argus Metric 安装状态"
  echo "=========================================="
  echo

  if ! check_installed; then
    log_warning "Argus Metric 未安装"
    log_info "安装目录: $INSTALL_DIR"
    return
  fi

  local current_version=$(get_current_version)
  log_info "当前版本: $current_version"
  log_info "安装目录: $INSTALL_DIR"
  log_info "当前链接: $CURRENT_LINK"
  log_info "版本目录: $VERSIONS_DIR/$current_version"
  log_info "版本文件: $LATEST_VERSION_FILE"

  # Raw content of the version file, whitespace stripped.
  if [[ -f "$LATEST_VERSION_FILE" ]]; then
    local file_version=$(cat "$LATEST_VERSION_FILE" 2>/dev/null | tr -d '[:space:]')
    log_info "版本文件内容: $file_version"
  fi

  echo
  log_info "目录结构:"
  if [[ -d "$INSTALL_DIR" ]]; then
    # Prefer tree; fall back to ls when it is unavailable or fails.
    tree -L 2 "$INSTALL_DIR" 2>/dev/null || ls -la "$INSTALL_DIR"
  fi

  echo
  log_info "可用版本:"
  if [[ ! -d "$VERSIONS_DIR" ]]; then
    echo " 无"
  else
    ls -1 "$VERSIONS_DIR" 2>/dev/null | sed 's/^/ - /'
  fi
}
|
||||
|
||||
# Print the available backups, newest first, with their modification time.
# Globals: BACKUPS_DIR (read)
list_backups() {
  echo "=========================================="
  echo " Argus Metric 备份列表"
  echo "=========================================="
  echo

  if [[ ! -d "$BACKUPS_DIR" ]] || [[ $(ls -1 "$BACKUPS_DIR" 2>/dev/null | wc -l) -le 0 ]]; then
    log_warning "没有可用的备份版本"
    return
  fi

  log_info "可用备份版本:"
  ls -1t "$BACKUPS_DIR" 2>/dev/null | while read backup; do
    # First two fields of `stat %y`: date and time of last modification.
    local backup_time=$(stat -c %y "$BACKUPS_DIR/$backup" 2>/dev/null | cut -d' ' -f1-2)
    echo " - $backup (创建时间: $backup_time)"
  done
}
|
||||
|
||||
# Roll back to the most recently modified backup and print the new status.
# Globals: BACKUPS_DIR (read)
# Exits with status 1 when nothing is installed, no backup exists or the
# rollback itself fails.
rollback_version() {
  log_info "开始回滚操作..."

  if ! check_installed; then
    log_error "没有检测到已安装的版本,无法回滚"
    exit 1
  fi

  mkdir -p "$BACKUPS_DIR"

  # Newest backup by modification time.
  local latest_backup=$(ls -1t "$BACKUPS_DIR" 2>/dev/null | head -n 1)
  if [[ -z "$latest_backup" ]]; then
    log_error "没有找到可用的备份版本"
    exit 1
  fi

  log_info "将回滚到备份版本: $latest_backup"

  if ! rollback_to_backup "$latest_backup"; then
    log_error "回滚失败"
    exit 1
  fi

  log_success "回滚完成!"
  echo
  show_status
}
|
||||
|
||||
# Entry point: dispatch on ACTION (status / uninstall / install).
# Globals: ACTION, INSTALL_DIR (read); VERSIONS_DIR, BACKUPS_DIR,
#          CURRENT_LINK, LATEST_VERSION_FILE (re-derived from INSTALL_DIR)
# The status action needs neither root nor FTP parameters. Rollback and
# backup listing are no longer offered.
main() {
  echo "=========================================="
  echo " Argus Metric 在线安装脚本 v1.0"
  echo "=========================================="
  echo

  load_config

  # Re-derive every path from INSTALL_DIR before any action runs:
  # --install-dir may have changed it after the initial assignments, and the
  # status action would otherwise inspect the default location.
  VERSIONS_DIR="$INSTALL_DIR/versions"
  BACKUPS_DIR="$INSTALL_DIR/backups"
  CURRENT_LINK="$INSTALL_DIR/current"
  LATEST_VERSION_FILE="$INSTALL_DIR/LATEST_VERSION"

  if [[ "$ACTION" == "status" ]]; then
    show_status
    return 0
  fi

  check_root
  check_ftp_params
  check_system

  if [[ "$ACTION" == "uninstall" ]]; then
    uninstall_argus_metric
  else
    install_argus_metric
  fi

  echo
  log_info "操作完成!"
}
|
||||
|
||||
# 脚本入口
|
||||
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
|
||||
main "$@"
|
||||
fi
|
||||
143
src/metric/client-plugins/all-in-one-full/scripts/sync_dns.sh
Executable file
143
src/metric/client-plugins/all-in-one-full/scripts/sync_dns.sh
Executable file
@ -0,0 +1,143 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
# 颜色
|
||||
RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m'
|
||||
|
||||
# 日志函数
|
||||
# Logging helpers. Each prints a coloured level tag followed by the message
# ($1) on stderr, so that stdout stays free for functions whose output is
# captured by the caller (download_remote_dns_conf prints a path on stdout).
# Only the colour codes are escape-expanded (%b); the message is printed
# verbatim (%s) so backslashes in paths or server names are not mangled.
log_info()    { printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "$1" >&2; }
log_success() { printf '%b[SUCCESS]%b %s\n' "$GREEN" "$NC" "$1" >&2; }
log_warning() { printf '%b[WARNING]%b %s\n' "$YELLOW" "$NC" "$1" >&2; }
log_error()   { printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$1" >&2; }
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
LOCAL_DNS_CONF="/opt/argus-metric/dns.conf"
|
||||
RESOLV_CONF="/etc/resolv.conf"
|
||||
ALT_RESOLV_CONF="/run/resolv.conf"
|
||||
LOG_FILE="/opt/argus-metric/.dns_sync.log"
|
||||
REMOTE_DNS_CONF_URL=""
|
||||
|
||||
# 获取 FTP 配置
|
||||
# Resolve the FTP connection settings.
# Globals: FTP_SERVER, FTP_USER, FTP_PASSWORD (read/written),
#          SCRIPT_DIR (read), REMOTE_DNS_CONF_URL (written)
# When any of the three FTP_* variables is empty, config.env next to the
# script is sourced (if present); remaining gaps are filled with defaults.
get_ftp_config() {
    log_info "获取 FTP 配置信息..."

    if [[ -z "$FTP_SERVER" || -z "$FTP_USER" || -z "$FTP_PASSWORD" ]]; then
        if [[ -f "$SCRIPT_DIR/config.env" ]]; then
            source "$SCRIPT_DIR/config.env"
        fi
    fi

    # NOTE(review): the default password is hard-coded here; consider
    # requiring it from the environment or config.env instead.
    : "${FTP_SERVER:=localhost}"
    : "${FTP_USER:=ftpuser}"
    : "${FTP_PASSWORD:=ZGClab1234!}"

    REMOTE_DNS_CONF_URL="ftp://${FTP_USER}:${FTP_PASSWORD}@${FTP_SERVER}/dns.conf"
}
|
||||
|
||||
# 下载远程 dns.conf
|
||||
# Download dns.conf from the FTP server into a fresh temporary file.
# Globals:  FTP_USER, FTP_PASSWORD, FTP_SERVER (read)
# Outputs:  path of the downloaded file on stdout (the caller removes it)
# Returns:  0 on success, 1 if the server is unreachable, the temp file
#           cannot be created, or the download fails
download_remote_dns_conf() {
    local tmp

    log_info "测试 FTP 连接..."
    if ! curl -u "${FTP_USER}:${FTP_PASSWORD}" -sfI "ftp://${FTP_SERVER}/" >/dev/null; then
        log_error "无法连接到 FTP 服务器: $FTP_SERVER"
        return 1
    fi

    # mktemp gives an unpredictable, exclusively created file instead of
    # the guessable /tmp/dns.remote.<pid> name.
    if ! tmp=$(mktemp "${TMPDIR:-/tmp}/dns.remote.XXXXXX"); then
        log_error "无法创建临时文件"
        return 1
    fi

    if ! curl -u "${FTP_USER}:${FTP_PASSWORD}" -sf "ftp://${FTP_SERVER}/dns.conf" -o "$tmp" 2>/dev/null; then
        log_error "下载 dns.conf 失败"
        rm -f "$tmp"
        return 1
    fi

    # mktemp creates the file with mode 0600; the caller copies it to the
    # local dns.conf, so restore the usual 0644 for other readers.
    chmod 644 "$tmp"

    echo "$tmp"
}
|
||||
|
||||
# 文件比较
|
||||
# Succeeds only when the two files ($1, $2) have identical content.
compare_files() { cmp -s "$1" "$2"; }
|
||||
|
||||
# 从 dns.conf 提取有效 IP
|
||||
# Extract the valid IPv4 addresses from a dns.conf-style file.
# Arguments: $1 - file with one address per line
# Outputs:   unique addresses, sorted, one per line
# A line counts only if, after stripping a trailing CR and surrounding
# blanks, it is exactly four dot-separated numbers of at most three digits,
# each no greater than 255. Anything else (comments, text) is ignored.
get_dns_ips() {
    awk '
        {
            line = $0
            gsub(/\r/, "", line)
            gsub(/^[ \t]+|[ \t]+$/, "", line)
            if (line !~ /^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$/) next
            split(line, octet, ".")
            for (i = 1; i <= 4; i++) {
                if (length(octet[i]) > 3 || octet[i] + 0 > 255) next
            }
            print line
        }
    ' "$1" | sort -u
}
|
||||
|
||||
# 安全更新 resolv.conf(保留符号链接)
|
||||
# Rewrite the resolver configuration so the addresses from dns.conf come
# first, followed by the previous non-nameserver lines.
# Globals:   RESOLV_CONF, ALT_RESOLV_CONF (read)
# Arguments: $1 - dns.conf file to take the nameserver addresses from
# Returns:   0 normally (including when the final write fails, which is
#            only logged); 1 if no temporary directory can be created
update_resolv_conf() {
    local dns_conf="$1"
    local -a dns_ips
    local ip target_file workdir merged deduped

    mapfile -t dns_ips < <(get_dns_ips "$dns_conf")
    if [[ ${#dns_ips[@]} -eq 0 ]]; then
        log_warning "未检测到有效 DNS"
        return
    fi

    target_file="$RESOLV_CONF"
    if [[ ! -w "$RESOLV_CONF" ]]; then
        log_warning "/etc/resolv.conf 不可写,使用兜底路径 $ALT_RESOLV_CONF"
        target_file="$ALT_RESOLV_CONF"
    fi

    # Private scratch directory instead of predictable /tmp/resolv.new.<pid>.
    if ! workdir=$(mktemp -d "${TMPDIR:-/tmp}/resolv.new.XXXXXX"); then
        log_error "无法创建临时目录"
        return 1
    fi
    merged="$workdir/resolv.conf"
    deduped="$workdir/resolv.conf.uniq"

    # Best-effort timestamped backup; the target may not exist yet.
    cp "$target_file" "${target_file}.backup.$(date +%Y%m%d_%H%M%S)" 2>/dev/null || true
    log_info "更新 DNS 配置文件: $target_file"

    # New nameserver lines first ...
    for ip in "${dns_ips[@]}"; do
        echo "nameserver $ip"
    done >"$merged"

    # ... then the previous content minus its nameserver lines.
    grep -v '^nameserver' "$target_file" >>"$merged" 2>/dev/null || true
    # Drop repeated lines while keeping first-seen order.
    awk '!a[$0]++' "$merged" >"$deduped"

    # Overwrite in place with cat rather than mv, which the original author
    # notes can fail with "device busy" on this file.
    if cat "$deduped" >"$target_file" 2>/dev/null; then
        chmod 644 "$target_file"
        log_success "DNS 更新完成: ${dns_ips[*]}"
    else
        log_error "无法写入 $target_file,可能被系统锁定"
    fi

    rm -rf -- "$workdir"
}
|
||||
|
||||
# 检查 resolv.conf 是否包含 dns.conf 内容
|
||||
# Safety net: verify that every address from dns.conf has a nameserver
# entry in RESOLV_CONF and trigger a rewrite on the first one missing.
# Globals:   RESOLV_CONF (read)
# Arguments: $1 - dns.conf file
ensure_dns_in_resolv() {
    local dns_conf="$1"
    local -a dns_ips
    local ip

    mapfile -t dns_ips < <(get_dns_ips "$dns_conf")
    if [[ ${#dns_ips[@]} -eq 0 ]]; then
        return 0
    fi

    for ip in "${dns_ips[@]}"; do
        # Compare whole fields: an unanchored regex match would treat
        # 1.1.1.1 as present when only 1.1.1.10 is configured, and would
        # let "." match any character.
        if ! awk -v ip="$ip" '
                $1 == "nameserver" && $2 == ip { found = 1; exit }
                END { exit (found ? 0 : 1) }
            ' "$RESOLV_CONF" 2>/dev/null; then
            log_warning "检测到 /etc/resolv.conf 缺少 $ip,执行兜底修复"
            update_resolv_conf "$dns_conf"
            return
        fi
    done

    log_info "/etc/resolv.conf 已包含所有 DNS"
}
|
||||
|
||||
# Append a timestamped line ($1) to the sync log file $LOG_FILE.
log_sync() { printf '[%s] %s\n' "$(date '+%F %T')" "$1" >>"$LOG_FILE"; }
|
||||
|
||||
# Synchronise the local dns.conf with the copy on the FTP server and keep
# the resolver configuration in line with it.
main() {
    log_info "开始 DNS 同步检查..."
    mkdir -p /opt/argus-metric

    get_ftp_config

    local remote_file
    remote_file=$(download_remote_dns_conf) || {
        log_error "下载失败"
        log_sync "同步失败"
        exit 1
    }

    if [[ ! -f "$LOCAL_DNS_CONF" ]]; then
        # First run: adopt the remote file as the local copy.
        log_info "本地 dns.conf 不存在,初始化..."
        cp "$remote_file" "$LOCAL_DNS_CONF"
        update_resolv_conf "$LOCAL_DNS_CONF"
        log_sync "首次同步完成"
    elif compare_files "$LOCAL_DNS_CONF" "$remote_file"; then
        # Nothing changed upstream; only re-check the resolver file.
        log_info "dns.conf 无变化"
        ensure_dns_in_resolv "$LOCAL_DNS_CONF"
        log_sync "dns.conf 无变化,执行兜底检查"
    else
        # Upstream changed: replace the local copy and apply it.
        log_info "检测到 DNS 配置更新"
        cp "$remote_file" "$LOCAL_DNS_CONF"
        update_resolv_conf "$LOCAL_DNS_CONF"
        log_sync "DNS 配置同步完成"
    fi

    rm -f "$remote_file"
    log_success "DNS 同步流程完成"
}
|
||||
|
||||
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
|
||||
main "$@"
|
||||
fi
|
||||
274
src/metric/client-plugins/all-in-one-full/scripts/uninstall_artifact.sh
Executable file
274
src/metric/client-plugins/all-in-one-full/scripts/uninstall_artifact.sh
Executable file
@ -0,0 +1,274 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
# 颜色定义
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
BLUE='\033[0;34m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
# 日志函数
|
||||
# Logging helpers: print a coloured level tag followed by the message ($1)
# on stdout. Only the colour codes are escape-expanded (%b); the message
# is printed verbatim (%s) so backslashes in it are not interpreted.
log_info() {
    printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "$1"
}

log_success() {
    printf '%b[SUCCESS]%b %s\n' "$GREEN" "$NC" "$1"
}

log_warning() {
    printf '%b[WARNING]%b %s\n' "$YELLOW" "$NC" "$1"
}

log_error() {
    printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$1"
}
|
||||
|
||||
# 配置变量
|
||||
INSTALL_DIR="/opt/argus-metric"
|
||||
TEMP_DIR="/tmp/argus-metric-uninstall-$$"
|
||||
VERSION_FILE="version.json"
|
||||
|
||||
# 检查是否为 root 用户
|
||||
# Abort with exit status 1 unless the script runs as root (EUID 0).
check_root() {
    if (( EUID == 0 )); then
        return 0
    fi

    log_error "此脚本需要 root 权限运行"
    log_info "请使用: sudo $0"
    exit 1
}
|
||||
|
||||
# 查找版本文件
|
||||
# Locate the version metadata file and store its path in VERSION_FILE_PATH.
# Search order: $VERSION_FILE in the current directory, then the first
# artifact/<dir>/ that contains it. Exits with status 1 when none is found.
find_version_file() {
    log_info "查找版本信息文件..."

    # The current directory takes precedence.
    if [[ -f "$VERSION_FILE" ]]; then
        VERSION_FILE_PATH="$VERSION_FILE"
        log_success "找到版本文件: $VERSION_FILE"
        return 0
    fi

    # Otherwise look inside each directory under artifact/.
    local candidate
    for version_dir in artifact/*/; do
        candidate="${version_dir}${VERSION_FILE}"
        [[ -f "$candidate" ]] || continue
        VERSION_FILE_PATH="$candidate"
        log_success "找到版本文件: $VERSION_FILE_PATH"
        return 0
    done

    log_error "未找到版本信息文件 $VERSION_FILE"
    log_info "请确保在正确的目录下运行此脚本"
    exit 1
}
|
||||
|
||||
# 解析版本信息
|
||||
# Read version metadata from $VERSION_FILE_PATH.
# Globals: VERSION_FILE_PATH, TEMP_DIR (read); VERSION, BUILD_TIME (written)
# Outputs: writes the install_order entries, one per line, to
#          $TEMP_DIR/install_order.txt
# Exits with status 1 when the file is missing or has no install_order key.
parse_version_info() {
    log_info "解析版本信息..."

    if [[ ! -f "$VERSION_FILE_PATH" ]]; then
        log_error "版本文件不存在: $VERSION_FILE_PATH"
        exit 1
    fi

    local order_file="$TEMP_DIR/install_order.txt"

    if command -v jq &> /dev/null; then
        # Preferred path: let jq do the JSON parsing.
        VERSION=$(jq -r '.version' "$VERSION_FILE_PATH")
        BUILD_TIME=$(jq -r '.build_time' "$VERSION_FILE_PATH")

        # install_order holds the complete archive file names.
        if jq -e '.install_order' "$VERSION_FILE_PATH" > /dev/null 2>&1; then
            jq -r '.install_order[]' "$VERSION_FILE_PATH" > "$order_file"
        else
            log_error "version.json 中缺少 install_order 字段"
            exit 1
        fi
    else
        log_warning "jq 未安装,使用简单的 JSON 解析"
        # Only the first matching line is used, so nested "version" keys
        # further down cannot turn the value into several lines.
        VERSION=$(grep -m 1 '"version"' "$VERSION_FILE_PATH" | sed 's/.*"version": *"\([^"]*\)".*/\1/')
        BUILD_TIME=$(grep -m 1 '"build_time"' "$VERSION_FILE_PATH" | sed 's/.*"build_time": *"\([^"]*\)".*/\1/')

        # Collect only the quoted strings between the "[" that follows
        # "install_order" and its closing "]". A plain line grep would also
        # pick up the key name itself and any JSON keys after the array.
        # awk exits non-zero when the key never appears.
        if ! awk '
                !in_list && /"install_order"[ \t]*:/ {
                    in_list = 1
                    sub(/^.*"install_order"[ \t]*:/, "")
                }
                in_list {
                    rest = $0
                    while (match(rest, /"[^"]*"/)) {
                        print substr(rest, RSTART + 1, RLENGTH - 2)
                        rest = substr(rest, RSTART + RLENGTH)
                    }
                    if ($0 ~ /\]/) exit
                }
                END { exit (in_list ? 0 : 1) }
            ' "$VERSION_FILE_PATH" > "$order_file"; then
            log_error "version.json 中缺少 install_order 字段"
            exit 1
        fi
    fi

    log_success "版本信息解析完成"
    log_info " 版本: $VERSION"
    log_info " 构建时间: $BUILD_TIME"
}
|
||||
|
||||
# 创建临时目录
|
||||
# Create the scratch directory $TEMP_DIR used during the uninstall run.
create_temp_dirs() {
    log_info "创建临时目录..."

    mkdir -p "$TEMP_DIR"

    log_success "临时目录创建完成: $TEMP_DIR"
}
|
||||
|
||||
# 卸载组件
|
||||
# Run the bundled uninstall.sh of every archive listed in
# $TEMP_DIR/install_order.txt, in file order.
# Each archive is looked up next to the version file, unpacked into its own
# directory under $TEMP_DIR, uninstalled and then removed again. Any
# missing archive, failed extraction or failed uninstaller aborts the
# script with exit status 1.
uninstall_components() {
    log_info "开始卸载组件..."

    artifact_dir=$(dirname "$VERSION_FILE_PATH")
    uninstall_count=0
    total_count=0

    # Without an order list there is nothing to uninstall.
    if [[ ! -f "$TEMP_DIR/install_order.txt" ]]; then
        log_success "所有组件卸载完成"
        return
    fi

    total_count=$(wc -l < "$TEMP_DIR/install_order.txt")

    while IFS= read -r filename; do
        uninstall_count=$((uninstall_count + 1))

        # Component name = archive name without the -YYYYMMDD-HHMMSS.tar.gz suffix.
        component=$(echo "$filename" | sed 's/-[0-9]\{8\}-[0-9]\{6\}\.tar\.gz$//')

        log_info "[$uninstall_count/$total_count] 卸载 $component..."

        # The list already carries the complete archive file name.
        tar_file="$artifact_dir/$filename"
        if [[ ! -f "$tar_file" ]]; then
            log_error "找不到组件文件: $filename"
            exit 1
        fi

        # Unpack into a per-component scratch directory.
        component_temp_dir="$TEMP_DIR/$component"
        mkdir -p "$component_temp_dir"
        if ! tar -xzf "$tar_file" -C "$component_temp_dir"; then
            log_error " $component 解压失败"
            exit 1
        fi
        log_success " $component 解压完成"

        # The first directory found in the unpacked tree is the payload.
        extracted_dir=""
        for dir in "$component_temp_dir"/*; do
            [[ -d "$dir" ]] || continue
            extracted_dir="$dir"
            break
        done
        if [[ -z "$extracted_dir" ]]; then
            log_error " $component 解压后未找到目录"
            exit 1
        fi

        if [[ ! -f "$extracted_dir/uninstall.sh" ]]; then
            log_warning " $component 缺少 uninstall.sh 文件,跳过卸载"
        else
            log_info " 执行 $component 卸载脚本..."
            # Every component uninstaller asks for a single confirmation,
            # which is answered with "y" on its stdin.
            if (cd "$extracted_dir" && echo "y" | ./uninstall.sh); then
                log_success " $component 卸载完成"
            else
                log_error " $component 卸载失败"
                exit 1
            fi
        fi

        # Remove the unpacked files for this component.
        rm -rf "$component_temp_dir"
    done < "$TEMP_DIR/install_order.txt"

    log_success "所有组件卸载完成"
}
|
||||
|
||||
# 清理全局文件
|
||||
# Remove the installation directory and the known global directories
# (/etc/argus-metric, /var/log/argus-metric) when they exist.
cleanup_global_files() {
    log_info "清理全局文件..."

    local target
    local -a global_configs=(
        "/etc/argus-metric"
        "/var/log/argus-metric"
    )

    # Installation directory.
    if [[ ! -d "$INSTALL_DIR" ]]; then
        log_info "安装目录不存在: $INSTALL_DIR"
    else
        rm -rf "$INSTALL_DIR"
        log_success "安装目录已清理: $INSTALL_DIR"
    fi

    # Possible global configuration and log directories.
    for target in "${global_configs[@]}"; do
        [[ -d "$target" ]] || continue
        rm -rf "$target"
        log_success "全局配置已清理: $target"
    done
}
|
||||
|
||||
# 显示卸载信息
|
||||
# Print the closing summary: version, build time, what was cleaned and
# what may still have to be removed by hand.
show_uninstall_info() {
    log_success "Argus-Metrics All-in-One 卸载完成!"
    cat <<EOF

卸载信息:
 版本: $VERSION
 构建时间: $BUILD_TIME

清理内容:
 - 二进制文件
 - 配置文件
 - 数据目录
 - 进程和服务
 - 全局安装目录

注意:
 - 系统依赖包可能仍然存在
 - 如需完全清理,请手动检查并删除相关文件

EOF
}
|
||||
|
||||
# 清理函数
|
||||
# Remove the scratch directory $TEMP_DIR if it exists.
cleanup() {
    [[ -d "$TEMP_DIR" ]] || return 0
    rm -rf "$TEMP_DIR"
}
|
||||
|
||||
# 设置清理陷阱
|
||||
trap cleanup EXIT
|
||||
|
||||
# 主函数
|
||||
# Entry point of the uninstaller: locate and parse the version metadata,
# ask for confirmation, then uninstall all components and clean up.
main() {
    local banner="=========================================="

    echo "$banner"
    echo " Argus-Metrics All-in-One 卸载脚本"
    echo "$banner"
    echo

    check_root
    find_version_file
    create_temp_dirs
    parse_version_info

    log_warning "此操作将完全卸载 Argus-Metrics All-in-One"
    read -p "确认继续?(y/N): " confirm

    # Only an explicit y/Y continues; anything else cancels.
    case "$confirm" in
        y|Y)
            ;;
        *)
            log_info "取消卸载操作"
            exit 0
            ;;
    esac

    uninstall_components
    cleanup_global_files
    show_uninstall_info
}
|
||||
|
||||
# 脚本入口
|
||||
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
|
||||
main "$@"
|
||||
fi
|
||||
350
src/metric/client-plugins/all-in-one-full/scripts/version-manager.sh
Executable file
350
src/metric/client-plugins/all-in-one-full/scripts/version-manager.sh
Executable file
@ -0,0 +1,350 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
# 颜色定义
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
BLUE='\033[0;34m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
# 日志函数
|
||||
# Logging helpers: print a coloured level tag followed by the message ($1)
# on stdout. Only the colour codes are escape-expanded (%b); the message
# is printed verbatim (%s) so backslashes in it are not interpreted.
log_info() {
    printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "$1"
}

log_success() {
    printf '%b[SUCCESS]%b %s\n' "$GREEN" "$NC" "$1"
}

log_warning() {
    printf '%b[WARNING]%b %s\n' "$YELLOW" "$NC" "$1"
}

log_error() {
    printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$1"
}
|
||||
|
||||
# 显示帮助信息
|
||||
# Print the usage text of the version management tool.
show_help() {
    cat <<EOF
AIOps 版本管理工具

用法: $0 <command> [options]

命令:
 bump <type> - 升级版本号 (major|minor|patch)
 set <version> - 设置指定版本号
 show - 显示当前版本信息
 list - 列出所有版本
 clean - 清理旧版本
 validate - 验证版本配置

示例:
 $0 bump minor # 升级次版本号 1.0.0 -> 1.1.0
 $0 set 2.0.0 # 设置版本为 2.0.0
 $0 show # 显示当前版本
 $0 list # 列出所有版本
EOF
}
|
||||
|
||||
# 获取当前版本
|
||||
# Print the content of config/VERSION, or "0.0.0" when the file is absent.
get_current_version() {
    local version_file="config/VERSION"

    if [[ ! -f "$version_file" ]]; then
        echo "0.0.0"
        return
    fi

    cat "$version_file"
}
|
||||
|
||||
# 设置版本号
|
||||
# Write a new version number to config/VERSION.
# Arguments: $1 - version in major.minor.patch form (digits only)
# Exits with status 1 when the format is invalid.
set_version() {
    local new_version="$1"

    # Validate the version number format.
    if [[ ! "$new_version" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
        log_error "无效的版本号格式: $new_version"
        log_info "版本号格式应为: major.minor.patch (如: 1.2.3)"
        exit 1
    fi

    # get_current_version tolerates a missing config/VERSION, so a first
    # "set" or "bump" must be able to create the directory as well.
    mkdir -p config
    echo "$new_version" > config/VERSION
    log_success "版本号已设置为: $new_version"
}
|
||||
|
||||
# 升级版本号
|
||||
# Increase one part of the current version and store the result.
# Arguments: $1 - which part to bump: major, minor or patch
# A major bump resets minor and patch to 0; a minor bump resets patch.
# Exits with status 1 on an unknown bump type.
bump_version() {
    local bump_type="$1"
    local current_version major minor patch new_version

    # Assigned separately from "local" so a failure is not masked.
    current_version=$(get_current_version)

    # Split the current version into its parts.
    IFS='.' read -r major minor patch <<< "$current_version"

    # "10#" forces base 10: a field such as 08 or 09 would otherwise be
    # rejected as an invalid octal number. Empty fields count as 0.
    case "$bump_type" in
        "major")
            major=$((10#${major:-0} + 1))
            minor=0
            patch=0
            ;;
        "minor")
            minor=$((10#${minor:-0} + 1))
            patch=0
            ;;
        "patch")
            patch=$((10#${patch:-0} + 1))
            ;;
        *)
            log_error "无效的升级类型: $bump_type"
            log_info "支持的类型: major, minor, patch"
            exit 1
            ;;
    esac

    new_version="$major.$minor.$patch"
    set_version "$new_version"
    log_success "版本号已从 $current_version 升级到 $new_version"
}
|
||||
|
||||
# 显示当前版本信息
|
||||
# Show the current version, the components listed in config/checklist and
# the archives already built under artifact/<version>/.
show_version() {
    local current_version artifact_dir line component version file filename size

    current_version=$(get_current_version)
    log_info "当前版本: $current_version"

    if [[ -f "config/checklist" ]]; then
        echo
        echo "组件清单:"
        # "|| [[ -n $line ]]" keeps the last entry when the file does not
        # end with a newline.
        while IFS= read -r line || [[ -n "$line" ]]; do
            # Skip blank lines and comments.
            [[ -z "$line" || "$line" =~ ^[[:space:]]*# ]] && continue
            # Only the first two fields are shown; "_" absorbs the rest.
            read -r component version _ <<< "$line"
            if [[ -n "$component" && -n "$version" ]]; then
                echo " - $component v$version"
            fi
        done < config/checklist
    fi

    # Check whether a matching artifact directory has been built.
    artifact_dir="artifact/$current_version"
    if [[ -d "$artifact_dir" ]]; then
        echo
        echo "已构建的组件:"
        for file in "$artifact_dir"/*.tar.gz; do
            # An unmatched glob stays literal; the -f test filters it out.
            if [[ -f "$file" ]]; then
                filename=$(basename "$file")
                size=$(du -h "$file" | cut -f1)
                echo " - $filename ($size)"
            fi
        done

        if [[ -f "$artifact_dir/version.json" ]]; then
            echo
            echo "版本信息文件: $artifact_dir/version.json"
        fi
    else
        echo
        log_warning "未找到对应的构建目录: $artifact_dir"
        log_info "运行 ./package.sh 进行构建"
    fi
}
|
||||
|
||||
# 列出所有版本
|
||||
# List every directory under artifact/, mark the current version and show
# how many .tar.gz archives each one contains.
list_versions() {
    log_info "所有版本列表:"
    echo

    if [[ ! -d "artifact" ]]; then
        log_warning "artifact 目录不存在"
        return
    fi

    for version_dir in artifact/*/; do
        # An unmatched glob stays literal and is skipped here.
        [[ -d "$version_dir" ]] || continue

        local version=$(basename "$version_dir")
        local current_version=$(get_current_version)

        if [[ "$version" != "$current_version" ]]; then
            echo " $version"
        else
            echo " * $version (当前版本)"
        fi

        # Count the archives of this version.
        local component_count=0
        for file in "$version_dir"/*.tar.gz; do
            [[ -f "$file" ]] || continue
            component_count=$((component_count + 1))
        done

        if (( component_count > 0 )); then
            echo " 包含 $component_count 个组件"
        fi
    done
}
|
||||
|
||||
# 清理旧版本
|
||||
# Delete old build directories under artifact/, keeping the five highest
# versions. The current version is never deleted, even when it is among
# the oldest (in that case fewer directories are removed).
clean_versions() {
    local current_version version_dir version i
    local keep_versions=5 # keep the 5 most recent versions
    local -a versions=()
    local total_versions versions_to_remove

    current_version=$(get_current_version)

    log_info "清理旧版本 (保留最近 $keep_versions 个版本)..."

    if [[ ! -d "artifact" ]]; then
        log_warning "artifact 目录不存在"
        return
    fi

    # Collect the version directories in ascending version order. A plain
    # lexical sort would place 1.10.0 before 1.9.0 and delete the newer
    # build first, so use version sort (-V).
    while IFS= read -r -d '' version_dir; do
        versions+=("$(basename "$version_dir")")
    done < <(find artifact -maxdepth 1 -type d -name "[0-9]*" -print0 | sort -z -V)

    total_versions=${#versions[@]}
    versions_to_remove=$((total_versions - keep_versions))

    if [[ $versions_to_remove -le 0 ]]; then
        log_info "无需清理,当前只有 $total_versions 个版本"
        return
    fi

    log_info "将删除 $versions_to_remove 个旧版本..."

    for ((i = 0; i < versions_to_remove; i++)); do
        version="${versions[i]}"
        if [[ "$version" != "$current_version" ]]; then
            log_info "删除版本: $version"
            # ":?" aborts instead of ever expanding to "artifact/".
            rm -rf -- "artifact/${version:?}"
        fi
    done

    log_success "旧版本清理完成"
}
|
||||
|
||||
# 验证版本配置
|
||||
# Helper for validate_version: check that a packaging script exists and
# report whether it is executable.
# Arguments: $1 - path of the script
# Returns:   1 when the file is missing, 0 otherwise (a non-executable
#            file only produces a warning)
_validate_script_file() {
    local script_path="$1"
    local script_name="${script_path##*/}"

    if [[ ! -f "$script_path" ]]; then
        log_error "$script_name 文件不存在"
        return 1
    fi

    if [[ -x "$script_path" ]]; then
        log_success "$script_name 可执行"
    else
        log_warning "$script_name 不可执行,请运行: chmod +x $script_path"
    fi
    return 0
}

# Validate the version configuration: config/VERSION, config/checklist
# (including the plugins/<component> directories it names) and the
# package/install scripts. Exits with status 1 when any problem is found.
validate_version() {
    log_info "验证版本配置..."

    local errors=0
    local version line component component_count

    # config/VERSION must exist and hold a major.minor.patch number.
    if [[ ! -f "config/VERSION" ]]; then
        log_error "VERSION 文件不存在"
        errors=$((errors + 1))
    else
        version=$(get_current_version)
        if [[ ! "$version" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
            log_error "VERSION 文件格式无效: $version"
            errors=$((errors + 1))
        else
            log_success "VERSION 文件格式正确: $version"
        fi
    fi

    # config/checklist must list at least one component, each with a
    # directory under plugins/.
    if [[ ! -f "config/checklist" ]]; then
        log_error "checklist 文件不存在"
        errors=$((errors + 1))
    else
        component_count=0
        # "|| [[ -n $line ]]" keeps the last entry when the file does not
        # end with a newline.
        while IFS= read -r line || [[ -n "$line" ]]; do
            # Skip blank lines and comments.
            [[ -z "$line" || "$line" =~ ^[[:space:]]*# ]] && continue
            # Only the first two fields matter; "_" absorbs the rest.
            read -r component version _ <<< "$line"
            if [[ -n "$component" && -n "$version" ]]; then
                component_count=$((component_count + 1))

                if [[ ! -d "plugins/$component" ]]; then
                    log_error "组件目录不存在: plugins/$component"
                    errors=$((errors + 1))
                fi
            fi
        done < config/checklist

        if [[ $component_count -gt 0 ]]; then
            log_success "checklist 包含 $component_count 个组件"
        else
            log_error "checklist 中没有有效组件"
            errors=$((errors + 1))
        fi
    fi

    # scripts/package_artifact.sh and scripts/install_artifact.sh.
    _validate_script_file "scripts/package_artifact.sh" || errors=$((errors + 1))
    _validate_script_file "scripts/install_artifact.sh" || errors=$((errors + 1))

    if [[ $errors -eq 0 ]]; then
        log_success "版本配置验证通过"
    else
        log_error "发现 $errors 个配置问题"
        exit 1
    fi
}
|
||||
|
||||
# 主函数
|
||||
# Command dispatcher of the version management tool.
# Arguments: $1 - command; $2 - argument required by "bump" and "set"
main() {
    local requested="${1:-}"

    case "$requested" in
        bump)
            if [[ -z "${2:-}" ]]; then
                log_error "请指定升级类型: major, minor, patch"
                exit 1
            fi
            bump_version "$2"
            ;;
        set)
            if [[ -z "${2:-}" ]]; then
                log_error "请指定版本号"
                exit 1
            fi
            set_version "$2"
            ;;
        show)
            show_version
            ;;
        list)
            list_versions
            ;;
        clean)
            clean_versions
            ;;
        validate)
            validate_version
            ;;
        # No command at all behaves like an explicit help request.
        help|-h|--help|"")
            show_help
            ;;
        *)
            log_error "未知命令: $1"
            echo
            show_help
            exit 1
            ;;
    esac
}
|
||||
|
||||
# 脚本入口
|
||||
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
|
||||
main "$@"
|
||||
fi
|
||||
@ -2,6 +2,18 @@ FROM grafana/grafana:11.1.0
|
||||
|
||||
USER root
|
||||
|
||||
# 构建参数:是否使用内网镜像
|
||||
ARG USE_INTRANET=false
|
||||
|
||||
# 根据是否为内网构建切换 apk 源
|
||||
RUN if [ "$USE_INTRANET" = "true" ]; then \
|
||||
echo "Configuring intranet apk repositories..." && \
|
||||
sed -i 's#https\?://[^/]\+#http://10.68.64.1#g' /etc/apk/repositories; \
|
||||
else \
|
||||
echo "Configuring public apk repositories..." && \
|
||||
sed -i 's#https\?://[^/]\+#https://mirrors.aliyun.com#g' /etc/apk/repositories; \
|
||||
fi
|
||||
|
||||
# 安装必要的工具
|
||||
RUN apk add --no-cache \
|
||||
supervisor \
|
||||
@ -10,6 +22,11 @@ RUN apk add --no-cache \
|
||||
vim \
|
||||
bash
|
||||
|
||||
# 部署镜像时恢复到部署侧使用的内网镜像源
|
||||
RUN if [ "$USE_INTRANET" = "true" ]; then \
|
||||
sed -i 's#https\?://[^/]\+#https://10.92.132.52/mirrors#g' /etc/apk/repositories; \
|
||||
fi
|
||||
|
||||
# supervisor 日志目录
|
||||
RUN mkdir -p /var/log/supervisor
|
||||
|
||||
@ -48,6 +65,8 @@ COPY grafana.ini /tmp/grafana.ini
|
||||
COPY datasources/datasources.yml /tmp/datasources.yml
|
||||
COPY dashboards/dashboards.yml /tmp/dashboards.yml
|
||||
COPY dashboards/default_dashboard_by_hostname.json /tmp/default_dashboard.json
|
||||
COPY dashboards/default_cluster_dashboard.json /tmp/default_cluster_dashboard.json
|
||||
COPY dashboards/default_dashboard_by_instance.json /tmp/default_dashboard_by_instance.json
|
||||
|
||||
# supervisor 配置
|
||||
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
|
||||
|
||||
@ -581,6 +581,372 @@
|
||||
],
|
||||
"title": "Node Process Count",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "GPU Utilization (%)",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 80
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 95
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 24
|
||||
},
|
||||
"id": 301,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "DCGM_FI_DEV_GPU_UTIL{hostname=~\"$hostname\"}",
|
||||
"legendFormat": "{{hostname}} GPU{{gpu}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "GPU 利用率 (单卡)",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisBorderShow": true,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "Memory Used (%)",
|
||||
"axisPlacement": "left",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green"
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 80
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 95
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 24
|
||||
},
|
||||
"id": 403,
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "round(DCGM_FI_DEV_FB_USED{hostname=~\"$hostname\"} / (DCGM_FI_DEV_FB_USED{hostname=~\"$hostname\"} + DCGM_FI_DEV_FB_FREE{hostname=~\"$hostname\"}) * 100)",
|
||||
"legendFormat": "{{hostname}} GPU{{gpu}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "GPU 显存使用率 (单卡)",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisBorderShow": true,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "Temperature (℃)",
|
||||
"axisPlacement": "left",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 75
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 85
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "celsius"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 32
|
||||
},
|
||||
"id": 501,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "DCGM_FI_DEV_GPU_TEMP{hostname=~\"$hostname\"}",
|
||||
"legendFormat": "{{hostname}} GPU{{gpu}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "GPU 温度(单卡)",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisBorderShow": true,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "Power (W)",
|
||||
"axisPlacement": "left",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"max": 300,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 200
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 300
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "watt"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 32
|
||||
},
|
||||
"id": 502,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "DCGM_FI_DEV_POWER_USAGE{hostname=~\"$hostname\"}",
|
||||
"legendFormat": "{{hostname}} GPU{{gpu}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "GPU 功率 (单卡)",
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"refresh": "15s",
|
||||
@ -589,11 +955,6 @@
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {
|
||||
"selected": true,
|
||||
"text": "node-exporter-A1",
|
||||
"value": "node-exporter-A1"
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
@ -623,7 +984,7 @@
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "Node and GPU Metrics",
|
||||
"uid": "node_gpu_metrics",
|
||||
"title": "Node and GPU Metrics (by hostname)",
|
||||
"uid": "node_gpu_metrics_by_hostname",
|
||||
"weekStart": ""
|
||||
}
|
||||
}
|
||||
|
||||
@ -622,7 +622,7 @@
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "Node and GPU Metrics",
|
||||
"uid": "node_gpu_metrics",
|
||||
"title": "Node and GPU Metrics (by instance)",
|
||||
"uid": "node_gpu_metrics_by_instance",
|
||||
"weekStart": ""
|
||||
}
|
||||
}
|
||||
|
||||
@ -8,7 +8,7 @@ datasources:
|
||||
type: prometheus
|
||||
access: proxy
|
||||
uid: eezk1zvkie4g0a
|
||||
url: http://10.211.55.5:9090
|
||||
url: http://prom.metric.argus.com:9090
|
||||
isDefault: true
|
||||
editable: true
|
||||
jsonData:
|
||||
|
||||
@ -44,12 +44,18 @@ else
|
||||
fi
|
||||
|
||||
# 复制数据源配置文件到挂载目录
|
||||
if [ -f "/tmp/datasources.yml" ]; then
|
||||
echo "[INFO] Copying datasource configuration to /private/argus/metric/grafana/provisioning/datasources/"
|
||||
cp /tmp/datasources.yml /private/argus/metric/grafana/provisioning/datasources/datasources.yml
|
||||
echo "[INFO] Datasource configuration copied successfully"
|
||||
elif [ -d "/private/argus/metric/grafana/provisioning/datasources" ] && [ "$(ls -A /private/argus/metric/grafana/provisioning/datasources)" ]; then
|
||||
echo "[INFO] Found existing datasource provisioning files in /private/argus/metric/grafana/provisioning/datasources"
|
||||
DS_OUT="/private/argus/metric/grafana/provisioning/datasources/datasources.yml"
|
||||
PROM_DOMAIN="prom.metric.argus.com:9090"
|
||||
|
||||
if [ -f "/tmp/datasources.yml" ] && [ ! -f "$DS_OUT" ]; then
|
||||
echo "[INFO] Initializing datasource provisioning file from /tmp"
|
||||
cp /tmp/datasources.yml "$DS_OUT"
|
||||
fi
|
||||
|
||||
# 统一将数据源 URL 规范为 prom.metric.argus.com:9090
|
||||
if [ -f "$DS_OUT" ]; then
|
||||
sed -i -E "s#^\s*url:\s*http://[^[:space:]]+# url: http://$PROM_DOMAIN#g" "$DS_OUT" || true
|
||||
echo "[INFO] Datasource URL normalized to http://$PROM_DOMAIN"
|
||||
elif [ -d "/etc/grafana/provisioning/datasources" ] && [ "$(ls -A /etc/grafana/provisioning/datasources)" ]; then
|
||||
echo "[INFO] Found datasource provisioning files in /etc/grafana/provisioning/datasources"
|
||||
# 确保数据源配置目录权限正确
|
||||
@ -65,11 +71,33 @@ if [ -f "/tmp/dashboards.yml" ]; then
|
||||
echo "[INFO] Dashboard configuration copied successfully"
|
||||
fi
|
||||
|
||||
# 复制默认仪表板到挂载目录
|
||||
if [ -f "/tmp/default_dashboard.json" ]; then
|
||||
echo "[INFO] Copying default dashboard to /private/argus/metric/grafana/provisioning/dashboards/"
|
||||
cp /tmp/default_dashboard.json /private/argus/metric/grafana/provisioning/dashboards/default_dashboard.json
|
||||
echo "[INFO] Default dashboard copied successfully"
|
||||
# 复制默认仪表板到挂载目录(按需,不覆盖已存在文件)
|
||||
# Install a dashboard JSON file into the Grafana provisioning directory
# without overwriting a copy that is already present there.
# Arguments:
#   $1 - path of the dashboard file to install
#   $2 - file name to use inside the destination directory
#   $3 - (optional) destination directory; defaults to
#        /private/argus/metric/grafana/provisioning/dashboards
# Outputs:  one [INFO] line on stdout when a file is installed or skipped
# Returns:  0 when the source is absent, the destination already exists, or
#           the copy succeeded; otherwise the status of the failing mkdir/cp
copy_dashboard_if_missing() {
    local src="$1"
    local dst_name="$2"
    local dst_dir="${3:-/private/argus/metric/grafana/provisioning/dashboards}"
    local dst="$dst_dir/$dst_name"

    # A missing source file is silently ignored.
    [ -f "$src" ] || return 0

    # Never overwrite a dashboard that already exists at the destination.
    if [ -f "$dst" ]; then
        echo "[INFO] Dashboard exists, skip: $dst_name"
        return 0
    fi

    echo "[INFO] Installing dashboard: $dst_name"
    # Create the directory first so the copy cannot fail merely because the
    # provisioning directory has not been created yet.
    mkdir -p -- "$dst_dir" || return
    cp -- "$src" "$dst"
}
|
||||
|
||||
copy_dashboard_if_missing "/tmp/default_dashboard.json" "default_dashboard.json"
|
||||
copy_dashboard_if_missing "/tmp/default_cluster_dashboard.json" "default_cluster_dashboard.json"
|
||||
copy_dashboard_if_missing "/tmp/default_dashboard_by_instance.json" "default_dashboard_by_instance.json"
|
||||
|
||||
# 规范面板中的数据源字段:将字符串 "prometheus" 替换为 null(使用默认数据源)
|
||||
DB_DIR="/private/argus/metric/grafana/provisioning/dashboards"
|
||||
if [ -d "$DB_DIR" ]; then
|
||||
for f in "$DB_DIR"/*.json; do
|
||||
[ -f "$f" ] || continue
|
||||
sed -i -E 's/"datasource"\s*:\s*"prometheus"/"datasource": null/g' "$f" || true
|
||||
done
|
||||
echo "[INFO] Normalized dashboard datasource to default (null)"
|
||||
fi
|
||||
|
||||
# 启动 Grafana
|
||||
|
||||
@ -11,13 +11,6 @@ RUN if [ "$USE_INTRANET" = "true" ]; then \
|
||||
echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
|
||||
echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \
|
||||
echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \
|
||||
else \
|
||||
echo "Configuring fast apt sources for external network..." && \
|
||||
find /etc/apt -name "sources.list*" -exec sed -i 's/archive.ubuntu.com/mirrors.aliyun.com/g' {} \; && \
|
||||
find /etc/apt -name "sources.list*" -exec sed -i 's/security.ubuntu.com/mirrors.aliyun.com/g' {} \; && \
|
||||
echo "deb http://mirrors.aliyun.com/ubuntu/ jammy main restricted universe multiverse" > /etc/apt/sources.list && \
|
||||
echo "deb http://mirrors.aliyun.com/ubuntu/ jammy-updates main restricted universe multiverse" >> /etc/apt/sources.list && \
|
||||
echo "deb http://mirrors.aliyun.com/ubuntu/ jammy-security main restricted universe multiverse" >> /etc/apt/sources.list; \
|
||||
fi
|
||||
|
||||
# 验证源配置并安装常用工具
|
||||
@ -61,10 +54,25 @@ RUN mkdir -p ${PROMETHEUS_BASE_PATH}/rules \
|
||||
&& ln -s ${PROMETHEUS_BASE_PATH} /prometheus
|
||||
|
||||
# 修改 Prometheus 用户 UID/GID 并授权
|
||||
RUN usermod -u ${ARGUS_BUILD_UID} nobody && \
|
||||
groupmod -g ${ARGUS_BUILD_GID} nogroup && \
|
||||
chown -h nobody:nogroup /prometheus && \
|
||||
chown -R nobody:nogroup ${PROMETHEUS_BASE_PATH} && \
|
||||
RUN set -eux; \
|
||||
existing_user=""; \
|
||||
if getent passwd "${ARGUS_BUILD_UID}" >/dev/null 2>&1; then \
|
||||
existing_user="$(getent passwd "${ARGUS_BUILD_UID}" | cut -d: -f1)"; \
|
||||
fi; \
|
||||
if [ -n "$existing_user" ] && [ "$existing_user" != "nobody" ]; then \
|
||||
userdel -r "$existing_user" || true; \
|
||||
fi; \
|
||||
existing_group=""; \
|
||||
if getent group "${ARGUS_BUILD_GID}" >/dev/null 2>&1; then \
|
||||
existing_group="$(getent group "${ARGUS_BUILD_GID}" | cut -d: -f1)"; \
|
||||
fi; \
|
||||
if [ -n "$existing_group" ] && [ "$existing_group" != "nogroup" ]; then \
|
||||
groupdel "$existing_group" || true; \
|
||||
fi; \
|
||||
usermod -u ${ARGUS_BUILD_UID} nobody; \
|
||||
groupmod -g ${ARGUS_BUILD_GID} nogroup; \
|
||||
chown -h nobody:nogroup /prometheus; \
|
||||
chown -R nobody:nogroup ${PROMETHEUS_BASE_PATH}; \
|
||||
chown -R nobody:nogroup /etc/prometheus
|
||||
|
||||
# supervisor 配置
|
||||
|
||||
@ -5,13 +5,6 @@ networks:
|
||||
|
||||
services:
|
||||
ftp:
|
||||
build:
|
||||
context: ../ftp/build
|
||||
dockerfile: Dockerfile
|
||||
args:
|
||||
ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133}
|
||||
ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015}
|
||||
USE_INTRANET: ${USE_INTRANET:-false}
|
||||
image: argus-metric-ftp:latest
|
||||
container_name: argus-ftp
|
||||
restart: unless-stopped
|
||||
@ -41,13 +34,6 @@ services:
|
||||
max-file: "3"
|
||||
|
||||
prometheus:
|
||||
build:
|
||||
context: ../prometheus/build
|
||||
dockerfile: Dockerfile
|
||||
args:
|
||||
ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133}
|
||||
ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015}
|
||||
USE_INTRANET: ${USE_INTRANET:-false}
|
||||
image: argus-metric-prometheus:latest
|
||||
container_name: argus-prometheus
|
||||
restart: unless-stopped
|
||||
@ -73,12 +59,6 @@ services:
|
||||
max-file: "3"
|
||||
|
||||
grafana:
|
||||
build:
|
||||
context: ../grafana/build
|
||||
dockerfile: Dockerfile
|
||||
args:
|
||||
ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133}
|
||||
ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015}
|
||||
image: argus-metric-grafana:latest
|
||||
container_name: argus-grafana
|
||||
restart: unless-stopped
|
||||
@ -109,9 +89,6 @@ services:
|
||||
max-file: "3"
|
||||
|
||||
test-node:
|
||||
build:
|
||||
context: ./client-test-node/build
|
||||
dockerfile: Dockerfile
|
||||
image: argus-metric-test-node:latest
|
||||
container_name: argus-metric-test-node
|
||||
hostname: test-metric-node-001
|
||||
@ -143,9 +120,6 @@ services:
|
||||
max-file: "3"
|
||||
|
||||
test-gpu-node:
|
||||
build:
|
||||
context: ./client-test-gpu-node/build
|
||||
dockerfile: Dockerfile
|
||||
image: argus-metric-test-gpu-node:latest
|
||||
container_name: argus-metric-test-gpu-node
|
||||
hostname: test-metric-gpu-node-001
|
||||
|
||||
@ -3,15 +3,8 @@ set -e
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||
|
||||
# 解析参数
|
||||
REBUILD_FLAG=""
|
||||
if [[ "$1" == "--rebuild" || "$1" == "-r" ]]; then
|
||||
REBUILD_FLAG="--rebuild"
|
||||
echo "[01] 启用强制重新构建模式"
|
||||
fi
|
||||
|
||||
echo "[01] 启动所有服务..."
|
||||
bash "$SCRIPT_DIR/common/start-all.sh" $REBUILD_FLAG
|
||||
bash "$SCRIPT_DIR/common/start-all.sh"
|
||||
|
||||
echo "[01] 等待服务就绪..."
|
||||
sleep 5
|
||||
|
||||
@ -1,6 +1,9 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
COMMON_DIR="$SCRIPT_DIR/common"
|
||||
|
||||
FTP_SERVER="${FTP_SERVER:-172.30.0.40}"
|
||||
FTP_USER="${FTP_USER:-ftpuser}"
|
||||
FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}"
|
||||
@ -8,26 +11,37 @@ FTP_PORT="${FTP_PORT:-21}"
|
||||
|
||||
FTP_HOST="${FTP_SERVER}"
|
||||
|
||||
echo "[03] 进入测试节点执行安装..."
|
||||
echo "[03] 使用 FTP 地址: ${FTP_HOST}:${FTP_PORT}"
|
||||
echo "[04] 检测GPU环境..."
|
||||
# 检测GPU环境
|
||||
if bash "$COMMON_DIR/check-gpu.sh"; then
|
||||
echo "[04] GPU环境可用,继续执行GPU节点安装"
|
||||
GPU_AVAILABLE=true
|
||||
else
|
||||
echo "[04] GPU环境不可用,跳过GPU节点安装"
|
||||
GPU_AVAILABLE=false
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "[04] 进入测试节点执行安装..."
|
||||
echo "[04] 使用 FTP 地址: ${FTP_HOST}:${FTP_PORT}"
|
||||
|
||||
docker exec argus-metric-test-gpu-node bash -c "
|
||||
set -e
|
||||
|
||||
if ! command -v curl &>/dev/null; then
|
||||
echo '[03] curl 未安装,正在安装...'
|
||||
echo '[04] curl 未安装,正在安装...'
|
||||
apt-get update && apt-get install -y curl
|
||||
fi
|
||||
|
||||
cd /tmp
|
||||
echo '[03] 下载 setup.sh...'
|
||||
echo '[04] 下载 setup.sh...'
|
||||
curl -u ${FTP_USER}:${FTP_PASSWORD} ftp://${FTP_HOST}:${FTP_PORT}/setup.sh -o setup.sh
|
||||
|
||||
echo '[03] 执行安装...'
|
||||
echo '[04] 执行安装...'
|
||||
chmod +x setup.sh
|
||||
bash setup.sh --server ${FTP_HOST} --user ${FTP_USER} --password '${FTP_PASSWORD}' --port ${FTP_PORT}
|
||||
|
||||
echo '[03] 安装完成'
|
||||
echo '[04] 安装完成'
|
||||
"
|
||||
|
||||
echo "[03] 完成"
|
||||
echo "[04] 完成"
|
||||
|
||||
59
src/metric/tests/scripts/common/check-gpu.sh
Executable file
59
src/metric/tests/scripts/common/check-gpu.sh
Executable file
@ -0,0 +1,59 @@
|
||||
#!/bin/bash
|
||||
|
||||
# GPU环境检测脚本
|
||||
# 检测系统是否有NVIDIA GPU硬件
|
||||
|
||||
set -e
|
||||
|
||||
# 检测函数
|
||||
# Probe the machine for an NVIDIA GPU.
# Three checks are tried in order and the first one that succeeds wins:
#   1. a path matching /dev/nvidia* exists
#   2. lspci is available and its output mentions "nvidia" (case-insensitive)
#   3. nvidia-smi is available and exits successfully
# Outputs:  progress and result messages on stdout
# Returns:  0 when any check succeeds, 1 otherwise
check_gpu_support() {
    echo "检测GPU环境..."

    # Method 1: device files. compgen -G evaluates the glob inside the shell
    # and fails when nothing matches. The previous `ls /dev/nvidia*` would
    # degrade to a bare `ls` (always successful) if this file were sourced
    # into a shell that has nullglob enabled.
    if compgen -G '/dev/nvidia*' >/dev/null; then
        echo "✓ 检测到NVIDIA GPU设备文件"
        return 0
    fi

    # Method 2: look for an NVIDIA entry in the lspci listing (Linux).
    if command -v lspci &>/dev/null; then
        if lspci | grep -i nvidia &>/dev/null; then
            echo "✓ 检测到NVIDIA GPU硬件"
            return 0
        fi
    fi

    # Method 3: nvidia-smi must be present and must run successfully.
    if command -v nvidia-smi &>/dev/null; then
        if nvidia-smi &>/dev/null; then
            echo "✓ 检测到NVIDIA GPU硬件"
            return 0
        fi
    fi

    echo "✗ 未检测到NVIDIA GPU硬件"
    return 1
}
|
||||
|
||||
# 主函数
|
||||
# Script entry point: print a banner, run check_gpu_support and terminate
# the process with its verdict.
# Exit status: 0 when a GPU environment was detected, 1 when it was not.
main() {
    local rule="=========================================="

    echo "$rule"
    echo " GPU环境检测"
    echo "$rule"
    echo ""

    # Handle the negative outcome first so the success path reads straight down.
    if ! check_gpu_support; then
        echo ""
        echo "结果: GPU环境不可用,将跳过GPU相关服务"
        exit 1
    fi

    echo ""
    echo "结果: GPU环境可用"
    exit 0
}
|
||||
|
||||
# 如果直接运行此脚本
|
||||
if [ "${BASH_SOURCE[0]}" = "${0}" ]; then
|
||||
main "$@"
|
||||
fi
|
||||
@ -1,7 +1,8 @@
|
||||
#!/bin/bash
|
||||
|
||||
# 一键启动脚本
|
||||
# 用于初始化目录、构建镜像并启动所有服务
|
||||
# 用于初始化目录并启动所有服务
|
||||
# 镜像构建已移至 build/build_images.sh
|
||||
|
||||
set -e
|
||||
|
||||
@ -9,12 +10,6 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
TEST_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
|
||||
cd "$TEST_DIR"
|
||||
|
||||
# 解析参数
|
||||
FORCE_REBUILD=false
|
||||
if [[ "$1" == "--rebuild" ]]; then
|
||||
FORCE_REBUILD=true
|
||||
fi
|
||||
|
||||
echo "=========================================="
|
||||
echo " Argus Metrics 一键启动脚本"
|
||||
echo "=========================================="
|
||||
@ -37,26 +32,6 @@ echo "使用: docker compose"
|
||||
echo "Compose 文件: $TEST_DIR/docker-compose.yml"
|
||||
echo ""
|
||||
|
||||
# 检查必要的构建目录
|
||||
echo "检查构建目录..."
|
||||
BUILD_DIRS=(
|
||||
"../ftp/build"
|
||||
"../prometheus/build"
|
||||
"../grafana/build"
|
||||
"client-test-node/build"
|
||||
"client-test-gpu-node/build"
|
||||
)
|
||||
|
||||
for dir in "${BUILD_DIRS[@]}"; do
|
||||
if [ ! -d "$dir" ]; then
|
||||
echo "错误: 构建目录不存在: $dir"
|
||||
echo "完整路径: $(cd "$(dirname "$dir")" 2>/dev/null && pwd)/$(basename "$dir")"
|
||||
exit 1
|
||||
else
|
||||
echo " ✓ 找到: $dir"
|
||||
fi
|
||||
done
|
||||
echo ""
|
||||
|
||||
# 检查并创建 .env 文件
|
||||
if [ ! -f .env ]; then
|
||||
@ -84,118 +59,65 @@ echo "1. 初始化目录结构..."
|
||||
bash "$SCRIPT_DIR/init-directories.sh"
|
||||
|
||||
echo ""
|
||||
echo "2. 准备 Docker 镜像..."
|
||||
|
||||
# 检查镜像是否存在
|
||||
IMAGE_CACHE_DIR="$TEST_DIR/images-cache"
|
||||
IMAGES=("argus-metric-ftp:latest" "argus-metric-prometheus:latest" "argus-metric-grafana:latest" "argus-metric-test-node:latest" "argus-metric-test-gpu-node:latest")
|
||||
all_images_exist=true
|
||||
|
||||
for image in "${IMAGES[@]}"; do
|
||||
if ! docker images --format "{{.Repository}}:{{.Tag}}" | grep -q "^${image}$"; then
|
||||
all_images_exist=false
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
if $FORCE_REBUILD; then
|
||||
echo "强制重新构建镜像(--rebuild 模式)..."
|
||||
cd "$TEST_DIR"
|
||||
docker compose build --no-cache
|
||||
echo "镜像重新构建完成"
|
||||
elif $all_images_exist; then
|
||||
echo "所有镜像已存在,跳过构建"
|
||||
echo "2. 检测GPU环境..."
|
||||
# 检测GPU环境
|
||||
if bash "$SCRIPT_DIR/check-gpu.sh"; then
|
||||
echo "GPU环境可用,将启动GPU节点"
|
||||
GPU_AVAILABLE=true
|
||||
else
|
||||
echo "检测到缺失镜像,尝试从缓存加载..."
|
||||
|
||||
# 尝试从缓存加载
|
||||
loaded_from_cache=false
|
||||
if [ -d "$IMAGE_CACHE_DIR" ]; then
|
||||
for image in "${IMAGES[@]}"; do
|
||||
if ! docker images --format "{{.Repository}}:{{.Tag}}" | grep -q "^${image}$"; then
|
||||
# 镜像不存在,尝试加载
|
||||
case "$image" in
|
||||
"argus-metric-ftp:latest")
|
||||
cache_file="${IMAGE_CACHE_DIR}/argus-ftp.tar"
|
||||
;;
|
||||
"argus-metric-prometheus:latest")
|
||||
cache_file="${IMAGE_CACHE_DIR}/argus-prometheus.tar"
|
||||
;;
|
||||
"argus-metric-grafana:latest")
|
||||
cache_file="${IMAGE_CACHE_DIR}/argus-grafana.tar"
|
||||
;;
|
||||
"argus-metric-test-node:latest")
|
||||
cache_file="${IMAGE_CACHE_DIR}/argus-test-node.tar"
|
||||
;;
|
||||
"argus-metric-test-gpu-node:latest")
|
||||
cache_file="${IMAGE_CACHE_DIR}/argus-test-gpu-node.tar"
|
||||
;;
|
||||
esac
|
||||
|
||||
if [ -f "$cache_file" ]; then
|
||||
echo " 从缓存加载: $image"
|
||||
docker load -i "$cache_file"
|
||||
loaded_from_cache=true
|
||||
fi
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
||||
# 检查加载后是否还有缺失的镜像
|
||||
need_build=false
|
||||
for image in "${IMAGES[@]}"; do
|
||||
if ! docker images --format "{{.Repository}}:{{.Tag}}" | grep -q "^${image}$"; then
|
||||
need_build=true
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
if $need_build; then
|
||||
echo ""
|
||||
echo "部分镜像缺失,开始构建..."
|
||||
echo "工作目录: $(pwd)"
|
||||
cd "$TEST_DIR"
|
||||
docker compose build --no-cache
|
||||
|
||||
# 询问是否保存镜像
|
||||
echo ""
|
||||
read -p "是否保存镜像到缓存以便下次快速启动? (Y/n): " -n 1 -r
|
||||
echo
|
||||
if [[ ! $REPLY =~ ^[Nn]$ ]]; then
|
||||
mkdir -p "$IMAGE_CACHE_DIR"
|
||||
echo "保存镜像到缓存..."
|
||||
for image in "${IMAGES[@]}"; do
|
||||
case "$image" in
|
||||
"argus-metric-ftp:latest")
|
||||
docker save -o "${IMAGE_CACHE_DIR}/argus-ftp.tar" "$image" && echo " 已保存: argus-ftp.tar"
|
||||
;;
|
||||
"argus-metric-prometheus:latest")
|
||||
docker save -o "${IMAGE_CACHE_DIR}/argus-prometheus.tar" "$image" && echo " 已保存: argus-prometheus.tar"
|
||||
;;
|
||||
"argus-metric-grafana:latest")
|
||||
docker save -o "${IMAGE_CACHE_DIR}/argus-grafana.tar" "$image" && echo " 已保存: argus-grafana.tar"
|
||||
;;
|
||||
"argus-metric-test-node:latest")
|
||||
docker save -o "${IMAGE_CACHE_DIR}/argus-test-node.tar" "$image" && echo " 已保存: argus-test-node.tar"
|
||||
;;
|
||||
"argus-metric-test-gpu-node:latest")
|
||||
docker save -o "${IMAGE_CACHE_DIR}/argus-test-gpu-node.tar" "$image" && echo " 已保存: argus-test-gpu-node.tar"
|
||||
;;
|
||||
esac
|
||||
done
|
||||
echo "镜像已保存到: $IMAGE_CACHE_DIR/"
|
||||
fi
|
||||
elif $loaded_from_cache; then
|
||||
echo ""
|
||||
echo "所有镜像已从缓存加载完成!"
|
||||
fi
|
||||
echo "GPU环境不可用,跳过GPU节点"
|
||||
GPU_AVAILABLE=false
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "3. 启动基础服务..."
|
||||
echo "3. 检查 Docker 镜像..."
|
||||
|
||||
# 检查必要的镜像是否存在
|
||||
BASE_IMAGES=("argus-metric-ftp:latest" "argus-metric-prometheus:latest" "argus-metric-grafana:latest" "argus-metric-test-node:latest")
|
||||
GPU_IMAGES=("argus-metric-test-gpu-node:latest")
|
||||
|
||||
# 先检查基础镜像
|
||||
missing_images=()
|
||||
for image in "${BASE_IMAGES[@]}"; do
|
||||
if ! docker images --format "{{.Repository}}:{{.Tag}}" | grep -q "^${image}$"; then
|
||||
missing_images+=("$image")
|
||||
fi
|
||||
done
|
||||
|
||||
# 检查GPU镜像(如果GPU环境可用)
|
||||
if [ "$GPU_AVAILABLE" = true ]; then
|
||||
for image in "${GPU_IMAGES[@]}"; do
|
||||
if ! docker images --format "{{.Repository}}:{{.Tag}}" | grep -q "^${image}$"; then
|
||||
missing_images+=("$image")
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
||||
if [ ${#missing_images[@]} -gt 0 ]; then
|
||||
echo "以下镜像缺失,请先运行 build/build_images.sh 构建镜像:"
|
||||
for image in "${missing_images[@]}"; do
|
||||
echo " • $image"
|
||||
done
|
||||
echo ""
|
||||
echo "构建命令:"
|
||||
echo " ./build/build_images.sh --metric"
|
||||
exit 1
|
||||
else
|
||||
echo "所有必要镜像已存在"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "4. 启动基础服务..."
|
||||
cd "$TEST_DIR"
|
||||
# 启动除GPU节点外的所有服务
|
||||
docker compose up -d ftp prometheus grafana test-node test-gpu-node
|
||||
|
||||
# 根据GPU环境决定启动的服务
|
||||
if [ "$GPU_AVAILABLE" = true ]; then
|
||||
echo "启动所有服务(包括GPU节点)..."
|
||||
docker compose up -d ftp prometheus grafana test-node test-gpu-node
|
||||
else
|
||||
echo "启动基础服务(跳过GPU节点)..."
|
||||
docker compose up -d ftp prometheus grafana test-node
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "4. 等待服务启动..."
|
||||
|
||||
36
src/sys/build/node/Dockerfile
Normal file
36
src/sys/build/node/Dockerfile
Normal file
@ -0,0 +1,36 @@
|
||||
FROM ubuntu:22.04
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive \
|
||||
TZ=Asia/Shanghai
|
||||
|
||||
ARG USE_INTRANET=false
|
||||
ARG ARGUS_BUILD_UID=2133
|
||||
ARG ARGUS_BUILD_GID=2015
|
||||
|
||||
ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
|
||||
ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
|
||||
|
||||
# Optional: switch to intranet apt mirrors during build
|
||||
RUN if [ "$USE_INTRANET" = "true" ]; then \
|
||||
echo "Configuring intranet apt sources..." && \
|
||||
cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
|
||||
echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
|
||||
echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \
|
||||
echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \
|
||||
fi
|
||||
|
||||
# Install base tools and all libs that Fluent Bit may require at runtime
|
||||
# so that start-fluent-bit.sh will NOT fallback to apt during container start.
|
||||
RUN set -eux; \
|
||||
apt-get update; \
|
||||
apt-get install -y --no-install-recommends \
|
||||
ca-certificates tzdata \
|
||||
procps iproute2 net-tools lsof \
|
||||
libpq5 libyaml-0-2 libsasl2-2 libldap-2.5-0; \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Keep root; compose provides entrypoint via bind mount
|
||||
USER root
|
||||
|
||||
CMD ["bash", "-lc", "sleep infinity"]
|
||||
|
||||
34
src/sys/build/test-gpu-node/Dockerfile
Normal file
34
src/sys/build/test-gpu-node/Dockerfile
Normal file
@ -0,0 +1,34 @@
|
||||
FROM nvidia/cuda:12.2.2-runtime-ubuntu22.04
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive \
|
||||
TZ=Asia/Shanghai \
|
||||
NVIDIA_VISIBLE_DEVICES=all \
|
||||
NVIDIA_DRIVER_CAPABILITIES=compute,utility
|
||||
|
||||
ARG USE_INTRANET=false
|
||||
ARG ARGUS_BUILD_UID=2133
|
||||
ARG ARGUS_BUILD_GID=2015
|
||||
|
||||
ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
|
||||
ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
|
||||
|
||||
# Optional intranet mirror for build-time apt
|
||||
RUN if [ "$USE_INTRANET" = "true" ]; then \
|
||||
echo "Configuring intranet apt sources..." && \
|
||||
cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
|
||||
echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
|
||||
echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \
|
||||
echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \
|
||||
fi
|
||||
|
||||
# Pre-install curl and diagnostics to avoid runtime apt installs in GPU test node
|
||||
RUN set -eux; \
|
||||
apt-get update; \
|
||||
apt-get install -y --no-install-recommends \
|
||||
curl ca-certificates tzdata \
|
||||
procps iproute2 net-tools lsof; \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
USER root
|
||||
CMD ["bash", "-lc", "sleep infinity"]
|
||||
|
||||
32
src/sys/build/test-node/Dockerfile
Normal file
32
src/sys/build/test-node/Dockerfile
Normal file
@ -0,0 +1,32 @@
|
||||
FROM ubuntu:22.04
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive \
|
||||
TZ=Asia/Shanghai
|
||||
|
||||
ARG USE_INTRANET=false
|
||||
ARG ARGUS_BUILD_UID=2133
|
||||
ARG ARGUS_BUILD_GID=2015
|
||||
|
||||
ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
|
||||
ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
|
||||
|
||||
# Optional intranet mirror for build-time apt
|
||||
RUN if [ "$USE_INTRANET" = "true" ]; then \
|
||||
echo "Configuring intranet apt sources..." && \
|
||||
cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
|
||||
echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
|
||||
echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \
|
||||
echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \
|
||||
fi
|
||||
|
||||
# Pre-install curl and common diagnostics to avoid runtime apt installs
|
||||
RUN set -eux; \
|
||||
apt-get update; \
|
||||
apt-get install -y --no-install-recommends \
|
||||
curl ca-certificates tzdata \
|
||||
procps iproute2 net-tools lsof; \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
USER root
|
||||
CMD ["bash", "-lc", "sleep infinity"]
|
||||
|
||||
@ -32,7 +32,7 @@
|
||||
|
||||
- 一键执行
|
||||
- `cd src/sys/tests`
|
||||
- `./scripts/00_e2e_test.sh`
|
||||
- `./scripts/00_e2e_test.sh`(CPU-only)或 `./scripts/00_e2e_test.sh --enable-gpu`(启用 GPU 流程)
|
||||
|
||||
- 分步执行(推荐用于排查)
|
||||
- `./scripts/01_bootstrap.sh` 生成目录/拷贝 `update-dns.sh`/构建 agent 二进制/写 `.env`
|
||||
@ -42,7 +42,12 @@
|
||||
- `./scripts/05_agent_register.sh` 获取两个节点的 `node_id` 与初始 IP,检查本地 `node.json`
|
||||
- `./scripts/06_write_health_and_assert.sh` 写健康文件并断言 `nodes.json` 仅包含 2 个在线节点
|
||||
- `./scripts/07_logs_send_and_assert.sh` 向两个节点写日志,断言 ES `train-*`/`infer-*` 计数增长
|
||||
- `./scripts/08_restart_agent_reregister.sh` `node-b` 改为固定 IP `172.29.0.200`,验证保持同一节点 ID 且 IP/时间戳更新
|
||||
- `./scripts/08_restart_agent_reregister.sh` `node-b` 改为固定 IP `172.31.0.200`,验证保持同一节点 ID 且 IP/时间戳更新
|
||||
- `./scripts/10_metric_publish.sh` 发布 metric 客户端包到 FTP
|
||||
- `./scripts/11_metric_node_install.sh` 在 CPU 节点安装并验证端点
|
||||
- `./scripts/12_metric_gpu_install.sh` 在 GPU 节点安装并等待 9100/9400 就绪(仅启用 GPU 时)
|
||||
- `./scripts/13_metric_verify.sh` 对 master/Prometheus/数据面/Grafana 做综合校验(含 GPU 时校验 dcgm 指标)
|
||||
- `./scripts/14_metric_cleanup.sh` 清理 FTP 产物
|
||||
- `./scripts/09_down.sh` 回收容器、网络并清理 `private*/`、`tmp/`
|
||||
|
||||
- 重置环境
|
||||
@ -53,16 +58,17 @@
|
||||
## 二、测试部署架构(docker-compose)
|
||||
|
||||
- 网络
|
||||
- 自定义 bridge:`argus-sys-net`,子网 `172.29.0.0/16`
|
||||
- 固定地址:bind=`172.29.0.2`,master=`172.29.0.10`
|
||||
- 自定义 bridge:`argus-sys-net`,子网 `172.31.0.0/16`
|
||||
- 固定地址:bind=`172.31.0.2`,master=`172.31.0.10`
|
||||
|
||||
- 服务与端口
|
||||
- 服务与端口(宿主机映射端口由 `01_bootstrap.sh` 自动分配并写入 `.env`)
|
||||
- 关键变量:`MASTER_PORT`、`ES_HTTP_PORT`、`KIBANA_PORT`、`NODE_A_PORT`、`NODE_B_PORT`、`PROMETHEUS_PORT`、`GRAFANA_PORT`、`ALERTMANAGER_PORT`、`WEB_PROXY_PORT_8080..8085`、`FTP_PORT`、`FTP_DATA_PORT`、`FTP_PASSIVE_HOST_RANGE`
|
||||
- `bind`(`argus-bind9:latest`):监听 53/tcp+udp;负责同步 `*.argus.com` 记录
|
||||
- `master`(`argus-master:latest`):对外 `32300→3000`;API `http://localhost:32300`
|
||||
- `es`(`argus-elasticsearch:latest`):`9200→9200`;单节点,无安全
|
||||
- `kibana`(`argus-kibana:latest`):`5601→5601`;通过 `ELASTICSEARCH_HOSTS=http://es:9200` 访问 ES
|
||||
- `node-a`(`ubuntu:22.04`):同时运行 Fluent Bit + argus-agent,`hostname=dev-yyrshare-nbnyx10-cp2f-pod-0`,`2020→2020`
|
||||
- `node-b`(`ubuntu:22.04`):同时运行 Fluent Bit + argus-agent,`hostname=dev-yyrshare-uuuu10-ep2f-pod-0`,`2021→2020`
|
||||
- `master`(`argus-master:latest`):对外 `${MASTER_PORT}→3000`;API `http://localhost:${MASTER_PORT}`
|
||||
- `es`(`argus-elasticsearch:latest`):`${ES_HTTP_PORT}→9200`;单节点,无安全
|
||||
- `kibana`(`argus-kibana:latest`):`${KIBANA_PORT}→5601`;通过 `ELASTICSEARCH_HOSTS=http://es:9200` 访问 ES
|
||||
- `node-a`(`ubuntu:22.04`):同时运行 Fluent Bit + argus-agent,`hostname=dev-yyrshare-nbnyx10-cp2f-pod-0`,`${NODE_A_PORT}→2020`
|
||||
- `node-b`(`ubuntu:22.04`):同时运行 Fluent Bit + argus-agent,`hostname=dev-yyrshare-uuuu10-ep2f-pod-0`,`${NODE_B_PORT}→2020`
|
||||
|
||||
- 卷与目录
|
||||
- 核心服务(bind/master/es/kibana)共享宿主 `./private` 挂载到容器 `/private`
|
||||
@ -72,7 +78,7 @@
|
||||
- 节点容器的 Fluent Bit/agent 资产以只读方式挂载到 `/assets`/`/usr/local/bin/argus-agent`
|
||||
|
||||
- DNS 配置
|
||||
- 节点容器通过 compose 配置 `dns: [172.29.0.2]` 指向 bind,不挂载 `/etc/resolv.conf`,也不依赖 `update-dns.sh`
|
||||
- 节点容器通过 compose 配置 `dns: [172.31.0.2]` 指向 bind,不挂载 `/etc/resolv.conf`,也不依赖 `update-dns.sh`
|
||||
- master/es/kibana 仍共享 `./private`,master 启动会写 `/private/argus/etc/master.argus.com` 供 bind 同步 A 记录
|
||||
|
||||
- 节点入口
|
||||
@ -106,6 +112,7 @@
|
||||
- 判定:
|
||||
- `private/argus/etc/master.argus.com` 存在且为 master IP
|
||||
- 在 node-a/node-b 内 `getent hosts master.argus.com` 成功解析到 master IP
|
||||
- 在 metric CPU/GPU 节点内可解析 `master.argus.com` 与 `prom.metric.argus.com`
|
||||
|
||||
- `05_agent_register.sh`
|
||||
- 目的:确认两个节点注册到 master 并持久化 `node.json`
|
||||
@ -136,3 +143,16 @@
|
||||
---
|
||||
|
||||
如需更严格的断言(例如 Kibana 载入具体插件、ES 文档字段校验),可在 `07_*.sh` 中追加查询与校验逻辑。
|
||||
|
||||
---
|
||||
|
||||
## 可选:GPU 流程说明
|
||||
- 前置条件:宿主安装 NVIDIA 驱动与 `nvidia-container-toolkit`,`nvidia-smi` 在宿主可用。
|
||||
- 启用方式:
|
||||
- 一键:`./scripts/00_e2e_test.sh --enable-gpu`
|
||||
- 分步:先设置环境变量 `ARGUS_SYS_ENABLE_GPU=true`,再执行 `01_bootstrap.sh`、`02_up.sh`;或直接在 `.env` 中将 `ENABLE_GPU` 设为 `true`,然后单独运行 `02_up.sh`。
|
||||
- `01_bootstrap.sh` 会写入:
|
||||
- `METRIC_TEST_HOSTNAME_GPU=test-metric-gpu-node-001`
|
||||
- `METRIC_TEST_INSTANCE_GPU=172.31.0.51:9100`
|
||||
- `METRIC_TEST_DCGM_GPU=172.31.0.51:9400`
|
||||
- 验证点:`04_verify_dns_routing.sh` 增加对 metric 节点的域名解析;`12_metric_gpu_install.sh` 等待 9100/9400;`13_metric_verify_*` 校验 dcgm 指标与 Grafana 面板。
|
||||
|
||||
@ -1,21 +1,18 @@
|
||||
version: "3.8"
|
||||
|
||||
networks:
|
||||
default:
|
||||
name: argus-sys-net
|
||||
sysnet:
|
||||
driver: bridge
|
||||
ipam:
|
||||
driver: default
|
||||
config:
|
||||
- subnet: 172.29.0.0/16
|
||||
- subnet: 172.31.0.0/16
|
||||
|
||||
services:
|
||||
bind:
|
||||
image: ${BIND_IMAGE_TAG:-argus-bind9:latest}
|
||||
container_name: argus-bind-sys
|
||||
networks:
|
||||
default:
|
||||
ipv4_address: 172.29.0.2
|
||||
sysnet:
|
||||
ipv4_address: 172.31.0.2
|
||||
volumes:
|
||||
- ./private:/private
|
||||
restart: unless-stopped
|
||||
@ -32,14 +29,14 @@ services:
|
||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
||||
ports:
|
||||
- "32300:3000"
|
||||
- "${MASTER_PORT:-32300}:3000"
|
||||
volumes:
|
||||
- ./private/argus/master:/private/argus/master
|
||||
- ./private/argus/metric/prometheus:/private/argus/metric/prometheus
|
||||
- ./private/argus/etc:/private/argus/etc
|
||||
networks:
|
||||
default:
|
||||
ipv4_address: 172.29.0.10
|
||||
sysnet:
|
||||
ipv4_address: 172.31.0.10
|
||||
restart: unless-stopped
|
||||
|
||||
es:
|
||||
@ -55,8 +52,11 @@ services:
|
||||
- ./private/argus/log/elasticsearch:/private/argus/log/elasticsearch
|
||||
- ./private/argus/etc:/private/argus/etc
|
||||
ports:
|
||||
- "9200:9200"
|
||||
- "${ES_HTTP_PORT:-9200}:9200"
|
||||
restart: unless-stopped
|
||||
networks:
|
||||
sysnet:
|
||||
ipv4_address: 172.31.0.3
|
||||
|
||||
kibana:
|
||||
image: argus-kibana:latest
|
||||
@ -71,11 +71,14 @@ services:
|
||||
depends_on:
|
||||
- es
|
||||
ports:
|
||||
- "5601:5601"
|
||||
- "${KIBANA_PORT:-5601}:5601"
|
||||
restart: unless-stopped
|
||||
networks:
|
||||
sysnet:
|
||||
ipv4_address: 172.31.0.4
|
||||
|
||||
node-a:
|
||||
image: ubuntu:22.04
|
||||
image: argus-sys-node:latest
|
||||
container_name: argus-node-a
|
||||
hostname: dev-yyrshare-nbnyx10-cp2f-pod-0
|
||||
depends_on:
|
||||
@ -101,13 +104,16 @@ services:
|
||||
entrypoint:
|
||||
- /usr/local/bin/node-entrypoint.sh
|
||||
dns:
|
||||
- 172.29.0.2
|
||||
- 172.31.0.2 # internal bind for *.argus.com
|
||||
- 8.8.8.8 # external fallback for apt/external domains
|
||||
ports:
|
||||
- "2020:2020"
|
||||
- "${NODE_A_PORT:-2020}:2020"
|
||||
restart: unless-stopped
|
||||
networks:
|
||||
- sysnet
|
||||
|
||||
node-b:
|
||||
image: ubuntu:22.04
|
||||
image: argus-sys-node:latest
|
||||
container_name: argus-node-b
|
||||
hostname: dev-yyrshare-uuuu10-ep2f-pod-0
|
||||
depends_on:
|
||||
@ -133,7 +139,269 @@ services:
|
||||
entrypoint:
|
||||
- /usr/local/bin/node-entrypoint.sh
|
||||
dns:
|
||||
- 172.29.0.2
|
||||
- 172.31.0.2
|
||||
- 8.8.8.8
|
||||
ports:
|
||||
- "2021:2020"
|
||||
- "${NODE_B_PORT:-2021}:2020"
|
||||
restart: unless-stopped
|
||||
networks:
|
||||
- sysnet
|
||||
|
||||
ftp:
|
||||
image: argus-metric-ftp:latest
|
||||
container_name: argus-ftp
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- TZ=Asia/Shanghai
|
||||
- FTP_BASE_PATH=/private/argus/ftp
|
||||
- FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!}
|
||||
- DOMAIN=${FTP_DOMAIN:-ftp.metric.argus.com}
|
||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
||||
ports:
|
||||
- "${FTP_PORT:-21}:21"
|
||||
- "${FTP_DATA_PORT:-20}:20"
|
||||
- "${FTP_PASSIVE_HOST_RANGE:-21100-21110}:21100-21110"
|
||||
volumes:
|
||||
- ./private/argus/metric/ftp:/private/argus/ftp
|
||||
- ./private/argus/etc:/private/argus/etc
|
||||
- /etc/localtime:/etc/localtime:ro
|
||||
- /etc/timezone:/etc/timezone:ro
|
||||
networks:
|
||||
sysnet:
|
||||
ipv4_address: 172.31.0.40
|
||||
logging:
|
||||
driver: "json-file"
|
||||
options:
|
||||
max-size: "10m"
|
||||
max-file: "3"
|
||||
|
||||
prometheus:
|
||||
image: argus-metric-prometheus:latest
|
||||
container_name: argus-prometheus
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- TZ=Asia/Shanghai
|
||||
- PROMETHEUS_BASE_PATH=/private/argus/metric/prometheus
|
||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
||||
ports:
|
||||
- "${PROMETHEUS_PORT:-9090}:9090"
|
||||
volumes:
|
||||
- ./private/argus/metric/prometheus:/private/argus/metric/prometheus
|
||||
- ./private/argus/etc:/private/argus/etc
|
||||
- /etc/localtime:/etc/localtime:ro
|
||||
- /etc/timezone:/etc/timezone:ro
|
||||
networks:
|
||||
sysnet:
|
||||
ipv4_address: 172.31.0.41
|
||||
logging:
|
||||
driver: "json-file"
|
||||
options:
|
||||
max-size: "10m"
|
||||
max-file: "3"
|
||||
|
||||
grafana:
|
||||
image: argus-metric-grafana:latest
|
||||
container_name: argus-grafana
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- TZ=Asia/Shanghai
|
||||
- GRAFANA_BASE_PATH=/private/argus/metric/grafana
|
||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
||||
- GF_SERVER_HTTP_PORT=3000
|
||||
- GF_LOG_LEVEL=warn
|
||||
- GF_LOG_MODE=console
|
||||
- GF_PATHS_PROVISIONING=/private/argus/metric/grafana/provisioning
|
||||
- GF_AUTH_ANONYMOUS_ENABLED=true
|
||||
- GF_AUTH_ANONYMOUS_ORG_ROLE=Viewer
|
||||
ports:
|
||||
- "${GRAFANA_PORT:-3000}:3000"
|
||||
volumes:
|
||||
- ./private/argus/metric/grafana:/private/argus/metric/grafana
|
||||
- ./private/argus/etc:/private/argus/etc
|
||||
- /etc/localtime:/etc/localtime:ro
|
||||
- /etc/timezone:/etc/timezone:ro
|
||||
networks:
|
||||
sysnet:
|
||||
ipv4_address: 172.31.0.42
|
||||
depends_on:
|
||||
- prometheus
|
||||
logging:
|
||||
driver: "json-file"
|
||||
options:
|
||||
max-size: "10m"
|
||||
max-file: "3"
|
||||
|
||||
# --- Added: Web Frontend (no host port; resolved by DNS as web.argus.com) ---
|
||||
web-frontend:
|
||||
image: argus-web-frontend:latest
|
||||
container_name: argus-web-frontend
|
||||
environment:
|
||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
||||
# Frontend runtime-injected external ports (used to render hyperlinks)
|
||||
- EXTERNAL_MASTER_PORT=${WEB_PROXY_PORT_8085:-8085}
|
||||
- EXTERNAL_ALERTMANAGER_PORT=${WEB_PROXY_PORT_8084:-8084}
|
||||
- EXTERNAL_GRAFANA_PORT=${WEB_PROXY_PORT_8081:-8081}
|
||||
- EXTERNAL_PROMETHEUS_PORT=${WEB_PROXY_PORT_8082:-8082}
|
||||
- EXTERNAL_KIBANA_PORT=${WEB_PROXY_PORT_8083:-8083}
|
||||
volumes:
|
||||
- ./private/argus/etc:/private/argus/etc
|
||||
networks:
|
||||
sysnet:
|
||||
ipv4_address: 172.31.0.80
|
||||
restart: unless-stopped
|
||||
logging:
|
||||
driver: "json-file"
|
||||
options:
|
||||
max-size: "10m"
|
||||
max-file: "3"
|
||||
|
||||
test-node:
|
||||
image: argus-sys-metric-test-node:latest
|
||||
container_name: argus-metric-test-node
|
||||
hostname: test-metric-node-001
|
||||
restart: unless-stopped
|
||||
privileged: true
|
||||
depends_on:
|
||||
- ftp
|
||||
- prometheus
|
||||
environment:
|
||||
- TZ=Asia/Shanghai
|
||||
- DEBIAN_FRONTEND=noninteractive
|
||||
- FTP_DOMAIN=${FTP_DOMAIN:-ftp.metric.argus.com}
|
||||
- FTP_SERVER=${FTP_SERVER:-172.31.0.40}
|
||||
- FTP_USER=${FTP_USER:-ftpuser}
|
||||
- FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!}
|
||||
- FTP_PORT=${FTP_PORT:-21}
|
||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
||||
- METRIC_NODE_ROLE=cpu
|
||||
volumes:
|
||||
- ./private/argus/agent:/private/argus/agent
|
||||
- ./scripts/metric/test-node-entrypoint.sh:/usr/local/bin/metric-test-node-entrypoint.sh:ro
|
||||
- /etc/localtime:/etc/localtime:ro
|
||||
- /etc/timezone:/etc/timezone:ro
|
||||
entrypoint:
|
||||
- /usr/local/bin/metric-test-node-entrypoint.sh
|
||||
command:
|
||||
- sleep
|
||||
- infinity
|
||||
dns:
|
||||
- 172.31.0.2
|
||||
- 8.8.8.8
|
||||
networks:
|
||||
sysnet:
|
||||
ipv4_address: 172.31.0.50
|
||||
logging:
|
||||
driver: "json-file"
|
||||
options:
|
||||
max-size: "10m"
|
||||
max-file: "3"
|
||||
|
||||
test-gpu-node:
|
||||
profiles: ["gpu"]
|
||||
image: argus-sys-metric-test-gpu-node:latest
|
||||
container_name: argus-metric-test-gpu-node
|
||||
hostname: test-metric-gpu-node-001
|
||||
restart: unless-stopped
|
||||
privileged: true
|
||||
runtime: nvidia
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
count: all
|
||||
capabilities:
|
||||
- gpu
|
||||
depends_on:
|
||||
- ftp
|
||||
- prometheus
|
||||
environment:
|
||||
- TZ=Asia/Shanghai
|
||||
- DEBIAN_FRONTEND=noninteractive
|
||||
- NVIDIA_VISIBLE_DEVICES=all
|
||||
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
|
||||
- GPU_MODE=gpu
|
||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
||||
- METRIC_NODE_ROLE=gpu
|
||||
volumes:
|
||||
- ./private/argus/agent:/private/argus/agent
|
||||
- ./scripts/metric/test-node-entrypoint.sh:/usr/local/bin/metric-test-node-entrypoint.sh:ro
|
||||
- /etc/localtime:/etc/localtime:ro
|
||||
- /etc/timezone:/etc/timezone:ro
|
||||
entrypoint:
|
||||
- /usr/local/bin/metric-test-node-entrypoint.sh
|
||||
command:
|
||||
- sleep
|
||||
- infinity
|
||||
dns:
|
||||
- 172.31.0.2
|
||||
- 8.8.8.8
|
||||
networks:
|
||||
sysnet:
|
||||
ipv4_address: 172.31.0.51
|
||||
logging:
|
||||
driver: "json-file"
|
||||
options:
|
||||
max-size: "10m"
|
||||
max-file: "3"
|
||||
|
||||
# --- Added: Alertmanager ---
|
||||
alertmanager:
|
||||
image: argus-alertmanager:latest
|
||||
container_name: argus-alertmanager
|
||||
environment:
|
||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
||||
volumes:
|
||||
- ./private/argus/etc:/private/argus/etc
|
||||
- ./private/argus/alert/alertmanager:/private/argus/alert/alertmanager
|
||||
networks:
|
||||
sysnet:
|
||||
ipv4_address: 172.31.0.82
|
||||
ports:
|
||||
- "${ALERTMANAGER_PORT:-9093}:9093"
|
||||
restart: unless-stopped
|
||||
logging:
|
||||
driver: "json-file"
|
||||
options:
|
||||
max-size: "10m"
|
||||
max-file: "3"
|
||||
|
||||
# --- Added: Web Proxy (multi-port gateway) ---
|
||||
web-proxy:
|
||||
image: argus-web-proxy:latest
|
||||
container_name: argus-web-proxy
|
||||
depends_on:
|
||||
- bind
|
||||
- master
|
||||
- grafana
|
||||
- prometheus
|
||||
- kibana
|
||||
- alertmanager
|
||||
environment:
|
||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
||||
volumes:
|
||||
- ./private/argus/etc:/private/argus/etc
|
||||
networks:
|
||||
sysnet:
|
||||
ipv4_address: 172.31.0.81
|
||||
ports:
|
||||
- "${WEB_PROXY_PORT_8080:-8080}:8080"
|
||||
- "${WEB_PROXY_PORT_8081:-8081}:8081"
|
||||
- "${WEB_PROXY_PORT_8082:-8082}:8082"
|
||||
- "${WEB_PROXY_PORT_8083:-8083}:8083"
|
||||
- "${WEB_PROXY_PORT_8084:-8084}:8084"
|
||||
- "${WEB_PROXY_PORT_8085:-8085}:8085"
|
||||
restart: unless-stopped
|
||||
logging:
|
||||
driver: "json-file"
|
||||
options:
|
||||
max-size: "10m"
|
||||
max-file: "3"
|
||||
|
||||
@ -3,6 +3,45 @@ set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
|
||||
ENABLE_GPU=false
|
||||
CLEANUP=true
|
||||
|
||||
usage() {
|
||||
cat <<'EOF'
|
||||
Usage: 00_e2e_test.sh [options]
|
||||
|
||||
Options:
|
||||
--enable-gpu 启用 GPU 相关拓扑与测试流程
|
||||
--no-clean 跳过清理流程(不执行 14 和 09)
|
||||
-h, --help 显示帮助信息
|
||||
EOF
|
||||
}
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case "$1" in
|
||||
--enable-gpu)
|
||||
ENABLE_GPU=true
|
||||
shift
|
||||
;;
|
||||
--no-clean)
|
||||
CLEANUP=false
|
||||
shift
|
||||
;;
|
||||
-h|--help)
|
||||
usage
|
||||
exit 0
|
||||
;;
|
||||
*)
|
||||
echo "Unknown argument: $1" >&2
|
||||
usage
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
export ARGUS_SYS_ENABLE_GPU=$ENABLE_GPU
|
||||
|
||||
# 基础步骤(不包含清理与下线)
|
||||
SCRIPTS=(
|
||||
"01_bootstrap.sh"
|
||||
"02_up.sh"
|
||||
@ -12,9 +51,20 @@ SCRIPTS=(
|
||||
"06_write_health_and_assert.sh"
|
||||
"07_logs_send_and_assert.sh"
|
||||
"08_restart_agent_reregister.sh"
|
||||
"09_down.sh"
|
||||
"10_metric_publish.sh"
|
||||
"11_metric_node_install.sh"
|
||||
"12_metric_gpu_install.sh"
|
||||
"13_metric_verify.sh"
|
||||
)
|
||||
|
||||
# 如未禁用清理,则追加清理与下线步骤(保持原有顺序)
|
||||
if [[ "$CLEANUP" == "true" ]]; then
|
||||
SCRIPTS+=(
|
||||
"14_metric_cleanup.sh"
|
||||
"09_down.sh"
|
||||
)
|
||||
fi
|
||||
|
||||
for script in "${SCRIPTS[@]}"; do
|
||||
echo "[SYS-E2E] Running $script"
|
||||
"$SCRIPT_DIR/$script"
|
||||
@ -22,5 +72,8 @@ for script in "${SCRIPTS[@]}"; do
|
||||
echo
|
||||
done
|
||||
|
||||
echo "[SYS-E2E] All tests completed"
|
||||
|
||||
if [[ "$CLEANUP" == "true" ]]; then
|
||||
echo "[SYS-E2E] All tests completed"
|
||||
else
|
||||
echo "[SYS-E2E] All tests completed (cleanup skipped)"
|
||||
fi
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user