diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..62c8935 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.idea/ \ No newline at end of file diff --git a/src/metric/.gitignore b/src/metric/.gitignore new file mode 100644 index 0000000..43f5e6d --- /dev/null +++ b/src/metric/.gitignore @@ -0,0 +1,7 @@ +/prometheus/data/ +/client-plugins/dcgm-exporter-installer/ +/client-plugins/demo-all-in-one/artifact/ +/client-plugins/demo-all-in-one/publish/ +/client-plugins/demo-all-in-one/checklist +/client-plugins/demo-all-in-one/VERSION +/client-plugins/all-in-one-full/ diff --git a/src/metric/client-plugins/all-in-one-demo/README.md b/src/metric/client-plugins/all-in-one-demo/README.md new file mode 100644 index 0000000..68640cf --- /dev/null +++ b/src/metric/client-plugins/all-in-one-demo/README.md @@ -0,0 +1,65 @@ +# 客户侧组件安装包构建、发布流程 + +## 第一步:配置版本和组件 + +首先搞定配置文件: + +1. 把 `.checklist.example` 重命名成 `checklist` +2. 把 `.VERSION.example` 重命名成 `VERSION` + +### checklist 文件格式 +``` +# 组件名称 目录路径 版本号 [依赖组件] [安装顺序] +dcgm-exporter-installer /path/to/dcgm-exporter-installer 1.1.0 +node-exporter-installer /path/to/node-exporter-installer 1.1.0 +``` + +### VERSION 文件 +设置需要发布的版本号,比如 `1.29.0` + +> 建议用 `version-manager.sh` 来管理版本 + +## 第二步:构建安装包 + +直接跑脚本: +```bash +./package_artifact.sh +``` + +构建完的东西会放在 `artifact/` 目录下,按版本分文件夹。 + +如果版本已经存在了,想要覆盖重新构建: +```bash +./package_artifact.sh --force +``` + +构建完可以手工测试安装包。 + +## 第三步:发布安装包 + +用这个脚本发布: +```bash +./publish_artifact.sh +``` + +发布后的内容在 `publish/` 目录里,包含: +- 压缩版本的安装包 +- 一键安装的bash脚本 + +## 第四步:部署到FTP服务器 + +把发布的内容上传到FTP服务器,客户端就可以通过一键命令安装: + +```bash +curl -fsSL 'ftp://{$USER}:{$PASSWD}@{$your-ftp-server}/setup.sh' -o setup.sh + +# root用户直接执行,非root用户需要使用sudo +chmod +x setup.sh +bash setup.sh --server {$your-ftp-server} --user {$USER} --password {$PASSWD} + +示例: +curl -fsS 'ftp://ftpuser:ZGClab1234!@177.177.70.200/setup.sh' -o setup.sh +chmod +x setup.sh +bash setup.sh --server {$域名} --user ftpuser --password 'ZGClab1234!' 
+ +``` \ No newline at end of file diff --git a/src/metric/client-plugins/all-in-one-demo/config/.VERSION.example b/src/metric/client-plugins/all-in-one-demo/config/.VERSION.example new file mode 100644 index 0000000..5e57fb8 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-demo/config/.VERSION.example @@ -0,0 +1 @@ +1.29.0 diff --git a/src/metric/client-plugins/all-in-one-demo/config/.checklist.example b/src/metric/client-plugins/all-in-one-demo/config/.checklist.example new file mode 100644 index 0000000..89cf322 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-demo/config/.checklist.example @@ -0,0 +1,3 @@ +# 组件名称 目录路径 版本号 [依赖组件] [安装顺序] +dcgm-exporter-installer /Users/sundapeng/Project/nlp/aiops/client-plugins/dcgm-exporter-installer 1.1.0 +node-exporter-installer /Users/sundapeng/Project/nlp/aiops/client-plugins/node-exporter-installer 1.1.0 diff --git a/src/metric/client-plugins/all-in-one-demo/config/.config.env.example b/src/metric/client-plugins/all-in-one-demo/config/.config.env.example new file mode 100644 index 0000000..8871dfe --- /dev/null +++ b/src/metric/client-plugins/all-in-one-demo/config/.config.env.example @@ -0,0 +1,8 @@ +# Argus Metric 配置文件示例 +# 复制此文件为 config.env 并根据需要修改配置 + +# 连接master服务 +MASTER_ENDPOINT=master.argus.com:3000 + +# 上报状态间隔描述(秒) +REPORT_INTERVAL_SECONDS=60 diff --git a/src/metric/client-plugins/all-in-one-demo/config/config.env b/src/metric/client-plugins/all-in-one-demo/config/config.env new file mode 100644 index 0000000..0a70059 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-demo/config/config.env @@ -0,0 +1,3 @@ +# Elasticsearch +ES_HOST=es.log.argus.com +ES_PORT=9200 diff --git a/src/metric/client-plugins/all-in-one-demo/config/dns.conf.example b/src/metric/client-plugins/all-in-one-demo/config/dns.conf.example new file mode 100644 index 0000000..73b77bb --- /dev/null +++ b/src/metric/client-plugins/all-in-one-demo/config/dns.conf.example @@ -0,0 +1 @@ +177.177.17.106 diff --git a/src/metric/client-plugins/all-in-one-demo/deps/cron-offline.tar.gz b/src/metric/client-plugins/all-in-one-demo/deps/cron-offline.tar.gz new file mode 100644 index 0000000..77104f7 Binary files /dev/null and b/src/metric/client-plugins/all-in-one-demo/deps/cron-offline.tar.gz differ diff --git a/src/metric/client-plugins/all-in-one-demo/plugins/node-exporter/bin/node_exporter b/src/metric/client-plugins/all-in-one-demo/plugins/node-exporter/bin/node_exporter new file mode 100755 index 0000000..66c3e4a Binary files /dev/null and b/src/metric/client-plugins/all-in-one-demo/plugins/node-exporter/bin/node_exporter differ diff --git a/src/metric/client-plugins/all-in-one-demo/plugins/node-exporter/check_health.sh b/src/metric/client-plugins/all-in-one-demo/plugins/node-exporter/check_health.sh new file mode 100755 index 0000000..ed168e3 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-demo/plugins/node-exporter/check_health.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +# Node Exporter 健康检查脚本 +# 输出 JSON 格式结果 + +set -e + +# 检查 Node Exporter 健康状态 +check_health() { + local url="http://localhost:9100" + local metrics_url="$url/metrics" + local name="node-exporter" + local status="unhealth" + local reason="" + + # 检查 curl 是否可用 + if ! 
command -v curl &> /dev/null; then + reason="curl 命令不可用,无法进行健康检查" + echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}" + exit 1 + fi + + # 测试根路径连接 + local http_code=$(curl -s -o /dev/null -w "%{http_code}" "$url" 2>/dev/null || echo "000") + + if [[ "$http_code" == "200" ]]; then + # 测试 metrics 端点 + local metrics_code=$(curl -s -o /dev/null -w "%{http_code}" "$metrics_url" 2>/dev/null || echo "000") + + if [[ "$metrics_code" == "200" ]]; then + status="health" + reason="success" + echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}" + exit 0 + else + reason="Metrics 端点异常 (HTTP $metrics_code)" + echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}" + exit 1 + fi + else + reason="HTTP 服务异常 (HTTP $http_code),请检查 Node Exporter 是否正在运行在端口 9100" + echo "{\"name\": \"$name\", \"status\": \"$status\", \"reason\": \"$reason\"}" + exit 1 + fi +} + +# 主函数 +main() { + check_health +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/src/metric/client-plugins/all-in-one-demo/plugins/node-exporter/install.sh b/src/metric/client-plugins/all-in-one-demo/plugins/node-exporter/install.sh new file mode 100755 index 0000000..28ba2d1 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-demo/plugins/node-exporter/install.sh @@ -0,0 +1,343 @@ +#!/bin/bash + +set -e + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 更新安装记录 +update_install_record() { + local pid="$1" + # 使用传入的安装目录参数,如果没有则使用默认值 + local install_base_dir="${2:-/opt/argus-metric/current}" + local install_record="$install_base_dir/.install_record" + + # 如果安装记录文件不存在,说明是首次安装,由主安装脚本统一创建 + if [[ ! -f "$install_record" ]]; then + log_info "安装记录文件不存在,将由主安装脚本创建" + return 0 + fi + + # 如果文件存在,说明是重启场景,只更新 PID 字段 + if command -v jq &> /dev/null; then + # 读取当前 PID + local current_pid=$(jq -r '.components."node-exporter".pid // ""' "$install_record" 2>/dev/null) + + if [[ -z "$current_pid" ]]; then + log_warning "无法读取当前 PID,跳过更新" + return 1 + fi + + # 使用 jq 只更新 pid 字段,保持字符串类型,保留其他字段 + jq --arg new_pid "$pid" '.components."node-exporter".pid = $new_pid' "$install_record" > "$install_record.tmp" && mv "$install_record.tmp" "$install_record" + log_info "PID 已更新: $current_pid -> $pid" + else + log_warning "jq 命令不可用,无法更新安装记录文件" + fi +} + +# 显示帮助信息 +show_help() { + echo "Node Exporter 安装脚本" + echo + echo "用法: $0 [选项]" + echo + echo "选项:" + echo " --help 显示此帮助信息" + echo + echo "示例:" + echo " $0 # 安装 Node Exporter" + echo +} + +# 解析命令行参数 +INSTALL_DIR="" +for arg in "$@"; do + case $arg in + --help|-h) + show_help + exit 0 + ;; + *) + # 如果参数不是以--开头,则认为是安装目录 + if [[ ! "$arg" =~ ^-- ]]; then + INSTALL_DIR="$arg" + else + log_error "未知参数: $arg" + show_help + exit 1 + fi + ;; + esac +done + +# 检查是否为 root 用户 +check_root() { + if [[ $EUID -ne 0 ]]; then + log_error "此脚本需要 root 权限运行" + log_info "请使用: sudo $0" + exit 1 + fi +} + +# 检查系统要求 +check_system() { + log_info "检查系统要求..." + + # 检查操作系统 + if [[ ! 
-f /etc/os-release ]]; then + log_error "无法检测操作系统版本" + exit 1 + fi + + source /etc/os-release + log_info "检测到操作系统: $NAME $VERSION" + + # 检查是否为 Linux 系统 + if [[ "$ID" != "ubuntu" && "$ID" != "debian" && "$ID" != "centos" && "$ID" != "rhel" && "$ID" != "fedora" ]]; then + log_warning "此脚本主要针对常见 Linux 发行版,其他系统可能需要调整" + fi + + # 检查系统架构 + local arch=$(uname -m) + log_info "系统架构: $arch" + + if [[ "$arch" != "x86_64" && "$arch" != "amd64" ]]; then + log_warning "当前架构为 $arch,node_exporter 主要支持 x86_64/amd64" + fi +} + +stop_existing_service() { + log_info "检查并停止可能运行的 Node Exporter 服务..." + + # 当前脚本 PID,防止误杀 + SELF_PID=$$ + + # 1. 停止 systemd 服务(如果存在) + if systemctl list-units --full -all | grep -q "node_exporter.service"; then + log_info "检测到 systemd 服务 node_exporter,正在停止..." + systemctl stop node_exporter || true + systemctl disable node_exporter || true + fi + + # 2. 清理可能存在的 PID 文件 + for pid_file in /var/run/node-exporter.pid /var/run/node_exporter.pid /tmp/node_exporter.pid; do + if [[ -f "$pid_file" ]]; then + local pid=$(cat "$pid_file") + if kill -0 "$pid" 2>/dev/null; then + log_info "发现 Node Exporter (PID: $pid),正在停止..." + kill "$pid" + sleep 2 + kill -0 "$pid" 2>/dev/null && kill -9 "$pid" + fi + rm -f "$pid_file" + fi + done + + # 3. 用 pgrep 查找进程,排除当前脚本 + local pids=$(pgrep -f "node_exporter|node-exporter|/usr/local/bin/node-exporter" | grep -vw "$SELF_PID" || true) + if [[ -n "$pids" ]]; then + log_info "发现 Node Exporter 进程 (PID: $pids),正在停止..." + for pid in $pids; do + if kill -0 "$pid" 2>/dev/null; then + kill "$pid" 2>/dev/null || true + sleep 1 + kill -0 "$pid" 2>/dev/null && kill -9 "$pid" 2>/dev/null || true + fi + done + fi + + # 4. 兜底:检查是否有进程占用 9100 端口 + local listen_pids=$(lsof -ti:9100 2>/dev/null || true) + if [[ -n "$listen_pids" ]]; then + log_warning "发现占用 9100 端口的进程 (PID: $listen_pids),强制终止..." + for pid in $listen_pids; do + kill -9 "$pid" 2>/dev/null || true + done + sleep 1 + fi + + # 5. 最终验证 + if netstat -tuln 2>/dev/null | grep -q ":9100 "; then + log_error "端口 9100 仍被占用,请手动检查" + return 1 + else + log_success "旧的 Node Exporter 已完全停止" + fi +} + + +# 安装 Node Exporter 二进制文件 +install_node_exporter() { + log_info "安装 Node Exporter..." + + local binary_file="bin/node_exporter" + local install_dir="/usr/local/bin" + + if [[ ! -f "$binary_file" ]]; then + log_error "找不到 Node Exporter 二进制文件: $binary_file" + exit 1 + fi + + # 停止可能运行的服务 + stop_existing_service + + # 复制二进制文件并重命名为统一格式 + cp "$binary_file" "$install_dir/node-exporter" + chmod +x "$install_dir/node-exporter" + + log_success "Node Exporter 二进制文件安装完成" +} + +# 创建用户和组 +create_user() { + log_info "创建 node_exporter 用户..." + + # 检查用户是否已存在 + if id "node_exporter" &>/dev/null; then + log_info "用户 node_exporter 已存在" + else + useradd --no-create-home --shell /bin/false node_exporter + log_success "用户 node_exporter 创建完成" + fi +} + +# 安装配置文件 +install_config() { + log_info "安装配置文件..." + + local config_dir="/etc/node_exporter" + + # 创建配置目录 + mkdir -p "$config_dir" + + # 创建文本文件收集器目录 + mkdir -p "/var/lib/node_exporter/textfile_collector" + chown node_exporter:node_exporter "/var/lib/node_exporter/textfile_collector" +} + +# 启动 Node Exporter 服务 +start_node_exporter() { + log_info "启动 Node Exporter 服务..." 
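Note on the node-exporter installer above: `install_config` creates a textfile-collector directory, but `start_node_exporter` below launches the binary with only `--web.listen-address`. A minimal sketch of how that directory could be used to publish a custom metric, assuming the exporter is (re)started with the textfile collector flag enabled; the metric name and value are purely illustrative:

```bash
# Publish a custom metric through the textfile collector directory created above.
# node_exporter only reads *.prom files from this directory when started with
# --collector.textfile.directory (not passed by the installer's default command).
cat > /var/lib/node_exporter/textfile_collector/deploy_info.prom <<EOF
# HELP node_deploy_timestamp_seconds Unix time of the last deployment.
# TYPE node_deploy_timestamp_seconds gauge
node_deploy_timestamp_seconds $(date +%s)
EOF

/usr/local/bin/node-exporter \
  --web.listen-address=:9100 \
  --collector.textfile.directory=/var/lib/node_exporter/textfile_collector
```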
+ + local binary_path="/usr/local/bin/node-exporter" + local log_file="/var/log/node-exporter.log" + local pid_file="/var/run/node-exporter.pid" + + # 检查服务是否已经在运行 + if [[ -f "$pid_file" ]]; then + local pid=$(cat "$pid_file") + if kill -0 "$pid" 2>/dev/null; then + log_info "Node Exporter 服务已在运行 (PID: $pid)" + return 0 + else + log_warning "发现过期的 PID 文件,正在清理..." + rm -f "$pid_file" + fi + fi + + # 检查端口是否被占用 + if netstat -tuln 2>/dev/null | grep -q ":9100 "; then + log_warning "端口 9100 已被占用,请检查是否有其他服务在运行" + return 1 + fi + + # 启动服务 + log_info "正在启动 Node Exporter..." + nohup "$binary_path" --web.listen-address=:9100 > "$log_file" 2>&1 & + local pid=$! + + # 保存 PID + echo "$pid" > "$pid_file" + + # 等待服务启动 + sleep 2 + + # 检查服务是否成功启动 + if kill -0 "$pid" 2>/dev/null; then + log_success "Node Exporter 服务启动成功 (PID: $pid)" + log_info "日志文件: $log_file" + log_info "PID 文件: $pid_file" + + # 更新安装记录 + update_install_record "$pid" "$INSTALL_DIR" + else + log_error "Node Exporter 服务启动失败" + rm -f "$pid_file" + return 1 + fi +} + + + +# 显示安装信息 +show_install_info() { + log_success "Node Exporter 安装完成!" + echo + echo "安装信息:" + echo " 二进制文件: /usr/local/bin/node-exporter" + echo " 运行用户: node_exporter" + echo " 配置目录: /etc/node_exporter/" + echo " 默认端口: 9100" + echo + echo "使用方法:" + echo " 手动启动: /usr/local/bin/node-exporter --web.listen-address=:9100" + echo " 后台启动: nohup /usr/local/bin/node-exporter --web.listen-address=:9100 &" + echo + echo "测试连接:" + echo " curl http://localhost:9100/metrics" + echo " curl http://localhost:9100" + echo + echo "Prometheus 配置示例:" + echo " - job_name: 'node_exporter'" + echo " static_configs:" + echo " - targets: ['localhost:9100']" + echo +} + +# 主函数 +main() { + echo "==========================================" + echo " Node Exporter 安装脚本 v1.0" + echo "==========================================" + echo + + check_root + check_system + + log_info "开始安装 Node Exporter..." + + install_node_exporter + create_user + install_config + start_node_exporter + + show_install_info +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi + diff --git a/src/metric/client-plugins/all-in-one-demo/plugins/node-exporter/package.sh b/src/metric/client-plugins/all-in-one-demo/plugins/node-exporter/package.sh new file mode 100755 index 0000000..b38c733 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-demo/plugins/node-exporter/package.sh @@ -0,0 +1,87 @@ +#!/bin/bash + +set -e + +# 颜色定义 +GREEN='\033[0;32m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +# 获取当前目录 +CURRENT_DIR=$(pwd) +PACKAGE_NAME="node-exporter-$(date +%Y%m%d-%H%M%S)" +PACKAGE_FILE="${PACKAGE_NAME}.tar.gz" + +log_info "开始打包 Node Exporter 安装包..." + +# 检查必要文件 +log_info "检查必要文件..." + +required_files=( + "install.sh" + "uninstall.sh" + "bin/node_exporter" + "check_health.sh" +) + +missing_files=() +for file in "${required_files[@]}"; do + if [[ ! -f "$file" ]]; then + missing_files+=("$file") + fi +done + +if [[ ${#missing_files[@]} -gt 0 ]]; then + echo "缺少以下文件:" + for file in "${missing_files[@]}"; do + echo " - $file" + done + exit 1 +fi + +log_success "所有必要文件检查完成" + +# 创建临时目录 +TEMP_DIR=$(mktemp -d) +log_info "创建临时目录: $TEMP_DIR" + +# 复制文件到临时目录 +cp -r . 
"$TEMP_DIR/$PACKAGE_NAME" + +# 进入临时目录 +cd "$TEMP_DIR" + +# 创建压缩包 +log_info "创建压缩包: $PACKAGE_FILE" +tar -czf "$PACKAGE_FILE" "$PACKAGE_NAME" + +# 移动压缩包到原目录 +mv "$PACKAGE_FILE" "$CURRENT_DIR/" + +# 清理临时目录 +rm -rf "$TEMP_DIR" + +# 返回原目录 +cd "$CURRENT_DIR" + +# 显示结果 +log_success "打包完成!" +echo +echo "安装包文件: $PACKAGE_FILE" +echo "文件大小: $(du -h "$PACKAGE_FILE" | cut -f1)" +echo +echo "使用方法:" +echo "1. 将 $PACKAGE_FILE 传输到目标服务器" +echo "2. 解压: tar -xzf $PACKAGE_FILE" +echo "3. 进入目录: cd $PACKAGE_NAME" +echo "4. 运行安装: sudo ./install.sh" +echo +echo "注意: 请确保所有必要文件都存在" diff --git a/src/metric/client-plugins/all-in-one-demo/plugins/node-exporter/uninstall.sh b/src/metric/client-plugins/all-in-one-demo/plugins/node-exporter/uninstall.sh new file mode 100755 index 0000000..14801c1 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-demo/plugins/node-exporter/uninstall.sh @@ -0,0 +1,239 @@ +#!/bin/bash + +# Node Exporter 卸载脚本 +# 版本: 1.0 +# 作者: AIOps Team +# 日期: $(date +%Y-%m-%d) + +set -e + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 检查是否为 root 用户 +check_root() { + if [[ $EUID -ne 0 ]]; then + log_error "此脚本需要 root 权限运行" + log_info "请使用: sudo $0" + exit 1 + fi +} + +# 停止运行中的进程 +stop_processes() { + log_info "停止 Node Exporter 进程..." + + local pid_file="/var/run/node-exporter.pid" + local stopped=false + + # 首先尝试通过 PID 文件停止服务 + if [[ -f "$pid_file" ]]; then + local pid=$(cat "$pid_file") + if kill -0 "$pid" 2>/dev/null; then + log_info "通过 PID 文件停止服务 (PID: $pid)..." + kill "$pid" + sleep 3 + + # 检查进程是否已停止 + if kill -0 "$pid" 2>/dev/null; then + log_warning "进程未响应,强制终止..." + kill -9 "$pid" 2>/dev/null || true + fi + log_success "Node Exporter 进程已停止" + stopped=true + else + log_warning "PID 文件存在但进程已不存在,清理 PID 文件" + rm -f "$pid_file" + fi + fi + + # 查找并杀死所有 node_exporter 和 node-exporter 进程 + local pids=$(pgrep -f "node_exporter\|node-exporter" 2>/dev/null || true) + if [[ -n "$pids" ]]; then + log_info "发现 node_exporter 或 node-exporter 进程,正在停止..." + for pid in $pids; do + log_info "停止进程 PID: $pid" + kill "$pid" 2>/dev/null || true + done + sleep 2 + + # 检查是否还有进程在运行,如果有则强制终止 + local remaining_pids=$(pgrep -f "node_exporter\|node-exporter" 2>/dev/null || true) + if [[ -n "$remaining_pids" ]]; then + log_warning "进程未响应,强制终止..." + for pid in $remaining_pids; do + log_info "强制终止进程 PID: $pid" + kill -9 "$pid" 2>/dev/null || true + done + sleep 1 + fi + + # 最终检查 + if pgrep -f "node_exporter\|node-exporter" > /dev/null; then + log_error "无法停止所有 node_exporter 进程" + else + log_success "所有 Node Exporter 进程已停止" + stopped=true + fi + else + log_info "Node Exporter 进程未运行" + fi + + # 清理 PID 文件 + rm -f "$pid_file" + + if [[ "$stopped" == "false" ]]; then + log_warning "未发现需要停止的 Node Exporter 进程" + fi +} + +# 删除二进制文件 +remove_binary() { + log_info "删除 Node Exporter 二进制文件..." + + local binary_files=( + "/usr/local/bin/node-exporter" + "/usr/local/bin/node_exporter" + ) + + local deleted=false + for binary_file in "${binary_files[@]}"; do + if [[ -f "$binary_file" ]]; then + rm -f "$binary_file" + log_success "二进制文件已删除: $binary_file" + deleted=true + fi + done + + if [[ "$deleted" == "false" ]]; then + log_info "二进制文件不存在" + fi +} + +# 删除配置文件 +remove_config() { + log_info "删除配置文件..." 
+ + local config_dir="/etc/node_exporter" + + if [[ -d "$config_dir" ]]; then + rm -rf "$config_dir" + log_success "配置目录已删除" + else + log_info "配置目录不存在" + fi +} + +# 删除数据目录 +remove_data_dir() { + log_info "删除数据目录..." + + local data_dir="/var/lib/node_exporter" + + if [[ -d "$data_dir" ]]; then + rm -rf "$data_dir" + log_success "数据目录已删除" + else + log_info "数据目录不存在" + fi +} + +# 检查用户状态(可选) +check_user_status() { + log_info "检查 node_exporter 用户状态..." + + if id "node_exporter" &>/dev/null; then + log_info "检测到 node_exporter 用户存在" + log_warning "node_exporter 是系统用户,可能被其他服务使用" + log_info "为了系统稳定性,将保留 node_exporter 用户" + log_info "如需手动删除,请运行: sudo userdel node_exporter" + else + log_info "node_exporter 用户不存在" + fi +} + +# 清理日志文件 +cleanup_logs() { + log_info "清理日志文件..." + + # 清理 journal 日志 + journalctl --vacuum-time=1s --quiet || true + + # 删除安装脚本创建的日志文件 + rm -f /var/log/node-exporter.log + + log_success "日志文件已清理" +} + +# 显示卸载信息 +show_uninstall_info() { + log_success "Node Exporter 卸载完成!" + echo + echo "已删除的内容:" + echo " - 二进制文件: /usr/local/bin/node-exporter" + echo " - 配置目录: /etc/node_exporter" + echo " - 数据目录: /var/lib/node_exporter" + echo " - 相关日志文件" + echo + echo "注意:" + echo " - node_exporter 用户已保留(系统用户,可能被其他服务使用)" + echo " - 如需完全清理,请手动检查并删除相关文件" + echo +} + +# 主函数 +main() { + echo "==========================================" + echo " Node Exporter 卸载脚本 v1.0" + echo "==========================================" + echo + + check_root + + log_warning "此操作将完全卸载 Node Exporter" + read -p "确认继续?(y/N): " confirm + + if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then + log_info "取消卸载操作" + exit 0 + fi + + log_info "开始卸载 Node Exporter..." + + stop_processes + remove_binary + remove_config + remove_data_dir + cleanup_logs + + # 检查用户状态 + check_user_status + + show_uninstall_info +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/src/metric/client-plugins/all-in-one-demo/scripts/check_health.sh b/src/metric/client-plugins/all-in-one-demo/scripts/check_health.sh new file mode 100755 index 0000000..991cc9f --- /dev/null +++ b/src/metric/client-plugins/all-in-one-demo/scripts/check_health.sh @@ -0,0 +1,277 @@ +#!/bin/bash + +# 整体健康检查脚本,调用各个组件的健康检查并将结果写入 .health_log 文件 + +set -e + +# 获取脚本所在目录 +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +HEALTH_LOG_FILE="$SCRIPT_DIR/.health_log" +INSTALL_RECORD_FILE="$SCRIPT_DIR/.install_record" + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 - 输出到 stderr 避免影响 JSON 结果 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" >&2 +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" >&2 +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" >&2 +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" >&2 +} + +# 检查单个组件健康状态 +check_component() { + local component_name="$1" + local check_script_path="$2" + + log_info "检查 $component_name 健康状态..." + + if [[ ! -f "$check_script_path" ]]; then + log_error "健康检查脚本不存在: $check_script_path" + echo "{\"name\": \"$component_name\", \"status\": \"unhealth\", \"reason\": \"健康检查脚本不存在: $check_script_path\"}" + return 1 + fi + + if [[ ! 
-x "$check_script_path" ]]; then + log_error "健康检查脚本无执行权限: $check_script_path" + echo "{\"name\": \"$component_name\", \"status\": \"unhealth\", \"reason\": \"健康检查脚本无执行权限: $check_script_path\"}" + return 1 + fi + + # 执行健康检查脚本,只捕获 stdout,stderr 输出到终端 + local result + if result=$("$check_script_path" 2>/dev/null); then + log_success "$component_name 健康检查通过" + echo "$result" + return 0 + else + log_warning "$component_name 健康检查失败" + echo "$result" + return 1 + fi +} + +# 生成时间戳 +get_timestamp() { + date '+%Y-%m-%d %H:%M:%S' +} + +# 生成UTC时间戳 +get_utc_timestamp() { + date -u '+%Y-%m-%dT%H:%M:%SZ' +} + +# 获取主机名 +get_hostname() { + echo "${HOSTNAME:-$(hostname)}" +} + +# 创建健康状态目录 +create_health_dir() { + local hostname=$(get_hostname) + local health_dir="/private/argus/agent/$hostname/health" + + if [[ ! -d "$health_dir" ]]; then + log_info "创建健康状态目录: $health_dir" + mkdir -p "$health_dir" + fi + + echo "$health_dir" +} + +# 写入单个模块的健康状态JSON文件 +write_component_health_json() { + local component_name="$1" + local status="$2" + local error_msg="$3" + local health_dir="$4" + + # 生成模块名前缀-xxx.json格式的文件名 + local module_prefix="metric" + local filename="${module_prefix}-${component_name}.json" + local filepath="$health_dir/$filename" + + # 生成UTC时间戳 + local timestamp=$(get_utc_timestamp) + + # 构建JSON内容 + local json_content=$(cat << EOF +{ + "status": "$status", + "error": "$error_msg", + "timestamp": "$timestamp" +} +EOF +) + + # 写入文件 + echo "$json_content" > "$filepath" + log_info "已写入模块健康状态文件: $filepath" +} + +# 从安装记录文件中读取组件安装目录 +read_install_record() { + local install_record_file="$1" + + if [[ ! -f "$install_record_file" ]]; then + log_error "安装记录文件不存在: $install_record_file" + return 1 + fi + + # 检查是否有 jq 命令来解析 JSON + if command -v jq &> /dev/null; then + # 使用 jq 解析 JSON + local components_json + if components_json=$(jq -r '.components | to_entries[] | "\(.key):\(.value.install_dir)"' "$install_record_file" 2>/dev/null); then + echo "$components_json" + return 0 + else + log_error "无法解析安装记录文件 JSON 格式: $install_record_file" + return 1 + fi + else + # 如果没有 jq,尝试简单的文本解析 + log_warning "jq 命令不可用,尝试简单文本解析" + + # 查找所有 install_dir 行 + local components=() + while IFS= read -r line; do + if [[ "$line" =~ \"install_dir\":[[:space:]]*\"([^\"]+)\" ]]; then + local install_dir="${BASH_REMATCH[1]}" + # 从路径中提取组件名称 + local component_name=$(basename "$install_dir") + components+=("$component_name:$install_dir") + fi + done < "$install_record_file" + + if [[ ${#components[@]} -gt 0 ]]; then + printf '%s\n' "${components[@]}" + return 0 + else + log_error "无法从安装记录文件中提取组件信息" + return 1 + fi + fi +} + +# 主函数 +main() { + echo "==========================================" >&2 + echo " 整体健康检查脚本" >&2 + echo "==========================================" >&2 + echo >&2 + + # 记录健康检查开始时间 + local start_time=$(get_timestamp) + log_info "健康检查开始时间: $start_time" + + # 创建健康状态目录 + local health_dir + health_dir=$(create_health_dir) + + # 从安装记录文件中读取组件信息 + log_info "从安装记录文件读取组件信息: $INSTALL_RECORD_FILE" + local components_info + if ! 
components_info=$(read_install_record "$INSTALL_RECORD_FILE"); then + log_error "无法读取安装记录文件,健康检查终止" + exit 1 + fi + + # 存储所有检查结果 + local all_results=() + local overall_status="health" + + # 逐个检查组件 + while IFS= read -r component_info; do + if [[ -n "$component_info" ]]; then + IFS=':' read -r component_name install_dir <<< "$component_info" + local check_script_path="$install_dir/check_health.sh" + + local result + local component_status="healthy" + local error_msg="" + + if result=$(check_component "$component_name" "$check_script_path"); then + all_results+=("$result") + else + all_results+=("$result") + overall_status="unhealth" + component_status="unhealthy" + # 从结果中提取错误信息 + if command -v jq &> /dev/null; then + error_msg=$(echo "$result" | jq -r '.reason // ""' 2>/dev/null || echo "") + else + # 简单的文本解析提取错误信息 + if [[ "$result" =~ \"reason\":[[:space:]]*\"([^\"]+)\" ]]; then + error_msg="${BASH_REMATCH[1]}" + fi + fi + fi + + # 写入单个模块的健康状态JSON文件 + write_component_health_json "$component_name" "$component_status" "$error_msg" "$health_dir" + fi + done <<< "$components_info" + + # 记录健康检查结束时间 + local end_time=$(get_timestamp) + log_info "健康检查结束时间: $end_time" + + # 构建完整的健康检查结果 JSON + local health_check_result=$(cat << EOF +{ + "start_time": "$start_time", + "end_time": "$end_time", + "overall_status": "$overall_status", + "components": [ +$(printf '%s,\n' "${all_results[@]}" | sed '$s/,$//') + ] +} +EOF +) + + # 写入健康日志文件 + log_info "将健康检查结果写入日志文件: $HEALTH_LOG_FILE" + echo "$health_check_result" >> "$HEALTH_LOG_FILE" + + # 输出 JSON 结果到 stdout + echo "$health_check_result" + + # 显示总结到 stderr + echo >&2 + echo "==========================================" >&2 + echo " 健康检查总结" >&2 + echo "==========================================" >&2 + echo "开始时间: $start_time" >&2 + echo "结束时间: $end_time" >&2 + echo "整体状态: $overall_status" >&2 + echo "日志文件: $HEALTH_LOG_FILE" >&2 + echo >&2 + + if [[ "$overall_status" == "health" ]]; then + log_success "所有组件健康检查通过!" 
+ exit 0 + else + log_error "部分组件健康检查失败,请查看上述详细信息" + exit 1 + fi +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi \ No newline at end of file diff --git a/src/metric/client-plugins/all-in-one-demo/scripts/check_version.sh b/src/metric/client-plugins/all-in-one-demo/scripts/check_version.sh new file mode 100755 index 0000000..fce49f3 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-demo/scripts/check_version.sh @@ -0,0 +1,240 @@ +#!/bin/bash + +# 版本校验脚本 +# 比较本地 LATEST_VERSION 与 FTP 的 VERSION 版本,如果不一致则更新对应版本 + +set -e + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 - 输出到 stderr 避免影响函数返回值 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" >&2 +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" >&2 +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" >&2 +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" >&2 +} + +# 获取脚本所在目录 +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# 动态获取当前版本目录 +get_current_version_dir() { + # 查找 /opt/argus-metric/versions/ 下的最新版本目录 + local versions_dir="/opt/argus-metric/versions" + if [[ -d "$versions_dir" ]]; then + # 按版本号排序,获取最新的版本目录 + local latest_version_dir=$(ls -1 "$versions_dir" 2>/dev/null | sort -V | tail -1) + if [[ -n "$latest_version_dir" ]]; then + echo "$versions_dir/$latest_version_dir" + else + echo "/opt/argus-metric" + fi + else + echo "/opt/argus-metric" + fi +} + +# 获取当前版本目录 +CURRENT_VERSION_DIR=$(get_current_version_dir) +# LATEST_VERSION 文件在根目录 +LOCAL_VERSION_FILE="/opt/argus-metric/LATEST_VERSION" +REMOTE_VERSION_URL="" +LOG_FILE="$CURRENT_VERSION_DIR/.version_check.log" + +# 从环境变量或配置文件获取 FTP 服务器信息 +get_ftp_config() { + # 优先从环境变量获取配置 + log_info "获取 FTP 配置信息..." + + # 如果环境变量中没有设置,则尝试从配置文件读取 + if [[ -z "$FTP_SERVER" || -z "$FTP_USER" || -z "$FTP_PASSWORD" ]]; then + local config_file="$SCRIPT_DIR/../config/config.env" + if [[ -f "$config_file" ]]; then + log_info "从配置文件读取 FTP 配置: $config_file" + source "$config_file" + fi + else + log_info "使用环境变量中的 FTP 配置" + fi + + # 设置默认值(如果环境变量和配置文件都没有设置) + FTP_SERVER="${FTP_SERVER:-localhost}" + FTP_USER="${FTP_USER:-ftpuser}" + FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}" + + # 构建远程版本文件 URL + REMOTE_VERSION_URL="ftp://${FTP_USER}:${FTP_PASSWORD}@${FTP_SERVER}/LATEST_VERSION" + + log_info "FTP 配置来源: ${FTP_CONFIG_SOURCE:-环境变量/配置文件}" +} + +# 获取远程版本号 +get_remote_version() { + log_info "从 FTP 服务器获取远程版本号..." + log_info "远程地址: $REMOTE_VERSION_URL" + + # 先测试 FTP 连接 + log_info "测试 FTP 连接..." + if curl -u "${FTP_USER}:${FTP_PASSWORD}" -sfI "ftp://${FTP_SERVER}/" >/dev/null 2>&1; then + log_success "FTP 服务器连接成功" + else + log_error "无法连接到 FTP 服务器: $FTP_SERVER" + return 1 + fi + + # 测试 LATEST_VERSION 文件是否存在 + log_info "检查远程 LATEST_VERSION 文件是否存在..." 
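The check done by `get_remote_version` plus the comparison in `main` can be reproduced by hand, which helps when debugging FTP connectivity. A sketch using the same curl flags and paths as `check_version.sh`; the server, user and password values are placeholders for your environment:

```bash
# Manual equivalent of check_version.sh's local/remote version comparison
FTP_SERVER=ftp.example.internal     # placeholder
FTP_USER=ftpuser                    # placeholder
FTP_PASSWORD='change-me'            # placeholder

local_ver=$(cat /opt/argus-metric/LATEST_VERSION 2>/dev/null | tr -d '[:space:]')
remote_ver=$(curl -u "${FTP_USER}:${FTP_PASSWORD}" -sfL "ftp://${FTP_SERVER}/LATEST_VERSION" | tr -d '[:space:]')

if [[ "$local_ver" == "$remote_ver" ]]; then
  echo "up to date: $local_ver"
else
  echo "update available: ${local_ver:-<none>} -> $remote_ver"
fi
```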
+ if curl -u "${FTP_USER}:${FTP_PASSWORD}" -sfI "ftp://${FTP_SERVER}/LATEST_VERSION" >/dev/null 2>&1; then + log_success "远程 LATEST_VERSION 文件存在" + else + log_error "远程 LATEST_VERSION 文件不存在或无法访问" + return 1 + fi + + # 获取远程版本号 + local remote_version + if remote_version=$(curl -u "${FTP_USER}:${FTP_PASSWORD}" -sfL "ftp://${FTP_SERVER}/LATEST_VERSION" 2>/dev/null | tr -d '[:space:]'); then + if [[ -n "$remote_version" ]]; then + log_success "获取到远程版本号: $remote_version" + echo "$remote_version" + else + log_error "远程版本号为空" + return 1 + fi + else + log_error "获取远程版本号失败" + return 1 + fi +} + +# 获取本地版本号 +get_local_version() { + if [[ -f "$LOCAL_VERSION_FILE" ]]; then + local local_version=$(cat "$LOCAL_VERSION_FILE" 2>/dev/null | tr -d '[:space:]') + if [[ -n "$local_version" ]]; then + log_info "本地版本号: $local_version" + echo "$local_version" + else + log_warning "本地版本文件为空" + echo "" + fi + else + log_warning "本地版本文件不存在: $LOCAL_VERSION_FILE" + echo "" + fi +} + +# 更新到新版本 +update_to_version() { + local new_version="$1" + local temp_dir="/tmp/argus-update-$$" + local setup_script="$temp_dir/setup.sh" + + log_info "开始更新到版本: $new_version" + + # 创建临时目录 + mkdir -p "$temp_dir" + + # 下载最新的 setup.sh + log_info "从 FTP 服务器下载最新的安装脚本..." + local setup_url="ftp://${FTP_USER}:${FTP_PASSWORD}@${FTP_SERVER}/setup.sh" + + if curl -fsS "$setup_url" -o "$setup_script"; then + log_success "安装脚本下载完成" + else + log_error "下载安装脚本失败: $setup_url" + rm -rf "$temp_dir" + return 1 + fi + + # 添加执行权限 + chmod +x "$setup_script" + + # 执行安装脚本 + log_info "执行安装脚本进行版本更新..." + if "$setup_script" --server "$FTP_SERVER" --user "$FTP_USER" --password "$FTP_PASSWORD" --version "$new_version"; then + log_success "版本更新完成: $new_version" + rm -rf "$temp_dir" + return 0 + else + log_error "版本更新失败: $new_version" + rm -rf "$temp_dir" + return 1 + fi +} + +# 记录检查日志 +log_check() { + local message="$1" + local timestamp=$(date '+%Y-%m-%d %H:%M:%S') + echo "[$timestamp] $message" >> "$LOG_FILE" +} + +# 主函数 +main() { + log_info "开始版本校验检查..." + log_check "版本校验检查开始" + + # 确保系统目录存在 + mkdir -p "/opt/argus-metric" + mkdir -p "$CURRENT_VERSION_DIR" + + log_info "当前版本目录: $CURRENT_VERSION_DIR" + + # 获取 FTP 配置 + get_ftp_config + + # 获取本地版本号 + local local_version + local_version=$(get_local_version) + + # 获取远程版本号 + local remote_version + if ! 
remote_version=$(get_remote_version); then + log_error "无法获取远程版本号,跳过本次检查" + log_check "版本校验失败:无法获取远程版本号" + exit 1 + fi + + # 比较版本号 + if [[ "$local_version" == "$remote_version" ]]; then + log_info "版本一致,无需更新 (本地: $local_version, 远程: $remote_version)" + log_check "版本校验完成:版本一致 ($local_version)" + else + log_info "检测到版本不一致 (本地: $local_version, 远程: $remote_version)" + log_check "检测到版本不一致:本地($local_version) -> 远程($remote_version)" + + # 更新到新版本 + if update_to_version "$remote_version"; then + log_success "版本更新成功: $local_version -> $remote_version" + log_check "版本更新成功:$local_version -> $remote_version" + else + log_error "版本更新失败" + log_check "版本更新失败:$local_version -> $remote_version" + exit 1 + fi + fi + + log_success "版本校验检查完成" + log_check "版本校验检查完成" +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/src/metric/client-plugins/all-in-one-demo/scripts/install_artifact.sh b/src/metric/client-plugins/all-in-one-demo/scripts/install_artifact.sh new file mode 100755 index 0000000..ba9ade2 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-demo/scripts/install_artifact.sh @@ -0,0 +1,903 @@ +#!/bin/bash + +set -e + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 配置变量 +INSTALL_DIR="${1:-$(pwd)}" # 使用第一个参数作为安装目录,如果没有参数则使用当前目录 +TEMP_DIR="/tmp/metrics-install-$$" +VERSION_FILE="version.json" + + +# 加载配置文件 +load_config() { + local script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + local config_file="$script_dir/config.env" + + if [[ -f "$config_file" ]]; then + log_info "加载配置文件: $config_file" + # 导出配置文件中的环境变量 + set -a # 自动导出所有变量 + source "$config_file" + set +a # 关闭自动导出 + log_success "配置文件加载完成" + else + log_warning "配置文件不存在: $config_file,使用默认配置" + fi +} + +# 复制配置文件到安装目录 +copy_config_files() { + log_info "复制配置文件到安装目录..." + + local script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + local source_config="$script_dir/../config/config.env" + local target_config="$INSTALL_DIR/config.env" + + if [[ -f "$source_config" ]]; then + # 检查源文件和目标文件是否是同一个文件 + if [[ "$source_config" == "$target_config" ]]; then + log_info "配置文件已在目标位置,跳过复制" + log_success "配置文件已存在: $target_config" + else + if cp "$source_config" "$target_config"; then + log_success "配置文件复制完成: $target_config" + else + log_error "配置文件复制失败" + return 1 + fi + fi + else + log_warning "源配置文件不存在: $source_config" + fi + + # 复制版本校验脚本 + log_info "复制版本校验脚本到安装目录..." + local target_check_version="$INSTALL_DIR/check_version.sh" + + # 检查目标文件是否已存在(从 artifact 包中解压出来的) + if [[ -f "$target_check_version" ]]; then + log_info "版本校验脚本已存在,设置执行权限..." + chmod +x "$target_check_version" + log_success "版本校验脚本权限设置完成: $target_check_version" + else + log_warning "版本校验脚本不存在: $target_check_version" + log_info "请确保 check_version.sh 已包含在 artifact 包中" + fi +} + +check_root() { + if [[ $EUID -ne 0 ]]; then + log_error "此脚本需要 root 权限运行" + log_info "请使用: sudo $0 [安装目录]" + log_info "如果不指定安装目录,将使用当前目录: $(pwd)" + exit 1 + fi +} + +# 检查系统要求 +check_system() { + log_info "检查系统要求..." + + # 检查操作系统 + if [[ ! 
-f /etc/os-release ]]; then + log_error "无法检测操作系统版本" + exit 1 + fi + + source /etc/os-release + log_info "检测到操作系统: $NAME $VERSION" + + # 检查系统架构 + arch=$(uname -m) + log_info "系统架构: $arch" + + # 检查磁盘空间 + available_space=$(df / | awk 'NR==2 {print $4}') + if [[ $available_space -lt 10485760 ]]; then # 10GB in KB + log_warning "可用磁盘空间不足 10GB,当前可用: $(($available_space / 1024 / 1024))GB" + fi + + # 检查内存 + total_mem=$(free -m | awk 'NR==2{print $2}') + if [[ $total_mem -lt 4096 ]]; then # 4GB + log_warning "系统内存不足 4GB,当前: ${total_mem}MB" + fi +} + +# 查找版本文件 +find_version_file() { + log_info "查找版本信息文件..." + + # 在当前目录查找 + if [[ -f "$VERSION_FILE" ]]; then + VERSION_FILE_PATH="$(pwd)/$VERSION_FILE" + log_success "找到版本文件: $VERSION_FILE" + return 0 + fi + + # 在 artifact 目录查找 + for version_dir in artifact/*/; do + if [[ -f "${version_dir}${VERSION_FILE}" ]]; then + VERSION_FILE_PATH="$(cd "$(dirname "${version_dir}${VERSION_FILE}")" && pwd)/$(basename "${version_dir}${VERSION_FILE}")" + log_success "找到版本文件: $VERSION_FILE_PATH" + return 0 + fi + done + + log_error "未找到版本信息文件 $VERSION_FILE" + exit 1 +} + +# 解析版本信息 +parse_version_info() { + log_info "解析版本信息..." + + if [[ ! -f "$VERSION_FILE_PATH" ]]; then + log_error "版本文件不存在: $VERSION_FILE_PATH" + exit 1 + fi + + # 使用 jq 解析 JSON(如果可用) + if command -v jq &> /dev/null; then + # 验证JSON文件格式 + if ! jq empty "$VERSION_FILE_PATH" 2>/dev/null; then + log_error "JSON文件格式错误,请检查 $VERSION_FILE_PATH" + exit 1 + fi + + VERSION=$(jq -r '.version' "$VERSION_FILE_PATH") + BUILD_TIME=$(jq -r '.build_time' "$VERSION_FILE_PATH") + + # 解析 artifact_list + if jq -e '.artifact_list' "$VERSION_FILE_PATH" > /dev/null 2>&1; then + jq -r '.artifact_list | to_entries[] | "\(.key):\(.value)"' "$VERSION_FILE_PATH" > "$TEMP_DIR/components.txt" + else + log_error "version.json 中缺少 artifact_list 字段" + exit 1 + fi + + # 解析 checksums + if jq -e '.checksums' "$VERSION_FILE_PATH" > /dev/null 2>&1; then + jq -r '.checksums | to_entries[] | "\(.key):\(.value)"' "$VERSION_FILE_PATH" > "$TEMP_DIR/checksums.txt" + else + log_error "version.json 中缺少 checksums 字段" + exit 1 + fi + + # 解析 install_order(现在包含完整的文件名) + if jq -e '.install_order' "$VERSION_FILE_PATH" > /dev/null 2>&1; then + jq -r '.install_order[]' "$VERSION_FILE_PATH" > "$TEMP_DIR/install_order.txt" + else + log_error "version.json 中缺少 install_order 字段" + exit 1 + fi + + else + log_warning "jq 未安装,使用简单的 JSON 解析" + # 简单的 JSON 解析 + VERSION=$(grep '"version"' "$VERSION_FILE_PATH" | sed 's/.*"version": *"\([^"]*\)".*/\1/') + BUILD_TIME=$(grep '"build_time"' "$VERSION_FILE_PATH" | sed 's/.*"build_time": *"\([^"]*\)".*/\1/') + + # 解析 artifact_list + grep -A 100 '"artifact_list"' "$VERSION_FILE_PATH" | grep -E '^\s*"[^"]+":\s*"[^"]+"' | while read line; do + component=$(echo "$line" | sed 's/.*"\([^"]*\)":\s*"[^"]*".*/\1/') + version=$(echo "$line" | sed 's/.*"[^"]*":\s*"\([^"]*\)".*/\1/') + echo "$component:$version" >> "$TEMP_DIR/components.txt" + done + + # 解析 checksums + grep -A 100 '"checksums"' "$VERSION_FILE_PATH" | grep -E '^\s*"[^"]+":\s*"[^"]+"' | while read line; do + component=$(echo "$line" | sed 's/.*"\([^"]*\)":\s*"[^"]*".*/\1/') + checksum=$(echo "$line" | sed 's/.*"[^"]*":\s*"\([^"]*\)".*/\1/') + echo "$component:$checksum" >> "$TEMP_DIR/checksums.txt" + done + + # 解析 install_order + grep -A 100 '"install_order"' "$VERSION_FILE_PATH" | grep -E '^\s*"[^"]+"' | while read line; do + component=$(echo "$line" | sed 's/.*"\([^"]*\)".*/\1/') + echo "$component" >> "$TEMP_DIR/install_order.txt" + done + + # 验证解析结果 + if [[ ! 
-f "$TEMP_DIR/components.txt" || ! -s "$TEMP_DIR/components.txt" ]]; then + log_error "无法解析 artifact_list,请检查 version.json 格式" + exit 1 + fi + + if [[ ! -f "$TEMP_DIR/checksums.txt" || ! -s "$TEMP_DIR/checksums.txt" ]]; then + log_error "无法解析 checksums,请检查 version.json 格式" + exit 1 + fi + + if [[ ! -f "$TEMP_DIR/install_order.txt" || ! -s "$TEMP_DIR/install_order.txt" ]]; then + log_error "无法解析 install_order,请检查 version.json 格式" + exit 1 + fi + fi + + log_success "版本信息解析完成" + log_info " 版本: $VERSION" + log_info " 构建时间: $BUILD_TIME" + + component_count=0 + if [[ -f "$TEMP_DIR/components.txt" ]]; then + component_count=$(wc -l < "$TEMP_DIR/components.txt") + log_info " 组件数量: $component_count" + log_info " 组件列表:" + while IFS= read -r line; do + component=$(echo "$line" | cut -d':' -f1) + version=$(echo "$line" | cut -d':' -f2) + log_info " - $component v$version" + done < "$TEMP_DIR/components.txt" + else + log_error "components.txt 文件不存在" + exit 1 + fi +} + +# 验证文件完整性 +verify_checksums() { + log_info "验证文件完整性..." + + artifact_dir=$(dirname "$VERSION_FILE_PATH") + log_info "Artifact 目录: $artifact_dir" + failed_verification=0 + + if [[ -f "$TEMP_DIR/checksums.txt" ]]; then + while IFS= read -r line; do + component=$(echo "$line" | cut -d':' -f1) + expected_checksum=$(echo "$line" | cut -d':' -f2-) + + # 查找匹配的 tar 文件 + actual_file="" + for file in "$artifact_dir/${component}-"*.tar.gz; do + if [[ -f "$file" ]]; then + actual_file="$file" + break + fi + done + + if [[ -z "$actual_file" ]]; then + log_error "找不到组件文件: $component" + failed_verification=1 + continue + fi + + # 计算实际校验和 + actual_checksum="sha256:$(sha256sum "$actual_file" | cut -d' ' -f1)" + + if [[ "$actual_checksum" == "$expected_checksum" ]]; then + log_success " $component: 校验通过" + else + log_error " $component: 校验失败" + log_error " 期望: $expected_checksum" + log_error " 实际: $actual_checksum" + failed_verification=1 + fi + done < "$TEMP_DIR/checksums.txt" + fi + + if [[ $failed_verification -eq 1 ]]; then + log_error "文件完整性验证失败" + exit 1 + fi + + log_success "所有文件校验通过" +} + +# 创建安装目录 +create_install_dirs() { + log_info "创建安装目录..." + + mkdir -p "$INSTALL_DIR" + mkdir -p "$TEMP_DIR" + + log_success "安装目录创建完成: $INSTALL_DIR" +} + +# 安装系统依赖包 +install_system_deps() { + log_info "检查系统依赖包..." + + local script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + local deps_dir="$script_dir/deps" + + # 检查deps目录是否存在 + if [[ ! -d "$deps_dir" ]]; then + log_info "deps 目录不存在,跳过系统依赖包安装" + return 0 + fi + + # 检查是否有tar.gz文件 + local deps_count=$(find "$deps_dir" -name "*.tar.gz" | wc -l) + if [[ $deps_count -eq 0 ]]; then + log_info "deps 目录中没有 tar.gz 文件,跳过系统依赖包安装" + return 0 + fi + + log_info "找到 $deps_count 个系统依赖包,开始安装..." + + # 创建临时目录用于解压依赖包 + local deps_temp_dir="$TEMP_DIR/deps" + mkdir -p "$deps_temp_dir" + + # 处理每个tar.gz文件 + find "$deps_dir" -name "*.tar.gz" | while read tar_file; do + local tar_basename=$(basename "$tar_file") + local extract_name="${tar_basename%.tar.gz}" + + log_info "处理依赖包: $tar_basename" + + # 解压到临时目录 + local extract_dir="$deps_temp_dir/$extract_name" + mkdir -p "$extract_dir" + + if tar -xzf "$tar_file" -C "$extract_dir" 2>/dev/null; then + log_success " $tar_basename 解压完成" + else + log_error " $tar_basename 解压失败" + continue + fi + + # 进入解压目录,查找deb包 + cd "$extract_dir" + local deb_count=$(find . -name "*.deb" | wc -l) + + if [[ $deb_count -gt 0 ]]; then + log_info " 找到 $deb_count 个 deb 包,开始安装..." + + # 1. 先尝试安装所有deb包 + log_info " 第1步:批量安装deb包..." 
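The `deps/*.tar.gz` bundles consumed by `install_system_deps` (such as `deps/cron-offline.tar.gz` added in this change) are expected to contain plain `.deb` packages. One possible way such a bundle could be produced on a machine with network access and a matching Ubuntu/Debian release; the package name and the dependency resolution here are best-effort assumptions, not something defined by this repository:

```bash
# Sketch: build an offline deb bundle similar to deps/cron-offline.tar.gz
mkdir -p cron-offline && cd cron-offline
apt-get download cron                                   # fetch the .deb without installing
apt-get download $(apt-cache depends cron \
  | awk '/Depends:/ {print $2}') || true                # direct dependencies, best effort
cd .. && tar -czf cron-offline.tar.gz cron-offline
```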
+ if dpkg -i *.deb 2>/dev/null; then + log_success " 所有deb包安装成功" + else + log_warning " 部分deb包安装失败,可能存在依赖问题" + + # 2. 使用apt-get修复依赖 + log_info " 第2步:修复依赖关系..." + if apt-get install -f -y; then + log_success " 依赖关系修复完成" + else + log_error " 依赖关系修复失败" + # 继续处理其他包,不退出 + fi + fi + else + log_info " $tar_basename 中没有找到deb包,跳过" + fi + + # 返回到依赖临时目录 + cd "$deps_temp_dir" + done + + # 检查并启动 cron 服务 + start_cron_service + + log_success "系统依赖包安装完成" +} + +# 启动 cron 服务 +start_cron_service() { + log_info "检查并启动 cron 服务..." + + # 检查 cron 是否已经在运行 + if pgrep -x "cron" > /dev/null; then + log_success "cron 服务已在运行" + return 0 + fi + + # 检查 /usr/sbin/cron 是否存在 + if [[ ! -f "/usr/sbin/cron" ]]; then + log_warning "cron 可执行文件不存在,跳过启动" + return 1 + fi + + # 启动 cron 服务 + log_info "启动 cron 服务..." + if /usr/sbin/cron start 2>/dev/null || /usr/sbin/cron 2>/dev/null; then + log_success "cron 服务启动成功" + + sleep 2 + + if pgrep -x "cron" > /dev/null; then + log_success "cron 服务运行正常" + else + log_warning "cron 服务可能未正常启动" + fi + else + log_error "cron 服务启动失败" + return 1 + fi +} + +# 安装组件 +install_components() { + log_info "开始安装组件..." + + artifact_dir=$(dirname "$VERSION_FILE_PATH") + log_info "Artifact 目录: $artifact_dir" + install_count=0 + total_count=0 + + if [[ -f "$TEMP_DIR/install_order.txt" ]]; then + total_count=$(wc -l < "$TEMP_DIR/install_order.txt") + fi + + if [[ -f "$TEMP_DIR/install_order.txt" ]]; then + while IFS= read -r filename; do + install_count=$((install_count + 1)) + + # 从文件名中提取组件名(去掉时间戳后缀) + component=$(echo "$filename" | sed 's/-[0-9]\{8\}-[0-9]\{6\}\.tar\.gz$//') + + log_info "[$install_count/$total_count] 安装 $component..." + log_info " 文件名: $filename" + + # 直接使用完整的文件名 + tar_file="$artifact_dir/$filename" + + if [[ ! -f "$tar_file" ]]; then + log_error "找不到组件文件: $filename" + log_info " 期望路径: $tar_file" + log_info " 当前目录: $(pwd)" + log_info " 目录内容:" + ls -la "$artifact_dir" | while read line; do + log_info " $line" + done + exit 1 + fi + + log_info " 找到文件: $tar_file" + + # 解压到临时目录 + component_temp_dir="$TEMP_DIR/$component" + mkdir -p "$component_temp_dir" + + if tar -xzf "$tar_file" -C "$component_temp_dir" 2>/dev/null; then + log_success " $component 解压完成" + else + log_error " $component 解压失败" + exit 1 + fi + + # 查找解压后的目录 + extracted_dir="" + for dir in "$component_temp_dir"/*; do + if [[ -d "$dir" ]]; then + extracted_dir="$dir" + break + fi + done + + if [[ -z "$extracted_dir" ]]; then + log_error " $component 解压后未找到目录" + exit 1 + fi + + # 执行安装脚本 + if [[ -f "$extracted_dir/install.sh" ]]; then + log_info " 执行 $component 安装脚本..." + if (cd "$extracted_dir" && ./install.sh "$INSTALL_DIR"); then + log_success " $component 安装完成" + else + log_error " $component 安装失败" + exit 1 + fi + else + log_error " $component 缺少 install.sh 文件" + exit 1 + fi + + # 将解压后的目录移动到安装目录,保留组件目录 + component_install_dir="$INSTALL_DIR/$component" + # 简化安装逻辑:直接删除旧目录,不进行备份 + if [[ -d "$component_install_dir" ]]; then + log_info " 组件目录已存在,删除旧版本: $component_install_dir" + rm -rf "$component_install_dir" + # log_info " 组件目录已存在,备份后更新: $component_install_dir" + # mv "$component_install_dir" "${component_install_dir}.backup.$(date +%Y%m%d_%H%M%S)" + fi + mv "$extracted_dir" "$component_install_dir" + log_success " 组件目录已保存: $component_install_dir" + + # 清理临时文件 + rm -rf "$component_temp_dir" + done < "$TEMP_DIR/install_order.txt" + fi + + log_success "所有组件安装完成" +} + +# 创建安装记录 +create_install_record() { + log_info "创建安装记录..." + + # 等待一段时间确保所有进程都已启动 + log_info "等待进程启动..." 
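The `.install_record` file written below is what the component installers and `check_health.sh` later read back with `jq` (for example to refresh the recorded PID). A few illustrative queries against it; the version directory shown is an example and depends on the installed version:

```bash
# Reading the install record created by this function
RECORD=/opt/argus-metric/versions/1.29.0/.install_record   # example path

jq -r '.version' "$RECORD"                                  # bundle version
jq -r '.components | keys[]' "$RECORD"                      # installed components
jq -r '.components."node-exporter".pid // ""' "$RECORD"     # recorded PID for node-exporter
```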
+ sleep 3 + + local install_time=$(date -u +"%Y-%m-%dT%H:%M:%SZ") + local install_record_file="$INSTALL_DIR/.install_record" + + # 创建 JSON 格式的安装记录 + cat > "$install_record_file" << EOF +{ + "version": "$VERSION", + "build_time": "$BUILD_TIME", + "install_time": "$install_time", + "install_dir": "$INSTALL_DIR", + "install_pid": $$, + "components": { +EOF + + # 添加组件信息 + local first_component=true + if [[ -f "$TEMP_DIR/components.txt" ]]; then + while IFS= read -r line; do + component=$(echo "$line" | cut -d':' -f1) + version=$(echo "$line" | cut -d':' -f2) + + # 获取组件的进程信息 + local component_pid="" + + # 根据组件名查找进程,使用多种方法确保能找到PID + case "$component" in + "node-exporter") + # 尝试多种方式查找node_exporter进程 + component_pid=$(pgrep -f "node_exporter" | head -1) + if [[ -z "$component_pid" ]]; then + component_pid=$(pgrep -f "node-exporter" | head -1) + fi + if [[ -z "$component_pid" ]]; then + component_pid=$(ps aux | grep -v grep | grep "node_exporter" | awk '{print $2}' | head -1) + fi + ;; + "dcgm-exporter") + # 查找dcgm-exporter进程 + component_pid=$(pgrep -f "dcgm-exporter" | head -1) + if [[ -z "$component_pid" ]]; then + component_pid=$(pgrep -f "dcgm_exporter" | head -1) + fi + if [[ -z "$component_pid" ]]; then + component_pid=$(ps aux | grep -v grep | grep "dcgm-exporter" | awk '{print $2}' | head -1) + fi + ;; + "fluent-bit") + # 查找fluent-bit进程 + component_pid=$(pgrep -f "fluent-bit" | head -1) + if [[ -z "$component_pid" ]]; then + component_pid=$(pgrep -f "fluent_bit" | head -1) + fi + if [[ -z "$component_pid" ]]; then + component_pid=$(ps aux | grep -v grep | grep "fluent-bit" | awk '{print $2}' | head -1) + fi + ;; + "argus-agent") + # 查找argus-agent进程 + component_pid=$(pgrep -f "argus-agent" | head -1) + if [[ -z "$component_pid" ]]; then + component_pid=$(ps aux | grep -v grep | grep "argus-agent" | awk '{print $2}' | head -1) + fi + ;; + esac + + # 记录找到的PID信息 + if [[ -n "$component_pid" ]]; then + log_info " 找到 $component 进程 PID: $component_pid" + else + log_warning " 未找到 $component 进程" + fi + + # 添加逗号分隔符 + if [[ "$first_component" == "true" ]]; then + first_component=false + else + echo "," >> "$install_record_file" + fi + + # 添加组件信息 + cat >> "$install_record_file" << EOF + "$component": { + "version": "$version", + "pid": "$component_pid", + "install_dir": "$INSTALL_DIR/$component" + } +EOF + done < "$TEMP_DIR/components.txt" + fi + + # 结束 JSON + cat >> "$install_record_file" << EOF + } +} +EOF + + log_success "安装记录已创建: $install_record_file" +} + +# 设置健康检查定时任务 +setup_health_check_cron() { + log_info "设置健康检查定时任务..." + + # 直接使用当前安装目录,不依赖current软链接 + # INSTALL_DIR 是 /opt/argus-metric/versions/1.34.0 + local check_health_script="$INSTALL_DIR/check_health.sh" + + # 检查健康检查脚本是否存在 + if [[ ! -f "$check_health_script" ]]; then + log_error "健康检查脚本不存在: $check_health_script" + return 1 + fi + + # 确保脚本有执行权限 + chmod +x "$check_health_script" + + # 创建临时crontab文件 + local temp_cron="/tmp/crontab_$$" + + # 获取当前用户的crontab(如果存在) + crontab -l 2>/dev/null > "$temp_cron" || touch "$temp_cron" + + # 检查并删除旧的健康检查任务 + if grep -q "check_health.sh" "$temp_cron"; then + log_info "发现旧的健康检查定时任务,正在更新..." 
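For reference, after this function and the three that follow (`setup_dns_sync_cron`, `setup_version_check_cron`, `setup_restart_cron`) have run, the root crontab should contain entries along these lines, with `<dir>` standing for the versioned install directory such as `/opt/argus-metric/versions/1.29.0`:

```bash
# crontab -l (expected result; <dir> is the versioned install directory)
# Argus-Metrics 健康检查定时任务
*/5 * * * * <dir>/check_health.sh >> <dir>/.health_cron.log 2>&1
# Argus-Metrics DNS 同步定时任务
* * * * * <dir>/sync_dns.sh >> <dir>/.dns_sync.log 2>&1
* * * * * sleep 30; <dir>/sync_dns.sh >> <dir>/.dns_sync.log 2>&1
# Argus-Metrics 版本校验定时任务
*/1 * * * * <dir>/check_version.sh >> <dir>/.version_check.log 2>&1
# Argus-Metrics 自动重启定时任务
*/2 * * * * <dir>/restart_unhealthy.sh >> <dir>/.restart.log 2>&1
```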
+ # 删除所有包含check_health.sh的行 + grep -v "check_health.sh" "$temp_cron" > "$temp_cron.new" + mv "$temp_cron.new" "$temp_cron" + log_info "旧的健康检查定时任务已删除" + fi + + # 添加新的定时任务(每5分钟执行一次) + echo "# Argus-Metrics 健康检查定时任务" >> "$temp_cron" + echo "*/5 * * * * $check_health_script >> $INSTALL_DIR/.health_cron.log 2>&1" >> "$temp_cron" + + # 安装新的crontab + if crontab "$temp_cron"; then + log_success "健康检查定时任务设置成功" + log_info " 执行频率: 每5分钟" + log_info " 日志文件: $INSTALL_DIR/.health_cron.log" + log_info " 查看定时任务: crontab -l" + log_info " 删除定时任务: crontab -e" + else + log_error "健康检查定时任务设置失败" + rm -f "$temp_cron" + return 1 + fi + + # 清理临时文件 + rm -f "$temp_cron" + + log_info "健康检查通过crontab自动执行" +} + +# 设置 DNS 同步定时任务 +setup_dns_sync_cron() { + log_info "设置 DNS 同步定时任务..." + + # 使用当前版本目录中的 DNS 同步脚本 + local sync_dns_script="$INSTALL_DIR/sync_dns.sh" + + # 检查 DNS 同步脚本是否存在 + if [[ ! -f "$sync_dns_script" ]]; then + log_warning "DNS 同步脚本不存在: $sync_dns_script" + log_warning "跳过 DNS 同步定时任务设置" + return 0 + fi + + # 确保脚本有执行权限 + chmod +x "$sync_dns_script" + + # 创建临时crontab文件 + local temp_cron="/tmp/crontab_$$" + + # 获取当前用户的crontab(如果存在) + crontab -l 2>/dev/null > "$temp_cron" || touch "$temp_cron" + + # 检查并删除旧的 DNS 同步任务 + if grep -q "sync_dns.sh" "$temp_cron"; then + log_info "发现旧的 DNS 同步定时任务,正在更新..." + # 删除所有包含sync_dns.sh的行 + grep -v "sync_dns.sh" "$temp_cron" > "$temp_cron.new" + mv "$temp_cron.new" "$temp_cron" + log_info "旧的 DNS 同步定时任务已删除" + fi + + # 添加新的定时任务(每30秒执行一次) + # 直接使用版本目录中的 DNS 同步脚本 + echo "# Argus-Metrics DNS 同步定时任务" >> "$temp_cron" + echo "* * * * * $sync_dns_script >> $INSTALL_DIR/.dns_sync.log 2>&1" >> "$temp_cron" + echo "* * * * * sleep 30; $sync_dns_script >> $INSTALL_DIR/.dns_sync.log 2>&1" >> "$temp_cron" + + # 安装新的crontab + if crontab "$temp_cron"; then + log_success "DNS 同步定时任务设置成功" + log_info " 执行频率: 每30秒" + log_info " 日志文件: $INSTALL_DIR/.dns_sync.log" + log_info " 查看定时任务: crontab -l" + log_info " 删除定时任务: crontab -e" + else + log_error "DNS 同步定时任务设置失败" + rm -f "$temp_cron" + return 1 + fi + + # 清理临时文件 + rm -f "$temp_cron" + + log_info "DNS 同步通过crontab自动执行" +} + +# 设置版本校验定时任务 +setup_version_check_cron() { + log_info "设置版本校验定时任务..." + + # 使用当前版本目录中的版本校验脚本 + local check_version_script="$INSTALL_DIR/check_version.sh" + + # 检查脚本是否存在 + if [[ ! -f "$check_version_script" ]]; then + log_warning "版本校验脚本不存在: $check_version_script" + log_info "跳过版本校验定时任务设置" + return 0 + fi + + # 确保脚本可执行 + chmod +x "$check_version_script" + + # 创建临时crontab文件 + local temp_cron="/tmp/crontab_$$" + crontab -l > "$temp_cron" 2>/dev/null || touch "$temp_cron" + + # 检查是否已存在版本校验定时任务 + if grep -q "check_version.sh" "$temp_cron"; then + log_info "发现旧的版本校验定时任务,正在更新..." + # 删除所有包含check_version.sh的行 + grep -v "check_version.sh" "$temp_cron" > "$temp_cron.new" + mv "$temp_cron.new" "$temp_cron" + log_info "旧的版本校验定时任务已删除" + fi + + # 添加新的定时任务(每30分钟执行一次) + echo "# Argus-Metrics 版本校验定时任务" >> "$temp_cron" + echo "*/1 * * * * $check_version_script >> $INSTALL_DIR/.version_check.log 2>&1" >> "$temp_cron" + + # 安装新的crontab + if crontab "$temp_cron"; then + log_success "版本校验定时任务设置成功" + log_info " 执行频率: 每1分钟" + log_info " 日志文件: $INSTALL_DIR/.version_check.log" + log_info " 查看定时任务: crontab -l" + log_info " 删除定时任务: crontab -e" + else + log_error "版本校验定时任务设置失败" + rm -f "$temp_cron" + return 1 + fi + + # 清理临时文件 + rm -f "$temp_cron" + + log_info "版本校验通过crontab自动执行" +} + +# 设置自动重启定时任务 +setup_restart_cron() { + log_info "设置自动重启定时任务..." + + # 使用当前版本目录中的重启脚本 + local restart_script="$INSTALL_DIR/restart_unhealthy.sh" + + # 检查脚本是否存在 + if [[ ! 
-f "$restart_script" ]]; then + log_warning "重启脚本不存在: $restart_script" + log_info "跳过自动重启定时任务设置" + return 0 + fi + + # 确保脚本可执行 + chmod +x "$restart_script" + + # 创建临时crontab文件 + local temp_cron="/tmp/crontab_$$" + crontab -l > "$temp_cron" 2>/dev/null || touch "$temp_cron" + + # 检查是否已存在自动重启定时任务 + if grep -q "restart_unhealthy.sh" "$temp_cron"; then + log_info "发现旧的自动重启定时任务,正在更新..." + # 删除所有包含restart_unhealthy.sh的行 + grep -v "restart_unhealthy.sh" "$temp_cron" > "$temp_cron.new" + mv "$temp_cron.new" "$temp_cron" + log_info "旧的自动重启定时任务已删除" + fi + + # 添加新的定时任务(每2分钟执行一次) + echo "# Argus-Metrics 自动重启定时任务" >> "$temp_cron" + echo "*/2 * * * * $restart_script >> $INSTALL_DIR/.restart.log 2>&1" >> "$temp_cron" + + # 安装新的crontab + if crontab "$temp_cron"; then + log_success "自动重启定时任务设置成功" + log_info " 执行频率: 每2分钟" + log_info " 日志文件: $INSTALL_DIR/.restart.log" + log_info " 查看定时任务: crontab -l" + log_info " 删除定时任务: crontab -e" + else + log_error "自动重启定时任务设置失败" + rm -f "$temp_cron" + return 1 + fi + + # 清理临时文件 + rm -f "$temp_cron" + + log_info "自动重启检查通过crontab自动执行" +} + +# 显示安装信息 +show_install_info() { + log_success "Argus-Metrics All-in-One 安装完成!" +} + +cleanup() { + if [[ -d "$TEMP_DIR" ]]; then + rm -rf "$TEMP_DIR" + fi +} + +trap cleanup EXIT + +# 主函数 +main() { + echo "==========================================" + echo " Argus-Metrics All-in-One 安装脚本 v1.0" + echo "==========================================" + echo + + # 加载配置文件 + load_config + + log_info "安装目录: $INSTALL_DIR" + echo + + check_root + check_system + find_version_file + create_install_dirs + parse_version_info + verify_checksums + install_system_deps + install_components + copy_config_files + create_install_record + setup_health_check_cron + setup_dns_sync_cron + setup_version_check_cron + setup_restart_cron + show_install_info +} + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/src/metric/client-plugins/all-in-one-demo/scripts/package_artifact.sh b/src/metric/client-plugins/all-in-one-demo/scripts/package_artifact.sh new file mode 100755 index 0000000..2c4bb6b --- /dev/null +++ b/src/metric/client-plugins/all-in-one-demo/scripts/package_artifact.sh @@ -0,0 +1,474 @@ +#!/bin/bash + +set -e + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 显示帮助信息 +show_help() { + echo "AIOps All-in-One 打包脚本" + echo + echo "用法: $0 [选项]" + echo + echo "选项:" + echo " --force 强制重新打包,即使版本已存在" + echo " --help 显示此帮助信息" + echo + echo "示例:" + echo " $0 # 正常打包,跳过已存在的版本" + echo " $0 --force # 强制重新打包" + echo +} + +# 解析命令行参数 +FORCE_PACKAGE=false +if [[ "$1" == "--force" ]]; then + FORCE_PACKAGE=true + log_info "强制重新打包模式" +elif [[ "$1" == "--help" || "$1" == "-h" ]]; then + show_help + exit 0 +fi + +# 获取当前目录和版本 +CURRENT_DIR=$(pwd) +VERSION=$(cat config/VERSION 2>/dev/null || echo "1.0.0") +ARTIFACT_DIR="artifact/$VERSION" + +log_info "开始打包 AIOps All-in-One 安装包 v$VERSION" + +# 检查必要文件 +log_info "检查必要文件..." +if [[ ! -f "config/VERSION" ]]; then + log_error "VERSION 文件不存在" + exit 1 +fi + +if [[ ! -f "config/checklist" ]]; then + log_error "checklist 文件不存在" + exit 1 +fi + +# 检查是否已存在该版本 +if [[ -d "$ARTIFACT_DIR" && "$FORCE_PACKAGE" == "false" ]]; then + log_info "检查版本 $VERSION 是否已存在..." 
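When packaging succeeds, `artifact/$VERSION/` ends up holding one timestamped tarball per checklist component plus `version.json` and the helper scripts copied near the end of this script. A rough illustration only; the component file names and timestamps are made up for the example:

```
artifact/1.29.0/
├── version.json                                      # metadata, checksums, install order
├── install.sh                                        # copied from scripts/install_artifact.sh
├── uninstall.sh  check_health.sh  check_version.sh  sync_dns.sh
├── node-exporter-installer-20250101-120000.tar.gz
└── dcgm-exporter-installer-20250101-120000.tar.gz
```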
+ + # 检查 version.json 是否存在 + if [[ -f "$ARTIFACT_DIR/version.json" ]]; then + log_info "找到已存在的版本信息文件" + + # 检查是否所有组件文件都存在 + missing_files=0 + existing_components=0 + + # 解析已存在的 version.json 来检查文件 + if command -v jq &> /dev/null; then + # 使用 jq 解析 + while IFS= read -r component; do + existing_components=$((existing_components + 1)) + # 查找对应的 tar 文件 + found_file=false + for file in "$ARTIFACT_DIR/${component}-"*.tar.gz; do + if [[ -f "$file" ]]; then + found_file=true + break + fi + done + if [[ "$found_file" == "false" ]]; then + missing_files=$((missing_files + 1)) + log_warning " 缺少文件: $component" + fi + done < <(jq -r '.artifact_list | keys[]' "$ARTIFACT_DIR/version.json" 2>/dev/null) + else + # 简单的文件检查 + for file in "$ARTIFACT_DIR"/*.tar.gz; do + if [[ -f "$file" ]]; then + existing_components=$((existing_components + 1)) + fi + done + fi + + # 如果所有文件都存在,则跳过打包 + if [[ $missing_files -eq 0 && $existing_components -gt 0 ]]; then + log_success "版本 $VERSION 已完整打包,跳过重复打包" + echo + echo "现有文件:" + ls -la "$ARTIFACT_DIR" + echo + echo "如需强制重新打包,请删除目录: rm -rf $ARTIFACT_DIR" + echo "或使用: ./package.sh --force" + exit 0 + else + log_warning "版本 $VERSION 存在但不完整,将重新打包" + log_info " 现有组件: $existing_components" + log_info " 缺少文件: $missing_files" + fi + else + log_warning "版本目录存在但缺少 version.json,将重新打包" + fi +fi + +# 创建 artifact 目录 +mkdir -p "$ARTIFACT_DIR" +log_info "创建输出目录: $ARTIFACT_DIR" + +# 创建临时文件存储数据 +TEMP_DIR=$(mktemp -d) +COMPONENTS_FILE="$TEMP_DIR/components.txt" +VERSIONS_FILE="$TEMP_DIR/versions.txt" +DEPENDENCIES_FILE="$TEMP_DIR/dependencies.txt" +INSTALL_ORDER_FILE="$TEMP_DIR/install_order.txt" +CHECKSUMS_FILE="$TEMP_DIR/checksums.txt" +ARTIFACT_LIST_FILE="$TEMP_DIR/artifact_list.txt" + +# 解析 checklist 文件 +log_info "解析组件清单..." +line_num=0 +component_count=0 + +while IFS= read -r line; do + [[ -z "$line" || "$line" =~ ^[[:space:]]*# ]] && continue + + line_num=$((line_num + 1)) + + # 解析行: 组件名 目录路径 版本 [依赖组件] [安装顺序] + read -r component component_path version dep_component order <<< "$line" + + if [[ -z "$component" || -z "$component_path" || -z "$version" ]]; then + log_warning "跳过无效行 $line_num: $line" + continue + fi + + # 存储组件信息 + echo "$component" >> "$COMPONENTS_FILE" + echo "$component:$version" >> "$VERSIONS_FILE" + echo "$component:$component_path" >> "$TEMP_DIR/component_paths.txt" + + if [[ -n "$dep_component" && "$dep_component" != "$component" ]]; then + echo "$component:$dep_component" >> "$DEPENDENCIES_FILE" + fi + + if [[ -n "$order" && "$order" =~ ^[0-9]+$ ]]; then + echo "$order:$component" >> "$INSTALL_ORDER_FILE" + else + # 如果没有指定顺序,按解析顺序分配 + echo "$line_num:$component" >> "$INSTALL_ORDER_FILE" + fi + + component_count=$((component_count + 1)) + log_info " - $component v$version" +done < config/checklist + +if [[ $component_count -eq 0 ]]; then + log_error "没有找到有效的组件" + rm -rf "$TEMP_DIR" + exit 1 +fi + +log_success "找到 $component_count 个组件" + +# 检查组件目录是否存在 +log_info "检查组件目录..." +missing_components=() + +while IFS= read -r component; do + # 获取组件路径 + component_path=$(grep "^$component:" "$TEMP_DIR/component_paths.txt" | cut -d':' -f2-) + if [[ -z "$component_path" ]]; then + log_error "未找到组件 $component 的路径配置" + log_info "请检查 component_paths.txt 文件或添加路径配置" + exit 1 + fi + + if [[ ! 
-d "$component_path" ]]; then + missing_components+=("$component:$component_path") + fi +done < "$COMPONENTS_FILE" + +if [[ ${#missing_components[@]} -gt 0 ]]; then + log_error "以下组件目录不存在:" + for component_path in "${missing_components[@]}"; do + echo " - $component_path" + done + rm -rf "$TEMP_DIR" + exit 1 +fi + +# 打包各个组件 +log_info "开始打包组件..." + +while IFS= read -r component; do + # 获取组件版本和路径 + version=$(grep "^$component:" "$VERSIONS_FILE" | cut -d':' -f2) + component_path=$(grep "^$component:" "$TEMP_DIR/component_paths.txt" | cut -d':' -f2-) + if [[ -z "$component_path" ]]; then + log_error "未找到组件 $component 的路径配置" + log_info "请检查 component_paths.txt 文件或添加路径配置" + exit 1 + fi + + log_info "打包 $component v$version..." + log_info " 组件路径: $component_path" + + # 进入组件目录 + cd "$component_path" + + # 检查组件是否有 package.sh + if [[ ! -f "package.sh" ]]; then + log_error "$component 缺少 package.sh 文件" + cd "$CURRENT_DIR" + rm -rf "$TEMP_DIR" + exit 1 + fi + + # 执行组件的打包脚本 + if ./package.sh; then + # 查找生成的 tar 包 + tar_file=$(find . -name "*.tar.gz" -type f | head -1) + if [[ -n "$tar_file" ]]; then + # 移动到 artifact 目录 + mv "$tar_file" "$CURRENT_DIR/$ARTIFACT_DIR/" + tar_filename=$(basename "$tar_file") + + # 计算校验和 + checksum=$(sha256sum "$CURRENT_DIR/$ARTIFACT_DIR/$tar_filename" | cut -d' ' -f1) + echo "$component:sha256:$checksum" >> "$CHECKSUMS_FILE" + echo "$component:$version" >> "$ARTIFACT_LIST_FILE" + + # 将完整的文件名存储到安装顺序文件中 + echo "$tar_filename" >> "$TEMP_DIR/install_order_files.txt" + + log_success " $component 打包完成: $tar_filename" + else + log_error "$component 打包失败,未找到生成的 tar 包" + cd "$CURRENT_DIR" + rm -rf "$TEMP_DIR" + exit 1 + fi + else + log_error "$component 打包失败" + cd "$CURRENT_DIR" + rm -rf "$TEMP_DIR" + exit 1 + fi + + # 返回主目录 + cd "$CURRENT_DIR" +done < "$COMPONENTS_FILE" + +# 生成 version.json +log_info "生成版本信息文件..." 
+version_json="$ARTIFACT_DIR/version.json" + +# 构建依赖关系 JSON +deps_json="" +if [[ -f "$DEPENDENCIES_FILE" ]]; then + first=true + while IFS= read -r line; do + component=$(echo "$line" | cut -d':' -f1) + dep=$(echo "$line" | cut -d':' -f2) + if [[ "$first" == "true" ]]; then + deps_json="\"$component\":[\"$dep\"]" + first=false + else + deps_json="$deps_json,\"$component\":[\"$dep\"]" + fi + done < "$DEPENDENCIES_FILE" +fi + +# 构建安装顺序数组 +order_array="" +if [[ -f "$TEMP_DIR/install_order_files.txt" ]]; then + first=true + while IFS= read -r filename; do + if [[ "$first" == "true" ]]; then + order_array="\"$filename\"" + first=false + else + order_array="$order_array,\"$filename\"" + fi + done < "$TEMP_DIR/install_order_files.txt" +fi + +# 构建 artifact_list JSON +artifact_json="" +if [[ -f "$ARTIFACT_LIST_FILE" ]]; then + first=true + while IFS= read -r line; do + component=$(echo "$line" | cut -d':' -f1) + version=$(echo "$line" | cut -d':' -f2) + if [[ "$first" == "true" ]]; then + artifact_json="\"$component\":\"$version\"" + first=false + else + artifact_json="$artifact_json,\"$component\":\"$version\"" + fi + done < "$ARTIFACT_LIST_FILE" +fi + +# 构建 checksums JSON +checksums_json="" +if [[ -f "$CHECKSUMS_FILE" ]]; then + first=true + while IFS= read -r line; do + component=$(echo "$line" | cut -d':' -f1) + checksum=$(echo "$line" | cut -d':' -f2-) + if [[ "$first" == "true" ]]; then + checksums_json="\"$component\":\"$checksum\"" + first=false + else + checksums_json="$checksums_json,\"$component\":\"$checksum\"" + fi + done < "$CHECKSUMS_FILE" +fi + +# 生成完整的 version.json +cat > "$version_json" << EOF +{ + "version": "$VERSION", + "build_time": "$(date -u +%Y-%m-%dT%H:%M:%SZ)", + "artifact_list": { + $artifact_json + }, + "checksums": { + $checksums_json + }, + "dependencies": { + $deps_json + }, + "install_order": [ + $order_array + ] +} +EOF + +log_success "版本信息文件生成完成: $version_json" + +# 复制`安装`脚本到 artifact 目录 +log_info "复制安装脚本..." +if [[ -f "scripts/install_artifact.sh" ]]; then + cp "scripts/install_artifact.sh" "$ARTIFACT_DIR/install.sh" + chmod +x "$ARTIFACT_DIR/install.sh" + log_success "安装脚本复制完成: $ARTIFACT_DIR/install.sh" +else + log_warning "scripts/install_artifact.sh 文件不存在" +fi + +# 复制`卸载`脚本到 artifact 目录 +log_info "复制卸载脚本..." +if [[ -f "scripts/uninstall_artifact.sh" ]]; then + cp "scripts/uninstall_artifact.sh" "$ARTIFACT_DIR/uninstall.sh" + chmod +x "$ARTIFACT_DIR/uninstall.sh" + log_success "卸载脚本复制完成: $ARTIFACT_DIR/uninstall.sh" +else + log_warning "scripts/uninstall_artifact.sh 文件不存在" +fi + +# 复制`健康检查`脚本到 artifact 目录 +log_info "复制健康检查脚本..." +if [[ -f "scripts/check_health.sh" ]]; then + cp "scripts/check_health.sh" "$ARTIFACT_DIR/check_health.sh" + chmod +x "$ARTIFACT_DIR/check_health.sh" + log_success "健康检查脚本复制完成: $ARTIFACT_DIR/check_health.sh" +else + log_warning "scripts/check_health.sh 文件不存在" +fi + +# 复制`DNS 同步`脚本到 artifact 目录 +log_info "复制 DNS 同步脚本..." +if [[ -f "scripts/sync_dns.sh" ]]; then + cp "scripts/sync_dns.sh" "$ARTIFACT_DIR/sync_dns.sh" + chmod +x "$ARTIFACT_DIR/sync_dns.sh" + log_success "DNS 同步脚本复制完成: $ARTIFACT_DIR/sync_dns.sh" +else + log_warning "scripts/sync_dns.sh 文件不存在" +fi + +# 复制`版本校验`脚本到 artifact 目录 +log_info "复制版本校验脚本..." +if [[ -f "scripts/check_version.sh" ]]; then + cp "scripts/check_version.sh" "$ARTIFACT_DIR/check_version.sh" + chmod +x "$ARTIFACT_DIR/check_version.sh" + log_success "版本校验脚本复制完成: $ARTIFACT_DIR/check_version.sh" +else + log_warning "scripts/check_version.sh 文件不存在" +fi + +# 复制`自动重启`脚本到 artifact 目录 +log_info "复制自动重启脚本..." 
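+# 说明:restart_unhealthy.sh 随制品分发后,由 install_artifact.sh 写入 crontab(每 2 分钟执行一次),
+# 用于自动拉起健康检查失败的组件。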
+if [[ -f "scripts/restart_unhealthy.sh" ]]; then + cp "scripts/restart_unhealthy.sh" "$ARTIFACT_DIR/restart_unhealthy.sh" + chmod +x "$ARTIFACT_DIR/restart_unhealthy.sh" + log_success "自动重启脚本复制完成: $ARTIFACT_DIR/restart_unhealthy.sh" +else + log_warning "scripts/restart_unhealthy.sh 文件不存在" +fi + +# 复制配置文件到 artifact 目录 +log_info "复制配置文件..." +if [[ -f "config/config.env" ]]; then + cp "config/config.env" "$ARTIFACT_DIR/" + log_success "配置文件复制完成: $ARTIFACT_DIR/config.env" +else + log_warning "config 目录不存在,跳过配置文件复制" +fi + +# DNS 配置文件不需要复制到版本目录,直接从 FTP 服务器根目录获取 + +# 复制 deps 目录到 artifact 目录 +log_info "复制系统依赖包..." +if [[ -d "deps" ]]; then + cp -r "deps" "$ARTIFACT_DIR/" + log_success "系统依赖包复制完成: $ARTIFACT_DIR/deps" + + # 显示deps目录内容 + log_info " 依赖包列表:" + find "$ARTIFACT_DIR/deps" -name "*.tar.gz" -exec basename {} \; | while read dep_file; do + log_info " - $dep_file" + done +else + log_warning "deps 目录不存在,跳过依赖包复制" +fi + +# 显示打包结果 +log_success "打包完成!" +echo +echo "版本: $VERSION" +echo "输出目录: $ARTIFACT_DIR" +echo "包含组件:" +if [[ -f "$ARTIFACT_LIST_FILE" ]]; then + while IFS= read -r line; do + component=$(echo "$line" | cut -d':' -f1) + version=$(echo "$line" | cut -d':' -f2) + echo " - $component v$version" + done < "$ARTIFACT_LIST_FILE" +fi +echo +echo "文件列表:" +ls -la "$ARTIFACT_DIR" +echo + +# 清理临时文件 +rm -rf "$TEMP_DIR" diff --git a/src/metric/client-plugins/all-in-one-demo/scripts/publish_artifact.sh b/src/metric/client-plugins/all-in-one-demo/scripts/publish_artifact.sh new file mode 100755 index 0000000..7b93099 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-demo/scripts/publish_artifact.sh @@ -0,0 +1,214 @@ +#!/bin/bash + +set -e + +# 颜色定义 +GREEN='\033[0;32m' +BLUE='\033[0;34m' +RED='\033[0;31m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 显示帮助信息 +show_help() { + echo "Argus-Metric Artifact 发布脚本" + echo + echo "用法: $0 <版本号>" + echo + echo "参数:" + echo " <版本号> 要发布的版本号,对应 artifact 目录中的版本" + echo + echo "示例:" + echo " $0 1.20.0 # 发布 1.20.0 版本" + echo +} + +# 检查参数 +if [[ $# -ne 1 ]]; then + log_error "请提供版本号参数" + show_help + exit 1 +fi + +VERSION="$1" +ARTIFACT_DIR="artifact/$VERSION" +PUBLISH_DIR="/Users/sundapeng/Project/nlp/aiops/client-plugins/all-in-one/publish/" + +# 检查版本目录是否存在 +if [[ ! -d "$ARTIFACT_DIR" ]]; then + log_error "版本目录不存在: $ARTIFACT_DIR" + exit 1 +fi + +log_info "开始发布版本: $VERSION" + +# 确保发布目录存在 +log_info "确保发布目录存在: $PUBLISH_DIR" +mkdir -p "$PUBLISH_DIR" + +# 创建临时目录用于打包 +TEMP_PACKAGE_DIR="/tmp/argus-metric-package-$$" +mkdir -p "$TEMP_PACKAGE_DIR" + +# 复制所有 tar.gz 文件到临时目录 +log_info "准备 artifact 文件..." +tar_files=$(find "$ARTIFACT_DIR" -name "*.tar.gz" -type f) + +if [[ -z "$tar_files" ]]; then + log_error "在 $ARTIFACT_DIR 中未找到 tar.gz 文件" + exit 1 +fi + +for file in $tar_files; do + filename=$(basename "$file") + log_info " 准备: $filename" + cp "$file" "$TEMP_PACKAGE_DIR/" +done + +# 复制版本信息文件 +if [[ -f "$ARTIFACT_DIR/version.json" ]]; then + log_info "复制版本信息文件..." + cp "$ARTIFACT_DIR/version.json" "$TEMP_PACKAGE_DIR/" +fi + +# 复制健康检查脚本 +if [[ -f "$ARTIFACT_DIR/check_health.sh" ]]; then + log_info "复制健康检查脚本..." + cp "$ARTIFACT_DIR/check_health.sh" "$TEMP_PACKAGE_DIR/" +elif [[ -f "scripts/check_health.sh" ]]; then + log_info "复制健康检查脚本 (从当前目录)..." 
+ cp "scripts/check_health.sh" "$TEMP_PACKAGE_DIR/" +else + log_warning "未找到 check_health.sh 文件" +fi + +# 复制 DNS 同步脚本 +if [[ -f "$ARTIFACT_DIR/sync_dns.sh" ]]; then + log_info "复制 DNS 同步脚本..." + cp "$ARTIFACT_DIR/sync_dns.sh" "$TEMP_PACKAGE_DIR/" +elif [[ -f "scripts/sync_dns.sh" ]]; then + log_info "复制 DNS 同步脚本 (从当前目录)..." + cp "scripts/sync_dns.sh" "$TEMP_PACKAGE_DIR/" +else + log_warning "未找到 sync_dns.sh 文件" +fi + +# 复制版本校验脚本 +if [[ -f "$ARTIFACT_DIR/check_version.sh" ]]; then + log_info "复制版本校验脚本..." + cp "$ARTIFACT_DIR/check_version.sh" "$TEMP_PACKAGE_DIR/" +elif [[ -f "scripts/check_version.sh" ]]; then + log_info "复制版本校验脚本 (从当前目录)..." + cp "scripts/check_version.sh" "$TEMP_PACKAGE_DIR/" +else + log_warning "未找到 check_version.sh 文件" +fi + +# 复制重启失败脚本 +if [[ -f "$ARTIFACT_DIR/restart_unhealthy.sh" ]]; then + log_info "复制重启失败脚本..." + cp "$ARTIFACT_DIR/restart_unhealthy.sh" "$TEMP_PACKAGE_DIR/" +elif [[ -f "scripts/restart_unhealthy.sh" ]]; then + log_info "复制重启失败脚本 (从当前目录)..." + cp "scripts/restart_unhealthy.sh" "$TEMP_PACKAGE_DIR/" +else + log_warning "未找到 restart_unhealthy.sh 文件" +fi + +# 复制安装脚本并重命名为 install.sh +if [[ -f "scripts/install_artifact.sh" ]]; then + log_info "复制安装脚本..." + cp "scripts/install_artifact.sh" "$TEMP_PACKAGE_DIR/install.sh" +fi + +if [[ -f "scripts/uninstall_artifact.sh" ]]; then + log_info "复制卸载脚本..." + cp "scripts/uninstall_artifact.sh" "$TEMP_PACKAGE_DIR/uninstall.sh" +fi + +# 复制配置文件 +if [[ -f "$ARTIFACT_DIR/config.env" ]]; then + log_info "复制配置文件..." + cp "$ARTIFACT_DIR/config.env" "$TEMP_PACKAGE_DIR/" + log_success "配置文件复制完成" +else + log_warning "未找到 config.env 文件" +fi + +# DNS 配置文件将在后面直接复制到发布目录根目录,不包含在 tar.gz 中 + +# 复制 deps 目录 +if [[ -d "$ARTIFACT_DIR/deps" ]]; then + log_info "复制系统依赖包..." + cp -r "$ARTIFACT_DIR/deps" "$TEMP_PACKAGE_DIR/" + log_success "系统依赖包复制完成" +fi + +# 创建tar包,使用新的命名规范 +TAR_NAME="argus-metric_$(echo $VERSION | tr '.' '_').tar.gz" +log_info "创建发布包: $TAR_NAME" +cd "$TEMP_PACKAGE_DIR" +tar -czf "$PUBLISH_DIR/$TAR_NAME" * +cd - > /dev/null + +# 清理临时目录 +rm -rf "$TEMP_PACKAGE_DIR" + +# 更新 LATEST_VERSION 文件 +log_info "更新 LATEST_VERSION 文件..." +echo "$VERSION" > "$PUBLISH_DIR/LATEST_VERSION" + +# 复制 DNS 配置文件到发布目录根目录(直接从 config 目录复制) +if [[ -f "config/dns.conf" ]]; then + log_info "复制 DNS 配置文件到发布目录根目录..." + cp "config/dns.conf" "$PUBLISH_DIR/" + log_success "DNS 配置文件复制完成: $PUBLISH_DIR/dns.conf" +else + log_warning "未找到 config/dns.conf 文件,跳过 DNS 配置文件复制" +fi + +# 复制 setup.sh 到发布目录 +if [[ -f "scripts/setup.sh" ]]; then + log_info "复制 setup.sh 到发布目录..." + cp "scripts/setup.sh" "$PUBLISH_DIR/" +fi + +# 显示发布结果 +log_success "版本 $VERSION 发布完成!" +echo +echo "发布目录: $PUBLISH_DIR" +echo "发布包: $PUBLISH_DIR/$TAR_NAME" +echo "包大小: $(du -h "$PUBLISH_DIR/$TAR_NAME" | cut -f1)" +echo "最新版本: $(cat "$PUBLISH_DIR/LATEST_VERSION")" +echo +echo "发布目录中的文件:" +ls -la "$PUBLISH_DIR" | while read line; do + echo " $line" +done +echo +echo "使用方法:" +echo " 1. 确保 /srv/ftp/share 目录可通过 FTP 访问" +echo " 2. 用户首先下载安装脚本:" +echo " curl -u ftpuser:admin1234 ftp://10.211.55.4/setup.sh -o setup.sh" +echo " 3. 然后执行安装 (自动获取最新版本):" +echo " sudo sh setup.sh" +echo " 4. 或者指定版本安装:" +echo " sudo sh setup.sh --version $VERSION" +echo " 5. 
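+    # 说明:重启的实现是"先执行组件目录内的 uninstall.sh,再重新执行 install.sh",
+    # 这里假定每个组件的安装目录都自带这两个脚本(install.sh 缺失时上面会记录错误并返回失败)。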
或者指定不同的FTP服务器:" +echo " sudo sh setup.sh --server 192.168.1.100 --user myuser --password mypass" diff --git a/src/metric/client-plugins/all-in-one-demo/scripts/restart_unhealthy.sh b/src/metric/client-plugins/all-in-one-demo/scripts/restart_unhealthy.sh new file mode 100755 index 0000000..7e54693 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-demo/scripts/restart_unhealthy.sh @@ -0,0 +1,328 @@ +#!/bin/bash + +# 此脚本会检查各组件的健康状态,并重启不健康的组件 + +# 获取脚本所在目录 +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +INSTALL_RECORD_FILE="$SCRIPT_DIR/.install_record" + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $(date '+%Y-%m-%d %H:%M:%S') - $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $(date '+%Y-%m-%d %H:%M:%S') - $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $(date '+%Y-%m-%d %H:%M:%S') - $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $(date '+%Y-%m-%d %H:%M:%S') - $1" +} + +# 加载配置文件 +load_config() { + local config_file="$SCRIPT_DIR/config.env" + + if [[ -f "$config_file" ]]; then + log_info "加载配置文件: $config_file" + set -a + source "$config_file" + set +a + log_success "配置文件加载完成" + else + log_warning "配置文件不存在: $config_file,使用默认配置" + fi +} + +# 检查单个组件健康状态 +check_component_health() { + local component_name="$1" + local check_script_path="$2" + + if [[ ! -f "$check_script_path" ]]; then + log_error "$component_name: 健康检查脚本不存在: $check_script_path" + return 1 + fi + + if [[ ! -x "$check_script_path" ]]; then + chmod +x "$check_script_path" 2>/dev/null || true + fi + + # 执行健康检查,捕获退出码 + if "$check_script_path" > /dev/null 2>&1; then + return 0 + else + return 1 + fi +} + +# 重启单个组件 +restart_component() { + local component_name="$1" + local install_dir="$2" + + log_warning "正在重启组件: $component_name" + + # 先执行卸载脚本 + local uninstall_script="$install_dir/uninstall.sh" + if [[ -f "$uninstall_script" ]]; then + log_info "$component_name: 执行卸载脚本..." + chmod +x "$uninstall_script" 2>/dev/null || true + # 使用 yes 命令自动回答所有确认提示 + yes 2>/dev/null | (cd "$install_dir" && "$uninstall_script") || true + log_info "$component_name: 卸载完成" + fi + + # 执行安装脚本 + local install_script="$install_dir/install.sh" + if [[ ! -f "$install_script" ]]; then + log_error "$component_name: 安装脚本不存在: $install_script" + return 1 + fi + + chmod +x "$install_script" 2>/dev/null || true + log_info "$component_name: 执行安装脚本..." 
+ + # 使用 yes 命令自动回答所有确认提示,传递 SCRIPT_DIR 作为参数 + yes 2>/dev/null | (cd "$install_dir" && "$install_script" "$SCRIPT_DIR") || true + + log_info "$component_name: 安装脚本执行完成" + return 0 +} + +# 查找组件进程 PID +find_component_pid() { + local component_name="$1" + local component_pid="" + + case "$component_name" in + "node-exporter") + component_pid=$(pgrep -f "node_exporter" | head -1) + if [[ -z "$component_pid" ]]; then + component_pid=$(pgrep -f "node-exporter" | head -1) + fi + if [[ -z "$component_pid" ]]; then + component_pid=$(ps aux | grep -v grep | grep "node_exporter" | awk '{print $2}' | head -1) + fi + ;; + "dcgm-exporter") + component_pid=$(pgrep -f "dcgm-exporter" | head -1) + if [[ -z "$component_pid" ]]; then + component_pid=$(pgrep -f "dcgm_exporter" | head -1) + fi + if [[ -z "$component_pid" ]]; then + component_pid=$(ps aux | grep -v grep | grep "dcgm-exporter" | awk '{print $2}' | head -1) + fi + ;; + "fluent-bit") + component_pid=$(pgrep -f "fluent-bit" | head -1) + if [[ -z "$component_pid" ]]; then + component_pid=$(pgrep -f "fluent_bit" | head -1) + fi + if [[ -z "$component_pid" ]]; then + component_pid=$(ps aux | grep -v grep | grep "fluent-bit" | awk '{print $2}' | head -1) + fi + ;; + "argus-agent") + component_pid=$(pgrep -f "argus-agent" | head -1) + if [[ -z "$component_pid" ]]; then + component_pid=$(ps aux | grep -v grep | grep "argus-agent" | awk '{print $2}' | head -1) + fi + ;; + esac + + echo "$component_pid" +} + +# 更新安装记录文件中的 PID +update_install_record_pid() { + local component_name="$1" + local new_pid="$2" + + if [[ ! -f "$INSTALL_RECORD_FILE" ]]; then + log_error "安装记录文件不存在: $INSTALL_RECORD_FILE" + return 1 + fi + + # 读取当前 PID + local current_pid="" + if command -v jq &> /dev/null; then + current_pid=$(jq -r --arg comp "$component_name" '.components[$comp].pid // ""' "$INSTALL_RECORD_FILE" 2>/dev/null) + fi + + if [[ -z "$current_pid" ]]; then + log_warning "$component_name: 无法读取当前 PID,跳过更新" + return 1 + fi + + # 使用 sed 精确替换 PID,保持原有格式不变 + # 只替换指定组件块中的 pid 字段 + local temp_file="${INSTALL_RECORD_FILE}.tmp" + local in_component=0 + local updated=0 + + while IFS= read -r line; do + if [[ "$line" =~ \"$component_name\":[[:space:]]*\{ ]]; then + in_component=1 + echo "$line" + elif [[ $in_component -eq 1 && "$line" =~ \"pid\":[[:space:]]*\"$current_pid\" ]]; then + echo "$line" | sed "s/\"pid\": \"$current_pid\"/\"pid\": \"$new_pid\"/" + updated=1 + in_component=0 + else + echo "$line" + if [[ "$line" =~ ^[[:space:]]*\}[[:space:]]*$ ]]; then + in_component=0 + fi + fi + done < "$INSTALL_RECORD_FILE" > "$temp_file" + + # 验证替换是否成功 + if [[ $updated -eq 1 ]]; then + mv "$temp_file" "$INSTALL_RECORD_FILE" + log_success "$component_name: PID 已更新为 $new_pid(原值: $current_pid)" + return 0 + else + log_error "$component_name: PID 替换失败" + rm -f "$temp_file" + return 1 + fi +} + +# 从安装记录文件中读取组件信息 +read_install_record() { + local install_record_file="$1" + + if [[ ! 
-f "$install_record_file" ]]; then + log_error "安装记录文件不存在: $install_record_file" + return 1 + fi + + # 检查是否有 jq 命令来解析 JSON + if command -v jq &> /dev/null; then + # 使用 jq 解析 JSON + local components_json + if components_json=$(jq -r '.components | to_entries[] | "\(.key):\(.value.install_dir)"' "$install_record_file" 2>/dev/null); then + echo "$components_json" + return 0 + else + log_error "无法解析安装记录文件 JSON 格式: $install_record_file" + return 1 + fi + else + # 如果没有 jq,尝试简单的文本解析 + log_warning "jq 命令不可用,尝试简单文本解析" + + # 查找所有 install_dir 行 + local components=() + while IFS= read -r line; do + if [[ "$line" =~ \"install_dir\":[[:space:]]*\"([^\"]+)\" ]]; then + local install_dir="${BASH_REMATCH[1]}" + # 从路径中提取组件名称 + local component_name=$(basename "$install_dir") + components+=("$component_name:$install_dir") + fi + done < "$install_record_file" + + if [[ ${#components[@]} -gt 0 ]]; then + printf '%s\n' "${components[@]}" + return 0 + else + log_error "无法从安装记录文件中提取组件信息" + return 1 + fi + fi +} + +# 主函数 +main() { + log_info "==========================================" + log_info " 组件自动重启检查" + log_info "==========================================" + + # 检查是否是root用户 + if [[ $EUID -ne 0 ]]; then + log_error "此脚本需要 root 权限运行" + exit 1 + fi + + # 加载配置文件 + load_config + + # 从安装记录文件中读取组件信息 + log_info "从安装记录文件读取组件信息: $INSTALL_RECORD_FILE" + local components_info + if ! components_info=$(read_install_record "$INSTALL_RECORD_FILE"); then + log_error "无法读取安装记录文件,自动重启检查终止" + exit 1 + fi + + local restart_count=0 + local check_count=0 + + # 逐个检查组件 + while IFS= read -r component_info; do + if [[ -n "$component_info" ]]; then + IFS=':' read -r component_name install_dir <<< "$component_info" + check_count=$((check_count + 1)) + + local check_script_path="$install_dir/check_health.sh" + + log_info "检查组件: $component_name" + + # 检查健康状态 + if check_component_health "$component_name" "$check_script_path"; then + log_success "$component_name: 运行正常" + else + log_warning "$component_name: 健康检查失败,尝试重启" + restart_count=$((restart_count + 1)) + + # 执行重启 + restart_component "$component_name" "$install_dir" + + # 等待服务启动 + log_info "$component_name: 等待进程启动..." 
+ sleep 10 + + # 查找新的进程 PID + local new_pid=$(find_component_pid "$component_name") + if [[ -n "$new_pid" ]]; then + log_info "$component_name: 找到新进程 PID: $new_pid" + update_install_record_pid "$component_name" "$new_pid" + else + log_warning "$component_name: 未找到新进程 PID" + fi + + # 再次检查健康状态 + if check_component_health "$component_name" "$check_script_path"; then + log_success "$component_name: 重启成功" + else + log_warning "$component_name: 重启后仍不健康,可能需要手动检查" + fi + fi + fi + done <<< "$components_info" + + log_info "==========================================" + log_info "检查完成: 共检查 $check_count 个组件,尝试重启 $restart_count 个" + log_info "==========================================" + + exit 0 +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi + diff --git a/src/metric/client-plugins/all-in-one-demo/scripts/setup.sh b/src/metric/client-plugins/all-in-one-demo/scripts/setup.sh new file mode 100755 index 0000000..0c36bce --- /dev/null +++ b/src/metric/client-plugins/all-in-one-demo/scripts/setup.sh @@ -0,0 +1,931 @@ +#!/bin/bash + +set -e + +# 加载配置文件(仅在解压后的目录中可用) +load_config() { + # setup.sh 脚本不需要配置文件,FTP参数通过命令行参数或环境变量提供 + log_info "setup.sh 脚本使用命令行参数或环境变量获取FTP配置" +} + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +FTP_SERVER="${FTP_SERVER}" +FTP_USER="${FTP_USER}" +FTP_PASS="${FTP_PASS}" +FTP_PORT="${FTP_PORT:-21}" +BASE_URL="" # FTP基础URL (将在check_ftp_params中设置) +LATEST_VERSION_URL="" # 版本文件URL (将在check_ftp_params中设置) +TEMP_DIR="/tmp/argus-metric-install-$$" + +# 安装目录配置 +DEFAULT_INSTALL_DIR="/opt/argus-metric" # 默认安装目录 +INSTALL_DIR="${INSTALL_DIR:-$DEFAULT_INSTALL_DIR}" # 可通过环境变量覆盖 +VERSIONS_DIR="$INSTALL_DIR/versions" # 版本目录 +BACKUPS_DIR="$INSTALL_DIR/backups" # 备份目录 +CURRENT_LINK="$INSTALL_DIR/current" # 当前版本软链接 +LATEST_VERSION_FILE="$INSTALL_DIR/LATEST_VERSION" # 当前版本记录文件 + +# 检查必需的FTP参数 +check_ftp_params() { + local missing_params=() + + if [[ -z "$FTP_SERVER" ]]; then + missing_params+=("FTP_SERVER") + fi + + if [[ -z "$FTP_USER" ]]; then + missing_params+=("FTP_USER") + fi + + if [[ -z "$FTP_PASS" ]]; then + missing_params+=("FTP_PASS") + fi + + if [[ ${#missing_params[@]} -gt 0 ]]; then + log_error "缺少必需的FTP参数: ${missing_params[*]}" + log_error "请通过以下方式之一设置FTP参数:" + log_error " 1. 命令行参数: --server <地址> --user <用户名> --password <密码>" + log_error " 2. 环境变量: FTP_SERVER=<地址> FTP_USER=<用户名> FTP_PASS=<密码>" + log_error "" + log_error "示例:" + log_error " sudo sh setup.sh --server 10.211.55.4 --user ftpuser --password admin1234" + log_error " FTP_SERVER=10.211.55.4 FTP_USER=ftpuser FTP_PASS=admin1234 sudo sh setup.sh" + exit 1 + fi + + # 设置BASE_URL和LATEST_VERSION_URL + BASE_URL="ftp://${FTP_SERVER}:${FTP_PORT}" + LATEST_VERSION_URL="$BASE_URL/LATEST_VERSION" + + log_info "FTP配置:" + log_info " 服务器: $FTP_SERVER:$FTP_PORT" + log_info " 用户: $FTP_USER" +} + +# 获取最新版本号的函数 +get_latest_version() { + log_info "获取最新版本信息..." >&2 + log_info "尝试从URL获取: $LATEST_VERSION_URL" >&2 + + # 先测试FTP连接 + log_info "测试FTP连接..." >&2 + if ! curl -u "${FTP_USER}:${FTP_PASS}" -sfI "$LATEST_VERSION_URL" >/dev/null 2>&1; then + log_error "无法连接到FTP服务器或文件不存在" >&2 + log_error "URL: $LATEST_VERSION_URL" >&2 + log_error "请检查:" >&2 + log_error " 1. FTP服务器是否运行: $FTP_SERVER:$FTP_PORT" >&2 + log_error " 2. 
用户名密码是否正确: $FTP_USER" >&2 + log_error " 3. LATEST_VERSION文件是否存在" >&2 + log_error "手动测试命令: curl -u ${FTP_USER}:${FTP_PASS} ftp://${FTP_SERVER}/LATEST_VERSION" >&2 + exit 1 + fi + + # 获取文件内容 + if ! LATEST_VERSION=$(curl -u "${FTP_USER}:${FTP_PASS}" -sfL "$LATEST_VERSION_URL" 2>/dev/null | tr -d '[:space:]'); then + log_error "下载LATEST_VERSION文件失败" >&2 + exit 1 + fi + + log_info "原始获取内容: '$LATEST_VERSION'" >&2 + + if [[ -z "$LATEST_VERSION" ]]; then + log_error "获取到的版本信息为空" >&2 + log_error "可能的原因:" >&2 + log_error " 1. LATEST_VERSION文件为空" >&2 + log_error " 2. 文件内容格式不正确" >&2 + log_error " 3. 网络传输问题" >&2 + log_error "请检查FTP服务器上的 /srv/ftp/share/LATEST_VERSION 文件" >&2 + exit 1 + fi + + log_info "检测到最新版本: $LATEST_VERSION" >&2 + echo "$LATEST_VERSION" +} + +# 解析参数 +ARGUS_VERSION="" # 使用不同的变量名避免与系统VERSION冲突 +ACTION="install" +FORCE_INSTALL=false + +while [[ $# -gt 0 ]]; do + case $1 in + --version) + ARGUS_VERSION="$2" + shift 2 + ;; + --server) + FTP_SERVER="$2" + shift 2 + ;; + --user) + FTP_USER="$2" + shift 2 + ;; + --password) + FTP_PASS="$2" + shift 2 + ;; + --port) + FTP_PORT="$2" + shift 2 + ;; + --uninstall) + ACTION="uninstall" + shift + ;; + --install-dir) + INSTALL_DIR="$2" + shift 2 + ;; + # 简化安装逻辑:不再支持回滚和备份列表功能 + # --rollback) + # ACTION="rollback" + # shift + # ;; + # --backup-list) + # ACTION="backup-list" + # shift + # ;; + --status) + ACTION="status" + shift + ;; + --force) + FORCE_INSTALL=true + shift + ;; + --help) + echo "Argus Metric FTP在线安装脚本" + echo + echo "用法: curl -u <用户名>:<密码> ftp://<服务器>/setup.sh -o setup.sh && sh setup.sh [选项]" + echo + echo "必需参数 (必须通过命令行参数或环境变量设置):" + echo " --server SERVER FTP服务器地址 (必须)" + echo " --user USER FTP用户名 (必须)" + echo " --password PASS FTP密码 (必须)" + echo + echo "可选参数:" + echo " --version VERSION 指定版本 (默认: 自动获取最新版本)" + echo " --port PORT FTP端口 (默认: 21)" + echo " --install-dir DIR 安装目录 (默认: /opt/argus-metric)" + echo " --force 强制重新安装 (即使相同版本)" + echo " --uninstall 卸载 (自动确认)" + # echo " --rollback 回滚到上一个备份版本" + # echo " --backup-list 列出所有备份版本" + echo " --status 显示当前安装状态" + echo " --help 显示帮助" + echo + echo "环境变量:" + echo " FTP_SERVER FTP服务器地址 (必须)" + echo " FTP_USER FTP用户名 (必须)" + echo " FTP_PASS FTP密码 (必须)" + echo " FTP_PORT FTP端口 (默认: 21)" + echo + echo "示例:" + echo " # 方式1: 使用命令行参数" + echo " curl -u ftpuser:admin1234 ftp://10.211.55.4/setup.sh -o setup.sh" + echo " sudo sh setup.sh --server 10.211.55.4 --user ftpuser --password admin1234" + echo " " + echo " # 方式2: 使用环境变量" + echo " FTP_SERVER=10.211.55.4 FTP_USER=ftpuser FTP_PASS=admin1234 sudo sh setup.sh" + echo " " + echo " # 指定版本安装" + echo " sudo sh setup.sh --server 10.211.55.4 --user ftpuser --password admin1234 --version 1.30.0" + echo " " + echo " # 强制重新安装" + echo " sudo sh setup.sh --server 10.211.55.4 --user ftpuser --password admin1234 --force" + echo " " + echo " # 卸载" + echo " sudo sh setup.sh --server 10.211.55.4 --user ftpuser --password admin1234 --uninstall" + exit 0 + ;; + *) + log_error "未知参数: $1" + echo "使用 --help 查看帮助信息" + exit 1 + ;; + esac +done + +# 清理函数 +cleanup() { + if [[ -d "$TEMP_DIR" ]]; then + rm -rf "$TEMP_DIR" + fi +} + +trap cleanup EXIT + +# 创建安装目录结构 +create_install_directories() { + log_info "创建安装目录结构..." 
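+    # 目录布局(示意,基于本脚本中各路径变量的默认值):
+    #   /opt/argus-metric/
+    #   ├── versions/<版本号>/              # 各版本完整安装内容
+    #   ├── backups/                        # 备份目录(当前简化流程基本不使用)
+    #   ├── current -> versions/<版本号>    # 指向当前版本的软链接
+    #   ├── LATEST_VERSION                  # 当前安装版本号记录文件
+    #   └── dns.conf                        # 由 DNS 同步脚本维护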
+ + # 创建主要目录 + mkdir -p "$VERSIONS_DIR" + mkdir -p "$BACKUPS_DIR" + + log_success "安装目录结构创建完成: $INSTALL_DIR" +} + +# 获取当前安装的版本 +get_current_version() { + # 优先从LATEST_VERSION文件读取 + if [[ -f "$LATEST_VERSION_FILE" ]]; then + local version_from_file=$(cat "$LATEST_VERSION_FILE" 2>/dev/null | tr -d '[:space:]') + if [[ -n "$version_from_file" ]]; then + # 确保版本号格式一致(不带v前缀) + echo "$version_from_file" + return 0 + fi + fi + + # 如果文件不存在或为空,从软链接读取 + if [[ -L "$CURRENT_LINK" ]]; then + local current_path=$(readlink "$CURRENT_LINK") + # 从版本目录名中提取版本号(现在不带v前缀) + basename "$current_path" + else + echo "" + fi +} + +# 检查是否已安装 +check_installed() { + if [[ -L "$CURRENT_LINK" ]] && [[ -d "$CURRENT_LINK" ]]; then + local current_version=$(get_current_version) + if [[ -n "$current_version" ]]; then + log_info "检测到已安装版本: v$current_version" + return 0 + fi + fi + return 1 +} + +# 更新LATEST_VERSION文件 +update_latest_version_file() { + local version="$1" + log_info "更新LATEST_VERSION文件: $version" + + if echo "$version" > "$LATEST_VERSION_FILE"; then + log_success "LATEST_VERSION文件已更新" + else + log_error "更新LATEST_VERSION文件失败" + return 1 + fi +} + +# 初始化 DNS 配置文件到系统目录 +init_dns_config_to_system() { + log_info "初始化 DNS 配置文件到系统目录..." + + # 系统 DNS 配置文件 + local system_dns_conf="$INSTALL_DIR/dns.conf" + + # 如果系统目录中还没有 dns.conf,创建一个空的占位文件 + if [[ ! -f "$system_dns_conf" ]]; then + touch "$system_dns_conf" + chmod 644 "$system_dns_conf" + log_success "DNS 配置文件占位文件已创建: $system_dns_conf" + log_info "DNS 同步脚本将从 FTP 服务器下载实际的 DNS 配置" + else + log_info "DNS 配置文件已存在: $system_dns_conf" + fi +} + +# 备份当前版本 +backup_current_version() { + local current_version=$(get_current_version) + if [[ -z "$current_version" ]]; then + log_info "没有当前版本需要备份" + return 0 + fi + + # 确保备份目录存在 + mkdir -p "$BACKUPS_DIR" + + local backup_name="$current_version" + local backup_path="$BACKUPS_DIR/$backup_name" + + log_info "备份当前版本 $current_version 到: $backup_path" + + # 如果备份已存在,先删除 + if [[ -d "$backup_path" ]]; then + log_info "备份版本已存在,覆盖: $backup_path" + rm -rf "$backup_path" + fi + + # 复制当前版本目录(跟随软链接复制实际内容) + if cp -rL "$CURRENT_LINK" "$backup_path"; then + log_success "版本备份完成: $backup_name" + + else + log_error "版本备份失败" + exit 1 + fi +} + +# 回滚到备份版本 +rollback_to_backup() { + local backup_name="$1" + + # 确保备份目录存在 + mkdir -p "$BACKUPS_DIR" + + local backup_path="$BACKUPS_DIR/$backup_name" + + if [[ ! -d "$backup_path" ]]; then + log_error "备份不存在: $backup_path" + return 1 + fi + + log_info "回滚到备份版本: $backup_name" + + # 停止当前服务 + stop_services + + # 检查是否存在对应的版本目录 + local version_dir="$VERSIONS_DIR/$backup_name" + + if [[ ! -d "$version_dir" ]]; then + log_info "版本目录不存在,从备份恢复版本目录: $version_dir" + # 从备份目录恢复到版本目录 + mkdir -p "$VERSIONS_DIR" + cp -r "$backup_path" "$version_dir" + fi + + # 恢复软链接指向版本目录 + if ln -sfn "$version_dir" "$CURRENT_LINK"; then + log_success "版本回滚完成: $backup_name" + + # 更新LATEST_VERSION文件 + update_latest_version_file "$backup_name" + + return 0 + else + log_error "版本回滚失败" + return 1 + fi +} + +# 停止服务 +stop_services() { + log_info "停止当前服务..." + + # 检查服务是否正在运行 + if ! check_services_running; then + log_info "服务未运行,无需停止" + return 0 + fi + + # 尝试使用卸载脚本停止服务 + if [[ -f "$CURRENT_LINK/uninstall.sh" ]]; then + cd "$CURRENT_LINK" + chmod +x uninstall.sh + + # 自动确认停止服务(避免交互式确认) + echo "y" | ./uninstall.sh >/dev/null 2>&1 + local stop_exit_code=$? 
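+        # 注:上面管道的退出码取自最后一个命令(即 uninstall.sh),echo 的结果不影响 $?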
+ + if [[ $stop_exit_code -eq 0 ]]; then + log_success "服务停止完成" + else + log_warning "停止服务时出现警告,尝试手动停止" + manual_stop_services + fi + else + log_warning "未找到卸载脚本,尝试手动停止服务" + manual_stop_services + fi +} + +# 手动停止服务 +manual_stop_services() { + log_info "手动停止服务..." + + # 停止 node_exporter + if pgrep -f "node_exporter" >/dev/null 2>&1; then + pkill -f "node_exporter" && log_info "node_exporter 已停止" + fi + + # 停止 dcgm_exporter + if pgrep -f "dcgm_exporter" >/dev/null 2>&1; then + pkill -f "dcgm_exporter" && log_info "dcgm_exporter 已停止" + fi + + # 等待进程完全停止 + sleep 2 + + # 检查是否还有残留进程 + if pgrep -f "node_exporter\|dcgm_exporter" >/dev/null 2>&1; then + log_warning "仍有服务进程运行,尝试强制停止" + pkill -9 -f "node_exporter\|dcgm_exporter" 2>/dev/null || true + fi + + log_success "手动停止服务完成" +} + +# 启动服务 +start_services() { + log_info "启动服务..." + + # 检查服务是否已经在运行 + if check_services_running; then + log_info "服务已在运行,跳过启动" + return 0 + fi + + # 由于 install_artifact.sh 已经安装了所有组件并设置了健康检查定时任务 + # 这里只需要简单验证服务状态即可 + log_info "组件已安装完成,健康检查定时任务已设置" + log_info "服务将在健康检查时自动启动(每5分钟检查一次)" + + # 等待一下让服务有时间启动 + sleep 3 + + # 验证服务状态 + if check_services_running; then + log_success "服务启动成功" + else + log_info "服务可能正在启动中,健康检查机制将自动监控" + fi + + return 0 +} + +# 检查服务是否正在运行 +check_services_running() { + # 检查常见的服务端口是否在监听 + local ports=(9100 9400) # node-exporter 和 dcgm-exporter 的默认端口 + + for port in "${ports[@]}"; do + if netstat -tlnp 2>/dev/null | grep -q ":$port "; then + log_info "检测到服务正在端口 $port 上运行" + return 0 + fi + done + + # 检查相关进程 + if pgrep -f "node_exporter\|dcgm_exporter" >/dev/null 2>&1; then + log_info "检测到相关服务进程正在运行" + return 0 + fi + + return 1 +} + +# 检查是否为 root 用户 +check_root() { + if [[ $EUID -ne 0 ]]; then + log_error "此脚本需要 root 权限运行" + log_info "请使用: sudo sh setup.sh" + exit 1 + fi +} + +# 检查系统要求 +check_system() { + log_info "检查系统要求..." + + # 检查操作系统 + if [[ ! -f /etc/os-release ]]; then + log_error "无法检测操作系统版本" + exit 1 + fi + + # 读取系统信息,使用子shell避免污染当前环境变量 + local OS_INFO=$(source /etc/os-release && echo "$NAME $VERSION_ID") + log_info "检测到操作系统: $OS_INFO" + + # 检查系统架构 + arch=$(uname -m) + log_info "系统架构: $arch" + + # 检查磁盘空间 + available_space=$(df / | awk 'NR==2 {print $4}') + if [[ $available_space -lt 1024 ]]; then + log_warning "可用磁盘空间不足 1GB,当前可用: $(($available_space / 1024 / 1024))GB" + fi +} + +# 下载并安装 +install_argus_metric() { + # 如果没有指定版本,获取最新版本 + if [[ -z "$ARGUS_VERSION" ]]; then + ARGUS_VERSION=$(get_latest_version) + fi + + log_info "开始安装 Argus Metric v$ARGUS_VERSION..." + log_info "安装目录: $INSTALL_DIR" + + # 创建安装目录结构(必须先创建,以便备份时目录存在) + create_install_directories + + # 检查是否已安装 + local is_upgrade=false + if check_installed; then + local current_version=$(get_current_version) + if [[ "$current_version" == "$ARGUS_VERSION" ]]; then + if [[ "$FORCE_INSTALL" == true ]]; then + log_info "检测到相同版本 v$ARGUS_VERSION,但使用了 --force 参数,将强制重新安装" + is_upgrade=true + # 简化安装逻辑:不再备份当前版本 + # backup_current_version + else + log_info "版本 v$ARGUS_VERSION 已安装,无需重复安装" + log_info "如需强制重新安装,请使用 --force 参数" + return 0 + fi + else + log_info "检测到版本升级: v$current_version -> v$ARGUS_VERSION" + is_upgrade=true + + # 简化安装逻辑:不再备份当前版本 + # backup_current_version + fi + fi + + # 创建临时目录 + mkdir -p "$TEMP_DIR" + cd "$TEMP_DIR" + + # 下载发布包,使用新的命名规范 + TAR_NAME="argus-metric_$(echo $ARGUS_VERSION | tr '.' 
'_').tar.gz" + log_info "下载发布包: $TAR_NAME" + log_info "从FTP服务器下载: $FTP_SERVER:$FTP_PORT, 用户: $FTP_USER" + + # 构造curl命令并显示(隐藏密码) + CURL_CMD="curl -u \"${FTP_USER}:***\" -sfL \"$BASE_URL/$TAR_NAME\" -o \"$TAR_NAME\"" + log_info "执行命令: $CURL_CMD" + + if ! curl -u "${FTP_USER}:${FTP_PASS}" -sfL "$BASE_URL/$TAR_NAME" -o "$TAR_NAME"; then + log_error "下载发布包失败: $BASE_URL/$TAR_NAME" + log_error "完整命令: curl -u \"${FTP_USER}:${FTP_PASS}\" -sfL \"$BASE_URL/$TAR_NAME\" -o \"$TAR_NAME\"" + log_error "请检查FTP服务器连接、用户名密码是否正确" + exit 1 + fi + + # 解压发布包到当前目录 + log_info "解压发布包..." + if ! tar -xzf "$TAR_NAME"; then + log_error "解压发布包失败" + exit 1 + fi + + # 显示解压后的文件结构 + log_info "解压后的文件结构:" + ls -la "$TEMP_DIR" + + # 准备版本目录 + local version_dir="$VERSIONS_DIR/$ARGUS_VERSION" + log_info "安装到版本目录: $version_dir" + + # 如果升级,先停止服务 + if [[ "$is_upgrade" == true ]]; then + stop_services + fi + + # 创建版本目录 + if [[ -d "$version_dir" ]]; then + log_info "版本目录已存在,备份后更新" + rm -rf "$version_dir" + fi + + # 创建新的版本目录 + mkdir -p "$version_dir" + + # 移动解压的文件到版本目录 + log_info "移动文件到版本目录: $TEMP_DIR/* -> $version_dir/" + + # 检查源目录是否有内容 + if [[ ! "$(ls -A "$TEMP_DIR" 2>/dev/null)" ]]; then + log_error "临时目录为空,无法移动文件" + exit 1 + fi + + # 检查目标目录是否存在 + if [[ ! -d "$version_dir" ]]; then + log_error "目标版本目录不存在: $version_dir" + exit 1 + fi + + # 执行文件移动 + if mv "$TEMP_DIR"/* "$version_dir" 2>/dev/null; then + log_success "文件移动到版本目录完成" + else + log_error "移动文件到版本目录失败" + log_error "源目录内容:" + ls -la "$TEMP_DIR" || true + log_error "目标目录状态:" + ls -la "$version_dir" || true + log_error "权限检查:" + ls -ld "$TEMP_DIR" "$version_dir" || true + exit 1 + fi + + # 执行安装脚本 + log_info "执行安装脚本..." + cd "$version_dir" + if [[ -f "install.sh" ]]; then + chmod +x install.sh + # 传递安装根目录给安装脚本,让install_artifact.sh安装到正确的版本目录 + if ./install.sh "$version_dir"; then + log_success "安装脚本执行完成" + else + log_error "安装脚本执行失败" + # 简化安装逻辑:不再自动回滚 + # if [[ "$is_upgrade" == true ]]; then + # log_warning "升级失败,尝试回滚到之前版本..." + # # 确保备份目录存在 + # mkdir -p "$BACKUPS_DIR" + # local latest_backup=$(ls -1t "$BACKUPS_DIR" 2>/dev/null | head -n 1) + # if [[ -n "$latest_backup" ]]; then + # rollback_to_backup "$latest_backup" + # return 1 + # fi + # fi + exit 1 + fi + else + log_error "未找到安装脚本 install.sh" + exit 1 + fi + + # 更新软链接指向新版本 + log_info "更新当前版本链接..." + + # 如果 current 已经存在且是目录,先删除它 + if [[ -d "$CURRENT_LINK" ]] && [[ ! -L "$CURRENT_LINK" ]]; then + log_warning "发现 current 是目录而不是符号链接,正在删除..." + rm -rf "$CURRENT_LINK" + fi + + if ln -sfn "$version_dir" "$CURRENT_LINK"; then + log_success "版本链接更新完成: $CURRENT_LINK -> $version_dir" + else + log_error "版本链接更新失败" + exit 1 + fi + + # 更新LATEST_VERSION文件 + update_latest_version_file "$ARGUS_VERSION" + + # 初始化 DNS 配置文件到系统目录 + init_dns_config_to_system + + # 启动服务 + # start_services + + log_success "Argus Metric v$ARGUS_VERSION 安装完成!" + + # 显示安装信息 + echo + log_info "安装信息:" + log_info " 版本: $ARGUS_VERSION" + log_info " 安装目录: $INSTALL_DIR" + log_info " 版本目录: $version_dir" + log_info " 当前链接: $CURRENT_LINK" + if [[ "$is_upgrade" == true ]]; then + log_info " 升级类型: 版本升级" + else + log_info " 安装类型: 全新安装" + fi +} + +# 卸载 +uninstall_argus_metric() { + log_info "开始卸载 Argus Metric..." + log_info "安装目录: $INSTALL_DIR" + + # 检查是否已安装 + if ! check_installed; then + log_info "未检测到已安装的 Argus Metric" + return 0 + fi + + local current_version=$(get_current_version) + log_info "检测到当前版本: v$current_version" + + # 停止服务 + stop_services + + # 执行卸载脚本 + log_info "执行卸载脚本..." 
+ if [[ -f "$CURRENT_LINK/uninstall.sh" ]]; then + cd "$CURRENT_LINK" + chmod +x uninstall.sh + + # 自动确认卸载(因为用户已经明确使用了 --uninstall 参数) + log_info "自动确认卸载操作..." + echo "y" | ./uninstall.sh + local uninstall_exit_code=$? + + if [[ $uninstall_exit_code -eq 0 ]]; then + log_success "卸载脚本执行完成" + else + log_error "卸载脚本执行失败 (退出码: $uninstall_exit_code)" + exit 1 + fi + else + log_warning "未找到卸载脚本,执行基本清理" + fi + + # 清理安装目录 + log_info "清理安装目录..." + if [[ -d "$INSTALL_DIR" ]]; then + # 询问是否完全删除安装目录 + log_warning "这将删除整个安装目录: $INSTALL_DIR" + log_warning "包括所有版本、备份和配置文件" + + # 在自动化环境中,直接删除 + if rm -rf "$INSTALL_DIR"; then + log_success "安装目录已完全清理: $INSTALL_DIR" + else + log_error "清理安装目录失败" + exit 1 + fi + else + log_info "安装目录不存在,无需清理" + fi + + log_success "Argus Metric 卸载完成!" +} + +# 显示状态 +show_status() { + echo "==========================================" + echo " Argus Metric 安装状态" + echo "==========================================" + echo + + if check_installed; then + local current_version=$(get_current_version) + log_info "当前版本: $current_version" + log_info "安装目录: $INSTALL_DIR" + log_info "当前链接: $CURRENT_LINK" + log_info "版本目录: $VERSIONS_DIR/$current_version" + log_info "版本文件: $LATEST_VERSION_FILE" + + # 显示LATEST_VERSION文件内容 + if [[ -f "$LATEST_VERSION_FILE" ]]; then + local file_version=$(cat "$LATEST_VERSION_FILE" 2>/dev/null | tr -d '[:space:]') + log_info "版本文件内容: $file_version" + fi + + echo + log_info "目录结构:" + if [[ -d "$INSTALL_DIR" ]]; then + tree -L 2 "$INSTALL_DIR" 2>/dev/null || ls -la "$INSTALL_DIR" + fi + + echo + log_info "可用版本:" + if [[ -d "$VERSIONS_DIR" ]]; then + ls -1 "$VERSIONS_DIR" 2>/dev/null | sed 's/^/ - /' + else + echo " 无" + fi + + # 简化安装逻辑:不再显示备份版本信息 + # echo + # log_info "备份版本:" + # if [[ -d "$BACKUPS_DIR" ]] && [[ $(ls -1 "$BACKUPS_DIR" 2>/dev/null | wc -l) -gt 0 ]]; then + # ls -1t "$BACKUPS_DIR" 2>/dev/null | sed 's/^/ - /' + # else + # echo " 无" + # fi + else + log_warning "Argus Metric 未安装" + log_info "安装目录: $INSTALL_DIR" + fi +} + +# 列出备份 +list_backups() { + echo "==========================================" + echo " Argus Metric 备份列表" + echo "==========================================" + echo + + if [[ -d "$BACKUPS_DIR" ]] && [[ $(ls -1 "$BACKUPS_DIR" 2>/dev/null | wc -l) -gt 0 ]]; then + log_info "可用备份版本:" + ls -1t "$BACKUPS_DIR" 2>/dev/null | while read backup; do + local backup_time=$(stat -c %y "$BACKUPS_DIR/$backup" 2>/dev/null | cut -d' ' -f1-2) + echo " - $backup (创建时间: $backup_time)" + done + else + log_warning "没有可用的备份版本" + fi +} + +# 回滚功能 +rollback_version() { + log_info "开始回滚操作..." + + if ! check_installed; then + log_error "没有检测到已安装的版本,无法回滚" + exit 1 + fi + + # 确保备份目录存在 + mkdir -p "$BACKUPS_DIR" + + # 获取最新的备份 + local latest_backup=$(ls -1t "$BACKUPS_DIR" 2>/dev/null | head -n 1) + if [[ -z "$latest_backup" ]]; then + log_error "没有找到可用的备份版本" + exit 1 + fi + + log_info "将回滚到备份版本: $latest_backup" + + if rollback_to_backup "$latest_backup"; then + log_success "回滚完成!" 
+ + # 显示当前状态 + echo + show_status + else + log_error "回滚失败" + exit 1 + fi +} + +# 主函数 +main() { + echo "==========================================" + echo " Argus Metric 在线安装脚本 v1.0" + echo "==========================================" + echo + + # 加载配置文件 + load_config + + # 对于状态操作,不需要FTP参数和root权限 + # 简化安装逻辑:不再支持备份列表操作 + if [[ "$ACTION" == "status" ]]; then + show_status + return 0 + fi + # if [[ "$ACTION" == "status" || "$ACTION" == "backup-list" ]]; then + # if [[ "$ACTION" == "status" ]]; then + # show_status + # elif [[ "$ACTION" == "backup-list" ]]; then + # list_backups + # fi + # return 0 + # fi + + check_root + + # 更新目录配置变量(在设置INSTALL_DIR后) + VERSIONS_DIR="$INSTALL_DIR/versions" + BACKUPS_DIR="$INSTALL_DIR/backups" + CURRENT_LINK="$INSTALL_DIR/current" + LATEST_VERSION_FILE="$INSTALL_DIR/LATEST_VERSION" + + # 简化安装逻辑:不再支持回滚操作 + # if [[ "$ACTION" == "rollback" ]]; then + # rollback_version + # return 0 + # fi + + check_ftp_params + check_system + + if [[ "$ACTION" == "uninstall" ]]; then + uninstall_argus_metric + else + install_argus_metric + fi + + echo + log_info "操作完成!" +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/src/metric/client-plugins/all-in-one-demo/scripts/sync_dns.sh b/src/metric/client-plugins/all-in-one-demo/scripts/sync_dns.sh new file mode 100755 index 0000000..9e05f24 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-demo/scripts/sync_dns.sh @@ -0,0 +1,244 @@ +#!/bin/bash + +# DNS 同步脚本 +# 比较 FTP 根目录的 dns.conf 和本地的 dns.conf,如果有变化则同步到 /etc/resolv.conf + +set -e + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 - 输出到 stderr 避免影响函数返回值 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" >&2 +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" >&2 +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" >&2 +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" >&2 +} + +# 获取脚本所在目录 +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +LOCAL_DNS_CONF="/opt/argus-metric/dns.conf" +REMOTE_DNS_CONF_URL="" +RESOLV_CONF="/etc/resolv.conf" +LOG_FILE="/opt/argus-metric/.dns_sync.log" + +# 从环境变量或配置文件获取 FTP 服务器信息 +get_ftp_config() { + # 优先从环境变量获取配置 + log_info "获取 FTP 配置信息..." + + # 如果环境变量中没有设置,则尝试从配置文件读取 + if [[ -z "$FTP_SERVER" || -z "$FTP_USER" || -z "$FTP_PASSWORD" ]]; then + local config_file="$SCRIPT_DIR/config.env" + if [[ -f "$config_file" ]]; then + log_info "从配置文件读取 FTP 配置: $config_file" + source "$config_file" + fi + else + log_info "使用环境变量中的 FTP 配置" + fi + + # 设置默认值(如果环境变量和配置文件都没有设置) + FTP_SERVER="${FTP_SERVER:-localhost}" + FTP_USER="${FTP_USER:-ftpuser}" + FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}" + + # 构建远程 DNS 配置文件 URL + REMOTE_DNS_CONF_URL="ftp://${FTP_USER}:${FTP_PASSWORD}@${FTP_SERVER}/dns.conf" + + log_info "FTP 配置来源: ${FTP_CONFIG_SOURCE:-环境变量/配置文件}" +} + +# 下载远程 DNS 配置文件 +download_remote_dns_conf() { + local temp_file="/tmp/dns.conf.remote.$$" + + log_info "从 FTP 服务器下载 DNS 配置文件..." + log_info "远程地址: $REMOTE_DNS_CONF_URL" + log_info "FTP 服务器: $FTP_SERVER" + log_info "FTP 用户: $FTP_USER" + + # 先测试 FTP 连接 + log_info "测试 FTP 连接..." + if curl -u "${FTP_USER}:${FTP_PASSWORD}" -sfI "ftp://${FTP_SERVER}/" >/dev/null 2>&1; then + log_success "FTP 服务器连接成功" + else + log_error "无法连接到 FTP 服务器: $FTP_SERVER" + log_error "请检查:" + log_error " 1. FTP 服务器是否运行" + log_error " 2. 网络连接是否正常" + log_error " 3. 服务器地址是否正确" + return 1 + fi + + # 测试 dns.conf 文件是否存在 + log_info "检查远程 dns.conf 文件是否存在..." 
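+    # 这里用 curl -I(仅获取文件元信息、不下载内容)探测远端 dns.conf 是否可访问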
+ if curl -u "${FTP_USER}:${FTP_PASSWORD}" -sfI "ftp://${FTP_SERVER}/dns.conf" >/dev/null 2>&1; then + log_success "远程 dns.conf 文件存在" + else + log_error "远程 dns.conf 文件不存在或无法访问" + log_error "请检查 FTP 服务器根目录下是否有 dns.conf 文件" + return 1 + fi + + # 尝试下载文件 + log_info "开始下载 dns.conf 文件..." + if curl -u "${FTP_USER}:${FTP_PASSWORD}" -sf "ftp://${FTP_SERVER}/dns.conf" -o "$temp_file" 2>/dev/null; then + log_success "远程 DNS 配置文件下载成功" + echo "$temp_file" + else + log_error "下载 dns.conf 文件失败" + log_error "尝试手动测试命令:" + log_error " curl -u ${FTP_USER}:${FTP_PASSWORD} ftp://${FTP_SERVER}/dns.conf" + rm -f "$temp_file" + return 1 + fi +} + +# 比较两个文件是否相同 +compare_files() { + local file1="$1" + local file2="$2" + + if [[ ! -f "$file1" || ! -f "$file2" ]]; then + return 1 + fi + + # 使用 diff 比较文件内容 + if diff -q "$file1" "$file2" >/dev/null 2>&1; then + return 0 # 文件相同 + else + return 1 # 文件不同 + fi +} + +# 将 DNS 配置追加到 /etc/resolv.conf +update_resolv_conf() { + local dns_conf_file="$1" + + log_info "更新 /etc/resolv.conf 文件..." + + # 备份原始文件 + if [[ -f "$RESOLV_CONF" ]]; then + cp "$RESOLV_CONF" "${RESOLV_CONF}.backup.$(date +%Y%m%d_%H%M%S)" + log_info "已备份原始 resolv.conf 文件" + fi + + # 读取 DNS 配置文件并追加到 resolv.conf + while IFS= read -r line; do + # 跳过空行和注释行 + [[ -z "$line" || "$line" =~ ^[[:space:]]*# ]] && continue + + # 验证是否为有效的 IP 地址 + if [[ "$line" =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then + # 检查是否已存在相同的 nameserver 行 + if ! grep -q "nameserver $line" "$RESOLV_CONF" 2>/dev/null; then + echo "nameserver $line" >> "$RESOLV_CONF" + log_info "添加 DNS 服务器: $line" + else + log_info "DNS 服务器已存在,跳过: $line" + fi + else + log_warning "跳过无效的 DNS 地址: $line" + fi + done < "$dns_conf_file" + + # 设置文件权限 + chmod 644 "$RESOLV_CONF" + + log_success "/etc/resolv.conf 文件更新完成" +} + +# 记录同步日志 +log_sync() { + local message="$1" + local timestamp=$(date '+%Y-%m-%d %H:%M:%S') + echo "[$timestamp] $message" >> "$LOG_FILE" +} + +# 主函数 +main() { + log_info "开始 DNS 同步检查..." + log_sync "DNS 同步检查开始" + + # 确保系统目录存在 + mkdir -p "/opt/argus-metric" + + # 获取 FTP 配置 + get_ftp_config + + # 检查本地 DNS 配置文件是否存在 + if [[ ! -f "$LOCAL_DNS_CONF" ]]; then + log_warning "本地 DNS 配置文件不存在: $LOCAL_DNS_CONF" + log_warning "将下载远程配置文件并更新系统 DNS 设置" + + # 下载远程配置文件 + if remote_file=$(download_remote_dns_conf); then + # 复制到本地 + cp "$remote_file" "$LOCAL_DNS_CONF" + log_success "远程 DNS 配置文件已保存到本地" + + # 更新 resolv.conf + update_resolv_conf "$LOCAL_DNS_CONF" + log_sync "首次同步完成,DNS 配置已更新" + + # 清理临时文件 + rm -f "$remote_file" + else + log_error "无法下载远程 DNS 配置文件,同步失败" + log_sync "同步失败:无法下载远程配置文件" + exit 1 + fi + else + log_info "本地 DNS 配置文件存在: $LOCAL_DNS_CONF" + + # 下载远程配置文件进行比较 + if remote_file=$(download_remote_dns_conf); then + # 比较文件 + if compare_files "$LOCAL_DNS_CONF" "$remote_file"; then + log_info "DNS 配置文件无变化,无需更新" + log_sync "DNS 配置文件无变化" + else + log_info "检测到 DNS 配置文件有变化,开始同步..." 
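+            # 同步动作:用远端文件覆盖本地 dns.conf,并把其中新的 nameserver 追加到 /etc/resolv.conf(已存在的条目会跳过)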
+ log_sync "检测到 DNS 配置文件变化,开始同步" + + # 更新本地配置文件 + cp "$remote_file" "$LOCAL_DNS_CONF" + log_success "本地 DNS 配置文件已更新" + + # 更新 resolv.conf + update_resolv_conf "$LOCAL_DNS_CONF" + log_sync "DNS 配置同步完成" + fi + + # 清理临时文件 + rm -f "$remote_file" + else + log_error "无法下载远程 DNS 配置文件,跳过本次同步" + log_sync "同步失败:无法下载远程配置文件" + exit 1 + fi + fi + + log_success "DNS 同步检查完成" + log_sync "DNS 同步检查完成" +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/src/metric/client-plugins/all-in-one-demo/scripts/uninstall_artifact.sh b/src/metric/client-plugins/all-in-one-demo/scripts/uninstall_artifact.sh new file mode 100755 index 0000000..ca137a7 --- /dev/null +++ b/src/metric/client-plugins/all-in-one-demo/scripts/uninstall_artifact.sh @@ -0,0 +1,274 @@ +#!/bin/bash + +set -e + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 配置变量 +INSTALL_DIR="/opt/argus-metric" +TEMP_DIR="/tmp/argus-metric-uninstall-$$" +VERSION_FILE="version.json" + +# 检查是否为 root 用户 +check_root() { + if [[ $EUID -ne 0 ]]; then + log_error "此脚本需要 root 权限运行" + log_info "请使用: sudo $0" + exit 1 + fi +} + +# 查找版本文件 +find_version_file() { + log_info "查找版本信息文件..." + + # 在当前目录查找 + if [[ -f "$VERSION_FILE" ]]; then + VERSION_FILE_PATH="$VERSION_FILE" + log_success "找到版本文件: $VERSION_FILE" + return 0 + fi + + # 在 artifact 目录查找 + for version_dir in artifact/*/; do + if [[ -f "${version_dir}${VERSION_FILE}" ]]; then + VERSION_FILE_PATH="${version_dir}${VERSION_FILE}" + log_success "找到版本文件: $VERSION_FILE_PATH" + return 0 + fi + done + + log_error "未找到版本信息文件 $VERSION_FILE" + log_info "请确保在正确的目录下运行此脚本" + exit 1 +} + +# 解析版本信息 +parse_version_info() { + log_info "解析版本信息..." + + if [[ ! -f "$VERSION_FILE_PATH" ]]; then + log_error "版本文件不存在: $VERSION_FILE_PATH" + exit 1 + fi + + # 使用 jq 解析 JSON(如果可用) + if command -v jq &> /dev/null; then + VERSION=$(jq -r '.version' "$VERSION_FILE_PATH") + BUILD_TIME=$(jq -r '.build_time' "$VERSION_FILE_PATH") + + # 解析 install_order(现在包含完整的文件名) + if jq -e '.install_order' "$VERSION_FILE_PATH" > /dev/null 2>&1; then + jq -r '.install_order[]' "$VERSION_FILE_PATH" > "$TEMP_DIR/install_order.txt" + else + log_error "version.json 中缺少 install_order 字段" + exit 1 + fi + else + log_warning "jq 未安装,使用简单的 JSON 解析" + VERSION=$(grep '"version"' "$VERSION_FILE_PATH" | sed 's/.*"version": *"\([^"]*\)".*/\1/') + BUILD_TIME=$(grep '"build_time"' "$VERSION_FILE_PATH" | sed 's/.*"build_time": *"\([^"]*\)".*/\1/') + + # 解析 install_order + grep -A 100 '"install_order"' "$VERSION_FILE_PATH" | grep -E '^\s*"[^"]+"' | while read line; do + component=$(echo "$line" | sed 's/.*"\([^"]*\)".*/\1/') + echo "$component" >> "$TEMP_DIR/install_order.txt" + done + fi + + log_success "版本信息解析完成" + log_info " 版本: $VERSION" + log_info " 构建时间: $BUILD_TIME" +} + +# 创建临时目录 +create_temp_dirs() { + log_info "创建临时目录..." + mkdir -p "$TEMP_DIR" + log_success "临时目录创建完成: $TEMP_DIR" +} + +# 卸载组件 +uninstall_components() { + log_info "开始卸载组件..." 
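+    # 按 version.json 的 install_order 逐个处理;从文件名去掉时间戳后缀即得到组件名,
+    # 例如(示意):node-exporter-installer-20250101-000000.tar.gz -> node-exporter-installer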
+ + artifact_dir=$(dirname "$VERSION_FILE_PATH") + uninstall_count=0 + total_count=0 + + if [[ -f "$TEMP_DIR/install_order.txt" ]]; then + total_count=$(wc -l < "$TEMP_DIR/install_order.txt") + fi + + if [[ -f "$TEMP_DIR/install_order.txt" ]]; then + while IFS= read -r filename; do + uninstall_count=$((uninstall_count + 1)) + + # 从文件名中提取组件名(去掉时间戳后缀) + component=$(echo "$filename" | sed 's/-[0-9]\{8\}-[0-9]\{6\}\.tar\.gz$//') + + log_info "[$uninstall_count/$total_count] 卸载 $component..." + + # 直接使用完整的文件名 + tar_file="$artifact_dir/$filename" + + if [[ ! -f "$tar_file" ]]; then + log_error "找不到组件文件: $filename" + exit 1 + fi + + # 解压到临时目录 + component_temp_dir="$TEMP_DIR/$component" + mkdir -p "$component_temp_dir" + + if tar -xzf "$tar_file" -C "$component_temp_dir"; then + log_success " $component 解压完成" + else + log_error " $component 解压失败" + exit 1 + fi + + # 查找解压后的目录 + extracted_dir="" + for dir in "$component_temp_dir"/*; do + if [[ -d "$dir" ]]; then + extracted_dir="$dir" + break + fi + done + + if [[ -z "$extracted_dir" ]]; then + log_error " $component 解压后未找到目录" + exit 1 + fi + + # 执行卸载脚本 + if [[ -f "$extracted_dir/uninstall.sh" ]]; then + log_info " 执行 $component 卸载脚本..." + # 所有组件都只需要一个确认 + if (cd "$extracted_dir" && echo "y" | ./uninstall.sh); then + log_success " $component 卸载完成" + else + log_error " $component 卸载失败" + exit 1 + fi + else + log_warning " $component 缺少 uninstall.sh 文件,跳过卸载" + fi + + # 清理临时文件 + rm -rf "$component_temp_dir" + done < "$TEMP_DIR/install_order.txt" + fi + + log_success "所有组件卸载完成" +} + +# 清理全局文件 +cleanup_global_files() { + log_info "清理全局文件..." + + # 清理安装目录 + if [[ -d "$INSTALL_DIR" ]]; then + rm -rf "$INSTALL_DIR" + log_success "安装目录已清理: $INSTALL_DIR" + else + log_info "安装目录不存在: $INSTALL_DIR" + fi + + # 清理可能的全局配置文件 + local global_configs=( + "/etc/argus-metric" + "/var/log/argus-metric" + ) + + for config in "${global_configs[@]}"; do + if [[ -d "$config" ]]; then + rm -rf "$config" + log_success "全局配置已清理: $config" + fi + done +} + +# 显示卸载信息 +show_uninstall_info() { + log_success "Argus-Metrics All-in-One 卸载完成!" 
+ echo + echo "卸载信息:" + echo " 版本: $VERSION" + echo " 构建时间: $BUILD_TIME" + echo + echo "清理内容:" + echo " - 二进制文件" + echo " - 配置文件" + echo " - 数据目录" + echo " - 进程和服务" + echo " - 全局安装目录" + echo + echo "注意:" + echo " - 系统依赖包可能仍然存在" + echo " - 如需完全清理,请手动检查并删除相关文件" + echo +} + +# 清理函数 +cleanup() { + if [[ -d "$TEMP_DIR" ]]; then + rm -rf "$TEMP_DIR" + fi +} + +# 设置清理陷阱 +trap cleanup EXIT + +# 主函数 +main() { + echo "==========================================" + echo " Argus-Metrics All-in-One 卸载脚本" + echo "==========================================" + echo + + check_root + find_version_file + create_temp_dirs + parse_version_info + + log_warning "此操作将完全卸载 Argus-Metrics All-in-One" + read -p "确认继续?(y/N): " confirm + + if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then + log_info "取消卸载操作" + exit 0 + fi + + uninstall_components + cleanup_global_files + show_uninstall_info +} + +# 脚本入口 +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi \ No newline at end of file diff --git a/src/metric/client-plugins/all-in-one-demo/scripts/version-manager.sh b/src/metric/client-plugins/all-in-one-demo/scripts/version-manager.sh new file mode 100755 index 0000000..65e566c --- /dev/null +++ b/src/metric/client-plugins/all-in-one-demo/scripts/version-manager.sh @@ -0,0 +1,350 @@ +#!/bin/bash + +set -e + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 日志函数 +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 显示帮助信息 +show_help() { + echo "AIOps 版本管理工具" + echo + echo "用法: $0 [options]" + echo + echo "命令:" + echo " bump - 升级版本号 (major|minor|patch)" + echo " set - 设置指定版本号" + echo " show - 显示当前版本信息" + echo " list - 列出所有版本" + echo " clean - 清理旧版本" + echo " validate - 验证版本配置" + echo + echo "示例:" + echo " $0 bump minor # 升级次版本号 1.0.0 -> 1.1.0" + echo " $0 set 2.0.0 # 设置版本为 2.0.0" + echo " $0 show # 显示当前版本" + echo " $0 list # 列出所有版本" +} + +# 获取当前版本 +get_current_version() { + if [[ -f "config/VERSION" ]]; then + cat config/VERSION + else + echo "0.0.0" + fi +} + +# 设置版本号 +set_version() { + local new_version="$1" + + # 验证版本号格式 + if [[ ! "$new_version" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + log_error "无效的版本号格式: $new_version" + log_info "版本号格式应为: major.minor.patch (如: 1.2.3)" + exit 1 + fi + + echo "$new_version" > config/VERSION + log_success "版本号已设置为: $new_version" +} + +# 升级版本号 +bump_version() { + local bump_type="$1" + local current_version=$(get_current_version) + + # 解析当前版本号 + IFS='.' 
read -r major minor patch <<< "$current_version" + + case "$bump_type" in + "major") + major=$((major + 1)) + minor=0 + patch=0 + ;; + "minor") + minor=$((minor + 1)) + patch=0 + ;; + "patch") + patch=$((patch + 1)) + ;; + *) + log_error "无效的升级类型: $bump_type" + log_info "支持的类型: major, minor, patch" + exit 1 + ;; + esac + + local new_version="$major.$minor.$patch" + set_version "$new_version" + log_success "版本号已从 $current_version 升级到 $new_version" +} + +# 显示当前版本信息 +show_version() { + local current_version=$(get_current_version) + log_info "当前版本: $current_version" + + if [[ -f "config/checklist" ]]; then + echo + echo "组件清单:" + while IFS= read -r line; do + [[ -z "$line" || "$line" =~ ^[[:space:]]*# ]] && continue + read -r component version dep order <<< "$line" + if [[ -n "$component" && -n "$version" ]]; then + echo " - $component v$version" + fi + done < config/checklist + fi + + # 检查是否有对应的 artifact + local artifact_dir="artifact/$current_version" + if [[ -d "$artifact_dir" ]]; then + echo + echo "已构建的组件:" + for file in "$artifact_dir"/*.tar.gz; do + if [[ -f "$file" ]]; then + local filename=$(basename "$file") + local size=$(du -h "$file" | cut -f1) + echo " - $filename ($size)" + fi + done + + if [[ -f "$artifact_dir/version.json" ]]; then + echo + echo "版本信息文件: $artifact_dir/version.json" + fi + else + echo + log_warning "未找到对应的构建目录: $artifact_dir" + log_info "运行 ./package.sh 进行构建" + fi +} + +# 列出所有版本 +list_versions() { + log_info "所有版本列表:" + echo + + if [[ ! -d "artifact" ]]; then + log_warning "artifact 目录不存在" + return + fi + + for version_dir in artifact/*/; do + if [[ -d "$version_dir" ]]; then + local version=$(basename "$version_dir") + local current_version=$(get_current_version) + + if [[ "$version" == "$current_version" ]]; then + echo " * $version (当前版本)" + else + echo " $version" + fi + + # 显示该版本的组件 + local component_count=0 + for file in "$version_dir"/*.tar.gz; do + if [[ -f "$file" ]]; then + component_count=$((component_count + 1)) + fi + done + + if [[ $component_count -gt 0 ]]; then + echo " 包含 $component_count 个组件" + fi + fi + done +} + +# 清理旧版本 +clean_versions() { + local current_version=$(get_current_version) + local keep_versions=5 # 保留最近5个版本 + + log_info "清理旧版本 (保留最近 $keep_versions 个版本)..." + + if [[ ! -d "artifact" ]]; then + log_warning "artifact 目录不存在" + return + fi + + # 获取所有版本目录,按修改时间排序 + local versions=() + while IFS= read -r -d '' version_dir; do + versions+=("$(basename "$version_dir")") + done < <(find artifact -maxdepth 1 -type d -name "[0-9]*" -print0 | sort -z) + + local total_versions=${#versions[@]} + local versions_to_remove=$((total_versions - keep_versions)) + + if [[ $versions_to_remove -le 0 ]]; then + log_info "无需清理,当前只有 $total_versions 个版本" + return + fi + + log_info "将删除 $versions_to_remove 个旧版本..." 
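+    # 清理策略:最多保留 keep_versions(默认 5)个版本,超出数量的旧版本目录将被删除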
+ + for ((i=0; i /etc/apt/sources.list && \ + echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \ + echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \ + fi + +# 安装常用工具和FTP服务 +RUN apt-get update && \ + apt-get install -y supervisor net-tools inetutils-ping vim vsftpd && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# 如果是部署环境替换 apt 源 +RUN if [ "$USE_INTRANET" = "true" ]; then \ + echo "deb [trusted=yes] https://10.92.132.52/mirrors/ubuntu2204/ jammy main" > /etc/apt/sources.list; \ + fi + +# supervisor 日志目录 +RUN mkdir -p /var/log/supervisor + +# 设置 FTP 基础路径环境变量 +ENV FTP_BASE_PATH=/private/argus/ftp + +# 设置域名环境变量 +ENV DOMAIN=prom.ftp.argus.com + +# 设置FTP用户密码环境变量 +ENV FTP_PASSWORD=ZGClab1234! + +# 设置用户和组ID环境变量 +ARG FTP_UID=2133 +ARG FTP_GID=2015 +ENV FTP_UID=${FTP_UID} +ENV FTP_GID=${FTP_GID} + +# 创建FTP用户和目录结构 +RUN groupadd -g ${FTP_GID} ftpuser && \ + useradd -u ${FTP_UID} -g ${FTP_GID} -d ${FTP_BASE_PATH}/share -s /bin/bash ftpuser && \ + mkdir -p ${FTP_BASE_PATH}/share \ + && mkdir -p /private/argus/etc \ + && mkdir -p /var/log/vsftpd \ + && mkdir -p /var/run/vsftpd/empty \ + && chown -R ftpuser:ftpuser ${FTP_BASE_PATH} + +# 创建vsftpd配置目录和用户列表文件 +RUN mkdir -p /etc/vsftpd && \ + echo "ftpuser" > /etc/vsftpd.userlist + +# supervisor 配置 +COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf + +# 启动脚本 +COPY start-ftp-supervised.sh /usr/local/bin/start-ftp-supervised.sh +RUN chmod +x /usr/local/bin/start-ftp-supervised.sh + +# vsftpd 配置文件 +COPY vsftpd.conf /etc/vsftpd/vsftpd.conf + +COPY dns-monitor.sh /usr/local/bin/dns-monitor.sh +RUN chmod +x /usr/local/bin/dns-monitor.sh + +USER root + +EXPOSE 21 20 21100-21110 + +ENTRYPOINT ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf", "-n"] diff --git a/src/metric/ftp/build/README.md b/src/metric/ftp/build/README.md new file mode 100644 index 0000000..f3881e1 --- /dev/null +++ b/src/metric/ftp/build/README.md @@ -0,0 +1,159 @@ +# FTP 镜像配置 + +## 环境变量配置 + +### FTP_BASE_PATH + +设置 FTP 数据的基础路径。 + +**默认值**: `/private/argus/ftp` + +**用途**: +- 共享目录路径: `${FTP_BASE_PATH}/share` (用于版本发布) +- 配置文件存储路径: `/private/argus/etc/` + +### DOMAIN + +设置 FTP 服务的域名。 + +**默认值**: `ftp.metric.argus.com` + +**用途**: +- 容器IP记录文件: `/private/argus/etc/${DOMAIN}` + +### FTP_PASSWORD + +设置 ftpuser 用户的密码。 + +**默认值**: `ZGClab1234!` + +**用途**: +- ftpuser 用户的登录密码 + +## 使用示例 + +### 1. 使用默认配置 +```bash +docker run -d \ + --name ftp-server \ + -p 21:21 \ + -p 21100-21110:21100-21110 \ + -v /host/ftp/data:/private/argus/ftp \ + argus-metric-ftp:1.0.0 +``` + +### 2. 自定义配置(运行时环境变量) +```bash +docker run -d \ + --name ftp-server \ + -p 21:21 \ + -p 21100-21110:21100-21110 \ + -e FTP_BASE_PATH=/custom/ftp/path \ + -e DOMAIN=custom.ftp.domain.com \ + -e FTP_PASSWORD=MySecurePassword123! \ + -v /host/ftp/data:/custom/ftp/path \ + argus-metric-ftp:1.0.0 +``` + +## 目录结构 + +容器启动后会在 `${FTP_BASE_PATH}` 下创建以下目录结构: + +``` +${FTP_BASE_PATH}/ +└── share/ # FTP根目录(直接挂载) + └── (用户上传的文件) + +/private/argus/etc/ +└── ${DOMAIN} # 容器IP记录文件 +``` + +## vsftpd 配置说明 + +### 核心配置参数 + +根据README中的推荐配置,vsftpd.conf包含以下关键设置: + +```bash +# 基本设置 +local_enable=YES # 允许本地用户登录 +write_enable=YES # 允许写操作(上传/删除/修改) +chroot_local_user=YES # 限制用户在自己目录中 +allow_writeable_chroot=YES # 防止 chroot 错误(重要!) 
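+# 注:若不设置 allow_writeable_chroot=YES,当被 chroot 的主目录可写时,
+# vsftpd 会拒绝登录并报 "500 OOPS: vsftpd: refusing to run with writable root inside chroot()"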
+ +# 被动模式配置 +pasv_enable=YES # 启用被动模式 +pasv_min_port=21100 # 被动模式最小端口 +pasv_max_port=21110 # 被动模式最大端口 + +# 用户访问控制 +userlist_enable=YES # 启用用户列表 +userlist_file=/etc/vsftpd.userlist # 用户列表文件 +userlist_deny=NO # 只允许列表中的用户登录 +``` + +### 用户管理 + +#### 默认用户 +- **用户名**: ftpuser +- **密码**: ZGClab1234! (可通过 FTP_PASSWORD 环境变量修改) +- **UID**: 2133 (与prometheus用户保持一致,可通过 FTP_UID 环境变量修改) +- **GID**: 2015 (与prometheus用户保持一致,可通过 FTP_GID 环境变量修改) +- **主目录**: ${FTP_BASE_PATH}/share (直接指向挂载目录) +- **Shell**: /bin/bash +- **用户列表**: 已添加到 `/etc/vsftpd.userlist` + +#### 添加新用户 +```bash +# 进入容器 +docker exec -it ftp-server bash + +# 添加新用户 +useradd -d ${FTP_BASE_PATH}/share/newuser -s /bin/bash newuser +echo "newuser" >> /etc/vsftpd.userlist +passwd newuser + +# 创建用户目录 +mkdir -p ${FTP_BASE_PATH}/share/newuser +chown newuser:newuser ${FTP_BASE_PATH}/share/newuser +``` + +## 端口配置 + +- **21**: FTP 控制端口 +- **20**: FTP 数据端口 (主动模式) +- **21100-21110**: 被动模式数据端口范围 + +### 日志文件位置 +- **vsftpd 日志**: `/var/log/vsftpd/vsftpd.log` +- **supervisor 日志**: `/var/log/supervisor/` + - `supervisord.log`: supervisor 主日志 + - `vsftpd.log`: vsftpd 标准输出 + - `vsftpd_error.log`: vsftpd 错误输出 + +```bash +# 在宿主机上配置 logrotate +cat > /etc/logrotate.d/ftp-docker << EOF +/var/lib/docker/containers/*/ftp-server-*.log { + daily + rotate 7 + compress + delaycompress + missingok + notifempty + copytruncate +} +EOF +``` + +### FTP连接测试 +```bash +# 本地测试连接 +ftp localhost + +curl -fsS 'ftp://ftpuser:ZGClab1234!@177.177.70.200/setup.sh' -o setup.sh + +# root用户直接执行,非root用户需要使用sudo +chmod +x setup.sh +bash setup.sh --server {$域名} --user ftpuser --password 'ZGClab1234!' +``` \ No newline at end of file diff --git a/src/metric/ftp/build/deps/vsftpd_3.0.5-0ubuntu1.1_amd64.deb b/src/metric/ftp/build/deps/vsftpd_3.0.5-0ubuntu1.1_amd64.deb new file mode 100644 index 0000000..995a5db Binary files /dev/null and b/src/metric/ftp/build/deps/vsftpd_3.0.5-0ubuntu1.1_amd64.deb differ diff --git a/src/metric/ftp/build/dns-monitor.sh b/src/metric/ftp/build/dns-monitor.sh new file mode 100644 index 0000000..2890b47 --- /dev/null +++ b/src/metric/ftp/build/dns-monitor.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +# DNS监控脚本 - 每10秒检查dns.conf是否有变化 +# 如果有变化则执行update-dns.sh脚本 + +DNS_CONF="/private/argus/etc/dns.conf" +DNS_BACKUP="/tmp/dns.conf.backup" +UPDATE_SCRIPT="/private/argus/etc/update-dns.sh" +LOG_FILE="/var/log/supervisor/dns-monitor.log" + +# 确保日志文件存在 +touch "$LOG_FILE" + +log_message() { + echo "$(date '+%Y-%m-%d %H:%M:%S') [DNS-Monitor] $1" >> "$LOG_FILE" +} + +log_message "DNS监控脚本启动" + +while true; do + if [ -f "$DNS_CONF" ]; then + if [ -f "$DNS_BACKUP" ]; then + # 比较文件内容 + if ! cmp -s "$DNS_CONF" "$DNS_BACKUP"; then + log_message "检测到DNS配置变化" + + # 更新备份文件 + cp "$DNS_CONF" "$DNS_BACKUP" + + # 执行更新脚本 + if [ -x "$UPDATE_SCRIPT" ]; then + log_message "执行DNS更新脚本: $UPDATE_SCRIPT" + "$UPDATE_SCRIPT" >> "$LOG_FILE" 2>&1 + if [ $? -eq 0 ]; then + log_message "DNS更新脚本执行成功" + else + log_message "DNS更新脚本执行失败" + fi + else + log_message "警告: 更新脚本不存在或不可执行: $UPDATE_SCRIPT" + fi + fi + else + + # 第一次检测到配置文件,执行更新脚本 + if [ -x "$UPDATE_SCRIPT" ]; then + log_message "执行DNS更新脚本: $UPDATE_SCRIPT" + "$UPDATE_SCRIPT" >> "$LOG_FILE" 2>&1 + if [ $? 
-eq 0 ]; then + log_message "DNS更新脚本执行成功" + + # 第一次运行,创建备份并执行更新 + cp "$DNS_CONF" "$DNS_BACKUP" + log_message "创建DNS配置备份文件" + + else + log_message "DNS更新脚本执行失败" + fi + else + log_message "警告: 更新脚本不存在或不可执行: $UPDATE_SCRIPT" + fi + fi + else + log_message "警告: DNS配置文件不存在: $DNS_CONF" + fi + + sleep 10 +done diff --git a/src/metric/ftp/build/start-ftp-supervised.sh b/src/metric/ftp/build/start-ftp-supervised.sh new file mode 100644 index 0000000..fb0a213 --- /dev/null +++ b/src/metric/ftp/build/start-ftp-supervised.sh @@ -0,0 +1,37 @@ +#!/bin/bash +set -euo pipefail + +echo "[INFO] Starting FTP server under supervisor..." + +FTP_BASE_PATH=${FTP_BASE_PATH:-/private/argus/ftp} +DOMAIN=${DOMAIN:-ftp.metric.argus.com} +FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!} + +echo "[INFO] FTP base path: ${FTP_BASE_PATH}" +echo "[INFO] Domain: ${DOMAIN}" +echo "[INFO] Setting ftpuser password..." + +# 设置ftpuser密码 +echo "ftpuser:${FTP_PASSWORD}" | chpasswd + +# 确保目录存在 +mkdir -p ${FTP_BASE_PATH}/share +mkdir -p /private/argus/etc +mkdir -p /var/run/vsftpd/empty + +# 直接使用挂载目录作为FTP根目录,无需软链接 +echo "[INFO] Using ${FTP_BASE_PATH}/share as FTP root directory" + +# 生成vsftpd配置文件 +echo "[INFO] Generating vsftpd.conf with base path: ${FTP_BASE_PATH}" +sed "s|\${FTP_BASE_PATH}|${FTP_BASE_PATH}|g" \ + /etc/vsftpd/vsftpd.conf > /tmp/vsftpd.conf + +# 记录容器 IP +IP=$(ifconfig eth0 | awk '/inet /{print $2}' || hostname -i) +echo "current IP: ${IP}" +echo "${IP}" > /private/argus/etc/${DOMAIN} + +# 启动vsftpd +echo "[INFO] Starting vsftpd..." +exec /usr/sbin/vsftpd /tmp/vsftpd.conf diff --git a/src/metric/ftp/build/supervisord.conf b/src/metric/ftp/build/supervisord.conf new file mode 100644 index 0000000..4d76417 --- /dev/null +++ b/src/metric/ftp/build/supervisord.conf @@ -0,0 +1,39 @@ +[supervisord] +nodaemon=true +logfile=/var/log/supervisor/supervisord.log +pidfile=/var/run/supervisord.pid +user=root + +[program:vsftpd] +command=/usr/local/bin/start-ftp-supervised.sh +user=root +stdout_logfile=/var/log/supervisor/vsftpd.log +stderr_logfile=/var/log/supervisor/vsftpd_error.log +autorestart=true +startretries=3 +startsecs=10 +stopwaitsecs=30 +killasgroup=true +stopasgroup=true + +[program:dns-monitor] +command=/usr/local/bin/dns-monitor.sh +user=root +stdout_logfile=/var/log/supervisor/dns-monitor.log +stderr_logfile=/var/log/supervisor/dns-monitor_error.log +autorestart=true +startretries=3 +startsecs=5 +stopwaitsecs=10 +killasgroup=true +stopasgroup=true + +[unix_http_server] +file=/var/run/supervisor.sock +chmod=0700 + +[supervisorctl] +serverurl=unix:///var/run/supervisor.sock + +[rpcinterface:supervisor] +supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface diff --git a/src/metric/ftp/build/vsftpd-config-README.md b/src/metric/ftp/build/vsftpd-config-README.md new file mode 100644 index 0000000..acd3d0c --- /dev/null +++ b/src/metric/ftp/build/vsftpd-config-README.md @@ -0,0 +1,111 @@ +# vsftpd 配置 + +配置 vsftpd FTP 服务器。 + +# 安装deps下 vsftpd 的离线安装包 + +sudo dpkg -i vsftpd_3.0.5-0ubuntu1.1_amd64.deb + +# 有依赖问题,修复依赖 + +sudo apt-get install -f + +## 启动服务 + +sudo service vsftpd start + +# 重启服务 + +sudo service vsftpd restart + +# 查看状态 + +sudo service vsftpd status + +## 备份配置文件 + +先备份默认配置,出问题能恢复: + +```bash +sudo cp /etc/vsftpd.conf /etc/vsftpd.conf.bak +``` + +## 修改配置文件 + +编辑配置: + +```bash +sudo vim /etc/vsftpd.conf +``` + +### 基本配置参数 + +```bash +# 允许本地用户登录 +local_enable=YES + +# 允许写操作(上传/删除/修改) +write_enable=YES + +# 限制用户在自己目录中,不能访问整个系统 +chroot_local_user=YES + +# 防止 chroot 错误(重要!) 
+allow_writeable_chroot=YES + +# 被动模式配置 +pasv_enable=YES +pasv_min_port=30000 +pasv_max_port=31000 +``` + +## 创建 FTP 目录和用户 + +### 创建共享目录 + +```bash +sudo mkdir -p /srv/ftp/share +sudo chmod 755 /srv/ftp/share +``` + +### 创建专用用户 + +```bash +sudo adduser ftpuser + +# 修改用户主目录 +sudo usermod -d /srv/ftp/share ftpuser +``` + +## 重启服务 + +```bash +sudo service vsftpd restart +``` + +## 防火墙配置 + +### 开放基本端口 + +```bash +sudo ufw allow 21/tcp +``` + +### 开放被动模式端口 + +```bash +sudo ufw allow 30000:31000/tcp +``` + +## 测试连接 + +```bash +# 本地测试 +ftp localhost + +# 远程测试 +ftp 你的服务器IP +``` + +用户名:ftpuser +密码:设置的密码 \ No newline at end of file diff --git a/src/metric/ftp/build/vsftpd-offline-install.sh b/src/metric/ftp/build/vsftpd-offline-install.sh new file mode 100755 index 0000000..79f70aa --- /dev/null +++ b/src/metric/ftp/build/vsftpd-offline-install.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# vsftpd 离线安装脚本 +# 使用方法:./vsftpd-offline-install.sh + +set -e + +echo "开始 vsftpd 离线安装..." + +# 检查是否为 root 用户 +if [ "$EUID" -ne 0 ]; then + echo "请使用 root 权限运行此脚本" + exit 1 +fi + +# 定义离线包目录 +OFFLINE_DIR="./vsftpd-offline" +DEB_DIR="$OFFLINE_DIR/debs" + +# 检查离线包是否存在 +if [ ! -d "$OFFLINE_DIR" ]; then + echo "错误:找不到离线包目录 $OFFLINE_DIR" + echo "请先准备离线包,方法:" + echo "1. 在有网络的机器上运行:" + echo " mkdir -p $DEB_DIR" + echo " cd $DEB_DIR" + echo " apt download vsftpd" + echo " apt download \$(apt-cache depends vsftpd | grep Depends | cut -d: -f2 | tr -d ' ')" + echo "2. 将整个 $OFFLINE_DIR 目录拷贝到目标机器" + exit 1 +fi + +# 安装 deb 包 +echo "安装 vsftpd 及依赖包..." +cd "$DEB_DIR" +dpkg -i *.deb || apt-get install -f -y + +# 检查安装状态 +if systemctl is-active --quiet vsftpd; then + echo "vsftpd 安装成功并已启动" +else + echo "启动 vsftpd 服务..." + systemctl start vsftpd + systemctl enable vsftpd +fi + +echo "vsftpd 离线安装完成!" +echo "配置文件位置: /etc/vsftpd.conf" +echo "服务状态: $(systemctl is-active vsftpd)" diff --git a/src/metric/ftp/build/vsftpd.conf b/src/metric/ftp/build/vsftpd.conf new file mode 100644 index 0000000..8403b85 --- /dev/null +++ b/src/metric/ftp/build/vsftpd.conf @@ -0,0 +1,56 @@ +# vsftpd 配置文件 + +# 基本设置 +listen=YES +listen_ipv6=NO +anonymous_enable=NO +local_enable=YES +write_enable=YES +local_umask=022 +dirmessage_enable=YES +use_localtime=YES +xferlog_enable=YES +connect_from_port_20=YES + +# 安全设置 +chroot_local_user=YES +allow_writeable_chroot=YES +secure_chroot_dir=/var/run/vsftpd/empty +pam_service_name=vsftpd +rsa_cert_file=/etc/ssl/certs/ssl-cert-snakeoil.pem +rsa_private_key_file=/etc/ssl/private/ssl-cert-snakeoil.key +ssl_enable=NO + +# 用户设置 +userlist_enable=YES +userlist_file=/etc/vsftpd.userlist +userlist_deny=NO + +# 目录设置 +local_root=${FTP_BASE_PATH}/share + +# 被动模式设置 +pasv_enable=YES +pasv_min_port=21100 +pasv_max_port=21110 +pasv_address=0.0.0.0 + +# 日志设置 +xferlog_file=/var/log/vsftpd/vsftpd.log +log_ftp_protocol=YES + +# 其他设置 +hide_ids=YES +tcp_wrappers=YES + +# 文件上传设置 +file_open_mode=0666 +local_umask=022 + +# 超时设置 +idle_session_timeout=300 +data_connection_timeout=300 + +# 限制设置 +max_clients=50 +max_per_ip=5 diff --git a/src/metric/grafana/build/Dockerfile b/src/metric/grafana/build/Dockerfile new file mode 100644 index 0000000..82ba4fa --- /dev/null +++ b/src/metric/grafana/build/Dockerfile @@ -0,0 +1,68 @@ +FROM grafana/grafana:11.1.0 + +USER root + +# 安装必要的工具 +RUN apk add --no-cache \ + supervisor \ + net-tools \ + iputils \ + vim \ + bash + +# supervisor 日志目录 +RUN mkdir -p /var/log/supervisor + +# 设置 Grafana 基础路径环境变量 +ENV GRAFANA_BASE_PATH=/private/argus/metric/grafana + +# 设置用户和组ID环境变量 +ARG GRAFANA_UID=2133 +ARG GRAFANA_GID=2015 +ENV 
GRAFANA_UID=${GRAFANA_UID} +ENV GRAFANA_GID=${GRAFANA_GID} + +# 创建基本目录结构 +RUN mkdir -p /private/argus/etc \ + && mkdir -p /private/argus/metric/grafana/data \ + && mkdir -p /private/argus/metric/grafana/logs \ + && mkdir -p /private/argus/metric/grafana/plugins \ + && mkdir -p /private/argus/metric/grafana/provisioning/datasources \ + && mkdir -p /private/argus/metric/grafana/provisioning/dashboards \ + && mkdir -p /private/argus/metric/grafana/data/sessions \ + && mkdir -p /private/argus/metric/grafana/data/dashboards \ + && mkdir -p /private/argus/metric/grafana/config \ + && mkdir -p /etc/grafana \ + && mkdir -p /var/lib/grafana \ + && mkdir -p /var/log/grafana + +# 修改 Grafana 用户 UID/GID 并授权 +RUN deluser grafana && \ + addgroup -g ${GRAFANA_GID} grafana && \ + adduser -u ${GRAFANA_UID} -G grafana -s /bin/sh -D grafana && \ + chown -R grafana:grafana /var/lib/grafana /etc/grafana /var/log/grafana /private/argus + +# 复制配置文件到容器内临时位置 +COPY grafana.ini /tmp/grafana.ini +COPY datasources/datasources.yml /tmp/datasources.yml +COPY dashboards/dashboards.yml /tmp/dashboards.yml +COPY dashboards/default_dashboard_by_hostname.json /tmp/default_dashboard.json + +# supervisor 配置 +COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf + +# 启动脚本 +COPY start-grafana-supervised.sh /usr/local/bin/start-grafana-supervised.sh +RUN chmod +x /usr/local/bin/start-grafana-supervised.sh + +# 确保配置文件权限正确 +RUN chown -R grafana:grafana /etc/grafana + +COPY dns-monitor.sh /usr/local/bin/dns-monitor.sh +RUN chmod +x /usr/local/bin/dns-monitor.sh + +USER root + +EXPOSE 3000 + +ENTRYPOINT ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf", "-n"] diff --git a/src/metric/grafana/build/README.md b/src/metric/grafana/build/README.md new file mode 100644 index 0000000..91ce864 --- /dev/null +++ b/src/metric/grafana/build/README.md @@ -0,0 +1,100 @@ +# Grafana 构建配置 + +基于 `grafana/grafana:11.1.0` 构建的自定义镜像,主要做了用户 ID 适配和配置自动化。 + +## 快速开始 + +```bash +# 构建镜像 +docker build -t argus-metric-grafana:1.0.0 . + +# 启动容器(主机网络模式) +docker run -d \ + --name grafana \ + --network=host \ + -v /private/argus:/private/argus \ + argus-metric-grafana:1.0.0 +``` + +访问:`http://localhost:3001/private/argus/metric/grafana/` +默认账号:`admin` / `admin` + +## 用户 ID 配置 + +镜像默认使用特殊的用户 ID 以适配主机权限: +- `GRAFANA_UID=2133` +- `GRAFANA_GID=2015` + +如果需要修改,构建时传入参数: + +```bash +docker build \ + --build-arg GRAFANA_UID=1000 \ + --build-arg GRAFANA_GID=1000 \ + -t argus-metric-grafana:1.0.0 . +``` + +## 配置说明 + +### 数据源配置 + +修改 `datasources/datasources.yml` 中的 Prometheus 地址: + +```yaml +datasources: + - name: Prometheus + type: prometheus + url: http://10.211.55.5:9090 # 改成你的 Prometheus 地址 + isDefault: true +``` + +**注意**:确保 Grafana 容器能访问到 Prometheus 服务,网络要通。 + +### Dashboard 导入 + +配置好数据源后,手动导入默认 dashboard: + +1. 登录 Grafana +2. 左侧菜单 → Dashboards → Import +3. 上传 `dashboards/default_dashboard.json` +4. 选择 Prometheus 数据源 +5. 
Import + +或者直接把 dashboard 放到持久化目录: + +```bash +cp dashboards/default_dashboard.json /private/argus/metric/grafana/provisioning/dashboards/ +``` + +重启容器会自动加载(因为 `dashboards.yml` 配置了自动扫描该目录)。 + +## 目录结构 + +持久化目录都在 `/private/argus` 下: + +``` +/private/argus/ +├── etc/ +│ └── grafana.metric.argus.com # 容器 IP 记录 +└── metric/grafana/ + ├── config/ + │ └── grafana.ini # 主配置文件 + ├── data/ # 数据库、会话等 + ├── logs/ # 日志 + ├── plugins/ # 插件 + └── provisioning/ + ├── datasources/ + │ └── datasources.yml # 数据源配置 + └── dashboards/ + ├── dashboards.yml # dashboard 配置 + └── *.json # dashboard JSON 文件 +``` + +## 启动流程 + +容器启动时 `start-grafana-supervised.sh` 会: + +1. 记录容器 IP 到 `/private/argus/etc/grafana.metric.argus.com` +2. 创建必要的目录 +3. 从 `/tmp/` 复制配置文件到持久化目录(首次启动或配置不存在时) +4. 用 `grafana:grafana` (UID:GID=2133:2015) 启动 Grafana 服务 \ No newline at end of file diff --git a/src/metric/grafana/build/dashboards/dashboards.yml b/src/metric/grafana/build/dashboards/dashboards.yml new file mode 100644 index 0000000..2fdff96 --- /dev/null +++ b/src/metric/grafana/build/dashboards/dashboards.yml @@ -0,0 +1,15 @@ +# 仪表板配置文件 +# 这个文件定义了仪表板的自动配置 + +apiVersion: 1 + +providers: + - name: 'default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /private/argus/metric/grafana/provisioning/dashboards diff --git a/src/metric/grafana/build/dashboards/default_dashboard_by_hostname.json b/src/metric/grafana/build/dashboards/default_dashboard_by_hostname.json new file mode 100644 index 0000000..4a09e80 --- /dev/null +++ b/src/metric/grafana/build/dashboards/default_dashboard_by_hostname.json @@ -0,0 +1,629 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 9, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Load", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 101, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "expr": "node_load1{hostname=\"$hostname\"}", + "legendFormat": "{{hostname}} load1", + "refId": "A" + }, + { + "expr": "node_load5{hostname=\"$hostname\"}", + "legendFormat": "{{hostname}} load5", + "refId": "B" + }, + { + "expr": "node_load15{hostname=\"$hostname\"}", + "legendFormat": 
"{{hostname}} load15", + "refId": "C" + } + ], + "title": "System Load", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "100 * (1 - avg by(hostname) (irate(node_cpu_seconds_total{mode=\"idle\",hostname=\"$hostname\"}[5m])))", + "legendFormat": "{{hostname}}", + "refId": "A" + } + ], + "title": "CPU Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "%", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 4, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "100 * (1 - (node_memory_MemAvailable_bytes{hostname=\"$hostname\"} / node_memory_MemTotal_bytes{hostname=\"$hostname\"}))", + "legendFormat": "{{hostname}}", + "refId": "B" + } + ], + "title": "Node Memory Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Bytes/s", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": 
"linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "expr": "sum by(hostname) (rate(node_disk_read_bytes_total{device!~\"^(loop|ram|sr0).*\",hostname=\"$hostname\"}[5m]))", + "legendFormat": "{{hostname}} read", + "refId": "A" + }, + { + "expr": "sum by(hostname) (rate(node_disk_written_bytes_total{device!~\"^(loop|ram|sr0).*\",hostname=\"$hostname\"}[5m]))", + "legendFormat": "{{hostname}} write", + "refId": "B" + } + ], + "title": "Node Disk I/O (Bytes/s)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Bytes/s", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 102, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "expr": "sum by(hostname)(rate(node_network_receive_bytes_total{device!~\"^(lo|docker.*)\",hostname=\"$hostname\"}[5m]))", + "legendFormat": "{{hostname}} RX", + "refId": "A" + }, + { + "expr": "sum by(hostname)(rate(node_network_transmit_bytes_total{device!~\"^(lo|docker.*)\",hostname=\"$hostname\"}[5m]))", + "legendFormat": "{{hostname}} TX", + "refId": "B" + } + ], + "title": "Network Traffic", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Processes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 4, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + 
"mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 200 + }, + { + "color": "red", + "value": 500 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 104, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "expr": "node_procs_running{hostname=\"$hostname\"}", + "legendFormat": "{{hostname}} Running", + "refId": "A" + }, + { + "expr": "node_procs_blocked{hostname=\"$hostname\"}", + "legendFormat": "{{hostname}} Blocked", + "refId": "B" + } + ], + "title": "Node Process Count", + "type": "timeseries" + } + ], + "refresh": "15s", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": "node-exporter-A1", + "value": "node-exporter-A1" + }, + "datasource": { + "type": "prometheus" + }, + "definition": "label_values(node_cpu_seconds_total,hostname)", + "hide": 0, + "includeAll": false, + "label": "hostname", + "multi": false, + "name": "hostname", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(node_cpu_seconds_total,hostname)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-12h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Node and GPU Metrics", + "uid": "node_gpu_metrics", + "weekStart": "" +} \ No newline at end of file diff --git a/src/metric/grafana/build/dashboards/default_dashboard_by_instance.json b/src/metric/grafana/build/dashboards/default_dashboard_by_instance.json new file mode 100644 index 0000000..78f0c43 --- /dev/null +++ b/src/metric/grafana/build/dashboards/default_dashboard_by_instance.json @@ -0,0 +1,628 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 9, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Load", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 101, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + 
"mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "expr": "node_load1{instance=\"$instance\"}", + "legendFormat": "{{instance}} load1", + "refId": "A" + }, + { + "expr": "node_load5{instance=\"$instance\"}", + "legendFormat": "{{instance}} load5", + "refId": "B" + }, + { + "expr": "node_load15{instance=\"$instance\"}", + "legendFormat": "{{instance}} load15", + "refId": "C" + } + ], + "title": "System Load", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "100 * (1 - avg by(instance) (irate(node_cpu_seconds_total{mode=\"idle\",instance=\"$instance\"}[5m])))", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "CPU Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "%", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 4, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "100 * (1 - (node_memory_MemAvailable_bytes{instance=\"$instance\"} / node_memory_MemTotal_bytes{instance=\"$instance\"}))", + "legendFormat": "{{instance}}", + "refId": "B" + } + ], + "title": "Node Memory Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + 
"axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Bytes/s", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "expr": "sum by(instance) (rate(node_disk_read_bytes_total{device!~\"^(loop|ram|sr0).*\",instance=\"$instance\"}[5m]))", + "legendFormat": "{{instance}} read", + "refId": "A" + }, + { + "expr": "sum by(instance) (rate(node_disk_written_bytes_total{device!~\"^(loop|ram|sr0).*\",instance=\"$instance\"}[5m]))", + "legendFormat": "{{instance}} write", + "refId": "B" + } + ], + "title": "Node Disk I/O (Bytes/s)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Bytes/s", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 102, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "expr": "sum by(instance)(rate(node_network_receive_bytes_total{device!~\"^(lo|docker.*)\",instance=\"$instance\"}[5m]))", + "legendFormat": "{{instance}} RX", + "refId": "A" + }, + { + "expr": "sum by(instance)(rate(node_network_transmit_bytes_total{device!~\"^(lo|docker.*)\",instance=\"$instance\"}[5m]))", + "legendFormat": "{{instance}} TX", + "refId": "B" + } + ], + "title": "Network Traffic", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Processes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + 
"legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 4, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 200 + }, + { + "color": "red", + "value": 500 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 104, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "expr": "node_procs_running{instance=\"$instance\"}", + "legendFormat": "{{instance}} Running", + "refId": "A" + }, + { + "expr": "node_procs_blocked{instance=\"$instance\"}", + "legendFormat": "{{instance}} Blocked", + "refId": "B" + } + ], + "title": "Node Process Count", + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": "node-exporter-A1", + "value": "node-exporter-A1" + }, + "datasource": { + "type": "prometheus" + }, + "definition": "label_values(node_cpu_seconds_total,instance)", + "hide": 0, + "includeAll": false, + "label": "instance", + "multi": false, + "name": "instance", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(node_cpu_seconds_total,instance)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-12h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Node and GPU Metrics", + "uid": "node_gpu_metrics", + "weekStart": "" + } \ No newline at end of file diff --git a/src/metric/grafana/build/datasources/datasources.yml b/src/metric/grafana/build/datasources/datasources.yml new file mode 100644 index 0000000..fb277cc --- /dev/null +++ b/src/metric/grafana/build/datasources/datasources.yml @@ -0,0 +1,26 @@ +# 数据源配置文件 +# 这个文件定义了所有数据源的配置 + +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + uid: eezk1zvkie4g0a + url: http://10.211.55.5:9090 + isDefault: true + editable: true + jsonData: + httpMethod: POST + manageAlerts: true + prometheusType: Prometheus + prometheusVersion: 2.40.0 + cacheLevel: 'High' + disableRecordingRules: false + incrementalQueryOverlapWindow: 10m + incrementalQuerying: false + queryTimeout: 60s + timeInterval: 15s + secureJsonData: {} + version: 1 diff --git a/src/metric/grafana/build/dns-monitor.sh b/src/metric/grafana/build/dns-monitor.sh new file mode 100644 index 0000000..2890b47 --- /dev/null +++ b/src/metric/grafana/build/dns-monitor.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +# DNS监控脚本 - 每10秒检查dns.conf是否有变化 +# 如果有变化则执行update-dns.sh脚本 + +DNS_CONF="/private/argus/etc/dns.conf" +DNS_BACKUP="/tmp/dns.conf.backup" +UPDATE_SCRIPT="/private/argus/etc/update-dns.sh" +LOG_FILE="/var/log/supervisor/dns-monitor.log" + +# 确保日志文件存在 +touch "$LOG_FILE" + +log_message() { + echo "$(date '+%Y-%m-%d %H:%M:%S') [DNS-Monitor] $1" >> "$LOG_FILE" +} + +log_message "DNS监控脚本启动" + +while true; do + if [ -f "$DNS_CONF" ]; then + if [ -f "$DNS_BACKUP" ]; then + # 比较文件内容 + if ! 
cmp -s "$DNS_CONF" "$DNS_BACKUP"; then + log_message "检测到DNS配置变化" + + # 更新备份文件 + cp "$DNS_CONF" "$DNS_BACKUP" + + # 执行更新脚本 + if [ -x "$UPDATE_SCRIPT" ]; then + log_message "执行DNS更新脚本: $UPDATE_SCRIPT" + "$UPDATE_SCRIPT" >> "$LOG_FILE" 2>&1 + if [ $? -eq 0 ]; then + log_message "DNS更新脚本执行成功" + else + log_message "DNS更新脚本执行失败" + fi + else + log_message "警告: 更新脚本不存在或不可执行: $UPDATE_SCRIPT" + fi + fi + else + + # 第一次检测到配置文件,执行更新脚本 + if [ -x "$UPDATE_SCRIPT" ]; then + log_message "执行DNS更新脚本: $UPDATE_SCRIPT" + "$UPDATE_SCRIPT" >> "$LOG_FILE" 2>&1 + if [ $? -eq 0 ]; then + log_message "DNS更新脚本执行成功" + + # 第一次运行,创建备份并执行更新 + cp "$DNS_CONF" "$DNS_BACKUP" + log_message "创建DNS配置备份文件" + + else + log_message "DNS更新脚本执行失败" + fi + else + log_message "警告: 更新脚本不存在或不可执行: $UPDATE_SCRIPT" + fi + fi + else + log_message "警告: DNS配置文件不存在: $DNS_CONF" + fi + + sleep 10 +done diff --git a/src/metric/grafana/build/grafana.ini b/src/metric/grafana/build/grafana.ini new file mode 100644 index 0000000..fea2ada --- /dev/null +++ b/src/metric/grafana/build/grafana.ini @@ -0,0 +1,96 @@ +# Grafana 配置文件 +# 这个配置文件定义了 Grafana 的基本设置和 Prometheus 数据源配置 + +[paths] +data = /private/argus/metric/grafana/data +logs = /private/argus/metric/grafana/logs +plugins = /private/argus/metric/grafana/plugins +provisioning = /private/argus/metric/grafana/provisioning + +[server] +http_port = 3000 +domain = localhost +root_url = %(protocol)s://%(domain)s:%(http_port)s +serve_from_sub_path = true + +[database] +type = sqlite3 +path = /private/argus/metric/grafana/data/grafana.db + +[session] +provider = file +provider_config = /private/argus/metric/grafana/data/sessions +cookie_name = grafana_sess +cookie_secure = false +session_life_time = 86400 + +[analytics] +reporting_enabled = false +check_for_updates = false + +[security] +admin_user = admin +admin_password = admin +secret_key = SW2YcwTIb9zpOOhoPsMm + +[snapshots] +external_enabled = true + +[users] +allow_sign_up = false +auto_assign_org = true +auto_assign_org_role = Viewer +verify_email_enabled = false + +[log] +mode = console +level = info + +[log.console] +level = info +format = console + +[log.file] +level = info +format = text +log_rotate = true +max_lines = 1000000 +max_size_shift = 28 +daily_rotate = true +max_days = 7 +filename = /private/argus/metric/grafana/logs/grafana.log + +[quota] +enabled = false + +[unified_alerting] +enabled = true + +[explore] +enabled = true + +[panels] +disable_sanitize_html = false + +[plugins] +enable_alpha = false +app_tls_skip_verify_insecure = false + +[enterprise] +license_path = + +[feature_toggles] +enable = + +[date_formats] +default_timezone = browser +full_date = YYYY-MM-DD HH:mm:ss +interval_second = HH:mm:ss +interval_minute = HH:mm +interval_hour = MM/DD HH:mm +interval_day = MM/DD +interval_month = YYYY-MM +interval_year = YYYY + +[expressions] +enabled = true diff --git a/src/metric/grafana/build/start-grafana-supervised.sh b/src/metric/grafana/build/start-grafana-supervised.sh new file mode 100644 index 0000000..5d4b8a1 --- /dev/null +++ b/src/metric/grafana/build/start-grafana-supervised.sh @@ -0,0 +1,97 @@ +#!/bin/bash +set -euo pipefail + +echo "[INFO] Starting Grafana under supervisor..." 
+ +DOMAIN=grafana.metric.argus.com + +# 记录容器 IP +IP=$(ifconfig | awk '/inet / && $2 != "127.0.0.1" {print $2; exit}') +echo "current IP: ${IP}" +echo "${IP}" > /private/argus/etc/${DOMAIN} + +# 确保必要目录存在(权限已在 Dockerfile 中设置) +mkdir -p /private/argus/metric/grafana/data +mkdir -p /private/argus/metric/grafana/logs +mkdir -p /private/argus/metric/grafana/plugins +mkdir -p /private/argus/metric/grafana/provisioning/datasources +mkdir -p /private/argus/metric/grafana/provisioning/dashboards +mkdir -p /private/argus/metric/grafana/data/sessions +mkdir -p /private/argus/metric/grafana/data/dashboards +mkdir -p /private/argus/metric/grafana/config +mkdir -p /var/log/grafana +mkdir -p /etc/grafana/provisioning/datasources +mkdir -p /var/lib/grafana + +# 复制主配置文件到持久化目录 +if [ -f "/tmp/grafana.ini" ]; then + echo "[INFO] Copying grafana.ini to /private/argus/metric/grafana/config/" + cp /tmp/grafana.ini /private/argus/metric/grafana/config/grafana.ini + chown grafana:grafana /private/argus/metric/grafana/config/grafana.ini + echo "[INFO] Grafana configuration copied successfully" +fi + +# 检查配置文件来源(优先级:挂载目录 > 容器内配置 > 默认配置) +if [ -f "/private/argus/metric/grafana/config/grafana.ini" ]; then + echo "[INFO] Using grafana.ini from /private/argus/metric/grafana/config/" + CONFIG_FILE="--config=/private/argus/metric/grafana/config/grafana.ini" +elif [ -f "/etc/grafana/grafana.ini" ]; then + echo "[INFO] Using custom grafana.ini from /etc/grafana/" + CONFIG_FILE="--config=/etc/grafana/grafana.ini" +else + echo "[INFO] Using default configuration" + CONFIG_FILE="" +fi + +# 复制数据源配置文件到挂载目录 +if [ -f "/tmp/datasources.yml" ]; then + echo "[INFO] Copying datasource configuration to /private/argus/metric/grafana/provisioning/datasources/" + cp /tmp/datasources.yml /private/argus/metric/grafana/provisioning/datasources/datasources.yml + chown grafana:grafana /private/argus/metric/grafana/provisioning/datasources/datasources.yml + echo "[INFO] Datasource configuration copied successfully" +elif [ -d "/private/argus/metric/grafana/provisioning/datasources" ] && [ "$(ls -A /private/argus/metric/grafana/provisioning/datasources)" ]; then + echo "[INFO] Found existing datasource provisioning files in /private/argus/metric/grafana/provisioning/datasources" + # 确保数据源配置目录权限正确 + chown -R grafana:grafana /private/argus/metric/grafana/provisioning/datasources +elif [ -d "/etc/grafana/provisioning/datasources" ] && [ "$(ls -A /etc/grafana/provisioning/datasources)" ]; then + echo "[INFO] Found datasource provisioning files in /etc/grafana/provisioning/datasources" + # 确保数据源配置目录权限正确 + chown -R grafana:grafana /etc/grafana/provisioning/datasources +else + echo "[INFO] No datasource provisioning files found, using manual configuration" +fi + +# 复制仪表板配置文件到挂载目录 +if [ -f "/tmp/dashboards.yml" ]; then + echo "[INFO] Copying dashboard configuration to /private/argus/metric/grafana/provisioning/dashboards/" + cp /tmp/dashboards.yml /private/argus/metric/grafana/provisioning/dashboards/dashboards.yml + chown grafana:grafana /private/argus/metric/grafana/provisioning/dashboards/dashboards.yml + echo "[INFO] Dashboard configuration copied successfully" +fi + +# 复制默认仪表板到挂载目录 +if [ -f "/tmp/default_dashboard.json" ]; then + echo "[INFO] Copying default dashboard to /private/argus/metric/grafana/provisioning/dashboards/" + cp /tmp/default_dashboard.json /private/argus/metric/grafana/provisioning/dashboards/default_dashboard.json + chown grafana:grafana /private/argus/metric/grafana/provisioning/dashboards/default_dashboard.json + echo "[INFO] 
Default dashboard copied successfully" +fi + +# 确保所有配置目录权限正确 +chown -R grafana:grafana /private/argus/metric/grafana/provisioning/ + +# 启动 Grafana +if [ -n "$CONFIG_FILE" ]; then + echo "[INFO] Starting Grafana with custom configuration..." + exec /usr/share/grafana/bin/grafana server \ + --homepath=/usr/share/grafana \ + --packaging=docker \ + $CONFIG_FILE +else + echo "[INFO] Starting Grafana with default configuration..." + exec /usr/share/grafana/bin/grafana server \ + --homepath=/usr/share/grafana \ + --packaging=docker \ + cfg:default.log.mode=console \ + cfg:default.log.level=info +fi diff --git a/src/metric/grafana/build/supervisord.conf b/src/metric/grafana/build/supervisord.conf new file mode 100644 index 0000000..b331284 --- /dev/null +++ b/src/metric/grafana/build/supervisord.conf @@ -0,0 +1,40 @@ +[supervisord] +nodaemon=true +logfile=/var/log/supervisor/supervisord.log +pidfile=/var/run/supervisord.pid +user=root +sockfile=/var/run/supervisor.sock + +[program:grafana] +command=/usr/local/bin/start-grafana-supervised.sh +user=grafana +stdout_logfile=/var/log/supervisor/grafana.log +stderr_logfile=/var/log/supervisor/grafana_error.log +autorestart=true +startretries=3 +startsecs=30 +stopwaitsecs=30 +killasgroup=true +stopasgroup=true + +[program:dns-monitor] +command=/usr/local/bin/dns-monitor.sh +user=root +stdout_logfile=/var/log/supervisor/dns-monitor.log +stderr_logfile=/var/log/supervisor/dns-monitor_error.log +autorestart=true +startretries=3 +startsecs=5 +stopwaitsecs=10 +killasgroup=true +stopasgroup=true + +[unix_http_server] +file=/var/run/supervisor.sock +chmod=0700 + +[supervisorctl] +serverurl=unix:///var/run/supervisor.sock + +[rpcinterface:supervisor] +supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface diff --git a/src/metric/prometheus/build/Dockerfile b/src/metric/prometheus/build/Dockerfile new file mode 100755 index 0000000..e2195a8 --- /dev/null +++ b/src/metric/prometheus/build/Dockerfile @@ -0,0 +1,102 @@ +FROM ubuntu/prometheus:3-24.04_stable + +USER root + +ARG USE_INTRANET=false + +# 内网 apt 源配置 +RUN if [ "$USE_INTRANET" = "true" ]; then \ + echo "Configuring intranet apt sources..." && \ + cp /etc/apt/sources.list /etc/apt/sources.list.bak && \ + echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \ + echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \ + echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \ + else \ + echo "Configuring fast apt sources for external network..." 
&& \ + find /etc/apt -name "sources.list*" -exec sed -i 's/archive.ubuntu.com/mirrors.aliyun.com/g' {} \; && \ + find /etc/apt -name "sources.list*" -exec sed -i 's/security.ubuntu.com/mirrors.aliyun.com/g' {} \; && \ + echo "deb http://mirrors.aliyun.com/ubuntu/ jammy main restricted universe multiverse" > /etc/apt/sources.list && \ + echo "deb http://mirrors.aliyun.com/ubuntu/ jammy-updates main restricted universe multiverse" >> /etc/apt/sources.list && \ + echo "deb http://mirrors.aliyun.com/ubuntu/ jammy-security main restricted universe multiverse" >> /etc/apt/sources.list; \ + fi + +# 验证源配置并安装常用工具 +RUN echo "=== Current apt sources ===" && \ + cat /etc/apt/sources.list && \ + echo "=== Updating package list ===" && \ + apt-get update && \ + echo "=== Installing packages ===" && \ + apt-get install -y --no-install-recommends \ + supervisor \ + net-tools \ + inetutils-ping \ + vim \ + python3 \ + python3-pip && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# 如果是部署环境替换 apt 源 +RUN if [ "$USE_INTRANET" = "true" ]; then \ + echo "deb [trusted=yes] https://10.92.132.52/mirrors/ubuntu2204/ jammy main" > /etc/apt/sources.list; \ + fi + +# supervisor 日志目录 +RUN mkdir -p /var/log/supervisor + +# 设置 Prometheus 基础路径环境变量 +ENV PROMETHEUS_BASE_PATH=/private/argus/metric/prometheus + +# 设置用户和组ID环境变量 +ARG PROMETHEUS_UID=2133 +ARG PROMETHEUS_GID=2015 +ENV PROMETHEUS_UID=${PROMETHEUS_UID} +ENV PROMETHEUS_GID=${PROMETHEUS_GID} + +# 创建目录结构 +RUN mkdir -p ${PROMETHEUS_BASE_PATH}/rules \ + && mkdir -p ${PROMETHEUS_BASE_PATH}/targets \ + && mkdir -p /private/argus/etc \ + && rm -rf /prometheus \ + && ln -s ${PROMETHEUS_BASE_PATH} /prometheus + +# 修改 Prometheus 用户 UID/GID 并授权 +RUN usermod -u ${PROMETHEUS_UID} nobody && \ + groupmod -g ${PROMETHEUS_GID} nogroup && \ + chown -h nobody:nogroup /prometheus && \ + chown -R nobody:nogroup /private/argus/metric /etc/prometheus && \ + chown -R nobody:nogroup ${PROMETHEUS_BASE_PATH} + +# supervisor 配置 +COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf + +# 启动脚本 +COPY start-prometheus-supervised.sh /usr/local/bin/start-prometheus-supervised.sh +RUN chmod +x /usr/local/bin/start-prometheus-supervised.sh && \ + chown nobody:nogroup /usr/local/bin/start-prometheus-supervised.sh + +# targets 更新脚本 +COPY start-targets-updater.sh /usr/local/bin/start-targets-updater.sh +RUN chmod +x /usr/local/bin/start-targets-updater.sh && \ + chown nobody:nogroup /usr/local/bin/start-targets-updater.sh + +# targets 更新 Python 脚本 +COPY update_targets.py /usr/local/bin/update_targets.py +RUN chmod +x /usr/local/bin/update_targets.py && \ + chown nobody:nogroup /usr/local/bin/update_targets.py + +# exporter 配置文件 - 复制到内部目录 +COPY exporter_config.json /usr/local/bin/exporter_config.json + +COPY prometheus.yml /etc/prometheus/prometheus.yml + +RUN chown nobody:nogroup /usr/local/bin/exporter_config.json /etc/prometheus/prometheus.yml + +COPY dns-monitor.sh /usr/local/bin/dns-monitor.sh +RUN chmod +x /usr/local/bin/dns-monitor.sh + +USER root + +EXPOSE 9090 + +ENTRYPOINT ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf", "-n"] diff --git a/src/metric/prometheus/build/README.md b/src/metric/prometheus/build/README.md new file mode 100755 index 0000000..63c7046 --- /dev/null +++ b/src/metric/prometheus/build/README.md @@ -0,0 +1,114 @@ +# Prometheus Docker 镜像配置 + +## 环境变量配置 + +### PROMETHEUS_BASE_PATH + +设置 Prometheus 配置和数据的基础路径。 + +**默认值**: `/private/argus/metric/prometheus` + +**用途**: +- 配置文件存储路径: `${PROMETHEUS_BASE_PATH}/prometheus.yml` 
+- 规则文件路径: `${PROMETHEUS_BASE_PATH}/rules/*.yml` +- 监控目标文件路径: `${PROMETHEUS_BASE_PATH}/targets/` + +## 目录结构 + +容器启动后会在 `${PROMETHEUS_BASE_PATH}` 下创建以下目录结构: + +``` +${PROMETHEUS_BASE_PATH}/ +├── prometheus.yml # 主配置文件 +├── rules/ # 告警规则目录 +│ └── *.yml +└── targets/ # 监控目标目录 + ├── node_exporter.json + └── dcgm_exporter.json +``` + +## 动态配置 + +- **规则文件**: 在 `rules/` 目录下添加 `.yml` 文件即可自动加载 +- **监控目标**: 修改 `targets/` 目录下的 JSON 文件即可动态更新监控目标 +- **主配置**: 修改 `prometheus.yml` 后可通过 Prometheus 的 `/-/reload` 端点重新加载配置 + +## 权限管理 + +### 默认路径权限 +- 默认路径 `/private/argus/metric/prometheus` 在 Dockerfile 中已设置正确的权限 +- nobody 用户(UID: 2133, GID: 2015)拥有完全读写权限 + +### 自定义路径权限 +- 当使用自定义 `PROMETHEUS_BASE_PATH` 时,启动脚本会自动创建目录并设置权限 +- 确保 nobody 用户对自定义路径有读写权限 + +### 挂载卷注意事项 +1. **主机目录权限**: 确保挂载的主机目录对 nobody 用户(UID: 2133)可写 +2. **SELinux**: 如果使用 SELinux,可能需要设置适当的上下文 +3. **Docker 用户映射**: 确保容器内的 nobody 用户与主机用户权限匹配 + +## 故障排除 + +### 权限问题 +如果遇到权限错误,可以检查: +```bash +# 检查目录权限 +ls -la /path/to/prometheus/data + +# 检查用户映射 +id nobody + +# 手动修复权限 +chown -R 2133:2015 /path/to/prometheus/data +chmod -R 755 /path/to/prometheus/data +``` + +## 动态 Targets 配置 + +### 配置流程 + +1. **节点资源清单**: `nodes.json` 包含所有监控节点的基本信息 + ```json + [ + { + "node_id": "A1", + "user_id": "user01", + "ip": "1.2.3.4", + "hostname": "dev-node-1", + "labels": ["production", "us-west-1"] + } + ] + ``` + +2. **Exporter 配置**: `exporter_config.json` 定义各类型 exporter 的端口和标签模板 + - 支持 dcgm (GPU监控) 和 node (系统监控) 两种类型 + - 配置端口映射和标签模板规则 + +3. **自动拆分生成**: `update_targets.py` 脚本根据节点清单自动生成对应的 targets 文件 + - 读取 `nodes.json` 获取节点信息 + - 按 exporter 类型拆分生成 `targets/*_exporter.json` + - 应用标签模板,生成完整的监控目标配置 + +4. **热加载机制**: + - 脚本支持守护进程模式,定期检查 `nodes.json` 变化 + - 文件内容变化时自动重新生成 targets 配置 + - Prometheus 自动发现并重新加载新的监控目标 + +### 使用方式 + +```bash +# 单次更新(注意用户权限,此方法用于测试,但生成文件是 root 权限) +python3 update_targets.py --config nodes.json --targets-dir targets/ + +# 守护进程模式, 该进程托管于supervisor +python3 update_targets.py --daemon --check-interval 30 +``` + +## 注意事项 + +1. 确保挂载的目录有适当的读写权限 +2. 配置文件会在容器启动时自动生成,无需手动创建 +3. 可以通过修改环境变量 `PROMETHEUS_BASE_PATH` 来改变所有相关路径,无需重新构建镜像 +4. 自定义路径的目录会在启动时自动创建并设置权限 +5. `nodes.json` 文件变化后,targets 配置会自动更新,无需手动干预 diff --git a/src/metric/prometheus/build/dns-monitor.sh b/src/metric/prometheus/build/dns-monitor.sh new file mode 100644 index 0000000..2890b47 --- /dev/null +++ b/src/metric/prometheus/build/dns-monitor.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +# DNS监控脚本 - 每10秒检查dns.conf是否有变化 +# 如果有变化则执行update-dns.sh脚本 + +DNS_CONF="/private/argus/etc/dns.conf" +DNS_BACKUP="/tmp/dns.conf.backup" +UPDATE_SCRIPT="/private/argus/etc/update-dns.sh" +LOG_FILE="/var/log/supervisor/dns-monitor.log" + +# 确保日志文件存在 +touch "$LOG_FILE" + +log_message() { + echo "$(date '+%Y-%m-%d %H:%M:%S') [DNS-Monitor] $1" >> "$LOG_FILE" +} + +log_message "DNS监控脚本启动" + +while true; do + if [ -f "$DNS_CONF" ]; then + if [ -f "$DNS_BACKUP" ]; then + # 比较文件内容 + if ! cmp -s "$DNS_CONF" "$DNS_BACKUP"; then + log_message "检测到DNS配置变化" + + # 更新备份文件 + cp "$DNS_CONF" "$DNS_BACKUP" + + # 执行更新脚本 + if [ -x "$UPDATE_SCRIPT" ]; then + log_message "执行DNS更新脚本: $UPDATE_SCRIPT" + "$UPDATE_SCRIPT" >> "$LOG_FILE" 2>&1 + if [ $? -eq 0 ]; then + log_message "DNS更新脚本执行成功" + else + log_message "DNS更新脚本执行失败" + fi + else + log_message "警告: 更新脚本不存在或不可执行: $UPDATE_SCRIPT" + fi + fi + else + + # 第一次检测到配置文件,执行更新脚本 + if [ -x "$UPDATE_SCRIPT" ]; then + log_message "执行DNS更新脚本: $UPDATE_SCRIPT" + "$UPDATE_SCRIPT" >> "$LOG_FILE" 2>&1 + if [ $? 
-eq 0 ]; then + log_message "DNS更新脚本执行成功" + + # 第一次运行,创建备份并执行更新 + cp "$DNS_CONF" "$DNS_BACKUP" + log_message "创建DNS配置备份文件" + + else + log_message "DNS更新脚本执行失败" + fi + else + log_message "警告: 更新脚本不存在或不可执行: $UPDATE_SCRIPT" + fi + fi + else + log_message "警告: DNS配置文件不存在: $DNS_CONF" + fi + + sleep 10 +done diff --git a/src/metric/prometheus/build/exporter_config.json b/src/metric/prometheus/build/exporter_config.json new file mode 100755 index 0000000..75cee90 --- /dev/null +++ b/src/metric/prometheus/build/exporter_config.json @@ -0,0 +1,41 @@ +{ + "exporters": { + "dcgm": { + "port": 9400, + "job_name": "dcgm", + "instance_prefix": "dcgm-exporter", + "description": "DCGM GPU 监控 exporter" + }, + "node": { + "port": 9100, + "job_name": "node", + "instance_prefix": "node-exporter", + "description": "Node 系统监控 exporter" + } + }, + "label_templates": { + "dcgm": { + "job": "dcgm", + "instance": "dcgm-exporter-{node_id}", + "node_id": "{node_id}", + "ip": "{ip}", + "hostname": "{hostname}", + "user_id": "{user_id}", + "tag": "{tag}" + }, + "node": { + "job": "node", + "instance": "node-exporter-{node_id}", + "node_id": "{node_id}", + "ip": "{ip}", + "hostname": "{hostname}", + "user_id": "{user_id}", + "tag": "{tag}" + } + }, + "settings": { + "backup_retention_days": 7, + "log_retention_days": 30, + "refresh_interval": "30s" + } +} \ No newline at end of file diff --git a/src/metric/prometheus/build/prometheus.yml b/src/metric/prometheus/build/prometheus.yml new file mode 100755 index 0000000..e3e4403 --- /dev/null +++ b/src/metric/prometheus/build/prometheus.yml @@ -0,0 +1,27 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + scrape_timeout: 10s + +# 对接 AlertManager +alerting: + alertmanagers: + - static_configs: + - targets: [] + +# 规则目录 +rule_files: + - "${PROMETHEUS_BASE_PATH}/rules/*.yml" + +scrape_configs: + - job_name: "node" + file_sd_configs: + - files: + - "${PROMETHEUS_BASE_PATH}/targets/node_exporter.json" + refresh_interval: 30s + + - job_name: "dcgm" + file_sd_configs: + - files: + - "${PROMETHEUS_BASE_PATH}/targets/dcgm_exporter.json" + refresh_interval: 30s diff --git a/src/metric/prometheus/build/start-prometheus-supervised.sh b/src/metric/prometheus/build/start-prometheus-supervised.sh new file mode 100755 index 0000000..75d9a39 --- /dev/null +++ b/src/metric/prometheus/build/start-prometheus-supervised.sh @@ -0,0 +1,26 @@ +#!/bin/bash +set -euo pipefail + +echo "[INFO] Starting Prometheus under supervisor..." 
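+# 补充说明(示意,非脚本原有内容):targets/ 下由 update_targets.py 生成的
+# file_sd 文件为标准 Prometheus 格式,例如 node_exporter.json(标签字段以
+# exporter_config.json 的模板为准,下面仅为假设示例):
+#   [{"targets": ["1.2.3.4:9100"],
+#     "labels": {"job": "node", "instance": "node-exporter-A1", "hostname": "dev-node-1"}}]
+# 由于下方以 --web.enable-lifecycle 启动,手工修改配置后可热加载:
+#   curl -X POST http://localhost:9090/-/reload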
+ +PROMETHEUS_BASE_PATH=${PROMETHEUS_BASE_PATH:-/private/argus/metric/prometheus} +DOMAIN=prom.metric.argus.com + +echo "[INFO] Prometheus base path: ${PROMETHEUS_BASE_PATH}" + +# 生成配置文件 +echo "[INFO] Generating prometheus.yml with base path: ${PROMETHEUS_BASE_PATH}" +sed "s|\${PROMETHEUS_BASE_PATH}|${PROMETHEUS_BASE_PATH}|g" \ + /etc/prometheus/prometheus.yml > ${PROMETHEUS_BASE_PATH}/prometheus.yml + +# 记录容器 IP +IP=$(ifconfig eth0 | awk '/inet /{print $2}') +echo "current IP: ${IP}" +echo "${IP}" > /private/argus/etc/${DOMAIN} + +exec /bin/prometheus \ + --config.file=${PROMETHEUS_BASE_PATH}/prometheus.yml \ + --storage.tsdb.path=/prometheus \ + --web.enable-lifecycle \ + --web.console.libraries=/usr/share/prometheus/console_libraries \ + --web.console.templates=/usr/share/prometheus/consoles diff --git a/src/metric/prometheus/build/start-targets-updater.sh b/src/metric/prometheus/build/start-targets-updater.sh new file mode 100755 index 0000000..a067003 --- /dev/null +++ b/src/metric/prometheus/build/start-targets-updater.sh @@ -0,0 +1,40 @@ +#!/bin/bash +set -euo pipefail + +echo "[INFO] Starting Prometheus Targets Updater under supervisor..." + +# 配置变量 +PROMETHEUS_BASE_PATH=${PROMETHEUS_BASE_PATH:-/private/argus/metric/prometheus} +NODES_CONFIG_FILE=${NODES_CONFIG_FILE:-${PROMETHEUS_BASE_PATH}/nodes.json} +TARGETS_DIR=${PROMETHEUS_BASE_PATH}/targets +EXPORTER_CONFIG_FILE=${EXPORTER_CONFIG_FILE:-${PROMETHEUS_BASE_PATH}/exporter_config.json} +CHECK_INTERVAL=${CHECK_INTERVAL:-30} +LOG_LEVEL=${LOG_LEVEL:-INFO} + +echo "[INFO] Prometheus base path: ${PROMETHEUS_BASE_PATH}" +echo "[INFO] Nodes config file: ${NODES_CONFIG_FILE}" +echo "[INFO] Targets directory: ${TARGETS_DIR}" +echo "[INFO] Exporter config file: ${EXPORTER_CONFIG_FILE}" +echo "[INFO] Check interval: ${CHECK_INTERVAL}s" +echo "[INFO] Log level: ${LOG_LEVEL}" + +# 确保目录存在 +mkdir -p "${TARGETS_DIR}" + +# 检查 EXPORTER_CONFIG_FILE 是否存在,没有则从内部复制 +if [ ! -f "${EXPORTER_CONFIG_FILE}" ]; then + echo "[INFO] exporter_config.json not found at ${EXPORTER_CONFIG_FILE}, copying from internal location..." 
+ cp /usr/local/bin/exporter_config.json "${EXPORTER_CONFIG_FILE}" + chown nobody:nogroup "${EXPORTER_CONFIG_FILE}" + echo "[INFO] Successfully copied exporter_config.json to ${EXPORTER_CONFIG_FILE}" +else + echo "[INFO] exporter_config.json already exists at ${EXPORTER_CONFIG_FILE}, skipping copy" +fi + +exec python3 /usr/local/bin/update_targets.py \ + --config "${NODES_CONFIG_FILE}" \ + --targets-dir "${TARGETS_DIR}" \ + --exporter-config "${EXPORTER_CONFIG_FILE}" \ + --log-level "${LOG_LEVEL}" \ + --daemon \ + --check-interval "${CHECK_INTERVAL}" diff --git a/src/metric/prometheus/build/supervisord.conf b/src/metric/prometheus/build/supervisord.conf new file mode 100755 index 0000000..5359989 --- /dev/null +++ b/src/metric/prometheus/build/supervisord.conf @@ -0,0 +1,51 @@ +[supervisord] +nodaemon=true +logfile=/var/log/supervisor/supervisord.log +pidfile=/var/run/supervisord.pid +user=root + +[program:prometheus] +command=/usr/local/bin/start-prometheus-supervised.sh +user=nobody +stdout_logfile=/var/log/supervisor/prometheus.log +stderr_logfile=/var/log/supervisor/prometheus_error.log +autorestart=true +startretries=3 +startsecs=30 +stopwaitsecs=30 +killasgroup=true +stopasgroup=true + +[program:targets-updater] +command=/usr/local/bin/start-targets-updater.sh +user=nobody +stdout_logfile=/var/log/supervisor/targets_updater.log +stderr_logfile=/var/log/supervisor/targets_updater_error.log +autorestart=true +startretries=3 +startsecs=10 +stopwaitsecs=30 +killasgroup=true +stopasgroup=true + +[program:dns-monitor] +command=/usr/local/bin/dns-monitor.sh +user=root +stdout_logfile=/var/log/supervisor/dns-monitor.log +stderr_logfile=/var/log/supervisor/dns-monitor_error.log +autorestart=true +startretries=3 +startsecs=5 +stopwaitsecs=10 +killasgroup=true +stopasgroup=true + +[unix_http_server] +file=/var/run/supervisor.sock +chmod=0700 + +[supervisorctl] +serverurl=unix:///var/run/supervisor.sock + +[rpcinterface:supervisor] +supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface \ No newline at end of file diff --git a/src/metric/prometheus/build/update_targets.py b/src/metric/prometheus/build/update_targets.py new file mode 100755 index 0000000..91b5dc8 --- /dev/null +++ b/src/metric/prometheus/build/update_targets.py @@ -0,0 +1,416 @@ +#!/usr/bin/env python3 +""" +Prometheus Targets 动态更新脚本 + +脚本从节点配置文件读取节点信息,并动态生成对应的 Prometheus targets 文件。 + +""" + +import json +import os +import sys +import logging +import argparse +import time +import hashlib +from datetime import datetime +from typing import Dict, List, Any +from pathlib import Path + + +class PrometheusTargetsManager: + """Prometheus Targets 管理器""" + + def __init__(self, config_file: str, targets_dir: str, exporter_config_file: str = None, log_level: str = "INFO"): + """ + 初始化管理器 + + Args: + config_file: 节点配置文件路径 + targets_dir: targets 文件输出目录 + exporter_config_file: exporter 配置文件路径 + log_level: 日志级别 + """ + self.config_file = Path(config_file) + self.targets_dir = Path(targets_dir) + self.exporter_config_file = Path(exporter_config_file) if exporter_config_file else None + self.log_level = log_level + self.last_mtime = 0 # 记录文件最后修改时间 + self.last_content_hash = None # 记录文件内容哈希 + + # 设置日志 + self._setup_logging() + + # 加载 exporter 配置(必需,失败则程序退出) + try: + full_config = self._load_exporter_config() + self.exporter_configs = full_config.get('exporters', {}) + self.label_templates = full_config.get('label_templates', {}) + except Exception as e: + self.logger.error(f"初始化失败,无法加载 exporter 配置: {e}") + raise + + # 确保 
targets 目录存在 + self.targets_dir.mkdir(parents=True, exist_ok=True) + + def _setup_logging(self): + """设置日志配置""" + logging.basicConfig( + level=getattr(logging, self.log_level.upper()), + format='%(asctime)s - %(levelname)s - %(message)s', + handlers=[ + logging.StreamHandler(sys.stdout), + logging.FileHandler(f'{self.targets_dir}/targets_update.log') + ] + ) + self.logger = logging.getLogger(__name__) + + def _load_exporter_config(self) -> Dict[str, Any]: + """ + 加载 exporter 配置文件 + + Returns: + exporter 配置字典 + + Raises: + FileNotFoundError: 配置文件不存在 + json.JSONDecodeError: JSON 格式错误 + ValueError: 配置格式错误 + """ + if not self.exporter_config_file: + raise FileNotFoundError("Exporter 配置文件路径未指定") + + if not self.exporter_config_file.exists(): + raise FileNotFoundError(f"Exporter 配置文件不存在: {self.exporter_config_file}") + + try: + with open(self.exporter_config_file, 'r', encoding='utf-8') as f: + config = json.load(f) + + if not isinstance(config, dict): + raise ValueError("Exporter 配置文件必须是 JSON 对象格式") + + exporters = config.get('exporters', {}) + if not isinstance(exporters, dict): + raise ValueError("exporters 配置必须是对象格式") + + if not exporters: + raise ValueError("exporters 配置不能为空") + + self.logger.info(f"成功加载 exporter 配置: {len(exporters)} 个 exporter") + return config + + except json.JSONDecodeError as e: + self.logger.error(f"Exporter 配置文件 JSON 解析错误: {e}") + raise + except Exception as e: + self.logger.error(f"加载 exporter 配置失败: {e}") + raise + + def load_nodes_config(self) -> List[Dict[str, Any]]: + """ + 加载节点配置文件 + + Returns: + 节点配置列表 + """ + try: + if not self.config_file.exists(): + self.logger.warning(f"节点配置文件不存在: {self.config_file}") + return [] + + with open(self.config_file, 'r', encoding='utf-8') as f: + nodes = json.load(f) + + if not isinstance(nodes, list): + self.logger.error("节点配置必须是数组格式") + return [] + + self.logger.info(f"成功加载 {len(nodes)} 个节点配置") + return nodes + + except json.JSONDecodeError as e: + self.logger.error(f"JSON 解析错误: {e}") + return [] + except Exception as e: + self.logger.error(f"加载节点配置失败: {e}") + return [] + + def generate_targets(self, nodes: List[Dict[str, Any]], exporter_type: str) -> List[Dict[str, Any]]: + """ + 生成指定类型的 targets 配置 + + Args: + nodes: 节点配置列表 + exporter_type: exporter 类型 (dcgm, node) + + Returns: + targets 配置列表 + """ + if exporter_type not in self.exporter_configs: + self.logger.error(f"不支持的 exporter 类型: {exporter_type}") + return [] + + config = self.exporter_configs[exporter_type] + targets = [] + + for node in nodes: + # 验证必要字段 + if not all(key in node for key in ['node_id', 'ip']): + self.logger.warning(f"节点配置缺少必要字段,跳过: {node}") + continue + + # 构建 target 地址 + target_address = f"{node['ip']}:{config['port']}" + + # 构建上下文变量 + context = { + 'node_id': node['node_id'], + 'ip': node['ip'], + 'hostname': node.get('hostname', ''), + 'user_id': node.get('user_id', ''), + 'tag': self._join_labels(node.get('labels', [])) + } + + # 使用模板生成标签 + label_template = self.label_templates.get(exporter_type, {}) + labels = {} + + for label_key, template_value in label_template.items(): + if isinstance(template_value, str) and '{' in template_value: + # 模板字符串,需要渲染 + labels[label_key] = self._render_label_template(template_value, context) + else: + # 固定值 + labels[label_key] = template_value + + targets.append({ + "targets": [target_address], + "labels": labels + }) + + self.logger.info(f"为 {exporter_type} exporter 生成了 {len(targets)} 个 targets") + return targets + + def write_targets_file(self, targets: List[Dict[str, Any]], exporter_type: str) -> None: + """ + 写入 
targets 文件 + + Args: + targets: targets 配置列表 + exporter_type: exporter 类型 + """ + filename = f"{exporter_type}_exporter.json" + filepath = self.targets_dir / filename + + try: + # 写入新文件 + with open(filepath, 'w', encoding='utf-8') as f: + json.dump(targets, f, indent=2, ensure_ascii=False) + + self.logger.info(f"成功写入 targets 文件: {filepath}") + + except Exception as e: + self.logger.error(f"写入 targets 文件失败: {e}") + raise + + def update_all_targets(self) -> None: + """更新所有类型的 targets 文件""" + try: + # 加载节点配置 + nodes = self.load_nodes_config() + + if not nodes: + self.logger.warning("没有找到任何节点配置") + return + + # 为每种 exporter 类型生成 targets + for exporter_type in self.exporter_configs.keys(): + targets = self.generate_targets(nodes, exporter_type) + if targets: # 只有当有 targets 时才写入文件 + self.write_targets_file(targets, exporter_type) + + self.logger.info("所有 targets 文件更新完成") + + except Exception as e: + self.logger.error(f"更新 targets 失败: {e}") + raise + + def _calculate_file_hash(self, file_path: Path) -> str: + """ + 计算文件内容的 MD5 哈希值 + + Args: + file_path: 文件路径 + + Returns: + 文件内容的 MD5 哈希值 + """ + try: + with open(file_path, 'rb') as f: + content = f.read() + return hashlib.md5(content).hexdigest() + except Exception as e: + self.logger.error(f"计算文件哈希失败: {e}") + return "" + + def _render_label_template(self, template: str, context: Dict[str, str]) -> str: + """ + 渲染标签模板 + + Args: + template: 模板字符串,如 "dcgm-exporter-{node_id}" + context: 上下文变量字典 + + Returns: + 渲染后的字符串 + """ + try: + return template.format(**context) + except KeyError as e: + self.logger.warning(f"模板渲染失败,缺少变量 {e}: {template}") + return template + except Exception as e: + self.logger.warning(f"模板渲染失败: {e}") + return template + + def _join_labels(self, labels_list: List[str]) -> str: + """ + 将 labels 数组拼接成一个字符串 + + Args: + labels_list: 标签字符串数组 + + Returns: + 拼接后的字符串,用逗号分隔 + """ + if not labels_list: + return "" + + # 过滤掉空字符串和 None 值 + valid_labels = [label.strip() for label in labels_list if label and label.strip()] + + return ",".join(valid_labels) + + def check_file_changed(self) -> bool: + """ + 检查配置文件是否发生变化 + + Returns: + True 如果文件发生变化,False 否则 + """ + try: + if not self.config_file.exists(): + return False + + # 计算当前文件内容哈希 + current_hash = self._calculate_file_hash(self.config_file) + if not current_hash: + return False + + # 如果是第一次检查,记录哈希并触发更新 + if self.last_content_hash is None: + self.last_content_hash = current_hash + self.logger.info("首次检查,记录文件内容哈希并触发初始更新") + return True + + # 比较内容哈希 + if current_hash != self.last_content_hash: + self.last_content_hash = current_hash + self.logger.info("检测到文件内容变化") + return True + + return False + + except Exception as e: + self.logger.error(f"检查文件变化失败: {e}") + return False + + def run_daemon(self, check_interval: int = 30) -> None: + """ + 以守护进程模式运行,定期检查文件变化 + + Args: + check_interval: 检查间隔(秒) + """ + self.logger.info(f"启动守护进程模式,检查间隔: {check_interval}秒") + + try: + while True: + if self.check_file_changed(): + self.logger.info("检测到配置文件变化,开始更新 targets") + self.update_all_targets() + else: + self.logger.debug("配置文件无变化,跳过更新") + + time.sleep(check_interval) + + except KeyboardInterrupt: + self.logger.info("收到中断信号,正在退出...") + except Exception as e: + self.logger.error(f"守护进程运行错误: {e}") + raise + + +def main(): + """主函数""" + parser = argparse.ArgumentParser(description="Prometheus Targets 动态更新脚本 (精简版)") + parser.add_argument( + "--config", + default="/private/argus/metric/prometheus/nodes.json", + help="节点配置文件路径 (默认: /private/argus/metric/prometheus/nodes.json)" + ) + parser.add_argument( + 
"--targets-dir", + default="/private/argus/metric/prometheus/targets", + help="targets 文件输出目录 (默认: /private/argus/metric/prometheus/targets)" + ) + parser.add_argument( + "--exporter-config", + default="/private/argus/metric/prometheus/exporter_config.json", + help="exporter 配置文件路径 (默认: /private/argus/metric/prometheus/exporter_config.json)" + ) + parser.add_argument( + "--log-level", + choices=["DEBUG", "INFO", "WARNING", "ERROR"], + default="INFO", + help="日志级别 (默认: INFO)" + ) + parser.add_argument( + "--daemon", + action="store_true", + help="以守护进程模式运行" + ) + parser.add_argument( + "--check-interval", + type=int, + default=30, + help="守护进程模式下的检查间隔(秒,默认: 30)" + ) + + args = parser.parse_args() + + try: + # 创建管理器 + manager = PrometheusTargetsManager( + config_file=args.config, + targets_dir=args.targets_dir, + exporter_config_file=args.exporter_config, + log_level=args.log_level + ) + + if args.daemon: + # 守护进程模式 + manager.run_daemon(args.check_interval) + else: + # 单次执行模式 + manager.update_all_targets() + print("成功更新所有 exporter targets") + + except Exception as e: + print(f"错误: {e}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/metric/prometheus/demo-targets/dcgm_exporter.json b/src/metric/prometheus/demo-targets/dcgm_exporter.json new file mode 100644 index 0000000..f551adb --- /dev/null +++ b/src/metric/prometheus/demo-targets/dcgm_exporter.json @@ -0,0 +1,9 @@ +[ + { + "targets": ["localhost:9400"], + "labels": { + "job": "dcgm", + "instance": "dcgm-exporter" + } + } +] diff --git a/src/metric/prometheus/demo-targets/node_exporter.json b/src/metric/prometheus/demo-targets/node_exporter.json new file mode 100644 index 0000000..37b5104 --- /dev/null +++ b/src/metric/prometheus/demo-targets/node_exporter.json @@ -0,0 +1,9 @@ +[ + { + "targets": ["localhost:9100", "192.168.16.116:9100"], + "labels": { + "job": "node", + "instance": "node-exporter" + } + } +] diff --git a/src/metric/tests/.gitignore b/src/metric/tests/.gitignore new file mode 100644 index 0000000..b73f619 --- /dev/null +++ b/src/metric/tests/.gitignore @@ -0,0 +1,7 @@ +.env +data/ +images-cache/ +*.tar +*.log +.DS_Store + diff --git a/src/metric/tests/README.md b/src/metric/tests/README.md new file mode 100644 index 0000000..2898e45 --- /dev/null +++ b/src/metric/tests/README.md @@ -0,0 +1,171 @@ +# E2E Test - Argus Metric 部署测试 +## 概述 + +本项目用于对 Argus Metric 模块进行端到端(E2E)部署测试。 +通过一键脚本可快速搭建 Prometheus、FTP、Grafana 等服务,验证 Metric 模块的完整部署与运行流程。 + +## 拉取完整项目,进入 metric.tests 目录 + +``` bash +git clone https://git.nasp.fit/NASP/argus.git + +cd {$PROJECT_ROOT}/argus/src/metric/tests +``` + +## 一键构建与部署 Prometheus / FTP / Grafana +### 1. 修改环境变量文件 + +将示例配置文件复制为 .env 并根据实际情况修改: + +``` bash +cp env.example .env +``` + +### 2. 一键启动服务 + +执行以下命令完成环境初始化、镜像构建与服务启动: + +``` bash +sudo bash start-all.sh +``` + +该脚本将自动完成: +- 初始化目录结构(如 /private/argus/metric) +- 构建各服务 Docker 镜像 +- 启动 Prometheus、FTP、Grafana 容器 + +### 3. 检查容器日志 + +可手动验证容器运行状态: + +``` bash +docker logs argus-metric-ftp +docker logs argus-metric-grafana +docker logs argus-metric-prometheus +``` + +如日志输出中无 ERROR 或 supervisor 报错信息,则表示服务启动正常。 + +## 客户端安装包打包与分发 + +> **前置说明**:完整的 `all-in-one` 安装包打包分发框架因包含大量二进制文件和依赖包,无法上传至 Git 仓库。请先联系项目管理员获取最新的 `all-in-one` 完整框架,再执行后续操作。 + +打包后服务端会将安装包发布至 FTP 共享目录,默认路径为: + +``` bash +$DATA_ROOT/ftp/share +``` + +发布后的文件权限与 FTP 目录账户保持一致。 + +### 1. 递增版本号 +``` bash +bash scripts/version-manager.sh bump minor +``` +该脚本会自动更新版本号(如 1.101.0 → 1.102.0)。 + +### 2. 
打包安装制品 +``` bash +bash scripts/package_artifact.sh +``` +执行后会在输出目录中生成压缩包或安装脚本。 + +### 3. 发布制品至 FTP +``` bash +sudo bash scripts/publish_artifact.sh $VERSION --output-dir $OUTPUT_DIR --owner $UID:$GID +``` + +参数说明: + +参数 说明 +$VERSION 发布版本号(如 1.102.0) +$OUTPUT_DIR 输出目录(默认 /private/argus/ftp/share) +$UID:$GID 文件属主(用户ID:组ID) + +示例: + +``` bash +sudo bash scripts/publish_artifact.sh 1.102.0 --output-dir /private/argus/ftp/share --owner 2133:2015 +``` + +更多详情可参考 client-plugins/all-in-one/README.md。 + +## 客户端安装(通过 FTP) + +客户端下载与安装步骤如下: + +``` bash +curl -u ${USER}:${PASSWD} ftp://${FTP_SERVER}/setup.sh -o setup.sh +chmod +x setup.sh +sudo bash setup.sh --server ${FTP_SERVER} --user ${USER} --password ${PASSWD} --port ${PORT} +``` + +参数说明: + +参数 说明 +$FTP_SERVER 服务器 IP 地址 +$USER 默认 ftpuser +$PASSWD 默认 ZGClab1234! +$PORT FTP 服务端口(需与 .env 保持一致) + +示例: + +``` bash +curl -u ftpuser:ZGClab1234! ftp://10.211.55.4/setup.sh -o setup.sh +chmod +x setup.sh +sudo bash setup.sh --server 10.211.55.4 --user ftpuser --password 'ZGClab1234!' --port 2122 +``` + +更多细节可参考 client-plugins/all-in-one/README.md。 + +## 模拟 Argus-Master 配置下发 + +可通过手动写入 nodes.json 文件模拟 Argus-Master 对 Argus-Metric 的配置下发: + +``` json +[ + { + "node_id": "A1", + "user_id": "sundapeng", + "ip": "10.211.55.4", + "hostname": "dev-sundapeng-nsche-wohen-pod-0", + "labels": ["label-a", "label-b"] + } +] +``` + +路径: + +``` bash +${DATA_ROOT}/prometheus/nodes.json +``` + +Argus-Metric 中的 prometheus 模块会自动解析该文件,并将其拆分生成目标配置: + +``` bash +${DATA_ROOT}/prometheus/targets/ +``` + +## Grafana 手动配置(如未自动接入 Prometheus) + +如 Grafana 未自动导入 Prometheus 数据源,可手动执行以下操作: + +1. 添加数据源 +- 进入 Grafana → Data sources +- 选择 Add data source → Prometheus +- URL 填写:http://prometheus:9090(Docker 内部 DNS 地址) + +2. 导入测试 Dashboard +- 打开 Grafana → Dashboards → Import +- 上传或粘贴 test_grafana_dashboard.json + +## 查看监控数据 +Prometheus 访问以下地址查看节点活性: +``` bash +http://127.0.0.1:9091/targets +``` + +Grafana 访问以下地址查看监控大屏: +``` bash +http://127.0.0.1:3000/d/node_gpu_metrics/node-and-gpu-metrics +``` \ No newline at end of file diff --git a/src/metric/tests/check-paths.sh b/src/metric/tests/check-paths.sh new file mode 100755 index 0000000..bc93897 --- /dev/null +++ b/src/metric/tests/check-paths.sh @@ -0,0 +1,106 @@ +#!/bin/bash + +# 路径检查脚本 +# 用于验证所有必要的构建目录是否存在 + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +echo "==========================================" +echo " 路径检查脚本" +echo "==========================================" +echo "" +echo "当前脚本目录: $SCRIPT_DIR" +echo "当前工作目录: $(pwd)" +echo "" + +# 检查配置文件 +echo "检查配置文件..." +if [ -f "$SCRIPT_DIR/docker-compose.yml" ]; then + echo " ✓ docker-compose.yml 存在" +else + echo " ✗ docker-compose.yml 不存在" +fi + +if [ -f "$SCRIPT_DIR/.env" ]; then + echo " ✓ .env 存在" +elif [ -f "$SCRIPT_DIR/env.example" ]; then + echo " ⚠ .env 不存在,但 env.example 存在" +else + echo " ✗ .env 和 env.example 都不存在" +fi +echo "" + +# 检查构建目录 +echo "检查构建目录..." +BUILD_DIRS=( + "../ftp/build" + "../prometheus/build" + "../grafana/build" +) + +all_exist=true +for dir in "${BUILD_DIRS[@]}"; do + full_path="$SCRIPT_DIR/$dir" + if [ -d "$full_path" ]; then + echo " ✓ $dir" + echo " 完整路径: $full_path" + else + echo " ✗ $dir 不存在" + echo " 查找路径: $full_path" + all_exist=false + fi +done +echo "" + +# 检查 Dockerfile +echo "检查 Dockerfile..." 
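+# Each image is built from its own context (../<service>/build), so a missing
+# Dockerfile here means that service cannot be built by docker-compose.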
+DOCKERFILES=( + "../ftp/build/Dockerfile" + "../prometheus/build/Dockerfile" + "../grafana/build/Dockerfile" +) + +for dockerfile in "${DOCKERFILES[@]}"; do + full_path="$SCRIPT_DIR/$dockerfile" + if [ -f "$full_path" ]; then + echo " ✓ $dockerfile" + else + echo " ✗ $dockerfile 不存在" + echo " 查找路径: $full_path" + all_exist=false + fi +done +echo "" + +# 检查数据目录(可选) +if [ -f "$SCRIPT_DIR/.env" ]; then + source "$SCRIPT_DIR/.env" + DATA_ROOT=${DATA_ROOT:-./data} + + echo "检查数据目录..." + echo " 数据根目录: $DATA_ROOT" + + if [ -d "$SCRIPT_DIR/$DATA_ROOT" ]; then + echo " ✓ 数据目录存在" + ls -la "$SCRIPT_DIR/$DATA_ROOT" | head -10 + else + echo " ⚠ 数据目录不存在(首次运行时会自动创建)" + fi + echo "" +fi + +# 总结 +echo "==========================================" +if $all_exist; then + echo " ✓ 所有必要的文件和目录都存在" + echo " 可以运行 ./start-all.sh 启动服务" +else + echo " ✗ 部分文件或目录缺失" + echo " 请检查项目结构是否完整" +fi +echo "==========================================" +echo "" + diff --git a/src/metric/tests/docker-compose.yml b/src/metric/tests/docker-compose.yml new file mode 100644 index 0000000..45ea0ac --- /dev/null +++ b/src/metric/tests/docker-compose.yml @@ -0,0 +1,105 @@ +services: + ftp: + build: + context: ../ftp/build + dockerfile: Dockerfile + args: + FTP_UID: ${FTP_UID:-2133} + FTP_GID: ${FTP_GID:-2015} + image: argus-metric-ftp:latest + container_name: argus-ftp + restart: unless-stopped + environment: + - FTP_BASE_PATH=/private/argus/ftp + - FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!} + - DOMAIN=${FTP_DOMAIN:-prom.ftp.argus.com} + - FTP_UID=${FTP_UID:-2133} + - FTP_GID=${FTP_GID:-2015} + ports: + - "${FTP_PORT:-21}:21" + - "${FTP_DATA_PORT:-20}:20" + - "21100-21110:21100-21110" + volumes: + - ${DATA_ROOT:-./data}/ftp:/private/argus/ftp + - ${DATA_ROOT:-./data}/etc:/private/argus/etc + networks: + - argus-network + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" + + prometheus: + build: + context: ../prometheus/build + dockerfile: Dockerfile + args: + PROMETHEUS_UID: ${PROMETHEUS_UID:-2133} + PROMETHEUS_GID: ${PROMETHEUS_GID:-2015} + USE_INTRANET: ${USE_INTRANET:-false} + image: argus-metric-prometheus:latest + container_name: argus-prometheus + restart: unless-stopped + environment: + - PROMETHEUS_BASE_PATH=/private/argus/metric/prometheus + - PROMETHEUS_UID=${PROMETHEUS_UID:-2133} + - PROMETHEUS_GID=${PROMETHEUS_GID:-2015} + ports: + - "${PROMETHEUS_PORT:-9090}:9090" + volumes: + - ${DATA_ROOT:-./data}/prometheus:/private/argus/metric/prometheus + - ${DATA_ROOT:-./data}/etc:/private/argus/etc + networks: + - argus-network + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" + + grafana: + build: + context: ../grafana/build + dockerfile: Dockerfile + args: + GRAFANA_UID: ${GRAFANA_UID:-2133} + GRAFANA_GID: ${GRAFANA_GID:-2015} + image: argus-metric-grafana:latest + container_name: argus-grafana + restart: unless-stopped + environment: + - GRAFANA_BASE_PATH=/private/argus/metric/grafana + - GRAFANA_UID=${GRAFANA_UID:-2133} + - GRAFANA_GID=${GRAFANA_GID:-2015} + - GF_SERVER_HTTP_PORT=3000 + - GF_LOG_LEVEL=warn + - GF_LOG_MODE=console + ports: + - "${GRAFANA_PORT:-3000}:3000" + volumes: + - ${DATA_ROOT:-./data}/grafana:/private/argus/metric/grafana + - ${DATA_ROOT:-./data}/etc:/private/argus/etc + networks: + - argus-network + depends_on: + - prometheus + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" + +networks: + argus-network: + driver: bridge + name: argus-network + +volumes: + ftp_data: + driver: local + prometheus_data: + driver: local + 
grafana_data: + driver: local + diff --git a/src/metric/tests/env.example b/src/metric/tests/env.example new file mode 100644 index 0000000..9d72de2 --- /dev/null +++ b/src/metric/tests/env.example @@ -0,0 +1,26 @@ +# 用户和组配置 +FTP_UID=2133 +FTP_GID=2015 +PROMETHEUS_UID=2133 +PROMETHEUS_GID=2015 +GRAFANA_UID=2133 +GRAFANA_GID=2015 + +# 数据根目录 +DATA_ROOT=/private/argus + +# FTP 配置 +FTP_PORT=2122 +FTP_DATA_PORT=2022 +FTP_PASSWORD=ZGClab1234! +FTP_DOMAIN=prom.ftp.argus.com + +# Prometheus 配置 +PROMETHEUS_PORT=9090 + +# Grafana 配置 +GRAFANA_PORT=3000 + +# 网络配置 +USE_INTRANET=false + diff --git a/src/metric/tests/init-directories.sh b/src/metric/tests/init-directories.sh new file mode 100755 index 0000000..df5d719 --- /dev/null +++ b/src/metric/tests/init-directories.sh @@ -0,0 +1,90 @@ +#!/bin/bash + +# 初始化目录脚本 +# 用于创建所有必要的数据目录并设置正确的权限 + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +# 加载 .env 文件(如果存在) +if [ -f .env ]; then + echo "加载 .env 配置文件..." + source .env +fi + +# 默认配置 +FTP_UID=${FTP_UID:-2133} +FTP_GID=${FTP_GID:-2015} +PROMETHEUS_UID=${PROMETHEUS_UID:-2133} +PROMETHEUS_GID=${PROMETHEUS_GID:-2015} +GRAFANA_UID=${GRAFANA_UID:-2133} +GRAFANA_GID=${GRAFANA_GID:-2015} +DATA_ROOT=${DATA_ROOT:-./data} + +echo "开始初始化目录结构..." +echo "数据目录: ${DATA_ROOT}" +echo "" + +# 创建 FTP 目录 +echo "创建 FTP 目录..." +sudo mkdir -p ${DATA_ROOT}/ftp/share +sudo chown -R ${FTP_UID}:${FTP_GID} ${DATA_ROOT}/ftp +sudo chmod -R 755 ${DATA_ROOT}/ftp + +# 创建 Prometheus 目录 +echo "创建 Prometheus 目录..." +sudo mkdir -p ${DATA_ROOT}/prometheus/{data,rules,targets} + +# 创建默认的 targets 文件(先创建文件再改权限) +if [ ! -f "${DATA_ROOT}/prometheus/targets/node_exporter.json" ]; then + echo "创建默认 node_exporter targets..." + echo '[ + { + "targets": [], + "labels": { + "job": "node" + } + } +]' | sudo tee ${DATA_ROOT}/prometheus/targets/node_exporter.json > /dev/null +fi + +if [ ! -f "${DATA_ROOT}/prometheus/targets/dcgm_exporter.json" ]; then + echo "创建默认 dcgm_exporter targets..." + echo '[ + { + "targets": [], + "labels": { + "job": "dcgm" + } + } +]' | sudo tee ${DATA_ROOT}/prometheus/targets/dcgm_exporter.json > /dev/null +fi + +# 统一设置 Prometheus 目录权限 +sudo chown -R ${PROMETHEUS_UID}:${PROMETHEUS_GID} ${DATA_ROOT}/prometheus +sudo chmod -R 755 ${DATA_ROOT}/prometheus + +# 创建 Grafana 目录 +echo "创建 Grafana 目录..." +sudo mkdir -p ${DATA_ROOT}/grafana/{data,logs,plugins,provisioning/datasources,provisioning/dashboards,data/sessions,data/dashboards,config} +sudo chown -R ${GRAFANA_UID}:${GRAFANA_GID} ${DATA_ROOT}/grafana +sudo chmod -R 755 ${DATA_ROOT}/grafana + +# 创建公共配置目录 +sudo mkdir -p ${DATA_ROOT}/etc +sudo chown -R ${FTP_UID}:${FTP_GID} ${DATA_ROOT}/etc +sudo chmod -R 755 ${DATA_ROOT}/etc + +echo "目录初始化完成!" 
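+
+# The layout printed below matches what was just created: one bind-mount root
+# per container under ${DATA_ROOT}, owned by the UID/GID values from .env
+# (or the defaults above) so the containers can write to them.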
+echo "" +echo "目录结构:" +echo " ${DATA_ROOT}/" +echo " ├── ftp/ (UID:${FTP_UID}, GID:${FTP_GID})" +echo " ├── prometheus/ (UID:${PROMETHEUS_UID}, GID:${PROMETHEUS_GID})" +echo " ├── grafana/ (UID:${GRAFANA_UID}, GID:${GRAFANA_GID})" +echo " └── etc/ (UID:${FTP_UID}, GID:${FTP_GID})" +echo "" +echo "您现在可以运行 'docker-compose up -d' 来启动所有服务" + diff --git a/src/metric/tests/init-environment.sh b/src/metric/tests/init-environment.sh new file mode 100755 index 0000000..38f23d3 --- /dev/null +++ b/src/metric/tests/init-environment.sh @@ -0,0 +1,105 @@ +#!/bin/bash + +################################################################################ +# Ubuntu 22.04 环境初始化脚本 +# 用途:安装开发测试环境所需的基础工具 +# 系统要求:Ubuntu 22.04 +# 使用方法:sudo ./init_environment.sh +################################################################################ + +set -e + +echo "===================================" +echo "开始安装环境依赖..." +echo "===================================" + +# 更新系统 +echo "[1/4] 更新系统包列表..." +apt-get update -y + +# 安装基础工具 +echo "[2/4] 安装基础工具..." +apt-get install -y \ + vim \ + curl \ + wget \ + git \ + htop \ + tree \ + net-tools \ + dnsutils \ + iputils-ping \ + telnet \ + traceroute \ + lsof \ + unzip \ + zip \ + tar \ + jq \ + ca-certificates \ + gnupg \ + lsb-release \ + software-properties-common \ + apt-transport-https \ + build-essential \ + python3 \ + python3-pip \ + python3-venv \ + tmux \ + ncdu + +# 安装 Docker +echo "[3/4] 安装 Docker..." + +# 卸载旧版本 +apt-get remove -y docker docker-engine docker.io containerd runc 2>/dev/null || true + +# 添加 Docker 官方 GPG key +install -m 0755 -d /etc/apt/keyrings +curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg +chmod a+r /etc/apt/keyrings/docker.gpg + +# 添加 Docker 仓库 +echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ + $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ + tee /etc/apt/sources.list.d/docker.list > /dev/null + +# 更新包列表并安装 Docker +apt-get update -y +apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin + +# 启动 Docker 服务 +systemctl start docker +systemctl enable docker + +# 添加当前用户到 docker 组 +if [ -n "$SUDO_USER" ]; then + usermod -aG docker "$SUDO_USER" + echo "✓ 用户 $SUDO_USER 已添加到 docker 组" +fi + +# 清理 +echo "[4/4] 清理..." +apt-get autoremove -y +apt-get autoclean -y + +# 显示安装结果 +echo "" +echo "===================================" +echo "安装完成!" 
+echo "===================================" +echo "" +echo "已安装:" +echo " ✓ vim" +echo " ✓ curl, wget, git" +echo " ✓ Docker: $(docker --version)" +echo " ✓ Docker Compose: $(docker compose version)" +echo " ✓ Python: $(python3 --version)" +echo " ✓ 其他基础工具 (htop, tree, jq, tmux 等)" +echo "" +if [ -n "$SUDO_USER" ]; then + echo "提示:请重新登录以使 docker 组权限生效" +fi +echo "" + diff --git a/src/metric/tests/manage-images.sh b/src/metric/tests/manage-images.sh new file mode 100755 index 0000000..b47150b --- /dev/null +++ b/src/metric/tests/manage-images.sh @@ -0,0 +1,371 @@ +#!/bin/bash + +# Docker 镜像管理脚本 +# 支持构建、保存、加载、清理镜像 + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +# 检测 docker-compose 命令 +if command -v docker-compose &> /dev/null; then + DOCKER_COMPOSE="docker-compose" +elif docker compose version &> /dev/null 2>&1; then + DOCKER_COMPOSE="docker compose" +else + echo "错误: 未找到 docker-compose 或 docker compose 命令" + exit 1 +fi + +# 镜像缓存目录 +IMAGE_CACHE_DIR="./images-cache" +mkdir -p "$IMAGE_CACHE_DIR" + +# 定义镜像列表 +IMAGES=( + "argus-metric-ftp:latest" + "argus-metric-prometheus:latest" + "argus-metric-grafana:latest" +) + +# 镜像文件名映射 +declare -A IMAGE_FILES=( + ["argus-metric-ftp:latest"]="argus-ftp.tar" + ["argus-metric-prometheus:latest"]="argus-prometheus.tar" + ["argus-metric-grafana:latest"]="argus-grafana.tar" +) + +# 检查镜像是否存在 +check_image_exists() { + local image=$1 + if docker images --format "{{.Repository}}:{{.Tag}}" | grep -q "^${image}$"; then + return 0 + else + return 1 + fi +} + +# 加载镜像 +load_image() { + local image=$1 + local file="${IMAGE_CACHE_DIR}/${IMAGE_FILES[$image]}" + + if [ -f "$file" ]; then + echo "正在从缓存加载镜像: $image" + docker load -i "$file" + return 0 + else + return 1 + fi +} + +# 保存镜像 +save_image() { + local image=$1 + local file="${IMAGE_CACHE_DIR}/${IMAGE_FILES[$image]}" + + if check_image_exists "$image"; then + echo "正在保存镜像到缓存: $image" + docker save -o "$file" "$image" + echo "已保存: $file ($(du -h "$file" | cut -f1))" + return 0 + else + echo "镜像不存在: $image" + return 1 + fi +} + +# 构建所有镜像 +build_all() { + echo "==========================================" + echo " 构建所有 Docker 镜像" + echo "==========================================" + echo "" + + local build_flag="${1:---no-cache}" + + echo "开始构建镜像..." + $DOCKER_COMPOSE build $build_flag + + echo "" + echo "构建完成!" 
+} + +# 保存所有镜像 +save_all() { + echo "==========================================" + echo " 保存所有 Docker 镜像到缓存" + echo "==========================================" + echo "" + + for image in "${IMAGES[@]}"; do + if save_image "$image"; then + echo "✓ $image" + else + echo "✗ $image (跳过)" + fi + echo "" + done + + echo "缓存目录: $IMAGE_CACHE_DIR" + echo "总大小: $(du -sh "$IMAGE_CACHE_DIR" | cut -f1)" +} + +# 加载所有镜像 +load_all() { + echo "==========================================" + echo " 从缓存加载所有 Docker 镜像" + echo "==========================================" + echo "" + + local loaded=0 + local skipped=0 + + for image in "${IMAGES[@]}"; do + if check_image_exists "$image"; then + echo "镜像已存在,跳过: $image" + ((skipped++)) + elif load_image "$image"; then + echo "✓ 已加载: $image" + ((loaded++)) + else + echo "✗ 缓存不存在: $image" + fi + echo "" + done + + echo "加载: $loaded, 跳过: $skipped" +} + +# 检查镜像状态 +status() { + echo "==========================================" + echo " 镜像状态" + echo "==========================================" + echo "" + + echo "Docker 镜像:" + for image in "${IMAGES[@]}"; do + if check_image_exists "$image"; then + local size=$(docker images --format "{{.Size}}" "$image" | head -1) + echo " ✓ $image ($size)" + else + echo " ✗ $image (未构建)" + fi + done + + echo "" + echo "缓存文件:" + if [ -d "$IMAGE_CACHE_DIR" ] && [ "$(ls -A $IMAGE_CACHE_DIR 2>/dev/null)" ]; then + for image in "${IMAGES[@]}"; do + local file="${IMAGE_CACHE_DIR}/${IMAGE_FILES[$image]}" + if [ -f "$file" ]; then + echo " ✓ ${IMAGE_FILES[$image]} ($(du -h "$file" | cut -f1))" + else + echo " ✗ ${IMAGE_FILES[$image]} (不存在)" + fi + done + echo "" + echo "缓存总大小: $(du -sh "$IMAGE_CACHE_DIR" | cut -f1)" + else + echo " (无缓存文件)" + fi +} + +# 清理缓存 +clean_cache() { + echo "==========================================" + echo " 清理镜像缓存" + echo "==========================================" + echo "" + + if [ -d "$IMAGE_CACHE_DIR" ] && [ "$(ls -A $IMAGE_CACHE_DIR 2>/dev/null)" ]; then + echo "缓存目录: $IMAGE_CACHE_DIR" + echo "大小: $(du -sh "$IMAGE_CACHE_DIR" | cut -f1)" + echo "" + read -p "确认删除所有缓存文件? (y/N): " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + rm -rf "$IMAGE_CACHE_DIR"/*.tar + echo "已清理缓存文件" + else + echo "已取消" + fi + else + echo "没有缓存文件" + fi +} + +# 清理 Docker 镜像 +clean_images() { + echo "==========================================" + echo " 清理 Docker 镜像" + echo "==========================================" + echo "" + + local exists=0 + for image in "${IMAGES[@]}"; do + if check_image_exists "$image"; then + exists=1 + break + fi + done + + if [ $exists -eq 0 ]; then + echo "没有需要清理的镜像" + return + fi + + echo "将删除以下镜像:" + for image in "${IMAGES[@]}"; do + if check_image_exists "$image"; then + echo " - $image" + fi + done + echo "" + + read -p "确认删除这些镜像? 
(y/N): " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + for image in "${IMAGES[@]}"; do + if check_image_exists "$image"; then + docker rmi "$image" + echo "已删除: $image" + fi + done + else + echo "已取消" + fi +} + +# 智能准备镜像(自动检测并加载或构建) +prepare() { + echo "==========================================" + echo " 智能准备 Docker 镜像" + echo "==========================================" + echo "" + + local need_build=() + local loaded=0 + local existed=0 + + for image in "${IMAGES[@]}"; do + if check_image_exists "$image"; then + echo "✓ 镜像已存在: $image" + ((existed++)) + elif load_image "$image"; then + echo "✓ 已从缓存加载: $image" + ((loaded++)) + else + echo "✗ 需要构建: $image" + need_build+=("$image") + fi + done + + echo "" + echo "统计: 已存在 $existed, 已加载 $loaded, 需构建 ${#need_build[@]}" + + if [ ${#need_build[@]} -gt 0 ]; then + echo "" + echo "需要构建以下镜像:" + for image in "${need_build[@]}"; do + echo " - $image" + done + echo "" + + read -p "是否现在构建? (Y/n): " -n 1 -r + echo + if [[ ! $REPLY =~ ^[Nn]$ ]]; then + build_all "" + echo "" + read -p "是否保存新构建的镜像到缓存? (Y/n): " -n 1 -r + echo + if [[ ! $REPLY =~ ^[Nn]$ ]]; then + save_all + fi + fi + else + echo "" + echo "所有镜像已就绪!" + fi +} + +# 显示帮助 +show_help() { + cat << EOF +Docker 镜像管理工具 + +用法: $0 + +命令: + prepare 智能准备镜像(推荐)- 自动检测、加载或构建 + build 构建所有镜像 + build-cache 使用缓存构建 + save 保存所有镜像到缓存 + load 从缓存加载所有镜像 + status 查看镜像状态 + clean-cache 清理缓存文件 + clean-images 清理 Docker 镜像 + clean-all 清理缓存和镜像 + help 显示此帮助信息 + +示例: + # 智能准备(首次使用或镜像丢失时) + $0 prepare + + # 构建并保存镜像 + $0 build + $0 save + + # 从缓存加载镜像 + $0 load + + # 查看状态 + $0 status + +镜像缓存目录: $IMAGE_CACHE_DIR/ +EOF +} + +# 主逻辑 +case "${1:-help}" in + prepare) + prepare + ;; + build) + build_all "--no-cache" + ;; + build-cache) + build_all "" + ;; + save) + save_all + ;; + load) + load_all + ;; + status) + status + ;; + clean-cache) + clean_cache + ;; + clean-images) + clean_images + ;; + clean-all) + clean_cache + clean_images + ;; + help|--help|-h) + show_help + ;; + *) + echo "错误: 未知命令 '$1'" + echo "" + show_help + exit 1 + ;; +esac + diff --git a/src/metric/tests/start-all.sh b/src/metric/tests/start-all.sh new file mode 100755 index 0000000..b0ceb72 --- /dev/null +++ b/src/metric/tests/start-all.sh @@ -0,0 +1,199 @@ +#!/bin/bash + +# 一键启动脚本 +# 用于初始化目录、构建镜像并启动所有服务 + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +echo "==========================================" +echo " Argus Metrics 一键启动脚本" +echo "==========================================" +echo "" +echo "当前工作目录: $SCRIPT_DIR" +echo "" + +# 检查 Docker 和 Docker Compose +if ! command -v docker &> /dev/null; then + echo "错误: 未找到 docker 命令,请先安装 Docker" + exit 1 +fi + +# 检测 docker-compose 命令(兼容新旧版本) +COMPOSE_FILE="$SCRIPT_DIR/docker-compose.yml" +if command -v docker-compose &> /dev/null; then + DOCKER_COMPOSE="docker-compose -f $COMPOSE_FILE" + echo "使用: docker-compose" +elif docker compose version &> /dev/null 2>&1; then + DOCKER_COMPOSE="docker compose -f $COMPOSE_FILE" + echo "使用: docker compose" +else + echo "错误: 未找到 docker-compose 或 docker compose 命令" + exit 1 +fi +echo "Compose 文件: $COMPOSE_FILE" +echo "" + +# 检查必要的构建目录 +echo "检查构建目录..." +BUILD_DIRS=( + "../ftp/build" + "../prometheus/build" + "../grafana/build" +) + +for dir in "${BUILD_DIRS[@]}"; do + if [ ! -d "$dir" ]; then + echo "错误: 构建目录不存在: $dir" + echo "完整路径: $(cd "$(dirname "$dir")" 2>/dev/null && pwd)/$(basename "$dir")" + exit 1 + else + echo " ✓ 找到: $dir" + fi +done +echo "" + +# 检查并创建 .env 文件 +if [ ! -f .env ]; then + echo "未找到 .env 文件,从 env.example 创建..." 
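+    # First run: seed .env from the tracked env.example so startup can continue
+    # with the default ports and credentials; edit .env afterwards if needed.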
+ cp env.example .env + echo "已创建 .env 文件,请根据需要修改配置" +fi + +# 加载环境变量 +source .env + +echo "1. 初始化目录结构..." +bash "$SCRIPT_DIR/init-directories.sh" + +echo "" +echo "2. 准备 Docker 镜像..." + +# 检查镜像是否存在 +IMAGE_CACHE_DIR="./images-cache" +IMAGES=("argus-metric-ftp:latest" "argus-metric-prometheus:latest" "argus-metric-grafana:latest") +all_images_exist=true + +for image in "${IMAGES[@]}"; do + if ! docker images --format "{{.Repository}}:{{.Tag}}" | grep -q "^${image}$"; then + all_images_exist=false + break + fi +done + +if $all_images_exist; then + echo "所有镜像已存在,跳过构建" +else + echo "检测到缺失镜像,尝试从缓存加载..." + + # 尝试从缓存加载 + loaded_from_cache=false + if [ -d "$IMAGE_CACHE_DIR" ]; then + for image in "${IMAGES[@]}"; do + if ! docker images --format "{{.Repository}}:{{.Tag}}" | grep -q "^${image}$"; then + # 镜像不存在,尝试加载 + case "$image" in + "argus-metric-ftp:latest") + cache_file="${IMAGE_CACHE_DIR}/argus-ftp.tar" + ;; + "argus-metric-prometheus:latest") + cache_file="${IMAGE_CACHE_DIR}/argus-prometheus.tar" + ;; + "argus-metric-grafana:latest") + cache_file="${IMAGE_CACHE_DIR}/argus-grafana.tar" + ;; + esac + + if [ -f "$cache_file" ]; then + echo " 从缓存加载: $image" + docker load -i "$cache_file" + loaded_from_cache=true + fi + fi + done + fi + + # 检查加载后是否还有缺失的镜像 + need_build=false + for image in "${IMAGES[@]}"; do + if ! docker images --format "{{.Repository}}:{{.Tag}}" | grep -q "^${image}$"; then + need_build=true + break + fi + done + + if $need_build; then + echo "" + echo "部分镜像缺失,开始构建..." + echo "工作目录: $(pwd)" + cd "$SCRIPT_DIR" + $DOCKER_COMPOSE build + + # 询问是否保存镜像 + echo "" + read -p "是否保存镜像到缓存以便下次快速启动? (Y/n): " -n 1 -r + echo + if [[ ! $REPLY =~ ^[Nn]$ ]]; then + mkdir -p "$IMAGE_CACHE_DIR" + echo "保存镜像到缓存..." + for image in "${IMAGES[@]}"; do + case "$image" in + "argus-metric-ftp:latest") + docker save -o "${IMAGE_CACHE_DIR}/argus-ftp.tar" "$image" && echo " 已保存: argus-ftp.tar" + ;; + "argus-metric-prometheus:latest") + docker save -o "${IMAGE_CACHE_DIR}/argus-prometheus.tar" "$image" && echo " 已保存: argus-prometheus.tar" + ;; + "argus-metric-grafana:latest") + docker save -o "${IMAGE_CACHE_DIR}/argus-grafana.tar" "$image" && echo " 已保存: argus-grafana.tar" + ;; + esac + done + echo "镜像已保存到: $IMAGE_CACHE_DIR/" + fi + elif $loaded_from_cache; then + echo "" + echo "所有镜像已从缓存加载完成!" + fi +fi + +echo "" +echo "3. 启动服务..." +cd "$SCRIPT_DIR" +$DOCKER_COMPOSE up -d + +echo "" +echo "4. 等待服务启动..." +sleep 5 + +echo "" +echo "5. 检查服务状态..." +cd "$SCRIPT_DIR" +$DOCKER_COMPOSE ps + +echo "" +echo "==========================================" +echo " 服务启动完成!" 
+echo "==========================================" +echo "" +echo "服务访问地址:" +echo " - FTP: ftp://localhost:${FTP_PORT:-21}" +echo " 用户名: ftpuser" +echo " 密码: ${FTP_PASSWORD:-ZGClab1234!}" +echo "" +echo " - Prometheus: http://localhost:${PROMETHEUS_PORT:-9090}" +echo "" +echo " - Grafana: http://localhost:${GRAFANA_PORT:-3000}" +echo " 用户名: admin" +echo " 密码: admin" +echo "" +echo "常用命令:" +echo " 查看日志: $DOCKER_COMPOSE logs -f [service]" +echo " 停止服务: $DOCKER_COMPOSE stop" +echo " 重启服务: $DOCKER_COMPOSE restart" +echo " 停止并删除: $DOCKER_COMPOSE down" +echo " 停止并删除卷: $DOCKER_COMPOSE down -v" +echo "" + diff --git a/src/metric/tests/stop-all.sh b/src/metric/tests/stop-all.sh new file mode 100755 index 0000000..6886160 --- /dev/null +++ b/src/metric/tests/stop-all.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# 停止所有服务脚本 + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +# 检测 docker-compose 命令(兼容新旧版本) +COMPOSE_FILE="$SCRIPT_DIR/docker-compose.yml" +if command -v docker-compose &> /dev/null; then + DOCKER_COMPOSE="docker-compose -f $COMPOSE_FILE" +elif docker compose version &> /dev/null 2>&1; then + DOCKER_COMPOSE="docker compose -f $COMPOSE_FILE" +else + echo "错误: 未找到 docker-compose 或 docker compose 命令" + exit 1 +fi + +echo "==========================================" +echo " 停止 Argus Metrics 服务" +echo "==========================================" +echo "" + +# 检查是否有运行的容器 +if [ "$($DOCKER_COMPOSE ps -q)" ]; then + echo "停止所有服务..." + $DOCKER_COMPOSE stop + + echo "" + read -p "是否要删除容器? (y/N): " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + $DOCKER_COMPOSE down + echo "容器已删除" + + read -p "是否要删除数据卷? (y/N): " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + $DOCKER_COMPOSE down -v + echo "数据卷已删除" + fi + fi +else + echo "没有运行的服务" +fi + +echo "" +echo "完成!" 
+ diff --git a/src/metric/tests/test_grafana_dashboard.json b/src/metric/tests/test_grafana_dashboard.json new file mode 100644 index 0000000..4a09e80 --- /dev/null +++ b/src/metric/tests/test_grafana_dashboard.json @@ -0,0 +1,629 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 9, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Load", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 101, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "expr": "node_load1{hostname=\"$hostname\"}", + "legendFormat": "{{hostname}} load1", + "refId": "A" + }, + { + "expr": "node_load5{hostname=\"$hostname\"}", + "legendFormat": "{{hostname}} load5", + "refId": "B" + }, + { + "expr": "node_load15{hostname=\"$hostname\"}", + "legendFormat": "{{hostname}} load15", + "refId": "C" + } + ], + "title": "System Load", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "100 * (1 - avg by(hostname) (irate(node_cpu_seconds_total{mode=\"idle\",hostname=\"$hostname\"}[5m])))", + 
"legendFormat": "{{hostname}}", + "refId": "A" + } + ], + "title": "CPU Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "%", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 4, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "100 * (1 - (node_memory_MemAvailable_bytes{hostname=\"$hostname\"} / node_memory_MemTotal_bytes{hostname=\"$hostname\"}))", + "legendFormat": "{{hostname}}", + "refId": "B" + } + ], + "title": "Node Memory Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Bytes/s", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "expr": "sum by(hostname) (rate(node_disk_read_bytes_total{device!~\"^(loop|ram|sr0).*\",hostname=\"$hostname\"}[5m]))", + "legendFormat": "{{hostname}} read", + "refId": "A" + }, + { + "expr": "sum by(hostname) (rate(node_disk_written_bytes_total{device!~\"^(loop|ram|sr0).*\",hostname=\"$hostname\"}[5m]))", + "legendFormat": "{{hostname}} write", + "refId": "B" + } + ], + "title": "Node Disk I/O (Bytes/s)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Bytes/s", + "axisPlacement": 
"auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 102, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "expr": "sum by(hostname)(rate(node_network_receive_bytes_total{device!~\"^(lo|docker.*)\",hostname=\"$hostname\"}[5m]))", + "legendFormat": "{{hostname}} RX", + "refId": "A" + }, + { + "expr": "sum by(hostname)(rate(node_network_transmit_bytes_total{device!~\"^(lo|docker.*)\",hostname=\"$hostname\"}[5m]))", + "legendFormat": "{{hostname}} TX", + "refId": "B" + } + ], + "title": "Network Traffic", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Processes", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 4, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 200 + }, + { + "color": "red", + "value": 500 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 104, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "expr": "node_procs_running{hostname=\"$hostname\"}", + "legendFormat": "{{hostname}} Running", + "refId": "A" + }, + { + "expr": "node_procs_blocked{hostname=\"$hostname\"}", + "legendFormat": "{{hostname}} Blocked", + "refId": "B" + } + ], + "title": "Node Process Count", + "type": "timeseries" + } + ], + "refresh": "15s", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": "node-exporter-A1", + "value": "node-exporter-A1" + }, + "datasource": { + "type": "prometheus" + }, + "definition": "label_values(node_cpu_seconds_total,hostname)", + "hide": 0, + "includeAll": false, + "label": "hostname", + "multi": false, + "name": "hostname", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(node_cpu_seconds_total,hostname)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", 
+ "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-12h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Node and GPU Metrics", + "uid": "node_gpu_metrics", + "weekStart": "" +} \ No newline at end of file