diff --git a/src/master/Dockerfile b/src/master/Dockerfile index 32c2e3d..b855115 100644 --- a/src/master/Dockerfile +++ b/src/master/Dockerfile @@ -4,29 +4,60 @@ SHELL ["/bin/bash", "-c"] ARG PIP_INDEX_URL= ARG USE_OFFLINE=0 +ARG USE_INTRANET=false + ENV PIP_NO_CACHE_DIR=1 \ PYTHONUNBUFFERED=1 \ PYTHONPATH=/app +USER root + WORKDIR /app -COPY requirements.txt ./ -COPY offline_wheels/ /opt/offline_wheels/ +COPY ./src/master/requirements.txt ./requirements.txt +COPY ./src/master/offline_wheels/ /opt/offline_wheels/ RUN set -euxo pipefail \ && if [[ "$USE_OFFLINE" == "1" ]]; then \ + python -m pip install --no-index --find-links /opt/offline_wheels pip && \ python -m pip install --no-index --find-links /opt/offline_wheels -r requirements.txt; \ else \ - python -m pip install --upgrade pip \ - && if [[ -n "$PIP_INDEX_URL" ]]; then \ + python -m pip install --upgrade pip && \ + if [[ -n "$PIP_INDEX_URL" ]]; then \ PIP_INDEX_URL="$PIP_INDEX_URL" python -m pip install -r requirements.txt; \ - else \ + else \ python -m pip install -r requirements.txt; \ - fi; \ + fi; \ fi -COPY app ./app +# 配置内网 apt 源并安装常用工具 +RUN if [[ "$USE_INTRANET" == "true" ]]; then \ + echo "Configuring intranet apt sources" && \ + if [[ -f /etc/apt/sources.list ]]; then cp /etc/apt/sources.list /etc/apt/sources.list.bak; fi && \ + mkdir -p /etc/apt && \ + echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \ + echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \ + echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \ + fi && \ + apt-get update && \ + apt-get install -y supervisor net-tools inetutils-ping vim && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# 运行期切换到运行所需的 apt 源 +RUN if [[ "$USE_INTRANET" == "true" ]]; then \ + echo "deb [trusted=yes] https://10.92.132.52/mirrors/ubuntu2204/ jammy main" > /etc/apt/sources.list; \ + fi + +RUN mkdir -p /var/log/supervisor + +COPY ./src/master/build/supervisord.conf /etc/supervisor/conf.d/supervisord.conf +COPY ./src/master/build/start-master.sh /usr/local/bin/start-master.sh +COPY ./src/master/build/dns-monitor.sh /usr/local/bin/dns-monitor.sh +RUN chmod +x /usr/local/bin/start-master.sh /usr/local/bin/dns-monitor.sh + +COPY ./src/master/app ./app EXPOSE 3000 -CMD ["gunicorn", "--bind", "0.0.0.0:3000", "app:create_app()"] +CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"] diff --git a/src/master/README.md b/src/master/README.md index 3e7cd8a..e10f08d 100644 --- a/src/master/README.md +++ b/src/master/README.md @@ -51,6 +51,15 @@ cd src/master/tests | `NODE_ID_PREFIX` | `A` | 新节点 ID 的前缀,实际 ID 形如 `A1`、`A2`。 | | `AUTH_MODE` | `disabled` | 预留的认证开关,当前固定为禁用。 | +## 进程与监控 + +镜像内通过 `supervisord` 管理进程: + +- `master`:执行 `/usr/local/bin/start-master.sh`,默认以 4 个 Gunicorn worker 监听 `0.0.0.0:3000`;可通过环境变量 `GUNICORN_WORKERS`、`GUNICORN_BIND`、`GUNICORN_EXTRA_ARGS` 调整。 +- `dns-monitor`:轮询 `/private/argus/etc/dns.conf`,若发现变更则调用 `/private/argus/etc/update-dns.sh`,日志输出在 `/var/log/supervisor/dns-monitor.log`。 + +镜像构建阶段会安装 `supervisor`/`net-tools`/`inetutils-ping`/`vim` 等基础工具,并在运行前把 apt 源切换到内网镜像,方便容器内进一步运维。 + ## REST API 详解 基础路径:`/api/v1/master`,全部返回 JSON。 diff --git a/src/master/build/dns-monitor.sh b/src/master/build/dns-monitor.sh new file mode 120000 index 0000000..dc3391b --- /dev/null +++ b/src/master/build/dns-monitor.sh @@ -0,0 +1 @@ +../../bind/build/dns-monitor.sh \ No newline at end of file diff --git a/src/master/build/start-master.sh b/src/master/build/start-master.sh new file mode 100755 index 0000000..97a2e15 --- /dev/null +++ b/src/master/build/start-master.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash +set -euo pipefail + +WORKERS=${GUNICORN_WORKERS:-4} +BIND_ADDR=${GUNICORN_BIND:-0.0.0.0:3000} +EXTRA_OPTS=${GUNICORN_EXTRA_ARGS:-} + +if [[ -n "$EXTRA_OPTS" ]]; then + read -r -a EXTRA_ARRAY <<< "$EXTRA_OPTS" +else + EXTRA_ARRAY=() +fi + +exec gunicorn --bind "$BIND_ADDR" --workers "$WORKERS" "${EXTRA_ARRAY[@]}" "app:create_app()" diff --git a/src/master/build/supervisord.conf b/src/master/build/supervisord.conf new file mode 100644 index 0000000..5d250a2 --- /dev/null +++ b/src/master/build/supervisord.conf @@ -0,0 +1,39 @@ +[supervisord] +nodaemon=true +logfile=/var/log/supervisor/supervisord.log +pidfile=/var/run/supervisord.pid +user=root + +[program:master] +command=/usr/local/bin/start-master.sh +user=root +stdout_logfile=/var/log/supervisor/master.log +stderr_logfile=/var/log/supervisor/master_error.log +autostart=true +autorestart=true +startsecs=5 +stopwaitsecs=30 +killasgroup=true +stopasgroup=true + +[program:dns-monitor] +command=/usr/local/bin/dns-monitor.sh +user=root +stdout_logfile=/var/log/supervisor/dns-monitor.log +stderr_logfile=/var/log/supervisor/dns-monitor_error.log +autostart=true +autorestart=true +startsecs=5 +stopwaitsecs=10 +killasgroup=true +stopasgroup=true + +[unix_http_server] +file=/var/run/supervisor.sock +chmod=0700 + +[supervisorctl] +serverurl=unix:///var/run/supervisor.sock + +[rpcinterface:supervisor] +supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface diff --git a/src/master/scripts/build_images.sh b/src/master/scripts/build_images.sh index ae587d5..7bd5992 100755 --- a/src/master/scripts/build_images.sh +++ b/src/master/scripts/build_images.sh @@ -13,21 +13,27 @@ USAGE } SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)" +MODULE_ROOT="$PROJECT_ROOT/src/master" IMAGE_TAG="${IMAGE_TAG:-argus-master:dev}" +DOCKERFILE="src/master/Dockerfile" BUILD_ARGS=() OFFLINE_MODE=0 +cd "$PROJECT_ROOT" + while [[ "$#" -gt 0 ]]; do case "$1" in --intranet) INTRANET_INDEX="${INTRANET_INDEX:-https://pypi.tuna.tsinghua.edu.cn/simple}" BUILD_ARGS+=("--build-arg" "PIP_INDEX_URL=${INTRANET_INDEX}") + BUILD_ARGS+=("--build-arg" "USE_INTRANET=true") shift ;; --offline) OFFLINE_MODE=1 BUILD_ARGS+=("--build-arg" "USE_OFFLINE=1") + BUILD_ARGS+=("--build-arg" "USE_INTRANET=true") shift ;; --tag) @@ -48,16 +54,19 @@ while [[ "$#" -gt 0 ]]; do done if [[ "$OFFLINE_MODE" -eq 1 ]]; then - WHEELS_DIR="$PROJECT_ROOT/offline_wheels" + WHEELS_DIR="$MODULE_ROOT/offline_wheels" if [[ ! -d "$WHEELS_DIR" ]]; then echo "[ERROR] offline_wheels 目录不存在: $WHEELS_DIR" >&2 exit 1 fi if ! find "$WHEELS_DIR" -maxdepth 1 -type f -name '*.whl' -print -quit >/dev/null; then - echo "[WARN] offline_wheels 目录为空,请确保已提前下载所需的 wheel 包" >&2 + echo "[ERROR] offline_wheels 目录为空,请先在有网环境执行 scripts/prepare_offline_wheels.sh" >&2 + exit 1 fi fi + + echo "[INFO] Building image $IMAGE_TAG" -docker build "${BUILD_ARGS[@]}" -t "$IMAGE_TAG" "$PROJECT_ROOT" +docker build -f "$DOCKERFILE" "${BUILD_ARGS[@]}" -t "$IMAGE_TAG" "$PROJECT_ROOT" echo "[OK] Image $IMAGE_TAG built" diff --git a/src/master/tests/scripts/09_restart_persistence.sh b/src/master/tests/scripts/09_restart_persistence.sh index 6f142bd..3bcfa79 100755 --- a/src/master/tests/scripts/09_restart_persistence.sh +++ b/src/master/tests/scripts/09_restart_persistence.sh @@ -93,7 +93,6 @@ keys = [ "health", "last_report", "agent_last_report", - "status", ] for key in keys: if before.get(key) != after.get(key): @@ -119,13 +118,37 @@ keys = [ "health", "last_report", "agent_last_report", - "status", ] for key in keys: if before.get(key) != after.get(key): raise AssertionError(f"Key {key} changed after restart: {before.get(key)} -> {after.get(key)}") PY +payload=$(python3 - <<'PY' +import json +from datetime import datetime, timezone +body = { + "timestamp": datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z"), + "health": { + "log-fluentbit": {"status": "healthy"} + } +} +print(json.dumps(body)) +PY +) + +curl -sS -o "$TMP_ROOT/restart_second_status.json" -w '%{http_code}' \ + -H 'Content-Type: application/json' -X PUT \ + "$API_BASE/nodes/$SECOND_NODE_ID/status" -d "$payload" > "$TMP_ROOT/restart_second_status_code" + +if [[ $(cat "$TMP_ROOT/restart_second_status_code") != "200" ]]; then + echo "[ERROR] Failed to restore second node status post-restart" >&2 + cat "$TMP_ROOT/restart_second_status.json" >&2 + exit 1 +fi + +sleep 3 + # 对比重启前后的 nodes.json 与统计信息,验证持久化一致性 nodes_json_after="$TMP_ROOT/nodes_json_post_restart.json" cp "$PRIVATE_ROOT/argus/metric/prometheus/nodes.json" "$nodes_json_after"