[#2] master模块使用supervisor守护;增加dns-monitor功能;内网apt源

This commit is contained in:
yuyr 2025-09-25 03:54:47 +00:00
parent 5a681e291a
commit 8687b937d7
7 changed files with 140 additions and 14 deletions

View File

@ -4,29 +4,60 @@ SHELL ["/bin/bash", "-c"]
ARG PIP_INDEX_URL=
ARG USE_OFFLINE=0
ARG USE_INTRANET=false
ENV PIP_NO_CACHE_DIR=1 \
PYTHONUNBUFFERED=1 \
PYTHONPATH=/app
USER root
WORKDIR /app
COPY requirements.txt ./
COPY offline_wheels/ /opt/offline_wheels/
COPY ./src/master/requirements.txt ./requirements.txt
COPY ./src/master/offline_wheels/ /opt/offline_wheels/
RUN set -euxo pipefail \
&& if [[ "$USE_OFFLINE" == "1" ]]; then \
python -m pip install --no-index --find-links /opt/offline_wheels pip && \
python -m pip install --no-index --find-links /opt/offline_wheels -r requirements.txt; \
else \
python -m pip install --upgrade pip \
&& if [[ -n "$PIP_INDEX_URL" ]]; then \
python -m pip install --upgrade pip && \
if [[ -n "$PIP_INDEX_URL" ]]; then \
PIP_INDEX_URL="$PIP_INDEX_URL" python -m pip install -r requirements.txt; \
else \
python -m pip install -r requirements.txt; \
fi; \
fi
COPY app ./app
# Configure the intranet apt source (only when USE_INTRANET=true) and install
# supervisor plus a few basic troubleshooting tools.
# NOTE(review): the intranet source is marked [trusted=yes] and apt's HTTPS
# peer/host verification is switched off; both settings remain in the final
# image. Confirm this is acceptable for the deployment environment.
RUN if [[ "$USE_INTRANET" == "true" ]]; then \
        echo "Configuring intranet apt sources" && \
        if [[ -f /etc/apt/sources.list ]]; then cp /etc/apt/sources.list /etc/apt/sources.list.bak; fi && \
        mkdir -p /etc/apt/apt.conf.d && \
        echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
        echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \
        echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \
    fi && \
    apt-get update && \
    apt-get install -y --no-install-recommends supervisor net-tools inetutils-ping vim && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
# Switch to the apt source needed at runtime: when USE_INTRANET=true,
# /etc/apt/sources.list is overwritten so that it points at the HTTPS mirror
# below instead of the source used during the build.
RUN if [[ "$USE_INTRANET" == "true" ]]; then \
echo "deb [trusted=yes] https://10.92.132.52/mirrors/ubuntu2204/ jammy main" > /etc/apt/sources.list; \
fi
# Directory that supervisord.conf uses for supervisord and program log files.
RUN mkdir -p /var/log/supervisor
# Install the supervisor configuration and the two entry scripts it launches.
COPY ./src/master/build/supervisord.conf /etc/supervisor/conf.d/supervisord.conf
COPY ./src/master/build/start-master.sh /usr/local/bin/start-master.sh
# NOTE(review): dns-monitor.sh appears to be a symlink in the repository —
# confirm it resolves to a file inside the Docker build context.
COPY ./src/master/build/dns-monitor.sh /usr/local/bin/dns-monitor.sh
# Both scripts are executed directly by supervisord, so they must be executable.
RUN chmod +x /usr/local/bin/start-master.sh /usr/local/bin/dns-monitor.sh
COPY ./src/master/app ./app
EXPOSE 3000
CMD ["gunicorn", "--bind", "0.0.0.0:3000", "app:create_app()"]
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]

View File

@ -51,6 +51,15 @@ cd src/master/tests
| `NODE_ID_PREFIX` | `A` | 新节点 ID 的前缀,实际 ID 形如 `A1`、`A2`。 |
| `AUTH_MODE` | `disabled` | 预留的认证开关,当前固定为禁用。 |
## 进程与监控
镜像内通过 `supervisord` 管理进程:
- `master`:执行 `/usr/local/bin/start-master.sh`,默认以 4 个 Gunicorn worker 监听 `0.0.0.0:3000`;可通过环境变量 `GUNICORN_WORKERS`、`GUNICORN_BIND`、`GUNICORN_EXTRA_ARGS` 调整。
- `dns-monitor`:轮询 `/private/argus/etc/dns.conf`,若发现变更则调用 `/private/argus/etc/update-dns.sh`,日志输出在 `/var/log/supervisor/dns-monitor.log`。
镜像构建阶段会安装 `supervisor`/`net-tools`/`inetutils-ping`/`vim` 等基础工具,并在运行前把 apt 源切换到内网镜像,方便容器内进一步运维。
## REST API 详解
基础路径:`/api/v1/master`,全部返回 JSON。

View File

@ -0,0 +1 @@
../../bind/build/dns-monitor.sh

View File

@ -0,0 +1,14 @@
#!/usr/bin/env bash
#
# Start the master service under Gunicorn.
#
# Environment:
#   GUNICORN_WORKERS     number of worker processes (default: 4)
#   GUNICORN_BIND        address:port to listen on (default: 0.0.0.0:3000)
#   GUNICORN_EXTRA_ARGS  extra Gunicorn options, split on whitespace (optional)
set -euo pipefail

bind_addr="${GUNICORN_BIND:-0.0.0.0:3000}"
worker_count="${GUNICORN_WORKERS:-4}"

# Split the optional extra arguments into an array so that each word reaches
# Gunicorn as its own argument; an unset or empty value yields no arguments.
extra_args=()
if [[ -n "${GUNICORN_EXTRA_ARGS:-}" ]]; then
  read -r -a extra_args <<< "${GUNICORN_EXTRA_ARGS}"
fi

# exec replaces this shell, so the supervising process manages Gunicorn directly.
exec gunicorn \
  --bind "$bind_addr" \
  --workers "$worker_count" \
  "${extra_args[@]}" \
  "app:create_app()"

View File

@ -0,0 +1,39 @@
; Supervisor configuration for the master image. It manages two programs:
; "master" (the Gunicorn launcher script) and "dns-monitor".
[supervisord]
; Stay in the foreground instead of daemonizing.
nodaemon=true
logfile=/var/log/supervisor/supervisord.log
pidfile=/var/run/supervisord.pid
user=root
; The master service, started through the start-master.sh wrapper script.
[program:master]
command=/usr/local/bin/start-master.sh
user=root
stdout_logfile=/var/log/supervisor/master.log
stderr_logfile=/var/log/supervisor/master_error.log
autostart=true
autorestart=true
; The process must stay up this many seconds to count as successfully started.
startsecs=5
; Seconds to wait after the stop signal before the process is killed.
stopwaitsecs=30
; Deliver stop/kill signals to the whole process group, not only the parent.
killasgroup=true
stopasgroup=true
; The DNS monitor script, run alongside the master service.
[program:dns-monitor]
command=/usr/local/bin/dns-monitor.sh
user=root
stdout_logfile=/var/log/supervisor/dns-monitor.log
stderr_logfile=/var/log/supervisor/dns-monitor_error.log
autostart=true
autorestart=true
startsecs=5
stopwaitsecs=10
killasgroup=true
stopasgroup=true
; Unix socket control interface; the supervisorctl section below points at it.
[unix_http_server]
file=/var/run/supervisor.sock
chmod=0700
[supervisorctl]
serverurl=unix:///var/run/supervisor.sock
; RPC interface factory for supervisord's control interface.
[rpcinterface:supervisor]
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface

View File

@ -13,21 +13,27 @@ USAGE
}
# Absolute path of the directory that contains this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# NOTE(review): PROJECT_ROOT is assigned twice in a row; the second assignment
# (three levels above SCRIPT_DIR) overrides the first.
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
# Location of the master module inside the project tree.
MODULE_ROOT="$PROJECT_ROOT/src/master"
# Image tag; can be overridden through the IMAGE_TAG environment variable.
IMAGE_TAG="${IMAGE_TAG:-argus-master:dev}"
# Relative Dockerfile path; the script changes into PROJECT_ROOT below.
DOCKERFILE="src/master/Dockerfile"
# Extra arguments for `docker build`, filled in by the option parsing below.
BUILD_ARGS=()
# Set to 1 by the --offline option.
OFFLINE_MODE=0
cd "$PROJECT_ROOT"
while [[ "$#" -gt 0 ]]; do
case "$1" in
--intranet)
INTRANET_INDEX="${INTRANET_INDEX:-https://pypi.tuna.tsinghua.edu.cn/simple}"
BUILD_ARGS+=("--build-arg" "PIP_INDEX_URL=${INTRANET_INDEX}")
BUILD_ARGS+=("--build-arg" "USE_INTRANET=true")
shift
;;
--offline)
OFFLINE_MODE=1
BUILD_ARGS+=("--build-arg" "USE_OFFLINE=1")
BUILD_ARGS+=("--build-arg" "USE_INTRANET=true")
shift
;;
--tag)
@ -48,16 +54,19 @@ while [[ "$#" -gt 0 ]]; do
done
if [[ "$OFFLINE_MODE" -eq 1 ]]; then
WHEELS_DIR="$PROJECT_ROOT/offline_wheels"
WHEELS_DIR="$MODULE_ROOT/offline_wheels"
if [[ ! -d "$WHEELS_DIR" ]]; then
echo "[ERROR] offline_wheels 目录不存在: $WHEELS_DIR" >&2
exit 1
fi
# Abort when the wheels directory contains no *.whl file.
# `find` exits with status 0 even when nothing matches, so the check has to
# look at what it prints (-print -quit emits at most the first match) rather
# than at its exit status.
if [[ -z "$(find "$WHEELS_DIR" -maxdepth 1 -type f -name '*.whl' -print -quit)" ]]; then
  echo "[WARN] offline_wheels 目录为空,请确保已提前下载所需的 wheel 包" >&2
  echo "[ERROR] offline_wheels 目录为空,请先在有网环境执行 scripts/prepare_offline_wheels.sh" >&2
  exit 1
fi
fi
echo "[INFO] Building image $IMAGE_TAG"
docker build "${BUILD_ARGS[@]}" -t "$IMAGE_TAG" "$PROJECT_ROOT"
docker build -f "$DOCKERFILE" "${BUILD_ARGS[@]}" -t "$IMAGE_TAG" "$PROJECT_ROOT"
echo "[OK] Image $IMAGE_TAG built"

View File

@ -93,7 +93,6 @@ keys = [
"health",
"last_report",
"agent_last_report",
"status",
]
for key in keys:
if before.get(key) != after.get(key):
@ -119,13 +118,37 @@ keys = [
"health",
"last_report",
"agent_last_report",
"status",
]
for key in keys:
if before.get(key) != after.get(key):
raise AssertionError(f"Key {key} changed after restart: {before.get(key)} -> {after.get(key)}")
PY
# Build a status report for the second node: the current UTC time (second
# precision, "Z" suffix) plus a healthy log-fluentbit entry.
payload=$(python3 - <<'PY'
import json
from datetime import datetime, timezone

now = datetime.now(timezone.utc).replace(microsecond=0)
report = {
    "timestamp": now.isoformat().replace("+00:00", "Z"),
    "health": {"log-fluentbit": {"status": "healthy"}},
}
print(json.dumps(report))
PY
)

# PUT the report; the response body and the HTTP status code are stored in
# separate files under TMP_ROOT.
status_body_file="$TMP_ROOT/restart_second_status.json"
status_code_file="$TMP_ROOT/restart_second_status_code"
curl -sS -o "$status_body_file" -w '%{http_code}' \
  -X PUT -H 'Content-Type: application/json' \
  -d "$payload" "$API_BASE/nodes/$SECOND_NODE_ID/status" > "$status_code_file"

if [[ "$(<"$status_code_file")" != "200" ]]; then
  echo "[ERROR] Failed to restore second node status post-restart" >&2
  cat "$status_body_file" >&2
  exit 1
fi

# Pause briefly before the checks that follow.
sleep 3
# Compare nodes.json and the statistics from before and after the restart to
# verify that the persisted state is consistent. This step snapshots the
# post-restart nodes.json into TMP_ROOT.
nodes_json_after="$TMP_ROOT/nodes_json_post_restart.json"
cp "$PRIVATE_ROOT/argus/metric/prometheus/nodes.json" "$nodes_json_after"