Compare commits
10 Commits
fb6a7ec9c8
...
33a4c6564e
| Author | SHA1 | Date | |
|---|---|---|---|
| 33a4c6564e | |||
| 68b8461ea0 | |||
| 22f40aaafc | |||
| 83b53dac7f | |||
| 7de75e2bf2 | |||
| bb28c71e1d | |||
| bedf4ade9d | |||
| 2c1586d36d | |||
| 94b024cc69 | |||
| 8fbe107ac9 |
@ -10,6 +10,7 @@ Usage: $0 [OPTIONS]
|
||||
Options:
|
||||
--intranet Use intranet mirror for log/bind builds
|
||||
--master-offline Build master offline image (requires src/master/offline_wheels.tar.gz)
|
||||
--no-cache Build all images without using Docker layer cache
|
||||
-h, --help Show this help message
|
||||
|
||||
Examples:
|
||||
@ -23,6 +24,7 @@ EOF
|
||||
use_intranet=false
|
||||
build_master=true
|
||||
build_master_offline=false
|
||||
no_cache=false
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
@ -39,6 +41,10 @@ while [[ $# -gt 0 ]]; do
|
||||
build_master_offline=true
|
||||
shift
|
||||
;;
|
||||
--no-cache)
|
||||
no_cache=true
|
||||
shift
|
||||
;;
|
||||
-h|--help)
|
||||
show_help
|
||||
exit 0
|
||||
@ -65,6 +71,10 @@ cd "$root"
|
||||
load_build_user
|
||||
build_args+=("--build-arg" "ARGUS_BUILD_UID=${ARGUS_BUILD_UID}" "--build-arg" "ARGUS_BUILD_GID=${ARGUS_BUILD_GID}")
|
||||
|
||||
if [[ "$no_cache" == true ]]; then
|
||||
build_args+=("--no-cache")
|
||||
fi
|
||||
|
||||
master_root="$root/src/master"
|
||||
master_offline_tar="$master_root/offline_wheels.tar.gz"
|
||||
master_offline_dir="$master_root/offline_wheels"
|
||||
@ -159,6 +169,9 @@ if [[ "$build_master" == true ]]; then
|
||||
if [[ "$build_master_offline" == true ]]; then
|
||||
master_args+=("--offline")
|
||||
fi
|
||||
if [[ "$no_cache" == true ]]; then
|
||||
master_args+=("--no-cache")
|
||||
fi
|
||||
if ./scripts/build_images.sh "${master_args[@]}"; then
|
||||
if [[ "$build_master_offline" == true ]]; then
|
||||
images_built+=("argus-master:offline")
|
||||
|
||||
@ -34,6 +34,18 @@ Agent 不再依赖配置文件;所有参数均由环境变量与主机名推
|
||||
| `MASTER_ENDPOINT` | 是 | N/A | Master 基础地址,可写 `http://host:3000` 或 `host:3000`(自动补全 `http://`)。 |
|
||||
| `REPORT_INTERVAL_SECONDS` | 否 | `60` | 状态上报间隔(秒)。必须为正整数。 |
|
||||
| `AGENT_HOSTNAME` | 否 | `$(hostname)` | 覆盖容器内主机名,便于测试或特殊命名需求。 |
|
||||
| `AGENT_ENV` | 否 | 来源于主机名 | 运行环境标识(如 `dev`、`prod`)。与 `AGENT_USER`、`AGENT_INSTANCE` 必须同时设置。 |
|
||||
| `AGENT_USER` | 否 | 来源于主机名 | 归属用户或团队标识。与 `AGENT_ENV`、`AGENT_INSTANCE` 必须同时设置。 |
|
||||
| `AGENT_INSTANCE` | 否 | 来源于主机名 | 实例编号或别名。与 `AGENT_ENV`、`AGENT_USER` 必须同时设置。 |
|
||||
|
||||
主机名与元数据的解析优先级:
|
||||
|
||||
1. 若 `AGENT_ENV` / `AGENT_USER` / `AGENT_INSTANCE` 三者全部设置且非空,则直接使用这些值;若只设置了其中一部分,Agent 会记录一条告警日志并忽略这些变量,继续按后续步骤解析。
|
||||
2. 否则检查历史 `node.json`(注册成功后由 Master 返回的信息),若包含 `env` / `user` / `instance` 则沿用。
|
||||
3. 若以上均不可用,则按历史约定从主机名解析 `env-user-instance` 前缀。
|
||||
4. 如果仍无法得到完整结果,Agent 启动会失败并提示需要提供上述环境变量。
|
||||
|
||||
> 提示:在首次部署时需确保环境变量或主机名能够提供完整信息。完成注册后,Agent 会把 Master 返回的元数据写入 `node.json`,后续重启无需再次提供环境变量就能保持一致性。
|
||||
|
||||
派生路径:
|
||||
|
||||
|
||||
@ -18,13 +18,12 @@ _HOSTNAME_PATTERN = re.compile(r"^([^-]+)-([^-]+)-([^-]+)-.*$")
|
||||
def collect_metadata(config: AgentConfig) -> Dict[str, Any]:
|
||||
"""汇总节点注册需要的静态信息。"""
|
||||
hostname = config.hostname
|
||||
env, user, instance = _parse_hostname(hostname)
|
||||
meta = {
|
||||
"hostname": hostname,
|
||||
"ip": _detect_ip_address(),
|
||||
"env": env,
|
||||
"user": user,
|
||||
"instance": instance,
|
||||
"env": config.environment,
|
||||
"user": config.user,
|
||||
"instance": config.instance,
|
||||
"cpu_number": _detect_cpu_count(),
|
||||
"memory_in_bytes": _detect_memory_bytes(),
|
||||
"gpu_number": _detect_gpu_count(),
|
||||
|
||||
@ -6,14 +6,21 @@ from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Final
|
||||
|
||||
from .state import load_node_state
|
||||
from .version import VERSION
|
||||
from .log import get_logger
|
||||
|
||||
DEFAULT_REPORT_INTERVAL_SECONDS: Final[int] = 60
|
||||
|
||||
LOGGER = get_logger("argus.agent.config")
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class AgentConfig:
|
||||
hostname: str
|
||||
environment: str
|
||||
user: str
|
||||
instance: str
|
||||
node_file: str
|
||||
version: str
|
||||
master_endpoint: str
|
||||
@ -47,11 +54,68 @@ def _resolve_hostname() -> str:
|
||||
return os.environ.get("AGENT_HOSTNAME") or socket.gethostname()
|
||||
|
||||
|
||||
def _load_metadata_from_state(node_file: str) -> tuple[str, str, str] | None:
    """Return ``(env, user, instance)`` persisted in ``node_file``, if complete.

    The state is read through ``load_node_state``. Each field is looked up in
    the nested ``meta_data`` mapping first and at the top level of the state
    second.

    Args:
        node_file: Path of the persisted node state file.

    Returns:
        The three metadata values when all of them are present and non-empty,
        otherwise ``None`` (no state, or at least one field missing).
    """
    state = load_node_state(node_file)
    if not state:
        return None

    meta = state.get("meta_data") or {}
    # A malformed ``meta_data`` (for example a string or a list) has no
    # ``.get``; treat it as empty so the top-level keys are still consulted
    # and the file is ignored with a warning instead of raising AttributeError.
    nested = meta if isinstance(meta, dict) else {}

    env, user, instance = (
        nested.get(key) or state.get(key) for key in ("env", "user", "instance")
    )

    if env and user and instance:
        LOGGER.debug("Metadata resolved from node state", extra={"node_file": node_file})
        return env, user, instance

    LOGGER.warning(
        "node.json missing metadata fields; ignoring",
        extra={"node_file": node_file, "meta_data": meta},
    )
    return None
|
||||
|
||||
|
||||
def _resolve_metadata_fields(hostname: str, node_file: str) -> tuple[str, str, str]:
    """Work out the ``(env, user, instance)`` triple for this agent.

    Sources are tried in order:

    1. ``AGENT_ENV`` / ``AGENT_USER`` / ``AGENT_INSTANCE``, used only when all
       three are non-empty; a partial set is logged and ignored.
    2. Metadata persisted in ``node_file``.
    3. The hostname, parsed by ``_parse_hostname``.

    Args:
        hostname: Host name handed to ``_parse_hostname`` as the last resort.
        node_file: Path of the persisted node state file.

    Returns:
        The resolved ``(env, user, instance)`` values.

    Raises:
        ValueError: If none of the sources yields all three fields.
    """
    from_env = tuple(
        os.environ.get(name) for name in ("AGENT_ENV", "AGENT_USER", "AGENT_INSTANCE")
    )
    if all(from_env):
        return from_env

    if any(from_env):
        presence = dict(
            zip(("has_env", "has_user", "has_instance"), map(bool, from_env))
        )
        LOGGER.warning(
            "Incomplete metadata environment variables; falling back to persisted metadata",
            extra=presence,
        )

    persisted = _load_metadata_from_state(node_file)
    if persisted is not None:
        return persisted

    # Imported here rather than at module level to avoid a circular dependency.
    from .collector import _parse_hostname

    env, user, instance = _parse_hostname(hostname)
    if env and user and instance:
        return env, user, instance

    raise ValueError(
        "Failed to determine metadata fields; set AGENT_ENV/USER/INSTANCE or use supported hostname pattern"
    )
|
||||
|
||||
|
||||
def load_config() -> AgentConfig:
|
||||
"""从环境变量推导配置,移除了外部配置文件依赖。"""
|
||||
|
||||
hostname = _resolve_hostname()
|
||||
node_file = f"/private/argus/agent/{hostname}/node.json"
|
||||
environment, user, instance = _resolve_metadata_fields(hostname, node_file)
|
||||
|
||||
health_dir = f"/private/argus/agent/{hostname}/health/"
|
||||
|
||||
master_endpoint_env = os.environ.get("MASTER_ENDPOINT")
|
||||
@ -66,6 +130,9 @@ def load_config() -> AgentConfig:
|
||||
|
||||
return AgentConfig(
|
||||
hostname=hostname,
|
||||
environment=environment,
|
||||
user=user,
|
||||
instance=instance,
|
||||
node_file=node_file,
|
||||
version=VERSION,
|
||||
master_endpoint=master_endpoint,
|
||||
|
||||
BIN
src/agent/dist/argus-agent
vendored
0
src/agent/tests/__init__.py
Normal file
@ -60,6 +60,36 @@ services:
|
||||
ipv4_address: 172.28.0.20
|
||||
restart: always
|
||||
|
||||
agent_env:
|
||||
image: ubuntu:22.04
|
||||
container_name: argus-agent-env-e2e
|
||||
hostname: host_abc
|
||||
depends_on:
|
||||
- master
|
||||
- bind
|
||||
environment:
|
||||
- MASTER_ENDPOINT=http://master.argus.com:3000
|
||||
- REPORT_INTERVAL_SECONDS=2
|
||||
- AGENT_ENV=prod
|
||||
- AGENT_USER=ml
|
||||
- AGENT_INSTANCE=node-3
|
||||
- AGENT_HOSTNAME=host_abc
|
||||
- "ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}"
|
||||
- "ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}"
|
||||
volumes:
|
||||
- ./private/argus/agent/host_abc:/private/argus/agent/host_abc
|
||||
- ./private/argus/agent/host_abc/health:/private/argus/agent/host_abc/health
|
||||
- ./private/argus/etc:/private/argus/etc
|
||||
- ../dist/argus-agent:/usr/local/bin/argus-agent:ro
|
||||
- ./scripts/agent_entrypoint.sh:/usr/local/bin/agent-entrypoint.sh:ro
|
||||
- ../scripts/agent_deployment_verify.sh:/usr/local/bin/agent_deployment_verify.sh:ro
|
||||
entrypoint:
|
||||
- /usr/local/bin/agent-entrypoint.sh
|
||||
networks:
|
||||
default:
|
||||
ipv4_address: 172.28.0.21
|
||||
restart: always
|
||||
|
||||
networks:
|
||||
default:
|
||||
driver: bridge
|
||||
|
||||
@ -7,10 +7,10 @@ SCRIPTS=(
|
||||
"02_up.sh"
|
||||
"03_wait_and_assert_registration.sh"
|
||||
"04_write_health_files.sh"
|
||||
"08_verify_agent.sh"
|
||||
"05_assert_status_on_master.sh"
|
||||
"06_restart_agent_and_reregister.sh"
|
||||
"07_down.sh"
|
||||
"05_verify_agent.sh"
|
||||
"06_assert_status_on_master.sh"
|
||||
"07_restart_agent_and_reregister.sh"
|
||||
"08_down.sh"
|
||||
)
|
||||
|
||||
for script in "${SCRIPTS[@]}"; do
|
||||
|
||||
@ -41,7 +41,7 @@ compose() {
|
||||
fi
|
||||
}
|
||||
|
||||
docker container rm -f argus-agent-e2e argus-master-agent-e2e argus-bind-agent-e2e >/dev/null 2>&1 || true
|
||||
docker container rm -f argus-agent-e2e argus-agent-env-e2e argus-master-agent-e2e argus-bind-agent-e2e >/dev/null 2>&1 || true
|
||||
|
||||
docker network rm tests_default >/dev/null 2>&1 || true
|
||||
|
||||
|
||||
@ -6,11 +6,14 @@ TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
TMP_ROOT="$TEST_ROOT/tmp"
|
||||
API_BASE="http://localhost:32300/api/v1/master"
|
||||
AGENT_HOSTNAME="dev-e2euser-e2einst-pod-0"
|
||||
ENV_AGENT_HOSTNAME="host_abc"
|
||||
NODE_FILE="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME/node.json"
|
||||
ENV_NODE_FILE="$TEST_ROOT/private/argus/agent/$ENV_AGENT_HOSTNAME/node.json"
|
||||
|
||||
mkdir -p "$TMP_ROOT"
|
||||
|
||||
node_id=""
|
||||
primary_node_id=""
|
||||
env_node_id=""
|
||||
for _ in {1..30}; do
|
||||
sleep 2
|
||||
response=$(curl -sS "$API_BASE/nodes" || true)
|
||||
@ -19,24 +22,49 @@ for _ in {1..30}; do
|
||||
fi
|
||||
list_file="$TMP_ROOT/nodes_list.json"
|
||||
echo "$response" > "$list_file"
|
||||
node_id=$(python3 - "$list_file" <<'PY'
|
||||
readarray -t node_ids < <(python3 - "$list_file" "$AGENT_HOSTNAME" "$ENV_AGENT_HOSTNAME" <<'PY'
|
||||
import json, sys
|
||||
|
||||
with open(sys.argv[1]) as handle:
|
||||
nodes = json.load(handle)
|
||||
print(nodes[0]["id"] if nodes else "")
|
||||
|
||||
target_primary = sys.argv[2]
|
||||
target_env = sys.argv[3]
|
||||
|
||||
primary_id = ""
|
||||
env_id = ""
|
||||
|
||||
for node in nodes:
|
||||
if node.get("name") == target_primary:
|
||||
primary_id = node.get("id", "")
|
||||
if node.get("name") == target_env:
|
||||
env_id = node.get("id", "")
|
||||
|
||||
print(primary_id)
|
||||
print(env_id)
|
||||
PY
|
||||
)
|
||||
if [[ -n "$node_id" ]]; then
|
||||
)
|
||||
|
||||
primary_node_id="${node_ids[0]}"
|
||||
env_node_id="${node_ids[1]}"
|
||||
|
||||
if [[ -n "$primary_node_id" && -n "$env_node_id" ]]; then
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
if [[ -z "$node_id" ]]; then
|
||||
echo "[ERROR] Agent did not register within timeout" >&2
|
||||
if [[ -z "$primary_node_id" ]]; then
|
||||
echo "[ERROR] Primary agent did not register within timeout" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "$node_id" > "$TMP_ROOT/node_id"
|
||||
if [[ -z "$env_node_id" ]]; then
|
||||
echo "[ERROR] Env-variable agent did not register within timeout" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "$primary_node_id" > "$TMP_ROOT/node_id"
|
||||
echo "$env_node_id" > "$TMP_ROOT/node_id_host_abc"
|
||||
|
||||
if [[ ! -f "$NODE_FILE" ]]; then
|
||||
echo "[ERROR] node.json not created at $NODE_FILE" >&2
|
||||
@ -50,8 +78,20 @@ with open(sys.argv[1]) as handle:
|
||||
assert "id" in node and node["id"], "node.json missing id"
|
||||
PY
|
||||
|
||||
if [[ ! -f "$ENV_NODE_FILE" ]]; then
|
||||
echo "[ERROR] node.json not created at $ENV_NODE_FILE" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
python3 - "$ENV_NODE_FILE" <<'PY'
|
||||
import json, sys
|
||||
with open(sys.argv[1]) as handle:
|
||||
node = json.load(handle)
|
||||
assert "id" in node and node["id"], "env agent node.json missing id"
|
||||
PY
|
||||
|
||||
detail_file="$TMP_ROOT/initial_detail.json"
|
||||
curl -sS "$API_BASE/nodes/$node_id" -o "$detail_file"
|
||||
curl -sS "$API_BASE/nodes/$primary_node_id" -o "$detail_file"
|
||||
python3 - "$detail_file" "$TMP_ROOT/initial_ip" <<'PY'
|
||||
import json, sys, pathlib
|
||||
with open(sys.argv[1]) as handle:
|
||||
@ -62,4 +102,5 @@ if not ip:
|
||||
pathlib.Path(sys.argv[2]).write_text(ip)
|
||||
PY
|
||||
|
||||
echo "[INFO] Agent registered with node id $node_id"
|
||||
echo "[INFO] Agent registered with node id $primary_node_id"
|
||||
echo "[INFO] Env-variable agent registered with node id $env_node_id"
|
||||
|
||||
60
src/agent/tests/scripts/05_verify_agent.sh
Executable file
@ -0,0 +1,60 @@
|
||||
#!/usr/bin/env bash
# E2E step: run agent_deployment_verify.sh inside the agent containers.
#
# The primary agent container must be running for verification to proceed
# (otherwise the step is skipped with exit status 0). The env-variable-driven
# agent container is verified as well when it is running.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
REPO_ROOT="$(cd "$TEST_ROOT/.." && pwd)"
VERIFY_SCRIPT="$REPO_ROOT/scripts/agent_deployment_verify.sh"
IN_CONTAINER_VERIFY="/usr/local/bin/agent_deployment_verify.sh"
PRIMARY_CONTAINER="argus-agent-e2e"
ENV_CONTAINER="argus-agent-env-e2e"
PRIMARY_HOSTNAME="dev-e2euser-e2einst-pod-0"
ENV_HOSTNAME="host_abc"

# Succeed when a container named exactly "$1" is currently running.
# The `docker ps` output is captured before matching: piping it straight into
# `grep -q` under `set -o pipefail` can report failure for a running container,
# because grep exits on the first match and docker may then die from SIGPIPE.
container_running() {
  local name="$1" running
  running="$(docker ps --format '{{.Names}}')"
  grep -Fxq -- "$name" <<<"$running"
}

# Make sure curl and jq exist inside container "$1"; "$2" is the label used in
# log messages. Installation is best-effort: a failure here is tolerated and
# left for the verification script itself to surface.
ensure_tools() {
  local container="$1" label="$2"

  if docker exec -i "$container" bash -lc 'command -v curl >/dev/null && command -v jq >/dev/null'; then
    echo "[INFO] curl/jq already installed in $label"
  else
    echo "[INFO] Installing curl/jq in $label"
    docker exec -i "$container" bash -lc 'apt-get update >/dev/null 2>&1 && apt-get install -y curl jq >/dev/null 2>&1' || true
  fi
}

# Run the mounted verification script in container "$1", passing "$2" to it as
# VERIFY_HOSTNAME. Returns quietly when the container is not running; exits the
# whole script with status 1 when the verification script is not mounted.
run_verifier() {
  local container="$1" hostname="$2"

  if ! container_running "$container"; then
    echo "[WARN] container $container not running; skip"
    return
  fi

  if ! docker exec -i "$container" bash -lc "command -v $IN_CONTAINER_VERIFY >/dev/null"; then
    echo "[ERROR] $IN_CONTAINER_VERIFY missing in $container" >&2
    exit 1
  fi

  echo "[INFO] Running verification for $hostname in $container"
  docker exec -i "$container" env VERIFY_HOSTNAME="$hostname" "$IN_CONTAINER_VERIFY"
}

if ! container_running "$PRIMARY_CONTAINER"; then
  echo "[WARN] agent container not running; skip verification"
  exit 0
fi

ensure_tools "$PRIMARY_CONTAINER" "agent container"

# The host-side copy is the file bind-mounted into the containers.
if [[ ! -f "$VERIFY_SCRIPT" ]]; then
  echo "[ERROR] Verification script missing at $VERIFY_SCRIPT" >&2
  exit 1
fi

run_verifier "$PRIMARY_CONTAINER" "$PRIMARY_HOSTNAME"

if container_running "$ENV_CONTAINER"; then
  ensure_tools "$ENV_CONTAINER" "env agent container"
  run_verifier "$ENV_CONTAINER" "$ENV_HOSTNAME"
else
  echo "[WARN] env-driven agent container not running; skip secondary verification"
fi
|
||||
@ -6,6 +6,8 @@ TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
TMP_ROOT="$TEST_ROOT/tmp"
|
||||
API_BASE="http://localhost:32300/api/v1/master"
|
||||
NODE_ID="$(cat "$TMP_ROOT/node_id")"
|
||||
ENV_NODE_ID="$(cat "$TMP_ROOT/node_id_host_abc")"
|
||||
ENV_HOSTNAME="host_abc"
|
||||
NODES_JSON="$TEST_ROOT/private/argus/metric/prometheus/nodes.json"
|
||||
|
||||
success=false
|
||||
@ -41,13 +43,36 @@ if [[ ! -f "$NODES_JSON" ]]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
python3 - "$NODES_JSON" <<'PY'
|
||||
python3 - "$NODES_JSON" "$NODE_ID" "$ENV_NODE_ID" <<'PY'
|
||||
import json, sys
|
||||
with open(sys.argv[1]) as handle:
|
||||
nodes = json.load(handle)
|
||||
assert len(nodes) == 1, nodes
|
||||
entry = nodes[0]
|
||||
assert entry["node_id"], entry
|
||||
|
||||
expected_primary = sys.argv[2]
|
||||
expected_env = sys.argv[3]
|
||||
|
||||
ids = {entry.get("node_id") for entry in nodes}
|
||||
assert expected_primary in ids, nodes
|
||||
assert expected_env in ids, nodes
|
||||
assert len(nodes) >= 2, nodes
|
||||
PY
|
||||
|
||||
echo "[INFO] Master reflects agent health and nodes.json entries"
|
||||
|
||||
env_detail_file="$TMP_ROOT/env_agent_detail.json"
|
||||
curl -sS "$API_BASE/nodes/$ENV_NODE_ID" -o "$env_detail_file"
|
||||
python3 - "$env_detail_file" "$ENV_HOSTNAME" <<'PY'
|
||||
import json, sys
|
||||
with open(sys.argv[1]) as handle:
|
||||
node = json.load(handle)
|
||||
|
||||
expected_name = sys.argv[2]
|
||||
|
||||
assert node.get("name") == expected_name, node
|
||||
meta = node.get("meta_data", {})
|
||||
assert meta.get("env") == "prod", meta
|
||||
assert meta.get("user") == "ml", meta
|
||||
assert meta.get("instance") == "node-3", meta
|
||||
PY
|
||||
|
||||
echo "[INFO] Env-variable agent reports expected metadata"
|
||||
@ -6,10 +6,20 @@ TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
TMP_ROOT="$TEST_ROOT/tmp"
|
||||
API_BASE="http://localhost:32300/api/v1/master"
|
||||
NODE_ID="$(cat "$TMP_ROOT/node_id")"
|
||||
ENV_NODE_ID_FILE="$TMP_ROOT/node_id_host_abc"
|
||||
if [[ ! -f "$ENV_NODE_ID_FILE" ]]; then
|
||||
echo "[ERROR] Env agent node id file missing at $ENV_NODE_ID_FILE" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
ENV_NODE_ID="$(cat "$ENV_NODE_ID_FILE")"
|
||||
AGENT_HOSTNAME="dev-e2euser-e2einst-pod-0"
|
||||
ENV_AGENT_HOSTNAME="host_abc"
|
||||
NETWORK_NAME="tests_default"
|
||||
NEW_AGENT_IP="172.28.0.200"
|
||||
NEW_ENV_AGENT_IP="172.28.0.210"
|
||||
ENTRYPOINT_SCRIPT="$SCRIPT_DIR/agent_entrypoint.sh"
|
||||
VERIFY_SCRIPT="$TEST_ROOT/../scripts/agent_deployment_verify.sh"
|
||||
ENV_FILE="$TEST_ROOT/.env"
|
||||
|
||||
# 中文提示:重启场景也需要同样的入口脚本,确保 DNS 注册逻辑一致
|
||||
@ -18,6 +28,11 @@ if [[ ! -f "$ENTRYPOINT_SCRIPT" ]]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ ! -f "$VERIFY_SCRIPT" ]]; then
|
||||
echo "[ERROR] agent verification script missing at $VERIFY_SCRIPT" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ ! -f "$TMP_ROOT/agent_binary_path" ]]; then
|
||||
echo "[ERROR] Agent binary path missing; rerun bootstrap" >&2
|
||||
exit 1
|
||||
@ -74,15 +89,37 @@ if [[ "$prev_ip" != "$initial_ip" ]]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
env_before_file="$TMP_ROOT/env_before_restart.json"
|
||||
curl -sS "$API_BASE/nodes/$ENV_NODE_ID" -o "$env_before_file"
|
||||
env_prev_last_updated=$(python3 - "$env_before_file" <<'PY'
|
||||
import json, sys
|
||||
with open(sys.argv[1]) as handle:
|
||||
node = json.load(handle)
|
||||
print(node.get("last_updated", ""))
|
||||
PY
|
||||
)
|
||||
env_prev_ip=$(python3 - "$env_before_file" <<'PY'
|
||||
import json, sys
|
||||
with open(sys.argv[1]) as handle:
|
||||
node = json.load(handle)
|
||||
print(node["meta_data"].get("ip", ""))
|
||||
PY
|
||||
)
|
||||
|
||||
pushd "$TEST_ROOT" >/dev/null
|
||||
compose rm -sf agent
|
||||
compose rm -sf agent_env
|
||||
popd >/dev/null
|
||||
|
||||
docker container rm -f argus-agent-e2e >/dev/null 2>&1 || true
|
||||
docker container rm -f argus-agent-env-e2e >/dev/null 2>&1 || true
|
||||
|
||||
AGENT_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME"
|
||||
HEALTH_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME/health"
|
||||
|
||||
ENV_AGENT_DIR="$TEST_ROOT/private/argus/agent/$ENV_AGENT_HOSTNAME"
|
||||
ENV_HEALTH_DIR="$TEST_ROOT/private/argus/agent/$ENV_AGENT_HOSTNAME/health"
|
||||
|
||||
# 先以 sleep 方式启动容器,确保我们掌握注册时的网络状态
|
||||
if ! docker run -d \
|
||||
--name argus-agent-e2e \
|
||||
@ -94,6 +131,7 @@ if ! docker run -d \
|
||||
-v "$TEST_ROOT/private/argus/etc:/private/argus/etc" \
|
||||
-v "$AGENT_BINARY:/usr/local/bin/argus-agent:ro" \
|
||||
-v "$ENTRYPOINT_SCRIPT:/usr/local/bin/agent-entrypoint.sh:ro" \
|
||||
-v "$VERIFY_SCRIPT:/usr/local/bin/agent_deployment_verify.sh:ro" \
|
||||
-e MASTER_ENDPOINT=http://master.argus.com:3000 \
|
||||
-e REPORT_INTERVAL_SECONDS=2 \
|
||||
-e ARGUS_BUILD_UID="$AGENT_UID" \
|
||||
@ -141,3 +179,76 @@ if [[ "$success" != true ]]; then
|
||||
fi
|
||||
|
||||
echo "[INFO] Agent restart produced successful re-registration with IP change"
|
||||
|
||||
# ---- Restart env-driven agent without metadata environment variables ----
|
||||
|
||||
if [[ ! -d "$ENV_AGENT_DIR" ]]; then
|
||||
echo "[ERROR] Env agent data dir missing at $ENV_AGENT_DIR" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ ! -d "$ENV_HEALTH_DIR" ]]; then
|
||||
mkdir -p "$ENV_HEALTH_DIR"
|
||||
fi
|
||||
|
||||
if ! docker run -d \
|
||||
--name argus-agent-env-e2e \
|
||||
--hostname "$ENV_AGENT_HOSTNAME" \
|
||||
--network "$NETWORK_NAME" \
|
||||
--ip "$NEW_ENV_AGENT_IP" \
|
||||
-v "$ENV_AGENT_DIR:/private/argus/agent/$ENV_AGENT_HOSTNAME" \
|
||||
-v "$ENV_HEALTH_DIR:/private/argus/agent/$ENV_AGENT_HOSTNAME/health" \
|
||||
-v "$TEST_ROOT/private/argus/etc:/private/argus/etc" \
|
||||
-v "$AGENT_BINARY:/usr/local/bin/argus-agent:ro" \
|
||||
-v "$ENTRYPOINT_SCRIPT:/usr/local/bin/agent-entrypoint.sh:ro" \
|
||||
-v "$VERIFY_SCRIPT:/usr/local/bin/agent_deployment_verify.sh:ro" \
|
||||
-e MASTER_ENDPOINT=http://master.argus.com:3000 \
|
||||
-e REPORT_INTERVAL_SECONDS=2 \
|
||||
-e ARGUS_BUILD_UID="$AGENT_UID" \
|
||||
-e ARGUS_BUILD_GID="$AGENT_GID" \
|
||||
--entrypoint /usr/local/bin/agent-entrypoint.sh \
|
||||
ubuntu:22.04 >/dev/null; then
|
||||
echo "[ERROR] Failed to start env-driven agent container without metadata env" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
env_success=false
|
||||
env_detail_file="$TMP_ROOT/env_post_restart.json"
|
||||
for _ in {1..20}; do
|
||||
sleep 3
|
||||
if ! curl -sS "$API_BASE/nodes/$ENV_NODE_ID" -o "$env_detail_file"; then
|
||||
continue
|
||||
fi
|
||||
if python3 - "$env_detail_file" "$env_prev_last_updated" "$ENV_NODE_ID" "$env_prev_ip" "$NEW_ENV_AGENT_IP" <<'PY'
|
||||
import json, sys
|
||||
with open(sys.argv[1]) as handle:
|
||||
node = json.load(handle)
|
||||
prev_last_updated = sys.argv[2]
|
||||
expected_id = sys.argv[3]
|
||||
old_ip = sys.argv[4]
|
||||
expected_ip = sys.argv[5]
|
||||
last_updated = node.get("last_updated")
|
||||
current_ip = node["meta_data"].get("ip")
|
||||
meta = node.get("meta_data", {})
|
||||
assert node["id"] == expected_id
|
||||
if current_ip != expected_ip:
|
||||
raise SystemExit(1)
|
||||
if current_ip == old_ip:
|
||||
raise SystemExit(1)
|
||||
if not last_updated or last_updated == prev_last_updated:
|
||||
raise SystemExit(1)
|
||||
if meta.get("env") != "prod" or meta.get("user") != "ml" or meta.get("instance") != "node-3":
|
||||
raise SystemExit(1)
|
||||
PY
|
||||
then
|
||||
env_success=true
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
if [[ "$env_success" != true ]]; then
|
||||
echo "[ERROR] Env-driven agent did not reuse persisted metadata after restart" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "[INFO] Env-driven agent restart succeeded with persisted metadata"
|
||||
@ -13,7 +13,7 @@ compose() {
|
||||
fi
|
||||
}
|
||||
|
||||
docker container rm -f argus-agent-e2e >/dev/null 2>&1 || true
|
||||
docker container rm -f argus-agent-e2e argus-agent-env-e2e >/dev/null 2>&1 || true
|
||||
|
||||
pushd "$TEST_ROOT" >/dev/null
|
||||
compose down --remove-orphans
|
||||
@ -1,26 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
VERIFY_SCRIPT="$(cd "$TEST_ROOT/.." && pwd)/scripts/agent_deployment_verify.sh"
|
||||
|
||||
if ! docker ps --format '{{.Names}}' | grep -q '^argus-agent-e2e$'; then
|
||||
echo "[WARN] agent container not running; skip verification"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if docker exec -i argus-agent-e2e bash -lc 'command -v curl >/dev/null && command -v jq >/dev/null'; then
|
||||
echo "[INFO] curl/jq already installed in agent container"
|
||||
else
|
||||
echo "[INFO] Installing curl/jq in agent container"
|
||||
docker exec -i argus-agent-e2e bash -lc 'apt-get update >/dev/null 2>&1 && apt-get install -y curl jq >/dev/null 2>&1' || true
|
||||
fi
|
||||
|
||||
if docker exec -i argus-agent-e2e bash -lc 'command -v /usr/local/bin/agent_deployment_verify.sh >/dev/null'; then
|
||||
docker exec -i argus-agent-e2e /usr/local/bin/agent_deployment_verify.sh
|
||||
elif [[ -x "$VERIFY_SCRIPT" ]]; then
|
||||
docker exec -i argus-agent-e2e "$VERIFY_SCRIPT"
|
||||
else
|
||||
echo "[WARN] agent_deployment_verify.sh not found"
|
||||
fi
|
||||
151
src/agent/tests/test_config_metadata.py
Normal file
@ -0,0 +1,151 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import unittest
|
||||
from contextlib import contextmanager
|
||||
from unittest.mock import patch
|
||||
|
||||
from app.config import AgentConfig, load_config
|
||||
|
||||
|
||||
@contextmanager
def temp_env(**overrides: str | None):
    """Temporarily override environment variables for the ``with`` body.

    Each keyword names a variable: a string value sets it, ``None`` removes
    it. On exit, including when the body raises, every touched variable is put
    back to the value it had before (or removed if it was not set).
    """

    def _assign(name: str, value: str | None) -> None:
        # ``None`` means "the variable must not exist".
        if value is None:
            os.environ.pop(name, None)
        else:
            os.environ[name] = value

    saved: dict[str, str | None] = {}
    try:
        for name, replacement in overrides.items():
            saved[name] = os.environ.get(name)
            _assign(name, replacement)
        yield
    finally:
        # Only variables that were actually recorded are restored.
        for name, previous in saved.items():
            _assign(name, previous)
|
||||
|
||||
|
||||
class LoadConfigMetadataTests(unittest.TestCase):
    """Check how ``load_config`` resolves the env/user/instance metadata.

    ``app.config.Path.mkdir`` is patched in every test. Tests that expect the
    hostname fallback (or its failure) also patch
    ``app.config._load_metadata_from_state`` to return ``None``; without that
    their outcome would depend on whether a ``node.json`` for the test
    hostname happens to exist on the machine running the tests.
    """

    @patch("app.config.Path.mkdir")
    def test_metadata_from_environment_variables(self, mock_mkdir):
        """A complete set of AGENT_* variables wins over the hostname."""
        with temp_env(
            MASTER_ENDPOINT="http://master.local",
            AGENT_HOSTNAME="dev-user-one-pod",
            AGENT_ENV="prod",
            AGENT_USER="ops",
            AGENT_INSTANCE="node-1",
        ):
            config = load_config()

        self.assertEqual(config.environment, "prod")
        self.assertEqual(config.user, "ops")
        self.assertEqual(config.instance, "node-1")
        mock_mkdir.assert_called()

    @patch("app.config._load_metadata_from_state", return_value=None)
    @patch("app.config.Path.mkdir")
    def test_metadata_falls_back_to_hostname(self, mock_mkdir, mock_state):
        """With no variables and no persisted state, the hostname is parsed."""
        with temp_env(
            MASTER_ENDPOINT="http://master.local",
            AGENT_HOSTNAME="qa-team-abc-pod-2",
            AGENT_ENV=None,
            AGENT_USER=None,
            AGENT_INSTANCE=None,
        ):
            config = load_config()

        self.assertEqual(config.environment, "qa")
        self.assertEqual(config.user, "team")
        self.assertEqual(config.instance, "abc")
        mock_state.assert_called_once()
        mock_mkdir.assert_called()

    @patch("app.config._load_metadata_from_state", return_value=("prod", "ops", "node-1"))
    @patch("app.config.Path.mkdir")
    def test_metadata_from_node_state(self, mock_mkdir, mock_state):
        """Persisted metadata is used when the variables are absent."""
        with temp_env(
            MASTER_ENDPOINT="http://master.local",
            AGENT_HOSTNAME="host_abc",
            AGENT_ENV=None,
            AGENT_USER=None,
            AGENT_INSTANCE=None,
        ):
            config = load_config()

        self.assertEqual(config.environment, "prod")
        self.assertEqual(config.user, "ops")
        self.assertEqual(config.instance, "node-1")
        mock_state.assert_called_once()
        mock_mkdir.assert_called()

    @patch("app.config._load_metadata_from_state", return_value=None)
    @patch("app.config.Path.mkdir")
    def test_partial_environment_variables_fallback(self, mock_mkdir, mock_state):
        """A partial set of AGENT_* variables is ignored, not mixed in."""
        with temp_env(
            MASTER_ENDPOINT="http://master.local",
            AGENT_HOSTNAME="stage-ml-001-node",
            AGENT_ENV="prod",
            AGENT_USER=None,
            AGENT_INSTANCE=None,
        ):
            config = load_config()

        self.assertEqual(config.environment, "stage")
        self.assertEqual(config.user, "ml")
        self.assertEqual(config.instance, "001")
        mock_state.assert_called_once()
        mock_mkdir.assert_called()

    @patch("app.config._load_metadata_from_state", return_value=None)
    @patch("app.config.Path.mkdir")
    def test_invalid_hostname_raises_error(self, mock_mkdir, mock_state):
        """When no source yields metadata, ``load_config`` raises ValueError."""
        with temp_env(
            MASTER_ENDPOINT="http://master.local",
            AGENT_HOSTNAME="invalidhostname",
            AGENT_ENV=None,
            AGENT_USER=None,
            AGENT_INSTANCE=None,
        ):
            with self.assertRaises(ValueError):
                load_config()

        # Metadata resolution fails before mkdir is reached.
        mock_mkdir.assert_not_called()
|
||||
|
||||
|
||||
class CollectMetadataTests(unittest.TestCase):
    """Check that ``collect_metadata`` reports the values held by the config."""

    def test_collect_metadata_uses_config_fields(self):
        """env/user/instance come from the config, not from the hostname."""
        from app.collector import collect_metadata

        # The hostname would parse to dev/user/001, so a match on
        # prod/ops/node-1 shows the explicit config fields were used.
        config = AgentConfig(
            hostname="dev-user-001-pod",
            environment="prod",
            user="ops",
            instance="node-1",
            node_file="/tmp/node.json",
            version="1.0.0",
            master_endpoint="http://master.local",
            report_interval_seconds=60,
            health_dir="/tmp/health",
        )

        with patch("app.collector._detect_cpu_count", return_value=8), \
                patch("app.collector._detect_memory_bytes", return_value=1024), \
                patch("app.collector._detect_gpu_count", return_value=0), \
                patch("app.collector._detect_ip_address", return_value="127.0.0.1"):
            metadata = collect_metadata(config)

        expectations = (
            ("env", "prod"),
            ("user", "ops"),
            ("instance", "node-1"),
            ("hostname", "dev-user-001-pod"),
            ("ip", "127.0.0.1"),
            ("cpu_number", 8),
            ("memory_in_bytes", 1024),
            ("gpu_number", 0),
        )
        for field, expected in expectations:
            self.assertEqual(metadata[field], expected)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@ -0,0 +1,13 @@
|
||||
# Alertmanager
|
||||
|
||||
## 启动示例
|
||||
|
||||
```bash
|
||||
docker run -d --name alertmanager \
|
||||
-p 9093:9093 \
|
||||
-v /opt/alertmanager/data:/alertmanager \
|
||||
argus-alert:latest
|
||||
```
|
||||
|
||||
## 动态配置
|
||||
修改 `alertmanager.yml` 后,向 `/-/reload` 接口发送 HTTP POST 请求即可重新加载配置,例如:`curl -X POST http://localhost:9093/-/reload`。
|
||||
86
src/alert/alertmanager/build/Dockerfile
Normal file
@ -0,0 +1,86 @@
|
||||
# Alertmanager image based on Ubuntu 24.04, run under supervisord.
FROM ubuntu:24.04

# Build steps below need root.
USER root

# Install required packages.
RUN apt-get update && \
    apt-get install -y wget supervisor net-tools inetutils-ping vim ca-certificates passwd && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Alertmanager release to install.
ARG ALERTMANAGER_VERSION=0.28.1

# Download and unpack the Alertmanager binaries.
RUN wget https://github.com/prometheus/alertmanager/releases/download/v${ALERTMANAGER_VERSION}/alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz && \
    tar xvf alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz && \
    mv alertmanager-${ALERTMANAGER_VERSION}.linux-amd64 /usr/local/alertmanager && \
    rm alertmanager-${ALERTMANAGER_VERSION}.linux-amd64.tar.gz

ENV ALERTMANAGER_BASE_PATH=/private/argus/alert/alertmanager
ENV ARGUS_UID=2133
ENV ARGUS_GID=2015

# Create the data directories; /alertmanager is a symlink to the base path.
RUN mkdir -p /usr/share/alertmanager && \
    mkdir -p ${ALERTMANAGER_BASE_PATH} && \
    mkdir -p /private/argus/etc && \
    rm -rf /alertmanager && \
    ln -s ${ALERTMANAGER_BASE_PATH} /alertmanager

# Create the alertmanager group and user with the UID/GID set above.
RUN groupadd -g ${ARGUS_GID} alertmanager

RUN useradd -M -s /usr/sbin/nologin -u ${ARGUS_UID} -g ${ARGUS_GID} alertmanager

RUN chown -R alertmanager:alertmanager /usr/share/alertmanager && \
    chown -R alertmanager:alertmanager /alertmanager && \
    chown -R alertmanager:alertmanager ${ALERTMANAGER_BASE_PATH} && \
    chown -R alertmanager:alertmanager /private/argus/etc && \
    chown -R alertmanager:alertmanager /usr/local/bin

# Build-time switch for the intranet apt sources. It must be declared with ARG:
# without the declaration `--build-arg USE_INTRANET=true` never reaches the RUN
# steps below and both `if` branches are dead code.
ARG USE_INTRANET=false

# Configure the intranet apt source (only when the intranet option is given).
RUN if [ "$USE_INTRANET" = "true" ]; then \
        echo "Configuring intranet apt sources..." && \
        cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
        echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
        echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \
        echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \
    fi

# Configure the apt source used at deployment time (replaces the one above).
RUN if [ "$USE_INTRANET" = "true" ]; then \
        echo "deb [trusted=yes] https://10.92.132.52/mirrors/ubuntu2204/ jammy main" > /etc/apt/sources.list; \
    fi

# Create the supervisor log directory.
RUN mkdir -p /var/log/supervisor

# Copy the supervisor configuration.
COPY src/alert/alertmanager/build/supervisord.conf /etc/supervisor/conf.d/supervisord.conf

# Copy the start script.
COPY src/alert/alertmanager/build/start-am-supervised.sh /usr/local/bin/start-am-supervised.sh
RUN chmod +x /usr/local/bin/start-am-supervised.sh

# Copy the Alertmanager configuration. It is a data file, so it is made
# world-readable rather than executable.
COPY src/alert/alertmanager/build/alertmanager.yml /etc/alertmanager/alertmanager.yml
RUN chmod 644 /etc/alertmanager/alertmanager.yml
# COPY src/alert/alertmanager/build/alertmanager.yml ${ALERTMANAGER_BASE_PATH}/alertmanager.yml

# Copy the DNS monitor script.
COPY src/alert/alertmanager/build/dns-monitor.sh /usr/local/bin/dns-monitor.sh
RUN chmod +x /usr/local/bin/dns-monitor.sh

# Stay root; supervisor is responsible for switching users.
USER root

# Expose Alertmanager's default port.
EXPOSE 9093

# Use supervisor as the container entry point.
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
|
||||
|
||||
19
src/alert/alertmanager/build/alertmanager.yml
Normal file
@ -0,0 +1,19 @@
|
||||
global:
|
||||
resolve_timeout: 5m
|
||||
|
||||
route:
|
||||
group_by: ['alertname', 'instance'] # 分组:相同 alertname + instance 的告警合并
|
||||
group_wait: 30s # 第一个告警后,等 30s 看是否有同组告警一起发
|
||||
group_interval: 5m # 同组告警变化后,至少 5 分钟再发一次
|
||||
repeat_interval: 3h # 相同告警,3 小时重复提醒一次
|
||||
receiver: 'null'
|
||||
|
||||
receivers:
|
||||
- name: 'null'
|
||||
|
||||
inhibit_rules:
|
||||
- source_match:
|
||||
severity: 'critical' # critical 告警存在时
|
||||
target_match:
|
||||
severity: 'warning' # 抑制相同 instance 的 warning 告警
|
||||
equal: ['instance']
|
||||
68
src/alert/alertmanager/build/dns-monitor.sh
Normal file
@ -0,0 +1,68 @@
|
||||
#!/bin/bash

# DNS monitor: polls dns.conf every 10 seconds and runs update-dns.sh when the
# file is seen for the first time or when its content differs from the last
# successfully applied copy.

DNS_CONF="/private/argus/etc/dns.conf"
DNS_BACKUP="/tmp/dns.conf.backup"
UPDATE_SCRIPT="/private/argus/etc/update-dns.sh"
LOG_FILE="/var/log/supervisor/dns-monitor.log"

# Make sure the log file exists before anything is appended to it.
touch "$LOG_FILE"

# Append a timestamped message to the log file.
#   $1 - message text
log_message() {
    echo "$(date '+%Y-%m-%d %H:%M:%S') [DNS-Monitor] $1" >> "$LOG_FILE"
}

# Run the update script and log the outcome.
# Returns 0 only when the script is executable and exits successfully.
run_update_script() {
    if [ ! -x "$UPDATE_SCRIPT" ]; then
        log_message "警告: 更新脚本不存在或不可执行: $UPDATE_SCRIPT"
        return 1
    fi

    log_message "执行DNS更新脚本: $UPDATE_SCRIPT"
    if "$UPDATE_SCRIPT" >> "$LOG_FILE" 2>&1; then
        log_message "DNS更新脚本执行成功"
        return 0
    fi

    log_message "DNS更新脚本执行失败"
    return 1
}

# Apply the current dns.conf and remember it as the last applied version.
# The snapshot is taken BEFORE the update script runs, so an edit made while
# the script is running is still detected on the next poll. The snapshot only
# becomes the backup when the update succeeded; a failed update is therefore
# retried on the next poll instead of being silently dropped.
# Returns 0 when the backup was refreshed.
apply_dns_conf() {
    local pending="${DNS_BACKUP}.pending"

    cp "$DNS_CONF" "$pending" || return 1
    if run_update_script; then
        mv -f "$pending" "$DNS_BACKUP"
        return $?
    fi

    rm -f "$pending"
    return 1
}

log_message "DNS监控脚本启动"

while true; do
    if [ ! -f "$DNS_CONF" ]; then
        log_message "警告: DNS配置文件不存在: $DNS_CONF"
    elif [ ! -f "$DNS_BACKUP" ]; then
        # First time the config file is seen: apply it and create the backup.
        if apply_dns_conf; then
            log_message "创建DNS配置备份文件"
        fi
    elif ! cmp -s "$DNS_CONF" "$DNS_BACKUP"; then
        # Content differs from the last applied version.
        log_message "检测到DNS配置变化"
        apply_dns_conf
    fi

    sleep 10
done
|
||||
26
src/alert/alertmanager/build/start-am-supervised.sh
Normal file
@ -0,0 +1,26 @@
|
||||
#!/bin/bash
# Entry script run by supervisor: renders the Alertmanager config into the
# base path, records the container IP for DNS registration, then replaces
# itself with the Alertmanager process.
set -euo pipefail

echo "[INFO] Starting Alertmanager under supervisor..."

ALERTMANAGER_BASE_PATH=${ALERTMANAGER_BASE_PATH:-/private/argus/alert/alertmanager}

echo "[INFO] Alertmanager base path: ${ALERTMANAGER_BASE_PATH}"

# Render the configuration file (substitutes the literal
# ${ALERTMANAGER_BASE_PATH} placeholder in the template).
echo "[INFO] Generating Alertmanager configuration file..."
mkdir -p "${ALERTMANAGER_BASE_PATH}"
sed "s|\${ALERTMANAGER_BASE_PATH}|${ALERTMANAGER_BASE_PATH}|g" \
    /etc/alertmanager/alertmanager.yml > "${ALERTMANAGER_BASE_PATH}/alertmanager.yml"

# Record the container IP address under the service's domain name.
DOMAIN=alertmanager.alert.argus.com
# `|| true` keeps errexit/pipefail from killing the script silently when
# ifconfig is missing or eth0 has no address; the empty result is reported
# explicitly below instead.
IP=$(ifconfig 2>/dev/null | grep -A 1 eth0 | grep inet | awk '{print $2}' || true)
if [[ -z "${IP}" ]]; then
    echo "[ERROR] Unable to determine the IP address of eth0" >&2
    exit 1
fi
echo "current IP: ${IP}"
echo "${IP}" > "/private/argus/etc/${DOMAIN}"

echo "[INFO] Starting Alertmanager process..."

# Start the Alertmanager main process.
# NOTE(review): the config rendered above is written to
# ${ALERTMANAGER_BASE_PATH}/alertmanager.yml, but the process is started with
# the template at /etc/alertmanager/alertmanager.yml - confirm which file is
# meant to be loaded.
exec /usr/local/alertmanager/alertmanager --config.file=/etc/alertmanager/alertmanager.yml --storage.path=/alertmanager --cluster.listen-address=""
|
||||
39
src/alert/alertmanager/build/supervisord.conf
Normal file
@ -0,0 +1,39 @@
|
||||
[supervisord]
|
||||
nodaemon=true
|
||||
logfile=/var/log/supervisor/supervisord.log
|
||||
pidfile=/var/run/supervisord.pid
|
||||
user=root
|
||||
|
||||
[program:alertmanager]
|
||||
command=/usr/local/bin/start-am-supervised.sh
|
||||
user=alertmanager
|
||||
stdout_logfile=/var/log/supervisor/alertmanager.log
|
||||
stderr_logfile=/var/log/supervisor/alertmanager_error.log
|
||||
autorestart=true
|
||||
startretries=3
|
||||
startsecs=10
|
||||
stopwaitsecs=20
|
||||
killasgroup=true
|
||||
stopasgroup=true
|
||||
|
||||
[program:dns-monitor]
|
||||
command=/usr/local/bin/dns-monitor.sh
|
||||
user=root
|
||||
stdout_logfile=/var/log/supervisor/dns-monitor.log
|
||||
stderr_logfile=/var/log/supervisor/dns-monitor_error.log
|
||||
autorestart=true
|
||||
startretries=3
|
||||
startsecs=5
|
||||
stopwaitsecs=10
|
||||
killasgroup=true
|
||||
stopasgroup=true
|
||||
|
||||
[unix_http_server]
|
||||
file=/var/run/supervisor.sock
|
||||
chmod=0700
|
||||
|
||||
[supervisorctl]
|
||||
serverurl=unix:///var/run/supervisor.sock
|
||||
|
||||
[rpcinterface:supervisor]
|
||||
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
|
||||
60
src/alert/alertmanager/config/rule_files/README.md
Normal file
@ -0,0 +1,60 @@
|
||||
# 告警配置
|
||||
|
||||
> 参考:[自定义Prometheus告警规则](https://yunlzheng.gitbook.io/prometheus-book/parti-prometheus-ji-chu/alert/prometheus-alert-rule)
|
||||
|
||||
在 Prometheus 中配置告警有两个步骤:
|
||||
|
||||
1. 写告警规则文件(rules文件)
|
||||
2. 在prometheus.yml里加载规则,并配置Alertmanager
|
||||
|
||||
## 1. 编写告警规则文件
|
||||
告警规则如下:
|
||||
```yml
|
||||
groups:
|
||||
- name: example-rules
|
||||
interval: 30s # 每30秒评估一次
|
||||
rules:
|
||||
- alert: InstanceDown
|
||||
expr: up == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "实例 {{ $labels.instance }} 已宕机"
|
||||
description: "{{ $labels.instance }} 在 {{ $labels.job }} 中无响应超过 1 分钟。"
|
||||
|
||||
- alert: HighCpuUsage
|
||||
expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "CPU 使用率过高"
|
||||
description: "实例 {{ $labels.instance }} CPU 使用率超过 80% 持续 5 分钟。"
|
||||
```
|
||||
|
||||
其中:
|
||||
|
||||
- `alert`:告警规则的名称。
|
||||
- `expr`:基于PromQL表达式的告警触发条件,用于计算是否有时间序列满足该条件。
|
||||
- `for`:评估等待时间,可选参数。用于表示只有当触发条件持续一段时间后才发送告警。在等待期间新产生告警的状态为pending。
|
||||
- `labels`:自定义标签,允许用户指定要附加到告警上的一组附加标签,可以在Alertmanager中做路由和分组。
|
||||
- `annotations`:用于指定一组附加信息,比如用于描述告警详细信息的文字等,annotations的内容在告警产生时会一同作为参数发送到Alertmanager。可以提供告警摘要和详细信息。
|
||||
|
||||
## 2. 在prometheus.yml里引用
|
||||
在prometheus.yml中加上`rule_files`和`alerting`:
|
||||
|
||||
```yml
|
||||
global:
|
||||
[ evaluation_interval: <duration> | default = 1m ]
|
||||
|
||||
rule_files:
|
||||
[ - <filepath_glob> ... ]
|
||||
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets:
|
||||
- "localhost:9093" # Alertmanager 地址
|
||||
|
||||
```
|
||||
37
src/alert/alertmanager/config/rule_files/example_rules.yml
Normal file
@ -0,0 +1,37 @@
|
||||
groups:
|
||||
- name: example-rules
|
||||
interval: 30s # 每30秒评估一次
|
||||
rules:
|
||||
- alert: InstanceDown
|
||||
expr: up == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "实例 {{ $labels.instance }} 已宕机"
|
||||
description: "{{ $labels.instance }} 在 {{ $labels.job }} 中无响应超过 1 分钟。"
|
||||
|
||||
- alert: HighCpuUsage
|
||||
expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "CPU 使用率过高"
|
||||
description: "实例 {{ $labels.instance }} CPU 使用率超过 80% 持续 5 分钟。"
|
||||
- alert: HighMemoryUsage
|
||||
expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "内存使用率过高"
|
||||
description: "实例 {{ $labels.instance }} 内存使用率超过 80% 持续 5 分钟。"
|
||||
- alert: DiskSpaceLow
|
||||
expr: (node_filesystem_size_bytes{fstype!~"tmpfs|overlay"} - node_filesystem_free_bytes{fstype!~"tmpfs|overlay"}) / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"} * 100 > 90
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "磁盘空间不足"
|
||||
description: "实例 {{ $labels.instance }} 磁盘空间不足超过 90% 持续 10 分钟。"
|
||||
19
src/alert/tests/data/alertmanager/alertmanager.yml
Normal file
@ -0,0 +1,19 @@
|
||||
global:
|
||||
resolve_timeout: 5m
|
||||
|
||||
route:
|
||||
group_by: ['alertname', 'instance'] # 分组:相同 alertname + instance 的告警合并
|
||||
group_wait: 30s # 第一个告警后,等 30s 看是否有同组告警一起发
|
||||
group_interval: 5m # 同组告警变化后,至少 5 分钟再发一次
|
||||
repeat_interval: 3h # 相同告警,3 小时重复提醒一次
|
||||
receiver: 'null'
|
||||
|
||||
receivers:
|
||||
- name: 'null'
|
||||
|
||||
inhibit_rules:
|
||||
- source_match:
|
||||
severity: 'critical' # critical 告警存在时
|
||||
target_match:
|
||||
severity: 'warning' # 抑制相同 instance 的 warning 告警
|
||||
equal: ['instance']
|
||||
0
src/alert/tests/data/alertmanager/nflog
Normal file
0
src/alert/tests/data/alertmanager/silences
Normal file
1
src/alert/tests/data/etc/alertmanager.alert.argus.com
Normal file
@ -0,0 +1 @@
|
||||
172.18.0.2
|
||||
37
src/alert/tests/docker-compose.yml
Normal file
@ -0,0 +1,37 @@
|
||||
version: '3.8'
|
||||
services:
|
||||
alertmanager:
|
||||
build:
|
||||
context: ../../../
|
||||
dockerfile: src/alert/alertmanager/build/Dockerfile
|
||||
args:
|
||||
ARGUS_UID: ${ARGUS_UID:-2133}
|
||||
ARGUS_GID: ${ARGUS_GID:-2015}
|
||||
USE_INTRANET: ${USE_INTRANET:-false}
|
||||
image: argus-alertmanager:latest
|
||||
container_name: argus-alertmanager
|
||||
environment:
|
||||
- ALERTMANAGER_BASE_PATH=/private/argus/alert/alertmanager
|
||||
- ARGUS_UID=${ARGUS_UID:-2133}
|
||||
- ARGUS_GID=${ARGUS_GID:-2015}
|
||||
ports:
|
||||
- "${ARGUS_PORT:-9093}:9093"
|
||||
volumes:
|
||||
- ${DATA_ROOT:-./data}/alertmanager:/private/argus/alert/alertmanager
|
||||
- ${DATA_ROOT:-./data}/etc:/private/argus/etc
|
||||
networks:
|
||||
- argus-network
|
||||
restart: unless-stopped
|
||||
logging:
|
||||
driver: "json-file"
|
||||
options:
|
||||
max-size: "10m"
|
||||
max-file: "3"
|
||||
networks:
|
||||
argus-network:
|
||||
driver: bridge
|
||||
name: argus-network
|
||||
|
||||
volumes:
|
||||
alertmanager_data:
|
||||
driver: local
|
||||
19
src/alert/tests/scripts/01_bootstrap.sh
Normal file
@ -0,0 +1,19 @@
|
||||
#!/usr/bin/env bash
# Bootstrap for the alert tests: loads the build user and prepares the
# private directory tree with matching ownership.
set -euo pipefail

root="$(cd "$(dirname "${BASH_SOURCE[0]}")/../" && pwd)"
project_root="$(cd "$root/../../.." && pwd)"

source "$project_root/scripts/common/build_user.sh"
load_build_user

# Create the private directory layout (mirrors the argus directory structure).
echo "[INFO] Creating private directory structure for supervisor-based containers..."
mkdir -p "$root/private/argus/alert/alertmanager" "$root/private/argus/etc/"

# Hand the data directories to the build user. This is best effort: a failing
# chown is deliberately ignored.
echo "[INFO] Setting permissions for data directories..."
owner="${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}"
for data_dir in "$root/private/argus/alert/alertmanager" "$root/private/argus/etc"; do
    chown -R "$owner" "$data_dir" 2>/dev/null || true
done

echo "[INFO] Supervisor-based containers will manage their own scripts and configurations"
|
||||
10
src/alert/tests/scripts/02_up.sh
Normal file
@ -0,0 +1,10 @@
|
||||
#!/usr/bin/env bash
# Bring up the alert test stack with whichever Docker Compose flavour exists.
set -euo pipefail

cd "$(dirname "$0")/.."

# Prefer the `docker compose` plugin; fall back to the standalone binary.
compose=(docker compose)
if ! "${compose[@]}" version >/dev/null 2>&1; then
    if ! command -v docker-compose >/dev/null 2>&1; then
        echo "需要 Docker Compose,请安装后重试" >&2
        exit 1
    fi
    compose=(docker-compose)
fi

"${compose[@]}" -p alert-mvp up -d --remove-orphans
echo "[OK] 服务已启动:Alertmanager http://localhost:9093"
|
||||
106
src/alert/tests/scripts/03_alertmanager_add_alert.sh
Normal file
@ -0,0 +1,106 @@
|
||||
#!/bin/bash
set -euo pipefail

# ==========================================================
# Alertmanager test script: waits for Alertmanager, then posts one critical
# and one warning test alert through the v2 API.
# ==========================================================

ALERTMANAGER_URL="http://localhost:9093"
TEST_ALERT_NAME_CRITICAL="NodeDown"
TEST_ALERT_NAME_WARNING="HighCPU"
TMP_LOG="/tmp/test-alertmanager.log"

# Readiness polling parameters.
am_wait_attempts=30
am_wait_interval=2

GREEN="\033[1;32m"
RED="\033[1;31m"
YELLOW="\033[1;33m"
RESET="\033[0m"

# ==========================================================
# Functions
# ==========================================================

# Poll the status endpoint until it answers or the attempts are used up.
# Returns 0 when Alertmanager is ready, 1 otherwise.
wait_for_alertmanager() {
    local attempt=1
    echo "[INFO] 等待 Alertmanager 启动中..."
    while (( attempt <= am_wait_attempts )); do
        if curl -fsS "${ALERTMANAGER_URL}/api/v2/status" >/dev/null 2>&1; then
            echo -e "${GREEN}[OK] Alertmanager 已就绪 (attempt=${attempt}/${am_wait_attempts})${RESET}"
            return 0
        fi
        echo "[..] Alertmanager 尚未就绪 (${attempt}/${am_wait_attempts})"
        sleep "${am_wait_interval}"
        attempt=$(( attempt + 1 ))
    done
    echo -e "${RED}[ERROR] Alertmanager 在 ${am_wait_attempts} 次尝试后仍未就绪${RESET}"
    return 1
}

# Print a highlighted section banner.
#   $1 - banner text
log_step() {
    echo -e "${YELLOW}==== $1 ====${RESET}"
}

# Post a single test alert for instance node-1.
#   $1 - alert name
#   $2 - severity label (also used in the log messages)
#   $3 - summary annotation
# Returns 0 on success; prints the response body and returns 1 on failure.
send_test_alert() {
    local alertname="$1" severity="$2" summary="$3"
    local payload
    payload=$(printf '[{"labels":{"alertname":"%s","instance":"node-1","severity":"%s"},"annotations":{"summary":"%s"}}]' \
        "$alertname" "$severity" "$summary")

    echo "[INFO] 发送${severity}测试告警..."
    # Start from an empty file so a failure never shows output of an older run.
    : > "$TMP_LOG"

    # curl is tested directly: under `set -e` a separate `$?` check after the
    # command would never be reached when curl fails.
    if curl -fsS -X POST "${ALERTMANAGER_URL}/api/v2/alerts" \
        -H "Content-Type: application/json" \
        -d "$payload" \
        -o "$TMP_LOG"; then
        echo -e "${GREEN}[OK] 已成功发送${severity}测试告警${RESET}"
        return 0
    fi

    echo -e "${RED}[ERROR] ${severity}告警发送失败!${RESET}"
    cat "$TMP_LOG"
    return 1
}

# ==========================================================
# Main flow
# ==========================================================

log_step "测试 Alertmanager 开始"
echo "[INFO] Alertmanager 地址: $ALERTMANAGER_URL"

# Step 1: wait for Alertmanager to come up.
wait_for_alertmanager

# Step 2: fire a critical test alert.
send_test_alert "${TEST_ALERT_NAME_CRITICAL}" "critical" "节点 node-1 宕机" || exit 1

# Step 3: fire a warning test alert.
send_test_alert "${TEST_ALERT_NAME_WARNING}" "warning" "节点 node-1 CPU 使用率过高" || exit 1
|
||||
71
src/alert/tests/scripts/04_query_alerts.sh
Normal file
@ -0,0 +1,71 @@
|
||||
#!/bin/bash
set -euo pipefail

# ==========================================================
# Alertmanager test script (with startup wait): lists the current alerts and
# verifies that the critical test alert is present.
# ==========================================================

ALERTMANAGER_URL="http://localhost:9093"
TEST_ALERT_NAME_CRITICAL="NodeDown"

# Readiness polling parameters.
am_wait_attempts=30
am_wait_interval=2

GREEN="\033[1;32m"
RED="\033[1;31m"
YELLOW="\033[1;33m"
RESET="\033[0m"

# ==========================================================
# Functions
# ==========================================================

# Poll the status endpoint until it answers or the attempts are used up.
# Returns 0 when Alertmanager is ready, 1 otherwise.
wait_for_alertmanager() {
    local attempt=1
    echo "[INFO] 等待 Alertmanager 启动中..."
    while (( attempt <= am_wait_attempts )); do
        if curl -fsS "${ALERTMANAGER_URL}/api/v2/status" >/dev/null 2>&1; then
            echo -e "${GREEN}[OK] Alertmanager 已就绪 (attempt=${attempt}/${am_wait_attempts})${RESET}"
            return 0
        fi
        echo "[..] Alertmanager 尚未就绪 (${attempt}/${am_wait_attempts})"
        sleep "${am_wait_interval}"
        attempt=$(( attempt + 1 ))
    done
    echo -e "${RED}[ERROR] Alertmanager 在 ${am_wait_attempts} 次尝试后仍未就绪${RESET}"
    return 1
}

# Print a highlighted section banner.
#   $1 - banner text
log_step() {
    echo -e "${YELLOW}==== $1 ====${RESET}"
}

# ==========================================================
# Main flow
# ==========================================================

log_step "查询 Alertmanager 当前告警列表开始"
echo "[INFO] Alertmanager 地址: $ALERTMANAGER_URL"

# Step 1: wait for Alertmanager to come up.
wait_for_alertmanager

# Step 2: fetch the current alert list once and reuse it for both display
# and verification, so the two always look at the same data.
echo "[INFO] 查询当前告警..."
sleep 1
alerts_json=$(curl -fsS "${ALERTMANAGER_URL}/api/v2/alerts")

# Pretty-print with jq when possible; fall back to the raw response.
printf '%s\n' "$alerts_json" | jq '.' || {
    echo -e "${RED}[WARN] 无法解析返回 JSON,请检查 jq 是否安装${RESET}"
    printf '%s\n' "$alerts_json"
}

# Step 3: verify that the critical test alert is present.
# A here-string is used instead of `curl | grep -q`: with pipefail, grep -q
# closing the pipe early can make the whole pipeline report failure.
if grep -q "${TEST_ALERT_NAME_CRITICAL}" <<< "$alerts_json"; then
    echo -e "${GREEN}✅ 测试通过:Alertmanager 已成功接收告警 ${TEST_ALERT_NAME_CRITICAL}${RESET}"
else
    echo -e "${RED}❌ 测试失败:未检测到告警 ${TEST_ALERT_NAME_CRITICAL}${RESET}"
    log_step "测试结束"
    # Propagate the failure so callers (e.g. the e2e driver) can detect it.
    exit 1
fi

log_step "测试结束"
|
||||
21
src/alert/tests/scripts/05_down.sh
Normal file
@ -0,0 +1,21 @@
|
||||
#!/usr/bin/env bash
# Stop the alert test stack and remove the private data directory.
set -euo pipefail

# Resolve the tests directory to an absolute path once. Repeating a relative
# `cd "$(dirname "$0")/.."` after the working directory has already changed
# would climb one level too far when the script is invoked as ./05_down.sh.
tests_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$tests_dir"

# Prefer the `docker compose` plugin; fall back to the standalone binary.
compose_cmd="docker compose"
if ! $compose_cmd version >/dev/null 2>&1; then
    if command -v docker-compose >/dev/null 2>&1; then
        compose_cmd="docker-compose"
    else
        echo "需要 Docker Compose,请安装后重试" >&2
        exit 1
    fi
fi

$compose_cmd -p alert-mvp down
echo "[OK] 已停止所有容器"

# Clean up the private directory.
echo "[INFO] 清理private目录内容..."
if [ -d "$tests_dir/private" ]; then
    # Remove the private directory and everything below it.
    rm -rf "$tests_dir/private"
    echo "[OK] 已清理private目录"
else
    echo "[INFO] private目录不存在,无需清理"
fi
|
||||
105
src/alert/tests/scripts/e2e_test.sh
Normal file
@ -0,0 +1,105 @@
|
||||
#!/usr/bin/env bash
# End-to-end driver for the alert module: bootstrap, start the stack, send
# test alerts, query them, check health, then tear everything down.
set -euo pipefail

# The step scripts are addressed as ./scripts/..., so run from the tests
# directory regardless of where the caller invoked this script from.
cd "$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

echo "======================================="
echo "ARGUS Alert System End-to-End Test"
echo "======================================="
echo ""

# Record the test start time.
test_start_time=$(date +%s)
am_status="unknown"

# Wait until the Alertmanager status endpoint answers.
# The attempt count can be overridden with SERVICE_WAIT_ATTEMPTS (default 120).
# Returns 0 when ready, 1 when the attempts are exhausted.
wait_for_services() {
    echo "[INFO] Waiting for all services to be ready..."
    local max_attempts=${SERVICE_WAIT_ATTEMPTS:-120}
    local attempt=1

    while [ "$attempt" -le "$max_attempts" ]; do
        if curl -fs http://localhost:9093/api/v2/status >/dev/null 2>&1; then
            echo "[OK] All services are ready!"
            return 0
        fi
        echo " Waiting for services... ($attempt/$max_attempts)"
        sleep 5
        attempt=$((attempt + 1))
    done

    echo "[ERROR] Services not ready after $max_attempts attempts"
    return 1
}

# Print a step banner.
#   $1 - step identifier
#   $2 - step description
show_step() {
    echo ""
    echo "🔄 Step $1: $2"
    echo "----------------------------------------"
}

# Run a command and report its result; abort the whole test on failure.
# The command is evaluated inside the `if`, so a non-zero status reaches the
# FAILED branch instead of tripping `set -e` before any report is printed.
#   $1   - label used in the report
#   $2.. - command to run
run_step() {
    local label="$1"
    shift
    if "$@"; then
        echo "✅ $label - SUCCESS"
    else
        echo "❌ $label - FAILED"
        exit 1
    fi
}

# Probe the Alertmanager status endpoint and record the result in am_status.
# Returns 0 when available, 1 otherwise, so the health step can actually fail.
check_service_health() {
    echo "[INFO] Checking service health..."
    if curl -fs "http://localhost:9093/api/v2/status" >/dev/null 2>&1; then
        am_status="available"
        echo "✅ Alertmanager status: $am_status"
        return 0
    fi
    am_status="unavailable"
    echo "⚠️ Alertmanager status: $am_status"
    return 1
}

# Start of the end-to-end flow.
show_step "1" "Bootstrap - Initialize environment"
run_step "Bootstrap" ./scripts/01_bootstrap.sh

show_step "2" "Startup - Start all services"
run_step "Service startup" ./scripts/02_up.sh

# Wait until the services are fully ready.
wait_for_services || exit 1

# Send alert data.
show_step "3" "Add alerts - Send test alerts to Alertmanager"
run_step "Send test alerts" ./scripts/03_alertmanager_add_alert.sh

# Query alert data.
show_step "4" "Verify data - Query Alertmanager"
run_step "Data verification" ./scripts/04_query_alerts.sh

# Check service health.
show_step "Health" "Check service health"
run_step "Service health check" check_service_health

# Clean up the environment.
show_step "5" "Cleanup - Stop all services"
run_step "Service cleanup" ./scripts/05_down.sh

# Compute the total test duration.
test_end_time=$(date +%s)
total_time=$((test_end_time - test_start_time))

echo ""
echo "======================================="
echo "🎉 END-TO-END TEST COMPLETED SUCCESSFULLY!"
echo "======================================="
echo "📊 Test Summary:"
echo " • Total time: ${total_time}s"
echo " • Alertmanager status: $am_status"
echo " • All services started and stopped successfully"
echo ""
echo "✅ The ARGUS Alert system is working correctly!"
echo ""
|
||||
@ -17,6 +17,9 @@ log_message() {
|
||||
|
||||
log_message "DNS监控脚本启动"
|
||||
|
||||
log_message "删除DNS备份文件(如果存在)"
|
||||
rm -f $DNS_BACKUP
|
||||
|
||||
while true; do
|
||||
if [ -f "$DNS_CONF" ]; then
|
||||
if [ -f "$DNS_BACKUP" ]; then
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
set -euo pipefail
|
||||
|
||||
ES_HOST="${ELASTICSEARCH_HOSTS:-http://es:9200}"
|
||||
KB_HOST="http://localhost:5601"
|
||||
KB_HOST="${KB_HOST:-http://127.0.0.1:5601}"
|
||||
|
||||
echo "[INFO] Starting Kibana post-start configuration..."
|
||||
|
||||
@ -83,50 +83,37 @@ fix_replicas_idempotent() {
|
||||
}
|
||||
|
||||
# 幂等创建数据视图
|
||||
create_or_ensure_data_view() {
    # Create a Kibana data view unless one with the same title already exists.
    #   $1 - data view name
    #   $2 - index pattern used as the data view title
    # Failures to list or create are reported as warnings only.
    local name="$1" title="$2"
    local existing_views

    existing_views=$(curl -fsS "$KB_HOST/api/data_views" -H 'kbn-xsrf: true' 2>/dev/null || echo "")

    if [ -z "$existing_views" ]; then
        # Without a listing we cannot tell whether the view exists; do nothing.
        echo "[WARN] Failed to list data views, skipping creation check for $title"
    elif echo "$existing_views" | grep -Fq "\"title\":\"$title\""; then
        echo "[INFO] Data view $title already exists, skipping"
    else
        echo "[INFO] Creating data view for $title indices (allowNoIndex)"

        local request_body="{\"data_view\":{\"name\":\"$name\",\"title\":\"$title\",\"timeFieldName\":\"@timestamp\",\"allowNoIndex\":true}}"
        curl -fsS -X POST "$KB_HOST/api/data_views/data_view?allowNoIndex=true" \
            -H 'kbn-xsrf: true' \
            -H 'Content-Type: application/json' \
            -d "$request_body" \
            >/dev/null && echo "[OK] Created $name data view" || echo "[WARN] Failed to create $name data view"
    fi
}
|
||||
|
||||
create_data_views_idempotent() {
|
||||
echo "[INFO] Checking and creating data views..."
|
||||
|
||||
# 检查是否存在匹配的索引
|
||||
local train_indices=$(curl -s "$ES_HOST/_cat/indices/train-*?h=index" 2>/dev/null | wc -l || echo "0")
|
||||
local infer_indices=$(curl -s "$ES_HOST/_cat/indices/infer-*?h=index" 2>/dev/null | wc -l || echo "0")
|
||||
|
||||
# 创建 train 数据视图
|
||||
if [ "$train_indices" -gt 0 ]; then
|
||||
# 检查数据视图是否已存在
|
||||
local train_exists=$(curl -s "$KB_HOST/api/data_views" -H 'kbn-xsrf: true' 2>/dev/null | grep '"title":"train-\*"' | wc -l )
|
||||
|
||||
if [ "$train_exists" -eq 0 ]; then
|
||||
echo "[INFO] Creating data view for train-* indices"
|
||||
curl -fsS -X POST "$KB_HOST/api/data_views/data_view" \
|
||||
-H 'kbn-xsrf: true' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{"data_view":{"name":"train","title":"train-*","timeFieldName":"@timestamp"}}' \
|
||||
>/dev/null && echo "[OK] Created train data view" || echo "[WARN] Failed to create train data view"
|
||||
else
|
||||
echo "[INFO] Train data view already exists, skipping"
|
||||
fi
|
||||
else
|
||||
echo "[INFO] No train-* indices found, skipping train data view creation"
|
||||
fi
|
||||
|
||||
# 创建 infer 数据视图
|
||||
if [ "$infer_indices" -gt 0 ]; then
|
||||
# 检查数据视图是否已存在
|
||||
local infer_exists=$(curl -s "$KB_HOST/api/data_views" -H 'kbn-xsrf: true' 2>/dev/null | grep '"title":"infer-\*"' | wc -l )
|
||||
|
||||
if [ "$infer_exists" -eq 0 ]; then
|
||||
echo "[INFO] Creating data view for infer-* indices"
|
||||
curl -fsS -X POST "$KB_HOST/api/data_views/data_view" \
|
||||
-H 'kbn-xsrf: true' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{"data_view":{"name":"infer","title":"infer-*","timeFieldName":"@timestamp"}}' \
|
||||
>/dev/null && echo "[OK] Created infer data view" || echo "[WARN] Failed to create infer data view"
|
||||
else
|
||||
echo "[INFO] Infer data view already exists, skipping"
|
||||
fi
|
||||
else
|
||||
echo "[INFO] No infer-* indices found, skipping infer data view creation"
|
||||
fi
|
||||
create_or_ensure_data_view "train" "train-*"
|
||||
create_or_ensure_data_view "infer" "infer-*"
|
||||
}
|
||||
|
||||
# 主逻辑
|
||||
|
||||
@ -115,20 +115,32 @@ show_step "Health" "Check service health"
|
||||
echo "[INFO] Checking service health..."
|
||||
|
||||
# 检查 Elasticsearch 健康状态
|
||||
health_check_ok=1
|
||||
es_health=$(curl -s "http://localhost:9200/_cluster/health" | grep -o '"status":"[^"]*"' | cut -d'"' -f4)
|
||||
if [ "$es_health" = "green" ] || [ "$es_health" = "yellow" ]; then
|
||||
echo "✅ Elasticsearch health: $es_health"
|
||||
else
|
||||
echo "❌ Elasticsearch health: $es_health"
|
||||
health_check_ok=0
|
||||
fi
|
||||
|
||||
# 检查 Kibana 状态
|
||||
if curl -fs "http://localhost:5601/api/status" >/dev/null 2>&1; then
|
||||
kb_status="available"
|
||||
echo "✅ Kibana status: $kb_status"
|
||||
|
||||
data_views_json=$(curl -fs "http://localhost:5601/api/data_views" -H 'kbn-xsrf: true' 2>/dev/null || true)
|
||||
if echo "$data_views_json" | grep -F '"title":"train-*"' >/dev/null 2>&1 && \
|
||||
echo "$data_views_json" | grep -F '"title":"infer-*"' >/dev/null 2>&1; then
|
||||
echo "✅ Kibana data views: train-* and infer-* present"
|
||||
else
|
||||
echo "❌ Kibana data views missing: train-* or infer-*"
|
||||
health_check_ok=0
|
||||
fi
|
||||
else
|
||||
kb_status="unavailable"
|
||||
echo "⚠️ Kibana status: $kb_status"
|
||||
health_check_ok=0
|
||||
fi
|
||||
|
||||
# 检查 Fluent-Bit 指标
|
||||
@ -139,6 +151,13 @@ if [ "$fb_host01_uptime" -gt 0 ] && [ "$fb_host02_uptime" -gt 0 ]; then
|
||||
echo "✅ Fluent-Bit services: host01 uptime=${fb_host01_uptime}s, host02 uptime=${fb_host02_uptime}s"
|
||||
else
|
||||
echo "⚠️ Fluent-Bit services: host01 uptime=${fb_host01_uptime}s, host02 uptime=${fb_host02_uptime}s"
|
||||
health_check_ok=0
|
||||
fi
|
||||
|
||||
if [ "$health_check_ok" -eq 1 ]; then
|
||||
true
|
||||
else
|
||||
false
|
||||
fi
|
||||
|
||||
verify_step "Service health check"
|
||||
|
||||
@ -3,12 +3,13 @@ set -euo pipefail
|
||||
|
||||
usage() {
|
||||
cat >&2 <<'USAGE'
|
||||
Usage: $0 [--intranet] [--offline] [--tag <image_tag>]
|
||||
Usage: $0 [--intranet] [--offline] [--tag <image_tag>] [--no-cache]
|
||||
|
||||
Options:
|
||||
--intranet 使用指定的 PyPI 镜像源(默认清华镜像)。
|
||||
--offline 完全离线构建,依赖 offline_wheels/ 目录中的离线依赖包。
|
||||
--tag <image_tag> 自定义镜像标签,默认 argus-master:latest。
|
||||
--no-cache 不使用 Docker 构建缓存。
|
||||
USAGE
|
||||
}
|
||||
|
||||
@ -19,6 +20,7 @@ IMAGE_TAG="${IMAGE_TAG:-argus-master:latest}"
|
||||
DOCKERFILE="src/master/Dockerfile"
|
||||
BUILD_ARGS=()
|
||||
OFFLINE_MODE=0
|
||||
NO_CACHE=0
|
||||
|
||||
source "$PROJECT_ROOT/scripts/common/build_user.sh"
|
||||
load_build_user
|
||||
@ -45,6 +47,11 @@ while [[ "$#" -gt 0 ]]; do
|
||||
IMAGE_TAG="$2"
|
||||
shift 2
|
||||
;;
|
||||
--no-cache)
|
||||
NO_CACHE=1
|
||||
BUILD_ARGS+=("--no-cache")
|
||||
shift
|
||||
;;
|
||||
-h|--help)
|
||||
usage
|
||||
exit 0
|
||||
|
||||
12
src/sys/debug/.env.example
Normal file
@ -0,0 +1,12 @@
|
||||
# Generated by 01_bootstrap.sh
|
||||
SYS_DEBUG_PRIVATE_CORE=/absolute/path/to/private
|
||||
SYS_DEBUG_PRIVATE_NODEA=/absolute/path/to/private-nodea
|
||||
SYS_DEBUG_PRIVATE_NODEB=/absolute/path/to/private-nodeb
|
||||
SYS_DEBUG_TMP_DIR=/absolute/path/to/tmp
|
||||
SYS_DEBUG_NETWORK_NAME=argus-debug-net
|
||||
SYS_DEBUG_NETWORK_SUBNET=172.30.0.0/16
|
||||
SYS_DEBUG_NETWORK_GATEWAY=172.30.0.1
|
||||
SYS_DEBUG_PROJECT_NAME=argus-debug
|
||||
SYS_DEBUG_CONTAINER_PREFIX=argus-debug
|
||||
ARGUS_BUILD_UID=2133
|
||||
ARGUS_BUILD_GID=2015
|
||||
68
src/sys/debug/README.md
Normal file
@ -0,0 +1,68 @@
|
||||
# ARGUS 系统调试部署模式
|
||||
|
||||
该目录提供基于系统级 E2E 测试构建的调试部署流程,便于本地快速复现与排查问题。核心特性:
|
||||
|
||||
- 独立 docker 网络 `argus-debug-net`(默认子网 `172.30.0.0/16`),避免与 `src/sys/tests` 冲突。
|
||||
- 私有数据目录可通过参数自定义,例如 `--private-root /tmp/argus-debug`。
|
||||
- 默认保留调试过程生成的文件,避免 `down`/`bootstrap` 自动删除。
|
||||
|
||||
## 快速开始
|
||||
|
||||
```bash
|
||||
cd src/sys/debug
|
||||
|
||||
# 仅首次需要,创建 external 网络
|
||||
./scripts/network-create.sh
|
||||
|
||||
# 初始化目录/构建 agent/写入 .env
|
||||
./scripts/01_bootstrap.sh --private-root /tmp/argus-debug
|
||||
|
||||
# 启动调试栈
|
||||
./scripts/02_up.sh
|
||||
|
||||
# 根据需要执行验证脚本(03~08)
|
||||
./scripts/03_wait_ready.sh
|
||||
...
|
||||
|
||||
# 调试结束停止服务
|
||||
./scripts/09_down.sh
|
||||
|
||||
# 若需移除网络或数据
|
||||
./scripts/network-destroy.sh
|
||||
./scripts/clean-data.sh
|
||||
```
|
||||
|
||||
> **提示**:调试与测试栈不能同时运行,应保持 `src/sys/tests` 中的 `argus-sys` 栈已停止。
|
||||
|
||||
## 参数与环境变量
|
||||
|
||||
- `--private-root <path>`:同时指定核心服务与两个节点的私有目录根,脚本自动派生 `private`、`private-nodea`、`private-nodeb`。
|
||||
- `--private-core <path>`、`--private-nodea <path>`、`--private-nodeb <path>`:分别覆盖单独目录。
|
||||
- 环境变量可覆盖 `.env` 中写入的值,例如 `export SYS_DEBUG_NETWORK_NAME=my-debug-net`。
|
||||
- `.env` 文件字段:
|
||||
- `SYS_DEBUG_PRIVATE_CORE`
|
||||
- `SYS_DEBUG_PRIVATE_NODEA`
|
||||
- `SYS_DEBUG_PRIVATE_NODEB`
|
||||
- `SYS_DEBUG_TMP_DIR`
|
||||
- `SYS_DEBUG_NETWORK_NAME`
|
||||
- `SYS_DEBUG_NETWORK_SUBNET`
|
||||
- `SYS_DEBUG_NETWORK_GATEWAY`
|
||||
- `SYS_DEBUG_PROJECT_NAME`
|
||||
- `SYS_DEBUG_CONTAINER_PREFIX`
|
||||
- `ARGUS_BUILD_UID` / `ARGUS_BUILD_GID`
|
||||
|
||||
## 脚本说明
|
||||
|
||||
- `scripts/common.sh`:通用函数与环境加载。
|
||||
- `scripts/network-create.sh` / `network-destroy.sh`:管理 external 网络。
|
||||
- `scripts/00_debug_all.sh`:顺序执行 01~08(默认不执行 09)。
|
||||
- `scripts/clean-data.sh`:选择性清理宿主机私有数据。
|
||||
- `scripts/03_wait_ready.sh`:除了等待各服务就绪,还会在 Elasticsearch 就绪后自动将磁盘水位阈值放宽(97%/98%/99%),避免在磁盘紧张的调试环境中分片分配失败。
|
||||
- `scripts/08_restart_agent_reregister.sh`:将 node-b 切换到 `SYS_DEBUG_NODEB_FIXED_IP`(默认 `172.30.0.200`),如果目标地址与当前 IP 相同脚本会报错提醒重新选择地址。
|
||||
- 其它 `01~09` 与测试目录对应,但针对参数化路径及网络做了调整。
|
||||
|
||||
## 注意事项
|
||||
|
||||
- 若宿主机未安装 Docker,脚本将提示错误并退出。
|
||||
- 当指定的私有目录已存在数据时,脚本不会清理,请确认内容安全后再复用。
|
||||
- 与测试环境共用镜像:请提前执行仓库根目录的 `./build/build_images.sh`。
|
||||
147
src/sys/debug/docker-compose.yml
Normal file
@ -0,0 +1,147 @@
|
||||
version: "3.8"
|
||||
|
||||
networks:
|
||||
argus-debug-net:
|
||||
external: true
|
||||
name: ${SYS_DEBUG_NETWORK_NAME:-argus-debug-net}
|
||||
|
||||
services:
|
||||
bind:
|
||||
image: ${BIND_IMAGE_TAG:-argus-bind9:latest}
|
||||
container_name: ${SYS_DEBUG_CONTAINER_PREFIX:-argus-debug}-bind
|
||||
networks:
|
||||
argus-debug-net:
|
||||
ipv4_address: ${SYS_DEBUG_BIND_IP:-172.30.0.2}
|
||||
volumes:
|
||||
- ${SYS_DEBUG_PRIVATE_CORE}:/private
|
||||
restart: unless-stopped
|
||||
|
||||
master:
|
||||
image: ${MASTER_IMAGE_TAG:-argus-master:latest}
|
||||
container_name: ${SYS_DEBUG_CONTAINER_PREFIX:-argus-debug}-master
|
||||
depends_on:
|
||||
- bind
|
||||
environment:
|
||||
- OFFLINE_THRESHOLD_SECONDS=6
|
||||
- ONLINE_THRESHOLD_SECONDS=2
|
||||
- SCHEDULER_INTERVAL_SECONDS=1
|
||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
||||
ports:
|
||||
- "32300:3000"
|
||||
volumes:
|
||||
- ${SYS_DEBUG_PRIVATE_CORE}/argus/master:/private/argus/master
|
||||
- ${SYS_DEBUG_PRIVATE_CORE}/argus/metric/prometheus:/private/argus/metric/prometheus
|
||||
- ${SYS_DEBUG_PRIVATE_CORE}/argus/etc:/private/argus/etc
|
||||
networks:
|
||||
argus-debug-net:
|
||||
ipv4_address: ${SYS_DEBUG_MASTER_IP:-172.30.0.10}
|
||||
restart: unless-stopped
|
||||
|
||||
es:
|
||||
image: ${ES_IMAGE_TAG:-argus-elasticsearch:latest}
|
||||
container_name: ${SYS_DEBUG_CONTAINER_PREFIX:-argus-debug}-es
|
||||
environment:
|
||||
- discovery.type=single-node
|
||||
- xpack.security.enabled=false
|
||||
- ES_JAVA_OPTS=-Xms512m -Xmx512m
|
||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
||||
volumes:
|
||||
- ${SYS_DEBUG_PRIVATE_CORE}/argus/log/elasticsearch:/private/argus/log/elasticsearch
|
||||
- ${SYS_DEBUG_PRIVATE_CORE}/argus/etc:/private/argus/etc
|
||||
ports:
|
||||
- "9200:9200"
|
||||
networks:
|
||||
argus-debug-net:
|
||||
ipv4_address: ${SYS_DEBUG_ES_IP:-172.30.0.20}
|
||||
restart: unless-stopped
|
||||
|
||||
kibana:
|
||||
image: ${KIBANA_IMAGE_TAG:-argus-kibana:latest}
|
||||
container_name: ${SYS_DEBUG_CONTAINER_PREFIX:-argus-debug}-kibana
|
||||
environment:
|
||||
- ELASTICSEARCH_HOSTS=http://es.log.argus.com:9200
|
||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
||||
volumes:
|
||||
- ${SYS_DEBUG_PRIVATE_CORE}/argus/log/kibana:/private/argus/log/kibana
|
||||
- ${SYS_DEBUG_PRIVATE_CORE}/argus/etc:/private/argus/etc
|
||||
depends_on:
|
||||
- es
|
||||
ports:
|
||||
- "5601:5601"
|
||||
networks:
|
||||
argus-debug-net:
|
||||
ipv4_address: ${SYS_DEBUG_KIBANA_IP:-172.30.0.30}
|
||||
restart: unless-stopped
|
||||
|
||||
node-a:
|
||||
image: ubuntu:22.04
|
||||
container_name: ${SYS_DEBUG_CONTAINER_PREFIX:-argus-debug}-node-a
|
||||
hostname: ${SYS_DEBUG_NODEA_HOST:-dev-yyrshare-nbnyx10-cp2f-pod-0}
|
||||
depends_on:
|
||||
- master
|
||||
- bind
|
||||
- es
|
||||
environment:
|
||||
- MASTER_ENDPOINT=http://master.argus.com:3000
|
||||
- REPORT_INTERVAL_SECONDS=2
|
||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
||||
- ES_HOST=es
|
||||
- ES_PORT=9200
|
||||
- CLUSTER=local
|
||||
- RACK=dev
|
||||
volumes:
|
||||
- ${SYS_DEBUG_PRIVATE_NODEA}/argus/agent/${SYS_DEBUG_NODEA_HOST:-dev-yyrshare-nbnyx10-cp2f-pod-0}:/private/argus/agent/${SYS_DEBUG_NODEA_HOST:-dev-yyrshare-nbnyx10-cp2f-pod-0}
|
||||
- ../../agent/dist/argus-agent:/usr/local/bin/argus-agent:ro
|
||||
- ../tests/scripts/node_entrypoint.sh:/usr/local/bin/node-entrypoint.sh:ro
|
||||
- ../../log/fluent-bit/build/start-fluent-bit.sh:/assets/start-fluent-bit.sh:ro
|
||||
- ../../log/fluent-bit/build/etc:/assets/fluent-bit/etc:ro
|
||||
- ../../log/fluent-bit/build/packages:/assets/fluent-bit/packages:ro
|
||||
entrypoint:
|
||||
- /usr/local/bin/node-entrypoint.sh
|
||||
dns:
|
||||
- ${SYS_DEBUG_BIND_IP:-172.30.0.2}
|
||||
ports:
|
||||
- "2020:2020"
|
||||
networks:
|
||||
argus-debug-net:
|
||||
ipv4_address: ${SYS_DEBUG_NODEA_IP:-172.30.0.101}
|
||||
restart: unless-stopped
|
||||
|
||||
node-b:
|
||||
image: ubuntu:22.04
|
||||
container_name: ${SYS_DEBUG_CONTAINER_PREFIX:-argus-debug}-node-b
|
||||
hostname: ${SYS_DEBUG_NODEB_HOST:-dev-yyrshare-uuuu10-ep2f-pod-0}
|
||||
depends_on:
|
||||
- master
|
||||
- bind
|
||||
- es
|
||||
environment:
|
||||
- MASTER_ENDPOINT=http://master.argus.com:3000
|
||||
- REPORT_INTERVAL_SECONDS=2
|
||||
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
|
||||
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
|
||||
- ES_HOST=es
|
||||
- ES_PORT=9200
|
||||
- CLUSTER=local
|
||||
- RACK=dev
|
||||
volumes:
|
||||
- ${SYS_DEBUG_PRIVATE_NODEB}/argus/agent/${SYS_DEBUG_NODEB_HOST:-dev-yyrshare-uuuu10-ep2f-pod-0}:/private/argus/agent/${SYS_DEBUG_NODEB_HOST:-dev-yyrshare-uuuu10-ep2f-pod-0}
|
||||
- ../../agent/dist/argus-agent:/usr/local/bin/argus-agent:ro
|
||||
- ../tests/scripts/node_entrypoint.sh:/usr/local/bin/node-entrypoint.sh:ro
|
||||
- ../../log/fluent-bit/build/start-fluent-bit.sh:/assets/start-fluent-bit.sh:ro
|
||||
- ../../log/fluent-bit/build/etc:/assets/fluent-bit/etc:ro
|
||||
- ../../log/fluent-bit/build/packages:/assets/fluent-bit/packages:ro
|
||||
entrypoint:
|
||||
- /usr/local/bin/node-entrypoint.sh
|
||||
dns:
|
||||
- ${SYS_DEBUG_BIND_IP:-172.30.0.2}
|
||||
ports:
|
||||
- "2021:2020"
|
||||
networks:
|
||||
argus-debug-net:
|
||||
ipv4_address: ${SYS_DEBUG_NODEB_IP:-172.30.0.102}
|
||||
restart: unless-stopped
|
||||
24
src/sys/debug/scripts/00_debug_all.sh
Executable file
@ -0,0 +1,24 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
|
||||
SCRIPTS=(
|
||||
"01_bootstrap.sh"
|
||||
"02_up.sh"
|
||||
"03_wait_ready.sh"
|
||||
"04_verify_dns_routing.sh"
|
||||
"05_agent_register.sh"
|
||||
"06_write_health_and_assert.sh"
|
||||
"07_logs_send_and_assert.sh"
|
||||
"08_restart_agent_reregister.sh"
|
||||
)
|
||||
|
||||
for script in "${SCRIPTS[@]}"; do
|
||||
echo "[SYS-DEBUG] Running $script"
|
||||
"$SCRIPT_DIR/$script"
|
||||
echo "[SYS-DEBUG] $script completed"
|
||||
echo
|
||||
done
|
||||
|
||||
echo "[SYS-DEBUG] Complete. Run scripts/09_down.sh when finished (data retained)."
|
||||
210
src/sys/debug/scripts/01_bootstrap.sh
Executable file
@ -0,0 +1,210 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# shellcheck source=common.sh
|
||||
source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh"
|
||||
|
||||
PRIVATE_ROOT=""
|
||||
PRIVATE_CORE="$SYS_DEBUG_PRIVATE_CORE"
|
||||
PRIVATE_NODEA="$SYS_DEBUG_PRIVATE_NODEA"
|
||||
PRIVATE_NODEB="$SYS_DEBUG_PRIVATE_NODEB"
|
||||
TMP_DIR_VAL="$SYS_DEBUG_TMP_DIR"
|
||||
NETWORK_NAME="$SYS_DEBUG_NETWORK_NAME"
|
||||
NETWORK_SUBNET="$SYS_DEBUG_NETWORK_SUBNET"
|
||||
NETWORK_GATEWAY="$SYS_DEBUG_NETWORK_GATEWAY"
|
||||
PROJECT_NAME="$SYS_DEBUG_PROJECT_NAME"
|
||||
CONTAINER_PREFIX="$SYS_DEBUG_CONTAINER_PREFIX"
|
||||
NODEB_FIXED_IP=${SYS_DEBUG_NODEB_FIXED_IP:-172.30.0.200}
|
||||
|
||||
# Print command-line help for the bootstrap script on stdout.
usage() {
  # The delimiter stays unquoted so ${0##*/} expands to the script's basename.
  # In an unquoted here-document a trailing backslash joins the next line, so
  # the option list is written without backslashes to keep it on four lines.
  cat <<EOF
Usage: ${0##*/} [--private-root PATH] [--private-core PATH]
         [--private-nodea PATH] [--private-nodeb PATH]
         [--tmp-dir PATH] [--network-name NAME]
         [--network-subnet CIDR] [--network-gateway IP]

Prepare directories, build agent binary, and write .env for debug stack.
EOF
}
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case "$1" in
|
||||
--private-root)
|
||||
shift; [[ $# -gt 0 ]] || { echo "--private-root requires value" >&2; exit 1; }
|
||||
PRIVATE_ROOT="$1"
|
||||
;;
|
||||
--private-root=*)
|
||||
PRIVATE_ROOT="${1#*=}"
|
||||
;;
|
||||
--private-core)
|
||||
shift; [[ $# -gt 0 ]] || { echo "--private-core requires value" >&2; exit 1; }
|
||||
PRIVATE_CORE="$1"
|
||||
;;
|
||||
--private-core=*)
|
||||
PRIVATE_CORE="${1#*=}"
|
||||
;;
|
||||
--private-nodea)
|
||||
shift; [[ $# -gt 0 ]] || { echo "--private-nodea requires value" >&2; exit 1; }
|
||||
PRIVATE_NODEA="$1"
|
||||
;;
|
||||
--private-nodea=*)
|
||||
PRIVATE_NODEA="${1#*=}"
|
||||
;;
|
||||
--private-nodeb)
|
||||
shift; [[ $# -gt 0 ]] || { echo "--private-nodeb requires value" >&2; exit 1; }
|
||||
PRIVATE_NODEB="$1"
|
||||
;;
|
||||
--private-nodeb=*)
|
||||
PRIVATE_NODEB="${1#*=}"
|
||||
;;
|
||||
--tmp-dir)
|
||||
shift; [[ $# -gt 0 ]] || { echo "--tmp-dir requires value" >&2; exit 1; }
|
||||
TMP_DIR_VAL="$1"
|
||||
;;
|
||||
--tmp-dir=*)
|
||||
TMP_DIR_VAL="${1#*=}"
|
||||
;;
|
||||
--network-name)
|
||||
shift; [[ $# -gt 0 ]] || { echo "--network-name requires value" >&2; exit 1; }
|
||||
NETWORK_NAME="$1"
|
||||
;;
|
||||
--network-name=*)
|
||||
NETWORK_NAME="${1#*=}"
|
||||
;;
|
||||
--network-subnet)
|
||||
shift; [[ $# -gt 0 ]] || { echo "--network-subnet requires value" >&2; exit 1; }
|
||||
NETWORK_SUBNET="$1"
|
||||
;;
|
||||
--network-subnet=*)
|
||||
NETWORK_SUBNET="${1#*=}"
|
||||
;;
|
||||
--network-gateway)
|
||||
shift; [[ $# -gt 0 ]] || { echo "--network-gateway requires value" >&2; exit 1; }
|
||||
NETWORK_GATEWAY="$1"
|
||||
;;
|
||||
--network-gateway=*)
|
||||
NETWORK_GATEWAY="${1#*=}"
|
||||
;;
|
||||
-h|--help)
|
||||
usage
|
||||
exit 0
|
||||
;;
|
||||
*)
|
||||
echo "Unknown argument: $1" >&2
|
||||
usage >&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
shift
|
||||
done
|
||||
|
||||
if [[ -n "$PRIVATE_ROOT" ]]; then
|
||||
PRIVATE_CORE="$PRIVATE_ROOT/private"
|
||||
PRIVATE_NODEA="$PRIVATE_ROOT/private-nodea"
|
||||
PRIVATE_NODEB="$PRIVATE_ROOT/private-nodeb"
|
||||
fi
|
||||
|
||||
PRIVATE_CORE=$(abs_path "$PRIVATE_CORE")
|
||||
PRIVATE_NODEA=$(abs_path "$PRIVATE_NODEA")
|
||||
PRIVATE_NODEB=$(abs_path "$PRIVATE_NODEB")
|
||||
TMP_DIR_VAL=$(abs_path "$TMP_DIR_VAL")
|
||||
|
||||
log "Preparing directories under $PRIVATE_CORE"
|
||||
mkdir -p \
|
||||
"$PRIVATE_CORE/argus/etc" \
|
||||
"$PRIVATE_CORE/argus/bind" \
|
||||
"$PRIVATE_CORE/argus/master" \
|
||||
"$PRIVATE_CORE/argus/metric/prometheus" \
|
||||
"$PRIVATE_CORE/argus/log/elasticsearch" \
|
||||
"$PRIVATE_CORE/argus/log/kibana" \
|
||||
"$PRIVATE_NODEA/argus/agent/$HOST_A/health" \
|
||||
"$PRIVATE_NODEB/argus/agent/$HOST_B/health" \
|
||||
"$TMP_DIR_VAL"
|
||||
|
||||
log "Aligning ownership for core directories"
|
||||
chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" \
|
||||
"$PRIVATE_CORE/argus/log/elasticsearch" \
|
||||
"$PRIVATE_CORE/argus/log/kibana" \
|
||||
"$PRIVATE_CORE/argus/etc" 2>/dev/null || true
|
||||
|
||||
log "Distributing update-dns.sh"
|
||||
BIND_UPDATE_SRC="$REPO_ROOT/src/bind/build/update-dns.sh"
|
||||
BIND_UPDATE_DEST="$PRIVATE_CORE/argus/etc/update-dns.sh"
|
||||
if [[ -f "$BIND_UPDATE_SRC" ]]; then
|
||||
cp "$BIND_UPDATE_SRC" "$BIND_UPDATE_DEST"
|
||||
chmod +x "$BIND_UPDATE_DEST"
|
||||
else
|
||||
echo "[WARN] Missing $BIND_UPDATE_SRC" >&2
|
||||
fi
|
||||
|
||||
require_docker
|
||||
|
||||
# Abort the script unless the given image is present in the local Docker
# image store.
#   $1 - image reference to look up (name:tag)
# Exits with status 1 and a hint on stderr when the image is missing.
ensure_image() {
  local tag="$1"
  # Guard clause: nothing to do when the image can be inspected.
  docker image inspect "$tag" >/dev/null 2>&1 && return 0
  echo "[ERR] Missing image: $tag. Run ./build/build_images.sh" >&2
  exit 1
}
|
||||
|
||||
log "Ensuring required images exist"
|
||||
ensure_image "${ES_IMAGE_TAG:-argus-elasticsearch:latest}"
|
||||
ensure_image "${KIBANA_IMAGE_TAG:-argus-kibana:latest}"
|
||||
ensure_image "${BIND_IMAGE_TAG:-argus-bind9:latest}"
|
||||
ensure_image "${MASTER_IMAGE_TAG:-argus-master:latest}"
|
||||
|
||||
log "Building agent binary"
|
||||
pushd "$REPO_ROOT/src/agent" >/dev/null
|
||||
./scripts/build_binary.sh
|
||||
popd >/dev/null
|
||||
|
||||
AGENT_BIN="$REPO_ROOT/src/agent/dist/argus-agent"
|
||||
if [[ ! -x "$AGENT_BIN" ]]; then
|
||||
echo "[ERR] Agent binary not found at $AGENT_BIN" >&2
|
||||
exit 1
|
||||
fi
|
||||
echo "$AGENT_BIN" > "$TMP_DIR_VAL/agent_binary_path"
|
||||
|
||||
log "Preparing environment file contents"
|
||||
tmp_env="$(mktemp)"
|
||||
cat > "$tmp_env" <<EOF
|
||||
SYS_DEBUG_PRIVATE_CORE=$PRIVATE_CORE
|
||||
SYS_DEBUG_PRIVATE_NODEA=$PRIVATE_NODEA
|
||||
SYS_DEBUG_PRIVATE_NODEB=$PRIVATE_NODEB
|
||||
SYS_DEBUG_TMP_DIR=$TMP_DIR_VAL
|
||||
SYS_DEBUG_NETWORK_NAME=$NETWORK_NAME
|
||||
SYS_DEBUG_NETWORK_SUBNET=$NETWORK_SUBNET
|
||||
SYS_DEBUG_NETWORK_GATEWAY=$NETWORK_GATEWAY
|
||||
SYS_DEBUG_PROJECT_NAME=$PROJECT_NAME
|
||||
SYS_DEBUG_CONTAINER_PREFIX=$CONTAINER_PREFIX
|
||||
SYS_DEBUG_NODEA_HOST=$HOST_A
|
||||
SYS_DEBUG_NODEB_HOST=$HOST_B
|
||||
SYS_DEBUG_BIND_IP=${SYS_DEBUG_BIND_IP:-172.30.0.2}
|
||||
SYS_DEBUG_MASTER_IP=${SYS_DEBUG_MASTER_IP:-172.30.0.10}
|
||||
SYS_DEBUG_ES_IP=${SYS_DEBUG_ES_IP:-172.30.0.20}
|
||||
SYS_DEBUG_KIBANA_IP=${SYS_DEBUG_KIBANA_IP:-172.30.0.30}
|
||||
SYS_DEBUG_NODEA_IP=${SYS_DEBUG_NODEA_IP:-172.30.0.101}
|
||||
SYS_DEBUG_NODEB_IP=${SYS_DEBUG_NODEB_IP:-172.30.0.102}
|
||||
SYS_DEBUG_NODEB_FIXED_IP=$NODEB_FIXED_IP
|
||||
ARGUS_BUILD_UID=$ARGUS_BUILD_UID
|
||||
ARGUS_BUILD_GID=$ARGUS_BUILD_GID
|
||||
EOF
|
||||
|
||||
if [[ -f "$ENV_FILE" ]]; then
|
||||
if cmp -s "$tmp_env" "$ENV_FILE"; then
|
||||
log ".env already up-to-date"
|
||||
rm -f "$tmp_env"
|
||||
if [[ ! -f "$DEBUG_ROOT/.env.lock" ]]; then
|
||||
cp "$ENV_FILE" "$DEBUG_ROOT/.env.lock"
|
||||
fi
|
||||
else
|
||||
mv "$ENV_FILE" "$ENV_FILE.bak"
|
||||
mv "$tmp_env" "$ENV_FILE"
|
||||
cp "$ENV_FILE" "$DEBUG_ROOT/.env.lock"
|
||||
log "Bootstrap updated .env (previous saved at ${ENV_FILE}.bak)"
|
||||
fi
|
||||
else
|
||||
mv "$tmp_env" "$ENV_FILE"
|
||||
cp "$ENV_FILE" "$DEBUG_ROOT/.env.lock"
|
||||
log "Bootstrap created .env at $ENV_FILE"
|
||||
fi
|
||||
19
src/sys/debug/scripts/02_up.sh
Executable file
@ -0,0 +1,19 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# shellcheck source=common.sh
|
||||
source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh"
|
||||
|
||||
ensure_env_file
|
||||
ensure_paths_defined
|
||||
require_docker
|
||||
|
||||
if ! docker network inspect "$SYS_DEBUG_NETWORK_NAME" >/dev/null 2>&1; then
|
||||
echo "[ERR] Network $SYS_DEBUG_NETWORK_NAME not found. Run scripts/network-create.sh first." >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
log "Starting debug stack on project $SYS_DEBUG_PROJECT_NAME"
|
||||
compose up -d
|
||||
|
||||
log "Services started: master:32300 es:9200 kibana:5601 node-a:2020 node-b:2021"
|
||||
84
src/sys/debug/scripts/03_wait_ready.sh
Executable file
@ -0,0 +1,84 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# shellcheck source=common.sh
|
||||
source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh"
|
||||
|
||||
ensure_env_file
|
||||
ensure_paths_defined
|
||||
|
||||
service_id() {
|
||||
compose ps -q "$1"
|
||||
}
|
||||
|
||||
# Poll a URL until it answers with a successful HTTP status.
#   $1 - URL to probe
#   $2 - maximum number of attempts (default 120); 5 s pause after each miss
# Returns 0 as soon as one request succeeds, 1 once every attempt has failed.
wait_http() {
  local url="$1"
  local attempts="${2:-120}"
  local i
  for (( i = 1; i <= attempts; i++ )); do
    # -f turns HTTP >= 400 into a failure. The timeouts bound each probe so a
    # stalled connection cannot hold the loop beyond its retry budget.
    if curl -fsS --connect-timeout 5 --max-time 10 "$url" >/dev/null 2>&1; then
      return 0
    fi
    echo "[..] waiting $url ($i/$attempts)"
    sleep 5
  done
  echo "[ERR] Timeout waiting for $url" >&2
  return 1
}
|
||||
|
||||
log "Waiting for ES/Kibana/Master/Fluent Bit/Bind"
|
||||
|
||||
attempt=1; max=120
|
||||
while (( attempt <= max )); do
|
||||
if curl -fsS "http://localhost:9200/_cluster/health?wait_for_status=yellow&timeout=1s" >/dev/null 2>&1; then
|
||||
break
|
||||
fi
|
||||
echo "[..] waiting ES ($attempt/$max)"
|
||||
sleep 5
|
||||
((attempt++))
|
||||
done
|
||||
if (( attempt > max )); then
|
||||
echo "[ERR] ES not ready" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
log "Applying relaxed ES disk watermarks for debug"
|
||||
curl -fsS -XPUT "http://localhost:9200/_cluster/settings" \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"transient": {
|
||||
"cluster.routing.allocation.disk.watermark.low": "99%",
|
||||
"cluster.routing.allocation.disk.watermark.high": "99%",
|
||||
"cluster.routing.allocation.disk.watermark.flood_stage": "99%"
|
||||
}
|
||||
}' >/dev/null || echo "[WARN] Failed to adjust ES watermarks"
|
||||
|
||||
log "Waiting for Kibana to be available (HTTP 200)"
kb_url="http://localhost:5601/api/status"
kb_max=180
kb_ready=false
for (( kb_attempt = 1; kb_attempt <= kb_max; kb_attempt++ )); do
  # A single request provides both body and status: --write-out appends the
  # status code on its own final line, so both always describe the same reply.
  kb_resp=$(curl -sS -w '\n%{http_code}' "$kb_url" 2>/dev/null || true)
  code="${kb_resp##*$'\n'}"
  body="${kb_resp%$'\n'*}"
  # curl already prints 000 when no HTTP reply arrived; this default only
  # covers the case where curl produced no output at all.
  code="${code:-000}"
  # Here-string instead of `echo | grep -q`: with pipefail an early grep exit
  # could otherwise fail the pipeline through SIGPIPE on a large body.
  if [[ "$code" == "200" ]] && grep -q '"level":"available"' <<<"$body"; then
    log "Kibana available"
    kb_ready=true
    break
  fi
  echo "[..] waiting kibana 200 ($kb_attempt/$kb_max), last_code=$code"
  sleep 5
done
if [[ "$kb_ready" != true ]]; then
  echo "[ERR] Kibana did not reach HTTP 200" >&2
  exit 1
fi
|
||||
|
||||
wait_http "http://localhost:32300/readyz" 120
|
||||
wait_http "http://localhost:2020/api/v2/metrics" 120
|
||||
wait_http "http://localhost:2021/api/v2/metrics" 120
|
||||
|
||||
BIND_ID="$(service_id bind)"
|
||||
if [[ -n "$BIND_ID" ]]; then
|
||||
docker exec "$BIND_ID" named-checkconf >/dev/null
|
||||
else
|
||||
echo "[WARN] bind container id not found" >&2
|
||||
fi
|
||||
|
||||
log "All services are ready"
|
||||
51
src/sys/debug/scripts/04_verify_dns_routing.sh
Executable file
@ -0,0 +1,51 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# shellcheck source=common.sh
|
||||
source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh"
|
||||
|
||||
ensure_env_file
|
||||
ensure_paths_defined
|
||||
|
||||
service_id() {
|
||||
compose ps -q "$1"
|
||||
}
|
||||
|
||||
log "Verifying DNS routing via bind"
|
||||
|
||||
MASTER_FILE="$SYS_DEBUG_PRIVATE_CORE/argus/etc/master.argus.com"
|
||||
if [[ ! -f "$MASTER_FILE" ]]; then
|
||||
echo "[ERR] master.argus.com file missing at $MASTER_FILE" >&2
|
||||
exit 1
|
||||
fi
|
||||
MASTER_IP_HOST="$(tr -d '\r\n' < "$MASTER_FILE" || true)"
|
||||
log "master.argus.com file content: $MASTER_IP_HOST"
|
||||
|
||||
BIN_ID="$(service_id bind)"
|
||||
if [[ -n "$BIN_ID" ]]; then
|
||||
DIG_IP="$(docker exec "$BIN_ID" dig +short master.argus.com A | tail -n1 || true)"
|
||||
log "dig(master.argus.com) from bind container -> $DIG_IP"
|
||||
if [[ -z "$DIG_IP" ]]; then
|
||||
echo "[ERR] bind did not resolve master.argus.com" >&2
|
||||
exit 1
|
||||
fi
|
||||
else
|
||||
echo "[WARN] bind container not found; skip dig" >&2
|
||||
fi
|
||||
|
||||
for node in node-a node-b; do
|
||||
CID="$(service_id "$node")"
|
||||
if [[ -z "$CID" ]]; then
|
||||
echo "[ERR] Container for $node not found" >&2
|
||||
exit 1
|
||||
fi
|
||||
log "Checking resolution inside $node"
|
||||
if ! docker exec "$CID" getent hosts master.argus.com >/dev/null 2>&1; then
|
||||
echo "[ERR] $node cannot resolve master.argus.com" >&2
|
||||
exit 1
|
||||
fi
|
||||
RES="$(docker exec "$CID" getent hosts master.argus.com | awk '{print $1}' | head -n1)"
|
||||
log "$node resolved master.argus.com -> $RES"
|
||||
done
|
||||
|
||||
log "DNS routing verified"
|
||||
84
src/sys/debug/scripts/05_agent_register.sh
Executable file
@ -0,0 +1,84 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# shellcheck source=common.sh
|
||||
source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh"
|
||||
|
||||
ensure_env_file
|
||||
ensure_paths_defined
|
||||
|
||||
TMP_DIR_LOCAL="$TMP_DIR"
|
||||
mkdir -p "$TMP_DIR_LOCAL"
|
||||
|
||||
API_BASE="http://localhost:32300/api/v1/master"
|
||||
|
||||
log "Waiting for agent nodes to register"
|
||||
|
||||
# Look up a node by name in a saved node-list JSON file.
#   $1 - node name compared against each entry's "name" field
#   $2 - file that receives the node id when a match is found
#   $3 - path of a JSON file holding an array of node objects
# Prints the id on stdout when found; otherwise prints nothing and does not
# write to $2.
extract_node() {
  local node_name="$1"
  local id_file="$2"
  local list_file="$3"
  python3 - "$node_name" "$id_file" "$list_file" <<'PY'
import json
import pathlib
import sys

wanted = sys.argv[1]
id_path = pathlib.Path(sys.argv[2])
list_path = sys.argv[3]

with open(list_path, 'r') as handle:
    entries = json.load(handle)

# First entry whose "name" matches, or None when the node is absent.
match = next((item for item in entries if item.get("name") == wanted), None)
if match:
    node_id = match["id"]
    id_path.write_text(node_id)
    print(node_id)
PY
}
|
||||
|
||||
# Drop ids left behind by an earlier run. The success checks below look only
# at these files, so stale content would otherwise pass without any agent
# having registered during this run.
rm -f "$TMP_DIR_LOCAL/node_id_a" "$TMP_DIR_LOCAL/node_id_b"

ID_A=""; ID_B=""
# Poll the node list for up to 60 rounds, 2 s apart.
for _ in {1..60}; do
  sleep 2
  resp=$(curl -fsS "$API_BASE/nodes" 2>/dev/null || true)
  [[ -z "$resp" ]] && continue
  # Only a reply starting with '[' is treated as a node list. A pattern match
  # replaces `echo | head -c1 | grep`, which could fail under pipefail when
  # head closes the pipe early on a large reply.
  [[ "$resp" == "["* ]] || continue
  echo "$resp" > "$TMP_DIR_LOCAL/nodes_list.json"
  # extract_node writes the id file only when the node is present in the list.
  ID_A=$(extract_node "$HOST_A" "$TMP_DIR_LOCAL/node_id_a" "$TMP_DIR_LOCAL/nodes_list.json" 2>/dev/null || true)
  ID_B=$(extract_node "$HOST_B" "$TMP_DIR_LOCAL/node_id_b" "$TMP_DIR_LOCAL/nodes_list.json" 2>/dev/null || true)
  if [[ -s "$TMP_DIR_LOCAL/node_id_a" && -s "$TMP_DIR_LOCAL/node_id_b" ]]; then
    break
  fi
done

if [[ ! -s "$TMP_DIR_LOCAL/node_id_a" || ! -s "$TMP_DIR_LOCAL/node_id_b" ]]; then
  echo "[ERR] Agents did not register in time" >&2
  exit 1
fi
|
||||
|
||||
node_detail() {
|
||||
local id="$1"; local out="$2"
|
||||
curl -fsS "$API_BASE/nodes/$id" -o "$out"
|
||||
}
|
||||
|
||||
node_detail "$(cat "$TMP_DIR_LOCAL/node_id_a")" "$TMP_DIR_LOCAL/detail_a.json"
|
||||
node_detail "$(cat "$TMP_DIR_LOCAL/node_id_b")" "$TMP_DIR_LOCAL/detail_b.json"
|
||||
|
||||
python3 - "$TMP_DIR_LOCAL/detail_a.json" "$TMP_DIR_LOCAL/initial_ip_a" <<'PY'
|
||||
import json, sys, pathlib
|
||||
node=json.load(open(sys.argv[1]))
|
||||
ip=node.get("meta_data",{}).get("ip")
|
||||
assert ip, "missing ip"
|
||||
pathlib.Path(sys.argv[2]).write_text(ip)
|
||||
PY
|
||||
|
||||
python3 - "$TMP_DIR_LOCAL/detail_b.json" "$TMP_DIR_LOCAL/initial_ip_b" <<'PY'
|
||||
import json, sys, pathlib
|
||||
node=json.load(open(sys.argv[1]))
|
||||
ip=node.get("meta_data",{}).get("ip")
|
||||
assert ip, "missing ip"
|
||||
pathlib.Path(sys.argv[2]).write_text(ip)
|
||||
PY
|
||||
|
||||
NODE_JSON_A="$SYS_DEBUG_PRIVATE_NODEA/argus/agent/$HOST_A/node.json"
|
||||
NODE_JSON_B="$SYS_DEBUG_PRIVATE_NODEB/argus/agent/$HOST_B/node.json"
|
||||
|
||||
[[ -f "$NODE_JSON_A" ]] || { echo "[ERR] node.json missing for $HOST_A" >&2; exit 1; }
|
||||
[[ -f "$NODE_JSON_B" ]] || { echo "[ERR] node.json missing for $HOST_B" >&2; exit 1; }
|
||||
|
||||
log "Agents registered: $(cat "$TMP_DIR_LOCAL/node_id_a") , $(cat "$TMP_DIR_LOCAL/node_id_b")"
|
||||
78
src/sys/debug/scripts/06_write_health_and_assert.sh
Executable file
@ -0,0 +1,78 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# shellcheck source=common.sh
|
||||
source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh"
|
||||
|
||||
ensure_env_file
|
||||
ensure_paths_defined
|
||||
|
||||
API_BASE="http://localhost:32300/api/v1/master"
|
||||
|
||||
HEALTH_A="$SYS_DEBUG_PRIVATE_NODEA/argus/agent/$HOST_A/health"
|
||||
HEALTH_B="$SYS_DEBUG_PRIVATE_NODEB/argus/agent/$HOST_B/health"
|
||||
|
||||
write_health() {
|
||||
local dir="$1"; mkdir -p "$dir"
|
||||
cat > "$dir/log-fluentbit.json" <<JSON
|
||||
{ "status": "healthy", "timestamp": "2025-10-13T12:05:00Z" }
|
||||
JSON
|
||||
cat > "$dir/metric-node-exporter.json" <<JSON
|
||||
{ "status": "healthy", "timestamp": "2025-10-13T12:05:00Z" }
|
||||
JSON
|
||||
}
|
||||
|
||||
log "Writing health files for both nodes"
|
||||
write_health "$HEALTH_A"
|
||||
write_health "$HEALTH_B"
|
||||
|
||||
ID_A="$TMP_DIR/node_id_a"
|
||||
ID_B="$TMP_DIR/node_id_b"
|
||||
|
||||
[[ -f "$ID_A" && -f "$ID_B" ]] || { echo "[ERR] node id files missing in $TMP_DIR" >&2; exit 1; }
|
||||
|
||||
ID_A_VAL="$(cat "$ID_A")"
|
||||
ID_B_VAL="$(cat "$ID_B")"
|
||||
|
||||
# Wait until the master reports both expected health keys for a node.
#   $1 - node id to query under $API_BASE/nodes/
# Polls up to 40 times, 2 s apart, saving each non-empty reply to
# $TMP_DIR/node_<id>_detail.json. Returns 0 once the reply's "health" object
# contains both "log-fluentbit" and "metric-node-exporter", 1 otherwise.
check_health() {
  local id="$1"
  local tries=40
  local detail="$TMP_DIR/node_${id}_detail.json"
  # Keep loop state local so nothing leaks into the calling script.
  local resp attempt
  for (( attempt = 1; attempt <= tries; attempt++ )); do
    sleep 2
    resp=$(curl -fsS "$API_BASE/nodes/$id" 2>/dev/null || true)
    [[ -z "$resp" ]] && continue
    echo "$resp" > "$detail"
    # The helper exits 0 only when both keys are present; any parse error
    # makes it exit non-zero, which simply triggers another round.
    if python3 - "$detail" <<'PY'
import json
import sys

with open(sys.argv[1]) as fh:
    node = json.load(fh)
health = node.get("health", {})
required = ("log-fluentbit", "metric-node-exporter")
sys.exit(0 if all(key in health for key in required) else 1)
PY
    then
      return 0
    fi
  done
  return 1
}
|
||||
|
||||
check_health "$ID_A_VAL" || { echo "[ERR] health keys not reported for node A" >&2; exit 1; }
|
||||
check_health "$ID_B_VAL" || { echo "[ERR] health keys not reported for node B" >&2; exit 1; }
|
||||
|
||||
NODES_JSON="$SYS_DEBUG_PRIVATE_CORE/argus/metric/prometheus/nodes.json"
|
||||
if [[ ! -f "$NODES_JSON" ]]; then
|
||||
echo "[ERR] nodes.json missing at $NODES_JSON" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
python3 - "$NODES_JSON" <<'PY'
|
||||
import json,sys
|
||||
with open(sys.argv[1]) as h:
|
||||
nodes=json.load(h)
|
||||
if not isinstance(nodes, list):
|
||||
raise SystemExit("nodes.json expected list")
|
||||
if len(nodes) != 2:
|
||||
raise SystemExit(f"expected 2 nodes online, got {len(nodes)}")
|
||||
PY
|
||||
|
||||
log "Health reported and nodes.json has 2 online nodes"
|
||||
73
src/sys/debug/scripts/07_logs_send_and_assert.sh
Executable file
@ -0,0 +1,73 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# shellcheck source=common.sh
|
||||
source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh"
|
||||
|
||||
ensure_env_file
|
||||
ensure_paths_defined
|
||||
|
||||
log "Sending logs and asserting ES counts"
|
||||
|
||||
# Print the document count of an Elasticsearch index or index pattern.
#   $1 - index name or wildcard pattern (e.g. "train-*")
# Always prints exactly one non-negative integer and returns 0. Prints 0 when
# the request fails or the reply has no "count" field (for example an error
# document), so callers can safely use the value in shell arithmetic.
get_count() {
  local idx="$1"
  local count
  # sed -n ... p emits only the captured number, so a reply without "count"
  # yields nothing instead of leaking the raw JSON. `|| true` keeps a failed
  # curl from propagating through pipefail to the caller.
  count=$(curl -s "http://localhost:9200/${idx}/_count?ignore_unavailable=true&allow_no_indices=true" \
    | sed -nE 's/.*"count":([0-9]+).*/\1/p' || true)
  # Keep the first line only, in case the reply spans several lines.
  count="${count%%$'\n'*}"
  echo "${count:-0}"
}
|
||||
|
||||
train0=$(get_count "train-*")
|
||||
infer0=$(get_count "infer-*")
|
||||
base=$((train0 + infer0))
|
||||
log "initial counts: train=${train0} infer=${infer0} total=${base}"
|
||||
|
||||
service_id() {
|
||||
compose ps -q "$1"
|
||||
}
|
||||
|
||||
# Append three demo log lines (two training, one inference) inside a node
# container.
#   $1 - id of the container to exec into
#   $2 - host tag embedded in every message
# The timestamp is taken inside the container at write time; \$ and \" are
# escaped so they reach the container's shell unexpanded.
send_logs() {
  local container="$1"
  local tag="$2"
  docker exec "$container" sh -lc 'mkdir -p /logs/train /logs/infer'
  docker exec "$container" sh -lc "ts=\$(date '+%F %T'); echo \"\$ts INFO [$tag] training step=1 loss=1.23 model=bert\" >> /logs/train/train-demo.log"
  docker exec "$container" sh -lc "ts=\$(date '+%F %T'); echo \"\$ts INFO [$tag] training step=2 loss=1.10 model=bert\" >> /logs/train/train-demo.log"
  docker exec "$container" sh -lc "ts=\$(date '+%F %T'); echo \"\$ts WARN [$tag] inference slow on batch=2 latency=1.9s\" >> /logs/infer/infer-demo.log"
}
|
||||
|
||||
CID_A="$(service_id node-a)"
|
||||
CID_B="$(service_id node-b)"
|
||||
|
||||
[[ -n "$CID_A" && -n "$CID_B" ]] || { echo "[ERR] node containers not found" >&2; exit 1; }
|
||||
|
||||
send_logs "$CID_A" "host01"
|
||||
send_logs "$CID_B" "host02"
|
||||
|
||||
log "Waiting for ES to ingest"
|
||||
sleep 10
|
||||
|
||||
train1=$(get_count "train-*")
|
||||
infer1=$(get_count "infer-*")
|
||||
final=$((train1 + infer1))
|
||||
log "final counts: train=${train1} infer=${infer1} total=${final}"
|
||||
|
||||
if (( final <= base )); then
|
||||
echo "[ERR] ES total did not increase (${base} -> ${final})" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if (( final < 4 )); then
|
||||
echo "[ERR] ES total below expected threshold: ${final} < 4" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
es_health=$(curl -s "http://localhost:9200/_cluster/health" | grep -o '"status":"[^"]*"' | cut -d'"' -f4)
|
||||
if [[ "$es_health" != "green" && "$es_health" != "yellow" ]]; then
|
||||
echo "[ERR] ES health not green/yellow: $es_health" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if ! curl -fs "http://localhost:5601/api/status" >/dev/null 2>&1; then
|
||||
echo "[WARN] Kibana status endpoint not available"
|
||||
fi
|
||||
|
||||
log "ES counts increased and services healthy"
|
||||
110
src/sys/debug/scripts/08_restart_agent_reregister.sh
Executable file
@ -0,0 +1,110 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# shellcheck source=common.sh
|
||||
source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh"
|
||||
|
||||
ensure_env_file
|
||||
ensure_paths_defined
|
||||
|
||||
API_BASE="http://localhost:32300/api/v1/master"
|
||||
NODE_ENTRYPOINT="$DEBUG_ROOT/../tests/scripts/node_entrypoint.sh"
|
||||
[[ -f "$NODE_ENTRYPOINT" ]] || { echo "[ERR] node entrypoint script missing at $NODE_ENTRYPOINT" >&2; exit 1; }
|
||||
|
||||
TARGET_FIXED_IP="${SYS_DEBUG_NODEB_FIXED_IP:-172.30.0.200}"
|
||||
|
||||
ID_B_FILE="$TMP_DIR/node_id_b"
|
||||
IP_INIT_FILE="$TMP_DIR/initial_ip_b"
|
||||
[[ -f "$ID_B_FILE" && -f "$IP_INIT_FILE" ]] || { echo "[ERR] Required node id/ip files missing in $TMP_DIR" >&2; exit 1; }
|
||||
|
||||
ID_B="$(cat "$ID_B_FILE")"
|
||||
IP0_B="$(cat "$IP_INIT_FILE")"
|
||||
|
||||
DETAIL_BEFORE="$TMP_DIR/node_b_before.json"
|
||||
curl -fsS "$API_BASE/nodes/$ID_B" -o "$DETAIL_BEFORE"
|
||||
LAST0=$(python3 - "$DETAIL_BEFORE" <<'PY'
|
||||
import json,sys
|
||||
node=json.load(open(sys.argv[1]))
|
||||
print(node.get("last_updated",""))
|
||||
PY
|
||||
)
|
||||
IP_BEFORE=$(python3 - "$DETAIL_BEFORE" <<'PY'
|
||||
import json,sys
|
||||
node=json.load(open(sys.argv[1]))
|
||||
print(node.get("meta_data",{}).get("ip",""))
|
||||
PY
|
||||
)
|
||||
|
||||
if [[ "$IP_BEFORE" != "$IP0_B" ]]; then
|
||||
echo "[ERR] Expected initial IP $IP0_B for node-b, got $IP_BEFORE" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ "$IP_BEFORE" == "$TARGET_FIXED_IP" ]]; then
|
||||
echo "[ERR] node-b current IP $IP_BEFORE already matches target $TARGET_FIXED_IP. Configure SYS_DEBUG_NODEB_FIXED_IP to a different address before rerun." >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
service_id() {
|
||||
compose ps -q "$1"
|
||||
}
|
||||
|
||||
# Validate every prerequisite before touching the running container, so a
# failed check cannot leave the stack without node-b.
# NOTE(review): require_docker is defined in common.sh; presumably it aborts
# when the Docker CLI is unavailable - confirm.
require_docker

# The path file is read here and must exist before `cat` is attempted.
AGENT_BIN_PATH_FILE="$TMP_DIR/agent_binary_path"
[[ -f "$AGENT_BIN_PATH_FILE" ]] || { echo "[ERR] Agent binary path missing in $TMP_DIR" >&2; exit 1; }
AGENT_BIN_PATH="$(cat "$AGENT_BIN_PATH_FILE")"
[[ -f "$AGENT_BIN_PATH" ]] || { echo "[ERR] Agent binary path missing in $TMP_DIR" >&2; exit 1; }

CONTAINER_NAME="${SYS_DEBUG_CONTAINER_PREFIX:-argus-debug}-node-b"

log "Recreating node-b (old IP $IP_BEFORE) with static IP $TARGET_FIXED_IP"
# Best effort: remove the compose-managed service container, then any
# container still holding the name (e.g. one started by `docker run`).
compose rm -sf node-b >/dev/null 2>&1 || true
docker rm -f "$CONTAINER_NAME" >/dev/null 2>&1 || true
|
||||
|
||||
docker run -d \
|
||||
--name "$CONTAINER_NAME" \
|
||||
--hostname "$HOST_B" \
|
||||
--network "$SYS_DEBUG_NETWORK_NAME" \
|
||||
--ip "$TARGET_FIXED_IP" \
|
||||
--dns "${SYS_DEBUG_BIND_IP:-172.30.0.2}" \
|
||||
-e MASTER_ENDPOINT=http://master.argus.com:3000 \
|
||||
-e REPORT_INTERVAL_SECONDS=2 \
|
||||
-e ARGUS_BUILD_UID=$ARGUS_BUILD_UID \
|
||||
-e ARGUS_BUILD_GID=$ARGUS_BUILD_GID \
|
||||
-e ES_HOST=es \
|
||||
-e ES_PORT=9200 \
|
||||
-e CLUSTER=local \
|
||||
-e RACK=dev \
|
||||
-p 2021:2020 \
|
||||
-v "$SYS_DEBUG_PRIVATE_NODEB/argus/agent/$HOST_B:/private/argus/agent/$HOST_B" \
|
||||
-v "$AGENT_BIN_PATH:/usr/local/bin/argus-agent:ro" \
|
||||
-v "$NODE_ENTRYPOINT:/usr/local/bin/node-entrypoint.sh:ro" \
|
||||
-v "$REPO_ROOT/src/log/fluent-bit/build/start-fluent-bit.sh:/assets/start-fluent-bit.sh:ro" \
|
||||
-v "$REPO_ROOT/src/log/fluent-bit/build/etc:/assets/fluent-bit/etc:ro" \
|
||||
-v "$REPO_ROOT/src/log/fluent-bit/build/packages:/assets/fluent-bit/packages:ro" \
|
||||
--entrypoint /usr/local/bin/node-entrypoint.sh \
|
||||
ubuntu:22.04 >/dev/null
|
||||
|
||||
log "Waiting for node-b to re-register with new IP"
|
||||
for _ in {1..40}; do
|
||||
sleep 3
|
||||
if curl -fsS "$API_BASE/nodes/$ID_B" -o "$TMP_DIR/node_b_after.json"; then
|
||||
if python3 - "$TMP_DIR/node_b_after.json" "$LAST0" "$TARGET_FIXED_IP" <<'PY'
|
||||
import json,sys
|
||||
node=json.load(open(sys.argv[1]))
|
||||
last0=sys.argv[2]
|
||||
expected_ip=sys.argv[3]
|
||||
ip=node.get("meta_data",{}).get("ip")
|
||||
lu=node.get("last_updated")
|
||||
if ip == expected_ip and lu and lu != last0:
|
||||
sys.exit(0)
|
||||
sys.exit(1)
|
||||
PY
|
||||
then
|
||||
log "node-b IP updated: $IP_BEFORE -> $TARGET_FIXED_IP"
|
||||
exit 0
|
||||
fi
|
||||
fi
|
||||
done
|
||||
|
||||
echo "[ERR] node-b did not update to IP $TARGET_FIXED_IP in time" >&2
|
||||
exit 1
|
||||
13
src/sys/debug/scripts/09_down.sh
Executable file
@ -0,0 +1,13 @@
|
||||
#!/usr/bin/env bash
# Stop the debug compose stack. Host-side data directories are never touched.
set -euo pipefail

# shellcheck source=common.sh
source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh"

# Preconditions: a bootstrapped .env file and a docker CLI on PATH.
ensure_env_file
require_docker

log "Stopping debug stack (project $SYS_DEBUG_PROJECT_NAME)"

# Best effort: output is discarded and a failing `down` does not abort the script.
if ! compose down --remove-orphans >/dev/null 2>&1; then
  :
fi

log "Containers stopped. No host directories were removed."
|
||||
66
src/sys/debug/scripts/clean-data.sh
Executable file
@ -0,0 +1,66 @@
|
||||
#!/usr/bin/env bash
# Remove the debug stack's private/tmp directories after resetting their ownership
# to the invoking user. Asks for confirmation unless -y/--yes is given.
set -euo pipefail

# shellcheck source=common.sh
source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh"

ensure_env_file
ensure_paths_defined

FORCE=false
while [[ $# -gt 0 ]]; do
  case "$1" in
    -y|--yes)
      FORCE=true
      ;;
    -h|--help)
      cat <<USAGE
Usage: ${0##*/} [--yes]

Safely remove debug private directories after adjusting ownership.
USAGE
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      exit 1
      ;;
  esac
  shift
done

if [[ $FORCE == false ]]; then
  # `read` returns non-zero on EOF (e.g. stdin is closed or not a terminal).
  # Without the `|| true`, `set -e` would kill the script with no message;
  # an empty reply simply falls through to "Aborted" below.
  reply=""
  read -r -p "This will delete debug private directories. Continue? [y/N] " reply || true
  case "$reply" in
    y|Y|yes|YES)
      ;;
    *)
      echo "Aborted"
      exit 0
      ;;
  esac
fi

paths=(
  "$SYS_DEBUG_PRIVATE_CORE"
  "$SYS_DEBUG_PRIVATE_NODEA"
  "$SYS_DEBUG_PRIVATE_NODEB"
  "$SYS_DEBUG_TMP_DIR"
)

require_docker

image="ubuntu:22.04"
# Resolve the invoking user's uid:gid once instead of on every iteration.
owner="$(id -u):$(id -g)"

for dir in "${paths[@]}"; do
  [[ -d "$dir" ]] || continue
  log "Fixing ownership for $dir"
  # chown runs inside a container so files created by container users can be
  # handed back to the invoking user; a plain local chown is the fallback.
  if ! docker run --rm -v "$dir:/target" "$image" chown -R "$owner" /target >/dev/null 2>&1; then
    echo "[WARN] Failed to adjust ownership via $image, attempting local chown" >&2
    chown -R -- "$owner" "$dir" >/dev/null 2>&1 || true
  fi
  log "Removing $dir"
  # `--` keeps a path that starts with '-' from being parsed as an option.
  rm -rf -- "$dir"
done

log "Clean data completed"
|
||||
96
src/sys/debug/scripts/common.sh
Executable file
@ -0,0 +1,96 @@
|
||||
#!/usr/bin/env bash
# Shared environment, defaults and helper functions for the sys/debug scripts.
set -euo pipefail

# Directory layout, derived from this file's own location.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
DEBUG_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
REPO_ROOT="$(cd "$DEBUG_ROOT/../../.." && pwd)"
ENV_FILE="$DEBUG_ROOT/.env"

source "$REPO_ROOT/scripts/common/build_user.sh"
load_build_user

# Import .env when present; `set -a` exports every variable it assigns.
if [[ -f "$ENV_FILE" ]]; then
  set -a
  # shellcheck disable=SC1090
  source "$ENV_FILE"
  set +a
fi

# Fill in a default for every setting that is still unset or empty.
: "${SYS_DEBUG_NETWORK_NAME:=argus-debug-net}"
: "${SYS_DEBUG_NETWORK_SUBNET:=172.30.0.0/16}"
: "${SYS_DEBUG_NETWORK_GATEWAY:=172.30.0.1}"
: "${SYS_DEBUG_PROJECT_NAME:=argus-debug}"
: "${SYS_DEBUG_CONTAINER_PREFIX:=argus-debug}"
: "${SYS_DEBUG_PRIVATE_CORE:=$DEBUG_ROOT/private}"
: "${SYS_DEBUG_PRIVATE_NODEA:=$DEBUG_ROOT/private-nodea}"
: "${SYS_DEBUG_PRIVATE_NODEB:=$DEBUG_ROOT/private-nodeb}"
: "${SYS_DEBUG_TMP_DIR:=$DEBUG_ROOT/tmp}"
: "${ARGUS_BUILD_UID:=2133}"
: "${ARGUS_BUILD_GID:=2015}"

: "${SYS_DEBUG_NODEA_HOST:=dev-yyrshare-nbnyx10-cp2f-pod-0}"
: "${SYS_DEBUG_NODEB_HOST:=dev-yyrshare-uuuu10-ep2f-pod-0}"

# Short aliases for the two node hostnames.
HOST_A="$SYS_DEBUG_NODEA_HOST"
HOST_B="$SYS_DEBUG_NODEB_HOST"

COMPOSE_FILE="$DEBUG_ROOT/docker-compose.yml"
|
||||
|
||||
# Print the absolute, normalised form of the path given as $1.
# Normalisation is delegated to Python's os.path.abspath.
abs_path() {
  python3 -c 'import os, sys; print(os.path.abspath(sys.argv[1]))' "$1"
}
|
||||
|
||||
# Exit the script with status 1 (message on stderr) unless the command
# named by $1 can be resolved by the shell.
ensure_command() {
  local tool="$1"
  command -v "$tool" >/dev/null 2>&1 && return 0
  echo "[ERR] Required command '$tool' not found" >&2
  exit 1
}
|
||||
|
||||
# Exit the script unless the docker CLI is available (delegates to ensure_command).
require_docker() { ensure_command docker; }
|
||||
|
||||
# Run a compose subcommand against the debug project and compose file.
# Uses `docker compose` when that subcommand answers `version`, otherwise
# falls back to the standalone `docker-compose` binary.
# All arguments are forwarded unchanged.
compose() {
  require_docker
  local -a runner=(docker-compose)
  if docker compose version >/dev/null 2>&1; then
    runner=(docker compose)
  fi
  "${runner[@]}" -p "$SYS_DEBUG_PROJECT_NAME" -f "$COMPOSE_FILE" "$@"
}
|
||||
|
||||
# Verify that every data-directory variable of the debug stack is non-empty.
# On failure, list the missing variable names on stderr and exit with status 1.
ensure_paths_defined() {
  local missing=()
  # Declared local so the loop variable does not overwrite a caller's `name`.
  local name
  for name in SYS_DEBUG_PRIVATE_CORE SYS_DEBUG_PRIVATE_NODEA SYS_DEBUG_PRIVATE_NODEB SYS_DEBUG_TMP_DIR; do
    # ${!name:-} is indirect expansion: the value of the variable whose name is in $name.
    if [[ -z "${!name:-}" ]]; then
      missing+=("$name")
    fi
  done
  if (( ${#missing[@]} > 0 )); then
    echo "[ERR] Missing required environment variables: ${missing[*]}" >&2
    echo " Run 01_bootstrap.sh first." >&2
    exit 1
  fi
}
|
||||
|
||||
# Exit with status 1 (message on stderr) when $ENV_FILE is not a regular file.
ensure_env_file() {
  [[ -f "$ENV_FILE" ]] && return 0
  echo "[ERR] Missing .env at $ENV_FILE. Run 01_bootstrap.sh first." >&2
  exit 1
}
|
||||
|
||||
# Print all arguments, space-joined, as one "[INFO] ..." line on stdout.
log() {
  printf '[INFO] %s\n' "$*"
}
|
||||
|
||||
# Scratch directory for scripts that source this file; created at source time.
TMP_DIR=${SYS_DEBUG_TMP_DIR}
mkdir -p "${TMP_DIR}"
|
||||
76
src/sys/debug/scripts/network-create.sh
Executable file
@ -0,0 +1,76 @@
|
||||
#!/usr/bin/env bash
# Create the external docker bridge network used by the debug stack.
# Does nothing when a network with that name already exists.
set -euo pipefail

# shellcheck source=common.sh
source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh"

# Start from the values resolved by common.sh; command-line flags override them.
NAME="$SYS_DEBUG_NETWORK_NAME"
SUBNET="$SYS_DEBUG_NETWORK_SUBNET"
GATEWAY="$SYS_DEBUG_NETWORK_GATEWAY"

# Print the help text; the defaults shown reflect NAME/SUBNET/GATEWAY at call time.
usage() {
  cat <<EOF
Usage: ${0##*/} [--name NAME] [--subnet CIDR] [--gateway IP]

Create (if missing) the external debug docker network.

Defaults derived from .env or:
 name = $NAME
 subnet = $SUBNET
 gateway = $GATEWAY
EOF
}

# Accept both "--flag value" and "--flag=value" spellings.
while (( $# > 0 )); do
  opt="$1"
  case "$opt" in
    --name|--subnet|--gateway)
      shift
      if (( $# == 0 )); then
        echo "$opt requires value" >&2
        exit 1
      fi
      case "$opt" in
        --name) NAME="$1" ;;
        --subnet) SUBNET="$1" ;;
        --gateway) GATEWAY="$1" ;;
      esac
      ;;
    --name=*)
      NAME="${opt#*=}"
      ;;
    --subnet=*)
      SUBNET="${opt#*=}"
      ;;
    --gateway=*)
      GATEWAY="${opt#*=}"
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $opt" >&2
      usage >&2
      exit 1
      ;;
  esac
  shift
done

require_docker

if docker network inspect "$NAME" >/dev/null 2>&1; then
  log "Network $NAME already exists"
  exit 0
fi

log "Creating network $NAME (subnet=$SUBNET gateway=$GATEWAY)"
docker network create --driver bridge --subnet "$SUBNET" --gateway "$GATEWAY" "$NAME"

# Leave a marker file recording the name of the network this script created.
mkdir -p "$TMP_DIR"
echo "$NAME" > "$TMP_DIR/network.created"
log "Network $NAME created"
|
||||
55
src/sys/debug/scripts/network-destroy.sh
Executable file
@ -0,0 +1,55 @@
|
||||
#!/usr/bin/env bash
# Remove the debug docker network, refusing while any container is attached.
set -euo pipefail

# shellcheck source=common.sh
source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh"

NAME="$SYS_DEBUG_NETWORK_NAME"

# Print the help text.
usage() {
  cat <<EOF
Usage: ${0##*/} [--name NAME]

Destroy the debug docker network if no containers are attached.
EOF
}

# Accept both "--name value" and "--name=value".
while (( $# > 0 )); do
  opt="$1"
  case "$opt" in
    --name)
      shift
      if (( $# == 0 )); then
        echo "--name requires value" >&2
        exit 1
      fi
      NAME="$1"
      ;;
    --name=*)
      NAME="${opt#*=}"
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $opt" >&2
      usage >&2
      exit 1
      ;;
  esac
  shift
done

require_docker

if ! docker network inspect "$NAME" >/dev/null 2>&1; then
  log "Network $NAME not found; nothing to do"
  exit 0
fi

# Space-separated names of the containers currently attached to the network.
attached=$(docker network inspect -f '{{range $id, $conf := .Containers}}{{printf "%s " $conf.Name}}{{end}}' "$NAME")
# Strip the spaces before testing so a whitespace-only result counts as "none".
if [[ -n "${attached// }" ]]; then
  echo "[ERR] Cannot remove network $NAME: still connected containers -> $attached" >&2
  exit 1
fi

log "Deleting network $NAME"
docker network rm "$NAME" >/dev/null
rm -f "$TMP_DIR/network.created"
log "Network $NAME removed"
|
||||
@ -8,6 +8,16 @@ REPO_ROOT="$(cd "$TEST_ROOT/../../.." && pwd)"
|
||||
|
||||
API_BASE="http://localhost:32300/api/v1/master"
|
||||
|
||||
if [[ -f "$TEST_ROOT/.env" ]]; then
|
||||
set -a
|
||||
# shellcheck disable=SC1090
|
||||
source "$TEST_ROOT/.env"
|
||||
set +a
|
||||
else
|
||||
source "$REPO_ROOT/scripts/common/build_user.sh"
|
||||
load_build_user
|
||||
fi
|
||||
|
||||
ID_B="$(cat "$TMP_DIR/node_id_b")"
|
||||
IP0_B="$(cat "$TMP_DIR/initial_ip_b")"
|
||||
|
||||
|
||||
45
src/web/.gitignore
vendored
Normal file
@ -0,0 +1,45 @@
|
||||
# Node modules
|
||||
node_modules/
|
||||
|
||||
# Build output
|
||||
/dist
|
||||
/build
|
||||
|
||||
# Dependency directories
|
||||
jspm_packages/
|
||||
|
||||
# Logs
|
||||
npm-debug.log*
|
||||
yarn-debug.log*
|
||||
yarn-error.log*
|
||||
|
||||
# Editor directories and files
|
||||
.idea/
|
||||
.vscode/
|
||||
*.suo
|
||||
*.ntvs*
|
||||
*.njsproj
|
||||
*.sln
|
||||
*.sw?
|
||||
|
||||
# OS generated files
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
|
||||
# Environment variables
|
||||
.env
|
||||
.env.local
|
||||
.env.development.local
|
||||
.env.test.local
|
||||
.env.production.local
|
||||
|
||||
# Testing
|
||||
/coverage/
|
||||
|
||||
# Optional: service worker cache
|
||||
/.pwa-cache/
|
||||
|
||||
# Misc
|
||||
*.log
|
||||
|
||||
.vite/
|
||||
@ -0,0 +1,12 @@
|
||||
# Argus-web
|
||||
|
||||
架构:React + Vite + Mantine
|
||||
|
||||
|
||||
## 打包部署
|
||||
根目录下运行
|
||||
```bash
|
||||
bash src/web/build_tools/frontend/build.sh
|
||||
```
|
||||
|
||||
|
||||
91
src/web/build_tools/frontend/Dockerfile
Normal file
@ -0,0 +1,91 @@
|
||||
# ========== 构建阶段 ==========
|
||||
FROM node:20 AS builder
|
||||
|
||||
# 设置工作目录
|
||||
WORKDIR /app/src/web
|
||||
|
||||
# 复制依赖文件并安装
|
||||
COPY src/web/package*.json ./
|
||||
|
||||
RUN npm install
|
||||
|
||||
# 复制源码并打包
|
||||
COPY src/web ./
|
||||
RUN npm run build
|
||||
|
||||
# ========== 运行阶段 ==========
|
||||
FROM ubuntu:24.04
|
||||
|
||||
USER root
|
||||
|
||||
# 安装 nginx 和 supervisor
|
||||
RUN apt-get update && \
|
||||
apt-get install -y nginx supervisor curl vim net-tools inetutils-ping ca-certificates passwd && \
|
||||
apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
ENV FRONTEND_BASE_PATH=/private/argus/web/frontend
|
||||
ENV ARGUS_UID=2133
|
||||
ENV ARGUS_GID=2015
|
||||
|
||||
RUN mkdir -p ${FRONTEND_BASE_PATH} && \
|
||||
mkdir -p /private/argus/etc
|
||||
|
||||
# 创建 web 用户(可自定义 UID/GID)
|
||||
# 创建 web 用户组
|
||||
RUN groupadd -g ${ARGUS_GID} web
|
||||
|
||||
# 创建 web 用户并指定组
|
||||
RUN useradd -M -s /usr/sbin/nologin -u ${ARGUS_UID} -g ${ARGUS_GID} web
|
||||
|
||||
RUN chown -R web:web ${FRONTEND_BASE_PATH} && \
|
||||
chown -R web:web /private/argus/etc && \
|
||||
chown -R web:web /usr/local/bin
|
||||
|
||||
# Build-time switch for the intranet apt mirror. It has to be declared with ARG:
# without the declaration $USE_INTRANET is always empty inside RUN, so both
# conditional steps below could never run. The default keeps them disabled.
ARG USE_INTRANET=false

# Configure the intranet apt source (only when the intranet option is set)
RUN if [ "$USE_INTRANET" = "true" ]; then \
        echo "Configuring intranet apt sources..." && \
        cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
        echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
        echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \
        echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \
    fi

# Configure the apt source used at deployment time
RUN if [ "$USE_INTRANET" = "true" ]; then \
        echo "deb [trusted=yes] https://10.92.132.52/mirrors/ubuntu2204/ jammy main" > /etc/apt/sources.list; \
    fi
|
||||
|
||||
# 前端编译产物放到 nginx 目录
|
||||
COPY --from=builder /app/src/web/dist /usr/share/nginx/html
|
||||
|
||||
# 复制 nginx 配置(保证 React 前端路由兼容)
|
||||
COPY src/web/build_tools/frontend/nginx.conf /etc/nginx/nginx.conf
|
||||
# COPY src/web/build_tools/frontend/conf.d/ /etc/nginx/conf.d/
|
||||
|
||||
# 复制 supervisor 配置
|
||||
COPY src/web/build_tools/frontend/supervisord.conf /etc/supervisor/conf.d/supervisord.conf
|
||||
|
||||
# 创建 supervisor 日志目录
|
||||
RUN mkdir -p /var/log/supervisor
|
||||
|
||||
# 复制启动脚本
|
||||
COPY src/web/build_tools/frontend/start-web-supervised.sh /usr/local/bin/start-web-supervised.sh
|
||||
RUN chmod +x /usr/local/bin/start-web-supervised.sh
|
||||
|
||||
# 复制 DNS 监控脚本
|
||||
COPY src/web/build_tools/frontend/dns-monitor.sh /usr/local/bin/dns-monitor.sh
|
||||
RUN chmod +x /usr/local/bin/dns-monitor.sh
|
||||
|
||||
# 复制健康检查脚本
|
||||
COPY src/web/build_tools/frontend/health-check.sh /usr/local/bin/health-check.sh
|
||||
RUN chmod +x /usr/local/bin/health-check.sh
|
||||
|
||||
# 暴露端口
|
||||
EXPOSE 80
|
||||
|
||||
# 保持 root 用户,由 supervisor 控制 user 切换
|
||||
USER root
|
||||
|
||||
# 以 supervisor 为入口
|
||||
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
|
||||
4
src/web/build_tools/frontend/build.sh
Normal file
@ -0,0 +1,4 @@
|
||||
#!/usr/bin/env bash
# Build the argus-web frontend image and export it as a tarball.
# Run from the repository root: the Dockerfile path and the build context (.)
# are both relative to it.
#
# `set -e` stops the script when a pull or the build fails; previously a failed
# build still went on to export whatever argus-web:0.1.1 image already existed.
set -euo pipefail

docker pull node:20
docker pull ubuntu:24.04
docker build -f src/web/build_tools/frontend/Dockerfile -t argus-web:0.1.1 .

# Use the same (non-sudo) docker client as the build so the export talks to the
# daemon that actually holds the freshly built image.
rm -f argus-web-0.1.1.tar
docker image save -o argus-web-0.1.1.tar argus-web:0.1.1
|
||||
68
src/web/build_tools/frontend/dns-monitor.sh
Normal file
@ -0,0 +1,68 @@
|
||||
#!/bin/bash

# DNS monitor: check dns.conf every 10 seconds and run update-dns.sh
# whenever its content differs from the last successfully applied copy.

DNS_CONF="/private/argus/etc/dns.conf"
DNS_BACKUP="/tmp/dns.conf.backup"
UPDATE_SCRIPT="/private/argus/etc/update-dns.sh"
LOG_FILE="/var/log/supervisor/dns-monitor.log"

# Make sure the log file exists
touch "$LOG_FILE"

# Append one timestamped line to the log file.
log_message() {
    echo "$(date '+%Y-%m-%d %H:%M:%S') [DNS-Monitor] $1" >> "$LOG_FILE"
}

# Apply the current dns.conf through the update script.
# dns.conf is snapshotted before the script runs and the snapshot only becomes
# $DNS_BACKUP when the script succeeds. A failed or missing update script is
# therefore retried on the next poll; previously the change branch refreshed
# the backup first, so a failed update was never retried.
# Returns 0 on success, 1 otherwise.
apply_dns_update() {
    local pending="${DNS_BACKUP}.pending"

    if [ ! -x "$UPDATE_SCRIPT" ]; then
        log_message "警告: 更新脚本不存在或不可执行: $UPDATE_SCRIPT"
        return 1
    fi

    cp "$DNS_CONF" "$pending" || return 1
    log_message "执行DNS更新脚本: $UPDATE_SCRIPT"
    if "$UPDATE_SCRIPT" >> "$LOG_FILE" 2>&1; then
        log_message "DNS更新脚本执行成功"
        mv -f "$pending" "$DNS_BACKUP"
        return 0
    fi

    rm -f "$pending"
    log_message "DNS更新脚本执行失败"
    return 1
}

log_message "DNS监控脚本启动"

while true; do
    if [ ! -f "$DNS_CONF" ]; then
        log_message "警告: DNS配置文件不存在: $DNS_CONF"
    elif [ ! -f "$DNS_BACKUP" ]; then
        # First time the config file is seen: apply it and create the backup.
        if apply_dns_update; then
            log_message "创建DNS配置备份文件"
        fi
    elif ! cmp -s "$DNS_CONF" "$DNS_BACKUP"; then
        # Content differs from the last applied copy.
        log_message "检测到DNS配置变化"
        apply_dns_update
    fi

    sleep 10
done
|
||||
16
src/web/build_tools/frontend/health-check.sh
Normal file
@ -0,0 +1,16 @@
|
||||
#!/bin/bash
# Poll the local web server every 10 seconds; exit 1 on the first failed check.
set -euo pipefail

URL="http://127.0.0.1:80"

echo "[INFO] Starting Argus web health check loop for $URL..."

while true; do
    # -f: fail on HTTP status >= 400. Without it curl exits 0 for any HTTP
    #     response, so an error page would be reported as healthy.
    # -sS: no progress meter, but still print the error reason to stderr.
    if curl -fsS --max-time 5 "$URL" > /dev/null; then
        echo "[OK] $(date '+%Y-%m-%d %H:%M:%S') Argus web is healthy"
    else
        echo "[ERROR] $(date '+%Y-%m-%d %H:%M:%S') Argus web health check failed"
        exit 1
    fi
    sleep 10
done
|
||||
56
src/web/build_tools/frontend/nginx.conf
Normal file
@ -0,0 +1,56 @@
|
||||
user web;
|
||||
worker_processes auto;
|
||||
|
||||
events {
|
||||
worker_connections 1024;
|
||||
}
|
||||
|
||||
http {
|
||||
include mime.types;
|
||||
default_type application/octet-stream;
|
||||
sendfile on;
|
||||
|
||||
# React 前端服务
|
||||
server {
|
||||
listen 80;
|
||||
server_name web.argus.com;
|
||||
|
||||
root /usr/share/nginx/html;
|
||||
index index.html;
|
||||
|
||||
# React 前端路由兼容
|
||||
location / {
|
||||
try_files $uri /index.html;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
# Master 服务,需要增加 CORS 支持
|
||||
server {
|
||||
listen 80;
|
||||
server_name master.argus.com;
|
||||
|
||||
location / {
|
||||
proxy_pass http://master.argus.com;
|
||||
proxy_set_header Host $host;
|
||||
proxy_set_header X-Real-IP $remote_addr;
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
proxy_set_header X-Forwarded-Proto $scheme;
|
||||
|
||||
# CORS 支持
|
||||
add_header 'Access-Control-Allow-Origin' 'http://web.argus.com' always;
|
||||
add_header 'Access-Control-Allow-Methods' 'GET, POST, PUT, DELETE, OPTIONS' always;
|
||||
add_header 'Access-Control-Allow-Headers' 'Origin, Content-Type, Accept, Authorization' always;
|
||||
|
||||
if ($request_method = OPTIONS) {
|
||||
add_header 'Access-Control-Allow-Origin' 'http://web.argus.com' always;
|
||||
add_header 'Access-Control-Allow-Methods' 'GET, POST, PUT, DELETE, OPTIONS' always;
|
||||
add_header 'Access-Control-Allow-Headers' 'Origin, Content-Type, Accept, Authorization' always;
|
||||
add_header 'Content-Length' 0;
|
||||
add_header 'Content-Type' 'text/plain';
|
||||
return 204;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
39
src/web/build_tools/frontend/start-web-supervised.sh
Normal file
@ -0,0 +1,39 @@
|
||||
#!/bin/bash
# Start script for the web frontend: refresh DNS, record the container IP,
# then replace this shell with nginx running in the foreground.
set -euo pipefail

echo "[INFO] Starting React frontend under supervisor..."

DNS_DIR="/private/argus/etc"
DNS_SCRIPT="${DNS_DIR}/update-dns.sh"
DOMAIN=web.argus.com
# The container IP is written to a file named after the domain.
WEB_DOMAIN_FILE="${DNS_DIR}/${DOMAIN}"
RUNTIME_UID="${ARGUS_BUILD_UID:-2133}"
RUNTIME_GID="${ARGUS_BUILD_GID:-2015}"

mkdir -p "$DNS_DIR"
chown -R "$RUNTIME_UID:$RUNTIME_GID" "$DNS_DIR" 2>/dev/null || true

if [[ -x "$DNS_SCRIPT" ]]; then
    # This script launches nginx, not master (the old message was a copy-paste).
    echo "[INFO] Running update-dns.sh before nginx starts"
    # Run the script when present so the container uses bind as its DNS
    "$DNS_SCRIPT" || echo "[WARN] update-dns.sh execution failed"
else
    echo "[WARN] DNS update script not found or not executable: $DNS_SCRIPT"
fi

# Record the container IP (eth0 address as reported by ifconfig)
IP=$(ifconfig | grep -A 1 eth0 | grep inet | awk '{print $2}' || true)
if [[ -n "${IP}" ]]; then
    echo "current IP: ${IP}"
    echo "${IP}" > "$WEB_DOMAIN_FILE"
    chown "$RUNTIME_UID:$RUNTIME_GID" "$WEB_DOMAIN_FILE" 2>/dev/null || true
else
    echo "[WARN] Failed to detect web IP via ifconfig"
fi

echo "[INFO] Launching nginx..."

# Run nginx in the foreground so the supervisor tracks the nginx process itself
exec /usr/sbin/nginx -g "daemon off;"
|
||||
51
src/web/build_tools/frontend/supervisord.conf
Normal file
@ -0,0 +1,51 @@
|
||||
[supervisord]
|
||||
nodaemon=true
|
||||
logfile=/var/log/supervisor/supervisord.log
|
||||
pidfile=/var/run/supervisord.pid
|
||||
user=root
|
||||
|
||||
[program:web]
|
||||
command=/usr/local/bin/start-web-supervised.sh
|
||||
user=root
|
||||
stdout_logfile=/var/log/supervisor/web-frontend.log
|
||||
stderr_logfile=/var/log/supervisor/web-frontend_error.log
|
||||
autorestart=true
|
||||
startretries=3
|
||||
startsecs=5
|
||||
stopwaitsecs=10
|
||||
killasgroup=true
|
||||
stopasgroup=true
|
||||
|
||||
[program:web-health]
|
||||
command=/usr/local/bin/health-check.sh
|
||||
user=web
|
||||
stdout_logfile=/var/log/supervisor/web-health.log
|
||||
stderr_logfile=/var/log/supervisor/web-health_error.log
|
||||
autorestart=true
|
||||
startretries=3
|
||||
startsecs=5
|
||||
stopwaitsecs=10
|
||||
killasgroup=true
|
||||
stopasgroup=true
|
||||
|
||||
[program:dns-monitor]
|
||||
command=/usr/local/bin/dns-monitor.sh
|
||||
user=root
|
||||
stdout_logfile=/var/log/supervisor/dns-monitor.log
|
||||
stderr_logfile=/var/log/supervisor/dns-monitor_error.log
|
||||
autorestart=true
|
||||
startretries=3
|
||||
startsecs=5
|
||||
stopwaitsecs=10
|
||||
killasgroup=true
|
||||
stopasgroup=true
|
||||
|
||||
[unix_http_server]
|
||||
file=/var/run/supervisor.sock
|
||||
chmod=0700
|
||||
|
||||
[supervisorctl]
|
||||
serverurl=unix:///var/run/supervisor.sock
|
||||
|
||||
[rpcinterface:supervisor]
|
||||
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
|
||||
69
src/web/build_tools/proxy/Dockerfile
Normal file
@ -0,0 +1,69 @@
|
||||
FROM ubuntu:24.04
|
||||
|
||||
USER root
|
||||
|
||||
# 安装 nginx 和 supervisor
|
||||
RUN apt-get update && \
|
||||
apt-get install -y nginx supervisor curl vim net-tools inetutils-ping ca-certificates passwd && \
|
||||
apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
ENV FRONTEND_BASE_PATH=/private/argus/web/proxy
|
||||
ENV ARGUS_UID=2133
|
||||
ENV ARGUS_GID=2015
|
||||
|
||||
RUN mkdir -p ${FRONTEND_BASE_PATH} && \
|
||||
mkdir -p /private/argus/etc
|
||||
|
||||
# 创建 proxy 用户(可自定义 UID/GID)
|
||||
# 创建 proxy 用户组
|
||||
RUN groupadd -g ${ARGUS_GID} web_proxy
|
||||
|
||||
# 创建 proxy 用户并指定组
|
||||
RUN useradd -M -s /usr/sbin/nologin -u ${ARGUS_UID} -g ${ARGUS_GID} web_proxy
|
||||
|
||||
RUN chown -R web_proxy:web_proxy ${FRONTEND_BASE_PATH} && \
|
||||
chown -R web_proxy:web_proxy /private/argus/etc && \
|
||||
chown -R web_proxy:web_proxy /usr/local/bin
|
||||
|
||||
# Build-time switch for the intranet apt mirror. It has to be declared with ARG:
# without the declaration $USE_INTRANET is always empty inside RUN, so both
# conditional steps below could never run. The default keeps them disabled.
ARG USE_INTRANET=false

# Configure the intranet apt source (only when the intranet option is set)
RUN if [ "$USE_INTRANET" = "true" ]; then \
        echo "Configuring intranet apt sources..." && \
        cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
        echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
        echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \
        echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \
    fi

# Configure the apt source used at deployment time
RUN if [ "$USE_INTRANET" = "true" ]; then \
        echo "deb [trusted=yes] https://10.92.132.52/mirrors/ubuntu2204/ jammy main" > /etc/apt/sources.list; \
    fi
|
||||
|
||||
|
||||
# 复制 nginx 配置(保证 React 前端路由兼容)
|
||||
COPY src/web/build_tools/proxy/nginx.conf.template /etc/nginx/nginx.conf.template
|
||||
COPY src/web/build_tools/proxy/conf.d/ /etc/nginx/conf.d/
|
||||
|
||||
# 复制 supervisor 配置
|
||||
COPY src/web/build_tools/proxy/supervisord.conf /etc/supervisor/conf.d/supervisord.conf
|
||||
|
||||
# 创建 supervisor 日志目录
|
||||
RUN mkdir -p /var/log/supervisor
|
||||
|
||||
# 复制启动脚本
|
||||
COPY src/web/build_tools/proxy/start-proxy-supervised.sh /usr/local/bin/start-proxy-supervised.sh
|
||||
RUN chmod +x /usr/local/bin/start-proxy-supervised.sh
|
||||
|
||||
# 复制 DNS 监控脚本
|
||||
COPY src/web/build_tools/proxy/dns-monitor.sh /usr/local/bin/dns-monitor.sh
|
||||
RUN chmod +x /usr/local/bin/dns-monitor.sh
|
||||
|
||||
# 暴露端口
|
||||
EXPOSE 80
|
||||
|
||||
# 保持 root 用户,由 supervisor 控制 user 切换
|
||||
USER root
|
||||
|
||||
# 以 supervisor 为入口
|
||||
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
|
||||
8
src/web/build_tools/proxy/conf.d/alert.conf
Normal file
@ -0,0 +1,8 @@
|
||||
server {
|
||||
listen 80;
|
||||
server_name alertmanager.alert.argus.com;
|
||||
|
||||
location / {
|
||||
proxy_pass http://alertmanager.alert.argus.com:9093;
|
||||
}
|
||||
}
|
||||
19
src/web/build_tools/proxy/conf.d/log.conf
Normal file
@ -0,0 +1,19 @@
|
||||
# Elasticsearch
|
||||
server {
|
||||
listen 80;
|
||||
server_name es.log.argus.com;
|
||||
|
||||
location / {
|
||||
proxy_pass http://es.log.argus.com;
|
||||
}
|
||||
}
|
||||
|
||||
# Kibana
|
||||
server {
|
||||
listen 80;
|
||||
server_name kibana.log.argus.com;
|
||||
|
||||
location / {
|
||||
proxy_pass http://kibana.log.argus.com;
|
||||
}
|
||||
}
|
||||
27
src/web/build_tools/proxy/conf.d/master.conf
Normal file
@ -0,0 +1,27 @@
|
||||
server {
|
||||
listen 80;
|
||||
server_name master.argus.com;
|
||||
|
||||
location / {
|
||||
# proxy_pass http://master.argus.com;
|
||||
proxy_pass http://master.argus.com;
|
||||
# proxy_set_header Host $host;
|
||||
# proxy_set_header X-Real-IP $remote_addr;
|
||||
# proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
# proxy_set_header X-Forwarded-Proto $scheme;
|
||||
|
||||
# # CORS 支持
|
||||
# add_header 'Access-Control-Allow-Origin' 'http://web.argus.com' always;
|
||||
# add_header 'Access-Control-Allow-Methods' 'GET, POST, PUT, DELETE, OPTIONS' always;
|
||||
# add_header 'Access-Control-Allow-Headers' 'Origin, Content-Type, Accept, Authorization' always;
|
||||
|
||||
# if ($request_method = OPTIONS) {
|
||||
# add_header 'Access-Control-Allow-Origin' 'http://web.argus.com' always;
|
||||
# add_header 'Access-Control-Allow-Methods' 'GET, POST, PUT, DELETE, OPTIONS' always;
|
||||
# add_header 'Access-Control-Allow-Headers' 'Origin, Content-Type, Accept, Authorization' always;
|
||||
# add_header 'Content-Length' 0;
|
||||
# add_header 'Content-Type' 'text/plain';
|
||||
# return 204;
|
||||
# }
|
||||
}
|
||||
}
|
||||
19
src/web/build_tools/proxy/conf.d/metric.conf
Normal file
@ -0,0 +1,19 @@
|
||||
# Prometheus
|
||||
server {
|
||||
listen 80;
|
||||
server_name prometheus.metric.argus.com;
|
||||
|
||||
location / {
|
||||
proxy_pass http://prom.metric.argus.com;
|
||||
}
|
||||
}
|
||||
|
||||
# # Grafana
|
||||
# server {
|
||||
# listen 80;
|
||||
# server_name grafana.metric.argus.com;
|
||||
|
||||
# location / {
|
||||
# proxy_pass http://grafana.metric.argus.com;
|
||||
# }
|
||||
# }
|
||||
8
src/web/build_tools/proxy/conf.d/web.conf
Normal file
@ -0,0 +1,8 @@
|
||||
server {
|
||||
listen 80;
|
||||
server_name web.argus.com;
|
||||
|
||||
location / {
|
||||
proxy_pass http://web.argus.com:80;
|
||||
}
|
||||
}
|
||||
68
src/web/build_tools/proxy/dns-monitor.sh
Normal file
@ -0,0 +1,68 @@
|
||||
#!/bin/bash

# DNS monitor: check dns.conf every 10 seconds and run update-dns.sh
# whenever its content differs from the last successfully applied copy.

DNS_CONF="/private/argus/etc/dns.conf"
DNS_BACKUP="/tmp/dns.conf.backup"
UPDATE_SCRIPT="/private/argus/etc/update-dns.sh"
LOG_FILE="/var/log/supervisor/dns-monitor.log"

# Make sure the log file exists
touch "$LOG_FILE"

# Append one timestamped line to the log file.
log_message() {
    echo "$(date '+%Y-%m-%d %H:%M:%S') [DNS-Monitor] $1" >> "$LOG_FILE"
}

# Apply the current dns.conf through the update script.
# dns.conf is snapshotted before the script runs and the snapshot only becomes
# $DNS_BACKUP when the script succeeds. A failed or missing update script is
# therefore retried on the next poll; previously the change branch refreshed
# the backup first, so a failed update was never retried.
# Returns 0 on success, 1 otherwise.
apply_dns_update() {
    local pending="${DNS_BACKUP}.pending"

    if [ ! -x "$UPDATE_SCRIPT" ]; then
        log_message "警告: 更新脚本不存在或不可执行: $UPDATE_SCRIPT"
        return 1
    fi

    cp "$DNS_CONF" "$pending" || return 1
    log_message "执行DNS更新脚本: $UPDATE_SCRIPT"
    if "$UPDATE_SCRIPT" >> "$LOG_FILE" 2>&1; then
        log_message "DNS更新脚本执行成功"
        mv -f "$pending" "$DNS_BACKUP"
        return 0
    fi

    rm -f "$pending"
    log_message "DNS更新脚本执行失败"
    return 1
}

log_message "DNS监控脚本启动"

while true; do
    if [ ! -f "$DNS_CONF" ]; then
        log_message "警告: DNS配置文件不存在: $DNS_CONF"
    elif [ ! -f "$DNS_BACKUP" ]; then
        # First time the config file is seen: apply it and create the backup.
        if apply_dns_update; then
            log_message "创建DNS配置备份文件"
        fi
    elif ! cmp -s "$DNS_CONF" "$DNS_BACKUP"; then
        # Content differs from the last applied copy.
        log_message "检测到DNS配置变化"
        apply_dns_update
    fi

    sleep 10
done
|
||||
36
src/web/build_tools/proxy/nginx.conf.template
Normal file
@ -0,0 +1,36 @@
|
||||
user web_proxy;
worker_processes auto;

events {
    worker_connections 1024;
}

http {
    include mime.types;
    default_type application/octet-stream;
    sendfile on;

    # Resolver list. __RESOLVERS__ is a placeholder that
    # start-proxy-supervised.sh substitutes when it generates nginx.conf.
    resolver __RESOLVERS__ valid=30s ipv6=off;

    # Enable access and error logs
    access_log /var/log/nginx/access.log;
    error_log /var/log/nginx/error.log;

    # Default headers for reverse-proxied requests
    proxy_set_header Host $host;
    proxy_set_header X-Real-IP $remote_addr;
    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    proxy_set_header X-Forwarded-Proto $scheme;

    # Catch-all virtual host: requests whose Host matches no conf.d server go
    # to the web frontend. A `server` block is only valid inside `http`; at the
    # top level nginx refuses to start ("directive is not allowed here").
    server {
        listen 80 default_server;
        server_name _;

        location / {
            proxy_pass http://web.argus.com:80;
        }
    }

    include /etc/nginx/conf.d/*.conf;
}
|
||||
61
src/web/build_tools/proxy/start-proxy-supervised.sh
Normal file
@ -0,0 +1,61 @@
|
||||
#!/bin/bash
# Start script for the proxy: refresh DNS, render nginx.conf from its template
# with the detected resolver list, then replace this shell with nginx.
set -euo pipefail

echo "[INFO] Starting proxy under supervisor..."

TEMPLATE="/etc/nginx/nginx.conf.template"
TARGET="/etc/nginx/nginx.conf"
DNS_CONF_PRIVATE="/private/argus/etc/dns.conf"
DNS_CONF_SYSTEM="/etc/resolv.conf"
DNS_DIR="/private/argus/etc"
DNS_SCRIPT="${DNS_DIR}/update-dns.sh"
RUNTIME_UID="${ARGUS_BUILD_UID:-2133}"
RUNTIME_GID="${ARGUS_BUILD_GID:-2015}"

mkdir -p "$DNS_DIR"
chown -R "$RUNTIME_UID:$RUNTIME_GID" "$DNS_DIR" 2>/dev/null || true

if [[ -x "$DNS_SCRIPT" ]]; then
    # This script launches nginx, not master (the old message was a copy-paste).
    echo "[INFO] Running update-dns.sh before nginx starts"
    # Run the script when present so the container uses bind as its DNS
    "$DNS_SCRIPT" || echo "[WARN] update-dns.sh execution failed"
else
    echo "[WARN] DNS update script not found or not executable: $DNS_SCRIPT"
fi

# ========== Read DNS servers ==========
# The `|| true` guards matter: with `set -e -o pipefail` a failing awk (for
# example an unreadable or missing file) would otherwise abort the script
# before the later fallbacks get a chance to run.
RESOLVERS=""
if [ -f "$DNS_CONF_PRIVATE" ]; then
    echo "从 $DNS_CONF_PRIVATE 读取 DNS 服务器..."
    # Keep only lines that consist of a bare dotted-quad IPv4 address.
    RESOLVERS=$(awk '/^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$/ {print $1}' "$DNS_CONF_PRIVATE" | tr '\n' ' ') || true
fi

# Fall back to the system resolver file when the private file gave nothing
if [ -z "${RESOLVERS:-}" ]; then
    echo "未在 $DNS_CONF_PRIVATE 中找到有效 DNS,使用系统 /etc/resolv.conf"
    RESOLVERS=$(awk '/^nameserver/ {print $2}' "$DNS_CONF_SYSTEM" | tr '\n' ' ') || true
fi

# Last resort: still empty, use a public DNS server
if [ -z "$RESOLVERS" ]; then
    echo "警告: 未找到任何 DNS,使用默认 8.8.8.8"
    RESOLVERS="8.8.8.8"
fi

echo "检测到 DNS 服务器列表: $RESOLVERS"

# ========== Generate nginx.conf ==========
if [ -f "$TEMPLATE" ]; then
    echo "从模板生成 nginx.conf ..."
    sed "s|__RESOLVERS__|$RESOLVERS|" "$TEMPLATE" > "$TARGET"
else
    echo "错误: 找不到 nginx.conf.template ($TEMPLATE)"
    exit 1
fi

# Print the rendered resolver line for troubleshooting
grep resolver "$TARGET" || true

echo "[INFO] Launching nginx..."

# Run nginx in the foreground so the supervisor tracks the nginx process itself
exec /usr/sbin/nginx -g "daemon off;"
|
||||
39
src/web/build_tools/proxy/supervisord.conf
Normal file
@ -0,0 +1,39 @@
|
||||
[supervisord]
|
||||
nodaemon=true
|
||||
logfile=/var/log/supervisor/supervisord.log
|
||||
pidfile=/var/run/supervisord.pid
|
||||
user=root
|
||||
|
||||
[program:proxy]
|
||||
command=/usr/local/bin/start-proxy-supervised.sh
|
||||
user=root
|
||||
stdout_logfile=/var/log/supervisor/web-proxy.log
|
||||
stderr_logfile=/var/log/supervisor/web-proxy_error.log
|
||||
autorestart=true
|
||||
startretries=3
|
||||
startsecs=5
|
||||
stopwaitsecs=10
|
||||
killasgroup=true
|
||||
stopasgroup=true
|
||||
|
||||
[program:dns-monitor]
|
||||
command=/usr/local/bin/dns-monitor.sh
|
||||
user=root
|
||||
stdout_logfile=/var/log/supervisor/dns-monitor.log
|
||||
stderr_logfile=/var/log/supervisor/dns-monitor_error.log
|
||||
autorestart=true
|
||||
startretries=3
|
||||
startsecs=5
|
||||
stopwaitsecs=10
|
||||
killasgroup=true
|
||||
stopasgroup=true
|
||||
|
||||
[unix_http_server]
|
||||
file=/var/run/supervisor.sock
|
||||
chmod=0700
|
||||
|
||||
[supervisorctl]
|
||||
serverurl=unix:///var/run/supervisor.sock
|
||||
|
||||
[rpcinterface:supervisor]
|
||||
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
|
||||
29
src/web/eslint.config.js
Normal file
@ -0,0 +1,29 @@
|
||||
import js from '@eslint/js'
|
||||
import globals from 'globals'
|
||||
import reactHooks from 'eslint-plugin-react-hooks'
|
||||
import reactRefresh from 'eslint-plugin-react-refresh'
|
||||
import { defineConfig, globalIgnores } from 'eslint/config'
|
||||
|
||||
export default defineConfig([
|
||||
globalIgnores(['dist']),
|
||||
{
|
||||
files: ['**/*.{js,jsx}'],
|
||||
extends: [
|
||||
js.configs.recommended,
|
||||
reactHooks.configs['recommended-latest'],
|
||||
reactRefresh.configs.vite,
|
||||
],
|
||||
languageOptions: {
|
||||
ecmaVersion: 2020,
|
||||
globals: globals.browser,
|
||||
parserOptions: {
|
||||
ecmaVersion: 'latest',
|
||||
ecmaFeatures: { jsx: true },
|
||||
sourceType: 'module',
|
||||
},
|
||||
},
|
||||
rules: {
|
||||
'no-unused-vars': ['error', { varsIgnorePattern: '^[A-Z_]' }],
|
||||
},
|
||||
},
|
||||
])
|
||||
13
src/web/index.html
Normal file
@ -0,0 +1,13 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="zh-CN">
|
||||
<head>
|
||||
<meta charset="UTF-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||
<title>GPU集群运维系统</title>
|
||||
<link rel="icon" type="image/png" href="/src/assets/argus.png" />
|
||||
</head>
|
||||
<body>
|
||||
<div id="root"></div>
|
||||
<script type="module" src="/src/main.jsx"></script>
|
||||
</body>
|
||||
</html>
|
||||
3617
src/web/package-lock.json
generated
Normal file
34
src/web/package.json
Normal file
@ -0,0 +1,34 @@
|
||||
{
|
||||
"name": "argus-web",
|
||||
"private": true,
|
||||
"version": "0.0.0",
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"dev": "vite",
|
||||
"build": "vite build",
|
||||
"lint": "eslint .",
|
||||
"preview": "vite preview"
|
||||
},
|
||||
"dependencies": {
|
||||
"@emotion/react": "^11.14.0",
|
||||
"@mantine/core": "^8.3.1",
|
||||
"@mantine/hooks": "^8.3.1",
|
||||
"@mantine/notifications": "^8.3.1",
|
||||
"@tabler/icons-react": "^3.34.1",
|
||||
"react": "^19.1.1",
|
||||
"react-dom": "^19.1.1",
|
||||
"react-router-dom": "^7.8.2",
|
||||
"tabler-icons-react": "^1.56.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@eslint/js": "^9.33.0",
|
||||
"@types/react": "^19.1.10",
|
||||
"@types/react-dom": "^19.1.7",
|
||||
"@vitejs/plugin-react": "^5.0.0",
|
||||
"eslint": "^9.33.0",
|
||||
"eslint-plugin-react-hooks": "^5.2.0",
|
||||
"eslint-plugin-react-refresh": "^0.4.20",
|
||||
"globals": "^16.3.0",
|
||||
"vite": "^7.1.2"
|
||||
}
|
||||
}
|
||||
BIN
src/web/portal-frontend.tar.gz
Normal file
1
src/web/public/vite.svg
Normal file
@ -0,0 +1 @@
|
||||
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" class="iconify iconify--logos" width="31.88" height="32" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 257"><defs><linearGradient id="IconifyId1813088fe1fbc01fb466" x1="-.828%" x2="57.636%" y1="7.652%" y2="78.411%"><stop offset="0%" stop-color="#41D1FF"></stop><stop offset="100%" stop-color="#BD34FE"></stop></linearGradient><linearGradient id="IconifyId1813088fe1fbc01fb467" x1="43.376%" x2="50.316%" y1="2.242%" y2="89.03%"><stop offset="0%" stop-color="#FFEA83"></stop><stop offset="8.333%" stop-color="#FFDD35"></stop><stop offset="100%" stop-color="#FFA800"></stop></linearGradient></defs><path fill="url(#IconifyId1813088fe1fbc01fb466)" d="M255.153 37.938L134.897 252.976c-2.483 4.44-8.862 4.466-11.382.048L.875 37.958c-2.746-4.814 1.371-10.646 6.827-9.67l120.385 21.517a6.537 6.537 0 0 0 2.322-.004l117.867-21.483c5.438-.991 9.574 4.796 6.877 9.62Z"></path><path fill="url(#IconifyId1813088fe1fbc01fb467)" d="M185.432.063L96.44 17.501a3.268 3.268 0 0 0-2.634 3.014l-5.474 92.456a3.268 3.268 0 0 0 3.997 3.378l24.777-5.718c2.318-.535 4.413 1.507 3.936 3.838l-7.361 36.047c-.495 2.426 1.782 4.5 4.151 3.78l15.304-4.649c2.372-.72 4.652 1.36 4.15 3.788l-11.698 56.621c-.732 3.542 3.979 5.473 5.943 2.437l1.313-2.028l72.516-144.72c1.215-2.423-.88-5.186-3.54-4.672l-25.505 4.922c-2.396.462-4.435-1.77-3.759-4.114l16.646-57.705c.677-2.35-1.37-4.583-3.769-4.113Z"></path></svg>
|
||||
|
After Width: | Height: | Size: 1.5 KiB |
42
src/web/src/App.css
Normal file
@ -0,0 +1,42 @@
|
||||
#root {
|
||||
max-width: 1280px;
|
||||
margin: 0 auto;
|
||||
padding: 2rem;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.logo {
|
||||
height: 6em;
|
||||
padding: 1.5em;
|
||||
will-change: filter;
|
||||
transition: filter 300ms;
|
||||
}
|
||||
.logo:hover {
|
||||
filter: drop-shadow(0 0 2em #646cffaa);
|
||||
}
|
||||
.logo.react:hover {
|
||||
filter: drop-shadow(0 0 2em #61dafbaa);
|
||||
}
|
||||
|
||||
@keyframes logo-spin {
|
||||
from {
|
||||
transform: rotate(0deg);
|
||||
}
|
||||
to {
|
||||
transform: rotate(360deg);
|
||||
}
|
||||
}
|
||||
|
||||
@media (prefers-reduced-motion: no-preference) {
|
||||
a:nth-of-type(2) .logo {
|
||||
animation: logo-spin infinite 20s linear;
|
||||
}
|
||||
}
|
||||
|
||||
.card {
|
||||
padding: 2em;
|
||||
}
|
||||
|
||||
.read-the-docs {
|
||||
color: #888;
|
||||
}
|
||||
40
src/web/src/App.jsx
Normal file
@ -0,0 +1,40 @@
|
||||
import { AppShell } from "@mantine/core";
|
||||
import { Routes, Route, Navigate } from "react-router-dom";
|
||||
import Sidebar from "./components/Sidebar";
|
||||
import HeaderBar from "./components/HeaderBar";
|
||||
import Dashboard from "./pages/Dashboard";
|
||||
import NodePage from "./pages/NodePage";
|
||||
import Metrics from "./pages/Metrics";
|
||||
import Logs from "./pages/Logs";
|
||||
import Alerts from "./pages/Alerts";
|
||||
|
||||
export default function App() {
|
||||
return (
|
||||
<AppShell
|
||||
padding="md"
|
||||
header={{ height: 60 }}
|
||||
navbar={{ width: 240, breakpoint: "sm" }}
|
||||
>
|
||||
<AppShell.Header>
|
||||
<HeaderBar />
|
||||
</AppShell.Header>
|
||||
|
||||
<AppShell.Navbar>
|
||||
<Sidebar />
|
||||
</AppShell.Navbar>
|
||||
|
||||
<AppShell.Main>
|
||||
<Routes>
|
||||
<Route path="/" element={<Navigate to="/dashboard" replace />} />
|
||||
|
||||
<Route path="/dashboard" element={<Dashboard />} />
|
||||
<Route path="/nodeInfo" element={<NodePage />} />
|
||||
<Route path="/metrics" element={<Metrics />} />
|
||||
<Route path="/logs" element={<Logs />} />
|
||||
<Route path="/alerts" element={<Alerts />} />
|
||||
<Route path="*" element={<div>404 Not Found</div>} />
|
||||
</Routes>
|
||||
</AppShell.Main>
|
||||
</AppShell>
|
||||
);
|
||||
}
|
||||
BIN
src/web/src/assets/argus.png
Normal file
|
After Width: | Height: | Size: 1.3 MiB |
BIN
src/web/src/assets/es.png
Normal file
|
After Width: | Height: | Size: 17 KiB |
BIN
src/web/src/assets/grafana.png
Normal file
|
After Width: | Height: | Size: 26 KiB |
BIN
src/web/src/assets/kibana.png
Normal file
|
After Width: | Height: | Size: 1.8 KiB |
BIN
src/web/src/assets/prometheus.png
Normal file
|
After Width: | Height: | Size: 145 KiB |
1
src/web/src/assets/react.svg
Normal file
@ -0,0 +1 @@
|
||||
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" class="iconify iconify--logos" width="35.93" height="32" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 228"><path fill="#00D8FF" d="M210.483 73.824a171.49 171.49 0 0 0-8.24-2.597c.465-1.9.893-3.777 1.273-5.621c6.238-30.281 2.16-54.676-11.769-62.708c-13.355-7.7-35.196.329-57.254 19.526a171.23 171.23 0 0 0-6.375 5.848a155.866 155.866 0 0 0-4.241-3.917C100.759 3.829 77.587-4.822 63.673 3.233C50.33 10.957 46.379 33.89 51.995 62.588a170.974 170.974 0 0 0 1.892 8.48c-3.28.932-6.445 1.924-9.474 2.98C17.309 83.498 0 98.307 0 113.668c0 15.865 18.582 31.778 46.812 41.427a145.52 145.52 0 0 0 6.921 2.165a167.467 167.467 0 0 0-2.01 9.138c-5.354 28.2-1.173 50.591 12.134 58.266c13.744 7.926 36.812-.22 59.273-19.855a145.567 145.567 0 0 0 5.342-4.923a168.064 168.064 0 0 0 6.92 6.314c21.758 18.722 43.246 26.282 56.54 18.586c13.731-7.949 18.194-32.003 12.4-61.268a145.016 145.016 0 0 0-1.535-6.842c1.62-.48 3.21-.974 4.76-1.488c29.348-9.723 48.443-25.443 48.443-41.52c0-15.417-17.868-30.326-45.517-39.844Zm-6.365 70.984c-1.4.463-2.836.91-4.3 1.345c-3.24-10.257-7.612-21.163-12.963-32.432c5.106-11 9.31-21.767 12.459-31.957c2.619.758 5.16 1.557 7.61 2.4c23.69 8.156 38.14 20.213 38.14 29.504c0 9.896-15.606 22.743-40.946 31.14Zm-10.514 20.834c2.562 12.94 2.927 24.64 1.23 33.787c-1.524 8.219-4.59 13.698-8.382 15.893c-8.067 4.67-25.32-1.4-43.927-17.412a156.726 156.726 0 0 1-6.437-5.87c7.214-7.889 14.423-17.06 21.459-27.246c12.376-1.098 24.068-2.894 34.671-5.345a134.17 134.17 0 0 1 1.386 6.193ZM87.276 214.515c-7.882 2.783-14.16 2.863-17.955.675c-8.075-4.657-11.432-22.636-6.853-46.752a156.923 156.923 0 0 1 1.869-8.499c10.486 2.32 22.093 3.988 34.498 4.994c7.084 9.967 14.501 19.128 21.976 27.15a134.668 134.668 0 0 1-4.877 4.492c-9.933 8.682-19.886 14.842-28.658 17.94ZM50.35 144.747c-12.483-4.267-22.792-9.812-29.858-15.863c-6.35-5.437-9.555-10.836-9.555-15.216c0-9.322 
13.897-21.212 37.076-29.293c2.813-.98 5.757-1.905 8.812-2.773c3.204 10.42 7.406 21.315 12.477 32.332c-5.137 11.18-9.399 22.249-12.634 32.792a134.718 134.718 0 0 1-6.318-1.979Zm12.378-84.26c-4.811-24.587-1.616-43.134 6.425-47.789c8.564-4.958 27.502 2.111 47.463 19.835a144.318 144.318 0 0 1 3.841 3.545c-7.438 7.987-14.787 17.08-21.808 26.988c-12.04 1.116-23.565 2.908-34.161 5.309a160.342 160.342 0 0 1-1.76-7.887Zm110.427 27.268a347.8 347.8 0 0 0-7.785-12.803c8.168 1.033 15.994 2.404 23.343 4.08c-2.206 7.072-4.956 14.465-8.193 22.045a381.151 381.151 0 0 0-7.365-13.322Zm-45.032-43.861c5.044 5.465 10.096 11.566 15.065 18.186a322.04 322.04 0 0 0-30.257-.006c4.974-6.559 10.069-12.652 15.192-18.18ZM82.802 87.83a323.167 323.167 0 0 0-7.227 13.238c-3.184-7.553-5.909-14.98-8.134-22.152c7.304-1.634 15.093-2.97 23.209-3.984a321.524 321.524 0 0 0-7.848 12.897Zm8.081 65.352c-8.385-.936-16.291-2.203-23.593-3.793c2.26-7.3 5.045-14.885 8.298-22.6a321.187 321.187 0 0 0 7.257 13.246c2.594 4.48 5.28 8.868 8.038 13.147Zm37.542 31.03c-5.184-5.592-10.354-11.779-15.403-18.433c4.902.192 9.899.29 14.978.29c5.218 0 10.376-.117 15.453-.343c-4.985 6.774-10.018 12.97-15.028 18.486Zm52.198-57.817c3.422 7.8 6.306 15.345 8.596 22.52c-7.422 1.694-15.436 3.058-23.88 4.071a382.417 382.417 0 0 0 7.859-13.026a347.403 347.403 0 0 0 7.425-13.565Zm-16.898 8.101a358.557 358.557 0 0 1-12.281 19.815a329.4 329.4 0 0 1-23.444.823c-7.967 0-15.716-.248-23.178-.732a310.202 310.202 0 0 1-12.513-19.846h.001a307.41 307.41 0 0 1-10.923-20.627a310.278 310.278 0 0 1 10.89-20.637l-.001.001a307.318 307.318 0 0 1 12.413-19.761c7.613-.576 15.42-.876 23.31-.876H128c7.926 0 15.743.303 23.354.883a329.357 329.357 0 0 1 12.335 19.695a358.489 358.489 0 0 1 11.036 20.54a329.472 329.472 0 0 1-11 20.722Zm22.56-122.124c8.572 4.944 11.906 24.881 6.52 51.026c-.344 1.668-.73 3.367-1.15 5.09c-10.622-2.452-22.155-4.275-34.23-5.408c-7.034-10.017-14.323-19.124-21.64-27.008a160.789 160.789 0 0 1 5.888-5.4c18.9-16.447 36.564-22.941 
44.612-18.3ZM128 90.808c12.625 0 22.86 10.235 22.86 22.86s-10.235 22.86-22.86 22.86s-22.86-10.235-22.86-22.86s10.235-22.86 22.86-22.86Z"></path></svg>
|
||||
|
After Width: | Height: | Size: 4.0 KiB |
38
src/web/src/components/AlertFilters.jsx
Normal file
@ -0,0 +1,38 @@
|
||||
import { Group, Select } from "@mantine/core";
|
||||
|
||||
export function AlertFilters({ filters, setFilters, nodeOptions }) {
|
||||
return (
|
||||
<Group spacing="md">
|
||||
<Select
|
||||
label="严重性"
|
||||
value={filters.severity}
|
||||
onChange={(value) => setFilters((f) => ({ ...f, severity: value }))}
|
||||
data={[
|
||||
{ value: "all", label: "全部" },
|
||||
{ value: "critical", label: "严重" },
|
||||
{ value: "warning", label: "警告" },
|
||||
{ value: "info", label: "信息" },
|
||||
]}
|
||||
w={150}
|
||||
/>
|
||||
<Select
|
||||
label="状态"
|
||||
value={filters.state}
|
||||
onChange={(value) => setFilters((f) => ({ ...f, state: value }))}
|
||||
data={[
|
||||
{ value: "all", label: "全部" },
|
||||
{ value: "active", label: "Active" },
|
||||
{ value: "resolved", label: "Resolved" },
|
||||
]}
|
||||
w={150}
|
||||
/>
|
||||
<Select
|
||||
label="节点"
|
||||
value={filters.instance}
|
||||
onChange={(value) => setFilters((f) => ({ ...f, instance: value }))}
|
||||
data={nodeOptions}
|
||||
w={150}
|
||||
/>
|
||||
</Group>
|
||||
);
|
||||
}
|
||||
47
src/web/src/components/AlertStats.jsx
Normal file
@ -0,0 +1,47 @@
|
||||
import { Card, Group, Text, Badge, Stack, Anchor } from "@mantine/core";
|
||||
import { Link } from "react-router-dom";
|
||||
|
||||
export function AlertStats({ stats, layout = "row", title, link }) {
|
||||
const Wrapper = layout === "row" ? Group : Stack;
|
||||
|
||||
return (
|
||||
<Card withBorder radius="md" shadow="sm" p="md" mb="md">
|
||||
{(title || link) && (
|
||||
<Group position="apart" mb="sm">
|
||||
{title && <Text fw={700} size="lg">{title}</Text>}
|
||||
{link && (
|
||||
<Anchor component={Link} to={link} size="sm" underline>
|
||||
查看更多
|
||||
</Anchor>
|
||||
)}
|
||||
</Group>
|
||||
)}
|
||||
|
||||
<Wrapper spacing="xl" grow>
|
||||
<Group spacing="xs">
|
||||
<Badge color="gray" radius="sm" variant="filled">●</Badge>
|
||||
<Text size="sm" fw={500}>总数</Text>
|
||||
<Text fw={700} color="gray">{stats.total || 0}</Text>
|
||||
</Group>
|
||||
|
||||
<Group spacing="xs">
|
||||
<Badge color="red" radius="sm" variant="filled">●</Badge>
|
||||
<Text size="sm" fw={500}>严重</Text>
|
||||
<Text fw={700} color="red">{stats.critical || 0}</Text>
|
||||
</Group>
|
||||
|
||||
<Group spacing="xs">
|
||||
<Badge color="orange" radius="sm" variant="filled">●</Badge>
|
||||
<Text size="sm" fw={500}>警告</Text>
|
||||
<Text fw={700} color="orange">{stats.warning || 0}</Text>
|
||||
</Group>
|
||||
|
||||
<Group spacing="xs">
|
||||
<Badge color="blue" radius="sm" variant="filled">●</Badge>
|
||||
<Text size="sm" fw={500}>信息</Text>
|
||||
<Text fw={700} color="blue">{stats.info || 0}</Text>
|
||||
</Group>
|
||||
</Wrapper>
|
||||
</Card>
|
||||
);
|
||||
}
|
||||
96
src/web/src/components/AlertTable.jsx
Normal file
@ -0,0 +1,96 @@
|
||||
import { Table, Group, ActionIcon, Button } from "@mantine/core";
|
||||
import { IconChevronUp, IconChevronDown } from "@tabler/icons-react";
|
||||
|
||||
export function AlertTable({
|
||||
alerts,
|
||||
paginatedAlerts,
|
||||
page,
|
||||
setPage,
|
||||
pageSize,
|
||||
sortedAlerts,
|
||||
sortConfig,
|
||||
handleSort,
|
||||
getRowColor,
|
||||
getSeverityColor,
|
||||
getStateBadge,
|
||||
formatRelativeTime,
|
||||
}) {
|
||||
const totalPages = Math.ceil(sortedAlerts.length / pageSize);
|
||||
|
||||
return (
|
||||
<>
|
||||
<Table striped highlightOnHover>
|
||||
<Table.Thead>
|
||||
<Table.Tr>
|
||||
{[
|
||||
{ key: "alertname", label: "名称" },
|
||||
{ key: "instance", label: "节点" },
|
||||
{ key: "severity", label: "严重性" },
|
||||
{ key: "state", label: "状态" },
|
||||
{ key: "startsAt", label: "开始时间" },
|
||||
{ key: "endsAt", label: "结束时间" },
|
||||
{ key: "updatedAt", label: "更新时间" },
|
||||
{ key: "summary", label: "描述" },
|
||||
].map((col) => (
|
||||
<Table.Th key={col.key}>
|
||||
<Group spacing={4}>
|
||||
{col.label}
|
||||
{["severity", "startsAt", "instance"].includes(col.key) && (
|
||||
<ActionIcon size="xs" onClick={() => handleSort(col.key)}>
|
||||
{sortConfig.key === col.key && sortConfig.direction === "asc" ? (
|
||||
<IconChevronUp size={14} />
|
||||
) : (
|
||||
<IconChevronDown size={14} />
|
||||
)}
|
||||
</ActionIcon>
|
||||
)}
|
||||
</Group>
|
||||
</Table.Th>
|
||||
))}
|
||||
</Table.Tr>
|
||||
</Table.Thead>
|
||||
<Table.Tbody>
|
||||
{paginatedAlerts.map((alert, i) => (
|
||||
<Table.Tr key={i} style={{ backgroundColor: getRowColor(alert) }}>
|
||||
<Table.Td>{alert.labels?.alertname || "-"}</Table.Td>
|
||||
<Table.Td>{alert.labels?.instance || "-"}</Table.Td>
|
||||
<Table.Td style={{ color: getSeverityColor(alert.labels?.severity) }}>
|
||||
{alert.labels?.severity || "info"}
|
||||
</Table.Td>
|
||||
<Table.Td>{getStateBadge(alert.status?.state)}</Table.Td>
|
||||
<Table.Td title={alert.startsAt || "-"}>{formatRelativeTime(alert.startsAt)}</Table.Td>
|
||||
<Table.Td title={alert.endsAt || "-"}>
|
||||
{alert.endsAt ? new Date(alert.endsAt).toLocaleString() : "-"}
|
||||
</Table.Td>
|
||||
<Table.Td title={alert.updatedAt || "-"}>{formatRelativeTime(alert.updatedAt)}</Table.Td>
|
||||
<Table.Td>{alert.annotations?.summary || "-"}</Table.Td>
|
||||
</Table.Tr>
|
||||
))}
|
||||
</Table.Tbody>
|
||||
</Table>
|
||||
|
||||
{/* 分页控件 */}
|
||||
<Group position="apart" mt="sm">
|
||||
<Button
|
||||
disabled={page === 1}
|
||||
onClick={() => setPage((p) => Math.max(1, p - 1))}
|
||||
variant="outline"
|
||||
size="xs"
|
||||
>
|
||||
上一页
|
||||
</Button>
|
||||
<span>
|
||||
{page} / {totalPages}
|
||||
</span>
|
||||
<Button
|
||||
disabled={page >= totalPages}
|
||||
onClick={() => setPage((p) => p + 1)}
|
||||
variant="outline"
|
||||
size="xs"
|
||||
>
|
||||
下一页
|
||||
</Button>
|
||||
</Group>
|
||||
</>
|
||||
);
|
||||
}
|
||||
66
src/web/src/components/EntryCard.jsx
Normal file
@ -0,0 +1,66 @@
|
||||
import { Card, Flex, Image, Text, UnstyledButton } from "@mantine/core";
|
||||
import { IconArrowRight } from "@tabler/icons-react";
|
||||
|
||||
export default function EntryCard({ label, href, icon }) {
|
||||
return (
|
||||
<Card
|
||||
shadow="sm"
|
||||
p="lg"
|
||||
withBorder
|
||||
radius="md"
|
||||
style={{
|
||||
position: "relative",
|
||||
aspectRatio: "1 / 1",
|
||||
transition: "transform 0.2s, box-shadow 0.2s",
|
||||
}}
|
||||
sx={(theme) => ({
|
||||
'&:hover': {
|
||||
transform: 'translateY(-4px)',
|
||||
boxShadow: theme.shadows.md,
|
||||
},
|
||||
})}
|
||||
>
|
||||
{/* 图标 + 标题 居中 */}
|
||||
<Flex
|
||||
direction="column"
|
||||
align="center"
|
||||
justify="center"
|
||||
style={{ flex: 1, textAlign: "center", gap: "12px", height: "100%" }}
|
||||
>
|
||||
<Image src={icon} alt={label} width={48} height={48} fit="contain" />
|
||||
<Text fw={600}>{label}</Text>
|
||||
</Flex>
|
||||
|
||||
{/* 悬浮圆形箭头按钮 + 动画效果 */}
|
||||
<UnstyledButton
|
||||
component="a"
|
||||
href={href}
|
||||
target="_blank"
|
||||
rel="noopener noreferrer"
|
||||
style={{
|
||||
position: "absolute",
|
||||
bottom: 16,
|
||||
right: 16,
|
||||
width: 40,
|
||||
height: 40,
|
||||
borderRadius: "50%",
|
||||
display: "flex",
|
||||
alignItems: "center",
|
||||
justifyContent: "center",
|
||||
backgroundColor: "rgba(0, 0, 0, 0.05)",
|
||||
transition: "background-color 0.2s, transform 0.2s",
|
||||
}}
|
||||
onMouseEnter={(e) => {
|
||||
e.currentTarget.style.backgroundColor = "rgba(0, 0, 0, 0.15)";
|
||||
e.currentTarget.style.transform = "translateX(4px)";
|
||||
}}
|
||||
onMouseLeave={(e) => {
|
||||
e.currentTarget.style.backgroundColor = "rgba(0, 0, 0, 0.05)";
|
||||
e.currentTarget.style.transform = "translateX(0)";
|
||||
}}
|
||||
>
|
||||
<IconArrowRight size={18} />
|
||||
</UnstyledButton>
|
||||
</Card>
|
||||
);
|
||||
}
|
||||
13
src/web/src/components/HeaderBar.jsx
Normal file
@ -0,0 +1,13 @@
|
||||
import { Group, Text } from "@mantine/core";
|
||||
import { SystemIcon } from "../components/SystemIcon";
|
||||
|
||||
export default function HeaderBar() {
|
||||
return (
|
||||
<Group justify="space-between" h="100%" px="md">
|
||||
<Group spacing="sm" align="center">
|
||||
<SystemIcon size={32} />
|
||||
<Text fw={700}>GPU 集群运维系统</Text>
|
||||
</Group>
|
||||
</Group>
|
||||
);
|
||||
}
|
||||
62
src/web/src/components/HealthCard.jsx
Normal file
@ -0,0 +1,62 @@
|
||||
import { Card, Group, Text, RingProgress } from "@mantine/core";
|
||||
|
||||
// 给一些常见状态定义颜色,没定义的就用 gray
|
||||
const statusColors = {
|
||||
healthy: "green",
|
||||
warning: "yellow",
|
||||
error: "red",
|
||||
online: "green",
|
||||
offline: "gray",
|
||||
};
|
||||
|
||||
export function HealthCard({ health }) {
|
||||
const totalNodes = health?.total || 0;
|
||||
const stats = health?.status_statistics || [];
|
||||
|
||||
// 计算环形图 sections
|
||||
const sections = stats.map((s) => ({
|
||||
value: (s.count / totalNodes) * 100,
|
||||
color: statusColors[s.status] || "gray",
|
||||
}));
|
||||
|
||||
// 计算一个主展示百分比(这里沿用原来的逻辑,用 online 或 healthy 优先)
|
||||
const mainStatus = stats.find(
|
||||
(s) => s.status === "online" || s.status === "healthy"
|
||||
);
|
||||
const mainPercent = mainStatus
|
||||
? ((mainStatus.count / totalNodes) * 100).toFixed(1)
|
||||
: "0.0";
|
||||
|
||||
return (
|
||||
<Card shadow="sm" radius="md" p="lg">
|
||||
<Text fw={700} size="lg" mb="md">节点健康状态</Text>
|
||||
|
||||
<Group spacing="xl" align="center">
|
||||
<RingProgress
|
||||
size={140}
|
||||
thickness={14}
|
||||
sections={sections}
|
||||
label={
|
||||
<Text fw={700} ta="center" size="lg">
|
||||
{mainPercent}%
|
||||
</Text>
|
||||
}
|
||||
/>
|
||||
|
||||
<div style={{ display: "flex", flexDirection: "column", justifyContent: "center", gap: 8 }}>
|
||||
{stats.map((s, idx) => (
|
||||
<div
|
||||
key={idx}
|
||||
style={{ display: "flex", justifyContent: "space-between", width: 140 }}
|
||||
>
|
||||
<Text size="sm" color={statusColors[s.status] || "gray"}>
|
||||
{s.status}
|
||||
</Text>
|
||||
<Text fw={600}>{s.count}</Text>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
</Group>
|
||||
</Card>
|
||||
);
|
||||
}
|
||||
150
src/web/src/components/NodeConfigCard.jsx
Normal file
@ -0,0 +1,150 @@
|
||||
import { useState, useEffect } from "react";
|
||||
import { Card, Text, Group, TextInput, Stack, ActionIcon } from "@mantine/core";
|
||||
import { IconEdit, IconX, IconCheck, IconPlus, IconTrash } from "@tabler/icons-react";
|
||||
import { apiRequest } from "../config/request";
|
||||
import { MASTER_API } from "../config/api";
|
||||
|
||||
export default function NodeConfigCard({ nodeId, config = {}, onSaved }) {
|
||||
const [editing, setEditing] = useState(false);
|
||||
const [configList, setConfigList] = useState([]);
|
||||
const [newKey, setNewKey] = useState("");
|
||||
const [newValue, setNewValue] = useState("");
|
||||
const [saving, setSaving] = useState(false);
|
||||
|
||||
useEffect(() => {
|
||||
const arr = Object.entries(config || {});
|
||||
setConfigList(arr);
|
||||
}, [config]);
|
||||
|
||||
const removeConfig = (index) => {
|
||||
setConfigList((prev) => prev.filter((_, i) => i !== index));
|
||||
};
|
||||
|
||||
const updateConfig = (index, key, value) => {
|
||||
setConfigList((prev) =>
|
||||
prev.map((item, i) => (i === index ? [key, value] : item))
|
||||
);
|
||||
};
|
||||
|
||||
const addConfig = () => {
|
||||
if (newKey && !configList.find(([k]) => k === newKey)) {
|
||||
setConfigList((prev) => [...prev, [newKey, newValue]]);
|
||||
setNewKey("");
|
||||
setNewValue("");
|
||||
}
|
||||
};
|
||||
|
||||
const handleSave = async () => {
|
||||
setSaving(true);
|
||||
try {
|
||||
let finalList = [...configList];
|
||||
// 如果有未点击“+”的新配置,补充进去
|
||||
if (newKey && !finalList.find(([k]) => k === newKey)) {
|
||||
finalList = [...finalList, [newKey, newValue]];
|
||||
setNewKey("");
|
||||
setNewValue("");
|
||||
}
|
||||
|
||||
const configObj = Object.fromEntries(finalList);
|
||||
|
||||
await apiRequest(MASTER_API.CONFIG(nodeId), {
|
||||
method: "PUT",
|
||||
headers: { "Content-Type": "application/json" },
|
||||
body: JSON.stringify({ config: configObj }),
|
||||
});
|
||||
|
||||
setConfigList(finalList); // 更新 state,保持 UI 同步
|
||||
setEditing(false);
|
||||
onSaved && onSaved();
|
||||
} finally {
|
||||
setSaving(false);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
return (
|
||||
<Card shadow="sm" radius="md" withBorder>
|
||||
<Group position="apart" mb="sm">
|
||||
<Text fw={600}>配置信息</Text>
|
||||
<Group spacing="xs">
|
||||
{editing ? (
|
||||
<>
|
||||
<ActionIcon
|
||||
color="green"
|
||||
size="sm"
|
||||
loading={saving}
|
||||
onClick={handleSave}
|
||||
>
|
||||
<IconCheck size={16} />
|
||||
</ActionIcon>
|
||||
<ActionIcon color="red" size="sm" onClick={() => setEditing(false)}>
|
||||
<IconX size={16} />
|
||||
</ActionIcon>
|
||||
</>
|
||||
) : (
|
||||
<ActionIcon color="blue" size="sm" onClick={() => setEditing(true)}>
|
||||
<IconEdit size={16} />
|
||||
</ActionIcon>
|
||||
)}
|
||||
</Group>
|
||||
</Group>
|
||||
|
||||
{editing ? (
|
||||
<Stack spacing="xs">
|
||||
{configList.map(([key, value], idx) => (
|
||||
<Group key={idx} spacing="xs">
|
||||
<TextInput
|
||||
placeholder="Key"
|
||||
value={key}
|
||||
onChange={(e) => updateConfig(idx, e.target.value, value)}
|
||||
/>
|
||||
<TextInput
|
||||
placeholder="Value"
|
||||
value={value}
|
||||
onChange={(e) => updateConfig(idx, key, e.target.value)}
|
||||
/>
|
||||
<ActionIcon color="red" onClick={() => removeConfig(idx)}>
|
||||
<IconTrash size={16} />
|
||||
</ActionIcon>
|
||||
</Group>
|
||||
))}
|
||||
<Group spacing="xs">
|
||||
<TextInput
|
||||
placeholder="新增 Key"
|
||||
value={newKey}
|
||||
onChange={(e) => setNewKey(e.target.value)}
|
||||
/>
|
||||
<TextInput
|
||||
placeholder="新增 Value"
|
||||
value={newValue}
|
||||
onChange={(e) => setNewValue(e.target.value)}
|
||||
onKeyDown={(e) => {
|
||||
if (e.key === "Enter") {
|
||||
e.preventDefault();
|
||||
addConfig();
|
||||
}
|
||||
}}
|
||||
/>
|
||||
|
||||
<ActionIcon color="blue" onClick={addConfig}>
|
||||
<IconPlus size={16} />
|
||||
</ActionIcon>
|
||||
</Group>
|
||||
</Stack>
|
||||
) : (
|
||||
<Stack spacing="xs">
|
||||
{configList.length > 0 ? (
|
||||
configList.map(([key, value], idx) => (
|
||||
<Group key={idx} spacing="xs">
|
||||
<Text fw={500}>{key}:</Text>
|
||||
<Text>{String(value)}</Text>
|
||||
</Group>
|
||||
))
|
||||
) : (
|
||||
<Text c="dimmed">暂无配置</Text>
|
||||
)}
|
||||
</Stack>
|
||||
)}
|
||||
</Card>
|
||||
);
|
||||
}
|
||||
131
src/web/src/components/NodeDetailDrawer.jsx
Normal file
@ -0,0 +1,131 @@
|
||||
import { useState, useEffect } from "react";
|
||||
import {
|
||||
Drawer,
|
||||
Text,
|
||||
Loader,
|
||||
Center,
|
||||
ScrollArea,
|
||||
Group,
|
||||
Divider,
|
||||
ThemeIcon,
|
||||
Stack,
|
||||
ActionIcon,
|
||||
} from "@mantine/core";
|
||||
import { IconRefresh } from "@tabler/icons-react";
|
||||
import { healthStatus } from "../config/status";
|
||||
import { apiRequest } from "../config/request";
|
||||
import { MASTER_API } from "../config/api";
|
||||
|
||||
import NodeConfigCard from "./NodeConfigCard";
|
||||
import NodeLabelCard from "./NodeLabelCard";
|
||||
import NodeMetaCard from "./NodeMetaCard";
|
||||
import NodeHealthCard from "./NodeHealthCard";
|
||||
|
||||
export default function NodeDetailDrawer({ opened, nodeId, onClose }) {
|
||||
const [node, setNode] = useState(null);
|
||||
const [loading, setLoading] = useState(false);
|
||||
|
||||
const fetchNodeDetail = async (id) => {
|
||||
if (!id) return;
|
||||
setLoading(true);
|
||||
try {
|
||||
const res = await apiRequest(MASTER_API.DETAIL(id));
|
||||
setNode(res);
|
||||
} finally {
|
||||
setLoading(false);
|
||||
}
|
||||
};
|
||||
|
||||
useEffect(() => {
|
||||
if (opened && nodeId) fetchNodeDetail(nodeId);
|
||||
}, [opened, nodeId]);
|
||||
|
||||
return (
|
||||
<Drawer
|
||||
opened={opened}
|
||||
onClose={onClose}
|
||||
position="right"
|
||||
size="lg"
|
||||
title="节点详情"
|
||||
padding="lg"
|
||||
overlayProps={{ backgroundOpacity: 0.4, blur: 4 }}
|
||||
>
|
||||
{loading && !node ? (
|
||||
<Center h={200}>
|
||||
<Loader size="sm" />
|
||||
</Center>
|
||||
) : node ? (
|
||||
<div style={{ height: "90vh", display: "flex", flexDirection: "column" }}>
|
||||
{/* 固定头部基础信息 */}
|
||||
<div
|
||||
style={{
|
||||
position: "sticky",
|
||||
top: 0,
|
||||
background: "white",
|
||||
zIndex: 10,
|
||||
paddingBottom: 8,
|
||||
}}
|
||||
>
|
||||
<Group spacing="sm" align="center" position="apart">
|
||||
<Group spacing="sm" align="center">
|
||||
<ThemeIcon
|
||||
size="lg"
|
||||
radius="xl"
|
||||
color={healthStatus(node.status).color}
|
||||
variant="light"
|
||||
>
|
||||
{healthStatus(node.status).icon}
|
||||
</ThemeIcon>
|
||||
|
||||
<Text fw={700} size="xl">{node.name}</Text>
|
||||
<Text c="dimmed">{node.type}</Text>
|
||||
<Text c={healthStatus(node.status).color}>{node.status}</Text>
|
||||
<Text c="dimmed" size="sm">
|
||||
最近上报时间: {new Date(node.last_report).toLocaleString()}
|
||||
</Text>
|
||||
</Group>
|
||||
|
||||
{/* 刷新按钮固定在右侧 */}
|
||||
<ActionIcon
|
||||
color="blue"
|
||||
variant="light"
|
||||
onClick={() => fetchNodeDetail(node.id)}
|
||||
disabled={loading}
|
||||
>
|
||||
<IconRefresh size={18} />
|
||||
</ActionIcon>
|
||||
</Group>
|
||||
|
||||
<Divider my="sm" />
|
||||
</div>
|
||||
|
||||
{/* 滚动内容 */}
|
||||
<ScrollArea style={{ flex: 1 }}>
|
||||
<Stack spacing="md">
|
||||
{/* 配置信息 */}
|
||||
<NodeConfigCard nodeId={node.id} config={node.config || {}} onSaved={() => fetchNodeDetail(node.id)} />
|
||||
|
||||
{/* 标签信息 */}
|
||||
<NodeLabelCard nodeId={node.id} labels={Array.isArray(node.label) ? node.label : []} onSaved={() => fetchNodeDetail(node.id)} />
|
||||
|
||||
{/* 元数据 */}
|
||||
<NodeMetaCard node={node} />
|
||||
|
||||
{/* 健康信息 */}
|
||||
<NodeHealthCard node={node} />
|
||||
|
||||
{/* 其他基础信息展示 */}
|
||||
<Stack spacing="xs">
|
||||
<Text fw={500}>注册时间: <Text span c="dimmed">{new Date(node.register_time).toLocaleString()}</Text></Text>
|
||||
<Text fw={500}>最近上报时间: <Text span c="dimmed">{new Date(node.last_report).toLocaleString()}</Text></Text>
|
||||
<Text fw={500}>最后更新时间: <Text span c="dimmed">{new Date(node.last_updated).toLocaleString()}</Text></Text>
|
||||
</Stack>
|
||||
</Stack>
|
||||
</ScrollArea>
|
||||
</div>
|
||||
) : (
|
||||
<Text c="dimmed">暂无数据</Text>
|
||||
)}
|
||||
</Drawer>
|
||||
);
|
||||
}
|
||||
56
src/web/src/components/NodeHealthCard.jsx
Normal file
@ -0,0 +1,56 @@
|
||||
import { useState } from "react";
|
||||
import { Card, Text, Stack, Group, ActionIcon, Badge, Popover } from "@mantine/core";
|
||||
import { IconInfoCircle } from "@tabler/icons-react";
|
||||
|
||||
export default function NodeHealthCard({ node }) {
|
||||
const health = node.health || {};
|
||||
|
||||
const renderHealthItem = (moduleName, data) => {
|
||||
const status = data?.status || "unknown";
|
||||
const color = status === "healthy" ? "green" : status === "unhealthy" ? "red" : "gray";
|
||||
const [opened, setOpened] = useState(false);
|
||||
|
||||
return (
|
||||
<Group key={moduleName} spacing="xs" align="center">
|
||||
<Text size="sm" fw={500}>{moduleName}</Text>
|
||||
<Badge color={color} variant="light">{status}</Badge>
|
||||
{(data?.error || data?.timestamp) && (
|
||||
<Popover
|
||||
opened={opened}
|
||||
onClose={() => setOpened(false)}
|
||||
position="bottom"
|
||||
withArrow
|
||||
shadow="sm"
|
||||
>
|
||||
<Popover.Target>
|
||||
<ActionIcon size="xs" color="blue" variant="light" onClick={() => setOpened((o) => !o)}>
|
||||
<IconInfoCircle size={14} />
|
||||
</ActionIcon>
|
||||
</Popover.Target>
|
||||
<Popover.Dropdown>
|
||||
<Stack spacing={4}>
|
||||
{data.error && <Text size="xs" c="red">Error: {data.error}</Text>}
|
||||
{data.timestamp && (
|
||||
<Text size="xs" c="dimmed">
|
||||
Updated: {new Date(data.timestamp).toLocaleString()}
|
||||
</Text>
|
||||
)}
|
||||
</Stack>
|
||||
</Popover.Dropdown>
|
||||
</Popover>
|
||||
)}
|
||||
</Group>
|
||||
);
|
||||
};
|
||||
|
||||
return (
|
||||
<Card shadow="xs" radius="md" withBorder>
|
||||
<Text fw={600} mb="sm">健康信息</Text>
|
||||
<Stack spacing="xs">
|
||||
{Object.entries(health).map(([moduleName, data]) =>
|
||||
renderHealthItem(moduleName, data)
|
||||
)}
|
||||
</Stack>
|
||||
</Card>
|
||||
);
|
||||
}
|
||||
115
src/web/src/components/NodeLabelCard.jsx
Normal file
@ -0,0 +1,115 @@
|
||||
import { useState, useEffect } from "react";
|
||||
import { Card, Text, Group, TextInput, Stack, ActionIcon, Badge } from "@mantine/core";
|
||||
import { IconEdit, IconX, IconCheck, IconPlus, IconTrash } from "@tabler/icons-react";
|
||||
import { apiRequest } from "../config/request";
|
||||
import { MASTER_API } from "../config/api";
|
||||
|
||||
export default function NodeLabelCard({ nodeId, labels = [], onSaved }) {
|
||||
const [editing, setEditing] = useState(false);
|
||||
const [tagList, setTagList] = useState([]);
|
||||
const [tagColors, setTagColors] = useState([]);
|
||||
const [newTag, setNewTag] = useState("");
|
||||
const [saving, setSaving] = useState(false);
|
||||
|
||||
const randomColor = () => {
|
||||
const colors = ["red", "pink", "grape", "violet", "indigo", "blue", "cyan", "teal", "green", "lime", "yellow", "orange", "gray"];
|
||||
return colors[Math.floor(Math.random() * colors.length)];
|
||||
};
|
||||
|
||||
useEffect(() => {
|
||||
const arr = Array.isArray(labels) ? labels : [];
|
||||
setTagList(arr);
|
||||
setTagColors(arr.map(() => randomColor()));
|
||||
}, [labels]);
|
||||
|
||||
const removeTag = (index) => {
|
||||
setTagList((prev) => prev.filter((_, i) => i !== index));
|
||||
setTagColors((prev) => prev.filter((_, i) => i !== index));
|
||||
};
|
||||
|
||||
const updateTag = (index, value) => {
|
||||
setTagList((prev) => prev.map((t, i) => (i === index ? value : t)));
|
||||
};
|
||||
|
||||
const addTag = () => {
|
||||
if (newTag && !tagList.includes(newTag)) {
|
||||
setTagList((prev) => [...prev, newTag]);
|
||||
setTagColors((prev) => [...prev, randomColor()]);
|
||||
setNewTag("");
|
||||
}
|
||||
};
|
||||
|
||||
const handleSave = async () => {
|
||||
setSaving(true);
|
||||
try {
|
||||
let finalTags = [...tagList];
|
||||
if (newTag && !finalTags.includes(newTag)) {
|
||||
finalTags = [...finalTags, newTag];
|
||||
setNewTag(""); // 清空输入框
|
||||
}
|
||||
|
||||
await apiRequest(MASTER_API.CONFIG(nodeId), {
|
||||
method: "PUT",
|
||||
headers: { "Content-Type": "application/json" },
|
||||
body: JSON.stringify({ label: finalTags }),
|
||||
});
|
||||
|
||||
setTagList(finalTags);
|
||||
setEditing(false);
|
||||
onSaved && onSaved();
|
||||
} finally {
|
||||
setSaving(false);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
return (
|
||||
<Card shadow="sm" radius="md" withBorder>
|
||||
<Group position="apart" mb="sm">
|
||||
<Text fw={600}>标签信息</Text>
|
||||
<Group spacing="xs">
|
||||
{editing ? (
|
||||
<>
|
||||
<ActionIcon color="green" size="sm" loading={saving} onClick={handleSave}><IconCheck size={16} /></ActionIcon>
|
||||
<ActionIcon color="red" size="sm" onClick={() => setEditing(false)}><IconX size={16} /></ActionIcon>
|
||||
</>
|
||||
) : (
|
||||
<ActionIcon color="blue" size="sm" onClick={() => setEditing(true)}><IconEdit size={16} /></ActionIcon>
|
||||
)}
|
||||
</Group>
|
||||
</Group>
|
||||
|
||||
{editing ? (
|
||||
<Stack spacing="xs">
|
||||
{tagList.map((tag, idx) => (
|
||||
<Group key={idx} spacing="xs">
|
||||
<TextInput value={tag} onChange={(e) => updateTag(idx, e.target.value)} />
|
||||
<ActionIcon color="red" onClick={() => removeTag(idx)}><IconTrash size={16} /></ActionIcon>
|
||||
</Group>
|
||||
))}
|
||||
<Group spacing="xs">
|
||||
<TextInput
|
||||
placeholder="新增标签"
|
||||
value={newTag}
|
||||
onChange={(e) => setNewTag(e.target.value)}
|
||||
onKeyDown={(e) => {
|
||||
if (e.key === "Enter") {
|
||||
e.preventDefault(); // 阻止默认提交行为
|
||||
addTag();
|
||||
}
|
||||
}}
|
||||
/>
|
||||
|
||||
<ActionIcon color="blue" onClick={addTag}><IconPlus size={16} /></ActionIcon>
|
||||
</Group>
|
||||
</Stack>
|
||||
) : (
|
||||
<Group spacing="xs" wrap="wrap">
|
||||
{tagList.length > 0 ? tagList.map((tag, idx) => (
|
||||
<Badge key={idx} color={tagColors[idx]} variant="light">{tag}</Badge>
|
||||
)) : <Text c="dimmed">暂无标签</Text>}
|
||||
</Group>
|
||||
)}
|
||||
</Card>
|
||||
);
|
||||
}
|
||||