Compare commits

...

6 Commits

Author SHA1 Message Date
sundapeng.sdp
4965a25ea3 feat: 新增GPU测试节点argus-metric-test-gpu-node,测试安装包安装流程及与agent/master组件之间数据准确性;
refs #20
2025-10-17 10:18:18 +08:00
sundapeng.sdp
e8a543e1d1 fix: 修复 Dockerfile 中 chown 目录可能导致用户权限的问题;
refs #20
2025-10-17 10:18:18 +08:00
sundapeng.sdp
2a79935b7a feat: 新增测试节点,用于测试FTP安装包安装流程;修复写入 /etc/resolv.conf 顺序问题导致dns解析有误;
refs #20
2025-10-17 10:18:18 +08:00
sundapeng.sdp
5db6931315 fix: 修复docker-compose中配置路径不正确的问题;
refs #20
2025-10-17 10:18:18 +08:00
sundapeng.sdp
04051156c9 refactor: 优化argus-metric模块e2e测试;
refs #20
2025-10-17 10:18:18 +08:00
31ccb0b1b8 增加sys/debug 部署测试;agent dev/user/instance元信息提取优化;sys/tests 优化 (#26)
Reviewed-on: #26
Reviewed-by: xuxt <xuxt@zgclab.edu.cn>
Reviewed-by: huhy <husteryezi@163.com>
Reviewed-by: sundapeng <sundp@mail.zgclab.edu.cn>
2025-10-16 17:16:07 +08:00
72 changed files with 3031 additions and 644 deletions

View File

@ -10,6 +10,7 @@ Usage: $0 [OPTIONS]
Options:
--intranet Use intranet mirror for log/bind builds
--master-offline Build master offline image (requires src/master/offline_wheels.tar.gz)
--no-cache Build all images without using Docker layer cache
-h, --help Show this help message
Examples:
@ -23,6 +24,7 @@ EOF
use_intranet=false
build_master=true
build_master_offline=false
no_cache=false
while [[ $# -gt 0 ]]; do
case $1 in
@ -39,6 +41,10 @@ while [[ $# -gt 0 ]]; do
build_master_offline=true
shift
;;
--no-cache)
no_cache=true
shift
;;
-h|--help)
show_help
exit 0
@ -65,6 +71,10 @@ cd "$root"
load_build_user
build_args+=("--build-arg" "ARGUS_BUILD_UID=${ARGUS_BUILD_UID}" "--build-arg" "ARGUS_BUILD_GID=${ARGUS_BUILD_GID}")
if [[ "$no_cache" == true ]]; then
build_args+=("--no-cache")
fi
master_root="$root/src/master"
master_offline_tar="$master_root/offline_wheels.tar.gz"
master_offline_dir="$master_root/offline_wheels"
@ -159,6 +169,9 @@ if [[ "$build_master" == true ]]; then
if [[ "$build_master_offline" == true ]]; then
master_args+=("--offline")
fi
if [[ "$no_cache" == true ]]; then
master_args+=("--no-cache")
fi
if ./scripts/build_images.sh "${master_args[@]}"; then
if [[ "$build_master_offline" == true ]]; then
images_built+=("argus-master:offline")

View File

@ -34,6 +34,18 @@ Agent 不再依赖配置文件;所有参数均由环境变量与主机名推
| `MASTER_ENDPOINT` | 是 | N/A | Master 基础地址,可写 `http://host:3000` 或 `host:3000`(自动补全 `http://`)。 |
| `REPORT_INTERVAL_SECONDS` | 否 | `60` | 状态上报间隔(秒)。必须为正整数。 |
| `AGENT_HOSTNAME` | 否 | `$(hostname)` | 覆盖容器内主机名,便于测试或特殊命名需求。 |
| `AGENT_ENV` | 否 | 来源于主机名 | 运行环境标识(如 `dev`、`prod`)。与 `AGENT_USER`、`AGENT_INSTANCE` 必须同时设置。 |
| `AGENT_USER` | 否 | 来源于主机名 | 归属用户或团队标识。与 `AGENT_ENV`、`AGENT_INSTANCE` 必须同时设置。 |
| `AGENT_INSTANCE` | 否 | 来源于主机名 | 实例编号或别名。与 `AGENT_ENV`、`AGENT_USER` 必须同时设置。 |
主机名与元数据的解析优先级:
1. 若设置 `AGENT_ENV` / `AGENT_USER` / `AGENT_INSTANCE` 且全部存在,则直接使用这些值。
2. 否则检查历史 `node.json`(注册成功后由 Master 返回的信息),若包含 `env` / `user` / `instance` 则沿用。
3. 若以上均不可用,则按历史约定从主机名解析 `env-user-instance` 前缀。
4. 如果仍无法得到完整结果,Agent 启动会失败并提示需要提供上述环境变量。
> 提示:在首次部署时需确保环境变量或主机名能够提供完整信息。完成注册后,Agent 会把 Master 返回的元数据写入 `node.json`,后续重启无需再次提供环境变量就能保持一致性。
派生路径:

View File

@ -18,13 +18,12 @@ _HOSTNAME_PATTERN = re.compile(r"^([^-]+)-([^-]+)-([^-]+)-.*$")
def collect_metadata(config: AgentConfig) -> Dict[str, Any]:
"""汇总节点注册需要的静态信息。"""
hostname = config.hostname
env, user, instance = _parse_hostname(hostname)
meta = {
"hostname": hostname,
"ip": _detect_ip_address(),
"env": env,
"user": user,
"instance": instance,
"env": config.environment,
"user": config.user,
"instance": config.instance,
"cpu_number": _detect_cpu_count(),
"memory_in_bytes": _detect_memory_bytes(),
"gpu_number": _detect_gpu_count(),

View File

@ -6,14 +6,21 @@ from dataclasses import dataclass
from pathlib import Path
from typing import Final
from .state import load_node_state
from .version import VERSION
from .log import get_logger
DEFAULT_REPORT_INTERVAL_SECONDS: Final[int] = 60
LOGGER = get_logger("argus.agent.config")
@dataclass(frozen=True)
class AgentConfig:
hostname: str
environment: str
user: str
instance: str
node_file: str
version: str
master_endpoint: str
@ -47,11 +54,68 @@ def _resolve_hostname() -> str:
return os.environ.get("AGENT_HOSTNAME") or socket.gethostname()
def _load_metadata_from_state(node_file: str) -> tuple[str, str, str] | None:
    """Return the ``(env, user, instance)`` triple persisted in the node state.

    Each field is read from the ``meta_data`` mapping first and, if absent or
    empty there, from the top level of the state.

    Args:
        node_file: Path of the persisted node state file (``node.json``).

    Returns:
        The triple when all three fields are present and non-empty; ``None``
        when the state is missing/empty or any field cannot be found.
    """
    state = load_node_state(node_file)
    if not state:
        return None

    meta = state.get("meta_data")
    if not isinstance(meta, dict):
        # A corrupt or hand-edited node.json may hold a list/string here.
        # Treat it as "no metadata" so the caller can fall back further
        # instead of crashing on ``.get``.
        meta = {}

    env = meta.get("env") or state.get("env")
    user = meta.get("user") or state.get("user")
    instance = meta.get("instance") or state.get("instance")

    if env and user and instance:
        LOGGER.debug("Metadata resolved from node state", extra={"node_file": node_file})
        return env, user, instance

    LOGGER.warning(
        "node.json missing metadata fields; ignoring",
        extra={"node_file": node_file, "meta_data": meta},
    )
    return None
def _resolve_metadata_fields(hostname: str, node_file: str) -> tuple[str, str, str]:
    """Work out the ``(env, user, instance)`` triple for this agent.

    Sources are tried in order:

    1. ``AGENT_ENV`` / ``AGENT_USER`` / ``AGENT_INSTANCE`` when all are set.
    2. Metadata persisted in the node state file.
    3. The hostname, parsed by ``_parse_hostname``.

    Args:
        hostname: Agent hostname used for the last-resort parse.
        node_file: Path of the persisted node state file.

    Returns:
        The resolved ``(env, user, instance)`` triple.

    Raises:
        ValueError: If no source yields all three fields.
    """
    env_value = os.environ.get("AGENT_ENV")
    user_value = os.environ.get("AGENT_USER")
    instance_value = os.environ.get("AGENT_INSTANCE")

    if env_value and user_value and instance_value:
        return env_value, user_value, instance_value

    if env_value or user_value or instance_value:
        # Some, but not all, variables were supplied: report which ones.
        presence = {
            "has_env": bool(env_value),
            "has_user": bool(user_value),
            "has_instance": bool(instance_value),
        }
        LOGGER.warning(
            "Incomplete metadata environment variables; falling back to persisted metadata",
            extra=presence,
        )

    persisted = _load_metadata_from_state(node_file)
    if persisted is not None:
        return persisted

    # Imported here rather than at module level to avoid a circular dependency.
    from .collector import _parse_hostname

    parsed_env, parsed_user, parsed_instance = _parse_hostname(hostname)
    if parsed_env and parsed_user and parsed_instance:
        return parsed_env, parsed_user, parsed_instance

    raise ValueError(
        "Failed to determine metadata fields; set AGENT_ENV/USER/INSTANCE or use supported hostname pattern"
    )
def load_config() -> AgentConfig:
"""从环境变量推导配置,移除了外部配置文件依赖。"""
hostname = _resolve_hostname()
node_file = f"/private/argus/agent/{hostname}/node.json"
environment, user, instance = _resolve_metadata_fields(hostname, node_file)
health_dir = f"/private/argus/agent/{hostname}/health/"
master_endpoint_env = os.environ.get("MASTER_ENDPOINT")
@ -66,6 +130,9 @@ def load_config() -> AgentConfig:
return AgentConfig(
hostname=hostname,
environment=environment,
user=user,
instance=instance,
node_file=node_file,
version=VERSION,
master_endpoint=master_endpoint,

Binary file not shown.

View File

View File

@ -60,6 +60,36 @@ services:
ipv4_address: 172.28.0.20
restart: always
agent_env:
image: ubuntu:22.04
container_name: argus-agent-env-e2e
hostname: host_abc
depends_on:
- master
- bind
environment:
- MASTER_ENDPOINT=http://master.argus.com:3000
- REPORT_INTERVAL_SECONDS=2
- AGENT_ENV=prod
- AGENT_USER=ml
- AGENT_INSTANCE=node-3
- AGENT_HOSTNAME=host_abc
- "ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}"
- "ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}"
volumes:
- ./private/argus/agent/host_abc:/private/argus/agent/host_abc
- ./private/argus/agent/host_abc/health:/private/argus/agent/host_abc/health
- ./private/argus/etc:/private/argus/etc
- ../dist/argus-agent:/usr/local/bin/argus-agent:ro
- ./scripts/agent_entrypoint.sh:/usr/local/bin/agent-entrypoint.sh:ro
- ../scripts/agent_deployment_verify.sh:/usr/local/bin/agent_deployment_verify.sh:ro
entrypoint:
- /usr/local/bin/agent-entrypoint.sh
networks:
default:
ipv4_address: 172.28.0.21
restart: always
networks:
default:
driver: bridge

View File

@ -7,10 +7,10 @@ SCRIPTS=(
"02_up.sh"
"03_wait_and_assert_registration.sh"
"04_write_health_files.sh"
"08_verify_agent.sh"
"05_assert_status_on_master.sh"
"06_restart_agent_and_reregister.sh"
"07_down.sh"
"05_verify_agent.sh"
"06_assert_status_on_master.sh"
"07_restart_agent_and_reregister.sh"
"08_down.sh"
)
for script in "${SCRIPTS[@]}"; do

View File

@ -41,7 +41,7 @@ compose() {
fi
}
docker container rm -f argus-agent-e2e argus-master-agent-e2e argus-bind-agent-e2e >/dev/null 2>&1 || true
docker container rm -f argus-agent-e2e argus-agent-env-e2e argus-master-agent-e2e argus-bind-agent-e2e >/dev/null 2>&1 || true
docker network rm tests_default >/dev/null 2>&1 || true

View File

@ -6,11 +6,14 @@ TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_ROOT="$TEST_ROOT/tmp"
API_BASE="http://localhost:32300/api/v1/master"
AGENT_HOSTNAME="dev-e2euser-e2einst-pod-0"
ENV_AGENT_HOSTNAME="host_abc"
NODE_FILE="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME/node.json"
ENV_NODE_FILE="$TEST_ROOT/private/argus/agent/$ENV_AGENT_HOSTNAME/node.json"
mkdir -p "$TMP_ROOT"
node_id=""
primary_node_id=""
env_node_id=""
for _ in {1..30}; do
sleep 2
response=$(curl -sS "$API_BASE/nodes" || true)
@ -19,24 +22,49 @@ for _ in {1..30}; do
fi
list_file="$TMP_ROOT/nodes_list.json"
echo "$response" > "$list_file"
node_id=$(python3 - "$list_file" <<'PY'
readarray -t node_ids < <(python3 - "$list_file" "$AGENT_HOSTNAME" "$ENV_AGENT_HOSTNAME" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
nodes = json.load(handle)
print(nodes[0]["id"] if nodes else "")
target_primary = sys.argv[2]
target_env = sys.argv[3]
primary_id = ""
env_id = ""
for node in nodes:
if node.get("name") == target_primary:
primary_id = node.get("id", "")
if node.get("name") == target_env:
env_id = node.get("id", "")
print(primary_id)
print(env_id)
PY
)
if [[ -n "$node_id" ]]; then
)
primary_node_id="${node_ids[0]}"
env_node_id="${node_ids[1]}"
if [[ -n "$primary_node_id" && -n "$env_node_id" ]]; then
break
fi
done
if [[ -z "$node_id" ]]; then
echo "[ERROR] Agent did not register within timeout" >&2
if [[ -z "$primary_node_id" ]]; then
echo "[ERROR] Primary agent did not register within timeout" >&2
exit 1
fi
echo "$node_id" > "$TMP_ROOT/node_id"
if [[ -z "$env_node_id" ]]; then
echo "[ERROR] Env-variable agent did not register within timeout" >&2
exit 1
fi
echo "$primary_node_id" > "$TMP_ROOT/node_id"
echo "$env_node_id" > "$TMP_ROOT/node_id_host_abc"
if [[ ! -f "$NODE_FILE" ]]; then
echo "[ERROR] node.json not created at $NODE_FILE" >&2
@ -50,8 +78,20 @@ with open(sys.argv[1]) as handle:
assert "id" in node and node["id"], "node.json missing id"
PY
if [[ ! -f "$ENV_NODE_FILE" ]]; then
echo "[ERROR] node.json not created at $ENV_NODE_FILE" >&2
exit 1
fi
python3 - "$ENV_NODE_FILE" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
node = json.load(handle)
assert "id" in node and node["id"], "env agent node.json missing id"
PY
detail_file="$TMP_ROOT/initial_detail.json"
curl -sS "$API_BASE/nodes/$node_id" -o "$detail_file"
curl -sS "$API_BASE/nodes/$primary_node_id" -o "$detail_file"
python3 - "$detail_file" "$TMP_ROOT/initial_ip" <<'PY'
import json, sys, pathlib
with open(sys.argv[1]) as handle:
@ -62,4 +102,5 @@ if not ip:
pathlib.Path(sys.argv[2]).write_text(ip)
PY
echo "[INFO] Agent registered with node id $node_id"
echo "[INFO] Agent registered with node id $primary_node_id"
echo "[INFO] Env-variable agent registered with node id $env_node_id"

View File

@ -0,0 +1,60 @@
#!/usr/bin/env bash
# Run agent_deployment_verify.sh inside the primary (hostname-driven) agent
# container and, when it is running, inside the env-variable-driven one.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
REPO_ROOT="$(cd "$TEST_ROOT/.." && pwd)"
VERIFY_SCRIPT="$REPO_ROOT/scripts/agent_deployment_verify.sh"
PRIMARY_CONTAINER="argus-agent-e2e"
ENV_CONTAINER="argus-agent-env-e2e"
PRIMARY_HOSTNAME="dev-e2euser-e2einst-pod-0"
ENV_HOSTNAME="host_abc"

# Succeed when a container with exactly the given name is running.
# The name list is captured first: piping `docker ps` straight into `grep -q`
# can fail under `pipefail` when grep exits early and docker gets SIGPIPE.
container_running() {
  local names
  names="$(docker ps --format '{{.Names}}')"
  grep -Fxq -- "$1" <<<"$names"
}

# Best-effort install of curl and jq inside a container.
#   $1 - container name
#   $2 - label used in log messages
ensure_tools() {
  local container="$1" label="$2"
  if docker exec -i "$container" bash -lc 'command -v curl >/dev/null && command -v jq >/dev/null'; then
    echo "[INFO] curl/jq already installed in $label"
  else
    echo "[INFO] Installing curl/jq in $label"
    # Installation failures are tolerated here; the verifier reports them.
    docker exec -i "$container" bash -lc 'apt-get update >/dev/null 2>&1 && apt-get install -y curl jq >/dev/null 2>&1' || true
  fi
}

# Run the mounted verification script in a container.
#   $1 - container name
#   $2 - hostname passed to the verifier as VERIFY_HOSTNAME
# Skips when the container is not running; exits 1 when the script is not
# available inside the container.
run_verifier() {
  local container="$1" hostname="$2"

  if ! container_running "$container"; then
    echo "[WARN] container $container not running; skip"
    return
  fi

  if ! docker exec -i "$container" bash -lc 'command -v /usr/local/bin/agent_deployment_verify.sh >/dev/null'; then
    echo "[ERROR] /usr/local/bin/agent_deployment_verify.sh missing in $container" >&2
    exit 1
  fi

  echo "[INFO] Running verification for $hostname in $container"
  docker exec -i "$container" env VERIFY_HOSTNAME="$hostname" /usr/local/bin/agent_deployment_verify.sh
}

if ! container_running "$PRIMARY_CONTAINER"; then
  echo "[WARN] agent container not running; skip verification"
  exit 0
fi

ensure_tools "$PRIMARY_CONTAINER" "agent container"

if [[ ! -f "$VERIFY_SCRIPT" ]]; then
  echo "[ERROR] Verification script missing at $VERIFY_SCRIPT" >&2
  exit 1
fi

run_verifier "$PRIMARY_CONTAINER" "$PRIMARY_HOSTNAME"

if container_running "$ENV_CONTAINER"; then
  ensure_tools "$ENV_CONTAINER" "env agent container"
  run_verifier "$ENV_CONTAINER" "$ENV_HOSTNAME"
else
  echo "[WARN] env-driven agent container not running; skip secondary verification"
fi

View File

@ -6,6 +6,8 @@ TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_ROOT="$TEST_ROOT/tmp"
API_BASE="http://localhost:32300/api/v1/master"
NODE_ID="$(cat "$TMP_ROOT/node_id")"
ENV_NODE_ID="$(cat "$TMP_ROOT/node_id_host_abc")"
ENV_HOSTNAME="host_abc"
NODES_JSON="$TEST_ROOT/private/argus/metric/prometheus/nodes.json"
success=false
@ -41,13 +43,36 @@ if [[ ! -f "$NODES_JSON" ]]; then
exit 1
fi
python3 - "$NODES_JSON" <<'PY'
python3 - "$NODES_JSON" "$NODE_ID" "$ENV_NODE_ID" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
nodes = json.load(handle)
assert len(nodes) == 1, nodes
entry = nodes[0]
assert entry["node_id"], entry
expected_primary = sys.argv[2]
expected_env = sys.argv[3]
ids = {entry.get("node_id") for entry in nodes}
assert expected_primary in ids, nodes
assert expected_env in ids, nodes
assert len(nodes) >= 2, nodes
PY
echo "[INFO] Master reflects agent health and nodes.json entries"
env_detail_file="$TMP_ROOT/env_agent_detail.json"
curl -sS "$API_BASE/nodes/$ENV_NODE_ID" -o "$env_detail_file"
python3 - "$env_detail_file" "$ENV_HOSTNAME" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
node = json.load(handle)
expected_name = sys.argv[2]
assert node.get("name") == expected_name, node
meta = node.get("meta_data", {})
assert meta.get("env") == "prod", meta
assert meta.get("user") == "ml", meta
assert meta.get("instance") == "node-3", meta
PY
echo "[INFO] Env-variable agent reports expected metadata"

View File

@ -6,10 +6,20 @@ TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_ROOT="$TEST_ROOT/tmp"
API_BASE="http://localhost:32300/api/v1/master"
NODE_ID="$(cat "$TMP_ROOT/node_id")"
ENV_NODE_ID_FILE="$TMP_ROOT/node_id_host_abc"
if [[ ! -f "$ENV_NODE_ID_FILE" ]]; then
echo "[ERROR] Env agent node id file missing at $ENV_NODE_ID_FILE" >&2
exit 1
fi
ENV_NODE_ID="$(cat "$ENV_NODE_ID_FILE")"
AGENT_HOSTNAME="dev-e2euser-e2einst-pod-0"
ENV_AGENT_HOSTNAME="host_abc"
NETWORK_NAME="tests_default"
NEW_AGENT_IP="172.28.0.200"
NEW_ENV_AGENT_IP="172.28.0.210"
ENTRYPOINT_SCRIPT="$SCRIPT_DIR/agent_entrypoint.sh"
VERIFY_SCRIPT="$TEST_ROOT/../scripts/agent_deployment_verify.sh"
ENV_FILE="$TEST_ROOT/.env"
# 中文提示:重启场景也需要同样的入口脚本,确保 DNS 注册逻辑一致
@ -18,6 +28,11 @@ if [[ ! -f "$ENTRYPOINT_SCRIPT" ]]; then
exit 1
fi
if [[ ! -f "$VERIFY_SCRIPT" ]]; then
echo "[ERROR] agent verification script missing at $VERIFY_SCRIPT" >&2
exit 1
fi
if [[ ! -f "$TMP_ROOT/agent_binary_path" ]]; then
echo "[ERROR] Agent binary path missing; rerun bootstrap" >&2
exit 1
@ -74,15 +89,37 @@ if [[ "$prev_ip" != "$initial_ip" ]]; then
exit 1
fi
env_before_file="$TMP_ROOT/env_before_restart.json"
curl -sS "$API_BASE/nodes/$ENV_NODE_ID" -o "$env_before_file"
env_prev_last_updated=$(python3 - "$env_before_file" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
node = json.load(handle)
print(node.get("last_updated", ""))
PY
)
env_prev_ip=$(python3 - "$env_before_file" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
node = json.load(handle)
print(node["meta_data"].get("ip", ""))
PY
)
pushd "$TEST_ROOT" >/dev/null
compose rm -sf agent
compose rm -sf agent_env
popd >/dev/null
docker container rm -f argus-agent-e2e >/dev/null 2>&1 || true
docker container rm -f argus-agent-env-e2e >/dev/null 2>&1 || true
AGENT_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME"
HEALTH_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME/health"
ENV_AGENT_DIR="$TEST_ROOT/private/argus/agent/$ENV_AGENT_HOSTNAME"
ENV_HEALTH_DIR="$TEST_ROOT/private/argus/agent/$ENV_AGENT_HOSTNAME/health"
# 先以 sleep 方式启动容器,确保我们掌握注册时的网络状态
if ! docker run -d \
--name argus-agent-e2e \
@ -94,6 +131,7 @@ if ! docker run -d \
-v "$TEST_ROOT/private/argus/etc:/private/argus/etc" \
-v "$AGENT_BINARY:/usr/local/bin/argus-agent:ro" \
-v "$ENTRYPOINT_SCRIPT:/usr/local/bin/agent-entrypoint.sh:ro" \
-v "$VERIFY_SCRIPT:/usr/local/bin/agent_deployment_verify.sh:ro" \
-e MASTER_ENDPOINT=http://master.argus.com:3000 \
-e REPORT_INTERVAL_SECONDS=2 \
-e ARGUS_BUILD_UID="$AGENT_UID" \
@ -141,3 +179,76 @@ if [[ "$success" != true ]]; then
fi
echo "[INFO] Agent restart produced successful re-registration with IP change"
# ---- Restart env-driven agent without metadata environment variables ----
# The container below is started WITHOUT AGENT_ENV / AGENT_USER /
# AGENT_INSTANCE. The check that follows expects the master to still report
# env=prod, user=ml, instance=node-3 for the same node id.

# The agent data directory must already exist; it is bind-mounted into the
# restarted container.
if [[ ! -d "$ENV_AGENT_DIR" ]]; then
echo "[ERROR] Env agent data dir missing at $ENV_AGENT_DIR" >&2
exit 1
fi
# Create the health directory if it is missing so the bind mount has a source.
if [[ ! -d "$ENV_HEALTH_DIR" ]]; then
mkdir -p "$ENV_HEALTH_DIR"
fi
# Start the replacement container on the test network with a new fixed IP
# ($NEW_ENV_AGENT_IP) and only the endpoint, interval and UID/GID variables.
if ! docker run -d \
--name argus-agent-env-e2e \
--hostname "$ENV_AGENT_HOSTNAME" \
--network "$NETWORK_NAME" \
--ip "$NEW_ENV_AGENT_IP" \
-v "$ENV_AGENT_DIR:/private/argus/agent/$ENV_AGENT_HOSTNAME" \
-v "$ENV_HEALTH_DIR:/private/argus/agent/$ENV_AGENT_HOSTNAME/health" \
-v "$TEST_ROOT/private/argus/etc:/private/argus/etc" \
-v "$AGENT_BINARY:/usr/local/bin/argus-agent:ro" \
-v "$ENTRYPOINT_SCRIPT:/usr/local/bin/agent-entrypoint.sh:ro" \
-v "$VERIFY_SCRIPT:/usr/local/bin/agent_deployment_verify.sh:ro" \
-e MASTER_ENDPOINT=http://master.argus.com:3000 \
-e REPORT_INTERVAL_SECONDS=2 \
-e ARGUS_BUILD_UID="$AGENT_UID" \
-e ARGUS_BUILD_GID="$AGENT_GID" \
--entrypoint /usr/local/bin/agent-entrypoint.sh \
ubuntu:22.04 >/dev/null; then
echo "[ERROR] Failed to start env-driven agent container without metadata env" >&2
exit 1
fi
# Poll the master up to 20 times, 3 seconds apart. An attempt succeeds when
# the node detail shows: the same node id, the new IP (different from the
# pre-restart one), a changed last_updated, and the original
# env/user/instance values. Any failed condition makes the Python check exit
# non-zero, which simply triggers the next attempt.
env_success=false
env_detail_file="$TMP_ROOT/env_post_restart.json"
for _ in {1..20}; do
sleep 3
if ! curl -sS "$API_BASE/nodes/$ENV_NODE_ID" -o "$env_detail_file"; then
continue
fi
if python3 - "$env_detail_file" "$env_prev_last_updated" "$ENV_NODE_ID" "$env_prev_ip" "$NEW_ENV_AGENT_IP" <<'PY'
import json, sys
with open(sys.argv[1]) as handle:
node = json.load(handle)
prev_last_updated = sys.argv[2]
expected_id = sys.argv[3]
old_ip = sys.argv[4]
expected_ip = sys.argv[5]
last_updated = node.get("last_updated")
current_ip = node["meta_data"].get("ip")
meta = node.get("meta_data", {})
assert node["id"] == expected_id
if current_ip != expected_ip:
raise SystemExit(1)
if current_ip == old_ip:
raise SystemExit(1)
if not last_updated or last_updated == prev_last_updated:
raise SystemExit(1)
if meta.get("env") != "prod" or meta.get("user") != "ml" or meta.get("instance") != "node-3":
raise SystemExit(1)
PY
then
env_success=true
break
fi
done
# Fail the scenario if no attempt satisfied all conditions.
if [[ "$env_success" != true ]]; then
echo "[ERROR] Env-driven agent did not reuse persisted metadata after restart" >&2
exit 1
fi
echo "[INFO] Env-driven agent restart succeeded with persisted metadata"

View File

@ -13,7 +13,7 @@ compose() {
fi
}
docker container rm -f argus-agent-e2e >/dev/null 2>&1 || true
docker container rm -f argus-agent-e2e argus-agent-env-e2e >/dev/null 2>&1 || true
pushd "$TEST_ROOT" >/dev/null
compose down --remove-orphans

View File

@ -1,26 +0,0 @@
#!/usr/bin/env bash
# Run agent_deployment_verify.sh inside the argus-agent-e2e container.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
VERIFY_SCRIPT="$(cd "$TEST_ROOT/.." && pwd)/scripts/agent_deployment_verify.sh"

if ! docker ps --format '{{.Names}}' | grep -q '^argus-agent-e2e$'; then
  echo "[WARN] agent container not running; skip verification"
  exit 0
fi

if docker exec -i argus-agent-e2e bash -lc 'command -v curl >/dev/null && command -v jq >/dev/null'; then
  echo "[INFO] curl/jq already installed in agent container"
else
  echo "[INFO] Installing curl/jq in agent container"
  # Best effort: a failed install is reported by the verifier itself.
  docker exec -i argus-agent-e2e bash -lc 'apt-get update >/dev/null 2>&1 && apt-get install -y curl jq >/dev/null 2>&1' || true
fi

if docker exec -i argus-agent-e2e bash -lc 'command -v /usr/local/bin/agent_deployment_verify.sh >/dev/null'; then
  docker exec -i argus-agent-e2e /usr/local/bin/agent_deployment_verify.sh
elif [[ -f "$VERIFY_SCRIPT" ]]; then
  # $VERIFY_SCRIPT is a path on the host and does not exist inside the
  # container, so stream the script to a shell in the container over stdin
  # instead of passing the path to `docker exec`.
  docker exec -i argus-agent-e2e bash -s < "$VERIFY_SCRIPT"
else
  echo "[WARN] agent_deployment_verify.sh not found"
fi

View File

@ -0,0 +1,151 @@
from __future__ import annotations
import os
import unittest
from contextlib import contextmanager
from unittest.mock import patch
from app.config import AgentConfig, load_config
@contextmanager
def temp_env(**overrides: str | None):
    """Temporarily apply environment-variable overrides.

    Each keyword names a variable; a string value sets it and ``None``
    removes it. On exit every touched variable is put back to the value it
    had before (or removed if it did not exist).
    """
    saved: dict[str, str | None] = {}
    try:
        for name, new_value in overrides.items():
            # Remember the previous value before touching the variable.
            saved[name] = os.environ.get(name)
            if new_value is not None:
                os.environ[name] = new_value
            else:
                os.environ.pop(name, None)
        yield
    finally:
        for name, previous in saved.items():
            if previous is not None:
                os.environ[name] = previous
            else:
                os.environ.pop(name, None)
class LoadConfigMetadataTests(unittest.TestCase):
    """Check how ``load_config`` resolves environment/user/instance.

    ``Path.mkdir`` and ``_load_metadata_from_state`` are stubbed for every
    test so no case creates directories or reads a real ``node.json``; the
    persisted-state lookup returns ``None`` unless a test overrides it.
    """

    def setUp(self):
        mkdir_patcher = patch("app.config.Path.mkdir")
        self.mock_mkdir = mkdir_patcher.start()
        self.addCleanup(mkdir_patcher.stop)

        state_patcher = patch("app.config._load_metadata_from_state", return_value=None)
        self.mock_state = state_patcher.start()
        self.addCleanup(state_patcher.stop)

    def _load(self, hostname, **metadata_env):
        """Run ``load_config`` with the given hostname and AGENT_* overrides."""
        with temp_env(
            MASTER_ENDPOINT="http://master.local",
            AGENT_HOSTNAME=hostname,
            **metadata_env,
        ):
            return load_config()

    def test_metadata_from_environment_variables(self):
        config = self._load(
            "dev-user-one-pod",
            AGENT_ENV="prod",
            AGENT_USER="ops",
            AGENT_INSTANCE="node-1",
        )

        self.assertEqual(config.environment, "prod")
        self.assertEqual(config.user, "ops")
        self.assertEqual(config.instance, "node-1")
        # A complete set of variables wins without consulting persisted state.
        self.mock_state.assert_not_called()
        self.mock_mkdir.assert_called()

    def test_metadata_falls_back_to_hostname(self):
        config = self._load(
            "qa-team-abc-pod-2",
            AGENT_ENV=None,
            AGENT_USER=None,
            AGENT_INSTANCE=None,
        )

        self.assertEqual(config.environment, "qa")
        self.assertEqual(config.user, "team")
        self.assertEqual(config.instance, "abc")
        self.mock_mkdir.assert_called()

    def test_metadata_from_node_state(self):
        self.mock_state.return_value = ("prod", "ops", "node-1")

        config = self._load(
            "host_abc",
            AGENT_ENV=None,
            AGENT_USER=None,
            AGENT_INSTANCE=None,
        )

        self.assertEqual(config.environment, "prod")
        self.assertEqual(config.user, "ops")
        self.assertEqual(config.instance, "node-1")
        self.mock_state.assert_called_once()
        self.mock_mkdir.assert_called()

    def test_partial_environment_variables_fallback(self):
        # Only AGENT_ENV is set, so the variables are ignored as a group and
        # the hostname decides (persisted state is stubbed to None).
        config = self._load(
            "stage-ml-001-node",
            AGENT_ENV="prod",
            AGENT_USER=None,
            AGENT_INSTANCE=None,
        )

        self.assertEqual(config.environment, "stage")
        self.assertEqual(config.user, "ml")
        self.assertEqual(config.instance, "001")
        self.mock_mkdir.assert_called()

    def test_invalid_hostname_raises_error(self):
        with self.assertRaises(ValueError):
            self._load(
                "invalidhostname",
                AGENT_ENV=None,
                AGENT_USER=None,
                AGENT_INSTANCE=None,
            )

        self.mock_mkdir.assert_not_called()
class CollectMetadataTests(unittest.TestCase):
    """Check that ``collect_metadata`` reports env/user/instance from the config."""

    @patch("app.collector._detect_ip_address", return_value="127.0.0.1")
    @patch("app.collector._detect_gpu_count", return_value=0)
    @patch("app.collector._detect_memory_bytes", return_value=1024)
    @patch("app.collector._detect_cpu_count", return_value=8)
    def test_collect_metadata_uses_config_fields(
        self,
        mock_cpu,
        mock_memory,
        mock_gpu,
        mock_ip,
    ):
        from app.collector import collect_metadata

        # The hostname deliberately encodes different values (dev/user/001)
        # from the explicit config fields (prod/ops/node-1).
        config_fields = {
            "hostname": "dev-user-001-pod",
            "environment": "prod",
            "user": "ops",
            "instance": "node-1",
            "node_file": "/tmp/node.json",
            "version": "1.0.0",
            "master_endpoint": "http://master.local",
            "report_interval_seconds": 60,
            "health_dir": "/tmp/health",
        }
        metadata = collect_metadata(AgentConfig(**config_fields))

        expected = {
            "env": "prod",
            "user": "ops",
            "instance": "node-1",
            "hostname": "dev-user-001-pod",
            "ip": "127.0.0.1",
            "cpu_number": 8,
            "memory_in_bytes": 1024,
            "gpu_number": 0,
        }
        for key, value in expected.items():
            self.assertEqual(metadata[key], value)
if __name__ == "__main__":
unittest.main()

View File

@ -17,6 +17,9 @@ log_message() {
log_message "DNS监控脚本启动"
log_message "删除DNS备份文件如果存在"
rm -f $DNS_BACKUP
while true; do
if [ -f "$DNS_CONF" ]; then
if [ -f "$DNS_BACKUP" ]; then

View File

@ -2,7 +2,7 @@
set -euo pipefail
ES_HOST="${ELASTICSEARCH_HOSTS:-http://es:9200}"
KB_HOST="http://localhost:5601"
KB_HOST="${KB_HOST:-http://127.0.0.1:5601}"
echo "[INFO] Starting Kibana post-start configuration..."
@ -83,50 +83,37 @@ fix_replicas_idempotent() {
}
# 幂等创建数据视图
# Create a Kibana data view unless one with the same title is already listed.
#   $1 - data view name
#   $2 - index pattern used as the data view title
# Listing or creation failures are logged as warnings and never abort.
create_or_ensure_data_view() {
  local view_name="$1" view_title="$2"
  local existing payload

  existing=$(curl -fsS "$KB_HOST/api/data_views" -H 'kbn-xsrf: true' 2>/dev/null || echo "")

  if [ -z "$existing" ]; then
    echo "[WARN] Failed to list data views, skipping creation check for $title"
    return
  fi

  # Literal substring match on the title field (the quoted pattern part is not globbed).
  case "$existing" in
    *"\"title\":\"$view_title\""*)
      echo "[INFO] Data view $view_title already exists, skipping"
      return
      ;;
  esac

  echo "[INFO] Creating data view for $view_title indices (allowNoIndex)"
  payload="{\"data_view\":{\"name\":\"$view_name\",\"title\":\"$view_title\",\"timeFieldName\":\"@timestamp\",\"allowNoIndex\":true}}"
  if curl -fsS -X POST "$KB_HOST/api/data_views/data_view?allowNoIndex=true" \
    -H 'kbn-xsrf: true' \
    -H 'Content-Type: application/json' \
    -d "$payload" \
    >/dev/null; then
    echo "[OK] Created $view_name data view"
  else
    echo "[WARN] Failed to create $view_name data view"
  fi
}
# Ensure the train-* and infer-* data views exist.
# Creation is delegated to create_or_ensure_data_view, which checks for an
# existing view and creates with allowNoIndex, so the views are created even
# before any matching index exists. The earlier index-gated creation is
# dropped because it duplicated that work.
create_data_views_idempotent() {
  echo "[INFO] Checking and creating data views..."
  create_or_ensure_data_view "train" "train-*"
  create_or_ensure_data_view "infer" "infer-*"
}
# 主逻辑

View File

@ -115,20 +115,32 @@ show_step "Health" "Check service health"
echo "[INFO] Checking service health..."
# 检查 Elasticsearch 健康状态
health_check_ok=1
es_health=$(curl -s "http://localhost:9200/_cluster/health" | grep -o '"status":"[^"]*"' | cut -d'"' -f4)
if [ "$es_health" = "green" ] || [ "$es_health" = "yellow" ]; then
echo "✅ Elasticsearch health: $es_health"
else
echo "❌ Elasticsearch health: $es_health"
health_check_ok=0
fi
# 检查 Kibana 状态
if curl -fs "http://localhost:5601/api/status" >/dev/null 2>&1; then
kb_status="available"
echo "✅ Kibana status: $kb_status"
data_views_json=$(curl -fs "http://localhost:5601/api/data_views" -H 'kbn-xsrf: true' 2>/dev/null || true)
if echo "$data_views_json" | grep -F '"title":"train-*"' >/dev/null 2>&1 && \
echo "$data_views_json" | grep -F '"title":"infer-*"' >/dev/null 2>&1; then
echo "✅ Kibana data views: train-* and infer-* present"
else
echo "❌ Kibana data views missing: train-* or infer-*"
health_check_ok=0
fi
else
kb_status="unavailable"
echo "⚠️ Kibana status: $kb_status"
health_check_ok=0
fi
# 检查 Fluent-Bit 指标
@ -139,6 +151,13 @@ if [ "$fb_host01_uptime" -gt 0 ] && [ "$fb_host02_uptime" -gt 0 ]; then
echo "✅ Fluent-Bit services: host01 uptime=${fb_host01_uptime}s, host02 uptime=${fb_host02_uptime}s"
else
echo "⚠️ Fluent-Bit services: host01 uptime=${fb_host01_uptime}s, host02 uptime=${fb_host02_uptime}s"
health_check_ok=0
fi
if [ "$health_check_ok" -eq 1 ]; then
true
else
false
fi
verify_step "Service health check"

View File

@ -3,12 +3,13 @@ set -euo pipefail
# Print command-line help to stderr.
usage() {
# The heredoc delimiter is unquoted on purpose: a quoted delimiter ('USAGE')
# disables expansion and would print a literal "$0" instead of the script path.
cat >&2 <<USAGE
Usage: $0 [--intranet] [--offline] [--tag <image_tag>] [--no-cache]
Options:
--intranet 使用指定的 PyPI 镜像源(默认清华镜像)。
--offline 完全离线构建,依赖 offline_wheels/ 目录中的离线依赖包。
--tag <image_tag> 自定义镜像标签,默认 argus-master:latest。
--no-cache 不使用 Docker 构建缓存。
USAGE
}
@ -19,6 +20,7 @@ IMAGE_TAG="${IMAGE_TAG:-argus-master:latest}"
DOCKERFILE="src/master/Dockerfile"
BUILD_ARGS=()
OFFLINE_MODE=0
NO_CACHE=0
source "$PROJECT_ROOT/scripts/common/build_user.sh"
load_build_user
@ -45,6 +47,11 @@ while [[ "$#" -gt 0 ]]; do
IMAGE_TAG="$2"
shift 2
;;
--no-cache)
NO_CACHE=1
BUILD_ARGS+=("--no-cache")
shift
;;
-h|--help)
usage
exit 0

View File

@ -4,6 +4,15 @@
set -e
# Single-instance guard: if the PID recorded in the PID file still answers
# signal 0, another run is in progress and this one exits quietly.
PIDFILE="/var/run/check_health.pid"
if [ -f "$PIDFILE" ]; then
    if kill -0 $(cat "$PIDFILE") 2>/dev/null; then
        echo "健康检查脚本已在运行中,跳过本次执行" >&2
        exit 0
    fi
fi
# Record our own PID and remove the file again when the script exits.
echo $$ > "$PIDFILE"
trap "rm -f $PIDFILE" EXIT
# 获取脚本所在目录
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
HEALTH_LOG_FILE="$SCRIPT_DIR/.health_log"

View File

@ -200,22 +200,22 @@ parse_version_info() {
VERSION=$(grep '"version"' "$VERSION_FILE_PATH" | sed 's/.*"version": *"\([^"]*\)".*/\1/')
BUILD_TIME=$(grep '"build_time"' "$VERSION_FILE_PATH" | sed 's/.*"build_time": *"\([^"]*\)".*/\1/')
# 解析 artifact_list
grep -A 100 '"artifact_list"' "$VERSION_FILE_PATH" | grep -E '^\s*"[^"]+":\s*"[^"]+"' | while read line; do
# 解析 artifact_list(跳过字段名本身)
grep -A 100 '"artifact_list"' "$VERSION_FILE_PATH" | grep -v '"artifact_list"' | grep -E '^\s*"[^"]+":\s*"[^"]+"' | while read line; do
component=$(echo "$line" | sed 's/.*"\([^"]*\)":\s*"[^"]*".*/\1/')
version=$(echo "$line" | sed 's/.*"[^"]*":\s*"\([^"]*\)".*/\1/')
echo "$component:$version" >> "$TEMP_DIR/components.txt"
done
# 解析 checksums
grep -A 100 '"checksums"' "$VERSION_FILE_PATH" | grep -E '^\s*"[^"]+":\s*"[^"]+"' | while read line; do
# 解析 checksums(跳过字段名本身)
grep -A 100 '"checksums"' "$VERSION_FILE_PATH" | grep -v '"checksums"' | grep -E '^\s*"[^"]+":\s*"[^"]+"' | while read line; do
component=$(echo "$line" | sed 's/.*"\([^"]*\)":\s*"[^"]*".*/\1/')
checksum=$(echo "$line" | sed 's/.*"[^"]*":\s*"\([^"]*\)".*/\1/')
echo "$component:$checksum" >> "$TEMP_DIR/checksums.txt"
done
# 解析 install_order
grep -A 100 '"install_order"' "$VERSION_FILE_PATH" | grep -E '^\s*"[^"]+"' | while read line; do
# 解析 install_order(跳过字段名本身,只取数组元素)
grep -A 100 '"install_order"' "$VERSION_FILE_PATH" | grep -v '"install_order"' | grep -E '^\s*"[^"]+"' | while read line; do
component=$(echo "$line" | sed 's/.*"\([^"]*\)".*/\1/')
echo "$component" >> "$TEMP_DIR/install_order.txt"
done
@ -317,85 +317,152 @@ create_install_dirs() {
log_success "安装目录创建完成: $INSTALL_DIR"
}
# Map the host OS release to the name of the matching deps/ sub-directory.
#
# Prints "ubuntu20" for VERSION_ID 20.04 and "ubuntu22" for 22.04 on stdout.
# Any other VERSION_ID falls back to "ubuntu22" after a warning.
# Returns 1 and prints nothing on stdout when /etc/os-release is missing.
get_system_version() {
    if [[ ! -f /etc/os-release ]]; then
        # Diagnostics go to stderr explicitly so they can never become part
        # of the value captured by $(get_system_version).
        log_error "无法检测操作系统版本" >&2
        return 1
    fi

    # Read VERSION_ID inside a subshell: /etc/os-release also assigns
    # VERSION, NAME, ID, ... and sourcing it in the caller's shell would
    # overwrite the installer's own VERSION set by parse_version_info.
    local version_id
    version_id=$(. /etc/os-release && printf '%s' "$VERSION_ID")

    # Only the release number is inspected; the distribution ID is not checked.
    case "$version_id" in
        "20.04")
            echo "ubuntu20"
            ;;
        "22.04")
            echo "ubuntu22"
            ;;
        *)
            log_warning "未识别的Ubuntu版本: ${version_id}尝试使用ubuntu22" >&2
            echo "ubuntu22"
            ;;
    esac
}
# Install the offline system dependency bundles shipped next to this script.
#
# Looks for *.tar.gz archives in <script dir>/deps/<system version>, falling
# back to <script dir>/deps when no version-specific directory exists. Each
# archive is unpacked into ${TEMP_DIR:-/tmp}/deps and the .deb files at its
# top level are installed one by one: packages that are already installed are
# skipped, and a failed `dpkg -i` is followed by `apt-get install -f -y`.
# Afterwards start_cron_service is called.
#
# Returns 0 when there is nothing to install or everything succeeded.
# Exits the script with status 1 when at least one package is still missing.
install_system_deps() {
    log_info "检查系统依赖包..."

    local script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
    local deps_dir="$script_dir/deps"

    # No deps directory at all: nothing to install.
    if [[ ! -d "$deps_dir" ]]; then
        log_info "deps 目录不存在,跳过系统依赖包安装"
        return 0
    fi

    # Pick the dependency directory that matches the host release.
    local system_version=$(get_system_version)
    local version_deps_dir="$deps_dir/$system_version"
    log_info "检测到系统版本: $system_version"

    if [[ ! -d "$version_deps_dir" ]]; then
        log_warning "未找到 $system_version 版本的依赖目录: $version_deps_dir"
        # Fall back to the legacy layout: archives directly under deps/.
        local deps_count=$(find "$deps_dir" -name "*.tar.gz" | wc -l)
        if [[ $deps_count -eq 0 ]]; then
            log_info "deps 目录中没有 tar.gz 文件,跳过系统依赖包安装"
            return 0
        fi
        version_deps_dir="$deps_dir"
    else
        local deps_count=$(find "$version_deps_dir" -name "*.tar.gz" | wc -l)
        if [[ $deps_count -eq 0 ]]; then
            log_info "$system_version 版本目录中没有 tar.gz 文件,跳过系统依赖包安装"
            return 0
        fi
    fi

    log_info "找到 $system_version 版本的依赖包,开始安装..."

    # Scratch directory for the unpacked archives.
    local deps_temp_dir="${TEMP_DIR:-/tmp}/deps"
    mkdir -p "$deps_temp_dir"

    # NOTE(review): CORE_DEPS is not referenced anywhere in this function; it
    # is kept because bash locals are visible to callees - confirm and remove.
    local CORE_DEPS=(jq cron curl)
    local FAILED_DEPS=()

    # Collect the archive list first. Piping `find` into `while read` would
    # run the loop body in a subshell, so every FAILED_DEPS+=(...) below would
    # be lost and the summary at the end would always report success.
    local tar_files=()
    local tar_file
    while IFS= read -r -d '' tar_file; do
        tar_files+=("$tar_file")
    done < <(find "$version_deps_dir" -name "*.tar.gz" -print0)

    for tar_file in "${tar_files[@]}"; do
        local tar_basename=$(basename "$tar_file")
        local extract_name="${tar_basename%.tar.gz}"
        log_info "处理依赖包: $tar_basename"

        # Unpack into a per-archive directory.
        local extract_dir="$deps_temp_dir/$extract_name"
        mkdir -p "$extract_dir"
        if tar -xzf "$tar_file" -C "$extract_dir" 2>/dev/null; then
            log_success " $tar_basename 解压完成"
        else
            log_error " $tar_basename 解压失败"
            continue
        fi

        # Gather the .deb files by path instead of cd-ing into the directory:
        # the loop now runs in the main shell, so a `cd` here would change the
        # working directory for the rest of the installer. The -e test drops
        # the literal "*.deb" that an unmatched glob leaves behind.
        local deb_files=()
        local deb
        for deb in "$extract_dir"/*.deb; do
            if [[ -e "$deb" ]]; then
                deb_files+=("$deb")
            fi
        done

        if [[ ${#deb_files[@]} -gt 0 ]]; then
            log_info " 找到 ${#deb_files[@]} 个 deb 包,开始安装..."
            for deb in "${deb_files[@]}"; do
                local pkg_name
                pkg_name=$(dpkg-deb -f "$deb" Package 2>/dev/null) || pkg_name=""
                # Unreadable control data: fall back to the file name so the
                # log lines and the failure list still identify the package.
                if [[ -z "$pkg_name" ]]; then
                    pkg_name=$(basename "$deb" .deb)
                fi

                # Already installed: skip.
                if dpkg -s "$pkg_name" &>/dev/null; then
                    log_success " $pkg_name 已安装,跳过"
                    continue
                fi

                # Try a plain install first, then let apt repair dependencies.
                log_info " 安装 $pkg_name..."
                if DEBIAN_FRONTEND=noninteractive dpkg -i "$deb" &>/dev/null; then
                    log_success " $pkg_name 安装成功"
                else
                    log_warning " $pkg_name 安装失败,尝试修复依赖..."
                    if DEBIAN_FRONTEND=noninteractive apt-get install -f -y &>/dev/null; then
                        # apt can succeed by removing the broken package, so
                        # verify that it really ended up installed.
                        if dpkg -s "$pkg_name" &>/dev/null; then
                            log_success " $pkg_name 修复安装成功"
                        else
                            log_error " $pkg_name 仍未安装成功"
                            FAILED_DEPS+=("$pkg_name")
                        fi
                    else
                        log_error " $pkg_name 自动修复失败"
                        FAILED_DEPS+=("$pkg_name")
                    fi
                fi
            done
        else
            log_info " $tar_basename 中没有找到deb包跳过"
        fi
    done

    # Check and start the cron service.
    start_cron_service

    # Summarize the result; any missing dependency aborts the installation.
    if [[ ${#FAILED_DEPS[@]} -gt 0 ]]; then
        log_error "以下系统依赖未能成功安装,安装终止,请手动安装后重试:"
        local f
        for f in "${FAILED_DEPS[@]}"; do
            echo " - $f"
        done
        exit 1
    else
        log_success "系统依赖包安装完成,全部就绪"
    fi
}
# 启动 cron 服务
@ -637,6 +704,18 @@ EOF
log_success "安装记录已创建: $install_record_file"
}
# Report whether a crontab dump already contains a given task.
#
#   $1  literal text identifying the task (e.g. a script file name)
#   $2  path of the file holding the crontab contents
#
# Returns 0 when some line of $2 contains $1, and 1 otherwise (including
# when $2 cannot be read).
check_cron_task_exists() {
    local task_pattern="$1"
    local temp_cron="$2"

    # -F: match the text literally, so the "." in names such as
    #     "check_health.sh" does not act as a regex wildcard.
    # --: a pattern starting with "-" is not mistaken for a grep option.
    if grep -qF -- "$task_pattern" "$temp_cron"; then
        return 0  # task already present
    fi
    return 1  # task not present
}
# 设置健康检查定时任务
setup_health_check_cron() {
log_info "设置健康检查定时任务..."
@ -661,7 +740,7 @@ setup_health_check_cron() {
crontab -l 2>/dev/null > "$temp_cron" || touch "$temp_cron"
# 检查并删除旧的健康检查任务
if grep -q "check_health.sh" "$temp_cron"; then
if check_cron_task_exists "check_health.sh" "$temp_cron"; then
log_info "发现旧的健康检查定时任务,正在更新..."
# 删除所有包含check_health.sh的行
grep -v "check_health.sh" "$temp_cron" > "$temp_cron.new"
@ -716,7 +795,7 @@ setup_dns_sync_cron() {
crontab -l 2>/dev/null > "$temp_cron" || touch "$temp_cron"
# 检查并删除旧的 DNS 同步任务
if grep -q "sync_dns.sh" "$temp_cron"; then
if check_cron_task_exists "sync_dns.sh" "$temp_cron"; then
log_info "发现旧的 DNS 同步定时任务,正在更新..."
# 删除所有包含sync_dns.sh的行
grep -v "sync_dns.sh" "$temp_cron" > "$temp_cron.new"
@ -724,16 +803,15 @@ setup_dns_sync_cron() {
log_info "旧的 DNS 同步定时任务已删除"
fi
# 添加新的定时任务(每30秒执行一次)
# 添加新的定时任务(每1分钟执行一次)
# 直接使用版本目录中的 DNS 同步脚本
echo "# Argus-Metrics DNS 同步定时任务" >> "$temp_cron"
echo "* * * * * $sync_dns_script >> $INSTALL_DIR/.dns_sync.log 2>&1" >> "$temp_cron"
echo "* * * * * sleep 30; $sync_dns_script >> $INSTALL_DIR/.dns_sync.log 2>&1" >> "$temp_cron"
# 安装新的crontab
if crontab "$temp_cron"; then
log_success "DNS 同步定时任务设置成功"
log_info " 执行频率: 每30秒"
log_info " 执行频率: 每1分钟"
log_info " 日志文件: $INSTALL_DIR/.dns_sync.log"
log_info " 查看定时任务: crontab -l"
log_info " 删除定时任务: crontab -e"
@ -771,7 +849,7 @@ setup_version_check_cron() {
crontab -l > "$temp_cron" 2>/dev/null || touch "$temp_cron"
# 检查是否已存在版本校验定时任务
if grep -q "check_version.sh" "$temp_cron"; then
if check_cron_task_exists "check_version.sh" "$temp_cron"; then
log_info "发现旧的版本校验定时任务,正在更新..."
# 删除所有包含check_version.sh的行
grep -v "check_version.sh" "$temp_cron" > "$temp_cron.new"
@ -824,7 +902,7 @@ setup_restart_cron() {
crontab -l > "$temp_cron" 2>/dev/null || touch "$temp_cron"
# 检查是否已存在自动重启定时任务
if grep -q "restart_unhealthy.sh" "$temp_cron"; then
if check_cron_task_exists "restart_unhealthy.sh" "$temp_cron"; then
log_info "发现旧的自动重启定时任务,正在更新..."
# 删除所有包含restart_unhealthy.sh的行
grep -v "restart_unhealthy.sh" "$temp_cron" > "$temp_cron.new"
@ -885,9 +963,9 @@ main() {
check_system
find_version_file
create_install_dirs
parse_version_info
install_system_deps
parse_version_info
verify_checksums
install_system_deps
install_components
copy_config_files
create_install_record
@ -895,6 +973,20 @@ main() {
setup_dns_sync_cron
setup_version_check_cron
setup_restart_cron
# 注释掉立即执行健康检查避免与cron任务重复执行
# log_info "立即执行一次健康检查..."
# local check_health_script="$INSTALL_DIR/check_health.sh"
# if [[ -f "$check_health_script" ]]; then
# if "$check_health_script" >> "$INSTALL_DIR/.health_check.log" 2>&1; then
# log_success "健康检查执行完成"
# else
# log_warning "健康检查执行失败,请检查日志: $INSTALL_DIR/.health_check.log"
# fi
# else
# log_warning "健康检查脚本不存在: $check_health_script"
# fi
show_install_info
}

View File

@ -29,26 +29,68 @@ log_error() {
show_help() {
echo "Argus-Metric Artifact 发布脚本"
echo
echo "用法: $0 <版本号>"
echo "用法: $0 <版本号> [选项]"
echo
echo "参数:"
echo " <版本号> 要发布的版本号,对应 artifact 目录中的版本"
echo " <版本号> 要发布的版本号,对应 artifact 目录中的版本"
echo
echo "选项:"
echo " --output-dir <路径> 指定输出目录 (默认: /private/argus/ftp/share/)"
echo " --owner <uid:gid> 指定文件所有者 (默认: 2133:2015)"
echo " -h, --help 显示此帮助信息"
echo
echo "示例:"
echo " $0 1.20.0 # 发布 1.20.0 版本"
echo " $0 1.20.0 # 使用默认配置发布"
echo " $0 1.20.0 --output-dir /tmp/publish # 指定输出目录"
echo " $0 1.20.0 --owner 1000:1000 # 指定文件所有者"
echo " $0 1.20.0 --output-dir /srv/ftp --owner root:root # 同时指定两者"
echo
}
# Default configuration (overridable with --output-dir / --owner).
DEFAULT_PUBLISH_DIR="/private/argus/ftp/share/"
DEFAULT_OWNER="2133:2015"

# Parse the command line: one positional <version> plus optional flags.
VERSION=""
PUBLISH_DIR="$DEFAULT_PUBLISH_DIR"
OWNER="$DEFAULT_OWNER"

while [[ $# -gt 0 ]]; do
    case $1 in
        -h|--help)
            show_help
            exit 0
            ;;
        --output-dir|--owner)
            # Both options need a value. Without this check, `shift 2` with
            # a single remaining argument fails without shifting anything
            # and the loop never terminates.
            if [[ $# -lt 2 || -z "$2" ]]; then
                log_error "选项 $1 缺少参数值"
                show_help
                exit 1
            fi
            if [[ "$1" == "--output-dir" ]]; then
                PUBLISH_DIR="$2"
            else
                OWNER="$2"
            fi
            shift 2
            ;;
        -*)
            # Reject unknown options instead of taking them as the version.
            log_error "未知参数: $1"
            show_help
            exit 1
            ;;
        *)
            # The first bare word is the version; any further one is an error.
            if [[ -z "$VERSION" ]]; then
                VERSION="$1"
                shift
            else
                log_error "未知参数: $1"
                show_help
                exit 1
            fi
            ;;
    esac
done

# The version number is mandatory.
if [[ -z "$VERSION" ]]; then
    log_error "请提供版本号参数"
    show_help
    exit 1
fi

ARTIFACT_DIR="artifact/$VERSION"
# 检查版本目录是否存在
if [[ ! -d "$ARTIFACT_DIR" ]]; then
@ -57,10 +99,12 @@ if [[ ! -d "$ARTIFACT_DIR" ]]; then
fi
log_info "开始发布版本: $VERSION"
log_info "输出目录: $PUBLISH_DIR"
log_info "文件所有者: $OWNER"
# 确保发布目录存在
log_info "确保发布目录存在: $PUBLISH_DIR"
mkdir -p "$PUBLISH_DIR"
sudo mkdir -p "$PUBLISH_DIR"
# 创建临时目录用于打包
TEMP_PACKAGE_DIR="/tmp/argus-metric-package-$$"
@ -164,20 +208,26 @@ fi
TAR_NAME="argus-metric_$(echo $VERSION | tr '.' '_').tar.gz"
log_info "创建发布包: $TAR_NAME"
cd "$TEMP_PACKAGE_DIR"
tar -czf "$PUBLISH_DIR/$TAR_NAME" *
sudo tar -czf "$PUBLISH_DIR/$TAR_NAME" *
cd - > /dev/null
# 设置文件所有者
log_info "设置文件所有者为: $OWNER"
sudo chown "$OWNER" "$PUBLISH_DIR/$TAR_NAME"
# 清理临时目录
rm -rf "$TEMP_PACKAGE_DIR"
# 更新 LATEST_VERSION 文件
log_info "更新 LATEST_VERSION 文件..."
echo "$VERSION" > "$PUBLISH_DIR/LATEST_VERSION"
echo "$VERSION" | sudo tee "$PUBLISH_DIR/LATEST_VERSION" > /dev/null
sudo chown "$OWNER" "$PUBLISH_DIR/LATEST_VERSION"
# 复制 DNS 配置文件到发布目录根目录(直接从 config 目录复制)
if [[ -f "config/dns.conf" ]]; then
log_info "复制 DNS 配置文件到发布目录根目录..."
cp "config/dns.conf" "$PUBLISH_DIR/"
sudo cp "config/dns.conf" "$PUBLISH_DIR/"
sudo chown "$OWNER" "$PUBLISH_DIR/dns.conf"
log_success "DNS 配置文件复制完成: $PUBLISH_DIR/dns.conf"
else
log_warning "未找到 config/dns.conf 文件,跳过 DNS 配置文件复制"
@ -186,7 +236,8 @@ fi
# 复制 setup.sh 到发布目录
if [[ -f "scripts/setup.sh" ]]; then
log_info "复制 setup.sh 到发布目录..."
cp "scripts/setup.sh" "$PUBLISH_DIR/"
sudo cp "scripts/setup.sh" "$PUBLISH_DIR/"
sudo chown "$OWNER" "$PUBLISH_DIR/setup.sh"
fi
# 显示发布结果

View File

@ -2,6 +2,15 @@
# 此脚本会检查各组件的健康状态,并重启不健康的组件
# PID-file guard: skip this run when a previous invocation is still alive.
PIDFILE="/var/run/restart_unhealthy.pid"
if [ -f "$PIDFILE" ]; then
    # Read only the first word of the file so that a corrupt or multi-line
    # PID file can never expand into extra arguments for kill.
    # `|| true`: read returns non-zero on an empty file / missing newline,
    # which must not abort the script if errexit is enabled.
    read -r _running_pid _ < "$PIDFILE" || true
    if [ -n "$_running_pid" ] && kill -0 "$_running_pid" 2>/dev/null; then
        echo "自动重启脚本已在运行中,跳过本次执行" >&2
        exit 0
    fi
fi
echo $$ > "$PIDFILE"
# Single quotes: expand (and quote) the path when the trap fires, not here.
trap 'rm -f "$PIDFILE"' EXIT
# 获取脚本所在目录
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
INSTALL_RECORD_FILE="$SCRIPT_DIR/.install_record"

View File

@ -1,244 +1,143 @@
#!/bin/bash
# DNS 同步脚本
# 比较 FTP 根目录的 dns.conf 和本地的 dns.conf如果有变化则同步到 /etc/resolv.conf
set -e
# 颜色定义
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# 颜色
RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m'
# 日志函数 - 输出到 stderr 避免影响函数返回值
log_info() {
echo -e "${BLUE}[INFO]${NC} $1" >&2
}
# 日志函数
log_info() { echo -e "${BLUE}[INFO]${NC} $1" >&2; }
log_success() { echo -e "${GREEN}[SUCCESS]${NC} $1" >&2; }
log_warning() { echo -e "${YELLOW}[WARNING]${NC} $1" >&2; }
log_error() { echo -e "${RED}[ERROR]${NC} $1" >&2; }
log_success() {
echo -e "${GREEN}[SUCCESS]${NC} $1" >&2
}
log_warning() {
echo -e "${YELLOW}[WARNING]${NC} $1" >&2
}
log_error() {
echo -e "${RED}[ERROR]${NC} $1" >&2
}
# 获取脚本所在目录
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LOCAL_DNS_CONF="/opt/argus-metric/dns.conf"
REMOTE_DNS_CONF_URL=""
RESOLV_CONF="/etc/resolv.conf"
ALT_RESOLV_CONF="/run/resolv.conf"
LOG_FILE="/opt/argus-metric/.dns_sync.log"
REMOTE_DNS_CONF_URL=""
# 从环境变量或配置文件获取 FTP 服务器信息
# 获取 FTP 配置
get_ftp_config() {
# 优先从环境变量获取配置
log_info "获取 FTP 配置信息..."
# 如果环境变量中没有设置,则尝试从配置文件读取
if [[ -z "$FTP_SERVER" || -z "$FTP_USER" || -z "$FTP_PASSWORD" ]]; then
local config_file="$SCRIPT_DIR/config.env"
if [[ -f "$config_file" ]]; then
log_info "从配置文件读取 FTP 配置: $config_file"
source "$config_file"
fi
else
log_info "使用环境变量中的 FTP 配置"
[[ -f "$SCRIPT_DIR/config.env" ]] && source "$SCRIPT_DIR/config.env"
fi
# 设置默认值(如果环境变量和配置文件都没有设置)
FTP_SERVER="${FTP_SERVER:-localhost}"
FTP_USER="${FTP_USER:-ftpuser}"
FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}"
# 构建远程 DNS 配置文件 URL
REMOTE_DNS_CONF_URL="ftp://${FTP_USER}:${FTP_PASSWORD}@${FTP_SERVER}/dns.conf"
log_info "FTP 配置来源: ${FTP_CONFIG_SOURCE:-环境变量/配置文件}"
}
# 下载远程 DNS 配置文件
# 下载远程 dns.conf
download_remote_dns_conf() {
local temp_file="/tmp/dns.conf.remote.$$"
log_info "从 FTP 服务器下载 DNS 配置文件..."
log_info "远程地址: $REMOTE_DNS_CONF_URL"
log_info "FTP 服务器: $FTP_SERVER"
log_info "FTP 用户: $FTP_USER"
# 先测试 FTP 连接
local tmp="/tmp/dns.remote.$$"
log_info "测试 FTP 连接..."
if curl -u "${FTP_USER}:${FTP_PASSWORD}" -sfI "ftp://${FTP_SERVER}/" >/dev/null 2>&1; then
log_success "FTP 服务器连接成功"
else
log_error "无法连接到 FTP 服务器: $FTP_SERVER"
log_error "请检查:"
log_error " 1. FTP 服务器是否运行"
log_error " 2. 网络连接是否正常"
log_error " 3. 服务器地址是否正确"
return 1
if ! curl -u "${FTP_USER}:${FTP_PASSWORD}" -sfI "ftp://${FTP_SERVER}/" >/dev/null; then
log_error "无法连接到 FTP 服务器: $FTP_SERVER"; return 1
fi
# 测试 dns.conf 文件是否存在
log_info "检查远程 dns.conf 文件是否存在..."
if curl -u "${FTP_USER}:${FTP_PASSWORD}" -sfI "ftp://${FTP_SERVER}/dns.conf" >/dev/null 2>&1; then
log_success "远程 dns.conf 文件存在"
else
log_error "远程 dns.conf 文件不存在或无法访问"
log_error "请检查 FTP 服务器根目录下是否有 dns.conf 文件"
return 1
fi
# 尝试下载文件
log_info "开始下载 dns.conf 文件..."
if curl -u "${FTP_USER}:${FTP_PASSWORD}" -sf "ftp://${FTP_SERVER}/dns.conf" -o "$temp_file" 2>/dev/null; then
log_success "远程 DNS 配置文件下载成功"
echo "$temp_file"
else
log_error "下载 dns.conf 文件失败"
log_error "尝试手动测试命令:"
log_error " curl -u ${FTP_USER}:${FTP_PASSWORD} ftp://${FTP_SERVER}/dns.conf"
rm -f "$temp_file"
return 1
if ! curl -u "${FTP_USER}:${FTP_PASSWORD}" -sf "ftp://${FTP_SERVER}/dns.conf" -o "$tmp" 2>/dev/null; then
log_error "下载 dns.conf 失败"; rm -f "$tmp"; return 1
fi
echo "$tmp"
}
# 比较两个文件是否相同
compare_files() {
local file1="$1"
local file2="$2"
if [[ ! -f "$file1" || ! -f "$file2" ]]; then
return 1
fi
# 使用 diff 比较文件内容
if diff -q "$file1" "$file2" >/dev/null 2>&1; then
return 0 # 文件相同
else
return 1 # 文件不同
fi
# 文件比较
compare_files() { diff -q "$1" "$2" >/dev/null 2>&1; }
# Print the DNS server addresses found in the dns.conf file given as $1.
# Only lines that consist of nothing but a dotted quad (four groups of one to
# three digits) are kept; octet ranges are not checked. Output is sorted with
# duplicates removed.
get_dns_ips() {
    local dotted_quad='^[0-9]{1,3}(\.[0-9]{1,3}){3}$'
    grep -E -o "$dotted_quad" "$1" | sort -u
}
# 将 DNS 配置追加到 /etc/resolv.conf
# 安全更新 resolv.conf保留符号链接
update_resolv_conf() {
local dns_conf_file="$1"
log_info "更新 /etc/resolv.conf 文件..."
# 备份原始文件
if [[ -f "$RESOLV_CONF" ]]; then
cp "$RESOLV_CONF" "${RESOLV_CONF}.backup.$(date +%Y%m%d_%H%M%S)"
log_info "已备份原始 resolv.conf 文件"
local dns_conf="$1"
local dns_ips
mapfile -t dns_ips < <(get_dns_ips "$dns_conf")
[[ ${#dns_ips[@]} -eq 0 ]] && { log_warning "未检测到有效 DNS"; return; }
local target_file="$RESOLV_CONF"
if [[ ! -w "$RESOLV_CONF" ]]; then
log_warning "/etc/resolv.conf 不可写,使用兜底路径 $ALT_RESOLV_CONF"
target_file="$ALT_RESOLV_CONF"
fi
# 读取 DNS 配置文件并追加到 resolv.conf
while IFS= read -r line; do
# 跳过空行和注释行
[[ -z "$line" || "$line" =~ ^[[:space:]]*# ]] && continue
# 验证是否为有效的 IP 地址
if [[ "$line" =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then
# 检查是否已存在相同的 nameserver 行
if ! grep -q "nameserver $line" "$RESOLV_CONF" 2>/dev/null; then
echo "nameserver $line" >> "$RESOLV_CONF"
log_info "添加 DNS 服务器: $line"
else
log_info "DNS 服务器已存在,跳过: $line"
fi
else
log_warning "跳过无效的 DNS 地址: $line"
local temp="/tmp/resolv.new.$$"
cp "$target_file" "${target_file}.backup.$(date +%Y%m%d_%H%M%S)" 2>/dev/null || true
log_info "更新 DNS 配置文件: $target_file"
# 写入新的 nameserver 行
for ip in "${dns_ips[@]}"; do
echo "nameserver $ip"
done >"$temp"
# 追加原内容(去掉重复 nameserver
grep -v '^nameserver' "$target_file" >>"$temp" 2>/dev/null || true
awk '!a[$0]++' "$temp" >"${temp}.uniq"
# ⚙️ 使用 cat 原地覆盖,避免 mv 引发 “设备忙”
if cat "${temp}.uniq" >"$target_file" 2>/dev/null; then
chmod 644 "$target_file"
log_success "DNS 更新完成: ${dns_ips[*]}"
else
log_error "无法写入 $target_file,可能被系统锁定"
fi
rm -f "$temp" "${temp}.uniq"
}
# Fallback check: make sure every DNS server listed in the dns.conf file
# given as $1 has its own nameserver line in $RESOLV_CONF. On the first
# missing address the file is rewritten via update_resolv_conf and the
# function returns. Does nothing when dns.conf yields no addresses.
ensure_dns_in_resolv() {
    local dns_conf="$1"
    local dns_ips
    mapfile -t dns_ips < <(get_dns_ips "$dns_conf")
    [[ ${#dns_ips[@]} -eq 0 ]] && return

    local ip
    for ip in "${dns_ips[@]}"; do
        # Anchor the match to the whole line and escape the dots. A bare
        # "nameserver $ip" is an unanchored regex, so 10.0.0.1 would be
        # considered present when only "nameserver 10.0.0.10" exists.
        if ! grep -Eq "^nameserver[[:space:]]+${ip//./\\.}[[:space:]]*$" "$RESOLV_CONF" 2>/dev/null; then
            log_warning "检测到 /etc/resolv.conf 缺少 $ip,执行兜底修复"
            update_resolv_conf "$dns_conf"
            return
        fi
    done
    log_info "/etc/resolv.conf 已包含所有 DNS"
}
# 记录同步日志
log_sync() {
local message="$1"
local timestamp=$(date '+%Y-%m-%d %H:%M:%S')
echo "[$timestamp] $message" >> "$LOG_FILE"
}
log_sync() { echo "[$(date '+%F %T')] $1" >>"$LOG_FILE"; }
# 主函数
main() {
log_info "开始 DNS 同步检查..."
log_sync "DNS 同步检查开始"
# 确保系统目录存在
mkdir -p "/opt/argus-metric"
# 获取 FTP 配置
mkdir -p /opt/argus-metric
get_ftp_config
# 检查本地 DNS 配置文件是否存在
local remote_file
if ! remote_file=$(download_remote_dns_conf); then
log_error "下载失败"; log_sync "同步失败"; exit 1
fi
if [[ ! -f "$LOCAL_DNS_CONF" ]]; then
log_warning "本地 DNS 配置文件不存在: $LOCAL_DNS_CONF"
log_warning "将下载远程配置文件并更新系统 DNS 设置"
# 下载远程配置文件
if remote_file=$(download_remote_dns_conf); then
# 复制到本地
cp "$remote_file" "$LOCAL_DNS_CONF"
log_success "远程 DNS 配置文件已保存到本地"
# 更新 resolv.conf
update_resolv_conf "$LOCAL_DNS_CONF"
log_sync "首次同步完成DNS 配置已更新"
# 清理临时文件
rm -f "$remote_file"
else
log_error "无法下载远程 DNS 配置文件,同步失败"
log_sync "同步失败:无法下载远程配置文件"
exit 1
fi
log_info "本地 dns.conf 不存在,初始化..."
cp "$remote_file" "$LOCAL_DNS_CONF"
update_resolv_conf "$LOCAL_DNS_CONF"
log_sync "首次同步完成"
else
log_info "本地 DNS 配置文件存在: $LOCAL_DNS_CONF"
# 下载远程配置文件进行比较
if remote_file=$(download_remote_dns_conf); then
# 比较文件
if compare_files "$LOCAL_DNS_CONF" "$remote_file"; then
log_info "DNS 配置文件无变化,无需更新"
log_sync "DNS 配置文件无变化"
else
log_info "检测到 DNS 配置文件有变化,开始同步..."
log_sync "检测到 DNS 配置文件变化,开始同步"
# 更新本地配置文件
cp "$remote_file" "$LOCAL_DNS_CONF"
log_success "本地 DNS 配置文件已更新"
# 更新 resolv.conf
update_resolv_conf "$LOCAL_DNS_CONF"
log_sync "DNS 配置同步完成"
fi
# 清理临时文件
rm -f "$remote_file"
if compare_files "$LOCAL_DNS_CONF" "$remote_file"; then
log_info "dns.conf 无变化"
ensure_dns_in_resolv "$LOCAL_DNS_CONF"
log_sync "dns.conf 无变化,执行兜底检查"
else
log_error "无法下载远程 DNS 配置文件,跳过本次同步"
log_sync "同步失败:无法下载远程配置文件"
exit 1
log_info "检测到 DNS 配置更新"
cp "$remote_file" "$LOCAL_DNS_CONF"
update_resolv_conf "$LOCAL_DNS_CONF"
log_sync "DNS 配置同步完成"
fi
fi
log_success "DNS 同步检查完成"
log_sync "DNS 同步检查完成"
rm -f "$remote_file"
log_success "DNS 同步流程完成"
}
# 脚本入口
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
main "$@"
fi

View File

@ -31,25 +31,26 @@ RUN mkdir -p /var/log/supervisor
ENV FTP_BASE_PATH=/private/argus/ftp
# 设置域名环境变量
ENV DOMAIN=prom.ftp.argus.com
ENV DOMAIN=ftp.metric.argus.com
# 设置FTP用户密码环境变量
ENV FTP_PASSWORD=ZGClab1234!
# 设置用户和组ID环境变量
ARG FTP_UID=2133
ARG FTP_GID=2015
ENV FTP_UID=${FTP_UID}
ENV FTP_GID=${FTP_GID}
ARG ARGUS_BUILD_UID=2133
ARG ARGUS_BUILD_GID=2015
ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
# 创建FTP用户和目录结构
RUN groupadd -g ${FTP_GID} ftpuser && \
useradd -u ${FTP_UID} -g ${FTP_GID} -d ${FTP_BASE_PATH}/share -s /bin/bash ftpuser && \
RUN groupadd -g ${ARGUS_BUILD_GID} ftpuser && \
useradd -u ${ARGUS_BUILD_UID} -g ${ARGUS_BUILD_GID} -d ${FTP_BASE_PATH}/share -s /bin/bash ftpuser && \
mkdir -p ${FTP_BASE_PATH}/share \
&& mkdir -p /private/argus/etc \
&& mkdir -p /var/log/vsftpd \
&& mkdir -p /var/run/vsftpd/empty \
&& chown -R ftpuser:ftpuser ${FTP_BASE_PATH}
&& chown -R ftpuser:ftpuser ${FTP_BASE_PATH} \
&& mkdir -p /var/run/vsftpd/empty
# 创建vsftpd配置目录和用户列表文件
RUN mkdir -p /etc/vsftpd && \

View File

@ -32,6 +32,9 @@ IP=$(ifconfig eth0 | awk '/inet /{print $2}' || hostname -i)
echo "current IP: ${IP}"
echo "${IP}" > /private/argus/etc/${DOMAIN}
chown ${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID} /private/argus/etc/${DOMAIN}
chmod +x /private/argus/etc/${DOMAIN}
# 启动vsftpd
echo "[INFO] Starting vsftpd..."
exec /usr/sbin/vsftpd /tmp/vsftpd.conf

View File

@ -17,30 +17,31 @@ RUN mkdir -p /var/log/supervisor
ENV GRAFANA_BASE_PATH=/private/argus/metric/grafana
# 设置用户和组ID环境变量
ARG GRAFANA_UID=2133
ARG GRAFANA_GID=2015
ENV GRAFANA_UID=${GRAFANA_UID}
ENV GRAFANA_GID=${GRAFANA_GID}
ARG ARGUS_BUILD_UID=2133
ARG ARGUS_BUILD_GID=2015
ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
# 创建基本目录结构
RUN mkdir -p /private/argus/etc \
&& mkdir -p /private/argus/metric/grafana/data \
&& mkdir -p /private/argus/metric/grafana/logs \
&& mkdir -p /private/argus/metric/grafana/plugins \
&& mkdir -p /private/argus/metric/grafana/provisioning/datasources \
&& mkdir -p /private/argus/metric/grafana/provisioning/dashboards \
&& mkdir -p /private/argus/metric/grafana/data/sessions \
&& mkdir -p /private/argus/metric/grafana/data/dashboards \
&& mkdir -p /private/argus/metric/grafana/config \
&& mkdir -p ${GRAFANA_BASE_PATH}/data \
&& mkdir -p ${GRAFANA_BASE_PATH}/logs \
&& mkdir -p ${GRAFANA_BASE_PATH}/plugins \
&& mkdir -p ${GRAFANA_BASE_PATH}/provisioning/datasources \
&& mkdir -p ${GRAFANA_BASE_PATH}/provisioning/dashboards \
&& mkdir -p ${GRAFANA_BASE_PATH}/data/sessions \
&& mkdir -p ${GRAFANA_BASE_PATH}/data/dashboards \
&& mkdir -p ${GRAFANA_BASE_PATH}/config \
&& mkdir -p /etc/grafana \
&& mkdir -p /var/lib/grafana \
&& mkdir -p /var/log/grafana
# 修改 Grafana 用户 UID/GID 并授权
RUN deluser grafana && \
addgroup -g ${GRAFANA_GID} grafana && \
adduser -u ${GRAFANA_UID} -G grafana -s /bin/sh -D grafana && \
chown -R grafana:grafana /var/lib/grafana /etc/grafana /var/log/grafana /private/argus
addgroup -g ${ARGUS_BUILD_GID} grafana && \
adduser -u ${ARGUS_BUILD_UID} -G grafana -s /bin/sh -D grafana && \
chown -R grafana:grafana /var/lib/grafana /etc/grafana /var/log/grafana ${GRAFANA_BASE_PATH}
# 复制配置文件到容器内临时位置
COPY grafana.ini /tmp/grafana.ini

View File

@ -9,6 +9,7 @@ DOMAIN=grafana.metric.argus.com
IP=$(ifconfig | awk '/inet / && $2 != "127.0.0.1" {print $2; exit}')
echo "current IP: ${IP}"
echo "${IP}" > /private/argus/etc/${DOMAIN}
chmod +x /private/argus/etc/${DOMAIN}
# 确保必要目录存在(权限已在 Dockerfile 中设置)
mkdir -p /private/argus/metric/grafana/data
@ -27,7 +28,6 @@ mkdir -p /var/lib/grafana
if [ -f "/tmp/grafana.ini" ]; then
echo "[INFO] Copying grafana.ini to /private/argus/metric/grafana/config/"
cp /tmp/grafana.ini /private/argus/metric/grafana/config/grafana.ini
chown grafana:grafana /private/argus/metric/grafana/config/grafana.ini
echo "[INFO] Grafana configuration copied successfully"
fi
@ -47,12 +47,9 @@ fi
if [ -f "/tmp/datasources.yml" ]; then
echo "[INFO] Copying datasource configuration to /private/argus/metric/grafana/provisioning/datasources/"
cp /tmp/datasources.yml /private/argus/metric/grafana/provisioning/datasources/datasources.yml
chown grafana:grafana /private/argus/metric/grafana/provisioning/datasources/datasources.yml
echo "[INFO] Datasource configuration copied successfully"
elif [ -d "/private/argus/metric/grafana/provisioning/datasources" ] && [ "$(ls -A /private/argus/metric/grafana/provisioning/datasources)" ]; then
echo "[INFO] Found existing datasource provisioning files in /private/argus/metric/grafana/provisioning/datasources"
# 确保数据源配置目录权限正确
chown -R grafana:grafana /private/argus/metric/grafana/provisioning/datasources
elif [ -d "/etc/grafana/provisioning/datasources" ] && [ "$(ls -A /etc/grafana/provisioning/datasources)" ]; then
echo "[INFO] Found datasource provisioning files in /etc/grafana/provisioning/datasources"
# 确保数据源配置目录权限正确
@ -65,7 +62,6 @@ fi
if [ -f "/tmp/dashboards.yml" ]; then
echo "[INFO] Copying dashboard configuration to /private/argus/metric/grafana/provisioning/dashboards/"
cp /tmp/dashboards.yml /private/argus/metric/grafana/provisioning/dashboards/dashboards.yml
chown grafana:grafana /private/argus/metric/grafana/provisioning/dashboards/dashboards.yml
echo "[INFO] Dashboard configuration copied successfully"
fi
@ -73,13 +69,9 @@ fi
if [ -f "/tmp/default_dashboard.json" ]; then
echo "[INFO] Copying default dashboard to /private/argus/metric/grafana/provisioning/dashboards/"
cp /tmp/default_dashboard.json /private/argus/metric/grafana/provisioning/dashboards/default_dashboard.json
chown grafana:grafana /private/argus/metric/grafana/provisioning/dashboards/default_dashboard.json
echo "[INFO] Default dashboard copied successfully"
fi
# 确保所有配置目录权限正确
chown -R grafana:grafana /private/argus/metric/grafana/provisioning/
# 启动 Grafana
if [ -n "$CONFIG_FILE" ]; then
echo "[INFO] Starting Grafana with custom configuration..."

View File

@ -48,11 +48,11 @@ RUN mkdir -p /var/log/supervisor
ENV PROMETHEUS_BASE_PATH=/private/argus/metric/prometheus
# 设置用户和组ID环境变量
ARG PROMETHEUS_UID=2133
ARG PROMETHEUS_GID=2015
ENV PROMETHEUS_UID=${PROMETHEUS_UID}
ENV PROMETHEUS_GID=${PROMETHEUS_GID}
ARG ARGUS_BUILD_UID=2133
ARG ARGUS_BUILD_GID=2015
ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
# 创建目录结构
RUN mkdir -p ${PROMETHEUS_BASE_PATH}/rules \
&& mkdir -p ${PROMETHEUS_BASE_PATH}/targets \
@ -61,11 +61,11 @@ RUN mkdir -p ${PROMETHEUS_BASE_PATH}/rules \
&& ln -s ${PROMETHEUS_BASE_PATH} /prometheus
# 修改 Prometheus 用户 UID/GID 并授权
RUN usermod -u ${PROMETHEUS_UID} nobody && \
groupmod -g ${PROMETHEUS_GID} nogroup && \
RUN usermod -u ${ARGUS_BUILD_UID} nobody && \
groupmod -g ${ARGUS_BUILD_GID} nogroup && \
chown -h nobody:nogroup /prometheus && \
chown -R nobody:nogroup /private/argus/metric /etc/prometheus && \
chown -R nobody:nogroup ${PROMETHEUS_BASE_PATH}
chown -R nobody:nogroup ${PROMETHEUS_BASE_PATH} && \
chown -R nobody:nogroup /etc/prometheus
# supervisor 配置
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf

View File

@ -7,7 +7,7 @@ global:
alerting:
alertmanagers:
- static_configs:
- targets: []
- targets: ["alertmanager.alert.argus.com:9093"]
# 规则目录
rule_files:

View File

@ -17,6 +17,7 @@ sed "s|\${PROMETHEUS_BASE_PATH}|${PROMETHEUS_BASE_PATH}|g" \
IP=$(ifconfig eth0 | awk '/inet /{print $2}')
echo "current IP: ${IP}"
echo "${IP}" > /private/argus/etc/${DOMAIN}
chmod +x /private/argus/etc/${DOMAIN}
exec /bin/prometheus \
--config.file=${PROMETHEUS_BASE_PATH}/prometheus.yml \

View File

@ -1,7 +1,7 @@
.env
data/
images-cache/
private-test-node/
*.tar
*.log
.DS_Store

View File

@ -0,0 +1,39 @@
# 使用NVIDIA官方CUDA基础镜像
FROM nvidia/cuda:12.2.2-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive
# 设置时区
ENV TZ=Asia/Shanghai
RUN apt-get update -qq && \
apt-get install -y -qq \
tzdata \
curl \
wget \
gnupg2 \
software-properties-common \
ca-certificates \
&& rm -rf /var/lib/apt/lists/*
# 配置时区
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
WORKDIR /app
# 创建启动脚本在运行时验证GPU
COPY <<EOF /app/start.sh
#!/bin/bash
echo "检查GPU环境..."
if command -v nvidia-smi &> /dev/null; then
nvidia-smi
echo "GPU环境正常"
else
echo "警告: nvidia-smi 命令不可用请确保容器运行时启用了GPU支持"
fi
exec "\$@"
EOF
RUN chmod +x /app/start.sh
CMD ["/app/start.sh", "/bin/bash"]

View File

@ -0,0 +1,6 @@
FROM ubuntu:22.04
RUN apt-get update -qq && \
DEBIAN_FRONTEND=noninteractive apt-get install -y -qq tzdata && \
rm -rf /var/lib/apt/lists/*
ENV TZ=Asia/Shanghai

View File

@ -1,29 +1,39 @@
networks:
default:
name: argus-debug-net
external: true
services:
ftp:
build:
context: ../ftp/build
dockerfile: Dockerfile
args:
FTP_UID: ${FTP_UID:-2133}
FTP_GID: ${FTP_GID:-2015}
ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133}
ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015}
USE_INTRANET: ${USE_INTRANET:-false}
image: argus-metric-ftp:latest
container_name: argus-ftp
restart: unless-stopped
environment:
- TZ=Asia/Shanghai
- FTP_BASE_PATH=/private/argus/ftp
- FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!}
- DOMAIN=${FTP_DOMAIN:-prom.ftp.argus.com}
- FTP_UID=${FTP_UID:-2133}
- FTP_GID=${FTP_GID:-2015}
- DOMAIN=${FTP_DOMAIN:-ftp.metric.argus.com}
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
ports:
- "${FTP_PORT:-21}:21"
- "${FTP_DATA_PORT:-20}:20"
- "21100-21110:21100-21110"
volumes:
- ${DATA_ROOT:-./data}/ftp:/private/argus/ftp
- ${DATA_ROOT:-./data}/etc:/private/argus/etc
- ${DATA_ROOT:-/private}/argus/metric/ftp:/private/argus/ftp
- ${DATA_ROOT:-/private}/argus/etc:/private/argus/etc
- /etc/localtime:/etc/localtime:ro
- /etc/timezone:/etc/timezone:ro
networks:
- argus-network
default:
ipv4_address: 172.30.0.40
logging:
driver: "json-file"
options:
@ -35,23 +45,27 @@ services:
context: ../prometheus/build
dockerfile: Dockerfile
args:
PROMETHEUS_UID: ${PROMETHEUS_UID:-2133}
PROMETHEUS_GID: ${PROMETHEUS_GID:-2015}
ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133}
ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015}
USE_INTRANET: ${USE_INTRANET:-false}
image: argus-metric-prometheus:latest
container_name: argus-prometheus
restart: unless-stopped
environment:
- TZ=Asia/Shanghai
- PROMETHEUS_BASE_PATH=/private/argus/metric/prometheus
- PROMETHEUS_UID=${PROMETHEUS_UID:-2133}
- PROMETHEUS_GID=${PROMETHEUS_GID:-2015}
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
ports:
- "${PROMETHEUS_PORT:-9090}:9090"
volumes:
- ${DATA_ROOT:-./data}/prometheus:/private/argus/metric/prometheus
- ${DATA_ROOT:-./data}/etc:/private/argus/etc
- ${DATA_ROOT:-/private}/argus/metric/prometheus:/private/argus/metric/prometheus
- ${DATA_ROOT:-/private}/argus/etc:/private/argus/etc
- /etc/localtime:/etc/localtime:ro
- /etc/timezone:/etc/timezone:ro
networks:
- argus-network
default:
ipv4_address: 172.30.0.41
logging:
driver: "json-file"
options:
@ -63,25 +77,29 @@ services:
context: ../grafana/build
dockerfile: Dockerfile
args:
GRAFANA_UID: ${GRAFANA_UID:-2133}
GRAFANA_GID: ${GRAFANA_GID:-2015}
ARGUS_BUILD_UID: ${ARGUS_BUILD_UID:-2133}
ARGUS_BUILD_GID: ${ARGUS_BUILD_GID:-2015}
image: argus-metric-grafana:latest
container_name: argus-grafana
restart: unless-stopped
environment:
- TZ=Asia/Shanghai
- GRAFANA_BASE_PATH=/private/argus/metric/grafana
- GRAFANA_UID=${GRAFANA_UID:-2133}
- GRAFANA_GID=${GRAFANA_GID:-2015}
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
- GF_SERVER_HTTP_PORT=3000
- GF_LOG_LEVEL=warn
- GF_LOG_MODE=console
ports:
- "${GRAFANA_PORT:-3000}:3000"
volumes:
- ${DATA_ROOT:-./data}/grafana:/private/argus/metric/grafana
- ${DATA_ROOT:-./data}/etc:/private/argus/etc
- ${DATA_ROOT:-/private}/argus/metric/grafana:/private/argus/metric/grafana
- ${DATA_ROOT:-/private}/argus/etc:/private/argus/etc
- /etc/localtime:/etc/localtime:ro
- /etc/timezone:/etc/timezone:ro
networks:
- argus-network
default:
ipv4_address: 172.30.0.42
depends_on:
- prometheus
logging:
@ -90,16 +108,78 @@ services:
max-size: "10m"
max-file: "3"
networks:
argus-network:
driver: bridge
name: argus-network
test-node:
build:
context: ./client-test-node/build
dockerfile: Dockerfile
image: argus-metric-test-node:latest
container_name: argus-metric-test-node
hostname: test-metric-node-001
restart: unless-stopped
privileged: true
depends_on:
- ftp
- prometheus
environment:
- TZ=Asia/Shanghai
- DEBIAN_FRONTEND=noninteractive
- FTP_DOMAIN=${FTP_DOMAIN:-ftp.metric.argus.com}
- FTP_SERVER=${FTP_SERVER:-172.30.0.40}
- FTP_USER=${FTP_USER:-ftpuser}
- FTP_PASSWORD=${FTP_PASSWORD:-ZGClab1234!}
- FTP_PORT=${FTP_PORT:-21}
volumes:
- ${DATA_ROOT:-/private}/argus/agent:/private/argus/agent
- /etc/localtime:/etc/localtime:ro
- /etc/timezone:/etc/timezone:ro
command: sleep infinity
networks:
default:
ipv4_address: 172.30.0.50
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"
volumes:
ftp_data:
driver: local
prometheus_data:
driver: local
grafana_data:
driver: local
test-gpu-node:
build:
context: ./client-test-gpu-node/build
dockerfile: Dockerfile
image: argus-metric-test-gpu-node:latest
container_name: argus-metric-test-gpu-node
hostname: test-metric-gpu-node-001
restart: unless-stopped
privileged: true
runtime: nvidia
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities:
- gpu
depends_on:
- ftp
- prometheus
environment:
- TZ=Asia/Shanghai
- DEBIAN_FRONTEND=noninteractive
- NVIDIA_VISIBLE_DEVICES=all
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
- GPU_MODE=gpu
volumes:
- ${DATA_ROOT:-/private}/argus/agent:/private/argus/agent
- /etc/localtime:/etc/localtime:ro
- /etc/timezone:/etc/timezone:ro
command: sleep infinity
networks:
default:
ipv4_address: 172.30.0.51
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"

View File

@ -1,19 +1,15 @@
# 用户和组配置
FTP_UID=2133
FTP_GID=2015
PROMETHEUS_UID=2133
PROMETHEUS_GID=2015
GRAFANA_UID=2133
GRAFANA_GID=2015
# 统一用户和组配置
ARGUS_BUILD_UID=1048
ARGUS_BUILD_GID=1048
# 数据根目录
DATA_ROOT=/private/argus
DATA_ROOT=/private
# FTP 配置
FTP_PORT=2122
FTP_DATA_PORT=2022
FTP_PORT=21
FTP_DATA_PORT=20
FTP_PASSWORD=ZGClab1234!
FTP_DOMAIN=prom.ftp.argus.com
FTP_DOMAIN=ftp.metric.argus.com
# Prometheus 配置
PROMETHEUS_PORT=9090

View File

@ -1,90 +0,0 @@
#!/bin/bash
# Directory initialisation script.
# Creates the FTP / Prometheus / Grafana / shared-config data directories under
# DATA_ROOT and assigns each tree to its configured UID:GID with mode 755.
set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

# Load .env when present so its values take precedence over the defaults below.
if [ -f .env ]; then
    echo "加载 .env 配置文件..."
    source .env
fi

# Defaults
FTP_UID=${FTP_UID:-2133}
FTP_GID=${FTP_GID:-2015}
PROMETHEUS_UID=${PROMETHEUS_UID:-2133}
PROMETHEUS_GID=${PROMETHEUS_GID:-2015}
GRAFANA_UID=${GRAFANA_UID:-2133}
GRAFANA_GID=${GRAFANA_GID:-2015}
DATA_ROOT=${DATA_ROOT:-./data}

# Create an empty Prometheus file-SD targets file unless it already exists.
#   $1 - exporter name (file is <exporter>.json), $2 - value of the "job" label
write_default_targets() {
    local exporter="$1" job="$2"
    local target_file="${DATA_ROOT}/prometheus/targets/${exporter}.json"
    if [ ! -f "$target_file" ]; then
        echo "创建默认 ${exporter} targets..."
        printf '[\n  {\n    "targets": [],\n    "labels": {\n      "job": "%s"\n    }\n  }\n]\n' "$job" \
            | sudo tee "$target_file" > /dev/null
    fi
}

# Recursively assign owner and mode 755 to a tree.
#   $1 - owner as UID:GID, $2 - directory
set_tree_permissions() {
    sudo chown -R "$1" "$2"
    sudo chmod -R 755 "$2"
}

echo "开始初始化目录结构..."
echo "数据目录: ${DATA_ROOT}"
echo ""

# FTP
echo "创建 FTP 目录..."
sudo mkdir -p "${DATA_ROOT}/ftp/share"
set_tree_permissions "${FTP_UID}:${FTP_GID}" "${DATA_ROOT}/ftp"

# Prometheus: the targets files are created before ownership is changed so that
# they receive the same owner as the rest of the tree.
echo "创建 Prometheus 目录..."
sudo mkdir -p "${DATA_ROOT}/prometheus"/{data,rules,targets}
write_default_targets node_exporter node
write_default_targets dcgm_exporter dcgm
set_tree_permissions "${PROMETHEUS_UID}:${PROMETHEUS_GID}" "${DATA_ROOT}/prometheus"

# Grafana
echo "创建 Grafana 目录..."
sudo mkdir -p "${DATA_ROOT}/grafana"/{data,logs,plugins,provisioning/datasources,provisioning/dashboards,data/sessions,data/dashboards,config}
set_tree_permissions "${GRAFANA_UID}:${GRAFANA_GID}" "${DATA_ROOT}/grafana"

# Shared configuration directory (owned by the FTP user)
sudo mkdir -p "${DATA_ROOT}/etc"
set_tree_permissions "${FTP_UID}:${FTP_GID}" "${DATA_ROOT}/etc"

echo "目录初始化完成!"
echo ""
echo "目录结构:"
echo " ${DATA_ROOT}/"
echo " ├── ftp/ (UID:${FTP_UID}, GID:${FTP_GID})"
echo " ├── prometheus/ (UID:${PROMETHEUS_UID}, GID:${PROMETHEUS_GID})"
echo " ├── grafana/ (UID:${GRAFANA_UID}, GID:${GRAFANA_GID})"
echo " └── etc/ (UID:${FTP_UID}, GID:${FTP_GID})"
echo ""
echo "您现在可以运行 'docker-compose up -d' 来启动所有服务"

View File

@ -0,0 +1,18 @@
#!/bin/bash
# Argus Metric end-to-end test driver: runs the numbered step scripts located
# next to this file in order; `set -e` stops the run at the first failing step.
set -e

SCRIPT_DIR="$(dirname "$0")"
BANNER="=========================================="

echo "$BANNER"
echo "Argus Metric E2E Test"
echo "$BANNER"

for step in \
    01_start_services.sh \
    02_publish_artifact.sh \
    03_test_node_install.sh \
    04_verify_install.sh
do
    bash "$SCRIPT_DIR/$step"
done

echo "$BANNER"
echo "E2E 测试完成"
echo "$BANNER"

View File

@ -0,0 +1,27 @@
#!/bin/bash
# Step 01: start every service through common/start-all.sh and check that the
# expected containers are running.
#
# Usage: 01_start_services.sh [--rebuild|-r]
#   --rebuild, -r   forwarded to start-all.sh as --rebuild (force image rebuild)
#
# Exits non-zero when start-all.sh fails or an expected container is not running.
set -e

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"

# Parse arguments; an array keeps the "no flag" case free of empty arguments.
REBUILD_ARGS=()
if [[ "${1:-}" == "--rebuild" || "${1:-}" == "-r" ]]; then
    REBUILD_ARGS=(--rebuild)
    echo "[01] 启用强制重新构建模式"
fi

echo "[01] 启动所有服务..."
bash "$SCRIPT_DIR/common/start-all.sh" "${REBUILD_ARGS[@]}"

echo "[01] 等待服务就绪..."
sleep 5

echo "[01] 检查服务状态..."
RUNNING_NAMES="$(docker ps --format '{{.Names}}')"
MISSING=0
# The two test-node patterns are anchored: "argus-metric-test-node" is a prefix of
# "argus-metric-test-gpu-node", so a substring match cannot tell the two apart.
for pattern in \
    'argus-ftp' \
    'argus-prometheus' \
    'argus-grafana' \
    '^argus-metric-test-node$' \
    '^argus-metric-test-gpu-node$'
do
    if ! grep -E -- "$pattern" <<<"$RUNNING_NAMES"; then
        echo "[01] 错误: 未找到运行中的容器 (匹配: $pattern)" >&2
        MISSING=$((MISSING + 1))
    fi
done

if [ "$MISSING" -ne 0 ]; then
    echo "[01] 错误: $MISSING 个容器未运行" >&2
    exit 1
fi

echo "[01] 基础服务已启动"

View File

@ -0,0 +1,60 @@
#!/bin/bash
# Step 02: bump the plugin version, build the all-in-one package and publish it
# into the FTP share directory, then normalise ownership and permissions there.
set -e

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
TEST_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
PLUGIN_DIR="$(cd "$SCRIPT_DIR/../../client-plugins/all-in-one-full" && pwd)"

# Load .env when present
if [ -f "$TEST_DIR/.env" ]; then
    source "$TEST_DIR/.env"
fi

# Resolve the publish directory. Prefer the host directory mounted into the
# running argus-ftp container; otherwise use the layout created by
# init-directories.sh (<DATA_ROOT>/argus/metric/ftp/share).
FTP_MOUNT=""
if docker ps --format '{{.Names}}' | grep -q '^argus-ftp$'; then
    FTP_MOUNT=$(docker inspect argus-ftp --format '{{range .Mounts}}{{if eq .Destination "/private/argus/ftp"}}{{.Source}}{{end}}{{end}}')
fi

# An empty lookup must not be used: it would turn OUTPUT_DIR into "/share",
# which is recursively chown'ed further down.
if [ -n "$FTP_MOUNT" ]; then
    OUTPUT_DIR="${FTP_MOUNT}/share"
    echo "[02] 容器挂载: $OUTPUT_DIR"
else
    OUTPUT_DIR="${DATA_ROOT:-/private}/argus/metric/ftp/share"
    echo "[02] 默认路径: $OUTPUT_DIR"
fi

OWNER="${ARGUS_BUILD_UID:-2133}:${ARGUS_BUILD_GID:-2015}"

cd "$PLUGIN_DIR"

echo "[02] 递增版本号..."
bash scripts/version-manager.sh bump minor

VERSION_FILE="config/VERSION"
if [ ! -f "$VERSION_FILE" ]; then
    echo "[02] 错误: 未找到 $VERSION_FILE"
    exit 1
fi
VERSION=$(tr -d '[:space:]' < "$VERSION_FILE")
echo "[02] 新版本: $VERSION"

echo "[02] 构建安装包..."
bash scripts/package_artifact.sh --force

echo "[02] 发布到 FTP: $OUTPUT_DIR"
sudo bash scripts/publish_artifact.sh "$VERSION" --output-dir "$OUTPUT_DIR" --owner "$OWNER"

# Refuse to touch permissions when the publish step left no directory behind.
if [ ! -d "$OUTPUT_DIR" ]; then
    echo "[02] 错误: 发布目录不存在: $OUTPUT_DIR" >&2
    exit 1
fi

echo "[02] 设置文件权限..."
# Owner for the whole tree
sudo chown -R "$OWNER" "$OUTPUT_DIR"
# Directories: 755 (rwxr-xr-x)
sudo find "$OUTPUT_DIR" -type d -exec chmod 755 {} +
# Regular files: 644 (rw-r--r--)
sudo find "$OUTPUT_DIR" -type f -exec chmod 644 {} +
# Shell scripts additionally get execute permission (755); must run after the 644 pass
sudo find "$OUTPUT_DIR" -type f -name "*.sh" -exec chmod 755 {} +
echo "[02] 权限设置完成 (UID:GID=$OWNER, dirs=755, files=644, scripts=755)"

echo "[02] 发布完成,验证文件..."
ls -lh "$OUTPUT_DIR"

echo "[02] 完成"

View File

@ -0,0 +1,33 @@
#!/bin/bash
# Step 03: inside the argus-metric-test-node container, download setup.sh from
# the FTP server and run it.
#
# Environment (all optional): FTP_SERVER, FTP_USER, FTP_PASSWORD, FTP_PORT.
set -e

FTP_SERVER="${FTP_SERVER:-172.30.0.40}"
FTP_USER="${FTP_USER:-ftpuser}"
FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}"
FTP_PORT="${FTP_PORT:-21}"
FTP_HOST="${FTP_SERVER}"

echo "[03] 进入测试节点执行安装..."
echo "[03] 使用 FTP 地址: ${FTP_HOST}:${FTP_PORT}"

# The connection settings are handed to the in-container shell as positional
# parameters instead of being spliced into the script text, so values containing
# spaces, quotes or `$` can neither break nor inject into the command.
docker exec argus-metric-test-node bash -c '
set -e
ftp_host="$1"
ftp_user="$2"
ftp_password="$3"
ftp_port="$4"

if ! command -v curl &>/dev/null; then
    echo "[03] curl 未安装,正在安装..."
    apt-get update && apt-get install -y curl
fi

cd /tmp
echo "[03] 下载 setup.sh..."
curl -u "${ftp_user}:${ftp_password}" "ftp://${ftp_host}:${ftp_port}/setup.sh" -o setup.sh

echo "[03] 执行安装..."
chmod +x setup.sh
bash setup.sh --server "${ftp_host}" --user "${ftp_user}" --password "${ftp_password}" --port "${ftp_port}"

echo "[03] 安装完成"
' _ "$FTP_HOST" "$FTP_USER" "$FTP_PASSWORD" "$FTP_PORT"

echo "[03] 完成"

View File

@ -0,0 +1,33 @@
#!/bin/bash
# Inside the argus-metric-test-gpu-node container, download setup.sh from the
# FTP server and run it.
#
# Environment (all optional): FTP_SERVER, FTP_USER, FTP_PASSWORD, FTP_PORT.
set -e

FTP_SERVER="${FTP_SERVER:-172.30.0.40}"
FTP_USER="${FTP_USER:-ftpuser}"
FTP_PASSWORD="${FTP_PASSWORD:-ZGClab1234!}"
FTP_PORT="${FTP_PORT:-21}"
FTP_HOST="${FTP_SERVER}"

echo "[03] 进入测试节点执行安装..."
echo "[03] 使用 FTP 地址: ${FTP_HOST}:${FTP_PORT}"

# The connection settings are handed to the in-container shell as positional
# parameters instead of being spliced into the script text, so values containing
# spaces, quotes or `$` can neither break nor inject into the command.
docker exec argus-metric-test-gpu-node bash -c '
set -e
ftp_host="$1"
ftp_user="$2"
ftp_password="$3"
ftp_port="$4"

if ! command -v curl &>/dev/null; then
    echo "[03] curl 未安装,正在安装..."
    apt-get update && apt-get install -y curl
fi

cd /tmp
echo "[03] 下载 setup.sh..."
curl -u "${ftp_user}:${ftp_password}" "ftp://${ftp_host}:${ftp_port}/setup.sh" -o setup.sh

echo "[03] 执行安装..."
chmod +x setup.sh
bash setup.sh --server "${ftp_host}" --user "${ftp_user}" --password "${ftp_password}" --port "${ftp_port}"

echo "[03] 安装完成"
' _ "$FTP_HOST" "$FTP_USER" "$FTP_PASSWORD" "$FTP_PORT"

echo "[03] 完成"

View File

@ -0,0 +1,96 @@
#!/bin/bash
# Step 04: verify the installation inside argus-metric-test-node by checking
# that the monitoring ports (9100 / 9400 / 2020) are listening and answer HTTP.
# Exits 1 when the container is not running or any check fails.
set -e

echo "[04] 验证安装结果 - 检查监控端口..."
echo "=========================================="

# The container must be running
if ! docker ps --format '{{.Names}}' | grep -q '^argus-metric-test-node$'; then
    echo "错误: 容器 argus-metric-test-node 未运行"
    exit 1
fi

ERRORS=0

# ==================== Listening ports ====================
echo ""
echo "[1] 检查监听端口..."
echo "----------------------------------------"
# `|| true`: under `set -e` a failing command substitution in an assignment
# terminates the script immediately, which would hide both the captured output
# and the debugging hints at the end. The verdict is taken from the output below.
CHECK_RESULT=$(docker exec argus-metric-test-node bash -c '
if command -v netstat >/dev/null 2>&1; then
    echo "使用 netstat 检查端口:"
    if netstat -tlnp 2>/dev/null | grep -E ":(9100|9400|2020)"; then
        echo "✓ 找到监控端口"
        exit 0
    else
        echo "✗ 未找到监控端口 (9100/9400/2020)"
        exit 1
    fi
elif command -v ss >/dev/null 2>&1; then
    echo "使用 ss 检查端口:"
    if ss -tlnp 2>/dev/null | grep -E ":(9100|9400|2020)"; then
        echo "✓ 找到监控端口"
        exit 0
    else
        echo "✗ 未找到监控端口 (9100/9400/2020)"
        exit 1
    fi
elif command -v lsof >/dev/null 2>&1; then
    echo "使用 lsof 检查端口:"
    if lsof -i :9100 -i :9400 -i :2020 2>/dev/null | grep LISTEN; then
        echo "✓ 找到监控端口"
        exit 0
    else
        echo "✗ 未找到监控端口 (9100/9400/2020)"
        exit 1
    fi
else
    echo "? 没有可用的端口检查工具 (netstat/ss/lsof),跳过此检查"
    exit 0
fi
') || true
echo "$CHECK_RESULT"

# Only an explicit failure (exit 1) counts as an error; a missing tool (exit 0) does not.
if echo "$CHECK_RESULT" | grep -q "✗ 未找到监控端口"; then
    ERRORS=$((ERRORS + 1))
fi

# ==================== Port connectivity ====================
echo ""
echo "[2] 测试端口连通性..."
echo "----------------------------------------"
# The inner script exits with the number of unreachable ports; any non-zero
# status is counted as one error here.
docker exec argus-metric-test-node bash -c '
if command -v curl >/dev/null 2>&1; then
    FAILED=0
    for port in 9100 9400 2020; do
        echo -n "端口 $port: "
        if curl -s --connect-timeout 2 "http://localhost:$port/metrics" > /dev/null 2>&1; then
            echo "✓ 可访问 (/metrics)"
        elif curl -s --connect-timeout 2 "http://localhost:$port/" > /dev/null 2>&1; then
            echo "✓ 可访问 (根路径)"
        else
            echo "✗ 不可访问"
            FAILED=$((FAILED + 1))
        fi
    done
    exit $FAILED
else
    echo "? curl 不可用,跳过连通性测试"
    exit 0
fi
' || ERRORS=$((ERRORS + 1))

echo ""
echo "=========================================="
if [ $ERRORS -eq 0 ]; then
    echo "✓ [04] 验证完成 - 所有端口检查通过"
else
    echo "✗ [04] 验证失败 - 发现 $ERRORS 个问题"
    echo ""
    echo "调试建议:"
    echo " 1. 进入容器检查: docker exec -it argus-metric-test-node bash"
    echo " 2. 查看进程: docker exec argus-metric-test-node ps aux"
    echo " 3. 查看日志: docker exec argus-metric-test-node cat /tmp/argus_install.log"
    exit 1
fi
echo "=========================================="

View File

@ -0,0 +1,11 @@
#!/bin/bash
# Step 05: tear the test environment down via common/stop-all.sh.
# A failure of stop-all.sh is deliberately ignored so cleanup never fails the run.
set -e

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
STOP_SCRIPT="$SCRIPT_DIR/common/stop-all.sh"

echo "[05] 清理环境..."
if ! bash "$STOP_SCRIPT"; then
    :   # best effort: ignore the exit status
fi
echo "[05] 清理完成"

View File

@ -6,7 +6,8 @@
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"
TEST_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
cd "$TEST_DIR"
echo "=========================================="
echo " 路径检查脚本"
@ -18,15 +19,15 @@ echo ""
# 检查配置文件
echo "检查配置文件..."
if [ -f "$SCRIPT_DIR/docker-compose.yml" ]; then
if [ -f "$TEST_DIR/docker-compose.yml" ]; then
echo " ✓ docker-compose.yml 存在"
else
echo " ✗ docker-compose.yml 不存在"
fi
if [ -f "$SCRIPT_DIR/.env" ]; then
if [ -f "$TEST_DIR/.env" ]; then
echo " ✓ .env 存在"
elif [ -f "$SCRIPT_DIR/env.example" ]; then
elif [ -f "$TEST_DIR/env.example" ]; then
echo " ⚠ .env 不存在,但 env.example 存在"
else
echo " ✗ .env 和 env.example 都不存在"

View File

@ -0,0 +1,61 @@
#!/bin/bash
# Directory initialisation script.
# Creates the argus data directories under DATA_ROOT and hands the
# <DATA_ROOT>/argus/metric tree to ARGUS_BUILD_UID:ARGUS_BUILD_GID with mode 755.
# All path expansions are quoted so a DATA_ROOT containing spaces or glob
# characters cannot make the sudo commands act on unintended paths.
set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
cd "$TEST_DIR"

# Load .env when present so its values take precedence over the defaults below.
if [ -f .env ]; then
    echo "加载 .env 配置文件..."
    source .env
fi

# Defaults
ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
DATA_ROOT=${DATA_ROOT:-/private}

ARGUS_DIR="${DATA_ROOT}/argus"
METRIC_DIR="${ARGUS_DIR}/metric"

echo "开始初始化目录结构..."
echo "数据根目录: ${DATA_ROOT}"
echo "统一 UID: ${ARGUS_BUILD_UID}"
echo "统一 GID: ${ARGUS_BUILD_GID}"

# Base layout
echo "创建基础目录结构..."
sudo mkdir -p "${METRIC_DIR}"
sudo mkdir -p "${ARGUS_DIR}/etc"
sudo mkdir -p "${ARGUS_DIR}/agent"

# FTP
echo "创建 FTP 目录..."
sudo mkdir -p "${METRIC_DIR}/ftp/share"

# Prometheus (brace expansion stays outside the quotes so it still expands)
echo "创建 Prometheus 目录..."
sudo mkdir -p "${METRIC_DIR}/prometheus"/{data,rules,targets}

# Grafana
echo "创建 Grafana 目录..."
sudo mkdir -p "${METRIC_DIR}/grafana"/{data,logs,plugins,provisioning/datasources,provisioning/dashboards,data/sessions,data/dashboards,config}

# Ownership and mode are applied to the metric tree only; etc/ and agent/ are
# created above but left with whatever ownership mkdir gave them.
echo "设置目录权限..."
sudo chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" "${METRIC_DIR}"
sudo chmod -R 755 "${METRIC_DIR}"

echo "目录初始化完成!"
echo ""
echo "目录结构:"
echo " ${DATA_ROOT}/"
echo " ├── argus/ (UID:${ARGUS_BUILD_UID}, GID:${ARGUS_BUILD_GID})"
echo " │ ├── metric/"
echo " │ │ ├── ftp/"
echo " │ │ ├── prometheus/"
echo " │ │ └── grafana/"
echo ""
echo "您现在可以运行 'docker-compose up -d' 来启动所有服务"

View File

@ -6,7 +6,8 @@
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"
TEST_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
cd "$TEST_DIR"
# 检测 docker-compose 命令
if command -v docker-compose &> /dev/null; then
@ -19,7 +20,7 @@ else
fi
# 镜像缓存目录
IMAGE_CACHE_DIR="./images-cache"
IMAGE_CACHE_DIR="$TEST_DIR/images-cache"
mkdir -p "$IMAGE_CACHE_DIR"
# 定义镜像列表

View File

@ -6,13 +6,20 @@
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"
TEST_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
cd "$TEST_DIR"
# 解析参数
FORCE_REBUILD=false
if [[ "$1" == "--rebuild" ]]; then
FORCE_REBUILD=true
fi
echo "=========================================="
echo " Argus Metrics 一键启动脚本"
echo "=========================================="
echo ""
echo "当前工作目录: $SCRIPT_DIR"
echo "当前工作目录: $TEST_DIR"
echo ""
# 检查 Docker 和 Docker Compose
@ -21,19 +28,13 @@ if ! command -v docker &> /dev/null; then
exit 1
fi
# 检测 docker-compose 命令(兼容新旧版本)
COMPOSE_FILE="$SCRIPT_DIR/docker-compose.yml"
if command -v docker-compose &> /dev/null; then
DOCKER_COMPOSE="docker-compose -f $COMPOSE_FILE"
echo "使用: docker-compose"
elif docker compose version &> /dev/null 2>&1; then
DOCKER_COMPOSE="docker compose -f $COMPOSE_FILE"
echo "使用: docker compose"
else
echo "错误: 未找到 docker-compose 或 docker compose 命令"
# 检查 docker compose 命令
if ! docker compose version &> /dev/null 2>&1; then
echo "错误: 未找到 docker compose 命令,请确保 Docker Compose V2 已安装"
exit 1
fi
echo "Compose 文件: $COMPOSE_FILE"
echo "使用: docker compose"
echo "Compose 文件: $TEST_DIR/docker-compose.yml"
echo ""
# 检查必要的构建目录
@ -42,6 +43,8 @@ BUILD_DIRS=(
"../ftp/build"
"../prometheus/build"
"../grafana/build"
"client-test-node/build"
"client-test-gpu-node/build"
)
for dir in "${BUILD_DIRS[@]}"; do
@ -65,6 +68,18 @@ fi
# 加载环境变量
source .env
# 检查并创建 Docker 网络
echo "检查 Docker 网络..."
NETWORK_NAME="argus-debug-net"
if docker network inspect "$NETWORK_NAME" >/dev/null 2>&1; then
echo "网络 $NETWORK_NAME 已存在"
else
echo "创建网络 $NETWORK_NAME..."
docker network create --driver bridge --subnet 172.30.0.0/16 "$NETWORK_NAME"
echo "网络创建成功"
fi
echo ""
echo "1. 初始化目录结构..."
bash "$SCRIPT_DIR/init-directories.sh"
@ -72,8 +87,8 @@ echo ""
echo "2. 准备 Docker 镜像..."
# 检查镜像是否存在
IMAGE_CACHE_DIR="./images-cache"
IMAGES=("argus-metric-ftp:latest" "argus-metric-prometheus:latest" "argus-metric-grafana:latest")
IMAGE_CACHE_DIR="$TEST_DIR/images-cache"
IMAGES=("argus-metric-ftp:latest" "argus-metric-prometheus:latest" "argus-metric-grafana:latest" "argus-metric-test-node:latest" "argus-metric-test-gpu-node:latest")
all_images_exist=true
for image in "${IMAGES[@]}"; do
@ -83,7 +98,12 @@ for image in "${IMAGES[@]}"; do
fi
done
if $all_images_exist; then
if $FORCE_REBUILD; then
echo "强制重新构建镜像(--rebuild 模式)..."
cd "$TEST_DIR"
docker compose build --no-cache
echo "镜像重新构建完成"
elif $all_images_exist; then
echo "所有镜像已存在,跳过构建"
else
echo "检测到缺失镜像,尝试从缓存加载..."
@ -104,6 +124,12 @@ else
"argus-metric-grafana:latest")
cache_file="${IMAGE_CACHE_DIR}/argus-grafana.tar"
;;
"argus-metric-test-node:latest")
cache_file="${IMAGE_CACHE_DIR}/argus-test-node.tar"
;;
"argus-metric-test-gpu-node:latest")
cache_file="${IMAGE_CACHE_DIR}/argus-test-gpu-node.tar"
;;
esac
if [ -f "$cache_file" ]; then
@ -128,8 +154,8 @@ else
echo ""
echo "部分镜像缺失,开始构建..."
echo "工作目录: $(pwd)"
cd "$SCRIPT_DIR"
$DOCKER_COMPOSE build
cd "$TEST_DIR"
docker compose build --no-cache
# 询问是否保存镜像
echo ""
@ -149,6 +175,12 @@ else
"argus-metric-grafana:latest")
docker save -o "${IMAGE_CACHE_DIR}/argus-grafana.tar" "$image" && echo " 已保存: argus-grafana.tar"
;;
"argus-metric-test-node:latest")
docker save -o "${IMAGE_CACHE_DIR}/argus-test-node.tar" "$image" && echo " 已保存: argus-test-node.tar"
;;
"argus-metric-test-gpu-node:latest")
docker save -o "${IMAGE_CACHE_DIR}/argus-test-gpu-node.tar" "$image" && echo " 已保存: argus-test-gpu-node.tar"
;;
esac
done
echo "镜像已保存到: $IMAGE_CACHE_DIR/"
@ -160,40 +192,12 @@ else
fi
echo ""
echo "3. 启动服务..."
cd "$SCRIPT_DIR"
$DOCKER_COMPOSE up -d
echo "3. 启动基础服务..."
cd "$TEST_DIR"
# Start all services, including the GPU test node (test-gpu-node)
docker compose up -d ftp prometheus grafana test-node test-gpu-node
echo ""
echo "4. 等待服务启动..."
sleep 5
echo ""
echo "5. 检查服务状态..."
cd "$SCRIPT_DIR"
$DOCKER_COMPOSE ps
echo ""
echo "=========================================="
echo " 服务启动完成!"
echo "=========================================="
echo ""
echo "服务访问地址:"
echo " - FTP: ftp://localhost:${FTP_PORT:-21}"
echo " 用户名: ftpuser"
echo " 密码: ${FTP_PASSWORD:-ZGClab1234!}"
echo ""
echo " - Prometheus: http://localhost:${PROMETHEUS_PORT:-9090}"
echo ""
echo " - Grafana: http://localhost:${GRAFANA_PORT:-3000}"
echo " 用户名: admin"
echo " 密码: admin"
echo ""
echo "常用命令:"
echo " 查看日志: $DOCKER_COMPOSE logs -f [service]"
echo " 停止服务: $DOCKER_COMPOSE stop"
echo " 重启服务: $DOCKER_COMPOSE restart"
echo " 停止并删除: $DOCKER_COMPOSE down"
echo " 停止并删除卷: $DOCKER_COMPOSE down -v"
echo ""

View File

@ -0,0 +1,50 @@
#!/bin/bash
# Stop the Argus Metrics compose stack; interactively offers to remove the
# containers and, after that, the data volumes.
set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
cd "$TEST_DIR"

# Compose V2 (`docker compose`) is required
docker compose version &> /dev/null 2>&1 || {
    echo "错误: 未找到 docker compose 命令,请确保 Docker Compose V2 已安装"
    exit 1
}

BANNER="=========================================="
echo "$BANNER"
echo " 停止 Argus Metrics 服务"
echo "$BANNER"
echo ""
echo "使用: docker compose"
echo "Compose 文件: $TEST_DIR/docker-compose.yml"
echo ""

# Nothing to do unless compose reports at least one container id
if [ -z "$(docker compose ps -q)" ]; then
    echo "没有运行的服务"
else
    echo "停止所有服务..."
    docker compose stop
    echo ""
    read -p "是否要删除容器? (y/N): " -n 1 -r
    echo
    case "$REPLY" in
        [Yy])
            docker compose down
            echo "容器已删除"
            read -p "是否要删除数据卷? (y/N): " -n 1 -r
            echo
            case "$REPLY" in
                [Yy])
                    docker compose down -v
                    echo "数据卷已删除"
                    ;;
            esac
            ;;
    esac
fi

echo ""
echo "完成!"

View File

@ -0,0 +1,85 @@
#!/bin/bash
# Image load script.
# Loads every *.tar Docker image archive found under the input directory.
#
# Usage: <script> [INPUT_DIR]     (default: <tests dir>/images-cache)
# Exits 1 when the directory is missing, holds no .tar file, or any load fails.
set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
INPUT_DIR="${1:-$TEST_DIR/images-cache}"

echo "=========================================="
echo " Docker 镜像加载脚本"
echo "=========================================="
echo ""
echo "输入目录: $INPUT_DIR"
echo ""

# The input directory must exist
if [ ! -d "$INPUT_DIR" ]; then
    echo "错误: 目录不存在: $INPUT_DIR"
    exit 1
fi

total=0
success=0
failed=0

# Collect the archives line by line: unlike tar_files=($(find ...)), this keeps
# file names containing spaces or glob characters intact.
tar_files=()
while IFS= read -r found_file; do
    tar_files+=("$found_file")
done < <(find "$INPUT_DIR" -name "*.tar" -type f 2>/dev/null | sort)

if [ ${#tar_files[@]} -eq 0 ]; then
    echo "错误: 在目录 $INPUT_DIR 中未找到任何 .tar 文件"
    exit 1
fi

echo "找到 ${#tar_files[@]} 个镜像文件:"
for tar_file in "${tar_files[@]}"; do
    echo " - $(basename "$tar_file")"
done
echo ""

for tar_file in "${tar_files[@]}"; do
    total=$((total + 1))
    tar_filename=$(basename "$tar_file")
    echo "[$total] 处理: $tar_filename"
    # Always load; no check whether the image already exists locally
    echo " 加载镜像..."
    if docker load -i "$tar_file"; then
        echo " 加载成功: $tar_filename"
        success=$((success + 1))
    else
        echo " 加载失败: $tar_filename"
        failed=$((failed + 1))
    fi
    echo ""
done

echo "=========================================="
echo " 加载完成"
echo "=========================================="
echo ""
echo "统计:"
echo " 总计: $total"
echo " 成功: $success"
echo " 失败: $failed"
echo ""

# Show all images currently known to Docker
echo "当前所有镜像:"
docker images
echo ""

if [ $failed -gt 0 ]; then
    echo "部分镜像加载失败,请检查!"
    exit 1
fi
if [ $success -gt 0 ]; then
    echo "镜像加载成功!"
fi

View File

@ -0,0 +1,94 @@
#!/bin/bash
# Image save script.
# Saves the Argus metric Docker images to tar archives for offline deployment.
#
# Usage: <script> [OUTPUT_DIR]    (default: <tests dir>/images-cache)
# Exits 1 when any image is missing or fails to save.
set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
OUTPUT_DIR="${1:-$TEST_DIR/images-cache}"

echo "=========================================="
echo " Docker 镜像保存脚本"
echo "=========================================="
echo ""
echo "输出目录: $OUTPUT_DIR"
echo ""

# Create the output directory
mkdir -p "$OUTPUT_DIR"

# Image name -> archive file name (keep in sync with docker-compose.yml)
declare -A IMAGES=(
    ["argus-metric-ftp:latest"]="argus-ftp.tar"
    ["argus-metric-prometheus:latest"]="argus-prometheus.tar"
    ["argus-metric-grafana:latest"]="argus-grafana.tar"
    ["argus-metric-test-node:latest"]="argus-test-node.tar"
    ["argus-metric-test-gpu-node:latest"]="argus-test-gpu-node.tar"
)

total=0
success=0
failed=0

for image in "${!IMAGES[@]}"; do
    total=$((total + 1))
    output_file="${OUTPUT_DIR}/${IMAGES[$image]}"
    echo "[$total] 检查镜像: $image"
    # `docker image inspect` is an exact existence test; grepping the
    # `docker images` listing treated the dots in the name as regex wildcards.
    if docker image inspect "$image" >/dev/null 2>&1; then
        echo " ✓ 镜像存在,开始保存..."
        if docker save -o "$output_file" "$image"; then
            file_size=$(ls -lh "$output_file" | awk '{print $5}')
            echo " ✓ 保存成功: ${IMAGES[$image]} ($file_size)"
            success=$((success + 1))
        else
            echo " ✗ 保存失败: $image"
            failed=$((failed + 1))
        fi
    else
        echo " ✗ 镜像不存在,请先构建镜像"
        failed=$((failed + 1))
    fi
    echo ""
done

echo "=========================================="
echo " 保存完成"
echo "=========================================="
echo ""
echo "统计:"
echo " 总计: $total"
echo " 成功: $success"
echo " 失败: $failed"
echo ""
echo "输出目录: $OUTPUT_DIR"
echo ""

if [ $success -gt 0 ]; then
    echo "已保存的文件:"
    ls -lh "$OUTPUT_DIR"/*.tar 2>/dev/null || true
    echo ""
    echo "文件列表:"
    for image in "${!IMAGES[@]}"; do
        output_file="${OUTPUT_DIR}/${IMAGES[$image]}"
        if [ -f "$output_file" ]; then
            file_size=$(ls -lh "$output_file" | awk '{print $5}')
            echo " - ${IMAGES[$image]} ($file_size)"
        fi
    done
fi

echo ""
echo "使用说明:"
echo "1. 将 images-cache 目录复制到目标服务器的 ~/argus/src/metric/tests/ 下"
echo "2. 在目标服务器运行: bash scripts/common/start-all.sh"
echo ""

# Report failure only after the summary has been printed
if [ $failed -gt 0 ]; then
    exit 1
fi

View File

@ -1,51 +0,0 @@
#!/bin/bash
# Stop the Argus Metrics compose stack; interactively offers to remove the
# containers and, after that, the data volumes.
set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

# Pick the compose front end: legacy `docker-compose` first, then the
# `docker compose` plugin.
COMPOSE_FILE="$SCRIPT_DIR/docker-compose.yml"
if command -v docker-compose &> /dev/null; then
    compose_bin="docker-compose"
elif docker compose version &> /dev/null 2>&1; then
    compose_bin="docker compose"
else
    echo "错误: 未找到 docker-compose 或 docker compose 命令"
    exit 1
fi
DOCKER_COMPOSE="$compose_bin -f $COMPOSE_FILE"

BANNER="=========================================="
echo "$BANNER"
echo " 停止 Argus Metrics 服务"
echo "$BANNER"
echo ""

# Nothing to do unless compose reports at least one container id
if [ -z "$($DOCKER_COMPOSE ps -q)" ]; then
    echo "没有运行的服务"
else
    echo "停止所有服务..."
    $DOCKER_COMPOSE stop
    echo ""
    read -p "是否要删除容器? (y/N): " -n 1 -r
    echo
    case "$REPLY" in
        [Yy])
            $DOCKER_COMPOSE down
            echo "容器已删除"
            read -p "是否要删除数据卷? (y/N): " -n 1 -r
            echo
            case "$REPLY" in
                [Yy])
                    $DOCKER_COMPOSE down -v
                    echo "数据卷已删除"
                    ;;
            esac
            ;;
    esac
fi

echo ""
echo "完成!"

View File

@ -0,0 +1,12 @@
# Generated by 01_bootstrap.sh
SYS_DEBUG_PRIVATE_CORE=/absolute/path/to/private
SYS_DEBUG_PRIVATE_NODEA=/absolute/path/to/private-nodea
SYS_DEBUG_PRIVATE_NODEB=/absolute/path/to/private-nodeb
SYS_DEBUG_TMP_DIR=/absolute/path/to/tmp
SYS_DEBUG_NETWORK_NAME=argus-debug-net
SYS_DEBUG_NETWORK_SUBNET=172.30.0.0/16
SYS_DEBUG_NETWORK_GATEWAY=172.30.0.1
SYS_DEBUG_PROJECT_NAME=argus-debug
SYS_DEBUG_CONTAINER_PREFIX=argus-debug
ARGUS_BUILD_UID=2133
ARGUS_BUILD_GID=2015

68
src/sys/debug/README.md Normal file
View File

@ -0,0 +1,68 @@
# ARGUS 系统调试部署模式
该目录提供基于系统级 E2E 测试构建的调试部署流程,便于本地快速复现与排查问题。核心特性:
- 独立 docker 网络 `argus-debug-net`(默认子网 `172.30.0.0/16`),避免与 `src/sys/tests` 冲突。
- 私有数据目录可通过参数自定义,例如 `--private-root /tmp/argus-debug`。
- 默认保留调试过程生成的文件,避免 `down`/`bootstrap` 自动删除。
## 快速开始
```bash
cd src/sys/debug
# 仅首次需要,创建 external 网络
./scripts/network-create.sh
# 初始化目录/构建 agent/写入 .env
./scripts/01_bootstrap.sh --private-root /tmp/argus-debug
# 启动调试栈
./scripts/02_up.sh
# 根据需要执行验证脚本(03–08)
./scripts/03_wait_ready.sh
...
# 调试结束停止服务
./scripts/09_down.sh
# 若需移除网络或数据
./scripts/network-destroy.sh
./scripts/clean-data.sh
```
> **提示**:调试与测试栈不能同时运行,应保持 `src/sys/tests` 中的 `argus-sys` 栈已停止。
## 参数与环境变量
- `--private-root <path>`:同时指定核心服务与两个节点的私有目录根,脚本自动派生 `private`、`private-nodea`、`private-nodeb`。
- `--private-core <path>`、`--private-nodea <path>`、`--private-nodeb <path>`:分别覆盖单独目录。
- 环境变量可覆盖 `.env` 中写入的值,例如 `export SYS_DEBUG_NETWORK_NAME=my-debug-net`。
- `.env` 文件字段:
- `SYS_DEBUG_PRIVATE_CORE`
- `SYS_DEBUG_PRIVATE_NODEA`
- `SYS_DEBUG_PRIVATE_NODEB`
- `SYS_DEBUG_TMP_DIR`
- `SYS_DEBUG_NETWORK_NAME`
- `SYS_DEBUG_NETWORK_SUBNET`
- `SYS_DEBUG_NETWORK_GATEWAY`
- `SYS_DEBUG_PROJECT_NAME`
- `SYS_DEBUG_CONTAINER_PREFIX`
- `ARGUS_BUILD_UID` / `ARGUS_BUILD_GID`
## 脚本说明
- `scripts/common.sh`:通用函数与环境加载。
- `scripts/network-create.sh` / `network-destroy.sh`:管理 external 网络。
- `scripts/00_debug_all.sh`:顺序执行 01–08(默认不执行 09)。
- `scripts/clean-data.sh`:选择性清理宿主机私有数据。
- `scripts/03_wait_ready.sh`:除了等待各服务就绪,还会在 Elasticsearch 就绪后自动将磁盘水位阈值放宽(97%/98%/99%),避免在磁盘紧张的调试环境中分片分配失败。
- `scripts/08_restart_agent_reregister.sh`:将 node-b 切换到 `SYS_DEBUG_NODEB_FIXED_IP`(默认 `172.30.0.200`),如果目标地址与当前 IP 相同脚本会报错提醒重新选择地址。
- 其它 `01–09` 与测试目录对应,但针对参数化路径及网络做了调整。
## 注意事项
- 若宿主机未安装 Docker,脚本将提示错误并退出。
- 当指定的私有目录已存在数据时,脚本不会清理,请确认内容安全后再复用。
- 与测试环境共用镜像:请提前执行仓库根目录的 `./build/build_images.sh`

View File

@ -0,0 +1,147 @@
# Compose definition of the debug stack: bind, master, es, kibana and two agent
# nodes (node-a / node-b), all attached with fixed addresses to one external network.
version: "3.8"
networks:
argus-debug-net:
# The network is not created by compose; it must already exist under this name.
external: true
name: ${SYS_DEBUG_NETWORK_NAME:-argus-debug-net}
services:
# bind: its address is the one node-a / node-b use as their DNS server.
bind:
image: ${BIND_IMAGE_TAG:-argus-bind9:latest}
container_name: ${SYS_DEBUG_CONTAINER_PREFIX:-argus-debug}-bind
networks:
argus-debug-net:
ipv4_address: ${SYS_DEBUG_BIND_IP:-172.30.0.2}
volumes:
# The whole core private directory is mounted at /private.
- ${SYS_DEBUG_PRIVATE_CORE}:/private
restart: unless-stopped
# master: container port 3000 is published on host port 32300.
master:
image: ${MASTER_IMAGE_TAG:-argus-master:latest}
container_name: ${SYS_DEBUG_CONTAINER_PREFIX:-argus-debug}-master
depends_on:
- bind
environment:
- OFFLINE_THRESHOLD_SECONDS=6
- ONLINE_THRESHOLD_SECONDS=2
- SCHEDULER_INTERVAL_SECONDS=1
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
ports:
- "32300:3000"
volumes:
- ${SYS_DEBUG_PRIVATE_CORE}/argus/master:/private/argus/master
- ${SYS_DEBUG_PRIVATE_CORE}/argus/metric/prometheus:/private/argus/metric/prometheus
- ${SYS_DEBUG_PRIVATE_CORE}/argus/etc:/private/argus/etc
networks:
argus-debug-net:
ipv4_address: ${SYS_DEBUG_MASTER_IP:-172.30.0.10}
restart: unless-stopped
# es: single-node Elasticsearch with security disabled and a 512 MB heap, published on 9200.
es:
image: ${ES_IMAGE_TAG:-argus-elasticsearch:latest}
container_name: ${SYS_DEBUG_CONTAINER_PREFIX:-argus-debug}-es
environment:
- discovery.type=single-node
- xpack.security.enabled=false
- ES_JAVA_OPTS=-Xms512m -Xmx512m
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
volumes:
- ${SYS_DEBUG_PRIVATE_CORE}/argus/log/elasticsearch:/private/argus/log/elasticsearch
- ${SYS_DEBUG_PRIVATE_CORE}/argus/etc:/private/argus/etc
ports:
- "9200:9200"
networks:
argus-debug-net:
ipv4_address: ${SYS_DEBUG_ES_IP:-172.30.0.20}
restart: unless-stopped
# kibana: published on 5601; reaches Elasticsearch through the es.log.argus.com name.
kibana:
image: ${KIBANA_IMAGE_TAG:-argus-kibana:latest}
container_name: ${SYS_DEBUG_CONTAINER_PREFIX:-argus-debug}-kibana
environment:
- ELASTICSEARCH_HOSTS=http://es.log.argus.com:9200
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
volumes:
- ${SYS_DEBUG_PRIVATE_CORE}/argus/log/kibana:/private/argus/log/kibana
- ${SYS_DEBUG_PRIVATE_CORE}/argus/etc:/private/argus/etc
depends_on:
- es
ports:
- "5601:5601"
networks:
argus-debug-net:
ipv4_address: ${SYS_DEBUG_KIBANA_IP:-172.30.0.30}
restart: unless-stopped
# node-a: plain ubuntu:22.04 container; the agent binary, entrypoint script and
# fluent-bit assets are bind-mounted read-only from the repository.
node-a:
image: ubuntu:22.04
container_name: ${SYS_DEBUG_CONTAINER_PREFIX:-argus-debug}-node-a
hostname: ${SYS_DEBUG_NODEA_HOST:-dev-yyrshare-nbnyx10-cp2f-pod-0}
depends_on:
- master
- bind
- es
environment:
- MASTER_ENDPOINT=http://master.argus.com:3000
- REPORT_INTERVAL_SECONDS=2
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
- ES_HOST=es
- ES_PORT=9200
- CLUSTER=local
- RACK=dev
volumes:
# Per-host agent directory; the host name is part of the path on both sides.
- ${SYS_DEBUG_PRIVATE_NODEA}/argus/agent/${SYS_DEBUG_NODEA_HOST:-dev-yyrshare-nbnyx10-cp2f-pod-0}:/private/argus/agent/${SYS_DEBUG_NODEA_HOST:-dev-yyrshare-nbnyx10-cp2f-pod-0}
- ../../agent/dist/argus-agent:/usr/local/bin/argus-agent:ro
- ../tests/scripts/node_entrypoint.sh:/usr/local/bin/node-entrypoint.sh:ro
- ../../log/fluent-bit/build/start-fluent-bit.sh:/assets/start-fluent-bit.sh:ro
- ../../log/fluent-bit/build/etc:/assets/fluent-bit/etc:ro
- ../../log/fluent-bit/build/packages:/assets/fluent-bit/packages:ro
entrypoint:
- /usr/local/bin/node-entrypoint.sh
# Name resolution goes through the bind container.
dns:
- ${SYS_DEBUG_BIND_IP:-172.30.0.2}
ports:
- "2020:2020"
networks:
argus-debug-net:
ipv4_address: ${SYS_DEBUG_NODEA_IP:-172.30.0.101}
restart: unless-stopped
# node-b: same setup as node-a with its own hostname, private directory and
# address; container port 2020 is published on host port 2021.
node-b:
image: ubuntu:22.04
container_name: ${SYS_DEBUG_CONTAINER_PREFIX:-argus-debug}-node-b
hostname: ${SYS_DEBUG_NODEB_HOST:-dev-yyrshare-uuuu10-ep2f-pod-0}
depends_on:
- master
- bind
- es
environment:
- MASTER_ENDPOINT=http://master.argus.com:3000
- REPORT_INTERVAL_SECONDS=2
- ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
- ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
- ES_HOST=es
- ES_PORT=9200
- CLUSTER=local
- RACK=dev
volumes:
- ${SYS_DEBUG_PRIVATE_NODEB}/argus/agent/${SYS_DEBUG_NODEB_HOST:-dev-yyrshare-uuuu10-ep2f-pod-0}:/private/argus/agent/${SYS_DEBUG_NODEB_HOST:-dev-yyrshare-uuuu10-ep2f-pod-0}
- ../../agent/dist/argus-agent:/usr/local/bin/argus-agent:ro
- ../tests/scripts/node_entrypoint.sh:/usr/local/bin/node-entrypoint.sh:ro
- ../../log/fluent-bit/build/start-fluent-bit.sh:/assets/start-fluent-bit.sh:ro
- ../../log/fluent-bit/build/etc:/assets/fluent-bit/etc:ro
- ../../log/fluent-bit/build/packages:/assets/fluent-bit/packages:ro
entrypoint:
- /usr/local/bin/node-entrypoint.sh
dns:
- ${SYS_DEBUG_BIND_IP:-172.30.0.2}
ports:
- "2021:2020"
networks:
argus-debug-net:
ipv4_address: ${SYS_DEBUG_NODEB_IP:-172.30.0.102}
restart: unless-stopped

View File

@ -0,0 +1,24 @@
#!/usr/bin/env bash
# Run debug steps 01-08 in order from this script's directory. With `set -e`
# the run stops at the first failing step. Step 09 (down) is left to the user.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

STEPS=(
    "01_bootstrap.sh"
    "02_up.sh"
    "03_wait_ready.sh"
    "04_verify_dns_routing.sh"
    "05_agent_register.sh"
    "06_write_health_and_assert.sh"
    "07_logs_send_and_assert.sh"
    "08_restart_agent_reregister.sh"
)

for step in "${STEPS[@]}"; do
    printf '[SYS-DEBUG] Running %s\n' "$step"
    "$SCRIPT_DIR/$step"
    # Completion line followed by one blank separator line
    printf '[SYS-DEBUG] %s completed\n\n' "$step"
done

printf '%s\n' "[SYS-DEBUG] Complete. Run scripts/09_down.sh when finished (data retained)."

View File

@ -0,0 +1,210 @@
#!/usr/bin/env bash
# Bootstrap for the debug stack: parses path/network options, creates the private
# directories, checks that the required images exist, builds the agent binary and
# writes the .env file (plus a .env.lock copy).
set -euo pipefail
# shellcheck source=common.sh
# NOTE(review): the SYS_DEBUG_* values, HOST_A/HOST_B, REPO_ROOT, DEBUG_ROOT, ENV_FILE,
# ARGUS_BUILD_UID/GID and the helpers abs_path/log/require_docker used below are not
# defined in this script; presumably common.sh provides them - confirm.
source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh"
# Working copies of the settings; command-line options below may override them.
PRIVATE_ROOT=""
PRIVATE_CORE="$SYS_DEBUG_PRIVATE_CORE"
PRIVATE_NODEA="$SYS_DEBUG_PRIVATE_NODEA"
PRIVATE_NODEB="$SYS_DEBUG_PRIVATE_NODEB"
TMP_DIR_VAL="$SYS_DEBUG_TMP_DIR"
NETWORK_NAME="$SYS_DEBUG_NETWORK_NAME"
NETWORK_SUBNET="$SYS_DEBUG_NETWORK_SUBNET"
NETWORK_GATEWAY="$SYS_DEBUG_NETWORK_GATEWAY"
PROJECT_NAME="$SYS_DEBUG_PROJECT_NAME"
CONTAINER_PREFIX="$SYS_DEBUG_CONTAINER_PREFIX"
NODEB_FIXED_IP=${SYS_DEBUG_NODEB_FIXED_IP:-172.30.0.200}
# Print the command-line synopsis.
usage() {
cat <<EOF
Usage: ${0##*/} [--private-root PATH] [--private-core PATH] \
[--private-nodea PATH] [--private-nodeb PATH] \
[--tmp-dir PATH] [--network-name NAME] \
[--network-subnet CIDR] [--network-gateway IP]
Prepare directories, build agent binary, and write .env for debug stack.
EOF
}
# Option parsing: every option is accepted both as "--opt VALUE" and "--opt=VALUE".
# The two-word form shifts once inside its branch; the shared `shift` after `esac`
# consumes the remaining word.
while [[ $# -gt 0 ]]; do
case "$1" in
--private-root)
shift; [[ $# -gt 0 ]] || { echo "--private-root requires value" >&2; exit 1; }
PRIVATE_ROOT="$1"
;;
--private-root=*)
PRIVATE_ROOT="${1#*=}"
;;
--private-core)
shift; [[ $# -gt 0 ]] || { echo "--private-core requires value" >&2; exit 1; }
PRIVATE_CORE="$1"
;;
--private-core=*)
PRIVATE_CORE="${1#*=}"
;;
--private-nodea)
shift; [[ $# -gt 0 ]] || { echo "--private-nodea requires value" >&2; exit 1; }
PRIVATE_NODEA="$1"
;;
--private-nodea=*)
PRIVATE_NODEA="${1#*=}"
;;
--private-nodeb)
shift; [[ $# -gt 0 ]] || { echo "--private-nodeb requires value" >&2; exit 1; }
PRIVATE_NODEB="$1"
;;
--private-nodeb=*)
PRIVATE_NODEB="${1#*=}"
;;
--tmp-dir)
shift; [[ $# -gt 0 ]] || { echo "--tmp-dir requires value" >&2; exit 1; }
TMP_DIR_VAL="$1"
;;
--tmp-dir=*)
TMP_DIR_VAL="${1#*=}"
;;
--network-name)
shift; [[ $# -gt 0 ]] || { echo "--network-name requires value" >&2; exit 1; }
NETWORK_NAME="$1"
;;
--network-name=*)
NETWORK_NAME="${1#*=}"
;;
--network-subnet)
shift; [[ $# -gt 0 ]] || { echo "--network-subnet requires value" >&2; exit 1; }
NETWORK_SUBNET="$1"
;;
--network-subnet=*)
NETWORK_SUBNET="${1#*=}"
;;
--network-gateway)
shift; [[ $# -gt 0 ]] || { echo "--network-gateway requires value" >&2; exit 1; }
NETWORK_GATEWAY="$1"
;;
--network-gateway=*)
NETWORK_GATEWAY="${1#*=}"
;;
-h|--help)
usage
exit 0
;;
*)
echo "Unknown argument: $1" >&2
usage >&2
exit 1
;;
esac
shift
done
# --private-root is applied after parsing, so the three derived paths replace any
# value given through --private-core / --private-nodea / --private-nodeb.
if [[ -n "$PRIVATE_ROOT" ]]; then
PRIVATE_CORE="$PRIVATE_ROOT/private"
PRIVATE_NODEA="$PRIVATE_ROOT/private-nodea"
PRIVATE_NODEB="$PRIVATE_ROOT/private-nodeb"
fi
# Normalise all paths through abs_path before they are used or written to .env.
PRIVATE_CORE=$(abs_path "$PRIVATE_CORE")
PRIVATE_NODEA=$(abs_path "$PRIVATE_NODEA")
PRIVATE_NODEB=$(abs_path "$PRIVATE_NODEB")
TMP_DIR_VAL=$(abs_path "$TMP_DIR_VAL")
log "Preparing directories under $PRIVATE_CORE"
mkdir -p \
"$PRIVATE_CORE/argus/etc" \
"$PRIVATE_CORE/argus/bind" \
"$PRIVATE_CORE/argus/master" \
"$PRIVATE_CORE/argus/metric/prometheus" \
"$PRIVATE_CORE/argus/log/elasticsearch" \
"$PRIVATE_CORE/argus/log/kibana" \
"$PRIVATE_NODEA/argus/agent/$HOST_A/health" \
"$PRIVATE_NODEB/argus/agent/$HOST_B/health" \
"$TMP_DIR_VAL"
log "Aligning ownership for core directories"
# Best effort: chown errors are silenced and ignored so the bootstrap continues.
chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" \
"$PRIVATE_CORE/argus/log/elasticsearch" \
"$PRIVATE_CORE/argus/log/kibana" \
"$PRIVATE_CORE/argus/etc" 2>/dev/null || true
log "Distributing update-dns.sh"
# Copy update-dns.sh into the shared etc directory; a missing source only warns.
BIND_UPDATE_SRC="$REPO_ROOT/src/bind/build/update-dns.sh"
BIND_UPDATE_DEST="$PRIVATE_CORE/argus/etc/update-dns.sh"
if [[ -f "$BIND_UPDATE_SRC" ]]; then
cp "$BIND_UPDATE_SRC" "$BIND_UPDATE_DEST"
chmod +x "$BIND_UPDATE_DEST"
else
echo "[WARN] Missing $BIND_UPDATE_SRC" >&2
fi
require_docker
# Exit with an error unless the given image exists locally.
#   $1 - image reference (name:tag)
ensure_image() {
local image="$1"
if ! docker image inspect "$image" >/dev/null 2>&1; then
echo "[ERR] Missing image: $image. Run ./build/build_images.sh" >&2
exit 1
fi
}
log "Ensuring required images exist"
ensure_image "${ES_IMAGE_TAG:-argus-elasticsearch:latest}"
ensure_image "${KIBANA_IMAGE_TAG:-argus-kibana:latest}"
ensure_image "${BIND_IMAGE_TAG:-argus-bind9:latest}"
ensure_image "${MASTER_IMAGE_TAG:-argus-master:latest}"
log "Building agent binary"
pushd "$REPO_ROOT/src/agent" >/dev/null
./scripts/build_binary.sh
popd >/dev/null
# The build must have produced an executable agent binary; its path is recorded
# in the tmp directory.
AGENT_BIN="$REPO_ROOT/src/agent/dist/argus-agent"
if [[ ! -x "$AGENT_BIN" ]]; then
echo "[ERR] Agent binary not found at $AGENT_BIN" >&2
exit 1
fi
echo "$AGENT_BIN" > "$TMP_DIR_VAL/agent_binary_path"
log "Preparing environment file contents"
# The new .env content is written to a temp file first so it can be compared with
# an existing .env before anything is replaced.
tmp_env="$(mktemp)"
cat > "$tmp_env" <<EOF
SYS_DEBUG_PRIVATE_CORE=$PRIVATE_CORE
SYS_DEBUG_PRIVATE_NODEA=$PRIVATE_NODEA
SYS_DEBUG_PRIVATE_NODEB=$PRIVATE_NODEB
SYS_DEBUG_TMP_DIR=$TMP_DIR_VAL
SYS_DEBUG_NETWORK_NAME=$NETWORK_NAME
SYS_DEBUG_NETWORK_SUBNET=$NETWORK_SUBNET
SYS_DEBUG_NETWORK_GATEWAY=$NETWORK_GATEWAY
SYS_DEBUG_PROJECT_NAME=$PROJECT_NAME
SYS_DEBUG_CONTAINER_PREFIX=$CONTAINER_PREFIX
SYS_DEBUG_NODEA_HOST=$HOST_A
SYS_DEBUG_NODEB_HOST=$HOST_B
SYS_DEBUG_BIND_IP=${SYS_DEBUG_BIND_IP:-172.30.0.2}
SYS_DEBUG_MASTER_IP=${SYS_DEBUG_MASTER_IP:-172.30.0.10}
SYS_DEBUG_ES_IP=${SYS_DEBUG_ES_IP:-172.30.0.20}
SYS_DEBUG_KIBANA_IP=${SYS_DEBUG_KIBANA_IP:-172.30.0.30}
SYS_DEBUG_NODEA_IP=${SYS_DEBUG_NODEA_IP:-172.30.0.101}
SYS_DEBUG_NODEB_IP=${SYS_DEBUG_NODEB_IP:-172.30.0.102}
SYS_DEBUG_NODEB_FIXED_IP=$NODEB_FIXED_IP
ARGUS_BUILD_UID=$ARGUS_BUILD_UID
ARGUS_BUILD_GID=$ARGUS_BUILD_GID
EOF
# Three cases:
#   .env identical -> keep it, create .env.lock only if it is missing
#   .env differs   -> keep the old file as .env.bak, install the new one, refresh .env.lock
#   no .env        -> install the new file and create .env.lock
if [[ -f "$ENV_FILE" ]]; then
if cmp -s "$tmp_env" "$ENV_FILE"; then
log ".env already up-to-date"
rm -f "$tmp_env"
if [[ ! -f "$DEBUG_ROOT/.env.lock" ]]; then
cp "$ENV_FILE" "$DEBUG_ROOT/.env.lock"
fi
else
mv "$ENV_FILE" "$ENV_FILE.bak"
mv "$tmp_env" "$ENV_FILE"
cp "$ENV_FILE" "$DEBUG_ROOT/.env.lock"
log "Bootstrap updated .env (previous saved at ${ENV_FILE}.bak)"
fi
else
mv "$tmp_env" "$ENV_FILE"
cp "$ENV_FILE" "$DEBUG_ROOT/.env.lock"
log "Bootstrap created .env at $ENV_FILE"
fi

19
src/sys/debug/scripts/02_up.sh Executable file
View File

@ -0,0 +1,19 @@
#!/usr/bin/env bash
# Start the debug compose stack on the pre-created debug network.
set -euo pipefail

# shellcheck source=common.sh
source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh"

ensure_env_file
ensure_paths_defined
require_docker

# This script never creates the network; it only verifies that it exists.
docker network inspect "$SYS_DEBUG_NETWORK_NAME" >/dev/null 2>&1 || {
  echo "[ERR] Network $SYS_DEBUG_NETWORK_NAME not found. Run scripts/network-create.sh first." >&2
  exit 1
}

log "Starting debug stack on project $SYS_DEBUG_PROJECT_NAME"
compose up -d
log "Services started: master:32300 es:9200 kibana:5601 node-a:2020 node-b:2021"

View File

@ -0,0 +1,84 @@
#!/usr/bin/env bash
set -euo pipefail
# shellcheck source=common.sh
source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh"
ensure_env_file
ensure_paths_defined
# Print the container id(s) that compose reports for the given service name.
# Callers treat empty output as "container not found".
service_id() {
  local service="$1"
  compose ps -q "$service"
}
# Poll a URL until curl reports success.
# Arguments:
#   $1 - URL to probe
#   $2 - maximum number of attempts (default 120)
#   $3 - seconds to sleep between attempts (default 5)
# Returns 0 as soon as a probe succeeds, 1 once the attempts are exhausted.
wait_http() {
  local url="$1"
  local attempts="${2:-120}"
  local interval="${3:-5}"
  local i
  # An arithmetic for-loop keeps the increment out of statement position, where
  # a bare (( i++ )) evaluating to 0 would trip `set -e`.
  for (( i = 1; i <= attempts; i++ )); do
    if curl -fsS "$url" >/dev/null 2>&1; then
      return 0
    fi
    echo "[..] waiting $url ($i/$attempts)"
    sleep "$interval"
  done
  echo "[ERR] Timeout waiting for $url" >&2
  return 1
}
log "Waiting for ES/Kibana/Master/Fluent Bit/Bind"

# Poll the cluster health endpoint until it reports at least yellow,
# giving up after $max probes spaced 5 seconds apart.
max=120
attempt=1
until curl -fsS "http://localhost:9200/_cluster/health?wait_for_status=yellow&timeout=1s" >/dev/null 2>&1; do
  echo "[..] waiting ES ($attempt/$max)"
  sleep 5
  attempt=$((attempt + 1))
  if (( attempt > max )); then
    echo "[ERR] ES not ready" >&2
    exit 1
  fi
done

log "Applying relaxed ES disk watermarks for debug"
# Best effort: a failure here is only reported as a warning.
if ! curl -fsS -XPUT "http://localhost:9200/_cluster/settings" \
  -H 'Content-Type: application/json' \
  -d '{
"transient": {
"cluster.routing.allocation.disk.watermark.low": "99%",
"cluster.routing.allocation.disk.watermark.high": "99%",
"cluster.routing.allocation.disk.watermark.flood_stage": "99%"
}
}' >/dev/null; then
  echo "[WARN] Failed to adjust ES watermarks"
fi
log "Waiting for Kibana to be available (HTTP 200)"
# Kibana counts as ready when /api/status answers 200 AND the body reports
# "level":"available".
kb_attempt=1; kb_max=180
while (( kb_attempt <= kb_max )); do
  # One request yields both pieces: --write-out appends the status code on its
  # own final line, so body and code always describe the same response.
  reply=$(curl -sS -w '\n%{http_code}' "http://localhost:5601/api/status" 2>/dev/null || true)
  code="${reply##*$'\n'}"
  body="${reply%$'\n'*}"
  # Normalise anything that is not a three-digit status (e.g. curl missing).
  [[ "$code" =~ ^[0-9]{3}$ ]] || code=000
  # Pattern match in-process: piping a large body into `grep -q` under
  # pipefail can report failure (SIGPIPE on echo) even when the text matches.
  if [[ "$code" == "200" && "$body" == *'"level":"available"'* ]]; then
    log "Kibana available"
    break
  fi
  echo "[..] waiting kibana 200 ($kb_attempt/$kb_max), last_code=$code"
  sleep 5
  kb_attempt=$((kb_attempt + 1))
done
if (( kb_attempt > kb_max )); then
  echo "[ERR] Kibana did not reach HTTP 200" >&2
  exit 1
fi
# HTTP endpoints still to be probed, on host ports 32300, 2020 and 2021.
for endpoint in \
  "http://localhost:32300/readyz" \
  "http://localhost:2020/api/v2/metrics" \
  "http://localhost:2021/api/v2/metrics"; do
  wait_http "$endpoint" 120
done

# Bind has no HTTP probe here; validate its configuration inside the container.
BIND_ID="$(service_id bind)"
if [[ -z "$BIND_ID" ]]; then
  echo "[WARN] bind container id not found" >&2
else
  docker exec "$BIND_ID" named-checkconf >/dev/null
fi

log "All services are ready"

View File

@ -0,0 +1,51 @@
#!/usr/bin/env bash
set -euo pipefail
# shellcheck source=common.sh
source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh"
ensure_env_file
ensure_paths_defined
# Print the container id(s) that compose reports for the given service name.
# Callers treat empty output as "container not found".
service_id() {
  local service="$1"
  compose ps -q "$service"
}
log "Verifying DNS routing via bind"

# The master.argus.com file under the core private directory must exist;
# its content is logged for reference.
MASTER_FILE="$SYS_DEBUG_PRIVATE_CORE/argus/etc/master.argus.com"
[[ -f "$MASTER_FILE" ]] || {
  echo "[ERR] master.argus.com file missing at $MASTER_FILE" >&2
  exit 1
}
MASTER_IP_HOST="$(tr -d '\r\n' < "$MASTER_FILE" || true)"
log "master.argus.com file content: $MASTER_IP_HOST"

# Ask bind itself; a missing bind container only downgrades this to a warning.
BIN_ID="$(service_id bind)"
if [[ -z "$BIN_ID" ]]; then
  echo "[WARN] bind container not found; skip dig" >&2
else
  DIG_IP="$(docker exec "$BIN_ID" dig +short master.argus.com A | tail -n1 || true)"
  log "dig(master.argus.com) from bind container -> $DIG_IP"
  [[ -n "$DIG_IP" ]] || {
    echo "[ERR] bind did not resolve master.argus.com" >&2
    exit 1
  }
fi
# Every agent node must be able to resolve master.argus.com from inside its
# own container.
for node in node-a node-b; do
  CID="$(service_id "$node")"
  if [[ -z "$CID" ]]; then
    echo "[ERR] Container for $node not found" >&2
    exit 1
  fi
  log "Checking resolution inside $node"
  # Query once and reuse the answer: a second lookup could disagree with the
  # first, and its failure would abort the script without any message.
  if ! HOSTS_OUT="$(docker exec "$CID" getent hosts master.argus.com 2>/dev/null)"; then
    echo "[ERR] $node cannot resolve master.argus.com" >&2
    exit 1
  fi
  # The first field of the first line is the resolved address.
  read -r RES _ <<<"$HOSTS_OUT"
  log "$node resolved master.argus.com -> $RES"
done
log "DNS routing verified"

View File

@ -0,0 +1,84 @@
#!/usr/bin/env bash
set -euo pipefail
# shellcheck source=common.sh
source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh"
ensure_env_file
ensure_paths_defined
TMP_DIR_LOCAL="$TMP_DIR"
mkdir -p "$TMP_DIR_LOCAL"
API_BASE="http://localhost:32300/api/v1/master"
log "Waiting for agent nodes to register"
# Look up a node by name in a JSON node list and record its id.
# Arguments:
#   $1 - node name to search for
#   $2 - file that receives the node id
#   $3 - JSON file holding a list of node objects
# Prints the id and writes it to $2 when the node is present. When it is not,
# nothing is printed and any existing $2 is removed, so an id left over from an
# earlier run cannot be mistaken for a fresh registration.
extract_node() {
  local name="$1"
  local output="$2"
  local json_file="$3"
  python3 - "$name" "$output" "$json_file" <<'PY'
import json
import pathlib
import sys

name = sys.argv[1]
out = pathlib.Path(sys.argv[2])
json_file = sys.argv[3]

with open(json_file, "r") as fh:
    data = json.load(fh)

node = next((n for n in data if n.get("name") == name), None)
if node:
    out.write_text(node["id"])
    print(node["id"])
elif out.exists():
    out.unlink()
PY
}
ID_A=""; ID_B=""
# Start clean: id files left by a previous run must not satisfy the -s checks
# below before the nodes have registered this time.
rm -f "$TMP_DIR_LOCAL/node_id_a" "$TMP_DIR_LOCAL/node_id_b"

# Poll the node list (60 tries, 2s apart) until both hosts appear in it.
for _ in {1..60}; do
  sleep 2
  resp=$(curl -fsS "$API_BASE/nodes" 2>/dev/null || true)
  # Only a JSON array is a usable node list. Matching in-process avoids the
  # SIGPIPE/pipefail false negative of `echo | head -c1 | grep` on big replies.
  [[ "$resp" == \[* ]] || continue
  printf '%s\n' "$resp" > "$TMP_DIR_LOCAL/nodes_list.json"
  ID_A=$(extract_node "$HOST_A" "$TMP_DIR_LOCAL/node_id_a" "$TMP_DIR_LOCAL/nodes_list.json" 2>/dev/null || true)
  ID_B=$(extract_node "$HOST_B" "$TMP_DIR_LOCAL/node_id_b" "$TMP_DIR_LOCAL/nodes_list.json" 2>/dev/null || true)
  if [[ -s "$TMP_DIR_LOCAL/node_id_a" && -s "$TMP_DIR_LOCAL/node_id_b" ]]; then
    break
  fi
done

if [[ ! -s "$TMP_DIR_LOCAL/node_id_a" || ! -s "$TMP_DIR_LOCAL/node_id_b" ]]; then
  echo "[ERR] Agents did not register in time" >&2
  exit 1
fi
# Download the master's detail document for one node.
# Arguments: $1 - node id, $2 - file that receives the JSON response.
node_detail() {
  local node_id="$1"
  local target="$2"
  curl -fsS "$API_BASE/nodes/$node_id" -o "$target"
}

node_detail "$(<"$TMP_DIR_LOCAL/node_id_a")" "$TMP_DIR_LOCAL/detail_a.json"
node_detail "$(<"$TMP_DIR_LOCAL/node_id_b")" "$TMP_DIR_LOCAL/detail_b.json"
# Store the IP recorded in a node detail document (meta_data.ip).
# Arguments: $1 - node detail JSON file, $2 - file that receives the IP.
# Exits non-zero with "missing ip" when the document carries no IP.
save_initial_ip() {
  python3 - "$1" "$2" <<'PY'
import json
import pathlib
import sys

with open(sys.argv[1]) as fh:
    node = json.load(fh)

ip = (node.get("meta_data") or {}).get("ip")
if not ip:
    # Explicit exit rather than assert: asserts are stripped under python -O.
    sys.exit("missing ip")
pathlib.Path(sys.argv[2]).write_text(ip)
PY
}

save_initial_ip "$TMP_DIR_LOCAL/detail_a.json" "$TMP_DIR_LOCAL/initial_ip_a"
save_initial_ip "$TMP_DIR_LOCAL/detail_b.json" "$TMP_DIR_LOCAL/initial_ip_b"
# Each node's node.json must be present under its private agent directory.
NODE_JSON_A="$SYS_DEBUG_PRIVATE_NODEA/argus/agent/$HOST_A/node.json"
NODE_JSON_B="$SYS_DEBUG_PRIVATE_NODEB/argus/agent/$HOST_B/node.json"
if [[ ! -f "$NODE_JSON_A" ]]; then
  echo "[ERR] node.json missing for $HOST_A" >&2
  exit 1
fi
if [[ ! -f "$NODE_JSON_B" ]]; then
  echo "[ERR] node.json missing for $HOST_B" >&2
  exit 1
fi

log "Agents registered: $(<"$TMP_DIR_LOCAL/node_id_a") , $(<"$TMP_DIR_LOCAL/node_id_b")"

View File

@ -0,0 +1,78 @@
#!/usr/bin/env bash
set -euo pipefail
# shellcheck source=common.sh
source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh"
ensure_env_file
ensure_paths_defined
API_BASE="http://localhost:32300/api/v1/master"
HEALTH_A="$SYS_DEBUG_PRIVATE_NODEA/argus/agent/$HOST_A/health"
HEALTH_B="$SYS_DEBUG_PRIVATE_NODEB/argus/agent/$HOST_B/health"
# Create the health directory and drop one "healthy" report per module.
# Arguments: $1 - health directory (created when missing).
# The payload, including its timestamp, is a fixed literal.
write_health() {
  local target_dir="$1"
  local payload='{ "status": "healthy", "timestamp": "2025-10-13T12:05:00Z" }'
  local module
  mkdir -p "$target_dir"
  for module in log-fluentbit metric-node-exporter; do
    printf '%s\n' "$payload" > "$target_dir/$module.json"
  done
}
log "Writing health files for both nodes"
write_health "$HEALTH_A"
write_health "$HEALTH_B"

# Node ids are read from files in $TMP_DIR, which must already exist.
ID_A="$TMP_DIR/node_id_a"
ID_B="$TMP_DIR/node_id_b"
if [[ ! -f "$ID_A" || ! -f "$ID_B" ]]; then
  echo "[ERR] node id files missing in $TMP_DIR" >&2
  exit 1
fi
ID_A_VAL="$(<"$ID_A")"
ID_B_VAL="$(<"$ID_B")"
# Poll the master until a node's detail document lists both health modules.
# Arguments: $1 - node id.
# Returns 0 once "log-fluentbit" and "metric-node-exporter" both appear under
# "health", 1 after 40 polls spaced 2 seconds apart.
check_health() {
  local id="$1"
  local tries=40
  local detail_file="$TMP_DIR/node_${id}_detail.json"
  local attempt
  for (( attempt = 0; attempt < tries; attempt++ )); do
    sleep 2
    resp=$(curl -fsS "$API_BASE/nodes/$id" 2>/dev/null || true)
    if [[ -z "$resp" ]]; then
      continue
    fi
    echo "$resp" > "$detail_file"
    if python3 - "$detail_file" <<'PY'
import json
import sys

with open(sys.argv[1]) as fh:
    health = json.load(fh).get("health", {})

required = ("log-fluentbit", "metric-node-exporter")
sys.exit(0 if all(key in health for key in required) else 1)
PY
    then
      return 0
    fi
  done
  return 1
}
if ! check_health "$ID_A_VAL"; then
  echo "[ERR] health keys not reported for node A" >&2
  exit 1
fi
if ! check_health "$ID_B_VAL"; then
  echo "[ERR] health keys not reported for node B" >&2
  exit 1
fi

# nodes.json under the core private directory must be a list of exactly 2 entries.
NODES_JSON="$SYS_DEBUG_PRIVATE_CORE/argus/metric/prometheus/nodes.json"
[[ -f "$NODES_JSON" ]] || {
  echo "[ERR] nodes.json missing at $NODES_JSON" >&2
  exit 1
}
python3 - "$NODES_JSON" <<'PY'
import json
import sys

with open(sys.argv[1]) as handle:
    nodes = json.load(handle)

if not isinstance(nodes, list):
    sys.exit("nodes.json expected list")
if len(nodes) != 2:
    sys.exit(f"expected 2 nodes online, got {len(nodes)}")
PY

log "Health reported and nodes.json has 2 online nodes"

View File

@ -0,0 +1,73 @@
#!/usr/bin/env bash
set -euo pipefail
# shellcheck source=common.sh
source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh"
ensure_env_file
ensure_paths_defined
log "Sending logs and asserting ES counts"
# Print the document count Elasticsearch reports for an index pattern.
# Arguments: $1 - index name or pattern (e.g. "train-*").
# Always prints a non-negative integer and returns 0: an unreachable ES or a
# reply without a "count" field yields 0, so callers can feed the result
# straight into arithmetic.
get_count() {
  local idx="$1"
  local body
  local count_re='"count":([0-9]+)'
  body="$(curl -s "http://localhost:9200/${idx}/_count?ignore_unavailable=true&allow_no_indices=true" || true)"
  if [[ "$body" =~ $count_re ]]; then
    echo "${BASH_REMATCH[1]}"
  else
    echo 0
  fi
}
# Baseline document counts, taken before any test log line is written.
train0=$(get_count "train-*")
infer0=$(get_count "infer-*")
# Combined baseline that the post-ingest total is compared against.
base=$((train0 + infer0))
log "initial counts: train=${train0} infer=${infer0} total=${base}"
# Print the container id(s) that compose reports for the given service name.
# Callers treat empty output as "container not found".
service_id() {
  local service="$1"
  compose ps -q "$service"
}
# Append three sample log lines inside a container: two to the train log and
# one to the infer log.
# Arguments: $1 - container id, $2 - host tag embedded in each line.
# Each line gets its own timestamp from `date` run inside the container.
send_logs() {
  local container="$1"
  local tag="$2"
  docker exec "$container" sh -lc 'mkdir -p /logs/train /logs/infer'
  docker exec "$container" sh -lc "ts=\$(date '+%F %T'); echo \"\$ts INFO [$tag] training step=1 loss=1.23 model=bert\" >> /logs/train/train-demo.log"
  docker exec "$container" sh -lc "ts=\$(date '+%F %T'); echo \"\$ts INFO [$tag] training step=2 loss=1.10 model=bert\" >> /logs/train/train-demo.log"
  docker exec "$container" sh -lc "ts=\$(date '+%F %T'); echo \"\$ts WARN [$tag] inference slow on batch=2 latency=1.9s\" >> /logs/infer/infer-demo.log"
}
CID_A="$(service_id node-a)"
CID_B="$(service_id node-b)"
[[ -n "$CID_A" && -n "$CID_B" ]] || { echo "[ERR] node containers not found" >&2; exit 1; }

send_logs "$CID_A" "host01"
send_logs "$CID_B" "host02"

log "Waiting for ES to ingest"
sleep 10

train1=$(get_count "train-*")
infer1=$(get_count "infer-*")
final=$((train1 + infer1))
log "final counts: train=${train1} infer=${infer1} total=${final}"

# The total must have grown compared to the baseline taken earlier...
if (( final <= base )); then
  echo "[ERR] ES total did not increase (${base} -> ${final})" >&2
  exit 1
fi
# ...and must reach the minimum expected document count.
if (( final < 4 )); then
  echo "[ERR] ES total below expected threshold: ${final} < 4" >&2
  exit 1
fi

# `|| true`: under errexit+pipefail a failed curl or an unmatched grep would
# abort on this assignment, so the diagnostic below would never be printed.
es_health=$(curl -s "http://localhost:9200/_cluster/health" | grep -o '"status":"[^"]*"' | cut -d'"' -f4 || true)
if [[ "$es_health" != "green" && "$es_health" != "yellow" ]]; then
  echo "[ERR] ES health not green/yellow: $es_health" >&2
  exit 1
fi

# Kibana is informational here: an unavailable status endpoint only warns.
if ! curl -fs "http://localhost:5601/api/status" >/dev/null 2>&1; then
  echo "[WARN] Kibana status endpoint not available"
fi

log "ES counts increased and services healthy"

View File

@ -0,0 +1,110 @@
#!/usr/bin/env bash
set -euo pipefail
# shellcheck source=common.sh
source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh"
ensure_env_file
ensure_paths_defined
API_BASE="http://localhost:32300/api/v1/master"

# Entrypoint script that is later bind-mounted into the replacement container.
NODE_ENTRYPOINT="$DEBUG_ROOT/../tests/scripts/node_entrypoint.sh"
if [[ ! -f "$NODE_ENTRYPOINT" ]]; then
  echo "[ERR] node entrypoint script missing at $NODE_ENTRYPOINT" >&2
  exit 1
fi

TARGET_FIXED_IP="${SYS_DEBUG_NODEB_FIXED_IP:-172.30.0.200}"

# Node id and initial IP files must already be present in $TMP_DIR.
ID_B_FILE="$TMP_DIR/node_id_b"
IP_INIT_FILE="$TMP_DIR/initial_ip_b"
if [[ ! -f "$ID_B_FILE" || ! -f "$IP_INIT_FILE" ]]; then
  echo "[ERR] Required node id/ip files missing in $TMP_DIR" >&2
  exit 1
fi
ID_B="$(<"$ID_B_FILE")"
IP0_B="$(<"$IP_INIT_FILE")"

# Snapshot the node before the change so the update can be detected afterwards.
DETAIL_BEFORE="$TMP_DIR/node_b_before.json"
curl -fsS "$API_BASE/nodes/$ID_B" -o "$DETAIL_BEFORE"

LAST0="$(python3 - "$DETAIL_BEFORE" <<'PY'
import json
import sys

with open(sys.argv[1]) as fh:
    print(json.load(fh).get("last_updated", ""))
PY
)"
IP_BEFORE="$(python3 - "$DETAIL_BEFORE" <<'PY'
import json
import sys

with open(sys.argv[1]) as fh:
    print(json.load(fh).get("meta_data", {}).get("ip", ""))
PY
)"

# The master must still report the IP recorded at registration time...
if [[ "$IP_BEFORE" != "$IP0_B" ]]; then
  echo "[ERR] Expected initial IP $IP0_B for node-b, got $IP_BEFORE" >&2
  exit 1
fi
# ...and that IP must differ from the target, otherwise no change is observable.
if [[ "$IP_BEFORE" == "$TARGET_FIXED_IP" ]]; then
  echo "[ERR] node-b current IP $IP_BEFORE already matches target $TARGET_FIXED_IP. Configure SYS_DEBUG_NODEB_FIXED_IP to a different address before rerun." >&2
  exit 1
fi
# Print the container id(s) that compose reports for the given service name.
service_id() {
  local service="$1"
  compose ps -q "$service"
}
# Check every prerequisite BEFORE the running node-b is removed, so a missing
# input cannot leave the stack without a node-b container.
require_docker

AGENT_BIN_RECORD="$TMP_DIR/agent_binary_path"
if [[ ! -f "$AGENT_BIN_RECORD" ]]; then
  echo "[ERR] Agent binary path record missing at $AGENT_BIN_RECORD. Run 01_bootstrap.sh first." >&2
  exit 1
fi
AGENT_BIN_PATH="$(<"$AGENT_BIN_RECORD")"
[[ -f "$AGENT_BIN_PATH" ]] || { echo "[ERR] Agent binary path missing in $TMP_DIR" >&2; exit 1; }

log "Recreating node-b (old IP $IP_BEFORE) with static IP $TARGET_FIXED_IP"
# Remove both the compose-managed service container and any container left
# under the same name; either may be absent, hence the tolerant `|| true`.
compose rm -sf node-b >/dev/null 2>&1 || true
CONTAINER_NAME="${SYS_DEBUG_CONTAINER_PREFIX:-argus-debug}-node-b"
docker rm -f "$CONTAINER_NAME" >/dev/null 2>&1 || true

# Start the replacement directly with `docker run`, passing the fixed address
# through --ip.
docker run -d \
  --name "$CONTAINER_NAME" \
  --hostname "$HOST_B" \
  --network "$SYS_DEBUG_NETWORK_NAME" \
  --ip "$TARGET_FIXED_IP" \
  --dns "${SYS_DEBUG_BIND_IP:-172.30.0.2}" \
  -e MASTER_ENDPOINT=http://master.argus.com:3000 \
  -e REPORT_INTERVAL_SECONDS=2 \
  -e "ARGUS_BUILD_UID=$ARGUS_BUILD_UID" \
  -e "ARGUS_BUILD_GID=$ARGUS_BUILD_GID" \
  -e ES_HOST=es \
  -e ES_PORT=9200 \
  -e CLUSTER=local \
  -e RACK=dev \
  -p 2021:2020 \
  -v "$SYS_DEBUG_PRIVATE_NODEB/argus/agent/$HOST_B:/private/argus/agent/$HOST_B" \
  -v "$AGENT_BIN_PATH:/usr/local/bin/argus-agent:ro" \
  -v "$NODE_ENTRYPOINT:/usr/local/bin/node-entrypoint.sh:ro" \
  -v "$REPO_ROOT/src/log/fluent-bit/build/start-fluent-bit.sh:/assets/start-fluent-bit.sh:ro" \
  -v "$REPO_ROOT/src/log/fluent-bit/build/etc:/assets/fluent-bit/etc:ro" \
  -v "$REPO_ROOT/src/log/fluent-bit/build/packages:/assets/fluent-bit/packages:ro" \
  --entrypoint /usr/local/bin/node-entrypoint.sh \
  ubuntu:22.04 >/dev/null
log "Waiting for node-b to re-register with new IP"
# Success needs both signals: the master reports the target IP and the
# last_updated stamp moved on from the snapshot. 40 polls, 3 seconds apart.
for _ in {1..40}; do
  sleep 3
  curl -fsS "$API_BASE/nodes/$ID_B" -o "$TMP_DIR/node_b_after.json" || continue
  if python3 - "$TMP_DIR/node_b_after.json" "$LAST0" "$TARGET_FIXED_IP" <<'PY'
import json
import sys

detail_path, previous_stamp, expected_ip = sys.argv[1:4]
with open(detail_path) as fh:
    node = json.load(fh)

ip = node.get("meta_data", {}).get("ip")
stamp = node.get("last_updated")
refreshed = bool(stamp) and stamp != previous_stamp
sys.exit(0 if ip == expected_ip and refreshed else 1)
PY
  then
    log "node-b IP updated: $IP_BEFORE -> $TARGET_FIXED_IP"
    exit 0
  fi
done

echo "[ERR] node-b did not update to IP $TARGET_FIXED_IP in time" >&2
exit 1

View File

@ -0,0 +1,13 @@
#!/usr/bin/env bash
# Stop the debug compose stack; host directories are left in place.
set -euo pipefail

# shellcheck source=common.sh
source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh"

ensure_env_file
require_docker

log "Stopping debug stack (project $SYS_DEBUG_PROJECT_NAME)"
# Best effort: a failing `down` must not fail the script.
if ! compose down --remove-orphans >/dev/null 2>&1; then
  :
fi
log "Containers stopped. No host directories were removed."

View File

@ -0,0 +1,66 @@
#!/usr/bin/env bash
set -euo pipefail
# shellcheck source=common.sh
source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh"
ensure_env_file
ensure_paths_defined
# -y/--yes skips the interactive confirmation below.
FORCE=false
while (( $# > 0 )); do
  case "$1" in
    -y|--yes)
      FORCE=true
      ;;
    -h|--help)
      cat <<USAGE
Usage: ${0##*/} [--yes]
Safely remove debug private directories after adjusting ownership.
USAGE
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      exit 1
      ;;
  esac
  shift
done

# Without --yes, anything but an explicit yes aborts (exit status 0).
if [[ $FORCE == false ]]; then
  read -r -p "This will delete debug private directories. Continue? [y/N] " reply
  if [[ ! "$reply" =~ ^(y|Y|yes|YES)$ ]]; then
    echo "Aborted"
    exit 0
  fi
fi
# Directories owned by the debug environment; all come from .env or defaults.
paths=(
  "$SYS_DEBUG_PRIVATE_CORE"
  "$SYS_DEBUG_PRIVATE_NODEA"
  "$SYS_DEBUG_PRIVATE_NODEB"
  "$SYS_DEBUG_TMP_DIR"
)

require_docker
image="ubuntu:22.04"
# Ownership target is the invoking user; computed once for the whole loop.
owner="$(id -u):$(id -g)"

for dir in "${paths[@]}"; do
  [[ -d "$dir" ]] || continue
  # The paths are configuration-driven; never hand the filesystem root to
  # `rm -rf`, whatever a broken .env says.
  if [[ "$(cd "$dir" 2>/dev/null && pwd -P)" == "/" ]]; then
    echo "[ERR] Refusing to remove unsafe path: $dir" >&2
    exit 1
  fi
  log "Fixing ownership for $dir"
  # chown via a container first; fall back to a best-effort local chown
  # when the container run fails.
  if ! docker run --rm -v "$dir:/target" "$image" chown -R "$owner" /target >/dev/null 2>&1; then
    echo "[WARN] Failed to adjust ownership via $image, attempting local chown" >&2
    chown -R "$owner" "$dir" >/dev/null 2>&1 || true
  fi
  log "Removing $dir"
  rm -rf "$dir"
done

log "Clean data completed"

96
src/sys/debug/scripts/common.sh Executable file
View File

@ -0,0 +1,96 @@
#!/usr/bin/env bash
set -euo pipefail
# Directory layout, resolved relative to this file.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
DEBUG_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
REPO_ROOT="$(cd "$DEBUG_ROOT/../../.." && pwd)"
ENV_FILE="$DEBUG_ROOT/.env"

source "$REPO_ROOT/scripts/common/build_user.sh"
load_build_user

# Load .env when present; `set -a` exports everything it assigns.
if [[ -f "$ENV_FILE" ]]; then
  set -a
  # shellcheck disable=SC1090
  source "$ENV_FILE"
  set +a
fi

# Fallbacks for anything still unset or empty after loading .env.
: "${SYS_DEBUG_NETWORK_NAME:=argus-debug-net}"
: "${SYS_DEBUG_NETWORK_SUBNET:=172.30.0.0/16}"
: "${SYS_DEBUG_NETWORK_GATEWAY:=172.30.0.1}"
: "${SYS_DEBUG_PROJECT_NAME:=argus-debug}"
: "${SYS_DEBUG_CONTAINER_PREFIX:=argus-debug}"

: "${SYS_DEBUG_PRIVATE_CORE:=$DEBUG_ROOT/private}"
: "${SYS_DEBUG_PRIVATE_NODEA:=$DEBUG_ROOT/private-nodea}"
: "${SYS_DEBUG_PRIVATE_NODEB:=$DEBUG_ROOT/private-nodeb}"
: "${SYS_DEBUG_TMP_DIR:=$DEBUG_ROOT/tmp}"

: "${ARGUS_BUILD_UID:=2133}"
: "${ARGUS_BUILD_GID:=2015}"

: "${SYS_DEBUG_NODEA_HOST:=dev-yyrshare-nbnyx10-cp2f-pod-0}"
: "${SYS_DEBUG_NODEB_HOST:=dev-yyrshare-uuuu10-ep2f-pod-0}"

# Short aliases used throughout the debug scripts.
HOST_A="$SYS_DEBUG_NODEA_HOST"
HOST_B="$SYS_DEBUG_NODEB_HOST"

COMPOSE_FILE="$DEBUG_ROOT/docker-compose.yml"
# Print the absolute, normalised form of a path (via Python's os.path.abspath).
# Arguments: $1 - path, absolute or relative to the current directory.
abs_path() {
  python3 -c 'import os, sys; print(os.path.abspath(sys.argv[1]))' "$1"
}
# Exit the script with an error unless the named command can be found.
# Arguments: $1 - command name to look up with `command -v`.
ensure_command() {
  local cmd="$1"
  command -v "$cmd" >/dev/null 2>&1 && return 0
  echo "[ERR] Required command '$cmd' not found" >&2
  exit 1
}
# Abort (through ensure_command) unless the docker CLI is on PATH.
require_docker() {
ensure_command docker
}
# Run a compose subcommand against the debug project and compose file.
# Arguments: passed through unchanged to compose (e.g. `up -d`, `ps -q svc`).
# Uses the `docker compose` plugin when it answers `version`, otherwise the
# standalone `docker-compose` binary, which is verified to exist first.
compose() {
  require_docker
  # Declared as an array: the command is one or two words depending on flavour.
  local -a bin
  if docker compose version >/dev/null 2>&1; then
    bin=(docker compose)
  else
    ensure_command docker-compose
    bin=(docker-compose)
  fi
  "${bin[@]}" -p "$SYS_DEBUG_PROJECT_NAME" -f "$COMPOSE_FILE" "$@"
}
# Exit with an error unless every debug directory variable is non-empty.
# Reports all missing names at once and points the user at 01_bootstrap.sh.
ensure_paths_defined() {
  local missing=()
  # Keep the loop variable local so the caller's own `name` is not clobbered.
  local name
  for name in SYS_DEBUG_PRIVATE_CORE SYS_DEBUG_PRIVATE_NODEA SYS_DEBUG_PRIVATE_NODEB SYS_DEBUG_TMP_DIR; do
    # ${!name} is indirect expansion: the value of the variable called $name.
    if [[ -z "${!name:-}" ]]; then
      missing+=("$name")
    fi
  done
  if (( ${#missing[@]} > 0 )); then
    echo "[ERR] Missing required environment variables: ${missing[*]}" >&2
    echo " Run 01_bootstrap.sh first." >&2
    exit 1
  fi
}
# Exit with an error unless the .env file at $ENV_FILE exists.
ensure_env_file() {
  [[ -f "$ENV_FILE" ]] && return 0
  echo "[ERR] Missing .env at $ENV_FILE. Run 01_bootstrap.sh first." >&2
  exit 1
}
# Print an informational message on stdout, prefixed with "[INFO] ".
# All arguments are joined into one message.
log() {
  printf '[INFO] %s\n' "$*"
}
TMP_DIR="$SYS_DEBUG_TMP_DIR"
mkdir -p "$TMP_DIR"

View File

@ -0,0 +1,76 @@
#!/usr/bin/env bash
set -euo pipefail
# shellcheck source=common.sh
source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh"
NAME="$SYS_DEBUG_NETWORK_NAME"
SUBNET="$SYS_DEBUG_NETWORK_SUBNET"
GATEWAY="$SYS_DEBUG_NETWORK_GATEWAY"
# Print the help text, showing the currently effective name/subnet/gateway.
usage() {
  cat <<USAGE_TEXT
Usage: ${0##*/} [--name NAME] [--subnet CIDR] [--gateway IP]
Create (if missing) the external debug docker network.
Defaults derived from .env or:
name = $NAME
subnet = $SUBNET
gateway = $GATEWAY
USAGE_TEXT
}
# Parse command-line overrides; both "--opt VALUE" and "--opt=VALUE" are accepted.
while (( $# > 0 )); do
  case "$1" in
    --name)
      shift
      if (( $# == 0 )); then
        echo "--name requires value" >&2
        exit 1
      fi
      NAME="$1"
      ;;
    --name=*)
      NAME="${1#*=}"
      ;;
    --subnet)
      shift
      if (( $# == 0 )); then
        echo "--subnet requires value" >&2
        exit 1
      fi
      SUBNET="$1"
      ;;
    --subnet=*)
      SUBNET="${1#*=}"
      ;;
    --gateway)
      shift
      if (( $# == 0 )); then
        echo "--gateway requires value" >&2
        exit 1
      fi
      GATEWAY="$1"
      ;;
    --gateway=*)
      GATEWAY="${1#*=}"
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 1
      ;;
  esac
  shift
done
require_docker

# An already existing network is left untouched.
if ! docker network inspect "$NAME" >/dev/null 2>&1; then
  log "Creating network $NAME (subnet=$SUBNET gateway=$GATEWAY)"
  docker network create \
    --driver bridge \
    --subnet "$SUBNET" \
    --gateway "$GATEWAY" \
    "$NAME"
  # Record the name of the network that was just created.
  mkdir -p "$TMP_DIR"
  echo "$NAME" > "$TMP_DIR/network.created"
  log "Network $NAME created"
else
  log "Network $NAME already exists"
  exit 0
fi

View File

@ -0,0 +1,55 @@
#!/usr/bin/env bash
set -euo pipefail
# shellcheck source=common.sh
source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh"
NAME="$SYS_DEBUG_NETWORK_NAME"
# Print the help text for the network removal script.
usage() {
  cat <<USAGE_TEXT
Usage: ${0##*/} [--name NAME]
Destroy the debug docker network if no containers are attached.
USAGE_TEXT
}
# Parse command-line overrides; both "--name VALUE" and "--name=VALUE" are accepted.
while (( $# > 0 )); do
  case "$1" in
    --name)
      shift
      if (( $# == 0 )); then
        echo "--name requires value" >&2
        exit 1
      fi
      NAME="$1"
      ;;
    --name=*)
      NAME="${1#*=}"
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 1
      ;;
  esac
  shift
done
require_docker

# Nothing to remove when the network does not exist.
docker network inspect "$NAME" >/dev/null 2>&1 || {
  log "Network $NAME not found; nothing to do"
  exit 0
}

# Refuse to delete while containers are attached; list them for the user.
attached=$(docker network inspect -f '{{range $id, $conf := .Containers}}{{printf "%s " $conf.Name}}{{end}}' "$NAME")
if [[ -n "${attached// }" ]]; then
  echo "[ERR] Cannot remove network $NAME: still connected containers -> $attached" >&2
  exit 1
fi

log "Deleting network $NAME"
docker network rm "$NAME" >/dev/null
# Drop the marker file in $TMP_DIR along with the network.
rm -f "$TMP_DIR/network.created"
log "Network $NAME removed"

View File

@ -8,6 +8,16 @@ REPO_ROOT="$(cd "$TEST_ROOT/../../.." && pwd)"
API_BASE="http://localhost:32300/api/v1/master"
if [[ -f "$TEST_ROOT/.env" ]]; then
set -a
# shellcheck disable=SC1090
source "$TEST_ROOT/.env"
set +a
else
source "$REPO_ROOT/scripts/common/build_user.sh"
load_build_user
fi
ID_B="$(cat "$TMP_DIR/node_id_b")"
IP0_B="$(cat "$TMP_DIR/initial_ip_b")"