diff --git a/build/build_images.sh b/build/build_images.sh index bf712a4..562c964 100755 --- a/build/build_images.sh +++ b/build/build_images.sh @@ -10,6 +10,7 @@ Usage: $0 [OPTIONS] Options: --intranet Use intranet mirror for log/bind builds --master-offline Build master offline image (requires src/master/offline_wheels.tar.gz) + --no-cache Build all images without using Docker layer cache -h, --help Show this help message Examples: @@ -23,6 +24,7 @@ EOF use_intranet=false build_master=true build_master_offline=false +no_cache=false while [[ $# -gt 0 ]]; do case $1 in @@ -39,6 +41,10 @@ while [[ $# -gt 0 ]]; do build_master_offline=true shift ;; + --no-cache) + no_cache=true + shift + ;; -h|--help) show_help exit 0 @@ -65,6 +71,10 @@ cd "$root" load_build_user build_args+=("--build-arg" "ARGUS_BUILD_UID=${ARGUS_BUILD_UID}" "--build-arg" "ARGUS_BUILD_GID=${ARGUS_BUILD_GID}") +if [[ "$no_cache" == true ]]; then + build_args+=("--no-cache") +fi + master_root="$root/src/master" master_offline_tar="$master_root/offline_wheels.tar.gz" master_offline_dir="$master_root/offline_wheels" @@ -159,6 +169,9 @@ if [[ "$build_master" == true ]]; then if [[ "$build_master_offline" == true ]]; then master_args+=("--offline") fi + if [[ "$no_cache" == true ]]; then + master_args+=("--no-cache") + fi if ./scripts/build_images.sh "${master_args[@]}"; then if [[ "$build_master_offline" == true ]]; then images_built+=("argus-master:offline") diff --git a/src/agent/README.md b/src/agent/README.md index f89334d..df96bdb 100644 --- a/src/agent/README.md +++ b/src/agent/README.md @@ -34,6 +34,18 @@ Agent 不再依赖配置文件;所有参数均由环境变量与主机名推 | `MASTER_ENDPOINT` | 是 | N/A | Master 基础地址,可写 `http://host:3000` 或 `host:3000`(自动补全 `http://`)。 | | `REPORT_INTERVAL_SECONDS` | 否 | `60` | 状态上报间隔(秒)。必须为正整数。 | | `AGENT_HOSTNAME` | 否 | `$(hostname)` | 覆盖容器内主机名,便于测试或特殊命名需求。 | +| `AGENT_ENV` | 否 | 来源于主机名 | 运行环境标识(如 `dev`、`prod`)。与 `AGENT_USER`、`AGENT_INSTANCE` 必须同时设置。 | +| `AGENT_USER` | 否 | 来源于主机名 | 归属用户或团队标识。与 
`AGENT_ENV`、`AGENT_INSTANCE` 必须同时设置。 | +| `AGENT_INSTANCE` | 否 | 来源于主机名 | 实例编号或别名。与 `AGENT_ENV`、`AGENT_USER` 必须同时设置。 | + +主机名与元数据的解析优先级: + +1. 若设置 `AGENT_ENV` / `AGENT_USER` / `AGENT_INSTANCE` 且全部存在,则直接使用这些值。 +2. 否则检查历史 `node.json`(注册成功后由 Master 返回的信息),若包含 `env` / `user` / `instance` 则沿用。 +3. 若以上均不可用,则按历史约定从主机名解析 `env-user-instance` 前缀。 +4. 如果仍无法得到完整结果,Agent 启动会失败并提示需要提供上述环境变量。 + +> 提示:在首次部署时需确保环境变量或主机名能够提供完整信息。完成注册后,Agent 会把 Master 返回的元数据写入 `node.json`,后续重启无需再次提供环境变量就能保持一致性。 派生路径: diff --git a/src/agent/app/collector.py b/src/agent/app/collector.py index 1b61caa..6c913df 100644 --- a/src/agent/app/collector.py +++ b/src/agent/app/collector.py @@ -18,13 +18,12 @@ _HOSTNAME_PATTERN = re.compile(r"^([^-]+)-([^-]+)-([^-]+)-.*$") def collect_metadata(config: AgentConfig) -> Dict[str, Any]: """汇总节点注册需要的静态信息。""" hostname = config.hostname - env, user, instance = _parse_hostname(hostname) meta = { "hostname": hostname, "ip": _detect_ip_address(), - "env": env, - "user": user, - "instance": instance, + "env": config.environment, + "user": config.user, + "instance": config.instance, "cpu_number": _detect_cpu_count(), "memory_in_bytes": _detect_memory_bytes(), "gpu_number": _detect_gpu_count(), diff --git a/src/agent/app/config.py b/src/agent/app/config.py index dae5d47..057b92a 100644 --- a/src/agent/app/config.py +++ b/src/agent/app/config.py @@ -6,14 +6,21 @@ from dataclasses import dataclass from pathlib import Path from typing import Final +from .state import load_node_state from .version import VERSION +from .log import get_logger DEFAULT_REPORT_INTERVAL_SECONDS: Final[int] = 60 +LOGGER = get_logger("argus.agent.config") + @dataclass(frozen=True) class AgentConfig: hostname: str + environment: str + user: str + instance: str node_file: str version: str master_endpoint: str @@ -47,11 +54,68 @@ def _resolve_hostname() -> str: return os.environ.get("AGENT_HOSTNAME") or socket.gethostname() +def _load_metadata_from_state(node_file: str) -> tuple[str, str, str] | None: + 
state = load_node_state(node_file) + if not state: + return None + + meta = state.get("meta_data") or {} + env = meta.get("env") or state.get("env") + user = meta.get("user") or state.get("user") + instance = meta.get("instance") or state.get("instance") + + if env and user and instance: + LOGGER.debug("Metadata resolved from node state", extra={"node_file": node_file}) + return env, user, instance + + LOGGER.warning( + "node.json missing metadata fields; ignoring", + extra={"node_file": node_file, "meta_data": meta}, + ) + return None + + +def _resolve_metadata_fields(hostname: str, node_file: str) -> tuple[str, str, str]: + env = os.environ.get("AGENT_ENV") + user = os.environ.get("AGENT_USER") + instance = os.environ.get("AGENT_INSTANCE") + + if env and user and instance: + return env, user, instance + + if any([env, user, instance]): + LOGGER.warning( + "Incomplete metadata environment variables; falling back to persisted metadata", + extra={ + "has_env": bool(env), + "has_user": bool(user), + "has_instance": bool(instance), + }, + ) + + state_metadata = _load_metadata_from_state(node_file) + if state_metadata is not None: + return state_metadata + + from .collector import _parse_hostname # Local import to avoid circular dependency + + env, user, instance = _parse_hostname(hostname) + + if not all([env, user, instance]): + raise ValueError( + "Failed to determine metadata fields; set AGENT_ENV/USER/INSTANCE or use supported hostname pattern" + ) + + return env, user, instance + + def load_config() -> AgentConfig: """从环境变量推导配置,移除了外部配置文件依赖。""" hostname = _resolve_hostname() node_file = f"/private/argus/agent/{hostname}/node.json" + environment, user, instance = _resolve_metadata_fields(hostname, node_file) + health_dir = f"/private/argus/agent/{hostname}/health/" master_endpoint_env = os.environ.get("MASTER_ENDPOINT") @@ -66,6 +130,9 @@ def load_config() -> AgentConfig: return AgentConfig( hostname=hostname, + environment=environment, + user=user, + 
instance=instance, node_file=node_file, version=VERSION, master_endpoint=master_endpoint, diff --git a/src/agent/dist/argus-agent b/src/agent/dist/argus-agent index 4fef67c..1a335c4 100755 Binary files a/src/agent/dist/argus-agent and b/src/agent/dist/argus-agent differ diff --git a/src/agent/tests/__init__.py b/src/agent/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/agent/tests/docker-compose.yml b/src/agent/tests/docker-compose.yml index e24e252..0dd4743 100644 --- a/src/agent/tests/docker-compose.yml +++ b/src/agent/tests/docker-compose.yml @@ -60,6 +60,36 @@ services: ipv4_address: 172.28.0.20 restart: always + agent_env: + image: ubuntu:22.04 + container_name: argus-agent-env-e2e + hostname: host_abc + depends_on: + - master + - bind + environment: + - MASTER_ENDPOINT=http://master.argus.com:3000 + - REPORT_INTERVAL_SECONDS=2 + - AGENT_ENV=prod + - AGENT_USER=ml + - AGENT_INSTANCE=node-3 + - AGENT_HOSTNAME=host_abc + - "ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}" + - "ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}" + volumes: + - ./private/argus/agent/host_abc:/private/argus/agent/host_abc + - ./private/argus/agent/host_abc/health:/private/argus/agent/host_abc/health + - ./private/argus/etc:/private/argus/etc + - ../dist/argus-agent:/usr/local/bin/argus-agent:ro + - ./scripts/agent_entrypoint.sh:/usr/local/bin/agent-entrypoint.sh:ro + - ../scripts/agent_deployment_verify.sh:/usr/local/bin/agent_deployment_verify.sh:ro + entrypoint: + - /usr/local/bin/agent-entrypoint.sh + networks: + default: + ipv4_address: 172.28.0.21 + restart: always + networks: default: driver: bridge diff --git a/src/agent/tests/scripts/00_e2e_test.sh b/src/agent/tests/scripts/00_e2e_test.sh index 9515d34..14e27f7 100755 --- a/src/agent/tests/scripts/00_e2e_test.sh +++ b/src/agent/tests/scripts/00_e2e_test.sh @@ -7,10 +7,10 @@ SCRIPTS=( "02_up.sh" "03_wait_and_assert_registration.sh" "04_write_health_files.sh" - "08_verify_agent.sh" - 
"05_assert_status_on_master.sh" - "06_restart_agent_and_reregister.sh" - "07_down.sh" + "05_verify_agent.sh" + "06_assert_status_on_master.sh" + "07_restart_agent_and_reregister.sh" + "08_down.sh" ) for script in "${SCRIPTS[@]}"; do diff --git a/src/agent/tests/scripts/02_up.sh b/src/agent/tests/scripts/02_up.sh index 56c4cda..d490a50 100755 --- a/src/agent/tests/scripts/02_up.sh +++ b/src/agent/tests/scripts/02_up.sh @@ -41,7 +41,7 @@ compose() { fi } -docker container rm -f argus-agent-e2e argus-master-agent-e2e argus-bind-agent-e2e >/dev/null 2>&1 || true +docker container rm -f argus-agent-e2e argus-agent-env-e2e argus-master-agent-e2e argus-bind-agent-e2e >/dev/null 2>&1 || true docker network rm tests_default >/dev/null 2>&1 || true diff --git a/src/agent/tests/scripts/03_wait_and_assert_registration.sh b/src/agent/tests/scripts/03_wait_and_assert_registration.sh index 7e9c127..8b0481b 100755 --- a/src/agent/tests/scripts/03_wait_and_assert_registration.sh +++ b/src/agent/tests/scripts/03_wait_and_assert_registration.sh @@ -6,11 +6,14 @@ TEST_ROOT="$(cd "$SCRIPT_DIR/.." 
&& pwd)" TMP_ROOT="$TEST_ROOT/tmp" API_BASE="http://localhost:32300/api/v1/master" AGENT_HOSTNAME="dev-e2euser-e2einst-pod-0" +ENV_AGENT_HOSTNAME="host_abc" NODE_FILE="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME/node.json" +ENV_NODE_FILE="$TEST_ROOT/private/argus/agent/$ENV_AGENT_HOSTNAME/node.json" mkdir -p "$TMP_ROOT" -node_id="" +primary_node_id="" +env_node_id="" for _ in {1..30}; do sleep 2 response=$(curl -sS "$API_BASE/nodes" || true) @@ -19,24 +22,49 @@ for _ in {1..30}; do fi list_file="$TMP_ROOT/nodes_list.json" echo "$response" > "$list_file" - node_id=$(python3 - "$list_file" <<'PY' + readarray -t node_ids < <(python3 - "$list_file" "$AGENT_HOSTNAME" "$ENV_AGENT_HOSTNAME" <<'PY' import json, sys + with open(sys.argv[1]) as handle: nodes = json.load(handle) -print(nodes[0]["id"] if nodes else "") + +target_primary = sys.argv[2] +target_env = sys.argv[3] + +primary_id = "" +env_id = "" + +for node in nodes: + if node.get("name") == target_primary: + primary_id = node.get("id", "") + if node.get("name") == target_env: + env_id = node.get("id", "") + +print(primary_id) +print(env_id) PY -) - if [[ -n "$node_id" ]]; then + ) + + primary_node_id="${node_ids[0]}" + env_node_id="${node_ids[1]}" + + if [[ -n "$primary_node_id" && -n "$env_node_id" ]]; then break fi done -if [[ -z "$node_id" ]]; then - echo "[ERROR] Agent did not register within timeout" >&2 +if [[ -z "$primary_node_id" ]]; then + echo "[ERROR] Primary agent did not register within timeout" >&2 exit 1 fi -echo "$node_id" > "$TMP_ROOT/node_id" +if [[ -z "$env_node_id" ]]; then + echo "[ERROR] Env-variable agent did not register within timeout" >&2 + exit 1 +fi + +echo "$primary_node_id" > "$TMP_ROOT/node_id" +echo "$env_node_id" > "$TMP_ROOT/node_id_host_abc" if [[ ! -f "$NODE_FILE" ]]; then echo "[ERROR] node.json not created at $NODE_FILE" >&2 @@ -50,8 +78,20 @@ with open(sys.argv[1]) as handle: assert "id" in node and node["id"], "node.json missing id" PY +if [[ ! 
-f "$ENV_NODE_FILE" ]]; then + echo "[ERROR] node.json not created at $ENV_NODE_FILE" >&2 + exit 1 +fi + +python3 - "$ENV_NODE_FILE" <<'PY' +import json, sys +with open(sys.argv[1]) as handle: + node = json.load(handle) +assert "id" in node and node["id"], "env agent node.json missing id" +PY + detail_file="$TMP_ROOT/initial_detail.json" -curl -sS "$API_BASE/nodes/$node_id" -o "$detail_file" +curl -sS "$API_BASE/nodes/$primary_node_id" -o "$detail_file" python3 - "$detail_file" "$TMP_ROOT/initial_ip" <<'PY' import json, sys, pathlib with open(sys.argv[1]) as handle: @@ -62,4 +102,5 @@ if not ip: pathlib.Path(sys.argv[2]).write_text(ip) PY -echo "[INFO] Agent registered with node id $node_id" +echo "[INFO] Agent registered with node id $primary_node_id" +echo "[INFO] Env-variable agent registered with node id $env_node_id" diff --git a/src/agent/tests/scripts/05_verify_agent.sh b/src/agent/tests/scripts/05_verify_agent.sh new file mode 100755 index 0000000..2d4d9b8 --- /dev/null +++ b/src/agent/tests/scripts/05_verify_agent.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +REPO_ROOT="$(cd "$TEST_ROOT/.." && pwd)" +VERIFY_SCRIPT="$REPO_ROOT/scripts/agent_deployment_verify.sh" +ENV_NODE_ID_FILE="$TEST_ROOT/tmp/node_id_host_abc" +PRIMARY_CONTAINER="argus-agent-e2e" +ENV_CONTAINER="argus-agent-env-e2e" +PRIMARY_HOSTNAME="dev-e2euser-e2einst-pod-0" +ENV_HOSTNAME="host_abc" + +if ! 
docker ps --format '{{.Names}}' | grep -q "^${PRIMARY_CONTAINER}$"; then + echo "[WARN] agent container not running; skip verification" + exit 0 +fi + +if docker exec -i "$PRIMARY_CONTAINER" bash -lc 'command -v curl >/dev/null && command -v jq >/dev/null'; then + echo "[INFO] curl/jq already installed in agent container" +else + echo "[INFO] Installing curl/jq in agent container" + docker exec -i "$PRIMARY_CONTAINER" bash -lc 'apt-get update >/dev/null 2>&1 && apt-get install -y curl jq >/dev/null 2>&1' || true +fi + +if [[ ! -f "$VERIFY_SCRIPT" ]]; then + echo "[ERROR] Verification script missing at $VERIFY_SCRIPT" >&2 + exit 1 +fi + +run_verifier() { + local container="$1" hostname="$2" + + if ! docker ps --format '{{.Names}}' | grep -q "^${container}$"; then + echo "[WARN] container $container not running; skip" + return + fi + + if ! docker exec -i "$container" bash -lc 'command -v /usr/local/bin/agent_deployment_verify.sh >/dev/null'; then + echo "[ERROR] /usr/local/bin/agent_deployment_verify.sh missing in $container" >&2 + exit 1 + fi + + echo "[INFO] Running verification for $hostname in $container" + docker exec -i "$container" env VERIFY_HOSTNAME="$hostname" /usr/local/bin/agent_deployment_verify.sh +} + +run_verifier "$PRIMARY_CONTAINER" "$PRIMARY_HOSTNAME" + +if docker ps --format '{{.Names}}' | grep -q "^${ENV_CONTAINER}$"; then + if docker exec -i "$ENV_CONTAINER" bash -lc 'command -v curl >/dev/null && command -v jq >/dev/null'; then + echo "[INFO] curl/jq already installed in env agent container" + else + echo "[INFO] Installing curl/jq in env agent container" + docker exec -i "$ENV_CONTAINER" bash -lc 'apt-get update >/dev/null 2>&1 && apt-get install -y curl jq >/dev/null 2>&1' || true + fi + run_verifier "$ENV_CONTAINER" "$ENV_HOSTNAME" +else + echo "[WARN] env-driven agent container not running; skip secondary verification" +fi diff --git a/src/agent/tests/scripts/05_assert_status_on_master.sh 
b/src/agent/tests/scripts/06_assert_status_on_master.sh similarity index 57% rename from src/agent/tests/scripts/05_assert_status_on_master.sh rename to src/agent/tests/scripts/06_assert_status_on_master.sh index b1b0a87..3c58426 100755 --- a/src/agent/tests/scripts/05_assert_status_on_master.sh +++ b/src/agent/tests/scripts/06_assert_status_on_master.sh @@ -6,6 +6,8 @@ TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" TMP_ROOT="$TEST_ROOT/tmp" API_BASE="http://localhost:32300/api/v1/master" NODE_ID="$(cat "$TMP_ROOT/node_id")" +ENV_NODE_ID="$(cat "$TMP_ROOT/node_id_host_abc")" +ENV_HOSTNAME="host_abc" NODES_JSON="$TEST_ROOT/private/argus/metric/prometheus/nodes.json" success=false @@ -41,13 +43,36 @@ if [[ ! -f "$NODES_JSON" ]]; then exit 1 fi -python3 - "$NODES_JSON" <<'PY' +python3 - "$NODES_JSON" "$NODE_ID" "$ENV_NODE_ID" <<'PY' import json, sys with open(sys.argv[1]) as handle: nodes = json.load(handle) -assert len(nodes) == 1, nodes -entry = nodes[0] -assert entry["node_id"], entry + +expected_primary = sys.argv[2] +expected_env = sys.argv[3] + +ids = {entry.get("node_id") for entry in nodes} +assert expected_primary in ids, nodes +assert expected_env in ids, nodes +assert len(nodes) >= 2, nodes PY echo "[INFO] Master reflects agent health and nodes.json entries" + +env_detail_file="$TMP_ROOT/env_agent_detail.json" +curl -sS "$API_BASE/nodes/$ENV_NODE_ID" -o "$env_detail_file" +python3 - "$env_detail_file" "$ENV_HOSTNAME" <<'PY' +import json, sys +with open(sys.argv[1]) as handle: + node = json.load(handle) + +expected_name = sys.argv[2] + +assert node.get("name") == expected_name, node +meta = node.get("meta_data", {}) +assert meta.get("env") == "prod", meta +assert meta.get("user") == "ml", meta +assert meta.get("instance") == "node-3", meta +PY + +echo "[INFO] Env-variable agent reports expected metadata" diff --git a/src/agent/tests/scripts/06_restart_agent_and_reregister.sh b/src/agent/tests/scripts/07_restart_agent_and_reregister.sh similarity index 53% rename 
from src/agent/tests/scripts/06_restart_agent_and_reregister.sh rename to src/agent/tests/scripts/07_restart_agent_and_reregister.sh index 78c6322..4da99d3 100755 --- a/src/agent/tests/scripts/06_restart_agent_and_reregister.sh +++ b/src/agent/tests/scripts/07_restart_agent_and_reregister.sh @@ -6,10 +6,20 @@ TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" TMP_ROOT="$TEST_ROOT/tmp" API_BASE="http://localhost:32300/api/v1/master" NODE_ID="$(cat "$TMP_ROOT/node_id")" +ENV_NODE_ID_FILE="$TMP_ROOT/node_id_host_abc" +if [[ ! -f "$ENV_NODE_ID_FILE" ]]; then + echo "[ERROR] Env agent node id file missing at $ENV_NODE_ID_FILE" >&2 + exit 1 +fi + +ENV_NODE_ID="$(cat "$ENV_NODE_ID_FILE")" AGENT_HOSTNAME="dev-e2euser-e2einst-pod-0" +ENV_AGENT_HOSTNAME="host_abc" NETWORK_NAME="tests_default" NEW_AGENT_IP="172.28.0.200" +NEW_ENV_AGENT_IP="172.28.0.210" ENTRYPOINT_SCRIPT="$SCRIPT_DIR/agent_entrypoint.sh" +VERIFY_SCRIPT="$TEST_ROOT/../scripts/agent_deployment_verify.sh" ENV_FILE="$TEST_ROOT/.env" # 中文提示:重启场景也需要同样的入口脚本,确保 DNS 注册逻辑一致 @@ -18,6 +28,11 @@ if [[ ! -f "$ENTRYPOINT_SCRIPT" ]]; then exit 1 fi +if [[ ! -f "$VERIFY_SCRIPT" ]]; then + echo "[ERROR] agent verification script missing at $VERIFY_SCRIPT" >&2 + exit 1 +fi + if [[ ! 
-f "$TMP_ROOT/agent_binary_path" ]]; then echo "[ERROR] Agent binary path missing; rerun bootstrap" >&2 exit 1 @@ -74,15 +89,37 @@ if [[ "$prev_ip" != "$initial_ip" ]]; then exit 1 fi +env_before_file="$TMP_ROOT/env_before_restart.json" +curl -sS "$API_BASE/nodes/$ENV_NODE_ID" -o "$env_before_file" +env_prev_last_updated=$(python3 - "$env_before_file" <<'PY' +import json, sys +with open(sys.argv[1]) as handle: + node = json.load(handle) +print(node.get("last_updated", "")) +PY +) +env_prev_ip=$(python3 - "$env_before_file" <<'PY' +import json, sys +with open(sys.argv[1]) as handle: + node = json.load(handle) +print(node["meta_data"].get("ip", "")) +PY +) + pushd "$TEST_ROOT" >/dev/null compose rm -sf agent +compose rm -sf agent_env popd >/dev/null docker container rm -f argus-agent-e2e >/dev/null 2>&1 || true +docker container rm -f argus-agent-env-e2e >/dev/null 2>&1 || true AGENT_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME" HEALTH_DIR="$TEST_ROOT/private/argus/agent/$AGENT_HOSTNAME/health" +ENV_AGENT_DIR="$TEST_ROOT/private/argus/agent/$ENV_AGENT_HOSTNAME" +ENV_HEALTH_DIR="$TEST_ROOT/private/argus/agent/$ENV_AGENT_HOSTNAME/health" + # 先以 sleep 方式启动容器,确保我们掌握注册时的网络状态 if ! docker run -d \ --name argus-agent-e2e \ @@ -94,6 +131,7 @@ if ! docker run -d \ -v "$TEST_ROOT/private/argus/etc:/private/argus/etc" \ -v "$AGENT_BINARY:/usr/local/bin/argus-agent:ro" \ -v "$ENTRYPOINT_SCRIPT:/usr/local/bin/agent-entrypoint.sh:ro" \ + -v "$VERIFY_SCRIPT:/usr/local/bin/agent_deployment_verify.sh:ro" \ -e MASTER_ENDPOINT=http://master.argus.com:3000 \ -e REPORT_INTERVAL_SECONDS=2 \ -e ARGUS_BUILD_UID="$AGENT_UID" \ @@ -141,3 +179,76 @@ if [[ "$success" != true ]]; then fi echo "[INFO] Agent restart produced successful re-registration with IP change" + +# ---- Restart env-driven agent without metadata environment variables ---- + +if [[ ! -d "$ENV_AGENT_DIR" ]]; then + echo "[ERROR] Env agent data dir missing at $ENV_AGENT_DIR" >&2 + exit 1 +fi + +if [[ ! 
-d "$ENV_HEALTH_DIR" ]]; then + mkdir -p "$ENV_HEALTH_DIR" +fi + +if ! docker run -d \ + --name argus-agent-env-e2e \ + --hostname "$ENV_AGENT_HOSTNAME" \ + --network "$NETWORK_NAME" \ + --ip "$NEW_ENV_AGENT_IP" \ + -v "$ENV_AGENT_DIR:/private/argus/agent/$ENV_AGENT_HOSTNAME" \ + -v "$ENV_HEALTH_DIR:/private/argus/agent/$ENV_AGENT_HOSTNAME/health" \ + -v "$TEST_ROOT/private/argus/etc:/private/argus/etc" \ + -v "$AGENT_BINARY:/usr/local/bin/argus-agent:ro" \ + -v "$ENTRYPOINT_SCRIPT:/usr/local/bin/agent-entrypoint.sh:ro" \ + -v "$VERIFY_SCRIPT:/usr/local/bin/agent_deployment_verify.sh:ro" \ + -e MASTER_ENDPOINT=http://master.argus.com:3000 \ + -e REPORT_INTERVAL_SECONDS=2 \ + -e ARGUS_BUILD_UID="$AGENT_UID" \ + -e ARGUS_BUILD_GID="$AGENT_GID" \ + --entrypoint /usr/local/bin/agent-entrypoint.sh \ + ubuntu:22.04 >/dev/null; then + echo "[ERROR] Failed to start env-driven agent container without metadata env" >&2 + exit 1 +fi + +env_success=false +env_detail_file="$TMP_ROOT/env_post_restart.json" +for _ in {1..20}; do + sleep 3 + if ! 
curl -sS "$API_BASE/nodes/$ENV_NODE_ID" -o "$env_detail_file"; then + continue + fi + if python3 - "$env_detail_file" "$env_prev_last_updated" "$ENV_NODE_ID" "$env_prev_ip" "$NEW_ENV_AGENT_IP" <<'PY' +import json, sys +with open(sys.argv[1]) as handle: + node = json.load(handle) +prev_last_updated = sys.argv[2] +expected_id = sys.argv[3] +old_ip = sys.argv[4] +expected_ip = sys.argv[5] +last_updated = node.get("last_updated") +current_ip = node["meta_data"].get("ip") +meta = node.get("meta_data", {}) +assert node["id"] == expected_id +if current_ip != expected_ip: + raise SystemExit(1) +if current_ip == old_ip: + raise SystemExit(1) +if not last_updated or last_updated == prev_last_updated: + raise SystemExit(1) +if meta.get("env") != "prod" or meta.get("user") != "ml" or meta.get("instance") != "node-3": + raise SystemExit(1) +PY + then + env_success=true + break + fi +done + +if [[ "$env_success" != true ]]; then + echo "[ERROR] Env-driven agent did not reuse persisted metadata after restart" >&2 + exit 1 +fi + +echo "[INFO] Env-driven agent restart succeeded with persisted metadata" diff --git a/src/agent/tests/scripts/07_down.sh b/src/agent/tests/scripts/08_down.sh similarity index 89% rename from src/agent/tests/scripts/07_down.sh rename to src/agent/tests/scripts/08_down.sh index b9674ee..4accf14 100755 --- a/src/agent/tests/scripts/07_down.sh +++ b/src/agent/tests/scripts/08_down.sh @@ -13,7 +13,7 @@ compose() { fi } -docker container rm -f argus-agent-e2e >/dev/null 2>&1 || true +docker container rm -f argus-agent-e2e argus-agent-env-e2e >/dev/null 2>&1 || true pushd "$TEST_ROOT" >/dev/null compose down --remove-orphans diff --git a/src/agent/tests/scripts/08_verify_agent.sh b/src/agent/tests/scripts/08_verify_agent.sh deleted file mode 100755 index 8b347b0..0000000 --- a/src/agent/tests/scripts/08_verify_agent.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 
-TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" -VERIFY_SCRIPT="$(cd "$TEST_ROOT/.." && pwd)/scripts/agent_deployment_verify.sh" - -if ! docker ps --format '{{.Names}}' | grep -q '^argus-agent-e2e$'; then - echo "[WARN] agent container not running; skip verification" - exit 0 -fi - -if docker exec -i argus-agent-e2e bash -lc 'command -v curl >/dev/null && command -v jq >/dev/null'; then - echo "[INFO] curl/jq already installed in agent container" -else - echo "[INFO] Installing curl/jq in agent container" - docker exec -i argus-agent-e2e bash -lc 'apt-get update >/dev/null 2>&1 && apt-get install -y curl jq >/dev/null 2>&1' || true -fi - -if docker exec -i argus-agent-e2e bash -lc 'command -v /usr/local/bin/agent_deployment_verify.sh >/dev/null'; then - docker exec -i argus-agent-e2e /usr/local/bin/agent_deployment_verify.sh -elif [[ -x "$VERIFY_SCRIPT" ]]; then - docker exec -i argus-agent-e2e "$VERIFY_SCRIPT" -else - echo "[WARN] agent_deployment_verify.sh not found" -fi diff --git a/src/agent/tests/test_config_metadata.py b/src/agent/tests/test_config_metadata.py new file mode 100644 index 0000000..2ddd45a --- /dev/null +++ b/src/agent/tests/test_config_metadata.py @@ -0,0 +1,151 @@ +from __future__ import annotations + +import os +import unittest +from contextlib import contextmanager +from unittest.mock import patch + +from app.config import AgentConfig, load_config + + +@contextmanager +def temp_env(**overrides: str | None): + originals: dict[str, str | None] = {} + try: + for key, value in overrides.items(): + originals[key] = os.environ.get(key) + if value is None: + os.environ.pop(key, None) + else: + os.environ[key] = value + yield + finally: + for key, original in originals.items(): + if original is None: + os.environ.pop(key, None) + else: + os.environ[key] = original + + +class LoadConfigMetadataTests(unittest.TestCase): + @patch("app.config.Path.mkdir") + def test_metadata_from_environment_variables(self, mock_mkdir): + with temp_env( + 
MASTER_ENDPOINT="http://master.local", + AGENT_HOSTNAME="dev-user-one-pod", + AGENT_ENV="prod", + AGENT_USER="ops", + AGENT_INSTANCE="node-1", + ): + config = load_config() + + self.assertEqual(config.environment, "prod") + self.assertEqual(config.user, "ops") + self.assertEqual(config.instance, "node-1") + mock_mkdir.assert_called() + + @patch("app.config.Path.mkdir") + def test_metadata_falls_back_to_hostname(self, mock_mkdir): + with temp_env( + MASTER_ENDPOINT="http://master.local", + AGENT_HOSTNAME="qa-team-abc-pod-2", + AGENT_ENV=None, + AGENT_USER=None, + AGENT_INSTANCE=None, + ): + config = load_config() + + self.assertEqual(config.environment, "qa") + self.assertEqual(config.user, "team") + self.assertEqual(config.instance, "abc") + mock_mkdir.assert_called() + + @patch("app.config._load_metadata_from_state", return_value=("prod", "ops", "node-1")) + @patch("app.config.Path.mkdir") + def test_metadata_from_node_state(self, mock_mkdir, mock_state): + with temp_env( + MASTER_ENDPOINT="http://master.local", + AGENT_HOSTNAME="host_abc", + AGENT_ENV=None, + AGENT_USER=None, + AGENT_INSTANCE=None, + ): + config = load_config() + + self.assertEqual(config.environment, "prod") + self.assertEqual(config.user, "ops") + self.assertEqual(config.instance, "node-1") + mock_state.assert_called_once() + mock_mkdir.assert_called() + + @patch("app.config.Path.mkdir") + def test_partial_environment_variables_fallback(self, mock_mkdir): + with temp_env( + MASTER_ENDPOINT="http://master.local", + AGENT_HOSTNAME="stage-ml-001-node", + AGENT_ENV="prod", + AGENT_USER=None, + AGENT_INSTANCE=None, + ): + config = load_config() + + self.assertEqual(config.environment, "stage") + self.assertEqual(config.user, "ml") + self.assertEqual(config.instance, "001") + mock_mkdir.assert_called() + + @patch("app.config.Path.mkdir") + def test_invalid_hostname_raises_error(self, mock_mkdir): + with temp_env( + MASTER_ENDPOINT="http://master.local", + AGENT_HOSTNAME="invalidhostname", + 
AGENT_ENV=None, + AGENT_USER=None, + AGENT_INSTANCE=None, + ): + with self.assertRaises(ValueError): + load_config() + + mock_mkdir.assert_not_called() + + +class CollectMetadataTests(unittest.TestCase): + @patch("app.collector._detect_ip_address", return_value="127.0.0.1") + @patch("app.collector._detect_gpu_count", return_value=0) + @patch("app.collector._detect_memory_bytes", return_value=1024) + @patch("app.collector._detect_cpu_count", return_value=8) + def test_collect_metadata_uses_config_fields( + self, + mock_cpu, + mock_memory, + mock_gpu, + mock_ip, + ): + config = AgentConfig( + hostname="dev-user-001-pod", + environment="prod", + user="ops", + instance="node-1", + node_file="/tmp/node.json", + version="1.0.0", + master_endpoint="http://master.local", + report_interval_seconds=60, + health_dir="/tmp/health", + ) + + from app.collector import collect_metadata + + metadata = collect_metadata(config) + + self.assertEqual(metadata["env"], "prod") + self.assertEqual(metadata["user"], "ops") + self.assertEqual(metadata["instance"], "node-1") + self.assertEqual(metadata["hostname"], "dev-user-001-pod") + self.assertEqual(metadata["ip"], "127.0.0.1") + self.assertEqual(metadata["cpu_number"], 8) + self.assertEqual(metadata["memory_in_bytes"], 1024) + self.assertEqual(metadata["gpu_number"], 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/src/bind/build/dns-monitor.sh b/src/bind/build/dns-monitor.sh index 2890b47..12fdb76 100644 --- a/src/bind/build/dns-monitor.sh +++ b/src/bind/build/dns-monitor.sh @@ -17,6 +17,9 @@ log_message() { log_message "DNS监控脚本启动" +log_message "删除DNS备份文件(如果存在)" +rm -f $DNS_BACKUP + while true; do if [ -f "$DNS_CONF" ]; then if [ -f "$DNS_BACKUP" ]; then diff --git a/src/log/kibana/build/kibana-post-start.sh b/src/log/kibana/build/kibana-post-start.sh index d4b30e0..8b96945 100644 --- a/src/log/kibana/build/kibana-post-start.sh +++ b/src/log/kibana/build/kibana-post-start.sh @@ -2,7 +2,7 @@ set -euo pipefail 
ES_HOST="${ELASTICSEARCH_HOSTS:-http://es:9200}" -KB_HOST="http://localhost:5601" +KB_HOST="${KB_HOST:-http://127.0.0.1:5601}" echo "[INFO] Starting Kibana post-start configuration..." @@ -83,50 +83,37 @@ fix_replicas_idempotent() { } # 幂等创建数据视图 +create_or_ensure_data_view() { + local name="$1" + local title="$2" + + local list_response + list_response=$(curl -fsS "$KB_HOST/api/data_views" -H 'kbn-xsrf: true' 2>/dev/null || echo "") + + if [ -z "$list_response" ]; then + echo "[WARN] Failed to list data views, skipping creation check for $title" + return + fi + + if echo "$list_response" | grep -Fq "\"title\":\"$title\""; then + echo "[INFO] Data view $title already exists, skipping" + return + fi + + echo "[INFO] Creating data view for $title indices (allowNoIndex)" + + curl -fsS -X POST "$KB_HOST/api/data_views/data_view?allowNoIndex=true" \ + -H 'kbn-xsrf: true' \ + -H 'Content-Type: application/json' \ + -d "{\"data_view\":{\"name\":\"$name\",\"title\":\"$title\",\"timeFieldName\":\"@timestamp\",\"allowNoIndex\":true}}" \ + >/dev/null && echo "[OK] Created $name data view" || echo "[WARN] Failed to create $name data view" +} + create_data_views_idempotent() { echo "[INFO] Checking and creating data views..." 
- # 检查是否存在匹配的索引 - local train_indices=$(curl -s "$ES_HOST/_cat/indices/train-*?h=index" 2>/dev/null | wc -l || echo "0") - local infer_indices=$(curl -s "$ES_HOST/_cat/indices/infer-*?h=index" 2>/dev/null | wc -l || echo "0") - - # 创建 train 数据视图 - if [ "$train_indices" -gt 0 ]; then - # 检查数据视图是否已存在 - local train_exists=$(curl -s "$KB_HOST/api/data_views" -H 'kbn-xsrf: true' 2>/dev/null | grep '"title":"train-\*"' | wc -l ) - - if [ "$train_exists" -eq 0 ]; then - echo "[INFO] Creating data view for train-* indices" - curl -fsS -X POST "$KB_HOST/api/data_views/data_view" \ - -H 'kbn-xsrf: true' \ - -H 'Content-Type: application/json' \ - -d '{"data_view":{"name":"train","title":"train-*","timeFieldName":"@timestamp"}}' \ - >/dev/null && echo "[OK] Created train data view" || echo "[WARN] Failed to create train data view" - else - echo "[INFO] Train data view already exists, skipping" - fi - else - echo "[INFO] No train-* indices found, skipping train data view creation" - fi - - # 创建 infer 数据视图 - if [ "$infer_indices" -gt 0 ]; then - # 检查数据视图是否已存在 - local infer_exists=$(curl -s "$KB_HOST/api/data_views" -H 'kbn-xsrf: true' 2>/dev/null | grep '"title":"infer-\*"' | wc -l ) - - if [ "$infer_exists" -eq 0 ]; then - echo "[INFO] Creating data view for infer-* indices" - curl -fsS -X POST "$KB_HOST/api/data_views/data_view" \ - -H 'kbn-xsrf: true' \ - -H 'Content-Type: application/json' \ - -d '{"data_view":{"name":"infer","title":"infer-*","timeFieldName":"@timestamp"}}' \ - >/dev/null && echo "[OK] Created infer data view" || echo "[WARN] Failed to create infer data view" - else - echo "[INFO] Infer data view already exists, skipping" - fi - else - echo "[INFO] No infer-* indices found, skipping infer data view creation" - fi + create_or_ensure_data_view "train" "train-*" + create_or_ensure_data_view "infer" "infer-*" } # 主逻辑 diff --git a/src/log/tests/scripts/e2e_test.sh b/src/log/tests/scripts/e2e_test.sh index fbe5197..ed88803 100755 --- 
a/src/log/tests/scripts/e2e_test.sh +++ b/src/log/tests/scripts/e2e_test.sh @@ -115,20 +115,32 @@ show_step "Health" "Check service health" echo "[INFO] Checking service health..." # 检查 Elasticsearch 健康状态 +health_check_ok=1 es_health=$(curl -s "http://localhost:9200/_cluster/health" | grep -o '"status":"[^"]*"' | cut -d'"' -f4) if [ "$es_health" = "green" ] || [ "$es_health" = "yellow" ]; then echo "✅ Elasticsearch health: $es_health" else echo "❌ Elasticsearch health: $es_health" + health_check_ok=0 fi # 检查 Kibana 状态 if curl -fs "http://localhost:5601/api/status" >/dev/null 2>&1; then kb_status="available" echo "✅ Kibana status: $kb_status" + + data_views_json=$(curl -fs "http://localhost:5601/api/data_views" -H 'kbn-xsrf: true' 2>/dev/null || true) + if echo "$data_views_json" | grep -F '"title":"train-*"' >/dev/null 2>&1 && \ + echo "$data_views_json" | grep -F '"title":"infer-*"' >/dev/null 2>&1; then + echo "✅ Kibana data views: train-* and infer-* present" + else + echo "❌ Kibana data views missing: train-* or infer-*" + health_check_ok=0 + fi else kb_status="unavailable" echo "⚠️ Kibana status: $kb_status" + health_check_ok=0 fi # 检查 Fluent-Bit 指标 @@ -139,6 +151,13 @@ if [ "$fb_host01_uptime" -gt 0 ] && [ "$fb_host02_uptime" -gt 0 ]; then echo "✅ Fluent-Bit services: host01 uptime=${fb_host01_uptime}s, host02 uptime=${fb_host02_uptime}s" else echo "⚠️ Fluent-Bit services: host01 uptime=${fb_host01_uptime}s, host02 uptime=${fb_host02_uptime}s" + health_check_ok=0 +fi + +if [ "$health_check_ok" -eq 1 ]; then + true +else + false fi verify_step "Service health check" diff --git a/src/master/scripts/build_images.sh b/src/master/scripts/build_images.sh index ebb8060..914cadb 100755 --- a/src/master/scripts/build_images.sh +++ b/src/master/scripts/build_images.sh @@ -3,12 +3,13 @@ set -euo pipefail usage() { cat >&2 <<'USAGE' -Usage: $0 [--intranet] [--offline] [--tag ] +Usage: $0 [--intranet] [--offline] [--tag ] [--no-cache] Options: --intranet 使用指定的 PyPI 
镜像源(默认清华镜像)。 --offline 完全离线构建,依赖 offline_wheels/ 目录中的离线依赖包。 --tag 自定义镜像标签,默认 argus-master:latest。 + --no-cache 不使用 Docker 构建缓存。 USAGE } @@ -19,6 +20,7 @@ IMAGE_TAG="${IMAGE_TAG:-argus-master:latest}" DOCKERFILE="src/master/Dockerfile" BUILD_ARGS=() OFFLINE_MODE=0 +NO_CACHE=0 source "$PROJECT_ROOT/scripts/common/build_user.sh" load_build_user @@ -45,6 +47,11 @@ while [[ "$#" -gt 0 ]]; do IMAGE_TAG="$2" shift 2 ;; + --no-cache) + NO_CACHE=1 + BUILD_ARGS+=("--no-cache") + shift + ;; -h|--help) usage exit 0 diff --git a/src/sys/debug/.env.example b/src/sys/debug/.env.example new file mode 100644 index 0000000..4ee2fa5 --- /dev/null +++ b/src/sys/debug/.env.example @@ -0,0 +1,12 @@ +# Generated by 01_bootstrap.sh +SYS_DEBUG_PRIVATE_CORE=/absolute/path/to/private +SYS_DEBUG_PRIVATE_NODEA=/absolute/path/to/private-nodea +SYS_DEBUG_PRIVATE_NODEB=/absolute/path/to/private-nodeb +SYS_DEBUG_TMP_DIR=/absolute/path/to/tmp +SYS_DEBUG_NETWORK_NAME=argus-debug-net +SYS_DEBUG_NETWORK_SUBNET=172.30.0.0/16 +SYS_DEBUG_NETWORK_GATEWAY=172.30.0.1 +SYS_DEBUG_PROJECT_NAME=argus-debug +SYS_DEBUG_CONTAINER_PREFIX=argus-debug +ARGUS_BUILD_UID=2133 +ARGUS_BUILD_GID=2015 diff --git a/src/sys/debug/README.md b/src/sys/debug/README.md new file mode 100644 index 0000000..cebfaa4 --- /dev/null +++ b/src/sys/debug/README.md @@ -0,0 +1,68 @@ +# ARGUS 系统调试部署模式 + +该目录提供基于系统级 E2E 测试构建的调试部署流程,便于本地快速复现与排查问题。核心特性: + +- 独立 docker 网络 `argus-debug-net`(默认子网 `172.30.0.0/16`),避免与 `src/sys/tests` 冲突。 +- 私有数据目录可通过参数自定义,例如 `--private-root /tmp/argus-debug`。 +- 默认保留调试过程生成的文件,避免 `down`/`bootstrap` 自动删除。 + +## 快速开始 + +```bash +cd src/sys/debug + +# 仅首次需要,创建 external 网络 +./scripts/network-create.sh + +# 初始化目录/构建 agent/写入 .env +./scripts/01_bootstrap.sh --private-root /tmp/argus-debug + +# 启动调试栈 +./scripts/02_up.sh + +# 根据需要执行验证脚本(03~08) +./scripts/03_wait_ready.sh +... 
+ +# 调试结束停止服务 +./scripts/09_down.sh + +# 若需移除网络或数据 +./scripts/network-destroy.sh +./scripts/clean-data.sh +``` + +> **提示**:调试与测试栈不能同时运行,应保持 `src/sys/tests` 中的 `argus-sys` 栈已停止。 + +## 参数与环境变量 + +- `--private-root `:同时指定核心服务与两个节点的私有目录根,脚本自动派生 `private`、`private-nodea`、`private-nodeb`。 +- `--private-core `、`--private-nodea `、`--private-nodeb `:分别覆盖单独目录。 +- 环境变量可覆盖 `.env` 中写入的值,例如 `export SYS_DEBUG_NETWORK_NAME=my-debug-net`。 +- `.env` 文件字段: + - `SYS_DEBUG_PRIVATE_CORE` + - `SYS_DEBUG_PRIVATE_NODEA` + - `SYS_DEBUG_PRIVATE_NODEB` + - `SYS_DEBUG_TMP_DIR` + - `SYS_DEBUG_NETWORK_NAME` + - `SYS_DEBUG_NETWORK_SUBNET` + - `SYS_DEBUG_NETWORK_GATEWAY` + - `SYS_DEBUG_PROJECT_NAME` + - `SYS_DEBUG_CONTAINER_PREFIX` + - `ARGUS_BUILD_UID` / `ARGUS_BUILD_GID` + +## 脚本说明 + +- `scripts/common.sh`:通用函数与环境加载。 +- `scripts/network-create.sh` / `network-destroy.sh`:管理 external 网络。 +- `scripts/00_debug_all.sh`:顺序执行 01~08(默认不执行 09)。 +- `scripts/clean-data.sh`:选择性清理宿主机私有数据。 +- `scripts/03_wait_ready.sh`:除了等待各服务就绪,还会在 Elasticsearch 就绪后自动将磁盘水位阈值放宽(97%/98%/99%),避免在磁盘紧张的调试环境中分片分配失败。 +- `scripts/08_restart_agent_reregister.sh`:将 node-b 切换到 `SYS_DEBUG_NODEB_FIXED_IP`(默认 `172.30.0.200`),如果目标地址与当前 IP 相同脚本会报错提醒重新选择地址。 +- 其它 `01~09` 与测试目录对应,但针对参数化路径及网络做了调整。 + +## 注意事项 + +- 若宿主机未安装 Docker,脚本将提示错误并退出。 +- 当指定的私有目录已存在数据时,脚本不会清理,请确认内容安全后再复用。 +- 与测试环境共用镜像:请提前执行仓库根目录的 `./build/build_images.sh`。 diff --git a/src/sys/debug/docker-compose.yml b/src/sys/debug/docker-compose.yml new file mode 100644 index 0000000..c11f777 --- /dev/null +++ b/src/sys/debug/docker-compose.yml @@ -0,0 +1,147 @@ +version: "3.8" + +networks: + argus-debug-net: + external: true + name: ${SYS_DEBUG_NETWORK_NAME:-argus-debug-net} + +services: + bind: + image: ${BIND_IMAGE_TAG:-argus-bind9:latest} + container_name: ${SYS_DEBUG_CONTAINER_PREFIX:-argus-debug}-bind + networks: + argus-debug-net: + ipv4_address: ${SYS_DEBUG_BIND_IP:-172.30.0.2} + volumes: + - ${SYS_DEBUG_PRIVATE_CORE}:/private + restart: unless-stopped + + master: + image: 
${MASTER_IMAGE_TAG:-argus-master:latest} + container_name: ${SYS_DEBUG_CONTAINER_PREFIX:-argus-debug}-master + depends_on: + - bind + environment: + - OFFLINE_THRESHOLD_SECONDS=6 + - ONLINE_THRESHOLD_SECONDS=2 + - SCHEDULER_INTERVAL_SECONDS=1 + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + ports: + - "32300:3000" + volumes: + - ${SYS_DEBUG_PRIVATE_CORE}/argus/master:/private/argus/master + - ${SYS_DEBUG_PRIVATE_CORE}/argus/metric/prometheus:/private/argus/metric/prometheus + - ${SYS_DEBUG_PRIVATE_CORE}/argus/etc:/private/argus/etc + networks: + argus-debug-net: + ipv4_address: ${SYS_DEBUG_MASTER_IP:-172.30.0.10} + restart: unless-stopped + + es: + image: ${ES_IMAGE_TAG:-argus-elasticsearch:latest} + container_name: ${SYS_DEBUG_CONTAINER_PREFIX:-argus-debug}-es + environment: + - discovery.type=single-node + - xpack.security.enabled=false + - ES_JAVA_OPTS=-Xms512m -Xmx512m + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + volumes: + - ${SYS_DEBUG_PRIVATE_CORE}/argus/log/elasticsearch:/private/argus/log/elasticsearch + - ${SYS_DEBUG_PRIVATE_CORE}/argus/etc:/private/argus/etc + ports: + - "9200:9200" + networks: + argus-debug-net: + ipv4_address: ${SYS_DEBUG_ES_IP:-172.30.0.20} + restart: unless-stopped + + kibana: + image: ${KIBANA_IMAGE_TAG:-argus-kibana:latest} + container_name: ${SYS_DEBUG_CONTAINER_PREFIX:-argus-debug}-kibana + environment: + - ELASTICSEARCH_HOSTS=http://es.log.argus.com:9200 + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + volumes: + - ${SYS_DEBUG_PRIVATE_CORE}/argus/log/kibana:/private/argus/log/kibana + - ${SYS_DEBUG_PRIVATE_CORE}/argus/etc:/private/argus/etc + depends_on: + - es + ports: + - "5601:5601" + networks: + argus-debug-net: + ipv4_address: ${SYS_DEBUG_KIBANA_IP:-172.30.0.30} + restart: unless-stopped + + node-a: + image: ubuntu:22.04 + container_name: ${SYS_DEBUG_CONTAINER_PREFIX:-argus-debug}-node-a + 
hostname: ${SYS_DEBUG_NODEA_HOST:-dev-yyrshare-nbnyx10-cp2f-pod-0} + depends_on: + - master + - bind + - es + environment: + - MASTER_ENDPOINT=http://master.argus.com:3000 + - REPORT_INTERVAL_SECONDS=2 + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + - ES_HOST=es + - ES_PORT=9200 + - CLUSTER=local + - RACK=dev + volumes: + - ${SYS_DEBUG_PRIVATE_NODEA}/argus/agent/${SYS_DEBUG_NODEA_HOST:-dev-yyrshare-nbnyx10-cp2f-pod-0}:/private/argus/agent/${SYS_DEBUG_NODEA_HOST:-dev-yyrshare-nbnyx10-cp2f-pod-0} + - ../../agent/dist/argus-agent:/usr/local/bin/argus-agent:ro + - ../tests/scripts/node_entrypoint.sh:/usr/local/bin/node-entrypoint.sh:ro + - ../../log/fluent-bit/build/start-fluent-bit.sh:/assets/start-fluent-bit.sh:ro + - ../../log/fluent-bit/build/etc:/assets/fluent-bit/etc:ro + - ../../log/fluent-bit/build/packages:/assets/fluent-bit/packages:ro + entrypoint: + - /usr/local/bin/node-entrypoint.sh + dns: + - ${SYS_DEBUG_BIND_IP:-172.30.0.2} + ports: + - "2020:2020" + networks: + argus-debug-net: + ipv4_address: ${SYS_DEBUG_NODEA_IP:-172.30.0.101} + restart: unless-stopped + + node-b: + image: ubuntu:22.04 + container_name: ${SYS_DEBUG_CONTAINER_PREFIX:-argus-debug}-node-b + hostname: ${SYS_DEBUG_NODEB_HOST:-dev-yyrshare-uuuu10-ep2f-pod-0} + depends_on: + - master + - bind + - es + environment: + - MASTER_ENDPOINT=http://master.argus.com:3000 + - REPORT_INTERVAL_SECONDS=2 + - ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133} + - ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015} + - ES_HOST=es + - ES_PORT=9200 + - CLUSTER=local + - RACK=dev + volumes: + - ${SYS_DEBUG_PRIVATE_NODEB}/argus/agent/${SYS_DEBUG_NODEB_HOST:-dev-yyrshare-uuuu10-ep2f-pod-0}:/private/argus/agent/${SYS_DEBUG_NODEB_HOST:-dev-yyrshare-uuuu10-ep2f-pod-0} + - ../../agent/dist/argus-agent:/usr/local/bin/argus-agent:ro + - ../tests/scripts/node_entrypoint.sh:/usr/local/bin/node-entrypoint.sh:ro + - ../../log/fluent-bit/build/start-fluent-bit.sh:/assets/start-fluent-bit.sh:ro + - 
../../log/fluent-bit/build/etc:/assets/fluent-bit/etc:ro + - ../../log/fluent-bit/build/packages:/assets/fluent-bit/packages:ro + entrypoint: + - /usr/local/bin/node-entrypoint.sh + dns: + - ${SYS_DEBUG_BIND_IP:-172.30.0.2} + ports: + - "2021:2020" + networks: + argus-debug-net: + ipv4_address: ${SYS_DEBUG_NODEB_IP:-172.30.0.102} + restart: unless-stopped diff --git a/src/sys/debug/scripts/00_debug_all.sh b/src/sys/debug/scripts/00_debug_all.sh new file mode 100755 index 0000000..6e39309 --- /dev/null +++ b/src/sys/debug/scripts/00_debug_all.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +SCRIPTS=( + "01_bootstrap.sh" + "02_up.sh" + "03_wait_ready.sh" + "04_verify_dns_routing.sh" + "05_agent_register.sh" + "06_write_health_and_assert.sh" + "07_logs_send_and_assert.sh" + "08_restart_agent_reregister.sh" +) + +for script in "${SCRIPTS[@]}"; do + echo "[SYS-DEBUG] Running $script" + "$SCRIPT_DIR/$script" + echo "[SYS-DEBUG] $script completed" + echo +done + +echo "[SYS-DEBUG] Complete. Run scripts/09_down.sh when finished (data retained)." 
diff --git a/src/sys/debug/scripts/01_bootstrap.sh b/src/sys/debug/scripts/01_bootstrap.sh new file mode 100755 index 0000000..e044e5e --- /dev/null +++ b/src/sys/debug/scripts/01_bootstrap.sh @@ -0,0 +1,210 @@ +#!/usr/bin/env bash +set -euo pipefail + +# shellcheck source=common.sh +source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh" + +PRIVATE_ROOT="" +PRIVATE_CORE="$SYS_DEBUG_PRIVATE_CORE" +PRIVATE_NODEA="$SYS_DEBUG_PRIVATE_NODEA" +PRIVATE_NODEB="$SYS_DEBUG_PRIVATE_NODEB" +TMP_DIR_VAL="$SYS_DEBUG_TMP_DIR" +NETWORK_NAME="$SYS_DEBUG_NETWORK_NAME" +NETWORK_SUBNET="$SYS_DEBUG_NETWORK_SUBNET" +NETWORK_GATEWAY="$SYS_DEBUG_NETWORK_GATEWAY" +PROJECT_NAME="$SYS_DEBUG_PROJECT_NAME" +CONTAINER_PREFIX="$SYS_DEBUG_CONTAINER_PREFIX" +NODEB_FIXED_IP=${SYS_DEBUG_NODEB_FIXED_IP:-172.30.0.200} + +usage() { + cat <&2; exit 1; } + PRIVATE_ROOT="$1" + ;; + --private-root=*) + PRIVATE_ROOT="${1#*=}" + ;; + --private-core) + shift; [[ $# -gt 0 ]] || { echo "--private-core requires value" >&2; exit 1; } + PRIVATE_CORE="$1" + ;; + --private-core=*) + PRIVATE_CORE="${1#*=}" + ;; + --private-nodea) + shift; [[ $# -gt 0 ]] || { echo "--private-nodea requires value" >&2; exit 1; } + PRIVATE_NODEA="$1" + ;; + --private-nodea=*) + PRIVATE_NODEA="${1#*=}" + ;; + --private-nodeb) + shift; [[ $# -gt 0 ]] || { echo "--private-nodeb requires value" >&2; exit 1; } + PRIVATE_NODEB="$1" + ;; + --private-nodeb=*) + PRIVATE_NODEB="${1#*=}" + ;; + --tmp-dir) + shift; [[ $# -gt 0 ]] || { echo "--tmp-dir requires value" >&2; exit 1; } + TMP_DIR_VAL="$1" + ;; + --tmp-dir=*) + TMP_DIR_VAL="${1#*=}" + ;; + --network-name) + shift; [[ $# -gt 0 ]] || { echo "--network-name requires value" >&2; exit 1; } + NETWORK_NAME="$1" + ;; + --network-name=*) + NETWORK_NAME="${1#*=}" + ;; + --network-subnet) + shift; [[ $# -gt 0 ]] || { echo "--network-subnet requires value" >&2; exit 1; } + NETWORK_SUBNET="$1" + ;; + --network-subnet=*) + NETWORK_SUBNET="${1#*=}" + ;; + --network-gateway) + shift; [[ $# -gt 
0 ]] || { echo "--network-gateway requires value" >&2; exit 1; } + NETWORK_GATEWAY="$1" + ;; + --network-gateway=*) + NETWORK_GATEWAY="${1#*=}" + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown argument: $1" >&2 + usage >&2 + exit 1 + ;; + esac + shift +done + +if [[ -n "$PRIVATE_ROOT" ]]; then + PRIVATE_CORE="$PRIVATE_ROOT/private" + PRIVATE_NODEA="$PRIVATE_ROOT/private-nodea" + PRIVATE_NODEB="$PRIVATE_ROOT/private-nodeb" +fi + +PRIVATE_CORE=$(abs_path "$PRIVATE_CORE") +PRIVATE_NODEA=$(abs_path "$PRIVATE_NODEA") +PRIVATE_NODEB=$(abs_path "$PRIVATE_NODEB") +TMP_DIR_VAL=$(abs_path "$TMP_DIR_VAL") + +log "Preparing directories under $PRIVATE_CORE" +mkdir -p \ + "$PRIVATE_CORE/argus/etc" \ + "$PRIVATE_CORE/argus/bind" \ + "$PRIVATE_CORE/argus/master" \ + "$PRIVATE_CORE/argus/metric/prometheus" \ + "$PRIVATE_CORE/argus/log/elasticsearch" \ + "$PRIVATE_CORE/argus/log/kibana" \ + "$PRIVATE_NODEA/argus/agent/$HOST_A/health" \ + "$PRIVATE_NODEB/argus/agent/$HOST_B/health" \ + "$TMP_DIR_VAL" + +log "Aligning ownership for core directories" +chown -R "${ARGUS_BUILD_UID}:${ARGUS_BUILD_GID}" \ + "$PRIVATE_CORE/argus/log/elasticsearch" \ + "$PRIVATE_CORE/argus/log/kibana" \ + "$PRIVATE_CORE/argus/etc" 2>/dev/null || true + +log "Distributing update-dns.sh" +BIND_UPDATE_SRC="$REPO_ROOT/src/bind/build/update-dns.sh" +BIND_UPDATE_DEST="$PRIVATE_CORE/argus/etc/update-dns.sh" +if [[ -f "$BIND_UPDATE_SRC" ]]; then + cp "$BIND_UPDATE_SRC" "$BIND_UPDATE_DEST" + chmod +x "$BIND_UPDATE_DEST" +else + echo "[WARN] Missing $BIND_UPDATE_SRC" >&2 +fi + +require_docker + +ensure_image() { + local image="$1" + if ! docker image inspect "$image" >/dev/null 2>&1; then + echo "[ERR] Missing image: $image. 
Run ./build/build_images.sh" >&2 + exit 1 + fi +} + +log "Ensuring required images exist" +ensure_image "${ES_IMAGE_TAG:-argus-elasticsearch:latest}" +ensure_image "${KIBANA_IMAGE_TAG:-argus-kibana:latest}" +ensure_image "${BIND_IMAGE_TAG:-argus-bind9:latest}" +ensure_image "${MASTER_IMAGE_TAG:-argus-master:latest}" + +log "Building agent binary" +pushd "$REPO_ROOT/src/agent" >/dev/null +./scripts/build_binary.sh +popd >/dev/null + +AGENT_BIN="$REPO_ROOT/src/agent/dist/argus-agent" +if [[ ! -x "$AGENT_BIN" ]]; then + echo "[ERR] Agent binary not found at $AGENT_BIN" >&2 + exit 1 +fi +echo "$AGENT_BIN" > "$TMP_DIR_VAL/agent_binary_path" + +log "Preparing environment file contents" +tmp_env="$(mktemp)" +cat > "$tmp_env" </dev/null 2>&1; then + echo "[ERR] Network $SYS_DEBUG_NETWORK_NAME not found. Run scripts/network-create.sh first." >&2 + exit 1 +fi + +log "Starting debug stack on project $SYS_DEBUG_PROJECT_NAME" +compose up -d + +log "Services started: master:32300 es:9200 kibana:5601 node-a:2020 node-b:2021" diff --git a/src/sys/debug/scripts/03_wait_ready.sh b/src/sys/debug/scripts/03_wait_ready.sh new file mode 100755 index 0000000..768d0f4 --- /dev/null +++ b/src/sys/debug/scripts/03_wait_ready.sh @@ -0,0 +1,84 @@ +#!/usr/bin/env bash +set -euo pipefail + +# shellcheck source=common.sh +source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh" + +ensure_env_file +ensure_paths_defined + +service_id() { + compose ps -q "$1" +} + +wait_http() { + local url="$1"; local attempts="${2:-120}"; local i=1 + while (( i <= attempts )); do + if curl -fsS "$url" >/dev/null 2>&1; then + return 0 + fi + echo "[..] waiting $url ($i/$attempts)" + sleep 5 + ((i++)) + done + echo "[ERR] Timeout waiting for $url" >&2 + return 1 +} + +log "Waiting for ES/Kibana/Master/Fluent Bit/Bind" + +attempt=1; max=120 +while (( attempt <= max )); do + if curl -fsS "http://localhost:9200/_cluster/health?wait_for_status=yellow&timeout=1s" >/dev/null 2>&1; then + break + fi + echo "[..] 
waiting ES ($attempt/$max)" + sleep 5 + ((attempt++)) +done +if (( attempt > max )); then + echo "[ERR] ES not ready" >&2 + exit 1 +fi + +log "Applying relaxed ES disk watermarks for debug" +curl -fsS -XPUT "http://localhost:9200/_cluster/settings" \ + -H 'Content-Type: application/json' \ + -d '{ + "transient": { + "cluster.routing.allocation.disk.watermark.low": "99%", + "cluster.routing.allocation.disk.watermark.high": "99%", + "cluster.routing.allocation.disk.watermark.flood_stage": "99%" + } + }' >/dev/null || echo "[WARN] Failed to adjust ES watermarks" + +log "Waiting for Kibana to be available (HTTP 200)" +kb_attempt=1; kb_max=180 +while (( kb_attempt <= kb_max )); do + body=$(curl -sS "http://localhost:5601/api/status" 2>/dev/null || true) + code=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:5601/api/status" || echo 000) + if [[ "$code" == "200" ]] && echo "$body" | grep -q '"level":"available"'; then + log "Kibana available" + break + fi + echo "[..] waiting kibana 200 ($kb_attempt/$kb_max), last_code=$code" + sleep 5 + ((kb_attempt++)) +done +if (( kb_attempt > kb_max )); then + echo "[ERR] Kibana did not reach HTTP 200" >&2 + exit 1 +fi + +wait_http "http://localhost:32300/readyz" 120 +wait_http "http://localhost:2020/api/v2/metrics" 120 +wait_http "http://localhost:2021/api/v2/metrics" 120 + +BIND_ID="$(service_id bind)" +if [[ -n "$BIND_ID" ]]; then + docker exec "$BIND_ID" named-checkconf >/dev/null +else + echo "[WARN] bind container id not found" >&2 +fi + +log "All services are ready" diff --git a/src/sys/debug/scripts/04_verify_dns_routing.sh b/src/sys/debug/scripts/04_verify_dns_routing.sh new file mode 100755 index 0000000..4244e8d --- /dev/null +++ b/src/sys/debug/scripts/04_verify_dns_routing.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash +set -euo pipefail + +# shellcheck source=common.sh +source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh" + +ensure_env_file +ensure_paths_defined + +service_id() { + compose ps -q "$1" +} 
+ +log "Verifying DNS routing via bind" + +MASTER_FILE="$SYS_DEBUG_PRIVATE_CORE/argus/etc/master.argus.com" +if [[ ! -f "$MASTER_FILE" ]]; then + echo "[ERR] master.argus.com file missing at $MASTER_FILE" >&2 + exit 1 +fi +MASTER_IP_HOST="$(tr -d '\r\n' < "$MASTER_FILE" || true)" +log "master.argus.com file content: $MASTER_IP_HOST" + +BIN_ID="$(service_id bind)" +if [[ -n "$BIN_ID" ]]; then + DIG_IP="$(docker exec "$BIN_ID" dig +short master.argus.com A | tail -n1 || true)" + log "dig(master.argus.com) from bind container -> $DIG_IP" + if [[ -z "$DIG_IP" ]]; then + echo "[ERR] bind did not resolve master.argus.com" >&2 + exit 1 + fi +else + echo "[WARN] bind container not found; skip dig" >&2 +fi + +for node in node-a node-b; do + CID="$(service_id "$node")" + if [[ -z "$CID" ]]; then + echo "[ERR] Container for $node not found" >&2 + exit 1 + fi + log "Checking resolution inside $node" + if ! docker exec "$CID" getent hosts master.argus.com >/dev/null 2>&1; then + echo "[ERR] $node cannot resolve master.argus.com" >&2 + exit 1 + fi + RES="$(docker exec "$CID" getent hosts master.argus.com | awk '{print $1}' | head -n1)" + log "$node resolved master.argus.com -> $RES" +done + +log "DNS routing verified" diff --git a/src/sys/debug/scripts/05_agent_register.sh b/src/sys/debug/scripts/05_agent_register.sh new file mode 100755 index 0000000..ec41857 --- /dev/null +++ b/src/sys/debug/scripts/05_agent_register.sh @@ -0,0 +1,84 @@ +#!/usr/bin/env bash +set -euo pipefail + +# shellcheck source=common.sh +source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh" + +ensure_env_file +ensure_paths_defined + +TMP_DIR_LOCAL="$TMP_DIR" +mkdir -p "$TMP_DIR_LOCAL" + +API_BASE="http://localhost:32300/api/v1/master" + +log "Waiting for agent nodes to register" + +extract_node() { + local name="$1"; local output="$2"; local json_file="$3" + python3 - "$name" "$output" "$json_file" <<'PY' +import json, sys, pathlib +name = sys.argv[1] +out = pathlib.Path(sys.argv[2]) +json_file 
= sys.argv[3] +with open(json_file, 'r') as fh: + data = json.load(fh) +node = next((n for n in data if n.get("name") == name), None) +if node: + out.write_text(node["id"]) + print(node["id"]) +PY +} + +ID_A=""; ID_B="" +for _ in {1..60}; do + sleep 2 + resp=$(curl -fsS "$API_BASE/nodes" 2>/dev/null || true) + [[ -z "$resp" ]] && continue + if ! echo "$resp" | head -c1 | grep -q '\['; then + continue + fi + echo "$resp" > "$TMP_DIR_LOCAL/nodes_list.json" + ID_A=$(extract_node "$HOST_A" "$TMP_DIR_LOCAL/node_id_a" "$TMP_DIR_LOCAL/nodes_list.json" 2>/dev/null || true) + ID_B=$(extract_node "$HOST_B" "$TMP_DIR_LOCAL/node_id_b" "$TMP_DIR_LOCAL/nodes_list.json" 2>/dev/null || true) + if [[ -s "$TMP_DIR_LOCAL/node_id_a" && -s "$TMP_DIR_LOCAL/node_id_b" ]]; then + break + fi +done + +if [[ ! -s "$TMP_DIR_LOCAL/node_id_a" || ! -s "$TMP_DIR_LOCAL/node_id_b" ]]; then + echo "[ERR] Agents did not register in time" >&2 + exit 1 +fi + +node_detail() { + local id="$1"; local out="$2" + curl -fsS "$API_BASE/nodes/$id" -o "$out" +} + +node_detail "$(cat "$TMP_DIR_LOCAL/node_id_a")" "$TMP_DIR_LOCAL/detail_a.json" +node_detail "$(cat "$TMP_DIR_LOCAL/node_id_b")" "$TMP_DIR_LOCAL/detail_b.json" + +python3 - "$TMP_DIR_LOCAL/detail_a.json" "$TMP_DIR_LOCAL/initial_ip_a" <<'PY' +import json, sys, pathlib +node=json.load(open(sys.argv[1])) +ip=node.get("meta_data",{}).get("ip") +assert ip, "missing ip" +pathlib.Path(sys.argv[2]).write_text(ip) +PY + +python3 - "$TMP_DIR_LOCAL/detail_b.json" "$TMP_DIR_LOCAL/initial_ip_b" <<'PY' +import json, sys, pathlib +node=json.load(open(sys.argv[1])) +ip=node.get("meta_data",{}).get("ip") +assert ip, "missing ip" +pathlib.Path(sys.argv[2]).write_text(ip) +PY + +NODE_JSON_A="$SYS_DEBUG_PRIVATE_NODEA/argus/agent/$HOST_A/node.json" +NODE_JSON_B="$SYS_DEBUG_PRIVATE_NODEB/argus/agent/$HOST_B/node.json" + +[[ -f "$NODE_JSON_A" ]] || { echo "[ERR] node.json missing for $HOST_A" >&2; exit 1; } +[[ -f "$NODE_JSON_B" ]] || { echo "[ERR] node.json missing for 
$HOST_B" >&2; exit 1; } + +log "Agents registered: $(cat "$TMP_DIR_LOCAL/node_id_a") , $(cat "$TMP_DIR_LOCAL/node_id_b")" diff --git a/src/sys/debug/scripts/06_write_health_and_assert.sh b/src/sys/debug/scripts/06_write_health_and_assert.sh new file mode 100755 index 0000000..1cf85ca --- /dev/null +++ b/src/sys/debug/scripts/06_write_health_and_assert.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash +set -euo pipefail + +# shellcheck source=common.sh +source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh" + +ensure_env_file +ensure_paths_defined + +API_BASE="http://localhost:32300/api/v1/master" + +HEALTH_A="$SYS_DEBUG_PRIVATE_NODEA/argus/agent/$HOST_A/health" +HEALTH_B="$SYS_DEBUG_PRIVATE_NODEB/argus/agent/$HOST_B/health" + +write_health() { + local dir="$1"; mkdir -p "$dir" + cat > "$dir/log-fluentbit.json" < "$dir/metric-node-exporter.json" <&2; exit 1; } + +ID_A_VAL="$(cat "$ID_A")" +ID_B_VAL="$(cat "$ID_B")" + +check_health() { + local id="$1"; local tries=40 + for _ in $(seq 1 $tries); do + sleep 2 + resp=$(curl -fsS "$API_BASE/nodes/$id" 2>/dev/null || true) + [[ -z "$resp" ]] && continue + echo "$resp" > "$TMP_DIR/node_${id}_detail.json" + if python3 - "$TMP_DIR/node_${id}_detail.json" <<'PY' +import json,sys +node=json.load(open(sys.argv[1])) +h=node.get("health",{}) +if "log-fluentbit" in h and "metric-node-exporter" in h: + sys.exit(0) +sys.exit(1) +PY + then + return 0 + fi + done + return 1 +} + +check_health "$ID_A_VAL" || { echo "[ERR] health keys not reported for node A" >&2; exit 1; } +check_health "$ID_B_VAL" || { echo "[ERR] health keys not reported for node B" >&2; exit 1; } + +NODES_JSON="$SYS_DEBUG_PRIVATE_CORE/argus/metric/prometheus/nodes.json" +if [[ ! 
-f "$NODES_JSON" ]]; then + echo "[ERR] nodes.json missing at $NODES_JSON" >&2 + exit 1 +fi + +python3 - "$NODES_JSON" <<'PY' +import json,sys +with open(sys.argv[1]) as h: + nodes=json.load(h) +if not isinstance(nodes, list): + raise SystemExit("nodes.json expected list") +if len(nodes) != 2: + raise SystemExit(f"expected 2 nodes online, got {len(nodes)}") +PY + +log "Health reported and nodes.json has 2 online nodes" diff --git a/src/sys/debug/scripts/07_logs_send_and_assert.sh b/src/sys/debug/scripts/07_logs_send_and_assert.sh new file mode 100755 index 0000000..775a886 --- /dev/null +++ b/src/sys/debug/scripts/07_logs_send_and_assert.sh @@ -0,0 +1,73 @@ +#!/usr/bin/env bash +set -euo pipefail + +# shellcheck source=common.sh +source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh" + +ensure_env_file +ensure_paths_defined + +log "Sending logs and asserting ES counts" + +get_count() { + local idx="$1" + curl -s "http://localhost:9200/${idx}/_count?ignore_unavailable=true&allow_no_indices=true" | sed -E 's/.*"count":([0-9]+).*/\1/' | awk 'NF{print $0;exit} END{if(NR==0)print 0}' +} + +train0=$(get_count "train-*") +infer0=$(get_count "infer-*") +base=$((train0 + infer0)) +log "initial counts: train=${train0} infer=${infer0} total=${base}" + +service_id() { + compose ps -q "$1" +} + +send_logs() { + local sid="$1"; local hosttag="$2" + docker exec "$sid" sh -lc 'mkdir -p /logs/train /logs/infer' + docker exec "$sid" sh -lc "ts=\ +\$(date '+%F %T'); echo \"\$ts INFO [$hosttag] training step=1 loss=1.23 model=bert\" >> /logs/train/train-demo.log" + docker exec "$sid" sh -lc "ts=\ +\$(date '+%F %T'); echo \"\$ts INFO [$hosttag] training step=2 loss=1.10 model=bert\" >> /logs/train/train-demo.log" + docker exec "$sid" sh -lc "ts=\ +\$(date '+%F %T'); echo \"\$ts WARN [$hosttag] inference slow on batch=2 latency=1.9s\" >> /logs/infer/infer-demo.log" +} + +CID_A="$(service_id node-a)" +CID_B="$(service_id node-b)" + +[[ -n "$CID_A" && -n "$CID_B" ]] || { echo 
"[ERR] node containers not found" >&2; exit 1; } + +send_logs "$CID_A" "host01" +send_logs "$CID_B" "host02" + +log "Waiting for ES to ingest" +sleep 10 + +train1=$(get_count "train-*") +infer1=$(get_count "infer-*") +final=$((train1 + infer1)) +log "final counts: train=${train1} infer=${infer1} total=${final}" + +if (( final <= base )); then + echo "[ERR] ES total did not increase (${base} -> ${final})" >&2 + exit 1 +fi + +if (( final < 4 )); then + echo "[ERR] ES total below expected threshold: ${final} < 4" >&2 + exit 1 +fi + +es_health=$(curl -s "http://localhost:9200/_cluster/health" | grep -o '"status":"[^"]*"' | cut -d'"' -f4) +if [[ "$es_health" != "green" && "$es_health" != "yellow" ]]; then + echo "[ERR] ES health not green/yellow: $es_health" >&2 + exit 1 +fi + +if ! curl -fs "http://localhost:5601/api/status" >/dev/null 2>&1; then + echo "[WARN] Kibana status endpoint not available" +fi + +log "ES counts increased and services healthy" diff --git a/src/sys/debug/scripts/08_restart_agent_reregister.sh b/src/sys/debug/scripts/08_restart_agent_reregister.sh new file mode 100755 index 0000000..30b1298 --- /dev/null +++ b/src/sys/debug/scripts/08_restart_agent_reregister.sh @@ -0,0 +1,110 @@ +#!/usr/bin/env bash +set -euo pipefail + +# shellcheck source=common.sh +source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh" + +ensure_env_file +ensure_paths_defined + +API_BASE="http://localhost:32300/api/v1/master" +NODE_ENTRYPOINT="$DEBUG_ROOT/../tests/scripts/node_entrypoint.sh" +[[ -f "$NODE_ENTRYPOINT" ]] || { echo "[ERR] node entrypoint script missing at $NODE_ENTRYPOINT" >&2; exit 1; } + +TARGET_FIXED_IP="${SYS_DEBUG_NODEB_FIXED_IP:-172.30.0.200}" + +ID_B_FILE="$TMP_DIR/node_id_b" +IP_INIT_FILE="$TMP_DIR/initial_ip_b" +[[ -f "$ID_B_FILE" && -f "$IP_INIT_FILE" ]] || { echo "[ERR] Required node id/ip files missing in $TMP_DIR" >&2; exit 1; } + +ID_B="$(cat "$ID_B_FILE")" +IP0_B="$(cat "$IP_INIT_FILE")" + +DETAIL_BEFORE="$TMP_DIR/node_b_before.json" 
+curl -fsS "$API_BASE/nodes/$ID_B" -o "$DETAIL_BEFORE" +LAST0=$(python3 - "$DETAIL_BEFORE" <<'PY' +import json,sys +node=json.load(open(sys.argv[1])) +print(node.get("last_updated","")) +PY +) +IP_BEFORE=$(python3 - "$DETAIL_BEFORE" <<'PY' +import json,sys +node=json.load(open(sys.argv[1])) +print(node.get("meta_data",{}).get("ip","")) +PY +) + +if [[ "$IP_BEFORE" != "$IP0_B" ]]; then + echo "[ERR] Expected initial IP $IP0_B for node-b, got $IP_BEFORE" >&2 + exit 1 +fi + +if [[ "$IP_BEFORE" == "$TARGET_FIXED_IP" ]]; then + echo "[ERR] node-b current IP $IP_BEFORE already matches target $TARGET_FIXED_IP. Configure SYS_DEBUG_NODEB_FIXED_IP to a different address before rerun." >&2 + exit 1 +fi + +service_id() { + compose ps -q "$1" +} + +log "Recreating node-b (old IP $IP_BEFORE) with static IP $TARGET_FIXED_IP" +compose rm -sf node-b >/dev/null 2>&1 || true + +CONTAINER_NAME="${SYS_DEBUG_CONTAINER_PREFIX:-argus-debug}-node-b" +docker rm -f "$CONTAINER_NAME" >/dev/null 2>&1 || true + +AGENT_BIN_PATH="$(cat "$TMP_DIR/agent_binary_path")" +[[ -f "$AGENT_BIN_PATH" ]] || { echo "[ERR] Agent binary path missing in $TMP_DIR" >&2; exit 1; } + +require_docker + +docker run -d \ + --name "$CONTAINER_NAME" \ + --hostname "$HOST_B" \ + --network "$SYS_DEBUG_NETWORK_NAME" \ + --ip "$TARGET_FIXED_IP" \ + --dns "${SYS_DEBUG_BIND_IP:-172.30.0.2}" \ + -e MASTER_ENDPOINT=http://master.argus.com:3000 \ + -e REPORT_INTERVAL_SECONDS=2 \ + -e ARGUS_BUILD_UID=$ARGUS_BUILD_UID \ + -e ARGUS_BUILD_GID=$ARGUS_BUILD_GID \ + -e ES_HOST=es \ + -e ES_PORT=9200 \ + -e CLUSTER=local \ + -e RACK=dev \ + -p 2021:2020 \ + -v "$SYS_DEBUG_PRIVATE_NODEB/argus/agent/$HOST_B:/private/argus/agent/$HOST_B" \ + -v "$AGENT_BIN_PATH:/usr/local/bin/argus-agent:ro" \ + -v "$NODE_ENTRYPOINT:/usr/local/bin/node-entrypoint.sh:ro" \ + -v "$REPO_ROOT/src/log/fluent-bit/build/start-fluent-bit.sh:/assets/start-fluent-bit.sh:ro" \ + -v "$REPO_ROOT/src/log/fluent-bit/build/etc:/assets/fluent-bit/etc:ro" \ + -v 
"$REPO_ROOT/src/log/fluent-bit/build/packages:/assets/fluent-bit/packages:ro" \ + --entrypoint /usr/local/bin/node-entrypoint.sh \ + ubuntu:22.04 >/dev/null + +log "Waiting for node-b to re-register with new IP" +for _ in {1..40}; do + sleep 3 + if curl -fsS "$API_BASE/nodes/$ID_B" -o "$TMP_DIR/node_b_after.json"; then + if python3 - "$TMP_DIR/node_b_after.json" "$LAST0" "$TARGET_FIXED_IP" <<'PY' +import json,sys +node=json.load(open(sys.argv[1])) +last0=sys.argv[2] +expected_ip=sys.argv[3] +ip=node.get("meta_data",{}).get("ip") +lu=node.get("last_updated") +if ip == expected_ip and lu and lu != last0: + sys.exit(0) +sys.exit(1) +PY + then + log "node-b IP updated: $IP_BEFORE -> $TARGET_FIXED_IP" + exit 0 + fi + fi +done + +echo "[ERR] node-b did not update to IP $TARGET_FIXED_IP in time" >&2 +exit 1 diff --git a/src/sys/debug/scripts/09_down.sh b/src/sys/debug/scripts/09_down.sh new file mode 100755 index 0000000..87ef0bf --- /dev/null +++ b/src/sys/debug/scripts/09_down.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +set -euo pipefail + +# shellcheck source=common.sh +source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh" + +ensure_env_file +require_docker + +log "Stopping debug stack (project $SYS_DEBUG_PROJECT_NAME)" +compose down --remove-orphans >/dev/null 2>&1 || true + +log "Containers stopped. No host directories were removed." diff --git a/src/sys/debug/scripts/clean-data.sh b/src/sys/debug/scripts/clean-data.sh new file mode 100755 index 0000000..79267aa --- /dev/null +++ b/src/sys/debug/scripts/clean-data.sh @@ -0,0 +1,66 @@ +#!/usr/bin/env bash +set -euo pipefail + +# shellcheck source=common.sh +source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh" + +ensure_env_file +ensure_paths_defined + +FORCE=false +while [[ $# -gt 0 ]]; do + case "$1" in + -y|--yes) + FORCE=true + ;; + -h|--help) + cat <&2 + exit 1 + ;; + esac + shift +done + +if [[ $FORCE == false ]]; then + read -r -p "This will delete debug private directories. Continue? 
[y/N] " reply
+  case "$reply" in
+    y|Y|yes|YES)
+      ;;
+    *)
+      echo "Aborted"
+      exit 0
+      ;;
+  esac
+fi
+
+paths=(
+  "$SYS_DEBUG_PRIVATE_CORE"
+  "$SYS_DEBUG_PRIVATE_NODEA"
+  "$SYS_DEBUG_PRIVATE_NODEB"
+  "$SYS_DEBUG_TMP_DIR"
+)
+
+require_docker
+
+image="ubuntu:22.04"
+
+for dir in "${paths[@]}"; do
+  [[ -d "$dir" ]] || continue
+  log "Fixing ownership for $dir"
+  if ! docker run --rm -v "$dir:/target" "$image" chown -R "$(id -u):$(id -g)" /target >/dev/null 2>&1; then
+    echo "[WARN] Failed to adjust ownership via $image, attempting local chown" >&2
+    chown -R "$(id -u):$(id -g)" "$dir" >/dev/null 2>&1 || true
+  fi
+  log "Removing $dir"
+  rm -rf "$dir"
+done
+
+log "Clean data completed"
diff --git a/src/sys/debug/scripts/common.sh b/src/sys/debug/scripts/common.sh
new file mode 100755
index 0000000..1510e65
--- /dev/null
+++ b/src/sys/debug/scripts/common.sh
@@ -0,0 +1,96 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+DEBUG_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+REPO_ROOT="$(cd "$DEBUG_ROOT/../../.." && pwd)"
+ENV_FILE="$DEBUG_ROOT/.env"
+
+source "$REPO_ROOT/scripts/common/build_user.sh"
+load_build_user
+
+if [[ -f "$ENV_FILE" ]]; then
+  set -a
+  # shellcheck disable=SC1090
+  source "$ENV_FILE"
+  set +a
+fi
+
+SYS_DEBUG_NETWORK_NAME=${SYS_DEBUG_NETWORK_NAME:-argus-debug-net}
+SYS_DEBUG_NETWORK_SUBNET=${SYS_DEBUG_NETWORK_SUBNET:-172.30.0.0/16}
+SYS_DEBUG_NETWORK_GATEWAY=${SYS_DEBUG_NETWORK_GATEWAY:-172.30.0.1}
+SYS_DEBUG_PROJECT_NAME=${SYS_DEBUG_PROJECT_NAME:-argus-debug}
+SYS_DEBUG_CONTAINER_PREFIX=${SYS_DEBUG_CONTAINER_PREFIX:-argus-debug}
+SYS_DEBUG_PRIVATE_CORE=${SYS_DEBUG_PRIVATE_CORE:-$DEBUG_ROOT/private}
+SYS_DEBUG_PRIVATE_NODEA=${SYS_DEBUG_PRIVATE_NODEA:-$DEBUG_ROOT/private-nodea}
+SYS_DEBUG_PRIVATE_NODEB=${SYS_DEBUG_PRIVATE_NODEB:-$DEBUG_ROOT/private-nodeb}
+SYS_DEBUG_TMP_DIR=${SYS_DEBUG_TMP_DIR:-$DEBUG_ROOT/tmp}
+ARGUS_BUILD_UID=${ARGUS_BUILD_UID:-2133}
+ARGUS_BUILD_GID=${ARGUS_BUILD_GID:-2015}
+
+SYS_DEBUG_NODEA_HOST=${SYS_DEBUG_NODEA_HOST:-dev-yyrshare-nbnyx10-cp2f-pod-0}
+SYS_DEBUG_NODEB_HOST=${SYS_DEBUG_NODEB_HOST:-dev-yyrshare-uuuu10-ep2f-pod-0}
+
+HOST_A="$SYS_DEBUG_NODEA_HOST"
+HOST_B="$SYS_DEBUG_NODEB_HOST"
+
+COMPOSE_FILE="$DEBUG_ROOT/docker-compose.yml"
+
+abs_path() {
+  python3 - "$1" <<'PY'
+import os, sys
+path = sys.argv[1]
+print(os.path.abspath(path))
+PY
+}
+
+ensure_command() {
+  local cmd="$1"
+  if ! command -v "$cmd" >/dev/null 2>&1; then
+    echo "[ERR] Required command '$cmd' not found" >&2
+    exit 1
+  fi
+}
+
+require_docker() {
+  ensure_command docker
+}
+
+compose() {
+  require_docker
+  local bin
+  if docker compose version >/dev/null 2>&1; then
+    bin=(docker compose)
+  else
+    bin=(docker-compose)
+  fi
+  "${bin[@]}" -p "$SYS_DEBUG_PROJECT_NAME" -f "$COMPOSE_FILE" "$@"
+}
+
+ensure_paths_defined() {
+  local missing=()
+  for name in SYS_DEBUG_PRIVATE_CORE SYS_DEBUG_PRIVATE_NODEA SYS_DEBUG_PRIVATE_NODEB SYS_DEBUG_TMP_DIR; do
+    if [[ -z "${!name:-}" ]]; then
+      missing+=("$name")
+    fi
+  done
+  if (( ${#missing[@]} > 0 )); then
+    echo "[ERR] Missing required environment variables: ${missing[*]}" >&2
+    echo "      Run 01_bootstrap.sh first." >&2
+    exit 1
+  fi
+}
+
+ensure_env_file() {
+  if [[ ! -f "$ENV_FILE" ]]; then
+    echo "[ERR] Missing .env at $ENV_FILE. Run 01_bootstrap.sh first." >&2
+    exit 1
+  fi
+}
+
+log() {
+  echo "[INFO] $*"
+}
+
+TMP_DIR="$SYS_DEBUG_TMP_DIR"
+mkdir -p "$TMP_DIR"
diff --git a/src/sys/debug/scripts/network-create.sh b/src/sys/debug/scripts/network-create.sh
new file mode 100755
index 0000000..25eb3b4
--- /dev/null
+++ b/src/sys/debug/scripts/network-create.sh
@@ -0,0 +1,76 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# shellcheck source=common.sh
+source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh"
+
+NAME="$SYS_DEBUG_NETWORK_NAME"
+SUBNET="$SYS_DEBUG_NETWORK_SUBNET"
+GATEWAY="$SYS_DEBUG_NETWORK_GATEWAY"
+
+usage() {
+  cat <<EOF
+Usage: network-create.sh [options]
+
+Create the debug docker bridge network.
+
+Options:
+  --name NAME       Network name (default: $NAME)
+  --subnet CIDR     Subnet in CIDR notation (default: $SUBNET)
+  --gateway IP      Gateway address (default: $GATEWAY)
+EOF
+}
+
+while [[ $# -gt 0 ]]; do
+  case $1 in
+    --name)
+      shift; [[ $# -gt 0 ]] || { echo "--name requires value" >&2; exit 1; }
+      NAME="$1"
+      ;;
+    --name=*)
+      NAME="${1#*=}"
+      ;;
+    --subnet)
+      shift; [[ $# -gt 0 ]] || { echo "--subnet requires value" >&2; exit 1; }
+      SUBNET="$1"
+      ;;
+    --subnet=*)
+      SUBNET="${1#*=}"
+      ;;
+    --gateway)
+      shift; [[ $# -gt 0 ]] || { echo "--gateway requires value" >&2; exit 1; }
+      GATEWAY="$1"
+      ;;
+    --gateway=*)
+      GATEWAY="${1#*=}"
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown argument: $1" >&2
+      usage >&2
+      exit 1
+      ;;
+  esac
+  shift
+done
+
+require_docker
+
+if docker network inspect "$NAME" >/dev/null 2>&1; then
+  log "Network $NAME already exists"
+  exit 0
+fi
+
+log "Creating network $NAME (subnet=$SUBNET gateway=$GATEWAY)"
+docker network create \
+  --driver bridge \
+  --subnet "$SUBNET" \
+  --gateway "$GATEWAY" \
+  "$NAME"
+
+mkdir -p "$TMP_DIR"
+echo "$NAME" > "$TMP_DIR/network.created"
+log "Network $NAME created"
diff --git a/src/sys/debug/scripts/network-destroy.sh b/src/sys/debug/scripts/network-destroy.sh
new file mode 100755
index 0000000..ade15f5
--- /dev/null
+++ b/src/sys/debug/scripts/network-destroy.sh
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# shellcheck source=common.sh
+source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/common.sh"
+
+NAME="$SYS_DEBUG_NETWORK_NAME"
+
+usage() {
+  cat <<EOF
+Usage: network-destroy.sh [--name NAME]
+
+Remove the debug docker bridge network if no containers are attached.
+EOF
+}
+
+while [[ $# -gt 0 ]]; do
+  case $1 in
+    --name)
+      shift; [[ $# -gt 0 ]] || { echo "--name requires value" >&2; exit 1; }
+      NAME="$1"
+      ;;
+    --name=*)
+      NAME="${1#*=}"
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown argument: $1" >&2
+      usage >&2
+      exit 1
+      ;;
+  esac
+  shift
+done
+
+require_docker
+
+if ! docker network inspect "$NAME" >/dev/null 2>&1; then
+  log "Network $NAME not found; nothing to do"
+  exit 0
+fi
+
+attached=$(docker network inspect -f '{{range $id, $conf := .Containers}}{{printf "%s " $conf.Name}}{{end}}' "$NAME")
+if [[ -n "${attached// }" ]]; then
+  echo "[ERR] Cannot remove network $NAME: still connected containers -> $attached" >&2
+  exit 1
+fi
+
+log "Deleting network $NAME"
+docker network rm "$NAME" >/dev/null
+rm -f "$TMP_DIR/network.created"
+log "Network $NAME removed"
diff --git a/src/sys/tests/scripts/08_restart_agent_reregister.sh b/src/sys/tests/scripts/08_restart_agent_reregister.sh
index 97a68ec..d9bf43a 100755
--- a/src/sys/tests/scripts/08_restart_agent_reregister.sh
+++ b/src/sys/tests/scripts/08_restart_agent_reregister.sh
@@ -8,6 +8,16 @@
 REPO_ROOT="$(cd "$TEST_ROOT/../../.." && pwd)"
 
 API_BASE="http://localhost:32300/api/v1/master"
 
+if [[ -f "$TEST_ROOT/.env" ]]; then
+  set -a
+  # shellcheck disable=SC1090
+  source "$TEST_ROOT/.env"
+  set +a
+else
+  source "$REPO_ROOT/scripts/common/build_user.sh"
+  load_build_user
+fi
+
 ID_B="$(cat "$TMP_DIR/node_id_b")"
 IP0_B="$(cat "$TMP_DIR/initial_ip_b")"