dev_1.0.0_yuyr_2:重新提交 PR,增加 master/agent 以及系统集成测试 #17
3
src/agent/.gitignore
vendored
Normal file
3
src/agent/.gitignore
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
build/
|
||||
*.egg-info/
|
||||
__pycache__/
|
@ -1,23 +0,0 @@
|
||||
FROM python:3.11-slim
|
||||
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
|
||||
ARG PIP_INDEX_URL=
|
||||
ENV PIP_NO_CACHE_DIR=1 \
|
||||
PYTHONUNBUFFERED=1 \
|
||||
PYTHONPATH=/app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY requirements.txt ./
|
||||
RUN set -euxo pipefail \
|
||||
&& python -m pip install --upgrade pip \
|
||||
&& if [[ -n "$PIP_INDEX_URL" ]]; then \
|
||||
PIP_INDEX_URL="$PIP_INDEX_URL" python -m pip install -r requirements.txt; \
|
||||
else \
|
||||
python -m pip install -r requirements.txt; \
|
||||
fi
|
||||
|
||||
COPY app ./app
|
||||
|
||||
CMD ["python", "-m", "app.main"]
|
@ -6,26 +6,34 @@ Python agent that registers with the Argus master service, persists node informa
|
||||
|
||||
```bash
|
||||
cd src/agent
|
||||
./scripts/build_images.sh # builds argus-agent:dev
|
||||
./scripts/build_binary.sh # produces dist/argus-agent
|
||||
```
|
||||
|
||||
Runtime expects a configuration file (generated by installer) at `/private/argus/agent/<hostname>/config`. Key fields:
|
||||
The resulting executable (`dist/argus-agent`) bundles the runtime via PyInstaller. Runtime configuration is now derived from environment variables and the container hostname—no local config file is required.
|
||||
|
||||
- `HOSTNAME`, `NODE_FILE`, `VERSION`
|
||||
- `MASTER_ENDPOINT` (e.g. `http://master:3000`)
|
||||
- `REPORT_INTERVAL_SECONDS`
|
||||
- `SUBMODULE_HEALTH_FILE_DIR` (supports `{hostname}` placeholder)
|
||||
- optional `GPU_NUMBER`
|
||||
Required variables:
|
||||
|
||||
Health files live under `/private/argus/agent/health/<hostname>/` and must follow `<prefix>-*.json` naming (e.g. `log-fluentbit.json`). The agent sends parsed JSON objects keyed by file stem.
|
||||
- `MASTER_ENDPOINT`:Master 服务的完整地址,若未带协议会自动补全为 `http://`。
|
||||
- `REPORT_INTERVAL_SECONDS`:状态上报周期,可选,默认 60。
|
||||
|
||||
Additional overrides:
|
||||
|
||||
- `AGENT_HOSTNAME`:可选,若需要覆盖容器 `hostname`。
|
||||
|
||||
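At startup the agent reads the container hostname (or the `AGENT_HOSTNAME` override) and pins the following paths(启动时 agent 会读取容器主机名或 `AGENT_HOSTNAME` 覆盖值,并固定以下路径):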
|
||||
|
||||
- 节点状态持久化:`/private/argus/agent/<hostname>/node.json`
|
||||
- 子模块健康目录:`/private/argus/agent/health/<hostname>/`
|
||||
|
||||
健康文件需按 `<模块名前缀>-*.json` 命名,例如 `log-fluentbit.json`,文件内容会以文件名前缀为键写入上报 payload。
|
||||
|
||||
## Tests
|
||||
|
||||
Docker-based E2E stack (master + agent):
|
||||
Docker 端到端测试会启动 master 容器与一个普通 `ubuntu:24.04` 容器,在其中挂载并执行打包后的 agent(通过环境变量注入 `MASTER_ENDPOINT` 与 `REPORT_INTERVAL_SECONDS`):
|
||||
|
||||
```bash
|
||||
cd src/agent/tests
|
||||
./scripts/00_e2e_test.sh
|
||||
```
|
||||
|
||||
The scripts provision configs/health directories under `tests/private/` and clean up via `07_down.sh`.
|
||||
测试脚本会自动调用 `../scripts/build_binary.sh` 生成可执行文件,并在 `tests/private/` 下准备配置与健康目录,最后通过 `07_down.sh` 清理环境。
|
||||
|
@ -27,7 +27,7 @@ def collect_metadata(config: AgentConfig) -> Dict[str, Any]:
|
||||
"instance": instance,
|
||||
"cpu_number": _detect_cpu_count(),
|
||||
"memory_in_bytes": _detect_memory_bytes(),
|
||||
"gpu_number": _detect_gpu_count(config),
|
||||
"gpu_number": _detect_gpu_count(),
|
||||
}
|
||||
return meta
|
||||
|
||||
@ -70,11 +70,8 @@ def _detect_memory_bytes() -> int:
|
||||
return 0
|
||||
|
||||
|
||||
def _detect_gpu_count(config: AgentConfig) -> int:
|
||||
"""采集 GPU 数量,可被配置覆盖。"""
|
||||
if config.gpu_number_override is not None:
|
||||
return config.gpu_number_override
|
||||
|
||||
def _detect_gpu_count() -> int:
|
||||
"""采集 GPU 数量,如无法探测则默认为 0。"""
|
||||
try:
|
||||
proc = subprocess.run(
|
||||
["nvidia-smi", "-L"],
|
||||
|
@ -1,8 +1,14 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import socket
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Final
|
||||
|
||||
from .version import VERSION
|
||||
|
||||
DEFAULT_REPORT_INTERVAL_SECONDS: Final[int] = 60
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
@ -12,90 +18,57 @@ class AgentConfig:
|
||||
version: str
|
||||
master_endpoint: str
|
||||
report_interval_seconds: int
|
||||
health_dir_template: str
|
||||
gpu_number_override: int | None
|
||||
health_dir: str
|
||||
request_timeout_seconds: int = 10
|
||||
|
||||
@property
|
||||
def health_dir(self) -> str:
|
||||
return self.health_dir_template.format(hostname=self.hostname)
|
||||
|
||||
def _normalise_master_endpoint(value: str) -> str:
    """Return a canonical master URL built from the raw ``MASTER_ENDPOINT`` value.

    Surrounding whitespace is stripped, ``http://`` is prepended when the value
    carries no ``http``/``https`` scheme, and trailing slashes are removed.

    Args:
        value: Raw endpoint string, e.g. ``"master:3000"`` or
            ``"https://master:3000/"``.

    Returns:
        The endpoint with a scheme and without a trailing slash.

    Raises:
        ValueError: If *value* is empty or whitespace only.
    """
    value = value.strip()
    if not value:
        raise ValueError("MASTER_ENDPOINT environment variable is required")
    # URL schemes are case-insensitive, so "HTTP://host" must not be prefixed
    # a second time.
    if not value.lower().startswith(("http://", "https://")):
        value = f"http://{value}"
    return value.rstrip("/")
|
||||
|
||||
|
||||
def _parse_config_file(path: str) -> dict[str, str]:
|
||||
result: dict[str, str] = {}
|
||||
def _read_report_interval(raw_value: str | None) -> int:
|
||||
if raw_value is None or raw_value.strip() == "":
|
||||
return DEFAULT_REPORT_INTERVAL_SECONDS
|
||||
try:
|
||||
with open(path, "r", encoding="utf-8") as handle:
|
||||
for raw_line in handle:
|
||||
line = raw_line.strip()
|
||||
if not line or line.startswith("#"):
|
||||
continue
|
||||
if "=" not in line:
|
||||
continue
|
||||
key, value = line.split("=", 1)
|
||||
result[key.strip().upper()] = value.strip()
|
||||
except FileNotFoundError:
|
||||
raise FileNotFoundError(f"Agent config file not found: {path}") from None
|
||||
return result
|
||||
|
||||
|
||||
def load_config(path: str) -> AgentConfig:
|
||||
"""读取配置文件并结合环境变量,返回 AgentConfig。"""
|
||||
config_values = _parse_config_file(path)
|
||||
force_env = os.environ.get("AGENT_FORCE_ENV", "0").lower() in {"1", "true", "yes"}
|
||||
|
||||
def read_key(key: str, default: str | None = None, *, required: bool = False) -> str:
|
||||
env_key = f"AGENT_{key}"
|
||||
if env_key in os.environ:
|
||||
return os.environ[env_key]
|
||||
if force_env and key in os.environ:
|
||||
return os.environ[key]
|
||||
if key in config_values:
|
||||
return config_values[key]
|
||||
if default is not None:
|
||||
return default
|
||||
if required:
|
||||
raise ValueError(f"Missing required configuration key: {key}")
|
||||
return ""
|
||||
|
||||
hostname = read_key("HOSTNAME", required=True)
|
||||
node_file = read_key("NODE_FILE", f"/private/argus/agent/{hostname}/node.json")
|
||||
version = read_key("VERSION", "1.0.0")
|
||||
master_endpoint = read_key("MASTER_ENDPOINT", required=True)
|
||||
report_interval_raw = read_key("REPORT_INTERVAL_SECONDS", "60")
|
||||
health_dir_template = read_key(
|
||||
"SUBMODULE_HEALTH_FILE_DIR",
|
||||
f"/private/argus/agent/health/{{hostname}}/",
|
||||
)
|
||||
gpu_override_raw = read_key("GPU_NUMBER", "")
|
||||
|
||||
try:
|
||||
report_interval_seconds = int(report_interval_raw)
|
||||
interval = int(raw_value)
|
||||
except ValueError as exc:
|
||||
raise ValueError("REPORT_INTERVAL_SECONDS must be an integer") from exc
|
||||
if report_interval_seconds <= 0:
|
||||
if interval <= 0:
|
||||
raise ValueError("REPORT_INTERVAL_SECONDS must be positive")
|
||||
return interval
|
||||
|
||||
gpu_override = None
|
||||
if gpu_override_raw:
|
||||
try:
|
||||
gpu_override = int(gpu_override_raw)
|
||||
except ValueError as exc:
|
||||
raise ValueError("GPU_NUMBER must be an integer when provided") from exc
|
||||
if gpu_override < 0:
|
||||
raise ValueError("GPU_NUMBER must be non-negative")
|
||||
|
||||
if not master_endpoint.startswith("http://") and not master_endpoint.startswith("https://"):
|
||||
master_endpoint = f"http://{master_endpoint}"
|
||||
def _resolve_hostname() -> str:
    """Return the hostname used to derive the agent's per-node paths.

    ``AGENT_HOSTNAME`` takes precedence when it is set to a non-blank value;
    otherwise the name reported by ``socket.gethostname()`` is returned.
    """
    # The result is embedded in directory names, so surrounding whitespace is
    # trimmed and a blank override is treated the same as an unset one.
    override = os.environ.get("AGENT_HOSTNAME", "").strip()
    return override or socket.gethostname()
|
||||
|
||||
|
||||
def load_config() -> AgentConfig:
|
||||
"""从环境变量推导配置,移除了外部配置文件依赖。"""
|
||||
|
||||
hostname = _resolve_hostname()
|
||||
node_file = f"/private/argus/agent/{hostname}/node.json"
|
||||
health_dir = f"/private/argus/agent/health/{hostname}/"
|
||||
|
||||
master_endpoint_env = os.environ.get("MASTER_ENDPOINT")
|
||||
if master_endpoint_env is None:
|
||||
raise ValueError("MASTER_ENDPOINT environment variable is not set")
|
||||
master_endpoint = _normalise_master_endpoint(master_endpoint_env)
|
||||
|
||||
report_interval_seconds = _read_report_interval(os.environ.get("REPORT_INTERVAL_SECONDS"))
|
||||
|
||||
Path(node_file).parent.mkdir(parents=True, exist_ok=True)
|
||||
Path(health_dir_template.format(hostname=hostname)).mkdir(parents=True, exist_ok=True)
|
||||
Path(health_dir).mkdir(parents=True, exist_ok=True)
|
||||
|
||||
return AgentConfig(
|
||||
hostname=hostname,
|
||||
node_file=node_file,
|
||||
version=version,
|
||||
master_endpoint=master_endpoint.rstrip("/"),
|
||||
version=VERSION,
|
||||
master_endpoint=master_endpoint,
|
||||
report_interval_seconds=report_interval_seconds,
|
||||
health_dir_template=health_dir_template,
|
||||
gpu_number_override=gpu_override,
|
||||
health_dir=health_dir,
|
||||
)
|
||||
|
@ -1,8 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import signal
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
@ -32,28 +30,15 @@ class StopSignal:
|
||||
return self._stop
|
||||
|
||||
|
||||
def parse_args(argv: list[str]) -> argparse.Namespace:
    """Parse the agent's command-line arguments.

    Args:
        argv: Argument list without the program name.

    Returns:
        Namespace whose ``config_path`` holds the value given to ``--config``,
        or ``None`` when the option is omitted.
    """
    parser = argparse.ArgumentParser(description="Argus agent")
    parser.add_argument(
        "--config",
        dest="config_path",
        default=None,
        help="Path to agent config file",
    )
    return parser.parse_args(argv)
|
||||
|
||||
|
||||
def main(argv: Optional[list[str]] = None) -> int:
|
||||
def main(argv: Optional[list[str]] = None) -> int: # noqa: ARG001 - 保留签名以兼容入口调用
|
||||
setup_logging()
|
||||
args = parse_args(argv or sys.argv[1:])
|
||||
|
||||
stop_signal = StopSignal()
|
||||
signal.signal(signal.SIGTERM, stop_signal.set)
|
||||
signal.signal(signal.SIGINT, stop_signal.set)
|
||||
|
||||
try:
|
||||
config_path = args.config_path or _default_config_path()
|
||||
config = load_config(config_path)
|
||||
config = load_config()
|
||||
except Exception as exc:
|
||||
LOGGER.error("Failed to load configuration", extra={"error": str(exc)})
|
||||
return 1
|
||||
@ -89,13 +74,6 @@ def main(argv: Optional[list[str]] = None) -> int:
|
||||
return 0
|
||||
|
||||
|
||||
def _default_config_path() -> str:
    """Return the default config path, ``/private/argus/agent/<hostname>/config``.

    ``<hostname>`` is the value reported by ``socket.gethostname()``.
    """
    from socket import gethostname

    return f"/private/argus/agent/{gethostname()}/config"
|
||||
|
||||
|
||||
def _register_with_retry(
|
||||
client: AgentClient,
|
||||
config: AgentConfig,
|
||||
|
69
src/agent/app/version.py
Normal file
69
src/agent/app/version.py
Normal file
@ -0,0 +1,69 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import importlib.metadata
|
||||
|
||||
try:
|
||||
import tomllib
|
||||
except ModuleNotFoundError: # pragma: no cover
|
||||
import tomli as tomllib # type: ignore[no-redef]
|
||||
|
||||
|
||||
def _candidate_paths() -> list[Path]:
    """Return the locations searched for ``pyproject.toml``, in priority order.

    The order is: the directory named by ``sys._MEIPASS`` (only when that
    attribute is set, as in a PyInstaller bundle), the parent of this module's
    directory, this module's directory, and the current working directory.
    """
    paths: list[Path] = []
    bundle_dir: Optional[str] = getattr(sys, "_MEIPASS", None)
    if bundle_dir:
        paths.append(Path(bundle_dir) / "pyproject.toml")
    # Resolve the module location once and derive both source-tree candidates.
    module_dir = Path(__file__).resolve().parent
    paths.extend(
        base / "pyproject.toml"
        for base in (module_dir.parent, module_dir, Path.cwd())
    )
    return paths
|
||||
|
||||
|
||||
def _read_from_pyproject() -> Optional[str]:
    """Return the first version declared in a candidate ``pyproject.toml``.

    Each path from ``_candidate_paths()`` is tried in order. Within one file,
    ``[project].version`` is preferred over ``[tool.argus].version``. Files
    that are missing, unreadable, or not valid TOML are skipped.

    Returns:
        The first non-empty version string found, or ``None`` when no
        candidate declares one.
    """
    for path in _candidate_paths():
        if not path.exists():
            continue
        try:
            with path.open("rb") as handle:
                data = tomllib.load(handle)
        except (OSError, tomllib.TOMLDecodeError):
            continue
        tool = data.get("tool")
        sections = (
            data.get("project"),
            tool.get("argus") if isinstance(tool, dict) else None,
        )
        for section in sections:
            if not isinstance(section, dict):
                continue
            version = section.get("version")
            # An empty string is not a usable version; keep searching instead
            # of ending the lookup with a falsy result.
            if isinstance(version, str) and version:
                return version
    return None
|
||||
|
||||
|
||||
def _detect_version() -> str:
    """Resolve the agent version string.

    Resolution order:
      1. The ``AGENT_VERSION_OVERRIDE`` environment variable, when non-empty.
      2. Installed distribution metadata for ``argus-agent``.
      3. A version found by ``_read_from_pyproject()``.
      4. The literal ``"0.0.0"``.
    """
    # Checked first so the variable really overrides: previously installed
    # package metadata took precedence and the override was ignored.
    override = os.environ.get("AGENT_VERSION_OVERRIDE")
    if override:
        return override
    try:
        return importlib.metadata.version("argus-agent")
    except importlib.metadata.PackageNotFoundError:
        pass
    fallback = _read_from_pyproject()
    if fallback:
        return fallback
    return "0.0.0"
|
||||
|
||||
|
||||
VERSION: str = _detect_version()
|
||||
|
||||
|
||||
def get_version() -> str:
    """Return the module-level ``VERSION`` value."""
    return VERSION
|
BIN
src/agent/dist/argus-agent
vendored
Executable file
BIN
src/agent/dist/argus-agent
vendored
Executable file
Binary file not shown.
10
src/agent/entry.py
Normal file
10
src/agent/entry.py
Normal file
@ -0,0 +1,10 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
|
||||
from app.main import main as agent_main
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(agent_main())
|
19
src/agent/pyproject.toml
Normal file
19
src/agent/pyproject.toml
Normal file
@ -0,0 +1,19 @@
|
||||
[project]
|
||||
name = "argus-agent"
|
||||
version = "1.1.0"
|
||||
description = "Argus agent binary"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.11"
|
||||
dependencies = [
|
||||
"requests==2.31.0"
|
||||
]
|
||||
|
||||
[build-system]
|
||||
requires = ["setuptools>=69", "wheel"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[tool.argus]
|
||||
entry = "app.main:main"
|
||||
|
||||
[tool.setuptools]
|
||||
packages = ["app"]
|
@ -1 +0,0 @@
|
||||
requests==2.31.0
|
42
src/agent/scripts/build_binary.sh
Executable file
42
src/agent/scripts/build_binary.sh
Executable file
@ -0,0 +1,42 @@
|
||||
#!/usr/bin/env bash
# Build a single-file agent executable with PyInstaller.
#
# Creates (or reuses) a virtualenv under build/venv, installs the agent package
# and PyInstaller into it, and writes the result to dist/argus-agent.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
MODULE_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
BUILD_ROOT="$MODULE_ROOT/build"
DIST_DIR="$MODULE_ROOT/dist"
PYINSTALLER_BUILD="$BUILD_ROOT/pyinstaller"
VENV_DIR="$BUILD_ROOT/venv"

mkdir -p "$PYINSTALLER_BUILD"
mkdir -p "$DIST_DIR"

# Reuse an existing virtualenv so repeated builds skip environment creation.
if [[ ! -d "$VENV_DIR" ]]; then
  python3 -m venv "$VENV_DIR"
fi

# shellcheck disable=SC1091
source "$VENV_DIR/bin/activate"

pip install --upgrade pip
pip install "$MODULE_ROOT"
pip install "pyinstaller==6.6.0"

# Drop artefacts from earlier runs. ${VAR:?} aborts if the variable is ever
# empty, so the glob can never expand to "/*".
rm -rf "${PYINSTALLER_BUILD:?}"/*
rm -f "$DIST_DIR/argus-agent"

# pyproject.toml is bundled at the archive root via --add-data.
pyinstaller \
  --clean \
  --onefile \
  --name argus-agent \
  --distpath "$DIST_DIR" \
  --workpath "$PYINSTALLER_BUILD/work" \
  --specpath "$PYINSTALLER_BUILD/spec" \
  --add-data "$MODULE_ROOT/pyproject.toml:." \
  "$MODULE_ROOT/entry.py"

chmod +x "$DIST_DIR/argus-agent"

deactivate

echo "[INFO] Agent binary generated at $DIST_DIR/argus-agent"
|
@ -1,39 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
usage() {
|
||||
echo "Usage: $0 [--intranet] [--tag <image_tag>]" >&2
|
||||
}
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
IMAGE_TAG="${IMAGE_TAG:-argus-agent:dev}"
|
||||
BUILD_ARGS=()
|
||||
|
||||
while [[ "$#" -gt 0 ]]; do
|
||||
case "$1" in
|
||||
--intranet)
|
||||
INTRANET_INDEX="${INTRANET_INDEX:-https://pypi.tuna.tsinghua.edu.cn/simple}"
|
||||
BUILD_ARGS+=("--build-arg" "PIP_INDEX_URL=${INTRANET_INDEX}")
|
||||
shift
|
||||
;;
|
||||
--tag)
|
||||
[[ $# -ge 2 ]] || { usage; exit 1; }
|
||||
IMAGE_TAG="$2"
|
||||
shift 2
|
||||
;;
|
||||
-h|--help)
|
||||
usage
|
||||
exit 0
|
||||
;;
|
||||
*)
|
||||
echo "Unknown option: $1" >&2
|
||||
usage
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
echo "[INFO] Building image $IMAGE_TAG"
|
||||
docker build "${BUILD_ARGS[@]}" -t "$IMAGE_TAG" "$PROJECT_ROOT"
|
||||
echo "[OK] Image $IMAGE_TAG built"
|
@ -1,39 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
usage() {
|
||||
echo "Usage: $0 [--file <tar_path>]" >&2
|
||||
}
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
DEFAULT_INPUT="$PROJECT_ROOT/images/argus-agent-dev.tar"
|
||||
IMAGE_TAR="$DEFAULT_INPUT"
|
||||
|
||||
while [[ "$#" -gt 0 ]]; do
|
||||
case "$1" in
|
||||
--file)
|
||||
[[ $# -ge 2 ]] || { usage; exit 1; }
|
||||
IMAGE_TAR="$2"
|
||||
shift 2
|
||||
;;
|
||||
-h|--help)
|
||||
usage
|
||||
exit 0
|
||||
;;
|
||||
*)
|
||||
echo "Unknown option: $1" >&2
|
||||
usage
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
if [[ ! -f "$IMAGE_TAR" ]]; then
|
||||
echo "[ERROR] Image tarball not found: $IMAGE_TAR" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "[INFO] Loading image from $IMAGE_TAR"
|
||||
docker image load -i "$IMAGE_TAR"
|
||||
echo "[OK] Image loaded"
|
@ -1,41 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
usage() {
|
||||
echo "Usage: $0 [--tag <image_tag>] [--output <tar_path>]" >&2
|
||||
}
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
DEFAULT_OUTPUT="$PROJECT_ROOT/images/argus-agent-dev.tar"
|
||||
IMAGE_TAG="${IMAGE_TAG:-argus-agent:dev}"
|
||||
OUTPUT_PATH="$DEFAULT_OUTPUT"
|
||||
|
||||
while [[ "$#" -gt 0 ]]; do
|
||||
case "$1" in
|
||||
--tag)
|
||||
[[ $# -ge 2 ]] || { usage; exit 1; }
|
||||
IMAGE_TAG="$2"
|
||||
shift 2
|
||||
;;
|
||||
--output)
|
||||
[[ $# -ge 2 ]] || { usage; exit 1; }
|
||||
OUTPUT_PATH="$2"
|
||||
shift 2
|
||||
;;
|
||||
-h|--help)
|
||||
usage
|
||||
exit 0
|
||||
;;
|
||||
*)
|
||||
echo "Unknown option: $1" >&2
|
||||
usage
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
mkdir -p "$(dirname "$OUTPUT_PATH")"
|
||||
echo "[INFO] Saving image $IMAGE_TAG to $OUTPUT_PATH"
|
||||
docker image save "$IMAGE_TAG" -o "$OUTPUT_PATH"
|
||||
echo "[OK] Image saved"
|
@ -13,14 +13,20 @@ services:
|
||||
- ./private/argus/metric/prometheus:/private/argus/metric/prometheus
|
||||
|
||||
agent:
|
||||
image: argus-agent:dev
|
||||
image: ubuntu:24.04
|
||||
container_name: argus-agent-e2e
|
||||
hostname: dev-e2euser-e2einst-pod-0
|
||||
depends_on:
|
||||
- master
|
||||
environment:
|
||||
- MASTER_ENDPOINT=http://master:3000
|
||||
- REPORT_INTERVAL_SECONDS=2
|
||||
volumes:
|
||||
- ./private/argus/agent/dev-e2euser-e2einst-pod-0:/private/argus/agent/dev-e2euser-e2einst-pod-0
|
||||
- ./private/argus/agent/health/dev-e2euser-e2einst-pod-0:/private/argus/agent/health/dev-e2euser-e2einst-pod-0
|
||||
- ../dist/argus-agent:/usr/local/bin/argus-agent:ro
|
||||
entrypoint:
|
||||
- /usr/local/bin/argus-agent
|
||||
|
||||
networks:
|
||||
default:
|
||||
|
@ -20,25 +20,23 @@ mkdir -p "$MASTER_PRIVATE_DIR"
|
||||
mkdir -p "$METRIC_PRIVATE_DIR"
|
||||
mkdir -p "$TMP_ROOT"
|
||||
|
||||
cat > "$AGENT_CONFIG_DIR/config" <<CONFIG
|
||||
HOSTNAME=$AGENT_HOSTNAME
|
||||
NODE_FILE=/private/argus/agent/$AGENT_HOSTNAME/node.json
|
||||
VERSION=1.1.0
|
||||
MASTER_ENDPOINT=http://master:3000
|
||||
REPORT_INTERVAL_SECONDS=2
|
||||
SUBMODULE_HEALTH_FILE_DIR=/private/argus/agent/health/{hostname}/
|
||||
GPU_NUMBER=
|
||||
IP_OVERRIDE=
|
||||
CONFIG
|
||||
|
||||
touch "$AGENT_HEALTH_DIR/.keep"
|
||||
|
||||
pushd "$MASTER_ROOT" >/dev/null
|
||||
./scripts/build_images.sh --tag argus-master:dev
|
||||
popd >/dev/null
|
||||
|
||||
AGENT_BINARY="$AGENT_ROOT/dist/argus-agent"
|
||||
|
||||
pushd "$AGENT_ROOT" >/dev/null
|
||||
./scripts/build_images.sh --tag argus-agent:dev
|
||||
./scripts/build_binary.sh
|
||||
popd >/dev/null
|
||||
|
||||
if [[ ! -x "$AGENT_BINARY" ]]; then
|
||||
echo "[ERROR] Agent binary not found at $AGENT_BINARY" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "$AGENT_BINARY" > "$TMP_ROOT/agent_binary_path"
|
||||
|
||||
echo "[INFO] Agent E2E bootstrap complete"
|
||||
|
@ -4,6 +4,19 @@ set -euo pipefail
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
|
||||
TMP_ROOT="$TEST_ROOT/tmp"
|
||||
|
||||
if [[ ! -f "$TMP_ROOT/agent_binary_path" ]]; then
|
||||
echo "[ERROR] Agent binary path missing; run 01_bootstrap.sh first" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
AGENT_BINARY="$(cat "$TMP_ROOT/agent_binary_path")"
|
||||
if [[ ! -x "$AGENT_BINARY" ]]; then
|
||||
echo "[ERROR] Agent binary not executable: $AGENT_BINARY" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
compose() {
|
||||
if docker compose version >/dev/null 2>&1; then
|
||||
docker compose "$@"
|
||||
|
@ -10,6 +10,17 @@ AGENT_HOSTNAME="dev-e2euser-e2einst-pod-0"
|
||||
NETWORK_NAME="tests_default"
|
||||
NEW_AGENT_IP="172.28.0.200"
|
||||
|
||||
if [[ ! -f "$TMP_ROOT/agent_binary_path" ]]; then
|
||||
echo "[ERROR] Agent binary path missing; rerun bootstrap" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
AGENT_BINARY="$(cat "$TMP_ROOT/agent_binary_path")"
|
||||
if [[ ! -x "$AGENT_BINARY" ]]; then
|
||||
echo "[ERROR] Agent binary not executable: $AGENT_BINARY" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
compose() {
|
||||
if docker compose version >/dev/null 2>&1; then
|
||||
docker compose "$@"
|
||||
@ -57,14 +68,17 @@ if ! docker run -d \
|
||||
--ip "$NEW_AGENT_IP" \
|
||||
-v "$AGENT_DIR:/private/argus/agent/$AGENT_HOSTNAME" \
|
||||
-v "$HEALTH_DIR:/private/argus/agent/health/$AGENT_HOSTNAME" \
|
||||
argus-agent:dev \
|
||||
-v "$AGENT_BINARY:/usr/local/bin/argus-agent:ro" \
|
||||
-e MASTER_ENDPOINT=http://master:3000 \
|
||||
-e REPORT_INTERVAL_SECONDS=2 \
|
||||
ubuntu:24.04 \
|
||||
sleep 300 >/dev/null; then
|
||||
echo "[ERROR] Failed to start agent container with custom IP" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# 在容器内启动真实 agent 进程
|
||||
if ! docker exec -d argus-agent-e2e python -m app.main; then
|
||||
if ! docker exec -d argus-agent-e2e /usr/local/bin/argus-agent; then
|
||||
echo "[ERROR] Failed to spawn agent process inside container" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
Loading…
x
Reference in New Issue
Block a user