[#2] agent使用pyinstaller打包成二进制可执行文件,通过环境变量指定master endpoint和上报间隔

This commit is contained in:
yuyr 2025-09-24 08:14:55 +00:00
parent bc016826f1
commit fbe6def185
20 changed files with 254 additions and 267 deletions

3
src/agent/.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
build/
*.egg-info/
__pycache__/

View File

@ -1,23 +0,0 @@
FROM python:3.11-slim
SHELL ["/bin/bash", "-c"]
ARG PIP_INDEX_URL=
ENV PIP_NO_CACHE_DIR=1 \
PYTHONUNBUFFERED=1 \
PYTHONPATH=/app
WORKDIR /app
COPY requirements.txt ./
RUN set -euxo pipefail \
&& python -m pip install --upgrade pip \
&& if [[ -n "$PIP_INDEX_URL" ]]; then \
PIP_INDEX_URL="$PIP_INDEX_URL" python -m pip install -r requirements.txt; \
else \
python -m pip install -r requirements.txt; \
fi
COPY app ./app
CMD ["python", "-m", "app.main"]

View File

@ -6,26 +6,34 @@ Python agent that registers with the Argus master service, persists node informa
```bash
cd src/agent
./scripts/build_images.sh # builds argus-agent:dev
./scripts/build_binary.sh # produces dist/argus-agent
```
Runtime expects a configuration file (generated by installer) at `/private/argus/agent/<hostname>/config`. Key fields:
The resulting executable (`dist/argus-agent`) bundles the runtime via PyInstaller. Runtime configuration is now derived from environment variables and the container hostname—no local config file is required.
- `HOSTNAME`, `NODE_FILE`, `VERSION`
- `MASTER_ENDPOINT` (e.g. `http://master:3000`)
- `REPORT_INTERVAL_SECONDS`
- `SUBMODULE_HEALTH_FILE_DIR` (supports `{hostname}` placeholder)
- optional `GPU_NUMBER`
Environment variables:
Health files live under `/private/argus/agent/health/<hostname>/` and must follow `<prefix>-*.json` naming (e.g. `log-fluentbit.json`). The agent sends parsed JSON objects keyed by file stem.
- `MASTER_ENDPOINT`:Master 服务的完整地址,必填;若未带协议会自动补全为 `http://`。
- `REPORT_INTERVAL_SECONDS`:状态上报周期,可选,默认 60。
Additional overrides
- `AGENT_HOSTNAME`:可选,若需要覆盖容器 `hostname`
At startup the agent 会读取容器主机名(或 `AGENT_HOSTNAME` 覆盖值)并固定以下路径:
- 节点状态持久化:`/private/argus/agent/<hostname>/node.json`
- 子模块健康目录:`/private/argus/agent/health/<hostname>/`
健康文件需按 `<模块名前缀>-*.json` 命名,例如 `log-fluentbit.json`,文件内容会以文件名前缀为键写入上报 payload。
## Tests
Docker-based E2E stack (master + agent):
Docker 端到端测试会启动 master 容器与一个普通 `ubuntu:24.04` 容器,在其中挂载并执行打包后的 agent,并通过环境变量注入 `MASTER_ENDPOINT` 与 `REPORT_INTERVAL_SECONDS`。
```bash
cd src/agent/tests
./scripts/00_e2e_test.sh
```
The scripts provision configs/health directories under `tests/private/` and clean up via `07_down.sh`.
测试脚本会自动调用 `../scripts/build_binary.sh` 生成可执行文件,并在 `tests/private/` 下准备配置与健康目录,最后通过 `07_down.sh` 清理环境。

View File

@ -27,7 +27,7 @@ def collect_metadata(config: AgentConfig) -> Dict[str, Any]:
"instance": instance,
"cpu_number": _detect_cpu_count(),
"memory_in_bytes": _detect_memory_bytes(),
"gpu_number": _detect_gpu_count(config),
"gpu_number": _detect_gpu_count(),
}
return meta
@ -70,11 +70,8 @@ def _detect_memory_bytes() -> int:
return 0
def _detect_gpu_count(config: AgentConfig) -> int:
"""采集 GPU 数量,可被配置覆盖。"""
if config.gpu_number_override is not None:
return config.gpu_number_override
def _detect_gpu_count() -> int:
"""采集 GPU 数量,如无法探测则默认为 0。"""
try:
proc = subprocess.run(
["nvidia-smi", "-L"],

View File

@ -1,8 +1,14 @@
from __future__ import annotations
import os
import socket
from dataclasses import dataclass
from pathlib import Path
from typing import Final
from .version import VERSION
DEFAULT_REPORT_INTERVAL_SECONDS: Final[int] = 60
@dataclass(frozen=True)
@ -12,90 +18,57 @@ class AgentConfig:
version: str
master_endpoint: str
report_interval_seconds: int
health_dir_template: str
gpu_number_override: int | None
health_dir: str
request_timeout_seconds: int = 10
@property
def health_dir(self) -> str:
return self.health_dir_template.format(hostname=self.hostname)
def _normalise_master_endpoint(value: str) -> str:
value = value.strip()
if not value:
raise ValueError("MASTER_ENDPOINT environment variable is required")
if not value.startswith("http://") and not value.startswith("https://"):
value = f"http://{value}"
return value.rstrip("/")
def _parse_config_file(path: str) -> dict[str, str]:
result: dict[str, str] = {}
def _read_report_interval(raw_value: str | None) -> int:
if raw_value is None or raw_value.strip() == "":
return DEFAULT_REPORT_INTERVAL_SECONDS
try:
with open(path, "r", encoding="utf-8") as handle:
for raw_line in handle:
line = raw_line.strip()
if not line or line.startswith("#"):
continue
if "=" not in line:
continue
key, value = line.split("=", 1)
result[key.strip().upper()] = value.strip()
except FileNotFoundError:
raise FileNotFoundError(f"Agent config file not found: {path}") from None
return result
def load_config(path: str) -> AgentConfig:
"""读取配置文件并结合环境变量,返回 AgentConfig。"""
config_values = _parse_config_file(path)
force_env = os.environ.get("AGENT_FORCE_ENV", "0").lower() in {"1", "true", "yes"}
def read_key(key: str, default: str | None = None, *, required: bool = False) -> str:
env_key = f"AGENT_{key}"
if env_key in os.environ:
return os.environ[env_key]
if force_env and key in os.environ:
return os.environ[key]
if key in config_values:
return config_values[key]
if default is not None:
return default
if required:
raise ValueError(f"Missing required configuration key: {key}")
return ""
hostname = read_key("HOSTNAME", required=True)
node_file = read_key("NODE_FILE", f"/private/argus/agent/{hostname}/node.json")
version = read_key("VERSION", "1.0.0")
master_endpoint = read_key("MASTER_ENDPOINT", required=True)
report_interval_raw = read_key("REPORT_INTERVAL_SECONDS", "60")
health_dir_template = read_key(
"SUBMODULE_HEALTH_FILE_DIR",
f"/private/argus/agent/health/{{hostname}}/",
)
gpu_override_raw = read_key("GPU_NUMBER", "")
try:
report_interval_seconds = int(report_interval_raw)
interval = int(raw_value)
except ValueError as exc:
raise ValueError("REPORT_INTERVAL_SECONDS must be an integer") from exc
if report_interval_seconds <= 0:
if interval <= 0:
raise ValueError("REPORT_INTERVAL_SECONDS must be positive")
return interval
gpu_override = None
if gpu_override_raw:
try:
gpu_override = int(gpu_override_raw)
except ValueError as exc:
raise ValueError("GPU_NUMBER must be an integer when provided") from exc
if gpu_override < 0:
raise ValueError("GPU_NUMBER must be non-negative")
if not master_endpoint.startswith("http://") and not master_endpoint.startswith("https://"):
master_endpoint = f"http://{master_endpoint}"
def _resolve_hostname() -> str:
return os.environ.get("AGENT_HOSTNAME") or socket.gethostname()
def load_config() -> AgentConfig:
"""从环境变量推导配置,移除了外部配置文件依赖。"""
hostname = _resolve_hostname()
node_file = f"/private/argus/agent/{hostname}/node.json"
health_dir = f"/private/argus/agent/health/{hostname}/"
master_endpoint_env = os.environ.get("MASTER_ENDPOINT")
if master_endpoint_env is None:
raise ValueError("MASTER_ENDPOINT environment variable is not set")
master_endpoint = _normalise_master_endpoint(master_endpoint_env)
report_interval_seconds = _read_report_interval(os.environ.get("REPORT_INTERVAL_SECONDS"))
Path(node_file).parent.mkdir(parents=True, exist_ok=True)
Path(health_dir_template.format(hostname=hostname)).mkdir(parents=True, exist_ok=True)
Path(health_dir).mkdir(parents=True, exist_ok=True)
return AgentConfig(
hostname=hostname,
node_file=node_file,
version=version,
master_endpoint=master_endpoint.rstrip("/"),
version=VERSION,
master_endpoint=master_endpoint,
report_interval_seconds=report_interval_seconds,
health_dir_template=health_dir_template,
gpu_number_override=gpu_override,
health_dir=health_dir,
)

View File

@ -1,8 +1,6 @@
from __future__ import annotations
import argparse
import signal
import sys
import time
from datetime import datetime, timezone
from typing import Optional
@ -32,28 +30,15 @@ class StopSignal:
return self._stop
def parse_args(argv: list[str]) -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Argus agent")
parser.add_argument(
"--config",
dest="config_path",
default=None,
help="Path to agent config file",
)
return parser.parse_args(argv)
def main(argv: Optional[list[str]] = None) -> int:
def main(argv: Optional[list[str]] = None) -> int: # noqa: ARG001 - 保留签名以兼容入口调用
setup_logging()
args = parse_args(argv or sys.argv[1:])
stop_signal = StopSignal()
signal.signal(signal.SIGTERM, stop_signal.set)
signal.signal(signal.SIGINT, stop_signal.set)
try:
config_path = args.config_path or _default_config_path()
config = load_config(config_path)
config = load_config()
except Exception as exc:
LOGGER.error("Failed to load configuration", extra={"error": str(exc)})
return 1
@ -89,13 +74,6 @@ def main(argv: Optional[list[str]] = None) -> int:
return 0
def _default_config_path() -> str:
from socket import gethostname
hostname = gethostname()
return f"/private/argus/agent/{hostname}/config"
def _register_with_retry(
client: AgentClient,
config: AgentConfig,

69
src/agent/app/version.py Normal file
View File

@ -0,0 +1,69 @@
from __future__ import annotations
import os
import sys
from pathlib import Path
from typing import Optional
import importlib.metadata
try:
import tomllib
except ModuleNotFoundError: # pragma: no cover
import tomli as tomllib # type: ignore[no-redef]
def _candidate_paths() -> list[Path]:
paths = []
bundle_dir: Optional[str] = getattr(sys, "_MEIPASS", None)
if bundle_dir:
paths.append(Path(bundle_dir) / "pyproject.toml")
paths.append(Path(__file__).resolve().parent.parent / "pyproject.toml")
paths.append(Path(__file__).resolve().parent / "pyproject.toml")
paths.append(Path.cwd() / "pyproject.toml")
return paths
def _read_from_pyproject() -> Optional[str]:
    """Return the first version string found in a candidate ``pyproject.toml``.

    Each path from ``_candidate_paths()`` is tried in order. Missing,
    unreadable, or malformed files are skipped. Within a file,
    ``[project].version`` is preferred over ``[tool.argus].version``; only
    string values are accepted.

    Returns:
        The version string, or ``None`` when no candidate provides one.
    """
    for candidate in _candidate_paths():
        if not candidate.exists():
            continue
        try:
            with candidate.open("rb") as stream:
                document = tomllib.load(stream)
        except (OSError, tomllib.TOMLDecodeError):
            continue

        tool_table = document.get("tool")
        argus_table = tool_table.get("argus") if isinstance(tool_table, dict) else None

        # Check the standard [project] table first, then [tool.argus].
        for table in (document.get("project"), argus_table):
            if not isinstance(table, dict):
                continue
            found = table.get("version")
            if isinstance(found, str):
                return found
    return None
def _detect_version() -> str:
try:
return importlib.metadata.version("argus-agent")
except importlib.metadata.PackageNotFoundError:
pass
override = os.environ.get("AGENT_VERSION_OVERRIDE")
if override:
return override
fallback = _read_from_pyproject()
if fallback:
return fallback
return "0.0.0"
VERSION: str = _detect_version()
def get_version() -> str:
    """Return the module-level ``VERSION`` string."""
    return VERSION

BIN
src/agent/dist/argus-agent vendored Executable file

Binary file not shown.

10
src/agent/entry.py Normal file
View File

@ -0,0 +1,10 @@
#!/usr/bin/env python3
from __future__ import annotations
import sys
from app.main import main as agent_main
if __name__ == "__main__":
    # Use the agent's return value as the process exit status.
    raise SystemExit(agent_main())

19
src/agent/pyproject.toml Normal file
View File

@ -0,0 +1,19 @@
[project]
name = "argus-agent"
version = "1.1.0"
description = "Argus agent binary"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"requests==2.31.0"
]
[build-system]
requires = ["setuptools>=69", "wheel"]
build-backend = "setuptools.build_meta"
[tool.argus]
entry = "app.main:main"
[tool.setuptools]
packages = ["app"]

View File

@ -1 +0,0 @@
requests==2.31.0

View File

@ -0,0 +1,42 @@
#!/usr/bin/env bash
# Build the agent into a single-file executable with PyInstaller.
#
# Layout (all relative to the module root, i.e. the parent of this script's
# directory):
#   build/venv         - virtualenv holding the agent package and PyInstaller
#   build/pyinstaller  - PyInstaller work/spec directories (wiped each run)
#   dist/argus-agent   - resulting executable
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
MODULE_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
BUILD_ROOT="$MODULE_ROOT/build"
DIST_DIR="$MODULE_ROOT/dist"
PYINSTALLER_BUILD="$BUILD_ROOT/pyinstaller"
VENV_DIR="$BUILD_ROOT/venv"

mkdir -p "$PYINSTALLER_BUILD"
mkdir -p "$DIST_DIR"

# Create the build virtualenv on first use; later runs reuse it.
if [[ ! -d "$VENV_DIR" ]]; then
    python3 -m venv "$VENV_DIR"
fi

# shellcheck disable=SC1091
source "$VENV_DIR/bin/activate"

# Invoke pip through the venv interpreter so the packages land in the venv
# and pip can safely upgrade itself.
python -m pip install --upgrade pip
python -m pip install "$MODULE_ROOT"
python -m pip install "pyinstaller==6.6.0"

# Remove leftovers from previous builds. ":?" aborts the script if the
# variable is ever empty instead of letting the glob expand to "/*".
rm -rf "${PYINSTALLER_BUILD:?}"/*
rm -f "$DIST_DIR/argus-agent"

# pyproject.toml is added at the bundle root via --add-data.
pyinstaller \
    --clean \
    --onefile \
    --name argus-agent \
    --distpath "$DIST_DIR" \
    --workpath "$PYINSTALLER_BUILD/work" \
    --specpath "$PYINSTALLER_BUILD/spec" \
    --add-data "$MODULE_ROOT/pyproject.toml:." \
    "$MODULE_ROOT/entry.py"

chmod +x "$DIST_DIR/argus-agent"

deactivate

echo "[INFO] Agent binary generated at $DIST_DIR/argus-agent"

View File

@ -1,39 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail
usage() {
echo "Usage: $0 [--intranet] [--tag <image_tag>]" >&2
}
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
IMAGE_TAG="${IMAGE_TAG:-argus-agent:dev}"
BUILD_ARGS=()
while [[ "$#" -gt 0 ]]; do
case "$1" in
--intranet)
INTRANET_INDEX="${INTRANET_INDEX:-https://pypi.tuna.tsinghua.edu.cn/simple}"
BUILD_ARGS+=("--build-arg" "PIP_INDEX_URL=${INTRANET_INDEX}")
shift
;;
--tag)
[[ $# -ge 2 ]] || { usage; exit 1; }
IMAGE_TAG="$2"
shift 2
;;
-h|--help)
usage
exit 0
;;
*)
echo "Unknown option: $1" >&2
usage
exit 1
;;
esac
done
echo "[INFO] Building image $IMAGE_TAG"
docker build "${BUILD_ARGS[@]}" -t "$IMAGE_TAG" "$PROJECT_ROOT"
echo "[OK] Image $IMAGE_TAG built"

View File

@ -1,39 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail
usage() {
echo "Usage: $0 [--file <tar_path>]" >&2
}
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
DEFAULT_INPUT="$PROJECT_ROOT/images/argus-agent-dev.tar"
IMAGE_TAR="$DEFAULT_INPUT"
while [[ "$#" -gt 0 ]]; do
case "$1" in
--file)
[[ $# -ge 2 ]] || { usage; exit 1; }
IMAGE_TAR="$2"
shift 2
;;
-h|--help)
usage
exit 0
;;
*)
echo "Unknown option: $1" >&2
usage
exit 1
;;
esac
done
if [[ ! -f "$IMAGE_TAR" ]]; then
echo "[ERROR] Image tarball not found: $IMAGE_TAR" >&2
exit 1
fi
echo "[INFO] Loading image from $IMAGE_TAR"
docker image load -i "$IMAGE_TAR"
echo "[OK] Image loaded"

View File

@ -1,41 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail
usage() {
echo "Usage: $0 [--tag <image_tag>] [--output <tar_path>]" >&2
}
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
DEFAULT_OUTPUT="$PROJECT_ROOT/images/argus-agent-dev.tar"
IMAGE_TAG="${IMAGE_TAG:-argus-agent:dev}"
OUTPUT_PATH="$DEFAULT_OUTPUT"
while [[ "$#" -gt 0 ]]; do
case "$1" in
--tag)
[[ $# -ge 2 ]] || { usage; exit 1; }
IMAGE_TAG="$2"
shift 2
;;
--output)
[[ $# -ge 2 ]] || { usage; exit 1; }
OUTPUT_PATH="$2"
shift 2
;;
-h|--help)
usage
exit 0
;;
*)
echo "Unknown option: $1" >&2
usage
exit 1
;;
esac
done
mkdir -p "$(dirname "$OUTPUT_PATH")"
echo "[INFO] Saving image $IMAGE_TAG to $OUTPUT_PATH"
docker image save "$IMAGE_TAG" -o "$OUTPUT_PATH"
echo "[OK] Image saved"

View File

@ -13,14 +13,20 @@ services:
- ./private/argus/metric/prometheus:/private/argus/metric/prometheus
agent:
image: argus-agent:dev
image: ubuntu:24.04
container_name: argus-agent-e2e
hostname: dev-e2euser-e2einst-pod-0
depends_on:
- master
environment:
- MASTER_ENDPOINT=http://master:3000
- REPORT_INTERVAL_SECONDS=2
volumes:
- ./private/argus/agent/dev-e2euser-e2einst-pod-0:/private/argus/agent/dev-e2euser-e2einst-pod-0
- ./private/argus/agent/health/dev-e2euser-e2einst-pod-0:/private/argus/agent/health/dev-e2euser-e2einst-pod-0
- ../dist/argus-agent:/usr/local/bin/argus-agent:ro
entrypoint:
- /usr/local/bin/argus-agent
networks:
default:

View File

@ -20,25 +20,23 @@ mkdir -p "$MASTER_PRIVATE_DIR"
mkdir -p "$METRIC_PRIVATE_DIR"
mkdir -p "$TMP_ROOT"
cat > "$AGENT_CONFIG_DIR/config" <<CONFIG
HOSTNAME=$AGENT_HOSTNAME
NODE_FILE=/private/argus/agent/$AGENT_HOSTNAME/node.json
VERSION=1.1.0
MASTER_ENDPOINT=http://master:3000
REPORT_INTERVAL_SECONDS=2
SUBMODULE_HEALTH_FILE_DIR=/private/argus/agent/health/{hostname}/
GPU_NUMBER=
IP_OVERRIDE=
CONFIG
touch "$AGENT_HEALTH_DIR/.keep"
pushd "$MASTER_ROOT" >/dev/null
./scripts/build_images.sh --tag argus-master:dev
popd >/dev/null
AGENT_BINARY="$AGENT_ROOT/dist/argus-agent"
pushd "$AGENT_ROOT" >/dev/null
./scripts/build_images.sh --tag argus-agent:dev
./scripts/build_binary.sh
popd >/dev/null
if [[ ! -x "$AGENT_BINARY" ]]; then
echo "[ERROR] Agent binary not found at $AGENT_BINARY" >&2
exit 1
fi
echo "$AGENT_BINARY" > "$TMP_ROOT/agent_binary_path"
echo "[INFO] Agent E2E bootstrap complete"

View File

@ -4,6 +4,19 @@ set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
TMP_ROOT="$TEST_ROOT/tmp"
if [[ ! -f "$TMP_ROOT/agent_binary_path" ]]; then
echo "[ERROR] Agent binary path missing; run 01_bootstrap.sh first" >&2
exit 1
fi
AGENT_BINARY="$(cat "$TMP_ROOT/agent_binary_path")"
if [[ ! -x "$AGENT_BINARY" ]]; then
echo "[ERROR] Agent binary not executable: $AGENT_BINARY" >&2
exit 1
fi
compose() {
if docker compose version >/dev/null 2>&1; then
docker compose "$@"

View File

@ -10,6 +10,17 @@ AGENT_HOSTNAME="dev-e2euser-e2einst-pod-0"
NETWORK_NAME="tests_default"
NEW_AGENT_IP="172.28.0.200"
if [[ ! -f "$TMP_ROOT/agent_binary_path" ]]; then
echo "[ERROR] Agent binary path missing; rerun bootstrap" >&2
exit 1
fi
AGENT_BINARY="$(cat "$TMP_ROOT/agent_binary_path")"
if [[ ! -x "$AGENT_BINARY" ]]; then
echo "[ERROR] Agent binary not executable: $AGENT_BINARY" >&2
exit 1
fi
compose() {
if docker compose version >/dev/null 2>&1; then
docker compose "$@"
@ -57,14 +68,17 @@ if ! docker run -d \
--ip "$NEW_AGENT_IP" \
-v "$AGENT_DIR:/private/argus/agent/$AGENT_HOSTNAME" \
-v "$HEALTH_DIR:/private/argus/agent/health/$AGENT_HOSTNAME" \
argus-agent:dev \
-v "$AGENT_BINARY:/usr/local/bin/argus-agent:ro" \
-e MASTER_ENDPOINT=http://master:3000 \
-e REPORT_INTERVAL_SECONDS=2 \
ubuntu:24.04 \
sleep 300 >/dev/null; then
echo "[ERROR] Failed to start agent container with custom IP" >&2
exit 1
fi
# 在容器内启动真实 agent 进程
if ! docker exec -d argus-agent-e2e python -m app.main; then
if ! docker exec -d argus-agent-e2e /usr/local/bin/argus-agent; then
echo "[ERROR] Failed to spawn agent process inside container" >&2
exit 1
fi