Compare commits

...

2 Commits

Author SHA1 Message Date
824eddde67 [#31] grafana 面板uid冲突问题解决 2025-10-28 17:13:42 +08:00
d036da2d5e [#34] 增加sys/tests节点镜像构建 2025-10-28 17:12:42 +08:00
8 changed files with 520 additions and 19 deletions

View File

@ -12,7 +12,7 @@ Options:
--master-offline Build master offline image (requires src/master/offline_wheels.tar.gz)
--metric Build metric module images (ftp, prometheus, grafana, test nodes)
--no-cache Build all images without using Docker layer cache
--only LIST Comma-separated targets to build: core,master,metric,web,alert,all
--only LIST Comma-separated targets to build: core,master,metric,web,alert,sys,all
-h, --help Show this help message
Examples:
@ -31,6 +31,7 @@ build_master_offline=false
build_metric=true
build_web=true
build_alert=true
build_sys=true
no_cache=false
while [[ $# -gt 0 ]]; do
@ -62,7 +63,7 @@ while [[ $# -gt 0 ]]; do
fi
sel="$2"; shift 2
# reset all, then enable selected
build_core=false; build_master=false; build_metric=false; build_web=false; build_alert=false
build_core=false; build_master=false; build_metric=false; build_web=false; build_alert=false; build_sys=false
IFS=',' read -ra parts <<< "$sel"
for p in "${parts[@]}"; do
case "$p" in
@ -71,7 +72,8 @@ while [[ $# -gt 0 ]]; do
metric) build_metric=true ;;
web) build_web=true ;;
alert) build_alert=true ;;
all) build_core=true; build_master=true; build_metric=true; build_web=true; build_alert=true ;;
sys) build_sys=true ;;
all) build_core=true; build_master=true; build_metric=true; build_web=true; build_alert=true; build_sys=true ;;
*) echo "Unknown --only target: $p" >&2; exit 1 ;;
esac
done
@ -286,6 +288,42 @@ if [[ "$build_metric" == true ]]; then
done
fi
# =======================================
# Sys (system tests) node images
# =======================================
# Runs only when build_sys is true. A failed pull or build is recorded in
# build_failed and the loops keep going; the tag of every image that builds
# successfully is appended to images_built.
if [[ "$build_sys" == true ]]; then
  echo ""
  echo "Building Sys node images..."

  # Base images to pull before any sys image is built.
  sys_base_images=("ubuntu:22.04" "nvidia/cuda:12.2.2-runtime-ubuntu22.04")
  for base_image in "${sys_base_images[@]}"; do
    pull_base_image "$base_image" || build_failed=true
  done

  # One entry per image, fields separated by '|':
  #   label | Dockerfile path | image tag | build context
  sys_builds=()
  sys_builds+=("Sys Node|src/sys/build/node/Dockerfile|argus-sys-node:latest|.")
  sys_builds+=("Sys Metric Test Node|src/sys/build/test-node/Dockerfile|argus-sys-metric-test-node:latest|.")
  sys_builds+=("Sys Metric Test GPU Node|src/sys/build/test-gpu-node/Dockerfile|argus-sys-metric-test-gpu-node:latest|.")

  for build_spec in "${sys_builds[@]}"; do
    # IFS is overridden for this read only, splitting the spec into its fields.
    IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec"
    if ! build_image "$image_label" "$dockerfile_path" "$image_tag" "$build_context"; then
      build_failed=true
    else
      images_built+=("$image_tag")
    fi
    echo ""
  done
fi
# =======================================
# Web & Alert module images
# =======================================

View File

@ -1 +1 @@
1.33.0
1.35.0

View File

@ -581,6 +581,372 @@
],
"title": "Node Process Count",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "GPU Utilization (%)",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "orange",
"value": 80
},
{
"color": "red",
"value": 95
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 24
},
"id": 301,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"expr": "DCGM_FI_DEV_GPU_UTIL{hostname=~\"$hostname\"}",
"legendFormat": "{{hostname}} GPU{{gpu}}",
"refId": "A"
}
],
"title": "GPU 利用率 (单卡)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": true,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "Memory Used (%)",
"axisPlacement": "left",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
}
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "orange",
"value": 80
},
{
"color": "red",
"value": 95
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 24
},
"id": 403,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"expr": "round(DCGM_FI_DEV_FB_USED{hostname=~\"$hostname\"} / (DCGM_FI_DEV_FB_USED{hostname=~\"$hostname\"} + DCGM_FI_DEV_FB_FREE{hostname=~\"$hostname\"}) * 100)",
"legendFormat": "{{hostname}} GPU{{gpu}}",
"refId": "A"
}
],
"title": "GPU 显存使用率 (单卡)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": true,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "Temperature (℃)",
"axisPlacement": "left",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 85
}
]
},
"unit": "celsius"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 32
},
"id": 501,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"expr": "DCGM_FI_DEV_GPU_TEMP{hostname=~\"$hostname\"}",
"legendFormat": "{{hostname}} GPU{{gpu}}",
"refId": "A"
}
],
"title": "GPU 温度 (单卡)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": true,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "Power (W)",
"axisPlacement": "left",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"max": 300,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "orange",
"value": 200
},
{
"color": "red",
"value": 300
}
]
},
"unit": "watt"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 32
},
"id": 502,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"expr": "DCGM_FI_DEV_POWER_USAGE{hostname=~\"$hostname\"}",
"legendFormat": "{{hostname}} GPU{{gpu}}",
"refId": "A"
}
],
"title": "GPU 功率 (单卡)",
"type": "timeseries"
}
],
"refresh": "15s",
@ -589,11 +955,6 @@
"templating": {
"list": [
{
"current": {
"selected": true,
"text": "node-exporter-A1",
"value": "node-exporter-A1"
},
"datasource": {
"type": "prometheus"
},
@ -623,7 +984,7 @@
},
"timepicker": {},
"timezone": "",
"title": "Node and GPU Metrics",
"uid": "node_gpu_metrics",
"title": "Node and GPU Metrics (by hostname)",
"uid": "node_gpu_metrics_by_hostname",
"weekStart": ""
}
}

View File

@ -622,7 +622,7 @@
},
"timepicker": {},
"timezone": "",
"title": "Node and GPU Metrics",
"uid": "node_gpu_metrics",
"title": "Node and GPU Metrics (by instance)",
"uid": "node_gpu_metrics_by_instance",
"weekStart": ""
}
}

View File

@ -0,0 +1,36 @@
FROM ubuntu:22.04

# Non-interactive apt front end and a fixed time zone for the image.
ENV DEBIAN_FRONTEND=noninteractive \
    TZ=Asia/Shanghai

# Build-time knobs: intranet mirror toggle and the build UID/GID.
ARG USE_INTRANET=false
ARG ARGUS_BUILD_UID=2133
ARG ARGUS_BUILD_GID=2015

# Expose the build UID/GID as environment variables in the image.
ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
    ARGUS_BUILD_GID=${ARGUS_BUILD_GID}

# When USE_INTRANET=true, back up sources.list, point apt at the intranet
# mirror and write an apt config that turns off HTTPS peer/host verification.
RUN set -e; \
    if [ "${USE_INTRANET}" = "true" ]; then \
      echo "Configuring intranet apt sources..."; \
      cp /etc/apt/sources.list /etc/apt/sources.list.bak; \
      echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list; \
      { \
        echo 'Acquire::https::Verify-Peer "false";'; \
        echo 'Acquire::https::Verify-Host "false";'; \
      } > /etc/apt/apt.conf.d/99disable-ssl-check; \
    fi

# Install the base tools plus the shared libraries Fluent Bit may need at
# runtime, so that start-fluent-bit.sh does not have to fall back to apt when
# the container starts. The apt package lists are removed afterwards.
RUN set -eux \
    && apt-get update \
    && apt-get install -y --no-install-recommends \
         ca-certificates tzdata \
         procps iproute2 net-tools lsof \
         libpq5 libyaml-0-2 libsasl2-2 libldap-2.5-0 \
    && rm -rf /var/lib/apt/lists/*

# Stay as root; the compose setup provides the entrypoint via a bind mount,
# so the default command only keeps the container alive.
USER root
CMD ["bash", "-lc", "sleep infinity"]

View File

@ -0,0 +1,34 @@
FROM nvidia/cuda:12.2.2-runtime-ubuntu22.04

# Non-interactive apt, a fixed time zone, and the NVIDIA runtime variables
# (all devices visible; compute and utility capabilities).
ENV DEBIAN_FRONTEND=noninteractive \
    TZ=Asia/Shanghai \
    NVIDIA_VISIBLE_DEVICES=all \
    NVIDIA_DRIVER_CAPABILITIES=compute,utility

# Build-time knobs: intranet mirror toggle and the build UID/GID.
ARG USE_INTRANET=false
ARG ARGUS_BUILD_UID=2133
ARG ARGUS_BUILD_GID=2015

# Expose the build UID/GID as environment variables in the image.
ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
    ARGUS_BUILD_GID=${ARGUS_BUILD_GID}

# Optional intranet mirror: when USE_INTRANET=true, back up sources.list,
# switch apt to the intranet mirror and write an apt config that turns off
# HTTPS peer/host verification.
RUN set -e; \
    if [ "${USE_INTRANET}" = "true" ]; then \
      echo "Configuring intranet apt sources..."; \
      cp /etc/apt/sources.list /etc/apt/sources.list.bak; \
      echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list; \
      { \
        echo 'Acquire::https::Verify-Peer "false";'; \
        echo 'Acquire::https::Verify-Host "false";'; \
      } > /etc/apt/apt.conf.d/99disable-ssl-check; \
    fi

# Bake curl and the diagnostic tools into the image so the GPU test node does
# not need to run apt at container start. The apt package lists are removed
# afterwards.
RUN set -eux \
    && apt-get update \
    && apt-get install -y --no-install-recommends \
         curl ca-certificates tzdata \
         procps iproute2 net-tools lsof \
    && rm -rf /var/lib/apt/lists/*

# Run as root; the default command only keeps the container alive.
USER root
CMD ["bash", "-lc", "sleep infinity"]

View File

@ -0,0 +1,32 @@
FROM ubuntu:22.04

# Non-interactive apt front end and a fixed time zone for the image.
ENV DEBIAN_FRONTEND=noninteractive \
    TZ=Asia/Shanghai

# Build-time knobs: intranet mirror toggle and the build UID/GID.
ARG USE_INTRANET=false
ARG ARGUS_BUILD_UID=2133
ARG ARGUS_BUILD_GID=2015

# Expose the build UID/GID as environment variables in the image.
ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
    ARGUS_BUILD_GID=${ARGUS_BUILD_GID}

# Optional intranet mirror: when USE_INTRANET=true, back up sources.list,
# switch apt to the intranet mirror and write an apt config that turns off
# HTTPS peer/host verification.
RUN set -e; \
    if [ "${USE_INTRANET}" = "true" ]; then \
      echo "Configuring intranet apt sources..."; \
      cp /etc/apt/sources.list /etc/apt/sources.list.bak; \
      echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list; \
      { \
        echo 'Acquire::https::Verify-Peer "false";'; \
        echo 'Acquire::https::Verify-Host "false";'; \
      } > /etc/apt/apt.conf.d/99disable-ssl-check; \
    fi

# Bake curl and the common diagnostic tools into the image so nothing has to
# be installed through apt at container start. The apt package lists are
# removed afterwards.
RUN set -eux \
    && apt-get update \
    && apt-get install -y --no-install-recommends \
         curl ca-certificates tzdata \
         procps iproute2 net-tools lsof \
    && rm -rf /var/lib/apt/lists/*

# Run as root; the default command only keeps the container alive.
USER root
CMD ["bash", "-lc", "sleep infinity"]

View File

@ -78,7 +78,7 @@ services:
ipv4_address: 172.31.0.4
node-a:
image: ubuntu:22.04
image: argus-sys-node:latest
container_name: argus-node-a
hostname: dev-yyrshare-nbnyx10-cp2f-pod-0
depends_on:
@ -113,7 +113,7 @@ services:
- sysnet
node-b:
image: ubuntu:22.04
image: argus-sys-node:latest
container_name: argus-node-b
hostname: dev-yyrshare-uuuu10-ep2f-pod-0
depends_on:
@ -260,7 +260,7 @@ services:
max-file: "3"
test-node:
image: ubuntu:22.04
image: argus-sys-metric-test-node:latest
container_name: argus-metric-test-node
hostname: test-metric-node-001
restart: unless-stopped
@ -303,7 +303,7 @@ services:
test-gpu-node:
profiles: ["gpu"]
image: nvidia/cuda:12.2.2-runtime-ubuntu22.04
image: argus-sys-metric-test-gpu-node:latest
container_name: argus-metric-test-gpu-node
hostname: test-metric-gpu-node-001
restart: unless-stopped