Compare commits
2 Commits
8e01264e3f
...
824eddde67
| Author | SHA1 | Date | |
|---|---|---|---|
| 824eddde67 | |||
| d036da2d5e |
@ -12,7 +12,7 @@ Options:
|
||||
--master-offline Build master offline image (requires src/master/offline_wheels.tar.gz)
|
||||
--metric Build metric module images (ftp, prometheus, grafana, test nodes)
|
||||
--no-cache Build all images without using Docker layer cache
|
||||
--only LIST Comma-separated targets to build: core,master,metric,web,alert,all
|
||||
--only LIST Comma-separated targets to build: core,master,metric,web,alert,sys,all
|
||||
-h, --help Show this help message
|
||||
|
||||
Examples:
|
||||
@ -31,6 +31,7 @@ build_master_offline=false
|
||||
build_metric=true
|
||||
build_web=true
|
||||
build_alert=true
|
||||
build_sys=true
|
||||
no_cache=false
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
@ -62,7 +63,7 @@ while [[ $# -gt 0 ]]; do
|
||||
fi
|
||||
sel="$2"; shift 2
|
||||
# reset all, then enable selected
|
||||
build_core=false; build_master=false; build_metric=false; build_web=false; build_alert=false
|
||||
build_core=false; build_master=false; build_metric=false; build_web=false; build_alert=false; build_sys=false
|
||||
IFS=',' read -ra parts <<< "$sel"
|
||||
for p in "${parts[@]}"; do
|
||||
case "$p" in
|
||||
@ -71,7 +72,8 @@ while [[ $# -gt 0 ]]; do
|
||||
metric) build_metric=true ;;
|
||||
web) build_web=true ;;
|
||||
alert) build_alert=true ;;
|
||||
all) build_core=true; build_master=true; build_metric=true; build_web=true; build_alert=true ;;
|
||||
sys) build_sys=true ;;
|
||||
all) build_core=true; build_master=true; build_metric=true; build_web=true; build_alert=true; build_sys=true ;;
|
||||
*) echo "Unknown --only target: $p" >&2; exit 1 ;;
|
||||
esac
|
||||
done
|
||||
@ -286,6 +288,42 @@ if [[ "$build_metric" == true ]]; then
|
||||
done
|
||||
fi
|
||||
|
||||
# =======================================
|
||||
# Sys (system tests) node images
|
||||
# =======================================
|
||||
|
||||
if [[ "$build_sys" == true ]]; then
|
||||
echo ""
|
||||
echo "Building Sys node images..."
|
||||
|
||||
sys_base_images=(
|
||||
"ubuntu:22.04"
|
||||
"nvidia/cuda:12.2.2-runtime-ubuntu22.04"
|
||||
)
|
||||
|
||||
for base_image in "${sys_base_images[@]}"; do
|
||||
if ! pull_base_image "$base_image"; then
|
||||
build_failed=true
|
||||
fi
|
||||
done
|
||||
|
||||
sys_builds=(
|
||||
"Sys Node|src/sys/build/node/Dockerfile|argus-sys-node:latest|."
|
||||
"Sys Metric Test Node|src/sys/build/test-node/Dockerfile|argus-sys-metric-test-node:latest|."
|
||||
"Sys Metric Test GPU Node|src/sys/build/test-gpu-node/Dockerfile|argus-sys-metric-test-gpu-node:latest|."
|
||||
)
|
||||
|
||||
for build_spec in "${sys_builds[@]}"; do
|
||||
IFS='|' read -r image_label dockerfile_path image_tag build_context <<< "$build_spec"
|
||||
if build_image "$image_label" "$dockerfile_path" "$image_tag" "$build_context"; then
|
||||
images_built+=("$image_tag")
|
||||
else
|
||||
build_failed=true
|
||||
fi
|
||||
echo ""
|
||||
done
|
||||
fi
|
||||
|
||||
# =======================================
|
||||
# Web & Alert module images
|
||||
# =======================================
|
||||
|
||||
@ -1 +1 @@
|
||||
1.33.0
|
||||
1.35.0
|
||||
|
||||
@ -581,6 +581,372 @@
|
||||
],
|
||||
"title": "Node Process Count",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "GPU Utilization (%)",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 80
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 95
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 24
|
||||
},
|
||||
"id": 301,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "DCGM_FI_DEV_GPU_UTIL{hostname=~\"$hostname\"}",
|
||||
"legendFormat": "{{hostname}} GPU{{gpu}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "GPU 利用率 (单卡)",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisBorderShow": true,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "Memory Used (%)",
|
||||
"axisPlacement": "left",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green"
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 80
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 95
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 24
|
||||
},
|
||||
"id": 403,
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "round(DCGM_FI_DEV_FB_USED{hostname=~\"$hostname\"} / (DCGM_FI_DEV_FB_USED{hostname=~\"$hostname\"} + DCGM_FI_DEV_FB_FREE{hostname=~\"$hostname\"}) * 100)",
|
||||
"legendFormat": "{{hostname}} GPU{{gpu}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "GPU 显存使用率 (单卡)",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisBorderShow": true,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "Temperature (℃)",
|
||||
"axisPlacement": "left",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 75
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 85
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "celsius"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 32
|
||||
},
|
||||
"id": 501,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "DCGM_FI_DEV_GPU_TEMP{hostname=~\"$hostname\"}",
|
||||
"legendFormat": "{{hostname}} GPU{{gpu}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "GPU 温度(单卡)",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisBorderShow": true,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "Power (W)",
|
||||
"axisPlacement": "left",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": {
|
||||
"legend": false,
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": {
|
||||
"type": "linear"
|
||||
},
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": {
|
||||
"group": "A",
|
||||
"mode": "none"
|
||||
},
|
||||
"thresholdsStyle": {
|
||||
"mode": "off"
|
||||
}
|
||||
},
|
||||
"mappings": [],
|
||||
"max": 300,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 200
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 300
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "watt"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 32
|
||||
},
|
||||
"id": 502,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "DCGM_FI_DEV_POWER_USAGE{hostname=~\"$hostname\"}",
|
||||
"legendFormat": "{{hostname}} GPU{{gpu}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "GPU 功率 (单卡)",
|
||||
"type": "timeseries"
|
||||
}
|
||||
],
|
||||
"refresh": "15s",
|
||||
@ -589,11 +955,6 @@
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {
|
||||
"selected": true,
|
||||
"text": "node-exporter-A1",
|
||||
"value": "node-exporter-A1"
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus"
|
||||
},
|
||||
@ -623,7 +984,7 @@
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "Node and GPU Metrics",
|
||||
"uid": "node_gpu_metrics",
|
||||
"title": "Node and GPU Metrics (by hostname)",
|
||||
"uid": "node_gpu_metrics_by_hostname",
|
||||
"weekStart": ""
|
||||
}
|
||||
}
|
||||
|
||||
@ -622,7 +622,7 @@
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "Node and GPU Metrics",
|
||||
"uid": "node_gpu_metrics",
|
||||
"title": "Node and GPU Metrics (by instance)",
|
||||
"uid": "node_gpu_metrics_by_instance",
|
||||
"weekStart": ""
|
||||
}
|
||||
}
|
||||
|
||||
36
src/sys/build/node/Dockerfile
Normal file
36
src/sys/build/node/Dockerfile
Normal file
@ -0,0 +1,36 @@
|
||||
FROM ubuntu:22.04
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive \
|
||||
TZ=Asia/Shanghai
|
||||
|
||||
ARG USE_INTRANET=false
|
||||
ARG ARGUS_BUILD_UID=2133
|
||||
ARG ARGUS_BUILD_GID=2015
|
||||
|
||||
ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
|
||||
ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
|
||||
|
||||
# Optional: switch to intranet apt mirrors during build
|
||||
RUN if [ "$USE_INTRANET" = "true" ]; then \
|
||||
echo "Configuring intranet apt sources..." && \
|
||||
cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
|
||||
echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
|
||||
echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \
|
||||
echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \
|
||||
fi
|
||||
|
||||
# Install base tools and all libs that Fluent Bit may require at runtime
|
||||
# so that start-fluent-bit.sh will NOT fallback to apt during container start.
|
||||
RUN set -eux; \
|
||||
apt-get update; \
|
||||
apt-get install -y --no-install-recommends \
|
||||
ca-certificates tzdata \
|
||||
procps iproute2 net-tools lsof \
|
||||
libpq5 libyaml-0-2 libsasl2-2 libldap-2.5-0; \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Keep root; compose provides entrypoint via bind mount
|
||||
USER root
|
||||
|
||||
CMD ["bash", "-lc", "sleep infinity"]
|
||||
|
||||
34
src/sys/build/test-gpu-node/Dockerfile
Normal file
34
src/sys/build/test-gpu-node/Dockerfile
Normal file
@ -0,0 +1,34 @@
|
||||
FROM nvidia/cuda:12.2.2-runtime-ubuntu22.04
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive \
|
||||
TZ=Asia/Shanghai \
|
||||
NVIDIA_VISIBLE_DEVICES=all \
|
||||
NVIDIA_DRIVER_CAPABILITIES=compute,utility
|
||||
|
||||
ARG USE_INTRANET=false
|
||||
ARG ARGUS_BUILD_UID=2133
|
||||
ARG ARGUS_BUILD_GID=2015
|
||||
|
||||
ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
|
||||
ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
|
||||
|
||||
# Optional intranet mirror for build-time apt
|
||||
RUN if [ "$USE_INTRANET" = "true" ]; then \
|
||||
echo "Configuring intranet apt sources..." && \
|
||||
cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
|
||||
echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
|
||||
echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \
|
||||
echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \
|
||||
fi
|
||||
|
||||
# Pre-install curl and diagnostics to avoid runtime apt installs in GPU test node
|
||||
RUN set -eux; \
|
||||
apt-get update; \
|
||||
apt-get install -y --no-install-recommends \
|
||||
curl ca-certificates tzdata \
|
||||
procps iproute2 net-tools lsof; \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
USER root
|
||||
CMD ["bash", "-lc", "sleep infinity"]
|
||||
|
||||
32
src/sys/build/test-node/Dockerfile
Normal file
32
src/sys/build/test-node/Dockerfile
Normal file
@ -0,0 +1,32 @@
|
||||
FROM ubuntu:22.04
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive \
|
||||
TZ=Asia/Shanghai
|
||||
|
||||
ARG USE_INTRANET=false
|
||||
ARG ARGUS_BUILD_UID=2133
|
||||
ARG ARGUS_BUILD_GID=2015
|
||||
|
||||
ENV ARGUS_BUILD_UID=${ARGUS_BUILD_UID} \
|
||||
ARGUS_BUILD_GID=${ARGUS_BUILD_GID}
|
||||
|
||||
# Optional intranet mirror for build-time apt
|
||||
RUN if [ "$USE_INTRANET" = "true" ]; then \
|
||||
echo "Configuring intranet apt sources..." && \
|
||||
cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
|
||||
echo "deb [trusted=yes] http://10.68.64.1/ubuntu2204/ jammy main" > /etc/apt/sources.list && \
|
||||
echo 'Acquire::https::Verify-Peer "false";' > /etc/apt/apt.conf.d/99disable-ssl-check && \
|
||||
echo 'Acquire::https::Verify-Host "false";' >> /etc/apt/apt.conf.d/99disable-ssl-check; \
|
||||
fi
|
||||
|
||||
# Pre-install curl and common diagnostics to avoid runtime apt installs
|
||||
RUN set -eux; \
|
||||
apt-get update; \
|
||||
apt-get install -y --no-install-recommends \
|
||||
curl ca-certificates tzdata \
|
||||
procps iproute2 net-tools lsof; \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
USER root
|
||||
CMD ["bash", "-lc", "sleep infinity"]
|
||||
|
||||
@ -78,7 +78,7 @@ services:
|
||||
ipv4_address: 172.31.0.4
|
||||
|
||||
node-a:
|
||||
image: ubuntu:22.04
|
||||
image: argus-sys-node:latest
|
||||
container_name: argus-node-a
|
||||
hostname: dev-yyrshare-nbnyx10-cp2f-pod-0
|
||||
depends_on:
|
||||
@ -113,7 +113,7 @@ services:
|
||||
- sysnet
|
||||
|
||||
node-b:
|
||||
image: ubuntu:22.04
|
||||
image: argus-sys-node:latest
|
||||
container_name: argus-node-b
|
||||
hostname: dev-yyrshare-uuuu10-ep2f-pod-0
|
||||
depends_on:
|
||||
@ -260,7 +260,7 @@ services:
|
||||
max-file: "3"
|
||||
|
||||
test-node:
|
||||
image: ubuntu:22.04
|
||||
image: argus-sys-metric-test-node:latest
|
||||
container_name: argus-metric-test-node
|
||||
hostname: test-metric-node-001
|
||||
restart: unless-stopped
|
||||
@ -303,7 +303,7 @@ services:
|
||||
|
||||
test-gpu-node:
|
||||
profiles: ["gpu"]
|
||||
image: nvidia/cuda:12.2.2-runtime-ubuntu22.04
|
||||
image: argus-sys-metric-test-gpu-node:latest
|
||||
container_name: argus-metric-test-gpu-node
|
||||
hostname: test-metric-gpu-node-001
|
||||
restart: unless-stopped
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user