[#37] 修复log时间戳测试问题
This commit is contained in:
parent
d1fad4a05a
commit
0b9268332f
@@ -22,8 +22,7 @@
|
|||||||
[PARSER]
|
[PARSER]
|
||||||
Name timestamp_parser
|
Name timestamp_parser
|
||||||
Format regex
|
Format regex
|
||||||
Regex ^(?<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s+(?<level>\w+)\s+(?<message>.*)$
|
Regex ^(?<timestamp>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:Z|[+-]\d{2}:?\d{2}))\s+(?<level>\w+)\s+(?<message>.*)$
|
||||||
Time_Key timestamp
|
Time_Key timestamp
|
||||||
Time_Format %Y-%m-%d %H:%M:%S
|
Time_Format %Y-%m-%dT%H:%M:%S%z
|
||||||
Time_Offset +0800
|
|
||||||
Time_Keep On
|
Time_Keep On
|
||||||
|
|||||||
@@ -28,11 +28,11 @@ fi
|
|||||||
docker exec "$container_name" mkdir -p /logs/train /logs/infer
|
docker exec "$container_name" mkdir -p /logs/train /logs/infer
|
||||||
|
|
||||||
# 写入训练日志 (host01)
|
# 写入训练日志 (host01)
|
||||||
docker exec "$container_name" sh -c "printf '%s INFO [host01] training step=1 loss=1.23 model=bert\n' \"\$(date '+%F %T')\" >> /logs/train/train-demo.log"
|
docker exec "$container_name" sh -c "printf '%s INFO [host01] training step=1 loss=1.23 model=bert\n' \"\$(date -u +%Y-%m-%dT%H:%M:%SZ)\" >> /logs/train/train-demo.log"
|
||||||
docker exec "$container_name" sh -c "printf '%s INFO [host01] training step=2 loss=1.15 model=bert\n' \"\$(date '+%F %T')\" >> /logs/train/train-demo.log"
|
docker exec "$container_name" sh -c "printf '%s INFO [host01] training step=2 loss=1.15 model=bert\n' \"\$(date -u +%Y-%m-%dT%H:%M:%SZ)\" >> /logs/train/train-demo.log"
|
||||||
|
|
||||||
# 写入推理日志 (host01)
|
# 写入推理日志 (host01)
|
||||||
docker exec "$container_name" sh -c "printf '%s ERROR [host01] inference failed on batch=1\n' \"\$(date '+%F %T')\" >> /logs/infer/infer-demo.log"
|
docker exec "$container_name" sh -c "printf '%s ERROR [host01] inference failed on batch=1\n' \"\$(date -u +%Y-%m-%dT%H:%M:%SZ)\" >> /logs/infer/infer-demo.log"
|
||||||
docker exec "$container_name" sh -c "cat <<'STACK' >> /logs/infer/infer-demo.log
|
docker exec "$container_name" sh -c "cat <<'STACK' >> /logs/infer/infer-demo.log
|
||||||
Traceback (most recent call last):
|
Traceback (most recent call last):
|
||||||
File \"inference.py\", line 15, in <module>
|
File \"inference.py\", line 15, in <module>
|
||||||
|
|||||||
@@ -28,13 +28,13 @@ fi
|
|||||||
docker exec "$container_name" mkdir -p /logs/train /logs/infer
|
docker exec "$container_name" mkdir -p /logs/train /logs/infer
|
||||||
|
|
||||||
# 写入训练日志 (host02)
|
# 写入训练日志 (host02)
|
||||||
docker exec "$container_name" sh -c "printf '%s INFO [host02] training step=1 loss=1.45 model=gpt\n' \"\$(date '+%F %T')\" >> /logs/train/train-demo.log"
|
docker exec "$container_name" sh -c "printf '%s INFO [host02] training step=1 loss=1.45 model=gpt\n' \"\$(date -u +%Y-%m-%dT%H:%M:%SZ)\" >> /logs/train/train-demo.log"
|
||||||
docker exec "$container_name" sh -c "printf '%s INFO [host02] training step=2 loss=1.38 model=gpt\n' \"\$(date '+%F %T')\" >> /logs/train/train-demo.log"
|
docker exec "$container_name" sh -c "printf '%s INFO [host02] training step=2 loss=1.38 model=gpt\n' \"\$(date -u +%Y-%m-%dT%H:%M:%SZ)\" >> /logs/train/train-demo.log"
|
||||||
docker exec "$container_name" sh -c "printf '%s INFO [host02] training step=3 loss=1.32 model=gpt\n' \"\$(date '+%F %T')\" >> /logs/train/train-demo.log"
|
docker exec "$container_name" sh -c "printf '%s INFO [host02] training step=3 loss=1.32 model=gpt\n' \"\$(date -u +%Y-%m-%dT%H:%M:%SZ)\" >> /logs/train/train-demo.log"
|
||||||
|
|
||||||
# 写入推理日志 (host02)
|
# 写入推理日志 (host02)
|
||||||
docker exec "$container_name" sh -c "printf '%s WARN [host02] inference slow on batch=5 latency=2.3s\n' \"\$(date '+%F %T')\" >> /logs/infer/infer-demo.log"
|
docker exec "$container_name" sh -c "printf '%s WARN [host02] inference slow on batch=5 latency=2.3s\n' \"\$(date -u +%Y-%m-%dT%H:%M:%SZ)\" >> /logs/infer/infer-demo.log"
|
||||||
docker exec "$container_name" sh -c "printf '%s INFO [host02] inference completed batch=6 latency=0.8s\n' \"\$(date '+%F %T')\" >> /logs/infer/infer-demo.log"
|
docker exec "$container_name" sh -c "printf '%s INFO [host02] inference completed batch=6 latency=0.8s\n' \"\$(date -u +%Y-%m-%dT%H:%M:%SZ)\" >> /logs/infer/infer-demo.log"
|
||||||
|
|
||||||
echo "[OK] 已通过docker exec写入测试日志到 host02 容器内:"
|
echo "[OK] 已通过docker exec写入测试日志到 host02 容器内:"
|
||||||
echo " - /logs/train/train-demo.log"
|
echo " - /logs/train/train-demo.log"
|
||||||
|
|||||||
@@ -22,6 +22,6 @@
|
|||||||
[PARSER]
|
[PARSER]
|
||||||
Name timestamp_parser
|
Name timestamp_parser
|
||||||
Format regex
|
Format regex
|
||||||
Regex ^(?<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s+(?<level>\w+)\s+(?<message>.*)$
|
Regex ^(?<timestamp>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:Z|[+-]\d{2}:?\d{2}))\s+(?<level>\w+)\s+(?<message>.*)$
|
||||||
Time_Key timestamp
|
Time_Key timestamp
|
||||||
Time_Format %Y-%m-%d %H:%M:%S
|
Time_Format %Y-%m-%dT%H:%M:%S%z
|
||||||
|
|||||||
1
src/sys/build/node-bundle/bundle/.gitignore
vendored
1
src/sys/build/node-bundle/bundle/.gitignore
vendored
@@ -1,2 +1 @@
|
|||||||
|
|
||||||
argus-metric_*.tar.gz
|
argus-metric_*.tar.gz
|
||||||
|
|||||||
@@ -26,12 +26,9 @@ service_id() {
|
|||||||
send_logs() {
|
send_logs() {
|
||||||
local sid="$1"; local hosttag="$2"
|
local sid="$1"; local hosttag="$2"
|
||||||
docker exec "$sid" sh -lc 'mkdir -p /logs/train /logs/infer'
|
docker exec "$sid" sh -lc 'mkdir -p /logs/train /logs/infer'
|
||||||
docker exec "$sid" sh -lc "ts=\
|
docker exec "$sid" sh -lc "ts=\$(date -u +%Y-%m-%dT%H:%M:%SZ); echo \"\$ts INFO [$hosttag] training step=1 loss=1.23 model=bert\" >> /logs/train/train-demo.log"
|
||||||
\$(date '+%F %T'); echo \"\$ts INFO [$hosttag] training step=1 loss=1.23 model=bert\" >> /logs/train/train-demo.log"
|
docker exec "$sid" sh -lc "ts=\$(date -u +%Y-%m-%dT%H:%M:%SZ); echo \"\$ts INFO [$hosttag] training step=2 loss=1.10 model=bert\" >> /logs/train/train-demo.log"
|
||||||
docker exec "$sid" sh -lc "ts=\
|
docker exec "$sid" sh -lc "ts=\$(date -u +%Y-%m-%dT%H:%M:%SZ); echo \"\$ts WARN [$hosttag] inference slow on batch=2 latency=1.9s\" >> /logs/infer/infer-demo.log"
|
||||||
\$(date '+%F %T'); echo \"\$ts INFO [$hosttag] training step=2 loss=1.10 model=bert\" >> /logs/train/train-demo.log"
|
|
||||||
docker exec "$sid" sh -lc "ts=\
|
|
||||||
\$(date '+%F %T'); echo \"\$ts WARN [$hosttag] inference slow on batch=2 latency=1.9s\" >> /logs/infer/infer-demo.log"
|
|
||||||
}
|
}
|
||||||
|
|
||||||
CID_A="$(service_id node-a)"
|
CID_A="$(service_id node-a)"
|
||||||
|
|||||||
2
src/sys/swarm_tests/.gitignore
vendored
Normal file
2
src/sys/swarm_tests/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
|
||||||
|
private-*/
|
||||||
@@ -47,6 +47,16 @@ ensure_fluentbit() {
|
|||||||
if [ -f "$o" ] && (grep -q "$EH" "$o" || grep -q "$EP" "$o"); then
|
if [ -f "$o" ] && (grep -q "$EH" "$o" || grep -q "$EP" "$o"); then
|
||||||
sed -i "s|Host $EH|Host es.log.argus.com|g; s|Port $EP|Port 9200|g" "$o"
|
sed -i "s|Host $EH|Host es.log.argus.com|g; s|Port $EP|Port 9200|g" "$o"
|
||||||
fi
|
fi
|
||||||
|
# ensure parser supports ISO8601 with timezone
|
||||||
|
p=/etc/fluent-bit/parsers.conf
|
||||||
|
if [ -f "$p" ]; then
|
||||||
|
if grep -q "Time_Format %Y-%m-%d %H:%M:%S" "$p"; then
|
||||||
|
sed -i "s|Time_Format %Y-%m-%d %H:%M:%S|Time_Format %Y-%m-%dT%H:%M:%S%z|" "$p"
|
||||||
|
fi
|
||||||
|
if grep -q "Regex ^(?<timestamp>\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2})\\s+" "$p"; then
|
||||||
|
sed -i "s|Regex ^(?<timestamp>\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2})\\s+|Regex ^(?<timestamp>\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}(?:Z|[+-]\\d{2}:?\\d{2}))\\s+|" "$p"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
' >/dev/null 2>&1 || true
|
' >/dev/null 2>&1 || true
|
||||||
# 3) restart fluent-bit (best-effort) and wait
|
# 3) restart fluent-bit (best-effort) and wait
|
||||||
docker exec "$cname" bash -lc 'pkill -x fluent-bit >/dev/null 2>&1 || true; sleep 1; setsid su -s /bin/bash fluent-bit -c "/opt/fluent-bit/bin/fluent-bit --config=/etc/fluent-bit/fluent-bit.conf >> /var/log/fluent-bit.log 2>&1" &>/dev/null & echo ok' >/dev/null 2>&1 || true
|
docker exec "$cname" bash -lc 'pkill -x fluent-bit >/dev/null 2>&1 || true; sleep 1; setsid su -s /bin/bash fluent-bit -c "/opt/fluent-bit/bin/fluent-bit --config=/etc/fluent-bit/fluent-bit.conf >> /var/log/fluent-bit.log 2>&1" &>/dev/null & echo ok' >/dev/null 2>&1 || true
|
||||||
@@ -136,9 +146,9 @@ info "initial ES counts: train=${train0} infer=${infer0} total=${base}"
|
|||||||
send_logs() {
|
send_logs() {
|
||||||
local cname="$1"; local hosttag="$2"
|
local cname="$1"; local hosttag="$2"
|
||||||
docker exec "$cname" sh -lc 'mkdir -p /logs/train /logs/infer'
|
docker exec "$cname" sh -lc 'mkdir -p /logs/train /logs/infer'
|
||||||
docker exec "$cname" sh -lc "ts=\$(date '+%F %T'); echo \"\$ts INFO [$hosttag] training step=1 loss=1.23 model=bert\" >> /logs/train/train-demo.log"
|
docker exec "$cname" sh -lc "ts=\$(date -u +%Y-%m-%dT%H:%M:%SZ); echo \"\$ts INFO [$hosttag] training step=1 loss=1.23 model=bert\" >> /logs/train/train-demo.log"
|
||||||
docker exec "$cname" sh -lc "ts=\$(date '+%F %T'); echo \"\$ts INFO [$hosttag] training step=2 loss=1.10 model=bert\" >> /logs/train/train-demo.log"
|
docker exec "$cname" sh -lc "ts=\$(date -u +%Y-%m-%dT%H:%M:%SZ); echo \"\$ts INFO [$hosttag] training step=2 loss=1.10 model=bert\" >> /logs/train/train-demo.log"
|
||||||
docker exec "$cname" sh -lc "ts=\$(date '+%F %T'); echo \"\$ts WARN [$hosttag] inference slow on batch=2 latency=1.9s\" >> /logs/infer/infer-demo.log"
|
docker exec "$cname" sh -lc "ts=\$(date -u +%Y-%m-%dT%H:%M:%SZ); echo \"\$ts WARN [$hosttag] inference slow on batch=2 latency=1.9s\" >> /logs/infer/infer-demo.log"
|
||||||
}
|
}
|
||||||
|
|
||||||
NODE_CONT="${SWARM_NODE_CNAME:-argus-metric-test-node-swarm}"
|
NODE_CONT="${SWARM_NODE_CNAME:-argus-metric-test-node-swarm}"
|
||||||
|
|||||||
@@ -1 +1 @@
|
|||||||
{"status":"success","data":{"activeTargets":[{"discoveredLabels":{"__address__":"10.0.1.13:9400","__meta_filepath":"/private/argus/metric/prometheus/targets/dcgm_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.13","job":"dcgm","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.13","job":"dcgm","node_id":"A1","user_id":"yuyr"},"scrapePool":"dcgm","scrapeUrl":"http://10.0.1.13:9400/metrics","globalUrl":"http://10.0.1.13:9400/metrics","lastError":"","lastScrape":"2025-11-06T16:36:25.585236213+08:00","lastScrapeDuration":0.002520163,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"},{"discoveredLabels":{"__address__":"10.0.1.13:9100","__meta_filepath":"/private/argus/metric/prometheus/targets/node_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.13","job":"node","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.13","job":"node","node_id":"A1","user_id":"yuyr"},"scrapePool":"node","scrapeUrl":"http://10.0.1.13:9100/metrics","globalUrl":"http://10.0.1.13:9100/metrics","lastError":"","lastScrape":"2025-11-06T16:36:33.694723606+08:00","lastScrapeDuration":0.021800606,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"}],"droppedTargets":[],"droppedTargetCounts":{"dcgm":0,"node":0}}}
|
{"status":"success","data":{"activeTargets":[{"discoveredLabels":{"__address__":"10.0.1.13:9400","__meta_filepath":"/private/argus/metric/prometheus/targets/dcgm_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.13","job":"dcgm","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.13","job":"dcgm","node_id":"A1","user_id":"yuyr"},"scrapePool":"dcgm","scrapeUrl":"http://10.0.1.13:9400/metrics","globalUrl":"http://10.0.1.13:9400/metrics","lastError":"","lastScrape":"2025-11-06T17:13:58.44079249+08:00","lastScrapeDuration":0.001229132,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"},{"discoveredLabels":{"__address__":"10.0.1.13:9100","__meta_filepath":"/private/argus/metric/prometheus/targets/node_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.13","job":"node","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.13","job":"node","node_id":"A1","user_id":"yuyr"},"scrapePool":"node","scrapeUrl":"http://10.0.1.13:9100/metrics","globalUrl":"http://10.0.1.13:9100/metrics","lastError":"","lastScrape":"2025-11-06T17:13:54.277705211+08:00","lastScrapeDuration":0.024348657,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"}],"droppedTargets":[],"droppedTargetCounts":{"dcgm":0,"node":0}}}
|
||||||
@@ -32,12 +32,9 @@ echo "[INFO] initial counts: train=${train0} infer=${infer0} total=${base}"
|
|||||||
send_logs() {
|
send_logs() {
|
||||||
local cname="$1"; local hosttag="$2"
|
local cname="$1"; local hosttag="$2"
|
||||||
docker exec "$cname" sh -lc 'mkdir -p /logs/train /logs/infer'
|
docker exec "$cname" sh -lc 'mkdir -p /logs/train /logs/infer'
|
||||||
docker exec "$cname" sh -lc "ts=\
|
docker exec "$cname" sh -lc "ts=\$(date -u +%Y-%m-%dT%H:%M:%SZ); echo \"\$ts INFO [$hosttag] training step=1 loss=1.23 model=bert\" >> /logs/train/train-demo.log"
|
||||||
\$(date '+%F %T'); echo \"\$ts INFO [$hosttag] training step=1 loss=1.23 model=bert\" >> /logs/train/train-demo.log"
|
docker exec "$cname" sh -lc "ts=\$(date -u +%Y-%m-%dT%H:%M:%SZ); echo \"\$ts INFO [$hosttag] training step=2 loss=1.10 model=bert\" >> /logs/train/train-demo.log"
|
||||||
docker exec "$cname" sh -lc "ts=\
|
docker exec "$cname" sh -lc "ts=\$(date -u +%Y-%m-%dT%H:%M:%SZ); echo \"\$ts WARN [$hosttag] inference slow on batch=2 latency=1.9s\" >> /logs/infer/infer-demo.log"
|
||||||
\$(date '+%F %T'); echo \"\$ts INFO [$hosttag] training step=2 loss=1.10 model=bert\" >> /logs/train/train-demo.log"
|
|
||||||
docker exec "$cname" sh -lc "ts=\
|
|
||||||
\$(date '+%F %T'); echo \"\$ts WARN [$hosttag] inference slow on batch=2 latency=1.9s\" >> /logs/infer/infer-demo.log"
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# Determine container names
|
# Determine container names
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user