diff --git a/src/log/fluent-bit/build/etc/parsers.conf b/src/log/fluent-bit/build/etc/parsers.conf index 32f5571..8f6ca24 100644 --- a/src/log/fluent-bit/build/etc/parsers.conf +++ b/src/log/fluent-bit/build/etc/parsers.conf @@ -22,8 +22,7 @@ [PARSER] Name timestamp_parser Format regex - Regex ^(?<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s+(?<level>\w+)\s+(?<message>.*)$ + Regex ^(?<timestamp>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:Z|[+-]\d{2}:?\d{2}))\s+(?<level>\w+)\s+(?<message>.*)$ Time_Key timestamp - Time_Format %Y-%m-%d %H:%M:%S - Time_Offset +0800 + Time_Format %Y-%m-%dT%H:%M:%S%z Time_Keep On diff --git a/src/log/tests/scripts/03_send_test_host01.sh b/src/log/tests/scripts/03_send_test_host01.sh index 2fe11b8..6f3e926 100755 --- a/src/log/tests/scripts/03_send_test_host01.sh +++ b/src/log/tests/scripts/03_send_test_host01.sh @@ -28,11 +28,11 @@ fi docker exec "$container_name" mkdir -p /logs/train /logs/infer # 写入训练日志 (host01) -docker exec "$container_name" sh -c "printf '%s INFO [host01] training step=1 loss=1.23 model=bert\n' \"\$(date '+%F %T')\" >> /logs/train/train-demo.log" -docker exec "$container_name" sh -c "printf '%s INFO [host01] training step=2 loss=1.15 model=bert\n' \"\$(date '+%F %T')\" >> /logs/train/train-demo.log" +docker exec "$container_name" sh -c "printf '%s INFO [host01] training step=1 loss=1.23 model=bert\n' \"\$(date -u +%Y-%m-%dT%H:%M:%SZ)\" >> /logs/train/train-demo.log" +docker exec "$container_name" sh -c "printf '%s INFO [host01] training step=2 loss=1.15 model=bert\n' \"\$(date -u +%Y-%m-%dT%H:%M:%SZ)\" >> /logs/train/train-demo.log" # 写入推理日志 (host01) -docker exec "$container_name" sh -c "printf '%s ERROR [host01] inference failed on batch=1\n' \"\$(date '+%F %T')\" >> /logs/infer/infer-demo.log" +docker exec "$container_name" sh -c "printf '%s ERROR [host01] inference failed on batch=1\n' \"\$(date -u +%Y-%m-%dT%H:%M:%SZ)\" >> /logs/infer/infer-demo.log" docker exec "$container_name" sh -c "cat <<'STACK' >> /logs/infer/infer-demo.log Traceback (most recent call last): 
File \"inference.py\", line 15, in diff --git a/src/log/tests/scripts/03_send_test_host02.sh b/src/log/tests/scripts/03_send_test_host02.sh index d36ecf4..96aab03 100755 --- a/src/log/tests/scripts/03_send_test_host02.sh +++ b/src/log/tests/scripts/03_send_test_host02.sh @@ -28,13 +28,13 @@ fi docker exec "$container_name" mkdir -p /logs/train /logs/infer # 写入训练日志 (host02) -docker exec "$container_name" sh -c "printf '%s INFO [host02] training step=1 loss=1.45 model=gpt\n' \"\$(date '+%F %T')\" >> /logs/train/train-demo.log" -docker exec "$container_name" sh -c "printf '%s INFO [host02] training step=2 loss=1.38 model=gpt\n' \"\$(date '+%F %T')\" >> /logs/train/train-demo.log" -docker exec "$container_name" sh -c "printf '%s INFO [host02] training step=3 loss=1.32 model=gpt\n' \"\$(date '+%F %T')\" >> /logs/train/train-demo.log" +docker exec "$container_name" sh -c "printf '%s INFO [host02] training step=1 loss=1.45 model=gpt\n' \"\$(date -u +%Y-%m-%dT%H:%M:%SZ)\" >> /logs/train/train-demo.log" +docker exec "$container_name" sh -c "printf '%s INFO [host02] training step=2 loss=1.38 model=gpt\n' \"\$(date -u +%Y-%m-%dT%H:%M:%SZ)\" >> /logs/train/train-demo.log" +docker exec "$container_name" sh -c "printf '%s INFO [host02] training step=3 loss=1.32 model=gpt\n' \"\$(date -u +%Y-%m-%dT%H:%M:%SZ)\" >> /logs/train/train-demo.log" # 写入推理日志 (host02) -docker exec "$container_name" sh -c "printf '%s WARN [host02] inference slow on batch=5 latency=2.3s\n' \"\$(date '+%F %T')\" >> /logs/infer/infer-demo.log" -docker exec "$container_name" sh -c "printf '%s INFO [host02] inference completed batch=6 latency=0.8s\n' \"\$(date '+%F %T')\" >> /logs/infer/infer-demo.log" +docker exec "$container_name" sh -c "printf '%s WARN [host02] inference slow on batch=5 latency=2.3s\n' \"\$(date -u +%Y-%m-%dT%H:%M:%SZ)\" >> /logs/infer/infer-demo.log" +docker exec "$container_name" sh -c "printf '%s INFO [host02] inference completed batch=6 latency=0.8s\n' \"\$(date -u +%Y-%m-%dT%H:%M:%SZ)\" 
>> /logs/infer/infer-demo.log" echo "[OK] 已通过docker exec写入测试日志到 host02 容器内:" echo " - /logs/train/train-demo.log" diff --git a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/parsers.conf b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/parsers.conf index d86fa06..1fbcbe0 100644 --- a/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/parsers.conf +++ b/src/metric/client-plugins/all-in-one-full/plugins/fluent-bit/config/parsers.conf @@ -22,6 +22,6 @@ [PARSER] Name timestamp_parser Format regex - Regex ^(?<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s+(?<level>\w+)\s+(?<message>.*)$ + Regex ^(?<timestamp>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:Z|[+-]\d{2}:?\d{2}))\s+(?<level>\w+)\s+(?<message>.*)$ Time_Key timestamp - Time_Format %Y-%m-%d %H:%M:%S + Time_Format %Y-%m-%dT%H:%M:%S%z diff --git a/src/sys/build/node-bundle/bundle/.gitignore b/src/sys/build/node-bundle/bundle/.gitignore index 11e243e..11b12c6 100644 --- a/src/sys/build/node-bundle/bundle/.gitignore +++ b/src/sys/build/node-bundle/bundle/.gitignore @@ -1,2 +1 @@ - argus-metric_*.tar.gz diff --git a/src/sys/debug/scripts/07_logs_send_and_assert.sh b/src/sys/debug/scripts/07_logs_send_and_assert.sh index 775a886..fc7e3b2 100755 --- a/src/sys/debug/scripts/07_logs_send_and_assert.sh +++ b/src/sys/debug/scripts/07_logs_send_and_assert.sh @@ -26,12 +26,9 @@ service_id() { send_logs() { local sid="$1"; local hosttag="$2" docker exec "$sid" sh -lc 'mkdir -p /logs/train /logs/infer' - docker exec "$sid" sh -lc "ts=\ -\$(date '+%F %T'); echo \"\$ts INFO [$hosttag] training step=1 loss=1.23 model=bert\" >> /logs/train/train-demo.log" - docker exec "$sid" sh -lc "ts=\ -\$(date '+%F %T'); echo \"\$ts INFO [$hosttag] training step=2 loss=1.10 model=bert\" >> /logs/train/train-demo.log" - docker exec "$sid" sh -lc "ts=\ -\$(date '+%F %T'); echo \"\$ts WARN [$hosttag] inference slow on batch=2 latency=1.9s\" >> /logs/infer/infer-demo.log" + docker exec "$sid" sh -lc "ts=\$(date -u +%Y-%m-%dT%H:%M:%SZ); echo \"\$ts 
INFO [$hosttag] training step=1 loss=1.23 model=bert\" >> /logs/train/train-demo.log" + docker exec "$sid" sh -lc "ts=\$(date -u +%Y-%m-%dT%H:%M:%SZ); echo \"\$ts INFO [$hosttag] training step=2 loss=1.10 model=bert\" >> /logs/train/train-demo.log" + docker exec "$sid" sh -lc "ts=\$(date -u +%Y-%m-%dT%H:%M:%SZ); echo \"\$ts WARN [$hosttag] inference slow on batch=2 latency=1.9s\" >> /logs/infer/infer-demo.log" } CID_A="$(service_id node-a)" diff --git a/src/sys/swarm_tests/.gitignore b/src/sys/swarm_tests/.gitignore new file mode 100644 index 0000000..c333a3f --- /dev/null +++ b/src/sys/swarm_tests/.gitignore @@ -0,0 +1,2 @@ + +private-*/ diff --git a/src/sys/swarm_tests/scripts/04_metric_verify.sh b/src/sys/swarm_tests/scripts/04_metric_verify.sh index ce2a162..2bc4ac6 100755 --- a/src/sys/swarm_tests/scripts/04_metric_verify.sh +++ b/src/sys/swarm_tests/scripts/04_metric_verify.sh @@ -47,6 +47,16 @@ ensure_fluentbit() { if [ -f "$o" ] && (grep -q "$EH" "$o" || grep -q "$EP" "$o"); then sed -i "s|Host $EH|Host es.log.argus.com|g; s|Port $EP|Port 9200|g" "$o" fi + # ensure parser supports ISO8601 with timezone + p=/etc/fluent-bit/parsers.conf + if [ -f "$p" ]; then + if grep -q "Time_Format %Y-%m-%d %H:%M:%S" "$p"; then + sed -i "s|Time_Format %Y-%m-%d %H:%M:%S|Time_Format %Y-%m-%dT%H:%M:%S%z|" "$p" + fi + if grep -q "Regex ^(?<timestamp>\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2})\\s+" "$p"; then + sed -i "s|Regex ^(?<timestamp>\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2})\\s+|Regex ^(?<timestamp>\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}(?:Z|[+-]\\d{2}:?\\d{2}))\\s+|" "$p" + fi + fi ' >/dev/null 2>&1 || true # 3) restart fluent-bit (best-effort) and wait docker exec "$cname" bash -lc 'pkill -x fluent-bit >/dev/null 2>&1 || true; sleep 1; setsid su -s /bin/bash fluent-bit -c "/opt/fluent-bit/bin/fluent-bit --config=/etc/fluent-bit/fluent-bit.conf >> /var/log/fluent-bit.log 2>&1" &>/dev/null & echo ok' >/dev/null 2>&1 || true @@ -136,9 +146,9 @@ info "initial ES counts: train=${train0} infer=${infer0} 
total=${base}" send_logs() { local cname="$1"; local hosttag="$2" docker exec "$cname" sh -lc 'mkdir -p /logs/train /logs/infer' - docker exec "$cname" sh -lc "ts=\$(date '+%F %T'); echo \"\$ts INFO [$hosttag] training step=1 loss=1.23 model=bert\" >> /logs/train/train-demo.log" - docker exec "$cname" sh -lc "ts=\$(date '+%F %T'); echo \"\$ts INFO [$hosttag] training step=2 loss=1.10 model=bert\" >> /logs/train/train-demo.log" - docker exec "$cname" sh -lc "ts=\$(date '+%F %T'); echo \"\$ts WARN [$hosttag] inference slow on batch=2 latency=1.9s\" >> /logs/infer/infer-demo.log" + docker exec "$cname" sh -lc "ts=\$(date -u +%Y-%m-%dT%H:%M:%SZ); echo \"\$ts INFO [$hosttag] training step=1 loss=1.23 model=bert\" >> /logs/train/train-demo.log" + docker exec "$cname" sh -lc "ts=\$(date -u +%Y-%m-%dT%H:%M:%SZ); echo \"\$ts INFO [$hosttag] training step=2 loss=1.10 model=bert\" >> /logs/train/train-demo.log" + docker exec "$cname" sh -lc "ts=\$(date -u +%Y-%m-%dT%H:%M:%SZ); echo \"\$ts WARN [$hosttag] inference slow on batch=2 latency=1.9s\" >> /logs/infer/infer-demo.log" } NODE_CONT="${SWARM_NODE_CNAME:-argus-metric-test-node-swarm}" diff --git a/src/sys/swarm_tests/tmp/metric-verify/prom_targets.json b/src/sys/swarm_tests/tmp/metric-verify/prom_targets.json index 4adff0a..88b3bf2 100644 --- a/src/sys/swarm_tests/tmp/metric-verify/prom_targets.json +++ b/src/sys/swarm_tests/tmp/metric-verify/prom_targets.json @@ -1 +1 @@ 
-{"status":"success","data":{"activeTargets":[{"discoveredLabels":{"__address__":"10.0.1.13:9400","__meta_filepath":"/private/argus/metric/prometheus/targets/dcgm_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.13","job":"dcgm","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.13","job":"dcgm","node_id":"A1","user_id":"yuyr"},"scrapePool":"dcgm","scrapeUrl":"http://10.0.1.13:9400/metrics","globalUrl":"http://10.0.1.13:9400/metrics","lastError":"","lastScrape":"2025-11-06T16:36:25.585236213+08:00","lastScrapeDuration":0.002520163,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"},{"discoveredLabels":{"__address__":"10.0.1.13:9100","__meta_filepath":"/private/argus/metric/prometheus/targets/node_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.13","job":"node","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.13","job":"node","node_id":"A1","user_id":"yuyr"},"scrapePool":"node","scrapeUrl":"http://10.0.1.13:9100/metrics","globalUrl":"http://10.0.1.13:9100/metrics","lastError":"","lastScrape":"2025-11-06T16:36:33.694723606+08:00","lastScrapeDuration":0.021800606,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"}],"droppedTargets":[],"droppedTargetCounts":{"dcgm":0,"node":0}}} \ No newline at end of file 
+{"status":"success","data":{"activeTargets":[{"discoveredLabels":{"__address__":"10.0.1.13:9400","__meta_filepath":"/private/argus/metric/prometheus/targets/dcgm_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.13","job":"dcgm","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"dcgm-exporter-A1","ip":"10.0.1.13","job":"dcgm","node_id":"A1","user_id":"yuyr"},"scrapePool":"dcgm","scrapeUrl":"http://10.0.1.13:9400/metrics","globalUrl":"http://10.0.1.13:9400/metrics","lastError":"","lastScrape":"2025-11-06T17:13:58.44079249+08:00","lastScrapeDuration":0.001229132,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"},{"discoveredLabels":{"__address__":"10.0.1.13:9100","__meta_filepath":"/private/argus/metric/prometheus/targets/node_exporter.json","__metrics_path__":"/metrics","__scheme__":"http","__scrape_interval__":"15s","__scrape_timeout__":"10s","hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.13","job":"node","node_id":"A1","user_id":"yuyr"},"labels":{"hostname":"swarm-metric-node-001","instance":"node-exporter-A1","ip":"10.0.1.13","job":"node","node_id":"A1","user_id":"yuyr"},"scrapePool":"node","scrapeUrl":"http://10.0.1.13:9100/metrics","globalUrl":"http://10.0.1.13:9100/metrics","lastError":"","lastScrape":"2025-11-06T17:13:54.277705211+08:00","lastScrapeDuration":0.024348657,"health":"up","scrapeInterval":"15s","scrapeTimeout":"10s"}],"droppedTargets":[],"droppedTargetCounts":{"dcgm":0,"node":0}}} \ No newline at end of file diff --git a/src/sys/tests/scripts/07_logs_send_and_assert.sh b/src/sys/tests/scripts/07_logs_send_and_assert.sh index 7c58319..d5e1886 100755 --- a/src/sys/tests/scripts/07_logs_send_and_assert.sh +++ b/src/sys/tests/scripts/07_logs_send_and_assert.sh @@ -32,12 +32,9 @@ echo "[INFO] initial counts: train=${train0} 
infer=${infer0} total=${base}" send_logs() { local cname="$1"; local hosttag="$2" docker exec "$cname" sh -lc 'mkdir -p /logs/train /logs/infer' - docker exec "$cname" sh -lc "ts=\ -\$(date '+%F %T'); echo \"\$ts INFO [$hosttag] training step=1 loss=1.23 model=bert\" >> /logs/train/train-demo.log" - docker exec "$cname" sh -lc "ts=\ -\$(date '+%F %T'); echo \"\$ts INFO [$hosttag] training step=2 loss=1.10 model=bert\" >> /logs/train/train-demo.log" - docker exec "$cname" sh -lc "ts=\ -\$(date '+%F %T'); echo \"\$ts WARN [$hosttag] inference slow on batch=2 latency=1.9s\" >> /logs/infer/infer-demo.log" + docker exec "$cname" sh -lc "ts=\$(date -u +%Y-%m-%dT%H:%M:%SZ); echo \"\$ts INFO [$hosttag] training step=1 loss=1.23 model=bert\" >> /logs/train/train-demo.log" + docker exec "$cname" sh -lc "ts=\$(date -u +%Y-%m-%dT%H:%M:%SZ); echo \"\$ts INFO [$hosttag] training step=2 loss=1.10 model=bert\" >> /logs/train/train-demo.log" + docker exec "$cname" sh -lc "ts=\$(date -u +%Y-%m-%dT%H:%M:%SZ); echo \"\$ts WARN [$hosttag] inference slow on batch=2 latency=1.9s\" >> /logs/infer/infer-demo.log" } # Determine container names