# Alerting rule group covering target availability plus CPU, memory and
# filesystem usage thresholds. Annotation texts are user-facing and are
# rendered from the label set of each firing alert.
groups:
  - name: example-rules
    interval: 30s  # evaluate this group every 30 seconds
    rules:
      # Fires when a target's `up` series has been 0 for 1 minute.
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "实例 {{ $labels.instance }} 已宕机"
          description: "{{ $labels.instance }} 在 {{ $labels.job }} 中无响应超过 1 分钟。"

      # Fires when the non-idle CPU percentage, averaged per instance,
      # stays above 80 for 5 minutes.
      # rate() is used instead of irate(): irate() only considers the last
      # two samples in the window, so a brief dip can reset the `for` timer.
      - alert: HighCpuUsage
        expr: >-
          100 - (avg by (instance)
          (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "CPU 使用率过高"
          description: "实例 {{ $labels.instance }} CPU 使用率超过 80% 持续 5 分钟。"

      # Fires when (MemTotal - MemAvailable) / MemTotal exceeds 80% for
      # 5 minutes.
      - alert: HighMemoryUsage
        expr: >-
          (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
          / node_memory_MemTotal_bytes * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "内存使用率过高"
          description: "实例 {{ $labels.instance }} 内存使用率超过 80% 持续 5 分钟。"

      # Fires when used space on a filesystem (tmpfs and overlay excluded)
      # exceeds 90% of its size for 10 minutes.
      # NOTE(review): node_filesystem_free_bytes presumably includes blocks
      # reserved for root; consider node_filesystem_avail_bytes if the alert
      # should reflect space usable by unprivileged processes - confirm.
      - alert: DiskSpaceLow
        expr: >-
          (node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}
          - node_filesystem_free_bytes{fstype!~"tmpfs|overlay"})
          / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"} * 100 > 90
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "磁盘空间不足"
          description: "实例 {{ $labels.instance }} 磁盘空间不足超过 90% 持续 10 分钟。"