---
# Prometheus alerting rules for per-region health.
#
# Every aggregation below groups `by (region)` so that the resulting series
# keep the `region` label the annotations interpolate via
# `{{ $labels.region }}`. Without the grouping, avg()/count()/sum() drop all
# labels and the region renders as an empty string.
#
# NOTE(review): the kube-state-metrics series are assumed to carry a `region`
# label (e.g. added by relabeling or federation) - confirm. If they do not,
# each aggregation collapses to a single group with an empty region.
groups:
  - name: region_health
    rules:
      # Fires once per scrape target that has a non-empty `region` label and
      # has reported `up == 0` continuously for 5 minutes.
      - alert: RegionUnhealthy
        expr: up{region=~".+"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Region {{ $labels.region }} is unhealthy"
          description: "Region {{ $labels.region }} has been unhealthy for more than 5 minutes"

      # Mean of region_latency_seconds within each region above 1 second for
      # 10 minutes. `$value` in the description is that per-region mean.
      - alert: RegionHighLatency
        expr: avg by (region) (region_latency_seconds) > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High latency in region {{ $labels.region }}"
          description: "Average latency in region {{ $labels.region }} is {{ $value }}s"

      # kube_node_status_condition is 1 when the node currently has the given
      # condition/status and 0 otherwise, so the ready-node count is the SUM
      # of the values, not the COUNT of series. sum() also yields 0 (instead
      # of an empty result) when no node in the region is ready, so the alert
      # still fires in a total outage.
      - alert: RegionNodeFailure
        expr: >-
          sum by (region)
          (kube_node_status_condition{condition="Ready",status="true"}) < 3
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Multiple node failures in region {{ $labels.region }}"
          description: "Less than 3 nodes are ready in region {{ $labels.region }}"

      # kube_pod_status_phase exposes one 0/1 series per pod per phase, so
      # summing phase="Running" gives the running pods and summing across all
      # phases gives the total pods. Counting series instead would compare N
      # against 0.8 * (N * number_of_phases) and be true almost always.
      - alert: RegionPodFailure
        expr: >-
          sum by (region) (kube_pod_status_phase{phase="Running"})
          < 0.8 * sum by (region) (kube_pod_status_phase)
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High pod failure rate in region {{ $labels.region }}"
          description: "Less than 80% of pods are running in region {{ $labels.region }}"