---
# Prometheus Alerting Rules
#
# PrometheusRule manifest with three rule groups (kubernetes.rules,
# application.rules, infrastructure.rules). Every group sets a 30s
# evaluation interval; each rule carries a `severity` label plus
# `summary` / `description` annotations.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: shared-services-alerts
  namespace: monitoring
  labels:
    app: prometheus
spec:
  groups:
    # Pod-level health and resource-limit alerts.
    - name: kubernetes.rules
      interval: 30s
      rules:
        # Fires when a container's restart counter grew within the last 15
        # minutes and that stays true for 5 minutes. increase() is used instead
        # of rate() so that $value is a restart count, which is what the
        # description prints; the firing condition (> 0) is the same for both.
        - alert: PodCrashLooping
          expr: increase(kube_pod_container_status_restarts_total[15m]) > 0
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping"
            # increase() extrapolates and can return fractions, hence "%.0f".
            description: >-
              Pod {{ $labels.namespace }}/{{ $labels.pod }} has restarted
              {{ printf "%.0f" $value }} times in the last 15 minutes

        # Fires when a pod reports any phase other than Running or Succeeded
        # for 10 minutes.
        # NOTE(review): this checks the pod *phase*, not the Ready condition, so
        # a Running pod that fails its readiness probe does not match. Confirm
        # that is the intended meaning of "not ready".
        - alert: PodNotReady
          expr: sum by (namespace, pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}) > 0
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is not ready"
            description: >-
              Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a
              non-ready state for more than 10 minutes

        # Fires when a pod's summed container memory usage exceeds 90% of its
        # summed container memory limits for 5 minutes.
        #  - container!="" keeps only per-container series, so a pod-level
        #    cgroup total exported with an empty container label is not added
        #    on top of the containers it already includes.
        #  - kube_pod_container_resource_limits{resource="memory"} replaces the
        #    deprecated *_memory_bytes metric, which kube-state-metrics v2 no
        #    longer exports.
        # NOTE(review): container_memory_usage_bytes includes page cache;
        # consider container_memory_working_set_bytes if cache-heavy pods
        # produce false positives.
        - alert: HighMemoryUsage
          expr: |-
            (
              sum by (namespace, pod) (container_memory_usage_bytes{container!=""})
              /
              sum by (namespace, pod) (kube_pod_container_resource_limits{resource="memory"})
            ) > 0.9
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} memory usage is high"
            description: >-
              Pod {{ $labels.namespace }}/{{ $labels.pod }} is using
              {{ $value | humanizePercentage }} of its memory limit

        # Fires when a pod's summed 5m CPU usage rate exceeds 90% of its summed
        # container CPU limits for 5 minutes. Uses the same container!=""
        # filter and generic limits metric (resource="cpu") as HighMemoryUsage.
        - alert: HighCPUUsage
          expr: |-
            (
              sum by (namespace, pod) (rate(container_cpu_usage_seconds_total{container!=""}[5m]))
              /
              sum by (namespace, pod) (kube_pod_container_resource_limits{resource="cpu"})
            ) > 0.9
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} CPU usage is high"
            description: >-
              Pod {{ $labels.namespace }}/{{ $labels.pod }} is using
              {{ $value | humanizePercentage }} of its CPU limit

    # HTTP service-level alerts.
    - name: application.rules
      interval: 30s
      rules:
        # Fires when any single http_requests_total series with a 5xx status
        # exceeds 0.1 requests/second (5m rate) for 5 minutes. The expression
        # is not aggregated, so the threshold applies per series and the
        # series' own job/instance labels are available to the annotations.
        - alert: HighErrorRate
          expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.1
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "High error rate detected on {{ $labels.job }}"
            description: >-
              Error rate for {{ $labels.job }} on {{ $labels.instance }} is
              {{ $value }} errors per second

        # Fires when the 95th-percentile request duration, computed per series
        # from the 5m bucket rates, stays above 1 second for 10 minutes.
        - alert: HighLatency
          expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "High latency detected on {{ $labels.job }}"
            description: >-
              95th percentile latency for {{ $labels.job }} on
              {{ $labels.instance }} is {{ $value }} seconds

        # Fires for every scrape target whose `up` value has been 0 for 1
        # minute. The alert is per target, so the description names the
        # instance as well as the job.
        - alert: ServiceDown
          expr: up{job=~".+"} == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "Service {{ $labels.job }} is down"
            description: >-
              Instance {{ $labels.instance }} of service {{ $labels.job }} has
              been down for more than 1 minute

    # Node and filesystem alerts.
    - name: infrastructure.rules
      interval: 30s
      rules:
        # Fires when a node's Ready condition has not been "true" for 5 minutes.
        - alert: NodeNotReady
          expr: kube_node_status_condition{condition="Ready",status="true"} == 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Node {{ $labels.node }} is not ready"
            description: >-
              Node {{ $labels.node }} has been in a not-ready state for more
              than 5 minutes

        # Fires when a writable filesystem has less than 10% of its size
        # available for 5 minutes. The `and node_filesystem_readonly == 0`
        # clause drops read-only mounts, which can be permanently full by
        # design and cannot be cleaned up; $value stays the available ratio
        # from the left-hand side.
        - alert: DiskSpaceLow
          expr: |-
            (
              (node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.1
            )
            and
            (node_filesystem_readonly == 0)
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Disk space low on {{ $labels.instance }}"
            description: >-
              Disk {{ $labels.device }} ({{ $labels.mountpoint }}) on
              {{ $labels.instance }} has only
              {{ $value | humanizePercentage }} space available