- Add comprehensive database migrations (001-024) for schema evolution
- Enhance API schema with expanded type definitions and resolvers
- Add new middleware: audit logging, rate limiting, MFA enforcement, security, tenant auth
- Implement new services: AI optimization, billing, blockchain, compliance, marketplace
- Add adapter layer for cloud integrations (Cloudflare, Kubernetes, Proxmox, storage)
- Update Crossplane provider with enhanced VM management capabilities
- Add comprehensive test suite for API endpoints and services
- Update frontend components with improved GraphQL subscriptions and real-time updates
- Enhance security configurations and headers (CSP, CORS, etc.)
- Update documentation and configuration files
- Add new CI/CD workflows and validation scripts
- Implement design system improvements and UI enhancements
208 lines
6.3 KiB
YAML
208 lines
6.3 KiB
YAML
---
# PrometheusRule (Prometheus Operator CRD) holding the alert rules for the
# sankofa stack. Rules are grouped by component: api, portal, database,
# keycloak, infrastructure and backups.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: sankofa-alerts
  namespace: monitoring
  labels:
    app: sankofa
    # NOTE(review): presumably these two labels are what the Prometheus
    # instance's ruleSelector matches on - confirm against the Prometheus CR.
    prometheus: kube-prometheus
    role: alert-rules
spec:
  groups:
    - name: api
      interval: 30s
      rules:
        # API High Error Rate
        # Ratio of 5xx request rate to total request rate over 5 minutes.
        - alert: APIHighErrorRate
          expr: |
            sum(rate(http_requests_total{job="api",status=~"5.."}[5m]))
            /
            sum(rate(http_requests_total{job="api"}[5m])) > 0.05
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "API error rate is above 5%"
            description: "API error rate is {{ $value | humanizePercentage }} for the last 5 minutes"

        # API High Latency
        # p95 of the request-duration histogram, aggregated over all series.
        - alert: APIHighLatency
          expr: |
            histogram_quantile(0.95,
              sum(rate(http_request_duration_seconds_bucket{job="api"}[5m])) by (le)
            ) > 0.5
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "API p95 latency is above 500ms"
            description: "API p95 latency is {{ $value }}s"

        # API Down
        # Fires per scrape target whose last scrape failed.
        - alert: APIDown
          expr: up{job="api"} == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "API is down"
            description: "API service has been down for more than 1 minute"

    - name: portal
      interval: 30s
      rules:
        # Portal High Error Rate
        # Same 5xx / total ratio as the API rule, scoped to job="portal".
        - alert: PortalHighErrorRate
          expr: |
            sum(rate(http_requests_total{job="portal",status=~"5.."}[5m]))
            /
            sum(rate(http_requests_total{job="portal"}[5m])) > 0.05
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Portal error rate is above 5%"
            description: "Portal error rate is {{ $value | humanizePercentage }}"

        # Portal Down
        - alert: PortalDown
          expr: up{job="portal"} == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "Portal is down"
            description: "Portal service has been down for more than 1 minute"

    - name: database
      interval: 30s
      rules:
        # Database High Connection Count
        # NOTE(review): the threshold of 80 is hard-coded; presumably it is
        # sized against the server's max_connections - confirm.
        - alert: DatabaseHighConnections
          expr: |
            pg_stat_database_numbackends{datname="sankofa"} > 80
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Database connection count is high"
            description: "Database has {{ $value }} active connections"

        # Database Slow Queries
        # Uses the age (in seconds) of the longest-running active transaction.
        # The previous expression counted active connections, which says
        # nothing about how long any query has been running.
        # The 300s threshold already encodes "more than 5 minutes", so `for`
        # only needs to debounce a single noisy sample.
        - alert: DatabaseSlowQueries
          expr: |
            pg_stat_activity_max_tx_duration{state="active"} > 300
          for: 1m
          labels:
            severity: warning
          annotations:
            summary: "Database has slow queries"
            description: "Longest active transaction on database {{ $labels.datname }} has been running for {{ $value | humanizeDuration }}"

        # Database Down
        - alert: DatabaseDown
          expr: pg_up == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "Database is down"
            description: "PostgreSQL database is not responding"

    - name: keycloak
      interval: 30s
      rules:
        # Keycloak Down
        - alert: KeycloakDown
          expr: up{job="keycloak"} == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "Keycloak is down"
            description: "Keycloak authentication service is down"

        # Keycloak High Authentication Failures
        # Per-second failure rate summed over all series, averaged over 5m.
        - alert: KeycloakHighAuthFailures
          expr: |
            sum(rate(keycloak_login_failures_total[5m])) > 10
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "High authentication failure rate"
            description: "Keycloak has {{ $value }} authentication failures per second"

    - name: infrastructure
      interval: 30s
      rules:
        # High CPU Usage
        # Per-container CPU usage (cores) divided by the container's CPU limit
        # (CFS quota / period, in cores). The previous expression compared a
        # cluster-wide average of cores used against 1, which is not a
        # utilisation ratio. Containers without a CPU limit expose no quota
        # series and are therefore not evaluated by this rule.
        - alert: HighCPUUsage
          expr: |
            sum by (namespace, pod, container) (
              rate(container_cpu_usage_seconds_total{container!="",container!="POD"}[5m])
            )
            /
            sum by (namespace, pod, container) (
              container_spec_cpu_quota{container!="",container!="POD"}
              /
              container_spec_cpu_period{container!="",container!="POD"}
            ) > 0.9
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "High CPU usage"
            description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} is at {{ $value | humanizePercentage }} of its CPU limit (above 90% for 10 minutes)"

        # High Memory Usage
        # Working set divided by the memory limit. Series whose limit is 0
        # (no limit configured) are filtered out first; dividing by them
        # yields +Inf and made the previous expression fire permanently.
        - alert: HighMemoryUsage
          expr: |
            container_memory_working_set_bytes{container!="",container!="POD"}
            /
            (container_spec_memory_limit_bytes{container!="",container!="POD"} > 0)
            > 0.9
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "High memory usage"
            description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} is at {{ $value | humanizePercentage }} of its memory limit (above 90% for 10 minutes)"

        # Pod CrashLooping
        # True while any container restart falls inside the 15m window.
        - alert: PodCrashLooping
          expr: |
            rate(kube_pod_container_status_restarts_total[15m]) > 0
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Pod is crash looping"
            description: "Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} is crash looping"

        # Disk Space Low
        # Fraction of bytes still available on the root filesystem.
        - alert: DiskSpaceLow
          expr: |
            (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.1
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Disk space is low"
            description: "Disk space is below 10% on {{ $labels.instance }}"

    - name: backups
      interval: 1h
      rules:
        # Backup Failed
        # Age of the last successful backup exceeds 24h (86400s).
        - alert: BackupFailed
          expr: |
            time() - backup_last_success_timestamp > 86400
          for: 1h
          labels:
            severity: critical
          annotations:
            summary: "Backup has not run in 24 hours"
            description: "Last successful backup was more than 24 hours ago"

        # Backup Too Old
        # Age of the last successful backup exceeds 48h (172800s).
        - alert: BackupTooOld
          expr: |
            time() - backup_last_success_timestamp > 172800
          for: 1h
          labels:
            severity: critical
          annotations:
            summary: "Backup is more than 48 hours old"
            description: "Last successful backup was {{ $value }} seconds ago"