# - Add unit tests for all core services (identity, intake, finance, dataroom)
# - Create integration test framework with shared setup utilities
# - Add E2E test suite for complete user workflows
# - Add test utilities package (server factory)
# - Configure Prometheus alert rules (service health, infrastructure, database, Azure)
# - Add alert rules ConfigMap for Kubernetes
# - Update Prometheus deployment with alert rules
# - Fix tsconfig.json to include test files
# - Add tests/tsconfig.json for integration/E2E tests
# - Fix server-factory.ts linting issues
# 87 lines, 2.9 KiB, YAML
---
# ConfigMap holding the Prometheus alerting rules for The Order services.
# The complete rule file is stored as a literal block under `alert-rules.yml`.
apiVersion: v1
kind: ConfigMap
metadata:
  name: alert-rules
  namespace: the-order
data:
  alert-rules.yml: |
    # Prometheus Alert Rules
    # Defines alerting conditions for The Order services

    groups:
      # Availability, error-rate and latency alerts for the HTTP services.
      - name: service_health
        interval: 30s
        rules:
          # Fires when a scrape target of one of the listed jobs reports up == 0
          # continuously for 5 minutes.
          - alert: ServiceDown
            expr: up{job=~"identity-service|intake-service|finance-service|dataroom-service|legal-documents-service"} == 0
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Service {{ $labels.job }} is down"
              description: "Service {{ $labels.job }} has been down for more than 5 minutes"

          # The threshold is an absolute per-series rate (0.05 5xx responses per
          # second), not a share of total traffic.
          - alert: HighErrorRate
            expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.05
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "High error rate for {{ $labels.job }}"
              description: "Error rate is {{ $value }} errors per second"

          # 95th percentile of the request-duration histogram above 2 seconds
          # for 10 minutes.
          - alert: HighResponseTime
            expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
            for: 10m
            labels:
              severity: warning
            annotations:
              summary: "High response time for {{ $labels.job }}"
              description: "95th percentile response time is {{ $value }} seconds"

      # Process-level resource alerts.
      - name: infrastructure
        interval: 30s
        rules:
          # rate() over a CPU-seconds counter yields CPU-seconds per second,
          # i.e. a fraction of one core (0.8 = 80%). The value is therefore
          # rendered with humanizePercentage instead of appending a bare "%".
          - alert: HighCPUUsage
            expr: rate(process_cpu_user_seconds_total[5m]) > 0.8
            for: 10m
            labels:
              severity: warning
            annotations:
              summary: "High CPU usage for {{ $labels.job }}"
              description: "CPU usage is {{ $value | humanizePercentage }}"

          # NOTE(review): resident / virtual memory is not a memory-pressure
          # signal; virtual size is address space, not a limit, so this ratio
          # is unlikely to approach 0.9. Consider dividing by the container
          # memory limit instead - TODO confirm which limit metric is scraped.
          - alert: HighMemoryUsage
            expr: (process_resident_memory_bytes / process_virtual_memory_bytes) > 0.9
            for: 10m
            labels:
              severity: warning
            annotations:
              summary: "High memory usage for {{ $labels.job }}"
              description: "Memory usage is {{ $value | humanizePercentage }}"

      # PostgreSQL alerts.
      - name: database
        interval: 30s
        rules:
          # pg_stat_database_numbackends is reported per database, while
          # pg_settings_max_connections is a single series per server
          # (postgres_exporter). Backends are summed per instance and the two
          # sides are matched on `instance` only; a plain division finds no
          # identical label sets and returns nothing.
          - alert: DatabaseConnectionPoolExhausted
            expr: sum by (instance) (pg_stat_database_numbackends) / on (instance) pg_settings_max_connections > 0.8
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Database connection pool nearly exhausted"
              description: "{{ $value | humanizePercentage }} of connections in use"

      # Azure / Entra alerts.
      - name: azure
        interval: 30s
        rules:
          # Any HTTP 429 response counted on entra_api_requests_total within
          # the last 5 minutes, sustained for 1 minute.
          - alert: EntraAPIRateLimit
            expr: rate(entra_api_requests_total{status="429"}[5m]) > 0
            for: 1m
            labels:
              severity: warning
            annotations:
              summary: "Entra API rate limit hit"
              description: "Rate limit errors detected for Entra VerifiedID API"