74 lines
2.3 KiB
YAML
74 lines
2.3 KiB
YAML
# Prometheus alerting rules for SolaceNet
|
|
|
|
groups:
# Alerts on capability state, kill-switch activations and policy-decision
# latency. Rules in this group are evaluated every 30 seconds.
- name: solacenet_capabilities
  interval: 30s
  rules:
  # Fires once a series with state="disabled" has reported a value above 0
  # continuously for five minutes.
  - alert: CapabilityDisabled
    expr: solacenet_capability_state{state="disabled"} > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Capability {{ $labels.capability_id }} is disabled'
      description: 'Capability {{ $labels.capability_id }} has been disabled for {{ $labels.tenant_id }}'

  # No `for:` clause: fires on the first evaluation that sees the activation
  # counter increase within the trailing five-minute window.
  - alert: KillSwitchActivated
    expr: increase(solacenet_kill_switch_activations_total[5m]) > 0
    labels:
      severity: critical
    annotations:
      summary: 'Kill switch activated for {{ $labels.capability_id }}'
      description: 'Emergency kill switch was activated for capability {{ $labels.capability_id }}'

  # Fires when the 95th-percentile decision latency stays above 1 second for
  # five minutes. The bucket series are wrapped in rate() so the quantile is
  # computed over the last five minutes of observations; applied to the raw
  # cumulative buckets it would cover every observation since process start
  # and barely react to recent changes.
  # NOTE(review): assumes a classic Prometheus histogram; the quantile is
  # computed per label set. Wrap in `sum by (le) (...)` if a single
  # fleet-wide value is wanted.
  - alert: HighPolicyDecisionLatency
    expr: histogram_quantile(0.95, rate(solacenet_policy_decision_duration_seconds_bucket[5m])) > 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High policy decision latency'
      description: '95th percentile policy decision latency is {{ $value }}s'

# Risk-scoring alerts. Rules in this group are evaluated every 30 seconds.
- name: solacenet_risk
  interval: 30s
  rules:
  # Fires when a risk-score series stays above 80 for two minutes.
  # NOTE(review): the description reads a `transaction_id` label; confirm the
  # metric actually exposes it.
  - alert: HighRiskScore
    expr: 'solacenet_risk_score > 80'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: 'High risk score detected'
      description: 'Risk score of {{ $value }} detected for transaction {{ $labels.transaction_id }}'

  # Fires when `up` for job "risk-engine" has been 0 for one minute.
  - alert: RiskEngineDown
    expr: 'up{job="risk-engine"} == 0'
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: 'Risk engine is down'
      description: 'Risk rules engine is not responding'

# Availability alerts for supporting services. Rules in this group are
# evaluated every 30 seconds.
- name: solacenet_infrastructure
  interval: 30s
  rules:
  # Fires when `up` for job "redis" has been 0 for one minute.
  - alert: RedisDown
    expr: 'up{job="redis"} == 0'
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: 'Redis is down'
      description: 'Redis cache is not available, policy decisions will not be cached'

  # Fires when `up` for job "solacenet-gateway" has been 0 for one minute.
  - alert: GatewayDown
    expr: 'up{job="solacenet-gateway"} == 0'
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: 'SolaceNet Gateway is down'
      description: 'The SolaceNet API Gateway is not responding'