Files
FusionAGI/k8s/templates/prometheus-rules.yaml
Devin AI 01b3f27b0f
Some checks failed
CI / lint (pull_request) Failing after 44s
CI / test (3.10) (pull_request) Failing after 30s
CI / test (3.11) (pull_request) Failing after 33s
CI / test (3.12) (pull_request) Successful in 1m26s
CI / migrations (pull_request) Successful in 24s
CI / helm (pull_request) Successful in 20s
CI / docker (pull_request) Has been skipped
feat: complete all 15 next recommendations
Frontend wiring:
- Wire useMarkdownWorker into Markdown component (worker-first, sync fallback)
- Wire useIndexedDB as primary storage in useChatHistory (500 msg cap, localStorage fallback)

Backend depth:
- Persistent audit store (SQLite, thread-safe, WAL mode) with record/query/filter
- Wire audit store into session routes (session.create, prompt.submit events)
- Wire audit store into audit export routes (persistent-first, telemetry fallback)
- CSRF double-submit cookie pattern (token generation, cookie set, header validation)

Production:
- Helm chart CI: helm lint + helm template validation
- Database migration CI: verify step in pipeline
- Prometheus alerting rules (error rate, latency, pod restarts, memory, CPU, queue, health)
- Rate limiting per API key (3x IP limit, sliding window, advisory)
- WebSocket SSE fallback (auto-downgrade after MAX_RETRIES WS failures)

Tests: 605 Python + 56 frontend = 661 total, 0 ruff errors
Co-Authored-By: Nakamoto, S <defi@defi-oracle.io>
2026-05-02 04:57:52 +00:00

97 lines
3.2 KiB
YAML

{{- /*
PrometheusRule (prometheus-operator CRD) holding the FusionAGI alerting rules.

Rendered only when .Values.monitoring.enabled is truthy. Alertmanager-side
templating ($value, humanizePercentage) is wrapped in a Helm string literal,
{{ "{{ ... }}" }}, so that Helm emits it verbatim instead of evaluating it.

NOTE(review): the container-scoped rules select on
container="<fusionagi.fullname>"; this assumes the workload names its container
after the fullname helper - confirm against the Deployment template.
NOTE(review): none of the selectors are scoped by namespace, so two releases
with the same fullname in different namespaces would match each other's
series - consider adding a namespace matcher once the label set is confirmed.
*/ -}}
{{- if .Values.monitoring.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ include "fusionagi.fullname" . }}-alerts
  labels:
    {{- include "fusionagi.labels" . | nindent 4 }}
    # Presumably matched by the Prometheus instance's ruleSelector - confirm.
    prometheus: kube-prometheus
spec:
  groups:
    - name: fusionagi.rules
      rules:
        # High error rate: share of requests with a 5xx status over the last
        # 5 minutes exceeds 5%, sustained for 5 minutes.
        - alert: FusionAGIHighErrorRate
          expr: |
            sum(rate(fusionagi_requests_total{status=~"5.."}[5m]))
            / sum(rate(fusionagi_requests_total[5m])) > 0.05
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "FusionAGI error rate above 5%"
            description: "Error rate is {{ "{{ $value | humanizePercentage }}" }} over the last 5 minutes."

        # High latency: p95 of the request-duration histogram, aggregated
        # across all series by bucket boundary (le), exceeds 10 seconds.
        - alert: FusionAGIHighLatency
          expr: |
            histogram_quantile(0.95,
              sum(rate(fusionagi_request_duration_seconds_bucket[5m])) by (le)
            ) > 10
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "FusionAGI p95 latency above 10s"
            description: "95th percentile latency is {{ "{{ $value }}s" }}."

        # Pod restarts: more than 3 container restarts within the last hour.
        - alert: FusionAGIPodRestarting
          expr: |
            increase(kube_pod_container_status_restarts_total{
              container="{{ include "fusionagi.fullname" . }}"
            }[1h]) > 3
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "FusionAGI pod restarting frequently"
            description: "Pod has restarted {{ "{{ $value }}" }} times in the last hour."

        # High memory usage: working set above 85% of the container limit.
        # - The working set is used instead of container_memory_usage_bytes
        #   because the latter includes reclaimable page cache and overstates
        #   real memory pressure.
        # - The "> 0" filter drops containers that have no memory limit; their
        #   limit is reported as 0, which would turn the ratio into +Inf and
        #   make the alert fire permanently.
        - alert: FusionAGIHighMemory
          expr: |
            (
              container_memory_working_set_bytes{
                container="{{ include "fusionagi.fullname" . }}"
              }
              /
              (container_spec_memory_limit_bytes{
                container="{{ include "fusionagi.fullname" . }}"
              } > 0)
            ) > 0.85
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "FusionAGI memory usage above 85%"
            description: "Memory usage is {{ "{{ $value | humanizePercentage }}" }}."

        # CPU throttling: the container spends more than 0.5 seconds per
        # second throttled by the CFS quota, averaged over 5 minutes.
        - alert: FusionAGICPUThrottled
          expr: |
            rate(container_cpu_cfs_throttled_seconds_total{
              container="{{ include "fusionagi.fullname" . }}"
            }[5m]) > 0.5
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "FusionAGI CPU throttled"
            description: "CPU throttling rate is {{ "{{ $value }}s/s" }}."

        # Queue depth (if task queue is instrumented): more than 50 tasks
        # waiting for 5 minutes. Produces no result when the metric is absent.
        - alert: FusionAGIQueueBacklog
          expr: fusionagi_task_queue_depth > 50
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "FusionAGI task queue backlog"
            description: "Queue depth is {{ "{{ $value }}" }}."

        # Health check failures: the health gauge has reported 0 for 2 minutes.
        - alert: FusionAGIUnhealthy
          expr: fusionagi_health_status == 0
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "FusionAGI health check failing"
            description: "Health endpoint returning unhealthy for 2+ minutes."
{{- end }}