Some checks failed
CI / lint (pull_request) Failing after 44s
CI / test (3.10) (pull_request) Failing after 30s
CI / test (3.11) (pull_request) Failing after 33s
CI / test (3.12) (pull_request) Successful in 1m26s
CI / migrations (pull_request) Successful in 24s
CI / helm (pull_request) Successful in 20s
CI / docker (pull_request) Has been skipped
Frontend wiring:
- Wire useMarkdownWorker into Markdown component (worker-first, sync fallback)
- Wire useIndexedDB as primary storage in useChatHistory (500 msg cap, localStorage fallback)

Backend depth:
- Persistent audit store (SQLite, thread-safe, WAL mode) with record/query/filter
- Wire audit store into session routes (session.create, prompt.submit events)
- Wire audit store into audit export routes (persistent-first, telemetry fallback)
- CSRF double-submit cookie pattern (token generation, cookie set, header validation)

Production:
- Helm chart CI: helm lint + helm template validation
- Database migration CI: verify step in pipeline
- Prometheus alerting rules (error rate, latency, pod restarts, memory, CPU, queue, health)
- Rate limiting per API key (3x IP limit, sliding window, advisory)
- WebSocket SSE fallback (auto-downgrade after MAX_RETRIES WS failures)

Tests: 605 Python + 56 frontend = 661 total, 0 ruff errors

Co-Authored-By: Nakamoto, S <defi@defi-oracle.io>
97 lines
3.2 KiB
YAML
97 lines
3.2 KiB
YAML
{{- /*
PrometheusRule for FusionAGI alerting.

Rendered only when .Values.monitoring.enabled is true. Defines one rule group
(fusionagi.rules) with alerts for error rate, p95 latency, pod restarts,
memory usage, CPU throttling, task-queue backlog and health status.

Alert annotations use {{ "{{ ... }}" }} so that Helm emits the inner
{{ ... }} verbatim and Prometheus, not Helm, expands $value at alert time.
*/ -}}
{{- if .Values.monitoring.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ include "fusionagi.fullname" . }}-alerts
  labels:
    {{- include "fusionagi.labels" . | nindent 4 }}
    # NOTE(review): presumably this must match the ruleSelector of the
    # target Prometheus resource - confirm against the cluster's setup.
    prometheus: kube-prometheus
spec:
  groups:
    - name: fusionagi.rules
      rules:
        # High error rate: share of 5xx responses over all requests (5m rate).
        - alert: FusionAGIHighErrorRate
          expr: |
            sum(rate(fusionagi_requests_total{status=~"5.."}[5m]))
            / sum(rate(fusionagi_requests_total[5m])) > 0.05
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "FusionAGI error rate above 5%"
            description: "Error rate is {{ "{{ $value | humanizePercentage }}" }} over the last 5 minutes."

        # High latency: p95 of the request-duration histogram, in seconds.
        - alert: FusionAGIHighLatency
          expr: |
            histogram_quantile(0.95,
              sum(rate(fusionagi_request_duration_seconds_bucket[5m])) by (le)
            ) > 10
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "FusionAGI p95 latency above 10s"
            description: "95th percentile latency is {{ "{{ $value }}s" }}."

        # Pod restarts: more than 3 container restarts within one hour.
        # NOTE(review): assumes the container is named after the chart's
        # fullname helper - verify against the Deployment template.
        - alert: FusionAGIPodRestarting
          expr: |
            increase(kube_pod_container_status_restarts_total{
              container="{{ include "fusionagi.fullname" . }}"
            }[1h]) > 3
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "FusionAGI pod restarting frequently"
            description: "Pod has restarted {{ "{{ $value }}" }} times in the last hour."

        # High memory usage: usage as a fraction of the configured limit.
        # The denominator is filtered with "> 0" because a container without
        # a memory limit reports a limit of 0; dividing by it yields +Inf,
        # which would make this alert fire permanently.
        - alert: FusionAGIHighMemory
          expr: |
            container_memory_usage_bytes{
              container="{{ include "fusionagi.fullname" . }}"
            }
            / (
              container_spec_memory_limit_bytes{
                container="{{ include "fusionagi.fullname" . }}"
              } > 0
            ) > 0.85
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "FusionAGI memory usage above 85%"
            description: "Memory usage is {{ "{{ $value | humanizePercentage }}" }}."

        # CPU throttling: seconds spent throttled per second (5m rate).
        - alert: FusionAGICPUThrottled
          expr: |
            rate(container_cpu_cfs_throttled_seconds_total{
              container="{{ include "fusionagi.fullname" . }}"
            }[5m]) > 0.5
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "FusionAGI CPU throttled"
            description: "CPU throttling rate is {{ "{{ $value }}s/s" }}."

        # Queue depth (if task queue is instrumented)
        - alert: FusionAGIQueueBacklog
          expr: fusionagi_task_queue_depth > 50
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "FusionAGI task queue backlog"
            description: "Queue depth is {{ "{{ $value }}" }}."

        # Health check failures: gauge value 0 sustained for 2 minutes.
        - alert: FusionAGIUnhealthy
          expr: fusionagi_health_status == 0
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "FusionAGI health check failing"
            description: "Health endpoint returning unhealthy for 2+ minutes."
{{- end }}