feat: comprehensive project structure improvements and Cloud for Sovereignty landing zone
- Add Cloud for Sovereignty landing zone architecture and deployment - Implement complete legal document management system - Reorganize documentation with improved navigation - Add infrastructure improvements (Dockerfiles, K8s, monitoring) - Add operational improvements (graceful shutdown, rate limiting, caching) - Create comprehensive project structure documentation - Add Azure deployment automation scripts - Improve repository navigation and organization
This commit is contained in:
103
infra/monitoring/alert-rules.yml
Normal file
103
infra/monitoring/alert-rules.yml
Normal file
@@ -0,0 +1,103 @@
|
||||
groups:
|
||||
- name: service_health
|
||||
interval: 30s
|
||||
rules:
|
||||
- alert: ServiceDown
|
||||
expr: up{job=~".*-service"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Service {{ $labels.job }} is down"
|
||||
description: "Service {{ $labels.job }} has been down for more than 5 minutes"
|
||||
|
||||
- alert: HighErrorRate
|
||||
expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.05
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High error rate in {{ $labels.job }}"
|
||||
description: "Error rate is {{ $value }} errors per second"
|
||||
|
||||
- alert: HighResponseTime
|
||||
expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High response time in {{ $labels.job }}"
|
||||
description: "95th percentile response time is {{ $value }} seconds"
|
||||
|
||||
- name: resource_usage
|
||||
interval: 30s
|
||||
rules:
|
||||
- alert: HighCPUUsage
|
||||
expr: rate(container_cpu_usage_seconds_total[5m]) > 0.8
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High CPU usage in {{ $labels.pod }}"
|
||||
description: "CPU usage is {{ $value }}"
|
||||
|
||||
- alert: HighMemoryUsage
|
||||
expr: container_memory_usage_bytes / container_spec_memory_limit_bytes > 0.9
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High memory usage in {{ $labels.pod }}"
|
||||
description: "Memory usage is {{ $value }}%"
|
||||
|
||||
- alert: PodCrashLooping
|
||||
expr: rate(kube_pod_container_status_restarts_total[15m]) > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Pod {{ $labels.pod }} is crash looping"
|
||||
description: "Pod has restarted {{ $value }} times in the last 15 minutes"
|
||||
|
||||
- name: database
|
||||
interval: 30s
|
||||
rules:
|
||||
- alert: DatabaseConnectionHigh
|
||||
expr: pg_stat_database_numbackends / pg_stat_database_max_connections > 0.8
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High database connection usage"
|
||||
description: "{{ $value }}% of max connections in use"
|
||||
|
||||
- alert: DatabaseSlowQueries
|
||||
expr: rate(pg_stat_statements_mean_exec_time[5m]) > 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Slow database queries detected"
|
||||
description: "Average query time is {{ $value }} seconds"
|
||||
|
||||
- name: entra_verifiedid
|
||||
interval: 30s
|
||||
rules:
|
||||
- alert: EntraAPIFailure
|
||||
expr: rate(entra_api_errors_total[5m]) > 0.1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "High Entra VerifiedID API error rate"
|
||||
description: "Error rate is {{ $value }} errors per second"
|
||||
|
||||
- alert: EntraRateLimitApproaching
|
||||
expr: entra_rate_limit_remaining / entra_rate_limit_total < 0.1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Entra VerifiedID rate limit approaching"
|
||||
description: "Only {{ $value }}% of rate limit remaining"
|
||||
|
||||
85
infra/monitoring/grafana-dashboards/services-overview.json
Normal file
85
infra/monitoring/grafana-dashboards/services-overview.json
Normal file
@@ -0,0 +1,85 @@
|
||||
{
|
||||
"dashboard": {
|
||||
"title": "The Order Services Overview",
|
||||
"tags": ["the-order", "services", "overview"],
|
||||
"timezone": "browser",
|
||||
"schemaVersion": 27,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Request Rate",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total[5m])) by (job)",
|
||||
"legendFormat": "{{job}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Error Rate",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) by (job)",
|
||||
"legendFormat": "{{job}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Response Time (95th percentile)",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, job))",
|
||||
"legendFormat": "{{job}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "CPU Usage",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(container_cpu_usage_seconds_total[5m])) by (pod)",
|
||||
"legendFormat": "{{pod}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8}
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"title": "Memory Usage",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(container_memory_usage_bytes) by (pod) / sum(container_spec_memory_limit_bytes) by (pod) * 100",
|
||||
"legendFormat": "{{pod}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16}
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"title": "Active Connections",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(pg_stat_database_numbackends) by (datname)",
|
||||
"legendFormat": "{{datname}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 16}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
60
infra/monitoring/logging/fluentd-config.yaml
Normal file
60
infra/monitoring/logging/fluentd-config.yaml
Normal file
@@ -0,0 +1,60 @@
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: fluentd-config
|
||||
namespace: the-order
|
||||
data:
|
||||
fluent.conf: |
|
||||
<source>
|
||||
@type tail
|
||||
path /var/log/containers/*.log
|
||||
pos_file /var/log/fluentd-containers.log.pos
|
||||
tag kubernetes.*
|
||||
read_from_head true
|
||||
<parse>
|
||||
@type json
|
||||
time_key time
|
||||
time_format %Y-%m-%dT%H:%M:%S.%NZ
|
||||
keep_time_key true
|
||||
</parse>
|
||||
</source>
|
||||
|
||||
<filter kubernetes.**>
|
||||
@type kubernetes_metadata
|
||||
</filter>
|
||||
|
||||
<filter kubernetes.**>
|
||||
@type record_transformer
|
||||
<record>
|
||||
cluster_name the-order
|
||||
environment ${ENVIRONMENT:-production}
|
||||
</record>
|
||||
</filter>
|
||||
|
||||
<match kubernetes.**>
|
||||
@type opensearch
|
||||
host opensearch.logging.svc.cluster.local
|
||||
port 9200
|
||||
index_name the-order-logs
|
||||
type_name _doc
|
||||
logstash_format true
|
||||
logstash_prefix the-order
|
||||
logstash_dateformat %Y.%m.%d
|
||||
include_tag_key true
|
||||
tag_key @log_name
|
||||
flush_interval 10s
|
||||
<buffer>
|
||||
@type file
|
||||
path /var/log/fluentd-buffers/kubernetes.system.buffer
|
||||
flush_mode interval
|
||||
retry_type exponential_backoff
|
||||
flush_thread_count 2
|
||||
flush_interval 5s
|
||||
retry_max_interval 30
|
||||
retry_timeout 60m
|
||||
chunk_limit_size 2M
|
||||
queue_limit_length 8
|
||||
overflow_action block
|
||||
</buffer>
|
||||
</match>
|
||||
|
||||
15
infra/monitoring/logging/opensearch-config.yaml
Normal file
15
infra/monitoring/logging/opensearch-config.yaml
Normal file
@@ -0,0 +1,15 @@
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: opensearch-config
|
||||
namespace: the-order
|
||||
data:
|
||||
opensearch.yml: |
|
||||
cluster.name: the-order-logs
|
||||
node.name: opensearch-0
|
||||
network.host: 0.0.0.0
|
||||
discovery.type: single-node
|
||||
path.data: /usr/share/opensearch/data
|
||||
path.logs: /usr/share/opensearch/logs
|
||||
plugins.security.disabled: true
|
||||
|
||||
142
infra/monitoring/prometheus-config.yml
Normal file
142
infra/monitoring/prometheus-config.yml
Normal file
@@ -0,0 +1,142 @@
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
external_labels:
|
||||
cluster: 'the-order'
|
||||
environment: 'production'
|
||||
|
||||
scrape_configs:
|
||||
# Prometheus itself
|
||||
- job_name: 'prometheus'
|
||||
static_configs:
|
||||
- targets: ['localhost:9090']
|
||||
|
||||
# Intake Service
|
||||
- job_name: 'intake-service'
|
||||
kubernetes_sd_configs:
|
||||
- role: pod
|
||||
namespaces:
|
||||
names:
|
||||
- the-order
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_kubernetes_pod_label_app]
|
||||
action: keep
|
||||
regex: intake-service
|
||||
- source_labels: [__meta_kubernetes_pod_ip]
|
||||
action: replace
|
||||
target_label: __address__
|
||||
replacement: $1:4001
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_pod_label_(.+)
|
||||
metrics_path: '/metrics'
|
||||
|
||||
# Identity Service
|
||||
- job_name: 'identity-service'
|
||||
kubernetes_sd_configs:
|
||||
- role: pod
|
||||
namespaces:
|
||||
names:
|
||||
- the-order
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_kubernetes_pod_label_app]
|
||||
action: keep
|
||||
regex: identity-service
|
||||
- source_labels: [__meta_kubernetes_pod_ip]
|
||||
action: replace
|
||||
target_label: __address__
|
||||
replacement: $1:4002
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_pod_label_(.+)
|
||||
metrics_path: '/metrics'
|
||||
|
||||
# Finance Service
|
||||
- job_name: 'finance-service'
|
||||
kubernetes_sd_configs:
|
||||
- role: pod
|
||||
namespaces:
|
||||
names:
|
||||
- the-order
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_kubernetes_pod_label_app]
|
||||
action: keep
|
||||
regex: finance-service
|
||||
- source_labels: [__meta_kubernetes_pod_ip]
|
||||
action: replace
|
||||
target_label: __address__
|
||||
replacement: $1:4003
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_pod_label_(.+)
|
||||
metrics_path: '/metrics'
|
||||
|
||||
# Dataroom Service
|
||||
- job_name: 'dataroom-service'
|
||||
kubernetes_sd_configs:
|
||||
- role: pod
|
||||
namespaces:
|
||||
names:
|
||||
- the-order
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_kubernetes_pod_label_app]
|
||||
action: keep
|
||||
regex: dataroom-service
|
||||
- source_labels: [__meta_kubernetes_pod_ip]
|
||||
action: replace
|
||||
target_label: __address__
|
||||
replacement: $1:4004
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_pod_label_(.+)
|
||||
metrics_path: '/metrics'
|
||||
|
||||
# Legal Documents Service
|
||||
- job_name: 'legal-documents-service'
|
||||
kubernetes_sd_configs:
|
||||
- role: pod
|
||||
namespaces:
|
||||
names:
|
||||
- the-order
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_kubernetes_pod_label_app]
|
||||
action: keep
|
||||
regex: legal-documents-service
|
||||
- source_labels: [__meta_kubernetes_pod_ip]
|
||||
action: replace
|
||||
target_label: __address__
|
||||
replacement: $1:4005
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_pod_label_(.+)
|
||||
metrics_path: '/metrics'
|
||||
|
||||
# Kubernetes API
|
||||
- job_name: 'kubernetes-apiservers'
|
||||
kubernetes_sd_configs:
|
||||
- role: endpoints
|
||||
scheme: https
|
||||
tls_config:
|
||||
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
|
||||
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
|
||||
action: keep
|
||||
regex: default;kubernetes;https
|
||||
|
||||
# Kubernetes nodes
|
||||
- job_name: 'kubernetes-nodes'
|
||||
kubernetes_sd_configs:
|
||||
- role: node
|
||||
scheme: https
|
||||
tls_config:
|
||||
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
|
||||
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||
relabel_configs:
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_node_label_(.+)
|
||||
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets:
|
||||
- alertmanager:9093
|
||||
|
||||
rule_files:
|
||||
- '/etc/prometheus/alerts/*.yml'
|
||||
|
||||
Reference in New Issue
Block a user