feat: comprehensive project structure improvements and Cloud for Sovereignty landing zone

- Add Cloud for Sovereignty landing zone architecture and deployment
- Implement complete legal document management system
- Reorganize documentation with improved navigation
- Add infrastructure improvements (Dockerfiles, K8s, monitoring)
- Add operational improvements (graceful shutdown, rate limiting, caching)
- Create comprehensive project structure documentation
- Add Azure deployment automation scripts
- Improve repository navigation and organization
This commit is contained in:
defiQUG
2025-11-13 09:32:55 -08:00
parent 92cc41d26d
commit 6a8582e54d
202 changed files with 22699 additions and 981 deletions

View File

@@ -0,0 +1,103 @@
# Alerting rules, organised as named groups.
groups:
# Availability, error-rate and latency alerts for the HTTP services.
# Rules in this group are evaluated every 30 seconds.
- name: service_health
  interval: 30s
  rules:
  # A scrape target whose job name ends in "-service" has reported
  # up == 0 for 5 minutes straight.
  # NOTE(review): if service discovery returns no targets at all there is
  # no `up` series to compare, so this rule stays silent — consider an
  # absent() companion rule.
  - alert: ServiceDown
    expr: 'up{job=~".*-service"} == 0'
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'Service {{ $labels.job }} is down'
      description: 'Service {{ $labels.job }} has been down for more than 5 minutes'
  # The per-series rate of 5xx responses has stayed above 0.05 per second
  # for 5 minutes.
  # NOTE(review): this is an absolute rate per series, not a share of total
  # traffic — confirm that is the intended threshold.
  - alert: HighErrorRate
    expr: 'rate(http_requests_total{status=~"5.."}[5m]) > 0.05'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High error rate in {{ $labels.job }}'
      description: 'Error rate is {{ $value }} errors per second'
  # The 95th-percentile request duration computed over 5-minute windows has
  # stayed above 2 (seconds, going by the metric name) for 10 minutes.
  - alert: HighResponseTime
    expr: 'histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2'
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'High response time in {{ $labels.job }}'
      description: '95th percentile response time is {{ $value }} seconds'
# Container CPU / memory pressure and restart alerts.
# Rules in this group are evaluated every 30 seconds.
- name: resource_usage
  interval: 30s
  rules:
  # Per-series CPU consumption (CPU-seconds per second) above 0.8 for
  # 10 minutes.
  - alert: HighCPUUsage
    expr: rate(container_cpu_usage_seconds_total[5m]) > 0.8
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "High CPU usage in {{ $labels.pod }}"
      description: "CPU usage is {{ $value }}"
  # Memory usage above 90% of the configured limit for 10 minutes.
  - alert: HighMemoryUsage
    # Containers without a memory limit report a limit of 0; dividing by it
    # yields +Inf, which would satisfy "> 0.9" and fire for every unlimited
    # container. The "> 0" filter drops those series before the division.
    expr: >-
      container_memory_usage_bytes
      / (container_spec_memory_limit_bytes > 0)
      > 0.9
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "High memory usage in {{ $labels.pod }}"
      # $value is a 0..1 ratio; humanizePercentage renders it as a percent.
      description: "Memory usage is {{ $value | humanizePercentage }} of the limit"
  # At least one container restart within the last 15 minutes, sustained
  # for 5 minutes.
  - alert: PodCrashLooping
    # increase() yields the restart count over the window, which is what the
    # description reports; rate() would yield restarts per second instead.
    # The firing condition (> 0) is the same for both.
    expr: increase(kube_pod_container_status_restarts_total[15m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Pod {{ $labels.pod }} is crash looping"
      description: 'Pod has restarted {{ printf "%.0f" $value }} times in the last 15 minutes'
# PostgreSQL connection-saturation and query-latency alerts.
# Rules in this group are evaluated every 30 seconds.
- name: database
  interval: 30s
  rules:
  # More than 80% of the server's connection slots in use for 5 minutes.
  - alert: DatabaseConnectionHigh
    # pg_stat_database has no max_connections column; the configured limit
    # is a server setting (pg_settings_max_connections in postgres_exporter —
    # TODO confirm the exporter in use exposes it under this name).
    # numbackends is reported per database, so it is summed per instance
    # before being compared with the per-instance limit.
    expr: >-
      sum by (instance) (pg_stat_database_numbackends)
      / on (instance) pg_settings_max_connections
      > 0.8
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High database connection usage"
      # $value is a 0..1 ratio; humanizePercentage renders it as a percent.
      description: "{{ $value | humanizePercentage }} of max connections in use"
  # Mean statement execution time above 1 second for 10 minutes.
  - alert: DatabaseSlowQueries
    # A mean is a gauge, so it is averaged over the window; rate() is only
    # meaningful for counters.
    # Assumes the exporter exposes the raw pg_stat_statements column, which
    # PostgreSQL reports in milliseconds — TODO confirm; drop the "/ 1000"
    # if the metric is already exported in seconds.
    expr: avg_over_time(pg_stat_statements_mean_exec_time[5m]) / 1000 > 1
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "Slow database queries detected"
      description: "Average query time is {{ $value }} seconds"
# Alerts on the Entra VerifiedID integration metrics.
# Rules in this group are evaluated every 30 seconds.
- name: entra_verifiedid
  interval: 30s
  rules:
  # Error counter growing faster than 0.1 per second for 5 minutes.
  - alert: EntraAPIFailure
    expr: rate(entra_api_errors_total[5m]) > 0.1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "High Entra VerifiedID API error rate"
      description: "Error rate is {{ $value }} errors per second"
  # Less than 10% of the rate-limit budget left for 5 minutes.
  - alert: EntraRateLimitApproaching
    expr: entra_rate_limit_remaining / entra_rate_limit_total < 0.1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Entra VerifiedID rate limit approaching"
      # $value is a 0..1 ratio; printing it raw followed by "%" would show
      # e.g. "0.05%" when 5% remains. humanizePercentage renders "5%".
      description: "Only {{ $value | humanizePercentage }} of rate limit remaining"

View File

@@ -0,0 +1,85 @@
{
  "dashboard": {
    "title": "The Order Services Overview",
    "tags": [
      "the-order",
      "services",
      "overview"
    ],
    "timezone": "browser",
    "refresh": "30s",
    "schemaVersion": 27,
    "version": 1,
    "panels": [
      {
        "id": 1,
        "type": "graph",
        "title": "Request Rate",
        "gridPos": {
          "x": 0,
          "y": 0,
          "w": 12,
          "h": 8
        },
        "targets": [
          {
            "expr": "sum(rate(http_requests_total[5m])) by (job)",
            "legendFormat": "{{job}}"
          }
        ]
      },
      {
        "id": 2,
        "type": "graph",
        "title": "Error Rate",
        "gridPos": {
          "x": 12,
          "y": 0,
          "w": 12,
          "h": 8
        },
        "targets": [
          {
            "expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) by (job)",
            "legendFormat": "{{job}}"
          }
        ]
      },
      {
        "id": 3,
        "type": "graph",
        "title": "Response Time (95th percentile)",
        "gridPos": {
          "x": 0,
          "y": 8,
          "w": 12,
          "h": 8
        },
        "targets": [
          {
            "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, job))",
            "legendFormat": "{{job}}"
          }
        ]
      },
      {
        "id": 4,
        "type": "graph",
        "title": "CPU Usage",
        "gridPos": {
          "x": 12,
          "y": 8,
          "w": 12,
          "h": 8
        },
        "targets": [
          {
            "expr": "sum(rate(container_cpu_usage_seconds_total[5m])) by (pod)",
            "legendFormat": "{{pod}}"
          }
        ]
      },
      {
        "id": 5,
        "type": "graph",
        "title": "Memory Usage",
        "gridPos": {
          "x": 0,
          "y": 16,
          "w": 12,
          "h": 8
        },
        "targets": [
          {
            "expr": "sum(container_memory_usage_bytes) by (pod) / sum(container_spec_memory_limit_bytes) by (pod) * 100",
            "legendFormat": "{{pod}}"
          }
        ]
      },
      {
        "id": 6,
        "type": "graph",
        "title": "Active Connections",
        "gridPos": {
          "x": 12,
          "y": 16,
          "w": 12,
          "h": 8
        },
        "targets": [
          {
            "expr": "sum(pg_stat_database_numbackends) by (datname)",
            "legendFormat": "{{datname}}"
          }
        ]
      }
    ]
  }
}

View File

@@ -0,0 +1,60 @@
# ConfigMap carrying the Fluentd pipeline: tail container log files, enrich
# each record with Kubernetes metadata and static fields, ship to OpenSearch.
apiVersion: v1
kind: ConfigMap
metadata:
  name: fluentd-config
  namespace: the-order
data:
  fluent.conf: |
    # Tail every container log file, parsing each line as JSON.
    # NOTE(review): nodes using a CRI runtime (containerd / CRI-O) write a
    # non-JSON line format — confirm the runtime or switch to a CRI parser.
    <source>
      @type tail
      path /var/log/containers/*.log
      pos_file /var/log/fluentd-containers.log.pos
      tag kubernetes.*
      read_from_head true
      <parse>
        @type json
        time_key time
        time_format %Y-%m-%dT%H:%M:%S.%NZ
        keep_time_key true
      </parse>
    </source>

    <filter kubernetes.**>
      @type kubernetes_metadata
    </filter>

    # Stamp every record with the cluster name and deployment environment.
    <filter kubernetes.**>
      @type record_transformer
      <record>
        cluster_name the-order
        # Fluentd does not understand shell-style ${VAR:-default}; it would be
        # written into the record verbatim. Embedded Ruby inside a
        # double-quoted value is evaluated when the config is loaded and
        # falls back to "production" when ENVIRONMENT is not set.
        environment "#{ENV['ENVIRONMENT'] || 'production'}"
      </record>
    </filter>

    # NOTE(review): this ConfigMap lives in namespace "the-order" while the
    # host below addresses a Service in namespace "logging" — confirm where
    # the OpenSearch Service actually runs.
    <match kubernetes.**>
      @type opensearch
      host opensearch.logging.svc.cluster.local
      port 9200
      index_name the-order-logs
      type_name _doc
      logstash_format true
      logstash_prefix the-order
      logstash_dateformat %Y.%m.%d
      include_tag_key true
      tag_key @log_name
      # Flush cadence is configured once, in the <buffer> section below; the
      # former top-level "flush_interval 10s" contradicted it and is not
      # applied when a <buffer> section is present.
      <buffer>
        @type file
        path /var/log/fluentd-buffers/kubernetes.system.buffer
        flush_mode interval
        retry_type exponential_backoff
        flush_thread_count 2
        flush_interval 5s
        retry_max_interval 30
        retry_timeout 60m
        chunk_limit_size 2M
        queue_limit_length 8
        overflow_action block
      </buffer>
    </match>

View File

@@ -0,0 +1,15 @@
# ConfigMap holding the opensearch.yml settings for a single-node cluster
# named "the-order-logs".
# NOTE(review): the settings below disable the security plugin while binding
# to 0.0.0.0 — presumably this leaves the HTTP API unauthenticated on every
# interface; confirm access is restricted by other means (e.g. network policy).
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: the-order
  name: opensearch-config
data:
  opensearch.yml: |
    cluster.name: the-order-logs
    node.name: opensearch-0
    network.host: 0.0.0.0
    discovery.type: single-node
    path.data: /usr/share/opensearch/data
    path.logs: /usr/share/opensearch/logs
    plugins.security.disabled: true

View File

@@ -0,0 +1,142 @@
# Global defaults: scrape targets and evaluate rules every 15 seconds, and
# attach the cluster / environment labels configured as external labels.
global:
  evaluation_interval: 15s
  scrape_interval: 15s
  external_labels:
    environment: production
    cluster: the-order
scrape_configs:
# Self-monitoring: Prometheus scrapes its own endpoint on localhost:9090.
- job_name: prometheus
  static_configs:
  - targets:
    - 'localhost:9090'
# Intake Service: discover pods in namespace "the-order", keep those whose
# "app" label is intake-service, and scrape <pod IP>:4001/metrics.
- job_name: intake-service
  metrics_path: /metrics
  kubernetes_sd_configs:
  - role: pod
    namespaces:
      names:
      - the-order
  relabel_configs:
  # Drop every discovered pod that is not part of this service.
  - action: keep
    source_labels:
    - __meta_kubernetes_pod_label_app
    regex: intake-service
  # Point the scrape address at the pod IP on the service port.
  - action: replace
    source_labels:
    - __meta_kubernetes_pod_ip
    target_label: __address__
    replacement: '$1:4001'
  # Copy each pod label onto the scraped series.
  - action: labelmap
    regex: '__meta_kubernetes_pod_label_(.+)'
# Identity Service: discover pods in namespace "the-order", keep those whose
# "app" label is identity-service, and scrape <pod IP>:4002/metrics.
- job_name: identity-service
  metrics_path: /metrics
  kubernetes_sd_configs:
  - role: pod
    namespaces:
      names:
      - the-order
  relabel_configs:
  # Drop every discovered pod that is not part of this service.
  - action: keep
    source_labels:
    - __meta_kubernetes_pod_label_app
    regex: identity-service
  # Point the scrape address at the pod IP on the service port.
  - action: replace
    source_labels:
    - __meta_kubernetes_pod_ip
    target_label: __address__
    replacement: '$1:4002'
  # Copy each pod label onto the scraped series.
  - action: labelmap
    regex: '__meta_kubernetes_pod_label_(.+)'
# Finance Service: discover pods in namespace "the-order", keep those whose
# "app" label is finance-service, and scrape <pod IP>:4003/metrics.
- job_name: finance-service
  metrics_path: /metrics
  kubernetes_sd_configs:
  - role: pod
    namespaces:
      names:
      - the-order
  relabel_configs:
  # Drop every discovered pod that is not part of this service.
  - action: keep
    source_labels:
    - __meta_kubernetes_pod_label_app
    regex: finance-service
  # Point the scrape address at the pod IP on the service port.
  - action: replace
    source_labels:
    - __meta_kubernetes_pod_ip
    target_label: __address__
    replacement: '$1:4003'
  # Copy each pod label onto the scraped series.
  - action: labelmap
    regex: '__meta_kubernetes_pod_label_(.+)'
# Dataroom Service: discover pods in namespace "the-order", keep those whose
# "app" label is dataroom-service, and scrape <pod IP>:4004/metrics.
- job_name: dataroom-service
  metrics_path: /metrics
  kubernetes_sd_configs:
  - role: pod
    namespaces:
      names:
      - the-order
  relabel_configs:
  # Drop every discovered pod that is not part of this service.
  - action: keep
    source_labels:
    - __meta_kubernetes_pod_label_app
    regex: dataroom-service
  # Point the scrape address at the pod IP on the service port.
  - action: replace
    source_labels:
    - __meta_kubernetes_pod_ip
    target_label: __address__
    replacement: '$1:4004'
  # Copy each pod label onto the scraped series.
  - action: labelmap
    regex: '__meta_kubernetes_pod_label_(.+)'
# Legal Documents Service: discover pods in namespace "the-order", keep those
# whose "app" label is legal-documents-service, and scrape
# <pod IP>:4005/metrics.
- job_name: legal-documents-service
  metrics_path: /metrics
  kubernetes_sd_configs:
  - role: pod
    namespaces:
      names:
      - the-order
  relabel_configs:
  # Drop every discovered pod that is not part of this service.
  - action: keep
    source_labels:
    - __meta_kubernetes_pod_label_app
    regex: legal-documents-service
  # Point the scrape address at the pod IP on the service port.
  - action: replace
    source_labels:
    - __meta_kubernetes_pod_ip
    target_label: __address__
    replacement: '$1:4005'
  # Copy each pod label onto the scraped series.
  - action: labelmap
    regex: '__meta_kubernetes_pod_label_(.+)'
# Kubernetes API servers: discover endpoints cluster-wide and keep only the
# "https" port of the "kubernetes" Service in the "default" namespace.
# Scrapes over HTTPS using the pod's mounted service-account CA and token.
- job_name: kubernetes-apiservers
  scheme: https
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
  kubernetes_sd_configs:
  - role: endpoints
  relabel_configs:
  # The three source labels are joined with ";" before matching the regex.
  - action: keep
    source_labels:
    - __meta_kubernetes_namespace
    - __meta_kubernetes_service_name
    - __meta_kubernetes_endpoint_port_name
    regex: 'default;kubernetes;https'
# Kubernetes nodes: scrape each node's kubelet metrics endpoint over HTTPS
# using the pod's mounted service-account CA and token, and copy node labels
# onto the scraped series.
- job_name: 'kubernetes-nodes'
  kubernetes_sd_configs:
  - role: node
  scheme: https
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  relabel_configs:
  - action: labelmap
    regex: __meta_kubernetes_node_label_(.+)
# Per-container resource metrics (the container_* series such as
# container_cpu_usage_seconds_total and container_memory_usage_bytes) are
# served by the kubelet's embedded cAdvisor under /metrics/cadvisor, not
# under the default /metrics path scraped by the job above, so they need
# their own job.
# NOTE(review): remove this job if another scrape config already collects
# cAdvisor metrics for this cluster.
- job_name: 'kubernetes-cadvisor'
  kubernetes_sd_configs:
  - role: node
  scheme: https
  metrics_path: /metrics/cadvisor
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  relabel_configs:
  - action: labelmap
    regex: __meta_kubernetes_node_label_(.+)
# Firing alerts are sent to the Alertmanager reachable at alertmanager:9093.
alerting:
  alertmanagers:
  - static_configs:
    - targets:
      - 'alertmanager:9093'
# Rule files are loaded from every *.yml file in this directory.
rule_files:
- /etc/prometheus/alerts/*.yml