Files
smom-dbis-138/monitoring/prometheus/alerts/besu.yml
defiQUG 1fb7266469 Add Oracle Aggregator and CCIP Integration
- Introduced Aggregator.sol for Chainlink-compatible oracle functionality, including round-based updates and access control.
- Added OracleWithCCIP.sol to extend Aggregator with CCIP cross-chain messaging capabilities.
- Created .gitmodules to include OpenZeppelin contracts as a submodule.
- Developed a comprehensive deployment guide in NEXT_STEPS_COMPLETE_GUIDE.md for Phase 2 and smart contract deployment.
- Implemented Vite configuration for the orchestration portal, supporting both Vue and React frameworks.
- Added server-side logic for the Multi-Cloud Orchestration Portal, including API endpoints for environment management and monitoring.
- Created scripts for resource import and usage validation across non-US regions.
- Added tests for CCIP error handling and integration to ensure robust functionality.
- Included various new files and directories for the orchestration portal and deployment scripts.
2025-12-12 14:57:48 -08:00

128 lines
4.4 KiB
YAML

groups:
# Health alerts for the Besu nodes themselves: peering, chain progress,
# RPC errors, validator duty, and host/container resources.
# Every rule in this group is evaluated every 30 seconds.
- name: besu_alerts
  interval: 30s
  rules:
  # Peer count alerts
  # Fires when a node has had fewer than 2 connected peers for 5 minutes.
  # The current-peer gauge is used here: besu_peers_connected_total is a
  # cumulative counter of connections, so it stays high even after every
  # peer has dropped and would never trip a "< 2" threshold.
  - alert: BesuLowPeerCount
    expr: ethereum_peer_count < 2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Besu node has low peer count"
      description: "Besu node {{ $labels.instance }} has only {{ $value }} peers connected"
  # Block production alerts
  # Fires when the head block's timestamp has been more than 20 seconds
  # behind wall-clock time for 1 minute.
  # NOTE(review): confirm this metric name against the node's /metrics
  # output; Besu's documented gauge may be named differently.
  - alert: BesuChainStalled
    expr: time() - besu_blockchain_head_timestamp > 20
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Besu chain is stalled"
      description: "Besu node {{ $labels.instance }} has not produced a block in {{ $value }} seconds"
  # Block lag alerts
  # Fires when the two head-number series differ by more than 2 blocks
  # for 5 minutes.
  # NOTE(review): as written this fires when the local head is AHEAD of the
  # fork-choice head, while the description says "behind" - confirm the
  # operand order and that both metrics are actually exported.
  - alert: BesuBlockLag
    expr: besu_blockchain_head_number - besu_blockchain_fork_choice_head_number > 2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Besu node is lagging behind"
      description: "Besu node {{ $labels.instance }} is {{ $value }} blocks behind the head"
  # RPC error rate alerts
  # Fires when any single eth_* method series records more than 0.02 failed
  # requests per second (5-minute rate) for 5 minutes. This is an absolute
  # errors-per-second threshold, not a failed/total ratio.
  - alert: BesuHighRpcErrorRate
    expr: rate(besu_rpc_requests_total{method=~"eth_.*",result="error"}[5m]) > 0.02
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Besu RPC has high error rate"
      description: "Besu node {{ $labels.instance }} has {{ $value }} error rate for RPC requests"
  # Disk usage alerts
  # Fires when less than 20% of the /data filesystem has been available
  # for 5 minutes.
  - alert: BesuHighDiskUsage
    expr: (node_filesystem_avail_bytes{mountpoint="/data"} / node_filesystem_size_bytes{mountpoint="/data"}) < 0.2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Besu node has high disk usage"
      description: "Besu node {{ $labels.instance }} has less than 20% disk space available"
  # Validator missed proposals
  # Fires when the trailing-hour increase in missed proposals has stayed
  # above 5 for a full hour.
  - alert: BesuValidatorMissedProposals
    expr: increase(besu_validator_missed_proposals_total[1h]) > 5
    for: 1h
    labels:
      severity: warning
    annotations:
      summary: "Besu validator missed proposals"
      description: "Besu validator {{ $labels.instance }} has missed {{ $value }} proposals in the last hour"
  # Transaction pool size alerts
  # Fires when more than 10000 transactions have been sitting in the pool
  # for 5 minutes. The pool-size gauge is used here: a *_added_total counter
  # only ever grows, so thresholding it would latch the alert on permanently
  # once 10000 transactions had ever been added.
  - alert: BesuHighTransactionPoolSize
    expr: besu_transaction_pool_transactions > 10000
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Besu transaction pool is large"
      description: "Besu node {{ $labels.instance }} has {{ $value }} transactions in the pool"
  # Memory usage alerts
  # Fires when a besu-* pod's container has used more than 90% of its memory
  # limit for 5 minutes. The "> 0" filter drops containers that have no
  # limit configured (reported as 0); dividing by 0 yields +Inf, which would
  # otherwise keep this alert firing for them.
  - alert: BesuHighMemoryUsage
    expr: >-
      (container_memory_usage_bytes{pod=~"besu-.*"}
      / (container_spec_memory_limit_bytes{pod=~"besu-.*"} > 0))
      > 0.9
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Besu node has high memory usage"
      description: "Besu node {{ $labels.instance }} is using {{ $value }} of its memory limit"
  # CPU usage alerts
  # Fires when a besu-* pod's container has averaged more than 0.8 CPU
  # seconds per second (5-minute rate) for 5 minutes. This is an absolute
  # usage threshold, not a fraction of the container's CPU limit.
  - alert: BesuHighCpuUsage
    expr: rate(container_cpu_usage_seconds_total{pod=~"besu-.*"}[5m]) > 0.8
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Besu node has high CPU usage"
      description: "Besu node {{ $labels.instance }} is using {{ $value }} CPU"
- name: oracle_alerts
  # Alerts on the oracle publisher's update errors, data freshness, and
  # reported price deviation. Evaluated every 30 seconds.
  interval: 30s
  rules:
  # Oracle update alerts
  # Fires when the update-error counter has kept increasing within the
  # trailing 5-minute window for 5 minutes.
  - alert: OracleUpdateFailure
    expr: increase(oracle_update_errors_total[5m]) > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Oracle update failed"
      description: "Oracle publisher has {{ $value }} update errors in the last 5 minutes"
  # Oracle stale data alerts
  # Fires when the last recorded update has been more than 300 seconds old
  # for 5 minutes.
  - alert: OracleStaleData
    expr: time() - oracle_last_update_time > 300
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Oracle data is stale"
      description: "Oracle has not been updated in {{ $value }} seconds"
  # Oracle price deviation alerts
  # Fires when the reported deviation has stayed above 5 for 5 minutes.
  # NOTE(review): the description renders the value as a percentage -
  # confirm the metric is exported in percent and is never negative.
  - alert: OracleHighPriceDeviation
    expr: oracle_price_deviation > 5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Oracle has high price deviation"
      description: "Oracle price deviation is {{ $value }}%"