chain138: harden besu txpool recovery and runbooks

This commit is contained in:
defiQUG
2026-04-24 10:46:40 -07:00
parent d9a3053a58
commit 58ca82bbe3
11 changed files with 344 additions and 106 deletions

View File

@@ -4,16 +4,16 @@
set -euo pipefail
# Load IP configuration
# Load shared project environment / VMID host mapping
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true
source "${PROJECT_ROOT}/scripts/lib/load-project-env.sh" 2>/dev/null || true
RPC_CORE_1="${RPC_CORE_1:-192.168.11.211}"
RPC_URL="${RPC_URL:-http://${RPC_CORE_1}:8545}"
DEPLOYER="${DEPLOYER:-0x4A666F96fC8764181194447A7dFdb7d471b301C8}"
PROXMOX_USER="${PROXMOX_USER:-root}"
PROXMOX_ML110="${PROXMOX_ML110:-${PROXMOX_HOST_ML110:-192.168.11.10}}"
PROXMOX_SSH_USER="${PROXMOX_SSH_USER:-root}"
PROXMOX_R630="${PROXMOX_R630:-${PROXMOX_R630_01:-${PROXMOX_HOST_R630_01:-192.168.11.11}}}"
# Colors
@@ -30,6 +30,29 @@ log_warn() { echo -e "${YELLOW}[⚠]${NC} $1"; }
# Print an error line: a red "[✗]" marker followed by the message.
# Arguments: $1 - message text (backslash escapes are interpreted by echo -e)
# Globals:   RED, NC (read)
log_error() {
    local message="$1"
    echo -e "${RED}[✗]${NC} ${message}"
}
# Print a section banner: blank line, rule, title, rule, blank line.
# Arguments: $1 - section title
# Globals:   CYAN, NC (read)
log_section() {
    local title="$1"
    local rule="━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
    echo -e "\n${CYAN}${rule}${NC}"
    echo -e "${CYAN}${title}${NC}"
    echo -e "${CYAN}${rule}${NC}\n"
}
# Resolve the Proxmox host that runs a given validator container.
#
# Arguments: $1 - validator VMID (e.g. 1000)
# Globals:   PROXMOX_R630, PROXMOX_HOST_ML110 (read)
# Outputs:   host name or IP on stdout
#
# Prefers get_host_for_vmid when that function is available. It is not
# defined in this script; presumably the sourced project environment
# provides it (confirm). If the helper is missing, fails, or prints
# nothing, the static layout is used instead: VMIDs <= 1002 map to
# $PROXMOX_R630, everything else to the ML110 host.
validator_host() {
    local vmid="$1"
    local host=""

    if type get_host_for_vmid >/dev/null 2>&1; then
        # An empty answer would make callers build an SSH target such as
        # "root@", so it is treated the same as "helper not available".
        host="$(get_host_for_vmid "$vmid" || true)"
    fi

    if [ -z "$host" ]; then
        # 2>/dev/null: a non-numeric VMID makes `[` complain on stderr; the
        # comparison is then false and the ML110 default applies.
        if [ "$vmid" -le 1002 ] 2>/dev/null; then
            host="$PROXMOX_R630"
        else
            host="${PROXMOX_HOST_ML110:-192.168.11.10}"
        fi
    fi

    echo "$host"
}
# Print the IP address expected for a validator VMID.
#
# Arguments: $1 - validator VMID
# Globals:   IP_VALIDATOR_0 .. IP_VALIDATOR_4 (read; override the defaults)
# Outputs:   IP on stdout for VMIDs 1000-1004
# Returns:   1 (and prints nothing) for any other VMID
validator_ip() {
    local vmid="$1"
    local slot override_name

    # Only the five known validators have an address.
    case "$vmid" in
        1000|1001|1002|1003|1004) ;;
        *) return 1 ;;
    esac

    # The last digit selects both the override variable and the default
    # address: 1003 -> IP_VALIDATOR_3, 192.168.11.103.
    slot="${vmid#100}"
    override_name="IP_VALIDATOR_${slot}"
    echo "${!override_name:-192.168.11.10${slot}}"
}
echo "=== Blockchain Health Monitor ==="
echo "Timestamp: $(date '+%Y-%m-%d %H:%M:%S')"
echo ""
@@ -62,7 +85,7 @@ if [ "$BLOCK_DIFF" -gt 0 ]; then
else
log_error "Block production stalled (no new blocks in 5s)"
# If validators are all active, they may still be syncing (QBFT does not produce until sync completes)
SYNC_HINT=$(ssh -o ConnectTimeout=3 -o StrictHostKeyChecking=no "${PROXMOX_USER}@${PROXMOX_R630}" \
SYNC_HINT=$(ssh -o ConnectTimeout=3 -o StrictHostKeyChecking=no "${PROXMOX_SSH_USER}@${PROXMOX_R630}" \
"pct exec 1000 -- journalctl -u besu-validator --no-pager -n 30 2>/dev/null" 2>/dev/null | grep -c "Full sync\|initial sync in progress\|QBFT mining coordinator not starting" || true)
if [ "${SYNC_HINT:-0}" -gt 0 ]; then
echo " → Validators may be syncing; block production will resume when sync completes (see docs/06-besu/CRITICAL_ISSUE_BLOCK_PRODUCTION_STOPPED.md)."
@@ -104,20 +127,32 @@ else
log_warn "$PENDING_COUNT pending transactions (nonces $((LATEST_DEC + 1))-$PENDING_DEC)"
fi
# Size of the global txpool, taken as the length of the JSON value returned by
# txpool_besuTransactions. Best effort: if the RPC call fails or returns
# nothing the check is skipped and GLOBAL_PENDING stays 0; if jq cannot
# process the reply the count falls back to 0.
GLOBAL_PENDING=0
TXPOOL_JSON="$(cast rpc txpool_besuTransactions --rpc-url "$RPC_URL" 2>/dev/null || echo "")"
if [[ -n "$TXPOOL_JSON" ]]; then
    GLOBAL_PENDING="$(jq 'length' <<< "$TXPOOL_JSON" 2>/dev/null || echo "0")"
    if [ "${GLOBAL_PENDING:-0}" -gt 0 ]; then
        log_warn "Global txpool still has $GLOBAL_PENDING transaction(s)"
    else
        log_success "Global txpool is empty"
    fi
fi
# Check validator status
log_section "Validator Status"
VALIDATORS=(
"1000:$PROXMOX_R630"
"1001:$PROXMOX_R630"
"1002:$PROXMOX_R630"
"1003:$PROXMOX_ML110"
"1004:$PROXMOX_ML110"
"1000:$(validator_host 1000)"
"1001:$(validator_host 1001)"
"1002:$(validator_host 1002)"
"1003:$(validator_host 1003)"
"1004:$(validator_host 1004)"
)
ACTIVE_COUNT=0
P2P_MISMATCHES=0
for validator in "${VALIDATORS[@]}"; do
IFS=':' read -r VMID HOST <<< "$validator"
SSH_TARGET="${PROXMOX_USER}@${HOST}"
SSH_TARGET="${PROXMOX_SSH_USER}@${HOST}"
STATUS=$(ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$SSH_TARGET" \
"pct exec $VMID -- systemctl is-active besu-validator" 2>/dev/null || echo "unknown")
@@ -127,6 +162,17 @@ for validator in "${VALIDATORS[@]}"; do
else
log_warn "Validator $VMID: $STATUS"
fi
# Compare the p2p-host configured inside the container with the address this
# script expects for the VMID. validator_ip returns non-zero for VMIDs it does
# not know, which leaves EXPECTED_P2P_HOST empty and skips the check.
EXPECTED_P2P_HOST="$(validator_ip "$VMID" 2>/dev/null || echo "")"
if [ -n "$EXPECTED_P2P_HOST" ]; then
# The command passes through three shells (local -> Proxmox host over ssh ->
# bash inside the container via pct exec), hence the stacked escaping. Inside
# the container the sed script is s/^[^"]*"([^"]+)".*/\1/, i.e. keep only the
# first double-quoted value on the p2p-host= line. stderr is discarded and a
# non-zero ssh exit falls back to echo "".
P2P_HOST_CFG=$(ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$SSH_TARGET" \
"pct exec $VMID -- bash -lc 'grep -E \"^p2p-host=\" /etc/besu/config-validator.toml 2>/dev/null | sed -E \"s/^[^\\\"]*\\\"([^\\\"]+)\\\".*/\\1/\"'" \
2>/dev/null || echo "")
# An empty result (no output from the remote command, e.g. no p2p-host= line
# in the file) is not counted as a mismatch.
if [ -n "$P2P_HOST_CFG" ] && [ "$P2P_HOST_CFG" != "$EXPECTED_P2P_HOST" ]; then
log_warn "Validator $VMID p2p-host mismatch: $P2P_HOST_CFG (expected $EXPECTED_P2P_HOST)"
P2P_MISMATCHES=$((P2P_MISMATCHES + 1))
fi
fi
done
if [ "$ACTIVE_COUNT" -eq 5 ]; then
@@ -173,11 +219,21 @@ else
log_success "✓ All validators active"
fi
# Each finding below adds one to ISSUES, which decides the overall status.

# Validators whose configured p2p-host differs from the expected address.
if [ "$P2P_MISMATCHES" -gt 0 ]; then
    log_error "$P2P_MISMATCHES validator(s) have the wrong p2p-host"
    ISSUES=$((ISSUES + 1))
fi

# More than 10 pending transactions.
if [ "$PENDING_COUNT" -gt 10 ]; then
    log_warn "⚠ High number of pending transactions ($PENDING_COUNT)"
    ISSUES=$((ISSUES + 1))
fi

# Transactions sit in the global txpool while TX_COUNT_TOTAL is zero.
if [ "${GLOBAL_PENDING:-0}" -gt 0 ]; then
    if [ "$TX_COUNT_TOTAL" -eq 0 ]; then
        log_warn "⚠ Global txpool has $GLOBAL_PENDING transaction(s) but recent blocks are empty"
        ISSUES=$((ISSUES + 1))
    fi
fi
echo ""
if [ "$ISSUES" -eq 0 ]; then
log_success "Overall Status: HEALTHY"