chain138: harden besu txpool recovery and runbooks

This commit is contained in:
defiQUG
2026-04-24 10:46:40 -07:00
parent d9a3053a58
commit 58ca82bbe3
11 changed files with 344 additions and 106 deletions

View File

@@ -243,7 +243,7 @@ From **LAN** (SSH to Proxmox + reach NPMplus):
## After running "complete all next steps"
1. **Automated (workspace):** `bash scripts/run-all-next-steps.sh` — report in `docs/04-configuration/verification-evidence/NEXT_STEPS_RUN_*.md`.
2. **Validators + tx-pool:** `bash scripts/fix-all-validators-and-txpool.sh` (requires SSH to .10, .11).
2. **Validators + tx-pool:** `bash scripts/fix-all-validators-and-txpool.sh` then `bash scripts/maintenance/apply-chain138-strict-future-tx-pool.sh` then `bash scripts/clear-all-transaction-pools.sh` (requires SSH to .11, .12, .13, .14).
3. **Flush stuck tx (if any):** `bash scripts/flush-stuck-tx-rpc-and-validators.sh --full` (clears RPC 2101 + validators 1000–1004).
4. **Verify from LAN:** From a host on 192.168.11.x run `bash scripts/monitoring/monitor-blockchain-health.sh` and `bash scripts/skip-stuck-transactions.sh`. See [NEXT_STEPS_COMPLETION_RUN_20260208.md](../04-configuration/verification-evidence/NEXT_STEPS_COMPLETION_RUN_20260208.md) § Verify from LAN.

View File

@@ -15,6 +15,8 @@
**Current live execution path:** [LIVE_SESSION_CRONOS_AND_TIER1_PHASE_C.md](../03-deployment/LIVE_SESSION_CRONOS_AND_TIER1_PHASE_C.md) — close Cronos config + LINK, then activate Tier 1 Phase C on Gnosis, Polygon, and BSC. **Current priority docs:** [FULLY_OPERATIONAL_EXECUTION_CHECKLIST.md](FULLY_OPERATIONAL_EXECUTION_CHECKLIST.md), [PHASE_C_PROFIT_FIRST_PRIORITY.md](../03-deployment/PHASE_C_PROFIT_FIRST_PRIORITY.md), [PHASE_C_TIER1_EXECUTION_TASK_SHEET.md](../03-deployment/PHASE_C_TIER1_EXECUTION_TASK_SHEET.md).
**Chain 138 txpool incident standard path:** `bash scripts/fix-all-validators-and-txpool.sh` then `bash scripts/maintenance/apply-chain138-strict-future-tx-pool.sh` then `bash scripts/clear-all-transaction-pools.sh` then `bash scripts/monitoring/monitor-blockchain-health.sh`. Use this sequence when block production stalls, pending hashes keep reappearing, or future-nonce residue survives a normal txpool clear.
---
## Completed in this session (2026-03-26)

View File

@@ -14,6 +14,8 @@ This document provides a master index of all operational runbooks and procedures
**Proxmox VE hosts, peering, FQDN/NPMplus summary, deployment gates (human + JSON):** [PROXMOX_VE_OPERATIONAL_DEPLOYMENT_TEMPLATE.md](PROXMOX_VE_OPERATIONAL_DEPLOYMENT_TEMPLATE.md).
**Chain 138 txpool incident standard recovery:** `bash scripts/fix-all-validators-and-txpool.sh` then `bash scripts/maintenance/apply-chain138-strict-future-tx-pool.sh` then `bash scripts/clear-all-transaction-pools.sh` then `bash scripts/monitoring/monitor-blockchain-health.sh`. This is the default path when pending hashes keep reappearing, future-nonce junk survives a clear, or block production is affected by txpool residue.
---
## Quick Reference
@@ -202,6 +204,7 @@ If an RPC node returns wrong chain ID or block 0 / no block: use the dedicated r
- **[QBFT_TROUBLESHOOTING.md](../09-troubleshooting/QBFT_TROUBLESHOOTING.md)** - QBFT consensus troubleshooting
- **Block Production Issues** - [BLOCK_PRODUCTION_FIX_RUNBOOK.md](../08-monitoring/BLOCK_PRODUCTION_FIX_RUNBOOK.md) — restore block production (permissioning TOML, tx-pool, restart validators 1000–1004)
- **Validator Recognition** - Validator not being recognized
- **Chain 138 txpool incidents** - Run `bash scripts/fix-all-validators-and-txpool.sh`, then `bash scripts/maintenance/apply-chain138-strict-future-tx-pool.sh`, then `bash scripts/clear-all-transaction-pools.sh`, then `bash scripts/monitoring/monitor-blockchain-health.sh`. The strict future-queue step normalizes `tx-pool-max-future-by-sender=1` fleet-wide so far-future thirdweb or automation residue does not repopulate the mesh after restart.
---
@@ -213,6 +216,13 @@ If an RPC node returns wrong chain ID or block 0 / no block: use the dedicated r
---
## TsunamiSwap
- **[../00-meta/AAVE_CHAIN138_AND_MARIONETTE_TSUNAMISWAP_PLAN.md](../00-meta/AAVE_CHAIN138_AND_MARIONETTE_TSUNAMISWAP_PLAN.md)** — Canonical TsunamiSwap VM `5010` plan, current repo-authoritative DEX link, and publish checklist.
- **Inventory helper:** [`../../scripts/deployment/tsunamiswap-vm-5010-provision.sh`](../../scripts/deployment/tsunamiswap-vm-5010-provision.sh)
---
## GRU M1 Listing Operations
### GRU M1 Listing Dry-Run

View File

@@ -8,10 +8,10 @@ set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true
source "${PROJECT_ROOT}/scripts/lib/load-project-env.sh" 2>/dev/null || true
PROXMOX_USER="${PROXMOX_USER:-root}"
PROXMOX_ML110="${PROXMOX_ML110:-192.168.11.10}"
PROXMOX_SSH_USER="${PROXMOX_SSH_USER:-root}"
PROXMOX_R630="${PROXMOX_R630:-192.168.11.11}"
# Colors
@@ -31,12 +31,25 @@ log_section() { echo -e "\n${CYAN}━━━━━━━━━━━━━━━
echo "=== Clear Transaction Pools on All Nodes ==="
echo ""
# Resolve the Proxmox host that owns a VMID. Prefers the shared
# get_host_for_vmid mapping (loaded from load-project-env.sh) when present;
# otherwise falls back to a coarse static split: VMIDs <= 1002 on the R630,
# public RPC 2201 on r630-02, everything else on ML110.
host_for_vmid() {
  local id="$1"
  if type get_host_for_vmid >/dev/null 2>&1; then
    get_host_for_vmid "$id"
    return
  fi
  case "$id" in
    2201) echo "${PROXMOX_R630_02:-${PROXMOX_HOST_R630_02:-192.168.11.12}}" ;;
    *)
      if [ "$id" -le 1002 ]; then
        echo "$PROXMOX_R630"
      else
        echo "${PROXMOX_HOST_ML110:-192.168.11.10}"
      fi
      ;;
  esac
}
# Function to clear transaction pool for a node
clear_node_pool() {
local VMID=$1
local HOST=$2
local NODE_TYPE=$3
local SSH_TARGET="${PROXMOX_USER}@${HOST}"
local SSH_TARGET="${PROXMOX_SSH_USER}@${HOST}"
log_info "Clearing transaction pool for $NODE_TYPE (VMID $VMID on $HOST)..."
@@ -93,11 +106,11 @@ clear_node_pool() {
log_section "Clearing Validator Transaction Pools"
# Validator containers, formatted VMID:host:label for clear_node_pool.
VALIDATORS=()
for vid in 1000 1001 1002 1003 1004; do
  VALIDATORS+=("${vid}:$(host_for_vmid "$vid"):Validator")
done
for validator in "${VALIDATORS[@]}"; do
@@ -105,28 +118,61 @@ for validator in "${VALIDATORS[@]}"; do
clear_node_pool "$VMID" "$HOST" "$TYPE"
done
# Clear RPC Core (2101)
log_section "Clearing RPC Transaction Pool (2101)"
# RPC containers as VMID:label pairs; the owning host is resolved per-VMID
# at loop time via host_for_vmid.
RPC_NODES=()
RPC_NODES+=("2101:Core RPC")
RPC_NODES+=("2102:Core RPC Replica")
RPC_NODES+=("2103:Thirdweb admin core RPC")
RPC_NODES+=("2201:RPC Public")
RPC_NODES+=("2301:RPC Private")
RPC_NODES+=("2303:RPC Ali 0x8a")
RPC_NODES+=("2304:RPC Ali 0x1")
RPC_NODES+=("2305:RPC Luis 0x8a")
RPC_NODES+=("2306:RPC Luis 0x1")
RPC_NODES+=("2307:RPC Putu 0x8a")
RPC_NODES+=("2308:RPC Putu 0x1")
RPC_NODES+=("2400:Thirdweb RPC 1")
RPC_NODES+=("2401:Thirdweb RPC 2")
RPC_NODES+=("2402:Thirdweb RPC 3")
RPC_NODES+=("2403:Thirdweb RPC 4")
if ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "${PROXMOX_USER}@${PROXMOX_ML110}" \
"pct list | grep -q '2101'" 2>/dev/null; then
clear_node_pool 2101 "$PROXMOX_ML110" "RPC"
elif ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "${PROXMOX_USER}@${PROXMOX_R630}" \
"pct list | grep -q '2101'" 2>/dev/null; then
clear_node_pool 2101 "$PROXMOX_R630" "RPC"
else
log_warn "RPC node (2101) not found on either host"
fi
log_section "Clearing RPC Transaction Pools"
# Clear each RPC node only when its container actually exists on the
# resolved host; warn (do not fail) otherwise.
for entry in "${RPC_NODES[@]}"; do
  VMID="${entry%%:*}"
  TYPE="${entry#*:}"
  HOST="$(host_for_vmid "$VMID")"
  if ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "${PROXMOX_SSH_USER}@${HOST}" \
    "pct list | awk '{print \$1}' | grep -qx '$VMID'" 2>/dev/null; then
    clear_node_pool "$VMID" "$HOST" "$TYPE"
  else
    log_warn "$TYPE (VMID $VMID) not found on ${HOST}"
  fi
done
# Clear RPC Public (2201) — often used when Core is down; ensures deploy txs not stuck
log_section "Clearing RPC Public (2201)"
R630_02="${PROXMOX_R630_02:-${PROXMOX_HOST_R630_02:-192.168.11.12}}"
if ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "${PROXMOX_USER}@${R630_02}" \
"pct list | grep -q '2201'" 2>/dev/null; then
clear_node_pool 2201 "$R630_02" "RPC Public"
else
log_warn "RPC Public (2201) not found on ${R630_02}"
fi
# Sentry containers 1500-1510, formatted VMID:label.
SENTRY_NODES=()
for vid in {1500..1510}; do
  SENTRY_NODES+=("${vid}:Sentry")
done
log_section "Clearing Sentry Transaction Pools"
# Same existence-check-then-clear pattern as the RPC loop above.
for entry in "${SENTRY_NODES[@]}"; do
  VMID="${entry%%:*}"
  TYPE="${entry#*:}"
  HOST="$(host_for_vmid "$VMID")"
  if ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "${PROXMOX_SSH_USER}@${HOST}" \
    "pct list | awk '{print \$1}' | grep -qx '$VMID'" 2>/dev/null; then
    clear_node_pool "$VMID" "$HOST" "$TYPE"
  else
    log_warn "$TYPE (VMID $VMID) not found on ${HOST}"
  fi
done
log_section "Transaction Pool Clear Complete"

View File

@@ -1,10 +1,6 @@
#!/usr/bin/env bash
# Fix all validators: remove legacy tx-pool options, set layered pool with aggressive eviction,
# restart besu-validator. Run from project root (sources config/ip-addresses.conf).
#
# Eviction: Besu layered pool has no "drop after N blocks". We use tx-pool-min-score=0
# so transactions that are penalized (not included) get evicted once score drops to 0 or below,
# reducing stuck transactions. See docs/06-besu/TXPOOL_EVICTION_PREVENT_STUCK.md.
# Fix all validators: remove legacy tx-pool options, normalize validator networking,
# and restart besu-validator. Run from project root (sources config/ip-addresses.conf).
#
# Usage: bash scripts/fix-all-validators-and-txpool.sh [--dry-run]
@@ -14,30 +10,43 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
cd "$PROJECT_ROOT"
[ -f config/ip-addresses.conf ] && source config/ip-addresses.conf 2>/dev/null || true
[ -f scripts/lib/load-project-env.sh ] && source scripts/lib/load-project-env.sh 2>/dev/null || true
PROXMOX_USER="${PROXMOX_USER:-root}"
PROXMOX_SSH_USER="${PROXMOX_SSH_USER:-root}"
R630_01="${PROXMOX_HOST_R630_01:-${PROXMOX_R630_01:-192.168.11.11}}"
R630_03="${PROXMOX_HOST_R630_03:-${PROXMOX_R630_03:-192.168.11.13}}"
# VMID -> Proxmox host.
# 1000,1001,1002 on r630-01; 1003,1004 on r630-03 (matches the canonical
# get_host_for_vmid mapping in scripts/lib/load-project-env.sh).
validator_host() {
  local vmid="$1"
  if type get_host_for_vmid >/dev/null 2>&1; then
    get_host_for_vmid "$vmid"
  elif [[ "$vmid" -le 1002 ]]; then
    echo "$R630_01"
  else
    # Fix: the fallback previously echoed ML110 for 1003/1004 even though the
    # comment above and the shared mapping place them on r630-03 — and R630_03
    # was defined but never used.
    echo "$R630_03"
  fi
}
# Management IP for a validator VMID. Honors IP_VALIDATOR_N env overrides,
# defaulting to 192.168.11.10N; returns non-zero for unknown VMIDs.
validator_ip() {
  local vmid="$1"
  case "$vmid" in
    1000|1001|1002|1003|1004) ;;
    *) return 1 ;;
  esac
  local idx=$(( vmid - 1000 ))
  local override="IP_VALIDATOR_${idx}"
  echo "${!override:-192.168.11.$(( 100 + idx ))}"
}
# Validator containers paired with their owning Proxmox hosts (VMID:host).
VALIDATORS=()
for vid in 1000 1001 1002 1003 1004; do
  VALIDATORS+=("${vid}:$(validator_host "$vid")")
done
# Besu validator config path inside each container.
CONFIG_PATH="/etc/besu/config-validator.toml"
# NOTE(review): this template still advertises tx-pool-max-future-by-sender=200
# and tx-pool-min-score=0, while the remote fix in this script enforces
# max-future-by-sender=1 and deletes tx-pool-min-score (unsupported in some
# Besu builds). Looks like stale leftover — confirm whether LAYERED_BLOCK is
# still referenced anywhere before trusting its values.
LAYERED_BLOCK="# Layered Transaction Pool (Besu 23.10+); evict penalized txs (min-score=0)
tx-pool-max-future-by-sender=200
tx-pool-layer-max-capacity=12500000
tx-pool-max-prioritized=2000
tx-pool-price-bump=10
tx-pool-min-score=0
"
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
@@ -58,7 +67,13 @@ echo ""
fix_one() {
local vmid="$1"
local host="$2"
local ssh_target="${PROXMOX_USER}@${host}"
local validator_ip
local ssh_target="${PROXMOX_SSH_USER}@${host}"
validator_ip="$(validator_ip "$vmid")" || {
log_error " Could not determine validator IP for VMID $vmid"
return 1
}
log_info "Validator $vmid on $host"
@@ -71,11 +86,11 @@ fix_one() {
fi
if "$DRY_RUN"; then
log_info " [dry-run] Would remove legacy tx-pool lines and add layered + tx-pool-min-score=0, then restart besu-validator"
log_info " [dry-run] Would remove legacy tx-pool lines, set p2p-host=${validator_ip}, force sync-mode=FULL, and restart besu-validator"
return 0
fi
# Remove legacy options (cause crash with layered pool); add layered + min-score
# Remove legacy options (cause crash with layered pool); normalize validator networking.
ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$ssh_target" "pct exec $vmid -- bash -c '
set -e
CFG=/etc/besu/config-validator.toml
@@ -84,13 +99,19 @@ fix_one() {
sed -i \"/^tx-pool-max-size=/d\" \"\$CFG\" 2>/dev/null || true
sed -i \"/^tx-pool-limit-by-account-percentage=/d\" \"\$CFG\" 2>/dev/null || true
sed -i \"/^tx-pool-retention-hours=/d\" \"\$CFG\" 2>/dev/null || true
sed -i \"s|^p2p-host=.*|p2p-host=\\\"${validator_ip}\\\"|\" \"\$CFG\"
sed -i \"s|^sync-mode=.*|sync-mode=\\\"FULL\\\"|\" \"\$CFG\"
grep -q \"^p2p-host=\" \"\$CFG\" || echo \"p2p-host=\\\"${validator_ip}\\\"\" >> \"\$CFG\"
grep -q \"^sync-mode=\" \"\$CFG\" || echo \"sync-mode=\\\"FULL\\\"\" >> \"\$CFG\"
if ! grep -q \"tx-pool-max-future-by-sender\" \"\$CFG\"; then
echo \"\" >> \"\$CFG\"
echo \"# Layered Transaction Pool (Besu 23.10+)\" >> \"\$CFG\"
echo \"tx-pool-max-future-by-sender=200\" >> \"\$CFG\"
echo \"# Layered Transaction Pool (Besu 23.10+; keep future queue tight)\" >> \"\$CFG\"
echo \"tx-pool-max-future-by-sender=1\" >> \"\$CFG\"
echo \"tx-pool-layer-max-capacity=12500000\" >> \"\$CFG\"
echo \"tx-pool-max-prioritized=2000\" >> \"\$CFG\"
echo \"tx-pool-price-bump=10\" >> \"\$CFG\"
else
sed -i \"s/^tx-pool-max-future-by-sender=.*/tx-pool-max-future-by-sender=1/\" \"\$CFG\"
fi
# tx-pool-min-score=0 not added: unsupported in some Besu builds (causes Unknown option and crash loop)
sed -i \"/^tx-pool-min-score=/d\" \"\$CFG\" 2>/dev/null || true
@@ -98,8 +119,15 @@ fix_one() {
log_success " Config updated"
# Restart
if ! ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$ssh_target" "pct exec $vmid -- systemctl restart besu-validator" 2>/dev/null; then
# Restart with a hard fallback so a wedged Besu process cannot block the whole recovery.
if ! ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$ssh_target" "pct exec $vmid -- bash -lc '
timeout 30 systemctl restart besu-validator || {
systemctl kill -s SIGKILL besu-validator || true
sleep 2
systemctl reset-failed besu-validator || true
systemctl start besu-validator
}
'" 2>/dev/null; then
log_error " Restart failed"
return 1
fi

View File

@@ -53,11 +53,13 @@ fix_one() {
sed -i "/^tx-pool-retention-hours=/d" "$CFG" 2>/dev/null || true
if ! grep -q "tx-pool-max-future-by-sender" "$CFG"; then
echo "" >> "$CFG"
echo "# Layered Transaction Pool (Besu 23.10+)" >> "$CFG"
echo "tx-pool-max-future-by-sender=200" >> "$CFG"
echo "# Layered Transaction Pool (Besu 23.10+; keep future queue tight)" >> "$CFG"
echo "tx-pool-max-future-by-sender=1" >> "$CFG"
echo "tx-pool-layer-max-capacity=12500000" >> "$CFG"
echo "tx-pool-max-prioritized=2000" >> "$CFG"
echo "tx-pool-price-bump=10" >> "$CFG"
else
sed -i "s/^tx-pool-max-future-by-sender=.*/tx-pool-max-future-by-sender=1/" "$CFG"
fi
# Remove tx-pool-min-score if present (unsupported in some Besu builds)
sed -i "/^tx-pool-min-score=/d" "$CFG" 2>/dev/null || true

View File

@@ -154,7 +154,8 @@ get_host_for_vmid() {
7800|7801|7802|7803|7804|7805|7806) echo "${PROXMOX_HOST_R630_01}";;
10130|10150|10151|106|107|108|10000|10001|10020|10100|10101|10120|10203|10233|10235) echo "${PROXMOX_HOST_R630_01}";;
1000|1001|1002|1500|1501|1502|2101|2103) echo "${PROXMOX_HOST_R630_01}";;
1003|1004|1503|1504|1505|1506|1507|1508|1509|1510|2102|2301|2304|2400|2402|2403) echo "${PROXMOX_HOST_R630_03}";;
1003|1004|1503|1504|1505|1506|1507|1509|1510|2102|2301|2304|2400|2402|2403) echo "${PROXMOX_HOST_R630_03}";;
1508) echo "${PROXMOX_HOST_R630_04}";;
5700) echo "${PROXMOX_HOST_R630_04}";;
5000|7810|2201|2303|2305|2306|2307|2308|2401|6200|6201|6202|6203|6204|6205|10234|10237|5800|5801) echo "${PROXMOX_HOST_R630_02}";;
2420|2430|2440|2460|2470|2480) echo "${PROXMOX_HOST_R630_01}";;

View File

@@ -0,0 +1,78 @@
#!/usr/bin/env bash
# Normalize Chain 138 Besu nodes to a strict layered tx-pool future queue.
#
# Usage: apply-chain138-strict-future-tx-pool.sh [--no-restart] [--value N]
#   --no-restart  update configs only; do not restart besu services
#   --value N     tx-pool-max-future-by-sender value to enforce (default 1)
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# Required: provides get_host_for_vmid. A hard failure here is intentional.
source "${PROJECT_ROOT}/scripts/lib/load-project-env.sh"
VALUE=1
RESTART=true
while [[ $# -gt 0 ]]; do
  case "$1" in
    --no-restart) RESTART=false; shift ;;
    --value)
      # Guard the missing-argument case explicitly: "shift 2" with only one
      # positional left would abort under set -e with no error message.
      [[ $# -ge 2 ]] || { echo "--value requires an argument" >&2; exit 2; }
      VALUE="$2"
      shift 2
      ;;
    *)
      echo "Unknown argument: $1" >&2
      exit 2
      ;;
  esac
done
# VALUE is interpolated into remote sed/printf commands below, so accept only
# a positive integer. (The previous "-le" test crashed with a bash arithmetic
# error on non-numeric input instead of printing the refusal message.)
if ! [[ "${VALUE}" =~ ^[0-9]+$ ]] || (( VALUE <= 0 )); then
  echo "Refusing unsafe value '${VALUE}'. Use 1 or more." >&2
  exit 1
fi
# All Chain 138 Besu nodes: validators, sentries, RPC fleet.
NODES=(
  1000 1001 1002 1003 1004
  1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510
  2101 2102 2103 2201 2301 2303 2304 2305 2306 2307 2308
  2400 2401 2402 2403
)
for vmid in "${NODES[@]}"; do
  host="$(get_host_for_vmid "$vmid")"
  # Skip (instead of ssh-ing to a bare "root@") when the mapping has no host.
  if [[ -z "$host" ]]; then
    echo "=== VMID ${vmid}: no host mapping, skipping ===" >&2
    continue
  fi
  echo "=== VMID ${vmid} on ${host} ==="
  ssh -o ConnectTimeout=8 -o StrictHostKeyChecking=no "root@${host}" "pct exec ${vmid} -- bash -lc '
    set -e
    shopt -s nullglob
    found=0
    for cfg in /etc/besu/config*.toml /config/config*.toml; do
      [ -f \"\$cfg\" ] || continue
      found=1
      cp -a \"\$cfg\" \"\$cfg.bak.\$(date +%Y%m%d%H%M%S)\"
      sed -i \"/^tx-pool-min-score=/d\" \"\$cfg\" 2>/dev/null || true
      if grep -q \"^tx-pool-max-future-by-sender=\" \"\$cfg\"; then
        sed -i \"s/^tx-pool-max-future-by-sender=.*/tx-pool-max-future-by-sender=${VALUE}/\" \"\$cfg\"
      else
        printf \"\\n# Strict future queue to avoid stale far-future tx residue\\ntx-pool-max-future-by-sender=${VALUE}\\n\" >> \"\$cfg\"
      fi
      echo \"\$(basename \"\$cfg\"): \$(grep -E \"^tx-pool-max-future-by-sender=\" \"\$cfg\" | tail -1)\"
    done
    if [ \"\$found\" -eq 0 ]; then
      echo \"No Besu config files found\"
    fi
    if ${RESTART}; then
      if systemctl list-unit-files | grep -q \"^besu-validator.service\"; then
        systemctl restart besu-validator
        systemctl is-active besu-validator
      elif systemctl list-unit-files | grep -q \"^besu-sentry.service\"; then
        systemctl restart besu-sentry
        systemctl is-active besu-sentry
      elif systemctl list-unit-files | grep -q \"^besu-rpc-core.service\"; then
        systemctl restart besu-rpc-core
        systemctl is-active besu-rpc-core
      elif systemctl list-unit-files | grep -q \"^besu-rpc.service\"; then
        systemctl restart besu-rpc
        systemctl is-active besu-rpc
      fi
    fi
  '"
  echo
done

View File

@@ -4,16 +4,16 @@
set -euo pipefail
# Load IP configuration
# Load shared project environment / VMID host mapping
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true
source "${PROJECT_ROOT}/scripts/lib/load-project-env.sh" 2>/dev/null || true
RPC_CORE_1="${RPC_CORE_1:-192.168.11.211}"
RPC_URL="${RPC_URL:-http://${RPC_CORE_1}:8545}"
DEPLOYER="${DEPLOYER:-0x4A666F96fC8764181194447A7dFdb7d471b301C8}"
PROXMOX_USER="${PROXMOX_USER:-root}"
PROXMOX_ML110="${PROXMOX_ML110:-${PROXMOX_HOST_ML110:-192.168.11.10}}"
PROXMOX_SSH_USER="${PROXMOX_SSH_USER:-root}"
PROXMOX_R630="${PROXMOX_R630:-${PROXMOX_R630_01:-${PROXMOX_HOST_R630_01:-192.168.11.11}}}"
# Colors
@@ -30,6 +30,29 @@ log_warn() { echo -e "${YELLOW}[⚠]${NC} $1"; }
log_error() { echo -e "${RED}[✗]${NC} $1"; }
log_section() { echo -e "\n${CYAN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"; echo -e "${CYAN}$1${NC}"; echo -e "${CYAN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}\n"; }
# Proxmox host for a validator VMID. Uses the shared get_host_for_vmid
# mapping when it is loaded; otherwise 1000-1002 -> $PROXMOX_R630 and the
# rest -> ML110. NOTE(review): the canonical mapping puts 1003/1004 on
# r630-03, not ML110 — confirm this fallback is intentional for the monitor.
validator_host() {
  local id="$1"
  if type get_host_for_vmid >/dev/null 2>&1; then
    get_host_for_vmid "$id"
  else
    if [ "$id" -le 1002 ]; then
      echo "$PROXMOX_R630"
    else
      echo "${PROXMOX_HOST_ML110:-192.168.11.10}"
    fi
  fi
}
# IP address of a validator VMID; IP_VALIDATOR_N env vars override the
# 192.168.11.100-104 defaults. Returns non-zero for unknown VMIDs.
validator_ip() {
  local vmid="$1"
  [[ "$vmid" == 100[0-4] ]] || return 1
  local n=$(( vmid - 1000 ))
  local var="IP_VALIDATOR_${n}"
  echo "${!var:-192.168.11.$(( 100 + n ))}"
}
echo "=== Blockchain Health Monitor ==="
echo "Timestamp: $(date '+%Y-%m-%d %H:%M:%S')"
echo ""
@@ -62,7 +85,7 @@ if [ "$BLOCK_DIFF" -gt 0 ]; then
else
log_error "Block production stalled (no new blocks in 5s)"
# If validators are all active, they may still be syncing (QBFT does not produce until sync completes)
SYNC_HINT=$(ssh -o ConnectTimeout=3 -o StrictHostKeyChecking=no "${PROXMOX_USER}@${PROXMOX_R630}" \
SYNC_HINT=$(ssh -o ConnectTimeout=3 -o StrictHostKeyChecking=no "${PROXMOX_SSH_USER}@${PROXMOX_R630}" \
"pct exec 1000 -- journalctl -u besu-validator --no-pager -n 30 2>/dev/null" 2>/dev/null | grep -c "Full sync\|initial sync in progress\|QBFT mining coordinator not starting" || true)
if [ "${SYNC_HINT:-0}" -gt 0 ]; then
echo " → Validators may be syncing; block production will resume when sync completes (see docs/06-besu/CRITICAL_ISSUE_BLOCK_PRODUCTION_STOPPED.md)."
@@ -104,20 +127,32 @@ else
log_warn "$PENDING_COUNT pending transactions (nonces $((LATEST_DEC + 1))-$PENDING_DEC)"
fi
# Probe the Besu-wide txpool via cast; an empty response (cast missing or the
# RPC unreachable) leaves GLOBAL_PENDING at 0 and skips the report.
GLOBAL_PENDING=0
TXPOOL_JSON=$(cast rpc txpool_besuTransactions --rpc-url "$RPC_URL" 2>/dev/null || echo "")
if [ -n "$TXPOOL_JSON" ]; then
  GLOBAL_PENDING=$(jq 'length' <<<"$TXPOOL_JSON" 2>/dev/null || echo "0")
  if [ "${GLOBAL_PENDING:-0}" -eq 0 ]; then
    log_success "Global txpool is empty"
  else
    log_warn "Global txpool still has $GLOBAL_PENDING transaction(s)"
  fi
fi
# Check validator status
log_section "Validator Status"
VALIDATORS=(
"1000:$PROXMOX_R630"
"1001:$PROXMOX_R630"
"1002:$PROXMOX_R630"
"1003:$PROXMOX_ML110"
"1004:$PROXMOX_ML110"
"1000:$(validator_host 1000)"
"1001:$(validator_host 1001)"
"1002:$(validator_host 1002)"
"1003:$(validator_host 1003)"
"1004:$(validator_host 1004)"
)
ACTIVE_COUNT=0
P2P_MISMATCHES=0
for validator in "${VALIDATORS[@]}"; do
IFS=':' read -r VMID HOST <<< "$validator"
SSH_TARGET="${PROXMOX_USER}@${HOST}"
SSH_TARGET="${PROXMOX_SSH_USER}@${HOST}"
STATUS=$(ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$SSH_TARGET" \
"pct exec $VMID -- systemctl is-active besu-validator" 2>/dev/null || echo "unknown")
@@ -127,6 +162,17 @@ for validator in "${VALIDATORS[@]}"; do
else
log_warn "Validator $VMID: $STATUS"
fi
EXPECTED_P2P_HOST="$(validator_ip "$VMID" 2>/dev/null || echo "")"
if [ -n "$EXPECTED_P2P_HOST" ]; then
P2P_HOST_CFG=$(ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$SSH_TARGET" \
"pct exec $VMID -- bash -lc 'grep -E \"^p2p-host=\" /etc/besu/config-validator.toml 2>/dev/null | sed -E \"s/^[^\\\"]*\\\"([^\\\"]+)\\\".*/\\1/\"'" \
2>/dev/null || echo "")
if [ -n "$P2P_HOST_CFG" ] && [ "$P2P_HOST_CFG" != "$EXPECTED_P2P_HOST" ]; then
log_warn "Validator $VMID p2p-host mismatch: $P2P_HOST_CFG (expected $EXPECTED_P2P_HOST)"
P2P_MISMATCHES=$((P2P_MISMATCHES + 1))
fi
fi
done
if [ "$ACTIVE_COUNT" -eq 5 ]; then
@@ -173,11 +219,21 @@ else
log_success "✓ All validators active"
fi
if [ "$P2P_MISMATCHES" -gt 0 ]; then
log_error "$P2P_MISMATCHES validator(s) have the wrong p2p-host"
ISSUES=$((ISSUES + 1))
fi
if [ "$PENDING_COUNT" -gt 10 ]; then
log_warn "⚠ High number of pending transactions ($PENDING_COUNT)"
ISSUES=$((ISSUES + 1))
fi
if [ "${GLOBAL_PENDING:-0}" -gt 0 ] && [ "$TX_COUNT_TOTAL" -eq 0 ]; then
log_warn "⚠ Global txpool has $GLOBAL_PENDING transaction(s) but recent blocks are empty"
ISSUES=$((ISSUES + 1))
fi
echo ""
if [ "$ISSUES" -eq 0 ]; then
log_success "Overall Status: HEALTHY"

View File

@@ -10,11 +10,24 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
cd "$PROJECT_ROOT"
[ -f config/ip-addresses.conf ] && source config/ip-addresses.conf 2>/dev/null || true
[ -f scripts/lib/load-project-env.sh ] && source scripts/lib/load-project-env.sh 2>/dev/null || true
PROXMOX_USER="${PROXMOX_USER:-root}"
PROXMOX_SSH_USER="${PROXMOX_SSH_USER:-root}"
R630_01="${PROXMOX_HOST_R630_01:-${PROXMOX_R630_01:-192.168.11.11}}"
R630_02="${PROXMOX_HOST_R630_02:-${PROXMOX_R630_02:-192.168.11.12}}"
ML110="${PROXMOX_HOST_ML110:-${PROXMOX_ML110:-192.168.11.10}}"
# Fallback VMID -> host mapping, used only when the shared get_host_for_vmid
# helper is not loaded: validators/early sentries/core RPC on r630-01,
# public-facing RPC on r630-02, everything else on ML110.
host_for_vmid() {
  local id="$1"
  if type get_host_for_vmid >/dev/null 2>&1; then
    get_host_for_vmid "$id"
    return
  fi
  case "$id" in
    1500|1501|1502|2101) echo "$R630_01" ;;
    2201|2303|2401) echo "$R630_02" ;;
    *)
      if [[ "$id" -le 1002 ]]; then
        echo "$R630_01"
      else
        echo "${PROXMOX_HOST_ML110:-${PROXMOX_ML110:-192.168.11.10}}"
      fi
      ;;
  esac
}
# VMID:host:service:config_name (config path under /etc/besu/)
# Host mapping per docs/04-configuration/verification-evidence/BESU_VMIDS_FROM_PROXMOX_20260208.md
@@ -22,38 +35,38 @@ ML110="${PROXMOX_HOST_ML110:-${PROXMOX_ML110:-192.168.11.10}}"
# Sentries: 1500-1502 r630-01; 1503-1506 ml110
# RPC: 2101 r630-01; 2201,2303,2401 r630-02; 2301,2304-2308,2400,2402,2403 ml110; 2503-2505 r630-01 (HYBX; use besu.service, not besu-rpc — if Besu not installed, service is disabled). 2506-2508 destroyed 2026-02-08.
# Node inventories, formatted VMID:host:service:config_name; hosts are
# resolved through host_for_vmid so a single mapping governs all three lists.
VALIDATORS=()
for vid in 1000 1001 1002 1003 1004; do
  VALIDATORS+=("${vid}:$(host_for_vmid "$vid"):besu-validator:config-validator.toml")
done
SENTRIES=()
for vid in {1500..1506}; do
  SENTRIES+=("${vid}:$(host_for_vmid "$vid"):besu-sentry:config-sentry.toml")
done
# RPC fleet: 2101/2201/2301 carry dedicated config names; the rest share
# config-rpc.toml.
RPC_NODES=()
RPC_NODES+=("2101:$(host_for_vmid 2101):besu-rpc:config-rpc-core.toml")
RPC_NODES+=("2201:$(host_for_vmid 2201):besu-rpc:config-rpc-public.toml")
RPC_NODES+=("2301:$(host_for_vmid 2301):besu-rpc:config-rpc-private.toml")
for vid in 2303 2304 2305 2306 2307 2308 2400 2401 2402 2403 2503 2504 2505; do
  RPC_NODES+=("${vid}:$(host_for_vmid "$vid"):besu-rpc:config-rpc.toml")
done
RED='\033[0;31m'
@@ -72,7 +85,7 @@ APPLY_TXPOOL=false
review_one() {
local vmid="$1" host="$2" service="$3" config_name="$4" is_rpc="${5:-false}"
local ssh_target="${PROXMOX_USER}@${host}"
local ssh_target="${PROXMOX_SSH_USER}@${host}"
local ct_status service_status block_info=""
ct_status=$(ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$ssh_target" "pct status $vmid 2>/dev/null" | awk '{print $2}' || echo "unknown")
@@ -129,10 +142,12 @@ review_one() {
if ! grep -q \"tx-pool-max-future-by-sender\" \"\$CFG\"; then
echo \"\" >> \"\$CFG\"
echo \"# Layered Tx-Pool (tx-pool-min-score not used: unsupported in some Besu builds)\" >> \"\$CFG\"
echo \"tx-pool-max-future-by-sender=200\" >> \"\$CFG\"
echo \"tx-pool-max-future-by-sender=1\" >> \"\$CFG\"
echo \"tx-pool-layer-max-capacity=12500000\" >> \"\$CFG\"
echo \"tx-pool-max-prioritized=2000\" >> \"\$CFG\"
echo \"tx-pool-price-bump=10\" >> \"\$CFG\"
else
sed -i \"s/^tx-pool-max-future-by-sender=.*/tx-pool-max-future-by-sender=1/\" \"\$CFG\"
fi
sed -i \"/^tx-pool-min-score=/d\" \"\$CFG\" 2>/dev/null || true
'" 2>/dev/null && {

View File

@@ -144,7 +144,7 @@ cat >> "$REPORT_FILE" << 'EOF'
| 13c | Verify contracts (Blockscout) | `source smom-dbis-138/.env && ./scripts/verify/run-contract-verification-with-proxy.sh` |
| NPMplus backup | Backup NPMplus | `bash scripts/verify/backup-npmplus.sh` |
| Wave 0 from LAN | NPMplus RPC fix + backup | `bash scripts/run-wave0-from-lan.sh` |
| Validators + block production | Fix validators / tx-pool | `bash scripts/fix-all-validators-and-txpool.sh` then `scripts/monitoring/monitor-blockchain-health.sh` |
| Validators + block production | Fix validators / tx-pool | `bash scripts/fix-all-validators-and-txpool.sh` then `bash scripts/maintenance/apply-chain138-strict-future-tx-pool.sh` then `bash scripts/clear-all-transaction-pools.sh` then `bash scripts/monitoring/monitor-blockchain-health.sh` |
---