ops: destroy duplicate besu cts and add cluster inventory audit

This commit is contained in:
defiQUG
2026-04-24 11:52:46 -07:00
parent 91ba6f4f2f
commit a1eacd3098
5 changed files with 296 additions and 29 deletions

View File

@@ -149,18 +149,18 @@ These are live Besu RPC containers and should not be confused with the older dec
| 2504 | 192.168.11.247 | besu-rpc-hybx-2 | ✅ Running | Besu: 8545/8546, P2P: 30303, Metrics: 9545 | Internal HYBX RPC 2 |
| 2505 | 192.168.11.248 | besu-rpc-hybx-3 | ✅ Running | Besu: 8545/8546, P2P: 30303, Metrics: 9545 | Internal HYBX RPC 3 |
### Retired Legacy Duplicate ALLTRA / HYBX RPC Containers (Not Canonical Fleet)
### Destroyed Legacy Duplicate ALLTRA / HYBX RPC Containers (Not Canonical Fleet)
These were found live on `r630-01` during the same SSH pass, but they do not exist in `config/proxmox-operational-template.json`. A controlled shutdown pass then retired them by disabling `besu-rpc`, setting `onboot: 0`, and stopping the CTs. Use the `2500-2505` rows above as the canonical intended fleet.
These were found live on `r630-01` during the same SSH pass, but they do not exist in `config/proxmox-operational-template.json`. They were first retired (service disabled, `onboot: 0` set, CT stopped), then permanently destroyed on `2026-04-24`; the sequence is sketched after the table. Use the `2500-2505` rows above as the canonical intended fleet.
| VMID | IP Address | Hostname | Status | Endpoints | Purpose |
|------|------------|----------|--------|-----------|---------|
| 2420 | 192.168.11.172 | besu-rpc-alltra-1 | ⏹ Retired | CT stopped, `onboot: 0` | Legacy duplicate of canonical VMID 2500 |
| 2430 | 192.168.11.173 | besu-rpc-alltra-2 | ⏹ Retired | CT stopped, `onboot: 0` | Legacy duplicate of canonical VMID 2501 |
| 2440 | 192.168.11.174 | besu-rpc-alltra-3 | ⏹ Retired | CT stopped, `onboot: 0` | Legacy duplicate of canonical VMID 2502 |
| 2460 | 192.168.11.246 | besu-rpc-hybx-1 | ⏹ Retired | CT stopped, `onboot: 0` | Legacy duplicate of canonical VMID 2503 |
| 2470 | 192.168.11.247 | besu-rpc-hybx-2 | ⏹ Retired | CT stopped, `onboot: 0` | Legacy duplicate of canonical VMID 2504 |
| 2480 | 192.168.11.248 | besu-rpc-hybx-3 | ⏹ Retired | CT stopped, `onboot: 0` | Legacy duplicate of canonical VMID 2505 |
| 2420 | 192.168.11.172 | besu-rpc-alltra-1 | 🗑 Destroyed | `pct destroy 2420 --purge 1` completed | Legacy duplicate of canonical VMID 2500 |
| 2430 | 192.168.11.173 | besu-rpc-alltra-2 | 🗑 Destroyed | `pct destroy 2430 --purge 1` completed | Legacy duplicate of canonical VMID 2501 |
| 2440 | 192.168.11.174 | besu-rpc-alltra-3 | 🗑 Destroyed | `pct destroy 2440 --purge 1` completed | Legacy duplicate of canonical VMID 2502 |
| 2460 | 192.168.11.246 | besu-rpc-hybx-1 | 🗑 Destroyed | `pct destroy 2460 --purge 1` completed | Legacy duplicate of canonical VMID 2503 |
| 2470 | 192.168.11.247 | besu-rpc-hybx-2 | 🗑 Destroyed | `pct destroy 2470 --purge 1` completed | Legacy duplicate of canonical VMID 2504 |
| 2480 | 192.168.11.248 | besu-rpc-hybx-3 | 🗑 Destroyed | `pct destroy 2480 --purge 1` completed | Legacy duplicate of canonical VMID 2505 |
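For traceability, a minimal sketch of the retire-then-destroy sequence described above, run as root on `r630-01` (the `besu-rpc` service name comes from the retirement notes; treating it as a systemd unit inside each CT is an assumption):

```bash
# Retire, then permanently destroy, the six legacy duplicate CTs.
for vmid in 2420 2430 2440 2460 2470 2480; do
  pct exec "$vmid" -- systemctl disable --now besu-rpc  # stop the RPC service in the CT (systemd assumed)
  pct set "$vmid" --onboot 0                            # never autostart again
  pct stop "$vmid"                                      # controlled shutdown = "retired"
  pct destroy "$vmid" --purge 1                         # remove CT config and related job references
done
```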
---

View File

@@ -123,20 +123,20 @@ These are live Besu RPC containers on `r630-01` and should not be confused with
| 2504 | 192.168.11.247 | besu-rpc-hybx-2 | ✅ Running | Internal HYBX RPC |
| 2505 | 192.168.11.248 | besu-rpc-hybx-3 | ✅ Running | Internal HYBX RPC |
### Retired Legacy Duplicate RPC Containers On `r630-01`
### Destroyed Legacy Duplicate RPC Containers On `r630-01`
These CTs were found live during the 2026-04-24 SSH reconciliation, but they are not part of the intended fleet in `config/proxmox-operational-template.json`. A controlled shutdown pass then retired them by disabling `besu-rpc`, setting `onboot: 0`, and stopping all six containers.
These CTs were found live during the 2026-04-24 SSH reconciliation, but they are not part of the intended fleet in `config/proxmox-operational-template.json`. After the controlled shutdown pass, they were permanently destroyed with `pct destroy <vmid> --purge 1`.
| VMID | IP Address | Hostname | Final state | Canonical counterpart | Notes |
|------|------------|----------|-------------|-----------------------|-------|
| 2420 | 192.168.11.172 | besu-rpc-alltra-1 | stopped, `onboot: 0` | 2500 | `2500` fully resynced after duplicate retirement |
| 2430 | 192.168.11.173 | besu-rpc-alltra-2 | stopped, `onboot: 0` | 2501 | Retired cleanly |
| 2440 | 192.168.11.174 | besu-rpc-alltra-3 | stopped, `onboot: 0` | 2502 | Retired cleanly |
| 2460 | 192.168.11.246 | besu-rpc-hybx-1 | stopped, `onboot: 0` | 2503 | Retired cleanly |
| 2470 | 192.168.11.247 | besu-rpc-hybx-2 | stopped, `onboot: 0` | 2504 | Retired cleanly |
| 2480 | 192.168.11.248 | besu-rpc-hybx-3 | stopped, `onboot: 0` | 2505 | Retired cleanly |
| 2420 | 192.168.11.172 | besu-rpc-alltra-1 | destroyed | 2500 | `2500` fully resynced after duplicate retirement, then the duplicate CT was removed |
| 2430 | 192.168.11.173 | besu-rpc-alltra-2 | destroyed | 2501 | Destroyed cleanly |
| 2440 | 192.168.11.174 | besu-rpc-alltra-3 | destroyed | 2502 | Destroyed cleanly |
| 2460 | 192.168.11.246 | besu-rpc-hybx-1 | destroyed | 2503 | Destroyed cleanly |
| 2470 | 192.168.11.247 | besu-rpc-hybx-2 | destroyed | 2504 | Destroyed cleanly |
| 2480 | 192.168.11.248 | besu-rpc-hybx-3 | destroyed | 2505 | Destroyed cleanly |
These duplicate VMIDs should be treated as retired legacy residue. They should not be used as the canonical fleet inventory for new automation or runbooks.
These duplicate VMIDs should be treated as historical legacy residue only. They should not be used as the canonical fleet inventory for new automation or runbooks.
**Fixed permanent assignment:** VMID 2201 = 192.168.11.221 (`besu-rpc-public-1`). Do not change. Source: `config/ip-addresses.conf`.

View File

@@ -29,7 +29,7 @@ Use these in this order when they disagree:
- Live SSH also found a second, non-template legacy duplicate set on `r630-01`:
  - `2420`, `2430`, `2440`, `2460`, `2470`, `2480`
  - those nodes use the same role names and LAN IPs as the intended `2500-2505` ALLTRA/HYBX tier
  - a controlled shutdown pass on `2026-04-24` stopped them and set `onboot: 0`, so they are now documented as retired duplicates rather than canonical fleet members
  - a controlled shutdown pass retired them, and a follow-up destroy pass on `2026-04-24` removed them permanently
- The second strongest conflict is config policy drift:
  - class templates often use `p2p-host="0.0.0.0"`
  - generated per-node TOMLs use explicit LAN IPs (a quick check is sketched below)
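To make the drift concrete, a hedged illustration (the file paths are placeholders; the key name and the two value styles are from the bullets above, and `192.168.11.172` is `besu-rpc-alltra-1`'s canonical IP):

```bash
# Hypothetical spot-check of p2p-host drift; adjust paths to the repo's actual layout.
grep -H '^p2p-host' config/besu/class-template.toml config/besu/nodes/besu-rpc-alltra-1.toml
# expected shape of the output:
#   config/besu/class-template.toml:p2p-host="0.0.0.0"                 <- generic bind-all
#   config/besu/nodes/besu-rpc-alltra-1.toml:p2p-host="192.168.11.172" <- explicit LAN IP
```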
@@ -179,22 +179,22 @@ flowchart LR
| 2504 | `192.168.11.247` | `besu-rpc-hybx-2` | Yes | Yes | Yes | Yes | Canonical intended HYBX RPC 2 |
| 2505 | `192.168.11.248` | `besu-rpc-hybx-3` | Yes | Yes | Yes | Yes | Canonical intended HYBX RPC 3 |
### Retired Legacy Duplicate RPC Containers
### Destroyed Legacy Duplicate RPC Containers
These containers were found live on `r630-01` during the 2026-04-24 SSH pass, but they are **not** present in `config/proxmox-operational-template.json`. A controlled shutdown pass on `2026-04-24` disabled `besu-rpc`, set `onboot: 0`, and stopped all six CTs so the canonical `2500-2505` fleet now owns those LAN IPs cleanly.
These containers were found live on `r630-01` during the 2026-04-24 SSH pass, but they are **not** present in `config/proxmox-operational-template.json`. A controlled shutdown pass first retired them, then a destroy pass removed all six CTs permanently so the canonical `2500-2505` fleet owns those LAN IPs cleanly.
| VMID | IP | Hostname | Final duplicate state | Canonical counterpart | Notes |
|---|---|---|---|---|---|
| 2420 | `192.168.11.172` | `besu-rpc-alltra-1` | stopped, `onboot: 0` | `2500` | This pair caused the only lagging canonical lane; `2500` fully resynced after the duplicate was retired |
| 2430 | `192.168.11.173` | `besu-rpc-alltra-2` | stopped, `onboot: 0` | `2501` | Duplicate retired cleanly |
| 2440 | `192.168.11.174` | `besu-rpc-alltra-3` | stopped, `onboot: 0` | `2502` | Duplicate retired cleanly |
| 2460 | `192.168.11.246` | `besu-rpc-hybx-1` | stopped, `onboot: 0` | `2503` | Duplicate retired cleanly |
| 2470 | `192.168.11.247` | `besu-rpc-hybx-2` | stopped, `onboot: 0` | `2504` | Duplicate retired cleanly |
| 2480 | `192.168.11.248` | `besu-rpc-hybx-3` | stopped, `onboot: 0` | `2505` | Duplicate retired cleanly |
| 2420 | `192.168.11.172` | `besu-rpc-alltra-1` | destroyed | `2500` | This pair caused the only lagging canonical lane; `2500` fully resynced before the duplicate CT was destroyed |
| 2430 | `192.168.11.173` | `besu-rpc-alltra-2` | destroyed | `2501` | Duplicate destroyed cleanly |
| 2440 | `192.168.11.174` | `besu-rpc-alltra-3` | destroyed | `2502` | Duplicate destroyed cleanly |
| 2460 | `192.168.11.246` | `besu-rpc-hybx-1` | destroyed | `2503` | Duplicate destroyed cleanly |
| 2470 | `192.168.11.247` | `besu-rpc-hybx-2` | destroyed | `2504` | Duplicate destroyed cleanly |
| 2480 | `192.168.11.248` | `besu-rpc-hybx-3` | destroyed | `2505` | Duplicate destroyed cleanly |
Reconciliation decision for this doc:
- `2500-2505` are the canonical intended ALLTRA/HYBX fleet because they are in the operational template, generated per-node TOMLs, and the Besu allowlists.
- `2420/2430/2440/2460/2470/2480` are retired legacy duplicates, not intended fleet members.
- `2420/2430/2440/2460/2470/2480` are destroyed legacy duplicates, not intended fleet members; a quick absence check is sketched below.
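A sketch of that absence check, reusing the same `pvesh` resource listing the new `check-cluster-besu-inventory.sh` audit is built on (cluster host as in that script's default):

```bash
# Expect no output: none of the destroyed duplicate VMIDs should appear in cluster resources.
ssh root@192.168.11.11 'pvesh get /cluster/resources --output-format json' \
  | jq -r '.[] | select(.vmid != null)
           | select([2420,2430,2440,2460,2470,2480] | index(.vmid))
           | "\(.vmid) still present on \(.node)"'
```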
## Configuration Pattern Map
@@ -275,7 +275,7 @@ Impact:
- `2301` is live on `r630-03`
But:
- live SSH found a second duplicate set `2420/2430/2440/2460/2470/2480` using the same role names and IPs, and a controlled shutdown pass then retired them
- live SSH found a second duplicate set `2420/2430/2440/2460/2470/2480` using the same role names and IPs, and later retirement/destroy passes removed them
- `RPC_ENDPOINTS_MASTER.md` historically mixed old migration rows with current fleet rows
- the initial 3-host pass missed `r630-03` and `r630-04`, which made live nodes appear unresolved until cluster-wide reconciliation
@@ -371,6 +371,6 @@ Impact:
## Highest-Priority Fixes
1. Verify the newly-added `2103` enode is deployed to all Besu nodes (a spot-check is sketched after this list).
2. Keep `2420/2430/2440/2460/2470/2480` documented as retired legacy duplicates unless they are later destroyed and removed from host inventory.
2. Keep `2420/2430/2440/2460/2470/2480` documented as destroyed historical duplicates so future audits understand why those VMIDs disappeared.
3. Update `ALL_VMIDS_ENDPOINTS.md` so it no longer conflicts with the operational template.
4. Normalize `p2p-host` handling so generic templates do not rely on a hidden rewrite step.
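For fix 1, a hedged spot-check (the enode public key and the allowlist path are placeholders; `192.168.11.217` is VMID `2103`'s canonical IP, and the ALLTRA/HYBX RPC IPs come from the fleet table above):

```bash
# Hypothetical check that the 2103 enode is present in each node's allowlist file.
ENODE_2103='enode://<2103-pubkey>@192.168.11.217:30303'   # placeholder pubkey
for ip in 192.168.11.172 192.168.11.173 192.168.11.174 192.168.11.246 192.168.11.247 192.168.11.248; do
  ssh "root@$ip" "grep -qF '$ENODE_2103' /etc/besu/permissions_config.toml" \
    && echo "ok      $ip" || echo "MISSING $ip"
done
```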

View File

@@ -37,6 +37,7 @@ One-line install (Debian/Ubuntu): `sudo apt install -y sshpass rsync dnsutils ip
- `reconcile-env-canonical.sh` - Emit recommended .env lines for Chain 138 (canonical source of truth); use to reconcile `smom-dbis-138/.env` with [CONTRACT_ADDRESSES_REFERENCE](../../docs/11-references/CONTRACT_ADDRESSES_REFERENCE.md). Usage: `./scripts/verify/reconcile-env-canonical.sh [--print]`
- `check-deployer-balance-blockscout-vs-rpc.sh` - Compare deployer native balance from Blockscout API vs RPC (to verify index matches current chain); see [EXPLORER_AND_BLOCKSCAN_REFERENCE](../../docs/11-references/EXPLORER_AND_BLOCKSCAN_REFERENCE.md)
- `check-dependencies.sh` - Verify required tools (bash, curl, jq, openssl, ssh)
- `check-cluster-besu-inventory.sh` - Cluster-wide Besu inventory audit using `pvesh get /cluster/resources` via a Proxmox cluster node so host placement on `r630-03` / `r630-04` is not missed. Prints VMID, type, node, status, name, IP, canonical-vs-extra classification, and any missing canonical VMIDs. Use `--json` for machine-readable output; see the example after this list.
- `check-pnpm-workspace-lockfile.sh` - Ensures every path in `pnpm-workspace.yaml` has an `importer` in `pnpm-lock.yaml` (run `pnpm install` at root if it fails; avoids broken `pnpm outdated -r`)
- `export-cloudflare-dns-records.sh` - Export Cloudflare DNS records
- `export-npmplus-config.sh` - Export NPMplus proxy hosts and certificates via API
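For example, a machine-readable pass over the new inventory audit (a sketch; the JSON keys match the script's output):

```bash
# Print any canonical Besu VMIDs missing from the cluster as "vmid expected_ip" pairs.
./scripts/verify/check-cluster-besu-inventory.sh --json \
  | jq -r '.missing_canonical_vmids[] | "\(.vmid) \(.expected_ip)"'
```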

View File

@@ -0,0 +1,266 @@
#!/usr/bin/env bash
# Cluster-wide Besu inventory audit.
# Uses Proxmox cluster resources so nodes hosted on r630-03 / r630-04 are not missed.
#
# Usage:
#   bash scripts/verify/check-cluster-besu-inventory.sh
#   PVE_CLUSTER_API_HOST=192.168.11.11 bash scripts/verify/check-cluster-besu-inventory.sh
#   bash scripts/verify/check-cluster-besu-inventory.sh --json
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
CLUSTER_HOST="${PVE_CLUSTER_API_HOST:-192.168.11.11}"
JSON_ONLY=false
if [[ "${1:-}" == "--json" ]]; then
JSON_ONLY=true
fi
need_cmd() {
  command -v "$1" >/dev/null 2>&1 || {
    echo "missing required command: $1" >&2
    exit 1
  }
}
need_cmd ssh
need_cmd jq
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
NC='\033[0m'
section() {
  $JSON_ONLY || echo -e "\n${CYAN}━━━ $1 ━━━${NC}"
}
info() {
  $JSON_ONLY || echo "$1"
}
# Canonical intended fleet of VMIDs to audit against.
canonical_vmids=(
  1000 1001 1002 1003 1004
  1500 1501 1502 1503 1504 1505 1506 1507 1508
  2101 2102 2103 2201 2301 2303 2304 2305 2306 2307 2308
  2400 2401 2402 2403
  2500 2501 2502 2503 2504 2505
)
declare -A canonical_set=()
for vmid in "${canonical_vmids[@]}"; do
  canonical_set["$vmid"]=1
done
# Expected LAN IP for each canonical VMID.
declare -A expected_ip=(
  [1000]=192.168.11.100
  [1001]=192.168.11.101
  [1002]=192.168.11.102
  [1003]=192.168.11.103
  [1004]=192.168.11.104
  [1500]=192.168.11.150
  [1501]=192.168.11.151
  [1502]=192.168.11.152
  [1503]=192.168.11.153
  [1504]=192.168.11.154
  [1505]=192.168.11.213
  [1506]=192.168.11.214
  [1507]=192.168.11.244
  [1508]=192.168.11.245
  [2101]=192.168.11.211
  [2102]=192.168.11.212
  [2103]=192.168.11.217
  [2201]=192.168.11.221
  [2301]=192.168.11.232
  [2303]=192.168.11.233
  [2304]=192.168.11.234
  [2305]=192.168.11.235
  [2306]=192.168.11.236
  [2307]=192.168.11.237
  [2308]=192.168.11.238
  [2400]=192.168.11.240
  [2401]=192.168.11.241
  [2402]=192.168.11.242
  [2403]=192.168.11.243
  [2500]=192.168.11.172
  [2501]=192.168.11.173
  [2502]=192.168.11.174
  [2503]=192.168.11.246
  [2504]=192.168.11.247
  [2505]=192.168.11.248
)
# Pull node list and cluster-wide resources from one cluster member.
cluster_nodes_json="$(ssh -o BatchMode=yes -o ConnectTimeout=10 "root@${CLUSTER_HOST}" \
  'pvesh get /nodes --output-format json')"
resources_json="$(ssh -o BatchMode=yes -o ConnectTimeout=10 "root@${CLUSTER_HOST}" \
  'pvesh get /cluster/resources --output-format json')"
# Read net0 from every CT/VM config so static IPs resolve without guest agents.
config_rows_text="$(ssh -o BatchMode=yes -o ConnectTimeout=10 "root@${CLUSTER_HOST}" '
  shopt -s nullglob
  for f in /etc/pve/nodes/*/lxc/*.conf /etc/pve/nodes/*/qemu-server/*.conf; do
    [ -f "$f" ] || continue
    node="$(basename "$(dirname "$(dirname "$f")")")"
    kind="$(basename "$(dirname "$f")")"
    vmid="$(basename "$f" .conf)"
    net0="$(grep -m1 "^net0:" "$f" || true)"
    printf "%s|%s|%s|%s\n" "$kind" "$node" "$vmid" "$net0"
  done
')"
besu_rows_json="$(
jq -c '
[
.[]
| select((.type == "lxc" or .type == "qemu"))
| {
vmid: (.vmid | tostring),
type,
node,
status,
name
}
]
| sort_by(.vmid | tonumber)
' <<<"$resources_json"
)"
declare -A found_vmids=()
declare -A live_ip_to_vmid=()
declare -A config_ip=()
declare -a table_rows=()
declare -a unexpected_rows=()
# Map "kind:node:vmid" -> static IP parsed from each config's net0 line.
while IFS='|' read -r kind node vmid net0; do
  [[ -z "$vmid" ]] && continue
  ip=""
  if [[ "$net0" =~ ip=([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)/ ]]; then
    ip="${BASH_REMATCH[1]}"
  fi
  config_ip["${kind}:${node}:${vmid}"]="$ip"
done <<< "$config_rows_text"
while IFS= read -r row; do
  [[ -z "$row" ]] && continue
  vmid="$(jq -r '.vmid' <<<"$row")"
  type="$(jq -r '.type' <<<"$row")"
  node="$(jq -r '.node' <<<"$row")"
  status="$(jq -r '.status' <<<"$row")"
  name="$(jq -r '.name' <<<"$row")"
  ip="${config_ip[${type}:${node}:${vmid}]:-}"
  # Skip resources that are neither canonical VMIDs nor Besu-named workloads.
  if [[ -z "${canonical_set[$vmid]:-}" && ! "$name" =~ besu|thirdweb-rpc ]]; then
    continue
  fi
  canonical="no"
  note="extra/non-canonical"
  if [[ -n "${canonical_set[$vmid]:-}" ]]; then
    canonical="yes"
    note="canonical"
    found_vmids["$vmid"]=1
    if [[ -n "$ip" ]]; then
      live_ip_to_vmid["$ip"]="$vmid"
      if [[ -n "${expected_ip[$vmid]:-}" && "${expected_ip[$vmid]}" != "$ip" ]]; then
        note="canonical, ip-mismatch expected=${expected_ip[$vmid]}"
      fi
    else
      note="canonical, ip-unresolved"
    fi
  fi
  row_json="$(jq -cn \
    --arg vmid "$vmid" \
    --arg type "$type" \
    --arg node "$node" \
    --arg status "$status" \
    --arg name "$name" \
    --arg ip "$ip" \
    --arg canonical "$canonical" \
    --arg note "$note" \
    '{vmid:$vmid,type:$type,node:$node,status:$status,name:$name,ip:$ip,canonical:$canonical,note:$note}')"
  table_rows+=("$row_json")
  if [[ "$canonical" == "no" ]]; then
    unexpected_rows+=("$row_json")
  fi
done < <(jq -c '.[]' <<<"$besu_rows_json")
missing_items=()
for vmid in "${canonical_vmids[@]}"; do
  if [[ -z "${found_vmids[$vmid]:-}" ]]; then
    expected="${expected_ip[$vmid]:-}"
    missing_items+=("$(jq -cn --arg vmid "$vmid" --arg expected_ip "$expected" '{vmid:$vmid, expected_ip:$expected_ip}')")
  fi
done
rows_json="$(printf '%s\n' "${table_rows[@]:-}" | jq -s '.')"
unexpected_json="$(printf '%s\n' "${unexpected_rows[@]:-}" | jq -s '.')"
missing_json="$(printf '%s\n' "${missing_items[@]:-}" | jq -s '.')"
nodes_summary_json="$(
  jq -c '[.[] | {node, status}] | sort_by(.node)' <<<"$cluster_nodes_json"
)"
final_json="$(
jq -cn \
--arg cluster_host "$CLUSTER_HOST" \
--argjson nodes "$nodes_summary_json" \
--argjson rows "$rows_json" \
--argjson missing "$missing_json" \
--argjson unexpected "$unexpected_json" \
'{
cluster_host: $cluster_host,
nodes: $nodes,
besu_resources: $rows,
missing_canonical_vmids: $missing,
unexpected_besu_resources: $unexpected
}'
)"
if $JSON_ONLY; then
  echo "$final_json"
  exit 0
fi
section "Cluster Nodes"
jq -r '.nodes[] | " - \(.node): \(.status)"' <<<"$final_json"
section "Besu Resources"
printf '%-6s %-4s %-8s %-9s %-30s %-15s %-10s %s\n' "VMID" "TYPE" "NODE" "STATUS" "NAME" "IP" "CANON" "NOTE"
printf '%-6s %-4s %-8s %-9s %-30s %-15s %-10s %s\n' "------" "----" "--------" "---------" "------------------------------" "---------------" "----------" "------------------------------"
while IFS= read -r row; do
  vmid="$(jq -r '.vmid' <<<"$row")"
  type="$(jq -r '.type' <<<"$row")"
  node="$(jq -r '.node' <<<"$row")"
  status="$(jq -r '.status' <<<"$row")"
  name="$(jq -r '.name' <<<"$row")"
  ip="$(jq -r '.ip // "-"' <<<"$row")"
  canonical="$(jq -r '.canonical' <<<"$row")"
  note="$(jq -r '.note' <<<"$row")"
  printf '%-6s %-4s %-8s %-9s %-30s %-15s %-10s %s\n' \
    "$vmid" "$type" "$node" "$status" "$name" "${ip:--}" "$canonical" "$note"
done < <(jq -c '.besu_resources[]' <<<"$final_json")
section "Missing Canonical VMIDs"
if [[ "$(jq 'length' <<<"$missing_json")" -eq 0 ]]; then
echo -e "${GREEN}[✓]${NC} All canonical Besu VMIDs are present in cluster resources."
else
while IFS= read -r row; do
vmid="$(jq -r '.vmid' <<<"$row")"
ip="$(jq -r '.expected_ip' <<<"$row")"
echo -e "${YELLOW}[⚠]${NC} Missing canonical VMID ${vmid} (expected IP ${ip})"
done < <(jq -c '.[]' <<<"$missing_json")
fi
section "Unexpected / Extra Besu Resources"
if [[ "${#unexpected_rows[@]}" -eq 0 ]]; then
echo -e "${GREEN}[✓]${NC} No extra Besu resources found outside the canonical VMID set."
else
while IFS= read -r row; do
echo -e "${YELLOW}[⚠]${NC} Extra Besu resource VMID $(jq -r '.vmid' <<<"$row") on $(jq -r '.node' <<<"$row") ($(jq -r '.name' <<<"$row"))"
done < <(jq -c '.unexpected_besu_resources[]' <<<"$final_json")
fi
exit 0