Files
proxmox/scripts/verify/onchain_check_cache.py
2026-04-24 10:56:01 -07:00

232 lines
7.5 KiB
Python

"""
On-disk cache for per-(chain, address) `run_check` results to avoid
repeating Etherscan / Sourcify / RPC on every inventory regen.
The cache is intentionally tied to the effective explorer/RPC endpoints used
by the inventory scripts. A prior run against a public Blockscout or a
different RPC should not mask later LAN-backed results for the same
chain/address pair.
"""
from __future__ import annotations
import json
import os
import threading
import time
from pathlib import Path
from typing import Any
# Same directory as other inventory artifacts; safe to commit for team "resume" runs.
def cache_path_for(root: Path) -> Path:
    """Return the shared cache file location under the inventory reports dir."""
    return root.joinpath(
        "reports", "inventory", "contract-inventory-onchain-check-cache.json"
    )
# Bump whenever the cached per-entry schema or key format changes;
# load_check_cache discards any on-disk file whose version differs.
CACHE_VERSION = 2
# Serializes cache-file writes (run_checks_with_cache saves from worker completions).
_write_lock = threading.Lock()
def _runtime_fingerprint() -> dict[str, str]:
"""
Capture the endpoint settings that materially affect explorer/on-chain
verification results.
Keep this small and explicit so cache invalidation is predictable.
"""
return {
"ETHEREUM_MAINNET_RPC": os.environ.get(
"ETHEREUM_MAINNET_RPC", "https://ethereum.publicnode.com"
),
"RPC_URL_138": os.environ.get("RPC_URL_138", "https://rpc-core.d-bis.org"),
"BLOCKSCOUT_138_URL": os.environ.get(
"BLOCKSCOUT_138_URL", "https://explorer.d-bis.org"
).rstrip("/"),
}
def _key(chain: str, address: str) -> str:
a = (address or "").lower().replace("0x", "")
return f"{str(chain).strip()}:{a}" if a else f"{str(chain).strip()}:-"
def _fresh_cache(updated_utc: str | None = None) -> dict[str, Any]:
    """Return an empty cache skeleton stamped with the current runtime fingerprint."""
    return {
        "version": CACHE_VERSION,
        "updated_utc": updated_utc,
        "runtime_fingerprint": _runtime_fingerprint(),
        "keys": {},
    }


def load_check_cache(path: Path) -> dict[str, Any]:
    """
    Load the on-disk cache at *path*, returning an empty skeleton instead when
    the file is missing, unreadable, structurally malformed, from a different
    CACHE_VERSION, or recorded under different endpoint settings — so results
    fetched against other explorers/RPCs never mask fresh ones.

    Returns:
        A dict with "version", "updated_utc", "runtime_fingerprint", "keys".
    """
    if not path.is_file():
        return _fresh_cache()
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, OSError, UnicodeDecodeError):
        # Corrupt or unreadable file: start over rather than crash the run.
        return _fresh_cache()
    if not isinstance(data, dict) or "keys" not in data:
        return _fresh_cache()
    # Version or endpoint-fingerprint mismatch: drop all entries, but carry the
    # old timestamp forward (matches the original behavior).
    if data.get("version") != CACHE_VERSION:
        return _fresh_cache(data.get("updated_utc"))
    if data.get("runtime_fingerprint") != _runtime_fingerprint():
        return _fresh_cache(data.get("updated_utc"))
    return data
def _atomic_write(path: Path, text: str) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
tmp = path.with_name(path.name + ".tmp")
try:
tmp.write_text(text, encoding="utf-8")
os.replace(tmp, path)
finally:
if tmp.is_file():
try:
tmp.unlink()
except OSError:
pass
def save_check_cache(path: Path, data: dict[str, Any]) -> None:
    """
    Persist *data* to *path* as pretty-printed JSON, restamping the version,
    UTC timestamp, and runtime fingerprint. Thread-safe via a module lock.
    """
    snapshot = dict(data)  # shallow copy so the caller's dict is not restamped
    snapshot.update(
        version=CACHE_VERSION,
        updated_utc=time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        runtime_fingerprint=_runtime_fingerprint(),
    )
    payload = json.dumps(snapshot, indent=2) + "\n"
    with _write_lock:
        _atomic_write(path, payload)
def should_refetch_transient(cached: dict[str, Any] | None) -> bool:
    """True if a previous run likely hit rate limits or RPC flakiness and should be retried."""
    if not cached:
        return False
    src_code = str(cached.get("code_detail") or "")
    src_sourcify = str(cached.get("source_sourcify") or "")
    src_blockscout = str(cached.get("source_blockscout") or "")
    sources = (
        str(cached.get("source_etherscan") or ""),
        src_code,
        src_sourcify,
        src_blockscout,
    )
    # Rate-limit fingerprints in any source string.
    for text in sources:
        t = text.lower()
        if (
            "max calls" in t
            or "3/sec" in t
            or ("per sec" in t and "rate" in t)
            or ("rate" in t and "limit" in t)
        ):
            return True
    # RPC never answered the code query.
    if cached.get("code_on_chain") is None and "rpc" in src_code.lower():
        return True
    # Explicit transient HTTP statuses from Sourcify / Blockscout.
    if "http error" in src_sourcify.lower() and "429" in src_sourcify:
        return True
    if "http error" in src_blockscout.lower() and "403" in src_blockscout:
        return True
    return False
def get_cached_for_entry(
    keys_store: dict[str, Any], chain: str, address: str
) -> dict[str, Any] | None:
    """Look up the cached result for (chain, address); None when absent or malformed."""
    entry = keys_store.get(_key(chain, address))
    if isinstance(entry, dict):
        return entry
    return None
def set_cached_for_entry(
    data: dict[str, Any], result: dict[str, Any]
) -> None:
    """
    Store *result* in data["keys"] under its (chain, address) cache key.
    Results missing chain or address are silently dropped.
    """
    chain = str(result.get("chain", ""))
    address = str(result.get("address", ""))
    if not (chain and address):
        return
    store = data.setdefault("keys", {})
    if not isinstance(store, dict):
        store = {}
        data["keys"] = store
    # Underscore-prefixed keys are treated as private/runtime-only fields and
    # are kept out of the persisted cache entry.
    store[_key(chain, address)] = {
        k: v for k, v in result.items() if not str(k).startswith("_")
    }
def run_checks_with_cache(
    to_run: list[dict[str, Any]],
    es_key: str | None,
    run_check: Any,
    cache_path: Path,
    use_cache: bool,
    refresh_transient: bool,
    max_workers: int,
    progress_every: int = 50,
) -> dict[tuple[str, str], dict[str, Any]]:
    """
    Load `run_check` results for each entry in to_run, using on-disk cache on cache hits
    to avoid Etherscan / Sourcify / RPC re-queries.

    Args:
        to_run: entries with at least "chain" and "address" keys; each is passed
            (as a shallow copy) to `run_check`.
        es_key: Etherscan API key forwarded to `run_check`.
        run_check: callable(entry_dict, es_key) -> result dict; the result must
            echo back "chain" and "address" so it can be keyed into the cache.
        cache_path: location of the JSON cache file.
        use_cache: when False, every entry is re-fetched (fresh results are still
            written back to the cache file for later runs).
        refresh_transient: when True, cached entries that look rate-limited/flaky
            (per should_refetch_transient) are re-fetched.
        max_workers: thread pool size for concurrent fetches.
        progress_every: print a progress line every N completed fetches.

    Returns:
        Mapping of (chain, lowercased address) -> result dict for every entry
        in to_run.

    Raises:
        RuntimeError: if an entry has no cached result after the fetch pass
            (internal invariant violation).
    """
    # Imported lazily so importing this module stays cheap for callers that
    # never fetch.
    from concurrent.futures import ThreadPoolExecutor, as_completed
    data: dict[str, Any]
    if use_cache:
        data = load_check_cache(cache_path)
    else:
        # Fresh skeleton: ignore anything on disk, but new results are still
        # persisted below so a later --use-cache run can resume.
        data = {
            "version": CACHE_VERSION,
            "updated_utc": None,
            "runtime_fingerprint": _runtime_fingerprint(),
            "keys": {},
        }
    keys: dict[str, Any] = data.setdefault("keys", {})  # type: ignore[assignment]
    if not isinstance(keys, dict):
        # Hand-edited or corrupted "keys" value: coerce back to a dict.
        data["keys"] = keys = {}
    # Decide which entries actually need a network fetch.
    to_fetch: list[dict[str, Any]] = []
    for e in to_run:
        c = get_cached_for_entry(keys, e["chain"], e["address"])
        if not use_cache:
            to_fetch.append(e)
        elif c is None:
            to_fetch.append(e)
        elif refresh_transient and should_refetch_transient(c):
            to_fetch.append(e)
    n_hits = len(to_run) - len(to_fetch)
    print(
        f" cache: {n_hits} hit(s), {len(to_fetch)} fetch(es) (resume saves Etherscan + other APIs on hits)"
    )
    if to_fetch:
        with ThreadPoolExecutor(max_workers=max_workers) as ex:
            # Each worker gets its own copy of the entry dict.
            futs = {ex.submit(run_check, dict(e), es_key): e for e in to_fetch}
            n = 0
            for f in as_completed(futs):
                r = f.result()
                set_cached_for_entry(data, r)
                n += 1
                # Checkpoint every 5 completions so an interrupted run loses
                # little work.
                if n % 5 == 0 or n == len(futs):
                    save_check_cache(cache_path, data)
                if n % progress_every == 0:
                    print(f" progress {n}/{len(futs)}")
        # Final save covers any tail not caught by the periodic checkpoint.
        save_check_cache(cache_path, data)
    # Assemble the return map purely from the (now fully populated) cache.
    by_addr: dict[tuple[str, str], dict[str, Any]] = {}
    for e in to_run:
        c = get_cached_for_entry(keys, e["chain"], e["address"])
        if c is None:
            raise RuntimeError(
                f"Internal error: no cache after run for {e.get('chain')} {e.get('address')}"
            )
        by_addr[(e["chain"], e["address"].lower())] = c
    if to_run and not to_fetch:
        print(f" (all {len(to_run)} pairs served from cache; 0 fetches this run)")
    return by_addr