"""
|
|
On-disk cache for per-(chain, address) `run_check` results to avoid
|
|
repeating Etherscan / Sourcify / RPC on every inventory regen.
|
|
|
|
The cache is intentionally tied to the effective explorer/RPC endpoints used
|
|
by the inventory scripts. A prior run against a public Blockscout or a
|
|
different RPC should not mask later LAN-backed results for the same
|
|
chain/address pair.
|
|
"""

from __future__ import annotations

import json
import os
import threading
import time
from pathlib import Path
from typing import Any


# Same directory as other inventory artifacts; safe to commit for team "resume" runs.
def cache_path_for(root: Path) -> Path:
    return root / "reports" / "inventory" / "contract-inventory-onchain-check-cache.json"


CACHE_VERSION = 2
_write_lock = threading.Lock()


def _runtime_fingerprint() -> dict[str, str]:
    """
    Capture the endpoint settings that materially affect explorer/on-chain
    verification results.

    Keep this small and explicit so cache invalidation is predictable.
    """
    return {
        "ETHEREUM_MAINNET_RPC": os.environ.get(
            "ETHEREUM_MAINNET_RPC", "https://ethereum.publicnode.com"
        ),
        "RPC_URL_138": os.environ.get("RPC_URL_138", "https://rpc-core.d-bis.org"),
        "BLOCKSCOUT_138_URL": os.environ.get(
            "BLOCKSCOUT_138_URL", "https://explorer.d-bis.org"
        ).rstrip("/"),
    }


def _key(chain: str, address: str) -> str:
    # Normalize to lowercase hex without the "0x" prefix; "-" marks a missing address.
    a = (address or "").lower().removeprefix("0x")
    return f"{str(chain).strip()}:{a}" if a else f"{str(chain).strip()}:-"


def load_check_cache(path: Path) -> dict[str, Any]:
    def fresh(updated_utc: str | None = None) -> dict[str, Any]:
        # Empty cache skeleton pinned to the current version and endpoints.
        return {
            "version": CACHE_VERSION,
            "updated_utc": updated_utc,
            "runtime_fingerprint": _runtime_fingerprint(),
            "keys": {},
        }

    if not path.is_file():
        return fresh()
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, OSError, UnicodeDecodeError):
        return fresh()
    if not isinstance(data, dict) or "keys" not in data:
        return fresh()
    # A version bump or an endpoint change discards all cached results,
    # keeping only the last-updated timestamp for reference.
    if data.get("version") != CACHE_VERSION:
        return fresh(data.get("updated_utc"))
    if data.get("runtime_fingerprint") != _runtime_fingerprint():
        return fresh(data.get("updated_utc"))
    return data
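
# Invalidation summary: a missing/corrupt file, a version bump, or a changed
# endpoint fingerprint all start from an empty "keys" map, forcing re-checks.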


def _atomic_write(path: Path, text: str) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_name(path.name + ".tmp")
    try:
        tmp.write_text(text, encoding="utf-8")
        # Atomic on POSIX and Windows: readers see either the old or the new file.
        os.replace(tmp, path)
    finally:
        # os.replace consumes the temp file on success, so this only cleans up
        # after a failed write/replace.
        if tmp.is_file():
            try:
                tmp.unlink()
            except OSError:
                pass


def save_check_cache(path: Path, data: dict[str, Any]) -> None:
    data = dict(data)  # shallow copy; the caller's dict is not mutated
    data["version"] = CACHE_VERSION
    data["updated_utc"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    data["runtime_fingerprint"] = _runtime_fingerprint()
    with _write_lock:
        _atomic_write(path, json.dumps(data, indent=2) + "\n")
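
# On-disk shape after a save (values illustrative):
# {
#   "version": 2,
#   "updated_utc": "2024-01-01T00:00:00Z",
#   "runtime_fingerprint": {"ETHEREUM_MAINNET_RPC": "...", ...},
#   "keys": {"1:abcd12": { ...per-entry run_check result... }}
# }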


def should_refetch_transient(cached: dict[str, Any] | None) -> bool:
    """True if a previous run likely hit rate limits or RPC flakiness and should be retried."""
    if not cached:
        return False
    s_es = str(cached.get("source_etherscan") or "")
    s_code = str(cached.get("code_detail") or "")
    s_sfy = str(cached.get("source_sourcify") or "")
    s_bs = str(cached.get("source_blockscout") or "")
    # Rate-limit phrasings seen in Etherscan-style error messages.
    for s in (s_es, s_code, s_sfy, s_bs):
        low = s.lower()
        if "max calls" in low or "3/sec" in low:
            return True
        if "per sec" in low and "rate" in low:
            return True
        if "rate" in low and "limit" in low:
            return True
    # RPC flakiness: the code check never produced a result.
    if cached.get("code_on_chain") is None and "rpc" in s_code.lower():
        return True
    # Transient HTTP statuses: Sourcify 429, Blockscout 403.
    if "http error" in s_sfy.lower() and "429" in s_sfy:
        return True
    if "http error" in s_bs.lower() and "403" in s_bs:
        return True
    return False
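
# Illustrative checks (hypothetical cached records):
#   should_refetch_transient({"source_sourcify": "http error 429"})  -> True  (retry)
#   should_refetch_transient({"source_sourcify": "full match"})      -> False (keep)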


def get_cached_for_entry(
    keys_store: dict[str, Any], chain: str, address: str
) -> dict[str, Any] | None:
    k = _key(chain, address)
    ent = keys_store.get(k)
    return ent if isinstance(ent, dict) else None


def set_cached_for_entry(data: dict[str, Any], result: dict[str, Any]) -> None:
    ch = str(result.get("chain", ""))
    addr = str(result.get("address", ""))
    if not ch or not addr:
        return
    k = _key(ch, addr)
    st = data.setdefault("keys", {})
    if not isinstance(st, dict):
        data["keys"] = st = {}
    # Persist everything except private "_"-prefixed working fields.
    st[k] = {kk: result[kk] for kk in result if not str(kk).startswith("_")}


def run_checks_with_cache(
    to_run: list[dict[str, Any]],
    es_key: str | None,
    run_check: Any,
    cache_path: Path,
    use_cache: bool,
    refresh_transient: bool,
    max_workers: int,
    progress_every: int = 50,
) -> dict[tuple[str, str], dict[str, Any]]:
    """
    Run `run_check` for each entry in `to_run`, serving previously seen
    (chain, address) pairs from the on-disk cache to avoid Etherscan /
    Sourcify / RPC re-queries.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed

    data: dict[str, Any]
    if use_cache:
        data = load_check_cache(cache_path)
    else:
        # Cache reads disabled: start empty; fresh results are still saved below.
        data = {
            "version": CACHE_VERSION,
            "updated_utc": None,
            "runtime_fingerprint": _runtime_fingerprint(),
            "keys": {},
        }
    keys: dict[str, Any] = data.setdefault("keys", {})  # type: ignore[assignment]
    if not isinstance(keys, dict):
        data["keys"] = keys = {}

    to_fetch: list[dict[str, Any]] = []
    for e in to_run:
        c = get_cached_for_entry(keys, e["chain"], e["address"])
        if not use_cache:
            to_fetch.append(e)
        elif c is None:
            to_fetch.append(e)
        elif refresh_transient and should_refetch_transient(c):
            to_fetch.append(e)

    n_hits = len(to_run) - len(to_fetch)
    print(
        f" cache: {n_hits} hit(s), {len(to_fetch)} fetch(es)"
        " (resume saves Etherscan + other APIs on hits)"
    )

    if to_fetch:
        with ThreadPoolExecutor(max_workers=max_workers) as ex:
            futs = {ex.submit(run_check, dict(e), es_key): e for e in to_fetch}
            n = 0
            for f in as_completed(futs):
                r = f.result()
                set_cached_for_entry(data, r)
                n += 1
                # Checkpoint every 5 results so an interrupted run can resume.
                if n % 5 == 0 or n == len(futs):
                    save_check_cache(cache_path, data)
                if n % progress_every == 0:
                    print(f" progress {n}/{len(futs)}")
        # Final flush once all workers have finished.
        save_check_cache(cache_path, data)

    by_addr: dict[tuple[str, str], dict[str, Any]] = {}
    for e in to_run:
        c = get_cached_for_entry(keys, e["chain"], e["address"])
        if c is None:
            raise RuntimeError(
                f"Internal error: no cache after run for {e.get('chain')} {e.get('address')}"
            )
        by_addr[(e["chain"], e["address"].lower())] = c
    if to_run and not to_fetch:
        print(f" (all {len(to_run)} pairs served from cache; 0 fetches this run)")
    return by_addr
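

# Minimal usage sketch (illustrative only; `fake_check` and the entry below are
# hypothetical stand-ins for the real inventory `run_check` and its inputs):
#
#   def fake_check(entry: dict[str, Any], es_key: str | None) -> dict[str, Any]:
#       return {**entry, "code_on_chain": True, "source_etherscan": "ok"}
#
#   results = run_checks_with_cache(
#       to_run=[{"chain": "1", "address": "0xAbCd12"}],
#       es_key=None,
#       run_check=fake_check,
#       cache_path=cache_path_for(Path(".")),
#       use_cache=True,
#       refresh_transient=False,
#       max_workers=4,
#   )
#   # results[("1", "0xabcd12")]["code_on_chain"] -> True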