"""Benchmarking suite — performance baselines for reasoning pipeline latency. Provides repeatable micro-benchmarks for: - Decomposition latency - Multi-path scoring throughput - Consensus engine latency - Memory search latency - End-to-end Super Big Brain pipeline """ from __future__ import annotations import time from dataclasses import dataclass, field from typing import Any, Callable from fusionagi._logger import logger @dataclass class BenchmarkResult: """Result of a single benchmark run.""" name: str iterations: int total_seconds: float mean_ms: float min_ms: float max_ms: float std_ms: float metadata: dict[str, Any] = field(default_factory=dict) def summary(self) -> str: """Human-readable summary.""" return ( f"{self.name}: mean={self.mean_ms:.2f}ms " f"min={self.min_ms:.2f}ms max={self.max_ms:.2f}ms " f"std={self.std_ms:.2f}ms ({self.iterations} iters)" ) def _compute_stats(times: list[float]) -> tuple[float, float, float, float]: """Compute mean, min, max, std from a list of times in seconds.""" n = len(times) if n == 0: return 0.0, 0.0, 0.0, 0.0 times_ms = [t * 1000 for t in times] mean = sum(times_ms) / n mn = min(times_ms) mx = max(times_ms) variance = sum((t - mean) ** 2 for t in times_ms) / n std = variance ** 0.5 return mean, mn, mx, std def run_benchmark( name: str, fn: Callable[[], Any], iterations: int = 100, warmup: int = 5, metadata: dict[str, Any] | None = None, ) -> BenchmarkResult: """Run a micro-benchmark. Args: name: Benchmark name. fn: Function to benchmark (called with no args). iterations: Number of timed iterations. warmup: Number of warmup iterations (not timed). metadata: Additional context. Returns: Benchmark result with timing statistics. """ for _ in range(warmup): fn() times: list[float] = [] total_start = time.perf_counter() for _ in range(iterations): start = time.perf_counter() fn() elapsed = time.perf_counter() - start times.append(elapsed) total_elapsed = time.perf_counter() - total_start mean, mn, mx, std = _compute_stats(times) result = BenchmarkResult( name=name, iterations=iterations, total_seconds=total_elapsed, mean_ms=mean, min_ms=mn, max_ms=mx, std_ms=std, metadata=metadata or {}, ) logger.info("Benchmark complete", extra={"name": name, "mean_ms": mean}) return result class BenchmarkSuite: """Collection of benchmarks for the FusionAGI pipeline.""" def __init__(self) -> None: self._results: list[BenchmarkResult] = [] def add_result(self, result: BenchmarkResult) -> None: """Add a benchmark result.""" self._results.append(result) def run_decomposition_benchmark(self, iterations: int = 50) -> BenchmarkResult: """Benchmark the decomposition pipeline.""" from fusionagi.reasoning.decomposition import decompose_recursive prompt = ( "Explain the implications of quantum computing on modern cryptography, " "including RSA, elliptic curve, and lattice-based schemes." ) result = run_benchmark( "decomposition", lambda: decompose_recursive(prompt, max_depth=2), iterations=iterations, ) self._results.append(result) return result def run_multi_path_benchmark(self, iterations: int = 50) -> BenchmarkResult: """Benchmark multi-path hypothesis scoring.""" from fusionagi.reasoning.decomposition import decompose_recursive from fusionagi.reasoning.multi_path import generate_and_score_parallel prompt = "Evaluate the risk-reward tradeoff of early AGI deployment." 

class BenchmarkSuite:
    """Collection of benchmarks for the FusionAGI pipeline."""

    def __init__(self) -> None:
        self._results: list[BenchmarkResult] = []

    def add_result(self, result: BenchmarkResult) -> None:
        """Add a benchmark result."""
        self._results.append(result)

    def run_decomposition_benchmark(self, iterations: int = 50) -> BenchmarkResult:
        """Benchmark the decomposition pipeline."""
        from fusionagi.reasoning.decomposition import decompose_recursive

        prompt = (
            "Explain the implications of quantum computing on modern cryptography, "
            "including RSA, elliptic curve, and lattice-based schemes."
        )
        result = run_benchmark(
            "decomposition",
            lambda: decompose_recursive(prompt, max_depth=2),
            iterations=iterations,
        )
        self._results.append(result)
        return result

    def run_multi_path_benchmark(self, iterations: int = 50) -> BenchmarkResult:
        """Benchmark multi-path hypothesis scoring."""
        from fusionagi.reasoning.decomposition import decompose_recursive
        from fusionagi.reasoning.multi_path import generate_and_score_parallel

        prompt = "Evaluate the risk-reward tradeoff of early AGI deployment."
        decomp = decompose_recursive(prompt, max_depth=2)
        hypotheses = [u.content for u in decomp.units[:3] if u.content]
        if not hypotheses:
            hypotheses = ["test hypothesis"]
        result = run_benchmark(
            "multi_path_scoring",
            lambda: generate_and_score_parallel(hypotheses, decomp.units),
            iterations=iterations,
        )
        self._results.append(result)
        return result

    def run_recomposition_benchmark(self, iterations: int = 50) -> BenchmarkResult:
        """Benchmark the recomposition step."""
        from fusionagi.reasoning.decomposition import decompose_recursive
        from fusionagi.reasoning.recomposition import recompose
        from fusionagi.reasoning.tot import ThoughtNode

        prompt = "What are the key challenges in aligning superintelligent AI?"
        decomp = decompose_recursive(prompt, max_depth=2)
        node = ThoughtNode(
            thought="Alignment requires both technical and governance solutions.",
            unit_refs=[u.unit_id for u in decomp.units[:5]],
        )
        result = run_benchmark(
            "recomposition",
            lambda: recompose([node], decomp.units),
            iterations=iterations,
        )
        self._results.append(result)
        return result

    def run_end_to_end_benchmark(self, iterations: int = 20) -> BenchmarkResult:
        """Benchmark the full Super Big Brain pipeline."""
        from fusionagi.core.super_big_brain import SuperBigBrainConfig, run_super_big_brain
        from fusionagi.memory import SemanticGraphMemory

        graph = SemanticGraphMemory()
        config = SuperBigBrainConfig(max_decomposition_depth=2, parallel_hypotheses=2)
        prompt = "What is the most promising path from AGI to ASI?"
        result = run_benchmark(
            "end_to_end_super_big_brain",
            lambda: run_super_big_brain(prompt, graph, config),
            iterations=iterations,
            warmup=2,
        )
        self._results.append(result)
        return result

    def run_all(self, iterations: int = 30) -> list[BenchmarkResult]:
        """Run all benchmarks.

        Args:
            iterations: Number of iterations per benchmark.

        Returns:
            List of all benchmark results.
        """
        self._results.clear()
        self.run_decomposition_benchmark(iterations)
        self.run_multi_path_benchmark(iterations)
        self.run_recomposition_benchmark(iterations)
        self.run_end_to_end_benchmark(max(iterations // 3, 5))
        return list(self._results)

    def summary(self) -> str:
        """Generate summary report."""
        if not self._results:
            return "No benchmarks run."
        lines = ["FusionAGI Benchmark Results", "=" * 40]
        for r in self._results:
            lines.append(r.summary())
        return "\n".join(lines)

    def to_dict(self) -> list[dict[str, Any]]:
        """Export results as list of dicts."""
        return [
            {
                "name": r.name,
                "mean_ms": r.mean_ms,
                "min_ms": r.min_ms,
                "max_ms": r.max_ms,
                "std_ms": r.std_ms,
                "iterations": r.iterations,
            }
            for r in self._results
        ]


__all__ = [
    "BenchmarkResult",
    "BenchmarkSuite",
    "run_benchmark",
]
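

# Minimal command-line entry point (a sketch, not part of the original module;
# assumes the fusionagi reasoning, memory, and core modules imported by the
# suite above are importable in the current environment):
if __name__ == "__main__":
    suite = BenchmarkSuite()
    suite.run_all(iterations=10)
    print(suite.summary())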