FusionAGI/fusionagi/evaluation/benchmarks.py

"""Benchmarking suite — performance baselines for reasoning pipeline latency.

Provides repeatable micro-benchmarks for:
- Decomposition latency
- Multi-path scoring throughput
- Consensus engine latency
- Memory search latency
- End-to-end Super Big Brain pipeline
"""

from __future__ import annotations

import time
from dataclasses import dataclass, field
from typing import Any, Callable

from fusionagi._logger import logger


@dataclass
class BenchmarkResult:
    """Result of a single benchmark run."""

    name: str
    iterations: int
    total_seconds: float
    mean_ms: float
    min_ms: float
    max_ms: float
    std_ms: float
    metadata: dict[str, Any] = field(default_factory=dict)

    def summary(self) -> str:
        """Human-readable summary."""
        return (
            f"{self.name}: mean={self.mean_ms:.2f}ms "
            f"min={self.min_ms:.2f}ms max={self.max_ms:.2f}ms "
            f"std={self.std_ms:.2f}ms ({self.iterations} iters)"
        )


def _compute_stats(times: list[float]) -> tuple[float, float, float, float]:
    """Compute mean, min, max, std from a list of times in seconds."""
    n = len(times)
    if n == 0:
        return 0.0, 0.0, 0.0, 0.0
    times_ms = [t * 1000 for t in times]
    mean = sum(times_ms) / n
    mn = min(times_ms)
    mx = max(times_ms)
    variance = sum((t - mean) ** 2 for t in times_ms) / n
    std = variance ** 0.5
    return mean, mn, mx, std


def run_benchmark(
    name: str,
    fn: Callable[[], Any],
    iterations: int = 100,
    warmup: int = 5,
    metadata: dict[str, Any] | None = None,
) -> BenchmarkResult:
    """Run a micro-benchmark.

    Args:
        name: Benchmark name.
        fn: Function to benchmark (called with no args).
        iterations: Number of timed iterations.
        warmup: Number of warmup iterations (not timed).
        metadata: Additional context.

    Returns:
        Benchmark result with timing statistics.
    """
    for _ in range(warmup):
        fn()

    times: list[float] = []
    total_start = time.perf_counter()
    for _ in range(iterations):
        start = time.perf_counter()
        fn()
        elapsed = time.perf_counter() - start
        times.append(elapsed)
    total_elapsed = time.perf_counter() - total_start

    mean, mn, mx, std = _compute_stats(times)
    result = BenchmarkResult(
        name=name,
        iterations=iterations,
        total_seconds=total_elapsed,
        mean_ms=mean,
        min_ms=mn,
        max_ms=mx,
        std_ms=std,
        metadata=metadata or {},
    )

    logger.info("Benchmark complete", extra={"name": name, "mean_ms": mean})
    return result


class BenchmarkSuite:
    """Collection of benchmarks for the FusionAGI pipeline."""

    def __init__(self) -> None:
        self._results: list[BenchmarkResult] = []

    def add_result(self, result: BenchmarkResult) -> None:
        """Add a benchmark result."""
        self._results.append(result)

    def run_decomposition_benchmark(self, iterations: int = 50) -> BenchmarkResult:
        """Benchmark the decomposition pipeline."""
        from fusionagi.reasoning.decomposition import decompose_recursive

        prompt = (
            "Explain the implications of quantum computing on modern cryptography, "
            "including RSA, elliptic curve, and lattice-based schemes."
        )
        result = run_benchmark(
            "decomposition",
            lambda: decompose_recursive(prompt, max_depth=2),
            iterations=iterations,
        )
        self._results.append(result)
        return result

    def run_multi_path_benchmark(self, iterations: int = 50) -> BenchmarkResult:
        """Benchmark multi-path hypothesis scoring."""
        from fusionagi.reasoning.decomposition import decompose_recursive
        from fusionagi.reasoning.multi_path import generate_and_score_parallel

        prompt = "Evaluate the risk-reward tradeoff of early AGI deployment."
        decomp = decompose_recursive(prompt, max_depth=2)
        hypotheses = [u.content for u in decomp.units[:3] if u.content]
        if not hypotheses:
            hypotheses = ["test hypothesis"]

        result = run_benchmark(
            "multi_path_scoring",
            lambda: generate_and_score_parallel(hypotheses, decomp.units),
            iterations=iterations,
        )
        self._results.append(result)
        return result

    def run_recomposition_benchmark(self, iterations: int = 50) -> BenchmarkResult:
        """Benchmark the recomposition step."""
        from fusionagi.reasoning.decomposition import decompose_recursive
        from fusionagi.reasoning.recomposition import recompose
        from fusionagi.reasoning.tot import ThoughtNode

        prompt = "What are the key challenges in aligning superintelligent AI?"
        decomp = decompose_recursive(prompt, max_depth=2)
        node = ThoughtNode(
            thought="Alignment requires both technical and governance solutions.",
            unit_refs=[u.unit_id for u in decomp.units[:5]],
        )

        result = run_benchmark(
            "recomposition",
            lambda: recompose([node], decomp.units),
            iterations=iterations,
        )
        self._results.append(result)
        return result

    def run_end_to_end_benchmark(self, iterations: int = 20) -> BenchmarkResult:
        """Benchmark the full Super Big Brain pipeline."""
        from fusionagi.core.super_big_brain import SuperBigBrainConfig, run_super_big_brain
        from fusionagi.memory import SemanticGraphMemory

        graph = SemanticGraphMemory()
        config = SuperBigBrainConfig(max_decomposition_depth=2, parallel_hypotheses=2)
        prompt = "What is the most promising path from AGI to ASI?"

        result = run_benchmark(
            "end_to_end_super_big_brain",
            lambda: run_super_big_brain(prompt, graph, config),
            iterations=iterations,
            warmup=2,
        )
        self._results.append(result)
        return result

    def run_all(self, iterations: int = 30) -> list[BenchmarkResult]:
        """Run all benchmarks.

        Args:
            iterations: Number of iterations per benchmark.

        Returns:
            List of all benchmark results.
        """
        self._results.clear()
        self.run_decomposition_benchmark(iterations)
        self.run_multi_path_benchmark(iterations)
        self.run_recomposition_benchmark(iterations)
        self.run_end_to_end_benchmark(max(iterations // 3, 5))
        return list(self._results)

    def summary(self) -> str:
        """Generate summary report."""
        if not self._results:
            return "No benchmarks run."
        lines = ["FusionAGI Benchmark Results", "=" * 40]
        for r in self._results:
            lines.append(r.summary())
        return "\n".join(lines)

    def to_dict(self) -> list[dict[str, Any]]:
        """Export results as list of dicts."""
        return [
            {
                "name": r.name,
                "mean_ms": r.mean_ms,
                "min_ms": r.min_ms,
                "max_ms": r.max_ms,
                "std_ms": r.std_ms,
                "iterations": r.iterations,
            }
            for r in self._results
        ]


__all__ = [
    "BenchmarkResult",
    "BenchmarkSuite",
    "run_benchmark",
]