Frontend (items 1-10):
- WebSocket streaming integration with useWebSocket hook
- Admin Dashboard UI (status, voices, agents, governance tabs)
- Voice playback UI (TTS/STT integration)
- Settings/Preferences page (conversation style, sliders)
- Responsive/mobile layout (breakpoints at 480px, 768px)
- Dark/light theme with CSS variables and localStorage
- Error handling & loading states (retry, empty state, disabled input)
- Authentication UI (login page, Bearer token, logout)
- Head visualization improvements (active/speaking states, animations)
- Consequence/Ethics dashboard (lessons, consequences, insights tabs)
Backend stubs (items 11-21):
- Tool connectors: DocsConnector (text/md/PDF), DBConnector (SQLite/Postgres), CodeRunnerConnector (Python/JS/Bash/Ruby sandboxed)
- STT adapter: WhisperSTTAdapter, AzureSTTAdapter
- Multi-modal interface adapters: Visual, Haptic, Gesture, Biometric
- SSE streaming endpoint (/v1/sessions/{id}/stream/sse)
- Multi-tenant support (X-Tenant-ID header, tenant CRUD)
- Plugin marketplace/registry (register, install, list)
- Backup/restore endpoints
- Versioned API negotiation (Accept-Version header, deprecation)
Infrastructure (items 22-26):
- docker-compose.yml (API + Postgres + Redis + frontend)
- .env.example with all configurable vars
- gunicorn.conf.py production ASGI config
- Prometheus metrics collector and /metrics endpoint
- Structured JSON logging configuration
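As a sketch of what the structured JSON logging configuration could look like: the formatter class and field names below are illustrative assumptions, not the project's actual config, but the pattern (one JSON object per log line via a custom `logging.Formatter`) is standard.

```python
import json
import logging


class JsonFormatter(logging.Formatter):
    """Render each log record as a single JSON line."""

    def format(self, record: logging.LogRecord) -> str:
        payload = {
            "ts": self.formatTime(record, "%Y-%m-%dT%H:%M:%S"),
            "level": record.levelname,
            "logger": record.name,
            "message": record.getMessage(),
        }
        if record.exc_info:
            payload["exc_info"] = self.formatException(record.exc_info)
        return json.dumps(payload)


handler = logging.StreamHandler()
handler.setFormatter(JsonFormatter())
logging.getLogger("fusionagi").addHandler(handler)
```

Keeping one record per line makes the output directly ingestible by log shippers without multi-line parsing rules.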
Documentation (items 27-29):
- Architecture docs with module layout and subsystem descriptions
- Quickstart guide with setup, API tour, and test instructions
Tests (items 30-32):
- Integration tests: 25 end-to-end API tests
- Frontend tests: 10 Vitest tests for hooks (useTheme, useAuth)
- Load/performance tests: latency and throughput benchmarks
- Connector tests: 16 tests for Docs, DB, CodeRunner
- Multi-modal adapter tests: 9 tests
- Metrics collector tests: 5 tests
- STT adapter tests: 2 tests
511 Python tests passing, 10 frontend tests passing, 0 ruff errors.
Co-Authored-By: Nakamoto, S <defi@defi-oracle.io>
85 lines · 3.0 KiB · Python
"""Prometheus metrics for FusionAGI API.

Provides request counters, latency histograms, and system gauges.
Metrics are exposed at ``/metrics`` when ``FUSIONAGI_METRICS_ENABLED=true``.
"""

from __future__ import annotations

import os
import time
from typing import Any


class MetricsCollector:
    """Lightweight metrics collector (no external dependency required).

    Stores counters and histograms in-memory. If ``prometheus_client``
    is installed, registers native Prometheus metrics. Otherwise, returns
    JSON-serializable dicts via ``snapshot()``.
    """

    def __init__(self) -> None:
        self._counters: dict[str, int] = {}
        self._histograms: dict[str, list[float]] = {}
        self._gauges: dict[str, float] = {}
        self._start = time.monotonic()

    def inc(self, name: str, value: int = 1, labels: dict[str, str] | None = None) -> None:
        """Increment a counter."""
        key = self._key(name, labels)
        self._counters[key] = self._counters.get(key, 0) + value

    def observe(self, name: str, value: float, labels: dict[str, str] | None = None) -> None:
        """Record a histogram observation (e.g., latency)."""
        key = self._key(name, labels)
        self._histograms.setdefault(key, []).append(value)
        # Cap per-series memory: once a series exceeds 10k samples,
        # keep only the most recent 5k.
        if len(self._histograms[key]) > 10000:
            self._histograms[key] = self._histograms[key][-5000:]

    def set_gauge(self, name: str, value: float, labels: dict[str, str] | None = None) -> None:
        """Set a gauge value."""
        self._gauges[self._key(name, labels)] = value

    def snapshot(self) -> dict[str, Any]:
        """Return JSON-serializable metrics snapshot."""
        hist_summary: dict[str, Any] = {}
        for k, vals in self._histograms.items():
            if vals:
                sorted_vals = sorted(vals)
                hist_summary[k] = {
                    "count": len(vals),
                    "mean": sum(vals) / len(vals),
                    "p50": sorted_vals[len(sorted_vals) // 2],
                    "p95": sorted_vals[int(len(sorted_vals) * 0.95)],
                    "p99": sorted_vals[int(len(sorted_vals) * 0.99)],
                }
        return {
            "uptime_seconds": time.monotonic() - self._start,
            "counters": dict(self._counters),
            "histograms": hist_summary,
            "gauges": dict(self._gauges),
        }

    def _key(self, name: str, labels: dict[str, str] | None) -> str:
        # Encode labels into the key, e.g. "latency{route=/chat}".
        if not labels:
            return name
        label_str = ",".join(f"{k}={v}" for k, v in sorted(labels.items()))
        return f"{name}{{{label_str}}}"


_metrics: MetricsCollector | None = None


def get_metrics() -> MetricsCollector:
    """Get or create the global metrics collector."""
    global _metrics
    if _metrics is None:
        _metrics = MetricsCollector()
    return _metrics


def metrics_enabled() -> bool:
    """Check if metrics endpoint should be exposed."""
    return os.environ.get("FUSIONAGI_METRICS_ENABLED", "false").lower() in ("true", "1", "yes")
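When ``prometheus_client`` is absent, the ``/metrics`` endpoint still needs something close to Prometheus' text exposition format. The function below is a hedged sketch of rendering ``snapshot()`` output into that shape; its name is hypothetical, and note that it reuses the collector's ``name{k=v}`` key encoding, which a strict implementation would tighten by quoting label values.

```python
def render_exposition(snapshot: dict) -> str:
    """Render a MetricsCollector.snapshot() dict as Prometheus-style text.

    Counter and gauge keys already carry labels in ``name{k=v}`` form, so
    they are emitted as-is; histogram summaries are flattened into
    suffixed series (e.g. ``latency_p95{route=/chat}``).
    """
    lines: list[str] = []
    for key, value in snapshot.get("counters", {}).items():
        lines.append(f"{key} {value}")
    for key, value in snapshot.get("gauges", {}).items():
        lines.append(f"{key} {value}")
    for key, summary in snapshot.get("histograms", {}).items():
        # Split "name{labels}" so the stat suffix lands on the name,
        # not after the closing brace.
        base, _, rest = key.partition("{")
        labels = "{" + rest if rest else ""
        for stat, value in summary.items():
            lines.append(f"{base}_{stat}{labels} {value}")
    return "\n".join(lines) + "\n"
```

A ``/metrics`` handler would then just return this string with a ``text/plain`` content type when the native Prometheus registry is unavailable.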