# FusionAGI/fusionagi/adapters/tts_adapter.py

"""TTS adapter protocol and implementations for speech synthesis."""

from __future__ import annotations

import base64
from abc import ABC, abstractmethod
from typing import Any

from fusionagi._logger import logger


class TTSAdapter(ABC):
    """Provider-agnostic interface for text-to-speech synthesis.

    Each concrete subclass wraps the API calls of one speech provider
    (ElevenLabs, Azure Cognitive Services, Google Cloud TTS, etc.).
    """

    @abstractmethod
    async def synthesize(
        self,
        text: str,
        *,
        voice_id: str | None = None,
        language: str = "en",
        **kwargs: Any,
    ) -> bytes | None:
        """Convert ``text`` into audio.

        Args:
            text: The text to be spoken.
            voice_id: Identifier of the voice to use; its meaning is
                specific to the provider.
            language: BCP-47 language code.
            **kwargs: Extra options understood by the concrete provider.

        Returns:
            The raw audio payload (mp3/wav) as bytes, or ``None`` when
            synthesis failed.
        """


class StubTTSAdapter(TTSAdapter):
    """No-op TTS adapter intended for tests; always yields empty audio."""

    async def synthesize(
        self,
        text: str,
        *,
        voice_id: str | None = None,
        language: str = "en",
        **kwargs: Any,
    ) -> bytes | None:
        """Log the call at debug level and hand back zero-length audio.

        ``language`` and ``kwargs`` are accepted for interface
        compatibility but are not used.
        """
        # Only the first 50 characters of the text go into the log record.
        log_fields = {"text": text[:50], "voice_id": voice_id}
        logger.debug("StubTTS: synthesize called", extra=log_fields)
        return bytes()


class ElevenLabsTTSAdapter(TTSAdapter):
    """ElevenLabs TTS adapter.

    Requires the ``httpx`` package and an ElevenLabs API key. ``httpx`` is
    imported lazily inside :meth:`synthesize`, so constructing the adapter
    does not need it to be installed.
    """

    API_BASE = "https://api.elevenlabs.io/v1"
    DEFAULT_VOICE = "21m00Tcm4TlvDq8ikWAM"  # Rachel
    # Per-request defaults; callers can override them through ``synthesize``
    # keyword options.
    DEFAULT_VOICE_SETTINGS = {"stability": 0.5, "similarity_boost": 0.75}
    DEFAULT_TIMEOUT = 30.0

    def __init__(
        self,
        api_key: str,
        *,
        default_voice_id: str | None = None,
        model_id: str = "eleven_monolingual_v1",
    ) -> None:
        """Store credentials and defaults.

        Args:
            api_key: Value sent in the ``xi-api-key`` request header.
            default_voice_id: Voice used when ``synthesize`` is called
                without ``voice_id``; falls back to ``DEFAULT_VOICE``.
            model_id: Model identifier sent with every request unless
                overridden per call.
        """
        self._api_key = api_key
        self._default_voice = default_voice_id or self.DEFAULT_VOICE
        self._model_id = model_id

    async def synthesize(
        self,
        text: str,
        *,
        voice_id: str | None = None,
        language: str = "en",
        **kwargs: Any,
    ) -> bytes | None:
        """Call the ElevenLabs text-to-speech endpoint.

        Args:
            text: Text to synthesize.
            voice_id: Voice to use; defaults to the adapter's default voice.
            language: Accepted for interface compatibility; it is not
                included in the request.
            **kwargs: Optional per-call overrides. Recognised keys:
                ``model_id`` (replaces the adapter's model),
                ``voice_settings`` (mapping merged over
                ``DEFAULT_VOICE_SETTINGS``) and ``timeout`` (forwarded to
                the HTTP request). Any other keys are ignored.

        Returns:
            The response body as bytes, or ``None`` if ``httpx`` is not
            installed or the request fails for any reason.
        """
        try:
            import httpx
        except ImportError:
            logger.error("httpx not installed; pip install httpx")
            return None

        from urllib.parse import quote

        vid = voice_id or self._default_voice
        # Percent-encode the voice id so a value containing "/", "?" or "#"
        # cannot redirect the request (and the API key) to another path.
        safe_vid = quote(vid, safe="")
        url = f"{self.API_BASE}/text-to-speech/{safe_vid}"
        headers = {"xi-api-key": self._api_key, "Content-Type": "application/json"}

        # Honour provider-specific overrides instead of silently dropping them.
        model_id = kwargs.get("model_id") or self._model_id
        voice_settings = {
            **self.DEFAULT_VOICE_SETTINGS,
            **(kwargs.get("voice_settings") or {}),
        }
        timeout = kwargs.get("timeout", self.DEFAULT_TIMEOUT)

        payload: dict[str, Any] = {
            "text": text,
            "model_id": model_id,
            "voice_settings": voice_settings,
        }

        try:
            async with httpx.AsyncClient() as client:
                resp = await client.post(url, json=payload, headers=headers, timeout=timeout)
                resp.raise_for_status()
                return resp.content
        except Exception as e:
            # Boundary handler: the adapter contract is "None on failure",
            # so transport and HTTP-status errors are logged, not raised.
            logger.error("ElevenLabs TTS failed", extra={"error": str(e)})
            return None


def audio_to_base64(audio_bytes: bytes) -> str:
    """Return the base64 text representation of ``audio_bytes``."""
    encoded = base64.b64encode(audio_bytes)
    # Base64 output is pure ASCII, so this yields the same text as the
    # default UTF-8 decode.
    return encoded.decode("ascii")


# Public names of this module (what ``from <module> import *`` exposes).
__all__ = [
    "TTSAdapter",
    "StubTTSAdapter",
    "ElevenLabsTTSAdapter",
    "audio_to_base64",
]