# FusionAGI/fusionagi/adapters/stt_adapter.py

"""STT adapter: speech-to-text with Whisper, Azure, and stub implementations."""

from __future__ import annotations

from abc import ABC, abstractmethod
from typing import Any

from fusionagi._logger import logger


class STTAdapter(ABC):
    """Common interface implemented by every speech-to-text backend.

    The class is abstract: concrete adapters must override :meth:`transcribe`.
    """

    @abstractmethod
    async def transcribe(
        self, audio_data: bytes, *, language: str = "en", **kwargs: Any
    ) -> str | None:
        """Turn an audio payload into text.

        Args:
            audio_data: Raw audio bytes (wav/mp3/ogg).
            language: BCP-47 language code used as a hint (keyword-only).
            **kwargs: Extra, provider-specific options.

        Returns:
            The transcribed text, or None when transcription fails.
        """


class StubSTTAdapter(STTAdapter):
    """Test double for STT: ignores the audio content and returns fixed placeholder text."""

    async def transcribe(
        self, audio_data: bytes, *, language: str = "en", **kwargs: Any
    ) -> str | None:
        """Log the payload size at debug level and return the placeholder transcription.

        Args:
            audio_data: Audio bytes; only their length is used (for the log record).
            language: Accepted for interface compatibility; not used.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            Always the literal string ``"[stub transcription]"``.
        """
        payload_size = len(audio_data)
        log_fields = {"audio_size": payload_size}
        logger.debug("StubSTT: transcribe called", extra=log_fields)
        return "[stub transcription]"


class WhisperSTTAdapter(STTAdapter):
    """OpenAI Whisper STT adapter.

    Requires the ``openai`` package and an OpenAI API key.
    """

    def __init__(self, api_key: str | None = None, model: str = "whisper-1") -> None:
        """Remember the credentials and model used for every request.

        Args:
            api_key: OpenAI API key; forwarded unchanged (including None) to the
                client constructor.
            model: Name of the transcription model to request.
        """
        self._api_key = api_key
        self._model = model

    async def transcribe(
        self,
        audio_data: bytes,
        *,
        language: str = "en",
        **kwargs: Any,
    ) -> str | None:
        """Transcribe audio bytes through the OpenAI transcription endpoint.

        Args:
            audio_data: Raw audio bytes; uploaded under the file name ``audio.wav``.
            language: Language hint. A BCP-47 tag such as ``"en-US"`` is reduced
                to its primary subtag (``"en"``) before being sent.
            **kwargs: Accepted for interface compatibility; currently not used.

        Returns:
            The transcribed text, or None if ``openai`` is not installed or the
            request fails for any reason (the error is logged).
        """
        try:
            import io

            import openai

            audio_file = io.BytesIO(audio_data)
            # The upload needs a file name; the original hard-codes a .wav name.
            audio_file.name = "audio.wav"

            # The transcription endpoint documents ISO-639-1 codes, while this
            # interface advertises BCP-47 tags, so keep only the primary subtag.
            language_code = language.replace("_", "-").split("-", 1)[0].lower()

            # Use the async client so the network round-trip does not block the
            # event loop; ``async with`` closes the underlying HTTP client.
            async with openai.AsyncOpenAI(api_key=self._api_key) as client:
                transcript = await client.audio.transcriptions.create(
                    model=self._model,
                    file=audio_file,
                    language=language_code,
                )
            return transcript.text
        except ImportError:
            logger.error("openai not installed; pip install fusionagi[openai]")
            return None
        except Exception as e:
            # Best-effort contract: any provider/network failure maps to None.
            logger.error("Whisper STT failed", extra={"error": str(e)})
            return None


class AzureSTTAdapter(STTAdapter):
    """Azure Cognitive Services STT adapter.

    Requires ``httpx`` and an Azure Speech Services key.
    """

    def __init__(self, api_key: str, region: str = "eastus") -> None:
        """Store the subscription key and derive the regional endpoint URL.

        Args:
            api_key: Azure Speech subscription key, sent in the
                ``Ocp-Apim-Subscription-Key`` header.
            region: Azure region name used as the host prefix of the endpoint.
        """
        self._api_key = api_key
        self._region = region
        self._endpoint = (
            f"https://{region}.stt.speech.microsoft.com"
            "/speech/recognition/conversation/cognitiveservices/v1"
        )

    async def transcribe(
        self,
        audio_data: bytes,
        *,
        language: str = "en-US",
        **kwargs: Any,
    ) -> str | None:
        """Transcribe audio bytes by POSTing them to the Azure recognition endpoint.

        Args:
            audio_data: Raw audio bytes; sent with ``Content-Type: audio/wav``.
            language: Language tag passed as the ``language`` query parameter.
            **kwargs: Accepted for interface compatibility; currently not used.

        Returns:
            The ``DisplayText`` from the response when recognition succeeded,
            otherwise None. None is also returned if ``httpx`` is missing, the
            HTTP request fails, or the service reports a non-success
            ``RecognitionStatus`` (each case is logged).
        """
        try:
            import httpx

            headers = {
                "Ocp-Apim-Subscription-Key": self._api_key,
                "Content-Type": "audio/wav",
            }
            params = {"language": language}
            async with httpx.AsyncClient() as client:
                resp = await client.post(
                    self._endpoint,
                    headers=headers,
                    params=params,
                    content=audio_data,
                    timeout=30.0,
                )
                resp.raise_for_status()
                data = resp.json()

            status = data.get("RecognitionStatus")
            # A status such as "NoMatch" is a diagnostic, not a transcription;
            # returning it as text would violate the "text or None" contract.
            if status is not None and status != "Success":
                logger.warning(
                    "Azure STT returned no transcription",
                    extra={"status": status},
                )
                return None
            # Empty/missing text is reported as None rather than an empty string.
            return data.get("DisplayText") or None
        except ImportError:
            logger.error("httpx not installed; pip install httpx")
            return None
        except Exception as e:
            # Best-effort contract: any HTTP/parse failure maps to None.
            logger.error("Azure STT failed", extra={"error": str(e)})
            return None


# Names exported by ``from ... import *``: the abstract interface followed by
# the stub, Whisper, and Azure implementations defined in this module.
__all__ = [
    "STTAdapter",
    "StubSTTAdapter",
    "WhisperSTTAdapter",
    "AzureSTTAdapter",
]