"""STT adapter: speech-to-text with Whisper, Azure, and stub implementations.""" from __future__ import annotations from abc import ABC, abstractmethod from typing import Any from fusionagi._logger import logger class STTAdapter(ABC): """Abstract adapter for speech-to-text transcription.""" @abstractmethod async def transcribe( self, audio_data: bytes, *, language: str = "en", **kwargs: Any, ) -> str | None: """Transcribe audio bytes to text. Args: audio_data: Raw audio bytes (wav/mp3/ogg). language: BCP-47 language code hint. **kwargs: Provider-specific options. Returns: Transcribed text or None on failure. """ ... class StubSTTAdapter(STTAdapter): """Stub STT adapter for testing; returns placeholder text.""" async def transcribe( self, audio_data: bytes, *, language: str = "en", **kwargs: Any, ) -> str | None: logger.debug("StubSTT: transcribe called", extra={"audio_size": len(audio_data)}) return "[stub transcription]" class WhisperSTTAdapter(STTAdapter): """OpenAI Whisper STT adapter. Requires the ``openai`` package and an OpenAI API key. """ def __init__(self, api_key: str | None = None, model: str = "whisper-1") -> None: self._api_key = api_key self._model = model async def transcribe( self, audio_data: bytes, *, language: str = "en", **kwargs: Any, ) -> str | None: try: import io import openai client = openai.OpenAI(api_key=self._api_key) audio_file = io.BytesIO(audio_data) audio_file.name = "audio.wav" transcript = client.audio.transcriptions.create( model=self._model, file=audio_file, language=language, ) return transcript.text except ImportError: logger.error("openai not installed; pip install fusionagi[openai]") return None except Exception as e: logger.error("Whisper STT failed", extra={"error": str(e)}) return None class AzureSTTAdapter(STTAdapter): """Azure Cognitive Services STT adapter. Requires ``httpx`` and an Azure Speech Services key. """ def __init__(self, api_key: str, region: str = "eastus") -> None: self._api_key = api_key self._region = region self._endpoint = f"https://{region}.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1" async def transcribe( self, audio_data: bytes, *, language: str = "en-US", **kwargs: Any, ) -> str | None: try: import httpx headers = { "Ocp-Apim-Subscription-Key": self._api_key, "Content-Type": "audio/wav", } params = {"language": language} async with httpx.AsyncClient() as client: resp = await client.post( self._endpoint, headers=headers, params=params, content=audio_data, timeout=30.0, ) resp.raise_for_status() data = resp.json() return data.get("DisplayText") or data.get("RecognitionStatus") except ImportError: logger.error("httpx not installed; pip install httpx") return None except Exception as e: logger.error("Azure STT failed", extra={"error": str(e)}) return None __all__ = [ "STTAdapter", "StubSTTAdapter", "WhisperSTTAdapter", "AzureSTTAdapter", ]