Files
virtual-banker/backend/tts/elevenlabs-adapter.go
defiQUG 9839401d1d
Some checks failed
CI / build (push) Has been cancelled
TTS: configurable auth, Health check, Phoenix options; .env.example; Gitea CI workflow
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-10 16:54:10 -08:00

400 lines
12 KiB
Go

package tts
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"path"
"strings"
"time"
)
// ElevenLabsTTSService is a text-to-speech client that talks to the
// ElevenLabs REST API or to a Phoenix-hosted endpoint exposing the same routes.
type ElevenLabsTTSService struct {
	// Credentials and synthesis target.
	apiKey  string // fallback auth value when no explicit header value is supplied
	voiceID string // appended to the /text-to-speech/ route
	modelID string // sent as "model_id" in the request body

	// Endpoint and the authentication header attached to each request.
	baseURL         string
	authHeaderName  string // the constructor substitutes "xi-api-key" when none is given
	authHeaderValue string

	httpClient         *http.Client
	defaultVoiceConfig *VoiceConfig // used when a call does not pass its own config
}
// TTSOptions carries the optional overrides accepted when constructing the
// TTS service, typically to point it at a Phoenix deployment whose
// authentication differs from ElevenLabs. Zero values select the defaults.
type TTSOptions struct {
	// BaseURL is the API root, e.g. "https://phoenix.example.com/tts/v1".
	BaseURL string
	// AuthHeaderName is the header that carries the credential,
	// e.g. "Authorization"; empty means "xi-api-key".
	AuthHeaderName string
	// AuthHeaderValue is the credential itself, e.g. "Bearer token";
	// empty means the API key is sent.
	AuthHeaderValue string
}
// VoiceConfig is the set of voice tuning parameters serialized into the
// "voice_settings" object of a synthesis request.
type VoiceConfig struct {
	// Always serialized, even when zero.
	Stability       float64 `json:"stability"`
	SimilarityBoost float64 `json:"similarity_boost"`

	// Omitted from the JSON when left at the zero value.
	Style           float64 `json:"style,omitempty"`
	UseSpeakerBoost bool    `json:"use_speaker_boost,omitempty"`
}
// ElevenLabsRequest is the JSON payload posted to the text-to-speech routes.
type ElevenLabsRequest struct {
	// Text is the content to synthesize.
	Text string `json:"text"`
	// ModelID selects the synthesis model; left out of the JSON when empty.
	ModelID string `json:"model_id,omitempty"`
	// VoiceSettings is always serialized: encoding/json never treats a
	// struct value as empty, so the omitempty option has no effect here.
	VoiceSettings VoiceConfig `json:"voice_settings,omitempty"`
}
// DefaultElevenLabsBaseURL is the API base URL used when the constructors are
// given no base URL. It carries no trailing slash; request routes such as
// "/text-to-speech/{voiceID}" are appended to it.
const DefaultElevenLabsBaseURL = "https://api.elevenlabs.io/v1"
// NewElevenLabsTTSService returns a TTS service that targets the default
// ElevenLabs endpoint (DefaultElevenLabsBaseURL) and authenticates with apiKey
// in the "xi-api-key" header. To target a Phoenix-hosted, ElevenLabs-compatible
// API instead, use NewElevenLabsTTSServiceWithOptions or
// NewElevenLabsTTSServiceWithOptionsFull.
func NewElevenLabsTTSService(apiKey, voiceID string) *ElevenLabsTTSService {
	// Zero-valued options select every default (base URL and auth header).
	return NewElevenLabsTTSServiceWithOptionsFull(apiKey, voiceID, TTSOptions{})
}
// NewElevenLabsTTSServiceWithOptions returns a TTS service bound to baseURL.
// An empty baseURL selects DefaultElevenLabsBaseURL (ElevenLabs). For Phoenix,
// pass e.g. "https://phoenix.example.com/tts/v1" so that requests go to
// {baseURL}/text-to-speech/{voiceID} and {baseURL}/text-to-speech/{voiceID}/stream.
// Authentication keeps the ElevenLabs default ("xi-api-key: apiKey").
func NewElevenLabsTTSServiceWithOptions(apiKey, voiceID, baseURL string) *ElevenLabsTTSService {
	opts := TTSOptions{BaseURL: baseURL}
	return NewElevenLabsTTSServiceWithOptionsFull(apiKey, voiceID, opts)
}
// NewElevenLabsTTSServiceWithOptionsFull returns a TTS service configured from
// opts (base URL and authentication header). Use it for Phoenix deployments
// whose auth differs from ElevenLabs, e.g. "Authorization: Bearer <token>".
//
// Defaults applied for empty option fields:
//   - BaseURL: DefaultElevenLabsBaseURL (one trailing "/" is stripped first)
//   - AuthHeaderName: "xi-api-key"
//   - AuthHeaderValue: apiKey
//
// The service starts with model "eleven_multilingual_v2", a 30 second HTTP
// client timeout, and a default voice config of stability 0.5, similarity
// boost 0.75 with speaker boost enabled.
func NewElevenLabsTTSServiceWithOptionsFull(apiKey, voiceID string, opts TTSOptions) *ElevenLabsTTSService {
	svc := &ElevenLabsTTSService{
		apiKey:          apiKey,
		voiceID:         voiceID,
		modelID:         "eleven_multilingual_v2",
		baseURL:         strings.TrimSuffix(opts.BaseURL, "/"),
		authHeaderName:  opts.AuthHeaderName,
		authHeaderValue: opts.AuthHeaderValue,
		httpClient:      &http.Client{Timeout: 30 * time.Second},
		defaultVoiceConfig: &VoiceConfig{
			Stability:       0.5,
			SimilarityBoost: 0.75,
			UseSpeakerBoost: true,
		},
	}

	// Fill in whatever the caller left blank.
	if svc.baseURL == "" {
		svc.baseURL = DefaultElevenLabsBaseURL
	}
	if svc.authHeaderName == "" {
		svc.authHeaderName = "xi-api-key"
	}
	if svc.authHeaderValue == "" {
		svc.authHeaderValue = apiKey
	}
	return svc
}
// SetModelID replaces the model identifier sent as "model_id" in subsequent
// synthesis requests. The value is stored without validation.
func (s *ElevenLabsTTSService) SetModelID(modelID string) {
	s.modelID = modelID
}
// SetVoiceConfig replaces the voice configuration used when a synthesis call
// does not supply its own. The pointer is stored as-is (no copy, no
// validation), so later changes made through config remain visible to the
// service.
func (s *ElevenLabsTTSService) SetVoiceConfig(config *VoiceConfig) {
	s.defaultVoiceConfig = config
}
// Synthesize converts text to audio through the non-streaming REST route,
// using the service's default voice configuration, and returns the complete
// audio payload.
func (s *ElevenLabsTTSService) Synthesize(ctx context.Context, text string) ([]byte, error) {
	voice := s.defaultVoiceConfig
	return s.SynthesizeWithConfig(ctx, text, voice)
}
// SynthesizeWithConfig converts text to audio with the given voice
// configuration and returns the complete audio payload (requested as
// audio/mpeg).
//
// config may be nil, in which case the service's default voice configuration
// is used. An error is returned when no credentials or voice ID are
// configured, when text is empty, or when neither config nor a default voice
// configuration is available.
//
// The request is attempted up to three times. Transport errors and 5xx
// responses are retried after a linear backoff (1s, then 2s); any other
// non-200 status is returned immediately. The backoff and the request itself
// stop as soon as ctx is cancelled.
func (s *ElevenLabsTTSService) SynthesizeWithConfig(ctx context.Context, text string, config *VoiceConfig) ([]byte, error) {
	if s.authHeaderValue == "" && s.apiKey == "" {
		return nil, fmt.Errorf("TTS API key or auth not configured")
	}
	if s.voiceID == "" {
		return nil, fmt.Errorf("ElevenLabs voice ID not configured")
	}
	if text == "" {
		return nil, fmt.Errorf("text cannot be empty")
	}

	// Use default config if none provided; guard against both being nil so
	// the dereference below cannot panic.
	if config == nil {
		config = s.defaultVoiceConfig
	}
	if config == nil {
		return nil, fmt.Errorf("voice config not configured")
	}

	jsonBody, err := json.Marshal(ElevenLabsRequest{
		Text:          text,
		ModelID:       s.modelID,
		VoiceSettings: *config,
	})
	if err != nil {
		return nil, fmt.Errorf("failed to marshal request: %w", err)
	}

	endpoint := fmt.Sprintf("%s/text-to-speech/%s", s.baseURL, s.voiceID)

	const maxRetries = 3
	var lastErr error
	for attempt := 0; attempt < maxRetries; attempt++ {
		if attempt > 0 {
			// Linear backoff that can be interrupted by cancellation.
			timer := time.NewTimer(time.Duration(attempt) * time.Second)
			select {
			case <-ctx.Done():
				timer.Stop()
				return nil, fmt.Errorf("TTS request cancelled during backoff: %w", ctx.Err())
			case <-timer.C:
			}
		}

		// A fresh request per attempt: the body reader of a request that has
		// already been sent is drained and cannot be replayed.
		req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewReader(jsonBody))
		if err != nil {
			return nil, fmt.Errorf("failed to create request: %w", err)
		}
		req.Header.Set("Accept", "audio/mpeg")
		req.Header.Set("Content-Type", "application/json")
		if s.authHeaderValue != "" {
			req.Header.Set(s.authHeaderName, s.authHeaderValue)
		}

		resp, err := s.httpClient.Do(req)
		if err != nil {
			lastErr = fmt.Errorf("failed to call ElevenLabs API after %d retries: %w", maxRetries, err)
			if ctx.Err() != nil {
				// Cancelled or timed out: further attempts cannot succeed.
				return nil, lastErr
			}
			continue
		}

		if resp.StatusCode == http.StatusOK {
			audioData, readErr := io.ReadAll(resp.Body)
			resp.Body.Close()
			if readErr != nil {
				return nil, fmt.Errorf("failed to read audio data: %w", readErr)
			}
			return audioData, nil
		}

		// Best effort: the body is only used to enrich the error message, so
		// a read failure here is deliberately ignored.
		bodyBytes, _ := io.ReadAll(resp.Body)
		resp.Body.Close()
		lastErr = fmt.Errorf("ElevenLabs API error: status %d, body: %s", resp.StatusCode, string(bodyBytes))
		if resp.StatusCode < 500 {
			// Client-side errors will not change on retry.
			return nil, lastErr
		}
	}
	return nil, lastErr
}
// SynthesizeStream converts text to audio through the streaming route, using
// the service's default voice configuration, and returns a reader over the
// audio as it arrives.
func (s *ElevenLabsTTSService) SynthesizeStream(ctx context.Context, text string) (io.Reader, error) {
	voice := s.defaultVoiceConfig
	return s.SynthesizeStreamWithConfig(ctx, text, voice)
}
// SynthesizeStreamWithConfig converts text to audio through the streaming
// route ({baseURL}/text-to-speech/{voiceID}/stream) with the given voice
// configuration.
//
// config may be nil, in which case the service's default voice configuration
// is used. An error is returned when no credentials or voice ID are
// configured, when text is empty, when neither config nor a default voice
// configuration is available, or when the server answers with a non-200
// status. The request is not retried.
//
// On success the returned reader is the HTTP response body. The caller owns
// it and must close it when done; it also implements io.Closer, so
// `if c, ok := r.(io.Closer); ok { c.Close() }` releases the connection.
//
// NOTE(review): if the HTTP client has a Timeout set, net/http applies it to
// reading the response body as well, so long streams may be cut off.
func (s *ElevenLabsTTSService) SynthesizeStreamWithConfig(ctx context.Context, text string, config *VoiceConfig) (io.Reader, error) {
	if s.authHeaderValue == "" && s.apiKey == "" {
		return nil, fmt.Errorf("TTS API key or auth not configured")
	}
	if s.voiceID == "" {
		return nil, fmt.Errorf("ElevenLabs voice ID not configured")
	}
	if text == "" {
		return nil, fmt.Errorf("text cannot be empty")
	}

	// Use default config if none provided; guard against both being nil so
	// the dereference below cannot panic.
	if config == nil {
		config = s.defaultVoiceConfig
	}
	if config == nil {
		return nil, fmt.Errorf("voice config not configured")
	}

	jsonBody, err := json.Marshal(ElevenLabsRequest{
		Text:          text,
		ModelID:       s.modelID,
		VoiceSettings: *config,
	})
	if err != nil {
		return nil, fmt.Errorf("failed to marshal request: %w", err)
	}

	// Named endpoint rather than url, which would shadow the net/url package.
	endpoint := fmt.Sprintf("%s/text-to-speech/%s/stream", s.baseURL, s.voiceID)

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewReader(jsonBody))
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}
	req.Header.Set("Accept", "audio/mpeg")
	req.Header.Set("Content-Type", "application/json")
	if s.authHeaderValue != "" {
		req.Header.Set(s.authHeaderName, s.authHeaderValue)
	}

	resp, err := s.httpClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to call ElevenLabs streaming API: %w", err)
	}
	if resp.StatusCode != http.StatusOK {
		// Best effort: the body is only used to enrich the error message, so
		// a read failure here is deliberately ignored.
		bodyBytes, _ := io.ReadAll(resp.Body)
		resp.Body.Close()
		return nil, fmt.Errorf("TTS streaming API error: status %d, body: %s", resp.StatusCode, string(bodyBytes))
	}

	// Hand the open body to the caller, who is responsible for closing it.
	return resp.Body, nil
}
// Health reports whether the TTS backend is reachable.
//
// When the service targets DefaultElevenLabsBaseURL it returns nil without
// making a request (ElevenLabs has no public health endpoint). Otherwise it
// issues a GET to the base URL with its last path segment replaced by
// "health" (e.g. ".../tts/v1" becomes ".../tts/health"), sending the
// configured auth header, and returns an error unless the response status is
// in the 2xx range.
func (s *ElevenLabsTTSService) Health(ctx context.Context) error {
	if s.baseURL == DefaultElevenLabsBaseURL {
		// Nothing to probe; avoid an unnecessary call.
		return nil
	}

	healthURL, err := url.Parse(s.baseURL)
	if err != nil {
		return fmt.Errorf("TTS base URL invalid: %w", err)
	}
	parent := path.Dir(healthURL.Path)
	healthURL.Path = path.Join(parent, "health")

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, healthURL.String(), nil)
	if err != nil {
		return err
	}
	if s.authHeaderValue != "" {
		req.Header.Set(s.authHeaderName, s.authHeaderValue)
	}

	resp, err := s.httpClient.Do(req)
	if err != nil {
		return fmt.Errorf("TTS health check failed: %w", err)
	}
	// Only the status matters, so the body is released right away.
	resp.Body.Close()

	if code := resp.StatusCode; !(code >= 200 && code < 300) {
		return fmt.Errorf("TTS health returned status %d", code)
	}
	return nil
}
// GetVisemes returns viseme events for lip sync.
//
// ElevenLabs does not supply viseme data, so the events are approximated
// locally from the text alone; no request is made and ctx is not consulted.
// This is a simplified implementation: production use may call for a
// dedicated phoneme-to-viseme service or library. An error is returned only
// when text is empty.
func (s *ElevenLabsTTSService) GetVisemes(ctx context.Context, text string) ([]VisemeEvent, error) {
	if len(text) == 0 {
		return nil, fmt.Errorf("text cannot be empty")
	}
	return s.generateVisemesFromText(text), nil
}
// generateVisemesFromText builds an approximate viseme timeline for text
// without any real phoneme analysis.
//
// The text is lower-cased and split on whitespace. The timeline is:
//   - 0.1s of leading silence,
//   - for every word, a 0.3s viseme chosen from the word's first byte,
//     followed by a 0.05s silence,
//   - 0.1s of trailing silence.
//
// Each word event carries the whole word in its Phoneme field. Because only
// the first byte of a word is looked up, the multi-letter entries of the
// table below never match; unmatched words fall back to "aa".
//
// This is a simplified implementation. For production, consider:
//   - a dedicated phoneme-to-viseme mapping service,
//   - a TTS provider that returns phoneme timing data (e.g. Azure TTS with SSML),
//   - integration with a speech analysis library.
func (s *ElevenLabsTTSService) generateVisemesFromText(text string) []VisemeEvent {
	// Durations in seconds.
	const (
		leadSilence  = 0.1
		wordDuration = 0.3
		wordGap      = 0.05
		tailSilence  = 0.1
	)

	// Phoneme-to-viseme table, grouped by the viseme each phoneme maps to.
	phonemeToViseme := map[string]string{
		// -> "aa"
		"aa": "aa", "ae": "aa", "ah": "aa", "ao": "aa", "aw": "aa", "ay": "aa",
		"b": "aa", "p": "aa", "m": "aa",
		"d": "aa", "t": "aa", "n": "aa", "l": "aa",
		"k": "aa", "g": "aa", "ng": "aa",
		// -> "ee"
		"eh": "ee", "ey": "ee", "ih": "ee", "iy": "ee",
		"f": "ee", "v": "ee", "th": "ee",
		"s": "ee", "z": "ee",
		"sh": "ee", "zh": "ee", "ch": "ee", "jh": "ee",
		"y": "ee",
		// -> "er"
		"er": "er", "r": "er",
		// -> "oh"
		"ow": "oh", "oy": "oh",
		// -> "ou"
		"uh": "ou", "uw": "ou", "w": "ou",
		// -> "sil"
		"h": "sil", "sil": "sil", "sp": "sil",
	}

	tokens := strings.Fields(strings.ToLower(text))
	timeline := make([]VisemeEvent, 0, 2*len(tokens)+2)

	// emit appends one event starting at the running clock and advances the
	// clock to that event's end.
	clock := 0.0
	emit := func(viseme, phoneme string, span float64) {
		end := clock + span
		timeline = append(timeline, VisemeEvent{
			Viseme:    viseme,
			StartTime: clock,
			EndTime:   end,
			Phoneme:   phoneme,
		})
		clock = end
	}

	emit("sil", "sil", leadSilence)
	for _, token := range tokens {
		// strings.Fields never yields an empty token, so token[:1] is safe.
		shape, known := phonemeToViseme[token[:1]]
		if !known {
			shape = "aa"
		}
		emit(shape, token, wordDuration)
		emit("sil", "sil", wordGap)
	}
	emit("sil", "sil", tailSilence)

	return timeline
}