// virtual-banker/backend/tts/elevenlabs-adapter.go

package tts

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"path"
	"strings"
	"time"
)

// ElevenLabsTTSService integrates with ElevenLabs TTS API or a Phoenix-compatible endpoint.
//
// Field overview (as populated by NewElevenLabsTTSServiceWithOptionsFull):
//   - apiKey: key passed to the constructor; also used as authHeaderValue when no
//     explicit header value is supplied.
//   - voiceID: inserted into the request path "/text-to-speech/{voiceID}".
//   - modelID: sent as "model_id" in the request body; changeable via SetModelID.
//   - baseURL: API root with one trailing "/" trimmed; endpoint paths are appended to it.
//   - authHeaderName / authHeaderValue: header set on every outgoing request when the
//     value is non-empty.
//   - httpClient: client used for all requests issued by this service.
//   - defaultVoiceConfig: voice settings used when a caller passes a nil config.
type ElevenLabsTTSService struct {
	apiKey             string
	voiceID            string
	modelID            string
	baseURL            string
	authHeaderName     string // default "xi-api-key" when empty
	authHeaderValue    string
	httpClient         *http.Client
	defaultVoiceConfig *VoiceConfig
}

// TTSOptions allows optional overrides when creating the TTS service (e.g. Phoenix auth).
// Every field is optional; an empty string selects the default noted on the field.
type TTSOptions struct {
	BaseURL         string // e.g. "https://phoenix.example.com/tts/v1"
	AuthHeaderName  string // e.g. "Authorization"; empty = "xi-api-key"
	AuthHeaderValue string // e.g. "Bearer token"; empty = apiKey
}

// VoiceConfig holds ElevenLabs voice configuration.
// It is embedded by value in ElevenLabsRequest and serialized under "voice_settings".
// Stability and SimilarityBoost are always written to JSON; Style and UseSpeakerBoost
// are omitted when they hold their zero value (0 / false).
// NOTE(review): the numeric fields presumably use the 0.0-1.0 range of the ElevenLabs
// API - confirm against the provider documentation.
type VoiceConfig struct {
	Stability       float64 `json:"stability"`
	SimilarityBoost float64 `json:"similarity_boost"`
	Style           float64 `json:"style,omitempty"`
	UseSpeakerBoost bool    `json:"use_speaker_boost,omitempty"`
}

// ElevenLabsRequest represents the JSON request body sent to the text-to-speech
// endpoints ("/text-to-speech/{voiceID}" and its "/stream" variant).
//
// ModelID is left out of the JSON when empty. VoiceSettings is a struct value, so
// encoding/json always emits "voice_settings": the omitempty option has no effect on
// struct-typed fields. The tag is kept unchanged so the wire format stays the same.
type ElevenLabsRequest struct {
	Text          string      `json:"text"`
	ModelID       string      `json:"model_id,omitempty"`
	VoiceSettings VoiceConfig `json:"voice_settings,omitempty"`
}

// DefaultElevenLabsBaseURL is the default TTS API base (ElevenLabs or Phoenix-compatible).
// It is used when no base URL is supplied to the constructors, and Health skips its
// check when the service's base URL equals this value.
const DefaultElevenLabsBaseURL = "https://api.elevenlabs.io/v1"

// NewElevenLabsTTSService creates a TTS service that talks to the default endpoint
// (DefaultElevenLabsBaseURL) and authenticates with apiKey in the default auth header.
// voiceID selects the voice used for every synthesis request.
// To target a Phoenix-hosted, ElevenLabs-compatible API, use one of the
// NewElevenLabsTTSServiceWithOptions* constructors instead.
func NewElevenLabsTTSService(apiKey, voiceID string) *ElevenLabsTTSService {
	// Zero-valued options select every default (base URL, header name, header value).
	return NewElevenLabsTTSServiceWithOptionsFull(apiKey, voiceID, TTSOptions{})
}

// NewElevenLabsTTSServiceWithOptions creates a TTS service whose API root is baseURL.
// An empty baseURL selects DefaultElevenLabsBaseURL (ElevenLabs). For Phoenix, pass e.g.
// "https://phoenix.example.com/tts/v1"; "/text-to-speech/:id" and its "/stream" variant
// are appended to that root. Authentication uses the default header with apiKey.
func NewElevenLabsTTSServiceWithOptions(apiKey, voiceID, baseURL string) *ElevenLabsTTSService {
	opts := TTSOptions{BaseURL: baseURL}
	return NewElevenLabsTTSServiceWithOptionsFull(apiKey, voiceID, opts)
}

// NewElevenLabsTTSServiceWithOptionsFull creates a TTS service from the complete option
// set (base URL plus auth header name/value). Use it for Phoenix when authentication
// differs from ElevenLabs (e.g. "Authorization: Bearer <token>").
//
// Defaults applied for empty option fields:
//   - BaseURL (after trimming one trailing "/"): DefaultElevenLabsBaseURL
//   - AuthHeaderName: "xi-api-key"
//   - AuthHeaderValue: apiKey
//
// The service starts with model "eleven_multilingual_v2", an HTTP client with a
// 30 second timeout, and a default voice configuration (stability 0.5, similarity
// boost 0.75, speaker boost enabled).
func NewElevenLabsTTSServiceWithOptionsFull(apiKey, voiceID string, opts TTSOptions) *ElevenLabsTTSService {
	client := &http.Client{Timeout: 30 * time.Second}
	voiceDefaults := &VoiceConfig{Stability: 0.5, SimilarityBoost: 0.75, UseSpeakerBoost: true}

	// Start from the caller-supplied values, then backfill whatever was left empty.
	svc := &ElevenLabsTTSService{
		apiKey:             apiKey,
		voiceID:            voiceID,
		modelID:            "eleven_multilingual_v2",
		baseURL:            strings.TrimSuffix(opts.BaseURL, "/"),
		authHeaderName:     opts.AuthHeaderName,
		authHeaderValue:    opts.AuthHeaderValue,
		httpClient:         client,
		defaultVoiceConfig: voiceDefaults,
	}
	if svc.baseURL == "" {
		svc.baseURL = DefaultElevenLabsBaseURL
	}
	if svc.authHeaderName == "" {
		svc.authHeaderName = "xi-api-key"
	}
	if svc.authHeaderValue == "" {
		svc.authHeaderValue = apiKey
	}
	return svc
}

// SetModelID sets the model ID for synthesis.
// The value is sent as "model_id" in subsequent requests; no validation is performed.
func (s *ElevenLabsTTSService) SetModelID(modelID string) {
	s.modelID = modelID
}

// SetVoiceConfig sets the default voice configuration used when a synthesis call is
// made without an explicit config.
//
// A nil config is ignored and the current default is kept: the synthesis methods
// dereference the default configuration, so storing nil would make a later
// Synthesize/SynthesizeStream call panic.
// The pointer is stored as-is (not copied), so later changes made by the caller to
// *config are visible to the service.
func (s *ElevenLabsTTSService) SetVoiceConfig(config *VoiceConfig) {
	if config == nil {
		return
	}
	s.defaultVoiceConfig = config
}

// Synthesize converts text to audio through the non-streaming REST endpoint, using the
// service's default voice configuration. It returns the complete response body as bytes,
// or the error reported by SynthesizeWithConfig.
func (s *ElevenLabsTTSService) Synthesize(ctx context.Context, text string) ([]byte, error) {
	voice := s.defaultVoiceConfig
	return s.SynthesizeWithConfig(ctx, text, voice)
}

// SynthesizeWithConfig synthesizes text to audio with a custom voice configuration.
//
// It POSTs an ElevenLabsRequest to "{baseURL}/text-to-speech/{voiceID}" and returns the
// full response body (requested as audio/mpeg) on HTTP 200.
//
// Parameters:
//   - ctx: governs every HTTP attempt and the waits between attempts.
//   - text: text to synthesize; must be non-empty.
//   - config: voice settings; nil selects the service's default configuration.
//
// Errors are returned when auth, voice ID, text or voice configuration is missing, when
// the request cannot be built, when the API answers with a non-200 status (the response
// body is included in the message), or when ctx is cancelled.
//
// Transport errors and 5xx responses are retried, up to 3 attempts in total, with a
// linear backoff of 1s then 2s. Other statuses fail immediately.
func (s *ElevenLabsTTSService) SynthesizeWithConfig(ctx context.Context, text string, config *VoiceConfig) ([]byte, error) {
	if s.authHeaderValue == "" && s.apiKey == "" {
		return nil, fmt.Errorf("TTS API key or auth not configured")
	}
	if s.voiceID == "" {
		return nil, fmt.Errorf("ElevenLabs voice ID not configured")
	}
	if text == "" {
		return nil, fmt.Errorf("text cannot be empty")
	}

	// Use default config if none provided.
	if config == nil {
		config = s.defaultVoiceConfig
	}
	// The default itself may be nil (it is a plain pointer field); fail cleanly
	// instead of panicking on the dereference below.
	if config == nil {
		return nil, fmt.Errorf("TTS voice configuration not set")
	}

	jsonBody, err := json.Marshal(ElevenLabsRequest{
		Text:          text,
		ModelID:       s.modelID,
		VoiceSettings: *config,
	})
	if err != nil {
		return nil, fmt.Errorf("failed to marshal request: %w", err)
	}

	endpoint := fmt.Sprintf("%s/text-to-speech/%s", s.baseURL, s.voiceID)

	const maxRetries = 3
	for attempt := 1; ; attempt++ {
		// Build a fresh request for every attempt: the body reader of a request is
		// consumed by Do, so re-sending the same *http.Request would transmit an
		// empty/short body on retries.
		req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewReader(jsonBody))
		if err != nil {
			return nil, fmt.Errorf("failed to create request: %w", err)
		}
		req.Header.Set("Accept", "audio/mpeg")
		req.Header.Set("Content-Type", "application/json")
		if s.authHeaderValue != "" {
			req.Header.Set(s.authHeaderName, s.authHeaderValue)
		}

		resp, err := s.httpClient.Do(req)
		if err != nil {
			// A cancelled/expired context will fail every further attempt too.
			if ctx.Err() != nil {
				return nil, fmt.Errorf("ElevenLabs API call cancelled: %w", err)
			}
			if attempt >= maxRetries {
				return nil, fmt.Errorf("failed to call ElevenLabs API after %d retries: %w", maxRetries, err)
			}
		} else {
			// Always read and close the body so the connection can be reused,
			// whatever the status turns out to be.
			payload, readErr := io.ReadAll(resp.Body)
			resp.Body.Close()

			switch {
			case resp.StatusCode == http.StatusOK:
				if readErr != nil {
					return nil, fmt.Errorf("failed to read audio data: %w", readErr)
				}
				return payload, nil
			case resp.StatusCode < 500 || attempt >= maxRetries:
				// Non-5xx statuses are not retried; 5xx is final on the last attempt.
				return nil, fmt.Errorf("ElevenLabs API error: status %d, body: %s", resp.StatusCode, string(payload))
			}
		}

		// Linear backoff (1s, 2s) that is abandoned as soon as ctx is done.
		timer := time.NewTimer(time.Duration(attempt) * time.Second)
		select {
		case <-ctx.Done():
			timer.Stop()
			return nil, fmt.Errorf("ElevenLabs API retry cancelled: %w", ctx.Err())
		case <-timer.C:
		}
	}
}

// SynthesizeStream converts text to audio through the streaming endpoint, using the
// service's default voice configuration. It returns the reader produced by
// SynthesizeStreamWithConfig, or that method's error.
func (s *ElevenLabsTTSService) SynthesizeStream(ctx context.Context, text string) (io.Reader, error) {
	voice := s.defaultVoiceConfig
	return s.SynthesizeStreamWithConfig(ctx, text, voice)
}

// SynthesizeStreamWithConfig synthesizes text to an audio stream with a custom voice
// configuration.
//
// It POSTs an ElevenLabsRequest to "{baseURL}/text-to-speech/{voiceID}/stream" and, on
// HTTP 200, returns the response body without buffering it. The request is sent once;
// there is no retry.
//
// Parameters:
//   - ctx: governs the HTTP request.
//   - text: text to synthesize; must be non-empty.
//   - config: voice settings; nil selects the service's default configuration.
//
// The returned io.Reader is the HTTP response body, which also implements io.Closer.
// The caller must close it once done, e.g. `if c, ok := r.(io.Closer); ok { c.Close() }`,
// otherwise the underlying connection is leaked.
// NOTE(review): if the service's http.Client has a Timeout set, net/http applies it to
// reading the response body as well, which caps how long a stream can be consumed.
//
// Errors are returned when auth, voice ID, text or voice configuration is missing, when
// the request fails, or when the API answers with a non-200 status (the response body is
// included in the message).
func (s *ElevenLabsTTSService) SynthesizeStreamWithConfig(ctx context.Context, text string, config *VoiceConfig) (io.Reader, error) {
	if s.authHeaderValue == "" && s.apiKey == "" {
		return nil, fmt.Errorf("TTS API key or auth not configured")
	}
	if s.voiceID == "" {
		return nil, fmt.Errorf("ElevenLabs voice ID not configured")
	}
	if text == "" {
		return nil, fmt.Errorf("text cannot be empty")
	}

	// Use default config if none provided.
	if config == nil {
		config = s.defaultVoiceConfig
	}
	// The default itself may be nil (it is a plain pointer field); fail cleanly
	// instead of panicking on the dereference below.
	if config == nil {
		return nil, fmt.Errorf("TTS voice configuration not set")
	}

	jsonBody, err := json.Marshal(ElevenLabsRequest{
		Text:          text,
		ModelID:       s.modelID,
		VoiceSettings: *config,
	})
	if err != nil {
		return nil, fmt.Errorf("failed to marshal request: %w", err)
	}

	// Streaming variant of the synthesis endpoint.
	endpoint := fmt.Sprintf("%s/text-to-speech/%s/stream", s.baseURL, s.voiceID)

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewReader(jsonBody))
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}
	req.Header.Set("Accept", "audio/mpeg")
	req.Header.Set("Content-Type", "application/json")
	if s.authHeaderValue != "" {
		req.Header.Set(s.authHeaderName, s.authHeaderValue)
	}

	resp, err := s.httpClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to call ElevenLabs streaming API: %w", err)
	}

	if resp.StatusCode != http.StatusOK {
		// Best effort: the body only enriches the error message, so a read
		// failure here is deliberately ignored.
		bodyBytes, _ := io.ReadAll(resp.Body)
		resp.Body.Close()
		return nil, fmt.Errorf("TTS streaming API error: status %d, body: %s", resp.StatusCode, string(bodyBytes))
	}

	// Hand the open body to the caller, who is responsible for closing it.
	return resp.Body, nil
}

// Health checks connectivity to the TTS backend.
//
// When the service uses DefaultElevenLabsBaseURL the check is skipped and nil is
// returned (no health request is issued for ElevenLabs). Otherwise a GET is sent to the
// base URL with its last path segment replaced by "health": for a base of
// "https://phoenix.example.com/tts/v1" that is "https://phoenix.example.com/tts/health";
// a base with at most one path segment yields "/health". The configured auth header is
// attached when a value is set.
//
// It returns nil for any 2xx status, and an error when the base URL cannot be parsed,
// the request fails, or the status is outside 200-299.
func (s *ElevenLabsTTSService) Health(ctx context.Context) error {
	if s.baseURL == DefaultElevenLabsBaseURL {
		return nil // ElevenLabs has no public health; skip to avoid unnecessary calls
	}

	u, err := url.Parse(s.baseURL)
	if err != nil {
		return fmt.Errorf("TTS base URL invalid: %w", err)
	}
	// Sibling of the last path segment: ".../tts/v1" -> ".../tts/health".
	u.Path = path.Join(path.Dir(u.Path), "health")

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
	if err != nil {
		return fmt.Errorf("TTS health request invalid: %w", err)
	}
	if s.authHeaderValue != "" {
		req.Header.Set(s.authHeaderName, s.authHeaderValue)
	}

	resp, err := s.httpClient.Do(req)
	if err != nil {
		return fmt.Errorf("TTS health check failed: %w", err)
	}
	defer resp.Body.Close()
	// Drain a bounded amount of the body so the keep-alive connection can be reused;
	// the content is irrelevant and a read error here does not affect the verdict.
	_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, 4096))

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return fmt.Errorf("TTS health returned status %d", resp.StatusCode)
	}
	return nil
}

// GetVisemes returns viseme events for lip sync.
// No request is made to the TTS backend: the events are estimated locally from the text
// by generateVisemesFromText, a simplified word-based approximation. ctx is accepted for
// interface compatibility and is not used here.
// It returns an error when text is empty.
func (s *ElevenLabsTTSService) GetVisemes(ctx context.Context, text string) ([]VisemeEvent, error) {
	if len(text) == 0 {
		return nil, fmt.Errorf("text cannot be empty")
	}
	return s.generateVisemesFromText(text), nil
}

// generateVisemesFromText builds an approximate viseme timeline from plain text.
// This is a simplified implementation. For production, consider using:
// - A dedicated phoneme-to-viseme mapping service
// - A TTS provider that provides phoneme timing data (e.g., Azure TTS with SSML)
// - Integration with a speech analysis library
//
// The text is lower-cased and split on whitespace. The timeline is:
//   - a leading "sil" event of 0.1s,
//   - per word: one event of 0.3s followed by a "sil" gap of 0.05s,
//   - a trailing "sil" event of 0.1s.
//
// A word's viseme is chosen from its first byte only, so digraphs such as "th" or "sh"
// are classified by their first letter, and vowels and non-ASCII initials fall back to
// "aa". The Phoneme field of a word event carries the whole lower-cased word.
// Times are in seconds from the start of the utterance.
func (s *ElevenLabsTTSService) generateVisemesFromText(text string) []VisemeEvent {
	// Durations in seconds.
	const (
		initialPause    = 0.1
		durationPerWord = 0.3
		wordGap         = 0.05
		finalPause      = 0.1
	)

	words := strings.Fields(strings.ToLower(text))
	// Two events per word plus the leading and trailing silence.
	visemes := make([]VisemeEvent, 0, 2*len(words)+2)
	currentTime := 0.0

	// emit appends one event starting at the running clock and advances the clock.
	emit := func(viseme, phoneme string, duration float64) {
		visemes = append(visemes, VisemeEvent{
			Viseme:    viseme,
			StartTime: currentTime,
			EndTime:   currentTime + duration,
			Phoneme:   phoneme,
		})
		currentTime += duration
	}

	emit("sil", "sil", initialPause)

	for _, word := range words {
		// strings.Fields never yields an empty string, so word[0] is always valid.
		viseme := "aa" // default: vowels, b/p/m, d/t/n/l, k/g and anything unmapped
		switch word[0] {
		case 'f', 'v', 's', 'z', 'y':
			viseme = "ee"
		case 'w':
			viseme = "ou"
		case 'r':
			viseme = "er"
		case 'h':
			viseme = "sil"
		}

		emit(viseme, word, durationPerWord)
		// Small pause between words.
		emit("sil", "sil", wordGap)
	}

	emit("sil", "sil", finalPause)

	return visemes
}