Some checks failed
CI / build (push) Has been cancelled
Co-authored-by: Cursor <cursoragent@cursor.com>
400 lines
12 KiB
Go
400 lines
12 KiB
Go
package tts
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"net/url"
|
|
"path"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// ElevenLabsTTSService integrates with ElevenLabs TTS API or a Phoenix-compatible endpoint
|
|
type ElevenLabsTTSService struct {
|
|
apiKey string
|
|
voiceID string
|
|
modelID string
|
|
baseURL string
|
|
authHeaderName string // default "xi-api-key" when empty
|
|
authHeaderValue string
|
|
httpClient *http.Client
|
|
defaultVoiceConfig *VoiceConfig
|
|
}
|
|
|
|
// TTSOptions carries optional overrides applied when constructing the TTS
// service, e.g. to target a Phoenix deployment that authenticates differently.
type TTSOptions struct {
	// BaseURL is the API root, e.g. "https://phoenix.example.com/tts/v1".
	BaseURL string
	// AuthHeaderName is the header carrying the credential, e.g.
	// "Authorization"; empty = "xi-api-key".
	AuthHeaderName string
	// AuthHeaderValue is the credential sent in that header, e.g.
	// "Bearer token"; empty = apiKey.
	AuthHeaderValue string
}
|
|
|
|
// VoiceConfig holds the ElevenLabs voice settings that are serialized as the
// "voice_settings" object of a synthesis request.
type VoiceConfig struct {
	// Stability is always serialized.
	Stability float64 `json:"stability"`
	// SimilarityBoost is always serialized.
	SimilarityBoost float64 `json:"similarity_boost"`
	// Style is dropped from the JSON when it is 0 (omitempty).
	Style float64 `json:"style,omitempty"`
	// UseSpeakerBoost is dropped from the JSON when false (omitempty), so an
	// explicit false is never sent; the backend's own default then applies.
	UseSpeakerBoost bool `json:"use_speaker_boost,omitempty"`
}
|
|
|
|
// ElevenLabsRequest represents the request body for ElevenLabs API
|
|
type ElevenLabsRequest struct {
|
|
Text string `json:"text"`
|
|
ModelID string `json:"model_id,omitempty"`
|
|
VoiceSettings VoiceConfig `json:"voice_settings,omitempty"`
|
|
}
|
|
|
|
// DefaultElevenLabsBaseURL is the API root used when no base URL override is
// supplied; it points at the public ElevenLabs v1 API.
const (
	DefaultElevenLabsBaseURL = "https://api.elevenlabs.io/v1"
)
|
|
|
|
// NewElevenLabsTTSService creates a new TTS service for ElevenLabs or a Phoenix-hosted
|
|
// ElevenLabs-compatible API. Use baseURL "" for default (api.elevenlabs.io); set to
|
|
// your Phoenix TTS base (e.g. https://phoenix.example.com/tts/v1) to swap endpoint.
|
|
func NewElevenLabsTTSService(apiKey, voiceID string) *ElevenLabsTTSService {
|
|
return NewElevenLabsTTSServiceWithOptions(apiKey, voiceID, "")
|
|
}
|
|
|
|
// NewElevenLabsTTSServiceWithOptions creates a TTS service with a configurable base URL.
|
|
// baseURL: if empty, uses DefaultElevenLabsBaseURL (ElevenLabs). For Phoenix, use e.g.
|
|
// "https://phoenix.example.com/tts/v1" so that /text-to-speech/:id and /stream are used.
|
|
func NewElevenLabsTTSServiceWithOptions(apiKey, voiceID, baseURL string) *ElevenLabsTTSService {
|
|
return NewElevenLabsTTSServiceWithOptionsFull(apiKey, voiceID, TTSOptions{BaseURL: baseURL})
|
|
}
|
|
|
|
// NewElevenLabsTTSServiceWithOptionsFull creates a TTS service with full options (base URL, auth header).
|
|
// Use for Phoenix when auth differs from ElevenLabs (e.g. Authorization: Bearer <token>).
|
|
func NewElevenLabsTTSServiceWithOptionsFull(apiKey, voiceID string, opts TTSOptions) *ElevenLabsTTSService {
|
|
baseURL := strings.TrimSuffix(opts.BaseURL, "/")
|
|
if baseURL == "" {
|
|
baseURL = DefaultElevenLabsBaseURL
|
|
}
|
|
authName := opts.AuthHeaderName
|
|
if authName == "" {
|
|
authName = "xi-api-key"
|
|
}
|
|
authVal := opts.AuthHeaderValue
|
|
if authVal == "" {
|
|
authVal = apiKey
|
|
}
|
|
return &ElevenLabsTTSService{
|
|
apiKey: apiKey,
|
|
voiceID: voiceID,
|
|
modelID: "eleven_multilingual_v2",
|
|
baseURL: baseURL,
|
|
authHeaderName: authName,
|
|
authHeaderValue: authVal,
|
|
httpClient: &http.Client{
|
|
Timeout: 30 * time.Second,
|
|
},
|
|
defaultVoiceConfig: &VoiceConfig{
|
|
Stability: 0.5,
|
|
SimilarityBoost: 0.75,
|
|
UseSpeakerBoost: true,
|
|
},
|
|
}
|
|
}
|
|
|
|
// SetModelID sets the model ID for synthesis
|
|
func (s *ElevenLabsTTSService) SetModelID(modelID string) {
|
|
s.modelID = modelID
|
|
}
|
|
|
|
// SetVoiceConfig sets the default voice configuration
|
|
func (s *ElevenLabsTTSService) SetVoiceConfig(config *VoiceConfig) {
|
|
s.defaultVoiceConfig = config
|
|
}
|
|
|
|
// Synthesize synthesizes text to audio using ElevenLabs REST API
|
|
func (s *ElevenLabsTTSService) Synthesize(ctx context.Context, text string) ([]byte, error) {
|
|
return s.SynthesizeWithConfig(ctx, text, s.defaultVoiceConfig)
|
|
}
|
|
|
|
// SynthesizeWithConfig synthesizes text to audio with custom voice configuration
|
|
func (s *ElevenLabsTTSService) SynthesizeWithConfig(ctx context.Context, text string, config *VoiceConfig) ([]byte, error) {
|
|
if s.authHeaderValue == "" && s.apiKey == "" {
|
|
return nil, fmt.Errorf("TTS API key or auth not configured")
|
|
}
|
|
if s.voiceID == "" {
|
|
return nil, fmt.Errorf("ElevenLabs voice ID not configured")
|
|
}
|
|
if text == "" {
|
|
return nil, fmt.Errorf("text cannot be empty")
|
|
}
|
|
|
|
// Use default config if none provided
|
|
if config == nil {
|
|
config = s.defaultVoiceConfig
|
|
}
|
|
|
|
// Prepare request body
|
|
reqBody := ElevenLabsRequest{
|
|
Text: text,
|
|
ModelID: s.modelID,
|
|
VoiceSettings: *config,
|
|
}
|
|
|
|
jsonBody, err := json.Marshal(reqBody)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to marshal request: %w", err)
|
|
}
|
|
|
|
// Build request URL
|
|
url := fmt.Sprintf("%s/text-to-speech/%s", s.baseURL, s.voiceID)
|
|
|
|
// Create HTTP request
|
|
req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewBuffer(jsonBody))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create request: %w", err)
|
|
}
|
|
|
|
req.Header.Set("Accept", "audio/mpeg")
|
|
req.Header.Set("Content-Type", "application/json")
|
|
if s.authHeaderValue != "" {
|
|
req.Header.Set(s.authHeaderName, s.authHeaderValue)
|
|
}
|
|
// Execute request with retry logic
|
|
var resp *http.Response
|
|
maxRetries := 3
|
|
for i := 0; i < maxRetries; i++ {
|
|
resp, err = s.httpClient.Do(req)
|
|
if err == nil && resp.StatusCode == http.StatusOK {
|
|
break
|
|
}
|
|
|
|
if err != nil {
|
|
if i < maxRetries-1 {
|
|
// Exponential backoff
|
|
backoff := time.Duration(i+1) * time.Second
|
|
time.Sleep(backoff)
|
|
continue
|
|
}
|
|
return nil, fmt.Errorf("failed to call ElevenLabs API after %d retries: %w", maxRetries, err)
|
|
}
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
bodyBytes, _ := io.ReadAll(resp.Body)
|
|
resp.Body.Close()
|
|
// Retry on 5xx errors
|
|
if resp.StatusCode >= 500 && i < maxRetries-1 {
|
|
backoff := time.Duration(i+1) * time.Second
|
|
time.Sleep(backoff)
|
|
continue
|
|
}
|
|
|
|
return nil, fmt.Errorf("ElevenLabs API error: status %d, body: %s", resp.StatusCode, string(bodyBytes))
|
|
}
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
// Read audio data
|
|
audioData, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to read audio data: %w", err)
|
|
}
|
|
|
|
return audioData, nil
|
|
}
|
|
|
|
// SynthesizeStream synthesizes text to audio using ElevenLabs streaming API
|
|
func (s *ElevenLabsTTSService) SynthesizeStream(ctx context.Context, text string) (io.Reader, error) {
|
|
return s.SynthesizeStreamWithConfig(ctx, text, s.defaultVoiceConfig)
|
|
}
|
|
|
|
// SynthesizeStreamWithConfig synthesizes text to audio stream with custom voice configuration
|
|
func (s *ElevenLabsTTSService) SynthesizeStreamWithConfig(ctx context.Context, text string, config *VoiceConfig) (io.Reader, error) {
|
|
if s.authHeaderValue == "" && s.apiKey == "" {
|
|
return nil, fmt.Errorf("TTS API key or auth not configured")
|
|
}
|
|
if s.voiceID == "" {
|
|
return nil, fmt.Errorf("ElevenLabs voice ID not configured")
|
|
}
|
|
if text == "" {
|
|
return nil, fmt.Errorf("text cannot be empty")
|
|
}
|
|
|
|
// Use default config if none provided
|
|
if config == nil {
|
|
config = s.defaultVoiceConfig
|
|
}
|
|
|
|
// Prepare request body
|
|
reqBody := ElevenLabsRequest{
|
|
Text: text,
|
|
ModelID: s.modelID,
|
|
VoiceSettings: *config,
|
|
}
|
|
|
|
jsonBody, err := json.Marshal(reqBody)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to marshal request: %w", err)
|
|
}
|
|
|
|
// Build request URL for streaming
|
|
url := fmt.Sprintf("%s/text-to-speech/%s/stream", s.baseURL, s.voiceID)
|
|
|
|
// Create HTTP request
|
|
req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewBuffer(jsonBody))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create request: %w", err)
|
|
}
|
|
|
|
req.Header.Set("Accept", "audio/mpeg")
|
|
req.Header.Set("Content-Type", "application/json")
|
|
if s.authHeaderValue != "" {
|
|
req.Header.Set(s.authHeaderName, s.authHeaderValue)
|
|
}
|
|
// Execute request
|
|
resp, err := s.httpClient.Do(req)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to call ElevenLabs streaming API: %w", err)
|
|
}
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
bodyBytes, _ := io.ReadAll(resp.Body)
|
|
resp.Body.Close()
|
|
return nil, fmt.Errorf("TTS streaming API error: status %d, body: %s", resp.StatusCode, string(bodyBytes))
|
|
}
|
|
|
|
// Return stream reader (caller is responsible for closing)
|
|
return resp.Body, nil
|
|
}
|
|
|
|
// Health checks connectivity to the TTS backend. For Phoenix, expects GET {baseURL}/../health (or /health).
|
|
// For ElevenLabs (default base URL), this is a no-op and returns nil (no public health endpoint).
|
|
func (s *ElevenLabsTTSService) Health(ctx context.Context) error {
|
|
if s.baseURL == DefaultElevenLabsBaseURL {
|
|
return nil // ElevenLabs has no public health; skip to avoid unnecessary calls
|
|
}
|
|
u, err := url.Parse(s.baseURL)
|
|
if err != nil {
|
|
return fmt.Errorf("TTS base URL invalid: %w", err)
|
|
}
|
|
u.Path = path.Join(path.Dir(u.Path), "health")
|
|
req, err := http.NewRequestWithContext(ctx, "GET", u.String(), nil)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if s.authHeaderValue != "" {
|
|
req.Header.Set(s.authHeaderName, s.authHeaderValue)
|
|
}
|
|
resp, err := s.httpClient.Do(req)
|
|
if err != nil {
|
|
return fmt.Errorf("TTS health check failed: %w", err)
|
|
}
|
|
resp.Body.Close()
|
|
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
|
return fmt.Errorf("TTS health returned status %d", resp.StatusCode)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// GetVisemes returns viseme events for lip sync
|
|
// ElevenLabs doesn't provide viseme data directly, so we use phoneme-to-viseme mapping
|
|
func (s *ElevenLabsTTSService) GetVisemes(ctx context.Context, text string) ([]VisemeEvent, error) {
|
|
if text == "" {
|
|
return nil, fmt.Errorf("text cannot be empty")
|
|
}
|
|
|
|
// Use phoneme-to-viseme mapping to generate viseme events
|
|
// This is a simplified implementation - in production, you might want to use
|
|
// a more sophisticated phoneme-to-viseme mapping service or library
|
|
visemes := s.generateVisemesFromText(text)
|
|
|
|
return visemes, nil
|
|
}
|
|
|
|
// generateVisemesFromText generates viseme events from text using basic phoneme-to-viseme mapping
|
|
// This is a simplified implementation. For production, consider using:
|
|
// - A dedicated phoneme-to-viseme mapping service
|
|
// - A TTS provider that provides phoneme timing data (e.g., Azure TTS with SSML)
|
|
// - Integration with a speech analysis library
|
|
func (s *ElevenLabsTTSService) generateVisemesFromText(text string) []VisemeEvent {
|
|
// Basic phoneme-to-viseme mapping
|
|
phonemeToViseme := map[string]string{
|
|
// Vowels
|
|
"aa": "aa", "ae": "aa", "ah": "aa", "ao": "aa", "aw": "aa",
|
|
"ay": "aa", "eh": "ee", "er": "er", "ey": "ee", "ih": "ee",
|
|
"iy": "ee", "ow": "oh", "oy": "oh", "uh": "ou", "uw": "ou",
|
|
// Consonants
|
|
"b": "aa", "p": "aa", "m": "aa",
|
|
"f": "ee", "v": "ee",
|
|
"th": "ee",
|
|
"d": "aa", "t": "aa", "n": "aa", "l": "aa",
|
|
"k": "aa", "g": "aa", "ng": "aa",
|
|
"s": "ee", "z": "ee",
|
|
"sh": "ee", "zh": "ee", "ch": "ee", "jh": "ee",
|
|
"y": "ee",
|
|
"w": "ou",
|
|
"r": "er",
|
|
"h": "sil",
|
|
"sil": "sil", "sp": "sil",
|
|
}
|
|
|
|
// Simple word-to-phoneme approximation
|
|
// In production, use a proper TTS API that provides phoneme timing or a phoneme-to-viseme service
|
|
words := strings.Fields(strings.ToLower(text))
|
|
visemes := []VisemeEvent{}
|
|
currentTime := 0.0
|
|
durationPerWord := 0.3 // Approximate duration per word in seconds
|
|
initialPause := 0.1
|
|
|
|
// Initial silence
|
|
visemes = append(visemes, VisemeEvent{
|
|
Viseme: "sil",
|
|
StartTime: currentTime,
|
|
EndTime: currentTime + initialPause,
|
|
Phoneme: "sil",
|
|
})
|
|
currentTime += initialPause
|
|
|
|
// Generate visemes for each word
|
|
for _, word := range words {
|
|
// Simple approximation: map first phoneme to viseme
|
|
viseme := "aa" // default
|
|
if len(word) > 0 {
|
|
firstChar := string(word[0])
|
|
if mapped, ok := phonemeToViseme[firstChar]; ok {
|
|
viseme = mapped
|
|
} else {
|
|
// Map common starting consonants
|
|
switch firstChar {
|
|
case "a", "e", "i", "o", "u":
|
|
viseme = "aa"
|
|
default:
|
|
viseme = "aa"
|
|
}
|
|
}
|
|
}
|
|
|
|
visemes = append(visemes, VisemeEvent{
|
|
Viseme: viseme,
|
|
StartTime: currentTime,
|
|
EndTime: currentTime + durationPerWord,
|
|
Phoneme: word,
|
|
})
|
|
currentTime += durationPerWord
|
|
|
|
// Small pause between words
|
|
visemes = append(visemes, VisemeEvent{
|
|
Viseme: "sil",
|
|
StartTime: currentTime,
|
|
EndTime: currentTime + 0.05,
|
|
Phoneme: "sil",
|
|
})
|
|
currentTime += 0.05
|
|
}
|
|
|
|
// Final silence
|
|
visemes = append(visemes, VisemeEvent{
|
|
Viseme: "sil",
|
|
StartTime: currentTime,
|
|
EndTime: currentTime + 0.1,
|
|
Phoneme: "sil",
|
|
})
|
|
|
|
return visemes
|
|
}
|