Add full monorepo: virtual-banker, backend, frontend, docs, scripts, deployment
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
329
backend/tts/elevenlabs-adapter.go
Normal file
329
backend/tts/elevenlabs-adapter.go
Normal file
@@ -0,0 +1,329 @@
|
||||
package tts
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ElevenLabsTTSService integrates with ElevenLabs TTS API
|
||||
type ElevenLabsTTSService struct {
|
||||
apiKey string
|
||||
voiceID string
|
||||
modelID string
|
||||
baseURL string
|
||||
httpClient *http.Client
|
||||
defaultVoiceConfig *VoiceConfig
|
||||
}
|
||||
|
||||
// VoiceConfig holds the ElevenLabs voice settings that are serialized into
// the "voice_settings" object of a synthesis request.
type VoiceConfig struct {
	Stability       float64 `json:"stability"`
	SimilarityBoost float64 `json:"similarity_boost"`

	// The fields below are tagged omitempty: a zero Style or a false
	// UseSpeakerBoost is left out of the encoded JSON entirely.
	Style           float64 `json:"style,omitempty"`
	UseSpeakerBoost bool    `json:"use_speaker_boost,omitempty"`
}
|
||||
|
||||
// ElevenLabsRequest represents the request body for ElevenLabs API
|
||||
type ElevenLabsRequest struct {
|
||||
Text string `json:"text"`
|
||||
ModelID string `json:"model_id,omitempty"`
|
||||
VoiceSettings VoiceConfig `json:"voice_settings,omitempty"`
|
||||
}
|
||||
|
||||
// NewElevenLabsTTSService creates a new ElevenLabs TTS service
|
||||
func NewElevenLabsTTSService(apiKey, voiceID string) *ElevenLabsTTSService {
|
||||
return &ElevenLabsTTSService{
|
||||
apiKey: apiKey,
|
||||
voiceID: voiceID,
|
||||
modelID: "eleven_multilingual_v2", // Default model
|
||||
baseURL: "https://api.elevenlabs.io/v1",
|
||||
httpClient: &http.Client{
|
||||
Timeout: 30 * time.Second,
|
||||
},
|
||||
defaultVoiceConfig: &VoiceConfig{
|
||||
Stability: 0.5,
|
||||
SimilarityBoost: 0.75,
|
||||
UseSpeakerBoost: true,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// SetModelID sets the model ID for synthesis
|
||||
func (s *ElevenLabsTTSService) SetModelID(modelID string) {
|
||||
s.modelID = modelID
|
||||
}
|
||||
|
||||
// SetVoiceConfig sets the default voice configuration
|
||||
func (s *ElevenLabsTTSService) SetVoiceConfig(config *VoiceConfig) {
|
||||
s.defaultVoiceConfig = config
|
||||
}
|
||||
|
||||
// Synthesize synthesizes text to audio using ElevenLabs REST API
|
||||
func (s *ElevenLabsTTSService) Synthesize(ctx context.Context, text string) ([]byte, error) {
|
||||
return s.SynthesizeWithConfig(ctx, text, s.defaultVoiceConfig)
|
||||
}
|
||||
|
||||
// SynthesizeWithConfig synthesizes text to audio with custom voice configuration
|
||||
func (s *ElevenLabsTTSService) SynthesizeWithConfig(ctx context.Context, text string, config *VoiceConfig) ([]byte, error) {
|
||||
if s.apiKey == "" {
|
||||
return nil, fmt.Errorf("ElevenLabs API key not configured")
|
||||
}
|
||||
if s.voiceID == "" {
|
||||
return nil, fmt.Errorf("ElevenLabs voice ID not configured")
|
||||
}
|
||||
if text == "" {
|
||||
return nil, fmt.Errorf("text cannot be empty")
|
||||
}
|
||||
|
||||
// Use default config if none provided
|
||||
if config == nil {
|
||||
config = s.defaultVoiceConfig
|
||||
}
|
||||
|
||||
// Prepare request body
|
||||
reqBody := ElevenLabsRequest{
|
||||
Text: text,
|
||||
ModelID: s.modelID,
|
||||
VoiceSettings: *config,
|
||||
}
|
||||
|
||||
jsonBody, err := json.Marshal(reqBody)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to marshal request: %w", err)
|
||||
}
|
||||
|
||||
// Build request URL
|
||||
url := fmt.Sprintf("%s/text-to-speech/%s", s.baseURL, s.voiceID)
|
||||
|
||||
// Create HTTP request
|
||||
req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewBuffer(jsonBody))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create request: %w", err)
|
||||
}
|
||||
|
||||
req.Header.Set("Accept", "audio/mpeg")
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
req.Header.Set("xi-api-key", s.apiKey)
|
||||
|
||||
// Execute request with retry logic
|
||||
var resp *http.Response
|
||||
maxRetries := 3
|
||||
for i := 0; i < maxRetries; i++ {
|
||||
resp, err = s.httpClient.Do(req)
|
||||
if err == nil && resp.StatusCode == http.StatusOK {
|
||||
break
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
if i < maxRetries-1 {
|
||||
// Exponential backoff
|
||||
backoff := time.Duration(i+1) * time.Second
|
||||
time.Sleep(backoff)
|
||||
continue
|
||||
}
|
||||
return nil, fmt.Errorf("failed to call ElevenLabs API after %d retries: %w", maxRetries, err)
|
||||
}
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
resp.Body.Close()
|
||||
bodyBytes, _ := io.ReadAll(bytes.NewReader([]byte{}))
|
||||
if resp.Body != nil {
|
||||
bodyBytes, _ = io.ReadAll(resp.Body)
|
||||
}
|
||||
|
||||
// Retry on 5xx errors
|
||||
if resp.StatusCode >= 500 && i < maxRetries-1 {
|
||||
backoff := time.Duration(i+1) * time.Second
|
||||
time.Sleep(backoff)
|
||||
continue
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("ElevenLabs API error: status %d, body: %s", resp.StatusCode, string(bodyBytes))
|
||||
}
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// Read audio data
|
||||
audioData, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read audio data: %w", err)
|
||||
}
|
||||
|
||||
return audioData, nil
|
||||
}
|
||||
|
||||
// SynthesizeStream synthesizes text to audio using ElevenLabs streaming API
|
||||
func (s *ElevenLabsTTSService) SynthesizeStream(ctx context.Context, text string) (io.Reader, error) {
|
||||
return s.SynthesizeStreamWithConfig(ctx, text, s.defaultVoiceConfig)
|
||||
}
|
||||
|
||||
// SynthesizeStreamWithConfig synthesizes text to audio stream with custom voice configuration
|
||||
func (s *ElevenLabsTTSService) SynthesizeStreamWithConfig(ctx context.Context, text string, config *VoiceConfig) (io.Reader, error) {
|
||||
if s.apiKey == "" {
|
||||
return nil, fmt.Errorf("ElevenLabs API key not configured")
|
||||
}
|
||||
if s.voiceID == "" {
|
||||
return nil, fmt.Errorf("ElevenLabs voice ID not configured")
|
||||
}
|
||||
if text == "" {
|
||||
return nil, fmt.Errorf("text cannot be empty")
|
||||
}
|
||||
|
||||
// Use default config if none provided
|
||||
if config == nil {
|
||||
config = s.defaultVoiceConfig
|
||||
}
|
||||
|
||||
// Prepare request body
|
||||
reqBody := ElevenLabsRequest{
|
||||
Text: text,
|
||||
ModelID: s.modelID,
|
||||
VoiceSettings: *config,
|
||||
}
|
||||
|
||||
jsonBody, err := json.Marshal(reqBody)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to marshal request: %w", err)
|
||||
}
|
||||
|
||||
// Build request URL for streaming
|
||||
url := fmt.Sprintf("%s/text-to-speech/%s/stream", s.baseURL, s.voiceID)
|
||||
|
||||
// Create HTTP request
|
||||
req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewBuffer(jsonBody))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create request: %w", err)
|
||||
}
|
||||
|
||||
req.Header.Set("Accept", "audio/mpeg")
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
req.Header.Set("xi-api-key", s.apiKey)
|
||||
|
||||
// Execute request
|
||||
resp, err := s.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to call ElevenLabs streaming API: %w", err)
|
||||
}
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
resp.Body.Close()
|
||||
bodyBytes, _ := io.ReadAll(resp.Body)
|
||||
return nil, fmt.Errorf("ElevenLabs streaming API error: status %d, body: %s", resp.StatusCode, string(bodyBytes))
|
||||
}
|
||||
|
||||
// Return stream reader (caller is responsible for closing)
|
||||
return resp.Body, nil
|
||||
}
|
||||
|
||||
// GetVisemes returns viseme events for lip sync
|
||||
// ElevenLabs doesn't provide viseme data directly, so we use phoneme-to-viseme mapping
|
||||
func (s *ElevenLabsTTSService) GetVisemes(ctx context.Context, text string) ([]VisemeEvent, error) {
|
||||
if text == "" {
|
||||
return nil, fmt.Errorf("text cannot be empty")
|
||||
}
|
||||
|
||||
// Use phoneme-to-viseme mapping to generate viseme events
|
||||
// This is a simplified implementation - in production, you might want to use
|
||||
// a more sophisticated phoneme-to-viseme mapping service or library
|
||||
visemes := s.generateVisemesFromText(text)
|
||||
|
||||
return visemes, nil
|
||||
}
|
||||
|
||||
// generateVisemesFromText generates viseme events from text using basic phoneme-to-viseme mapping
|
||||
// This is a simplified implementation. For production, consider using:
|
||||
// - A dedicated phoneme-to-viseme mapping service
|
||||
// - A TTS provider that provides phoneme timing data (e.g., Azure TTS with SSML)
|
||||
// - Integration with a speech analysis library
|
||||
func (s *ElevenLabsTTSService) generateVisemesFromText(text string) []VisemeEvent {
|
||||
// Basic phoneme-to-viseme mapping
|
||||
phonemeToViseme := map[string]string{
|
||||
// Vowels
|
||||
"aa": "aa", "ae": "aa", "ah": "aa", "ao": "aa", "aw": "aa",
|
||||
"ay": "aa", "eh": "ee", "er": "er", "ey": "ee", "ih": "ee",
|
||||
"iy": "ee", "ow": "oh", "oy": "oh", "uh": "ou", "uw": "ou",
|
||||
// Consonants
|
||||
"b": "aa", "p": "aa", "m": "aa",
|
||||
"f": "ee", "v": "ee",
|
||||
"th": "ee",
|
||||
"d": "aa", "t": "aa", "n": "aa", "l": "aa",
|
||||
"k": "aa", "g": "aa", "ng": "aa",
|
||||
"s": "ee", "z": "ee",
|
||||
"sh": "ee", "zh": "ee", "ch": "ee", "jh": "ee",
|
||||
"y": "ee",
|
||||
"w": "ou",
|
||||
"r": "er",
|
||||
"h": "sil",
|
||||
"sil": "sil", "sp": "sil",
|
||||
}
|
||||
|
||||
// Simple word-to-phoneme approximation
|
||||
// In production, use a proper TTS API that provides phoneme timing or a phoneme-to-viseme service
|
||||
words := strings.Fields(strings.ToLower(text))
|
||||
visemes := []VisemeEvent{}
|
||||
currentTime := 0.0
|
||||
durationPerWord := 0.3 // Approximate duration per word in seconds
|
||||
initialPause := 0.1
|
||||
|
||||
// Initial silence
|
||||
visemes = append(visemes, VisemeEvent{
|
||||
Viseme: "sil",
|
||||
StartTime: currentTime,
|
||||
EndTime: currentTime + initialPause,
|
||||
Phoneme: "sil",
|
||||
})
|
||||
currentTime += initialPause
|
||||
|
||||
// Generate visemes for each word
|
||||
for _, word := range words {
|
||||
// Simple approximation: map first phoneme to viseme
|
||||
viseme := "aa" // default
|
||||
if len(word) > 0 {
|
||||
firstChar := string(word[0])
|
||||
if mapped, ok := phonemeToViseme[firstChar]; ok {
|
||||
viseme = mapped
|
||||
} else {
|
||||
// Map common starting consonants
|
||||
switch firstChar {
|
||||
case "a", "e", "i", "o", "u":
|
||||
viseme = "aa"
|
||||
default:
|
||||
viseme = "aa"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
visemes = append(visemes, VisemeEvent{
|
||||
Viseme: viseme,
|
||||
StartTime: currentTime,
|
||||
EndTime: currentTime + durationPerWord,
|
||||
Phoneme: word,
|
||||
})
|
||||
currentTime += durationPerWord
|
||||
|
||||
// Small pause between words
|
||||
visemes = append(visemes, VisemeEvent{
|
||||
Viseme: "sil",
|
||||
StartTime: currentTime,
|
||||
EndTime: currentTime + 0.05,
|
||||
Phoneme: "sil",
|
||||
})
|
||||
currentTime += 0.05
|
||||
}
|
||||
|
||||
// Final silence
|
||||
visemes = append(visemes, VisemeEvent{
|
||||
Viseme: "sil",
|
||||
StartTime: currentTime,
|
||||
EndTime: currentTime + 0.1,
|
||||
Phoneme: "sil",
|
||||
})
|
||||
|
||||
return visemes
|
||||
}
|
||||
58
backend/tts/service.go
Normal file
58
backend/tts/service.go
Normal file
@@ -0,0 +1,58 @@
|
||||
package tts
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
)
|
||||
|
||||
// Service provides text-to-speech functionality
|
||||
type Service interface {
|
||||
SynthesizeStream(ctx context.Context, text string) (io.Reader, error)
|
||||
Synthesize(ctx context.Context, text string) ([]byte, error)
|
||||
GetVisemes(ctx context.Context, text string) ([]VisemeEvent, error)
|
||||
}
|
||||
|
||||
// VisemeEvent describes one viseme (lip shape) and the time span during
// which it applies, for driving lip sync.
type VisemeEvent struct {
	// Viseme is the lip-shape identifier, e.g., "sil", "aa", "ee", "oh", "ou".
	Viseme string `json:"viseme"`

	// StartTime and EndTime delimit the event on the audio timeline.
	StartTime float64 `json:"start_time"`
	EndTime   float64 `json:"end_time"`

	// Phoneme is optional and is omitted from the JSON when empty.
	Phoneme string `json:"phoneme,omitempty"`
}
|
||||
|
||||
// MockTTSService is a mock implementation of the TTS service for
// development. It performs no synthesis and holds no state.
type MockTTSService struct{}

// NewMockTTSService creates a new mock TTS service.
func NewMockTTSService() *MockTTSService {
	return &MockTTSService{}
}

// SynthesizeStream returns an empty audio stream and a nil error; ctx and
// text are ignored.
//
// Mock implementation - in production, integrate with ElevenLabs, Azure TTS,
// etc. The returned reader reports io.EOF on the first Read and also
// implements io.Closer (Close is a no-op).
func (s *MockTTSService) SynthesizeStream(ctx context.Context, text string) (io.Reader, error) {
	// io.MultiReader with no sources is an always-EOF reader. Wrapping a nil
	// io.Reader instead would panic as soon as the caller reads from it.
	return io.NopCloser(io.MultiReader()), nil
}
|
||||
|
||||
// Synthesize synthesizes text to audio
|
||||
func (s *MockTTSService) Synthesize(ctx context.Context, text string) ([]byte, error) {
|
||||
// Mock implementation
|
||||
return []byte{}, nil
|
||||
}
|
||||
|
||||
// GetVisemes returns viseme events for lip sync
|
||||
func (s *MockTTSService) GetVisemes(ctx context.Context, text string) ([]VisemeEvent, error) {
|
||||
// Mock implementation - return basic visemes
|
||||
return []VisemeEvent{
|
||||
{Viseme: "sil", StartTime: 0.0, EndTime: 0.1},
|
||||
{Viseme: "aa", StartTime: 0.1, EndTime: 0.3},
|
||||
{Viseme: "ee", StartTime: 0.3, EndTime: 0.5},
|
||||
}, nil
|
||||
}
|
||||
|
||||
// ElevenLabsTTSService integrates with ElevenLabs. Its actual implementation
// lives in elevenlabs-adapter.go; this note is kept only as a pointer for
// backwards compatibility (no type or interface is defined here).
|
||||
|
||||
Reference in New Issue
Block a user