Add full monorepo: virtual-banker, backend, frontend, docs, scripts, deployment

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-10 11:32:49 -08:00
commit b4753cef7e
81 changed files with 9255 additions and 0 deletions
--- a/backend/tts/elevenlabs-adapter.go
+++ b/backend/tts/elevenlabs-adapter.go
@@ -0,0 +1,329 @@
+package tts
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"strings"
+	"time"
+)
+
+// ElevenLabsTTSService integrates with the ElevenLabs TTS API.
+//
+// Fields:
+//   - apiKey: sent as the "xi-api-key" request header.
+//   - voiceID: voice identifier placed in the request URL path.
+//   - modelID: value sent as "model_id" in the request body.
+//   - baseURL: API root that endpoint paths are appended to.
+//   - httpClient: client used for every outgoing request.
+//   - defaultVoiceConfig: voice settings used when a call supplies none.
+type ElevenLabsTTSService struct {
+	apiKey             string
+	voiceID            string
+	modelID            string
+	baseURL            string
+	httpClient         *http.Client
+	defaultVoiceConfig *VoiceConfig
+}
+
+// VoiceConfig holds ElevenLabs voice configuration. It is serialized as the
+// "voice_settings" object of a text-to-speech request.
+//
+// Style is omitted from the payload when zero. UseSpeakerBoost is always
+// serialized: with omitempty an explicit false would be dropped from the JSON,
+// leaving the API to apply its own default and making it impossible to switch
+// speaker boost off.
+type VoiceConfig struct {
+	Stability       float64 `json:"stability"`
+	SimilarityBoost float64 `json:"similarity_boost"`
+	Style           float64 `json:"style,omitempty"`
+	UseSpeakerBoost bool    `json:"use_speaker_boost"`
+}
+
+// ElevenLabsRequest represents the JSON request body for the ElevenLabs
+// text-to-speech endpoints.
+//
+// ModelID is left out of the payload when empty. VoiceSettings is a struct
+// value and is therefore always serialized; encoding/json never treats a
+// struct as empty, so an omitempty option on it would have no effect.
+type ElevenLabsRequest struct {
+	Text          string      `json:"text"`
+	ModelID       string      `json:"model_id,omitempty"`
+	VoiceSettings VoiceConfig `json:"voice_settings"`
+}
+
+// NewElevenLabsTTSService builds a service for the given API key and voice ID.
+//
+// The returned service targets https://api.elevenlabs.io/v1, uses the
+// "eleven_multilingual_v2" model, an HTTP client with a 30 second timeout, and
+// default voice settings of stability 0.5, similarity boost 0.75 with speaker
+// boost enabled. Neither argument is validated here; the synthesis methods
+// reject empty values.
+func NewElevenLabsTTSService(apiKey, voiceID string) *ElevenLabsTTSService {
+	client := &http.Client{Timeout: 30 * time.Second}
+	voiceDefaults := &VoiceConfig{
+		Stability:       0.5,
+		SimilarityBoost: 0.75,
+		UseSpeakerBoost: true,
+	}
+
+	svc := new(ElevenLabsTTSService)
+	svc.apiKey = apiKey
+	svc.voiceID = voiceID
+	svc.modelID = "eleven_multilingual_v2" // Default model
+	svc.baseURL = "https://api.elevenlabs.io/v1"
+	svc.httpClient = client
+	svc.defaultVoiceConfig = voiceDefaults
+	return svc
+}
+
+// SetModelID sets the model ID for synthesis.
+//
+// The value replaces the model chosen by the constructor and is sent as
+// "model_id" in subsequent requests. It is stored as given, without
+// validation.
+func (s *ElevenLabsTTSService) SetModelID(modelID string) {
+	s.modelID = modelID
+}
+
+// SetVoiceConfig sets the default voice configuration used when a synthesis
+// call does not supply its own.
+//
+// A nil config is ignored and the current default is kept: the synthesis
+// methods dereference the default, so storing nil would make them panic.
+// The pointer is stored as-is, so later changes the caller makes through it
+// are visible to the service.
+func (s *ElevenLabsTTSService) SetVoiceConfig(config *VoiceConfig) {
+	if config == nil {
+		return
+	}
+	s.defaultVoiceConfig = config
+}
+
+// Synthesize synthesizes text to audio using the ElevenLabs REST API.
+//
+// It is shorthand for SynthesizeWithConfig using the service's default voice
+// configuration, and returns whatever that call returns.
+func (s *ElevenLabsTTSService) Synthesize(ctx context.Context, text string) ([]byte, error) {
+	return s.SynthesizeWithConfig(ctx, text, s.defaultVoiceConfig)
+}
+
+// SynthesizeWithConfig synthesizes text to audio with a custom voice
+// configuration.
+//
+// It POSTs text to the text-to-speech endpoint of the configured voice,
+// requesting audio/mpeg, and returns the complete response body. A nil config
+// falls back to the service default.
+//
+// Transport errors and 5xx responses are retried, for at most three attempts
+// in total, waiting 1s and then 2s between attempts. Any other non-200 status
+// is returned immediately together with the response body. Cancelling ctx
+// aborts both an in-flight request and the wait between attempts.
+//
+// An error is returned when the API key, voice ID, text or voice configuration
+// is missing, when the request cannot be built, or when all attempts fail.
+func (s *ElevenLabsTTSService) SynthesizeWithConfig(ctx context.Context, text string, config *VoiceConfig) ([]byte, error) {
+	if s.apiKey == "" {
+		return nil, fmt.Errorf("ElevenLabs API key not configured")
+	}
+	if s.voiceID == "" {
+		return nil, fmt.Errorf("ElevenLabs voice ID not configured")
+	}
+	if text == "" {
+		return nil, fmt.Errorf("text cannot be empty")
+	}
+
+	// Use default config if none provided
+	if config == nil {
+		config = s.defaultVoiceConfig
+	}
+	if config == nil {
+		// Neither the argument nor the service default is set; the
+		// dereference below would panic.
+		return nil, fmt.Errorf("ElevenLabs voice configuration not set")
+	}
+
+	jsonBody, err := json.Marshal(ElevenLabsRequest{
+		Text:          text,
+		ModelID:       s.modelID,
+		VoiceSettings: *config,
+	})
+	if err != nil {
+		return nil, fmt.Errorf("failed to marshal request: %w", err)
+	}
+
+	url := fmt.Sprintf("%s/text-to-speech/%s", s.baseURL, s.voiceID)
+
+	const maxRetries = 3
+	var lastErr error
+	for attempt := 0; attempt < maxRetries; attempt++ {
+		if attempt > 0 {
+			// Linear backoff (1s, then 2s) that still honours cancellation.
+			timer := time.NewTimer(time.Duration(attempt) * time.Second)
+			select {
+			case <-ctx.Done():
+				timer.Stop()
+				return nil, fmt.Errorf("ElevenLabs request cancelled while waiting to retry: %w", ctx.Err())
+			case <-timer.C:
+			}
+		}
+
+		// Build a fresh request for every attempt: the previous attempt has
+		// already consumed its body reader, so the same request cannot be
+		// sent again.
+		req, reqErr := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(jsonBody))
+		if reqErr != nil {
+			return nil, fmt.Errorf("failed to create request: %w", reqErr)
+		}
+		req.Header.Set("Accept", "audio/mpeg")
+		req.Header.Set("Content-Type", "application/json")
+		req.Header.Set("xi-api-key", s.apiKey)
+
+		resp, doErr := s.httpClient.Do(req)
+		if doErr != nil {
+			lastErr = fmt.Errorf("failed to call ElevenLabs API after %d retries: %w", maxRetries, doErr)
+			continue
+		}
+
+		// Read the body before closing it; reading after Close yields
+		// nothing useful. Closing here rather than with defer releases the
+		// connection before the next attempt.
+		body, readErr := io.ReadAll(resp.Body)
+		resp.Body.Close()
+
+		if resp.StatusCode == http.StatusOK {
+			if readErr != nil {
+				return nil, fmt.Errorf("failed to read audio data: %w", readErr)
+			}
+			return body, nil
+		}
+
+		// On an error status the body is informational only, so a failed
+		// read is tolerated and the status is reported regardless.
+		lastErr = fmt.Errorf("ElevenLabs API error: status %d, body: %s", resp.StatusCode, string(body))
+		if resp.StatusCode < 500 {
+			// Only 5xx responses are retried.
+			return nil, lastErr
+		}
+	}
+
+	return nil, lastErr
+}
+
+// SynthesizeStream synthesizes text to audio using the ElevenLabs streaming
+// API.
+//
+// It is shorthand for SynthesizeStreamWithConfig using the service's default
+// voice configuration, and returns whatever that call returns.
+func (s *ElevenLabsTTSService) SynthesizeStream(ctx context.Context, text string) (io.Reader, error) {
+	return s.SynthesizeStreamWithConfig(ctx, text, s.defaultVoiceConfig)
+}
+
+// SynthesizeStreamWithConfig synthesizes text to an audio stream with a
+// custom voice configuration.
+//
+// It POSTs text to the "/stream" text-to-speech endpoint of the configured
+// voice, requesting audio/mpeg, and on a 200 response returns the response
+// body unread. A nil config falls back to the service default. The request is
+// attempted once; there is no retry.
+//
+// The returned reader is the HTTP response body. The caller must close it when
+// done, by asserting the value to io.Closer, or the connection is leaked.
+//
+// NOTE: an http.Client Timeout also bounds the time spent reading the response
+// body, so with the 30s client built by the constructor a longer stream is cut
+// off.
+//
+// An error is returned when the API key, voice ID, text or voice configuration
+// is missing, when the request fails, or when the status is not 200 (the
+// response body is included in the message).
+func (s *ElevenLabsTTSService) SynthesizeStreamWithConfig(ctx context.Context, text string, config *VoiceConfig) (io.Reader, error) {
+	if s.apiKey == "" {
+		return nil, fmt.Errorf("ElevenLabs API key not configured")
+	}
+	if s.voiceID == "" {
+		return nil, fmt.Errorf("ElevenLabs voice ID not configured")
+	}
+	if text == "" {
+		return nil, fmt.Errorf("text cannot be empty")
+	}
+
+	// Use default config if none provided
+	if config == nil {
+		config = s.defaultVoiceConfig
+	}
+	if config == nil {
+		// Neither the argument nor the service default is set; the
+		// dereference below would panic.
+		return nil, fmt.Errorf("ElevenLabs voice configuration not set")
+	}
+
+	jsonBody, err := json.Marshal(ElevenLabsRequest{
+		Text:          text,
+		ModelID:       s.modelID,
+		VoiceSettings: *config,
+	})
+	if err != nil {
+		return nil, fmt.Errorf("failed to marshal request: %w", err)
+	}
+
+	// Build request URL for streaming
+	url := fmt.Sprintf("%s/text-to-speech/%s/stream", s.baseURL, s.voiceID)
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(jsonBody))
+	if err != nil {
+		return nil, fmt.Errorf("failed to create request: %w", err)
+	}
+
+	req.Header.Set("Accept", "audio/mpeg")
+	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("xi-api-key", s.apiKey)
+
+	resp, err := s.httpClient.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("failed to call ElevenLabs streaming API: %w", err)
+	}
+
+	if resp.StatusCode != http.StatusOK {
+		// Read the error payload before closing the body; reading after
+		// Close yields nothing useful. The payload is informational only, so
+		// a failed read is tolerated and the status is reported regardless.
+		bodyBytes, _ := io.ReadAll(resp.Body)
+		resp.Body.Close()
+		return nil, fmt.Errorf("ElevenLabs streaming API error: status %d, body: %s", resp.StatusCode, string(bodyBytes))
+	}
+
+	// Return stream reader (caller is responsible for closing)
+	return resp.Body, nil
+}
+
+// GetVisemes produces lip-sync viseme events for text.
+//
+// No request is made to ElevenLabs here: the events are derived locally from
+// the text by generateVisemesFromText, which is a simplified approximation.
+// ctx is accepted for interface compatibility and is not used. Empty text is
+// rejected with an error.
+func (s *ElevenLabsTTSService) GetVisemes(ctx context.Context, text string) ([]VisemeEvent, error) {
+	if len(text) == 0 {
+		return nil, fmt.Errorf("text cannot be empty")
+	}
+	return s.generateVisemesFromText(text), nil
+}
+
+// generateVisemesFromText generates viseme events from text using basic phoneme-to-viseme mapping
+// This is a simplified implementation. For production, consider using:
+// - A dedicated phoneme-to-viseme mapping service
+// - A TTS provider that provides phoneme timing data (e.g., Azure TTS with SSML)
+// - Integration with a speech analysis library
+func (s *ElevenLabsTTSService) generateVisemesFromText(text string) []VisemeEvent {
+	// Basic phoneme-to-viseme mapping
+	phonemeToViseme := map[string]string{
+		// Vowels
+		"aa": "aa", "ae": "aa", "ah": "aa", "ao": "aa", "aw": "aa",
+		"ay": "aa", "eh": "ee", "er": "er", "ey": "ee", "ih": "ee",
+		"iy": "ee", "ow": "oh", "oy": "oh", "uh": "ou", "uw": "ou",
+		// Consonants
+		"b": "aa", "p": "aa", "m": "aa",
+		"f": "ee", "v": "ee",
+		"th": "ee",
+		"d": "aa", "t": "aa", "n": "aa", "l": "aa",
+		"k": "aa", "g": "aa", "ng": "aa",
+		"s": "ee", "z": "ee",
+		"sh": "ee", "zh": "ee", "ch": "ee", "jh": "ee",
+		"y": "ee",
+		"w": "ou",
+		"r": "er",
+		"h": "sil",
+		"sil": "sil", "sp": "sil",
+	}
+
+	// Simple word-to-phoneme approximation
+	// In production, use a proper TTS API that provides phoneme timing or a phoneme-to-viseme service
+	words := strings.Fields(strings.ToLower(text))
+	visemes := []VisemeEvent{}
+	currentTime := 0.0
+	durationPerWord := 0.3 // Approximate duration per word in seconds
+	initialPause := 0.1
+
+	// Initial silence
+	visemes = append(visemes, VisemeEvent{
+		Viseme:    "sil",
+		StartTime: currentTime,
+		EndTime:   currentTime + initialPause,
+		Phoneme:   "sil",
+	})
+	currentTime += initialPause
+
+	// Generate visemes for each word
+	for _, word := range words {
+		// Simple approximation: map first phoneme to viseme
+		viseme := "aa" // default
+		if len(word) > 0 {
+			firstChar := string(word[0])
+			if mapped, ok := phonemeToViseme[firstChar]; ok {
+				viseme = mapped
+			} else {
+				// Map common starting consonants
+				switch firstChar {
+				case "a", "e", "i", "o", "u":
+					viseme = "aa"
+				default:
+					viseme = "aa"
+				}
+			}
+		}
+
+		visemes = append(visemes, VisemeEvent{
+			Viseme:    viseme,
+			StartTime: currentTime,
+			EndTime:   currentTime + durationPerWord,
+			Phoneme:   word,
+		})
+		currentTime += durationPerWord
+
+		// Small pause between words
+		visemes = append(visemes, VisemeEvent{
+			Viseme:    "sil",
+			StartTime: currentTime,
+			EndTime:   currentTime + 0.05,
+			Phoneme:   "sil",
+		})
+		currentTime += 0.05
+	}
+
+	// Final silence
+	visemes = append(visemes, VisemeEvent{
+		Viseme:    "sil",
+		StartTime: currentTime,
+		EndTime:   currentTime + 0.1,
+		Phoneme:   "sil",
+	})
+
+	return visemes
+}
--- a/backend/tts/service.go
+++ b/backend/tts/service.go
@@ -0,0 +1,58 @@
+package tts
+
+import (
+	"context"
+	"fmt"
+	"io"
+)
+
+// Service provides text-to-speech functionality.
+//
+// Two implementations exist in this package: MockTTSService for development
+// and ElevenLabsTTSService, which calls the ElevenLabs API.
+type Service interface {
+	// SynthesizeStream returns a reader over the synthesized audio for text.
+	SynthesizeStream(ctx context.Context, text string) (io.Reader, error)
+	// Synthesize returns the synthesized audio for text as a byte slice.
+	Synthesize(ctx context.Context, text string) ([]byte, error)
+	// GetVisemes returns the viseme events used to lip-sync text.
+	GetVisemes(ctx context.Context, text string) ([]VisemeEvent, error)
+}
+
+// VisemeEvent represents a viseme (lip shape) event for lip sync.
+//
+// Viseme names the lip shape and StartTime/EndTime bound the interval during
+// which it applies (the ElevenLabs adapter emits them in seconds from the
+// start of the utterance). Phoneme is optional and is left out of the JSON
+// when empty.
+type VisemeEvent struct {
+	Viseme    string  `json:"viseme"` // e.g., "sil", "aa", "ee", "oh", "ou"
+	StartTime float64 `json:"start_time"`
+	EndTime   float64 `json:"end_time"`
+	Phoneme   string  `json:"phoneme,omitempty"`
+}
+
+// MockTTSService is a mock implementation of Service for development.
+// It holds no state and returns fixed placeholder results.
+type MockTTSService struct{}
+
+// NewMockTTSService returns a ready-to-use mock TTS service. The mock has no
+// configuration, so the zero value is all there is to construct.
+func NewMockTTSService() *MockTTSService {
+	return new(MockTTSService)
+}
+
+// SynthesizeStream returns an empty audio stream regardless of text.
+//
+// Mock implementation - in production, integrate with ElevenLabs, Azure TTS,
+// etc. The returned reader reports io.EOF on the first Read and also
+// implements io.Closer with a no-op Close. ctx and text are ignored and the
+// error is always nil.
+func (s *MockTTSService) SynthesizeStream(ctx context.Context, text string) (io.Reader, error) {
+	// io.MultiReader with no sources is an always-empty reader. Wrapping a
+	// nil io.Reader instead would panic with a nil dereference as soon as the
+	// caller tried to read from it.
+	return io.NopCloser(io.MultiReader()), nil
+}
+
+// Synthesize is the mock counterpart of real synthesis: it ignores ctx and
+// text and always yields an empty, non-nil byte slice with a nil error.
+func (s *MockTTSService) Synthesize(ctx context.Context, text string) ([]byte, error) {
+	audio := make([]byte, 0)
+	return audio, nil
+}
+
+// GetVisemes returns a fixed three-event viseme sequence ("sil", "aa", "ee")
+// covering 0.0 to 0.5, regardless of ctx and text. The error is always nil.
+func (s *MockTTSService) GetVisemes(ctx context.Context, text string) ([]VisemeEvent, error) {
+	events := make([]VisemeEvent, 0, 3)
+	events = append(events,
+		VisemeEvent{Viseme: "sil", StartTime: 0.0, EndTime: 0.1},
+		VisemeEvent{Viseme: "aa", StartTime: 0.1, EndTime: 0.3},
+		VisemeEvent{Viseme: "ee", StartTime: 0.3, EndTime: 0.5},
+	)
+	return events, nil
+}
+
+// ElevenLabsTTSService, the ElevenLabs-backed implementation of Service, is
+// defined in elevenlabs-adapter.go. This file keeps only the Service
+// interface, the shared VisemeEvent type, and the development mock.
+