From 9839401d1da5e8a07a3457157f854fa7c00a6298 Mon Sep 17 00:00:00 2001 From: defiQUG Date: Tue, 10 Feb 2026 16:54:10 -0800 Subject: [PATCH] TTS: configurable auth, Health check, Phoenix options; .env.example; Gitea CI workflow Co-authored-by: Cursor --- .env.example | 21 +++++ .gitea/workflows/ci.yml | 23 ++++++ README.md | 4 + backend/main.go | 26 ++++++- backend/observability/tracing.go | 1 - backend/tts/README.md | 86 +++++++++++++++++++++ backend/tts/elevenlabs-adapter.go | 122 +++++++++++++++++++++++------- backend/tts/service.go | 6 +- 8 files changed, 259 insertions(+), 30 deletions(-) create mode 100644 .env.example create mode 100644 .gitea/workflows/ci.yml create mode 100644 backend/tts/README.md diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..ac4d75a --- /dev/null +++ b/.env.example @@ -0,0 +1,21 @@ +# Virtual Banker — example environment (copy to .env and set values) +# Do not commit .env; use secrets in CI/production. + +# Database and Redis +DATABASE_URL=postgres://user:pass@localhost:5432/virtual_banker?sslmode=disable +REDIS_URL=redis://localhost:6379 +PORT=8081 + +# TTS: ElevenLabs (default) or Phoenix +# Leave unset to use mock TTS. Set TTS_VOICE_ID + one of the keys for real TTS. 
+TTS_VOICE_ID= +TTS_API_KEY= +# ELEVENLABS_API_KEY= # alternative to TTS_API_KEY +# ELEVENLABS_VOICE_ID= # alternative to TTS_VOICE_ID + +# Phoenix / custom TTS endpoint (optional) +# TTS_BASE_URL=https://phoenix.example.com/tts/v1 +# TTS_AUTH_HEADER_NAME=Authorization +# TTS_AUTH_HEADER_VALUE=Bearer your-token +# USE_PHOENIX_TTS=true +# PHOENIX_TTS_BASE_URL=https://phoenix.example.com/tts/v1 diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml new file mode 100644 index 0000000..8b70825 --- /dev/null +++ b/.gitea/workflows/ci.yml @@ -0,0 +1,23 @@ +# Gitea Actions: build and test virtual-banker backend on push +name: CI +on: + push: + branches: [master, main] + pull_request: + branches: [master, main] +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: '1.21' + - name: Build + run: go build ./... + working-directory: backend + - name: Test + run: go test ./... + working-directory: backend diff --git a/README.md b/README.md index f0761ea..1a8c7b7 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,10 @@ virtual-banker/ - PostgreSQL 16+ with pgvector extension - Redis +### Configuration + +- Copy `.env.example` to `.env` and set `DATABASE_URL`, `REDIS_URL`, and optionally TTS vars (`TTS_BASE_URL`, `TTS_API_KEY`, `TTS_VOICE_ID`) for ElevenLabs or Phoenix. See `backend/tts/README.md` for TTS backend selection and Phoenix endpoint swap. + ### Development Setup 1. 
**Start infrastructure** (uses existing postgres/redis from main monorepo): diff --git a/backend/main.go b/backend/main.go index a00983c..0bed28b 100644 --- a/backend/main.go +++ b/backend/main.go @@ -55,9 +55,9 @@ func main() { // Initialize services sessionManager := session.NewManager(db, redisClient) - // Initialize ASR/TTS (using mocks for now) + // Initialize ASR/TTS asrService := asr.NewMockASRService() - ttsService := tts.NewMockTTSService() + ttsService := newTTSService() // Initialize LLM (using mock for now) llmGateway := llm.NewMockLLMGateway() @@ -128,6 +128,28 @@ func main() { log.Println("Server exited") } +// newTTSService returns a TTS service from env: use real API when TTS_API_KEY (or +// ELEVENLABS_API_KEY) and TTS_VOICE_ID are set. Optional: TTS_BASE_URL (Phoenix), +// TTS_AUTH_HEADER_NAME / TTS_AUTH_HEADER_VALUE (e.g. Authorization: Bearer), +// USE_PHOENIX_TTS=true to require TTS_BASE_URL. +func newTTSService() tts.Service { + apiKey := getEnv("TTS_API_KEY", os.Getenv("ELEVENLABS_API_KEY")) + voiceID := getEnv("TTS_VOICE_ID", os.Getenv("ELEVENLABS_VOICE_ID")) + baseURL := getEnv("TTS_BASE_URL", "") + authName := getEnv("TTS_AUTH_HEADER_NAME", "") + authValue := getEnv("TTS_AUTH_HEADER_VALUE", "") + usePhoenix := getEnv("USE_PHOENIX_TTS", "") == "true" || getEnv("USE_PHOENIX_TTS", "") == "1" + if usePhoenix && baseURL == "" { + baseURL = getEnv("PHOENIX_TTS_BASE_URL", "https://phoenix.example.com/tts/v1") + } + hasAuth := apiKey != "" || authValue != "" + if hasAuth && voiceID != "" { + opts := tts.TTSOptions{BaseURL: baseURL, AuthHeaderName: authName, AuthHeaderValue: authValue} + return tts.NewElevenLabsTTSServiceWithOptionsFull(apiKey, voiceID, opts) + } + return tts.NewMockTTSService() +} + func getEnv(key, defaultValue string) string { if value := os.Getenv(key); value != "" { return value diff --git a/backend/observability/tracing.go b/backend/observability/tracing.go index 73b0519..34b421b 100644 --- a/backend/observability/tracing.go 
+++ b/backend/observability/tracing.go @@ -2,7 +2,6 @@ package observability import ( "context" - "fmt" ) // Tracer provides distributed tracing diff --git a/backend/tts/README.md b/backend/tts/README.md new file mode 100644 index 0000000..747d671 --- /dev/null +++ b/backend/tts/README.md @@ -0,0 +1,86 @@ +# TTS package — ElevenLabs-compatible, Phoenix endpoint swap + +This package provides a **text-to-speech client** that matches the [ElevenLabs TTS API](https://elevenlabs.io/docs/api-reference/text-to-speech) contract. You can point it at **ElevenLabs** or at a **Phoenix-hosted** TTS service that implements the same API shape; switching is a config change (base URL), no code change. + +**Note:** The repo [eleven-labs/api-service](https://github.com/eleven-labs/api-service) on GitHub is a PHP OpenAPI consumer library, not the voice TTS API. This client targets the **REST TTS API** at `api.elevenlabs.io` (and compatible backends). + +--- + +## Parity with ElevenLabs TTS API + +| Feature | ElevenLabs API | This client | +|--------|----------------|-------------| +| **Sync** `POST /v1/text-to-speech/:voice_id` | ✅ | ✅ `Synthesize` | +| **Stream** `POST /v1/text-to-speech/:voice_id/stream` | ✅ | ✅ `SynthesizeStream` | +| **Voice settings** (stability, similarity_boost, style, speaker_boost) | ✅ | ✅ `VoiceConfig` | +| **Model** (`model_id`) | ✅ | ✅ `SetModelID` / default `eleven_multilingual_v2` | +| **Auth** `xi-api-key` header | ✅ | ✅ | +| **Output** `Accept: audio/mpeg` (mp3) | ✅ | ✅ | +| **Retries** (5xx, backoff) | — | ✅ on sync | +| **Visemes** (lip sync) | ❌ (no phoneme API) | ✅ client-side approximation | + +Optional ElevenLabs features not used here: `output_format` query, `optimize_streaming_latency`, WebSocket streaming. For “just change endpoint” to Phoenix, the host only needs to implement the same **sync + stream** JSON body and return **audio/mpeg**. + +--- + +## Which TTS backend? 
(decision table) + +| Env / condition | Backend used | +|----------------|--------------| +| `TTS_VOICE_ID` unset (or no auth) | **Mock** (no real synthesis) | +| `TTS_VOICE_ID` + `TTS_API_KEY` or `ELEVENLABS_*` set, `TTS_BASE_URL` unset | **ElevenLabs** (api.elevenlabs.io) | +| `TTS_BASE_URL` set (e.g. Phoenix) + auth + voice | **Phoenix** (or other compatible host) | +| `USE_PHOENIX_TTS=true` | Prefer Phoenix; use `TTS_BASE_URL` or `PHOENIX_TTS_BASE_URL` | + +Auth: default header is `xi-api-key` (ElevenLabs). For Phoenix with Bearer token set `TTS_AUTH_HEADER_NAME=Authorization` and `TTS_AUTH_HEADER_VALUE=Bearer <token>`. + +--- + +## Using with Phoenix (swap endpoint) + +1. **Phoenix TTS service** must expose the same contract: + - `POST /v1/text-to-speech/:voice_id` — body: `{"text","model_id","voice_settings"}` → response: raw mp3 + - `POST /v1/text-to-speech/:voice_id/stream` — same body → response: streaming mp3 + - **Health:** `GET /health` at the same origin (e.g. `{baseURL}/../health`) returning 2xx so `tts.Service.Health(ctx)` can be used for readiness. + +2. **Configure the app** with the Phoenix base URL (and optional auth): + + ```bash + export TTS_BASE_URL="https://phoenix.example.com/tts/v1" + export TTS_VOICE_ID="default-voice-id" + # Optional: Phoenix uses Bearer token + export TTS_AUTH_HEADER_NAME="Authorization" + export TTS_AUTH_HEADER_VALUE="Bearer your-token" + # Or feature flag to force Phoenix + export USE_PHOENIX_TTS=true + export PHOENIX_TTS_BASE_URL="https://phoenix.example.com/tts/v1" + ``` + +3. **Health check:** The client’s `Health(ctx)` calls `GET {baseURL}/../health` when base URL is not ElevenLabs. Wire this into your readiness probe or a `/ready` endpoint if you need TTS to be up before accepting traffic. + +4. **In code** (e.g. 
for reuse in another project): + + ```go + opts := tts.TTSOptions{ + BaseURL: "https://phoenix.example.com/tts/v1", + AuthHeaderName: "Authorization", + AuthHeaderValue: "Bearer token", + } + svc := tts.NewElevenLabsTTSServiceWithOptionsFull(apiKey, voiceID, opts) + if err := svc.Health(ctx); err != nil { /* not ready */ } + audio, err := svc.Synthesize(ctx, "Hello world") + ``` + +No code change beyond config: same interface, different base URL and optional auth header. + +--- + +## Reuse across projects + +This package lives in **virtual-banker** and can be depended on as a Go module path (e.g. `github.com/your-org/virtual-banker/backend/tts` or via a shared repo). Any project that needs TTS can: + +- Depend on this package. +- Use `tts.Service` and either `NewMockTTSService()` or `NewElevenLabsTTSServiceWithOptions(apiKey, voiceID, baseURL)` / `NewElevenLabsTTSServiceWithOptionsFull(apiKey, voiceID, opts)` for custom auth. +- Set `baseURL` to ElevenLabs (`""` or `https://api.elevenlabs.io/v1`) or to the Phoenix TTS base URL. + +The **interface** (`Synthesize`, `SynthesizeStream`, `GetVisemes`, `Health`) stays the same regardless of backend. 
diff --git a/backend/tts/elevenlabs-adapter.go b/backend/tts/elevenlabs-adapter.go index 2fb2948..db75ed8 100644 --- a/backend/tts/elevenlabs-adapter.go +++ b/backend/tts/elevenlabs-adapter.go @@ -7,20 +7,31 @@ import ( "fmt" "io" "net/http" + "net/url" + "path" "strings" "time" ) -// ElevenLabsTTSService integrates with ElevenLabs TTS API +// ElevenLabsTTSService integrates with ElevenLabs TTS API or a Phoenix-compatible endpoint type ElevenLabsTTSService struct { - apiKey string - voiceID string - modelID string - baseURL string - httpClient *http.Client + apiKey string + voiceID string + modelID string + baseURL string + authHeaderName string // default "xi-api-key" when empty + authHeaderValue string + httpClient *http.Client defaultVoiceConfig *VoiceConfig } +// TTSOptions allows optional overrides when creating the TTS service (e.g. Phoenix auth) +type TTSOptions struct { + BaseURL string // e.g. "https://phoenix.example.com/tts/v1" + AuthHeaderName string // e.g. "Authorization"; empty = "xi-api-key" + AuthHeaderValue string // e.g. "Bearer token"; empty = apiKey +} + // VoiceConfig holds ElevenLabs voice configuration type VoiceConfig struct { Stability float64 `json:"stability"` @@ -36,13 +47,45 @@ type ElevenLabsRequest struct { VoiceSettings VoiceConfig `json:"voice_settings,omitempty"` } -// NewElevenLabsTTSService creates a new ElevenLabs TTS service +// DefaultElevenLabsBaseURL is the default TTS API base (ElevenLabs or Phoenix-compatible). +const DefaultElevenLabsBaseURL = "https://api.elevenlabs.io/v1" + +// NewElevenLabsTTSService creates a new TTS service for ElevenLabs or a Phoenix-hosted +// ElevenLabs-compatible API. Use baseURL "" for default (api.elevenlabs.io); set to +// your Phoenix TTS base (e.g. https://phoenix.example.com/tts/v1) to swap endpoint. 
func NewElevenLabsTTSService(apiKey, voiceID string) *ElevenLabsTTSService { + return NewElevenLabsTTSServiceWithOptions(apiKey, voiceID, "") +} + +// NewElevenLabsTTSServiceWithOptions creates a TTS service with a configurable base URL. +// baseURL: if empty, uses DefaultElevenLabsBaseURL (ElevenLabs). For Phoenix, use e.g. +// "https://phoenix.example.com/tts/v1" so that /text-to-speech/:id and /stream are used. +func NewElevenLabsTTSServiceWithOptions(apiKey, voiceID, baseURL string) *ElevenLabsTTSService { + return NewElevenLabsTTSServiceWithOptionsFull(apiKey, voiceID, TTSOptions{BaseURL: baseURL}) +} + +// NewElevenLabsTTSServiceWithOptionsFull creates a TTS service with full options (base URL, auth header). +// Use for Phoenix when auth differs from ElevenLabs (e.g. Authorization: Bearer ). +func NewElevenLabsTTSServiceWithOptionsFull(apiKey, voiceID string, opts TTSOptions) *ElevenLabsTTSService { + baseURL := strings.TrimSuffix(opts.BaseURL, "/") + if baseURL == "" { + baseURL = DefaultElevenLabsBaseURL + } + authName := opts.AuthHeaderName + if authName == "" { + authName = "xi-api-key" + } + authVal := opts.AuthHeaderValue + if authVal == "" { + authVal = apiKey + } return &ElevenLabsTTSService{ - apiKey: apiKey, - voiceID: voiceID, - modelID: "eleven_multilingual_v2", // Default model - baseURL: "https://api.elevenlabs.io/v1", + apiKey: apiKey, + voiceID: voiceID, + modelID: "eleven_multilingual_v2", + baseURL: baseURL, + authHeaderName: authName, + authHeaderValue: authVal, httpClient: &http.Client{ Timeout: 30 * time.Second, }, @@ -71,8 +114,8 @@ func (s *ElevenLabsTTSService) Synthesize(ctx context.Context, text string) ([]b // SynthesizeWithConfig synthesizes text to audio with custom voice configuration func (s *ElevenLabsTTSService) SynthesizeWithConfig(ctx context.Context, text string, config *VoiceConfig) ([]byte, error) { - if s.apiKey == "" { - return nil, fmt.Errorf("ElevenLabs API key not configured") + if s.authHeaderValue == "" && s.apiKey 
== "" { + return nil, fmt.Errorf("TTS API key or auth not configured") } if s.voiceID == "" { return nil, fmt.Errorf("ElevenLabs voice ID not configured") @@ -109,8 +152,9 @@ func (s *ElevenLabsTTSService) SynthesizeWithConfig(ctx context.Context, text st req.Header.Set("Accept", "audio/mpeg") req.Header.Set("Content-Type", "application/json") - req.Header.Set("xi-api-key", s.apiKey) - + if s.authHeaderValue != "" { + req.Header.Set(s.authHeaderName, s.authHeaderValue) + } // Execute request with retry logic var resp *http.Response maxRetries := 3 @@ -131,12 +175,8 @@ func (s *ElevenLabsTTSService) SynthesizeWithConfig(ctx context.Context, text st } if resp.StatusCode != http.StatusOK { + bodyBytes, _ := io.ReadAll(resp.Body) resp.Body.Close() - bodyBytes, _ := io.ReadAll(bytes.NewReader([]byte{})) - if resp.Body != nil { - bodyBytes, _ = io.ReadAll(resp.Body) - } - // Retry on 5xx errors if resp.StatusCode >= 500 && i < maxRetries-1 { backoff := time.Duration(i+1) * time.Second @@ -165,8 +205,8 @@ func (s *ElevenLabsTTSService) SynthesizeStream(ctx context.Context, text string // SynthesizeStreamWithConfig synthesizes text to audio stream with custom voice configuration func (s *ElevenLabsTTSService) SynthesizeStreamWithConfig(ctx context.Context, text string, config *VoiceConfig) (io.Reader, error) { - if s.apiKey == "" { - return nil, fmt.Errorf("ElevenLabs API key not configured") + if s.authHeaderValue == "" && s.apiKey == "" { + return nil, fmt.Errorf("TTS API key or auth not configured") } if s.voiceID == "" { return nil, fmt.Errorf("ElevenLabs voice ID not configured") @@ -203,8 +243,9 @@ func (s *ElevenLabsTTSService) SynthesizeStreamWithConfig(ctx context.Context, t req.Header.Set("Accept", "audio/mpeg") req.Header.Set("Content-Type", "application/json") - req.Header.Set("xi-api-key", s.apiKey) - + if s.authHeaderValue != "" { + req.Header.Set(s.authHeaderName, s.authHeaderValue) + } // Execute request resp, err := s.httpClient.Do(req) if err != nil { @@ 
-212,15 +253,44 @@ func (s *ElevenLabsTTSService) SynthesizeStreamWithConfig(ctx context.Context, t } if resp.StatusCode != http.StatusOK { - resp.Body.Close() bodyBytes, _ := io.ReadAll(resp.Body) - return nil, fmt.Errorf("ElevenLabs streaming API error: status %d, body: %s", resp.StatusCode, string(bodyBytes)) + resp.Body.Close() + return nil, fmt.Errorf("TTS streaming API error: status %d, body: %s", resp.StatusCode, string(bodyBytes)) } // Return stream reader (caller is responsible for closing) return resp.Body, nil } +// Health checks connectivity to the TTS backend. For Phoenix, expects GET {baseURL}/../health (or /health). +// For ElevenLabs (default base URL), this is a no-op and returns nil (no public health endpoint). +func (s *ElevenLabsTTSService) Health(ctx context.Context) error { + if s.baseURL == DefaultElevenLabsBaseURL { + return nil // ElevenLabs has no public health; skip to avoid unnecessary calls + } + u, err := url.Parse(s.baseURL) + if err != nil { + return fmt.Errorf("TTS base URL invalid: %w", err) + } + u.Path = path.Join(path.Dir(u.Path), "health") + req, err := http.NewRequestWithContext(ctx, "GET", u.String(), nil) + if err != nil { + return err + } + if s.authHeaderValue != "" { + req.Header.Set(s.authHeaderName, s.authHeaderValue) + } + resp, err := s.httpClient.Do(req) + if err != nil { + return fmt.Errorf("TTS health check failed: %w", err) + } + resp.Body.Close() + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return fmt.Errorf("TTS health returned status %d", resp.StatusCode) + } + return nil +} + // GetVisemes returns viseme events for lip sync // ElevenLabs doesn't provide viseme data directly, so we use phoneme-to-viseme mapping func (s *ElevenLabsTTSService) GetVisemes(ctx context.Context, text string) ([]VisemeEvent, error) { diff --git a/backend/tts/service.go b/backend/tts/service.go index 7e34a97..9ae778d 100644 --- a/backend/tts/service.go +++ b/backend/tts/service.go @@ -2,7 +2,6 @@ package tts import ( 
"context" - "fmt" "io" ) @@ -11,6 +10,8 @@ type Service interface { SynthesizeStream(ctx context.Context, text string) (io.Reader, error) Synthesize(ctx context.Context, text string) ([]byte, error) GetVisemes(ctx context.Context, text string) ([]VisemeEvent, error) + // Health checks connectivity to the TTS backend (e.g. Phoenix /health). No-op for mocks. + Health(ctx context.Context) error } // VisemeEvent represents a viseme (lip shape) event for lip sync @@ -52,6 +53,9 @@ func (s *MockTTSService) GetVisemes(ctx context.Context, text string) ([]VisemeE }, nil } +// Health is a no-op for the mock (no backend). +func (s *MockTTSService) Health(ctx context.Context) error { return nil } + // ElevenLabsTTSService integrates with ElevenLabs (implementation in elevenlabs-adapter.go) // This interface definition is kept for backwards compatibility // The actual implementation is in elevenlabs-adapter.go