From 69b28d684e046e5f340644fe7abf80d1a575ba76 Mon Sep 17 00:00:00 2001 From: Stephanie Gredell Date: Thu, 11 Dec 2025 15:25:23 -0800 Subject: [PATCH] Add ElevenLabs TTS integration with caching --- backend/scripts/clear-pronunciations.sql | 3 + backend/src/config/env.ts | 6 +- .../controllers/speechSounds.controller.ts | 174 ++++++++++++++++++ backend/src/db/migrate.ts | 35 ++++ backend/src/index.ts | 2 + backend/src/routes/speechSounds.routes.ts | 14 ++ backend/src/services/elevenlabs.service.ts | 74 ++++++++ frontend/src/pages/SpeechSoundsApp.tsx | 68 +++++-- frontend/src/services/audioService.ts | 47 +++++ 9 files changed, 409 insertions(+), 14 deletions(-) create mode 100644 backend/scripts/clear-pronunciations.sql create mode 100644 backend/src/controllers/speechSounds.controller.ts create mode 100644 backend/src/routes/speechSounds.routes.ts create mode 100644 backend/src/services/elevenlabs.service.ts create mode 100644 frontend/src/services/audioService.ts diff --git a/backend/scripts/clear-pronunciations.sql b/backend/scripts/clear-pronunciations.sql new file mode 100644 index 0000000..b77d42a --- /dev/null +++ b/backend/scripts/clear-pronunciations.sql @@ -0,0 +1,3 @@ +-- Clear all cached pronunciations +-- This will force regeneration with new model/settings on next play +DELETE FROM word_pronunciations; diff --git a/backend/src/config/env.ts b/backend/src/config/env.ts index cd29923..b459474 100644 --- a/backend/src/config/env.ts +++ b/backend/src/config/env.ts @@ -15,7 +15,8 @@ const optionalEnvVars = { NODE_ENV: 'development', ACCESS_TOKEN_EXPIRY: '15m', REFRESH_TOKEN_EXPIRY: '7d', - INITIAL_ADMIN_USERNAME: 'admin' + INITIAL_ADMIN_USERNAME: 'admin', + ELEVENLABS_API_KEY: '' } as const; export function validateEnv() { @@ -55,7 +56,8 @@ export const env = { accessTokenExpiry: process.env.ACCESS_TOKEN_EXPIRY!, refreshTokenExpiry: process.env.REFRESH_TOKEN_EXPIRY!, initialAdminUsername: process.env.INITIAL_ADMIN_USERNAME, - initialAdminPassword: 
process.env.INITIAL_ADMIN_PASSWORD
+  initialAdminPassword: process.env.INITIAL_ADMIN_PASSWORD,
+  elevenLabsApiKey: process.env.ELEVENLABS_API_KEY || ''
 };
diff --git a/backend/src/controllers/speechSounds.controller.ts b/backend/src/controllers/speechSounds.controller.ts
new file mode 100644
index 0000000..efba31a
--- /dev/null
+++ b/backend/src/controllers/speechSounds.controller.ts
@@ -0,0 +1,200 @@
+import { Response } from 'express';
+import { AuthRequest } from '../types/index.js';
+import { db } from '../config/database.js';
+import { generateSpeech, ElevenLabsResponse } from '../services/elevenlabs.service.js';
+
+// Voice used for all pronunciations. Can be made configurable later.
+const PRONUNCIATION_VOICE_ID = '1FSm04EkRXraU6SyzoLr';
+
+// Let clients cache pronunciation audio for 1 year.
+const AUDIO_CACHE_CONTROL = 'public, max-age=31536000';
+
+/**
+ * Map a stored audio format to its MIME type.
+ * Unknown formats fall back to audio/mpeg.
+ */
+function contentTypeForFormat(format: string): string {
+  switch (format) {
+    case 'wav':
+      return 'audio/wav';
+    case 'ogg':
+      return 'audio/ogg';
+    default:
+      return 'audio/mpeg';
+  }
+}
+
+/**
+ * Convert a BLOB column value to a Buffer.
+ * @throws TypeError if the value is neither binary data nor a string
+ */
+function toBuffer(value: unknown): Buffer {
+  if (value instanceof Uint8Array) {
+    return Buffer.from(value);
+  }
+  if (value instanceof ArrayBuffer) {
+    return Buffer.from(value);
+  }
+  if (typeof value === 'string') {
+    return Buffer.from(value);
+  }
+  throw new TypeError('Unexpected audio_data type in word_pronunciations');
+}
+
+/** Write the audio response headers and body. */
+function sendAudio(res: Response, audio: Buffer, format: string): void {
+  res.setHeader('Content-Type', contentTypeForFormat(format));
+  res.setHeader('Content-Length', audio.length);
+  res.setHeader('Cache-Control', AUDIO_CACHE_CONTROL);
+  res.send(audio);
+}
+
+/**
+ * Get pronunciation audio for a word
+ * Checks cache first, then calls ElevenLabs API if not cached
+ *
+ * Responds 400 for a malformed word ID, 404 for an unknown word, 503 when
+ * ElevenLabs is not configured and 500 for any other failure.
+ */
+export async function pronounceWord(req: AuthRequest, res: Response) {
+  try {
+    // Accept only plain positive integers; parseInt alone would accept "12abc"
+    const rawWordId = req.params.wordId;
+    const wordId = /^\d+$/.test(rawWordId) ? Number.parseInt(rawWordId, 10) : NaN;
+
+    if (!Number.isSafeInteger(wordId) || wordId <= 0) {
+      return res.status(400).json({
+        success: false,
+        error: {
+          code: 'INVALID_WORD_ID',
+          message: 'Invalid word ID'
+        }
+      });
+    }
+
+    // Get word text from database
+    const wordResult = await db.execute({
+      sql: 'SELECT word FROM words WHERE id = ?',
+      args: [wordId]
+    });
+
+    if (!wordResult.rows.length) {
+      return res.status(404).json({
+        success: false,
+        error: {
+          code: 'WORD_NOT_FOUND',
+          message: 'Word not found'
+        }
+      });
+    }
+
+    const wordText = wordResult.rows[0].word as string;
+
+    // Check cache first
+    const cacheResult = await db.execute({
+      sql: 'SELECT audio_data, audio_format FROM word_pronunciations WHERE word_id = ? AND voice_id = ?',
+      args: [wordId, PRONUNCIATION_VOICE_ID]
+    });
+
+    if (cacheResult.rows.length > 0) {
+      const cached = cacheResult.rows[0];
+      sendAudio(res, toBuffer(cached.audio_data), cached.audio_format as string);
+      return;
+    }
+
+    // Not cached - generate speech using ElevenLabs
+    let generated: ElevenLabsResponse;
+    try {
+      generated = await generateSpeech(wordText, PRONUNCIATION_VOICE_ID);
+    } catch (error: unknown) {
+      console.error('Error generating speech:', error);
+
+      // If the API key is not configured, return a helpful error
+      const message = error instanceof Error ? error.message : '';
+      if (message.includes('not configured')) {
+        return res.status(503).json({
+          success: false,
+          error: {
+            code: 'ELEVENLABS_NOT_CONFIGURED',
+            message: 'Text-to-speech is not configured. Please set ELEVENLABS_API_KEY environment variable.'
+          }
+        });
+      }
+
+      return res.status(500).json({
+        success: false,
+        error: {
+          code: 'SPEECH_GENERATION_ERROR',
+          message: 'Failed to generate speech pronunciation'
+        }
+      });
+    }
+
+    // Caching is best-effort: a failed write must not discard audio that was
+    // already generated, so log it and still serve the response.
+    try {
+      await db.execute({
+        sql: `
+          INSERT INTO word_pronunciations (word_id, voice_id, audio_data, audio_format)
+          VALUES (?, ?, ?, ?)
+          ON CONFLICT(word_id, voice_id) DO UPDATE SET
+            audio_data = excluded.audio_data,
+            audio_format = excluded.audio_format,
+            created_at = CURRENT_TIMESTAMP
+        `,
+        args: [wordId, PRONUNCIATION_VOICE_ID, generated.audio, generated.format]
+      });
+    } catch (cacheError: unknown) {
+      console.error('Error caching pronunciation:', cacheError);
+    }
+
+    sendAudio(res, generated.audio, generated.format);
+  } catch (error: unknown) {
+    console.error('Pronounce word error:', error);
+    res.status(500).json({
+      success: false,
+      error: {
+        code: 'PRONOUNCE_WORD_ERROR',
+        message: 'Error generating word pronunciation'
+      }
+    });
+  }
+}
+
+/**
+ * Clear all cached pronunciations (admin only)
+ * Forces regeneration with current model/settings
+ */
+export async function clearPronunciationsCache(req: AuthRequest, res: Response) {
+  try {
+    if (!req.userId) {
+      return res.status(401).json({
+        success: false,
+        error: {
+          code: 'UNAUTHORIZED',
+          message: 'Authentication required'
+        }
+      });
+    }
+
+    // Delete all cached pronunciations
+    const result = await db.execute('DELETE FROM word_pronunciations');
+
+    res.json({
+      success: true,
+      data: {
+        message: 'Pronunciation cache cleared successfully',
+        deletedCount: result.rowsAffected || 0
+      }
+    });
+  } catch (error: unknown) {
+    console.error('Clear pronunciations cache error:', error);
+    res.status(500).json({
+      success: false,
+      error: {
+        code: 'CLEAR_CACHE_ERROR',
+        message: 'Error clearing pronunciation cache'
+      }
+    });
+  }
+}
diff --git a/backend/src/db/migrate.ts b/backend/src/db/migrate.ts
index 3d1cfd5..2e8e8fe 100644
--- a/backend/src/db/migrate.ts +++ b/backend/src/db/migrate.ts @@ -297,6 +297,41 @@ const migrations = [ console.log('✓ Settings profiles tables already exist, skipping'); } } + }, + { + id: 7, + name: 'create_word_pronunciations', + up: async () => { + // Check if table already exists + const tableCheck = await db.execute(` + SELECT name FROM sqlite_master + WHERE type='table' AND name='word_pronunciations' + `); + + if (tableCheck.rows.length === 0) { + // Create word_pronunciations table + await db.execute(` + CREATE TABLE word_pronunciations ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + word_id INTEGER NOT NULL, + voice_id TEXT NOT NULL DEFAULT 'default', + audio_data BLOB NOT NULL, + audio_format TEXT NOT NULL DEFAULT 'mp3', + created_at DATETIME DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (word_id) REFERENCES words(id) ON DELETE CASCADE, + UNIQUE(word_id, voice_id) + ) + `); + + // Create indexes + await db.execute('CREATE INDEX IF NOT EXISTS idx_word_pronunciations_word_id ON word_pronunciations(word_id)'); + await db.execute('CREATE INDEX IF NOT EXISTS idx_word_pronunciations_voice_id ON word_pronunciations(voice_id)'); + + console.log('✓ Created word_pronunciations table'); + } else { + console.log('✓ Word pronunciations table already exists, skipping'); + } + } } ]; diff --git a/backend/src/index.ts b/backend/src/index.ts index ddcac69..bcc92a9 100644 --- a/backend/src/index.ts +++ b/backend/src/index.ts @@ -13,6 +13,7 @@ import wordGroupsRoutes from './routes/wordGroups.routes.js'; import usersRoutes from './routes/users.routes.js'; import settingsProfilesRoutes from './routes/settingsProfiles.routes.js'; import magicCodeRoutes from './routes/magicCode.routes.js'; +import speechSoundsRoutes from './routes/speechSounds.routes.js'; import { errorHandler } from './middleware/errorHandler.js'; import { apiLimiter } from './middleware/rateLimiter.js'; import { createWebSocketServer } from './services/websocket.service.js'; @@ -56,6 +57,7 @@ async function 
startServer() {
   app.use('/api/users', usersRoutes);
   app.use('/api/settings-profiles', settingsProfilesRoutes);
   app.use('/api/magic-code', magicCodeRoutes);
+  app.use('/api/speech-sounds', speechSoundsRoutes);

   // Error handling
   app.use(errorHandler);
diff --git a/backend/src/routes/speechSounds.routes.ts b/backend/src/routes/speechSounds.routes.ts
new file mode 100644
index 0000000..e1e0c13
--- /dev/null
+++ b/backend/src/routes/speechSounds.routes.ts
@@ -0,0 +1,14 @@
+import { Router } from 'express';
+import { pronounceWord, clearPronunciationsCache } from '../controllers/speechSounds.controller.js';
+import { authMiddleware } from '../middleware/auth.js';
+import { adminMiddleware } from '../middleware/admin.js';
+
+const router = Router();
+
+// Public route - no authentication required for pronunciation
+router.get('/pronounce/:wordId', pronounceWord);
+
+// Admin route - clear pronunciation cache
+router.delete('/cache', authMiddleware, adminMiddleware, clearPronunciationsCache);
+
+export default router;
diff --git a/backend/src/services/elevenlabs.service.ts b/backend/src/services/elevenlabs.service.ts
new file mode 100644
index 0000000..7564c28
--- /dev/null
+++ b/backend/src/services/elevenlabs.service.ts
@@ -0,0 +1,82 @@
+import { env } from '../config/env.js';
+
+const ELEVENLABS_API_URL = 'https://api.elevenlabs.io/v1';
+const DEFAULT_VOICE_ID = '21m00Tcm4TlvDq8ikWAM'; // Rachel - a friendly, clear voice
+
+export interface ElevenLabsResponse {
+  audio: Buffer;
+  format: string;
+}
+
+/** Derive the short audio format name from a Content-Type header; defaults to mp3. */
+function formatFromContentType(contentType: string): string {
+  const normalized = contentType.toLowerCase();
+  if (normalized.includes('wav')) {
+    return 'wav';
+  }
+  if (normalized.includes('ogg')) {
+    return 'ogg';
+  }
+  // audio/mpeg (what we request via Accept) and anything unrecognised
+  return 'mp3';
+}
+
+/**
+ * Generate speech audio using ElevenLabs API
+ * @param text The text to convert to speech
+ * @param voiceId Optional voice ID (defaults to Rachel)
+ * @returns Audio buffer and format
+ * @throws Error if the API key is not configured, the text is empty,
+ *   the request cannot be completed or the API responds with an error status
+ */
+export async function generateSpeech(
+  text: string,
+  voiceId: string = DEFAULT_VOICE_ID
+): Promise<ElevenLabsResponse> {
+  if (!env.elevenLabsApiKey) {
+    throw new Error('ElevenLabs API key is not configured');
+  }
+
+  if (!text || text.trim().length === 0) {
+    throw new Error('Text cannot be empty');
+  }
+
+  try {
+    const url = `${ELEVENLABS_API_URL}/text-to-speech/${encodeURIComponent(voiceId)}`;
+    const response = await fetch(url, {
+      method: 'POST',
+      headers: {
+        'Accept': 'audio/mpeg',
+        'Content-Type': 'application/json',
+        'xi-api-key': env.elevenLabsApiKey
+      },
+      body: JSON.stringify({
+        text: text.trim(),
+        model_id: 'eleven_turbo_v2_5'
+        // No voice_settings - uses the voice's default settings from ElevenLabs
+      })
+    });
+
+    if (!response.ok) {
+      const errorText = await response.text();
+      console.error('ElevenLabs API error:', response.status, errorText);
+      throw new Error(`ElevenLabs API error: ${response.status} - ${errorText}`);
+    }
+
+    const audioBuffer = Buffer.from(await response.arrayBuffer());
+    const contentType = response.headers.get('content-type') || 'audio/mpeg';
+
+    return {
+      audio: audioBuffer,
+      format: formatFromContentType(contentType)
+    };
+  } catch (error: unknown) {
+    // API errors raised above are already descriptive; pass them through unchanged
+    const message = error instanceof Error ? error.message : String(error);
+    if (message.includes('ElevenLabs API error')) {
+      throw error;
+    }
+    console.error('Error calling ElevenLabs API:', error);
+    throw new Error(`Failed to generate speech: ${message}`);
+  }
+}
diff --git a/frontend/src/pages/SpeechSoundsApp.tsx b/frontend/src/pages/SpeechSoundsApp.tsx
index 9cc88e1..3e3a677 100644
--- a/frontend/src/pages/SpeechSoundsApp.tsx
+++ b/frontend/src/pages/SpeechSoundsApp.tsx
@@ -1,5 +1,6 @@
 import { useState, useEffect } from 'react';
 import { wordGroupsApi } from '../services/apiClient';
+import { playWordPronunciation } from '../services/audioService';

 interface Word {
   id: number;
@@ -28,6 +29,8 @@ export function SpeechSoundsApp() {
   const [loading, setLoading] = useState(true);
   const [error, setError] = useState(null);
   const [showWordPractice, setShowWordPractice] = useState(false);
+  const [playingWordId, setPlayingWordId] = useState<number | null>(null);
+  const
[audioError, setAudioError] = useState<string | null>(null);

   // Load practice data from localStorage
   useEffect(() => {
@@ -125,6 +128,26 @@ export function SpeechSoundsApp() {
     setCurrentWordIndex(0);
   };

+  const handlePlayPronunciation = async (wordId: number) => {
+    // Only clear the indicator if it still belongs to this request, so a stale
+    // reset cannot hide the indicator of a word that was clicked afterwards
+    const clearPlaying = () =>
+      setPlayingWordId((current) => (current === wordId ? null : current));
+    try {
+      setPlayingWordId(wordId);
+      setAudioError(null);
+      await playWordPronunciation(wordId);
+      // Reset playing state after a short delay to allow audio to start
+      setTimeout(clearPlaying, 100);
+    } catch (err: unknown) {
+      console.error('Error playing pronunciation:', err);
+      setAudioError('Unable to play pronunciation. Please try again.');
+      clearPlaying();
+      // Clear error after 3 seconds
+      setTimeout(() => setAudioError(null), 3000);
+    }
+  };
+
   if (loading) {
     return (
@@ -179,14 +202,35 @@ export function SpeechSoundsApp() {
-

- {currentWord.word} -

+
+

+ {currentWord.word} +

+ +
+ {audioError && ( +

{audioError}

+ )}
✓ {passCount} Pass @@ -219,10 +263,10 @@ export function SpeechSoundsApp() {