From 69b28d684e046e5f340644fe7abf80d1a575ba76 Mon Sep 17 00:00:00 2001
From: Stephanie Gredell <s.raide@gmail.com>
Date: Thu, 11 Dec 2025 15:25:23 -0800
Subject: [PATCH] Add ElevenLabs TTS integration with caching

---
 backend/scripts/clear-pronunciations.sql      |   3 +
 backend/src/config/env.ts                     |   6 +-
 .../controllers/speechSounds.controller.ts    | 174 ++++++++++++++++++
 backend/src/db/migrate.ts                     |  35 ++++
 backend/src/index.ts                          |   2 +
 backend/src/routes/speechSounds.routes.ts     |  14 ++
 backend/src/services/elevenlabs.service.ts    |  74 ++++++++
 frontend/src/pages/SpeechSoundsApp.tsx        |  68 +++++--
 frontend/src/services/audioService.ts         |  47 +++++
 9 files changed, 409 insertions(+), 14 deletions(-)
 create mode 100644 backend/scripts/clear-pronunciations.sql
 create mode 100644 backend/src/controllers/speechSounds.controller.ts
 create mode 100644 backend/src/routes/speechSounds.routes.ts
 create mode 100644 backend/src/services/elevenlabs.service.ts
 create mode 100644 frontend/src/services/audioService.ts

diff --git a/backend/scripts/clear-pronunciations.sql b/backend/scripts/clear-pronunciations.sql
new file mode 100644
index 0000000..b77d42a
--- /dev/null
+++ b/backend/scripts/clear-pronunciations.sql
@@ -0,0 +1,3 @@
+-- Clear all cached pronunciations (every row of word_pronunciations)
+-- This will force regeneration with new model/settings on next play
+DELETE FROM word_pronunciations;
diff --git a/backend/src/config/env.ts b/backend/src/config/env.ts
index cd29923..b459474 100644
--- a/backend/src/config/env.ts
+++ b/backend/src/config/env.ts
@@ -15,7 +15,8 @@ const optionalEnvVars = {
   NODE_ENV: 'development',
   ACCESS_TOKEN_EXPIRY: '15m',
   REFRESH_TOKEN_EXPIRY: '7d',
-  INITIAL_ADMIN_USERNAME: 'admin'
+  INITIAL_ADMIN_USERNAME: 'admin',
+  ELEVENLABS_API_KEY: ''
 } as const;
 
 export function validateEnv() {
@@ -55,7 +56,8 @@ export const env = {
   accessTokenExpiry: process.env.ACCESS_TOKEN_EXPIRY!,
   refreshTokenExpiry: process.env.REFRESH_TOKEN_EXPIRY!,
   initialAdminUsername: process.env.INITIAL_ADMIN_USERNAME,
-  initialAdminPassword: process.env.INITIAL_ADMIN_PASSWORD
+  initialAdminPassword: process.env.INITIAL_ADMIN_PASSWORD,
+  elevenLabsApiKey: process.env.ELEVENLABS_API_KEY || ''
 };
 
 
diff --git a/backend/src/controllers/speechSounds.controller.ts b/backend/src/controllers/speechSounds.controller.ts
new file mode 100644
index 0000000..efba31a
--- /dev/null
+++ b/backend/src/controllers/speechSounds.controller.ts
@@ -0,0 +1,174 @@
+import { Response } from 'express';
+import { AuthRequest } from '../types/index.js';
+import { db } from '../config/database.js';
+import { generateSpeech } from '../services/elevenlabs.service.js';
+
+/** Send audio bytes with the headers shared by the cached and freshly generated paths. */
+function sendAudio(res: Response, audio: Buffer, format: string): void {
+  // 'mp3' and any unrecognized format are served as audio/mpeg
+  const contentType = format === 'wav' ? 'audio/wav' : format === 'ogg' ? 'audio/ogg' : 'audio/mpeg';
+  res.setHeader('Content-Type', contentType);
+  res.setHeader('Content-Length', audio.length);
+  res.setHeader('Cache-Control', 'public, max-age=31536000'); // Cache for 1 year
+  res.send(audio);
+}
+
+/**
+ * Get pronunciation audio for a word
+ * Checks cache first, then calls ElevenLabs API if not cached
+ * Replies with audio bytes, or a JSON error: 400 bad id, 404 unknown word, 503 TTS unconfigured, 500 otherwise
+ */
+export async function pronounceWord(req: AuthRequest, res: Response) {
+  try {
+    // Accept only plain decimal digits: bare parseInt would read "12abc" or "12.7" as 12
+    const rawWordId = req.params.wordId;
+    const wordId = /^\d+$/.test(rawWordId) ? Number.parseInt(rawWordId, 10) : NaN;
+
+    if (!wordId || !Number.isSafeInteger(wordId)) {
+      return res.status(400).json({
+        success: false,
+        error: {
+          code: 'INVALID_WORD_ID',
+          message: 'Invalid word ID'
+        }
+      });
+    }
+
+    // Get word text from database
+    const wordResult = await db.execute({
+      sql: 'SELECT word FROM words WHERE id = ?',
+      args: [wordId]
+    });
+
+    if (!wordResult.rows.length) {
+      return res.status(404).json({
+        success: false,
+        error: {
+          code: 'WORD_NOT_FOUND',
+          message: 'Word not found'
+        }
+      });
+    }
+
+    const wordText = wordResult.rows[0].word as string;
+    const voiceId = '1FSm04EkRXraU6SyzoLr'; // Can be made configurable later
+
+    // Check cache first
+    const cacheResult = await db.execute({
+      sql: 'SELECT audio_data, audio_format FROM word_pronunciations WHERE word_id = ? AND voice_id = ?',
+      args: [wordId, voiceId]
+    });
+
+    if (cacheResult.rows.length > 0) {
+      // Return cached audio
+      const audioDataValue = cacheResult.rows[0].audio_data;
+      const audioFormat = cacheResult.rows[0].audio_format as string;
+
+      // Convert database BLOB to Buffer
+      let audioBuffer: Buffer;
+      if (audioDataValue instanceof Uint8Array) {
+        audioBuffer = Buffer.from(audioDataValue);
+      } else if (audioDataValue instanceof ArrayBuffer) {
+        audioBuffer = Buffer.from(audioDataValue);
+      } else {
+        // Fallback: hand whatever the driver returned straight to Buffer.from
+        audioBuffer = Buffer.from(audioDataValue as any);
+      }
+
+      sendAudio(res, audioBuffer, audioFormat);
+      return;
+    }
+
+    // Not cached - generate speech using ElevenLabs
+    try {
+      const { audio, format } = await generateSpeech(wordText, voiceId);
+
+      // Store in cache
+      await db.execute({
+        sql: `
+          INSERT INTO word_pronunciations (word_id, voice_id, audio_data, audio_format)
+          VALUES (?, ?, ?, ?)
+          ON CONFLICT(word_id, voice_id) DO UPDATE SET
+            audio_data = excluded.audio_data,
+            audio_format = excluded.audio_format,
+            created_at = CURRENT_TIMESTAMP
+        `,
+        args: [wordId, voiceId, audio, format]
+      });
+
+      sendAudio(res, audio, format);
+    } catch (error: unknown) {
+      console.error('Error generating speech:', error);
+
+      // A missing API key is reported by generateSpeech via a "not configured" message;
+      // narrow first so a non-Error throw cannot crash this handler.
+      const reason = error instanceof Error ? error.message : '';
+      if (reason.includes('not configured')) {
+        return res.status(503).json({
+          success: false,
+          error: {
+            code: 'ELEVENLABS_NOT_CONFIGURED',
+            message: 'Text-to-speech is not configured. Please set ELEVENLABS_API_KEY environment variable.'
+          }
+        });
+      }
+
+      return res.status(500).json({
+        success: false,
+        error: {
+          code: 'SPEECH_GENERATION_ERROR',
+          message: 'Failed to generate speech pronunciation'
+        }
+      });
+    }
+  } catch (error: unknown) {
+    console.error('Pronounce word error:', error);
+    res.status(500).json({
+      success: false,
+      error: {
+        code: 'PRONOUNCE_WORD_ERROR',
+        message: 'Error generating word pronunciation'
+      }
+    });
+  }
+}
+
+/**
+ * Admin endpoint: wipe every row from the pronunciation cache.
+ * Later plays regenerate audio with the current model/settings.
+ * @param req Request; `userId` must be set or the call is answered with 401
+ * @param res Response that receives the JSON outcome
+ */
+export async function clearPronunciationsCache(req: AuthRequest, res: Response) {
+  try {
+    // Refuse callers that carry no authenticated user id
+    if (!req.userId) {
+      const unauthorized = {
+        code: 'UNAUTHORIZED',
+        message: 'Authentication required'
+      };
+      return res.status(401).json({ success: false, error: unauthorized });
+    }
+
+    // Remove all cached audio rows in a single statement
+    const deletion = await db.execute({
+      sql: 'DELETE FROM word_pronunciations'
+    });
+    const deletedCount = deletion.rowsAffected || 0;
+
+    res.json({
+      success: true,
+      data: {
+        message: 'Pronunciation cache cleared successfully',
+        deletedCount
+      }
+    });
+  } catch (cacheError: any) {
+    console.error('Clear pronunciations cache error:', cacheError);
+    const failure = {
+      code: 'CLEAR_CACHE_ERROR',
+      message: 'Error clearing pronunciation cache'
+    };
+    res.status(500).json({ success: false, error: failure });
+  }
+}
diff --git a/backend/src/db/migrate.ts b/backend/src/db/migrate.ts
index 3d1cfd5..2e8e8fe 100644
--- a/backend/src/db/migrate.ts
+++ b/backend/src/db/migrate.ts
@@ -297,6 +297,41 @@ const migrations = [
         console.log('✓ Settings profiles tables already exist, skipping');
       }
     }
+  },
+  {
+    id: 7,
+    name: 'create_word_pronunciations',
+    up: async () => {
+      // Look the table up in sqlite_master so re-running this migration is a no-op
+      const tableCheck = await db.execute(`
+        SELECT name FROM sqlite_master
+        WHERE type='table' AND name='word_pronunciations'
+      `);
+      
+      if (tableCheck.rows.length === 0) {
+        // Create word_pronunciations table: one audio blob per (word_id, voice_id) pair
+        await db.execute(`
+          CREATE TABLE word_pronunciations (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            word_id INTEGER NOT NULL,
+            voice_id TEXT NOT NULL DEFAULT 'default',
+            audio_data BLOB NOT NULL,
+            audio_format TEXT NOT NULL DEFAULT 'mp3',
+            created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
+            FOREIGN KEY (word_id) REFERENCES words(id) ON DELETE CASCADE,
+            UNIQUE(word_id, voice_id)
+          )
+        `);
+        
+        // Create indexes (this runs only when the table was created just above)
+        await db.execute('CREATE INDEX IF NOT EXISTS idx_word_pronunciations_word_id ON word_pronunciations(word_id)');
+        await db.execute('CREATE INDEX IF NOT EXISTS idx_word_pronunciations_voice_id ON word_pronunciations(voice_id)');
+        
+        console.log('✓ Created word_pronunciations table');
+      } else {
+        console.log('✓ Word pronunciations table already exists, skipping');
+      }
+    }
   }
 ];
 
diff --git a/backend/src/index.ts b/backend/src/index.ts
index ddcac69..bcc92a9 100644
--- a/backend/src/index.ts
+++ b/backend/src/index.ts
@@ -13,6 +13,7 @@ import wordGroupsRoutes from './routes/wordGroups.routes.js';
 import usersRoutes from './routes/users.routes.js';
 import settingsProfilesRoutes from './routes/settingsProfiles.routes.js';
 import magicCodeRoutes from './routes/magicCode.routes.js';
+import speechSoundsRoutes from './routes/speechSounds.routes.js';
 import { errorHandler } from './middleware/errorHandler.js';
 import { apiLimiter } from './middleware/rateLimiter.js';
 import { createWebSocketServer } from './services/websocket.service.js';
@@ -56,6 +57,7 @@ async function startServer() {
     app.use('/api/users', usersRoutes);
     app.use('/api/settings-profiles', settingsProfilesRoutes);
     app.use('/api/magic-code', magicCodeRoutes);
+    app.use('/api/speech-sounds', speechSoundsRoutes);
     
     // Error handling
     app.use(errorHandler);
diff --git a/backend/src/routes/speechSounds.routes.ts b/backend/src/routes/speechSounds.routes.ts
new file mode 100644
index 0000000..e1e0c13
--- /dev/null
+++ b/backend/src/routes/speechSounds.routes.ts
@@ -0,0 +1,14 @@
+import { Router } from 'express';
+import { pronounceWord, clearPronunciationsCache } from '../controllers/speechSounds.controller.js';
+import { authMiddleware } from '../middleware/auth.js';
+import { adminMiddleware } from '../middleware/admin.js';
+
+const router = Router();
+
+// Public route - no authentication required for pronunciation; a cache miss calls the ElevenLabs API
+router.get('/pronounce/:wordId', pronounceWord);
+
+// Admin route - clear pronunciation cache
+router.delete('/cache', authMiddleware, adminMiddleware, clearPronunciationsCache);
+
+export default router;
diff --git a/backend/src/services/elevenlabs.service.ts b/backend/src/services/elevenlabs.service.ts
new file mode 100644
index 0000000..7564c28
--- /dev/null
+++ b/backend/src/services/elevenlabs.service.ts
@@ -0,0 +1,74 @@
+import { env } from '../config/env.js';
+
+const ELEVENLABS_API_URL = 'https://api.elevenlabs.io/v1'; // Base URL; endpoint paths are appended per request
+const DEFAULT_VOICE_ID = '21m00Tcm4TlvDq8ikWAM'; // Rachel - a friendly, clear voice
+
+export interface ElevenLabsResponse {
+  audio: Buffer; // Raw audio bytes read from the response body
+  format: string; // Short format tag derived from the response content type: 'mp3', 'wav' or 'ogg'
+}
+
+/**
+ * Generate speech audio using ElevenLabs API
+ * @param text The text to convert to speech
+ * @param voiceId Optional voice ID (defaults to Rachel)
+ * @returns Audio buffer and format
+ * @throws Error when the API key is missing, the text is blank, or the request fails or times out
+ */
+export async function generateSpeech(
+  text: string,
+  voiceId: string = DEFAULT_VOICE_ID
+): Promise<ElevenLabsResponse> {
+  if (!env.elevenLabsApiKey) {
+    throw new Error('ElevenLabs API key is not configured');
+  }
+
+  if (!text || text.trim().length === 0) {
+    throw new Error('Text cannot be empty');
+  }
+
+  // Abort after 30s so a stalled upstream call cannot hold the caller open indefinitely
+  const abortController = new AbortController();
+  const abortTimer = setTimeout(() => abortController.abort(), 30000);
+  try {
+    // Encode the voice ID so it can only ever occupy a single path segment
+    const url = `${ELEVENLABS_API_URL}/text-to-speech/${encodeURIComponent(voiceId)}`;
+    const response = await fetch(url, {
+      method: 'POST',
+      headers: {
+        'Accept': 'audio/mpeg',
+        'Content-Type': 'application/json',
+        'xi-api-key': env.elevenLabsApiKey
+      },
+      body: JSON.stringify({
+        text: text.trim(),
+        model_id: 'eleven_turbo_v2_5'
+        // No voice_settings - uses the voice's default settings from ElevenLabs
+      }),
+      signal: abortController.signal
+    });
+
+    if (!response.ok) {
+      const errorText = await response.text();
+      console.error('ElevenLabs API error:', response.status, errorText);
+      throw new Error(`ElevenLabs API error: ${response.status} - ${errorText}`);
+    }
+
+    const audioBuffer = Buffer.from(await response.arrayBuffer());
+    const contentType = (response.headers.get('content-type') || 'audio/mpeg').toLowerCase();
+    // Determine format from content type; anything else (e.g. "audio/mpeg") is treated as MP3
+    const format = contentType.includes('wav') ? 'wav' : contentType.includes('ogg') ? 'ogg' : 'mp3';
+
+    return { audio: audioBuffer, format };
+  } catch (error: unknown) {
+    // Narrow before reading .message: a thrown value is not guaranteed to be an Error
+    const reason = error instanceof Error ? error.message : String(error);
+    if (reason.includes('ElevenLabs API error')) {
+      throw error;
+    }
+    console.error('Error calling ElevenLabs API:', error);
+    throw new Error(`Failed to generate speech: ${reason}`);
+  } finally {
+    clearTimeout(abortTimer);
+  }
+}
diff --git a/frontend/src/pages/SpeechSoundsApp.tsx b/frontend/src/pages/SpeechSoundsApp.tsx
index 9cc88e1..3e3a677 100644
--- a/frontend/src/pages/SpeechSoundsApp.tsx
+++ b/frontend/src/pages/SpeechSoundsApp.tsx
@@ -1,5 +1,6 @@
 import { useState, useEffect } from 'react';
 import { wordGroupsApi } from '../services/apiClient';
+import { playWordPronunciation } from '../services/audioService';
 
 interface Word {
   id: number;
@@ -28,6 +29,8 @@ export function SpeechSoundsApp() {
   const [loading, setLoading] = useState(true);
   const [error, setError] = useState<string | null>(null);
   const [showWordPractice, setShowWordPractice] = useState(false);
+  const [playingWordId, setPlayingWordId] = useState<number | null>(null);
+  const [audioError, setAudioError] = useState<string | null>(null);
 
   // Load practice data from localStorage
   useEffect(() => {
@@ -125,6 +128,26 @@ export function SpeechSoundsApp() {
     setCurrentWordIndex(0);
   };
 
+  const handlePlayPronunciation = async (wordId: number) => {
+    // Flag the word as playing and discard any previous error message
+    setPlayingWordId(wordId);
+    setAudioError(null);
+
+    try {
+      await playWordPronunciation(wordId);
+    } catch (playbackError: any) {
+      console.error('Error playing pronunciation:', playbackError);
+      setAudioError('Unable to play pronunciation. Please try again.');
+      setPlayingWordId(null);
+      // Hide the error message again after 3 seconds
+      setTimeout(() => setAudioError(null), 3000);
+      return;
+    }
+
+    // Playback has started: release the playing state shortly afterwards
+    setTimeout(() => setPlayingWordId(null), 100);
+  };
+
   if (loading) {
     return (
       <div className="min-h-[calc(100vh-60px)] bg-background px-6 py-6 max-w-[900px] mx-auto">
@@ -179,14 +202,35 @@ export function SpeechSoundsApp() {
 
         <div className="bg-card rounded-[32px] p-10 shadow-lg border-4 border-primary">
           <div className="text-center mb-10">
-            <h2 
-              className="text-[72px] md:text-[72px] text-[48px] font-black mb-5 tracking-[4px] bg-gradient-to-r from-primary to-accent bg-clip-text text-transparent animate-[wordBounce_0.5s_ease-out]"
-              style={{
-                animation: 'wordBounce 0.5s ease-out',
-              }}
-            >
-              {currentWord.word}
-            </h2>
+            <div className="flex items-center justify-center gap-4 mb-5">
+              <h2 
+                className="text-[72px] md:text-[72px] text-[48px] font-black tracking-[4px] bg-gradient-to-r from-primary to-accent bg-clip-text text-transparent animate-[wordBounce_0.5s_ease-out]"
+                style={{
+                  animation: 'wordBounce 0.5s ease-out',
+                }}
+              >
+                {currentWord.word}
+              </h2>
+              <button
+                onClick={() => handlePlayPronunciation(currentWord.id)}
+                disabled={playingWordId === currentWord.id}
+                className={`w-16 h-16 rounded-full border-4 border-primary flex items-center justify-center text-3xl font-bold transition-all shadow-lg hover:scale-110 hover:shadow-xl disabled:opacity-50 disabled:cursor-not-allowed ${
+                  playingWordId === currentWord.id
+                    ? 'bg-primary text-primary-foreground animate-pulse'
+                    : 'bg-primary text-primary-foreground hover:bg-primary/90'
+                }`}
+                title="Play pronunciation"
+              >
+                {playingWordId === currentWord.id ? (
+                  <span className="animate-spin">⟳</span>
+                ) : (
+                  '▶'
+                )}
+              </button>
+            </div>
+            {audioError && (
+              <p className="text-sm text-destructive mb-2">{audioError}</p>
+            )}
             <div className="flex justify-center gap-5 mt-5 flex-wrap">
               <span className="text-lg font-bold py-3 px-5 rounded-[25px] shadow-md border-[3px] bg-[#10b981] border-[#10b981] text-white">
                 ✓ {passCount} Pass
@@ -219,10 +263,10 @@ export function SpeechSoundsApp() {
                     <div className="flex gap-2.5">
                       <button
                         onClick={() => togglePractice(currentWord.id, i, 'pass')}
-                        className={`w-11 h-11 border-[3px] rounded-xl bg-white text-2xl font-bold cursor-pointer transition-all flex items-center justify-center shadow-md ${
+                        className={`w-11 h-11 border-[3px] rounded-xl text-2xl font-bold cursor-pointer transition-all flex items-center justify-center shadow-md ${
                           isPass 
                             ? 'bg-[#10b981] text-white border-[#10b981] scale-110 shadow-lg' 
-                            : 'text-[#065f46] border-[#10b981] hover:bg-[#10b981] hover:border-[#10b981] hover:text-white hover:scale-110 hover:shadow-lg'
+                            : 'bg-white text-[#065f46] border-[#10b981] hover:bg-[#10b981] hover:border-[#10b981] hover:text-white hover:scale-110 hover:shadow-lg'
                         }`}
                         title="Mark as pass"
                       >
@@ -230,10 +274,10 @@ export function SpeechSoundsApp() {
                       </button>
                       <button
                         onClick={() => togglePractice(currentWord.id, i, 'fail')}
-                        className={`w-11 h-11 border-[3px] rounded-xl bg-white text-2xl font-bold cursor-pointer transition-all flex items-center justify-center shadow-md ${
+                        className={`w-11 h-11 border-[3px] rounded-xl text-2xl font-bold cursor-pointer transition-all flex items-center justify-center shadow-md ${
                           isFail 
                             ? 'bg-[#ef4444] text-white border-[#ef4444] scale-110 shadow-lg' 
-                            : 'text-[#991b1b] border-[#ef4444] hover:bg-[#ef4444] hover:border-[#ef4444] hover:text-white hover:scale-110 hover:shadow-lg'
+                            : 'bg-white text-[#991b1b] border-[#ef4444] hover:bg-[#ef4444] hover:border-[#ef4444] hover:text-white hover:scale-110 hover:shadow-lg'
                         }`}
                         title="Mark as fail"
                       >
diff --git a/frontend/src/services/audioService.ts b/frontend/src/services/audioService.ts
new file mode 100644
index 0000000..ca8314f
--- /dev/null
+++ b/frontend/src/services/audioService.ts
@@ -0,0 +1,47 @@
+const API_BASE_URL = import.meta.env.VITE_API_URL || 'http://localhost:8080/api';
+
+/**
+ * Play pronunciation audio for a word
+ *
+ * Playback is requested immediately rather than from a `loadeddata` callback:
+ * browsers that defer media loading until play() is called may never fire
+ * that event, which would leave the returned promise pending forever, and
+ * calling play() synchronously keeps it inside the user's click gesture.
+ *
+ * @param wordId The ID of the word to pronounce
+ * @returns Promise that resolves when audio starts playing
+ * @throws Error('Failed to load audio') when the audio source cannot be loaded
+ * @throws Error('Failed to play audio') when the browser refuses or fails playback
+ */
+export async function playWordPronunciation(wordId: number): Promise<void> {
+  const audio = new Audio(`${API_BASE_URL}/speech-sounds/pronounce/${wordId}`);
+
+  try {
+    // play() resolves once playback has begun and rejects if it cannot start
+    await audio.play();
+  } catch (error) {
+    // audio.error is populated only when loading/decoding the source failed
+    if (audio.error) {
+      console.error('Audio loading error:', audio.error);
+      throw new Error('Failed to load audio');
+    }
+    console.error('Error playing audio:', error);
+    throw new Error('Failed to play audio');
+  }
+}
+
+/**
+ * Probe whether pronunciation audio can be fetched for a word, without playing it
+ * @param wordId The ID of the word to check
+ * @returns Promise resolving to true when the HEAD request succeeds, false otherwise
+ */
+export async function checkAudioAvailability(wordId: number): Promise<boolean> {
+  const probeUrl = `${API_BASE_URL}/speech-sounds/pronounce/${wordId}`;
+  try {
+    const { ok } = await fetch(probeUrl, { method: 'HEAD' });
+    return ok;
+  } catch {
+    // Network-level failure: report the audio as unavailable
+    return false;
+  }
+}