Add ElevenLabs TTS integration with caching

3 months ago · 69b28d684e
9 changed files with 409 additions and 14 deletions
--- a/backend/scripts/clear-pronunciations.sql
+++ b/backend/scripts/clear-pronunciations.sql
@ -0,0 +1,3 @@
 -- Clear all cached pronunciations.
 -- The DELETE has no WHERE clause, so every row in word_pronunciations is removed.
 -- This will force regeneration with new model/settings on next play: the
 -- pronounce endpoint generates audio again whenever it finds no cached row.
 DELETE FROM word_pronunciations;
--- a/backend/src/config/env.ts
+++ b/backend/src/config/env.ts
@ -15,7 +15,8 @@ const optionalEnvVars = {
  NODE_ENV: 'development',
  ACCESS_TOKEN_EXPIRY: '15m',
  REFRESH_TOKEN_EXPIRY: '7d',
-  INITIAL_ADMIN_USERNAME: 'admin'
+  INITIAL_ADMIN_USERNAME: 'admin',
  ELEVENLABS_API_KEY: ''
 } as const;
 export function validateEnv() {
@ -55,7 +56,8 @@ export const env = {
  accessTokenExpiry: process.env.ACCESS_TOKEN_EXPIRY!,
  refreshTokenExpiry: process.env.REFRESH_TOKEN_EXPIRY!,
  initialAdminUsername: process.env.INITIAL_ADMIN_USERNAME,
-  initialAdminPassword: process.env.INITIAL_ADMIN_PASSWORD
+  initialAdminPassword: process.env.INITIAL_ADMIN_PASSWORD,
  elevenLabsApiKey: process.env.ELEVENLABS_API_KEY || ''
 };
--- a/backend/src/controllers/speechSounds.controller.ts
+++ b/backend/src/controllers/speechSounds.controller.ts
@ -0,0 +1,174 @@
 import { Response } from 'express';
 import { AuthRequest } from '../types/index.js';
 import { db } from '../config/database.js';
 import { generateSpeech } from '../services/elevenlabs.service.js';
 /** Cache-Control value for pronunciation audio: publicly cacheable for one year (in seconds). */
 const AUDIO_CACHE_CONTROL = 'public, max-age=31536000';
 /** Voice used for every pronunciation; it is also part of the cache key. Can be made configurable later. */
 const PRONUNCIATION_VOICE_ID = '1FSm04EkRXraU6SyzoLr';
 /** Parses a route parameter as a positive integer ID; returns null for anything else (e.g. "12abc", "1.5", "-3", "0"). */
 function parseWordId(raw: unknown): number | null {
  if (typeof raw !== 'string' || !/^\d+$/.test(raw)) {
    return null;
  }
  const id = Number(raw);
  return Number.isSafeInteger(id) && id > 0 ? id : null;
 }
 /** Maps a stored audio format name to its MIME type; unknown formats fall back to MP3. */
 function contentTypeForFormat(format: string): string {
  switch (format) {
    case 'wav':
      return 'audio/wav';
    case 'ogg':
      return 'audio/ogg';
    default:
      return 'audio/mpeg';
  }
 }
 /** Converts a BLOB column value to a Buffer, or returns null when the value is not usable binary data. */
 function toAudioBuffer(value: unknown): Buffer | null {
  let buffer: Buffer | null = null;
  if (value instanceof Uint8Array) {
    buffer = Buffer.from(value);
  } else if (value instanceof ArrayBuffer) {
    buffer = Buffer.from(value);
  }
  return buffer !== null && buffer.length > 0 ? buffer : null;
 }
 /** Sends the JSON error envelope used by this controller with the given HTTP status. */
 function sendError(res: Response, status: number, code: string, message: string) {
  return res.status(status).json({ success: false, error: { code, message } });
 }
 /** Writes the audio headers and body for a pronunciation response. */
 function sendAudio(res: Response, audio: Buffer, format: string): void {
  res.setHeader('Content-Type', contentTypeForFormat(format));
  res.setHeader('Content-Length', audio.length);
  res.setHeader('Cache-Control', AUDIO_CACHE_CONTROL);
  res.send(audio);
 }
 /**
 * Get pronunciation audio for a word
 * Checks cache first, then calls ElevenLabs API if not cached
 *
 * Responses:
 * - 200 with the audio bytes (Content-Type derived from the stored/generated format)
 * - 400 INVALID_WORD_ID when :wordId is not a positive integer
 * - 404 WORD_NOT_FOUND when no row exists in `words`
 * - 503 ELEVENLABS_NOT_CONFIGURED when the service reports a missing API key
 * - 500 SPEECH_GENERATION_ERROR / PRONOUNCE_WORD_ERROR on other failures
 *
 * Note: the one-year Cache-Control header means clients may keep serving their
 * own copy even after the server-side cache table has been cleared.
 */
 export async function pronounceWord(req: AuthRequest, res: Response) {
  try {
    const wordId = parseWordId(req.params.wordId);
    if (wordId === null) {
      return sendError(res, 400, 'INVALID_WORD_ID', 'Invalid word ID');
    }
    // Get word text from database
    const wordResult = await db.execute({
      sql: 'SELECT word FROM words WHERE id = ?',
      args: [wordId]
    });
    if (!wordResult.rows.length) {
      return sendError(res, 404, 'WORD_NOT_FOUND', 'Word not found');
    }
    const wordText = wordResult.rows[0].word as string;
    const voiceId = PRONUNCIATION_VOICE_ID;
    // Check cache first
    const cacheResult = await db.execute({
      sql: 'SELECT audio_data, audio_format FROM word_pronunciations WHERE word_id = ? AND voice_id = ?',
      args: [wordId, voiceId]
    });
    if (cacheResult.rows.length > 0) {
      const cachedAudio = toAudioBuffer(cacheResult.rows[0].audio_data);
      if (cachedAudio !== null) {
        sendAudio(res, cachedAudio, cacheResult.rows[0].audio_format as string);
        return;
      }
      // An empty or non-binary row is treated as a miss; the upsert below replaces it
      console.warn('Cached pronunciation is unusable, regenerating for word', wordId);
    }
    // Not cached - generate speech using ElevenLabs
    try {
      const { audio, format } = await generateSpeech(wordText, voiceId);
      // Caching is best-effort: the clip was already generated, so a failed write
      // is logged and the audio is still returned to the caller
      try {
        await db.execute({
          sql: `
          INSERT INTO word_pronunciations (word_id, voice_id, audio_data, audio_format)
          VALUES (?, ?, ?, ?)
          ON CONFLICT(word_id, voice_id) DO UPDATE SET
            audio_data = excluded.audio_data,
            audio_format = excluded.audio_format,
            created_at = CURRENT_TIMESTAMP
        `,
          args: [wordId, voiceId, audio, format]
        });
      } catch (cacheError: unknown) {
        console.error('Error caching pronunciation:', cacheError);
      }
      sendAudio(res, audio, format);
    } catch (error: unknown) {
      console.error('Error generating speech:', error);
      // Thrown values are not guaranteed to be Error instances, so narrow before reading .message
      const reason = error instanceof Error ? error.message : '';
      // If ElevenLabs fails and API key is not configured, return helpful error
      if (reason.includes('not configured')) {
        return sendError(
          res,
          503,
          'ELEVENLABS_NOT_CONFIGURED',
          'Text-to-speech is not configured. Please set ELEVENLABS_API_KEY environment variable.'
        );
      }
      return sendError(res, 500, 'SPEECH_GENERATION_ERROR', 'Failed to generate speech pronunciation');
    }
  } catch (error: unknown) {
    console.error('Pronounce word error:', error);
    sendError(res, 500, 'PRONOUNCE_WORD_ERROR', 'Error generating word pronunciation');
  }
 }
 /**
 * Empty the pronunciation cache table (admin only)
 * After this, audio is generated again with the current model/settings
 *
 * Responds 401 when the request carries no userId, 200 with the number of
 * deleted rows on success, and 500 if the delete fails.
 */
 export async function clearPronunciationsCache(req: AuthRequest, res: Response) {
  try {
    const authenticated = Boolean(req.userId);
    if (!authenticated) {
      const unauthorized = {
        success: false,
        error: { code: 'UNAUTHORIZED', message: 'Authentication required' }
      };
      return res.status(401).json(unauthorized);
    }
    // Unconditional delete: the whole table is the cache
    const { rowsAffected } = await db.execute({
      sql: 'DELETE FROM word_pronunciations'
    });
    const deletedCount = rowsAffected || 0;
    res.json({
      success: true,
      data: { message: 'Pronunciation cache cleared successfully', deletedCount }
    });
  } catch (error: unknown) {
    console.error('Clear pronunciations cache error:', error);
    const failure = {
      success: false,
      error: { code: 'CLEAR_CACHE_ERROR', message: 'Error clearing pronunciation cache' }
    };
    res.status(500).json(failure);
  }
 }
--- a/backend/src/db/migrate.ts
+++ b/backend/src/db/migrate.ts
@ -297,6 +297,41 @@ const migrations = [
        console.log('✓ Settings profiles tables already exist, skipping');
      }
    }
  },
  {
    id: 7,
    name: 'create_word_pronunciations',
    up: async () => {
      // Ask sqlite_master whether the table is already present
      const { rows: existingTables } = await db.execute(`
        SELECT name FROM sqlite_master
        WHERE type='table' AND name='word_pronunciations'
      `);
      if (existingTables.length !== 0) {
        console.log('✓ Word pronunciations table already exists, skipping');
      } else {
        // One audio clip per (word_id, voice_id) pair, declared with ON DELETE CASCADE on word_id
        await db.execute(`
          CREATE TABLE word_pronunciations (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            word_id INTEGER NOT NULL,
            voice_id TEXT NOT NULL DEFAULT 'default',
            audio_data BLOB NOT NULL,
            audio_format TEXT NOT NULL DEFAULT 'mp3',
            created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (word_id) REFERENCES words(id) ON DELETE CASCADE,
            UNIQUE(word_id, voice_id)
          )
        `);
        // Secondary indexes, created one after the other
        const indexStatements = [
          'CREATE INDEX IF NOT EXISTS idx_word_pronunciations_word_id ON word_pronunciations(word_id)',
          'CREATE INDEX IF NOT EXISTS idx_word_pronunciations_voice_id ON word_pronunciations(voice_id)'
        ];
        for (const statement of indexStatements) {
          await db.execute(statement);
        }
        console.log('✓ Created word_pronunciations table');
      }
    }
  }
 ];
--- a/backend/src/index.ts
+++ b/backend/src/index.ts
@ -13,6 +13,7 @@ import wordGroupsRoutes from './routes/wordGroups.routes.js';
 import usersRoutes from './routes/users.routes.js';
 import settingsProfilesRoutes from './routes/settingsProfiles.routes.js';
 import magicCodeRoutes from './routes/magicCode.routes.js';
 import speechSoundsRoutes from './routes/speechSounds.routes.js';
 import { errorHandler } from './middleware/errorHandler.js';
 import { apiLimiter } from './middleware/rateLimiter.js';
 import { createWebSocketServer } from './services/websocket.service.js';
@ -56,6 +57,7 @@ async function startServer() {
    app.use('/api/users', usersRoutes);
    app.use('/api/settings-profiles', settingsProfilesRoutes);
    app.use('/api/magic-code', magicCodeRoutes);
    app.use('/api/speech-sounds', speechSoundsRoutes);
    // Error handling
    app.use(errorHandler);
--- a/backend/src/routes/speechSounds.routes.ts
+++ b/backend/src/routes/speechSounds.routes.ts
@ -0,0 +1,14 @@
 import { Router } from 'express';
 import { pronounceWord, clearPronunciationsCache } from '../controllers/speechSounds.controller.js';
 import { authMiddleware } from '../middleware/auth.js';
 import { adminMiddleware } from '../middleware/admin.js';
 // Router for the speech-sounds endpoints (pronunciation audio and cache maintenance)
 const router = Router();
 // Public route - no authentication required for pronunciation
 // NOTE(review): on a cache miss the handler calls the ElevenLabs API, so anonymous
 // clients can trigger external text-to-speech requests — confirm rate limiting covers this route
 router.get('/pronounce/:wordId', pronounceWord);
 // Admin route - clear pronunciation cache (authMiddleware runs first, then adminMiddleware)
 router.delete('/cache', authMiddleware, adminMiddleware, clearPronunciationsCache);
 export default router;
--- a/backend/src/services/elevenlabs.service.ts
+++ b/backend/src/services/elevenlabs.service.ts
@ -0,0 +1,74 @@
 import { env } from '../config/env.js';
 const ELEVENLABS_API_URL = 'https://api.elevenlabs.io/v1';
 const DEFAULT_VOICE_ID = '21m00Tcm4TlvDq8ikWAM'; // Rachel - a friendly, clear voice
 /** Result of a text-to-speech request made through generateSpeech. */
 export interface ElevenLabsResponse {
  /** Audio bytes read from the API response body. */
  audio: Buffer;
  /** Short format name derived from the response content type: 'mp3', 'wav' or 'ogg'. */
  format: string;
 }
 /**
 * Generate speech audio using ElevenLabs API
 * @param text The text to convert to speech (surrounding whitespace is trimmed)
 * @param voiceId Optional voice ID (defaults to Rachel); URL-encoded before use in the request path
 * @returns Audio buffer and format
 * @throws Error 'ElevenLabs API key is not configured' when no API key is set
 * @throws Error 'Text cannot be empty' when the text is missing or blank
 * @throws Error 'ElevenLabs API error: <status> - <body>' when the API answers with a non-2xx status
 * @throws Error 'Failed to generate speech: <reason>' for network failures, unreadable or empty bodies
 */
 export async function generateSpeech(
  text: string,
  voiceId: string = DEFAULT_VOICE_ID
 ): Promise<ElevenLabsResponse> {
  if (!env.elevenLabsApiKey) {
    throw new Error('ElevenLabs API key is not configured');
  }
  if (!text || text.trim().length === 0) {
    throw new Error('Text cannot be empty');
  }
  // Encode the voice ID so characters such as "/" or "?" cannot alter the request path
  const url = `${ELEVENLABS_API_URL}/text-to-speech/${encodeURIComponent(voiceId)}`;
  // Remembered so the catch block can rethrow API errors by identity instead of by message text
  let apiError: Error | undefined;
  try {
    const response = await fetch(url, {
      method: 'POST',
      headers: {
        'Accept': 'audio/mpeg',
        'Content-Type': 'application/json',
        'xi-api-key': env.elevenLabsApiKey
      },
      body: JSON.stringify({
        text: text.trim(),
        model_id: 'eleven_turbo_v2_5'
        // No voice_settings - uses the voice's default settings from ElevenLabs
      })
    });
    if (!response.ok) {
      const errorText = await response.text();
      console.error('ElevenLabs API error:', response.status, errorText);
      apiError = new Error(`ElevenLabs API error: ${response.status} - ${errorText}`);
      throw apiError;
    }
    const audioBuffer = Buffer.from(await response.arrayBuffer());
    // A successful status with no bytes is unusable; fail so callers never store an empty clip
    if (audioBuffer.length === 0) {
      throw new Error('ElevenLabs returned an empty audio response');
    }
    const contentType = (response.headers.get('content-type') || 'audio/mpeg').toLowerCase();
    // Determine format from content type; MP3 is served as audio/mpeg and is also the default
    let format = 'mp3';
    if (contentType.includes('mp3') || contentType.includes('mpeg')) {
      format = 'mp3';
    } else if (contentType.includes('wav')) {
      format = 'wav';
    } else if (contentType.includes('ogg')) {
      format = 'ogg';
    }
    return {
      audio: audioBuffer,
      format
    };
  } catch (error: unknown) {
    if (apiError !== undefined && error === apiError) {
      throw error;
    }
    console.error('Error calling ElevenLabs API:', error);
    // Thrown values are not guaranteed to be Error instances
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to generate speech: ${reason}`);
  }
 }
--- a/frontend/src/pages/SpeechSoundsApp.tsx
+++ b/frontend/src/pages/SpeechSoundsApp.tsx
@ -1,5 +1,6 @@
 import { useState, useEffect } from 'react';
 import { wordGroupsApi } from '../services/apiClient';
 import { playWordPronunciation } from '../services/audioService';
 interface Word {
  id: number;
@ -28,6 +29,8 @@ export function SpeechSoundsApp() {
  const [loading, setLoading] = useState(true);
  const [error, setError] = useState<string | null>(null);
  const [showWordPractice, setShowWordPractice] = useState(false);
  const [playingWordId, setPlayingWordId] = useState<number | null>(null);
  const [audioError, setAudioError] = useState<string | null>(null);
  // Load practice data from localStorage
  useEffect(() => {
@ -125,6 +128,26 @@ export function SpeechSoundsApp() {
    setCurrentWordIndex(0);
  };
  // Plays one word's pronunciation: marks the word as busy while the request runs and
  // shows a temporary error message when playback cannot start
  const handlePlayPronunciation = async (wordId: number) => {
    const clearPlaying = () => {
      setPlayingWordId(null);
    };
    const clearAudioError = () => {
      setAudioError(null);
    };
    try {
      setPlayingWordId(wordId);
      setAudioError(null);
      await playWordPronunciation(wordId);
      // Hold the busy state for a moment so the audio has time to start
      setTimeout(clearPlaying, 100);
    } catch (err: any) {
      console.error('Error playing pronunciation:', err);
      setAudioError('Unable to play pronunciation. Please try again.');
      clearPlaying();
      // Dismiss the message automatically after 3 seconds
      setTimeout(clearAudioError, 3000);
    }
  };
  if (loading) {
    return (
      <div className="min-h-[calc(100vh-60px)] bg-background px-6 py-6 max-w-[900px] mx-auto">
@ -179,14 +202,35 @@ export function SpeechSoundsApp() {
        <div className="bg-card rounded-[32px] p-10 shadow-lg border-4 border-primary">
          <div className="text-center mb-10">
            <div className="flex items-center justify-center gap-4 mb-5">
              <h2 
-              className="text-[72px] md:text-[72px] text-[48px] font-black mb-5 tracking-[4px] bg-gradient-to-r from-primary to-accent bg-clip-text text-transparent animate-[wordBounce_0.5s_ease-out]"
+                className="text-[72px] md:text-[72px] text-[48px] font-black tracking-[4px] bg-gradient-to-r from-primary to-accent bg-clip-text text-transparent animate-[wordBounce_0.5s_ease-out]"
                style={{
                  animation: 'wordBounce 0.5s ease-out',
                }}
              >
                {currentWord.word}
              </h2>
              <button
                onClick={() => handlePlayPronunciation(currentWord.id)}
                disabled={playingWordId === currentWord.id}
                className={`w-16 h-16 rounded-full border-4 border-primary flex items-center justify-center text-3xl font-bold transition-all shadow-lg hover:scale-110 hover:shadow-xl disabled:opacity-50 disabled:cursor-not-allowed ${
                  playingWordId === currentWord.id
                    ? 'bg-primary text-primary-foreground animate-pulse'
                    : 'bg-primary text-primary-foreground hover:bg-primary/90'
                }`}
                title="Play pronunciation"
              >
                {playingWordId === currentWord.id ? (
                  <span className="animate-spin">⟳</span>
                ) : (
                  '▶'
                )}
              </button>
            </div>
            {audioError && (
              <p className="text-sm text-destructive mb-2">{audioError}</p>
            )}
            <div className="flex justify-center gap-5 mt-5 flex-wrap">
              <span className="text-lg font-bold py-3 px-5 rounded-[25px] shadow-md border-[3px] bg-[#10b981] border-[#10b981] text-white">
                ✓ {passCount} Pass
@ -219,10 +263,10 @@ export function SpeechSoundsApp() {
                    <div className="flex gap-2.5">
                      <button
                        onClick={() => togglePractice(currentWord.id, i, 'pass')}
-                        className={`w-11 h-11 border-[3px] rounded-xl bg-white text-2xl font-bold cursor-pointer transition-all flex items-center justify-center shadow-md ${
+                        className={`w-11 h-11 border-[3px] rounded-xl text-2xl font-bold cursor-pointer transition-all flex items-center justify-center shadow-md ${
                          isPass 
                            ? 'bg-[#10b981] text-white border-[#10b981] scale-110 shadow-lg' 
-                            : 'text-[#065f46] border-[#10b981] hover:bg-[#10b981] hover:border-[#10b981] hover:text-white hover:scale-110 hover:shadow-lg'
+                            : 'bg-white text-[#065f46] border-[#10b981] hover:bg-[#10b981] hover:border-[#10b981] hover:text-white hover:scale-110 hover:shadow-lg'
                        }`}
                        title="Mark as pass"
                      >
@ -230,10 +274,10 @@ export function SpeechSoundsApp() {
                      </button>
                      <button
                        onClick={() => togglePractice(currentWord.id, i, 'fail')}
-                        className={`w-11 h-11 border-[3px] rounded-xl bg-white text-2xl font-bold cursor-pointer transition-all flex items-center justify-center shadow-md ${
+                        className={`w-11 h-11 border-[3px] rounded-xl text-2xl font-bold cursor-pointer transition-all flex items-center justify-center shadow-md ${
                          isFail 
                            ? 'bg-[#ef4444] text-white border-[#ef4444] scale-110 shadow-lg' 
-                            : 'text-[#991b1b] border-[#ef4444] hover:bg-[#ef4444] hover:border-[#ef4444] hover:text-white hover:scale-110 hover:shadow-lg'
+                            : 'bg-white text-[#991b1b] border-[#ef4444] hover:bg-[#ef4444] hover:border-[#ef4444] hover:text-white hover:scale-110 hover:shadow-lg'
                        }`}
                        title="Mark as fail"
                      >
--- a/frontend/src/services/audioService.ts
+++ b/frontend/src/services/audioService.ts
@ -0,0 +1,47 @@
 const API_BASE_URL = import.meta.env.VITE_API_URL || 'http://localhost:8080/api';
 /**
 * Play pronunciation audio for a word
 *
 * play() is called immediately rather than from a load callback: it is then
 * still part of the caller's user gesture, and it starts loading on browsers
 * that do not preload media on their own. The promise returned by play()
 * settles for both outcomes, so the caller is never left waiting.
 *
 * @param wordId The ID of the word to pronounce
 * @returns Promise that resolves when audio starts playing
 * @throws Error 'Failed to load audio' when the media element reports a load error
 * @throws Error 'Failed to play audio' when playback is rejected for any other reason
 */
 export async function playWordPronunciation(wordId: number): Promise<void> {
  const audio = new Audio(`${API_BASE_URL}/speech-sounds/pronounce/${wordId}`);
  try {
    await audio.play();
  } catch (error: unknown) {
    // audio.error is only set when the source itself could not be loaded or decoded
    if (audio.error) {
      console.error('Audio loading error:', error);
      throw new Error('Failed to load audio');
    }
    console.error('Error playing audio:', error);
    throw new Error('Failed to play audio');
  }
 }
 /**
 * Probe whether pronunciation audio can be fetched for a word, without playing it
 * Issues a HEAD request to the pronounce endpoint and reports the response's ok flag
 * @param wordId The ID of the word to check
 * @returns Promise resolving to true when the response is ok; false for a non-ok response or any thrown error
 */
 export async function checkAudioAvailability(wordId: number): Promise<boolean> {
  try {
    const endpoint = `${API_BASE_URL}/speech-sounds/pronounce/${wordId}`;
    const { ok } = await fetch(endpoint, { method: 'HEAD' });
    return ok;
  } catch {
    // Any failure to complete the request counts as "not available"
    return false;
  }
 }