Browse Source

Add ElevenLabs TTS integration with caching

drawing-pad
Stephanie Gredell 1 month ago
parent
commit
69b28d684e
  1. 3
      backend/scripts/clear-pronunciations.sql
  2. 6
      backend/src/config/env.ts
  3. 174
      backend/src/controllers/speechSounds.controller.ts
  4. 35
      backend/src/db/migrate.ts
  5. 2
      backend/src/index.ts
  6. 14
      backend/src/routes/speechSounds.routes.ts
  7. 74
      backend/src/services/elevenlabs.service.ts
  8. 54
      frontend/src/pages/SpeechSoundsApp.tsx
  9. 47
      frontend/src/services/audioService.ts

3
backend/scripts/clear-pronunciations.sql

@ -0,0 +1,3 @@
-- Clear all cached pronunciations
-- This will force regeneration with new model/settings on next play
-- Removes every row (there is no WHERE clause); each entry is rebuilt lazily the
-- next time that word's pronunciation is requested and no cached row is found.
DELETE FROM word_pronunciations;

6
backend/src/config/env.ts

@ -15,7 +15,8 @@ const optionalEnvVars = {
NODE_ENV: 'development', NODE_ENV: 'development',
ACCESS_TOKEN_EXPIRY: '15m', ACCESS_TOKEN_EXPIRY: '15m',
REFRESH_TOKEN_EXPIRY: '7d', REFRESH_TOKEN_EXPIRY: '7d',
INITIAL_ADMIN_USERNAME: 'admin' INITIAL_ADMIN_USERNAME: 'admin',
ELEVENLABS_API_KEY: ''
} as const; } as const;
export function validateEnv() { export function validateEnv() {
@ -55,7 +56,8 @@ export const env = {
accessTokenExpiry: process.env.ACCESS_TOKEN_EXPIRY!, accessTokenExpiry: process.env.ACCESS_TOKEN_EXPIRY!,
refreshTokenExpiry: process.env.REFRESH_TOKEN_EXPIRY!, refreshTokenExpiry: process.env.REFRESH_TOKEN_EXPIRY!,
initialAdminUsername: process.env.INITIAL_ADMIN_USERNAME, initialAdminUsername: process.env.INITIAL_ADMIN_USERNAME,
initialAdminPassword: process.env.INITIAL_ADMIN_PASSWORD initialAdminPassword: process.env.INITIAL_ADMIN_PASSWORD,
elevenLabsApiKey: process.env.ELEVENLABS_API_KEY || ''
}; };

174
backend/src/controllers/speechSounds.controller.ts

@ -0,0 +1,174 @@
import { Response } from 'express';
import { AuthRequest } from '../types/index.js';
import { db } from '../config/database.js';
import { generateSpeech } from '../services/elevenlabs.service.js';
/**
 * Map a stored audio format name to the MIME type sent to the client.
 * Anything other than 'wav' or 'ogg' (including 'mp3') is served as MP3.
 */
function contentTypeForFormat(format: string): string {
  switch (format) {
    case 'wav':
      return 'audio/wav';
    case 'ogg':
      return 'audio/ogg';
    default:
      return 'audio/mpeg';
  }
}

/**
 * Write an audio payload to the response with the headers shared by the
 * cached and freshly generated paths.
 */
function sendAudio(res: Response, audio: Buffer, format: string): void {
  res.setHeader('Content-Type', contentTypeForFormat(format));
  res.setHeader('Content-Length', audio.length);
  // NOTE(review): clients may keep this response for a year, so clearing the
  // server-side cache does not by itself refresh audio already held by browsers.
  res.setHeader('Cache-Control', 'public, max-age=31536000'); // Cache for 1 year
  res.send(audio);
}

/**
 * Get pronunciation audio for a word.
 *
 * Looks the word up by the `wordId` route parameter, serves the audio stored in
 * `word_pronunciations` for the current voice when present, and otherwise
 * generates it through ElevenLabs, stores it, and serves it.
 *
 * Responses:
 * - 200 with the audio bytes on success
 * - 400 `INVALID_WORD_ID` when `wordId` is not a positive decimal integer
 * - 404 `WORD_NOT_FOUND` when no such word exists
 * - 503 `ELEVENLABS_NOT_CONFIGURED` when the speech service reports it is not configured
 * - 500 `SPEECH_GENERATION_ERROR` / `PRONOUNCE_WORD_ERROR` on other failures
 */
export async function pronounceWord(req: AuthRequest, res: Response) {
  try {
    // Accept only plain decimal digits: parseInt alone would turn "12abc" into 12.
    const rawWordId = req.params.wordId;
    const wordId = /^\d+$/.test(rawWordId) ? Number.parseInt(rawWordId, 10) : NaN;
    if (!Number.isSafeInteger(wordId) || wordId <= 0) {
      return res.status(400).json({
        success: false,
        error: {
          code: 'INVALID_WORD_ID',
          message: 'Invalid word ID'
        }
      });
    }

    // Get word text from database
    const wordResult = await db.execute({
      sql: 'SELECT word FROM words WHERE id = ?',
      args: [wordId]
    });
    if (!wordResult.rows.length) {
      return res.status(404).json({
        success: false,
        error: {
          code: 'WORD_NOT_FOUND',
          message: 'Word not found'
        }
      });
    }
    const wordText = wordResult.rows[0].word as string;
    const voiceId = '1FSm04EkRXraU6SyzoLr'; // Can be made configurable later

    // Check cache first
    const cacheResult = await db.execute({
      sql: 'SELECT audio_data, audio_format FROM word_pronunciations WHERE word_id = ? AND voice_id = ?',
      args: [wordId, voiceId]
    });
    if (cacheResult.rows.length > 0) {
      const audioDataValue = cacheResult.rows[0].audio_data;
      const audioFormat = cacheResult.rows[0].audio_format as string;

      // Convert database BLOB to Buffer
      let audioBuffer: Buffer;
      if (audioDataValue instanceof Uint8Array) {
        audioBuffer = Buffer.from(audioDataValue);
      } else if (audioDataValue instanceof ArrayBuffer) {
        audioBuffer = Buffer.from(audioDataValue);
      } else {
        // Fallback: hand whatever the driver returned straight to Buffer.from
        audioBuffer = Buffer.from(audioDataValue as any);
      }

      sendAudio(res, audioBuffer, audioFormat);
      return;
    }

    // Not cached - generate speech using ElevenLabs
    let generated: Awaited<ReturnType<typeof generateSpeech>>;
    try {
      generated = await generateSpeech(wordText, voiceId);
    } catch (error: unknown) {
      console.error('Error generating speech:', error);
      // Narrow before reading .message: a thrown value is not guaranteed to be an Error.
      const reason = error instanceof Error ? error.message : '';
      // If ElevenLabs fails and API key is not configured, return helpful error
      if (reason.includes('not configured')) {
        return res.status(503).json({
          success: false,
          error: {
            code: 'ELEVENLABS_NOT_CONFIGURED',
            message: 'Text-to-speech is not configured. Please set ELEVENLABS_API_KEY environment variable.'
          }
        });
      }
      return res.status(500).json({
        success: false,
        error: {
          code: 'SPEECH_GENERATION_ERROR',
          message: 'Failed to generate speech pronunciation'
        }
      });
    }

    // Store in cache. A failed write must not discard audio that was already
    // generated, so it is logged and the response is still served.
    try {
      await db.execute({
        sql: `
INSERT INTO word_pronunciations (word_id, voice_id, audio_data, audio_format)
VALUES (?, ?, ?, ?)
ON CONFLICT(word_id, voice_id) DO UPDATE SET
audio_data = excluded.audio_data,
audio_format = excluded.audio_format,
created_at = CURRENT_TIMESTAMP
`,
        args: [wordId, voiceId, generated.audio, generated.format]
      });
    } catch (cacheError: unknown) {
      console.error('Error caching pronunciation:', cacheError);
    }

    // Return audio
    sendAudio(res, generated.audio, generated.format);
  } catch (error: unknown) {
    console.error('Pronounce word error:', error);
    res.status(500).json({
      success: false,
      error: {
        code: 'PRONOUNCE_WORD_ERROR',
        message: 'Error generating word pronunciation'
      }
    });
  }
}
/**
 * Remove every cached pronunciation so audio is regenerated with the current
 * model/settings the next time it is requested.
 *
 * The route mounts this behind the auth and admin middleware; the handler
 * itself only re-checks that an authenticated user id is present.
 *
 * Responds with `{ success: true, data: { message, deletedCount } }`, or with
 * 401 `UNAUTHORIZED` / 500 `CLEAR_CACHE_ERROR` on failure.
 */
export async function clearPronunciationsCache(req: AuthRequest, res: Response) {
  // Shared shape for the two error responses this handler can produce.
  const fail = (status: number, code: string, message: string) =>
    res.status(status).json({
      success: false,
      error: { code, message }
    });

  try {
    if (!req.userId) {
      return fail(401, 'UNAUTHORIZED', 'Authentication required');
    }

    // Delete all cached pronunciations
    const deletion = await db.execute({
      sql: 'DELETE FROM word_pronunciations'
    });
    const deletedCount = deletion.rowsAffected || 0;

    res.json({
      success: true,
      data: {
        message: 'Pronunciation cache cleared successfully',
        deletedCount
      }
    });
  } catch (err: unknown) {
    console.error('Clear pronunciations cache error:', err);
    fail(500, 'CLEAR_CACHE_ERROR', 'Error clearing pronunciation cache');
  }
}

35
backend/src/db/migrate.ts

@ -297,6 +297,41 @@ const migrations = [
console.log('✓ Settings profiles tables already exist, skipping'); console.log('✓ Settings profiles tables already exist, skipping');
} }
} }
},
// Migration 7: adds the word_pronunciations table, which stores one audio blob
// per (word_id, voice_id) pair and is removed together with its word (ON DELETE CASCADE).
{
id: 7,
name: 'create_word_pronunciations',
up: async () => {
// Check if table already exists so that re-running this migration is a no-op
const tableCheck = await db.execute(`
SELECT name FROM sqlite_master
WHERE type='table' AND name='word_pronunciations'
`);
if (tableCheck.rows.length === 0) {
// Create word_pronunciations table; UNIQUE(word_id, voice_id) is what the
// controller's ON CONFLICT upsert relies on.
await db.execute(`
CREATE TABLE word_pronunciations (
id INTEGER PRIMARY KEY AUTOINCREMENT,
word_id INTEGER NOT NULL,
voice_id TEXT NOT NULL DEFAULT 'default',
audio_data BLOB NOT NULL,
audio_format TEXT NOT NULL DEFAULT 'mp3',
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (word_id) REFERENCES words(id) ON DELETE CASCADE,
UNIQUE(word_id, voice_id)
)
`);
// Create indexes (only reached when the table was just created above).
// NOTE(review): the UNIQUE(word_id, voice_id) constraint presumably already
// provides an index usable for word_id lookups, which would make
// idx_word_pronunciations_word_id redundant — confirm before relying on it.
await db.execute('CREATE INDEX IF NOT EXISTS idx_word_pronunciations_word_id ON word_pronunciations(word_id)');
await db.execute('CREATE INDEX IF NOT EXISTS idx_word_pronunciations_voice_id ON word_pronunciations(voice_id)');
console.log('✓ Created word_pronunciations table');
} else {
console.log('✓ Word pronunciations table already exists, skipping');
}
}
} }
]; ];

2
backend/src/index.ts

@ -13,6 +13,7 @@ import wordGroupsRoutes from './routes/wordGroups.routes.js';
import usersRoutes from './routes/users.routes.js'; import usersRoutes from './routes/users.routes.js';
import settingsProfilesRoutes from './routes/settingsProfiles.routes.js'; import settingsProfilesRoutes from './routes/settingsProfiles.routes.js';
import magicCodeRoutes from './routes/magicCode.routes.js'; import magicCodeRoutes from './routes/magicCode.routes.js';
import speechSoundsRoutes from './routes/speechSounds.routes.js';
import { errorHandler } from './middleware/errorHandler.js'; import { errorHandler } from './middleware/errorHandler.js';
import { apiLimiter } from './middleware/rateLimiter.js'; import { apiLimiter } from './middleware/rateLimiter.js';
import { createWebSocketServer } from './services/websocket.service.js'; import { createWebSocketServer } from './services/websocket.service.js';
@ -56,6 +57,7 @@ async function startServer() {
app.use('/api/users', usersRoutes); app.use('/api/users', usersRoutes);
app.use('/api/settings-profiles', settingsProfilesRoutes); app.use('/api/settings-profiles', settingsProfilesRoutes);
app.use('/api/magic-code', magicCodeRoutes); app.use('/api/magic-code', magicCodeRoutes);
app.use('/api/speech-sounds', speechSoundsRoutes);
// Error handling // Error handling
app.use(errorHandler); app.use(errorHandler);

14
backend/src/routes/speechSounds.routes.ts

@ -0,0 +1,14 @@
import { Router } from 'express';
import { pronounceWord, clearPronunciationsCache } from '../controllers/speechSounds.controller.js';
import { authMiddleware } from '../middleware/auth.js';
import { adminMiddleware } from '../middleware/admin.js';
// Router for the speech-sounds endpoints (pronunciation audio and its cache).
const router = Router();
// Public route - no authentication required for pronunciation
// NOTE(review): on a cache miss the handler calls the external text-to-speech
// service, so unauthenticated clients can trigger upstream requests — confirm
// that rate limiting is applied where this router is mounted.
router.get('/pronounce/:wordId', pronounceWord);
// Admin route - clear pronunciation cache (requires authMiddleware, then adminMiddleware)
router.delete('/cache', authMiddleware, adminMiddleware, clearPronunciationsCache);
export default router;

74
backend/src/services/elevenlabs.service.ts

@ -0,0 +1,74 @@
import { env } from '../config/env.js';
// Base URL of the ElevenLabs REST API; endpoint paths are appended to it.
const ELEVENLABS_API_URL = 'https://api.elevenlabs.io/v1';
// Voice used when the caller of generateSpeech does not pass a voiceId.
const DEFAULT_VOICE_ID = '21m00Tcm4TlvDq8ikWAM'; // Rachel - a friendly, clear voice
/** Result of a successful text-to-speech request. */
export interface ElevenLabsResponse {
// Raw audio bytes taken from the API response body.
audio: Buffer;
// Short format name derived from the response content type: 'mp3', 'wav' or 'ogg'.
format: string;
}
/**
 * Generate speech audio using the ElevenLabs text-to-speech API.
 *
 * @param text The text to convert to speech; surrounding whitespace is trimmed.
 * @param voiceId Optional voice ID (defaults to Rachel).
 * @returns The audio bytes and a short format name ('mp3', 'wav' or 'ogg').
 * @throws Error "ElevenLabs API key is not configured" when no API key is set.
 * @throws Error "Text cannot be empty" when `text` is empty or only whitespace.
 * @throws Error "ElevenLabs API error: <status> - <body>" on a non-2xx response.
 * @throws Error "Failed to generate speech: <reason>" on any other failure.
 */
export async function generateSpeech(
  text: string,
  voiceId: string = DEFAULT_VOICE_ID
): Promise<ElevenLabsResponse> {
  if (!env.elevenLabsApiKey) {
    throw new Error('ElevenLabs API key is not configured');
  }
  const trimmedText = text ? text.trim() : '';
  if (trimmedText.length === 0) {
    throw new Error('Text cannot be empty');
  }

  try {
    // Encode the voice id so it can only ever occupy a single path segment.
    const endpoint = `${ELEVENLABS_API_URL}/text-to-speech/${encodeURIComponent(voiceId)}`;
    const response = await fetch(endpoint, {
      method: 'POST',
      headers: {
        'Accept': 'audio/mpeg',
        'Content-Type': 'application/json',
        'xi-api-key': env.elevenLabsApiKey
      },
      body: JSON.stringify({
        text: trimmedText,
        model_id: 'eleven_turbo_v2_5'
        // No voice_settings - uses the voice's default settings from ElevenLabs
      })
    });

    if (!response.ok) {
      const errorText = await response.text();
      console.error('ElevenLabs API error:', response.status, errorText);
      throw new Error(`ElevenLabs API error: ${response.status} - ${errorText}`);
    }

    const audioBuffer = Buffer.from(await response.arrayBuffer());
    const contentType = response.headers.get('content-type') || 'audio/mpeg';

    // Determine format from content type; anything unrecognised (including
    // 'audio/mpeg') is treated as mp3.
    const knownFormats = ['mp3', 'wav', 'ogg'];
    const format = knownFormats.find((name) => contentType.includes(name)) ?? 'mp3';

    return {
      audio: audioBuffer,
      format
    };
  } catch (error: unknown) {
    // HTTP-level failures raised above are already logged and descriptive: pass them through.
    if (error instanceof Error && error.message.includes('ElevenLabs API error')) {
      throw error;
    }
    console.error('Error calling ElevenLabs API:', error);
    // A thrown value is not guaranteed to be an Error, so narrow before reading .message.
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to generate speech: ${reason}`);
  }
}

54
frontend/src/pages/SpeechSoundsApp.tsx

@ -1,5 +1,6 @@
import { useState, useEffect } from 'react'; import { useState, useEffect } from 'react';
import { wordGroupsApi } from '../services/apiClient'; import { wordGroupsApi } from '../services/apiClient';
import { playWordPronunciation } from '../services/audioService';
interface Word { interface Word {
id: number; id: number;
@ -28,6 +29,8 @@ export function SpeechSoundsApp() {
const [loading, setLoading] = useState(true); const [loading, setLoading] = useState(true);
const [error, setError] = useState<string | null>(null); const [error, setError] = useState<string | null>(null);
const [showWordPractice, setShowWordPractice] = useState(false); const [showWordPractice, setShowWordPractice] = useState(false);
const [playingWordId, setPlayingWordId] = useState<number | null>(null);
const [audioError, setAudioError] = useState<string | null>(null);
// Load practice data from localStorage // Load practice data from localStorage
useEffect(() => { useEffect(() => {
@ -125,6 +128,26 @@ export function SpeechSoundsApp() {
setCurrentWordIndex(0); setCurrentWordIndex(0);
}; };
/**
 * Play the pronunciation for a word, tracking which word is playing and
 * surfacing a temporary error message when playback fails.
 */
const handlePlayPronunciation = async (wordId: number) => {
  const clearPlaying = () => {
    setPlayingWordId(null);
  };
  const clearError = () => {
    setAudioError(null);
  };

  setPlayingWordId(wordId);
  clearError();

  try {
    await playWordPronunciation(wordId);
  } catch (err: unknown) {
    console.error('Error playing pronunciation:', err);
    setAudioError('Unable to play pronunciation. Please try again.');
    clearPlaying();
    // Clear error after 3 seconds
    setTimeout(clearError, 3000);
    return;
  }

  // Reset playing state after a short delay to allow audio to start
  setTimeout(clearPlaying, 100);
};
if (loading) { if (loading) {
return ( return (
<div className="min-h-[calc(100vh-60px)] bg-background px-6 py-6 max-w-[900px] mx-auto"> <div className="min-h-[calc(100vh-60px)] bg-background px-6 py-6 max-w-[900px] mx-auto">
@ -179,14 +202,35 @@ export function SpeechSoundsApp() {
<div className="bg-card rounded-[32px] p-10 shadow-lg border-4 border-primary"> <div className="bg-card rounded-[32px] p-10 shadow-lg border-4 border-primary">
<div className="text-center mb-10"> <div className="text-center mb-10">
<div className="flex items-center justify-center gap-4 mb-5">
<h2 <h2
className="text-[72px] md:text-[72px] text-[48px] font-black mb-5 tracking-[4px] bg-gradient-to-r from-primary to-accent bg-clip-text text-transparent animate-[wordBounce_0.5s_ease-out]" className="text-[72px] md:text-[72px] text-[48px] font-black tracking-[4px] bg-gradient-to-r from-primary to-accent bg-clip-text text-transparent animate-[wordBounce_0.5s_ease-out]"
style={{ style={{
animation: 'wordBounce 0.5s ease-out', animation: 'wordBounce 0.5s ease-out',
}} }}
> >
{currentWord.word} {currentWord.word}
</h2> </h2>
<button
onClick={() => handlePlayPronunciation(currentWord.id)}
disabled={playingWordId === currentWord.id}
className={`w-16 h-16 rounded-full border-4 border-primary flex items-center justify-center text-3xl font-bold transition-all shadow-lg hover:scale-110 hover:shadow-xl disabled:opacity-50 disabled:cursor-not-allowed ${
playingWordId === currentWord.id
? 'bg-primary text-primary-foreground animate-pulse'
: 'bg-primary text-primary-foreground hover:bg-primary/90'
}`}
title="Play pronunciation"
>
{playingWordId === currentWord.id ? (
<span className="animate-spin"></span>
) : (
'▶'
)}
</button>
</div>
{audioError && (
<p className="text-sm text-destructive mb-2">{audioError}</p>
)}
<div className="flex justify-center gap-5 mt-5 flex-wrap"> <div className="flex justify-center gap-5 mt-5 flex-wrap">
<span className="text-lg font-bold py-3 px-5 rounded-[25px] shadow-md border-[3px] bg-[#10b981] border-[#10b981] text-white"> <span className="text-lg font-bold py-3 px-5 rounded-[25px] shadow-md border-[3px] bg-[#10b981] border-[#10b981] text-white">
{passCount} Pass {passCount} Pass
@ -219,10 +263,10 @@ export function SpeechSoundsApp() {
<div className="flex gap-2.5"> <div className="flex gap-2.5">
<button <button
onClick={() => togglePractice(currentWord.id, i, 'pass')} onClick={() => togglePractice(currentWord.id, i, 'pass')}
className={`w-11 h-11 border-[3px] rounded-xl bg-white text-2xl font-bold cursor-pointer transition-all flex items-center justify-center shadow-md ${ className={`w-11 h-11 border-[3px] rounded-xl text-2xl font-bold cursor-pointer transition-all flex items-center justify-center shadow-md ${
isPass isPass
? 'bg-[#10b981] text-white border-[#10b981] scale-110 shadow-lg' ? 'bg-[#10b981] text-white border-[#10b981] scale-110 shadow-lg'
: 'text-[#065f46] border-[#10b981] hover:bg-[#10b981] hover:border-[#10b981] hover:text-white hover:scale-110 hover:shadow-lg' : 'bg-white text-[#065f46] border-[#10b981] hover:bg-[#10b981] hover:border-[#10b981] hover:text-white hover:scale-110 hover:shadow-lg'
}`} }`}
title="Mark as pass" title="Mark as pass"
> >
@ -230,10 +274,10 @@ export function SpeechSoundsApp() {
</button> </button>
<button <button
onClick={() => togglePractice(currentWord.id, i, 'fail')} onClick={() => togglePractice(currentWord.id, i, 'fail')}
className={`w-11 h-11 border-[3px] rounded-xl bg-white text-2xl font-bold cursor-pointer transition-all flex items-center justify-center shadow-md ${ className={`w-11 h-11 border-[3px] rounded-xl text-2xl font-bold cursor-pointer transition-all flex items-center justify-center shadow-md ${
isFail isFail
? 'bg-[#ef4444] text-white border-[#ef4444] scale-110 shadow-lg' ? 'bg-[#ef4444] text-white border-[#ef4444] scale-110 shadow-lg'
: 'text-[#991b1b] border-[#ef4444] hover:bg-[#ef4444] hover:border-[#ef4444] hover:text-white hover:scale-110 hover:shadow-lg' : 'bg-white text-[#991b1b] border-[#ef4444] hover:bg-[#ef4444] hover:border-[#ef4444] hover:text-white hover:scale-110 hover:shadow-lg'
}`} }`}
title="Mark as fail" title="Mark as fail"
> >

47
frontend/src/services/audioService.ts

@ -0,0 +1,47 @@
const API_BASE_URL = import.meta.env.VITE_API_URL || 'http://localhost:8080/api';
/**
 * Play pronunciation audio for a word.
 *
 * Playback is requested immediately instead of waiting for a `loadeddata`
 * event: where the browser does not preload media until playback is requested,
 * that event may never fire and the returned promise would stay pending.
 * Calling `play()` right away also keeps the request inside the call stack of
 * the user gesture that triggered it.
 *
 * @param wordId The ID of the word to pronounce
 * @returns Promise that resolves when audio starts playing and rejects with
 *   "Failed to load audio" or "Failed to play audio" otherwise
 */
export async function playWordPronunciation(wordId: number): Promise<void> {
  const audio = new Audio(`${API_BASE_URL}/speech-sounds/pronounce/${wordId}`);

  return new Promise<void>((resolve, reject) => {
    // Load/decode failures are reported through the element's error event.
    audio.onerror = (event) => {
      console.error('Audio loading error:', event);
      reject(new Error('Failed to load audio'));
    };

    // play() settles once playback has begun, or rejects if it cannot start;
    // whichever of the two rejections happens first wins.
    audio.play().then(
      () => {
        resolve();
      },
      (error: unknown) => {
        console.error('Error playing audio:', error);
        reject(new Error('Failed to play audio'));
      }
    );
  });
}
/**
 * Probe whether pronunciation audio can be fetched for a word, without playing it.
 *
 * Sends a HEAD request to the pronunciation endpoint and reports whether the
 * response status was successful; network failures count as "not available".
 *
 * @param wordId The ID of the word to check
 * @returns Promise that resolves to true if audio is available
 */
export async function checkAudioAvailability(wordId: number): Promise<boolean> {
  const endpoint = `${API_BASE_URL}/speech-sounds/pronounce/${wordId}`;
  try {
    const { ok } = await fetch(endpoint, { method: 'HEAD' });
    return ok;
  } catch {
    return false;
  }
}
Loading…
Cancel
Save