feat(speech): add speech-to-text and wake word detection modules

- Implement SpeechToText class with Docker-based transcription capabilities
- Add wake word detection using OpenWakeWord and fast-whisper models
- Create Dockerfile for speech processing container
- Develop comprehensive test suite for speech recognition functionality
- Include audio processing and event-driven transcription features
2025-02-04 19:08:01 +01:00
parent 47f11b3d95
commit 60f18f8e71
5 changed files with 649 additions and 246 deletions
--- a/src/speech/tests/speechToText.test.ts
+++ b/src/speech/tests/speechToText.test.ts
@@ -0,0 +1,114 @@
+import { SpeechToText, WakeWordEvent } from '../speechToText';
+import fs from 'fs';
+import path from 'path';
+
+/**
+ * Tests for SpeechToText.
+ *
+ * NOTE(review): nothing in this suite mocks `child_process`, so it presumably needs a
+ * running `fast-whisper` Docker container to pass — confirm against the CI setup.
+ */
+describe('SpeechToText', () => {
+    let speechToText: SpeechToText;
+    const testAudioDir = path.join(__dirname, 'test_audio');
+    const wakeWordFileName = 'wake_word_20240203_123456.wav';
+
+    /**
+     * Runs assertions made inside an event listener and reports the outcome through
+     * Jest's `done`, so a failed expectation fails the test directly instead of being
+     * thrown into the emitter (where it would only show up as a timeout or an
+     * uncaught error).
+     */
+    const finish = (done: (error?: unknown) => void, assertions: () => void): void => {
+        try {
+            assertions();
+            done();
+        } catch (error: unknown) {
+            done(error);
+        }
+    };
+
+    beforeEach(() => {
+        speechToText = new SpeechToText('fast-whisper');
+        // Every detected wake word also starts a background transcription whose failure is
+        // re-emitted as an 'error' event. An EventEmitter throws when 'error' has no
+        // listener, so keep a no-op listener attached for tests that do not care about it.
+        speechToText.on('error', () => { });
+        // `recursive: true` makes this a no-op when the directory already exists.
+        fs.mkdirSync(testAudioDir, { recursive: true });
+    });
+
+    afterEach(() => {
+        // Stop watching before deleting, so the clean-up itself does not trigger events.
+        speechToText.stopWakeWordDetection();
+        // `force: true` makes this a no-op when the directory is already gone.
+        fs.rmSync(testAudioDir, { recursive: true, force: true });
+    });
+
+    describe('checkHealth', () => {
+        it('should resolve to a boolean health status', async () => {
+            const isHealthy = await speechToText.checkHealth();
+            // The result depends on the local Docker state, so pin the type rather than the value.
+            expect(typeof isHealthy).toBe('boolean');
+        });
+    });
+
+    describe('wake word detection', () => {
+        it('should detect new audio files and emit wake word events', (done) => {
+            const testFile = path.join(testAudioDir, wakeWordFileName);
+            const testMetadata = `${testFile}.json`;
+
+            speechToText.startWakeWordDetection(testAudioDir);
+
+            // once(): the watcher may report the same file more than once, and done()
+            // must only be called a single time.
+            speechToText.once('wake_word', (event: WakeWordEvent) => {
+                finish(done, () => {
+                    expect(event).toBeDefined();
+                    expect(event.audioFile).toBe(testFile);
+                    expect(event.metadataFile).toBe(testMetadata);
+                    // The timestamp is parsed out of the file name, so assert on that
+                    // relationship rather than on one particular slice of the name.
+                    expect(event.timestamp).not.toBe('');
+                    expect(wakeWordFileName).toContain(event.timestamp);
+                });
+            });
+
+            // Create a test audio file to trigger the event
+            fs.writeFileSync(testFile, 'test audio content');
+        });
+
+        it('should automatically transcribe detected wake word audio', (done) => {
+            const testFile = path.join(testAudioDir, wakeWordFileName);
+
+            speechToText.startWakeWordDetection(testAudioDir);
+
+            // NOTE(review): the file written below is not real WAV data; this presumably
+            // only passes if the container transcribes it anyway — confirm.
+            speechToText.once('transcription', (event: { audioFile: string; result: unknown }) => {
+                finish(done, () => {
+                    expect(event).toBeDefined();
+                    expect(event.audioFile).toBe(testFile);
+                    expect(event.result).toBeDefined();
+                });
+            });
+
+            // Create a test audio file to trigger the event
+            fs.writeFileSync(testFile, 'test audio content');
+        });
+
+        it('should handle errors during wake word audio transcription', (done) => {
+            const testFile = path.join(testAudioDir, wakeWordFileName);
+
+            speechToText.startWakeWordDetection(testAudioDir);
+
+            speechToText.once('error', (error: Error) => {
+                finish(done, () => {
+                    expect(error).toBeDefined();
+                    expect(error.message).toContain('Transcription failed');
+                });
+            });
+
+            // Create an invalid audio file to trigger an error
+            fs.writeFileSync(testFile, 'invalid audio content');
+        });
+    });
+
+    describe('transcribeAudio', () => {
+        it('should transcribe an audio file', async () => {
+            const result = await speechToText.transcribeAudio('/audio/test.wav');
+
+            expect(result).toBeDefined();
+            expect(result.text).toBeDefined();
+            expect(result.segments).toBeDefined();
+            expect(Array.isArray(result.segments)).toBe(true);
+        }, 30000);
+
+        it('should handle transcription errors', async () => {
+            await expect(
+                speechToText.transcribeAudio('/audio/nonexistent.wav')
+            ).rejects.toThrow();
+        });
+
+        it('should emit progress events', (done) => {
+            const progressEvents: Array<{ type: string; data: string }> = [];
+
+            const onProgress = (event: { type: string; data: string }): void => {
+                progressEvents.push(event);
+                if (event.type === 'stderr' && event.data.includes('error')) {
+                    // stderr can arrive in several chunks; detach so done() runs only once.
+                    speechToText.off('progress', onProgress);
+                    finish(done, () => {
+                        expect(progressEvents.length).toBeGreaterThan(0);
+                    });
+                }
+            };
+            speechToText.on('progress', onProgress);
+
+            // Trigger an error to test progress events
+            speechToText.transcribeAudio('/audio/nonexistent.wav').catch(() => { });
+        });
+    });
+});
--- a/src/speech/speechToText.ts
+++ b/src/speech/speechToText.ts
@@ -0,0 +1,161 @@
+import { spawn } from 'child_process';
+import { EventEmitter } from 'events';
+import { existsSync, watch } from 'fs';
+import path from 'path';
+
+/** Model identifiers accepted by `TranscriptionOptions.model`. */
+type WhisperModelName = 'tiny.en' | 'base.en' | 'small.en' | 'medium.en' | 'large-v2';
+
+/** Device names accepted by `TranscriptionOptions.device`. */
+type InferenceDevice = 'cpu' | 'cuda';
+
+/**
+ * Optional settings for a single `SpeechToText.transcribeAudio` call.
+ *
+ * Each field is forwarded to the transcription command as the flag named below;
+ * fields that are left out fall back to the defaults chosen inside `transcribeAudio`.
+ */
+export interface TranscriptionOptions {
+    /** Value passed as `--model`. */
+    model?: WhisperModelName;
+    /** Value passed as `--language`. */
+    language?: string;
+    /** Value passed as `--temperature`. */
+    temperature?: number;
+    /** Value passed as `--beam-size`. */
+    beamSize?: number;
+    /** Value passed as `--patience`. */
+    patience?: number;
+    /** Value passed as `--device`. */
+    device?: InferenceDevice;
+}
+
+/** One entry of `TranscriptionResult.segments`. */
+interface TranscriptionSegment {
+    /** Text recognised within this segment. */
+    text: string;
+    /** Start position of the segment (presumably seconds from the start of the audio — confirm). */
+    start: number;
+    /** End position of the segment, in the same unit as `start`. */
+    end: number;
+    /** Confidence score reported for this segment (value range not visible here — confirm). */
+    confidence: number;
+}
+
+/**
+ * Shape that `SpeechToText.transcribeAudio` resolves with; it is read from the JSON
+ * the transcription command prints on stdout.
+ */
+export interface TranscriptionResult {
+    /** Transcribed text. */
+    text: string;
+    /** Per-segment breakdown of the transcription. */
+    segments: TranscriptionSegment[];
+}
+
+/** Payload of the `'wake_word'` event emitted by `SpeechToText`. */
+export interface WakeWordEvent {
+    /** Path of the detected recording: the watched directory joined with the file name. */
+    audioFile: string;
+    /** `audioFile` with `.json` appended; the emitter does not check that this file exists. */
+    metadataFile: string;
+    /** Timestamp text taken from the recording's file name. */
+    timestamp: string;
+}
+
+/**
+ * Error that `SpeechToText.transcribeAudio` rejects with when the transcription
+ * command fails or its output cannot be parsed.
+ */
+export class TranscriptionError extends Error {
+    /**
+     * @param message - Human-readable description of the failure.
+     */
+    constructor(message: string) {
+        super(message);
+        // When compiled down to ES5, `super(...)` on Error yields a plain Error and the
+        // subclass prototype is lost; restoring it keeps `instanceof TranscriptionError`
+        // working. On ES2015+ targets this is a no-op.
+        Object.setPrototypeOf(this, new.target.prototype);
+        this.name = 'TranscriptionError';
+    }
+}
+
+/** File-name prefix of the recordings that wake word detection reacts to. */
+const WAKE_WORD_FILE_PREFIX = 'wake_word_';
+/** File extension of the recordings that wake word detection reacts to. */
+const WAKE_WORD_FILE_SUFFIX = '.wav';
+
+/**
+ * Extracts the timestamp portion of a wake word recording's file name.
+ *
+ * @param filename - Base name of the file, e.g. `wake_word_20240203_123456.wav`.
+ * @returns Everything between the `wake_word_` prefix and the `.wav` extension
+ *          (`20240203_123456` for the example above), or `undefined` when the name
+ *          does not follow that pattern.
+ */
+function extractWakeWordTimestamp(filename: string): string | undefined {
+    if (!filename.startsWith(WAKE_WORD_FILE_PREFIX) || !filename.endsWith(WAKE_WORD_FILE_SUFFIX)) {
+        return undefined;
+    }
+    // Slice by the known prefix/suffix lengths: splitting on '_' would cut a timestamp
+    // that itself contains an underscore.
+    return filename.slice(WAKE_WORD_FILE_PREFIX.length, -WAKE_WORD_FILE_SUFFIX.length);
+}
+
+/**
+ * Parses the JSON printed by the transcription command and checks its basic shape.
+ *
+ * @param output - Complete stdout of the command.
+ * @returns The parsed transcription result.
+ * @throws TranscriptionError when `output` is not valid JSON, or when the parsed value
+ *         lacks a string `text` or an array `segments`.
+ */
+function parseTranscriptionOutput(output: string): TranscriptionResult {
+    let parsed: unknown;
+    try {
+        parsed = JSON.parse(output);
+    } catch (error: unknown) {
+        const reason = error instanceof Error ? error.message : 'Unknown error';
+        throw new TranscriptionError(`Failed to parse transcription result: ${reason}`);
+    }
+
+    if (typeof parsed !== 'object' || parsed === null) {
+        throw new TranscriptionError('Failed to parse transcription result: expected a JSON object');
+    }
+    const candidate = parsed as Partial<Record<keyof TranscriptionResult, unknown>>;
+    if (typeof candidate.text !== 'string' || !Array.isArray(candidate.segments)) {
+        throw new TranscriptionError('Failed to parse transcription result: missing "text" or "segments"');
+    }
+    return parsed as TranscriptionResult;
+}
+
+/**
+ * Transcribes audio by running the `fast-whisper` command inside a Docker container
+ * and can watch a directory for wake word recordings to transcribe automatically.
+ *
+ * Events emitted:
+ * - `'wake_word'` ({@link WakeWordEvent}) — a matching recording appeared in the watched directory.
+ * - `'transcription'` (`{ audioFile, result }`) — automatic transcription of a recording succeeded.
+ * - `'error'` (`Error`) — automatic transcription failed, or the directory watcher reported an error.
+ * - `'progress'` (`{ type: 'stdout' | 'stderr', data: string }`) — a chunk of command output arrived.
+ */
+export class SpeechToText extends EventEmitter {
+    private readonly containerName: string;
+    private audioWatcher?: ReturnType<typeof watch>;
+
+    /**
+     * @param containerName - Name of the Docker container the commands are run against.
+     */
+    constructor(containerName = 'fast-whisper') {
+        super();
+        this.containerName = containerName;
+    }
+
+    /**
+     * Starts watching `audioDir` for files named `wake_word_*.wav`. For each one that
+     * appears, emits `'wake_word'` and then transcribes the file, emitting
+     * `'transcription'` on success or `'error'` on failure.
+     *
+     * Calling this again replaces the previous watcher.
+     *
+     * @param audioDir - Directory to watch.
+     */
+    startWakeWordDetection(audioDir: string = './audio'): void {
+        // Close any watcher left over from an earlier call so it is not leaked.
+        this.stopWakeWordDetection();
+
+        const watcher = watch(audioDir, (eventType, filename) => {
+            if (eventType !== 'rename' || !filename) {
+                return;
+            }
+            const timestamp = extractWakeWordTimestamp(filename);
+            if (timestamp === undefined) {
+                return;
+            }
+
+            const audioFile = path.join(audioDir, filename);
+            // 'rename' is also reported when a file disappears; only react to files that exist.
+            if (!existsSync(audioFile)) {
+                return;
+            }
+
+            const event: WakeWordEvent = {
+                timestamp,
+                audioFile,
+                metadataFile: `${audioFile}.json`
+            };
+            this.emit('wake_word', event);
+
+            // Automatically transcribe the wake word audio
+            this.transcribeAudio(audioFile)
+                .then(result => {
+                    this.emit('transcription', { audioFile, result });
+                })
+                .catch(error => {
+                    this.emit('error', error);
+                });
+        });
+
+        // Surface watcher failures through this emitter instead of leaving them unhandled.
+        watcher.on('error', (error: Error) => {
+            this.emit('error', error);
+        });
+
+        this.audioWatcher = watcher;
+    }
+
+    /** Stops the directory watcher, if one is running. Safe to call repeatedly. */
+    stopWakeWordDetection(): void {
+        if (this.audioWatcher) {
+            this.audioWatcher.close();
+            this.audioWatcher = undefined;
+        }
+    }
+
+    /**
+     * Runs `docker exec <container> fast-whisper ... --output-json <audioFilePath>` and
+     * resolves with the parsed JSON it prints. Emits a `'progress'` event for every
+     * chunk of stdout and stderr.
+     *
+     * @param audioFilePath - Path handed to the command as its final argument.
+     * @param options - Optional flags; defaults are `base.en`, `en`, temperature `0`,
+     *                  beam size `5`, patience `1`, device `cpu`.
+     * @returns The transcription result.
+     * @throws TranscriptionError (as a rejection) when the command cannot be started,
+     *         exits with a non-zero code, or prints output that cannot be parsed.
+     */
+    async transcribeAudio(
+        audioFilePath: string,
+        options: TranscriptionOptions = {}
+    ): Promise<TranscriptionResult> {
+        const {
+            model = 'base.en',
+            language = 'en',
+            temperature = 0,
+            beamSize = 5,
+            patience = 1,
+            device = 'cpu'
+        } = options;
+
+        // Construct Docker command to run fast-whisper
+        const args = [
+            'exec',
+            this.containerName,
+            'fast-whisper',
+            '--model', model,
+            '--language', language,
+            '--temperature', temperature.toString(),
+            '--beam-size', beamSize.toString(),
+            '--patience', patience.toString(),
+            '--device', device,
+            '--output-json',
+            audioFilePath
+        ];
+
+        return new Promise<TranscriptionResult>((resolve, reject) => {
+            const child = spawn('docker', args);
+            let stdout = '';
+            let stderr = '';
+
+            // Let the streams decode UTF-8 so a character split across two chunks is not garbled.
+            child.stdout.setEncoding('utf8');
+            child.stderr.setEncoding('utf8');
+
+            child.stdout.on('data', (chunk: string) => {
+                stdout += chunk;
+                this.emit('progress', { type: 'stdout', data: chunk });
+            });
+
+            child.stderr.on('data', (chunk: string) => {
+                stderr += chunk;
+                this.emit('progress', { type: 'stderr', data: chunk });
+            });
+
+            // Emitted when the process cannot be started (e.g. the docker binary is missing).
+            // Without a listener this would be thrown as an uncaught exception.
+            child.on('error', (error: Error) => {
+                reject(new TranscriptionError(`Transcription failed: ${error.message}`));
+            });
+
+            // `code` is null when the process was terminated by a signal; that is a failure too.
+            child.on('close', (code: number | null) => {
+                if (code !== 0) {
+                    reject(new TranscriptionError(`Transcription failed: ${stderr}`));
+                    return;
+                }
+
+                try {
+                    resolve(parseTranscriptionOutput(stdout));
+                } catch (error: unknown) {
+                    reject(error);
+                }
+            });
+        });
+    }
+
+    /**
+     * Checks whether `docker ps` lists a container matching `containerName` whose
+     * status text contains "up".
+     *
+     * NOTE(review): the `name=` filter presumably matches partial names as well, so a
+     * differently named container containing `containerName` could satisfy this check —
+     * confirm against the Docker documentation.
+     *
+     * @returns `true` when such a container is listed; `false` otherwise, including
+     *          when the `docker` command cannot be started.
+     */
+    async checkHealth(): Promise<boolean> {
+        return new Promise<boolean>((resolve) => {
+            try {
+                const child = spawn('docker', ['ps', '--filter', `name=${this.containerName}`, '--format', '{{.Status}}']);
+                let output = '';
+
+                child.stdout.setEncoding('utf8');
+                child.stdout.on('data', (chunk: string) => {
+                    output += chunk;
+                });
+
+                // Start-up failures arrive as an 'error' event, not as a thrown exception.
+                child.on('error', () => {
+                    resolve(false);
+                });
+
+                child.on('close', (code: number | null) => {
+                    resolve(code === 0 && output.toLowerCase().includes('up'));
+                });
+            } catch {
+                // spawn() itself can throw synchronously, e.g. on invalid arguments.
+                resolve(false);
+            }
+        });
+    }
+}