feat(speech): add speech-to-text and wake word detection modules
- Implement SpeechToText class with Docker-based transcription capabilities - Add wake word detection using OpenWakeWord and fast-whisper models - Create Dockerfile for speech processing container - Develop comprehensive test suite for speech recognition functionality - Include audio processing and event-driven transcription features
This commit is contained in:
114
src/speech/__tests__/speechToText.test.ts
Normal file
114
src/speech/__tests__/speechToText.test.ts
Normal file
@@ -0,0 +1,114 @@
|
||||
import { SpeechToText, WakeWordEvent } from '../speechToText';
|
||||
import fs from 'fs';
|
||||
import path from 'path';
|
||||
|
||||
describe('SpeechToText', () => {
|
||||
let speechToText: SpeechToText;
|
||||
const testAudioDir = path.join(__dirname, 'test_audio');
|
||||
|
||||
beforeEach(() => {
|
||||
speechToText = new SpeechToText('fast-whisper');
|
||||
// Create test audio directory if it doesn't exist
|
||||
if (!fs.existsSync(testAudioDir)) {
|
||||
fs.mkdirSync(testAudioDir, { recursive: true });
|
||||
}
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
speechToText.stopWakeWordDetection();
|
||||
// Clean up test files
|
||||
if (fs.existsSync(testAudioDir)) {
|
||||
fs.rmSync(testAudioDir, { recursive: true, force: true });
|
||||
}
|
||||
});
|
||||
|
||||
describe('checkHealth', () => {
|
||||
it('should return true when the container is running', async () => {
|
||||
const isHealthy = await speechToText.checkHealth();
|
||||
expect(isHealthy).toBeDefined();
|
||||
});
|
||||
});
|
||||
|
||||
describe('wake word detection', () => {
|
||||
it('should detect new audio files and emit wake word events', (done) => {
|
||||
const testFile = path.join(testAudioDir, 'wake_word_20240203_123456.wav');
|
||||
const testMetadata = `${testFile}.json`;
|
||||
|
||||
speechToText.startWakeWordDetection(testAudioDir);
|
||||
|
||||
speechToText.on('wake_word', (event: WakeWordEvent) => {
|
||||
expect(event).toBeDefined();
|
||||
expect(event.audioFile).toBe(testFile);
|
||||
expect(event.metadataFile).toBe(testMetadata);
|
||||
expect(event.timestamp).toBe('123456');
|
||||
done();
|
||||
});
|
||||
|
||||
// Create a test audio file to trigger the event
|
||||
fs.writeFileSync(testFile, 'test audio content');
|
||||
});
|
||||
|
||||
it('should automatically transcribe detected wake word audio', (done) => {
|
||||
const testFile = path.join(testAudioDir, 'wake_word_20240203_123456.wav');
|
||||
|
||||
speechToText.startWakeWordDetection(testAudioDir);
|
||||
|
||||
speechToText.on('transcription', (event) => {
|
||||
expect(event).toBeDefined();
|
||||
expect(event.audioFile).toBe(testFile);
|
||||
expect(event.result).toBeDefined();
|
||||
done();
|
||||
});
|
||||
|
||||
// Create a test audio file to trigger the event
|
||||
fs.writeFileSync(testFile, 'test audio content');
|
||||
});
|
||||
|
||||
it('should handle errors during wake word audio transcription', (done) => {
|
||||
const testFile = path.join(testAudioDir, 'wake_word_20240203_123456.wav');
|
||||
|
||||
speechToText.startWakeWordDetection(testAudioDir);
|
||||
|
||||
speechToText.on('error', (error) => {
|
||||
expect(error).toBeDefined();
|
||||
expect(error.message).toContain('Transcription failed');
|
||||
done();
|
||||
});
|
||||
|
||||
// Create an invalid audio file to trigger an error
|
||||
fs.writeFileSync(testFile, 'invalid audio content');
|
||||
});
|
||||
});
|
||||
|
||||
describe('transcribeAudio', () => {
|
||||
it('should transcribe an audio file', async () => {
|
||||
const result = await speechToText.transcribeAudio('/audio/test.wav');
|
||||
|
||||
expect(result).toBeDefined();
|
||||
expect(result.text).toBeDefined();
|
||||
expect(result.segments).toBeDefined();
|
||||
expect(Array.isArray(result.segments)).toBe(true);
|
||||
}, 30000);
|
||||
|
||||
it('should handle transcription errors', async () => {
|
||||
await expect(
|
||||
speechToText.transcribeAudio('/audio/nonexistent.wav')
|
||||
).rejects.toThrow();
|
||||
});
|
||||
|
||||
it('should emit progress events', (done) => {
|
||||
const progressEvents: Array<{ type: string; data: string }> = [];
|
||||
|
||||
speechToText.on('progress', (event: { type: string; data: string }) => {
|
||||
progressEvents.push(event);
|
||||
if (event.type === 'stderr' && event.data.includes('error')) {
|
||||
expect(progressEvents.length).toBeGreaterThan(0);
|
||||
done();
|
||||
}
|
||||
});
|
||||
|
||||
// Trigger an error to test progress events
|
||||
speechToText.transcribeAudio('/audio/nonexistent.wav').catch(() => { });
|
||||
});
|
||||
});
|
||||
});
|
||||
161
src/speech/speechToText.ts
Normal file
161
src/speech/speechToText.ts
Normal file
@@ -0,0 +1,161 @@
|
||||
import { spawn } from 'child_process';
|
||||
import { EventEmitter } from 'events';
|
||||
import { watch } from 'fs';
|
||||
import path from 'path';
|
||||
|
||||
export interface TranscriptionOptions {
|
||||
model?: 'tiny.en' | 'base.en' | 'small.en' | 'medium.en' | 'large-v2';
|
||||
language?: string;
|
||||
temperature?: number;
|
||||
beamSize?: number;
|
||||
patience?: number;
|
||||
device?: 'cpu' | 'cuda';
|
||||
}
|
||||
|
||||
export interface TranscriptionResult {
|
||||
text: string;
|
||||
segments: Array<{
|
||||
text: string;
|
||||
start: number;
|
||||
end: number;
|
||||
confidence: number;
|
||||
}>;
|
||||
}
|
||||
|
||||
export interface WakeWordEvent {
|
||||
timestamp: string;
|
||||
audioFile: string;
|
||||
metadataFile: string;
|
||||
}
|
||||
|
||||
export class TranscriptionError extends Error {
|
||||
constructor(message: string) {
|
||||
super(message);
|
||||
this.name = 'TranscriptionError';
|
||||
}
|
||||
}
|
||||
|
||||
export class SpeechToText extends EventEmitter {
|
||||
private containerName: string;
|
||||
private audioWatcher?: ReturnType<typeof watch>;
|
||||
|
||||
constructor(containerName = 'fast-whisper') {
|
||||
super();
|
||||
this.containerName = containerName;
|
||||
}
|
||||
|
||||
startWakeWordDetection(audioDir: string = './audio'): void {
|
||||
// Watch for new audio files from wake word detection
|
||||
this.audioWatcher = watch(audioDir, (eventType, filename) => {
|
||||
if (eventType === 'rename' && filename && filename.startsWith('wake_word_') && filename.endsWith('.wav')) {
|
||||
const audioFile = path.join(audioDir, filename);
|
||||
const metadataFile = `${audioFile}.json`;
|
||||
|
||||
// Emit wake word event
|
||||
this.emit('wake_word', {
|
||||
timestamp: filename.split('_')[2].split('.')[0],
|
||||
audioFile,
|
||||
metadataFile
|
||||
} as WakeWordEvent);
|
||||
|
||||
// Automatically transcribe the wake word audio
|
||||
this.transcribeAudio(audioFile)
|
||||
.then(result => {
|
||||
this.emit('transcription', { audioFile, result });
|
||||
})
|
||||
.catch(error => {
|
||||
this.emit('error', error);
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
stopWakeWordDetection(): void {
|
||||
if (this.audioWatcher) {
|
||||
this.audioWatcher.close();
|
||||
this.audioWatcher = undefined;
|
||||
}
|
||||
}
|
||||
|
||||
async transcribeAudio(
|
||||
audioFilePath: string,
|
||||
options: TranscriptionOptions = {}
|
||||
): Promise<TranscriptionResult> {
|
||||
const {
|
||||
model = 'base.en',
|
||||
language = 'en',
|
||||
temperature = 0,
|
||||
beamSize = 5,
|
||||
patience = 1,
|
||||
device = 'cpu'
|
||||
} = options;
|
||||
|
||||
return new Promise((resolve, reject) => {
|
||||
// Construct Docker command to run fast-whisper
|
||||
const args = [
|
||||
'exec',
|
||||
this.containerName,
|
||||
'fast-whisper',
|
||||
'--model', model,
|
||||
'--language', language,
|
||||
'--temperature', temperature.toString(),
|
||||
'--beam-size', beamSize.toString(),
|
||||
'--patience', patience.toString(),
|
||||
'--device', device,
|
||||
'--output-json',
|
||||
audioFilePath
|
||||
];
|
||||
|
||||
const process = spawn('docker', args);
|
||||
let stdout = '';
|
||||
let stderr = '';
|
||||
|
||||
process.stdout.on('data', (data: Buffer) => {
|
||||
stdout += data.toString();
|
||||
this.emit('progress', { type: 'stdout', data: data.toString() });
|
||||
});
|
||||
|
||||
process.stderr.on('data', (data: Buffer) => {
|
||||
stderr += data.toString();
|
||||
this.emit('progress', { type: 'stderr', data: data.toString() });
|
||||
});
|
||||
|
||||
process.on('close', (code: number) => {
|
||||
if (code !== 0) {
|
||||
reject(new TranscriptionError(`Transcription failed: ${stderr}`));
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
const result = JSON.parse(stdout) as TranscriptionResult;
|
||||
resolve(result);
|
||||
} catch (error: unknown) {
|
||||
if (error instanceof Error) {
|
||||
reject(new TranscriptionError(`Failed to parse transcription result: ${error.message}`));
|
||||
} else {
|
||||
reject(new TranscriptionError('Failed to parse transcription result: Unknown error'));
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
async checkHealth(): Promise<boolean> {
|
||||
try {
|
||||
const process = spawn('docker', ['ps', '--filter', `name=${this.containerName}`, '--format', '{{.Status}}']);
|
||||
|
||||
return new Promise((resolve) => {
|
||||
let output = '';
|
||||
process.stdout.on('data', (data: Buffer) => {
|
||||
output += data.toString();
|
||||
});
|
||||
|
||||
process.on('close', (code: number) => {
|
||||
resolve(code === 0 && output.toLowerCase().includes('up'));
|
||||
});
|
||||
});
|
||||
} catch (error) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user