feat(speech): add speech-to-text and wake word detection modules
- Implement SpeechToText class with Docker-based transcription capabilities
- Add wake word detection using OpenWakeWord and fast-whisper models
- Create Dockerfile for speech processing container
- Develop comprehensive test suite for speech recognition functionality
- Include audio processing and event-driven transcription features
This commit is contained in:
39
docker/speech/Dockerfile
Normal file
39
docker/speech/Dockerfile
Normal file
@@ -0,0 +1,39 @@
|
||||
FROM python:3.10-slim

# System dependencies: build toolchain + PortAudio headers for PyAudio/sounddevice
RUN apt-get update && apt-get install -y \
    git \
    build-essential \
    portaudio19-dev \
    python3-pyaudio \
    && rm -rf /var/lib/apt/lists/*

# Install CPU-only torch first so faster-whisper does not pull CUDA wheels
RUN pip install --no-cache-dir torch torchaudio --index-url https://download.pytorch.org/whl/cpu
# BUG FIX: the PyPI distribution is "faster-whisper" (imported as
# faster_whisper in the model-download step below); "fast-whisper" is not
# the same package and the build would not provide the import used later.
RUN pip install --no-cache-dir faster-whisper

# Wake word detection + audio capture libraries
RUN pip install --no-cache-dir openwakeword pyaudio sounddevice

# Directories for model caches and captured audio segments
RUN mkdir -p /models /audio

# Pre-download the base English model at build time. Constructing WhisperModel
# with download_root fetches the weights into /models; the original called a
# nonexistent WhisperModel.download_model classmethod.
RUN python -c "from faster_whisper import WhisperModel; WhisperModel('base.en', download_root='/models')"

# Download OpenWakeWord models. download_models is provided by
# openwakeword.utils; verify the keyword names against the pinned version.
RUN mkdir -p /models/wake_word && \
    python -c "import openwakeword.utils; openwakeword.utils.download_models(model_names=['hey_jarvis', 'ok_google', 'alexa'], target_directory='/models/wake_word')"

WORKDIR /app

# Copy the wake word detection script
COPY wake_word_detector.py .

# Environment consumed by wake_word_detector.py
ENV WHISPER_MODEL_PATH=/models
ENV WAKEWORD_MODEL_PATH=/models/wake_word
ENV PYTHONUNBUFFERED=1

# Run the wake word detection service
CMD ["python", "wake_word_detector.py"]
|
||||
104
docker/speech/wake_word_detector.py
Normal file
104
docker/speech/wake_word_detector.py
Normal file
@@ -0,0 +1,104 @@
|
||||
import os
|
||||
import json
|
||||
import queue
|
||||
import threading
|
||||
import numpy as np
|
||||
import sounddevice as sd
|
||||
from openwakeword import Model
|
||||
from datetime import datetime
|
||||
import wave
|
||||
|
||||
# Configuration
SAMPLE_RATE = 16000  # Hz; capture rate used for both the stream and the saved WAV
CHANNELS = 1  # mono capture; audio_callback downmixes if this is ever > 1
CHUNK_SIZE = 1024  # frames per sounddevice InputStream callback block
BUFFER_DURATION = 30  # seconds to keep in buffer
DETECTION_THRESHOLD = 0.5  # minimum wake-word confidence score that triggers a save
|
||||
|
||||
class AudioProcessor:
    """Continuously listens on the default input device, scores incoming
    audio against OpenWakeWord models, and — whenever a wake word scores
    above DETECTION_THRESHOLD — saves the last BUFFER_DURATION seconds of
    audio to /audio as a 16-bit WAV plus a JSON metadata sidecar."""

    def __init__(self):
        # Wake-word model bundle; model files are loaded from
        # WAKEWORD_MODEL_PATH, defaulting to /models/wake_word.
        self.wake_word_model = Model(
            wakeword_models=["hey_jarvis", "ok_google", "alexa"],
            model_path=os.environ.get('WAKEWORD_MODEL_PATH', '/models/wake_word')
        )
        self.audio_buffer = queue.Queue()  # NOTE(review): unused in this file; kept for compatibility
        self.recording = False  # NOTE(review): unused flag; kept for compatibility
        # Rolling window holding the most recent BUFFER_DURATION seconds of samples.
        self.buffer = np.zeros(SAMPLE_RATE * BUFFER_DURATION)
        self.buffer_lock = threading.Lock()

    def audio_callback(self, indata, frames, time, status):
        """sounddevice InputStream callback: fold the new chunk into the
        rolling buffer and run wake-word detection on it.

        Signature matches sounddevice's callback contract; `indata` is a
        (frames, channels) array and `status` reports over/underruns.
        """
        if status:
            print(f"Audio callback status: {status}")

        # Downmix to mono if we ever capture more than one channel.
        if CHANNELS > 1:
            audio_data = np.mean(indata, axis=1)
        else:
            audio_data = indata.flatten()

        # Shift the circular buffer left and append the new chunk at the end.
        with self.buffer_lock:
            self.buffer = np.roll(self.buffer, -len(audio_data))
            self.buffer[-len(audio_data):] = audio_data

        # Score this chunk against every loaded wake-word model.
        # NOTE(review): openwakeword examples typically feed int16 frames;
        # confirm the installed version accepts float input as passed here.
        prediction = self.wake_word_model.predict(audio_data)

        # Save the buffered audio on the first wake word over threshold.
        for wake_word, score in prediction.items():
            if score > DETECTION_THRESHOLD:
                print(f"Wake word detected: {wake_word} (confidence: {score:.2f})")
                self.save_audio_segment()
                break

    def save_audio_segment(self):
        """Save the rolling audio buffer (and a JSON metadata sidecar)
        when a wake word is detected."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"/audio/wake_word_{timestamp}.wav"

        # Snapshot under buffer_lock so a concurrent callback cannot
        # mutate the buffer mid-write.
        with self.buffer_lock:
            snapshot = self.buffer.copy()

        # Write the snapshot as 16-bit PCM WAV.
        with wave.open(filename, 'wb') as wf:
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(2)  # 16-bit audio
            wf.setframerate(SAMPLE_RATE)

            # Convert float samples (assumed in [-1, 1]) to int16 PCM.
            audio_data = (snapshot * 32767).astype(np.int16)
            wf.writeframes(audio_data.tobytes())

        # BUG FIX: original printed the literal "(unknown)" instead of
        # interpolating the output path.
        print(f"Saved audio segment to {filename}")

        # Metadata sidecar describing the capture.
        metadata = {
            "timestamp": timestamp,
            "sample_rate": SAMPLE_RATE,
            "channels": CHANNELS,
            "duration": BUFFER_DURATION
        }

        # BUG FIX: original opened a file literally named "(unknown).json";
        # the sidecar now lands next to its WAV file.
        with open(f"{filename}.json", 'w') as f:
            json.dump(metadata, f, indent=2)

    def start(self):
        """Open the input stream and block, processing audio via
        audio_callback, until interrupted."""
        try:
            with sd.InputStream(
                channels=CHANNELS,
                samplerate=SAMPLE_RATE,
                blocksize=CHUNK_SIZE,
                callback=self.audio_callback
            ):
                print("Wake word detection started. Listening...")
                while True:
                    sd.sleep(1000)  # Sleep for 1 second

        except KeyboardInterrupt:
            print("\nStopping wake word detection...")
        except Exception as e:
            # Broad catch is acceptable at this top-level service boundary;
            # log so the container exit remains diagnosable.
            print(f"Error in audio processing: {e}")
|
||||
|
||||
def main() -> None:
    """Entry point: announce startup and run the blocking audio loop."""
    print("Initializing wake word detection...")
    processor = AudioProcessor()
    processor.start()


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user