feat(speech): enhance speech configuration and example integration

- Add comprehensive speech configuration in .env.example and app config
- Update Docker speech Dockerfile for more flexible model handling
- Create detailed README for speech-to-text examples
- Implement example script demonstrating speech features
- Improve speech service initialization and configuration management

Author: jango-blockchained
Date:   2025-02-04 19:35:50 +01:00
Parent: 60f18f8e71
Commit: 3a6f79c9a8
14 changed files with 669 additions and 86 deletions
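
Note: the .env.example and app-config changes mentioned above are not part of the
excerpt below. Going only by the variables the diff itself reads, the added speech
configuration plausibly includes at least the following two keys (a sketch using
the defaults found in the code, not the actual file contents):

    # Speech-to-text settings (sketch; defaults match the Dockerfile and script)
    ASR_MODEL=base.en
    ASR_MODEL_PATH=/models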

File: speech service Dockerfile

@@ -10,7 +10,7 @@ RUN apt-get update && apt-get install -y \
 # Install fast-whisper and its dependencies
 RUN pip install --no-cache-dir torch torchaudio --index-url https://download.pytorch.org/whl/cpu
-RUN pip install --no-cache-dir fast-whisper
+RUN pip install --no-cache-dir faster-whisper
 
 # Install wake word detection
 RUN pip install --no-cache-dir openwakeword pyaudio sounddevice
@@ -19,11 +19,13 @@ RUN pip install --no-cache-dir openwakeword pyaudio sounddevice
 RUN mkdir -p /models /audio
 
-# Download the base model by default
-RUN python -c "from faster_whisper import WhisperModel; WhisperModel.download_model('base.en', cache_dir='/models')"
+# The model will be downloaded automatically when first used
+ENV ASR_MODEL=base.en
+ENV ASR_MODEL_PATH=/models
 
-# Download OpenWakeWord models
-RUN mkdir -p /models/wake_word && \
-    python -c "import openwakeword; openwakeword.download_models(['hey_jarvis', 'ok_google', 'alexa'], '/models/wake_word')"
+# Create wake word model directory
+# Models will be downloaded automatically when first used
+RUN mkdir -p /models/wake_word
 
 WORKDIR /app
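
The net effect of this Dockerfile change is that no model is baked into the image
at build time; faster-whisper downloads whatever model ASR_MODEL names into
download_root on first use and reuses the cached copy afterwards. A minimal sketch
of that runtime behavior (values mirror the ENV defaults above):

    import os
    from faster_whisper import WhisperModel

    # First instantiation downloads the model into download_root if it is
    # missing; subsequent instantiations load it from the cache.
    model = WhisperModel(
        os.environ.get("ASR_MODEL", "base.en"),
        device="cpu",
        compute_type="int8",
        download_root=os.environ.get("ASR_MODEL_PATH", "/models"),
    )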

File: speech-to-text example script (Python)

@@ -7,6 +7,7 @@ import sounddevice as sd
 from openwakeword import Model
 from datetime import datetime
 import wave
+from faster_whisper import WhisperModel
 
 # Configuration
 SAMPLE_RATE = 16000
@@ -15,12 +16,29 @@ CHUNK_SIZE = 1024
 BUFFER_DURATION = 30  # seconds to keep in buffer
 DETECTION_THRESHOLD = 0.5
 
+# Wake word models to use
+WAKE_WORDS = ["hey_jarvis", "ok_google", "alexa"]
+
+# Initialize the ASR model
+asr_model = WhisperModel(
+    model_size_or_path=os.environ.get('ASR_MODEL', 'base.en'),
+    device="cpu",
+    compute_type="int8",
+    download_root=os.environ.get('ASR_MODEL_PATH', '/models')
+)
+
 class AudioProcessor:
     def __init__(self):
         # Initialize wake word detection model
         self.wake_word_model = Model(
-            wakeword_models=["hey_jarvis", "ok_google", "alexa"],
-            model_path=os.environ.get('WAKEWORD_MODEL_PATH', '/models/wake_word')
+            wakeword_models=WAKE_WORDS,  # load the models listed above
+            inference_framework="onnx"  # Use ONNX for better performance
         )
+        # Note: openwakeword.Model loads the requested models in its
+        # constructor; it has no add_model() method, so no separate
+        # pre-loading step is needed.
+
         self.audio_buffer = queue.Queue()
         self.recording = False
         self.buffer = np.zeros(SAMPLE_RATE * BUFFER_DURATION)
@@ -46,16 +64,16 @@ class AudioProcessor:
         prediction = self.wake_word_model.predict(audio_data)
 
         # Check if wake word detected
-        for wake_word, score in prediction.items():
-            if score > DETECTION_THRESHOLD:
-                print(f"Wake word detected: {wake_word} (confidence: {score:.2f})")
-                self.save_audio_segment()
+        for wake_word in WAKE_WORDS:
+            if prediction[wake_word] > DETECTION_THRESHOLD:
+                print(f"Wake word detected: {wake_word} (confidence: {prediction[wake_word]:.2f})")
+                # Pass the score along so it can be recorded in the metadata
+                self.save_audio_segment(wake_word, prediction[wake_word])
                 break
 
-    def save_audio_segment(self):
+    def save_audio_segment(self, wake_word, confidence):
         """Save the audio buffer when wake word is detected"""
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        filename = f"/audio/wake_word_{timestamp}.wav"
+        filename = f"/audio/wake_word_{wake_word}_{timestamp}.wav"
 
         # Save the audio buffer to a WAV file
         with wave.open(filename, 'wb') as wf:
@@ -68,28 +86,80 @@ class AudioProcessor:
             wf.writeframes(audio_data.tobytes())
 
         print(f"Saved audio segment to {filename}")
 
-        # Write metadata
-        metadata = {
-            "timestamp": timestamp,
-            "sample_rate": SAMPLE_RATE,
-            "channels": CHANNELS,
-            "duration": BUFFER_DURATION
-        }
-        with open(f"{filename}.json", 'w') as f:
-            json.dump(metadata, f, indent=2)
+        # Transcribe the audio
+        try:
+            segments, info = asr_model.transcribe(
+                filename,
+                language="en",
+                beam_size=5,
+                temperature=0
+            )
+            # transcribe() returns a lazy generator; materialize it so the
+            # segments can be iterated more than once below
+            segments = list(segments)
+
+            # Format the transcription result
+            result = {
+                "text": " ".join(segment.text for segment in segments),
+                "segments": [
+                    {
+                        "text": segment.text,
+                        "start": segment.start,
+                        "end": segment.end,
+                        # faster-whisper exposes no per-segment confidence
+                        # field; avg_logprob is the closest available measure
+                        "confidence": segment.avg_logprob
+                    }
+                    for segment in segments
+                ]
+            }
+
+            # Save metadata and transcription
+            metadata = {
+                "timestamp": timestamp,
+                "wake_word": wake_word,
+                "wake_word_confidence": float(confidence),
+                "sample_rate": SAMPLE_RATE,
+                "channels": CHANNELS,
+                "duration": BUFFER_DURATION,
+                "transcription": result
+            }
+            with open(f"{filename}.json", 'w') as f:
+                json.dump(metadata, f, indent=2)
+
+            print("\nTranscription result:")
+            print(f"Text: {result['text']}")
+            print("\nSegments:")
+            for segment in result["segments"]:
+                print(f"[{segment['start']:.2f}s - {segment['end']:.2f}s] (avg logprob: {segment['confidence']:.2f})")
+                print(f'"{segment["text"]}"')
+        except Exception as e:
+            print(f"Error during transcription: {e}")
+            metadata = {
+                "timestamp": timestamp,
+                "wake_word": wake_word,
+                "wake_word_confidence": float(confidence),
+                "sample_rate": SAMPLE_RATE,
+                "channels": CHANNELS,
+                "duration": BUFFER_DURATION,
+                "error": str(e)
+            }
+            with open(f"{filename}.json", 'w') as f:
+                json.dump(metadata, f, indent=2)
 
     def start(self):
         """Start audio processing"""
         try:
+            print("Initializing wake word detection...")
+            print(f"Loaded wake words: {', '.join(WAKE_WORDS)}")
             with sd.InputStream(
                 channels=CHANNELS,
                 samplerate=SAMPLE_RATE,
                 blocksize=CHUNK_SIZE,
                 callback=self.audio_callback
             ):
-                print("Wake word detection started. Listening...")
+                print("\nWake word detection started. Listening...")
                 print("Press Ctrl+C to stop")
                 while True:
                     sd.sleep(1000)  # Sleep for 1 second
@@ -99,6 +169,5 @@ class AudioProcessor:
             print(f"Error in audio processing: {e}")
 
 if __name__ == "__main__":
-    print("Initializing wake word detection...")
     processor = AudioProcessor()
     processor.start()
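
For reference, openwakeword's predict() returns a dict mapping each loaded model's
name to a score in [0, 1], which is why the detection loop above can index the
prediction by the names in WAKE_WORDS. A standalone sketch (silence should score
near zero):

    import numpy as np
    from openwakeword import Model

    model = Model(wakeword_models=["hey_jarvis"], inference_framework="onnx")
    frame = np.zeros(1280, dtype=np.int16)  # 80 ms of 16 kHz PCM audio
    print(model.predict(frame))             # e.g. {'hey_jarvis': 0.0}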