- Add comprehensive speech configuration in .env.example and app config
- Update Docker speech Dockerfile for more flexible model handling
- Create detailed README for speech-to-text examples
- Implement example script demonstrating speech features
- Improve speech service initialization and configuration management
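"""Wake word detection with speech-to-text transcription.

Listens on the default microphone, runs openwakeword on the incoming
audio, and, when a wake word is detected, saves the last BUFFER_DURATION
seconds of audio to /audio and transcribes them with faster-whisper.

Configuration comes from the environment; the values shown are the
defaults this script assumes, not a required layout:

    ASR_MODEL=base.en       # Whisper model size or local path
    ASR_MODEL_PATH=/models  # where ASR models are stored/downloaded
"""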
import os
import json
import queue
import threading
import wave
from datetime import datetime

import numpy as np
import sounddevice as sd
from faster_whisper import WhisperModel
from openwakeword import Model

# Configuration
SAMPLE_RATE = 16000
CHANNELS = 1
CHUNK_SIZE = 1024
BUFFER_DURATION = 30  # seconds of audio to keep in the rolling buffer
DETECTION_THRESHOLD = 0.5

# Wake word models to use
WAKE_WORDS = ["hey_jarvis", "ok_google", "alexa"]
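# Sizing note: the rolling buffer below holds SAMPLE_RATE * BUFFER_DURATION
# = 16000 * 30 = 480,000 float32 samples (~1.9 MB). openwakeword's docs
# suggest feeding it roughly 80 ms frames (1280 samples at 16 kHz);
# CHUNK_SIZE = 1024 is close, but adjust if detection quality suffers.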

# Initialize the ASR model
asr_model = WhisperModel(
    model_size_or_path=os.environ.get('ASR_MODEL', 'base.en'),
    device="cpu",
    compute_type="int8",
    download_root=os.environ.get('ASR_MODEL_PATH', '/models')
)
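# Note: if the requested model is not already present under ASR_MODEL_PATH,
# faster-whisper downloads it into download_root on first use, so the first
# run may take a while.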

class AudioProcessor:
    def __init__(self):
        # Initialize wake word detection. openwakeword loads models at
        # construction time via `wakeword_models` (there is no add_model()
        # method); this assumes the pre-trained models are available locally.
        self.wake_word_model = Model(
            wakeword_models=WAKE_WORDS,
            inference_framework="onnx"  # ONNX runtime for better CPU performance
        )

        self.audio_buffer = queue.Queue()
        self.recording = False
        # Rolling buffer holding the last BUFFER_DURATION seconds of audio
        self.buffer = np.zeros(SAMPLE_RATE * BUFFER_DURATION, dtype=np.float32)
        self.buffer_lock = threading.Lock()

    def audio_callback(self, indata, frames, time, status):
        """Callback for audio input"""
        if status:
            print(f"Audio callback status: {status}")

        # Convert to mono if necessary
        if CHANNELS > 1:
            audio_data = np.mean(indata, axis=1)
        else:
            audio_data = indata.flatten()

        # Update the circular buffer
        with self.buffer_lock:
            self.buffer = np.roll(self.buffer, -len(audio_data))
            self.buffer[-len(audio_data):] = audio_data

        # openwakeword expects 16-bit PCM frames, while sounddevice delivers
        # float32 in [-1, 1], so convert before prediction.
        frame = (audio_data * 32767).astype(np.int16)
        prediction = self.wake_word_model.predict(frame)

        # Check whether a wake word was detected
        for wake_word in WAKE_WORDS:
            confidence = prediction[wake_word]
            if confidence > DETECTION_THRESHOLD:
                print(f"Wake word detected: {wake_word} (confidence: {confidence:.2f})")
                # The confidence is passed along because `prediction` is local
                # to this callback and not visible in save_audio_segment.
                # Note: transcription runs synchronously here and will block
                # audio capture; offload to a worker thread if that matters.
                self.save_audio_segment(wake_word, confidence)
                break

    def save_audio_segment(self, wake_word, confidence):
        """Save the audio buffer when a wake word is detected"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"/audio/wake_word_{wake_word}_{timestamp}.wav"

        # Snapshot the buffer under the lock so the audio callback cannot
        # modify it mid-write, then convert float32 [-1, 1] to int16 PCM.
        with self.buffer_lock:
            audio_data = (self.buffer * 32767).astype(np.int16)

        # Save the audio buffer to a WAV file
        with wave.open(filename, 'wb') as wf:
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(2)  # 16-bit audio
            wf.setframerate(SAMPLE_RATE)
            wf.writeframes(audio_data.tobytes())

        print(f"Saved audio segment to {filename}")

        # Transcribe the audio
        try:
            segments, info = asr_model.transcribe(
                filename,
                language="en",
                beam_size=5,
                temperature=0
            )
            # faster-whisper returns a lazy generator; materialize it so the
            # segments can be iterated more than once below.
            segments = list(segments)

            # Format the transcription result. Segments expose avg_logprob
            # rather than a direct confidence attribute.
            result = {
                "text": " ".join(segment.text.strip() for segment in segments),
                "segments": [
                    {
                        "text": segment.text,
                        "start": segment.start,
                        "end": segment.end,
                        "avg_logprob": segment.avg_logprob
                    }
                    for segment in segments
                ]
            }

            # Save metadata and the transcription alongside the audio
            metadata = {
                "timestamp": timestamp,
                "wake_word": wake_word,
                "wake_word_confidence": float(confidence),
                "sample_rate": SAMPLE_RATE,
                "channels": CHANNELS,
                "duration": BUFFER_DURATION,
                "transcription": result
            }

            with open(f"{filename}.json", 'w') as f:
                json.dump(metadata, f, indent=2)

            print("\nTranscription result:")
            print(f"Text: {result['text']}")
            print("\nSegments:")
            for segment in result["segments"]:
                print(f"[{segment['start']:.2f}s - {segment['end']:.2f}s] "
                      f"(avg logprob: {segment['avg_logprob']:.2f})")
                print(f'"{segment["text"]}"')

        except Exception as e:
            print(f"Error during transcription: {e}")
            metadata = {
                "timestamp": timestamp,
                "wake_word": wake_word,
                "wake_word_confidence": float(confidence),
                "sample_rate": SAMPLE_RATE,
                "channels": CHANNELS,
                "duration": BUFFER_DURATION,
                "error": str(e)
            }
            with open(f"{filename}.json", 'w') as f:
                json.dump(metadata, f, indent=2)
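
    # For reference, a successful run writes <wav>.json shaped roughly like
    # the following (values illustrative, not actual output):
    #
    # {
    #   "timestamp": "20240101_120000",
    #   "wake_word": "hey_jarvis",
    #   "wake_word_confidence": 0.91,
    #   "sample_rate": 16000,
    #   "channels": 1,
    #   "duration": 30,
    #   "transcription": {"text": "...", "segments": [...]}
    # }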

    def start(self):
        """Start audio processing"""
        try:
            print("Initializing wake word detection...")
            print(f"Loaded wake words: {', '.join(WAKE_WORDS)}")

            with sd.InputStream(
                channels=CHANNELS,
                samplerate=SAMPLE_RATE,
                blocksize=CHUNK_SIZE,
                callback=self.audio_callback
            ):
                print("\nWake word detection started. Listening...")
                print("Press Ctrl+C to stop")

                while True:
                    sd.sleep(1000)  # Sleep for 1 second

        except KeyboardInterrupt:
            print("\nStopping wake word detection...")
        except Exception as e:
            print(f"Error in audio processing: {e}")

if __name__ == "__main__":
    processor = AudioProcessor()
    processor.start()