homeassistant-mcp/docker/speech/wake_word_detector.py
jango-blockchained 3a6f79c9a8 feat(speech): enhance speech configuration and example integration
- Add comprehensive speech configuration in .env.example and app config
- Update Docker speech Dockerfile for more flexible model handling
- Create detailed README for speech-to-text examples
- Implement example script demonstrating speech features
- Improve speech service initialization and configuration management
2025-02-04 19:35:50 +01:00
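
The script's ASR behavior is driven by environment variables (matching the os.environ lookups in the code below); a minimal illustrative sketch, with example values rather than the project's actual .env.example contents:

ASR_MODEL=base.en        # faster-whisper model size or local path
ASR_MODEL_PATH=/models   # download/cache directory for model files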


import os
import json
import queue
import threading
import wave
from datetime import datetime

import numpy as np
import sounddevice as sd
from openwakeword import Model
from faster_whisper import WhisperModel

# Configuration
SAMPLE_RATE = 16000
CHANNELS = 1
CHUNK_SIZE = 1024
BUFFER_DURATION = 30  # seconds of audio to keep in the circular buffer
DETECTION_THRESHOLD = 0.5

# Wake word models to use
WAKE_WORDS = ["hey_jarvis", "ok_google", "alexa"]

# Initialize the ASR model once at module load
asr_model = WhisperModel(
    model_size_or_path=os.environ.get('ASR_MODEL', 'base.en'),
    device="cpu",
    compute_type="int8",
    download_root=os.environ.get('ASR_MODEL_PATH', '/models')
)


class AudioProcessor:
    def __init__(self):
        # Load the requested pre-trained wake word models by name;
        # openwakeword's Model accepts them via wakeword_models
        self.wake_word_model = Model(
            wakeword_models=WAKE_WORDS,
            inference_framework="onnx"  # ONNX runtime for better CPU performance
        )
        self.audio_buffer = queue.Queue()
        self.recording = False
        # Circular buffer holding the last BUFFER_DURATION seconds of audio
        self.buffer = np.zeros(SAMPLE_RATE * BUFFER_DURATION, dtype=np.float32)
        self.buffer_lock = threading.Lock()

    def audio_callback(self, indata, frames, time, status):
        """Callback invoked by sounddevice for each block of audio input."""
        if status:
            print(f"Audio callback status: {status}")
        # Down-mix to mono if necessary
        if CHANNELS > 1:
            audio_data = np.mean(indata, axis=1)
        else:
            audio_data = indata.flatten()
        # Update the circular buffer with the newest samples
        with self.buffer_lock:
            self.buffer = np.roll(self.buffer, -len(audio_data))
            self.buffer[-len(audio_data):] = audio_data
        # openwakeword expects 16-bit PCM samples, so convert from float32
        pcm16 = (audio_data * 32767).astype(np.int16)
        prediction = self.wake_word_model.predict(pcm16)
        # Check each loaded model's score against the detection threshold;
        # iterate the prediction dict since its keys may carry version suffixes
        for wake_word, score in prediction.items():
            if score > DETECTION_THRESHOLD:
                print(f"Wake word detected: {wake_word} (confidence: {score:.2f})")
                # Note: this runs synchronously and will stall the input
                # stream while the segment is written and transcribed
                self.save_audio_segment(wake_word, score)
                break

    def save_audio_segment(self, wake_word, confidence):
        """Save the buffered audio and transcribe it after a wake word fires."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"/audio/wake_word_{wake_word}_{timestamp}.wav"
        # Snapshot the buffer under the lock, converting float32 to int16
        with self.buffer_lock:
            audio_data = (self.buffer * 32767).astype(np.int16)
        # Save the audio buffer to a WAV file
        with wave.open(filename, 'wb') as wf:
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(2)  # 16-bit audio
            wf.setframerate(SAMPLE_RATE)
            wf.writeframes(audio_data.tobytes())
        print(f"Saved audio segment to {filename}")
        # Transcribe the audio
        try:
            segments, info = asr_model.transcribe(
                filename,
                language="en",
                beam_size=5,
                temperature=0
            )
            # transcribe() returns a lazy generator; materialize it so the
            # segments can be iterated more than once below
            segments = list(segments)
            # Format the transcription result; faster-whisper exposes
            # avg_logprob per segment, which we exponentiate here as a
            # rough per-segment confidence estimate
            result = {
                "text": " ".join(segment.text for segment in segments),
                "segments": [
                    {
                        "text": segment.text,
                        "start": segment.start,
                        "end": segment.end,
                        "confidence": float(np.exp(segment.avg_logprob))
                    }
                    for segment in segments
                ]
            }
            # Save metadata and transcription alongside the audio
            metadata = {
                "timestamp": timestamp,
                "wake_word": wake_word,
                "wake_word_confidence": float(confidence),
                "sample_rate": SAMPLE_RATE,
                "channels": CHANNELS,
                "duration": BUFFER_DURATION,
                "transcription": result
            }
            with open(f"{filename}.json", 'w') as f:
                json.dump(metadata, f, indent=2)
            print("\nTranscription result:")
            print(f"Text: {result['text']}")
            print("\nSegments:")
            for segment in result["segments"]:
                print(f"[{segment['start']:.2f}s - {segment['end']:.2f}s] ({segment['confidence']:.2%})")
                print(f'"{segment["text"]}"')
        except Exception as e:
            print(f"Error during transcription: {e}")
            metadata = {
                "timestamp": timestamp,
                "wake_word": wake_word,
                "wake_word_confidence": float(confidence),
                "sample_rate": SAMPLE_RATE,
                "channels": CHANNELS,
                "duration": BUFFER_DURATION,
                "error": str(e)
            }
            with open(f"{filename}.json", 'w') as f:
                json.dump(metadata, f, indent=2)

    def start(self):
        """Start the audio input stream and block until interrupted."""
        try:
            print("Initializing wake word detection...")
            print(f"Loaded wake words: {', '.join(WAKE_WORDS)}")
            with sd.InputStream(
                channels=CHANNELS,
                samplerate=SAMPLE_RATE,
                blocksize=CHUNK_SIZE,
                callback=self.audio_callback
            ):
                print("\nWake word detection started. Listening...")
                print("Press Ctrl+C to stop")
                while True:
                    sd.sleep(1000)  # Sleep for 1 second
        except KeyboardInterrupt:
            print("\nStopping wake word detection...")
        except Exception as e:
            print(f"Error in audio processing: {e}")


if __name__ == "__main__":
    processor = AudioProcessor()
    processor.start()
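
For reference, the transcription path can be exercised on its own, assuming a previously recorded WAV exists (the file path here is hypothetical):

from wake_word_detector import asr_model

segments, _ = asr_model.transcribe("/audio/sample.wav", language="en")
print(" ".join(segment.text for segment in segments))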