feat(speech): add speech-to-text and wake word detection modules
- Implement SpeechToText class with Docker-based transcription capabilities
- Add wake word detection using OpenWakeWord and fast-whisper models
- Create Dockerfile for speech processing container
- Develop comprehensive test suite for speech recognition functionality
- Include audio processing and event-driven transcription features
This commit is contained in:
39
docker/speech/Dockerfile
Normal file
39
docker/speech/Dockerfile
Normal file
@@ -0,0 +1,39 @@
|
||||
FROM python:3.10-slim

# System dependencies: build toolchain + PortAudio headers for PyAudio/sounddevice
RUN apt-get update && apt-get install -y \
    git \
    build-essential \
    portaudio19-dev \
    python3-pyaudio \
    && rm -rf /var/lib/apt/lists/*

# Install CPU-only torch first so faster-whisper does not pull CUDA wheels
RUN pip install --no-cache-dir torch torchaudio --index-url https://download.pytorch.org/whl/cpu
# BUG FIX: the PyPI distribution is "faster-whisper" (imported as
# faster_whisper in the model-download step below); "fast-whisper" is not
# the same package and the build would not provide the import used later.
RUN pip install --no-cache-dir faster-whisper

# Wake word detection + audio capture libraries
RUN pip install --no-cache-dir openwakeword pyaudio sounddevice

# Directories for model caches and captured audio segments
RUN mkdir -p /models /audio

# Pre-download the base English model at build time. Constructing WhisperModel
# with download_root fetches the weights into /models; the original called a
# nonexistent WhisperModel.download_model classmethod.
RUN python -c "from faster_whisper import WhisperModel; WhisperModel('base.en', download_root='/models')"

# Download OpenWakeWord models. download_models is provided by
# openwakeword.utils; verify the keyword names against the pinned version.
RUN mkdir -p /models/wake_word && \
    python -c "import openwakeword.utils; openwakeword.utils.download_models(model_names=['hey_jarvis', 'ok_google', 'alexa'], target_directory='/models/wake_word')"

WORKDIR /app

# Copy the wake word detection script
COPY wake_word_detector.py .

# Environment consumed by wake_word_detector.py
ENV WHISPER_MODEL_PATH=/models
ENV WAKEWORD_MODEL_PATH=/models/wake_word
ENV PYTHONUNBUFFERED=1

# Run the wake word detection service
CMD ["python", "wake_word_detector.py"]
|
||||
104
docker/speech/wake_word_detector.py
Normal file
104
docker/speech/wake_word_detector.py
Normal file
@@ -0,0 +1,104 @@
|
||||
import os
|
||||
import json
|
||||
import queue
|
||||
import threading
|
||||
import numpy as np
|
||||
import sounddevice as sd
|
||||
from openwakeword import Model
|
||||
from datetime import datetime
|
||||
import wave
|
||||
|
||||
# Configuration
SAMPLE_RATE = 16000  # Hz; capture rate used for both the stream and the saved WAV
CHANNELS = 1  # mono capture; audio_callback downmixes if this is ever > 1
CHUNK_SIZE = 1024  # frames per sounddevice InputStream callback block
BUFFER_DURATION = 30  # seconds to keep in buffer
DETECTION_THRESHOLD = 0.5  # minimum wake-word confidence score that triggers a save
|
||||
|
||||
class AudioProcessor:
    """Continuously listens on the default input device, scores incoming
    audio against OpenWakeWord models, and — whenever a wake word scores
    above DETECTION_THRESHOLD — saves the last BUFFER_DURATION seconds of
    audio to /audio as a 16-bit WAV plus a JSON metadata sidecar."""

    def __init__(self):
        # Wake-word model bundle; model files are loaded from
        # WAKEWORD_MODEL_PATH, defaulting to /models/wake_word.
        self.wake_word_model = Model(
            wakeword_models=["hey_jarvis", "ok_google", "alexa"],
            model_path=os.environ.get('WAKEWORD_MODEL_PATH', '/models/wake_word')
        )
        self.audio_buffer = queue.Queue()  # NOTE(review): unused in this file; kept for compatibility
        self.recording = False  # NOTE(review): unused flag; kept for compatibility
        # Rolling window holding the most recent BUFFER_DURATION seconds of samples.
        self.buffer = np.zeros(SAMPLE_RATE * BUFFER_DURATION)
        self.buffer_lock = threading.Lock()

    def audio_callback(self, indata, frames, time, status):
        """sounddevice InputStream callback: fold the new chunk into the
        rolling buffer and run wake-word detection on it.

        Signature matches sounddevice's callback contract; `indata` is a
        (frames, channels) array and `status` reports over/underruns.
        """
        if status:
            print(f"Audio callback status: {status}")

        # Downmix to mono if we ever capture more than one channel.
        if CHANNELS > 1:
            audio_data = np.mean(indata, axis=1)
        else:
            audio_data = indata.flatten()

        # Shift the circular buffer left and append the new chunk at the end.
        with self.buffer_lock:
            self.buffer = np.roll(self.buffer, -len(audio_data))
            self.buffer[-len(audio_data):] = audio_data

        # Score this chunk against every loaded wake-word model.
        # NOTE(review): openwakeword examples typically feed int16 frames;
        # confirm the installed version accepts float input as passed here.
        prediction = self.wake_word_model.predict(audio_data)

        # Save the buffered audio on the first wake word over threshold.
        for wake_word, score in prediction.items():
            if score > DETECTION_THRESHOLD:
                print(f"Wake word detected: {wake_word} (confidence: {score:.2f})")
                self.save_audio_segment()
                break

    def save_audio_segment(self):
        """Save the rolling audio buffer (and a JSON metadata sidecar)
        when a wake word is detected."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"/audio/wake_word_{timestamp}.wav"

        # Snapshot under buffer_lock so a concurrent callback cannot
        # mutate the buffer mid-write.
        with self.buffer_lock:
            snapshot = self.buffer.copy()

        # Write the snapshot as 16-bit PCM WAV.
        with wave.open(filename, 'wb') as wf:
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(2)  # 16-bit audio
            wf.setframerate(SAMPLE_RATE)

            # Convert float samples (assumed in [-1, 1]) to int16 PCM.
            audio_data = (snapshot * 32767).astype(np.int16)
            wf.writeframes(audio_data.tobytes())

        # BUG FIX: original printed the literal "(unknown)" instead of
        # interpolating the output path.
        print(f"Saved audio segment to {filename}")

        # Metadata sidecar describing the capture.
        metadata = {
            "timestamp": timestamp,
            "sample_rate": SAMPLE_RATE,
            "channels": CHANNELS,
            "duration": BUFFER_DURATION
        }

        # BUG FIX: original opened a file literally named "(unknown).json";
        # the sidecar now lands next to its WAV file.
        with open(f"{filename}.json", 'w') as f:
            json.dump(metadata, f, indent=2)

    def start(self):
        """Open the input stream and block, processing audio via
        audio_callback, until interrupted."""
        try:
            with sd.InputStream(
                channels=CHANNELS,
                samplerate=SAMPLE_RATE,
                blocksize=CHUNK_SIZE,
                callback=self.audio_callback
            ):
                print("Wake word detection started. Listening...")
                while True:
                    sd.sleep(1000)  # Sleep for 1 second

        except KeyboardInterrupt:
            print("\nStopping wake word detection...")
        except Exception as e:
            # Broad catch is acceptable at this top-level service boundary;
            # log so the container exit remains diagnosable.
            print(f"Error in audio processing: {e}")
|
||||
|
||||
def main() -> None:
    """Entry point: announce startup and run the blocking audio loop."""
    print("Initializing wake word detection...")
    processor = AudioProcessor()
    processor.start()


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user