feat(speech): enhance speech processing with advanced audio setup and detection
- Add audio setup script for PulseAudio configuration
- Improve wake word detection with advanced noise filtering
- Implement continuous transcription and command processing
- Update speech Dockerfile with additional audio dependencies
- Enhance logging and error handling in wake word detector
@@ -53,6 +53,9 @@ COPY --from=builder --chown=bunjs:nodejs /app/dist ./dist
 COPY --from=builder --chown=bunjs:nodejs /app/node_modules ./node_modules
 COPY --chown=bunjs:nodejs package.json ./
 
+# Create logs directory with proper permissions
+RUN mkdir -p /app/logs && chown -R bunjs:nodejs /app/logs
+
 # Switch to non-root user
 USER bunjs
 
@@ -13,9 +13,10 @@ RUN python -m venv /opt/venv
 ENV PATH="/opt/venv/bin:$PATH"
 
 # Install Python dependencies with specific versions and CPU-only variants
-RUN pip install --no-cache-dir numpy==1.24.3
-RUN pip install --no-cache-dir torch==2.1.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cpu
-RUN pip install --no-cache-dir faster-whisper==0.10.0 openwakeword==0.4.0 pyaudio==0.2.14 sounddevice==0.4.6
+RUN pip install --no-cache-dir "numpy>=1.24.3,<2.0.0" && \
+    pip install --no-cache-dir torch==2.1.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cpu && \
+    pip install --no-cache-dir faster-whisper==0.10.0 openwakeword==0.4.0 pyaudio==0.2.14 sounddevice==0.4.6 requests==2.31.0 && \
+    pip freeze > /opt/venv/requirements.txt
 
 # Create final image
 FROM python:3.10-slim
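
Note: the relaxed NumPy pin (`>=1.24.3,<2.0.0`) keeps the image on the NumPy 1.x ABI; the pinned torch 2.1.2 CPU wheels predate NumPy 2.0 and, as far as we can tell, are not binary compatible with it. Chaining the installs into one RUN also yields a single layer plus a `pip freeze` manifest at /opt/venv/requirements.txt. A minimal sanity check, assuming it is run inside the built image:

    # verify_pins.py -- hypothetical smoke test for the built image
    import numpy, torch, faster_whisper  # importing is already half the test
    assert numpy.__version__.startswith("1."), "NumPy 2.x would break torch 2.1.2"
    print("numpy", numpy.__version__, "| torch", torch.__version__)
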
@@ -24,10 +25,14 @@ FROM python:3.10-slim
 COPY --from=builder /opt/venv /opt/venv
 ENV PATH="/opt/venv/bin:$PATH"
 
-# Install only runtime dependencies
+# Install audio dependencies
 RUN apt-get update && apt-get install -y \
     portaudio19-dev \
     python3-pyaudio \
+    alsa-utils \
+    libasound2 \
+    libasound2-plugins \
+    pulseaudio \
     && rm -rf /var/lib/apt/lists/*
 
 # Create necessary directories
@@ -55,5 +60,9 @@ ENV PYTHONMALLOC=malloc \
 HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
     CMD ps aux | grep '[p]ython' || exit 1
 
-# Run the wake word detection service with resource constraints
-CMD ["python", "-X", "faulthandler", "wake_word_detector.py"]
+# Copy audio setup script
+COPY setup-audio.sh /setup-audio.sh
+RUN chmod +x /setup-audio.sh
+
+# Start command
+CMD ["/bin/bash", "-c", "/setup-audio.sh && python -u wake_word_detector.py"]
docker/speech/setup-audio.sh (new executable file, 16 lines)
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+# Wait for PulseAudio to be ready
+sleep 2
+
+# Mute the monitor to prevent feedback
+pactl set-source-mute alsa_output.pci-0000_00_1b.0.analog-stereo.monitor 1
+
+# Set microphone sensitivity to 65%
+pactl set-source-volume alsa_input.pci-0000_00_1b.0.analog-stereo 65%
+
+# Set speaker volume to 40%
+pactl set-sink-volume alsa_output.pci-0000_00_1b.0.analog-stereo 40%
+
+# Make the script executable
+chmod +x /setup-audio.sh
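
Note: the device names baked into setup-audio.sh (`alsa_output.pci-0000_00_1b.0.analog-stereo` and its `.monitor` source) are specific to one host's sound card; on other hardware the `pactl` calls will simply fail (the script has no `set -e`, so startup continues regardless). The final `chmod +x /setup-audio.sh` is also redundant at runtime, since the Dockerfile already marks the script executable. A small sketch, assuming `pactl` is on PATH inside the container, for discovering the names to substitute:

    import subprocess

    # Print PulseAudio sources (inputs/monitors) and sinks (outputs).
    # The second column of each row is the device name the script expects.
    for kind in ("sources", "sinks"):
        out = subprocess.run(["pactl", "list", "short", kind],
                             capture_output=True, text=True, check=True).stdout
        print(f"--- {kind} ---\n{out}")
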
@@ -8,45 +8,274 @@ from openwakeword import Model
 from datetime import datetime
 import wave
 from faster_whisper import WhisperModel
+import requests
+import logging
+import time
+
+# Set up logging
+logging.basicConfig(
+    level=logging.DEBUG,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
 # Configuration
 SAMPLE_RATE = 16000
 CHANNELS = 1
 CHUNK_SIZE = 1024
-BUFFER_DURATION = 30  # seconds to keep in buffer
+BUFFER_DURATION = 10  # seconds to keep in buffer
 DETECTION_THRESHOLD = 0.5
+CONTINUOUS_TRANSCRIPTION_INTERVAL = 3  # seconds between transcriptions
+MAX_MODEL_LOAD_RETRIES = 3
+MODEL_LOAD_RETRY_DELAY = 5  # seconds
+MODEL_DOWNLOAD_TIMEOUT = 600  # 10 minutes timeout for model download
 
-# Wake word models to use
-WAKE_WORDS = ["hey_jarvis", "ok_google", "alexa"]
+# Audio processing parameters
+NOISE_THRESHOLD = 0.08  # Increased threshold for better noise filtering
+MIN_SPEECH_DURATION = 2.0  # Longer minimum duration to avoid fragments
+SILENCE_DURATION = 1.0  # Longer silence duration
+MAX_REPETITIONS = 1  # More aggressive repetition filtering
+ECHO_THRESHOLD = 0.75  # More sensitive echo detection
+MIN_SEGMENT_DURATION = 1.0  # Longer minimum segment duration
+FEEDBACK_WINDOW = 5  # Window size for feedback detection in seconds
 
-# Initialize the ASR model
-asr_model = WhisperModel(
-    model_size_or_path=os.environ.get('ASR_MODEL', 'base.en'),
-    device="cpu",
-    compute_type="int8",
-    download_root=os.environ.get('ASR_MODEL_PATH', '/models')
-)
+# Feature flags from environment
+WAKE_WORD_ENABLED = os.environ.get('ENABLE_WAKE_WORD', 'false').lower() == 'true'
+SPEECH_ENABLED = os.environ.get('ENABLE_SPEECH_FEATURES', 'true').lower() == 'true'
+
+# Wake word models to use (only if wake word is enabled)
+WAKE_WORDS = ["alexa"]  # Using 'alexa' as temporary replacement for 'gaja'
+WAKE_WORD_ALIAS = "gaja"  # What we print when wake word is detected
+
+# Home Assistant Configuration
+HASS_HOST = os.environ.get('HASS_HOST', 'http://homeassistant.local:8123')
+HASS_TOKEN = os.environ.get('HASS_TOKEN')
+
+def initialize_asr_model():
+    """Initialize the ASR model with retries and timeout"""
+    model_path = os.environ.get('ASR_MODEL_PATH', '/models')
+    model_name = os.environ.get('ASR_MODEL', 'large-v3')
+
+    start_time = time.time()
+    for attempt in range(MAX_MODEL_LOAD_RETRIES):
+        try:
+            if time.time() - start_time > MODEL_DOWNLOAD_TIMEOUT:
+                logger.error("Model download timeout exceeded")
+                raise TimeoutError("Model download took too long")
+
+            logger.info(f"Loading ASR model (attempt {attempt + 1}/{MAX_MODEL_LOAD_RETRIES})")
+            model = WhisperModel(
+                model_size_or_path=model_name,
+                device="cpu",
+                compute_type="int8",
+                download_root=model_path,
+                num_workers=1  # Reduce concurrent downloads
+            )
+            logger.info("ASR model loaded successfully")
+            return model
+        except Exception as e:
+            logger.error(f"Failed to load ASR model (attempt {attempt + 1}): {e}")
+            if attempt < MAX_MODEL_LOAD_RETRIES - 1:
+                logger.info(f"Retrying in {MODEL_LOAD_RETRY_DELAY} seconds...")
+                time.sleep(MODEL_LOAD_RETRY_DELAY)
+            else:
+                logger.error("Failed to load ASR model after all retries")
+                raise
+
+# Initialize the ASR model with retries
+try:
+    asr_model = initialize_asr_model()
+except Exception as e:
+    logger.error(f"Critical error initializing ASR model: {e}")
+    raise
+
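Note: the default ASR model is now `large-v3` (was `base.en`), a multi-gigabyte download on first start, which is what the retry loop and 10-minute timeout guard against. The timeout is only checked between attempts, though, so a download that hangs mid-attempt is not interrupted. A sketch of warming the cache ahead of time so the service never hits that path, assuming the same /models volume is available when it runs:

    # warm_model.py -- hypothetical one-off script; pre-downloads the
    # weights into the cache that initialize_asr_model() reads.
    from faster_whisper import WhisperModel

    WhisperModel("large-v3", device="cpu", compute_type="int8",
                 download_root="/models")
    print("large-v3 cached under /models")
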
+def send_command_to_hass(domain, service, entity_id):
+    """Send command to Home Assistant"""
+    if not HASS_TOKEN:
+        logger.error("Error: HASS_TOKEN not set")
+        return False
+
+    headers = {
+        "Authorization": f"Bearer {HASS_TOKEN}",
+        "Content-Type": "application/json",
+    }
+
+    url = f"{HASS_HOST}/api/services/{domain}/{service}"
+    data = {"entity_id": entity_id}
+
+    try:
+        response = requests.post(url, headers=headers, json=data)
+        response.raise_for_status()
+        logger.info(f"Command sent: {domain}.{service} for {entity_id}")
+        return True
+    except Exception as e:
+        logger.error(f"Error sending command to Home Assistant: {e}")
+        return False
+
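Note: this helper maps onto Home Assistant's REST endpoint `POST /api/services/<domain>/<service>`, authorized with a long-lived access token. A hypothetical smoke test (placeholder entity ID; HASS_HOST and HASS_TOKEN must be set in the environment before the module is imported):

    if send_command_to_hass("light", "turn_on", "light.living_room"):
        print("light.living_room switched on")

    # Equivalent raw request, for debugging without this module:
    #   POST {HASS_HOST}/api/services/light/turn_on
    #   Authorization: Bearer <token>
    #   {"entity_id": "light.living_room"}

One refinement worth considering: `requests.post` without a `timeout` can block the audio pipeline indefinitely if Home Assistant is unreachable; passing `timeout=5` would bound it.
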
+def is_speech(audio_data, threshold=NOISE_THRESHOLD):
+    """Detect if audio segment contains speech based on amplitude and frequency content"""
+    # Calculate RMS amplitude
+    rms = np.sqrt(np.mean(np.square(audio_data)))
+
+    # Calculate signal energy in speech frequency range (100-4000 Hz)
+    fft = np.fft.fft(audio_data)
+    freqs = np.fft.fftfreq(len(audio_data), 1/SAMPLE_RATE)
+    speech_mask = (np.abs(freqs) >= 100) & (np.abs(freqs) <= 4000)
+    speech_energy = np.sum(np.abs(fft[speech_mask])) / len(audio_data)
+
+    # Enhanced echo detection
+    # 1. Check for periodic patterns in the signal
+    autocorr = np.correlate(audio_data, audio_data, mode='full')
+    autocorr = autocorr[len(autocorr)//2:]  # Use only positive lags
+    peaks = np.where(autocorr > ECHO_THRESHOLD * np.max(autocorr))[0]
+    peak_spacing = np.diff(peaks)
+    has_periodic_echo = len(peak_spacing) > 2 and np.std(peak_spacing) < 0.1 * np.mean(peak_spacing)
+
+    # 2. Check for sudden amplitude changes
+    amplitude_envelope = np.abs(audio_data)
+    amplitude_changes = np.diff(amplitude_envelope)
+    has_feedback_spikes = np.any(np.abs(amplitude_changes) > threshold * 2)
+
+    # 3. Check frequency distribution
+    freq_magnitudes = np.abs(fft)[:len(fft)//2]
+    peak_freqs = freqs[:len(fft)//2][np.argsort(freq_magnitudes)[-3:]]
+    has_feedback_freqs = np.any((peak_freqs > 2000) & (peak_freqs < 4000))
+
+    # Combine all criteria
+    is_valid_speech = (
+        rms > threshold and
+        speech_energy > threshold and
+        not has_periodic_echo and
+        not has_feedback_spikes and
+        not has_feedback_freqs
+    )
+
+    return is_valid_speech
+
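Note: since `is_speech` combines several heuristics, it is easy to probe with synthetic signals. A rough sketch, run in this module's context so the constants resolve (exact outcomes depend on the thresholds above):

    import numpy as np

    rng = np.random.default_rng(0)
    quiet = 0.01 * rng.standard_normal(CHUNK_SIZE)  # well below NOISE_THRESHOLD
    loud = 0.50 * rng.standard_normal(CHUNK_SIZE)   # loud but jumpy

    print(is_speech(quiet))  # False: RMS stays under the 0.08 threshold
    print(is_speech(loud))   # False: sample-to-sample jumps trip the
                             # feedback-spike check (> threshold * 2)

Real speech sits between these extremes: enough in-band energy to clear the thresholds, without the sharp amplitude discontinuities characteristic of feedback.
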
+def process_command(text):
+    """Process the transcribed command and execute appropriate action"""
+    text = text.lower().strip()
+
+    # Skip if text is too short or contains numbers (likely noise)
+    if len(text) < 5 or any(char.isdigit() for char in text):
+        logger.debug("Text too short or contains numbers, skipping")
+        return
+
+    # Enhanced noise pattern detection
+    noise_patterns = ["lei", "los", "und", "aber", "nicht mehr", "das das", "und und"]
+    for pattern in noise_patterns:
+        if text.count(pattern) > 1:  # More aggressive pattern filtering
+            logger.debug(f"Detected noise pattern '{pattern}', skipping")
+            return
+
+    # More aggressive repetition detection
+    words = text.split()
+    if len(words) >= 2:
+        # Check for immediate word repetitions
+        for i in range(len(words)-1):
+            if words[i] == words[i+1]:
+                logger.debug(f"Detected immediate word repetition: '{words[i]}', skipping")
+                return
+
+        # Check for phrase repetitions
+        phrases = [' '.join(words[i:i+2]) for i in range(len(words)-1)]
+        phrase_counts = {}
+        for phrase in phrases:
+            phrase_counts[phrase] = phrase_counts.get(phrase, 0) + 1
+            if phrase_counts[phrase] > MAX_REPETITIONS:
+                logger.debug(f"Skipping due to excessive repetition: '{phrase}'")
+                return
+
+    # German command mappings
+    commands = {
+        "ausschalten": "turn_off",
+        "einschalten": "turn_on",
+        "an": "turn_on",
+        "aus": "turn_off"
+    }
+
+    rooms = {
+        "wohnzimmer": "living_room",
+        "küche": "kitchen",
+        "schlafzimmer": "bedroom",
+        "bad": "bathroom"
+    }
+
+    # Detect room
+    detected_room = None
+    for german_room, english_room in rooms.items():
+        if german_room in text:
+            detected_room = english_room
+            break
+
+    # Detect command
+    detected_command = None
+    for german_cmd, english_cmd in commands.items():
+        if german_cmd in text:
+            detected_command = english_cmd
+            break
+
+    if detected_room and detected_command:
+        # Construct entity ID (assuming light)
+        entity_id = f"light.{detected_room}"
+
+        # Send command to Home Assistant
+        if send_command_to_hass("light", detected_command, entity_id):
+            logger.info(f"Executed: {detected_command} for {entity_id}")
+        else:
+            logger.error("Failed to execute command")
+    else:
+        logger.debug(f"No command found in text: '{text}'")
+
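Note: room and command matching is plain substring search over the lower-cased transcript, so word order is irrelevant; the bare "an"/"aus" entries will also match inside longer words, which the noise filters only partially compensate for. A hypothetical trace:

    process_command("Licht im Wohnzimmer einschalten")
    # -> detected_room = "living_room", detected_command = "turn_on"
    # -> send_command_to_hass("light", "turn_on", "light.living_room")

    process_command("aus")  # under 5 characters: skipped as noise
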
 class AudioProcessor:
     def __init__(self):
-        # Initialize wake word detection model
-        self.wake_word_model = Model(
-            inference_framework="onnx"  # Use ONNX for better performance
-        )
-
-        # Pre-load the wake word models
-        for wake_word in WAKE_WORDS:
-            self.wake_word_model.add_model(wake_word)
-
+        logger.info("Initializing AudioProcessor...")
         self.audio_buffer = queue.Queue()
         self.recording = False
         self.buffer = np.zeros(SAMPLE_RATE * BUFFER_DURATION)
         self.buffer_lock = threading.Lock()
+        self.last_transcription_time = 0
+        self.stream = None
+        self.speech_detected = False
+        self.silence_frames = 0
+        self.speech_frames = 0
+
+        # Initialize wake word detection only if enabled
+        if WAKE_WORD_ENABLED:
+            try:
+                logger.info("Initializing wake word model...")
+                self.wake_word_model = Model(vad_threshold=0.5)
+                self.last_prediction = None
+                logger.info("Wake word model initialized successfully")
+            except Exception as e:
+                logger.error(f"Failed to initialize wake word model: {e}")
+                raise
+        else:
+            self.wake_word_model = None
+            self.last_prediction = None
+            logger.info("Wake word detection disabled")
+
+    def should_transcribe(self):
+        """Determine if we should transcribe based on mode and timing"""
+        current_time = datetime.now().timestamp()
+        if not WAKE_WORD_ENABLED:
+            # Check if enough time has passed since last transcription
+            time_since_last = current_time - self.last_transcription_time
+            if time_since_last >= CONTINUOUS_TRANSCRIPTION_INTERVAL:
+                # Only transcribe if we detect speech
+                frames_per_chunk = CHUNK_SIZE
+                min_speech_frames = int(MIN_SPEECH_DURATION * SAMPLE_RATE / frames_per_chunk)
+
+                if self.speech_frames >= min_speech_frames:
+                    self.last_transcription_time = current_time
+                    self.speech_frames = 0  # Reset counter
+                    return True
+        return False
+
     def audio_callback(self, indata, frames, time, status):
         """Callback for audio input"""
         if status:
-            print(f"Audio callback status: {status}")
+            logger.warning(f"Audio callback status: {status}")
 
         # Convert to mono if necessary
         if CHANNELS > 1:
@@ -54,25 +283,45 @@ class AudioProcessor:
         else:
             audio_data = indata.flatten()
 
+        # Check for speech
+        if is_speech(audio_data):
+            self.speech_frames += 1
+            self.silence_frames = 0
+        else:
+            self.silence_frames += 1
+            frames_per_chunk = CHUNK_SIZE
+            silence_frames_threshold = int(SILENCE_DURATION * SAMPLE_RATE / frames_per_chunk)
+
+            if self.silence_frames >= silence_frames_threshold:
+                self.speech_frames = 0
+
         # Update circular buffer
         with self.buffer_lock:
             self.buffer = np.roll(self.buffer, -len(audio_data))
             self.buffer[-len(audio_data):] = audio_data
 
-        # Process for wake word detection
-        prediction = self.wake_word_model.predict(audio_data)
-
-        # Check if wake word detected
-        for wake_word in WAKE_WORDS:
-            if prediction[wake_word] > DETECTION_THRESHOLD:
-                print(f"Wake word detected: {wake_word} (confidence: {prediction[wake_word]:.2f})")
-                self.save_audio_segment(wake_word)
-                break
+        if WAKE_WORD_ENABLED:
+            # Process for wake word detection
+            self.last_prediction = self.wake_word_model.predict(audio_data)
+
+            # Check if wake word detected
+            for wake_word in WAKE_WORDS:
+                confidence = self.last_prediction[wake_word]
+                if confidence > DETECTION_THRESHOLD:
+                    logger.info(
+                        f"Wake word: {WAKE_WORD_ALIAS} (confidence: {confidence:.2f})"
+                    )
+                    self.process_audio()
+                    break
+        else:
+            # Continuous transcription mode
+            if self.should_transcribe():
+                self.process_audio()
 
-    def save_audio_segment(self, wake_word):
-        """Save the audio buffer when wake word is detected"""
+    def process_audio(self):
+        """Process the current audio buffer (save and transcribe)"""
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        filename = f"/audio/wake_word_{wake_word}_{timestamp}.wav"
+        filename = f"/audio/audio_segment_{timestamp}.wav"
 
         # Save the audio buffer to a WAV file
         with wave.open(filename, 'wb') as wf:
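Note on the circular buffer: `np.roll` allocates and copies the full 10-second buffer (160,000 floats) on every ~64 ms callback, inside PortAudio's callback thread. That works, but a write-index ring buffer avoids the copy entirely. A sketch of the alternative (hypothetical, not part of this commit; assumes a chunk never exceeds the buffer size):

    import numpy as np

    class RingBuffer:
        """Fixed-size ring buffer; write() is O(chunk), not O(buffer)."""
        def __init__(self, size):
            self.data = np.zeros(size, dtype=np.float32)
            self.idx = 0  # next write position

        def write(self, chunk):
            n = len(chunk)
            end = self.idx + n
            if end <= len(self.data):
                self.data[self.idx:end] = chunk
            else:  # wrap around the end of the array
                k = len(self.data) - self.idx
                self.data[self.idx:] = chunk[:k]
                self.data[:n - k] = chunk[k:]
            self.idx = end % len(self.data)

        def snapshot(self):
            # One ordered copy, oldest to newest, taken only when reading
            return np.concatenate((self.data[self.idx:], self.data[:self.idx]))
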
@@ -84,89 +333,83 @@ class AudioProcessor:
             audio_data = (self.buffer * 32767).astype(np.int16)
             wf.writeframes(audio_data.tobytes())
 
-        print(f"Saved audio segment to {filename}")
+        logger.info(f"Saved audio segment to {filename}")
 
-        # Transcribe the audio
+        # Transcribe the audio with German language preference
         try:
             segments, info = asr_model.transcribe(
                 filename,
-                language="en",
+                language="de",  # Set German as preferred language
                 beam_size=5,
                 temperature=0
             )
 
-            # Format the transcription result
-            result = {
-                "text": " ".join(segment.text for segment in segments),
-                "segments": [
-                    {
-                        "text": segment.text,
-                        "start": segment.start,
-                        "end": segment.end,
-                        "confidence": segment.confidence
-                    }
-                    for segment in segments
-                ]
-            }
-
-            # Save metadata and transcription
-            metadata = {
-                "timestamp": timestamp,
-                "wake_word": wake_word,
-                "wake_word_confidence": float(prediction[wake_word]),
-                "sample_rate": SAMPLE_RATE,
-                "channels": CHANNELS,
-                "duration": BUFFER_DURATION,
-                "transcription": result
-            }
-
-            with open(f"{filename}.json", 'w') as f:
-                json.dump(metadata, f, indent=2)
-
-            print("\nTranscription result:")
-            print(f"Text: {result['text']}")
-            print("\nSegments:")
-            for segment in result["segments"]:
-                print(f"[{segment['start']:.2f}s - {segment['end']:.2f}s] ({segment['confidence']:.2%})")
-                print(f'"{segment["text"]}"')
-
+            # Get the full transcribed text
+            transcribed_text = " ".join(segment.text for segment in segments)
+            logger.info(f"Transcribed text: {transcribed_text}")
+
+            # Process the command
+            process_command(transcribed_text)
+
         except Exception as e:
-            print(f"Error during transcription: {e}")
-            metadata = {
-                "timestamp": timestamp,
-                "wake_word": wake_word,
-                "wake_word_confidence": float(prediction[wake_word]),
-                "sample_rate": SAMPLE_RATE,
-                "channels": CHANNELS,
-                "duration": BUFFER_DURATION,
-                "error": str(e)
-            }
-            with open(f"{filename}.json", 'w') as f:
-                json.dump(metadata, f, indent=2)
+            logger.error(f"Error during transcription or processing: {e}")
 
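Note: `segments` from faster-whisper's `transcribe()` is a lazy generator, so the actual decoding happens inside the `" ".join(...)`; `info` (unused here) still carries the detected language and its probability. A standalone sketch of the same call for debugging a saved segment (path hypothetical):

    from faster_whisper import WhisperModel

    model = WhisperModel("base", device="cpu", compute_type="int8")
    segments, info = model.transcribe("/audio/audio_segment_test.wav",
                                      language="de", beam_size=5, temperature=0)
    print(info.language, info.language_probability)
    print(" ".join(s.text for s in segments))  # decoding happens here
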
     def start(self):
         """Start audio processing"""
         try:
-            print("Initializing wake word detection...")
-            print(f"Loaded wake words: {', '.join(WAKE_WORDS)}")
-
-            with sd.InputStream(
-                channels=CHANNELS,
-                samplerate=SAMPLE_RATE,
-                blocksize=CHUNK_SIZE,
-                callback=self.audio_callback
-            ):
-                print("\nWake word detection started. Listening...")
-                print("Press Ctrl+C to stop")
-
-                while True:
-                    sd.sleep(1000)  # Sleep for 1 second
+            logger.info("Starting audio processor...")
+
+            # Log configuration
+            logger.debug(f"Sample Rate: {SAMPLE_RATE}")
+            logger.debug(f"Channels: {CHANNELS}")
+            logger.debug(f"Chunk Size: {CHUNK_SIZE}")
+            logger.debug(f"Buffer Duration: {BUFFER_DURATION}")
+            logger.debug(f"Wake Word Enabled: {WAKE_WORD_ENABLED}")
+            logger.debug(f"Speech Enabled: {SPEECH_ENABLED}")
+            logger.debug(f"ASR Model: {os.environ.get('ASR_MODEL')}")
+
+            if WAKE_WORD_ENABLED:
+                logger.info("Initializing wake word detection...")
+                logger.info(f"Loaded wake words: {', '.join(WAKE_WORDS)}")
+            else:
+                logger.info("Starting continuous transcription mode...")
+                interval = CONTINUOUS_TRANSCRIPTION_INTERVAL
+                logger.info(f"Will transcribe every {interval} seconds")
+
+            try:
+                logger.debug("Setting up audio input stream...")
+                with sd.InputStream(
+                    channels=CHANNELS,
+                    samplerate=SAMPLE_RATE,
+                    blocksize=CHUNK_SIZE,
+                    callback=self.audio_callback
+                ):
+                    logger.info("Audio input stream started successfully")
+                    logger.info("Listening for audio input...")
+                    logger.info("Press Ctrl+C to stop")
+
+                    while True:
+                        sd.sleep(1000)  # Sleep for 1 second
+
+            except sd.PortAudioError as e:
+                logger.error(f"Error setting up audio stream: {e}")
+                logger.error("Check if microphone is connected and accessible")
+                raise
+            except Exception as e:
+                logger.error(f"Unexpected error in audio stream: {e}")
+                raise
 
         except KeyboardInterrupt:
-            print("\nStopping wake word detection...")
+            logger.info("\nStopping audio processing...")
         except Exception as e:
-            print(f"Error in audio processing: {e}")
+            logger.error("Critical error in audio processing", exc_info=True)
+            raise
 
 if __name__ == "__main__":
-    processor = AudioProcessor()
-    processor.start()
+    try:
+        logger.info("Initializing AudioProcessor...")
+        processor = AudioProcessor()
+        processor.start()
+    except Exception as e:
+        logger.error("Failed to start AudioProcessor", exc_info=True)
+        raise