import os
import json
import queue
import threading

import numpy as np
import sounddevice as sd

from openwakeword import Model
from datetime import datetime
import wave
from faster_whisper import WhisperModel
import requests
import logging
import time

# Set up logging
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration
SAMPLE_RATE = 16000
CHANNELS = 1
CHUNK_SIZE = 1024
BUFFER_DURATION = 10  # seconds of audio to keep in the rolling buffer
DETECTION_THRESHOLD = 0.5
CONTINUOUS_TRANSCRIPTION_INTERVAL = 3  # seconds between transcriptions
MAX_MODEL_LOAD_RETRIES = 3
MODEL_LOAD_RETRY_DELAY = 5  # seconds
MODEL_DOWNLOAD_TIMEOUT = 600  # 10 minute timeout for model download

# Audio processing parameters
NOISE_THRESHOLD = 0.08  # Increased threshold for better noise filtering
MIN_SPEECH_DURATION = 2.0  # Longer minimum duration to avoid fragments
SILENCE_DURATION = 1.0  # Longer silence duration
MAX_REPETITIONS = 1  # More aggressive repetition filtering
ECHO_THRESHOLD = 0.75  # More sensitive echo detection
MIN_SEGMENT_DURATION = 1.0  # Longer minimum segment duration
FEEDBACK_WINDOW = 5  # Window size for feedback detection in seconds

# Feature flags from environment
WAKE_WORD_ENABLED = os.environ.get('ENABLE_WAKE_WORD', 'false').lower() == 'true'
SPEECH_ENABLED = os.environ.get('ENABLE_SPEECH_FEATURES', 'true').lower() == 'true'

# Wake word models to use (only if wake word detection is enabled)
WAKE_WORDS = ["alexa"]  # Using 'alexa' as a temporary replacement for 'gaja'
WAKE_WORD_ALIAS = "gaja"  # What we report when the wake word is detected

# Home Assistant configuration
HASS_HOST = os.environ.get('HASS_HOST', 'http://homeassistant.local:8123')
HASS_TOKEN = os.environ.get('HASS_TOKEN')
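
# Typical environment for a container deployment (illustrative values only):
#   ENABLE_WAKE_WORD=true ENABLE_SPEECH_FEATURES=true \
#   HASS_HOST=http://homeassistant.local:8123 HASS_TOKEN=<long-lived token>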


def initialize_asr_model():
    """Initialize the ASR model with retries and a download timeout."""
    model_path = os.environ.get('ASR_MODEL_PATH', '/models')
    model_name = os.environ.get('ASR_MODEL', 'large-v3')

    start_time = time.time()
    for attempt in range(MAX_MODEL_LOAD_RETRIES):
        try:
            if time.time() - start_time > MODEL_DOWNLOAD_TIMEOUT:
                logger.error("Model download timeout exceeded")
                raise TimeoutError("Model download took too long")

            logger.info(f"Loading ASR model (attempt {attempt + 1}/{MAX_MODEL_LOAD_RETRIES})")
            model = WhisperModel(
                model_size_or_path=model_name,
                device="cpu",
                compute_type="int8",
                download_root=model_path,
                num_workers=1  # Reduce concurrent downloads
            )
            logger.info("ASR model loaded successfully")
            return model
        except Exception as e:
            logger.error(f"Failed to load ASR model (attempt {attempt + 1}): {e}")
            if attempt < MAX_MODEL_LOAD_RETRIES - 1:
                logger.info(f"Retrying in {MODEL_LOAD_RETRY_DELAY} seconds...")
                time.sleep(MODEL_LOAD_RETRY_DELAY)
            else:
                logger.error("Failed to load ASR model after all retries")
                raise
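
# The model name and cache directory are environment-driven; a lighter
# deployment might set ASR_MODEL=base.en with ASR_MODEL_PATH=/models
# (illustrative values). compute_type="int8" quantizes weights for CPU use.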

# Initialize the ASR model with retries
try:
    asr_model = initialize_asr_model()
except Exception as e:
    logger.error(f"Critical error initializing ASR model: {e}")
    raise


def send_command_to_hass(domain, service, entity_id):
    """Send a service call to Home Assistant."""
    if not HASS_TOKEN:
        logger.error("HASS_TOKEN not set")
        return False

    headers = {
        "Authorization": f"Bearer {HASS_TOKEN}",
        "Content-Type": "application/json",
    }

    url = f"{HASS_HOST}/api/services/{domain}/{service}"
    data = {"entity_id": entity_id}

    try:
        # A timeout keeps the audio thread from blocking indefinitely if
        # Home Assistant is unreachable
        response = requests.post(url, headers=headers, json=data, timeout=10)
        response.raise_for_status()
        logger.info(f"Command sent: {domain}.{service} for {entity_id}")
        return True
    except Exception as e:
        logger.error(f"Error sending command to Home Assistant: {e}")
        return False
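
# Example call (hypothetical entity id; real ids depend on your Home
# Assistant setup):
#   send_command_to_hass("light", "turn_on", "light.living_room")
# posts {"entity_id": "light.living_room"} to
# <HASS_HOST>/api/services/light/turn_on with the bearer token.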


def is_speech(audio_data, threshold=NOISE_THRESHOLD):
    """Detect if audio segment contains speech based on amplitude and frequency content"""
    # Calculate RMS amplitude
    rms = np.sqrt(np.mean(np.square(audio_data)))

    # Calculate signal energy in speech frequency range (100-4000 Hz)
    fft = np.fft.fft(audio_data)
    freqs = np.fft.fftfreq(len(audio_data), 1 / SAMPLE_RATE)
    speech_mask = (np.abs(freqs) >= 100) & (np.abs(freqs) <= 4000)
    speech_energy = np.sum(np.abs(fft[speech_mask])) / len(audio_data)

    # Enhanced echo detection
    # 1. Check for periodic patterns in the signal
    autocorr = np.correlate(audio_data, audio_data, mode='full')
    autocorr = autocorr[len(autocorr) // 2:]  # Use only positive lags
    peaks = np.where(autocorr > ECHO_THRESHOLD * np.max(autocorr))[0]
    peak_spacing = np.diff(peaks)
    has_periodic_echo = len(peak_spacing) > 2 and np.std(peak_spacing) < 0.1 * np.mean(peak_spacing)

    # 2. Check for sudden amplitude changes
    amplitude_envelope = np.abs(audio_data)
    amplitude_changes = np.diff(amplitude_envelope)
    has_feedback_spikes = np.any(np.abs(amplitude_changes) > threshold * 2)

    # 3. Check frequency distribution
    freq_magnitudes = np.abs(fft)[:len(fft) // 2]
    peak_freqs = freqs[:len(fft) // 2][np.argsort(freq_magnitudes)[-3:]]
    has_feedback_freqs = np.any((peak_freqs > 2000) & (peak_freqs < 4000))

    # Combine all criteria
    is_valid_speech = (
        rms > threshold and
        speech_energy > threshold and
        not has_periodic_echo and
        not has_feedback_spikes and
        not has_feedback_freqs
    )

    return is_valid_speech
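
# is_speech combines five heuristics: a chunk must clear the RMS and
# speech-band-energy floors, and must not look like feedback (evenly spaced
# autocorrelation peaks, abrupt amplitude jumps, or dominant energy in the
# 2-4 kHz band). All thresholds are heuristic and worth tuning per device.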


def process_command(text):
    """Process the transcribed command and execute appropriate action"""
    text = text.lower().strip()

    # Skip if text is too short or contains numbers (likely noise)
    if len(text) < 5 or any(char.isdigit() for char in text):
        logger.debug("Text too short or contains numbers, skipping")
        return

    # Enhanced noise pattern detection
    noise_patterns = ["lei", "los", "und", "aber", "nicht mehr", "das das", "und und"]
    for pattern in noise_patterns:
        if text.count(pattern) > 1:  # More aggressive pattern filtering
            logger.debug(f"Detected noise pattern '{pattern}', skipping")
            return

    # More aggressive repetition detection
    words = text.split()
    if len(words) >= 2:
        # Check for immediate word repetitions
        for i in range(len(words) - 1):
            if words[i] == words[i + 1]:
                logger.debug(f"Detected immediate word repetition: '{words[i]}', skipping")
                return

        # Check for phrase repetitions
        phrases = [' '.join(words[i:i + 2]) for i in range(len(words) - 1)]
        phrase_counts = {}
        for phrase in phrases:
            phrase_counts[phrase] = phrase_counts.get(phrase, 0) + 1
            if phrase_counts[phrase] > MAX_REPETITIONS:
                logger.debug(f"Skipping due to excessive repetition: '{phrase}'")
                return

    # German command mappings
    commands = {
        "ausschalten": "turn_off",
        "einschalten": "turn_on",
        "an": "turn_on",
        "aus": "turn_off"
    }

    rooms = {
        "wohnzimmer": "living_room",
        "küche": "kitchen",
        "schlafzimmer": "bedroom",
        "bad": "bathroom"
    }

    # Detect room
    detected_room = None
    for german_room, english_room in rooms.items():
        if german_room in text:
            detected_room = english_room
            break

    # Detect command
    detected_command = None
    for german_cmd, english_cmd in commands.items():
        if german_cmd in text:
            detected_command = english_cmd
            break

    if detected_room and detected_command:
        # Construct entity ID (assuming light)
        entity_id = f"light.{detected_room}"

        # Send command to Home Assistant
        if send_command_to_hass("light", detected_command, entity_id):
            logger.info(f"Executed: {detected_command} for {entity_id}")
        else:
            logger.error("Failed to execute command")
    else:
        logger.debug(f"No command found in text: '{text}'")
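
# Example: process_command("wohnzimmer licht einschalten") matches the room
# "wohnzimmer" and the command "einschalten", so it calls
# send_command_to_hass("light", "turn_on", "light.living_room"). Matching is
# naive substring search, so short keys like "an"/"aus" can also fire inside
# longer words.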


class AudioProcessor:
    def __init__(self):
        logger.info("Initializing AudioProcessor...")
        self.audio_buffer = queue.Queue()
        self.recording = False
        self.buffer = np.zeros(SAMPLE_RATE * BUFFER_DURATION)
        self.buffer_lock = threading.Lock()
        self.last_transcription_time = 0
        self.stream = None
        self.speech_detected = False
        self.silence_frames = 0
        self.speech_frames = 0
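
        # openwakeword note (based on its documented behavior): Model()
        # without an explicit wakeword list loads the library's bundled
        # pretrained models, and predict() returns scores keyed by model
        # name, so WAKE_WORDS entries must match those names (hence "alexa").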
        # Initialize wake word detection only if enabled
        if WAKE_WORD_ENABLED:
            try:
                logger.info("Initializing wake word model...")
                self.wake_word_model = Model(vad_threshold=0.5)
                self.last_prediction = None
                logger.info("Wake word model initialized successfully")
            except Exception as e:
                logger.error(f"Failed to initialize wake word model: {e}")
                raise
        else:
            self.wake_word_model = None
            self.last_prediction = None
            logger.info("Wake word detection disabled")

    def should_transcribe(self):
        """Determine if we should transcribe based on mode and timing"""
        current_time = datetime.now().timestamp()

        if not WAKE_WORD_ENABLED:
            # Check if enough time has passed since last transcription
            time_since_last = current_time - self.last_transcription_time
            if time_since_last >= CONTINUOUS_TRANSCRIPTION_INTERVAL:
                # Only transcribe if we detect speech
                frames_per_chunk = CHUNK_SIZE
                min_speech_frames = int(MIN_SPEECH_DURATION * SAMPLE_RATE / frames_per_chunk)

                if self.speech_frames >= min_speech_frames:
                    self.last_transcription_time = current_time
                    self.speech_frames = 0  # Reset counter
                    return True
        return False
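
    # With the defaults (2.0 s minimum speech, 16 kHz, 1024-sample chunks),
    # min_speech_frames is int(2.0 * 16000 / 1024) == 31 speech-positive
    # callbacks before a transcription is triggered.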
def audio_callback(self, indata, frames, time, status):
|
|
|
|
|
"""Callback for audio input"""
|
|
|
|
|
if status:
|
|
|
|
|
print(f"Audio callback status: {status}")
|
|
|
|
|
logger.warning(f"Audio callback status: {status}")
|
|
|
|
|
|
|
|
|
|
# Convert to mono if necessary
|
|
|
|
|
if CHANNELS > 1:
|
|
|
|
|
@@ -54,25 +283,45 @@ class AudioProcessor:
|
|
|
|
|
        else:
            audio_data = indata.flatten()

        # Check for speech
        if is_speech(audio_data):
            self.speech_frames += 1
            self.silence_frames = 0
        else:
            self.silence_frames += 1
            frames_per_chunk = CHUNK_SIZE
            silence_frames_threshold = int(SILENCE_DURATION * SAMPLE_RATE / frames_per_chunk)

            if self.silence_frames >= silence_frames_threshold:
                self.speech_frames = 0
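
        # The buffer acts as a ring: np.roll shifts existing samples left and
        # the new chunk overwrites the tail, so self.buffer always holds the
        # most recent BUFFER_DURATION seconds of audio.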
        # Update circular buffer
        with self.buffer_lock:
            self.buffer = np.roll(self.buffer, -len(audio_data))
            self.buffer[-len(audio_data):] = audio_data

        if WAKE_WORD_ENABLED:
            # Process for wake word detection
            self.last_prediction = self.wake_word_model.predict(audio_data)

            # Check if a wake word was detected
            for wake_word in WAKE_WORDS:
                confidence = self.last_prediction[wake_word]
                if confidence > DETECTION_THRESHOLD:
                    logger.info(
                        f"Wake word: {WAKE_WORD_ALIAS} (confidence: {confidence:.2f})"
                    )
                    self.process_audio()
                    break
        else:
            # Continuous transcription mode
            if self.should_transcribe():
                self.process_audio()

    def process_audio(self):
        """Process the current audio buffer (save and transcribe)"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"/audio/audio_segment_{timestamp}.wav"

        # Save the audio buffer to a WAV file
        with wave.open(filename, 'wb') as wf:
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(2)  # 16-bit samples
            wf.setframerate(SAMPLE_RATE)
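            # Samples are floating point in [-1, 1]; scale to int16 for the
            # WAV container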
            audio_data = (self.buffer * 32767).astype(np.int16)
            wf.writeframes(audio_data.tobytes())

        logger.info(f"Saved audio segment to {filename}")

        # Transcribe the audio with German language preference
        try:
            segments, info = asr_model.transcribe(
                filename,
                language="de",  # Set German as preferred language
                beam_size=5,
                temperature=0
            )
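
            # language="de" pins decoding to German instead of Whisper's
            # auto-detection; temperature=0 disables sampling, so beam search
            # is deterministic for a given input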

            # faster-whisper yields segments lazily; materialize the
            # generator so it can be iterated more than once below
            segments = list(segments)

            # Format the transcription result
            result = {
                "text": " ".join(segment.text for segment in segments),
                "segments": [
                    {
                        "text": segment.text,
                        "start": segment.start,
                        "end": segment.end,
                        # faster-whisper exposes avg_logprob rather than a
                        # direct confidence score
                        "avg_logprob": segment.avg_logprob
                    }
                    for segment in segments
                ]
            }

            # Get the full transcribed text
            transcribed_text = result["text"]
            logger.info(f"Transcribed text: {transcribed_text}")

            # Save metadata and the transcription alongside the audio
            metadata = {
                "timestamp": timestamp,
                "sample_rate": SAMPLE_RATE,
                "channels": CHANNELS,
                "duration": BUFFER_DURATION,
                "transcription": result
            }
            # Wake word details only exist in wake word mode
            if WAKE_WORD_ENABLED and self.last_prediction is not None:
                metadata["wake_word"] = WAKE_WORD_ALIAS
                metadata["wake_word_confidence"] = float(
                    max(self.last_prediction[w] for w in WAKE_WORDS)
                )

            with open(f"{filename}.json", 'w') as f:
                json.dump(metadata, f, indent=2)
print("\nTranscription result:")
|
|
|
|
|
print(f"Text: {result['text']}")
|
|
|
|
|
print("\nSegments:")
|
|
|
|
|
for segment in result["segments"]:
|
|
|
|
|
print(f"[{segment['start']:.2f}s - {segment['end']:.2f}s] ({segment['confidence']:.2%})")
|
|
|
|
|
print(f'"{segment["text"]}"')
|
|
|
|
|
# Process the command
|
|
|
|
|
process_command(transcribed_text)
|
|
|
|
|
|
|
|
|
|

        except Exception as e:
            logger.error(f"Error during transcription or processing: {e}")
            # Record the failure next to the audio file for debugging
            metadata = {
                "timestamp": timestamp,
                "sample_rate": SAMPLE_RATE,
                "channels": CHANNELS,
                "duration": BUFFER_DURATION,
                "error": str(e)
            }
            with open(f"{filename}.json", 'w') as f:
                json.dump(metadata, f, indent=2)

    def start(self):
        """Start audio processing"""
        try:
            logger.info("Starting audio processor...")

            # Log configuration
            logger.debug(f"Sample Rate: {SAMPLE_RATE}")
            logger.debug(f"Channels: {CHANNELS}")
            logger.debug(f"Chunk Size: {CHUNK_SIZE}")
            logger.debug(f"Buffer Duration: {BUFFER_DURATION}")
            logger.debug(f"Wake Word Enabled: {WAKE_WORD_ENABLED}")
            logger.debug(f"Speech Enabled: {SPEECH_ENABLED}")
            logger.debug(f"ASR Model: {os.environ.get('ASR_MODEL')}")

            if WAKE_WORD_ENABLED:
                logger.info("Initializing wake word detection...")
                logger.info(f"Loaded wake words: {', '.join(WAKE_WORDS)}")
            else:
                logger.info("Starting continuous transcription mode...")
                interval = CONTINUOUS_TRANSCRIPTION_INTERVAL
                logger.info(f"Will transcribe every {interval} seconds")

            try:
                logger.debug("Setting up audio input stream...")
                with sd.InputStream(
                    channels=CHANNELS,
                    samplerate=SAMPLE_RATE,
                    blocksize=CHUNK_SIZE,
                    callback=self.audio_callback
                ):
                    logger.info("Audio input stream started successfully")
                    logger.info("Listening for audio input...")
                    logger.info("Press Ctrl+C to stop")

                    while True:
                        sd.sleep(1000)  # Sleep for 1 second
            except sd.PortAudioError as e:
                logger.error(f"Error setting up audio stream: {e}")
                logger.error("Check if the microphone is connected and accessible")
                raise
            except Exception as e:
                logger.error(f"Unexpected error in audio stream: {e}")
                raise

        except KeyboardInterrupt:
            logger.info("Stopping audio processing...")
        except Exception:
            logger.error("Critical error in audio processing", exc_info=True)
            raise
if __name__ == "__main__":
|
|
|
|
|
try:
|
|
|
|
|
logger.info("Initializing AudioProcessor...")
|
|
|
|
|
processor = AudioProcessor()
|
|
|
|
|
processor.start()
|
|
|
|
|
except Exception as e:
|
|
|
|
|
logger.error("Failed to start AudioProcessor", exc_info=True)
|
|
|
|
|
raise
|