diff --git a/Dockerfile b/Dockerfile
index 96f59fc..d28f856 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -53,6 +53,9 @@ COPY --from=builder --chown=bunjs:nodejs /app/dist ./dist
 COPY --from=builder --chown=bunjs:nodejs /app/node_modules ./node_modules
 COPY --chown=bunjs:nodejs package.json ./
 
+# Create logs directory with proper permissions
+RUN mkdir -p /app/logs && chown -R bunjs:nodejs /app/logs
+
 # Switch to non-root user
 USER bunjs
 
diff --git a/docker/speech/Dockerfile b/docker/speech/Dockerfile
index a1fd93d..c28de94 100644
--- a/docker/speech/Dockerfile
+++ b/docker/speech/Dockerfile
@@ -13,9 +13,10 @@ RUN python -m venv /opt/venv
 ENV PATH="/opt/venv/bin:$PATH"
 
 # Install Python dependencies with specific versions and CPU-only variants
-RUN pip install --no-cache-dir numpy==1.24.3
-RUN pip install --no-cache-dir torch==2.1.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cpu
-RUN pip install --no-cache-dir faster-whisper==0.10.0 openwakeword==0.4.0 pyaudio==0.2.14 sounddevice==0.4.6
+RUN pip install --no-cache-dir "numpy>=1.24.3,<2.0.0" && \
+    pip install --no-cache-dir torch==2.1.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cpu && \
+    pip install --no-cache-dir faster-whisper==0.10.0 openwakeword==0.4.0 pyaudio==0.2.14 sounddevice==0.4.6 requests==2.31.0 && \
+    pip freeze > /opt/venv/requirements.txt
 
 # Create final image
 FROM python:3.10-slim
@@ -24,10 +25,15 @@ FROM python:3.10-slim
 COPY --from=builder /opt/venv /opt/venv
 ENV PATH="/opt/venv/bin:$PATH"
 
-# Install only runtime dependencies
+# Install audio dependencies (procps provides the 'ps' used by the HEALTHCHECK below)
 RUN apt-get update && apt-get install -y \
     portaudio19-dev \
     python3-pyaudio \
+    alsa-utils \
+    libasound2 \
+    libasound2-plugins \
+    pulseaudio \
+    procps \
     && rm -rf /var/lib/apt/lists/*
 
 # Create necessary directories
@@ -55,5 +61,9 @@ ENV PYTHONMALLOC=malloc \
 HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
     CMD ps aux | grep '[p]ython' || exit 1
 
-# Run the wake word detection service with resource constraints
-CMD ["python", "-X", "faulthandler", "wake_word_detector.py"]
\ No newline at end of file
+# Copy audio setup script
+COPY setup-audio.sh /setup-audio.sh
+RUN chmod +x /setup-audio.sh
+
+# Start command
+CMD ["/bin/bash", "-c", "/setup-audio.sh && python -u wake_word_detector.py"]
\ No newline at end of file
diff --git a/docker/speech/setup-audio.sh b/docker/speech/setup-audio.sh
new file mode 100755
index 0000000..e50466d
--- /dev/null
+++ b/docker/speech/setup-audio.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+# Wait for PulseAudio to be ready
+sleep 2
+
+# Mute the monitor to prevent feedback
+pactl set-source-mute alsa_output.pci-0000_00_1b.0.analog-stereo.monitor 1
+
+# Set microphone sensitivity to 65%
+pactl set-source-volume alsa_input.pci-0000_00_1b.0.analog-stereo 65%
+
+# Set speaker volume to 40%
+pactl set-sink-volume alsa_output.pci-0000_00_1b.0.analog-stereo 40%
+
+# Exit cleanly even if a pactl call failed (device names vary per host); the CMD chains this script with '&&'
+exit 0
\ No newline at end of file
diff --git a/docker/speech/wake_word_detector.py b/docker/speech/wake_word_detector.py
index d1442dc..c5d2340 100644
--- a/docker/speech/wake_word_detector.py
+++ b/docker/speech/wake_word_detector.py
@@ -8,45 +8,274 @@ from openwakeword import Model
 from datetime import datetime
 import wave
 from faster_whisper import WhisperModel
+import requests
+import logging
+import time
+
+# Set up logging
+logging.basicConfig(
+    level=logging.DEBUG,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
 
 # Configuration
 SAMPLE_RATE = 16000
 CHANNELS = 1
 CHUNK_SIZE = 1024
-BUFFER_DURATION = 30 # seconds to keep in buffer
+BUFFER_DURATION = 10 # seconds to keep in buffer
 DETECTION_THRESHOLD = 0.5
+CONTINUOUS_TRANSCRIPTION_INTERVAL = 3 # seconds between transcriptions
+MAX_MODEL_LOAD_RETRIES = 3
+MODEL_LOAD_RETRY_DELAY = 5 # seconds
+MODEL_DOWNLOAD_TIMEOUT = 600 # 10 minutes timeout for model download
 
-# Wake word models to use
-WAKE_WORDS = ["hey_jarvis", "ok_google", "alexa"]
+# Audio processing parameters
+NOISE_THRESHOLD = 0.08 # Increased threshold for better noise filtering
+MIN_SPEECH_DURATION = 2.0 # Longer minimum duration to avoid fragments
+SILENCE_DURATION = 1.0 # Longer silence duration
+MAX_REPETITIONS = 1 # More aggressive repetition filtering
+ECHO_THRESHOLD = 0.75 # More sensitive echo detection
+MIN_SEGMENT_DURATION = 1.0 # Longer minimum segment duration
+FEEDBACK_WINDOW = 5 # Window size for feedback detection in seconds
 
-# Initialize the ASR model
-asr_model = WhisperModel(
-    model_size_or_path=os.environ.get('ASR_MODEL', 'base.en'),
-    device="cpu",
-    compute_type="int8",
-    download_root=os.environ.get('ASR_MODEL_PATH', '/models')
-)
+# Feature flags from environment
+WAKE_WORD_ENABLED = os.environ.get('ENABLE_WAKE_WORD', 'false').lower() == 'true'
+SPEECH_ENABLED = os.environ.get('ENABLE_SPEECH_FEATURES', 'true').lower() == 'true'
+
+# Wake word models to use (only if wake word is enabled)
+WAKE_WORDS = ["alexa"] # Using 'alexa' as temporary replacement for 'gaja'
+WAKE_WORD_ALIAS = "gaja" # What we print when wake word is detected
+
+# Home Assistant Configuration
+HASS_HOST = os.environ.get('HASS_HOST', 'http://homeassistant.local:8123')
+HASS_TOKEN = os.environ.get('HASS_TOKEN')
+
+def initialize_asr_model():
+    """Initialize the ASR model with retries and timeout"""
+    model_path = os.environ.get('ASR_MODEL_PATH', '/models')
+    model_name = os.environ.get('ASR_MODEL', 'large-v3')
+
+    start_time = time.time()
+    for attempt in range(MAX_MODEL_LOAD_RETRIES):
+        try:
+            # Note: this bounds the retry loop overall, not an individual download attempt
+            if time.time() - start_time > MODEL_DOWNLOAD_TIMEOUT:
+                logger.error("Model download timeout exceeded")
+                raise TimeoutError("Model download took too long")
+
+            logger.info(f"Loading ASR model (attempt {attempt + 1}/{MAX_MODEL_LOAD_RETRIES})")
+            model = WhisperModel(
+                model_size_or_path=model_name,
+                device="cpu",
+                compute_type="int8",
+                download_root=model_path,
+                num_workers=1 # Reduce concurrent downloads
+            )
+            logger.info("ASR model loaded successfully")
+            return model
+        except Exception as e:
+            logger.error(f"Failed to load ASR model (attempt {attempt + 1}): {e}")
+            if attempt < MAX_MODEL_LOAD_RETRIES - 1:
+                logger.info(f"Retrying in {MODEL_LOAD_RETRY_DELAY} seconds...")
+                time.sleep(MODEL_LOAD_RETRY_DELAY)
+            else:
+                logger.error("Failed to load ASR model after all retries")
+                raise
+
+# Initialize the ASR model with retries
+try:
+    asr_model = initialize_asr_model()
+except Exception as e:
+    logger.error(f"Critical error initializing ASR model: {e}")
+    raise
+
+def send_command_to_hass(domain, service, entity_id):
+    """Send command to Home Assistant"""
+    if not HASS_TOKEN:
+        logger.error("Error: HASS_TOKEN not set")
+        return False
+
+    headers = {
+        "Authorization": f"Bearer {HASS_TOKEN}",
+        "Content-Type": "application/json",
+    }
+
+    url = f"{HASS_HOST}/api/services/{domain}/{service}"
+    data = {"entity_id": entity_id}
+
+    try:
+        response = requests.post(url, headers=headers, json=data, timeout=10) # timeout so a slow HA call can't hang the audio path
+        response.raise_for_status()
+        logger.info(f"Command sent: {domain}.{service} for {entity_id}")
+        return True
+    except Exception as e:
logger.error(f"Error sending command to Home Assistant: {e}") + return False + +def is_speech(audio_data, threshold=NOISE_THRESHOLD): + """Detect if audio segment contains speech based on amplitude and frequency content""" + # Calculate RMS amplitude + rms = np.sqrt(np.mean(np.square(audio_data))) + + # Calculate signal energy in speech frequency range (100-4000 Hz) + fft = np.fft.fft(audio_data) + freqs = np.fft.fftfreq(len(audio_data), 1/SAMPLE_RATE) + speech_mask = (np.abs(freqs) >= 100) & (np.abs(freqs) <= 4000) + speech_energy = np.sum(np.abs(fft[speech_mask])) / len(audio_data) + + # Enhanced echo detection + # 1. Check for periodic patterns in the signal + autocorr = np.correlate(audio_data, audio_data, mode='full') + autocorr = autocorr[len(autocorr)//2:] # Use only positive lags + peaks = np.where(autocorr > ECHO_THRESHOLD * np.max(autocorr))[0] + peak_spacing = np.diff(peaks) + has_periodic_echo = len(peak_spacing) > 2 and np.std(peak_spacing) < 0.1 * np.mean(peak_spacing) + + # 2. Check for sudden amplitude changes + amplitude_envelope = np.abs(audio_data) + amplitude_changes = np.diff(amplitude_envelope) + has_feedback_spikes = np.any(np.abs(amplitude_changes) > threshold * 2) + + # 3. Check frequency distribution + freq_magnitudes = np.abs(fft)[:len(fft)//2] + peak_freqs = freqs[:len(fft)//2][np.argsort(freq_magnitudes)[-3:]] + has_feedback_freqs = np.any((peak_freqs > 2000) & (peak_freqs < 4000)) + + # Combine all criteria + is_valid_speech = ( + rms > threshold and + speech_energy > threshold and + not has_periodic_echo and + not has_feedback_spikes and + not has_feedback_freqs + ) + + return is_valid_speech + +def process_command(text): + """Process the transcribed command and execute appropriate action""" + text = text.lower().strip() + + # Skip if text is too short or contains numbers (likely noise) + if len(text) < 5 or any(char.isdigit() for char in text): + logger.debug("Text too short or contains numbers, skipping") + return + + # Enhanced noise pattern detection + noise_patterns = ["lei", "los", "und", "aber", "nicht mehr", "das das", "und und"] + for pattern in noise_patterns: + if text.count(pattern) > 1: # More aggressive pattern filtering + logger.debug(f"Detected noise pattern '{pattern}', skipping") + return + + # More aggressive repetition detection + words = text.split() + if len(words) >= 2: + # Check for immediate word repetitions + for i in range(len(words)-1): + if words[i] == words[i+1]: + logger.debug(f"Detected immediate word repetition: '{words[i]}', skipping") + return + + # Check for phrase repetitions + phrases = [' '.join(words[i:i+2]) for i in range(len(words)-1)] + phrase_counts = {} + for phrase in phrases: + phrase_counts[phrase] = phrase_counts.get(phrase, 0) + 1 + if phrase_counts[phrase] > MAX_REPETITIONS: + logger.debug(f"Skipping due to excessive repetition: '{phrase}'") + return + + # German command mappings + commands = { + "ausschalten": "turn_off", + "einschalten": "turn_on", + "an": "turn_on", + "aus": "turn_off" + } + + rooms = { + "wohnzimmer": "living_room", + "küche": "kitchen", + "schlafzimmer": "bedroom", + "bad": "bathroom" + } + + # Detect room + detected_room = None + for german_room, english_room in rooms.items(): + if german_room in text: + detected_room = english_room + break + + # Detect command + detected_command = None + for german_cmd, english_cmd in commands.items(): + if german_cmd in text: + detected_command = english_cmd + break + + if detected_room and detected_command: + # Construct entity ID (assuming light) + 
entity_id = f"light.{detected_room}" + + # Send command to Home Assistant + if send_command_to_hass("light", detected_command, entity_id): + logger.info(f"Executed: {detected_command} for {entity_id}") + else: + logger.error("Failed to execute command") + else: + logger.debug(f"No command found in text: '{text}'") class AudioProcessor: def __init__(self): - # Initialize wake word detection model - self.wake_word_model = Model( - inference_framework="onnx" # Use ONNX for better performance - ) - - # Pre-load the wake word models - for wake_word in WAKE_WORDS: - self.wake_word_model.add_model(wake_word) - + logger.info("Initializing AudioProcessor...") self.audio_buffer = queue.Queue() self.recording = False self.buffer = np.zeros(SAMPLE_RATE * BUFFER_DURATION) self.buffer_lock = threading.Lock() + self.last_transcription_time = 0 + self.stream = None + self.speech_detected = False + self.silence_frames = 0 + self.speech_frames = 0 + + # Initialize wake word detection only if enabled + if WAKE_WORD_ENABLED: + try: + logger.info("Initializing wake word model...") + self.wake_word_model = Model(vad_threshold=0.5) + self.last_prediction = None + logger.info("Wake word model initialized successfully") + except Exception as e: + logger.error(f"Failed to initialize wake word model: {e}") + raise + else: + self.wake_word_model = None + self.last_prediction = None + logger.info("Wake word detection disabled") + + def should_transcribe(self): + """Determine if we should transcribe based on mode and timing""" + current_time = datetime.now().timestamp() + if not WAKE_WORD_ENABLED: + # Check if enough time has passed since last transcription + time_since_last = current_time - self.last_transcription_time + if time_since_last >= CONTINUOUS_TRANSCRIPTION_INTERVAL: + # Only transcribe if we detect speech + frames_per_chunk = CHUNK_SIZE + min_speech_frames = int(MIN_SPEECH_DURATION * SAMPLE_RATE / frames_per_chunk) + + if self.speech_frames >= min_speech_frames: + self.last_transcription_time = current_time + self.speech_frames = 0 # Reset counter + return True + return False def audio_callback(self, indata, frames, time, status): """Callback for audio input""" if status: - print(f"Audio callback status: {status}") + logger.warning(f"Audio callback status: {status}") # Convert to mono if necessary if CHANNELS > 1: @@ -54,25 +283,45 @@ class AudioProcessor: else: audio_data = indata.flatten() + # Check for speech + if is_speech(audio_data): + self.speech_frames += 1 + self.silence_frames = 0 + else: + self.silence_frames += 1 + frames_per_chunk = CHUNK_SIZE + silence_frames_threshold = int(SILENCE_DURATION * SAMPLE_RATE / frames_per_chunk) + + if self.silence_frames >= silence_frames_threshold: + self.speech_frames = 0 + # Update circular buffer with self.buffer_lock: self.buffer = np.roll(self.buffer, -len(audio_data)) self.buffer[-len(audio_data):] = audio_data - # Process for wake word detection - prediction = self.wake_word_model.predict(audio_data) - - # Check if wake word detected - for wake_word in WAKE_WORDS: - if prediction[wake_word] > DETECTION_THRESHOLD: - print(f"Wake word detected: {wake_word} (confidence: {prediction[wake_word]:.2f})") - self.save_audio_segment(wake_word) - break + if WAKE_WORD_ENABLED: + # Process for wake word detection + self.last_prediction = self.wake_word_model.predict(audio_data) + + # Check if wake word detected + for wake_word in WAKE_WORDS: + confidence = self.last_prediction[wake_word] + if confidence > DETECTION_THRESHOLD: + logger.info( + f"Wake word: 
+                        f"Wake word: {WAKE_WORD_ALIAS} (confidence: {confidence:.2f})"
+                    )
+                    self.process_audio()
+                    break
+        else:
+            # Continuous transcription mode
+            if self.should_transcribe():
+                self.process_audio()
 
-    def save_audio_segment(self, wake_word):
-        """Save the audio buffer when wake word is detected"""
+    def process_audio(self):
+        """Process the current audio buffer (save and transcribe)"""
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        filename = f"/audio/wake_word_{wake_word}_{timestamp}.wav"
+        filename = f"/audio/audio_segment_{timestamp}.wav"
 
         # Save the audio buffer to a WAV file
         with wave.open(filename, 'wb') as wf:
@@ -84,89 +333,83 @@ class AudioProcessor:
             audio_data = (self.buffer * 32767).astype(np.int16)
             wf.writeframes(audio_data.tobytes())
 
-        print(f"Saved audio segment to {filename}")
+        logger.info(f"Saved audio segment to {filename}")
 
-        # Transcribe the audio
+        # Transcribe the audio with German language preference
        try:
             segments, info = asr_model.transcribe(
                 filename,
-                language="en",
+                language="de", # Set German as preferred language
                 beam_size=5,
                 temperature=0
             )
 
-            # Format the transcription result
-            result = {
-                "text": " ".join(segment.text for segment in segments),
-                "segments": [
-                    {
-                        "text": segment.text,
-                        "start": segment.start,
-                        "end": segment.end,
-                        "confidence": segment.confidence
-                    }
-                    for segment in segments
-                ]
-            }
+            # Get the full transcribed text
+            transcribed_text = " ".join(segment.text for segment in segments)
+            logger.info(f"Transcribed text: {transcribed_text}")
 
-            # Save metadata and transcription
-            metadata = {
-                "timestamp": timestamp,
-                "wake_word": wake_word,
-                "wake_word_confidence": float(prediction[wake_word]),
-                "sample_rate": SAMPLE_RATE,
-                "channels": CHANNELS,
-                "duration": BUFFER_DURATION,
-                "transcription": result
-            }
-
-            with open(f"{filename}.json", 'w') as f:
-                json.dump(metadata, f, indent=2)
-
-            print("\nTranscription result:")
-            print(f"Text: {result['text']}")
-            print("\nSegments:")
-            for segment in result["segments"]:
-                print(f"[{segment['start']:.2f}s - {segment['end']:.2f}s] ({segment['confidence']:.2%})")
-                print(f'"{segment["text"]}"')
+            # Process the command
+            process_command(transcribed_text)
 
         except Exception as e:
-            print(f"Error during transcription: {e}")
-            metadata = {
-                "timestamp": timestamp,
-                "wake_word": wake_word,
-                "wake_word_confidence": float(prediction[wake_word]),
-                "sample_rate": SAMPLE_RATE,
-                "channels": CHANNELS,
-                "duration": BUFFER_DURATION,
-                "error": str(e)
-            }
-            with open(f"{filename}.json", 'w') as f:
-                json.dump(metadata, f, indent=2)
+            logger.error(f"Error during transcription or processing: {e}")
 
     def start(self):
         """Start audio processing"""
         try:
-            print("Initializing wake word detection...")
-            print(f"Loaded wake words: {', '.join(WAKE_WORDS)}")
+            logger.info("Starting audio processor...")
 
-            with sd.InputStream(
-                channels=CHANNELS,
-                samplerate=SAMPLE_RATE,
-                blocksize=CHUNK_SIZE,
-                callback=self.audio_callback
-            ):
Listening...") - print("Press Ctrl+C to stop") - - while True: - sd.sleep(1000) # Sleep for 1 second + # Log configuration + logger.debug(f"Sample Rate: {SAMPLE_RATE}") + logger.debug(f"Channels: {CHANNELS}") + logger.debug(f"Chunk Size: {CHUNK_SIZE}") + logger.debug(f"Buffer Duration: {BUFFER_DURATION}") + logger.debug(f"Wake Word Enabled: {WAKE_WORD_ENABLED}") + logger.debug(f"Speech Enabled: {SPEECH_ENABLED}") + logger.debug(f"ASR Model: {os.environ.get('ASR_MODEL')}") + + if WAKE_WORD_ENABLED: + logger.info("Initializing wake word detection...") + logger.info(f"Loaded wake words: {', '.join(WAKE_WORDS)}") + else: + logger.info("Starting continuous transcription mode...") + interval = CONTINUOUS_TRANSCRIPTION_INTERVAL + logger.info(f"Will transcribe every {interval} seconds") + + try: + logger.debug("Setting up audio input stream...") + with sd.InputStream( + channels=CHANNELS, + samplerate=SAMPLE_RATE, + blocksize=CHUNK_SIZE, + callback=self.audio_callback + ): + logger.info("Audio input stream started successfully") + logger.info("Listening for audio input...") + logger.info("Press Ctrl+C to stop") + + while True: + sd.sleep(1000) # Sleep for 1 second + + except sd.PortAudioError as e: + logger.error(f"Error setting up audio stream: {e}") + logger.error("Check if microphone is connected and accessible") + raise + except Exception as e: + logger.error(f"Unexpected error in audio stream: {e}") + raise except KeyboardInterrupt: - print("\nStopping wake word detection...") + logger.info("\nStopping audio processing...") except Exception as e: - print(f"Error in audio processing: {e}") + logger.error("Critical error in audio processing", exc_info=True) + raise if __name__ == "__main__": - processor = AudioProcessor() - processor.start() \ No newline at end of file + try: + logger.info("Initializing AudioProcessor...") + processor = AudioProcessor() + processor.start() + except Exception as e: + logger.error("Failed to start AudioProcessor", exc_info=True) + raise \ No newline at end of file