feat(speech): enhance speech processing with advanced audio setup and detection
- Add audio setup script for PulseAudio configuration
- Improve wake word detection with advanced noise filtering
- Implement continuous transcription and command processing
- Update speech Dockerfile with additional audio dependencies
- Enhance logging and error handling in wake word detector
@@ -53,6 +53,9 @@ COPY --from=builder --chown=bunjs:nodejs /app/dist ./dist
 COPY --from=builder --chown=bunjs:nodejs /app/node_modules ./node_modules
 COPY --chown=bunjs:nodejs package.json ./
 
+# Create logs directory with proper permissions
+RUN mkdir -p /app/logs && chown -R bunjs:nodejs /app/logs
+
 # Switch to non-root user
 USER bunjs
 
@@ -13,9 +13,10 @@ RUN python -m venv /opt/venv
 ENV PATH="/opt/venv/bin:$PATH"
 
 # Install Python dependencies with specific versions and CPU-only variants
-RUN pip install --no-cache-dir numpy==1.24.3
-RUN pip install --no-cache-dir torch==2.1.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cpu
-RUN pip install --no-cache-dir faster-whisper==0.10.0 openwakeword==0.4.0 pyaudio==0.2.14 sounddevice==0.4.6
+RUN pip install --no-cache-dir "numpy>=1.24.3,<2.0.0" && \
+    pip install --no-cache-dir torch==2.1.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cpu && \
+    pip install --no-cache-dir faster-whisper==0.10.0 openwakeword==0.4.0 pyaudio==0.2.14 sounddevice==0.4.6 requests==2.31.0 && \
+    pip freeze > /opt/venv/requirements.txt
 
 # Create final image
 FROM python:3.10-slim
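
Note: the relaxed NumPy pin (`>=1.24.3,<2.0.0`) keeps the image on the NumPy 1.x ABI; the pinned torch 2.1.2 CPU wheels predate NumPy 2.0 and, as far as we can tell, are not binary compatible with it. Chaining the installs into one RUN also yields a single layer plus a `pip freeze` manifest at /opt/venv/requirements.txt. A minimal sanity check, assuming it is run inside the built image:

    # verify_pins.py -- hypothetical smoke test for the built image
    import numpy, torch, faster_whisper  # importing is already half the test
    assert numpy.__version__.startswith("1."), "NumPy 2.x would break torch 2.1.2"
    print("numpy", numpy.__version__, "| torch", torch.__version__)
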
@@ -24,10 +25,14 @@ FROM python:3.10-slim
 COPY --from=builder /opt/venv /opt/venv
 ENV PATH="/opt/venv/bin:$PATH"
 
-# Install only runtime dependencies
+# Install audio dependencies
 RUN apt-get update && apt-get install -y \
     portaudio19-dev \
     python3-pyaudio \
+    alsa-utils \
+    libasound2 \
+    libasound2-plugins \
+    pulseaudio \
     && rm -rf /var/lib/apt/lists/*
 
 # Create necessary directories
@@ -55,5 +60,9 @@ ENV PYTHONMALLOC=malloc \
 HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
     CMD ps aux | grep '[p]ython' || exit 1
 
-# Run the wake word detection service with resource constraints
-CMD ["python", "-X", "faulthandler", "wake_word_detector.py"]
+# Copy audio setup script
+COPY setup-audio.sh /setup-audio.sh
+RUN chmod +x /setup-audio.sh
+
+# Start command
+CMD ["/bin/bash", "-c", "/setup-audio.sh && python -u wake_word_detector.py"]
docker/speech/setup-audio.sh (new executable file, 16 lines)
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+# Wait for PulseAudio to be ready
+sleep 2
+
+# Mute the monitor to prevent feedback
+pactl set-source-mute alsa_output.pci-0000_00_1b.0.analog-stereo.monitor 1
+
+# Set microphone sensitivity to 65%
+pactl set-source-volume alsa_input.pci-0000_00_1b.0.analog-stereo 65%
+
+# Set speaker volume to 40%
+pactl set-sink-volume alsa_output.pci-0000_00_1b.0.analog-stereo 40%
+
+# Make the script executable
+chmod +x /setup-audio.sh
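
Note: the device names baked into setup-audio.sh (`alsa_output.pci-0000_00_1b.0.analog-stereo` and its `.monitor` source) are specific to one host's sound card; on other hardware the `pactl` calls will simply fail (the script has no `set -e`, so startup continues regardless). The final `chmod +x /setup-audio.sh` is also redundant at runtime, since the Dockerfile already marks the script executable. A small sketch, assuming `pactl` is on PATH inside the container, for discovering the names to substitute:

    import subprocess

    # Print PulseAudio sources (inputs/monitors) and sinks (outputs).
    # The second column of each row is the device name the script expects.
    for kind in ("sources", "sinks"):
        out = subprocess.run(["pactl", "list", "short", kind],
                             capture_output=True, text=True, check=True).stdout
        print(f"--- {kind} ---\n{out}")
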
@@ -8,45 +8,274 @@ from openwakeword import Model
 from datetime import datetime
 import wave
 from faster_whisper import WhisperModel
+import requests
+import logging
+import time
+
+# Set up logging
+logging.basicConfig(
+    level=logging.DEBUG,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
 # Configuration
 SAMPLE_RATE = 16000
 CHANNELS = 1
 CHUNK_SIZE = 1024
-BUFFER_DURATION = 30  # seconds to keep in buffer
+BUFFER_DURATION = 10  # seconds to keep in buffer
 DETECTION_THRESHOLD = 0.5
+CONTINUOUS_TRANSCRIPTION_INTERVAL = 3  # seconds between transcriptions
+MAX_MODEL_LOAD_RETRIES = 3
+MODEL_LOAD_RETRY_DELAY = 5  # seconds
+MODEL_DOWNLOAD_TIMEOUT = 600  # 10 minutes timeout for model download
 
-# Wake word models to use
-WAKE_WORDS = ["hey_jarvis", "ok_google", "alexa"]
+# Audio processing parameters
+NOISE_THRESHOLD = 0.08  # Increased threshold for better noise filtering
+MIN_SPEECH_DURATION = 2.0  # Longer minimum duration to avoid fragments
+SILENCE_DURATION = 1.0  # Longer silence duration
+MAX_REPETITIONS = 1  # More aggressive repetition filtering
+ECHO_THRESHOLD = 0.75  # More sensitive echo detection
+MIN_SEGMENT_DURATION = 1.0  # Longer minimum segment duration
+FEEDBACK_WINDOW = 5  # Window size for feedback detection in seconds
 
-# Initialize the ASR model
-asr_model = WhisperModel(
-    model_size_or_path=os.environ.get('ASR_MODEL', 'base.en'),
-    device="cpu",
-    compute_type="int8",
-    download_root=os.environ.get('ASR_MODEL_PATH', '/models')
-)
+# Feature flags from environment
+WAKE_WORD_ENABLED = os.environ.get('ENABLE_WAKE_WORD', 'false').lower() == 'true'
+SPEECH_ENABLED = os.environ.get('ENABLE_SPEECH_FEATURES', 'true').lower() == 'true'
+
+# Wake word models to use (only if wake word is enabled)
+WAKE_WORDS = ["alexa"]  # Using 'alexa' as temporary replacement for 'gaja'
+WAKE_WORD_ALIAS = "gaja"  # What we print when wake word is detected
+
+# Home Assistant Configuration
+HASS_HOST = os.environ.get('HASS_HOST', 'http://homeassistant.local:8123')
+HASS_TOKEN = os.environ.get('HASS_TOKEN')
+
+def initialize_asr_model():
+    """Initialize the ASR model with retries and timeout"""
+    model_path = os.environ.get('ASR_MODEL_PATH', '/models')
+    model_name = os.environ.get('ASR_MODEL', 'large-v3')
+
+    start_time = time.time()
+    for attempt in range(MAX_MODEL_LOAD_RETRIES):
+        try:
+            if time.time() - start_time > MODEL_DOWNLOAD_TIMEOUT:
+                logger.error("Model download timeout exceeded")
+                raise TimeoutError("Model download took too long")
+
+            logger.info(f"Loading ASR model (attempt {attempt + 1}/{MAX_MODEL_LOAD_RETRIES})")
+            model = WhisperModel(
+                model_size_or_path=model_name,
+                device="cpu",
+                compute_type="int8",
+                download_root=model_path,
+                num_workers=1  # Reduce concurrent downloads
+            )
+            logger.info("ASR model loaded successfully")
+            return model
+        except Exception as e:
+            logger.error(f"Failed to load ASR model (attempt {attempt + 1}): {e}")
+            if attempt < MAX_MODEL_LOAD_RETRIES - 1:
+                logger.info(f"Retrying in {MODEL_LOAD_RETRY_DELAY} seconds...")
+                time.sleep(MODEL_LOAD_RETRY_DELAY)
+            else:
+                logger.error("Failed to load ASR model after all retries")
+                raise
+
+# Initialize the ASR model with retries
+try:
+    asr_model = initialize_asr_model()
+except Exception as e:
+    logger.error(f"Critical error initializing ASR model: {e}")
+    raise
+
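Note: the default ASR model is now `large-v3` (was `base.en`), a multi-gigabyte download on first start, which is what the retry loop and 10-minute timeout guard against. The timeout is only checked between attempts, though, so a download that hangs mid-attempt is not interrupted. A sketch of warming the cache ahead of time so the service never hits that path, assuming the same /models volume is available when it runs:

    # warm_model.py -- hypothetical one-off script; pre-downloads the
    # weights into the cache that initialize_asr_model() reads.
    from faster_whisper import WhisperModel

    WhisperModel("large-v3", device="cpu", compute_type="int8",
                 download_root="/models")
    print("large-v3 cached under /models")
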
+def send_command_to_hass(domain, service, entity_id):
+    """Send command to Home Assistant"""
+    if not HASS_TOKEN:
+        logger.error("Error: HASS_TOKEN not set")
+        return False
+
+    headers = {
+        "Authorization": f"Bearer {HASS_TOKEN}",
+        "Content-Type": "application/json",
+    }
+
+    url = f"{HASS_HOST}/api/services/{domain}/{service}"
+    data = {"entity_id": entity_id}
+
+    try:
+        response = requests.post(url, headers=headers, json=data)
+        response.raise_for_status()
+        logger.info(f"Command sent: {domain}.{service} for {entity_id}")
+        return True
+    except Exception as e:
+        logger.error(f"Error sending command to Home Assistant: {e}")
+        return False
+
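Note: this helper maps onto Home Assistant's REST endpoint `POST /api/services/<domain>/<service>`, authorized with a long-lived access token. A hypothetical smoke test (placeholder entity ID; HASS_HOST and HASS_TOKEN must be set in the environment before the module is imported):

    if send_command_to_hass("light", "turn_on", "light.living_room"):
        print("light.living_room switched on")

    # Equivalent raw request, for debugging without this module:
    #   POST {HASS_HOST}/api/services/light/turn_on
    #   Authorization: Bearer <token>
    #   {"entity_id": "light.living_room"}

One refinement worth considering: `requests.post` without a `timeout` can block the audio pipeline indefinitely if Home Assistant is unreachable; passing `timeout=5` would bound it.
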
+def is_speech(audio_data, threshold=NOISE_THRESHOLD):
+    """Detect if audio segment contains speech based on amplitude and frequency content"""
+    # Calculate RMS amplitude
+    rms = np.sqrt(np.mean(np.square(audio_data)))
+
+    # Calculate signal energy in speech frequency range (100-4000 Hz)
+    fft = np.fft.fft(audio_data)
+    freqs = np.fft.fftfreq(len(audio_data), 1/SAMPLE_RATE)
+    speech_mask = (np.abs(freqs) >= 100) & (np.abs(freqs) <= 4000)
+    speech_energy = np.sum(np.abs(fft[speech_mask])) / len(audio_data)
+
+    # Enhanced echo detection
+    # 1. Check for periodic patterns in the signal
+    autocorr = np.correlate(audio_data, audio_data, mode='full')
+    autocorr = autocorr[len(autocorr)//2:]  # Use only positive lags
+    peaks = np.where(autocorr > ECHO_THRESHOLD * np.max(autocorr))[0]
+    peak_spacing = np.diff(peaks)
+    has_periodic_echo = len(peak_spacing) > 2 and np.std(peak_spacing) < 0.1 * np.mean(peak_spacing)
+
+    # 2. Check for sudden amplitude changes
+    amplitude_envelope = np.abs(audio_data)
+    amplitude_changes = np.diff(amplitude_envelope)
+    has_feedback_spikes = np.any(np.abs(amplitude_changes) > threshold * 2)
+
+    # 3. Check frequency distribution
+    freq_magnitudes = np.abs(fft)[:len(fft)//2]
+    peak_freqs = freqs[:len(fft)//2][np.argsort(freq_magnitudes)[-3:]]
+    has_feedback_freqs = np.any((peak_freqs > 2000) & (peak_freqs < 4000))
+
+    # Combine all criteria
+    is_valid_speech = (
+        rms > threshold and
+        speech_energy > threshold and
+        not has_periodic_echo and
+        not has_feedback_spikes and
+        not has_feedback_freqs
+    )
+
+    return is_valid_speech
+
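Note: since `is_speech` combines several heuristics, it is easy to probe with synthetic signals. A rough sketch, run in this module's context so the constants resolve (exact outcomes depend on the thresholds above):

    import numpy as np

    rng = np.random.default_rng(0)
    quiet = 0.01 * rng.standard_normal(CHUNK_SIZE)  # well below NOISE_THRESHOLD
    loud = 0.50 * rng.standard_normal(CHUNK_SIZE)   # loud but jumpy

    print(is_speech(quiet))  # False: RMS stays under the 0.08 threshold
    print(is_speech(loud))   # False: sample-to-sample jumps trip the
                             # feedback-spike check (> threshold * 2)

Real speech sits between these extremes: enough in-band energy to clear the thresholds, without the sharp amplitude discontinuities characteristic of feedback.
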
+def process_command(text):
+    """Process the transcribed command and execute appropriate action"""
+    text = text.lower().strip()
+
+    # Skip if text is too short or contains numbers (likely noise)
+    if len(text) < 5 or any(char.isdigit() for char in text):
+        logger.debug("Text too short or contains numbers, skipping")
+        return
+
+    # Enhanced noise pattern detection
+    noise_patterns = ["lei", "los", "und", "aber", "nicht mehr", "das das", "und und"]
+    for pattern in noise_patterns:
+        if text.count(pattern) > 1:  # More aggressive pattern filtering
+            logger.debug(f"Detected noise pattern '{pattern}', skipping")
+            return
+
+    # More aggressive repetition detection
+    words = text.split()
+    if len(words) >= 2:
+        # Check for immediate word repetitions
+        for i in range(len(words)-1):
+            if words[i] == words[i+1]:
+                logger.debug(f"Detected immediate word repetition: '{words[i]}', skipping")
+                return
+
+        # Check for phrase repetitions
+        phrases = [' '.join(words[i:i+2]) for i in range(len(words)-1)]
+        phrase_counts = {}
+        for phrase in phrases:
+            phrase_counts[phrase] = phrase_counts.get(phrase, 0) + 1
+            if phrase_counts[phrase] > MAX_REPETITIONS:
+                logger.debug(f"Skipping due to excessive repetition: '{phrase}'")
+                return
+
+    # German command mappings
+    commands = {
+        "ausschalten": "turn_off",
+        "einschalten": "turn_on",
+        "an": "turn_on",
+        "aus": "turn_off"
+    }
+
+    rooms = {
+        "wohnzimmer": "living_room",
+        "küche": "kitchen",
+        "schlafzimmer": "bedroom",
+        "bad": "bathroom"
+    }
+
+    # Detect room
+    detected_room = None
+    for german_room, english_room in rooms.items():
+        if german_room in text:
+            detected_room = english_room
+            break
+
+    # Detect command
+    detected_command = None
+    for german_cmd, english_cmd in commands.items():
+        if german_cmd in text:
+            detected_command = english_cmd
+            break
+
+    if detected_room and detected_command:
+        # Construct entity ID (assuming light)
+        entity_id = f"light.{detected_room}"
+
+        # Send command to Home Assistant
+        if send_command_to_hass("light", detected_command, entity_id):
+            logger.info(f"Executed: {detected_command} for {entity_id}")
+        else:
+            logger.error("Failed to execute command")
+    else:
+        logger.debug(f"No command found in text: '{text}'")
+
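Note: room and command matching is plain substring search over the lower-cased transcript, so word order is irrelevant; the bare "an"/"aus" entries will also match inside longer words, which the noise filters only partially compensate for. A hypothetical trace:

    process_command("Licht im Wohnzimmer einschalten")
    # -> detected_room = "living_room", detected_command = "turn_on"
    # -> send_command_to_hass("light", "turn_on", "light.living_room")

    process_command("aus")  # under 5 characters: skipped as noise
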
 class AudioProcessor:
     def __init__(self):
-        # Initialize wake word detection model
-        self.wake_word_model = Model(
-            inference_framework="onnx"  # Use ONNX for better performance
-        )
-
-        # Pre-load the wake word models
-        for wake_word in WAKE_WORDS:
-            self.wake_word_model.add_model(wake_word)
-
+        logger.info("Initializing AudioProcessor...")
         self.audio_buffer = queue.Queue()
         self.recording = False
         self.buffer = np.zeros(SAMPLE_RATE * BUFFER_DURATION)
         self.buffer_lock = threading.Lock()
+        self.last_transcription_time = 0
+        self.stream = None
+        self.speech_detected = False
+        self.silence_frames = 0
+        self.speech_frames = 0
+
+        # Initialize wake word detection only if enabled
+        if WAKE_WORD_ENABLED:
+            try:
+                logger.info("Initializing wake word model...")
+                self.wake_word_model = Model(vad_threshold=0.5)
+                self.last_prediction = None
+                logger.info("Wake word model initialized successfully")
+            except Exception as e:
+                logger.error(f"Failed to initialize wake word model: {e}")
+                raise
+        else:
+            self.wake_word_model = None
+            self.last_prediction = None
+            logger.info("Wake word detection disabled")
+
+    def should_transcribe(self):
+        """Determine if we should transcribe based on mode and timing"""
+        current_time = datetime.now().timestamp()
+        if not WAKE_WORD_ENABLED:
+            # Check if enough time has passed since last transcription
+            time_since_last = current_time - self.last_transcription_time
+            if time_since_last >= CONTINUOUS_TRANSCRIPTION_INTERVAL:
+                # Only transcribe if we detect speech
+                frames_per_chunk = CHUNK_SIZE
+                min_speech_frames = int(MIN_SPEECH_DURATION * SAMPLE_RATE / frames_per_chunk)
+
+                if self.speech_frames >= min_speech_frames:
+                    self.last_transcription_time = current_time
+                    self.speech_frames = 0  # Reset counter
+                    return True
+        return False
+
     def audio_callback(self, indata, frames, time, status):
         """Callback for audio input"""
         if status:
-            print(f"Audio callback status: {status}")
+            logger.warning(f"Audio callback status: {status}")
 
         # Convert to mono if necessary
         if CHANNELS > 1:
@@ -54,25 +283,45 @@ class AudioProcessor:
         else:
             audio_data = indata.flatten()
 
+        # Check for speech
+        if is_speech(audio_data):
+            self.speech_frames += 1
+            self.silence_frames = 0
+        else:
+            self.silence_frames += 1
+            frames_per_chunk = CHUNK_SIZE
+            silence_frames_threshold = int(SILENCE_DURATION * SAMPLE_RATE / frames_per_chunk)
+
+            if self.silence_frames >= silence_frames_threshold:
+                self.speech_frames = 0
+
         # Update circular buffer
         with self.buffer_lock:
             self.buffer = np.roll(self.buffer, -len(audio_data))
             self.buffer[-len(audio_data):] = audio_data
 
-        # Process for wake word detection
-        prediction = self.wake_word_model.predict(audio_data)
-
-        # Check if wake word detected
-        for wake_word in WAKE_WORDS:
-            if prediction[wake_word] > DETECTION_THRESHOLD:
-                print(f"Wake word detected: {wake_word} (confidence: {prediction[wake_word]:.2f})")
-                self.save_audio_segment(wake_word)
-                break
+        if WAKE_WORD_ENABLED:
+            # Process for wake word detection
+            self.last_prediction = self.wake_word_model.predict(audio_data)
+
+            # Check if wake word detected
+            for wake_word in WAKE_WORDS:
+                confidence = self.last_prediction[wake_word]
+                if confidence > DETECTION_THRESHOLD:
+                    logger.info(
+                        f"Wake word: {WAKE_WORD_ALIAS} (confidence: {confidence:.2f})"
+                    )
+                    self.process_audio()
+                    break
+        else:
+            # Continuous transcription mode
+            if self.should_transcribe():
+                self.process_audio()
 
-    def save_audio_segment(self, wake_word):
-        """Save the audio buffer when wake word is detected"""
+    def process_audio(self):
+        """Process the current audio buffer (save and transcribe)"""
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        filename = f"/audio/wake_word_{wake_word}_{timestamp}.wav"
+        filename = f"/audio/audio_segment_{timestamp}.wav"
 
         # Save the audio buffer to a WAV file
         with wave.open(filename, 'wb') as wf:
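Note on the circular buffer: `np.roll` allocates and copies the full 10-second buffer (160,000 floats) on every ~64 ms callback, inside PortAudio's callback thread. That works, but a write-index ring buffer avoids the copy entirely. A sketch of the alternative (hypothetical, not part of this commit; assumes a chunk never exceeds the buffer size):

    import numpy as np

    class RingBuffer:
        """Fixed-size ring buffer; write() is O(chunk), not O(buffer)."""
        def __init__(self, size):
            self.data = np.zeros(size, dtype=np.float32)
            self.idx = 0  # next write position

        def write(self, chunk):
            n = len(chunk)
            end = self.idx + n
            if end <= len(self.data):
                self.data[self.idx:end] = chunk
            else:  # wrap around the end of the array
                k = len(self.data) - self.idx
                self.data[self.idx:] = chunk[:k]
                self.data[:n - k] = chunk[k:]
            self.idx = end % len(self.data)

        def snapshot(self):
            # One ordered copy, oldest to newest, taken only when reading
            return np.concatenate((self.data[self.idx:], self.data[:self.idx]))
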
@@ -84,89 +333,83 @@ class AudioProcessor:
             audio_data = (self.buffer * 32767).astype(np.int16)
             wf.writeframes(audio_data.tobytes())
 
-        print(f"Saved audio segment to {filename}")
+        logger.info(f"Saved audio segment to {filename}")
 
-        # Transcribe the audio
+        # Transcribe the audio with German language preference
         try:
             segments, info = asr_model.transcribe(
                 filename,
-                language="en",
+                language="de",  # Set German as preferred language
                 beam_size=5,
                 temperature=0
             )
 
-            # Format the transcription result
-            result = {
-                "text": " ".join(segment.text for segment in segments),
-                "segments": [
-                    {
-                        "text": segment.text,
-                        "start": segment.start,
-                        "end": segment.end,
-                        "confidence": segment.confidence
-                    }
-                    for segment in segments
-                ]
-            }
-
-            # Save metadata and transcription
-            metadata = {
-                "timestamp": timestamp,
-                "wake_word": wake_word,
-                "wake_word_confidence": float(prediction[wake_word]),
-                "sample_rate": SAMPLE_RATE,
-                "channels": CHANNELS,
-                "duration": BUFFER_DURATION,
-                "transcription": result
-            }
-
-            with open(f"{filename}.json", 'w') as f:
-                json.dump(metadata, f, indent=2)
-
-            print("\nTranscription result:")
-            print(f"Text: {result['text']}")
-            print("\nSegments:")
-            for segment in result["segments"]:
-                print(f"[{segment['start']:.2f}s - {segment['end']:.2f}s] ({segment['confidence']:.2%})")
-                print(f'"{segment["text"]}"')
-
+            # Get the full transcribed text
+            transcribed_text = " ".join(segment.text for segment in segments)
+            logger.info(f"Transcribed text: {transcribed_text}")
+
+            # Process the command
+            process_command(transcribed_text)
+
         except Exception as e:
-            print(f"Error during transcription: {e}")
-            metadata = {
-                "timestamp": timestamp,
-                "wake_word": wake_word,
-                "wake_word_confidence": float(prediction[wake_word]),
-                "sample_rate": SAMPLE_RATE,
-                "channels": CHANNELS,
-                "duration": BUFFER_DURATION,
-                "error": str(e)
-            }
-            with open(f"{filename}.json", 'w') as f:
-                json.dump(metadata, f, indent=2)
+            logger.error(f"Error during transcription or processing: {e}")
 
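Note: `segments` from faster-whisper's `transcribe()` is a lazy generator, so the actual decoding happens inside the `" ".join(...)`; `info` (unused here) still carries the detected language and its probability. A standalone sketch of the same call for debugging a saved segment (path hypothetical):

    from faster_whisper import WhisperModel

    model = WhisperModel("base", device="cpu", compute_type="int8")
    segments, info = model.transcribe("/audio/audio_segment_test.wav",
                                      language="de", beam_size=5, temperature=0)
    print(info.language, info.language_probability)
    print(" ".join(s.text for s in segments))  # decoding happens here
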
     def start(self):
         """Start audio processing"""
         try:
-            print("Initializing wake word detection...")
-            print(f"Loaded wake words: {', '.join(WAKE_WORDS)}")
-
-            with sd.InputStream(
-                channels=CHANNELS,
-                samplerate=SAMPLE_RATE,
-                blocksize=CHUNK_SIZE,
-                callback=self.audio_callback
-            ):
-                print("\nWake word detection started. Listening...")
-                print("Press Ctrl+C to stop")
-
-                while True:
-                    sd.sleep(1000)  # Sleep for 1 second
+            logger.info("Starting audio processor...")
+
+            # Log configuration
+            logger.debug(f"Sample Rate: {SAMPLE_RATE}")
+            logger.debug(f"Channels: {CHANNELS}")
+            logger.debug(f"Chunk Size: {CHUNK_SIZE}")
+            logger.debug(f"Buffer Duration: {BUFFER_DURATION}")
+            logger.debug(f"Wake Word Enabled: {WAKE_WORD_ENABLED}")
+            logger.debug(f"Speech Enabled: {SPEECH_ENABLED}")
+            logger.debug(f"ASR Model: {os.environ.get('ASR_MODEL')}")
+
+            if WAKE_WORD_ENABLED:
+                logger.info("Initializing wake word detection...")
+                logger.info(f"Loaded wake words: {', '.join(WAKE_WORDS)}")
+            else:
+                logger.info("Starting continuous transcription mode...")
+                interval = CONTINUOUS_TRANSCRIPTION_INTERVAL
+                logger.info(f"Will transcribe every {interval} seconds")
+
+            try:
+                logger.debug("Setting up audio input stream...")
+                with sd.InputStream(
+                    channels=CHANNELS,
+                    samplerate=SAMPLE_RATE,
+                    blocksize=CHUNK_SIZE,
+                    callback=self.audio_callback
+                ):
+                    logger.info("Audio input stream started successfully")
+                    logger.info("Listening for audio input...")
+                    logger.info("Press Ctrl+C to stop")
+
+                    while True:
+                        sd.sleep(1000)  # Sleep for 1 second
+
+            except sd.PortAudioError as e:
+                logger.error(f"Error setting up audio stream: {e}")
+                logger.error("Check if microphone is connected and accessible")
+                raise
+            except Exception as e:
+                logger.error(f"Unexpected error in audio stream: {e}")
+                raise
 
         except KeyboardInterrupt:
-            print("\nStopping wake word detection...")
+            logger.info("\nStopping audio processing...")
         except Exception as e:
-            print(f"Error in audio processing: {e}")
+            logger.error("Critical error in audio processing", exc_info=True)
+            raise
 
 if __name__ == "__main__":
-    processor = AudioProcessor()
-    processor.start()
+    try:
+        logger.info("Initializing AudioProcessor...")
+        processor = AudioProcessor()
+        processor.start()
+    except Exception as e:
+        logger.error("Failed to start AudioProcessor", exc_info=True)
+        raise