feat(speech): enhance speech processing with advanced audio setup and detection
- Add audio setup script for PulseAudio configuration
- Improve wake word detection with advanced noise filtering
- Implement continuous transcription and command processing
- Update speech Dockerfile with additional audio dependencies
- Enhance logging and error handling in wake word detector
@@ -53,6 +53,9 @@ COPY --from=builder --chown=bunjs:nodejs /app/dist ./dist
 COPY --from=builder --chown=bunjs:nodejs /app/node_modules ./node_modules
 COPY --chown=bunjs:nodejs package.json ./
 
+# Create logs directory with proper permissions
+RUN mkdir -p /app/logs && chown -R bunjs:nodejs /app/logs
+
 # Switch to non-root user
 USER bunjs
 
@@ -13,9 +13,10 @@ RUN python -m venv /opt/venv
 ENV PATH="/opt/venv/bin:$PATH"
 
 # Install Python dependencies with specific versions and CPU-only variants
-RUN pip install --no-cache-dir numpy==1.24.3
-RUN pip install --no-cache-dir torch==2.1.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cpu
-RUN pip install --no-cache-dir faster-whisper==0.10.0 openwakeword==0.4.0 pyaudio==0.2.14 sounddevice==0.4.6
+RUN pip install --no-cache-dir "numpy>=1.24.3,<2.0.0" && \
+    pip install --no-cache-dir torch==2.1.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cpu && \
+    pip install --no-cache-dir faster-whisper==0.10.0 openwakeword==0.4.0 pyaudio==0.2.14 sounddevice==0.4.6 requests==2.31.0 && \
+    pip freeze > /opt/venv/requirements.txt
 
 # Create final image
 FROM python:3.10-slim
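To confirm the CPU-only torch wheel was actually picked up, a quick check along these lines can help (a sketch; the image tag is a placeholder):

```bash
# The +cpu build of torch reports no CUDA support.
docker run --rm <builder-image> \
  python -c "import torch; print(torch.__version__, torch.cuda.is_available())"
# expected output: 2.1.2+cpu False
```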
@@ -24,10 +25,14 @@ FROM python:3.10-slim
 COPY --from=builder /opt/venv /opt/venv
 ENV PATH="/opt/venv/bin:$PATH"
 
-# Install only runtime dependencies
+# Install audio dependencies
 RUN apt-get update && apt-get install -y \
     portaudio19-dev \
     python3-pyaudio \
+    alsa-utils \
+    libasound2 \
+    libasound2-plugins \
+    pulseaudio \
     && rm -rf /var/lib/apt/lists/*
 
 # Create necessary directories
@@ -55,5 +60,9 @@ ENV PYTHONMALLOC=malloc \
 HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
     CMD ps aux | grep '[p]ython' || exit 1
 
-# Run the wake word detection service with resource constraints
-CMD ["python", "-X", "faulthandler", "wake_word_detector.py"]
+# Copy audio setup script
+COPY setup-audio.sh /setup-audio.sh
+RUN chmod +x /setup-audio.sh
+
+# Start command
+CMD ["/bin/bash", "-c", "/setup-audio.sh && python -u wake_word_detector.py"]
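Note that a containerized PulseAudio/ALSA client can only reach the host's sound hardware if the device nodes and socket are passed through at run time; a hedged sketch (image name, user ID, and socket path are assumptions that vary per host):

```bash
docker run --rm \
  --device /dev/snd \
  -e PULSE_SERVER=unix:/run/user/1000/pulse/native \
  -v /run/user/1000/pulse/native:/run/user/1000/pulse/native \
  <speech-image>
```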
docker/speech/setup-audio.sh (new executable file, 16 lines)
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+# Wait for PulseAudio to be ready
+sleep 2
+
+# Mute the monitor to prevent feedback
+pactl set-source-mute alsa_output.pci-0000_00_1b.0.analog-stereo.monitor 1
+
+# Set microphone sensitivity to 65%
+pactl set-source-volume alsa_input.pci-0000_00_1b.0.analog-stereo 65%
+
+# Set speaker volume to 40%
+pactl set-sink-volume alsa_output.pci-0000_00_1b.0.analog-stereo 40%
+
+# Make the script executable
+chmod +x /setup-audio.sh
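The source and sink names above encode a specific PCI address, so they are host-specific; on another machine the right names can be looked up first:

```bash
pactl list short sources   # capture endpoints, including the output monitor
pactl list short sinks     # playback endpoints
```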
@@ -8,45 +8,274 @@ from openwakeword import Model
 from datetime import datetime
 import wave
 from faster_whisper import WhisperModel
+import requests
+import logging
+import time
+
+# Set up logging
+logging.basicConfig(
+    level=logging.DEBUG,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
 
 # Configuration
 SAMPLE_RATE = 16000
 CHANNELS = 1
 CHUNK_SIZE = 1024
-BUFFER_DURATION = 30 # seconds to keep in buffer
+BUFFER_DURATION = 10 # seconds to keep in buffer
 DETECTION_THRESHOLD = 0.5
+CONTINUOUS_TRANSCRIPTION_INTERVAL = 3 # seconds between transcriptions
+MAX_MODEL_LOAD_RETRIES = 3
+MODEL_LOAD_RETRY_DELAY = 5 # seconds
+MODEL_DOWNLOAD_TIMEOUT = 600 # 10 minutes timeout for model download
 
-# Wake word models to use
-WAKE_WORDS = ["hey_jarvis", "ok_google", "alexa"]
+# Audio processing parameters
+NOISE_THRESHOLD = 0.08 # Increased threshold for better noise filtering
+MIN_SPEECH_DURATION = 2.0 # Longer minimum duration to avoid fragments
+SILENCE_DURATION = 1.0 # Longer silence duration
+MAX_REPETITIONS = 1 # More aggressive repetition filtering
+ECHO_THRESHOLD = 0.75 # More sensitive echo detection
+MIN_SEGMENT_DURATION = 1.0 # Longer minimum segment duration
+FEEDBACK_WINDOW = 5 # Window size for feedback detection in seconds
 
-# Initialize the ASR model
-asr_model = WhisperModel(
-    model_size_or_path=os.environ.get('ASR_MODEL', 'base.en'),
-    device="cpu",
-    compute_type="int8",
-    download_root=os.environ.get('ASR_MODEL_PATH', '/models')
-)
+# Feature flags from environment
+WAKE_WORD_ENABLED = os.environ.get('ENABLE_WAKE_WORD', 'false').lower() == 'true'
+SPEECH_ENABLED = os.environ.get('ENABLE_SPEECH_FEATURES', 'true').lower() == 'true'
+
+# Wake word models to use (only if wake word is enabled)
+WAKE_WORDS = ["alexa"] # Using 'alexa' as temporary replacement for 'gaja'
+WAKE_WORD_ALIAS = "gaja" # What we print when wake word is detected
+
+# Home Assistant Configuration
+HASS_HOST = os.environ.get('HASS_HOST', 'http://homeassistant.local:8123')
+HASS_TOKEN = os.environ.get('HASS_TOKEN')
+
+def initialize_asr_model():
+    """Initialize the ASR model with retries and timeout"""
+    model_path = os.environ.get('ASR_MODEL_PATH', '/models')
+    model_name = os.environ.get('ASR_MODEL', 'large-v3')
+
+    start_time = time.time()
+    for attempt in range(MAX_MODEL_LOAD_RETRIES):
+        try:
+            if time.time() - start_time > MODEL_DOWNLOAD_TIMEOUT:
+                logger.error("Model download timeout exceeded")
+                raise TimeoutError("Model download took too long")
+
+            logger.info(f"Loading ASR model (attempt {attempt + 1}/{MAX_MODEL_LOAD_RETRIES})")
+            model = WhisperModel(
+                model_size_or_path=model_name,
+                device="cpu",
+                compute_type="int8",
+                download_root=model_path,
+                num_workers=1 # Reduce concurrent downloads
+            )
+            logger.info("ASR model loaded successfully")
+            return model
+        except Exception as e:
+            logger.error(f"Failed to load ASR model (attempt {attempt + 1}): {e}")
+            if attempt < MAX_MODEL_LOAD_RETRIES - 1:
+                logger.info(f"Retrying in {MODEL_LOAD_RETRY_DELAY} seconds...")
+                time.sleep(MODEL_LOAD_RETRY_DELAY)
+            else:
+                logger.error("Failed to load ASR model after all retries")
+                raise
+
+# Initialize the ASR model with retries
+try:
+    asr_model = initialize_asr_model()
+except Exception as e:
+    logger.error(f"Critical error initializing ASR model: {e}")
+    raise
 
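Since both the model name and cache directory come from the environment, a smaller model can be swapped in to stay well under the ten-minute download timeout (a hedged example; `tiny` is one of the standard faster-whisper model sizes):

```bash
# hypothetical quick-test invocation with a small model
ASR_MODEL=tiny ASR_MODEL_PATH=/models python -u wake_word_detector.py
```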
+def send_command_to_hass(domain, service, entity_id):
+    """Send command to Home Assistant"""
+    if not HASS_TOKEN:
+        logger.error("Error: HASS_TOKEN not set")
+        return False
+
+    headers = {
+        "Authorization": f"Bearer {HASS_TOKEN}",
+        "Content-Type": "application/json",
+    }
+
+    url = f"{HASS_HOST}/api/services/{domain}/{service}"
+    data = {"entity_id": entity_id}
+
+    try:
+        response = requests.post(url, headers=headers, json=data)
+        response.raise_for_status()
+        logger.info(f"Command sent: {domain}.{service} for {entity_id}")
+        return True
+    except Exception as e:
+        logger.error(f"Error sending command to Home Assistant: {e}")
+        return False
 
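This wraps Home Assistant's standard service-call REST endpoint; the equivalent request by hand, useful for checking the token and entity ID (values are placeholders):

```bash
curl -X POST \
  -H "Authorization: Bearer $HASS_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{"entity_id": "light.living_room"}' \
  "$HASS_HOST/api/services/light/turn_on"
```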
+def is_speech(audio_data, threshold=NOISE_THRESHOLD):
+    """Detect if audio segment contains speech based on amplitude and frequency content"""
+    # Calculate RMS amplitude
+    rms = np.sqrt(np.mean(np.square(audio_data)))
+
+    # Calculate signal energy in speech frequency range (100-4000 Hz)
+    fft = np.fft.fft(audio_data)
+    freqs = np.fft.fftfreq(len(audio_data), 1/SAMPLE_RATE)
+    speech_mask = (np.abs(freqs) >= 100) & (np.abs(freqs) <= 4000)
+    speech_energy = np.sum(np.abs(fft[speech_mask])) / len(audio_data)
+
+    # Enhanced echo detection
+    # 1. Check for periodic patterns in the signal
+    autocorr = np.correlate(audio_data, audio_data, mode='full')
+    autocorr = autocorr[len(autocorr)//2:] # Use only positive lags
+    peaks = np.where(autocorr > ECHO_THRESHOLD * np.max(autocorr))[0]
+    peak_spacing = np.diff(peaks)
+    has_periodic_echo = len(peak_spacing) > 2 and np.std(peak_spacing) < 0.1 * np.mean(peak_spacing)
+
+    # 2. Check for sudden amplitude changes
+    amplitude_envelope = np.abs(audio_data)
+    amplitude_changes = np.diff(amplitude_envelope)
+    has_feedback_spikes = np.any(np.abs(amplitude_changes) > threshold * 2)
+
+    # 3. Check frequency distribution
+    freq_magnitudes = np.abs(fft)[:len(fft)//2]
+    peak_freqs = freqs[:len(fft)//2][np.argsort(freq_magnitudes)[-3:]]
+    has_feedback_freqs = np.any((peak_freqs > 2000) & (peak_freqs < 4000))
+
+    # Combine all criteria
+    is_valid_speech = (
+        rms > threshold and
+        speech_energy > threshold and
+        not has_periodic_echo and
+        not has_feedback_spikes and
+        not has_feedback_freqs
+    )
+
+    return is_valid_speech
 
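A minimal sanity check of the gates, assuming the constants above (synthetic signals; real microphone frames behave differently):

```python
import numpy as np

# 16 kHz mono float frames, shaped like what the sounddevice callback delivers
silence = np.zeros(SAMPLE_RATE)
t = np.linspace(0, 1.0, SAMPLE_RATE, endpoint=False)
tone = 0.5 * np.sin(2 * np.pi * 440 * t)

print(is_speech(silence))  # False: rms = 0 fails the NOISE_THRESHOLD gate
print(is_speech(tone))     # clears the rms/energy gates; the echo and
                           # feedback heuristics decide the final verdict
```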
+def process_command(text):
+    """Process the transcribed command and execute appropriate action"""
+    text = text.lower().strip()
+
+    # Skip if text is too short or contains numbers (likely noise)
+    if len(text) < 5 or any(char.isdigit() for char in text):
+        logger.debug("Text too short or contains numbers, skipping")
+        return
+
+    # Enhanced noise pattern detection
+    noise_patterns = ["lei", "los", "und", "aber", "nicht mehr", "das das", "und und"]
+    for pattern in noise_patterns:
+        if text.count(pattern) > 1: # More aggressive pattern filtering
+            logger.debug(f"Detected noise pattern '{pattern}', skipping")
+            return
+
+    # More aggressive repetition detection
+    words = text.split()
+    if len(words) >= 2:
+        # Check for immediate word repetitions
+        for i in range(len(words)-1):
+            if words[i] == words[i+1]:
+                logger.debug(f"Detected immediate word repetition: '{words[i]}', skipping")
+                return
+
+        # Check for phrase repetitions
+        phrases = [' '.join(words[i:i+2]) for i in range(len(words)-1)]
+        phrase_counts = {}
+        for phrase in phrases:
+            phrase_counts[phrase] = phrase_counts.get(phrase, 0) + 1
+            if phrase_counts[phrase] > MAX_REPETITIONS:
+                logger.debug(f"Skipping due to excessive repetition: '{phrase}'")
+                return
+
+    # German command mappings
+    commands = {
+        "ausschalten": "turn_off",
+        "einschalten": "turn_on",
+        "an": "turn_on",
+        "aus": "turn_off"
+    }
+
+    rooms = {
+        "wohnzimmer": "living_room",
+        "küche": "kitchen",
+        "schlafzimmer": "bedroom",
+        "bad": "bathroom"
+    }
+
+    # Detect room
+    detected_room = None
+    for german_room, english_room in rooms.items():
+        if german_room in text:
+            detected_room = english_room
+            break
+
+    # Detect command
+    detected_command = None
+    for german_cmd, english_cmd in commands.items():
+        if german_cmd in text:
+            detected_command = english_cmd
+            break
+
+    if detected_room and detected_command:
+        # Construct entity ID (assuming light)
+        entity_id = f"light.{detected_room}"
+
+        # Send command to Home Assistant
+        if send_command_to_hass("light", detected_command, entity_id):
+            logger.info(f"Executed: {detected_command} for {entity_id}")
+        else:
+            logger.error("Failed to execute command")
+    else:
+        logger.debug(f"No command found in text: '{text}'")
 
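A worked example of the mapping (hypothetical transcription; with HASS_TOKEN set this would fire a real service call):

```python
# room:    "wohnzimmer"  -> "living_room"
# command: "einschalten" -> "turn_on"
# net effect: send_command_to_hass("light", "turn_on", "light.living_room")
process_command("licht im wohnzimmer einschalten")
```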
 class AudioProcessor:
     def __init__(self):
-        # Initialize wake word detection model
-        self.wake_word_model = Model(
-            inference_framework="onnx" # Use ONNX for better performance
-        )
-
-        # Pre-load the wake word models
-        for wake_word in WAKE_WORDS:
-            self.wake_word_model.add_model(wake_word)
-
+        logger.info("Initializing AudioProcessor...")
         self.audio_buffer = queue.Queue()
         self.recording = False
         self.buffer = np.zeros(SAMPLE_RATE * BUFFER_DURATION)
         self.buffer_lock = threading.Lock()
+        self.last_transcription_time = 0
         self.stream = None
+        self.speech_detected = False
+        self.silence_frames = 0
+        self.speech_frames = 0
+
+        # Initialize wake word detection only if enabled
+        if WAKE_WORD_ENABLED:
+            try:
+                logger.info("Initializing wake word model...")
+                self.wake_word_model = Model(vad_threshold=0.5)
+                self.last_prediction = None
+                logger.info("Wake word model initialized successfully")
+            except Exception as e:
+                logger.error(f"Failed to initialize wake word model: {e}")
+                raise
+        else:
+            self.wake_word_model = None
+            self.last_prediction = None
+            logger.info("Wake word detection disabled")
 
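The two feature flags are plain environment toggles, so switching the service into continuous-transcription mode needs no code change (an illustrative invocation; the token value is a placeholder):

```bash
ENABLE_WAKE_WORD=false ENABLE_SPEECH_FEATURES=true \
HASS_TOKEN=<long-lived-access-token> \
python -u wake_word_detector.py
```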
+    def should_transcribe(self):
+        """Determine if we should transcribe based on mode and timing"""
+        current_time = datetime.now().timestamp()
+        if not WAKE_WORD_ENABLED:
+            # Check if enough time has passed since last transcription
+            time_since_last = current_time - self.last_transcription_time
+            if time_since_last >= CONTINUOUS_TRANSCRIPTION_INTERVAL:
+                # Only transcribe if we detect speech
+                frames_per_chunk = CHUNK_SIZE
+                min_speech_frames = int(MIN_SPEECH_DURATION * SAMPLE_RATE / frames_per_chunk)
+
+                if self.speech_frames >= min_speech_frames:
+                    self.last_transcription_time = current_time
+                    self.speech_frames = 0 # Reset counter
+                    return True
+        return False
+
     def audio_callback(self, indata, frames, time, status):
         """Callback for audio input"""
         if status:
-            print(f"Audio callback status: {status}")
+            logger.warning(f"Audio callback status: {status}")
 
         # Convert to mono if necessary
         if CHANNELS > 1:
@@ -54,25 +283,45 @@ class AudioProcessor:
         else:
             audio_data = indata.flatten()
 
+        # Check for speech
+        if is_speech(audio_data):
+            self.speech_frames += 1
+            self.silence_frames = 0
+        else:
+            self.silence_frames += 1
+            frames_per_chunk = CHUNK_SIZE
+            silence_frames_threshold = int(SILENCE_DURATION * SAMPLE_RATE / frames_per_chunk)
+
+            if self.silence_frames >= silence_frames_threshold:
+                self.speech_frames = 0
+
         # Update circular buffer
         with self.buffer_lock:
             self.buffer = np.roll(self.buffer, -len(audio_data))
             self.buffer[-len(audio_data):] = audio_data
 
-        # Process for wake word detection
-        prediction = self.wake_word_model.predict(audio_data)
-
-        # Check if wake word detected
-        for wake_word in WAKE_WORDS:
-            if prediction[wake_word] > DETECTION_THRESHOLD:
-                print(f"Wake word detected: {wake_word} (confidence: {prediction[wake_word]:.2f})")
-                self.save_audio_segment(wake_word)
-                break
+        if WAKE_WORD_ENABLED:
+            # Process for wake word detection
+            self.last_prediction = self.wake_word_model.predict(audio_data)
+
+            # Check if wake word detected
+            for wake_word in WAKE_WORDS:
+                confidence = self.last_prediction[wake_word]
+                if confidence > DETECTION_THRESHOLD:
+                    logger.info(
+                        f"Wake word: {WAKE_WORD_ALIAS} (confidence: {confidence:.2f})"
+                    )
+                    self.process_audio()
+                    break
+        else:
+            # Continuous transcription mode
+            if self.should_transcribe():
+                self.process_audio()
 
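For the defaults above, the chunk arithmetic behind the two thresholds works out to:

```python
int(MIN_SPEECH_DURATION * SAMPLE_RATE / CHUNK_SIZE)  # int(2.0 * 16000 / 1024) = 31 chunks of speech
int(SILENCE_DURATION * SAMPLE_RATE / CHUNK_SIZE)     # int(1.0 * 16000 / 1024) = 15 chunks to reset
```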
-    def save_audio_segment(self, wake_word):
-        """Save the audio buffer when wake word is detected"""
+    def process_audio(self):
+        """Process the current audio buffer (save and transcribe)"""
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        filename = f"/audio/wake_word_{wake_word}_{timestamp}.wav"
+        filename = f"/audio/audio_segment_{timestamp}.wav"
 
         # Save the audio buffer to a WAV file
         with wave.open(filename, 'wb') as wf:
@@ -84,89 +333,83 @@ class AudioProcessor:
             audio_data = (self.buffer * 32767).astype(np.int16)
             wf.writeframes(audio_data.tobytes())
 
-        print(f"Saved audio segment to {filename}")
+        logger.info(f"Saved audio segment to {filename}")
 
-        # Transcribe the audio
+        # Transcribe the audio with German language preference
        try:
             segments, info = asr_model.transcribe(
                 filename,
-                language="en",
+                language="de", # Set German as preferred language
                 beam_size=5,
                 temperature=0
             )
 
-            # Format the transcription result
-            result = {
-                "text": " ".join(segment.text for segment in segments),
-                "segments": [
-                    {
-                        "text": segment.text,
-                        "start": segment.start,
-                        "end": segment.end,
-                        "confidence": segment.confidence
-                    }
-                    for segment in segments
-                ]
-            }
-
-            # Save metadata and transcription
-            metadata = {
-                "timestamp": timestamp,
-                "wake_word": wake_word,
-                "wake_word_confidence": float(prediction[wake_word]),
-                "sample_rate": SAMPLE_RATE,
-                "channels": CHANNELS,
-                "duration": BUFFER_DURATION,
-                "transcription": result
-            }
-
-            with open(f"{filename}.json", 'w') as f:
-                json.dump(metadata, f, indent=2)
-
-            print("\nTranscription result:")
-            print(f"Text: {result['text']}")
-            print("\nSegments:")
-            for segment in result["segments"]:
-                print(f"[{segment['start']:.2f}s - {segment['end']:.2f}s] ({segment['confidence']:.2%})")
-                print(f'"{segment["text"]}"')
+            # Get the full transcribed text
+            transcribed_text = " ".join(segment.text for segment in segments)
+            logger.info(f"Transcribed text: {transcribed_text}")
+
+            # Process the command
+            process_command(transcribed_text)
 
         except Exception as e:
-            print(f"Error during transcription: {e}")
-            metadata = {
-                "timestamp": timestamp,
-                "wake_word": wake_word,
-                "wake_word_confidence": float(prediction[wake_word]),
-                "sample_rate": SAMPLE_RATE,
-                "channels": CHANNELS,
-                "duration": BUFFER_DURATION,
-                "error": str(e)
-            }
-            with open(f"{filename}.json", 'w') as f:
-                json.dump(metadata, f, indent=2)
+            logger.error(f"Error during transcription or processing: {e}")
 
     def start(self):
         """Start audio processing"""
         try:
-            print("Initializing wake word detection...")
-            print(f"Loaded wake words: {', '.join(WAKE_WORDS)}")
+            logger.info("Starting audio processor...")
+
+            # Log configuration
+            logger.debug(f"Sample Rate: {SAMPLE_RATE}")
+            logger.debug(f"Channels: {CHANNELS}")
+            logger.debug(f"Chunk Size: {CHUNK_SIZE}")
+            logger.debug(f"Buffer Duration: {BUFFER_DURATION}")
+            logger.debug(f"Wake Word Enabled: {WAKE_WORD_ENABLED}")
+            logger.debug(f"Speech Enabled: {SPEECH_ENABLED}")
+            logger.debug(f"ASR Model: {os.environ.get('ASR_MODEL')}")
+
+            if WAKE_WORD_ENABLED:
+                logger.info("Initializing wake word detection...")
+                logger.info(f"Loaded wake words: {', '.join(WAKE_WORDS)}")
+            else:
+                logger.info("Starting continuous transcription mode...")
+                interval = CONTINUOUS_TRANSCRIPTION_INTERVAL
+                logger.info(f"Will transcribe every {interval} seconds")
+
+            try:
+                logger.debug("Setting up audio input stream...")
                 with sd.InputStream(
                     channels=CHANNELS,
                     samplerate=SAMPLE_RATE,
                     blocksize=CHUNK_SIZE,
                     callback=self.audio_callback
                 ):
-                print("\nWake word detection started. Listening...")
-                print("Press Ctrl+C to stop")
+                    logger.info("Audio input stream started successfully")
+                    logger.info("Listening for audio input...")
+                    logger.info("Press Ctrl+C to stop")
 
                     while True:
                         sd.sleep(1000) # Sleep for 1 second
 
-        except KeyboardInterrupt:
-            print("\nStopping wake word detection...")
-        except Exception as e:
-            print(f"Error in audio processing: {e}")
+            except sd.PortAudioError as e:
+                logger.error(f"Error setting up audio stream: {e}")
+                logger.error("Check if microphone is connected and accessible")
+                raise
+            except Exception as e:
+                logger.error(f"Unexpected error in audio stream: {e}")
+                raise
+
+        except KeyboardInterrupt:
+            logger.info("\nStopping audio processing...")
+        except Exception as e:
+            logger.error("Critical error in audio processing", exc_info=True)
+            raise
 
 if __name__ == "__main__":
+    try:
+        logger.info("Initializing AudioProcessor...")
         processor = AudioProcessor()
         processor.start()
+    except Exception as e:
+        logger.error("Failed to start AudioProcessor", exc_info=True)
+        raise