feat: Enhance speech and AI configuration with advanced environment settings

- Update `.env.example` with comprehensive speech and AI configuration options
- Modify Docker Compose speech configuration for more flexible audio and ASR settings
- Enhance Dockerfile to support Python virtual environment and speech dependencies
- Refactor environment loading to use Bun's file system utilities
- Improve device listing tool with more detailed device statistics
- Add support for multiple AI models and dynamic configuration
This commit is contained in:
jango-blockchained
2025-02-10 03:28:58 +01:00
parent 986b1949cd
commit b6bd53b01a
10 changed files with 764 additions and 283 deletions

35
docker/speech/asound.conf Normal file
View File

@@ -0,0 +1,35 @@
pcm.!default {
type pulse
fallback "sysdefault"
hint {
show on
description "Default ALSA Output (currently PulseAudio Sound Server)"
}
}
ctl.!default {
type pulse
fallback "sysdefault"
}
# Named PulseAudio PCM and control devices (addressable explicitly as "pulse")
pcm.pulse {
type pulse
}
ctl.pulse {
type pulse
}
# Explicit device for recording: named PCM "microphone" with direct hardware access to card 0, device 0
pcm.microphone {
type hw
card 0
device 0
}
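# Redefinition of the default PCM: the "!" prefix makes this block replace the
# earlier pcm.!default definition (its fallback and hint settings no longer apply).
# NOTE: pcm.!default is not capture-specific; it is used for playback as well.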
pcm.!default {
type pulse
hint.description "Default Audio Device"
}

View File

@@ -30,6 +30,9 @@ MAX_MODEL_LOAD_RETRIES = 3
MODEL_LOAD_RETRY_DELAY = 5 # seconds
MODEL_DOWNLOAD_TIMEOUT = 600 # 10 minutes timeout for model download
# ALSA device configuration
AUDIO_DEVICE = 'hw:0,0' # Use ALSA hardware device directly
# Audio processing parameters
NOISE_THRESHOLD = 0.08 # Increased threshold for better noise filtering
MIN_SPEECH_DURATION = 2.0 # Longer minimum duration to avoid fragments
@@ -44,7 +47,7 @@ WAKE_WORD_ENABLED = os.environ.get('ENABLE_WAKE_WORD', 'false').lower() == 'true
SPEECH_ENABLED = os.environ.get('ENABLE_SPEECH_FEATURES', 'true').lower() == 'true'
# Wake word models to use (only if wake word is enabled)
WAKE_WORDS = ["alexa"] # Using 'alexa' as temporary replacement for 'gaja'
WAKE_WORDS = ["hey_jarvis"] # Using hey_jarvis as it's more similar to "hey gaja"
WAKE_WORD_ALIAS = "gaja" # What we print when wake word is detected
# Home Assistant Configuration
@@ -235,7 +238,22 @@ class AudioProcessor:
self.buffer = np.zeros(SAMPLE_RATE * BUFFER_DURATION)
self.buffer_lock = threading.Lock()
self.last_transcription_time = 0
self.stream = None
try:
logger.info(f"Opening audio device: {AUDIO_DEVICE}")
self.stream = sd.InputStream(
device=AUDIO_DEVICE,
samplerate=SAMPLE_RATE,
channels=CHANNELS,
dtype=np.int16,
blocksize=CHUNK_SIZE,
callback=self._audio_callback
)
logger.info("Audio stream initialized successfully")
except Exception as e:
logger.error(f"Failed to initialize audio stream: {e}")
raise
self.speech_detected = False
self.silence_frames = 0
self.speech_frames = 0
@@ -272,7 +290,7 @@ class AudioProcessor:
return True
return False
def audio_callback(self, indata, frames, time, status):
def _audio_callback(self, indata, frames, time, status):
"""Callback for audio input"""
if status:
logger.warning(f"Audio callback status: {status}")
@@ -382,7 +400,7 @@ class AudioProcessor:
channels=CHANNELS,
samplerate=SAMPLE_RATE,
blocksize=CHUNK_SIZE,
callback=self.audio_callback
callback=self._audio_callback
):
logger.info("Audio input stream started successfully")
logger.info("Listening for audio input...")