- Add comprehensive speech configuration in .env.example and app config
- Update Docker speech Dockerfile for more flexible model handling
- Create detailed README for speech-to-text examples
- Implement example script demonstrating speech features
- Improve speech service initialization and configuration management
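"""Wake word detection with speech-to-text transcription.

Listens on the default microphone, runs openwakeword on the incoming
audio, and, when a wake word is detected, saves the last BUFFER_DURATION
seconds of audio to /audio and transcribes them with faster-whisper.

Configuration comes from the environment; the values shown are the
defaults this script assumes, not a required layout:

    ASR_MODEL=base.en       # Whisper model size or local path
    ASR_MODEL_PATH=/models  # where ASR models are stored/downloaded
"""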
import os
import json
import queue
import threading
import wave
from datetime import datetime

import numpy as np
import sounddevice as sd
from faster_whisper import WhisperModel
from openwakeword import Model

# Configuration
SAMPLE_RATE = 16000
CHANNELS = 1
CHUNK_SIZE = 1024
BUFFER_DURATION = 30  # seconds of audio to keep in the rolling buffer
DETECTION_THRESHOLD = 0.5

# Wake word models to use
WAKE_WORDS = ["hey_jarvis", "ok_google", "alexa"]
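# Sizing note: the rolling buffer below holds SAMPLE_RATE * BUFFER_DURATION
# = 16000 * 30 = 480,000 float32 samples (~1.9 MB). openwakeword's docs
# suggest feeding it roughly 80 ms frames (1280 samples at 16 kHz);
# CHUNK_SIZE = 1024 is close, but adjust if detection quality suffers.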

# Initialize the ASR model
asr_model = WhisperModel(
    model_size_or_path=os.environ.get('ASR_MODEL', 'base.en'),
    device="cpu",
    compute_type="int8",
    download_root=os.environ.get('ASR_MODEL_PATH', '/models')
)
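# Note: if the requested model is not already present under ASR_MODEL_PATH,
# faster-whisper downloads it into download_root on first use, so the first
# run may take a while.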

class AudioProcessor:
    def __init__(self):
        # Initialize wake word detection. openwakeword loads models at
        # construction time via `wakeword_models` (there is no add_model()
        # method); this assumes the pre-trained models are available locally.
        self.wake_word_model = Model(
            wakeword_models=WAKE_WORDS,
            inference_framework="onnx"  # ONNX runtime for better CPU performance
        )

        self.audio_buffer = queue.Queue()
        self.recording = False
        # Rolling buffer holding the last BUFFER_DURATION seconds of audio
        self.buffer = np.zeros(SAMPLE_RATE * BUFFER_DURATION, dtype=np.float32)
        self.buffer_lock = threading.Lock()

    def audio_callback(self, indata, frames, time, status):
        """Callback for audio input"""
        if status:
            print(f"Audio callback status: {status}")

        # Convert to mono if necessary
        if CHANNELS > 1:
            audio_data = np.mean(indata, axis=1)
        else:
            audio_data = indata.flatten()

        # Update the circular buffer
        with self.buffer_lock:
            self.buffer = np.roll(self.buffer, -len(audio_data))
            self.buffer[-len(audio_data):] = audio_data

        # openwakeword expects 16-bit PCM frames, while sounddevice delivers
        # float32 in [-1, 1], so convert before prediction.
        frame = (audio_data * 32767).astype(np.int16)
        prediction = self.wake_word_model.predict(frame)

        # Check whether a wake word was detected
        for wake_word in WAKE_WORDS:
            confidence = prediction[wake_word]
            if confidence > DETECTION_THRESHOLD:
                print(f"Wake word detected: {wake_word} (confidence: {confidence:.2f})")
                # The confidence is passed along because `prediction` is local
                # to this callback and not visible in save_audio_segment.
                # Note: transcription runs synchronously here and will block
                # audio capture; offload to a worker thread if that matters.
                self.save_audio_segment(wake_word, confidence)
                break

    def save_audio_segment(self, wake_word, confidence):
        """Save the audio buffer when a wake word is detected"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"/audio/wake_word_{wake_word}_{timestamp}.wav"

        # Snapshot the buffer under the lock so the audio callback cannot
        # modify it mid-write, then convert float32 [-1, 1] to int16 PCM.
        with self.buffer_lock:
            audio_data = (self.buffer * 32767).astype(np.int16)

        # Save the audio buffer to a WAV file
        with wave.open(filename, 'wb') as wf:
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(2)  # 16-bit audio
            wf.setframerate(SAMPLE_RATE)
            wf.writeframes(audio_data.tobytes())

        print(f"Saved audio segment to {filename}")

        # Transcribe the audio
        try:
            segments, info = asr_model.transcribe(
                filename,
                language="en",
                beam_size=5,
                temperature=0
            )
            # faster-whisper returns a lazy generator; materialize it so the
            # segments can be iterated more than once below.
            segments = list(segments)

            # Format the transcription result. Segments expose avg_logprob
            # rather than a direct confidence attribute.
            result = {
                "text": " ".join(segment.text.strip() for segment in segments),
                "segments": [
                    {
                        "text": segment.text,
                        "start": segment.start,
                        "end": segment.end,
                        "avg_logprob": segment.avg_logprob
                    }
                    for segment in segments
                ]
            }

            # Save metadata and the transcription alongside the audio
            metadata = {
                "timestamp": timestamp,
                "wake_word": wake_word,
                "wake_word_confidence": float(confidence),
                "sample_rate": SAMPLE_RATE,
                "channels": CHANNELS,
                "duration": BUFFER_DURATION,
                "transcription": result
            }

            with open(f"{filename}.json", 'w') as f:
                json.dump(metadata, f, indent=2)

            print("\nTranscription result:")
            print(f"Text: {result['text']}")
            print("\nSegments:")
            for segment in result["segments"]:
                print(f"[{segment['start']:.2f}s - {segment['end']:.2f}s] "
                      f"(avg logprob: {segment['avg_logprob']:.2f})")
                print(f'"{segment["text"]}"')

        except Exception as e:
            print(f"Error during transcription: {e}")
            metadata = {
                "timestamp": timestamp,
                "wake_word": wake_word,
                "wake_word_confidence": float(confidence),
                "sample_rate": SAMPLE_RATE,
                "channels": CHANNELS,
                "duration": BUFFER_DURATION,
                "error": str(e)
            }
            with open(f"{filename}.json", 'w') as f:
                json.dump(metadata, f, indent=2)
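
    # For reference, a successful run writes <wav>.json shaped roughly like
    # the following (values illustrative, not actual output):
    #
    # {
    #   "timestamp": "20240101_120000",
    #   "wake_word": "hey_jarvis",
    #   "wake_word_confidence": 0.91,
    #   "sample_rate": 16000,
    #   "channels": 1,
    #   "duration": 30,
    #   "transcription": {"text": "...", "segments": [...]}
    # }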

    def start(self):
        """Start audio processing"""
        try:
            print("Initializing wake word detection...")
            print(f"Loaded wake words: {', '.join(WAKE_WORDS)}")

            with sd.InputStream(
                channels=CHANNELS,
                samplerate=SAMPLE_RATE,
                blocksize=CHUNK_SIZE,
                callback=self.audio_callback
            ):
                print("\nWake word detection started. Listening...")
                print("Press Ctrl+C to stop")

                while True:
                    sd.sleep(1000)  # Sleep for 1 second

        except KeyboardInterrupt:
            print("\nStopping wake word detection...")
        except Exception as e:
            print(f"Error in audio processing: {e}")

if __name__ == "__main__":
    processor = AudioProcessor()
    processor.start()