-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnovastt.py
More file actions
195 lines (166 loc) · 7.33 KB
/
novastt.py
File metadata and controls
195 lines (166 loc) · 7.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#!/usr/bin/env python3
import sounddevice as sd
import numpy as np
import io
import wave
import os
import time
import threading
import collections
from groq import Groq
import config
class SpeechToText:
    """Automatic voice-activity-detected speech-to-text.

    Monitors the microphone in a background sounddevice stream, detects
    speech via RMS-energy thresholds adapted to ambient noise, records with
    a short pre-buffer so word onsets are not clipped, and sends captured
    audio to the Groq Whisper API for transcription.

    The latest transcription is published on ``self.transcribed_text``.
    """

    def __init__(self, on_record_start=None):
        """
        Args:
            on_record_start: optional zero-argument callback invoked once,
                the moment voice is first detected and recording begins.
        """
        self.client = self._initialize_groq_client()
        self.on_record_start = on_record_start
        self.transcribed_text = None   # most recent transcription (str) or None
        self.is_running = True
        self.stream = None             # sounddevice.InputStream, set by start_listener()

        # Audio settings - optimized for low latency.
        self.samplerate = config.MIC_SAMPLE_RATE
        self.channels = config.MIC_CHANNELS
        self.threshold = config.STT_ENERGY_THRESHOLD
        self.silence_limit = config.STT_SILENCE_DURATION

        # 300 ms rolling pre-buffer of raw (flattened) samples so the start
        # of an utterance is included once voice is detected.
        self.prebuffer = collections.deque(maxlen=int(self.samplerate * 0.3))
        self.audio_data = []           # list of ndarray chunks for the active recording
        self.is_recording = False
        self.silence_start = None      # wall-clock time silence began, or None
        self.ambient_noise_level = 0
        self.noise_samples = []
        self.max_noise_samples = 30    # chunks sampled before adapting the threshold

    def _initialize_groq_client(self):
        """Create the Groq API client; returns None if initialization fails."""
        try:
            return Groq(api_key=config.GROQ_API_KEY)
        except Exception as e:
            print(f"❌ Groq Init Error: {e}")
            return None

    def _get_energy(self, audio_chunk):
        """Calculates the RMS energy of an audio chunk."""
        return np.sqrt(np.mean(audio_chunk**2))

    def adaptive_vad_threshold(self, ambient_noise_level):
        """Adaptive VAD threshold based on ambient noise.

        NOTE(review): sounddevice streams deliver float32 samples in [-1, 1]
        by default (and _process_recording scales by 32767, confirming float
        input), so an RMS floor of 200 looks unreachable — verify the intended
        stream dtype / threshold units against config.
        """
        return max(200, ambient_noise_level * 1.5)

    def _audio_callback(self, indata, frames, time_info, status):
        """sounddevice callback: pre-buffer, adapt threshold, gate recording."""
        if status:
            print(f"⚠️ Audio Status: {status}")

        # Always feed the rolling pre-buffer (flattened samples).
        self.prebuffer.extend(indata.copy().flatten())

        # Sample ambient noise while idle until enough chunks are collected,
        # then lock in an adaptive threshold.
        if not self.is_recording and len(self.noise_samples) < self.max_noise_samples:
            self.noise_samples.append(self._get_energy(indata))
            if len(self.noise_samples) == self.max_noise_samples:
                self.ambient_noise_level = np.mean(self.noise_samples)
                self.threshold = self.adaptive_vad_threshold(self.ambient_noise_level)
                print(f"🎚️ Adaptive threshold set: {self.threshold:.1f}")

        energy = self._get_energy(indata)
        adaptive_threshold = (
            self.adaptive_vad_threshold(self.ambient_noise_level)
            if self.ambient_noise_level > 0
            else self.threshold
        )

        if energy > adaptive_threshold:
            if not self.is_recording:
                print("🎙️ Voice Detected. Recording...")
                self.is_recording = True
                if self.on_record_start:
                    self.on_record_start()
                # Seed the recording with pre-buffered audio. Trim the flat
                # sample buffer to a multiple of the channel count so the
                # reshape into frames cannot raise ValueError.
                if self.prebuffer:
                    pre = np.array(self.prebuffer)
                    usable = len(pre) - (len(pre) % self.channels)
                    if usable:
                        self.audio_data = [pre[:usable].reshape(-1, self.channels)]
            self.audio_data.append(indata.copy())
            self.silence_start = None
        elif self.is_recording:
            # Keep recording through brief silence to avoid clipping words.
            self.audio_data.append(indata.copy())
            if self.silence_start is None:
                self.silence_start = time.time()
            elif time.time() - self.silence_start > self.silence_limit:
                print("⏹️ Silence Detected. Processing...")
                self.is_recording = False
                self._process_recording()

    def _process_recording(self):
        """Converts the recorded buffer to in-memory WAV and dispatches transcription."""
        if not self.audio_data:
            return
        recording = np.concatenate(self.audio_data)
        self.audio_data = []  # Reset buffer

        # Convert float [-1, 1] samples to 16-bit PCM.
        recording_int = (recording * 32767).astype(np.int16)

        buffer = io.BytesIO()
        with wave.open(buffer, 'wb') as wf:
            wf.setnchannels(self.channels)
            wf.setsampwidth(2)  # 16-bit
            wf.setframerate(self.samplerate)
            wf.writeframes(recording_int.tobytes())
        buffer.seek(0)

        # Transcribe off the audio callback thread so capture never stalls.
        threading.Thread(target=self._transcribe, args=(buffer,), daemon=True).start()

    def _transcribe(self, audio_buffer):
        """Send a WAV buffer to the Groq Whisper API; stores result on success."""
        if self.client is None:
            return
        try:
            transcription = self.client.audio.transcriptions.create(
                file=("speech.wav", audio_buffer),
                model="whisper-large-v3-turbo",
                response_format="json",
                language="en",
                temperature=0.0
            )
            if hasattr(transcription, 'text') and transcription.text:
                text = transcription.text.strip()
                if text:
                    print(f"📝 Transcribed: \"{text}\"")
                    self.transcribed_text = text
        except Exception as e:
            print(f"❌ Transcription Error: {e}")

    def start_listener(self):
        """Starts the non-blocking background audio monitor.

        Tries the configured sample rate first; on failure falls back to
        44100 Hz (a rate virtually every device supports).
        """
        if not self.client:
            return
        print(f"✅ STT Listener Active (Requested Rate: {self.samplerate})")
        try:
            # 25 ms blocks keep detection latency minimal.
            self.stream = sd.InputStream(
                samplerate=self.samplerate,
                channels=self.channels,
                callback=self._audio_callback,
                blocksize=int(self.samplerate * 0.025)
            )
            self.stream.start()
        except Exception as e:
            print(f"⚠️ Initial samplerate {self.samplerate} failed: {e}. Trying 44100...")
            try:
                self.samplerate = 44100
                self.stream = sd.InputStream(
                    samplerate=self.samplerate,
                    channels=self.channels,
                    callback=self._audio_callback,
                    blocksize=int(self.samplerate * 0.025)
                )
                self.stream.start()
                print(f"✅ STT Listener Active (Fallback Rate: {self.samplerate})")
            except Exception as e2:
                print(f"❌ STT Listener failed completely: {e2}")
                self.stream = None

    def stop_listener(self):
        """Stops the audio stream and marks the listener as not running.

        Fix: the original defined stop_listener twice; the second definition
        shadowed the first, so is_running was never cleared. Merged here.
        """
        self.is_running = False
        if self.stream is not None:
            self.stream.stop()
            self.stream.close()
            self.stream = None
            print("🛑 STT Listener Deactivated.")
if __name__ == "__main__":
    # Manual smoke test: listen until Ctrl+C, then shut the stream down.
    stt = SpeechToText()
    stt.start_listener()
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        print("\nStopping...")
    finally:
        # Fix: stop_listener() was never called, leaking the open audio stream.
        stt.stop_listener()