-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnovastt.py
More file actions
195 lines (166 loc) · 7.33 KB
/
novastt.py
File metadata and controls
195 lines (166 loc) · 7.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#!/usr/bin/env python3
import sounddevice as sd
import numpy as np
import io
import wave
import os
import time
import threading
import collections
from groq import Groq
import config
class SpeechToText:
    """Automatic voice-activity-detected speech-to-text.

    Monitors the microphone in a background sounddevice stream, detects
    speech via RMS-energy thresholds adapted to ambient noise, records with
    a short pre-buffer so word onsets are not clipped, and sends captured
    audio to the Groq Whisper API for transcription.

    The latest transcription is published on ``self.transcribed_text``.
    """

    def __init__(self, on_record_start=None):
        """
        Args:
            on_record_start: optional zero-argument callback invoked once,
                the moment voice is first detected and recording begins.
        """
        self.client = self._initialize_groq_client()
        self.on_record_start = on_record_start
        self.transcribed_text = None   # most recent transcription (str) or None
        self.is_running = True
        self.stream = None             # sounddevice.InputStream, set by start_listener()

        # Audio settings - optimized for low latency.
        self.samplerate = config.MIC_SAMPLE_RATE
        self.channels = config.MIC_CHANNELS
        self.threshold = config.STT_ENERGY_THRESHOLD
        self.silence_limit = config.STT_SILENCE_DURATION

        # 300 ms rolling pre-buffer of raw (flattened) samples so the start
        # of an utterance is included once voice is detected.
        self.prebuffer = collections.deque(maxlen=int(self.samplerate * 0.3))
        self.audio_data = []           # list of ndarray chunks for the active recording
        self.is_recording = False
        self.silence_start = None      # wall-clock time silence began, or None
        self.ambient_noise_level = 0
        self.noise_samples = []
        self.max_noise_samples = 30    # chunks sampled before adapting the threshold

    def _initialize_groq_client(self):
        """Create the Groq API client; returns None if initialization fails."""
        try:
            return Groq(api_key=config.GROQ_API_KEY)
        except Exception as e:
            print(f"❌ Groq Init Error: {e}")
            return None

    def _get_energy(self, audio_chunk):
        """Calculates the RMS energy of an audio chunk."""
        return np.sqrt(np.mean(audio_chunk**2))

    def adaptive_vad_threshold(self, ambient_noise_level):
        """Adaptive VAD threshold based on ambient noise.

        NOTE(review): sounddevice streams deliver float32 samples in [-1, 1]
        by default (and _process_recording scales by 32767, confirming float
        input), so an RMS floor of 200 looks unreachable — verify the intended
        stream dtype / threshold units against config.
        """
        return max(200, ambient_noise_level * 1.5)

    def _audio_callback(self, indata, frames, time_info, status):
        """sounddevice callback: pre-buffer, adapt threshold, gate recording."""
        if status:
            print(f"⚠️ Audio Status: {status}")

        # Always feed the rolling pre-buffer (flattened samples).
        self.prebuffer.extend(indata.copy().flatten())

        # Sample ambient noise while idle until enough chunks are collected,
        # then lock in an adaptive threshold.
        if not self.is_recording and len(self.noise_samples) < self.max_noise_samples:
            self.noise_samples.append(self._get_energy(indata))
            if len(self.noise_samples) == self.max_noise_samples:
                self.ambient_noise_level = np.mean(self.noise_samples)
                self.threshold = self.adaptive_vad_threshold(self.ambient_noise_level)
                print(f"🎚️ Adaptive threshold set: {self.threshold:.1f}")

        energy = self._get_energy(indata)
        adaptive_threshold = (
            self.adaptive_vad_threshold(self.ambient_noise_level)
            if self.ambient_noise_level > 0
            else self.threshold
        )

        if energy > adaptive_threshold:
            if not self.is_recording:
                print("🎙️ Voice Detected. Recording...")
                self.is_recording = True
                if self.on_record_start:
                    self.on_record_start()
                # Seed the recording with pre-buffered audio. Trim the flat
                # sample buffer to a multiple of the channel count so the
                # reshape into frames cannot raise ValueError.
                if self.prebuffer:
                    pre = np.array(self.prebuffer)
                    usable = len(pre) - (len(pre) % self.channels)
                    if usable:
                        self.audio_data = [pre[:usable].reshape(-1, self.channels)]
            self.audio_data.append(indata.copy())
            self.silence_start = None
        elif self.is_recording:
            # Keep recording through brief silence to avoid clipping words.
            self.audio_data.append(indata.copy())
            if self.silence_start is None:
                self.silence_start = time.time()
            elif time.time() - self.silence_start > self.silence_limit:
                print("⏹️ Silence Detected. Processing...")
                self.is_recording = False
                self._process_recording()

    def _process_recording(self):
        """Converts the recorded buffer to in-memory WAV and dispatches transcription."""
        if not self.audio_data:
            return
        recording = np.concatenate(self.audio_data)
        self.audio_data = []  # Reset buffer

        # Convert float [-1, 1] samples to 16-bit PCM.
        recording_int = (recording * 32767).astype(np.int16)

        buffer = io.BytesIO()
        with wave.open(buffer, 'wb') as wf:
            wf.setnchannels(self.channels)
            wf.setsampwidth(2)  # 16-bit
            wf.setframerate(self.samplerate)
            wf.writeframes(recording_int.tobytes())
        buffer.seek(0)

        # Transcribe off the audio callback thread so capture never stalls.
        threading.Thread(target=self._transcribe, args=(buffer,), daemon=True).start()

    def _transcribe(self, audio_buffer):
        """Send a WAV buffer to the Groq Whisper API; stores result on success."""
        if self.client is None:
            return
        try:
            transcription = self.client.audio.transcriptions.create(
                file=("speech.wav", audio_buffer),
                model="whisper-large-v3-turbo",
                response_format="json",
                language="en",
                temperature=0.0
            )
            if hasattr(transcription, 'text') and transcription.text:
                text = transcription.text.strip()
                if text:
                    print(f"📝 Transcribed: \"{text}\"")
                    self.transcribed_text = text
        except Exception as e:
            print(f"❌ Transcription Error: {e}")

    def start_listener(self):
        """Starts the non-blocking background audio monitor.

        Tries the configured sample rate first; on failure falls back to
        44100 Hz (a rate virtually every device supports).
        """
        if not self.client:
            return
        print(f"✅ STT Listener Active (Requested Rate: {self.samplerate})")
        try:
            # 25 ms blocks keep detection latency minimal.
            self.stream = sd.InputStream(
                samplerate=self.samplerate,
                channels=self.channels,
                callback=self._audio_callback,
                blocksize=int(self.samplerate * 0.025)
            )
            self.stream.start()
        except Exception as e:
            print(f"⚠️ Initial samplerate {self.samplerate} failed: {e}. Trying 44100...")
            try:
                self.samplerate = 44100
                self.stream = sd.InputStream(
                    samplerate=self.samplerate,
                    channels=self.channels,
                    callback=self._audio_callback,
                    blocksize=int(self.samplerate * 0.025)
                )
                self.stream.start()
                print(f"✅ STT Listener Active (Fallback Rate: {self.samplerate})")
            except Exception as e2:
                print(f"❌ STT Listener failed completely: {e2}")
                self.stream = None

    def stop_listener(self):
        """Stops the audio stream and marks the listener as not running.

        Fix: the original defined stop_listener twice; the second definition
        shadowed the first, so is_running was never cleared. Merged here.
        """
        self.is_running = False
        if self.stream is not None:
            self.stream.stop()
            self.stream.close()
            self.stream = None
            print("🛑 STT Listener Deactivated.")
if __name__ == "__main__":
    # Manual smoke test: listen until Ctrl+C, then shut the stream down.
    stt = SpeechToText()
    stt.start_listener()
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        print("\nStopping...")
    finally:
        # Fix: stop_listener() was never called, leaking the open audio stream.
        stt.stop_listener()