Skip to content

Commit 60fcdcb

Browse files
pytorchbot and mattjcly
authored
Fix Voxtral Realtime runner flush (#18387)
### Summary `VoxtralRealtimeRunner` was outputting excessive duplicate tokens/gibberish on stream flush. In an audio file where I say "The weather is clear today", run like ``` voxtral_realtime_runner \ --model_path model.pte \ --tokenizer_path tekken.json \ --preprocessor_path preprocessor.pte \ --streaming \ --audio_path audio.wav ``` I would get output: `The weather is clear todayoday.</s>` I also experienced this with periods and in many other circumstances with repeating tokens at the end of the stream. Upon investigation into vLLM (Mistral's recommended inference runner for [Voxtral Realtime](https://huggingface.co/mistralai/Voxtral-Mini-4B-Realtime-2602#vllm-recommended)), I observed that vLLM finishes the stream by closing the streaming input and draining model-defined right-padding audio, whereas ExecuTorch `flush()` finished by switching into post-audio text-only decoding after audio ended. See vLLM ref: https://github.com/vllm-project/vllm/blob/2f9f946/vllm/model_executor/models/voxtral_realtime.py#L239-L270. Therefore, apply similar logic here by converting the model-defined transcription delay into a finite number of trailing silent streaming steps to properly conclude the stream. After this, the same command outputs: ``` The weather is clear today. ``` As I would expect. No `</s>` because, like vLLM, the stream ends by naturally draining the padded audio tail and letting the model emit whatever final delayed text it wants. ### Test plan Tested with the above example and a few other audio files to observe the behavior improvement and the absence of gibberish/incorrect end of stream. Co-authored-by: Matt Clayton <156335168+mattjcly@users.noreply.github.com>
1 parent fa605be commit 60fcdcb

6 files changed

Lines changed: 95 additions & 72 deletions

File tree

examples/models/voxtral_realtime/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,7 @@ Ctrl+C stops recording and flushes remaining text.
269269
| `--preprocessor_path` | (none) | Path to mel preprocessor `.pte` |
270270
| `--audio_path` | (none) | Path to 16kHz mono WAV file |
271271
| `--temperature` | `0.0` | Sampling temperature (0 = greedy) |
272-
| `--max_new_tokens` | `500` | Maximum tokens to generate |
272+
| `--offline_max_new_tokens` | `500` | Offline-only: maximum extra tokens after audio embeddings are exhausted |
273273
| `--streaming` | off | Use streaming transcription (from WAV file) |
274274
| `--mic` | off | Live microphone mode (reads raw f32le PCM from stdin) |
275275
| `--mic_chunk_ms` | `80` | Mic read chunk size in ms (multiples of 80 recommended) |

examples/models/voxtral_realtime/export_voxtral_rt.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -600,6 +600,8 @@ def main():
600600
else:
601601
programs, metadata = export_all(model, args.max_seq_len, **quant_args)
602602

603+
metadata["delay_tokens"] = args.delay_tokens
604+
603605
# Lower
604606
et = lower_to_executorch(programs, metadata, backend=args.backend)
605607

examples/models/voxtral_realtime/main.cpp

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,11 @@ DEFINE_string(tokenizer_path, "tekken.json", "Path to Tekken tokenizer file.");
4949
DEFINE_string(preprocessor_path, "", "Path to mel preprocessor (.pte).");
5050
DEFINE_string(audio_path, "", "Path to input audio file (.wav).");
5151
DEFINE_double(temperature, 0.0, "Sampling temperature (0 = greedy).");
52-
DEFINE_int32(max_new_tokens, 500, "Maximum number of tokens to generate.");
52+
DEFINE_int32(
53+
offline_max_new_tokens,
54+
500,
55+
"Offline-only: maximum extra tokens to generate after audio embeddings "
56+
"are exhausted.");
5357
DEFINE_bool(streaming, false, "Use streaming transcription mode.");
5458
DEFINE_bool(
5559
mic,
@@ -74,6 +78,12 @@ volatile sig_atomic_t g_interrupted = 0;
7478
void sigint_handler(int) {
7579
g_interrupted = 1;
7680
}
81+
82+
voxtral_realtime::StreamingTranscribeConfig make_streaming_config() {
83+
voxtral_realtime::StreamingTranscribeConfig config;
84+
config.temperature = static_cast<float>(FLAGS_temperature);
85+
return config;
86+
}
7787
} // namespace
7888

7989
int main(int argc, char** argv) {
@@ -89,6 +99,16 @@ int main(int argc, char** argv) {
8999
return 1;
90100
}
91101

102+
if ((FLAGS_streaming || FLAGS_mic) &&
103+
!gflags::GetCommandLineFlagInfoOrDie("offline_max_new_tokens")
104+
.is_default) {
105+
ET_LOG(
106+
Error,
107+
"--offline_max_new_tokens only applies to offline transcription. "
108+
"Streaming mode drains until EOS or the padded audio tail is exhausted.");
109+
return 1;
110+
}
111+
92112
if (FLAGS_preprocessor_path.empty()) {
93113
ET_LOG(Error, "preprocessor_path flag must be provided.");
94114
return 1;
@@ -109,10 +129,6 @@ int main(int argc, char** argv) {
109129
stats.model_load_end_ms = ::executorch::extension::llm::time_in_ms();
110130
stats.inference_start_ms = ::executorch::extension::llm::time_in_ms();
111131

112-
voxtral_realtime::TranscribeConfig config;
113-
config.temperature = static_cast<float>(FLAGS_temperature);
114-
config.max_new_tokens = FLAGS_max_new_tokens;
115-
116132
stats.num_prompt_tokens = 0;
117133
bool first_token = true;
118134

@@ -156,7 +172,8 @@ int main(int argc, char** argv) {
156172
ET_CHECK_MSG(
157173
runner.is_streaming(),
158174
"Model was not exported with --streaming. Re-export with --streaming flag.");
159-
auto session = runner.create_streaming_session(config, token_cb);
175+
auto session =
176+
runner.create_streaming_session(make_streaming_config(), token_cb);
160177

161178
// Drain any audio that buffered in stdin during model loading/warmup.
162179
// Without this, piped audio (e.g., from ffmpeg) accumulates while the
@@ -207,7 +224,8 @@ int main(int argc, char** argv) {
207224
ET_LOG(Info, "Loading audio from: %s", FLAGS_audio_path.c_str());
208225
auto audio_data =
209226
::executorch::extension::llm::load_wav_audio_data(FLAGS_audio_path);
210-
auto session = runner.create_streaming_session(config, token_cb);
227+
auto session =
228+
runner.create_streaming_session(make_streaming_config(), token_cb);
211229

212230
const int64_t chunk_size = 1280;
213231
for (int64_t offset = 0; offset < static_cast<int64_t>(audio_data.size());
@@ -221,10 +239,13 @@ int main(int argc, char** argv) {
221239
ET_LOG(Info, "Loading audio from: %s", FLAGS_audio_path.c_str());
222240
auto audio_data =
223241
::executorch::extension::llm::load_wav_audio_data(FLAGS_audio_path);
242+
voxtral_realtime::OfflineTranscribeConfig offline_config;
243+
offline_config.temperature = static_cast<float>(FLAGS_temperature);
244+
offline_config.max_new_tokens = FLAGS_offline_max_new_tokens;
224245
num_generated = runner.transcribe(
225246
audio_data.data(),
226247
static_cast<int64_t>(audio_data.size()),
227-
config,
248+
offline_config,
228249
token_cb);
229250
}
230251

examples/models/voxtral_realtime/model.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -248,8 +248,9 @@ runner (`StreamingSession::decode_step`) then:
248248
3. Feeds the combined embedding to `text_decoder` at the current position
249249
4. Samples one token from the output logits
250250

251-
After audio ends, `flush()` continues text-only decoding (token
252-
embedding only, no audio) until EOS or max tokens.
251+
After audio ends, `flush()` pads the unfinished tail with silence and keeps
252+
running the same audio-conditioned streaming path until the final partial
253+
step and transcription delay are drained.
253254

254255
### Conv state management
255256

examples/models/voxtral_realtime/voxtral_realtime_runner.cpp

Lines changed: 43 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,10 @@ VoxtralRealtimeRunner::VoxtralRealtimeRunner(
124124
if (msf.ok())
125125
mel_skip_frames_ = msf.get()[0].toInt();
126126

127+
auto dt = model_->execute("delay_tokens", empty);
128+
if (dt.ok())
129+
delay_tokens_ = dt.get()[0].toInt();
130+
127131
ET_LOG(
128132
Info,
129133
"Streaming: chunk_mel=%ld, max_enc=%ld, enc_dim=%ld",
@@ -158,8 +162,7 @@ VoxtralRealtimeRunner::VoxtralRealtimeRunner(
158162
std::vector<float> dummy_audio(static_cast<size_t>(step_samples_), 0.0f);
159163

160164
if (is_streaming_) {
161-
TranscribeConfig warmup_config;
162-
warmup_config.max_new_tokens = 1;
165+
StreamingTranscribeConfig warmup_config;
163166
auto session =
164167
create_streaming_session(warmup_config, [](const std::string&) {});
165168
session->feed_audio(dummy_audio.data(), step_samples_);
@@ -291,7 +294,7 @@ TensorPtr VoxtralRealtimeRunner::convert_to_model_dtype(TensorPtr tensor) {
291294
int VoxtralRealtimeRunner::transcribe(
292295
const float* audio_data,
293296
int64_t num_samples,
294-
const TranscribeConfig& config,
297+
const OfflineTranscribeConfig& config,
295298
TokenCallback token_cb) {
296299
// --- Step 1: Preprocess raw audio to mel spectrogram ---
297300
ET_CHECK_MSG(preprocessor_ != nullptr, "No preprocessor provided.");
@@ -423,7 +426,7 @@ int VoxtralRealtimeRunner::transcribe(
423426

424427
std::unique_ptr<StreamingSession>
425428
VoxtralRealtimeRunner::create_streaming_session(
426-
const TranscribeConfig& config,
429+
const StreamingTranscribeConfig& config,
427430
TokenCallback token_cb) {
428431
ET_CHECK_MSG(is_streaming_, "Model was not exported with --streaming.");
429432
ET_CHECK_MSG(
@@ -434,10 +437,9 @@ VoxtralRealtimeRunner::create_streaming_session(
434437

435438
StreamingSession::StreamingSession(
436439
VoxtralRealtimeRunner& runner,
437-
TranscribeConfig config,
440+
StreamingTranscribeConfig config,
438441
TokenCallback token_cb)
439442
: runner_(runner),
440-
config_(config),
441443
token_cb_(std::move(token_cb)),
442444
prev_token_(runner.bos_id_),
443445
sampler_(
@@ -452,9 +454,9 @@ StreamingSession::StreamingSession(
452454
int StreamingSession::feed_audio(const float* data, int64_t num_samples) {
453455
audio_buf_.insert(audio_buf_.end(), data, data + num_samples);
454456

455-
int new_tokens = 0;
457+
const int generated_before = num_generated_;
456458
while (!eos_reached_ && try_process_step()) {
457-
new_tokens++;
459+
// num_generated_ is updated inside try_process_step()
458460
}
459461

460462
// Trim consumed audio to bound memory growth. Keep stft_left_overlap_
@@ -467,7 +469,7 @@ int StreamingSession::feed_audio(const float* data, int64_t num_samples) {
467469
samples_consumed_ -= keep_from;
468470
}
469471

470-
return new_tokens;
472+
return num_generated_ - generated_before;
471473
}
472474

473475
bool StreamingSession::try_process_step() {
@@ -586,10 +588,10 @@ bool StreamingSession::try_process_step() {
586588
samples_consumed_ += step;
587589

588590
// --- Decode one step ---
589-
return decode_step(&audio_embeds_ptr);
591+
return decode_step(audio_embeds_ptr);
590592
}
591593

592-
bool StreamingSession::decode_step(const TensorPtr* audio_embeds_tensor) {
594+
bool StreamingSession::decode_step(const TensorPtr& audio_embeds_tensor) {
593595
const int64_t dim = runner_.dim_;
594596
const auto model_dtype = runner_.model_dtype_;
595597

@@ -603,33 +605,25 @@ bool StreamingSession::decode_step(const TensorPtr* audio_embeds_tensor) {
603605
ET_CHECK_MSG(tok_result.ok(), "token_embedding failed.");
604606
auto tok_embed = tok_result.get()[0].toTensor();
605607

606-
// Sum audio + token embeddings (or token-only if no audio).
608+
// Sum audio + token embeddings.
607609
// Reuses pre-allocated input_embeds_ buffer (no per-token allocation).
608-
if (audio_embeds_tensor != nullptr) {
609-
auto& audio_embeds = **audio_embeds_tensor;
610-
if (model_dtype == ::executorch::aten::ScalarType::BFloat16) {
611-
auto* out =
612-
input_embeds_->mutable_data_ptr<::executorch::aten::BFloat16>();
613-
const auto* af =
614-
audio_embeds.const_data_ptr<::executorch::aten::BFloat16>();
615-
const auto* tf = tok_embed.const_data_ptr<::executorch::aten::BFloat16>();
616-
for (int64_t i = 0; i < dim; i++) {
617-
out[i] = ::executorch::aten::BFloat16(
618-
static_cast<float>(af[i]) + static_cast<float>(tf[i]));
619-
}
620-
} else {
621-
auto* out = input_embeds_->mutable_data_ptr<float>();
622-
const auto* af = audio_embeds.const_data_ptr<float>();
623-
const auto* tf = tok_embed.const_data_ptr<float>();
624-
for (int64_t i = 0; i < dim; i++) {
625-
out[i] = af[i] + tf[i];
626-
}
610+
auto& audio_embeds = *audio_embeds_tensor;
611+
if (model_dtype == ::executorch::aten::ScalarType::BFloat16) {
612+
auto* out = input_embeds_->mutable_data_ptr<::executorch::aten::BFloat16>();
613+
const auto* af =
614+
audio_embeds.const_data_ptr<::executorch::aten::BFloat16>();
615+
const auto* tf = tok_embed.const_data_ptr<::executorch::aten::BFloat16>();
616+
for (int64_t i = 0; i < dim; i++) {
617+
out[i] = ::executorch::aten::BFloat16(
618+
static_cast<float>(af[i]) + static_cast<float>(tf[i]));
627619
}
628620
} else {
629-
std::memcpy(
630-
input_embeds_->mutable_data_ptr(),
631-
tok_embed.const_data_ptr(),
632-
static_cast<size_t>(dim) * input_embeds_->element_size());
621+
auto* out = input_embeds_->mutable_data_ptr<float>();
622+
const auto* af = audio_embeds.const_data_ptr<float>();
623+
const auto* tf = tok_embed.const_data_ptr<float>();
624+
for (int64_t i = 0; i < dim; i++) {
625+
out[i] = af[i] + tf[i];
626+
}
633627
}
634628

635629
auto cache_pos =
@@ -669,31 +663,31 @@ int StreamingSession::flush() {
669663
}
670664
flushed_ = true;
671665

672-
// Pad with silence so any remaining audio (including partial steps and
673-
// the right look-ahead for the last complete step) can be processed.
674666
const int64_t remaining =
675667
static_cast<int64_t>(audio_buf_.size()) - samples_consumed_;
676668
if (remaining > 0 && !eos_reached_) {
677669
const int64_t step = runner_.step_samples_;
678670
const int64_t right_lookahead = runner_.stft_right_lookahead_;
679-
// Pad to next full step + right look-ahead
680-
int64_t pad_to = ((remaining + step - 1) / step) * step + right_lookahead;
681-
std::vector<float> silence(static_cast<size_t>(pad_to - remaining), 0.0f);
671+
const int64_t right_pad_audio_steps = runner_.delay_tokens_;
672+
673+
// Stay on the normal audio-conditioned path through the final partial
674+
// step, the preprocessor look-ahead, and the model's transcription delay.
675+
// Matches vLLM flush behavior:
676+
// https://github.com/vllm-project/vllm/blob/2f9f946/vllm/model_executor/models/voxtral_realtime.py#L239-L270
677+
int64_t pad_to =
678+
(((remaining + step - 1) / step) + right_pad_audio_steps) * step +
679+
right_lookahead;
680+
const int64_t silence_padded_samples = pad_to - remaining;
681+
std::vector<float> silence(
682+
static_cast<size_t>(silence_padded_samples), 0.0f);
682683
audio_buf_.insert(audio_buf_.end(), silence.begin(), silence.end());
683684

685+
// Guaranteed to terminate b/c each call to try_process_step() consumes a
686+
// fixed number of audio samples and the padded audio buffer is finite.
684687
while (!eos_reached_ && try_process_step()) {
685688
}
686689
}
687690

688-
// Text-only decoding after audio ends.
689-
const int64_t max_text_steps = std::min(
690-
static_cast<int64_t>(config_.max_new_tokens) - num_generated_,
691-
runner_.max_seq_len_ - dec_pos_);
692-
693-
for (int64_t i = 0; i < max_text_steps && !eos_reached_; i++) {
694-
decode_step(nullptr);
695-
}
696-
697691
return num_generated_;
698692
}
699693

examples/models/voxtral_realtime/voxtral_realtime_runner.h

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,15 @@ namespace voxtral_realtime {
2525
// audio and text embeddings at each position (element-wise add), while
2626
// MultimodalRunner concatenates modality segments sequentially.
2727

28-
struct TranscribeConfig {
28+
struct OfflineTranscribeConfig {
2929
int max_new_tokens = 500;
3030
float temperature = 0.0f; // 0 = greedy
3131
};
3232

33+
struct StreamingTranscribeConfig {
34+
float temperature = 0.0f; // 0 = greedy
35+
};
36+
3337
using TokenCallback = std::function<void(const std::string&)>;
3438

3539
class StreamingSession;
@@ -47,14 +51,14 @@ class VoxtralRealtimeRunner {
4751
int transcribe(
4852
const float* audio_data,
4953
int64_t num_samples,
50-
const TranscribeConfig& config,
54+
const OfflineTranscribeConfig& config,
5155
TokenCallback token_cb);
5256

5357
// Streaming transcription: processes raw audio incrementally via
5458
// StreamingSession. Requires a model exported with --streaming and
5559
// a streaming preprocessor .pte.
5660
std::unique_ptr<StreamingSession> create_streaming_session(
57-
const TranscribeConfig& config,
61+
const StreamingTranscribeConfig& config,
5862
TokenCallback token_cb);
5963

6064
int64_t max_seq_len() const {
@@ -101,6 +105,9 @@ class VoxtralRealtimeRunner {
101105
int64_t stft_right_lookahead_ = 40;
102106
int64_t mel_skip_frames_ = 2;
103107

108+
// Streaming transcription delay in steps (read from model metadata).
109+
int64_t delay_tokens_ = 6;
110+
104111
// Tokenizer special tokens
105112
uint64_t bos_id_ = 1;
106113
uint64_t eos_id_ = 2;
@@ -121,16 +128,16 @@ class StreamingSession {
121128
public:
122129
StreamingSession(
123130
VoxtralRealtimeRunner& runner,
124-
TranscribeConfig config,
131+
StreamingTranscribeConfig config,
125132
TokenCallback token_cb);
126133

127134
// Feed raw audio (16kHz float32). Processes as many complete 80ms steps
128135
// as possible. Returns number of new tokens generated.
129136
int feed_audio(const float* data, int64_t num_samples);
130137

131-
// Signal end of audio. Pads last partial step, then generates remaining
132-
// text-only tokens until EOS or max_new_tokens. Returns total tokens
133-
// generated across the entire session.
138+
// Signal end of audio. Pads the unfinished tail with silence so the final
139+
// partial step and model delay drain through the normal audio-conditioned
140+
// streaming path, then returns the total tokens generated for the session.
134141
int flush();
135142

136143
int total_tokens() const {
@@ -139,7 +146,6 @@ class StreamingSession {
139146

140147
private:
141148
VoxtralRealtimeRunner& runner_;
142-
TranscribeConfig config_;
143149
TokenCallback token_cb_;
144150

145151
// Raw audio accumulation buffer
@@ -163,11 +169,10 @@ class StreamingSession {
163169
// Process one 80ms step from the audio buffer.
164170
bool try_process_step();
165171

166-
// Run one decoder step (token_embed + optional audio_embed -> logits).
167-
// audio_embeds_tensor is the output from encode_audio_chunk, or nullptr
168-
// for text-only decoding after audio ends.
172+
// Run one audio-conditioned decoder step
173+
// (token_embed + audio_embed -> logits).
169174
bool decode_step(
170-
const ::executorch::extension::TensorPtr* audio_embeds_tensor);
175+
const ::executorch::extension::TensorPtr& audio_embeds_tensor);
171176
};
172177

173178
} // namespace voxtral_realtime

0 commit comments

Comments
 (0)