Skip to content

Commit ff52ab9

Browse files
tarekgh and Tarek Mahmoud Sayed
authored and committed
Add VoiceActivityDetection options to realtime session abstractions (#7399)
* Add VoiceActivityDetection options to realtime session abstractions

  Introduce VoiceActivityDetectionOptions, with Enabled and AllowInterruption properties, and expose it on RealtimeSessionOptions. These represent the common VAD options supported across multiple realtime AI models (OpenAI, Gemini, Anthropic Claude, AWS Nova Sonic).

  - Add VoiceActivityDetectionOptions class with Enabled (default true) and AllowInterruption (default true) properties
  - Add VoiceActivityDetection property to RealtimeSessionOptions
  - Map VAD options to OpenAI SDK TurnDetection in both conversation and transcription session paths
  - Add concurrency guidance for SendAsync in IRealtimeClientSession docs
  - Add unit tests for the new types

* Address PR review feedback

  - Fix type name in SendAsync concurrency docs: FunctionInvokingRealtimeSession -> FunctionInvokingRealtimeClientSession
  - Preserve existing TurnDetection from seed options (RawRepresentationFactory) by mutating InterruptResponseEnabled on the existing RealtimeServerVadTurnDetection instead of replacing it, in both conversation and transcription paths

* Clarify relationship between Enabled and AllowInterruption in VAD docs

  Document that AllowInterruption only takes effect when Enabled is true, and that disabling VAD fully disables turn detection, making interruption not applicable.

---------

Co-authored-by: Tarek Mahmoud Sayed <tarekms@ntdev.microsoft.com>
1 parent 8e1fea7 commit ff52ab9

5 files changed

Lines changed: 163 additions & 1 deletion

File tree

src/Libraries/Microsoft.Extensions.AI.Abstractions/Realtime/IRealtimeClientSession.cs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,17 @@ public interface IRealtimeClientSession : IAsyncDisposable
2727
/// <param name="cancellationToken">A token to cancel the operation.</param>
2828
/// <returns>A task that represents the asynchronous send operation.</returns>
2929
/// <remarks>
30+
/// <para>
3031
/// This method allows for sending client messages to the session at any time, which can be used to influence the session's behavior or state.
32+
/// </para>
33+
/// <para>
34+
/// <strong>Concurrency note for provider implementers:</strong> <see cref="SendAsync"/> may be called concurrently
35+
/// from multiple sources. For example, a caller may stream audio via <see cref="SendAsync"/> on one thread while
36+
/// middleware such as <c>FunctionInvokingRealtimeClientSession</c> calls <see cref="SendAsync"/> to return tool results
37+
/// from within <see cref="GetStreamingResponseAsync"/> enumeration on another thread. If the underlying transport
38+
/// (e.g., a WebSocket) does not support concurrent sends, provider implementations must serialize access — for
39+
/// example by using a <see cref="System.Threading.SemaphoreSlim"/> — to prevent protocol violations.
40+
/// </para>
3141
/// </remarks>
3242
Task SendAsync(RealtimeClientMessage message, CancellationToken cancellationToken = default);
3343

src/Libraries/Microsoft.Extensions.AI.Abstractions/Realtime/RealtimeSessionOptions.cs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,15 @@ public class RealtimeSessionOptions
7272
/// </summary>
7373
public IReadOnlyList<AITool>? Tools { get; init; }
7474

75+
/// <summary>
76+
/// Gets the voice activity detection (VAD) options for the session.
77+
/// </summary>
78+
/// <remarks>
79+
/// When set, configures how the server detects user speech to manage turn-taking.
80+
/// When <see langword="null"/>, the provider's default VAD behavior is used.
81+
/// </remarks>
82+
public VoiceActivityDetectionOptions? VoiceActivityDetection { get; init; }
83+
7584
/// <summary>
7685
/// Gets a callback responsible for creating the raw representation of the session options from an underlying implementation.
7786
/// </summary>
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
4+
using System.Diagnostics.CodeAnalysis;
5+
using Microsoft.Shared.DiagnosticIds;
6+
7+
namespace Microsoft.Extensions.AI;
8+
9+
/// <summary>
/// Options that control voice activity detection (VAD) for a real-time session.
/// </summary>
/// <remarks>
/// With voice activity detection, the start and end of the user's speech are determined automatically,
/// which is what allows natural turn-taking in conversational audio.
/// Set <see cref="Enabled"/> to <see langword="true"/> to have the server find speech boundaries and
/// handle turn transitions on its own.
/// Set <see cref="Enabled"/> to <see langword="false"/> to make the client responsible for explicitly
/// signaling activity boundaries (e.g., via audio buffer commit and response creation).
/// </remarks>
[Experimental(DiagnosticIds.Experiments.AIRealTime, UrlFormat = DiagnosticIds.UrlFormat)]
public class VoiceActivityDetectionOptions
{
    /// <summary>
    /// Initializes a new instance of the <see cref="VoiceActivityDetectionOptions"/> class.
    /// </summary>
    public VoiceActivityDetectionOptions()
    {
    }

    /// <summary>
    /// Gets or sets a value indicating whether server-side voice activity detection is enabled.
    /// </summary>
    /// <remarks>
    /// <see langword="true"/> means the server detects where speech starts and ends by itself and may
    /// trigger a response automatically once the user stops speaking.
    /// <see langword="false"/> means turn detection is fully disabled; the client then marks turn
    /// boundaries manually (e.g., via audio buffer commit and explicit response creation).
    /// The remaining properties on this class, such as <see cref="AllowInterruption"/>, have an effect
    /// only while this property is <see langword="true"/>.
    /// The default is <see langword="true"/>.
    /// </remarks>
    public bool Enabled { get; set; } = true;

    /// <summary>
    /// Gets or sets a value indicating whether the user's speech can interrupt the model's audio output.
    /// </summary>
    /// <remarks>
    /// <see langword="true"/> cuts the model's response off as soon as the user starts speaking (barge-in).
    /// <see langword="false"/> lets the model's response run to completion regardless of user input.
    /// The setting is meaningful only when <see cref="Enabled"/> is <see langword="true"/>: with voice
    /// activity detection disabled the server does not detect speech, so interruption does not apply.
    /// The default is <see langword="true"/>.
    /// Providers that do not support this option ignore it.
    /// </remarks>
    public bool AllowInterruption { get; set; } = true;
}

src/Libraries/Microsoft.Extensions.AI.OpenAI/OpenAIRealtimeClientSession.cs

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -414,6 +414,26 @@ private static Sdk.RealtimeConversationSessionOptions BuildConversationSessionOp
414414
outputAudioOptions.Voice = new Sdk.RealtimeVoice(options.Voice);
415415
}
416416

417+
if (options.VoiceActivityDetection is { } vad)
418+
{
419+
if (!vad.Enabled)
420+
{
421+
inputAudioOptions.DisableTurnDetection();
422+
}
423+
else if (inputAudioOptions.TurnDetection is Sdk.RealtimeServerVadTurnDetection existingVad)
424+
{
425+
existingVad.InterruptResponseEnabled = vad.AllowInterruption;
426+
}
427+
else
428+
{
429+
inputAudioOptions.TurnDetection = new Sdk.RealtimeServerVadTurnDetection
430+
{
431+
InterruptResponseEnabled = vad.AllowInterruption,
432+
CreateResponseEnabled = true,
433+
};
434+
}
435+
}
436+
417437
audioOptions.InputAudioOptions = inputAudioOptions;
418438
audioOptions.OutputAudioOptions = outputAudioOptions;
419439
convOptions.AudioOptions = audioOptions;
@@ -464,7 +484,7 @@ private static Sdk.RealtimeTranscriptionSessionOptions BuildTranscriptionSession
464484
{
465485
var transOptions = new Sdk.RealtimeTranscriptionSessionOptions();
466486

467-
if (options.InputAudioFormat is not null || options.TranscriptionOptions is not null)
487+
if (options.InputAudioFormat is not null || options.TranscriptionOptions is not null || options.VoiceActivityDetection is not null)
468488
{
469489
var inputAudioOptions = new Sdk.RealtimeTranscriptionSessionInputAudioOptions();
470490

@@ -483,6 +503,26 @@ private static Sdk.RealtimeTranscriptionSessionOptions BuildTranscriptionSession
483503
};
484504
}
485505

506+
if (options.VoiceActivityDetection is { } vad)
507+
{
508+
if (!vad.Enabled)
509+
{
510+
inputAudioOptions.DisableTurnDetection();
511+
}
512+
else if (inputAudioOptions.TurnDetection is Sdk.RealtimeServerVadTurnDetection existingVad)
513+
{
514+
existingVad.InterruptResponseEnabled = vad.AllowInterruption;
515+
}
516+
else
517+
{
518+
inputAudioOptions.TurnDetection = new Sdk.RealtimeServerVadTurnDetection
519+
{
520+
InterruptResponseEnabled = vad.AllowInterruption,
521+
CreateResponseEnabled = true,
522+
};
523+
}
524+
}
525+
486526
transOptions.AudioOptions = new Sdk.RealtimeTranscriptionSessionAudioOptions
487527
{
488528
InputAudioOptions = inputAudioOptions,

test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/Realtime/RealtimeSessionOptionsTests.cs

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ public void Constructor_Parameterless_PropsDefaulted()
2626
Assert.Null(options.OutputModalities);
2727
Assert.Null(options.ToolMode);
2828
Assert.Null(options.Tools);
29+
Assert.Null(options.VoiceActivityDetection);
2930
}
3031

3132
[Fact]
@@ -90,4 +91,49 @@ public void TranscriptionOptions_PromptDefaultsToNull()
9091
Assert.Null(options.Prompt);
9192
}
9293

94+
// A freshly constructed RealtimeSessionOptions carries no VAD options.
[Fact]
public void VoiceActivityDetection_DefaultsToNull()
{
    var sessionOptions = new RealtimeSessionOptions();

    Assert.Null(sessionOptions.VoiceActivityDetection);
}
100+
101+
// The instance supplied through the initializer is the very instance the property returns.
[Fact]
public void VoiceActivityDetection_Roundtrip()
{
    VoiceActivityDetectionOptions expected = new();

    var sessionOptions = new RealtimeSessionOptions { VoiceActivityDetection = expected };

    Assert.Same(expected, sessionOptions.VoiceActivityDetection);
}
112+
113+
// Both VAD switches are on for a default-constructed options object.
[Fact]
public void VoiceActivityDetectionOptions_DefaultValues()
{
    VoiceActivityDetectionOptions defaults = new();

    Assert.True(defaults.Enabled);
    Assert.True(defaults.AllowInterruption);
}
120+
121+
// Both properties accept values through an object initializer and through later assignment.
[Fact]
public void VoiceActivityDetectionOptions_Properties_Roundtrip()
{
    VoiceActivityDetectionOptions settings = new() { Enabled = false, AllowInterruption = false };

    // Values supplied by the initializer override the true defaults.
    Assert.False(settings.Enabled);
    Assert.False(settings.AllowInterruption);

    // Flip both back on through the setters.
    settings.Enabled = true;
    settings.AllowInterruption = true;

    Assert.True(settings.Enabled);
    Assert.True(settings.AllowInterruption);
}
93139
}

0 commit comments

Comments
 (0)