From 528a93a7ced61ad5c0e452a01da7d24ab7102867 Mon Sep 17 00:00:00 2001 From: JOY Date: Tue, 26 May 2026 12:25:02 +0700 Subject: [PATCH 1/7] feat: add facial voice presentation hooks --- .../Scripts/AI/PrototypeAgentBrain.cs | 18 +- .../AI/PrototypeFacialAnimationDriver.cs | 308 ++++++++++++++++++ .../AI/PrototypeFacialAnimationDriver.cs.meta | 2 + .../Scripts/AI/PrototypeLLMAgentDriver.cs | 16 +- .../Scripts/AI/PrototypeNPCChatClient.cs | 13 +- .../Scripts/AI/PrototypeNpcVoicePresenter.cs | 196 +++++++++++ .../AI/PrototypeNpcVoicePresenter.cs.meta | 2 + .../Scripts/AI/PrototypeVoiceCue.cs | 69 +++- ...ed-npc-dialogue-portrait-lipsync-design.md | 18 + 9 files changed, 628 insertions(+), 14 deletions(-) create mode 100644 Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeFacialAnimationDriver.cs create mode 100644 Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeFacialAnimationDriver.cs.meta create mode 100644 Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcVoicePresenter.cs create mode 100644 Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcVoicePresenter.cs.meta diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeAgentBrain.cs b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeAgentBrain.cs index 3b257153..cf025e2f 100644 --- a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeAgentBrain.cs +++ b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeAgentBrain.cs @@ -61,6 +61,7 @@ private enum BrainPhase private AgentContextDto _context; private PrototypeSpeechBubble _speechBubble; private PrototypeVoiceCue _voiceCue; + private PrototypeNpcVoicePresenter _voicePresenter; private VisualAnimationIntentDriver _intentDriver; private Animator _animator; private GameObject _visualRoot; @@ -127,6 +128,7 @@ private void Awake() _baseMoveSpeed = _moveSpeed; _speechBubble = GetOrAdd(); _voiceCue = GetOrAdd(); + _voicePresenter = GetOrAdd(); _gateway = FindAnyObjectByType(); } @@ -1366,7 +1368,7 @@ private IEnumerator ApplyDecision(AgentDecisionDto decision, AgentDecisionReques { 
_speechBubble.Show(text); } - _voiceCue.PlayCue(text); + PresentNpcSpeech(text, request.world_snapshot?.conversation_session_id); _intentDriver?.TryPlay(CharacterActionId.Talk); RememberSpeech(text); _nextTalkAt = Time.time + Mathf.Max(2f, _talkIntervalSeconds); @@ -1396,6 +1398,20 @@ private IEnumerator ApplyDecision(AgentDecisionDto decision, AgentDecisionReques yield break; } + private void PresentNpcSpeech(string text, string conversationSessionId) + { + if (_voicePresenter != null) + { + _voicePresenter.PresentSpeech(AgentId, conversationSessionId, text, _gateway); + return; + } + + if (_voiceCue != null) + { + _voiceCue.PlayCue(text); + } + } + private static bool IsModelDecisionSource(string source) { return string.Equals(source, "model", System.StringComparison.OrdinalIgnoreCase) || diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeFacialAnimationDriver.cs b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeFacialAnimationDriver.cs new file mode 100644 index 00000000..5fd1996c --- /dev/null +++ b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeFacialAnimationDriver.cs @@ -0,0 +1,308 @@ +using System; +using UnityEngine; + +namespace SecondSpawn.AI +{ + [DisallowMultipleComponent] + public sealed class PrototypeFacialAnimationDriver : MonoBehaviour + { + [SerializeField] private SkinnedMeshRenderer _faceRenderer; + [SerializeField] private bool _autoFindRenderer = true; + [SerializeField, Range(0f, 100f)] private float _maxJawOpenWeight = 55f; + [SerializeField, Range(0f, 100f)] private float _maxFunnelWeight = 22f; + [SerializeField, Range(0f, 100f)] private float _maxBlinkWeight = 85f; + [SerializeField, Min(0.1f)] private float _textPulseRate = 7.5f; + [SerializeField, Min(0f)] private float _speechHoldSeconds = 0.12f; + [SerializeField, Min(0.25f)] private float _blinkIntervalSeconds = 4.5f; + [SerializeField, Min(0.1f)] private float _resolveRetrySeconds = 1f; + [SerializeField] private string[] _jawOpenBlendShapes = { "jawOpen", "JawOpen", 
"MouthOpen", "mouthOpen", "Mouth_Open" }; + [SerializeField] private string[] _mouthFunnelBlendShapes = { "mouthFunnel", "MouthFunnel", "Mouth_Funnel", "mouthPucker", "MouthPucker" }; + [SerializeField] private string[] _leftBlinkBlendShapes = { "eyeBlinkLeft", "EyeBlinkLeft", "Blink_Left", "LeftEyeBlink" }; + [SerializeField] private string[] _rightBlinkBlendShapes = { "eyeBlinkRight", "EyeBlinkRight", "Blink_Right", "RightEyeBlink" }; + + private readonly float[] _audioSamples = new float[64]; + private AudioSource _speechSource; + private string _speechText; + private float _speechStartedAt; + private float _speechEndsAt; + private float _nextBlinkAt; + private float _blinkStartedAt = -1f; + private int _jawOpenIndex = -1; + private int _mouthFunnelIndex = -1; + private int _leftBlinkIndex = -1; + private int _rightBlinkIndex = -1; + private bool _indicesResolved; + private float _nextResolveAt; + + public bool HasBlendshapeTargets + { + get + { + EnsureResolved(); + return _jawOpenIndex >= 0 || _mouthFunnelIndex >= 0 || _leftBlinkIndex >= 0 || _rightBlinkIndex >= 0; + } + } + + private void Awake() + { + EnsureResolved(); + ScheduleNextBlink(); + } + + private void Update() + { + EnsureResolved(); + TickSpeech(); + TickBlink(); + } + + public void BeginTextSpeech(string text, float durationSeconds = 0f) + { + _speechSource = null; + _speechText = text ?? ""; + BeginSpeech(durationSeconds); + } + + public void BeginAudioSpeech(AudioSource source, string fallbackText, float durationSeconds = 0f) + { + _speechSource = source; + _speechText = fallbackText ?? ""; + BeginSpeech(durationSeconds); + } + + public void StopSpeech() + { + _speechSource = null; + _speechText = ""; + _speechEndsAt = 0f; + ApplyMouth(0f); + } + + private void BeginSpeech(float durationSeconds) + { + EnsureResolved(); + _speechStartedAt = Time.time; + var fallbackDuration = Mathf.Clamp((_speechText.Length <= 0 ? 
8 : _speechText.Length) * 0.035f, 0.35f, 4.5f); + _speechEndsAt = Time.time + Mathf.Max(0.1f, durationSeconds > 0f ? durationSeconds : fallbackDuration); + } + + private void TickSpeech() + { + if (_faceRenderer == null || _faceRenderer.sharedMesh == null) + { + return; + } + + var speakingFromAudio = _speechSource != null && _speechSource.isPlaying; + var speakingFromTimer = Time.time <= _speechEndsAt + _speechHoldSeconds; + if (!speakingFromAudio && !speakingFromTimer) + { + ApplyMouth(0f); + return; + } + + var mouth = speakingFromAudio ? ReadAudioMouthWeight() : ReadTextMouthWeight(); + ApplyMouth(mouth); + } + + private float ReadAudioMouthWeight() + { + if (_speechSource == null) + { + return ReadTextMouthWeight(); + } + + try + { + _speechSource.GetOutputData(_audioSamples, 0); + } + catch (Exception) + { + return ReadTextMouthWeight(); + } + + var total = 0f; + for (var i = 0; i < _audioSamples.Length; i++) + { + total += _audioSamples[i] * _audioSamples[i]; + } + + var rms = Mathf.Sqrt(total / _audioSamples.Length); + return Mathf.Clamp01(rms * 18f); + } + + private float ReadTextMouthWeight() + { + var age = Mathf.Max(0f, Time.time - _speechStartedAt); + var pulse = Mathf.Sin(age * _textPulseRate * Mathf.PI * 2f) * 0.5f + 0.5f; + var punctuationSoftener = EndsWithSoftPunctuation(_speechText) ? 
0.65f : 1f; + return Mathf.Clamp01((0.25f + pulse * 0.75f) * punctuationSoftener); + } + + private void ApplyMouth(float normalizedWeight) + { + SetWeight(_jawOpenIndex, normalizedWeight * _maxJawOpenWeight); + SetWeight(_mouthFunnelIndex, normalizedWeight * _maxFunnelWeight); + } + + private void TickBlink() + { + if (_faceRenderer == null || _faceRenderer.sharedMesh == null) + { + return; + } + + if (_blinkStartedAt < 0f && Time.time >= _nextBlinkAt) + { + _blinkStartedAt = Time.time; + } + + if (_blinkStartedAt < 0f) + { + return; + } + + const float blinkDuration = 0.16f; + var age = Time.time - _blinkStartedAt; + if (age >= blinkDuration) + { + SetWeight(_leftBlinkIndex, 0f); + SetWeight(_rightBlinkIndex, 0f); + _blinkStartedAt = -1f; + ScheduleNextBlink(); + return; + } + + var blinkWeight = Mathf.Sin(age / blinkDuration * Mathf.PI) * _maxBlinkWeight; + SetWeight(_leftBlinkIndex, blinkWeight); + SetWeight(_rightBlinkIndex, blinkWeight); + } + + private void EnsureResolved() + { + if (_indicesResolved && _faceRenderer != null) + { + return; + } + + if (_indicesResolved && _faceRenderer == null && Time.time < _nextResolveAt) + { + return; + } + + _nextResolveAt = Time.time + Mathf.Max(0.1f, _resolveRetrySeconds); + if (_faceRenderer == null && _autoFindRenderer) + { + _faceRenderer = FindFaceRenderer(); + } + + _jawOpenIndex = -1; + _mouthFunnelIndex = -1; + _leftBlinkIndex = -1; + _rightBlinkIndex = -1; + if (_faceRenderer != null && _faceRenderer.sharedMesh != null) + { + _jawOpenIndex = ResolveBlendShape(_faceRenderer.sharedMesh, _jawOpenBlendShapes); + _mouthFunnelIndex = ResolveBlendShape(_faceRenderer.sharedMesh, _mouthFunnelBlendShapes); + _leftBlinkIndex = ResolveBlendShape(_faceRenderer.sharedMesh, _leftBlinkBlendShapes); + _rightBlinkIndex = ResolveBlendShape(_faceRenderer.sharedMesh, _rightBlinkBlendShapes); + } + + _indicesResolved = true; + } + + private SkinnedMeshRenderer FindFaceRenderer() + { + var renderers = GetComponentsInChildren(true); + 
SkinnedMeshRenderer fallback = null; + for (var i = 0; i < renderers.Length; i++) + { + var renderer = renderers[i]; + if (renderer == null || renderer.sharedMesh == null || renderer.sharedMesh.blendShapeCount <= 0) + { + continue; + } + + fallback ??= renderer; + if (ResolveBlendShape(renderer.sharedMesh, _jawOpenBlendShapes) >= 0) + { + return renderer; + } + } + + return fallback; + } + + private void SetWeight(int index, float weight) + { + if (_faceRenderer == null || index < 0) + { + return; + } + + _faceRenderer.SetBlendShapeWeight(index, Mathf.Clamp(weight, 0f, 100f)); + } + + private void ScheduleNextBlink() + { + _nextBlinkAt = Time.time + Mathf.Max(0.25f, _blinkIntervalSeconds) + UnityEngine.Random.Range(0f, 1.25f); + } + + private static int ResolveBlendShape(Mesh mesh, string[] names) + { + if (mesh == null || names == null) + { + return -1; + } + + for (var i = 0; i < names.Length; i++) + { + var candidate = names[i]; + if (string.IsNullOrWhiteSpace(candidate)) + { + continue; + } + + var exactIndex = mesh.GetBlendShapeIndex(candidate); + if (exactIndex >= 0) + { + return exactIndex; + } + } + + for (var blendShapeIndex = 0; blendShapeIndex < mesh.blendShapeCount; blendShapeIndex++) + { + var blendShapeName = NormalizeBlendShapeName(mesh.GetBlendShapeName(blendShapeIndex)); + for (var i = 0; i < names.Length; i++) + { + if (string.Equals(blendShapeName, NormalizeBlendShapeName(names[i]), StringComparison.OrdinalIgnoreCase)) + { + return blendShapeIndex; + } + } + } + + return -1; + } + + private static string NormalizeBlendShapeName(string value) + { + return string.IsNullOrWhiteSpace(value) + ? 
"" + : value.Replace("_", "").Replace("-", "").Replace(" ", "").Trim(); + } + + private static bool EndsWithSoftPunctuation(string value) + { + if (string.IsNullOrWhiteSpace(value)) + { + return false; + } + + var trimmed = value.TrimEnd(); + return trimmed.EndsWith(".", StringComparison.Ordinal) || + trimmed.EndsWith(",", StringComparison.Ordinal) || + trimmed.EndsWith("?", StringComparison.Ordinal); + } + } +} diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeFacialAnimationDriver.cs.meta b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeFacialAnimationDriver.cs.meta new file mode 100644 index 00000000..22eb0220 --- /dev/null +++ b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeFacialAnimationDriver.cs.meta @@ -0,0 +1,2 @@ +fileFormatVersion: 2 +guid: 7baa8d8752d5464fbfa0a3f34396f62e diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeLLMAgentDriver.cs b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeLLMAgentDriver.cs index 26af95cf..c7fbd361 100644 --- a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeLLMAgentDriver.cs +++ b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeLLMAgentDriver.cs @@ -24,6 +24,7 @@ public sealed class PrototypeLLMAgentDriver : MonoBehaviour private NetworkPlayer _networkPlayer; private PrototypeSpeechBubble _speechBubble; private PrototypeVoiceCue _voiceCue; + private PrototypeNpcVoicePresenter _voicePresenter; private Coroutine _loop; private void Awake() @@ -41,6 +42,12 @@ private void Awake() _voiceCue = gameObject.AddComponent(); } + _voicePresenter = GetComponent(); + if (_voicePresenter == null) + { + _voicePresenter = gameObject.AddComponent(); + } + _gateway = FindAnyObjectByType(); _memorySync = _gateway != null ? 
_gateway.GetComponent() : null; } @@ -189,7 +196,14 @@ private void ApplyDecision(AgentDecisionDto decision) _networkPlayer.ClearPrototypeAgentInput(); Debug.Log($"[PrototypeLLMAgentDriver] Agent says: {decision.say}"); _speechBubble.Show(decision.say); - _voiceCue.PlayCue(decision.say); + if (_voicePresenter != null) + { + _voicePresenter.PresentSpeech(gameObject.name, null, decision.say, _gateway); + } + else + { + _voiceCue.PlayCue(decision.say); + } PlayCharacterAction(CharacterActionId.Talk); } else if (decision.action == "interact") diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNPCChatClient.cs b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNPCChatClient.cs index 2eff5381..ab26aff3 100644 --- a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNPCChatClient.cs +++ b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNPCChatClient.cs @@ -56,7 +56,7 @@ private IEnumerator SendPrototypeChat() }, response => { Debug.Log($"[PrototypeNPCChatClient] {response.npc_id}: {response.text}"); - PresentSpeech(response.text); + PresentSpeech(response.npc_id, response.text); PlayTalkAnimation(); }, Debug.LogWarning); } @@ -67,7 +67,7 @@ private IEnumerator CheckVoiceSession() { actor_id = _npcId, playback_mode = "voice_preview", - lip_sync_tiers = new[] { "text_timed", "audio_amplitude_hook" } + lip_sync_tiers = new[] { "text_timed", "audio_amplitude_hook", "provider_viseme_hook" } }, response => { Debug.Log($"[PrototypeNPCChatClient] Voice provider={response.provider}, available={response.voice_available}, reason={response.reason}"); @@ -83,7 +83,7 @@ private static void PlayTalkAnimation() } } - private static void PresentSpeech(string text) + private void PresentSpeech(string actorId, string text) { var speechBubble = FindAnyObjectByType(); if (speechBubble != null) @@ -91,6 +91,13 @@ private static void PresentSpeech(string text) speechBubble.Show(text); } + var presenter = FindAnyObjectByType(); + if (presenter != null) + { + presenter.PresentSpeech(actorId, null, 
text, _gateway); + return; + } + var voiceCue = FindAnyObjectByType(); if (voiceCue != null) { diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcVoicePresenter.cs b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcVoicePresenter.cs new file mode 100644 index 00000000..07c414bf --- /dev/null +++ b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcVoicePresenter.cs @@ -0,0 +1,196 @@ +using System.Collections; +using UnityEngine; +using UnityEngine.Networking; + +namespace SecondSpawn.AI +{ + [DisallowMultipleComponent] + public sealed class PrototypeNpcVoicePresenter : MonoBehaviour + { + [SerializeField] private bool _requestScopedVoiceSession = true; + [SerializeField] private string _playbackMode = "voice_preview"; + [SerializeField, Range(0f, 1f)] private float _clipVolume = 0.8f; + [SerializeField] private string[] _lipSyncTiers = { "text_timed", "audio_amplitude_hook", "provider_viseme_hook" }; + + private PrototypeVoiceCue _voiceCue; + private PrototypeFacialAnimationDriver _facialDriver; + private Coroutine _presentationLoop; + private int _lineSequence; + + public string LastVoiceProvider { get; private set; } = ""; + public string LastVoiceReason { get; private set; } = ""; + public string LastVoiceSessionId { get; private set; } = ""; + public string LastPresentationMode { get; private set; } = "idle"; + + private void Awake() + { + _voiceCue = GetComponent(); + if (_voiceCue == null) + { + _voiceCue = gameObject.AddComponent(); + } + + _facialDriver = GetComponent(); + if (_facialDriver == null) + { + _facialDriver = gameObject.AddComponent(); + } + } + + private void OnDisable() + { + StopPresentation(); + } + + public void PresentSpeech(string actorId, string conversationSessionId, string text, SecondSpawnGatewayClient gateway) + { + if (string.IsNullOrWhiteSpace(text)) + { + return; + } + + if (_presentationLoop != null) + { + StopCoroutine(_presentationLoop); + } + + _presentationLoop = StartCoroutine(PresentSpeechLoop(actorId, 
conversationSessionId, text, gateway)); + } + + public void PresentFallbackSpeech(string text) + { + PlayFallback(text, "fallback_only"); + } + + public void StopPresentation() + { + if (_presentationLoop != null) + { + StopCoroutine(_presentationLoop); + _presentationLoop = null; + } + + _voiceCue?.StopCue(); + _facialDriver?.StopSpeech(); + LastPresentationMode = "idle"; + } + + private IEnumerator PresentSpeechLoop(string actorId, string conversationSessionId, string text, SecondSpawnGatewayClient gateway) + { + VoiceSessionDto voiceSession = null; + string voiceError = null; + if (_requestScopedVoiceSession && gateway != null) + { + yield return gateway.GetVoiceSession(new VoiceSessionRequestDto + { + actor_id = actorId, + conversation_session_id = conversationSessionId, + line_id = BuildLineId(actorId, text), + playback_mode = _playbackMode, + ttl_seconds = 90, + lip_sync_tiers = _lipSyncTiers + }, value => voiceSession = value, error => voiceError = error); + } + + LastVoiceProvider = voiceSession?.provider ?? ""; + LastVoiceReason = !string.IsNullOrWhiteSpace(voiceError) ? voiceError : (voiceSession?.reason ?? ""); + LastVoiceSessionId = voiceSession?.session?.session_id ?? ""; + + if (voiceSession == null) + { + PlayFallback(text, string.IsNullOrWhiteSpace(voiceError) ? 
"voice_session_missing" : voiceError); + yield break; + } + + if (!voiceSession.voice_available) + { + PlayFallback(text, FirstNonEmpty(voiceSession.reason, "voice_unavailable")); + yield break; + } + + var endpoint = voiceSession.session?.endpoint; + if (!IsHttpEndpoint(endpoint)) + { + PlayFallback(text, FirstNonEmpty(voiceSession.reason, "voice_transport_not_playable_in_unity")); + yield break; + } + + yield return DownloadAndPlayClip(endpoint, voiceSession.session?.ephemeral_token, text); + } + + private IEnumerator DownloadAndPlayClip(string endpoint, string ephemeralToken, string fallbackText) + { + using var request = UnityWebRequestMultimedia.GetAudioClip(endpoint, ResolveAudioType(endpoint)); + if (!string.IsNullOrWhiteSpace(ephemeralToken)) + { + request.SetRequestHeader("Authorization", "Bearer " + ephemeralToken); + } + + yield return request.SendWebRequest(); + if (request.result != UnityWebRequest.Result.Success) + { + PlayFallback(fallbackText, "voice_clip_download_failed: " + request.error); + yield break; + } + + var clip = DownloadHandlerAudioClip.GetContent(request); + var duration = _voiceCue.PlayClip(clip, _clipVolume); + _facialDriver.BeginAudioSpeech(_voiceCue.OutputSource, fallbackText, duration); + LastPresentationMode = "server_voice_clip"; + } + + private void PlayFallback(string text, string reason) + { + var duration = _voiceCue.PlayCue(text); + _facialDriver.BeginAudioSpeech(_voiceCue.OutputSource, text, duration); + LastVoiceReason = reason ?? ""; + LastPresentationMode = "prototype_tone_with_blendshape"; + } + + private string BuildLineId(string actorId, string text) + { + _lineSequence++; + var safeActor = string.IsNullOrWhiteSpace(actorId) ? 
"unknown-actor" : actorId.Trim(); + var safeHash = text.GetHashCode() & 0x7fffffff; + return $"{safeActor}-line-{_lineSequence:0000}-{safeHash}"; + } + + private static bool IsHttpEndpoint(string endpoint) + { + return !string.IsNullOrWhiteSpace(endpoint) && + (endpoint.StartsWith("http://", System.StringComparison.OrdinalIgnoreCase) || + endpoint.StartsWith("https://", System.StringComparison.OrdinalIgnoreCase)); + } + + private static AudioType ResolveAudioType(string endpoint) + { + if (string.IsNullOrWhiteSpace(endpoint)) + { + return AudioType.UNKNOWN; + } + + var lower = endpoint.ToLowerInvariant(); + if (lower.Contains(".wav")) + { + return AudioType.WAV; + } + + if (lower.Contains(".ogg")) + { + return AudioType.OGGVORBIS; + } + + if (lower.Contains(".mp3")) + { + return AudioType.MPEG; + } + + return AudioType.UNKNOWN; + } + + private static string FirstNonEmpty(string value, string fallback) + { + return string.IsNullOrWhiteSpace(value) ? fallback : value; + } + } +} diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcVoicePresenter.cs.meta b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcVoicePresenter.cs.meta new file mode 100644 index 00000000..af53a1c6 --- /dev/null +++ b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcVoicePresenter.cs.meta @@ -0,0 +1,2 @@ +fileFormatVersion: 2 +guid: 2f49b5719cd94b37b981b721af17db00 diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeVoiceCue.cs b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeVoiceCue.cs index feb9465c..23ef4956 100644 --- a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeVoiceCue.cs +++ b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeVoiceCue.cs @@ -12,28 +12,79 @@ public sealed class PrototypeVoiceCue : MonoBehaviour private AudioSource _audioSource; + public AudioSource OutputSource => EnsureAudioSource(); + public bool IsPlaying => _audioSource != null && _audioSource.isPlaying; + public float LastCueDuration { get; private set; } + private void Awake() { - _audioSource = 
GetComponent(); - if (_audioSource == null) + EnsureAudioSource(); + } + + public float EstimateDuration(string text) + { + return Mathf.Clamp((text?.Length ?? 8) * _secondsPerCharacter, _minSeconds, _maxSeconds); + } + + public float PlayCue(string text) + { + var source = EnsureAudioSource(); + if (source == null) { - _audioSource = gameObject.AddComponent(); - _audioSource.playOnAwake = false; - _audioSource.spatialBlend = 0.65f; + LastCueDuration = 0f; + return 0f; } + + var duration = EstimateDuration(text); + var clip = BuildCue(duration); + return PlayClip(clip, _volume); } - public void PlayCue(string text) + public float PlayClip(AudioClip clip, float volumeScale = 1f) + { + var source = EnsureAudioSource(); + if (source == null || clip == null) + { + LastCueDuration = 0f; + return 0f; + } + + source.Stop(); + source.clip = clip; + source.volume = Mathf.Clamp01(volumeScale); + source.Play(); + LastCueDuration = clip.length; + return LastCueDuration; + } + + public void StopCue() { if (_audioSource == null) { return; } - var duration = Mathf.Clamp((text?.Length ?? 
8) * _secondsPerCharacter, _minSeconds, _maxSeconds); - var clip = BuildCue(duration); _audioSource.Stop(); - _audioSource.PlayOneShot(clip, _volume); + _audioSource.clip = null; + LastCueDuration = 0f; + } + + private AudioSource EnsureAudioSource() + { + if (_audioSource != null) + { + return _audioSource; + } + + _audioSource = GetComponent(); + if (_audioSource == null) + { + _audioSource = gameObject.AddComponent(); + } + + _audioSource.playOnAwake = false; + _audioSource.spatialBlend = 0.65f; + return _audioSource; } private static AudioClip BuildCue(float duration) diff --git a/docs/design/56-focused-npc-dialogue-portrait-lipsync-design.md b/docs/design/56-focused-npc-dialogue-portrait-lipsync-design.md index 51d6d35d..b1ede702 100644 --- a/docs/design/56-focused-npc-dialogue-portrait-lipsync-design.md +++ b/docs/design/56-focused-npc-dialogue-portrait-lipsync-design.md @@ -203,6 +203,24 @@ Evidence: compatible Ida body. - Fallback note for one body or prefab without compatible face blendshapes. +### Unity Alpha Components + +The first implementation uses local presentation components: + +- `PrototypeNpcVoicePresenter`: requests a scoped voice session from Nakama, + plays a server-provided audio clip when the session exposes a playable + temporary endpoint, and falls back to a prototype tone when voice is disabled + or unavailable. +- `PrototypeFacialAnimationDriver`: auto-finds a child `SkinnedMeshRenderer` + with face blendshapes, resolves common ARKit-style jaw, mouth, and blink + names, and drives mouth movement from either text timing or audio amplitude. +- `PrototypeVoiceCue`: owns the local `AudioSource` used by fallback tone + playback and amplitude-driven mouth movement. + +Unity must not hold provider API keys or call model providers directly. The +only accepted online voice path is a short-lived session material returned by +Nakama or `api.dos.ai`; all local animation data remains presentation only. + --- ## 6. 
Conversation State Machine From a9ee9d2753ca2d75afa3b61ed166984d55411e3e Mon Sep 17 00:00:00 2001 From: JOY Date: Tue, 26 May 2026 17:13:51 +0700 Subject: [PATCH 2/7] feat: improve NPC facial voice diagnostics --- ...econdSpawnFacialBlendshapeReportUtility.cs | 94 +++++++++++++++++++ .../Scripts/AI/PrototypeAgentBrain.cs | 24 +++++ .../AI/PrototypeFacialAnimationDriver.cs | 62 ++++++++++++ .../Scripts/AI/PrototypeNPCChatClient.cs | 9 +- .../Scripts/AI/PrototypeNpcVoicePresenter.cs | 31 +++++- .../Scripts/AI/PrototypeNpcWorldDebugPanel.cs | 7 ++ .../Scripts/Networking/VisualPrefabCatalog.cs | 5 + ...ed-npc-dialogue-portrait-lipsync-design.md | 10 +- 8 files changed, 237 insertions(+), 5 deletions(-) create mode 100644 Unity/Assets/_SecondSpawn/Editor/SecondSpawnFacialBlendshapeReportUtility.cs diff --git a/Unity/Assets/_SecondSpawn/Editor/SecondSpawnFacialBlendshapeReportUtility.cs b/Unity/Assets/_SecondSpawn/Editor/SecondSpawnFacialBlendshapeReportUtility.cs new file mode 100644 index 00000000..04858fb5 --- /dev/null +++ b/Unity/Assets/_SecondSpawn/Editor/SecondSpawnFacialBlendshapeReportUtility.cs @@ -0,0 +1,94 @@ +#if UNITY_EDITOR +using System.Text; +using SecondSpawn.Networking; +using UnityEditor; +using UnityEngine; + +namespace SecondSpawn.EditorTools +{ + public static class SecondSpawnFacialBlendshapeReportUtility + { + [MenuItem("Second Spawn/Art/Report Selected Facial Blendshapes")] + public static void ReportSelectedFacialBlendshapes() + { + var selected = Selection.activeGameObject; + if (selected == null) + { + Debug.LogWarning("[SecondSpawnFacialBlendshapeReportUtility] Select a character prefab or scene object first."); + return; + } + + Debug.Log(BuildReport(selected)); + } + + [MenuItem("Second Spawn/Art/Report Generated Visual Facial Blendshapes")] + public static void ReportGeneratedVisualFacialBlendshapes() + { + var builder = new StringBuilder(); + builder.AppendLine("[SecondSpawnFacialBlendshapeReportUtility] Generated visual facial 
blendshape report"); + for (var variant = 0; variant < VisualPrefabCatalog.Count; variant++) + { + var path = VisualPrefabCatalog.GetCleanAssetPath(variant); + var prefab = AssetDatabase.LoadAssetAtPath(path); + if (prefab == null) + { + path = VisualPrefabCatalog.GetSourceAssetPath(variant); + prefab = AssetDatabase.LoadAssetAtPath(path); + } + + builder.AppendLine($"Variant {variant:00}: {VisualPrefabCatalog.GetLabel(variant)}"); + builder.AppendLine(prefab == null + ? $" missing prefab at {path}" + : Indent(BuildReport(prefab), " ")); + } + + Debug.Log(builder.ToString()); + } + + private static string BuildReport(GameObject root) + { + var builder = new StringBuilder(); + builder.AppendLine($"Facial blendshape report for {root.name}"); + var renderers = root.GetComponentsInChildren(includeInactive: true); + if (renderers.Length == 0) + { + builder.AppendLine("No SkinnedMeshRenderer found."); + return builder.ToString(); + } + + var anyBlendshapes = false; + foreach (var renderer in renderers) + { + if (renderer == null || renderer.sharedMesh == null || renderer.sharedMesh.blendShapeCount <= 0) + { + continue; + } + + anyBlendshapes = true; + builder.AppendLine($"{renderer.name} | mesh={renderer.sharedMesh.name} | blendshapes={renderer.sharedMesh.blendShapeCount}"); + for (var index = 0; index < renderer.sharedMesh.blendShapeCount; index++) + { + builder.AppendLine($" {index:00}: {renderer.sharedMesh.GetBlendShapeName(index)}"); + } + } + + if (!anyBlendshapes) + { + builder.AppendLine("No blendshape-enabled renderer found."); + } + + return builder.ToString(); + } + + private static string Indent(string value, string prefix) + { + if (string.IsNullOrWhiteSpace(value)) + { + return ""; + } + + return prefix + value.TrimEnd().Replace("\n", "\n" + prefix); + } + } +} +#endif diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeAgentBrain.cs b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeAgentBrain.cs index cf025e2f..d693a791 100644 --- 
a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeAgentBrain.cs +++ b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeAgentBrain.cs @@ -234,6 +234,30 @@ public void ConfigureCrowdTuning( public string AgentId => string.IsNullOrWhiteSpace(_agentId) ? name : _agentId.Trim(); public string DisplayName => string.IsNullOrWhiteSpace(_displayName) ? name : _displayName.Trim(); + public string VoicePresentationMode => _voicePresenter != null ? _voicePresenter.LastPresentationMode : "none"; + public string VoicePresentationReason => _voicePresenter != null ? _voicePresenter.LastVoiceReason : ""; + public string FacialTargetSummary => _voicePresenter != null ? _voicePresenter.FacialTargetSummary : "voice_presenter=missing"; + + public static PrototypeAgentBrain FindActiveByAgentId(string actorId) + { + if (string.IsNullOrWhiteSpace(actorId)) + { + return null; + } + + var normalized = actorId.Trim(); + for (var i = 0; i < ActiveBrains.Count; i++) + { + var brain = ActiveBrains[i]; + if (brain != null && + string.Equals(brain.AgentId, normalized, System.StringComparison.OrdinalIgnoreCase)) + { + return brain; + } + } + + return null; + } public void NotifyNearbyPlayerChat( string message, diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeFacialAnimationDriver.cs b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeFacialAnimationDriver.cs index 5fd1996c..58fcd21c 100644 --- a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeFacialAnimationDriver.cs +++ b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeFacialAnimationDriver.cs @@ -33,6 +33,11 @@ public sealed class PrototypeFacialAnimationDriver : MonoBehaviour private int _rightBlinkIndex = -1; private bool _indicesResolved; private float _nextResolveAt; + private float _lastMouthWeight; + private string _jawOpenName = ""; + private string _mouthFunnelName = ""; + private string _leftBlinkName = ""; + private string _rightBlinkName = ""; public bool HasBlendshapeTargets { @@ -42,6 +47,39 @@ public bool HasBlendshapeTargets return 
_jawOpenIndex >= 0 || _mouthFunnelIndex >= 0 || _leftBlinkIndex >= 0 || _rightBlinkIndex >= 0; } } + public bool HasMouthTargets + { + get + { + EnsureResolved(); + return _jawOpenIndex >= 0 || _mouthFunnelIndex >= 0; + } + } + public bool HasBlinkTargets + { + get + { + EnsureResolved(); + return _leftBlinkIndex >= 0 || _rightBlinkIndex >= 0; + } + } + public string TargetRendererName + { + get + { + EnsureResolved(); + return _faceRenderer != null ? _faceRenderer.name : ""; + } + } + public string TargetSummary + { + get + { + EnsureResolved(); + return $"renderer={Fallback(TargetRendererName, "none")}, jaw={Fallback(_jawOpenName, "none")}, funnel={Fallback(_mouthFunnelName, "none")}, blinkL={Fallback(_leftBlinkName, "none")}, blinkR={Fallback(_rightBlinkName, "none")}"; + } + } + public float LastMouthWeight => _lastMouthWeight; private void Awake() { @@ -141,6 +179,7 @@ private float ReadTextMouthWeight() private void ApplyMouth(float normalizedWeight) { + _lastMouthWeight = Mathf.Clamp01(normalizedWeight); SetWeight(_jawOpenIndex, normalizedWeight * _maxJawOpenWeight); SetWeight(_mouthFunnelIndex, normalizedWeight * _maxFunnelWeight); } @@ -200,12 +239,20 @@ private void EnsureResolved() _mouthFunnelIndex = -1; _leftBlinkIndex = -1; _rightBlinkIndex = -1; + _jawOpenName = ""; + _mouthFunnelName = ""; + _leftBlinkName = ""; + _rightBlinkName = ""; if (_faceRenderer != null && _faceRenderer.sharedMesh != null) { _jawOpenIndex = ResolveBlendShape(_faceRenderer.sharedMesh, _jawOpenBlendShapes); _mouthFunnelIndex = ResolveBlendShape(_faceRenderer.sharedMesh, _mouthFunnelBlendShapes); _leftBlinkIndex = ResolveBlendShape(_faceRenderer.sharedMesh, _leftBlinkBlendShapes); _rightBlinkIndex = ResolveBlendShape(_faceRenderer.sharedMesh, _rightBlinkBlendShapes); + _jawOpenName = ResolveBlendShapeName(_faceRenderer.sharedMesh, _jawOpenIndex); + _mouthFunnelName = ResolveBlendShapeName(_faceRenderer.sharedMesh, _mouthFunnelIndex); + _leftBlinkName = 
ResolveBlendShapeName(_faceRenderer.sharedMesh, _leftBlinkIndex); + _rightBlinkName = ResolveBlendShapeName(_faceRenderer.sharedMesh, _rightBlinkIndex); } _indicesResolved = true; @@ -285,6 +332,16 @@ private static int ResolveBlendShape(Mesh mesh, string[] names) return -1; } + private static string ResolveBlendShapeName(Mesh mesh, int index) + { + if (mesh == null || index < 0 || index >= mesh.blendShapeCount) + { + return ""; + } + + return mesh.GetBlendShapeName(index); + } + private static string NormalizeBlendShapeName(string value) { return string.IsNullOrWhiteSpace(value) @@ -292,6 +349,11 @@ private static string NormalizeBlendShapeName(string value) : value.Replace("_", "").Replace("-", "").Replace(" ", "").Trim(); } + private static string Fallback(string value, string fallback) + { + return string.IsNullOrWhiteSpace(value) ? fallback : value.Trim(); + } + private static bool EndsWithSoftPunctuation(string value) { if (string.IsNullOrWhiteSpace(value)) diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNPCChatClient.cs b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNPCChatClient.cs index ab26aff3..987a3de6 100644 --- a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNPCChatClient.cs +++ b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNPCChatClient.cs @@ -85,13 +85,18 @@ private static void PlayTalkAnimation() private void PresentSpeech(string actorId, string text) { - var speechBubble = FindAnyObjectByType(); + var brain = PrototypeAgentBrain.FindActiveByAgentId(actorId); + var speechBubble = brain != null + ? brain.GetComponent() + : FindAnyObjectByType(); if (speechBubble != null) { speechBubble.Show(text); } - var presenter = FindAnyObjectByType(); + var presenter = brain != null + ? 
brain.GetComponent() + : FindAnyObjectByType(); if (presenter != null) { presenter.PresentSpeech(actorId, null, text, _gateway); diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcVoicePresenter.cs b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcVoicePresenter.cs index 07c414bf..06a74b2c 100644 --- a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcVoicePresenter.cs +++ b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcVoicePresenter.cs @@ -20,7 +20,11 @@ public sealed class PrototypeNpcVoicePresenter : MonoBehaviour public string LastVoiceProvider { get; private set; } = ""; public string LastVoiceReason { get; private set; } = ""; public string LastVoiceSessionId { get; private set; } = ""; + public string LastLineId { get; private set; } = ""; public string LastPresentationMode { get; private set; } = "idle"; + public bool IsPresenting => _presentationLoop != null || (_voiceCue != null && _voiceCue.IsPlaying); + public bool HasFacialBlendshapeTargets => _facialDriver != null && _facialDriver.HasBlendshapeTargets; + public string FacialTargetSummary => _facialDriver != null ? _facialDriver.TargetSummary : "facial_driver=missing"; private void Awake() { @@ -134,6 +138,12 @@ private IEnumerator DownloadAndPlayClip(string endpoint, string ephemeralToken, } var clip = DownloadHandlerAudioClip.GetContent(request); + if (clip == null) + { + PlayFallback(fallbackText, "voice_clip_decode_failed"); + yield break; + } + var duration = _voiceCue.PlayClip(clip, _clipVolume); _facialDriver.BeginAudioSpeech(_voiceCue.OutputSource, fallbackText, duration); LastPresentationMode = "server_voice_clip"; @@ -151,8 +161,25 @@ private string BuildLineId(string actorId, string text) { _lineSequence++; var safeActor = string.IsNullOrWhiteSpace(actorId) ? 
"unknown-actor" : actorId.Trim(); - var safeHash = text.GetHashCode() & 0x7fffffff; - return $"{safeActor}-line-{_lineSequence:0000}-{safeHash}"; + var safeHash = StableHash(text); + LastLineId = $"{safeActor}-line-{_lineSequence:0000}-{safeHash:x8}"; + return LastLineId; + } + + private static uint StableHash(string value) + { + unchecked + { + var hash = 2166136261u; + var safeValue = string.IsNullOrWhiteSpace(value) ? "" : value.Trim(); + for (var i = 0; i < safeValue.Length; i++) + { + hash ^= safeValue[i]; + hash *= 16777619u; + } + + return hash; + } } private static bool IsHttpEndpoint(string endpoint) diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcWorldDebugPanel.cs b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcWorldDebugPanel.cs index 3e88cfc6..ddc62871 100644 --- a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcWorldDebugPanel.cs +++ b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcWorldDebugPanel.cs @@ -274,6 +274,13 @@ private void DrawSelectedNpc() GUILayout.Label($"{NpcProfession(npc)} | {NpcAge(npc)} | {SafeText(npc.body?.identity?.home_base, "unknown base")}", _labelStyle); GUILayout.Label($"Lv {NpcLevel(npc)} | HP {NpcStats(npc)?.max_health ?? 0} | ATK {NpcStats(npc)?.attack_power ?? 0} | DEF {NpcStats(npc)?.defense_power ?? 
0} | V{NpcVisualVariant(npc)}", _labelStyle); GUILayout.Label($"Voice: {SafeText(npc.body?.voice_profile?.profile_id, "unassigned")} | {SafeText(npc.body?.voice_profile?.pace_hint, "steady")}", _labelStyle); + var activeBrain = PrototypeAgentBrain.FindActiveByAgentId(npc.actor_id); + if (activeBrain != null) + { + GUILayout.Label($"Voice runtime: {SafeText(activeBrain.VoicePresentationMode, "idle")} | {SafeText(activeBrain.VoicePresentationReason, "ready")}", _mutedStyle); + GUILayout.Label(Shorten(activeBrain.FacialTargetSummary, 120), _mutedStyle); + } + GUILayout.Label($"Soul: {SafeText(npc.body?.soul?.name, "unknown")}", _labelStyle); GUILayout.Label(Shorten(SafeText(npc.memory != null && npc.memory.Length > 0 ? npc.memory[0].summary : "", "No seed memory."), 120), _mutedStyle); } diff --git a/Unity/Assets/_SecondSpawn/Scripts/Networking/VisualPrefabCatalog.cs b/Unity/Assets/_SecondSpawn/Scripts/Networking/VisualPrefabCatalog.cs index 11ffd200..106e068a 100644 --- a/Unity/Assets/_SecondSpawn/Scripts/Networking/VisualPrefabCatalog.cs +++ b/Unity/Assets/_SecondSpawn/Scripts/Networking/VisualPrefabCatalog.cs @@ -83,6 +83,11 @@ public static string GetCleanPrefabName(int variant) return $"Visual_{index:00}_{Entries[index].CleanName}.prefab"; } + public static string GetLabel(int variant) + { + return Entries[NormalizeVariant(variant)].CleanName; + } + public static bool IsSemiRealCharacterVariant(int variant) { return VisualAnimationProfileCatalog.IsSemiRealCharacterVariant(variant); diff --git a/docs/design/56-focused-npc-dialogue-portrait-lipsync-design.md b/docs/design/56-focused-npc-dialogue-portrait-lipsync-design.md index b1ede702..2b0f802d 100644 --- a/docs/design/56-focused-npc-dialogue-portrait-lipsync-design.md +++ b/docs/design/56-focused-npc-dialogue-portrait-lipsync-design.md @@ -213,9 +213,17 @@ The first implementation uses local presentation components: or unavailable. 
- `PrototypeFacialAnimationDriver`: auto-finds a child `SkinnedMeshRenderer` with face blendshapes, resolves common ARKit-style jaw, mouth, and blink - names, and drives mouth movement from either text timing or audio amplitude. + names, exposes its resolved target summary for debug panels, and drives mouth + movement from either text timing or audio amplitude. - `PrototypeVoiceCue`: owns the local `AudioSource` used by fallback tone playback and amplitude-driven mouth movement. +- `PrototypeNPCChatClient`: resolves the active `PrototypeAgentBrain` by actor + id before presenting speech so prototype hotkey replies animate the intended + NPC instead of whichever presenter Unity finds first. +- `SecondSpawnFacialBlendshapeReportUtility`: editor-only reporting for + selected characters and generated visual prefabs. Agents use it to inspect + real imported `SkinnedMeshRenderer` blendshape names before approving an + Ida-family lip sync profile. Unity must not hold provider API keys or call model providers directly. 
The only accepted online voice path is a short-lived session material returned by From 7feff7dc828ea09c7e02ab5b57fe36b17dd4ed42 Mon Sep 17 00:00:00 2001 From: JOY Date: Tue, 26 May 2026 17:15:58 +0700 Subject: [PATCH 3/7] chore: add facial blendshape utility meta --- .../Editor/SecondSpawnFacialBlendshapeReportUtility.cs.meta | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 Unity/Assets/_SecondSpawn/Editor/SecondSpawnFacialBlendshapeReportUtility.cs.meta diff --git a/Unity/Assets/_SecondSpawn/Editor/SecondSpawnFacialBlendshapeReportUtility.cs.meta b/Unity/Assets/_SecondSpawn/Editor/SecondSpawnFacialBlendshapeReportUtility.cs.meta new file mode 100644 index 00000000..99b36fa7 --- /dev/null +++ b/Unity/Assets/_SecondSpawn/Editor/SecondSpawnFacialBlendshapeReportUtility.cs.meta @@ -0,0 +1,2 @@ +fileFormatVersion: 2 +guid: 3dc97550137eb3a449df53715bacac1f From 051f8eedc3a65f0ce61b0f349d18309448c1e4eb Mon Sep 17 00:00:00 2001 From: JOY Date: Tue, 26 May 2026 17:18:15 +0700 Subject: [PATCH 4/7] fix: reset NPC voice presentation state --- .../_SecondSpawn/Scripts/AI/PrototypeNpcVoicePresenter.cs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcVoicePresenter.cs b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcVoicePresenter.cs index 06a74b2c..f31792b7 100644 --- a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcVoicePresenter.cs +++ b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcVoicePresenter.cs @@ -80,6 +80,12 @@ public void StopPresentation() } private IEnumerator PresentSpeechLoop(string actorId, string conversationSessionId, string text, SecondSpawnGatewayClient gateway) + { + yield return PresentSpeechCore(actorId, conversationSessionId, text, gateway); + _presentationLoop = null; + } + + private IEnumerator PresentSpeechCore(string actorId, string conversationSessionId, string text, SecondSpawnGatewayClient gateway) { VoiceSessionDto voiceSession = null; string voiceError = null; From 
ef4b186d24d263f7c3a70ff10fdf8ebc3d37ef73 Mon Sep 17 00:00:00 2001 From: JOY Date: Tue, 26 May 2026 18:06:09 +0700 Subject: [PATCH 5/7] feat: add realtime NPC voice input hook --- .../Scripts/AI/AgentContextDto.cs | 70 +++ .../Scripts/AI/PrototypeNearbyNpcChatBox.cs | 73 ++++ .../AI/PrototypeNpcRealtimeVoiceClient.cs | 398 ++++++++++++++++++ .../PrototypeNpcRealtimeVoiceClient.cs.meta | 2 + .../Scripts/AI/SecondSpawnGatewayClient.cs | 10 + .../Scripts/UI/NearbyNpcChatPanel.cs | 59 ++- ...ed-npc-dialogue-portrait-lipsync-design.md | 36 +- 7 files changed, 646 insertions(+), 2 deletions(-) create mode 100644 Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcRealtimeVoiceClient.cs create mode 100644 Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcRealtimeVoiceClient.cs.meta diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/AgentContextDto.cs b/Unity/Assets/_SecondSpawn/Scripts/AI/AgentContextDto.cs index 4423fa06..6707daa7 100644 --- a/Unity/Assets/_SecondSpawn/Scripts/AI/AgentContextDto.cs +++ b/Unity/Assets/_SecondSpawn/Scripts/AI/AgentContextDto.cs @@ -1445,4 +1445,74 @@ public sealed class VoiceSessionDebugDto public string provider_status; public string fallback_mode; } + + [Serializable] + public sealed class RealtimeVoiceSessionRequestDto + { + public string actor_id; + public string conversation_session_id; + public string requested_transport = "livekit_ready"; + public string input_mode = "text_or_microphone"; + public int ttl_seconds = 120; + public bool text_input_supported = true; + public bool microphone_input_supported = true; + public int sample_rate_hz = 16000; + public int channels = 1; + public string client_platform = "unity"; + public string provider_hint = "gemini_live_or_tts"; + } + + [Serializable] + public sealed class RealtimeVoiceSessionDto + { + public bool session_available; + public string provider; + public string reason; + public string actor_id; + public string conversation_session_id; + public VoiceSessionMaterialDto session; + public 
RealtimeVoiceInputPolicyDto input_policy; + public VoiceSessionDebugDto debug; + } + + [Serializable] + public sealed class RealtimeVoiceInputPolicyDto + { + public bool accepts_text; + public bool accepts_audio; + public int max_audio_ms; + public int sample_rate_hz; + public int channels; + public string[] accepted_audio_formats; + } + + [Serializable] + public sealed class RealtimeVoiceInputRequestDto + { + public string client_event_id; + public string session_id; + public string actor_id; + public string conversation_session_id; + public string input_kind; + public string text; + public string audio_format; + public int sample_rate_hz; + public int channels; + public int duration_ms; + public string audio_base64; + } + + [Serializable] + public sealed class RealtimeVoiceInputResponseDto + { + public bool accepted; + public string provider; + public string reason; + public string conversation_session_id; + public string transcript; + public string npc_actor_id; + public string npc_text; + public bool fallback_to_text_chat; + public VoiceSessionDebugDto debug; + } } diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNearbyNpcChatBox.cs b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNearbyNpcChatBox.cs index 9aec3ca5..1edafc35 100644 --- a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNearbyNpcChatBox.cs +++ b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNearbyNpcChatBox.cs @@ -56,6 +56,7 @@ public sealed class PrototypeNearbyNpcChatBox : MonoBehaviour private Vector2 _historyScrollPosition; private bool _societyEventRpcAvailable = true; private CharacterMemorySync _memorySync; + private PrototypeNpcRealtimeVoiceClient _realtimeVoice; private string _focusedNpcActorId; private string _focusedNpcDisplayName; private PrototypeAgentBrain _focusedNpcBrain; @@ -83,6 +84,9 @@ public sealed class PrototypeNearbyNpcChatBox : MonoBehaviour public string QuestActionStatus => string.IsNullOrWhiteSpace(_questActionStatus) ? 
PrototypeQuestStatusLine : _questActionStatus; public string PrototypeQuestStatusLine => BuildPrototypeQuestStatusLine(PrototypeQuest); public string FocusedNpcDisplayName => string.IsNullOrWhiteSpace(_focusedNpcDisplayName) ? "Nearby NPC" : _focusedNpcDisplayName; + public string FocusedNpcActorId => string.IsNullOrWhiteSpace(_focusedNpcActorId) ? "" : _focusedNpcActorId.Trim(); + public string ActiveConversationSessionId => string.IsNullOrWhiteSpace(_activeConversationSessionId) ? "" : _activeConversationSessionId.Trim(); + public PrototypeNpcRealtimeVoiceClient RealtimeVoice => _realtimeVoice; public string DisplayName { get => SafeDisplayName(); @@ -144,6 +148,13 @@ private void Awake() s_activeInstance = this; _gateway = GetComponent(); _memorySync = GetComponent(); + _realtimeVoice = GetComponent(); + if (_realtimeVoice == null) + { + _realtimeVoice = gameObject.AddComponent(); + } + + _realtimeVoice.Bind(this, _gateway); } private void OnDestroy() @@ -287,6 +298,68 @@ public void SubmitLocalPlayerMessage(string message) StartCoroutine(SendNearbyMessage(message.Trim(), ResolveFocusedNpcRecipient())); } + public void SubmitRealtimeVoiceTranscript(string transcript, string source) + { + if (string.IsNullOrWhiteSpace(transcript)) + { + return; + } + + if (_busy || IsFocusedNpcResponding()) + { + _status = $"{FocusedNpcDisplayName} is still answering."; + return; + } + + if (!IsFocusedNpcActive()) + { + AddSystemLine("Stand near an NPC and press E before sending realtime voice input."); + return; + } + + var safeSource = string.IsNullOrWhiteSpace(source) ? 
"voice" : source.Trim(); + Debug.Log($"[PrototypeNearbyNpcChatBox] Realtime {safeSource} transcript routed to focused NPC actor={_focusedNpcActorId}, text={Shorten(transcript, 80)}"); + StartCoroutine(SendNearbyMessage(transcript.Trim(), ResolveFocusedNpcRecipient())); + } + + public void PresentRealtimeNpcResponse(string actorId, string text, string conversationSessionId) + { + if (string.IsNullOrWhiteSpace(text)) + { + return; + } + + var safeActorId = string.IsNullOrWhiteSpace(actorId) ? FocusedNpcActorId : actorId.Trim(); + if (string.IsNullOrWhiteSpace(safeActorId)) + { + AddSystemLine("Realtime voice returned NPC speech without a focused actor."); + return; + } + + if (!string.IsNullOrWhiteSpace(conversationSessionId)) + { + _activeConversationSessionId = conversationSessionId.Trim(); + } + + var brain = ResolveBrain(safeActorId); + TryAddFocusedNpcSpeech(safeActorId, brain != null ? brain.DisplayName : FocusedNpcDisplayName, text); + var presenter = brain != null ? brain.GetComponent() : null; + presenter?.PresentSpeech(safeActorId, ActiveConversationSessionId, text, _gateway); + } + + public void RememberRealtimeVoiceConversationSession(string conversationSessionId) + { + if (!string.IsNullOrWhiteSpace(conversationSessionId)) + { + _activeConversationSessionId = conversationSessionId.Trim(); + } + } + + public void AddPrototypeSystemLine(string text) + { + AddSystemLine(text); + } + public void AcceptPrototypeQuest() { if (_questBusy || _memorySync == null) diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcRealtimeVoiceClient.cs b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcRealtimeVoiceClient.cs new file mode 100644 index 00000000..991b65b8 --- /dev/null +++ b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcRealtimeVoiceClient.cs @@ -0,0 +1,398 @@ +using System; +using System.Collections; +using System.IO; +using SecondSpawn.Networking; +using UnityEngine; +using UnityEngine.InputSystem; + +namespace SecondSpawn.AI +{ + 
/// <summary>
    /// Prototype push-to-talk / realtime text client for focused NPC dialogue.
    /// Captures microphone audio (or typed text), makes sure a server-minted realtime
    /// voice session exists for the focused NPC, submits the input through
    /// <see cref="SecondSpawnGatewayClient"/>, and hands returned transcripts / NPC text
    /// back to the bound <see cref="PrototypeNearbyNpcChatBox"/>.
    /// </summary>
    [DisallowMultipleComponent]
    [RequireComponent(typeof(SecondSpawnGatewayClient))]
    public sealed class PrototypeNpcRealtimeVoiceClient : MonoBehaviour
    {
        // TTL requested for every realtime session; also used as the local upper bound
        // for how long a cached session is reused.
        // NOTE(review): the server may mint a shorter TTL - confirm against VoiceSessionMaterialDto.
        private const int SessionTtlSeconds = 120;

        // Re-request slightly before the assumed expiry so an input never races it.
        private const float SessionExpiryMarginSeconds = 5f;

        [SerializeField] private bool _enablePrototypeHotkeys = true;
        [SerializeField] private Key _pushToTalkKey = Key.C;
        [SerializeField] private int _sampleRateHz = 16000;
        [SerializeField] private int _maxRecordingSeconds = 8;
        [SerializeField] private string _requestedTransport = "livekit_ready";
        [SerializeField] private string _providerHint = "gemini_live_or_tts";

        private SecondSpawnGatewayClient _gateway;
        private PrototypeNearbyNpcChatBox _chat;
        private RealtimeVoiceSessionDto _session;
        private string _sessionActorId = "";
        private float _sessionExpiresAt;
        private AudioClip _recordingClip;
        private string _recordingDevice = "";
        private float _recordingStartedAt;
        private bool _busy;
        private bool _sessionRpcUnavailable;
        private string _status = "Voice input ready";
        private string _lastTranscript = "";
        private string _lastNpcText = "";

        /// <summary>True while a text or microphone submission is in flight.</summary>
        public bool IsBusy => _busy;

        /// <summary>True while the microphone device is actively capturing into the current clip.</summary>
        public bool IsRecording => _recordingClip != null && !string.IsNullOrWhiteSpace(_recordingDevice) && Microphone.IsRecording(_recordingDevice);

        /// <summary>True when at least one microphone device is reported.</summary>
        public bool HasMicrophone => Microphone.devices != null && Microphone.devices.Length > 0;

        public string Status => _status;
        public string LastTranscript => _lastTranscript;
        public string LastNpcText => _lastNpcText;
        public Key PushToTalkKey => _pushToTalkKey;
        public string SessionTransport => _session?.session?.transport ?? _requestedTransport;

        private void Awake()
        {
            // Bind() may already have supplied a gateway; only fall back to the sibling component.
            if (_gateway == null)
            {
                _gateway = GetComponent<SecondSpawnGatewayClient>();
            }
        }

        private void Update()
        {
            // Microphone.Start is called with loop=false, so the device stops by itself once
            // the clip is full. Finish that capture here instead of leaving a stale clip behind.
            if (_recordingClip != null && !IsRecording)
            {
                if (_chat != null && _chat.IsChatModeActive)
                {
                    EndPushToTalkAndSubmit();
                }
                else
                {
                    CancelRecording();
                }
            }

            if (!_enablePrototypeHotkeys || _chat == null || !_chat.IsChatModeActive)
            {
                return;
            }

            var keyboard = Keyboard.current;
            if (keyboard == null || PrototypeInputFocusGate.IsTextInputFocused)
            {
                return;
            }

            var key = keyboard[_pushToTalkKey];
            if (key.wasPressedThisFrame)
            {
                BeginPushToTalk();
            }

            if (key.wasReleasedThisFrame)
            {
                EndPushToTalkAndSubmit();
            }
        }

        private void OnDisable()
        {
            CancelRecording();

            // Unity stops coroutines when the GameObject is deactivated; without this reset
            // an interrupted submission would leave the client permanently "busy".
            StopAllCoroutines();
            _busy = false;
        }

        /// <summary>Attaches the chat box that owns NPC focus and, optionally, a gateway override.</summary>
        public void Bind(PrototypeNearbyNpcChatBox chat, SecondSpawnGatewayClient gateway)
        {
            _chat = chat;
            if (gateway != null)
            {
                _gateway = gateway;
            }
        }

        /// <summary>Starts a capture, or finishes and submits the current one.</summary>
        public void TogglePushToTalk()
        {
            if (_recordingClip != null)
            {
                EndPushToTalkAndSubmit();
                return;
            }

            BeginPushToTalk();
        }

        /// <summary>Pre-warms a realtime session for the currently focused NPC.</summary>
        public void RequestSessionForFocusedNpc()
        {
            if (_busy || _chat == null || !_chat.IsChatModeActive)
            {
                return;
            }

            StartCoroutine(EnsureSession(_chat.FocusedNpcActorId, _chat.ActiveConversationSessionId));
        }

        /// <summary>Begins microphone capture on the first reported device.</summary>
        public void BeginPushToTalk()
        {
            if (_busy || _recordingClip != null)
            {
                return;
            }

            if (_chat == null || !_chat.IsChatModeActive)
            {
                _status = "Stand near an NPC and press E before using voice.";
                if (_chat != null)
                {
                    _chat.AddPrototypeSystemLine(_status);
                }

                return;
            }

            if (!HasMicrophone)
            {
                _status = "No microphone device is available.";
                _chat.AddPrototypeSystemLine(_status);
                return;
            }

            _recordingDevice = Microphone.devices[0];
            _recordingClip = Microphone.Start(_recordingDevice, false, Mathf.Max(1, _maxRecordingSeconds), Mathf.Max(8000, _sampleRateHz));
            if (_recordingClip == null)
            {
                // Microphone.Start returns null when the device cannot be opened.
                _recordingDevice = "";
                _status = "Microphone could not be started.";
                _chat.AddPrototypeSystemLine(_status);
                return;
            }

            _recordingStartedAt = Time.realtimeSinceStartup;
            _status = $"Recording microphone for {_chat.FocusedNpcDisplayName}.";
        }

        /// <summary>
        /// Stops the capture (if the device is still running) and submits what was recorded.
        /// Also handles a capture that already auto-stopped at the maximum length.
        /// </summary>
        public void EndPushToTalkAndSubmit()
        {
            if (_recordingClip == null)
            {
                return;
            }

            var device = _recordingDevice;
            var clip = _recordingClip;
            var deviceStillRecording = !string.IsNullOrWhiteSpace(device) && Microphone.IsRecording(device);
            var position = 0;
            if (deviceStillRecording)
            {
                position = Mathf.Clamp(Microphone.GetPosition(device), 0, clip.samples);
                Microphone.End(device);
            }

            _recordingClip = null;
            _recordingDevice = "";

            var elapsedSeconds = Mathf.Max(0.05f, Time.realtimeSinceStartup - _recordingStartedAt);
            var elapsedMs = Mathf.RoundToInt(elapsedSeconds * 1000f);

            // No usable device position (e.g. the capture auto-stopped): estimate from wall time.
            var sampleFrames = position > 0
                ? position
                : Mathf.Clamp(Mathf.RoundToInt(elapsedSeconds * clip.frequency), 1, clip.samples);
            StartCoroutine(SubmitRecording(clip, sampleFrames, elapsedMs));
        }

        /// <summary>Aborts the current capture without submitting it and frees the clip.</summary>
        public void CancelRecording()
        {
            if (!string.IsNullOrWhiteSpace(_recordingDevice) && Microphone.IsRecording(_recordingDevice))
            {
                Microphone.End(_recordingDevice);
            }

            ReleaseClip(_recordingClip);
            _recordingClip = null;
            _recordingDevice = "";
        }

        /// <summary>Submits typed text through the realtime voice input RPC.</summary>
        public void SubmitRealtimeText(string text)
        {
            if (string.IsNullOrWhiteSpace(text) || _chat == null)
            {
                return;
            }

            // Only one submission at a time; a second coroutine would fight over _busy.
            if (_busy || _recordingClip != null)
            {
                _status = "Realtime voice is still processing the previous input.";
                _chat.AddPrototypeSystemLine(_status);
                return;
            }

            StartCoroutine(SubmitText(text.Trim()));
        }

        private IEnumerator SubmitText(string text)
        {
            _busy = true;
            var actorId = _chat.FocusedNpcActorId;
            var conversationSessionId = _chat.ActiveConversationSessionId;
            yield return EnsureSession(actorId, conversationSessionId);

            if (_chat == null)
            {
                _busy = false;
                yield break;
            }

            if (_sessionRpcUnavailable || _gateway == null)
            {
                // Backend not loaded: hand the text to the regular dialogue path.
                _busy = false;
                _chat.SubmitRealtimeVoiceTranscript(text, "text");
                yield break;
            }

            RealtimeVoiceInputResponseDto response = null;
            string error = null;
            yield return _gateway.SubmitRealtimeVoiceInput(new RealtimeVoiceInputRequestDto
            {
                client_event_id = CharacterMemorySync.BuildClientEventId("realtime-text"),
                session_id = _session?.session?.session_id,
                actor_id = actorId,
                conversation_session_id = conversationSessionId,
                input_kind = "text",
                text = text
            }, value => response = value, value => error = value);

            ApplyInputResponse(response, error, text, "text");
            _busy = false;
        }

        private IEnumerator SubmitRecording(AudioClip clip, int sampleFrames, int elapsedMs)
        {
            if (clip == null)
            {
                yield break;
            }

            if (_chat == null)
            {
                ReleaseClip(clip);
                yield break;
            }

            // Snapshot everything needed from the clip before the first yield, then free it:
            // clips created by Microphone.Start are not released until destroyed.
            var frequency = clip.frequency;
            var channels = clip.channels;
            var backendKnownMissing = _sessionRpcUnavailable || _gateway == null;
            var audioBase64 = backendKnownMissing ? "" : EncodeClipToWavBase64(clip, sampleFrames);
            ReleaseClip(clip);

            _busy = true;
            _status = "Submitting microphone input.";
            var actorId = _chat.FocusedNpcActorId;
            var conversationSessionId = _chat.ActiveConversationSessionId;
            yield return EnsureSession(actorId, conversationSessionId);

            if (_sessionRpcUnavailable || _gateway == null || _chat == null)
            {
                _status = "Voice captured locally; realtime voice backend is not loaded yet.";
                if (_chat != null)
                {
                    _chat.AddPrototypeSystemLine(_status);
                }

                _busy = false;
                yield break;
            }

            RealtimeVoiceInputResponseDto response = null;
            string error = null;
            yield return _gateway.SubmitRealtimeVoiceInput(new RealtimeVoiceInputRequestDto
            {
                client_event_id = CharacterMemorySync.BuildClientEventId("realtime-mic"),
                session_id = _session?.session?.session_id,
                actor_id = actorId,
                conversation_session_id = conversationSessionId,
                input_kind = "microphone",
                audio_format = "wav_pcm16",
                sample_rate_hz = frequency,
                channels = channels,
                duration_ms = Mathf.Min(elapsedMs, Mathf.Max(1, _maxRecordingSeconds) * 1000),
                audio_base64 = audioBase64
            }, value => response = value, value => error = value);

            ApplyInputResponse(response, error, "", "microphone");
            _busy = false;
        }

        /// <summary>
        /// Requests a realtime voice session unless a cached one is still usable for
        /// <paramref name="actorId"/>. A cached session is reused only for the actor it was
        /// minted for and only inside the TTL that was requested.
        /// </summary>
        private IEnumerator EnsureSession(string actorId, string conversationSessionId)
        {
            if (_sessionRpcUnavailable || _gateway == null)
            {
                yield break;
            }

            var safeActorId = actorId ?? "";
            var cachedSessionUsable =
                _session != null &&
                !string.IsNullOrWhiteSpace(_session.session?.session_id) &&
                string.Equals(_sessionActorId, safeActorId, StringComparison.Ordinal) &&
                Time.realtimeSinceStartup < _sessionExpiresAt;
            if (cachedSessionUsable)
            {
                yield break;
            }

            _session = null;
            _status = "Requesting realtime voice session.";
            var requestedAt = Time.realtimeSinceStartup;
            RealtimeVoiceSessionDto response = null;
            string error = null;
            yield return _gateway.GetRealtimeVoiceSession(new RealtimeVoiceSessionRequestDto
            {
                actor_id = actorId,
                conversation_session_id = conversationSessionId,
                requested_transport = _requestedTransport,
                provider_hint = _providerHint,
                ttl_seconds = SessionTtlSeconds,
                sample_rate_hz = Mathf.Max(8000, _sampleRateHz),
                channels = 1
            }, value => response = value, value => error = value);

            if (response == null)
            {
                if (IsRpcNotLoaded(error))
                {
                    _sessionRpcUnavailable = true;
                }

                _status = $"Realtime voice session unavailable: {Shorten(error, 90)}";
                yield break;
            }

            _session = response;
            _sessionActorId = safeActorId;
            // Measured from the moment of the request so the local estimate never outlives the server's.
            _sessionExpiresAt = requestedAt + SessionTtlSeconds - SessionExpiryMarginSeconds;
            _status = response.session_available
                ? $"Realtime voice ready via {FirstNonEmpty(response.provider, response.session?.transport, _requestedTransport)}."
                : $"Realtime voice unavailable: {FirstNonEmpty(response.reason, "provider unavailable")}.";
        }

        /// <summary>
        /// Applies a realtime input response: records transcript / NPC text, routes the
        /// transcript through the chat box and presents any returned NPC line.
        /// </summary>
        /// <param name="textFallback">Typed text to route through text chat when the RPC fails or asks for fallback.</param>
        /// <param name="inputKind">Source label passed to the chat box ("text" or "microphone").</param>
        private void ApplyInputResponse(RealtimeVoiceInputResponseDto response, string error, string textFallback, string inputKind)
        {
            if (response == null)
            {
                if (IsRpcNotLoaded(error))
                {
                    _sessionRpcUnavailable = true;
                }

                _status = $"Realtime voice input failed: {Shorten(error, 90)}";
                if (_chat == null)
                {
                    return;
                }

                if (!string.IsNullOrWhiteSpace(textFallback))
                {
                    _chat.SubmitRealtimeVoiceTranscript(textFallback, "text");
                }
                else
                {
                    _chat.AddPrototypeSystemLine(_status);
                }

                return;
            }

            _lastTranscript = response.transcript ?? "";
            _lastNpcText = response.npc_text ?? "";
            _status = response.accepted
                ? $"Realtime voice accepted by {FirstNonEmpty(response.provider, "api.dos.ai")}."
                : $"Realtime voice rejected: {FirstNonEmpty(response.reason, "unknown reason")}.";

            if (_chat == null)
            {
                return;
            }

            if (!string.IsNullOrWhiteSpace(response.conversation_session_id))
            {
                _chat.RememberRealtimeVoiceConversationSession(response.conversation_session_id);
            }

            // Prefer the server transcript; when the backend asks for text-chat fallback without
            // returning one, route the typed text so the player's input is not silently lost.
            var routedText = response.transcript;
            if (string.IsNullOrWhiteSpace(routedText) && response.fallback_to_text_chat)
            {
                routedText = textFallback;
            }

            // NOTE(review): when the backend returns both a transcript and npc_text, the transcript
            // still goes through the regular dialogue path, which may produce a second NPC reply - confirm intent.
            if (!string.IsNullOrWhiteSpace(routedText))
            {
                _chat.SubmitRealtimeVoiceTranscript(routedText, string.IsNullOrWhiteSpace(inputKind) ? "microphone" : inputKind);
            }

            if (!string.IsNullOrWhiteSpace(response.npc_text))
            {
                _chat.PresentRealtimeNpcResponse(response.npc_actor_id, response.npc_text, response.conversation_session_id);
            }
        }

        /// <summary>Destroys a microphone clip so its sample memory is released.</summary>
        private static void ReleaseClip(AudioClip clip)
        {
            if (clip != null)
            {
                Destroy(clip);
            }
        }

        /// <summary>Encodes the first <paramref name="sampleFrames"/> frames of the clip as base64 WAV (PCM16).</summary>
        private static string EncodeClipToWavBase64(AudioClip clip, int sampleFrames)
        {
            var frames = Mathf.Clamp(sampleFrames, 1, clip.samples);
            var channels = Mathf.Max(1, clip.channels);
            var samples = new float[frames * channels];
            clip.GetData(samples, 0);
            return Convert.ToBase64String(EncodePcm16Wav(samples, clip.frequency, channels));
        }

        /// <summary>Writes a canonical 44-byte RIFF/WAVE header followed by 16-bit little-endian PCM samples.</summary>
        private static byte[] EncodePcm16Wav(float[] samples, int sampleRate, int channels)
        {
            using var stream = new MemoryStream();
            using var writer = new BinaryWriter(stream);
            var byteRate = sampleRate * channels * 2;
            var dataSize = samples.Length * 2;

            writer.Write(System.Text.Encoding.ASCII.GetBytes("RIFF"));
            writer.Write(36 + dataSize);
            writer.Write(System.Text.Encoding.ASCII.GetBytes("WAVE"));
            writer.Write(System.Text.Encoding.ASCII.GetBytes("fmt "));
            writer.Write(16);
            writer.Write((short)1);
            writer.Write((short)channels);
            writer.Write(sampleRate);
            writer.Write(byteRate);
            writer.Write((short)(channels * 2));
            writer.Write((short)16);
            writer.Write(System.Text.Encoding.ASCII.GetBytes("data"));
            writer.Write(dataSize);

            foreach (var sample in samples)
            {
                writer.Write((short)Mathf.Clamp(Mathf.RoundToInt(sample * 32767f), short.MinValue, short.MaxValue));
            }

            writer.Flush();
            return stream.ToArray();
        }

        /// <summary>Heuristic: does the error text indicate that the RPC is not registered on the server?</summary>
        private static bool IsRpcNotLoaded(string error)
        {
            return !string.IsNullOrWhiteSpace(error) &&
                   (error.IndexOf("not found", StringComparison.OrdinalIgnoreCase) >= 0 ||
                    error.IndexOf("not registered", StringComparison.OrdinalIgnoreCase) >= 0 ||
                    error.IndexOf("rpc id", StringComparison.OrdinalIgnoreCase) >= 0);
        }

        /// <summary>Returns the first non-blank value, trimmed, or an empty string.</summary>
        private static string FirstNonEmpty(params string[] values)
        {
            foreach (var value in values)
            {
                if (!string.IsNullOrWhiteSpace(value))
                {
                    return value.Trim();
                }
            }

            return "";
        }

        /// <summary>Trims the value and truncates it to <paramref name="maxLength"/> characters with a trailing ellipsis.</summary>
        private static string Shorten(string value, int maxLength)
        {
            if (string.IsNullOrWhiteSpace(value))
            {
                return "";
            }

            var trimmed = value.Trim();
            return trimmed.Length <= maxLength ? trimmed : trimmed.Substring(0, Mathf.Max(0, maxLength - 3)) + "...";
        }
    }
}
diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcRealtimeVoiceClient.cs.meta b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcRealtimeVoiceClient.cs.meta new file mode 100644 index 00000000..e696fff7 --- /dev/null +++ b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcRealtimeVoiceClient.cs.meta @@ -0,0 +1,2 @@ +fileFormatVersion: 2 +guid: 957b08da5f4d4730a87ec6d53b0d40b5 diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/SecondSpawnGatewayClient.cs b/Unity/Assets/_SecondSpawn/Scripts/AI/SecondSpawnGatewayClient.cs index 7074f9e2..3a3a3303 100644 --- a/Unity/Assets/_SecondSpawn/Scripts/AI/SecondSpawnGatewayClient.cs +++ b/Unity/Assets/_SecondSpawn/Scripts/AI/SecondSpawnGatewayClient.cs @@ -438,6 +438,16 @@ public IEnumerator GetVoiceSession(VoiceSessionRequestDto request, Action onSuccess, Action onError = null) + { + yield return SendNakamaRpc("secondspawn_realtime_voice_session_request", request ?? new RealtimeVoiceSessionRequestDto(), onSuccess, onError); + } + + public IEnumerator SubmitRealtimeVoiceInput(RealtimeVoiceInputRequestDto request, Action onSuccess, Action onError = null) + { + yield return SendNakamaRpc("secondspawn_realtime_voice_input", request ?? 
new RealtimeVoiceInputRequestDto(), onSuccess, onError, _agentDecisionRequestTimeoutSeconds); + } + private IEnumerator SendNakamaRpc( string rpcId, object payload, diff --git a/Unity/Assets/_SecondSpawn/Scripts/UI/NearbyNpcChatPanel.cs b/Unity/Assets/_SecondSpawn/Scripts/UI/NearbyNpcChatPanel.cs index edc67c75..01ab7d21 100644 --- a/Unity/Assets/_SecondSpawn/Scripts/UI/NearbyNpcChatPanel.cs +++ b/Unity/Assets/_SecondSpawn/Scripts/UI/NearbyNpcChatPanel.cs @@ -20,6 +20,7 @@ public sealed class NearbyNpcChatPanel : MonoBehaviour private const float SubmitDebounceSeconds = 0.15f; private PrototypeNearbyNpcChatBox _chat; + private PrototypeNpcRealtimeVoiceClient _voiceClient; private RectTransform _panel; private Text _titleText; private InputField _displayNameInput; @@ -29,6 +30,8 @@ public sealed class NearbyNpcChatPanel : MonoBehaviour private RectTransform _historyContent; private ScrollRect _historyScroll; private Button _sendButton; + private Button _micButton; + private Text _micButtonText; private string _lastRenderedState = ""; private bool _lastChatMode; private bool _scrollToBottomPending; @@ -53,6 +56,7 @@ private static void AttachOnSceneLoad() public void Bind(PrototypeNearbyNpcChatBox chat) { _chat = chat; + _voiceClient = chat != null ? 
chat.RealtimeVoice : null; BuildUi(); Render(); } @@ -66,6 +70,8 @@ private void Update() { return; } + + _voiceClient = _chat.RealtimeVoice; } var textFocused = (_messageInput != null && _messageInput.isFocused) || @@ -85,6 +91,11 @@ private void Update() SubmitMessage(); } + if (_voiceClient == null && _chat != null) + { + _voiceClient = _chat.RealtimeVoice; + } + Render(); } @@ -176,6 +187,11 @@ private void BuildUi() var messageLayout = _messageInput.gameObject.AddComponent(); messageLayout.flexibleWidth = 1f; + _micButton = CreateButton("MicButton", row, "Mic"); + _micButtonText = _micButton.GetComponentInChildren(); + SetPreferredWidth(_micButton.transform as RectTransform, 58f); + _micButton.onClick.AddListener(ToggleMicrophone); + _sendButton = CreateButton("SendButton", row, "Send"); SetPreferredWidth(_sendButton.transform as RectTransform, 68f); _sendButton.onClick.AddListener(SubmitMessage); @@ -215,6 +231,23 @@ private void SubmitMessage() Render(); } + private void ToggleMicrophone() + { + if (_chat == null) + { + return; + } + + _voiceClient ??= _chat.RealtimeVoice; + if (_voiceClient == null) + { + return; + } + + _voiceClient.TogglePushToTalk(); + Render(); + } + private void Render() { if (_chat == null || _statusText == null || _historyContent == null) @@ -238,6 +271,10 @@ private void Render() builder.Append('|'); builder.Append(_chat.IsBusy ? "busy" : "ready"); builder.Append('|'); + builder.Append(_voiceClient != null && _voiceClient.IsRecording ? "recording" : "not-recording"); + builder.Append('|'); + builder.Append(_voiceClient != null ? _voiceClient.Status : ""); + builder.Append('|'); foreach (var line in _chat.DialogueLines) { builder.Append('|'); @@ -261,12 +298,32 @@ private void Render() ? $"Talking to {_chat.FocusedNpcDisplayName}" : "Nearby NPC Chat"; _statusText.text = _chat.IsBusy ? _chat.Status + "..." 
: _chat.Status; + if (_voiceClient != null && (_voiceClient.IsRecording || _voiceClient.IsBusy)) + { + _statusText.text = _voiceClient.Status; + } + _statusText.color = _chat.IsBusy ? new Color(1f, 0.86f, 0.35f) : new Color(0.74f, 0.86f, 0.92f); + if (_voiceClient != null && _voiceClient.IsRecording) + { + _statusText.color = new Color(1f, 0.58f, 0.42f); + } + if (_sendButton != null) { _sendButton.interactable = !_chat.IsBusy; } + if (_micButton != null) + { + var canStartMic = _voiceClient != null && _chat.IsChatModeActive && !_chat.IsBusy && !_voiceClient.IsBusy; + _micButton.interactable = _voiceClient != null && (_voiceClient.IsRecording || canStartMic); + if (_micButtonText != null) + { + _micButtonText.text = _voiceClient != null && _voiceClient.IsRecording ? "Stop" : "Mic"; + } + } + if (_messageInput != null) { _messageInput.interactable = !_chat.IsBusy; @@ -330,7 +387,7 @@ private void RenderDialogueRows() if (_chat.DialogueLines.Count == 0) { CreateSystemRow(_chat.IsChatModeActive - ? "Ask a nearby NPC. Enter sends - Esc exits Chat Mode." + ? "Ask a nearby NPC. Enter sends - Mic records - Esc exits Chat Mode." : "Type near NPCs. Enter sends - Esc exits Chat Mode."); return; } diff --git a/docs/design/56-focused-npc-dialogue-portrait-lipsync-design.md b/docs/design/56-focused-npc-dialogue-portrait-lipsync-design.md index 2b0f802d..74b9e5b5 100644 --- a/docs/design/56-focused-npc-dialogue-portrait-lipsync-design.md +++ b/docs/design/56-focused-npc-dialogue-portrait-lipsync-design.md @@ -220,6 +220,11 @@ The first implementation uses local presentation components: - `PrototypeNPCChatClient`: resolves the active `PrototypeAgentBrain` by actor id before presenting speech so prototype hotkey replies animate the intended NPC instead of whichever presenter Unity finds first. +- `PrototypeNpcRealtimeVoiceClient`: captures focused-dialogue text or local + microphone input behind a LiveKit-ready session contract. 
It requests + server-minted realtime session material, submits typed text or WAV PCM mic + clips to Nakama, routes returned transcripts through the existing dialogue + path, and falls back honestly when the backend RPC is not loaded. - `SecondSpawnFacialBlendshapeReportUtility`: editor-only reporting for selected characters and generated visual prefabs. Agents use it to inspect real imported `SkinnedMeshRenderer` blendshape names before approving an @@ -228,6 +233,9 @@ The first implementation uses local presentation components: Unity must not hold provider API keys or call model providers directly. The only accepted online voice path is a short-lived session material returned by Nakama or `api.dos.ai`; all local animation data remains presentation only. +LiveKit is the preferred future media transport for low-latency microphone +sessions, but Photon remains the game networking layer and Fusion/Nakama keep +all gameplay authority. --- @@ -296,6 +304,10 @@ Nakama owns: Voice or facial-animation providers may own optional transport-level audio, viseme, blendshape, or facial animation data for a single scoped session. +LiveKit may own WebRTC room transport, participant audio tracks, interruption +signals, and voice-agent media routing for one scoped conversation. It must not +own dialogue authorization, canonical memory writes, quest updates, combat, or +economy effects. No provider may own canonical NPC memory, relationship, quest, TIME, SECOND, inventory, combat, or body lifecycle state. @@ -357,7 +369,29 @@ Evidence: - Inspector or debug note showing which NPC presentation profile has text-only, audio-amplitude, or viseme-capable mode. -### D4: Play Mode Smoke Update +### D4: Realtime Text And Microphone Input Hook + +Issues: #139, #262 + +Build: + +- Keep typed text as the baseline focused-dialogue input. +- Add local microphone capture for push-to-talk focused dialogue. +- Request a scoped realtime voice session from Nakama before submitting audio. 
+- Keep the contract LiveKit-ready without importing the LiveKit Unity SDK until + the backend room/token lane is available. +- Route returned speech transcripts back through the normal player-to-NPC + dialogue path so memory, relationship, quest, and rate-limit rules stay + server-owned. +- Show an honest local fallback when the realtime voice RPC is not deployed. + +Evidence: + +- Compile evidence for the microphone capture and realtime session DTOs. +- Play Mode note showing text input still works and microphone capture reports + backend availability honestly. + +### D5: Play Mode Smoke Update Issues: #139, #140 From 5fb3026bfe02a3c0ace71d23a932556e98d29201 Mon Sep 17 00:00:00 2001 From: JOY Date: Tue, 26 May 2026 18:43:48 +0700 Subject: [PATCH 6/7] feat: wire realtime NPC voice turns --- .../AI/PrototypeNpcRealtimeVoiceClient.cs | 120 ++++++++- .../Scripts/AI/PrototypeNpcVoicePresenter.cs | 31 +++ .../AI/PrototypeWindowsSpeechBridge.cs | 151 +++++++++++ .../AI/PrototypeWindowsSpeechBridge.cs.meta | 2 + backend/nakama/README.md | 21 ++ backend/nakama/local.example.yml | 3 + backend/nakama/modules/index.ts | 253 ++++++++++++++++++ .../tests/supabase_custom_auth.test.mjs | 95 ++++++- ...ed-npc-dialogue-portrait-lipsync-design.md | 6 + 9 files changed, 680 insertions(+), 2 deletions(-) create mode 100644 Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeWindowsSpeechBridge.cs create mode 100644 Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeWindowsSpeechBridge.cs.meta diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcRealtimeVoiceClient.cs b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcRealtimeVoiceClient.cs index 991b65b8..36e7c689 100644 --- a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcRealtimeVoiceClient.cs +++ b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcRealtimeVoiceClient.cs @@ -17,6 +17,7 @@ public sealed class PrototypeNpcRealtimeVoiceClient : MonoBehaviour [SerializeField] private int _maxRecordingSeconds = 8; [SerializeField] 
private string _requestedTransport = "livekit_ready";
         [SerializeField] private string _providerHint = "gemini_live_or_tts";
+        [SerializeField] private bool _useWindowsDictationFallback = true;
 
         private SecondSpawnGatewayClient _gateway;
         private PrototypeNearbyNpcChatBox _chat;
@@ -29,9 +30,14 @@ public sealed class PrototypeNpcRealtimeVoiceClient : MonoBehaviour
         private string _status = "Voice input ready";
         private string _lastTranscript = "";
         private string _lastNpcText = "";
+#if UNITY_STANDALONE_WIN || UNITY_EDITOR_WIN
+        private UnityEngine.Windows.Speech.DictationRecognizer _dictationRecognizer;
+        private string _dictationTranscript = "";
+#endif
 
         public bool IsBusy => _busy;
-        public bool IsRecording => _recordingClip != null && !string.IsNullOrWhiteSpace(_recordingDevice) && Microphone.IsRecording(_recordingDevice);
+        public bool IsRecording => IsWindowsDictationActive() ||
+            (_recordingClip != null && !string.IsNullOrWhiteSpace(_recordingDevice) && Microphone.IsRecording(_recordingDevice));
         public bool HasMicrophone => Microphone.devices != null && Microphone.devices.Length > 0;
         public string Status => _status;
         public string LastTranscript => _lastTranscript;
@@ -118,6 +124,12 @@ public void BeginPushToTalk()
                 return;
             }
 
+            if (CanUseWindowsDictationFallback())
+            {
+                BeginWindowsDictation();
+                return;
+            }
+
             if (!HasMicrophone)
             {
                 _status = "No microphone device is available.";
@@ -133,6 +145,12 @@ public void BeginPushToTalk()
 
         public void EndPushToTalkAndSubmit()
         {
+            if (IsWindowsDictationActive() || HasPendingWindowsDictationTranscript())
+            {
+                EndWindowsDictationAndSubmit();
+                return;
+            }
+
             if (!IsRecording || _recordingClip == null)
             {
                 return;
@@ -159,6 +177,7 @@ public void CancelRecording()
 
             _recordingClip = null;
             _recordingDevice = "";
+            StopWindowsDictation();
         }
 
         public void SubmitRealtimeText(string text)
@@ -394,5 +413,112 @@ private static string Shorten(string value, int maxLength)
             var trimmed = value.Trim();
             return trimmed.Length <= maxLength ? trimmed : trimmed.Substring(0, Mathf.Max(0, maxLength - 3)) + "...";
         }
+
+        // True when the inspector toggle is on and the speech bridge reports dictation support.
+        // NOTE(review): BeginPushToTalk checks this before the microphone path, so local
+        // dictation is used whenever it is compiled in and enabled - confirm that is intended.
+        private bool CanUseWindowsDictationFallback()
+        {
+#if UNITY_STANDALONE_WIN || UNITY_EDITOR_WIN
+            return _useWindowsDictationFallback && PrototypeWindowsSpeechBridge.IsDictationAvailable;
+#else
+            return false;
+#endif
+        }
+
+        // True while the dictation recognizer exists and reports the Running status.
+        private bool IsWindowsDictationActive()
+        {
+#if UNITY_STANDALONE_WIN || UNITY_EDITOR_WIN
+            return _dictationRecognizer != null &&
+                _dictationRecognizer.Status == UnityEngine.Windows.Speech.SpeechSystemStatus.Running;
+#else
+            return false;
+#endif
+        }
+
+        // True when recognized text is waiting to be submitted.
+        private bool HasPendingWindowsDictationTranscript()
+        {
+#if UNITY_STANDALONE_WIN || UNITY_EDITOR_WIN
+            return !string.IsNullOrWhiteSpace(_dictationTranscript);
+#else
+            return false;
+#endif
+        }
+
+        // Lazily creates the recognizer and wires its events once, then clears the
+        // transcript and starts a new listening session.
+        private void BeginWindowsDictation()
+        {
+#if UNITY_STANDALONE_WIN || UNITY_EDITOR_WIN
+            if (_dictationRecognizer == null)
+            {
+                _dictationRecognizer = new UnityEngine.Windows.Speech.DictationRecognizer();
+                _dictationRecognizer.DictationResult += (text, confidence) =>
+                {
+                    if (!string.IsNullOrWhiteSpace(text))
+                    {
+                        // DictationResult fires once per recognized phrase, so accumulate
+                        // instead of overwriting or earlier phrases of the turn are lost.
+                        _dictationTranscript = (_dictationTranscript + " " + text).Trim();
+                        _status = $"Heard: {_dictationTranscript}";
+                    }
+                };
+                _dictationRecognizer.DictationHypothesis += text =>
+                {
+                    if (!string.IsNullOrWhiteSpace(text))
+                    {
+                        _status = $"Listening: {Shorten(text, 72)}";
+                    }
+                };
+                _dictationRecognizer.DictationError += (error, hresult) =>
+                {
+                    _status = $"Windows dictation error: {error}";
+                };
+                _dictationRecognizer.DictationComplete += completionCause =>
+                {
+                    if (completionCause != UnityEngine.Windows.Speech.DictationCompletionCause.Complete &&
+                        completionCause != UnityEngine.Windows.Speech.DictationCompletionCause.TimeoutExceeded)
+                    {
+                        _status = $"Windows dictation stopped: {completionCause}";
+                    }
+                };
+            }
+
+            _dictationTranscript = "";
+            _dictationRecognizer.Start();
+            _status = $"Listening to {_chat.FocusedNpcDisplayName}.";
+#endif
+        }
+
+        // Stops dictation and forwards the accumulated transcript to the chat box, or
+        // reports that nothing was captured.
+        private void EndWindowsDictationAndSubmit()
+        {
+#if UNITY_STANDALONE_WIN || 
UNITY_EDITOR_WIN
+            StopWindowsDictation();
+            if (string.IsNullOrWhiteSpace(_dictationTranscript))
+            {
+                _status = "No speech transcript was captured.";
+                _chat?.AddPrototypeSystemLine(_status);
+                return;
+            }
+
+            var transcript = _dictationTranscript.Trim();
+            _lastTranscript = transcript;
+            _dictationTranscript = "";
+            _status = $"Voice transcript ready: {Shorten(transcript, 72)}";
+            _chat.SubmitRealtimeVoiceTranscript(transcript, "windows_dictation");
+#endif
+        }
+
+        private void StopWindowsDictation()
+        {
+#if UNITY_STANDALONE_WIN || UNITY_EDITOR_WIN
+            if (_dictationRecognizer != null &&
+                _dictationRecognizer.Status == UnityEngine.Windows.Speech.SpeechSystemStatus.Running)
+            {
+                _dictationRecognizer.Stop();
+            }
+#endif
+        }
     }
 }
diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcVoicePresenter.cs b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcVoicePresenter.cs
index f31792b7..2599662c 100644
--- a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcVoicePresenter.cs
+++ b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcVoicePresenter.cs
@@ -8,6 +8,7 @@ namespace SecondSpawn.AI
     public sealed class PrototypeNpcVoicePresenter : MonoBehaviour
     {
         [SerializeField] private bool _requestScopedVoiceSession = true;
+        [SerializeField] private bool _allowWindowsSpeechFallback = true;
         [SerializeField] private string _playbackMode = "voice_preview";
         [SerializeField, Range(0f, 1f)] private float _clipVolume = 0.8f;
         [SerializeField] private string[] _lipSyncTiers = { "text_timed", "audio_amplitude_hook", "provider_viseme_hook" };
@@ -156,6 +157,36 @@ private IEnumerator DownloadAndPlayClip(string endpoint, string ephemeralToken,
         }
 
         private void PlayFallback(string text, string reason)
+        {
+            LastVoiceReason = reason ?? "";
+            if (_allowWindowsSpeechFallback && PrototypeWindowsSpeechBridge.IsSpeechSynthesisAvailable && isActiveAndEnabled)
+            {
+                LastPresentationMode = "windows_sapi_pending";
+                StartCoroutine(PlayWindowsSpeechFallback(text, reason));
+                return;
+            }
+
+            PlayPrototypeTone(text, reason);
+        }
+
+        private IEnumerator PlayWindowsSpeechFallback(string text, string reason)
+        {
+            AudioClip clip = null;
+            string error = null;
+            yield return PrototypeWindowsSpeechBridge.SynthesizeToClip(text, value => clip = value, value => error = value);
+            if (clip == null)
+            {
+                PlayPrototypeTone(text, string.IsNullOrWhiteSpace(error) ? reason : error);
+                yield break;
+            }
+
+            var duration = _voiceCue.PlayClip(clip, _clipVolume);
+            _facialDriver.BeginAudioSpeech(_voiceCue.OutputSource, text, duration);
+            LastVoiceReason = string.IsNullOrWhiteSpace(reason) ? "windows_sapi_local_tts" : reason;
+            LastPresentationMode = "windows_sapi_voice_with_blendshape";
+        }
+
+        private void PlayPrototypeTone(string text, string reason)
         {
             var duration = _voiceCue.PlayCue(text);
             _facialDriver.BeginAudioSpeech(_voiceCue.OutputSource, text, duration);
diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeWindowsSpeechBridge.cs b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeWindowsSpeechBridge.cs
new file mode 100644
index 00000000..2ea4507a
--- /dev/null
+++ b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeWindowsSpeechBridge.cs
@@ -0,0 +1,193 @@
+using System;
+using System.Collections;
+using System.Diagnostics;
+using System.IO;
+using System.Text;
+using UnityEngine;
+using UnityEngine.Networking;
+
+namespace SecondSpawn.AI
+{
+    // Windows-only local speech helper. Text-to-speech is produced by running the
+    // System.Speech synthesizer in a PowerShell child process that writes a temporary
+    // WAV file, which is then loaded as an AudioClip. On other platforms both
+    // capability flags are false and SynthesizeToClip reports an error.
+    public static class PrototypeWindowsSpeechBridge
+    {
+#if UNITY_STANDALONE_WIN || UNITY_EDITOR_WIN
+        public static bool IsDictationAvailable => true;
+        public static bool IsSpeechSynthesisAvailable => true;
+#else
+        public static bool IsDictationAvailable => false;
+        public static bool IsSpeechSynthesisAvailable => false;
+#endif
+
+        // Coroutine that synthesizes the text and reports one outcome: onClip with the
+        // decoded clip, or onError with a reason code. The temporary WAV file is deleted
+        // on every path that created it.
+        public static IEnumerator SynthesizeToClip(string text, Action<AudioClip> onClip, Action<string> onError = null)
+        {
+#if UNITY_STANDALONE_WIN || UNITY_EDITOR_WIN
+            if (string.IsNullOrWhiteSpace(text))
+            {
+                onError?.Invoke("empty_text");
+                yield break;
+            }
+
+            var path = Path.Combine(Application.temporaryCachePath, "second-spawn-npc-voice-" + Guid.NewGuid().ToString("N") + ".wav");
+            var processError = "";
+            yield return RunSapiSynthesis(text.Trim(), path, value => processError = value);
+            if (!string.IsNullOrWhiteSpace(processError) || !File.Exists(path))
+            {
+                // A failed or killed run can leave a partial WAV behind; do not leak it.
+                TryDelete(path);
+                onError?.Invoke(string.IsNullOrWhiteSpace(processError) ? "windows_sapi_no_output" : processError);
+                yield break;
+            }
+
+            using var request = UnityWebRequestMultimedia.GetAudioClip("file:///" + path.Replace("\\", "/"), AudioType.WAV);
+            yield return request.SendWebRequest();
+            if (request.result != UnityWebRequest.Result.Success)
+            {
+                onError?.Invoke("windows_sapi_clip_decode_failed: " + request.error);
+                TryDelete(path);
+                yield break;
+            }
+
+            var clip = DownloadHandlerAudioClip.GetContent(request);
+            TryDelete(path);
+            if (clip == null)
+            {
+                onError?.Invoke("windows_sapi_empty_clip");
+                yield break;
+            }
+
+            onClip?.Invoke(clip);
+#else
+            onError?.Invoke("windows_speech_unavailable");
+            yield break;
+#endif
+        }
+
+#if UNITY_STANDALONE_WIN || UNITY_EDITOR_WIN
+        // Runs the SAPI script in PowerShell and polls for exit for up to 12 seconds.
+        // onError is invoked only on failure; success is silent.
+        private static IEnumerator RunSapiSynthesis(string text, string outputPath, Action<string> onError)
+        {
+            // Base64 keeps arbitrary text and path characters out of the script's quoted literals.
+            var encodedText = Convert.ToBase64String(Encoding.UTF8.GetBytes(text));
+            var encodedPath = Convert.ToBase64String(Encoding.UTF8.GetBytes(outputPath));
+            var script = string.Join(Environment.NewLine, new[]
+            {
+                "$ErrorActionPreference = 'Stop'",
+                "Add-Type -AssemblyName System.Speech",
+                "$text = [Text.Encoding]::UTF8.GetString([Convert]::FromBase64String('" + encodedText + "'))",
+                "$path = [Text.Encoding]::UTF8.GetString([Convert]::FromBase64String('" + encodedPath + "'))",
+                "$voice = New-Object System.Speech.Synthesis.SpeechSynthesizer",
+                "$voice.Rate = 0",
+                "$voice.Volume = 100",
+                "$voice.SetOutputToWaveFile($path)",
+                "$voice.Speak($text)",
+                "$voice.Dispose()"
+            });
+            var encodedCommand = Convert.ToBase64String(Encoding.Unicode.GetBytes(script));
+            var start = new ProcessStartInfo
+            {
+                FileName = "powershell.exe",
+                Arguments = "-NoProfile -NonInteractive -ExecutionPolicy Bypass -EncodedCommand " + encodedCommand,
+                CreateNoWindow = true,
+                UseShellExecute = false,
+                RedirectStandardError = true,
+                RedirectStandardOutput = true
+            };
+
+            Process process;
+            try
+            {
+                process = Process.Start(start);
+            }
+            catch (Exception ex)
+            {
+                onError?.Invoke("windows_sapi_start_failed: " + ex.Message);
+                yield break;
+            }
+
+            if (process == null)
+            {
+                onError?.Invoke("windows_sapi_process_missing");
+                yield break;
+            }
+
+            // Release the OS process handle on every exit path below.
+            using (process)
+            {
+                // Drain both redirected pipes asynchronously. Reading stderr only after exit
+                // can stall a child that fills an unread pipe, which then surfaces as a timeout.
+                var stderr = new StringBuilder();
+                process.ErrorDataReceived += (sender, args) =>
+                {
+                    if (args.Data != null)
+                    {
+                        lock (stderr)
+                        {
+                            stderr.AppendLine(args.Data);
+                        }
+                    }
+                };
+                process.BeginOutputReadLine();
+                process.BeginErrorReadLine();
+
+                var startedAt = Time.realtimeSinceStartup;
+                while (!process.HasExited)
+                {
+                    if (Time.realtimeSinceStartup - startedAt > 12f)
+                    {
+                        try
+                        {
+                            process.Kill();
+                        }
+                        catch (Exception)
+                        {
+                            // Best effort: the process may already have exited.
+                        }
+
+                        onError?.Invoke("windows_sapi_timeout");
+                        yield break;
+                    }
+
+                    yield return null;
+                }
+
+                // The parameterless overload also waits for the async readers to reach end of stream.
+                process.WaitForExit();
+                if (process.ExitCode != 0)
+                {
+                    string error;
+                    lock (stderr)
+                    {
+                        error = stderr.ToString().Trim();
+                    }
+
+                    onError?.Invoke("windows_sapi_exit_" + process.ExitCode + ": " + error);
+                }
+            }
+        }
+#endif
+
+        // Best-effort removal of a temporary file; failures are intentionally ignored.
+        private static void TryDelete(string path)
+        {
+            try
+            {
+                if (File.Exists(path))
+                {
+                    File.Delete(path);
+                }
+            }
+            catch (Exception)
+            {
+                // Cleanup only: failing to delete a temp file must not fail speech playback.
+            }
+        }
+    }
+}
diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeWindowsSpeechBridge.cs.meta b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeWindowsSpeechBridge.cs.meta
new file mode 100644
index 00000000..f8a3152f
--- /dev/null
+++ b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeWindowsSpeechBridge.cs.meta
@@ -0,0 +1,2 @@
+fileFormatVersion: 2
+guid: 80c379d4e0e344a38ac8e1d5668c86bf
diff --git a/backend/nakama/README.md b/backend/nakama/README.md
index 4f2255e4..65485c17 100644
--- a/backend/nakama/README.md
+++ b/backend/nakama/README.md
@@ -150,6 +150,23 @@ lifecycle state. 
When voice is disabled, unconfigured, timed out, or rejected, the RPC returns a structured text-only fallback so focused dialogue remains usable. +Optional realtime voice turn env: + +```text +DOS_AI_REALTIME_VOICE_ENABLED=false +DOS_AI_REALTIME_VOICE_URL=https://api.dos.ai/v1/voice/realtime/turns +DOS_AI_REALTIME_VOICE_MAX_AUDIO_MS=8000 +``` + +`secondspawn_realtime_voice_session_request` and +`secondspawn_realtime_voice_input` are also disabled by default. When enabled, +Nakama submits scoped text or microphone turn material to `api.dos.ai` and +expects a transcript plus an NPC reply. Unity can use this for LiveKit-backed +rooms later, but Photon remains the game networking layer and the response is +dialogue/presentation only. In local Windows Editor or standalone Windows +builds, Unity may use the OS dictation and SAPI speech fallback when these +runtime env values are not configured, keeping provider keys out of the client. + ### Metrics and Structured Logs Nakama exposes its normal Prometheus-style server metrics through deployment @@ -175,6 +192,10 @@ can filter prototype game events without parsing free-form text: - `secondspawn.voice_session` records voice session availability, provider, fallback reason, target actor, and voice profile id without logging provider keys or ephemeral tokens. +- `secondspawn.realtime_voice_session` and + `secondspawn.realtime_voice_input` record realtime voice availability, + accepted turn status, transcript length, and NPC reply length without logging + raw audio or provider keys. Structured logs must stay public-safe: do not log provider API keys, RPC secrets, raw prompts, raw payloads, or private provider responses. 
Use diff --git a/backend/nakama/local.example.yml b/backend/nakama/local.example.yml index 8f563282..f8cb2234 100644 --- a/backend/nakama/local.example.yml +++ b/backend/nakama/local.example.yml @@ -19,6 +19,9 @@ runtime: - "DOS_AI_DECISION_DAILY_TOKEN_BUDGET=250000" - "DOS_AI_DIRECT_CHAT_DAILY_REQUEST_LIMIT=1000" - "DOS_AI_DIRECT_CHAT_DAILY_TOKEN_BUDGET=250000" + - "DOS_AI_VOICE_SESSIONS_ENABLED=false" + - "DOS_AI_REALTIME_VOICE_ENABLED=false" + - "DOS_AI_REALTIME_VOICE_MAX_AUDIO_MS=8000" # Set DOS_AI_API_KEY only in private local config or deployment secrets. # Set SECOND_SPAWN_INTERNAL_RPC_SECRET and SECOND_SPAWN_ADMIN_RPC_SECRET # only in private local config or deployment secrets. diff --git a/backend/nakama/modules/index.ts b/backend/nakama/modules/index.ts index 0f73fe4e..c9bdb2b6 100644 --- a/backend/nakama/modules/index.ts +++ b/backend/nakama/modules/index.ts @@ -25,6 +25,8 @@ var rpcIdAgentPolicyUpdate = "secondspawn_agent_policy_update"; var rpcIdAgentReturnReport = "secondspawn_agent_return_report"; var rpcIdAgentActivityAdd = "secondspawn_agent_activity_add"; var rpcIdVoiceSessionRequest = "secondspawn_voice_session_request"; +var rpcIdRealtimeVoiceSessionRequest = "secondspawn_realtime_voice_session_request"; +var rpcIdRealtimeVoiceInput = "secondspawn_realtime_voice_input"; var rpcIdActorProfileGet = "secondspawn_actor_profile_get"; var rpcIdActorMemoryAdd = "secondspawn_actor_memory_add"; var rpcIdActorMemoryQuery = "secondspawn_actor_memory_query"; @@ -80,6 +82,8 @@ var rpcBoundaryCatalog = [ { id: rpcIdAgentReturnReport, boundary: rpcBoundaryClient }, { id: rpcIdAgentActivityAdd, boundary: rpcBoundaryClient }, { id: rpcIdVoiceSessionRequest, boundary: rpcBoundaryClient }, + { id: rpcIdRealtimeVoiceSessionRequest, boundary: rpcBoundaryClient }, + { id: rpcIdRealtimeVoiceInput, boundary: rpcBoundaryClient }, { id: rpcIdActorProfileGet, boundary: rpcBoundaryClient }, { id: rpcIdActorMemoryAdd, boundary: rpcBoundaryClient }, { id: 
rpcIdActorMemoryQuery, boundary: rpcBoundaryClient }, @@ -706,6 +710,8 @@ let InitModule: nkruntime.InitModule = function ( initializer.registerRpc(rpcIdAgentReturnReport, rpcAgentReturnReport); initializer.registerRpc(rpcIdAgentActivityAdd, rpcAgentActivityAdd); initializer.registerRpc(rpcIdVoiceSessionRequest, rpcVoiceSessionRequest); + initializer.registerRpc(rpcIdRealtimeVoiceSessionRequest, rpcRealtimeVoiceSessionRequest); + initializer.registerRpc(rpcIdRealtimeVoiceInput, rpcRealtimeVoiceInput); initializer.registerRpc(rpcIdActorProfileGet, rpcActorProfileGet); initializer.registerRpc(rpcIdActorMemoryAdd, rpcActorMemoryAdd); initializer.registerRpc(rpcIdActorMemoryQuery, rpcActorMemoryQuery); @@ -1344,6 +1350,109 @@ function rpcVoiceSessionRequest( return JSON.stringify(response); } +function rpcRealtimeVoiceSessionRequest( + ctx: nkruntime.Context, + logger: nkruntime.Logger, + nk: nkruntime.Nakama, + payload: string +): string { + var userId = requireUserId(ctx); + var request = parseJson(payload || "{}", "realtime voice session payload"); + var voiceTarget = resolveVoiceSessionTarget(ctx, nk, userId, request); + var ttlSeconds = voiceSessionTtlSeconds(ctx, request); + var expiresAtMs = new Date().getTime() + ttlSeconds * 1000; + var sessionDescriptor = buildVoiceSessionDescriptor(nk, voiceTarget, request, ttlSeconds, expiresAtMs); + var enabled = isRealtimeVoiceEnabled(ctx) && trimString(ctx.env["DOS_AI_API_KEY"]) && dosAiRealtimeVoiceEndpoint(ctx); + var response = { + session_available: !!enabled, + provider: enabled ? "api_dos_ai_realtime_voice" : "windows_local_or_text", + reason: enabled ? "" : realtimeVoiceUnavailableReason(ctx), + actor_id: voiceTarget.actor_id, + conversation_session_id: sessionDescriptor.conversation_session_id, + session: { + session_id: sessionDescriptor.session_id, + expires_at_ms: expiresAtMs, + ttl_seconds: ttlSeconds, + audience: "unity_realtime_voice", + transport: enabled ? 
trimString(request.requested_transport || "api_dos_ai_realtime_voice") : "windows_local_or_text", + endpoint: enabled ? dosAiRealtimeVoiceEndpoint(ctx) : "", + ephemeral_token: "", + lip_sync_tiers: sessionDescriptor.lip_sync_tiers, + presentation_only: true, + authority_note: "Realtime voice can submit dialogue text only. Nakama and Fusion validate any gameplay effect." + }, + input_policy: { + accepts_text: true, + accepts_audio: !!enabled, + max_audio_ms: realtimeVoiceMaxAudioMs(ctx), + sample_rate_hz: finiteNumberOrDefault(request.sample_rate_hz, 16000), + channels: finiteNumberOrDefault(request.channels, 1), + accepted_audio_formats: ["wav_pcm16"] + }, + debug: { + source: "nakama_realtime_voice_session_rpc", + provider_status: enabled ? "configured" : "local_fallback", + fallback_mode: voiceTarget.voice_profile.fallback_mode + } + }; + logStructuredInfo(logger, "realtime_voice_session", { + owner_id: userId, + actor_id: voiceTarget.actor_id, + provider: response.provider, + session_available: response.session_available, + reason: response.reason + }); + return JSON.stringify(response); +} + +function rpcRealtimeVoiceInput( + ctx: nkruntime.Context, + logger: nkruntime.Logger, + nk: nkruntime.Nakama, + payload: string +): string { + var userId = requireUserId(ctx); + var request = parseJson(payload || "{}", "realtime voice input payload"); + var voiceTarget = resolveVoiceSessionTarget(ctx, nk, userId, request); + var inputKind = sanitizeQuestIdentifier(trimString(request.input_kind), "text"); + var text = sanitizePublicNpcSpeech(request.text); + if (inputKind === "text" && !text) { + throw new Error("realtime text input requires text"); + } + + var apiResult = tryDosAiRealtimeVoiceInput(ctx, logger, nk, userId, voiceTarget, request, inputKind, text); + var transcript = sanitizePublicNpcSpeech(apiResult.transcript || text); + var npcText = sanitizePublicNpcSpeech(apiResult.npc_text); + var response = { + accepted: apiResult.accepted, + provider: 
apiResult.provider, + reason: apiResult.reason, + conversation_session_id: trimString(apiResult.conversation_session_id || request.conversation_session_id), + transcript: transcript, + npc_actor_id: voiceTarget.actor_id, + npc_text: npcText, + fallback_to_text_chat: !apiResult.accepted && inputKind === "text", + voice_audio_base64: trimString(apiResult.voice_audio_base64), + voice_audio_format: trimString(apiResult.voice_audio_format || "pcm_s16le_24000"), + debug: { + source: "nakama_realtime_voice_input_rpc", + provider_status: apiResult.reason, + fallback_mode: voiceTarget.voice_profile.fallback_mode + } + }; + logStructuredInfo(logger, "realtime_voice_input", { + owner_id: userId, + actor_id: voiceTarget.actor_id, + input_kind: inputKind, + accepted: response.accepted, + provider: response.provider, + reason: response.reason, + transcript_length: transcript.length, + npc_text_length: npcText.length + }); + return JSON.stringify(response); +} + function logAgentDecision( logger: nkruntime.Logger, ownerId: string, @@ -8992,6 +9101,150 @@ function logVoiceSession(logger: nkruntime.Logger, ownerId: string, target: any, }); } +function isRealtimeVoiceEnabled(ctx: nkruntime.Context): boolean { + var enabled = lowercase(ctx.env["DOS_AI_REALTIME_VOICE_ENABLED"]); + return enabled === "true" || enabled === "1" || enabled === "yes"; +} + +function dosAiRealtimeVoiceEndpoint(ctx: nkruntime.Context): string { + var explicitEndpoint = trimString(ctx.env["DOS_AI_REALTIME_VOICE_URL"]); + if (explicitEndpoint) { + return explicitEndpoint; + } + var baseUrl = trimTrailingSlash(ctx.env["DOS_AI_BASE_URL"] || ""); + return baseUrl ? 
baseUrl + "/voice/realtime/turns" : ""; +} + +function realtimeVoiceUnavailableReason(ctx: nkruntime.Context): string { + if (!isRealtimeVoiceEnabled(ctx)) { + return "realtime_voice_disabled"; + } + if (!trimString(ctx.env["DOS_AI_API_KEY"])) { + return "dos_ai_unconfigured"; + } + if (!dosAiRealtimeVoiceEndpoint(ctx)) { + return "dos_ai_realtime_voice_endpoint_unconfigured"; + } + return "realtime_voice_unavailable"; +} + +function realtimeVoiceMaxAudioMs(ctx: nkruntime.Context): number { + return Math.floor(clampNumber(finiteNumberOrDefault(ctx.env["DOS_AI_REALTIME_VOICE_MAX_AUDIO_MS"], 8000), 1000, 15000)); +} + +function tryDosAiRealtimeVoiceInput( + ctx: nkruntime.Context, + logger: nkruntime.Logger, + nk: nkruntime.Nakama, + ownerId: string, + target: any, + request: any, + inputKind: string, + text: string +): any { + var endpoint = dosAiRealtimeVoiceEndpoint(ctx); + var apiKey = trimString(ctx.env["DOS_AI_API_KEY"]); + var hasAudio = !!trimString(request.audio_base64); + if (!isRealtimeVoiceEnabled(ctx) || !apiKey || !endpoint) { + if (inputKind === "text") { + return { + accepted: false, + provider: "text_fallback", + reason: realtimeVoiceUnavailableReason(ctx), + transcript: text, + npc_text: "" + }; + } + return { + accepted: false, + provider: "windows_local_or_text", + reason: realtimeVoiceUnavailableReason(ctx), + transcript: "", + npc_text: "" + }; + } + if (inputKind !== "text" && !hasAudio) { + return { + accepted: false, + provider: "api_dos_ai_realtime_voice", + reason: "missing_audio", + transcript: "", + npc_text: "" + }; + } + + var body = { + player_id: ownerId, + actor_id: target.actor_id, + body_id: target.body_id, + display_name: target.display_name, + voice_profile: target.voice_profile, + session_id: trimString(request.session_id), + conversation_session_id: trimString(request.conversation_session_id), + input_kind: inputKind, + text: text, + audio_format: sanitizeQuestIdentifier(trimString(request.audio_format), "wav_pcm16"), + 
sample_rate_hz: finiteNumberOrDefault(request.sample_rate_hz, 16000), + channels: finiteNumberOrDefault(request.channels, 1), + duration_ms: Math.floor(clampNumber(finiteNumberOrDefault(request.duration_ms, 0), 0, realtimeVoiceMaxAudioMs(ctx))), + audio_base64: trimString(request.audio_base64), + presentation_only: true, + forbidden_state_mutations: [ + "memory", + "relationship", + "quest", + "TIME", + "SECOND", + "inventory", + "combat", + "body_lifecycle" + ] + }; + + var response: any; + var startedAtMs = new Date().getTime(); + try { + response = nk.httpRequest(endpoint, "post", { + "content-type": "application/json", + "accept": "application/json", + "authorization": "Bearer " + apiKey + }, JSON.stringify(body), dosAiDecisionTimeoutMs(ctx)); + } catch (err) { + logger.info("DOS.AI realtime voice input threw: " + err); + return { + accepted: false, + provider: "api_dos_ai_realtime_voice", + reason: isTimeoutLikeError(err) ? "dos_ai_realtime_voice_timeout" : "dos_ai_realtime_voice_exception", + transcript: "", + npc_text: "" + }; + } + + if (response.code < 200 || response.code > 299) { + logger.info("DOS.AI realtime voice input failed with status " + response.code); + return { + accepted: false, + provider: "api_dos_ai_realtime_voice", + reason: "dos_ai_realtime_voice_http_" + response.code, + transcript: "", + npc_text: "" + }; + } + + var decoded = parseJsonOrNull(response.body) || {}; + return { + accepted: true, + provider: trimString(decoded.provider) || "api_dos_ai_realtime_voice", + reason: trimString(decoded.reason) || "accepted", + conversation_session_id: trimString(decoded.conversation_session_id || request.conversation_session_id), + transcript: sanitizePublicNpcSpeech(decoded.transcript || decoded.text || text), + npc_text: sanitizePublicNpcSpeech(decoded.npc_text || decoded.reply_text || decoded.say), + voice_audio_base64: trimString(decoded.voice_audio_base64 || decoded.audio_base64), + voice_audio_format: trimString(decoded.voice_audio_format 
|| decoded.audio_format), + latency_ms: elapsedSince(startedAtMs) + }; +} + function tryDosAiAgentDecision( ctx: nkruntime.Context, logger: nkruntime.Logger, diff --git a/backend/nakama/tests/supabase_custom_auth.test.mjs b/backend/nakama/tests/supabase_custom_auth.test.mjs index eb1fcdd8..81b46935 100644 --- a/backend/nakama/tests/supabase_custom_auth.test.mjs +++ b/backend/nakama/tests/supabase_custom_auth.test.mjs @@ -165,7 +165,7 @@ assert.equal( const harness = createRuntimeHarness(module); assert.equal(harness.registeredHooks.length, 1); -assert.equal(harness.registeredRpcs.size, 50); +assert.equal(harness.registeredRpcs.size, 52); assert.ok(harness.registeredRpcs.has("secondspawn_health")); assert.ok(harness.registeredRpcs.has("secondspawn_profile_get")); assert.ok(harness.registeredRpcs.has("secondspawn_memory_add")); @@ -175,6 +175,8 @@ assert.ok(harness.registeredRpcs.has("secondspawn_agent_policy_update")); assert.ok(harness.registeredRpcs.has("secondspawn_agent_return_report")); assert.ok(harness.registeredRpcs.has("secondspawn_agent_activity_add")); assert.ok(harness.registeredRpcs.has("secondspawn_voice_session_request")); +assert.ok(harness.registeredRpcs.has("secondspawn_realtime_voice_session_request")); +assert.ok(harness.registeredRpcs.has("secondspawn_realtime_voice_input")); assert.ok(harness.registeredRpcs.has("secondspawn_actor_profile_get")); assert.ok(harness.registeredRpcs.has("secondspawn_actor_memory_add")); assert.ok(harness.registeredRpcs.has("secondspawn_actor_memory_query")); @@ -308,6 +310,97 @@ assert.deepEqual(voiceRequestBody.forbidden_state_mutations, [ "combat", "body_lifecycle" ]); + +const disabledRealtimeVoiceSession = JSON.parse(harness.registeredRpcs.get("secondspawn_realtime_voice_session_request")( + { userId: "normal-player", env: defaultRuntimeEnv }, + harness.logger, + harness.nk, + JSON.stringify({ + actor_id: "npc-synthetic-sentinel-0101", + conversation_session_id: "conversation-live", + requested_transport: 
"livekit_ready" + }) +)); +assert.equal(disabledRealtimeVoiceSession.session_available, false); +assert.equal(disabledRealtimeVoiceSession.provider, "windows_local_or_text"); +assert.equal(disabledRealtimeVoiceSession.reason, "realtime_voice_disabled"); +assert.equal(disabledRealtimeVoiceSession.input_policy.accepts_text, true); +assert.equal(disabledRealtimeVoiceSession.input_policy.accepts_audio, false); + +const textFallbackRealtimeInput = JSON.parse(harness.registeredRpcs.get("secondspawn_realtime_voice_input")( + { userId: "normal-player", env: defaultRuntimeEnv }, + harness.logger, + harness.nk, + JSON.stringify({ + actor_id: "npc-synthetic-sentinel-0101", + conversation_session_id: "conversation-live", + input_kind: "text", + text: "Can you hear me?" + }) +)); +assert.equal(textFallbackRealtimeInput.accepted, false); +assert.equal(textFallbackRealtimeInput.provider, "text_fallback"); +assert.equal(textFallbackRealtimeInput.fallback_to_text_chat, true); +assert.equal(textFallbackRealtimeInput.transcript, "Can you hear me?"); +assert.equal(textFallbackRealtimeInput.npc_text, ""); + +const enabledRealtimeVoiceEnv = { + ...defaultRuntimeEnv, + DOS_AI_REALTIME_VOICE_ENABLED: "true", + DOS_AI_API_KEY: "test-dos-ai-key", + DOS_AI_REALTIME_VOICE_URL: "https://api.dos.ai/v1/voice/realtime/turns" +}; +const realtimeVoiceHarness = createRuntimeHarness(module, enabledRealtimeVoiceEnv); +let realtimeVoiceRequestBody = null; +realtimeVoiceHarness.nk.httpRequest = (url, method, headers, body, timeout) => { + realtimeVoiceRequestBody = JSON.parse(body); + assert.equal(url, "https://api.dos.ai/v1/voice/realtime/turns"); + assert.equal(method, "post"); + assert.equal(headers.authorization, "Bearer test-dos-ai-key"); + assert.equal(timeout, 8000); + return { + code: 200, + body: JSON.stringify({ + provider: "gemini_realtime_voice", + transcript: "Cho tôi hỏi đường.", + npc_text: "Gate is open, but stay close to the relay lights.", + conversation_session_id: "conversation-live" 
+ }) + }; +}; +const acceptedRealtimeVoiceInput = JSON.parse(realtimeVoiceHarness.registeredRpcs.get("secondspawn_realtime_voice_input")( + { userId: "normal-player", env: enabledRealtimeVoiceEnv }, + realtimeVoiceHarness.logger, + realtimeVoiceHarness.nk, + JSON.stringify({ + actor_id: "npc-synthetic-sentinel-0101", + conversation_session_id: "conversation-live", + input_kind: "microphone", + audio_format: "wav_pcm16", + sample_rate_hz: 16000, + channels: 1, + duration_ms: 1200, + audio_base64: "UklGRg==" + }) +)); +assert.equal(acceptedRealtimeVoiceInput.accepted, true); +assert.equal(acceptedRealtimeVoiceInput.provider, "gemini_realtime_voice"); +assert.equal(acceptedRealtimeVoiceInput.transcript, "Cho tôi hỏi đường."); +assert.equal(acceptedRealtimeVoiceInput.npc_actor_id, "npc-synthetic-sentinel-0101"); +assert.equal(acceptedRealtimeVoiceInput.npc_text, "Gate is open, but stay close to the relay lights."); +assert.equal(realtimeVoiceRequestBody.actor_id, "npc-synthetic-sentinel-0101"); +assert.equal(realtimeVoiceRequestBody.input_kind, "microphone"); +assert.equal(realtimeVoiceRequestBody.audio_format, "wav_pcm16"); +assert.deepEqual(realtimeVoiceRequestBody.forbidden_state_mutations, [ + "memory", + "relationship", + "quest", + "TIME", + "SECOND", + "inventory", + "combat", + "body_lifecycle" +]); assert.doesNotMatch(JSON.stringify(mintedVoiceSession), /test-dos-ai-key|DOS_AI_API_KEY/i); assert.throws( diff --git a/docs/design/56-focused-npc-dialogue-portrait-lipsync-design.md b/docs/design/56-focused-npc-dialogue-portrait-lipsync-design.md index 74b9e5b5..2f1442f8 100644 --- a/docs/design/56-focused-npc-dialogue-portrait-lipsync-design.md +++ b/docs/design/56-focused-npc-dialogue-portrait-lipsync-design.md @@ -225,6 +225,10 @@ The first implementation uses local presentation components: server-minted realtime session material, submits typed text or WAV PCM mic clips to Nakama, routes returned transcripts through the existing dialogue path, and falls back 
honestly when the backend RPC is not loaded. +- `PrototypeWindowsSpeechBridge`: development-only Windows Editor fallback for + local voice testing. It uses Windows dictation for player microphone + transcripts and SAPI WAV synthesis for NPC voice playback while cloud voice + sessions are unconfigured. - `SecondSpawnFacialBlendshapeReportUtility`: editor-only reporting for selected characters and generated visual prefabs. Agents use it to inspect real imported `SkinnedMeshRenderer` blendshape names before approving an @@ -384,6 +388,8 @@ Build: dialogue path so memory, relationship, quest, and rate-limit rules stay server-owned. - Show an honest local fallback when the realtime voice RPC is not deployed. +- In Windows Editor, allow local OS speech fallback so agents can verify a real + speak/listen loop before cloud voice credentials exist. Evidence: From 25866044e93cc2324251501a73f62e4d9faad566 Mon Sep 17 00:00:00 2001 From: JOY Date: Tue, 26 May 2026 23:14:05 +0700 Subject: [PATCH 7/7] feat: play realtime NPC voice audio --- .../Scripts/AI/AgentContextDto.cs | 4 + .../Scripts/AI/PrototypeNearbyNpcChatBox.cs | 41 ++- .../AI/PrototypeNpcRealtimeVoiceClient.cs | 25 +- .../Scripts/AI/PrototypeNpcVoicePresenter.cs | 273 ++++++++++++++++++ .../tests/supabase_custom_auth.test.mjs | 4 + ...ed-npc-dialogue-portrait-lipsync-design.md | 7 +- 6 files changed, 348 insertions(+), 6 deletions(-) diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/AgentContextDto.cs b/Unity/Assets/_SecondSpawn/Scripts/AI/AgentContextDto.cs index 6707daa7..e39d9799 100644 --- a/Unity/Assets/_SecondSpawn/Scripts/AI/AgentContextDto.cs +++ b/Unity/Assets/_SecondSpawn/Scripts/AI/AgentContextDto.cs @@ -1512,6 +1512,10 @@ public sealed class RealtimeVoiceInputResponseDto public string transcript; public string npc_actor_id; public string npc_text; + public string voice_audio_base64; + public string voice_audio_format; + public int voice_sample_rate_hz; + public int voice_channels; public bool 
fallback_to_text_chat; public VoiceSessionDebugDto debug; } diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNearbyNpcChatBox.cs b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNearbyNpcChatBox.cs index 1edafc35..03e7b06e 100644 --- a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNearbyNpcChatBox.cs +++ b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNearbyNpcChatBox.cs @@ -322,7 +322,26 @@ public void SubmitRealtimeVoiceTranscript(string transcript, string source) StartCoroutine(SendNearbyMessage(transcript.Trim(), ResolveFocusedNpcRecipient())); } - public void PresentRealtimeNpcResponse(string actorId, string text, string conversationSessionId) + public void DisplayRealtimeVoiceTranscript(string transcript, string source) + { + if (string.IsNullOrWhiteSpace(transcript)) + { + return; + } + + var safeSource = string.IsNullOrWhiteSpace(source) ? "voice" : source.Trim(); + AddHistory(SafeDisplayName(), transcript.Trim(), true); + _status = $"Realtime {safeSource} transcript received."; + } + + public void PresentRealtimeNpcResponse( + string actorId, + string text, + string conversationSessionId, + string voiceAudioBase64 = "", + string voiceAudioFormat = "", + int voiceSampleRateHz = 0, + int voiceChannels = 0) { if (string.IsNullOrWhiteSpace(text)) { @@ -344,7 +363,25 @@ public void PresentRealtimeNpcResponse(string actorId, string text, string conve var brain = ResolveBrain(safeActorId); TryAddFocusedNpcSpeech(safeActorId, brain != null ? brain.DisplayName : FocusedNpcDisplayName, text); var presenter = brain != null ? 
brain.GetComponent() : null; - presenter?.PresentSpeech(safeActorId, ActiveConversationSessionId, text, _gateway); + if (presenter == null) + { + return; + } + + if (!string.IsNullOrWhiteSpace(voiceAudioBase64)) + { + presenter.PresentRealtimeAudio( + safeActorId, + ActiveConversationSessionId, + text, + voiceAudioBase64, + voiceAudioFormat, + voiceSampleRateHz, + voiceChannels); + return; + } + + presenter.PresentSpeech(safeActorId, ActiveConversationSessionId, text, _gateway); } public void RememberRealtimeVoiceConversationSession(string conversationSessionId) diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcRealtimeVoiceClient.cs b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcRealtimeVoiceClient.cs index 36e7c689..f8152128 100644 --- a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcRealtimeVoiceClient.cs +++ b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcRealtimeVoiceClient.cs @@ -329,14 +329,33 @@ private void ApplyInputResponse(RealtimeVoiceInputResponseDto response, string e _chat.RememberRealtimeVoiceConversationSession(response.conversation_session_id); } + var hasDirectNpcResponse = !string.IsNullOrWhiteSpace(response.npc_text); if (!string.IsNullOrWhiteSpace(response.transcript)) { - _chat.SubmitRealtimeVoiceTranscript(response.transcript, "microphone"); + if (hasDirectNpcResponse) + { + _chat.DisplayRealtimeVoiceTranscript(response.transcript, "microphone"); + } + else + { + _chat.SubmitRealtimeVoiceTranscript(response.transcript, "microphone"); + } + } + else if (hasDirectNpcResponse && !string.IsNullOrWhiteSpace(textFallback)) + { + _chat.DisplayRealtimeVoiceTranscript(textFallback, "text"); } - if (!string.IsNullOrWhiteSpace(response.npc_text)) + if (hasDirectNpcResponse) { - _chat.PresentRealtimeNpcResponse(response.npc_actor_id, response.npc_text, response.conversation_session_id); + _chat.PresentRealtimeNpcResponse( + response.npc_actor_id, + response.npc_text, + response.conversation_session_id, + 
response.voice_audio_base64, + response.voice_audio_format, + response.voice_sample_rate_hz, + response.voice_channels); } _status = response.accepted diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcVoicePresenter.cs b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcVoicePresenter.cs index 2599662c..93708079 100644 --- a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcVoicePresenter.cs +++ b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcVoicePresenter.cs @@ -1,3 +1,4 @@ +using System; using System.Collections; using UnityEngine; using UnityEngine.Networking; @@ -67,6 +68,35 @@ public void PresentFallbackSpeech(string text) PlayFallback(text, "fallback_only"); } + public void PresentRealtimeAudio( + string actorId, + string conversationSessionId, + string text, + string audioBase64, + string audioFormat, + int sampleRateHz, + int channels) + { + if (string.IsNullOrWhiteSpace(text) && string.IsNullOrWhiteSpace(audioBase64)) + { + return; + } + + if (_presentationLoop != null) + { + StopCoroutine(_presentationLoop); + } + + _presentationLoop = StartCoroutine(PresentRealtimeAudioLoop( + actorId, + conversationSessionId, + text, + audioBase64, + audioFormat, + sampleRateHz, + channels)); + } + public void StopPresentation() { if (_presentationLoop != null) @@ -80,6 +110,34 @@ public void StopPresentation() LastPresentationMode = "idle"; } + private IEnumerator PresentRealtimeAudioLoop( + string actorId, + string conversationSessionId, + string text, + string audioBase64, + string audioFormat, + int sampleRateHz, + int channels) + { + LastVoiceProvider = "api.dos.ai_realtime_voice"; + LastVoiceSessionId = conversationSessionId ?? 
""; + LastVoiceReason = ""; + BuildLineId(actorId, text); + + if (!TryBuildAudioClipFromBase64(audioBase64, audioFormat, sampleRateHz, channels, out var clip, out var reason)) + { + PlayFallback(text, reason); + _presentationLoop = null; + yield break; + } + + var duration = _voiceCue.PlayClip(clip, _clipVolume); + _facialDriver.BeginAudioSpeech(_voiceCue.OutputSource, text, duration); + LastPresentationMode = "realtime_voice_audio_with_blendshape"; + yield return null; + _presentationLoop = null; + } + private IEnumerator PresentSpeechLoop(string actorId, string conversationSessionId, string text, SecondSpawnGatewayClient gateway) { yield return PresentSpeechCore(actorId, conversationSessionId, text, gateway); @@ -252,6 +310,221 @@ private static AudioType ResolveAudioType(string endpoint) return AudioType.UNKNOWN; } + private static bool TryBuildAudioClipFromBase64( + string audioBase64, + string audioFormat, + int sampleRateHz, + int channels, + out AudioClip clip, + out string reason) + { + clip = null; + reason = ""; + if (string.IsNullOrWhiteSpace(audioBase64)) + { + reason = "realtime_voice_audio_missing"; + return false; + } + + byte[] bytes; + try + { + bytes = Convert.FromBase64String(audioBase64.Trim()); + } + catch (FormatException) + { + reason = "realtime_voice_audio_base64_invalid"; + return false; + } + + if (bytes.Length < 2) + { + reason = "realtime_voice_audio_empty"; + return false; + } + + if (LooksLikeWav(bytes)) + { + return TryBuildWavClip(bytes, out clip, out reason); + } + + var safeRate = sampleRateHz > 0 ? sampleRateHz : ParseSampleRate(audioFormat, 24000); + var safeChannels = channels > 0 ? 
channels : 1; + return TryBuildPcm16Clip(bytes, 0, bytes.Length, safeRate, safeChannels, "SecondSpawnRealtimeVoicePcm16", out clip, out reason); + } + + private static bool TryBuildWavClip(byte[] bytes, out AudioClip clip, out string reason) + { + clip = null; + reason = ""; + if (!LooksLikeWav(bytes)) + { + reason = "realtime_voice_wav_header_invalid"; + return false; + } + + var offset = 12; + var formatCode = 0; + var channels = 0; + var sampleRate = 0; + var bitsPerSample = 0; + var dataOffset = -1; + var dataSize = 0; + + while (offset + 8 <= bytes.Length) + { + var chunkId = ReadFourCc(bytes, offset); + var chunkSize = ReadInt32LE(bytes, offset + 4); + var chunkDataOffset = offset + 8; + if (chunkSize < 0 || chunkDataOffset + chunkSize > bytes.Length) + { + reason = "realtime_voice_wav_chunk_invalid"; + return false; + } + + if (chunkId == "fmt " && chunkSize >= 16) + { + formatCode = ReadInt16LE(bytes, chunkDataOffset); + channels = ReadInt16LE(bytes, chunkDataOffset + 2); + sampleRate = ReadInt32LE(bytes, chunkDataOffset + 4); + bitsPerSample = ReadInt16LE(bytes, chunkDataOffset + 14); + } + else if (chunkId == "data") + { + dataOffset = chunkDataOffset; + dataSize = chunkSize; + } + + offset = chunkDataOffset + chunkSize + (chunkSize & 1); + } + + if (formatCode != 1 || bitsPerSample != 16) + { + reason = "realtime_voice_wav_format_not_pcm16"; + return false; + } + + return TryBuildPcm16Clip(bytes, dataOffset, dataSize, sampleRate, channels, "SecondSpawnRealtimeVoiceWav", out clip, out reason); + } + + private static bool TryBuildPcm16Clip( + byte[] bytes, + int offset, + int byteCount, + int sampleRate, + int channels, + string clipName, + out AudioClip clip, + out string reason) + { + clip = null; + reason = ""; + if (offset < 0 || byteCount <= 0 || offset + byteCount > bytes.Length) + { + reason = "realtime_voice_pcm_range_invalid"; + return false; + } + + if (sampleRate < 8000 || channels <= 0 || channels > 2) + { + reason = 
"realtime_voice_pcm_layout_invalid"; + return false; + } + + var sampleCount = byteCount / 2; + var frameCount = sampleCount / channels; + if (frameCount <= 0) + { + reason = "realtime_voice_pcm_empty"; + return false; + } + + var audio = new float[frameCount * channels]; + for (var index = 0; index < audio.Length; index++) + { + var byteIndex = offset + index * 2; + var pcm = (short)(bytes[byteIndex] | (bytes[byteIndex + 1] << 8)); + audio[index] = Mathf.Clamp(pcm / 32768f, -1f, 1f); + } + + clip = AudioClip.Create(clipName, frameCount, channels, sampleRate, false); + clip.SetData(audio, 0); + return true; + } + + private static bool LooksLikeWav(byte[] bytes) + { + return bytes.Length >= 12 && + bytes[0] == 'R' && + bytes[1] == 'I' && + bytes[2] == 'F' && + bytes[3] == 'F' && + bytes[8] == 'W' && + bytes[9] == 'A' && + bytes[10] == 'V' && + bytes[11] == 'E'; + } + + private static int ParseSampleRate(string audioFormat, int fallback) + { + if (string.IsNullOrWhiteSpace(audioFormat)) + { + return fallback; + } + + var lastNumber = 0; + var current = 0; + var hasCurrent = false; + for (var index = 0; index < audioFormat.Length; index++) + { + var character = audioFormat[index]; + if (character >= '0' && character <= '9') + { + hasCurrent = true; + current = current * 10 + character - '0'; + continue; + } + + if (hasCurrent) + { + lastNumber = current; + current = 0; + hasCurrent = false; + } + } + + if (hasCurrent) + { + lastNumber = current; + } + + return lastNumber >= 8000 ? 
lastNumber : fallback; + } + + private static string ReadFourCc(byte[] bytes, int offset) + { + return new string(new[] + { + (char)bytes[offset], + (char)bytes[offset + 1], + (char)bytes[offset + 2], + (char)bytes[offset + 3] + }); + } + + private static int ReadInt16LE(byte[] bytes, int offset) + { + return bytes[offset] | (bytes[offset + 1] << 8); + } + + private static int ReadInt32LE(byte[] bytes, int offset) + { + return bytes[offset] | + (bytes[offset + 1] << 8) | + (bytes[offset + 2] << 16) | + (bytes[offset + 3] << 24); + } + private static string FirstNonEmpty(string value, string fallback) { return string.IsNullOrWhiteSpace(value) ? fallback : value; diff --git a/backend/nakama/tests/supabase_custom_auth.test.mjs b/backend/nakama/tests/supabase_custom_auth.test.mjs index 81b46935..eaf3ac85 100644 --- a/backend/nakama/tests/supabase_custom_auth.test.mjs +++ b/backend/nakama/tests/supabase_custom_auth.test.mjs @@ -364,6 +364,8 @@ realtimeVoiceHarness.nk.httpRequest = (url, method, headers, body, timeout) => { provider: "gemini_realtime_voice", transcript: "Cho tôi hỏi đường.", npc_text: "Gate is open, but stay close to the relay lights.", + voice_audio_base64: "AAAAAA==", + voice_audio_format: "pcm_s16le_24000", conversation_session_id: "conversation-live" }) }; @@ -388,6 +390,8 @@ assert.equal(acceptedRealtimeVoiceInput.provider, "gemini_realtime_voice"); assert.equal(acceptedRealtimeVoiceInput.transcript, "Cho tôi hỏi đường."); assert.equal(acceptedRealtimeVoiceInput.npc_actor_id, "npc-synthetic-sentinel-0101"); assert.equal(acceptedRealtimeVoiceInput.npc_text, "Gate is open, but stay close to the relay lights."); +assert.equal(acceptedRealtimeVoiceInput.voice_audio_base64, "AAAAAA=="); +assert.equal(acceptedRealtimeVoiceInput.voice_audio_format, "pcm_s16le_24000"); assert.equal(realtimeVoiceRequestBody.actor_id, "npc-synthetic-sentinel-0101"); assert.equal(realtimeVoiceRequestBody.input_kind, "microphone"); 
assert.equal(realtimeVoiceRequestBody.audio_format, "wav_pcm16"); diff --git a/docs/design/56-focused-npc-dialogue-portrait-lipsync-design.md b/docs/design/56-focused-npc-dialogue-portrait-lipsync-design.md index 2f1442f8..f38c8de0 100644 --- a/docs/design/56-focused-npc-dialogue-portrait-lipsync-design.md +++ b/docs/design/56-focused-npc-dialogue-portrait-lipsync-design.md @@ -384,9 +384,14 @@ Build: - Request a scoped realtime voice session from Nakama before submitting audio. - Keep the contract LiveKit-ready without importing the LiveKit Unity SDK until the backend room/token lane is available. -- Route returned speech transcripts back through the normal player-to-NPC +- Route transcript-only voice responses back through the normal player-to-NPC dialogue path so memory, relationship, quest, and rate-limit rules stay server-owned. +- When the realtime provider returns a complete NPC turn, play + `voice_audio_base64` directly in Unity and drive the existing audio-amplitude + facial hook from that clip instead of requesting a second TTS session. +- Support `pcm_s16le_<sample_rate_hz>` and WAV PCM16 response payloads for the MVP + bridge. Provider viseme or blendshape streams remain the later D3 tier. - Show an honest local fallback when the realtime voice RPC is not deployed. - In Windows Editor, allow local OS speech fallback so agents can verify a real speak/listen loop before cloud voice credentials exist.