/// <summary>
/// Editor menu commands that log which blendshapes exist on character objects, so the
/// facial-animation blendshape names can be checked without opening each mesh by hand.
/// </summary>
public static class SecondSpawnFacialBlendshapeReportUtility
{
    /// <summary>
    /// Logs a blendshape report for the GameObject currently selected in the editor.
    /// Logs a warning and does nothing else when nothing is selected.
    /// </summary>
    [MenuItem("Second Spawn/Art/Report Selected Facial Blendshapes")]
    public static void ReportSelectedFacialBlendshapes()
    {
        var selected = Selection.activeGameObject;
        if (selected == null)
        {
            Debug.LogWarning("[SecondSpawnFacialBlendshapeReportUtility] Select a character prefab or scene object first.");
            return;
        }

        Debug.Log(BuildReport(selected));
    }

    /// <summary>
    /// Logs one combined report covering every variant in <c>VisualPrefabCatalog</c>.
    /// For each variant the clean asset path is tried first and the source asset path second;
    /// a variant whose prefab cannot be loaded from either path is reported as missing.
    /// </summary>
    [MenuItem("Second Spawn/Art/Report Generated Visual Facial Blendshapes")]
    public static void ReportGeneratedVisualFacialBlendshapes()
    {
        var builder = new StringBuilder();
        builder.AppendLine("[SecondSpawnFacialBlendshapeReportUtility] Generated visual facial blendshape report");
        for (var variant = 0; variant < VisualPrefabCatalog.Count; variant++)
        {
            // The generic overload is required: LoadAssetAtPath has no single-argument
            // non-generic form, and BuildReport needs a GameObject.
            var path = VisualPrefabCatalog.GetCleanAssetPath(variant);
            var prefab = AssetDatabase.LoadAssetAtPath<GameObject>(path);
            if (prefab == null)
            {
                path = VisualPrefabCatalog.GetSourceAssetPath(variant);
                prefab = AssetDatabase.LoadAssetAtPath<GameObject>(path);
            }

            builder.AppendLine($"Variant {variant:00}: {VisualPrefabCatalog.GetLabel(variant)}");
            builder.AppendLine(prefab == null
                ? $" missing prefab at {path}"
                : Indent(BuildReport(prefab), " "));
        }

        Debug.Log(builder.ToString());
    }

    /// <summary>
    /// Builds a text report listing every SkinnedMeshRenderer under <paramref name="root"/>
    /// (inactive children included) whose shared mesh has at least one blendshape,
    /// followed by the index and name of each blendshape.
    /// </summary>
    /// <param name="root">Object whose hierarchy is scanned.</param>
    /// <returns>The multi-line report text.</returns>
    private static string BuildReport(GameObject root)
    {
        var builder = new StringBuilder();
        builder.AppendLine($"Facial blendshape report for {root.name}");

        var renderers = root.GetComponentsInChildren<SkinnedMeshRenderer>(includeInactive: true);
        if (renderers.Length == 0)
        {
            builder.AppendLine("No SkinnedMeshRenderer found.");
            return builder.ToString();
        }

        var anyBlendshapes = false;
        foreach (var renderer in renderers)
        {
            if (renderer == null || renderer.sharedMesh == null || renderer.sharedMesh.blendShapeCount <= 0)
            {
                continue;
            }

            anyBlendshapes = true;
            var mesh = renderer.sharedMesh;
            builder.AppendLine($"{renderer.name} | mesh={mesh.name} | blendshapes={mesh.blendShapeCount}");
            for (var index = 0; index < mesh.blendShapeCount; index++)
            {
                builder.AppendLine($" {index:00}: {mesh.GetBlendShapeName(index)}");
            }
        }

        if (!anyBlendshapes)
        {
            builder.AppendLine("No blendshape-enabled renderer found.");
        }

        return builder.ToString();
    }

    /// <summary>
    /// Prefixes every line of <paramref name="value"/> with <paramref name="prefix"/>.
    /// Trailing whitespace is removed first; a null or whitespace-only value yields "".
    /// </summary>
    private static string Indent(string value, string prefix)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return "";
        }

        return prefix + value.TrimEnd().Replace("\n", "\n" + prefix);
    }
}
/// <summary>
/// Request payload for opening a realtime voice session for one actor.
/// Public fields (not properties) with snake_case names are kept as-is; the field
/// initializers below are the values sent when the caller does not override them.
/// </summary>
[Serializable]
public sealed class RealtimeVoiceSessionRequestDto
{
    // Who the session is for.
    public string actor_id;
    public string conversation_session_id;

    // Requested transport / input capabilities.
    public string requested_transport = "livekit_ready";
    public string input_mode = "text_or_microphone";
    public int ttl_seconds = 120;
    public bool text_input_supported = true;
    public bool microphone_input_supported = true;

    // Audio capture format offered by the client.
    public int sample_rate_hz = 16000;
    public int channels = 1;

    // Client / provider hints.
    public string client_platform = "unity";
    public string provider_hint = "gemini_live_or_tts";
}

/// <summary>
/// Response to a realtime voice session request: availability flag, provider and reason
/// strings, the echoed actor/conversation ids, and nested session, input-policy and debug data.
/// </summary>
[Serializable]
public sealed class RealtimeVoiceSessionDto
{
    public bool session_available;
    public string provider;
    public string reason;
    public string actor_id;
    public string conversation_session_id;
    public VoiceSessionMaterialDto session;
    public RealtimeVoiceInputPolicyDto input_policy;
    public VoiceSessionDebugDto debug;
}

/// <summary>
/// Input limits attached to a realtime voice session: which input kinds are accepted
/// and the audio constraints (maximum length in milliseconds, sample rate, channels, formats).
/// </summary>
[Serializable]
public sealed class RealtimeVoiceInputPolicyDto
{
    public bool accepts_text;
    public bool accepts_audio;
    public int max_audio_ms;
    public int sample_rate_hz;
    public int channels;
    public string[] accepted_audio_formats;
}

/// <summary>
/// One unit of realtime input. Carries either <see cref="text"/> or base64-encoded audio
/// together with its format description; <see cref="input_kind"/> names which one was sent.
/// </summary>
[Serializable]
public sealed class RealtimeVoiceInputRequestDto
{
    // Routing / identification.
    public string client_event_id;
    public string session_id;
    public string actor_id;
    public string conversation_session_id;

    // Payload selector and text payload.
    public string input_kind;
    public string text;

    // Audio payload and its format.
    public string audio_format;
    public int sample_rate_hz;
    public int channels;
    public int duration_ms;
    public string audio_base64;
}

/// <summary>
/// Result of submitting realtime input: acceptance flag, transcript of the input,
/// the NPC reply text, and optional base64 reply audio with its format description.
/// </summary>
[Serializable]
public sealed class RealtimeVoiceInputResponseDto
{
    // Outcome.
    public bool accepted;
    public string provider;
    public string reason;
    public string conversation_session_id;

    // What was heard and what the NPC answered.
    public string transcript;
    public string npc_actor_id;
    public string npc_text;

    // Optional reply audio.
    public string voice_audio_base64;
    public string voice_audio_format;
    public int voice_sample_rate_hz;
    public int voice_channels;

    // NOTE(review): name suggests the client should route through regular text chat when set — confirm against the backend contract.
    public bool fallback_to_text_chat;
    public VoiceSessionDebugDto debug;
}
/// <summary>
/// Drives prototype facial animation through blendshapes on a single SkinnedMeshRenderer:
/// jaw-open / mouth-funnel weights while speech is active, plus periodic eye blinks.
/// While an assigned AudioSource is playing, mouth weight follows the RMS of its output
/// samples; otherwise it follows a sine pulse for the duration of the speech timer.
/// </summary>
[DisallowMultipleComponent]
public sealed class PrototypeFacialAnimationDriver : MonoBehaviour
{
    // Target renderer; when left empty and _autoFindRenderer is set, one is searched in children.
    [SerializeField] private SkinnedMeshRenderer _faceRenderer;
    [SerializeField] private bool _autoFindRenderer = true;

    // Blendshape weights (0-100) reached at full mouth / blink intensity.
    [SerializeField, Range(0f, 100f)] private float _maxJawOpenWeight = 55f;
    [SerializeField, Range(0f, 100f)] private float _maxFunnelWeight = 22f;
    [SerializeField, Range(0f, 100f)] private float _maxBlinkWeight = 85f;

    // Timing: pulse cycles per second for text-timed speech, extra hold after the speech
    // timer ends, base blink interval, and how often a missing renderer is searched again.
    [SerializeField, Min(0.1f)] private float _textPulseRate = 7.5f;
    [SerializeField, Min(0f)] private float _speechHoldSeconds = 0.12f;
    [SerializeField, Min(0.25f)] private float _blinkIntervalSeconds = 4.5f;
    [SerializeField, Min(0.1f)] private float _resolveRetrySeconds = 1f;

    // Candidate blendshape names, tried in order (exact match first, then normalized match).
    [SerializeField] private string[] _jawOpenBlendShapes = { "jawOpen", "JawOpen", "MouthOpen", "mouthOpen", "Mouth_Open" };
    [SerializeField] private string[] _mouthFunnelBlendShapes = { "mouthFunnel", "MouthFunnel", "Mouth_Funnel", "mouthPucker", "MouthPucker" };
    [SerializeField] private string[] _leftBlinkBlendShapes = { "eyeBlinkLeft", "EyeBlinkLeft", "Blink_Left", "LeftEyeBlink" };
    [SerializeField] private string[] _rightBlinkBlendShapes = { "eyeBlinkRight", "EyeBlinkRight", "Blink_Right", "RightEyeBlink" };

    private readonly float[] _audioSamples = new float[64];
    private AudioSource _speechSource;
    private string _speechText;
    private float _speechStartedAt;
    private float _speechEndsAt; // 0 means "no speech requested".
    private float _nextBlinkAt;
    private float _blinkStartedAt = -1f; // negative means "not blinking".
    private int _jawOpenIndex = -1;
    private int _mouthFunnelIndex = -1;
    private int _leftBlinkIndex = -1;
    private int _rightBlinkIndex = -1;
    private bool _indicesResolved;
    private float _nextResolveAt;
    private float _lastMouthWeight;
    private string _jawOpenName = "";
    private string _mouthFunnelName = "";
    private string _leftBlinkName = "";
    private string _rightBlinkName = "";

    /// <summary>True when at least one mouth or blink blendshape was resolved.</summary>
    public bool HasBlendshapeTargets
    {
        get
        {
            EnsureResolved();
            return _jawOpenIndex >= 0 || _mouthFunnelIndex >= 0 || _leftBlinkIndex >= 0 || _rightBlinkIndex >= 0;
        }
    }

    /// <summary>True when a jaw-open or mouth-funnel blendshape was resolved.</summary>
    public bool HasMouthTargets
    {
        get
        {
            EnsureResolved();
            return _jawOpenIndex >= 0 || _mouthFunnelIndex >= 0;
        }
    }

    /// <summary>True when a left or right blink blendshape was resolved.</summary>
    public bool HasBlinkTargets
    {
        get
        {
            EnsureResolved();
            return _leftBlinkIndex >= 0 || _rightBlinkIndex >= 0;
        }
    }

    /// <summary>Name of the renderer being driven, or "" when none is available.</summary>
    public string TargetRendererName
    {
        get
        {
            EnsureResolved();
            return _faceRenderer != null ? _faceRenderer.name : "";
        }
    }

    /// <summary>One-line description of the resolved renderer and blendshape names ("none" for missing ones).</summary>
    public string TargetSummary
    {
        get
        {
            EnsureResolved();
            return $"renderer={Fallback(TargetRendererName, "none")}, jaw={Fallback(_jawOpenName, "none")}, funnel={Fallback(_mouthFunnelName, "none")}, blinkL={Fallback(_leftBlinkName, "none")}, blinkR={Fallback(_rightBlinkName, "none")}";
        }
    }

    /// <summary>Last normalized (0-1) mouth weight that was applied.</summary>
    public float LastMouthWeight => _lastMouthWeight;

    private void Awake()
    {
        EnsureResolved();
        ScheduleNextBlink();
    }

    private void Update()
    {
        EnsureResolved();
        TickSpeech();
        TickBlink();
    }

    /// <summary>Starts text-timed mouth movement.</summary>
    /// <param name="text">Spoken text; its length sets the duration when none is given.</param>
    /// <param name="durationSeconds">Explicit duration; values &lt;= 0 select the text-based estimate.</param>
    public void BeginTextSpeech(string text, float durationSeconds = 0f)
    {
        _speechSource = null;
        _speechText = text ?? "";
        BeginSpeech(durationSeconds);
    }

    /// <summary>
    /// Starts mouth movement that follows <paramref name="source"/> while it is playing and
    /// falls back to the text-timed pulse for the rest of the speech timer.
    /// </summary>
    public void BeginAudioSpeech(AudioSource source, string fallbackText, float durationSeconds = 0f)
    {
        _speechSource = source;
        _speechText = fallbackText ?? "";
        BeginSpeech(durationSeconds);
    }

    /// <summary>Ends any active speech and closes the mouth immediately.</summary>
    public void StopSpeech()
    {
        _speechSource = null;
        _speechText = "";
        _speechEndsAt = 0f;
        ApplyMouth(0f);
    }

    // Starts the speech timer. Without an explicit duration it is estimated at 0.035 s per
    // character (8 characters assumed for empty text), limited to 0.35-4.5 s.
    private void BeginSpeech(float durationSeconds)
    {
        EnsureResolved();
        _speechStartedAt = Time.time;
        var characterCount = _speechText.Length <= 0 ? 8 : _speechText.Length;
        var fallbackDuration = Mathf.Clamp(characterCount * 0.035f, 0.35f, 4.5f);
        _speechEndsAt = Time.time + Mathf.Max(0.1f, durationSeconds > 0f ? durationSeconds : fallbackDuration);
    }

    private void TickSpeech()
    {
        if (_faceRenderer == null || _faceRenderer.sharedMesh == null)
        {
            return;
        }

        var speakingFromAudio = _speechSource != null && _speechSource.isPlaying;
        // _speechEndsAt is 0 before the first speech and after StopSpeech; without this check
        // the mouth would animate whenever Time.time is still within the hold window.
        var speakingFromTimer = _speechEndsAt > 0f && Time.time <= _speechEndsAt + _speechHoldSeconds;
        if (!speakingFromAudio && !speakingFromTimer)
        {
            // Close the mouth once, then leave the blendshapes alone while idle.
            if (_lastMouthWeight > 0f)
            {
                ApplyMouth(0f);
            }

            return;
        }

        var mouth = speakingFromAudio ? ReadAudioMouthWeight() : ReadTextMouthWeight();
        ApplyMouth(mouth);
    }

    // Mouth weight from the RMS of the source's current output samples, scaled by 18 and
    // limited to 0-1. Falls back to the text pulse when the samples cannot be read.
    private float ReadAudioMouthWeight()
    {
        if (_speechSource == null)
        {
            return ReadTextMouthWeight();
        }

        try
        {
            _speechSource.GetOutputData(_audioSamples, 0);
        }
        catch (Exception)
        {
            // Deliberate best effort: lip movement must never break speech playback.
            return ReadTextMouthWeight();
        }

        var total = 0f;
        for (var i = 0; i < _audioSamples.Length; i++)
        {
            total += _audioSamples[i] * _audioSamples[i];
        }

        var rms = Mathf.Sqrt(total / _audioSamples.Length);
        return Mathf.Clamp01(rms * 18f);
    }

    // Sine pulse between 0.25 and 1 at _textPulseRate cycles per second, softened to 65 %
    // when the text ends with '.', ',' or '?'.
    private float ReadTextMouthWeight()
    {
        var age = Mathf.Max(0f, Time.time - _speechStartedAt);
        var pulse = Mathf.Sin(age * _textPulseRate * Mathf.PI * 2f) * 0.5f + 0.5f;
        var punctuationSoftener = EndsWithSoftPunctuation(_speechText) ? 0.65f : 1f;
        return Mathf.Clamp01((0.25f + pulse * 0.75f) * punctuationSoftener);
    }

    // Applies a normalized mouth weight; the same clamped value is both recorded and applied.
    private void ApplyMouth(float normalizedWeight)
    {
        var clamped = Mathf.Clamp01(normalizedWeight);
        _lastMouthWeight = clamped;
        SetWeight(_jawOpenIndex, clamped * _maxJawOpenWeight);
        SetWeight(_mouthFunnelIndex, clamped * _maxFunnelWeight);
    }

    private void TickBlink()
    {
        if (_faceRenderer == null || _faceRenderer.sharedMesh == null)
        {
            return;
        }

        if (_blinkStartedAt < 0f && Time.time >= _nextBlinkAt)
        {
            _blinkStartedAt = Time.time;
        }

        if (_blinkStartedAt < 0f)
        {
            return;
        }

        const float blinkDuration = 0.16f;
        var age = Time.time - _blinkStartedAt;
        if (age >= blinkDuration)
        {
            SetWeight(_leftBlinkIndex, 0f);
            SetWeight(_rightBlinkIndex, 0f);
            _blinkStartedAt = -1f;
            ScheduleNextBlink();
            return;
        }

        // Half sine over the blink duration: closes and reopens smoothly.
        var blinkWeight = Mathf.Sin(age / blinkDuration * Mathf.PI) * _maxBlinkWeight;
        SetWeight(_leftBlinkIndex, blinkWeight);
        SetWeight(_rightBlinkIndex, blinkWeight);
    }

    // Resolves the renderer and blendshape indices. Once resolved with a live renderer this is
    // a no-op; while no renderer exists the lookup is repeated at most every _resolveRetrySeconds.
    private void EnsureResolved()
    {
        if (_indicesResolved && _faceRenderer != null)
        {
            return;
        }

        if (_indicesResolved && _faceRenderer == null && Time.time < _nextResolveAt)
        {
            return;
        }

        _nextResolveAt = Time.time + Mathf.Max(0.1f, _resolveRetrySeconds);
        if (_faceRenderer == null && _autoFindRenderer)
        {
            _faceRenderer = FindFaceRenderer();
        }

        _jawOpenIndex = -1;
        _mouthFunnelIndex = -1;
        _leftBlinkIndex = -1;
        _rightBlinkIndex = -1;
        _jawOpenName = "";
        _mouthFunnelName = "";
        _leftBlinkName = "";
        _rightBlinkName = "";

        if (_faceRenderer != null && _faceRenderer.sharedMesh != null)
        {
            var mesh = _faceRenderer.sharedMesh;
            _jawOpenIndex = ResolveBlendShape(mesh, _jawOpenBlendShapes);
            _mouthFunnelIndex = ResolveBlendShape(mesh, _mouthFunnelBlendShapes);
            _leftBlinkIndex = ResolveBlendShape(mesh, _leftBlinkBlendShapes);
            _rightBlinkIndex = ResolveBlendShape(mesh, _rightBlinkBlendShapes);
            _jawOpenName = ResolveBlendShapeName(mesh, _jawOpenIndex);
            _mouthFunnelName = ResolveBlendShapeName(mesh, _mouthFunnelIndex);
            _leftBlinkName = ResolveBlendShapeName(mesh, _leftBlinkIndex);
            _rightBlinkName = ResolveBlendShapeName(mesh, _rightBlinkIndex);
        }

        _indicesResolved = true;
    }

    // Picks the first child renderer (inactive included) whose mesh has a jaw-open blendshape;
    // otherwise the first renderer with any blendshapes; otherwise null.
    private SkinnedMeshRenderer FindFaceRenderer()
    {
        var renderers = GetComponentsInChildren<SkinnedMeshRenderer>(true);
        SkinnedMeshRenderer fallback = null;
        for (var i = 0; i < renderers.Length; i++)
        {
            var renderer = renderers[i];
            if (renderer == null || renderer.sharedMesh == null || renderer.sharedMesh.blendShapeCount <= 0)
            {
                continue;
            }

            fallback ??= renderer;
            if (ResolveBlendShape(renderer.sharedMesh, _jawOpenBlendShapes) >= 0)
            {
                return renderer;
            }
        }

        return fallback;
    }

    // Sets one blendshape weight (limited to 0-100); unresolved indices are ignored.
    private void SetWeight(int index, float weight)
    {
        if (_faceRenderer == null || index < 0)
        {
            return;
        }

        _faceRenderer.SetBlendShapeWeight(index, Mathf.Clamp(weight, 0f, 100f));
    }

    // Next blink after the base interval plus up to 1.25 s of random jitter.
    private void ScheduleNextBlink()
    {
        _nextBlinkAt = Time.time + Mathf.Max(0.25f, _blinkIntervalSeconds) + UnityEngine.Random.Range(0f, 1.25f);
    }

    // Finds the blendshape index for the first matching candidate name.
    // Pass 1: exact name lookup in candidate order.
    // Pass 2: case-insensitive comparison after removing '_', '-' and spaces, in mesh order.
    // Returns -1 when nothing matches.
    private static int ResolveBlendShape(Mesh mesh, string[] names)
    {
        if (mesh == null || names == null || names.Length == 0)
        {
            return -1;
        }

        for (var i = 0; i < names.Length; i++)
        {
            var candidate = names[i];
            if (string.IsNullOrWhiteSpace(candidate))
            {
                continue;
            }

            var exactIndex = mesh.GetBlendShapeIndex(candidate);
            if (exactIndex >= 0)
            {
                return exactIndex;
            }
        }

        // Normalize the candidates once instead of once per blendshape.
        var normalizedCandidates = new string[names.Length];
        for (var i = 0; i < names.Length; i++)
        {
            normalizedCandidates[i] = NormalizeBlendShapeName(names[i]);
        }

        for (var blendShapeIndex = 0; blendShapeIndex < mesh.blendShapeCount; blendShapeIndex++)
        {
            var blendShapeName = NormalizeBlendShapeName(mesh.GetBlendShapeName(blendShapeIndex));
            if (blendShapeName.Length == 0)
            {
                continue;
            }

            for (var i = 0; i < normalizedCandidates.Length; i++)
            {
                // Blank candidates (e.g. empty inspector entries) must never match.
                if (normalizedCandidates[i].Length > 0 &&
                    string.Equals(blendShapeName, normalizedCandidates[i], StringComparison.OrdinalIgnoreCase))
                {
                    return blendShapeIndex;
                }
            }
        }

        return -1;
    }

    // Name of the blendshape at the given index, or "" when the index is out of range.
    private static string ResolveBlendShapeName(Mesh mesh, int index)
    {
        if (mesh == null || index < 0 || index >= mesh.blendShapeCount)
        {
            return "";
        }

        return mesh.GetBlendShapeName(index);
    }

    // Removes '_', '-' and spaces so naming variants compare equal; blank input yields "".
    private static string NormalizeBlendShapeName(string value)
    {
        return string.IsNullOrWhiteSpace(value)
            ? ""
            : value.Replace("_", "").Replace("-", "").Replace(" ", "").Trim();
    }

    // Returns the trimmed value, or the fallback when the value is blank.
    private static string Fallback(string value, string fallback)
    {
        return string.IsNullOrWhiteSpace(value) ? fallback : value.Trim();
    }

    // True when the text, ignoring trailing whitespace, ends with '.', ',' or '?'.
    private static bool EndsWithSoftPunctuation(string value)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return false;
        }

        var trimmed = value.TrimEnd();
        return trimmed.EndsWith(".", StringComparison.Ordinal) ||
               trimmed.EndsWith(",", StringComparison.Ordinal) ||
               trimmed.EndsWith("?", StringComparison.Ordinal);
    }
}
gameObject.AddComponent(); } + _voicePresenter = GetComponent(); + if (_voicePresenter == null) + { + _voicePresenter = gameObject.AddComponent(); + } + _gateway = FindAnyObjectByType(); _memorySync = _gateway != null ? _gateway.GetComponent() : null; } @@ -189,7 +196,14 @@ private void ApplyDecision(AgentDecisionDto decision) _networkPlayer.ClearPrototypeAgentInput(); Debug.Log($"[PrototypeLLMAgentDriver] Agent says: {decision.say}"); _speechBubble.Show(decision.say); - _voiceCue.PlayCue(decision.say); + if (_voicePresenter != null) + { + _voicePresenter.PresentSpeech(gameObject.name, null, decision.say, _gateway); + } + else + { + _voiceCue.PlayCue(decision.say); + } PlayCharacterAction(CharacterActionId.Talk); } else if (decision.action == "interact") diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNPCChatClient.cs b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNPCChatClient.cs index 2eff5381..987a3de6 100644 --- a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNPCChatClient.cs +++ b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNPCChatClient.cs @@ -56,7 +56,7 @@ private IEnumerator SendPrototypeChat() }, response => { Debug.Log($"[PrototypeNPCChatClient] {response.npc_id}: {response.text}"); - PresentSpeech(response.text); + PresentSpeech(response.npc_id, response.text); PlayTalkAnimation(); }, Debug.LogWarning); } @@ -67,7 +67,7 @@ private IEnumerator CheckVoiceSession() { actor_id = _npcId, playback_mode = "voice_preview", - lip_sync_tiers = new[] { "text_timed", "audio_amplitude_hook" } + lip_sync_tiers = new[] { "text_timed", "audio_amplitude_hook", "provider_viseme_hook" } }, response => { Debug.Log($"[PrototypeNPCChatClient] Voice provider={response.provider}, available={response.voice_available}, reason={response.reason}"); @@ -83,14 +83,26 @@ private static void PlayTalkAnimation() } } - private static void PresentSpeech(string text) + private void PresentSpeech(string actorId, string text) { - var speechBubble = FindAnyObjectByType(); + var 
brain = PrototypeAgentBrain.FindActiveByAgentId(actorId); + var speechBubble = brain != null + ? brain.GetComponent() + : FindAnyObjectByType(); if (speechBubble != null) { speechBubble.Show(text); } + var presenter = brain != null + ? brain.GetComponent() + : FindAnyObjectByType(); + if (presenter != null) + { + presenter.PresentSpeech(actorId, null, text, _gateway); + return; + } + var voiceCue = FindAnyObjectByType(); if (voiceCue != null) { diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNearbyNpcChatBox.cs b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNearbyNpcChatBox.cs index 9aec3ca5..03e7b06e 100644 --- a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNearbyNpcChatBox.cs +++ b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNearbyNpcChatBox.cs @@ -56,6 +56,7 @@ public sealed class PrototypeNearbyNpcChatBox : MonoBehaviour private Vector2 _historyScrollPosition; private bool _societyEventRpcAvailable = true; private CharacterMemorySync _memorySync; + private PrototypeNpcRealtimeVoiceClient _realtimeVoice; private string _focusedNpcActorId; private string _focusedNpcDisplayName; private PrototypeAgentBrain _focusedNpcBrain; @@ -83,6 +84,9 @@ public sealed class PrototypeNearbyNpcChatBox : MonoBehaviour public string QuestActionStatus => string.IsNullOrWhiteSpace(_questActionStatus) ? PrototypeQuestStatusLine : _questActionStatus; public string PrototypeQuestStatusLine => BuildPrototypeQuestStatusLine(PrototypeQuest); public string FocusedNpcDisplayName => string.IsNullOrWhiteSpace(_focusedNpcDisplayName) ? "Nearby NPC" : _focusedNpcDisplayName; + public string FocusedNpcActorId => string.IsNullOrWhiteSpace(_focusedNpcActorId) ? "" : _focusedNpcActorId.Trim(); + public string ActiveConversationSessionId => string.IsNullOrWhiteSpace(_activeConversationSessionId) ? 
/// <summary>
/// Routes a realtime transcript to the focused NPC as if it were a typed chat message.
/// Blank transcripts are ignored. While a reply is pending only the status line is updated,
/// and without an active focused NPC a system line asks the player to focus one first.
/// </summary>
/// <param name="transcript">Recognized or typed text to send.</param>
/// <param name="source">Label used in the log line; blank values are logged as "voice".</param>
public void SubmitRealtimeVoiceTranscript(string transcript, string source)
{
    var hasTranscript = !string.IsNullOrWhiteSpace(transcript);
    if (!hasTranscript)
    {
        return;
    }

    var replyPending = _busy || IsFocusedNpcResponding();
    if (replyPending)
    {
        _status = $"{FocusedNpcDisplayName} is still answering.";
        return;
    }

    var npcIsActive = IsFocusedNpcActive();
    if (!npcIsActive)
    {
        AddSystemLine("Stand near an NPC and press E before sending realtime voice input.");
        return;
    }

    string sourceLabel;
    if (string.IsNullOrWhiteSpace(source))
    {
        sourceLabel = "voice";
    }
    else
    {
        sourceLabel = source.Trim();
    }

    Debug.Log($"[PrototypeNearbyNpcChatBox] Realtime {sourceLabel} transcript routed to focused NPC actor={_focusedNpcActorId}, text={Shorten(transcript, 80)}");

    var outgoingMessage = transcript.Trim();
    var recipient = ResolveFocusedNpcRecipient();
    StartCoroutine(SendNearbyMessage(outgoingMessage, recipient));
}
/// <summary>
/// Shows an NPC reply that arrived through the realtime voice path: records it in the chat
/// history and hands it to the NPC's voice presenter, using the returned audio when present
/// and the regular speech presentation otherwise.
/// </summary>
/// <param name="actorId">Speaking NPC; blank values fall back to the focused NPC.</param>
/// <param name="text">Reply text; blank values are ignored.</param>
/// <param name="conversationSessionId">When not blank, becomes the active conversation session id.</param>
/// <param name="voiceAudioBase64">Optional base64 reply audio.</param>
/// <param name="voiceAudioFormat">Format label passed through with the audio.</param>
/// <param name="voiceSampleRateHz">Sample rate passed through with the audio.</param>
/// <param name="voiceChannels">Channel count passed through with the audio.</param>
public void PresentRealtimeNpcResponse(
    string actorId,
    string text,
    string conversationSessionId,
    string voiceAudioBase64 = "",
    string voiceAudioFormat = "",
    int voiceSampleRateHz = 0,
    int voiceChannels = 0)
{
    if (string.IsNullOrWhiteSpace(text))
    {
        return;
    }

    var safeActorId = string.IsNullOrWhiteSpace(actorId) ? FocusedNpcActorId : actorId.Trim();
    if (string.IsNullOrWhiteSpace(safeActorId))
    {
        AddSystemLine("Realtime voice returned NPC speech without a focused actor.");
        return;
    }

    if (!string.IsNullOrWhiteSpace(conversationSessionId))
    {
        _activeConversationSessionId = conversationSessionId.Trim();
    }

    var brain = ResolveBrain(safeActorId);
    TryAddFocusedNpcSpeech(safeActorId, brain != null ? brain.DisplayName : FocusedNpcDisplayName, text);

    // The presenter lives on the same object as the brain; GetComponent needs the explicit
    // component type to return something PresentSpeech / PresentRealtimeAudio can be called on.
    var presenter = brain != null ? brain.GetComponent<PrototypeNpcVoicePresenter>() : null;
    if (presenter == null)
    {
        // No presenter: the history entry above is the only presentation.
        return;
    }

    if (!string.IsNullOrWhiteSpace(voiceAudioBase64))
    {
        presenter.PresentRealtimeAudio(
            safeActorId,
            ActiveConversationSessionId,
            text,
            voiceAudioBase64,
            voiceAudioFormat,
            voiceSampleRateHz,
            voiceChannels);
        return;
    }

    presenter.PresentSpeech(safeActorId, ActiveConversationSessionId, text, _gateway);
}
b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNpcRealtimeVoiceClient.cs @@ -0,0 +1,535 @@ +using System; +using System.Collections; +using System.IO; +using SecondSpawn.Networking; +using UnityEngine; +using UnityEngine.InputSystem; + +namespace SecondSpawn.AI +{ + [DisallowMultipleComponent] + [RequireComponent(typeof(SecondSpawnGatewayClient))] + public sealed class PrototypeNpcRealtimeVoiceClient : MonoBehaviour + { + [SerializeField] private bool _enablePrototypeHotkeys = true; + [SerializeField] private Key _pushToTalkKey = Key.C; + [SerializeField] private int _sampleRateHz = 16000; + [SerializeField] private int _maxRecordingSeconds = 8; + [SerializeField] private string _requestedTransport = "livekit_ready"; + [SerializeField] private string _providerHint = "gemini_live_or_tts"; + [SerializeField] private bool _useWindowsDictationFallback = true; + + private SecondSpawnGatewayClient _gateway; + private PrototypeNearbyNpcChatBox _chat; + private RealtimeVoiceSessionDto _session; + private AudioClip _recordingClip; + private string _recordingDevice = ""; + private float _recordingStartedAt; + private bool _busy; + private bool _sessionRpcUnavailable; + private string _status = "Voice input ready"; + private string _lastTranscript = ""; + private string _lastNpcText = ""; +#if UNITY_STANDALONE_WIN || UNITY_EDITOR_WIN + private UnityEngine.Windows.Speech.DictationRecognizer _dictationRecognizer; + private string _dictationTranscript = ""; +#endif + + public bool IsBusy => _busy; + public bool IsRecording => IsWindowsDictationActive() || + (_recordingClip != null && !string.IsNullOrWhiteSpace(_recordingDevice) && Microphone.IsRecording(_recordingDevice)); + public bool HasMicrophone => Microphone.devices != null && Microphone.devices.Length > 0; + public string Status => _status; + public string LastTranscript => _lastTranscript; + public string LastNpcText => _lastNpcText; + public Key PushToTalkKey => _pushToTalkKey; + public string SessionTransport => 
/// <summary>
/// Finishes a push-to-talk capture and submits it. Windows dictation (active or with a
/// pending transcript) takes precedence; otherwise the recorded microphone clip is submitted.
/// </summary>
/// <remarks>
/// The microphone is started without looping, so it stops by itself once
/// <c>_maxRecordingSeconds</c> is reached. A clip captured that way is still submitted here
/// instead of being dropped because the device is no longer recording.
/// </remarks>
public void EndPushToTalkAndSubmit()
{
    if (IsWindowsDictationActive() || HasPendingWindowsDictationTranscript())
    {
        EndWindowsDictationAndSubmit();
        return;
    }

    if (_recordingClip == null)
    {
        return;
    }

    var device = _recordingDevice;
    var clip = _recordingClip;

    // Only query and stop the device while it is actually recording; after an automatic stop
    // the position is unknown and the elapsed-time estimate below is used instead.
    var stillRecording = !string.IsNullOrWhiteSpace(device) && Microphone.IsRecording(device);
    var position = 0;
    if (stillRecording)
    {
        position = Mathf.Clamp(Microphone.GetPosition(device), 0, clip.samples);
        Microphone.End(device);
    }

    _recordingClip = null;
    _recordingDevice = "";

    var elapsedMs = Mathf.RoundToInt(Mathf.Max(0.05f, Time.realtimeSinceStartup - _recordingStartedAt) * 1000f);
    var sampleFrames = position > 0
        ? position
        : Mathf.Clamp(Mathf.RoundToInt(elapsedMs * clip.frequency / 1000f), 1, clip.samples);
    StartCoroutine(SubmitRecording(clip, sampleFrames, elapsedMs));
}
/// <summary>
/// Coroutine that encodes a finished microphone recording as PCM16 WAV and submits it to the
/// realtime voice input RPC.
/// </summary>
/// <param name="clip">Clip captured from the microphone; this coroutine takes ownership and destroys it.</param>
/// <param name="sampleFrames">Number of frames of <paramref name="clip"/> to encode.</param>
/// <param name="elapsedMs">Wall-clock recording time in milliseconds.</param>
/// <returns>An enumerator suitable for <c>StartCoroutine</c>.</returns>
/// <remarks>
/// Two fixes over the previous version: <c>_busy</c> is reset in <c>finally</c>, so an exception can
/// no longer leave the client stuck, and the recorded clip is destroyed once it is no longer needed
/// instead of being left alive after every recording.
/// </remarks>
private IEnumerator SubmitRecording(AudioClip clip, int sampleFrames, int elapsedMs)
{
    if (_chat == null || clip == null)
    {
        if (clip != null)
        {
            // Nobody will consume the recording; release it right away.
            Destroy(clip);
        }

        yield break;
    }

    _busy = true;
    try
    {
        _status = "Submitting microphone input.";
        yield return EnsureSession(_chat.FocusedNpcActorId, _chat.ActiveConversationSessionId);

        if (_sessionRpcUnavailable || _gateway == null)
        {
            _status = "Voice captured locally; realtime voice backend is not loaded yet.";
            _chat.AddPrototypeSystemLine(_status);
            yield break;
        }

        var audioBase64 = EncodeClipToWavBase64(clip, sampleFrames);
        RealtimeVoiceInputResponseDto response = null;
        string error = null;
        yield return _gateway.SubmitRealtimeVoiceInput(new RealtimeVoiceInputRequestDto
        {
            client_event_id = CharacterMemorySync.BuildClientEventId("realtime-mic"),
            session_id = _session?.session?.session_id,
            actor_id = _chat.FocusedNpcActorId,
            conversation_session_id = _chat.ActiveConversationSessionId,
            input_kind = "microphone",
            audio_format = "wav_pcm16",
            sample_rate_hz = clip.frequency,
            channels = clip.channels,
            // Never report more than the configured maximum recording length.
            duration_ms = Mathf.Min(elapsedMs, Mathf.Max(1, _maxRecordingSeconds) * 1000),
            audio_base64 = audioBase64
        }, value => response = value, value => error = value);

        ApplyInputResponse(response, error, "");
    }
    finally
    {
        _busy = false;
        if (clip != null)
        {
            Destroy(clip);
        }
    }
}
/// <summary>
/// Applies the result of a realtime voice input RPC to the chat box and to the status line.
/// </summary>
/// <param name="response">Decoded RPC response, or <c>null</c> when the call failed.</param>
/// <param name="error">Error text reported by the gateway when <paramref name="response"/> is <c>null</c>.</param>
/// <param name="textFallback">
/// The text the player typed, or an empty string for microphone input. When non-empty it is used
/// as the local fallback on failure and as the displayed line if the server echoes no transcript.
/// </param>
/// <remarks>
/// The transcript source label now follows the input kind: typed input is reported as "text"
/// instead of always being reported as "microphone".
/// </remarks>
private void ApplyInputResponse(RealtimeVoiceInputResponseDto response, string error, string textFallback)
{
    var typedInput = !string.IsNullOrWhiteSpace(textFallback);

    if (response == null)
    {
        if (IsRpcNotLoaded(error))
        {
            // Remember that the RPC is missing so later submissions skip the round trip.
            _sessionRpcUnavailable = true;
        }

        _status = $"Realtime voice input failed: {Shorten(error, 90)}";
        if (typedInput)
        {
            _chat.SubmitRealtimeVoiceTranscript(textFallback, "text");
        }
        else
        {
            _chat.AddPrototypeSystemLine(_status);
        }

        return;
    }

    _lastTranscript = response.transcript ?? "";
    _lastNpcText = response.npc_text ?? "";
    if (!string.IsNullOrWhiteSpace(response.conversation_session_id))
    {
        _chat.RememberRealtimeVoiceConversationSession(response.conversation_session_id);
    }

    var transcriptSource = typedInput ? "text" : "microphone";
    var hasDirectNpcResponse = !string.IsNullOrWhiteSpace(response.npc_text);
    if (!string.IsNullOrWhiteSpace(response.transcript))
    {
        if (hasDirectNpcResponse)
        {
            // The server already produced the NPC reply: only show what the player said.
            _chat.DisplayRealtimeVoiceTranscript(response.transcript, transcriptSource);
        }
        else
        {
            // No reply from the server: hand the transcript to the chat box's own submit path.
            _chat.SubmitRealtimeVoiceTranscript(response.transcript, transcriptSource);
        }
    }
    else if (hasDirectNpcResponse && typedInput)
    {
        _chat.DisplayRealtimeVoiceTranscript(textFallback, "text");
    }

    if (hasDirectNpcResponse)
    {
        _chat.PresentRealtimeNpcResponse(
            response.npc_actor_id,
            response.npc_text,
            response.conversation_session_id,
            response.voice_audio_base64,
            response.voice_audio_format,
            response.voice_sample_rate_hz,
            response.voice_channels);
    }

    _status = response.accepted
        ? $"Realtime voice accepted by {FirstNonEmpty(response.provider, "api.dos.ai")}."
        : $"Realtime voice rejected: {FirstNonEmpty(response.reason, "unknown reason")}.";
}
/// <summary>
/// Serializes interleaved float samples into an in-memory WAV file: a 44-byte RIFF/WAVE header
/// declaring 16-bit PCM, followed by the samples converted to signed 16-bit values.
/// </summary>
/// <param name="samples">Interleaved samples; each is scaled by 32767 and clamped to the 16-bit range.</param>
/// <param name="sampleRate">Sample rate written to the header, in Hz.</param>
/// <param name="channels">Channel count written to the header.</param>
/// <returns>The complete WAV file as a byte array.</returns>
private static byte[] EncodePcm16Wav(float[] samples, int sampleRate, int channels)
{
    const short bytesPerSample = 2;
    var payloadBytes = samples.Length * bytesPerSample;
    var blockAlign = (short)(channels * bytesPerSample);

    using var buffer = new MemoryStream();
    using var output = new BinaryWriter(buffer);

    // Chunk identifiers are written as raw ASCII bytes without a length prefix.
    void WriteTag(string tag) => output.Write(System.Text.Encoding.ASCII.GetBytes(tag));

    WriteTag("RIFF");
    output.Write(36 + payloadBytes);              // RIFF size: remaining header (36 bytes) + data
    WriteTag("WAVE");
    WriteTag("fmt ");
    output.Write(16);                             // size of the fmt chunk
    output.Write((short)1);                       // format code 1
    output.Write((short)channels);
    output.Write(sampleRate);
    output.Write(sampleRate * channels * 2);      // byte rate
    output.Write(blockAlign);
    output.Write((short)16);                      // bits per sample
    WriteTag("data");
    output.Write(payloadBytes);

    for (var i = 0; i < samples.Length; i++)
    {
        var scaled = Mathf.RoundToInt(samples[i] * 32767f);
        output.Write((short)Mathf.Clamp(scaled, short.MinValue, short.MaxValue));
    }

    return buffer.ToArray();
}
/// <summary>
/// Starts a Windows dictation session, creating the recognizer and wiring its events on first use.
/// Compiles to an empty method on non-Windows targets.
/// </summary>
/// <remarks>
/// Each recognized phrase is now appended to the transcript of the current session. Previously every
/// result replaced the transcript, so only the last phrase of a multi-phrase utterance was submitted.
/// </remarks>
private void BeginWindowsDictation()
{
#if UNITY_STANDALONE_WIN || UNITY_EDITOR_WIN
    if (_dictationRecognizer == null)
    {
        _dictationRecognizer = new UnityEngine.Windows.Speech.DictationRecognizer();
        _dictationRecognizer.DictationResult += (text, confidence) =>
        {
            if (string.IsNullOrWhiteSpace(text))
            {
                return;
            }

            var phrase = text.Trim();
            _dictationTranscript = string.IsNullOrWhiteSpace(_dictationTranscript)
                ? phrase
                : _dictationTranscript + " " + phrase;
            _status = $"Heard: {_dictationTranscript}";
        };
        _dictationRecognizer.DictationHypothesis += text =>
        {
            if (!string.IsNullOrWhiteSpace(text))
            {
                _status = $"Listening: {Shorten(text, 72)}";
            }
        };
        _dictationRecognizer.DictationError += (error, hresult) =>
        {
            _status = $"Windows dictation error: {error}";
        };
        _dictationRecognizer.DictationComplete += completionCause =>
        {
            // Normal completion and timeout are expected endings; only surface the others.
            if (completionCause != UnityEngine.Windows.Speech.DictationCompletionCause.Complete &&
                completionCause != UnityEngine.Windows.Speech.DictationCompletionCause.TimeoutExceeded)
            {
                _status = $"Windows dictation stopped: {completionCause}";
            }
        };
    }

    // Every session starts from an empty transcript; phrases accumulate until it is submitted.
    _dictationTranscript = "";
    _dictationRecognizer.Start();
    _status = $"Listening to {_chat.FocusedNpcDisplayName}.";
#endif
}
/// <summary>
/// Stops Windows dictation and submits the captured transcript to the chat box with the
/// "windows_dictation" source. Compiles to an empty method on non-Windows targets.
/// </summary>
/// <remarks>
/// Both branches now check the chat box the same way; previously only the "no transcript" branch
/// was guarded, so a captured transcript with no chat box bound threw a NullReferenceException.
/// </remarks>
private void EndWindowsDictationAndSubmit()
{
#if UNITY_STANDALONE_WIN || UNITY_EDITOR_WIN
    StopWindowsDictation();
    if (string.IsNullOrWhiteSpace(_dictationTranscript))
    {
        _status = "No speech transcript was captured.";
        if (_chat != null)
        {
            _chat.AddPrototypeSystemLine(_status);
        }

        return;
    }

    var transcript = _dictationTranscript.Trim();
    _lastTranscript = transcript;
    _dictationTranscript = "";
    _status = $"Voice transcript ready: {Shorten(transcript, 72)}";
    if (_chat != null)
    {
        _chat.SubmitRealtimeVoiceTranscript(transcript, "windows_dictation");
    }
#endif
}
/// <summary>
/// Resolves the voice cue and facial animation driver on this GameObject, adding either component
/// when it is missing, so the presenter always has both collaborators after Awake.
/// </summary>
/// <remarks>
/// The component lookups need explicit type arguments; they are taken from the declared types of
/// the <c>_voiceCue</c> and <c>_facialDriver</c> fields.
/// </remarks>
private void Awake()
{
    _voiceCue = GetComponent<PrototypeVoiceCue>();
    if (_voiceCue == null)
    {
        _voiceCue = gameObject.AddComponent<PrototypeVoiceCue>();
    }

    _facialDriver = GetComponent<PrototypeFacialAnimationDriver>();
    if (_facialDriver == null)
    {
        _facialDriver = gameObject.AddComponent<PrototypeFacialAnimationDriver>();
    }
}
/// <summary>
/// Coroutine behind <c>PresentRealtimeAudio</c>: records the provider and session, allocates a line
/// id, then plays the decoded audio and starts facial speech, or uses the fallback voice when the
/// payload cannot be decoded.
/// </summary>
/// <param name="actorId">Actor the line belongs to; used for the line id.</param>
/// <param name="conversationSessionId">Conversation session id, stored as the last voice session id.</param>
/// <param name="text">Spoken text, passed to the facial driver and to the fallback.</param>
/// <param name="audioBase64">Base64 audio payload.</param>
/// <param name="audioFormat">Format hint forwarded to the decoder.</param>
/// <param name="sampleRateHz">Sample rate forwarded to the decoder.</param>
/// <param name="channels">Channel count forwarded to the decoder.</param>
/// <returns>An enumerator suitable for <c>StartCoroutine</c>.</returns>
private IEnumerator PresentRealtimeAudioLoop(
    string actorId,
    string conversationSessionId,
    string text,
    string audioBase64,
    string audioFormat,
    int sampleRateHz,
    int channels)
{
    LastVoiceProvider = "api.dos.ai_realtime_voice";
    LastVoiceSessionId = conversationSessionId ?? "";
    LastVoiceReason = "";
    BuildLineId(actorId, text);

    var decoded = TryBuildAudioClipFromBase64(
        audioBase64,
        audioFormat,
        sampleRateHz,
        channels,
        out var voiceClip,
        out var failureReason);

    if (decoded)
    {
        var seconds = _voiceCue.PlayClip(voiceClip, _clipVolume);
        _facialDriver.BeginAudioSpeech(_voiceCue.OutputSource, text, seconds);
        LastPresentationMode = "realtime_voice_audio_with_blendshape";

        // Stay registered as the active presentation for one more frame.
        yield return null;
    }
    else
    {
        PlayFallback(text, failureReason);
    }

    _presentationLoop = null;
}
/// <summary>
/// Coroutine that downloads a voice clip from <paramref name="endpoint"/>, plays it and starts
/// facial speech; any download or decode failure is routed to the fallback voice.
/// </summary>
/// <param name="endpoint">URL of the audio clip; also used to pick the audio type.</param>
/// <param name="ephemeralToken">Optional token sent as a Bearer Authorization header.</param>
/// <param name="fallbackText">Line text, used for facial speech and for the fallback voice.</param>
/// <returns>An enumerator suitable for <c>StartCoroutine</c>.</returns>
private IEnumerator DownloadAndPlayClip(string endpoint, string ephemeralToken, string fallbackText)
{
    using (var download = UnityWebRequestMultimedia.GetAudioClip(endpoint, ResolveAudioType(endpoint)))
    {
        var hasToken = !string.IsNullOrWhiteSpace(ephemeralToken);
        if (hasToken)
        {
            download.SetRequestHeader("Authorization", "Bearer " + ephemeralToken);
        }

        yield return download.SendWebRequest();

        // Collapse both failure modes into one reason so there is a single fallback exit.
        string failure = null;
        AudioClip voiceClip = null;
        if (download.result != UnityWebRequest.Result.Success)
        {
            failure = "voice_clip_download_failed: " + download.error;
        }
        else
        {
            voiceClip = DownloadHandlerAudioClip.GetContent(download);
            if (voiceClip == null)
            {
                failure = "voice_clip_decode_failed";
            }
        }

        if (failure != null)
        {
            PlayFallback(fallbackText, failure);
            yield break;
        }

        var seconds = _voiceCue.PlayClip(voiceClip, _clipVolume);
        _facialDriver.BeginAudioSpeech(_voiceCue.OutputSource, fallbackText, seconds);
        LastPresentationMode = "server_voice_clip";
    }
}
/// <summary>
/// Computes a deterministic 32-bit FNV-1a hash over the UTF-16 code units of the trimmed input.
/// </summary>
/// <param name="value">Text to hash; null, empty and whitespace-only input all hash as the empty string.</param>
/// <returns>The hash value; 2166136261 (the offset basis) for empty input.</returns>
private static uint StableHash(string value)
{
    const uint offsetBasis = 2166136261u;
    const uint prime = 16777619u;

    if (string.IsNullOrWhiteSpace(value))
    {
        return offsetBasis;
    }

    var digest = offsetBasis;
    foreach (var unit in value.Trim())
    {
        // XOR first, then multiply; wrap-around on overflow is intended.
        digest = unchecked((digest ^ unit) * prime);
    }

    return digest;
}
/// <summary>
/// Parses a RIFF/WAVE byte buffer and builds an <see cref="AudioClip"/> from its 16-bit PCM data.
/// </summary>
/// <param name="bytes">Complete WAV file contents.</param>
/// <param name="clip">The created clip on success; otherwise <c>null</c>.</param>
/// <param name="reason">Empty on success; otherwise a machine-readable failure code.</param>
/// <returns><c>true</c> when a clip was created.</returns>
/// <remarks>
/// The audio comes from a remote service, so chunk sizes are untrusted. Bounds are checked by
/// subtraction: the previous <c>chunkDataOffset + chunkSize</c> could overflow <c>int</c>, which let
/// an oversized chunk pass the check and then index outside the buffer.
/// </remarks>
private static bool TryBuildWavClip(byte[] bytes, out AudioClip clip, out string reason)
{
    clip = null;
    reason = "";
    if (!LooksLikeWav(bytes))
    {
        reason = "realtime_voice_wav_header_invalid";
        return false;
    }

    // Skip "RIFF" + size + "WAVE"; chunks start at byte 12.
    var offset = 12;
    var formatCode = 0;
    var channels = 0;
    var sampleRate = 0;
    var bitsPerSample = 0;
    var dataOffset = -1;
    var dataSize = 0;

    while (offset <= bytes.Length - 8)
    {
        var chunkId = ReadFourCc(bytes, offset);
        var chunkSize = ReadInt32LE(bytes, offset + 4);
        var chunkDataOffset = offset + 8;

        // Overflow-safe: compare against the bytes that remain instead of adding the two values.
        if (chunkSize < 0 || chunkSize > bytes.Length - chunkDataOffset)
        {
            reason = "realtime_voice_wav_chunk_invalid";
            return false;
        }

        if (chunkId == "fmt " && chunkSize >= 16)
        {
            formatCode = ReadInt16LE(bytes, chunkDataOffset);
            channels = ReadInt16LE(bytes, chunkDataOffset + 2);
            sampleRate = ReadInt32LE(bytes, chunkDataOffset + 4);
            bitsPerSample = ReadInt16LE(bytes, chunkDataOffset + 14);
        }
        else if (chunkId == "data")
        {
            dataOffset = chunkDataOffset;
            dataSize = chunkSize;
        }

        // Chunks are word-aligned: an odd-sized chunk is followed by one padding byte.
        offset = chunkDataOffset + chunkSize + (chunkSize & 1);
    }

    if (formatCode != 1 || bitsPerSample != 16)
    {
        reason = "realtime_voice_wav_format_not_pcm16";
        return false;
    }

    return TryBuildPcm16Clip(bytes, dataOffset, dataSize, sampleRate, channels, "SecondSpawnRealtimeVoiceWav", out clip, out reason);
}

/// <summary>
/// Builds an <see cref="AudioClip"/> from little-endian signed 16-bit PCM stored in a byte range.
/// </summary>
/// <param name="bytes">Buffer holding the samples.</param>
/// <param name="offset">Index of the first sample byte.</param>
/// <param name="byteCount">Number of bytes to read; a trailing partial sample or frame is ignored.</param>
/// <param name="sampleRate">Sample rate in Hz; values below 8000 are rejected.</param>
/// <param name="channels">Channel count; only 1 or 2 is accepted.</param>
/// <param name="clipName">Name given to the created clip.</param>
/// <param name="clip">The created clip on success; otherwise <c>null</c>.</param>
/// <param name="reason">Empty on success; otherwise a machine-readable failure code.</param>
/// <returns><c>true</c> when a clip was created.</returns>
private static bool TryBuildPcm16Clip(
    byte[] bytes,
    int offset,
    int byteCount,
    int sampleRate,
    int channels,
    string clipName,
    out AudioClip clip,
    out string reason)
{
    clip = null;
    reason = "";

    // Same overflow-safe form as the chunk check: never add offset and byteCount.
    if (offset < 0 || byteCount <= 0 || offset > bytes.Length || byteCount > bytes.Length - offset)
    {
        reason = "realtime_voice_pcm_range_invalid";
        return false;
    }

    if (sampleRate < 8000 || channels <= 0 || channels > 2)
    {
        reason = "realtime_voice_pcm_layout_invalid";
        return false;
    }

    var sampleCount = byteCount / 2;
    var frameCount = sampleCount / channels;
    if (frameCount <= 0)
    {
        reason = "realtime_voice_pcm_empty";
        return false;
    }

    var audio = new float[frameCount * channels];
    for (var index = 0; index < audio.Length; index++)
    {
        var byteIndex = offset + index * 2;
        var pcm = (short)(bytes[byteIndex] | (bytes[byteIndex + 1] << 8));
        audio[index] = Mathf.Clamp(pcm / 32768f, -1f, 1f);
    }

    clip = AudioClip.Create(clipName, frameCount, channels, sampleRate, false);
    clip.SetData(audio, 0);
    return true;
}
/// <summary>
/// Extracts a sample rate from a free-form audio format string such as "pcm16_24000".
/// </summary>
/// <param name="audioFormat">Format description; may be null or empty.</param>
/// <param name="fallback">Value returned when no plausible rate is present.</param>
/// <returns>
/// The last run of ASCII digits whose value is at least 8000, or <paramref name="fallback"/> when
/// there is none.
/// </returns>
/// <remarks>
/// Previously only the very last digit run was considered, so a trailing small number (for example
/// a channel count in "audio/L16;rate=44100;channels=2") hid the real rate. Digit runs too long for
/// an <c>int</c> are now ignored instead of wrapping around.
/// </remarks>
private static int ParseSampleRate(string audioFormat, int fallback)
{
    if (string.IsNullOrWhiteSpace(audioFormat))
    {
        return fallback;
    }

    const int minimumRate = 8000;
    const long overflowMarker = (long)int.MaxValue + 1;

    var bestRate = 0;
    long run = 0;
    var inRun = false;

    // Iterate one position past the end so a digit run at the end of the string is closed too.
    for (var index = 0; index <= audioFormat.Length; index++)
    {
        var isDigit = index < audioFormat.Length &&
                      audioFormat[index] >= '0' &&
                      audioFormat[index] <= '9';
        if (isDigit)
        {
            inRun = true;
            // Saturate just above int.MaxValue so an over-long run is detectable and cannot wrap.
            run = Math.Min(run * 10 + (audioFormat[index] - '0'), overflowMarker);
            continue;
        }

        if (inRun)
        {
            if (run >= minimumRate && run <= int.MaxValue)
            {
                bestRate = (int)run;
            }

            run = 0;
            inRun = false;
        }
    }

    return bestRate > 0 ? bestRate : fallback;
}
0} | V{NpcVisualVariant(npc)}", _labelStyle); GUILayout.Label($"Voice: {SafeText(npc.body?.voice_profile?.profile_id, "unassigned")} | {SafeText(npc.body?.voice_profile?.pace_hint, "steady")}", _labelStyle); + var activeBrain = PrototypeAgentBrain.FindActiveByAgentId(npc.actor_id); + if (activeBrain != null) + { + GUILayout.Label($"Voice runtime: {SafeText(activeBrain.VoicePresentationMode, "idle")} | {SafeText(activeBrain.VoicePresentationReason, "ready")}", _mutedStyle); + GUILayout.Label(Shorten(activeBrain.FacialTargetSummary, 120), _mutedStyle); + } + GUILayout.Label($"Soul: {SafeText(npc.body?.soul?.name, "unknown")}", _labelStyle); GUILayout.Label(Shorten(SafeText(npc.memory != null && npc.memory.Length > 0 ? npc.memory[0].summary : "", "No seed memory."), 120), _mutedStyle); } diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeVoiceCue.cs b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeVoiceCue.cs index feb9465c..23ef4956 100644 --- a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeVoiceCue.cs +++ b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeVoiceCue.cs @@ -12,28 +12,79 @@ public sealed class PrototypeVoiceCue : MonoBehaviour private AudioSource _audioSource; + public AudioSource OutputSource => EnsureAudioSource(); + public bool IsPlaying => _audioSource != null && _audioSource.isPlaying; + public float LastCueDuration { get; private set; } + private void Awake() { - _audioSource = GetComponent(); - if (_audioSource == null) + EnsureAudioSource(); + } + + public float EstimateDuration(string text) + { + return Mathf.Clamp((text?.Length ?? 
/// <summary>
/// Returns the AudioSource used for voice output, resolving it from this GameObject (or adding one)
/// on first use and caching it afterwards.
/// </summary>
/// <returns>The cached or newly resolved <see cref="AudioSource"/>.</returns>
/// <remarks>
/// The component lookups need an explicit type argument; <c>AudioSource</c> is the declared type of
/// the <c>_audioSource</c> field.
/// </remarks>
private AudioSource EnsureAudioSource()
{
    if (_audioSource != null)
    {
        return _audioSource;
    }

    _audioSource = GetComponent<AudioSource>();
    if (_audioSource == null)
    {
        _audioSource = gameObject.AddComponent<AudioSource>();
    }

    // Applied to a pre-existing source as well as to a newly added one.
    _audioSource.playOnAwake = false;
    _audioSource.spatialBlend = 0.65f;
    return _audioSource;
}
/// <summary>
/// Windows-only speech helpers: availability flags, plus local text-to-speech that renders a line
/// to a temporary WAV file through PowerShell and System.Speech and loads it as an AudioClip.
/// On other platforms the flags are false and synthesis reports "windows_speech_unavailable".
/// </summary>
public static class PrototypeWindowsSpeechBridge
{
#if UNITY_STANDALONE_WIN || UNITY_EDITOR_WIN
    /// <summary>Seconds a single PowerShell synthesis run may take before it is killed.</summary>
    private const float SynthesisTimeoutSeconds = 12f;

    public static bool IsDictationAvailable => true;
    public static bool IsSpeechSynthesisAvailable => true;
#else
    public static bool IsDictationAvailable => false;
    public static bool IsSpeechSynthesisAvailable => false;
#endif

    /// <summary>
    /// Coroutine that synthesizes <paramref name="text"/> and reports the resulting clip.
    /// </summary>
    /// <param name="text">Text to speak; blank text reports "empty_text".</param>
    /// <param name="onClip">Invoked with the decoded clip on success.</param>
    /// <param name="onError">Invoked with a failure code when no clip could be produced.</param>
    /// <returns>An enumerator suitable for <c>StartCoroutine</c>.</returns>
    /// <remarks>
    /// Exactly one of the callbacks is invoked. The temporary WAV file is removed on every exit
    /// path, including failed or timed-out synthesis runs that left a partial file behind.
    /// </remarks>
    public static IEnumerator SynthesizeToClip(string text, Action<AudioClip> onClip, Action<string> onError = null)
    {
#if UNITY_STANDALONE_WIN || UNITY_EDITOR_WIN
        if (string.IsNullOrWhiteSpace(text))
        {
            onError?.Invoke("empty_text");
            yield break;
        }

        var path = Path.Combine(Application.temporaryCachePath, "second-spawn-npc-voice-" + Guid.NewGuid().ToString("N") + ".wav");
        var processError = "";
        yield return RunSapiSynthesis(text.Trim(), path, value => processError = value);
        if (!string.IsNullOrWhiteSpace(processError) || !File.Exists(path))
        {
            // A failed or killed run can still leave a partial file behind.
            TryDelete(path);
            onError?.Invoke(string.IsNullOrWhiteSpace(processError) ? "windows_sapi_no_output" : processError);
            yield break;
        }

        // Uri escapes characters such as spaces, '#' and '%' that a hand-built file URL would not.
        var fileUrl = new Uri(path).AbsoluteUri;
        using var request = UnityWebRequestMultimedia.GetAudioClip(fileUrl, AudioType.WAV);
        yield return request.SendWebRequest();
        if (request.result != UnityWebRequest.Result.Success)
        {
            onError?.Invoke("windows_sapi_clip_decode_failed: " + request.error);
            TryDelete(path);
            yield break;
        }

        var clip = DownloadHandlerAudioClip.GetContent(request);
        TryDelete(path);
        if (clip == null)
        {
            onError?.Invoke("windows_sapi_empty_clip");
            yield break;
        }

        onClip?.Invoke(clip);
#else
        onError?.Invoke("windows_speech_unavailable");
        yield break;
#endif
    }

#if UNITY_STANDALONE_WIN || UNITY_EDITOR_WIN
    /// <summary>
    /// Coroutine that runs a hidden PowerShell process which speaks <paramref name="text"/> into
    /// <paramref name="outputPath"/> using System.Speech, polling once per frame until it exits.
    /// </summary>
    /// <param name="text">Text to synthesize.</param>
    /// <param name="outputPath">Destination WAV file path.</param>
    /// <param name="onError">Invoked with a failure code; not invoked on success.</param>
    /// <returns>An enumerator suitable for <c>StartCoroutine</c>.</returns>
    /// <remarks>
    /// Both redirected streams are drained asynchronously so the child cannot block on a full pipe,
    /// and the process handle is disposed when the coroutine finishes.
    /// </remarks>
    private static IEnumerator RunSapiSynthesis(string text, string outputPath, Action<string> onError)
    {
        // Text and path are passed as base64 so nothing needs escaping inside the script.
        var encodedText = Convert.ToBase64String(Encoding.UTF8.GetBytes(text));
        var encodedPath = Convert.ToBase64String(Encoding.UTF8.GetBytes(outputPath));
        var script = string.Join(Environment.NewLine, new[]
        {
            "$ErrorActionPreference = 'Stop'",
            "Add-Type -AssemblyName System.Speech",
            "$text = [Text.Encoding]::UTF8.GetString([Convert]::FromBase64String('" + encodedText + "'))",
            "$path = [Text.Encoding]::UTF8.GetString([Convert]::FromBase64String('" + encodedPath + "'))",
            "$voice = New-Object System.Speech.Synthesis.SpeechSynthesizer",
            "$voice.Rate = 0",
            "$voice.Volume = 100",
            "$voice.SetOutputToWaveFile($path)",
            "$voice.Speak($text)",
            "$voice.Dispose()"
        });

        // -EncodedCommand expects the script as base64 of its UTF-16LE bytes.
        var encodedCommand = Convert.ToBase64String(Encoding.Unicode.GetBytes(script));
        var start = new ProcessStartInfo
        {
            FileName = "powershell.exe",
            Arguments = "-NoProfile -NonInteractive -ExecutionPolicy Bypass -EncodedCommand " + encodedCommand,
            CreateNoWindow = true,
            UseShellExecute = false,
            RedirectStandardError = true,
            RedirectStandardOutput = true
        };

        var stderr = new StringBuilder();
        Process process = null;
        string startError = null;
        try
        {
            process = Process.Start(start);
            if (process != null)
            {
                // The handlers run on worker threads, hence the lock around the shared builder.
                process.ErrorDataReceived += (sender, args) =>
                {
                    if (args.Data != null)
                    {
                        lock (stderr)
                        {
                            stderr.AppendLine(args.Data);
                        }
                    }
                };
                process.OutputDataReceived += (sender, args) => { };
                process.BeginErrorReadLine();
                process.BeginOutputReadLine();
            }
        }
        catch (Exception ex)
        {
            startError = "windows_sapi_start_failed: " + ex.Message;
        }

        if (startError != null)
        {
            if (process != null)
            {
                TryKill(process);
                process.Dispose();
            }

            onError?.Invoke(startError);
            yield break;
        }

        if (process == null)
        {
            onError?.Invoke("windows_sapi_process_missing");
            yield break;
        }

        using (process)
        {
            var startedAt = Time.realtimeSinceStartup;
            while (!process.HasExited)
            {
                if (Time.realtimeSinceStartup - startedAt > SynthesisTimeoutSeconds)
                {
                    TryKill(process);
                    onError?.Invoke("windows_sapi_timeout");
                    yield break;
                }

                yield return null;
            }

            // The parameterless overload also waits for the asynchronous stream readers to finish.
            process.WaitForExit();
            if (process.ExitCode != 0)
            {
                string error;
                lock (stderr)
                {
                    error = stderr.ToString();
                }

                onError?.Invoke("windows_sapi_exit_" + process.ExitCode + ": " + error);
            }
        }
    }

    /// <summary>Kills <paramref name="process"/>, ignoring failures.</summary>
    private static void TryKill(Process process)
    {
        try
        {
            process.Kill();
        }
        catch (Exception)
        {
            // Best effort: the process may have exited between the HasExited check and Kill().
        }
    }
#endif

    /// <summary>Deletes <paramref name="path"/> if it exists, ignoring failures.</summary>
    private static void TryDelete(string path)
    {
        try
        {
            if (File.Exists(path))
            {
                File.Delete(path);
            }
        }
        catch (Exception)
        {
            // Best effort: a leftover file in the temporary cache must not fail speech playback.
        }
    }
}
/// <summary>
/// Returns the <c>CleanName</c> of the catalog entry selected by <paramref name="variant"/>,
/// after passing the variant through <c>NormalizeVariant</c>.
/// </summary>
public static string GetLabel(int variant)
{
    var index = NormalizeVariant(variant);
    var entry = Entries[index];
    return entry.CleanName;
}
/// <summary>
/// Click handler for the mic button: resolves the realtime voice client from the bound chat box
/// if it is not cached yet, toggles push-to-talk on it, and re-renders the panel.
/// Does nothing when no chat box is bound or no voice client is available.
/// </summary>
private void ToggleMicrophone()
{
    if (_chat == null)
    {
        return;
    }

    // Use the same explicit "== null" re-resolve that Update() uses instead of "??=".
    // NOTE(review): if PrototypeNpcRealtimeVoiceClient derives from UnityEngine.Object, "??=" would
    // also bypass Unity's overloaded null check and keep a destroyed client cached — confirm the type.
    if (_voiceClient == null)
    {
        _voiceClient = _chat.RealtimeVoice;
    }

    if (_voiceClient == null)
    {
        return;
    }

    _voiceClient.TogglePushToTalk();
    Render();
}
Enter sends - Esc exits Chat Mode."); return; } diff --git a/backend/nakama/README.md b/backend/nakama/README.md index 4f2255e4..65485c17 100644 --- a/backend/nakama/README.md +++ b/backend/nakama/README.md @@ -150,6 +150,23 @@ lifecycle state. When voice is disabled, unconfigured, timed out, or rejected, the RPC returns a structured text-only fallback so focused dialogue remains usable. +Optional realtime voice turn env: + +```text +DOS_AI_REALTIME_VOICE_ENABLED=false +DOS_AI_REALTIME_VOICE_URL=https://api.dos.ai/v1/voice/realtime/turns +DOS_AI_REALTIME_VOICE_MAX_AUDIO_MS=8000 +``` + +`secondspawn_realtime_voice_session_request` and +`secondspawn_realtime_voice_input` are also disabled by default. When enabled, +Nakama submits scoped text or microphone turn material to `api.dos.ai` and +expects a transcript plus an NPC reply. Unity can use this for LiveKit-backed +rooms later, but Photon remains the game networking layer and the response is +dialogue/presentation only. In local Windows Editor development, Unity may use +the OS dictation and SAPI speech fallback when these runtime env values are not +configured, keeping provider keys out of the client. + ### Metrics and Structured Logs Nakama exposes its normal Prometheus-style server metrics through deployment @@ -175,6 +192,10 @@ can filter prototype game events without parsing free-form text: - `secondspawn.voice_session` records voice session availability, provider, fallback reason, target actor, and voice profile id without logging provider keys or ephemeral tokens. +- `secondspawn.realtime_voice_session` and + `secondspawn.realtime_voice_input` record realtime voice availability, + accepted turn status, transcript length, and NPC reply length without logging + raw audio or provider keys. Structured logs must stay public-safe: do not log provider API keys, RPC secrets, raw prompts, raw payloads, or private provider responses. 
Use diff --git a/backend/nakama/local.example.yml b/backend/nakama/local.example.yml index 8f563282..f8cb2234 100644 --- a/backend/nakama/local.example.yml +++ b/backend/nakama/local.example.yml @@ -19,6 +19,9 @@ runtime: - "DOS_AI_DECISION_DAILY_TOKEN_BUDGET=250000" - "DOS_AI_DIRECT_CHAT_DAILY_REQUEST_LIMIT=1000" - "DOS_AI_DIRECT_CHAT_DAILY_TOKEN_BUDGET=250000" + - "DOS_AI_VOICE_SESSIONS_ENABLED=false" + - "DOS_AI_REALTIME_VOICE_ENABLED=false" + - "DOS_AI_REALTIME_VOICE_MAX_AUDIO_MS=8000" # Set DOS_AI_API_KEY only in private local config or deployment secrets. # Set SECOND_SPAWN_INTERNAL_RPC_SECRET and SECOND_SPAWN_ADMIN_RPC_SECRET # only in private local config or deployment secrets. diff --git a/backend/nakama/modules/index.ts b/backend/nakama/modules/index.ts index 0f73fe4e..c9bdb2b6 100644 --- a/backend/nakama/modules/index.ts +++ b/backend/nakama/modules/index.ts @@ -25,6 +25,8 @@ var rpcIdAgentPolicyUpdate = "secondspawn_agent_policy_update"; var rpcIdAgentReturnReport = "secondspawn_agent_return_report"; var rpcIdAgentActivityAdd = "secondspawn_agent_activity_add"; var rpcIdVoiceSessionRequest = "secondspawn_voice_session_request"; +var rpcIdRealtimeVoiceSessionRequest = "secondspawn_realtime_voice_session_request"; +var rpcIdRealtimeVoiceInput = "secondspawn_realtime_voice_input"; var rpcIdActorProfileGet = "secondspawn_actor_profile_get"; var rpcIdActorMemoryAdd = "secondspawn_actor_memory_add"; var rpcIdActorMemoryQuery = "secondspawn_actor_memory_query"; @@ -80,6 +82,8 @@ var rpcBoundaryCatalog = [ { id: rpcIdAgentReturnReport, boundary: rpcBoundaryClient }, { id: rpcIdAgentActivityAdd, boundary: rpcBoundaryClient }, { id: rpcIdVoiceSessionRequest, boundary: rpcBoundaryClient }, + { id: rpcIdRealtimeVoiceSessionRequest, boundary: rpcBoundaryClient }, + { id: rpcIdRealtimeVoiceInput, boundary: rpcBoundaryClient }, { id: rpcIdActorProfileGet, boundary: rpcBoundaryClient }, { id: rpcIdActorMemoryAdd, boundary: rpcBoundaryClient }, { id: 
/**
 * RPC `secondspawn_realtime_voice_session_request`.
 * Describes the realtime voice session available to the caller for the resolved voice target.
 * When realtime voice is not enabled and fully configured, the response advertises the
 * `windows_local_or_text` fallback and carries the reason; it never includes an API key or token.
 */
function rpcRealtimeVoiceSessionRequest(
  ctx: nkruntime.Context,
  logger: nkruntime.Logger,
  nk: nkruntime.Nakama,
  payload: string
): string {
  var userId = requireUserId(ctx);
  var request = parseJson(payload || "{}", "realtime voice session payload");
  var voiceTarget = resolveVoiceSessionTarget(ctx, nk, userId, request);
  var ttlSeconds = voiceSessionTtlSeconds(ctx, request);
  var expiresAtMs = new Date().getTime() + ttlSeconds * 1000;
  var sessionDescriptor = buildVoiceSessionDescriptor(nk, voiceTarget, request, ttlSeconds, expiresAtMs);
  var endpoint = dosAiRealtimeVoiceEndpoint(ctx);
  // Keep this a real boolean; the three operands are otherwise strings.
  var enabled = isRealtimeVoiceEnabled(ctx) && !!trimString(ctx.env["DOS_AI_API_KEY"]) && !!endpoint;
  var response = {
    session_available: enabled,
    provider: enabled ? "api_dos_ai_realtime_voice" : "windows_local_or_text",
    reason: enabled ? "" : realtimeVoiceUnavailableReason(ctx),
    actor_id: voiceTarget.actor_id,
    conversation_session_id: sessionDescriptor.conversation_session_id,
    session: {
      session_id: sessionDescriptor.session_id,
      expires_at_ms: expiresAtMs,
      ttl_seconds: ttlSeconds,
      audience: "unity_realtime_voice",
      // Trim first, then default, so a whitespace-only requested transport does not yield "".
      transport: enabled
        ? (trimString(request.requested_transport) || "api_dos_ai_realtime_voice")
        : "windows_local_or_text",
      endpoint: enabled ? endpoint : "",
      ephemeral_token: "",
      lip_sync_tiers: sessionDescriptor.lip_sync_tiers,
      presentation_only: true,
      authority_note: "Realtime voice can submit dialogue text only. Nakama and Fusion validate any gameplay effect."
    },
    input_policy: {
      accepts_text: true,
      accepts_audio: enabled,
      max_audio_ms: realtimeVoiceMaxAudioMs(ctx),
      sample_rate_hz: finiteNumberOrDefault(request.sample_rate_hz, 16000),
      channels: finiteNumberOrDefault(request.channels, 1),
      accepted_audio_formats: ["wav_pcm16"]
    },
    debug: {
      source: "nakama_realtime_voice_session_rpc",
      provider_status: enabled ? "configured" : "local_fallback",
      fallback_mode: voiceTarget.voice_profile.fallback_mode
    }
  };
  logStructuredInfo(logger, "realtime_voice_session", {
    owner_id: userId,
    actor_id: voiceTarget.actor_id,
    provider: response.provider,
    session_available: response.session_available,
    reason: response.reason
  });
  return JSON.stringify(response);
}

/**
 * RPC `secondspawn_realtime_voice_input`.
 * Submits one typed or recorded turn for the resolved voice target and returns the transcript,
 * the NPC reply text and any reply audio. Throws when a text turn carries no text.
 * The structured log records only lengths, not the transcript or reply themselves.
 */
function rpcRealtimeVoiceInput(
  ctx: nkruntime.Context,
  logger: nkruntime.Logger,
  nk: nkruntime.Nakama,
  payload: string
): string {
  var userId = requireUserId(ctx);
  var request = parseJson(payload || "{}", "realtime voice input payload");
  var voiceTarget = resolveVoiceSessionTarget(ctx, nk, userId, request);
  var inputKind = sanitizeQuestIdentifier(trimString(request.input_kind), "text");
  var text = sanitizePublicNpcSpeech(request.text);
  if (inputKind === "text" && !text) {
    throw new Error("realtime text input requires text");
  }

  var apiResult = tryDosAiRealtimeVoiceInput(ctx, logger, nk, userId, voiceTarget, request, inputKind, text);
  var transcript = sanitizePublicNpcSpeech(apiResult.transcript || text);
  var npcText = sanitizePublicNpcSpeech(apiResult.npc_text);
  var response = {
    accepted: apiResult.accepted,
    provider: apiResult.provider,
    reason: apiResult.reason,
    conversation_session_id: trimString(apiResult.conversation_session_id || request.conversation_session_id),
    transcript: transcript,
    npc_actor_id: voiceTarget.actor_id,
    npc_text: npcText,
    // Only a rejected *text* turn can be replayed through the normal text chat path.
    fallback_to_text_chat: !apiResult.accepted && inputKind === "text",
    voice_audio_base64: trimString(apiResult.voice_audio_base64),
    voice_audio_format: trimString(apiResult.voice_audio_format || "pcm_s16le_24000"),
    debug: {
      source: "nakama_realtime_voice_input_rpc",
      provider_status: apiResult.reason,
      fallback_mode: voiceTarget.voice_profile.fallback_mode
    }
  };
  logStructuredInfo(logger, "realtime_voice_input", {
    owner_id: userId,
    actor_id: voiceTarget.actor_id,
    input_kind: inputKind,
    accepted: response.accepted,
    provider: response.provider,
    reason: response.reason,
    transcript_length: transcript.length,
    npc_text_length: npcText.length
  });
  return JSON.stringify(response);
}

/** True when `DOS_AI_REALTIME_VOICE_ENABLED` is "true", "1" or "yes" (after `lowercase`). */
function isRealtimeVoiceEnabled(ctx: nkruntime.Context): boolean {
  var enabled = lowercase(ctx.env["DOS_AI_REALTIME_VOICE_ENABLED"]);
  return enabled === "true" || enabled === "1" || enabled === "yes";
}

/**
 * Resolves the realtime voice turn endpoint: `DOS_AI_REALTIME_VOICE_URL` when set, otherwise
 * `DOS_AI_BASE_URL` + "/voice/realtime/turns", otherwise "".
 */
function dosAiRealtimeVoiceEndpoint(ctx: nkruntime.Context): string {
  var explicitEndpoint = trimString(ctx.env["DOS_AI_REALTIME_VOICE_URL"]);
  if (explicitEndpoint) {
    return explicitEndpoint;
  }
  var baseUrl = trimTrailingSlash(ctx.env["DOS_AI_BASE_URL"] || "");
  return baseUrl ? baseUrl + "/voice/realtime/turns" : "";
}

/** Returns the first configuration gap that keeps realtime voice unavailable, as a reason code. */
function realtimeVoiceUnavailableReason(ctx: nkruntime.Context): string {
  if (!isRealtimeVoiceEnabled(ctx)) {
    return "realtime_voice_disabled";
  }
  if (!trimString(ctx.env["DOS_AI_API_KEY"])) {
    return "dos_ai_unconfigured";
  }
  if (!dosAiRealtimeVoiceEndpoint(ctx)) {
    return "dos_ai_realtime_voice_endpoint_unconfigured";
  }
  return "realtime_voice_unavailable";
}

/** `DOS_AI_REALTIME_VOICE_MAX_AUDIO_MS` (default 8000), clamped to 1000..15000 and floored. */
function realtimeVoiceMaxAudioMs(ctx: nkruntime.Context): number {
  return Math.floor(clampNumber(finiteNumberOrDefault(ctx.env["DOS_AI_REALTIME_VOICE_MAX_AUDIO_MS"], 8000), 1000, 15000));
}

/**
 * Upper bound on the base64 length of a PCM16 WAV clip that fits in the configured max duration.
 * The client-declared sample rate and channel count are capped (48 kHz, 2 channels) for this
 * calculation only, so an inflated declaration cannot raise the limit without bound.
 */
function realtimeVoiceMaxAudioBase64Length(ctx: nkruntime.Context, request: any): number {
  var sampleRateHz = clampNumber(finiteNumberOrDefault(request.sample_rate_hz, 16000), 8000, 48000);
  var channels = clampNumber(finiteNumberOrDefault(request.channels, 1), 1, 2);
  // 2 bytes per sample for 16-bit PCM.
  var pcmBytes = Math.ceil((realtimeVoiceMaxAudioMs(ctx) / 1000) * sampleRateHz * channels * 2);
  // Allowance for the RIFF/WAVE header and any small extra chunks.
  var totalBytes = pcmBytes + 1024;
  return Math.ceil(totalBytes / 3) * 4;
}

/** Builds the uniform "not accepted" result returned by tryDosAiRealtimeVoiceInput. */
function realtimeVoiceInputRejection(provider: string, reason: string, transcript: string): any {
  return {
    accepted: false,
    provider: provider,
    reason: reason,
    transcript: transcript,
    npc_text: ""
  };
}

/**
 * Sends one realtime voice turn to the configured endpoint and normalizes the reply.
 * Never throws for provider problems: configuration gaps, missing or oversized audio, transport
 * errors, non-2xx statuses and unparseable bodies all come back as `accepted: false` with a
 * reason code.
 */
function tryDosAiRealtimeVoiceInput(
  ctx: nkruntime.Context,
  logger: nkruntime.Logger,
  nk: nkruntime.Nakama,
  ownerId: string,
  target: any,
  request: any,
  inputKind: string,
  text: string
): any {
  var endpoint = dosAiRealtimeVoiceEndpoint(ctx);
  var apiKey = trimString(ctx.env["DOS_AI_API_KEY"]);
  var audioBase64 = trimString(request.audio_base64);
  var hasAudio = !!audioBase64;
  if (!isRealtimeVoiceEnabled(ctx) || !apiKey || !endpoint) {
    // Text turns keep their transcript so the caller can fall back to the text chat path.
    return inputKind === "text"
      ? realtimeVoiceInputRejection("text_fallback", realtimeVoiceUnavailableReason(ctx), text)
      : realtimeVoiceInputRejection("windows_local_or_text", realtimeVoiceUnavailableReason(ctx), "");
  }
  if (inputKind !== "text" && !hasAudio) {
    return realtimeVoiceInputRejection("api_dos_ai_realtime_voice", "missing_audio", "");
  }
  // Enforce the advertised max_audio_ms against the real payload size; duration_ms alone is
  // client-reported and was previously only clamped, not checked.
  if (hasAudio && audioBase64.length > realtimeVoiceMaxAudioBase64Length(ctx, request)) {
    return realtimeVoiceInputRejection("api_dos_ai_realtime_voice", "audio_too_large", "");
  }

  var body = {
    player_id: ownerId,
    actor_id: target.actor_id,
    body_id: target.body_id,
    display_name: target.display_name,
    voice_profile: target.voice_profile,
    session_id: trimString(request.session_id),
    conversation_session_id: trimString(request.conversation_session_id),
    input_kind: inputKind,
    text: text,
    audio_format: sanitizeQuestIdentifier(trimString(request.audio_format), "wav_pcm16"),
    sample_rate_hz: finiteNumberOrDefault(request.sample_rate_hz, 16000),
    channels: finiteNumberOrDefault(request.channels, 1),
    duration_ms: Math.floor(clampNumber(finiteNumberOrDefault(request.duration_ms, 0), 0, realtimeVoiceMaxAudioMs(ctx))),
    audio_base64: audioBase64,
    presentation_only: true,
    forbidden_state_mutations: [
      "memory",
      "relationship",
      "quest",
      "TIME",
      "SECOND",
      "inventory",
      "combat",
      "body_lifecycle"
    ]
  };

  var response: any;
  var startedAtMs = new Date().getTime();
  try {
    response = nk.httpRequest(endpoint, "post", {
      "content-type": "application/json",
      "accept": "application/json",
      "authorization": "Bearer " + apiKey
    }, JSON.stringify(body), dosAiDecisionTimeoutMs(ctx));
  } catch (err) {
    logger.info("DOS.AI realtime voice input threw: " + err);
    return realtimeVoiceInputRejection(
      "api_dos_ai_realtime_voice",
      isTimeoutLikeError(err) ? "dos_ai_realtime_voice_timeout" : "dos_ai_realtime_voice_exception",
      ""
    );
  }

  if (response.code < 200 || response.code > 299) {
    logger.info("DOS.AI realtime voice input failed with status " + response.code);
    return realtimeVoiceInputRejection("api_dos_ai_realtime_voice", "dos_ai_realtime_voice_http_" + response.code, "");
  }

  var decoded = parseJsonOrNull(response.body);
  if (!decoded) {
    // A 2xx with an empty or unparseable body carries no transcript and no reply, so it is
    // not an accepted turn.
    logger.info("DOS.AI realtime voice input returned an unparseable body");
    return realtimeVoiceInputRejection("api_dos_ai_realtime_voice", "dos_ai_realtime_voice_invalid_response", "");
  }

  return {
    accepted: true,
    provider: trimString(decoded.provider) || "api_dos_ai_realtime_voice",
    reason: trimString(decoded.reason) || "accepted",
    conversation_session_id: trimString(decoded.conversation_session_id || request.conversation_session_id),
    transcript: sanitizePublicNpcSpeech(decoded.transcript || decoded.text || text),
    npc_text: sanitizePublicNpcSpeech(decoded.npc_text || decoded.reply_text || decoded.say),
    voice_audio_base64: trimString(decoded.voice_audio_base64 || decoded.audio_base64),
    voice_audio_format: trimString(decoded.voice_audio_format || decoded.audio_format),
    latency_ms: elapsedSince(startedAtMs)
  };
}
"livekit_ready" + }) +)); +assert.equal(disabledRealtimeVoiceSession.session_available, false); +assert.equal(disabledRealtimeVoiceSession.provider, "windows_local_or_text"); +assert.equal(disabledRealtimeVoiceSession.reason, "realtime_voice_disabled"); +assert.equal(disabledRealtimeVoiceSession.input_policy.accepts_text, true); +assert.equal(disabledRealtimeVoiceSession.input_policy.accepts_audio, false); + +const textFallbackRealtimeInput = JSON.parse(harness.registeredRpcs.get("secondspawn_realtime_voice_input")( + { userId: "normal-player", env: defaultRuntimeEnv }, + harness.logger, + harness.nk, + JSON.stringify({ + actor_id: "npc-synthetic-sentinel-0101", + conversation_session_id: "conversation-live", + input_kind: "text", + text: "Can you hear me?" + }) +)); +assert.equal(textFallbackRealtimeInput.accepted, false); +assert.equal(textFallbackRealtimeInput.provider, "text_fallback"); +assert.equal(textFallbackRealtimeInput.fallback_to_text_chat, true); +assert.equal(textFallbackRealtimeInput.transcript, "Can you hear me?"); +assert.equal(textFallbackRealtimeInput.npc_text, ""); + +const enabledRealtimeVoiceEnv = { + ...defaultRuntimeEnv, + DOS_AI_REALTIME_VOICE_ENABLED: "true", + DOS_AI_API_KEY: "test-dos-ai-key", + DOS_AI_REALTIME_VOICE_URL: "https://api.dos.ai/v1/voice/realtime/turns" +}; +const realtimeVoiceHarness = createRuntimeHarness(module, enabledRealtimeVoiceEnv); +let realtimeVoiceRequestBody = null; +realtimeVoiceHarness.nk.httpRequest = (url, method, headers, body, timeout) => { + realtimeVoiceRequestBody = JSON.parse(body); + assert.equal(url, "https://api.dos.ai/v1/voice/realtime/turns"); + assert.equal(method, "post"); + assert.equal(headers.authorization, "Bearer test-dos-ai-key"); + assert.equal(timeout, 8000); + return { + code: 200, + body: JSON.stringify({ + provider: "gemini_realtime_voice", + transcript: "Cho tôi hỏi đường.", + npc_text: "Gate is open, but stay close to the relay lights.", + voice_audio_base64: "AAAAAA==", + 
voice_audio_format: "pcm_s16le_24000", + conversation_session_id: "conversation-live" + }) + }; +}; +const acceptedRealtimeVoiceInput = JSON.parse(realtimeVoiceHarness.registeredRpcs.get("secondspawn_realtime_voice_input")( + { userId: "normal-player", env: enabledRealtimeVoiceEnv }, + realtimeVoiceHarness.logger, + realtimeVoiceHarness.nk, + JSON.stringify({ + actor_id: "npc-synthetic-sentinel-0101", + conversation_session_id: "conversation-live", + input_kind: "microphone", + audio_format: "wav_pcm16", + sample_rate_hz: 16000, + channels: 1, + duration_ms: 1200, + audio_base64: "UklGRg==" + }) +)); +assert.equal(acceptedRealtimeVoiceInput.accepted, true); +assert.equal(acceptedRealtimeVoiceInput.provider, "gemini_realtime_voice"); +assert.equal(acceptedRealtimeVoiceInput.transcript, "Cho tôi hỏi đường."); +assert.equal(acceptedRealtimeVoiceInput.npc_actor_id, "npc-synthetic-sentinel-0101"); +assert.equal(acceptedRealtimeVoiceInput.npc_text, "Gate is open, but stay close to the relay lights."); +assert.equal(acceptedRealtimeVoiceInput.voice_audio_base64, "AAAAAA=="); +assert.equal(acceptedRealtimeVoiceInput.voice_audio_format, "pcm_s16le_24000"); +assert.equal(realtimeVoiceRequestBody.actor_id, "npc-synthetic-sentinel-0101"); +assert.equal(realtimeVoiceRequestBody.input_kind, "microphone"); +assert.equal(realtimeVoiceRequestBody.audio_format, "wav_pcm16"); +assert.deepEqual(realtimeVoiceRequestBody.forbidden_state_mutations, [ + "memory", + "relationship", + "quest", + "TIME", + "SECOND", + "inventory", + "combat", + "body_lifecycle" +]); assert.doesNotMatch(JSON.stringify(mintedVoiceSession), /test-dos-ai-key|DOS_AI_API_KEY/i); assert.throws( diff --git a/docs/design/56-focused-npc-dialogue-portrait-lipsync-design.md b/docs/design/56-focused-npc-dialogue-portrait-lipsync-design.md index 51d6d35d..f38c8de0 100644 --- a/docs/design/56-focused-npc-dialogue-portrait-lipsync-design.md +++ b/docs/design/56-focused-npc-dialogue-portrait-lipsync-design.md @@ -203,6 
+203,44 @@ Evidence: compatible Ida body. - Fallback note for one body or prefab without compatible face blendshapes. +### Unity Alpha Components + +The first implementation uses local presentation components: + +- `PrototypeNpcVoicePresenter`: requests a scoped voice session from Nakama, + plays a server-provided audio clip when the session exposes a playable + temporary endpoint, and falls back to a prototype tone when voice is disabled + or unavailable. +- `PrototypeFacialAnimationDriver`: auto-finds a child `SkinnedMeshRenderer` + with face blendshapes, resolves common ARKit-style jaw, mouth, and blink + names, exposes its resolved target summary for debug panels, and drives mouth + movement from either text timing or audio amplitude. +- `PrototypeVoiceCue`: owns the local `AudioSource` used by fallback tone + playback and amplitude-driven mouth movement. +- `PrototypeNPCChatClient`: resolves the active `PrototypeAgentBrain` by actor + id before presenting speech so prototype hotkey replies animate the intended + NPC instead of whichever presenter Unity finds first. +- `PrototypeNpcRealtimeVoiceClient`: captures focused-dialogue text or local + microphone input behind a LiveKit-ready session contract. It requests + server-minted realtime session material, submits typed text or WAV PCM mic + clips to Nakama, routes returned transcripts through the existing dialogue + path, and falls back honestly when the backend RPC is not loaded. +- `PrototypeWindowsSpeechBridge`: development-only Windows Editor fallback for + local voice testing. It uses Windows dictation for player microphone + transcripts and SAPI WAV synthesis for NPC voice playback while cloud voice + sessions are unconfigured. +- `SecondSpawnFacialBlendshapeReportUtility`: editor-only reporting for + selected characters and generated visual prefabs. Agents use it to inspect + real imported `SkinnedMeshRenderer` blendshape names before approving an + Ida-family lip sync profile. 
+ +Unity must not hold provider API keys or call model providers directly. The +only accepted online voice path is a short-lived session material returned by +Nakama or `api.dos.ai`; all local animation data remains presentation only. +LiveKit is the preferred future media transport for low-latency microphone +sessions, but Photon remains the game networking layer and Fusion/Nakama keep +all gameplay authority. + --- ## 6. Conversation State Machine @@ -270,6 +308,10 @@ Nakama owns: Voice or facial-animation providers may own optional transport-level audio, viseme, blendshape, or facial animation data for a single scoped session. +LiveKit may own WebRTC room transport, participant audio tracks, interruption +signals, and voice-agent media routing for one scoped conversation. It must not +own dialogue authorization, canonical memory writes, quest updates, combat, or +economy effects. No provider may own canonical NPC memory, relationship, quest, TIME, SECOND, inventory, combat, or body lifecycle state. @@ -331,7 +373,36 @@ Evidence: - Inspector or debug note showing which NPC presentation profile has text-only, audio-amplitude, or viseme-capable mode. -### D4: Play Mode Smoke Update +### D4: Realtime Text And Microphone Input Hook + +Issues: #139, #262 + +Build: + +- Keep typed text as the baseline focused-dialogue input. +- Add local microphone capture for push-to-talk focused dialogue. +- Request a scoped realtime voice session from Nakama before submitting audio. +- Keep the contract LiveKit-ready without importing the LiveKit Unity SDK until + the backend room/token lane is available. +- Route transcript-only voice responses back through the normal player-to-NPC + dialogue path so memory, relationship, quest, and rate-limit rules stay + server-owned. 
+- When the realtime provider returns a complete NPC turn, play + `voice_audio_base64` directly in Unity and drive the existing audio-amplitude + facial hook from that clip instead of requesting a second TTS session. +- Support `pcm_s16le_<sample_rate_hz>` (for example `pcm_s16le_24000`) and WAV PCM16 response payloads for the MVP + bridge. Provider viseme or blendshape streams remain the later D3 tier. +- Show an honest local fallback when the realtime voice RPC is not deployed. +- In Windows Editor, allow local OS speech fallback so agents can verify a real + speak/listen loop before cloud voice credentials exist. + +Evidence: + +- Compile evidence for the microphone capture and realtime session DTOs. +- Play Mode note showing text input still works and microphone capture reports + backend availability honestly. + +### D5: Play Mode Smoke Update + Issues: #139, #140