From 14f16c9c0d63ff0ebe7c98f5ef1ef9198a51c4f1 Mon Sep 17 00:00:00 2001 From: JOY Date: Tue, 26 May 2026 11:54:44 +0700 Subject: [PATCH] feat: add scoped NPC voice session lane --- .../Scripts/AI/AgentContextDto.cs | 40 +++ .../Scripts/AI/PrototypeNPCChatClient.cs | 9 +- .../Scripts/AI/SecondSpawnGatewayClient.cs | 14 +- backend/nakama/README.md | 20 ++ backend/nakama/modules/index.ts | 320 ++++++++++++++++++ .../tests/supabase_custom_auth.test.mjs | 96 +++++- docs/ARCHITECTURE.md | 11 +- .../37-ai-npc-backend-client-roadmap.md | 6 + 8 files changed, 502 insertions(+), 14 deletions(-) diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/AgentContextDto.cs b/Unity/Assets/_SecondSpawn/Scripts/AI/AgentContextDto.cs index d157c84e..4423fa06 100644 --- a/Unity/Assets/_SecondSpawn/Scripts/AI/AgentContextDto.cs +++ b/Unity/Assets/_SecondSpawn/Scripts/AI/AgentContextDto.cs @@ -1404,5 +1404,45 @@ public sealed class VoiceSessionDto public string provider; public bool requires_ephemeral_token; public string reason; + public string actor_id; + public string body_id; + public string conversation_session_id; + public VoiceProfileDto voice_profile; + public VoiceSessionMaterialDto session; + public VoiceSessionDebugDto debug; + } + + [Serializable] + public sealed class VoiceSessionRequestDto + { + public string actor_id; + public string conversation_session_id; + public string line_id; + public string playback_mode = "tts"; + public int ttl_seconds = 90; + public string[] lip_sync_tiers; + } + + [Serializable] + public sealed class VoiceSessionMaterialDto + { + public string session_id; + public long expires_at_ms; + public int ttl_seconds; + public string audience; + public string transport; + public string endpoint; + public string ephemeral_token; + public string[] lip_sync_tiers; + public bool presentation_only; + public string authority_note; + } + + [Serializable] + public sealed class VoiceSessionDebugDto + { + public string source; + public string provider_status; + public string fallback_mode; } } diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNPCChatClient.cs b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNPCChatClient.cs index 39bb939f..2eff5381 100644 --- a/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNPCChatClient.cs +++ b/Unity/Assets/_SecondSpawn/Scripts/AI/PrototypeNPCChatClient.cs @@ -10,7 +10,7 @@ namespace SecondSpawn.AI public sealed class PrototypeNPCChatClient : MonoBehaviour { [SerializeField] private bool _enablePrototypeHotkeys; - [SerializeField] private string _npcId = "prototype-guide"; + [SerializeField] private string _npcId = "npc-synthetic-sentinel-0101"; [SerializeField, TextArea] private string _prototypeMessage = "What should this body remember while I am offline?"; [SerializeField] private Key _talkKey = Key.O; @@ -63,7 +63,12 @@ private IEnumerator SendPrototypeChat() private IEnumerator CheckVoiceSession() { - yield return _gateway.GetVoiceSession(response => + yield return _gateway.GetVoiceSession(new VoiceSessionRequestDto + { + actor_id = _npcId, + playback_mode = "voice_preview", + lip_sync_tiers = new[] { "text_timed", "audio_amplitude_hook" } + }, response => { Debug.Log($"[PrototypeNPCChatClient] Voice provider={response.provider}, available={response.voice_available}, reason={response.reason}"); }, Debug.LogWarning); diff --git a/Unity/Assets/_SecondSpawn/Scripts/AI/SecondSpawnGatewayClient.cs b/Unity/Assets/_SecondSpawn/Scripts/AI/SecondSpawnGatewayClient.cs index 0b519d62..7074f9e2 100644 --- a/Unity/Assets/_SecondSpawn/Scripts/AI/SecondSpawnGatewayClient.cs +++ b/Unity/Assets/_SecondSpawn/Scripts/AI/SecondSpawnGatewayClient.cs @@ -430,14 +430,12 @@ public IEnumerator Chat(NpcChatRequestDto request, Action on public IEnumerator GetVoiceSession(Action onSuccess, Action onError = null) { - yield return null; - onSuccess?.Invoke(new VoiceSessionDto - { - voice_available = false, - provider = "not_configured", - requires_ephemeral_token = true, - reason = "Voice sessions require a future Nakama RPC that mints an api.dos.ai ephemeral token." - }); + yield return GetVoiceSession(new VoiceSessionRequestDto(), onSuccess, onError); + } + + public IEnumerator GetVoiceSession(VoiceSessionRequestDto request, Action onSuccess, Action onError = null) + { + yield return SendNakamaRpc("secondspawn_voice_session_request", request ?? new VoiceSessionRequestDto(), onSuccess, onError); } private IEnumerator SendNakamaRpc( diff --git a/backend/nakama/README.md b/backend/nakama/README.md index d7e2e698..4f2255e4 100644 --- a/backend/nakama/README.md +++ b/backend/nakama/README.md @@ -133,6 +133,23 @@ circuit-open state and tries DOS.AI again. If the model still fails, the NPC does not invent a canned reply. It returns a degraded `stop` decision with the model failure reason so Unity can show `AI BACKOFF` or `AI FALLBACK` honestly. +Optional scoped voice session env: + +```text +DOS_AI_VOICE_SESSIONS_ENABLED=false +DOS_AI_VOICE_SESSION_URL=https://api.dos.ai/v1/voice/sessions +DOS_AI_VOICE_SESSION_TTL_SECONDS=90 +``` + +`secondspawn_voice_session_request` is disabled by default. When enabled, +Nakama uses `DOS_AI_API_KEY` server-side to ask `api.dos.ai` for short-lived +voice playback material, then returns only a scoped session descriptor and an +ephemeral token to Unity. The response is presentation-only and cannot mutate +memory, relationships, quests, TIME, SECOND, inventory, combat, or body +lifecycle state. When voice is disabled, unconfigured, timed out, or rejected, +the RPC returns a structured text-only fallback so focused dialogue remains +usable. + ### Metrics and Structured Logs Nakama exposes its normal Prometheus-style server metrics through deployment @@ -155,6 +172,9 @@ can filter prototype game events without parsing free-form text: duplicate claims. - `secondspawn.body_time_mutation` records accepted BodyTime changes and duplicate BodyTime events. +- `secondspawn.voice_session` records voice session availability, provider, + fallback reason, target actor, and voice profile id without logging provider + keys or ephemeral tokens. Structured logs must stay public-safe: do not log provider API keys, RPC secrets, raw prompts, raw payloads, or private provider responses. Use diff --git a/backend/nakama/modules/index.ts b/backend/nakama/modules/index.ts index 738e39ff..0f73fe4e 100644 --- a/backend/nakama/modules/index.ts +++ b/backend/nakama/modules/index.ts @@ -24,6 +24,7 @@ var rpcIdAgentDecide = "secondspawn_agent_decide"; var rpcIdAgentPolicyUpdate = "secondspawn_agent_policy_update"; var rpcIdAgentReturnReport = "secondspawn_agent_return_report"; var rpcIdAgentActivityAdd = "secondspawn_agent_activity_add"; +var rpcIdVoiceSessionRequest = "secondspawn_voice_session_request"; var rpcIdActorProfileGet = "secondspawn_actor_profile_get"; var rpcIdActorMemoryAdd = "secondspawn_actor_memory_add"; var rpcIdActorMemoryQuery = "secondspawn_actor_memory_query"; @@ -78,6 +79,7 @@ var rpcBoundaryCatalog = [ { id: rpcIdAgentPolicyUpdate, boundary: rpcBoundaryClient }, { id: rpcIdAgentReturnReport, boundary: rpcBoundaryClient }, { id: rpcIdAgentActivityAdd, boundary: rpcBoundaryClient }, + { id: rpcIdVoiceSessionRequest, boundary: rpcBoundaryClient }, { id: rpcIdActorProfileGet, boundary: rpcBoundaryClient }, { id: rpcIdActorMemoryAdd, boundary: rpcBoundaryClient }, { id: rpcIdActorMemoryQuery, boundary: rpcBoundaryClient }, @@ -176,6 +178,8 @@ var dosAiDecisionDailyRequestLimitDefault = 1000; var dosAiDecisionDailyTokenBudgetDefault = 250000; var dosAiDirectChatDailyRequestLimitDefault = 1000; var dosAiDirectChatDailyTokenBudgetDefault = 250000; +var dosAiVoiceSessionDefaultTtlSeconds = 90; +var dosAiVoiceSessionMaxTtlSeconds = 300; var prototypeVisualVariantMax = 20; var initialInhabitationFramePoolSize = 10; var bodyArchetypePool = [ @@ -701,6 +705,7 @@ let InitModule: nkruntime.InitModule = function ( initializer.registerRpc(rpcIdAgentPolicyUpdate, rpcAgentPolicyUpdate); initializer.registerRpc(rpcIdAgentReturnReport, rpcAgentReturnReport); initializer.registerRpc(rpcIdAgentActivityAdd, rpcAgentActivityAdd); + initializer.registerRpc(rpcIdVoiceSessionRequest, rpcVoiceSessionRequest); initializer.registerRpc(rpcIdActorProfileGet, rpcActorProfileGet); initializer.registerRpc(rpcIdActorMemoryAdd, rpcActorMemoryAdd); initializer.registerRpc(rpcIdActorMemoryQuery, rpcActorMemoryQuery); @@ -1284,6 +1289,61 @@ function rpcAgentActivityAdd( return JSON.stringify(context); } +function rpcVoiceSessionRequest( + ctx: nkruntime.Context, + logger: nkruntime.Logger, + nk: nkruntime.Nakama, + payload: string +): string { + var userId = requireUserId(ctx); + var request = parseJson(payload || "{}", "voice session payload"); + var voiceTarget = resolveVoiceSessionTarget(ctx, nk, userId, request); + var ttlSeconds = voiceSessionTtlSeconds(ctx, request); + var expiresAtMs = new Date().getTime() + ttlSeconds * 1000; + var sessionDescriptor = buildVoiceSessionDescriptor(nk, voiceTarget, request, ttlSeconds, expiresAtMs); + var disabledResponse = voiceSessionDisabledResponse(ctx, voiceTarget, sessionDescriptor); + if (disabledResponse) { + logVoiceSession(logger, userId, voiceTarget, disabledResponse, "disabled"); + return JSON.stringify(disabledResponse); + } + + var apiSession = tryDosAiVoiceSession(ctx, logger, nk, voiceTarget, sessionDescriptor, ttlSeconds); + if (!apiSession.voice_available) { + logVoiceSession(logger, userId, voiceTarget, apiSession, "degraded"); + return JSON.stringify(apiSession); + } + + var response = { + voice_available: true, + provider: "api_dos_ai_voice", + requires_ephemeral_token: true, + reason: "", + actor_id: voiceTarget.actor_id, + body_id: voiceTarget.body_id, + conversation_session_id: sessionDescriptor.conversation_session_id, + voice_profile: voiceTarget.voice_profile, + session: { + session_id: sessionDescriptor.session_id, + expires_at_ms: expiresAtMs, + ttl_seconds: ttlSeconds, + audience: "unity_voice_playback", + transport: apiSession.transport || "api_dos_ai_voice_session", + endpoint: apiSession.endpoint || "", + ephemeral_token: apiSession.ephemeral_token, + lip_sync_tiers: sessionDescriptor.lip_sync_tiers, + presentation_only: true, + authority_note: "Voice playback cannot mutate memory, relationship, quest, TIME, SECOND, inventory, combat, or body lifecycle state." + }, + debug: { + source: "nakama_voice_session_rpc", + provider_status: apiSession.provider_status || "minted", + fallback_mode: voiceTarget.voice_profile.fallback_mode + } + }; + logVoiceSession(logger, userId, voiceTarget, response, "minted"); + return JSON.stringify(response); +} + function logAgentDecision( logger: nkruntime.Logger, ownerId: string, @@ -8672,6 +8732,266 @@ function selectInteractTargetId(world: any): string { return ""; } +function resolveVoiceSessionTarget( + ctx: nkruntime.Context, + nk: nkruntime.Nakama, + userId: string, + request: any +): any { + var requestedActorId = trimString(request.actor_id || request.npc_id || request.target_actor_id); + if (requestedActorId) { + var actorId = normalizeActorId(requestedActorId); + if (!findPermanentNpcFrame(actorId)) { + throw new Error("voice session actor must be a permanent NPC or omitted for the current body"); + } + + var actorState = getOrCreateWorldNpcProfileState(nk, userId, actorId); + var actorProfile = actorState.profile || {}; + var body = actorProfile.body || {}; + return { + owner_id: userId, + actor_id: actorProfile.actor_id || actorId, + body_id: trimString(body.body_id) || actorId, + display_name: trimString(actorProfile.display_name || body.identity && body.identity.public_name) || actorDisplayName(actorId), + voice_profile: normalizeVoiceProfile( + body.voice_profile || {}, + body.identity || {}, + body.soul || {}, + body.behavior_tendencies || {}, + actorProfile.actor_id || actorId + ) + }; + } + + var state = getOrCreateAgentContextState(ctx, nk); + var context = state.context || {}; + var player = context.player || {}; + var currentBody = context.body || {}; + var sourceActorId = trimString(currentBody.inhabitation && currentBody.inhabitation.source_actor_id) || + trimString(player.player_id) || + userId; + return { + owner_id: userId, + actor_id: trimString(player.player_id) || userId, + body_id: trimString(currentBody.body_id) || "body-" + userId, + display_name: trimString(player.display_name || currentBody.identity && currentBody.identity.public_name) || "Current Body", + voice_profile: normalizeVoiceProfile( + currentBody.voice_profile || {}, + currentBody.identity || {}, + currentBody.soul || {}, + currentBody.behavior_tendencies || {}, + sourceActorId + ) + }; +} + +function buildVoiceSessionDescriptor( + nk: nkruntime.Nakama, + target: any, + request: any, + ttlSeconds: number, + expiresAtMs: number +): any { + var requestedLipSync = normalizeStringArray(request.lip_sync_tiers || request.lip_sync || request.lipsync_tiers, []); + var lipSyncTiers = requestedLipSync.length > 0 + ? filterVoiceLipSyncTiers(requestedLipSync) + : ["text_timed", "audio_amplitude_hook"]; + return { + session_id: "voice-session-" + nk.uuidv4(), + actor_id: target.actor_id, + body_id: target.body_id, + display_name: target.display_name, + conversation_session_id: trimString(request.conversation_session_id), + line_id: trimString(request.line_id || request.message_id || request.turn_id), + playback_mode: normalizeVoicePlaybackMode(request.playback_mode || request.mode), + ttl_seconds: ttlSeconds, + expires_at_ms: expiresAtMs, + lip_sync_tiers: lipSyncTiers, + presentation_only: true, + forbidden_state_mutations: [ + "memory", + "relationship", + "quest", + "TIME", + "SECOND", + "inventory", + "combat", + "body_lifecycle" + ] + }; +} + +function voiceSessionDisabledResponse(ctx: nkruntime.Context, target: any, descriptor: any): any { + if (!isVoiceSessionEnabled(ctx)) { + return voiceSessionUnavailableResponse(target, descriptor, "voice_sessions_disabled", "text_only"); + } + if (!trimString(ctx.env["DOS_AI_API_KEY"])) { + return voiceSessionUnavailableResponse(target, descriptor, "dos_ai_unconfigured", "text_only"); + } + if (!dosAiVoiceSessionEndpoint(ctx)) { + return voiceSessionUnavailableResponse(target, descriptor, "dos_ai_voice_endpoint_unconfigured", "text_only"); + } + return null; +} + +function tryDosAiVoiceSession( + ctx: nkruntime.Context, + logger: nkruntime.Logger, + nk: nkruntime.Nakama, + target: any, + descriptor: any, + ttlSeconds: number +): any { + var apiKey = trimString(ctx.env["DOS_AI_API_KEY"]); + var endpoint = dosAiVoiceSessionEndpoint(ctx); + var startedAtMs = new Date().getTime(); + var body = { + actor_id: target.actor_id, + body_id: target.body_id, + display_name: target.display_name, + voice_profile: target.voice_profile, + session_id: descriptor.session_id, + conversation_session_id: descriptor.conversation_session_id, + line_id: descriptor.line_id, + playback_mode: descriptor.playback_mode, + ttl_seconds: ttlSeconds, + lip_sync_tiers: descriptor.lip_sync_tiers, + presentation_only: true, + forbidden_state_mutations: descriptor.forbidden_state_mutations + }; + + var response: any; + try { + response = nk.httpRequest(endpoint, "post", { + "content-type": "application/json", + "accept": "application/json", + "authorization": "Bearer " + apiKey + }, JSON.stringify(body), dosAiDecisionTimeoutMs(ctx)); + } catch (err) { + logger.info("DOS.AI voice session request threw: " + err); + return voiceSessionUnavailableResponse( + target, + descriptor, + isTimeoutLikeError(err) ? "dos_ai_voice_timeout" : "dos_ai_voice_exception", + "api_dos_ai_voice" + ); + } + + if (response.code < 200 || response.code > 299) { + logger.info("DOS.AI voice session request failed with status " + response.code); + return voiceSessionUnavailableResponse(target, descriptor, "dos_ai_voice_http_" + response.code, "api_dos_ai_voice"); + } + + var decoded = parseJsonOrNull(response.body); + var token = extractDosAiVoiceSessionToken(decoded); + if (!token) { + return voiceSessionUnavailableResponse(target, descriptor, "dos_ai_voice_empty_token", "api_dos_ai_voice"); + } + + return { + voice_available: true, + provider_status: "minted", + endpoint: trimString(decoded && decoded.endpoint), + transport: trimString(decoded && decoded.transport) || "api_dos_ai_voice_session", + ephemeral_token: token, + latency_ms: elapsedSince(startedAtMs) + }; +} + +function extractDosAiVoiceSessionToken(decoded: any): string { + return firstNonEmpty( + decoded && decoded.ephemeral_token, + decoded && decoded.session_token, + decoded && decoded.token, + decoded && decoded.client_secret && decoded.client_secret.value, + decoded && decoded.client_secret + ); +} + +function voiceSessionUnavailableResponse(target: any, descriptor: any, reason: string, provider: string): any { + return { + voice_available: false, + provider: provider, + requires_ephemeral_token: true, + reason: reason, + actor_id: target.actor_id, + body_id: target.body_id, + conversation_session_id: descriptor.conversation_session_id, + voice_profile: target.voice_profile, + session: { + session_id: descriptor.session_id, + expires_at_ms: descriptor.expires_at_ms, + ttl_seconds: descriptor.ttl_seconds, + audience: "unity_voice_playback", + transport: "none", + endpoint: "", + ephemeral_token: "", + lip_sync_tiers: descriptor.lip_sync_tiers, + presentation_only: true, + authority_note: "Voice unavailable. Unity must keep validated text and text-timed speaking fallback." + }, + debug: { + source: "nakama_voice_session_rpc", + provider_status: reason, + fallback_mode: target.voice_profile.fallback_mode + } + }; +} + +function isVoiceSessionEnabled(ctx: nkruntime.Context): boolean { + var enabled = lowercase(ctx.env["DOS_AI_VOICE_SESSIONS_ENABLED"]); + return enabled === "true" || enabled === "1" || enabled === "yes"; +} + +function dosAiVoiceSessionEndpoint(ctx: nkruntime.Context): string { + var explicitEndpoint = trimString(ctx.env["DOS_AI_VOICE_SESSION_URL"]); + if (explicitEndpoint) { + return explicitEndpoint; + } + var baseUrl = trimTrailingSlash(ctx.env["DOS_AI_BASE_URL"] || ""); + return baseUrl ? baseUrl + "/voice/sessions" : ""; +} + +function voiceSessionTtlSeconds(ctx: nkruntime.Context, request: any): number { + var configured = finiteNumberOrDefault( + request.ttl_seconds || ctx.env["DOS_AI_VOICE_SESSION_TTL_SECONDS"], + dosAiVoiceSessionDefaultTtlSeconds + ); + return Math.floor(clampNumber(configured, 15, dosAiVoiceSessionMaxTtlSeconds)); +} + +function normalizeVoicePlaybackMode(value: any): string { + var mode = sanitizeQuestIdentifier(trimString(value), "tts").replace(/-/g, "_"); + return mode === "tts" || mode === "voice_preview" || mode === "audio_amplitude" || mode === "viseme" + ? mode + : "tts"; +} + +function filterVoiceLipSyncTiers(values: string[]): string[] { + var allowed = ["text_timed", "audio_amplitude_hook", "provider_viseme_hook"]; + var result: string[] = []; + for (var index = 0; index < values.length; index += 1) { + var tier = sanitizeQuestIdentifier(values[index], "").replace(/-/g, "_"); + if (arrayContains(allowed, tier) && !arrayContains(result, tier)) { + result.push(tier); + } + } + return result.length > 0 ? result : ["text_timed", "audio_amplitude_hook"]; +} + +function logVoiceSession(logger: nkruntime.Logger, ownerId: string, target: any, response: any, outcome: string): void { + logStructuredInfo(logger, "voice_session", { + owner_id: ownerId, + actor_id: target.actor_id, + body_id: target.body_id, + provider: trimString(response && response.provider), + voice_available: !!(response && response.voice_available), + reason: trimString(response && response.reason), + outcome: outcome, + voice_profile_id: trimString(target && target.voice_profile && target.voice_profile.profile_id) + }); +} + function tryDosAiAgentDecision( ctx: nkruntime.Context, logger: nkruntime.Logger, diff --git a/backend/nakama/tests/supabase_custom_auth.test.mjs b/backend/nakama/tests/supabase_custom_auth.test.mjs index 9478888c..eb1fcdd8 100644 --- a/backend/nakama/tests/supabase_custom_auth.test.mjs +++ b/backend/nakama/tests/supabase_custom_auth.test.mjs @@ -165,7 +165,7 @@ assert.equal( const harness = createRuntimeHarness(module); assert.equal(harness.registeredHooks.length, 1); -assert.equal(harness.registeredRpcs.size, 49); +assert.equal(harness.registeredRpcs.size, 50); assert.ok(harness.registeredRpcs.has("secondspawn_health")); assert.ok(harness.registeredRpcs.has("secondspawn_profile_get")); assert.ok(harness.registeredRpcs.has("secondspawn_memory_add")); @@ -174,6 +174,7 @@ assert.ok(harness.registeredRpcs.has("secondspawn_agent_decide")); assert.ok(harness.registeredRpcs.has("secondspawn_agent_policy_update")); assert.ok(harness.registeredRpcs.has("secondspawn_agent_return_report")); assert.ok(harness.registeredRpcs.has("secondspawn_agent_activity_add")); +assert.ok(harness.registeredRpcs.has("secondspawn_voice_session_request")); assert.ok(harness.registeredRpcs.has("secondspawn_actor_profile_get")); assert.ok(harness.registeredRpcs.has("secondspawn_actor_memory_add")); assert.ok(harness.registeredRpcs.has("secondspawn_actor_memory_query")); @@ -216,6 +217,99 @@ assert.ok(harness.registeredRpcs.has("secondspawn_inventory_salvage")); assert.ok(harness.registeredRpcs.has("secondspawn_inventory_use")); assert.ok(harness.registeredRpcs.has("secondspawn_run_loot_claim")); +const disabledVoiceSession = JSON.parse(harness.registeredRpcs.get("secondspawn_voice_session_request")( + { userId: "normal-player", env: defaultRuntimeEnv }, + harness.logger, + harness.nk, + JSON.stringify({ + actor_id: "npc-synthetic-sentinel-0101", + conversation_session_id: "conversation-smoke", + ttl_seconds: 999, + lip_sync_tiers: ["provider_viseme_hook", "bad-tier"] + }) +)); +assert.equal(disabledVoiceSession.voice_available, false); +assert.equal(disabledVoiceSession.provider, "text_only"); +assert.equal(disabledVoiceSession.reason, "voice_sessions_disabled"); +assert.equal(disabledVoiceSession.actor_id, "npc-synthetic-sentinel-0101"); +assert.equal(disabledVoiceSession.conversation_session_id, "conversation-smoke"); +assert.equal(disabledVoiceSession.session.ttl_seconds, 300); +assert.equal(disabledVoiceSession.session.ephemeral_token, ""); +assert.deepEqual(disabledVoiceSession.session.lip_sync_tiers, ["provider_viseme_hook"]); +assert.ok(disabledVoiceSession.voice_profile.profile_id.startsWith("voice:npc-synthetic-sentinel-0101")); +assert.doesNotMatch(JSON.stringify(disabledVoiceSession), /test-internal-secret|test-admin-secret|DOS_AI_API_KEY/i); + +const enabledWithoutKeyEnv = { + ...defaultRuntimeEnv, + DOS_AI_VOICE_SESSIONS_ENABLED: "true", + DOS_AI_BASE_URL: "https://api.dos.ai/v1" +}; +const enabledWithoutKeyHarness = createRuntimeHarness(module, enabledWithoutKeyEnv); +const noKeyVoiceSession = JSON.parse(enabledWithoutKeyHarness.registeredRpcs.get("secondspawn_voice_session_request")( + { userId: "normal-player", env: enabledWithoutKeyEnv }, + enabledWithoutKeyHarness.logger, + enabledWithoutKeyHarness.nk, + JSON.stringify({ actor_id: "npc-wasteland-courier-0244" }) +)); +assert.equal(noKeyVoiceSession.voice_available, false); +assert.equal(noKeyVoiceSession.reason, "dos_ai_unconfigured"); + +const enabledVoiceEnv = { + ...defaultRuntimeEnv, + DOS_AI_VOICE_SESSIONS_ENABLED: "true", + DOS_AI_API_KEY: "test-dos-ai-key", + DOS_AI_VOICE_SESSION_URL: "https://api.dos.ai/v1/voice/sessions" +}; +const enabledVoiceHarness = createRuntimeHarness(module, enabledVoiceEnv); +let voiceRequestBody = null; +enabledVoiceHarness.nk.httpRequest = (url, method, headers, body, timeout) => { + voiceRequestBody = JSON.parse(body); + assert.equal(url, "https://api.dos.ai/v1/voice/sessions"); + assert.equal(method, "post"); + assert.equal(headers.authorization, "Bearer test-dos-ai-key"); + assert.equal(timeout, 8000); + return { + code: 200, + body: JSON.stringify({ + ephemeral_token: "voice-session-token", + endpoint: "wss://voice.example/session", + transport: "realtime" + }) + }; +}; +const mintedVoiceSession = JSON.parse(enabledVoiceHarness.registeredRpcs.get("secondspawn_voice_session_request")( + { userId: "normal-player", env: enabledVoiceEnv }, + enabledVoiceHarness.logger, + enabledVoiceHarness.nk, + JSON.stringify({ + actor_id: "npc-synthetic-sentinel-0101", + conversation_session_id: "conversation-live", + line_id: "line-1", + playback_mode: "voice_preview", + ttl_seconds: 45 + }) +)); +assert.equal(mintedVoiceSession.voice_available, true); +assert.equal(mintedVoiceSession.provider, "api_dos_ai_voice"); +assert.equal(mintedVoiceSession.session.ephemeral_token, "voice-session-token"); +assert.equal(mintedVoiceSession.session.presentation_only, true); +assert.equal(mintedVoiceSession.session.transport, "realtime"); +assert.equal(mintedVoiceSession.session.endpoint, "wss://voice.example/session"); +assert.equal(voiceRequestBody.actor_id, "npc-synthetic-sentinel-0101"); +assert.equal(voiceRequestBody.line_id, "line-1"); +assert.equal(voiceRequestBody.playback_mode, "voice_preview"); +assert.deepEqual(voiceRequestBody.forbidden_state_mutations, [ + "memory", + "relationship", + "quest", + "TIME", + "SECOND", + "inventory", + "combat", + "body_lifecycle" +]); +assert.doesNotMatch(JSON.stringify(mintedVoiceSession), /test-dos-ai-key|DOS_AI_API_KEY/i); + assert.throws( () => harness.registeredRpcs.get("secondspawn_npc_seed")( { userId: "normal-player", env: defaultRuntimeEnv }, diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index b0305b3a..7452d4a1 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -370,9 +370,14 @@ Current fields: | `fallback_mode` | Text fallback behavior when voice is disabled or unavailable. | The profile is normalized server-side for player-inhabited bodies and permanent -NPCs so each body keeps a consistent voice surface across sessions. TTS -playback, voice vendor selection, and ephemeral token minting remain separate -future work. +NPCs so each body keeps a consistent voice surface across sessions. +`secondspawn_voice_session_request` is the first scoped session boundary for +future TTS. It is disabled by default, mints only short-lived `api.dos.ai` +playback material when configured, and returns a text fallback reason when +voice is unavailable. The session response is presentation-only: it cannot +write memory, relationship, quest, TIME, SECOND, inventory, combat, or body +lifecycle state. Convai remains an isolated phase 1 spike lane for one boss or +hub NPC until cost, latency, reliability, and stable voice identity are proven. #### NPC Society Event Path diff --git a/docs/design/37-ai-npc-backend-client-roadmap.md b/docs/design/37-ai-npc-backend-client-roadmap.md index 84a1eeeb..bc66fdb2 100644 --- a/docs/design/37-ai-npc-backend-client-roadmap.md +++ b/docs/design/37-ai-npc-backend-client-roadmap.md @@ -193,6 +193,11 @@ Client features: ### C2: Focused NPC Dialogue +Implementation status: text-first focused dialogue is the default. The +voice-session lane now has a scoped Nakama request boundary for future +`api.dos.ai` playback material, but voice, TTS, Convai, and provider visemes +must not block focused dialogue delivery. + - Keep player and NPC locked into dialogue state until exit. - Use bottom RPG-style dialogue panel for 1:1 conversations. - Align player lines and NPC lines clearly. @@ -355,6 +360,7 @@ Use that file when starting work on `#132`, `#133`, `#134`, `#135`, `#137`, | [#139](https://github.com/DOS/Second-Spawn/issues/139) | Unity / AI Agent | Focused and ambient NPC dialogue presentation. | | [#140](https://github.com/DOS/Second-Spawn/issues/140) | Unity / AI Agent / Docs | AI NPC debug tools and Play Mode verification checklist. | | [#249](https://github.com/DOS/Second-Spawn/issues/249) | AI Agent / Model Service | Role-play provider bake-off behind `api.dos.ai`, starting with Alibaba Qwen-Character as an R&D candidate. | +| [#262](https://github.com/DOS/Second-Spawn/issues/262) | Nakama / Unity / AI Agent | Scoped NPC voice session and Convai decision lane. | ### Client, Gameplay, And DevOps Backlog