Commit 417261b
fix: make system_fingerprint consistent in m serve

Added streaming support for setting system_fingerprint, so the field is now handled the same way on the streaming and non-streaming paths. It is currently always set to None, but the plumbing is consistent for future use.

Signed-off-by: Mark Sturdevant <mark.sturdevant@ibm.com>

1 parent ca5205e · commit 417261b

3 files changed: 74 additions & 5 deletions

cli/serve/app.py

Lines changed: 6 additions & 5 deletions
```diff
@@ -161,6 +161,11 @@ async def endpoint(request: ChatCompletionRequest):
         model_options=model_options,
     )
 
+    # system_fingerprint represents backend config hash, not model name
+    # The model name is already in response.model (line 73)
+    # Leave as None since we don't track backend config fingerprints yet
+    system_fingerprint = None
+
     # Handle streaming response
     if request.stream:
         return StreamingResponse(
@@ -170,15 +175,11 @@ async def endpoint(request: ChatCompletionRequest):
                 model=request.model,
                 created=created_timestamp,
                 stream_options=request.stream_options,
+                system_fingerprint=system_fingerprint,
             ),
             media_type="text/event-stream",
         )
 
-    # system_fingerprint represents backend config hash, not model name
-    # The model name is already in response.model (line 73)
-    # Leave as None since we don't track backend config fingerprints yet
-    system_fingerprint = None
-
     return ChatCompletion(
         id=completion_id,
         model=request.model,
```
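The comment notes that system_fingerprint is reserved for a hash of backend configuration. As a rough sketch of what could eventually replace the None (a hypothetical helper, not part of this commit; the config keys are invented):

```python
import hashlib
import json


def backend_config_fingerprint(config: dict) -> str:
    """Hypothetical helper: derive a stable fingerprint from backend config.

    Not part of this commit; it only illustrates what might replace the
    None value if backend configuration were tracked some day. The "fp_"
    prefix mirrors the style OpenAI uses for its published fingerprints.
    """
    canonical = json.dumps(config, sort_keys=True).encode("utf-8")
    return "fp_" + hashlib.sha256(canonical).hexdigest()[:12]


# Two servers running identical backend config would report the same value,
# which is what makes the field useful for detecting config drift.
print(backend_config_fingerprint({"backend": "ollama", "model": "granite"}))
```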

mellea/helpers/openai_compatible_helpers.py

Lines changed: 6 additions & 0 deletions
```diff
@@ -252,6 +252,7 @@ async def stream_chat_completion_chunks(
     model: str,
     created: int,
     stream_options: dict[str, Any] | None = None,
+    system_fingerprint: str | None = None,
 ) -> AsyncGenerator[str, None]:
     """Generate OpenAI-compatible SSE chat completion chunks from a model output.
 
@@ -263,6 +264,8 @@ async def stream_chat_completion_chunks(
         stream_options: OpenAI-compatible streaming options. Currently supports
             ``include_usage`` (bool) to control whether usage stats are included
             in the final chunk. Defaults to including usage when available.
+        system_fingerprint: Backend configuration fingerprint to include in chunks.
+            Defaults to ``None``.
 
     Yields:
         Server-sent event payload strings representing OpenAI-compatible chat
@@ -289,6 +292,7 @@ async def stream_chat_completion_chunks(
             )
         ],
         object="chat.completion.chunk",
+        system_fingerprint=system_fingerprint,
     )
     yield f"data: {initial_chunk.model_dump_json()}\n\n"
 
@@ -310,6 +314,7 @@ async def stream_chat_completion_chunks(
             )
         ],
         object="chat.completion.chunk",
+        system_fingerprint=system_fingerprint,
     )
     yield f"data: {chunk.model_dump_json()}\n\n"
 
@@ -333,6 +338,7 @@ async def stream_chat_completion_chunks(
             )
         ],
         object="chat.completion.chunk",
+        system_fingerprint=system_fingerprint,
     )
     yield f"data: {final_chunk.model_dump_json()}\n\n"
```

test/cli/test_serve_streaming.py

Lines changed: 62 additions & 0 deletions
```diff
@@ -628,6 +628,68 @@ async def mock_astream():
         assert last_chunk["usage"] is not None
         assert last_chunk["usage"]["total_tokens"] == 8
 
+    @pytest.mark.asyncio
+    async def test_streaming_system_fingerprint_always_none(
+        self, mock_module, streaming_request
+    ):
+        """Test that system_fingerprint is None in all streaming chunks.
+
+        Per OpenAI spec, system_fingerprint represents a hash of backend config,
+        not the model name. The model name is in chunk.model.
+        We don't track backend config fingerprints yet, so it should be None.
+        """
+        mock_output = ModelOutputThunk(None)
+        mock_output._computed = False
+        mock_output._generate_type = mock_output._generate_type.ASYNC
+
+        chunks = ["Hello", " world"]
+
+        async def mock_astream():
+            if chunks:
+                chunk = chunks.pop(0)
+                if not chunks:
+                    mock_output._computed = True
+                return chunk
+            mock_output._computed = True
+            return ""
+
+        mock_output.astream = mock_astream
+        mock_output.is_computed = lambda: mock_output._computed
+        mock_output.usage = {
+            "prompt_tokens": 5,
+            "completion_tokens": 2,
+            "total_tokens": 7,
+        }
+        mock_module.serve.return_value = mock_output
+
+        # Create test app
+        app = FastAPI()
+        app.add_api_route(
+            "/v1/chat/completions", make_chat_endpoint(mock_module), methods=["POST"]
+        )
+        client = TestClient(app)
+
+        # Make streaming request
+        response = client.post(
+            "/v1/chat/completions", json=streaming_request.model_dump(mode="json")
+        )
+
+        assert response.status_code == 200
+
+        # Parse all chunks
+        events = []
+        for line in response.text.strip().split("\n\n"):
+            if line.startswith("data: "):
+                data = line[6:]
+                if data != "[DONE]":
+                    events.append(json.loads(data))
+
+        # All chunks should have system_fingerprint as None
+        for chunk in events:
+            assert chunk["system_fingerprint"] is None
+            # Model name should be in the model field
+            assert chunk["model"] == "test-model"
+
     @pytest.mark.asyncio
     async def test_stream_options_ignored_for_non_streaming(self, mock_module):
         """Test that stream_options is ignored when stream=False (usage always included)."""
```
