Commit af25037

fix: do not post_process before finally in ModelOutputThunk.astream (#580)
* fix: do not post_process before finally in ModelOutputThunk.astream
* add test
* cleanup
* handle spans

Signed-off-by: Paul S. Schweigert <paul@paulschweigert.com>
1 parent dfc8942 commit af25037
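
The failure mode fixed here is plain Python semantics rather than anything mellea-specific: an exception raised while a `finally` block executes replaces the exception already in flight, so the caller sees the cleanup error instead of the original one. A minimal, standalone sketch of the old behavior (illustrative only, not code from this repo):

    async def broken_astream():
        try:
            raise ConnectionError("generation failed")  # the real backend error
        finally:
            # Before this fix, post_process ran here even after a failure.
            # It assumes success invariants, so it raises, and the caller now
            # catches RuntimeError instead of the original ConnectionError.
            raise RuntimeError("post_process failed")

The commit therefore removes the try/finally: the error path closes its telemetry span and re-raises the backend exception directly, and post_process runs only on the success path.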

2 files changed

Lines changed: 182 additions & 80 deletions

File tree

mellea/core/base.py

Lines changed: 76 additions & 80 deletions
@@ -282,88 +282,84 @@ async def astream(self) -> str:
             0 if self._underlying_value is None else len(str(self._underlying_value))
         )  # type: ignore
 
-        exception_to_raise = None
-        try:
-            # Type of the chunk depends on the backend.
-            chunks: list[Any | None] = []
-            while True:
-                try:
-                    item = self._async_queue.get_nowait()
-                    chunks.append(item)
-                except asyncio.QueueEmpty:
-                    # We've exhausted the current items in the queue.
-                    break
-
-            # Make sure we always get the minimum chunk size.
-            while len(chunks) <= self._chunk_size:
-                if len(chunks) > 0:
-                    if chunks[-1] is None or isinstance(chunks[-1], Exception):
-                        break  # Hit sentinel value or an error.
-                    # We could switch to relying on the `done` / `finish_reason` field of chunks,
-                    # but that forces us to know about the chunk type here. Prefer sentinel values
-                    # for now.
-
-                item = await self._async_queue.get()
+        # Type of the chunk depends on the backend.
+        chunks: list[Any | None] = []
+        while True:
+            try:
+                item = self._async_queue.get_nowait()
                 chunks.append(item)
-
-            # Process the sentinel value if it's there.
-            if chunks[-1] is None:
-                chunks.pop()  # Remove the sentinel value.
-                do_set_computed = True
-
-                # Shouldn't be needed, but cancel the Tasks this ModelOutputThunk relied on.
-                if self._generate is not None:
-                    self._generate.cancel()
-                if self._generate_extra is not None:
-                    # Covers an hf edge case. The task is done generating anything useful but isn't `done` yet.
-                    await self._generate_extra
-                    self._generate_extra.cancel()
-
-                # If ModelOutputThunks get too bulky, we can do additional cleanup here
-                # and set fields to None.
-
-            elif isinstance(chunks[-1], Exception):
-                # Mark as computed so post_process runs in finally block
-                self._computed = True
-                # Store exception to re-raise after cleanup
-                exception_to_raise = chunks[-1]
-
-            for chunk in chunks:
-                assert self._process is not None
-                await self._process(self, chunk)
-
-            if do_set_computed:
-                assert self._underlying_value is not None
-                self._computed = True
-        finally:
-            # Always call post_process if computed, even on exception
-            # This ensures telemetry spans are properly closed
-            if self._computed:
-                assert self._post_process is not None
-                await self._post_process(self)
-
-        # Only parse if no exception occurred
-        if exception_to_raise is None:
-            match self._action:
-                case Component():
-                    self.parsed_repr = self._action._parse(self)
-                case CBlock():
-                    assert self.value is not None, (
-                        "value must be non-None since this thunk is computed"
-                    )
-                    self.parsed_repr = self.value  # type: ignore
-                case _:
-                    raise ValueError(
-                        "attempted to astream from a model output thunk with no ._action set"
-                    )
-            assert self.parsed_repr is not None, (
-                "enforce constraint that a computed ModelOutputThunk has a non-None parsed_repr"
+            except asyncio.QueueEmpty:
+                # We've exhausted the current items in the queue.
+                break
+
+        # Make sure we always get the minimum chunk size.
+        while len(chunks) <= self._chunk_size:
+            if len(chunks) > 0:
+                if chunks[-1] is None or isinstance(chunks[-1], Exception):
+                    break  # Hit sentinel value or an error.
+                # We could switch to relying on the `done` / `finish_reason` field of chunks,
+                # but that forces us to know about the chunk type here. Prefer sentinel values
+                # for now.
+
+            item = await self._async_queue.get()
+            chunks.append(item)
+
+        # Process the sentinel value if it's there.
+        if chunks[-1] is None:
+            chunks.pop()  # Remove the sentinel value.
+            do_set_computed = True
+
+            # Shouldn't be needed, but cancel the Tasks this ModelOutputThunk relied on.
+            if self._generate is not None:
+                self._generate.cancel()
+            if self._generate_extra is not None:
+                # Covers an hf edge case. The task is done generating anything useful but isn't `done` yet.
+                await self._generate_extra
+                self._generate_extra.cancel()
+
+            # If ModelOutputThunks get too bulky, we can do additional cleanup here
+            # and set fields to None.
+
+        elif isinstance(chunks[-1], Exception):
+            # Close any open telemetry span before propagating the error.
+            # We can't call full post_process here (it assumes success invariants),
+            # but we must not leak the span.
+            span = self._meta.get("_telemetry_span")
+            if span is not None:
+                from ..telemetry import end_backend_span, set_span_error
+
+                set_span_error(span, chunks[-1])
+                end_backend_span(span)
+                del self._meta["_telemetry_span"]
+            raise chunks[-1]
+
+        for chunk in chunks:
+            assert self._process is not None
+            await self._process(self, chunk)
+
+        if do_set_computed:
+            assert self._underlying_value is not None
+            self._computed = True
+
+            assert self._post_process is not None
+            await self._post_process(self)
+
+            match self._action:
+                case Component():
+                    self.parsed_repr = self._action._parse(self)
+                case CBlock():
+                    assert self.value is not None, (
+                        "value must be non-None since this thunk is computed"
                     )
-            return self._underlying_value  # type: ignore
-
-        # Re-raise exception after cleanup if one occurred
-        if exception_to_raise is not None:
-            raise exception_to_raise
+                    self.parsed_repr = self.value  # type: ignore
+                case _:
+                    raise ValueError(
+                        "attempted to astream from a model output thunk with no ._action set"
+                    )
+            assert self.parsed_repr is not None, (
+                "enforce constraint that a computed ModelOutputThunk has a non-None parsed_repr"
+            )
+            return self._underlying_value  # type: ignore
 
         return (
            self._underlying_value
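
The new error branch calls `set_span_error` and `end_backend_span` from mellea's internal telemetry module; their implementations are not part of this diff. Judging from the mock assertions in the new test file below, they plausibly wrap the standard OpenTelemetry span API, roughly as in this sketch (an assumption, not the repo's actual code):

    from opentelemetry.trace import Span, Status, StatusCode

    def set_span_error(span: Span, exc: BaseException) -> None:
        # Attach the exception as a span event and mark the span as failed.
        span.record_exception(exc)
        span.set_status(Status(StatusCode.ERROR, str(exc)))

    def end_backend_span(span: Span) -> None:
        # End the span so it can be exported; a span left open leaks telemetry.
        span.end()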
Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
+"""Tests that exceptions during generation propagate correctly through ModelOutputThunk.astream().
+
+Regression test for issue #577: post_process in a finally block was swallowing
+the original generation exception by raising a secondary error from post_process
+(which assumes system invariants that don't hold during failures).
+"""
+
+import pytest
+
+from mellea.core.base import CBlock, GenerateType, ModelOutputThunk
+
+
+async def _noop_process(mot, chunk):
+    if mot._underlying_value is None:
+        mot._underlying_value = ""
+    mot._underlying_value += str(chunk)
+
+
+async def _failing_post_process(mot):
+    raise RuntimeError("post_process failed due to broken invariants")
+
+
+def _make_thunk(post_process=_failing_post_process):
+    mot = ModelOutputThunk(value=None)
+    mot._generate_type = GenerateType.ASYNC
+    mot._process = _noop_process
+    mot._post_process = post_process
+    mot._action = CBlock("test")
+    mot._chunk_size = 0
+    return mot
+
+
+@pytest.mark.parametrize(
+    "error",
+    [ValueError("connection reset by peer"), ConnectionError("server unavailable")],
+)
+async def test_astream_propagates_generation_exception(error):
+    """The original generation error must propagate, not a secondary error from post_process."""
+    mot = _make_thunk()
+    await mot._async_queue.put(error)
+
+    with pytest.raises(type(error), match=str(error)):
+        await mot.astream()
+
+
+async def test_astream_post_process_only_called_on_success():
+    """post_process must be called on success but not on error."""
+    post_process_called = False
+
+    async def _tracking_post_process(mot):
+        nonlocal post_process_called
+        post_process_called = True
+
+    # Error path: post_process should NOT be called
+    mot = _make_thunk(post_process=_tracking_post_process)
+    await mot._async_queue.put(RuntimeError("generation failed"))
+
+    with pytest.raises(RuntimeError, match="generation failed"):
+        await mot.astream()
+
+    assert not post_process_called, (
+        "post_process should not be called when generation fails"
+    )
+
+    # Success path: post_process SHOULD be called
+    post_process_called = False
+    mot = _make_thunk(post_process=_tracking_post_process)
+    await mot._async_queue.put("hello")
+    await mot._async_queue.put(None)  # sentinel for completion
+
+    await mot.astream()
+
+    assert post_process_called, "post_process should be called on successful completion"
+
+
+async def test_astream_closes_telemetry_span_on_error():
+    """Telemetry span must be ended and error recorded when generation fails."""
+    from unittest.mock import MagicMock
+
+    mock_span = MagicMock()
+    mot = _make_thunk()
+    mot._meta["_telemetry_span"] = mock_span
+
+    error = ConnectionError("server unavailable")
+    await mot._async_queue.put(error)
+
+    with pytest.raises(ConnectionError, match="server unavailable"):
+        await mot.astream()
+
+    # Span should have been ended and cleaned up
+    mock_span.record_exception.assert_called_once_with(error)
+    mock_span.set_status.assert_called_once()
+    mock_span.end.assert_called_once()
+    assert "_telemetry_span" not in mot._meta
+
+
+async def test_astream_no_span_leak_when_no_telemetry():
+    """When no telemetry span is present, error propagation still works."""
+    mot = _make_thunk()
+    assert "_telemetry_span" not in mot._meta
+
+    error = ValueError("test error")
+    await mot._async_queue.put(error)
+
+    with pytest.raises(ValueError, match="test error"):
+        await mot.astream()
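
These tests drive `astream()` through the same queue contract the streaming backends use: chunks are put on `_async_queue` as they arrive, a `None` sentinel marks successful completion, and putting the exception object itself signals failure. A compact illustration of the producer side (hypothetical backend, names invented for the example):

    import asyncio

    async def fake_backend(queue: asyncio.Queue) -> None:
        try:
            for chunk in ("hel", "lo"):  # pretend these stream from a model
                await queue.put(chunk)
            await queue.put(None)  # sentinel: generation finished cleanly
        except Exception as exc:
            await queue.put(exc)  # astream() re-raises this in the consumer

Note that the tests are bare `async def` functions, which assumes the repository configures an auto async mode for pytest (e.g. pytest-asyncio's `asyncio_mode = "auto"`).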
