Skip to content

Commit 3717cf9

Browse files
authored
test: misc test cleanup — improve assertions, add diagnostics, fix markers (#600)
- Strengthen test_generate_from_raw: check all results, add CONTEXT_WINDOW: 2048, timeout(150), and a diagnostic assertion message
- Increase MAX_NEW_TOKENS to 2**10 in format tests (prevents JSON truncation)
- Add FancyLogger.warning when generate_from_raw catches an exception
- Mark the researcher.py example as slow; add markers to query_clarification.py
- Update the slow marker description to ">1 minute"
1 parent a3f3f71 commit 3717cf9

6 files changed

Lines changed: 25 additions & 13 deletions

File tree

docs/examples/intrinsics/query_clarification.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# pytest: huggingface, requires_heavy_ram, llm
12
"""
23
Example usage of the query clarification intrinsic for RAG applications.
34

docs/examples/mini_researcher/researcher.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# pytest: ollama, qualitative, llm
1+
# pytest: ollama, qualitative, llm, slow
22

33
from collections.abc import Callable
44
from functools import cache

mellea/backends/ollama.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -470,6 +470,10 @@ async def generate_from_raw(
470470
result = None
471471
error = None
472472
if isinstance(response, BaseException):
473+
FancyLogger.get_logger().warning(
474+
f"generate_from_raw: request {i} failed with "
475+
f"{type(response).__name__}: {response}"
476+
)
473477
result = ModelOutputThunk(value="")
474478
error = response
475479
else:

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,7 @@ markers = [
259259
"requires_gpu: Tests requiring GPU",
260260
"requires_heavy_ram: Tests requiring 48GB+ RAM",
261261
"qualitative: Non-deterministic quality tests",
262-
"slow: Tests taking >5 minutes (e.g., dataset loading)",
262+
"slow: Tests taking >1 minute (e.g., multi-step pipelines like researcher)",
263263

264264
# Composite markers
265265
"llm: Tests that make LLM calls (needs at least Ollama)",

test/backends/test_ollama.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ class Email(pydantic.BaseModel):
8787
output = session.instruct(
8888
"Write a short email to Olivia, thanking her for organizing a sailing activity. Her email server is example.com. No more than two sentences. ",
8989
format=Email,
90-
model_options={ModelOption.MAX_NEW_TOKENS: 2**8},
90+
model_options={ModelOption.MAX_NEW_TOKENS: 2**10},
9191
)
9292
print("Formatted output:")
9393
email = Email.model_validate_json(
@@ -102,15 +102,20 @@ class Email(pydantic.BaseModel):
102102

103103

104104
@pytest.mark.qualitative
105+
@pytest.mark.timeout(150)
105106
async def test_generate_from_raw(session) -> None:
106107
prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"]
107108

108109
results = await session.backend.generate_from_raw(
109-
actions=[CBlock(value=prompt) for prompt in prompts], ctx=session.ctx
110+
actions=[CBlock(value=prompt) for prompt in prompts],
111+
ctx=session.ctx,
112+
model_options={ModelOption.CONTEXT_WINDOW: 2048},
110113
)
111114

112115
assert len(results) == len(prompts)
113-
assert results[0].value is not None
116+
assert all(r.value for r in results), (
117+
f"One or more requests returned empty (possible backend timeout): {[r.value for r in results]}"
118+
)
114119

115120

116121
@pytest.mark.xfail(reason="ollama sometimes fails generated structured outputs")
@@ -125,17 +130,19 @@ class Answer(pydantic.BaseModel):
125130
actions=[CBlock(value=prompt) for prompt in prompts],
126131
ctx=session.ctx,
127132
format=Answer,
133+
model_options={ModelOption.CONTEXT_WINDOW: 2048},
128134
)
129135

130136
assert len(results) == len(prompts)
137+
assert all(r.value for r in results), (
138+
f"One or more requests returned empty (possible backend timeout): {[r.value for r in results]}"
139+
)
131140

132-
random_result = results[0]
133-
try:
134-
Answer.model_validate_json(random_result.value)
135-
except pydantic.ValidationError as e:
136-
assert False, (
137-
f"formatting directive failed for {random_result.value}: {e.json()}"
138-
)
141+
for result in results:
142+
try:
143+
Answer.model_validate_json(result.value)
144+
except pydantic.ValidationError as e:
145+
assert False, f"formatting directive failed for {result.value}: {e.json()}"
139146

140147

141148
async def test_async_parallel_requests(session) -> None:

test/backends/test_openai_ollama.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ class Email(pydantic.BaseModel):
104104
output = m_session.instruct(
105105
"Write a short email to Olivia, thanking her for organizing a sailing activity. Her email server is example.com. No more than two sentences. ",
106106
format=Email,
107-
model_options={ModelOption.MAX_NEW_TOKENS: 2**8},
107+
model_options={ModelOption.MAX_NEW_TOKENS: 2**10},
108108
)
109109
print("Formatted output:")
110110
email = Email.model_validate_json(

0 commit comments

Comments (0)