Skip to content

Commit 3717cf9

Browse files
authored
test: misc test cleanup — improve assertions, add diagnostics, fix markers (#600)
- Strengthen test_generate_from_raw: check all results, add CONTEXT_WINDOW: 2048, timeout(150), and a diagnostic assertion message
- Increase MAX_NEW_TOKENS to 2**10 in format tests (prevents JSON truncation)
- Add FancyLogger.warning when generate_from_raw catches an exception
- Mark the researcher.py example as slow; add markers to query_clarification.py
- Update the slow marker description to ">1 minute"
1 parent a3f3f71 commit 3717cf9

6 files changed

Lines changed: 25 additions & 13 deletions

File tree

docs/examples/intrinsics/query_clarification.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# pytest: huggingface, requires_heavy_ram, llm
12
"""
23
Example usage of the query clarification intrinsic for RAG applications.
34

docs/examples/mini_researcher/researcher.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# pytest: ollama, qualitative, llm
1+
# pytest: ollama, qualitative, llm, slow
22

33
from collections.abc import Callable
44
from functools import cache

mellea/backends/ollama.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -470,6 +470,10 @@ async def generate_from_raw(
470470
result = None
471471
error = None
472472
if isinstance(response, BaseException):
473+
FancyLogger.get_logger().warning(
474+
f"generate_from_raw: request {i} failed with "
475+
f"{type(response).__name__}: {response}"
476+
)
473477
result = ModelOutputThunk(value="")
474478
error = response
475479
else:

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,7 @@ markers = [
259259
"requires_gpu: Tests requiring GPU",
260260
"requires_heavy_ram: Tests requiring 48GB+ RAM",
261261
"qualitative: Non-deterministic quality tests",
262-
"slow: Tests taking >5 minutes (e.g., dataset loading)",
262+
"slow: Tests taking >1 minute (e.g., multi-step pipelines like researcher)",
263263

264264
# Composite markers
265265
"llm: Tests that make LLM calls (needs at least Ollama)",

test/backends/test_ollama.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ class Email(pydantic.BaseModel):
8787
output = session.instruct(
8888
"Write a short email to Olivia, thanking her for organizing a sailing activity. Her email server is example.com. No more than two sentences. ",
8989
format=Email,
90-
model_options={ModelOption.MAX_NEW_TOKENS: 2**8},
90+
model_options={ModelOption.MAX_NEW_TOKENS: 2**10},
9191
)
9292
print("Formatted output:")
9393
email = Email.model_validate_json(
@@ -102,15 +102,20 @@ class Email(pydantic.BaseModel):
102102

103103

104104
@pytest.mark.qualitative
105+
@pytest.mark.timeout(150)
105106
async def test_generate_from_raw(session) -> None:
106107
prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"]
107108

108109
results = await session.backend.generate_from_raw(
109-
actions=[CBlock(value=prompt) for prompt in prompts], ctx=session.ctx
110+
actions=[CBlock(value=prompt) for prompt in prompts],
111+
ctx=session.ctx,
112+
model_options={ModelOption.CONTEXT_WINDOW: 2048},
110113
)
111114

112115
assert len(results) == len(prompts)
113-
assert results[0].value is not None
116+
assert all(r.value for r in results), (
117+
f"One or more requests returned empty (possible backend timeout): {[r.value for r in results]}"
118+
)
114119

115120

116121
@pytest.mark.xfail(reason="ollama sometimes fails generated structured outputs")
@@ -125,17 +130,19 @@ class Answer(pydantic.BaseModel):
125130
actions=[CBlock(value=prompt) for prompt in prompts],
126131
ctx=session.ctx,
127132
format=Answer,
133+
model_options={ModelOption.CONTEXT_WINDOW: 2048},
128134
)
129135

130136
assert len(results) == len(prompts)
137+
assert all(r.value for r in results), (
138+
f"One or more requests returned empty (possible backend timeout): {[r.value for r in results]}"
139+
)
131140

132-
random_result = results[0]
133-
try:
134-
Answer.model_validate_json(random_result.value)
135-
except pydantic.ValidationError as e:
136-
assert False, (
137-
f"formatting directive failed for {random_result.value}: {e.json()}"
138-
)
141+
for result in results:
142+
try:
143+
Answer.model_validate_json(result.value)
144+
except pydantic.ValidationError as e:
145+
assert False, f"formatting directive failed for {result.value}: {e.json()}"
139146

140147

141148
async def test_async_parallel_requests(session) -> None:

test/backends/test_openai_ollama.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ class Email(pydantic.BaseModel):
104104
output = m_session.instruct(
105105
"Write a short email to Olivia, thanking her for organizing a sailing activity. Her email server is example.com. No more than two sentences. ",
106106
format=Email,
107-
model_options={ModelOption.MAX_NEW_TOKENS: 2**8},
107+
model_options={ModelOption.MAX_NEW_TOKENS: 2**10},
108108
)
109109
print("Formatted output:")
110110
email = Email.model_validate_json(

0 commit comments

Comments (0)