
Commit 417b7c8

fix: evict Ollama models between test modules to prevent memory starvation (#804)
* fix: evict Ollama models between test modules to prevent memory starvation (#798)

  Add per-module Ollama model eviction to test/conftest.py. When pytest crosses a file
  boundary between Ollama-marked tests, all loaded models are discovered via /api/ps
  and evicted with keep_alive=0. This prevents heavyweight models from accumulating in
  memory across the test suite. Covers both test/ and docs/examples/ without requiring
  --group-by-backend.

* fix: revert unrelated examples README changes

  Restore docs/examples/README.md to match main — the original command and heading
  were correct.

* fix: add missing ollama markers to test files

  test_streaming_sync_functions.py and test_computed_model_output_thunk.py call
  start_session() (Ollama backend) but lacked pytest.mark.ollama, so the per-module
  eviction hook never fired for them. The inert `# pytest: ollama` comment is only
  parsed for docs/examples/.

* fix: evict Ollama models after example tests (#798)

  Examples run as isolated subprocesses. Ollama's default keep_alive keeps models
  resident after exit, starving later examples of memory. Add teardown hook to evict
  after every ollama-marked example.
1 parent 11e146b commit 417b7c8

5 files changed: 144 additions & 8 deletions

docs/examples/conftest.py

Lines changed: 50 additions & 0 deletions
@@ -603,6 +603,56 @@ def pytest_runtest_setup(item):
         )


+def pytest_runtest_teardown(item, nextitem):
+    """Evict Ollama models after each ollama-marked example.
+
+    Examples run as subprocesses, so Ollama's default keep_alive keeps
+    models resident after exit. Evict after every example to prevent
+    heavyweight models from starving subsequent examples of memory (#798).
+    """
+    if not isinstance(item, ExampleItem):
+        return
+    if not item.get_closest_marker("ollama"):
+        return
+
+    _evict_ollama_models()
+
+
+def _evict_ollama_models() -> None:
+    """Evict all currently loaded Ollama models (best-effort)."""
+    import requests
+
+    host = os.environ.get("OLLAMA_HOST", "127.0.0.1")
+    if ":" in host:
+        host, port = host.rsplit(":", 1)
+    else:
+        port = os.environ.get("OLLAMA_PORT", "11434")
+
+    if host == "0.0.0.0":
+        host = "127.0.0.1"
+
+    base_url = f"http://{host}:{port}"
+
+    try:
+        resp = requests.get(f"{base_url}/api/ps", timeout=5)
+        resp.raise_for_status()
+        loaded = resp.json().get("models", [])
+    except Exception:
+        return
+
+    for entry in loaded:
+        model_name = entry.get("name") or entry.get("model", "unknown")
+        try:
+            requests.post(
+                f"{base_url}/api/generate",
+                json={"model": model_name, "keep_alive": 0},
+                timeout=10,
+            )
+            print(f"ollama-evict: evicted {model_name}", file=sys.stderr)
+        except Exception:
+            pass
+
+
 def pytest_collection_modifyitems(items):
     """Apply markers from example files to ExampleItem objects.
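For context on how an example opts in: per the commit message, examples under docs/examples/ declare markers through an inert `# pytest: ollama` comment that collection parses onto the ExampleItem. A hypothetical example file (name and body invented for illustration) would carry it like this:

    """Hypothetical example script exercising an Ollama-backed model."""

    # pytest: ollama, llm

    # ...example body elided; the file runs as an isolated subprocess under
    # pytest, and the teardown hook above evicts whatever models it loaded.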

test/README.md

Lines changed: 25 additions & 4 deletions
@@ -19,10 +19,31 @@ uv run pytest -m slow

 - `CICD=1` - Enable CI mode (skips qualitative tests)
 - `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True` - Helps with GPU memory fragmentation
-- `OLLAMA_KEEP_ALIVE=1m` - Reduce Ollama model idle window from the default 5 minutes to 1 minute.
-  Useful when running without `--group-by-backend`: limits how long a loaded Ollama model occupies
-  unified memory while HF/torch tests are running. Has no effect mid-run (timer resets per request),
-  but reduces the overlap window when switching between backend groups.
+
+## Ollama Model Eviction
+
+When pytest orchestrates many Ollama-backed tests in sequence, the default 5-minute
+keep-alive means models from earlier tests stay resident and accumulate, eventually
+starving later tests of memory.
+
+Two mechanisms in `test/conftest.py` handle this:
+
+- **Per-module eviction** (`pytest_runtest_teardown`) — when crossing a file
+  boundary between Ollama-marked tests, queries `/api/ps` for all loaded models
+  and evicts them with `keep_alive=0`. Covers both `test/` and `docs/examples/`.
+  Always active, no flags required.
+- **Group warm-up/eviction** (`pytest_runtest_setup`) — warms up a fixed set of CI
+  models (`keep_alive=-1`) when entering the Ollama backend group and evicts them
+  when leaving. Requires `--group-by-backend`.
+
+**Trade-off:** if two consecutive test files use the same model, it will be unloaded
+and reloaded (~5-15 s overhead). Predictable memory behaviour is more important
+than saving a reload, especially on constrained CI runners. Tests within a single
+file share the loaded model with no overhead.
+
+**Caveat:** eviction targets *all* loaded Ollama models, not just those loaded by
+the test. If you are using Ollama interactively while the suite runs, your model
+will be evicted between test modules.

 ## GPU Testing on CUDA Systems
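The eviction protocol the README describes can also be exercised by hand. A minimal sketch (not part of this commit) using the same public Ollama HTTP endpoints the helpers call, assuming the default host:

    import requests

    BASE = "http://127.0.0.1:11434"  # default Ollama endpoint; adjust if OLLAMA_HOST is set

    # List the models currently resident in memory.
    loaded = requests.get(f"{BASE}/api/ps", timeout=5).json().get("models", [])

    for entry in loaded:
        # An empty generate request with keep_alive=0 asks Ollama to unload
        # the model as soon as the request completes.
        requests.post(
            f"{BASE}/api/generate",
            json={"model": entry["name"], "keep_alive": 0},
            timeout=10,
        )
        print(f"evicted {entry['name']}")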

test/conftest.py

Lines changed: 61 additions & 0 deletions
@@ -549,6 +549,20 @@ def pytest_runtest_setup(item):
     # to prevent fixture setup errors


+def pytest_runtest_teardown(item, nextitem):
+    """Evict Ollama models when crossing a module boundary.
+
+    Prevents models from accumulating across test files while avoiding
+    redundant unload/reload within a single module (where tests typically
+    share a model). Also evicts after the very last test.
+    """
+    if not item.get_closest_marker("ollama"):
+        return
+
+    if nextitem is None or nextitem.path != item.path:
+        evict_ollama_models()
+
+
 def memory_cleaner():
     """Lightweight memory cleanup — safety net for per-test GPU leaks."""
     yield
@@ -566,6 +580,53 @@ def memory_cleaner():
         pass


+def evict_ollama_models() -> None:
+    """Evict all currently loaded Ollama models to free memory.
+
+    Queries /api/ps to discover loaded models, then sends keep_alive=0
+    to each via /api/generate. Prevents heavyweight models from starving
+    subsequent tests of memory (see #798).
+
+    Best-effort: errors are logged but never raised.
+    """
+    logger = FancyLogger.get_logger()
+
+    # Parse OLLAMA_HOST which may be "host", "host:port", or absent.
+    host = os.environ.get("OLLAMA_HOST", "127.0.0.1")
+    if ":" in host:
+        host, port = host.rsplit(":", 1)
+    else:
+        port = os.environ.get("OLLAMA_PORT", "11434")
+
+    if host == "0.0.0.0":
+        host = "127.0.0.1"
+
+    base_url = f"http://{host}:{port}"
+
+    try:
+        resp = requests.get(f"{base_url}/api/ps", timeout=5)
+        resp.raise_for_status()
+        loaded = resp.json().get("models", [])
+    except Exception as e:
+        logger.warning("ollama-evict: could not query loaded models: %s", e)
+        return
+
+    if not loaded:
+        return
+
+    for entry in loaded:
+        model_name = entry.get("name") or entry.get("model", "unknown")
+        try:
+            requests.post(
+                f"{base_url}/api/generate",
+                json={"model": model_name, "keep_alive": 0},
+                timeout=10,
+            )
+            logger.info("ollama-evict: evicted %s", model_name)
+        except Exception as e:
+            logger.warning("ollama-evict: failed to evict %s: %s", model_name, e)
+
+
 @pytest.fixture(autouse=True, scope="session")
 def normalize_ollama_host():
     """Normalize OLLAMA_HOST to work with client libraries.

test/core/test_computed_model_output_thunk.py

Lines changed: 6 additions & 2 deletions
@@ -1,7 +1,5 @@
 """Tests for ComputedModelOutputThunk."""

-# pytest: ollama, llm
-
 import pytest

 from mellea.core import ComputedModelOutputThunk, ModelOutputThunk
@@ -72,6 +70,8 @@ def test_computed_thunk_with_parsed_repr():
     assert computed_thunk.parsed_repr == "parsed value"


+@pytest.mark.ollama
+@pytest.mark.e2e
 def test_sync_functions_return_computed_thunks():
     """Test that synchronous session functions return ComputedModelOutputThunk."""
     with start_session() as session:
@@ -83,6 +83,8 @@ def test_sync_functions_return_computed_thunks():
     assert result.value is not None


+@pytest.mark.ollama
+@pytest.mark.e2e
 def test_sync_functions_with_sampling_return_computed_thunks():
     """Test that synchronous functions with sampling return ComputedModelOutputThunk."""
     from mellea.stdlib.sampling import RejectionSamplingStrategy
@@ -98,6 +100,8 @@ def test_sync_functions_with_sampling_return_computed_thunks():
     assert result.value is not None


+@pytest.mark.ollama
+@pytest.mark.e2e
 async def test_async_functions_return_computed_thunks():
     """Test that async session functions return ComputedModelOutputThunk when await_result=True."""
     with start_session() as session:

test/core/test_streaming_sync_functions.py

Lines changed: 2 additions & 2 deletions
@@ -1,11 +1,11 @@
 """Tests for streaming support using async functions with await_result parameter."""

-# pytest: ollama, llm
-
 import asyncio

 import pytest

+pytestmark = [pytest.mark.ollama, pytest.mark.e2e]
+
 from mellea.core import ComputedModelOutputThunk, ModelOutputThunk
 from mellea.stdlib.session import start_session

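As a usage note, a module-level `pytestmark` list applies every listed marker to each test collected from the file, so the two added lines are equivalent to decorating every test individually. A minimal illustration (test body hypothetical):

    import pytest

    pytestmark = [pytest.mark.ollama, pytest.mark.e2e]


    def test_example():
        # Collected as if decorated with @pytest.mark.ollama and @pytest.mark.e2e,
        # so the per-module eviction hook fires once this file's tests finish.
        assert True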
