
Commit 417b7c8

fix: evict Ollama models between test modules to prevent memory starvation (#804)
* fix: evict Ollama models between test modules to prevent memory starvation (#798)

  Add per-module Ollama model eviction to test/conftest.py. When pytest crosses a file
  boundary between Ollama-marked tests, all loaded models are discovered via /api/ps
  and evicted with keep_alive=0. This prevents heavyweight models from accumulating in
  memory across the test suite. Covers both test/ and docs/examples/ without requiring
  --group-by-backend.

* fix: revert unrelated examples README changes

  Restore docs/examples/README.md to match main — the original command and heading
  were correct.

* fix: add missing ollama markers to test files

  test_streaming_sync_functions.py and test_computed_model_output_thunk.py call
  start_session() (Ollama backend) but lacked pytest.mark.ollama, so the per-module
  eviction hook never fired for them. The inert `# pytest: ollama` comment is only
  parsed for docs/examples/.

* fix: evict Ollama models after example tests (#798)

  Examples run as isolated subprocesses. Ollama's default keep_alive keeps models
  resident after exit, starving later examples of memory. Add teardown hook to evict
  after every ollama-marked example.
1 parent 11e146b commit 417b7c8

5 files changed: 144 additions & 8 deletions

docs/examples/conftest.py

Lines changed: 50 additions & 0 deletions
@@ -603,6 +603,56 @@ def pytest_runtest_setup(item):
         )


+def pytest_runtest_teardown(item, nextitem):
+    """Evict Ollama models after each ollama-marked example.
+
+    Examples run as subprocesses, so Ollama's default keep_alive keeps
+    models resident after exit. Evict after every example to prevent
+    heavyweight models from starving subsequent examples of memory (#798).
+    """
+    if not isinstance(item, ExampleItem):
+        return
+    if not item.get_closest_marker("ollama"):
+        return
+
+    _evict_ollama_models()
+
+
+def _evict_ollama_models() -> None:
+    """Evict all currently loaded Ollama models (best-effort)."""
+    import requests
+
+    host = os.environ.get("OLLAMA_HOST", "127.0.0.1")
+    if ":" in host:
+        host, port = host.rsplit(":", 1)
+    else:
+        port = os.environ.get("OLLAMA_PORT", "11434")
+
+    if host == "0.0.0.0":
+        host = "127.0.0.1"
+
+    base_url = f"http://{host}:{port}"
+
+    try:
+        resp = requests.get(f"{base_url}/api/ps", timeout=5)
+        resp.raise_for_status()
+        loaded = resp.json().get("models", [])
+    except Exception:
+        return
+
+    for entry in loaded:
+        model_name = entry.get("name") or entry.get("model", "unknown")
+        try:
+            requests.post(
+                f"{base_url}/api/generate",
+                json={"model": model_name, "keep_alive": 0},
+                timeout=10,
+            )
+            print(f"ollama-evict: evicted {model_name}", file=sys.stderr)
+        except Exception:
+            pass
+
+
 def pytest_collection_modifyitems(items):
     """Apply markers from example files to ExampleItem objects.
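For context on how an example opts in: per the commit message, examples under docs/examples/ declare markers through an inert `# pytest: ollama` comment that collection parses onto the ExampleItem. A hypothetical example file (name and body invented for illustration) would carry it like this:

    """Hypothetical example script exercising an Ollama-backed model."""

    # pytest: ollama, llm

    # ...example body elided; the file runs as an isolated subprocess under
    # pytest, and the teardown hook above evicts whatever models it loaded.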

test/README.md

Lines changed: 25 additions & 4 deletions
@@ -19,10 +19,31 @@ uv run pytest -m slow

 - `CICD=1` - Enable CI mode (skips qualitative tests)
 - `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True` - Helps with GPU memory fragmentation
-- `OLLAMA_KEEP_ALIVE=1m` - Reduce Ollama model idle window from the default 5 minutes to 1 minute.
-  Useful when running without `--group-by-backend`: limits how long a loaded Ollama model occupies
-  unified memory while HF/torch tests are running. Has no effect mid-run (timer resets per request),
-  but reduces the overlap window when switching between backend groups.
+
+## Ollama Model Eviction
+
+When pytest orchestrates many Ollama-backed tests in sequence, the default 5-minute
+keep-alive means models from earlier tests stay resident and accumulate, eventually
+starving later tests of memory.
+
+Two mechanisms in `test/conftest.py` handle this:
+
+- **Per-module eviction** (`pytest_runtest_teardown`) — when crossing a file
+  boundary between Ollama-marked tests, queries `/api/ps` for all loaded models
+  and evicts them with `keep_alive=0`. Covers both `test/` and `docs/examples/`.
+  Always active, no flags required.
+- **Group warm-up/eviction** (`pytest_runtest_setup`) — warms up a fixed set of CI
+  models (`keep_alive=-1`) when entering the Ollama backend group and evicts them
+  when leaving. Requires `--group-by-backend`.
+
+**Trade-off:** if two consecutive test files use the same model, it will be unloaded
+and reloaded (~5-15 s overhead). Predictable memory behaviour is more important
+than saving a reload, especially on constrained CI runners. Tests within a single
+file share the loaded model with no overhead.
+
+**Caveat:** eviction targets *all* loaded Ollama models, not just those loaded by
+the test. If you are using Ollama interactively while the suite runs, your model
+will be evicted between test modules.

 ## GPU Testing on CUDA Systems
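The eviction protocol the README describes can also be exercised by hand. A minimal sketch (not part of this commit) using the same public Ollama HTTP endpoints the helpers call, assuming the default host:

    import requests

    BASE = "http://127.0.0.1:11434"  # default Ollama endpoint; adjust if OLLAMA_HOST is set

    # List the models currently resident in memory.
    loaded = requests.get(f"{BASE}/api/ps", timeout=5).json().get("models", [])

    for entry in loaded:
        # An empty generate request with keep_alive=0 asks Ollama to unload
        # the model as soon as the request completes.
        requests.post(
            f"{BASE}/api/generate",
            json={"model": entry["name"], "keep_alive": 0},
            timeout=10,
        )
        print(f"evicted {entry['name']}")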

test/conftest.py

Lines changed: 61 additions & 0 deletions
@@ -549,6 +549,20 @@ def pytest_runtest_setup(item):
     # to prevent fixture setup errors


+def pytest_runtest_teardown(item, nextitem):
+    """Evict Ollama models when crossing a module boundary.
+
+    Prevents models from accumulating across test files while avoiding
+    redundant unload/reload within a single module (where tests typically
+    share a model). Also evicts after the very last test.
+    """
+    if not item.get_closest_marker("ollama"):
+        return
+
+    if nextitem is None or nextitem.path != item.path:
+        evict_ollama_models()
+
+
 def memory_cleaner():
     """Lightweight memory cleanup — safety net for per-test GPU leaks."""
     yield
@@ -566,6 +580,53 @@ def memory_cleaner():
         pass


+def evict_ollama_models() -> None:
+    """Evict all currently loaded Ollama models to free memory.
+
+    Queries /api/ps to discover loaded models, then sends keep_alive=0
+    to each via /api/generate. Prevents heavyweight models from starving
+    subsequent tests of memory (see #798).
+
+    Best-effort: errors are logged but never raised.
+    """
+    logger = FancyLogger.get_logger()
+
+    # Parse OLLAMA_HOST which may be "host", "host:port", or absent.
+    host = os.environ.get("OLLAMA_HOST", "127.0.0.1")
+    if ":" in host:
+        host, port = host.rsplit(":", 1)
+    else:
+        port = os.environ.get("OLLAMA_PORT", "11434")
+
+    if host == "0.0.0.0":
+        host = "127.0.0.1"
+
+    base_url = f"http://{host}:{port}"
+
+    try:
+        resp = requests.get(f"{base_url}/api/ps", timeout=5)
+        resp.raise_for_status()
+        loaded = resp.json().get("models", [])
+    except Exception as e:
+        logger.warning("ollama-evict: could not query loaded models: %s", e)
+        return
+
+    if not loaded:
+        return
+
+    for entry in loaded:
+        model_name = entry.get("name") or entry.get("model", "unknown")
+        try:
+            requests.post(
+                f"{base_url}/api/generate",
+                json={"model": model_name, "keep_alive": 0},
+                timeout=10,
+            )
+            logger.info("ollama-evict: evicted %s", model_name)
+        except Exception as e:
+            logger.warning("ollama-evict: failed to evict %s: %s", model_name, e)
+
+
 @pytest.fixture(autouse=True, scope="session")
 def normalize_ollama_host():
     """Normalize OLLAMA_HOST to work with client libraries.

test/core/test_computed_model_output_thunk.py

Lines changed: 6 additions & 2 deletions
@@ -1,7 +1,5 @@
 """Tests for ComputedModelOutputThunk."""

-# pytest: ollama, llm
-
 import pytest

 from mellea.core import ComputedModelOutputThunk, ModelOutputThunk
@@ -72,6 +70,8 @@ def test_computed_thunk_with_parsed_repr():
     assert computed_thunk.parsed_repr == "parsed value"


+@pytest.mark.ollama
+@pytest.mark.e2e
 def test_sync_functions_return_computed_thunks():
     """Test that synchronous session functions return ComputedModelOutputThunk."""
     with start_session() as session:
@@ -83,6 +83,8 @@ def test_sync_functions_return_computed_thunks():
     assert result.value is not None


+@pytest.mark.ollama
+@pytest.mark.e2e
 def test_sync_functions_with_sampling_return_computed_thunks():
     """Test that synchronous functions with sampling return ComputedModelOutputThunk."""
     from mellea.stdlib.sampling import RejectionSamplingStrategy
@@ -98,6 +100,8 @@ def test_sync_functions_with_sampling_return_computed_thunks():
     assert result.value is not None


+@pytest.mark.ollama
+@pytest.mark.e2e
 async def test_async_functions_return_computed_thunks():
     """Test that async session functions return ComputedModelOutputThunk when await_result=True."""
     with start_session() as session:

test/core/test_streaming_sync_functions.py

Lines changed: 2 additions & 2 deletions
@@ -1,11 +1,11 @@
 """Tests for streaming support using async functions with await_result parameter."""

-# pytest: ollama, llm
-
 import asyncio

 import pytest

+pytestmark = [pytest.mark.ollama, pytest.mark.e2e]
+
 from mellea.core import ComputedModelOutputThunk, ModelOutputThunk
 from mellea.stdlib.session import start_session

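As a usage note, a module-level `pytestmark` list applies every listed marker to each test collected from the file, so the two added lines are equivalent to decorating every test individually. A minimal illustration (test body hypothetical):

    import pytest

    pytestmark = [pytest.mark.ollama, pytest.mark.e2e]


    def test_example():
        # Collected as if decorated with @pytest.mark.ollama and @pytest.mark.e2e,
        # so the per-module eviction hook fires once this file's tests finish.
        assert True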
