
Commit 2f74853

fix: huggingface memory leak (#544)
* adding logic to cleanup on evict in cache
* implement cleanup logic for KV cache eviction to free GPU memory
* adding uuid for cache key
* reverting changes to precommit
* adding scores to cache and removing it from the constructor
* adding more robust type checking to run hf tests
* suppressing code cov output to stdout to make tests more readable
* small fix
* removing cov-report from subprocesses
* setting lru cache to 0 for now; till we figure out block_attention and dynamic LRU size
* removing return_scores from docs

---------

Co-authored-by: Nathan Fulton <nathan@ibm.com>
1 parent 5ac4b2f commit 2f74853

4 files changed

Lines changed: 114 additions & 21 deletions


mellea/backends/cache.py

Lines changed: 16 additions & 6 deletions
@@ -2,6 +2,7 @@

 import abc
 from collections import OrderedDict
+from collections.abc import Callable
 from typing import Any


@@ -11,12 +12,12 @@ class Cache(abc.ABC):
     # Whenever PEP 695 generics are supported by mypy, we should use them here.

     @abc.abstractmethod
-    def put(self, key: str, value: Any):
+    def put(self, key: str | int, value: Any):
         """Inserts into the cache. May result in eviction of other cached values."""
         ...

     @abc.abstractmethod
-    def get(self, key: str) -> Any | None:
+    def get(self, key: str | int) -> Any | None:
         """Retrieves a value from the cache. Returns `None` if the `id` has no cached value. May impact which cache values are evicted."""
         ...

@@ -29,19 +30,25 @@ def current_size(self) -> int:
 class SimpleLRUCache(Cache):
     """A simple [LRU](https://en.wikipedia.org/wiki/Cache_replacement_policies#Least_Recently_Used_(LRU)) cache."""

-    def __init__(self, capacity: int):
+    def __init__(self, capacity: int, on_evict: Callable[[Any], None] | None = None):
         """Initializes the LRU cache with a certain capacity.

         The `SimpleLRUCache` either contains a value or it doesn't. There is no cache hierarchy. Take care when choosing `capacity`. In practice usually a small value will be fine, but ideally you should try to choose a capacity based upon your available device memory and the context size of your model.
+
+        Args:
+            capacity: Maximum number of items to store in the cache.
+            on_evict: Optional callback function called when an item is evicted from the cache.
+                This can be used to free resources (e.g., GPU memory) when items are removed.
         """
         self.capacity = capacity
         self.cache: OrderedDict = OrderedDict()
+        self.on_evict = on_evict

     def current_size(self):
         """Just return the size of the key set. This isn't necessarily safe."""
         return len(self.cache.keys())

-    def get(self, key: str) -> Any | None:
+    def get(self, key: str | int) -> Any | None:
         """Gets a value from the cache."""
         if key not in self.cache:
             return None
@@ -51,13 +58,16 @@ def get(self, key: str) -> Any | None:
         self.cache[key] = value
         return value

-    def put(self, key: str, value: Any):
+    def put(self, key: str | int, value: Any):
         """Put a value into the cache."""
         if key in self.cache:
             # If the key exists, move it to the end (most recent)
             self.cache.pop(key)
         elif len(self.cache) >= self.capacity:
             # If the cache is full, remove the least recently used item
-            self.cache.popitem(last=False)
+            _evicted_key, evicted_value = self.cache.popitem(last=False)
+            # Call eviction callback if provided (e.g., to free GPU memory)
+            if self.on_evict is not None:
+                self.on_evict(evicted_value)
         # Add the new key-value pair to the end (most recent)
         self.cache[key] = value

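A minimal, self-contained sketch of how the new on_evict hook fires when SimpleLRUCache overflows; the free_gpu_value callback and the string payloads are illustrative stand-ins, not part of this commit:

from typing import Any

from mellea.backends.cache import SimpleLRUCache


def free_gpu_value(value: Any) -> None:
    # In the backend this role is played by _cleanup_kv_cache; here we just observe the eviction.
    print(f"evicted: {value!r}")


cache = SimpleLRUCache(capacity=2, on_evict=free_gpu_value)
cache.put("a", "kv-a")
cache.put("b", "kv-b")
cache.put("c", "kv-c")  # over capacity: "kv-a" is evicted and handed to free_gpu_value
assert cache.get("a") is None
assert cache.get("b") == "kv-b"
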
mellea/backends/huggingface.py

Lines changed: 92 additions & 13 deletions
@@ -82,10 +82,55 @@
 class HFAloraCacheInfo:
     """A dataclass for holding some KV cache and associated information."""

-    kv_cache: DynamicCache
+    kv_cache: DynamicCache | None
     merged_token_ids: Any
     merged_attention: Any
     q_end: int = -1
+    scores: Any = None
+
+
+def _cleanup_kv_cache(cache_info: HFAloraCacheInfo) -> None:
+    """Free GPU memory when KV cache is evicted from LRU.
+
+    This function is called by SimpleLRUCache when an entry is evicted.
+    It explicitly deletes tensor references and calls torch.cuda.empty_cache()
+    to return pooled CUDA memory to the device.
+
+    Args:
+        cache_info: The HFAloraCacheInfo being evicted from cache.
+    """
+    import gc
+
+    if cache_info is None:
+        return
+
+    kv = cache_info.kv_cache
+    if kv is not None:
+        # Delete individual tensors from each layer
+        if hasattr(kv, "key_cache"):
+            for tensor in kv.key_cache:
+                del tensor
+            kv.key_cache.clear()
+        if hasattr(kv, "value_cache"):
+            for tensor in kv.value_cache:
+                del tensor
+            kv.value_cache.clear()
+        del cache_info.kv_cache
+
+    # Delete other tensors
+    if cache_info.merged_attention is not None:
+        del cache_info.merged_attention
+
+    # Delete score tensors if present
+    if cache_info.scores is not None:
+        for tensor in cache_info.scores:
+            del tensor
+        del cache_info.scores
+
+    # Force Python garbage collection and return CUDA memory to device
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()


 # modified from VLLM v0.9.2 code base
@@ -244,7 +289,11 @@ def __init__(
         ), "vocab size mismatch between llguidance and huggingface tokenizers ... wtf?"

         self._use_caches = use_caches
-        self._cache = cache if cache is not None else SimpleLRUCache(3)
+        self._cache = (
+            cache
+            if cache is not None
+            else SimpleLRUCache(0, on_evict=_cleanup_kv_cache)
+        )

         # Adapters can be made known to the backend (added) and loaded.
         self._added_adapters: dict[str, LocalHFAdapter] = {}
@@ -877,7 +926,7 @@ async def _generate_from_context_standard(
             # Passed as args/kwargs to generate.
             input_ids,
             return_dict_in_generate=True,
-            output_scores=True,
+            use_cache=self._use_caches,  # Only create KV cache if caching is enabled
             **self._make_backend_specific_and_remove(generate_options),
             **streaming_kwargs,  # type: ignore
             **format_kwargs,  # type: ignore
@@ -941,7 +990,7 @@ async def processing(
             # and already decoded.
             if isinstance(chunk, str):
                 mot._underlying_value += chunk
-            else:
+            elif isinstance(chunk, GenerateDecoderOnlyOutput):
                 # Otherwise, it's a non-streaming request. Decode it here.
                 mot._meta["hf_output"] = chunk
                 mot._underlying_value += self._tokenizer.decode(
@@ -968,19 +1017,31 @@ async def post_processing(
         # The ModelOutputThunk must be computed by this point.
         assert mot.value is not None

-        # Add an entry to the cache for ALora reuse.
-        if self._use_caches and mot._meta.get("hf_output", None) is not None:
-            output_complete = mot._meta["hf_output"].sequences[0]
-            cache: DynamicCache = mot._meta["hf_output"].past_key_values  # type: ignore
+        # Store KV cache in LRU separately (not in mot._meta) to enable proper cleanup on eviction.
+        # This prevents GPU memory from being held by ModelOutputThunk references.
+        hf_output = mot._meta.get("hf_output", None)
+        if (
+            self._use_caches
+            and isinstance(hf_output, GenerateDecoderOnlyOutput)
+            and (hf_output.past_key_values is not None or hf_output.scores is not None)
+        ):
+            output_complete = hf_output.sequences[0]
+            kv_cache: DynamicCache | None = hf_output.past_key_values  # type: ignore

             cache_info = HFAloraCacheInfo(
-                kv_cache=cache,
+                kv_cache=kv_cache,
                 merged_token_ids=output_complete,
                 merged_attention=torch.ones_like(output_complete).to(self._device),
                 q_end=len(input_ids[0]),  # type: ignore
+                scores=hf_output.scores,
             )

-            self.cache_put(mot.value, cache_info)
+            cache_key = id(mot.value)
+            self.cache_put(cache_key, cache_info)
+
+            # Clear KV cache and scores from HF output - they're now owned by the LRU cache
+            hf_output.past_key_values = None
+            hf_output.scores = None

         # Only scan for tools if we are not doing structured output and tool calls were provided to the model.
         if _format is None and tool_calls:
@@ -1002,14 +1063,32 @@ async def post_processing(
         # HuggingFace local models don't typically provide token counts
         # but we can record response metadata if available
         hf_output = mot._meta.get("hf_output")
-        if hf_output is not None:
+        if isinstance(hf_output, GenerateDecoderOnlyOutput):
             record_response_metadata(span, hf_output)

         # Close the span now that async operation is complete
         end_backend_span(span)
         # Clean up span reference
         del mot._meta["_telemetry_span"]

+        # When caching is disabled, clear hf_output from meta to free GPU memory.
+        # The sequences tensor is on GPU and accumulates if not cleared.
+        if not self._use_caches and isinstance(
+            mot._meta.get("hf_output"), GenerateDecoderOnlyOutput
+        ):
+            import gc
+
+            hf_out = mot._meta["hf_output"]
+            if hasattr(hf_out, "sequences") and hf_out.sequences is not None:
+                del hf_out.sequences
+            if hasattr(hf_out, "scores") and hf_out.scores is not None:
+                del hf_out.scores
+            del mot._meta["hf_output"]
+
+            # Force Python GC and return CUDA memory to device
+            gc.collect()
+            torch.cuda.empty_cache()
+
         # Generate the log for this ModelOutputThunk.
         generate_log = GenerateLog()
         generate_log.prompt = conversation
@@ -1159,13 +1238,13 @@ async def generate_from_raw(
         return results

     # region cache management
-    def cache_get(self, id: str) -> HFAloraCacheInfo | None:
+    def cache_get(self, id: str | int) -> HFAloraCacheInfo | None:
         """Retrieve from cache."""
         v = self._cache.get(id)
         assert v is None or type(v) is HFAloraCacheInfo
         return v

-    def cache_put(self, id: str, v: HFAloraCacheInfo):
+    def cache_put(self, id: str | int, v: HFAloraCacheInfo):
         """Put into cache."""
         self._cache.put(id, v)

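Two notes on the backend changes above. The LRU key is now id(mot.value) rather than the output string itself, which is why Cache.put/Cache.get accept str | int; a lookup therefore requires the same object, not merely an equal string. A small sketch (the dict payload stands in for HFAloraCacheInfo):

from mellea.backends.cache import SimpleLRUCache

cache = SimpleLRUCache(capacity=2)

output_text = "generated response"  # stands in for mot.value
cache.put(id(output_text), {"q_end": 7})

assert cache.get(id(output_text)) is not None  # same object -> hit
lookalike = "".join(["generated", " ", "response"])
assert cache.get(id(lookalike)) is None  # equal string, different object -> miss

Also note the default cache is now SimpleLRUCache(0, on_evict=_cleanup_kv_cache); per the commit message this is a stop-gap until block_attention and a dynamic LRU size are worked out, and callers can still supply their own cache instance through the backend's cache argument.
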
test/backends/test_huggingface.py

Lines changed: 5 additions & 1 deletion
@@ -510,7 +510,11 @@ async def test_error_during_generate_with_lock(backend) -> None:
     b: LocalHFBackend = copy(backend)
     model = copy(b._model)
     b._model = model
-    b._model.set_adapter([])
+    try:
+        b._model.set_adapter([])
+    except ValueError as e:
+        if "No adapter loaded" not in str(e):
+            raise
     b._added_adapters = {}
     b._loaded_adapters = {}
     b.add_adapter(

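The try/except above tolerates PEFT/transformers versions that raise when clearing adapters on a model with none loaded. The same guard, extracted as a standalone helper for clarity; reset_adapters is illustrative and not part of the test suite:

def reset_adapters(model) -> None:
    """Deactivate all adapters, treating the 'No adapter loaded' case as a no-op."""
    try:
        model.set_adapter([])
    except ValueError as e:
        # Re-raise anything other than the benign "nothing to deactivate" error.
        if "No adapter loaded" not in str(e):
            raise
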
test/conftest.py

Lines changed: 1 addition & 1 deletion
@@ -244,7 +244,7 @@ def _run_heavy_modules_isolated(session, heavy_modules: list[str]) -> int:
     print("-" * 70)

     # Build pytest command with same options as parent session
-    cmd = [sys.executable, "-m", "pytest", module_path, "-v"]
+    cmd = [sys.executable, "-m", "pytest", module_path, "-v", "--no-cov"]

     # Add markers from original command if present
     config = session.config
