
Commit 839eead

fix: flush MPS cache in alora test GPU cleanup (#790) (#800)
* fix: flush MPS cache in alora test GPU cleanup (#790)

  Add torch.mps.empty_cache() after the CUDA cleanup blocks in both alora
  integration tests, matching the existing conftest pattern. Prevents MPS
  memory from accumulating between tests on Apple Silicon.

* refactor(test): extract flush_device_caches() helper for GPU cleanup

  Consolidate the duplicated gc.collect + CUDA/MPS cache-flush pattern into
  a single flush_device_caches() function in test/conftest.py.

  - Replaces 4 inline flush sites with a single call
  - Adds MPS support to sites that previously handled only CUDA
    (pytest_runtest_setup backend transitions, memory_cleaner fixture)
  - Fixes a bug where gc.collect() was conditional on CUDA availability in
    pytest_runtest_setup (it now runs unconditionally)
  - Adds torch.mps.synchronize() for parity with torch.cuda.synchronize()
  - Enriches cleanup_gpu_backend() VRAM logging with device-aware reporting
    for both CUDA (free/total/allocated/reserved/fragmentation) and MPS
    (allocated/max), including reclaimed bytes on both paths
  - Removes unused shutil/sys imports from test_alora_train_integration
1 parent e0ffd3d commit 839eead
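
In practice the helper collapses each inline cleanup block to a single call. A minimal sketch of the call pattern (hypothetical test names, not code from this commit):

    from test.conftest import flush_device_caches

    def test_something_gpu_heavy():      # hypothetical test, for illustration
        model = build_model()            # hypothetical setup helper
        try:
            run_training_step(model)     # hypothetical workload
        finally:
            del model
            flush_device_caches()        # gc.collect x2 + CUDA/MPS cache flush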

2 files changed: 70 additions & 58 deletions

test/cli/test_alora_train_integration.py
Lines changed: 5 additions & 17 deletions
@@ -5,13 +5,13 @@

 import json
 import os
-import shutil
-import sys
 import tempfile
 from pathlib import Path

 import pytest

+from test.conftest import flush_device_caches
+
 torch = pytest.importorskip("torch", reason="torch not installed — install mellea[hf]")
 from transformers import AutoTokenizer

@@ -292,8 +292,6 @@ def test_alora_training_integration():
     )

     # Cleanup GPU memory
-    import gc
-
     # 1. Remove accelerate dispatch hooks before moving to CPU.
     # device_map="auto" installs hooks that prevent full VRAM release otherwise.
     try:
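
The try block above leads into the hook-removal call itself, which sits in unchanged context outside this hunk. A typical pattern for stripping accelerate dispatch hooks (an assumption about the surrounding code, not lines from this diff):

    from accelerate.hooks import remove_hook_from_module

    # Recursively detach the hooks installed by device_map="auto" so the
    # model can be moved to CPU and its VRAM actually released.
    remove_hook_from_module(base_model, recurse=True)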
@@ -310,12 +308,8 @@ def test_alora_training_integration():
     base_model.cpu()
     del base_model

-    # 4. Force GC and flush CUDA cache synchronously.
-    gc.collect()
-    gc.collect()
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-        torch.cuda.synchronize()
+    # 4. Flush device caches.
+    flush_device_caches()


 def test_lora_training_integration():
@@ -391,10 +385,4 @@ def test_lora_training_integration():
     )

     # Cleanup GPU memory after training
-    import gc
-
-    gc.collect()
-    gc.collect()
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-        torch.cuda.synchronize()
+    flush_device_caches()

test/conftest.py
Lines changed: 65 additions & 41 deletions
@@ -260,6 +260,33 @@ def pytest_configure(config):
 # ============================================================================


+# ============================================================================
+# Device Cache Flush Helper
+# ============================================================================
+
+
+def flush_device_caches() -> None:
+    """Force garbage collection and flush GPU device caches (CUDA and MPS).
+
+    Safe to call unconditionally — skips gracefully when torch is absent
+    or no accelerator is available.
+    """
+    gc.collect()
+    gc.collect()
+
+    try:
+        import torch
+
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()
+        if torch.backends.mps.is_available():
+            torch.mps.synchronize()
+            torch.mps.empty_cache()
+    except ImportError:
+        pass
+
+
 # ============================================================================
 # vLLM Backend Cleanup Helper
 # ============================================================================
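
The helper assumes a torch build recent enough to expose torch.mps.synchronize() and torch.mps.empty_cache(). On older releases those attributes may be missing; a defensive variant (a sketch, not what this commit ships) would feature-test them:

    if getattr(torch, "mps", None) is not None and torch.backends.mps.is_available():
        if hasattr(torch.mps, "synchronize"):
            torch.mps.synchronize()  # wait for queued MPS work before flushing
        if hasattr(torch.mps, "empty_cache"):
            torch.mps.empty_cache()  # release cached MPS allocations to the OS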
@@ -275,22 +302,34 @@ def cleanup_gpu_backend(backend, backend_name="unknown"):
         backend: The backend instance to clean up.
         backend_name: Name for logging.
     """
-    import gc
-
     logger = FancyLogger.get_logger()
     logger.info(f"Cleaning up {backend_name} backend GPU memory...")

     try:
         import torch

+        # Snapshot memory before cleanup for reporting
+        free_before = 0
+        allocated_before = 0
         if torch.cuda.is_available():
-            free_before, total = torch.cuda.mem_get_info()
+            free_before, total_mem = torch.cuda.mem_get_info()
+            reserved = torch.cuda.memory_reserved()
+            allocated = torch.cuda.memory_allocated()
             logger.info(
-                f" GPU before cleanup: {free_before / 1024**3:.1f}GB free "
-                f"/ {total / 1024**3:.1f}GB total"
+                f" CUDA before cleanup: {free_before / 1024**3:.1f}GB free "
+                f"/ {total_mem / 1024**3:.1f}GB total "
+                f"(allocated {allocated / 1024**2:.0f}MB, "
+                f"reserved {reserved / 1024**2:.0f}MB, "
+                f"fragmentation {(reserved - allocated) / 1024**2:.0f}MB)"
+            )
+        elif torch.backends.mps.is_available():
+            allocated_before = torch.mps.current_allocated_memory()
+            max_mem = torch.mps.recommended_max_memory()
+            logger.info(
+                f" MPS before cleanup: "
+                f"allocated {allocated_before / 1024**2:.0f}MB "
+                f"/ {max_mem / 1024**3:.1f}GB max"
             )
-        else:
-            free_before = 0

         # 1. Clear the LRU cache (holds DynamicCache KV tensors on GPU)
         if hasattr(backend, "_cache") and hasattr(backend._cache, "cache"):
@@ -357,21 +396,27 @@ def cleanup_gpu_backend(backend, backend_name="unknown"):
             del backend._tokenizer

         # 7. Force garbage collection and flush device caches
-        gc.collect()
-        gc.collect()
+        flush_device_caches()

+        # Report memory after cleanup
         if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-            torch.cuda.synchronize()
-
-            free_after, total = torch.cuda.mem_get_info()
+            free_after, total_mem = torch.cuda.mem_get_info()
+            reserved = torch.cuda.memory_reserved()
+            allocated = torch.cuda.memory_allocated()
             logger.info(
-                f" GPU after cleanup: {free_after / 1024**3:.1f}GB free "
-                f"/ {total / 1024**3:.1f}GB total "
-                f"(reclaimed {(free_after - free_before) / 1024**3:.1f}GB)"
+                f" CUDA after cleanup: {free_after / 1024**3:.1f}GB free "
+                f"/ {total_mem / 1024**3:.1f}GB total "
+                f"(allocated {allocated / 1024**2:.0f}MB, "
+                f"reserved {reserved / 1024**2:.0f}MB, "
+                f"reclaimed {(free_after - free_before) / 1024**3:.1f}GB)"
+            )
+        elif torch.backends.mps.is_available():
+            allocated_after = torch.mps.current_allocated_memory()
+            logger.info(
+                f" MPS after cleanup: "
+                f"allocated {allocated_after / 1024**2:.0f}MB "
+                f"(reclaimed {(allocated_before - allocated_after) / 1024**2:.0f}MB)"
             )
-        if torch.backends.mps.is_available():
-            torch.mps.empty_cache()

     except ImportError:
         pass
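
The "fragmentation" figure in the new log lines is simply reserved minus allocated: memory the CUDA caching allocator holds but no live tensor is using, which empty_cache() can hand back to the driver. With illustrative numbers (not measured output):

    reserved = 6.0 * 1024**3    # 6.0 GB held by the caching allocator
    allocated = 4.5 * 1024**3   # 4.5 GB in live tensors
    fragmentation_mb = (reserved - allocated) / 1024**2
    # -> 1536 MB reclaimable via torch.cuda.empty_cache()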
@@ -478,17 +523,7 @@ def pytest_runtest_setup(item):
             "Running GPU cleanup."
         )

-        # General GPU flush for any transition
-        try:
-            import torch
-
-            if torch.cuda.is_available():
-                gc.collect()
-                gc.collect()
-                torch.cuda.empty_cache()
-                torch.cuda.synchronize()
-        except ImportError:
-            pass
+        flush_device_caches()

         # Warm up Ollama models when entering Ollama group
         if current_group == "ollama" and prev_group != "ollama":
@@ -566,18 +601,7 @@ def pytest_runtest_teardown(item, nextitem):
 def memory_cleaner():
     """Lightweight memory cleanup — safety net for per-test GPU leaks."""
     yield
-
-    gc.collect()
-    gc.collect()
-
-    try:
-        import torch
-
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-            torch.cuda.synchronize()
-    except ImportError:
-        pass
+    flush_device_caches()


 def evict_ollama_models() -> None:
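
The hunk shows only the fixture body; its decorator sits in unchanged context above. Based on the "safety net" docstring, the usual wiring looks like this (an assumption, not lines from this diff):

    import pytest

    @pytest.fixture(autouse=True)
    def memory_cleaner():
        yield                    # run the test first
        flush_device_caches()    # then sweep GPU caches regardless of outcome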
