Commit 243a161

Adding some extra VRAM cleanup to make end-to-end tests smoother (#765)
Parent: a591d5c

3 files changed: 84 additions & 10 deletions


test/cli/test_alora_train_integration.py

Lines changed: 27 additions & 2 deletions
```diff
@@ -288,14 +288,30 @@ def test_alora_training_integration():
     )
 
     # Cleanup GPU memory
-    base_model.cpu()
+    import gc
+
+    # 1. Remove accelerate dispatch hooks before moving to CPU.
+    # device_map="auto" installs hooks that prevent full VRAM release otherwise.
+    try:
+        from accelerate.hooks import remove_hook_from_module
+
+        remove_hook_from_module(base_model, recurse=True)
+    except (ImportError, Exception):
+        pass
+
+    # 2. Delete the PeftModel wrapper first — it holds internal refs to base_model.
     del model_with_adapter
+
+    # 3. Now move base_model to CPU and delete it.
+    base_model.cpu()
     del base_model
-    import gc
 
+    # 4. Force GC and flush CUDA cache synchronously.
+    gc.collect()
     gc.collect()
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
+        torch.cuda.synchronize()
 
 
 def test_lora_training_integration():
@@ -369,3 +385,12 @@ def test_lora_training_integration():
     print(
         f"✅ Config format verified: {config.get('peft_type')} without alora_invocation_tokens"
     )
+
+    # Cleanup GPU memory after training
+    import gc
+
+    gc.collect()
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
```
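The same teardown sequence could be shared across GPU tests instead of repeated inline; below is a minimal sketch of that idea as a pytest fixture. The fixture name `gpu_models` and the registry pattern are illustrative assumptions, not part of this commit.

```python
import gc

import pytest
import torch


@pytest.fixture
def gpu_models():
    """Yield a registry list; release VRAM for everything in it on teardown."""
    registry = []
    yield registry  # tests append models here, wrappers before base models

    # Strip accelerate dispatch hooks (installed by device_map="auto");
    # they otherwise keep submodules pinned to their assigned GPUs.
    try:
        from accelerate.hooks import remove_hook_from_module

        for m in registry:
            remove_hook_from_module(m, recurse=True)
    except ImportError:
        pass

    # Move weights off the GPU, then drop the registry's references.
    for m in registry:
        m.cpu()
    registry.clear()

    # Two GC passes catch reference cycles; then flush and sync CUDA.
    gc.collect()
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
```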

test/conftest.py

Lines changed: 41 additions & 0 deletions
```diff
@@ -767,6 +767,47 @@ def pytest_runtest_setup(item):
     except ImportError:
         pass
 
+    # Warm up Ollama models when entering Ollama group
+    if current_group == "ollama" and prev_group != "ollama":
+        logger = FancyLogger.get_logger()
+        host_str = os.environ.get("OLLAMA_HOST", "127.0.0.1")
+        port = os.environ.get("OLLAMA_PORT", "11434")
+        logger.info(
+            "Warming up ollama models before ollama group (keep_alive=-1)..."
+        )
+        for model in ["granite4:micro", "granite4:micro-h", "granite3.2-vision"]:
+            try:
+                requests.post(
+                    f"http://{host_str}:{port}/api/generate",
+                    json={
+                        "model": model,
+                        "prompt": "hi",
+                        "stream": False,
+                        "keep_alive": -1,
+                    },
+                    timeout=120,
+                )
+                logger.info("  Warmed up and pinned: %s", model)
+            except Exception as e:
+                logger.warning("  Warmup failed for %s: %s", model, e)
+
+    # Evict Ollama models when leaving Ollama group
+    if prev_group == "ollama" and current_group != "ollama":
+        logger = FancyLogger.get_logger()
+        host_str = os.environ.get("OLLAMA_HOST", "127.0.0.1")
+        port = os.environ.get("OLLAMA_PORT", "11434")
+        logger.info("Evicting ollama models from VRAM after ollama group...")
+        for model in ["granite4:micro", "granite4:micro-h", "granite3.2-vision"]:
+            try:
+                requests.post(
+                    f"http://{host_str}:{port}/api/generate",
+                    json={"model": model, "keep_alive": 0},
+                    timeout=10,
+                )
+                logger.info("  Evicted: %s", model)
+            except Exception as e:
+                logger.warning("  Eviction failed for %s: %s", model, e)
+
     pytest_runtest_setup._last_backend_group = current_group
 
     # Check for override flags from CLI
```
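Both branches rely on Ollama's `keep_alive` field: `-1` keeps the model loaded indefinitely, while `0` unloads it as soon as the request completes, and a `/api/generate` call with no prompt simply loads or unloads the model without generating. A standalone sketch of the two calls (host and port are placeholders; the model name is taken from the test list above):

```python
import requests

OLLAMA_URL = "http://127.0.0.1:11434/api/generate"  # placeholder host/port

# Pin: keep_alive=-1 tells Ollama to keep the model loaded indefinitely.
requests.post(
    OLLAMA_URL,
    json={
        "model": "granite4:micro",
        "prompt": "hi",
        "stream": False,
        "keep_alive": -1,
    },
    timeout=120,
)

# Evict: keep_alive=0 unloads the model right after this request; no prompt
# is needed, since a prompt-less generate call only (un)loads the model.
requests.post(
    OLLAMA_URL,
    json={"model": "granite4:micro", "keep_alive": 0},
    timeout=10,
)
```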

test/scripts/run_tests_with_ollama.sh

Lines changed: 16 additions & 8 deletions
```diff
@@ -46,6 +46,10 @@ fi
 mkdir -p "$LOGDIR"
 
 cleanup() {
+  if [[ "${OLLAMA_EXTERNAL:-0}" == "1" ]]; then
+    log "Ollama managed externally (OLLAMA_EXTERNAL=1) — skipping shutdown"
+    return
+  fi
   log "Shutting down ollama server..."
   if [[ -n "${OLLAMA_PID:-}" ]] && kill -0 "$OLLAMA_PID" 2>/dev/null; then
     kill "$OLLAMA_PID" 2>/dev/null
@@ -138,14 +142,18 @@ done
 log "All models ready."
 
 # --- Warm up models (first load into memory is slow) ---
-log "Warming up models..."
-for model in "${OLLAMA_MODELS[@]}"; do
-  log "  Warming $model ..."
-  curl -sf "http://127.0.0.1:${OLLAMA_PORT}/api/generate" \
-    -d "{\"model\": \"$model\", \"prompt\": \"hi\", \"stream\": false}" \
-    -o /dev/null --max-time 120 || log "  Warning: warmup for $model timed out (will load on first test)"
-done
-log "Warmup complete."
+if [[ "${OLLAMA_SKIP_WARMUP:-0}" == "1" ]]; then
+  log "Skipping model warmup (OLLAMA_SKIP_WARMUP=1)"
+else
+  log "Warming up models..."
+  for model in "${OLLAMA_MODELS[@]}"; do
+    log "  Warming $model ..."
+    curl -sf "http://127.0.0.1:${OLLAMA_PORT}/api/generate" \
+      -d "{\"model\": \"$model\", \"prompt\": \"hi\", \"stream\": false}" \
+      -o /dev/null --max-time 120 || log "  Warning: warmup for $model timed out (will load on first test)"
+  done
+  log "Warmup complete."
+fi
 
 # --- Run tests ---
 log "Starting pytest..."
```
