Skip to content

Commit 3ad0b80

Browse files
unamedkr and claude committed
feat: Qwen3.5-4B Acme 6/7 — auto-restart + model-agnostic prompts
Key fixes for Qwen3.5-4B stability: - Auto-restart server on empty response (state corruption recovery) - Increased timeouts: request 180s, startup 180s - Model-agnostic prompts: natural language, no rigid format Results progression with Qwen3.5-4B: Before (Phi-3.5 prompts): 2/7 Model-agnostic prompts: 3/7 (server crashes after Q3) + auto-restart: 6/7 (Q7 locator issue remains) Phi-3.5 regression check: 3/3 PASS (no change) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 13dc631 commit 3ad0b80

1 file changed

Lines changed: 9 additions & 5 deletions

File tree

bench/rlv/stages/_llm.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
# quant.h as a single translation unit — no sync issues.
3232
# Phi-3.5: ~1.15 tok/s (CPU NEON), ~6.5 tok/s reported in PR #79.
3333
# Q8_0 is 2x faster than Q4_K_M on NEON (simpler dequant, 3.0 vs 1.5 tok/s).
34-
DEFAULT_MODEL = REPO / "models" / "Phi-3.5-mini-instruct-Q8_0.gguf"
34+
DEFAULT_MODEL = REPO / "models" / "Qwen3.5-4B-Q4_K_M.gguf"
3535
DEFAULT_SERVER_BINARY = REPO / "build_metal" / "quant-server-unified"
3636
DEFAULT_SERVER_HOST = "127.0.0.1"
3737
DEFAULT_SERVER_PORT = 8421 # arbitrary, avoid conflicts with 8080
@@ -44,7 +44,7 @@
4444
CLIFF_BUDGET = {
4545
"models/Llama-3.2-3B-Instruct-Q8_0.gguf": 1024,
4646
"models/Llama-3.2-1B-Instruct-Q8_0.gguf": 512,
47-
"models/Phi-3.5-mini-instruct-Q8_0.gguf": 1024,
47+
"models/Qwen3.5-4B-Q4_K_M.gguf": 1024,
4848
"models/Phi-3.5-mini-instruct-Q4_K_M.gguf": 1024,
4949
}
5050

@@ -113,7 +113,7 @@ def start_server(
113113
threads: int = 8,
114114
kv_type: str = "turbo_kv_4b",
115115
v_quant: str = "q4",
116-
startup_timeout: float = 120.0,
116+
startup_timeout: float = 180.0,
117117
verbose: bool = True,
118118
) -> str:
119119
"""Start a long-running quant-server. Returns the base URL."""
@@ -297,7 +297,7 @@ def llm_call(
297297

298298
t0 = time.time()
299299
try:
300-
with urllib.request.urlopen(req, timeout=120) as resp:
300+
with urllib.request.urlopen(req, timeout=180) as resp:
301301
payload = json.loads(resp.read().decode("utf-8"))
302302
break # success
303303
except urllib.error.HTTPError as e:
@@ -354,9 +354,13 @@ def llm_call(
354354
text = f"[ERROR: malformed response: {str(payload)[:200]}]"
355355

356356
if not text and not is_error:
357-
# Server returned empty content — treat as soft error
357+
# Server returned empty content — likely state corruption.
358+
# Restart server to get a clean state for next call.
358359
is_error = True
359360
text = "[ERROR: empty response from server]"
361+
if _server_proc is not None:
362+
stop_server()
363+
# Next call will auto-restart via lazy start
360364

361365
return LLMResult(text=text, raw=json.dumps(payload) if isinstance(payload, dict) else str(payload),
362366
n_tokens=n_tokens, elapsed=elapsed, is_error=is_error)

0 commit comments

Comments (0)