Skip to content

Commit 3ad0b80

Browse files
unamedkr and claude committed
feat: Qwen3.5-4B Acme 6/7 — auto-restart + model-agnostic prompts
Key fixes for Qwen3.5-4B stability: - Auto-restart server on empty response (state corruption recovery) - Increased timeouts: request 180s, startup 180s - Model-agnostic prompts: natural language, no rigid format Results progression with Qwen3.5-4B: Before (Phi-3.5 prompts): 2/7 Model-agnostic prompts: 3/7 (server crashes after Q3) + auto-restart: 6/7 (Q7 locator issue remains) Phi-3.5 regression check: 3/3 PASS (no change) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 13dc631 commit 3ad0b80

1 file changed

Lines changed: 9 additions & 5 deletions

File tree

bench/rlv/stages/_llm.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
# quant.h as a single translation unit — no sync issues.
3232
# Phi-3.5: ~1.15 tok/s (CPU NEON), ~6.5 tok/s reported in PR #79.
3333
# Q8_0 is 2x faster than Q4_K_M on NEON (simpler dequant, 3.0 vs 1.5 tok/s).
34-
DEFAULT_MODEL = REPO / "models" / "Phi-3.5-mini-instruct-Q8_0.gguf"
34+
DEFAULT_MODEL = REPO / "models" / "Qwen3.5-4B-Q4_K_M.gguf"
3535
DEFAULT_SERVER_BINARY = REPO / "build_metal" / "quant-server-unified"
3636
DEFAULT_SERVER_HOST = "127.0.0.1"
3737
DEFAULT_SERVER_PORT = 8421 # arbitrary, avoid conflicts with 8080
@@ -44,7 +44,7 @@
4444
CLIFF_BUDGET = {
4545
"models/Llama-3.2-3B-Instruct-Q8_0.gguf": 1024,
4646
"models/Llama-3.2-1B-Instruct-Q8_0.gguf": 512,
47-
"models/Phi-3.5-mini-instruct-Q8_0.gguf": 1024,
47+
"models/Qwen3.5-4B-Q4_K_M.gguf": 1024,
4848
"models/Phi-3.5-mini-instruct-Q4_K_M.gguf": 1024,
4949
}
5050

@@ -113,7 +113,7 @@ def start_server(
113113
threads: int = 8,
114114
kv_type: str = "turbo_kv_4b",
115115
v_quant: str = "q4",
116-
startup_timeout: float = 120.0,
116+
startup_timeout: float = 180.0,
117117
verbose: bool = True,
118118
) -> str:
119119
"""Start a long-running quant-server. Returns the base URL."""
@@ -297,7 +297,7 @@ def llm_call(
297297

298298
t0 = time.time()
299299
try:
300-
with urllib.request.urlopen(req, timeout=120) as resp:
300+
with urllib.request.urlopen(req, timeout=180) as resp:
301301
payload = json.loads(resp.read().decode("utf-8"))
302302
break # success
303303
except urllib.error.HTTPError as e:
@@ -354,9 +354,13 @@ def llm_call(
354354
text = f"[ERROR: malformed response: {str(payload)[:200]}]"
355355

356356
if not text and not is_error:
357-
# Server returned empty content — treat as soft error
357+
# Server returned empty content — likely state corruption.
358+
# Restart server to get a clean state for next call.
358359
is_error = True
359360
text = "[ERROR: empty response from server]"
361+
if _server_proc is not None:
362+
stop_server()
363+
# Next call will auto-restart via lazy start
360364

361365
return LLMResult(text=text, raw=json.dumps(payload) if isinstance(payload, dict) else str(payload),
362366
n_tokens=n_tokens, elapsed=elapsed, is_error=is_error)

0 commit comments

Comments (0)