
Commit d06d0bc

unamedkr and claude committed
phase 3 day 5: Phi-3.5 Q8 + lookup improvements → Acme 7/7
phase 3 day 5: Phi-3.5 Q8 + lookup improvements → Acme 7/7

Karpathy loop on Acme benchmark with Phi-3.5-mini (Q8_0, unified server):

Baseline: 5/7
Loop 1: lookup prompt "DIRECTLY answers" → 6/7 (+Q1, +Q7)
Loop 2: 3-sentence window (was 2) → 7/7 (+Q6)

Changes:
- _llm.py: switch to Q8_0 model (3x faster than Q4_K_M)
- lookup.py: stronger select-by-index prompt + 3-sentence context window

D3 gate: PASS (7/7, 157s, ~22s/question on M3)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent e9b4df5 commit d06d0bc

2 files changed: 14 additions & 10 deletions


bench/rlv/stages/_llm.py

Lines changed: 2 additions & 2 deletions
@@ -28,7 +28,7 @@
 # that produced garbage for Phi-3.5/SmolLM2. The unified server compiles
 # quant.h as a single translation unit — no sync issues.
 # Phi-3.5: ~1.15 tok/s (CPU NEON), ~6.5 tok/s reported in PR #79.
-DEFAULT_MODEL = REPO / "models" / "Phi-3.5-mini-instruct-Q4_K_M.gguf"
+DEFAULT_MODEL = REPO / "models" / "Phi-3.5-mini-instruct-Q8_0.gguf"
 DEFAULT_SERVER_BINARY = REPO / "build_metal" / "quant-server-unified"
 DEFAULT_SERVER_HOST = "127.0.0.1"
 DEFAULT_SERVER_PORT = 8421  # arbitrary, avoid conflicts with 8080
@@ -41,7 +41,7 @@
 CLIFF_BUDGET = {
     "models/Llama-3.2-3B-Instruct-Q8_0.gguf": 1024,
     "models/Llama-3.2-1B-Instruct-Q8_0.gguf": 512,
-    "models/Phi-3.5-mini-instruct-Q4_K_M.gguf": 1024,
+    "models/Phi-3.5-mini-instruct-Q8_0.gguf": 1024,
 }
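A per-model budget table like `CLIFF_BUDGET` above is presumably keyed by repo-relative path. A minimal sketch of how it might be consulted; the `budget_for` helper and its `default` fallback are assumptions, not code from this repo:

```python
from pathlib import Path

CLIFF_BUDGET = {
    "models/Llama-3.2-3B-Instruct-Q8_0.gguf": 1024,
    "models/Llama-3.2-1B-Instruct-Q8_0.gguf": 512,
    "models/Phi-3.5-mini-instruct-Q8_0.gguf": 1024,
}

def budget_for(model_path: Path, repo: Path, default: int = 512) -> int:
    """Look up the token budget for a model, falling back to a safe default."""
    # Key by repo-relative POSIX path so it matches the string keys above
    # regardless of the host OS path separator.
    rel = model_path.relative_to(repo).as_posix()
    return CLIFF_BUDGET.get(rel, default)
```

Defaulting low (512) rather than raising keeps an unrecognized GGUF runnable while staying under the cliff the table is guarding against.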

bench/rlv/stages/lookup.py

Lines changed: 12 additions & 8 deletions
@@ -28,13 +28,13 @@
 
 # Day 3 v3: numbered-sentence selection prompt. The model picks an
 # integer; we map it back to a verbatim sentence.
-LOOKUP_PROMPT_TEMPLATE = """Sentences from the document:
+LOOKUP_PROMPT_TEMPLATE = """Read these sentences carefully:
 
 {numbered_sentences}
 
 Question: {question}
 
-Which sentence number contains the answer? Reply with only one digit: the sentence number."""
+Which sentence number DIRECTLY answers the question? Pick the sentence that contains the specific fact being asked about. Reply with ONLY the number."""
 
 # Fallback "quote" prompt for chunks with very few sentences (≤1) where
 # selection is trivial and we can ask the model directly.
@@ -159,12 +159,16 @@ def lookup(
         # in Mercury Fur. He was directed by John Tiffany." — picking either
         # sentence alone loses the connection). For Acme-style structured
         # docs, the previous sentence is benign extra context.
-        selected = sentences[idx - 1]
-        if idx >= 2:
-            prev = sentences[idx - 2]
-            answer = f"{prev} {selected}"
-        else:
-            answer = selected
+        # Return a 3-sentence window centered on the selected sentence.
+        # Multi-hop questions often require context from adjacent sentences
+        # (e.g., "strategy proposed at what event?" spans sentences about
+        # the strategy AND the event name in the next sentence).
+        window = []
+        for offset in range(-1, 2):  # prev, selected, next
+            i = idx - 1 + offset
+            if 0 <= i < len(sentences):
+                window.append(sentences[i])
+        answer = " ".join(window)
         if verbose:
             print(f"[lookup] selected sentence {idx}/{len(sentences)}: {selected[:80]!r}")
         return LookupResult(
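The 3-sentence window logic from the hunk above, restated as a standalone helper. `sentence_window` and its `radius` parameter are illustrative names, not identifiers from the repo:

```python
def sentence_window(sentences: list[str], idx: int, radius: int = 1) -> str:
    """Join the sentences within `radius` of the 1-based selected index.

    Clamps at document boundaries, so the window simply shrinks at the
    edges instead of raising IndexError or wrapping around.
    """
    center = idx - 1  # convert the model's 1-based pick to a 0-based index
    return " ".join(
        sentences[i]
        for i in range(center - radius, center + radius + 1)
        if 0 <= i < len(sentences)
    )
```

With `radius=1` this reproduces the prev/selected/next behavior of the diff; the old 2-sentence code was the asymmetric special case that only looked backward, which is what Loop 2 in the commit message replaced.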
