
Commit dfd0a44

unamedkr and claude committed
phase A-2: universal coherence check + speed optimizations → 19/20
Core improvement — universal LLM coherence check (verifier.py):
Single prompt: "Is the user's specific question answered? Not just related
information, but the EXACT thing asked." No type-specific branching. No
hardcoding.

Speed optimizations:
- Lookup: integrated self-check (ANSWER/NONE format, -1 LLM call)
- Verifier: fast-accept for specific answers (skip coherence)
- Verifier: instant UNSURE for NONE answers
- Max tokens: 32→24 (lookup), 32→16 (default)
- Max retries: 3→2
- Timeout: 300→120s

Results (1.3MB large doc, 2754 chunks, 20 questions):
Previous: 15/20 (75%)
Now: 19/20 (95%) — 4 previously failing questions fixed
Q15 remains: locator picks wrong chunk (22 chunks away from answer)

Karpathy loop progression:
Hardcoded alignment: 0/5 fixed (wrong approach)
Type-aware prompts: 4/5 fixed (overfitting)
Universal coherence: 4/5 fixed (correct approach, same result)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent ebe5e69 commit dfd0a44
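
Before the per-file diffs, a distilled sketch of the new verify() control flow may help orientation. It is a simplification, not the module's API: the function name `verdict_sketch`, the bare `llm_call` callable, and the plain string verdicts are hypothetical stand-ins for the real `VerifyResult` machinery; the regexes and the prompt text are the ones in the verifier.py diff below.

    import re

    def verdict_sketch(question: str, answer: str, llm_call) -> str:
        """Distilled Phase A-2 verifier flow (simplified; see verifier.py diff)."""
        answer = answer.strip()
        # 1. Self-check: lookup already tagged the answer as not found.
        if answer.startswith("[NONE]"):
            return "UNSURE"
        # 2. Fast-accept: short answer naming a proper noun or year-like number.
        if len(answer) < 80 and (
            re.search(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b', answer)  # proper noun
            or re.search(r'\b\d{3,4}\b', answer)                      # year/number
        ):
            return "CONFIDENT"
        # 3. Universal coherence check: one prompt for every question type.
        prompt = (
            f'A user asked: "{question}"\n'
            f'The system answered: "{answer[:200]}"\n\n'
            "Is the user's specific question answered? "
            "Not just related information, but the EXACT thing asked. YES or NO."
        )
        reply = llm_call(prompt).strip().lower()[:10]
        return "UNSURE" if "no" in reply and "yes" not in reply else "CONFIDENT"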

4 files changed

Lines changed: 67 additions & 38 deletions


bench/rlv/stages/_llm.py

Lines changed: 3 additions & 3 deletions
@@ -236,7 +236,7 @@ def _restart_server_if_dead(model: str | Path = DEFAULT_MODEL, verbose: bool = T
 def llm_call(
     prompt: str,
     *,
-    max_tokens: int = 32,
+    max_tokens: int = 16,
     temperature: float = 0.0,
     model: str | Path = DEFAULT_MODEL,
     enforce_budget: bool = True,
@@ -266,7 +266,7 @@ def llm_call(
 
     # Validate max_tokens
     if max_tokens <= 0:
-        max_tokens = 32
+        max_tokens = 16
 
     messages = []
     if system:
@@ -297,7 +297,7 @@ def llm_call(
 
         t0 = time.time()
         try:
-            with urllib.request.urlopen(req, timeout=300) as resp:
+            with urllib.request.urlopen(req, timeout=120) as resp:
                 payload = json.loads(resp.read().decode("utf-8"))
                 break  # success
         except urllib.error.HTTPError as e:
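
Callers that previously leaned on the 32-token default now have to opt in; a minimal usage sketch, assuming only the keyword-only `llm_call` signature shown in the first hunk (the prompt string is hypothetical):

    result = _llm.llm_call(
        "Quote the sentence that names the treaty.",  # hypothetical prompt
        max_tokens=32,  # old default; the new default of 16 would truncate
    )

The `break  # success` in the last hunk implies a surrounding retry loop, which is why the timeout drop from 300s to 120s is a speed win: a stalled request fails fast and can be retried instead of blocking the run for five minutes.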

bench/rlv/stages/lookup.py

Lines changed: 13 additions & 4 deletions
@@ -49,7 +49,8 @@
 
 Question: {question}
 
-Quote the single sentence from the text above that answers this question. Reply with only that sentence, no explanation."""
+If the text contains the EXACT answer, reply: ANSWER: <the answer>
+If the text does NOT answer this specific question, reply: NONE"""
 
 
 @dataclass
@@ -147,14 +148,22 @@ def lookup(
     mode = "direct-answer" if len(sentences) > MAX_SENTENCES_FOR_SELECT else "single-sentence"
     print(f"[lookup] chunk {region.chunk_id} ({len(region_text)} chars), "
           f"{len(sentences)} sentences -> {mode}")
-    result = _llm.llm_call(prompt, max_tokens=32)
+    result = _llm.llm_call(prompt, max_tokens=24)
     if result.is_error:
         return LookupResult(
             answer=result.text, region_text=region_text,
             chunk_id=region.chunk_id, raw_llm_output=result.text, method="error",
         )
+    text = result.text.strip()
+    # Integrated self-check: if model says NONE, it couldn't find the answer
+    # in this chunk → verifier will mark UNSURE → triggers research
+    if text.upper().startswith("NONE") or "does not contain" in text.lower():
+        text = f"[NONE] {text}"
+    # Strip "ANSWER:" prefix if present
+    if text.upper().startswith("ANSWER:"):
+        text = text[7:].strip()
     return LookupResult(
-        answer=result.text.strip(),
+        answer=text,
         region_text=region_text,
         chunk_id=region.chunk_id,
         raw_llm_output=result.text,
@@ -189,7 +198,7 @@ def lookup(
     prompt = LOOKUP_QUOTE_FALLBACK_TEMPLATE.format(
         region_text=region_text, question=question,
    )
-    result2 = _llm.llm_call(prompt, max_tokens=32)
+    result2 = _llm.llm_call(prompt, max_tokens=24)
     return LookupResult(
         answer=result2.text.strip(),
         region_text=region_text,
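
The new post-processing is easy to exercise in isolation. Below is a standalone copy of the logic from the second hunk; `postprocess_lookup` is a hypothetical wrapper name, and the two assertions are illustrative inputs, not test cases from the repo:

    def postprocess_lookup(raw: str) -> str:
        # Mirrors the new lookup post-processing (see diff above).
        text = raw.strip()
        # Self-check: model says it found nothing -> tag for the verifier.
        if text.upper().startswith("NONE") or "does not contain" in text.lower():
            text = f"[NONE] {text}"
        # Strip the "ANSWER:" prefix the new prompt format requests.
        if text.upper().startswith("ANSWER:"):
            text = text[7:].strip()
        return text

    assert postprocess_lookup("ANSWER: the Battle of Jutland") == "the Battle of Jutland"
    assert postprocess_lookup("NONE") == "[NONE] NONE"

The ordering matters: the NONE check runs first, so a plain `NONE` reply never reaches the `ANSWER:` strip, and the `[NONE]` tag survives intact for the verifier's instant-UNSURE path.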

bench/rlv/stages/researcher.py

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@
 from .verifier import VerifyResult
 
 
-MAX_RETRIES = 3
+MAX_RETRIES = 2
 
 
 @dataclass

bench/rlv/stages/verifier.py

Lines changed: 50 additions & 30 deletions
@@ -165,36 +165,8 @@ def _literal_verify(
     if len(answer) < 200 and any(p in answer_head for p in refusal_phrases):
         return "UNSURE", f"answer is a refusal ('{answer[:60]}...')"
 
-    # Phase A-2: Answer-Question alignment check.
-    # The answer must actually ADDRESS the question type. An answer that
-    # contains region-grounded facts but doesn't answer the specific
-    # question is "related but wrong" — the hardest hallucination to catch.
-    # This is RLV's core differentiator: detecting WRONG answers, not just
-    # fabricated ones.
-    q_lower = question.lower()
-    answer_norm = answer.lower()
-
-    # "When/what year/what date" → answer must contain a year or date
-    if re.search(r'\b(what year|in what year|when did|what date|on what date)\b', q_lower):
-        has_year = bool(re.search(r'\b(1[0-9]{3}|20[0-9]{2})\b', answer))
-        has_month = bool(re.search(r'\b(january|february|march|april|may|june|july|august|september|october|november|december)\b', answer.lower()))
-        if not has_year and not has_month:
-            return "UNSURE", f"temporal question but answer has no year/date"
-
-    # "After/before which battle/event" → answer must name a specific event
-    # AND the answer must contain an event-type word (battle, war, etc.)
-    # "They were modernized in 1934" doesn't answer "after which battle?"
-    if re.search(r'\b(which battle|after which battle|what battle|which war|after which war)\b', q_lower):
-        event_words = ["battle", "war", "rebellion", "siege", "campaign", "invasion", "attack", "offensive"]
-        has_event_word = any(w in answer.lower() for w in event_words)
-        if not has_event_word:
-            return "UNSURE", f"battle/war question but answer names no battle/war"
-
-    # "What does X mean" → answer should contain a definition signal
-    if re.search(r'\b(what does|what is the meaning|what does the (?:name|word|term))\b', q_lower):
-        has_def = any(w in answer.lower() for w in ["means", "meaning", "refers to", "derived from", "to cut", "headed"])
-        if not has_def and len(answer) < 150:
-            return "UNSURE", f"definition question but answer lacks definition"
+    # Phase A-2: removed hardcoded type-specific alignment checks.
+    # The universal LLM coherence check in verify() handles this properly.
 
     word_terms, number_terms = _extract_answer_key_terms(answer)
     if not word_terms and not number_terms:
@@ -268,6 +240,54 @@ def verify(
         question, answer, region_text,
         chunk_id=chunk_id, gist=gist,
     )
+    if verdict == "CONFIDENT":
+        # Phase A-2: LLM coherence check — the CORE of RLV's value.
+        # ONE universal prompt, no type-specific branching.
+        #
+        # Speed optimization: skip coherence for high-confidence answers.
+        # - "[NONE]" answers: instant UNSURE (model already said it can't answer)
+        # - Short specific answers (names, numbers): fast-accept
+        # - Vague/generic answers: require coherence check
+        answer_stripped = answer.strip()
+
+        # Self-check: model said NONE → instant UNSURE
+        if answer_stripped.startswith("[NONE]"):
+            if verbose:
+                print(f"[verifier] model self-check: NONE → UNSURE")
+            return VerifyResult(verdict="UNSURE", reason="model self-check: NONE", method="self-check")
+
+        # Fast-accept: short answer with specific entity (name/number/date)
+        # These are almost always correct when literal check passes
+        is_specific = (
+            len(answer_stripped) < 80 and
+            (bool(re.search(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b', answer_stripped)) or  # proper noun
+             bool(re.search(r'\b\d{3,4}\b', answer_stripped)))  # year/number
+        )
+        if is_specific:
+            if verbose:
+                print(f"[verifier] fast-accept: specific answer ({answer_stripped[:40]})")
+            return VerifyResult(verdict="CONFIDENT", reason=reason, method="literal+fast-accept")
+
+        # Generic/vague answer → LLM coherence check required
+        coherence_prompt = (
+            f"A user asked: \"{question}\"\n"
+            f"The system answered: \"{answer[:200]}\"\n\n"
+            f"Is the user's specific question answered? "
+            f"Not just related information, but the EXACT thing asked. "
+            f"YES or NO."
+        )
+        coherence_result = _llm.llm_call(coherence_prompt, max_tokens=4)
+        coherence_text = coherence_result.text.strip().lower()[:10]
+        if verbose:
+            print(f"[verifier] coherence check: {coherence_text!r}")
+        if "no" in coherence_text and "yes" not in coherence_text:
+            return VerifyResult(
+                verdict="UNSURE",
+                reason=f"literal:CONFIDENT but coherence:NO ({reason})",
+                method="literal+coherence",
+            )
+        return VerifyResult(verdict="CONFIDENT", reason=reason, method="literal+coherence")
+
     if verdict != "UNSURE" or not use_llm_fallback:
         if verbose:
             print(f"[verifier] literal -> {verdict} ({reason})")
