
Commit dfd0a44

unamedkr and claude committed
phase A-2: universal coherence check + speed optimizations → 19/20
Core improvement — universal LLM coherence check (verifier.py):
Single prompt: "Is the user's specific question answered? Not just related
information, but the EXACT thing asked." No type-specific branching. No
hardcoding.

Speed optimizations:
- Lookup: integrated self-check (ANSWER/NONE format, -1 LLM call)
- Verifier: fast-accept for specific answers (skip coherence)
- Verifier: instant UNSURE for NONE answers
- Max tokens: 32→24 (lookup), 32→16 (default)
- Max retries: 3→2
- Timeout: 300→120s

Results (1.3MB large doc, 2754 chunks, 20 questions):
Previous: 15/20 (75%)
Now: 19/20 (95%) — 4 previously failing questions fixed
Q15 remains: locator picks wrong chunk (22 chunks away from answer)

Karpathy loop progression:
Hardcoded alignment: 0/5 fixed (wrong approach)
Type-aware prompts: 4/5 fixed (overfitting)
Universal coherence: 4/5 fixed (correct approach, same result)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent ebe5e69 commit dfd0a44
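
Before the per-file diffs, a distilled sketch of the new verify() control flow may help orientation. It is a simplification, not the module's API: the function name `verdict_sketch`, the bare `llm_call` callable, and the plain string verdicts are hypothetical stand-ins for the real `VerifyResult` machinery; the regexes and the prompt text are the ones in the verifier.py diff below.

    import re

    def verdict_sketch(question: str, answer: str, llm_call) -> str:
        """Distilled Phase A-2 verifier flow (simplified; see verifier.py diff)."""
        answer = answer.strip()
        # 1. Self-check: lookup already tagged the answer as not found.
        if answer.startswith("[NONE]"):
            return "UNSURE"
        # 2. Fast-accept: short answer naming a proper noun or year-like number.
        if len(answer) < 80 and (
            re.search(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b', answer)  # proper noun
            or re.search(r'\b\d{3,4}\b', answer)                      # year/number
        ):
            return "CONFIDENT"
        # 3. Universal coherence check: one prompt for every question type.
        prompt = (
            f'A user asked: "{question}"\n'
            f'The system answered: "{answer[:200]}"\n\n'
            "Is the user's specific question answered? "
            "Not just related information, but the EXACT thing asked. YES or NO."
        )
        reply = llm_call(prompt).strip().lower()[:10]
        return "UNSURE" if "no" in reply and "yes" not in reply else "CONFIDENT"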

4 files changed

Lines changed: 67 additions & 38 deletions


bench/rlv/stages/_llm.py

Lines changed: 3 additions & 3 deletions
@@ -236,7 +236,7 @@ def _restart_server_if_dead(model: str | Path = DEFAULT_MODEL, verbose: bool = T
 def llm_call(
     prompt: str,
     *,
-    max_tokens: int = 32,
+    max_tokens: int = 16,
     temperature: float = 0.0,
     model: str | Path = DEFAULT_MODEL,
     enforce_budget: bool = True,
@@ -266,7 +266,7 @@ def llm_call(
 
     # Validate max_tokens
     if max_tokens <= 0:
-        max_tokens = 32
+        max_tokens = 16
 
     messages = []
     if system:
@@ -297,7 +297,7 @@ def llm_call(
 
         t0 = time.time()
         try:
-            with urllib.request.urlopen(req, timeout=300) as resp:
+            with urllib.request.urlopen(req, timeout=120) as resp:
                 payload = json.loads(resp.read().decode("utf-8"))
                 break  # success
         except urllib.error.HTTPError as e:
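
Callers that previously leaned on the 32-token default now have to opt in; a minimal usage sketch, assuming only the keyword-only `llm_call` signature shown in the first hunk (the prompt string is hypothetical):

    result = _llm.llm_call(
        "Quote the sentence that names the treaty.",  # hypothetical prompt
        max_tokens=32,  # old default; the new default of 16 would truncate
    )

The `break  # success` in the last hunk implies a surrounding retry loop, which is why the timeout drop from 300s to 120s is a speed win: a stalled request fails fast and can be retried instead of blocking the run for five minutes.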

bench/rlv/stages/lookup.py

Lines changed: 13 additions & 4 deletions
@@ -49,7 +49,8 @@
 
 Question: {question}
 
-Quote the single sentence from the text above that answers this question. Reply with only that sentence, no explanation."""
+If the text contains the EXACT answer, reply: ANSWER: <the answer>
+If the text does NOT answer this specific question, reply: NONE"""
 
 
 @dataclass
@@ -147,14 +148,22 @@ def lookup(
     mode = "direct-answer" if len(sentences) > MAX_SENTENCES_FOR_SELECT else "single-sentence"
     print(f"[lookup] chunk {region.chunk_id} ({len(region_text)} chars), "
           f"{len(sentences)} sentences -> {mode}")
-    result = _llm.llm_call(prompt, max_tokens=32)
+    result = _llm.llm_call(prompt, max_tokens=24)
     if result.is_error:
         return LookupResult(
             answer=result.text, region_text=region_text,
             chunk_id=region.chunk_id, raw_llm_output=result.text, method="error",
         )
+    text = result.text.strip()
+    # Integrated self-check: if model says NONE, it couldn't find the answer
+    # in this chunk → verifier will mark UNSURE → triggers research
+    if text.upper().startswith("NONE") or "does not contain" in text.lower():
+        text = f"[NONE] {text}"
+    # Strip "ANSWER:" prefix if present
+    if text.upper().startswith("ANSWER:"):
+        text = text[7:].strip()
     return LookupResult(
-        answer=result.text.strip(),
+        answer=text,
         region_text=region_text,
         chunk_id=region.chunk_id,
         raw_llm_output=result.text,
@@ -189,7 +198,7 @@ def lookup(
     prompt = LOOKUP_QUOTE_FALLBACK_TEMPLATE.format(
         region_text=region_text, question=question,
    )
-    result2 = _llm.llm_call(prompt, max_tokens=32)
+    result2 = _llm.llm_call(prompt, max_tokens=24)
     return LookupResult(
         answer=result2.text.strip(),
         region_text=region_text,
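
The new post-processing is easy to exercise in isolation. Below is a standalone copy of the logic from the second hunk; `postprocess_lookup` is a hypothetical wrapper name, and the two assertions are illustrative inputs, not test cases from the repo:

    def postprocess_lookup(raw: str) -> str:
        # Mirrors the new lookup post-processing (see diff above).
        text = raw.strip()
        # Self-check: model says it found nothing -> tag for the verifier.
        if text.upper().startswith("NONE") or "does not contain" in text.lower():
            text = f"[NONE] {text}"
        # Strip the "ANSWER:" prefix the new prompt format requests.
        if text.upper().startswith("ANSWER:"):
            text = text[7:].strip()
        return text

    assert postprocess_lookup("ANSWER: the Battle of Jutland") == "the Battle of Jutland"
    assert postprocess_lookup("NONE") == "[NONE] NONE"

The ordering matters: the NONE check runs first, so a plain `NONE` reply never reaches the `ANSWER:` strip, and the `[NONE]` tag survives intact for the verifier's instant-UNSURE path.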

bench/rlv/stages/researcher.py

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@
 from .verifier import VerifyResult
 
 
-MAX_RETRIES = 3
+MAX_RETRIES = 2
 
 
 @dataclass

bench/rlv/stages/verifier.py

Lines changed: 50 additions & 30 deletions
@@ -165,36 +165,8 @@ def _literal_verify(
     if len(answer) < 200 and any(p in answer_head for p in refusal_phrases):
         return "UNSURE", f"answer is a refusal ('{answer[:60]}...')"
 
-    # Phase A-2: Answer-Question alignment check.
-    # The answer must actually ADDRESS the question type. An answer that
-    # contains region-grounded facts but doesn't answer the specific
-    # question is "related but wrong" — the hardest hallucination to catch.
-    # This is RLV's core differentiator: detecting WRONG answers, not just
-    # fabricated ones.
-    q_lower = question.lower()
-    answer_norm = answer.lower()
-
-    # "When/what year/what date" → answer must contain a year or date
-    if re.search(r'\b(what year|in what year|when did|what date|on what date)\b', q_lower):
-        has_year = bool(re.search(r'\b(1[0-9]{3}|20[0-9]{2})\b', answer))
-        has_month = bool(re.search(r'\b(january|february|march|april|may|june|july|august|september|october|november|december)\b', answer.lower()))
-        if not has_year and not has_month:
-            return "UNSURE", f"temporal question but answer has no year/date"
-
-    # "After/before which battle/event" → answer must name a specific event
-    # AND the answer must contain an event-type word (battle, war, etc.)
-    # "They were modernized in 1934" doesn't answer "after which battle?"
-    if re.search(r'\b(which battle|after which battle|what battle|which war|after which war)\b', q_lower):
-        event_words = ["battle", "war", "rebellion", "siege", "campaign", "invasion", "attack", "offensive"]
-        has_event_word = any(w in answer.lower() for w in event_words)
-        if not has_event_word:
-            return "UNSURE", f"battle/war question but answer names no battle/war"
-
-    # "What does X mean" → answer should contain a definition signal
-    if re.search(r'\b(what does|what is the meaning|what does the (?:name|word|term))\b', q_lower):
-        has_def = any(w in answer.lower() for w in ["means", "meaning", "refers to", "derived from", "to cut", "headed"])
-        if not has_def and len(answer) < 150:
-            return "UNSURE", f"definition question but answer lacks definition"
+    # Phase A-2: removed hardcoded type-specific alignment checks.
+    # The universal LLM coherence check in verify() handles this properly.
 
     word_terms, number_terms = _extract_answer_key_terms(answer)
     if not word_terms and not number_terms:
@@ -268,6 +240,54 @@ def verify(
         question, answer, region_text,
         chunk_id=chunk_id, gist=gist,
     )
+    if verdict == "CONFIDENT":
+        # Phase A-2: LLM coherence check — the CORE of RLV's value.
+        # ONE universal prompt, no type-specific branching.
+        #
+        # Speed optimization: skip coherence for high-confidence answers.
+        # - "[NONE]" answers: instant UNSURE (model already said it can't answer)
+        # - Short specific answers (names, numbers): fast-accept
+        # - Vague/generic answers: require coherence check
+        answer_stripped = answer.strip()
+
+        # Self-check: model said NONE → instant UNSURE
+        if answer_stripped.startswith("[NONE]"):
+            if verbose:
+                print(f"[verifier] model self-check: NONE → UNSURE")
+            return VerifyResult(verdict="UNSURE", reason="model self-check: NONE", method="self-check")
+
+        # Fast-accept: short answer with specific entity (name/number/date)
+        # These are almost always correct when literal check passes
+        is_specific = (
+            len(answer_stripped) < 80 and
+            (bool(re.search(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b', answer_stripped)) or  # proper noun
+             bool(re.search(r'\b\d{3,4}\b', answer_stripped)))  # year/number
+        )
+        if is_specific:
+            if verbose:
+                print(f"[verifier] fast-accept: specific answer ({answer_stripped[:40]})")
+            return VerifyResult(verdict="CONFIDENT", reason=reason, method="literal+fast-accept")
+
+        # Generic/vague answer → LLM coherence check required
+        coherence_prompt = (
+            f"A user asked: \"{question}\"\n"
+            f"The system answered: \"{answer[:200]}\"\n\n"
+            f"Is the user's specific question answered? "
+            f"Not just related information, but the EXACT thing asked. "
+            f"YES or NO."
+        )
+        coherence_result = _llm.llm_call(coherence_prompt, max_tokens=4)
+        coherence_text = coherence_result.text.strip().lower()[:10]
+        if verbose:
+            print(f"[verifier] coherence check: {coherence_text!r}")
+        if "no" in coherence_text and "yes" not in coherence_text:
+            return VerifyResult(
+                verdict="UNSURE",
+                reason=f"literal:CONFIDENT but coherence:NO ({reason})",
+                method="literal+coherence",
+            )
+        return VerifyResult(verdict="CONFIDENT", reason=reason, method="literal+coherence")
+
     if verdict != "UNSURE" or not use_llm_fallback:
         if verbose:
             print(f"[verifier] literal -> {verdict} ({reason})")
