Commit 2bb8ae1

unamedkr and claude committed
fix(rlv): production hardening — 6 robustness improvements
1. Input validation (orchestrator.py):
   - Guard None/empty doc_text and question → clear error dict
   - No more crashes on invalid inputs
2. JSON response parsing (_llm.py):
   - Handle malformed/null choices, missing message field
   - Detect empty responses → mark is_error=True
   - Validate max_tokens (0/negative → default 64)
3. Refusal detection precision (verifier.py):
   - Only flag refusal when phrase is in first 120 chars AND answer < 200 chars
   - Prevents false positives on legitimate answers like "The study does not provide evidence for..." (long quoted content)
   - Narrowed phrase list to more specific patterns
4. BM25 IDF robustness (locator.py):
   - Clamp IDF to ≥ 0 (prevent negative scores on edge cases)
   - Adaptive RRF k: k=60 for <100 chunks, k=min(N,200) for large docs
5. Error-aware researcher (researcher.py):
   - Skip verification on lookup errors (server crash/timeout)
   - Log error and continue to next chunk instead of crashing
6. max_tokens validation (_llm.py):
   - Clamp ≤ 0 values to default 64

Tested: None inputs, empty docs, refusal precision (long vs short), BM25 all-identical-chunks, max_tokens=0.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent f8fd8b6 commit 2bb8ae1

5 files changed

Lines changed: 73 additions & 17 deletions


bench/rlv/rlv_orchestrator.py

Lines changed: 10 additions & 0 deletions
@@ -77,6 +77,16 @@ def answer_question(
 ) -> dict:
     """Run the full RLV pipeline. Returns a dict with the final answer
     and per-stage diagnostic info."""
+    # Input validation — fail fast with clear errors
+    if not doc_text or not isinstance(doc_text, str):
+        return {"question": question or "", "final_answer": "[ERROR: empty or invalid document]",
+                "confidence": "none", "research": {"verdict": "ERROR", "n_retries": 0, "attempts": []},
+                "timings": {}, "gist_n_chunks": 0}
+    if not question or not isinstance(question, str) or not question.strip():
+        return {"question": "", "final_answer": "[ERROR: empty or invalid question]",
+                "confidence": "none", "research": {"verdict": "ERROR", "n_retries": 0, "attempts": []},
+                "timings": {}, "gist_n_chunks": 0}
+
     t_start = time.time()
     timings = {}
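
A minimal sketch of the new guard behavior, using a hypothetical _validate_inputs helper that mirrors the two checks (answer_question's full signature is truncated in the hunk header, so it is not called directly here):

def _validate_inputs(doc_text, question):
    # Mirrors the two new fail-fast guards in answer_question().
    if not doc_text or not isinstance(doc_text, str):
        return "[ERROR: empty or invalid document]"
    if not question or not isinstance(question, str) or not question.strip():
        return "[ERROR: empty or invalid question]"
    return None  # inputs OK, pipeline would proceed

assert _validate_inputs(None, "What year?") == "[ERROR: empty or invalid document]"
assert _validate_inputs("Some doc text.", "   ") == "[ERROR: empty or invalid question]"
assert _validate_inputs("Some doc text.", "What year?") is None

Returning a fully shaped error dict (with research and timings keys) rather than raising means downstream consumers that index into the result keep working on bad inputs.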

bench/rlv/stages/_llm.py

Lines changed: 25 additions & 7 deletions
@@ -228,6 +228,10 @@ def llm_call(
     if _server_url is None:
         start_server(model=model)
 
+    # Validate max_tokens
+    if max_tokens <= 0:
+        max_tokens = 64
+
     messages = []
     if system:
         messages.append({"role": "system", "content": system})
@@ -260,12 +264,26 @@ def llm_call(
                          elapsed=elapsed, is_error=True)
     elapsed = time.time() - t0
 
+    # Robust JSON response parsing — handle malformed/incomplete responses
     text = ""
     n_tokens = 0
-    if "choices" in payload and payload["choices"]:
-        msg = payload["choices"][0].get("message", {})
-        text = msg.get("content", "").strip()
-    if "usage" in payload:
-        n_tokens = payload["usage"].get("completion_tokens", 0)
-
-    return LLMResult(text=text, raw=json.dumps(payload), n_tokens=n_tokens, elapsed=elapsed)
+    is_error = False
+    try:
+        choices = payload.get("choices")
+        if choices and isinstance(choices, list) and len(choices) > 0:
+            msg = choices[0].get("message") or choices[0].get("delta") or {}
+            text = (msg.get("content") or "").strip()
+        usage = payload.get("usage")
+        if usage and isinstance(usage, dict):
+            n_tokens = usage.get("completion_tokens", 0)
+    except (KeyError, TypeError, IndexError, AttributeError):
+        is_error = True
+        text = f"[ERROR: malformed response: {str(payload)[:200]}]"
+
+    if not text and not is_error:
+        # Server returned empty content — treat as soft error
+        is_error = True
+        text = "[ERROR: empty response from server]"
+
+    return LLMResult(text=text, raw=json.dumps(payload) if isinstance(payload, dict) else str(payload),
+                     n_tokens=n_tokens, elapsed=elapsed, is_error=is_error)
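
The parsing change is easiest to sanity-check in isolation. A hedged sketch, with a hypothetical _parse_payload helper replaying the logic the diff inlines into llm_call:

def _parse_payload(payload):
    # Returns (text, n_tokens, is_error), mirroring the hardened branch above.
    text, n_tokens, is_error = "", 0, False
    try:
        choices = payload.get("choices")
        if choices and isinstance(choices, list) and len(choices) > 0:
            msg = choices[0].get("message") or choices[0].get("delta") or {}
            text = (msg.get("content") or "").strip()
        usage = payload.get("usage")
        if usage and isinstance(usage, dict):
            n_tokens = usage.get("completion_tokens", 0)
    except (KeyError, TypeError, IndexError, AttributeError):
        is_error = True
        text = f"[ERROR: malformed response: {str(payload)[:200]}]"
    if not text and not is_error:
        is_error = True
        text = "[ERROR: empty response from server]"
    return text, n_tokens, is_error

assert _parse_payload({"choices": [{"message": {"content": "42"}}],
                       "usage": {"completion_tokens": 3}}) == ("42", 3, False)
assert _parse_payload({"choices": None})[2] is True             # null choices -> soft error
assert _parse_payload({"choices": [{"message": None}]})[2] is True  # null message -> soft error

Since every access goes through .get with falsy fallbacks, most remaining failure modes (non-dict payloads, non-dict choice entries) funnel into AttributeError; the wider except tuple costs nothing.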

bench/rlv/stages/locator.py

Lines changed: 8 additions & 2 deletions
@@ -285,7 +285,9 @@ def _bm25_score_chunks(question: str, gist: Gist, excluded: List[int],
                     (len(w) >= 3 and len(term) >= 3 and
                      w[:min(4, len(w))] == term[:min(4, len(term))]))
         n = df.get(term, 0)
-        idf = math.log((N - n + 0.5) / (n + 0.5) + 1.0) if n < N else 0.0
+        # Standard BM25 IDF: log((N-n+0.5)/(n+0.5)+1). Terms appearing
+        # in ALL chunks (n==N) get idf=0 (no discriminating power).
+        idf = max(0.0, math.log((N - n + 0.5) / (n + 0.5) + 1.0)) if n < N else 0.0
         denom = tf + k1 * (1 - b + b * dl / max(avg_dl, 1))
         tf_norm = (tf * (k1 + 1)) / max(denom, 1e-9)
         score += idf * tf_norm
@@ -402,7 +404,11 @@ def locate(
         print(f"[locator] bm25 top3: {bm25_scores[:3]}")
 
     # --- Step 3: Reciprocal Rank Fusion (keyword + BM25) ---
-    rrf_k = 60
+    # RRF k parameter: controls how much the top ranks dominate.
+    # Standard k=60 works for <100 chunks. For very large documents
+    # (400+ chunks), increase k to preserve ranking discrimination.
+    n_chunks = len(kw_scores)
+    rrf_k = 60 if n_chunks < 100 else min(n_chunks, 200)
     rrf = {}
     for rank, (cid, _) in enumerate(kw_scores):
         rrf[cid] = rrf.get(cid, 0) + 1.0 / (rrf_k + rank)
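
Two worked numbers make the edge cases concrete. With the +1.0 inside the log, the argument stays above 1 for any 0 ≤ n ≤ N, so the max(0.0, ...) is a defensive guard against float oddities; it is the classic IDF without the +1 that genuinely goes negative once a term appears in more than half the chunks. A sketch:

import math

# Classic BM25 IDF vs. the +1 variant used above, for a term in 8 of 10 chunks.
N, n = 10, 8
classic = math.log((N - n + 0.5) / (n + 0.5))           # ~= -1.22 (negative)
plus_one = math.log((N - n + 0.5) / (n + 0.5) + 1.0)    # ~= 0.26 (non-negative)
clamped = max(0.0, plus_one)                            # unchanged here; pure safety net
print(classic, plus_one, clamped)

# Adaptive RRF k, as in the diff:
for n_chunks in (50, 150, 400):
    rrf_k = 60 if n_chunks < 100 else min(n_chunks, 200)
    print(n_chunks, "->", rrf_k)  # 50 -> 60, 150 -> 150, 400 -> 200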

bench/rlv/stages/researcher.py

Lines changed: 14 additions & 0 deletions
@@ -65,6 +65,20 @@ def research(
             break
 
         new_lookup = lookup.lookup(question, new_region, doc_text, verbose=verbose)
+
+        # Skip verification if lookup returned an error (server crash/timeout)
+        if new_lookup.method == "error":
+            if verbose:
+                print(f"[researcher] lookup error on chunk {new_region.chunk_id}, skipping")
+            excluded.append(new_region.chunk_id)
+            attempts.append({
+                "chunk": new_region.chunk_id,
+                "answer": new_lookup.answer,
+                "verdict": "ERROR",
+                "reason": "lookup error",
+            })
+            continue
+
         new_verify = verifier.verify(
             question, new_lookup.answer, gist,
             region_text=new_lookup.region_text,
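
A hedged sketch of the skip-and-record pattern, using a FakeLookup stub in place of the real lookup result object (only the method and answer fields from the diff are modeled):

from dataclasses import dataclass

@dataclass
class FakeLookup:  # stand-in for the real lookup result object
    method: str
    answer: str

lookups = [(3, FakeLookup("error", "[ERROR: server timeout]")),
           (7, FakeLookup("literal", "Founded in 1987."))]
attempts, excluded = [], []
for chunk_id, lk in lookups:
    if lk.method == "error":
        excluded.append(chunk_id)
        attempts.append({"chunk": chunk_id, "answer": lk.answer,
                         "verdict": "ERROR", "reason": "lookup error"})
        continue  # no verifier call on a dead lookup; try the next chunk
    attempts.append({"chunk": chunk_id, "answer": lk.answer,
                     "verdict": "(verifier would run here)"})

print(excluded)  # [3]: the crashed chunk is excluded from future retries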

bench/rlv/stages/verifier.py

Lines changed: 16 additions & 8 deletions
@@ -191,17 +191,25 @@ def _literal_verify(
     if not q_ok:
         return "CONTRADICTED", f"question not grounded ({q_reason}) — likely wrong chunk"
 
-    # Day 5: detect "I don't know" / "not provided" answers — these should
-    # never be CONFIDENT. The model is explicitly saying it couldn't find
-    # the answer, so send it back to RESEARCH for a different chunk.
+    # Day 5: detect "I don't know" / "not provided" refusal answers.
+    # These should never be CONFIDENT — the model is saying it couldn't
+    # find the answer, so send it back to RESEARCH for a different chunk.
+    #
+    # Production hardening: only detect refusal when the phrase appears
+    # in the FIRST 120 chars of the answer (not embedded in a valid
+    # quoted sentence like "The study does not provide evidence for...").
+    # Also require the answer to be SHORT (< 200 chars) — long answers
+    # that happen to contain a refusal phrase are likely real content.
     answer_lower = answer.lower()
+    answer_head = answer_lower[:120]
     refusal_phrases = [
-        "does not provide", "not provide", "no information",
-        "not contain", "not mention", "cannot determine",
-        "unable to", "not specified", "not stated", "not available",
-        "i don't know", "i'm not sure", "unclear",
+        "does not provide", "no information", "not contain the answer",
+        "cannot determine", "unable to find", "unable to determine",
+        "not specified in", "not stated in", "not available in",
+        "i don't know", "i'm not sure", "no relevant information",
+        "the text does not", "the passage does not",
     ]
-    if any(p in answer_lower for p in refusal_phrases):
+    if len(answer) < 200 and any(p in answer_head for p in refusal_phrases):
         return "UNSURE", f"answer is a refusal ('{answer[:60]}...')"
 
     word_terms, number_terms = _extract_answer_key_terms(answer)
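
A hedged demo of the precision fix, with a standalone _is_refusal mirror of the check above. The three cases match what the commit message claims: a genuine short refusal, a long evidence-bearing answer that quotes a refusal phrase, and a short answer whose refusal phrase sits past the 120-char head:

REFUSAL_PHRASES = [
    "does not provide", "no information", "not contain the answer",
    "cannot determine", "unable to find", "unable to determine",
    "not specified in", "not stated in", "not available in",
    "i don't know", "i'm not sure", "no relevant information",
    "the text does not", "the passage does not",
]

def _is_refusal(answer: str) -> bool:
    # Mirrors the hardened check: short answer AND phrase within the head.
    head = answer.lower()[:120]
    return len(answer) < 200 and any(p in head for p in REFUSAL_PHRASES)

# Genuine short refusal: flagged.
assert _is_refusal("The text does not mention a founding year.")

# Long answer that QUOTES a refusal phrase: not flagged (225 chars, so the
# length guard applies even though the phrase is in the head).
long_answer = ("The study does not provide evidence for X, the authors argue, "
               "but it does report a 14% reduction in error rate across all three "
               "benchmarks, an effect the authors attribute to the new validation "
               "stage described in section 4.2.")
assert not _is_refusal(long_answer)

# Short answer with a refusal phrase only in its tail (past char 120): not flagged.
tail_answer = ("Revenue was $4.2M in 2019 and $5.1M in 2020; growth was driven by "
               "the APAC region, though the filing notes the 2021 figure is "
               "not specified in this excerpt.")
assert not _is_refusal(tail_answer)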
