Commit 2bb8ae1

unamedkr and claude committed
fix(rlv): production hardening — 6 robustness improvements
1. Input validation (orchestrator.py):
   - Guard None/empty doc_text and question → clear error dict
   - No more crashes on invalid inputs
2. JSON response parsing (_llm.py):
   - Handle malformed/null choices, missing message field
   - Detect empty responses → mark is_error=True
   - Validate max_tokens (0/negative → default 64)
3. Refusal detection precision (verifier.py):
   - Only flag refusal when phrase is in first 120 chars AND answer < 200 chars
   - Prevents false positives on legitimate answers like "The study does not provide evidence for..." (long quoted content)
   - Narrowed phrase list to more specific patterns
4. BM25 IDF robustness (locator.py):
   - Clamp IDF to ≥ 0 (prevent negative scores on edge cases)
   - Adaptive RRF k: k=60 for <100 chunks, k=min(N,200) for large docs
5. Error-aware researcher (researcher.py):
   - Skip verification on lookup errors (server crash/timeout)
   - Log error and continue to next chunk instead of crashing
6. max_tokens validation (_llm.py):
   - Clamp ≤ 0 values to default 64

Tested: None inputs, empty docs, refusal precision (long vs short), BM25 all-identical-chunks, max_tokens=0.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent f8fd8b6 commit 2bb8ae1

5 files changed

Lines changed: 73 additions & 17 deletions


bench/rlv/rlv_orchestrator.py

Lines changed: 10 additions & 0 deletions
@@ -77,6 +77,16 @@ def answer_question(
 ) -> dict:
     """Run the full RLV pipeline. Returns a dict with the final answer
     and per-stage diagnostic info."""
+    # Input validation — fail fast with clear errors
+    if not doc_text or not isinstance(doc_text, str):
+        return {"question": question or "", "final_answer": "[ERROR: empty or invalid document]",
+                "confidence": "none", "research": {"verdict": "ERROR", "n_retries": 0, "attempts": []},
+                "timings": {}, "gist_n_chunks": 0}
+    if not question or not isinstance(question, str) or not question.strip():
+        return {"question": "", "final_answer": "[ERROR: empty or invalid question]",
+                "confidence": "none", "research": {"verdict": "ERROR", "n_retries": 0, "attempts": []},
+                "timings": {}, "gist_n_chunks": 0}
+
     t_start = time.time()
     timings = {}
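
A minimal sketch of the new guard behavior, using a hypothetical _validate_inputs helper that mirrors the two checks (answer_question's full signature is truncated in the hunk header, so it is not called directly here):

def _validate_inputs(doc_text, question):
    # Mirrors the two new fail-fast guards in answer_question().
    if not doc_text or not isinstance(doc_text, str):
        return "[ERROR: empty or invalid document]"
    if not question or not isinstance(question, str) or not question.strip():
        return "[ERROR: empty or invalid question]"
    return None  # inputs OK, pipeline would proceed

assert _validate_inputs(None, "What year?") == "[ERROR: empty or invalid document]"
assert _validate_inputs("Some doc text.", "   ") == "[ERROR: empty or invalid question]"
assert _validate_inputs("Some doc text.", "What year?") is None

Returning a fully shaped error dict (with research and timings keys) rather than raising means downstream consumers that index into the result keep working on bad inputs.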

bench/rlv/stages/_llm.py

Lines changed: 25 additions & 7 deletions
@@ -228,6 +228,10 @@ def llm_call(
     if _server_url is None:
         start_server(model=model)
 
+    # Validate max_tokens
+    if max_tokens <= 0:
+        max_tokens = 64
+
     messages = []
     if system:
         messages.append({"role": "system", "content": system})
@@ -260,12 +264,26 @@ def llm_call(
                          elapsed=elapsed, is_error=True)
     elapsed = time.time() - t0
 
+    # Robust JSON response parsing — handle malformed/incomplete responses
     text = ""
     n_tokens = 0
-    if "choices" in payload and payload["choices"]:
-        msg = payload["choices"][0].get("message", {})
-        text = msg.get("content", "").strip()
-    if "usage" in payload:
-        n_tokens = payload["usage"].get("completion_tokens", 0)
-
-    return LLMResult(text=text, raw=json.dumps(payload), n_tokens=n_tokens, elapsed=elapsed)
+    is_error = False
+    try:
+        choices = payload.get("choices")
+        if choices and isinstance(choices, list) and len(choices) > 0:
+            msg = choices[0].get("message") or choices[0].get("delta") or {}
+            text = (msg.get("content") or "").strip()
+        usage = payload.get("usage")
+        if usage and isinstance(usage, dict):
+            n_tokens = usage.get("completion_tokens", 0)
+    except (KeyError, TypeError, IndexError, AttributeError):
+        is_error = True
+        text = f"[ERROR: malformed response: {str(payload)[:200]}]"
+
+    if not text and not is_error:
+        # Server returned empty content — treat as soft error
+        is_error = True
+        text = "[ERROR: empty response from server]"
+
+    return LLMResult(text=text, raw=json.dumps(payload) if isinstance(payload, dict) else str(payload),
+                     n_tokens=n_tokens, elapsed=elapsed, is_error=is_error)
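
The parsing change is easiest to sanity-check in isolation. A hedged sketch, with a hypothetical _parse_payload helper replaying the logic the diff inlines into llm_call:

def _parse_payload(payload):
    # Returns (text, n_tokens, is_error), mirroring the hardened branch above.
    text, n_tokens, is_error = "", 0, False
    try:
        choices = payload.get("choices")
        if choices and isinstance(choices, list) and len(choices) > 0:
            msg = choices[0].get("message") or choices[0].get("delta") or {}
            text = (msg.get("content") or "").strip()
        usage = payload.get("usage")
        if usage and isinstance(usage, dict):
            n_tokens = usage.get("completion_tokens", 0)
    except (KeyError, TypeError, IndexError, AttributeError):
        is_error = True
        text = f"[ERROR: malformed response: {str(payload)[:200]}]"
    if not text and not is_error:
        is_error = True
        text = "[ERROR: empty response from server]"
    return text, n_tokens, is_error

assert _parse_payload({"choices": [{"message": {"content": "42"}}],
                       "usage": {"completion_tokens": 3}}) == ("42", 3, False)
assert _parse_payload({"choices": None})[2] is True             # null choices -> soft error
assert _parse_payload({"choices": [{"message": None}]})[2] is True  # null message -> soft error

Since every access goes through .get with falsy fallbacks, most remaining failure modes (non-dict payloads, non-dict choice entries) funnel into AttributeError; the wider except tuple costs nothing.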

bench/rlv/stages/locator.py

Lines changed: 8 additions & 2 deletions
@@ -285,7 +285,9 @@ def _bm25_score_chunks(question: str, gist: Gist, excluded: List[int],
                     (len(w) >= 3 and len(term) >= 3 and
                      w[:min(4, len(w))] == term[:min(4, len(term))]))
         n = df.get(term, 0)
-        idf = math.log((N - n + 0.5) / (n + 0.5) + 1.0) if n < N else 0.0
+        # Standard BM25 IDF: log((N-n+0.5)/(n+0.5)+1). Terms appearing
+        # in ALL chunks (n==N) get idf=0 (no discriminating power).
+        idf = max(0.0, math.log((N - n + 0.5) / (n + 0.5) + 1.0)) if n < N else 0.0
         denom = tf + k1 * (1 - b + b * dl / max(avg_dl, 1))
         tf_norm = (tf * (k1 + 1)) / max(denom, 1e-9)
         score += idf * tf_norm
@@ -402,7 +404,11 @@ def locate(
         print(f"[locator] bm25 top3: {bm25_scores[:3]}")
 
     # --- Step 3: Reciprocal Rank Fusion (keyword + BM25) ---
-    rrf_k = 60
+    # RRF k parameter: controls how much the top ranks dominate.
+    # Standard k=60 works for <100 chunks. For very large documents
+    # (400+ chunks), increase k to preserve ranking discrimination.
+    n_chunks = len(kw_scores)
+    rrf_k = 60 if n_chunks < 100 else min(n_chunks, 200)
     rrf = {}
     for rank, (cid, _) in enumerate(kw_scores):
         rrf[cid] = rrf.get(cid, 0) + 1.0 / (rrf_k + rank)
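
Two worked numbers make the edge cases concrete. With the +1.0 inside the log, the argument stays above 1 for any 0 ≤ n ≤ N, so the max(0.0, ...) is a defensive guard against float oddities; it is the classic IDF without the +1 that genuinely goes negative once a term appears in more than half the chunks. A sketch:

import math

# Classic BM25 IDF vs. the +1 variant used above, for a term in 8 of 10 chunks.
N, n = 10, 8
classic = math.log((N - n + 0.5) / (n + 0.5))           # ~= -1.22 (negative)
plus_one = math.log((N - n + 0.5) / (n + 0.5) + 1.0)    # ~= 0.26 (non-negative)
clamped = max(0.0, plus_one)                            # unchanged here; pure safety net
print(classic, plus_one, clamped)

# Adaptive RRF k, as in the diff:
for n_chunks in (50, 150, 400):
    rrf_k = 60 if n_chunks < 100 else min(n_chunks, 200)
    print(n_chunks, "->", rrf_k)  # 50 -> 60, 150 -> 150, 400 -> 200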

bench/rlv/stages/researcher.py

Lines changed: 14 additions & 0 deletions
@@ -65,6 +65,20 @@ def research(
             break
 
         new_lookup = lookup.lookup(question, new_region, doc_text, verbose=verbose)
+
+        # Skip verification if lookup returned an error (server crash/timeout)
+        if new_lookup.method == "error":
+            if verbose:
+                print(f"[researcher] lookup error on chunk {new_region.chunk_id}, skipping")
+            excluded.append(new_region.chunk_id)
+            attempts.append({
+                "chunk": new_region.chunk_id,
+                "answer": new_lookup.answer,
+                "verdict": "ERROR",
+                "reason": "lookup error",
+            })
+            continue
+
         new_verify = verifier.verify(
             question, new_lookup.answer, gist,
             region_text=new_lookup.region_text,
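
A hedged sketch of the skip-and-record pattern, using a FakeLookup stub in place of the real lookup result object (only the method and answer fields from the diff are modeled):

from dataclasses import dataclass

@dataclass
class FakeLookup:  # stand-in for the real lookup result object
    method: str
    answer: str

lookups = [(3, FakeLookup("error", "[ERROR: server timeout]")),
           (7, FakeLookup("literal", "Founded in 1987."))]
attempts, excluded = [], []
for chunk_id, lk in lookups:
    if lk.method == "error":
        excluded.append(chunk_id)
        attempts.append({"chunk": chunk_id, "answer": lk.answer,
                         "verdict": "ERROR", "reason": "lookup error"})
        continue  # no verifier call on a dead lookup; try the next chunk
    attempts.append({"chunk": chunk_id, "answer": lk.answer,
                     "verdict": "(verifier would run here)"})

print(excluded)  # [3]: the crashed chunk is excluded from future retries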

bench/rlv/stages/verifier.py

Lines changed: 16 additions & 8 deletions
@@ -191,17 +191,25 @@ def _literal_verify(
     if not q_ok:
         return "CONTRADICTED", f"question not grounded ({q_reason}) — likely wrong chunk"
 
-    # Day 5: detect "I don't know" / "not provided" answers — these should
-    # never be CONFIDENT. The model is explicitly saying it couldn't find
-    # the answer, so send it back to RESEARCH for a different chunk.
+    # Day 5: detect "I don't know" / "not provided" refusal answers.
+    # These should never be CONFIDENT — the model is saying it couldn't
+    # find the answer, so send it back to RESEARCH for a different chunk.
+    #
+    # Production hardening: only detect refusal when the phrase appears
+    # in the FIRST 120 chars of the answer (not embedded in a valid
+    # quoted sentence like "The study does not provide evidence for...").
+    # Also require the answer to be SHORT (< 200 chars) — long answers
+    # that happen to contain a refusal phrase are likely real content.
     answer_lower = answer.lower()
+    answer_head = answer_lower[:120]
     refusal_phrases = [
-        "does not provide", "not provide", "no information",
-        "not contain", "not mention", "cannot determine",
-        "unable to", "not specified", "not stated", "not available",
-        "i don't know", "i'm not sure", "unclear",
+        "does not provide", "no information", "not contain the answer",
+        "cannot determine", "unable to find", "unable to determine",
+        "not specified in", "not stated in", "not available in",
+        "i don't know", "i'm not sure", "no relevant information",
+        "the text does not", "the passage does not",
     ]
-    if any(p in answer_lower for p in refusal_phrases):
+    if len(answer) < 200 and any(p in answer_head for p in refusal_phrases):
         return "UNSURE", f"answer is a refusal ('{answer[:60]}...')"
 
     word_terms, number_terms = _extract_answer_key_terms(answer)
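
A hedged demo of the precision fix, with a standalone _is_refusal mirror of the check above. The three cases match what the commit message claims: a genuine short refusal, a long evidence-bearing answer that quotes a refusal phrase, and a short answer whose refusal phrase sits past the 120-char head:

REFUSAL_PHRASES = [
    "does not provide", "no information", "not contain the answer",
    "cannot determine", "unable to find", "unable to determine",
    "not specified in", "not stated in", "not available in",
    "i don't know", "i'm not sure", "no relevant information",
    "the text does not", "the passage does not",
]

def _is_refusal(answer: str) -> bool:
    # Mirrors the hardened check: short answer AND phrase within the head.
    head = answer.lower()[:120]
    return len(answer) < 200 and any(p in head for p in REFUSAL_PHRASES)

# Genuine short refusal: flagged.
assert _is_refusal("The text does not mention a founding year.")

# Long answer that QUOTES a refusal phrase: not flagged (225 chars, so the
# length guard applies even though the phrase is in the head).
long_answer = ("The study does not provide evidence for X, the authors argue, "
               "but it does report a 14% reduction in error rate across all three "
               "benchmarks, an effect the authors attribute to the new validation "
               "stage described in section 4.2.")
assert not _is_refusal(long_answer)

# Short answer with a refusal phrase only in its tail (past char 120): not flagged.
tail_answer = ("Revenue was $4.2M in 2019 and $5.1M in 2020; growth was driven by "
               "the APAC region, though the filing notes the 2021 figure is "
               "not specified in this excerpt.")
assert not _is_refusal(tail_answer)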
