Skip to content

Commit 3a491db

Browse files
unamedkr and claude committed
fix(rlv): second-pass audit — 5 additional hardening fixes
Critical:
- locator.py: bounds-check chunk_id in _llm_locate() before gist.chunks access
- locator.py: guard empty rrf_ranked (IndexError when all chunks excluded)
- locator.py: BM25 division-by-zero guard (max(denom, 1e-9))
- locator.py: deterministic RRF sort by (-score, chunk_id) for tie-breaking

Robustness:
- _llm.py: LLMResult.is_error field + broader exception handling (ConnectionResetError, TimeoutError, OSError)
- lookup.py: early return on LLM error (both select and direct paths)
- gist.py: empty/whitespace document guard in build_gist()

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 68b2506 commit 3a491db

4 files changed

Lines changed: 36 additions & 4 deletions

File tree

bench/rlv/stages/_llm.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ class LLMResult:
5353
raw: str # the full CLI stdout+stderr
5454
n_tokens: int # generated token count
5555
elapsed: float # wall seconds
56+
is_error: bool = False # True if the call failed (text contains error message)
5657

5758

5859
def estimate_tokens(text: str) -> int:
@@ -252,9 +253,11 @@ def llm_call(
252253
try:
253254
with urllib.request.urlopen(req, timeout=600) as resp:
254255
payload = json.loads(resp.read().decode("utf-8"))
255-
except (urllib.error.URLError, urllib.error.HTTPError) as e:
256+
except (urllib.error.URLError, urllib.error.HTTPError, ConnectionResetError,
257+
TimeoutError, OSError) as e:
256258
elapsed = time.time() - t0
257-
return LLMResult(text=f"[ERROR: {e}]", raw=str(e), n_tokens=0, elapsed=elapsed)
259+
return LLMResult(text=f"[ERROR: {e}]", raw=str(e), n_tokens=0,
260+
elapsed=elapsed, is_error=True)
258261
elapsed = time.time() - t0
259262

260263
text = ""

bench/rlv/stages/gist.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,12 @@ def build_gist(
211211
a richer index for cases where the chunk head text isn't
212212
representative of the section.
213213
"""
214+
# Guard: empty or whitespace-only documents produce no chunks
215+
if not doc_text or not doc_text.strip():
216+
if verbose:
217+
print(f"[gist] doc_id={doc_id} — empty document, returning empty gist")
218+
return Gist(doc_id=doc_id, n_chars=len(doc_text or ""), chunks=[])
219+
214220
chunks_raw = chunk_document(doc_text, chunk_chars=chunk_chars)
215221
if verbose:
216222
print(f"[gist] doc_id={doc_id} len={len(doc_text)} chars, {len(chunks_raw)} chunks "

bench/rlv/stages/locator.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -286,7 +286,8 @@ def _bm25_score_chunks(question: str, gist: Gist, excluded: List[int],
286286
w[:min(4, len(w))] == term[:min(4, len(term))]))
287287
n = df.get(term, 0)
288288
idf = math.log((N - n + 0.5) / (n + 0.5) + 1.0) if n < N else 0.0
289-
tf_norm = (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * dl / max(avg_dl, 1)))
289+
denom = tf + k1 * (1 - b + b * dl / max(avg_dl, 1))
290+
tf_norm = (tf * (k1 + 1)) / max(denom, 1e-9)
290291
score += idf * tf_norm
291292
scores.append((chunk.chunk_id, score))
292293

@@ -330,6 +331,8 @@ def _llm_locate(
330331

331332
lines = []
332333
for choice_num, cid in enumerate(available, start=1):
334+
if cid >= len(gist.chunks):
335+
continue # skip invalid chunk_id
333336
chunk = gist.chunks[cid]
334337
text = (chunk.full_text or chunk.head_text).replace("\n", " ").strip()
335338
# Show first 2 sentences (more context than just head)
@@ -405,11 +408,21 @@ def locate(
405408
rrf[cid] = rrf.get(cid, 0) + 1.0 / (rrf_k + rank)
406409
for rank, (cid, _) in enumerate(bm25_scores):
407410
rrf[cid] = rrf.get(cid, 0) + 1.0 / (rrf_k + rank)
408-
rrf_ranked = sorted(rrf.items(), key=lambda x: x[1], reverse=True)
411+
# Sort by (score DESC, chunk_id ASC) for deterministic tie-breaking
412+
rrf_ranked = sorted(rrf.items(), key=lambda x: (-x[1], x[0]))
409413

410414
if verbose:
411415
print(f"[locator] rrf top3: {rrf_ranked[:3]}")
412416

417+
# Guard: if no chunks survived scoring, return first available
418+
if not rrf_ranked:
419+
chunk = available[0]
420+
return RegionPointer(
421+
chunk_id=chunk.chunk_id, confidence="low", method="fallback",
422+
candidates=[], char_start=chunk.char_start, char_end=chunk.char_end,
423+
score=0.0,
424+
)
425+
413426
# --- Step 4: LLM classification on top candidates ---
414427
# Always run LLM on the top 5 RRF candidates (not just when ambiguous)
415428
top_candidates = [cid for cid, _ in rrf_ranked[:5]]

bench/rlv/stages/lookup.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,11 @@ def lookup(
133133
print(f"[lookup] chunk {region.chunk_id} ({len(region_text)} chars), "
134134
f"{len(sentences)} sentences -> {mode}")
135135
result = _llm.llm_call(prompt, max_tokens=64)
136+
if result.is_error:
137+
return LookupResult(
138+
answer=result.text, region_text=region_text,
139+
chunk_id=region.chunk_id, raw_llm_output=result.text, method="error",
140+
)
136141
return LookupResult(
137142
answer=result.text.strip(),
138143
region_text=region_text,
@@ -156,6 +161,11 @@ def lookup(
156161

157162
# Only need a single digit — minimize tokens for slow CPU models
158163
result = _llm.llm_call(prompt, max_tokens=8)
164+
if result.is_error:
165+
return LookupResult(
166+
answer=result.text, region_text=region_text,
167+
chunk_id=region.chunk_id, raw_llm_output=result.text, method="error",
168+
)
159169
idx = _parse_sentence_index(result.text, len(sentences))
160170

161171
if idx < 1:

0 commit comments

Comments (0)