@@ -286,7 +286,8 @@ def _bm25_score_chunks(question: str, gist: Gist, excluded: List[int],
286286 w [:min (4 , len (w ))] == term [:min (4 , len (term ))]))
287287 n = df .get (term , 0 )
288288 idf = math .log ((N - n + 0.5 ) / (n + 0.5 ) + 1.0 ) if n < N else 0.0
289- tf_norm = (tf * (k1 + 1 )) / (tf + k1 * (1 - b + b * dl / max (avg_dl , 1 )))
289+ denom = tf + k1 * (1 - b + b * dl / max (avg_dl , 1 ))
290+ tf_norm = (tf * (k1 + 1 )) / max (denom , 1e-9 )
290291 score += idf * tf_norm
291292 scores .append ((chunk .chunk_id , score ))
292293
@@ -330,6 +331,8 @@ def _llm_locate(
330331
331332 lines = []
332333 for choice_num , cid in enumerate (available , start = 1 ):
334+ if cid >= len (gist .chunks ):
335+ continue # skip invalid chunk_id
333336 chunk = gist .chunks [cid ]
334337 text = (chunk .full_text or chunk .head_text ).replace ("\n " , " " ).strip ()
335338 # Show first 2 sentences (more context than just head)
@@ -405,11 +408,21 @@ def locate(
405408 rrf [cid ] = rrf .get (cid , 0 ) + 1.0 / (rrf_k + rank )
406409 for rank , (cid , _ ) in enumerate (bm25_scores ):
407410 rrf [cid ] = rrf .get (cid , 0 ) + 1.0 / (rrf_k + rank )
408- rrf_ranked = sorted (rrf .items (), key = lambda x : x [1 ], reverse = True )
411+ # Sort by (score DESC, chunk_id ASC) for deterministic tie-breaking
412+ rrf_ranked = sorted (rrf .items (), key = lambda x : (- x [1 ], x [0 ]))
409413
410414 if verbose :
411415 print (f"[locator] rrf top3: { rrf_ranked [:3 ]} " )
412416
417+ # Guard: if no chunks survived scoring, return first available
418+ if not rrf_ranked :
419+ chunk = available [0 ]
420+ return RegionPointer (
421+ chunk_id = chunk .chunk_id , confidence = "low" , method = "fallback" ,
422+ candidates = [], char_start = chunk .char_start , char_end = chunk .char_end ,
423+ score = 0.0 ,
424+ )
425+
413426 # --- Step 4: LLM classification on top candidates ---
414427 # Always run LLM on the top 5 RRF candidates (not just when ambiguous)
415428 top_candidates = [cid for cid , _ in rrf_ranked [:5 ]]
0 commit comments