@@ -54,14 +54,34 @@ class LookupResult:
5454 method : str = "" # "select" | "quote" | "select-fallback"
5555
5656
57- _SENTENCE_SPLIT_RE = re .compile (r"(?<=[.!?])\s+" )
57+ # Common abbreviations that end with a period but aren't sentence endings.
58+ _ABBREVIATIONS = {"dr" , "mr" , "mrs" , "ms" , "jr" , "sr" , "st" , "vs" , "etc" ,
59+ "prof" , "rev" , "gen" , "corp" , "inc" , "ltd" , "vol" , "no" ,
60+ "approx" , "dept" , "est" , "govt" }
5861
5962
6063def _split_into_sentences (text : str ) -> List [str ]:
61- """Split text into sentences. Conservative: snap on period/!? followed
62- by whitespace. Filters out tiny fragments that aren't real sentences."""
63- parts = [s .strip () for s in _SENTENCE_SPLIT_RE .split (text ) if s .strip ()]
64- return [p for p in parts if len (p ) >= 8 ]
64+ """Split text into sentences. Snaps on period/!?/whitespace but avoids
65+ splitting on common abbreviations (Dr., Mr., etc.) and single-letter
66+ initials (J. K. Rowling).
67+ Filters out tiny fragments (< 8 chars) that aren't real sentences."""
68+ # Strategy: split on `. ` / `! ` / `? `, then re-join fragments that
69+ # ended with an abbreviation or single letter.
70+ raw = re .split (r"(?<=[.!?])\s+" , text )
71+ merged : List [str ] = []
72+ for frag in raw :
73+ frag = frag .strip ()
74+ if not frag :
75+ continue
76+ if merged :
77+ prev = merged [- 1 ]
78+ # Check if prev ended with an abbreviation or single initial
79+ last_word = prev .rsplit (None , 1 )[- 1 ].rstrip ("." ).lower () if prev else ""
80+ if last_word in _ABBREVIATIONS or (len (last_word ) == 1 and last_word .isalpha ()):
81+ merged [- 1 ] = prev + " " + frag
82+ continue
83+ merged .append (frag )
84+ return [s for s in merged if len (s ) >= 8 ]
6585
6686
6787def _parse_sentence_index (text : str , n_sentences : int ) -> int :
@@ -170,7 +190,7 @@ def lookup(
170190 window .append (sentences [i ])
171191 answer = " " .join (window )
172192 if verbose :
173- print (f"[lookup] selected sentence { idx } /{ len (sentences )} : { selected [:80 ]!r} " )
193+ print (f"[lookup] selected sentence { idx } /{ len (sentences )} : { sentences [ idx - 1 ] [:80 ]!r} " )
174194 return LookupResult (
175195 answer = answer ,
176196 region_text = region_text ,
0 commit comments