|
| 1 | +#!/usr/bin/env python3 |
| 2 | +"""Phase A-2: Large document stress test — 1.3MB wikitext (860+ chunks). |
| 3 | +
|
| 4 | +This tests whether RLV scales to REAL document sizes. The original |
| 5 | +eval_wikitext.py uses ppl_8k.txt (35K chars, 23 chunks, 3 articles). |
| 6 | +This uses wikitext2_test.txt (1.3MB, ~860 chunks, 63 articles). |
| 7 | +
|
| 8 | +Key challenges at this scale: |
| 9 | +- 860 chunks → locator must discriminate among 37x more candidates |
| 10 | +- 63 articles with overlapping topics (military history, sports, poetry) |
| 11 | +- BM25 IDF changes: common words have lower discrimination power |
| 12 | +- Multiple articles about similar subjects (battles, sports figures) |
| 13 | +""" |
| 14 | +import argparse |
| 15 | +import sys |
| 16 | +import time |
| 17 | +from pathlib import Path |
| 18 | + |
| 19 | +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) |
| 20 | + |
| 21 | +from rlv_orchestrator import answer_question |
| 22 | +from stages import _llm |
| 23 | +from stages import gist as gist_stage |
| 24 | + |
| 25 | +DOC_PATH = Path(__file__).resolve().parent.parent.parent / "data" / "wikitext2_test.txt" |
| 26 | + |
| 27 | +# 20 questions across 10+ different articles — diverse topics, difficulty levels |
# 20 questions across 10+ different articles — diverse topics, difficulty levels
# Schema per entry:
#   id        — unique integer, selectable via --only
#   topic     — short article tag used in the results table
#   type      — "single-hop" or "multi-hop" (retrieval difficulty class)
#   question  — the natural-language query passed to answer_question()
#   fragments — lowercase substrings; any one appearing in the collected
#               answer text counts as a pass (see fuzzy_hit)
QUESTIONS = [
    # === Already-tested articles (Boulter, Du Fu, Kiss You) — regression check ===
    {"id": 1, "topic": "boulter", "type": "single-hop",
     "question": "Who directed the production of Mercury Fur in which Boulter appeared?",
     "fragments": ["john tiffany", "tiffany"]},
    {"id": 2, "topic": "dufu", "type": "single-hop",
     "question": "In what year did Du Fu first meet Li Bai?",
     "fragments": ["744"]},
    {"id": 3, "topic": "kiss_you", "type": "single-hop",
     "question": "Who directed the Kiss You music video?",
     "fragments": ["vaughan arnell", "arnell"]},

    # === Military history (Ise-class, Naktong, Ironclad) ===
    {"id": 4, "topic": "ise_class", "type": "single-hop",
     "question": "What disaster did the Ise-class battleships carry supplies for in 1923?",
     "fragments": ["earthquake", "kanto", "kantō"]},
    {"id": 5, "topic": "ise_class", "type": "multi-hop",
     "question": "After which battle were the Ise-class ships rebuilt with a flight deck?",
     "fragments": ["midway"]},
    {"id": 6, "topic": "ironclad", "type": "single-hop",
     "question": "What was the name of the first ironclad battleship launched in 1859?",
     "fragments": ["gloire"]},
    {"id": 7, "topic": "naktong", "type": "single-hop",
     "question": "The Second Battle of Naktong Bulge was part of which larger battle?",
     "fragments": ["pusan perimeter", "pusan"]},

    # === Sports (Dick Rifenburg, Clayton Kershaw, Ben Amos) ===
    {"id": 8, "topic": "rifenburg", "type": "single-hop",
     "question": "What NFL team did Dick Rifenburg play for in 1950?",
     "fragments": ["detroit lions", "detroit", "lions"]},
    {"id": 9, "topic": "kershaw", "type": "single-hop",
     "question": "In what year was Clayton Kershaw drafted?",
     "fragments": ["2006"]},
    {"id": 10, "topic": "kershaw", "type": "single-hop",
     "question": "On what date did Clayton Kershaw pitch a no-hitter?",
     "fragments": ["june 18", "2014"]},
    {"id": 11, "topic": "amos", "type": "single-hop",
     "question": "Which Manchester United academy did Ben Amos join from?",
     "fragments": ["crewe", "crewe alexandra"]},

    # === Science/Weather (Dvorak technique, Hurricane, Temnospondyli) ===
    {"id": 12, "topic": "dvorak", "type": "single-hop",
     "question": "Who developed the Dvorak technique for estimating tropical cyclone intensity?",
     "fragments": ["vernon dvorak", "vernon", "dvorak"]},
    {"id": 13, "topic": "hurricane", "type": "single-hop",
     "question": "Where did the 1933 Treasure Coast hurricane make landfall in Florida?",
     "fragments": ["jupiter"]},
    {"id": 14, "topic": "hurricane", "type": "single-hop",
     "question": "What were the peak winds of the 1933 Treasure Coast hurricane?",
     "fragments": ["140"]},
    {"id": 15, "topic": "temno", "type": "single-hop",
     "question": "What does the Greek word 'temnein' mean in the name Temnospondyli?",
     "fragments": ["cut"]},

    # === Literature/Arts (Imagism, Little Gidding, Portage) ===
    {"id": 16, "topic": "imagism", "type": "single-hop",
     "question": "Imagism is considered the first organized movement of what literary period?",
     "fragments": ["modernist", "modernism"]},
    {"id": 17, "topic": "gidding", "type": "single-hop",
     "question": "Little Gidding is the fourth poem in which series by T.S. Eliot?",
     "fragments": ["four quartets", "quartets"]},
    {"id": 18, "topic": "portage", "type": "multi-hop",
     "question": "Who wrote The Portage to San Cristobal of A.H., a novella about Nazi hunters?",
     "fragments": ["george steiner", "steiner"]},

    # === Geography/Infrastructure (NY Route 31B, Osbert de Bayeux) ===
    {"id": 19, "topic": "route", "type": "single-hop",
     "question": "NY State Route 31B connected Weedsport to which route?",
     # NOTE(review): the bare fragment "5" matches almost any answer containing
     # the digit 5, making this question nearly impossible to fail — confirm
     # whether "route 5" was intended instead.
     "fragments": ["ny 5", "5"]},
    {"id": 20, "topic": "osbert", "type": "single-hop",
     "question": "In which diocese was Osbert de Bayeux an archdeacon?",
     "fragments": ["york"]},
]
| 101 | + |
| 102 | + |
def fuzzy_hit(text, fragments):
    """Score an answer by substring matching against expected fragments.

    Args:
        text: the collected answer text (any case).
        fragments: iterable of expected answer substrings.

    Returns:
        (hit, matched): ``hit`` is True if at least one fragment occurs in
        *text*; ``matched`` lists the fragments (as given) that occurred.

    Fix: matching is now case-insensitive on BOTH sides. The original
    lowercased only *text*, so an uppercase character in a fragment could
    never match. All current fragment lists are already lowercase, so
    behavior on existing data is unchanged.
    """
    haystack = text.lower()
    matched = [f for f in fragments if f.lower() in haystack]
    return (len(matched) > 0, matched)
| 107 | + |
| 108 | + |
def collect_text(result):
    """Gather all answer text from an orchestrator result, lowercased.

    Joins the final answer with every per-attempt answer so fragment
    scoring can credit evidence surfaced during research even when the
    final synthesis phrases it differently.
    """
    attempts = result.get("research", {}).get("attempts", [])
    pieces = [result.get("final_answer", "")]
    pieces.extend(attempt.get("answer", "") or "" for attempt in attempts)
    return " ".join(pieces).lower()
| 114 | + |
| 115 | + |
def main():
    """Run the Phase A-2 large-document QA gate.

    Builds the gist once (no LLM), answers each question in QUESTIONS
    (optionally filtered by --only), scores answers via fragment matching,
    and prints a per-question results table.

    Returns:
        0 if every executed question passed; 1 otherwise, including the
        degenerate case where --only matched no question id (previously
        that empty run reported "PASS (0/0)" and exited 0).
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--verbose", action="store_true")
    parser.add_argument("--only", type=int, default=None)
    args = parser.parse_args()

    doc_text = DOC_PATH.read_text(encoding="utf-8", errors="replace")
    # Fix: the topic count was hard-coded as 13 but QUESTIONS spans 17
    # distinct topics; derive it so the banner stays correct as data changes.
    n_topics = len({q["topic"] for q in QUESTIONS})
    print("=" * 76)
    print("Phase A-2: LARGE DOCUMENT stress test (1.3MB wikitext, 860+ chunks)")
    print("=" * 76)
    print(f"Document: {DOC_PATH.name}")
    print(f"  chars: {len(doc_text):,}")
    print(f"  est tokens: ~{len(doc_text)//3:,}")
    print("  articles: 63")
    print(f"  questions: {len(QUESTIONS)} across {n_topics} topics")
    print("-" * 76)

    _llm.start_server()
    t_start = time.time()
    try:
        print("[setup] building gist (one-time, no LLM)...")
        cached_gist = gist_stage.build_gist(doc_text, doc_id="wikitext2_full", verbose=False)
        print(f"[setup] gist has {len(cached_gist.chunks)} chunks")
        print()

        passed = 0
        results = []
        for q in QUESTIONS:
            if args.only is not None and q["id"] != args.only:
                continue
            print(f"--- Q{q['id']} ({q['topic']}, {q['type']}) ---")
            print(f"Q: {q['question']}")
            t_q = time.time()
            try:
                r = answer_question(doc_text, q["question"], doc_id="wt2_full",
                                    cached_gist=cached_gist, verbose=args.verbose)
            except Exception as e:
                # Best-effort gate: record the failure and keep running the
                # remaining questions rather than aborting the whole suite.
                print(f"  ERROR: {e}")
                results.append({"q": q, "ok": False})
                continue
            elapsed = time.time() - t_q
            scoring_text = collect_text(r)
            ok, matched = fuzzy_hit(scoring_text, q["fragments"])
            if ok:
                passed += 1
            mark = "PASS" if ok else "FAIL"
            print(f"  [{mark}] ({elapsed:.1f}s) {r['final_answer'][:120]!r}")
            if ok:
                print(f"  matched: {matched}")
            else:
                print(f"  expected: {q['fragments']}")
            print()
            results.append({"q": q, "ok": ok, "elapsed": elapsed})

    finally:
        # Always release the LLM server, even on setup failure or Ctrl-C.
        _llm.stop_server()

    total = time.time() - t_start
    n = len(results)
    print("=" * 76)
    print(f"RESULTS: {passed}/{n} in {total:.0f}s")
    print("=" * 76)
    for r in results:
        q = r["q"]
        mark = "OK" if r["ok"] else "XX"
        print(f"  {q['id']:>2} {q['topic']:<12} {q['type']:<10} {mark}")
    print()
    if n == 0:
        # Nothing ran (e.g. --only given an unknown id): not a pass.
        print("  LARGE DOC GATE: NO QUESTIONS RUN")
        return 1
    if passed == n:
        print(f"  LARGE DOC GATE: PASS ✅ ({passed}/{n})")
    else:
        print(f"  LARGE DOC GATE: {passed}/{n} ({100*passed/n:.0f}%)")
    return 0 if passed == n else 1
| 187 | + |
| 188 | + |
# Script entry point: exit status 0 only when every executed question passed.
if __name__ == "__main__":
    sys.exit(main())
0 commit comments