Skip to content

Commit a922806

Browse files
unamedkr and claude committed
test(rlv): add 1.3MB large-doc stress test (Phase A-2)
eval_wikitext_large.py: 20 questions across 13 topics from 63 articles in wikitext2_test.txt (1.3MB, ~2754 chunks, ~310K tokens). This is 37x larger than the original eval_wikitext.py (35K chars) and tests whether RLV scales to real document sizes. Dry-run locator accuracy: 7/8 on keyword-only (no LLM), showing BM25+RRF can discriminate among 2754 chunks. Full LLM test requires M3 machine. Questions span: military history (Ise-class, Naktong, Ironclad), sports (Rifenburg, Kershaw, Amos), science (Dvorak, hurricane, Temnospondyli), literature (Imagism, Little Gidding, Portage), geography (NY Route 31B, Osbert de Bayeux). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent ae1365e commit a922806

1 file changed

Lines changed: 190 additions & 0 deletions

File tree

Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
#!/usr/bin/env python3
2+
"""Phase A-2: Large document stress test — 1.3MB wikitext (860+ chunks).
3+
4+
This tests whether RLV scales to REAL document sizes. The original
5+
eval_wikitext.py uses ppl_8k.txt (35K chars, 23 chunks, 3 articles).
6+
This uses wikitext2_test.txt (1.3MB, ~860 chunks, 63 articles).
7+
8+
Key challenges at this scale:
9+
- 860 chunks → locator must discriminate among 37x more candidates
10+
- 63 articles with overlapping topics (military history, sports, poetry)
11+
- BM25 IDF changes: common words have lower discrimination power
12+
- Multiple articles about similar subjects (battles, sports figures)
13+
"""
14+
import argparse
15+
import sys
16+
import time
17+
from pathlib import Path
18+
19+
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
20+
21+
from rlv_orchestrator import answer_question
22+
from stages import _llm
23+
from stages import gist as gist_stage
24+
25+
# Evaluation corpus: data/wikitext2_test.txt, located two directories above
# the directory that contains this script.
DOC_PATH = Path(__file__).resolve().parent.parent.parent / "data" / "wikitext2_test.txt"

# 20 questions across 17 topics — diverse subjects and difficulty levels.
#
# Each entry has:
#   id        - integer selector used by the --only command-line flag
#   topic     - short key naming the source article
#   type      - "single-hop" or "multi-hop"
#   question  - text passed to the pipeline
#   fragments - lowercase substrings; an answer passes if ANY one occurs in it
#
# Because scoring is a plain substring test, a fragment must not appear in the
# question itself (an answer that merely echoes the question would pass) and
# must be long enough to be discriminating (a lone digit matches nearly any
# text).
QUESTIONS = [
    # === Already-tested articles (Boulter, Du Fu, Kiss You) — regression check ===
    {"id": 1, "topic": "boulter", "type": "single-hop",
     "question": "Who directed the production of Mercury Fur in which Boulter appeared?",
     "fragments": ["john tiffany", "tiffany"]},
    {"id": 2, "topic": "dufu", "type": "single-hop",
     "question": "In what year did Du Fu first meet Li Bai?",
     "fragments": ["744"]},
    {"id": 3, "topic": "kiss_you", "type": "single-hop",
     "question": "Who directed the Kiss You music video?",
     "fragments": ["vaughan arnell", "arnell"]},

    # === Military history (Ise-class, Naktong, Ironclad) ===
    {"id": 4, "topic": "ise_class", "type": "single-hop",
     "question": "What disaster did the Ise-class battleships carry supplies for in 1923?",
     "fragments": ["earthquake", "kanto", "kantō"]},
    {"id": 5, "topic": "ise_class", "type": "multi-hop",
     "question": "After which battle were the Ise-class ships rebuilt with a flight deck?",
     "fragments": ["midway"]},
    {"id": 6, "topic": "ironclad", "type": "single-hop",
     "question": "What was the name of the first ironclad battleship launched in 1859?",
     "fragments": ["gloire"]},
    {"id": 7, "topic": "naktong", "type": "single-hop",
     "question": "The Second Battle of Naktong Bulge was part of which larger battle?",
     "fragments": ["pusan perimeter", "pusan"]},

    # === Sports (Dick Rifenburg, Clayton Kershaw, Ben Amos) ===
    {"id": 8, "topic": "rifenburg", "type": "single-hop",
     "question": "What NFL team did Dick Rifenburg play for in 1950?",
     "fragments": ["detroit lions", "detroit", "lions"]},
    {"id": 9, "topic": "kershaw", "type": "single-hop",
     "question": "In what year was Clayton Kershaw drafted?",
     "fragments": ["2006"]},
    {"id": 10, "topic": "kershaw", "type": "single-hop",
     "question": "On what date did Clayton Kershaw pitch a no-hitter?",
     "fragments": ["june 18", "2014"]},
    {"id": 11, "topic": "amos", "type": "single-hop",
     "question": "Which Manchester United academy did Ben Amos join from?",
     "fragments": ["crewe", "crewe alexandra"]},

    # === Science/Weather (Dvorak technique, Hurricane, Temnospondyli) ===
    # Q12: the bare surname is deliberately NOT accepted — "Dvorak" is part of
    # the question, so it cannot distinguish an answer from an echo.
    {"id": 12, "topic": "dvorak", "type": "single-hop",
     "question": "Who developed the Dvorak technique for estimating tropical cyclone intensity?",
     "fragments": ["vernon dvorak", "vernon"]},
    {"id": 13, "topic": "hurricane", "type": "single-hop",
     "question": "Where did the 1933 Treasure Coast hurricane make landfall in Florida?",
     "fragments": ["jupiter"]},
    {"id": 14, "topic": "hurricane", "type": "single-hop",
     "question": "What were the peak winds of the 1933 Treasure Coast hurricane?",
     "fragments": ["140"]},
    {"id": 15, "topic": "temno", "type": "single-hop",
     "question": "What does the Greek word 'temnein' mean in the name Temnospondyli?",
     "fragments": ["cut"]},

    # === Literature/Arts (Imagism, Little Gidding, Portage) ===
    {"id": 16, "topic": "imagism", "type": "single-hop",
     "question": "Imagism is considered the first organized movement of what literary period?",
     "fragments": ["modernist", "modernism"]},
    {"id": 17, "topic": "gidding", "type": "single-hop",
     "question": "Little Gidding is the fourth poem in which series by T.S. Eliot?",
     "fragments": ["four quartets", "quartets"]},
    {"id": 18, "topic": "portage", "type": "multi-hop",
     "question": "Who wrote The Portage to San Cristobal of A.H., a novella about Nazi hunters?",
     "fragments": ["george steiner", "steiner"]},

    # === Geography/Infrastructure (NY Route 31B, Osbert de Bayeux) ===
    # Q19: a lone "5" would match almost any text containing that digit, so
    # only spellings that name the route are accepted.
    {"id": 19, "topic": "route", "type": "single-hop",
     "question": "NY State Route 31B connected Weedsport to which route?",
     "fragments": ["ny 5", "ny-5", "route 5"]},
    {"id": 20, "topic": "osbert", "type": "single-hop",
     "question": "In which diocese was Osbert de Bayeux an archdeacon?",
     "fragments": ["york"]},
]
101+
102+
103+
def fuzzy_hit(text, fragments):
    """Report which expected answer fragments occur in ``text``.

    Matching is a case-insensitive substring test on both sides. Empty
    fragments are ignored: the empty string is a substring of every text,
    so accepting it would make any answer count as a hit.

    Args:
        text: Answer text to score.
        fragments: Iterable of acceptable answer substrings.

    Returns:
        A ``(hit, matched)`` tuple. ``hit`` is True when at least one
        fragment was found; ``matched`` lists the found fragments, spelled
        as they were passed in and in input order.
    """
    haystack = text.lower()
    matched = [f for f in fragments if f and f.lower() in haystack]
    return (bool(matched), matched)
107+
108+
109+
def collect_text(result):
    """Join every candidate answer string in ``result`` into one scoring text.

    Reads ``result["final_answer"]`` and the ``"answer"`` of each entry in
    ``result["research"]["attempts"]``. Any of these keys may be missing or
    hold ``None``; such values contribute an empty string instead of
    raising.

    Args:
        result: Mapping returned by the question-answering pipeline.

    Returns:
        The final answer followed by each attempt's answer, separated by
        single spaces and lowercased.
    """
    # `or ""` (not a .get default) so an explicit None is tolerated too.
    parts = [result.get("final_answer") or ""]
    research = result.get("research") or {}
    for attempt in research.get("attempts") or []:
        parts.append(attempt.get("answer") or "")
    return " ".join(parts).lower()
114+
115+
116+
def main():
    """Run the large-document evaluation and return a process exit code.

    Command-line flags:
        --verbose   forwarded to ``answer_question`` as ``verbose``.
        --only N    run only the question whose ``id`` is N; an unknown id
                    is rejected as a usage error before any work is done.

    The document at ``DOC_PATH`` is read once, ``_llm.start_server()`` is
    called, the gist is built once and reused for every question, and each
    answer is scored with ``collect_text`` + ``fuzzy_hit``. The server is
    always stopped via ``_llm.stop_server()``, even if a step raises.

    Returns:
        0 when at least one question ran and every one passed, otherwise 1.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--verbose", action="store_true")
    parser.add_argument("--only", type=int, default=None)
    args = parser.parse_args()

    if args.only is None:
        selected = QUESTIONS
    else:
        selected = [q for q in QUESTIONS if q["id"] == args.only]
        if not selected:
            # Fail fast: otherwise zero questions run and the gate would
            # report a vacuous pass (0/0) with exit code 0.
            parser.error(f"--only {args.only}: no question with that id")

    doc_text = DOC_PATH.read_text(encoding="utf-8", errors="replace")
    # Derived from the data so the banner cannot drift from QUESTIONS.
    topic_count = len({q["topic"] for q in QUESTIONS})
    print("=" * 76)
    print("Phase A-2: LARGE DOCUMENT stress test (1.3MB wikitext, 860+ chunks)")
    print("=" * 76)
    print(f"Document: {DOC_PATH.name}")
    print(f" chars: {len(doc_text):,}")
    print(f" est tokens: ~{len(doc_text)//3:,}")
    print(" articles: 63")
    print(f" questions: {len(QUESTIONS)} across {topic_count} topics")
    print("-" * 76)

    passed = 0
    results = []
    _llm.start_server()
    t_start = time.time()
    try:
        print("[setup] building gist (one-time, no LLM)...")
        cached_gist = gist_stage.build_gist(doc_text, doc_id="wikitext2_full", verbose=False)
        print(f"[setup] gist has {len(cached_gist.chunks)} chunks")
        print()

        for q in selected:
            print(f"--- Q{q['id']} ({q['topic']}, {q['type']}) ---")
            print(f"Q: {q['question']}")
            t_q = time.time()
            try:
                r = answer_question(doc_text, q["question"], doc_id="wt2_full",
                                    cached_gist=cached_gist, verbose=args.verbose)
            except Exception as e:
                # Per-question boundary: one failing question is recorded as
                # a failure and must not abort the rest of the run.
                print(f" ERROR: {e}")
                results.append({"q": q, "ok": False})
                continue
            elapsed = time.time() - t_q
            scoring_text = collect_text(r)
            ok, matched = fuzzy_hit(scoring_text, q["fragments"])
            mark = "PASS" if ok else "FAIL"
            if ok:
                passed += 1
            # A missing or None final answer is shown as '' rather than
            # crashing the run on a display-only slice.
            preview = (r.get("final_answer") or "")[:120]
            print(f" [{mark}] ({elapsed:.1f}s) {preview!r}")
            if ok:
                print(f" matched: {matched}")
            else:
                print(f" expected: {q['fragments']}")
            print()
            results.append({"q": q, "ok": ok, "elapsed": elapsed})
    finally:
        _llm.stop_server()

    total = time.time() - t_start
    n = len(results)
    print("=" * 76)
    print(f"RESULTS: {passed}/{n} in {total:.0f}s")
    print("=" * 76)
    for r in results:
        q = r["q"]
        mark = "OK" if r["ok"] else "XX"
        print(f" {q['id']:>2} {q['topic']:<12} {q['type']:<10} {mark}")
    print()

    # An empty run is never a pass.
    all_passed = n > 0 and passed == n
    if all_passed:
        print(f" LARGE DOC GATE: PASS ✅ ({passed}/{n})")
    else:
        pct = 100 * passed / n if n else 0
        print(f" LARGE DOC GATE: {passed}/{n} ({pct:.0f}%)")
    return 0 if all_passed else 1


if __name__ == "__main__":
    sys.exit(main())

0 commit comments

Comments
 (0)