@@ -11,6 +11,8 @@ export interface HypothesisEvidenceSeed {
1111 dataset_slot ?: string ;
1212 metric_slot ?: string ;
1313 confidence ?: number ;
14+ source_type ?: "full_text" | "abstract" ;
15+ confidence_reason ?: string ;
1416}
1517
1618export interface HypothesisCandidate {
@@ -73,10 +75,12 @@ export interface HypothesisSelectionScore {
7375 candidate_id : string ;
7476 raw_base_score : number ;
7577 base_score : number ;
78+ evidence_quality_adjustment : number ;
7679 implementation_bonus : number ;
7780 bundling_penalty : number ;
7881 scope_penalty : number ;
7982 diversity_penalty : number ;
83+ evidence_quality_notes : string [ ] ;
8084 final_score : number ;
8185}
8286
@@ -179,6 +183,7 @@ const HYPOTHESIS_REVIEW_SYSTEM_PROMPT = [
179183 "Critique hypothesis drafts for groundedness, causal clarity, falsifiability, experimentability, and objective-metric alignment." ,
180184 "Apply hard gates: hypotheses with too few evidence links, ignored limitations/counterexamples, or no operational measurement plan should not survive review." ,
181185 "When the objective is reproducibility, penalize performance-only hypotheses that do not specify a repeated-run or stability-based outcome." ,
186+ "Penalize hypotheses that rely mostly on abstract-only or heavily caveated evidence when stronger full-text evidence is available." ,
182187 "Revise weak wording instead of praising it." ,
183188 "Return one JSON object only." ,
184189 "No markdown, no prose outside JSON."
@@ -331,7 +336,13 @@ export async function generateHypothesesFromEvidence(args: {
331336 `Hard-gated ${ gatedCandidates . rejected . length } single-pass hypothesis candidate(s) for weak grounding or missing measurement detail.`
332337 ) ;
333338 }
334- const selected = selectHypothesesWithDiversity ( gatedCandidates . kept , [ ] , topK , args . objectiveMetric ) ;
339+ const selected = selectHypothesesWithDiversity (
340+ gatedCandidates . kept ,
341+ [ ] ,
342+ topK ,
343+ args . objectiveMetric ,
344+ args . evidenceSeeds
345+ ) ;
335346 if ( gatedCandidates . kept . length === 0 || selected . selected . length === 0 ) {
336347 throw new Error ( "No valid hypothesis candidates were returned." ) ;
337348 }
@@ -368,11 +379,19 @@ export async function generateHypothesesFromEvidence(args: {
368379 const legacyReason = legacyError instanceof Error ? legacyError . message : String ( legacyError ) ;
369380 args . onProgress ?.( `Hypothesis generation fallback: ${ legacyReason } ` ) ;
370381 const fallback = buildFallbackHypotheses ( args . evidenceSeeds , branchCount , topK ) ;
382+ const fallbackSelection = selectHypothesesWithDiversity (
383+ fallback . candidates ,
384+ [ ] ,
385+ topK ,
386+ args . objectiveMetric ,
387+ args . evidenceSeeds
388+ ) ;
389+ const fallbackSelected = fallbackSelection . selected . length > 0 ? fallbackSelection . selected : fallback . selected ;
371390 return {
372391 source : "fallback" ,
373392 summary : `Fallback generated ${ fallback . candidates . length } hypothesis candidate(s).` ,
374393 candidates : fallback . candidates ,
375- selected : fallback . selected ,
394+ selected : fallbackSelected ,
376395 fallbackReason : `${ stagedReason } ; single_pass=${ legacyReason } ` ,
377396 toolCallsUsed : 0 ,
378397 artifacts : {
@@ -381,18 +400,9 @@ export async function generateHypothesesFromEvidence(args: {
381400 drafts : fallback . candidates ,
382401 reviews : [ ] ,
383402 selection : {
384- selected_ids : fallback . selected . map ( ( candidate ) => candidate . id ) ,
385- ranked_ids : fallback . candidates . map ( ( candidate ) => candidate . id ) ,
386- scores : fallback . candidates . map ( ( candidate ) => ( {
387- candidate_id : candidate . id ,
388- raw_base_score : scoreHypothesis ( candidate ) ,
389- base_score : scoreHypothesis ( candidate ) ,
390- implementation_bonus : 0 ,
391- bundling_penalty : 0 ,
392- scope_penalty : 0 ,
393- diversity_penalty : 0 ,
394- final_score : scoreHypothesis ( candidate )
395- } ) )
403+ selected_ids : fallbackSelected . map ( ( candidate ) => candidate . id ) ,
404+ ranked_ids : fallbackSelection . ranked . map ( ( candidate ) => candidate . id ) ,
405+ scores : fallbackSelection . scores
396406 } ,
397407 llm_trace : {
398408 drafts : [ ]
@@ -530,7 +540,13 @@ async function runStagedHypothesisPipeline(args: {
530540 `Hard-gated ${ gatedCandidates . rejected . length } staged hypothesis candidate(s) for weak grounding or missing measurement detail.`
531541 ) ;
532542 }
533- const selection = selectHypothesesWithDiversity ( gatedCandidates . kept , reviews , args . topK , args . objectiveMetric ) ;
543+ const selection = selectHypothesesWithDiversity (
544+ gatedCandidates . kept ,
545+ reviews ,
546+ args . topK ,
547+ args . objectiveMetric ,
548+ evidencePanel
549+ ) ;
534550
535551 if ( selection . selected . length === 0 ) {
536552 throw new Error ( "no_selected_hypotheses" ) ;
@@ -666,19 +682,7 @@ function buildHypothesisPrompt(
666682 ] ;
667683
668684 evidenceSeeds . slice ( 0 , 16 ) . forEach ( ( seed , index ) => {
669- lines . push (
670- [
671- `${ index + 1 } . evidence_id=${ seed . evidence_id ?? `ev_${ index + 1 } ` } ` ,
672- `paper_id=${ seed . paper_id ?? "unknown" } ` ,
673- `claim=${ seed . claim ?? "unknown" } ` ,
674- seed . limitation_slot ? `limitation=${ seed . limitation_slot } ` : undefined ,
675- seed . dataset_slot ? `dataset=${ seed . dataset_slot } ` : undefined ,
676- seed . metric_slot ? `metric=${ seed . metric_slot } ` : undefined ,
677- typeof seed . confidence === "number" ? `confidence=${ seed . confidence } ` : undefined
678- ]
679- . filter ( Boolean )
680- . join ( " | " )
681- ) ;
685+ lines . push ( renderEvidenceSeed ( seed , index ) ) ;
682686 } ) ;
683687
684688 return lines . join ( "\n" ) ;
@@ -1271,14 +1275,18 @@ function selectHypothesesWithDiversity(
12711275 candidates : HypothesisCandidate [ ] ,
12721276 reviews : HypothesisReview [ ] ,
12731277 topK : number ,
1274- objectiveMetric ?: string
1278+ objectiveMetric ?: string ,
1279+ evidenceSeeds : HypothesisEvidenceSeed [ ] = [ ]
12751280) : { selected : HypothesisCandidate [ ] ; ranked : HypothesisCandidate [ ] ; scores : HypothesisSelectionScore [ ] } {
12761281 const reviewMap = new Map ( reviews . map ( ( review ) => [ review . candidate_id , review ] as const ) ) ;
1282+ const evidenceById = new Map (
1283+ evidenceSeeds . map ( ( seed , index ) => [ seed . evidence_id || `ev_${ index + 1 } ` , seed ] as const )
1284+ ) ;
12771285 const pool = reviews . length > 0 ? candidates . filter ( ( candidate ) => reviewMap . get ( candidate . id ) ?. keep === true ) : candidates ;
12781286 const adjustedBaseById = new Map (
12791287 pool . map ( ( candidate ) => [
12801288 candidate . id ,
1281- buildHypothesisSelectionBase ( candidate , reviewMap . get ( candidate . id ) , objectiveMetric )
1289+ buildHypothesisSelectionBase ( candidate , reviewMap . get ( candidate . id ) , objectiveMetric , evidenceById )
12821290 ] as const )
12831291 ) ;
12841292 const ranked = [ ...pool ] . sort (
@@ -1310,7 +1318,8 @@ function selectHypothesesWithDiversity(
13101318 const score = buildHypothesisSelectionScore (
13111319 candidate ,
13121320 selected ,
1313- adjustedBaseById . get ( candidate . id ) ?? buildHypothesisSelectionBase ( candidate , reviewMap . get ( candidate . id ) , objectiveMetric )
1321+ adjustedBaseById . get ( candidate . id ) ??
1322+ buildHypothesisSelectionBase ( candidate , reviewMap . get ( candidate . id ) , objectiveMetric , evidenceById )
13141323 ) ;
13151324 if ( score . final_score > bestScore ) {
13161325 bestIndex = index ;
@@ -1341,7 +1350,8 @@ function selectHypothesesWithDiversity(
13411350 buildHypothesisSelectionScore (
13421351 candidate ,
13431352 selected . filter ( ( item ) => item . id !== candidate . id ) ,
1344- adjustedBaseById . get ( candidate . id ) ?? buildHypothesisSelectionBase ( candidate , reviewMap . get ( candidate . id ) , objectiveMetric )
1353+ adjustedBaseById . get ( candidate . id ) ??
1354+ buildHypothesisSelectionBase ( candidate , reviewMap . get ( candidate . id ) , objectiveMetric , evidenceById )
13451355 )
13461356 ) ;
13471357 }
@@ -1398,7 +1408,8 @@ function selectHypothesisEvidencePanel(
13981408 ( seed . limitation_slot ? 3 : 0 ) +
13991409 ( seed . dataset_slot ? 2 : 0 ) +
14001410 ( seed . metric_slot ? 2 : 0 ) +
1401- ( seed . claim ? Math . min ( 2 , seed . claim . length / 80 ) : 0 )
1411+ ( seed . claim ? Math . min ( 2 , seed . claim . length / 80 ) : 0 ) +
1412+ assessEvidenceSeedQuality ( seed ) . panel_adjustment
14021413 } ) )
14031414 . sort ( ( a , b ) => b . score - a . score || a . index - b . index ) ;
14041415
@@ -1440,12 +1451,19 @@ function renderEvidenceSeed(seed: HypothesisEvidenceSeed, index: number): string
14401451 seed . limitation_slot ? `limitation=${ seed . limitation_slot } ` : undefined ,
14411452 seed . dataset_slot ? `dataset=${ seed . dataset_slot } ` : undefined ,
14421453 seed . metric_slot ? `metric=${ seed . metric_slot } ` : undefined ,
1443- typeof seed . confidence === "number" ? `confidence=${ seed . confidence } ` : undefined
1454+ typeof seed . confidence === "number" ? `confidence=${ seed . confidence } ` : undefined ,
1455+ seed . source_type ? `source_type=${ seed . source_type } ` : undefined ,
1456+ seed . confidence_reason ? `confidence_reason=${ truncateEvidenceReason ( seed . confidence_reason ) } ` : undefined
14441457 ]
14451458 . filter ( Boolean )
14461459 . join ( " | " ) ;
14471460}
14481461
/**
 * Shortens a free-text confidence reason so it fits on one evidence row.
 *
 * The result is embedded in a single " | "-delimited row (see
 * `renderEvidenceSeed`), so whitespace runs, including newlines, are collapsed
 * to one space before measuring. Values longer than 120 UTF-16 code units are
 * cut and suffixed with "..." so the result never exceeds 120 code units.
 *
 * @param value Raw reason text.
 * @returns The normalized reason, truncated with a trailing "..." when needed.
 */
function truncateEvidenceReason(value: string): string {
  const maxLength = 120;
  const ellipsis = "...";

  // Keep the value on one line so it cannot split the row it is rendered into.
  const normalized = value.replace(/\s+/g, " ").trim();
  if (normalized.length <= maxLength) {
    return normalized;
  }

  let cut = maxLength - ellipsis.length;
  // Do not cut between the two halves of a surrogate pair: a dangling high
  // surrogate is not valid text and would corrupt the rendered row.
  const lastUnit = normalized.charCodeAt(cut - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    cut -= 1;
  }
  return `${normalized.slice(0, cut)}${ellipsis}`;
}
1466+
14491467function roleLabel ( kind : HypothesisGeneratorKind ) : string {
14501468 switch ( kind ) {
14511469 case "mechanism" :
@@ -1776,24 +1794,35 @@ function hypothesisBaseScore(candidate: HypothesisCandidate, objectiveMetric?: s
/**
 * Computes the part of a candidate's selection score that does not depend on
 * which other candidates have already been selected.
 *
 * base_score = raw base score + evidence-quality adjustment
 *              + implementation bonus - bundling penalty - scope penalty
 *
 * @param candidate Hypothesis being scored.
 * @param review Review for this candidate, if one exists.
 * @param objectiveMetric Optional objective metric forwarded to the raw scorer.
 * @param evidenceById Lookup from evidence id to seed, used to judge the
 *   quality of the candidate's linked evidence. Defaults to an empty map.
 * @returns Every score component plus the notes explaining the evidence adjustment.
 */
function buildHypothesisSelectionBase(
  candidate: HypothesisCandidate,
  review: HypothesisReview | undefined,
  objectiveMetric?: string,
  evidenceById: Map<string, HypothesisEvidenceSeed> = new Map()
): {
  raw_base_score: number;
  base_score: number;
  evidence_quality_adjustment: number;
  implementation_bonus: number;
  bundling_penalty: number;
  scope_penalty: number;
  evidence_quality_notes: string[];
} {
  const raw = hypothesisBaseScore(candidate, objectiveMetric);
  const { adjustment: evidenceAdjustment, notes: evidenceNotes } = assessCandidateEvidenceSupport(
    candidate,
    evidenceById
  );
  const bonus = hypothesisImplementationBonus(candidate, review);
  const bundling = hypothesisBundlingPenalty(candidate, review);
  const scope = hypothesisScopePenalty(candidate, review);

  // Additions first, then penalties, evaluated left to right.
  const adjusted = raw + evidenceAdjustment + bonus - bundling - scope;

  return {
    raw_base_score: raw,
    base_score: adjusted,
    evidence_quality_adjustment: evidenceAdjustment,
    implementation_bonus: bonus,
    bundling_penalty: bundling,
    scope_penalty: scope,
    evidence_quality_notes: evidenceNotes
  };
}
17991828
@@ -1803,24 +1832,127 @@ function buildHypothesisSelectionScore(
18031832 adjustedBase : {
18041833 raw_base_score : number ;
18051834 base_score : number ;
1835+ evidence_quality_adjustment : number ;
18061836 implementation_bonus : number ;
18071837 bundling_penalty : number ;
18081838 scope_penalty : number ;
1839+ evidence_quality_notes : string [ ] ;
18091840 }
18101841) : HypothesisSelectionScore {
18111842 const diversityPenalty = calculateDiversityPenalty ( candidate , selected ) ;
18121843 return {
18131844 candidate_id : candidate . id ,
18141845 raw_base_score : adjustedBase . raw_base_score ,
18151846 base_score : adjustedBase . base_score ,
1847+ evidence_quality_adjustment : adjustedBase . evidence_quality_adjustment ,
18161848 implementation_bonus : adjustedBase . implementation_bonus ,
18171849 bundling_penalty : adjustedBase . bundling_penalty ,
18181850 scope_penalty : adjustedBase . scope_penalty ,
18191851 diversity_penalty : diversityPenalty ,
1852+ evidence_quality_notes : adjustedBase . evidence_quality_notes ,
18201853 final_score : adjustedBase . base_score - diversityPenalty
18211854 } ;
18221855}
18231856
/**
 * Scores how well a candidate's linked evidence supports it.
 *
 * Each id in `candidate.evidence_links` is resolved through `evidenceById`. The
 * per-seed `candidate_adjustment` values from `assessEvidenceSeedQuality` are
 * averaged, then shifted by aggregate rules:
 *   - every resolved seed has `source_type === "abstract"`: -0.5, "abstract_only_support"
 *   - at least two seeds with an adjustment >= 0.2:        +0.35, "multi_source_support"
 *   - every seed has an adjustment <= -0.75:               -0.5, "all_support_caveated"
 *
 * @param candidate Hypothesis whose `evidence_links` are inspected.
 * @param evidenceById Lookup from evidence id to seed.
 * @returns The adjustment rounded to three decimals and the de-duplicated notes.
 *   An empty lookup yields a neutral result; a non-empty lookup in which none
 *   of the links resolve yields -0.75 with "missing_linked_evidence".
 */
function assessCandidateEvidenceSupport(
  candidate: HypothesisCandidate,
  evidenceById: Map<string, HypothesisEvidenceSeed>
): { adjustment: number; notes: string[] } {
  // An empty lookup means the caller supplied no evidence at all (the default
  // for this parameter chain is an empty list / empty map). Nothing can be
  // verified in that case, so stay neutral instead of flagging every candidate
  // as "missing_linked_evidence".
  if (evidenceById.size === 0) {
    return { adjustment: 0, notes: [] };
  }

  const linkedEvidence = dedupeStrings(candidate.evidence_links)
    .map((evidenceId) => evidenceById.get(evidenceId))
    .filter((seed): seed is HypothesisEvidenceSeed => Boolean(seed));
  if (linkedEvidence.length === 0) {
    return {
      adjustment: -0.75,
      notes: ["missing_linked_evidence"]
    };
  }

  const assessments = linkedEvidence.map((seed) => assessEvidenceSeedQuality(seed));
  let adjustment =
    assessments.reduce((sum, assessment) => sum + assessment.candidate_adjustment, 0) / assessments.length;
  const notes = dedupeStrings(assessments.flatMap((assessment) => assessment.notes));

  if (linkedEvidence.every((seed) => seed.source_type === "abstract")) {
    adjustment -= 0.5;
    notes.push("abstract_only_support");
  }

  // Two strong assessments already imply at least two linked seeds.
  const strongEvidenceCount = assessments.filter((assessment) => assessment.candidate_adjustment >= 0.2).length;
  if (strongEvidenceCount >= 2) {
    adjustment += 0.35;
    notes.push("multi_source_support");
  }

  // linkedEvidence is non-empty here, so equality alone means "all risky".
  const riskyEvidenceCount = assessments.filter((assessment) => assessment.candidate_adjustment <= -0.75).length;
  if (riskyEvidenceCount === linkedEvidence.length) {
    adjustment -= 0.5;
    notes.push("all_support_caveated");
  }

  return {
    adjustment: Number(adjustment.toFixed(3)),
    notes: dedupeStrings(notes)
  };
}
1898+
/**
 * Rates a single evidence seed from its source type, confidence value and
 * confidence-reason wording.
 *
 * Two running totals are kept because the seed is scored in two places:
 * `panel_adjustment` feeds evidence-panel ranking and `candidate_adjustment`
 * feeds per-candidate evidence support. Both are rounded to three decimals.
 *
 * @param seed Evidence seed to rate.
 * @returns Both adjustments plus de-duplicated notes naming each rule that fired.
 */
function assessEvidenceSeedQuality(
  seed: HypothesisEvidenceSeed
): { panel_adjustment: number; candidate_adjustment: number; notes: string[] } {
  const ungroundedPattern =
    /(could not be grounded|not be grounded|fallback evidence|no structured evidence|synthesi[sz]ed)/;
  const indirectPattern = /(only the abstract|abstract-level|abstract only|indirect|supplemental)/;
  const limitedPattern = /(single benchmark|external validity|limited|tentative|weak|caveat|partial support)/;

  const totals = { panel: 0, candidate: 0 };
  const flags: string[] = [];
  // Applies one rule: shifts both totals and optionally records why.
  const apply = (panelDelta: number, candidateDelta: number, flag?: string): void => {
    totals.panel += panelDelta;
    totals.candidate += candidateDelta;
    if (flag) {
      flags.push(flag);
    }
  };

  // Source type: full text is rewarded, abstract is penalized, unknown is neutral.
  switch (seed.source_type) {
    case "full_text":
      apply(0.75, 0.4, "full_text_support");
      break;
    case "abstract":
      apply(-1.1, -0.85, "abstract_support");
      break;
    default:
      break;
  }

  // A missing or non-finite confidence is treated as 0.5, which lands in the lowest band.
  const { confidence: rawConfidence } = seed;
  const confidence = typeof rawConfidence === "number" && Number.isFinite(rawConfidence) ? rawConfidence : 0.5;
  if (confidence < 0.55) {
    apply(-1.2, -1.1, "low_confidence");
  } else if (confidence < 0.7) {
    apply(-0.55, -0.45, "mid_confidence");
  } else if (confidence >= 0.9) {
    apply(0.3, 0.2);
  }

  const reason = (seed.confidence_reason || "").toLowerCase();
  if (reason.length > 0) {
    // "Ungrounded" wording takes precedence over "indirect" wording; only one of the two applies.
    if (ungroundedPattern.test(reason)) {
      apply(-1.8, -1.6, "ungrounded_support");
    } else if (indirectPattern.test(reason)) {
      apply(-1.05, -0.9, "indirect_support");
    }

    // Generalizability caveats stack on top of either outcome above.
    if (limitedPattern.test(reason)) {
      apply(-0.35, -0.3, "limited_generalizability");
    }
  }

  return {
    panel_adjustment: Number(totals.panel.toFixed(3)),
    candidate_adjustment: Number(totals.candidate.toFixed(3)),
    notes: dedupeStrings(flags)
  };
}
1955+
18241956function hypothesisImplementationBonus (
18251957 candidate : HypothesisCandidate ,
18261958 review : HypothesisReview | undefined
0 commit comments