Skip to content

Commit caace43

Browse files
committed
Fix build-and-test failure
1 parent 61b7a9f commit caace43

15 files changed

Lines changed: 1102 additions & 176 deletions

README.ko.md

Lines changed: 126 additions & 61 deletions
Large diffs are not rendered by default.

README.md

Lines changed: 126 additions & 61 deletions
Large diffs are not rendered by default.

src/core/analysis/researchPlanning.ts

Lines changed: 169 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ export interface HypothesisEvidenceSeed {
1111
dataset_slot?: string;
1212
metric_slot?: string;
1313
confidence?: number;
14+
source_type?: "full_text" | "abstract";
15+
confidence_reason?: string;
1416
}
1517

1618
export interface HypothesisCandidate {
@@ -73,10 +75,12 @@ export interface HypothesisSelectionScore {
7375
candidate_id: string;
7476
raw_base_score: number;
7577
base_score: number;
78+
evidence_quality_adjustment: number;
7679
implementation_bonus: number;
7780
bundling_penalty: number;
7881
scope_penalty: number;
7982
diversity_penalty: number;
83+
evidence_quality_notes: string[];
8084
final_score: number;
8185
}
8286

@@ -179,6 +183,7 @@ const HYPOTHESIS_REVIEW_SYSTEM_PROMPT = [
179183
"Critique hypothesis drafts for groundedness, causal clarity, falsifiability, experimentability, and objective-metric alignment.",
180184
"Apply hard gates: hypotheses with too few evidence links, ignored limitations/counterexamples, or no operational measurement plan should not survive review.",
181185
"When the objective is reproducibility, penalize performance-only hypotheses that do not specify a repeated-run or stability-based outcome.",
186+
"Penalize hypotheses that rely mostly on abstract-only or heavily caveated evidence when stronger full-text evidence is available.",
182187
"Revise weak wording instead of praising it.",
183188
"Return one JSON object only.",
184189
"No markdown, no prose outside JSON."
@@ -331,7 +336,13 @@ export async function generateHypothesesFromEvidence(args: {
331336
`Hard-gated ${gatedCandidates.rejected.length} single-pass hypothesis candidate(s) for weak grounding or missing measurement detail.`
332337
);
333338
}
334-
const selected = selectHypothesesWithDiversity(gatedCandidates.kept, [], topK, args.objectiveMetric);
339+
const selected = selectHypothesesWithDiversity(
340+
gatedCandidates.kept,
341+
[],
342+
topK,
343+
args.objectiveMetric,
344+
args.evidenceSeeds
345+
);
335346
if (gatedCandidates.kept.length === 0 || selected.selected.length === 0) {
336347
throw new Error("No valid hypothesis candidates were returned.");
337348
}
@@ -368,11 +379,19 @@ export async function generateHypothesesFromEvidence(args: {
368379
const legacyReason = legacyError instanceof Error ? legacyError.message : String(legacyError);
369380
args.onProgress?.(`Hypothesis generation fallback: ${legacyReason}`);
370381
const fallback = buildFallbackHypotheses(args.evidenceSeeds, branchCount, topK);
382+
const fallbackSelection = selectHypothesesWithDiversity(
383+
fallback.candidates,
384+
[],
385+
topK,
386+
args.objectiveMetric,
387+
args.evidenceSeeds
388+
);
389+
const fallbackSelected = fallbackSelection.selected.length > 0 ? fallbackSelection.selected : fallback.selected;
371390
return {
372391
source: "fallback",
373392
summary: `Fallback generated ${fallback.candidates.length} hypothesis candidate(s).`,
374393
candidates: fallback.candidates,
375-
selected: fallback.selected,
394+
selected: fallbackSelected,
376395
fallbackReason: `${stagedReason}; single_pass=${legacyReason}`,
377396
toolCallsUsed: 0,
378397
artifacts: {
@@ -381,18 +400,9 @@ export async function generateHypothesesFromEvidence(args: {
381400
drafts: fallback.candidates,
382401
reviews: [],
383402
selection: {
384-
selected_ids: fallback.selected.map((candidate) => candidate.id),
385-
ranked_ids: fallback.candidates.map((candidate) => candidate.id),
386-
scores: fallback.candidates.map((candidate) => ({
387-
candidate_id: candidate.id,
388-
raw_base_score: scoreHypothesis(candidate),
389-
base_score: scoreHypothesis(candidate),
390-
implementation_bonus: 0,
391-
bundling_penalty: 0,
392-
scope_penalty: 0,
393-
diversity_penalty: 0,
394-
final_score: scoreHypothesis(candidate)
395-
}))
403+
selected_ids: fallbackSelected.map((candidate) => candidate.id),
404+
ranked_ids: fallbackSelection.ranked.map((candidate) => candidate.id),
405+
scores: fallbackSelection.scores
396406
},
397407
llm_trace: {
398408
drafts: []
@@ -530,7 +540,13 @@ async function runStagedHypothesisPipeline(args: {
530540
`Hard-gated ${gatedCandidates.rejected.length} staged hypothesis candidate(s) for weak grounding or missing measurement detail.`
531541
);
532542
}
533-
const selection = selectHypothesesWithDiversity(gatedCandidates.kept, reviews, args.topK, args.objectiveMetric);
543+
const selection = selectHypothesesWithDiversity(
544+
gatedCandidates.kept,
545+
reviews,
546+
args.topK,
547+
args.objectiveMetric,
548+
evidencePanel
549+
);
534550

535551
if (selection.selected.length === 0) {
536552
throw new Error("no_selected_hypotheses");
@@ -666,19 +682,7 @@ function buildHypothesisPrompt(
666682
];
667683

668684
evidenceSeeds.slice(0, 16).forEach((seed, index) => {
669-
lines.push(
670-
[
671-
`${index + 1}. evidence_id=${seed.evidence_id ?? `ev_${index + 1}`}`,
672-
`paper_id=${seed.paper_id ?? "unknown"}`,
673-
`claim=${seed.claim ?? "unknown"}`,
674-
seed.limitation_slot ? `limitation=${seed.limitation_slot}` : undefined,
675-
seed.dataset_slot ? `dataset=${seed.dataset_slot}` : undefined,
676-
seed.metric_slot ? `metric=${seed.metric_slot}` : undefined,
677-
typeof seed.confidence === "number" ? `confidence=${seed.confidence}` : undefined
678-
]
679-
.filter(Boolean)
680-
.join(" | ")
681-
);
685+
lines.push(renderEvidenceSeed(seed, index));
682686
});
683687

684688
return lines.join("\n");
@@ -1271,14 +1275,18 @@ function selectHypothesesWithDiversity(
12711275
candidates: HypothesisCandidate[],
12721276
reviews: HypothesisReview[],
12731277
topK: number,
1274-
objectiveMetric?: string
1278+
objectiveMetric?: string,
1279+
evidenceSeeds: HypothesisEvidenceSeed[] = []
12751280
): { selected: HypothesisCandidate[]; ranked: HypothesisCandidate[]; scores: HypothesisSelectionScore[] } {
12761281
const reviewMap = new Map(reviews.map((review) => [review.candidate_id, review] as const));
1282+
const evidenceById = new Map(
1283+
evidenceSeeds.map((seed, index) => [seed.evidence_id || `ev_${index + 1}`, seed] as const)
1284+
);
12771285
const pool = reviews.length > 0 ? candidates.filter((candidate) => reviewMap.get(candidate.id)?.keep === true) : candidates;
12781286
const adjustedBaseById = new Map(
12791287
pool.map((candidate) => [
12801288
candidate.id,
1281-
buildHypothesisSelectionBase(candidate, reviewMap.get(candidate.id), objectiveMetric)
1289+
buildHypothesisSelectionBase(candidate, reviewMap.get(candidate.id), objectiveMetric, evidenceById)
12821290
] as const)
12831291
);
12841292
const ranked = [...pool].sort(
@@ -1310,7 +1318,8 @@ function selectHypothesesWithDiversity(
13101318
const score = buildHypothesisSelectionScore(
13111319
candidate,
13121320
selected,
1313-
adjustedBaseById.get(candidate.id) ?? buildHypothesisSelectionBase(candidate, reviewMap.get(candidate.id), objectiveMetric)
1321+
adjustedBaseById.get(candidate.id) ??
1322+
buildHypothesisSelectionBase(candidate, reviewMap.get(candidate.id), objectiveMetric, evidenceById)
13141323
);
13151324
if (score.final_score > bestScore) {
13161325
bestIndex = index;
@@ -1341,7 +1350,8 @@ function selectHypothesesWithDiversity(
13411350
buildHypothesisSelectionScore(
13421351
candidate,
13431352
selected.filter((item) => item.id !== candidate.id),
1344-
adjustedBaseById.get(candidate.id) ?? buildHypothesisSelectionBase(candidate, reviewMap.get(candidate.id), objectiveMetric)
1353+
adjustedBaseById.get(candidate.id) ??
1354+
buildHypothesisSelectionBase(candidate, reviewMap.get(candidate.id), objectiveMetric, evidenceById)
13451355
)
13461356
);
13471357
}
@@ -1398,7 +1408,8 @@ function selectHypothesisEvidencePanel(
13981408
(seed.limitation_slot ? 3 : 0) +
13991409
(seed.dataset_slot ? 2 : 0) +
14001410
(seed.metric_slot ? 2 : 0) +
1401-
(seed.claim ? Math.min(2, seed.claim.length / 80) : 0)
1411+
(seed.claim ? Math.min(2, seed.claim.length / 80) : 0) +
1412+
assessEvidenceSeedQuality(seed).panel_adjustment
14021413
}))
14031414
.sort((a, b) => b.score - a.score || a.index - b.index);
14041415

@@ -1440,12 +1451,19 @@ function renderEvidenceSeed(seed: HypothesisEvidenceSeed, index: number): string
14401451
seed.limitation_slot ? `limitation=${seed.limitation_slot}` : undefined,
14411452
seed.dataset_slot ? `dataset=${seed.dataset_slot}` : undefined,
14421453
seed.metric_slot ? `metric=${seed.metric_slot}` : undefined,
1443-
typeof seed.confidence === "number" ? `confidence=${seed.confidence}` : undefined
1454+
typeof seed.confidence === "number" ? `confidence=${seed.confidence}` : undefined,
1455+
seed.source_type ? `source_type=${seed.source_type}` : undefined,
1456+
seed.confidence_reason ? `confidence_reason=${truncateEvidenceReason(seed.confidence_reason)}` : undefined
14441457
]
14451458
.filter(Boolean)
14461459
.join(" | ");
14471460
}
14481461

1462+
/**
 * Shortens a free-text confidence reason so it stays compact when rendered
 * into a single evidence line.
 *
 * The input is trimmed first. Text of at most 120 UTF-16 code units is
 * returned as-is; longer text is cut to at most 117 code units and suffixed
 * with "...", so the result never exceeds 120 code units.
 *
 * @param value - Raw reason text.
 * @returns The trimmed text, truncated with a "..." suffix when too long.
 */
function truncateEvidenceReason(value: string): string {
  const maxLength = 120;
  const ellipsis = "...";
  const trimmed = value.trim();
  if (trimmed.length <= maxLength) {
    return trimmed;
  }

  let cut = maxLength - ellipsis.length;
  // Do not cut between the halves of a surrogate pair: a dangling high
  // surrogate would put an unpaired code unit into the rendered prompt.
  const lastKept = trimmed.charCodeAt(cut - 1);
  if (lastKept >= 0xd800 && lastKept <= 0xdbff) {
    cut -= 1;
  }

  // Drop whitespace that ends up right before the ellipsis ("word ..." -> "word...").
  return `${trimmed.slice(0, cut).trimEnd()}${ellipsis}`;
}
1466+
14491467
function roleLabel(kind: HypothesisGeneratorKind): string {
14501468
switch (kind) {
14511469
case "mechanism":
@@ -1776,24 +1794,35 @@ function hypothesisBaseScore(candidate: HypothesisCandidate, objectiveMetric?: s
17761794
function buildHypothesisSelectionBase(
17771795
candidate: HypothesisCandidate,
17781796
review: HypothesisReview | undefined,
1779-
objectiveMetric?: string
1797+
objectiveMetric?: string,
1798+
evidenceById: Map<string, HypothesisEvidenceSeed> = new Map()
17801799
): {
17811800
raw_base_score: number;
17821801
base_score: number;
1802+
evidence_quality_adjustment: number;
17831803
implementation_bonus: number;
17841804
bundling_penalty: number;
17851805
scope_penalty: number;
1806+
evidence_quality_notes: string[];
17861807
} {
17871808
const rawBaseScore = hypothesisBaseScore(candidate, objectiveMetric);
1809+
const evidenceSupport = assessCandidateEvidenceSupport(candidate, evidenceById);
17881810
const implementationBonus = hypothesisImplementationBonus(candidate, review);
17891811
const bundlingPenalty = hypothesisBundlingPenalty(candidate, review);
17901812
const scopePenalty = hypothesisScopePenalty(candidate, review);
17911813
return {
17921814
raw_base_score: rawBaseScore,
1793-
base_score: rawBaseScore + implementationBonus - bundlingPenalty - scopePenalty,
1815+
base_score:
1816+
rawBaseScore +
1817+
evidenceSupport.adjustment +
1818+
implementationBonus -
1819+
bundlingPenalty -
1820+
scopePenalty,
1821+
evidence_quality_adjustment: evidenceSupport.adjustment,
17941822
implementation_bonus: implementationBonus,
17951823
bundling_penalty: bundlingPenalty,
1796-
scope_penalty: scopePenalty
1824+
scope_penalty: scopePenalty,
1825+
evidence_quality_notes: evidenceSupport.notes
17971826
};
17981827
}
17991828

@@ -1803,24 +1832,127 @@ function buildHypothesisSelectionScore(
18031832
adjustedBase: {
18041833
raw_base_score: number;
18051834
base_score: number;
1835+
evidence_quality_adjustment: number;
18061836
implementation_bonus: number;
18071837
bundling_penalty: number;
18081838
scope_penalty: number;
1839+
evidence_quality_notes: string[];
18091840
}
18101841
): HypothesisSelectionScore {
18111842
const diversityPenalty = calculateDiversityPenalty(candidate, selected);
18121843
return {
18131844
candidate_id: candidate.id,
18141845
raw_base_score: adjustedBase.raw_base_score,
18151846
base_score: adjustedBase.base_score,
1847+
evidence_quality_adjustment: adjustedBase.evidence_quality_adjustment,
18161848
implementation_bonus: adjustedBase.implementation_bonus,
18171849
bundling_penalty: adjustedBase.bundling_penalty,
18181850
scope_penalty: adjustedBase.scope_penalty,
18191851
diversity_penalty: diversityPenalty,
1852+
evidence_quality_notes: adjustedBase.evidence_quality_notes,
18201853
final_score: adjustedBase.base_score - diversityPenalty
18211854
};
18221855
}
18231856

1857+
/**
 * Derives a score adjustment and explanatory notes from the evidence seeds a
 * candidate links to.
 *
 * Linked ids are de-duplicated and resolved through `evidenceById`; ids that
 * do not resolve are ignored. When nothing resolves, a fixed -0.75 adjustment
 * with the note "missing_linked_evidence" is returned. Otherwise the result
 * starts from the mean per-seed `candidate_adjustment` and is then shifted by
 * three aggregate rules (abstract-only support, multiple strong sources,
 * uniformly caveated support).
 *
 * @param candidate - Hypothesis whose `evidence_links` are evaluated.
 * @param evidenceById - Lookup from evidence id to its seed.
 * @returns The adjustment rounded to three decimals and the de-duplicated notes.
 */
function assessCandidateEvidenceSupport(
  candidate: HypothesisCandidate,
  evidenceById: Map<string, HypothesisEvidenceSeed>
): { adjustment: number; notes: string[] } {
  const resolvedSeeds: HypothesisEvidenceSeed[] = [];
  for (const evidenceId of dedupeStrings(candidate.evidence_links)) {
    const seed = evidenceById.get(evidenceId);
    if (seed) {
      resolvedSeeds.push(seed);
    }
  }
  if (resolvedSeeds.length === 0) {
    return { adjustment: -0.75, notes: ["missing_linked_evidence"] };
  }

  const perSeed = resolvedSeeds.map((seed) => assessEvidenceSeedQuality(seed));

  // Mean of the per-seed adjustments, accumulated left to right.
  let runningTotal = 0;
  for (const entry of perSeed) {
    runningTotal += entry.candidate_adjustment;
  }
  let adjustment = runningTotal / perSeed.length;
  const notes = dedupeStrings(perSeed.flatMap((entry) => entry.notes));

  const abstractOnly = resolvedSeeds.every((seed) => seed.source_type === "abstract");
  if (abstractOnly) {
    adjustment -= 0.5;
    notes.push("abstract_only_support");
  }

  // Two or more strong assessments imply two or more linked seeds, so a
  // separate length check is unnecessary.
  const strongCount = perSeed.filter((entry) => entry.candidate_adjustment >= 0.2).length;
  if (strongCount >= 2) {
    adjustment += 0.35;
    notes.push("multi_source_support");
  }

  // The list is non-empty here, so `every` means "all linked seeds are risky".
  const allRisky = perSeed.every((entry) => entry.candidate_adjustment <= -0.75);
  if (allRisky) {
    adjustment -= 0.5;
    notes.push("all_support_caveated");
  }

  return {
    adjustment: Number(adjustment.toFixed(3)),
    notes: dedupeStrings(notes)
  };
}
1898+
1899+
/** Reason phrases marking a claim that was never tied to concrete source evidence. */
const UNGROUNDED_REASON_PATTERN = /(not be grounded|fallback evidence|no structured evidence|synthesi[sz]ed)/;

/** Reason phrases marking abstract-level or otherwise indirect support. */
const INDIRECT_REASON_PATTERN = /(only the abstract|abstract-level|abstract only|indirect|supplemental)/;

/**
 * Reason phrases marking caveated or narrowly scoped support.
 * The leading `\b` keeps keywords from matching inside unrelated words
 * ("unlimited", "delimited", "tweak") while still matching suffixed forms
 * ("weakly", "caveats", "tentatively").
 */
const CAVEATED_REASON_PATTERN = /\b(single benchmark|external validity|limited|tentative|weak|caveat|partial support)/;

/**
 * Rates one evidence seed from its source type, confidence, and confidence
 * reason text.
 *
 * Two adjustments are accumulated in parallel from the same signals:
 * `panel_adjustment` and the smaller-magnitude `candidate_adjustment`.
 *
 * Signals, in order:
 * 1. `source_type`: "full_text" is rewarded, "abstract" is penalized, and an
 *    absent value is neutral.
 * 2. `confidence`: >= 0.9 is rewarded; < 0.55 and < 0.7 are penalized in two
 *    tiers. A missing or non-finite confidence is treated as 0.5, so it falls
 *    into the low-confidence tier and is tagged "low_confidence".
 *    NOTE(review): confirm that penalizing seeds with no confidence is intended.
 * 3. `confidence_reason` (lower-cased): an ungrounded phrase takes precedence
 *    over an indirect-support phrase; a caveat phrase is checked independently
 *    and can stack with either.
 *
 * @param seed - Evidence seed to rate.
 * @returns Both adjustments rounded to three decimals, plus the notes naming
 *   the penalties and source-type tags that were applied.
 */
function assessEvidenceSeedQuality(
  seed: HypothesisEvidenceSeed
): { panel_adjustment: number; candidate_adjustment: number; notes: string[] } {
  let panelAdjustment = 0;
  let candidateAdjustment = 0;
  const notes: string[] = [];

  if (seed.source_type === "full_text") {
    panelAdjustment += 0.75;
    candidateAdjustment += 0.4;
    notes.push("full_text_support");
  } else if (seed.source_type === "abstract") {
    panelAdjustment -= 1.1;
    candidateAdjustment -= 0.85;
    notes.push("abstract_support");
  }

  const confidence = typeof seed.confidence === "number" && Number.isFinite(seed.confidence) ? seed.confidence : 0.5;
  if (confidence >= 0.9) {
    panelAdjustment += 0.3;
    candidateAdjustment += 0.2;
  } else if (confidence < 0.55) {
    panelAdjustment -= 1.2;
    candidateAdjustment -= 1.1;
    notes.push("low_confidence");
  } else if (confidence < 0.7) {
    panelAdjustment -= 0.55;
    candidateAdjustment -= 0.45;
    notes.push("mid_confidence");
  }

  const reason = (seed.confidence_reason || "").toLowerCase();
  if (reason) {
    if (UNGROUNDED_REASON_PATTERN.test(reason)) {
      panelAdjustment -= 1.8;
      candidateAdjustment -= 1.6;
      notes.push("ungrounded_support");
    } else if (INDIRECT_REASON_PATTERN.test(reason)) {
      panelAdjustment -= 1.05;
      candidateAdjustment -= 0.9;
      notes.push("indirect_support");
    }

    if (CAVEATED_REASON_PATTERN.test(reason)) {
      panelAdjustment -= 0.35;
      candidateAdjustment -= 0.3;
      notes.push("limited_generalizability");
    }
  }

  return {
    panel_adjustment: Number(panelAdjustment.toFixed(3)),
    candidate_adjustment: Number(candidateAdjustment.toFixed(3)),
    notes: dedupeStrings(notes)
  };
}
1955+
18241956
function hypothesisImplementationBonus(
18251957
candidate: HypothesisCandidate,
18261958
review: HypothesisReview | undefined

0 commit comments

Comments
 (0)