|
| 1 | +import fs from "node:fs"; |
| 2 | +import path from "node:path"; |
| 3 | + |
| 4 | +import type { StoredCorpusRow } from "../collection/types.js"; |
| 5 | + |
/**
 * Result of a citation-consistency check over one run directory.
 */
export interface CitationReport {
  // \cite keys used in main.tex that have no matching entry in references.bib.
  orphan_citations: string[];
  // Cited corpus paper ids with no corpus row, or whose row has none of
  // doi/url/landing_url/pdf_url set.
  unchecked_sources: string[];
  // "fail" when any orphan citation exists; unchecked sources alone do not fail.
  status: "pass" | "fail";
}
| 11 | + |
/**
 * Loose shape of a claim inside evidence_links.json. The field is typed
 * `unknown` because the artifact is read from disk unvalidated; callers
 * narrow it with Array.isArray/typeof checks before use.
 */
interface EvidenceLinksClaimLike {
  // Expected to be an array of corpus paper-id strings; validated at runtime.
  citation_paper_ids?: unknown;
}
| 15 | + |
/**
 * Loose shape of the evidence_links.json artifact; `claims` is `unknown`
 * and narrowed at runtime since the JSON is parsed without validation.
 */
interface EvidenceLinksArtifactLike {
  // Expected to be an array of EvidenceLinksClaimLike; validated at runtime.
  claims?: unknown;
}
| 19 | + |
| 20 | +const CITATION_REGEX = new RegExp(String.raw`\\cite[a-zA-Z*]*(?:\[[^\]]*\]){0,2}\{([^}]+)\}`, "gu"); |
| 21 | +const BIB_ENTRY_REGEX = /@\w+\s*\{\s*([^,\s]+)\s*,/gu; |
| 22 | + |
| 23 | +export function checkCitationConsistency(runDir: string): CitationReport { |
| 24 | + const paperDir = path.join(runDir, "paper"); |
| 25 | + const mainTex = safeReadFile(path.join(paperDir, "main.tex")); |
| 26 | + const referencesBib = safeReadFile(path.join(paperDir, "references.bib")); |
| 27 | + const evidenceLinks = safeReadJson<EvidenceLinksArtifactLike>(path.join(paperDir, "evidence_links.json")); |
| 28 | + const corpusRows = parseCorpusRows(path.join(runDir, "corpus.jsonl")); |
| 29 | + |
| 30 | + const citedKeys = extractCitationKeys(mainTex); |
| 31 | + const bibKeys = extractBibKeys(referencesBib); |
| 32 | + const orphanCitations = uniqueStrings( |
| 33 | + citedKeys.filter((key) => !bibKeys.has(key)) |
| 34 | + ); |
| 35 | + const uncheckedSources = resolveUncheckedSources(evidenceLinks, corpusRows); |
| 36 | + |
| 37 | + return { |
| 38 | + orphan_citations: orphanCitations, |
| 39 | + unchecked_sources: uncheckedSources, |
| 40 | + status: orphanCitations.length > 0 ? "fail" : "pass" |
| 41 | + }; |
| 42 | +} |
| 43 | + |
| 44 | +function safeReadFile(filePath: string): string | null { |
| 45 | + try { |
| 46 | + return fs.readFileSync(filePath, "utf8"); |
| 47 | + } catch { |
| 48 | + return null; |
| 49 | + } |
| 50 | +} |
| 51 | + |
| 52 | +function safeReadJson<T>(filePath: string): T | null { |
| 53 | + try { |
| 54 | + return JSON.parse(fs.readFileSync(filePath, "utf8")) as T; |
| 55 | + } catch { |
| 56 | + return null; |
| 57 | + } |
| 58 | +} |
| 59 | + |
| 60 | +function extractCitationKeys(mainTex: string | null): string[] { |
| 61 | + if (!mainTex) { |
| 62 | + return []; |
| 63 | + } |
| 64 | + const keys: string[] = []; |
| 65 | + for (const match of mainTex.matchAll(CITATION_REGEX)) { |
| 66 | + const rawKeys = match[1]?.split(",") ?? []; |
| 67 | + for (const key of rawKeys) { |
| 68 | + const trimmed = key.trim(); |
| 69 | + if (trimmed) { |
| 70 | + keys.push(trimmed); |
| 71 | + } |
| 72 | + } |
| 73 | + } |
| 74 | + return uniqueStrings(keys); |
| 75 | +} |
| 76 | + |
| 77 | +function extractBibKeys(referencesBib: string | null): Set<string> { |
| 78 | + const keys = new Set<string>(); |
| 79 | + if (!referencesBib) { |
| 80 | + return keys; |
| 81 | + } |
| 82 | + for (const match of referencesBib.matchAll(BIB_ENTRY_REGEX)) { |
| 83 | + const key = match[1]?.trim(); |
| 84 | + if (key) { |
| 85 | + keys.add(key); |
| 86 | + } |
| 87 | + } |
| 88 | + return keys; |
| 89 | +} |
| 90 | + |
| 91 | +function parseCorpusRows(corpusPath: string): Map<string, StoredCorpusRow> { |
| 92 | + const rows = new Map<string, StoredCorpusRow>(); |
| 93 | + const raw = safeReadFile(corpusPath); |
| 94 | + if (!raw) { |
| 95 | + return rows; |
| 96 | + } |
| 97 | + for (const line of raw.split(/\r?\n/u)) { |
| 98 | + const trimmed = line.trim(); |
| 99 | + if (!trimmed) { |
| 100 | + continue; |
| 101 | + } |
| 102 | + try { |
| 103 | + const parsed = JSON.parse(trimmed) as StoredCorpusRow; |
| 104 | + if (parsed.paper_id) { |
| 105 | + rows.set(parsed.paper_id, parsed); |
| 106 | + } |
| 107 | + } catch { |
| 108 | + // Ignore malformed corpus rows and rely on the remaining parseable records. |
| 109 | + } |
| 110 | + } |
| 111 | + return rows; |
| 112 | +} |
| 113 | + |
| 114 | +function resolveUncheckedSources( |
| 115 | + evidenceLinks: EvidenceLinksArtifactLike | null, |
| 116 | + corpusRows: Map<string, StoredCorpusRow> |
| 117 | +): string[] { |
| 118 | + const claims = Array.isArray(evidenceLinks?.claims) |
| 119 | + ? evidenceLinks.claims as EvidenceLinksClaimLike[] |
| 120 | + : []; |
| 121 | + const citationPaperIds = uniqueStrings( |
| 122 | + claims.flatMap((claim) => |
| 123 | + Array.isArray(claim.citation_paper_ids) |
| 124 | + ? claim.citation_paper_ids.filter((value): value is string => typeof value === "string" && value.trim().length > 0) |
| 125 | + : [] |
| 126 | + ) |
| 127 | + ); |
| 128 | + |
| 129 | + return citationPaperIds.filter((paperId) => { |
| 130 | + const row = corpusRows.get(paperId); |
| 131 | + if (!row) { |
| 132 | + return true; |
| 133 | + } |
| 134 | + return !Boolean(row.doi || row.url || row.landing_url || row.pdf_url); |
| 135 | + }); |
| 136 | +} |
| 137 | + |
| 138 | +function uniqueStrings(values: string[]): string[] { |
| 139 | + return [...new Set(values)]; |
| 140 | +} |
0 commit comments