From 5833bc4fe33e6bd626f27de2474c950726b0007b Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 16 Apr 2026 11:51:34 -0400 Subject: [PATCH 01/13] Method to find overlapping excerpts; reusable logic for merging groups --- src/corppa/poetry_detection/merge_excerpts.py | 220 +++++++++++++----- 1 file changed, 168 insertions(+), 52 deletions(-) diff --git a/src/corppa/poetry_detection/merge_excerpts.py b/src/corppa/poetry_detection/merge_excerpts.py index e034f184..11c33531 100755 --- a/src/corppa/poetry_detection/merge_excerpts.py +++ b/src/corppa/poetry_detection/merge_excerpts.py @@ -54,6 +54,77 @@ from corppa.poetry_detection.polars_utils import load_excerpts_df, standardize_dataframe +def merge_excerpt_groups( + grouped_df: pl.dataframe.group_by.GroupBy, merge_reason: str = "ppa exact span" +) -> pl.DataFrame: + """Takes a GroupBy dataframe of excerpts (created by calling `group_by`), and combines + groups of excerpts into merged excerpts. Merges as follows: + - first ppa_span_text + - combined unique set of detection methods + - combined unique set of notes + - updated excerpt id + - first poem id (dataframe should be sorted so preferred id is first) + - any other poem ids are listed in alt_poem_ids + - first reference corpus id + - first reference span and text + - combined unique list of identification methods + After merging, it adds a note documenting the group, with the specified reason, + and the number of raw excerpts in the merged set. 
+ """ + return ( + grouped_df.agg( + # TODO: how to handle for overlapping spans + pl.first("ppa_span_text"), # should match exactly + pl.col("detection_methods") + .explode() + .unique(), # combine in a single list, no repeats + # combine notes but don't repeat duplicate info (like passim char match count) + pl.col("notes").explode().unique().sort().str.join("; "), + # construct merged excerpt id manually; c= prefix for combined + # (although strictly speaking should only be if > 1 detection method) + pl.concat_str( + pl.lit("c@"), + pl.col("ppa_span_start").first(), + pl.lit(":"), + pl.col("ppa_span_end").first(), + ).alias("excerpt_id"), + # pick the first poem id (relies on previous sorting) + pl.col("poem_id").explode().unique().first(), + # and store all others in alt poem ids field + pl.col("poem_id") + .explode() + .unique() + .drop_nulls() + .slice(1) + .alias("alt_poem_ids"), + pl.col("ref_corpus").explode().first(), + # use first reference span and text so numbers are useful; ignore nulls + pl.col("ref_span_start").first(), + pl.col("ref_span_end").first(), + pl.col("ref_span_text").first(), + # combine unique list of id methods + pl.col("identification_methods") + .explode() + .unique() + .drop_nulls(), # combine in a single list, no repeats, ignore nulls (not identified before merging) + pl.len().alias("group_size"), # count number in the group + ) + .with_columns( + notes=pl.concat_str( + pl.col("notes"), + pl.lit(f"; merge: {merge_reason}, "), + pl.col("group_size"), + pl.lit(" excerpts"), + ), + # if alt poem ids is empty, replace with None + alt_poem_ids=pl.when(pl.col("alt_poem_ids").list.len() > 0) + .then(pl.col("alt_poem_ids")) + .otherwise(pl.lit(None)), + ) + .drop("group_size") + ) # drop group size column + + def merge_excerpts( df: pl.DataFrame, disable_progress=True, verbose=False ) -> pl.DataFrame: @@ -95,6 +166,7 @@ def merge_excerpts( .drop("group_size") ) if output_df.is_empty(): + # if none were found, create an empty output_df = 
df.clear() # any excerpts with group size > 1 are candidates for merging @@ -128,57 +200,7 @@ def merge_excerpts( f"Identified {merge_candidates.height:,} merge candidates in {num_merge_groups:,} groups." ) - merged_output_df = ( - merge_groups.agg( - pl.first("ppa_span_text"), # should match exactly - pl.col("detection_methods") - .explode() - .unique(), # combine in a single list, no repeats - # combine notes but don't repeat duplicate info (like passim char match count) - pl.col("notes").explode().unique().sort().str.join("; "), - # construct merged excerpt id manually; c= prefix for combined - # (although strictly speaking should only be if > 1 detection method) - pl.concat_str( - pl.lit("c@"), - pl.col("ppa_span_start").first(), - pl.lit(":"), - pl.col("ppa_span_end").first(), - ).alias("excerpt_id"), - # pick the first poem id (relies on previous sorting) - pl.col("poem_id").explode().unique().first(), - # and store all others in alt poem ids field - pl.col("poem_id") - .explode() - .unique() - .drop_nulls() - .slice(1) - .alias("alt_poem_ids"), - pl.col("ref_corpus").explode().first(), - # use first reference span and text so numbers are useful; ignore nulls - pl.col("ref_span_start").first(), - pl.col("ref_span_end").first(), - pl.col("ref_span_text").first(), - # combine unique list of id methods - pl.col("identification_methods") - .explode() - .unique() - .drop_nulls(), # combine in a single list, no repeats, ignore nulls (not identified before merging) - pl.len().alias("group_size"), # count number in the group - ) - .with_columns( - notes=pl.concat_str( - pl.col("notes"), - pl.lit("; merge: ppa exact span, "), - pl.col("group_size"), - pl.lit(" excerpts"), - ), - # if alt poem ids is empty, replace with None - alt_poem_ids=pl.when(pl.col("alt_poem_ids").list.len() > 0) - .then(pl.col("alt_poem_ids")) - .otherwise(pl.lit(None)), - ) - .drop("group_size") # drop group size column - ) + merged_output_df = merge_excerpt_groups(merge_groups) if verbose: 
multi_id = merged_output_df.filter( @@ -191,7 +213,101 @@ def merge_excerpts( # combined merged records with the output # use a diagonal concat instead of vstack/extend # to avoid having to reconcile columns first - return pl.concat([output_df, merged_output_df], how="diagonal") + output_df = pl.concat([output_df, merged_output_df], how="diagonal") + + return output_df + + +def identify_overlapping_excerpts( + excerpts_df: pl.DataFrame, + min_overlap_factor: float = 0.98, + min_overlap_chars: int = 10, +) -> pl.DataFrame: + """ + Takes a DataFrame of excerpts and identifies pairs of overlapping excerpts. + Overlapping excerpts are on the same page, with some shared span of text. + We exclude short overlaps based on the minimum character parameter, + and an overlap factor, which is calculated by the length of the shared + span divided by the length of the longer of the two spans. + + Returns a DataFrame of excerpt pairs, which includes the page id, + two excerpt ids, overlap length, and overlap factor. + """ + + # TODO: what about small spans completely inside another? + # overlap factor would be small + + # identify excerpts with partial overlap + overlaps_df = ( + excerpts_df + # Filter to excerpts on pages with multiple excerpts + .filter(pl.col("page_id").is_duplicated()) + .join_where( + excerpts_df, + # 1. Excerpts are on the same page + pl.col("page_id") == pl.col("page_id_right"), + # 2. Excerpts overlap: + # left span starts before right span ends + pl.col("ppa_span_start") < pl.col("ppa_span_end_right"), + # and right span starts before left span ends + pl.col("ppa_span_start_right") < pl.col("ppa_span_end"), + # 3. 
Exclude self-matches + pl.col("excerpt_id") != pl.col("excerpt_id_right"), + ) + .with_columns( + # make a sorted combined id so we can drop duplicate copies of the same pair + group_ids=pl.concat_list( + [pl.col("excerpt_id"), pl.col("excerpt_id_right")] + ).list.sort() + ) + # excerpt ids are ONLY unique within a page + # drop duplicate copies of the same overlapping pair on the same page + .unique(["group_ids", "page_id"]) + .with_columns( + # calculate length of the overlap: smaller end minus larger start + overlap_len=pl.min_horizontal( + pl.col("ppa_span_end"), pl.col("ppa_span_end_right") + ).sub( + pl.max_horizontal( + pl.col("ppa_span_start"), pl.col("ppa_span_start_right") + ) + ), + ) + .with_columns( + overlap_factor=pl.col("overlap_len").truediv( + pl.max_horizontal( + pl.col("ppa_span_text").str.len_chars(), + pl.col("ppa_span_text_right").str.len_chars(), + ) + ) + ) + # filter to requested overlap / length to limit to high confidence overlaps + .filter( + pl.col("overlap_factor").gt(min_overlap_factor), + pl.col("overlap_len").gt(min_overlap_chars), + ) + ) + + # what fields are needed here? + return overlaps_df.select( + "page_id", + "excerpt_id", + "excerpt_id_right", + "group_ids", # drop? 
+ "overlap_len", + "overlap_factor", + # these are not strictly needed but may be helpful for investigating + "notes", + "notes_right", + "ppa_span_text", + "ppa_span_start", + "ppa_span_end", + "ppa_span_text_right", + "ppa_span_start_right", + "ppa_span_end_right", + "ref_span_text", + "ref_span_text_right", + ) def merge_excerpt_files( From 317ca925b1e4cf9da01f8fc0e1fca2533c9dadba Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 16 Apr 2026 16:49:38 -0400 Subject: [PATCH 02/13] Add unit tests for method for identify overlapping excerpts --- src/corppa/poetry_detection/merge_excerpts.py | 25 ++--- .../test_merge_excerpts.py | 101 +++++++++++++++++- 2 files changed, 113 insertions(+), 13 deletions(-) diff --git a/src/corppa/poetry_detection/merge_excerpts.py b/src/corppa/poetry_detection/merge_excerpts.py index 11c33531..ed177dc7 100755 --- a/src/corppa/poetry_detection/merge_excerpts.py +++ b/src/corppa/poetry_detection/merge_excerpts.py @@ -228,15 +228,14 @@ def identify_overlapping_excerpts( Overlapping excerpts are on the same page, with some shared span of text. We exclude short overlaps based on the minimum character parameter, and an overlap factor, which is calculated by the length of the shared - span divided by the length of the longer of the two spans. + span divided by the length of the longer of the two spans. Note that + this will typically not return small excerpts completely inside another + larger span. - Returns a DataFrame of excerpt pairs, which includes the page id, - two excerpt ids, overlap length, and overlap factor. + Returns a DataFrame of excerpt pairs with columns for page id, + pairs of excerpt ids, overlap length, and overlap factor. """ - # TODO: what about small spans completely inside another? - # overlap factor would be small - # identify excerpts with partial overlap overlaps_df = ( excerpts_df @@ -244,12 +243,12 @@ def identify_overlapping_excerpts( .filter(pl.col("page_id").is_duplicated()) .join_where( excerpts_df, - # 1. 
Excerpts are on the same page + # 1. Limit to excerpts are on the same page pl.col("page_id") == pl.col("page_id_right"), # 2. Excerpts overlap: # left span starts before right span ends pl.col("ppa_span_start") < pl.col("ppa_span_end_right"), - # and right span starts before left span ends + # and right span starts before left span ends pl.col("ppa_span_start_right") < pl.col("ppa_span_end"), # 3. Exclude self-matches pl.col("excerpt_id") != pl.col("excerpt_id_right"), @@ -276,15 +275,17 @@ def identify_overlapping_excerpts( .with_columns( overlap_factor=pl.col("overlap_len").truediv( pl.max_horizontal( - pl.col("ppa_span_text").str.len_chars(), - pl.col("ppa_span_text_right").str.len_chars(), + pl.col("ppa_span_end").sub(pl.col("ppa_span_start")), + pl.col("ppa_span_end_right").sub(pl.col("ppa_span_start_right")), + # pl.col("ppa_span_text").str.len_chars(), + # pl.col("ppa_span_text_right").str.len_chars(), ) ) ) # filter to requested overlap / length to limit to high confidence overlaps .filter( - pl.col("overlap_factor").gt(min_overlap_factor), - pl.col("overlap_len").gt(min_overlap_chars), + pl.col("overlap_factor").ge(min_overlap_factor), + pl.col("overlap_len").ge(min_overlap_chars), ) ) diff --git a/tests/test_poetry_detection/test_merge_excerpts.py b/tests/test_poetry_detection/test_merge_excerpts.py index cfdaa2ba..b3368e79 100644 --- a/tests/test_poetry_detection/test_merge_excerpts.py +++ b/tests/test_poetry_detection/test_merge_excerpts.py @@ -9,11 +9,13 @@ import pytest from test_polars_utils import _excerpts_to_csv -from corppa.poetry_detection.core import Excerpt, LabeledExcerpt +from corppa.poetry_detection.core import Excerpt, LabeledExcerpt, Span from corppa.poetry_detection.merge_excerpts import ( + identify_overlapping_excerpts, main, merge_excerpts, ) +from corppa.poetry_detection.polars_utils import standardize_dataframe excerpt1 = Excerpt( page_id="p.1", @@ -507,3 +509,100 @@ def test_main_successful(capsys, tmp_path): assert 
merged_ex1.ref_span_text == excerpt1_label1.ref_span_text # id methods combined assert merged_ex1.identification_methods == {"manual", "refmatcha"} + + +### test for identify_overlapping_excerpts + +# test scenarios that should result in no overlapping pairs +no_overlap_inputs = [ + # list of excerpts, reason this example has no overlapping pairs + ([excerpt2], "single excerpt"), + ([excerpt2, excerpt1_label1], "excerpts on different page"), + # construct a second excerpt on the same page by using replace and relative offset span start/end + ( + [ + excerpt1, + replace( + excerpt1, + ppa_span_start=excerpt1.ppa_span_end + 100, + ppa_span_end=excerpt1.ppa_span_end + 120, + ), + ], + "same page, no overlap", + ), + # construct a short second excerpt on the same page with minimal overlap + ( + [ + excerpt1, + replace( + excerpt1, + ppa_span_start=excerpt1.ppa_span_end - 1, + ppa_span_end=excerpt1.ppa_span_end + 3, + ), + ], + "very small overlap", + ), +] + + +@pytest.mark.parametrize("excerpts, reason", no_overlap_inputs) +def test_identify_overlapping_excerpts_no_pairs(excerpts, reason): + # construct a standardized dataframe from the list of excerpts given + excerpts_df = standardize_dataframe( + pl.from_dicts([ex.to_dict() for ex in excerpts]) + ) + pairs_df = identify_overlapping_excerpts(excerpts_df) + assert pairs_df.height == 0, f"expected 0 overlapping pairs: {reason}" + + +def test_identify_overlapping_excerpts(): + # create a pair with high overlap starting with fixture 1 + # for convenience, we use the existing Span object to construct + # an overlapping span and check the overlap length / factor logic + ppa_span1 = Span(start=excerpt1.ppa_span_start, end=excerpt1.ppa_span_end, label="") + # create a second span; offset start by 1/9 the length of the first span + ppa_span2 = Span( + start=int(ppa_span1.start + len(ppa_span1) / 9), end=ppa_span1.end + 1, label="" + ) + excerpt1_overlap = replace( + excerpt1, ppa_span_start=ppa_span2.start, 
ppa_span_end=ppa_span2.end + ) + # use existing span logic as coherence check for new method + overlap_len = ppa_span1.overlap_length(ppa_span2) + assert overlap_len >= 9 + overlap_factor = ppa_span1.overlap_factor(ppa_span2, ignore_label=True) + assert overlap_factor >= 0.9 + # construct a standardized dataframe from the two test excerpts + excerpts = [excerpt1, excerpt1_overlap] + excerpts_df = standardize_dataframe( + pl.from_dicts([ex.to_dict() for ex in excerpts]) + ) + pairs_df = identify_overlapping_excerpts( + excerpts_df, min_overlap_chars=9, min_overlap_factor=0.9 + ) + # we expect one pair + assert pairs_df.height == 1 + # inspect the fields in the one returned pair + pair_result = pairs_df.row(0, named=True) + assert pair_result["page_id"] == excerpt1.page_id + # both excerpt ids present (order agnostic) + pair_exc_ids = set([pair_result["excerpt_id"], pair_result["excerpt_id_right"]]) + assert pair_exc_ids == set([excerpt1.excerpt_id, excerpt1_overlap.excerpt_id]) + assert pair_result["overlap_len"] == overlap_len + assert pair_result["overlap_factor"] == overlap_factor + + # confirm that if we adjust the parameters, this pair is not returned + assert ( + identify_overlapping_excerpts( + excerpts_df, min_overlap_chars=10, min_overlap_factor=0.9 + ).height + == 0 + ) + assert ( + identify_overlapping_excerpts( + excerpts_df, min_overlap_chars=9, min_overlap_factor=0.95 + ).height + == 0 + ) + # defaults options exclude this pair + assert identify_overlapping_excerpts(excerpts_df).height == 0 From a617b0bbf7523dc4a99aa3603bcd438624654573 Mon Sep 17 00:00:00 2001 From: Rebecca Sutton Koeser Date: Mon, 20 Apr 2026 15:46:41 -0400 Subject: [PATCH 03/13] Apply suggestions from code review Co-authored-by: Laure Thompson <602628+laurejt@users.noreply.github.com> --- src/corppa/poetry_detection/merge_excerpts.py | 105 ++++++++---------- 1 file changed, 44 insertions(+), 61 deletions(-) diff --git a/src/corppa/poetry_detection/merge_excerpts.py 
b/src/corppa/poetry_detection/merge_excerpts.py index ed177dc7..eae24663 100755 --- a/src/corppa/poetry_detection/merge_excerpts.py +++ b/src/corppa/poetry_detection/merge_excerpts.py @@ -49,6 +49,7 @@ import sys import polars as pl +from polars import col as c # for shorthand column notation from corppa.poetry_detection.core import MULTIVAL_DELIMITER from corppa.poetry_detection.polars_utils import load_excerpts_df, standardize_dataframe @@ -75,50 +76,42 @@ def merge_excerpt_groups( grouped_df.agg( # TODO: how to handle for overlapping spans pl.first("ppa_span_text"), # should match exactly - pl.col("detection_methods") - .explode() - .unique(), # combine in a single list, no repeats + c.detection_methods.explode().unique(), # combine in a single list, no repeats # combine notes but don't repeat duplicate info (like passim char match count) - pl.col("notes").explode().unique().sort().str.join("; "), + c.notes.explode().unique().sort().str.join("; "), # construct merged excerpt id manually; c= prefix for combined # (although strictly speaking should only be if > 1 detection method) pl.concat_str( pl.lit("c@"), - pl.col("ppa_span_start").first(), + c.ppa_span_start.first(), pl.lit(":"), - pl.col("ppa_span_end").first(), + c.ppa_span_end.first(), ).alias("excerpt_id"), # pick the first poem id (relies on previous sorting) - pl.col("poem_id").explode().unique().first(), + c.poem_id.first(), # and store all others in alt poem ids field - pl.col("poem_id") - .explode() - .unique() - .drop_nulls() - .slice(1) - .alias("alt_poem_ids"), - pl.col("ref_corpus").explode().first(), + c.poem_id.unique().drop_nulls().slice(1).alias("alt_poem_ids"), + c.ref_corpus.explode().first(), # use first reference span and text so numbers are useful; ignore nulls - pl.col("ref_span_start").first(), - pl.col("ref_span_end").first(), - pl.col("ref_span_text").first(), + c.ref_span_start.first(), + c.ref_span_end.first(), + c.ref_span_text.first(), # combine unique list of id methods - 
pl.col("identification_methods") - .explode() + c.identification_methods.explode() .unique() .drop_nulls(), # combine in a single list, no repeats, ignore nulls (not identified before merging) pl.len().alias("group_size"), # count number in the group ) .with_columns( notes=pl.concat_str( - pl.col("notes"), + c.notes, pl.lit(f"; merge: {merge_reason}, "), - pl.col("group_size"), + c.group_size, pl.lit(" excerpts"), ), # if alt poem ids is empty, replace with None - alt_poem_ids=pl.when(pl.col("alt_poem_ids").list.len() > 0) - .then(pl.col("alt_poem_ids")) + alt_poem_ids=pl.when(c.alt_poem_ids.list.len() > 0) + .then(c.alt_poem_ids) .otherwise(pl.lit(None)), ) .drop("group_size") @@ -146,9 +139,9 @@ def merge_excerpts( # TEMPORARY - make sure internet poem ref corpus ids match before merging df = df.with_columns( - ref_corpus=pl.when(pl.col("ref_corpus").eq("internet-poems")) + ref_corpus=pl.when(c.ref_corpus.eq("internet-poems")) .then(pl.lit("internet_poems")) - .otherwise(pl.col.ref_corpus) + .otherwise(c.ref_corpus) ) # group by page id and excerpt id to get potential matches @@ -162,7 +155,7 @@ def merge_excerpts( # add to output df and don't process further output_df = ( df.join(grouped, on=["page_id", "ppa_span_start", "ppa_span_end"]) - .filter(pl.col("group_size").eq(1)) + .filter(c.group_size.eq(1)) .drop("group_size") ) if output_df.is_empty(): @@ -172,7 +165,7 @@ def merge_excerpts( # any excerpts with group size > 1 are candidates for merging merge_candidates = ( df.join(grouped, on=["page_id", "ppa_span_start", "ppa_span_end"]) - .filter(pl.col("group_size").gt(1)) + .filter(c.group_size.gt(1)) .drop("group_size") ) @@ -182,7 +175,7 @@ def merge_excerpts( merge_groups = ( merge_candidates.with_columns( # extract passim match length so we can prioritize longer matches - passim_match_len=pl.col("notes").str.extract(r"passim: (\d+) char matches") + passim_match_len=c.notes.str.extract(r"passim: (\d+) char matches") ) .sort( "page_id", @@ -203,9 +196,7 @@ 
def merge_excerpts( merged_output_df = merge_excerpt_groups(merge_groups) if verbose: - multi_id = merged_output_df.filter( - pl.col("alt_poem_ids").list.len().gt(0) - ).height + multi_id = merged_output_df.filter(c.alt_poem_ids.list.len().gt(0)).height print( f"{merged_output_df.height:,} merged excerpts; {multi_id:,} with multiple poem ids." ) @@ -240,52 +231,46 @@ def identify_overlapping_excerpts( overlaps_df = ( excerpts_df # Filter to excerpts on pages with multiple excerpts - .filter(pl.col("page_id").is_duplicated()) + .filter(c.page_id.is_duplicated()) .join_where( excerpts_df, # 1. Limit to excerpts are on the same page - pl.col("page_id") == pl.col("page_id_right"), + c.page_id == c.page_id_right, # 2. Excerpts overlap: # left span starts before right span ends - pl.col("ppa_span_start") < pl.col("ppa_span_end_right"), + c.ppa_span_start < c.ppa_span_end_right, # and right span starts before left span ends - pl.col("ppa_span_start_right") < pl.col("ppa_span_end"), + c.ppa_span_start_right < c.ppa_span_end, # 3. 
Exclude self-matches - pl.col("excerpt_id") != pl.col("excerpt_id_right"), + c.excerpt_id < c.excerpt_id_right, ) .with_columns( # make a sorted combined id so we can drop duplicate copies of the same pair - group_ids=pl.concat_list( - [pl.col("excerpt_id"), pl.col("excerpt_id_right")] - ).list.sort() + group_ids=pl.concat_list([c.excerpt_id, c.excerpt_id_right]).list.sort() ) # excerpt ids are ONLY unique within a page # drop duplicate copies of the same overlapping pair on the same page .unique(["group_ids", "page_id"]) .with_columns( # calculate length of the overlap: smaller end minus larger start - overlap_len=pl.min_horizontal( - pl.col("ppa_span_end"), pl.col("ppa_span_end_right") - ).sub( - pl.max_horizontal( - pl.col("ppa_span_start"), pl.col("ppa_span_start_right") - ) + overlap_len=pl.min_horizontal(c.ppa_span_end, c.ppa_span_end_right).sub( + pl.max_horizontal(c.ppa_span_start, c.ppa_span_start_right) ), ) .with_columns( - overlap_factor=pl.col("overlap_len").truediv( + overlap_factor=c.overlap_len.truediv( pl.max_horizontal( - pl.col("ppa_span_end").sub(pl.col("ppa_span_start")), - pl.col("ppa_span_end_right").sub(pl.col("ppa_span_start_right")), - # pl.col("ppa_span_text").str.len_chars(), - # pl.col("ppa_span_text_right").str.len_chars(), + c.ppa_span_end.sub(c.ppa_span_start), + c.ppa_span_end_right.sub(c.ppa_span_start_right), + # c.ppa_span_text").str.len_chars(), + # c.ppa_span_text_right").str.len_chars(), ) ) ) # filter to requested overlap / length to limit to high confidence overlaps .filter( - pl.col("overlap_factor").ge(min_overlap_factor), - pl.col("overlap_len").ge(min_overlap_chars), + c.overlap_factor.ge(min_overlap_factor), + c.overlap_len.ge(min_overlap_chars), ) ) @@ -335,7 +320,7 @@ def merge_excerpt_files( total_excerpts = excerpts.height # use unique to drop exact duplicates excerpts = excerpts.unique() - initial_labeled_excerpts = excerpts.filter(pl.col("poem_id").is_not_null()).height + initial_labeled_excerpts = 
excerpts.filter(c.poem_id.is_not_null()).height # output summary information about input data print( f"Loaded {total_excerpts:,} excerpts from {len(input_files)} files ({excerpts.height:,} unique; {initial_labeled_excerpts:,} labeled)." @@ -352,16 +337,14 @@ def merge_excerpt_files( # convert list fields for output to csv and reporting excerpts = excerpts.with_columns( - detection_methods=pl.col("detection_methods") - .list.sort() - .list.join(MULTIVAL_DELIMITER), - identification_methods=pl.col("identification_methods") - .list.sort() - .list.join(MULTIVAL_DELIMITER), - alt_poem_ids=pl.col("alt_poem_ids").list.join(MULTIVAL_DELIMITER), + detection_methods=c.detection_methods.list.sort().list.join(MULTIVAL_DELIMITER), + identification_methods=c.identification_methods.list.sort().list.join( + MULTIVAL_DELIMITER + ), + alt_poem_ids=c.alt_poem_ids.list.join(MULTIVAL_DELIMITER), ) - labeled_excerpts = excerpts.filter(pl.col("poem_id").is_not_null()) + labeled_excerpts = excerpts.filter(c.poem_id.is_not_null()) # summary information about the content and what as done print( From 4b83c426aeaa89d069766296522a9d99e760d59d Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Mon, 20 Apr 2026 15:51:54 -0400 Subject: [PATCH 04/13] Simplify logic for filtering overlapping excerpt pairs --- src/corppa/poetry_detection/merge_excerpts.py | 23 ++++++------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/src/corppa/poetry_detection/merge_excerpts.py b/src/corppa/poetry_detection/merge_excerpts.py index eae24663..9fbd0d04 100755 --- a/src/corppa/poetry_detection/merge_excerpts.py +++ b/src/corppa/poetry_detection/merge_excerpts.py @@ -74,8 +74,9 @@ def merge_excerpt_groups( """ return ( grouped_df.agg( - # TODO: how to handle for overlapping spans - pl.first("ppa_span_text"), # should match exactly + # NOTE: does not handle overlapping spans; + # currently assumes text spans match or first in group is complete + pl.first("ppa_span_text"), 
c.detection_methods.explode().unique(), # combine in a single list, no repeats # combine notes but don't repeat duplicate info (like passim char match count) c.notes.explode().unique().sort().str.join("; "), @@ -91,15 +92,13 @@ def merge_excerpt_groups( c.poem_id.first(), # and store all others in alt poem ids field c.poem_id.unique().drop_nulls().slice(1).alias("alt_poem_ids"), - c.ref_corpus.explode().first(), + c.ref_corpus.first(), # use first reference span and text so numbers are useful; ignore nulls c.ref_span_start.first(), c.ref_span_end.first(), c.ref_span_text.first(), - # combine unique list of id methods - c.identification_methods.explode() - .unique() - .drop_nulls(), # combine in a single list, no repeats, ignore nulls (not identified before merging) + # combine unique list of unique id methods; ignore nulls (not identified before merging) + c.identification_methods.explode().unique().drop_nulls(), pl.len().alias("group_size"), # count number in the group ) .with_columns( @@ -241,16 +240,9 @@ def identify_overlapping_excerpts( c.ppa_span_start < c.ppa_span_end_right, # and right span starts before left span ends c.ppa_span_start_right < c.ppa_span_end, - # 3. Exclude self-matches + # 3. Exclude self-matches and reverse matches c.excerpt_id < c.excerpt_id_right, ) - .with_columns( - # make a sorted combined id so we can drop duplicate copies of the same pair - group_ids=pl.concat_list([c.excerpt_id, c.excerpt_id_right]).list.sort() - ) - # excerpt ids are ONLY unique within a page - # drop duplicate copies of the same overlapping pair on the same page - .unique(["group_ids", "page_id"]) .with_columns( # calculate length of the overlap: smaller end minus larger start overlap_len=pl.min_horizontal(c.ppa_span_end, c.ppa_span_end_right).sub( @@ -279,7 +271,6 @@ def identify_overlapping_excerpts( "page_id", "excerpt_id", "excerpt_id_right", - "group_ids", # drop? 
"overlap_len", "overlap_factor", # these are not strictly needed but may be helpful for investigating From 4b519b1cebc023cf5598ab3c5c44aa1b98a0193c Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Tue, 21 Apr 2026 10:10:21 -0400 Subject: [PATCH 05/13] Clean up & clarify comments for merge, remove unneeded code & sort --- src/corppa/poetry_detection/merge_excerpts.py | 31 ++++++++----------- .../test_merge_excerpts.py | 2 +- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/src/corppa/poetry_detection/merge_excerpts.py b/src/corppa/poetry_detection/merge_excerpts.py index 9fbd0d04..b4f374ec 100755 --- a/src/corppa/poetry_detection/merge_excerpts.py +++ b/src/corppa/poetry_detection/merge_excerpts.py @@ -59,18 +59,20 @@ def merge_excerpt_groups( grouped_df: pl.dataframe.group_by.GroupBy, merge_reason: str = "ppa exact span" ) -> pl.DataFrame: """Takes a GroupBy dataframe of excerpts (created by calling `group_by`), and combines - groups of excerpts into merged excerpts. Merges as follows: - - first ppa_span_text - - combined unique set of detection methods + groups of excerpts into merged excerpts. Fields are expected to match + labeled excerpts (:class:~`corppa.poetry_detection.core.LabeledExcerpt`), and the + dataframe should be pre-sorted so the preferred excerpt comes first, + since in several cases the first value is the one preserved in the merge. 
+ Merge logic is as follows: + - first `ppa_span_text`, `poem_id`, reference corpus values (`ref_corpus`, + `ref_span_start`, `ref_span_end`, `ref_span_text`) + - combined unique set of detection methods and identification methods - combined unique set of notes - updated excerpt id - - first poem id (dataframe should be sorted so preferred id is first) - - any other poem ids are listed in alt_poem_ids - - first reference corpus id - - first reference span and text - - combined unique list of identification methods - After merging, it adds a note documenting the group, with the specified reason, - and the number of raw excerpts in the merged set. + - any additional poem ids are listed in alt_poem_ids + After merging, the notes field is updated with text documenting the merge + with the specified reason (by default, exact span in PPA), and the + number of excerpts that were merged. """ return ( grouped_df.agg( @@ -79,7 +81,7 @@ def merge_excerpt_groups( pl.first("ppa_span_text"), c.detection_methods.explode().unique(), # combine in a single list, no repeats # combine notes but don't repeat duplicate info (like passim char match count) - c.notes.explode().unique().sort().str.join("; "), + c.notes.explode().unique().str.join("; "), # construct merged excerpt id manually; c= prefix for combined # (although strictly speaking should only be if > 1 detection method) pl.concat_str( @@ -136,13 +138,6 @@ def merge_excerpts( versions of duplicated excerpts. 
""" - # TEMPORARY - make sure internet poem ref corpus ids match before merging - df = df.with_columns( - ref_corpus=pl.when(c.ref_corpus.eq("internet-poems")) - .then(pl.lit("internet_poems")) - .otherwise(c.ref_corpus) - ) - # group by page id and excerpt id to get potential matches # use aggregation to get the count of excerpts in each group, # then split input dataframe into singletons and merge candidates diff --git a/tests/test_poetry_detection/test_merge_excerpts.py b/tests/test_poetry_detection/test_merge_excerpts.py index b3368e79..0217dbd3 100644 --- a/tests/test_poetry_detection/test_merge_excerpts.py +++ b/tests/test_poetry_detection/test_merge_excerpts.py @@ -133,7 +133,7 @@ def test_merge_excerpts_1ex_note_1label(): # notes should be combined, and merge info should be added expected_merge_note = "merge: ppa exact span, 2 excerpts" expected_notes = "; ".join( - [ex1_notes.notes, excerpt1_label1.notes, expected_merge_note] + [excerpt1_label1.notes, ex1_notes.notes, expected_merge_note] ) assert merged_excerpt.notes == expected_notes excerpt_with_notes = replace(excerpt1_label1, notes=expected_notes) From 8592b93ebebf49bca21def9972fd1201310c78e7 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Tue, 21 Apr 2026 10:19:48 -0400 Subject: [PATCH 06/13] Split code docs out into one file per top-level module Assisted-by: Claude:sonnet-4-6 [ClaudeCode] --- docs/source/code-docs.rst | 101 --------------------- docs/source/code-docs/annotation.rst | 30 ++++++ docs/source/code-docs/index.rst | 10 ++ docs/source/code-docs/ocr.rst | 12 +++ docs/source/code-docs/poetry-detection.rst | 22 +++++ docs/source/code-docs/utils.rst | 27 ++++++ docs/source/index.rst | 2 +- 7 files changed, 102 insertions(+), 102 deletions(-) delete mode 100644 docs/source/code-docs.rst create mode 100644 docs/source/code-docs/annotation.rst create mode 100644 docs/source/code-docs/index.rst create mode 100644 docs/source/code-docs/ocr.rst create mode 100644 
docs/source/code-docs/poetry-detection.rst create mode 100644 docs/source/code-docs/utils.rst diff --git a/docs/source/code-docs.rst b/docs/source/code-docs.rst deleted file mode 100644 index 2ce6b6cf..00000000 --- a/docs/source/code-docs.rst +++ /dev/null @@ -1,101 +0,0 @@ -Code Documentation -################## - -.. toctree:: - :maxdepth: 2 - -OCR -=== -.. automodule:: corppa.ocr.gvision_ocr - :members: - - -Collate Texts -------------- -.. automodule:: corppa.ocr.collate_txt -.. Note: not including the members for the method docs, *but* we should we -.. make the top-level comment better. - - -Utils -===== - -Filter Utility --------------- -.. automodule:: corppa.utils.filter -.. Note: not including members for method docs, only top-level script usage - -Path Utilities --------------- -.. automodule:: corppa.utils.path_utils - :members: - -Generate PPA Page Set ----------------------- -.. automodule:: corppa.utils.generate_page_set -.. Note: not including members for method docs, only top-level script usage - -Add Image (Relative) Paths ---------------------------- -.. automodule:: corppa.utils.add_image_relpaths -.. Note: not including members for method docs, only top-level script usage - -Build Text Corpus ------------------ -.. automodule:: corppa.utils.build_text_corpus -.. Note: not including members for method docs, only top-level script usage - - -Annotation -========== - -Data Preparation ------------------ -Preliminary Page Set Creation -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. automodule:: corppa.poetry_detection.annotation.create_pageset -.. Note: not including members for method docs, only top-level script usage - -Add Metadata -^^^^^^^^^^^^ -.. automodule:: corppa.poetry_detection.annotation.add_metadata -.. Note: not including members for method docs, only top-level script usage - -Annotation Recipes ------------------- -.. automodule:: corppa.poetry_detection.annotation.annotation_recipes -.. 
Note: not including members for method docs, only top-level script usage - -Command Recipes ---------------- -.. automodule:: corppa.poetry_detection.annotation.command_recipes -.. Note: not including members for method docs, only top-level script usage - -Process Adjudication Data -------------------------- -.. automodule:: corppa.poetry_detection.annotation.process_adjudication_data -.. Note: not including members for method docs, only top-level script usage - - -Poetry Detection -================ - -Reference Corpora ------------------ -.. automodule:: corppa.poetry_detection.ref_corpora - :members: - - -Scripts -------- - -refmatcha -^^^^^^^^^ - -.. automodule:: corppa.poetry_detection.refmatcha - -Merge excerpts -^^^^^^^^^^^^^^ - -.. automodule:: corppa.poetry_detection.merge_excerpts -.. Note: not including members for method docs, only top-level script usage \ No newline at end of file diff --git a/docs/source/code-docs/annotation.rst b/docs/source/code-docs/annotation.rst new file mode 100644 index 00000000..8bbb8dc7 --- /dev/null +++ b/docs/source/code-docs/annotation.rst @@ -0,0 +1,30 @@ +Annotation +########## + +Data Preparation +================ + +Preliminary Page Set Creation +------------------------------ +.. automodule:: corppa.poetry_detection.annotation.create_pageset +.. Note: not including members for method docs, only top-level script usage + +Add Metadata +------------ +.. automodule:: corppa.poetry_detection.annotation.add_metadata +.. Note: not including members for method docs, only top-level script usage + +Annotation Recipes +================== +.. automodule:: corppa.poetry_detection.annotation.annotation_recipes +.. Note: not including members for method docs, only top-level script usage + +Command Recipes +=============== +.. automodule:: corppa.poetry_detection.annotation.command_recipes +.. Note: not including members for method docs, only top-level script usage + +Process Adjudication Data +========================= +.. 
automodule:: corppa.poetry_detection.annotation.process_adjudication_data +.. Note: not including members for method docs, only top-level script usage diff --git a/docs/source/code-docs/index.rst b/docs/source/code-docs/index.rst new file mode 100644 index 00000000..ca5a4ef2 --- /dev/null +++ b/docs/source/code-docs/index.rst @@ -0,0 +1,10 @@ +Code Documentation +################## + +.. toctree:: + :maxdepth: 2 + + ocr + utils + annotation + poetry-detection diff --git a/docs/source/code-docs/ocr.rst b/docs/source/code-docs/ocr.rst new file mode 100644 index 00000000..7c7938e8 --- /dev/null +++ b/docs/source/code-docs/ocr.rst @@ -0,0 +1,12 @@ +OCR +### + +.. automodule:: corppa.ocr.gvision_ocr + :members: + + +Collate Texts +============= +.. automodule:: corppa.ocr.collate_txt +.. Note: not including the members for the method docs, *but* we should +.. make the top-level comment better. diff --git a/docs/source/code-docs/poetry-detection.rst b/docs/source/code-docs/poetry-detection.rst new file mode 100644 index 00000000..3335c750 --- /dev/null +++ b/docs/source/code-docs/poetry-detection.rst @@ -0,0 +1,22 @@ +Poetry Detection +################ + +Reference Corpora +================= +.. automodule:: corppa.poetry_detection.ref_corpora + :members: + + +Scripts +======= + +refmatcha +--------- + +.. automodule:: corppa.poetry_detection.refmatcha + +Merge excerpts +-------------- + +.. automodule:: corppa.poetry_detection.merge_excerpts +.. Note: not including members for method docs, only top-level script usage diff --git a/docs/source/code-docs/utils.rst b/docs/source/code-docs/utils.rst new file mode 100644 index 00000000..994e9318 --- /dev/null +++ b/docs/source/code-docs/utils.rst @@ -0,0 +1,27 @@ +Utils +##### + +Filter Utility +============== +.. automodule:: corppa.utils.filter +.. Note: not including members for method docs, only top-level script usage + +Path Utilities +============== +.. 
automodule:: corppa.utils.path_utils + :members: + +Generate PPA Page Set +===================== +.. automodule:: corppa.utils.generate_page_set +.. Note: not including members for method docs, only top-level script usage + +Add Image (Relative) Paths +========================== +.. automodule:: corppa.utils.add_image_relpaths +.. Note: not including members for method docs, only top-level script usage + +Build Text Corpus +================= +.. automodule:: corppa.utils.build_text_corpus +.. Note: not including members for method docs, only top-level script usage diff --git a/docs/source/index.rst b/docs/source/index.rst index 39d4492b..a3f4bd1a 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -19,5 +19,5 @@ This repository is research software developed as part of the `Ends of Prosody < Overview Developer Notes - code-docs + code-docs/index eop-docs From bbf5e38be94fead3259edc416e1db7c6834a7fbe Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Tue, 21 Apr 2026 10:28:23 -0400 Subject: [PATCH 07/13] Add core objects to docs & document fields --- docs/source/code-docs/poetry-detection.rst | 6 +++++ src/corppa/poetry_detection/core.py | 30 ++++++++++++++++------ 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/docs/source/code-docs/poetry-detection.rst b/docs/source/code-docs/poetry-detection.rst index 3335c750..a2bf03b4 100644 --- a/docs/source/code-docs/poetry-detection.rst +++ b/docs/source/code-docs/poetry-detection.rst @@ -1,6 +1,12 @@ Poetry Detection ################ +Core objects +============ + +.. automodule:: corppa.poetry_detection.core + :members: + Reference Corpora ================= .. 
automodule:: corppa.poetry_detection.ref_corpora diff --git a/src/corppa/poetry_detection/core.py b/src/corppa/poetry_detection/core.py index 3ff0633b..de9e0583 100644 --- a/src/corppa/poetry_detection/core.py +++ b/src/corppa/poetry_detection/core.py @@ -12,7 +12,7 @@ from Bio.Align import PairwiseAligner -# Table of supported detection methods and their corresponding prefixes +#: Supported detection methods with corresponding prefixes DETECTION_METHODS = { "adjudication": "a", "manual": "m", @@ -27,8 +27,11 @@ class Span: Span object representing a Pythonic "closed open" interval """ + #: start index start: int + #: end index end: int + #: label for the span label: str def __post_init__(self): @@ -88,11 +91,11 @@ def overlap_factor(self, other: "Span", ignore_label: bool = False) -> float: def field_real_type(field_type) -> type: """Return the real type for a dataclass field type annotation. For unions or optional values (e.g. `Optional[int]`), returns the first - non-None type; for type aliases (e.g. `set[str]`, returns the original type + non-None type; for type aliases (e.g. `set[str]`), returns the original type that was used to create the alias. 
For example: - - int -> int - - Optional[int] -> int - - set[str] -> set + - int -> int + - Optional[int] -> int + - set[str] -> set """ # if it's a regular type, return unchanged if isinstance(field_type, type): @@ -143,16 +146,21 @@ class Excerpt: """ # PPA page related + #: page id page_id: str + #: ppa span start index ppa_span_start: int + #: ppa span end index ppa_span_end: int + #: ppa span text ppa_span_text: str - # Detection methods + #: Detection methods detection_methods: set[str] - # Optional notes field + #: Optional notes notes: Optional[str] = None # Excerpt id, set in post initialization # Note: Cannot be passed in at initialization + #: excerpt identifier excerpt_id: str = field(init=False) def __post_init__(self): @@ -336,14 +344,20 @@ class LabeledExcerpt(Excerpt): """ # Reference poem related + #: poem id poem_id: str + #: reference corpus id ref_corpus: str + #: reference span start index ref_span_start: Optional[int] = None + #: reference span end index ref_span_end: Optional[int] = None + #: reference span text ref_span_text: Optional[str] = None + #: set of alternate poem ids, for merged excerpts with multiple ids alt_poem_ids: Optional[set[str]] = None - # Identification methods + #: Identification methods identification_methods: set[str] def __post_init__(self): From d23bedcc13d4ac4cfee90c32ff60bb0336350b6f Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Tue, 21 Apr 2026 10:41:17 -0400 Subject: [PATCH 08/13] Improve docs for merge excerpts code --- docs/source/code-docs/poetry-detection.rst | 3 +- src/corppa/poetry_detection/merge_excerpts.py | 63 +++++++++---------- 2 files changed, 32 insertions(+), 34 deletions(-) diff --git a/docs/source/code-docs/poetry-detection.rst b/docs/source/code-docs/poetry-detection.rst index a2bf03b4..7cc44295 100644 --- a/docs/source/code-docs/poetry-detection.rst +++ b/docs/source/code-docs/poetry-detection.rst @@ -13,6 +13,7 @@ Reference Corpora :members: + Scripts ======= @@ -25,4 +26,4 @@ Merge excerpts 
-------------- .. automodule:: corppa.poetry_detection.merge_excerpts -.. Note: not including members for method docs, only top-level script usage + :members: diff --git a/src/corppa/poetry_detection/merge_excerpts.py b/src/corppa/poetry_detection/merge_excerpts.py index b4f374ec..bc962f85 100755 --- a/src/corppa/poetry_detection/merge_excerpts.py +++ b/src/corppa/poetry_detection/merge_excerpts.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 """ -This script and associated method merges labeled and unlabeled poem excerpts +This script and associated method merges labeled and unlabeled poem excerpts with matching spans in the PPA page text. It takes two or more input files of excerpt data (labeled or unlabeled) in CSV format, @@ -15,31 +15,30 @@ Merging logic is as follows: - Excerpts are grouped based on exact span match in PPA text (i.e., the - combination of `page_id`, `ppa_span_start`, and `ppa_span_end`) + combination of ``page_id``, ``ppa_span_start``, and ``ppa_span_end``) even when poem identifications differ, and combined as follows: - - Excerpts are sorted by `poem_id`, `ref_span_start`, and passim + - Excerpts are sorted by ``poem_id``, ``ref_span_start``, and passim match length with nulls last and longest passim match first. - Reference information (`poem_id`, `ref_span_start`, `ref_span_end`, - `ref_span_text`, `ref_corpus`) is taken from the first excerpt in + Reference information (``poem_id``, ``ref_span_start``, ``ref_span_end``, + ``ref_span_text``, ``ref_corpus``) is taken from the first excerpt in the group. - When merged excerpts have different poem identifications, all unique - poem ids after the first are collected into `alt_poem_ids` - - The `detection_methods` and `identification_methods` fields are combined + poem ids after the first are collected into ``alt_poem_ids`` + - The ``detection_methods`` and ``identification_methods`` fields are combined to the unique set of methods used in the merged excerpts. 
- - The `notes` field is combined with the set of all unique content from + - The ``notes`` field is combined with the set of all unique content from notes in merged excerpts with an additional note about the merge. Example usage: :: -./src/corppa/poetry_detection/merge_excerpts.py adjudication_excerpts.csv \ -labeled_excerpts.csv -o merged_excerpts.csv + merge-excerpts adjudication_excerpts.csv labeled_excerpts.csv -o merged_excerpts.csv Limitations: - Merge logic collapses different poem ids that may not correspond; they may be subsets of the same poem, different editions, or entirely - different poems. + different poems. Alternate poem ids are preserved in ``alt_poem_ids``. - Currently supports CSV input and output only. """ @@ -59,17 +58,19 @@ def merge_excerpt_groups( grouped_df: pl.dataframe.group_by.GroupBy, merge_reason: str = "ppa exact span" ) -> pl.DataFrame: """Takes a GroupBy dataframe of excerpts (created by calling `group_by`), and combines - groups of excerpts into merged excerpts. Fields are expected to match - labeled excerpts (:class:~`corppa.poetry_detection.core.LabeledExcerpt`), and the + groups of excerpts into merged excerpts. Fields are expected to correspond to + labeled excerpts (:class:`~corppa.poetry_detection.core.LabeledExcerpt`), and the dataframe should be pre-sorted so the preferred excerpt comes first, since in several cases the first value is the one preserved in the merge. 
Merge logic is as follows: - - first `ppa_span_text`, `poem_id`, reference corpus values (`ref_corpus`, - `ref_span_start`, `ref_span_end`, `ref_span_text`) - - combined unique set of detection methods and identification methods - - combined unique set of notes - - updated excerpt id - - any additional poem ids are listed in alt_poem_ids + + - first ``ppa_span_text``, ``poem_id``, reference corpus values (``ref_corpus``, + ``ref_span_start``, ``ref_span_end``, ``ref_span_text``) + - combined unique set of detection methods and identification methods + - combined unique set of notes + - updated excerpt id + - any additional poem ids are listed in ``alt_poem_ids`` + After merging, the notes field is updated with text documenting the merge with the specified reason (by default, exact span in PPA), and the number of excerpts that were merged. @@ -122,20 +123,18 @@ def merge_excerpt_groups( def merge_excerpts( df: pl.DataFrame, disable_progress=True, verbose=False ) -> pl.DataFrame: - """Takes a polars DataFrame that includes labeled or unlabeled excerpts, - and merges excerpts based primarily on `page_id` and `excerpt_id`. - For now, merging is only done on the simple cases where reference - fields match exactly, or where reference fields are present in one labeled - excerpt and unset in the other: - - unlabeled excerpts with matching labeled excerpts - - multiple labeled excerpts with matching `poem_id` and non-conflicting - reference information - - When excerpts are merged, the detection_methods, identification_methods, - and notes fields are all combined to preserve all information. + """Takes a polars DataFrame that includes labeled or unlabeled excerpts + (fields correspond to :class:`~corppa.poetry_detection.core.LabeledExcerpt`), + and merges excerpts based on ``page_id`` and ppa span (``ppa_span_start`` and + ``ppa_span_end``). For now, merging is only done on the simple cases where + PPA excerpt text bounds match exactly. 
The best match is prioritized, based + on passim match length; alternate poem ids are preserved in ``alt_poem_ids``. + + When excerpts are merged, the ``detection_methods``, ``identification_methods``, + and ``notes`` fields are all combined to preserve information. Returns a dataframe that contains all unique excerpts and merged - versions of duplicated excerpts. + versions of duplicate excerpts. """ # group by page id and excerpt id to get potential matches @@ -249,8 +248,6 @@ def identify_overlapping_excerpts( pl.max_horizontal( c.ppa_span_end.sub(c.ppa_span_start), c.ppa_span_end_right.sub(c.ppa_span_start_right), - # c.ppa_span_text").str.len_chars(), - # c.ppa_span_text_right").str.len_chars(), ) ) ) From 6339d5bfe1da9f7af1d35fc268d88623a3de8a43 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Tue, 21 Apr 2026 10:45:29 -0400 Subject: [PATCH 09/13] Update years in docs copyright statement --- docs/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 78b52555..0724dd0d 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -9,7 +9,7 @@ from corppa import __version__ project = "corppa" -copyright = "2024,2025 Center for Digital Humanities, Princeton University" +copyright = "2024—2026 Center for Digital Humanities, Princeton University" author = "Center for Digital Humanities RSE Team, Princeton University" release = __version__ From 99ca06709e4f2bba99592c586d488da795c578b7 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Tue, 21 Apr 2026 10:49:47 -0400 Subject: [PATCH 10/13] Fix formatting in docstring --- src/corppa/poetry_detection/core.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/corppa/poetry_detection/core.py b/src/corppa/poetry_detection/core.py index de9e0583..619a46e0 100644 --- a/src/corppa/poetry_detection/core.py +++ b/src/corppa/poetry_detection/core.py @@ -90,12 +90,13 @@ def overlap_factor(self, other: "Span", ignore_label: bool = 
False) -> float: def field_real_type(field_type) -> type: """Return the real type for a dataclass field type annotation. - For unions or optional values (e.g. `Optional[int]`), returns the first - non-None type; for type aliases (e.g. `set[str]`), returns the original type + For unions or optional values (e.g. ``Optional[int]``), returns the first + non-None type; for type aliases (e.g. ``set[str]``), returns the original type that was used to create the alias. For example: - - int -> int - - Optional[int] -> int - - set[str] -> set + + - ``int`` -> ``int`` + - ``Optional[int]`` -> ``int`` + - ``set[str]`` -> ``set`` """ # if it's a regular type, return unchanged if isinstance(field_type, type): From 757935bc24344ffb932c74352c8e51732ab8cffd Mon Sep 17 00:00:00 2001 From: Rebecca Sutton Koeser Date: Tue, 21 Apr 2026 13:20:59 -0400 Subject: [PATCH 11/13] Apply suggestions from code review Co-authored-by: Laure Thompson <602628+laurejt@users.noreply.github.com> --- src/corppa/poetry_detection/merge_excerpts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/corppa/poetry_detection/merge_excerpts.py b/src/corppa/poetry_detection/merge_excerpts.py index bc962f85..b923b958 100755 --- a/src/corppa/poetry_detection/merge_excerpts.py +++ b/src/corppa/poetry_detection/merge_excerpts.py @@ -96,7 +96,7 @@ def merge_excerpt_groups( # and store all others in alt poem ids field c.poem_id.unique().drop_nulls().slice(1).alias("alt_poem_ids"), c.ref_corpus.first(), - # use first reference span and text so numbers are useful; ignore nulls + # use first reference span and text so numbers are useful c.ref_span_start.first(), c.ref_span_end.first(), c.ref_span_text.first(), @@ -227,7 +227,7 @@ def identify_overlapping_excerpts( .filter(c.page_id.is_duplicated()) .join_where( excerpts_df, - # 1. Limit to excerpts are on the same page + # 1. Limit to excerpts on the same page c.page_id == c.page_id_right, # 2. 
Excerpts overlap: # left span starts before right span ends From 33be42ebcad1ba998a7d56eb1beec91b9bca336e Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Tue, 21 Apr 2026 13:16:11 -0400 Subject: [PATCH 12/13] Add license document to sphinx docs Resolves warning about unresolved link in readme --- docs/source/LICENSE.md | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 docs/source/LICENSE.md diff --git a/docs/source/LICENSE.md b/docs/source/LICENSE.md new file mode 100644 index 00000000..165e2cc4 --- /dev/null +++ b/docs/source/LICENSE.md @@ -0,0 +1,7 @@ +--- +orphan: true +--- +``` +{include} ../../LICENSE.md +``` + From 5f9e59db10044fa17ce84ae938fe6d7c1f29ee67 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Tue, 21 Apr 2026 13:31:07 -0400 Subject: [PATCH 13/13] Clean up & additional test cases per @laurejt review --- .../test_merge_excerpts.py | 43 +++++++++++++++++-- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/tests/test_poetry_detection/test_merge_excerpts.py b/tests/test_poetry_detection/test_merge_excerpts.py index 0217dbd3..bd46de7d 100644 --- a/tests/test_poetry_detection/test_merge_excerpts.py +++ b/tests/test_poetry_detection/test_merge_excerpts.py @@ -117,6 +117,18 @@ def test_merge_excerpts_1ex_2labels(capsys): if field not in ["notes", "poem_id", "identification_methods", "alt_poem_ids"]: assert getattr(merged_excerpt, field) == getattr(excerpt1_label1, field) + # check that works the same way with different initial order, + # since the method orders before grouping + df = pl.from_dicts( + [excerpt1_label2.to_dict(), excerpt1_label1.to_dict(), excerpt1.to_dict()] + ) + merged = merge_excerpts(df) + # expect one row with combined labels + assert len(merged) == 1 + merged_excerpt = LabeledExcerpt.from_dict(merged.row(0, named=True)) + # first poem id is selected as primary + assert merged_excerpt.poem_id == excerpt1_label1.poem_id + def test_merge_excerpts_1ex_note_1label(): # excerpt with note + labeled excerpt (same id) @@ 
-196,7 +208,7 @@ def test_merge_passim_match_len(): def test_merge_excerpts_multiple_diff_labels(capsys): - # excerpt + two labeled excerpt (same excerpt id, two different ref ids) + # excerpt + two labeled excerpts (same excerpt id, two different poem ids) df = pl.from_dicts( [excerpt1.to_dict(), excerpt1_label1.to_dict(), excerpt1_label2.to_dict()] ) @@ -230,7 +242,7 @@ def test_merge_excerpts_multiple_diff_labels(capsys): def test_merge_excerpts_1ex_2labels_diffmethod(): # unlabeled excerpt + two matching labeled excerpts - # - same excerpt id, two labels with same ref ids but different method + # - same excerpt id, two labels with same poem ids but different method # combine method does not merge these # everything the same except for the method (unlikely!) @@ -320,7 +332,7 @@ def test_merge_unlabeled_labeled_excerpts(): def test_merge_excerpts(): # excerpt + two matching labeled excerpts - # - same excerpt id, two labels with same ref ids but different method + # - same excerpt id, two labels with same poem ids but different method # everything the same except for the method (unlikely!) 
excerpt1_label1_method2 = replace( @@ -455,7 +467,7 @@ def test_main_invalid_input(capsys, tmp_path): def test_main_successful(capsys, tmp_path): - # test a succesful run + # test a successful run excerpt_datafile = tmp_path / "excerpts.csv" _excerpts_to_csv(excerpt_datafile, [excerpt1, excerpt2]) # valid excerpt data @@ -542,6 +554,14 @@ def test_main_successful(capsys, tmp_path): ], "very small overlap", ), + ([excerpt1, excerpt1], "full overlap, short text"), + ( + [ + replace(excerpt1, ppa_span_end=100), + replace(excerpt1, ppa_span_start=80, ppa_span_end=200), + ], + "long overlap, low overlap factor", + ), ] @@ -606,3 +626,18 @@ def test_identify_overlapping_excerpts(): ) # defaults options exclude this pair assert identify_overlapping_excerpts(excerpts_df).height == 0 + + # check results when input is given in the alternate order + excerpts = [excerpt1_overlap, excerpt1] + excerpts_df = standardize_dataframe( + pl.from_dicts([ex.to_dict() for ex in excerpts]) + ) + pairs_df = identify_overlapping_excerpts( + excerpts_df, min_overlap_chars=9, min_overlap_factor=0.9 + ) + # we expect one pair + assert pairs_df.height == 1 + # check that pair is ordered as expected + pair_result = pairs_df.row(0, named=True) + assert pair_result["excerpt_id"] == excerpt1.excerpt_id + assert pair_result["excerpt_id_right"] == excerpt1_overlap.excerpt_id