diff --git a/docs/source/LICENSE.md b/docs/source/LICENSE.md new file mode 100644 index 00000000..165e2cc4 --- /dev/null +++ b/docs/source/LICENSE.md @@ -0,0 +1,6 @@ +--- +orphan: true +--- +```{include} ../../LICENSE.md +``` + diff --git a/docs/source/code-docs.rst b/docs/source/code-docs.rst deleted file mode 100644 index 2ce6b6cf..00000000 --- a/docs/source/code-docs.rst +++ /dev/null @@ -1,101 +0,0 @@ -Code Documentation -################## - -.. toctree:: - :maxdepth: 2 - -OCR -=== -.. automodule:: corppa.ocr.gvision_ocr - :members: - - -Collate Texts -------------- -.. automodule:: corppa.ocr.collate_txt -.. Note: not including the members for the method docs, *but* we should we -.. make the top-level comment better. - - -Utils -===== - -Filter Utility --------------- -.. automodule:: corppa.utils.filter -.. Note: not including members for method docs, only top-level script usage - -Path Utilities --------------- -.. automodule:: corppa.utils.path_utils - :members: - -Generate PPA Page Set ----------------------- -.. automodule:: corppa.utils.generate_page_set -.. Note: not including members for method docs, only top-level script usage - -Add Image (Relative) Paths ---------------------------- -.. automodule:: corppa.utils.add_image_relpaths -.. Note: not including members for method docs, only top-level script usage - -Build Text Corpus ------------------ -.. automodule:: corppa.utils.build_text_corpus -.. Note: not including members for method docs, only top-level script usage - - -Annotation -========== - -Data Preparation ------------------ -Preliminary Page Set Creation -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. automodule:: corppa.poetry_detection.annotation.create_pageset -.. Note: not including members for method docs, only top-level script usage - -Add Metadata -^^^^^^^^^^^^ -.. automodule:: corppa.poetry_detection.annotation.add_metadata -.. 
Note: not including members for method docs, only top-level script usage - -Annotation Recipes ------------------- -.. automodule:: corppa.poetry_detection.annotation.annotation_recipes -.. Note: not including members for method docs, only top-level script usage - -Command Recipes ---------------- -.. automodule:: corppa.poetry_detection.annotation.command_recipes -.. Note: not including members for method docs, only top-level script usage - -Process Adjudication Data -------------------------- -.. automodule:: corppa.poetry_detection.annotation.process_adjudication_data -.. Note: not including members for method docs, only top-level script usage - - -Poetry Detection -================ - -Reference Corpora ------------------ -.. automodule:: corppa.poetry_detection.ref_corpora - :members: - - -Scripts -------- - -refmatcha -^^^^^^^^^ - -.. automodule:: corppa.poetry_detection.refmatcha - -Merge excerpts -^^^^^^^^^^^^^^ - -.. automodule:: corppa.poetry_detection.merge_excerpts -.. Note: not including members for method docs, only top-level script usage \ No newline at end of file diff --git a/docs/source/code-docs/annotation.rst b/docs/source/code-docs/annotation.rst new file mode 100644 index 00000000..8bbb8dc7 --- /dev/null +++ b/docs/source/code-docs/annotation.rst @@ -0,0 +1,30 @@ +Annotation +########## + +Data Preparation +================ + +Preliminary Page Set Creation +------------------------------ +.. automodule:: corppa.poetry_detection.annotation.create_pageset +.. Note: not including members for method docs, only top-level script usage + +Add Metadata +------------ +.. automodule:: corppa.poetry_detection.annotation.add_metadata +.. Note: not including members for method docs, only top-level script usage + +Annotation Recipes +================== +.. automodule:: corppa.poetry_detection.annotation.annotation_recipes +.. Note: not including members for method docs, only top-level script usage + +Command Recipes +=============== +.. 
automodule:: corppa.poetry_detection.annotation.command_recipes +.. Note: not including members for method docs, only top-level script usage + +Process Adjudication Data +========================= +.. automodule:: corppa.poetry_detection.annotation.process_adjudication_data +.. Note: not including members for method docs, only top-level script usage diff --git a/docs/source/code-docs/index.rst b/docs/source/code-docs/index.rst new file mode 100644 index 00000000..ca5a4ef2 --- /dev/null +++ b/docs/source/code-docs/index.rst @@ -0,0 +1,10 @@ +Code Documentation +################## + +.. toctree:: + :maxdepth: 2 + + ocr + utils + annotation + poetry-detection diff --git a/docs/source/code-docs/ocr.rst b/docs/source/code-docs/ocr.rst new file mode 100644 index 00000000..7c7938e8 --- /dev/null +++ b/docs/source/code-docs/ocr.rst @@ -0,0 +1,12 @@ +OCR +### + +.. automodule:: corppa.ocr.gvision_ocr + :members: + + +Collate Texts +============= +.. automodule:: corppa.ocr.collate_txt +.. Note: not including the members for the method docs, *but* we should +.. make the top-level comment better. diff --git a/docs/source/code-docs/poetry-detection.rst b/docs/source/code-docs/poetry-detection.rst new file mode 100644 index 00000000..7cc44295 --- /dev/null +++ b/docs/source/code-docs/poetry-detection.rst @@ -0,0 +1,29 @@ +Poetry Detection +################ + +Core objects +============ + +.. automodule:: corppa.poetry_detection.core + :members: + +Reference Corpora +================= +.. automodule:: corppa.poetry_detection.ref_corpora + :members: + + + +Scripts +======= + +refmatcha +--------- + +.. automodule:: corppa.poetry_detection.refmatcha + +Merge excerpts +-------------- + +.. 
automodule:: corppa.poetry_detection.merge_excerpts + :members: diff --git a/docs/source/code-docs/utils.rst b/docs/source/code-docs/utils.rst new file mode 100644 index 00000000..994e9318 --- /dev/null +++ b/docs/source/code-docs/utils.rst @@ -0,0 +1,27 @@ +Utils +##### + +Filter Utility +============== +.. automodule:: corppa.utils.filter +.. Note: not including members for method docs, only top-level script usage + +Path Utilities +============== +.. automodule:: corppa.utils.path_utils + :members: + +Generate PPA Page Set +===================== +.. automodule:: corppa.utils.generate_page_set +.. Note: not including members for method docs, only top-level script usage + +Add Image (Relative) Paths +========================== +.. automodule:: corppa.utils.add_image_relpaths +.. Note: not including members for method docs, only top-level script usage + +Build Text Corpus +================= +.. automodule:: corppa.utils.build_text_corpus +.. Note: not including members for method docs, only top-level script usage diff --git a/docs/source/conf.py b/docs/source/conf.py index 78b52555..0724dd0d 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -9,7 +9,7 @@ from corppa import __version__ project = "corppa" -copyright = "2024,2025 Center for Digital Humanities, Princeton University" +copyright = "2024—2026 Center for Digital Humanities, Princeton University" author = "Center for Digital Humanities RSE Team, Princeton University" release = __version__ diff --git a/docs/source/index.rst b/docs/source/index.rst index 39d4492b..a3f4bd1a 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -19,5 +19,5 @@ This repository is research software developed as part of the `Ends of Prosody < Overview Developer Notes - code-docs + code-docs/index eop-docs diff --git a/src/corppa/poetry_detection/core.py b/src/corppa/poetry_detection/core.py index 3ff0633b..619a46e0 100644 --- a/src/corppa/poetry_detection/core.py +++ b/src/corppa/poetry_detection/core.py @@ 
-12,7 +12,7 @@ from Bio.Align import PairwiseAligner -# Table of supported detection methods and their corresponding prefixes +#: Supported detection methods with corresponding prefixes DETECTION_METHODS = { "adjudication": "a", "manual": "m", @@ -27,8 +27,11 @@ class Span: Span object representing a Pythonic "closed open" interval """ + #: start index start: int + #: end index end: int + #: label for the span label: str def __post_init__(self): @@ -87,12 +90,13 @@ def overlap_factor(self, other: "Span", ignore_label: bool = False) -> float: def field_real_type(field_type) -> type: """Return the real type for a dataclass field type annotation. - For unions or optional values (e.g. `Optional[int]`), returns the first - non-None type; for type aliases (e.g. `set[str]`, returns the original type + For unions or optional values (e.g. ``Optional[int]``), returns the first + non-None type; for type aliases (e.g. ``set[str]``), returns the original type that was used to create the alias. For example: - - int -> int - - Optional[int] -> int - - set[str] -> set + + - ``int`` -> ``int`` + - ``Optional[int]`` -> ``int`` + - ``set[str]`` -> ``set`` """ # if it's a regular type, return unchanged if isinstance(field_type, type): @@ -143,16 +147,21 @@ class Excerpt: """ # PPA page related + #: page id page_id: str + #: ppa span start index ppa_span_start: int + #: ppa span end index ppa_span_end: int + #: ppa span text ppa_span_text: str - # Detection methods + #: Detection methods detection_methods: set[str] - # Optional notes field + #: Optional notes notes: Optional[str] = None # Excerpt id, set in post initialization # Note: Cannot be passed in at initialization + #: excerpt identifier excerpt_id: str = field(init=False) def __post_init__(self): @@ -336,14 +345,20 @@ class LabeledExcerpt(Excerpt): """ # Reference poem related + #: poem id poem_id: str + #: reference corpus id ref_corpus: str + #: reference span start index ref_span_start: Optional[int] = None + #: reference 
span end index ref_span_end: Optional[int] = None + #: reference span text ref_span_text: Optional[str] = None + #: set of alternate poem ids, for merged excerpts with multiple ids alt_poem_ids: Optional[set[str]] = None - # Identification methods + #: Identification methods identification_methods: set[str] def __post_init__(self): diff --git a/src/corppa/poetry_detection/merge_excerpts.py b/src/corppa/poetry_detection/merge_excerpts.py index e034f184..b923b958 100755 --- a/src/corppa/poetry_detection/merge_excerpts.py +++ b/src/corppa/poetry_detection/merge_excerpts.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 """ -This script and associated method merges labeled and unlabeled poem excerpts +This script and associated method merges labeled and unlabeled poem excerpts with matching spans in the PPA page text. It takes two or more input files of excerpt data (labeled or unlabeled) in CSV format, @@ -15,31 +15,30 @@ Merging logic is as follows: - Excerpts are grouped based on exact span match in PPA text (i.e., the - combination of `page_id`, `ppa_span_start`, and `ppa_span_end`) + combination of ``page_id``, ``ppa_span_start``, and ``ppa_span_end``) even when poem identifications differ, and combined as follows: - - Excerpts are sorted by `poem_id`, `ref_span_start`, and passim + - Excerpts are sorted by ``poem_id``, ``ref_span_start``, and passim match length with nulls last and longest passim match first. - Reference information (`poem_id`, `ref_span_start`, `ref_span_end`, - `ref_span_text`, `ref_corpus`) is taken from the first excerpt in + Reference information (``poem_id``, ``ref_span_start``, ``ref_span_end``, + ``ref_span_text``, ``ref_corpus``) is taken from the first excerpt in the group. 
def merge_excerpt_groups(
    grouped_df: pl.dataframe.group_by.GroupBy, merge_reason: str = "ppa exact span"
) -> pl.DataFrame:
    """Combine each group of excerpts in a grouped dataframe into a single
    merged excerpt.

    Takes a GroupBy dataframe of excerpts (created by calling ``group_by``)
    whose fields correspond to labeled excerpts
    (:class:`~corppa.poetry_detection.core.LabeledExcerpt`). The dataframe
    should be sorted *before* grouping so that the preferred excerpt comes
    first in each group, since for several fields the first value is the
    one preserved in the merge.

    Merge logic is as follows:

    - first ``ppa_span_text``, ``poem_id``, and reference values
      (``ref_corpus``, ``ref_span_start``, ``ref_span_end``, ``ref_span_text``)
    - combined unique set of detection methods and identification methods
    - combined unique notes, in order of first appearance within the group
    - new excerpt id of the form ``c@<ppa_span_start>:<ppa_span_end>``
    - any poem ids after the first are listed in ``alt_poem_ids``
      (null when there are none)

    After merging, the notes field is extended with text documenting the
    merge: the specified reason (by default, exact span in PPA) and the
    number of excerpts that were merged.

    :param grouped_df: grouped excerpts, one group per merged excerpt
    :param merge_reason: short description of why the excerpts were merged;
        included in the merge note
    :returns: dataframe with one merged excerpt per group
    """
    merged = grouped_df.agg(
        # NOTE: does not handle overlapping spans;
        # currently assumes text spans match or first in group is complete
        pl.first("ppa_span_text"),
        # combine in a single list, no repeats (order is not significant)
        c.detection_methods.explode().unique(),
        # combine notes but don't repeat duplicate info (like passim char
        # match count); keep order of first appearance so the joined text
        # is deterministic and follows the pre-sorted excerpt order
        c.notes.explode().unique(maintain_order=True).str.join("; "),
        # construct merged excerpt id manually; c@ prefix for combined
        # (although strictly speaking should only be if > 1 detection method)
        pl.concat_str(
            pl.lit("c@"),
            c.ppa_span_start.first(),
            pl.lit(":"),
            c.ppa_span_end.first(),
        ).alias("excerpt_id"),
        # pick the first poem id (relies on previous sorting)
        c.poem_id.first(),
        # and store all others in alt poem ids field; order must be
        # preserved so that slice(1) skips the primary id chosen above
        # rather than an arbitrary one
        c.poem_id.drop_nulls()
        .unique(maintain_order=True)
        .slice(1)
        .alias("alt_poem_ids"),
        c.ref_corpus.first(),
        # use first reference span and text so numbers are useful
        c.ref_span_start.first(),
        c.ref_span_end.first(),
        c.ref_span_text.first(),
        # combine unique list of id methods; ignore nulls
        # (excerpts that were not identified before merging)
        c.identification_methods.explode().unique().drop_nulls(),
        # count number in the group, for the merge note
        pl.len().alias("group_size"),
    )

    return merged.with_columns(
        # document the merge at the end of the combined notes
        notes=pl.concat_str(
            c.notes,
            pl.lit(f"; merge: {merge_reason}, "),
            c.group_size,
            pl.lit(" excerpts"),
        ),
        # if alt poem ids is empty, replace with None
        alt_poem_ids=pl.when(c.alt_poem_ids.list.len() > 0)
        .then(c.alt_poem_ids)
        .otherwise(pl.lit(None)),
    ).drop("group_size")  # group size is only needed for the merge note
""" - # TEMPORARY - make sure internet poem ref corpus ids match before merging - df = df.with_columns( - ref_corpus=pl.when(pl.col("ref_corpus").eq("internet-poems")) - .then(pl.lit("internet_poems")) - .otherwise(pl.col.ref_corpus) - ) - # group by page id and excerpt id to get potential matches # use aggregation to get the count of excerpts in each group, # then split input dataframe into singletons and merge candidates @@ -91,16 +148,17 @@ def merge_excerpts( # add to output df and don't process further output_df = ( df.join(grouped, on=["page_id", "ppa_span_start", "ppa_span_end"]) - .filter(pl.col("group_size").eq(1)) + .filter(c.group_size.eq(1)) .drop("group_size") ) if output_df.is_empty(): + # if none were found, create an empty output_df = df.clear() # any excerpts with group size > 1 are candidates for merging merge_candidates = ( df.join(grouped, on=["page_id", "ppa_span_start", "ppa_span_end"]) - .filter(pl.col("group_size").gt(1)) + .filter(c.group_size.gt(1)) .drop("group_size") ) @@ -110,7 +168,7 @@ def merge_excerpts( merge_groups = ( merge_candidates.with_columns( # extract passim match length so we can prioritize longer matches - passim_match_len=pl.col("notes").str.extract(r"passim: (\d+) char matches") + passim_match_len=c.notes.str.extract(r"passim: (\d+) char matches") ) .sort( "page_id", @@ -128,62 +186,10 @@ def merge_excerpts( f"Identified {merge_candidates.height:,} merge candidates in {num_merge_groups:,} groups." 
) - merged_output_df = ( - merge_groups.agg( - pl.first("ppa_span_text"), # should match exactly - pl.col("detection_methods") - .explode() - .unique(), # combine in a single list, no repeats - # combine notes but don't repeat duplicate info (like passim char match count) - pl.col("notes").explode().unique().sort().str.join("; "), - # construct merged excerpt id manually; c= prefix for combined - # (although strictly speaking should only be if > 1 detection method) - pl.concat_str( - pl.lit("c@"), - pl.col("ppa_span_start").first(), - pl.lit(":"), - pl.col("ppa_span_end").first(), - ).alias("excerpt_id"), - # pick the first poem id (relies on previous sorting) - pl.col("poem_id").explode().unique().first(), - # and store all others in alt poem ids field - pl.col("poem_id") - .explode() - .unique() - .drop_nulls() - .slice(1) - .alias("alt_poem_ids"), - pl.col("ref_corpus").explode().first(), - # use first reference span and text so numbers are useful; ignore nulls - pl.col("ref_span_start").first(), - pl.col("ref_span_end").first(), - pl.col("ref_span_text").first(), - # combine unique list of id methods - pl.col("identification_methods") - .explode() - .unique() - .drop_nulls(), # combine in a single list, no repeats, ignore nulls (not identified before merging) - pl.len().alias("group_size"), # count number in the group - ) - .with_columns( - notes=pl.concat_str( - pl.col("notes"), - pl.lit("; merge: ppa exact span, "), - pl.col("group_size"), - pl.lit(" excerpts"), - ), - # if alt poem ids is empty, replace with None - alt_poem_ids=pl.when(pl.col("alt_poem_ids").list.len() > 0) - .then(pl.col("alt_poem_ids")) - .otherwise(pl.lit(None)), - ) - .drop("group_size") # drop group size column - ) + merged_output_df = merge_excerpt_groups(merge_groups) if verbose: - multi_id = merged_output_df.filter( - pl.col("alt_poem_ids").list.len().gt(0) - ).height + multi_id = merged_output_df.filter(c.alt_poem_ids.list.len().gt(0)).height print( f"{merged_output_df.height:,} 
def identify_overlapping_excerpts(
    excerpts_df: pl.DataFrame,
    min_overlap_factor: float = 0.98,
    min_overlap_chars: int = 10,
) -> pl.DataFrame:
    """
    Takes a DataFrame of excerpts and identifies pairs of overlapping excerpts.
    Overlapping excerpts are on the same page, with some shared span of text.
    We exclude short overlaps based on the minimum character parameter,
    and an overlap factor, which is calculated by the length of the shared
    span divided by the length of the longer of the two spans. Note that
    this will typically not return small excerpts completely inside another
    larger span.

    Each pair is reported once, with the lower ``excerpt_id`` on the left;
    excerpts that share the same ``excerpt_id`` are never paired.

    :param excerpts_df: excerpts to compare; must include ``page_id``,
        ``excerpt_id``, ``ppa_span_start`` and ``ppa_span_end``
    :param min_overlap_factor: minimum overlap factor (inclusive) for a
        pair to be returned
    :param min_overlap_chars: minimum overlap length in characters
        (inclusive) for a pair to be returned
    :returns: DataFrame of excerpt pairs with columns for page id,
        pairs of excerpt ids, overlap length, and overlap factor, followed
        by notes, span, and text columns for both excerpts when those are
        present in the input (to help with investigating the pairs).
    """
    # length of each excerpt's span in the PPA page text
    left_len = c.ppa_span_end.sub(c.ppa_span_start)
    right_len = c.ppa_span_end_right.sub(c.ppa_span_start_right)

    # identify excerpts with partial overlap
    overlaps_df = (
        excerpts_df
        # Filter to excerpts on pages with multiple excerpts
        .filter(c.page_id.is_duplicated())
        .join_where(
            excerpts_df,
            # 1. Limit to excerpts on the same page
            c.page_id == c.page_id_right,
            # 2. Excerpts overlap:
            # left span starts before right span ends
            c.ppa_span_start < c.ppa_span_end_right,
            # and right span starts before left span ends
            c.ppa_span_start_right < c.ppa_span_end,
            # 3. Exclude self-matches and reverse matches
            c.excerpt_id < c.excerpt_id_right,
        )
        .with_columns(
            # calculate length of the overlap: smaller end minus larger start
            overlap_len=pl.min_horizontal(c.ppa_span_end, c.ppa_span_end_right).sub(
                pl.max_horizontal(c.ppa_span_start, c.ppa_span_start_right)
            ),
        )
        .with_columns(
            # overlap relative to the longer of the two spans
            overlap_factor=c.overlap_len.truediv(
                pl.max_horizontal(left_len, right_len)
            )
        )
        # filter to requested overlap / length to limit to high confidence overlaps
        .filter(
            c.overlap_factor.ge(min_overlap_factor),
            c.overlap_len.ge(min_overlap_chars),
        )
    )

    # columns that define the result
    pair_columns = [
        "page_id",
        "excerpt_id",
        "excerpt_id_right",
        "overlap_len",
        "overlap_factor",
    ]
    # these are not strictly needed but may be helpful for investigating;
    # only include the ones available, so that input without notes or
    # reference fields does not fail on purely informational columns
    detail_columns = [
        "notes",
        "notes_right",
        "ppa_span_text",
        "ppa_span_start",
        "ppa_span_end",
        "ppa_span_text_right",
        "ppa_span_start_right",
        "ppa_span_end_right",
        "ref_span_text",
        "ref_span_text_right",
    ]
    available = set(overlaps_df.columns)
    return overlaps_df.select(
        pair_columns + [name for name in detail_columns if name in available]
    )
@@ -235,16 +320,14 @@ def merge_excerpt_files( # convert list fields for output to csv and reporting excerpts = excerpts.with_columns( - detection_methods=pl.col("detection_methods") - .list.sort() - .list.join(MULTIVAL_DELIMITER), - identification_methods=pl.col("identification_methods") - .list.sort() - .list.join(MULTIVAL_DELIMITER), - alt_poem_ids=pl.col("alt_poem_ids").list.join(MULTIVAL_DELIMITER), + detection_methods=c.detection_methods.list.sort().list.join(MULTIVAL_DELIMITER), + identification_methods=c.identification_methods.list.sort().list.join( + MULTIVAL_DELIMITER + ), + alt_poem_ids=c.alt_poem_ids.list.join(MULTIVAL_DELIMITER), ) - labeled_excerpts = excerpts.filter(pl.col("poem_id").is_not_null()) + labeled_excerpts = excerpts.filter(c.poem_id.is_not_null()) # summary information about the content and what as done print( diff --git a/tests/test_poetry_detection/test_merge_excerpts.py b/tests/test_poetry_detection/test_merge_excerpts.py index cfdaa2ba..bd46de7d 100644 --- a/tests/test_poetry_detection/test_merge_excerpts.py +++ b/tests/test_poetry_detection/test_merge_excerpts.py @@ -9,11 +9,13 @@ import pytest from test_polars_utils import _excerpts_to_csv -from corppa.poetry_detection.core import Excerpt, LabeledExcerpt +from corppa.poetry_detection.core import Excerpt, LabeledExcerpt, Span from corppa.poetry_detection.merge_excerpts import ( + identify_overlapping_excerpts, main, merge_excerpts, ) +from corppa.poetry_detection.polars_utils import standardize_dataframe excerpt1 = Excerpt( page_id="p.1", @@ -115,6 +117,18 @@ def test_merge_excerpts_1ex_2labels(capsys): if field not in ["notes", "poem_id", "identification_methods", "alt_poem_ids"]: assert getattr(merged_excerpt, field) == getattr(excerpt1_label1, field) + # check that works the same way with different initial order, + # since the method orders before grouping + df = pl.from_dicts( + [excerpt1_label2.to_dict(), excerpt1_label1.to_dict(), excerpt1.to_dict()] + ) + merged = 
merge_excerpts(df) + # expect one row with combined labels + assert len(merged) == 1 + merged_excerpt = LabeledExcerpt.from_dict(merged.row(0, named=True)) + # first poem id is selected as primary + assert merged_excerpt.poem_id == excerpt1_label1.poem_id + def test_merge_excerpts_1ex_note_1label(): # excerpt with note + labeled excerpt (same id) @@ -131,7 +145,7 @@ def test_merge_excerpts_1ex_note_1label(): # notes should be combined, and merge info should be added expected_merge_note = "merge: ppa exact span, 2 excerpts" expected_notes = "; ".join( - [ex1_notes.notes, excerpt1_label1.notes, expected_merge_note] + [excerpt1_label1.notes, ex1_notes.notes, expected_merge_note] ) assert merged_excerpt.notes == expected_notes excerpt_with_notes = replace(excerpt1_label1, notes=expected_notes) @@ -194,7 +208,7 @@ def test_merge_passim_match_len(): def test_merge_excerpts_multiple_diff_labels(capsys): - # excerpt + two labeled excerpt (same excerpt id, two different ref ids) + # excerpt + two labeled excerpt (same excerpt id, two different poem ids) df = pl.from_dicts( [excerpt1.to_dict(), excerpt1_label1.to_dict(), excerpt1_label2.to_dict()] ) @@ -228,7 +242,7 @@ def test_merge_excerpts_multiple_diff_labels(capsys): def test_merge_excerpts_1ex_2labels_diffmethod(): # unlabeled excerpt + two matching labeled excerpts - # - same excerpt id, two labels with same ref ids but different method + # - same excerpt id, two labels with same poem ids but different method # combine method does not merge these # everything the same except for the method (unlikely!) @@ -318,7 +332,7 @@ def test_merge_unlabeled_labeled_excerpts(): def test_merge_excerpts(): # excerpt + two matching labeled excerpts - # - same excerpt id, two labels with same ref ids but different method + # - same excerpt id, two labels with same poem ids but different method # everything the same except for the method (unlikely!) 
### tests for identify_overlapping_excerpts

# test scenarios that should result in no overlapping pairs
no_overlap_inputs = [
    # list of excerpts, reason this example has no overlapping pairs
    ([excerpt2], "single excerpt"),
    ([excerpt2, excerpt1_label1], "excerpts on different page"),
    # construct a second excerpt on the same page by using replace and
    # relative offset span start/end
    (
        [
            excerpt1,
            replace(
                excerpt1,
                ppa_span_start=excerpt1.ppa_span_end + 100,
                ppa_span_end=excerpt1.ppa_span_end + 120,
            ),
        ],
        "same page, no overlap",
    ),
    # construct a short second excerpt on the same page with minimal overlap
    (
        [
            excerpt1,
            replace(
                excerpt1,
                ppa_span_start=excerpt1.ppa_span_end - 1,
                ppa_span_end=excerpt1.ppa_span_end + 3,
            ),
        ],
        "very small overlap",
    ),
    # two copies of the same excerpt share an excerpt id, so the pair is
    # skipped by the self-match filter no matter how long the text is
    ([excerpt1, excerpt1], "identical excerpts (same excerpt id)"),
    (
        [
            replace(excerpt1, ppa_span_end=100),
            replace(excerpt1, ppa_span_start=80, ppa_span_end=200),
        ],
        "long overlap, low overlap factor",
    ),
]


@pytest.mark.parametrize("excerpts, reason", no_overlap_inputs)
def test_identify_overlapping_excerpts_no_pairs(excerpts, reason):
    # construct a standardized dataframe from the list of excerpts given
    excerpts_df = standardize_dataframe(
        pl.from_dicts([ex.to_dict() for ex in excerpts])
    )
    pairs_df = identify_overlapping_excerpts(excerpts_df)
    assert pairs_df.height == 0, f"expected 0 overlapping pairs: {reason}"
def test_identify_overlapping_excerpts():
    # create a pair with high overlap starting with fixture 1
    # for convenience, we use the existing Span object to construct
    # an overlapping span and check the overlap length / factor logic
    ppa_span1 = Span(start=excerpt1.ppa_span_start, end=excerpt1.ppa_span_end, label="")
    # create a second span; offset start by 1/9 the length of the first span
    ppa_span2 = Span(
        start=int(ppa_span1.start + len(ppa_span1) / 9), end=ppa_span1.end + 1, label=""
    )
    excerpt1_overlap = replace(
        excerpt1, ppa_span_start=ppa_span2.start, ppa_span_end=ppa_span2.end
    )
    # use existing span logic as coherence check for new method
    overlap_len = ppa_span1.overlap_length(ppa_span2)
    assert overlap_len >= 9
    overlap_factor = ppa_span1.overlap_factor(ppa_span2, ignore_label=True)
    assert overlap_factor >= 0.9
    # construct a standardized dataframe from the two test excerpts
    excerpts = [excerpt1, excerpt1_overlap]
    excerpts_df = standardize_dataframe(
        pl.from_dicts([ex.to_dict() for ex in excerpts])
    )
    pairs_df = identify_overlapping_excerpts(
        excerpts_df, min_overlap_chars=9, min_overlap_factor=0.9
    )
    # we expect one pair
    assert pairs_df.height == 1
    # inspect the fields in the one returned pair
    pair_result = pairs_df.row(0, named=True)
    assert pair_result["page_id"] == excerpt1.page_id
    # both excerpt ids present (order agnostic)
    pair_exc_ids = {pair_result["excerpt_id"], pair_result["excerpt_id_right"]}
    assert pair_exc_ids == {excerpt1.excerpt_id, excerpt1_overlap.excerpt_id}
    assert pair_result["overlap_len"] == overlap_len
    # the factor is a float computed by two different implementations,
    # so compare with a tolerance rather than exact equality
    assert pair_result["overlap_factor"] == pytest.approx(overlap_factor)

    # confirm that if we adjust the parameters, this pair is not returned
    assert (
        identify_overlapping_excerpts(
            excerpts_df, min_overlap_chars=10, min_overlap_factor=0.9
        ).height
        == 0
    )
    assert (
        identify_overlapping_excerpts(
            excerpts_df, min_overlap_chars=9, min_overlap_factor=0.95
        ).height
        == 0
    )
    # default options exclude this pair
    assert identify_overlapping_excerpts(excerpts_df).height == 0

    # check results when input is given in the alternate order
    excerpts = [excerpt1_overlap, excerpt1]
    excerpts_df = standardize_dataframe(
        pl.from_dicts([ex.to_dict() for ex in excerpts])
    )
    pairs_df = identify_overlapping_excerpts(
        excerpts_df, min_overlap_chars=9, min_overlap_factor=0.9
    )
    # we expect one pair
    assert pairs_df.height == 1
    # check that pair is ordered as expected
    pair_result = pairs_df.row(0, named=True)
    assert pair_result["excerpt_id"] == excerpt1.excerpt_id
    assert pair_result["excerpt_id_right"] == excerpt1_overlap.excerpt_id