diff --git a/sample_config.yml b/sample_config.yml
index 9fe8aa29..2bd4e826 100644
--- a/sample_config.yml
+++ b/sample_config.yml
@@ -26,11 +26,11 @@ reference_corpora:
   # internet_poems:
   #   tarball of directory of text files OR expanded directory;
   #   some functionality will only work with the expanded directory
-  #   text_dir: "internet_poems/internet_poems_texts.tar.gz"
+  #   text_path: "internet_poems/internet_poems_texts.tar.gz"
   # chadwyck-healey:
   #   tarball of directory of text files OR expanded directory;
   #   some functionality will only work with the expanded directory
-  #   text_dir: "chadwyck-healey/chadwyck-healey_texts.tar.gz"
+  #   text_path: "chadwyck-healey/chadwyck-healey_texts.tar.gz"
   #   metadata_path: "chadwyck-healey/chadwyck-healey.csv"
   other:
     # Provide a URL or local path to "Other Poems" metadata
diff --git a/src/corppa/config.py b/src/corppa/config.py
index 1fb01dbb..4673a66c 100644
--- a/src/corppa/config.py
+++ b/src/corppa/config.py
@@ -29,11 +29,11 @@
     "base_dir": "ref-corpora",
     # paths are relative to base_dir
     "internet_poems": {
-        # tarball of directory of text files OR expanded directory
-        "text_dir": "internet_poems/internet_poems_texts.tar.gz"
+        # tarball of text files OR expanded directory
+        "text_path": "internet_poems/internet_poems_texts.tar.gz"
     },
     "chadwyck-healey": {
-        "text_dir": "chadwyck-healey/chadwyck-healey_texts.tar.gz",
+        "text_path": "chadwyck-healey/chadwyck-healey_texts.tar.gz",
        "metadata_path": "chadwyck-healey/chadwyck-healey.csv",
     },
     # other poems metadata_path configuration required
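As context for the rename: a relative `text_path` is resolved against the configured `base_dir` (see `LocalTextCorpus.__init__` below). A minimal sketch of that resolution, with illustrative values standing in for the real configuration:

```python
import pathlib

# illustrative values; the real ones come from the corppa YAML config
base_dir = pathlib.Path("ref-corpora")
text_path = pathlib.Path("internet_poems/internet_poems_texts.tar.gz")

# mirrors the resolution logic in LocalTextCorpus.__init__
if not text_path.is_absolute():
    text_path = base_dir / text_path
# -> ref-corpora/internet_poems/internet_poems_texts.tar.gz
```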
+ """ return list(excerpt_data_dir.glob("**/*.csv")) + list( excerpt_data_dir.glob("**/*.csv.gz") ) - # wondered about using find_relative_paths here, but we actually - # want non-relative paths and we need to handle a two-part extension - # return [ - # excerpt_data_dir / rel_path - # for rel_path in find_relative_paths(excerpt_data_dir, exts=[".csv", ".gz"]) # can we assume .gz == .csv.gz ? - # ] -def save_ppa_metadata(input_file: pathlib.Path, output_file: pathlib.Path): +def save_ppa_metadata( + input_file: pathlib.Path, output_file: pathlib.Path, excerpts_df: pl.DataFrame +): + """ + Save PPA work metadata with work-level excerpt totals. + Takes a PPA metadata file as input, a path for the output file, + and a dataframe of merged excerpt data. + Raises a ValueError if metadata file is not a CSV. + """ # copy as-is, do not rename or subset any fields # NOTE: currently assumes and only supports PPA metadata in csv format if input_file.suffix != ".csv": @@ -152,11 +178,32 @@ def save_ppa_metadata(input_file: pathlib.Path, output_file: pathlib.Path): f"PPA metadata must be loaded as CSV, got {input_file.suffix.lstrip('.')}" ) ppa_meta_df = pl.read_csv(input_file) - # TODO: add aggregate counts here + + # get work-level aggregate excerpt totals + excerpt_totals_df = excerpts_df.group_by("ppa_work_id").agg( + pl.col("excerpt_id").n_unique().alias("num_excerpts"), + pl.col("poem_id").n_unique().alias("num_poems"), + pl.col("poem_author").n_unique().alias("num_poets"), + ) + + # combine the totals with ppa work metadata + ppa_meta_df = ppa_meta_df.join( + excerpt_totals_df, left_on="work_id", right_on="ppa_work_id", how="left" + ).with_columns( + # fill any missing values with zeroes + pl.col("num_excerpts").fill_null(pl.lit(0)), + pl.col("num_poems").fill_null(pl.lit(0)), + pl.col("num_poets").fill_null(pl.lit(0)), + ) + ppa_meta_df.write_csv(output_file) -def compress_file(uncompressed_file, compressed_file): +def compress_file(uncompressed_file: pathlib.Path, compressed_file: pathlib.Path): + """ + Compress the `uncompressed_file` passed in with gzip, + saving it at the `compressed_file` path and deleting the original. + """ with open(str(uncompressed_file), "rb") as inputfile: with gzip.open(str(compressed_file), "wb") as output_file: shutil.copyfileobj(inputfile, output_file) @@ -165,7 +212,73 @@ def compress_file(uncompressed_file, compressed_file): uncompressed_file.unlink() -def main(): +def run_merge_step( + compile_opts: CompileOpts, excerpts_df: pl.DataFrame | None, compress_excerpts: bool +) -> pl.DataFrame: + """Run the merge excerpts step. Finds source excerpt files from the configured + path, merges excerpts, saves to CSV, and optionally compresses the CSV file. + """ + print("## Merging excerpts") + excerpt_sources = get_excerpt_sources(compile_opts["source_excerpt_data"]) + excerpts_df = merge_excerpt_files( + excerpt_sources, compile_opts["compiled_excerpt_file"] + ) + if compress_excerpts: + print( + f"Compressing excerpt data... {compile_opts['compiled_excerpt_file']} → {compile_opts['compressed_excerpt_file']}" + ) + compress_file( + compile_opts["compiled_excerpt_file"], + compile_opts["compressed_excerpt_file"], + ) + return excerpts_df + + +def run_poem_metadata_step( + compile_opts: CompileOpts, excerpts_df: pl.DataFrame | None = None +) -> None: + """Run the poem metadata compilation step. Uses excerpt data + (passed in or loaded from compile opts path) to calculate + poem excerpt totals. 
+ """ + print("\n## Compiling reference corpora metadata") + if excerpts_df is None: + excerpts_df = load_compiled_excerpts(compile_opts) + else: + excerpts_df = extract_page_meta(excerpts_df) + save_poem_metadata(compile_opts["poem_metadata_file"], excerpts_df) + + +def run_ppa_metadata_step( + compile_opts: CompileOpts, excerpts_df: pl.DataFrame | None = None +) -> None: + """Run the PPA metadata compilation step. Uses excerpt data (passed + in or loaded from compile opts path) to calculate work-level + excerpt totals. + """ + print("\n## PPA work-level metadata") + if excerpts_df is None: + excerpts_df = load_compiled_excerpts(compile_opts) + else: + excerpts_df = extract_page_meta(excerpts_df) + + excerpts_df = add_ref_poems_meta(excerpts_df, compile_opts["poem_metadata_file"]) + + save_ppa_metadata( + compile_opts["source_ppa_metadata"], + compile_opts["ppa_metadata_file"], + excerpts_df, + ) + + +def main(cmd_args=None) -> None: + """ + Main entry point for the dataset compilation script. Parses + arguments to determine which steps to run. + """ + # allow passing arguments in; if not specified, draw from sys.argv/command line + if cmd_args is None: + cmd_args = sys.argv[1:] parser = argparse.ArgumentParser(description="Compile PPA found-poems dataset") parser.add_argument( "--compress-excerpts", @@ -191,40 +304,26 @@ def main(): action="append_const", const=step, ) - args = parser.parse_args() - compilation_steps = args.steps # None or list of steps + args = parser.parse_args(cmd_args) + # if not specified, run all steps + compilation_steps = args.steps if args.steps else list(compilation_steps.keys()) compile_opts = load_compilation_config() - if compilation_steps is None or "merge" in compilation_steps: - print("## Merging excerpts") - # find excerpt source files to be included in the compiled dataset file - excerpt_sources = get_excerpt_sources(compile_opts["source_excerpt_data"]) - # merge into a single uncompressed csv - # (polars doesn't currently support writing directly to a csv.gz) - merge_excerpt_files(excerpt_sources, compile_opts["compiled_excerpt_file"]) - # compress the resulting file if requested - if args.compress_excerpts: - print( - f"Compressing excerpt data... ({compile_opts['compiled_excerpt_file']} → {compile_opts['compressed_excerpt_file']})" - ) - compress_file( - compile_opts["compiled_excerpt_file"], - compile_opts["compressed_excerpt_file"], - ) + excerpts_df = None + if "merge" in compilation_steps: + excerpts_df = run_merge_step(compile_opts, excerpts_df, args.compress_excerpts) - if compilation_steps is None or "poem_metadata" in compilation_steps: - print("\n## Compiling reference corpora metadata") - save_poem_metadata(compile_opts["poem_metadata_file"]) + if "poem_metadata" in compilation_steps: + run_poem_metadata_step(compile_opts, excerpts_df) - if compilation_steps is None or "ppa_metadata" in compilation_steps: - print("\n## PPA work-level metadata") - save_ppa_metadata( - compile_opts["source_ppa_metadata"], compile_opts["ppa_metadata_file"] - ) + if "ppa_metadata" in compilation_steps: + run_ppa_metadata_step(compile_opts, excerpts_df) - print("\nRemember to commit and push the updated data files") - print(f"cd {compile_opts['output_data_dir'].parent} && git add data/*") + # probably not relevant anymore, not using git-lfs for this data... 
+ print(f"Output files in {compile_opts['output_data_dir']}") + # print("\nRemember to commit and push the updated data files") + # print(f"cd {compile_opts['output_data_dir'].parent} && git add data/*") if __name__ == "__main__": diff --git a/src/corppa/poetry_detection/merge_excerpts.py b/src/corppa/poetry_detection/merge_excerpts.py index 7ff5869f..e034f184 100755 --- a/src/corppa/poetry_detection/merge_excerpts.py +++ b/src/corppa/poetry_detection/merge_excerpts.py @@ -194,7 +194,9 @@ def merge_excerpts( return pl.concat([output_df, merged_output_df], how="diagonal") -def merge_excerpt_files(input_files, output_file): +def merge_excerpt_files( + input_files: list[pathlib.Path], output_file: pathlib.Path +) -> pl.DataFrame: total_excerpts = 0 input_dfs = [] @@ -259,7 +261,11 @@ def merge_excerpt_files(input_files, output_file): # row is a tuple of value, count print(f"\t{row[0]}: {row[1]:,}") + # polars supports compression; but not sure what version it + # was added in, and documentation says it is unstable. Use that in future excerpts.write_csv(output_file) + # return excerpt data frame + return excerpts def main(): diff --git a/src/corppa/poetry_detection/ref_corpora.py b/src/corppa/poetry_detection/ref_corpora.py index baa91099..4bd64e5d 100644 --- a/src/corppa/poetry_detection/ref_corpora.py +++ b/src/corppa/poetry_detection/ref_corpora.py @@ -1,12 +1,15 @@ -import os.path +import logging import pathlib -import tarfile from collections.abc import Generator +from typing import Optional import polars as pl from corppa.config import get_config -from corppa.utils.build_text_corpus import build_text_corpus +from corppa.utils.build_text_corpus import build_text_corpus, text_corpus_from_tarfile + +logger = logging.getLogger(__name__) + #: schema for reference corpora metadata :class:`pl.DataFrame` METADATA_SCHEMA = { @@ -14,6 +17,9 @@ "author": pl.String, "title": pl.String, "ref_corpus": pl.String, + "num_lines": pl.Int64, + "num_words": pl.Int64, + "char_len": pl.Int64, } @@ -25,7 +31,7 @@ class BaseReferenceCorpus: corpus_id: str corpus_name: str - text_dir: pathlib.Path + text_path: pathlib.Path metadata_path: pathlib.Path | str def get_config_opts(self) -> dict: @@ -60,10 +66,24 @@ def get_config_opts(self) -> dict: corpus_opts.update(config_opts["reference_corpora"].get(self.corpus_id, {})) return corpus_opts - def get_metadata_df(self) -> pl.DataFrame: + @staticmethod + def calculate_poem_length(text: str) -> dict[str, int]: + """Calculate poem length metrics from text content. Takes the + text of the poem and returns a dictionary num_lines (non-blank lines), + num_words, and char_len. + """ + return { + "num_lines": len([line for line in text.splitlines() if line.strip()]), + "num_words": len(text.split()), + "char_len": len(text), + } + + def get_metadata_df(self, poem_length=False) -> pl.DataFrame: """Minimal common poetry metadata for use across reference corpora. Should return a :class:`pl.DataFrame` with poem_id, author, title, and - ref_corpus for each poem in this corpus.""" + ref_corpus for each poem in this corpus. 
diff --git a/src/corppa/poetry_detection/ref_corpora.py b/src/corppa/poetry_detection/ref_corpora.py
index baa91099..4bd64e5d 100644
--- a/src/corppa/poetry_detection/ref_corpora.py
+++ b/src/corppa/poetry_detection/ref_corpora.py
@@ -1,12 +1,15 @@
-import os.path
+import logging
 import pathlib
-import tarfile
 from collections.abc import Generator
+from typing import Optional
 
 import polars as pl
 
 from corppa.config import get_config
-from corppa.utils.build_text_corpus import build_text_corpus
+from corppa.utils.build_text_corpus import build_text_corpus, text_corpus_from_tarfile
+
+logger = logging.getLogger(__name__)
+
 
 #: schema for reference corpora metadata :class:`pl.DataFrame`
 METADATA_SCHEMA = {
@@ -14,6 +17,9 @@
     "author": pl.String,
     "title": pl.String,
     "ref_corpus": pl.String,
+    "num_lines": pl.Int64,
+    "num_words": pl.Int64,
+    "char_len": pl.Int64,
 }
 
 
@@ -25,7 +31,7 @@ class BaseReferenceCorpus:
 
     corpus_id: str
     corpus_name: str
-    text_dir: pathlib.Path
+    text_path: pathlib.Path
     metadata_path: pathlib.Path | str
 
     def get_config_opts(self) -> dict:
@@ -60,10 +66,24 @@ def get_config_opts(self) -> dict:
         corpus_opts.update(config_opts["reference_corpora"].get(self.corpus_id, {}))
         return corpus_opts
 
-    def get_metadata_df(self) -> pl.DataFrame:
+    @staticmethod
+    def calculate_poem_length(text: str) -> dict[str, int]:
+        """Calculate poem length metrics from text content. Takes the
+        text of the poem and returns a dictionary with num_lines (non-blank lines),
+        num_words, and char_len.
+        """
+        return {
+            "num_lines": len([line for line in text.splitlines() if line.strip()]),
+            "num_words": len(text.split()),
+            "char_len": len(text),
+        }
+
+    def get_metadata_df(self, poem_length=False) -> pl.DataFrame:
         """Minimal common poetry metadata for use across reference corpora.
         Should return a :class:`pl.DataFrame` with poem_id, author, title, and
-        ref_corpus for each poem in this corpus."""
+        ref_corpus for each poem in this corpus. When poem_length is requested,
+        should also include poem length information (number of lines, words,
+        and characters in the text)."""
         raise NotImplementedError
 
     def get_text_corpus(self) -> Generator[dict[str, str]]:
@@ -87,36 +107,40 @@ def __init__(self):
         config_opts = self.get_config_opts()
         # get text directory for this reference corpus from app configuration
-        if "text_dir" in config_opts:
-            self.text_dir = pathlib.Path(config_opts["text_dir"])
-            # if text dir is not absolute, assume relative to ref_corpus base dir
-            if not self.text_dir.is_absolute():
-                self.text_dir = config_opts["base_dir"] / self.text_dir
-
-            if not self.text_dir.exists():
+        if "text_path" in config_opts:
+            self.text_path = pathlib.Path(config_opts["text_path"])
+            # if text path is not absolute, assume relative to ref_corpus base dir
+            # TODO: shift relative path logic to config loader
+            if not self.text_path.is_absolute():
+                self.text_path = config_opts["base_dir"] / self.text_path
+
+            if not self.text_path.exists():
                 raise ValueError(
-                    f"Configuration error: {self.corpus_name} path {self.text_dir} does not exist"
+                    f"Configuration error: {self.corpus_name} path {self.text_path} does not exist"
                 )
-            # TODO: allow tar.gz here; determine which and set a flag?
-            if not self.text_dir.is_dir() and not (
-                self.text_dir.is_file() and self.text_dir.name.endswith(".tar.gz")
+            # Currently supports directory and tar.gz file;
+            # might be nice to support zipfile as well
+            if not self.text_path.is_dir() and not (
+                self.text_path.is_file() and self.text_path.name.endswith(".tar.gz")
             ):
                 raise ValueError(
-                    f"Configuration error: {self.corpus_name} path {self.text_dir} is not a directory or a tar.gz"
+                    f"Configuration error: {self.corpus_name} path {self.text_path} is not a directory or a tar.gz"
                )
 
-    def get_text_corpus(
-        self, disable_progress: bool = True
-    ) -> Generator[dict[str, str]]:
-        # if text_dir is tarball, raise not implemented error
-        if not self.text_dir.is_dir():
+    def get_text_corpus(self, disable_progress: bool = True) -> Generator[dict[str, str]]:
+        # choose the corpus generator based on the configured path type
+        if self.text_path.is_dir():
+            corpus_method = build_text_corpus
+        elif self.text_path.name.endswith(".tar.gz"):
+            corpus_method = text_corpus_from_tarfile
+        else:
             raise NotImplementedError(
-                "text corpus generation is not supported for tar.gz; configure a directory"
+                "text corpus generation is only supported for tar.gz and directories"
            )
         # build_text_corpus method returns id, so rename id to poem_id
         yield from (
             {"poem_id": p["id"], "text": p["text"]}
-            for p in build_text_corpus(self.text_dir, disable_progress=disable_progress)
+            for p in corpus_method(self.text_path, disable_progress=disable_progress)
         )
 
 
@@ -131,40 +155,30 @@ class InternetPoems(LocalTextCorpus):
     #: id for this reference corpus: internet_poems
     corpus_id: str = "internet_poems"
     corpus_name: str = "Internet Poems"
-    # inherits text_dir path
+    # inherits text_path
 
     # no init/validation needed beyond that provided by LocalTextCorpus
 
-    def get_metadata_df(self) -> pl.DataFrame:
+    def get_metadata_df(self, poem_length=False) -> pl.DataFrame:
         metadata = []
-
-        # if configured text_dir is a directory, get list of names
-        # from the filesystem
-        if self.text_dir.is_dir():
-            text_ids = [file.stem for file in self.text_dir.glob("*.txt")]
-        # otherwise, get from tar archive list
-        else:
-            with tarfile.open(str(self.text_dir), "r:gz") as text_archive:
-                text_ids = [
-                    os.path.splitext(os.path.basename(name))[0]
-                    for name in text_archive.getnames()
-                    if name.endswith(".txt")
-                ]
-
-        # filename without extension is poem identifier
-        for poem_id in text_ids:
+        # returns a generator of dicts with id and text string
+        # TODO: when called from compile script, might be nice to show progress bar
+        for poem in self.get_text_corpus():
             # filename format:
             # Firstname-Lastname_Poem-Title.txt
             # Replace - with spaces and split on - to separate author/title
-            author, title = poem_id.replace("-", " ").split("_", 1)
-            metadata.append(
-                {
-                    "poem_id": poem_id,
-                    "author": author,
-                    "title": title,
-                    "ref_corpus": self.corpus_id,
-                }
-            )
+            author, title = poem["poem_id"].replace("-", " ").split("_", 1)
+            poem_metadata: dict[str, str | int] = {
+                "poem_id": poem["poem_id"],
+                "author": author,
+                "title": title,
+                "ref_corpus": self.corpus_id,
+            }
+            if poem_length:
+                poem_metadata.update(self.calculate_poem_length(poem["text"]))
+
+            metadata.append(poem_metadata)
+
         return pl.from_dicts(metadata, schema=METADATA_SCHEMA)
 
 
@@ -177,10 +191,10 @@ class ChadwyckHealey(LocalTextCorpus):
     #: id for this reference corpus: chadwyck-healey
     corpus_id: str = "chadwyck-healey"
     corpus_name: str = "Chadwyck-Healey"
-    # inherits text_dir path
+    # inherits text_path
 
     def __init__(self):
-        # use LocalTextCorpus init to configure and validate text_dir
+        # use LocalTextCorpus init to configure and validate text_path
         super().__init__()
         # get configuration to set metadata path
         config_opts = self.get_config_opts()
@@ -194,9 +208,9 @@ def __init__(self):
                 f"Configuration error: {self.corpus_name} metadata {self.metadata_path} does not exist"
             )
 
-    def get_metadata_df(self) -> pl.DataFrame:
+    def get_metadata_df(self, poem_length=False) -> pl.DataFrame:
         # disable schema inference; the fields we care about are all strings
-        return (
+        df = (
             pl.read_csv(self.metadata_path, infer_schema=False)
             # rename fields
             .rename({"title_main": "title", "id": "poem_id"})
@@ -212,6 +226,28 @@ def get_metadata_df(self) -> pl.DataFrame:
             .select(["poem_id", "author", "title", "ref_corpus"])
         )
 
+        if poem_length:
+            poem_lengths = []
+
+            # text corpus returns a generator of dicts with id and text string
+            # NOTE: when called from compile script, might be nice to show progress bar
+            for poem in self.get_text_corpus():
+                poem_lengths.append(
+                    {
+                        "poem_id": poem["poem_id"],
+                        **self.calculate_poem_length(poem["text"]),
+                    }
+                )
+            if poem_lengths:
+                poem_length_df = pl.from_dicts(poem_lengths)
+                df = df.join(poem_length_df, on="poem_id")
+            else:
+                logger.warning(
+                    "Poem length requested but none calculated (no text files found?)"
+                )
+
+        return df
+
 
 class OtherPoems(BaseReferenceCorpus):
     """A metadata-only reference corpus with metadata for poems that have
@@ -239,7 +275,7 @@ def __init__(self):
                 f"Configuration error: {self.corpus_name} 'metadata_path' is not set"
             )
 
-    def get_metadata_df(self) -> pl.DataFrame:
+    def get_metadata_df(self, poem_length=False) -> pl.DataFrame:
         # polars can load csv directly from a url
         return pl.read_csv(self.metadata_path, schema=METADATA_SCHEMA).with_columns(
             ref_corpus=pl.lit(self.corpus_id)
@@ -260,20 +296,22 @@ def fulltext_corpora() -> list[BaseReferenceCorpus]:
     return [InternetPoems(), ChadwyckHealey()]
 
 
-def compile_metadata_df() -> pl.DataFrame:
+def compile_metadata_df(poem_length=False) -> pl.DataFrame:
     """Compile poetry metadata from all reference corpora into a single
     polars DataFrame with reference corpus ids."""
-    # create an empty dataframe with the intended fields
-    poem_metadata = pl.DataFrame([], schema=METADATA_SCHEMA)
+    # Combine poem metadata from all reference corpora
 
-    # for each corpus, load poem metadata into a polars dataframe,
-    # rename id to poem_id, and add a column with the corpus id
-    for ref_corpus in all_corpora():
-        poem_metadata.extend(ref_corpus.get_metadata_df())
-    return poem_metadata
+    # use a diagonal concat instead of vstack/extend
+    ref_corpora_dfs = [
+        ref_corpus.get_metadata_df(poem_length=poem_length)
+        for ref_corpus in all_corpora()
+    ]
+    return pl.concat(ref_corpora_dfs, how="diagonal")
 
 
-def save_poem_metadata(output_file: pathlib.Path):
+def save_poem_metadata(
+    output_file: pathlib.Path, excerpts_df: Optional[pl.DataFrame] = None
+):
     """Generate and save compiled poetry metadata as a data file in
     the poem dataset.
     """
@@ -283,7 +321,7 @@ def save_poem_metadata(output_file: pathlib.Path):
         output_verb = "Replacing"
 
     print(f"{output_verb} {output_file}")
-    df = compile_metadata_df()
+    df = compile_metadata_df(poem_length=True)
     ref_corpus_names = {
         ref_corpus.corpus_id: ref_corpus.corpus_name for ref_corpus in all_corpora()
     }
@@ -294,5 +332,23 @@
         # row is a tuple of value, count; convert reference corpus id to name
         totals.append(f"{ref_corpus_names[value]}: {count:,}")
 
+    # when excerpt data is present, calculate & include aggregate totals
+    if excerpts_df is not None:
+        # get poem-level aggregate excerpt totals
+        # (only includes primary poem ids, not alt poem ids)
+        excerpt_totals_df = excerpts_df.group_by("poem_id").agg(
+            pl.col("excerpt_id").n_unique().alias("num_excerpts"),
+            pl.col("ppa_work_id").n_unique().alias("num_ppa_works"),
+            pl.col("page_id").n_unique().alias("num_ppa_pages"),
+            # number of unique ppa authors would be nice, but requires joining ppa metadata
+        )
+        # combine the totals with poem metadata
+        df = df.join(excerpt_totals_df, on="poem_id", how="left").with_columns(
+            # fill any missing values with zeroes
+            pl.col("num_excerpts").fill_null(pl.lit(0)),
+            pl.col("num_ppa_works").fill_null(pl.lit(0)),
+            pl.col("num_ppa_pages").fill_null(pl.lit(0)),
+        )
+
     print(f"{df.height:,} poem metadata entries ({'; '.join(totals)})")
     df.write_csv(output_file, include_bom=True)
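The length metrics are simple enough to verify by hand; a worked example of `calculate_poem_length` on a short text (illustrative input, not fixture data):

```python
from corppa.poetry_detection.ref_corpora import BaseReferenceCorpus

# two non-blank lines separated by a blank line
text = "Tyger Tyger, burning bright,\n\nIn the forests of the night;"
print(BaseReferenceCorpus.calculate_poem_length(text))
# {'num_lines': 2, 'num_words': 10, 'char_len': 58}
```

The diagonal concat in `compile_metadata_df` matters here: metadata-only corpora like `OtherPoems` may lack the length columns, and `pl.concat(..., how="diagonal")` fills any missing columns with nulls rather than raising a schema mismatch.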
diff --git a/src/corppa/utils/build_text_corpus.py b/src/corppa/utils/build_text_corpus.py
index 8ae288b9..4c56effa 100644
--- a/src/corppa/utils/build_text_corpus.py
+++ b/src/corppa/utils/build_text_corpus.py
@@ -26,6 +26,7 @@
 
 import argparse
 import sys
+import tarfile
 from pathlib import Path
 
 import orjsonl
@@ -44,13 +45,13 @@ def get_text_record(text_file: Path) -> dict[str, str]:
 
 
 def build_text_corpus(
-    input_dir: Path, disable_progress: bool = False
+    input_path: Path, disable_progress: bool = False
 ) -> dict[str, str]:
     """
     Generates text records for each text file within input directory
     """
     progress_bar = tqdm(
-        input_dir.glob("**/*.txt"),
+        input_path.glob("**/*.txt"),
         bar_format="Read {n:,} pages{postfix} | elapsed: {elapsed}",
         disable=disable_progress,
     )
@@ -58,6 +59,34 @@ def build_text_corpus(
         yield get_text_record(text_file)
 
 
+def text_corpus_from_tarfile(
+    input_path: Path, disable_progress: bool = False
+) -> dict[str, str]:
+    """
+    Generate text records for each text file within a tar.gz archive
+    """
+    # NOTE: could make compression optional, currently assumes gz
+    with tarfile.open(str(input_path), "r:gz") as tar_archive:
+        for member in tqdm(
+            tar_archive.getmembers(),
+            bar_format="Read {n:,} files{postfix} | elapsed: {elapsed}",
+            disable=disable_progress,
+        ):
+            # skip any OSX metadata files included in the archive
+            if "._" in member.name:
+                continue
+
+            # read contents of text files and yield filename and contents
+            if member.name.endswith(".txt"):
+                txtfile = tar_archive.extractfile(member)
+                if txtfile is not None:
+                    yield {
+                        "id": Path(member.name).stem,
+                        "text": txtfile.read().decode("utf-8"),
+                    }
+
+
 def save_text_corpus(
     input_dir: Path,
     output_file=Path,
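Both generators yield the same record shape (`id` and `text`), so callers can consume them interchangeably; a sketch, assuming a hypothetical `poems.tar.gz` archive of `.txt` files:

```python
from pathlib import Path

from corppa.utils.build_text_corpus import text_corpus_from_tarfile

# hypothetical archive path; each record has the filename stem as id
for record in text_corpus_from_tarfile(Path("poems.tar.gz"), disable_progress=True):
    print(record["id"], len(record["text"]))
```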
diff --git a/tests/test_poetry_detection/test_compile_dataset.py b/tests/test_poetry_detection/test_compile_dataset.py
new file mode 100644
index 00000000..658b21d3
--- /dev/null
+++ b/tests/test_poetry_detection/test_compile_dataset.py
@@ -0,0 +1,306 @@
+# Copyright (c) 2026, Center for Digital Humanities, Princeton University
+# SPDX-License-Identifier: Apache-2.0
+
+
+import gzip
+from unittest.mock import patch
+
+import polars as pl
+import pytest
+
+from corppa.poetry_detection.compile_dataset import (
+    compress_file,
+    get_excerpt_sources,
+    load_compiled_excerpts,
+    main,
+    run_merge_step,
+    run_poem_metadata_step,
+    run_ppa_metadata_step,
+    save_ppa_metadata,
+)
+
+
+def test_get_excerpt_sources_empty_dir(tmp_path):
+    result = get_excerpt_sources(tmp_path)
+    assert result == []
+
+
+def test_get_excerpt_sources_with_files(tmp_path):
+    subdir1 = tmp_path / "subdir1"
+    subdir2 = tmp_path / "subdir2"
+    subdir1.mkdir()
+    subdir2.mkdir()
+
+    (tmp_path / "file1.csv").touch()
+    (tmp_path / "file2.csv.gz").touch()
+    (subdir1 / "nested.csv").touch()
+    (subdir2 / "nested.csv.gz").touch()
+    (tmp_path / "file3.txt").touch()
+
+    result = get_excerpt_sources(tmp_path)
+    assert len(result) == 4
+
+
+def test_save_ppa_metadata(tmp_path):
+    input_file = tmp_path / "ppa_works.csv"
+    output_file = tmp_path / "output.csv"
+
+    input_file.write_text("work_id,title,author\nW001,Test Work,Test Author\n")
+
+    excerpts_df = pl.DataFrame(
+        {
+            "ppa_work_id": ["W001", "W001", "W001"],
+            "excerpt_id": ["e1", "e2", "e3"],
+            "poem_id": ["poem-1", "poem-1", "poem-2"],
+            "poem_author": ["Author A", "Author A", "Author B"],
+        }
+    )
+
+    save_ppa_metadata(input_file, output_file, excerpts_df)
+
+    result = pl.read_csv(output_file)
+    assert "num_excerpts" in result.columns
+    assert "num_poems" in result.columns
+    assert "num_poets" in result.columns
+
+    row = result.row(0, named=True)
+    assert row["work_id"] == "W001"
+    assert row["num_excerpts"] == 3
+    assert row["num_poems"] == 2
+    assert row["num_poets"] == 2
+
+
+def test_save_ppa_metadata_not_csv(tmp_path):
+    input_file = tmp_path / "ppa_works.json"
+    output_file = tmp_path / "output.csv"
+
+    excerpts_df = pl.DataFrame(
+        {
+            "ppa_work_id": ["W001"],
+            "excerpt_id": ["e1"],
+            "poem_id": ["poem-1"],
+            "poem_author": ["Author A"],
+        }
+    )
+
+    with pytest.raises(ValueError, match="PPA metadata must be loaded as CSV"):
+        save_ppa_metadata(input_file, output_file, excerpts_df)
+
+
+@patch("corppa.poetry_detection.compile_dataset.pl.read_csv")
+@patch("corppa.poetry_detection.compile_dataset.extract_page_meta")
+def test_load_compiled_excerpts_uncompressed(
+    mock_extract_page_meta, mock_read_csv, tmp_path
+):
+    # config method populates both paths;
+    # load method will choose the first one that exists
+    excerpt_file = tmp_path / "excerpts.csv"
+    excerpt_file.touch()
+    excerpt_gz_file = tmp_path / "excerpts.csv.gz"
+    config = {
+        "compiled_excerpt_file": excerpt_file,
+        "compressed_excerpt_file": excerpt_gz_file,
+    }
+
+    result = load_compiled_excerpts(config)
+    assert result == mock_extract_page_meta.return_value
+
+    mock_extract_page_meta.assert_called_once_with(mock_read_csv.return_value)
+    mock_read_csv.assert_called_once_with(excerpt_file)
+
+    # reset and remove the uncompressed, make the gz exist
+    mock_read_csv.reset_mock()
+    excerpt_file.unlink()
+    excerpt_gz_file.touch()
+    load_compiled_excerpts(config)
+    mock_read_csv.assert_called_once_with(excerpt_gz_file)
+
+
+def test_load_compiled_excerpts_file_not_found(tmp_path):
+    config = {
+        "compiled_excerpt_file": tmp_path / "nonexistent.csv",
+        "compressed_excerpt_file": tmp_path / "nonexistent.csv.gz",
+    }
+
+    with pytest.raises(ValueError, match="Excerpt data file not found"):
+        load_compiled_excerpts(config)
+
+
+@pytest.mark.parametrize(
+    "args,expected_calls",
+    [
+        ([], {"merge", "poem_metadata", "ppa_metadata"}),
+        (["--merge"], {"merge"}),
+        (["--poem_metadata"], {"poem_metadata"}),
+        (["--ppa_metadata"], {"ppa_metadata"}),
+    ],
+)
+@patch("corppa.poetry_detection.compile_dataset.run_ppa_metadata_step")
+@patch("corppa.poetry_detection.compile_dataset.run_poem_metadata_step")
+@patch("corppa.poetry_detection.compile_dataset.run_merge_step")
+@patch("corppa.poetry_detection.compile_dataset.load_compilation_config")
+def test_main(
+    mock_load_config,
+    mock_merge,
+    mock_poem,
+    mock_ppa,
+    args,
+    expected_calls,
+    tmp_path,
+):
+    mock_load_config.return_value = {
+        "test": "config",
+        "output_data_dir": tmp_path,
+    }
+
+    main(args)
+
+    if "merge" in expected_calls:
+        mock_merge.assert_called_once()
+    else:
+        mock_merge.assert_not_called()
+
+    if "poem_metadata" in expected_calls:
+        mock_poem.assert_called_once()
+    else:
+        mock_poem.assert_not_called()
+
+    if "ppa_metadata" in expected_calls:
+        mock_ppa.assert_called_once()
+    else:
+        mock_ppa.assert_not_called()
+
+
+@patch("corppa.poetry_detection.compile_dataset.compress_file")
+@patch("corppa.poetry_detection.compile_dataset.merge_excerpt_files")
+@patch("corppa.poetry_detection.compile_dataset.get_excerpt_sources")
+def test_run_merge_step(mock_get_sources, mock_merge, mock_compress, tmp_path):
+    compile_opts = {
+        "source_excerpt_data": tmp_path / "data/excerpts",
+        "compiled_excerpt_file": tmp_path / "out/excerpts.csv",
+        "compressed_excerpt_file": tmp_path / "out/excerpts.csv.gz",
+    }
+
+    result = run_merge_step(compile_opts, None, compress_excerpts=True)
+    # returns result of merge
+    assert result == mock_merge.return_value
+
+    # get sources is called on the configured path
+    mock_get_sources.assert_called_once_with(compile_opts["source_excerpt_data"])
+    # merge is called with the result of get sources and compile option
+    mock_merge.assert_called_once_with(
+        mock_get_sources.return_value, compile_opts["compiled_excerpt_file"]
+    )
+    # compress is called with the configured paths
+    mock_compress.assert_called_once_with(
+        compile_opts["compiled_excerpt_file"], compile_opts["compressed_excerpt_file"]
+    )
+
+    # call again with no compression
+    mock_compress.reset_mock()
+    run_merge_step(compile_opts, None, compress_excerpts=False)
+    mock_compress.assert_not_called()
+
+
+@patch("corppa.poetry_detection.compile_dataset.save_poem_metadata")
+@patch("corppa.poetry_detection.compile_dataset.extract_page_meta")
+@patch("corppa.poetry_detection.compile_dataset.load_compiled_excerpts")
+def test_run_poem_metadata_step_with_df(mock_load, mock_extract, mock_save, tmp_path):
+    input_df = pl.DataFrame({"id": [1]})
+    mock_extract.return_value = pl.DataFrame({"id": [1], "page_id": ["p.1"]})
+
+    compile_opts = {"poem_metadata_file": tmp_path / "out/poem_meta.csv"}
+
+    run_poem_metadata_step(compile_opts, input_df)
+
+    mock_load.assert_not_called()
+    mock_extract.assert_called_once_with(input_df)
+    mock_save.assert_called_once_with(
+        compile_opts["poem_metadata_file"], mock_extract.return_value
+    )
+
+
+@patch("corppa.poetry_detection.compile_dataset.save_poem_metadata")
+@patch("corppa.poetry_detection.compile_dataset.extract_page_meta")
+@patch("corppa.poetry_detection.compile_dataset.load_compiled_excerpts")
+def test_run_poem_metadata_step(mock_load, mock_extract, mock_save, tmp_path):
+    mock_load.return_value = pl.DataFrame({"id": [1]})
+
+    compile_opts = {"poem_metadata_file": tmp_path / "out/poem_meta.csv"}
+
+    run_poem_metadata_step(compile_opts, None)
+
+    mock_load.assert_called_once_with(compile_opts)
+    mock_extract.assert_not_called()
+    mock_save.assert_called_once_with(
+        compile_opts["poem_metadata_file"], mock_load.return_value
+    )
+
+
+@patch("corppa.poetry_detection.compile_dataset.save_ppa_metadata")
+@patch("corppa.poetry_detection.compile_dataset.add_ref_poems_meta")
+@patch("corppa.poetry_detection.compile_dataset.extract_page_meta")
+@patch("corppa.poetry_detection.compile_dataset.load_compiled_excerpts")
+def test_run_ppa_metadata_step(
+    mock_load, mock_extract, mock_add_ref_poems, mock_save, tmp_path
+):
+    input_df = pl.DataFrame({"id": [1]})
+
+    compile_opts = {
+        "poem_metadata_file": tmp_path / "out/poem_meta.csv",
+        "source_ppa_metadata": tmp_path / "data/ppa_works.csv",
+        "ppa_metadata_file": tmp_path / "out/ppa_meta.csv",
+    }
+
+    # call with excerpt dataframe provided
+    run_ppa_metadata_step(compile_opts, input_df)
+    # doesn't load excerpts because provided
+    mock_load.assert_not_called()
+    # extracts page/work metadata
+    mock_extract.assert_called_once_with(input_df)
+    # loads reference poem metadata
+    mock_add_ref_poems.assert_called_once_with(
+        mock_extract.return_value, compile_opts["poem_metadata_file"]
+    )
+    mock_save.assert_called_once_with(
+        compile_opts["source_ppa_metadata"],
+        compile_opts["ppa_metadata_file"],
+        mock_add_ref_poems.return_value,
+    )
+
+    # call without excerpt df
+    mock_extract.reset_mock()
+    mock_add_ref_poems.reset_mock()
+    mock_save.reset_mock()
+    run_ppa_metadata_step(compile_opts, None)
+    mock_load.assert_called_once_with(compile_opts)
+    mock_extract.assert_not_called()
+    mock_add_ref_poems.assert_called_once_with(
+        mock_load.return_value, compile_opts["poem_metadata_file"]
+    )
+    mock_save.assert_called_once_with(
+        compile_opts["source_ppa_metadata"],
+        compile_opts["ppa_metadata_file"],
+        mock_add_ref_poems.return_value,
+    )
+
+
+def test_compress_file(tmp_path):
+    # integration test to confirm logic works as expected
+    uncompressed_file = tmp_path / "excerpts.csv"
+    compressed_file = tmp_path / "excerpts.csv.gz"
+    # write out content to test round-trip
+    file_contents = "excerpt_id,text\n1,hello\n"
+    uncompressed_file.write_text(file_contents)
+
+    compress_file(uncompressed_file, compressed_file)
+    # uncompressed file should be removed
+    assert not uncompressed_file.exists()
+    # compressed file should now be present
+    assert compressed_file.exists()
+
+    # uncompressed content should match what we wrote out
+    with gzip.open(compressed_file, "rt") as f:
+        content = f.read()
+    assert content == file_contents
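The new test module can be exercised on its own while iterating (assuming a standard pytest setup for the repo):

```console
pytest tests/test_poetry_detection/test_compile_dataset.py
```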
{base_dir / "internet_poems2"} chadwyck-healey: - text_dir: "ch" + text_path: "ch" metadata_path: "ch/chadwyck-healey.csv" other: metadata_path: http://example.com/other-poems.csv @@ -118,6 +118,24 @@ def test_get_config_relative_dir(self, mock_get_config): config_opts["base_dir"] == pathlib.Path(ingredients_dir) / ref_corpora_dir ) + def test_calculate_poem_length(self): + # Test single line text + result = BaseReferenceCorpus.calculate_poem_length("Hello world test") + assert result == {"num_lines": 1, "num_words": 3, "char_len": 16} + + # Test multi-line text with blank lines + text = "Line one here\nLine two here\n\nLine three" + result = BaseReferenceCorpus.calculate_poem_length(text) + assert result == {"num_lines": 3, "num_words": 8, "char_len": len(text)} + + # Test empty text + result = BaseReferenceCorpus.calculate_poem_length("") + assert result == {"num_lines": 0, "num_words": 0, "char_len": 0} + + # Test text with only blank lines + result = BaseReferenceCorpus.calculate_poem_length(" \n\n ") + assert result == {"num_lines": 0, "num_words": 0, "char_len": 8} + # fixture data for internet poems INTERNETPOEMS_TEXTS = [ @@ -133,12 +151,12 @@ def test_get_config_relative_dir(self, mock_get_config): @pytest.fixture -def internetpoems_data_dir(tmp_path): +def internetpoems_data_dir(tmp_path, corppa_test_config): # test fixture to create internet poems data directory with sample text files config_opts = config.get_config() - # use the configured text data dir + # use the configured text data dir from test config data_dir = pathlib.Path( - config_opts["reference_corpora"]["internet_poems"]["text_dir"] + config_opts["reference_corpora"]["internet_poems"]["text_path"] ) data_dir.mkdir(parents=True, exist_ok=True) @@ -149,7 +167,7 @@ def internetpoems_data_dir(tmp_path): @pytest.fixture -def internetpoems_tarball(tmp_path): +def internetpoems_tarball(tmp_path, corppa_test_config_defaults): # test fixture to create tar.gzip of internet poems data directory with sample text files # should be used with default config config_opts = config.get_config() @@ -159,7 +177,7 @@ def internetpoems_tarball(tmp_path): text_file = internetpoems_data_dir / f"{sample['id']}.txt" text_file.write_text(sample["text"]) - tarfile_name = config_opts["reference_corpora"]["internet_poems"]["text_dir"] + tarfile_name = config_opts["reference_corpora"]["internet_poems"]["text_path"] base_dir = pathlib.Path(config_opts["reference_corpora"]["base_dir"]) tarfile_path = base_dir / tarfile_name tarfile_path.parent.mkdir(parents=True, exist_ok=True) @@ -180,14 +198,14 @@ def test_init(self, tmp_path, corppa_test_config): config_opts = config.get_config() # expected data_dir expected_data_dir = pathlib.Path( - config_opts["reference_corpora"]["internet_poems"]["text_dir"] + config_opts["reference_corpora"]["internet_poems"]["text_path"] ) # init should succeed when directory exists expected_data_dir.mkdir(parents=True) internet_poems = InternetPoems() - assert isinstance(internet_poems.text_dir, pathlib.Path) - assert internet_poems.text_dir == expected_data_dir + assert isinstance(internet_poems.text_path, pathlib.Path) + assert internet_poems.text_path == expected_data_dir # error if it is not a directory : remove dir and create a regular file expected_data_dir.rmdir() @@ -203,15 +221,15 @@ def test_get_config(self, mock_pathlib, tmp_path, corppa_test_config): # should pass in reference corpus base directory assert "base_dir" in config_opts # should include ref_corpus specific options, where are in the test config - 
assert "text_dir" in config_opts + assert "text_path" in config_opts @patch.object(InternetPoems, "get_config_opts") def test_get_metadata_df( self, mock_get_config_opts, tmp_path, corppa_test_config, internetpoems_data_dir ): - mock_get_config_opts.return_value = {"text_dir": str(internetpoems_data_dir)} + mock_get_config_opts.return_value = {"text_path": str(internetpoems_data_dir)} internet_poems = InternetPoems() - meta_df = internet_poems.get_metadata_df() + meta_df = internet_poems.get_metadata_df(poem_length=True) assert isinstance(meta_df, pl.DataFrame) assert meta_df.schema == METADATA_SCHEMA assert meta_df.height == len(INTERNETPOEMS_TEXTS) @@ -221,6 +239,29 @@ def test_get_metadata_df( assert meta_row["author"] == "King James Bible" assert meta_row["title"] == "Psalms" assert meta_row["ref_corpus"] == internet_poems.corpus_id + # check poem length calculations (non-blank lines, word count, char length) + assert meta_row["num_lines"] == 1 + assert meta_row["num_words"] == 9 + assert meta_row["char_len"] == len(INTERNETPOEMS_TEXTS[0]["text"]) + + @patch.object(InternetPoems, "get_config_opts") + def test_get_metadata_df_no_poem_length( + self, mock_get_config_opts, tmp_path, corppa_test_config, internetpoems_data_dir + ): + # Test that poem_length=False sets length fields to null + mock_get_config_opts.return_value = {"text_path": str(internetpoems_data_dir)} + internet_poems = InternetPoems() + meta_df = internet_poems.get_metadata_df(poem_length=False) + assert isinstance(meta_df, pl.DataFrame) + # Length fields should be present but null + assert "num_lines" in meta_df.columns + assert "num_words" in meta_df.columns + assert "char_len" in meta_df.columns + # All length values should be null + assert ( + meta_df.select("num_lines", "num_words", "char_len").drop_nulls().height + == 0 + ) def test_get_metadata_df_tarball( self, @@ -247,9 +288,46 @@ def test_get_text_corpus_tarball( internetpoems_tarball, ): internet_poems = InternetPoems() - with pytest.raises(NotImplementedError, match="not supported for tar.gz"): - # returns a generator; use list to get to actually run - list(internet_poems.get_text_corpus()) + # with pytest.raises(NotImplementedError, match="not supported for tar.gz"): + # returns a generator; use list to get to actually run + # convert to list, sort to ensure order matches fixture data + text_data = sorted( + list(internet_poems.get_text_corpus()), key=lambda x: x["poem_id"] + ) + assert len(text_data) == len(INTERNETPOEMS_TEXTS) + assert text_data[0]["poem_id"] == INTERNETPOEMS_TEXTS[0]["id"] + assert text_data[0]["text"] == INTERNETPOEMS_TEXTS[0]["text"] + + @patch.object(InternetPoems, "get_config_opts") + def test_get_text_unsupported( + self, + mock_get_config_opts, + tmp_path, + corppa_test_config_defaults, + ): + zipfile = tmp_path / "internet_poems.zip" + zipfile.touch() + mock_get_config_opts.return_value = { + "text_path": zipfile, + "base_dir": tmp_path / "ref-corpora", + } + with pytest.raises(ValueError, match=".*not a directory or a tar.gz"): + # checks configuration on init + InternetPoems() + + def get_text_corpus_unsupported( + self, mock_get_config_opts, tmp_path, corppa_test_config_defaults + ): + zipfile = tmp_path / "internet_poems.zip" + zipfile.touch() + # init normally to by pass the check path type check + internet_poems = InternetPoems() + # patch in our zip file + internet_poems.text_path = zipfile + with pytest.raises( + NotImplementedError, match="only supported for tar.gz and directories" + ): + internet_poems.get_text_corpus() 
 
     @patch.object(InternetPoems, "get_config_opts")
     def test_get_text_corpus(
         self,
@@ -260,7 +338,7 @@ def test_get_text_corpus(
         internetpoems_data_dir,
     ):
         mock_get_config_opts.return_value = {
-            "text_dir": str(internetpoems_data_dir),
+            "text_path": str(internetpoems_data_dir),
             "base_dir": tmp_path / "ref-corpora",
         }
         internet_poems = InternetPoems()
 
@@ -274,15 +352,15 @@
 @pytest.fixture
-def chadwyck_healey_csv(tmp_path):
+def chadwyck_healey_csv(tmp_path, corppa_test_config):
     "fixture to create a test version of the chadwyck-healey metadata csv file"
-    # test fixture to create internet poems data directory with sample text files
+    # test fixture to create chadwyck-healey data directory with sample metadata csv
     config_opts = config.get_config()
-    # use the configured data paths or configured ref_corpus base_dir and defaults
+    # use the configured data paths from test config
     base_dir = pathlib.Path(config_opts["reference_corpora"]["base_dir"])
 
     override_opts = config_opts["reference_corpora"][ChadwyckHealey.corpus_id]
-    data_dir = pathlib.Path(override_opts["text_dir"])
+    data_dir = pathlib.Path(override_opts["text_path"])
     ch_meta_csv = pathlib.Path(override_opts["metadata_path"])
 
     # in either case, make relative to base dir if not absolute
@@ -310,7 +388,8 @@ def test_get_metadata_df(self, tmp_path, corppa_test_config, chadwyck_healey_csv):
         chadwyck_healey = ChadwyckHealey()
         meta_df = chadwyck_healey.get_metadata_df()
         assert isinstance(meta_df, pl.DataFrame)
-        assert meta_df.schema == METADATA_SCHEMA
+        # schema is a subset because we don't include poem lengths
+        assert all(key in METADATA_SCHEMA for key in meta_df.schema.keys())
         # csv fixture data currently has one row
         assert meta_df.height == 1
         # get the first row as a dict and check values
         assert meta_row["title"] == "THE CAVERN OF WOE."
         assert meta_row["ref_corpus"] == chadwyck_healey.corpus_id
 
+    def test_get_metadata_df_with_poem_length(
+        self, tmp_path, corppa_test_config, chadwyck_healey_csv
+    ):
+        # Create a text file for the poem to test poem length calculation
+        chadwyck_healey = ChadwyckHealey()
+        text_dir = chadwyck_healey.text_path
+        # three lines, eight words
+        text_content = "Line one here\nLine two here\nLine three"
+        text_file = text_dir / "Z300475611.txt"
+        text_file.write_text(text_content)
+
+        meta_df = chadwyck_healey.get_metadata_df(poem_length=True)
+        assert isinstance(meta_df, pl.DataFrame)
+        # Should include length fields
+        assert "num_lines" in meta_df.columns
+        assert "num_words" in meta_df.columns
+        assert "char_len" in meta_df.columns
+
+        meta_row = meta_df.row(0, named=True)
+        # 3 non-blank lines
+        assert meta_row["num_lines"] == 3
+        # 8 words total
+        assert meta_row["num_words"] == 8
+        # character length (including newlines)
+        assert meta_row["char_len"] == len(text_content)
+
     # get_text_corpus method is not tested here because it is inherited;
     # logic is shared with InternetPoems and tested there
 
@@ -351,7 +456,8 @@ def test_get_metadata_df(
         opoems = OtherPoems()
         meta_df = opoems.get_metadata_df()
         assert isinstance(meta_df, pl.DataFrame)
-        assert meta_df.schema == METADATA_SCHEMA
+        # schema is a subset because we don't include poem lengths
+        assert all(key in METADATA_SCHEMA for key in meta_df.schema.keys())
         assert meta_df.height == len(OTHERPOEM_METADATA)
         # check values on the first row
         meta_row = meta_df.row(0, named=True)
@@ -448,6 +554,7 @@ def test_save_poem_metadata(
     otherpoems_metadata_df,
 ):
     # data fixtures should ensure that all the expected directories exist
 
     # add corpus id to other poems data frame and patch it to be returned
     otherpoems_metadata_df = otherpoems_metadata_df.with_columns(
@@ -471,3 +578,83 @@
     save_poem_metadata(output_file)
     captured = capsys.readouterr()
     assert "Replacing" in captured.out
+
+
+def test_save_poem_metadata_with_excerpts(
+    tmp_path,
+    capsys,
+    corppa_test_config,
+    internetpoems_data_dir,
+    chadwyck_healey_csv,
+    otherpoems_metadata_df,
+):
+    # Test the case where excerpts_df is provided - tests aggregation logic
+
+    # add corpus id to other poems data frame and patch it to be returned
+    otherpoems_metadata_df = otherpoems_metadata_df.with_columns(
+        ref_corpus=pl.lit(OtherPoems.corpus_id)
+    )
+
+    # Create sample excerpts dataframe with poem data
+    # Use poem IDs from the INTERNETPOEMS_TEXTS global variable
+    excerpts_df = pl.from_dicts(
+        [
+            # two excerpts for poem 0 from the same work, two different pages
+            {
+                "poem_id": INTERNETPOEMS_TEXTS[0]["id"],
+                "excerpt_id": "p@1:10",
+                "ppa_work_id": "work1",
+                "page_id": "page1",
+            },
+            {
+                "poem_id": INTERNETPOEMS_TEXTS[0]["id"],
+                "excerpt_id": "p@3:30",
+                "ppa_work_id": "work1",
+                "page_id": "page2",
+            },
+            # one excerpt for poem 1
+            {
+                "poem_id": INTERNETPOEMS_TEXTS[1]["id"],
+                "excerpt_id": "ex3",
+                "ppa_work_id": "work2",
+                "page_id": "page3",
+            },
+        ]
+    )
+
+    aggregation_fields = ["num_excerpts", "num_ppa_works", "num_ppa_pages"]
+
+    with patch.object(
+        OtherPoems, "get_metadata_df", return_value=otherpoems_metadata_df
+    ):
+        output_file = tmp_path / "poem_meta.csv"
+        save_poem_metadata(output_file, excerpts_df=excerpts_df)
+        assert output_file.exists()
+
+    # Read the output CSV and check for aggregate columns
+    result_df = pl.read_csv(output_file)
+    # all fields should be present
+    for field in aggregation_fields:
+        assert field in result_df.columns
+
+    # Check that poem with 2 excerpts has correct counts
+    psalms_row = result_df.filter(
+        pl.col("poem_id") == INTERNETPOEMS_TEXTS[0]["id"]
+    ).row(0, named=True)
+    # two excerpts from one work, different pages
+    assert psalms_row["num_excerpts"] == 2
+    assert psalms_row["num_ppa_works"] == 1
+    assert psalms_row["num_ppa_pages"] == 2
+
+    # Check that poem with 1 excerpt has correct counts
+    mary_row = result_df.filter(pl.col("poem_id") == INTERNETPOEMS_TEXTS[1]["id"]).row(
+        0, named=True
+    )
+    # one excerpt, all counts are 1
+    assert all(mary_row[value] == 1 for value in aggregation_fields)
+
+    # Check that poems without excerpts (from otherpoems) have zero counts
+    for poem_info in result_df.filter(
+        pl.col("poem_id").is_in(OTHERPOEM_METADATA[0])
+    ).iter_rows(named=True):
+        assert all(poem_info[value] == 0 for value in aggregation_fields)