diff --git a/sample_config.yml b/sample_config.yml
index 9fe8aa29..2bd4e826 100644
--- a/sample_config.yml
+++ b/sample_config.yml
@@ -26,11 +26,11 @@ reference_corpora:
   # internet_poems:
   #   tarball of directory of text files OR expanded directory;
   #   some functionality will only work with the expanded directory
-  #   text_dir: "internet_poems/internet_poems_texts.tar.gz"
+  #   text_path: "internet_poems/internet_poems_texts.tar.gz"
   # chadwyck-healey:
   #   tarball of directory of text files OR expanded directory;
   #   some functionality will only work with the expanded directory
-  #   text_dir: "chadwyck-healey/chadwyck-healey_texts.tar.gz"
+  #   text_path: "chadwyck-healey/chadwyck-healey_texts.tar.gz"
   #   metadata_path: "chadwyck-healey/chadwyck-healey.csv"
   other:
     # Provide a URL or local path to "Other Poems" metadata
diff --git a/src/corppa/config.py b/src/corppa/config.py
index 1fb01dbb..4673a66c 100644
--- a/src/corppa/config.py
+++ b/src/corppa/config.py
@@ -29,11 +29,11 @@
     "base_dir": "ref-corpora",
     # paths are relative to base_dir
     "internet_poems": {
-        # tarball of directory of text files OR expanded directory
-        "text_dir": "internet_poems/internet_poems_texts.tar.gz"
+        # tarball of text files OR expanded directory
+        "text_path": "internet_poems/internet_poems_texts.tar.gz"
     },
     "chadwyck-healey": {
-        "text_dir": "chadwyck-healey/chadwyck-healey_texts.tar.gz",
+        "text_path": "chadwyck-healey/chadwyck-healey_texts.tar.gz",
        "metadata_path": "chadwyck-healey/chadwyck-healey.csv",
     },
     # other poems metadata_path configuration required
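As context for the rename: a relative `text_path` is resolved against the configured `base_dir` (see `LocalTextCorpus.__init__` below). A minimal sketch of that resolution, with illustrative values standing in for the real configuration:

```python
import pathlib

# illustrative values; the real ones come from the corppa YAML config
base_dir = pathlib.Path("ref-corpora")
text_path = pathlib.Path("internet_poems/internet_poems_texts.tar.gz")

# mirrors the resolution logic in LocalTextCorpus.__init__
if not text_path.is_absolute():
    text_path = base_dir / text_path
# -> ref-corpora/internet_poems/internet_poems_texts.tar.gz
```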
+ """ return list(excerpt_data_dir.glob("**/*.csv")) + list( excerpt_data_dir.glob("**/*.csv.gz") ) - # wondered about using find_relative_paths here, but we actually - # want non-relative paths and we need to handle a two-part extension - # return [ - # excerpt_data_dir / rel_path - # for rel_path in find_relative_paths(excerpt_data_dir, exts=[".csv", ".gz"]) # can we assume .gz == .csv.gz ? - # ] -def save_ppa_metadata(input_file: pathlib.Path, output_file: pathlib.Path): +def save_ppa_metadata( + input_file: pathlib.Path, output_file: pathlib.Path, excerpts_df: pl.DataFrame +): + """ + Save PPA work metadata with work-level excerpt totals. + Takes a PPA metadata file as input, a path for the output file, + and a dataframe of merged excerpt data. + Raises a ValueError if metadata file is not a CSV. + """ # copy as-is, do not rename or subset any fields # NOTE: currently assumes and only supports PPA metadata in csv format if input_file.suffix != ".csv": @@ -152,11 +178,32 @@ def save_ppa_metadata(input_file: pathlib.Path, output_file: pathlib.Path): f"PPA metadata must be loaded as CSV, got {input_file.suffix.lstrip('.')}" ) ppa_meta_df = pl.read_csv(input_file) - # TODO: add aggregate counts here + + # get work-level aggregate excerpt totals + excerpt_totals_df = excerpts_df.group_by("ppa_work_id").agg( + pl.col("excerpt_id").n_unique().alias("num_excerpts"), + pl.col("poem_id").n_unique().alias("num_poems"), + pl.col("poem_author").n_unique().alias("num_poets"), + ) + + # combine the totals with ppa work metadata + ppa_meta_df = ppa_meta_df.join( + excerpt_totals_df, left_on="work_id", right_on="ppa_work_id", how="left" + ).with_columns( + # fill any missing values with zeroes + pl.col("num_excerpts").fill_null(pl.lit(0)), + pl.col("num_poems").fill_null(pl.lit(0)), + pl.col("num_poets").fill_null(pl.lit(0)), + ) + ppa_meta_df.write_csv(output_file) -def compress_file(uncompressed_file, compressed_file): +def compress_file(uncompressed_file: pathlib.Path, compressed_file: pathlib.Path): + """ + Compress the `uncompressed_file` passed in with gzip, + saving it at the `compressed_file` path and deleting the original. + """ with open(str(uncompressed_file), "rb") as inputfile: with gzip.open(str(compressed_file), "wb") as output_file: shutil.copyfileobj(inputfile, output_file) @@ -165,7 +212,73 @@ def compress_file(uncompressed_file, compressed_file): uncompressed_file.unlink() -def main(): +def run_merge_step( + compile_opts: CompileOpts, excerpts_df: pl.DataFrame | None, compress_excerpts: bool +) -> pl.DataFrame: + """Run the merge excerpts step. Finds source excerpt files from the configured + path, merges excerpts, saves to CSV, and optionally compresses the CSV file. + """ + print("## Merging excerpts") + excerpt_sources = get_excerpt_sources(compile_opts["source_excerpt_data"]) + excerpts_df = merge_excerpt_files( + excerpt_sources, compile_opts["compiled_excerpt_file"] + ) + if compress_excerpts: + print( + f"Compressing excerpt data... {compile_opts['compiled_excerpt_file']} → {compile_opts['compressed_excerpt_file']}" + ) + compress_file( + compile_opts["compiled_excerpt_file"], + compile_opts["compressed_excerpt_file"], + ) + return excerpts_df + + +def run_poem_metadata_step( + compile_opts: CompileOpts, excerpts_df: pl.DataFrame | None = None +) -> None: + """Run the poem metadata compilation step. Uses excerpt data + (passed in or loaded from compile opts path) to calculate + poem excerpt totals. 
+ """ + print("\n## Compiling reference corpora metadata") + if excerpts_df is None: + excerpts_df = load_compiled_excerpts(compile_opts) + else: + excerpts_df = extract_page_meta(excerpts_df) + save_poem_metadata(compile_opts["poem_metadata_file"], excerpts_df) + + +def run_ppa_metadata_step( + compile_opts: CompileOpts, excerpts_df: pl.DataFrame | None = None +) -> None: + """Run the PPA metadata compilation step. Uses excerpt data (passed + in or loaded from compile opts path) to calculate work-level + excerpt totals. + """ + print("\n## PPA work-level metadata") + if excerpts_df is None: + excerpts_df = load_compiled_excerpts(compile_opts) + else: + excerpts_df = extract_page_meta(excerpts_df) + + excerpts_df = add_ref_poems_meta(excerpts_df, compile_opts["poem_metadata_file"]) + + save_ppa_metadata( + compile_opts["source_ppa_metadata"], + compile_opts["ppa_metadata_file"], + excerpts_df, + ) + + +def main(cmd_args=None) -> None: + """ + Main entry point for the dataset compilation script. Parses + arguments to determine which steps to run. + """ + # allow passing arguments in; if not specified, draw from sys.argv/command line + if cmd_args is None: + cmd_args = sys.argv[1:] parser = argparse.ArgumentParser(description="Compile PPA found-poems dataset") parser.add_argument( "--compress-excerpts", @@ -191,40 +304,26 @@ def main(): action="append_const", const=step, ) - args = parser.parse_args() - compilation_steps = args.steps # None or list of steps + args = parser.parse_args(cmd_args) + # if not specified, run all steps + compilation_steps = args.steps if args.steps else list(compilation_steps.keys()) compile_opts = load_compilation_config() - if compilation_steps is None or "merge" in compilation_steps: - print("## Merging excerpts") - # find excerpt source files to be included in the compiled dataset file - excerpt_sources = get_excerpt_sources(compile_opts["source_excerpt_data"]) - # merge into a single uncompressed csv - # (polars doesn't currently support writing directly to a csv.gz) - merge_excerpt_files(excerpt_sources, compile_opts["compiled_excerpt_file"]) - # compress the resulting file if requested - if args.compress_excerpts: - print( - f"Compressing excerpt data... ({compile_opts['compiled_excerpt_file']} → {compile_opts['compressed_excerpt_file']})" - ) - compress_file( - compile_opts["compiled_excerpt_file"], - compile_opts["compressed_excerpt_file"], - ) + excerpts_df = None + if "merge" in compilation_steps: + excerpts_df = run_merge_step(compile_opts, excerpts_df, args.compress_excerpts) - if compilation_steps is None or "poem_metadata" in compilation_steps: - print("\n## Compiling reference corpora metadata") - save_poem_metadata(compile_opts["poem_metadata_file"]) + if "poem_metadata" in compilation_steps: + run_poem_metadata_step(compile_opts, excerpts_df) - if compilation_steps is None or "ppa_metadata" in compilation_steps: - print("\n## PPA work-level metadata") - save_ppa_metadata( - compile_opts["source_ppa_metadata"], compile_opts["ppa_metadata_file"] - ) + if "ppa_metadata" in compilation_steps: + run_ppa_metadata_step(compile_opts, excerpts_df) - print("\nRemember to commit and push the updated data files") - print(f"cd {compile_opts['output_data_dir'].parent} && git add data/*") + # probably not relevant anymore, not using git-lfs for this data... 
+ print(f"Output files in {compile_opts['output_data_dir']}") + # print("\nRemember to commit and push the updated data files") + # print(f"cd {compile_opts['output_data_dir'].parent} && git add data/*") if __name__ == "__main__": diff --git a/src/corppa/poetry_detection/merge_excerpts.py b/src/corppa/poetry_detection/merge_excerpts.py index 7ff5869f..e034f184 100755 --- a/src/corppa/poetry_detection/merge_excerpts.py +++ b/src/corppa/poetry_detection/merge_excerpts.py @@ -194,7 +194,9 @@ def merge_excerpts( return pl.concat([output_df, merged_output_df], how="diagonal") -def merge_excerpt_files(input_files, output_file): +def merge_excerpt_files( + input_files: list[pathlib.Path], output_file: pathlib.Path +) -> pl.DataFrame: total_excerpts = 0 input_dfs = [] @@ -259,7 +261,11 @@ def merge_excerpt_files(input_files, output_file): # row is a tuple of value, count print(f"\t{row[0]}: {row[1]:,}") + # polars supports compression; but not sure what version it + # was added in, and documentation says it is unstable. Use that in future excerpts.write_csv(output_file) + # return excerpt data frame + return excerpts def main(): diff --git a/src/corppa/poetry_detection/ref_corpora.py b/src/corppa/poetry_detection/ref_corpora.py index baa91099..4bd64e5d 100644 --- a/src/corppa/poetry_detection/ref_corpora.py +++ b/src/corppa/poetry_detection/ref_corpora.py @@ -1,12 +1,15 @@ -import os.path +import logging import pathlib -import tarfile from collections.abc import Generator +from typing import Optional import polars as pl from corppa.config import get_config -from corppa.utils.build_text_corpus import build_text_corpus +from corppa.utils.build_text_corpus import build_text_corpus, text_corpus_from_tarfile + +logger = logging.getLogger(__name__) + #: schema for reference corpora metadata :class:`pl.DataFrame` METADATA_SCHEMA = { @@ -14,6 +17,9 @@ "author": pl.String, "title": pl.String, "ref_corpus": pl.String, + "num_lines": pl.Int64, + "num_words": pl.Int64, + "char_len": pl.Int64, } @@ -25,7 +31,7 @@ class BaseReferenceCorpus: corpus_id: str corpus_name: str - text_dir: pathlib.Path + text_path: pathlib.Path metadata_path: pathlib.Path | str def get_config_opts(self) -> dict: @@ -60,10 +66,24 @@ def get_config_opts(self) -> dict: corpus_opts.update(config_opts["reference_corpora"].get(self.corpus_id, {})) return corpus_opts - def get_metadata_df(self) -> pl.DataFrame: + @staticmethod + def calculate_poem_length(text: str) -> dict[str, int]: + """Calculate poem length metrics from text content. Takes the + text of the poem and returns a dictionary num_lines (non-blank lines), + num_words, and char_len. + """ + return { + "num_lines": len([line for line in text.splitlines() if line.strip()]), + "num_words": len(text.split()), + "char_len": len(text), + } + + def get_metadata_df(self, poem_length=False) -> pl.DataFrame: """Minimal common poetry metadata for use across reference corpora. Should return a :class:`pl.DataFrame` with poem_id, author, title, and - ref_corpus for each poem in this corpus.""" + ref_corpus for each poem in this corpus. 
diff --git a/src/corppa/poetry_detection/ref_corpora.py b/src/corppa/poetry_detection/ref_corpora.py
index baa91099..4bd64e5d 100644
--- a/src/corppa/poetry_detection/ref_corpora.py
+++ b/src/corppa/poetry_detection/ref_corpora.py
@@ -1,12 +1,15 @@
-import os.path
+import logging
 import pathlib
-import tarfile
 from collections.abc import Generator
+from typing import Optional
 
 import polars as pl
 
 from corppa.config import get_config
-from corppa.utils.build_text_corpus import build_text_corpus
+from corppa.utils.build_text_corpus import build_text_corpus, text_corpus_from_tarfile
+
+logger = logging.getLogger(__name__)
+
 
 #: schema for reference corpora metadata :class:`pl.DataFrame`
 METADATA_SCHEMA = {
@@ -14,6 +17,9 @@
     "author": pl.String,
     "title": pl.String,
     "ref_corpus": pl.String,
+    "num_lines": pl.Int64,
+    "num_words": pl.Int64,
+    "char_len": pl.Int64,
 }
 
 
@@ -25,7 +31,7 @@ class BaseReferenceCorpus:
 
     corpus_id: str
     corpus_name: str
-    text_dir: pathlib.Path
+    text_path: pathlib.Path
     metadata_path: pathlib.Path | str
 
     def get_config_opts(self) -> dict:
@@ -60,10 +66,24 @@ def get_config_opts(self) -> dict:
         corpus_opts.update(config_opts["reference_corpora"].get(self.corpus_id, {}))
         return corpus_opts
 
-    def get_metadata_df(self) -> pl.DataFrame:
+    @staticmethod
+    def calculate_poem_length(text: str) -> dict[str, int]:
+        """Calculate poem length metrics from text content. Takes the
+        text of the poem and returns a dictionary with num_lines (non-blank lines),
+        num_words, and char_len.
+        """
+        return {
+            "num_lines": len([line for line in text.splitlines() if line.strip()]),
+            "num_words": len(text.split()),
+            "char_len": len(text),
+        }
+
+    def get_metadata_df(self, poem_length=False) -> pl.DataFrame:
         """Minimal common poetry metadata for use across reference corpora.
         Should return a :class:`pl.DataFrame` with poem_id, author, title, and
-        ref_corpus for each poem in this corpus."""
+        ref_corpus for each poem in this corpus. When poem_length is requested,
+        should also include poem length information (number of lines, words,
+        and characters in the text)."""
         raise NotImplementedError
 
     def get_text_corpus(self) -> Generator[dict[str, str]]:
@@ -87,36 +107,40 @@ def __init__(self):
         config_opts = self.get_config_opts()
         # get text directory for this reference corpus from app configuration
-        if "text_dir" in config_opts:
-            self.text_dir = pathlib.Path(config_opts["text_dir"])
-            # if text dir is not absolute, assume relative to ref_corpus base dir
-            if not self.text_dir.is_absolute():
-                self.text_dir = config_opts["base_dir"] / self.text_dir
-
-            if not self.text_dir.exists():
+        if "text_path" in config_opts:
+            self.text_path = pathlib.Path(config_opts["text_path"])
+            # if text path is not absolute, assume relative to ref_corpus base dir
+            # TODO: shift relative path logic to config loader
+            if not self.text_path.is_absolute():
+                self.text_path = config_opts["base_dir"] / self.text_path
+
+            if not self.text_path.exists():
                 raise ValueError(
-                    f"Configuration error: {self.corpus_name} path {self.text_dir} does not exist"
+                    f"Configuration error: {self.corpus_name} path {self.text_path} does not exist"
                 )
-            # TODO: allow tar.gz here; determine which and set a flag?
-            if not self.text_dir.is_dir() and not (
-                self.text_dir.is_file() and self.text_dir.name.endswith(".tar.gz")
+            # Currently supports directory and tar.gz file;
+            # might be nice to support zipfile as well
+            if not self.text_path.is_dir() and not (
+                self.text_path.is_file() and self.text_path.name.endswith(".tar.gz")
             ):
                 raise ValueError(
-                    f"Configuration error: {self.corpus_name} path {self.text_dir} is not a directory or a tar.gz"
+                    f"Configuration error: {self.corpus_name} path {self.text_path} is not a directory or a tar.gz"
                )
 
-    def get_text_corpus(
-        self, disable_progress: bool = True
-    ) -> Generator[dict[str, str]]:
-        # if text_dir is tarball, raise not implemented error
-        if not self.text_dir.is_dir():
+    def get_text_corpus(self, disable_progress: bool = True) -> Generator[dict[str, str]]:
+        # choose the corpus generator based on the configured path type
+        if self.text_path.is_dir():
+            corpus_method = build_text_corpus
+        elif self.text_path.name.endswith(".tar.gz"):
+            corpus_method = text_corpus_from_tarfile
+        else:
             raise NotImplementedError(
-                "text corpus generation is not supported for tar.gz; configure a directory"
+                "text corpus generation is only supported for tar.gz and directories"
            )
         # build_text_corpus method returns id, so rename id to poem_id
         yield from (
             {"poem_id": p["id"], "text": p["text"]}
-            for p in build_text_corpus(self.text_dir, disable_progress=disable_progress)
+            for p in corpus_method(self.text_path, disable_progress=disable_progress)
         )
 
 
@@ -131,40 +155,30 @@ class InternetPoems(LocalTextCorpus):
     #: id for this reference corpus: internet_poems
     corpus_id: str = "internet_poems"
     corpus_name: str = "Internet Poems"
-    # inherits text_dir path
+    # inherits text_path
 
     # no init/validation needed beyond that provided by LocalTextCorpus
 
-    def get_metadata_df(self) -> pl.DataFrame:
+    def get_metadata_df(self, poem_length=False) -> pl.DataFrame:
         metadata = []
-
-        # if configured text_dir is a directory, get list of names
-        # from the filesystem
-        if self.text_dir.is_dir():
-            text_ids = [file.stem for file in self.text_dir.glob("*.txt")]
-        # otherwise, get from tar archive list
-        else:
-            with tarfile.open(str(self.text_dir), "r:gz") as text_archive:
-                text_ids = [
-                    os.path.splitext(os.path.basename(name))[0]
-                    for name in text_archive.getnames()
-                    if name.endswith(".txt")
-                ]
-
-        # filename without extension is poem identifier
-        for poem_id in text_ids:
+        # returns a generator of dicts with id and text string
+        # TODO: when called from compile script, might be nice to show progress bar
+        for poem in self.get_text_corpus():
             # filename format:
             # Firstname-Lastname_Poem-Title.txt
             # Replace - with spaces and split on - to separate author/title
-            author, title = poem_id.replace("-", " ").split("_", 1)
-            metadata.append(
-                {
-                    "poem_id": poem_id,
-                    "author": author,
-                    "title": title,
-                    "ref_corpus": self.corpus_id,
-                }
-            )
+            author, title = poem["poem_id"].replace("-", " ").split("_", 1)
+            poem_metadata: dict[str, str | int] = {
+                "poem_id": poem["poem_id"],
+                "author": author,
+                "title": title,
+                "ref_corpus": self.corpus_id,
+            }
+            if poem_length:
+                poem_metadata.update(self.calculate_poem_length(poem["text"]))
+
+            metadata.append(poem_metadata)
+
         return pl.from_dicts(metadata, schema=METADATA_SCHEMA)
 
 
@@ -177,10 +191,10 @@ class ChadwyckHealey(LocalTextCorpus):
     #: id for this reference corpus: chadwyck-healey
     corpus_id: str = "chadwyck-healey"
     corpus_name: str = "Chadwyck-Healey"
-    # inherits text_dir path
+    # inherits text_path
 
     def __init__(self):
-        # use LocalTextCorpus init to configure and validate text_dir
+        # use LocalTextCorpus init to configure and validate text_path
         super().__init__()
         # get configuration to set metadata path
         config_opts = self.get_config_opts()
@@ -194,9 +208,9 @@ def __init__(self):
                 f"Configuration error: {self.corpus_name} metadata {self.metadata_path} does not exist"
             )
 
-    def get_metadata_df(self) -> pl.DataFrame:
+    def get_metadata_df(self, poem_length=False) -> pl.DataFrame:
         # disable schema inference; the fields we care about are all strings
-        return (
+        df = (
             pl.read_csv(self.metadata_path, infer_schema=False)
             # rename fields
             .rename({"title_main": "title", "id": "poem_id"})
@@ -212,6 +226,28 @@ def get_metadata_df(self) -> pl.DataFrame:
             .select(["poem_id", "author", "title", "ref_corpus"])
         )
 
+        if poem_length:
+            poem_lengths = []
+
+            # text corpus returns a generator of dicts with id and text string
+            # NOTE: when called from compile script, might be nice to show progress bar
+            for poem in self.get_text_corpus():
+                poem_lengths.append(
+                    {
+                        "poem_id": poem["poem_id"],
+                        **self.calculate_poem_length(poem["text"]),
+                    }
+                )
+            if poem_lengths:
+                poem_length_df = pl.from_dicts(poem_lengths)
+                df = df.join(poem_length_df, on="poem_id")
+            else:
+                logger.warning(
+                    "Poem length requested but none calculated (no text files found?)"
+                )
+
+        return df
+
 
 class OtherPoems(BaseReferenceCorpus):
     """A metadata-only reference corpus with metadata for poems that have
@@ -239,7 +275,7 @@ def __init__(self):
                 f"Configuration error: {self.corpus_name} 'metadata_path' is not set"
             )
 
-    def get_metadata_df(self) -> pl.DataFrame:
+    def get_metadata_df(self, poem_length=False) -> pl.DataFrame:
         # polars can load csv directly from a url
         return pl.read_csv(self.metadata_path, schema=METADATA_SCHEMA).with_columns(
             ref_corpus=pl.lit(self.corpus_id)
@@ -260,20 +296,22 @@ def fulltext_corpora() -> list[BaseReferenceCorpus]:
     return [InternetPoems(), ChadwyckHealey()]
 
 
-def compile_metadata_df() -> pl.DataFrame:
+def compile_metadata_df(poem_length=False) -> pl.DataFrame:
     """Compile poetry metadata from all reference corpora into a single
     polars DataFrame with reference corpus ids."""
-    # create an empty dataframe with the intended fields
-    poem_metadata = pl.DataFrame([], schema=METADATA_SCHEMA)
+    # Combine poem metadata from all reference corpora
 
-    # for each corpus, load poem metadata into a polars dataframe,
-    # rename id to poem_id, and add a column with the corpus id
-    for ref_corpus in all_corpora():
-        poem_metadata.extend(ref_corpus.get_metadata_df())
-    return poem_metadata
+    # use a diagonal concat instead of vstack/extend
+    ref_corpora_dfs = [
+        ref_corpus.get_metadata_df(poem_length=poem_length)
+        for ref_corpus in all_corpora()
+    ]
+    return pl.concat(ref_corpora_dfs, how="diagonal")
 
 
-def save_poem_metadata(output_file: pathlib.Path):
+def save_poem_metadata(
+    output_file: pathlib.Path, excerpts_df: Optional[pl.DataFrame] = None
+):
     """Generate and save compiled poetry metadata as a data file in
     the poem dataset.
     """
@@ -283,7 +321,7 @@ def save_poem_metadata(output_file: pathlib.Path):
         output_verb = "Replacing"
 
     print(f"{output_verb} {output_file}")
-    df = compile_metadata_df()
+    df = compile_metadata_df(poem_length=True)
     ref_corpus_names = {
         ref_corpus.corpus_id: ref_corpus.corpus_name for ref_corpus in all_corpora()
     }
@@ -294,5 +332,23 @@
         # row is a tuple of value, count; convert reference corpus id to name
         totals.append(f"{ref_corpus_names[value]}: {count:,}")
 
+    # when excerpt data is present, calculate & include aggregate totals
+    if excerpts_df is not None:
+        # get poem-level aggregate excerpt totals
+        # (only includes primary poem ids, not alt poem ids)
+        excerpt_totals_df = excerpts_df.group_by("poem_id").agg(
+            pl.col("excerpt_id").n_unique().alias("num_excerpts"),
+            pl.col("ppa_work_id").n_unique().alias("num_ppa_works"),
+            pl.col("page_id").n_unique().alias("num_ppa_pages"),
+            # number of unique ppa authors would be nice, but requires joining ppa metadata
+        )
+        # combine the totals with poem metadata
+        df = df.join(excerpt_totals_df, on="poem_id", how="left").with_columns(
+            # fill any missing values with zeroes
+            pl.col("num_excerpts").fill_null(pl.lit(0)),
+            pl.col("num_ppa_works").fill_null(pl.lit(0)),
+            pl.col("num_ppa_pages").fill_null(pl.lit(0)),
+        )
+
     print(f"{df.height:,} poem metadata entries ({'; '.join(totals)})")
     df.write_csv(output_file, include_bom=True)
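The length metrics are simple enough to verify by hand; a worked example of `calculate_poem_length` on a short text (illustrative input, not fixture data):

```python
from corppa.poetry_detection.ref_corpora import BaseReferenceCorpus

# two non-blank lines separated by a blank line
text = "Tyger Tyger, burning bright,\n\nIn the forests of the night;"
print(BaseReferenceCorpus.calculate_poem_length(text))
# {'num_lines': 2, 'num_words': 10, 'char_len': 58}
```

The diagonal concat in `compile_metadata_df` matters here: metadata-only corpora like `OtherPoems` may lack the length columns, and `pl.concat(..., how="diagonal")` fills any missing columns with nulls rather than raising a schema mismatch.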
diff --git a/src/corppa/utils/build_text_corpus.py b/src/corppa/utils/build_text_corpus.py
index 8ae288b9..4c56effa 100644
--- a/src/corppa/utils/build_text_corpus.py
+++ b/src/corppa/utils/build_text_corpus.py
@@ -26,6 +26,7 @@
 
 import argparse
 import sys
+import tarfile
 from pathlib import Path
 
 import orjsonl
@@ -44,13 +45,13 @@ def get_text_record(text_file: Path) -> dict[str, str]:
 
 
 def build_text_corpus(
-    input_dir: Path, disable_progress: bool = False
+    input_path: Path, disable_progress: bool = False
 ) -> dict[str, str]:
     """
     Generates text records for each text file within input directory
     """
     progress_bar = tqdm(
-        input_dir.glob("**/*.txt"),
+        input_path.glob("**/*.txt"),
         bar_format="Read {n:,} pages{postfix} | elapsed: {elapsed}",
         disable=disable_progress,
     )
@@ -58,6 +59,34 @@ def build_text_corpus(
         yield get_text_record(text_file)
 
 
+def text_corpus_from_tarfile(
+    input_path: Path, disable_progress: bool = False
+) -> dict[str, str]:
+    """
+    Generate text records for each text file within a tar.gz archive
+    """
+    # NOTE: could make compression optional, currently assumes gz
+    with tarfile.open(str(input_path), "r:gz") as tar_archive:
+        for member in tqdm(
+            tar_archive.getmembers(),
+            bar_format="Read {n:,} files{postfix} | elapsed: {elapsed}",
+            disable=disable_progress,
+        ):
+            # skip any OSX metadata files included in the archive
+            if "._" in member.name:
+                continue
+
+            # read contents of text files and yield filename and contents
+            if member.name.endswith(".txt"):
+                txtfile = tar_archive.extractfile(member)
+                if txtfile is not None:
+                    yield {
+                        "id": Path(member.name).stem,
+                        "text": txtfile.read().decode("utf-8"),
+                    }
+
+
 def save_text_corpus(
     input_dir: Path,
     output_file=Path,
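Both generators yield the same record shape (`id` and `text`), so callers can consume them interchangeably; a sketch, assuming a hypothetical `poems.tar.gz` archive of `.txt` files:

```python
from pathlib import Path

from corppa.utils.build_text_corpus import text_corpus_from_tarfile

# hypothetical archive path; each record has the filename stem as id
for record in text_corpus_from_tarfile(Path("poems.tar.gz"), disable_progress=True):
    print(record["id"], len(record["text"]))
```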
diff --git a/tests/test_poetry_detection/test_compile_dataset.py b/tests/test_poetry_detection/test_compile_dataset.py
new file mode 100644
index 00000000..658b21d3
--- /dev/null
+++ b/tests/test_poetry_detection/test_compile_dataset.py
@@ -0,0 +1,306 @@
+# Copyright (c) 2026, Center for Digital Humanities, Princeton University
+# SPDX-License-Identifier: Apache-2.0
+
+
+import gzip
+from unittest.mock import patch
+
+import polars as pl
+import pytest
+
+from corppa.poetry_detection.compile_dataset import (
+    compress_file,
+    get_excerpt_sources,
+    load_compiled_excerpts,
+    main,
+    run_merge_step,
+    run_poem_metadata_step,
+    run_ppa_metadata_step,
+    save_ppa_metadata,
+)
+
+
+def test_get_excerpt_sources_empty_dir(tmp_path):
+    result = get_excerpt_sources(tmp_path)
+    assert result == []
+
+
+def test_get_excerpt_sources_with_files(tmp_path):
+    subdir1 = tmp_path / "subdir1"
+    subdir2 = tmp_path / "subdir2"
+    subdir1.mkdir()
+    subdir2.mkdir()
+
+    (tmp_path / "file1.csv").touch()
+    (tmp_path / "file2.csv.gz").touch()
+    (subdir1 / "nested.csv").touch()
+    (subdir2 / "nested.csv.gz").touch()
+    (tmp_path / "file3.txt").touch()
+
+    result = get_excerpt_sources(tmp_path)
+    assert len(result) == 4
+
+
+def test_save_ppa_metadata(tmp_path):
+    input_file = tmp_path / "ppa_works.csv"
+    output_file = tmp_path / "output.csv"
+
+    input_file.write_text("work_id,title,author\nW001,Test Work,Test Author\n")
+
+    excerpts_df = pl.DataFrame(
+        {
+            "ppa_work_id": ["W001", "W001", "W001"],
+            "excerpt_id": ["e1", "e2", "e3"],
+            "poem_id": ["poem-1", "poem-1", "poem-2"],
+            "poem_author": ["Author A", "Author A", "Author B"],
+        }
+    )
+
+    save_ppa_metadata(input_file, output_file, excerpts_df)
+
+    result = pl.read_csv(output_file)
+    assert "num_excerpts" in result.columns
+    assert "num_poems" in result.columns
+    assert "num_poets" in result.columns
+
+    row = result.row(0, named=True)
+    assert row["work_id"] == "W001"
+    assert row["num_excerpts"] == 3
+    assert row["num_poems"] == 2
+    assert row["num_poets"] == 2
+
+
+def test_save_ppa_metadata_not_csv(tmp_path):
+    input_file = tmp_path / "ppa_works.json"
+    output_file = tmp_path / "output.csv"
+
+    excerpts_df = pl.DataFrame(
+        {
+            "ppa_work_id": ["W001"],
+            "excerpt_id": ["e1"],
+            "poem_id": ["poem-1"],
+            "poem_author": ["Author A"],
+        }
+    )
+
+    with pytest.raises(ValueError, match="PPA metadata must be loaded as CSV"):
+        save_ppa_metadata(input_file, output_file, excerpts_df)
+
+
+@patch("corppa.poetry_detection.compile_dataset.pl.read_csv")
+@patch("corppa.poetry_detection.compile_dataset.extract_page_meta")
+def test_load_compiled_excerpts_uncompressed(
+    mock_extract_page_meta, mock_read_csv, tmp_path
+):
+    # config method populates both paths;
+    # load method will choose the first one that exists
+    excerpt_file = tmp_path / "excerpts.csv"
+    excerpt_file.touch()
+    excerpt_gz_file = tmp_path / "excerpts.csv.gz"
+    config = {
+        "compiled_excerpt_file": excerpt_file,
+        "compressed_excerpt_file": excerpt_gz_file,
+    }
+
+    result = load_compiled_excerpts(config)
+    assert result == mock_extract_page_meta.return_value
+
+    mock_extract_page_meta.assert_called_once_with(mock_read_csv.return_value)
+    mock_read_csv.assert_called_once_with(excerpt_file)
+
+    # reset and remove the uncompressed, make the gz exist
+    mock_read_csv.reset_mock()
+    excerpt_file.unlink()
+    excerpt_gz_file.touch()
+    load_compiled_excerpts(config)
+    mock_read_csv.assert_called_once_with(excerpt_gz_file)
+
+
+def test_load_compiled_excerpts_file_not_found(tmp_path):
+    config = {
+        "compiled_excerpt_file": tmp_path / "nonexistent.csv",
+        "compressed_excerpt_file": tmp_path / "nonexistent.csv.gz",
+    }
+
+    with pytest.raises(ValueError, match="Excerpt data file not found"):
+        load_compiled_excerpts(config)
+
+
+@pytest.mark.parametrize(
+    "args,expected_calls",
+    [
+        ([], {"merge", "poem_metadata", "ppa_metadata"}),
+        (["--merge"], {"merge"}),
+        (["--poem_metadata"], {"poem_metadata"}),
+        (["--ppa_metadata"], {"ppa_metadata"}),
+    ],
+)
+@patch("corppa.poetry_detection.compile_dataset.run_ppa_metadata_step")
+@patch("corppa.poetry_detection.compile_dataset.run_poem_metadata_step")
+@patch("corppa.poetry_detection.compile_dataset.run_merge_step")
+@patch("corppa.poetry_detection.compile_dataset.load_compilation_config")
+def test_main(
+    mock_load_config,
+    mock_merge,
+    mock_poem,
+    mock_ppa,
+    args,
+    expected_calls,
+    tmp_path,
+):
+    mock_load_config.return_value = {
+        "test": "config",
+        "output_data_dir": tmp_path,
+    }
+
+    main(args)
+
+    if "merge" in expected_calls:
+        mock_merge.assert_called_once()
+    else:
+        mock_merge.assert_not_called()
+
+    if "poem_metadata" in expected_calls:
+        mock_poem.assert_called_once()
+    else:
+        mock_poem.assert_not_called()
+
+    if "ppa_metadata" in expected_calls:
+        mock_ppa.assert_called_once()
+    else:
+        mock_ppa.assert_not_called()
+
+
+@patch("corppa.poetry_detection.compile_dataset.compress_file")
+@patch("corppa.poetry_detection.compile_dataset.merge_excerpt_files")
+@patch("corppa.poetry_detection.compile_dataset.get_excerpt_sources")
+def test_run_merge_step(mock_get_sources, mock_merge, mock_compress, tmp_path):
+    compile_opts = {
+        "source_excerpt_data": tmp_path / "data/excerpts",
+        "compiled_excerpt_file": tmp_path / "out/excerpts.csv",
+        "compressed_excerpt_file": tmp_path / "out/excerpts.csv.gz",
+    }
+
+    result = run_merge_step(compile_opts, None, compress_excerpts=True)
+    # returns result of merge
+    assert result == mock_merge.return_value
+
+    # get sources is called on the configured path
+    mock_get_sources.assert_called_once_with(compile_opts["source_excerpt_data"])
+    # merge is called with the result of get sources and compile option
+    mock_merge.assert_called_once_with(
+        mock_get_sources.return_value, compile_opts["compiled_excerpt_file"]
+    )
+    # compress is called with the configured paths
+    mock_compress.assert_called_once_with(
+        compile_opts["compiled_excerpt_file"], compile_opts["compressed_excerpt_file"]
+    )
+
+    # call again with no compression
+    mock_compress.reset_mock()
+    run_merge_step(compile_opts, None, compress_excerpts=False)
+    mock_compress.assert_not_called()
+
+
+@patch("corppa.poetry_detection.compile_dataset.save_poem_metadata")
+@patch("corppa.poetry_detection.compile_dataset.extract_page_meta")
+@patch("corppa.poetry_detection.compile_dataset.load_compiled_excerpts")
+def test_run_poem_metadata_step_with_df(mock_load, mock_extract, mock_save, tmp_path):
+    input_df = pl.DataFrame({"id": [1]})
+    mock_extract.return_value = pl.DataFrame({"id": [1], "page_id": ["p.1"]})
+
+    compile_opts = {"poem_metadata_file": tmp_path / "out/poem_meta.csv"}
+
+    run_poem_metadata_step(compile_opts, input_df)
+
+    mock_load.assert_not_called()
+    mock_extract.assert_called_once_with(input_df)
+    mock_save.assert_called_once_with(
+        compile_opts["poem_metadata_file"], mock_extract.return_value
+    )
+
+
+@patch("corppa.poetry_detection.compile_dataset.save_poem_metadata")
+@patch("corppa.poetry_detection.compile_dataset.extract_page_meta")
+@patch("corppa.poetry_detection.compile_dataset.load_compiled_excerpts")
+def test_run_poem_metadata_step(mock_load, mock_extract, mock_save, tmp_path):
+    mock_load.return_value = pl.DataFrame({"id": [1]})
+
+    compile_opts = {"poem_metadata_file": tmp_path / "out/poem_meta.csv"}
+
+    run_poem_metadata_step(compile_opts, None)
+
+    mock_load.assert_called_once_with(compile_opts)
+    mock_extract.assert_not_called()
+    mock_save.assert_called_once_with(
+        compile_opts["poem_metadata_file"], mock_load.return_value
+    )
+
+
+@patch("corppa.poetry_detection.compile_dataset.save_ppa_metadata")
+@patch("corppa.poetry_detection.compile_dataset.add_ref_poems_meta")
+@patch("corppa.poetry_detection.compile_dataset.extract_page_meta")
+@patch("corppa.poetry_detection.compile_dataset.load_compiled_excerpts")
+def test_run_ppa_metadata_step(
+    mock_load, mock_extract, mock_add_ref_poems, mock_save, tmp_path
+):
+    input_df = pl.DataFrame({"id": [1]})
+
+    compile_opts = {
+        "poem_metadata_file": tmp_path / "out/poem_meta.csv",
+        "source_ppa_metadata": tmp_path / "data/ppa_works.csv",
+        "ppa_metadata_file": tmp_path / "out/ppa_meta.csv",
+    }
+
+    # call with excerpt dataframe provided
+    run_ppa_metadata_step(compile_opts, input_df)
+    # doesn't load excerpts because provided
+    mock_load.assert_not_called()
+    # extracts page/work metadata
+    mock_extract.assert_called_once_with(input_df)
+    # loads reference poem metadata
+    mock_add_ref_poems.assert_called_once_with(
+        mock_extract.return_value, compile_opts["poem_metadata_file"]
+    )
+    mock_save.assert_called_once_with(
+        compile_opts["source_ppa_metadata"],
+        compile_opts["ppa_metadata_file"],
+        mock_add_ref_poems.return_value,
+    )
+
+    # call without excerpt df
+    mock_extract.reset_mock()
+    mock_add_ref_poems.reset_mock()
+    mock_save.reset_mock()
+    run_ppa_metadata_step(compile_opts, None)
+    mock_load.assert_called_once_with(compile_opts)
+    mock_extract.assert_not_called()
+    mock_add_ref_poems.assert_called_once_with(
+        mock_load.return_value, compile_opts["poem_metadata_file"]
+    )
+    mock_save.assert_called_once_with(
+        compile_opts["source_ppa_metadata"],
+        compile_opts["ppa_metadata_file"],
+        mock_add_ref_poems.return_value,
+    )
+
+
+def test_compress_file(tmp_path):
+    # integration test to confirm logic works as expected
+    uncompressed_file = tmp_path / "excerpts.csv"
+    compressed_file = tmp_path / "excerpts.csv.gz"
+    # write out content to test round-trip
+    file_contents = "excerpt_id,text\n1,hello\n"
+    uncompressed_file.write_text(file_contents)
+
+    compress_file(uncompressed_file, compressed_file)
+    # uncompressed file should be removed
+    assert not uncompressed_file.exists()
+    # compressed file should now be present
+    assert compressed_file.exists()
+
+    # uncompressed content should match what we wrote out
+    with gzip.open(compressed_file, "rt") as f:
+        content = f.read()
+    assert content == file_contents
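The new test module can be exercised on its own while iterating (assuming a standard pytest setup for the repo):

```console
pytest tests/test_poetry_detection/test_compile_dataset.py
```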
{base_dir / "internet_poems2"} chadwyck-healey: - text_dir: "ch" + text_path: "ch" metadata_path: "ch/chadwyck-healey.csv" other: metadata_path: http://example.com/other-poems.csv @@ -118,6 +118,24 @@ def test_get_config_relative_dir(self, mock_get_config): config_opts["base_dir"] == pathlib.Path(ingredients_dir) / ref_corpora_dir ) + def test_calculate_poem_length(self): + # Test single line text + result = BaseReferenceCorpus.calculate_poem_length("Hello world test") + assert result == {"num_lines": 1, "num_words": 3, "char_len": 16} + + # Test multi-line text with blank lines + text = "Line one here\nLine two here\n\nLine three" + result = BaseReferenceCorpus.calculate_poem_length(text) + assert result == {"num_lines": 3, "num_words": 8, "char_len": len(text)} + + # Test empty text + result = BaseReferenceCorpus.calculate_poem_length("") + assert result == {"num_lines": 0, "num_words": 0, "char_len": 0} + + # Test text with only blank lines + result = BaseReferenceCorpus.calculate_poem_length(" \n\n ") + assert result == {"num_lines": 0, "num_words": 0, "char_len": 8} + # fixture data for internet poems INTERNETPOEMS_TEXTS = [ @@ -133,12 +151,12 @@ def test_get_config_relative_dir(self, mock_get_config): @pytest.fixture -def internetpoems_data_dir(tmp_path): +def internetpoems_data_dir(tmp_path, corppa_test_config): # test fixture to create internet poems data directory with sample text files config_opts = config.get_config() - # use the configured text data dir + # use the configured text data dir from test config data_dir = pathlib.Path( - config_opts["reference_corpora"]["internet_poems"]["text_dir"] + config_opts["reference_corpora"]["internet_poems"]["text_path"] ) data_dir.mkdir(parents=True, exist_ok=True) @@ -149,7 +167,7 @@ def internetpoems_data_dir(tmp_path): @pytest.fixture -def internetpoems_tarball(tmp_path): +def internetpoems_tarball(tmp_path, corppa_test_config_defaults): # test fixture to create tar.gzip of internet poems data directory with sample text files # should be used with default config config_opts = config.get_config() @@ -159,7 +177,7 @@ def internetpoems_tarball(tmp_path): text_file = internetpoems_data_dir / f"{sample['id']}.txt" text_file.write_text(sample["text"]) - tarfile_name = config_opts["reference_corpora"]["internet_poems"]["text_dir"] + tarfile_name = config_opts["reference_corpora"]["internet_poems"]["text_path"] base_dir = pathlib.Path(config_opts["reference_corpora"]["base_dir"]) tarfile_path = base_dir / tarfile_name tarfile_path.parent.mkdir(parents=True, exist_ok=True) @@ -180,14 +198,14 @@ def test_init(self, tmp_path, corppa_test_config): config_opts = config.get_config() # expected data_dir expected_data_dir = pathlib.Path( - config_opts["reference_corpora"]["internet_poems"]["text_dir"] + config_opts["reference_corpora"]["internet_poems"]["text_path"] ) # init should succeed when directory exists expected_data_dir.mkdir(parents=True) internet_poems = InternetPoems() - assert isinstance(internet_poems.text_dir, pathlib.Path) - assert internet_poems.text_dir == expected_data_dir + assert isinstance(internet_poems.text_path, pathlib.Path) + assert internet_poems.text_path == expected_data_dir # error if it is not a directory : remove dir and create a regular file expected_data_dir.rmdir() @@ -203,15 +221,15 @@ def test_get_config(self, mock_pathlib, tmp_path, corppa_test_config): # should pass in reference corpus base directory assert "base_dir" in config_opts # should include ref_corpus specific options, where are in the test config - 
assert "text_dir" in config_opts + assert "text_path" in config_opts @patch.object(InternetPoems, "get_config_opts") def test_get_metadata_df( self, mock_get_config_opts, tmp_path, corppa_test_config, internetpoems_data_dir ): - mock_get_config_opts.return_value = {"text_dir": str(internetpoems_data_dir)} + mock_get_config_opts.return_value = {"text_path": str(internetpoems_data_dir)} internet_poems = InternetPoems() - meta_df = internet_poems.get_metadata_df() + meta_df = internet_poems.get_metadata_df(poem_length=True) assert isinstance(meta_df, pl.DataFrame) assert meta_df.schema == METADATA_SCHEMA assert meta_df.height == len(INTERNETPOEMS_TEXTS) @@ -221,6 +239,29 @@ def test_get_metadata_df( assert meta_row["author"] == "King James Bible" assert meta_row["title"] == "Psalms" assert meta_row["ref_corpus"] == internet_poems.corpus_id + # check poem length calculations (non-blank lines, word count, char length) + assert meta_row["num_lines"] == 1 + assert meta_row["num_words"] == 9 + assert meta_row["char_len"] == len(INTERNETPOEMS_TEXTS[0]["text"]) + + @patch.object(InternetPoems, "get_config_opts") + def test_get_metadata_df_no_poem_length( + self, mock_get_config_opts, tmp_path, corppa_test_config, internetpoems_data_dir + ): + # Test that poem_length=False sets length fields to null + mock_get_config_opts.return_value = {"text_path": str(internetpoems_data_dir)} + internet_poems = InternetPoems() + meta_df = internet_poems.get_metadata_df(poem_length=False) + assert isinstance(meta_df, pl.DataFrame) + # Length fields should be present but null + assert "num_lines" in meta_df.columns + assert "num_words" in meta_df.columns + assert "char_len" in meta_df.columns + # All length values should be null + assert ( + meta_df.select("num_lines", "num_words", "char_len").drop_nulls().height + == 0 + ) def test_get_metadata_df_tarball( self, @@ -247,9 +288,46 @@ def test_get_text_corpus_tarball( internetpoems_tarball, ): internet_poems = InternetPoems() - with pytest.raises(NotImplementedError, match="not supported for tar.gz"): - # returns a generator; use list to get to actually run - list(internet_poems.get_text_corpus()) + # with pytest.raises(NotImplementedError, match="not supported for tar.gz"): + # returns a generator; use list to get to actually run + # convert to list, sort to ensure order matches fixture data + text_data = sorted( + list(internet_poems.get_text_corpus()), key=lambda x: x["poem_id"] + ) + assert len(text_data) == len(INTERNETPOEMS_TEXTS) + assert text_data[0]["poem_id"] == INTERNETPOEMS_TEXTS[0]["id"] + assert text_data[0]["text"] == INTERNETPOEMS_TEXTS[0]["text"] + + @patch.object(InternetPoems, "get_config_opts") + def test_get_text_unsupported( + self, + mock_get_config_opts, + tmp_path, + corppa_test_config_defaults, + ): + zipfile = tmp_path / "internet_poems.zip" + zipfile.touch() + mock_get_config_opts.return_value = { + "text_path": zipfile, + "base_dir": tmp_path / "ref-corpora", + } + with pytest.raises(ValueError, match=".*not a directory or a tar.gz"): + # checks configuration on init + InternetPoems() + + def get_text_corpus_unsupported( + self, mock_get_config_opts, tmp_path, corppa_test_config_defaults + ): + zipfile = tmp_path / "internet_poems.zip" + zipfile.touch() + # init normally to by pass the check path type check + internet_poems = InternetPoems() + # patch in our zip file + internet_poems.text_path = zipfile + with pytest.raises( + NotImplementedError, match="only supported for tar.gz and directories" + ): + internet_poems.get_text_corpus() 
 
     @patch.object(InternetPoems, "get_config_opts")
     def test_get_text_corpus(
         self,
@@ -260,7 +338,7 @@ def test_get_text_corpus(
         internetpoems_data_dir,
     ):
         mock_get_config_opts.return_value = {
-            "text_dir": str(internetpoems_data_dir),
+            "text_path": str(internetpoems_data_dir),
             "base_dir": tmp_path / "ref-corpora",
         }
         internet_poems = InternetPoems()
 
@@ -274,15 +352,15 @@
 @pytest.fixture
-def chadwyck_healey_csv(tmp_path):
+def chadwyck_healey_csv(tmp_path, corppa_test_config):
     "fixture to create a test version of the chadwyck-healey metadata csv file"
-    # test fixture to create internet poems data directory with sample text files
+    # test fixture to create chadwyck-healey data directory with sample metadata csv
     config_opts = config.get_config()
-    # use the configured data paths or configured ref_corpus base_dir and defaults
+    # use the configured data paths from test config
     base_dir = pathlib.Path(config_opts["reference_corpora"]["base_dir"])
 
     override_opts = config_opts["reference_corpora"][ChadwyckHealey.corpus_id]
-    data_dir = pathlib.Path(override_opts["text_dir"])
+    data_dir = pathlib.Path(override_opts["text_path"])
     ch_meta_csv = pathlib.Path(override_opts["metadata_path"])
 
     # in either case, make relative to base dir if not absolute
@@ -310,7 +388,8 @@ def test_get_metadata_df(self, tmp_path, corppa_test_config, chadwyck_healey_csv):
         chadwyck_healey = ChadwyckHealey()
         meta_df = chadwyck_healey.get_metadata_df()
         assert isinstance(meta_df, pl.DataFrame)
-        assert meta_df.schema == METADATA_SCHEMA
+        # schema is a subset because we don't include poem lengths
+        assert all(key in METADATA_SCHEMA for key in meta_df.schema.keys())
         # csv fixture data currently has one row
         assert meta_df.height == 1
         # get the first row as a dict and check values
         assert meta_row["title"] == "THE CAVERN OF WOE."
         assert meta_row["ref_corpus"] == chadwyck_healey.corpus_id
 
+    def test_get_metadata_df_with_poem_length(
+        self, tmp_path, corppa_test_config, chadwyck_healey_csv
+    ):
+        # Create a text file for the poem to test poem length calculation
+        chadwyck_healey = ChadwyckHealey()
+        text_dir = chadwyck_healey.text_path
+        # three lines, eight words
+        text_content = "Line one here\nLine two here\nLine three"
+        text_file = text_dir / "Z300475611.txt"
+        text_file.write_text(text_content)
+
+        meta_df = chadwyck_healey.get_metadata_df(poem_length=True)
+        assert isinstance(meta_df, pl.DataFrame)
+        # Should include length fields
+        assert "num_lines" in meta_df.columns
+        assert "num_words" in meta_df.columns
+        assert "char_len" in meta_df.columns
+
+        meta_row = meta_df.row(0, named=True)
+        # 3 non-blank lines
+        assert meta_row["num_lines"] == 3
+        # 8 words total
+        assert meta_row["num_words"] == 8
+        # character length (including newlines)
+        assert meta_row["char_len"] == len(text_content)
+
     # get_text_corpus method is not tested here because it is inherited;
     # logic is shared with InternetPoems and tested there
 
@@ -351,7 +456,8 @@ def test_get_metadata_df(
         opoems = OtherPoems()
         meta_df = opoems.get_metadata_df()
         assert isinstance(meta_df, pl.DataFrame)
-        assert meta_df.schema == METADATA_SCHEMA
+        # schema is a subset because we don't include poem lengths
+        assert all(key in METADATA_SCHEMA for key in meta_df.schema.keys())
         assert meta_df.height == len(OTHERPOEM_METADATA)
         # check values on the first row
         meta_row = meta_df.row(0, named=True)
@@ -448,6 +554,7 @@ def test_save_poem_metadata(
     otherpoems_metadata_df,
 ):
     # data fixtures should ensure that all the expected directories exist
 
     # add corpus id to other poems data frame and patch it to be returned
     otherpoems_metadata_df = otherpoems_metadata_df.with_columns(
@@ -471,3 +578,83 @@
     save_poem_metadata(output_file)
     captured = capsys.readouterr()
     assert "Replacing" in captured.out
+
+
+def test_save_poem_metadata_with_excerpts(
+    tmp_path,
+    capsys,
+    corppa_test_config,
+    internetpoems_data_dir,
+    chadwyck_healey_csv,
+    otherpoems_metadata_df,
+):
+    # Test the case where excerpts_df is provided - tests aggregation logic
+
+    # add corpus id to other poems data frame and patch it to be returned
+    otherpoems_metadata_df = otherpoems_metadata_df.with_columns(
+        ref_corpus=pl.lit(OtherPoems.corpus_id)
+    )
+
+    # Create sample excerpts dataframe with poem data
+    # Use poem IDs from the INTERNETPOEMS_TEXTS global variable
+    excerpts_df = pl.from_dicts(
+        [
+            # two excerpts for poem 0 from the same work, two different pages
+            {
+                "poem_id": INTERNETPOEMS_TEXTS[0]["id"],
+                "excerpt_id": "p@1:10",
+                "ppa_work_id": "work1",
+                "page_id": "page1",
+            },
+            {
+                "poem_id": INTERNETPOEMS_TEXTS[0]["id"],
+                "excerpt_id": "p@3:30",
+                "ppa_work_id": "work1",
+                "page_id": "page2",
+            },
+            # one excerpt for poem 1
+            {
+                "poem_id": INTERNETPOEMS_TEXTS[1]["id"],
+                "excerpt_id": "ex3",
+                "ppa_work_id": "work2",
+                "page_id": "page3",
+            },
+        ]
+    )
+
+    aggregation_fields = ["num_excerpts", "num_ppa_works", "num_ppa_pages"]
+
+    with patch.object(
+        OtherPoems, "get_metadata_df", return_value=otherpoems_metadata_df
+    ):
+        output_file = tmp_path / "poem_meta.csv"
+        save_poem_metadata(output_file, excerpts_df=excerpts_df)
+        assert output_file.exists()
+
+    # Read the output CSV and check for aggregate columns
+    result_df = pl.read_csv(output_file)
+    # all fields should be present
+    for field in aggregation_fields:
+        assert field in result_df.columns
+
+    # Check that poem with 2 excerpts has correct counts
+    psalms_row = result_df.filter(
+        pl.col("poem_id") == INTERNETPOEMS_TEXTS[0]["id"]
+    ).row(0, named=True)
+    # two excerpts from one work, different pages
+    assert psalms_row["num_excerpts"] == 2
+    assert psalms_row["num_ppa_works"] == 1
+    assert psalms_row["num_ppa_pages"] == 2
+
+    # Check that poem with 1 excerpt has correct counts
+    mary_row = result_df.filter(pl.col("poem_id") == INTERNETPOEMS_TEXTS[1]["id"]).row(
+        0, named=True
+    )
+    # one excerpt, all counts are 1
+    assert all(mary_row[value] == 1 for value in aggregation_fields)
+
+    # Check that poems without excerpts (from otherpoems) have zero counts
+    for poem_info in result_df.filter(
+        pl.col("poem_id").is_in(OTHERPOEM_METADATA[0])
+    ).iter_rows(named=True):
+        assert all(poem_info[value] == 0 for value in aggregation_fields)