Experimenting with doc2query-t5 indexing

CodingTil · CodingTil · commit b7dcf16ac6d4 · 2023-10-11T09:35:13.000+02:00
diff --git a/py_css/indexer/index.py b/py_css/indexer/index.py
@@ -1,10 +1,13 @@
 import os
 from pkg_resources import resource_filename
 import shutil
-import pyterrier as pt
 import logging
 from typing import Dict, Generator
 
+import pandas as pd
+import pyterrier as pt
+from pyterrier_doc2query import Doc2Query
+
 # Paths for data and index
 DATA_PATH = resource_filename(__name__, "../../data/collection.tsv")
 INDEX_PATH = resource_filename(__name__, "../../data/index")
@@ -58,22 +61,47 @@ def create_index() -> pt.IndexRef:
 
     # Create an index with both "docno" and "text" as metadata
     logging.info("Creating Index")
-    iter_indexer = pt.IterDictIndexer(INDEX_PATH, verbose=True, blocks=True)
-    return iter_indexer.index(
-        document_collection_generator(), meta={"docno": 20, "text": 4096}
+    df = document_collection_dataframe()
+    df = add_query_description_to_documents(df)
+    df_indexer = pt.DFIndexer(INDEX_PATH, verbose=True, blocks=True)
+    logging.info("Indexing")
+    return df_indexer.index(df["text"], df)
+
+
+def document_collection_dataframe() -> pd.DataFrame:
+    """
+    Return a dataframe of the document collection.
+
+    Returns
+    -------
+    pd.DataFrame
+        A dataframe of the document collection.
+    """
+    return pd.read_table(
+        DATA_PATH, names=["docno", "text"], header=0, dtype={"docno": str, "text": str}
     )
 
 
-def document_collection_generator() -> Generator[Dict[str, str], None, None]:
+def add_query_description_to_documents(df: pd.DataFrame) -> pd.DataFrame:
     """
-    Return a generator over the document collection.
+    Add a query description to each document.
 
-    Yields
+    Parameters
+    ----------
+    pd.DataFrame
+        A dataframe of the document collection.
+
+    Returns
     -------
-    Generator[Dict[str, str], None, None]
-        A generator over the document collection (docno, content).
+    pd.DataFrame
+        A dataframe of the document collection with a query description.
     """
-    with open(DATA_PATH, "r") as f:
-        for line in f:
-            docno, content = line.split("\t")
-            yield {"docno": docno, "text": content}
+    doc_2_query_t5 = Doc2Query(
+        append=False,
+        out_attr="querygen",
+        fast_tokenizer=True,
+        verbose=True,
+        num_samples=1,
+    )
+    logging.info("Adding query description to documents")
+    return doc_2_query_t5.transform(df)
diff --git a/py_css/models/T5DocumentExpander.py b/py_css/models/T5DocumentExpander.py
diff --git a/pyproject.toml b/pyproject.toml
@@ -13,6 +13,7 @@ requires = [
     "rich",
     "python-terrier",
     "pyterrier-t5@git+https://github.com/terrierteam/pyterrier_t5.git",
+    "pyterrier_doc2query@git+https://github.com/terrierteam/pyterrier_doc2query.git",
     "torch",
     "transformers[torch]"
 ]

Original file line number	Diff line number	Diff line change
`@@ -13,6 +13,7 @@ requires = [`
`13`	`13`	`"rich",`
`14`	`14`	`"python-terrier",`
`15`	`15`	`"pyterrier-t5@git+https://github.com/terrierteam/pyterrier_t5.git",`
	`16`	`+ "pyterrier_doc2query@git+https://github.com/terrierteam/pyterrier_doc2query.git",`
`16`	`17`	`"torch",`
`17`	`18`	`"transformers[torch]"`
`18`	`19`	`]`