Skip to content

Commit b7dcf16

Browse files
committed
Experimenting with doc2query-t5 indexing
1 parent a316e5d commit b7dcf16

3 files changed

Lines changed: 42 additions & 13 deletions

File tree

py_css/indexer/index.py

Lines changed: 41 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
import os
22
from pkg_resources import resource_filename
33
import shutil
4-
import pyterrier as pt
54
import logging
65
from typing import Dict, Generator
76

7+
import pandas as pd
8+
import pyterrier as pt
9+
from pyterrier_doc2query import Doc2Query
10+
811
# Paths for data and index
912
DATA_PATH = resource_filename(__name__, "../../data/collection.tsv")
1013
INDEX_PATH = resource_filename(__name__, "../../data/index")
@@ -58,22 +61,47 @@ def create_index() -> pt.IndexRef:
5861

5962
# Create an index with both "docno" and "text" as metadata
6063
logging.info("Creating Index")
61-
iter_indexer = pt.IterDictIndexer(INDEX_PATH, verbose=True, blocks=True)
62-
return iter_indexer.index(
63-
document_collection_generator(), meta={"docno": 20, "text": 4096}
64+
df = document_collection_dataframe()
65+
df = add_query_description_to_documents(df)
66+
df_indexer = pt.DFIndexer(INDEX_PATH, verbose=True, blocks=True)
67+
logging.info("Indexing")
68+
return df_indexer.index(df["text"], df)
69+
70+
71+
def document_collection_dataframe() -> pd.DataFrame:
    """
    Return a dataframe of the document collection.

    The file at ``DATA_PATH`` is parsed as tab-separated values with two
    columns, ``docno`` and ``text``, both loaded as strings.

    Returns
    -------
    pd.DataFrame
        A dataframe of the document collection with the columns ``docno``
        and ``text``.
    """
    import csv  # local import: only needed for the quoting constant below

    return pd.read_table(
        DATA_PATH,
        names=["docno", "text"],
        # Every line is assumed to be a document (no header row). With
        # ``names`` given, header=0 would silently discard the first line.
        header=None,
        dtype={"docno": str, "text": str},
        # Treat double quotes as ordinary characters: with the default
        # quoting, a passage that opens with an unbalanced '"' swallows the
        # following lines into a single field.
        quoting=csv.QUOTE_NONE,
    )
6583

6684

67-
def document_collection_generator() -> Generator[Dict[str, str], None, None]:
85+
def add_query_description_to_documents(df: pd.DataFrame) -> pd.DataFrame:
    """
    Run the Doc2Query transformer over the document collection.

    Parameters
    ----------
    df : pd.DataFrame
        A dataframe of the document collection.

    Returns
    -------
    pd.DataFrame
        The dataframe produced by ``Doc2Query.transform`` for ``df``.
    """
    # NOTE(review): with append=False and out_attr="querygen" the generated
    # query presumably lands in a separate "querygen" column instead of being
    # appended to the text - confirm against the pyterrier_doc2query docs.
    settings = {
        "append": False,
        "out_attr": "querygen",
        "fast_tokenizer": True,
        "verbose": True,
        "num_samples": 1,
    }
    expander = Doc2Query(**settings)

    logging.info("Adding query description to documents")
    expanded = expander.transform(df)
    return expanded

py_css/models/T5DocumentExpander.py

Whitespace-only changes.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ requires = [
1313
"rich",
1414
"python-terrier",
1515
"pyterrier-t5@git+https://github.com/terrierteam/pyterrier_t5.git",
16+
"pyterrier_doc2query@git+https://github.com/terrierteam/pyterrier_doc2query.git",
1617
"torch",
1718
"transformers[torch]"
1819
]

0 commit comments

Comments
 (0)