Skip to content

Commit 139e4c2

Browse files
committed
Undo doc2query-t5
1 parent 2b4a0c4 commit 139e4c2

1 file changed

Lines changed: 14 additions & 42 deletions

File tree

py_css/indexer/index.py

Lines changed: 14 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -4,9 +4,7 @@
4 4
import logging
5 5
from typing import Dict, Generator
6 6

7-
import pandas as pd
8 7
import pyterrier as pt
9-
from pyterrier_doc2query import Doc2Query
10 8

11 9
# Paths for data and index
12 10
DATA_PATH = resource_filename(__name__, "../../data/collection.tsv")
@@ -61,50 +59,24 @@ def create_index() -> pt.IndexRef:
61 59

62 60
# Create an index with both "docno" and "text" as metadata
63 61
logging.info("Creating Index")
64-
df = document_collection_dataframe()
65-
df = add_query_description_to_documents(df)
66-
df_indexer = pt.DFIndexer(INDEX_PATH, verbose=True, blocks=True)
67-
logging.info("Indexing")
68-
return df_indexer.index(df["text"], df)
69-
70-
71-
def document_collection_dataframe() -> pd.DataFrame:
72-
"""
73-
Return a dataframe of the document collection.
74-
75-
Returns
76-
-------
77-
pd.DataFrame
78-
A dataframe of the document collection.
79-
"""
80-
df = pd.read_table(
81-
DATA_PATH, names=["docno", "text"], header=0, dtype={"docno": str, "text": str}
62+
iter_indexer = pt.IterDictIndexer(INDEX_PATH, verbose=True, blocks=True)
63+
return iter_indexer.index(
64+
document_collection_generator(), meta={"docno": 20, "text": 4096}
82 65
)
83-
# Remove the rows where text is empty
84-
df = df[df["text"].str.strip().astype(bool)]
85-
return df
86 66

87 67

88-
def add_query_description_to_documents(df: pd.DataFrame) -> pd.DataFrame:
68+
def document_collection_generator() -> Generator[Dict[str, str], None, None]:
89 69
"""
90-
Add a query description to each document.
91-
92-
Parameters
93-
----------
94-
pd.DataFrame
95-
A dataframe of the document collection.
70+
Return a generator over the document collection.
96 71
97-
Returns
72+
Yields
98 73
-------
99-
pd.DataFrame
100-
A dataframe of the document collection with a query description.
74+
Generator[Dict[str, str], None, None]
75+
A generator over the document collection (docno, content).
101 76
"""
102-
doc_2_query_t5 = Doc2Query(
103-
append=False,
104-
out_attr="querygen",
105-
fast_tokenizer=True,
106-
verbose=True,
107-
num_samples=1,
108-
)
109-
logging.info("Adding query description to documents")
110-
return doc_2_query_t5.transform(df)
77+
with open(DATA_PATH, "r") as f:
78+
for line in f:
79+
docno, content = line.split("\t")
80+
if content.strip() == "":
81+
continue
82+
yield {"docno": docno, "text": content}

0 commit comments

Comments
 (0)