|
1 | 1 | import os |
2 | 2 | from pkg_resources import resource_filename |
3 | 3 | import shutil |
4 | | -import pyterrier as pt |
5 | 4 | import logging |
6 | 5 | from typing import Dict, Generator |
7 | 6 |
|
| 7 | +import pandas as pd |
| 8 | +import pyterrier as pt |
| 9 | +from pyterrier_doc2query import Doc2Query |
| 10 | + |
8 | 11 | # Paths for data and index |
9 | 12 | DATA_PATH = resource_filename(__name__, "../../data/collection.tsv") |
10 | 13 | INDEX_PATH = resource_filename(__name__, "../../data/index") |
@@ -58,22 +61,47 @@ def create_index() -> pt.IndexRef: |
58 | 61 |
|
59 | 62 | # Create an index with both "docno" and "text" as metadata |
60 | 63 | logging.info("Creating Index") |
61 | | - iter_indexer = pt.IterDictIndexer(INDEX_PATH, verbose=True, blocks=True) |
62 | | - return iter_indexer.index( |
63 | | - document_collection_generator(), meta={"docno": 20, "text": 4096} |
| 64 | + df = document_collection_dataframe() |
| 65 | + df = add_query_description_to_documents(df) |
| 66 | + df_indexer = pt.DFIndexer(INDEX_PATH, verbose=True, blocks=True) |
| 67 | + logging.info("Indexing") |
| 68 | + return df_indexer.index(df["text"], df) |
| 69 | + |
| 70 | + |
| 71 | +def document_collection_dataframe() -> pd.DataFrame: |
| 72 | + """ |
| 73 | + Load the document collection TSV into a dataframe with string columns "docno" and "text". |
| 74 | +
|
| 75 | + Returns |
| 76 | + ------- |
| 77 | + pd.DataFrame |
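| 78 | + The collection as string columns "docno" and "text"; because header=0 is passed with names, the file's first line is consumed as a header row and is not returned as a document. |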
| 79 | + """ |
| 80 | + return pd.read_table( |
| 81 | + DATA_PATH, names=["docno", "text"], header=0, dtype={"docno": str, "text": str} |
64 | 82 | ) |
65 | 83 |
|
66 | 84 |
|
67 | | -def document_collection_generator() -> Generator[Dict[str, str], None, None]: |
| 85 | +def add_query_description_to_documents(df: pd.DataFrame) -> pd.DataFrame: |
68 | 86 | """ |
69 | | - Return a generator over the document collection. |
| 87 | + Add a Doc2Query-generated query (column "querygen") to each document. |
70 | 88 |
|
71 | | - Yields |
| 89 | + Parameters |
| 90 | + ---------- |
| 91 | + df : pd.DataFrame |
| 92 | + A dataframe of the document collection. |
| 93 | +
|
| 94 | + Returns |
72 | 95 | ------- |
73 | | - Generator[Dict[str, str], None, None] |
74 | | - A generator over the document collection (docno, content). |
| 96 | + pd.DataFrame |
| 97 | + The dataframe returned by the Doc2Query transform, with the generated queries in the "querygen" column. |
75 | 98 | """ |
76 | | - with open(DATA_PATH, "r") as f: |
77 | | - for line in f: |
78 | | - docno, content = line.split("\t") |
79 | | - yield {"docno": docno, "text": content} |
| 99 | + doc_2_query_t5 = Doc2Query( |
| 100 | + append=False, |
| 101 | + out_attr="querygen", |
| 102 | + fast_tokenizer=True, |
| 103 | + verbose=True, |
| 104 | + num_samples=1, |
| 105 | + ) |
| 106 | + logging.info("Adding query description to documents") |
| 107 | + return doc_2_query_t5.transform(df) |
0 commit comments