|
4 | 4 | import logging |
5 | 5 | from typing import Dict, Generator |
6 | 6 |
|
7 | | -import pandas as pd |
8 | 7 | import pyterrier as pt |
9 | | -from pyterrier_doc2query import Doc2Query |
10 | 8 |
|
11 | 9 | # Paths for data and index |
12 | 10 | DATA_PATH = resource_filename(__name__, "../../data/collection.tsv") |
@@ -61,50 +59,24 @@ def create_index() -> pt.IndexRef: |
61 | 59 |
|
62 | 60 | # Create an index with both "docno" and "text" as metadata |
63 | 61 | logging.info("Creating Index") |
64 | | - df = document_collection_dataframe() |
65 | | - df = add_query_description_to_documents(df) |
66 | | - df_indexer = pt.DFIndexer(INDEX_PATH, verbose=True, blocks=True) |
67 | | - logging.info("Indexing") |
68 | | - return df_indexer.index(df["text"], df) |
69 | | - |
70 | | - |
71 | | -def document_collection_dataframe() -> pd.DataFrame: |
72 | | - """ |
73 | | - Return a dataframe of the document collection. |
74 | | -
|
75 | | - Returns |
76 | | - ------- |
77 | | - pd.DataFrame |
78 | | - A dataframe of the document collection. |
79 | | - """ |
80 | | - df = pd.read_table( |
81 | | - DATA_PATH, names=["docno", "text"], header=0, dtype={"docno": str, "text": str} |
| 62 | + iter_indexer = pt.IterDictIndexer(INDEX_PATH, verbose=True, blocks=True) |
| 63 | + return iter_indexer.index( |
| 64 | + document_collection_generator(), meta={"docno": 20, "text": 4096} |
82 | 65 | ) |
83 | | - # Remove the rows where text is empty |
84 | | - df = df[df["text"].str.strip().astype(bool)] |
85 | | - return df |
86 | 66 |
|
87 | 67 |
|
88 | | -def add_query_description_to_documents(df: pd.DataFrame) -> pd.DataFrame: |
| 68 | +def document_collection_generator() -> Generator[Dict[str, str], None, None]: |
89 | 69 | """ |
90 | | - Add a query description to each document. |
91 | | -
|
92 | | - Parameters |
93 | | - ---------- |
94 | | - pd.DataFrame |
95 | | - A dataframe of the document collection. |
| 70 | + Return a generator over the document collection. |
96 | 71 |
|
97 | | - Returns |
| 72 | + Yields |
98 | 73 | ------- |
99 | | - pd.DataFrame |
100 | | - A dataframe of the document collection with a query description. |
| 74 | + Dict[str, str] |
| 75 | + A document with keys "docno" and "text"; lines whose text is blank are skipped. |
101 | 76 | """ |
102 | | - doc_2_query_t5 = Doc2Query( |
103 | | - append=False, |
104 | | - out_attr="querygen", |
105 | | - fast_tokenizer=True, |
106 | | - verbose=True, |
107 | | - num_samples=1, |
108 | | - ) |
109 | | - logging.info("Adding query description to documents") |
110 | | - return doc_2_query_t5.transform(df) |
| 77 | + with open(DATA_PATH, "r") as f: |
| 78 | + for line in f: |
| 79 | + docno, content = line.split("\t") |
| 80 | + if content.strip() == "": |
| 81 | + continue |
| 82 | + yield {"docno": docno, "text": content} |
0 commit comments