Skip to content

Commit f556964

Browse files
committed
Fixed the bug that occurred when a query does not have N relevant documents (or even none)
1 parent 2bf879a commit f556964

4 files changed

Lines changed: 192 additions & 60 deletions

File tree

py_css/models/T5Rewriter.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
EARLY_STOPPING: bool = True
1414

1515
COPY_REWRITTEN_QUERY_COLUMN: str = "rewritten_query"
16+
SEPERATOR_TOKEN: str = " ||| "
1617

1718

1819
class T5Rewriter(pt.Transformer):
@@ -46,26 +47,26 @@ def __init__(self):
4647
)
4748
super().__init__()
4849

49-
# the query has multiply " <sep> " in it. Create a list of the split with a maximum of 3 elements (last element is last, second last is middle, and the first n are joined)
50+
# the query has multiply SEPERATOR_TOKEN in it. Create a list of the split with a maximum of 3 elements (last element is last, second last is middle, and the first n are joined)
5051
def __split_query_tokenize_join(self, q):
5152
"""
5253
Split the query, tokenize the parts, and join them back together.
5354
"""
54-
l = q.split(" <sep> ")
55+
l = q.split(SEPERATOR_TOKEN)
5556
if len(l) < 3:
5657
tokens = []
5758
for ll in l:
5859
tokens.extend(self.tokenizer.tokenize(ll))
59-
tokens.append(" <sep> ")
60+
tokens.append(SEPERATOR_TOKEN)
6061
if len(tokens) > 0:
6162
tokens.pop()
6263
return tokens
6364
else:
6465
tokens = []
65-
tokens.extend(self.tokenizer.tokenize(" <sep> ".join(l[:-2])))
66-
tokens.append(" <sep> ")
66+
tokens.extend(self.tokenizer.tokenize(SEPERATOR_TOKEN.join(l[:-2])))
67+
tokens.append(SEPERATOR_TOKEN)
6768
tokens.extend(self.tokenizer.tokenize(l[-2]))
68-
tokens.append(" <sep> ")
69+
tokens.append(SEPERATOR_TOKEN)
6970
tokens.extend(self.tokenizer.tokenize(l[-1]))
7071
return tokens
7172

py_css/models/base.py

Lines changed: 148 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from abc import ABC, abstractmethod
22
from dataclasses import dataclass
33
import logging
4-
from typing import Optional, List, Tuple, TypeAlias
4+
from typing import Optional, List, Tuple, TypeAlias, Set, Any, Dict, Generator
55
import warnings
66

77
import models.T5Rewriter as t5_rewriter_module
@@ -57,6 +57,9 @@ def __str__(self) -> str:
5757

5858
Context: TypeAlias = List[Tuple[Query, Optional[List[Document]]]]
5959

60+
# If the retrieval model did not find any suitable or all N-required documents, this document shall be used as a placeholder
61+
EMPTY_PLACEHOLDER_DOC: Document = Document("-1", "")
62+
6063

6164
class Pipeline(ABC):
6265
"""
@@ -99,6 +102,148 @@ def transform(self, query_df: pd.DataFrame) -> pd.DataFrame:
99102
"""
100103
...
101104

105+
def pad_empty_documents(
106+
self, df: pd.DataFrame, qids: Set[str], N: int, queries_df: pd.DataFrame
107+
) -> pd.DataFrame:
108+
"""
109+
Pad the dataframe with empty documents.
110+
111+
Parameters
112+
----------
113+
df : pd.DataFrame
114+
The dataframe to be padded.
115+
qids : Set[str]
116+
The query ids.
117+
N : int
118+
The number of documents each qid should have.
119+
queries_df : pd.DataFrame
120+
The queries dataframe.
121+
122+
Returns
123+
-------
124+
pd.DataFrame
125+
The padded dataframe.
126+
"""
127+
df_has_rewritten_queries: bool = (
128+
t5_rewriter_module.COPY_REWRITTEN_QUERY_COLUMN in df.columns
129+
)
130+
rows_to_add: List[Dict[str, Any]] = []
131+
for qid in qids:
132+
# Check if qid is in top_doc_df (if not --> no documents at all were found)
133+
if qid not in df["qid"].unique():
134+
for i in range(1, N + 1):
135+
row = {
136+
"qid": qid,
137+
"docid": EMPTY_PLACEHOLDER_DOC.docno,
138+
"docno": EMPTY_PLACEHOLDER_DOC.docno,
139+
"text": EMPTY_PLACEHOLDER_DOC.content,
140+
"score": -i,
141+
"rank": i,
142+
"query_0": queries_df[queries_df["qid"] == qid]["query"].iloc[
143+
0
144+
],
145+
"query": queries_df[queries_df["qid"] == qid]["query"].iloc[0],
146+
}
147+
if df_has_rewritten_queries:
148+
row[t5_rewriter_module.COPY_REWRITTEN_QUERY_COLUMN] = row[
149+
"query"
150+
]
151+
rows_to_add.append(row)
152+
else:
153+
# if there are less than N occurrences of a qid, add base_module.EMPTY_PLACEHOLDER_DOC to fill up (need to adjust score)
154+
if df.groupby("qid").size()[qid] < N:
155+
lowest_score = df[df["qid"] == qid]["score"].min()
156+
rank = int(round(df[df["qid"] == qid]["rank"].max()))
157+
for i in range(
158+
rank + 1,
159+
N + 1,
160+
):
161+
row = {
162+
"qid": qid,
163+
"docid": EMPTY_PLACEHOLDER_DOC.docno,
164+
"docno": EMPTY_PLACEHOLDER_DOC.docno,
165+
"text": EMPTY_PLACEHOLDER_DOC.content,
166+
"score": lowest_score - i,
167+
"rank": i,
168+
"query_0": queries_df[queries_df["qid"] == qid][
169+
"query"
170+
].iloc[0],
171+
"query": queries_df[queries_df["qid"] == qid]["query"].iloc[
172+
0
173+
],
174+
}
175+
if df_has_rewritten_queries:
176+
row[t5_rewriter_module.COPY_REWRITTEN_QUERY_COLUMN] = row[
177+
"query"
178+
]
179+
rows_to_add.append(row)
180+
if len(rows_to_add) > 0:
181+
df = pd.concat([df, pd.DataFrame(rows_to_add)])
182+
df = df.sort_values(["qid", "rank"], ascending=[True, True])
183+
184+
return df
185+
186+
def replace_empty_placeholder_docs(
187+
self, df: pd.DataFrame, context_list: List[Tuple[Query, Context]]
188+
) -> pd.DataFrame:
189+
"""
190+
Replace any empty placeholder documents with documents from the context, if possible.
191+
192+
Parameters
193+
----------
194+
df : pd.DataFrame
195+
The dataframe to be replaced.
196+
context_list : List[Tuple[Query, Context]]
197+
The context of the queries.
198+
199+
Returns
200+
-------
201+
pd.DataFrame
202+
The dataframe with the replaced documents.
203+
"""
204+
205+
def gen_context_docs(context: Context) -> Generator[Document, None, None]:
206+
for _, docs in reversed(context):
207+
if docs is not None:
208+
for doc in docs:
209+
if doc.docno != EMPTY_PLACEHOLDER_DOC.docno:
210+
yield doc
211+
212+
for query, context in context_list:
213+
# check if there is a row in the df with "qid" == query.query_id, where "docno" == EMPTY_PLACEHOLDER_DOC.docno
214+
# if yes, replace it with the top document from the context
215+
while True:
216+
if not df[
217+
(df["qid"] == query.query_id)
218+
& (df["docno"] == EMPTY_PLACEHOLDER_DOC.docno)
219+
].empty:
220+
# Check if gen_docs has next element
221+
doc: Document
222+
doc_gen = gen_context_docs(context)
223+
try:
224+
doc = next(doc_gen)
225+
while (
226+
doc.docno
227+
in df[(df["qid"] == query.query_id)]["docno"].unique()
228+
):
229+
doc = next(doc_gen)
230+
except StopIteration:
231+
break
232+
# Get the row index of the row to be replaced (of all of the rows satisfying the condition, take the one with min "rank" value)
233+
row_index = df[
234+
(df["qid"] == query.query_id)
235+
& (df["docno"] == EMPTY_PLACEHOLDER_DOC.docno)
236+
]["rank"].idxmin()
237+
238+
# Of that row, set docno and docid to doc.no, and text to doc.content
239+
df.loc[row_index, "docno"] = doc.docno
240+
df.loc[row_index, "docid"] = doc.docno
241+
df.loc[row_index, "text"] = doc.content
242+
else:
243+
break
244+
245+
return df
246+
102247
def combine_result_stages(self, results: List[pd.DataFrame]) -> pd.DataFrame:
103248
"""
104249
Combine the results of the stages.
@@ -188,6 +333,7 @@ def search(self, query: Query, context: Context) -> Tuple[Context, pd.DataFrame]
188333
query_str = self.transform_input(query, context)
189334
query_df = pd.DataFrame([{"qid": query.query_id, "query": query_str}])
190335
result = self.transform(query_df)
336+
result = self.replace_empty_placeholder_docs(result, [(query, context)])
191337

192338
if t5_rewriter_module.COPY_REWRITTEN_QUERY_COLUMN in result.columns:
193339
temp_result = result[result["qid"] == query.query_id]
@@ -239,6 +385,7 @@ def batch_search(
239385
]
240386
)
241387
result = self.transform(query_df)
388+
result = self.replace_empty_placeholder_docs(result, inputs)
242389

243390
if t5_rewriter_module.COPY_REWRITTEN_QUERY_COLUMN in result.columns:
244391
for query, _ in inputs:

py_css/models/baseline.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ class Baseline(base_module.Pipeline):
1717
A class to represent the baseline retrieval method.
1818
"""
1919

20+
t5_qr: t5_rewriter.T5Rewriter
2021
top_docs: Tuple[pt.Transformer, int]
2122
mono_t5: Tuple[MonoT5ReRanker, int]
2223
duo_t5: Tuple[DuoT5ReRanker, int]
@@ -43,9 +44,9 @@ def __init__(
4344
duo_t5_docs : int
4445
The number of documents to retrieve with DuoT5.
4546
"""
46-
t5_qr = t5_rewriter.T5Rewriter()
47+
self.t5_qr = t5_rewriter.T5Rewriter()
4748
bm25 = pt.BatchRetrieve(index, wmodel="BM25", metadata=["docno", "text"])
48-
self.top_docs = ((t5_qr >> bm25) % bm25_docs, bm25_docs)
49+
self.top_docs = ((bm25 % bm25_docs).compile(), bm25_docs)
4950
self.mono_t5 = (MonoT5ReRanker(batch_size=BATCH_SIZE), mono_t5_docs)
5051
self.duo_t5 = (DuoT5ReRanker(batch_size=BATCH_SIZE), duo_t5_docs)
5152

@@ -58,7 +59,11 @@ def transform_input(
5859
doc_was_added = False
5960
if len(context) > 0:
6061
last_docs = context[-1][1]
61-
if last_docs is not None and len(last_docs) > 0:
62+
if (
63+
last_docs is not None
64+
and len(last_docs) > 0
65+
and last_docs[0].docno != base_module.EMPTY_PLACEHOLDER_DOC.docno
66+
):
6267
history.append(last_docs[0].content)
6368
doc_was_added = True
6469
sum_of_lengths = sum([len(q) for q in history]) + len(query.query)
@@ -89,11 +94,15 @@ def transform_input(
8994
remaining = 0
9095

9196
history.append(query.query)
92-
new_query = " <sep> ".join(history)
97+
new_query = t5_rewriter.SEPERATOR_TOKEN.join(history)
9398
return new_query
9499

95100
def transform(self, query_df: pd.DataFrame) -> pd.DataFrame:
96-
top_docs_df = self.top_docs[0].transform(query_df)
101+
unique_qids = set(query_df["qid"].unique())
102+
103+
rewritten_queries_df = self.t5_qr.transform(query_df)
104+
105+
top_docs_df = self.top_docs[0].transform(rewritten_queries_df.copy())
97106
top_docs_df = (
98107
top_docs_df.sort_values(["qid", "score"], ascending=False)
99108
.groupby("qid")
@@ -118,4 +127,9 @@ def transform(self, query_df: pd.DataFrame) -> pd.DataFrame:
118127
.head(self.duo_t5[1])
119128
)
120129

121-
return self.combine_result_stages([top_docs_df, mono_t5_df, duo_t5_df])
130+
result = self.combine_result_stages([top_docs_df, mono_t5_df, duo_t5_df])
131+
result = self.pad_empty_documents(
132+
result, unique_qids, self.top_docs[1], rewritten_queries_df
133+
)
134+
135+
return result

0 commit comments

Comments
 (0)