Narrowing down the bug

CodingTil · CodingTil · commit 2bf879a11c63 · 2023-10-13T06:56:39.000+02:00
diff --git a/py_css/interface/kaggle.py b/py_css/interface/kaggle.py
@@ -28,7 +28,11 @@ def to_kaggle_format(df: pd.DataFrame) -> str:
         The dataframe in the Kaggle submission format.
     """
     # for each query, only keep the best 3 docnos ranked by ascending rank
-    df = df.groupby("qid").sort_values("rank").head(3)
+    df = (
+        df.sort_values(by=["qid", "rank"], ascending=[True, True])
+        .groupby("qid")
+        .head(3)
+    )
 
     output = "qid,docid\n"
     for _, row in df.iterrows():
diff --git a/py_css/interface/run_queries.py b/py_css/interface/run_queries.py
@@ -29,6 +29,7 @@ def to_trec_runfile_format(df: pd.DataFrame, model_name: str) -> str:
     str
         The dataframe in the TREC runfile format.
     """
+    df = df.sort_values(by=["qid", "rank"], ascending=[True, True])
     return "\n".join(
         [
             f"{row['qid']} Q0 {row['docno']} {int(row['rank']) + 1} {row['score']} {model_name}"
diff --git a/py_css/models/baseline.py b/py_css/models/baseline.py
@@ -45,7 +45,7 @@ def __init__(
         """
         t5_qr = t5_rewriter.T5Rewriter()
         bm25 = pt.BatchRetrieve(index, wmodel="BM25", metadata=["docno", "text"])
-        self.top_docs = (t5_qr >> bm25, bm25_docs)
+        self.top_docs = ((t5_qr >> bm25) % bm25_docs, bm25_docs)
         self.mono_t5 = (MonoT5ReRanker(batch_size=BATCH_SIZE), mono_t5_docs)
         self.duo_t5 = (DuoT5ReRanker(batch_size=BATCH_SIZE), duo_t5_docs)
 
diff --git a/py_css/models/baseline_prf.py b/py_css/models/baseline_prf.py
@@ -53,7 +53,7 @@ def __init__(
         self.t5_qr = t5_rewriter.T5Rewriter()
         bm25 = pt.BatchRetrieve(index, wmodel="BM25", metadata=["docno", "text"])
         rm3 = pt.rewrite.RM3(index, fb_docs=rm3_fb_docs, fb_terms=rm3_fb_terms)
-        self.top_docs = ((bm25 % rm3_fb_docs) >> rm3 >> bm25, bm25_docs)
+        self.top_docs = (((bm25 % rm3_fb_docs) >> rm3 >> bm25) % bm25_docs, bm25_docs)
         self.mono_t5 = (MonoT5ReRanker(batch_size=BATCH_SIZE), mono_t5_docs)
         self.duo_t5 = (DuoT5ReRanker(batch_size=BATCH_SIZE), duo_t5_docs)
 
@@ -115,6 +115,11 @@ def transform(self, query_df: pd.DataFrame) -> pd.DataFrame:
             top_docs_df["qid"].unique()
         ), f"{unique_qids} != {set(top_docs_df['qid'].unique())}"
 
+        # assert that each qid is present 1000 times
+        assert (
+            top_docs_df.groupby("qid").size() == 1000
+        ).all(), f"{top_docs_df.groupby('qid').size().unique()}"
+
         top_docs_df = (
             top_docs_df.sort_values(["qid", "score"], ascending=False)
             .groupby("qid")
@@ -125,6 +130,10 @@ def transform(self, query_df: pd.DataFrame) -> pd.DataFrame:
             top_docs_df["qid"].unique()
         ), f"{unique_qids} != {set(top_docs_df['qid'].unique())}"
 
+        assert (
+            top_docs_df.groupby("qid").size() == 1000
+        ).all(), f"{top_docs_df.groupby('qid').size().unique()}"
+
         # Now add in the rewritten queries to the top docs
         top_docs_df = pt.model.push_queries(top_docs_df, inplace=True)
         top_docs_df = pd.merge(

Original file line number	Diff line number	Diff line change
`@@ -29,6 +29,7 @@ def to_trec_runfile_format(df: pd.DataFrame, model_name: str) -> str:`
`29`	`29`	`str`
`30`	`30`	`The dataframe in the TREC runfile format.`
`31`	`31`	`"""`
	`32`	`+ df = df.sort_values(by=["qid", "rank"], ascending=[True, True])`
`32`	`33`	`return "\n".join(`
`33`	`34`	`[`
`34`	`35`	`f"{row['qid']} Q0 {row['docno']} {int(row['rank']) + 1} {row['score']} {model_name}"`