Skip to content

Commit d8917b8

Browse files
committed
Bit report & Debugging code
1 parent ac4239f commit d8917b8

6 files changed

Lines changed: 124 additions & 4 deletions

File tree

py_css/models/baseline.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,10 +55,39 @@ def transform_input(
5555
history = []
5656
for q, _ in context:
5757
history.append(q.query)
58+
doc_was_added = False
5859
if len(context) > 0:
5960
last_docs = context[-1][1]
60-
if last_docs is not None:
61+
if last_docs is not None and len(last_docs) > 0:
6162
history.append(last_docs[0].content)
63+
doc_was_added = True
64+
sum_of_lengths = sum([len(q) for q in history]) + len(query.query)
65+
if sum_of_lengths > 512:
66+
if doc_was_added:
67+
if sum_of_lengths - 512 < len(history[-1]):
68+
history[-1] = history[-1][: sum_of_lengths - 512]
69+
elif sum_of_lengths - 512 == len(query.query):
70+
history = history[:-1]
71+
else:
72+
history = history[:-1]
73+
remaining = sum([len(q) for q in history]) + len(query.query) - 512
74+
while remaining > 0 and 0 < len(history):
75+
if len(history[0]) < remaining:
76+
remaining -= len(history[0])
77+
history = history[0:]
78+
else:
79+
history[0] = history[0][remaining:]
80+
remaining = 0
81+
else:
82+
remaining = sum([len(q) for q in history]) + len(query.query) - 512
83+
while remaining > 0 and 0 < len(history):
84+
if len(history[0]) < remaining:
85+
remaining -= len(history[0])
86+
history = history[0:]
87+
else:
88+
history[0] = history[0][remaining:]
89+
remaining = 0
90+
6291
history.append(query.query)
6392
new_query = " <sep> ".join(history)
6493
return new_query

py_css/models/baseline_prf.py

Lines changed: 59 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,24 +63,63 @@ def transform_input(
6363
history = []
6464
for q, _ in context:
6565
history.append(q.query)
66+
doc_was_added = False
6667
if len(context) > 0:
6768
last_docs = context[-1][1]
68-
if last_docs is not None:
69+
if last_docs is not None and len(last_docs) > 0:
6970
history.append(last_docs[0].content)
71+
doc_was_added = True
72+
sum_of_lengths = sum([len(q) for q in history]) + len(query.query)
73+
if sum_of_lengths > 512:
74+
if doc_was_added:
75+
if sum_of_lengths - 512 < len(history[-1]):
76+
history[-1] = history[-1][: sum_of_lengths - 512]
77+
elif sum_of_lengths - 512 == len(query.query):
78+
history = history[:-1]
79+
else:
80+
history = history[:-1]
81+
remaining = sum([len(q) for q in history]) + len(query.query) - 512
82+
while remaining > 0 and 0 < len(history):
83+
if len(history[0]) < remaining:
84+
remaining -= len(history[0])
85+
history = history[0:]
86+
else:
87+
history[0] = history[0][remaining:]
88+
remaining = 0
89+
else:
90+
remaining = sum([len(q) for q in history]) + len(query.query) - 512
91+
while remaining > 0 and 0 < len(history):
92+
if len(history[0]) < remaining:
93+
remaining -= len(history[0])
94+
history = history[0:]
95+
else:
96+
history[0] = history[0][remaining:]
97+
remaining = 0
98+
7099
history.append(query.query)
71100
new_query = " <sep> ".join(history)
72101
return new_query
73102

74103
def transform(self, query_df: pd.DataFrame) -> pd.DataFrame:
104+
unique_qids = set(query_df["qid"].unique())
105+
75106
rewritten_queries_df = self.t5_qr.transform(query_df)
76107

108+
assert unique_qids == set(
109+
rewritten_queries_df["qid"].unique()
110+
), f"{unique_qids} != {set(rewritten_queries_df['qid'].unique())}"
111+
77112
top_docs_df = self.top_docs[0].transform(rewritten_queries_df.copy())
78113
top_docs_df = (
79114
top_docs_df.sort_values(["qid", "score"], ascending=False)
80115
.groupby("qid")
81116
.head(self.top_docs[1])
82117
)
83118

119+
assert unique_qids == set(
120+
top_docs_df["qid"].unique()
121+
), f"{unique_qids} != {set(top_docs_df['qid'].unique())}"
122+
84123
# Now add in the rewritten queries to the top docs
85124
top_docs_df = pt.model.push_queries(top_docs_df, inplace=True)
86125
top_docs_df = pd.merge(
@@ -91,6 +130,10 @@ def transform(self, query_df: pd.DataFrame) -> pd.DataFrame:
91130
)
92131
top_docs_df["query"] = top_docs_df["rewritten_query"]
93132

133+
assert unique_qids == set(
134+
top_docs_df["qid"].unique()
135+
), f"{unique_qids} != {set(top_docs_df['qid'].unique())}"
136+
94137
mono_t5_df = self.mono_t5[0].transform(
95138
top_docs_df.groupby("qid").head(self.mono_t5[1])
96139
)
@@ -100,6 +143,10 @@ def transform(self, query_df: pd.DataFrame) -> pd.DataFrame:
100143
.head(self.mono_t5[1])
101144
)
102145

146+
assert unique_qids == set(
147+
mono_t5_df["qid"].unique()
148+
), f"{unique_qids} != {set(mono_t5_df['qid'].unique())}"
149+
103150
duo_t5_df = self.duo_t5[0].transform(
104151
mono_t5_df.groupby("qid").head(self.duo_t5[1])
105152
)
@@ -109,4 +156,14 @@ def transform(self, query_df: pd.DataFrame) -> pd.DataFrame:
109156
.head(self.duo_t5[1])
110157
)
111158

112-
return self.combine_result_stages([top_docs_df, mono_t5_df, duo_t5_df])
159+
assert unique_qids == set(
160+
duo_t5_df["qid"].unique()
161+
), f"{unique_qids} != {set(duo_t5_df['qid'].unique())}"
162+
163+
result = self.combine_result_stages([top_docs_df, mono_t5_df, duo_t5_df])
164+
165+
assert unique_qids == set(
166+
result["qid"].unique()
167+
), f"{unique_qids} != {set(result['qid'].unique())}"
168+
169+
return result
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
{"rule":"ADMIT_ENJOY_VB","sentence":"^\\QNLP aspires to equip computers with the capability to grasp the depth of human language and harness this understanding to execute a range of tasks, from text summarization and machine translation to the complexities of question answering.\\E$"}
2+
{"rule":"ADMIT_ENJOY_VB","sentence":"^\\QNLP aspires to equip computers with the capability to grasp the depth of human language and harness this understanding to execute a range of tasks, such as text summarization, machine translation, and question answering.\\E$"}

report/main.bib

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,30 @@ @inproceedings{Lajewska:2023:ECIR
88
doi = {10.1007/978-3-031-28241-6_12},
99
publisher = {Springer}
1010
}
11+
12+
@article{raffel2020exploring,
13+
title={Exploring the limits of transfer learning with a unified text-to-text transformer},
14+
author={Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J},
15+
journal={The Journal of Machine Learning Research},
16+
volume={21},
17+
number={1},
18+
pages={5485--5551},
19+
year={2020},
20+
publisher={JMLR.org}
21+
}
22+
23+
@article{nogueira2019document,
24+
title={Document expansion by query prediction},
25+
author={Nogueira, Rodrigo and Yang, Wei and Lin, Jimmy and Cho, Kyunghyun},
26+
journal={arXiv preprint arXiv:1904.08375},
27+
year={2019}
28+
}
29+
30+
@article{nogueira2019doc2query,
31+
title={From doc2query to docTTTTTquery},
32+
author={Nogueira, Rodrigo and Lin, Jimmy},
33+
journal={Online preprint},
34+
volume={6},
35+
pages={2},
36+
year={2019}
37+
}

report/main.pdf

4.4 KB
Binary file not shown.

report/main.tex

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,10 +71,15 @@ \section{Problem Statement}\label{sec:problem}
7171

7272

7373
\section{Related Work}\label{sec:related}
74-
\subsection*{T5}
74+
In this section, we review research relevant to conversational search engines and to the broader field of information retrieval. While some of the highlighted studies do not directly address conversational search, their techniques remain valuable at various stages of the conversational retrieval process.
7575

7676
\subsection*{doc2query}
7777

78+
\subsection*{Text-to-Text Transfer Transformer}
79+
The vast domain of natural language processing (NLP) revolves around the understanding of natural language, whether presented in text or speech form. NLP aspires to equip computers with the capability to grasp the depth of human language and harness this understanding to execute a range of tasks, such as text summarization, machine translation, and question answering. Given the diverse nature of these tasks in terms of their input, output, and underlying challenges, developing a unified model proficient across the entire spectrum poses a significant challenge.
80+
81+
The Text-to-Text Transfer Transformer (T5) \cite{raffel2020exploring} by Raffel et al. addresses this challenge by applying transfer learning to NLP: a single versatile model is first pre-trained on large unlabeled corpora to learn general language representations, and is then fine-tuned on task-specific data. Pre-trained models of this kind are now commonly available for a wide range of NLP problems.
82+
7883
\subsection*{doc2query-T5}
7984

8085
\subsection*{monoT5 \& duoT5}

0 commit comments

Comments
 (0)