Skip to content

Commit d8917b8

Browse files
committed
Bit report & Debugging code
1 parent ac4239f commit d8917b8

6 files changed

Lines changed: 124 additions & 4 deletions

File tree

py_css/models/baseline.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,10 +55,39 @@ def transform_input(
5555
history = []
5656
for q, _ in context:
5757
history.append(q.query)
58+
doc_was_added = False
5859
if len(context) > 0:
5960
last_docs = context[-1][1]
60-
if last_docs is not None:
61+
if last_docs is not None and len(last_docs) > 0:
6162
history.append(last_docs[0].content)
63+
doc_was_added = True
64+
sum_of_lengths = sum([len(q) for q in history]) + len(query.query)
65+
if sum_of_lengths > 512:
66+
if doc_was_added:
67+
if sum_of_lengths - 512 < len(history[-1]):
68+
history[-1] = history[-1][: sum_of_lengths - 512]
69+
elif sum_of_lengths - 512 == len(query.query):
70+
history = history[:-1]
71+
else:
72+
history = history[:-1]
73+
remaining = sum([len(q) for q in history]) + len(query.query) - 512
74+
while remaining > 0 and 0 < len(history):
75+
if len(history[0]) < remaining:
76+
remaining -= len(history[0])
77+
history = history[0:]
78+
else:
79+
history[0] = history[0][remaining:]
80+
remaining = 0
81+
else:
82+
remaining = sum([len(q) for q in history]) + len(query.query) - 512
83+
while remaining > 0 and 0 < len(history):
84+
if len(history[0]) < remaining:
85+
remaining -= len(history[0])
86+
history = history[0:]
87+
else:
88+
history[0] = history[0][remaining:]
89+
remaining = 0
90+
6291
history.append(query.query)
6392
new_query = " <sep> ".join(history)
6493
return new_query

py_css/models/baseline_prf.py

Lines changed: 59 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,24 +63,63 @@ def transform_input(
6363
history = []
6464
for q, _ in context:
6565
history.append(q.query)
66+
doc_was_added = False
6667
if len(context) > 0:
6768
last_docs = context[-1][1]
68-
if last_docs is not None:
69+
if last_docs is not None and len(last_docs) > 0:
6970
history.append(last_docs[0].content)
71+
doc_was_added = True
72+
sum_of_lengths = sum([len(q) for q in history]) + len(query.query)
73+
if sum_of_lengths > 512:
74+
if doc_was_added:
75+
if sum_of_lengths - 512 < len(history[-1]):
76+
history[-1] = history[-1][: sum_of_lengths - 512]
77+
elif sum_of_lengths - 512 == len(query.query):
78+
history = history[:-1]
79+
else:
80+
history = history[:-1]
81+
remaining = sum([len(q) for q in history]) + len(query.query) - 512
82+
while remaining > 0 and 0 < len(history):
83+
if len(history[0]) < remaining:
84+
remaining -= len(history[0])
85+
history = history[0:]
86+
else:
87+
history[0] = history[0][remaining:]
88+
remaining = 0
89+
else:
90+
remaining = sum([len(q) for q in history]) + len(query.query) - 512
91+
while remaining > 0 and 0 < len(history):
92+
if len(history[0]) < remaining:
93+
remaining -= len(history[0])
94+
history = history[0:]
95+
else:
96+
history[0] = history[0][remaining:]
97+
remaining = 0
98+
7099
history.append(query.query)
71100
new_query = " <sep> ".join(history)
72101
return new_query
73102

74103
def transform(self, query_df: pd.DataFrame) -> pd.DataFrame:
104+
unique_qids = set(query_df["qid"].unique())
105+
75106
rewritten_queries_df = self.t5_qr.transform(query_df)
76107

108+
assert unique_qids == set(
109+
rewritten_queries_df["qid"].unique()
110+
), f"{unique_qids} != {set(rewritten_queries_df['qid'].unique())}"
111+
77112
top_docs_df = self.top_docs[0].transform(rewritten_queries_df.copy())
78113
top_docs_df = (
79114
top_docs_df.sort_values(["qid", "score"], ascending=False)
80115
.groupby("qid")
81116
.head(self.top_docs[1])
82117
)
83118

119+
assert unique_qids == set(
120+
top_docs_df["qid"].unique()
121+
), f"{unique_qids} != {set(top_docs_df['qid'].unique())}"
122+
84123
# Now add in the rewritten queries to the top docs
85124
top_docs_df = pt.model.push_queries(top_docs_df, inplace=True)
86125
top_docs_df = pd.merge(
@@ -91,6 +130,10 @@ def transform(self, query_df: pd.DataFrame) -> pd.DataFrame:
91130
)
92131
top_docs_df["query"] = top_docs_df["rewritten_query"]
93132

133+
assert unique_qids == set(
134+
top_docs_df["qid"].unique()
135+
), f"{unique_qids} != {set(top_docs_df['qid'].unique())}"
136+
94137
mono_t5_df = self.mono_t5[0].transform(
95138
top_docs_df.groupby("qid").head(self.mono_t5[1])
96139
)
@@ -100,6 +143,10 @@ def transform(self, query_df: pd.DataFrame) -> pd.DataFrame:
100143
.head(self.mono_t5[1])
101144
)
102145

146+
assert unique_qids == set(
147+
mono_t5_df["qid"].unique()
148+
), f"{unique_qids} != {set(mono_t5_df['qid'].unique())}"
149+
103150
duo_t5_df = self.duo_t5[0].transform(
104151
mono_t5_df.groupby("qid").head(self.duo_t5[1])
105152
)
@@ -109,4 +156,14 @@ def transform(self, query_df: pd.DataFrame) -> pd.DataFrame:
109156
.head(self.duo_t5[1])
110157
)
111158

112-
return self.combine_result_stages([top_docs_df, mono_t5_df, duo_t5_df])
159+
assert unique_qids == set(
160+
duo_t5_df["qid"].unique()
161+
), f"{unique_qids} != {set(duo_t5_df['qid'].unique())}"
162+
163+
result = self.combine_result_stages([top_docs_df, mono_t5_df, duo_t5_df])
164+
165+
assert unique_qids == set(
166+
result["qid"].unique()
167+
), f"{unique_qids} != {set(result['qid'].unique())}"
168+
169+
return result
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
{"rule":"ADMIT_ENJOY_VB","sentence":"^\\QNLP aspires to equip computers with the capability to grasp the depth of human language and harness this understanding to execute a range of tasks, from text summarization and machine translation to the complexities of question answering.\\E$"}
2+
{"rule":"ADMIT_ENJOY_VB","sentence":"^\\QNLP aspires to equip computers with the capability to grasp the depth of human language and harness this understanding to execute a range of tasks, such as text summarization, machine translation, and question answering.\\E$"}

report/main.bib

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,30 @@ @inproceedings{Lajewska:2023:ECIR
88
doi = {10.1007/978-3-031-28241-6_12},
99
publisher = {Springer}
1010
}
11+
12+
@article{raffel2020exploring,
13+
title={Exploring the limits of transfer learning with a unified text-to-text transformer},
14+
author={Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J},
15+
journal={The Journal of Machine Learning Research},
16+
volume={21},
17+
number={1},
18+
pages={5485--5551},
19+
year={2020},
20+
publisher={JMLR.org}
21+
}
22+
23+
@article{nogueira2019document,
24+
title={Document expansion by query prediction},
25+
author={Nogueira, Rodrigo and Yang, Wei and Lin, Jimmy and Cho, Kyunghyun},
26+
journal={arXiv preprint arXiv:1904.08375},
27+
year={2019}
28+
}
29+
30+
@article{nogueira2019doc2query,
31+
title={From doc2query to docTTTTTquery},
32+
author={Nogueira, Rodrigo and Lin, Jimmy},
33+
journal={Online preprint},
34+
volume={6},
35+
pages={2},
36+
year={2019}
37+
}

report/main.pdf

4.4 KB
Binary file not shown.

report/main.tex

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,10 +71,15 @@ \section{Problem Statement}\label{sec:problem}
7171

7272

7373
\section{Related Work}\label{sec:related}
74-
\subsection*{T5}
74+
In this section, we review research relevant to conversational search engines and to the broader field of information retrieval. While some of the highlighted studies do not directly address conversational search, their techniques remain valuable at various stages of the conversational retrieval process.
7575

7676
\subsection*{doc2query}
7777

78+
\subsection*{Text-to-Text Transfer Transformer}
79+
The vast domain of natural language processing (NLP) revolves around the understanding of natural language, whether presented in text or speech form. NLP aspires to equip computers with the capability to grasp the depth of human language and harness this understanding to execute a range of tasks, such as text summarization, machine translation, and question answering. Given the diverse nature of these tasks in terms of their input, output, and underlying challenges, developing a unified model proficient across the entire spectrum poses a significant challenge.
80+
81+
The Text-to-Text Transfer Transformer (T5) \cite{raffel2020exploring} by Raffel et al. addresses this challenge by applying transfer learning to NLP: a single versatile model is first pre-trained on large unlabeled corpora to learn general language representations, and is then fine-tuned on task-specific data. Pre-trained models of this kind are now commonly available for a wide range of NLP problems.
82+
7883
\subsection*{doc2query-T5}
7984

8085
\subsection*{monoT5 \& duoT5}

0 commit comments

Comments
 (0)