Skip to content

Commit 2ac3212

Browse files
committed
Filtered a warning, decreased the number of wiki pages that fail to be found by disabling auto_suggest, raised the minimum-length filter for too-short paragraphs from 50 to 100 characters, and updated documentation.
1 parent 8399641 commit 2ac3212

3 files changed

Lines changed: 22 additions & 15 deletions

File tree

contextplus/main.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,23 +3,24 @@
33
from contextplus import model, wiki
44

55

6-
def context(query, n_wiki_pages=5, n_top_chunks=8, min_summary_length=100, max_summary_length=200, verbose=False):
6+
def context(query, min_summary_length=100, max_summary_length=200, n_wiki_pages=5, n_top_chunks=8, verbose=False):
77
"""
88
provides context for the query by searching for relevant wikipedia pages, extracting the most relevant information
99
and summarizing the facts
1010
:param query: query as a string for which the context should be provided
11-
:param n_wiki_pages: (optional) number of wikipedia pages that should be searched
12-
:param n_top_chunks: (optional) number of highest scoring chunks that should be summarized
1311
:param min_summary_length: (optional) minimum length of the summary (in tokens)
1412
:param max_summary_length: (optional) maximum length of the summary (in tokens)
13+
:param n_wiki_pages: (optional, not recommended to change) number of wikipedia pages that should be searched
14+
:param n_top_chunks: (optional, not recommended to change) number of highest scoring chunks that should be summarized
1515
:param verbose: (optional) whether to print the progress
16-
:return: summarized facts from the wikipedia pages as a string
16+
:return: summarized facts from the wikipedia pages as a string, None if no wikipedia pages were found
1717
"""
18-
# todo: finding optimal default values for the parameters
18+
1919
time1, time2, time3, time4, time5, time6, time7, time8, time9 = 0, 0, 0, 0, 0, 0, 0, 0, 0
2020
if verbose:
2121
print("Query:", query)
2222
time1 = time.time()
23+
2324
# create wikipedia search prompt
2425
wiki_search_prompt = model.create_wiki_search_prompt(query, verbose=verbose)
2526
if verbose:
@@ -30,14 +31,16 @@ def context(query, n_wiki_pages=5, n_top_chunks=8, min_summary_length=100, max_s
3031
page_titles = wiki.get_pages(wiki_search_prompt, n_results=n_wiki_pages)
3132
if verbose:
3233
print("Page titles:", page_titles)
33-
# get the content of the wikipedia pages and split it into chunks
34-
if verbose:
3534
time3 = time.time()
3635
print("Time taken to get wiki pages:", time3 - time2, "seconds")
36+
37+
# get the content of the wikipedia pages and split it into chunks
3738
wiki_chunks = wiki.get_text_chunks(page_titles, chunk_length=512, verbose=verbose)
3839
if verbose:
3940
time4 = time.time()
4041
print("Time taken to get wiki chunks:", time4 - time3, "seconds")
42+
if not wiki_chunks:
43+
return None
4144

4245
# get the embeddings for the query and the wiki chunks
4346
query_embedding = model.get_embeddings([query])
@@ -48,21 +51,22 @@ def context(query, n_wiki_pages=5, n_top_chunks=8, min_summary_length=100, max_s
4851
if verbose:
4952
time6 = time.time()
5053
print("Time taken to get wiki embeddings:", time6 - time5, "seconds")
54+
5155
# calculate the similarity between the query and the wiki chunks
5256
similarities = model.calculate_similarity(query_embedding, wiki_embeddings, top_k=n_top_chunks)
5357
if verbose:
5458
time7 = time.time()
5559
print("Time taken to calculate similarity:", time7 - time6, "seconds")
56-
top_chunks = ""
5760

61+
top_chunks = ""
5862
for i, similarity in enumerate(similarities):
5963
top_chunks += "<" + str(i + 1) + "> " + wiki_chunks[similarity['corpus_id']] + " </" + str(i + 1) + ">\n\n"
6064
if verbose:
6165
print("Chunk" + str(i + 1) + ":", wiki_chunks[similarity['corpus_id']], "\t\t\tscore:", similarity['score'])
62-
6366
if verbose:
6467
time8 = time.time()
6568
print("Time taken to get concatenated top chunk string:", time8 - time7, "seconds")
69+
6670
# summarize facts from the top wiki chunks
6771
summarized_facts = model.summarize_facts(top_chunks, min_length=min_summary_length, max_length=max_summary_length)
6872
if verbose:
@@ -74,5 +78,5 @@ def context(query, n_wiki_pages=5, n_top_chunks=8, min_summary_length=100, max_s
7478

7579
if __name__ == "__main__":
7680
user_query = "What are the names of Barack Obamas children?"
77-
context = context(user_query, verbose=False)
81+
context = context(user_query, verbose=True)
7882
print(context)

contextplus/model.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ def get_embeddings(texts):
1818
:param texts: list of texts for which the embeddings should be calculated
1919
:return: embeddings
2020
"""
21-
# todo: check out arguments of the encode method for example 'prompt' or 'precision'
2221
return gist_embedding.encode(texts)
2322

2423

@@ -62,8 +61,6 @@ def create_wiki_search_prompt(query, verbose=False):
6261
return keywords
6362

6463

65-
# todo: try out to look at different titles and let the model decide which will be the most promising ones
66-
6764
# ------------------------------------------------ Bart Large CNN -----------------------------------------------------
6865

6966
def summarize_facts(top_chunks, min_length, max_length):
@@ -74,6 +71,8 @@ def summarize_facts(top_chunks, min_length, max_length):
7471
:param max_length: maximum length of the summary (in tokens)
7572
:return: summarized facts from the wiki content as a string
7673
"""
74+
if len(top_chunks) > 3700:
75+
top_chunks = top_chunks[:3700]
7776
summary = bart_summarizer(top_chunks, min_length=min_length, max_length=max_length, do_sample=False)
7877
summary = summary[0]['summary_text']
7978
if summary.startswith(" "):

contextplus/wiki.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import concurrent.futures
2+
import warnings
23
import wikipedia
34

45

@@ -44,7 +45,10 @@ def get_page_content(page_title):
4445
:param page_title: page_title of the wikipedia page from which the content should be extracted
4546
:return: content of the wikipedia page
4647
"""
47-
return wikipedia.page(page_title).content
48+
with warnings.catch_warnings():
49+
warnings.filterwarnings("ignore", category=UserWarning)
50+
page_content = wikipedia.page(page_title, auto_suggest=False).content
51+
return page_content
4852

4953

5054
def preprocess_and_chunk_wiki_content(wiki_content, chunk_length=512):
@@ -62,7 +66,7 @@ def preprocess_and_chunk_wiki_content(wiki_content, chunk_length=512):
6266
# split into paragraphs
6367
chunks = wiki_content.split("\n")
6468
# remove headings, empty chunks and too short chunks
65-
chunks = [chunk for chunk in chunks if not ((chunk.startswith("=") and chunk.endswith("=")) or len(chunk) < 50)]
69+
chunks = [chunk for chunk in chunks if not ((chunk.startswith("=") and chunk.endswith("=")) or len(chunk) < 100)]
6670
# split too long chunks
6771
additional_chunks = []
6872
for i, chunk in enumerate(chunks):

0 commit comments

Comments (0)