import concurrent.futures

import wikipedia


def get_pages(search_prompt, n_results=5):
    """
    gets the titles of wikipedia pages matching the search prompt using the wikipedia api
    :param search_prompt: search prompt for the wikipedia search
    :param n_results: number of page titles that should be returned
    :return: list of page titles
    """
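    # note: wikipedia.search queries the live wikipedia API, so the returned
    # titles can change over time; an empty list means no matches were found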
    return wikipedia.search(search_prompt, results=n_results)


def get_text_chunks(page_titles, chunk_length=512, verbose=False):
    """
    gets the content of the wikipedia pages using multiple threads (API calls take time) and splits it into chunks
    :param page_titles: list of page titles for which the content should be extracted
    :param chunk_length: maximum number of characters that a chunk should have
    :param verbose: whether to print the progress
    :return: list of wiki text chunks
    """
    wiki_chunks = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # fetch all pages concurrently; as_completed yields each future as soon as it finishes
        future_to_page = {executor.submit(get_page_content, page_title): page_title for page_title in page_titles}
        for future in concurrent.futures.as_completed(future_to_page):
            page_title = future_to_page[future]
            try:
                wiki_content = future.result()
                wiki_content = preprocess_and_chunk_wiki_content(wiki_content, chunk_length=chunk_length)
                if verbose:
                    print(f"getting content of page {page_title}")
                wiki_chunks.extend(wiki_content)
            # a tuple catches both exceptions: a missing page and an ambiguous title
            except (wikipedia.exceptions.PageError, wikipedia.exceptions.DisambiguationError) as e:
                if verbose:
                    print(f"page {page_title} not found, {e}")
                continue  # skip the page if it is not available
    return wiki_chunks


def get_page_content(page_title):
    """
    gets the content of the wikipedia page using the wikipedia api
    :param page_title: title of the wikipedia page from which the content should be extracted
    :return: content of the wikipedia page
    """
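    # note: wikipedia.page defaults to auto_suggest=True, which can silently
    # resolve the given title to a different article; passing
    # auto_suggest=False keeps the exact titles returned by wikipedia.search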
    return wikipedia.page(page_title).content


def preprocess_and_chunk_wiki_content(wiki_content, chunk_length=512):
    """
    preprocesses the wiki content:
    - splits the content into paragraphs
    - removes headings, empty lines and very short lines
    - splits too long paragraphs into smaller chunks without cutting sentences
    :param wiki_content: content of the wikipedia page
    :param chunk_length: maximum number of characters that a chunk should have
    :return: list of wiki text chunks
    """
    # remove everything from the references section onwards
    wiki_content = wiki_content.split("== References ==")[0]
    # split into paragraphs
    chunks = wiki_content.split("\n")
    # remove headings, empty chunks and too short chunks
    chunks = [chunk for chunk in chunks if not ((chunk.startswith("=") and chunk.endswith("=")) or len(chunk) < 50)]
    # split too long chunks
    additional_chunks = []
    for i, chunk in enumerate(chunks):
        if len(chunk) > chunk_length:
            # split into sentences
            sentences = chunk.split(". ")
            # split into sub-chunks without cutting sentences
            sub_chunks = [""]
            sub_chunk_index = 0
            for sentence in sentences:
                if len(sentence) > chunk_length:
                    # cut the sentence if it is longer than chunk_length (this should happen rarely)
                    sentence = sentence[:chunk_length]
                # append the sentence to the current sub-chunk if it still fits
                # (or if the sub-chunk is empty, to avoid emitting empty chunks)
                if not sub_chunks[sub_chunk_index] or len(sub_chunks[sub_chunk_index]) + len(sentence) < chunk_length:
                    sub_chunks[sub_chunk_index] += sentence + ". "
                else:
                    sub_chunk_index += 1
                    sub_chunks.append(sentence + ". ")
            # replace the original chunk with the first sub-chunk
            chunks[i] = sub_chunks[0]
            # append the remaining sub-chunks to the list
            additional_chunks.extend(sub_chunks[1:])
    chunks.extend(additional_chunks)
    return chunks
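

# minimal usage sketch: fetch page titles for a query and chunk their content;
# assumes the `wikipedia` package is installed and network access is available,
# and the query string below is purely illustrative
if __name__ == "__main__":
    titles = get_pages("machine learning", n_results=3)
    text_chunks = get_text_chunks(titles, chunk_length=512, verbose=True)
    print(f"built {len(text_chunks)} chunks from {len(titles)} pages")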