KAI MAURIN-JONES committed on
Commit 9511a2d
1 Parent(s): 3a9722c

beta version 2 - full updates

Files changed (7)
  1. app.py +3 -1
  2. censored.txt +1 -0
  3. data/3238x7.csv +0 -0
  4. packages.txt +1 -0
  5. requirements.txt +6 -3
  6. wikigame_app2.py +305 -0
  7. wikigame_rnd.ipynb +1315 -0
app.py CHANGED
@@ -1,5 +1,6 @@
  import streamlit as st
  from wiki_game_st_bs4 import *
+ from wikigame_app2 import *
 
  # Set the title of the app
  st.title("Wiki Game (BETA)")
@@ -34,4 +35,5 @@ elif slider == 1:
 
  if start_topic and end_topic:
      if st.button("GO!"):
-         play_wiki_game(starting_topic = start_topic, target_topic = end_topic, limit = limit_pages, delay = delay)
+         # play_wiki_game(starting_topic = start_topic, target_topic = end_topic, limit = limit_pages, delay = delay)
+         play_wiki_game_2(starting_topic = start_topic, target_topic = end_topic, limit = limit_pages, delay = delay)
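For context on the call above: this hunk only shows the GO! handler, so the widgets that supply start_topic, end_topic, limit_pages, and delay sit outside the diff. A minimal sketch of how they are presumably wired up (widget types, labels, and defaults here are assumptions, not taken from app.py):

import streamlit as st
from wikigame_app2 import play_wiki_game_2  # app.py uses a star import; the explicit name is shown for clarity

st.title("Wiki Game (BETA)")

# Hypothetical inputs -- the real app.py defines these outside the hunk shown above
start_topic = st.text_input("Starting topic")
end_topic = st.text_input("Target topic")
limit_pages = st.slider("Page limit", min_value=10, max_value=100, value=50)        # assumed widget and default
delay = st.slider("Delay per page (seconds)", min_value=0, max_value=10, value=0)   # assumed widget and default

if start_topic and end_topic:
    if st.button("GO!"):
        play_wiki_game_2(starting_topic = start_topic, target_topic = end_topic, limit = limit_pages, delay = delay)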
censored.txt ADDED
@@ -0,0 +1 @@
+ nigger
data/3238x7.csv ADDED
The diff for this file is too large to render. See raw diff
 
packages.txt ADDED
@@ -0,0 +1 @@
+ chromium
requirements.txt CHANGED
@@ -1,8 +1,11 @@
  altair<5
- selenium==4.10.0
+ # selenium==4.10.0
  beautifulsoup4==4.11.1
  numpy==1.23.5
  tensorflow==2.10.0
  tensorflow-hub==0.14.0
- jellyfish==0.11.2
- streamlit==1.21.0
+ # jellyfish==0.11.2
+ # streamlit==1.21.0
+ streamlit
+ seleniumbase
+ webdriver-manager
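A note on this change: the selenium pin is commented out (seleniumbase declares its own selenium dependency, so the library is still installed transitively), streamlit is left unpinned, and webdriver-manager is added so a chromedriver matching the chromium binary from packages.txt can be fetched at runtime. A minimal sketch of the headless setup these packages enable, mirroring the get_driver() helper in wikigame_app2.py below:

# Minimal sketch, assuming a chromium/chrome binary is on the system (packages.txt provides chromium)
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")

# webdriver-manager downloads and caches a compatible chromedriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.get("https://en.wikipedia.org/wiki/Main_Page")
print(driver.title)
driver.quit()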
wikigame_app2.py ADDED
@@ -0,0 +1,305 @@
+ #### For scraping/webpage processing
+ import requests
+ import json # specifically for wikipedia api
+ from selenium import webdriver
+ from selenium.webdriver.chrome.options import Options
+ from selenium.webdriver.chrome.service import Service
+ from webdriver_manager.chrome import ChromeDriverManager
+ from selenium.webdriver.common.by import By
+ from selenium.webdriver.common.keys import Keys
+ from bs4 import BeautifulSoup
+ 
+ #### For timing
+ import time
+ 
+ #### For app
+ import streamlit as st
+ from collections import deque # for printouts
+ 
+ #### For semantic similarity model
+ # !pip install tensorflow tensorflow-hub
+ import tensorflow as tf
+ import tensorflow_hub as hub
+ import numpy as np
+ embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4") # Load the pre-trained Universal Sentence Encoder -- accessible at same link
+ 
+ # # @st.experimental_singleton
+ # @st.cache_resource
+ # def get_driver():
+ #     return webdriver.Chrome(service = Service(ChromeDriverManager().install()), options = options)
+ 
+ # import os, sys
+ 
+ # @st.cache_resource
+ # def installff():
+ #     os.system('sbase install geckodriver')
+ #     os.system('ln -s /home/appuser/venv/lib/python3.7/site-packages/seleniumbase/drivers/geckodriver /home/appuser/venv/bin/geckodriver')
+ 
+ # _ = installff()
+ # from selenium import webdriver
+ # from selenium.webdriver import FirefoxOptions
+ # opts = FirefoxOptions()
+ # opts.add_argument("--headless")
+ # driver = webdriver.Firefox(options=opts)
+ # driver_target = webdriver.Firefox(options=opts)
+ 
+ # browser.get('http://example.com')
+ 
+ # driver.get("http://example.com")
+ 
+ # from selenium import webdriver
+ # from selenium.common.exceptions import TimeoutException
+ # from selenium.webdriver.common.by import By
+ # from selenium.webdriver.firefox.options import Options
+ # from selenium.webdriver.firefox.service import Service
+ # from selenium.webdriver.support import expected_conditions as EC
+ # from selenium.webdriver.support.ui import WebDriverWait
+ # from webdriver_manager.firefox import GeckoDriverManager
+ 
+ # # URL = ""
+ # TIMEOUT = 20
+ 
+ # # st.title("Test Selenium")
+ 
+ # firefoxOptions = Options()
+ # firefoxOptions.add_argument("--headless")
+ # service = Service(GeckoDriverManager().install())
+ # driver = webdriver.Firefox(
+ #     options=firefoxOptions,
+ #     service=service,
+ # )
+ # driver_target = webdriver.Firefox(
+ #     options=firefoxOptions,
+ #     service=service,
+ # )
+ 
+ import streamlit as st
+ 
+ from selenium import webdriver
+ from selenium.webdriver.chrome.options import Options
+ from selenium.webdriver.chrome.service import Service
+ from webdriver_manager.chrome import ChromeDriverManager
+ 
+ @st.cache_resource
+ def get_driver():
+     return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
+ 
+ options = Options()
+ options.add_argument('--disable-gpu')
+ options.add_argument('--headless')
+ 
+ driver = get_driver()
+ driver_target = get_driver()
+ # driver.get('http://example.com')
+ 
+ # st.code(driver.page_source)
+ 
+ # Initialize an empty deque
+ messages = deque(maxlen = 1000) # after 1000 links, it'll start popping things. The model should always timeout before this, since most people won't have the patience to make it last this long
+ 
+ def update_messages(message):
+     # Add the new message to the start of deque
+     messages.appendleft(message)
+     # Use a placeholder
+     placeholder = st.empty()
+     # Clear the placeholder and add all the messages from the deque
+     placeholder.text('') # clears the placeholder
+     for msg in messages:
+         placeholder.text(msg)
+ 
+ def most_similar_sentence(target_topic, labels_list):
+     # Encode the context sentence and all sentences in the list
+     context_embedding = embed([target_topic])[0]
+     sentence_embeddings = embed(labels_list)
+ 
+     # Calculate cosine similarities between the context sentence and each sentence in the list
+     similarities = np.inner(context_embedding, sentence_embeddings)
+ 
+     # Find the index of the most similar sentence
+     most_similar_index = np.argmax(similarities)
+ 
+     return labels_list[most_similar_index], similarities[most_similar_index], most_similar_index
+ 
+ def search_wikipedia(search_term):
+     # Define the endpoint
+     endpoint = "https://en.wikipedia.org/w/api.php"
+ 
+     # Define the search parameters
+     params = {
+         "action": "query",
+         "format": "json",
+         "list": "search",
+         "srsearch": search_term
+     }
+ 
+     # Send a GET request to the endpoint with your parameters
+     response = requests.get(url = endpoint, params = params)
+ 
+     # Parse the results as JSON
+     data = json.loads(response.text)
+ 
+     # Get the title of the first result (this will be used as the page title in the next step)
+     page_title = data["query"]["search"][0]["title"]
+ 
+     if "may refer to" in data["query"]["search"][0]["snippet"].lower():
+         page_title = data["query"]["search"][1]["title"]
+ 
+     # Construct the URL of the Wikipedia page
+     page_url = "https://en.wikipedia.org/wiki/{}".format(page_title.replace(" ", "_"))
+ 
+     return page_url, page_title
+ 
+ def get_topic_context(driver, more = False):
+     # Find the first paragraph of the main article
+     first_paragraph = driver.find_element(By.CSS_SELECTOR, "div.mw-parser-output > p:not(.mw-empty-elt)").text
+ 
+     if more:
+         context_sentence = ". ".join(first_paragraph.split(". ")[:5])
+     else:
+         context_sentence = first_paragraph.split(". ")[0]
+ 
+     return context_sentence
+ 
+ # bad_words = [word for word in open("censored.txt", "r").readlines()]
+ bad_words = [word.strip() for word in open("censored.txt", "r").readlines()]
+ 
+ def refine_links(topic, links, current_url_suffix, used_links, used_topics, censor = False):
+ 
+     links_texts = []
+ 
+     # Iterate through the links and extract their URLs
+     for link in links:
+         link_url = link.get('href')
+         if link_url and link_url.startswith("/wiki/"):
+             link_url = "https://en.wikipedia.org" + link_url
+             link_text = link.text.strip() # Get the text and remove leading/trailing spaces
+ 
+             # make sure they are both not None
+             if link_text and current_url_suffix not in link_url:
+ 
+                 if link_url not in used_links and link_text.lower() not in [topic.lower() for topic in used_topics]:
+ 
+                     # eliminates topic duplicates, non-wiki links, and wiki-help pages (non-content pages)
+                     if topic.lower() not in link_url.lower() and "en.wikipedia.org/wiki/" in link_url and ":" not in "".join(link_url.split("/")[1:]) and "Main_Page" != str(link_url.split("/")[-1]):
+ 
+                         # censoring if needed
+                         if censor:
+                             if not any(word1.lower() in bad_words for word1 in [word.lower() for word in link_text.split()]):
+                                 links_texts.append((link_url, link_text))
+                         else:
+                             links_texts.append((link_url, link_text))
+ 
+     return links_texts
+ 
+ def play_wiki_game_2(starting_topic: str, target_topic: str, limit: int = 100, delay: int = 0):
+ 
+     ##### Setup Chrome options
+     # chrome_options = webdriver.ChromeOptions()
+     # chrome_options.add_argument("--headless") # Ensure GUI is off
+     # chrome_options.add_argument("--no-sandbox")
+     # chrome_options.add_argument("--disable-dev-shm-usage")
+     # driver = webdriver.Chrome(options = chrome_options)
+ 
+     # options = Options()
+     # options.add_argument('--disable-gpu')
+     # options.add_argument('--headless')
+     # driver = get_driver()
+     # driver = webdriver.Firefox(options=opts)
+     # driver_target = webdriver.Firefox(options=opts)
+ 
+ 
+     #### Getting target url, topic, and context
+     # driver_target = webdriver.Chrome(options = chrome_options)
+     # driver_target = get_driver()
+     target_url, target_topic = search_wikipedia(search_term = target_topic)
+     driver_target.get(target_url)
+     target_context = get_topic_context(driver_target, more = True)
+     # update_messages(target_context)
+     driver_target.quit()
+ 
+     topic = starting_topic
+     num_pages = 0
+     used_topics = []
+     used_links = []
+ 
+     start_time = time.time()
+ 
+     ### BEGIN ###
+ 
+     update_messages("-" * 150)
+     update_messages(f"\nStarting!\n")
+     update_messages("-" * 150)
+ 
+     url, topic = search_wikipedia(search_term = starting_topic)
+     driver.get(url)
+     used_topics.append(topic)
+     used_links.append(driver.current_url)
+ 
+     while True:
+         # increment the page tracking by 1 for each new page
+         num_pages += 1
+ 
+         # if not the first page, navigate to the new page
+         if num_pages > 1:
+             driver.get(next_link)
+ 
+         try:
+             context_sentence = get_topic_context(driver)
+         except Exception as e:
+             context_sentence = "Context could not be found from webpage"
+ 
+         current_url = driver.current_url
+         current_url_suffix = str(current_url).split("/")[-1]
+ 
+         ### Use BeautifulSoup and Requests instead of Selenium for link extraction
+         current_page = driver.page_source # html from Selenium instead of BeautifulSoup
+ 
+         soup = BeautifulSoup(current_page, 'html.parser')
+ 
+         links = soup.find_all('a')
+ 
+         # get rid of any bloat in the links from the page
+         links_texts = refine_links(topic, links, current_url_suffix, used_links, used_topics)
+ 
+         # best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_topic, labels_list = [text for link, text in links_texts])
+         best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_context.lower(), labels_list = [text.lower() for link, text in links_texts])
+ 
+         update_messages(f"\nPage: {num_pages}")
+         update_messages(f"Current topic: '{topic.title()}'")
+         update_messages(f"Current URL: '{current_url}'")
+         update_messages(f"Current Topic Context: '{context_sentence}'")
+         if current_url != target_url:
+             update_messages(f"Next topic: '{best_label.title()}'. Semantic similarity to '{target_topic.title()}': {round((best_score * 100), 2)}%")
+ 
+         next_link, topic = links_texts[loc_idx]
+ 
+         used_links.append(next_link)
+         used_topics.append(topic)
+ 
+         if current_url == target_url: # because the target_url is now found through the API
+             update_messages("\n" + "-" * 150)
+             update_messages(f"\nFrom '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages, {round(time.time() - start_time, 2)} seconds!")
+             update_messages(f"Starting topic: '{starting_topic.title()}': '{used_links[0]}'")
+             update_messages(f"Target topic: '{target_topic.title()}': '{target_url}'\n")
+             update_messages("-" * 150)
+             driver.quit()
+             break
+ 
+         if num_pages == limit:
+             update_messages("\n" + "-" * 150)
+             update_messages(f"\nUnfortunately, the model couldn't get from '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages or less.")
+             update_messages(f"In {round(time.time() - start_time, 2)} seconds, it got from '{starting_topic.title()}': '{used_links[0]}', to '{used_topics[-1].title()}': '{used_links[-1]}'")
+             update_messages(f"\nTry a different combination to see if it can do it!\n")
+             update_messages("-" * 150)
+             driver.quit()
+             break
+ 
+         # delay things, if applicable
+         time.sleep(delay)
+ 
+ ###### Example
+ 
+ # starting_topic = 'soulja boy'
+ # target_topic = 'urine'
+ 
+ # play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 50)
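The greedy step that drives play_wiki_game_2 above is most_similar_sentence: the target page's intro context and every candidate link text are embedded with the Universal Sentence Encoder, and the link with the highest inner-product score is followed (USE embeddings are approximately normalized, so the inner product behaves like a cosine similarity). A standalone sketch with made-up candidate link texts:

import numpy as np
import tensorflow_hub as hub

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

def most_similar_sentence(target_topic, labels_list):
    # Embed the target context and all candidate link texts, then pick the closest candidate
    context_embedding = embed([target_topic])[0]
    sentence_embeddings = embed(labels_list)
    similarities = np.inner(context_embedding, sentence_embeddings)
    most_similar_index = np.argmax(similarities)
    return labels_list[most_similar_index], similarities[most_similar_index], most_similar_index

# Hypothetical link texts, for illustration only
candidates = ["hip hop music", "record producer", "urinary system"]
best_label, best_score, _ = most_similar_sentence("urine is a liquid by-product of metabolism", candidates)
print(best_label, round(float(best_score) * 100, 2))  # expected to favour 'urinary system'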
wikigame_rnd.ipynb ADDED
@@ -0,0 +1,1315 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "# from selenium import webdriver\n",
10
+ "# from selenium.webdriver.common.by import By\n",
11
+ "# from selenium.webdriver.common.keys import Keys\n",
12
+ "# from bs4 import BeautifulSoup\n",
13
+ "# import time\n",
14
+ "# # !pip install tensorflow tensorflow-hub\n",
15
+ "# import tensorflow as tf\n",
16
+ "# import tensorflow_hub as hub\n",
17
+ "# import numpy as np\n",
18
+ "# # !pip install jellyfish\n",
19
+ "# import jellyfish"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": 2,
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "# !pip show selenium\n",
29
+ "# !pip show beautifulsoup4\n",
30
+ "# !pip show numpy\n",
31
+ "# !pip show tensorflow\n",
32
+ "# !pip show tensorflow-hub\n",
33
+ "# !pip show jellyfish\n",
34
+ "# !pip show streamlit"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": 3,
40
+ "metadata": {},
41
+ "outputs": [],
42
+ "source": [
43
+ "# starting_topic = \"soulja boy\"\n",
44
+ "# target_topic = \"fart\"\n",
45
+ "\n",
46
+ "# play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 50)"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "markdown",
51
+ "metadata": {},
52
+ "source": [
53
+ "# Version 3"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "execution_count": 4,
59
+ "metadata": {},
60
+ "outputs": [
61
+ {
62
+ "name": "stderr",
63
+ "output_type": "stream",
64
+ "text": [
65
+ "2023-07-30 09:07:17.451238: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
66
+ "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
67
+ ]
68
+ }
69
+ ],
70
+ "source": [
71
+ "from selenium import webdriver\n",
72
+ "from selenium.webdriver.common.by import By\n",
73
+ "from selenium.webdriver.common.keys import Keys\n",
74
+ "from bs4 import BeautifulSoup\n",
75
+ "import time\n",
76
+ "# !pip install tensorflow tensorflow-hub\n",
77
+ "import tensorflow as tf\n",
78
+ "import tensorflow_hub as hub\n",
79
+ "import numpy as np\n",
80
+ "import requests\n",
81
+ "import json\n",
82
+ "\n",
83
+ "# Load the pre-trained Universal Sentence Encoder\n",
84
+ "embed = hub.load(\"https://tfhub.dev/google/universal-sentence-encoder/4\")"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": 105,
90
+ "metadata": {},
91
+ "outputs": [
92
+ {
93
+ "name": "stdout",
94
+ "output_type": "stream",
95
+ "text": [
96
+ "------------------------------------------------------------------------------------------------------------------------------------------------------\n",
97
+ "\n",
98
+ "Starting!\n",
99
+ "\n",
100
+ "------------------------------------------------------------------------------------------------------------------------------------------------------\n",
101
+ "\n",
102
+ "Page: 1\n",
103
+ "Current topic: 'Soulja Boy'\n",
104
+ "Current URL: 'https://en.wikipedia.org/wiki/Soulja_Boy'\n",
105
+ "Current Topic Context: 'DeAndre Cortez Way (born July 28, 1990), known professionally as Soulja Boy (formerly Soulja Boy Tell 'Em), is an American rapper and record producer'\n",
106
+ "Next topic: 'Peewee Longway'. Semantic similarity to 'Urine': 21.81%\n",
107
+ "\n",
108
+ "Page: 2\n",
109
+ "Current topic: 'Peewee Longway'\n",
110
+ "Current URL: 'https://en.wikipedia.org/wiki/Peewee_Longway'\n",
111
+ "Current Topic Context: 'Quincy Lamont Williams (born August 17, 1984), known by his stage name Peewee Longway, is an American rapper best known for his mixtape The Blue M&M and his collaboration with Young Thug, \"Loaded\"'\n",
112
+ "Next topic: 'Hip Hop'. Semantic similarity to 'Urine': 12.0%\n",
113
+ "\n",
114
+ "Page: 3\n",
115
+ "Current topic: 'Hip Hop'\n",
116
+ "Current URL: 'https://en.wikipedia.org/wiki/Hip_hop_music'\n",
117
+ "Current Topic Context: 'Hip hop or hip-hop, also known as rap and formerly known as disco rap,[5][6] is a genre of popular music that was originated in the Bronx[7][8][9][10] borough of New York City in the early 1970s by African Americans,[11][12][13] having existed for several years prior to mainstream discovery.[14] Hip hop originated as an anti-drug and anti-violence genre,[15] while consisting of stylized rhythmic music (usually built around drum beats) that commonly accompanies rapping, a rhythmic and rhyming speech that is chanted.[16] According to the professor Asante of African American studies at Temple University, \"hip hop is something that blacks can unequivocally claim as their own\".[17] It was developed as part of hip hop culture, a subculture defined by four key stylistic elements: MCing/rapping, DJing/scratching with turntables, break dancing, and graffiti art.[18][19][20] Other elements include sampling beats or bass lines from records (or synthesized beats and sounds), and rhythmic beatboxing'\n",
118
+ "Next topic: 'Rufus Thomas'. Semantic similarity to 'Urine': 21.79%\n",
119
+ "\n",
120
+ "Page: 4\n",
121
+ "Current topic: 'Rufus Thomas'\n",
122
+ "Current URL: 'https://en.wikipedia.org/wiki/Rufus_Thomas'\n",
123
+ "Current Topic Context: 'Rufus C'\n",
124
+ "Next topic: 'Rabbit Foot Minstrels'. Semantic similarity to 'Urine': 19.28%\n",
125
+ "\n",
126
+ "Page: 5\n",
127
+ "Current topic: 'Rabbit Foot Minstrels'\n",
128
+ "Current URL: 'https://en.wikipedia.org/wiki/The_Rabbit%27s_Foot_Company'\n",
129
+ "Current Topic Context: 'The Rabbit's Foot Company, also known as the Rabbit('s) Foot Minstrels and colloquially as \"The Foots\", was a long-running minstrel and variety troupe that toured as a tent show in the American South between 1900 and the late 1950s'\n",
130
+ "Next topic: 'Jstor'. Semantic similarity to 'Urine': 11.85%\n",
131
+ "\n",
132
+ "Page: 6\n",
133
+ "Current topic: 'Jstor'\n",
134
+ "Current URL: 'https://en.wikipedia.org/wiki/JSTOR'\n",
135
+ "Current Topic Context: 'JSTOR (/ˈdʒeɪstɔːr/; short for Journal Storage)[2] is a digital library founded in 1994'\n",
136
+ "Next topic: 'Nieman Lab'. Semantic similarity to 'Urine': 12.14%\n",
137
+ "\n",
138
+ "Page: 7\n",
139
+ "Current topic: 'Nieman Lab'\n",
140
+ "Current URL: 'https://en.wikipedia.org/wiki/Nieman_Foundation_for_Journalism'\n",
141
+ "Current Topic Context: 'The Nieman Foundation for Journalism at Harvard University is the primary journalism institution at Harvard.'\n",
142
+ "Next topic: 'Men'S Soccer'. Semantic similarity to 'Urine': 14.43%\n",
143
+ "\n",
144
+ "Page: 8\n",
145
+ "Current topic: 'Men'S Soccer'\n",
146
+ "Current URL: 'https://en.wikipedia.org/wiki/Harvard_Crimson_men%27s_soccer'\n",
147
+ "Current Topic Context: 'The Harvard Crimson men's soccer team is an intercollegiate varsity sports team of Harvard University'\n",
148
+ "Next topic: 'California Golden Bears Men'S Soccer'. Semantic similarity to 'Urine': 17.31%\n",
149
+ "\n",
150
+ "Page: 9\n",
151
+ "Current topic: 'California Golden Bears Men'S Soccer'\n",
152
+ "Current URL: 'https://en.wikipedia.org/wiki/California_Golden_Bears_men%27s_soccer'\n",
153
+ "Current Topic Context: 'The California Golden Bears men's soccer team is a varsity intercollegiate athletic team of University of California, Berkeley in Berkeley, California, United States.[1] The team is a member of the Pac-12 Conference, which is part of the National Collegiate Athletic Association's Division I'\n",
154
+ "Next topic: 'California Drinking Song'. Semantic similarity to 'Urine': 15.78%\n",
155
+ "\n",
156
+ "Page: 10\n",
157
+ "Current topic: 'California Drinking Song'\n",
158
+ "Current URL: 'https://en.wikipedia.org/wiki/California_Drinking_Song'\n",
159
+ "Current Topic Context: '\"California Drinking Song\" is a spirit song from the University of California, Berkeley'\n",
160
+ "Next topic: 'Uc Men'S Octet'. Semantic similarity to 'Urine': 15.63%\n",
161
+ "\n",
162
+ "Page: 11\n",
163
+ "Current topic: 'Uc Men'S Octet'\n",
164
+ "Current URL: 'https://en.wikipedia.org/wiki/University_of_California_Men%27s_Octet'\n",
165
+ "Current Topic Context: 'The UC Men's Octet, sometimes termed the Cal Men’s Octet or the UC Berkeley Men’s Octet, is an eight-member male a cappella group at the University of California, Berkeley'\n",
166
+ "Next topic: 'Laboratories'. Semantic similarity to 'Urine': 15.45%\n",
167
+ "\n",
168
+ "Page: 12\n",
169
+ "Current topic: 'Laboratories'\n",
170
+ "Current URL: 'https://en.wikipedia.org/wiki/Research_centers_and_laboratories_at_the_University_of_California,_Berkeley'\n",
171
+ "Current Topic Context: 'The University of California, Berkeley, contains many research centers and laboratories.'\n",
172
+ "Next topic: 'Uc Irvine Medical Center'. Semantic similarity to 'Urine': 18.16%\n",
173
+ "\n",
174
+ "Page: 13\n",
175
+ "Current topic: 'Uc Irvine Medical Center'\n",
176
+ "Current URL: 'https://en.wikipedia.org/wiki/University_of_California,_Irvine_Medical_Center'\n",
177
+ "Current Topic Context: 'The University of California, Irvine Medical Center (UCIMC or UCI Medical Center) is a major research hospital located in Orange, California'\n",
178
+ "Next topic: 'Sepsis'. Semantic similarity to 'Urine': 19.29%\n",
179
+ "\n",
180
+ "Page: 14\n",
181
+ "Current topic: 'Sepsis'\n",
182
+ "Current URL: 'https://en.wikipedia.org/wiki/Sepsis'\n",
183
+ "Current Topic Context: 'Sepsis (septicaemia in British English), or blood poisoning,[8][9] is a life-threatening condition that arises when the body's response to infection causes injury to its own tissues and organs.[4][8]'\n",
184
+ "Next topic: 'Urinary Tract'. Semantic similarity to 'Urine': 51.26%\n",
185
+ "\n",
186
+ "Page: 15\n",
187
+ "Current topic: 'Urinary Tract'\n",
188
+ "Current URL: 'https://en.wikipedia.org/wiki/Urinary_system'\n",
189
+ "Current Topic Context: 'The urinary system, also known as the urinary tract or renal system, consists of the kidneys, ureters, bladder, and the urethra'\n",
190
+ "Next topic: 'Urinary Bladder'. Semantic similarity to 'Urine': 61.01%\n",
191
+ "\n",
192
+ "Page: 16\n",
193
+ "Current topic: 'Urinary Bladder'\n",
194
+ "Current URL: 'https://en.wikipedia.org/wiki/Bladder'\n",
195
+ "Current Topic Context: 'The bladder is a hollow organ in humans and other vertebrates that stores urine from the kidneys before disposal by urination'\n",
196
+ "Next topic: 'Urination § Anatomy Of The Bladder And Outlet'. Semantic similarity to 'Urine': 57.69%\n",
197
+ "\n",
198
+ "Page: 17\n",
199
+ "Current topic: 'Urination § Anatomy Of The Bladder And Outlet'\n",
200
+ "Current URL: 'https://en.wikipedia.org/wiki/Urination#Anatomy_of_the_bladder_and_outlet'\n",
201
+ "Current Topic Context: 'Urination is the release of urine from the urinary bladder through the urethra to the outside of the body'\n",
202
+ "Next topic: 'Urine'. Semantic similarity to 'Urine': 57.28%\n",
203
+ "\n",
204
+ "Page: 18\n",
205
+ "Current topic: 'Urine'\n",
206
+ "Current URL: 'https://en.wikipedia.org/wiki/Urine'\n",
207
+ "Current Topic Context: 'Urine is a liquid by-product of metabolism in humans and in many other animals'\n",
208
+ "\n",
209
+ "------------------------------------------------------------------------------------------------------------------------------------------------------\n",
210
+ "\n",
211
+ "From 'Soulja Boy', to 'Urine' in 18 pages, 8.54 seconds!\n",
212
+ "Starting topic: 'Soulja Boy': 'https://en.wikipedia.org/wiki/Soulja_Boy'\n",
213
+ "Target topic: 'Urine': 'https://en.wikipedia.org/wiki/Urine'\n",
214
+ "\n",
215
+ "------------------------------------------------------------------------------------------------------------------------------------------------------\n"
216
+ ]
217
+ }
218
+ ],
219
+ "source": [
220
+ "def most_similar_sentence(target_topic, labels_list):\n",
221
+ " # Encode the context sentence and all sentences in the list\n",
222
+ " context_embedding = embed([target_topic])[0]\n",
223
+ " sentence_embeddings = embed(labels_list)\n",
224
+ " \n",
225
+ " # Calculate cosine similarities between the context sentence and each sentence in the list\n",
226
+ " similarities = np.inner(context_embedding, sentence_embeddings)\n",
227
+ " \n",
228
+ " # Find the index of the most similar sentence\n",
229
+ " most_similar_index = np.argmax(similarities)\n",
230
+ " \n",
231
+ " return labels_list[most_similar_index], similarities[most_similar_index], most_similar_index\n",
232
+ "\n",
233
+ "def search_wikipedia(search_term):\n",
234
+ " # Define the endpoint\n",
235
+ " endpoint = \"https://en.wikipedia.org/w/api.php\"\n",
236
+ "\n",
237
+ " # Define the search parameters\n",
238
+ " params = {\n",
239
+ " \"action\": \"query\",\n",
240
+ " \"format\": \"json\",\n",
241
+ " \"list\": \"search\",\n",
242
+ " \"srsearch\": search_term\n",
243
+ " }\n",
244
+ "\n",
245
+ " # Send a GET request to the endpoint with your parameters\n",
246
+ " response = requests.get(url = endpoint, params = params)\n",
247
+ "\n",
248
+ " # Parse the results as JSON\n",
249
+ " data = json.loads(response.text)\n",
250
+ "\n",
251
+ " # Get the title of the first result (this will be used as the page title in the next step)\n",
252
+ " page_title = data[\"query\"][\"search\"][0][\"title\"]\n",
253
+ "\n",
254
+ " # if \"may refer to\" in data[\"query\"][\"search\"][0][\"snippet\"].lower():\n",
255
+ " # page_title = data[\"query\"][\"search\"][1][\"title\"]\n",
256
+ "\n",
257
+ " # Construct the URL of the Wikipedia page\n",
258
+ " page_url = \"https://en.wikipedia.org/wiki/{}\".format(page_title.replace(\" \", \"_\"))\n",
259
+ "\n",
260
+ " return page_url, page_title\n",
261
+ "\n",
262
+ "def get_topic_context(driver, more = False):\n",
263
+ " # Find the first paragraph of the main article\n",
264
+ " first_paragraph = driver.find_element(By.CSS_SELECTOR, \"div.mw-parser-output > p:not(.mw-empty-elt)\").text\n",
265
+ "\n",
266
+ " if more:\n",
267
+ " context_sentence = \". \".join(first_paragraph.split(\". \")[:5])\n",
268
+ " else:\n",
269
+ " context_sentence = first_paragraph.split(\". \")[0]\n",
270
+ "\n",
271
+ " return context_sentence\n",
272
+ "\n",
273
+ "# bad_words = [word for word in open(\"censored.txt\", \"r\").readlines()]\n",
274
+ "bad_words = [word.strip() for word in open(\"censored.txt\", \"r\").readlines()]\n",
275
+ "\n",
276
+ "def refine_links(topic, links, current_url_suffix, used_links, used_topics, censor = False):\n",
277
+ "\n",
278
+ " links_texts = []\n",
279
+ "\n",
280
+ " # Iterate through the links and extract their URLs\n",
281
+ " for link in links:\n",
282
+ " link_url = link.get('href')\n",
283
+ " if link_url and link_url.startswith(\"/wiki/\"):\n",
284
+ " link_url = \"https://en.wikipedia.org\" + link_url\n",
285
+ " link_text = link.text.strip() # Get the text and remove leading/trailing spaces\n",
286
+ "\n",
287
+ " # make sure they are both not None\n",
288
+ " if link_text and current_url_suffix not in link_url:\n",
289
+ "\n",
290
+ " if link_url not in used_links and link_text.lower() not in [topic.lower() for topic in used_topics]:\n",
291
+ "\n",
292
+ " # eliminates topic duplicates, non-wiki links, and wiki-help pages (non-content pages)\n",
293
+ " if topic.lower() not in link_url.lower() and \"en.wikipedia.org/wiki/\" in link_url and \":\" not in \"\".join(link_url.split(\"/\")[1:]) and \"Main_Page\" != str(link_url.split(\"/\")[-1]):\n",
294
+ "\n",
295
+ " # censoring if needed\n",
296
+ " if censor:\n",
297
+ " if not any(word1.lower() in bad_words for word1 in [word.lower() for word in link_text.split()]):\n",
298
+ " links_texts.append((link_url, link_text))\n",
299
+ " else:\n",
300
+ " links_texts.append((link_url, link_text))\n",
301
+ "\n",
302
+ " return links_texts\n",
303
+ "\n",
304
+ "def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100):\n",
305
+ "\n",
306
+ " ##### Setup Chrome options\n",
307
+ " chrome_options = webdriver.ChromeOptions()\n",
308
+ " chrome_options.add_argument(\"--headless\") # Ensure GUI is off\n",
309
+ " chrome_options.add_argument(\"--no-sandbox\")\n",
310
+ " chrome_options.add_argument(\"--disable-dev-shm-usage\")\n",
311
+ " driver = webdriver.Chrome(options = chrome_options)\n",
312
+ "\n",
313
+ " #### Getting target url, topic, and context\n",
314
+ " driver_target = webdriver.Chrome(options = chrome_options)\n",
315
+ " target_url, target_topic = search_wikipedia(search_term = target_topic)\n",
316
+ " driver_target.get(target_url)\n",
317
+ " target_context = get_topic_context(driver_target, more = True)\n",
318
+ " # print(target_context)\n",
319
+ " driver_target.quit()\n",
320
+ "\n",
321
+ " topic = starting_topic\n",
322
+ " num_pages = 0\n",
323
+ " used_topics = []\n",
324
+ " used_links = []\n",
325
+ "\n",
326
+ " start_time = time.time()\n",
327
+ "\n",
328
+ " ### BEGIN ###\n",
329
+ "\n",
330
+ " print(\"-\" * 150)\n",
331
+ " print(f\"\\nStarting!\\n\")\n",
332
+ " print(\"-\" * 150)\n",
333
+ "\n",
334
+ " url, topic = search_wikipedia(search_term = starting_topic)\n",
335
+ " driver.get(url)\n",
336
+ " used_topics.append(topic)\n",
337
+ " used_links.append(driver.current_url)\n",
338
+ "\n",
339
+ " while True:\n",
340
+ " # increment the page tracking by 1 for each new page\n",
341
+ " num_pages += 1\n",
342
+ "\n",
343
+ " # if not the first page, navigate to the new page\n",
344
+ " if num_pages > 1:\n",
345
+ " driver.get(next_link)\n",
346
+ "\n",
347
+ " try:\n",
348
+ " context_sentence = get_topic_context(driver)\n",
349
+ " except Exception as e:\n",
350
+ " context_sentence = \"Context could not be found from webpage\"\n",
351
+ "\n",
352
+ " current_url = driver.current_url\n",
353
+ " current_url_suffix = str(current_url).split(\"/\")[-1]\n",
354
+ "\n",
355
+ " ### Use BeautifulSoup and Requests instead of Selenium for link extraction\n",
356
+ " current_page = driver.page_source # html from Selenium instead of BeautifulSoup\n",
357
+ "\n",
358
+ " soup = BeautifulSoup(current_page, 'html.parser')\n",
359
+ "\n",
360
+ " links = soup.find_all('a')\n",
361
+ "\n",
362
+ " # get rid of any bloat in the links from the page\n",
363
+ " links_texts = refine_links(topic, links, current_url_suffix, used_links, used_topics)\n",
364
+ "\n",
365
+ " # best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_topic, labels_list = [text for link, text in links_texts])\n",
366
+ " best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_context.lower(), labels_list = [text.lower() for link, text in links_texts])\n",
367
+ "\n",
368
+ " print(f\"\\nPage: {num_pages}\")\n",
369
+ " print(f\"Current topic: '{topic.title()}'\")\n",
370
+ " print(f\"Current URL: '{current_url}'\")\n",
371
+ " print(f\"Current Topic Context: '{context_sentence}'\")\n",
372
+ " if current_url != target_url:\n",
373
+ " print(f\"Next topic: '{best_label.title()}'. Semantic similarity to '{target_topic.title()}': {round((best_score * 100), 2)}%\")\n",
374
+ "\n",
375
+ " next_link, topic = links_texts[loc_idx]\n",
376
+ "\n",
377
+ " used_links.append(next_link)\n",
378
+ " used_topics.append(topic)\n",
379
+ "\n",
380
+ " if current_url == target_url: # because the target_url is now found through the API\n",
381
+ " print(\"\\n\" + \"-\" * 150)\n",
382
+ " print(f\"\\nFrom '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages, {round(time.time() - start_time, 2)} seconds!\")\n",
383
+ " print(f\"Starting topic: '{starting_topic.title()}': '{used_links[0]}'\")\n",
384
+ " print(f\"Target topic: '{target_topic.title()}': '{target_url}'\\n\")\n",
385
+ " print(\"-\" * 150)\n",
386
+ " driver.quit()\n",
387
+ " break\n",
388
+ "\n",
389
+ " if num_pages == limit:\n",
390
+ " print(\"\\n\" + \"-\" * 150)\n",
391
+ " print(f\"\\nUnfortunately, the model couldn't get from '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages or less.\")\n",
392
+ " print(f\"In {round(time.time() - start_time, 2)} seconds, it got from '{starting_topic.title()}': '{used_links[0]}', to '{used_topics[-1].title()}': '{used_links[-1]}'\")\n",
393
+ " print(f\"\\nTry a different combination to see if it can do it!\\n\")\n",
394
+ " print(\"-\" * 150)\n",
395
+ " driver.quit()\n",
396
+ " break\n",
397
+ "\n",
398
+ "###### Example\n",
399
+ "\n",
400
+ "starting_topic = 'soulja boy'\n",
401
+ "target_topic = 'urine'\n",
402
+ "\n",
403
+ "play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 50)"
404
+ ]
405
+ },
406
+ {
407
+ "cell_type": "code",
408
+ "execution_count": 6,
409
+ "metadata": {},
410
+ "outputs": [],
411
+ "source": [
412
+ "# starting_topic = 'soulja boy'\n",
413
+ "# target_topic = 'fart'\n",
414
+ "\n",
415
+ "# play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 50)"
416
+ ]
417
+ },
418
+ {
419
+ "cell_type": "markdown",
420
+ "metadata": {},
421
+ "source": [
422
+ "# Tracking Stats"
423
+ ]
424
+ },
425
+ {
426
+ "cell_type": "code",
427
+ "execution_count": 73,
428
+ "metadata": {},
429
+ "outputs": [],
430
+ "source": [
431
+ "def play_wiki_game_stats(starting_topic: str, target_topic: str, limit: int = 200):\n",
432
+ "\n",
433
+ " stats_dict = {}\n",
434
+ "\n",
435
+ " ##### Setup Chrome options\n",
436
+ " chrome_options = webdriver.ChromeOptions()\n",
437
+ " chrome_options.add_argument(\"--headless\") # Ensure GUI is off\n",
438
+ " chrome_options.add_argument(\"--no-sandbox\")\n",
439
+ " chrome_options.add_argument(\"--disable-dev-shm-usage\")\n",
440
+ " driver = webdriver.Chrome(options = chrome_options)\n",
441
+ "\n",
442
+ " #### Getting target url, topic, and context\n",
443
+ " driver_target = webdriver.Chrome(options = chrome_options)\n",
444
+ " target_url, target_topic = search_wikipedia(search_term = target_topic)\n",
445
+ " driver_target.get(target_url)\n",
446
+ " target_context = get_topic_context(driver_target)\n",
447
+ " print(target_context)\n",
448
+ " print()\n",
449
+ " driver_target.quit()\n",
450
+ " \n",
451
+ " topic = starting_topic\n",
452
+ " num_pages = 0\n",
453
+ " used_topics = []\n",
454
+ " used_links = []\n",
455
+ " contexts = []\n",
456
+ " sim_to_target_scores = []\n",
457
+ "\n",
458
+ " start_time = time.time()\n",
459
+ "\n",
460
+ " ### BEGIN ###\n",
461
+ "\n",
462
+ " print(\"-\" * 150)\n",
463
+ " print(f\"\\nStarting!\\n\")\n",
464
+ " print(\"-\" * 150)\n",
465
+ "\n",
466
+ " url, topic = search_wikipedia(search_term = starting_topic)\n",
467
+ " driver.get(url)\n",
468
+ " used_topics.append(topic)\n",
469
+ " used_links.append(driver.current_url)\n",
470
+ " sim_to_target_scores.append(most_similar_sentence(target_topic = target_context, labels_list = [topic])[1])\n",
471
+ "\n",
472
+ " while True:\n",
473
+ " # increment the page tracking by 1 for each new page\n",
474
+ " num_pages += 1\n",
475
+ "\n",
476
+ " # if not the first page, navigate to the new page\n",
477
+ " if num_pages > 1:\n",
478
+ " driver.get(next_link)\n",
479
+ "\n",
480
+ " context_sentence = get_topic_context(driver)\n",
481
+ " contexts.append(context_sentence)\n",
482
+ "\n",
483
+ " current_url = driver.current_url\n",
484
+ " current_url_suffix = str(current_url).split(\"/\")[-1]\n",
485
+ "\n",
486
+ " ### Use BeautifulSoup and Requests instead of Selenium for link extraction\n",
487
+ " current_page = driver.page_source # html from Selenium instead of BeautifulSoup\n",
488
+ "\n",
489
+ " soup = BeautifulSoup(current_page, 'html.parser')\n",
490
+ "\n",
491
+ " links = soup.find_all('a')\n",
492
+ "\n",
493
+ " # get rid of any bloat in the links from the page\n",
494
+ " links_texts = refine_links(topic, links, current_url_suffix, used_links, used_topics)\n",
495
+ "\n",
496
+ " best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_context, labels_list = [text for link, text in links_texts])\n",
497
+ "\n",
498
+ " print(f\"\\nPage: {num_pages}\")\n",
499
+ " print(f\"Current topic: '{topic.title()}'\")\n",
500
+ " print(f\"Current URL: '{current_url}'\")\n",
501
+ " print(f\"Current Topic Context: '{context_sentence}'\")\n",
502
+ " if current_url != target_url:\n",
503
+ " print(f\"Next topic: '{best_label.title()}'. Semantic similarity to '{target_topic.title()}': {round((best_score * 100), 2)}%\")\n",
504
+ " \n",
505
+ " next_link, topic = links_texts[loc_idx]\n",
506
+ "\n",
507
+ " # contexts.append(context_sentence)\n",
508
+ "\n",
509
+ " if current_url == target_url: # because the target_url is now found through the API\n",
510
+ " print(\"\\n\" + \"-\" * 150)\n",
511
+ " print(f\"\\nFrom '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages, {round(time.time() - start_time, 2)} seconds!\")\n",
512
+ " print(f\"Starting topic: '{starting_topic.title()}': '{used_links[0]}'\")\n",
513
+ " print(f\"Target topic: '{target_topic.title()}': '{used_links[-1]}'\\n\")\n",
514
+ " print(\"-\" * 150)\n",
515
+ "\n",
516
+ " stats_dict['start_end'] = [f\"{starting_topic}_{target_topic}\" for i in range(num_pages)]\n",
517
+ " stats_dict['topic'] = used_topics\n",
518
+ " stats_dict['context'] = contexts\n",
519
+ " stats_dict['sim_to_target'] = sim_to_target_scores\n",
520
+ " stats_dict['url'] = used_links\n",
521
+ " stats_dict['page_num'] = [i+1 for i in range(num_pages)]\n",
522
+ " driver.quit()\n",
523
+ " return stats_dict\n",
524
+ " break\n",
525
+ "\n",
526
+ " ##### ADD DRAMATIC DELAY HERE #####\n",
527
+ " # time.sleep(0.5)\n",
528
+ " # time.sleep(10)\n",
529
+ "\n",
530
+ " if num_pages == limit:\n",
531
+ " print(\"\\n\" + \"-\" * 150)\n",
532
+ " print(f\"\\nUnfortunately, the model couldn't get from '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages or less.\")\n",
533
+ " print(f\"In {round(time.time() - start_time, 2)} seconds, it got from '{starting_topic.title()}': '{used_links[0]}', to '{used_topics[-1].title()}': '{used_links[-1]}'\")\n",
534
+ " print(f\"\\nTry a different combination to see if it can do it!\\n\")\n",
535
+ " print(\"-\" * 150)\n",
536
+ "\n",
537
+ " stats_dict['start_end'] = [f\"{starting_topic}_{target_topic}\" for i in range(num_pages)]\n",
538
+ " stats_dict['topic'] = used_topics\n",
539
+ " stats_dict['context'] = contexts\n",
540
+ " stats_dict['sim_to_target'] = sim_to_target_scores\n",
541
+ " stats_dict['url'] = used_links\n",
542
+ " stats_dict['page_num'] = [i+1 for i in range(num_pages)]\n",
543
+ " driver.quit()\n",
544
+ " return stats_dict\n",
545
+ " break\n",
546
+ "\n",
547
+ " used_links.append(next_link)\n",
548
+ " used_topics.append(topic)\n",
549
+ " sim_to_target_scores.append(best_score)\n",
550
+ "\n",
551
+ "# starting_topic = 'john mayer'\n",
552
+ "# target_topic = 'fart'\n",
553
+ "\n",
554
+ "# stats_dict = play_wiki_game_stats(starting_topic = starting_topic, target_topic = target_topic, limit = 200)"
555
+ ]
556
+ },
557
+ {
558
+ "cell_type": "code",
559
+ "execution_count": 35,
560
+ "metadata": {},
561
+ "outputs": [
562
+ {
563
+ "name": "stdout",
564
+ "output_type": "stream",
565
+ "text": [
566
+ "dict_keys(['start_end', 'topic', 'context', 'sim_to_target', 'url', 'page_num'])\n",
567
+ "[6, 6, 6, 6, 6, 6]\n"
568
+ ]
569
+ }
570
+ ],
571
+ "source": [
572
+ "# stats_dict['start_end'] = [f\"{starting_topic}_{target_topic}\" for i in range(7)]\n",
573
+ "print(stats_dict.keys())\n",
574
+ "print([len(stats_dict[key]) for key in stats_dict.keys()])"
575
+ ]
576
+ },
577
+ {
578
+ "cell_type": "code",
579
+ "execution_count": 36,
580
+ "metadata": {},
581
+ "outputs": [
582
+ {
583
+ "data": {
584
+ "text/plain": [
585
+ "[0.027460583, 0.20852715, 0.2775123, 0.31147623, 0.4413054, 0.6199604]"
586
+ ]
587
+ },
588
+ "execution_count": 36,
589
+ "metadata": {},
590
+ "output_type": "execute_result"
591
+ }
592
+ ],
593
+ "source": [
594
+ "stats_dict['sim_to_target']"
595
+ ]
596
+ },
597
+ {
598
+ "cell_type": "code",
599
+ "execution_count": 37,
600
+ "metadata": {},
601
+ "outputs": [
602
+ {
603
+ "data": {
604
+ "text/html": [
605
+ "<div>\n",
606
+ "<style scoped>\n",
607
+ " .dataframe tbody tr th:only-of-type {\n",
608
+ " vertical-align: middle;\n",
609
+ " }\n",
610
+ "\n",
611
+ " .dataframe tbody tr th {\n",
612
+ " vertical-align: top;\n",
613
+ " }\n",
614
+ "\n",
615
+ " .dataframe thead th {\n",
616
+ " text-align: right;\n",
617
+ " }\n",
618
+ "</style>\n",
619
+ "<table border=\"1\" class=\"dataframe\">\n",
620
+ " <thead>\n",
621
+ " <tr style=\"text-align: right;\">\n",
622
+ " <th></th>\n",
623
+ " <th>start_end</th>\n",
624
+ " <th>topic</th>\n",
625
+ " <th>context</th>\n",
626
+ " <th>sim_to_target</th>\n",
627
+ " <th>url</th>\n",
628
+ " <th>page_num</th>\n",
629
+ " </tr>\n",
630
+ " </thead>\n",
631
+ " <tbody>\n",
632
+ " <tr>\n",
633
+ " <th>0</th>\n",
634
+ " <td>john mayer_Flatulence</td>\n",
635
+ " <td>John Mayer</td>\n",
636
+ " <td>John Clayton Mayer[1] (/ˈmeɪ.ər/ MAY-ər; born ...</td>\n",
637
+ " <td>0.027461</td>\n",
638
+ " <td>https://en.wikipedia.org/wiki/John_Mayer</td>\n",
639
+ " <td>1</td>\n",
640
+ " </tr>\n",
641
+ " <tr>\n",
642
+ " <th>1</th>\n",
643
+ " <td>john mayer_Flatulence</td>\n",
644
+ " <td>cardiac dysrhythmia</td>\n",
645
+ " <td>Arrhythmias, also known as cardiac arrhythmias...</td>\n",
646
+ " <td>0.208527</td>\n",
647
+ " <td>https://en.wikipedia.org/wiki/Cardiac_dysrhythmia</td>\n",
648
+ " <td>2</td>\n",
649
+ " </tr>\n",
650
+ " <tr>\n",
651
+ " <th>2</th>\n",
652
+ " <td>john mayer_Flatulence</td>\n",
653
+ " <td>prolapse</td>\n",
654
+ " <td>Mitral valve prolapse (MVP) is a valvular hear...</td>\n",
655
+ " <td>0.277512</td>\n",
656
+ " <td>https://en.wikipedia.org/wiki/Mitral_valve_pro...</td>\n",
657
+ " <td>3</td>\n",
658
+ " </tr>\n",
659
+ " <tr>\n",
660
+ " <th>3</th>\n",
661
+ " <td>john mayer_Flatulence</td>\n",
662
+ " <td>gastrointestinal disturbances</td>\n",
663
+ " <td>Gastrointestinal diseases (abbrev</td>\n",
664
+ " <td>0.311476</td>\n",
665
+ " <td>https://en.wikipedia.org/wiki/Gastrointestinal...</td>\n",
666
+ " <td>4</td>\n",
667
+ " </tr>\n",
668
+ " <tr>\n",
669
+ " <th>4</th>\n",
670
+ " <td>john mayer_Flatulence</td>\n",
671
+ " <td>gastrointestinal tract</td>\n",
672
+ " <td>The gastrointestinal tract (GI tract, digestiv...</td>\n",
673
+ " <td>0.441305</td>\n",
674
+ " <td>https://en.wikipedia.org/wiki/Human_gastrointe...</td>\n",
675
+ " <td>5</td>\n",
676
+ " </tr>\n",
677
+ " <tr>\n",
678
+ " <th>5</th>\n",
679
+ " <td>john mayer_Flatulence</td>\n",
680
+ " <td>flatulence</td>\n",
681
+ " <td>Flatulence, in humans, is the expulsion of gas...</td>\n",
682
+ " <td>0.619960</td>\n",
683
+ " <td>https://en.wikipedia.org/wiki/Flatulence</td>\n",
684
+ " <td>6</td>\n",
685
+ " </tr>\n",
686
+ " </tbody>\n",
687
+ "</table>\n",
688
+ "</div>"
689
+ ],
690
+ "text/plain": [
691
+ " start_end topic \\\n",
692
+ "0 john mayer_Flatulence John Mayer \n",
693
+ "1 john mayer_Flatulence cardiac dysrhythmia \n",
694
+ "2 john mayer_Flatulence prolapse \n",
695
+ "3 john mayer_Flatulence gastrointestinal disturbances \n",
696
+ "4 john mayer_Flatulence gastrointestinal tract \n",
697
+ "5 john mayer_Flatulence flatulence \n",
698
+ "\n",
699
+ " context sim_to_target \\\n",
700
+ "0 John Clayton Mayer[1] (/ˈmeɪ.ər/ MAY-ər; born ... 0.027461 \n",
701
+ "1 Arrhythmias, also known as cardiac arrhythmias... 0.208527 \n",
702
+ "2 Mitral valve prolapse (MVP) is a valvular hear... 0.277512 \n",
703
+ "3 Gastrointestinal diseases (abbrev 0.311476 \n",
704
+ "4 The gastrointestinal tract (GI tract, digestiv... 0.441305 \n",
705
+ "5 Flatulence, in humans, is the expulsion of gas... 0.619960 \n",
706
+ "\n",
707
+ " url page_num \n",
708
+ "0 https://en.wikipedia.org/wiki/John_Mayer 1 \n",
709
+ "1 https://en.wikipedia.org/wiki/Cardiac_dysrhythmia 2 \n",
710
+ "2 https://en.wikipedia.org/wiki/Mitral_valve_pro... 3 \n",
711
+ "3 https://en.wikipedia.org/wiki/Gastrointestinal... 4 \n",
712
+ "4 https://en.wikipedia.org/wiki/Human_gastrointe... 5 \n",
713
+ "5 https://en.wikipedia.org/wiki/Flatulence 6 "
714
+ ]
715
+ },
716
+ "execution_count": 37,
717
+ "metadata": {},
718
+ "output_type": "execute_result"
719
+ }
720
+ ],
721
+ "source": [
722
+ "import pandas as pd\n",
723
+ "stats_df = pd.DataFrame(stats_dict)\n",
724
+ "stats_df"
725
+ ]
726
+ },
727
+ {
728
+ "cell_type": "markdown",
729
+ "metadata": {},
730
+ "source": [
731
+ "# Simluations"
732
+ ]
733
+ },
734
+ {
735
+ "cell_type": "code",
736
+ "execution_count": 42,
737
+ "metadata": {},
738
+ "outputs": [
739
+ {
740
+ "name": "stdout",
741
+ "output_type": "stream",
742
+ "text": [
743
+ "110\n"
744
+ ]
745
+ },
746
+ {
747
+ "data": {
748
+ "text/plain": [
749
+ "[('Sushi', 'Mars'),\n",
750
+ " ('Sushi', 'Beethoven'),\n",
751
+ " ('Sushi', 'Mount Everest'),\n",
752
+ " ('Sushi', 'Humpback Whale'),\n",
753
+ " ('Sushi', 'The Great Wall of China')]"
754
+ ]
755
+ },
756
+ "execution_count": 42,
757
+ "metadata": {},
758
+ "output_type": "execute_result"
759
+ }
760
+ ],
761
+ "source": [
762
+ "import itertools\n",
763
+ "\n",
764
+ "unrelated_list = [\n",
765
+ " \"Sushi\",\n",
766
+ " \"Mars\",\n",
767
+ " \"Beethoven\",\n",
768
+ " \"Mount Everest\",\n",
769
+ " \"Humpback Whale\",\n",
770
+ " \"The Great Wall of China\",\n",
771
+ " \"Photography\",\n",
772
+ " \"Pyramids of Egypt\",\n",
773
+ " \"Albert Einstein\",\n",
774
+ " \"Rainforests\",\n",
775
+ " 'buggy'\n",
776
+ "]\n",
777
+ "\n",
778
+ "# Generate all permutations of pairs\n",
779
+ "pair_permutations = list(itertools.permutations(unrelated_list, 2))\n",
780
+ "\n",
781
+ "print(len(pair_permutations)) # no pairs with self\n",
782
+ "pair_permutations[:5]"
783
+ ]
784
+ },
785
+ {
786
+ "cell_type": "code",
787
+ "execution_count": 43,
788
+ "metadata": {},
789
+ "outputs": [
790
+ {
791
+ "name": "stdout",
792
+ "output_type": "stream",
793
+ "text": [
794
+ "{'fruits': ['apple', 'banana', 'orange', 'grapes', 'kiwi'], 'animals': ['cat', 'dog', 'elephant', 'tiger', 'lion'], 'cities': ['New York', 'London'], 'colors': ['red', 'blue']}\n"
795
+ ]
796
+ }
797
+ ],
798
+ "source": [
799
+ "# Initial dictionary\n",
800
+ "main_dict = {\n",
801
+ " 'fruits': ['apple', 'banana', 'orange'],\n",
802
+ " 'animals': ['cat', 'dog', 'elephant'],\n",
803
+ "}\n",
804
+ "\n",
805
+ "# Function to add a new dictionary to the main_dict\n",
806
+ "def add_to_main_dict(main_dict, new_dict):\n",
807
+ " for key, value in new_dict.items():\n",
808
+ " if key in main_dict:\n",
809
+ " main_dict[key].extend(value)\n",
810
+ " else:\n",
811
+ " main_dict[key] = value\n",
812
+ "\n",
813
+ "# New dictionary to add to main_dict\n",
814
+ "new_dict1 = {\n",
815
+ " 'fruits': ['grapes', 'kiwi'],\n",
816
+ " 'cities': ['New York', 'London'],\n",
817
+ "}\n",
818
+ "\n",
819
+ "# Add new_dict1 to main_dict\n",
820
+ "add_to_main_dict(main_dict, new_dict1)\n",
821
+ "\n",
822
+ "# New dictionary to add to main_dict\n",
823
+ "new_dict2 = {\n",
824
+ " 'animals': ['tiger', 'lion'],\n",
825
+ " 'colors': ['red', 'blue'],\n",
826
+ "}\n",
827
+ "\n",
828
+ "# Add new_dict2 to main_dict\n",
829
+ "add_to_main_dict(main_dict, new_dict2)\n",
830
+ "\n",
831
+ "# Print the updated main_dict\n",
832
+ "print(main_dict)\n"
833
+ ]
834
+ },
835
+ {
836
+ "cell_type": "code",
837
+ "execution_count": 63,
838
+ "metadata": {},
839
+ "outputs": [
840
+ {
841
+ "name": "stdout",
842
+ "output_type": "stream",
843
+ "text": [
844
+ "Start: 'Sushi'. End: 'Mars'. Page: 8\n",
845
+ "Start: 'Sushi'. End: 'Ludwig van Beethoven'. Page: 9\n",
846
+ "Start: 'Sushi'. End: 'Mount Everest'. Page: 4\n",
847
+ "Start: 'Sushi'. End: 'Humpback whale'. Page: 3\n",
848
+ "Start: 'Sushi'. End: 'Great Wall of China'. Page: 7\n",
849
+ "Start: 'Sushi'. End: 'Photography'. Page: 29\n",
850
+ "Start: 'Sushi'. End: 'Egyptian pyramids'. Page: 23\n",
851
+ "Start: 'Sushi'. End: 'Albert Einstein'. Page: 12\n",
852
+ "Start: 'Sushi'. End: 'Rainforest'. Page: 7\n",
853
+ "Start: 'Sushi'. End: 'Buggy'. Page: 200\n",
854
+ "Start: 'Mars'. End: 'Sushi'. Page: 19\n",
855
+ "Start: 'Mars'. End: 'Ludwig van Beethoven'. Page: 4\n",
856
+ "Start: 'Mars'. End: 'Mount Everest'. Page: 2\n",
857
+ "Start: 'Mars'. End: 'Humpback whale'. Page: 4\n",
858
+ "Start: 'Mars'. End: 'Great Wall of China'. Page: 13\n",
859
+ "Start: 'Mars'. End: 'Photography'. Page: 32\n",
860
+ "Start: 'Mars'. End: 'Egyptian pyramids'. Page: 3\n",
861
+ "Start: 'Mars'. End: 'Albert Einstein'. Page: 6\n",
862
+ "Start: 'Mars'. End: 'Rainforest'. Page: 7\n",
863
+ "Start: 'Mars'. End: 'Buggy'. Page: 200\n",
864
+ "Start: 'Beethoven'. End: 'Sushi'. Page: 17\n",
865
+ "Start: 'Beethoven'. End: 'Mars'. Page: 3\n",
866
+ "Start: 'Beethoven'. End: 'Mount Everest'. Page: 6\n",
867
+ "Start: 'Beethoven'. End: 'Humpback whale'. Page: 4\n",
868
+ "Start: 'Beethoven'. End: 'Great Wall of China'. Page: 14\n",
869
+ "Start: 'Beethoven'. End: 'Photography'. Page: 31\n",
870
+ "Start: 'Beethoven'. End: 'Egyptian pyramids'. Page: 8\n",
871
+ "Start: 'Beethoven'. End: 'Albert Einstein'. Page: 3\n",
872
+ "Start: 'Beethoven'. End: 'Rainforest'. Page: 15\n",
873
+ "Start: 'Beethoven'. End: 'Buggy'. Page: 200\n",
874
+ "Start: 'Mount Everest'. End: 'Sushi'. Page: 14\n",
875
+ "Start: 'Mount Everest'. End: 'Mars'. Page: 2\n",
876
+ "Start: 'Mount Everest'. End: 'Ludwig van Beethoven'. Page: 23\n",
877
+ "Start: 'Mount Everest'. End: 'Humpback whale'. Page: 7\n",
878
+ "Start: 'Mount Everest'. End: 'Great Wall of China'. Page: 6\n",
879
+ "Start: 'Mount Everest'. End: 'Photography'. Page: 29\n",
880
+ "Start: 'Mount Everest'. End: 'Egyptian pyramids'. Page: 8\n",
881
+ "Start: 'Mount Everest'. End: 'Albert Einstein'. Page: 5\n",
882
+ "Start: 'Mount Everest'. End: 'Rainforest'. Page: 7\n",
883
+ "Start: 'Mount Everest'. End: 'Buggy'. Page: 200\n",
884
+ "Start: 'Humpback Whale'. End: 'Sushi'. Page: 9\n",
885
+ "Start: 'Humpback Whale'. End: 'Mars'. Page: 19\n",
886
+ "Start: 'Humpback Whale'. End: 'Ludwig van Beethoven'. Page: 29\n",
887
+ "Start: 'Humpback Whale'. End: 'Mount Everest'. Page: 5\n",
888
+ "Start: 'Humpback Whale'. End: 'Great Wall of China'. Page: 12\n",
889
+ "Start: 'Humpback Whale'. End: 'Photography'. Page: 5\n",
890
+ "Start: 'Humpback Whale'. End: 'Egyptian pyramids'. Page: 5\n",
891
+ "Start: 'Humpback Whale'. End: 'Albert Einstein'. Page: 8\n",
892
+ "Start: 'Humpback Whale'. End: 'Rainforest'. Page: 3\n",
893
+ "Start: 'Humpback Whale'. End: 'Buggy'. Page: 200\n",
894
+ "Start: 'The Great Wall of China'. End: 'Sushi'. Page: 7\n",
895
+ "Start: 'The Great Wall of China'. End: 'Mars'. Page: 13\n",
896
+ "Start: 'The Great Wall of China'. End: 'Ludwig van Beethoven'. Page: 10\n",
897
+ "Start: 'The Great Wall of China'. End: 'Mount Everest'. Page: 3\n",
898
+ "Start: 'The Great Wall of China'. End: 'Humpback whale'. Page: 11\n",
899
+ "Start: 'The Great Wall of China'. End: 'Photography'. Page: 48\n",
900
+ "Start: 'The Great Wall of China'. End: 'Egyptian pyramids'. Page: 5\n",
901
+ "Start: 'The Great Wall of China'. End: 'Albert Einstein'. Page: 7\n",
902
+ "Start: 'The Great Wall of China'. End: 'Rainforest'. Page: 4\n",
903
+ "Start: 'The Great Wall of China'. End: 'Buggy'. Page: 200\n",
904
+ "Start: 'Photography'. End: 'Sushi'. Page: 15\n",
905
+ "Start: 'Photography'. End: 'Mars'. Page: 13\n",
906
+ "Start: 'Photography'. End: 'Ludwig van Beethoven'. Page: 26\n",
907
+ "Start: 'Photography'. End: 'Mount Everest'. Page: 8\n",
908
+ "Start: 'Photography'. End: 'Humpback whale'. Page: 10\n",
909
+ "Start: 'Photography'. End: 'Great Wall of China'. Page: 3\n",
+ "Start: 'Photography'. End: 'Egyptian pyramids'. Page: 6\n",
+ "Start: 'Photography'. End: 'Albert Einstein'. Page: 21\n",
+ "Start: 'Photography'. End: 'Rainforest'. Page: 8\n",
+ "Start: 'Photography'. End: 'Buggy'. Page: 200\n",
+ "Start: 'Pyramids of Egypt'. End: 'Sushi'. Page: 7\n",
+ "Start: 'Pyramids of Egypt'. End: 'Mars'. Page: 7\n",
+ "Start: 'Pyramids of Egypt'. End: 'Ludwig van Beethoven'. Page: 62\n",
+ "Start: 'Pyramids of Egypt'. End: 'Mount Everest'. Page: 8\n",
+ "Start: 'Pyramids of Egypt'. End: 'Humpback whale'. Page: 10\n",
+ "Start: 'Pyramids of Egypt'. End: 'Great Wall of China'. Page: 8\n",
+ "Start: 'Pyramids of Egypt'. End: 'Photography'. Page: 31\n",
+ "Start: 'Pyramids of Egypt'. End: 'Albert Einstein'. Page: 3\n",
+ "Start: 'Pyramids of Egypt'. End: 'Rainforest'. Page: 10\n",
+ "Start: 'Pyramids of Egypt'. End: 'Buggy'. Page: 200\n",
+ "Start: 'Albert Einstein'. End: 'Sushi'. Page: 10\n",
+ "Start: 'Albert Einstein'. End: 'Mars'. Page: 3\n",
+ "Start: 'Albert Einstein'. End: 'Ludwig van Beethoven'. Page: 2\n",
+ "Start: 'Albert Einstein'. End: 'Mount Everest'. Page: 5\n",
+ "Start: 'Albert Einstein'. End: 'Humpback whale'. Page: 18\n",
+ "Start: 'Albert Einstein'. End: 'Great Wall of China'. Page: 8\n",
+ "Start: 'Albert Einstein'. End: 'Photography'. Page: 42\n",
+ "Start: 'Albert Einstein'. End: 'Egyptian pyramids'. Page: 7\n",
+ "Start: 'Albert Einstein'. End: 'Rainforest'. Page: 6\n",
+ "Start: 'Albert Einstein'. End: 'Buggy'. Page: 200\n",
+ "Start: 'Rainforests'. End: 'Sushi'. Page: 3\n",
+ "Start: 'Rainforests'. End: 'Mars'. Page: 7\n",
+ "Start: 'Rainforests'. End: 'Ludwig van Beethoven'. Page: 18\n",
+ "Start: 'Rainforests'. End: 'Mount Everest'. Page: 7\n",
+ "Start: 'Rainforests'. End: 'Humpback whale'. Page: 4\n",
+ "Start: 'Rainforests'. End: 'Great Wall of China'. Page: 4\n",
+ "Start: 'Rainforests'. End: 'Photography'. Page: 38\n",
+ "Start: 'Rainforests'. End: 'Egyptian pyramids'. Page: 7\n",
+ "Start: 'Rainforests'. End: 'Albert Einstein'. Page: 8\n",
+ "Start: 'Rainforests'. End: 'Buggy'. Page: 200\n",
+ "Start: 'buggy'. End: 'Sushi'. Page: 6\n",
+ "Start: 'buggy'. End: 'Mars'. Page: 8\n",
+ "Start: 'buggy'. End: 'Ludwig van Beethoven'. Page: 28\n",
+ "Start: 'buggy'. End: 'Mount Everest'. Page: 8\n",
+ "Start: 'buggy'. End: 'Humpback whale'. Page: 19\n",
+ "Start: 'buggy'. End: 'Great Wall of China'. Page: 12\n",
+ "Start: 'buggy'. End: 'Photography'. Page: 54\n",
+ "Start: 'buggy'. End: 'Egyptian pyramids'. Page: 9\n",
+ "Start: 'buggy'. End: 'Albert Einstein'. Page: 35\n",
+ "Start: 'buggy'. End: 'Rainforest'. Page: 9\n"
+ ]
+ }
+ ],
+ "source": [
+ "def play_wiki_game_stats(starting_topic: str, target_topic: str, limit: int = 200):\n",
+ "\n",
+ " stats_dict = {}\n",
+ "\n",
+ " ##### Setup Chrome options\n",
+ " chrome_options = webdriver.ChromeOptions()\n",
+ " chrome_options.add_argument(\"--headless\") # Ensure GUI is off\n",
+ " chrome_options.add_argument(\"--no-sandbox\")\n",
+ " chrome_options.add_argument(\"--disable-dev-shm-usage\")\n",
+ " driver = webdriver.Chrome(options = chrome_options)\n",
+ "\n",
+ " #### Getting target url, topic, and context\n",
+ " driver_target = webdriver.Chrome(options = chrome_options)\n",
+ " target_url, target_topic = search_wikipedia(search_term = target_topic)\n",
+ " driver_target.get(target_url)\n",
+ " target_context = get_topic_context(driver_target)\n",
+ " driver_target.quit()\n",
+ " \n",
+ " topic = starting_topic\n",
+ " num_pages = 0\n",
+ " used_topics = []\n",
+ " used_links = []\n",
+ " contexts = []\n",
+ " sim_to_target_scores = []\n",
+ "\n",
+ " start_time = time.time()\n",
+ "\n",
+ " url, topic = search_wikipedia(search_term = starting_topic)\n",
+ " driver.get(url)\n",
+ " used_topics.append(topic)\n",
+ " used_links.append(driver.current_url)\n",
+ " sim_to_target_scores.append(most_similar_sentence(target_topic = target_context, labels_list = [topic])[1])\n",
+ "\n",
+ " while True:\n",
+ " # increment the page tracking by 1 for each new page\n",
+ " num_pages += 1\n",
+ "\n",
+ " # if not the first page, navigate to the new page\n",
+ " if num_pages > 1:\n",
+ " driver.get(next_link)\n",
+ "\n",
+ " context_sentence = get_topic_context(driver)\n",
+ " contexts.append(context_sentence)\n",
+ "\n",
+ " current_url = driver.current_url\n",
+ " current_url_suffix = str(current_url).split(\"/\")[-1]\n",
+ "\n",
+ " ### Use BeautifulSoup and Requests instead of Selenium for link extraction\n",
+ " current_page = driver.page_source # html from Selenium instead of BeautifulSoup\n",
+ "\n",
+ " soup = BeautifulSoup(current_page, 'html.parser')\n",
+ "\n",
+ " links = soup.find_all('a')\n",
+ "\n",
+ " # get rid of any bloat in the links from the page\n",
+ " links_texts = refine_links(topic, links, current_url_suffix, used_links, used_topics)\n",
+ "\n",
+ " best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_context, labels_list = [text for link, text in links_texts])\n",
+ "\n",
+ " print(f\"Start: '{starting_topic}'. End: '{target_topic}'. Page: {num_pages}\", end = '\\r')\n",
+ "\n",
+ " next_link, topic = links_texts[loc_idx]\n",
+ "\n",
+ " if current_url == target_url: # because the target_url is now found through the API\n",
+ " print()\n",
+ " stats_dict['start'] = [starting_topic for i in range(num_pages)]\n",
+ " stats_dict['target'] = [target_topic for i in range(num_pages)]\n",
+ " stats_dict['topic'] = used_topics\n",
+ " stats_dict['context'] = contexts\n",
+ " stats_dict['sim_to_target'] = sim_to_target_scores\n",
+ " # stats_dict['time_seconds'] = times\n",
+ " stats_dict['url'] = used_links\n",
+ " stats_dict['page_num'] = [i+1 for i in range(num_pages)]\n",
+ " add_to_main_dict(master_dict, stats_dict)\n",
+ " driver.quit()\n",
+ " break\n",
+ "\n",
+ " if num_pages == limit:\n",
+ " print()\n",
+ " stats_dict['start'] = [starting_topic for i in range(num_pages)]\n",
+ " stats_dict['target'] = [target_topic for i in range(num_pages)]\n",
+ " stats_dict['topic'] = used_topics\n",
+ " stats_dict['context'] = contexts\n",
+ " stats_dict['sim_to_target'] = sim_to_target_scores\n",
+ " stats_dict['url'] = used_links\n",
+ " stats_dict['page_num'] = [i+1 for i in range(num_pages)]\n",
+ " driver.quit()\n",
+ " add_to_main_dict(master_dict, stats_dict)\n",
+ " break\n",
+ "\n",
+ " used_links.append(next_link)\n",
+ " used_topics.append(topic)\n",
+ " sim_to_target_scores.append(best_score)\n",
+ "\n",
+ "master_dict = {}\n",
+ "master_dict['start'] = []\n",
+ "master_dict['target'] = []\n",
+ "master_dict['topic'] = []\n",
+ "master_dict['context'] = []\n",
+ "master_dict['sim_to_target'] = []\n",
+ "master_dict['url'] = []\n",
+ "master_dict['page_num'] = []\n",
+ "\n",
+ "# starting_topic = 'john mayer'\n",
+ "# target_topic = 'fart'\n",
+ "\n",
+ "for starting_topic, target_topic in pair_permutations:\n",
+ " play_wiki_game_stats(starting_topic = starting_topic, target_topic = target_topic, limit = 200)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "dict_keys(['start', 'target', 'topic', 'context', 'sim_to_target', 'url', 'page_num'])\n",
+ "[3238, 3238, 3238, 3238, 3238, 3238, 3238]\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(master_dict.keys())\n",
+ "print([len(master_dict[key]) for key in master_dict.keys()])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>start</th>\n",
+ " <th>target</th>\n",
+ " <th>topic</th>\n",
+ " <th>context</th>\n",
+ " <th>sim_to_target</th>\n",
+ " <th>url</th>\n",
+ " <th>page_num</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>Sushi</td>\n",
+ " <td>Mars</td>\n",
+ " <td>Sushi</td>\n",
+ " <td>Sushi (すし, 寿司, 鮨, 鮓, pronounced [sɯɕiꜜ] or [sɯ...</td>\n",
+ " <td>0.046150</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Sushi</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>Sushi</td>\n",
+ " <td>Mars</td>\n",
+ " <td>Planet Money</td>\n",
+ " <td>Planet Money is an American podcast and blog p...</td>\n",
+ " <td>0.494693</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Planet_Money</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>2</th>\n",
+ " <td>Sushi</td>\n",
+ " <td>Mars</td>\n",
+ " <td>Pacifica Foundation</td>\n",
+ " <td>Pacifica Foundation is an American non-profit ...</td>\n",
+ " <td>0.186643</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Pacifica_Foundation</td>\n",
+ " <td>3</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3</th>\n",
+ " <td>Sushi</td>\n",
+ " <td>Mars</td>\n",
+ " <td>Mars Hill</td>\n",
+ " <td>The Mars Hill Network is a network of Christia...</td>\n",
+ " <td>0.466525</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Mars_Hill_Network</td>\n",
+ " <td>4</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4</th>\n",
+ " <td>Sushi</td>\n",
+ " <td>Mars</td>\n",
+ " <td>Equinox Mountain</td>\n",
+ " <td>Equinox Mountain is the highest peak of the Ta...</td>\n",
+ " <td>0.196999</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Equinox_Mountain</td>\n",
+ " <td>5</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>...</th>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3233</th>\n",
+ " <td>buggy</td>\n",
+ " <td>Rainforest</td>\n",
+ " <td>Forests of the United States</td>\n",
+ " <td>It has been estimated that before European set...</td>\n",
+ " <td>0.437653</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Forests_of_the_U...</td>\n",
+ " <td>5</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3234</th>\n",
+ " <td>buggy</td>\n",
+ " <td>Rainforest</td>\n",
+ " <td>boreal forests</td>\n",
+ " <td>Taiga (/ˈtaɪɡə/; Russian: тайга́; relates to M...</td>\n",
+ " <td>0.474700</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Boreal_forest</td>\n",
+ " <td>6</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3235</th>\n",
+ " <td>buggy</td>\n",
+ " <td>Rainforest</td>\n",
+ " <td>Deciduous forests</td>\n",
+ " <td>Temperate deciduous or temperate broad-leaf fo...</td>\n",
+ " <td>0.501480</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Temperate_decidu...</td>\n",
+ " <td>7</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3236</th>\n",
+ " <td>buggy</td>\n",
+ " <td>Rainforest</td>\n",
+ " <td>Tropical deciduous forest</td>\n",
+ " <td>The tropical and subtropical dry broadleaf for...</td>\n",
+ " <td>0.480779</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Tropical_deciduo...</td>\n",
+ " <td>8</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3237</th>\n",
+ " <td>buggy</td>\n",
+ " <td>Rainforest</td>\n",
+ " <td>rainforests</td>\n",
+ " <td>Rainforests are forests characterized by a clo...</td>\n",
+ " <td>0.482825</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Rainforest</td>\n",
+ " <td>9</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "<p>3238 rows × 7 columns</p>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " start target topic \\\n",
+ "0 Sushi Mars Sushi \n",
+ "1 Sushi Mars Planet Money \n",
+ "2 Sushi Mars Pacifica Foundation \n",
+ "3 Sushi Mars Mars Hill \n",
+ "4 Sushi Mars Equinox Mountain \n",
+ "... ... ... ... \n",
+ "3233 buggy Rainforest Forests of the United States \n",
+ "3234 buggy Rainforest boreal forests \n",
+ "3235 buggy Rainforest Deciduous forests \n",
+ "3236 buggy Rainforest Tropical deciduous forest \n",
+ "3237 buggy Rainforest rainforests \n",
+ "\n",
+ " context sim_to_target \\\n",
+ "0 Sushi (すし, 寿司, 鮨, 鮓, pronounced [sɯɕiꜜ] or [sɯ... 0.046150 \n",
+ "1 Planet Money is an American podcast and blog p... 0.494693 \n",
+ "2 Pacifica Foundation is an American non-profit ... 0.186643 \n",
+ "3 The Mars Hill Network is a network of Christia... 0.466525 \n",
+ "4 Equinox Mountain is the highest peak of the Ta... 0.196999 \n",
+ "... ... ... \n",
+ "3233 It has been estimated that before European set... 0.437653 \n",
+ "3234 Taiga (/ˈtaɪɡə/; Russian: тайга́; relates to M... 0.474700 \n",
+ "3235 Temperate deciduous or temperate broad-leaf fo... 0.501480 \n",
+ "3236 The tropical and subtropical dry broadleaf for... 0.480779 \n",
+ "3237 Rainforests are forests characterized by a clo... 0.482825 \n",
+ "\n",
+ " url page_num \n",
+ "0 https://en.wikipedia.org/wiki/Sushi 1 \n",
+ "1 https://en.wikipedia.org/wiki/Planet_Money 2 \n",
+ "2 https://en.wikipedia.org/wiki/Pacifica_Foundation 3 \n",
+ "3 https://en.wikipedia.org/wiki/Mars_Hill_Network 4 \n",
+ "4 https://en.wikipedia.org/wiki/Equinox_Mountain 5 \n",
+ "... ... ... \n",
+ "3233 https://en.wikipedia.org/wiki/Forests_of_the_U... 5 \n",
+ "3234 https://en.wikipedia.org/wiki/Boreal_forest 6 \n",
+ "3235 https://en.wikipedia.org/wiki/Temperate_decidu... 7 \n",
+ "3236 https://en.wikipedia.org/wiki/Tropical_deciduo... 8 \n",
+ "3237 https://en.wikipedia.org/wiki/Rainforest 9 \n",
+ "\n",
+ "[3238 rows x 7 columns]"
+ ]
+ },
+ "execution_count": 66,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "master_df = pd.DataFrame(master_dict)\n",
+ "master_df.to_csv(\"data/3238x7.csv\", index = False)\n",
+ "master_df"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "base",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.2"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
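
The final cell above writes every recorded run to data/3238x7.csv with the seven master_dict columns (start, target, topic, context, sim_to_target, url, page_num). A minimal sketch, not part of the committed notebook, for loading that file and summarising path lengths per start/target pair; it assumes the CSV exists at that path and that pandas is installed:

import pandas as pd

# Load the stats dump written by wikigame_rnd.ipynb above.
df = pd.read_csv("data/3238x7.csv")

# One row per visited page, so the path length for a (start, target) run
# is the largest page_num recorded for that pair.
path_lengths = (
    df.groupby(["start", "target"], as_index=False)["page_num"]
    .max()
    .rename(columns={"page_num": "pages_visited"})
)

# Runs that stopped at the 200-page limit never reached the target.
path_lengths["reached_target"] = path_lengths["pages_visited"] < 200

print(path_lengths.sort_values("pages_visited").head(10))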