KAI MAURIN-JONES committed on
Commit 9511a2d
1 Parent(s): 3a9722c

beta version 2 - full updates

Files changed (7)
  1. app.py +3 -1
  2. censored.txt +1 -0
  3. data/3238x7.csv +0 -0
  4. packages.txt +1 -0
  5. requirements.txt +6 -3
  6. wikigame_app2.py +305 -0
  7. wikigame_rnd.ipynb +1315 -0
app.py CHANGED
@@ -1,5 +1,6 @@
  import streamlit as st
  from wiki_game_st_bs4 import *
+ from wikigame_app2 import *
 
  # Set the title of the app
  st.title("Wiki Game (BETA)")
@@ -34,4 +35,5 @@ elif slider == 1:
 
  if start_topic and end_topic:
      if st.button("GO!"):
-         play_wiki_game(starting_topic = start_topic, target_topic = end_topic, limit = limit_pages, delay = delay)
+         # play_wiki_game(starting_topic = start_topic, target_topic = end_topic, limit = limit_pages, delay = delay)
+         play_wiki_game_2(starting_topic = start_topic, target_topic = end_topic, limit = limit_pages, delay = delay)
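For context on the call above: this hunk only shows the GO! handler, so the widgets that supply start_topic, end_topic, limit_pages, and delay sit outside the diff. A minimal sketch of how they are presumably wired up (widget types, labels, and defaults here are assumptions, not taken from app.py):

import streamlit as st
from wikigame_app2 import play_wiki_game_2  # app.py uses a star import; the explicit name is shown for clarity

st.title("Wiki Game (BETA)")

# Hypothetical inputs -- the real app.py defines these outside the hunk shown above
start_topic = st.text_input("Starting topic")
end_topic = st.text_input("Target topic")
limit_pages = st.slider("Page limit", min_value=10, max_value=100, value=50)        # assumed widget and default
delay = st.slider("Delay per page (seconds)", min_value=0, max_value=10, value=0)   # assumed widget and default

if start_topic and end_topic:
    if st.button("GO!"):
        play_wiki_game_2(starting_topic = start_topic, target_topic = end_topic, limit = limit_pages, delay = delay)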
censored.txt ADDED
@@ -0,0 +1 @@
+ nigger
data/3238x7.csv ADDED
The diff for this file is too large to render. See raw diff
 
packages.txt ADDED
@@ -0,0 +1 @@
+ chromium
requirements.txt CHANGED
@@ -1,8 +1,11 @@
  altair<5
- selenium==4.10.0
+ # selenium==4.10.0
  beautifulsoup4==4.11.1
  numpy==1.23.5
  tensorflow==2.10.0
  tensorflow-hub==0.14.0
- jellyfish==0.11.2
- streamlit==1.21.0
+ # jellyfish==0.11.2
+ # streamlit==1.21.0
+ streamlit
+ seleniumbase
+ webdriver-manager
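A note on this change: the selenium pin is commented out (seleniumbase declares its own selenium dependency, so the library is still installed transitively), streamlit is left unpinned, and webdriver-manager is added so a chromedriver matching the chromium binary from packages.txt can be fetched at runtime. A minimal sketch of the headless setup these packages enable, mirroring the get_driver() helper in wikigame_app2.py below:

# Minimal sketch, assuming a chromium/chrome binary is on the system (packages.txt provides chromium)
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")

# webdriver-manager downloads and caches a compatible chromedriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.get("https://en.wikipedia.org/wiki/Main_Page")
print(driver.title)
driver.quit()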
wikigame_app2.py ADDED
@@ -0,0 +1,305 @@
+ #### For scraping/webpage processing
+ import requests
+ import json # specifically for wikipedia api
+ from selenium import webdriver
+ from selenium.webdriver.chrome.options import Options
+ from selenium.webdriver.chrome.service import Service
+ from webdriver_manager.chrome import ChromeDriverManager
+ from selenium.webdriver.common.by import By
+ from selenium.webdriver.common.keys import Keys
+ from bs4 import BeautifulSoup
+ 
+ #### For timing
+ import time
+ 
+ #### For app
+ import streamlit as st
+ from collections import deque # for printouts
+ 
+ #### For semantic similarity model
+ # !pip install tensorflow tensorflow-hub
+ import tensorflow as tf
+ import tensorflow_hub as hub
+ import numpy as np
+ embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4") # Load the pre-trained Universal Sentence Encoder -- accessible at same link
+ 
+ # # @st.experimental_singleton
+ # @st.cache_resource
+ # def get_driver():
+ #     return webdriver.Chrome(service = Service(ChromeDriverManager().install()), options = options)
+ 
+ # import os, sys
+ 
+ # @st.cache_resource
+ # def installff():
+ #     os.system('sbase install geckodriver')
+ #     os.system('ln -s /home/appuser/venv/lib/python3.7/site-packages/seleniumbase/drivers/geckodriver /home/appuser/venv/bin/geckodriver')
+ 
+ # _ = installff()
+ # from selenium import webdriver
+ # from selenium.webdriver import FirefoxOptions
+ # opts = FirefoxOptions()
+ # opts.add_argument("--headless")
+ # driver = webdriver.Firefox(options=opts)
+ # driver_target = webdriver.Firefox(options=opts)
+ 
+ # browser.get('http://example.com')
+ 
+ # driver.get("http://example.com")
+ 
+ # from selenium import webdriver
+ # from selenium.common.exceptions import TimeoutException
+ # from selenium.webdriver.common.by import By
+ # from selenium.webdriver.firefox.options import Options
+ # from selenium.webdriver.firefox.service import Service
+ # from selenium.webdriver.support import expected_conditions as EC
+ # from selenium.webdriver.support.ui import WebDriverWait
+ # from webdriver_manager.firefox import GeckoDriverManager
+ 
+ # # URL = ""
+ # TIMEOUT = 20
+ 
+ # # st.title("Test Selenium")
+ 
+ # firefoxOptions = Options()
+ # firefoxOptions.add_argument("--headless")
+ # service = Service(GeckoDriverManager().install())
+ # driver = webdriver.Firefox(
+ #     options=firefoxOptions,
+ #     service=service,
+ # )
+ # driver_target = webdriver.Firefox(
+ #     options=firefoxOptions,
+ #     service=service,
+ # )
+ 
+ import streamlit as st
+ 
+ from selenium import webdriver
+ from selenium.webdriver.chrome.options import Options
+ from selenium.webdriver.chrome.service import Service
+ from webdriver_manager.chrome import ChromeDriverManager
+ 
+ @st.cache_resource
+ def get_driver():
+     return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
+ 
+ options = Options()
+ options.add_argument('--disable-gpu')
+ options.add_argument('--headless')
+ 
+ driver = get_driver()
+ driver_target = get_driver()
+ # driver.get('http://example.com')
+ 
+ # st.code(driver.page_source)
+ 
+ # Initialize an empty deque
+ messages = deque(maxlen = 1000) # after 1000 links, it'll start popping things. The model should always timeout before this, since most people won't have the patience to make it last this long
+ 
+ def update_messages(message):
+     # Add the new message to the start of deque
+     messages.appendleft(message)
+     # Use a placeholder
+     placeholder = st.empty()
+     # Clear the placeholder and add all the messages from the deque
+     placeholder.text('') # clears the placeholder
+     for msg in messages:
+         placeholder.text(msg)
+ 
+ def most_similar_sentence(target_topic, labels_list):
+     # Encode the context sentence and all sentences in the list
+     context_embedding = embed([target_topic])[0]
+     sentence_embeddings = embed(labels_list)
+ 
+     # Calculate cosine similarities between the context sentence and each sentence in the list
+     similarities = np.inner(context_embedding, sentence_embeddings)
+ 
+     # Find the index of the most similar sentence
+     most_similar_index = np.argmax(similarities)
+ 
+     return labels_list[most_similar_index], similarities[most_similar_index], most_similar_index
+ 
+ def search_wikipedia(search_term):
+     # Define the endpoint
+     endpoint = "https://en.wikipedia.org/w/api.php"
+ 
+     # Define the search parameters
+     params = {
+         "action": "query",
+         "format": "json",
+         "list": "search",
+         "srsearch": search_term
+     }
+ 
+     # Send a GET request to the endpoint with your parameters
+     response = requests.get(url = endpoint, params = params)
+ 
+     # Parse the results as JSON
+     data = json.loads(response.text)
+ 
+     # Get the title of the first result (this will be used as the page title in the next step)
+     page_title = data["query"]["search"][0]["title"]
+ 
+     if "may refer to" in data["query"]["search"][0]["snippet"].lower():
+         page_title = data["query"]["search"][1]["title"]
+ 
+     # Construct the URL of the Wikipedia page
+     page_url = "https://en.wikipedia.org/wiki/{}".format(page_title.replace(" ", "_"))
+ 
+     return page_url, page_title
+ 
+ def get_topic_context(driver, more = False):
+     # Find the first paragraph of the main article
+     first_paragraph = driver.find_element(By.CSS_SELECTOR, "div.mw-parser-output > p:not(.mw-empty-elt)").text
+ 
+     if more:
+         context_sentence = ". ".join(first_paragraph.split(". ")[:5])
+     else:
+         context_sentence = first_paragraph.split(". ")[0]
+ 
+     return context_sentence
+ 
+ # bad_words = [word for word in open("censored.txt", "r").readlines()]
+ bad_words = [word.strip() for word in open("censored.txt", "r").readlines()]
+ 
+ def refine_links(topic, links, current_url_suffix, used_links, used_topics, censor = False):
+ 
+     links_texts = []
+ 
+     # Iterate through the links and extract their URLs
+     for link in links:
+         link_url = link.get('href')
+         if link_url and link_url.startswith("/wiki/"):
+             link_url = "https://en.wikipedia.org" + link_url
+             link_text = link.text.strip() # Get the text and remove leading/trailing spaces
+ 
+             # make sure they are both not None
+             if link_text and current_url_suffix not in link_url:
+ 
+                 if link_url not in used_links and link_text.lower() not in [topic.lower() for topic in used_topics]:
+ 
+                     # eliminates topic duplicates, non-wiki links, and wiki-help pages (non-content pages)
+                     if topic.lower() not in link_url.lower() and "en.wikipedia.org/wiki/" in link_url and ":" not in "".join(link_url.split("/")[1:]) and "Main_Page" != str(link_url.split("/")[-1]):
+ 
+                         # censoring if needed
+                         if censor:
+                             if not any(word1.lower() in bad_words for word1 in [word.lower() for word in link_text.split()]):
+                                 links_texts.append((link_url, link_text))
+                         else:
+                             links_texts.append((link_url, link_text))
+ 
+     return links_texts
+ 
+ def play_wiki_game_2(starting_topic: str, target_topic: str, limit: int = 100, delay: int = 0):
+ 
+     ##### Setup Chrome options
+     # chrome_options = webdriver.ChromeOptions()
+     # chrome_options.add_argument("--headless") # Ensure GUI is off
+     # chrome_options.add_argument("--no-sandbox")
+     # chrome_options.add_argument("--disable-dev-shm-usage")
+     # driver = webdriver.Chrome(options = chrome_options)
+ 
+     # options = Options()
+     # options.add_argument('--disable-gpu')
+     # options.add_argument('--headless')
+     # driver = get_driver()
+     # driver = webdriver.Firefox(options=opts)
+     # driver_target = webdriver.Firefox(options=opts)
+ 
+ 
+     #### Getting target url, topic, and context
+     # driver_target = webdriver.Chrome(options = chrome_options)
+     # driver_target = get_driver()
+     target_url, target_topic = search_wikipedia(search_term = target_topic)
+     driver_target.get(target_url)
+     target_context = get_topic_context(driver_target, more = True)
+     # update_messages(target_context)
+     driver_target.quit()
+ 
+     topic = starting_topic
+     num_pages = 0
+     used_topics = []
+     used_links = []
+ 
+     start_time = time.time()
+ 
+     ### BEGIN ###
+ 
+     update_messages("-" * 150)
+     update_messages(f"\nStarting!\n")
+     update_messages("-" * 150)
+ 
+     url, topic = search_wikipedia(search_term = starting_topic)
+     driver.get(url)
+     used_topics.append(topic)
+     used_links.append(driver.current_url)
+ 
+     while True:
+         # increment the page tracking by 1 for each new page
+         num_pages += 1
+ 
+         # if not the first page, navigate to the new page
+         if num_pages > 1:
+             driver.get(next_link)
+ 
+         try:
+             context_sentence = get_topic_context(driver)
+         except Exception as e:
+             context_sentence = "Context could not be found from webpage"
+ 
+         current_url = driver.current_url
+         current_url_suffix = str(current_url).split("/")[-1]
+ 
+         ### Use BeautifulSoup and Requests instead of Selenium for link extraction
+         current_page = driver.page_source # html from Selenium instead of BeautifulSoup
+ 
+         soup = BeautifulSoup(current_page, 'html.parser')
+ 
+         links = soup.find_all('a')
+ 
+         # get rid of any bloat in the links from the page
+         links_texts = refine_links(topic, links, current_url_suffix, used_links, used_topics)
+ 
+         # best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_topic, labels_list = [text for link, text in links_texts])
+         best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_context.lower(), labels_list = [text.lower() for link, text in links_texts])
+ 
+         update_messages(f"\nPage: {num_pages}")
+         update_messages(f"Current topic: '{topic.title()}'")
+         update_messages(f"Current URL: '{current_url}'")
+         update_messages(f"Current Topic Context: '{context_sentence}'")
+         if current_url != target_url:
+             update_messages(f"Next topic: '{best_label.title()}'. Semantic similarity to '{target_topic.title()}': {round((best_score * 100), 2)}%")
+ 
+         next_link, topic = links_texts[loc_idx]
+ 
+         used_links.append(next_link)
+         used_topics.append(topic)
+ 
+         if current_url == target_url: # because the target_url is now found through the API
+             update_messages("\n" + "-" * 150)
+             update_messages(f"\nFrom '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages, {round(time.time() - start_time, 2)} seconds!")
+             update_messages(f"Starting topic: '{starting_topic.title()}': '{used_links[0]}'")
+             update_messages(f"Target topic: '{target_topic.title()}': '{target_url}'\n")
+             update_messages("-" * 150)
+             driver.quit()
+             break
+ 
+         if num_pages == limit:
+             update_messages("\n" + "-" * 150)
+             update_messages(f"\nUnfortunately, the model couldn't get from '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages or less.")
+             update_messages(f"In {round(time.time() - start_time, 2)} seconds, it got from '{starting_topic.title()}': '{used_links[0]}', to '{used_topics[-1].title()}': '{used_links[-1]}'")
+             update_messages(f"\nTry a different combination to see if it can do it!\n")
+             update_messages("-" * 150)
+             driver.quit()
+             break
+ 
+         # delay things, if applicable
+         time.sleep(delay)
+ 
+ ###### Example
+ 
+ # starting_topic = 'soulja boy'
+ # target_topic = 'urine'
+ 
+ # play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 50)
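The greedy step that drives play_wiki_game_2 above is most_similar_sentence: the target page's intro context and every candidate link text are embedded with the Universal Sentence Encoder, and the link with the highest inner-product score is followed (USE embeddings are approximately normalized, so the inner product behaves like a cosine similarity). A standalone sketch with made-up candidate link texts:

import numpy as np
import tensorflow_hub as hub

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

def most_similar_sentence(target_topic, labels_list):
    # Embed the target context and all candidate link texts, then pick the closest candidate
    context_embedding = embed([target_topic])[0]
    sentence_embeddings = embed(labels_list)
    similarities = np.inner(context_embedding, sentence_embeddings)
    most_similar_index = np.argmax(similarities)
    return labels_list[most_similar_index], similarities[most_similar_index], most_similar_index

# Hypothetical link texts, for illustration only
candidates = ["hip hop music", "record producer", "urinary system"]
best_label, best_score, _ = most_similar_sentence("urine is a liquid by-product of metabolism", candidates)
print(best_label, round(float(best_score) * 100, 2))  # expected to favour 'urinary system'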
wikigame_rnd.ipynb ADDED
@@ -0,0 +1,1315 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "# from selenium import webdriver\n",
10
+ "# from selenium.webdriver.common.by import By\n",
11
+ "# from selenium.webdriver.common.keys import Keys\n",
12
+ "# from bs4 import BeautifulSoup\n",
13
+ "# import time\n",
14
+ "# # !pip install tensorflow tensorflow-hub\n",
15
+ "# import tensorflow as tf\n",
16
+ "# import tensorflow_hub as hub\n",
17
+ "# import numpy as np\n",
18
+ "# # !pip install jellyfish\n",
19
+ "# import jellyfish"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": 2,
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "# !pip show selenium\n",
29
+ "# !pip show beautifulsoup4\n",
30
+ "# !pip show numpy\n",
31
+ "# !pip show tensorflow\n",
32
+ "# !pip show tensorflow-hub\n",
33
+ "# !pip show jellyfish\n",
34
+ "# !pip show streamlit"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": 3,
40
+ "metadata": {},
41
+ "outputs": [],
42
+ "source": [
43
+ "# starting_topic = \"soulja boy\"\n",
44
+ "# target_topic = \"fart\"\n",
45
+ "\n",
46
+ "# play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 50)"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "markdown",
51
+ "metadata": {},
52
+ "source": [
53
+ "# Version 3"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "execution_count": 4,
59
+ "metadata": {},
60
+ "outputs": [
61
+ {
62
+ "name": "stderr",
63
+ "output_type": "stream",
64
+ "text": [
65
+ "2023-07-30 09:07:17.451238: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
66
+ "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
67
+ ]
68
+ }
69
+ ],
70
+ "source": [
71
+ "from selenium import webdriver\n",
72
+ "from selenium.webdriver.common.by import By\n",
73
+ "from selenium.webdriver.common.keys import Keys\n",
74
+ "from bs4 import BeautifulSoup\n",
75
+ "import time\n",
76
+ "# !pip install tensorflow tensorflow-hub\n",
77
+ "import tensorflow as tf\n",
78
+ "import tensorflow_hub as hub\n",
79
+ "import numpy as np\n",
80
+ "import requests\n",
81
+ "import json\n",
82
+ "\n",
83
+ "# Load the pre-trained Universal Sentence Encoder\n",
84
+ "embed = hub.load(\"https://tfhub.dev/google/universal-sentence-encoder/4\")"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": 105,
90
+ "metadata": {},
91
+ "outputs": [
92
+ {
93
+ "name": "stdout",
94
+ "output_type": "stream",
95
+ "text": [
96
+ "------------------------------------------------------------------------------------------------------------------------------------------------------\n",
97
+ "\n",
98
+ "Starting!\n",
99
+ "\n",
100
+ "------------------------------------------------------------------------------------------------------------------------------------------------------\n",
101
+ "\n",
102
+ "Page: 1\n",
103
+ "Current topic: 'Soulja Boy'\n",
104
+ "Current URL: 'https://en.wikipedia.org/wiki/Soulja_Boy'\n",
105
+ "Current Topic Context: 'DeAndre Cortez Way (born July 28, 1990), known professionally as Soulja Boy (formerly Soulja Boy Tell 'Em), is an American rapper and record producer'\n",
106
+ "Next topic: 'Peewee Longway'. Semantic similarity to 'Urine': 21.81%\n",
107
+ "\n",
108
+ "Page: 2\n",
109
+ "Current topic: 'Peewee Longway'\n",
110
+ "Current URL: 'https://en.wikipedia.org/wiki/Peewee_Longway'\n",
111
+ "Current Topic Context: 'Quincy Lamont Williams (born August 17, 1984), known by his stage name Peewee Longway, is an American rapper best known for his mixtape The Blue M&M and his collaboration with Young Thug, \"Loaded\"'\n",
112
+ "Next topic: 'Hip Hop'. Semantic similarity to 'Urine': 12.0%\n",
113
+ "\n",
114
+ "Page: 3\n",
115
+ "Current topic: 'Hip Hop'\n",
116
+ "Current URL: 'https://en.wikipedia.org/wiki/Hip_hop_music'\n",
117
+ "Current Topic Context: 'Hip hop or hip-hop, also known as rap and formerly known as disco rap,[5][6] is a genre of popular music that was originated in the Bronx[7][8][9][10] borough of New York City in the early 1970s by African Americans,[11][12][13] having existed for several years prior to mainstream discovery.[14] Hip hop originated as an anti-drug and anti-violence genre,[15] while consisting of stylized rhythmic music (usually built around drum beats) that commonly accompanies rapping, a rhythmic and rhyming speech that is chanted.[16] According to the professor Asante of African American studies at Temple University, \"hip hop is something that blacks can unequivocally claim as their own\".[17] It was developed as part of hip hop culture, a subculture defined by four key stylistic elements: MCing/rapping, DJing/scratching with turntables, break dancing, and graffiti art.[18][19][20] Other elements include sampling beats or bass lines from records (or synthesized beats and sounds), and rhythmic beatboxing'\n",
118
+ "Next topic: 'Rufus Thomas'. Semantic similarity to 'Urine': 21.79%\n",
119
+ "\n",
120
+ "Page: 4\n",
121
+ "Current topic: 'Rufus Thomas'\n",
122
+ "Current URL: 'https://en.wikipedia.org/wiki/Rufus_Thomas'\n",
123
+ "Current Topic Context: 'Rufus C'\n",
124
+ "Next topic: 'Rabbit Foot Minstrels'. Semantic similarity to 'Urine': 19.28%\n",
125
+ "\n",
126
+ "Page: 5\n",
127
+ "Current topic: 'Rabbit Foot Minstrels'\n",
128
+ "Current URL: 'https://en.wikipedia.org/wiki/The_Rabbit%27s_Foot_Company'\n",
129
+ "Current Topic Context: 'The Rabbit's Foot Company, also known as the Rabbit('s) Foot Minstrels and colloquially as \"The Foots\", was a long-running minstrel and variety troupe that toured as a tent show in the American South between 1900 and the late 1950s'\n",
130
+ "Next topic: 'Jstor'. Semantic similarity to 'Urine': 11.85%\n",
131
+ "\n",
132
+ "Page: 6\n",
133
+ "Current topic: 'Jstor'\n",
134
+ "Current URL: 'https://en.wikipedia.org/wiki/JSTOR'\n",
135
+ "Current Topic Context: 'JSTOR (/ˈdʒeɪstɔːr/; short for Journal Storage)[2] is a digital library founded in 1994'\n",
136
+ "Next topic: 'Nieman Lab'. Semantic similarity to 'Urine': 12.14%\n",
137
+ "\n",
138
+ "Page: 7\n",
139
+ "Current topic: 'Nieman Lab'\n",
140
+ "Current URL: 'https://en.wikipedia.org/wiki/Nieman_Foundation_for_Journalism'\n",
141
+ "Current Topic Context: 'The Nieman Foundation for Journalism at Harvard University is the primary journalism institution at Harvard.'\n",
142
+ "Next topic: 'Men'S Soccer'. Semantic similarity to 'Urine': 14.43%\n",
143
+ "\n",
144
+ "Page: 8\n",
145
+ "Current topic: 'Men'S Soccer'\n",
146
+ "Current URL: 'https://en.wikipedia.org/wiki/Harvard_Crimson_men%27s_soccer'\n",
147
+ "Current Topic Context: 'The Harvard Crimson men's soccer team is an intercollegiate varsity sports team of Harvard University'\n",
148
+ "Next topic: 'California Golden Bears Men'S Soccer'. Semantic similarity to 'Urine': 17.31%\n",
149
+ "\n",
150
+ "Page: 9\n",
151
+ "Current topic: 'California Golden Bears Men'S Soccer'\n",
152
+ "Current URL: 'https://en.wikipedia.org/wiki/California_Golden_Bears_men%27s_soccer'\n",
153
+ "Current Topic Context: 'The California Golden Bears men's soccer team is a varsity intercollegiate athletic team of University of California, Berkeley in Berkeley, California, United States.[1] The team is a member of the Pac-12 Conference, which is part of the National Collegiate Athletic Association's Division I'\n",
154
+ "Next topic: 'California Drinking Song'. Semantic similarity to 'Urine': 15.78%\n",
155
+ "\n",
156
+ "Page: 10\n",
157
+ "Current topic: 'California Drinking Song'\n",
158
+ "Current URL: 'https://en.wikipedia.org/wiki/California_Drinking_Song'\n",
159
+ "Current Topic Context: '\"California Drinking Song\" is a spirit song from the University of California, Berkeley'\n",
160
+ "Next topic: 'Uc Men'S Octet'. Semantic similarity to 'Urine': 15.63%\n",
161
+ "\n",
162
+ "Page: 11\n",
163
+ "Current topic: 'Uc Men'S Octet'\n",
164
+ "Current URL: 'https://en.wikipedia.org/wiki/University_of_California_Men%27s_Octet'\n",
165
+ "Current Topic Context: 'The UC Men's Octet, sometimes termed the Cal Men’s Octet or the UC Berkeley Men’s Octet, is an eight-member male a cappella group at the University of California, Berkeley'\n",
166
+ "Next topic: 'Laboratories'. Semantic similarity to 'Urine': 15.45%\n",
167
+ "\n",
168
+ "Page: 12\n",
169
+ "Current topic: 'Laboratories'\n",
170
+ "Current URL: 'https://en.wikipedia.org/wiki/Research_centers_and_laboratories_at_the_University_of_California,_Berkeley'\n",
171
+ "Current Topic Context: 'The University of California, Berkeley, contains many research centers and laboratories.'\n",
172
+ "Next topic: 'Uc Irvine Medical Center'. Semantic similarity to 'Urine': 18.16%\n",
173
+ "\n",
174
+ "Page: 13\n",
175
+ "Current topic: 'Uc Irvine Medical Center'\n",
176
+ "Current URL: 'https://en.wikipedia.org/wiki/University_of_California,_Irvine_Medical_Center'\n",
177
+ "Current Topic Context: 'The University of California, Irvine Medical Center (UCIMC or UCI Medical Center) is a major research hospital located in Orange, California'\n",
178
+ "Next topic: 'Sepsis'. Semantic similarity to 'Urine': 19.29%\n",
179
+ "\n",
180
+ "Page: 14\n",
181
+ "Current topic: 'Sepsis'\n",
182
+ "Current URL: 'https://en.wikipedia.org/wiki/Sepsis'\n",
183
+ "Current Topic Context: 'Sepsis (septicaemia in British English), or blood poisoning,[8][9] is a life-threatening condition that arises when the body's response to infection causes injury to its own tissues and organs.[4][8]'\n",
184
+ "Next topic: 'Urinary Tract'. Semantic similarity to 'Urine': 51.26%\n",
185
+ "\n",
186
+ "Page: 15\n",
187
+ "Current topic: 'Urinary Tract'\n",
188
+ "Current URL: 'https://en.wikipedia.org/wiki/Urinary_system'\n",
189
+ "Current Topic Context: 'The urinary system, also known as the urinary tract or renal system, consists of the kidneys, ureters, bladder, and the urethra'\n",
190
+ "Next topic: 'Urinary Bladder'. Semantic similarity to 'Urine': 61.01%\n",
191
+ "\n",
192
+ "Page: 16\n",
193
+ "Current topic: 'Urinary Bladder'\n",
194
+ "Current URL: 'https://en.wikipedia.org/wiki/Bladder'\n",
195
+ "Current Topic Context: 'The bladder is a hollow organ in humans and other vertebrates that stores urine from the kidneys before disposal by urination'\n",
196
+ "Next topic: 'Urination § Anatomy Of The Bladder And Outlet'. Semantic similarity to 'Urine': 57.69%\n",
197
+ "\n",
198
+ "Page: 17\n",
199
+ "Current topic: 'Urination § Anatomy Of The Bladder And Outlet'\n",
200
+ "Current URL: 'https://en.wikipedia.org/wiki/Urination#Anatomy_of_the_bladder_and_outlet'\n",
201
+ "Current Topic Context: 'Urination is the release of urine from the urinary bladder through the urethra to the outside of the body'\n",
202
+ "Next topic: 'Urine'. Semantic similarity to 'Urine': 57.28%\n",
203
+ "\n",
204
+ "Page: 18\n",
205
+ "Current topic: 'Urine'\n",
206
+ "Current URL: 'https://en.wikipedia.org/wiki/Urine'\n",
207
+ "Current Topic Context: 'Urine is a liquid by-product of metabolism in humans and in many other animals'\n",
208
+ "\n",
209
+ "------------------------------------------------------------------------------------------------------------------------------------------------------\n",
210
+ "\n",
211
+ "From 'Soulja Boy', to 'Urine' in 18 pages, 8.54 seconds!\n",
212
+ "Starting topic: 'Soulja Boy': 'https://en.wikipedia.org/wiki/Soulja_Boy'\n",
213
+ "Target topic: 'Urine': 'https://en.wikipedia.org/wiki/Urine'\n",
214
+ "\n",
215
+ "------------------------------------------------------------------------------------------------------------------------------------------------------\n"
216
+ ]
217
+ }
218
+ ],
219
+ "source": [
220
+ "def most_similar_sentence(target_topic, labels_list):\n",
221
+ " # Encode the context sentence and all sentences in the list\n",
222
+ " context_embedding = embed([target_topic])[0]\n",
223
+ " sentence_embeddings = embed(labels_list)\n",
224
+ " \n",
225
+ " # Calculate cosine similarities between the context sentence and each sentence in the list\n",
226
+ " similarities = np.inner(context_embedding, sentence_embeddings)\n",
227
+ " \n",
228
+ " # Find the index of the most similar sentence\n",
229
+ " most_similar_index = np.argmax(similarities)\n",
230
+ " \n",
231
+ " return labels_list[most_similar_index], similarities[most_similar_index], most_similar_index\n",
232
+ "\n",
233
+ "def search_wikipedia(search_term):\n",
234
+ " # Define the endpoint\n",
235
+ " endpoint = \"https://en.wikipedia.org/w/api.php\"\n",
236
+ "\n",
237
+ " # Define the search parameters\n",
238
+ " params = {\n",
239
+ " \"action\": \"query\",\n",
240
+ " \"format\": \"json\",\n",
241
+ " \"list\": \"search\",\n",
242
+ " \"srsearch\": search_term\n",
243
+ " }\n",
244
+ "\n",
245
+ " # Send a GET request to the endpoint with your parameters\n",
246
+ " response = requests.get(url = endpoint, params = params)\n",
247
+ "\n",
248
+ " # Parse the results as JSON\n",
249
+ " data = json.loads(response.text)\n",
250
+ "\n",
251
+ " # Get the title of the first result (this will be used as the page title in the next step)\n",
252
+ " page_title = data[\"query\"][\"search\"][0][\"title\"]\n",
253
+ "\n",
254
+ " # if \"may refer to\" in data[\"query\"][\"search\"][0][\"snippet\"].lower():\n",
255
+ " # page_title = data[\"query\"][\"search\"][1][\"title\"]\n",
256
+ "\n",
257
+ " # Construct the URL of the Wikipedia page\n",
258
+ " page_url = \"https://en.wikipedia.org/wiki/{}\".format(page_title.replace(\" \", \"_\"))\n",
259
+ "\n",
260
+ " return page_url, page_title\n",
261
+ "\n",
262
+ "def get_topic_context(driver, more = False):\n",
263
+ " # Find the first paragraph of the main article\n",
264
+ " first_paragraph = driver.find_element(By.CSS_SELECTOR, \"div.mw-parser-output > p:not(.mw-empty-elt)\").text\n",
265
+ "\n",
266
+ " if more:\n",
267
+ " context_sentence = \". \".join(first_paragraph.split(\". \")[:5])\n",
268
+ " else:\n",
269
+ " context_sentence = first_paragraph.split(\". \")[0]\n",
270
+ "\n",
271
+ " return context_sentence\n",
272
+ "\n",
273
+ "# bad_words = [word for word in open(\"censored.txt\", \"r\").readlines()]\n",
274
+ "bad_words = [word.strip() for word in open(\"censored.txt\", \"r\").readlines()]\n",
275
+ "\n",
276
+ "def refine_links(topic, links, current_url_suffix, used_links, used_topics, censor = False):\n",
277
+ "\n",
278
+ " links_texts = []\n",
279
+ "\n",
280
+ " # Iterate through the links and extract their URLs\n",
281
+ " for link in links:\n",
282
+ " link_url = link.get('href')\n",
283
+ " if link_url and link_url.startswith(\"/wiki/\"):\n",
284
+ " link_url = \"https://en.wikipedia.org\" + link_url\n",
285
+ " link_text = link.text.strip() # Get the text and remove leading/trailing spaces\n",
286
+ "\n",
287
+ " # make sure they are both not None\n",
288
+ " if link_text and current_url_suffix not in link_url:\n",
289
+ "\n",
290
+ " if link_url not in used_links and link_text.lower() not in [topic.lower() for topic in used_topics]:\n",
291
+ "\n",
292
+ " # eliminates topic duplicates, non-wiki links, and wiki-help pages (non-content pages)\n",
293
+ " if topic.lower() not in link_url.lower() and \"en.wikipedia.org/wiki/\" in link_url and \":\" not in \"\".join(link_url.split(\"/\")[1:]) and \"Main_Page\" != str(link_url.split(\"/\")[-1]):\n",
294
+ "\n",
295
+ " # censoring if needed\n",
296
+ " if censor:\n",
297
+ " if not any(word1.lower() in bad_words for word1 in [word.lower() for word in link_text.split()]):\n",
298
+ " links_texts.append((link_url, link_text))\n",
299
+ " else:\n",
300
+ " links_texts.append((link_url, link_text))\n",
301
+ "\n",
302
+ " return links_texts\n",
303
+ "\n",
304
+ "def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100):\n",
305
+ "\n",
306
+ " ##### Setup Chrome options\n",
307
+ " chrome_options = webdriver.ChromeOptions()\n",
308
+ " chrome_options.add_argument(\"--headless\") # Ensure GUI is off\n",
309
+ " chrome_options.add_argument(\"--no-sandbox\")\n",
310
+ " chrome_options.add_argument(\"--disable-dev-shm-usage\")\n",
311
+ " driver = webdriver.Chrome(options = chrome_options)\n",
312
+ "\n",
313
+ " #### Getting target url, topic, and context\n",
314
+ " driver_target = webdriver.Chrome(options = chrome_options)\n",
315
+ " target_url, target_topic = search_wikipedia(search_term = target_topic)\n",
316
+ " driver_target.get(target_url)\n",
317
+ " target_context = get_topic_context(driver_target, more = True)\n",
318
+ " # print(target_context)\n",
319
+ " driver_target.quit()\n",
320
+ "\n",
321
+ " topic = starting_topic\n",
322
+ " num_pages = 0\n",
323
+ " used_topics = []\n",
324
+ " used_links = []\n",
325
+ "\n",
326
+ " start_time = time.time()\n",
327
+ "\n",
328
+ " ### BEGIN ###\n",
329
+ "\n",
330
+ " print(\"-\" * 150)\n",
331
+ " print(f\"\\nStarting!\\n\")\n",
332
+ " print(\"-\" * 150)\n",
333
+ "\n",
334
+ " url, topic = search_wikipedia(search_term = starting_topic)\n",
335
+ " driver.get(url)\n",
336
+ " used_topics.append(topic)\n",
337
+ " used_links.append(driver.current_url)\n",
338
+ "\n",
339
+ " while True:\n",
340
+ " # increment the page tracking by 1 for each new page\n",
341
+ " num_pages += 1\n",
342
+ "\n",
343
+ " # if not the first page, navigate to the new page\n",
344
+ " if num_pages > 1:\n",
345
+ " driver.get(next_link)\n",
346
+ "\n",
347
+ " try:\n",
348
+ " context_sentence = get_topic_context(driver)\n",
349
+ " except Exception as e:\n",
350
+ " context_sentence = \"Context could not be found from webpage\"\n",
351
+ "\n",
352
+ " current_url = driver.current_url\n",
353
+ " current_url_suffix = str(current_url).split(\"/\")[-1]\n",
354
+ "\n",
355
+ " ### Use BeautifulSoup and Requests instead of Selenium for link extraction\n",
356
+ " current_page = driver.page_source # html from Selenium instead of BeautifulSoup\n",
357
+ "\n",
358
+ " soup = BeautifulSoup(current_page, 'html.parser')\n",
359
+ "\n",
360
+ " links = soup.find_all('a')\n",
361
+ "\n",
362
+ " # get rid of any bloat in the links from the page\n",
363
+ " links_texts = refine_links(topic, links, current_url_suffix, used_links, used_topics)\n",
364
+ "\n",
365
+ " # best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_topic, labels_list = [text for link, text in links_texts])\n",
366
+ " best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_context.lower(), labels_list = [text.lower() for link, text in links_texts])\n",
367
+ "\n",
368
+ " print(f\"\\nPage: {num_pages}\")\n",
369
+ " print(f\"Current topic: '{topic.title()}'\")\n",
370
+ " print(f\"Current URL: '{current_url}'\")\n",
371
+ " print(f\"Current Topic Context: '{context_sentence}'\")\n",
372
+ " if current_url != target_url:\n",
373
+ " print(f\"Next topic: '{best_label.title()}'. Semantic similarity to '{target_topic.title()}': {round((best_score * 100), 2)}%\")\n",
374
+ "\n",
375
+ " next_link, topic = links_texts[loc_idx]\n",
376
+ "\n",
377
+ " used_links.append(next_link)\n",
378
+ " used_topics.append(topic)\n",
379
+ "\n",
380
+ " if current_url == target_url: # because the target_url is now found through the API\n",
381
+ " print(\"\\n\" + \"-\" * 150)\n",
382
+ " print(f\"\\nFrom '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages, {round(time.time() - start_time, 2)} seconds!\")\n",
383
+ " print(f\"Starting topic: '{starting_topic.title()}': '{used_links[0]}'\")\n",
384
+ " print(f\"Target topic: '{target_topic.title()}': '{target_url}'\\n\")\n",
385
+ " print(\"-\" * 150)\n",
386
+ " driver.quit()\n",
387
+ " break\n",
388
+ "\n",
389
+ " if num_pages == limit:\n",
390
+ " print(\"\\n\" + \"-\" * 150)\n",
391
+ " print(f\"\\nUnfortunately, the model couldn't get from '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages or less.\")\n",
392
+ " print(f\"In {round(time.time() - start_time, 2)} seconds, it got from '{starting_topic.title()}': '{used_links[0]}', to '{used_topics[-1].title()}': '{used_links[-1]}'\")\n",
393
+ " print(f\"\\nTry a different combination to see if it can do it!\\n\")\n",
394
+ " print(\"-\" * 150)\n",
395
+ " driver.quit()\n",
396
+ " break\n",
397
+ "\n",
398
+ "###### Example\n",
399
+ "\n",
400
+ "starting_topic = 'soulja boy'\n",
401
+ "target_topic = 'urine'\n",
402
+ "\n",
403
+ "play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 50)"
404
+ ]
405
+ },
406
+ {
407
+ "cell_type": "code",
408
+ "execution_count": 6,
409
+ "metadata": {},
410
+ "outputs": [],
411
+ "source": [
412
+ "# starting_topic = 'soulja boy'\n",
413
+ "# target_topic = 'fart'\n",
414
+ "\n",
415
+ "# play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 50)"
416
+ ]
417
+ },
418
+ {
419
+ "cell_type": "markdown",
420
+ "metadata": {},
421
+ "source": [
422
+ "# Tracking Stats"
423
+ ]
424
+ },
425
+ {
426
+ "cell_type": "code",
427
+ "execution_count": 73,
428
+ "metadata": {},
429
+ "outputs": [],
430
+ "source": [
431
+ "def play_wiki_game_stats(starting_topic: str, target_topic: str, limit: int = 200):\n",
432
+ "\n",
433
+ " stats_dict = {}\n",
434
+ "\n",
435
+ " ##### Setup Chrome options\n",
436
+ " chrome_options = webdriver.ChromeOptions()\n",
437
+ " chrome_options.add_argument(\"--headless\") # Ensure GUI is off\n",
438
+ " chrome_options.add_argument(\"--no-sandbox\")\n",
439
+ " chrome_options.add_argument(\"--disable-dev-shm-usage\")\n",
440
+ " driver = webdriver.Chrome(options = chrome_options)\n",
441
+ "\n",
442
+ " #### Getting target url, topic, and context\n",
443
+ " driver_target = webdriver.Chrome(options = chrome_options)\n",
444
+ " target_url, target_topic = search_wikipedia(search_term = target_topic)\n",
445
+ " driver_target.get(target_url)\n",
446
+ " target_context = get_topic_context(driver_target)\n",
447
+ " print(target_context)\n",
448
+ " print()\n",
449
+ " driver_target.quit()\n",
450
+ " \n",
451
+ " topic = starting_topic\n",
452
+ " num_pages = 0\n",
453
+ " used_topics = []\n",
454
+ " used_links = []\n",
455
+ " contexts = []\n",
456
+ " sim_to_target_scores = []\n",
457
+ "\n",
458
+ " start_time = time.time()\n",
459
+ "\n",
460
+ " ### BEGIN ###\n",
461
+ "\n",
462
+ " print(\"-\" * 150)\n",
463
+ " print(f\"\\nStarting!\\n\")\n",
464
+ " print(\"-\" * 150)\n",
465
+ "\n",
466
+ " url, topic = search_wikipedia(search_term = starting_topic)\n",
467
+ " driver.get(url)\n",
468
+ " used_topics.append(topic)\n",
469
+ " used_links.append(driver.current_url)\n",
470
+ " sim_to_target_scores.append(most_similar_sentence(target_topic = target_context, labels_list = [topic])[1])\n",
471
+ "\n",
472
+ " while True:\n",
473
+ " # increment the page tracking by 1 for each new page\n",
474
+ " num_pages += 1\n",
475
+ "\n",
476
+ " # if not the first page, navigate to the new page\n",
477
+ " if num_pages > 1:\n",
478
+ " driver.get(next_link)\n",
479
+ "\n",
480
+ " context_sentence = get_topic_context(driver)\n",
481
+ " contexts.append(context_sentence)\n",
482
+ "\n",
483
+ " current_url = driver.current_url\n",
484
+ " current_url_suffix = str(current_url).split(\"/\")[-1]\n",
485
+ "\n",
486
+ " ### Use BeautifulSoup and Requests instead of Selenium for link extraction\n",
487
+ " current_page = driver.page_source # html from Selenium instead of BeautifulSoup\n",
488
+ "\n",
489
+ " soup = BeautifulSoup(current_page, 'html.parser')\n",
490
+ "\n",
491
+ " links = soup.find_all('a')\n",
492
+ "\n",
493
+ " # get rid of any bloat in the links from the page\n",
494
+ " links_texts = refine_links(topic, links, current_url_suffix, used_links, used_topics)\n",
495
+ "\n",
496
+ " best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_context, labels_list = [text for link, text in links_texts])\n",
497
+ "\n",
498
+ " print(f\"\\nPage: {num_pages}\")\n",
499
+ " print(f\"Current topic: '{topic.title()}'\")\n",
500
+ " print(f\"Current URL: '{current_url}'\")\n",
501
+ " print(f\"Current Topic Context: '{context_sentence}'\")\n",
502
+ " if current_url != target_url:\n",
503
+ " print(f\"Next topic: '{best_label.title()}'. Semantic similarity to '{target_topic.title()}': {round((best_score * 100), 2)}%\")\n",
504
+ " \n",
505
+ " next_link, topic = links_texts[loc_idx]\n",
506
+ "\n",
507
+ " # contexts.append(context_sentence)\n",
508
+ "\n",
509
+ " if current_url == target_url: # because the target_url is now found through the API\n",
510
+ " print(\"\\n\" + \"-\" * 150)\n",
511
+ " print(f\"\\nFrom '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages, {round(time.time() - start_time, 2)} seconds!\")\n",
512
+ " print(f\"Starting topic: '{starting_topic.title()}': '{used_links[0]}'\")\n",
513
+ " print(f\"Target topic: '{target_topic.title()}': '{used_links[-1]}'\\n\")\n",
514
+ " print(\"-\" * 150)\n",
515
+ "\n",
516
+ " stats_dict['start_end'] = [f\"{starting_topic}_{target_topic}\" for i in range(num_pages)]\n",
517
+ " stats_dict['topic'] = used_topics\n",
518
+ " stats_dict['context'] = contexts\n",
519
+ " stats_dict['sim_to_target'] = sim_to_target_scores\n",
520
+ " stats_dict['url'] = used_links\n",
521
+ " stats_dict['page_num'] = [i+1 for i in range(num_pages)]\n",
522
+ " driver.quit()\n",
523
+ " return stats_dict\n",
524
+ " break\n",
525
+ "\n",
526
+ " ##### ADD DRAMATIC DELAY HERE #####\n",
527
+ " # time.sleep(0.5)\n",
528
+ " # time.sleep(10)\n",
529
+ "\n",
530
+ " if num_pages == limit:\n",
531
+ " print(\"\\n\" + \"-\" * 150)\n",
532
+ " print(f\"\\nUnfortunately, the model couldn't get from '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages or less.\")\n",
533
+ " print(f\"In {round(time.time() - start_time, 2)} seconds, it got from '{starting_topic.title()}': '{used_links[0]}', to '{used_topics[-1].title()}': '{used_links[-1]}'\")\n",
534
+ " print(f\"\\nTry a different combination to see if it can do it!\\n\")\n",
535
+ " print(\"-\" * 150)\n",
536
+ "\n",
537
+ " stats_dict['start_end'] = [f\"{starting_topic}_{target_topic}\" for i in range(num_pages)]\n",
538
+ " stats_dict['topic'] = used_topics\n",
539
+ " stats_dict['context'] = contexts\n",
540
+ " stats_dict['sim_to_target'] = sim_to_target_scores\n",
541
+ " stats_dict['url'] = used_links\n",
542
+ " stats_dict['page_num'] = [i+1 for i in range(num_pages)]\n",
543
+ " driver.quit()\n",
544
+ " return stats_dict\n",
545
+ " break\n",
546
+ "\n",
547
+ " used_links.append(next_link)\n",
548
+ " used_topics.append(topic)\n",
549
+ " sim_to_target_scores.append(best_score)\n",
550
+ "\n",
551
+ "# starting_topic = 'john mayer'\n",
552
+ "# target_topic = 'fart'\n",
553
+ "\n",
554
+ "# stats_dict = play_wiki_game_stats(starting_topic = starting_topic, target_topic = target_topic, limit = 200)"
555
+ ]
556
+ },
557
+ {
558
+ "cell_type": "code",
559
+ "execution_count": 35,
560
+ "metadata": {},
561
+ "outputs": [
562
+ {
563
+ "name": "stdout",
564
+ "output_type": "stream",
565
+ "text": [
566
+ "dict_keys(['start_end', 'topic', 'context', 'sim_to_target', 'url', 'page_num'])\n",
567
+ "[6, 6, 6, 6, 6, 6]\n"
568
+ ]
569
+ }
570
+ ],
571
+ "source": [
572
+ "# stats_dict['start_end'] = [f\"{starting_topic}_{target_topic}\" for i in range(7)]\n",
573
+ "print(stats_dict.keys())\n",
574
+ "print([len(stats_dict[key]) for key in stats_dict.keys()])"
575
+ ]
576
+ },
577
+ {
578
+ "cell_type": "code",
579
+ "execution_count": 36,
580
+ "metadata": {},
581
+ "outputs": [
582
+ {
583
+ "data": {
584
+ "text/plain": [
585
+ "[0.027460583, 0.20852715, 0.2775123, 0.31147623, 0.4413054, 0.6199604]"
586
+ ]
587
+ },
588
+ "execution_count": 36,
589
+ "metadata": {},
590
+ "output_type": "execute_result"
591
+ }
592
+ ],
593
+ "source": [
594
+ "stats_dict['sim_to_target']"
595
+ ]
596
+ },
597
+ {
598
+ "cell_type": "code",
599
+ "execution_count": 37,
600
+ "metadata": {},
601
+ "outputs": [
602
+ {
603
+ "data": {
604
+ "text/html": [
605
+ "<div>\n",
606
+ "<style scoped>\n",
607
+ " .dataframe tbody tr th:only-of-type {\n",
608
+ " vertical-align: middle;\n",
609
+ " }\n",
610
+ "\n",
611
+ " .dataframe tbody tr th {\n",
612
+ " vertical-align: top;\n",
613
+ " }\n",
614
+ "\n",
615
+ " .dataframe thead th {\n",
616
+ " text-align: right;\n",
617
+ " }\n",
618
+ "</style>\n",
619
+ "<table border=\"1\" class=\"dataframe\">\n",
620
+ " <thead>\n",
621
+ " <tr style=\"text-align: right;\">\n",
622
+ " <th></th>\n",
623
+ " <th>start_end</th>\n",
624
+ " <th>topic</th>\n",
625
+ " <th>context</th>\n",
626
+ " <th>sim_to_target</th>\n",
627
+ " <th>url</th>\n",
628
+ " <th>page_num</th>\n",
629
+ " </tr>\n",
630
+ " </thead>\n",
631
+ " <tbody>\n",
632
+ " <tr>\n",
633
+ " <th>0</th>\n",
634
+ " <td>john mayer_Flatulence</td>\n",
635
+ " <td>John Mayer</td>\n",
636
+ " <td>John Clayton Mayer[1] (/ˈmeɪ.ər/ MAY-ər; born ...</td>\n",
637
+ " <td>0.027461</td>\n",
638
+ " <td>https://en.wikipedia.org/wiki/John_Mayer</td>\n",
639
+ " <td>1</td>\n",
640
+ " </tr>\n",
641
+ " <tr>\n",
642
+ " <th>1</th>\n",
643
+ " <td>john mayer_Flatulence</td>\n",
644
+ " <td>cardiac dysrhythmia</td>\n",
645
+ " <td>Arrhythmias, also known as cardiac arrhythmias...</td>\n",
646
+ " <td>0.208527</td>\n",
647
+ " <td>https://en.wikipedia.org/wiki/Cardiac_dysrhythmia</td>\n",
648
+ " <td>2</td>\n",
649
+ " </tr>\n",
650
+ " <tr>\n",
651
+ " <th>2</th>\n",
652
+ " <td>john mayer_Flatulence</td>\n",
653
+ " <td>prolapse</td>\n",
654
+ " <td>Mitral valve prolapse (MVP) is a valvular hear...</td>\n",
655
+ " <td>0.277512</td>\n",
656
+ " <td>https://en.wikipedia.org/wiki/Mitral_valve_pro...</td>\n",
657
+ " <td>3</td>\n",
658
+ " </tr>\n",
659
+ " <tr>\n",
660
+ " <th>3</th>\n",
661
+ " <td>john mayer_Flatulence</td>\n",
662
+ " <td>gastrointestinal disturbances</td>\n",
663
+ " <td>Gastrointestinal diseases (abbrev</td>\n",
664
+ " <td>0.311476</td>\n",
665
+ " <td>https://en.wikipedia.org/wiki/Gastrointestinal...</td>\n",
666
+ " <td>4</td>\n",
667
+ " </tr>\n",
668
+ " <tr>\n",
669
+ " <th>4</th>\n",
670
+ " <td>john mayer_Flatulence</td>\n",
671
+ " <td>gastrointestinal tract</td>\n",
672
+ " <td>The gastrointestinal tract (GI tract, digestiv...</td>\n",
673
+ " <td>0.441305</td>\n",
674
+ " <td>https://en.wikipedia.org/wiki/Human_gastrointe...</td>\n",
675
+ " <td>5</td>\n",
676
+ " </tr>\n",
677
+ " <tr>\n",
678
+ " <th>5</th>\n",
679
+ " <td>john mayer_Flatulence</td>\n",
680
+ " <td>flatulence</td>\n",
681
+ " <td>Flatulence, in humans, is the expulsion of gas...</td>\n",
682
+ " <td>0.619960</td>\n",
683
+ " <td>https://en.wikipedia.org/wiki/Flatulence</td>\n",
684
+ " <td>6</td>\n",
685
+ " </tr>\n",
686
+ " </tbody>\n",
687
+ "</table>\n",
688
+ "</div>"
689
+ ],
690
+ "text/plain": [
691
+ " start_end topic \\\n",
692
+ "0 john mayer_Flatulence John Mayer \n",
693
+ "1 john mayer_Flatulence cardiac dysrhythmia \n",
694
+ "2 john mayer_Flatulence prolapse \n",
695
+ "3 john mayer_Flatulence gastrointestinal disturbances \n",
696
+ "4 john mayer_Flatulence gastrointestinal tract \n",
697
+ "5 john mayer_Flatulence flatulence \n",
698
+ "\n",
699
+ " context sim_to_target \\\n",
700
+ "0 John Clayton Mayer[1] (/ˈmeɪ.ər/ MAY-ər; born ... 0.027461 \n",
701
+ "1 Arrhythmias, also known as cardiac arrhythmias... 0.208527 \n",
702
+ "2 Mitral valve prolapse (MVP) is a valvular hear... 0.277512 \n",
703
+ "3 Gastrointestinal diseases (abbrev 0.311476 \n",
704
+ "4 The gastrointestinal tract (GI tract, digestiv... 0.441305 \n",
705
+ "5 Flatulence, in humans, is the expulsion of gas... 0.619960 \n",
706
+ "\n",
707
+ " url page_num \n",
708
+ "0 https://en.wikipedia.org/wiki/John_Mayer 1 \n",
709
+ "1 https://en.wikipedia.org/wiki/Cardiac_dysrhythmia 2 \n",
710
+ "2 https://en.wikipedia.org/wiki/Mitral_valve_pro... 3 \n",
711
+ "3 https://en.wikipedia.org/wiki/Gastrointestinal... 4 \n",
712
+ "4 https://en.wikipedia.org/wiki/Human_gastrointe... 5 \n",
713
+ "5 https://en.wikipedia.org/wiki/Flatulence 6 "
714
+ ]
715
+ },
716
+ "execution_count": 37,
717
+ "metadata": {},
718
+ "output_type": "execute_result"
719
+ }
720
+ ],
721
+ "source": [
722
+ "import pandas as pd\n",
723
+ "stats_df = pd.DataFrame(stats_dict)\n",
724
+ "stats_df"
725
+ ]
726
+ },
727
+ {
728
+ "cell_type": "markdown",
729
+ "metadata": {},
730
+ "source": [
731
+ "# Simluations"
732
+ ]
733
+ },
734
+ {
735
+ "cell_type": "code",
736
+ "execution_count": 42,
737
+ "metadata": {},
738
+ "outputs": [
739
+ {
740
+ "name": "stdout",
741
+ "output_type": "stream",
742
+ "text": [
743
+ "110\n"
744
+ ]
745
+ },
746
+ {
747
+ "data": {
748
+ "text/plain": [
749
+ "[('Sushi', 'Mars'),\n",
750
+ " ('Sushi', 'Beethoven'),\n",
751
+ " ('Sushi', 'Mount Everest'),\n",
752
+ " ('Sushi', 'Humpback Whale'),\n",
753
+ " ('Sushi', 'The Great Wall of China')]"
754
+ ]
755
+ },
756
+ "execution_count": 42,
757
+ "metadata": {},
758
+ "output_type": "execute_result"
759
+ }
760
+ ],
761
+ "source": [
762
+ "import itertools\n",
763
+ "\n",
764
+ "unrelated_list = [\n",
765
+ " \"Sushi\",\n",
766
+ " \"Mars\",\n",
767
+ " \"Beethoven\",\n",
768
+ " \"Mount Everest\",\n",
769
+ " \"Humpback Whale\",\n",
770
+ " \"The Great Wall of China\",\n",
771
+ " \"Photography\",\n",
772
+ " \"Pyramids of Egypt\",\n",
773
+ " \"Albert Einstein\",\n",
774
+ " \"Rainforests\",\n",
775
+ " 'buggy'\n",
776
+ "]\n",
777
+ "\n",
778
+ "# Generate all permutations of pairs\n",
779
+ "pair_permutations = list(itertools.permutations(unrelated_list, 2))\n",
780
+ "\n",
781
+ "print(len(pair_permutations)) # no pairs with self\n",
782
+ "pair_permutations[:5]"
783
+ ]
784
+ },
785
+ {
786
+ "cell_type": "code",
787
+ "execution_count": 43,
788
+ "metadata": {},
789
+ "outputs": [
790
+ {
791
+ "name": "stdout",
792
+ "output_type": "stream",
793
+ "text": [
794
+ "{'fruits': ['apple', 'banana', 'orange', 'grapes', 'kiwi'], 'animals': ['cat', 'dog', 'elephant', 'tiger', 'lion'], 'cities': ['New York', 'London'], 'colors': ['red', 'blue']}\n"
795
+ ]
796
+ }
797
+ ],
798
+ "source": [
799
+ "# Initial dictionary\n",
800
+ "main_dict = {\n",
801
+ " 'fruits': ['apple', 'banana', 'orange'],\n",
802
+ " 'animals': ['cat', 'dog', 'elephant'],\n",
803
+ "}\n",
804
+ "\n",
805
+ "# Function to add a new dictionary to the main_dict\n",
806
+ "def add_to_main_dict(main_dict, new_dict):\n",
807
+ " for key, value in new_dict.items():\n",
808
+ " if key in main_dict:\n",
809
+ " main_dict[key].extend(value)\n",
810
+ " else:\n",
811
+ " main_dict[key] = value\n",
812
+ "\n",
813
+ "# New dictionary to add to main_dict\n",
814
+ "new_dict1 = {\n",
815
+ " 'fruits': ['grapes', 'kiwi'],\n",
816
+ " 'cities': ['New York', 'London'],\n",
817
+ "}\n",
818
+ "\n",
819
+ "# Add new_dict1 to main_dict\n",
820
+ "add_to_main_dict(main_dict, new_dict1)\n",
821
+ "\n",
822
+ "# New dictionary to add to main_dict\n",
823
+ "new_dict2 = {\n",
824
+ " 'animals': ['tiger', 'lion'],\n",
825
+ " 'colors': ['red', 'blue'],\n",
826
+ "}\n",
827
+ "\n",
828
+ "# Add new_dict2 to main_dict\n",
829
+ "add_to_main_dict(main_dict, new_dict2)\n",
830
+ "\n",
831
+ "# Print the updated main_dict\n",
832
+ "print(main_dict)\n"
833
+ ]
834
+ },
835
+ {
836
+ "cell_type": "code",
837
+ "execution_count": 63,
838
+ "metadata": {},
839
+ "outputs": [
840
+ {
841
+ "name": "stdout",
842
+ "output_type": "stream",
843
+ "text": [
844
+ "Start: 'Sushi'. End: 'Mars'. Page: 8\n",
845
+ "Start: 'Sushi'. End: 'Ludwig van Beethoven'. Page: 9\n",
846
+ "Start: 'Sushi'. End: 'Mount Everest'. Page: 4\n",
847
+ "Start: 'Sushi'. End: 'Humpback whale'. Page: 3\n",
848
+ "Start: 'Sushi'. End: 'Great Wall of China'. Page: 7\n",
849
+ "Start: 'Sushi'. End: 'Photography'. Page: 29\n",
850
+ "Start: 'Sushi'. End: 'Egyptian pyramids'. Page: 23\n",
851
+ "Start: 'Sushi'. End: 'Albert Einstein'. Page: 12\n",
852
+ "Start: 'Sushi'. End: 'Rainforest'. Page: 7\n",
853
+ "Start: 'Sushi'. End: 'Buggy'. Page: 200\n",
854
+ "Start: 'Mars'. End: 'Sushi'. Page: 19\n",
855
+ "Start: 'Mars'. End: 'Ludwig van Beethoven'. Page: 4\n",
856
+ "Start: 'Mars'. End: 'Mount Everest'. Page: 2\n",
857
+ "Start: 'Mars'. End: 'Humpback whale'. Page: 4\n",
858
+ "Start: 'Mars'. End: 'Great Wall of China'. Page: 13\n",
859
+ "Start: 'Mars'. End: 'Photography'. Page: 32\n",
860
+ "Start: 'Mars'. End: 'Egyptian pyramids'. Page: 3\n",
861
+ "Start: 'Mars'. End: 'Albert Einstein'. Page: 6\n",
862
+ "Start: 'Mars'. End: 'Rainforest'. Page: 7\n",
863
+ "Start: 'Mars'. End: 'Buggy'. Page: 200\n",
864
+ "Start: 'Beethoven'. End: 'Sushi'. Page: 17\n",
865
+ "Start: 'Beethoven'. End: 'Mars'. Page: 3\n",
866
+ "Start: 'Beethoven'. End: 'Mount Everest'. Page: 6\n",
867
+ "Start: 'Beethoven'. End: 'Humpback whale'. Page: 4\n",
868
+ "Start: 'Beethoven'. End: 'Great Wall of China'. Page: 14\n",
869
+ "Start: 'Beethoven'. End: 'Photography'. Page: 31\n",
870
+ "Start: 'Beethoven'. End: 'Egyptian pyramids'. Page: 8\n",
871
+ "Start: 'Beethoven'. End: 'Albert Einstein'. Page: 3\n",
872
+ "Start: 'Beethoven'. End: 'Rainforest'. Page: 15\n",
873
+ "Start: 'Beethoven'. End: 'Buggy'. Page: 200\n",
874
+ "Start: 'Mount Everest'. End: 'Sushi'. Page: 14\n",
875
+ "Start: 'Mount Everest'. End: 'Mars'. Page: 2\n",
876
+ "Start: 'Mount Everest'. End: 'Ludwig van Beethoven'. Page: 23\n",
877
+ "Start: 'Mount Everest'. End: 'Humpback whale'. Page: 7\n",
878
+ "Start: 'Mount Everest'. End: 'Great Wall of China'. Page: 6\n",
879
+ "Start: 'Mount Everest'. End: 'Photography'. Page: 29\n",
880
+ "Start: 'Mount Everest'. End: 'Egyptian pyramids'. Page: 8\n",
881
+ "Start: 'Mount Everest'. End: 'Albert Einstein'. Page: 5\n",
882
+ "Start: 'Mount Everest'. End: 'Rainforest'. Page: 7\n",
883
+ "Start: 'Mount Everest'. End: 'Buggy'. Page: 200\n",
884
+ "Start: 'Humpback Whale'. End: 'Sushi'. Page: 9\n",
885
+ "Start: 'Humpback Whale'. End: 'Mars'. Page: 19\n",
886
+ "Start: 'Humpback Whale'. End: 'Ludwig van Beethoven'. Page: 29\n",
887
+ "Start: 'Humpback Whale'. End: 'Mount Everest'. Page: 5\n",
888
+ "Start: 'Humpback Whale'. End: 'Great Wall of China'. Page: 12\n",
889
+ "Start: 'Humpback Whale'. End: 'Photography'. Page: 5\n",
890
+ "Start: 'Humpback Whale'. End: 'Egyptian pyramids'. Page: 5\n",
891
+ "Start: 'Humpback Whale'. End: 'Albert Einstein'. Page: 8\n",
892
+ "Start: 'Humpback Whale'. End: 'Rainforest'. Page: 3\n",
893
+ "Start: 'Humpback Whale'. End: 'Buggy'. Page: 200\n",
894
+ "Start: 'The Great Wall of China'. End: 'Sushi'. Page: 7\n",
895
+ "Start: 'The Great Wall of China'. End: 'Mars'. Page: 13\n",
896
+ "Start: 'The Great Wall of China'. End: 'Ludwig van Beethoven'. Page: 10\n",
897
+ "Start: 'The Great Wall of China'. End: 'Mount Everest'. Page: 3\n",
898
+ "Start: 'The Great Wall of China'. End: 'Humpback whale'. Page: 11\n",
899
+ "Start: 'The Great Wall of China'. End: 'Photography'. Page: 48\n",
900
+ "Start: 'The Great Wall of China'. End: 'Egyptian pyramids'. Page: 5\n",
901
+ "Start: 'The Great Wall of China'. End: 'Albert Einstein'. Page: 7\n",
902
+ "Start: 'The Great Wall of China'. End: 'Rainforest'. Page: 4\n",
903
+ "Start: 'The Great Wall of China'. End: 'Buggy'. Page: 200\n",
904
+ "Start: 'Photography'. End: 'Sushi'. Page: 15\n",
905
+ "Start: 'Photography'. End: 'Mars'. Page: 13\n",
906
+ "Start: 'Photography'. End: 'Ludwig van Beethoven'. Page: 26\n",
907
+ "Start: 'Photography'. End: 'Mount Everest'. Page: 8\n",
908
+ "Start: 'Photography'. End: 'Humpback whale'. Page: 10\n",
909
+ "Start: 'Photography'. End: 'Great Wall of China'. Page: 3\n",
+ "Start: 'Photography'. End: 'Egyptian pyramids'. Page: 6\n",
+ "Start: 'Photography'. End: 'Albert Einstein'. Page: 21\n",
+ "Start: 'Photography'. End: 'Rainforest'. Page: 8\n",
+ "Start: 'Photography'. End: 'Buggy'. Page: 200\n",
+ "Start: 'Pyramids of Egypt'. End: 'Sushi'. Page: 7\n",
+ "Start: 'Pyramids of Egypt'. End: 'Mars'. Page: 7\n",
+ "Start: 'Pyramids of Egypt'. End: 'Ludwig van Beethoven'. Page: 62\n",
+ "Start: 'Pyramids of Egypt'. End: 'Mount Everest'. Page: 8\n",
+ "Start: 'Pyramids of Egypt'. End: 'Humpback whale'. Page: 10\n",
+ "Start: 'Pyramids of Egypt'. End: 'Great Wall of China'. Page: 8\n",
+ "Start: 'Pyramids of Egypt'. End: 'Photography'. Page: 31\n",
+ "Start: 'Pyramids of Egypt'. End: 'Albert Einstein'. Page: 3\n",
+ "Start: 'Pyramids of Egypt'. End: 'Rainforest'. Page: 10\n",
+ "Start: 'Pyramids of Egypt'. End: 'Buggy'. Page: 200\n",
+ "Start: 'Albert Einstein'. End: 'Sushi'. Page: 10\n",
+ "Start: 'Albert Einstein'. End: 'Mars'. Page: 3\n",
+ "Start: 'Albert Einstein'. End: 'Ludwig van Beethoven'. Page: 2\n",
+ "Start: 'Albert Einstein'. End: 'Mount Everest'. Page: 5\n",
+ "Start: 'Albert Einstein'. End: 'Humpback whale'. Page: 18\n",
+ "Start: 'Albert Einstein'. End: 'Great Wall of China'. Page: 8\n",
+ "Start: 'Albert Einstein'. End: 'Photography'. Page: 42\n",
+ "Start: 'Albert Einstein'. End: 'Egyptian pyramids'. Page: 7\n",
+ "Start: 'Albert Einstein'. End: 'Rainforest'. Page: 6\n",
+ "Start: 'Albert Einstein'. End: 'Buggy'. Page: 200\n",
+ "Start: 'Rainforests'. End: 'Sushi'. Page: 3\n",
+ "Start: 'Rainforests'. End: 'Mars'. Page: 7\n",
+ "Start: 'Rainforests'. End: 'Ludwig van Beethoven'. Page: 18\n",
+ "Start: 'Rainforests'. End: 'Mount Everest'. Page: 7\n",
+ "Start: 'Rainforests'. End: 'Humpback whale'. Page: 4\n",
+ "Start: 'Rainforests'. End: 'Great Wall of China'. Page: 4\n",
+ "Start: 'Rainforests'. End: 'Photography'. Page: 38\n",
+ "Start: 'Rainforests'. End: 'Egyptian pyramids'. Page: 7\n",
+ "Start: 'Rainforests'. End: 'Albert Einstein'. Page: 8\n",
+ "Start: 'Rainforests'. End: 'Buggy'. Page: 200\n",
+ "Start: 'buggy'. End: 'Sushi'. Page: 6\n",
+ "Start: 'buggy'. End: 'Mars'. Page: 8\n",
+ "Start: 'buggy'. End: 'Ludwig van Beethoven'. Page: 28\n",
+ "Start: 'buggy'. End: 'Mount Everest'. Page: 8\n",
+ "Start: 'buggy'. End: 'Humpback whale'. Page: 19\n",
+ "Start: 'buggy'. End: 'Great Wall of China'. Page: 12\n",
+ "Start: 'buggy'. End: 'Photography'. Page: 54\n",
+ "Start: 'buggy'. End: 'Egyptian pyramids'. Page: 9\n",
+ "Start: 'buggy'. End: 'Albert Einstein'. Page: 35\n",
+ "Start: 'buggy'. End: 'Rainforest'. Page: 9\n"
+ ]
+ }
+ ],
+ "source": [
+ "def play_wiki_game_stats(starting_topic: str, target_topic: str, limit: int = 200):\n",
+ "\n",
+ " stats_dict = {}\n",
+ "\n",
+ " ##### Setup Chrome options\n",
+ " chrome_options = webdriver.ChromeOptions()\n",
+ " chrome_options.add_argument(\"--headless\") # Ensure GUI is off\n",
+ " chrome_options.add_argument(\"--no-sandbox\")\n",
+ " chrome_options.add_argument(\"--disable-dev-shm-usage\")\n",
+ " driver = webdriver.Chrome(options = chrome_options)\n",
+ "\n",
+ " #### Getting target url, topic, and context\n",
+ " driver_target = webdriver.Chrome(options = chrome_options)\n",
+ " target_url, target_topic = search_wikipedia(search_term = target_topic)\n",
+ " driver_target.get(target_url)\n",
+ " target_context = get_topic_context(driver_target)\n",
+ " driver_target.quit()\n",
+ " \n",
+ " topic = starting_topic\n",
+ " num_pages = 0\n",
+ " used_topics = []\n",
+ " used_links = []\n",
+ " contexts = []\n",
+ " sim_to_target_scores = []\n",
+ "\n",
+ " start_time = time.time()\n",
+ "\n",
+ " url, topic = search_wikipedia(search_term = starting_topic)\n",
+ " driver.get(url)\n",
+ " used_topics.append(topic)\n",
+ " used_links.append(driver.current_url)\n",
+ " sim_to_target_scores.append(most_similar_sentence(target_topic = target_context, labels_list = [topic])[1])\n",
+ "\n",
+ " while True:\n",
+ " # increment the page tracking by 1 for each new page\n",
+ " num_pages += 1\n",
+ "\n",
+ " # if not the first page, navigate to the new page\n",
+ " if num_pages > 1:\n",
+ " driver.get(next_link)\n",
+ "\n",
+ " context_sentence = get_topic_context(driver)\n",
+ " contexts.append(context_sentence)\n",
+ "\n",
+ " current_url = driver.current_url\n",
+ " current_url_suffix = str(current_url).split(\"/\")[-1]\n",
+ "\n",
+ " ### Use BeautifulSoup and Requests instead of Selenium for link extraction\n",
+ " current_page = driver.page_source # html from Selenium instead of BeautifulSoup\n",
+ "\n",
+ " soup = BeautifulSoup(current_page, 'html.parser')\n",
+ "\n",
+ " links = soup.find_all('a')\n",
+ "\n",
+ " # get rid of any bloat in the links from the page\n",
+ " links_texts = refine_links(topic, links, current_url_suffix, used_links, used_topics)\n",
+ "\n",
+ " best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_context, labels_list = [text for link, text in links_texts])\n",
+ "\n",
+ " print(f\"Start: '{starting_topic}'. End: '{target_topic}'. Page: {num_pages}\", end = '\\r')\n",
+ "\n",
+ " next_link, topic = links_texts[loc_idx]\n",
+ "\n",
+ " if current_url == target_url: # because the target_url is now found through the API\n",
+ " print()\n",
+ " stats_dict['start'] = [starting_topic for i in range(num_pages)]\n",
+ " stats_dict['target'] = [target_topic for i in range(num_pages)]\n",
+ " stats_dict['topic'] = used_topics\n",
+ " stats_dict['context'] = contexts\n",
+ " stats_dict['sim_to_target'] = sim_to_target_scores\n",
+ " # stats_dict['time_seconds'] = times\n",
+ " stats_dict['url'] = used_links\n",
+ " stats_dict['page_num'] = [i+1 for i in range(num_pages)]\n",
+ " add_to_main_dict(master_dict, stats_dict)\n",
+ " driver.quit()\n",
+ " break\n",
+ "\n",
+ " if num_pages == limit:\n",
+ " print()\n",
+ " stats_dict['start'] = [starting_topic for i in range(num_pages)]\n",
+ " stats_dict['target'] = [target_topic for i in range(num_pages)]\n",
+ " stats_dict['topic'] = used_topics\n",
+ " stats_dict['context'] = contexts\n",
+ " stats_dict['sim_to_target'] = sim_to_target_scores\n",
+ " stats_dict['url'] = used_links\n",
+ " stats_dict['page_num'] = [i+1 for i in range(num_pages)]\n",
+ " driver.quit()\n",
+ " add_to_main_dict(master_dict, stats_dict)\n",
+ " break\n",
+ "\n",
+ " used_links.append(next_link)\n",
+ " used_topics.append(topic)\n",
+ " sim_to_target_scores.append(best_score)\n",
+ "\n",
+ "master_dict = {}\n",
+ "master_dict['start'] = []\n",
+ "master_dict['target'] = []\n",
+ "master_dict['topic'] = []\n",
+ "master_dict['context'] = []\n",
+ "master_dict['sim_to_target'] = []\n",
+ "master_dict['url'] = []\n",
+ "master_dict['page_num'] = []\n",
+ "\n",
+ "# starting_topic = 'john mayer'\n",
+ "# target_topic = 'fart'\n",
+ "\n",
+ "for starting_topic, target_topic in pair_permutations:\n",
+ " play_wiki_game_stats(starting_topic = starting_topic, target_topic = target_topic, limit = 200)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "dict_keys(['start', 'target', 'topic', 'context', 'sim_to_target', 'url', 'page_num'])\n",
+ "[3238, 3238, 3238, 3238, 3238, 3238, 3238]\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(master_dict.keys())\n",
+ "print([len(master_dict[key]) for key in master_dict.keys()])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>start</th>\n",
+ " <th>target</th>\n",
+ " <th>topic</th>\n",
+ " <th>context</th>\n",
+ " <th>sim_to_target</th>\n",
+ " <th>url</th>\n",
+ " <th>page_num</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>Sushi</td>\n",
+ " <td>Mars</td>\n",
+ " <td>Sushi</td>\n",
+ " <td>Sushi (すし, 寿司, 鮨, 鮓, pronounced [sɯɕiꜜ] or [sɯ...</td>\n",
+ " <td>0.046150</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Sushi</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>Sushi</td>\n",
+ " <td>Mars</td>\n",
+ " <td>Planet Money</td>\n",
+ " <td>Planet Money is an American podcast and blog p...</td>\n",
+ " <td>0.494693</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Planet_Money</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>2</th>\n",
+ " <td>Sushi</td>\n",
+ " <td>Mars</td>\n",
+ " <td>Pacifica Foundation</td>\n",
+ " <td>Pacifica Foundation is an American non-profit ...</td>\n",
+ " <td>0.186643</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Pacifica_Foundation</td>\n",
+ " <td>3</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3</th>\n",
+ " <td>Sushi</td>\n",
+ " <td>Mars</td>\n",
+ " <td>Mars Hill</td>\n",
+ " <td>The Mars Hill Network is a network of Christia...</td>\n",
+ " <td>0.466525</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Mars_Hill_Network</td>\n",
+ " <td>4</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4</th>\n",
+ " <td>Sushi</td>\n",
+ " <td>Mars</td>\n",
+ " <td>Equinox Mountain</td>\n",
+ " <td>Equinox Mountain is the highest peak of the Ta...</td>\n",
+ " <td>0.196999</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Equinox_Mountain</td>\n",
+ " <td>5</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>...</th>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3233</th>\n",
+ " <td>buggy</td>\n",
+ " <td>Rainforest</td>\n",
+ " <td>Forests of the United States</td>\n",
+ " <td>It has been estimated that before European set...</td>\n",
+ " <td>0.437653</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Forests_of_the_U...</td>\n",
+ " <td>5</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3234</th>\n",
+ " <td>buggy</td>\n",
+ " <td>Rainforest</td>\n",
+ " <td>boreal forests</td>\n",
+ " <td>Taiga (/ˈtaɪɡə/; Russian: тайга́; relates to M...</td>\n",
+ " <td>0.474700</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Boreal_forest</td>\n",
+ " <td>6</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3235</th>\n",
+ " <td>buggy</td>\n",
+ " <td>Rainforest</td>\n",
+ " <td>Deciduous forests</td>\n",
+ " <td>Temperate deciduous or temperate broad-leaf fo...</td>\n",
+ " <td>0.501480</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Temperate_decidu...</td>\n",
+ " <td>7</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3236</th>\n",
+ " <td>buggy</td>\n",
+ " <td>Rainforest</td>\n",
+ " <td>Tropical deciduous forest</td>\n",
+ " <td>The tropical and subtropical dry broadleaf for...</td>\n",
+ " <td>0.480779</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Tropical_deciduo...</td>\n",
+ " <td>8</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3237</th>\n",
+ " <td>buggy</td>\n",
+ " <td>Rainforest</td>\n",
+ " <td>rainforests</td>\n",
+ " <td>Rainforests are forests characterized by a clo...</td>\n",
+ " <td>0.482825</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Rainforest</td>\n",
+ " <td>9</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "<p>3238 rows × 7 columns</p>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " start target topic \\\n",
+ "0 Sushi Mars Sushi \n",
+ "1 Sushi Mars Planet Money \n",
+ "2 Sushi Mars Pacifica Foundation \n",
+ "3 Sushi Mars Mars Hill \n",
+ "4 Sushi Mars Equinox Mountain \n",
+ "... ... ... ... \n",
+ "3233 buggy Rainforest Forests of the United States \n",
+ "3234 buggy Rainforest boreal forests \n",
+ "3235 buggy Rainforest Deciduous forests \n",
+ "3236 buggy Rainforest Tropical deciduous forest \n",
+ "3237 buggy Rainforest rainforests \n",
+ "\n",
+ " context sim_to_target \\\n",
+ "0 Sushi (すし, 寿司, 鮨, 鮓, pronounced [sɯɕiꜜ] or [sɯ... 0.046150 \n",
+ "1 Planet Money is an American podcast and blog p... 0.494693 \n",
+ "2 Pacifica Foundation is an American non-profit ... 0.186643 \n",
+ "3 The Mars Hill Network is a network of Christia... 0.466525 \n",
+ "4 Equinox Mountain is the highest peak of the Ta... 0.196999 \n",
+ "... ... ... \n",
+ "3233 It has been estimated that before European set... 0.437653 \n",
+ "3234 Taiga (/ˈtaɪɡə/; Russian: тайга́; relates to M... 0.474700 \n",
+ "3235 Temperate deciduous or temperate broad-leaf fo... 0.501480 \n",
+ "3236 The tropical and subtropical dry broadleaf for... 0.480779 \n",
+ "3237 Rainforests are forests characterized by a clo... 0.482825 \n",
+ "\n",
+ " url page_num \n",
+ "0 https://en.wikipedia.org/wiki/Sushi 1 \n",
+ "1 https://en.wikipedia.org/wiki/Planet_Money 2 \n",
+ "2 https://en.wikipedia.org/wiki/Pacifica_Foundation 3 \n",
+ "3 https://en.wikipedia.org/wiki/Mars_Hill_Network 4 \n",
+ "4 https://en.wikipedia.org/wiki/Equinox_Mountain 5 \n",
+ "... ... ... \n",
+ "3233 https://en.wikipedia.org/wiki/Forests_of_the_U... 5 \n",
+ "3234 https://en.wikipedia.org/wiki/Boreal_forest 6 \n",
+ "3235 https://en.wikipedia.org/wiki/Temperate_decidu... 7 \n",
+ "3236 https://en.wikipedia.org/wiki/Tropical_deciduo... 8 \n",
+ "3237 https://en.wikipedia.org/wiki/Rainforest 9 \n",
+ "\n",
+ "[3238 rows x 7 columns]"
+ ]
+ },
+ "execution_count": 66,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "master_df = pd.DataFrame(master_dict)\n",
+ "master_df.to_csv(\"data/3238x7.csv\", index = False)\n",
+ "master_df"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "base",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.2"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
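
The final cell above writes every recorded run to data/3238x7.csv with the seven master_dict columns (start, target, topic, context, sim_to_target, url, page_num). A minimal sketch, not part of the committed notebook, for loading that file and summarising path lengths per start/target pair; it assumes the CSV exists at that path and that pandas is installed:

import pandas as pd

# Load the stats dump written by wikigame_rnd.ipynb above.
df = pd.read_csv("data/3238x7.csv")

# One row per visited page, so the path length for a (start, target) run
# is the largest page_num recorded for that pair.
path_lengths = (
    df.groupby(["start", "target"], as_index=False)["page_num"]
    .max()
    .rename(columns={"page_num": "pages_visited"})
)

# Runs that stopped at the 200-page limit never reached the target.
path_lengths["reached_target"] = path_lengths["pages_visited"] < 200

print(path_lengths.sort_values("pages_visited").head(10))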