Spaces:

kmaurinjones
/

Wiki-Game

Sleeping

Wiki-Game / wiki_game_local.py

KAI MAURIN-JONES

files added

aaaf5e8 over 1 year ago

6.95 kB

	from selenium import webdriver
	from selenium.webdriver.common.by import By
	from selenium.webdriver.common.keys import Keys
	from bs4 import BeautifulSoup
	import time
	# !pip install tensorflow tensorflow-hub
	import tensorflow as tf
	import tensorflow_hub as hub
	import numpy as np
	# !pip install jellyfish
	import jellyfish

	# Load the pre-trained Universal Sentence Encoder (v4) from TensorFlow Hub.
	# This is a module-level statement, so it runs when the module is imported.
	# `embed` is later called with a list of strings to obtain their embeddings.
	embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

	def calculate_jaro_similarity(str1, str2):
	    """Return the Jaro similarity of two strings using the jellyfish library.

	    jellyfish renamed ``jaro_distance`` to ``jaro_similarity`` and newer
	    releases no longer ship the old name, so the new name is preferred and
	    the old one is only used as a fallback on older installs.

	    Args:
	        str1: First string to compare.
	        str2: Second string to compare.

	    Returns:
	        The value returned by jellyfish's Jaro function for the pair.
	    """
	    jaro = getattr(jellyfish, "jaro_similarity", None)
	    if jaro is None:
	        # Older jellyfish releases only expose the legacy name.
	        jaro = jellyfish.jaro_distance
	    return jaro(str1, str2)

	def most_similar_sentence(target_topic, labels_list):
	    """Pick the label whose embedding scores highest against the target topic.

	    Args:
	        target_topic: Text to compare every label against.
	        labels_list: Non-empty sequence of candidate label strings.

	    Returns:
	        A 3-tuple ``(label, score, index)``: the best-scoring label, its
	        score, and its position in ``labels_list``.

	    Raises:
	        ValueError: If ``labels_list`` is empty (there is nothing to rank).
	    """
	    # Fail early with a clear message instead of an opaque error from the
	    # encoder or from argmax on an empty array.
	    if len(labels_list) == 0:
	        raise ValueError("labels_list must contain at least one label")

	    # Embed the target on its own and all labels in a single batch.
	    target_embedding = embed([target_topic])[0]
	    label_embeddings = embed(labels_list)

	    # Inner product of the target vector with each label vector, one score
	    # per label. NOTE(review): this equals cosine similarity only if the
	    # encoder's outputs are unit-normalized -- confirm.
	    similarities = np.inner(target_embedding, label_embeddings)

	    most_similar_index = np.argmax(similarities)

	    return labels_list[most_similar_index], similarities[most_similar_index], most_similar_index

	def search_wikipedia(query, driver):
	    """Open Wikipedia's portal page and submit ``query`` through its search box.

	    Args:
	        query: Text typed into the search field.
	        driver: Selenium WebDriver used for the navigation.

	    Returns:
	        The same ``driver`` that was passed in, after the search was submitted.
	    """
	    # Go to Wikipedia's main page
	    driver.get("https://www.wikipedia.org/")

	    # The search input is located by its ``name`` attribute
	    search_bar = driver.find_element(By.NAME, "search")

	    # Type the query, then press Enter to submit it
	    search_bar.send_keys(query)
	    search_bar.send_keys(Keys.RETURN)

	    return driver


	def get_topic_context(driver):
	    """Return the opening sentence of the page currently loaded in ``driver``.

	    Args:
	        driver: Selenium WebDriver positioned on the page to summarize.

	    Returns:
	        The text of the first non-empty paragraph directly under
	        ``div.mw-parser-output``, cut at the first ``". "``. Returns an empty
	        string when the page has no such paragraph.
	    """
	    # find_elements (plural) yields an empty list instead of raising, so a
	    # page without a lead paragraph does not abort the caller.
	    paragraphs = driver.find_elements(By.CSS_SELECTOR, "div.mw-parser-output > p:not(.mw-empty-elt)")
	    if not paragraphs:
	        return ""

	    first_paragraph = paragraphs[0].text
	    return first_paragraph.split(". ")[0]

	def _collect_candidate_links(page_source, current_url, topic, used_links, used_topics):
	    """Return ``(url, text)`` pairs for the article links on a page that are still worth visiting."""
	    current_url_suffix = str(current_url).split("/")[-1]
	    topic_lower = topic.lower()
	    candidates = []

	    for anchor in BeautifulSoup(page_source, 'html.parser').find_all('a'):
	        href = anchor.get('href')

	        # Only site-relative article links are considered.
	        if not href or not href.startswith("/wiki/"):
	            continue

	        # Skip non-content pages (those with ":" in the path) and the main page.
	        if ":" in href or href.split("/")[-1] == "Main_Page":
	            continue

	        link_url = "https://en.wikipedia.org" + href
	        link_text = anchor.text.strip()
	        if not link_text:
	            continue

	        # Skip links back to the current page. An empty suffix would match
	        # every URL, so it is ignored rather than filtering everything out.
	        if current_url_suffix and current_url_suffix in link_url:
	            continue

	        # Skip pages and link texts that were already chosen earlier.
	        if link_url in used_links or link_text in used_topics:
	            continue

	        # Skip links whose URL contains the current topic text.
	        if topic_lower in link_url.lower():
	            continue

	        candidates.append((link_url, link_text))

	    return candidates


	def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100):
	    """Walk Wikipedia links from ``starting_topic`` towards ``target_topic``.

	    On each page the link whose text scores highest against ``target_topic``
	    (via ``most_similar_sentence``) is followed. Progress and the final
	    outcome are printed to stdout.

	    The walk stops when the chosen link text matches the target (exact match
	    ignoring case, Jaro similarity above 0.9, or embedding score above 0.90),
	    when ``limit`` pages have been visited, or when a page offers no usable
	    links.

	    Args:
	        starting_topic: Search query used to find the first page.
	        target_topic: Topic the walk tries to reach.
	        limit: Maximum number of pages to visit before giving up.

	    Returns:
	        None.
	    """
	    ##### Setup Chrome options
	    chrome_options = webdriver.ChromeOptions()
	    chrome_options.add_argument("--headless")  # Ensure GUI is off
	    chrome_options.add_argument("--no-sandbox")
	    chrome_options.add_argument("--disable-dev-shm-usage")
	    driver = webdriver.Chrome(options=chrome_options)

	    # try/finally guarantees the browser is shut down even if a page load,
	    # element lookup, or the embedding model raises.
	    try:
	        topic = starting_topic
	        num_pages = 0
	        next_link = None  # set once a link has been chosen on some page
	        used_topics = []
	        used_links = []

	        start_time = time.time()

	        ### BEGIN ###

	        print("-" * 150)
	        print("\nStarting!\n")
	        print("-" * 150)

	        driver = search_wikipedia(starting_topic, driver)
	        used_links.append(driver.current_url)

	        while True:
	            # increment the page tracking by 1 for each new page
	            num_pages += 1

	            # the first page is already loaded by the search; later pages
	            # are reached by following the previously chosen link
	            if next_link is not None:
	                driver.get(next_link)

	            context_sentence = get_topic_context(driver)
	            current_url = driver.current_url

	            # Link extraction parses the HTML Selenium already fetched.
	            links_texts = _collect_candidate_links(
	                driver.page_source, current_url, topic, used_links, used_topics
	            )

	            # Dead end: nothing to rank, so stop instead of crashing.
	            if not links_texts:
	                print("\n" + "-" * 150)
	                print(f"\nNo unvisited article links were found on '{current_url}', so the game stopped after {num_pages} pages.\n")
	                print("-" * 150)
	                break

	            best_label, best_score, loc_idx = most_similar_sentence(
	                target_topic=target_topic,
	                labels_list=[text for _, text in links_texts],
	            )

	            print(f"\nPage: {num_pages}")
	            print(f"Current topic: '{topic.title()}'")
	            print(f"Current URL: '{current_url}'")
	            print(f"Current Topic Context: '{context_sentence}'")
	            print(f"Next topic: '{best_label.title()}'. Semantic similarity to '{target_topic.title()}': {round((best_score * 100), 2)}%")

	            next_link, topic = links_texts[loc_idx]

	            # Target reached if the link text is identical, spelled at least
	            # 90% the same, or scores above 0.90 against the target.
	            reached_target = (
	                target_topic.lower() == topic.lower()
	                or calculate_jaro_similarity(target_topic.lower(), topic.lower()) > 0.9
	                or best_score > 0.90
	            )
	            if reached_target:
	                print("\n" + "-" * 150)
	                print(f"\nFrom '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages, {round(time.time() - start_time, 2)} seconds!")
	                print(f"Starting topic: '{starting_topic.title()}': '{used_links[0]}'")
	                # next_link is the page that matched the target;
	                # used_links[-1] is still the page it was found on.
	                print(f"Target topic: '{target_topic.title()}': '{next_link}'\n")
	                print("-" * 150)
	                break

	            # >= rather than == so a limit of zero or less still terminates.
	            if num_pages >= limit:
	                print("\n" + "-" * 150)
	                print(f"\nUnfortunately, the model couldn't get from '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages or less.")
	                print(f"In {round(time.time() - start_time, 2)} seconds, it got from '{starting_topic.title()}': '{used_links[0]}', to '{target_topic.title()}': '{used_links[-1]}'")
	                print("\nTry a different combination to see if it can do it!\n")
	                print("-" * 150)
	                break

	            used_links.append(next_link)
	            used_topics.append(topic)
	    finally:
	        driver.quit()

	###### Example

	# starting_topic = "soulja boy"
	# target_topic = "test"
	# play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 50)