Commit bf84cfc
KAI MAURIN-JONES committed
1 Parent(s): 657dfe4

app updated

Files changed:
- app.py +1 -1
- wiki_game_st_bs4.py +101 -0
- wiki_game_st.py → wiki_game_st_sel.py +0 -0
app.py
CHANGED
@@ -1,5 +1,5 @@
 import streamlit as st
-from wiki_game_st import *
+from wiki_game_st_bs4 import *

 # Set the title of the app
 st.title("Wiki Game (BETA)")
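Note: the hunk above covers only the import swap; whatever UI app.py has beyond the title is outside this diff. As a rough sketch of how the front end presumably hands topics to the star-imported play_wiki_game (the widget labels and button here are hypothetical, not taken from this commit):

    # Hypothetical wiring, not part of this commit's diff
    starting_topic = st.text_input("Starting topic")
    target_topic = st.text_input("Target topic")
    if st.button("Play") and starting_topic and target_topic:
        play_wiki_game(starting_topic=starting_topic, target_topic=target_topic, limit=50)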
wiki_game_st_bs4.py
ADDED
@@ -0,0 +1,101 @@
+import requests
+from bs4 import BeautifulSoup
+import time
+import tensorflow as tf
+import tensorflow_hub as hub
+import numpy as np
+import jellyfish
+import streamlit as st
+
+# Load the pre-trained Universal Sentence Encoder
+embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
+
+def calculate_jaro_similarity(str1, str2):
+    # jellyfish's jaro_distance returns a similarity in [0, 1], despite the name
+    jaro_similarity = jellyfish.jaro_distance(str1, str2)
+    return jaro_similarity
+
+def most_similar_sentence(target_topic, labels_list):
+    # Embed the target topic and all candidate labels, then rank by inner product
+    context_embedding = embed([target_topic])[0]
+    sentence_embeddings = embed(labels_list)
+    similarities = np.inner(context_embedding, sentence_embeddings)
+    most_similar_index = np.argmax(similarities)
+    return labels_list[most_similar_index], similarities[most_similar_index], most_similar_index
+
+def get_wikipedia_page(query):
+    response = requests.get(f"https://en.wikipedia.org/wiki/{query}")
+    return response.text
+
+def get_topic_context(page_source):
+    # The article's first non-empty paragraph supplies the topic's context sentence
+    soup = BeautifulSoup(page_source, 'html.parser')
+    first_paragraph = soup.select_one("div.mw-parser-output > p:not(.mw-empty-elt)").text
+    context_sentence = first_paragraph.split(". ")[0]
+    return context_sentence
+
+def play_wiki_game(starting_topic: str, target_topic: str, limit: int = 100):
+    topic = starting_topic
+    num_pages = 0
+    used_topics = []
+    used_links = []
+    start_time = time.time()
+
+    st.write("-" * 150)
+    st.write(f"\nStarting!\n")
+    st.write("-" * 150)
+
+    page_source = get_wikipedia_page(starting_topic)
+    used_links.append(f"https://en.wikipedia.org/wiki/{starting_topic}")
+
+    while True:
+        num_pages += 1
+
+        # The starting page was already fetched before the loop
+        if num_pages > 1:
+            page_source = get_wikipedia_page(topic)
+
+        context_sentence = get_topic_context(page_source)
+        links_texts = []
+
+        soup = BeautifulSoup(page_source, 'html.parser')
+        links = soup.find_all('a')
+
+        for link in links:
+            link_url = link.get('href')
+            if link_url and link_url.startswith("/wiki/"):
+                link_url = "https://en.wikipedia.org" + link_url
+                link_text = link.text.strip()
+
+                # Skip self-links, already-visited URLs, and previously used topics
+                if link_text and topic.lower() not in link_url.lower() and link_url not in used_links and link_text not in used_topics:
+                    # Exclude namespace pages (e.g. "Category:...") and the Main Page
+                    if "en.wikipedia.org/wiki/" in link_url and ":" not in "".join(link_url.split("/")[1:]) and "Main_Page" != str(link_url.split("/")[-1]):
+                        links_texts.append((link_url, link_text))
+
+        best_label, best_score, loc_idx = most_similar_sentence(target_topic = target_topic, labels_list = [text for link, text in links_texts])
+
+        st.write(f"\nPage: {num_pages}")
+        st.write(f"Current topic: '{topic.title()}'")
+        st.write(f"Current URL: 'https://en.wikipedia.org/wiki/{topic}'")
+        st.write(f"Current Topic Context: '{context_sentence}'")
+        st.write(f"Next topic: '{best_label.title()}'. Semantic similarity to '{target_topic.title()}': {round((best_score * 100), 2)}%")
+
+        next_link, topic = links_texts[loc_idx]
+
+        # Stop on an exact match, a near-identical spelling (Jaro > 0.9), or a very high semantic score
+        if target_topic.lower() == topic.lower() or calculate_jaro_similarity(target_topic.lower(), topic.lower()) > 0.9 or best_score > float(0.90):
+            st.write("\n" + "-" * 150)
+            st.write(f"\nFrom '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages, {round(time.time() - start_time, 2)} seconds!")
+            st.write(f"Starting topic: '{starting_topic.title()}': 'https://en.wikipedia.org/wiki/{starting_topic}'")
+            st.write(f"Target topic: '{target_topic.title()}': '{used_links[-1]}'\n")
+            st.write("-" * 150)
+            break
+
+        if num_pages == limit:
+            st.write("\n" + "-" * 150)
+            st.write(f"\nUnfortunately, the model couldn't get from '{starting_topic.title()}', to '{target_topic.title()}' in {num_pages} pages or less.")
+            st.write(f"In {round(time.time() - start_time, 2)} seconds, it got from '{starting_topic.title()}': 'https://en.wikipedia.org/wiki/{starting_topic}', to '{target_topic.title()}': '{used_links[-1]}'")
+            st.write(f"\nTry a different combination to see if it can do it!\n")
+            st.write("-" * 150)
+            break
+
+        used_links.append(next_link)
+        used_topics.append(topic)
+
+# starting_topic = "soulja boy"
+# target_topic = "test"
+# play_wiki_game(starting_topic = starting_topic, target_topic = target_topic, limit = 50)
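For a sense of the Jaro stopping threshold used above: jellyfish.jaro_distance scores near-identical spellings well above 0.9, so the check catches minor spelling variants of the target without matching unrelated words. A quick illustration (values follow from the standard Jaro formula):

    import jellyfish

    print(jellyfish.jaro_distance("color", "colour"))  # ~0.94, clears the 0.9 bar
    print(jellyfish.jaro_distance("color", "flavor"))  # 0.7, well below the bar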
wiki_game_st.py → wiki_game_st_sel.py
RENAMED
File without changes
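The semantic half of the stopping rule works the same way as the link ranking: both take inner products of Universal Sentence Encoder embeddings and treat the result as a similarity score. A minimal standalone sanity check of that ranking logic (same TF Hub model URL as in the new file; the labels here are made-up examples):

    import numpy as np
    import tensorflow_hub as hub

    embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

    labels = ["Basketball", "Baroque music", "Computer science"]
    sims = np.inner(embed(["machine learning"])[0], embed(labels))
    print(labels[int(np.argmax(sims))])  # expected: "Computer science"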