Spaces:
Runtime error
Runtime error
import pandas as pd | |
import numpy as np | |
import re | |
from nltk.tokenize import word_tokenize, sent_tokenize | |
from nltk.stem import PorterStemmer | |
import nltk | |
nltk.download('punkt') | |
from textacy.preprocessing.remove import accents, brackets, punctuation | |
from textacy.preprocessing.replace import numbers, urls | |
from textacy.preprocessing.normalize import whitespace | |
import os | |
def clean_page(page):
    """Return a lowercased, single-line version of ``page`` for tokenizing.

    Steps, in order:
      1. drop every run of ``=`` characters (heading markers),
      2. delete newline and tab characters outright (no space is inserted),
      3. pass the text through textacy's ``brackets`` and then ``accents``
         removers,
      4. pass it through textacy's ``urls`` replacer,
      5. normalize whitespace with textacy and lowercase the result.

    See the textacy preprocessing docs for the exact rules each helper
    applies; lowercasing happens last, so it also affects whatever
    placeholder text ``urls`` inserts.
    """
    text = re.sub("=+", "", page)
    for control_char in ("\n", "\t"):
        text = text.replace(control_char, "")
    without_brackets = brackets(text)
    without_accents = accents(without_brackets)
    without_urls = urls(without_accents)
    normalized = whitespace(without_urls)
    return normalized.lower()
def clean_sentences(s):
    """Return ``s`` with every non-alphanumeric character removed.

    Anything outside ``A-Z``, ``a-z`` and ``0-9`` is deleted, which includes
    whitespace and punctuation, so ``"a, b!"`` becomes ``"ab"``.

    The previous version stored the substitution result in an unused local
    and returned the input unchanged, making the function a no-op.
    """
    pattern = r'[^A-Za-z0-9]+'
    return re.sub(pattern, '', s)
# Single shared stemmer instance, reused for every word of every document.
ps = PorterStemmer()


def prepare_document(doc):
    """Preprocess a raw document into one stemmed string for tf-idf.

    The document is cleaned and lowercased with ``clean_page``, split into
    sentences, and each sentence is passed through textacy's ``punctuation``
    remover, tokenized into words and stemmed with the module-level Porter
    stemmer. Stemmed words are joined with single spaces within a sentence,
    and the sentences are joined with single spaces into the returned string.
    """
    cleaned = clean_page(doc)
    stemmed_sentences = []
    for sentence in sent_tokenize(cleaned):
        # Punctuation is removed per sentence, after sentence splitting,
        # so the sentence boundaries found above are unaffected.
        tokens = word_tokenize(punctuation(sentence))
        stemmed_sentences.append(" ".join(ps.stem(token) for token in tokens))
    return " ".join(stemmed_sentences)
# small functions to calculate the cosine similarity of all pairs and store the results
def cosine_similarity(v1, v2):
    """Return the cosine similarity between vectors ``v1`` and ``v2``.

    Computed as ``dot(v1, v2) / (||v1|| * ||v2||)`` using Euclidean norms.

    If either vector has zero norm the similarity is mathematically
    undefined; ``0.0`` is returned in that case instead of dividing by zero.
    The unguarded division produced NaN (plus a RuntimeWarning), and NaN has
    no well-defined position when similarities are later sorted.
    """
    numerator = np.dot(v1, v2)
    denom = np.sqrt(np.sum(np.square(v1))) * np.sqrt(np.sum(np.square(v2)))
    if denom == 0:
        # At least one vector is all zeros (e.g. an empty document).
        return 0.0
    return numerator / denom
def cos_dicts(names, vects):
    """Build a dict of dicts holding the cosine similarity of every pair.

    ``names`` and ``vects`` are walked in lockstep. The outer key is the
    game of interest; its value is a dict whose keys are the other games and
    whose values are the cosine similarity between the two, so a pair can be
    looked up directly as ``result[first_game][second_game]``.

    An entry whose name equals the outer name is skipped, so a game is never
    paired with itself.
    """
    return {
        outer_name: {
            inner_name: cosine_similarity(outer_vect, inner_vect)
            for inner_name, inner_vect in zip(names, vects)
            if inner_name != outer_name
        }
        for outer_name, outer_vect in zip(names, vects)
    }
def retrieve_top_k_similar(n1, similarity_dict, k):
    """Return the ``k`` entries most similar to ``n1``.

    Looks up ``similarity_dict[n1]`` (a mapping of other name -> similarity)
    and returns a list of ``(name, similarity)`` tuples ordered from highest
    to lowest similarity, cut to the first ``k`` items. Entries with equal
    similarity keep the order they had in the inner dict.
    """
    ranked = list(similarity_dict[n1].items())
    # Stable descending sort on the similarity score (second tuple field).
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:k]