Spaces:

arjunpatel
/

best-selling-video-games

Runtime error

App Files Files Community

best-selling-video-games / data_cleaning.py

arjunpatel

Fix a space specific error on NLTK input

2840a75 about 2 years ago

raw

history blame contribute delete

2.7 kB


	import pandas as pd
	import numpy as np
	import re

	from nltk.tokenize import word_tokenize, sent_tokenize
	from nltk.stem import PorterStemmer
	import nltk
	nltk.download('punkt')

	from textacy.preprocessing.remove import accents, brackets, punctuation
	from textacy.preprocessing.replace import numbers, urls
	from textacy.preprocessing.normalize import whitespace

	import os

	def clean_page(page):
	    """Strip markup noise from a raw page and return it lowercased.

	    Removes runs of ``=`` (heading markers), turns newlines and tabs into
	    spaces, then applies the textacy helpers imported at the top of the
	    file: ``brackets`` and ``accents`` (from ``remove``), ``urls`` (from
	    ``replace``) and ``whitespace`` (from ``normalize``).

	    Args:
	        page: Raw page text as a string.

	    Returns:
	        The cleaned, lowercased text.
	    """
	    # Drop heading markers such as the "==" in "== Title ==".
	    page = re.sub("=+", "", page)

	    # Replace line breaks and tabs with a space rather than deleting them:
	    # deleting fuses the words on either side ("foo\nbar" -> "foobar"),
	    # which corrupts the later sentence/word tokenization.
	    # NOTE(review): whitespace() below is expected to collapse the extra
	    # spaces this can introduce - confirm against the installed textacy.
	    page = page.replace("\n", " ").replace("\t", " ")

	    page = accents(brackets(page))
	    page = urls(page)

	    return whitespace(page).lower()

	def clean_sentences(s):
	    """Return ``s`` with every non-alphanumeric character removed.

	    Each run of characters outside ``A-Z``, ``a-z`` and ``0-9`` is deleted.
	    Whitespace is outside that set too, so words are joined together.

	    Args:
	        s: The string to clean.

	    Returns:
	        The string containing only ASCII letters and digits.
	    """
	    pattern = r'[^A-Za-z0-9]+'
	    # Return the substituted text; previously the result was stored in an
	    # unused local and the unmodified input was returned instead.
	    return re.sub(pattern, '', s)



	# Module-level Porter stemmer, created once and reused by prepare_document.
	ps = PorterStemmer()
	def prepare_document(doc):
	    """Preprocess a raw document into a single stemmed string for tf-idf.

	    The document is cleaned and lowercased with ``clean_page``, split into
	    sentences, stripped of punctuation, split into words, and every word is
	    stemmed with the module-level Porter stemmer ``ps``.

	    Args:
	        doc: Raw document text.

	    Returns:
	        One string: the stemmed words of each sentence joined by single
	        spaces, with the sentences themselves joined by single spaces.
	    """
	    cleaned_text = clean_page(doc)

	    stemmed_sentences = []
	    for sentence in sent_tokenize(cleaned_text):
	        # Punctuation is removed per sentence, after sentence splitting,
	        # so the sentence tokenizer still sees the original terminators.
	        tokens = word_tokenize(punctuation(sentence))
	        stemmed_sentences.append(" ".join(ps.stem(token) for token in tokens))

	    return " ".join(stemmed_sentences)


	# small function to calculate the cosine similarity of one pair of vectors
	def cosine_similarity(v1, v2):
	    """Return the cosine similarity between ``v1`` and ``v2``.

	    Computed as ``dot(v1, v2) / (||v1|| * ||v2||)`` using NumPy.

	    Args:
	        v1: First vector (anything ``np.dot`` / ``np.square`` accept).
	        v2: Second vector, same length as ``v1``.

	    Returns:
	        The cosine similarity, or ``0.0`` when either vector has zero
	        magnitude.
	    """
	    numerator = np.dot(v1, v2)
	    denom = np.sqrt(np.sum(np.square(v1))) * np.sqrt(np.sum(np.square(v2)))

	    # A zero-magnitude vector has no direction; dividing would produce NaN
	    # (plus a RuntimeWarning), and NaN scores cannot be ranked reliably.
	    if denom == 0:
	        return 0.0

	    return numerator / denom


	def cos_dicts(names, vects):
	    """Build a dict of dicts holding pairwise cosine similarities.

	    The outer key is the game of interest; its value is a dictionary whose
	    keys are the partner games and whose values are the cosine similarity
	    between the partner and the outer game. This layout lets a caller index
	    straight into any pair: ``result[game][partner]``.

	    Args:
	        names: Names, paired positionally with ``vects``.
	        vects: Vectors, one per name.

	    Returns:
	        ``{name: {other_name: similarity, ...}, ...}``. Entries whose name
	        equals the outer name are left out of the inner dictionary.
	    """
	    return {
	        outer_name: {
	            inner_name: cosine_similarity(outer_vect, inner_vect)
	            for inner_name, inner_vect in zip(names, vects)
	            if inner_name != outer_name
	        }
	        for outer_name, outer_vect in zip(names, vects)
	    }

	def retrieve_top_k_similar(n1, similarity_dict, k):
	    """Return the ``k`` entries most similar to ``n1``.

	    Args:
	        n1: Key to look up in ``similarity_dict``.
	        similarity_dict: Mapping of name -> {other_name: similarity}.
	        k: Number of leading entries to keep.

	    Returns:
	        A list of ``(other_name, similarity)`` tuples ordered by similarity,
	        highest first, truncated to the first ``k`` items.

	    Raises:
	        KeyError: If ``n1`` is not a key of ``similarity_dict``.
	    """
	    ranked = list(similarity_dict[n1].items())
	    # Order by the similarity value (second tuple element), best first.
	    ranked.sort(key=lambda pair: pair[1], reverse=True)
	    return ranked[:k]