Spaces:
Runtime error
Runtime error
# -*- coding: utf-8 -*- | |
"""HS_Text-based_Recom_Metacritic.ipynb | |
Automatically generated by Colaboratory. | |
Original file is located at | |
https://colab.research.google.com/drive/1MmWRwRJT04GVAO2SKCpwSqQ2bWghVGtQ | |
""" | |
import pandas as pd | |
import numpy as np | |
from fuzzywuzzy import fuzz | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.metrics.pairwise import cosine_similarity | |
# Load the Metacritic review corpus. Malformed rows are skipped with a warning
# instead of aborting the whole load.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in pandas 2.0;
# `on_bad_lines="warn"` is its replacement (skip the row, emit a warning).
try:
    df = pd.read_csv("Metacritic_Reviews_Only.csv", on_bad_lines="warn", encoding='utf-8')
except TypeError:
    # pandas < 1.3 does not know `on_bad_lines`; use the legacy keyword there.
    df = pd.read_csv("Metacritic_Reviews_Only.csv", error_bad_lines=False, encoding='utf-8')
# Remove title from review
def remove_title(row):
    """Return the row's review text with every occurrence of its game title removed.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing the
            'Game Title' and 'Reviews' keys.

    Returns:
        The review text without the game title. If the review is not a string
        (e.g. NaN for a missing review) it is returned unchanged so that a
        later ``dropna`` can discard it; if the title is missing or empty the
        review is returned as-is.
    """
    game_title = row['Game Title']
    body_text = row['Reviews']
    # Missing values arrive as float NaN, which has no .replace(); pass them through.
    if not isinstance(body_text, str):
        return body_text
    # A non-string title cannot be searched for, and an empty one has nothing to strip.
    if not isinstance(game_title, str) or not game_title:
        return body_text
    return body_text.replace(game_title, "")
# Drop the redundant column left over from a previously saved index (if it exists).
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
# Drop null reviews BEFORE touching the text so string operations never see NaN,
# then renumber the rows so positional and label-based indices agree.
df = df.dropna().reset_index(drop=True)
# Strip each game's own title out of its review text so the title words do not
# dominate the TF-IDF features.
df['Reviews'] = df.apply(remove_title, axis=1)
# Instantiate the vectorizer.
# A term must appear in at least 2 documents to be kept; terms that appear in
# more than 70% of documents are discarded.
vectorizer = TfidfVectorizer(min_df=2, max_df=0.7)
# Fit and transform the review column.
vectorized_data = vectorizer.fit_transform(df['Reviews'])
# `get_feature_names()` was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Create a DataFrame from the TF-IDF array (one row per review, one column per term).
tfidf_df = pd.DataFrame(vectorized_data.toarray(), columns=feature_names)
# Assign the game titles to the index.
tfidf_df.index = df['Game Title']
# Pairwise cosine similarity between all games.
cosine_similarity_array = cosine_similarity(tfidf_df)
# Wrap the similarity matrix in a DataFrame with game titles as both rows and columns.
cosine_similarity_df = pd.DataFrame(cosine_similarity_array, index=tfidf_df.index, columns=tfidf_df.index)
# Helper used to find the closest title
def matching_score(a, b):
    """Return the fuzzywuzzy ``fuzz.ratio`` similarity score between a and b.

    The score is based on the Levenshtein distance between the two strings;
    when they are exactly the same the score is 100.
    """
    similarity = fuzz.ratio(a, b)
    return similarity
# Convert an index label to a game title
def get_title_from_index(index):
    """Return the 'Game Title' of the first row of ``df`` whose index label equals ``index``.

    Raises IndexError when no row carries that label.
    """
    row_mask = df.index == index
    matching_titles = df.loc[row_mask, 'Game Title']
    return matching_titles.values[0]
# A function to return the most similar title to the words a user types.
# Without this, the recommender only works when a user enters the exact title which the data has.
def find_closest_title(title):
    """Return the dataset title that best matches ``title`` and its match score.

    Args:
        title: Free-text game title typed by the user.

    Returns:
        A ``(closest_title, distance_score)`` tuple, where ``distance_score``
        is the ``matching_score`` of the winning title (100 for an exact match).
    """
    titles = df['Game Title']
    # matching_score(a, b): a is the current row's title, b is the title we're trying to match.
    scores = titles.apply(matching_score, b=title)
    # Pair every score with the row's real index LABEL. Using enumerate() here
    # would yield positions, which stop matching the labels as soon as rows
    # have been dropped from df, and the label-based lookup below would then
    # return the wrong title or fail.
    labelled_scores = list(zip(titles.index, scores))  # [(label, score), ...]
    # max() keeps the first of several equal scores, the same winner a stable
    # descending sort would put first.
    best_label, distance_score = max(labelled_scores, key=lambda pair: pair[1])
    closest_title = get_title_from_index(best_label)
    return closest_title, distance_score
# Example of a returned pair: ('Bejeweled Twist', 100)
# find_closest_title('Batman Arkham Knight')
"""# Build Recommender Function | |
Our recommender function takes three kinds of input: three game titles, three exclusion keywords, and the maximum number of results to return. The keyword exclusion was added when I realised that the recommendations were returning a lot of DLCs and sequels, which doesn't make for a very useful recommender.
By combining everything we've done from building the user profile onwards, we will pull out the top games we want to recommend.
1. Text Match the closest title in the dataset | |
2. Assign number for the final ranking | |
3. Create your user profile based on previous games | |
4. Create TFIDF subset without previously mentioned titles | |
5. Calculate cosine similarity based on selected titles and convert back into DataFrame | |
6. Sort DataFrame by similarity | |
7. Return the most similar game titles that don't contain any of the keywords
""" | |
def recommend_games(game1, game2, game3, keyword1, keyword2, keyword3, max_results):
    """Recommend games whose reviews are most similar to three games the user enjoyed.

    Args:
        game1, game2, game3: Free-text titles of games the user enjoyed; each
            is resolved to the closest title in the dataset.
        keyword1, keyword2, keyword3: Case-insensitive substrings; any
            candidate whose title contains one of them is excluded (used to
            filter out sequels/DLCs). Blank keywords are ignored.
        max_results: Maximum number of recommendations to produce.

    Returns:
        A DataFrame with the columns "Rank", "Game Title" and "Match (%)",
        one row per recommendation, best match first. The same information
        is also printed.
    """
    # Resolve the user's input to titles that exist in the dataset.
    title1, _ = find_closest_title(game1)
    title2, _ = find_closest_title(game2)
    title3, _ = find_closest_title(game3)
    print('Recommended because you played {}, {} and {}:\n'.format(title1, title2, title3))
    list_of_games_enjoyed = [title1, title2, title3]
    # User profile = mean TF-IDF vector of the games the user enjoyed.
    games_enjoyed_df = tfidf_df.reindex(list_of_games_enjoyed)
    user_prof = games_enjoyed_df.mean()
    # Candidates are all games except the ones the user already named.
    tfidf_subset_df = tfidf_df.drop(list_of_games_enjoyed, axis=0)
    similarity_array = cosine_similarity(user_prof.values.reshape(1, -1), tfidf_subset_df)
    similarity_df = pd.DataFrame(similarity_array.T, index=tfidf_subset_df.index, columns=["similarity_score"])
    # Sort the values from high to low by similarity_score.
    sorted_similarity_df = similarity_df.sort_values(by="similarity_score", ascending=False)
    # Inspect the most similar to the user preferences.
    print("Without Keywords Exclusions:")
    print(sorted_similarity_df.head())
    print("\n")
    print("With Keywords Exclusions:\n ")
    # An empty string is a substring of every title, so a blank keyword would
    # exclude every candidate; ignore blanks.
    excluded_keywords = [
        keyword.strip().lower()
        for keyword in (keyword1, keyword2, keyword3)
        if keyword and keyword.strip()
    ]
    recommendations = []
    # Walk titles and scores together so each printed score always belongs to
    # the title next to it, even after some candidates have been skipped.
    for candidate, score in zip(sorted_similarity_df.index, sorted_similarity_df["similarity_score"]):
        if len(recommendations) >= max_results:
            break  # enough results; no need to scan the remaining candidates
        candidate_lower = candidate.lower()
        if any(keyword in candidate_lower for keyword in excluded_keywords):
            continue
        rank = len(recommendations) + 1
        match_percent = round(score * 100, 2)
        print("#" + str(rank) + ": " + candidate + ", " + str(match_percent) + "% " + "match")
        recommendations.append((rank, candidate, match_percent))
    return pd.DataFrame(recommendations, columns=["Rank", "Game Title", "Match (%)"])
# recommend_games('Mortal Kombat', 'Street Fighter', 'Overwatch', 'Kombat', 'Fighter', 'Overwatch', 5) | |
import gradio as gr | |
# The `gr.inputs` namespace was deprecated in Gradio 3 and removed in Gradio 4.
# Use the top-level Slider component when it exists and fall back to the
# legacy class only on old Gradio releases.
_slider_cls = getattr(gr, "Slider", None) or gr.inputs.Slider
# Web UI: three game titles, three exclusion keywords and a result-count slider (1-20).
recommender_interface = gr.Interface(
    fn=recommend_games,
    inputs=["text", "text", "text", "text", "text", "text", _slider_cls(1, 20, step=1)],
    title="Text-based Recommendation Engine for Video Games",
    description="""This is a Recommendation Engine based on the review texts of Metacritic critics for games between 2011-2019.
You need to enter 3 games you've enjoyed playing followed by 3 keywords from those game titles so that I can avoid recommending the same games to you.""",
    examples=[
        ['Mortal Kombat', 'Street Fighter', 'Overwatch', 'Kombat', 'Fighter', 'Overwatch', 5],
        ["Batman Arkham Knight", "Dying Light", "Left 4 Dead", "Batman", "Dying", "Left", 10],
        ["Mario Kart", "Zelda", "Final Fantasy", "Mario", "Zelda", "Final", 7],
    ],
    outputs=["dataframe"],
)
recommender_interface.launch(debug=True)