import pandas as pd
import numpy as np
import re
import itertools
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from fuzzywuzzy import fuzz

from huggingface_hub import upload_file
from datasets import load_dataset

import gradio as gr

# Pull the cleaned 2019 Steam games dataset from the Hugging Face Hub.
dataset = load_dataset("seyia92coding/steam-clean-games-2019")

# load_dataset returns a DatasetDict, not a file path, so convert the split to a
# DataFrame directly (assuming the default "train" split) rather than calling pd.read_csv.
df = dataset["train"].to_pandas()

# Parse the release year from the "release_date" string; non-numeric prefixes become NaN.
def extract_year(date):
    year = date[:4]
    if year.isnumeric():
        return int(year)
    else:
        return np.nan

df['year'] = df['release_date'].apply(extract_year)

# Hyphenate multi-word tags so each steamspy tag stays a single token, then turn the
# semicolon-separated tag list into a space-separated "genres" string for TF-IDF.
df['steamspy_tags'] = df['steamspy_tags'].str.replace(' ', '-')
df['genres'] = df['steamspy_tags'].str.replace(';', ' ')

# Count how often each genre tag appears across the catalogue.
counts = dict()
for i in df.index:
    for g in df.loc[i, 'genres'].split(' '):
        if g not in counts:
            counts[g] = 1
        else:
            counts[g] = counts[g] + 1
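
# counts is not used again below; an optional way to eyeball the most common tags
# (purely an inspection step, safe to remove):
print(sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:10])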

# Fraction of ratings that are positive, rounded to two decimals.
def create_score(row):
    pos_count = row['positive_ratings']
    neg_count = row['negative_ratings']
    total_count = pos_count + neg_count
    average = pos_count / total_count
    return round(average, 2)

# Total number of ratings (positive + negative).
def total_ratings(row):
    pos_count = row['positive_ratings']
    neg_count = row['negative_ratings']
    total_count = pos_count + neg_count
    return total_count

df['total_ratings'] = df.apply(total_ratings, axis=1)
df['score'] = df.apply(create_score, axis=1)

# IMDB-style weighted rating: C is the mean score across all games and m is the
# minimum number of ratings required to be listed (90th percentile of total_ratings).
C = df['score'].mean()
m = df['total_ratings'].quantile(0.90)

def weighted_rating(x, m=m, C=C):
    v = x['total_ratings']
    R = x['score']
    # Blend the game's own score with the catalogue mean, weighted by rating volume.
    return round((v/(v+m) * R) + (m/(m+v) * C), 2)
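
# Worked example with illustrative numbers (not taken from the dataset): if C = 0.72 and
# m = 1000, a game with R = 0.95 but only v = 50 ratings gets
# (50/1050)*0.95 + (1000/1050)*0.72 ≈ 0.73, pulling a thinly-rated score toward the mean.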

df['weighted_score'] = df.apply(weighted_rating, axis=1)

# TF-IDF over the space-separated genre tags.
tfidf_vector = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vector.fit_transform(df['genres'])

# linear_kernel on L2-normalised TF-IDF vectors is equivalent to cosine similarity.
sim_matrix = linear_kernel(tfidf_matrix, tfidf_matrix)

# Levenshtein-based similarity (0-100) between two title strings.
def matching_score(a, b):
    return fuzz.ratio(a, b)

"""# Make our Recommendation Engine

We need to combine our formatted dataset with the similarity logic to return recommendations. This is also where we can fine-tune the engine if we do not like the results (see the direct-call sketch after the recommender function below).
"""

# Helper look-ups from a DataFrame index (or title) to the columns we display.
def get_title_year_from_index(index):
    return df[df.index == index]['year'].values[0]

def get_title_from_index(index):
    return df[df.index == index]['name'].values[0]

def get_index_from_title(title):
    return df[df.name == title].index.values[0]

def get_score_from_index(index):
    return df[df.index == index]['score'].values[0]

def get_weighted_score_from_index(index):
    return df[df.index == index]['weighted_score'].values[0]

def get_total_ratings_from_index(index):
    return df[df.index == index]['total_ratings'].values[0]

def get_platform_from_index(index):
    return df[df.index == index]['platforms'].values[0]

# Fuzzy-match the user's input against every title and return the closest catalogue title
# together with its Levenshtein similarity score.
def find_closest_title(title):
    leven_scores = list(enumerate(df['name'].apply(matching_score, b=title)))
    sorted_leven_scores = sorted(leven_scores, key=lambda x: x[1], reverse=True)
    closest_title = get_title_from_index(sorted_leven_scores[0][0])
    distance_score = sorted_leven_scores[0][1]
    return closest_title, distance_score

def gradio_contents_based_recommender_v2(game, how_many, sort_option, min_year, platform, min_score):
    # Resolve whatever the user typed to the closest matching title in the catalogue.
    closest_title, distance_score = find_closest_title(game)

    games_index = get_index_from_title(closest_title)

    # Similarity of the chosen game to every other game, sorted high to low and
    # excluding the game itself.
    games_list = list(enumerate(sim_matrix[int(games_index)]))
    similar_games = list(filter(lambda x: x[0] != int(games_index),
                                sorted(games_list, key=lambda x: x[1], reverse=True)))

    print('Here\'s the list of games similar to ' + '\033[1m' + str(closest_title) + '\033[0m' + '.\n')

    # Keep only games available on the requested platform.
    n_games = []
    for i, s in similar_games:
        if platform in get_platform_from_index(i):
            n_games.append((i, s))

    # Keep only games above the minimum score.
    high_scores = []
    for i, s in n_games:
        if get_score_from_index(i) > min_score:
            high_scores.append((i, s))

    # Build the result table from the top matches that survived both filters.
    # Collect rows in a list and build the DataFrame once (DataFrame.append was removed in pandas 2.0).
    rows = []
    for i, s in high_scores[:int(how_many)]:
        rows.append({'Game Title': get_title_from_index(i),
                     'Year': get_title_year_from_index(i),
                     'Score': get_score_from_index(i),
                     'Weighted Score': get_weighted_score_from_index(i),
                     'Total Ratings': get_total_ratings_from_index(i)})

    recomm_df = pd.DataFrame(rows, columns=['Game Title', 'Year', 'Score', 'Weighted Score', 'Total Ratings'])
    recomm_df = recomm_df.sort_values(sort_option, ascending=False)
    recomm_df = recomm_df[recomm_df['Year'] >= min_year]

    return recomm_df
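
# A quick sanity check of the engine outside Gradio. The title, year, and score values
# below are illustrative guesses, not taken from the dataset; swap in any game you know
# exists in df['name'] before relying on the output.
print(gradio_contents_based_recommender_v2('Portal 2', 5, 'Weighted Score', 2010, 'windows', 0.7))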

# The year column contains NaN for unparseable dates, so drop those before sorting.
years_sorted = sorted(df['year'].dropna().unique())

# Gradio 3+ component API (gr.inputs.* was removed). The "score" column is a 0-1 ratio,
# so the minimum-score slider runs from 0 to 1 rather than 0 to 10.
recommender = gr.Interface(gradio_contents_based_recommender_v2,
                           ["text",
                            gr.Slider(1, 20, step=1),
                            gr.Radio(['Year', 'Score', 'Weighted Score', 'Total Ratings']),
                            gr.Slider(int(years_sorted[0]), int(years_sorted[-1]), step=1),
                            gr.Radio(['windows', 'xbox', 'playstation', 'linux', 'mac']),
                            gr.Slider(0, 1, step=0.05)],
                           "dataframe")

recommender.launch(debug=True)