seyia92coding
committed on
Commit
•
8cc04c9
1
Parent(s):
415c027
Upload hugging_face_demo_v1.py
Browse files- hugging_face_demo_v1.py +176 -0
hugging_face_demo_v1.py
ADDED
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""Hugging Face Demo V1.ipynb
|
3 |
+
|
4 |
+
Automatically generated by Colaboratory.
|
5 |
+
|
6 |
+
Original file is located at
|
7 |
+
https://colab.research.google.com/drive/1UPgdrPyLAvEWpJifn7Y6eblkiM2yc0_3
|
8 |
+
"""
|
9 |
+
|
10 |
+
import pandas as pd
|
11 |
+
import numpy as np
|
12 |
+
import re
|
13 |
+
import itertools
|
14 |
+
import matplotlib.pyplot as plt
|
15 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
16 |
+
from sklearn.metrics.pairwise import linear_kernel
|
17 |
+
!pip install fuzzywuzzy
|
18 |
+
from fuzzywuzzy import fuzz
|
19 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
20 |
+
!pip install gradio
|
21 |
+
import gradio as gr
|
22 |
+
|
23 |
+
# Location of the cleaned Steam games CSV read by this script.
clean_ratings_tags = "/content/steam-clean-games.csv"

# Malformed rows are skipped with a warning rather than aborting the load.
# `on_bad_lines='warn'` is the replacement for the removed
# `error_bad_lines=False` keyword (with its default `warn_bad_lines=True`).
df = pd.read_csv(clean_ratings_tags, on_bad_lines='warn', encoding='utf-8')
|
26 |
+
|
27 |
+
# the function to extract years
|
28 |
+
def extract_year(date):
    """Return the year encoded in the first four characters of ``date``.

    Args:
        date: A release-date value; expected to be a string that starts
            with the year (e.g. ``"2019-05-01"``).

    Returns:
        The leading characters converted to ``int`` when they are all
        decimal digits, otherwise ``np.nan``. Non-string input (such as a
        missing value read from the CSV as NaN) also yields ``np.nan``
        instead of raising.
    """
    if not isinstance(date, str):
        return np.nan
    year = date[:4]
    # isdecimal() only accepts characters int() can parse; isnumeric() also
    # accepts things like fractions or superscripts, which make int() raise.
    if year.isdecimal():
        return int(year)
    return np.nan
|
34 |
+
# Release year of every game, derived from the 'release_date' column.
df['year'] = df['release_date'].apply(extract_year)

# Hyphenate multi-word tags so each tag contains no spaces, then turn the
# ';' separators into spaces to get one space-separated tag string per game.
# regex=False makes the literal (non-regex) replacement explicit.
df['steamspy_tags'] = df['steamspy_tags'].str.replace(' ', '-', regex=False)
df['genres'] = df['steamspy_tags'].str.replace(';', ' ', regex=False)

# Tally how many games carry each tag. Iterating the column directly avoids
# a label lookup (df.loc) for every row.
counts = dict()
for genre_string in df['genres']:
    for g in genre_string.split(' '):
        counts[g] = counts.get(g, 0) + 1
|
45 |
+
|
46 |
+
def create_score(row):
    """Return the share of positive ratings for one game, rounded to 2 places.

    Args:
        row: A mapping (e.g. a DataFrame row) with the keys
            ``'positive_ratings'`` and ``'negative_ratings'``.

    Returns:
        ``positive / (positive + negative)`` rounded to two decimals, i.e. a
        value between 0 and 1. When the game has no ratings at all the share
        is undefined and ``np.nan`` is returned instead of dividing by zero.
    """
    pos_count = row['positive_ratings']
    neg_count = row['negative_ratings']
    total_count = pos_count + neg_count
    if total_count == 0:
        return np.nan
    average = pos_count / total_count
    return round(average, 2)
|
52 |
+
|
53 |
+
def total_ratings(row):
    """Return the total number of ratings (positive plus negative) of a game.

    Args:
        row: A mapping (e.g. a DataFrame row) with the keys
            ``'positive_ratings'`` and ``'negative_ratings'``.

    Returns:
        The sum of the two rating counts.
    """
    return row['positive_ratings'] + row['negative_ratings']
|
58 |
+
|
59 |
+
# Per-game rating totals and positive-share score, computed row by row.
df['total_ratings'] = df.apply(total_ratings, axis=1)
df['score'] = df.apply(create_score, axis=1)

# C: mean of the 'score' column over all games.
C = df['score'].mean()
# m: 90th percentile of 'total_ratings'; both values are the default
# parameters of weighted_rating().
m = df['total_ratings'].quantile(0.90)
|
65 |
+
|
66 |
+
# Function that computes the weighted rating of each game
|
67 |
+
def weighted_rating(x, m=m, C=C):
    """Return the weighted rating of one game, rounded to 2 places.

    The game's own score is blended with the dataset-wide mean score; the
    more ratings a game has relative to ``m``, the more its own score counts.

    Args:
        x: A mapping (e.g. a DataFrame row) with the keys
            ``'total_ratings'`` and ``'score'``.
        m: Rating-count weight given to the mean score. Defaults to the
            module-level ``m`` as it was when this function was defined.
        C: Mean score to blend towards. Defaults to the module-level ``C``
            as it was when this function was defined.

    Returns:
        ``v/(v+m) * R + m/(m+v) * C`` rounded to two decimals, where ``v`` is
        the game's total ratings and ``R`` its score.
    """
    v = x['total_ratings']
    R = x['score']
    # Calculation based on the IMDB formula
    return round((v/(v+m) * R) + (m/(m+v) * C), 2)
|
72 |
+
|
73 |
+
# Define a new feature 'weighted_score' and calculate its value with `weighted_rating()`
df['weighted_score'] = df.apply(weighted_rating, axis=1)

# create an object for TfidfVectorizer and vectorise each game's tag string
# NOTE(review): the vectorizer's default token pattern appears to split on
# '-', which would break the hyphenated multi-word tags back into separate
# words (some of which are English stop words) - confirm whether a custom
# token_pattern is wanted here.
tfidf_vector = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vector.fit_transform(df['genres'])

# create the cosine similarity matrix: TfidfVectorizer L2-normalises its rows
# by default, so the plain dot product from linear_kernel is the cosine
# similarity between every pair of games.
sim_matrix = linear_kernel(tfidf_matrix, tfidf_matrix)
|
82 |
+
|
83 |
+
# create a function to find the closest title
|
84 |
+
def matching_score(a, b):
    """Return the fuzzy similarity score of two strings.

    Args:
        a: First string (the title of the current row when used via
            ``Series.apply``).
        b: Second string (the title being searched for).

    Returns:
        The value of ``fuzz.ratio(a, b)``; a higher score means the strings
        are more alike.
    """
    # fuzz.ratio(a, b) is based on the Levenshtein distance between a and b
    return fuzz.ratio(a, b)
|
87 |
+
|
88 |
+
"""# Make our Recommendation Engine
|
89 |
+
|
90 |
+
We need to combine our formatted dataset with the similarity logic to return recommendations. This is also where we can fine-tune it if we do not like the results.
|
91 |
+
"""
|
92 |
+
|
93 |
+
##These functions needed to return different attributes of the recommended game titles
|
94 |
+
|
95 |
+
#Convert index to title_year
|
96 |
+
def get_title_year_from_index(index):
    """Return the ``'year'`` value of the ``df`` row labelled ``index``.

    Uses a direct scalar lookup instead of building a boolean mask over the
    whole frame on every call. Assumes index labels are unique.

    Raises:
        KeyError: If ``index`` is not a label of ``df``.
    """
    return df.at[index, 'year']
|
98 |
+
#Convert index to title
|
99 |
+
def get_title_from_index(index):
    """Return the ``'name'`` (game title) of the ``df`` row labelled ``index``.

    Uses a direct scalar lookup instead of building a boolean mask over the
    whole frame on every call. Assumes index labels are unique.

    Raises:
        KeyError: If ``index`` is not a label of ``df``.
    """
    return df.at[index, 'name']
|
101 |
+
#Convert title to index
def get_index_from_title(title):
    """Return the index label of the first ``df`` row whose name is ``title``.

    The column is addressed as ``df['name']`` rather than ``df.name`` so the
    lookup cannot be shadowed by a DataFrame attribute.

    Raises:
        IndexError: If no row has exactly this title.
    """
    return df[df['name'] == title].index.values[0]
|
104 |
+
#Convert index to score
|
105 |
+
def get_score_from_index(index):
    """Return the ``'score'`` value of the ``df`` row labelled ``index``.

    Uses a direct scalar lookup instead of building a boolean mask over the
    whole frame on every call. Assumes index labels are unique.

    Raises:
        KeyError: If ``index`` is not a label of ``df``.
    """
    return df.at[index, 'score']
|
107 |
+
#Convert index to weighted score
|
108 |
+
def get_weighted_score_from_index(index):
    """Return the ``'weighted_score'`` value of the ``df`` row labelled ``index``.

    Uses a direct scalar lookup instead of building a boolean mask over the
    whole frame on every call. Assumes index labels are unique.

    Raises:
        KeyError: If ``index`` is not a label of ``df``.
    """
    return df.at[index, 'weighted_score']
|
110 |
+
#Convert index to total_ratings
|
111 |
+
def get_total_ratings_from_index(index):
    """Return the ``'total_ratings'`` value of the ``df`` row labelled ``index``.

    Uses a direct scalar lookup instead of building a boolean mask over the
    whole frame on every call. Assumes index labels are unique.

    Raises:
        KeyError: If ``index`` is not a label of ``df``.
    """
    return df.at[index, 'total_ratings']
|
113 |
+
#Convert index to platform
|
114 |
+
def get_platform_from_index(index):
    """Return the ``'platforms'`` value of the ``df`` row labelled ``index``.

    Uses a direct scalar lookup instead of building a boolean mask over the
    whole frame on every call. Assumes index labels are unique.

    Raises:
        KeyError: If ``index`` is not a label of ``df``.
    """
    return df.at[index, 'platforms']
|
116 |
+
|
117 |
+
# A function to return the most similar title to the words a user type
|
118 |
+
def find_closest_title(title):
    """Return the game title in ``df`` that best matches what the user typed.

    Args:
        title: Free text entered by the user.

    Returns:
        A ``(closest_title, distance_score)`` tuple: the ``'name'`` of the
        best-scoring row and its ``matching_score`` against ``title``. When
        several rows tie for the best score the first one wins.

    Raises:
        ValueError: If ``df`` has no rows to compare against.
    """
    # matching_score(a, b): a is the current row's name, b is the title
    # we are trying to match.
    leven_scores = df['name'].apply(matching_score, b=title)
    # idxmax gives the index *label* of the first best score, so the lookup
    # below stays correct even if labels are not 0..n-1, and avoids sorting
    # the whole score list just to read its first element.
    best_index = leven_scores.idxmax()
    closest_title = df.at[best_index, 'name']
    distance_score = int(leven_scores.at[best_index])
    return closest_title, distance_score
|
125 |
+
|
126 |
+
def gradio_contents_based_recommender_v2(game, how_many, sort_option, min_year, platform, min_score):
    """Recommend games similar to the title the user typed.

    The closest title in ``df`` is found with ``find_closest_title``; all
    other games are then walked from most to least similar (by
    ``sim_matrix``) and kept when they pass every filter, until ``how_many``
    games have been collected.

    Args:
        game: Free-text game title typed by the user.
        how_many: Maximum number of recommendations to return.
        sort_option: Output column to sort by, descending ('Year', 'Score',
            'Weighted Score' or 'Total Ratings'). If empty/None the rows stay
            in similarity order.
        min_year: Only games whose year is >= this value are returned.
        platform: Only games whose 'platforms' value contains this string are
            returned. If empty/None no platform filter is applied.
        min_score: Only games whose 'score' is strictly greater than this
            value are returned ('score' is a fraction between 0 and 1).

    Returns:
        A DataFrame with the columns 'Game Title', 'Year', 'Score',
        'Weighted Score' and 'Total Ratings'; empty if nothing qualifies.
    """
    columns = ['Game Title', 'Year', 'Score', 'Weighted Score', 'Total Ratings']
    #Return closest game title match
    closest_title, _distance_score = find_closest_title(game)
    #find the corresponding index of the game title
    games_index = int(get_index_from_title(closest_title))
    #Sort every game from most to least similar to the matched title
    similar_games = sorted(enumerate(sim_matrix[games_index]), key=lambda x: x[1], reverse=True)
    #Print the game title the similarity matrix is based on
    print('Here\'s the list of games similar to '+'\033[1m'+str(closest_title)+'\033[0m'+'.\n')

    # Slider widgets may deliver the count as a float.
    limit = int(how_many)
    rows = []
    for i, _similarity in similar_games:
        if len(rows) >= limit:
            break
        #Never recommend the game the search is based on
        if i == games_index:
            continue
        #Only return the games that are on selected platform
        if platform and platform not in get_platform_from_index(i):
            continue
        #Only return the games that are above the minimum score
        # (previously this filter was computed but its result was never used)
        score = get_score_from_index(i)
        if not score > min_score:
            continue
        #Only include games released same or after minimum year selected;
        # filtering before truncation keeps the result at `how_many` rows
        # whenever enough games qualify. A missing (NaN) year never passes.
        year = get_title_year_from_index(i)
        if not year >= min_year:
            continue
        rows.append({'Game Title': get_title_from_index(i), 'Year': year, 'Score': score,
                     'Weighted Score': get_weighted_score_from_index(i),
                     'Total Ratings': get_total_ratings_from_index(i),})

    # Build the frame in one go; DataFrame.append is no longer available in
    # current pandas and was quadratic when called per row.
    recomm_df = pd.DataFrame(rows, columns=columns)
    #Sort dataframe by Sort_Option provided by user
    if sort_option:
        recomm_df = recomm_df.sort_values(sort_option, ascending=False)

    return recomm_df
|
164 |
+
|
165 |
+
#Create list of unique calendar years based on main df column.
# Missing years (NaN) are dropped first: they cannot be ordered reliably by
# sorted() and cannot be converted with int() for the slider bounds.
years_sorted = sorted(df['year'].dropna().unique())

#Interface will include these buttons based on parameters in the function with a dataframe output
recommender = gr.Interface(
    gradio_contents_based_recommender_v2,
    [
        "text",                                                    # game
        gr.inputs.Slider(1, 20, step=1),                           # how_many
        gr.inputs.Radio(['Year','Score','Weighted Score','Total Ratings']),  # sort_option
        gr.inputs.Slider(int(years_sorted[0]), int(years_sorted[-1]), step=1),  # min_year
        gr.inputs.Radio(['windows','xbox','playstation','linux','mac']),     # platform
        # 'score' is a positive-rating fraction between 0 and 1, so the
        # minimum-score slider uses that same scale.
        gr.inputs.Slider(0, 1, step=0.01),                         # min_score
    ],
    "dataframe",
)

recommender.launch(debug=True)
|