Spaces:
Runtime error
Runtime error
Upload marchmachinelearningmania2021.py
Browse files- marchmachinelearningmania2021.py +562 -0
marchmachinelearningmania2021.py
ADDED
@@ -0,0 +1,562 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""MarchMachineLearningMania2021.ipynb
|
3 |
+
|
4 |
+
Automatically generated by Colaboratory.
|
5 |
+
|
6 |
+
Original file is located at
|
7 |
+
https://colab.research.google.com/drive/1FeGm3qNqLAlrQd6R9EkuNFWy9oOlnxWy
|
8 |
+
"""
|
9 |
+
|
10 |
+
# Commented out IPython magic to ensure Python compatibility.
|
11 |
+
import numpy as np
|
12 |
+
import pandas as pd
|
13 |
+
import matplotlib.pyplot as plt
|
14 |
+
# %matplotlib inline
|
15 |
+
import seaborn as sns; sns.set()
|
16 |
+
|
17 |
+
from sklearn.model_selection import GroupKFold, KFold
|
18 |
+
from sklearn.metrics import log_loss
|
19 |
+
import lightgbm as lgb
|
20 |
+
|
21 |
+
from google.colab import drive
|
22 |
+
|
23 |
+
drive.mount('/content/drive')
|
24 |
+
|
25 |
+
# Folder holding the competition CSV files on the mounted Google Drive.
data = '/content/drive/MyDrive/MarchMachineLearningMania2021/ncaam-march-mania-2021 (1)/MDataFiles_Stage2'
# Switches the later steps between the Stage 1 and Stage 2 sample submissions.
STAGE_1 = False

MRSCResults = pd.read_csv(data + '/MRegularSeasonCompactResults.csv')


def _count_games(results, winner_loc, team_col, name):
    """Count rows per (Season, team_col) among games whose WLoc equals winner_loc.

    Returns a one-column frame named `name`, indexed by (Season, team_col).
    """
    return (results[results.WLoc == winner_loc]
            .groupby(['Season', team_col])[team_col].count().to_frame()
            .rename(columns={team_col: name}))


# Wins split by the winner's location (A = away, N = neutral, H = home).
away_wins = _count_games(MRSCResults, 'A', 'WTeamID', 'win_A')
print(away_wins.head())
win = (away_wins
       .join(_count_games(MRSCResults, 'N', 'WTeamID', 'win_N'), how='outer')
       .join(_count_games(MRSCResults, 'H', 'WTeamID', 'win_H'), how='outer')
       .fillna(0))

# WLoc describes the winner, so the loser's location is the mirror image:
# a game won away ('A') was lost at home, and vice versa.
lost = (_count_games(MRSCResults, 'H', 'LTeamID', 'lost_A')
        .join(_count_games(MRSCResults, 'N', 'LTeamID', 'lost_N'), how='outer')
        .join(_count_games(MRSCResults, 'A', 'LTeamID', 'lost_H'), how='outer')
        .fillna(0))
print(win)
print(lost)

win.index = win.index.rename(['Season', 'TeamID'])
lost.index = lost.index.rename(['Season', 'TeamID'])
# A team that appears only in `win` (no losses) or only in `lost` (no wins)
# gets NaN counts from the outer join; those are genuine zeros, so fill them
# before computing rates. Otherwise an unbeaten or winless team ends up with
# NaN percentages.
wl = win.join(lost, how='outer').fillna(0).reset_index()
print(wl)

# Rates stay NaN only where a team played no game at that location (0 / 0).
for loc in ('A', 'N', 'H'):
    wl[f'win_pct_{loc}'] = wl[f'win_{loc}'] / (wl[f'win_{loc}'] + wl[f'lost_{loc}'])
total_wins = wl['win_A'] + wl['win_N'] + wl['win_H']
total_losses = wl['lost_A'] + wl['lost_N'] + wl['lost_H']
wl['win_pct_All'] = total_wins / (total_wins + total_losses)
print(wl)
del away_wins, win, lost, total_wins, total_losses
|
67 |
+
|
68 |
+
# Score margin from the winner's point of view.
MRSCResults['relScore'] = MRSCResults.WScore - MRSCResults.LScore

# One row per (game, team): first the winners ...
w_scr = MRSCResults.loc[:, ['Season', 'WTeamID', 'WScore', 'WLoc', 'relScore']].copy()
w_scr.columns = ['Season', 'TeamID', 'Score', 'Loc', 'relScore']

# ... then the losers, with location and margin flipped to the loser's view.
l_scr = MRSCResults.loc[:, ['Season', 'LTeamID', 'LScore', 'WLoc', 'relScore']].copy()
l_scr['WLoc'] = l_scr['WLoc'].map({'A': 'H', 'H': 'A'}).fillna('N')
l_scr['relScore'] = -1 * l_scr.relScore
l_scr.columns = ['Season', 'TeamID', 'Score', 'Loc', 'relScore']

wl_scr = pd.concat([w_scr, l_scr])


def _mean_scores(games, tag):
    """Average Score and relScore per (Season, TeamID), suffixing columns with `tag`."""
    # Columns are selected with a list: selecting several columns with a bare
    # tuple (['Score', 'relScore'] without the inner brackets) is rejected by
    # current pandas.
    return (games.groupby(['Season', 'TeamID'])[['Score', 'relScore']].mean()
            .rename(columns={"Score": f"Score_{tag}", "relScore": f"relScore_{tag}"}))


scr = (_mean_scores(wl_scr[wl_scr.Loc == 'A'], 'A')
       .join(_mean_scores(wl_scr[wl_scr.Loc == 'N'], 'N'), how='outer')
       .join(_mean_scores(wl_scr[wl_scr.Loc == 'H'], 'H'), how='outer')
       .join(_mean_scores(wl_scr, 'All'), how='outer')
       .fillna(0).reset_index())
print(scr)
del w_scr, l_scr, wl_scr
|
97 |
+
|
98 |
+
MRSDetailedResults = pd.read_csv(data + '/MRegularSeasonDetailedResults.csv')

# Box-score statistics that exist once with a 'W' prefix and once with an 'L' prefix.
_BOX_STATS = ['FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR',
              'Ast', 'TO', 'Stl', 'Blk', 'PF']


def _team_box_scores(results, prefix):
    """Return the `prefix`-side ('W' or 'L') box scores with prefix-free column names."""
    picked = results.loc[:, ['Season', prefix + 'TeamID']
                            + [prefix + stat for stat in _BOX_STATS]].copy()
    picked.columns = ['Season', 'TeamID'] + _BOX_STATS
    return picked


# Stack the winner and loser rows so every team-game is one row.
detail = pd.concat([_team_box_scores(MRSDetailedResults, 'W'),
                    _team_box_scores(MRSDetailedResults, 'L')])
detail['goal_rate'] = detail.FGM / detail.FGA
detail['3p_goal_rate'] = detail.FGM3 / detail.FGA3
detail['ft_goal_rate'] = detail.FTM / detail.FTA

# Per-season team averages. The column list is passed as a list because
# selecting several groupby columns with a bare tuple is rejected by current
# pandas.
dt = (detail.groupby(['Season', 'TeamID'])[
          _BOX_STATS + ['goal_rate', '3p_goal_rate', 'ft_goal_rate']]
      .mean().fillna(0).reset_index())
print(dt)

del detail
|
127 |
+
|
128 |
+
MMOrdinals = pd.read_csv(data + '/MMasseyOrdinals.csv')


def _mor_snapshot(days, column):
    """Return the 'MOR' system ranks published on any of `days`, renamed to `column`."""
    rows = MMOrdinals[(MMOrdinals.SystemName == 'MOR')
                      & MMOrdinals.RankingDayNum.isin(days)]
    return rows[['Season', 'TeamID', 'OrdinalRank']].rename(
        columns={'OrdinalRank': column})


MOR_127_128 = _mor_snapshot([127, 128], 'OrdinalRank_127_128')
MOR_50_51 = _mor_snapshot([50, 51], 'OrdinalRank_50_51')
MOR_15_16 = _mor_snapshot([15, 16], 'OrdinalRank_15_16')

MOR = MOR_127_128.merge(MOR_50_51, how='left', on=['Season', 'TeamID'])\
                 .merge(MOR_15_16, how='left', on=['Season', 'TeamID'])

## normalizing Rank values by its season maximum as it varies by seasons
# The columns are selected with a list; a bare tuple of column names is
# rejected by current pandas.
MOR_max = MOR.groupby('Season')[['OrdinalRank_127_128', 'OrdinalRank_50_51',
                                 'OrdinalRank_15_16']].max().reset_index()
MOR_max.columns = ['Season', 'maxRank_127_128', 'maxRank_50_51', 'maxRank_15_16']

# Summary statistics of every 'MOR' rank published before day 133.
MOR_tmp = MMOrdinals[(MMOrdinals.SystemName == 'MOR')
                     & (MMOrdinals.RankingDayNum < 133)]
MOR_stats = MOR_tmp.groupby(['Season', 'TeamID'])['OrdinalRank']\
                   .agg(['max', 'min', 'std', 'mean']).reset_index()
MOR_stats.columns = ['Season', 'TeamID', 'RankMax', 'RankMin', 'RankStd', 'RankMean']

MOR = MOR.merge(MOR_max, how='left', on='Season')\
         .merge(MOR_stats, how='left', on=['Season', 'TeamID'])

# Scale each snapshot by that season's largest rank.
MOR['OrdinalRank_127_128'] = MOR['OrdinalRank_127_128'] / MOR['maxRank_127_128']
MOR['OrdinalRank_50_51'] = MOR['OrdinalRank_50_51'] / MOR['maxRank_50_51']
MOR['OrdinalRank_15_16'] = MOR['OrdinalRank_15_16'] / MOR['maxRank_15_16']

# Change in the scaled rank between an earlier snapshot and the day 127/128 one.
MOR['RankTrans_50_51_to_127_128'] = MOR['OrdinalRank_127_128'] \
                                    - MOR['OrdinalRank_50_51']
MOR['RankTrans_15_16_to_127_128'] = MOR['OrdinalRank_127_128'] \
                                    - MOR['OrdinalRank_15_16']
|
176 |
+
|
177 |
+
# Columns that identify a team-season and therefore keep their plain names.
_KEY_COLS = ('Season', 'TeamID')


def _with_suffix(frame, suffix):
    """Return a copy of `frame` whose non-key columns carry `suffix`."""
    tagged = frame.copy()
    tagged.columns = [name if name in _KEY_COLS else f'{name}{suffix}'
                      for name in tagged.columns]
    return tagged


# Each feature table is duplicated: the '_1' copy is joined onto the first
# team of a matchup and the '_2' copy onto the second team.
_wl_rates = wl.loc[:, ['Season', 'TeamID', 'win_pct_A', 'win_pct_N',
                       'win_pct_H', 'win_pct_All']]
wl_1 = _with_suffix(_wl_rates, '_1')
wl_2 = _with_suffix(_wl_rates, '_2')

scr_1 = _with_suffix(scr, '_1')
scr_2 = _with_suffix(scr, '_2')

dt_1 = _with_suffix(dt, '_1')
dt_2 = _with_suffix(dt, '_2')

MOR_1 = _with_suffix(MOR, '_1')
MOR_2 = _with_suffix(MOR, '_2')
|
211 |
+
|
212 |
+
TCResults = pd.read_csv(data + '/MNCAATourneyCompactResults.csv')

# Every tournament game is listed twice: once with the winner as TeamID1
# (result 1) and once with the loser as TeamID1 (result 0).
tourney1 = (TCResults[['Season', 'WTeamID', 'LTeamID']]
            .rename(columns={'WTeamID': 'TeamID1', 'LTeamID': 'TeamID2'})
            .assign(result=1))
tourney2 = (TCResults[['Season', 'LTeamID', 'WTeamID']]
            .rename(columns={'LTeamID': 'TeamID1', 'WTeamID': 'TeamID2'})
            .assign(result=0))

print(TCResults)
print(tourney1)
print(tourney2)
tourney = pd.concat([tourney1, tourney2])
print(tourney)
del tourney1, tourney2
|
228 |
+
|
229 |
+
def merge_data(df):
    """Attach the per-team feature tables to a frame of matchups.

    `df` must contain 'Season', 'TeamID1' and 'TeamID2'. For each of the
    module-level feature tables (win rates, scores, box scores, Massey ranks)
    the '_1' copy is left-joined on (Season, TeamID1) and the '_2' copy on
    (Season, TeamID2). Rank-based interaction features ('magic*') are then
    added, and every missing or infinite value is replaced by -1.

    Returns the enriched frame; the input frame is not modified.
    """
    feature_pairs = ((wl_1, wl_2), (scr_1, scr_2), (dt_1, dt_2), (MOR_1, MOR_2))
    for first_team, second_team in feature_pairs:
        df = df.merge(first_team, how='left', left_on=['Season', 'TeamID1'],
                      right_on=['Season', 'TeamID'])
        df = df.merge(second_team, how='left', left_on=['Season', 'TeamID2'],
                      right_on=['Season', 'TeamID'])
        # Both joins bring in a 'TeamID' key column; pandas disambiguates them
        # as TeamID_x / TeamID_y and neither is needed afterwards.
        df = df.drop(['TeamID_x', 'TeamID_y'], axis=1)

    df['OrdinalRank_127_128_diff'] = df['OrdinalRank_127_128_1'] \
        - df['OrdinalRank_127_128_2']

    rank_diff = df['OrdinalRank_127_128_diff']
    mean_1, mean_2 = df['RankMean_1'], df['RankMean_2']

    df['magic1'] = rank_diff - mean_1
    df['magic2'] = mean_1 - mean_2
    df['magic3'] = rank_diff - mean_2

    df['magic11'] = rank_diff * mean_1
    df['magic21'] = mean_1 * mean_2
    df['magic31'] = rank_diff * mean_2

    df['magic12'] = rank_diff / mean_1
    df['magic22'] = mean_1 / mean_2
    df['magic32'] = rank_diff / mean_2

    df = df.fillna(-1)

    # Divisions above can yield +/-inf. Replace them in one pass instead of
    # the chained assignment df[col][mask] = -1, which pandas does not
    # guarantee writes back to df (and which is a no-op under copy-on-write).
    df = df.replace([np.inf, -np.inf], -1)

    return df
|
279 |
+
|
280 |
+
# Training rows: tournament matchups from 2003 on, enriched with team features.
tourney = merge_data(tourney)
tourney = tourney[tourney['Season'] >= 2003].reset_index(drop=True)

if STAGE_1:
    tourney = tourney[tourney['Season'] < 2015]

if STAGE_1:
    MSampleSubmission = pd.read_csv(data + '/MSampleSubmissionStage1.csv')
else:
    MSampleSubmission = pd.read_csv(data + '/MSampleSubmissionStage2.csv')


def _parse_matchups(sample, first_team, second_team):
    """Copy `sample` and add Season/TeamID1/TeamID2 parsed from fixed slices of ID."""
    parsed = sample.copy()
    parsed['Season'] = parsed['ID'].apply(lambda key: int(key[0:4]))
    parsed['TeamID1'] = parsed['ID'].apply(lambda key: int(key[first_team]))
    parsed['TeamID2'] = parsed['ID'].apply(lambda key: int(key[second_team]))
    return parsed


# Character positions of the two team ids inside an ID string.
_LEFT_TEAM, _RIGHT_TEAM = slice(5, 9), slice(10, 14)

# test1 keeps the team order of the ID, test2 swaps the two teams.
test1 = _parse_matchups(MSampleSubmission, _LEFT_TEAM, _RIGHT_TEAM)
test2 = _parse_matchups(MSampleSubmission, _RIGHT_TEAM, _LEFT_TEAM)

test = pd.concat([test1, test2]).drop(['Pred'], axis=1)
print(test)
test = merge_data(test)
print(test)

# Feature matrix, target and grouping key for cross-validation.
X = tourney.drop(columns=['Season', 'TeamID1', 'TeamID2', 'result'])
y = tourney["result"]
s = tourney["Season"]

X_test = test.drop(columns=['ID', 'Season', 'TeamID1', 'TeamID2'])
|
322 |
+
|
323 |
+
def model_training(X, y, cv, groups, params, metric, early_stopping=10,
                   plt_iter=True, X_test=None, cat_features=None):
    """Cross-validate a LightGBM classifier and optionally predict a test set.

    Parameters
    ----------
    X, y : training features and target; rows are selected with `.iloc`.
    cv : splitter providing `split(X, y, groups)`.
    groups : group labels handed to `cv.split`.
    params : keyword arguments for `lgb.LGBMClassifier`.
    metric : name of the evaluation metric to read from the fitted model.
    early_stopping : number of rounds without improvement before stopping.
    plt_iter : when true, plot the training/validation curve of every fold.
    X_test : optional test features; when given and non-empty, the fold
        models' class-1 probabilities are averaged over the folds.
    cat_features : accepted for compatibility; not used by this function.

    Returns
    -------
    `feature_importance` (mean importance per feature, sorted descending),
    or `(feature_importance, test_pred)` when test features were supplied.
    """
    # None replaces the former mutable `[]` defaults; an empty container is
    # still treated as "no test set".
    has_test = X_test is not None and len(X_test) > 0
    if has_test:
        test_pred = np.zeros(len(X_test))

    # Early stopping and periodic logging are passed as callbacks because the
    # `early_stopping_rounds` / `verbose` arguments of `fit` are no longer
    # accepted by recent LightGBM releases. Older releases expose the logging
    # callback as `print_evaluation`.
    log_callback = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

    fold_importances = []
    val_scores = []
    train_evals = []
    valid_evals = []

    for idx, (train_index, val_index) in enumerate(cv.split(X, y, groups)):

        print("###### fold %d ######" % (idx+1))
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        model = lgb.LGBMClassifier(**params)
        model.fit(X_train, y_train,
                  eval_set=[(X_train, y_train), (X_val, y_val)],
                  callbacks=[lgb.early_stopping(early_stopping),
                             log_callback(20)])

        val_scores.append(model.best_score_['valid_1'][metric])
        train_evals.append(model.evals_result_['training'][metric])
        valid_evals.append(model.evals_result_['valid_1'][metric])

        if has_test:
            test_pred = test_pred + model.predict_proba(
                X_test, num_iteration=model.best_iteration_)[:, 1]

        fold_importances.append(pd.DataFrame({
            "feature": X_train.columns,
            "importance": model.feature_importances_,
            "fold": idx + 1,
        }))

    n_folds = len(val_scores)

    if plt_iter:
        # Size the grid from the number of folds: a fixed 2x2 grid fails with
        # an IndexError for fewer than four folds and silently drops the
        # curves of any fold beyond the fourth.
        n_rows = max(1, (n_folds + 1) // 2)
        fig, axs = plt.subplots(n_rows, 2, figsize=(9, 3 * n_rows), squeeze=False)

        for i, ax in enumerate(axs.flatten()):
            if i >= n_folds:
                ax.set_visible(False)  # spare cell when the fold count is odd
                continue
            ax.plot(train_evals[i], label='training')
            ax.plot(valid_evals[i], label='validation')
            ax.set(xlabel='interations', ylabel=f'{metric}')
            ax.set_title(f'fold {i+1}', fontsize=12)
            ax.legend(loc='upper right', prop={'size': 9})
        fig.tight_layout()
        plt.show()

    print('### CV scores by fold ###')
    for i, score in enumerate(val_scores):
        print(f'fold {i+1}: {score:.4f}')
    print('CV mean score: {0:.4f}, std: {1:.4f}.'
          .format(np.mean(val_scores), np.std(val_scores)))

    feature_importance = pd.concat(fold_importances, axis=0)
    feature_importance = feature_importance[["feature", "importance"]]\
        .groupby("feature").mean().sort_values(by="importance", ascending=False)
    feature_importance.reset_index(inplace=True)

    if has_test:
        test_pred = test_pred / n_folds
        return feature_importance, test_pred
    else:
        return feature_importance
|
390 |
+
|
391 |
+
# Settings forwarded to lgb.LGBMClassifier by model_training.
lgb_params = {'objective': 'binary',
              'metric': 'binary_logloss',
              'boosting': 'gbdt',
              'num_leaves': 31,
              'feature_fraction': 0.8,
              'bagging_fraction': 0.8,
              'bagging_freq': 5,
              'learning_rate': 0.1,
              'n_estimators': 1000,
              }

N_FOLDS = 10

# The notebook export commented out this whole cell because it began with the
# `%%time` cell magic. Only the magic has to go: without the call below,
# `feature_importance` (plotted next) and `test_pred` (blended into the
# submission further down) are never defined and the script stops with a
# NameError.
group_kfold = GroupKFold(n_splits=N_FOLDS)

feature_importance, test_pred = model_training(X, y, group_kfold, s, lgb_params, 'binary_logloss', plt_iter = True, X_test = X_test)

# Bar chart of the 30 features with the highest mean importance.
plt.figure(figsize=(10, 10))
sns.barplot(x="importance", y="feature", data=feature_importance[:30])
plt.title('Feature Importnace')
|
413 |
+
|
414 |
+
import warnings
|
415 |
+
warnings.filterwarnings("ignore")
|
416 |
+
import numpy as np
|
417 |
+
import pandas as pd
|
418 |
+
from sklearn.experimental import enable_hist_gradient_boosting
|
419 |
+
from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingClassifier, RandomForestClassifier
|
420 |
+
from sklearn.model_selection import KFold, GroupKFold
|
421 |
+
from sklearn.linear_model import LinearRegression, LogisticRegression
|
422 |
+
from sklearn.svm import SVC
|
423 |
+
from sklearn.metrics import log_loss
|
424 |
+
from tqdm.notebook import tqdm
|
425 |
+
import glob
|
426 |
+
import os
|
427 |
+
import gc
|
428 |
+
import xgboost as xgb
|
429 |
+
|
430 |
+
train = tourney

# Booster settings handed to xgb.train inside the cross-validation loop.
xgb_params = dict(
    objective="binary:logistic",
    max_depth=2,
    learning_rate=0.1,
    colsample_bytree=0.8,
    subsample=0.8,
    min_child_weight=30,
    n_jobs=2,
    seed=2021,
    tree_method="gpu_hist",
    gpu_id=0,
    predictor='gpu_predictor',
)

_MATCHUP_COLS = ['Season', 'TeamID1', 'TeamID2']
X = train.drop(_MATCHUP_COLS + ['result'], axis=1)
y = train["result"]
s = train["Season"]

X_test = test.drop(['ID'] + _MATCHUP_COLS, axis=1)

# Out-of-fold predictions for the training rows and the running test average.
train_oof = np.zeros(len(X))
test_preds = 0

NUM_FOLDS = 5
max_iter = 550
kf = GroupKFold(n_splits=NUM_FOLDS)

for f, (train_ind, val_ind) in tqdm(enumerate(kf.split(X, y, s))):
    train_df, train_target = X.iloc[train_ind], y.iloc[train_ind]
    val_df, val_target = X.iloc[val_ind], y.iloc[val_ind]
    train_df_xgb = xgb.DMatrix(train_df, label=train_target)
    val_df_xgb = xgb.DMatrix(val_df, label=val_target)

    # Four models per fold: the XGBoost booster is trained first, then the
    # three scikit-learn estimators are fitted on the same rows.
    model3 = xgb.train(xgb_params, train_df_xgb, 1000)
    model = HistGradientBoostingClassifier(
        max_iter=max_iter, validation_fraction=None, learning_rate=0.01,
        max_depth=2, min_samples_leaf=32,
    ).fit(train_df, train_target)
    model1 = RandomForestClassifier().fit(train_df, train_target)
    model2 = LogisticRegression(C=1).fit(train_df, train_target)
    sklearn_models = (model, model1, model2)

    # Plain average of the four class-1 probabilities.
    temp_oof = (sum(m.predict_proba(val_df)[:, 1] for m in sklearn_models)
                + model3.predict(val_df_xgb)) / 4
    temp_test = (sum(m.predict_proba(X_test)[:, 1] for m in sklearn_models)
                 + model3.predict(xgb.DMatrix(X_test))) / 4

    train_oof[val_ind] = temp_oof
    test_preds += temp_test / NUM_FOLDS

    print(log_loss(val_target, temp_oof))

print('CV', log_loss(y, train_oof))
np.save('train_oof', train_oof)
np.save('test_preds', test_preds)
|
489 |
+
|
490 |
+
# Reload the sample submission that matches the stage `test` was built from.
# Reading the Stage 2 file unconditionally leaves no matching IDs when
# STAGE_1 is enabled.
if STAGE_1:
    MSampleSubmission = pd.read_csv(data + '/MSampleSubmissionStage1.csv')
else:
    MSampleSubmission = pd.read_csv(data + '/MSampleSubmissionStage2.csv')


def _average_mirrored(ids, preds):
    """Collapse predictions for a test set that lists every ID twice.

    The first half of `preds` belongs to the matchups in ID order, the second
    half to the same matchups with the two teams swapped. The second half is
    converted with 1 - p so both halves describe the same outcome, then the
    two values per ID are averaged. Works on a copy, so `preds` is unchanged.

    Returns a frame with columns 'ID' and 'Pred'.
    """
    values = np.array(preds, dtype=float)
    half = values.shape[0] // 2
    values[half:] = 1 - values[half:]
    paired = pd.DataFrame({'ID': np.asarray(ids), 'Pred': values})
    return paired.groupby('ID', as_index=False)['Pred'].mean()


# Ensemble predictions accumulated in `test_preds`.
pred = _average_mirrored(test.ID, test_preds)
sub3 = MSampleSubmission.drop(['Pred'], axis=1).merge(pred, on='ID')
pred_3 = sub3['Pred']

# A bare 0.5539459504635523 stood here: a value pasted from the notebook
# output (presumably a log-loss score), kept only as this note.

# LightGBM predictions; `test_pred` is the second value returned by
# model_training when it is given X_test.
pred = _average_mirrored(test.ID, test_pred)
sub = MSampleSubmission.drop(['Pred'], axis=1).merge(pred, on='ID')

# Final blend: 30% LightGBM, 70% ensemble.
sub['Pred'] = sub['Pred'] * 0.3 + sub3['Pred'] * 0.7
sub.to_csv('submission.csv', index=False)
|
512 |
+
|
513 |
+
# Stage 1 only: score the submission against the tournament results from 2015 on.
if STAGE_1:
    TCResults_s = TCResults[TCResults['Season'] >= 2015]
    winners, losers = TCResults_s['WTeamID'], TCResults_s['LTeamID']

    rslt = pd.DataFrame()
    rslt['season'] = TCResults_s['Season']
    # Submission IDs list the smaller team id first.
    rslt['team1'] = winners.where(winners < losers, losers)
    rslt['team2'] = winners.where(winners > losers, losers)
    # 1 when the first-listed (smaller id) team won the game.
    rslt['wl'] = (winners < losers).astype(int)
    rslt['ID'] = (rslt['season'].astype(str) + '_' + rslt['team1'].astype(str)
                  + '_' + rslt['team2'].astype(str))
    sub2 = sub.merge(rslt[['ID', 'wl']], how='inner', on='ID')

    # log_loss receives one [P(class 0), P(class 1)] pair per game.
    preds = [[1 - p, p] for p in sub2.Pred]

    print('Test logloss is {:.5f}'.format(log_loss(sub2.wl.values, preds)))
|
534 |
+
|
535 |
+
0.51971
|
536 |
+
|
537 |
+
# Commented out IPython magic to ensure Python compatibility.
# (A leading `!` is notebook shell syntax and a SyntaxError in a .py file;
# install gradio through the environment's requirements instead.)
# !pip install gradio

import gradio as gr
|
542 |
+
|
543 |
+
def prediction_result(teamID_1, teamID_2):
    """Return a sentence with the predicted 2021 win probability of teamID_1.

    The probability is looked up in the module-level `sub` frame, whose 'ID'
    values have the form '2021_<team>_<team>' with the smaller team id first
    and whose 'Pred' is the probability that the first-listed team wins. The
    teams may therefore be given in either order: when teamID_1 is the larger
    id, the stored probability is inverted.

    Returns a message instead of raising when an input is missing or the
    pairing has no prediction.
    """
    # The live interface can call this before both numbers are filled in.
    if teamID_1 is None or teamID_2 is None:
        return "Please enter both team IDs."

    team_1, team_2 = int(teamID_1), int(teamID_2)
    low, high = sorted((team_1, team_2))
    matchup_id = f"2021_{low}_{high}"

    stored = sub.loc[sub["ID"] == matchup_id, "Pred"].values
    if len(stored) == 0:
        return f"No prediction available for teamID {team_1} vs teamID {team_2}"

    # `stored` is the chance of the smaller id winning; flip it if needed.
    p = stored[0] if team_1 == low else 1 - stored[0]
    return f"The winning probability of teamID {team_1} is {round(p * 100, 2)}%"
|
548 |
+
|
549 |
+
# Two numeric inputs (team ids) mapped to the text returned by prediction_result.
demo = gr.Interface(
    fn = prediction_result,
    inputs = ["number", "number"],
    outputs = "text",
    title = "MENS MARCH MANIA 2021",
    description = """Predicted the outcome of the 2021 tournament""",
    examples = [[1101, 1104], [1101, 1111], [1101, 1116], [1101, 1124], [1101, 1140]],
    live = True
)

demo.launch(share = True)

# Commented out IPython magic to ensure Python compatibility.
# (A leading `!` is notebook shell syntax and a SyntaxError in a .py file.)
# !git clone https://huggingface.co/spaces/Harshi/MarchMachineLearningMania
|
562 |
+
|