Harshi committed on
Commit
3d0c560
1 Parent(s): 21adadd

Upload marchmachinelearningmania2021.py

Browse files
Files changed (1) hide show
  1. marchmachinelearningmania2021.py +562 -0
marchmachinelearningmania2021.py ADDED
@@ -0,0 +1,562 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """MarchMachineLearningMania2021.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1FeGm3qNqLAlrQd6R9EkuNFWy9oOlnxWy
8
+ """
9
+
10
+ # Commented out IPython magic to ensure Python compatibility.
11
+ import numpy as np
12
+ import pandas as pd
13
+ import matplotlib.pyplot as plt
14
+ # %matplotlib inline
15
+ import seaborn as sns; sns.set()
16
+
17
+ from sklearn.model_selection import GroupKFold, KFold
18
+ from sklearn.metrics import log_loss
19
+ import lightgbm as lgb
20
+
21
+ from google.colab import drive
22
+
23
# Mount Google Drive so the competition CSV files are reachable from Colab.
drive.mount('/content/drive')

# Folder holding the competition data files, and the stage switch used below.
data = '/content/drive/MyDrive/MarchMachineLearningMania2021/ncaam-march-mania-2021 (1)/MDataFiles_Stage2'
STAGE_1 = False

MRSCResults = pd.read_csv(data + '/MRegularSeasonCompactResults.csv')
MRSCResults


def _games_by_location(loc, team_col, out_col):
    """Count, per (Season, team_col), the games whose WLoc equals `loc`."""
    at_loc = MRSCResults[MRSCResults.WLoc == loc]
    tally = at_loc.groupby(['Season', team_col])[team_col].count()
    return tally.to_frame().rename(columns={team_col: out_col})


# Wins split by where the winner played (A = away, N = neutral, H = home).
A_w = _games_by_location('A', 'WTeamID', 'win_A')
print(A_w.head())
N_w = _games_by_location('N', 'WTeamID', 'win_N')
H_w = _games_by_location('H', 'WTeamID', 'win_H')
win = A_w.join(N_w, how='outer').join(H_w, how='outer').fillna(0)

# Losses: WLoc describes the winner, so the loser's location is the mirror
# image (winner away -> loser at home, winner home -> loser away).
H_l = _games_by_location('A', 'LTeamID', 'lost_H')
N_l = _games_by_location('N', 'LTeamID', 'lost_N')
A_l = _games_by_location('H', 'LTeamID', 'lost_A')
lost = A_l.join(N_l, how='outer').join(H_l, how='outer').fillna(0)
print(win)
print(lost)

# Align both tables on a common (Season, TeamID) index before joining them.
win.index = win.index.rename(['Season', 'TeamID'])
lost.index = lost.index.rename(['Season', 'TeamID'])
wl = win.join(lost, how='outer').reset_index()
print(wl)

# Win percentage per location, then over all games.
for loc in ('A', 'N', 'H'):
    wl[f'win_pct_{loc}'] = wl[f'win_{loc}'] / (wl[f'win_{loc}'] + wl[f'lost_{loc}'])
total_wins = wl['win_A'] + wl['win_N'] + wl['win_H']
wl['win_pct_All'] = total_wins / (
    total_wins + wl['lost_A'] + wl['lost_N'] + wl['lost_H'])
print(wl)
del A_w, N_w, H_w, H_l, N_l, A_l, win, lost
67
+
68
# Score margin from the winner's point of view.
MRSCResults['relScore'] = MRSCResults.WScore - MRSCResults.LScore

_score_cols = ['Season', 'TeamID', 'Score', 'Loc', 'relScore']

# One row per game from the winner's perspective ...
w_scr = MRSCResults.loc[:, ['Season', 'WTeamID', 'WScore', 'WLoc', 'relScore']].copy()
w_scr.columns = _score_cols

# ... and one row per game from the loser's perspective.  WLoc describes the
# winner, so the location is mirrored (H <-> A, everything else -> N) and the
# margin changes sign.
l_scr = MRSCResults.loc[:, ['Season', 'LTeamID', 'LScore', 'WLoc', 'relScore']].copy()
l_scr['WLoc'] = l_scr.WLoc.apply(lambda x: 'H' if x == 'A' else 'A' if x == 'H' else 'N')
l_scr['relScore'] = -1 * l_scr.relScore
l_scr.columns = _score_cols

wl_scr = pd.concat([w_scr, l_scr])


def _mean_scores(frame, tag):
    """Average Score / relScore per (Season, TeamID), suffixing columns with `tag`."""
    # Columns are selected with a LIST: the original tuple form
    # (`['Score','relScore']` directly after groupby) is rejected by pandas 2.x.
    means = frame.groupby(['Season', 'TeamID'])[['Score', 'relScore']].mean()
    return means.rename(columns={"Score": f"Score_{tag}",
                                 "relScore": f"relScore_{tag}"})


A_scr = _mean_scores(wl_scr[wl_scr.Loc == 'A'], 'A')
N_scr = _mean_scores(wl_scr[wl_scr.Loc == 'N'], 'N')
H_scr = _mean_scores(wl_scr[wl_scr.Loc == 'H'], 'H')
All_scr = _mean_scores(wl_scr, 'All')

# Teams with no games at a given location get 0 for that location's averages.
scr = A_scr.join(N_scr, how='outer').join(H_scr, how='outer')\
    .join(All_scr, how='outer').fillna(0).reset_index()
print(scr)
del w_scr, l_scr, wl_scr, A_scr, H_scr, N_scr, All_scr
97
+
98
MRSDetailedResults = pd.read_csv(data + '/MRegularSeasonDetailedResults.csv')
MRSDetailedResults

# Box-score statistics kept per team; the CSV prefixes them with W (winner)
# or L (loser).
_box_stats = ['FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR',
              'Ast', 'TO', 'Stl', 'Blk', 'PF']
_detail_cols = ['Season', 'TeamID'] + _box_stats

w = MRSDetailedResults.loc[:, ['Season', 'WTeamID'] + ['W' + c for c in _box_stats]].copy()
w.columns = _detail_cols

l = MRSDetailedResults.loc[:, ['Season', 'LTeamID'] + ['L' + c for c in _box_stats]].copy()
l.columns = _detail_cols

# Stack both perspectives so every team has one row per game played.
detail = pd.concat([w, l])
detail['goal_rate'] = detail.FGM / detail.FGA
detail['3p_goal_rate'] = detail.FGM3 / detail.FGA3
detail['ft_goal_rate'] = detail.FTM / detail.FTA

# Season averages per team.  Columns are selected with a LIST: the original
# tuple form after groupby is rejected by pandas 2.x.
dt = detail.groupby(['Season', 'TeamID'])[
    _box_stats + ['goal_rate', '3p_goal_rate', 'ft_goal_rate']
].mean().fillna(0).reset_index()
print(dt)

del w, l, detail
127
+
128
MMOrdinals = pd.read_csv(data + '/MMasseyOrdinals.csv')
MMOrdinals

# Only the 'MOR' ranking system is used.
_is_mor = MMOrdinals.SystemName == 'MOR'


def _mor_snapshot(days, label):
    """Return MOR ranks published on any of `days` as `OrdinalRank_<label>`."""
    # NOTE(review): if a season has rows for BOTH days of a pair, a team appears
    # twice here and the later merges duplicate its rows - same as the original.
    rows = MMOrdinals[_is_mor & MMOrdinals.RankingDayNum.isin(days)]
    return rows[['Season', 'TeamID', 'OrdinalRank']]\
        .rename(columns={'OrdinalRank': f'OrdinalRank_{label}'})


MOR_127_128 = _mor_snapshot((127, 128), '127_128')
MOR_50_51 = _mor_snapshot((50, 51), '50_51')
MOR_15_16 = _mor_snapshot((15, 16), '15_16')

MOR = MOR_127_128.merge(MOR_50_51, how='left', on=['Season', 'TeamID'])\
    .merge(MOR_15_16, how='left', on=['Season', 'TeamID'])

## normalizing Rank values by its season maximum as it varies by seasons
# Columns are selected with a LIST: the original tuple form after groupby is
# rejected by pandas 2.x.
_rank_cols = ['OrdinalRank_127_128', 'OrdinalRank_50_51', 'OrdinalRank_15_16']
MOR_max = MOR.groupby('Season')[_rank_cols].max().reset_index()
MOR_max.columns = ['Season', 'maxRank_127_128', 'maxRank_50_51', 'maxRank_15_16']

# Per-team summary statistics of the MOR rank over ranking days before 133.
MOR_tmp = MMOrdinals[_is_mor & (MMOrdinals.RankingDayNum < 133)]
MOR_stats = MOR_tmp.groupby(['Season', 'TeamID'])['OrdinalRank']\
    .agg(['max', 'min', 'std', 'mean']).reset_index()
MOR_stats.columns = ['Season', 'TeamID', 'RankMax', 'RankMin', 'RankStd', 'RankMean']

MOR = MOR.merge(MOR_max, how='left', on='Season')\
    .merge(MOR_stats, how='left', on=['Season', 'TeamID'])

# Scale each snapshot rank by that season's maximum rank.
MOR['OrdinalRank_127_128'] = MOR['OrdinalRank_127_128'] / MOR['maxRank_127_128']
MOR['OrdinalRank_50_51'] = MOR['OrdinalRank_50_51'] / MOR['maxRank_50_51']
MOR['OrdinalRank_15_16'] = MOR['OrdinalRank_15_16'] / MOR['maxRank_15_16']

# Change in normalised rank between the earlier snapshots and the last one.
MOR['RankTrans_50_51_to_127_128'] = MOR['OrdinalRank_127_128'] \
    - MOR['OrdinalRank_50_51']
MOR['RankTrans_15_16_to_127_128'] = MOR['OrdinalRank_127_128'] \
    - MOR['OrdinalRank_15_16']
176
+
177
def _suffixed(frame, suffix):
    """Return a copy of `frame` whose non-key columns end with `suffix`.

    The join keys 'Season' and 'TeamID' keep their names so the copy can still
    be merged onto a match-up table.
    """
    renamed = frame.copy()
    renamed.columns = [
        str(name) if name in ('Season', 'TeamID') else str(name) + suffix
        for name in renamed.columns
    ]
    return renamed


# Every feature table is needed twice: once for the first team of a match-up
# (suffix _1) and once for the second team (suffix _2).
_win_pct_view = wl.loc[:, ['Season', 'TeamID', 'win_pct_A', 'win_pct_N',
                           'win_pct_H', 'win_pct_All']]
wl_1 = _suffixed(_win_pct_view, '_1')
wl_2 = _suffixed(_win_pct_view, '_2')

scr_1 = _suffixed(scr, '_1')
scr_2 = _suffixed(scr, '_2')

dt_1 = _suffixed(dt, '_1')
dt_2 = _suffixed(dt, '_2')

MOR_1 = _suffixed(MOR, '_1')
MOR_2 = _suffixed(MOR, '_2')
211
+
212
TCResults = pd.read_csv(data + '/MNCAATourneyCompactResults.csv')
TCResults

# Each tournament game becomes two training rows: winner listed first with
# result 1, and the mirrored row (loser listed first) with result 0.
tourney1 = TCResults.loc[:, ['Season', 'WTeamID', 'LTeamID']]\
    .rename(columns={'WTeamID': 'TeamID1', 'LTeamID': 'TeamID2'})\
    .assign(result=1)

tourney2 = TCResults.loc[:, ['Season', 'LTeamID', 'WTeamID']]\
    .rename(columns={'LTeamID': 'TeamID1', 'WTeamID': 'TeamID2'})\
    .assign(result=0)

print(TCResults)
print(tourney1)
print(tourney2)
tourney = pd.concat([tourney1, tourney2])
print(tourney)
del tourney1, tourney2
228
+
229
def merge_data(df):
    """Attach per-team season features to a table of match-ups.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Season', 'TeamID1' and 'TeamID2'.

    Returns
    -------
    pandas.DataFrame
        `df` left-joined with the module-level feature tables (`wl_*`, `scr_*`,
        `dt_*`, `MOR_*`; the `_1` tables are matched on TeamID1 and the `_2`
        tables on TeamID2), plus the rank-difference and `magic*` columns.
        Missing values and +/-inf are replaced by -1.
    """
    feature_pairs = ((wl_1, wl_2), (scr_1, scr_2), (dt_1, dt_2), (MOR_1, MOR_2))
    for first_team, second_team in feature_pairs:
        df = df.merge(first_team, how='left', left_on=['Season', 'TeamID1'],
                      right_on=['Season', 'TeamID'])
        df = df.merge(second_team, how='left', left_on=['Season', 'TeamID2'],
                      right_on=['Season', 'TeamID'])
        # Both right-hand tables bring their own 'TeamID' key, which pandas
        # renames to TeamID_x / TeamID_y on the second merge; neither is needed.
        df = df.drop(['TeamID_x', 'TeamID_y'], axis=1)

    rank_diff = df['OrdinalRank_127_128_1'] - df['OrdinalRank_127_128_2']
    df['OrdinalRank_127_128_diff'] = rank_diff

    # Hand-crafted interactions between the rank difference and each team's
    # mean rank: differences, products and ratios.
    df['magic1'] = rank_diff - df['RankMean_1']
    df['magic2'] = df['RankMean_1'] - df['RankMean_2']
    df['magic3'] = rank_diff - df['RankMean_2']

    df['magic11'] = rank_diff * df['RankMean_1']
    df['magic21'] = df['RankMean_1'] * df['RankMean_2']
    df['magic31'] = rank_diff * df['RankMean_2']

    df['magic12'] = rank_diff / df['RankMean_1']
    df['magic22'] = df['RankMean_1'] / df['RankMean_2']
    df['magic32'] = rank_diff / df['RankMean_2']

    df = df.fillna(-1)

    # Division by a zero mean rank yields +/-inf.  The original cleaned these
    # with chained assignment (`df[col][mask] = -1`), which is unreliable and a
    # silent no-op under pandas Copy-on-Write; replace() is the supported form.
    df = df.replace([np.inf, -np.inf], -1)

    return df
279
+
280
# Training table: tournament match-ups with features, from season 2003 on.
tourney = merge_data(tourney)
tourney = tourney[tourney.Season >= 2003].reset_index(drop=True)

# Stage 1 trains on seasons before 2015 and reads the stage-1 sample file;
# otherwise the stage-2 sample file is read.
if STAGE_1:
    tourney = tourney[tourney.Season < 2015]
    _sample_file = '/MSampleSubmissionStage1.csv'
else:
    _sample_file = '/MSampleSubmissionStage2.csv'
MSampleSubmission = pd.read_csv(data + _sample_file)

# Submission IDs look like 'SSSS_XXXX_YYYY': season, then the two team ids.
_season = MSampleSubmission.ID.str[0:4].astype(int)
_first_id = MSampleSubmission.ID.str[5:9].astype(int)
_second_id = MSampleSubmission.ID.str[10:14].astype(int)

# test1 keeps the order given in the ID; test2 is the same match-up with the
# two teams swapped.
test1 = MSampleSubmission.assign(Season=_season, TeamID1=_first_id,
                                 TeamID2=_second_id)
test2 = MSampleSubmission.assign(Season=_season, TeamID1=_second_id,
                                 TeamID2=_first_id)

test = pd.concat([test1, test2]).drop(['Pred'], axis=1)
print(test)
test = merge_data(test)
print(test)

tourney

test

# Feature matrix, target and season groups for cross-validation.
_id_cols = ['Season', 'TeamID1', 'TeamID2']
X = tourney.drop(_id_cols + ['result'], axis=1)
y = tourney["result"]
s = tourney["Season"]

X_test = test.drop(['ID'] + _id_cols, axis=1)
X_test

s.head()

s.value_counts()

len(X_test)
323
def model_training(X, y, cv, groups, params, metric, early_stopping=10,
                   plt_iter=True, X_test=None, cat_features=None):
    """Cross-validate a LightGBM classifier and optionally predict a test set.

    Parameters
    ----------
    X, y : pandas.DataFrame, pandas.Series
        Training features and binary target (indexed positionally per fold).
    cv : splitter exposing ``split(X, y, groups)``
    groups : array-like
        Group labels handed to ``cv.split``.
    params : dict
        Keyword arguments for ``lgb.LGBMClassifier``.
    metric : str
        Key of the evaluation metric to read from the fitted model.
    early_stopping : int
        Early-stopping patience passed to ``fit``.
    plt_iter : bool
        Plot the per-fold training/validation curves when true.
    X_test : pandas.DataFrame or None
        Optional test features; fold predictions are averaged.
    cat_features : unused, kept for interface compatibility.

    Returns
    -------
    feature_importance : pandas.DataFrame
        Mean importance per feature over folds, sorted descending.
    test_pred : numpy.ndarray
        Only returned (as a second value) when `X_test` is non-empty.
    """
    # None replaces the original mutable `[]` defaults; an empty list passed
    # explicitly still means "no test set".
    has_test = X_test is not None and len(X_test) > 0

    fold_importances = []
    val_scores = []
    train_evals = []
    valid_evals = []
    test_pred = np.zeros(len(X_test)) if has_test else None

    for idx, (train_index, val_index) in enumerate(cv.split(X, y, groups)):

        print("###### fold %d ######" % (idx+1))
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        model = lgb.LGBMClassifier(**params)

        # NOTE(review): `early_stopping_rounds` / `verbose` as fit() keywords
        # were removed in LightGBM 4.0 in favour of callbacks; kept as-is for
        # the LightGBM version this notebook was written against - confirm.
        model.fit(X_train, y_train,
                  eval_set=[(X_train, y_train), (X_val, y_val)],
                  early_stopping_rounds=early_stopping,
                  verbose=20
                  )
        val_scores.append(model.best_score_['valid_1'][metric])
        train_evals.append(model.evals_result_['training'][metric])
        valid_evals.append(model.evals_result_['valid_1'][metric])

        if has_test:
            test_pred = test_pred + model.predict_proba(
                X_test, num_iteration=model.best_iteration_)[:, 1]

        fold_importance = pd.DataFrame()
        fold_importance["feature"] = X_train.columns
        fold_importance["importance"] = model.feature_importances_
        fold_importance["fold"] = idx+1
        fold_importances.append(fold_importance)

    n_folds = len(val_scores)
    feature_importance = pd.concat(fold_importances, axis=0)

    if plt_iter and n_folds:
        # Size the grid to the number of folds.  The original fixed 2x2 grid
        # raised IndexError below four folds and dropped folds beyond four.
        n_cols = 2
        n_rows = (n_folds + n_cols - 1) // n_cols
        fig, axs = plt.subplots(n_rows, n_cols, figsize=(9, 3 * n_rows),
                                squeeze=False)

        for i, ax in enumerate(axs.flatten()):
            if i >= n_folds:
                ax.set_visible(False)  # odd fold count leaves one spare panel
                continue
            ax.plot(train_evals[i], label='training')
            ax.plot(valid_evals[i], label='validation')
            ax.set(xlabel='interations', ylabel=f'{metric}')
            ax.set_title(f'fold {i+1}', fontsize=12)
            ax.legend(loc='upper right', prop={'size': 9})
        fig.tight_layout()
        plt.show()

    print('### CV scores by fold ###')
    for i, score in enumerate(val_scores):
        print(f'fold {i+1}: {score:.4f}')
    print('CV mean score: {0:.4f}, std: {1:.4f}.'
          .format(np.mean(val_scores), np.std(val_scores)))

    feature_importance = feature_importance[["feature", "importance"]]\
        .groupby("feature").mean().sort_values(
            by="importance", ascending=False)
    feature_importance.reset_index(inplace=True)

    if has_test:
        test_pred = test_pred / n_folds
        return feature_importance, test_pred
    else:
        return feature_importance
390
+
391
# LightGBM hyper-parameters for the binary win/lose classifier.
lgb_params = {'objective': 'binary',
              'metric': 'binary_logloss',
              'boosting': 'gbdt',
              'num_leaves': 31,
              'feature_fraction': 0.8,
              'bagging_fraction': 0.8,
              'bagging_freq': 5,
              'learning_rate': 0.1,
              'n_estimators': 1000,
              }

N_FOLDS = 10

# The notebook export commented this whole cell out because it started with the
# `%%time` cell magic.  That left `feature_importance` (used just below) and
# `test_pred` (used when blending the submission) undefined, so the training
# call is restored here without the magic.
group_kfold = GroupKFold(n_splits=N_FOLDS)

feature_importance, test_pred = model_training(
    X, y, group_kfold, s, lgb_params, 'binary_logloss',
    plt_iter=True, X_test=X_test)

# Bar chart of the 30 most important features (mean over folds).
plt.figure(figsize=(10, 10))
sns.barplot(x="importance", y="feature", data=feature_importance[:30])
plt.title('Feature Importnace')
413
+
414
+ import warnings
415
+ warnings.filterwarnings("ignore")
416
+ import numpy as np
417
+ import pandas as pd
418
+ from sklearn.experimental import enable_hist_gradient_boosting
419
+ from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingClassifier, RandomForestClassifier
420
+ from sklearn.model_selection import KFold, GroupKFold
421
+ from sklearn.linear_model import LinearRegression, LogisticRegression
422
+ from sklearn.svm import SVC
423
+ from sklearn.metrics import log_loss
424
+ from tqdm.notebook import tqdm
425
+ import glob
426
+ import os
427
+ import gc
428
+ import xgboost as xgb
429
+
430
# Second model family: an equal-weight blend of four classifiers, validated
# with season-grouped folds.
train = tourney
test = test

xgb_params = {
    "objective": "binary:logistic",
    "max_depth": 2,
    "learning_rate": 0.1,
    "colsample_bytree": 0.8,
    "subsample": 0.8,
    "min_child_weight": 30,
    "n_jobs": 2,
    "seed": 2021,
    'tree_method': "gpu_hist",
    "gpu_id": 0,
    'predictor': 'gpu_predictor'
}

y = train["result"]
s = train["Season"]
X = train.drop(['Season', 'TeamID1', 'TeamID2', 'result'], axis=1)

X_test = test.drop(['ID', 'Season', 'TeamID1', 'TeamID2'], axis=1)

# Out-of-fold predictions for the training rows and the running test average.
train_oof = np.zeros((X.shape[0],))
test_preds = 0
train_oof.shape

NUM_FOLDS = 5
kf = GroupKFold(n_splits=NUM_FOLDS)
max_iter = 550

for f, (train_ind, val_ind) in tqdm(enumerate(kf.split(X, y, s))):
    train_df, train_target = X.iloc[train_ind], y.iloc[train_ind]
    val_df, val_target = X.iloc[val_ind], y.iloc[val_ind]
    train_df_xgb = xgb.DMatrix(train_df, label=train_target)
    val_df_xgb = xgb.DMatrix(val_df, label=val_target)

    # Build the three scikit-learn estimators, train the booster, then fit the
    # estimators - the same order as the original cell.
    model = HistGradientBoostingClassifier(max_iter=max_iter, validation_fraction=None, learning_rate=0.01, max_depth=2, min_samples_leaf=32)
    model1 = RandomForestClassifier()
    model2 = LogisticRegression(C=1)
    model3 = xgb.train(xgb_params, train_df_xgb, 1000)

    sk_models = (model, model1, model2)
    for estimator in sk_models:
        estimator.fit(train_df, train_target)

    # Equal-weight average of the four models' win probabilities.
    val_parts = [est.predict_proba(val_df)[:, 1] for est in sk_models]
    val_parts.append(model3.predict(val_df_xgb))
    temp_oof = (val_parts[0] + val_parts[1] + val_parts[2] + val_parts[3]) / 4

    test_parts = [est.predict_proba(X_test)[:, 1] for est in sk_models]
    test_parts.append(model3.predict(xgb.DMatrix(X_test)))
    temp_test = (test_parts[0] + test_parts[1] + test_parts[2] + test_parts[3]) / 4

    train_oof[val_ind] = temp_oof
    test_preds += temp_test / NUM_FOLDS

    print(log_loss(val_target, temp_oof))

print('CV', log_loss(y, train_oof))
np.save('train_oof', train_oof)
np.save('test_preds', test_preds)
489
+
490
# Reload the sample submission that matches the active stage.  The original
# always read the Stage-2 file here, which broke STAGE_1 runs because the test
# IDs built earlier come from the stage-specific file.
_sample_file = '/MSampleSubmissionStage1.csv' if STAGE_1 else '/MSampleSubmissionStage2.csv'
MSampleSubmission = pd.read_csv(data + _sample_file)


def _average_mirrored(ids, raw_pred):
    """Collapse predictions for (A, B) and the swapped (B, A) rows into one per ID.

    The second half of `raw_pred` belongs to the swapped match-ups, so it is
    converted back with ``1 - p`` before averaging per ID.  Works on a copy, so
    re-running the cell does not flip the caller's array a second time.
    """
    mirrored = np.array(raw_pred, dtype=float)
    half = mirrored.shape[0] // 2
    mirrored[half:] = 1 - mirrored[half:]
    paired = pd.DataFrame({'ID': np.asarray(ids), 'Pred': mirrored})
    return paired.groupby('ID')['Pred'].mean().reset_index()


# Predictions of the blended ensemble.
pred = _average_mirrored(test.ID, test_preds)
sub3 = MSampleSubmission.drop(['Pred'], axis=1).merge(pred, on='ID')
pred_3 = sub3['Pred']

# Recorded notebook output (bare literal in the original): 0.5539459504635523
# NOTE(review): presumably a log-loss value - confirm which run produced it.

# Predictions of the LightGBM model.
pred = _average_mirrored(test.ID, test_pred)
sub = MSampleSubmission.drop(['Pred'], axis=1).merge(pred, on='ID')

# Final blend: 30% LightGBM, 70% ensemble.  Both frames come from the same
# left table merged on ID, so their rows line up.
sub['Pred'] = sub['Pred'] * 0.3 + sub3['Pred'] * 0.7
sub.to_csv('submission.csv', index=False)
sub.head()

if STAGE_1:
    # Score the stage-1 submission against the tournament results from 2015 on.
    TCResults_s = TCResults.loc[TCResults.Season >= 2015, :]
    low_id = TCResults_s[['WTeamID', 'LTeamID']].min(axis=1)
    high_id = TCResults_s[['WTeamID', 'LTeamID']].max(axis=1)
    rslt = pd.DataFrame({
        # Same 'SSSS_low_high' convention as the submission IDs.
        'ID': TCResults_s.Season.astype(str) + '_' + low_id.astype(str)
              + '_' + high_id.astype(str),
        # 1 when the lower-numbered team won.
        'wl': (TCResults_s.WTeamID < TCResults_s.LTeamID).astype(int),
    })
    sub2 = sub.merge(rslt.loc[:, ['ID', 'wl']], how='inner', on='ID')

    # Two-column probabilities [P(0), P(1)] so log_loss does not have to infer
    # the label set.
    preds = np.column_stack([1 - sub2.Pred.values, sub2.Pred.values])

    print('Test logloss is {:.5f}'.format(log_loss(sub2.wl.values, preds)))

# Recorded notebook output (bare literal in the original): 0.51971
537
# Commented out IPython shell escape to ensure Python compatibility: a line
# starting with `!` is a SyntaxError in a plain .py file.  Install gradio with
# pip before running this as a script.
# !pip install gradio

sub

import gradio as gr
542
+
543
def prediction_result(teamID_1, teamID_2):
    """Return a sentence with the predicted 2021 win probability of `teamID_1`.

    The submission table `sub` stores one row per match-up, keyed
    '2021_<lower id>_<higher id>', whose 'Pred' is the probability that the
    lower-numbered team wins.  The pair is therefore looked up in sorted order
    and the probability is flipped when `teamID_1` is the higher id.

    Parameters
    ----------
    teamID_1, teamID_2 : number or None
        Team ids as delivered by the Gradio number inputs.

    Returns
    -------
    str
        The probability message, or a short explanation when an input is
        missing, both ids are equal, or the pair is not in `sub`.
    """
    # With live=True the callback can fire while a field is still empty.
    if teamID_1 is None or teamID_2 is None:
        return "Please enter both team IDs."

    first, second = int(teamID_1), int(teamID_2)
    if first == second:
        return "Please enter two different team IDs."

    low, high = sorted((first, second))
    match_id = f"2021_{low}_{high}"
    p = sub.loc[sub["ID"] == match_id, "Pred"].values
    if len(p) == 0:
        return f"No prediction available for teamID {first} vs teamID {second}"

    win_prob = p[0] if first == low else 1 - p[0]
    return f"The winning probability of teamID {first} is {round(win_prob * 100, 2)}%"
548
+
549
# Gradio UI: two numeric team-id inputs, one text output, refreshed live as
# the inputs change.
demo = gr.Interface(
    fn=prediction_result,
    inputs=["number", "number"],
    outputs="text",
    title="MENS MARCH MANIA 2021",
    description="""Predicted the outcome of the 2021 tournament""",
    examples=[[1101, 1104], [1101, 1111], [1101, 1116], [1101, 1124], [1101, 1140]],
    live=True
)

# share=True asks Gradio for a public link in addition to the local server.
demo.launch(share=True)

# Commented out IPython shell escape to ensure Python compatibility: a line
# starting with `!` is a SyntaxError in a plain .py file.
# !git clone https://huggingface.co/spaces/Harshi/MarchMachineLearningMania
562
+