yifanxie committed
Commit f3c73fd · 1 Parent(s): 902cad8

first commit

notebooks/data_exploration_v1.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
project_tools/__init__.py ADDED
File without changes
project_tools/project_class.py ADDED
File without changes
project_tools/project_config.py ADDED
@@ -0,0 +1,7 @@
+ import os
+ import sys
+ sys.path.append(os.path.dirname(os.getcwd()))
+
+ DATETIME_FORMAT1 = '%Y%m%d%H%M'
+ DATETIME_FORMAT2 = '%Y/%m/%d %H:%M'
+ DATETIME_FORMAT3 = '%Y-%m-%d'
project_tools/project_utils.py ADDED
@@ -0,0 +1,648 @@
+ import numpy as np
+ import pandas as pd
+ import random
+ import os
+ from os import listdir
+ from os.path import isfile, join, isdir
+ import cv2
+ import pickle
+ import sys
+ import time
+ import itertools
+ from contextlib import contextmanager
+ from importlib import reload
+ from shutil import copyfile, move
+ import re
+ from pathlib import Path
+
+ from project_tools import project_config, project_utils, project_class
+
+ import gc
+ import glob
+ from multiprocessing import Pool
+ from functools import partial
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ import traceback
+ import json
+ import datetime
+ from sklearn.metrics import mean_squared_error, accuracy_score
+ from scipy.stats import ks_2samp, gmean
+
+
+ def get_time_string():
+     """
+     Generate a string representation of the time at which this function is called.
+     :return: a string representing the time of the call, in YYYYmmddHHMM format
+     """
+     now = datetime.datetime.now()
+     now = str(now.strftime('%Y%m%d%H%M'))
+     return now
+
+
+ def reload_project():
+     """
+     Utility function used during experimentation to reload the project modules when required, useful for quick experiment iteration.
+     :return: None
+     """
+     reload(project_config)
+     reload(project_utils)
+     reload(project_class)
+
+
+ @contextmanager
+ def timer(name):
+     """
+     Utility timer function to check how long a piece of code takes to run.
+     :param name: name of the code fragment to be timed
+     :yield: control to the timed code block; the elapsed time is printed on exit
+     """
+     t0 = time.time()
+     print('[%s] in progress' % name)
+     yield
+     print('[%s] done in %.6f s' % (name, time.time() - t0))
+
+
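+ # Illustrative usage of timer() (not part of the original commit; 'train.csv'
+ # is a hypothetical file):
+ #   with timer('load training data'):
+ #       train_df = pd.read_csv('train.csv')
+
+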
+ def load_data(pickle_file):
+     """
+     Load pickled data from file.
+     :param pickle_file: path of the pickle file
+     :return: data stored in the pickle file
+     """
+     with open(pickle_file, 'rb') as load_file:
+         data = pickle.load(load_file)
+     return data
+
+
+ def pickle_data(path, data, protocol=-1, timestamp=False, verbose=True):
+     """
+     Pickle data to the specified file.
+     :param path: full path of the file the data will be pickled to
+     :param data: data to be pickled
+     :param protocol: pickle protocol; -1 indicates the latest protocol
+     :param timestamp: if True, a timestamp is appended to the file name
+     :param verbose: if True, print information about file creation
+     :return: None
+     """
+     file = path
+     if timestamp:
+         base_file = os.path.splitext(file)[0]
+         time_str = '_' + get_time_string()
+         ext = os.path.splitext(os.path.basename(file))[1]
+         file = base_file + time_str + ext
+
+     if verbose:
+         print('creating file %s' % file)
+
+     with open(file, 'wb') as save_file:
+         pickle.dump(data, save_file, protocol=protocol)
+
+
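+ # Illustrative round trip with pickle_data/load_data (not part of the original
+ # commit; the paths and timestamp are hypothetical):
+ #   pickle_data('cache/scores.pkl', {'fold1': 0.81}, timestamp=True)
+ #   scores = load_data('cache/scores_202001011200.pkl')
+
+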
+ def save_json(path, data, timestamp=False, verbose=True, indent=2):
+     """
+     Save data in JSON format.
+     :param path: full path of the file the data will be saved to
+     :param data: data to be saved
+     :param timestamp: if True, a timestamp is appended to the file name
+     :param verbose: if True, print information about file creation
+     :param indent: width of the indent in the resulting JSON file
+     :return: None
+     """
+     file = path
+     if timestamp:
+         base_file = os.path.splitext(file)[0]
+         time_str = '_' + get_time_string()
+         ext = os.path.splitext(os.path.basename(file))[1]
+         file = base_file + time_str + ext
+     if verbose:
+         print('creating file %s' % file)
+     with open(file, 'w') as outfile:
+         json.dump(data, outfile, indent=indent)
+
+
+ def load_json(json_file):
+     """
+     Load data from a JSON file.
+     :param json_file: path of the JSON file
+     :return: data stored in the JSON file, as a Python dictionary
+     """
+     with open(json_file) as load_file:
+         data = json.load(load_file)
+     return data
+
+
+ def create_folder(path):
+     """Create the folder at the given path, including any missing parent folders."""
+     Path(path).mkdir(parents=True, exist_ok=True)
+
+
+ def glob_folder_filelist(path, file_type='', recursive=True):
+     """
+     Utility function that walks through a given directory and returns lists of the files in it.
+     :param path: the path of the directory
+     :param file_type: if not '', only files of the specified type are considered
+     :param recursive: if True, perform the directory walk-through recursively
+     :return abs_files: a list containing the absolute path of each file in the directory
+     :return base_files: a list containing the base name of each file in the directory
+     """
+     if path[-1] != '/':
+         path = path + '/'
+     abs_files = []
+     base_files = []
+     patrn = '**' if recursive else '*'
+     glob_path = path + patrn
+     matches = glob.glob(glob_path, recursive=recursive)
+     for f in matches:
+         if os.path.isfile(f):
+             include = True
+             if len(file_type) > 0:
+                 ext = os.path.splitext(f)[1]
+                 if ext[1:] != file_type:
+                     include = False
+             if include:
+                 abs_files.append(f)
+                 base_files.append(os.path.basename(f))
+     return abs_files, base_files
+
+
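+ # Illustrative usage (not part of the original commit; 'data/' is a
+ # hypothetical folder):
+ #   abs_files, base_files = glob_folder_filelist('data/', file_type='csv')
+ #   # abs_files -> ['data/train.csv', ...], base_files -> ['train.csv', ...]
+
+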
+ def dir_compare(pathl, pathr):
+     """Return the file names present only in pathl and those present only in pathr."""
+     files_pathl = set([f for f in listdir(pathl) if isfile(join(pathl, f))])
+     files_pathr = set([f for f in listdir(pathr) if isfile(join(pathr, f))])
+     return list(files_pathl - files_pathr), list(files_pathr - files_pathl)
+
+
+ def lr_dir_sync(pathl, pathr):
+     """Copy to pathr any file that exists in pathl but not in pathr."""
+     files_lrdiff, files_rldiff = dir_compare(pathl, pathr)
+     for f in files_lrdiff:
+         src = pathl + f
+         dst = pathr + f
+         print('copying file %s' % src)
+         copyfile(src, dst)
+
+
+ def copy_file_with_time(src_file, dst_file_name, des_path):
+     """Copy src_file into des_path, inserting a timestamp into the destination file name."""
+     basename = os.path.splitext(os.path.basename(dst_file_name))[0]
+     ext_name = os.path.splitext(os.path.basename(dst_file_name))[1]
+     timestr = get_time_string()
+     des_name = '%s%s_%s%s' % (des_path, basename, timestr, ext_name)
+     copyfile(src_file, des_name)
+
+
+ def find_filesfromfolder(target_dir, containtext):
+     """Return the absolute paths of files under target_dir whose base name contains containtext."""
+     absnames, basenames = glob_folder_filelist(target_dir)
+     result_filelist = []
+     for absname, basename in zip(absnames, basenames):
+         if containtext in basename:
+             result_filelist.append(absname)
+     return result_filelist
+
+
+ def cp_files_with_prefix(src_path, dst_path, prefix, ext):
+     """Copy all files of the given extension from src_path to dst_path, prepending prefix to each name."""
+     abs_file_list, base_file_list = glob_folder_filelist(src_path, file_type=ext)
+     for src_file, base_file in zip(abs_file_list, base_file_list):
+         dst_file = dst_path + prefix + base_file
+         copyfile(src_file, dst_file)
+     return None
+
+
+ def mv_files_with_prefix(src_path, dst_path, prefix, ext):
+     """Move all files of the given extension from src_path to dst_path, prepending prefix to each name."""
+     abs_file_list, base_file_list = glob_folder_filelist(src_path, file_type=ext)
+     for src_file, base_file in zip(abs_file_list, base_file_list):
+         dst_file = dst_path + prefix + base_file
+         move(src_file, dst_file)
+     return None
+
+
+ def empty_folder(path):
+     """Delete every file directly under the given path."""
+     if path[-1] != '*':
+         path = path + '*'
+     files = glob.glob(path)
+     for f in files:
+         if os.path.isfile(f):
+             os.remove(f)
+
+
+ def rmse(y_true, y_pred):
+     """
+     RMSE (Root Mean Square Error) evaluation function.
+     :param y_true: label values
+     :param y_pred: prediction values
+     :return: RMSE value of the input prediction values, evaluated against the input label values
+     """
+     return np.sqrt(mean_squared_error(y_true, y_pred))
+
+
+ def str2date(date_str, dateformat='%Y-%m-%d'):
+     """
+     Convert an input string in the specified format into a datetime value.
+     :param date_str: the input string
+     :param dateformat: the format string used by strptime to do the type conversion
+     :return dt_value: the datetime value corresponding to the input string and the specified format
+     """
+     dt_value = datetime.datetime.strptime(date_str, dateformat)
+     return dt_value
+
+
+ def isnotebook():
+     """
+     Determine whether the current Python file is a Jupyter notebook (.ipynb) or a Python script (.py).
+     :return: True if the current Python file is a Jupyter notebook, otherwise False
+     """
+     try:
+         shell = get_ipython().__class__.__name__
+         if shell == 'ZMQInteractiveShell':
+             return True   # Jupyter notebook
+         elif shell == 'TerminalInteractiveShell':
+             return False  # Terminal running IPython
+         else:
+             return False  # Other shell type
+     except NameError:
+         return False      # Plain Python interpreter
+
+
+ def list_intersection(left, right):
+     """
+     Take two lists as input, convert them into sets, compute their intersection, and return it as a list.
+     :param left: the first input list
+     :param right: the second input list
+     :return: the intersection of the elements of both input lists, as a list
+     """
+     left_set = set(left)
+     right_set = set(right)
+     return list(left_set.intersection(right_set))
+
+
+ def list_union(left, right):
+     """
+     Take two lists as input, convert them into sets, compute their union, and return it as a list.
+     :param left: the first input list
+     :param right: the second input list
+     :return: the union of the elements of both input lists, as a list
+     """
+     left_set = set(left)
+     right_set = set(right)
+     return list(left_set.union(right_set))
+
+
+ def list_difference(left, right):
+     """
+     Take two lists as input, convert them into sets, compute the difference of the first set to the second, and return it as a list.
+     :param left: the first input list
+     :param right: the second input list
+     :return: the set difference of the first input list to the second, as a list
+     """
+     left_set = set(left)
+     right_set = set(right)
+     return list(left_set.difference(right_set))
+
+
+ def is_listelements_identical(left, right):
+     """Return True if the two lists have the same length and contain the same elements."""
+     equal_length = (len(left) == len(right))
+     zero_diff = (len(list_difference(left, right)) == 0)
+     return equal_length and zero_diff
+
+
+ def np_corr(a, b):
+     """
+     Take two numpy arrays and compute their correlation.
+     :param a: the first numpy array input
+     :param b: the second numpy array input
+     :return: the correlation between the two input arrays
+     """
+     return pd.Series(a).corr(pd.Series(b))
+
+
+ def list_sort_values(a, ascending=True):
+     """
+     Sort the values of a list in the specified order.
+     :param a: the input list
+     :param ascending: specifies whether the sorting is done in ascending or descending order
+     :return: the input list sorted in the specified order
+     """
+     return pd.Series(a).sort_values(ascending=ascending).tolist()
+
+
+ def get_rank(data):
+     """
+     Convert the values of a list or array into percentile ranks.
+     :param data: the input data in the form of a list or an array
+     :return: the percentile ranks, as a numpy array
+     """
+     ranks = pd.Series(data).rank(pct=True).values
+     return ranks
+
+
+ def plot_feature_corr(df, features, figsize=(10, 10), vmin=-1.0):
+     """
+     Plot the pair-wise correlation matrix for the specified features of a dataframe.
+     :param df: the input dataframe
+     :param features: the list of features for which the correlation matrix is plotted
+     :param figsize: the size of the displayed figure
+     :param vmin: the value anchoring the lower end of the heatmap's colormap
+     :return: the pair-wise correlation values as a pandas dataframe; the figure is plotted as a side effect
+     """
+     val_corr = df[features].corr().fillna(0)
+     f, ax = plt.subplots(figsize=figsize)
+     sns.heatmap(val_corr, vmin=vmin, square=True)
+     return val_corr
+
+
+ def decision_to_prob(data):
+     """
+     Convert the decision-function output of a sklearn classifier (e.g. a ridge classifier) into probabilities via softmax.
+     :param data: output values of the decision function, as a numpy array
+     :return: probability values, as a numpy array
+     """
+     prob = np.exp(data) / np.sum(np.exp(data))
+     return prob
+
+
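+ # Illustrative usage (not part of the original commit; the scores are made up):
+ #   decision_to_prob(np.array([1.0, 2.0]))
+ #   # -> array([0.26894142, 0.73105858]); values are positive and sum to 1
+
+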
+ def np_describe(a):
+     """
+     Provide overall descriptive statistics of an input numpy array using the describe method of pandas Series.
+     :param a: the input numpy array
+     :return: overall descriptive statistics
+     """
+     return pd.Series(a.flatten()).describe()
+
+
+ def ks_2samp_selection(train_df, test_df, pval=0.1):
+     """
+     Use the scipy ks_2samp test to keep only the features whose distributions are statistically similar between the input train and test dataframes.
+     :param train_df: the input train dataframe
+     :param test_df: the input test dataframe
+     :param pval: the p-value threshold used to decide which features to keep; only features with a p-value higher than this threshold are kept
+     :return train_df: the train dataframe with the selected features
+     :return test_df: the test dataframe with the selected features
+     """
+     list_p_value = []
+     for i in train_df.columns.tolist():
+         list_p_value.append(ks_2samp(train_df[i], test_df[i])[1])
+     Se = pd.Series(list_p_value, index=train_df.columns.tolist()).sort_values()
+     list_discarded = list(Se[Se < pval].index)
+     train_df = train_df.drop(columns=list_discarded)
+     test_df = test_df.drop(columns=list_discarded)
+     return train_df, test_df
+
+
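+ # Illustrative usage (not part of the original commit; train_df/test_df are
+ # hypothetical numeric dataframes with identical columns):
+ #   train_sel, test_sel = ks_2samp_selection(train_df, test_df, pval=0.1)
+ #   # columns whose train/test distributions differ (p < 0.1) are dropped
+
+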
+ def df_balance_sampling(df, class_feature, minor_class=1, sample_ratio=1):
+     """
+     Down-sample the majority class of a binary-labelled dataframe to balance it against the minority class.
+     :param df: the input dataframe
+     :param class_feature: the name of the binary class feature
+     :param minor_class: the value (0 or 1) identifying the minority class
+     :param sample_ratio: the ratio of majority to minority rows in the output
+     :return: a shuffled dataframe containing all minority rows and the sampled majority rows
+     """
+     minor_df = df[df[class_feature] == minor_class]
+     major_df = df[df[class_feature] == (1 - minor_class)].sample(sample_ratio * len(minor_df))
+     res_df = pd.concat([minor_df, major_df])
+     res_df = res_df.sample(len(res_df)).reset_index(drop=True)
+     return res_df
+
+
+ def prob2acc(label, probs, p=0.5):
+     """
+     Calculate the accuracy score for probability predictions at a given threshold; as part of the process, the probability predictions are converted into discrete binary predictions.
+     :param label: labels used to evaluate the accuracy score
+     :param probs: probability predictions for which the accuracy score is calculated
+     :param p: the threshold used to convert probabilities into the discrete binary values 0 and 1
+     :return acc: the computed accuracy score
+     :return preds: the predictions as discrete binary values
+     """
+     preds = (probs >= p).astype(np.uint8)
+     acc = accuracy_score(label, preds)
+     return acc, preds
+
+
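+ # Illustrative usage (not part of the original commit; the arrays are made up):
+ #   acc, preds = prob2acc(np.array([1, 0, 1]), np.array([0.9, 0.4, 0.6]), p=0.5)
+ #   # preds -> array([1, 0, 1], dtype=uint8), acc -> 1.0
+
+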
+ def np_pearson(t, p):
+     """Compute the Pearson correlation between two numpy arrays directly in numpy."""
+     vt = t - t.mean()
+     vp = p - p.mean()
+     top = np.sum(vt * vp)
+     bottom = np.sqrt(np.sum(vt ** 2)) * np.sqrt(np.sum(vp ** 2))
+     res = top / bottom
+     return res
+
+
+ def df_get_features_with_str(df, ptrn):
+     """
+     Extract the list of feature names of a dataframe that contain the specified regular expression pattern.
+     :param df: the input dataframe whose feature names are analysed
+     :param ptrn: the specified regular expression pattern
+     :return: the list of feature names that contain the specified regular expression
+     """
+     return [col for col in df.columns.tolist() if len(re.findall(ptrn, col)) > 0]
+
+
+ def df_fillna_with_other(df, src_feature, dst_feature):
+     """
+     Fill the NA values of the dst_feature column of a dataframe with the values of the src_feature column from the same rows.
+     :param df: the input dataframe
+     :param src_feature: the feature whose values are used to fill the gaps
+     :param dst_feature: the feature whose NA values are filled
+     :return: a dataframe in which the NA values of dst_feature have been filled with values from src_feature
+     """
+     src_vals = df[src_feature].values
+     dst_vals = df[dst_feature].values
+     argwhere_nan = np.argwhere(np.isnan(dst_vals)).flatten()
+     dst_vals[argwhere_nan] = src_vals[argwhere_nan]
+     df[dst_feature] = dst_vals
+     return df
+
+
+ def plot_prediction_prob(y_pred_prob):
+     """
+     Plot probability prediction values as a histogram.
+     :param y_pred_prob: the probability prediction values to be plotted
+     :return: None; the plot is displayed as a side effect
+     """
+     prob_series = pd.Series(data=y_pred_prob)
+     prob_series.name = 'prediction probability'
+     prob_series.plot(kind='hist', figsize=(15, 5), bins=50)
+     plt.show()
+     print(prob_series.describe())
+
+
+ def df_traintest_split(df, split_var, seed=None, train_ratio=0.75):
+     """
+     Perform a train/test split on a given dataframe, grouped by a specified feature, with a specified train ratio. Each unique value of the feature appears in either the resulting train or the test dataframe, never both.
+     :param df: the input dataframe to be split
+     :param split_var: the feature whose unique values are used to perform the split
+     :param seed: the random seed used for the train/test split
+     :param train_ratio: the fraction of groups placed in the resulting train dataframe
+     :return train_df: the resulting train dataframe
+     :return test_df: the resulting test dataframe
+     """
+     sv_list = df[split_var].unique().tolist()
+     train_length = int(len(sv_list) * train_ratio)
+     train_siv_list = pd.Series(df[split_var].unique()).sample(train_length, random_state=seed)
+     train_idx = df.loc[df[split_var].isin(train_siv_list)].index.values
+     test_idx = df.index.difference(train_idx).values
+     train_df = df.loc[train_idx].copy().reset_index(drop=True)
+     test_df = df.loc[test_idx].copy().reset_index(drop=True)
+     return train_df, test_df
+
+
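+ # Illustrative usage (not part of the original commit; 'user_id' is a
+ # hypothetical grouping column):
+ #   train_df, test_df = df_traintest_split(df, 'user_id', seed=42, train_ratio=0.75)
+ #   # every user_id lands entirely in train_df or entirely in test_df
+
+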
+ # https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
+ def reduce_mem_usage(df, verbose=True, exceptions=None):
+     """Iterate through all the columns of a dataframe and downcast each numeric
+     column to the smallest dtype that holds its value range, to reduce memory usage.
+     """
+     exceptions = [] if exceptions is None else exceptions
+     np_input = False
+     if isinstance(df, np.ndarray):
+         np_input = True
+         df = pd.DataFrame(data=df)
+
+     start_mem = df.memory_usage().sum() / 1024 ** 2
+     col_id = 0
+     print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
+     for col in df.columns:
+         if verbose:
+             print('doing %d: %s' % (col_id, col))
+         col_type = df[col].dtype
+         try:
+             if (col_type != object) and (col not in exceptions):
+                 c_min = df[col].min()
+                 c_max = df[col].max()
+                 if str(col_type)[:3] == 'int':
+                     if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
+                         df[col] = df[col].astype(np.int8)
+                     elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
+                         df[col] = df[col].astype(np.int16)
+                     elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
+                         df[col] = df[col].astype(np.int32)
+                     elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
+                         df[col] = df[col].astype(np.int64)
+                 else:
+                     # float16 is skipped deliberately (it was commented out in the
+                     # original source); downcast floats only as far as float32
+                     if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
+                         df[col] = df[col].astype(np.float32)
+                     else:
+                         df[col] = df[col].astype(np.float64)
+         except Exception:
+             pass
+         col_id += 1
+     end_mem = df.memory_usage().sum() / 1024 ** 2
+     print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
+     print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
+
+     if np_input:
+         return df.values
+     else:
+         return df
+
+
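+ # Illustrative usage (not part of the original commit; train_df is a
+ # hypothetical dataframe with int64/float64 columns):
+ #   train_df = reduce_mem_usage(train_df, verbose=False)
+ #   # e.g. an int64 column whose values fit in [-128, 127] becomes int8
+
+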
+ def get_xgb_featimp(model):
+     """Return a dataframe of xgboost feature importances for all importance types."""
+     imp_type = ['weight', 'gain', 'cover', 'total_gain', 'total_cover']
+     imp_dict = {}
+     try:
+         bst = model.get_booster()
+     except AttributeError:
+         bst = model
+     feature_names = bst.feature_names
+     for impt in imp_type:
+         imp_dict[impt] = []
+         scores = bst.get_score(importance_type=impt)
+         for feature in feature_names:
+             if feature in scores.keys():
+                 imp_dict[impt].append(scores[feature])
+             else:
+                 imp_dict[impt].append(np.nan)
+     imp_df = pd.DataFrame(index=bst.feature_names, data=imp_dict)
+     return imp_df
+
+
+ def get_df_rankavg(df):
+     """Rank each column as percentiles and average the ranks across columns, sorted best first."""
+     idx = df.index
+     cols = df.columns.tolist()
+     rankavg_dict = {}
+     for col in cols:
+         rankavg_dict[col] = df[col].rank(pct=True).tolist()
+     rankavg_df = pd.DataFrame(index=idx, columns=cols, data=rankavg_dict)
+     rankavg_df['rankavg'] = rankavg_df.mean(axis=1)
+     return rankavg_df.sort_values(by='rankavg', ascending=False)
+
+
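+ # Illustrative usage for rank-average blending (not part of the original
+ # commit; the model prediction columns are hypothetical):
+ #   preds_df = pd.DataFrame({'xgb': xgb_preds, 'lgb': lgb_preds})
+ #   blend = get_df_rankavg(preds_df)['rankavg']
+
+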
+ def get_list_gmean(lists):
+     """Compute the element-wise geometric mean across a list of equal-length lists."""
+     out = np.zeros((len(lists[0]), len(lists)))
+     for i in range(0, len(lists)):
+         out[:, i] = lists[i]
+     gmean_out = gmean(out, axis=1)
+     return gmean_out
+
+
+ def generate_nwise_combination(items, n=2):
+     """Return all n-wise combinations of the given items as a list of tuples."""
+     return list(itertools.combinations(items, n))
+
+
+ def pairwise_feature_generation(df, feature_list, operator='addition', verbose=True):
+     """Generate pair-wise combination features (sum, product or ratio) for the listed features of a dataframe."""
+     feats_pair = generate_nwise_combination(feature_list, 2)
+     result_df = pd.DataFrame()
+     for pair in feats_pair:
+         if verbose:
+             print('generating %s of %s and %s' % (operator, pair[0], pair[1]))
+         if operator == 'addition':
+             feat_name = pair[0] + '_add_' + pair[1]
+             result_df[feat_name] = df[pair[0]] + df[pair[1]]
+         elif operator == 'multiplication':
+             feat_name = pair[0] + '_mulp_' + pair[1]
+             result_df[feat_name] = df[pair[0]] * df[pair[1]]
+         elif operator == 'division':
+             feat_name = pair[0] + '_div_' + pair[1]
+             result_df[feat_name] = df[pair[0]] / df[pair[1]]
+     return result_df
+
+
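+ # Illustrative usage (not part of the original commit; 'x1'/'x2'/'x3' are
+ # hypothetical numeric columns):
+ #   new_feats = pairwise_feature_generation(df, ['x1', 'x2', 'x3'], operator='division')
+ #   # columns produced: x1_div_x2, x1_div_x3, x2_div_x3
+
+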
+ def try_divide(x, y, val=0.0):
+     """
+     Try to perform division between two numbers, returning a default value if the divisor is zero.
+     :param x: the number to be used as the dividend
+     :param y: the number to be used as the divisor
+     :param val: the default output value
+     :return: x / y, or the default value val if y is zero
+     """
+     if y != 0.0:
+         val = float(x) / y
+     return val