# Extracted from a web file viewer: revision 7900c16, file size 4,623 bytes.
# Hosting page status at capture time: "Runtime error".
# (The viewer's line-number gutter, 1-126, was dropped; it is not part of the script.)
import warnings
import lightgbm as lgb
import numpy as np
from bayes_opt import BayesianOptimization
import argparse
def read_labels(dataset_path):
    """Read the integer labels from a tab-separated dataset file.

    The first line of the file is a header of tab-separated column names. For
    every following non-empty line, the field in the ``label`` column is
    converted with ``int`` and collected in file order.

    Args:
        dataset_path: Path of a UTF-8 encoded, tab-separated file whose first
            line is the header.

    Returns:
        List of integer labels, one per non-empty data line. An empty or
        header-only file yields an empty list.

    Raises:
        KeyError: If a data line is present but the header has no ``label``
            column.
        IndexError: If a data line has fewer fields than the position of the
            ``label`` column.
        ValueError: If a label field cannot be parsed as an integer.
    """
    labels = []
    with open(dataset_path, mode="r", encoding="utf-8") as f:
        header = f.readline().rstrip("\r\n")
        columns = {name: i for i, name in enumerate(header.split("\t"))}
        for line in f:
            line = line.rstrip("\r\n")
            # An empty line (typically a trailing newline at the end of the
            # file) carries no instance; skip it instead of failing to parse.
            if not line:
                continue
            labels.append(int(line.split("\t")[columns["label"]]))
    return labels
def _macro_f1(val_pred, y_val, labels_num):
    """Return the macro-averaged F1 score of predictions against gold labels.

    Args:
        val_pred: Sequence of predicted class ids.
        y_val: Sequence of gold class ids, aligned with ``val_pred``.
        labels_num: Number of classes; class ids are used as indices into a
            ``labels_num x labels_num`` confusion matrix.

    Returns:
        The mean of the per-class F1 scores.
    """
    # confusion[p, g] counts the instances predicted as p whose gold label is g.
    confusion = np.zeros((labels_num, labels_num))
    for predicted, gold in zip(val_pred, y_val):
        confusion[predicted, gold] += 1

    eps = 1e-9  # Keeps the ratios finite for classes that never occur.
    f1_scores = []
    for i in range(labels_num):
        hits = confusion[i, i].item()
        p = hits / (confusion[i, :].sum().item() + eps)
        r = hits / (confusion[:, i].sum().item() + eps)
        f1_scores.append(2 * p * r / (p + r + eps))
    return np.mean(f1_scores)


def _quiet_train_kwargs():
    """Return the ``lgb.train`` keyword arguments that silence evaluation logs."""
    # ``verbose_eval`` was deprecated in LightGBM 3.3 and removed in 4.0 in
    # favour of the ``log_evaluation`` callback; use whichever one the
    # installed version provides.
    if hasattr(lgb, "log_evaluation"):
        return {"callbacks": [lgb.log_evaluation(period=0)]}
    return {"verbose_eval": 0}


def main():
    """Search LightGBM hyper-parameters for stacking with Bayesian optimization.

    Reads the gold labels from ``--train_path``, loads one feature array per
    base model (``<train_features_path>train_features_<i>.npy``) and
    concatenates them along the last axis. ``BayesianOptimization`` then
    maximizes the cross-validated macro F1 of a LightGBM multiclass model
    over the hyper-parameter bounds defined below.

    Raises:
        ValueError: If there are no training instances, or if the number of
            feature rows differs from the number of labels.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--train_path", type=str, required=True,
                        help="Path of the trainset.")
    parser.add_argument("--train_features_path", type=str, required=True,
                        help="Path of the train features for stacking.")

    # Model options.
    parser.add_argument("--models_num", type=int, default=64,
                        help="Number of models for ensemble.")
    parser.add_argument("--folds_num", type=int, default=5,
                        help="Number of folds for cross validation.")
    parser.add_argument("--labels_num", type=int, default=2,
                        help="Number of labels.")

    # Bayesian optimization options.
    parser.add_argument("--epochs_num", type=int, default=100,
                        help="Number of epochs.")

    args = parser.parse_args()

    if args.folds_num < 2:
        # With a single fold the training split would be empty.
        parser.error("--folds_num must be at least 2.")

    labels = read_labels(args.train_path)
    if not labels:
        raise ValueError("No training instances found in " + args.train_path)

    # The path is used as a plain string prefix, so a directory must be given
    # with its trailing separator.
    train_features = np.concatenate(
        [
            np.load(args.train_features_path + "train_features_" + str(i) + ".npy")
            for i in range(args.models_num)
        ],
        axis=-1,
    )
    if len(train_features) != len(labels):
        raise ValueError(
            "Number of feature rows (%d) does not match number of labels (%d)."
            % (len(train_features), len(labels))
        )

    instances_num_per_fold = len(labels) // args.folds_num + 1
    train_kwargs = _quiet_train_kwargs()

    def lgb_cv(num_leaves, min_data_in_leaf, learning_rate, feature_fraction, lambda_l1, lambda_l2, max_depth):
        """Return the mean macro F1 over the folds for one hyper-parameter set.

        The optimizer passes every value as a float; the integer-valued
        LightGBM parameters are truncated with ``int`` before use.
        """
        param = {
            "num_leaves": int(num_leaves),
            "min_data_in_leaf": int(min_data_in_leaf),
            "learning_rate": learning_rate,
            "feature_fraction": feature_fraction,
            "lambda_l1": lambda_l1,
            "lambda_l2": lambda_l2,
            "max_depth": int(max_depth),
            "save_binary": True,
            "objective": "multiclass",
            "num_class": args.labels_num,
            "verbose": -1,
            "metric": "multi_error"
        }

        scores = []
        for fold_id in range(args.folds_num):
            start = fold_id * instances_num_per_fold
            end = start + instances_num_per_fold

            y_val = labels[start:end]
            if not y_val:
                # The fold size is rounded up, so on small datasets the last
                # folds can be empty; there is nothing to evaluate on them.
                continue
            x_val = train_features[start:end]
            x_train = np.concatenate((train_features[:start], train_features[end:]), axis=0)
            y_train = labels[:start] + labels[end:]

            lgb_train = lgb.Dataset(x_train, y_train)
            lgb_eval = lgb.Dataset(x_val, y_val, reference=lgb_train)
            model = lgb.train(param, lgb_train, valid_sets=[lgb_eval], **train_kwargs)

            # One row of class scores per validation instance; argmax picks
            # the predicted class id.
            val_pred = np.argmax(model.predict(x_val), axis=1)
            scores.append(_macro_f1(val_pred, y_val, args.labels_num))

        return np.mean(scores)

    # Search range (min, max) of each hyper-parameter.
    bounds = {
        "num_leaves": (10, 100),
        "min_data_in_leaf": (10, 100),
        "learning_rate": (0.005, 0.5),
        "feature_fraction": (0.001, 0.5),
        "lambda_l1": (0, 10),
        "lambda_l2": (0, 10),
        "max_depth": (3, 200)
    }

    lgb_bo = BayesianOptimization(lgb_cv, bounds)

    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
        lgb_bo.maximize(n_iter=args.epochs_num)
if __name__ == "__main__":
    # Run the hyper-parameter search only when executed as a script,
    # not when the module is imported.
    main()