# Bayesian hyper-parameter search for a LightGBM stacking classifier.
import warnings
import lightgbm as lgb
import numpy as np
from bayes_opt import BayesianOptimization
import argparse
def read_labels(dataset_path):
    """Read the integer ``label`` column from a tab-separated dataset file.

    The first line is treated as a header; the position of the ``label``
    column name there determines which field of every following row is
    parsed as the label.
    """
    with open(dataset_path, mode="r", encoding="utf-8") as f:
        header = f.readline().rstrip("\r\n").split("\t")
        col_index = {name: pos for pos, name in enumerate(header)}
        # Remaining lines are data rows; pull the label field from each.
        return [int(row.rstrip("\r\n").split("\t")[col_index["label"]]) for row in f]
def main():
    """Tune LightGBM hyper-parameters with Bayesian optimization.

    Loads per-model stacked feature matrices, concatenates them into one
    feature matrix, and maximizes cross-validated macro-F1 over the
    LightGBM hyper-parameter bounds below.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # Path options.
    parser.add_argument("--train_path", type=str, required=True,
                        help="Path of the trainset.")
    parser.add_argument("--train_features_path", type=str, required=True,
                        help="Path of the train features for stacking.")
    # Model options.
    parser.add_argument("--models_num", type=int, default=64,
                        help="Number of models for ensemble.")
    parser.add_argument("--folds_num", type=int, default=5,
                        help="Number of folds for cross validation.")
    parser.add_argument("--labels_num", type=int, default=2,
                        help="Number of labels.")
    # Bayesian optimization options.
    parser.add_argument("--epochs_num", type=int, default=100,
                        help="Number of epochs.")

    args = parser.parse_args()
    labels = read_labels(args.train_path)

    def lgb_cv(num_leaves, min_data_in_leaf, learning_rate, feature_fraction, lambda_l1, lambda_l2, max_depth):
        """Objective for BayesianOptimization: mean macro-F1 over CV folds."""
        # BayesianOptimization samples floats; integer parameters must be cast.
        num_leaves = int(num_leaves)
        min_data_in_leaf = int(min_data_in_leaf)
        max_depth = int(max_depth)
        param = {
            "num_leaves": num_leaves,
            "min_data_in_leaf": min_data_in_leaf,
            "learning_rate": learning_rate,
            "feature_fraction": feature_fraction,
            "lambda_l1": lambda_l1,
            "lambda_l2": lambda_l2,
            "max_depth": max_depth,
            "save_binary": True,
            "objective": "multiclass",
            "num_class": args.labels_num,
            "verbose": -1,
            "metric": "multi_error"
        }
        scores = []
        # Contiguous slicing into folds; the last fold may be smaller.
        instances_num_per_fold = len(labels) // args.folds_num + 1
        for fold_id in range(args.folds_num):
            x_train = np.concatenate((train_features[0: fold_id * instances_num_per_fold], train_features[(fold_id + 1) * instances_num_per_fold:]), axis=0)
            x_val = train_features[fold_id * instances_num_per_fold: (fold_id + 1) * instances_num_per_fold]
            y_train = labels[0: fold_id * instances_num_per_fold] + labels[(fold_id + 1) * instances_num_per_fold:]
            y_val = labels[fold_id * instances_num_per_fold: (fold_id + 1) * instances_num_per_fold]
            lgb_train = lgb.Dataset(x_train, y_train)
            lgb_eval = lgb.Dataset(x_val, y_val, reference=lgb_train)
            # NOTE(review): `verbose_eval` was removed in LightGBM 4.x; on newer
            # versions use callbacks=[lgb.log_evaluation(0)] instead — confirm
            # the pinned lightgbm version before changing.
            model = lgb.train(param, lgb_train, valid_sets=lgb_eval, verbose_eval=0)
            pred = model.predict(x_val)
            val_pred = np.argmax(pred, axis=1)
            # confusion[i, j] counts samples predicted as class i with gold class j.
            confusion = np.zeros((args.labels_num, args.labels_num))
            for i in range(len(pred)):
                confusion[val_pred[i], y_val[i]] += 1
            macro_f1 = []
            eps = 1e-9  # guards divisions when a class never occurs / is never predicted
            for i in range(args.labels_num):
                p = confusion[i, i].item() / (confusion[i, :].sum().item() + eps)  # precision
                r = confusion[i, i].item() / (confusion[:, i].sum().item() + eps)  # recall
                f1 = 2 * p * r / (p + r + eps)
                macro_f1.append(f1)
            scores.append(np.mean(macro_f1))
        return np.mean(scores)

    # Load one .npy feature block per ensemble member and concatenate along
    # the feature axis to form the stacked feature matrix used by lgb_cv.
    train_features = []
    for i in range(args.models_num):
        train_features.append(np.load(args.train_features_path + "train_features_" + str(i) + ".npy"))
    train_features = np.concatenate(train_features, axis=-1)

    # Search space for the Bayesian optimizer.
    bounds = {
        "num_leaves": (10, 100),
        "min_data_in_leaf": (10, 100),
        "learning_rate": (0.005, 0.5),
        "feature_fraction": (0.001, 0.5),
        "lambda_l1": (0, 10),
        "lambda_l2": (0, 10),
        "max_depth": (3, 200)
    }
    lgb_bo = BayesianOptimization(lgb_cv, bounds)
    with warnings.catch_warnings():
        # LightGBM / bayes_opt emit noisy convergence warnings during the search.
        warnings.filterwarnings('ignore')
        lgb_bo.maximize(n_iter=args.epochs_num)
# Run the hyper-parameter search only when executed as a script.
if __name__ == "__main__":
    main()