VISOR-GPT/train/scripts/run_lgb_cv_bayesopt.py
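
"""Hyperparameter search for a stacked LightGBM classifier.

The script loads per-model feature files (train_features_<i>.npy) for
stacking, runs K-fold cross validation with LightGBM on them, and uses
Bayesian optimization (the bayes_opt package) to maximize the mean
macro-F1 score across folds.
"""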
import argparse
import warnings

import lightgbm as lgb
import numpy as np
from bayes_opt import BayesianOptimization


def read_labels(dataset_path):
    """Read the integer label column from a tab-separated file with a header row."""
    with open(dataset_path, mode="r", encoding="utf-8") as f:
        columns, labels = {}, []
        for line_id, line in enumerate(f):
            if line_id == 0:
                # Map column names from the header line to their indices.
                for i, column_name in enumerate(line.rstrip("\r\n").split("\t")):
                    columns[column_name] = i
                continue
            line = line.rstrip("\r\n").split("\t")
            labels.append(int(line[columns["label"]]))
    return labels


def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # Path options.
    parser.add_argument("--train_path", type=str, required=True,
                        help="Path of the trainset.")
    parser.add_argument("--train_features_path", type=str, required=True,
                        help="Path of the train features for stacking.")
    # Model options.
    parser.add_argument("--models_num", type=int, default=64,
                        help="Number of models for ensemble.")
    parser.add_argument("--folds_num", type=int, default=5,
                        help="Number of folds for cross validation.")
    parser.add_argument("--labels_num", type=int, default=2,
                        help="Number of labels.")
    # Bayesian optimization options.
    parser.add_argument("--epochs_num", type=int, default=100,
                        help="Number of Bayesian optimization iterations.")

    args = parser.parse_args()

    labels = read_labels(args.train_path)
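
    # lgb_cv is the black-box objective handed to BayesianOptimization below:
    # given a proposed hyperparameter point, it trains LightGBM under K-fold
    # cross validation and returns the mean macro-F1 across folds, which the
    # optimizer tries to maximize.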
    def lgb_cv(num_leaves, min_data_in_leaf, learning_rate, feature_fraction,
               lambda_l1, lambda_l2, max_depth):
        # BayesianOptimization proposes continuous values; cast the
        # integer-valued hyperparameters before passing them to LightGBM.
        num_leaves = int(num_leaves)
        min_data_in_leaf = int(min_data_in_leaf)
        max_depth = int(max_depth)
        param = {
            "num_leaves": num_leaves,
            "min_data_in_leaf": min_data_in_leaf,
            "learning_rate": learning_rate,
            "feature_fraction": feature_fraction,
            "lambda_l1": lambda_l1,
            "lambda_l2": lambda_l2,
            "max_depth": max_depth,
            "save_binary": True,
            "objective": "multiclass",
            "num_class": args.labels_num,
            "verbose": -1,
            "metric": "multi_error"
        }
        scores = []
        instances_num_per_fold = len(labels) // args.folds_num + 1
        for fold_id in range(args.folds_num):
            # Hold out one contiguous slice as the validation fold and
            # train on the remaining instances.
            x_train = np.concatenate((train_features[0: fold_id * instances_num_per_fold],
                                      train_features[(fold_id + 1) * instances_num_per_fold:]), axis=0)
            x_val = train_features[fold_id * instances_num_per_fold: (fold_id + 1) * instances_num_per_fold]
            y_train = labels[0: fold_id * instances_num_per_fold] + labels[(fold_id + 1) * instances_num_per_fold:]
            y_val = labels[fold_id * instances_num_per_fold: (fold_id + 1) * instances_num_per_fold]
            lgb_train = lgb.Dataset(x_train, y_train)
            lgb_eval = lgb.Dataset(x_val, y_val, reference=lgb_train)
            # Note: verbose_eval was removed in LightGBM >= 4.0; with newer
            # releases pass callbacks=[lgb.log_evaluation(0)] instead.
            model = lgb.train(param, lgb_train, valid_sets=[lgb_eval], verbose_eval=0)
            pred = model.predict(x_val)
            val_pred = np.argmax(pred, axis=1)
            # Confusion matrix with rows = predicted label, columns = gold label.
            confusion = np.zeros((args.labels_num, args.labels_num))
            for i in range(len(pred)):
                confusion[val_pred[i], y_val[i]] += 1
            correct = np.sum(val_pred == y_val)  # fold accuracy count (currently unused)
            macro_f1 = []
            eps = 1e-9
            for i in range(args.labels_num):
                p = confusion[i, i].item() / (confusion[i, :].sum().item() + eps)
                r = confusion[i, i].item() / (confusion[:, i].sum().item() + eps)
                f1 = 2 * p * r / (p + r + eps)
                macro_f1.append(f1)
            scores.append(np.mean(macro_f1))
        return np.mean(scores)

    # Load the per-model feature files produced upstream for stacking and
    # concatenate them along the feature dimension.
    train_features = []
    for i in range(args.models_num):
        train_features.append(np.load(args.train_features_path + "train_features_" + str(i) + ".npy"))
    train_features = np.concatenate(train_features, axis=-1)

    # Search space explored by Bayesian optimization.
    bounds = {
        "num_leaves": (10, 100),
        "min_data_in_leaf": (10, 100),
        "learning_rate": (0.005, 0.5),
        "feature_fraction": (0.001, 0.5),
        "lambda_l1": (0, 10),
        "lambda_l2": (0, 10),
        "max_depth": (3, 200)
    }
    lgb_bo = BayesianOptimization(lgb_cv, bounds)
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        lgb_bo.maximize(n_iter=args.epochs_num)


if __name__ == "__main__":
    main()
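
# Example invocation (a sketch; the dataset and feature paths below are
# illustrative, not part of the repository's documented layout). Note that
# --train_features_path is concatenated directly with the file name, so it
# should end with a path separator:
#
#   python3 run_lgb_cv_bayesopt.py \
#       --train_path datasets/train.tsv \
#       --train_features_path features/ \
#       --models_num 64 --folds_num 5 --epochs_num 100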