# Dataframe
import pandas as pd
import numpy as np
import sys
# Visualization
import matplotlib.pylab as plt
import seaborn as sns
# Gradient boosting
import lightgbm as lgb
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
# Sklearn
from sklearn.model_selection import GroupKFold, train_test_split, StratifiedKFold
from sklearn.metrics import (
roc_auc_score,
classification_report,
confusion_matrix,
accuracy_score,
)
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.decomposition import PCA
## Oversampling
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
## Kaggler
import kaggler
from kaggler.model import AutoLGB
from kaggler.preprocessing import DAE, TargetEncoder, LabelEncoder
## CTGAN
from ctgan import CTGANSynthesizer
## TensorFlow
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping
## LightAutoML
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.dataset.roles import DatetimeRole
from lightautoml.tasks import Task
from lightautoml.utils.profiler import Profiler
## EVALML
from evalml.automl import AutoMLSearch
## FLAML
from flaml import AutoML
import torch
# Supressing Warnings
import warnings
warnings.filterwarnings("ignore")
INPUT_DIR = "Data"
bcell = pd.read_csv(f"{INPUT_DIR}/input_bcell.csv")
covid = pd.read_csv(f"{INPUT_DIR}/input_covid.csv")
sars = pd.read_csv(f"{INPUT_DIR}/input_sars.csv")
bcell_sars = pd.concat([bcell, sars], axis=0, ignore_index=True)
bcell_sars.head()
bcell_sars.target.value_counts()
idx_train = bcell_sars["target"].astype("bool").values
fig, axes = plt.subplots(2, 2, figsize=(16, 8))
axes = [x for a in axes for x in a]
for i, name in enumerate(["chou_fasman", "emini", "kolaskar_tongaonkar", "parker"]):
value = bcell_sars[name]
sns.distplot(value[~idx_train], ax=axes[i])
sns.distplot(value[idx_train], ax=axes[i])
axes[i].set_xlabel(name, fontsize=12)
fig.legend(labels=["target 0", "target 1"], loc="right", fontsize=12)
clf = PCA(n_components=2)
z = clf.fit_transform(
bcell_sars[["chou_fasman", "emini", "kolaskar_tongaonkar", "parker"]]
)
plt.figure(figsize=(8, 6))
plt.scatter(*z[idx_train].T, s=3)
plt.scatter(*z[~idx_train].T, s=3)
plt.legend(labels=["target_1", "target_0"], fontsize=12)
plt.show()
idx_train = bcell_sars["target"].astype("bool").values
fig, axes = plt.subplots(2, 2, figsize=(16, 8))
axes = [x for a in axes for x in a]
for i, name in enumerate(
["isoelectric_point", "aromaticity", "hydrophobicity", "stability"]
):
value = bcell_sars[name]
sns.distplot(value[~idx_train], ax=axes[i])
sns.distplot(value[idx_train], ax=axes[i])
axes[i].set_xlabel(name, fontsize=12)
fig.legend(labels=["target 0", "target 1"], loc="right", fontsize=12)
clf = PCA(n_components=2)
z = clf.fit_transform(
bcell_sars[["isoelectric_point", "aromaticity", "hydrophobicity", "stability"]]
)
plt.figure(figsize=(8, 6))
plt.scatter(*z[idx_train].T, s=3)
plt.scatter(*z[~idx_train].T, s=3)
plt.legend(labels=["target_1", "target_0"], fontsize=12)
plt.show()
# create length columns
for df in [bcell, sars, covid, bcell_sars]:
df["length"] = df["end_position"] - df["start_position"] + 1
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(bcell_sars["length"], ax=ax, color="lightblue")
sns.countplot(bcell_sars.query("target == 1")["length"], ax=ax, color="coral")
plt.legend(labels=["target 0", "target 1"], fontsize=12)
plt.show()
# Corelation Matrix
corr_matrix = bcell_sars[
[
"parent_protein_id",
"protein_seq",
"start_position",
"end_position",
"peptide_seq",
"chou_fasman",
"emini",
"kolaskar_tongaonkar",
"parker",
"isoelectric_point",
"aromaticity",
"hydrophobicity",
"stability",
"target",
]
].corr()
mask = np.zeros_like(corr_matrix, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
#corr heatmap
sns.set_style('whitegrid')
f, ax = plt.subplots(figsize=(11, 15))
heatmap = sns.heatmap(corr_matrix,
mask = mask,
square = True,
linewidths = .5,
cmap = 'coolwarm',
cbar_kws = {'shrink': .4,
'ticks' : [-1, -.5, 0, 0.5, 1]},
vmin = -1,
vmax = 1,
annot = False,
annot_kws = {'size': 12})
#add the column names as labels
ax.set_yticklabels(corr_matrix.columns, rotation = 0)
ax.set_xticklabels(corr_matrix.columns)
sns.set_style({'xtick.bottom': True}, {'ytick.left': True})
X = bcell_sars.drop(
["target", "parent_protein_id", "protein_seq", "peptide_seq"], axis=1
)
y = bcell_sars["target"]
X_train, X_valid, Y_train, Y_valid = train_test_split(
X, y, test_size=0.2, random_state=0
)
d = MinMaxScaler()
d.fit_transform(X_train, Y_train)
d.transform(X_valid)
l = LGBMClassifier(random_state=10)
l.fit(X_train, Y_train)
lg_pred = l.predict(X_valid)
print("AUC score :", roc_auc_score(lg_pred, Y_valid))
AUC score : 0.8165608189066452
cat = CatBoostClassifier(random_state=10, verbose=False)
cat.fit(X_train, Y_train)
cat_pred = cat.predict(X_valid)
print("AUC score :", roc_auc_score(cat_pred, Y_valid))
AUC score : 0.8265375692994856
XG = XGBClassifier(random_state=10)
XG.fit(X_train, Y_train)
XG_pred = XG.predict(X_valid)
print("AUC score :", roc_auc_score(XG_pred, Y_valid))
AUC score : 0.8279276868050317
RF = RandomForestClassifier(random_state=10)
RF.fit(X_train, Y_train)
RF_pred = RF.predict(X_valid)
print("AUC score :", roc_auc_score(RF_pred, Y_valid))
AUC score : 0.847563337028536
NB = GaussianNB()
NB.fit(X_train, Y_train)
NB_pred = NB.predict(X_valid)
print("AUC score :", roc_auc_score(NB_pred, Y_valid))
AUC score : 0.6110355253212397
# calculating feature importance
forest_clf = ExtraTreesClassifier(n_estimators=250, random_state=420)
forest_clf.fit(X, y)
imp_features = forest_clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest_clf.estimators_], axis=0)
plt.figure(figsize=(15, 8))
plt.bar(X.columns, std, color="black")
plt.xlabel("Feature Labels")
plt.ylabel("Feature Importances")
plt.title("Comparison of different Feature Importances")
plt.show()
feature_importance = pd.DataFrame()
feature_importance["importance"] = [
268,
154,
204,
221,
180,
244,
335,
355,
354,
374,
311,
]
feature_importance["feature"] = [
"start_position",
"end_position",
"chou_fasman",
"emini",
"kolaskar_tongaonkar",
"parker",
"isoelectric_point",
"aromaticity",
"hydrophobicity",
"stability",
"length",
]
sns.barplot(
x="importance",
y="feature",
data=feature_importance.sort_values(by="importance", ascending=False),
)
plt.title("LGBM Features")
# train_test split
X = bcell_sars.drop(
["target", "parent_protein_id", "protein_seq", "peptide_seq"], axis=1
)
y = bcell_sars["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
# Feature Scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
model = Sequential()
model.add(Dense(units=128, activation="relu"))
model.add(Dropout(0.3))
model.add(Dense(units=64, activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(units=32, activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(units=16, activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(units=1, activation="sigmoid"))
model.compile(
loss="binary_crossentropy",
optimizer="adam",
metrics=[tf.keras.metrics.BinaryAccuracy(), tf.keras.metrics.AUC()],
)
# Early stopping
early_stop = EarlyStopping(monitor="val_loss", mode="min", verbose=1, patience=25)
model.fit(x=X_train,
y=y_train,
epochs=150,
validation_data=(X_test, y_test), verbose=1,
callbacks=[early_stop]
);
Epoch 1/150
373/373 [==============================] - 3s 4ms/step - loss: 0.5994 - binary_accuracy: 0.7174 - auc: 0.5635 - val_loss: 0.5412 - val_binary_accuracy: 0.7287 - val_auc: 0.6960
Epoch 2/150
373/373 [==============================] - 1s 3ms/step - loss: 0.5568 - binary_accuracy: 0.7296 - auc: 0.6595 - val_loss: 0.5283 - val_binary_accuracy: 0.7337 - val_auc: 0.7198
Epoch 3/150
373/373 [==============================] - 1s 3ms/step - loss: 0.5528 - binary_accuracy: 0.7270 - auc: 0.6718 - val_loss: 0.5269 - val_binary_accuracy: 0.7388 - val_auc: 0.7268
Epoch 4/150
373/373 [==============================] - 1s 3ms/step - loss: 0.5336 - binary_accuracy: 0.7384 - auc: 0.6994 - val_loss: 0.5104 - val_binary_accuracy: 0.7435 - val_auc: 0.7481
Epoch 5/150
373/373 [==============================] - 1s 3ms/step - loss: 0.5296 - binary_accuracy: 0.7436 - auc: 0.7153 - val_loss: 0.4968 - val_binary_accuracy: 0.7599 - val_auc: 0.7635
Epoch 6/150
373/373 [==============================] - 1s 3ms/step - loss: 0.5165 - binary_accuracy: 0.7494 - auc: 0.7339 - val_loss: 0.4804 - val_binary_accuracy: 0.7780 - val_auc: 0.7851
Epoch 7/150
373/373 [==============================] - 1s 3ms/step - loss: 0.5021 - binary_accuracy: 0.7604 - auc: 0.7578 - val_loss: 0.4723 - val_binary_accuracy: 0.7850 - val_auc: 0.7942
Epoch 8/150
373/373 [==============================] - 1s 3ms/step - loss: 0.4845 - binary_accuracy: 0.7687 - auc: 0.7753 - val_loss: 0.4687 - val_binary_accuracy: 0.7824 - val_auc: 0.7985
Epoch 9/150
373/373 [==============================] - 1s 3ms/step - loss: 0.4808 - binary_accuracy: 0.7712 - auc: 0.7782 - val_loss: 0.4525 - val_binary_accuracy: 0.7847 - val_auc: 0.8177
Epoch 10/150
373/373 [==============================] - 1s 3ms/step - loss: 0.4857 - binary_accuracy: 0.7692 - auc: 0.7840 - val_loss: 0.4489 - val_binary_accuracy: 0.7874 - val_auc: 0.8185
Epoch 11/150
373/373 [==============================] - 1s 3ms/step - loss: 0.4599 - binary_accuracy: 0.7860 - auc: 0.8047 - val_loss: 0.4427 - val_binary_accuracy: 0.7914 - val_auc: 0.8283
Epoch 12/150
373/373 [==============================] - 1s 3ms/step - loss: 0.4694 - binary_accuracy: 0.7747 - auc: 0.8026 - val_loss: 0.4378 - val_binary_accuracy: 0.7988 - val_auc: 0.8296
Epoch 13/150
373/373 [==============================] - 1s 3ms/step - loss: 0.4767 - binary_accuracy: 0.7768 - auc: 0.7945 - val_loss: 0.4348 - val_binary_accuracy: 0.7941 - val_auc: 0.8332
Epoch 14/150
373/373 [==============================] - 1s 3ms/step - loss: 0.4500 - binary_accuracy: 0.7883 - auc: 0.8135 - val_loss: 0.4274 - val_binary_accuracy: 0.8005 - val_auc: 0.8404
Epoch 15/150
373/373 [==============================] - 1s 3ms/step - loss: 0.4511 - binary_accuracy: 0.7842 - auc: 0.8175 - val_loss: 0.4284 - val_binary_accuracy: 0.7928 - val_auc: 0.8403
Epoch 16/150
373/373 [==============================] - 1s 3ms/step - loss: 0.4466 - binary_accuracy: 0.7855 - auc: 0.8241 - val_loss: 0.4214 - val_binary_accuracy: 0.7971 - val_auc: 0.8456
Epoch 17/150
373/373 [==============================] - 1s 3ms/step - loss: 0.4291 - binary_accuracy: 0.7983 - auc: 0.8350 - val_loss: 0.4208 - val_binary_accuracy: 0.8042 - val_auc: 0.8476
Epoch 18/150
373/373 [==============================] - 1s 3ms/step - loss: 0.4387 - binary_accuracy: 0.7904 - auc: 0.8274 - val_loss: 0.4132 - val_binary_accuracy: 0.8015 - val_auc: 0.8534
Epoch 19/150
373/373 [==============================] - 1s 3ms/step - loss: 0.4399 - binary_accuracy: 0.7960 - auc: 0.8273 - val_loss: 0.4119 - val_binary_accuracy: 0.8068 - val_auc: 0.8536
Epoch 20/150
373/373 [==============================] - 1s 3ms/step - loss: 0.4460 - binary_accuracy: 0.7871 - auc: 0.8244 - val_loss: 0.4114 - val_binary_accuracy: 0.8045 - val_auc: 0.8554
Epoch 21/150
373/373 [==============================] - 1s 3ms/step - loss: 0.4284 - binary_accuracy: 0.8003 - auc: 0.8419 - val_loss: 0.4115 - val_binary_accuracy: 0.8139 - val_auc: 0.8567
Epoch 22/150
373/373 [==============================] - 1s 3ms/step - loss: 0.4192 - binary_accuracy: 0.7994 - auc: 0.8462 - val_loss: 0.4057 - val_binary_accuracy: 0.8078 - val_auc: 0.8593
Epoch 23/150
373/373 [==============================] - 1s 3ms/step - loss: 0.4168 - binary_accuracy: 0.8099 - auc: 0.8477 - val_loss: 0.3997 - val_binary_accuracy: 0.8109 - val_auc: 0.8652
Epoch 24/150
373/373 [==============================] - 1s 3ms/step - loss: 0.4152 - binary_accuracy: 0.8083 - auc: 0.8492 - val_loss: 0.4027 - val_binary_accuracy: 0.8125 - val_auc: 0.8626
Epoch 25/150
373/373 [==============================] - 1s 3ms/step - loss: 0.4173 - binary_accuracy: 0.8073 - auc: 0.8439 - val_loss: 0.3954 - val_binary_accuracy: 0.8115 - val_auc: 0.8676
Epoch 26/150
373/373 [==============================] - 1s 3ms/step - loss: 0.4079 - binary_accuracy: 0.8110 - auc: 0.8532 - val_loss: 0.3996 - val_binary_accuracy: 0.8129 - val_auc: 0.8672
Epoch 27/150
373/373 [==============================] - 1s 3ms/step - loss: 0.4054 - binary_accuracy: 0.8101 - auc: 0.8560 - val_loss: 0.3968 - val_binary_accuracy: 0.8115 - val_auc: 0.8672
Epoch 28/150
373/373 [==============================] - 1s 3ms/step - loss: 0.4067 - binary_accuracy: 0.8153 - auc: 0.8577 - val_loss: 0.3939 - val_binary_accuracy: 0.8186 - val_auc: 0.8700
Epoch 29/150
373/373 [==============================] - 1s 3ms/step - loss: 0.4064 - binary_accuracy: 0.8121 - auc: 0.8585 - val_loss: 0.3857 - val_binary_accuracy: 0.8243 - val_auc: 0.8751
Epoch 30/150
373/373 [==============================] - 1s 3ms/step - loss: 0.4041 - binary_accuracy: 0.8136 - auc: 0.8616 - val_loss: 0.3882 - val_binary_accuracy: 0.8229 - val_auc: 0.8732
Epoch 31/150
373/373 [==============================] - 1s 3ms/step - loss: 0.4039 - binary_accuracy: 0.8100 - auc: 0.8595 - val_loss: 0.3873 - val_binary_accuracy: 0.8223 - val_auc: 0.8737
Epoch 32/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3998 - binary_accuracy: 0.8147 - auc: 0.8549 - val_loss: 0.3831 - val_binary_accuracy: 0.8186 - val_auc: 0.8768
Epoch 33/150
373/373 [==============================] - 1s 3ms/step - loss: 0.4012 - binary_accuracy: 0.8143 - auc: 0.8642 - val_loss: 0.3860 - val_binary_accuracy: 0.8186 - val_auc: 0.8801
Epoch 34/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3973 - binary_accuracy: 0.8162 - auc: 0.8664 - val_loss: 0.3829 - val_binary_accuracy: 0.8260 - val_auc: 0.8768
Epoch 35/150
373/373 [==============================] - 1s 3ms/step - loss: 0.4037 - binary_accuracy: 0.8113 - auc: 0.8616 - val_loss: 0.3799 - val_binary_accuracy: 0.8276 - val_auc: 0.8809
Epoch 36/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3882 - binary_accuracy: 0.8246 - auc: 0.8698 - val_loss: 0.3727 - val_binary_accuracy: 0.8307 - val_auc: 0.8862
Epoch 37/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3898 - binary_accuracy: 0.8230 - auc: 0.8729 - val_loss: 0.3792 - val_binary_accuracy: 0.8327 - val_auc: 0.8806
Epoch 38/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3973 - binary_accuracy: 0.8192 - auc: 0.8653 - val_loss: 0.3794 - val_binary_accuracy: 0.8270 - val_auc: 0.8819
Epoch 39/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3871 - binary_accuracy: 0.8247 - auc: 0.8724 - val_loss: 0.3786 - val_binary_accuracy: 0.8280 - val_auc: 0.8813
Epoch 40/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3944 - binary_accuracy: 0.8186 - auc: 0.8699 - val_loss: 0.3773 - val_binary_accuracy: 0.8333 - val_auc: 0.8817
Epoch 41/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3836 - binary_accuracy: 0.8266 - auc: 0.8770 - val_loss: 0.3734 - val_binary_accuracy: 0.8310 - val_auc: 0.8845
Epoch 42/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3851 - binary_accuracy: 0.8223 - auc: 0.8737 - val_loss: 0.3735 - val_binary_accuracy: 0.8293 - val_auc: 0.8857
Epoch 43/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3854 - binary_accuracy: 0.8268 - auc: 0.8751 - val_loss: 0.3731 - val_binary_accuracy: 0.8307 - val_auc: 0.8848
Epoch 44/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3770 - binary_accuracy: 0.8324 - auc: 0.8785 - val_loss: 0.3747 - val_binary_accuracy: 0.8320 - val_auc: 0.8837
Epoch 45/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3854 - binary_accuracy: 0.8315 - auc: 0.8747 - val_loss: 0.3751 - val_binary_accuracy: 0.8320 - val_auc: 0.8842
Epoch 46/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3871 - binary_accuracy: 0.8262 - auc: 0.8704 - val_loss: 0.3723 - val_binary_accuracy: 0.8384 - val_auc: 0.8846
Epoch 47/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3816 - binary_accuracy: 0.8246 - auc: 0.8781 - val_loss: 0.3715 - val_binary_accuracy: 0.8333 - val_auc: 0.8863
Epoch 48/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3801 - binary_accuracy: 0.8266 - auc: 0.8767 - val_loss: 0.3663 - val_binary_accuracy: 0.8357 - val_auc: 0.8894
Epoch 49/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3725 - binary_accuracy: 0.8376 - auc: 0.8807 - val_loss: 0.3717 - val_binary_accuracy: 0.8384 - val_auc: 0.8868
Epoch 50/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3802 - binary_accuracy: 0.8291 - auc: 0.8782 - val_loss: 0.3713 - val_binary_accuracy: 0.8370 - val_auc: 0.8868
Epoch 51/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3710 - binary_accuracy: 0.8330 - auc: 0.8843 - val_loss: 0.3622 - val_binary_accuracy: 0.8374 - val_auc: 0.8915
Epoch 52/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3722 - binary_accuracy: 0.8361 - auc: 0.8847 - val_loss: 0.3732 - val_binary_accuracy: 0.8317 - val_auc: 0.8850
Epoch 53/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3704 - binary_accuracy: 0.8354 - auc: 0.8863 - val_loss: 0.3743 - val_binary_accuracy: 0.8367 - val_auc: 0.8859
Epoch 54/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3695 - binary_accuracy: 0.8337 - auc: 0.8847 - val_loss: 0.3640 - val_binary_accuracy: 0.8397 - val_auc: 0.8907
Epoch 55/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3744 - binary_accuracy: 0.8296 - auc: 0.8824 - val_loss: 0.3618 - val_binary_accuracy: 0.8380 - val_auc: 0.8927
Epoch 56/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3674 - binary_accuracy: 0.8292 - auc: 0.8868 - val_loss: 0.3667 - val_binary_accuracy: 0.8374 - val_auc: 0.8908
Epoch 57/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3726 - binary_accuracy: 0.8360 - auc: 0.8839 - val_loss: 0.3668 - val_binary_accuracy: 0.8353 - val_auc: 0.8887
Epoch 58/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3668 - binary_accuracy: 0.8336 - auc: 0.8887 - val_loss: 0.3635 - val_binary_accuracy: 0.8370 - val_auc: 0.8918
Epoch 59/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3680 - binary_accuracy: 0.8361 - auc: 0.8871 - val_loss: 0.3683 - val_binary_accuracy: 0.8374 - val_auc: 0.8887
Epoch 60/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3690 - binary_accuracy: 0.8346 - auc: 0.8845 - val_loss: 0.3670 - val_binary_accuracy: 0.8340 - val_auc: 0.8893
Epoch 61/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3714 - binary_accuracy: 0.8384 - auc: 0.8842 - val_loss: 0.3614 - val_binary_accuracy: 0.8404 - val_auc: 0.8936
Epoch 62/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3742 - binary_accuracy: 0.8300 - auc: 0.8816 - val_loss: 0.3585 - val_binary_accuracy: 0.8431 - val_auc: 0.8950
Epoch 63/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3679 - binary_accuracy: 0.8314 - auc: 0.8886 - val_loss: 0.3636 - val_binary_accuracy: 0.8327 - val_auc: 0.8921
Epoch 64/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3708 - binary_accuracy: 0.8351 - auc: 0.8847 - val_loss: 0.3638 - val_binary_accuracy: 0.8374 - val_auc: 0.8920
Epoch 65/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3730 - binary_accuracy: 0.8374 - auc: 0.8854 - val_loss: 0.3638 - val_binary_accuracy: 0.8360 - val_auc: 0.8933
Epoch 66/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3679 - binary_accuracy: 0.8335 - auc: 0.8875 - val_loss: 0.3592 - val_binary_accuracy: 0.8434 - val_auc: 0.8948
Epoch 67/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3597 - binary_accuracy: 0.8379 - auc: 0.8893 - val_loss: 0.3537 - val_binary_accuracy: 0.8447 - val_auc: 0.8997
Epoch 68/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3660 - binary_accuracy: 0.8353 - auc: 0.8875 - val_loss: 0.3614 - val_binary_accuracy: 0.8377 - val_auc: 0.8934
Epoch 69/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3635 - binary_accuracy: 0.8380 - auc: 0.8874 - val_loss: 0.3560 - val_binary_accuracy: 0.8437 - val_auc: 0.8971
Epoch 70/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3588 - binary_accuracy: 0.8397 - auc: 0.8940 - val_loss: 0.3575 - val_binary_accuracy: 0.8377 - val_auc: 0.8962
Epoch 71/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3642 - binary_accuracy: 0.8378 - auc: 0.8889 - val_loss: 0.3564 - val_binary_accuracy: 0.8360 - val_auc: 0.8965
Epoch 72/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3666 - binary_accuracy: 0.8323 - auc: 0.8854 - val_loss: 0.3598 - val_binary_accuracy: 0.8397 - val_auc: 0.8952
Epoch 73/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3548 - binary_accuracy: 0.8405 - auc: 0.8917 - val_loss: 0.3619 - val_binary_accuracy: 0.8394 - val_auc: 0.8933
Epoch 74/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3525 - binary_accuracy: 0.8440 - auc: 0.8956 - val_loss: 0.3689 - val_binary_accuracy: 0.8327 - val_auc: 0.8891
Epoch 75/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3617 - binary_accuracy: 0.8404 - auc: 0.8900 - val_loss: 0.3623 - val_binary_accuracy: 0.8390 - val_auc: 0.8941
Epoch 76/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3617 - binary_accuracy: 0.8325 - auc: 0.8917 - val_loss: 0.3580 - val_binary_accuracy: 0.8357 - val_auc: 0.8957
Epoch 77/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3570 - binary_accuracy: 0.8465 - auc: 0.8923 - val_loss: 0.3611 - val_binary_accuracy: 0.8330 - val_auc: 0.8936
Epoch 78/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3552 - binary_accuracy: 0.8456 - auc: 0.8962 - val_loss: 0.3612 - val_binary_accuracy: 0.8424 - val_auc: 0.8942
Epoch 79/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3513 - binary_accuracy: 0.8435 - auc: 0.8966 - val_loss: 0.3599 - val_binary_accuracy: 0.8380 - val_auc: 0.8950
Epoch 80/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3556 - binary_accuracy: 0.8427 - auc: 0.8967 - val_loss: 0.3562 - val_binary_accuracy: 0.8434 - val_auc: 0.8975
Epoch 81/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3543 - binary_accuracy: 0.8480 - auc: 0.8960 - val_loss: 0.3555 - val_binary_accuracy: 0.8431 - val_auc: 0.8974
Epoch 82/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3563 - binary_accuracy: 0.8450 - auc: 0.8909 - val_loss: 0.3562 - val_binary_accuracy: 0.8417 - val_auc: 0.8975
Epoch 83/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3541 - binary_accuracy: 0.8463 - auc: 0.8964 - val_loss: 0.3534 - val_binary_accuracy: 0.8427 - val_auc: 0.8994
Epoch 84/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3556 - binary_accuracy: 0.8459 - auc: 0.8904 - val_loss: 0.3565 - val_binary_accuracy: 0.8370 - val_auc: 0.8963
Epoch 85/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3594 - binary_accuracy: 0.8387 - auc: 0.8916 - val_loss: 0.3598 - val_binary_accuracy: 0.8384 - val_auc: 0.8959
Epoch 86/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3570 - binary_accuracy: 0.8437 - auc: 0.8923 - val_loss: 0.3599 - val_binary_accuracy: 0.8414 - val_auc: 0.8957
Epoch 87/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3436 - binary_accuracy: 0.8479 - auc: 0.9026 - val_loss: 0.3633 - val_binary_accuracy: 0.8397 - val_auc: 0.8933
Epoch 88/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3623 - binary_accuracy: 0.8469 - auc: 0.8917 - val_loss: 0.3575 - val_binary_accuracy: 0.8431 - val_auc: 0.8971
Epoch 89/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3528 - binary_accuracy: 0.8430 - auc: 0.8961 - val_loss: 0.3571 - val_binary_accuracy: 0.8421 - val_auc: 0.8970
Epoch 90/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3588 - binary_accuracy: 0.8393 - auc: 0.8910 - val_loss: 0.3588 - val_binary_accuracy: 0.8387 - val_auc: 0.8956
Epoch 91/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3498 - binary_accuracy: 0.8471 - auc: 0.8963 - val_loss: 0.3610 - val_binary_accuracy: 0.8380 - val_auc: 0.8947
Epoch 92/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3579 - binary_accuracy: 0.8458 - auc: 0.8939 - val_loss: 0.3602 - val_binary_accuracy: 0.8437 - val_auc: 0.8952
Epoch 93/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3496 - binary_accuracy: 0.8467 - auc: 0.8967 - val_loss: 0.3570 - val_binary_accuracy: 0.8444 - val_auc: 0.8968
Epoch 94/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3453 - binary_accuracy: 0.8517 - auc: 0.9010 - val_loss: 0.3544 - val_binary_accuracy: 0.8421 - val_auc: 0.8987
Epoch 95/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3468 - binary_accuracy: 0.8484 - auc: 0.8988 - val_loss: 0.3545 - val_binary_accuracy: 0.8417 - val_auc: 0.8983
Epoch 96/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3551 - binary_accuracy: 0.8452 - auc: 0.8945 - val_loss: 0.3549 - val_binary_accuracy: 0.8437 - val_auc: 0.8983
Epoch 97/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3490 - binary_accuracy: 0.8453 - auc: 0.8982 - val_loss: 0.3518 - val_binary_accuracy: 0.8464 - val_auc: 0.9001
Epoch 98/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3457 - binary_accuracy: 0.8440 - auc: 0.8994 - val_loss: 0.3539 - val_binary_accuracy: 0.8451 - val_auc: 0.8991
Epoch 99/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3504 - binary_accuracy: 0.8439 - auc: 0.8959 - val_loss: 0.3576 - val_binary_accuracy: 0.8427 - val_auc: 0.8977
Epoch 100/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3493 - binary_accuracy: 0.8396 - auc: 0.8977 - val_loss: 0.3594 - val_binary_accuracy: 0.8414 - val_auc: 0.8956
Epoch 101/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3411 - binary_accuracy: 0.8462 - auc: 0.9012 - val_loss: 0.3565 - val_binary_accuracy: 0.8441 - val_auc: 0.8972
Epoch 102/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3520 - binary_accuracy: 0.8483 - auc: 0.8981 - val_loss: 0.3614 - val_binary_accuracy: 0.8384 - val_auc: 0.8942
Epoch 103/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3505 - binary_accuracy: 0.8473 - auc: 0.8984 - val_loss: 0.3514 - val_binary_accuracy: 0.8360 - val_auc: 0.9009
Epoch 104/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3489 - binary_accuracy: 0.8432 - auc: 0.8994 - val_loss: 0.3527 - val_binary_accuracy: 0.8478 - val_auc: 0.8992
Epoch 105/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3383 - binary_accuracy: 0.8501 - auc: 0.9037 - val_loss: 0.3494 - val_binary_accuracy: 0.8461 - val_auc: 0.9011
Epoch 106/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3550 - binary_accuracy: 0.8398 - auc: 0.8940 - val_loss: 0.3572 - val_binary_accuracy: 0.8424 - val_auc: 0.8957
Epoch 107/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3529 - binary_accuracy: 0.8437 - auc: 0.8961 - val_loss: 0.3559 - val_binary_accuracy: 0.8414 - val_auc: 0.8977
Epoch 108/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3521 - binary_accuracy: 0.8418 - auc: 0.8990 - val_loss: 0.3538 - val_binary_accuracy: 0.8431 - val_auc: 0.8989
Epoch 109/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3430 - binary_accuracy: 0.8452 - auc: 0.9012 - val_loss: 0.3522 - val_binary_accuracy: 0.8417 - val_auc: 0.9004
Epoch 110/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3436 - binary_accuracy: 0.8506 - auc: 0.9022 - val_loss: 0.3555 - val_binary_accuracy: 0.8421 - val_auc: 0.8984
Epoch 111/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3386 - binary_accuracy: 0.8533 - auc: 0.9048 - val_loss: 0.3492 - val_binary_accuracy: 0.8457 - val_auc: 0.9018
Epoch 112/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3448 - binary_accuracy: 0.8483 - auc: 0.9009 - val_loss: 0.3496 - val_binary_accuracy: 0.8444 - val_auc: 0.9010
Epoch 113/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3344 - binary_accuracy: 0.8507 - auc: 0.9085 - val_loss: 0.3496 - val_binary_accuracy: 0.8414 - val_auc: 0.9010
Epoch 114/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3420 - binary_accuracy: 0.8446 - auc: 0.9024 - val_loss: 0.3529 - val_binary_accuracy: 0.8400 - val_auc: 0.8999
Epoch 115/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3558 - binary_accuracy: 0.8410 - auc: 0.8943 - val_loss: 0.3527 - val_binary_accuracy: 0.8410 - val_auc: 0.8995
Epoch 116/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3455 - binary_accuracy: 0.8526 - auc: 0.9021 - val_loss: 0.3511 - val_binary_accuracy: 0.8444 - val_auc: 0.9014
Epoch 117/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3426 - binary_accuracy: 0.8491 - auc: 0.9016 - val_loss: 0.3476 - val_binary_accuracy: 0.8441 - val_auc: 0.9014
Epoch 118/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3435 - binary_accuracy: 0.8514 - auc: 0.9012 - val_loss: 0.3452 - val_binary_accuracy: 0.8461 - val_auc: 0.9027
Epoch 119/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3418 - binary_accuracy: 0.8434 - auc: 0.9033 - val_loss: 0.3503 - val_binary_accuracy: 0.8421 - val_auc: 0.9012
Epoch 120/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3409 - binary_accuracy: 0.8528 - auc: 0.9033 - val_loss: 0.3485 - val_binary_accuracy: 0.8474 - val_auc: 0.9021
Epoch 121/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3448 - binary_accuracy: 0.8427 - auc: 0.9020 - val_loss: 0.3614 - val_binary_accuracy: 0.8421 - val_auc: 0.8955
Epoch 122/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3389 - binary_accuracy: 0.8520 - auc: 0.9053 - val_loss: 0.3497 - val_binary_accuracy: 0.8447 - val_auc: 0.9002
Epoch 123/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3413 - binary_accuracy: 0.8501 - auc: 0.9035 - val_loss: 0.3508 - val_binary_accuracy: 0.8457 - val_auc: 0.9012
Epoch 124/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3410 - binary_accuracy: 0.8492 - auc: 0.9061 - val_loss: 0.3478 - val_binary_accuracy: 0.8454 - val_auc: 0.9012
Epoch 125/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3284 - binary_accuracy: 0.8553 - auc: 0.9099 - val_loss: 0.3446 - val_binary_accuracy: 0.8424 - val_auc: 0.9037
Epoch 126/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3482 - binary_accuracy: 0.8453 - auc: 0.9009 - val_loss: 0.3459 - val_binary_accuracy: 0.8427 - val_auc: 0.9040
Epoch 127/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3344 - binary_accuracy: 0.8503 - auc: 0.9055 - val_loss: 0.3477 - val_binary_accuracy: 0.8508 - val_auc: 0.9027
Epoch 128/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3349 - binary_accuracy: 0.8541 - auc: 0.9070 - val_loss: 0.3532 - val_binary_accuracy: 0.8437 - val_auc: 0.8990
Epoch 129/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3306 - binary_accuracy: 0.8502 - auc: 0.9092 - val_loss: 0.3502 - val_binary_accuracy: 0.8434 - val_auc: 0.9007
Epoch 130/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3381 - binary_accuracy: 0.8498 - auc: 0.9046 - val_loss: 0.3480 - val_binary_accuracy: 0.8454 - val_auc: 0.9026
Epoch 131/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3280 - binary_accuracy: 0.8568 - auc: 0.9086 - val_loss: 0.3502 - val_binary_accuracy: 0.8427 - val_auc: 0.9018
Epoch 132/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3444 - binary_accuracy: 0.8460 - auc: 0.9001 - val_loss: 0.3472 - val_binary_accuracy: 0.8464 - val_auc: 0.9020
Epoch 133/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3396 - binary_accuracy: 0.8510 - auc: 0.9031 - val_loss: 0.3517 - val_binary_accuracy: 0.8434 - val_auc: 0.8997
Epoch 134/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3348 - binary_accuracy: 0.8498 - auc: 0.9079 - val_loss: 0.3473 - val_binary_accuracy: 0.8447 - val_auc: 0.9017
Epoch 135/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3325 - binary_accuracy: 0.8538 - auc: 0.9082 - val_loss: 0.3433 - val_binary_accuracy: 0.8484 - val_auc: 0.9050
Epoch 136/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3362 - binary_accuracy: 0.8497 - auc: 0.9062 - val_loss: 0.3455 - val_binary_accuracy: 0.8471 - val_auc: 0.9044
Epoch 137/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3423 - binary_accuracy: 0.8474 - auc: 0.9016 - val_loss: 0.3504 - val_binary_accuracy: 0.8457 - val_auc: 0.9001
Epoch 138/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3326 - binary_accuracy: 0.8564 - auc: 0.9068 - val_loss: 0.3496 - val_binary_accuracy: 0.8444 - val_auc: 0.9004
Epoch 139/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3243 - binary_accuracy: 0.8565 - auc: 0.9143 - val_loss: 0.3424 - val_binary_accuracy: 0.8461 - val_auc: 0.9056
Epoch 140/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3415 - binary_accuracy: 0.8480 - auc: 0.9019 - val_loss: 0.3418 - val_binary_accuracy: 0.8441 - val_auc: 0.9052
Epoch 141/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3449 - binary_accuracy: 0.8484 - auc: 0.9011 - val_loss: 0.3420 - val_binary_accuracy: 0.8451 - val_auc: 0.9056
Epoch 142/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3451 - binary_accuracy: 0.8477 - auc: 0.9003 - val_loss: 0.3410 - val_binary_accuracy: 0.8461 - val_auc: 0.9062
Epoch 143/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3336 - binary_accuracy: 0.8544 - auc: 0.9095 - val_loss: 0.3415 - val_binary_accuracy: 0.8478 - val_auc: 0.9048
Epoch 144/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3374 - binary_accuracy: 0.8520 - auc: 0.9069 - val_loss: 0.3426 - val_binary_accuracy: 0.8387 - val_auc: 0.9051
Epoch 145/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3272 - binary_accuracy: 0.8554 - auc: 0.9118 - val_loss: 0.3449 - val_binary_accuracy: 0.8447 - val_auc: 0.9032
Epoch 146/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3379 - binary_accuracy: 0.8520 - auc: 0.9053 - val_loss: 0.3450 - val_binary_accuracy: 0.8387 - val_auc: 0.9042
Epoch 147/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3272 - binary_accuracy: 0.8555 - auc: 0.9073 - val_loss: 0.3443 - val_binary_accuracy: 0.8488 - val_auc: 0.9037
Epoch 148/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3389 - binary_accuracy: 0.8519 - auc: 0.9075 - val_loss: 0.3442 - val_binary_accuracy: 0.8478 - val_auc: 0.9038
Epoch 149/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3329 - binary_accuracy: 0.8572 - auc: 0.9097 - val_loss: 0.3459 - val_binary_accuracy: 0.8431 - val_auc: 0.9040
Epoch 150/150
373/373 [==============================] - 1s 3ms/step - loss: 0.3258 - binary_accuracy: 0.8564 - auc: 0.9127 - val_loss: 0.3452 - val_binary_accuracy: 0.8451 - val_auc: 0.9033
model_loss = pd.DataFrame(model.history.history)
model_loss.plot()
#predictions
predictions = model.predict_classes(X_test)
print("AUC Score:",roc_auc_score(y_test,model.predict(X_test)))
AUC Score: 0.9034168640074813
print("Accuracy Score :",accuracy_score(y_test,predictions))
Accuracy Score : 0.8450704225352113
print(
classification_report(
y_test, predictions, target_names=["Covid_Negative", "Covid_Positive"]
)
)
precision recall f1-score support
Covid_Negative 0.88 0.91 0.90 2173
Covid_Positive 0.74 0.66 0.70 809
accuracy 0.85 2982
macro avg 0.81 0.79 0.80 2982
weighted avg 0.84 0.85 0.84 2982
#confusion matrix
plt.figure(figsize = (10,10))
cm = confusion_matrix(y_test,predictions)
sns.heatmap(cm,cmap= "Blues", linecolor = 'black' , linewidth = 1 , annot = True, fmt='' , xticklabels = ['Covid_Negative','Covid_Positive'] , yticklabels = ['Covid_Negative','Covid_Positive'])
plt.xlabel("Predicted")
plt.ylabel("Actual");
X = bcell_sars.drop(
["target", "parent_protein_id", "protein_seq", "peptide_seq"], axis=1
)
y = bcell_sars["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
N_THREADS = 4 # threads cnt for lgbm and linear models
JOB = 1
N_FOLDS = 5 # folds cnt for AutoML
RANDOM_STATE = 42 # fixed random state for various reasons
TEST_SIZE = 0.2 # Test size for metric check
TIMEOUT = 300 # Time in seconds for automl run
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)
def AUC_metric(y_true, y_pred, **kwargs):
return roc_auc_score(y_true, (y_pred > 0.5).astype(int), **kwargs)
task = Task("binary", metric=roc_auc_score)
roles = {
"target": "target",
}
df = bcell_sars.drop(["parent_protein_id", "protein_seq", "peptide_seq"], axis=1)
train_df, test_df = train_test_split(df, test_size=0.20)
%%time
automl = TabularUtilizedAutoML(task = task,
timeout = TIMEOUT,
cpu_limit = N_THREADS,
general_params = {'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']]},
reader_params = {'n_jobs': JOB})
oof_pred = automl.fit_predict(train_df, roles = roles)
Layer 1 ...
Train process start. Time left 299.336119890213 secs
Time left 294.40120220184326
Time limit exceeded after calculating fold 3
Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer
Time left 125.34042477607727
Time limit exceeded in one of the tasks. AutoML will blend level 1 models.
Try to set higher time limits or use Profiler to find bottleneck and optimize Pipelines settings
CPU times: user 4min 18s, sys: 0 ns, total: 4min 18s
Wall time: 2min 55s
pred = automl.predict(X_test)
prediction = (pred.data[:, 0] > 0.5).astype(int)
print("AUC score :", roc_auc_score(y_test, prediction))
AUC score : 0.8739043245756801
%%time
automl = AutoMLSearch(
X_train=X_train,
y_train=y_train,
problem_type='binary',
# random_seed=2021,
max_time=300,
)
CPU times: user 16.4 ms, sys: 0 ns, total: 16.4 ms
Wall time: 20.6 ms
automl.search()
Generating pipelines to search over...
*****************************
* Beginning pipeline search *
*****************************
Optimizing for Log Loss Binary.
Lower score is better.
Will stop searching for new pipelines after 300 seconds.
Allowed model families: lightgbm, xgboost, random_forest, extra_trees, decision_tree, catboost, linear_model
Mode Baseline Binary Classification P... Elapsed: 00:00
Starting cross validation
Finished cross validation - mean Log Loss Binary: 9.442
Logistic Regression Classifier w/ Imp... Elapsed: 00:00
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.559
Random Forest Classifier w/ Imputer Elapsed: 00:04
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.446
XGBoost Classifier w/ Imputer Elapsed: 00:08
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.353
CatBoost Classifier w/ Imputer Elapsed: 00:13
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.625
Elastic Net Classifier w/ Imputer + S... Elapsed: 00:14
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.587
Extra Trees Classifier w/ Imputer Elapsed: 00:15
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.542
LightGBM Classifier w/ Imputer Elapsed: 00:17
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.342
Decision Tree Classifier w/ Imputer Elapsed: 00:20
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.554
LightGBM Classifier w/ Imputer Elapsed: 00:21
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.480
LightGBM Classifier w/ Imputer Elapsed: 00:58
Starting cross validation
Finished cross validation - mean Log Loss Binary: 10.599
LightGBM Classifier w/ Imputer Elapsed: 01:01
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.543
LightGBM Classifier w/ Imputer Elapsed: 01:02
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.349
LightGBM Classifier w/ Imputer Elapsed: 01:05
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.570
XGBoost Classifier w/ Imputer Elapsed: 01:06
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.371
XGBoost Classifier w/ Imputer Elapsed: 01:16
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.467
XGBoost Classifier w/ Imputer Elapsed: 01:21
Starting cross validation
Finished cross validation - mean Log Loss Binary: nan
XGBoost Classifier w/ Imputer Elapsed: 01:49
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.430
XGBoost Classifier w/ Imputer Elapsed: 01:58
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.396
Random Forest Classifier w/ Imputer Elapsed: 02:16
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.499
Random Forest Classifier w/ Imputer Elapsed: 02:31
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.420
Random Forest Classifier w/ Imputer Elapsed: 02:36
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.445
Random Forest Classifier w/ Imputer Elapsed: 02:55
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.498
Random Forest Classifier w/ Imputer Elapsed: 03:14
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.498
Extra Trees Classifier w/ Imputer Elapsed: 03:32
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.553
Extra Trees Classifier w/ Imputer Elapsed: 03:37
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.553
Extra Trees Classifier w/ Imputer Elapsed: 03:41
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.496
Extra Trees Classifier w/ Imputer Elapsed: 03:45
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.482
Extra Trees Classifier w/ Imputer Elapsed: 03:50
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.515
Decision Tree Classifier w/ Imputer Elapsed: 03:54
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.521
Decision Tree Classifier w/ Imputer Elapsed: 03:55
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.706
Decision Tree Classifier w/ Imputer Elapsed: 03:56
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.992
Decision Tree Classifier w/ Imputer Elapsed: 03:57
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.677
Decision Tree Classifier w/ Imputer Elapsed: 03:58
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.570
Logistic Regression Classifier w/ Imp... Elapsed: 03:58
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.559
Logistic Regression Classifier w/ Imp... Elapsed: 04:00
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.559
Logistic Regression Classifier w/ Imp... Elapsed: 04:01
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.559
Logistic Regression Classifier w/ Imp... Elapsed: 04:03
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.559
Logistic Regression Classifier w/ Imp... Elapsed: 04:04
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.559
Elastic Net Classifier w/ Imputer + S... Elapsed: 04:05
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.587
Elastic Net Classifier w/ Imputer + S... Elapsed: 04:06
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.587
Elastic Net Classifier w/ Imputer + S... Elapsed: 04:08
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.586
Elastic Net Classifier w/ Imputer + S... Elapsed: 04:09
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.587
Elastic Net Classifier w/ Imputer + S... Elapsed: 04:11
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.587
CatBoost Classifier w/ Imputer Elapsed: 04:12
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.363
CatBoost Classifier w/ Imputer Elapsed: 04:15
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.361
CatBoost Classifier w/ Imputer Elapsed: 04:16
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.423
CatBoost Classifier w/ Imputer Elapsed: 04:18
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.390
CatBoost Classifier w/ Imputer Elapsed: 04:19
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.363
LightGBM Classifier w/ Imputer Elapsed: 04:21
Starting cross validation
Finished cross validation - mean Log Loss Binary: 0.450
Search finished after 05:41
Best pipeline: LightGBM Classifier w/ Imputer
Best pipeline Log Loss Binary: 0.341506
automl.rankings
%%time
pipeline = automl.best_pipeline
pipeline.fit(X_train, y_train)
CPU times: user 3min 53s, sys: 373 ms, total: 3min 54s
Wall time: 2min 36s
preds = pipeline.predict(X_test)
print("AUC score:",roc_auc_score(y_test,preds))
AUC score: 0.790120320855615
# Initialize an AutoML instance
automl = AutoML()
# Specify automl goal and constraint
automl_settings = {
"time_budget": 300, # in seconds
"metric": "roc_auc",
"task": "classification",
}
# Train with labeled input data
automl.fit(
X_train=X_train,
y_train=y_train,
**automl_settings
)
[flaml.automl: 06-12 21:25:14] {893} INFO - Evaluation method: cv
[flaml.automl: 06-12 21:25:14] {596} INFO - Using StratifiedKFold
[flaml.automl: 06-12 21:25:14] {914} INFO - Minimizing error metric: 1-roc_auc
[flaml.automl: 06-12 21:25:14] {934} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree', 'lrl1']
[flaml.automl: 06-12 21:25:14] {998} INFO - iteration 0, current learner lgbm
[flaml.automl: 06-12 21:25:29] {1150} INFO - at 15.3s, best lgbm's error=0.2874, best lgbm's error=0.2874
[flaml.automl: 06-12 21:25:29] {998} INFO - iteration 1, current learner lgbm
[flaml.automl: 06-12 21:25:41] {1150} INFO - at 27.6s, best lgbm's error=0.2874, best lgbm's error=0.2874
[flaml.automl: 06-12 21:25:41] {998} INFO - iteration 2, current learner lgbm
[flaml.automl: 06-12 21:25:55] {1150} INFO - at 41.7s, best lgbm's error=0.2754, best lgbm's error=0.2754
[flaml.automl: 06-12 21:25:55] {998} INFO - iteration 3, current learner xgboost
[flaml.automl: 06-12 21:25:56] {1150} INFO - at 42.1s, best xgboost's error=0.3049, best lgbm's error=0.2754
[flaml.automl: 06-12 21:25:56] {998} INFO - iteration 4, current learner lgbm
[flaml.automl: 06-12 21:25:56] {1150} INFO - at 42.4s, best lgbm's error=0.2660, best lgbm's error=0.2660
[flaml.automl: 06-12 21:25:56] {998} INFO - iteration 5, current learner lgbm
[flaml.automl: 06-12 21:25:56] {1150} INFO - at 42.7s, best lgbm's error=0.2246, best lgbm's error=0.2246
[flaml.automl: 06-12 21:25:56] {998} INFO - iteration 6, current learner lgbm
[flaml.automl: 06-12 21:25:57] {1150} INFO - at 43.1s, best lgbm's error=0.2246, best lgbm's error=0.2246
[flaml.automl: 06-12 21:25:57] {998} INFO - iteration 7, current learner lgbm
[flaml.automl: 06-12 21:25:57] {1150} INFO - at 43.6s, best lgbm's error=0.1764, best lgbm's error=0.1764
[flaml.automl: 06-12 21:25:57] {998} INFO - iteration 8, current learner lgbm
[flaml.automl: 06-12 21:25:58] {1150} INFO - at 44.1s, best lgbm's error=0.1516, best lgbm's error=0.1516
[flaml.automl: 06-12 21:25:58] {998} INFO - iteration 9, current learner lgbm
[flaml.automl: 06-12 21:25:58] {1150} INFO - at 44.4s, best lgbm's error=0.1516, best lgbm's error=0.1516
[flaml.automl: 06-12 21:25:58] {998} INFO - iteration 10, current learner lgbm
[flaml.automl: 06-12 21:25:59] {1150} INFO - at 45.0s, best lgbm's error=0.1168, best lgbm's error=0.1168
[flaml.automl: 06-12 21:25:59] {998} INFO - iteration 11, current learner xgboost
[flaml.automl: 06-12 21:25:59] {1150} INFO - at 45.2s, best xgboost's error=0.3049, best lgbm's error=0.1168
[flaml.automl: 06-12 21:25:59] {998} INFO - iteration 12, current learner lgbm
[flaml.automl: 06-12 21:25:59] {1150} INFO - at 45.6s, best lgbm's error=0.1168, best lgbm's error=0.1168
[flaml.automl: 06-12 21:25:59] {998} INFO - iteration 13, current learner xgboost
[flaml.automl: 06-12 21:26:00] {1150} INFO - at 45.9s, best xgboost's error=0.2748, best lgbm's error=0.1168
[flaml.automl: 06-12 21:26:00] {998} INFO - iteration 14, current learner lgbm
[flaml.automl: 06-12 21:26:00] {1150} INFO - at 46.2s, best lgbm's error=0.1168, best lgbm's error=0.1168
[flaml.automl: 06-12 21:26:00] {998} INFO - iteration 15, current learner xgboost
[flaml.automl: 06-12 21:26:00] {1150} INFO - at 46.5s, best xgboost's error=0.2452, best lgbm's error=0.1168
[flaml.automl: 06-12 21:26:00] {998} INFO - iteration 16, current learner lgbm
[flaml.automl: 06-12 21:26:01] {1150} INFO - at 47.2s, best lgbm's error=0.1034, best lgbm's error=0.1034
[flaml.automl: 06-12 21:26:01] {998} INFO - iteration 17, current learner xgboost
[flaml.automl: 06-12 21:26:01] {1150} INFO - at 47.5s, best xgboost's error=0.2287, best lgbm's error=0.1034
[flaml.automl: 06-12 21:26:01] {998} INFO - iteration 18, current learner lgbm
[flaml.automl: 06-12 21:26:02] {1150} INFO - at 48.0s, best lgbm's error=0.1034, best lgbm's error=0.1034
[flaml.automl: 06-12 21:26:02] {998} INFO - iteration 19, current learner lgbm
[flaml.automl: 06-12 21:26:02] {1150} INFO - at 48.7s, best lgbm's error=0.1034, best lgbm's error=0.1034
[flaml.automl: 06-12 21:26:02] {998} INFO - iteration 20, current learner lgbm
[flaml.automl: 06-12 21:26:03] {1150} INFO - at 49.2s, best lgbm's error=0.1034, best lgbm's error=0.1034
[flaml.automl: 06-12 21:26:03] {998} INFO - iteration 21, current learner lgbm
[flaml.automl: 06-12 21:26:04] {1150} INFO - at 49.9s, best lgbm's error=0.1034, best lgbm's error=0.1034
[flaml.automl: 06-12 21:26:04] {998} INFO - iteration 22, current learner xgboost
[flaml.automl: 06-12 21:26:04] {1150} INFO - at 50.2s, best xgboost's error=0.2287, best lgbm's error=0.1034
[flaml.automl: 06-12 21:26:04] {998} INFO - iteration 23, current learner xgboost
[flaml.automl: 06-12 21:26:04] {1150} INFO - at 50.6s, best xgboost's error=0.1628, best lgbm's error=0.1034
[flaml.automl: 06-12 21:26:04] {998} INFO - iteration 24, current learner lgbm
[flaml.automl: 06-12 21:26:05] {1150} INFO - at 51.0s, best lgbm's error=0.1034, best lgbm's error=0.1034
[flaml.automl: 06-12 21:26:05] {998} INFO - iteration 25, current learner xgboost
[flaml.automl: 06-12 21:26:05] {1150} INFO - at 51.4s, best xgboost's error=0.1472, best lgbm's error=0.1034
[flaml.automl: 06-12 21:26:05] {998} INFO - iteration 26, current learner xgboost
[flaml.automl: 06-12 21:26:06] {1150} INFO - at 51.9s, best xgboost's error=0.1472, best lgbm's error=0.1034
[flaml.automl: 06-12 21:26:06] {998} INFO - iteration 27, current learner xgboost
[flaml.automl: 06-12 21:26:06] {1150} INFO - at 52.3s, best xgboost's error=0.1210, best lgbm's error=0.1034
[flaml.automl: 06-12 21:26:06] {998} INFO - iteration 28, current learner lgbm
[flaml.automl: 06-12 21:26:07] {1150} INFO - at 53.6s, best lgbm's error=0.0868, best lgbm's error=0.0868
[flaml.automl: 06-12 21:26:07] {998} INFO - iteration 29, current learner lgbm
[flaml.automl: 06-12 21:26:08] {1150} INFO - at 54.0s, best lgbm's error=0.0868, best lgbm's error=0.0868
[flaml.automl: 06-12 21:26:08] {998} INFO - iteration 30, current learner lgbm
[flaml.automl: 06-12 21:26:13] {1150} INFO - at 59.2s, best lgbm's error=0.0778, best lgbm's error=0.0778
[flaml.automl: 06-12 21:26:13] {998} INFO - iteration 31, current learner xgboost
[flaml.automl: 06-12 21:26:13] {1150} INFO - at 59.7s, best xgboost's error=0.1210, best lgbm's error=0.0778
[flaml.automl: 06-12 21:26:13] {998} INFO - iteration 32, current learner lgbm
[flaml.automl: 06-12 21:26:16] {1150} INFO - at 62.2s, best lgbm's error=0.0778, best lgbm's error=0.0778
[flaml.automl: 06-12 21:26:16] {998} INFO - iteration 33, current learner lgbm
[flaml.automl: 06-12 21:26:19] {1150} INFO - at 65.6s, best lgbm's error=0.0778, best lgbm's error=0.0778
[flaml.automl: 06-12 21:26:19] {998} INFO - iteration 34, current learner xgboost
[flaml.automl: 06-12 21:26:20] {1150} INFO - at 66.0s, best xgboost's error=0.1210, best lgbm's error=0.0778
[flaml.automl: 06-12 21:26:20] {998} INFO - iteration 35, current learner xgboost
[flaml.automl: 06-12 21:26:20] {1150} INFO - at 66.6s, best xgboost's error=0.1116, best lgbm's error=0.0778
[flaml.automl: 06-12 21:26:20] {998} INFO - iteration 36, current learner xgboost
[flaml.automl: 06-12 21:26:21] {1150} INFO - at 67.2s, best xgboost's error=0.1116, best lgbm's error=0.0778
[flaml.automl: 06-12 21:26:21] {998} INFO - iteration 37, current learner lgbm
[flaml.automl: 06-12 21:26:23] {1150} INFO - at 69.1s, best lgbm's error=0.0778, best lgbm's error=0.0778
[flaml.automl: 06-12 21:26:23] {998} INFO - iteration 38, current learner xgboost
[flaml.automl: 06-12 21:26:24] {1150} INFO - at 70.1s, best xgboost's error=0.1089, best lgbm's error=0.0778
[flaml.automl: 06-12 21:26:24] {998} INFO - iteration 39, current learner xgboost
[flaml.automl: 06-12 21:26:25] {1150} INFO - at 71.1s, best xgboost's error=0.1089, best lgbm's error=0.0778
[flaml.automl: 06-12 21:26:25] {998} INFO - iteration 40, current learner lgbm
[flaml.automl: 06-12 21:26:30] {1150} INFO - at 76.3s, best lgbm's error=0.0778, best lgbm's error=0.0778
[flaml.automl: 06-12 21:26:30] {998} INFO - iteration 41, current learner lgbm
[flaml.automl: 06-12 21:26:31] {1150} INFO - at 77.1s, best lgbm's error=0.0778, best lgbm's error=0.0778
[flaml.automl: 06-12 21:26:31] {998} INFO - iteration 42, current learner xgboost
[flaml.automl: 06-12 21:26:32] {1150} INFO - at 78.7s, best xgboost's error=0.0950, best lgbm's error=0.0778
[flaml.automl: 06-12 21:26:32] {998} INFO - iteration 43, current learner xgboost
[flaml.automl: 06-12 21:26:33] {1150} INFO - at 79.6s, best xgboost's error=0.0950, best lgbm's error=0.0778
[flaml.automl: 06-12 21:26:33] {998} INFO - iteration 44, current learner lgbm
[flaml.automl: 06-12 21:26:45] {1150} INFO - at 91.8s, best lgbm's error=0.0778, best lgbm's error=0.0778
[flaml.automl: 06-12 21:26:45] {998} INFO - iteration 45, current learner xgboost
[flaml.automl: 06-12 21:26:52] {1150} INFO - at 97.8s, best xgboost's error=0.0841, best lgbm's error=0.0778
[flaml.automl: 06-12 21:26:52] {998} INFO - iteration 46, current learner lgbm
[flaml.automl: 06-12 21:26:54] {1150} INFO - at 100.8s, best lgbm's error=0.0778, best lgbm's error=0.0778
[flaml.automl: 06-12 21:26:55] {998} INFO - iteration 47, current learner xgboost
[flaml.automl: 06-12 21:26:57] {1150} INFO - at 103.3s, best xgboost's error=0.0841, best lgbm's error=0.0778
[flaml.automl: 06-12 21:26:57] {998} INFO - iteration 48, current learner xgboost
[flaml.automl: 06-12 21:27:21] {1150} INFO - at 127.4s, best xgboost's error=0.0817, best lgbm's error=0.0778
[flaml.automl: 06-12 21:27:21] {998} INFO - iteration 49, current learner extra_tree
[flaml.automl: 06-12 21:27:22] {1150} INFO - at 128.7s, best extra_tree's error=0.1141, best lgbm's error=0.0778
[flaml.automl: 06-12 21:27:22] {998} INFO - iteration 50, current learner extra_tree
[flaml.automl: 06-12 21:27:24] {1150} INFO - at 130.0s, best extra_tree's error=0.1106, best lgbm's error=0.0778
[flaml.automl: 06-12 21:27:24] {998} INFO - iteration 51, current learner lgbm
[flaml.automl: 06-12 21:27:31] {1150} INFO - at 137.1s, best lgbm's error=0.0778, best lgbm's error=0.0778
[flaml.automl: 06-12 21:27:31] {998} INFO - iteration 52, current learner lgbm
[flaml.automl: 06-12 21:27:32] {1150} INFO - at 138.1s, best lgbm's error=0.0778, best lgbm's error=0.0778
[flaml.automl: 06-12 21:27:32] {998} INFO - iteration 53, current learner lgbm
[flaml.automl: 06-12 21:27:54] {1150} INFO - at 160.8s, best lgbm's error=0.0778, best lgbm's error=0.0778
[flaml.automl: 06-12 21:27:54] {998} INFO - iteration 54, current learner extra_tree
[flaml.automl: 06-12 21:27:56] {1150} INFO - at 162.1s, best extra_tree's error=0.0935, best lgbm's error=0.0778
[flaml.automl: 06-12 21:27:56] {998} INFO - iteration 55, current learner extra_tree
[flaml.automl: 06-12 21:27:57] {1150} INFO - at 163.5s, best extra_tree's error=0.0935, best lgbm's error=0.0778
[flaml.automl: 06-12 21:27:57] {998} INFO - iteration 56, current learner extra_tree
[flaml.automl: 06-12 21:27:59] {1150} INFO - at 164.9s, best extra_tree's error=0.0921, best lgbm's error=0.0778
[flaml.automl: 06-12 21:27:59] {998} INFO - iteration 57, current learner rf
[flaml.automl: 06-12 21:28:00] {1150} INFO - at 166.8s, best rf's error=0.1276, best lgbm's error=0.0778
[flaml.automl: 06-12 21:28:00] {998} INFO - iteration 58, current learner rf
[flaml.automl: 06-12 21:28:02] {1150} INFO - at 168.6s, best rf's error=0.1232, best lgbm's error=0.0778
[flaml.automl: 06-12 21:28:02] {998} INFO - iteration 59, current learner lgbm
[flaml.automl: 06-12 21:28:05] {1150} INFO - at 171.6s, best lgbm's error=0.0778, best lgbm's error=0.0778
[flaml.automl: 06-12 21:28:05] {998} INFO - iteration 60, current learner rf
[flaml.automl: 06-12 21:28:08] {1150} INFO - at 174.4s, best rf's error=0.1055, best lgbm's error=0.0778
[flaml.automl: 06-12 21:28:08] {998} INFO - iteration 61, current learner rf
[flaml.automl: 06-12 21:28:11] {1150} INFO - at 176.9s, best rf's error=0.1055, best lgbm's error=0.0778
[flaml.automl: 06-12 21:28:11] {998} INFO - iteration 62, current learner rf
[flaml.automl: 06-12 21:28:14] {1150} INFO - at 179.9s, best rf's error=0.0975, best lgbm's error=0.0778
[flaml.automl: 06-12 21:28:14] {998} INFO - iteration 63, current learner xgboost
[flaml.automl: 06-12 21:28:29] {1150} INFO - at 195.5s, best xgboost's error=0.0817, best lgbm's error=0.0778
[flaml.automl: 06-12 21:28:29] {998} INFO - iteration 64, current learner lgbm
[flaml.automl: 06-12 21:28:32] {1150} INFO - at 197.8s, best lgbm's error=0.0778, best lgbm's error=0.0778
[flaml.automl: 06-12 21:28:32] {998} INFO - iteration 65, current learner rf
[flaml.automl: 06-12 21:28:34] {1150} INFO - at 200.6s, best rf's error=0.0949, best lgbm's error=0.0778
[flaml.automl: 06-12 21:28:34] {998} INFO - iteration 66, current learner xgboost
[flaml.automl: 06-12 21:29:03] {1150} INFO - at 229.5s, best xgboost's error=0.0817, best lgbm's error=0.0778
[flaml.automl: 06-12 21:29:03] {998} INFO - iteration 67, current learner rf
[flaml.automl: 06-12 21:29:06] {1150} INFO - at 232.5s, best rf's error=0.0949, best lgbm's error=0.0778
[flaml.automl: 06-12 21:29:06] {998} INFO - iteration 68, current learner lgbm
[flaml.automl: 06-12 21:29:10] {1150} INFO - at 236.5s, best lgbm's error=0.0778, best lgbm's error=0.0778
[flaml.automl: 06-12 21:29:10] {998} INFO - iteration 69, current learner rf
[flaml.automl: 06-12 21:29:15] {1150} INFO - at 241.1s, best rf's error=0.0820, best lgbm's error=0.0778
[flaml.automl: 06-12 21:29:15] {998} INFO - iteration 70, current learner rf
[flaml.automl: 06-12 21:29:18] {1150} INFO - at 243.9s, best rf's error=0.0820, best lgbm's error=0.0778
[flaml.automl: 06-12 21:29:18] {998} INFO - iteration 71, current learner rf
[flaml.automl: 06-12 21:29:20] {1150} INFO - at 246.8s, best rf's error=0.0820, best lgbm's error=0.0778
[flaml.automl: 06-12 21:29:20] {998} INFO - iteration 72, current learner extra_tree
[flaml.automl: 06-12 21:29:22] {1150} INFO - at 248.1s, best extra_tree's error=0.0915, best lgbm's error=0.0778
[flaml.automl: 06-12 21:29:22] {998} INFO - iteration 73, current learner rf
[flaml.automl: 06-12 21:29:33] {1150} INFO - at 259.3s, best rf's error=0.0736, best rf's error=0.0736
[flaml.automl: 06-12 21:29:33] {998} INFO - iteration 74, current learner extra_tree
[flaml.automl: 06-12 21:29:34] {1150} INFO - at 260.7s, best extra_tree's error=0.0876, best rf's error=0.0736
[flaml.automl: 06-12 21:29:34] {998} INFO - iteration 75, current learner rf
[flaml.automl: 06-12 21:29:39] {1150} INFO - at 265.3s, best rf's error=0.0736, best rf's error=0.0736
[flaml.automl: 06-12 21:29:39] {998} INFO - iteration 76, current learner rf
[flaml.automl: 06-12 21:29:53] {1150} INFO - at 279.0s, best rf's error=0.0724, best rf's error=0.0724
[flaml.automl: 06-12 21:29:53] {998} INFO - iteration 77, current learner extra_tree
[flaml.automl: 06-12 21:29:55] {1150} INFO - at 281.4s, best extra_tree's error=0.0772, best rf's error=0.0724
[flaml.automl: 06-12 21:29:55] {998} INFO - iteration 78, current learner extra_tree
[flaml.automl: 06-12 21:29:57] {1150} INFO - at 282.9s, best extra_tree's error=0.0772, best rf's error=0.0724
[flaml.automl: 06-12 21:29:57] {998} INFO - iteration 79, current learner rf
[flaml.automl: 06-12 21:30:11] {1150} INFO - at 296.9s, best rf's error=0.0709, best rf's error=0.0709
[flaml.automl: 06-12 21:30:11] {998} INFO - iteration 80, current learner extra_tree
[flaml.automl: 06-12 21:30:12] {1150} INFO - at 298.4s, best extra_tree's error=0.0772, best rf's error=0.0709
[flaml.automl: 06-12 21:30:12] {998} INFO - iteration 81, current learner catboost
[flaml.automl: 06-12 21:30:13] {1150} INFO - at 299.0s, best catboost's error=0.1922, best rf's error=0.0709
[flaml.automl: 06-12 21:30:13] {998} INFO - iteration 82, current learner catboost
[flaml.automl: 06-12 21:30:13] {1150} INFO - at 299.3s, best catboost's error=0.1922, best rf's error=0.0709
[flaml.automl: 06-12 21:30:13] {998} INFO - iteration 83, current learner catboost
[flaml.automl: 06-12 21:30:13] {1150} INFO - at 299.6s, best catboost's error=0.1922, best rf's error=0.0709
[flaml.automl: 06-12 21:30:13] {998} INFO - iteration 84, current learner lrl1
No low-cost partial config given to the search algorithm. For cost-frugal search, consider providing low-cost values for cost-related hps via 'low_cost_partial_config'.
[flaml.automl: 06-12 21:30:14] {1150} INFO - at 300.4s, best lrl1's error=0.4582, best rf's error=0.0709
[flaml.automl: 06-12 21:30:14] {1190} INFO - selected model: RandomForestClassifier(max_features=0.5387572316546899, n_estimators=70,
n_jobs=-1)
[flaml.automl: 06-12 21:30:14] {948} INFO - fit succeeded
automl.best_estimator
print("AUC Score:",roc_auc_score(y_test,automl.predict(X_test)))
AUC Score: 0.8383864217623809
os = RandomOverSampler(0.95, random_state=10)
print(X.shape, y.shape)
X, y = os.fit_resample(X, y)
print(X.shape, y.shape)
(14907, 11) (14907,)
(21186, 11) (21186,)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
model1 = CatBoostClassifier(
iterations=250,
od_type="Iter",
l2_leaf_reg=5,
learning_rate=0.95,
verbose=0,
depth=10,
)
model2 = RandomForestClassifier(n_estimators=400, random_state=1)
model3 = GaussianNB()
model4 = LGBMClassifier(
learning_rate=0.1,
n_estimators=1000,
num_leaves=120,
n_jobs=4,
min_child_samples=14,
min_child_weight=10,
)
model = VotingClassifier(
estimators=[("cat", model1), ("RF", model2), ("NB", model3), ("LGBM", model4)],
voting="hard",
)
model.fit(X_train, y_train)
# Make predictions
y_pred = model.predict(X_test)
# Check the ROC score of the model
print("AUC score:", roc_auc_score(y_test, y_pred))
AUC score: 0.9440814430336593
print("Accuracy Score:",accuracy_score(y_test,y_pred))
Accuracy Score: 0.9438414346389806
plt.figure(figsize=(10, 10))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(
cm,
cmap="Blues",
linecolor="black",
linewidth=1,
annot=True,
fmt="",
xticklabels=["Covid_Negative", "Covid_Positive"],
yticklabels=["Covid_Negative", "Covid_Positive"],
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
data = bcell_sars.drop(['parent_protein_id', 'protein_seq', 'peptide_seq'], axis = 1)
data.dropna(inplace=True)
ctgan = CTGANSynthesizer()
# ctgan.fit(data)
loaded = ctgan.load("Covid_model_CTGAN_synth.pkl")
# ctgan.save("Covid_model_CTGAN.pkl")
samples = loaded.sample(20000)
X = samples.drop(['target'], axis = 1)
y = samples['target']
y.value_counts()
os = RandomOverSampler(0.95, random_state=10)
print(X.shape, y.shape)
X, y = os.fit_resample(X, y)
print(X.shape, y.shape)
(20000, 11) (20000,)
(30182, 11) (30182,)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
y_test.value_counts()
model1 = CatBoostClassifier(
iterations=250,
od_type="Iter",
l2_leaf_reg=5,
learning_rate=0.95,
verbose=0,
depth=10,
)
model2 = RandomForestClassifier(n_estimators=400, random_state=1)
model3 = GaussianNB()
model4 = LGBMClassifier(
learning_rate=0.1,
n_estimators=1000,
num_leaves=120,
n_jobs=4,
min_child_samples=14,
min_child_weight=10,
)
model = VotingClassifier(
estimators=[("cat", model1), ("RF", model2), ("NB", model3), ("LGBM", model4)],
voting="hard",
)
model.fit(X_train, y_train)
# Make predictions
y_pred = model.predict(X_test)
# Check the ROC score of the model
print("AUC Score:", roc_auc_score(y_test, y_pred))
AUC Score: 0.9183365822818155
print( "Accuracy Score:",accuracy_score(y_test,y_pred))
Accuracy Score: 0.9181712771243995
# confusion matrix
plt.figure(figsize=(10, 10))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(
cm,
cmap="Blues",
linecolor="black",
linewidth=1,
annot=True,
fmt="",
xticklabels=["Covid_Negative", "Covid_Positive"],
yticklabels=["Covid_Negative", "Covid_Positive"],
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
feature_name = "dae_te"
algo_name = "lgb"
model_name = f"{algo_name}_{feature_name}"
trn, tst = train_test_split(data, test_size=0.2)
n_trn = trn.shape[0]
target_col = "target"
num_cols = [
"start_position",
"end_position",
"chou_fasman",
"emini",
"kolaskar_tongaonkar",
"parker",
"isoelectric_point",
"aromaticity",
"hydrophobicity",
"stability",
]
n_fold = 10
seed = 42
encoding_dim = 256
dae = DAE( num_cols=num_cols, encoding_dim=encoding_dim)
X = dae.fit_transform(data[num_cols])
Epoch 1/10
12/12 [==============================] - 1s 56ms/step - loss: 44282.3690 - val_loss: 25388.5371
Epoch 2/10
12/12 [==============================] - 0s 11ms/step - loss: 22239.4500 - val_loss: 8747.9531
Epoch 3/10
12/12 [==============================] - 0s 11ms/step - loss: 10169.5783 - val_loss: 1051.5721
Epoch 4/10
12/12 [==============================] - 0s 9ms/step - loss: 6112.1935 - val_loss: 156.9111
Epoch 5/10
12/12 [==============================] - 0s 10ms/step - loss: 5675.3639 - val_loss: 86.7912
Epoch 6/10
12/12 [==============================] - 0s 11ms/step - loss: 6682.8845 - val_loss: 215.2645
Epoch 7/10
12/12 [==============================] - 0s 11ms/step - loss: 5887.2354 - val_loss: 167.7630
Epoch 8/10
12/12 [==============================] - 0s 11ms/step - loss: 6023.1328 - val_loss: 72.5685
Epoch 9/10
12/12 [==============================] - 0s 9ms/step - loss: 5758.3880 - val_loss: 48.8078
Epoch 10/10
12/12 [==============================] - 0s 11ms/step - loss: 6271.1780 - val_loss: 50.6213
df_dae = pd.DataFrame(X, columns=[f'dae_{i}' for i in range(encoding_dim)])
print(df_dae.shape)
(14907, 256)
X = pd.concat([data[num_cols], df_dae], axis=1)
y = data[target_col]
os = RandomOverSampler(0.95, random_state=10)
print(X.shape, y.shape)
X, y = os.fit_resample(X, y)
print(X.shape, y.shape)
df = pd.concat([X, y], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
(14907, 266) (14907,)
(21186, 266) (21186,)
%%time
model1 = CatBoostClassifier(iterations=250,od_type="Iter",l2_leaf_reg=5,
learning_rate=0.95,verbose=0,
depth=10)
model2 = RandomForestClassifier(n_estimators=400, random_state=1)
model3 = GaussianNB()
model4= LGBMClassifier(learning_rate=0.1,n_estimators=1000,num_leaves=120,n_jobs =4,min_child_samples= 14,
min_child_weight= 10)
model = VotingClassifier(estimators=[('cat', model1),('RF',model2),("NB",model3),("LGBM",model4)], voting='hard')
model.fit(X_train, y_train)
# Make predictions
y_pred = model.predict(X_test)
# Check the ROC score of the model
print( "AUC Score:",roc_auc_score(y_test,y_pred))
AUC Score: 0.9362778270341501
CPU times: user 8min 23s, sys: 3.66 s, total: 8min 26s
Wall time: 6min 4s
print("Accuracy Score:",accuracy_score(y_test,y_pred))
Accuracy Score: 0.9358187824445493
plt.figure(figsize=(10, 10))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(
cm,
cmap="Blues",
linecolor="black",
linewidth=1,
annot=True,
fmt="",
xticklabels=["Covid_Negative", "Covid_Positive"],
yticklabels=["Covid_Negative", "Covid_Positive"],
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
# !pip install black
!black Vaccine_Development_ML.ipynb
reformatted Vaccine_Development_ML.ipynb
All done! ✨ 🍰 ✨
1 file reformatted.