from sklearn.datasets import make_classification
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
# Failure of Accuracy
# Model Evaluation
def evaluate_model(X, y, model):
    """Return the per-fold accuracy scores of ``model`` on ``(X, y)``.

    The model is scored with repeated stratified k-fold cross-validation
    (10 splits, 3 repeats, fixed seed), so 30 accuracy values come back.

    Args:
        X: Feature matrix passed through to ``cross_val_score``.
        y: Target labels passed through to ``cross_val_score``.
        model: Unfitted estimator to evaluate.

    Returns:
        The array of accuracy scores returned by ``cross_val_score``.
    """
    # Stratified folds keep the class ratio in every split; the fixed
    # random_state makes the fold assignment reproducible.
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv)
    return scores
# Build a 99:1 imbalanced two-class dataset with no label noise.
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
                           n_clusters_per_class=1, weights=[0.99], flip_y=0,
                           random_state=4)
# Baseline that always predicts the majority class.
model = DummyClassifier(strategy='most_frequent')
scores = evaluate_model(X, y, model)
# Accuracy looks high here only because of the class imbalance.
print(f'Mean Accuracy: {scores.mean()*100 : 0.2f}%')
# Precision, Recall, and F-measure
# Calculate Precision
# Calculates precision for a 1:100 dataset with 90 TP and 30 FP
from sklearn.metrics import precision_score,recall_score
# define actual: 100 positives followed by 10,000 negatives
act_pos = [1] * 100
act_neg = [0] * 10000
y_true = act_pos + act_neg
# define predictions: 90 TP / 10 FN on the positives, 30 FP on the negatives
pred_pos = [0] * 10 + [1] * 90
pred_neg = [1] * 30 + [0] * 9970
y_pred = pred_pos + pred_neg
# calculate precision: TP / (TP + FP) = 90 / 120
precision = precision_score(y_true, y_pred, average='binary')
print(f'Precision: {precision: 0.3f}')
# Output recorded in the original notebook: Precision: 0.750
# Calculates precision for a 1:1:100 dataset with 50 TP / 20 FP for class 1
# and 99 TP / 51 FP for class 2
# define actual: 100 of class 1, 100 of class 2, 10,000 of class 0
act_pos1 = [1] * 100
act_pos2 = [2] * 100
act_neg = [0] * 10000
y_true = act_pos1 + act_pos2 + act_neg
# define predictions
pred_pos1 = [0] * 50 + [1] * 50
pred_pos2 = [0] * 1 + [2] * 99
pred_neg = [1] * 20 + [2] * 51 + [0] * 9929
y_pred = pred_pos1 + pred_pos2 + pred_neg
# calculate precision, micro-averaged over the two positive classes only:
# (50 + 99) / (50 + 20 + 99 + 51)
precision = precision_score(y_true, y_pred, labels=[1, 2], average='micro')
print(' Precision: %.3f ' % precision)
# Output recorded in the original notebook: Precision: 0.677
# Calculate Recall
# Calculates recall for a 1:100 dataset with 90 TP and 10 FN
from sklearn.metrics import recall_score
# define actual: 100 positives followed by 10,000 negatives
act_pos = [1] * 100
act_neg = [0] * 10000
y_true = act_pos + act_neg
# define predictions: 90 TP / 10 FN on the positives, no false positives
pred_pos = [0] * 10 + [1] * 90
pred_neg = [0] * 10000
y_pred = pred_pos + pred_neg
# calculate recall: TP / (TP + FN) = 90 / 100
recall = recall_score(y_true, y_pred, average='binary')
print(f'Recall: {recall: .3f}')
# Output recorded in the original notebook: Recall: 0.900
# Calculates recall for a 1:1:100 dataset with 77 TP / 23 FN for class 1
# and 95 TP / 5 FN for class 2
# define actual: 100 of class 1, 100 of class 2, 10,000 of class 0
act_pos1 = [1] * 100
act_pos2 = [2] * 100
act_neg = [0] * 10000
y_true = act_pos1 + act_pos2 + act_neg
# define predictions
pred_pos1 = [0] * 23 + [1] * 77
pred_pos2 = [0] * 5 + [2] * 95
pred_neg = [0] * 10000
y_pred = pred_pos1 + pred_pos2 + pred_neg
# calculate recall, micro-averaged over the two positive classes only:
# (77 + 95) / (100 + 100)
recall = recall_score(y_true, y_pred, labels=[1, 2], average='micro')
print(f'Recall: {recall:.3f}')
# Output recorded in the original notebook: Recall: 0.860
# Calculates F1 for a 1:100 dataset with 95 TP, 5 FN, and 55 FP
from sklearn.metrics import f1_score
# define actual: 100 positives followed by 10,000 negatives
act_pos = [1] * 100
act_neg = [0] * 10000
y_true = act_pos + act_neg
# define predictions: 95 TP / 5 FN on the positives, 55 FP on the negatives
pred_pos = [0] * 5 + [1] * 95
pred_neg = [1] * 55 + [0] * 9945
y_pred = pred_pos + pred_neg
# calculate score: harmonic mean of precision (95/150) and recall (95/100)
score = f1_score(y_true, y_pred, average='binary')
print(f'F-measure: {score: .3f} ')
# Output recorded in the original notebook: F-measure: 0.760
# ROC Curves and Precision-Recall Curves
from sklearn.datasets import make_classification
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve
from matplotlib import pyplot
# Generate a two-class dataset and split it 50/50 into train and test sets.
X, y = make_classification(n_samples=1000, n_classes=2, random_state=1)
trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.5,
                                                random_state=2)
# ROC Curves and ROC AUC
# Fit a logistic regression model on the training half.
model = LogisticRegression(solver='lbfgs')
model.fit(trainX, trainy)
# Predict class probabilities and keep only the positive-class column.
yhat = model.predict_proba(testX)
pos_probs = yhat[:, 1]
# False/true positive rates across the decision thresholds.
fpr, tpr, _ = roc_curve(testy, pos_probs)
# The diagonal is the reference line for a classifier with no skill.
pyplot.plot([0, 1], [0, 1], linestyle='--', label='No Skill')
pyplot.plot(fpr, tpr, marker='.', label='Logistic')
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
# The label= arguments above are only rendered once a legend is requested.
pyplot.legend()
# Show the figure now so later plots do not land on the same axes.
pyplot.show()
# ROC Area Under Curve (AUC) Score
# No-skill baseline: predicts labels at random following the class prior.
model = DummyClassifier(strategy='stratified')
model.fit(trainX, trainy)
yhat = model.predict_proba(testX)
pos_probs = yhat[:, 1]
roc_auc = roc_auc_score(testy, pos_probs)
print(f'No skill ROC AUC {roc_auc: .3f}')
# Output recorded in the original notebook: No skill ROC AUC 0.472
# (no random_state is set on the dummy model, so the value varies per run)
# Skilled model: logistic regression scored with ROC AUC on the test half.
model = LogisticRegression(solver='lbfgs')
model.fit(trainX, trainy)
yhat = model.predict_proba(testX)
# Keep only the positive-class probabilities.
pos_probs = yhat[:, 1]
roc_auc = roc_auc_score(testy, pos_probs)
print(f'Logistic ROC AUC {roc_auc: 0.3f}')
# Output recorded in the original notebook: Logistic ROC AUC 0.903
# Precision-Recall Curves and AUC
# A no-skill classifier's precision equals the fraction of positive samples.
no_skill = len(y[y == 1]) / len(y)
# pos_probs holds the positive-class probabilities of the most recently
# fitted model.
precision, recall, _ = precision_recall_curve(testy, pos_probs)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
pyplot.plot(recall, precision, marker='.', label='logistic')
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')
# The label= arguments above are only rendered once a legend is requested.
pyplot.legend()
pyplot.show()
# Precision-Recall Area Under Curve (AUC) Score
from sklearn.metrics import auc
# No-skill baseline: predicts labels at random following the class prior.
model = DummyClassifier(strategy='stratified')
model.fit(trainX, trainy)
yhat = model.predict_proba(testX)
pos_probs = yhat[:, 1]
precision, recall, _ = precision_recall_curve(testy, pos_probs)
# Area under the precision-recall curve (recall on x, precision on y).
auc_score = auc(recall, precision)
print(f'No Skill PR AUC: {auc_score: 0.3f}')
# Output recorded in the original notebook: No Skill PR AUC: 0.607
# (no random_state is set on the dummy model, so the value varies per run)
# Skilled model: logistic regression scored with precision-recall AUC.
model = LogisticRegression(solver='lbfgs')
model.fit(trainX, trainy)
yhat = model.predict_proba(testX)
# Keep only the positive-class probabilities.
pos_probs = yhat[:, 1]
precision, recall, _ = precision_recall_curve(testy, pos_probs)
# Area under the precision-recall curve (recall on x, precision on y).
auc_score = auc(recall, precision)
print(f'Logistic PR AUC: {auc_score: 0.3f}')
# Output recorded in the original notebook: Logistic PR AUC: 0.898