Several evaluation metrics are used to measure the performance of supervised machine learning classification algorithms, e.g., accuracy, AUC-ROC, Matthews correlation coefficient (MCC), precision, recall, F1 score, and the confusion matrix. Which metric you should choose depends on the input data. For example, if your data are highly imbalanced, accuracy is misleading; MCC or the F1 score is usually a better choice. In this post, I am not going to discuss the details of each evaluation metric; I assume you already understand them. Instead, I will write Python code that uses functions from the sklearn library to compute these metrics. The code also includes the formula for each metric, which should help you understand how it is calculated.
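To see why accuracy can be misleading on imbalanced data, consider this quick sketch (the labels and predictions below are made up purely for illustration):

from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef

# hypothetical, highly imbalanced data: 95 negatives and 5 positives
y_true = [0] * 95 + [1] * 5
# a useless classifier that always predicts the majority class
y_pred = [0] * 100
print(accuracy_score(y_true, y_pred))     # 0.95 -- looks great, but is meaningless here
print(f1_score(y_true, y_pred))           # 0.0  -- exposes the problem
print(matthews_corrcoef(y_true, y_pred))  # 0.0  -- exposes the problem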
Here is the code:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import matthews_corrcoef
def model_training_testing(data, label):
    # use a classification method
    clf = LogisticRegression(max_iter=5000)
    # generate 5-fold cross-validated estimates for each input data point
    # compute predicted probabilities instead of labels
    return cross_val_predict(clf, data, label, cv=5, method='predict_proba')
def compute_classification_evaluation_metrics(probabilities, label):
    # determine y_true and y_predicted
    y_true = []      # store the true label of each record
    y_pred_auc = []  # store class 1 probabilities
    y_pred_acc = []  # store predicted labels
    for j in range(len(label)):
        y_true.append(label[j])
        y_pred_auc.append(probabilities[j][1])         # class 1 probability
        y_pred_acc.append(round(probabilities[j][1]))  # predicted label
    # compute confusion matrix
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred_acc).ravel()
    print("tn, fp, fn, tp ---> ", tn, fp, fn, tp)
    # accuracy
    acc = accuracy_score(y_true, y_pred_acc)
    print("Accuracy ---> {0}".format(acc))
    # AUC-ROC
    roc = roc_auc_score(y_true, y_pred_auc)
    print("AUC-ROC ---> {0}".format(roc))
    # Matthews correlation coefficient (MCC)
    mcc = matthews_corrcoef(y_true, y_pred_acc)
    print("MCC ---> {0}".format(mcc))
    # sensitivity, recall, hit rate, or true positive rate (TPR)
    tpr = tp / (tp + fn)
    print("Recall/Sensitivity ---> {0}".format(tpr))
    # specificity, selectivity, or true negative rate (TNR)
    tnr = tn / (tn + fp)
    print("Specificity ---> {0}".format(tnr))
    # precision or positive predictive value (PPV)
    ppv = tp / (tp + fp)
    print("Precision ---> {0}".format(ppv))
    # negative predictive value (NPV)
    npv = tn / (tn + fn)
    print("Negative Predictive Value ---> {0}".format(npv))
    # miss rate or false negative rate (FNR)
    fnr = 1 - tpr
    print("False Negative Rate ---> {0}".format(fnr))
    # fall-out or false positive rate (FPR)
    fpr = 1 - tnr
    print("False Positive Rate ---> {0}".format(fpr))
    # false discovery rate (FDR)
    fdr = 1 - ppv
    print("False Discovery Rate ---> {0}".format(fdr))
    # false omission rate (FOR)
    fomr = 1 - npv
    print("False Omission Rate ---> {0}".format(fomr))
    # F1 score - harmonic mean of precision and recall [2*tp/(2*tp + fp + fn)]
    f1 = 2 * ppv * tpr / (ppv + tpr)
    print("F1 Score ---> {0}".format(f1))
if __name__ == '__main__':
"""
This program will compute several evaluation metrics that are used in classification algorithms.
"""
# load sklearn breast cancer data
data = load_breast_cancer()
X = data.data
y = data.target # binary label 0 and 1
# get classification results
predicted_probs = model_training_tesing(X, y)
# compute classification evaluation metrics
compute_classification_evaluation_metrics(predicted_probs, y)
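For reference, sklearn also provides most of these metrics as ready-made functions (precision_score, recall_score, f1_score, and classification_report), so you can cross-check the formula-based values above. Here is a minimal sketch, assuming y_true and y_pred_acc are the same lists built inside compute_classification_evaluation_metrics:

from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

# cross-check against the values computed from the confusion matrix
print("Precision ---> {0}".format(precision_score(y_true, y_pred_acc)))
print("Recall    ---> {0}".format(recall_score(y_true, y_pred_acc)))
print("F1 Score  ---> {0}".format(f1_score(y_true, y_pred_acc)))
# per-class precision, recall, and F1 in a single summary table
print(classification_report(y_true, y_pred_acc))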