Source code for src.benchmark.metrics

from __future__ import annotations

# typing imports
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from ..config.runs.benchmark import BenchmarkConfig
    from logging import Logger

import math
import os

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd

from sklearn.metrics import f1_score, recall_score, precision_score
from sklearn.metrics import (
    multilabel_confusion_matrix,
    ConfusionMatrixDisplay,
    PrecisionRecallDisplay,
    classification_report,
)


class Metrics:
    """
    This class is used to compute metrics for a benchmarking run. It is capable of
    generating multiple different artifacts that can be used for further analysis of
    model performance.
    """

    def __init__(
            self,
            config: BenchmarkConfig,
            logger: Logger,
            model_id: str,
            base_folder: str,
            classes: list[str],
            average: str = "weighted"
    ):
        self.config = config
        self.logger = logger
        self.model_id = model_id
        self.classes = classes

        self.base_folder = base_folder
        os.makedirs(self.base_folder, exist_ok=True)

        self.average = average
        self.zero_division = self.config.metrics.zero_division_default
    @staticmethod
    def hamming_score(
            y_true: np.ndarray,
            y_pred: np.ndarray
    ) -> float:
        """
        This function contains the code to compute the Hamming score.

        Hamming score formula:

        >>> Hamming score = (1 / N) * Σ_i |Y_i ∩ Z_i| / |Y_i ∪ Z_i|

        where Y_i is the set of true labels and Z_i the set of predicted labels of
        sample i. The Hamming score is a useful metric for evaluating the performance
        of multi-label classification models, i.e. models that predict multiple labels
        for each instance. In multi-label classification, the Hamming score is more
        sensitive to errors than plain accuracy, as it considers both false positives
        and false negatives.

        :param y_true: matrix with the expected labels
        :param y_pred: matrix with the predicted labels
        :return: the mean Hamming score over all samples
        """
        acc_list = []
        for i in range(y_true.shape[0]):
            set_true = set(np.where(y_true[i])[0])
            set_pred = set(np.where(y_pred[i])[0])

            if len(set_true) == 0 and len(set_pred) == 0:
                # no labels expected and none predicted counts as a perfect match
                tmp_a = 1
            else:
                tmp_a = len(set_true.intersection(set_pred)) / float(len(set_true.union(set_pred)))
            acc_list.append(tmp_a)

        return np.mean(acc_list)
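    # Worked example (a minimal illustration of the implementation above): for
    # y_true = [[1, 0, 1], [0, 1, 0]] and y_pred = [[1, 0, 0], [0, 1, 0]], the
    # per-sample label sets are {0, 2} vs {0} (|∩| / |∪| = 1 / 2) and {1} vs {1}
    # (1 / 1), so
    # Metrics.hamming_score(np.array([[1, 0, 1], [0, 1, 0]]),
    #                       np.array([[1, 0, 0], [0, 1, 0]]))  # -> 0.75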
    def f1_score(
            self,
            y_true: np.ndarray,
            y_pred: np.ndarray
    ) -> float:
        """
        This function contains the code to compute the F1 score.

        F1 score formula:

        >>> F1 = 2 * (precision * recall) / (precision + recall)

        The F1 score is a statistical measure of a test's accuracy, particularly in the
        context of binary classification. It is the harmonic mean of precision and
        recall, which are two important metrics for evaluating the performance of
        binary classifiers.

        :param y_true: matrix with the expected labels
        :param y_pred: matrix with the predicted labels
        :return: the F1 score, aggregated according to ``self.average``
        """
        return f1_score(
            y_true,
            y_pred,
            average=self.average,
            zero_division=self.zero_division
        )
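    # Worked example (a minimal illustration of the formula above): with a precision
    # of 0.5 and a recall of 1.0, F1 = 2 * (0.5 * 1.0) / (0.5 + 1.0) = 2 / 3 ≈ 0.667.
    # Because the class defaults to ``average="weighted"``, the wrapper above returns
    # a single score aggregated over all labels rather than one value per class.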
    def recall_score(
            self,
            y_true: np.ndarray,
            y_pred: np.ndarray
    ) -> float:
        """
        This function computes the recall for the given input.

        The recall formula:

        >>> Recall = TP / (TP + FN)
        >>> where:
        >>>     TP is the number of true positives
        >>>     FN is the number of false negatives

        In machine learning, recall, also known as sensitivity, true positive rate (TPR)
        or completeness, is the proportion of actual positives that are correctly
        identified as such by the model. It is calculated as the number of true
        positives divided by the total number of actual positives. Recall is defined
        per class as a binary decision metric; here it is aggregated across the labels
        according to ``self.average``.

        :param y_true: matrix with the expected labels
        :param y_pred: matrix with the predicted labels
        :return: the recall score, aggregated according to ``self.average``
        """
        return recall_score(
            y_true,
            y_pred,
            average=self.average,
            zero_division=self.zero_division
        )
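    # Worked example (a minimal illustration of the formula above): if a class has
    # 8 true positives and 2 false negatives, recall = 8 / (8 + 2) = 0.8, i.e. 80 %
    # of the actual positives were found.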
    def precision_score(
            self,
            y_true: np.ndarray,
            y_pred: np.ndarray
    ) -> float:
        """
        This function enables the computation of the precision score.

        The precision formula:

        >>> Precision = TP / (TP + FP)
        >>> where:
        >>>     TP is the number of true positives
        >>>     FP is the number of false positives

        In machine learning, precision, also known as positive predictive value (PPV),
        is the proportion of predicted positives that are actually positive. It is
        calculated as the number of true positives divided by the total number of
        predicted positives. Precision is defined per class as a binary decision
        metric; here it is aggregated across the labels according to ``self.average``.

        :param y_true: matrix with the expected labels
        :param y_pred: matrix with the predicted labels
        :return: the precision score, aggregated according to ``self.average``
        """
        return precision_score(
            y_true,
            y_pred,
            average=self.average,
            zero_division=self.zero_division
        )
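    # Worked example (a minimal illustration of the formula above): if a class has
    # 8 true positives and 8 false positives, precision = 8 / (8 + 8) = 0.5, i.e.
    # half of the predicted positives were correct. Raising the decision threshold
    # in ``compute`` below typically trades recall for precision.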
    def compute(
            self,
            y_true: np.ndarray,
            logits: np.ndarray,
            suffix: str | None = None,
            save: bool = True
    ) -> pd.DataFrame:
        """
        This function brings all previous calculations together. Paired with the
        config, you can enable and disable certain artifacts and metrics.

        Metrics that can be used:

        1. Hamming score
        2. F1 score
        3. precision
        4. recall

        Artifacts that can be generated:

        1. confusion matrix
        2. classification report
        3. precision-recall plot
        4. overview plot

        :param y_true: the values that are expected to be predicted
        :param logits: the logits for the predicted values
        :param suffix: a suffix that can be used for custom naming of the artifacts
        :param save: flag to save (currently not used -> config handles this)
        :return: pandas dataframe containing all the metric values
        """
        y_true = np.nan_to_num(y_true)
        logits = np.nan_to_num(logits)

        experiment = {}
        # sweep the decision threshold from 0.00 to 0.99 in steps of 0.01
        for thresh in range(0, 100, 1):
            calc_thresh = thresh / 100.0
            y_pred = np.where(logits > calc_thresh, 1, 0)

            metrics = dict()
            if self.config.metrics.hamming:
                metrics["hamming"] = self.hamming_score(y_true, y_pred)
            if self.config.metrics.precision:
                metrics["precision_score"] = self.precision_score(y_true, y_pred)
            if self.config.metrics.recall:
                metrics["recall_score"] = self.recall_score(y_true, y_pred)
            if self.config.metrics.f1:
                metrics["f1_score"] = self.f1_score(y_true, y_pred)
            if self.config.evaluation.with_threshold:
                metrics["threshold"] = calc_thresh

            experiment[thresh] = metrics
            mlflow.log_metrics(metrics, step=thresh)

            if self.config.evaluation.confusion_matrix:
                confusion_matrix_folder = os.path.join(
                    self.base_folder,
                    "confusion_matrix"
                )
                os.makedirs(confusion_matrix_folder, exist_ok=True)

                cms = multilabel_confusion_matrix(
                    y_true,
                    y_pred
                )

                # arrange the per-class confusion matrices on a square grid
                row = math.ceil(math.sqrt(len(self.classes)))
                confusion_matrix_save_path = os.path.join(
                    confusion_matrix_folder,
                    f"{thresh:02}_confusion_matrix.png"
                )

                f, axes = plt.subplots(row, row, figsize=(row * 5, row * 5))
                axes = axes.ravel()

                disp = None
                for i, (cm, label) in enumerate(zip(cms, self.classes)):
                    disp = ConfusionMatrixDisplay(cm)
                    disp.plot(ax=axes[i], values_format='.4g')
                    disp.ax_.set_title(f'class {label}')
                    disp.im_.colorbar.remove()

                plt.subplots_adjust(wspace=0.10, hspace=0.1)
                f.colorbar(disp.im_, ax=axes)
                f.savefig(confusion_matrix_save_path)
                plt.close(f)

            if self.config.evaluation.classification_report:
                classification_report_folder = os.path.join(
                    self.base_folder,
                    "classification_report"
                )
                os.makedirs(classification_report_folder, exist_ok=True)

                classification_report_file = os.path.join(
                    classification_report_folder,
                    f"{thresh:02}_classification_report.txt"
                )

                classification_result = classification_report(
                    y_true,
                    y_pred,
                    output_dict=False,
                    target_names=self.classes,
                    zero_division=self.zero_division
                )

                with open(classification_report_file, 'w+') as f:
                    f.write(classification_result)

        # one row per threshold step, one column per metric
        stat_df = pd.DataFrame(experiment).T

        if self.config.evaluation.precision_recall_plot:
            precision_recall_save_path = os.path.join(
                self.base_folder,
                "precision_recall_plot.png"
            )
            disp = PrecisionRecallDisplay(
                recall=stat_df["recall_score"],
                precision=stat_df["precision_score"]
            )
            fig = disp.plot().figure_
            fig.savefig(precision_recall_save_path)
            plt.close(fig)

        if self.config.evaluation.overview_plot:
            figure = stat_df.plot(
                figsize=(18, 8),
                title=f"Score related to threshold - {self.model_id}",
                xlabel="threshold steps 0->100",
                ylabel="score"
            ).get_figure()
            save_location = os.path.join(
                self.base_folder,
                f"{self.model_id.replace('/', '_')}_scores{('_' + suffix) if suffix is not None else ''}.png"
            )
            figure.savefig(save_location)
            plt.close(figure)

        mlflow.log_artifacts(self.base_folder)
        return stat_df.T
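
# Usage sketch (illustrative; it assumes a populated ``BenchmarkConfig`` and a
# standard ``logging.Logger`` are available, and the model id, folder and class
# names below are hypothetical placeholders):
#
#   metrics = Metrics(
#       config=config,                      # BenchmarkConfig with .metrics / .evaluation flags
#       logger=logging.getLogger(__name__),
#       model_id="my-org/my-model",
#       base_folder="artifacts/benchmark",
#       classes=["cat", "dog", "bird"],
#   )
#   stat_df = metrics.compute(y_true, logits)  # sweeps thresholds 0.00 .. 0.99
#
# ``compute`` logs the enabled metrics to MLflow at every threshold step, writes the
# enabled artifacts under ``base_folder`` and finally uploads that folder via
# ``mlflow.log_artifacts``.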