Source code for DashAI.back.explainability.explainers.permutation_feature_importance

from typing import List, Tuple, Union

import numpy as np
import pandas as pd
import plotly
import plotly.express as px
from datasets import DatasetDict
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score, balanced_accuracy_score, make_scorer

from DashAI.back.core.schema_fields import (
    BaseSchema,
    enum_field,
    int_field,
    schema_field,
)
from DashAI.back.explainability.global_explainer import BaseGlobalExplainer
from DashAI.back.models import BaseModel


class PermutationFeatureImportanceSchema(BaseSchema):
    """
    Permutation Feature Importance is a explanation method to asses the
    importance of each feature in a model by evaluating how much the model's
    performance decreases when the values of a specific feature are randomly
    shuffled.
    """

    scoring: schema_field(
        enum_field(enum=["accuracy", "balanced_accuracy"]),
        placeholder="accuracy",
        description="Scorer to evaluate how the perfomance of the model "
        "changes when a particular feature is shuffled.",
    )  # type: ignore

    n_repeats: schema_field(
        int_field(ge=1),
        placeholder=10,
        description="Number of times to permute a feature.",
    )  # type: ignore

    random_state: schema_field(
        int_field(),
        placeholder=0,
        description="Seed for the random number generator to control the "
        "permutations of each feature.",
    )  # type: ignore

    max_samples: schema_field(
        int_field(ge=1),
        placeholder=100,
        description="The number of samples to draw from the dataset to "
        "calculate feature importance at each repetition.",
    )  # type: ignore


[docs]class PermutationFeatureImportance(BaseGlobalExplainer): """Permutation Feature Importance is a explanation method to asses the importance of each feature in a model by evaluating how much the model's performance decreases when the values of a specific feature are randomly shuffled. """ COMPATIBLE_COMPONENTS = ["TabularClassificationTask"] SCHEMA = PermutationFeatureImportanceSchema
[docs] def __init__( self, model: BaseModel, scoring: Union[str, List[str], None] = None, n_repeats: int = 5, random_state: Union[int, None] = None, max_samples: int = 1, ): """Initialize a new instance of PermutationFeatureImportance explainer. Parameters ---------- model: BaseModel Model to be explained scoring: Union[str, List[str], None] Scorer to evaluate how the perfomance of the model changes when a particular feature is shuffled n_repeats: int Numer of times to permute a feature random_state: Union[int, None] Seed for the random number generator to control the permutations of each feature max_samples: int The number of samples to draw from the dataset to calculate feature importance at each repetition """ super().__init__(model) metrics = { "accuracy": accuracy_score, "balanced_accuracy": balanced_accuracy_score, } self.scoring = metrics[scoring] self.n_repeats = n_repeats self.random_state = random_state self.max_samples = max_samples
def explain(self, dataset: Tuple[DatasetDict, DatasetDict]): """Method for calculating the importance of features in the model Parameters ---------- dataset: Tuple[DatasetDict, DatasetDict] Tuple with (input_samples, targets) used to generate the explanation. Returns ------- dict Dictionary with the features names and the avarage importance of each feature """ x, y = dataset # Select split x_test = x["test"] y_test = y["test"] input_columns = list(x_test.features) output_columns = list(y_test.features) input_columns = list(x_test.features) output_columns = list(y_test.features) types = {column: "Categorical" for column in output_columns} y_test = y_test.change_columns_type(types) def patched_metric(y_true, y_pred_probas): return self.scoring(y_true, np.argmax(y_pred_probas, axis=1)) # TODO: binary and multi-label scorer pfi = permutation_importance( estimator=self.model, X=x_test.to_pandas(), y=y_test.to_pandas(), scoring=make_scorer(patched_metric), n_repeats=self.n_repeats, random_state=self.random_state, max_samples=self.max_samples, ) return { "features": input_columns, "importances_mean": np.round(pfi["importances_mean"], 3).tolist(), "importances_std": np.round(pfi["importances_std"], 3).tolist(), } def _create_plot(self, data: pd.DataFrame, n_features: int): """Helper method to create the explanation plot using plotly. Parameters ---------- data: pd.DataFrame dataframe containing the data to be plotted. n_features: int number of features to be displayed initially in the plot. Returns: List[dict] list of JSONs containing the information of the explanation plot to be rendered. """ fig = px.bar( data.iloc[-n_features:], x=data.iloc[-n_features:]["importances_mean"], y=data.iloc[-n_features:]["features"], error_x=data.iloc[-n_features:]["importances_std"], ) fig.update_layout( xaxis_title="Importance", yaxis_title=None, annotations=[ { "text": "", "showarrow": False, "x": 0, "y": 1.15, "xanchor": "left", "xref": "paper", "yref": "paper", "yanchor": "top", } ], updatemenus=[ { "x": 0, "xanchor": "left", "y": 1.2, "yanchor": "top", "buttons": [ { "label": f"N° features: {len(data.iloc[-c:,])}", "method": "restyle", "args": [ { "x": [data.iloc[-c:]["importances_mean"]], "y": [data.iloc[-c:]["features"]], "error_x": [data.iloc[-c:]["importances_std"]], }, ], } for c in range(len(data)) ], } ], ) return [plotly.io.to_json(fig)] def plot(self, explanation: dict) -> List[dict]: """Method to create the explanation plot. Parameters ---------- explanation: dict dictionary with the explanation generated by the explainer. Returns: List[dict] list of JSON containing the information of the explanation plot to be rendered. """ n_features = 10 data = pd.DataFrame.from_dict(explanation) data = data.sort_values(by=["importances_mean"], ascending=True) if n_features > len(data): n_features = len(data) return self._create_plot(data, n_features)