# Source code for DashAI.back.exploration.explorers.cov_matrix

import os
import pathlib

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from beartype.typing import Any, Dict, Union

from DashAI.back.core.schema_fields import bool_field, int_field, schema_field
from DashAI.back.dataloaders.classes.dashai_dataset import (  # ClassLabel, Value,
    DashAIDataset,
)
from DashAI.back.dependencies.database.models import Exploration, Explorer
from DashAI.back.exploration.base_explorer import BaseExplorer, BaseExplorerSchema


class CovarianceMatrixExplorerSchema(BaseExplorerSchema):
    # Parameters accepted by CovarianceMatrixExplorer. The explorer reads these
    # keys from its constructor kwargs and forwards min_periods,
    # delta_degree_of_freedom (as ``ddof``) and numeric_only to
    # ``DataFrame.cov``; ``plot`` selects heatmap vs. tabular output.
    # Each schema_field call below passes, in order, a field constraint, a
    # default value and a human-readable description.
    min_periods: schema_field(
        int_field(gt=0),
        1,
        (
            "The minimum number of observations required per pair of columns to"
            " have a valid result."
        ),
    )  # type: ignore
    # NOTE(review): gt=0 appears to reject ddof=0 (population covariance), and
    # pandas documents ddof as applying only when the frame contains no NaN
    # rather than depending on numeric_only -- confirm both are intended.
    delta_degree_of_freedom: schema_field(
        int_field(gt=0),
        1,
        (
            "The delta degree of freedom to use when calculating the covariance "
            "matrix. Only used if numeric_only is True."
        ),
    )  # type: ignore
    numeric_only: schema_field(
        bool_field(),
        True,
        (
            "If True, only include numeric columns when calculating covariance. "
            "If False, all columns are included."
        ),
    )  # type: ignore
    plot: schema_field(
        bool_field(),
        True,
        ("If True, the result will be plotted."),
    )  # type: ignore


class CovarianceMatrixExplorer(BaseExplorer):
    """
    Explorer that returns the covariance matrix of the dataset.

    The matrix is obtained by calling ``cov`` on the dataset converted with
    ``to_pandas()``. When ``plot`` is truthy the result is a Plotly heatmap;
    otherwise the covariance ``DataFrame`` itself is returned and later served
    as a tabular result.
    """

    DISPLAY_NAME = "Covariance Matrix"
    DESCRIPTION = (
        "CovarianceExplorer is an explorer that returns the covariance matrix "
        "of the dataset."
        "\n"
        "Its result is a heatmap by default, "
        "but can also be returned as a tabular result."
    )

    SCHEMA = CovarianceMatrixExplorerSchema
    metadata: Dict[str, Any] = {
        "allowed_dtypes": ["*"],
        "restricted_dtypes": [],
        "input_cardinality": {"min": 2},
    }

    def __init__(self, **kwargs) -> None:
        """
        Store the explorer parameters and initialise the base explorer.

        Parameters
        ----------
        **kwargs
            Explorer parameters. The keys ``delta_degree_of_freedom``,
            ``min_periods``, ``numeric_only`` and ``plot`` are read here; the
            full mapping is forwarded unchanged to ``BaseExplorer.__init__``.
        """
        # Fall back to the defaults declared in CovarianceMatrixExplorerSchema
        # so an instance built without explicit parameters behaves like one
        # built from a default schema (heatmap over numeric columns).
        self.ddof = kwargs.get("delta_degree_of_freedom", 1)
        self.min_periods = kwargs.get("min_periods", 1)
        self.numeric_only = kwargs.get("numeric_only", True)
        self.plot = kwargs.get("plot", True)
        super().__init__(**kwargs)

    def launch_exploration(
        self, dataset: DashAIDataset, explorer_info: Explorer
    ) -> Union[pd.DataFrame, go.Figure]:
        """
        Compute the covariance matrix and optionally render it as a heatmap.

        Parameters
        ----------
        dataset : DashAIDataset
            Dataset to explore; it is converted with ``to_pandas()``.
        explorer_info : Explorer
            Explorer record. Its ``columns`` length is used in the default
            plot title and its ``name``, when not empty, replaces that title.

        Returns
        -------
        Union[pd.DataFrame, go.Figure]
            The heatmap figure when ``self.plot`` is truthy, otherwise the
            covariance ``DataFrame``.
        """
        covariance = dataset.to_pandas().cov(
            min_periods=self.min_periods,
            ddof=self.ddof,
            numeric_only=self.numeric_only,
        )
        if not self.plot:
            return covariance

        figure = px.imshow(
            covariance,
            text_auto=True,
            aspect="auto",
            title=f"Covariance Matrix of {len(explorer_info.columns)} columns",
        )
        # A user-provided explorer name takes precedence over the generic title.
        if explorer_info.name is not None and explorer_info.name != "":
            figure.update_layout(title=f"{explorer_info.name}")
        return figure

    def save_exploration(
        self,
        __exploration_info__: Exploration,
        explorer_info: Explorer,
        save_path: pathlib.Path,
        result: Union[pd.DataFrame, go.Figure],
    ) -> str:
        """
        Write the exploration result to ``<save_path>/<explorer id>.json``.

        Parameters
        ----------
        __exploration_info__ : Exploration
            Exploration record; not used by this explorer.
        explorer_info : Explorer
            Explorer record whose ``id`` names the output file.
        save_path : pathlib.Path
            Directory in which the JSON file is written.
        result : Union[pd.DataFrame, go.Figure]
            Value returned by ``launch_exploration``.

        Returns
        -------
        str
            POSIX-style path of the written file.

        Raises
        ------
        TypeError
            If ``result`` is not a ``go.Figure`` while plotting is enabled, or
            not a ``pd.DataFrame`` while it is disabled.
        """
        path = pathlib.Path(save_path) / f"{explorer_info.id}.json"

        # Explicit checks instead of ``assert`` so the validation still runs
        # when Python is started with -O.
        if self.plot:
            if not isinstance(result, go.Figure):
                raise TypeError(
                    "Expected a plotly Figure when plot is enabled, "
                    f"got {type(result).__name__}."
                )
            result.write_json(path)
        else:
            if not isinstance(result, pd.DataFrame):
                raise TypeError(
                    "Expected a pandas DataFrame when plot is disabled, "
                    f"got {type(result).__name__}."
                )
            result.to_json(path)
        return path.as_posix()

    def get_results(
        self, exploration_path: str, options: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Load a saved exploration and wrap it in a result payload.

        Parameters
        ----------
        exploration_path : str
            Path of the JSON file written by ``save_exploration``.
        options : Dict[str, Any]
            Display options. Only ``"orientation"`` is read (default
            ``"dict"``) and only for tabular results, where it is passed as
            ``orient`` to ``DataFrame.to_dict``.

        Returns
        -------
        Dict[str, Any]
            A mapping with the keys ``"type"`` (``"plotly_json"`` or
            ``"tabular"``), ``"data"`` and ``"config"``.
        """
        path = pathlib.Path(exploration_path)

        if self.plot:
            figure_json = pio.read_json(path).to_json()
            return {"type": "plotly_json", "data": figure_json, "config": {}}

        orientation = options.get("orientation", "dict")
        # NaN entries are replaced by None before the table is converted
        # to plain Python containers.
        table = (
            pd.read_json(path).replace({np.nan: None}).T.to_dict(orient=orientation)
        )
        return {
            "type": "tabular",
            "data": table,
            "config": {"orient": orientation},
        }