# Source code for DashAI.back.exploration.explorers.describe_explorer

import os
import pathlib

import numpy as np
import pandas as pd
from beartype.typing import Any, Dict

from DashAI.back.core.schema_fields import (
    enum_field,
    none_type,
    schema_field,
    string_field,
)
from DashAI.back.dataloaders.classes.dashai_dataset import (  # ClassLabel, Value,
    DashAIDataset,
)
from DashAI.back.dependencies.database.models import Exploration, Explorer
from DashAI.back.exploration.base_explorer import BaseExplorer, BaseExplorerSchema


# Parameter schema for the describe explorer. It declares the three
# user-configurable options: `percentiles`, `include` and `exclude`.
# Each field is built with `schema_field`; the positional arguments appear to
# be (field type, default value, description) -- confirm against
# DashAI.back.core.schema_fields.
class DescribeExplorerSchema(BaseExplorerSchema):
    # Optional comma-separated string of percentiles, default "25, 50, 75".
    # NOTE(review): the description says "list of integers", but the value is
    # declared as a single string such as "25, 50, 75".
    percentiles: schema_field(
        none_type(string_field()),
        "25, 50, 75",
        (
            "The percentiles to include in the exploration. "
            "Must be a list of integers between 0 and 100.\n"
            "Example: '25, 50, 75'"
        ),
    )  # type: ignore
    # Optional dtype group to include; one of the listed choices, default "all".
    include: schema_field(
        none_type(enum_field(["all", "number", "object", "category", "datetime"])),
        "all",
        ("The data types to include in the exploration.\n"),
    )  # type: ignore
    # Optional dtype group to exclude; default None. "all" is not a valid
    # choice here.
    exclude: schema_field(
        none_type(enum_field(["object", "number", "category", "datetime"])),
        None,
        ("The data types to exclude in the exploration."),
    )  # type: ignore


class DescribeExplorer(BaseExplorer):
    """Explorer that summarises a dataset with ``pandas.DataFrame.describe``.

    The result is a table holding count, mean, std, min, the requested
    percentiles and max for numeric columns, and count, unique, top and freq
    for object columns. The percentiles to report and the dtype groups to
    include or exclude are configurable through ``DescribeExplorerSchema``.
    """

    DISPLAY_NAME = "Describe Dataset"
    DESCRIPTION = (
        "DescribeExplorer is an explorer that describes the dataset. It returns"
        " a tabular representation of the dataset with the count, mean, std, min,"
        " 25%, 50%, 75%, and max values for numeric columns and count, unique,"
        " top, and freq values for object columns."
        "\n"
        "The user can specify the percentiles to include in the exploration and"
        " the data types to include or exclude."
    )

    SCHEMA = DescribeExplorerSchema
    metadata: Dict[str, Any] = {
        "allowed_dtypes": ["*"],
        "restricted_dtypes": [],
        "input_cardinality": {"min": 1},
    }

    def __init__(self, **kwargs) -> None:
        """Normalise the raw parameters and store them on the instance.

        Parameters
        ----------
        **kwargs
            Explorer parameters. ``percentiles`` is a comma-separated string
            of values on a 0-100 scale (or None); ``include`` and ``exclude``
            are single dtype-group names (or None). Missing keys are treated
            as None. The normalised values are written back into ``kwargs``
            before it is forwarded to the parent constructor.
        """
        percentiles = self._parse_percentiles(kwargs.get("percentiles"))

        # ``describe`` takes the literal "all" or a list of dtype selectors,
        # so a single selector is wrapped in a list.
        include = kwargs.get("include")
        if include and include != "all":
            include = [include]

        exclude = kwargs.get("exclude")
        if exclude:
            exclude = [exclude]

        kwargs["percentiles"] = percentiles
        kwargs["include"] = include
        kwargs["exclude"] = exclude

        self.percentiles = percentiles
        self.include = include
        self.exclude = exclude

        super().__init__(**kwargs)

    @staticmethod
    def _parse_percentiles(raw: Any) -> Any:
        """Turn a string like ``"25, 50, 75"`` into ``[0.25, 0.5, 0.75]``.

        Returns None when ``raw`` is None, empty, or has no non-blank tokens.
        Blank tokens (e.g. from a trailing comma) are skipped.
        """
        if not raw:
            return None
        tokens = (token.strip() for token in raw.split(","))
        fractions = [float(token) / 100 for token in tokens if token]
        return fractions or None

    @classmethod
    def validate_parameters(cls, params: Dict[str, Any]) -> bool:
        """Check that ``params`` can be used to build this explorer.

        Parameters
        ----------
        params : Dict[str, Any]
            Raw explorer parameters.

        Returns
        -------
        bool
            False when a percentile is not an integer in [0, 100] or when
            ``include`` and ``exclude`` name the same dtype group; True
            otherwise.

        Raises
        ------
        Exception
            Whatever ``SCHEMA.model_validate`` raises when ``params`` does not
            match the schema.
        """
        cls.SCHEMA.model_validate(params)

        # Selecting and excluding the same dtype group can never succeed:
        # pandas rejects overlapping include/exclude selections.
        exclude = params.get("exclude")
        if exclude and params.get("include") == exclude:
            return False

        raw_percentiles = params.get("percentiles")
        # A blank string means "no percentiles", mirroring __init__.
        if raw_percentiles and raw_percentiles.strip():
            for token in raw_percentiles.split(","):
                try:
                    value = int(token)
                except ValueError:
                    return False
                if not 0 <= value <= 100:
                    return False
        return True

    def launch_exploration(
        self, dataset: DashAIDataset, __explorer_info__: Explorer
    ) -> pd.DataFrame:
        """Run ``describe`` on the dataset with the configured options.

        Parameters
        ----------
        dataset : DashAIDataset
            Dataset to describe; converted with ``to_pandas()``.
        __explorer_info__ : Explorer
            Unused.

        Returns
        -------
        pd.DataFrame
            The output of ``DataFrame.describe``.
        """
        frame = dataset.to_pandas()
        if self.include == "all" and self.exclude:
            # pandas refuses ``exclude`` together with ``include="all"``, so
            # drop the excluded dtypes first and describe what is left.
            frame = frame.select_dtypes(exclude=self.exclude)
            return frame.describe(percentiles=self.percentiles, include="all")
        return frame.describe(
            percentiles=self.percentiles,
            include=self.include,
            exclude=self.exclude,
        )

    def save_exploration(
        self,
        __exploration_info__: Exploration,
        explorer_info: Explorer,
        save_path: pathlib.Path,
        result: pd.DataFrame,
    ) -> str:
        """Write ``result`` as JSON to ``<save_path>/<explorer_info.id>.json``.

        Returns
        -------
        str
            POSIX-style path of the written file.
        """
        path = pathlib.Path(save_path) / f"{explorer_info.id}.json"
        result.to_json(path)
        return path.as_posix()

    def get_results(
        self, exploration_path: str, options: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Load a saved exploration and shape it for display.

        Parameters
        ----------
        exploration_path : str
            Path of the JSON file written by ``save_exploration``.
        options : Dict[str, Any]
            May carry ``"orientation"``, forwarded to ``DataFrame.to_dict``
            as ``orient`` (default ``"dict"``).

        Returns
        -------
        Dict[str, Any]
            ``{"type": "tabular", "data": ..., "config": {"orient": ...}}``.
        """
        result_type = "tabular"
        orientation = options.get("orientation", "dict")
        config = {"orient": orientation}

        frame = pd.read_json(pathlib.Path(exploration_path))
        # Swap NaN for None, then transpose before converting to a dict.
        data = frame.replace({np.nan: None}).T.to_dict(orient=orientation)
        return {"type": result_type, "data": data, "config": config}