import os
import pathlib
import numpy as np
import pandas as pd
from beartype.typing import Any, Dict
from DashAI.back.core.schema_fields import (
enum_field,
none_type,
schema_field,
string_field,
)
from DashAI.back.dataloaders.classes.dashai_dataset import ( # ClassLabel, Value,
DashAIDataset,
)
from DashAI.back.dependencies.database.models import Exploration, Explorer
from DashAI.back.exploration.base_explorer import BaseExplorer, BaseExplorerSchema
class DescribeExplorerSchema(BaseExplorerSchema):
    # Parameter schema for DescribeExplorer. Each field is declared through
    # `schema_field(type, default, description)`.
    # NOTE(review): documented with comments rather than a class docstring on
    # purpose, so the text cannot leak into any schema generated from this
    # class.

    # Comma-separated percentiles given as a single string; None is allowed.
    percentiles: schema_field(
        none_type(string_field()),
        "25, 50, 75",
        (
            "The percentiles to include in the exploration. Must be a list of "
            "integers between 0 and 100.\n"
            "Example: '25, 50, 75'"
        ),
    )  # type: ignore

    # Single dtype family to describe, or "all" for every column.
    include: schema_field(
        none_type(
            enum_field(["all", "number", "object", "category", "datetime"]),
        ),
        "all",
        "The data types to include in the exploration.\n",
    )  # type: ignore

    # Single dtype family to leave out; None (the default) excludes nothing.
    exclude: schema_field(
        none_type(
            enum_field(["object", "number", "category", "datetime"]),
        ),
        None,
        "The data types to exclude in the exploration.",
    )  # type: ignore
class DescribeExplorer(BaseExplorer):
    """
    DescribeExplorer is an explorer that uses the pandas describe method to
    describe the dataset. It returns a tabular representation of the dataset
    with the count, mean, std, min, 25%, 50%, 75%, and max values for numeric
    columns and count, unique, top, and freq values for object columns.
    The user can specify the percentiles to include in the exploration and the
    data types to include or exclude.
    """

    DISPLAY_NAME = "Describe Dataset"
    DESCRIPTION = (
        "DescribeExplorer is an explorer that describes the dataset. It returns"
        " a tabular representation of the dataset with the count, mean, std, min,"
        " 25%, 50%, 75%, and max values for numeric columns and count, unique,"
        " top, and freq values for object columns."
        "\n"
        "The user can specify the percentiles to include in the exploration and"
        " the data types to include or exclude."
    )

    SCHEMA = DescribeExplorerSchema
    metadata: Dict[str, Any] = {
        "allowed_dtypes": ["*"],
        "restricted_dtypes": [],
        "input_cardinality": {"min": 1},
    }

    def __init__(self, **kwargs) -> None:
        """Normalize the user parameters into the form ``describe`` expects.

        Parameters
        ----------
        **kwargs
            Explorer parameters. The keys read here are ``percentiles`` (a
            comma-separated string such as ``"25, 50, 75"``), ``include`` and
            ``exclude`` (a single dtype name each, or ``"all"`` for
            ``include``). Any of them may be missing or ``None``. The
            normalized kwargs are forwarded to the parent constructor.

        Raises
        ------
        ValueError
            If a non-blank percentile entry cannot be parsed as a number.
        """
        # Turn "25, 50, 75" into [0.25, 0.5, 0.75]; a string holding no
        # entries at all (empty or whitespace-only) means "use the default".
        raw_percentiles = kwargs.get("percentiles")
        if isinstance(raw_percentiles, str):
            fractions = [
                float(token) / 100
                for token in self._split_percentiles(raw_percentiles)
            ]
            kwargs["percentiles"] = fractions or None

        # A concrete dtype name is wrapped in a list; "all" is kept as the
        # plain string because pandas treats it as a special value.
        if kwargs.get("include") and kwargs["include"] != "all":
            kwargs["include"] = [kwargs["include"]]
        if kwargs.get("exclude"):
            kwargs["exclude"] = [kwargs["exclude"]]

        # Use .get so that an omitted parameter falls back to None instead of
        # raising KeyError.
        self.percentiles = kwargs.get("percentiles")
        self.include = kwargs.get("include")
        self.exclude = kwargs.get("exclude")
        super().__init__(**kwargs)

    @staticmethod
    def _split_percentiles(raw: str) -> list:
        """Split a comma-separated percentile string into non-blank tokens."""
        stripped = (token.strip() for token in raw.split(","))
        return [token for token in stripped if token]

    @classmethod
    def validate_parameters(cls, params: Dict[str, Any]) -> bool:
        """Check the parameters against the schema and the percentile rules.

        Parameters
        ----------
        params : Dict[str, Any]
            Raw parameters, as they would be passed to the constructor.

        Returns
        -------
        bool
            ``False`` when some percentile entry is not an integer between 0
            and 100 (inclusive), ``True`` otherwise. Blank entries are
            ignored, matching how the constructor parses the string.
        """
        # Validate schema
        cls.SCHEMA.model_validate(params)

        raw_percentiles = params.get("percentiles")
        if not raw_percentiles:
            return True

        # Validate percentiles (must be int between 0 and 100)
        for token in cls._split_percentiles(raw_percentiles):
            try:
                value = int(token)
            except ValueError:
                return False
            if not 0 <= value <= 100:
                return False
        return True

    def launch_exploration(
        self, dataset: DashAIDataset, __explorer_info__: Explorer
    ) -> pd.DataFrame:
        """Run ``DataFrame.describe`` on the dataset.

        Parameters
        ----------
        dataset : DashAIDataset
            Dataset to describe; it is converted with ``to_pandas()``.
        __explorer_info__ : Explorer
            Unused here.

        Returns
        -------
        pd.DataFrame
            The summary table produced by ``describe``.
        """
        include = self.include
        if include == "all" and self.exclude:
            # pandas raises ValueError when include="all" is combined with an
            # exclude list. With include=None it selects every column except
            # the excluded dtypes, which is what "all minus exclude" means.
            include = None
        return dataset.to_pandas().describe(
            percentiles=self.percentiles, include=include, exclude=self.exclude
        )

    def save_exploration(
        self,
        __exploration_info__: Exploration,
        explorer_info: Explorer,
        save_path: pathlib.Path,
        result: pd.DataFrame,
    ) -> str:
        """Write the result to ``<save_path>/<explorer id>.json``.

        Parameters
        ----------
        __exploration_info__ : Exploration
            Unused here.
        explorer_info : Explorer
            Its ``id`` is used as the file name.
        save_path : pathlib.Path
            Directory in which the file is written.
        result : pd.DataFrame
            Table to serialize with ``DataFrame.to_json``.

        Returns
        -------
        str
            POSIX-style path of the written file.
        """
        path = pathlib.Path(save_path) / f"{explorer_info.id}.json"
        result.to_json(path)
        return path.as_posix()

    def get_results(
        self, exploration_path: str, options: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Load a saved result and convert it for display.

        Parameters
        ----------
        exploration_path : str
            Path of the JSON file written by ``save_exploration``.
        options : Dict[str, Any]
            May contain ``"orientation"``, the ``orient`` value handed to
            ``DataFrame.to_dict`` (``"dict"`` when absent).

        Returns
        -------
        Dict[str, Any]
            A dict with the keys ``"type"`` (always ``"tabular"``),
            ``"data"`` (the transposed table as a dict) and ``"config"``
            (the orientation that was used).
        """
        result_type = "tabular"
        orientation = options.get("orientation", "dict")
        config = {"orient": orientation}

        # NaN is replaced by None before the table is transposed and
        # converted to plain Python containers.
        table = pd.read_json(pathlib.Path(exploration_path)).replace({np.nan: None})
        data = table.T.to_dict(orient=orientation)
        return {"type": result_type, "data": data, "config": config}