# Source code for DashAI.back.exploration.explorers.wordcloud

import base64
import os
import pathlib

from beartype.typing import Any, Dict
from PIL.Image import Image
from wordcloud import STOPWORDS, WordCloud

from DashAI.back.core.schema_fields import (
    int_field,
    none_type,
    schema_field,
    string_field,
)
from DashAI.back.dataloaders.classes.dashai_dataset import (  # ClassLabel, Value,
    DashAIDataset,
)
from DashAI.back.dependencies.database.models import Exploration, Explorer
from DashAI.back.exploration.base_explorer import BaseExplorer, BaseExplorerSchema


# Parameter schema for the word cloud explorer. It declares the two
# user-configurable options below in addition to whatever
# BaseExplorerSchema already provides.
class WordcloudSchema(BaseExplorerSchema):
    # Upper bound on the number of words drawn in the image. Declared as an
    # integer constrained to be strictly positive (gt=0), with 200 as the
    # placeholder value.
    max_words: schema_field(
        t=int_field(gt=0),
        placeholder=200,
        description="The maximum number of words to display in the wordcloud.",
    )  # type: ignore
    # Background color of the image, declared as a string wrapped in
    # none_type so that None is also accepted; the placeholder is None.
    # NOTE(review): presumably any color spec understood by the wordcloud
    # package is valid here -- confirm against its documentation.
    background_color: schema_field(
        t=none_type(string_field()),
        placeholder=None,
        description=(
            "The background color of the wordcloud. "
            "If None, the background will be transparent."
        ),
    )  # type: ignore


class WordcloudExplorer(BaseExplorer):
    """
    WordcloudExplorer is an explorer that generates a wordcloud
    from the concatenated strings of all selected columns in the dataset.

    The exploration result is a PIL image. It is persisted as a PNG file and
    served back to the caller as a base64-encoded string.
    """

    DISPLAY_NAME = "Word Cloud"
    DESCRIPTION = (
        "A wordcloud is a visual representation of text data, "
        "where the size of each word indicates its frequency in the text."
        "\n"
        "This explorer generates a wordcloud from the concatenated "
        "strings of all selected columns in the dataset."
    )

    SCHEMA = WordcloudSchema
    metadata: Dict[str, Any] = {
        "allowed_dtypes": ["string"],
        "restricted_dtypes": [],
        "input_cardinality": {"min": 1},
    }

    def __init__(self, **kwargs) -> None:
        """Store the word cloud options and initialize the base explorer.

        Parameters
        ----------
        **kwargs
            Explorer parameters. ``max_words`` (default ``200``) and
            ``background_color`` (default ``None``) are read here; the full
            set of keyword arguments is forwarded to ``BaseExplorer``.
        """
        self.max_words = kwargs.get("max_words", 200)
        self.background_color = kwargs.get("background_color")
        super().__init__(**kwargs)

    def launch_exploration(self, dataset: DashAIDataset, explorer_info: Explorer):
        """Render a word cloud from the text of the selected columns.

        Parameters
        ----------
        dataset : DashAIDataset
            Dataset to explore; it is converted to a pandas DataFrame.
        explorer_info : Explorer
            Explorer record whose ``columns`` entries provide the
            ``"columnName"`` of every column to include.

        Returns
        -------
        PIL.Image.Image
            The rendered 800x600 word cloud.

        Notes
        -----
        NOTE(review): the wordcloud package is expected to raise
        ``ValueError`` when the text contains no usable words; that error is
        not handled here -- confirm the caller reports it.
        """
        frame = dataset.to_pandas()
        cols = [col["columnName"] for col in explorer_info.columns]
        selected = frame[cols]

        # Flatten the selected cells row by row and join every cell with a
        # space. Concatenating the columns of a row without a separator would
        # fuse the last word of one column with the first word of the next.
        cells = selected.to_numpy(dtype=object).ravel()
        # Drop missing cells so they do not show up in the cloud as the
        # literal words "None" or "nan".
        present = selected.notna().to_numpy().ravel()
        text = " ".join(map(str, cells[present]))

        # A transparent background requires an alpha channel, hence RGBA when
        # no background color is configured.
        transparent = self.background_color is None
        wordcloud = WordCloud(
            max_words=self.max_words,
            stopwords=STOPWORDS,
            background_color=self.background_color,
            mode="RGBA" if transparent else "RGB",
            width=800,
            height=600,
        ).generate(text)

        return wordcloud.to_image()

    def save_exploration(
        self,
        __exploration_info__: Exploration,
        explorer_info: Explorer,
        save_path: pathlib.Path,
        result: Image,
    ) -> str:
        """Write the word cloud image to disk as a PNG file.

        Parameters
        ----------
        __exploration_info__ : Exploration
            Unused by this explorer.
        explorer_info : Explorer
            Explorer record; its ``id`` is used as the file name.
        save_path : pathlib.Path
            Directory in which the image is stored.
        result : PIL.Image.Image
            Image returned by ``launch_exploration``.

        Returns
        -------
        str
            POSIX-style path of the written ``<explorer id>.png`` file.
        """
        path = pathlib.Path(save_path) / f"{explorer_info.id}.png"
        result.save(path, format="PNG")

        return path.as_posix()

    def get_results(
        self, exploration_path: str, options: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Load a stored word cloud and return it base64-encoded.

        Parameters
        ----------
        exploration_path : str
            Path of the PNG file written by ``save_exploration``.
        options : Dict[str, Any]
            Unused by this explorer.

        Returns
        -------
        Dict[str, Any]
            A dictionary with the base64 text of the image under ``"data"``,
            the result type ``"image_base64"`` under ``"type"`` and an empty
            ``"config"`` dictionary.
        """
        raw_image = pathlib.Path(exploration_path).read_bytes()
        encoded = base64.b64encode(raw_image).decode("utf-8")

        return {"data": encoded, "type": "image_base64", "config": {}}