Source code for DashAI.back.models.scikit_learn.bow_text_classification_model

from pathlib import Path
from typing import Optional, Union

import joblib
import numpy as np
from datasets import Dataset
from sklearn.feature_extraction.text import CountVectorizer

from DashAI.back.core.schema_fields import (
    BaseSchema,
    component_field,
    int_field,
    schema_field,
)
from DashAI.back.dataloaders.classes.dashai_dataset import to_dashai_dataset
from DashAI.back.models.scikit_learn.sklearn_like_model import SklearnLikeModel
from DashAI.back.models.text_classification_model import TextClassificationModel


class BagOfWordsTextClassificationModelSchema(BaseSchema):
    """
    NumericalWrapperForText is a metamodel that allows text classification using
    tabular classifiers and a tokenizer.
    """

    tabular_classifier: schema_field(
        component_field(parent="TabularClassificationModel"),
        placeholder={"component": "SVC", "params": {}},
        description=(
            "Tabular model used as the underlying model "
            "to generate the text classifier."
        ),
    )  # type: ignore
    ngram_min_n: schema_field(
        int_field(ge=1),
        placeholder=1,
        description=(
            "The lower boundary of the range of n-values for different word n-grams "
            "or char n-grams to be extracted. It must be an integer greater or equal "
            "than 1"
        ),
    )  # type: ignore
    ngram_max_n: schema_field(
        int_field(ge=1),
        placeholder=1,
        description=(
            "The upper boundary of the range of n-values for different word n-grams "
            "or char n-grams to be extracted. It must be an integer greater or equal "
            "than 1"
        ),
    )  # type: ignore
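
# A minimal sketch (an addition, not part of the DashAI source) of what the
# ngram_min_n / ngram_max_n fields mean for scikit-learn's CountVectorizer:
# with ngram_range=(1, 2) the fitted vocabulary contains both unigrams and
# bigrams.
#
#     from sklearn.feature_extraction.text import CountVectorizer
#
#     vec = CountVectorizer(ngram_range=(1, 2))
#     vec.fit(["the cat sat"])
#     vec.get_feature_names_out()
#     # array(['cat', 'cat sat', 'sat', 'the', 'the cat'], dtype=object)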


class BagOfWordsTextClassificationModel(TextClassificationModel, SklearnLikeModel):
    """Text classification meta-model.

    The meta-model has two main components:

    - Tabular classification model: the underlying model that processes the
      data and provides the predictions.
    - Vectorizer: a bag-of-words vectorizer that turns the text into a sparse
      matrix, providing the correct input for the underlying model.

    Both components are created in the __init__ method and stored in the
    model. To train the underlying classifier, the vectorizer is fitted and
    then used to transform the train dataset. To predict with the underlying
    classifier, the vectorizer is used to transform the dataset.
    """

    DISPLAY_NAME: str = "Bag of Words Text Classifier"
    COLOR: str = "#FF5722"
    SCHEMA = BagOfWordsTextClassificationModelSchema
    def __init__(self, **kwargs) -> None:
        """Initialize the BagOfWordsTextClassificationModel.

        Parameters
        ----------
        kwargs : dict
            A dictionary containing the parameters for the model, including:

            - tabular_classifier: the tabular classification model from
              DashAI to be used.
            - ngram_min_n: minimum n-gram value.
            - ngram_max_n: maximum n-gram value.
        """
        self.classifier = kwargs["tabular_classifier"]
        self.vectorizer = CountVectorizer(
            ngram_range=(kwargs["ngram_min_n"], kwargs["ngram_max_n"])
        )
    def get_vectorizer(self, input_column: str, output_column: Optional[str] = None):
        """Factory that returns a function to transform a text classification
        dataset into a tabular classification dataset.

        To do this, the input column is vectorized (using a bag of words) into
        a sparse matrix of size NxM, where N is the number of examples and M is
        the vocabulary size. Each column of the output matrix is named using
        the input_column name as prefix and the column number as suffix. The
        output_column is not changed.

        Parameters
        ----------
        input_column : str
            Name of the input column of the dataset. This column will be
            vectorized.
        output_column : str, optional
            Name of the output column of the dataset.

        Returns
        -------
        Function
            Function that vectorizes one example of the dataset.
        """

        def _vectorize(example) -> dict:
            vectorized_sentence = self.vectorizer.transform(
                [example[input_column]]
            ).toarray()
            output_example = {}
            for idx in range(np.shape(vectorized_sentence)[1]):
                output_example[input_column + str(idx)] = vectorized_sentence[0][idx]
            return output_example

        return _vectorize

    def fit(self, x: Dataset, y: Dataset):
        input_column = x.column_names[0]

        # Fit the vectorizer on the raw text and transform the dataset into
        # its bag-of-words representation before training the classifier.
        self.vectorizer.fit(x[input_column])
        tokenizer_func = self.get_vectorizer(input_column)
        tokenized_dataset = x.map(tokenizer_func, remove_columns=x.column_names)
        tokenized_dataset = to_dashai_dataset(tokenized_dataset)
        self.classifier.fit(tokenized_dataset, y)

    def predict(self, x: Dataset):
        input_column = x.column_names[0]

        # Transform the dataset with the already fitted vectorizer and
        # delegate the prediction to the underlying classifier.
        tokenizer_func = self.get_vectorizer(input_column)
        tokenized_dataset = x.map(tokenizer_func, remove_columns=x.column_names)
        tokenized_dataset = to_dashai_dataset(tokenized_dataset)
        return self.classifier.predict(tokenized_dataset)

    def save(self, filename: Union[str, Path]) -> None:
        """Save the model to the specified path."""
        joblib.dump(self, filename)

    @staticmethod
    def load(filename: Union[str, Path]) -> "BagOfWordsTextClassificationModel":
        """Load the model from the specified path."""
        return joblib.load(filename)
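
# A hedged usage sketch (an addition, not part of the DashAI source). The
# svc_model instance and the x_train / y_train / x_test splits are
# illustrative assumptions; any DashAI tabular classification model and a
# single-text-column Dataset should work the same way.
#
#     model = BagOfWordsTextClassificationModel(
#         tabular_classifier=svc_model,  # hypothetical DashAI SVC instance
#         ngram_min_n=1,
#         ngram_max_n=2,
#     )
#     model.fit(x_train, y_train)   # x_train: one text column; y_train: labels
#     preds = model.predict(x_test)
#
#     model.save("bow_model.joblib")
#     restored = BagOfWordsTextClassificationModel.load("bow_model.joblib")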