Source code for DashAI.back.converters.scikit_learn.ordinal_encoder

import pyarrow as pa
from sklearn.preprocessing import OrdinalEncoder as OrdinalEncoderOperation

from DashAI.back.api.utils import cast_string_to_type
from DashAI.back.converters.category.encoding import EncodingConverter
from DashAI.back.converters.sklearn_wrapper import SklearnWrapper
from DashAI.back.core.schema_fields import (
    enum_field,
    float_field,
    int_field,
    none_type,
    schema_field,
    string_field,
    union_type,
)
from DashAI.back.core.schema_fields.base_schema import BaseSchema
from DashAI.back.core.utils import MultilingualString
from DashAI.back.types.categorical import Categorical
from DashAI.back.types.dashai_data_type import DashAIDataType


class OrdinalEncoderSchema(BaseSchema):
    # Parameter schema for the OrdinalEncoder converter defined in this module.
    #
    # Each field below is declared with ``schema_field(<type>, <default>,
    # description=...)`` and carries an English/Spanish description. The field
    # names match constructor parameters of scikit-learn's ``OrdinalEncoder``.
    # Values that denote Python/NumPy objects are stored here as strings
    # (e.g. "np.float64"); ``OrdinalEncoder.__init__`` passes ``dtype``,
    # ``unknown_value`` and ``min_frequency`` through ``cast_string_to_type``
    # before forwarding them.

    # Free-form string, default "auto".
    categories: schema_field(
        string_field(),
        "auto",
        description=MultilingualString(
            en="Categories (unique values) per feature.",
            es="Categorías (valores únicos) por característica.",
        ),
    )  # type: ignore
    # Output dtype, chosen by name from a fixed list; default "np.float64".
    dtype: schema_field(
        enum_field(["np.int32", "np.int64", "np.float32", "np.float64"]),
        "np.float64",
        description=MultilingualString(
            en="Desired dtype of output.",
            es="Tipo de dato de salida deseado.",
        ),
    )  # type: ignore
    # Policy for categories not seen during fit; default "error".
    # NOTE(review): scikit-learn requires ``unknown_value`` to be set when this
    # is "use_encoded_value"; no such cross-field check is visible in this class.
    handle_unknown: schema_field(
        enum_field(["error", "use_encoded_value"]),
        "error",
        description=MultilingualString(
            en=(
                "Whether to raise an error or use a specific encoded value when "
                "an unknown category is seen."
            ),
            es=(
                "Si se debe lanzar un error o usar un valor codificado específico "
                "cuando se vea una categoría desconocida."
            ),
        ),
    )  # type: ignore
    # Optional; default None. The allowed non-null values are the literal
    # strings "int" and "np.nan".
    # NOTE(review): "int" looks like a type name rather than a concrete integer;
    # confirm what cast_string_to_type("int") yields and whether scikit-learn
    # accepts it as ``unknown_value``.
    unknown_value: schema_field(
        none_type(enum_field(["int", "np.nan"])),
        None,
        description=MultilingualString(
            en="The value to use for unknown categories.",
            es="El valor a usar para categorías desconocidas.",
        ),
    )  # type: ignore
    # Optional; default None. Either an integer >= 1 or a float in [0.0, 1.0].
    # The parameter was added to scikit-learn's OrdinalEncoder in version 1.3.
    min_frequency: schema_field(
        none_type(union_type(int_field(ge=1), float_field(ge=0.0, le=1.0))),
        None,
        description=MultilingualString(
            en="Minimum frequency of a category to be considered as frequent.",
            es="Frecuencia mínima para considerar una categoría como frecuente.",
        ),
    )  # type: ignore
    # Optional; default None. When given, an integer >= 1.
    # The parameter was added to scikit-learn's OrdinalEncoder in version 1.3.
    max_categories: schema_field(
        none_type(int_field(ge=1)),
        None,
        description=MultilingualString(
            en="Maximum number of categories to encode.",
            es="Número máximo de categorías a codificar.",
        ),
    )  # type: ignore


class OrdinalEncoder(EncodingConverter, SklearnWrapper, OrdinalEncoderOperation):
    """Scikit-learn's OrdinalEncoder wrapper for DashAI."""

    # Class-level component description: parameter schema, localized
    # description and display name, preview image file name, and the column
    # dtypes this converter declares as allowed / restricted.
    SCHEMA = OrdinalEncoderSchema
    DESCRIPTION = MultilingualString(
        en="Encode categorical features as an integer array.",
        es="Codifica características categóricas como un arreglo de enteros.",
    )
    DISPLAY_NAME = MultilingualString(en="Ordinal Encoder", es="Codificador Ordinal")
    IMAGE_PREVIEW = "ordinal_encoder.png"
    metadata = {
        "allowed_dtypes": ["string"],
        "restricted_dtypes": [],
    }

    def __init__(self, **kwargs):
        """Resolve string-encoded options, then delegate to the parent initialisers.

        ``dtype`` (default ``"np.float64"``) is always converted with
        ``cast_string_to_type``. ``unknown_value`` and ``min_frequency``
        (both default ``None``) are converted only when they are not ``None``.
        Each resolved value is stored on the instance and written back into
        ``kwargs`` under the same key before ``super().__init__`` is called.

        Parameters
        ----------
        **kwargs
            Constructor options; everything not handled here is forwarded
            unchanged.
        """
        dtype_spec = kwargs.pop("dtype", "np.float64")
        self.dtype = kwargs["dtype"] = cast_string_to_type(dtype_spec)

        unknown_spec = kwargs.pop("unknown_value", None)
        self.unknown_value = kwargs["unknown_value"] = (
            None if unknown_spec is None else cast_string_to_type(unknown_spec)
        )

        frequency_spec = kwargs.pop("min_frequency", None)
        self.min_frequency = kwargs["min_frequency"] = (
            None if frequency_spec is None else cast_string_to_type(frequency_spec)
        )

        super().__init__(**kwargs)

    def get_output_type(self, column_name: str = None) -> DashAIDataType:
        """Return the DashAI type reported for an encoded column.

        Parameters
        ----------
        column_name : str, optional
            Accepted for interface compatibility; not used by this method.

        Returns
        -------
        DashAIDataType
            A ``Categorical`` built from the fixed placeholder values
            ``"0"`` and ``"1"``.
        """
        # Placeholder only: per the original note, the real categories are
        # filled in later by the sklearn wrapper's transform step.
        placeholder_values = pa.array(["0", "1"])
        return Categorical(values=placeholder_values)