# Source code for DashAI.back.converters.scikit_learn.ordinal_encoder

from sklearn.preprocessing import OrdinalEncoder as OrdinalEncoderOperation

from DashAI.back.api.utils import cast_string_to_type
from DashAI.back.converters.sklearn_wrapper import SklearnWrapper
from DashAI.back.core.schema_fields import (
    enum_field,
    float_field,
    int_field,
    none_type,
    schema_field,
    string_field,
    union_type,
)
from DashAI.back.core.schema_fields.base_schema import BaseSchema


class OrdinalEncoderSchema(BaseSchema):
    # Parameter schema for the OrdinalEncoder converter defined in this module.
    #
    # Every attribute follows the same pattern:
    #     schema_field(<field type>, <default value>, <description>)
    # Several values are kept as strings (e.g. "np.float64", "np.nan");
    # OrdinalEncoder.__init__ passes `dtype`, `unknown_value` and
    # `min_frequency` through cast_string_to_type before handing them to the
    # parent initializer. The remaining fields are forwarded untouched.

    # NOTE(review): the field type is a plain string, so only "auto" (or some
    # other string) can be entered; an explicit list of categories cannot be
    # expressed with this declaration -- confirm that this is intended.
    categories: schema_field(
        string_field(),  # "auto" or a list of array-like
        "auto",
        "Categories (unique values) per feature.",
    )  # type: ignore
    # The dtype is selected by name; the wrapper's __init__ resolves the chosen
    # string with cast_string_to_type.
    dtype: schema_field(
        enum_field(["np.int32", "np.int64", "np.float32", "np.float64"]),  # number type
        "np.float64",
        "Desired dtype of output.",
    )  # type: ignore
    # NOTE(review): the description speaks of "ignore", but the selectable
    # alternative to "error" is "use_encoded_value". Per scikit-learn's
    # OrdinalEncoder documentation that option needs `unknown_value` to be set
    # as well -- nothing in this schema enforces the pairing.
    handle_unknown: schema_field(
        enum_field(["error", "use_encoded_value"]),
        "error",
        (
            "Whether to raise an error or ignore if an unknown categorical feature "
            "is present during transform."
        ),
    )  # type: ignore
    # None, or one of the strings "int" / "np.nan"; non-None values are
    # converted by cast_string_to_type in the wrapper's __init__.
    # NOTE(review): "int" names a type rather than a concrete integer --
    # confirm what cast_string_to_type returns for it, since scikit-learn
    # expects an integer value or np.nan here.
    unknown_value: schema_field(
        none_type(
            enum_field(["int", "np.nan"]),  # int or np.nan
        ),
        None,
        "The value to use for unknown categories.",
    )  # type: ignore
    # Added in scikit-learn version 1.3
    # None, an integer >= 1, or a float in [0.0, 1.0].
    # NOTE(review): the wrapper's __init__ sends every non-None value through
    # cast_string_to_type although this field is numeric -- confirm the helper
    # accepts non-string input.
    min_frequency: schema_field(
        none_type(union_type(int_field(ge=1), float_field(ge=0.0, le=1.0))),
        None,
        "Minimum frequency of a category to be considered as frequent.",
    )  # type: ignore
    # Added in scikit-learn version 1.3
    # None or an integer >= 1.
    max_categories: schema_field(
        none_type(int_field(ge=1)),
        None,
        "Maximum number of categories to encode.",
    )  # type: ignore


class OrdinalEncoder(SklearnWrapper, OrdinalEncoderOperation):
    """Scikit-learn's OrdinalEncoder wrapper for DashAI.

    The accepted parameters are described by ``OrdinalEncoderSchema``. Values
    that the schema stores as strings are converted with
    ``cast_string_to_type`` before the parent initializer receives them.
    """

    SCHEMA = OrdinalEncoderSchema
    DESCRIPTION = "Encode categorical features as an integer array."

    def __init__(self, **kwargs):
        """Convert string-encoded parameters and initialize the encoder.

        Parameters
        ----------
        **kwargs
            Encoder parameters. Three keys receive special treatment:

            - ``dtype``: defaults to ``"np.float64"`` when absent and is
              always passed through ``cast_string_to_type``.
            - ``unknown_value``: passed through ``cast_string_to_type``
              unless it is ``None`` (the default when absent).
            - ``min_frequency``: passed through ``cast_string_to_type``
              unless it is ``None`` (the default when absent).

            The converted values are stored on the instance and written back
            into ``kwargs``; every other key is forwarded unchanged to the
            parent initializer via ``super().__init__``.
        """
        # The schema exposes the dtype by name, so resolve it to the object
        # produced by cast_string_to_type before the parent sees it.
        self.dtype = cast_string_to_type(kwargs.pop("dtype", "np.float64"))
        kwargs["dtype"] = self.dtype

        # None means "not configured" and is forwarded as-is; only real
        # values go through the conversion helper.
        self.unknown_value = kwargs.pop("unknown_value", None)
        if self.unknown_value is not None:
            self.unknown_value = cast_string_to_type(self.unknown_value)
        kwargs["unknown_value"] = self.unknown_value

        # NOTE(review): the schema declares min_frequency as int/float, yet it
        # is routed through a string-casting helper -- confirm the helper
        # accepts non-string input.
        self.min_frequency = kwargs.pop("min_frequency", None)
        if self.min_frequency is not None:
            self.min_frequency = cast_string_to_type(self.min_frequency)
        kwargs["min_frequency"] = self.min_frequency

        super().__init__(**kwargs)