Source code for DashAI.back.converters.scikit_learn.label_encoder

from typing import Union

import pyarrow as pa
from sklearn.preprocessing import LabelEncoder as LabelEncoderOperation

from DashAI.back.converters.category.encoding import EncodingConverter
from DashAI.back.converters.sklearn_wrapper import SklearnWrapper
from DashAI.back.core.schema_fields.base_schema import BaseSchema
from DashAI.back.dataloaders.classes.dashai_dataset import (
    DashAIDataset,
    to_dashai_dataset,
)
from DashAI.back.types.categorical import Categorical
from DashAI.back.types.dashai_data_type import DashAIDataType


class LabelEncoderSchema(BaseSchema):
    pass


class LabelEncoder(EncodingConverter, SklearnWrapper):
    """Scikit-learn's LabelEncoder wrapper for DashAI that supports multiple columns."""

    SCHEMA = LabelEncoderSchema
    DESCRIPTION = "Encode target labels with value between 0 and n_classes-1."
    SHORT_DESCRIPTION = "Convert categorical labels to numeric values"
    CATEGORY = "Encoding"
    DISPLAY_NAME = "Label Encoder"
    IMAGE_PREVIEW = "label_encoder.png"

    metadata = {
        "changes_data_types": True,
        "allowed_dtypes": ["string", "int64", "float64"],
        "restricted_dtypes": [],
    }
    def __init__(self, **kwargs):
        super().__init__()
        self.encoders = {}
        self.fitted_columns = []
    def get_output_type(self, column_name: str = None) -> DashAIDataType:
        """Return a Categorical type with the proper encoding for label-encoded data.

        If the encoder for the column has been fitted and exposes ``classes_``,
        use them to build the categorical type.
        """
        if column_name and column_name in self.encoders:
            encoder = self.encoders[column_name]
            if hasattr(encoder, "classes_"):
                values = pa.array(encoder.classes_.tolist())
                encoding = {v: i for i, v in enumerate(encoder.classes_)}
                return Categorical(values=values, encoding=encoding, converted=True)
        # Default placeholder if not fitted yet
        return Categorical(values=pa.array(["0", "1"]))

    def fit(self, x: DashAIDataset, y: Union[DashAIDataset, None] = None):
        """Fit a label encoder to each eligible column in the dataset."""
        x_pandas = x.to_pandas()
        for col in x_pandas.columns:
            # Check whether the column type is in allowed_dtypes using DashAI types
            col_type = x.types.get(col)
            col_dtype = col_type.dtype if hasattr(col_type, "dtype") else None
            # Allow string dtype or a string-like pandas dtype
            is_allowed = col_dtype in self.metadata["allowed_dtypes"] or x_pandas[
                col
            ].dtype.name in ["object", "category", "string"]
            if is_allowed:
                mask = x_pandas[col].notna()
                if mask.any():
                    encoder = LabelEncoderOperation()
                    encoder.fit(x_pandas.loc[mask, col])
                    self.encoders[col] = encoder
                    self.fitted_columns.append(col)
        return self

    def transform(
        self, x: DashAIDataset, y: Union[DashAIDataset, None] = None
    ) -> DashAIDataset:
        """Transform fitted columns while preserving NaN values."""
        x_pandas = x.to_pandas().copy()
        for col in self.fitted_columns:
            if col in x_pandas.columns:
                mask = x_pandas[col].notna()
                if mask.any():
                    x_pandas.loc[mask, col] = self.encoders[col].transform(
                        x_pandas.loc[mask, col]
                    )
        converted_dataset = to_dashai_dataset(x_pandas)
        # Set the proper categorical type for each encoded column
        for col in self.fitted_columns:
            if col in converted_dataset.column_names:
                converted_dataset.types[col] = self.get_output_type(col)
        return converted_dataset
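
As a rough illustration of how this converter is meant to be used, the sketch below round-trips a small string column through fit and transform. It assumes, as transform above suggests, that to_dashai_dataset accepts a pandas DataFrame and that the resulting dataset exposes to_pandas(); the column name "color" and the toy values are invented for the example, so treat this as a sketch rather than a verified usage of the DashAI API.

    import pandas as pd

    from DashAI.back.converters.scikit_learn.label_encoder import LabelEncoder
    from DashAI.back.dataloaders.classes.dashai_dataset import to_dashai_dataset

    # Hypothetical toy data: one string column with a missing value.
    df = pd.DataFrame({"color": ["red", "blue", None, "red"]})
    dataset = to_dashai_dataset(df)  # assumption: accepts a pandas DataFrame

    converter = LabelEncoder()
    encoded = converter.fit(dataset).transform(dataset)

    # Labels are encoded as 0..n_classes-1; the NaN row is left untouched.
    print(encoded.to_pandas()["color"].tolist())
    # The fitted sklearn encoder for the column keeps the original classes.
    print(converter.encoders["color"].classes_)

Because each column gets its own sklearn LabelEncoder stored in self.encoders, the encoding of one column never depends on the values of another, and get_output_type can reconstruct the value-to-code mapping per column after fitting.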