Source code for skgbm.preprocessing.discretizer

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import warnings
import scipy
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import clone

from feature_engine.discretisation import ArbitraryDiscretiser

from ..base import GBM
from ..tools import trees_to_dataframe
from ..trees_extraction import _catboost_raw_trees, _catboost_get_splits

# CatBoost is an optional dependency: when it cannot be imported, no CatBoost
# model can be passed to the discretizer anyway, so the list is simply empty.
try:
    import catboost
except ImportError:
    CATBOOST_CLASSES = []
else:
    # Register only the estimator classes this CatBoost build actually
    # exposes, so one missing class does not discard the others.
    CATBOOST_CLASSES = [
        getattr(catboost, _name)
        for _name in (
            "CatBoostRegressor",
            "CatBoostClassifier",
            "CatBoostRanker",
        )
        if hasattr(catboost, _name)
    ]



class GBMDiscretizer(BaseEstimator, TransformerMixin, GBM):
    """
    Feature discretizer based on GBDT.

    One gradient boosting model is fitted per discretized column; the split
    thresholds found by that model become the bin edges for the column.

    Internally, it uses `ArbitraryDiscretiser <https://feature-engine.trainindata.com/en/1.0.x/discretisation/ArbitraryDiscretiser.html>`_
    to handle the discretization step after finding the optimal thresholds.

    Parameters
    ----------
    estimator: object
        A gradient boosting model from scikit-learn, XGBoost, LightGBM or CatBoost library
    columns: list of str
        List of column names to be transformed
    one_hot: bool
        Transform the output categorical features using one-hot encoding
    append: bool
        Append the newly created features to the original ones

    References
    ----------
    .. [1] K. Semsch, `"My contribution to the tidymodels ecosystem -
           implementing supervised discretization step with XgBoost backend"
           <https://konradsemsch.netlify.app/2020/05/my-contribution-to-tidymodels-ecosystem-implementing-supervised-discretization-step-with-xgboost-backend/>`_, 2020.
    .. [2] A. Berrado and  G. C. Runger, `"Supervised multivariate discretization in mixed data with Random Forests"
           <https://ieeexplore.ieee.org/document/5069327>`_,
           2009.
    .. [3] H. Maïssae, `ForestDisc: Forest Discretization (R package)
           <https://cran.r-project.org/web/packages/ForestDisc/index.html>`_, 2022.

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> from sklearn.datasets import load_iris
    >>> from sklearn.model_selection import train_test_split
    >>> from catboost import CatBoostClassifier
    >>> from skgbm.preprocessing import GBMDiscretizer
    >>>
    >>> iris = load_iris()
    >>> data = pd.DataFrame(
    ...     data=np.c_[iris['data'], iris['target']],
    ...     columns=iris['feature_names'] + ['target']
    ... )
    >>> data.columns = data.columns.str[:-5]
    >>> data.columns = data.columns.str.replace(' ', '_')
    >>>
    >>> # Data splitting
    >>> X, y = data.iloc[:, :4], data.iloc[:, 4:]
    >>> X_train, X_test, y_train, y_test = train_test_split(
    ...     X, y, test_size=0.3, random_state=0
    ... )
    >>> X_cols = X.columns.tolist()
    >>> gbm_discretizer = GBMDiscretizer(CatBoostClassifier(verbose=0),
    ...                                  X_cols, one_hot=False)
    >>> X_train_disc = gbm_discretizer.fit_transform(X_train, y_train)
    >>> # Illustrative output (exact bins depend on the split and the model):
    >>> #      sepal_length  sepal_width  petal_length  petal_width
    >>> # 60              7            0             9            5
    >>> # 116            22            9            29           13
    >>> # 144            24           12            31           20
    >>> # 119            17            1            24           10
    >>> # 108            24            4            32           13
    >>> # ..            ...          ...           ...          ...
    >>> # 9               6           10             4            0
    >>> # 103            20            8            30           13
    >>> # 67             15            6            15            5
    >>> # 117            32           17            38           17
    >>> # 47              3           11             3            1
    """

    def __init__(self, estimator,
                 columns: list,
                 one_hot: bool = True,
                 append: bool = False):
        self.one_hot = one_hot
        if one_hot:
            self.ohe = OneHotEncoder()
        self.append = append
        self.columns = columns
        # Per-column fitted models and the discretizer built from their
        # thresholds; both are (re)populated by ``fit``.
        self.estimators_ = {}
        self.discretizer_ = None
        super().__init__(estimator)

    def _check_is_fitted(self):
        """Raise ``NotFittedError`` if ``fit`` has not built a discretizer yet."""
        if getattr(self, 'discretizer_', None) is None:
            # Local import: keeps the module-level import block untouched.
            # NotFittedError subclasses AttributeError, so callers that caught
            # the former "NoneType has no attribute" error keep working.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This GBMDiscretizer instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using it."
            )

    def fit(self, X, y, **kwargs):
        """
        Fit a set of GBDT models (one per each discretized feature), distil split thresholds from them
        and create an internal `ArbitraryDiscretiser <https://feature-engine.trainindata.com/en/1.0.x/discretisation/ArbitraryDiscretiser.html>`_
        instance based on those values.

        Parameters
        ----------
        X : {array-like} of shape (n_samples, n_features)
            A data frame (matrix) of all the features. Columns are selected
            with ``X[[col]]``, so label-based column indexing must be supported.
        y: array-like of shape (n_samples,) or (n_samples, n_outputs), default=None
            Target values (this is a supervised transformation).
        **kwargs
            Accepted for signature compatibility; not used by this method.

        Returns
        -------
        self: object
            Fitted discretizer.
        """
        # Start from a clean slate so a refit with a different ``columns``
        # list does not keep estimators from a previous fit.
        self.estimators_ = {}
        disc_thresholds_ = {}
        # isinstance() needs a tuple; an empty tuple simply never matches.
        catboost_types = tuple(CATBOOST_CLASSES)

        # Fitting estimators (one per transformed column)
        for col in self.columns:

            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                self.estimators_[col] = est_ = \
                    clone(self.estimator).fit(X[[col]], y)

            if isinstance(est_, catboost_types):
                # Getting the data frame for CatBoost is redundant
                trees = _catboost_raw_trees(est_)
                splits = _catboost_get_splits(trees)
                splits = np.sort(np.unique(splits)).tolist()
            else:
                trees = trees_to_dataframe(est_)
                splits = trees \
                    .sort_values('threshold')['threshold'] \
                    .drop_duplicates() \
                    .dropna() \
                    .tolist()

            # Open-ended outer bins so every value falls into some interval.
            disc_thresholds_[col] = [-np.inf] + splits + [np.inf]

        self.discretizer_ = \
            ArbitraryDiscretiser(binning_dict=disc_thresholds_)

        X_disc = self.discretizer_.fit_transform(X)

        # Follow the ``one_hot`` parameter rather than the mere presence of
        # the ``ohe`` attribute, so a later change of ``one_hot`` is honoured.
        if self.one_hot:
            if not hasattr(self, 'ohe'):
                self.ohe = OneHotEncoder()
            # NOTE(review): the encoder is fitted on the whole discretized
            # frame, so columns not listed in ``columns`` are presumably
            # one-hot encoded as well - confirm this is intended.
            self.ohe.fit(X_disc)

        return self

    def transform(self, X, **kwargs):
        """
        Discretize the specified subset of columns.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to discretize.
        **kwargs
            Accepted for signature compatibility; not used by this method.

        Returns
        -------
        X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
            Transformed array. The result of the internal discretizer,
            passed through the one-hot encoder when ``one_hot`` is set and
            horizontally stacked after ``X`` (as a sparse matrix) when
            ``append`` is set.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``.
        """
        self._check_is_fitted()

        # TODO: one_hot encoding to GBMWrapper
        output = self.discretizer_.transform(X)
        if self.one_hot:
            output = self.ohe.transform(output)
        if self.append:
            # Dense blocks are wrapped explicitly: two dense inputs of equal
            # shape would otherwise be read by hstack as a single
            # higher-dimensional block array and rejected.
            blocks = [
                block if scipy.sparse.issparse(block)
                else scipy.sparse.coo_matrix(block)
                for block in (X, output)
            ]
            output = scipy.sparse.hstack(blocks)
        return output

    def fit_transform(self, X, y, **kwargs):
        """
        Fit the discretizer on ``X`` and ``y`` and return the transformed ``X``.

        The same ``**kwargs`` are forwarded to both ``fit`` and ``transform``.
        """
        return self.fit(X, y, **kwargs).transform(X, **kwargs)

    @property
    def binner_dict_(self):
        """The ``binner_dict_`` attribute of the fitted internal discretizer."""
        self._check_is_fitted()
        return self.discretizer_.binner_dict_
        
    

if __name__ == '__main__':
    # Manual smoke test: discretize the four iris features with a
    # GBM-backed discretizer.
    from sklearn.model_selection import train_test_split
    from sklearn.datasets import load_iris

    from lightgbm import LGBMClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from xgboost import XGBClassifier
    from catboost import CatBoostClassifier

    import pandas as pd
    import numpy as np

    # Loading data
    iris = load_iris()
    # https://stackoverflow.com/questions/38105539/how-to-convert-a-scikit-learn-dataset-to-a-pandas-dataset
    values = np.c_[iris['data'], iris['target']]
    column_names = iris['feature_names'] + ['target']
    data = pd.DataFrame(data=values, columns=column_names)
    # Drop the last five characters of every name, then replace spaces
    # with underscores.
    data.columns = data.columns.str[:-5].str.replace(' ', '_')

    # Data splitting: first four columns are features, the rest is the target
    X = data.iloc[:, :4]
    y = data.iloc[:, 4:]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0
    )
    X_cols = X.columns.tolist()

    # A discretizer is built for each backend in turn; only the last one
    # (LightGBM) survives the loop and is actually fitted below.
    for model_cls in (XGBClassifier, GradientBoostingClassifier, LGBMClassifier):
        gbm_discretizer = GBMDiscretizer(model_cls(), X_cols, one_hot=False)

    gbm_discretizer.fit_transform(X_train, y_train)