Source code for skgbm.preprocessing.featurizer

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import scipy
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder

from ..base import GBM


class GBMFeaturizer(BaseEstimator, TransformerMixin, GBM):
    """
    Feature generator for any GBDT model.

    The wrapped gradient boosting model is fitted first; the output of
    ``self.apply`` on the data is then used as a new set of features,
    optionally one-hot encoded and optionally appended to the original
    features.

    Parameters
    ----------
    estimator: object
        A gradient boosting model from scikit-learn, XGBoost, LightGBM or CatBoost library
    one_hot: bool
        Transform the output categorical features using one-hot encoding
    append: bool
        Append the newly created features to the original ones

    References
    ----------

    .. [1] X. He, J. Pan, O. Jin, T. Xu, B. Liu, T. Xu, Y. Shi, A. Atallah,
           R. Herbrich, S. Bowers, J. Q. Candela, `"Practical Lessons from Predicting Clicks on Ads at Facebook"
           <https://research.fb.com/wp-content/uploads/2016/11/practical-lessons-from-predicting-clicks-on-ads-at-facebook.pdf>`_, 2016.

    .. [2] C. Mougan, `"Feature Generation with Gradient Boosted Decision Trees"
           <https://towardsdatascience.com/feature-generation-with-gradient-boosted-decision-trees-21d4946d6ab5>`_, Towards Data Science, 2021.

    .. [3] David Masip, `"sktools — Helpers for scikit learn"
           <https://sktools.readthedocs.io/en/latest/sktools.html#module-sktools.preprocessing>`_

    .. [4] `xgboostExtension: xgboost Extension for Easy Ranking & TreeFeature
            <https://github.com/bigdong89/xgboostExtension>`_

    Examples
    --------
    >>> from sklearn.datasets import load_diabetes
    >>> from sklearn.model_selection import train_test_split
    >>> from skgbm.preprocessing import GBMFeaturizer
    >>> from lightgbm import LGBMRegressor
    >>>
    >>> X, y = load_diabetes(return_X_y=True)
    >>> X_train, X_test, y_train, y_test = train_test_split(X, y)
    >>> gbm_featurizer = GBMFeaturizer(LGBMRegressor())
    >>> gbm_featurizer.fit(X_train, y_train)
    >>> gbm_featurizer.transform(X_test)

    """

    def __init__(self, estimator, one_hot: bool = True, append: bool = True):
        self.one_hot = one_hot
        # The encoder is still created eagerly so that code reading
        # ``featurizer.ohe`` right after construction keeps working.
        if one_hot:
            self.ohe = OneHotEncoder()
        self.append = append
        super().__init__(estimator)

    def fit(self, X, y, **kwargs):
        """
        Fit a GBDT model and, when `one_hot` is enabled, a OneHotEncoder.

        Parameters
        ----------
        X : {array-like} of shape (n_samples, n_features)
            A data frame (matrix) of all the features.
        y: array-like of shape (n_samples,) or (n_samples, n_outputs)
            Target values.
        **kwargs
            Forwarded unchanged to the parent class ``fit``.

        Returns
        -------
        self: object
            Fitted featurizer.
        """
        super().fit(X, y, **kwargs)
        # Decide on the *current* value of the `one_hot` parameter rather than
        # on the mere presence of the `ohe` attribute: `one_hot` may have been
        # changed through `set_params` after construction (e.g. by a
        # hyper-parameter search), in which case the attribute set in
        # `__init__` no longer reflects the requested behaviour.
        if self.one_hot:
            if not hasattr(self, 'ohe'):
                self.ohe = OneHotEncoder()
            self.ohe.fit(self.apply(X))
        return self

    def transform(self, X, **kwargs):
        """
        Return features distilled from the GBM model trees.
        The number of the output features depends on `one_hot` and `append` parameters.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to transform.
        **kwargs
            Accepted for signature compatibility; not used.

        Returns
        -------
        X_tr : {ndarray, sparse matrix}
            If `append` is True, a sparse matrix holding the original columns
            of `X` followed by the generated features. Otherwise the one-hot
            encoded features (when `one_hot` is True) or the raw output of
            ``self.apply`` (presumably of shape (n_samples, n_trees) -- the
            exact shape is defined by the parent class).
        """
        # Import the submodule explicitly: a bare `import scipy` does not
        # guarantee that `scipy.sparse` has been loaded.
        from scipy import sparse

        features = self.apply(X)
        # Follows the current `one_hot` parameter (see `fit`). If `one_hot`
        # was switched on after fitting, the encoder is missing or unfitted
        # and this fails loudly instead of silently skipping the encoding.
        if self.one_hot:
            features = self.ohe.transform(features)
        if not self.append:
            return features
        # `scipy.sparse.hstack` is specified for sparse blocks. Handing it raw
        # dense arrays with the same number of rows can fail while NumPy packs
        # them into an object array, so dense blocks are converted to COO
        # first; blocks that are already sparse are passed through untouched.
        blocks = [
            block if sparse.issparse(block) else sparse.coo_matrix(block)
            for block in (X, features)
        ]
        return sparse.hstack(blocks)

    def fit_transform(self, X, y, **kwargs):
        """
        Fit the featurizer on `X`, `y` and return the transformed `X`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to fit on and then transform.
        y: array-like of shape (n_samples,) or (n_samples, n_outputs)
            Target values.
        **kwargs
            Passed to both `fit` and `transform`.

        Returns
        -------
        X_tr : {ndarray, sparse matrix}
            Same as the output of `transform` applied to `X`.
        """
        return self.fit(X, y, **kwargs).transform(X, **kwargs)