Module pandas_processors.normalize

Expand source code
from __future__ import annotations

from typing import Optional

import pandas as pd
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax, skew
from sklearn.base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin


class SkewedFeatureNormalizer(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    """
    A scikit-learn compatible transformer for normalizing skewed numerical features.

    Columns whose sample skewness is strictly greater than ``threshold`` are
    transformed with ``scipy.special.boxcox1p``. The Box-Cox lambda of each
    selected column is estimated once, in :meth:`fit`, and reused by
    :meth:`transform`, so data seen after fitting is transformed with the
    parameters learned from the training data.

    Parameters
    ----------
    threshold : float, optional
        Columns whose skewness exceeds this value are normalized.
        Defaults to 0.5.

    Attributes
    ----------
    skewed_features : pd.Index
        Labels of the columns selected in ``fit``, ordered by decreasing
        skewness.
    lambdas_ : dict
        Maps each label in ``skewed_features`` to the Box-Cox lambda
        estimated from the training data.
    n_features_in_ : int
        Number of columns seen during ``fit``.
    feature_names_in_ : ndarray of object
        Column names seen during ``fit``; only set when every column name
        is a string.

    Notes
    -----
    Lambda estimation is performed on ``feature + 1``.
    NOTE(review): Box-Cox is defined for positive data, so selected columns
    are presumably expected to hold values greater than -1 - confirm
    against the scipy version in use.

    Examples
    --------

    >>> import pandas as pd
    >>> import numpy as np
    >>> from pandas_processors.normalize import SkewedFeatureNormalizer
    >>> # Set the seed value
    >>> np.random.seed(123)
    >>> # Create a sample DataFrame with skewed features
    >>> data = {
    ...     'Feature1': np.random.exponential(scale=2, size=1000),
    ...     'Feature2': np.random.lognormal(mean=1, sigma=0.5, size=1000),
    ...     'Feature3': np.random.gamma(shape=2, scale=1, size=1000)
    ... }
    >>> df = pd.DataFrame(data)
    >>> normalizer = SkewedFeatureNormalizer(threshold=0.5)
    >>> normalizer.fit_transform(df)
         Feature1  Feature2  Feature3
    0    0.972661  1.114450  1.450302
    1    0.467319  0.901000  0.917198
    2    0.383601  1.358568  1.388989
    3    0.799976  0.774102  0.646975
    4    1.000851  0.975794  0.831716
    ..        ...       ...       ...
    995  0.437187  1.139626  0.711948
    996  0.965916  1.274143  0.574016
    997  0.548840  0.918681  0.447651
    998  0.008306  1.018361  0.762395
    999  0.479283  1.199449  1.158380
    <BLANKLINE>
    [1000 rows x 3 columns]
    """

    def __init__(self, threshold: float = 0.5):
        self.threshold = threshold

    @staticmethod
    def _normalize_a_feature(
        feature: pd.Series, lmbda: Optional[float] = None
    ) -> pd.Series:
        """Apply the ``boxcox1p`` transformation to a numerical feature.

        Parameters
        ----------
        feature : pd.Series
            The numerical feature to be normalized.
        lmbda : Optional[float], optional
            Box-Cox lambda to apply. When None (the default) the lambda is
            estimated from ``feature + 1`` with ``boxcox_normmax``.

        Returns
        -------
        pd.Series
            The feature after applying ``boxcox1p`` with the given (or
            estimated) lambda.
        """
        if lmbda is None:
            lmbda = boxcox_normmax(feature + 1)
        return boxcox1p(feature, lmbda)

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Select the skewed columns of ``X`` and estimate their Box-Cox lambdas.

        Parameters
        ----------
        X : pd.DataFrame
            The training dataset.
        y : Optional[pd.Series], optional
            The label, by default None. It is not used.

        Returns
        --------
        self
            Returns the transformer instance.
        """
        feature_skewness = X.apply(skew).sort_values(ascending=False)
        self.skewed_features = feature_skewness[feature_skewness > self.threshold].index

        # Learn the lambdas here so that transform() applies the training
        # parameters instead of re-estimating them on every dataset.
        self.lambdas_ = {
            name: float(boxcox_normmax(X[name] + 1)) for name in self.skewed_features
        }

        # OneToOneFeatureMixin.get_feature_names_out() requires n_features_in_.
        self.n_features_in_ = X.shape[1]
        if all(isinstance(name, str) for name in X.columns):
            self.feature_names_in_ = X.columns.to_numpy(dtype=object)
        elif hasattr(self, "feature_names_in_"):
            # Drop names left over from an earlier fit on string-named columns.
            del self.feature_names_in_
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Normalize skewed numerical features in the DataFrame.

        Parameters
        ----------
        X : pd.DataFrame
            The input DataFrame containing the numerical features to be normalized.

        Returns
        -------
        pd.DataFrame
            A copy of ``X`` in which the columns selected during ``fit`` have
            been transformed with the lambdas stored in ``lambdas_``.
        """
        X = X.copy()
        if len(self.skewed_features) == 0:
            return X

        # A column without a stored lambda (e.g. an instance fitted before
        # lambdas_ existed) falls back to estimating it from the data given.
        lambdas = getattr(self, "lambdas_", {})
        X[self.skewed_features] = X[self.skewed_features].apply(
            lambda feature: self._normalize_a_feature(
                feature, lambdas.get(feature.name)
            )
        )
        return X


if __name__ == "__main__":
    # When executed as a script, run the doctests embedded in this module.
    from doctest import testmod

    testmod()

Classes

class SkewedFeatureNormalizer (threshold: float = 0.5)

A scikit-learn compatible transformer for normalizing skewed numerical features.

Parameters

threshold : float, optional
Features whose skewness exceeds this value are treated as skewed and normalized. Defaults to 0.5.

Examples

>>> import pandas as pd
>>> import numpy as np
>>> from pandas_processors.normalize import SkewedFeatureNormalizer
>>> # Set the seed value
>>> np.random.seed(123)
>>> # Create a sample DataFrame with skewed features
>>> data = {
...     'Feature1': np.random.exponential(scale=2, size=1000),
...     'Feature2': np.random.lognormal(mean=1, sigma=0.5, size=1000),
...     'Feature3': np.random.gamma(shape=2, scale=1, size=1000)
... }
>>> df = pd.DataFrame(data)
>>> normalizer = SkewedFeatureNormalizer(threshold=0.5)
>>> normalizer.fit_transform(df)
     Feature1  Feature2  Feature3
0    0.972661  1.114450  1.450302
1    0.467319  0.901000  0.917198
2    0.383601  1.358568  1.388989
3    0.799976  0.774102  0.646975
4    1.000851  0.975794  0.831716
..        ...       ...       ...
995  0.437187  1.139626  0.711948
996  0.965916  1.274143  0.574016
997  0.548840  0.918681  0.447651
998  0.008306  1.018361  0.762395
999  0.479283  1.199449  1.158380
<BLANKLINE>
[1000 rows x 3 columns]
Expand source code
class SkewedFeatureNormalizer(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    """
    A scikit-learn compatible transformer for normalizing skewed numerical features.

    Columns whose sample skewness is strictly greater than ``threshold`` are
    transformed with ``scipy.special.boxcox1p``. The Box-Cox lambda of each
    selected column is estimated once, in :meth:`fit`, and reused by
    :meth:`transform`, so data seen after fitting is transformed with the
    parameters learned from the training data.

    Parameters
    ----------
    threshold : float, optional
        Columns whose skewness exceeds this value are normalized.
        Defaults to 0.5.

    Attributes
    ----------
    skewed_features : pd.Index
        Labels of the columns selected in ``fit``, ordered by decreasing
        skewness.
    lambdas_ : dict
        Maps each label in ``skewed_features`` to the Box-Cox lambda
        estimated from the training data.
    n_features_in_ : int
        Number of columns seen during ``fit``.
    feature_names_in_ : ndarray of object
        Column names seen during ``fit``; only set when every column name
        is a string.

    Notes
    -----
    Lambda estimation is performed on ``feature + 1``.
    NOTE(review): Box-Cox is defined for positive data, so selected columns
    are presumably expected to hold values greater than -1 - confirm
    against the scipy version in use.

    Examples
    --------

    >>> import pandas as pd
    >>> import numpy as np
    >>> from pandas_processors.normalize import SkewedFeatureNormalizer
    >>> # Set the seed value
    >>> np.random.seed(123)
    >>> # Create a sample DataFrame with skewed features
    >>> data = {
    ...     'Feature1': np.random.exponential(scale=2, size=1000),
    ...     'Feature2': np.random.lognormal(mean=1, sigma=0.5, size=1000),
    ...     'Feature3': np.random.gamma(shape=2, scale=1, size=1000)
    ... }
    >>> df = pd.DataFrame(data)
    >>> normalizer = SkewedFeatureNormalizer(threshold=0.5)
    >>> normalizer.fit_transform(df)
         Feature1  Feature2  Feature3
    0    0.972661  1.114450  1.450302
    1    0.467319  0.901000  0.917198
    2    0.383601  1.358568  1.388989
    3    0.799976  0.774102  0.646975
    4    1.000851  0.975794  0.831716
    ..        ...       ...       ...
    995  0.437187  1.139626  0.711948
    996  0.965916  1.274143  0.574016
    997  0.548840  0.918681  0.447651
    998  0.008306  1.018361  0.762395
    999  0.479283  1.199449  1.158380
    <BLANKLINE>
    [1000 rows x 3 columns]
    """

    def __init__(self, threshold: float = 0.5):
        self.threshold = threshold

    @staticmethod
    def _normalize_a_feature(
        feature: pd.Series, lmbda: Optional[float] = None
    ) -> pd.Series:
        """Apply the ``boxcox1p`` transformation to a numerical feature.

        Parameters
        ----------
        feature : pd.Series
            The numerical feature to be normalized.
        lmbda : Optional[float], optional
            Box-Cox lambda to apply. When None (the default) the lambda is
            estimated from ``feature + 1`` with ``boxcox_normmax``.

        Returns
        -------
        pd.Series
            The feature after applying ``boxcox1p`` with the given (or
            estimated) lambda.
        """
        if lmbda is None:
            lmbda = boxcox_normmax(feature + 1)
        return boxcox1p(feature, lmbda)

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Select the skewed columns of ``X`` and estimate their Box-Cox lambdas.

        Parameters
        ----------
        X : pd.DataFrame
            The training dataset.
        y : Optional[pd.Series], optional
            The label, by default None. It is not used.

        Returns
        --------
        self
            Returns the transformer instance.
        """
        feature_skewness = X.apply(skew).sort_values(ascending=False)
        self.skewed_features = feature_skewness[feature_skewness > self.threshold].index

        # Learn the lambdas here so that transform() applies the training
        # parameters instead of re-estimating them on every dataset.
        self.lambdas_ = {
            name: float(boxcox_normmax(X[name] + 1)) for name in self.skewed_features
        }

        # OneToOneFeatureMixin.get_feature_names_out() requires n_features_in_.
        self.n_features_in_ = X.shape[1]
        if all(isinstance(name, str) for name in X.columns):
            self.feature_names_in_ = X.columns.to_numpy(dtype=object)
        elif hasattr(self, "feature_names_in_"):
            # Drop names left over from an earlier fit on string-named columns.
            del self.feature_names_in_
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Normalize skewed numerical features in the DataFrame.

        Parameters
        ----------
        X : pd.DataFrame
            The input DataFrame containing the numerical features to be normalized.

        Returns
        -------
        pd.DataFrame
            A copy of ``X`` in which the columns selected during ``fit`` have
            been transformed with the lambdas stored in ``lambdas_``.
        """
        X = X.copy()
        if len(self.skewed_features) == 0:
            return X

        # A column without a stored lambda (e.g. an instance fitted before
        # lambdas_ existed) falls back to estimating it from the data given.
        lambdas = getattr(self, "lambdas_", {})
        X[self.skewed_features] = X[self.skewed_features].apply(
            lambda feature: self._normalize_a_feature(
                feature, lambdas.get(feature.name)
            )
        )
        return X

Ancestors

  • sklearn.base.BaseEstimator
  • sklearn.utils._estimator_html_repr._HTMLDocumentationLinkMixin
  • sklearn.utils._metadata_requests._MetadataRequester
  • sklearn.base.TransformerMixin
  • sklearn.utils._set_output._SetOutputMixin
  • sklearn.base.OneToOneFeatureMixin

Methods

def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None)

Fit the transformer to the input data.

Parameters

X : pd.DataFrame
The training dataset.
y : Optional[pd.Series], optional
The label, by default None

Returns

self
Returns the transformer instance.
Expand source code
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Determine which columns of the training data count as skewed.

    The skewness of every column is computed, the columns are ranked from
    most to least skewed, and the labels of those whose skewness is
    strictly greater than ``self.threshold`` are stored on the instance as
    ``skewed_features``.

    Parameters
    ----------
    X : pd.DataFrame
        The training dataset.
    y : Optional[pd.Series], optional
        The label, by default None. It is not used.

    Returns
    --------
    self
        Returns the transformer instance.
    """
    skewness_by_column = X.apply(skew)
    ranked = skewness_by_column.sort_values(ascending=False)
    exceeds_threshold = ranked > self.threshold
    self.skewed_features = ranked[exceeds_threshold].index
    return self
def transform(self, X: pd.DataFrame) ‑> pandas.core.frame.DataFrame

Normalize skewed numerical features in the DataFrame.

Parameters

X : pd.DataFrame
The input DataFrame containing the numerical features to be normalized.

Returns

pd.DataFrame
The input DataFrame with skewed numerical features normalized.
Expand source code
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``X`` with the skewed columns normalized.

    The columns listed in ``self.skewed_features`` are each passed through
    ``self._normalize_a_feature``; every other column is carried over
    unchanged. The caller's DataFrame is not modified.

    Parameters
    ----------
    X : pd.DataFrame
        The input DataFrame containing the numerical features to be normalized.

    Returns
    -------
    pd.DataFrame
        A new DataFrame with the skewed numerical features normalized.
    """
    result = X.copy()
    skewed = self.skewed_features
    normalized = result[skewed].apply(self._normalize_a_feature)
    result[skewed] = normalized
    return result