Module pandas_processors.create
Expand source code
from __future__ import annotations
from typing import Callable, Optional
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin
class SumFeatures(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    """
    A scikit-learn compatible transformer that adds the (optionally weighted)
    sum of several features as a new feature.

    Parameters
    ----------
    features : list[str]
        The list of feature names to be summed.
    new_feature_name : str
        The name of the new feature created by summing the input features.
    drop_original : bool, optional
        Whether to drop the original features after creating the new feature.
        Default is False. The newly created feature is never dropped, even
        when its name is also listed in ``features``.
    weights : list[float], optional
        The weights to be applied to each feature during the summation.
        If None, every feature has a weight of 1.
        If provided, the length of weights must be the same as the length of features.

    Examples
    --------
    >>> import pandas as pd
    >>> from pandas_processors.create import SumFeatures
    >>> X = pd.DataFrame({"col1": [1, 2, 3], "col2": [1, 1, 1], "col3": [2, 2, 2]})
    >>> sum_feature = SumFeatures(
    ...     features=["col1", "col2", "col3"],
    ...     new_feature_name="col4",
    ...     weights=[1, 2, 0.5],
    ... )
    >>> sum_feature.fit_transform(X)
       col1  col2  col3  col4
    0     1     1     2   4.0
    1     2     1     2   5.0
    2     3     1     2   6.0
    """

    def __init__(
        self,
        features: list[str],
        new_feature_name: str,
        drop_original: bool = False,
        weights: Optional[list[float]] = None,
    ) -> None:
        self.features = features
        self.new_feature_name = new_feature_name
        self.drop_original = drop_original
        self.weights = weights

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Fit the transformer to the input data.

        Nothing is learned from the data; the method only exists so the
        transformer can be used wherever a fit/transform pair is expected.

        Parameters
        ----------
        X : pd.DataFrame
            The training dataset.
        y : Optional[pd.Series], optional
            The label, by default None

        Returns
        -------
        self
            Returns the transformer instance.
        """
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Transform the input data by summing the specified features into a new feature.

        Parameters
        ----------
        X : pd.DataFrame
            The input DataFrame containing the data to be processed.

        Returns
        -------
        pd.DataFrame
            A copy of the input DataFrame with a new feature created by
            summing the specified features. The input itself is not modified.

        Raises
        ------
        ValueError
            If ``weights`` is given and its length differs from ``features``.
        """
        # Validate before copying so a bad configuration fails without the copy.
        weights = self._get_weights()
        X = X.copy()
        X[self.new_feature_name] = np.dot(X[self.features], weights)
        if self.drop_original:
            # The new feature may reuse the name of an input column; dropping
            # that name would throw away the sum that was just computed.
            to_drop = [
                name for name in self.features if name != self.new_feature_name
            ]
            X = X.drop(columns=to_drop)
        return X

    def _get_weights(self) -> list[float]:
        """
        Get the weights to be applied to each feature during the summation.

        Returns
        -------
        list[float]
            The weights to be applied to each feature: the user-supplied
            weights, or 1.0 per feature when none were given.

        Raises
        ------
        ValueError
            If the number of weights differs from the number of features.
        """
        if self.weights is None:
            return [1.0] * len(self.features)
        if len(self.weights) != len(self.features):
            raise ValueError(
                "The length of weights must be the same as the length of features."
            )
        return self.weights
class ConditionalFeatures(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    """
    A scikit-learn compatible transformer that creates new features based on a
    condition applied to existing features.

    Parameters
    ----------
    features : str | list[str]
        The name(s) of the feature(s) to apply the condition to.
    condition : callable
        A callable object or function that defines the condition to be applied
        to each value in the features. The condition should return True or False.
    true_value : int | float | str
        The value to assign to the new feature(s) when the condition is True.
    false_value : int | float | str
        The value to assign to the new feature(s) when the condition is False.
    new_feature_names : str | list[str]
        The name(s) of the new feature(s) created by applying the condition.
        One new name is required per feature: a single string is only valid
        together with a single feature, and a list must have the same length
        as ``features``.
    drop_original : bool, optional
        Whether to drop the original features after creating the new features.
        Default is False. A column that is both an input and a new feature
        (in-place recoding) is kept.

    Raises
    ------
    ValueError
        If ``features`` and ``new_feature_names`` differ in length.

    Examples
    --------
    >>> import pandas as pd
    >>> from pandas_processors.create import ConditionalFeatures
    >>> X = pd.DataFrame({"col1": [-1, 2, 3], "col2": [4, -5, 6]})
    >>> # Use ConditionalFeatures with one feature
    >>> conditional_feature = ConditionalFeatures(
    ...     features="col1",
    ...     new_feature_names="col3",
    ...     condition=lambda x: x > 0,
    ...     true_value=1,
    ...     false_value=0,
    ... )
    >>> conditional_feature.fit_transform(X)
       col1  col2  col3
    0    -1     4     0
    1     2    -5     1
    2     3     6     1
    >>> # Use ConditionalFeatures with multiple features
    >>> conditional_feature = ConditionalFeatures(
    ...     features=["col1", "col2"],
    ...     new_feature_names=["col3", "col4"],
    ...     condition=lambda x: x > 0,
    ...     true_value=1,
    ...     false_value=0,
    ... )
    >>> conditional_feature.fit_transform(X)
       col1  col2  col3  col4
    0    -1     4     0     1
    1     2    -5     1     0
    2     3     6     1     1
    """

    def __init__(
        self,
        features: str | list[str],
        condition: Callable,
        true_value: int | float | str,
        false_value: int | float | str,
        new_feature_names: str | list[str],
        drop_original: bool = False,
    ) -> None:
        # Kept for backward compatibility: the stored attributes are always
        # lists and a length mismatch is reported at construction time.
        if isinstance(features, str):
            features = [features]
        if isinstance(new_feature_names, str):
            new_feature_names = [new_feature_names]
        if len(features) != len(new_feature_names):
            raise ValueError(
                "The length of features and new_feature_names must be the same."
            )
        self.features = features
        self.condition = condition
        self.true_value = true_value
        self.false_value = false_value
        self.new_feature_names = new_feature_names
        self.drop_original = drop_original

    @staticmethod
    def _as_name_list(names: str | list[str]) -> list[str]:
        """Return ``names`` as a list, wrapping a single string."""
        return [names] if isinstance(names, str) else list(names)

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Fit the transformer to the input data.

        Nothing is learned from the data; the method only exists so the
        transformer can be used wherever a fit/transform pair is expected.

        Parameters
        ----------
        X : pd.DataFrame
            The training dataset.
        y : Optional[pd.Series], optional
            The label, by default None

        Returns
        -------
        self
            Returns the transformer instance.
        """
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Transform the input data by applying the condition to create new features.

        Parameters
        ----------
        X : pd.DataFrame
            The input DataFrame containing the data to be processed.

        Returns
        -------
        pd.DataFrame
            A copy of the input DataFrame with new features created by
            applying the condition. The input itself is not modified.

        Raises
        ------
        ValueError
            If ``features`` and ``new_feature_names`` differ in length.
        """
        # Attributes can be reassigned after construction (e.g. through
        # set_params), which skips the normalisation done in __init__.
        features = self._as_name_list(self.features)
        new_names = self._as_name_list(self.new_feature_names)
        if len(features) != len(new_names):
            raise ValueError(
                "The length of features and new_feature_names must be the same."
            )

        X = X.copy()
        selected = X[features]
        # DataFrame.map was added in pandas 2.1; older releases only offer the
        # equivalent (now deprecated) DataFrame.applymap.
        if hasattr(pd.DataFrame, "map"):
            apply_elementwise = selected.map
        else:
            apply_elementwise = selected.applymap
        X[new_names] = apply_elementwise(
            lambda value: self.true_value
            if self.condition(value)
            else self.false_value
        )

        if self.drop_original:
            # A feature recoded in place shares its name with a new feature;
            # dropping it would discard the result that was just written.
            created = set(new_names)
            X = X.drop(columns=[name for name in features if name not in created])
        return X
if __name__ == "__main__":
    # Running the module directly executes the docstring examples as doctests.
    from doctest import testmod

    testmod()
Classes
class ConditionalFeatures (features: str | list[str], condition: Callable, true_value: int | float | str, false_value: int | float | str, new_feature_names: str | list[str], drop_original: bool = False)
-
A scikit-learn compatible transformer that creates new features based on a condition applied to existing features.
Parameters
features
:str | list[str]
- The name(s) of the feature(s) to apply the condition to.
condition
:callable
- A callable object or function that defines the condition to be applied to each value in the features. The condition should return True or False.
true_value
:int | float | str
- The value to assign to the new feature(s) when the condition is True.
false_value
:int | float | str
- The value to assign to the new feature(s) when the condition is False.
new_feature_names
:str | list[str]
- The name(s) of the new feature(s) created by applying the condition. One new name is required per feature: a single name is only valid together with a single feature, and a list of names must have the same length as the features (otherwise a ValueError is raised).
drop_original
:bool, optional
- Whether to drop the original features after creating the new features. Default is False.
Examples
>>> import pandas as pd
>>> from pandas_processors.create import ConditionalFeatures
>>> X = pd.DataFrame({"col1": [-1, 2, 3], "col2": [4, -5, 6]})
>>> # Use ConditionalFeatures with one feature
>>> conditional_feature = ConditionalFeatures(
...     features="col1",
...     new_feature_names="col3",
...     condition=lambda x: x > 0,
...     true_value=1,
...     false_value=0,
... )
>>> conditional_feature.fit_transform(X)
   col1  col2  col3
0    -1     4     0
1     2    -5     1
2     3     6     1
>>> # Use ConditionalFeatures with multiple features
>>> conditional_feature = ConditionalFeatures(
...     features=["col1", "col2"],
...     new_feature_names=["col3", "col4"],
...     condition=lambda x: x > 0,
...     true_value=1,
...     false_value=0,
... )
>>> conditional_feature.fit_transform(X)
   col1  col2  col3  col4
0    -1     4     0     1
1     2    -5     1     0
2     3     6     1     1
Expand source code
class ConditionalFeatures(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    """
    A scikit-learn compatible transformer that creates new features based on a
    condition applied to existing features.

    Parameters
    ----------
    features : str | list[str]
        The name(s) of the feature(s) to apply the condition to.
    condition : callable
        A callable object or function that defines the condition to be applied
        to each value in the features. The condition should return True or False.
    true_value : int | float | str
        The value to assign to the new feature(s) when the condition is True.
    false_value : int | float | str
        The value to assign to the new feature(s) when the condition is False.
    new_feature_names : str | list[str]
        The name(s) of the new feature(s) created by applying the condition.
        One new name is required per feature: a single string is only valid
        together with a single feature, and a list must have the same length
        as ``features``.
    drop_original : bool, optional
        Whether to drop the original features after creating the new features.
        Default is False. A column that is both an input and a new feature
        (in-place recoding) is kept.

    Raises
    ------
    ValueError
        If ``features`` and ``new_feature_names`` differ in length.

    Examples
    --------
    >>> import pandas as pd
    >>> from pandas_processors.create import ConditionalFeatures
    >>> X = pd.DataFrame({"col1": [-1, 2, 3], "col2": [4, -5, 6]})
    >>> # Use ConditionalFeatures with one feature
    >>> conditional_feature = ConditionalFeatures(
    ...     features="col1",
    ...     new_feature_names="col3",
    ...     condition=lambda x: x > 0,
    ...     true_value=1,
    ...     false_value=0,
    ... )
    >>> conditional_feature.fit_transform(X)
       col1  col2  col3
    0    -1     4     0
    1     2    -5     1
    2     3     6     1
    >>> # Use ConditionalFeatures with multiple features
    >>> conditional_feature = ConditionalFeatures(
    ...     features=["col1", "col2"],
    ...     new_feature_names=["col3", "col4"],
    ...     condition=lambda x: x > 0,
    ...     true_value=1,
    ...     false_value=0,
    ... )
    >>> conditional_feature.fit_transform(X)
       col1  col2  col3  col4
    0    -1     4     0     1
    1     2    -5     1     0
    2     3     6     1     1
    """

    def __init__(
        self,
        features: str | list[str],
        condition: Callable,
        true_value: int | float | str,
        false_value: int | float | str,
        new_feature_names: str | list[str],
        drop_original: bool = False,
    ) -> None:
        # Kept for backward compatibility: the stored attributes are always
        # lists and a length mismatch is reported at construction time.
        if isinstance(features, str):
            features = [features]
        if isinstance(new_feature_names, str):
            new_feature_names = [new_feature_names]
        if len(features) != len(new_feature_names):
            raise ValueError(
                "The length of features and new_feature_names must be the same."
            )
        self.features = features
        self.condition = condition
        self.true_value = true_value
        self.false_value = false_value
        self.new_feature_names = new_feature_names
        self.drop_original = drop_original

    @staticmethod
    def _as_name_list(names: str | list[str]) -> list[str]:
        """Return ``names`` as a list, wrapping a single string."""
        return [names] if isinstance(names, str) else list(names)

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Fit the transformer to the input data.

        Nothing is learned from the data; the method only exists so the
        transformer can be used wherever a fit/transform pair is expected.

        Parameters
        ----------
        X : pd.DataFrame
            The training dataset.
        y : Optional[pd.Series], optional
            The label, by default None

        Returns
        -------
        self
            Returns the transformer instance.
        """
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Transform the input data by applying the condition to create new features.

        Parameters
        ----------
        X : pd.DataFrame
            The input DataFrame containing the data to be processed.

        Returns
        -------
        pd.DataFrame
            A copy of the input DataFrame with new features created by
            applying the condition. The input itself is not modified.

        Raises
        ------
        ValueError
            If ``features`` and ``new_feature_names`` differ in length.
        """
        # Attributes can be reassigned after construction (e.g. through
        # set_params), which skips the normalisation done in __init__.
        features = self._as_name_list(self.features)
        new_names = self._as_name_list(self.new_feature_names)
        if len(features) != len(new_names):
            raise ValueError(
                "The length of features and new_feature_names must be the same."
            )

        X = X.copy()
        selected = X[features]
        # DataFrame.map was added in pandas 2.1; older releases only offer the
        # equivalent (now deprecated) DataFrame.applymap.
        if hasattr(pd.DataFrame, "map"):
            apply_elementwise = selected.map
        else:
            apply_elementwise = selected.applymap
        X[new_names] = apply_elementwise(
            lambda value: self.true_value
            if self.condition(value)
            else self.false_value
        )

        if self.drop_original:
            # A feature recoded in place shares its name with a new feature;
            # dropping it would discard the result that was just written.
            created = set(new_names)
            X = X.drop(columns=[name for name in features if name not in created])
        return X
Ancestors
- sklearn.base.BaseEstimator
- sklearn.utils._estimator_html_repr._HTMLDocumentationLinkMixin
- sklearn.utils._metadata_requests._MetadataRequester
- sklearn.base.TransformerMixin
- sklearn.utils._set_output._SetOutputMixin
- sklearn.base.OneToOneFeatureMixin
Methods
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None)
-
Fit the transformer to the input data.
Parameters
X
:pd.DataFrame
- The training dataset.
y
:Optional[pd.Series], optional
- The label, by default None
Returns
self
- Returns the transformer instance.
Expand source code
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Fit the transformer to the input data.

    The transformer has nothing to learn, so the data is ignored and the
    instance is handed back unchanged.

    Parameters
    ----------
    X : pd.DataFrame
        The training dataset.
    y : Optional[pd.Series], optional
        The label, by default None

    Returns
    -------
    self
        Returns the transformer instance.
    """
    return self
def transform(self, X: pd.DataFrame) ‑> pandas.core.frame.DataFrame
-
Transform the input data by applying the condition to create new features.
Parameters
X
:pd.DataFrame
- The input DataFrame containing the data to be processed.
Returns
pd.DataFrame
- The input DataFrame with new features created by applying the condition.
Expand source code
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Transform the input data by applying the condition to create new features.

    Parameters
    ----------
    X : pd.DataFrame
        The input DataFrame containing the data to be processed.

    Returns
    -------
    pd.DataFrame
        A copy of the input DataFrame with new features created by applying
        the condition. The input itself is not modified.

    Raises
    ------
    ValueError
        If ``features`` and ``new_feature_names`` differ in length.
    """
    # Attributes can be reassigned after construction (e.g. through
    # set_params), so a bare string may show up here again.
    features = (
        [self.features] if isinstance(self.features, str) else list(self.features)
    )
    new_names = (
        [self.new_feature_names]
        if isinstance(self.new_feature_names, str)
        else list(self.new_feature_names)
    )
    if len(features) != len(new_names):
        raise ValueError(
            "The length of features and new_feature_names must be the same."
        )

    X = X.copy()
    selected = X[features]
    # DataFrame.map was added in pandas 2.1; older releases only offer the
    # equivalent (now deprecated) DataFrame.applymap.
    if hasattr(pd.DataFrame, "map"):
        apply_elementwise = selected.map
    else:
        apply_elementwise = selected.applymap
    X[new_names] = apply_elementwise(
        lambda value: self.true_value if self.condition(value) else self.false_value
    )

    if self.drop_original:
        # A feature recoded in place shares its name with a new feature;
        # dropping it would discard the result that was just written.
        created = set(new_names)
        X = X.drop(columns=[name for name in features if name not in created])
    return X
class SumFeatures (features: list[str], new_feature_name: str, drop_original: bool = False, weights: Optional[list[float]] = None)
-
A scikit-learn compatible transformer that sums multiple features into a new feature.
Parameters
features
:list[str]
- The list of feature names to be summed.
new_feature_name
:str
- The name of the new feature created by summing the input features.
drop_original
:bool, optional
- Whether to drop the original features after creating the new feature. Default is False.
weights
:list[float], optional
- The weights to be applied to each feature during the summation. If None, every feature has a weight of 1. If provided, the length of weights must be the same as the length of features.
Examples
>>> import pandas as pd
>>> from pandas_processors.create import SumFeatures
>>> X = pd.DataFrame({"col1": [1, 2, 3], "col2": [1, 1, 1], "col3": [2, 2, 2]})
>>> sum_feature = SumFeatures(
...     features=["col1", "col2", "col3"],
...     new_feature_name="col4",
...     weights=[1, 2, 0.5],
... )
>>> sum_feature.fit_transform(X)
   col1  col2  col3  col4
0     1     1     2   4.0
1     2     1     2   5.0
2     3     1     2   6.0
Expand source code
class SumFeatures(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    """
    A scikit-learn compatible transformer that adds the (optionally weighted)
    sum of several features as a new feature.

    Parameters
    ----------
    features : list[str]
        The list of feature names to be summed.
    new_feature_name : str
        The name of the new feature created by summing the input features.
    drop_original : bool, optional
        Whether to drop the original features after creating the new feature.
        Default is False. The newly created feature is never dropped, even
        when its name is also listed in ``features``.
    weights : list[float], optional
        The weights to be applied to each feature during the summation.
        If None, every feature has a weight of 1.
        If provided, the length of weights must be the same as the length of features.

    Examples
    --------
    >>> import pandas as pd
    >>> from pandas_processors.create import SumFeatures
    >>> X = pd.DataFrame({"col1": [1, 2, 3], "col2": [1, 1, 1], "col3": [2, 2, 2]})
    >>> sum_feature = SumFeatures(
    ...     features=["col1", "col2", "col3"],
    ...     new_feature_name="col4",
    ...     weights=[1, 2, 0.5],
    ... )
    >>> sum_feature.fit_transform(X)
       col1  col2  col3  col4
    0     1     1     2   4.0
    1     2     1     2   5.0
    2     3     1     2   6.0
    """

    def __init__(
        self,
        features: list[str],
        new_feature_name: str,
        drop_original: bool = False,
        weights: Optional[list[float]] = None,
    ) -> None:
        self.features = features
        self.new_feature_name = new_feature_name
        self.drop_original = drop_original
        self.weights = weights

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Fit the transformer to the input data.

        Nothing is learned from the data; the method only exists so the
        transformer can be used wherever a fit/transform pair is expected.

        Parameters
        ----------
        X : pd.DataFrame
            The training dataset.
        y : Optional[pd.Series], optional
            The label, by default None

        Returns
        -------
        self
            Returns the transformer instance.
        """
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Transform the input data by summing the specified features into a new feature.

        Parameters
        ----------
        X : pd.DataFrame
            The input DataFrame containing the data to be processed.

        Returns
        -------
        pd.DataFrame
            A copy of the input DataFrame with a new feature created by
            summing the specified features. The input itself is not modified.

        Raises
        ------
        ValueError
            If ``weights`` is given and its length differs from ``features``.
        """
        # Validate before copying so a bad configuration fails without the copy.
        weights = self._get_weights()
        X = X.copy()
        X[self.new_feature_name] = np.dot(X[self.features], weights)
        if self.drop_original:
            # The new feature may reuse the name of an input column; dropping
            # that name would throw away the sum that was just computed.
            to_drop = [
                name for name in self.features if name != self.new_feature_name
            ]
            X = X.drop(columns=to_drop)
        return X

    def _get_weights(self) -> list[float]:
        """
        Get the weights to be applied to each feature during the summation.

        Returns
        -------
        list[float]
            The weights to be applied to each feature: the user-supplied
            weights, or 1.0 per feature when none were given.

        Raises
        ------
        ValueError
            If the number of weights differs from the number of features.
        """
        if self.weights is None:
            return [1.0] * len(self.features)
        if len(self.weights) != len(self.features):
            raise ValueError(
                "The length of weights must be the same as the length of features."
            )
        return self.weights
Ancestors
- sklearn.base.BaseEstimator
- sklearn.utils._estimator_html_repr._HTMLDocumentationLinkMixin
- sklearn.utils._metadata_requests._MetadataRequester
- sklearn.base.TransformerMixin
- sklearn.utils._set_output._SetOutputMixin
- sklearn.base.OneToOneFeatureMixin
Methods
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None)
-
Fit the transformer to the input data.
Parameters
X
:pd.DataFrame
- The training dataset.
y
:Optional[pd.Series], optional
- The label, by default None
Returns
self
- Returns the transformer instance.
Expand source code
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Fit the transformer to the input data.

    The transformer has nothing to learn, so the data is ignored and the
    instance is handed back unchanged.

    Parameters
    ----------
    X : pd.DataFrame
        The training dataset.
    y : Optional[pd.Series], optional
        The label, by default None

    Returns
    -------
    self
        Returns the transformer instance.
    """
    return self
def transform(self, X: pd.DataFrame) ‑> pandas.core.frame.DataFrame
-
Transform the input data by summing the specified features into a new feature.
Parameters
X
:pd.DataFrame
- The input DataFrame containing the data to be processed.
Returns
pd.DataFrame
- The input DataFrame with a new feature created by summing the specified features.
Expand source code
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Transform the input data by summing the specified features into a new feature.

    Parameters
    ----------
    X : pd.DataFrame
        The input DataFrame containing the data to be processed.

    Returns
    -------
    pd.DataFrame
        A copy of the input DataFrame with a new feature created by summing
        the specified features. The input itself is not modified.
    """
    # Resolve the weights before copying so a bad configuration fails
    # without paying for the copy.
    weights = self._get_weights()
    X = X.copy()
    X[self.new_feature_name] = np.dot(X[self.features], weights)
    if self.drop_original:
        # The new feature may reuse the name of an input column; dropping
        # that name would throw away the sum that was just computed.
        to_drop = [name for name in self.features if name != self.new_feature_name]
        X = X.drop(columns=to_drop)
    return X