# Source code for bcselector.information_theory.j_criterion_approximations

import numpy as np
import warnings
from bcselector.information_theory.basic_approximations import mutual_information, conditional_mutual_information

__all__ = [
    'mim',
    'mifs',
    'mrmr',
    'jmi',
    'cife'
]


def mim(data, target_variable, candidate_variable_index, **kwargs):
    """
    This estimator computes the Mutual Information Maximisation criterion.

    Parameters
    ----------
    data : np.ndarray
        Matrix of data set. Columns are variables, rows are observations.
    target_variable : np.ndarray
        Target variable. Can not be in data!
    candidate_variable_index : int
        Index of candidate variable in data matrix.

    Returns
    -------
    j_criterion_value : float
        J_criterion approximated by the Mutual Information Maximisation.
    """
    assert isinstance(data, np.ndarray), "Argument 'data' must be a numpy matrix"
    assert isinstance(target_variable, np.ndarray), "Argument 'target_variable' must be a numpy matrix"
    assert isinstance(candidate_variable_index, int), "Argument 'candidate_variable_index' must be an integer"
    assert len(data.shape) == 2, "For 'data' argument use numpy array of shape (n,p)"
    assert data.shape[0] == len(target_variable), "Number of rows in 'data' must equal target_variable length"
    assert candidate_variable_index < data.shape[1], "Index 'candidate_variable_index' out of range in 'data'"

    candidate_variable = data[:, candidate_variable_index]

    # MIM scores a candidate purely by its relevance: J(X_k) = I(X_k; Y).
    return mutual_information(candidate_variable, target_variable)
def mifs(data, target_variable, prev_variables_index, candidate_variable_index, **kwargs):
    """
    This estimator computes the Mutual Information Feature Selection criterion.

    Parameters
    ----------
    data : np.ndarray
        Matrix of data set. Columns are variables, rows are observations.
    target_variable : np.ndarray
        Target variable. Can not be in data!
    prev_variables_index : list of ints, set of ints
        Indexes of previously selected variables.
    candidate_variable_index : int
        Index of candidate variable in data matrix.
    beta : float
        Impact of redundancy segment in MIFS approximation.
        Higher the beta is, higher the impact.

    Returns
    -------
    j_criterion_value : float
        J_criterion approximated by the Mutual Information Feature Selection.
    """
    assert isinstance(data, np.ndarray), "Argument 'data' must be a numpy matrix"
    assert isinstance(target_variable, np.ndarray), "Argument 'target_variable' must be a numpy matrix"
    assert isinstance(candidate_variable_index, int), "Argument 'candidate_variable_index' must be an integer"
    assert len(data.shape) == 2, "For 'data' argument use numpy array of shape (n,p)"
    assert data.shape[0] == len(target_variable), "Number of rows in 'data' must equal target_variable length"
    assert candidate_variable_index < data.shape[1], "Index 'candidate_variable_index' out of range in 'data'"
    for i in prev_variables_index:
        assert isinstance(i, int), "All previous variable indexes must be int."

    # Default beta to 1 (with a warning) when the caller did not supply one.
    if kwargs.get('beta') is None:
        beta = 1
        warnings.warn("Parameter `beta` not provided, default value of 1 is selected.", Warning)
    else:
        beta = kwargs.pop('beta')
        assert isinstance(beta, int) or isinstance(beta, float), "Argument 'beta' must be int or float"

    candidate_variable = data[:, candidate_variable_index]

    # Redundancy = sum of I(X_j; X_k) over previously selected X_j;
    # zero when nothing has been selected yet.
    if len(prev_variables_index) == 0:
        redundancy_sum = 0
    else:
        redundancy_sum = np.apply_along_axis(
            mutual_information,
            axis=0,
            arr=data[:, prev_variables_index],
            vector_2=candidate_variable,
        ).sum()

    # J(X_k) = I(X_k; Y) - beta * sum_j I(X_j; X_k)
    return mutual_information(candidate_variable, target_variable) - beta * redundancy_sum
def mrmr(data, target_variable, prev_variables_index, candidate_variable_index, **kwargs):
    """
    This estimator computes the Max-Relevance Min-Redundancy criterion.

    Parameters
    ----------
    data : np.ndarray
        Matrix of data set. Columns are variables, rows are observations.
    target_variable : np.ndarray
        Target variable. Can not be in data!
    prev_variables_index : list of ints
        Indexes of previously selected variables.
    candidate_variable_index : int
        Index of candidate variable in data matrix.

    Returns
    -------
    j_criterion_value : float
        J_criterion approximated by the Max-Relevance Min-Redundancy.
    """
    assert isinstance(data, np.ndarray), "Argument 'data' must be a numpy matrix"
    assert isinstance(target_variable, np.ndarray), "Argument 'target_variable' must be a numpy matrix"
    assert isinstance(candidate_variable_index, int), "Argument 'candidate_variable_index' must be an integer"
    assert len(data.shape) == 2, "For 'data' argument use numpy array of shape (n,p)"
    assert data.shape[0] == len(target_variable), "Number of rows in 'data' must equal target_variable length"
    assert candidate_variable_index < data.shape[1], "Index 'candidate_variable_index' out of range in 'data'"
    for i in prev_variables_index:
        assert isinstance(i, int), "All previous variable indexes must be int."

    candidate_variable = data[:, candidate_variable_index]

    # Guard against division by zero on the first selection step
    # (redundancy_sum is 0 then anyway, so the divisor value is irrelevant).
    prev_variables_len = 1 if len(prev_variables_index) == 0 else len(prev_variables_index)

    # Redundancy = mean of I(X_j; X_k) over previously selected X_j.
    if len(prev_variables_index) == 0:
        redundancy_sum = 0
    else:
        redundancy_sum = np.apply_along_axis(
            mutual_information,
            axis=0,
            arr=data[:, prev_variables_index],
            vector_2=candidate_variable,
        ).sum()

    # J(X_k) = I(X_k; Y) - (1/|S|) * sum_j I(X_j; X_k)
    return mutual_information(candidate_variable, target_variable) - 1 / prev_variables_len * redundancy_sum
def jmi(data, target_variable, prev_variables_index, candidate_variable_index, **kwargs):
    """
    This estimator computes the Joint Mutual Information criterion.

    Parameters
    ----------
    data : np.ndarray
        Matrix of data set. Columns are variables, rows are observations.
    target_variable : np.ndarray
        Target variable. Can not be in data!
    prev_variables_index : list of ints
        Indexes of previously selected variables.
    candidate_variable_index : int
        Index of candidate variable in data matrix.

    Returns
    -------
    j_criterion_value : float
        J_criterion approximated by the Joint Mutual Information.
    """
    assert isinstance(data, np.ndarray), "Argument 'data' must be a numpy matrix"
    assert isinstance(target_variable, np.ndarray), "Argument 'target_variable' must be a numpy matrix"
    assert isinstance(candidate_variable_index, int), "Argument 'candidate_variable_index' must be an integer"
    assert len(data.shape) == 2, "For 'data' argument use numpy array of shape (n,p)"
    assert data.shape[0] == len(target_variable), "Number of rows in 'data' must equal target_variable length"
    assert candidate_variable_index < data.shape[1], "Index 'candidate_variable_index' out of range in 'data'"
    for i in prev_variables_index:
        assert isinstance(i, int), "All previous variable indexes must be int."

    candidate_variable = data[:, candidate_variable_index]

    # Guard against division by zero on the first selection step
    # (redundancy_sum is 0 then anyway, so the divisor value is irrelevant).
    prev_variables_len = 1 if len(prev_variables_index) == 0 else len(prev_variables_index)

    # Redundancy = sum over selected X_j of I(X_j; X_k) - I(X_j; X_k | Y),
    # i.e. redundancy corrected by the class-conditional redundancy term.
    if len(prev_variables_index) == 0:
        redundancy_sum = 0
    else:
        a = np.apply_along_axis(
            mutual_information,
            axis=0,
            arr=data[:, prev_variables_index],
            vector_2=candidate_variable,
        ).sum()
        b = np.apply_along_axis(
            conditional_mutual_information,
            axis=0,
            arr=data[:, prev_variables_index],
            vector_2=candidate_variable,
            condition=target_variable,
        ).sum()
        redundancy_sum = a - b

    # J(X_k) = I(X_k; Y) - (1/|S|) * sum_j [I(X_j; X_k) - I(X_j; X_k | Y)]
    return mutual_information(candidate_variable, target_variable) - 1 / prev_variables_len * redundancy_sum
def cife(data, target_variable, prev_variables_index, candidate_variable_index, **kwargs):
    """
    This estimator computes the Conditional Infomax Feature Extraction criterion.

    Parameters
    ----------
    data : np.ndarray
        Matrix of data set. Columns are variables, rows are observations.
    target_variable : np.ndarray
        Target variable. Can not be in data!
    prev_variables_index : list of ints
        Indexes of previously selected variables.
    candidate_variable_index : int
        Index of candidate variable in data matrix.
    beta : float
        Impact of redundancy segment in CIFE approximation.
        Higher the beta is, higher the impact.

    Returns
    -------
    j_criterion_value : float
        J_criterion approximated by the Conditional Infomax Feature Extraction.
    """
    assert isinstance(data, np.ndarray), "Argument 'data' must be a numpy matrix"
    assert isinstance(target_variable, np.ndarray), "Argument 'target_variable' must be a numpy matrix"
    assert isinstance(candidate_variable_index, int), "Argument 'candidate_variable_index' must be an integer"
    assert len(data.shape) == 2, "For 'data' argument use numpy array of shape (n,p)"
    assert data.shape[0] == len(target_variable), "Number of rows in 'data' must equal target_variable length"
    assert candidate_variable_index < data.shape[1], "Index 'candidate_variable_index' out of range in 'data'"
    for i in prev_variables_index:
        assert isinstance(i, int), "All previous variable indexes must be int."

    # Default beta to 1 (with a warning) when the caller did not supply one.
    # Pass the Warning category explicitly, consistent with mifs().
    if kwargs.get('beta') is None:
        beta = 1
        warnings.warn("Parameter `beta` not provided, default value of 1 is selected.", Warning)
    else:
        beta = kwargs.pop('beta')
        assert isinstance(beta, int) or isinstance(beta, float), "Argument 'beta' must be int or float"

    candidate_variable = data[:, candidate_variable_index]

    # Redundancy = sum over selected X_j of I(X_j; X_k) - I(X_j; X_k | Y).
    if len(prev_variables_index) == 0:
        redundancy_sum = 0
    else:
        a = np.apply_along_axis(
            mutual_information,
            axis=0,
            arr=data[:, prev_variables_index],
            vector_2=candidate_variable,
        ).sum()
        b = np.apply_along_axis(
            conditional_mutual_information,
            axis=0,
            arr=data[:, prev_variables_index],
            vector_2=candidate_variable,
            condition=target_variable,
        ).sum()
        redundancy_sum = a - b

    # J(X_k) = I(X_k; Y) - beta * sum_j [I(X_j; X_k) - I(X_j; X_k | Y)]
    return mutual_information(candidate_variable, target_variable) - beta * redundancy_sum