Source code for autoimpute.imputations.dataframe.base_imputer

"""Module for BaseImputer - a base class for DataFrame imputers.

This module contains the `BaseImputer`, which is used to abstract away
functionality in both DataFrame imputers. The `BaseImputer` also holds the
methods available for imputation analysis.
"""

import warnings
from autoimpute.utils import check_strategy_allowed
from autoimpute.imputations import method_names
from ..series import DefaultUnivarImputer, DefaultPredictiveImputer
from ..series import DefaultTimeSeriesImputer
from ..series import MeanImputer, MedianImputer, ModeImputer
from ..series import NormImputer, CategoricalImputer
from ..series import RandomImputer, InterpolateImputer
from ..series import LOCFImputer, NOCBImputer
from ..series import LeastSquaresImputer, StochasticImputer
from ..series import PMMImputer, LRDImputer
from ..series import BinaryLogisticImputer, MultinomialLogisticImputer
from ..series import BayesianLeastSquaresImputer
from ..series import BayesianBinaryLogisticImputer
from ..series import NormUnitVarianceImputer
# short alias so strategy-name constants can be referenced as `methods.<NAME>`
methods = method_names

# pylint:disable=attribute-defined-outside-init
# pylint:disable=too-many-arguments
# pylint:disable=too-many-instance-attributes
# pylint:disable=inconsistent-return-statements

class BaseImputer:
    """Building blocks for more advanced DataFrame imputers.

    The BaseImputer is not a stand-alone class and thus serves no purpose
    other than as a parent to Imputers. Therefore, the BaseImputer should not
    be used directly unless creating an Imputer. That being said, all
    DataFrame Imputers should inherit from BaseImputer. It contains base
    functionality for any new DataFrame Imputer, and it holds the set of
    strategies that make up this imputation library.

    Attributes:
        univariate_strategies (dict): univariate imputation methods.
            | Key = imputation name; Value = series imputer to perform it.
            | `univariate default` mean for numerical, mode for categorical.
            | `time default` interpolate for numerical, mode for categorical.
            | `mean` imputes missing values with the average of the series.
            | `median` imputes missing values with the median of the series.
            | `mode` imputes missing values with the mode of the series.
            |     Method handles more than one mode (see ModeImputer for info).
            | `random` imputes random choice from set of series unique vals.
            | `norm` imputes series w/ random draws from normal distribution.
            |     Mean and std calculated from observed values of the series.
            | `categorical` imputes series using random draws from pmf.
            |     Proportions calculated from non-missing category instances.
            | `interpolate` imputes series using chosen interpolation method.
            |     Default is linear. See InterpolateImputer for more info.
            | `locf` imputes series carrying last observation moving forward.
            | `nocb` imputes series carrying next observation moving backward.
            | `normal unit variance` imputes using unit variance w/ norm.
        predictive_strategies (dict): predictive imputation methods.
            | Key = imputation name; Value = series imputer to perform it.
            | `predictive default` pmm for numerical, logistic for categorical.
            | `least squares` predict missing values from linear regression.
            | `binary logistic` predict missing values with 2 classes.
            | `multinomial logistic` predict missing values with multiclass.
            | `stochastic` linear regression+random draw from norm w/ mse std.
            | `bayesian least squares` draw from the posterior predictive
            |     distribution for each missing value, using OLS model.
            | `bayesian binary logistic` draw from the posterior predictive
            |     distribution for each missing value, using logistic model.
            | `pmm` imputes series using predictive mean matching. PMM is a
            |     semi-supervised method using bayesian & hot-deck imputation.
            | `lrd` imputes series using local residual draws. LRD is a
            |     semi-supervised method using bayesian & hot-deck imputation.
        strategies (dict): union of the predictive and univariate strategies.
            Its keys are the names accepted by the `strategy` property.
        visit_sequences (tuple): names accepted by the `visit` property.
    """

    univariate_strategies = {
        methods.DEFAULT_UNIVAR: DefaultUnivarImputer,
        methods.DEFAULT_TIME: DefaultTimeSeriesImputer,
        methods.MEAN: MeanImputer,
        methods.MEDIAN: MedianImputer,
        methods.MODE: ModeImputer,
        methods.RANDOM: RandomImputer,
        methods.NORM: NormImputer,
        methods.CATEGORICAL: CategoricalImputer,
        methods.INTERPOLATE: InterpolateImputer,
        methods.LOCF: LOCFImputer,
        methods.NOCB: NOCBImputer,
        methods.NORM_UNIT_VARIANCE: NormUnitVarianceImputer,
    }

    predictive_strategies = {
        methods.DEFAULT_PRED: DefaultPredictiveImputer,
        methods.LS: LeastSquaresImputer,
        methods.STOCHASTIC: StochasticImputer,
        methods.BINARY_LOGISTIC: BinaryLogisticImputer,
        methods.MULTI_LOGISTIC: MultinomialLogisticImputer,
        methods.BAYESIAN_LS: BayesianLeastSquaresImputer,
        methods.BAYESIAN_BINARY_LOGISTIC: BayesianBinaryLogisticImputer,
        methods.PMM: PMMImputer,
        methods.LRD: LRDImputer
    }

    # univariate entries are unpacked last, so they would win on a key clash
    strategies = {**predictive_strategies, **univariate_strategies}

    visit_sequences = (
        "default",
        "left-to-right"
    )

    def __init__(self, strategy, imp_kwgs, visit):
        """Initialize the BaseImputer.

        All three arguments are required here; each one is routed through
        its property setter, so invalid values fail at construction time.

        Args:
            strategy (str, iter, dict): strategies for imputation.
                If str, single strategy broadcast to all series in DataFrame.
                If iter, must provide 1 strategy per column. Each method w/in
                iterator applies to column with same index value in DataFrame.
                If dict, must provide key = column name, value = imputer.
                Dict the most flexible and PREFERRED way to create custom
                imputation strategies if not using the default. Dict does not
                require method for every column; just those specified as keys.
            imp_kwgs (dict, None): keyword arguments for each imputer.
                None means default imputer created to match specific
                strategy. imp_kwgs keys can be either columns or strategies.
                If strategies, each column given that strategy is
                instantiated with same arguments.
            visit (str): order to visit columns for imputation.
                Must be one of `visit_sequences`; None is rejected by the
                setter. More strategies (random, monotone, etc.) TBD.
        """
        self.strategy = strategy
        self.imp_kwgs = imp_kwgs
        self.visit = visit

    @property
    def strategy(self):
        """Property getter to return the value of the strategy property."""
        return self._strategy

    @strategy.setter
    def strategy(self, s):
        """Validate the strategy property to ensure its type and value.

        Class instance only possible if strategy is proper type, as outlined
        in the init method. Passes supported strategies and user arg to
        helper method, which performs strategy checks.

        Args:
            s (str, iter, dict): Strategy passed as arg to class instance.

        Raises:
            ValueError: Strategies not valid (not in allowed strategies).
            TypeError: Strategy must be a string, tuple, list, or dict.
            Both errors raised through helper method `check_strategy_allowed`.
        """
        strat_names = self.strategies.keys()
        # the helper's return value (not the raw argument) is what is stored
        self._strategy = check_strategy_allowed(strat_names, s)

    @property
    def imp_kwgs(self):
        """Property getter to return the value of imp_kwgs."""
        return self._imp_kwgs

    @imp_kwgs.setter
    def imp_kwgs(self, kwgs):
        """Validate the imp_kwgs and set default properties.

        The BaseImputer validates the `imp_kwgs` argument. `imp_kwgs` contain
        optional keyword arguments for an imputer's strategies or columns.
        None is accepted and stored as-is.

        Args:
            kwgs (dict, None): None or dictionary of keywords.

        Raises:
            ValueError: imp_kwgs not correctly specified as argument.
        """
        if not isinstance(kwgs, (type(None), dict)):
            err = "imp_kwgs must be dict of args used to instantiate Imputer."
            raise ValueError(err)
        self._imp_kwgs = kwgs

    @property
    def visit(self):
        """Property getter to return the value of the visit property."""
        return self._visit

    @visit.setter
    def visit(self, v):
        """Validate the visit property to ensure its type and value.

        Class instance only possible if visit is proper type, as outlined in
        the init method. Visit property must be one of valid sequences in the
        `visit_sequences` variable.

        Args:
            v (str): Visit sequence passed as arg to class instance.

        Raises:
            TypeError: visit sequence must be a string.
            ValueError: visit sequence not in `visit_sequences`.
        """
        # deal with type first
        if not isinstance(v, str):
            err = "visit must be a string specifying visit sequence to use."
            raise TypeError(err)

        # deal with value next
        if v not in self.visit_sequences:
            err = f"visit not valid. Must be one of {self.visit_sequences}"
            raise ValueError(err)

        # otherwise, set property for visit
        self._visit = v

    def _fit_init_params(self, column, method, kwgs):
        """Private method to supply imputation model fit params if any.

        Args:
            column: key looked up in `kwgs` for column-specific params.
            method: key looked up in `kwgs` for strategy-wide params.
            kwgs (dict, None): mapping of column or strategy to a dict of
                keyword arguments, or None when nothing was supplied.

        Returns:
            dict, None: the params stored under `column` if that key exists,
            otherwise those stored under `method`, otherwise None.

        Raises:
            ValueError: `kwgs` is neither None nor a dict, or the selected
                params are neither None nor a dict.
        """
        # first, handle easy case when no kwargs given
        if kwgs is None:
            return None

        # anything other than a dict cannot be searched for params; fail
        # with a clear error instead of falling through to an unbound local
        if not isinstance(kwgs, dict):
            err = "imp_kwgs must be dict of args used to instantiate Imputer."
            raise ValueError(err)

        # next, check if any kwargs for a given Imputer method type
        # then, override those parameters if specific column kwargs supplied
        initial_params = kwgs.get(method, None)
        final_params = kwgs.get(column, initial_params)

        # final params must be None or a dictionary of kwargs
        # this additional validation step is crucial to dictionary unpacking
        if not isinstance(final_params, (type(None), dict)):
            err = "Additional params must be dict of args used to init model."
            raise ValueError(err)
        return final_params

    def _check_if_single_dummy(self, col, X):
        """Private method to check if encoding results in single cat.

        Emits a warning (and returns None) when `X` has exactly one column.

        Args:
            col: name of the original feature, used only in the message.
            X: object exposing `.columns.tolist()`; presumably the encoded
                DataFrame for `col` (confirm against callers).
        """
        cats = X.columns.tolist()
        if len(cats) == 1:
            c = cats[0]
            msg = f"{c} only category for feature {col}."
            cons = f"Consider removing {col} from dataset."
            warnings.warn(f"{msg} {cons}")