Source code for autoimpute.imputations.dataframe.single_imputer

"""This module performs one imputation of missing features in a dataset.

This module contains one class - the SingleImputer. Use this class to
impute each Series within a DataFrame one time. This class makes numerous
imputation methods available - both univariate and multivariate. Each method
runs once on its specified column. When one pass through the columns is
complete, the SingleImputer returns the single imputed dataset.
"""

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from autoimpute.utils import check_nan_columns, check_predictors_fit
from autoimpute.utils import check_strategy_fit
from autoimpute.utils.helpers import _one_hot_encode
from autoimpute.imputations.helpers import _get_observed
from .base_imputer import BaseImputer
from ..series import DefaultUnivarImputer

# pylint:disable=attribute-defined-outside-init
# pylint:disable=arguments-differ
# pylint:disable=protected-access
# pylint:disable=too-many-arguments
# pylint:disable=too-many-locals
# pylint:disable=too-many-instance-attributes
# pylint:disable=unused-argument


class SingleImputer(BaseImputer, BaseEstimator, TransformerMixin):
    """Techniques to impute Series with missing values one time.

    The SingleImputer class takes a DataFrame and performs imputations on
    each Series within the DataFrame. The Imputer does one pass for each
    column, and it supports numerous imputation methods for each column.

    The SingleImputer delegates imputation to respective SeriesImputers,
    each of which maps to a specific strategy supported by the SingleImputer.
    Most of the SeriesImputers are inductive (fit and transform for new data).
    Transductive SeriesImputers (such as InterpolateImputer) still perform a
    "mock" fit stage but do all the imputation work in the transform step.
    The fit stage is performed to remain consistent with the sklearn API.

    The class is a valid sklearn transformer that can be used in an sklearn
    Pipeline because it inherits from the TransformerMixin and implements
    both fit and transform methods.
    """
    def __init__(self, strategy="default predictive", predictors="all",
                 imp_kwgs=None, copy=True, seed=None, visit="default"):
        """Create an instance of the SingleImputer class.

        As with sklearn classes, all arguments take default values. Therefore,
        SingleImputer() creates a valid class instance. The instance is used
        to set up a SingleImputer and perform checks on arguments.

        Args:
            strategy (str, iter, dict; optional): strategy for single imputer.
                Default value is str --> `default predictive`. See BaseImputer
                for all available strategies. If str, single strategy
                broadcast to all series in DataFrame. If iter, must provide
                1 strategy per column. Each method w/in iterator applies to
                column with same index value in DataFrame. If dict, must
                provide key = column name, value = imputer. Dict is the most
                flexible and PREFERRED way to create custom imputation
                strategies if not using the default. Dict does not require a
                method for every column; just those specified as keys.
            predictors (str, iter, dict, optional): defaults to `all`, i.e.
                use all predictors. If `all`, every column will be used for
                every class prediction. If a list, subset of columns used for
                all predictions. If a dict, specify which columns to use as
                predictors for each imputation. Columns not specified in dict
                but present in `strategy` receive `all` other cols as preds.
                Note predictors are IGNORED for univariate imputation methods,
                so specifying them is meaningless unless strategy is
                predictive.
            imp_kwgs (dict, optional): keyword args for each SeriesImputer.
                Default is None, which means a default imputer is created to
                match the specific strategy. `imp_kwgs` keys can be either
                columns or strategies. If strategies, each column given that
                strategy is instantiated with the same arguments. When
                strategy is `default`, `imp_kwgs` is ignored.
            copy (bool, optional): create copy of DataFrame or operate
                inplace. Default value is True. Copy created.
            seed (int, optional): seed setting for reproducible results.
                Default is None. No validation, but values should be integer.
            visit (str, optional): order in which to visit columns for
                imputation. Default value is `default`. See BaseImputer for
                valid options.
        """
        BaseImputer.__init__(
            self,
            strategy=strategy,
            imp_kwgs=imp_kwgs,
            visit=visit
        )
        self.strategy = strategy
        self.predictors = predictors
        self.copy = copy
        self.seed = seed
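    # Illustrative note (comments only, not part of the original source): the
    # dict forms of `strategy` and `predictors` can be mixed and matched. The
    # column names below are hypothetical, and the strategy names are examples
    # drawn from the strategies listed in BaseImputer.
    #
    #   si = SingleImputer(
    #       strategy={"salary": "pmm", "gender": "multinomial logistic"},
    #       predictors={"salary": ["age", "gender"]},
    #   )
    #
    # Columns not listed in `strategy` are not imputed; columns listed in
    # `strategy` but absent from `predictors` use all other columns as
    # predictors.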
    def _fit_strategy_validator(self, X):
        """Private method to validate strategies appropriate for fit.

        Checks whether strategies match with type of column they are applied
        to. If not, error is raised through `check_strategy_fit` method.
        """
        # remove nan columns and store colnames
        cols = X.columns.tolist()
        self._strats = check_strategy_fit(self.strategy, cols)
        self._preds = check_predictors_fit(self.predictors, cols)

    def _transform_strategy_validator(self, X):
        """Private method to prep and validate before transformation."""
        # initial checks before transformation and check columns are the same
        check_is_fitted(self, "statistics_")
        X_cols = X.columns.tolist()
        fit_cols = set(self._strats.keys())
        diff_fit = set(fit_cols).difference(X_cols)
        if diff_fit:
            err = "Same columns that were fit must appear in transform."
            raise ValueError(err)
    @check_nan_columns
    def fit(self, X, y=None, imp_ixs=None):
        """Fit specified imputation methods to each column within a DataFrame.

        The fit method calculates the `statistics` necessary to later
        transform a dataset (i.e. perform actual imputations). Inductive
        methods calculate a statistic on the fit data, then impute new missing
        data with that value. Most currently supported methods are inductive.

        It's important to note that we have to fit X regardless of whether any
        data is missing. The transform step may have missing data if new data
        is used, so fit each column that appears in the given strategies.

        Args:
            X (pd.DataFrame): pandas DataFrame on which imputer is fit.
            y (pd.Series, pd.DataFrame, optional): response. Default is None.
                Determined internally in fit method. Arg is present to remain
                compatible with sklearn Pipelines.
            imp_ixs (dict, optional): dictionary of lists of indices that
                indicate which data elements to impute per column, or None to
                identify them from missing elements per column.

        Returns:
            self: instance of the SingleImputer class.

        Raises:
            ValueError: error in specification of strategies. Raised through
                `check_strategy_fit`. See its docstrings for more info.
            ValueError: error in specification of predictors. Raised through
                `check_predictors_fit`. See its docstrings for more info.
        """
        # first, prep columns we plan to use and make sure they are valid
        self._fit_strategy_validator(X)
        self.statistics_ = {}

        # perform fit on each column, depending on that column's strategy
        # note that right now, operations are COLUMN-by-COLUMN, iteratively
        if self.seed is not None:
            np.random.seed(self.seed)
        self._used_columns = {}
        for column, method in self._strats.items():
            imp = self.strategies[method]
            imp_params = self._fit_init_params(column, method, self.imp_kwgs)

            # try to create an instance of the imputer, given the args
            try:
                if imp_params is None:
                    imputer = imp()
                else:
                    imputer = imp(**imp_params)
            except TypeError as te:
                name = imp.__name__
                err = f"Invalid arguments passed to {name} __init__ method."
                raise ValueError(err) from te

            # identify the column for imputation
            ys = X[column]

            # the fit depends on what type of strategy we use.
            # first, fit univariate methods, which are straightforward.
            if method in self.univariate_strategies:
                imputer.fit(ys, None)

            # now, fit on predictive methods, which are more complex.
            if method in self.predictive_strategies:
                preds = self._preds[column]
                if preds == "all":
                    xs = X.drop(column, axis=1)
                else:
                    xs = X[preds]

                if imp_ixs is not None:
                    ys[imp_ixs[column]] = np.nan

                # fit the data on observed values only.
                x_, y_ = _get_observed(xs, ys)

                # before imputing, need to encode categoricals
                x_ = _one_hot_encode(x_)
                self._used_columns[column] = x_.columns
                imputer.fit(x_, y_)

            # finally, store imputer for each column as statistics
            self.statistics_[column] = imputer
        return self
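    # Illustrative note (comments only, not part of the original source):
    # after `fit`, `statistics_` maps each column name to its fitted
    # SeriesImputer, and predictive strategies also record the one-hot-encoded
    # predictor columns used at fit time in `_used_columns`, so `transform`
    # can align new data to the same encoding. For example (hypothetical
    # column name):
    #
    #   si = SingleImputer().fit(df)
    #   si.statistics_["salary"]   # fitted SeriesImputer for the salary column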
    @check_nan_columns
    def transform(self, X, imp_ixs=None, **trans_kwargs):
        """Impute each column within a DataFrame using fit imputation methods.

        The transform step performs the actual imputations. Given a dataset
        previously fit, `transform` imputes each column with its respective
        imputed values from fit (in the case of inductive) or performs a new
        fit and transform in one sweep (in the case of transductive).

        Args:
            X (pd.DataFrame): DataFrame to impute (same as fit or new data).
            imp_ixs (dict, optional): dictionary of lists of indices that
                indicate which data elements to impute per column, or None to
                identify them from missing elements per column.
            **trans_kwargs: dict, optional args for bayesian.

        Returns:
            X (pd.DataFrame): imputed in place or copy of original.

        Raises:
            ValueError: same columns must appear in fit and transform. Raised
                through `_transform_strategy_validator`.
        """
        # copy the dataset if necessary, then prep predictors
        if self.copy:
            X = X.copy()
        self._transform_strategy_validator(X)

        # transformation logic
        self.imputed_ = {}
        if self.seed is not None:
            np.random.seed(self.seed)
        for column, imputer in self.statistics_.items():
            if imp_ixs is None:
                imp_ix = X[column][X[column].isnull()].index
            else:
                imp_ix = pd.Index(imp_ixs[column])
            self.imputed_[column] = imp_ix.tolist()

            # continue if there are no imputations to make
            if imp_ix.empty:
                continue

            # implement transform logic for univariate
            if imputer.strategy in self.univariate_strategies:
                x_ = X[column]

            # implement transform logic for predictive
            if imputer.strategy in self.predictive_strategies:
                preds = self._preds[column]
                if preds == "all":
                    x_ = X.drop(column, axis=1)
                else:
                    x_ = X[preds]

                # isolate missingness
                if isinstance(x_, pd.Series):
                    x_ = x_.to_frame()
                    x_ = x_.loc[imp_ix]
                else:
                    x_ = x_.loc[imp_ix, :]

                # default univariate impute for missing covariates
                mis_cov = pd.isnull(x_).sum()
                mis_cov = mis_cov[mis_cov > 0]
                if any(mis_cov):
                    x_m = mis_cov.index
                    for col in x_m:
                        d = DefaultUnivarImputer()
                        if mis_cov[col] == x_.shape[0]:
                            d_imps = 0
                        else:
                            d_imps = d.fit_impute(x_[col], None)
                        x_null = x_[col][x_[col].isnull()].index
                        x_.loc[x_null, col] = d_imps

                # handling encoding again for prediction of imputations
                x_ = _one_hot_encode(x_, self._used_columns[column])

            # perform imputation given the specified imputer and value for x_
            # this fix below checks for strategies that need k if Mice used
            # right now, that's just bayesian strategies
            # k defaults to None, which works for non Mice related imputation
            if imputer.strategy in (
                    "bayesian binary logistic", "bayesian least squares"
            ):
                k = trans_kwargs.get("k")
                X.loc[imp_ix, column] = imputer.impute(x_, k=k)
            else:
                X.loc[imp_ix, column] = imputer.impute(x_)
        return X
    def fit_transform(self, X, y=None, **trans_kwargs):
        """Convenience method to fit then transform the same dataset.

        Args:
            X (pd.DataFrame): DataFrame used for fit and transform steps.
            y (pd.DataFrame, pd.Series, optional): response. Default is None.
                Set internally by `fit` method.
            **trans_kwargs: dict, optional args for bayesian.

        Returns:
            X (pd.DataFrame): imputed in place or copy of original.
        """
        return self.fit(X, y).transform(X, **trans_kwargs)
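# Minimal usage sketch (illustrative only, not shipped with this module). It
# fits and transforms a small, made-up DataFrame; the column names, values,
# and the "mean" strategy choice are assumptions for demonstration (see
# BaseImputer for the strategies actually available).
if __name__ == "__main__":
    demo = pd.DataFrame({
        "age": [25.0, np.nan, 31.0, 40.0, np.nan, 52.0],
        "score": [88.0, 92.5, np.nan, 75.0, 81.0, np.nan],
    })
    si = SingleImputer(strategy="mean", seed=42)
    imputed = si.fit_transform(demo)
    print(imputed)
    # after transform, `imputed_` records which indices were filled per column
    print(si.imputed_)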