Source code for autoimpute.imputations.mis_classifier

"""Module to predict missingness in data and generate imputation test cases.

This module contains the MissingnessClassifier, which is used to predict
missingness within a dataset using information derived from other features.
The MissingnessClassifier also generates test cases for imputation. Often,
we do not and will never have the true value of a missing data point,
so it's challenging to validate an imputation model's performance.
The MissingnessClassifier generates missing "test" samples from observed values
that have a high likelihood of being missing, which a user can then "impute".
This practice is useful to validate models that contain truly missing data.
"""

import warnings
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.base import clone, BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_is_fitted
from autoimpute.utils import check_nan_columns, check_predictors_fit

# pylint:disable=attribute-defined-outside-init
# pylint:disable=arguments-differ
# pylint:disable=too-many-arguments
# pylint:disable=too-many-instance-attributes

class MissingnessClassifier(BaseEstimator, ClassifierMixin):
    """Classify values as missing or not, based on missingness patterns.

    The class has numerous use cases. First, it fits columns of a DataFrame
    and predicts whether or not an observation is missing, based on all
    available information in other columns. The class supports both class
    prediction and class probabilities.

    Second, the class can generate test cases for imputation analysis. Test
    cases are values that are truly observed but have a high probability of
    being missing. These cases make the imputation process supervised as
    opposed to unsupervised. A user never knows the true value of missing
    data but can verify imputation methods on test cases for which the true
    value is known.
    """
[docs] def __init__(self, classifier=None, predictors="all"): """Create an instance of the MissingnessClassifier. The MissingnessClassifier inherits from sklearn BaseEstimator and ClassifierMixin. This inheritence and this class' implementation ensure that the MissingnessClassifier is a valid classifier that will work in an sklearn pipeline. Args: classifier (classifier, optional): valid classifier from sklearn. If None, default is xgboost. Note that classifier must conform to sklearn style. This means it must implement the `predict_proba` method and act as a porper classifier. predictors (str, iter, dict, optiona): defaults to all, i.e. use all predictors. If all, every column will be used for every class prediction. If a list, subset of columns used for all predictions. If a dict, specify which columns to use as predictors for each imputation. Columns not specified in dict will receive `all` by default. """ self.classifier = classifier self.predictors = predictors
@property def classifier(self): """Property getter to return the value of the classifier property""" return self._classifier @classifier.setter def classifier(self, c): """Validate the classifier property and set default parameters. Args: c (classifier): if None, implement the xgboost classifier Raises: ValueError: classifier does not implement `predict_proba` """ if c is None: self._classifier = XGBClassifier() else: m = "predict_proba" if not hasattr(c, m): raise ValueError(f"Classifier must implement {m} method.") self._classifier = c def _fit_strategy_validator(self, X): """Internal helper method to validate behavior appropriate for fit.""" # remove nan columns and store colnames cols = X.columns.tolist() self._preds = check_predictors_fit(self.predictors, cols) # next, prep the categorical / numerical split # only necessary for classes that use other features # wont see this requirement in the single imputer self.data_mi = X.isnull().astype(int) def _predictor_strategy_validator(self, X): """Private method to prep for prediction.""" # initial checks before transformation check_is_fitted(self, "statistics_") # check dataset features are the same for both fit and transform X_cols = X.columns.tolist() mi_cols = self.data_mi.columns.tolist() diff_X = set(X_cols).difference(mi_cols) diff_mi = set(mi_cols).difference(X_cols) if diff_X or diff_mi: raise ValueError("Same columns must appear in fit and predict.")
[docs] @check_nan_columns def fit(self, X, **kwargs): """Fit an individual classifier for each column in the DataFrame. For each feature in the DataFrame, a classifier (default: xgboost) is fit with the feature as the response (y) and all other features as covariates (X). The resulting classifiers are stored in the class instance statistics. One `fit` for each column in the dataset. Column specification will be supported as well. Args: X (pd.DataFrame): DataFrame on which to fit classifiers **kwargs: keyword arguments used by classifiers Returns: self: instance of MissingnessClassifier """ # start with fit checks self._fit_strategy_validator(X) self.statistics_ = {} # iterate missingness fit using classifier and all remaining columns for column in self.data_mi: # only fit non time-based columns... if not np.issubdtype(self.data_mi[column].dtype, np.datetime64): y = self.data_mi[column] preds = self._preds[column] if preds == "all": x = X.drop(column, axis=1) else: x = X[preds] clf = clone(self.classifier) cls_fit = clf.fit(x.values, y.values, **kwargs) self.statistics_[column] = cls_fit return self
[docs] @check_nan_columns def predict(self, X, **kwargs): """Predict class of each feature. 1 for missing; 0 for not missing. First checks to ensure data has been fit. If fit, `predict` method uses the respective classifier of each feature (stored in statistics) and predicts class membership for each observation of each feature. 1 = missing; 0 = not missing. Prediction is binary, as class membership is hard. If probability deesired, use `predict_proba` method. Args: X (pd.DataFrame): DataFrame used to create predictions. kwargs: kewword arguments. Used by the classifer. Returns: pd.DataFrame: DataFrame with class prediction for each observation. """ # predictions for each column using respective fit classifier self._predictor_strategy_validator(X) preds_mat = [] for column in self.data_mi: if not np.issubdtype(self.data_mi[column].dtype, np.datetime64): preds = self._preds[column] if preds == "all": x = X.drop(column, axis=1) else: x = X[preds] cls_fit = self.statistics_[column] y_pred = cls_fit.predict(x.values, **kwargs) preds_mat.append(y_pred) else: y_pred = np.zeros(len(self.data_mi.index)) preds_mat.append(y_pred) # store the predictor matrix class membership as a dataframe preds_mat = np.array(preds_mat).T pred_cols = [f"{cl}_pred" for cl in X.columns] self.data_mi_preds = pd.DataFrame(preds_mat, columns=pred_cols) return self.data_mi_preds
[docs] @check_nan_columns def predict_proba(self, X, **kwargs): """Predict probability of missing class membership of each feature. First checks to ensure data has been fit. If fit, `predict_proba` method uses the respsective classifier of each feature (in statistics) and predicts probability of missing class membership for each observation of each feature. Prediction is probability of missing. Therefore, probability of not missing is 1-P(missing). For hard class membership prediction, use `predict`. Args: X (pd.DataFrame): DataFrame used to create probabilities. Returns: pd.DataFrame: DataFrame with probability of missing class for each observation. """ self._predictor_strategy_validator(X) preds_mat = [] for column in self.data_mi: if not np.issubdtype(self.data_mi[column].dtype, np.datetime64): preds = self._preds[column] if preds == "all": x = X.drop(column, axis=1) else: x = X[preds] cls_fit = self.statistics_[column] y_pred = cls_fit.predict_proba(x.values, **kwargs)[:, 1] preds_mat.append(y_pred) else: y_pred = np.zeros(len(self.data_mi.index)) preds_mat.append(y_pred) # store the predictor matrix probabilities as a dataframe preds_mat = np.array(preds_mat).T pred_cols = [f"{cl}_pred" for cl in X.columns] self.data_mi_proba = pd.DataFrame(preds_mat, columns=pred_cols) return self.data_mi_proba
[docs] def fit_predict(self, X): """Convenience method for fit and class prediction. Args: X (pd.DataFrame): DataFrame to fit classifier and predict class. Returns: pd.DataFrame: DataFrame of class predictions. """ return self.fit(X).predict(X)
[docs] def fit_predict_proba(self, X): """Convenience method for fit and class probability prediction. Args: X (pd.DataFrame): DataFrame to fit classifier and prredict prob. Returns: pd.DataFrame: DataFrame of class probability predictions. """ return self.fit(X).predict_proba(X)
[docs] @check_nan_columns def gen_test_indices(self, X, thresh=0.5, use_exist=False): """Generate indices of false positives for each fitted column. Method generates the locations (indices) of false positives returned from classifiers. These are instances that have a high probability of being missing even though true value is observed. Use this method to get indices without mutating the actual DataFrame. To set the values to missing for the actual DataFrame, use `gen_test_df`. Args: X (pd.DataFrame): DataFrame from which test indices generated. Data first goes through `fit_predict_proba`. thresh (float, optional): Threshhold for generating false positive. If raw value is observed and P(missing) >= thresh, then the observation is considered a false positive and index is stored. use_exist (bool, optional): Whether or not to use existing fit and classifiers. Default is False. Returns: self: test_indice available from `self.test_indices` """ # always fit_transform with dataset, as test vals can change self.test_indices = {} if not use_exist: self.fit_predict_proba(X) # loop through missing data indicators, eval new set for missing for c in self.data_mi: mi_c = self.data_mi[c] not_mi = mi_c[mi_c == 0].index pred_not_mi = self.data_mi_proba.loc[not_mi, f"{c}_pred"] pred_wrong = pred_not_mi[pred_not_mi > thresh].index self.test_indices[c] = pred_wrong return self
[docs] def gen_test_df(self, X, thresh=0.5, m=0.05, inplace=False, use_exist=False): """Generate new DatFrame with value of false positives set to missing. Method generates new DataFrame with the locations (indices) of false positives set to missing. Utilizes `gen_test_indices` to detect index of false positives. Args: X (pd.DataFrame): DataFrame from which test indices generated. Data first goes through `fit_predict_proba`. thresh (float, optional): Threshhold for generating false positive. If raw value is observed and P(missing) >= thresh, then the observation is considered a false positive and index is stored. m (float, optional): % false positive threshhold for warning. If % <= m, issue warning with % of test cases. use_exist (bool, optional): Whether or not to use existing fit and classifiers. Default is False. Returns: pd.DataFrame: DataFrame with false positives set to missing. """ if not inplace: X = X.copy() self.gen_test_indices(X, thresh, use_exist) min_num = np.floor(m*len(X.index)) for c in X: ix_ = self.test_indices[c] if len(ix_) <= min_num: w = f"Fewer than {m*100}% set to missing for {c}" warnings.warn(w) if X[c].dtype == np.number: X.loc[ix_, c] = np.nan else: X.loc[ix_, c] = None return X