Source code for autoimpute.imputations.mis_classifier

"""Module to predict missingness in data and generate imputation test cases.

This module contains the MissingnessClassifier, which is used to predict
missingness within a dataset using information derived from other features.
The MissingnessClassifier also generates test cases for imputation. Often,
we do not and will never have the true value of a missing data point,
so it's challenging to validate an imputation model's performance.
The MissingnessClassifier generates missing "test" samples from observed
values that have a high likelihood of being missing, which a user can then
"impute".
This practice is useful to validate models that contain truly missing data.
"""

import warnings
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.base import clone, BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_is_fitted
from autoimpute.utils import check_nan_columns, check_predictors_fit

# pylint:disable=attribute-defined-outside-init
# pylint:disable=arguments-differ
# pylint:disable=too-many-arguments
# pylint:disable=too-many-instance-attributes

class MissingnessClassifier(BaseEstimator, ClassifierMixin):
    """Classify values as missing or not, based on missingness patterns.

    The class has numerous use cases. First, it fits columns of a DataFrame
    and predicts whether or not an observation is missing, based on all
    available information in other columns. The class supports both class
    prediction and class probabilities.

    Second, the class can generate test cases for imputation analysis. Test
    cases are values that are truly observed but have a high probability of
    being missing. These cases make the imputation process supervised as
    opposed to unsupervised. A user never knows the true value of missing
    data but can verify imputation methods on test cases for which the true
    value is known.

    Attributes:
        statistics_ (dict): fitted classifier for each modelled column.
            Set by `fit`.
        data_mi (pd.DataFrame): 0/1 missingness indicators (1 = missing) of
            the data passed to `fit`.
        data_mi_preds (pd.DataFrame): result of the latest `predict` call.
        data_mi_proba (pd.DataFrame): result of the latest `predict_proba`
            call.
        test_indices (dict): index labels per column produced by
            `gen_test_indices`.
    """

    def __init__(self, classifier=None, predictors="all"):
        """Create an instance of the MissingnessClassifier.

        The MissingnessClassifier inherits from sklearn BaseEstimator and
        ClassifierMixin. This inheritance and this class' implementation
        are meant to make the MissingnessClassifier a valid classifier that
        works in an sklearn pipeline.

        Args:
            classifier (classifier, optional): valid classifier from sklearn.
                If None, default is xgboost. Note that classifier must
                conform to sklearn style. This means it must implement the
                `predict_proba` method and act as a proper classifier.
            predictors (str, iter, dict, optional): defaults to all, i.e.
                use all predictors. If all, every column will be used for
                every class prediction. If a list, subset of columns used for
                all predictions. If a dict, specify which columns to use as
                predictors for each imputation. Columns not specified in dict
                will receive `all` by default.
        """
        self.classifier = classifier
        self.predictors = predictors

    @property
    def classifier(self):
        """Property getter to return the value of the classifier property"""
        return self._classifier

    @classifier.setter
    def classifier(self, c):
        """Validate the classifier property and set default parameters.

        Args:
            c (classifier): if None, implement the xgboost classifier

        Raises:
            ValueError: classifier does not implement `predict_proba`
        """
        if c is None:
            self._classifier = XGBClassifier()
            return
        m = "predict_proba"
        if not hasattr(c, m):
            raise ValueError(f"Classifier must implement {m} method.")
        self._classifier = c

    def _fit_strategy_validator(self, X):
        """Internal helper method to validate behavior appropriate for fit."""

        # resolve the predictor specification against the column names
        cols = X.columns.tolist()
        self._preds = check_predictors_fit(self.predictors, cols)

        # 0/1 indicator frame: 1 where the value is missing, 0 where observed
        self.data_mi = X.isnull().astype(int)

    def _predictor_strategy_validator(self, X):
        """Private method to prep for prediction.

        Raises:
            ValueError: columns of `X` differ from the columns seen in fit.
        """

        # initial checks before prediction
        check_is_fitted(self, "statistics_")

        # check dataset features are the same for both fit and predict
        X_cols = X.columns.tolist()
        mi_cols = self.data_mi.columns.tolist()
        if set(X_cols).symmetric_difference(mi_cols):
            raise ValueError("Same columns must appear in fit and predict.")

    def _predictor_frame(self, X, column):
        """Return the covariates used to model missingness of `column`."""
        preds = self._preds[column]
        # isinstance guard keeps the comparison scalar for list-like preds
        if isinstance(preds, str) and preds == "all":
            return X.drop(column, axis=1)
        return X[preds]

    def _predict_each(self, X, predict_one):
        """Run `predict_one` for every fit-time column and build a frame.

        Args:
            X (pd.DataFrame): DataFrame to predict on.
            predict_one (callable): receives a fitted classifier and a 2D
                array of predictor values; returns one value per row.

        Returns:
            pd.DataFrame: one `<column>_pred` column per fit-time column,
                carrying the index of `X`.
        """
        self._predictor_strategy_validator(X)

        # the validator only compares column *sets*; restore fit-time order
        # so features line up with what each classifier was trained on
        fit_cols = self.data_mi.columns.tolist()
        X = X[fit_cols]
        n_rows = len(X.index)

        preds_mat = []
        for column in fit_cols:
            cls_fit = self.statistics_.get(column)
            if cls_fit is None:
                # column skipped during fit (time-based): never "missing"
                preds_mat.append(np.zeros(n_rows))
            else:
                x = self._predictor_frame(X, column)
                preds_mat.append(predict_one(cls_fit, x.values))

        # keep the index of X so results can be aligned with X by label
        pred_cols = [f"{cl}_pred" for cl in fit_cols]
        return pd.DataFrame(
            np.array(preds_mat).T, columns=pred_cols, index=X.index
        )

    @check_nan_columns
    def fit(self, X, **kwargs):
        """Fit an individual classifier for each column in the DataFrame.

        For each feature in the DataFrame, a clone of the classifier
        (default: xgboost) is fit with the feature's missingness indicator
        as the response (y) and the configured predictors as covariates (X).
        The resulting classifiers are stored in `statistics_`, keyed by
        column name. Time-based (datetime) columns are not modelled.

        Args:
            X (pd.DataFrame): DataFrame on which to fit classifiers
            **kwargs: keyword arguments used by classifiers

        Returns:
            self: instance of MissingnessClassifier
        """

        # start with fit checks
        self._fit_strategy_validator(X)
        self.statistics_ = {}

        # iterate missingness fit using classifier and selected predictors
        for column in self.data_mi:
            # only fit non time-based columns. The dtype must be read from
            # X: the indicator frame is always integer typed.
            if pd.api.types.is_datetime64_any_dtype(X[column].dtype):
                continue
            y = self.data_mi[column]
            x = self._predictor_frame(X, column)
            clf = clone(self.classifier)
            self.statistics_[column] = clf.fit(x.values, y.values, **kwargs)
        return self

    @check_nan_columns
    def predict(self, X, **kwargs):
        """Predict class of each feature. 1 for missing; 0 for not missing.

        First checks to ensure data has been fit. If fit, `predict` method
        uses the respective classifier of each feature (stored in statistics)
        and predicts class membership for each observation of each feature.
        1 = missing; 0 = not missing. Prediction is binary, as class membership
        is hard. If probability desired, use `predict_proba` method.

        Args:
            X (pd.DataFrame): DataFrame used to create predictions.
            **kwargs: keyword arguments. Used by the classifier.

        Returns:
            pd.DataFrame: DataFrame with class prediction for each
                observation. Columns are named `<column>_pred` and the
                index is the index of `X`.

        Raises:
            ValueError: columns of `X` differ from the columns seen in fit.
        """
        self.data_mi_preds = self._predict_each(
            X, lambda clf, values: clf.predict(values, **kwargs)
        )
        return self.data_mi_preds

    @check_nan_columns
    def predict_proba(self, X, **kwargs):
        """Predict probability of missing class membership of each feature.

        First checks to ensure data has been fit. If fit, `predict_proba`
        method uses the respective classifier of each feature (in statistics)
        and predicts probability of missing class membership for each
        observation of each feature. Prediction is probability of missing.
        Therefore, probability of not missing is 1-P(missing). For hard class
        membership prediction, use `predict`.

        Args:
            X (pd.DataFrame): DataFrame used to create probabilities.
            **kwargs: keyword arguments. Used by the classifier.

        Returns:
            pd.DataFrame: DataFrame with probability of missing class for
                each observation. Columns are named `<column>_pred` and the
                index is the index of `X`.

        Raises:
            ValueError: columns of `X` differ from the columns seen in fit.
        """
        # second column of predict_proba holds P(class 1) = P(missing)
        self.data_mi_proba = self._predict_each(
            X, lambda clf, values: clf.predict_proba(values, **kwargs)[:, 1]
        )
        return self.data_mi_proba

    def fit_predict(self, X):
        """Convenience method for fit and class prediction.

        Args:
            X (pd.DataFrame): DataFrame to fit classifier and predict class.

        Returns:
            pd.DataFrame: DataFrame of class predictions.
        """
        return self.fit(X).predict(X)

    def fit_predict_proba(self, X):
        """Convenience method for fit and class probability prediction.

        Args:
            X (pd.DataFrame): DataFrame to fit classifier and predict prob.

        Returns:
            pd.DataFrame: DataFrame of class probability predictions.
        """
        return self.fit(X).predict_proba(X)

    @check_nan_columns
    def gen_test_indices(self, X, thresh=0.5, use_exist=False):
        """Generate indices of false positives for each fitted column.

        Method generates the locations (indices) of false positives returned
        from classifiers. These are instances that have a high probability of
        being missing even though true value is observed. Use this method to
        get indices without mutating the actual DataFrame. To set the values
        to missing for the actual DataFrame, use `gen_test_df`.

        Args:
            X (pd.DataFrame): DataFrame from which test indices generated.
                Probabilities are always computed on this DataFrame.
            thresh (float, optional): Threshold for generating false positive.
                If raw value is observed and P(missing) > thresh, then the
                observation is considered a false positive and index is stored.
            use_exist (bool, optional): Whether or not to use existing fit and
                classifiers. If False (default), classifiers are refit on
                `X` first. If True, the instance must already be fitted.

        Returns:
            self: test indices available from `self.test_indices`
        """
        self.test_indices = {}

        # refit unless the caller asked to reuse the existing classifiers;
        # either way the probabilities must describe X itself
        if not use_exist:
            self.fit(X)
        proba = self.predict_proba(X)

        # observed in X but predicted missing => false positive
        for c in self.data_mi:
            observed = X[c].notnull().values
            likely_missing = proba[f"{c}_pred"].values > thresh
            # rows of proba are positionally aligned with rows of X
            self.test_indices[c] = X.index[observed & likely_missing]
        return self

    def gen_test_df(self, X, thresh=0.5, m=0.05,
                    inplace=False, use_exist=False):
        """Generate new DataFrame with value of false positives set to missing.

        Method generates new DataFrame with the locations (indices) of false
        positives set to missing. Utilizes `gen_test_indices` to detect index
        of false positives.

        Args:
            X (pd.DataFrame): DataFrame from which test indices generated.
            thresh (float, optional): Threshold for generating false positive.
                If raw value is observed and P(missing) > thresh, then the
                observation is considered a false positive and index is stored.
            m (float, optional): % false positive threshold for warning.
                If % <= m, issue warning with % of test cases.
            inplace (bool, optional): If False (default), work on a copy of
                `X`. If True, `X` itself is modified. The DataFrame is
                returned in both cases.
            use_exist (bool, optional): Whether or not to use existing fit and
                classifiers. Default is False.

        Returns:
            pd.DataFrame: DataFrame with false positives set to missing.
        """
        if not inplace:
            X = X.copy()

        self.gen_test_indices(X, thresh, use_exist)
        min_num = np.floor(m*len(X.index))
        for c in X:
            ix_ = self.test_indices[c]
            if len(ix_) <= min_num:
                w = f"Fewer than {m*100}% set to missing for {c}"
                warnings.warn(w)
            # NaN marks missing numeric data; None for everything else
            if pd.api.types.is_numeric_dtype(X[c]):
                X.loc[ix_, c] = np.nan
            else:
                X.loc[ix_, c] = None
        return X