# autoimpute/analysis/logistic_regressor.py

"""Module containing logistic regression for multiply imputed datasets."""

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression
from sklearn.utils.validation import check_is_fitted
from statsmodels.discrete.discrete_model import Logit
from autoimpute.utils import check_nan_columns
from .base_regressor import MiBaseRegressor

# pylint:disable=attribute-defined-outside-init
# pylint:disable=too-many-locals

class MiLogisticRegression(MiBaseRegressor, BaseEstimator):
    """Logistic Regression wrapper for multiply imputed datasets.

    The MiLogisticRegression class wraps the sklearn and statsmodels
    libraries to extend logistic regression to multiply imputed datasets.
    The class wraps statsmodels as well as sklearn because sklearn alone
    does not provide sufficient functionality to pool estimates under
    Rubin's rules. sklearn is for machine learning; therefore, it lacks
    important inference capabilities, such as easily calculating std.
    error estimates for parameters. If users want inference from
    regression analysis of multiply imputed data, they should utilize the
    statsmodels implementation in this class instead.

    Attributes:
        logistic_models (dict): logistic models used by supported python libs.
    """

    logistic_models = {
        "type": "logistic",
        "statsmodels": Logit,
        "sklearn": LogisticRegression
    }
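
    # A minimal sketch of the pooling this class relies on under Rubin's
    # rules (illustrative only; the actual pooling is implemented in
    # MiBaseRegressor). Given coefficient estimates q and squared standard
    # errors u from m fitted models, stacked as arrays of shape (m, p):
    #
    #   q_bar = np.mean(q, axis=0)            # pooled point estimates
    #   u_bar = np.mean(u, axis=0)            # within-imputation variance
    #   b = np.var(q, axis=0, ddof=1)         # between-imputation variance
    #   t = u_bar + (1 + 1 / m) * b           # total variance
    #
    # Pooled standard errors are then np.sqrt(t), which is why statsmodels
    # (which exposes per-model std. errors) is required for inference.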

    def __init__(self, mi=None, model_lib="statsmodels",
                 mi_kwgs=None, model_kwgs=None):
        """Create an instance of the Autoimpute MiLogisticRegression class.

        Args:
            mi (MiceImputer, Optional): An instance of a MiceImputer.
                Default is None. Can create one through `mi_kwgs` instead.
            model_lib (str, Optional): library the regressor will use to
                implement regression. Options are sklearn and statsmodels.
                Default is statsmodels.
            mi_kwgs (dict, Optional): keyword args to instantiate a
                MiceImputer. Default is None. If a valid MiceImputer is
                passed through the `mi` argument, `mi_kwgs` is ignored.
            model_kwgs (dict, Optional): keyword args to instantiate the
                regressor. Default is None.

        Returns:
            self. Instance of the class.
        """
        MiBaseRegressor.__init__(
            self,
            mi=mi,
            model_lib=model_lib,
            mi_kwgs=mi_kwgs,
            model_kwgs=model_kwgs
        )
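
    # Construction sketch (hedged: `n` and `return_list` are assumed
    # MiceImputer keyword args here; check MiceImputer's own signature):
    #
    #   from autoimpute.imputations import MiceImputer
    #   mi = MiceImputer(n=5, return_list=True)
    #   lm = MiLogisticRegression(mi=mi)             # pass a built imputer, or
    #   lm = MiLogisticRegression(mi_kwgs={"n": 5})  # let the class build one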

    @check_nan_columns
    def fit(self, X, y):
        """Fit the specified model to a multiply imputed dataset.

        Fit a logistic regression on multiply imputed datasets. The method
        creates multiply imputed data using the MiceImputer instantiated
        when creating an instance of the class. It then runs a logistic
        model on each of the m imputed datasets. The logistic model comes
        from sklearn or statsmodels. Finally, the fit method calculates
        pooled parameters from the m logistic models. Note that variance
        for pooled parameters using Rubin's rules is available for
        statsmodels only. sklearn does not implement parameter inference
        out of the box.

        Args:
            X (pd.DataFrame): predictors to use. Can contain missingness.
            y (pd.Series, pd.DataFrame): response. Can contain missingness.

        Returns:
            self. Instance of the class.
        """
        # retain columns in case encoding occurs
        self.fit_X_columns = X.columns.tolist()

        # generate the imputed datasets from multiple imputation,
        # then fit the analysis model on each of the imputed datasets
        self.models_ = self._apply_models_to_mi_data(
            self.logistic_models, X, y
        )

        # generate the fit statistics from each of the m models
        self.statistics_ = self._get_stats_from_models(self.models_)

        # still return an instance of the class
        return self
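
    # Fit sketch on data with missingness (hypothetical synthetic frame;
    # the column name and sizes are assumptions for illustration):
    #
    #   rng = np.random.default_rng(0)
    #   X = pd.DataFrame({"x1": rng.normal(size=100)})
    #   X.loc[::10, "x1"] = np.nan                 # inject missing values
    #   y = pd.Series((rng.uniform(size=100) > 0.5).astype(int), name="y")
    #   lm = MiLogisticRegression().fit(X, y)      # imputes m times, then pools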

    def _sigmoid(self, z):
        """Private method that applies sigmoid function to input."""
        return 1 / (1 + np.exp(-z))

    @check_nan_columns
    def predict_proba(self, X):
        """Predict probabilities of class membership for logistic regression.

        The regression uses the pooled parameters from each of the imputed
        datasets to generate a set of single predictions. The pooled params
        come from multiply imputed datasets, but the predictions themselves
        follow the same rules as a logistic regression. Because this is
        logistic regression, the sigmoid function is applied to the linear
        predictor, giving us probabilities between 0 and 1 for each
        prediction. This method returns those probabilities.

        Args:
            X (pd.DataFrame): predictors to predict response.

        Returns:
            np.array: probability of class membership for each observation.
        """
        # run validation first
        X = self._predict_strategy_validator(X)

        # get the alpha and betas, then create the linear predictor
        alpha = self.statistics_["coefs"].values[0]
        betas = self.statistics_["coefs"].values[1:]
        return self._sigmoid(alpha + np.dot(X, betas))
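
    # Equivalent manual computation (sketch; `X_new` is a hypothetical,
    # fully observed DataFrame, and this ignores any validation/encoding
    # the method itself performs):
    #
    #   coefs = lm.statistics_["coefs"].values
    #   probs = 1 / (1 + np.exp(-(coefs[0] + X_new.dot(coefs[1:]))))
    #   np.allclose(probs, lm.predict_proba(X_new))  # expected to hold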

    @check_nan_columns
    def predict(self, X, threshold=0.5):
        """Make predictions using statistics generated from fit.

        The predict method calls on the predict_proba method, which returns
        the probability of class membership for each prediction. These
        probabilities range from 0 to 1. Anything at or above the threshold
        is assigned to class 1, while anything below it is assigned to
        class 0. The default threshold is 0.5, a sensible choice when the
        classes are roughly balanced.

        Args:
            X (pd.DataFrame): data to make predictions using pooled params.
            threshold (float, Optional): boundary for class membership.
                Default is 0.5. Values can range from 0 to 1.

        Returns:
            np.array: predictions.
        """
        pred_probs = self.predict_proba(X)
        pred_array = (pred_probs >= threshold).astype(int)
        responses = self._response_categories.values
        return responses[pred_array]
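
    # Threshold sketch (`X_new` again hypothetical): lowering the cutoff
    # flags more observations as the positive class, trading precision
    # for recall:
    #
    #   lm.predict(X_new)                  # default 0.5 cutoff
    #   lm.predict(X_new, threshold=0.3)   # more positive predictions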
[docs] def summary(self): """Provide a summary for model parameters, variance, and metrics. The summary method brings together the statistics generated from fit as well as the variance ratios, if available. The statistics are far more valuable when using statsmodels than sklearn. Returns: pd.DataFrame: summary statistics """ # only possible once we've fit a model with statsmodels check_is_fitted(self, "statistics_") sdf = pd.DataFrame(self.statistics_) sdf.rename(columns={"lambda_": "lambda"}, inplace=True) return sdf