Source code for autoimpute.imputations.series.logistic_regression

"""This module implements logistic regression imputation.

This module contains the BinaryLogisticImputer and the MultiLogisticImputer.
Both use logistic regression to generate class predictions that become values
for imputations of missing data. Binary is optimized to deal with two classes,
while Multi is optimized to deal with multiple classes. Dataframe imputers
utilize these classes when each's strategy is requested. Use SingleImputer or
MultipleImputer with strategy = `binary logistic` or `multinomial logistic`
to broadcast either strategy across all the columns in a dataframe, or specify
either strategy for a given column.
"""

import warnings
from pandas import Series
from sklearn.utils.validation import check_is_fitted
from sklearn.linear_model import LogisticRegression
from autoimpute.imputations import method_names
from .base import ISeriesImputer
methods = method_names
# pylint:disable=attribute-defined-outside-init
# pylint:

[docs]class BinaryLogisticImputer(ISeriesImputer): """Impute missing values w/ predictions from binary logistic regression. The BinaryLogisticImputer produces predictions using logsitic regression with two classes. The class predictions given a set of predictors become the imputations. To implement logistic regression, the imputer wraps the sklearn LogisticRegression class with a default solver (liblinear). The imputer can be used directly, but such behavior is discouraged. BinaryLogisticImputer does not have the flexibility / robustness of dataframe imputers, nor is its behavior identical. Preferred use is MultipleImputer(strategy="binary logistic"). """ # class variables strategy = methods.BINARY_LOGISTIC
[docs] def __init__(self, **kwargs): """Create an instance of the BinaryLogisticImputer class. Args: **kwargs: keyword arguments passed to LogisticRegresion. """ self.solver = kwargs.pop("solver", "liblinear") self.glm = LogisticRegression(solver=self.solver, **kwargs)
[docs] def fit(self, X, y): """Fit the Imputer to the dataset by fitting logistic model. Args: X (pd.Dataframe): dataset to fit the imputer. y (pd.Series): response, which is eventually imputed. Returns: self. Instance of the class. """ y = y.astype("category").cat y_cat_l = len(y.codes.unique()) if y_cat_l > 2: err = "Binary requires 2 categories. Use multinomial instead." raise ValueError(err) self.glm.fit(X, y.codes) self.statistics_ = {"param": y.categories, "strategy": self.strategy} return self
[docs] def impute(self, X): """Generate imputations using predictions from the fit logistic model. The impute method returns the values for imputation. Missing values in a given dataset are replaced with the predictions from the logistic regression class specification. Args: X (pd.DataFrame): predictors to determine imputed values. Returns: np.array: imputed dataset. """ # check if fitted then predict with logistic check_is_fitted(self, "statistics_") labels = self.statistics_["param"] preds = self.glm.predict(X) # map category codes back to actual labels # then impute the actual labels to keep categories in tact label_dict = {i:j for i, j in enumerate(labels.values)} imp = Series(preds).replace(label_dict, inplace=False) return imp.values
[docs] def fit_impute(self, X, y): """Fit impute method to generate imputations where y is missing. Args: X (pd.Dataframe): predictors in the dataset. y (pd.Series): response w/ missing values to impute. Returns: np.array: imputed dataset. """ # transform occurs with records from X where y is missing miss_y_ix = y[y.isnull()].index return self.fit(X, y).impute(X.loc[miss_y_ix])
[docs]class MultinomialLogisticImputer(ISeriesImputer): """Impute missing values w/ preds from multinomial logistic regression. The MultinomialLogisticImputer produces predictions w/ logsitic regression with more than two classes. Class predictions given a set of predictors become the imputations. To implement logistic regression, the imputer wraps the sklearn LogisticRegression class with a default solver (saga) and default `multi_class` set to multinomial. The imputer can be used directly, but such behavior is discouraged. MultinomialLogisticImputer does not have the flexibility / robustness of dataframe imputers, nor is its behavior identical. Preferred use is MultipleImputer(strategy="multinomial logistic"). """ # class variables strategy = methods.MULTI_LOGISTIC
[docs] def __init__(self, **kwargs): """Create an instance of the MultiLogisticImputer class. Args: **kwargs: keyword arguments passed to LogisticRegression. """ self.solver = kwargs.pop("solver", "saga") self.multiclass = kwargs.pop("multi_class", "multinomial") self.glm = LogisticRegression( solver=self.solver, multi_class=self.multiclass, **kwargs )
[docs] def fit(self, X, y): """Fit the Imputer to the dataset by fitting logistic model. Args: X (pd.Dataframe): dataset to fit the imputer. y (pd.Series): response, which is eventually imputed. Returns: self. Instance of the class. """ y = y.astype("category").cat y_cat_l = len(y.codes.unique()) if y_cat_l == 2: w = "Multiple categories (c) expected. Use binary instead if c=2." warnings.warn(w) self.glm.fit(X, y.codes) self.statistics_ = {"param": y.categories, "strategy": self.strategy} return self
[docs] def impute(self, X): """Generate imputations using predictions from the fit logistic model. The impute method returns the values for imputation. Missing values in a given dataset are replaced with the predictions from the logistic regression class specification. Args: X (pd.DataFrame): predictors to determine imputed values. Returns: np.array: imputed dataset. """ # check if fitted then predict with logistic check_is_fitted(self, "statistics_") labels = self.statistics_["param"] preds = self.glm.predict(X) # map category codes back to actual labels # then impute the actual labels to keep categories in tact label_dict = {i:j for i, j in enumerate(labels.values)} imp = Series(preds).replace(label_dict, inplace=False) return imp.values
[docs] def fit_impute(self, X, y): """Fit impute method to generate imputations where y is missing. Args: X (pd.Dataframe): predictors in the dataset. y (pd.Series): response w/ missing values to impute. Returns: np.array: imputed dataset. """ # transform occurs with records from X where y is missing miss_y_ix = y[y.isnull()].index return self.fit(X, y).impute(X.loc[miss_y_ix])