Source code for autoimpute.analysis.linear_regressor

"""Module containing linear regression for multiply imputed datasets."""

import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.linear_model import LinearRegression
from sklearn.utils.validation import check_is_fitted
from statsmodels.api import OLS
from autoimpute.utils import check_nan_columns
from .base_regressor import MiBaseRegressor

# pylint:disable=attribute-defined-outside-init
# pylint:disable=too-many-locals

[docs]class MiLinearRegression(MiBaseRegressor, BaseEstimator):
    """Linear Regression wrapper for multiply imputed datasets.

    The MiLinearRegression class wraps the sklearn and statsmodels libraries
    to extend linear regression to multiply imputed datasets. The class wraps
    statsmodels as well as sklearn because sklearn alone does not provide
    sufficient functionality to pool estimates under Rubin's rules. sklearn is
    for machine learning; therefore, important inference capabilities are
    lacking, such as easily calculating std. error estimates for parameters.
    If users want inference from regression analysis of multiply imputed
    data, utilze the statsmodels implementation in this class instead.

    Attributes:
        linear_models (dict): linear models used by supported python libs.
    """

    linear_models = {
        "type": "linear",
        "statsmodels": OLS,
        "sklearn": LinearRegression
    }

[docs]    def __init__(self, mi=None, model_lib="statsmodels", mi_kwgs=None,
                 model_kwgs=None):
        """Create an instance of the Autoimpute MiLinearRegression class.

        Args:
            mi (MiceImputer, Optional): An instance of a MiceImputer.
                Default is none. Can create one through `mi_kwgs` instead.
            model_lib (str, Optional): library the regressor will use to
                implement regression. Options are sklearn and statsmodels.
                Default is statsmodels.
            mi_kwgs (dict, Optional): keyword args to instantiate
                MiceImputer. Default is None. If valid MiceImputer
                passed as mi argument, then mi_kwgs ignored.
            model_kwgs (dict, Optional): keyword args to instantiate
                regressor. Default is None.

        Returns:
            self. Instance of the class.
        """
        MiBaseRegressor.__init__(
            self,
            mi=mi,
            model_lib=model_lib,
            mi_kwgs=mi_kwgs,
            model_kwgs=model_kwgs
        )

[docs]    @check_nan_columns
    def fit(self, X, y):
        """Fit model specified to multiply imputed dataset.

        Fit a linear regression on multiply imputed datasets. The method first
        creates multiply imputed data using the MiceImputer instantiated
        when creating an instance of the class. It then runs a linear model on
        each m datasets. The linear model comes from sklearn or statsmodels.
        Finally, the fit method calculates pooled parameters from the m linear
        models. Note that variance for pooled parameters using Rubin's rules
        is available for statsmodels only. sklearn does not implement
        parameter inference out of the box. Autoimpute sklearn pooling TBD.

        Args:
            X (pd.DataFrame): predictors to use. can contain missingness.
            y (pd.Series, pd.DataFrame): response. can contain missingness.

        Returns:
            self. Instance of the class
        """

        # retain columns incase encoding occurs
        self.fit_X_columns = X.columns.tolist()

        # generate the imputation datasets from multiple imputation
        # then fit the analysis models on each of the imputed datasets
        self.models_ = self._apply_models_to_mi_data(
            self.linear_models, X, y
        )

        # generate the fit statistics from each of the m models
        self.statistics_ = self._get_stats_from_models(self.models_)

        # still return an instance of the class
        return self

[docs]    @check_nan_columns
    def predict(self, X):
        """Make predictions using statistics generated from fit.

        The regression uses the pooled parameters from each of the imputed
        datasets to generate a set of single predictions. The pooled params
        come from multiply imputed datasets, but the predictions themselves
        follow the same rules as an ordinary linear regression.

        Args:
            X (pd.DataFrame): data to make predictions using pooled params.

        Returns:
            np.array: predictions.
        """

        # validation before prediction
        X = self._predict_strategy_validator(self, X)

        # get the alpha and betas, then create linear equation for predictions
        alpha = self.statistics_["coefs"].values[0]
        betas = self.statistics_["coefs"].values[1:]
        preds = alpha + betas.dot(X.T)
        return preds

[docs]    def summary(self):
        """Provide a summary for model parameters, variance, and metrics.

        The summary method brings together the statistics generated from fit
        as well as the variance ratios, if available. The statistics are far
        more valuable when using statsmodels than sklearn.

        Returns:
            pd.DataFrame: summary statistics
        """

        # only possible once we've fit a model with statsmodels
        check_is_fitted(self, "statistics_")
        sdf = pd.DataFrame(self.statistics_)
        sdf.rename(columns={"lambda_": "lambda"}, inplace=True)
        return sdf