# Source code for autoimpute.imputations.series.linear_regression

"""This module implements least squares and stochastic imputation.

This module contains the LeastSquaresImputer and the StochasticImputer. Both
use least squares to find a line of best fit and fill imputations with the
predictions from the line. Stochastic adds random error to each prediction.
Dataframe imputers utilize these classes when their strategies are requested.
Use SingleImputer or MultipleImputer with strategy = `least squares` or
`stochastic` to broadcast the strategy across all the columns in a dataframe,
or specify the strategy for a given column.
"""

from numpy import sqrt
from scipy.stats import norm
from sklearn.utils.validation import check_is_fitted
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from autoimpute.imputations import method_names
from autoimpute.imputations.errors import _not_num_series
from .base import ISeriesImputer
# Short alias for the strategy-name registry; the imputer classes in this
# module read their `strategy` class attribute from it (methods.LS and
# methods.STOCHASTIC).
methods = method_names
# pylint:disable=attribute-defined-outside-init
# pylint:

class LeastSquaresImputer(ISeriesImputer):
    """Impute missing values using predictions from least squares regression.

    The LeastSquaresImputer produces predictions using the least squares
    methodology. The predictions from the line of best fit given a set of
    predictors become the imputations. To implement least squares, the imputer
    wraps the sklearn LinearRegression class. The imputer can be used
    directly, but such behavior is discouraged. LeastSquaresImputer does not
    have the flexibility / robustness of dataframe imputers, nor is its
    behavior identical. Preferred use is
    MultipleImputer(strategy="least squares").
    """
    # class variables
    strategy = methods.LS

    def __init__(self, **kwargs):
        """Create an instance of the LeastSquaresImputer class.

        Args:
            **kwargs: keyword arguments passed to LinearRegression.

        """
        self.lm = LinearRegression(**kwargs)

    def fit(self, X, y):
        """Fit the Imputer to the dataset by fitting linear model.

        The rows passed here are handed to LinearRegression.fit as they are,
        so `y` must hold observed values only (sklearn's LinearRegression
        does not accept NaN targets). Use `fit_impute` to have the observed
        rows selected automatically.

        Args:
            X (pd.Dataframe): dataset to fit the imputer.
            y (pd.Series): response, which is eventually imputed.

        Returns:
            self. Instance of the class.
        """
        # presumably raises when y is not a numeric series -- the helper is
        # defined in autoimpute.imputations.errors, not visible here
        _not_num_series(self.strategy, y)
        self.lm.fit(X, y)
        # statistics_ doubles as the "is fitted" marker checked in impute
        self.statistics_ = {"strategy": self.strategy}
        return self

    def impute(self, X):
        """Generate imputations using predictions from the fit linear model.

        The impute method returns the values for imputation. Missing values
        in a given dataset are replaced with the predictions from the least
        squares regression line of best fit. This transform method returns
        those predictions.

        Args:
            X (pd.DataFrame): predictors to determine imputed values.

        Returns:
            np.array: imputed dataset.
        """
        # check if fitted then predict with least squares
        check_is_fitted(self, "statistics_")
        return self.lm.predict(X)

    def fit_impute(self, X, y):
        """Fit impute method to generate imputations where y is missing.

        The linear model is fit on the rows where `y` is observed, and
        predictions are then generated for the rows where `y` is missing.

        Args:
            X (pd.Dataframe): predictors in the dataset.
            y (pd.Series): response w/ missing values to impute.

        Returns:
            np.array: imputed values, one per missing entry of `y` (empty
            if `y` has no missing values).
        """
        missing = y.isnull()
        observed = y[~missing]
        # fit only on observed records: LinearRegression rejects NaN targets,
        # so fitting on the full response would fail whenever there is
        # something to impute
        self.fit(X.loc[observed.index], observed)

        # transform occurs with records from X where y is missing
        miss_y_ix = y[missing].index
        if miss_y_ix.empty:
            # nothing to impute; predicting on zero rows would raise, so
            # return an empty array with the dtype of y instead
            return y[missing].values
        return self.impute(X.loc[miss_y_ix])

class StochasticImputer(ISeriesImputer):
    """Impute missing values adding error to least squares regression preds.

    The StochasticImputer predicts using the least squares methodology. The
    imputer then samples from the regression's error distribution and adds the
    random draw to the prediction. This draw adds the stochastic element to
    the imputations. The imputer can be used directly, but such behavior is
    discouraged. StochasticImputer does not have the flexibility / robustness
    of dataframe imputers, nor is its behavior identical. Preferred use is
    MultipleImputer(strategy="stochastic").
    """
    # class variables
    strategy = methods.STOCHASTIC

    def __init__(self, **kwargs):
        """Create an instance of the StochasticImputer class.

        Args:
            **kwargs: keyword arguments passed to LinearRegression.

        """
        self.lm = LinearRegression(**kwargs)

    def fit(self, X, y):
        """Fit the Imputer to the dataset by fitting linear model.

        The fit step also generates predictions on the observed data. These
        predictions are necessary to derive the mean_squared_error, which is
        passed as a parameter to the impute phase. The MSE is used to create
        the normal error distribution from which the imputer draws.

        The rows passed here are handed to LinearRegression.fit as they are,
        so `y` must hold observed values only (sklearn's LinearRegression
        does not accept NaN targets). Use `fit_impute` to have the observed
        rows selected automatically.

        Args:
            X (pd.Dataframe): dataset to fit the imputer.
            y (pd.Series): response, which is eventually imputed.

        Returns:
            self. Instance of the class.
        """
        # presumably raises when y is not a numeric series -- the helper is
        # defined in autoimpute.imputations.errors, not visible here
        _not_num_series(self.strategy, y)
        self.lm.fit(X, y)

        # in-sample error of the fitted line; stored so impute can build the
        # error distribution without access to the training data
        preds = self.lm.predict(X)
        mse = mean_squared_error(y, preds)
        # statistics_ doubles as the "is fitted" marker checked in impute
        self.statistics_ = {"param": mse, "strategy": self.strategy}
        return self

    def impute(self, X):
        """Generate imputations using predictions from the fit linear model.

        The impute method returns the values for imputation. Missing values
        in a given dataset are replaced with the predictions from the least
        squares regression line of best fit plus a random draw from the normal
        error distribution.

        Args:
            X (pd.DataFrame): predictors to determine imputed values.

        Returns:
            np.array: imputed dataset.
        """
        # check if fitted then predict with least squares
        check_is_fitted(self, "statistics_")
        mse = self.statistics_["param"]
        preds = self.lm.predict(X)

        # add a random draw from a zero-mean normal dist whose standard
        # deviation is the root of the MSE from the observed model.
        # This makes lm stochastic
        mse_dist = norm.rvs(loc=0, scale=sqrt(mse), size=len(preds))
        return preds + mse_dist

    def fit_impute(self, X, y):
        """Fit impute method to generate imputations where y is missing.

        The linear model (and its MSE) is fit on the rows where `y` is
        observed, and stochastic predictions are then generated for the rows
        where `y` is missing.

        Args:
            X (pd.Dataframe): predictors in the dataset.
            y (pd.Series): response w/ missing values to impute.

        Returns:
            np.array: imputed values, one per missing entry of `y` (empty
            if `y` has no missing values).
        """
        missing = y.isnull()
        observed = y[~missing]
        # fit only on observed records: LinearRegression and
        # mean_squared_error reject NaN targets, so fitting on the full
        # response would fail whenever there is something to impute
        self.fit(X.loc[observed.index], observed)

        # transform occurs with records from X where y is missing
        miss_y_ix = y[missing].index
        if miss_y_ix.empty:
            # nothing to impute; predicting on zero rows would raise, so
            # return an empty array with the dtype of y instead
            return y[missing].values
        return self.impute(X.loc[miss_y_ix])