# autoimpute/imputations/series/lrd.py

"""This module implements local residual draws via the LRDImputer.

This module contains the LRDImputer, which takes local residual draws to
impute missing values. LRD is similar to PMM, but residuals are added onto
the selected value for imputation. Dataframe imputers utilize this class when
its strategy is requested. Use SingleImputer or MultipleImputer with
strategy = `lrd` to broadcast the strategy across all the columns in a
dataframe, or specify this strategy for a given column.
"""

import numpy as np
import pymc as pm
from pandas import DataFrame
from scipy.stats import multivariate_normal
from sklearn.linear_model import LinearRegression
from sklearn.utils.validation import check_is_fitted
from autoimpute.imputations import method_names
from autoimpute.imputations.errors import _not_num_series
from autoimpute.imputations.helpers import _local_residuals
from .base import ISeriesImputer
methods = method_names
# pylint:disable=attribute-defined-outside-init
# pylint:disable=no-member
# pylint:disable=unused-variable

class LRDImputer(ISeriesImputer):
    """Impute missing values using local residual draws.

    The LRDImputer produces predictions using a combination of a bayesian
    approach to least squares and least squares itself. For each missing
    value, LRD finds the `n` closest neighbors from a least squares
    regression prediction set, and samples from the corresponding true values
    for y of each of those `n` predictions. The imputation is the resulting
    sample plus the residual, or the distance between the prediction and the
    neighbor. To implement bayesian least squares, the imputer utilizes the
    pymc library. The imputer can be used directly, but such behavior is
    discouraged. LRDImputer does not have the flexibility / robustness of
    dataframe imputers, nor is its behavior identical. Preferred use is
    MultipleImputer(strategy="lrd").
    """

    # class variables
    strategy = methods.LRD

    def __init__(self, **kwargs):
        """Create an instance of the LRDImputer class.

        The class requires multiple arguments necessary to create priors for
        a bayesian linear regression equation and for least squares itself.
        Therefore, LRD arguments include all of those seen in bayesian least
        squares and in least squares itself. The new parameter is `neighbors`,
        the number of neighbors from which LRD samples observed values.

        Args:
            **kwargs: default keyword arguments for lm & bayesian analysis.
                Note - kwargs popped for default arguments defined below.
                Next set of kwargs popped and sent to linear regression.
                Rest of kwargs passed as params to sampling (see pymc).
            am (float, Optional): mean of alpha prior. Default 0.
            asd (float, Optional): std. deviation of alpha prior. Default 10.
            bm (float, Optional): mean of beta priors. Default 0.
            bsd (float, Optional): std. deviation of beta priors. Default 10.
            sig (float, Optional): parameter of sigma prior. Default 1.
            sample (int, Optional): number of posterior samples per chain.
                Default = 1000. More samples take longer to run but give a
                better approximation of the posterior & a better chance of
                convergence.
            tune (int, Optional): parameter for tuning. Draws done in
                addition to sample. Default = 1000.
            init (str, Optional): MCMC algo to use for posterior sampling.
                Default = 'auto'. See pymc docs for more info on choices.
            fill_value (str, Optional): how to draw from the posterior to
                create imputations. Default is "random". 'random' and 'mean'
                supported as explicit options.
            neighbors (int, Optional): number of neighbors. Default is 5.
                Value should be greater than 0 and less than # observed,
                although anything greater than 10-20 is generally too high
                unless the dataset is massive.
            fit_intercept (bool, Optional): sklearn LinearRegression param.
            copy_x (bool, Optional): sklearn LinearRegression param.
            n_jobs (int, Optional): sklearn LinearRegression param.
        """
        self.am = kwargs.pop("am", None)
        self.asd = kwargs.pop("asd", 10)
        self.bm = kwargs.pop("bm", None)
        self.bsd = kwargs.pop("bsd", 10)
        self.sig = kwargs.pop("sig", 1)
        self.sample = kwargs.pop("sample", 1000)
        self.tune = kwargs.pop("tune", 1000)
        self.init = kwargs.pop("init", "auto")
        self.fill_value = kwargs.pop("fill_value", "random")
        self.neighbors = kwargs.pop("neighbors", 5)
        self.fit_intercept = kwargs.pop("fit_intercept", True)
        self.copy_x = kwargs.pop("copy_x", True)
        self.n_jobs = kwargs.pop("n_jobs", None)
        self.lm = LinearRegression(
            fit_intercept=self.fit_intercept,
            copy_X=self.copy_x,
            n_jobs=self.n_jobs
        )
        self.sample_kwargs = kwargs
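
    # Example of the kwargs routing above (illustrative, not part of the
    # class): LRDImputer(neighbors=10, target_accept=0.9) pops `neighbors`
    # for the imputer itself, while `target_accept` stays in kwargs and is
    # forwarded to pm.sample via self.sample_kwargs.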

    def fit(self, X, y):
        """Fit the Imputer to the dataset by fitting bayesian and LS models.

        Args:
            X (pd.Dataframe): dataset to fit the imputer.
            y (pd.Series): response, which is eventually imputed.

        Returns:
            self. Instance of the class.
        """
        _not_num_series(self.strategy, y)
        nc = len(X.columns)

        # get predictions for the data, which will be used for "closest" vals
        y_pred = self.lm.fit(X, y).predict(X)
        y_df = DataFrame({"y": y, "y_pred": y_pred})

        # calculate bayes and use appropriate means for alpha and beta priors.
        # here we specify the point estimates from the linear regression as
        # the means for the priors. this greatly speeds up posterior sampling
        # and helps ensure that convergence occurs
        if self.am is None:
            self.am = self.lm.intercept_
        if self.bm is None:
            self.bm = self.lm.coef_

        # initialize model for bayesian linear reg. default vals for priors
        # assume data is scaled and centered. convergence can struggle or fail
        # if that's not the case and proper values for the priors are not
        # specified separately. each beta prior is normal and "independent";
        # while the betas are likely correlated, independent priors are a
        # standard simplifying assumption
        with pm.Model() as fit_model:
            alpha = pm.Normal("alpha", self.am, self.asd)
            beta = pm.Normal("beta", self.bm, self.bsd, shape=nc)
            sigma = pm.HalfCauchy("σ", self.sig)
            mu = alpha + beta.dot(X.T)
            score = pm.Normal("score", mu, sigma, observed=y)
        params = {"model": fit_model, "y_obs": y_df}
        self.statistics_ = {"param": params, "strategy": self.strategy}
        return self
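
    # In equation form, the model assembled in fit() above is:
    #   alpha ~ Normal(am, asd)
    #   beta_j ~ Normal(bm_j, bsd)    for j = 1..nc
    #   sigma ~ HalfCauchy(sig)
    #   y_i ~ Normal(alpha + x_i . beta, sigma)
    # with (am, bm) defaulting to the OLS intercept and coefficients.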

    def impute(self, X):
        """Generate imputations using predictions from the fit bayesian model.

        The impute method returns the values for imputation. Missing values
        in a given dataset are replaced with a random selection from the LRD
        process. Again, LRD bases imputations on actually observed values,
        and the observed values are selected by finding the closest least
        squares predictions to a given prediction from the bayesian model.

        Args:
            X (pd.DataFrame): predictors to determine imputed values.

        Returns:
            np.array: imputed dataset.
        """
        # check if fitted then predict with least squares
        check_is_fitted(self, "statistics_")
        model = self.statistics_["param"]["model"]
        df = self.statistics_["param"]["y_obs"]
        df = df.reset_index(drop=True)

        # generate posterior distribution for alpha, beta coefficients
        with model:
            tr = pm.sample(
                self.sample, tune=self.tune, init=self.init,
                **self.sample_kwargs
            )
        self.trace_ = tr

        # support for pymc - handling InferenceData obj instead of MultiTrace.
        # we have to compress chains ourselves w/ InferenceData obj (xarray)
        post = tr.posterior
        alpha_, beta_ = post.alpha.values, post.beta.values
        chain, draws, beta_dim = beta_.shape
        beta_ = beta_.reshape(chain * draws, beta_dim)

        # sample random alpha from alpha posterior distribution
        alpha_bayes = np.random.choice(alpha_.ravel())

        # get the mean and covariance of the multivariate betas.
        # betas assumed multivariate normal by linear reg rules.
        # sample beta w/ cov structure to create realistic variability
        beta_means, beta_cov = beta_.mean(0), np.cov(beta_.T)
        beta_bayes = np.array(multivariate_normal(beta_means, beta_cov).rvs())

        # predictions for missing y, using bayes alpha + coeff samples.
        # use these preds for nearest neighbor search from reg results.
        # neighbors are nearest from prediction model fit on observed.
        # imputed values are actual y vals corresponding to nearest neighbors,
        # so this is a form of "hot-deck" imputation
        y_pred_bayes = alpha_bayes + beta_bayes.dot(X.T)
        n_ = self.neighbors
        if X.columns.size == 1:
            y_pred_bayes = y_pred_bayes[0]
        if self.fill_value == "mean":
            imp = [_local_residuals(x, n_, df, np.mean) for x in y_pred_bayes]
        elif self.fill_value == "random":
            choice = np.random.choice
            imp = [_local_residuals(x, n_, df, choice) for x in y_pred_bayes]
        else:
            err = f"{self.fill_value} must be `mean` or `random`."
            raise ValueError(err)

        # finally, set last class values and return imputations
        self.y_pred = y_pred_bayes
        self.alphas = alpha_bayes
        self.betas = beta_bayes
        return imp
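
    # Reproducibility note (illustrative): impute() draws with
    # np.random.choice and scipy's multivariate_normal, both of which use
    # numpy's global random state, so calling np.random.seed(...) before
    # impute() fixes those draws. The pm.sample call can be seeded separately
    # by passing random_seed=... at construction (forwarded via
    # self.sample_kwargs).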

    def fit_impute(self, X, y):
        """Fit impute method to generate imputations where y is missing.

        Args:
            X (pd.Dataframe): predictors in the dataset.
            y (pd.Series): response w/ missing values to impute.

        Returns:
            np.array: imputed dataset.
        """
        # transform occurs with records from X where y is missing
        miss_y_ix = y[y.isnull()].index
        return self.fit(X, y).impute(X.loc[miss_y_ix])
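
# ---------------------------------------------------------------------------
# Minimal runnable sketch (illustrative only, not part of the library). It
# fits on observed rows and imputes the missing ones, mirroring how the
# dataframe imputers drive this class. All names below are hypothetical.
if __name__ == "__main__":
    import pandas as pd

    rng = np.random.default_rng(0)
    X_demo = pd.DataFrame({
        "x1": rng.normal(size=100),
        "x2": rng.normal(size=100),
    })
    y_demo = 2 + 3 * X_demo["x1"] - X_demo["x2"] + rng.normal(0, 0.5, 100)
    y_demo.iloc[rng.choice(100, size=10, replace=False)] = np.nan

    observed = y_demo.notnull()
    imputer = LRDImputer(neighbors=5, sample=500, tune=500)
    imputer.fit(X_demo[observed], y_demo[observed])
    imputations = imputer.impute(X_demo[~observed])
    print(imputations)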