Source code for autoimpute.imputations.series.bayesian_regression

"""This module implements bayesian techniques to impute missing data.

This module contains BayesLeastSquaresImputer and BayesBinaryLogisticImputer.
Both imputers are the bayesian equivalent of their frequentist counterparts
(LeastSquaresImputer and BinaryLogisticImputer). Dataframe imputers utilize
the classes in this module when each's respective strategy is requested.
Use SingleImputer or MultipleImputer with strategy = `bayesian least squares`
or `bayesian binary logistic` to broadcast the strategies across all the
columns in a dataframe, or specify either strategy for a given column.
"""

import numpy as np
import pymc as pm
from pandas import Series
from sklearn.utils.validation import check_is_fitted
from autoimpute.imputations import method_names
from autoimpute.imputations.errors import _not_num_series
from .base import ISeriesImputer
methods = method_names
# pylint:disable=attribute-defined-outside-init
# pylint:disable=too-many-arguments
# pylint:disable=unused-variable
# pylint:disable=no-member
# pylint:disable=too-many-instance-attributes
# pylint:disable=unsubscriptable-object

[docs]class BayesianLeastSquaresImputer(ISeriesImputer):
    """Impute missing values using bayesian least squares regression.

    The BayesianLeastSquaresImputer produces predictions using the bayesian
    approach to least squares. Prior distributions are fit for the model
    parameters of interest (alpha, beta, epsilon). Imputations for missing
    values are samples from posterior predictive distribution of each missing
    point. To implement bayesian least squares, the imputer utlilizes the
    pymc library. The imputer can be used directly, but such behavior is
    discouraged. BayesianLeastSquaresImputer does not have the flexibility /
    robustness of dataframe imputers, nor is its behavior identical. Preferred
    use is MultipleImputer(strategy="bayesian least squares").
    """
    # class variables
    strategy = methods.BAYESIAN_LS

[docs]    def __init__(self, **kwargs):
        """Create an instance of the BayesianLeastSquaresImputer class.

        The class requires multiple arguments necessary to create priors for
        a bayesian linear regression equation. The regression is:
        alpha + beta * X + epsilson. Because paramaters are treated as
        random variables, we must specify their distributions, including
        the parameters of those distributions. In thie init method we also
        include arguments used to sample the posterior distributions.

        Args:
            **kwargs: default keyword arguments used for bayesian analysis.
                Note - kwargs popped for default arguments defined below.
                Rest of kwargs passed as params to sampling (see pymc).
            am (float, Optional): mean of alpha prior. Default 0.
            asd (float, Optional): std. deviation of alpha prior. Default 10.
            bm (float, Optional): mean of beta priors. Default 0.
            bsd (float, Optional): std. deviation of beta priors. Default 10.
            sig (float, Optional): parameter of sigma prior. Default 1.
            sample (int, Optional): number of posterior samples per chain.
                Default = 1000. More samples, longer to run, but better
                approximation of the posterior & chance of convergence.
            tune (int, Optional): parameter for tuning. Draws done in addition
                to sample. Default = 1000.
            init (str, Optional): MCMC algo to use for posterior sampling.
                Default = 'auto'. See pymc docs for more info on choices.
            fill_value (str, Optional): How to draw from the posterior to
                create imputations. Default is None. 'random' and 'mean'
                supported for explicit options.
        """
        self.am = kwargs.pop("am", 0)
        self.asd = kwargs.pop("asd", 10)
        self.bm = kwargs.pop("bm", 0)
        self.bsd = kwargs.pop("bsd", 10)
        self.sig = kwargs.pop("sig", 1)
        self.sample = kwargs.pop("sample", 1000)
        self.tune = kwargs.pop("tune", 1000)
        self.init = kwargs.pop("init", "auto")
        self.fill_value = kwargs.pop("fill_value", None)
        self.sample_kwargs = kwargs

[docs]    def fit(self, X, y):
        """Fit the Imputer to the dataset by fitting bayesian model.

        Args:
            X (pd.Dataframe): dataset to fit the imputer.
            y (pd.Series): response, which is eventually imputed.

        Returns:
            self. Instance of the class.
        """
        _not_num_series(self.strategy, y)
        nc = len(X.columns)

        # initialize model for bayesian linear reg. Default vals for priors
        # assume data is scaled and centered. Convergence can struggle or fail
        # if not the case and proper values for the priors are not specified
        # separately, also assumes each beta is normal and "independent"
        # while betas likely not independent, this is technically a rule of OLS
        with pm.Model() as fit_model:
            alpha = pm.Normal("alpha", self.am, self.asd)
            beta = pm.Normal("beta", self.bm, self.bsd, shape=nc)
            sigma = pm.HalfCauchy("σ", self.sig)
            mu = alpha+beta.dot(X.T)
            score = pm.Normal("score", mu, sigma, observed=y)
        self.statistics_ = {"param": fit_model, "strategy": self.strategy}
        return self

[docs]    def impute(self, X, k=None):
        """Generate imputations using predictions from the fit bayesian model.

        The transform method returns the values for imputation. Missing values
        in a given dataset are replaced with the samples from the posterior
        predictive distribution of each missing data point.

        Args:
            X (pd.DataFrame): predictors to determine imputed values.
            k (integer): optional, pass if and only if receiving from MICE
        Returns:
            np.array: imputed dataset.
        """
        # check if fitted then predict with least squares
        check_is_fitted(self, "statistics_")
        model = self.statistics_["param"]

        # add a Deterministic node for each missing value
        # sampling then pulls from the posterior predictive distribution
        # each missing data point. I.e. distribution for EACH missing
        base_name = "mu_pred"
        if k is not None:
            base_name = f"{base_name}_{k}"
        with model:
            mu_pred = pm.Deterministic(
                base_name, model["alpha"]+model["beta"].dot(X.T)
            )
            tr = pm.sample(
                self.sample,
                tune=self.tune,
                init=self.init,
                **self.sample_kwargs
            )
        self.trace_ = tr

        # support for pymc - handling InferenceData obj instead of MultiTrace
        # we have to compress chains ourselves w/ InferenceData obj (xarray)
        post = tr.posterior[base_name].values
        chain, draws, dim = post.shape
        post = post.reshape(chain*draws, dim)

        # decide how to impute. Use mean of posterior predictive or random draw
        # not supported yet, but eventually consider using the MAP
        if not self.fill_value or self.fill_value == "mean":
            imp = post.mean(0)
        elif self.fill_value == "random":
            imp = np.apply_along_axis(np.random.choice, 0, post)
        else:
            err = f"{self.fill_value} must be 'mean' or 'random'."
            raise ValueError(err)
        return imp

[docs]    def fit_impute(self, X, y):
        """Fit impute method to generate imputations where y is missing.

        Args:
            X (pd.Dataframe): predictors in the dataset.
            y (pd.Series): response w/ missing values to impute.

        Returns:
            np.array: imputed dataset.
        """
        # transform occurs with records from X where y is missing
        miss_y_ix = y[y.isnull()].index
        return self.fit(X, y).impute(X.loc[miss_y_ix])


[docs]class BayesianBinaryLogisticImputer(ISeriesImputer):
    """Impute missing values using bayesian binary losgistic regression.

    The BayesianBinaryLogisticImputer produces predictions using the bayesian
    approach to logistic regression. Prior distributions are fit for the model
    parameters of interest (alpha, beta, epsilon). Imputations for missing
    values are samples from the posterior predictive distribution of each
    missing point. To implement bayesian logistic regression, the imputer uses
    the pymc library. The imputer can be used directly, but such behavior is
    discouraged. BayesianBinaryLogisticImputer does not have the flexibility /
    robustness of dataframe imputers, nor is its behavior identical.
    Preferred use is MultipleImputer(strategy="bayesian binary logistic").
    """
    # class variables
    strategy = methods.BAYESIAN_BINARY_LOGISTIC

[docs]    def __init__(self, **kwargs):
        """Create an instance of the BayesianBinaryLogisticImputer class.

        The class requires multiple arguments necessary to create priors for
        a bayesian logistic regression equation. The parameters are the same
        as linear regression, but the regression equation is transformed using
        pymc's invlogit method. Because paramaters are treated as random
        variables, we must specify their distributions, including
        the parameters of those distributions. In the init method we also
        include arguments used to sample the posterior distributions.

        Args:
            **kwargs: default keyword arguments used for bayesian analysis.
                Note - kwargs popped for default arguments defined below.
                Rest of kwargs passed as params to sampling (see pymc).
            am (float, Optional): mean of alpha prior. Default 0.
            asd (float, Optional): std. deviation of alpha prior. Default 10.
            bm (float, Optional): mean of beta priors. Default 0.
            bsd (float, Optional): std. deviation of beta priors. Default 10.
            thresh (float, Optional): threshold for class membership.
                Default 0.5. Max = 1, min = 0. Tune threshhold depending on
                class imbalance. Same as with logistic regression equation.
            sample (int, Optional): number of posterior samples per chain.
                Default = 1000. More samples, longer to run, but better
                approximation of the posterior & chance of convergence.
            tune (int, Optional): parameter for tuning. Draws done in addition
                to sample. Default = 1000.
            init (str, Optional): MCMC algo to use for posterior sampling.
                Default = 'auto'. See pymc docs for more info on choices.
            fill_value (str, Optional): How to draw from the posterior to
                create imputations. Default is None. 'random' and 'mean'
                supported for explicit options.
        """
        self.am = kwargs.pop("am", 0)
        self.asd = kwargs.pop("asd", 10)
        self.bm = kwargs.pop("bm", 0)
        self.bsd = kwargs.pop("bsd", 10)
        self.thresh = kwargs.pop("thresh", 0.5)
        self.sample = kwargs.pop("sample", 1000)
        self.tune = kwargs.pop("tune", 1000)
        self.init = kwargs.pop("init", "auto")
        self.fill_value = kwargs.pop("fill_value", None)
        self.sample_kwargs = kwargs

[docs]    def fit(self, X, y):
        """Fit the Imputer to the dataset by fitting bayesian model.

        Args:
            X (pd.Dataframe): dataset to fit the imputer.
            y (pd.Series): response, which is eventually imputed.

        Returns:
            self. Instance of the class.
        """
        y = y.astype("category").cat
        y_cat_l = len(y.codes.unique())

        # bayesian logistic regression. Mutliple categories not supported yet
        if y_cat_l != 2:
            err = "Only two categories supported. Multinomial coming soon."
            raise ValueError(err)
        nc = len(X.columns)

        # initialize model for bayesian logistic reg. Default vals for priors
        # assume data is scaled and centered. Convergence can struggle or fail
        # if not the case and proper values for the priors are not specified
        # separately, also assumes each beta is normal and "independent"
        # while betas likely not independent, this is technically a rule of OLS
        with pm.Model() as fit_model:
            alpha = pm.Normal("alpha", self.am, self.asd)
            beta = pm.Normal("beta", self.bm, self.bsd, shape=nc)
            p = pm.invlogit(alpha + beta.dot(X.T))
            score = pm.Bernoulli("score", p, observed=y.codes)

        params = {"model": fit_model, "labels": y.categories}
        self.statistics_ = {"param": params, "strategy": self.strategy}
        return self

[docs]    def impute(self, X, k=None):
        """Generate imputations using predictions from the fit bayesian model.

        The impute method returns the values for imputation. Missing values
        in a given dataset are replaced with the samples from the posterior
        predictive distribution of each missing data point.

        Args:
            X (pd.DataFrame): predictors to determine imputed values.
            k (integer): optional, pass if and only if receiving from MICE
        Returns:
            np.array: imputated dataset.
        """
        # check if fitted then predict with least squares
        check_is_fitted(self, "statistics_")
        model = self.statistics_["param"]["model"]
        labels = self.statistics_["param"]["labels"]

        # add a Deterministic node for each missing value
        # sampling then pulls from the posterior predictive distribution
        # each missing data point. I.e. distribution for EACH missing
        base_name = "p_pred"
        if k is not None:
            base_name = f"{base_name}_{k}"
        with model:
            p_pred = pm.Deterministic(
                base_name, pm.invlogit(model["alpha"] + model["beta"].dot(X.T))
            )
            tr = pm.sample(
                self.sample,
                tune=self.tune,
                init=self.init,
                **self.sample_kwargs
            )
        self.trace_ = tr

        # support for pymc - handling InferenceData obj instead of MultiTrace
        # we have to compress chains ourselves w/ InferenceData obj (xarray)
        post = tr.posterior[base_name].values
        chain, draws, dim = post.shape
        post = post.reshape(chain*draws, dim)

        # decide how to impute. Use mean of posterior predictive or random draw
        # not supported yet, but eventually consider using the MAP
        if not self.fill_value or self.fill_value == "mean":
            imp = post.mean(0)
        elif self.fill_value == "random":
            imp = np.apply_along_axis(np.random.choice, 0, post)
        else:
            err = f"{self.fill_value} must be 'mean' or 'random'."
            raise ValueError(err)

        # convert probabilities to class membership
        # then map class membership to corresponding label
        fill_thresh = np.vectorize(lambda f: 1 if f > self.thresh else 0)
        preds = fill_thresh(imp)
        label_dict = {i:j for i, j in enumerate(labels.values)}
        imp = Series(preds).replace(label_dict, inplace=False)
        return imp.values

[docs]    def fit_impute(self, X, y):
        """Fit impute method to generate imputations where y is missing.

        Args:
            X (pd.Dataframe): predictors in the dataset.
            y (pd.Series): response w/ missing values to impute.

        Returns:
            np.array: imputed dataset.
        """
        # transform occurs with records from X where y is missing
        miss_y_ix = y[y.isnull()].index
        return self.fit(X, y).impute(X.loc[miss_y_ix])