# Source code for autoimpute.imputations.series.random

"""This module implements random imputation via the RandomImputer.

The RandomImputer imputes missing data using a random draw with replacement
from the observed data. Dataframe imputers utilize this class when its
strategy is requested. Use SingleImputer or MultipleImputer with
strategy = `random` to broadcast the strategy across all the columns in a
dataframe, or specify this strategy for a given column.
"""

import numpy as np
from sklearn.utils.validation import check_is_fitted
from autoimpute.imputations import method_names
from .base import ISeriesImputer
# Short alias for the strategy-name constants; RandomImputer reads
# methods.RANDOM from it to label its strategy.
methods = method_names
# pylint:disable=attribute-defined-outside-init
# pylint:disable=unnecessary-pass

class RandomImputer(ISeriesImputer):
    """Impute missing data using random draws from observed data.

    The RandomImputer samples with replacement from the unique observed
    values of a series, so every distinct observed value is equally likely
    to be drawn regardless of how often it occurs. The imputer can be used
    directly, but such behavior is discouraged. RandomImputer does not have
    the flexibility / robustness of dataframe imputers, nor is its behavior
    identical. Preferred use is MultipleImputer(strategy="random").
    """
    # class variables
    strategy = methods.RANDOM

    def __init__(self):
        """Create an instance of the RandomImputer class."""
        pass

    def fit(self, X, y=None):
        """Fit the Imputer to the dataset and get unique observed to sample.

        Args:
            X (pd.Series): Dataset to fit the imputer.
            y (None): ignored, None to meet requirements of base class

        Returns:
            self. Instance of the class.
        """

        # determine the unique observed values to sample from.
        # dict.fromkeys de-duplicates while keeping first-appearance order;
        # a plain set iterates strings in a hash-randomized order, which
        # makes seeded draws differ from one interpreter run to the next.
        observed = X[~X.isnull()]
        random = list(dict.fromkeys(observed))
        self.statistics_ = {"param": random, "strategy": self.strategy}
        return self

    def impute(self, X):
        """Perform imputations using the statistics generated from fit.

        Each missing value in the given series gets a random draw (with
        replacement) from the unique set of observed values determined
        during the fit stage. Only the drawn values are returned; placing
        them back into the series is left to the caller.

        Args:
            X (pd.Series): Dataset to impute missing data from fit.

        Returns:
            np.array -- one imputed value per missing entry in X.

        Raises:
            ValueError: If X has missing values but the imputer was fit on
                a series with no observed values.
        """
        # check if fitted and identify location of missingness
        check_is_fitted(self, "statistics_")
        ind = X[X.isnull()].index

        # get the observed values and sample from them
        param = self.statistics_["param"]
        if len(ind) > 0 and not param:
            # fail with a clear message instead of numpy's generic error
            raise ValueError(
                "Cannot impute: no observed values were available at fit."
            )
        imp = np.random.choice(param, len(ind))
        return imp

    def fit_impute(self, X, y=None):
        """Convenience method to perform fit and imputation in one go."""
        return self.fit(X, y).impute(X)