# Source code for autoimpute.imputations.series.default

"""This module implements default imputers used for series Imputer classes.

These Imputer classes serve as defaults within more advanced imputers. They
are flexible, and they allow users to quickly run imputations without getting
a runtime error as they would in sklearn if the data types in a dataset are
mixed. There are three default imputers at the moment: DefaultUnivarImputer,
DefaultTimeSeriesImputer and DefaultPredictiveImputer. Default imputers
inherit from DefaultBaseImputer.
"""

from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
from sklearn.utils.validation import check_is_fitted
from autoimpute.imputations import method_names
from .pmm import PMMImputer
from .mean import MeanImputer
from .mode import ModeImputer
from .interpolation import InterpolateImputer
from .logistic_regression import MultinomialLogisticImputer
from .base import ISeriesImputer
# short alias for the method_names module; the default imputer classes in
# this file read their `strategy` class constants from it
methods = method_names
# pylint:disable=attribute-defined-outside-init
# pylint:disable=unnecessary-pass
# pylint:disable=dangerous-default-value
# pylint:disable=too-many-instance-attributes

class DefaultBaseImputer(ISeriesImputer):
    """Building blocks for the default imputers.

    The DefaultBaseImputer is not a stand-alone class and thus serves no
    purpose other than as a Parent to DefaultImputers. Therefore, the
    DefaultBaseImputer should not be used directly unless creating a new
    version of a DefaultImputer.

    The class holds one imputer for numerical data and one for categorical
    data. Both are passed in as classes and instantiated here with their
    respective keyword arguments.
    """
    def __init__(self, num_imputer, cat_imputer, num_kwgs, cat_kwgs):
        """Initialize the DefaultBaseImputer.

        Args:
            num_imputer (Imputer): valid Imputer class for numerical data.
            cat_imputer (Imputer): valid Imputer class for categorical data.
            num_kwgs (dict): Keyword args for numerical imputer, or None.
            cat_kwgs (dict): keyword args for categorical imputer, or None.

        Raises:
            ValueError: if kwgs are neither None nor dict, or if an imputer
                argument fails validation in its property setter.
        """
        # INSTANCE ATTRIBUTES MUST BE IN ORDER THEY ARE VALIDATED WITH GET/SET
        # --------------------------------------------------------------------
        # Position of arguments in __init__ is essentially arbitrary
        # But the kwgs must be set first, because the imputer setters read
        # them in order to instantiate the imputer classes
        self.num_kwgs = num_kwgs
        self.cat_kwgs = cat_kwgs
        self.num_imputer = num_imputer
        self.cat_imputer = cat_imputer

    @staticmethod
    def _build_imputer(imp, kwgs, bad_name_msg, invalid_msg):
        """Validate an imputer class and return an instance of it.

        Shared by the num_imputer and cat_imputer setters so both apply
        exactly the same checks.

        Args:
            imp (Imputer): candidate imputer class.
            kwgs (dict): keyword args used to instantiate imp, or None.
            bad_name_msg (str): message template (with an `{imp}` field)
                used when the class name does not end in "Imputer".
            invalid_msg (str): message template (with an `{imp}` field)
                used when an AttributeError occurs during validation.

        Returns:
            Instance of imp, created with kwgs if any were given.

        Raises:
            ValueError: class name does not end in Imputer.
            ValueError: class does not implement fit_impute.
            ValueError: an AttributeError was raised along the way, e.g.
                because imp has no __name__ attribute.
        """
        # try necessary because imp could be anything, including an object
        # without a __name__ attribute. The try deliberately spans the
        # instantiation as well, so the exception type seen by callers is
        # the same as it has always been.
        try:
            if not imp.__name__.endswith("Imputer"):
                raise ValueError(bad_name_msg.format(imp=imp))
            # valid imputers must have `fit_impute` method
            m = "fit_impute"
            if not hasattr(imp, m):
                err = f"Imputer must implement {m} method."
                raise ValueError(err)
            # if valid imputer, instantiate it with kwargs
            # if kwargs contains improper args, imp will handle error
            if kwgs is None:
                return imp()
            return imp(**kwgs)
        except AttributeError as ae:
            raise ValueError(invalid_msg.format(imp=imp)) from ae

    @property
    def num_kwgs(self):
        """Property getter to return the value of num_kwgs."""
        return self._num_kwgs

    @num_kwgs.setter
    def num_kwgs(self, kwgs):
        """Validate the num_kwgs and set default properties.

        Raises:
            ValueError: kwgs is neither None nor a dict.
        """
        if not isinstance(kwgs, (type(None), dict)):
            err = "num_kwgs must be dict of args used to init num_imputer."
            raise ValueError(err)
        self._num_kwgs = kwgs

    @property
    def cat_kwgs(self):
        """Property getter to return the value of cat_kwgs."""
        return self._cat_kwgs

    @cat_kwgs.setter
    def cat_kwgs(self, kwgs):
        """Validate the cat_kwgs and set default properties.

        Raises:
            ValueError: kwgs is neither None nor a dict.
        """
        if not isinstance(kwgs, (type(None), dict)):
            err = "cat_kwgs must be dict of args used to init cat_imputer."
            raise ValueError(err)
        self._cat_kwgs = kwgs

    @property
    def num_imputer(self):
        """Property getter to return the value of the num imputer."""
        return self._num_imputer

    @num_imputer.setter
    def num_imputer(self, imp):
        """Validate the num imputer and instantiate it with num_kwgs.

        Args:
            imp (Imputer): must be a valid autoimpute Imputer class

        Raises:
            ValueError: any Imputer class must end in Imputer
            ValueError: Imputer must implement fit_impute
            ValueError: argument not an instance of an Imputer
        """
        self._num_imputer = self._build_imputer(
            imp,
            self.num_kwgs,
            "{imp} must be a class ending in Imputer",
            "{imp} is not an instance of an Imputer"
        )

    @property
    def cat_imputer(self):
        """Property getter to return the value of the cat imputer."""
        return self._cat_imputer

    @cat_imputer.setter
    def cat_imputer(self, imp):
        """Validate the cat imputer and instantiate it with cat_kwgs.

        Args:
            imp (Imputer): must be a valid autoimpute Imputer class

        Raises:
            ValueError: any imputer class must end in Imputer
            ValueError: imputer must implement fit_impute
            ValueError: argument not an instance of an Imputer
        """
        self._cat_imputer = self._build_imputer(
            imp,
            self.cat_kwgs,
            "{imp} must be an Imputer class from autoimpute",
            "{imp} is not a valid Imputer"
        )

    def fit(self, X, y):
        """Fit the Imputer to the dataset and determine the right approach.

        The dtype of y (or of X, when y is None) decides whether the
        numerical or the categorical imputer is fit. If the dtype is neither
        numeric nor string, no imputer is fit and both statistics stay None.

        Args:
            X (pd.Series): Dataset to fit the imputer, or predictors
            y (pd.Series): None, or dataset to fit predictors

        Returns:
            self. Instance of the class.
        """
        # start off with stats blank
        stats = {"param": None, "strategy": None}

        # if y is None, fitting simply X. univariate method.
        # if y is not None, fitting X to y. predictive method.
        target = X if y is None else y

        # the checks are kept independent (not elif), as they always were
        if is_numeric_dtype(target):
            stats = {"param": self.num_imputer.fit(X, y),
                     "strategy": self.num_imputer.strategy}
        if is_string_dtype(target):
            stats = {"param": self.cat_imputer.fit(X, y),
                     "strategy": self.cat_imputer.strategy}

        # store final stats
        self.statistics_ = stats
        return self

    def impute(self, X):
        """Perform imputations using the statistics generated from fit.

        The impute method delegates the actual imputation to whatever the
        chosen imputer's `fit` returned (stored as `statistics_["param"]`).

        Args:
            X (pd.Series): Dataset to impute missing data from fit.

        Returns:
            Result of the delegate's `impute(X)`, or None when no imputer
            was fit because the column dtype was neither numeric nor string.
        """
        # check is fitted and delegate transformation to respective imputer
        check_is_fitted(self, "statistics_")
        imp = self.statistics_["param"]

        # param stays None when fit matched neither dtype branch; compare
        # against None explicitly instead of relying on object truthiness
        if imp is None:
            return None
        return imp.impute(X)

    def fit_impute(self, X, y):
        """Convenience method to perform fit and imputation in one go."""
        return self.fit(X, y).impute(X)

class DefaultUnivarImputer(DefaultBaseImputer):
    """Impute missing data using default methods for univariate imputation.

    This imputer is the default for univariate imputation. The imputer
    determines how to impute based on the column type of each column in a
    dataframe. The imputer can be used directly, but such behavior is
    discouraged. DefaultUnivarImputer does not have the flexibility /
    robustness of more complex imputers, nor is its behavior identical.
    Preferred use is MultipleImputer(strategy="default univariate").
    """
    # class variables
    strategy = methods.DEFAULT_UNIVAR

    def __init__(
            self,
            num_imputer=MeanImputer,
            cat_imputer=ModeImputer,
            num_kwgs=None,
            cat_kwgs={"fill_strategy": "random"}
    ):
        """Create an instance of the DefaultUnivarImputer class.

        The dataframe imputers delegate work to the DefaultUnivarImputer if
        strategy="default univariate" The DefaultUnivarImputer then determines
        how to impute numerical and categorical columns by default. It does so
        by passing its arguments to the DefaultBaseImputer, which handles
        validation and instantiation of numerical and categorical imputers.

        Args:
            num_imputer (Imputer, Optional): valid Imputer for numerical data.
                Default is MeanImputer.
            cat_imputer (Imputer, Optional): valid Imputer for categorical
                data. Default is ModeImputer.
            num_kwgs (dict, optional): Keyword args for numerical imputer.
                Default is None.
            cat_kwgs (dict, optional): keyword args for categorical imputer.
                Default is {"fill_strategy": "random"}.

        Returns:
            self. Instance of class.
        """
        # The dict default above is created once and shared by every call.
        # Copy dict arguments so no instance holds a reference to that shared
        # object (or to the caller's dict). Non-dict values pass through
        # untouched so DefaultBaseImputer still validates them.
        if isinstance(num_kwgs, dict):
            num_kwgs = num_kwgs.copy()
        if isinstance(cat_kwgs, dict):
            cat_kwgs = cat_kwgs.copy()

        # delegate to DefaultBaseImputer
        DefaultBaseImputer.__init__(
            self,
            num_imputer=num_imputer,
            cat_imputer=cat_imputer,
            num_kwgs=num_kwgs,
            cat_kwgs=cat_kwgs
        )

    def fit(self, X, y=None):
        """Defer fit to the DefaultBaseImputer."""
        super().fit(X, y)
        return self

    def impute(self, X):
        """Defer transform to the DefaultBaseImputer."""
        X_ = super().impute(X)
        return X_
class DefaultTimeSeriesImputer(DefaultBaseImputer):
    """Impute missing data using default methods for time series.

    This imputer is the default imputer for time series imputation. The
    imputer determines how to impute based on the column type of each column
    in a dataframe. The imputer can be used directly, but such behavior is
    discouraged. DefaultTimeSeriesImputer does not have the flexibility /
    robustness of more complex imputers, nor is its behavior identical.
    Preferred use is MultipleImputer(strategy="default time").
    """
    # class variables
    strategy = methods.DEFAULT_TIME

    def __init__(
            self,
            num_imputer=InterpolateImputer,
            cat_imputer=ModeImputer,
            num_kwgs={"fill_strategy": "linear"},
            cat_kwgs={"fill_strategy": "random"}
    ):
        """Create an instance of the DefaultTimeSeriesImputer class.

        The dataframe imputers delegate work to the DefaultTimeSeriesImputer
        if strategy="default time". The DefaultTimeSeriesImputer then
        determines how to impute numerical and categorical columns by default.
        It does so by passing its arguments to the DefaultBaseImputer, which
        handles validation and instantiation of default numerical and
        categorical imputers.

        Args:
            num_imputer (Imputer, Optional): valid Imputer for numerical data.
                Default is InterpolateImputer.
            cat_imputer (Imputer, Optional): valid Imputer for categorical
                data. Default is ModeImputer.
            num_kwgs (dict, optional): Keyword args for numerical imputer.
                Default is {"fill_strategy": "linear"}.
            cat_kwgs (dict, optional): keyword args for categorical imputer.
                Default is {"fill_strategy": "random"}.

        Returns:
            self. Instance of class.
        """
        # The dict defaults above are created once and shared by every call.
        # Copy dict arguments so no instance holds a reference to those shared
        # objects (or to the caller's dicts). Non-dict values pass through
        # untouched so DefaultBaseImputer still validates them.
        if isinstance(num_kwgs, dict):
            num_kwgs = num_kwgs.copy()
        if isinstance(cat_kwgs, dict):
            cat_kwgs = cat_kwgs.copy()

        # delegate to DefaultBaseImputer
        DefaultBaseImputer.__init__(
            self,
            num_imputer=num_imputer,
            cat_imputer=cat_imputer,
            num_kwgs=num_kwgs,
            cat_kwgs=cat_kwgs
        )

    def fit(self, X, y=None):
        """Defer fit to the DefaultBaseImputer."""
        super().fit(X, y)
        return self

    def impute(self, X):
        """Defer transform to the DefaultBaseImputer."""
        X_ = super().impute(X)
        return X_
class DefaultPredictiveImputer(DefaultBaseImputer):
    """Default imputer for predictive imputation.

    This is the imputer the MultipleImputer class falls back to when an
    end-user does not supply a strategy. It picks a numerical or categorical
    predictive imputer based on the column type of each column in a
    dataframe. Direct use is possible but discouraged, because this class
    lacks the flexibility / robustness of more complex imputers and does not
    behave identically to them. Preferred use is
    MultipleImputer(strategy="default predictive").
    """
    # class variables
    strategy = methods.DEFAULT_PRED

    def __init__(
            self,
            num_imputer=PMMImputer,
            cat_imputer=MultinomialLogisticImputer,
            num_kwgs=None,
            cat_kwgs=None
    ):
        """Create an instance of the DefaultPredictiveImputer class.

        The dataframe imputers hand work to this class when
        strategy="default predictive" or when no strategy is given. All
        arguments are forwarded to the DefaultBaseImputer, which validates
        them and instantiates the numerical and categorical imputers.

        Args:
            num_imputer (Imputer, Optional): valid Imputer for numerical data.
                Default is PMMImputer.
            cat_imputer (Imputer, Optional): valid Imputer for categorical
                data. Default is MultinomialLogisticImputer.
            num_kwgs (dict, optional): Keyword args for numerical imputer.
                Default is None.
            cat_kwgs (dict, optional): keyword args for categorical imputer.
                Default is None.

        Returns:
            self. Instance of class.
        """
        # collect the arguments once, then hand them to the parent class
        base_args = {
            "num_imputer": num_imputer,
            "cat_imputer": cat_imputer,
            "num_kwgs": num_kwgs,
            "cat_kwgs": cat_kwgs,
        }
        super().__init__(**base_args)

    def fit(self, X, y):
        """Fit via the DefaultBaseImputer and return this instance."""
        super().fit(X, y)
        return self

    def impute(self, X):
        """Impute via the DefaultBaseImputer and return its result."""
        return super().impute(X)