# Source code for autoimpute.imputations.series.mode

"""This module implements mode imputation via the ModeImputer.

The ModeImputer uses the mode of observed data to impute missing values.
Dataframe imputers utilize this class when its strategy is requested. Use
SingleImputer or MultipleImputer with strategy = `mode` to broadcast the
strategy across all the columns in a dataframe, or specify this strategy
for a given column.
"""

import numpy as np
import pandas as pd
from sklearn.utils.validation import check_is_fitted
from autoimpute.imputations import method_names
from .base import ISeriesImputer
# Short alias for the imported `method_names`; ModeImputer reads `methods.MODE`.
methods = method_names
# ModeImputer.fit() sets `statistics_` outside __init__, hence the suppression.
# pylint:disable=attribute-defined-outside-init

class ModeImputer(ISeriesImputer):
    """Fill missing entries of a series with its most frequent value.

    During fit the imputer records every mode of the observed data. A series
    can have more than one mode; `fill_strategy` decides which of them (or
    which random mix of them) is used as the fill value.

    Direct use of this class is possible but discouraged: ModeImputer lacks
    the flexibility / robustness of the dataframe imputers and does not
    behave identically to them. Preferred use is
    MultipleImputer(strategy="mode").
    """

    # class variables
    # strategy identifier, stored next to the modes in `statistics_` by fit
    strategy = methods.MODE
    # values accepted by the `fill_strategy` property setter
    fill_strategies = (None, "first", "last", "random")

    def __init__(self, fill_strategy=None):
        """Build a ModeImputer.

        Args:
            fill_strategy (str, Optional): how to choose among several modes.
                One of None, "first", "last", "random". Default None.
                None or "first" -> use the first mode.
                "last" -> use the last mode.
                "random" -> sample from the modes with replacement.
        """
        # assignment goes through the validating property setter below
        self.fill_strategy = fill_strategy

    @property
    def fill_strategy(self):
        """Return the currently configured fill strategy."""
        return self._fill_strategy

    @fill_strategy.setter
    def fill_strategy(self, fs):
        """Store the fill strategy after checking that it is supported.

        Args:
            fs (str, None): requested strategy; None means first mode.

        Raises:
            ValueError: `fs` is not one of `fill_strategies`.
        """
        if fs in self.fill_strategies:
            self._fill_strategy = fs
            return
        raise ValueError(f"{fs} not a valid fill strategy for ModeImputer")

    def fit(self, X, y=None):
        """Compute the mode(s) of `X` and remember them for imputation.

        Args:
            X (pd.Series): data from which the modes are calculated.
            y (None): ignored; present to meet requirements of base class.

        Returns:
            self. Instance of the class.
        """
        # X.mode() may hold several values; all of them are kept as an array
        self.statistics_ = {
            "param": X.mode().values,
            "strategy": self.strategy,
        }
        return self

    def impute(self, X):
        """Produce the fill value(s) for the missing entries of `X`.

        Uses the modes stored by `fit`. When several modes exist, the
        configured `fill_strategy` determines the outcome: the first mode,
        the last mode, or one random draw per missing entry.

        Args:
            X (pd.Series): data whose missing entries need fill values.

        Returns:
            A single mode value, or an array with one sampled mode per
            missing entry when sampling randomly among multiple modes.
        """
        # refuse to run before fit has populated `statistics_`
        check_is_fitted(self, "statistics_")

        # index labels of the entries that are missing in X
        missing_mask = X.isnull()
        missing_index = X[missing_mask].index

        # every mode found during fit
        modes = self.statistics_["param"]
        chosen = self.fill_strategy

        # None and "first" both take the first mode, such as scipy does
        if chosen is None or chosen == "first":
            return modes[0]

        if chosen == "last":
            return modes[-1]

        if chosen == "random":
            # nothing to sample from when only one mode exists
            if len(modes) == 1:
                return modes[0]
            # one draw (with replacement) for each missing entry
            draws = np.random.choice(modes, len(missing_index))
            return pd.Series(draws, index=missing_index).values

        # any other stored strategy leaves the modes array untouched
        return modes

    def fit_impute(self, X, y=None):
        """Fit on `X`, then return the imputations for that same `X`."""
        fitted = self.fit(X, y)
        return fitted.impute(X)