Source code for autoimpute.imputations.series.interpolation

"""This module implements interpolation methods via the InterpolateImputer.

InterpolateImputer imputes missing data using some interpolation strategies
supported by pd.Series.interpolate. Linear is the default strategy, although a
number of additional strategies exist. Dataframe imputers utilize this class
when its strategy is requested. Use SingleImputer or MultipleImputer with
strategy = `interpolate` to broadcast the strategy across all the columns in a
dataframe, or specify this strategy for a given column.
"""

import pandas as pd
from sklearn.utils.validation import check_is_fitted
from autoimpute.imputations import method_names
from .base import ISeriesImputer
methods = method_names
# pylint:disable=attribute-defined-outside-init
# pylint:disable=unnecessary-pass
# pylint:disable=unused-argument

class InterpolateImputer(ISeriesImputer):
    """Impute missing values using interpolation techniques.

    The InterpolateImputer imputes missing values using a valid pd.Series
    interpolation strategy. See __init__ method docs for supported strategies.
    The imputer can be used directly, but such behavior is discouraged.
    InterpolateImputer does not have the flexibility / robustness of dataframe
    imputers, nor is its behavior identical.
    Preferred use is MultipleImputer(strategy="interpolate").
    """
    # class variables
    strategy = methods.INTERPOLATE
    fill_strategies = (
        "linear", "time", "quadratic", "cubic",
        "spline", "barycentric", "polynomial"
    )

    def __init__(self, fill_strategy="linear", start=None, end=None,
                 order=None):
        """Create an instance of the InterpolateImputer class.

        Args:
            fill_strategy (str, Optional): type of interpolation to perform
                Default is linear. Other strategies supported include:
                `time`, `quadratic`, `cubic`, `spline`, `barycentric`,
                `polynomial`.
            start (int or str, Optional): value to impute if first number in
                Series is missing. The string `mean` uses the mean of the
                Series. Default is None, but first valid used when required
                for quadratic, cubic, polynomial.
            end (int or str, Optional): value to impute if last number in
                Series is missing. The string `mean` uses the mean of the
                Series. Default is None, but last valid used when required
                for quadratic, cubic, polynomial.
            order (int, Optional): if strategy is spline or polynomial,
                order must be number. Otherwise not considered.

        Returns:
            self. Instance of the class.
        """
        self.fill_strategy = fill_strategy
        self.start = start
        self.end = end
        self.order = order

    @property
    def fill_strategy(self):
        """Property getter to return the value of fill_strategy property."""
        return self._fill_strategy

    @fill_strategy.setter
    def fill_strategy(self, fs):
        """Validate the fill_strategy property before storing it.

        Args:
            fs (str): name of the interpolation strategy. Must be one of
                the entries in `fill_strategies`.

        Raises:
            ValueError: not a valid fill strategy for InterpolateImputer
        """
        if fs not in self.fill_strategies:
            err = f"{fs} not a valid fill strategy for InterpolateImputer"
            raise ValueError(err)
        self._fill_strategy = fs

    @staticmethod
    def _resolve_endpoint(v, X, find_valid_index):
        """Shared logic behind _handle_start and _handle_end.

        Args:
            v: user-supplied endpoint. None means "use an observed value",
                the string `mean` means "use the mean of X".
            X (pd.Series): series the endpoint belongs to.
            find_valid_index (callable): returns the index label of the
                observed value to fall back on, or None if there is none.

        Returns:
            The value to place at the endpoint, or None when no value was
            supplied and X has no observed value to fall back on.
        """
        if v is None:
            valid_ix = find_valid_index()
            # X.loc[None] would raise an opaque KeyError on an all-missing
            # series, so report "nothing available" instead.
            if valid_ix is None:
                return None
            v = X.loc[valid_ix]
        # only a string can be the `mean` keyword; the isinstance guard
        # avoids comparing arbitrary scalars against a string
        if isinstance(v, str) and v == "mean":
            v = X.mean()
        return v

    def _handle_start(self, v, X):
        """Private method to handle start values."""
        return self._resolve_endpoint(v, X, X.first_valid_index)

    def _handle_end(self, v, X):
        """Private method to handle end values."""
        return self._resolve_endpoint(v, X, X.last_valid_index)

    def fit(self, X, y=None):
        """Fit the Imputer to the dataset. Nothing to calculate.

        Args:
            X (pd.Series): Dataset to fit the imputer.
            y (None): ignored, None to meet requirements of base class

        Returns:
            self. Instance of the class.
        """
        self.statistics_ = {"param": self.fill_strategy,
                            "strategy": self.strategy}
        return self

    def impute(self, X):
        """Perform imputations using the statistics generated from fit.

        The impute method handles the actual imputation. Missing values
        in a given dataset are replaced with results from interpolation.
        The Series passed in is not modified; all work happens on a copy.

        Args:
            X (pd.Series): Dataset to impute missing data from fit.

        Returns:
            pd.Series -- imputed dataset.

        Raises:
            ValueError: strategy is polynomial or spline and order is
                missing or outside the accepted range.
        """
        # check if fitted then impute with interpolation strategy
        check_is_fitted(self, "statistics_")
        imp = self.statistics_["param"]

        # never write endpoint values back into the caller's data
        data = X.copy()

        # setting defaults if no value passed for start and last
        # quadratic, cubic, and polynomial require first and last
        if imp in ("quadratic", "cubic", "polynomial"):
            # start is resolved and filled before end, so the end value
            # is computed from a series that already has its start
            endpoints = (
                (0, self._handle_start, self.start),
                (-1, self._handle_end, self.end),
            )
            for pos, resolve, requested in endpoints:
                if not pd.isnull(data.iloc[pos]):
                    continue
                value = resolve(requested, data)
                # None means there is nothing to fill with; leave missing
                if value is not None:
                    data = data.fillna({data.index[pos]: value})

        # handling for methods that need order
        if imp in ("polynomial", "spline"):
            num_observed = min(6, data.count())
            # enforce both bounds promised by the error message
            if self.order is None or not 1 <= self.order < num_observed:
                err = f"Order must be between 1 and {num_observed-1}"
                raise ValueError(err)

        # finally, perform interpolation
        return data.interpolate(method=imp,
                                limit=None,
                                limit_direction="both",
                                inplace=False,
                                order=self.order)

    def fit_impute(self, X, y=None):
        """Convenience method to perform fit and imputation in one go."""
        return self.fit(X, y).impute(X)