Source code for autoimpute.visuals.imputations

"""Visualizations to explore imputations of an incomplete dataset."""

import matplotlib.pylab as plt
import seaborn as sns
from autoimpute.utils import check_data_structure
from autoimpute.imputations import SingleImputer
from .helpers import _validate_data, _validate_kwgs, _get_observed, _melt_df
from .helpers import _default_plot_args, _plot_imp_dists_helper

#pylint:disable=unused-variable
#pylint:disable=too-many-arguments
#pylint:disable=too-many-locals

@check_data_structure
def plot_imp_scatter(d, x, y, strategy, color=None,
                     title="Jointplot after Imputation", h=8.27,
                     imp_kwgs=None, a=0.5, marginals=None,
                     obs_color="navy", imp_color="red", **plot_kwgs):
    """Draw a jointplot of two columns after a single imputation.

    A SingleImputer is built with `strategy` (and `imp_kwgs`, if given) and
    applied to `d` via `fit_transform`. The imputed frame is then shown as a
    seaborn jointplot of `x` against `y`. Every point is drawn in
    `obs_color`, except the rows listed in the imputer's `imputed_` entry
    for the `color` column, which are drawn in `imp_color`.

    Args:
        d (pd.DataFrame): DataFrame with data to impute and plot.
        x (str): column to plot on the x axis.
        y (str): column to plot on the y axis.
        strategy (str): imputation strategy passed to SingleImputer.
        color (str, Optional): column whose imputed rows are highlighted.
            Default is None, which highlights the imputations of `y`.
            Otherwise it must equal `x` or `y`.
        title (str, Optional): title of the plot.
            Default is "Jointplot after Imputation".
        h (float, Optional): height of the jointplot. Default is 8.27.
        imp_kwgs (dict, Optional): passed to SingleImputer as `imp_kwgs`.
            Default is None, in which case the argument is omitted.
        a (float, Optional): alpha passed to the jointplot. Default is 0.5.
        marginals (dict, Optional): arguments for the marginal plots.
            Default is None, which becomes `dict(rug=True, kde=True)`.
        obs_color (str, Optional): color of observed points.
            Default is navy.
        imp_color (str, Optional): color of imputed points. Default is red.
        **plot_kwgs: forwarded to `_default_plot_args` (described upstream
            as keyword arguments used by sns.set).

    Returns:
        None: the plot is produced as a side effect.

    Raises:
        ValueError: if `x` or `y` is not a column of `d`, or if `color` is
            neither None, `x`, nor `y`.
    """
    # global plot defaults first, then validation of the optional dicts
    _default_plot_args(**plot_kwgs)
    _validate_kwgs(marginals)
    _validate_kwgs(imp_kwgs)
    if marginals is None:
        marginals = dict(rug=True, kde=True)

    # both axes must refer to existing columns
    if not (x in d.columns and y in d.columns):
        raise ValueError("x and y must be names of columns in data")

    # only hand imp_kwgs to the imputer when the caller supplied them
    imputer_args = {"strategy": strategy}
    if imp_kwgs is not None:
        imputer_args["imp_kwgs"] = imp_kwgs
    imputer = SingleImputer(**imputer_args)

    # resolve which column's imputations get highlighted
    if color is None or color == y:
        color = y
    elif color == x:
        color = x
    else:
        raise ValueError("color must be the same as `y` or `x`")

    # impute, then tag each row with the color it should be drawn in
    imputed = imputer.fit_transform(d)
    imputed["colors"] = obs_color
    imputed.loc[imputer.imputed_[color], "colors"] = imp_color
    point_colors = imputed["colors"]

    # joint scatter with marginal distributions
    grid = sns.jointplot(
        x=x, y=y, data=imputed, alpha=a, height=h,
        joint_kws={"facecolor": point_colors, "edgecolor": point_colors},
        marginal_kws=marginals
    )

    # leave room at the top of the figure for the title
    plt.subplots_adjust(top=0.925)
    grid.fig.suptitle(title)
def plot_imp_dists(d, mi, imp_col, title="Distributions after Imputation",
                   include_observed=True, separate_observed=True,
                   side_by_side=False, hist_observed=False,
                   hist_imputed=False, gw=(.5, .5), gh=(.5, .5),
                   **plot_kwgs):
    """Plot the distribution of one column across multiple imputations.

    The imputed distributions are drawn by `_plot_imp_dists_helper`.
    Optionally the observed values of the column are drawn as well, either
    on the same axes as the imputations or on a separate subplot placed
    next to or above the imputations.

    Args:
        d (list): dataset returned from multiple imputation. The observed
            values are read from `d[0][1]`.
        mi (MultipleImputer): multiple imputer used to generate d.
        imp_col (str): column to plot. Should be a column with imputations.
        title (str, Optional): title of plot.
            Default is "Distributions after Imputation".
        include_observed (bool, Optional): whether to plot the observed
            data for imp_col. Default is True.
        separate_observed (bool, Optional): whether the observed data gets
            its own subplot. Default is True. If False, observed and
            imputed distributions share one plot. Only used when
            `include_observed=True`.
        side_by_side (bool, Optional): whether the two subplots are placed
            next to each other (True) or stacked vertically (False).
            Default is False. Only used when both `include_observed` and
            `separate_observed` are True.
        hist_observed (bool, Optional): whether a histogram is drawn along
            with the density of the observed values. Default is False.
            Only used when `include_observed=True`.
        hist_imputed (bool, Optional): histogram flag handed to
            `_plot_imp_dists_helper` for the imputed values. Default is
            False. It is passed on in every configuration.
        gw (tuple, Optional): width ratios of the two subplots when they
            are side by side. Default is (.5, .5).
        gh (tuple, Optional): height ratios of the two subplots when they
            are stacked. Default is (.5, .5).
        **plot_kwgs: forwarded to `_default_plot_args` (described upstream
            as keyword arguments used by sns.set).

    Returns:
        None: the plot is drawn on the current matplotlib figure.

    Raises:
        ValueError: see _validate_data method.
    """
    # global plot defaults
    _default_plot_args(**plot_kwgs)

    if not include_observed:
        # imputations only
        _validate_data(d, mi, imp_col)
        _plot_imp_dists_helper(d, hist_imputed, imp_col)
    else:
        # the helper's result is used as a row indexer for observed values
        observed_rows = _get_observed(d, mi, imp_col)
        observed = d[0][1].loc[observed_rows, imp_col]

        if not separate_observed:
            # observed and imputed share the current axes
            sns.distplot(observed, hist=hist_observed, label="Observed")
            _plot_imp_dists_helper(d, hist_imputed, imp_col)
        else:
            # two panels: observed first, imputations second
            if side_by_side:
                layout, grid_spec = (1, 2), {"width_ratios": gw}
            else:
                layout, grid_spec = (2, 1), {"height_ratios": gh}
            _, axes = plt.subplots(*layout, gridspec_kw=grid_spec)
            sns.distplot(
                observed, hist=hist_observed, ax=axes[0], label="Observed"
            )
            _plot_imp_dists_helper(d, hist_imputed, imp_col, axes[1])

    # plot title and legend
    plt.suptitle(title)
    plt.legend()
def plot_imp_boxplots(d, mi, imp_col, side_by_side=False,
                      title="Observed vs. Imputed Boxplots",
                      obs_kwgs=None, imp_kwgs=None, **plot_kwgs):
    """Plot boxplots of observed values next to each imputation.

    Two subplots are created. The first holds a single boxplot of the
    observed values of `imp_col`, read from `d[0][1]`. The second holds one
    boxplot per value of the "imp_num" column of the frame returned by
    `_melt_df`. The first subplot is given a share of 1 / (len(d) + 1) of
    the figure, and the second gets the remainder.

    Args:
        d (list): dataset returned from multiple imputation.
        mi (MultipleImputer): multiple imputer used to generate d.
        imp_col (str): column to plot. Should be a column with imputations.
        side_by_side (bool, Optional): whether the subplots are placed next
            to each other (True) or stacked vertically (False).
            Default is False.
        title (str, Optional): title of boxplots.
            Default is "Observed vs. Imputed Boxplots".
        obs_kwgs (dict, Optional): extra arguments unpacked into the
            observed boxplot. Default is None, so no additional tailoring.
        imp_kwgs (dict, Optional): extra arguments unpacked into the
            imputed boxplots. Default is None, so no additional tailoring.
        **plot_kwgs: forwarded to `_default_plot_args` (described upstream
            as keyword arguments used by sns.set).

    Returns:
        None: the plot is drawn on a newly created matplotlib figure.

    Raises:
        ValueError: presumably raised by the validation helpers called
            here; see the _validate_data method. TODO confirm.
    """
    # global plot defaults
    _default_plot_args(**plot_kwgs)

    # observed values as a frame, with a constant label column to plot on
    observed_rows = _get_observed(d, mi, imp_col)
    observed = d[0][1].loc[observed_rows, imp_col].copy().to_frame()
    observed["obs"] = "obs"

    # the observed panel takes one share, the imputations take the rest
    share = 1/(len(d)+1)
    ratios = (share, 1-share)
    melted = _melt_df(d, mi, imp_col)

    # validate obs_kwgs, imp_kwgs
    _validate_kwgs(obs_kwgs)
    _validate_kwgs(imp_kwgs)

    # axis roles swap between the side-by-side and stacked layouts
    if side_by_side:
        obs_x, obs_y = "obs", imp_col
        imp_x, imp_y = "imp_num", imp_col
        _, axes = plt.subplots(1, 2, gridspec_kw={"width_ratios": ratios})
    else:
        obs_x, obs_y = imp_col, "obs"
        imp_x, imp_y = imp_col, "imp_num"
        _, axes = plt.subplots(2, 1, gridspec_kw={"height_ratios": ratios})

    # a missing kwgs dict means nothing extra is unpacked into the call
    obs_extra = {} if obs_kwgs is None else obs_kwgs
    sns.boxplot(
        x=obs_x, y=obs_y, data=observed, ax=axes[0], **obs_extra
    ).set(xlabel="", ylabel="")

    imp_extra = {} if imp_kwgs is None else imp_kwgs
    sns.boxplot(
        x=imp_x, y=imp_y, data=melted, ax=axes[1], **imp_extra
    ).set(xlabel="", ylabel="")

    # plot title
    plt.suptitle(title)
def plot_imp_swarm(d, mi, imp_col, palette=None,
                   title="Imputation Swarm", **plot_kwgs):
    """Draw a swarm plot of one column across multiple imputations.

    The frame returned by `_melt_df` is plotted with "imp_num" on the x
    axis and `imp_col` on the y axis. Points are colored by the "imputed"
    column, with hue order ["yes", "no"].

    Args:
        d (list): dataset returned from multiple imputation.
        mi (MultipleImputer): multiple imputer used to generate d.
        imp_col (str): column to plot. Should be a column with imputations.
        palette (list, tuple, Optional): colors for the imps and observed.
            Default is None, in which case ["r", "c"] is used.
        title (str, Optional): title of plot.
            Default is "Imputation Swarm".
        **plot_kwgs: forwarded to `_default_plot_args` (described upstream
            as keyword arguments used by sns.set).

    Returns:
        None: the plot is drawn as a side effect.

    Raises:
        ValueError: see _validate_data method.
    """
    # plot defaults, validation, then the long-format data
    _default_plot_args(**plot_kwgs)
    _validate_data(d, mi, imp_col)
    melted = _melt_df(d, mi, imp_col)
    colors = ["r", "c"] if palette is None else palette

    # one swarm per imputation number
    axis = sns.swarmplot(
        x="imp_num", y=imp_col, hue="imputed", palette=colors,
        data=melted, hue_order=["yes", "no"]
    )
    axis.set(xlabel="Imputation Number", title=title)
def plot_imp_strip(d, mi, imp_col, palette=None,
                   title="Imputation Strip", **plot_kwgs):
    """Draw a strip plot of one column across multiple imputations.

    The frame returned by `_melt_df` is plotted with "imp_num" on the x
    axis and `imp_col` on the y axis. Points are colored by the "imputed"
    column, with hue order ["yes", "no"]. Jitter and dodge are both
    enabled.

    Args:
        d (list): dataset returned from multiple imputation.
        mi (MultipleImputer): multiple imputer used to generate d.
        imp_col (str): column to plot. Should be a column with imputations.
        palette (list, tuple, Optional): colors for the imps and observed.
            Default is None, in which case ["r", "c"] is used.
        title (str, Optional): title of plot.
            Default is "Imputation Strip".
        **plot_kwgs: forwarded to `_default_plot_args` (described upstream
            as keyword arguments used by sns.set).

    Returns:
        None: the plot is drawn as a side effect.

    Raises:
        ValueError: see _validate_data method.
    """
    # plot defaults, validation, then the long-format data
    _default_plot_args(**plot_kwgs)
    _validate_data(d, mi, imp_col)
    melted = _melt_df(d, mi, imp_col)
    colors = ["r", "c"] if palette is None else palette

    # one strip per imputation number, split by imputed status
    axis = sns.stripplot(
        x="imp_num", y=imp_col, hue="imputed", palette=colors,
        data=melted, jitter=True, hue_order=["yes", "no"], dodge=True
    )
    axis.set(xlabel="Imputation Number", title=title)