Feature added and documentation change

Jacobluke- · Jacobluke- · commit 2f46706912a5 · 2023-03-13T12:58:01.000+08:00
Add error bar for paired proportional plot.

Fix a typo in the documentation
diff --git a/dabest/plot_tools.py b/dabest/plot_tools.py
@@ -388,6 +388,174 @@ def proportion_error_bar(data, x, y, type='mean_sd', offset=0.2, ax=None,
         #                           [central_measure, central_measure], **kwargs)
         # ax.add_line(mean_line)
 
+def sankey_error_bar(data, x, y, type='mean_sd', offset=0.2, ax=None,
+                 line_color="black", gap_width_percent=1, pos=(0, 1),
+                 **kwargs):
+    '''
+    Function to plot the spread of proportions as gapped vertical
+    errorbars. The central measure is a gap defined by negative space.
+
+    This is a specific design for Sankey diagrams: the x-position of
+    every errorbar is read from `pos`, as each Sankey bar requires two
+    errorbars, one for the left and one for the right.
+
+    This style is inspired by Edward Tufte's redesign of the boxplot.
+    See The Visual Display of Quantitative Information (1983), pp.128-130.
+
+    Keywords
+    --------
+    data: pandas DataFrame.
+        This DataFrame should be in 'long' format.
+
+    x, y: string.
+        x and y columns to be plotted.
+
+    type: ['mean_sd', 'median_quartiles'], default 'mean_sd'
+        Plots the summary statistics for each group. If 'mean_sd', then the
+        gap sits at the mean of each group and the line spans
+        mean +/- sqrt(p * (1 - p) / n), with p = sum(y) / n.
+        If 'median_quartiles', then the median and 25th and 75th percentiles
+        of each group are plotted instead.
+
+    offset: float (default 0.2) or iterable.
+        Give a single float (that will be added to the x-position of all
+        gapped lines), or an iterable containing one x-offset per group.
+
+    ax: matplotlib Axes object, default None
+        If a matplotlib Axes object is specified, the gapped lines will be
+        plotted in order on this axes. If None, the current axes (plt.gca())
+        is used.
+
+    line_color: string (matplotlib color, default "black") or iterable of
+        matplotlib colors.
+
+        The color of the vertical lines, one per group if an iterable.
+
+    gap_width_percent: float, default 1
+        The gap left on each side of the central measure, expressed as a
+        percentage of the y-span of the axes. Must be between 0 and 100.
+
+    pos: sequence of floats, default (0, 1)
+        The x-position of the gapped line of each group, in the order in
+        which the groups appear; the matching `offset` is added to it.
+        At least one position per group is required.
+
+    kwargs: dict, default None
+        Dictionary with kwargs passed to matplotlib.lines.Line2D. Unless
+        given, `clip_on` is False, `zorder` is 5 and the line width is 2.
+
+    Raises
+    ------
+    ValueError
+        If `gap_width_percent` is outside [0, 100], if `type` is not one of
+        the two supported values, or if `line_color`, `offset` or `pos` do
+        not match the number of groups.
+    '''
+    import numpy as np
+    import pandas as pd
+    import matplotlib.pyplot as plt
+    import matplotlib.lines as mlines
+
+    if gap_width_percent < 0 or gap_width_percent > 100:
+        raise ValueError("`gap_width_percent` must be between 0 and 100.")
+
+    if type not in ('mean_sd', 'median_quartiles'):
+        raise ValueError("`type` must be 'mean_sd' or 'median_quartiles', "
+                         "got {!r}.".format(type))
+
+    if ax is None:
+        ax = plt.gca()
+    ax_ylims = ax.get_ylim()
+    ax_yspan = np.abs(ax_ylims[1] - ax_ylims[0])
+    gap_width = ax_yspan * gap_width_percent / 100
+
+    kwargs.setdefault('clip_on', False)
+    kwargs.setdefault('zorder', 5)
+    # `lw` is an alias of `linewidth`: only default it when neither is given.
+    if 'lw' not in kwargs and 'linewidth' not in kwargs:
+        kwargs['lw'] = 2.
+
+    # Grab the order in which the groups appear,
+    # depending on whether the x-column is categorical.
+    if isinstance(data[x].dtype, pd.CategoricalDtype):
+        group_order = pd.unique(data[x]).categories
+    else:
+        group_order = pd.unique(data[x])
+
+    def proportion_sd(values):
+        # sqrt(p * (1 - p) / n) with p = sum(values) / n.
+        n = len(values)
+        total = np.sum(values)
+        return np.sqrt((total * (n - total)) / (n * n * n))
+
+    grouped = data.groupby(x)[y]
+    means = grouped.mean().reindex(index=group_order)
+    # Reindex like every other statistic, so that the positional pairing
+    # in the plotting loop matches each group with its own spread.
+    sd = grouped.apply(proportion_sd).reindex(index=group_order)
+    lower_sd = means - sd
+    upper_sd = means + sd
+
+    # Clipping is decided from the mean +/- sd bounds for either `type`.
+    if (lower_sd < ax_ylims[0]).any() or (upper_sd > ax_ylims[1]).any():
+        kwargs['clip_on'] = True
+
+    if type == 'mean_sd':
+        central_measures, lows, highs = means, lower_sd, upper_sd
+    else:
+        quantiles = grouped.quantile([0.25, 0.75]) \
+            .unstack() \
+            .reindex(index=group_order)
+        central_measures = grouped.median().reindex(index=group_order)
+        lows = quantiles[0.25]
+        highs = quantiles[0.75]
+
+    n_groups = len(central_measures)
+
+    if isinstance(line_color, str):
+        custom_palette = [line_color] * n_groups
+    else:
+        if len(line_color) != n_groups:
+            err1 = "{} groups are being plotted, but ".format(n_groups)
+            err2 = "{} colors(s) were supplied in `line_color`.".format(len(line_color))
+            raise ValueError(err1 + err2)
+        custom_palette = line_color
+
+    try:
+        len_offset = len(offset)
+    except TypeError:
+        offset = np.repeat(offset, n_groups)
+        len_offset = len(offset)
+
+    if len_offset != n_groups:
+        err1 = "{} groups are being plotted, but ".format(n_groups)
+        err2 = "{} offset(s) were supplied in `offset`.".format(len_offset)
+        raise ValueError(err1 + err2)
+
+    if len(pos) < n_groups:
+        err1 = "{} groups are being plotted, but ".format(n_groups)
+        err2 = "{} position(s) were supplied in `pos`.".format(len(pos))
+        raise ValueError(err1 + err2)
+
+    # Pair groups by position, never by (possibly non-integer) index label.
+    summary = zip(central_measures, lows, highs)
+    for idx, (central_measure, low, high) in enumerate(summary):
+        kwargs['color'] = custom_palette[idx]
+        _xpos = pos[idx] + offset[idx]
+
+        # add lower vertical span line.
+        low_to_mean = mlines.Line2D([_xpos, _xpos],
+                                    [low, central_measure - gap_width],
+                                    **kwargs)
+        ax.add_line(low_to_mean)
+
+        # add upper vertical span line.
+        mean_to_high = mlines.Line2D([_xpos, _xpos],
+                                     [central_measure + gap_width, high],
+                                     **kwargs)
+        ax.add_line(mean_to_high)
+
+
 def check_data_matches_labels(labels, data, side):
     '''
     Function to check that the labels and data match in the sankey diagram. 
@@ -418,7 +586,7 @@ def check_data_matches_labels(labels, data, side):
 
 def single_sankey(left, right, xpos=0, leftWeight=None, rightWeight=None, 
             colorDict=None, leftLabels=None, rightLabels=None, ax=None, 
-            width=0.5, alpha=0.65, bar_width=0.1, rightColor=False, align='center'):
+            width=0.5, alpha=0.65, bar_width=0.2, rightColor=False, align='center'):
 
     '''
     Make a single Sankey diagram showing proportion flow from left to right
@@ -535,6 +703,10 @@ def single_sankey(left, right, xpos=0, leftWeight=None, rightWeight=None,
     else: 
         leftpos = xpos
 
+    # Combine left and right arrays to have a pandas.DataFrame in the 'long' format
+    left_series = pd.Series(left, name='values').to_frame().assign(groups='left')
+    right_series = pd.Series(right, name='values').to_frame().assign(groups='right')
+    concatenated_df = pd.concat([left_series, right_series], ignore_index=True)
 
     # Determine positions of left label patches and total widths
     # We also want the height of the graph to be 1
@@ -623,6 +795,10 @@ def normalize_dict(nested_dict, target):
             color=colorDict[rightLabel],
             alpha=0.99
         )
+
+    # Plot error bars
+    sankey_error_bar(concatenated_df, x='groups', y='values', ax=ax, offset=0, gap_width_percent=2,
+                     pos=[(leftpos + (-(bar_width) * xMax) + leftpos)/2, (xMax + leftpos + leftpos + ((1 + bar_width) * xMax))/2],)
     
     # Plot strips
     for leftLabel in leftLabels:
@@ -654,7 +830,7 @@ def sankeydiag(data, xvar, yvar, left_idx, right_idx,
                 leftLabels=None, rightLabels=None,  
                 palette=None, ax=None, 
                 one_sankey=False,
-                width=0.5, rightColor=False,
+                width=0.4, rightColor=False,
                 align='center', alpha=0.65, **kwargs):
     '''
     Read in melted pd.DataFrame, and draw multiple sankey diagram on a single axes
@@ -666,6 +842,8 @@ def sankeydiag(data, xvar, yvar, left_idx, right_idx,
     --------
     data: pd.DataFrame
         input data, melted dataframe created by dabest.load()
+    xvar, yvar: string.
+        x and y columns to be plotted.
     left_idx: str
         the value in column xvar that is on the left side of each sankey diagram
     right_idx: str
diff --git a/dabest/plotter.py b/dabest/plotter.py
@@ -127,9 +127,9 @@ def EffectSizeDataFramePlotter(EffectSizeDataFrame, **plot_kwargs):
                                          plot_kwargs["barplot_kwargs"])
 
     # Sankey Diagram kwargs
-    default_sankey_kwargs = {"width": 0.5, "align": "center",
+    default_sankey_kwargs = {"width": 0.4, "align": "center",
                             "alpha": 0.4, "rightColor": False,
-                            "bar_width":0.1}
+                            "bar_width":0.2}
     if plot_kwargs["sankey_kwargs"] is None:
         sankey_kwargs = default_sankey_kwargs
     else:
diff --git a/docs/source/proportion-plot.rst b/docs/source/proportion-plot.rst
@@ -363,7 +363,7 @@ Repeated measures is also supported in paired proportional plot, by changing the
 
   multi_group_sequential = dabest.load(df, idx=((("Control 1", "Test 1","Test 2", "Test 3"),
                                 ("Test 4", "Test 5", "Test 6"))),
-                    proportional=True, paired="baseline", id_col="ID")
+                    proportional=True, paired="sequential", id_col="ID")
 
   multi_group_sequential.mean_diff.plot();