mgplot.summary_plot
summary_plot.py: Produce a summary plot for the data in a given DataFrame. The data is normalised to z-scores and scaled.
"""
summary_plot.py:
Produce a summary plot for the data in a given DataFrame.
The data is normalised to z-scores and scaled.
"""

# --- imports
# system imports
from typing import Any

# from collections.abc import Sequence

# analytic third-party imports
from numpy import ndarray, array
from matplotlib.pyplot import Axes, subplots
from pandas import DataFrame, Period

# local imports
from mgplot.settings import DataT
from mgplot.finalise_plot import make_legend
from mgplot.utilities import constrain_data, check_clean_timeseries
from mgplot.kw_type_checking import (
    report_kwargs,
    ExpectedTypeDict,
    validate_expected,
    validate_kwargs,
)
from mgplot.keyword_names import AX, VERBOSE, MIDDLE, PLOT_TYPE, PLOT_FROM


# --- constants
# The two supported values for the plot_type keyword argument.
ZSCORES = "zscores"
ZSCALED = "zscaled"

# Keyword arguments accepted by summary_plot() and their expected types.
SUMMARY_KW_TYPES: ExpectedTypeDict = {
    AX: (Axes, type(None)),
    VERBOSE: bool,
    MIDDLE: float,
    PLOT_TYPE: str,
    PLOT_FROM: (int, Period, type(None)),
}
validate_expected(SUMMARY_KW_TYPES, "summary_plot")


# --- functions
def _calc_quantiles(middle: float) -> ndarray:
    """Return the [lower, upper] quantiles that bound the central
    `middle` proportion of the data (eg. 0.8 -> [0.1, 0.9])."""

    tail = (1 - middle) / 2.0  # proportion left in each tail
    return array([tail, 1 - tail])


def _calculate_z(
    original: DataFrame,  # only contains the data points of interest
    middle: float,  # middle proportion of data to highlight (eg. 0.8)
    verbose: bool = False,  # print the summary data
) -> tuple[DataFrame, DataFrame]:
    """Calculate z-scores and scaled z-scores for each column.

    Args:
    -   original: the data points of interest.
    -   middle: the middle proportion of the data to highlight; only
        used for the "min shaded"/"max shaded" columns of the verbose
        report.
    -   verbose: if True, print a per-column summary table.

    Returns the tuple (z_scores, z_scaled), where z_scaled is the
    z-scores rescaled column-by-column to span -1 to +1."""

    # calculate z-scores and scaled scores
    z_scores: DataFrame = (original - original.mean()) / original.std()
    z_min, z_max = z_scores.min(), z_scores.max()
    z_scaled: DataFrame = (
        # scale z-scores between -1 and +1
        (((z_scores - z_min) / (z_max - z_min)) - 0.5)
        * 2
    )

    if verbose:
        q_middle = _calc_quantiles(middle)
        frame = DataFrame(
            {
                "count": original.count(),
                "mean": original.mean(),
                "median": original.median(),
                "min shaded": original.quantile(q=q_middle[0]),
                "max shaded": original.quantile(q=q_middle[1]),
                "z-scores": z_scores.iloc[-1],
                "scaled": z_scaled.iloc[-1],
            }
        )
        print(frame)

    return DataFrame(z_scores), DataFrame(z_scaled)  # syntactic sugar for type hinting


def _plot_middle_bars(
    adjusted: DataFrame,
    middle: float,
    kwargs: dict[str, Any],  # must be a dictionary, not a splat
) -> Axes:
    """Plot the middle (typically 80%) of the data as a horizontal bar,
    one bar per column of `adjusted`.

    Note: also stores the x-axis limits in kwargs["xlim"], which is
    why kwargs must be passed as a dictionary and not as a splat.

    Return the matplotlib Axes object (on a newly created figure)."""

    q = _calc_quantiles(middle)
    lo_hi: DataFrame = adjusted.quantile(q=q).T  # get the middle section of data

    # The x-axis must cover the latest data point, the shaded bars and
    # at least +/- span, with some extra space either side.
    span = 1.15
    space = 0.2
    latest = adjusted.iloc[-1]
    low = min(latest.min(), lo_hi.min().min(), -span) - space
    high = max(latest.max(), lo_hi.max().max(), span) + space
    kwargs["xlim"] = (low, high)  # remember the x-axis limits

    _fig, ax = subplots()
    ax.barh(
        y=lo_hi.index,
        width=lo_hi[q[1]] - lo_hi[q[0]],
        left=lo_hi[q[0]],
        color="#bbbbbb",
        label=f"Middle {middle*100:0.0f}% of prints",
    )
    return ax


def _plot_latest_datapoint(
    ax: Axes,
    original: DataFrame,
    adjusted: DataFrame,
    f_size: int,
) -> None:
    """Add the latest datapoints to the summary plot.

    The marker is placed at the latest adjusted value for each column,
    and annotated with the corresponding original value, using a font
    size of f_size."""

    ax.scatter(adjusted.iloc[-1], adjusted.columns, color="darkorange", label="Latest")
    # Note: f_size was previously overwritten with a hard-coded 10 here,
    # which silently ignored the caller's argument.
    row = adjusted.index[-1]
    for col_num, col_name in enumerate(original.columns):
        ax.text(
            x=adjusted.at[row, col_name],
            y=col_num,
            s=f"{original.at[row, col_name]:.1f}",
            ha="center",
            va="center",
            size=f_size,
        )


def _label_extremes(
    ax: Axes,
    data: tuple[DataFrame, DataFrame],
    plot_type: str,
    f_size: int,
    kwargs: dict[str, Any],  # must be a dictionary, not a splat
) -> None:
    """Label the extremes in the scaled plots.

    Args:
    -   ax: the Axes to annotate.
    -   data: the tuple (original, adjusted) of DataFrames.
    -   plot_type: nothing is drawn unless this is ZSCALED.
    -   f_size: font size for the minimum/maximum labels.
    -   kwargs: must contain "xlim", as set by _plot_middle_bars()."""

    original, adjusted = data
    low, high = kwargs["xlim"]
    # NOTE(review): the nesting below was reconstructed from a listing
    # with collapsed whitespace; confirm that the median markers and the
    # min/max labels are all meant to be ZSCALED-only.
    if plot_type == ZSCALED:
        # dashed guides at the scaled minimum (-1) and maximum (+1)
        ax.axvline(-1, color="#555555", linewidth=0.5, linestyle="--")
        ax.axvline(1, color="#555555", linewidth=0.5, linestyle="--")
        ax.scatter(
            adjusted.median(),
            adjusted.columns,
            color="darkorchid",
            marker="x",
            s=5,
            label="Median",
        )
        # print the original minimum/maximum at the left/right plot edges
        for col_num, col_name in enumerate(original.columns):
            ax.text(
                low,
                col_num,
                f" {original[col_name].min():.1f}",
                ha="left",
                va="center",
                size=f_size,
            )
            ax.text(
                high,
                col_num,
                f"{original[col_name].max():.1f} ",
                ha="right",
                va="center",
                size=f_size,
            )


def _horizontal_bar_plot(
    original: DataFrame,
    adjusted: DataFrame,
    middle: float,
    plot_type: str,
    kwargs: dict[str, Any],  # must be a dictionary, not a splat
) -> Axes:
    """Plot horizontal bars for the middle of the data, then overlay
    the latest data points and (for scaled plots) the extremes.

    Return the matplotlib Axes object."""

    # kwargs is a dictionary, not a splat
    # so that we can pass it to the Axes object and
    # set the x-axis limits.

    ax = _plot_middle_bars(adjusted, middle, kwargs)
    f_size = 10
    _plot_latest_datapoint(ax, original, adjusted, f_size)
    _label_extremes(
        ax, data=(original, adjusted), plot_type=plot_type, f_size=f_size, kwargs=kwargs
    )

    return ax


# public
def summary_plot(
    data: DataT,  # summary data
    **kwargs,
) -> Axes:
    """Plot a summary of historical data for a given DataFrame.

    Args:
    -   data: DataFrame containing the summary data. The column names are
        used as labels for the plot.
    -   kwargs: additional arguments for the plot, including:
        -   plot_from: int | Period | None
        -   verbose: if True, print the summary data.
        -   middle: proportion of data to highlight (default is 0.8).
        -   plot_type: str - "zscores" (the default) plots the z-scores;
            any other value plots the scaled z-scores ("zscaled").

    Returns Axes.

    Raises TypeError if data is not a pandas DataFrame.
    """

    # --- check the kwargs
    me = "summary_plot"
    report_kwargs(called_from=me, **kwargs)
    kwargs = validate_kwargs(SUMMARY_KW_TYPES, me, **kwargs)

    # --- check the data
    data = check_clean_timeseries(data, me)
    if not isinstance(data, DataFrame):
        raise TypeError("data must be a pandas DataFrame for summary_plot()")
    df = DataFrame(data)  # syntactic sugar for type hinting

    # --- optional arguments
    # Use the shared keyword-name constants, as SUMMARY_KW_TYPES does.
    verbose = kwargs.pop(VERBOSE, False)
    middle = float(kwargs.pop(MIDDLE, 0.8))
    plot_type = kwargs.pop(PLOT_TYPE, ZSCORES)
    kwargs["legend"] = kwargs.get(
        "legend",
        {
            # put the legend below the x-axis label
            "loc": "upper center",
            "fontsize": "xx-small",
            "bbox_to_anchor": (0.5, -0.125),
            "ncol": 4,
        },
    )

    # get the data, calculate z-scores and scaled scores based on the start period
    subset, kwargs = constrain_data(df, **kwargs)
    z_scores, z_scaled = _calculate_z(subset, middle, verbose=verbose)

    # plot as required by the plot_type argument
    adjusted = z_scores if plot_type == ZSCORES else z_scaled
    ax = _horizontal_bar_plot(subset, adjusted, middle, plot_type, kwargs)
    ax.tick_params(axis="y", labelsize="small")
    make_legend(ax, kwargs["legend"])
    ax.set_xlim(kwargs.get("xlim", None))  # provide space for the labels

    return ax
ZSCORES =
'zscores'
ZSCALED =
'zscaled'
SUMMARY_KW_TYPES: mgplot.kw_type_checking.ExpectedTypeDict =
{'ax': (<class 'matplotlib.axes._axes.Axes'>, <class 'NoneType'>), 'verbose': <class 'bool'>, 'middle': <class 'float'>, 'plot_type': <class 'str'>, 'plot_from': (<class 'int'>, <class 'pandas._libs.tslibs.period.Period'>, <class 'NoneType'>)}
def
summary_plot(data: ~DataT, **kwargs) -> matplotlib.axes._axes.Axes:
def summary_plot(
    data: DataT,  # summary data
    **kwargs,
) -> Axes:
    """Plot a summary of historical data for a given DataFrame.

    Args:
    -   data: DataFrame containing the summary data. The column names are
        used as labels for the plot.
    -   kwargs: additional arguments for the plot, including:
        -   plot_from: int | Period | None
        -   verbose: if True, print the summary data.
        -   middle: proportion of data to highlight (default is 0.8).
        -   plot_type: str - "zscores" (the default) plots the z-scores;
            any other value plots the scaled z-scores ("zscaled").

    Returns Axes.

    Raises TypeError if data is not a pandas DataFrame.
    """

    # --- check the kwargs
    me = "summary_plot"
    report_kwargs(called_from=me, **kwargs)
    kwargs = validate_kwargs(SUMMARY_KW_TYPES, me, **kwargs)

    # --- check the data
    data = check_clean_timeseries(data, me)
    if not isinstance(data, DataFrame):
        raise TypeError("data must be a pandas DataFrame for summary_plot()")
    df = DataFrame(data)  # syntactic sugar for type hinting

    # --- optional arguments
    # Use the shared keyword-name constants, as SUMMARY_KW_TYPES does.
    verbose = kwargs.pop(VERBOSE, False)
    middle = float(kwargs.pop(MIDDLE, 0.8))
    plot_type = kwargs.pop(PLOT_TYPE, ZSCORES)
    kwargs["legend"] = kwargs.get(
        "legend",
        {
            # put the legend below the x-axis label
            "loc": "upper center",
            "fontsize": "xx-small",
            "bbox_to_anchor": (0.5, -0.125),
            "ncol": 4,
        },
    )

    # get the data, calculate z-scores and scaled scores based on the start period
    subset, kwargs = constrain_data(df, **kwargs)
    z_scores, z_scaled = _calculate_z(subset, middle, verbose=verbose)

    # plot as required by the plot_type argument
    adjusted = z_scores if plot_type == ZSCORES else z_scaled
    ax = _horizontal_bar_plot(subset, adjusted, middle, plot_type, kwargs)
    ax.tick_params(axis="y", labelsize="small")
    make_legend(ax, kwargs["legend"])
    ax.set_xlim(kwargs.get("xlim", None))  # provide space for the labels

    return ax
Plot a summary of historical data for a given DataFrame.
Args:
- data: DataFrame containing the summary data. The column names are used as labels for the plot.
- kwargs: additional arguments for the plot, including:
- plot_from: int | Period | None
- verbose: if True, print the summary data.
- middle: proportion of data to highlight (default is 0.8).
- plot_type: str, the plot to generate: "zscores" (the default) or "zscaled".
Returns Axes.