mgplot.summary_plot

summary_plot.py: Produce a summary plot for the data in a given DataFrame. The data is normalised to z-scores and scaled.

  1"""
  2summary_plot.py:
  3Produce a summary plot for the data in a given DataFrame.
  4The data is normalised to z-scores and scaled.
  5"""
  6
  7# --- imports
  8# system imports
  9from typing import Any
 10
 11# from collections.abc import Sequence
 12
 13# analytic third-party imports
 14from numpy import ndarray, array
 15from matplotlib.pyplot import Axes, subplots
 16from pandas import DataFrame, Period
 17
 18# local imports
 19from mgplot.settings import DataT
 20from mgplot.finalise_plot import make_legend
 21from mgplot.utilities import constrain_data, check_clean_timeseries
 22from mgplot.kw_type_checking import (
 23    report_kwargs,
 24    ExpectedTypeDict,
 25    validate_expected,
 26    validate_kwargs,
 27)
 28from mgplot.keyword_names import AX, VERBOSE, MIDDLE, PLOT_TYPE, PLOT_FROM
 29
 30
 31# --- constants
# The two values of the `plot_type` keyword that summary_plot() distinguishes:
# raw z-scores, or z-scores rescaled to the range -1 to +1.
ZSCORES = "zscores"
ZSCALED = "zscaled"

# Keyword arguments declared for summary_plot(), mapped to the type (or tuple
# of alternative types) expected for each. Passed to validate_kwargs() when
# summary_plot() is called.
SUMMARY_KW_TYPES: ExpectedTypeDict = {
    AX: (Axes, type(None)),
    VERBOSE: bool,
    MIDDLE: float,
    PLOT_TYPE: str,
    PLOT_FROM: (int, Period, type(None)),
}
# Runs once at import time; presumably sanity-checks the declaration above
# (see mgplot.kw_type_checking to confirm).
validate_expected(SUMMARY_KW_TYPES, "summary_plot")
 43
 44
 45# --- functions
 46def _calc_quantiles(middle: float) -> ndarray:
 47    """Calculate the quantiles for the middle of the data."""
 48    return array([(1 - middle) / 2.0, 1 - (1 - middle) / 2.0])
 49
 50
 51def _calculate_z(
 52    original: DataFrame,  # only contains the data points of interest
 53    middle: float,  # middle proportion of data to highlight (eg. 0.8)
 54    verbose: bool = False,  # print the summary data
 55) -> tuple[DataFrame, DataFrame]:
 56    """Calculate z-scores, scaled z-scores and middle quantiles.
 57    Return z_scores, z_scaled, q (which are the quantiles for the
 58    start/end of the middle proportion of data to highlight)."""
 59
 60    # calculate z-scores, scaled scores and middle quantiles
 61    z_scores: DataFrame = (original - original.mean()) / original.std()
 62    z_scaled: DataFrame = (
 63        # scale z-scores between -1 and +1
 64        (((z_scores - z_scores.min()) / (z_scores.max() - z_scores.min())) - 0.5)
 65        * 2
 66    )
 67    q_middle = _calc_quantiles(middle)
 68
 69    if verbose:
 70        frame = DataFrame(
 71            {
 72                "count": original.count(),
 73                "mean": original.mean(),
 74                "median": original.median(),
 75                "min shaded": original.quantile(q=q_middle[0]),
 76                "max shaded": original.quantile(q=q_middle[1]),
 77                "z-scores": z_scores.iloc[-1],
 78                "scaled": z_scaled.iloc[-1],
 79            }
 80        )
 81        print(frame)
 82
 83    return DataFrame(z_scores), DataFrame(z_scaled)  # syntactic sugar for type hinting
 84
 85
 86def _plot_middle_bars(
 87    adjusted: DataFrame,
 88    middle: float,
 89    kwargs: dict[str, Any],  # must be a dictionary, not a splat
 90) -> Axes:
 91    """Plot the middle (typically 80%) of the data as a bar.
 92    Note: also sets the x-axis limits in kwargs.
 93    Return the matplotlib Axes object."""
 94
 95    q = _calc_quantiles(middle)
 96    lo_hi: DataFrame = adjusted.quantile(q=q).T  # get the middle section of data
 97    span = 1.15
 98    space = 0.2
 99    low = min(adjusted.iloc[-1].min(), lo_hi.min().min(), -span) - space
100    high = max(adjusted.iloc[-1].max(), lo_hi.max().max(), span) + space
101    kwargs["xlim"] = (low, high)  # remember the x-axis limits
102    _fig, ax = subplots()
103    ax.barh(
104        y=lo_hi.index,
105        width=lo_hi[q[1]] - lo_hi[q[0]],
106        left=lo_hi[q[0]],
107        color="#bbbbbb",
108        label=f"Middle {middle*100:0.0f}% of prints",
109    )
110    return ax
111
112
def _plot_latest_datapoint(
    ax: Axes,
    original: DataFrame,
    adjusted: DataFrame,
    f_size: int,
) -> None:
    """Add the latest datapoints to the summary plot.

    Args:
    - ax: the Axes to draw on.
    - original: the un-normalised data; its last row supplies the text
      of each label (formatted to one decimal place).
    - adjusted: the normalised data; its last row supplies the
      x-position of each marker and label.
    - f_size: the font size for the labels."""

    # one marker per column at the latest normalised value
    ax.scatter(adjusted.iloc[-1], adjusted.columns, color="darkorange", label="Latest")

    # label each marker with the latest original value; the caller's
    # f_size is used as given (it is no longer overridden here)
    row = adjusted.index[-1]
    for col_num, col_name in enumerate(original.columns):
        ax.text(
            x=adjusted.at[row, col_name],
            y=col_num,
            s=f"{original.at[row, col_name]:.1f}",
            ha="center",
            va="center",
            size=f_size,
        )
133
134
def _label_extremes(
    ax: Axes,
    data: tuple[DataFrame, DataFrame],
    plot_type: str,
    f_size: int,
    kwargs: dict[str, Any],  # must be a dictionary, not a splat
) -> None:
    """Annotate a scaled plot with its boundaries, medians and extremes.

    Nothing is drawn unless plot_type is ZSCALED. For a scaled plot:
    dashed vertical lines are drawn at -1 and +1, the median of each
    adjusted column is marked, and each row is labelled with the
    minimum (left edge) and maximum (right edge) of the original data.

    Args:
    - ax: the Axes to draw on.
    - data: a tuple of (original, adjusted) DataFrames.
    - plot_type: the kind of plot being drawn.
    - f_size: the font size for the min/max labels.
    - kwargs: must contain "xlim", a (low, high) pair giving the
      x-positions for the min/max labels."""

    original, adjusted = data
    # unpacked up front, so a missing "xlim" raises for every plot type
    low, high = kwargs["xlim"]
    if plot_type != ZSCALED:
        return

    # the scaled data always lies between these two boundaries
    for boundary in (-1, 1):
        ax.axvline(boundary, color="#555555", linewidth=0.5, linestyle="--")

    ax.scatter(
        adjusted.median(),
        adjusted.columns,
        color="darkorchid",
        marker="x",
        s=5,
        label="Median",
    )

    # historical minimum at the left edge, maximum at the right edge
    text_style = {"va": "center", "size": f_size}
    for col_num, col_name in enumerate(original.columns):
        series = original[col_name]
        ax.text(low, col_num, f" {series.min():.1f}", ha="left", **text_style)
        ax.text(high, col_num, f"{series.max():.1f} ", ha="right", **text_style)
174
175
def _horizontal_bar_plot(
    original: DataFrame,
    adjusted: DataFrame,
    middle: float,
    plot_type: str,
    kwargs: dict[str, Any],  # must be a dictionary, not a splat
) -> Axes:
    """Assemble the horizontal bar summary plot and return its Axes.

    Three layers are drawn in turn: the bars covering the middle of
    the data, the latest data points, and (depending on plot_type)
    the labels for the extremes.

    kwargs is passed as a dictionary rather than a splat because
    _plot_middle_bars() stores the x-axis limits in it, and both
    _label_extremes() and the caller read them back."""

    font_size = 10  # shared by the latest-value and the min/max labels
    axes = _plot_middle_bars(adjusted, middle, kwargs)
    _plot_latest_datapoint(axes, original, adjusted, font_size)
    _label_extremes(
        axes,
        data=(original, adjusted),
        plot_type=plot_type,
        f_size=font_size,
        kwargs=kwargs,
    )
    return axes
197
198
199# public
def summary_plot(
    data: DataT,  # summary data
    **kwargs,
) -> Axes:
    """Plot a summary of historical data for a given DataFrame.

    Args:
    - data: DataFrame containing the data to summarise. The column
      names are used as labels for the plot.
    - kwargs: additional arguments for the plot (the validated set is
      declared in SUMMARY_KW_TYPES), including:
        - plot_from: int | Period | None
        - verbose: if True, print the summary data.
        - middle: proportion of data to highlight (default is 0.8).
        - plot_type: "zscores" (the default) plots the z-scores; any
          other value plots the scaled z-scores, and the extremes are
          only labelled for "zscaled".
        - legend: legend settings; if absent, a small legend placed
          below the x-axis is used.

    Returns Axes.

    Raises TypeError if data is not a DataFrame.
    """

    # --- check the kwargs
    me = "summary_plot"
    report_kwargs(called_from=me, **kwargs)
    kwargs = validate_kwargs(SUMMARY_KW_TYPES, me, **kwargs)

    # --- check the data
    data = check_clean_timeseries(data, me)
    if not isinstance(data, DataFrame):
        raise TypeError("data must be a pandas DataFrame for summary_plot()")
    df = DataFrame(data)  # syntactic sugar for type hinting

    # --- optional arguments
    verbose = kwargs.pop("verbose", False)
    middle = float(kwargs.pop("middle", 0.8))
    plot_type = kwargs.pop("plot_type", ZSCORES)
    # only supply the default legend when the caller has not given one
    kwargs.setdefault(
        "legend",
        {
            # put the legend below the x-axis label
            "loc": "upper center",
            "fontsize": "xx-small",
            "bbox_to_anchor": (0.5, -0.125),
            "ncol": 4,
        },
    )

    # get the data, calculate z-scores and scaled scores based on the start period
    subset, kwargs = constrain_data(df, **kwargs)
    z_scores, z_scaled = _calculate_z(subset, middle, verbose=verbose)

    # plot as required by the plot_type argument
    adjusted = z_scores if plot_type == ZSCORES else z_scaled
    ax = _horizontal_bar_plot(subset, adjusted, middle, plot_type, kwargs)
    ax.tick_params(axis="y", labelsize="small")
    make_legend(ax, kwargs["legend"])
    ax.set_xlim(kwargs.get("xlim", None))  # provide space for the labels

    return ax
ZSCORES = 'zscores'
ZSCALED = 'zscaled'
SUMMARY_KW_TYPES: mgplot.kw_type_checking.ExpectedTypeDict = {'ax': (<class 'matplotlib.axes._axes.Axes'>, <class 'NoneType'>), 'verbose': <class 'bool'>, 'middle': <class 'float'>, 'plot_type': <class 'str'>, 'plot_from': (<class 'int'>, <class 'pandas._libs.tslibs.period.Period'>, <class 'NoneType'>)}
def summary_plot(data: ~DataT, **kwargs) -> matplotlib.axes._axes.Axes:
def summary_plot(
    data: DataT,  # summary data
    **kwargs,
) -> Axes:
    """Plot a summary of historical data for a given DataFrame.

    Args:
    - data: DataFrame containing the data to summarise. The column
      names are used as labels for the plot.
    - kwargs: additional arguments for the plot (the validated set is
      declared in SUMMARY_KW_TYPES), including:
        - plot_from: int | Period | None
        - verbose: if True, print the summary data.
        - middle: proportion of data to highlight (default is 0.8).
        - plot_type: "zscores" (the default) plots the z-scores; any
          other value plots the scaled z-scores, and the extremes are
          only labelled for "zscaled".
        - legend: legend settings; if absent, a small legend placed
          below the x-axis is used.

    Returns Axes.

    Raises TypeError if data is not a DataFrame.
    """

    # --- check the kwargs
    me = "summary_plot"
    report_kwargs(called_from=me, **kwargs)
    kwargs = validate_kwargs(SUMMARY_KW_TYPES, me, **kwargs)

    # --- check the data
    data = check_clean_timeseries(data, me)
    if not isinstance(data, DataFrame):
        raise TypeError("data must be a pandas DataFrame for summary_plot()")
    df = DataFrame(data)  # syntactic sugar for type hinting

    # --- optional arguments
    verbose = kwargs.pop("verbose", False)
    middle = float(kwargs.pop("middle", 0.8))
    plot_type = kwargs.pop("plot_type", ZSCORES)
    # only supply the default legend when the caller has not given one
    kwargs.setdefault(
        "legend",
        {
            # put the legend below the x-axis label
            "loc": "upper center",
            "fontsize": "xx-small",
            "bbox_to_anchor": (0.5, -0.125),
            "ncol": 4,
        },
    )

    # get the data, calculate z-scores and scaled scores based on the start period
    subset, kwargs = constrain_data(df, **kwargs)
    z_scores, z_scaled = _calculate_z(subset, middle, verbose=verbose)

    # plot as required by the plot_type argument
    adjusted = z_scores if plot_type == ZSCORES else z_scaled
    ax = _horizontal_bar_plot(subset, adjusted, middle, plot_type, kwargs)
    ax.tick_params(axis="y", labelsize="small")
    make_legend(ax, kwargs["legend"])
    ax.set_xlim(kwargs.get("xlim", None))  # provide space for the labels

    return ax

Plot a summary of historical data for a given DataFrame.

Args:

  • data: DataFrame containing the data to summarise. The column names are used as labels for the plot.
  • kwargs: additional arguments for the plot, including:
    • plot_from: int | Period | None
    • verbose: if True, print the summary data.
    • middle: proportion of data to highlight (default is 0.8).
    • plot_type: which plot to generate: "zscores" (the default) or "zscaled".

Returns Axes.