Merge pull request #76 from isabelizimm/dev-model-monitoring

isabelizimm · web-flow · commit 85fef87f6177 · 2022-07-05T14:29:49.000-04:00
model monitoring
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -53,6 +53,17 @@ Deploy
    ~write_app
    ~write_docker
 
+Monitor
+==================
+
+.. autosummary::
+   :toctree: reference/
+   :caption: Monitor
+
+   ~compute_metrics
+   ~pin_metrics
+   ~plot_metrics
+
 Advanced Usage
 ==================
 .. toctree::
diff --git a/setup.cfg b/setup.cfg
@@ -34,11 +34,13 @@ install_requires =
     requests
     pins
     rsconnect-python
+    plotly
 
 [options.extras_require]
 dev =
     pytest
     pytest-cov
+    pytest-snapshot
     sphinx
     sphinx-autodoc-typehints
     sphinx-book-theme
diff --git a/vetiver/__init__.py b/vetiver/__init__.py
@@ -18,4 +18,5 @@
 from .handlers.base import VetiverHandler  # noqa
 from .handlers.sklearn import SKLearnHandler  # noqa
 from .handlers.torch import TorchHandler  # noqa
-from .rsconnect import deploy_rsconnect
+from .rsconnect import deploy_rsconnect # noqa
+from .monitor import compute_metrics, pin_metrics, plot_metrics, _rolling_df # noqa
diff --git a/vetiver/monitor.py b/vetiver/monitor.py
@@ -0,0 +1,166 @@
+import datetime
+from datetime import datetime, timedelta
+
+import pandas as pd
+import pins
+import plotly.express as px
+import plotly.graph_objects as go
+from pins.errors import PinsError
+
+
+def compute_metrics(
+    data: pd.DataFrame,
+    date_var: str,
+    period: timedelta,
+    metric_set: list,
+    truth: str,
+    estimate: str,
+) -> pd.DataFrame:
+    """
+    Compute metrics for given time period
+
+    The rows of `data` are ordered by `date_var` and split into consecutive
+    windows of length `period`, starting at the earliest date. Every metric
+    in `metric_set` is evaluated once per non-empty window.
+
+    Parameters
+    ----------
+    data : DataFrame
+        Pandas dataframe
+    date_var: str
+        Column in `data` containing dates
+    period: datetime.timedelta
+        Defining period to group by
+    metric_set: list
+        List of metrics to compute, that have the parameters `y_true` and `y_pred`
+    truth: str
+        Column name for true results
+    estimate: str
+        Column name for predicted results
+
+    Returns
+    -------
+    DataFrame
+        One row per (window, metric) pair with the columns `index` (first
+        date observed in the window), `n` (number of rows in the window),
+        `metric` (the metric's `__qualname__`) and `estimate` (the metric
+        value). The frame has these columns but no rows when `data` is empty.
+
+    Example
+    -------
+    from datetime import timedelta
+    import pandas as pd
+    from sklearn import metrics
+    rng = pd.date_range("1/1/2012", periods=10, freq="S")
+    new = dict(x=range(len(rng)), y=range(len(rng)))
+    df = pd.DataFrame(new, index=rng).reset_index()
+    td = timedelta(seconds=2)
+    metric_set = [metrics.mean_squared_error, metrics.mean_absolute_error]
+    compute_metrics(df, "index", td, metric_set=metric_set, truth="x", estimate="y")
+
+    """
+
+    columns = ["index", "n", "metric", "estimate"]
+    df = data[[truth, estimate, date_var]].set_index(date_var).sort_index()
+
+    # Without rows there is no first/last date to build windows from.
+    if len(df) == 0:
+        return pd.DataFrame(columns=columns)
+
+    rows = []
+    for window in _rolling_df(df=df, td=period):
+        # A gap in the dates longer than `period` produces a window with no
+        # rows; there is nothing to score and no first date to report.
+        if len(window) == 0:
+            continue
+        for m in metric_set:
+            rows.append(
+                {
+                    "index": window.index[0],
+                    "n": len(window),
+                    "metric": m.__qualname__,
+                    # `truth` holds the observed values, `estimate` the predictions
+                    "estimate": m(y_true=window[truth], y_pred=window[estimate]),
+                }
+            )
+
+    return pd.DataFrame(rows, columns=columns)
+
+
+def _rolling_df(df: pd.DataFrame, td: timedelta):
+    """
+    Yield consecutive, non-overlapping windows of `df`
+
+    Windows are half-open intervals `[start, start + td)` over the index,
+    beginning at the first index value and continuing until the last index
+    value has been covered. A window that contains no rows is still yielded
+    (as an empty dataframe). Nothing is yielded for an empty `df`.
+
+    Parameters
+    ----------
+    df : DataFrame
+        Dataframe whose first and last index values bound the windows, so
+        the index is expected to be sorted in ascending order
+    td : datetime.timedelta
+        Length of each window
+
+    Yields
+    ------
+    DataFrame
+        Copy of the rows of `df` falling in the current window
+
+    Raises
+    ------
+    ValueError
+        If adding `td` does not move the window start forward, which would
+        otherwise make the loop run forever
+    """
+    if len(df.index) == 0:
+        return
+
+    first = df.index[0]
+    last = df.index[-1]
+
+    if first + td <= first:
+        raise ValueError(f"`td` must be a positive duration, got {td!r}")
+
+    # `<=` (not `<`) so that a final row sitting exactly on a window boundary,
+    # or the only row of a single-row frame, still gets its own window.
+    while first <= last:
+        stop = first + td
+        boolidx = (first <= df.index) & (df.index < stop)
+        yield df[boolidx].copy()
+        first = stop
+
+
+def pin_metrics(board, df_metrics, metrics_pin_name, overwrite=False):
+    """
+    Placeholder for updating a pin that stores model metrics over time
+
+    This function is not implemented yet: none of the arguments are used,
+    nothing is read from or written to `board`, and `None` is returned. A
+    `UserWarning` is emitted so that a call is not mistaken for a
+    successful update.
+
+    Parameters
+    ----------
+    board :
+        Pins board (currently unused)
+    df_metrics: pd.DataFrame
+        Dataframe of metrics over time, such as created by `compute_metrics()`
+        (currently unused)
+    metrics_pin_name:
+        Pin name for where the metrics are stored (currently unused)
+    overwrite: bool
+        Currently unused
+    """
+    import warnings
+
+    warnings.warn(
+        "pin_metrics() is not implemented yet; no metrics were pinned",
+        UserWarning,
+        stacklevel=2,
+    )
+
+
+#     """
+#     Update an existing pin storing model metrics over time
+
+#     Parameters
+#     ----------
+#     board :
+#         Pins board
+#     df_metrics: pd.DataFrame
+#         Dataframe of metrics over time, such as created by `compute_metrics()`
+#     metrics_pin_name:
+#         Pin name for where the metrics are stored
+#     overwrite: bool
+#         If True, overwrite any metrics for
+#         dates that exist both in the existing pin and
+#         new metrics with the new values. If False (the default), error
+#         when the new metrics contain overlapping dates with
+#         the existing pin.
+#     """
+#     date_types = (datetime.date, datetime.time, datetime.datetime)
+#     if not isinstance(df_metrics.index, date_types):
+#         try:
+#             df_metrics = df_metrics.index.astype("datetime")
+#         except TypeError:
+#             raise TypeError(f"Index of {df_metrics} must be a date type")
+
+#     new_metrics = df_metrics.sort_index()
+
+#     new_dates = df_metrics.index.unique()
+
+#     try:
+#         old_metrics = board.pin_read(metrics_pin_name)
+#     except PinsError:
+#         board.pin_write(metrics_pin_name)
+
+#     overlapping_dates = old_metrics.index in new_dates
+
+#     if overwrite is True:
+#         old_metrics = old_metrics not in overlapping_dates
+#     else:
+#         if overlapping_dates:
+#             raise ValueError(
+#                 f"The new metrics overlap with dates \
+#                      already stored in {repr(metrics_pin_name)} \
+#                      Check the aggregated dates or use `overwrite = True`"
+#             )
+
+#     new_metrics = old_metrics + df_metrics
+#     new_metrics = new_metrics.sort_index()
+
+#     pins.pin_write(board, new_metrics, metrics_pin_name)
+
+
+def plot_metrics(
+    df_metrics, date="index", estimate="estimate", metric="metric", n="n", **kw
+) -> go.Figure:
+    """
+    Plot metrics over a given time period
+
+    Draws one line per metric, each in its own facet row, with `date` on the
+    x-axis and `estimate` on the y-axis. The legend is hidden and each facet
+    is labelled with the metric name only.
+
+    Parameters
+    ----------
+    df_metrics : DataFrame
+        Pandas dataframe of metrics over time, such as created by `compute_metrics()`
+    date: str
+        Column in `df_metrics` containing dates
+    estimate: str
+        Column in `df_metrics` containing metric output
+    metric: str
+       Column in `df_metrics` containing metric name
+    n: str
+        Column in `df_metrics` containing number of observations. When it is
+        given, markers are drawn on the lines and, if the column is present,
+        its value is added to the hover text.
+    **kw:
+        Additional keyword arguments passed on to `plotly.express.line()`
+
+    Returns
+    -------
+    plotly.graph_objects.Figure
+        The faceted line chart
+    """
+
+    # `markers` is an on/off flag in plotly express, not a column name, so
+    # the observation count has to be surfaced through the hover data.
+    show_markers = bool(n)
+    if show_markers and n in getattr(df_metrics, "columns", ()):
+        # Respect an explicit `hover_data` supplied by the caller.
+        kw.setdefault("hover_data", [n])
+
+    fig = px.line(
+        df_metrics,
+        x=date,
+        y=estimate,
+        color=metric,
+        facet_row=metric,
+        markers=show_markers,
+        **kw,
+    )
+    # Facet annotations read "<column>=<value>"; keep only the value.
+    fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
+    fig.update_layout(showlegend=False)
+
+    return fig
diff --git a/vetiver/tests/snapshots/test_monitor.json b/vetiver/tests/snapshots/test_monitor.json
@@ -0,0 +1 @@
+{"index":{"0":1325376000000,"1":1325376000000,"2":1325376002000,"3":1325376002000,"4":1325376004000,"5":1325376004000,"6":1325376006000,"7":1325376006000,"8":1325376008000,"9":1325376008000},"n":{"0":2,"1":2,"2":2,"3":2,"4":2,"5":2,"6":2,"7":2,"8":2,"9":2},"metric":{"0":"mean_squared_error","1":"mean_absolute_error","2":"mean_squared_error","3":"mean_absolute_error","4":"mean_squared_error","5":"mean_absolute_error","6":"mean_squared_error","7":"mean_absolute_error","8":"mean_squared_error","9":"mean_absolute_error"},"estimate":{"0":0.0,"1":0.0,"2":0.0,"3":0.0,"4":0.0,"5":0.0,"6":0.0,"7":0.0,"8":0.0,"9":0.0}}
diff --git a/vetiver/tests/test_monitor.py b/vetiver/tests/test_monitor.py
@@ -0,0 +1,36 @@
+from sklearn import metrics
+from datetime import timedelta
+import pandas as pd
+import numpy
+import vetiver
+
+# Shared inputs for the tests below: ten rows spaced one second apart and
+# indexed by timestamp, with identical `x` and `y` columns.
+rng = pd.date_range("1/1/2012", periods=10, freq="S")
+new = dict(x=range(len(rng)), y=range(len(rng)))
+df = pd.DataFrame(new, index=rng)
+# Window length used to group the rows.
+td = timedelta(seconds=2)
+# Metrics evaluated on every window.
+metric_set = [metrics.mean_squared_error, metrics.mean_absolute_error]
+
+def test_rolling():
+    """Ten one-second rows in two-second windows give five windows of two rows."""
+    windows = list(vetiver._rolling_df(df, td))
+    assert len(windows) == 5
+    assert len(windows[0]) == 2
+
+def test_compute():
+    """`compute_metrics` returns one row per (window, metric) pair."""
+    # Build a local frame instead of `reset_index(inplace=True)`: mutating the
+    # shared module-level `df` makes the tests depend on execution order.
+    data = df.reset_index()
+    m = vetiver.compute_metrics(
+        data, "index", td, metric_set=metric_set, truth="x", estimate="y"
+    )
+    assert isinstance(m, pd.DataFrame)
+    assert m.shape == (10, 4)
+    numpy.testing.assert_array_equal(
+        m.metric.unique(),
+        numpy.array(["mean_squared_error", "mean_absolute_error"], dtype=object),
+    )
+
+def test_monitor(snapshot):
+    """Computed metrics match the stored snapshot and can be plotted."""
+    snapshot.snapshot_dir = './vetiver/tests/snapshots'
+    # Own copy with the dates as an "index" column, so this test also passes
+    # when it runs on its own or before `test_compute`.
+    data = df.reset_index()
+    m = vetiver.compute_metrics(
+        data, "index", td, metric_set=metric_set, truth="x", estimate="y"
+    )
+    vetiver.plot_metrics(m)
+    snapshot.assert_match(m.to_json(), 'test_monitor.json')

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+{"index":{"0":1325376000000,"1":1325376000000,"2":1325376002000,"3":1325376002000,"4":1325376004000,"5":1325376004000,"6":1325376006000,"7":1325376006000,"8":1325376008000,"9":1325376008000},"n":{"0":2,"1":2,"2":2,"3":2,"4":2,"5":2,"6":2,"7":2,"8":2,"9":2},"metric":{"0":"mean_squared_error","1":"mean_absolute_error","2":"mean_squared_error","3":"mean_absolute_error","4":"mean_squared_error","5":"mean_absolute_error","6":"mean_squared_error","7":"mean_absolute_error","8":"mean_squared_error","9":"mean_absolute_error"},"estimate":{"0":0.0,"1":0.0,"2":0.0,"3":0.0,"4":0.0,"5":0.0,"6":0.0,"7":0.0,"8":0.0,"9":0.0}}