Skip to content

Commit 1addc8c

Browse files
authored
Merge pull request #82 from machow/feat-pin-metrics
feat: initial vetiver_pin_metrics implementation
2 parents 9f43e8d + 7b08861 commit 1addc8c

2 files changed

Lines changed: 160 additions & 60 deletions

File tree

vetiver/monitor.py

Lines changed: 69 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,6 @@
1-
import datetime
2-
import pins
3-
from pins.errors import PinsError
41
import plotly.express as px
52
import pandas as pd
6-
from datetime import datetime, timedelta
3+
from datetime import timedelta
74

85

96
def compute_metrics(
@@ -75,60 +72,74 @@ def _rolling_df(df: pd.DataFrame, td: timedelta):
7572
first = stop
7673

7774

78-
def pin_metrics(board, df_metrics, metrics_pin_name, overwrite=False):
79-
pass
80-
81-
82-
# """
83-
# Update an existing pin storing model metrics over time
84-
85-
# Parameters
86-
# ----------
87-
# board :
88-
# Pins board
89-
# df_metrics: pd.DataFrame
90-
# Dataframe of metrics over time, such as created by `vetiver_compute_metrics()`
91-
# metrics_pin_name:
92-
# Pin name for where the metrics are stored
93-
# overwrite: bool
94-
# If TRUE (the default), overwrite any metrics for
95-
# dates that exist both in the existing pin and
96-
# new metrics with the new values. If FALSE, error
97-
# when the new metrics contain overlapping dates with
98-
# the existing pin.
99-
# """
100-
# date_types = (datetime.date, datetime.time, datetime.datetime)
101-
# if not isinstance(df_metrics.index, date_types):
102-
# try:
103-
# df_metrics = df_metrics.index.astype("datetime")
104-
# except TypeError:
105-
# raise TypeError(f"Index of {df_metrics} must be a date type")
106-
107-
# new_metrics = df_metrics.sort_index()
108-
109-
# new_dates = df_metrics.index.unique()
110-
111-
# try:
112-
# old_metrics = board.pin_read(metrics_pin_name)
113-
# except PinsError:
114-
# board.pin_write(metrics_pin_name)
115-
116-
# overlapping_dates = old_metrics.index in new_dates
117-
118-
# if overwrite is True:
119-
# old_metrics = old_metrics not in overlapping_dates
120-
# else:
121-
# if overlapping_dates:
122-
# raise ValueError(
123-
# f"The new metrics overlap with dates \
124-
# already stored in {repr(metrics_pin_name)} \
125-
# Check the aggregated dates or use `overwrite = True`"
126-
# )
127-
128-
# new_metrics = old_metrics + df_metrics
129-
# new_metrics = new_metrics.sort_index()
130-
131-
# pins.pin_write(board, new_metrics, metrics_pin_name)
75+
def pin_metrics(
    board,
    df_metrics: pd.DataFrame,
    metrics_pin_name: str,
    pin_type: "str | None" = None,
    index_name: str = "index",
    overwrite: bool = False,
) -> pd.DataFrame:
    """
    Update an existing pin storing model metrics over time

    The pin must already exist: it is read with `board.pin_read` before
    anything is written.

    Parameters
    ----------
    board :
        Pins board
    df_metrics: pd.DataFrame
        Dataframe of metrics over time, such as created by `compute_metrics()`
    metrics_pin_name:
        Pin name for where the metrics are stored
    pin_type:
        Type used when writing the updated pin. If None (the default), the
        type recorded in the existing pin's metadata is reused.
    index_name:
        The column in df_metrics containing the aggregated dates or datetimes.
        Note that this defaults to a column named "index".
    overwrite: bool
        If True, overwrite any metrics for dates that exist both in the
        existing pin and new metrics with the new values. If False
        (the default), error when the new metrics contain overlapping
        dates with the existing pin.

    Returns
    -------
    pd.DataFrame
        The combined metrics, sorted by `index_name`, exactly as written to
        the pin. `df_metrics` itself is not modified.

    Raises
    ------
    ValueError
        If `overwrite` is False and any date in the existing pin also
        appears in `df_metrics`.
    """

    old_metrics_raw = board.pin_read(metrics_pin_name)

    # need to coerce date index to a datetime, since pandas does not infer
    # date columns from CSV (but note that formats like arrow do)
    old_metrics = old_metrics_raw.assign(
        **{index_name: pd.to_datetime(old_metrics_raw[index_name])}
    )

    # coerce the new metrics the same way (on a copy), so the combined column
    # has a single datetime dtype and can be sorted even when the caller
    # passed dates as strings
    dt_new = pd.to_datetime(df_metrics[index_name])
    new_metrics = df_metrics.assign(**{index_name: dt_new})

    # handle overlapping dates ----
    indx_old_overlap = old_metrics[index_name].isin(dt_new)

    if indx_old_overlap.any():
        if not overwrite:
            raise ValueError(
                f"The new metrics overlap with dates already stored in {metrics_pin_name}."
                " Check the aggregated dates or use `overwrite=True`."
            )

        # get only rows specific to old metrics, so when we concat below
        # it effectively is an upsert
        old_metrics = old_metrics.loc[~indx_old_overlap, :]

    # update and pin ----
    combined_metrics = pd.concat([old_metrics, new_metrics], ignore_index=True)
    sorted_metrics = combined_metrics.sort_values(index_name)

    if pin_type is None:
        # keep whatever type the pin was originally stored with
        final_pin_type = board.pin_meta(metrics_pin_name).type
    else:
        final_pin_type = pin_type

    board.pin_write(sorted_metrics, metrics_pin_name, type=final_pin_type)

    return sorted_metrics
132143

133144

134145
def plot_metrics(

vetiver/tests/test_monitor.py

Lines changed: 91 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,27 @@
11
from sklearn import metrics
22
from datetime import timedelta
3+
34
import pandas as pd
5+
import pins
46
import numpy
7+
import time
58
import vetiver
69

10+
import pytest
11+
712
rng = pd.date_range("1/1/2012", periods=10, freq="S")
813
new = dict(x=range(len(rng)), y=range(len(rng)))
914
df = pd.DataFrame(new, index=rng)
1015
td = timedelta(seconds=2)
1116
metric_set = [metrics.mean_squared_error, metrics.mean_absolute_error]
1217

18+
1319
def test_rolling():
    """Rolling over `df` in `td` steps yields five windows; the first has two rows."""
    windows = list(vetiver._rolling_df(df, td))
    assert len(windows) == 5
    first_window = windows[0]
    assert len(first_window) == 2
1723

24+
1825
def test_compute():
1926
df.reset_index(inplace=True)
2027
m = vetiver.compute_metrics(
@@ -27,10 +34,92 @@ def test_compute():
2734
numpy.array(["mean_squared_error", "mean_absolute_error"], dtype=object),
2835
)
2936

37+
3038
def test_monitor(snapshot):
31-
snapshot.snapshot_dir = './vetiver/tests/snapshots'
39+
snapshot.snapshot_dir = "./vetiver/tests/snapshots"
3240
m = vetiver.compute_metrics(
3341
df, "index", td, metric_set=metric_set, truth="x", estimate="y"
3442
)
3543
vetiver.plot_metrics(m)
36-
snapshot.assert_match(m.to_json(), 'test_monitor.json')
44+
snapshot.assert_match(m.to_json(), "test_monitor.json")
45+
46+
47+
@pytest.fixture
def df_metrics_old():
    """Provide a two-row metrics frame dated 2021-01-01 and 2021-01-02."""
    dates = pd.to_datetime(["2021-01-01", "2021-01-02"])
    columns = {
        "index": dates,
        "n": [1, 2],
        "metric": ["x", "x"],
        "estimate": [0.1, 0.2],
    }
    return pd.DataFrame(columns)
57+
58+
59+
def test_vetiver_pin_metrics_simple(df_metrics_old):
    """Metrics for new dates are appended after the rows already in the pin."""
    pin_name = "test_metrics"
    board = pins.board_temp()
    board.pin_write(df_metrics_old, pin_name, type="csv")
    # NOTE(review): presumably keeps the two pin versions from sharing a
    # timestamp -- confirm against pins' version naming
    time.sleep(1)

    new_dates = pd.to_datetime(["2021-01-03", "2021-01-04"])
    df_metrics_new = pd.DataFrame(
        {
            "index": new_dates,
            "n": [3, 4],
            "metric": ["x", "x"],
            "estimate": [0.8, 0.9],
        }
    )

    df_res = vetiver.pin_metrics(board, df_metrics_new, pin_name)
    expected = pd.concat([df_metrics_old, df_metrics_new], ignore_index=True)

    assert len(df_res) == 4
    assert df_res.equals(expected)
77+
78+
79+
def test_vetiver_pin_metrics_overlap_error(df_metrics_old):
    """Re-pinning the same dates without `overwrite` raises a ValueError."""
    pin_name = "test_metrics"
    board = pins.board_temp()
    board.pin_write(df_metrics_old, pin_name, type="csv")
    time.sleep(0.1)

    with pytest.raises(ValueError) as exc_info:
        vetiver.pin_metrics(board, df_metrics_old, pin_name)

    message = exc_info.value.args[0]
    assert "The new metrics overlap" in message
88+
89+
90+
def test_vetiver_pin_metrics_overwrite(df_metrics_old):
    """With `overwrite=True`, rows for dates already pinned are replaced by the new ones."""
    pin_name = "test_metrics"
    board = pins.board_temp()
    board.pin_write(df_metrics_old, pin_name, type="csv")
    # NOTE(review): presumably keeps the two pin versions from sharing a
    # timestamp -- confirm against pins' version naming
    time.sleep(1)

    # first row should update existing metrics
    new_dates = pd.to_datetime(["2021-01-01", "2021-01-03"])
    df_metrics_new = pd.DataFrame(
        {
            "index": new_dates,
            "n": [200, 201],
            "metric": ["y", "y"],
            "estimate": [0.8, 0.9],
        }
    )

    df_res = vetiver.pin_metrics(board, df_metrics_new, pin_name, overwrite=True)
    assert len(df_res) == 3

    # only the second old row (2021-01-02) has no replacement in the new metrics
    surviving_old = df_metrics_old.iloc[[1], :]
    df_dst = pd.concat([surviving_old, df_metrics_new], ignore_index=True)
    expected = df_dst.sort_values("index")
    assert df_res.equals(expected)
110+
111+
112+
def test_vetiver_pin_metrics_manual_pin_type(df_metrics_old):
    """An explicit `pin_type` is what the pin's metadata reports after the update."""
    pin_name = "test_metrics"
    board = pins.board_temp()
    board.pin_write(df_metrics_old, pin_name, type="csv")
    # NOTE(review): presumably keeps the two pin versions from sharing a
    # timestamp -- confirm against pins' version naming
    time.sleep(1)

    df_res = vetiver.pin_metrics(
        board,
        df_metrics_old,
        pin_name,
        overwrite=True,
        pin_type="joblib",
    )
    assert len(df_res) == 2

    stored_type = board.pin_meta(pin_name).type
    assert stored_type == "joblib"

0 commit comments

Comments
 (0)