
Commit eec86a9

Neeratyoy authored and mfeurer committed
New example for evaluations (#688)
* Adding example file for evaluations
* Adding example file for evaluations
* Adding boxplot to compare flows
* Editing example headers for make html
* Renaming file for make html
* Adding more comments, describing plot
* Fixing typos, plot aesthetics
* Adding flow ID to flow name mapping; Minor text changes
* Minor simplification in boxplot function
* Fixing PEP8 whitespace issue
1 parent e049fc6 commit eec86a9

3 files changed

Lines changed: 152 additions & 3 deletions

File tree

doc/api.rst

Lines changed: 0 additions & 1 deletion

@@ -135,4 +135,3 @@ Modules
     get_task
     get_tasks
     list_tasks
-
Lines changed: 150 additions & 0 deletions

@@ -0,0 +1,150 @@
"""
====================
Fetching Evaluations
====================

Evaluations contain a concise summary of the results of all runs made. Each evaluation
provides information on the dataset used, the flow applied, the setup used, the metric
evaluated, and the result obtained on the metric, for each such run made. These collections
of results can be used for efficient benchmarking of an algorithm and also allow transparent
reuse of results from previous experiments with similar parameters.

In this example, we shall do the following:

* Retrieve evaluations based on different metrics
* Fetch evaluations pertaining to a specific task
* Sort the obtained results in descending order of the metric
* Plot a cumulative distribution function for the evaluations
* Compare the top 10 performing flows based on the evaluation performance
"""

############################################################################
import openml
from pprint import pprint

############################################################################
# Listing evaluations
# *******************
# Evaluations can be retrieved from the database in the chosen output format.
# Required filters can be applied to retrieve results from runs as required.

# We shall retrieve a small set (only 10 entries) to test the listing function for evaluations
openml.evaluations.list_evaluations(function='predictive_accuracy', size=10,
                                    output_format='dataframe')

# Using other evaluation metrics, 'precision' in this case
evals = openml.evaluations.list_evaluations(function='precision', size=10,
                                            output_format='dataframe')

# Querying the returned results for precision above 0.98
pprint(evals[evals.value > 0.98])
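
# Aside (not part of this commit): the listing can be narrowed further. A minimal
# sketch, assuming the 'flow' and 'uploader' filters accepted by list_evaluations;
# the IDs below are placeholders chosen purely for illustration.
evals_subset = openml.evaluations.list_evaluations(function='predictive_accuracy', size=10,
                                                   flow=[6969], uploader=[1],
                                                   output_format='dataframe')
pprint(evals_subset.head())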

#############################################################################
# Viewing a sample task
# =====================
# Here we shall briefly take a look at the details of the task.

# We will start by displaying a simple *supervised classification* task:
task_id = 167140  # https://www.openml.org/t/167140
task = openml.tasks.get_task(task_id)
pprint(vars(task))
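
# Aside (not part of this commit): a hedged sketch showing that the dataset behind
# the task can also be fetched from the task object, assuming the
# OpenMLTask.get_dataset() helper is available in this version of the API.
dataset = task.get_dataset()
pprint(dataset.name)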

#############################################################################
# Obtaining all the evaluations for the task
# ==========================================
# We'll now obtain all the evaluations that were uploaded for the task
# we displayed previously.
# Note that we now filter the evaluations based on another parameter 'task'.

metric = 'predictive_accuracy'
evals = openml.evaluations.list_evaluations(function=metric, task=[task_id],
                                            output_format='dataframe')
# Displaying the first 10 rows
pprint(evals.head(n=10))
# Sorting the evaluations in decreasing order of the metric chosen
evals = evals.sort_values(by='value', ascending=False)
print("\nDisplaying head of sorted dataframe: ")
pprint(evals.head())
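
# Aside (not part of this commit): an equivalent one-liner using pandas directly,
# shown only as an illustrative alternative to sorting and taking the head.
pprint(evals.nlargest(5, 'value'))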

#############################################################################
# Obtaining CDF of metric for chosen task
# ***************************************
# We shall now analyse how various flows have performed on this task,
# by looking at the likelihood of the accuracy obtained across all runs.
# We shall plot a cumulative distribution function (CDF) for the accuracies obtained.

from matplotlib import pyplot as plt


def plot_cdf(values, metric='predictive_accuracy'):
    max_val = max(values)
    n, bins, patches = plt.hist(values, density=True, histtype='step',
                                cumulative=True, linewidth=3)
    # Drop the last vertex so the step CDF does not fall back to zero at the right edge
    patches[0].set_xy(patches[0].get_xy()[:-1])
    plt.xlim(max(0, min(values) - 0.1), 1)
    plt.title('CDF')
    plt.xlabel(metric)
    plt.ylabel('Likelihood')
    plt.grid(b=True, which='major', linestyle='-')
    plt.minorticks_on()
    plt.grid(b=True, which='minor', linestyle='--')
    plt.axvline(max_val, linestyle='--', color='gray')
    plt.text(max_val, 0, "%.3f" % max_val, fontsize=9)
    plt.show()


plot_cdf(evals.value, metric)
# This CDF plot shows that, for the given task and based on the results of the runs
# uploaded so far, it is almost certain that a run achieves an accuracy above 52%,
# while the maximum accuracy observed so far is 96.5%.

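# Aside (not part of this commit): the same information can be read off numerically,
# e.g. the empirical fraction of runs exceeding a threshold (0.9 here is an arbitrary
# value chosen purely for illustration).
print((evals.value > 0.9).mean())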
#############################################################################
# Comparing top 10 performing flows
# *********************************
# Let us now try to see which flows generally performed the best for this task.
# For this, we shall compare the top performing flows.

import numpy as np
import pandas as pd


def plot_flow_compare(evaluations, top_n=10, metric='predictive_accuracy'):
    # Collecting the top 'top_n' unique flow_ids
    flow_ids = evaluations.flow_id.unique()[:top_n]

    df = pd.DataFrame()
    # Creating a data frame containing only the metric values of the selected flows,
    # assuming evaluations is sorted in decreasing order of the metric
    for i in range(len(flow_ids)):
        flow_values = evaluations[evaluations.flow_id == flow_ids[i]].value
        df = pd.concat([df, flow_values], ignore_index=True, axis=1)
    fig, axs = plt.subplots()
    df.boxplot()
    axs.set_title('Boxplot comparing ' + metric + ' for different flows')
    axs.set_ylabel(metric)
    axs.set_xlabel('Flow ID')
    axs.set_xticklabels(flow_ids)
    axs.grid(which='major', linestyle='-', linewidth='0.5', color='gray', axis='y')
    axs.minorticks_on()
    axs.grid(which='minor', linestyle='--', linewidth='0.5', color='gray', axis='y')
    # Counting the number of entries for each flow in the data frame,
    # which gives the number of runs for each flow
    flow_freq = list(df.count(axis=0, numeric_only=True))
    for i in range(len(flow_ids)):
        axs.text(i + 1.05, np.nanmin(df.values), str(flow_freq[i]) + '\nrun(s)', fontsize=7)
    plt.show()


plot_flow_compare(evals, metric=metric, top_n=10)
# The boxplots below show how the flows perform across multiple runs on the chosen
# task. The green horizontal lines represent the median accuracy of all the runs for
# that flow (the number of runs is denoted at the bottom of each boxplot). The higher
# the green line, the better the flow performs on the task at hand. The flows are
# ordered in descending order of the highest accuracy value seen under each flow.

# Printing the corresponding flow names for the top 10 performing flow IDs
top_n = 10
flow_ids = evals.flow_id.unique()[:top_n]
flow_names = evals.flow_name.unique()[:top_n]
for i in range(top_n):
    pprint((flow_ids[i], flow_names[i]))
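
# Aside (not part of this commit): a more compact view of the same mapping, assuming
# the unique flow IDs and flow names above line up pairwise.
pprint(dict(zip(flow_ids, flow_names)))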

openml/datasets/functions.py

Lines changed: 2 additions & 2 deletions

@@ -237,7 +237,6 @@ def list_datasets(


 def _list_datasets(output_format='dict', **kwargs):
-
     """
     Perform api call to return a list of all datasets.

@@ -308,7 +307,8 @@ def _load_features_from_file(features_file: str) -> Dict:


 def check_datasets_active(dataset_ids: List[int]) -> Dict[int, bool]:
-    """ Check if the dataset ids provided are active.
+    """
+    Check if the dataset ids provided are active.

     Parameters
     ----------
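
For context, a minimal usage sketch of the function whose docstring is touched in the
second hunk, assuming check_datasets_active is exported from openml.datasets; the
dataset IDs below are illustrative examples only.

import openml

# Maps each requested dataset ID to a boolean indicating whether the dataset is active
active = openml.datasets.check_datasets_active([2, 61])
print(active)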
