Skip to content

Commit a2c1e7c

Browse files
authored
Merge pull request #181 from openml/feature/upload-run
Feature/upload run
2 parents 31bf79e + a972eeb commit a2c1e7c

4 files changed

Lines changed: 192 additions & 28 deletions

File tree

openml/flows/functions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,4 +25,4 @@ def get_flow(flow_id):
2525
if 'sklearn' in flow.external_version:
2626
flow.model = flow_to_sklearn(flow)
2727

28-
return flow
28+
return flow

openml/runs/functions.py

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import io
33
import os
44
import xmltodict
5+
from sklearn.model_selection._search import BaseSearchCV
56

67
from .. import config
78
from ..flows import sklearn_to_flow
@@ -56,19 +57,16 @@ def run_task(task, model):
5657
'only works for tasks with class labels.')
5758

5859
run = OpenMLRun(task_id=task.task_id, flow_id=flow_id,
59-
dataset_id=dataset.dataset_id)
60-
run.data_content = _run_task_get_arffcontent(model, task, class_labels)
60+
dataset_id=dataset.dataset_id, model=model)
61+
run.data_content, run.trace_content = _run_task_get_arffcontent(model, task, class_labels)
6162

62-
# The model will not be uploaded at the moment, but used to get the
63-
# hyperparameter values when uploading the run
64-
X, Y = task.get_X_and_y()
65-
run.model = model.fit(X, Y)
6663
return run
6764

6865

6966
def _run_task_get_arffcontent(model, task, class_labels):
7067
X, Y = task.get_X_and_y()
7168
arff_datacontent = []
69+
arff_tracecontent = []
7270

7371
rep_no = 0
7472
# TODO use different iterator to only provide a single iterator (less
@@ -83,6 +81,10 @@ def _run_task_get_arffcontent(model, task, class_labels):
8381
testY = Y[test_indices]
8482

8583
model.fit(trainX, trainY)
84+
if isinstance(model, BaseSearchCV):
85+
_add_results_to_arfftrace(arff_tracecontent, fold_no, model,
86+
rep_no)
87+
8688
ProbaY = model.predict_proba(testX)
8789
PredY = model.predict(testX)
8890

@@ -96,7 +98,24 @@ def _run_task_get_arffcontent(model, task, class_labels):
9698
fold_no = fold_no + 1
9799
rep_no = rep_no + 1
98100

99-
return arff_datacontent
101+
if not isinstance(model, BaseSearchCV):
102+
arff_tracecontent = None
103+
104+
return arff_datacontent, arff_tracecontent
105+
106+
107+
def _add_results_to_arfftrace(arff_tracecontent, fold_no, model, rep_no):
    """Append one trace row per evaluated parameter setting of a search.

    Each appended row has the layout
    ``[rep_no, fold_no, iteration, mean_test_score, selected, *params]``
    where ``params`` are the stringified values of every ``param_*`` entry
    of ``model.cv_results_``, in the iteration order of that mapping.

    Parameters
    ----------
    arff_tracecontent : list
        List of trace rows; extended in place.
    fold_no
        Fold number stored in the second column of every row.
    model
        Object exposing ``cv_results_`` and ``best_index_`` (the caller
        passes a fitted ``BaseSearchCV``).
    rep_no
        Repeat number stored in the first column of every row.
    """
    results = model.cv_results_
    # Only the ``param_*`` entries hold hyperparameter values.
    param_keys = [name for name in results if name.startswith("param_")]

    for itt_no, test_score in enumerate(results['mean_test_score']):
        # The OpenML server defines the flag as the strings 'true'/'false'.
        selected = 'true' if itt_no == model.best_index_ else 'false'
        row = [rep_no, fold_no, itt_no, test_score, selected]
        row.extend(str(results[name][itt_no]) for name in param_keys)
        arff_tracecontent.append(row)
100119

101120

102121
def get_runs(run_ids):

openml/runs/run.py

Lines changed: 107 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,13 @@
44

55
import arff
66
import xmltodict
7+
from sklearn.base import BaseEstimator
8+
from sklearn.model_selection._search import BaseSearchCV
79

10+
import openml
811
from ..tasks import get_task
912
from .._api_calls import _perform_api_call
10-
13+
from ..exceptions import PyOpenMLError
1114

1215
class OpenMLRun(object):
1316
"""OpenML Run: result of running a model on an openml dataset.
@@ -17,10 +20,10 @@ class OpenMLRun(object):
1720
FIXME
1821
1922
"""
20-
def __init__(self, task_id, flow_id, dataset_id, setup_string=None,
23+
def __init__(self, task_id, flow_id, dataset_id, setup_string=None,
2124
files=None, setup_id=None, tags=None, uploader=None, uploader_name=None,
2225
evaluations=None, detailed_evaluations=None,
23-
data_content=None, model=None, task_type=None,
26+
data_content=None, trace_content=None, model=None, task_type=None,
2427
task_evaluation_measure=None, flow_name=None,
2528
parameter_settings=None, predictions_url=None, task=None,
2629
flow=None, run_id=None):
@@ -39,12 +42,14 @@ def __init__(self, task_id, flow_id, dataset_id, setup_string=None,
3942
self.evaluations = evaluations
4043
self.detailed_evaluations = detailed_evaluations
4144
self.data_content = data_content
45+
self.trace_content = trace_content
4246
self.task = task
4347
self.flow = flow
4448
self.run_id = run_id
49+
self.model = model
4550

4651
def _generate_arff_dict(self):
47-
"""Generates the arff dictionary for upload to the server.
52+
"""Generates the arff dictionary for uploading predictions to the server.
4853
4954
Assumes that the run has been executed.
5055
@@ -74,6 +79,48 @@ def _generate_arff_dict(self):
7479
arff_dict['relation'] = 'openml_task_' + str(task.task_id) + '_predictions'
7580
return arff_dict
7681

82+
def _generate_trace_arff_dict(self, model):
    """Generates the arff dictionary for uploading the optimization trace.

    Assumes that the run has been executed, i.e. that ``self.trace_content``
    has been filled.

    Parameters
    ----------
    model
        The search object whose ``cv_results_`` describes the evaluated
        parameter settings. Must be an instance of ``BaseSearchCV``.

    Returns
    -------
    arff_dict : dict
        Dictionary representation of the ARFF file that will be uploaded.
        Contains information about the optimization trace: five fixed
        attributes (repeat, fold, iteration, evaluation, selected) followed
        by one ``parameter_<name>`` attribute per ``param_<name>`` entry of
        ``model.cv_results_``.

    Raises
    ------
    ValueError
        If the run holds no trace content.
    PyOpenMLError
        If ``model`` is not a ``BaseSearchCV``.
    """
    if self.trace_content is None:
        raise ValueError('No trace content avaiable.')
    if not isinstance(model, BaseSearchCV):
        raise PyOpenMLError('Cannot generate trace on provided classifier. (This should never happen.)')

    arff_dict = {}
    arff_dict['attributes'] = [('repeat', 'NUMERIC'),
                               ('fold', 'NUMERIC'),
                               ('iteration', 'NUMERIC'),
                               ('evaluation', 'NUMERIC'),
                               ('selected', ['true', 'false'])]
    for key in model.cv_results_:
        if not key.startswith("param_"):
            continue

        values = model.cv_results_[key]
        # bool must be tested before int/float: bool is a subclass of int.
        if all(isinstance(i, bool) for i in values):
            attribute_type = ['True', 'False']
        elif all(isinstance(i, (int, float)) for i in values):
            attribute_type = 'NUMERIC'
        else:
            # Nominal attribute: unique string representations, sorted so
            # that the generated ARFF header is deterministic. Deduplicating
            # on the string form also copes with unhashable values.
            attribute_type = sorted({str(i) for i in values})

        # strip the leading "param_" (6 characters) from the key
        arff_dict['attributes'].append(("parameter_" + key[6:], attribute_type))

    arff_dict['data'] = self.trace_content
    arff_dict['relation'] = 'openml_task_' + str(self.task_id) + '_predictions'

    return arff_dict
123+
77124
def publish(self):
78125
"""Publish a run to the OpenML server.
79126
@@ -84,10 +131,18 @@ def publish(self):
84131
-------
85132
self : OpenMLRun
86133
"""
134+
if self.model is None:
135+
raise PyOpenMLError("OpenMLRun obj does not contain a model. (This should never happen.) ");
136+
87137
predictions = arff.dumps(self._generate_arff_dict())
88138
description_xml = self._create_description_xml()
89-
file_elements = {'predictions': ("predictions.csv", predictions),
139+
140+
file_elements = {'predictions': ("predictions.arff", predictions),
90141
'description': ("description.xml", description_xml)}
142+
if self.trace_content is not None:
143+
trace_arff = arff.dumps(self._generate_trace_arff_dict(self.model))
144+
file_elements['trace'] = ("trace.arff", trace_arff)
145+
91146
return_code, return_value = _perform_api_call(
92147
"/run/", file_elements=file_elements)
93148
run_id = int(xmltodict.parse(return_value)['oml:upload_run']['oml:run_id'])
@@ -104,7 +159,11 @@ def _create_description_xml(self):
104159
"""
105160
run_environment = _get_version_information()
106161

107-
parameter_settings = self.model.get_params()
162+
# TODO: don't we have flow object in data structure? Use this one
163+
downloaded_flow = openml.flows.get_flow(self.flow_id)
164+
165+
openml_param_settings = _parse_parameters(self.model, downloaded_flow)
166+
108167
# as a tag, it must be of the form ([a-zA-Z0-9_\-\.])+
109168
# so we format time from 'mm/dd/yy hh:mm:ss' to 'mm-dd-yy_hh.mm.ss'
110169
well_formatted_time = time.strftime("%c").replace(
@@ -113,11 +172,51 @@ def _create_description_xml(self):
113172
[self.model.__module__ + "." + self.model.__class__.__name__]
114173
description = _to_dict(taskid=self.task_id, flow_id=self.flow_id,
115174
setup_string=_create_setup_string(self.model),
116-
parameter_settings=parameter_settings,
175+
parameter_settings=openml_param_settings,
117176
tags=tags)
118177
description_xml = xmltodict.unparse(description, pretty=True)
119178
return description_xml
120179

180+
def _parse_parameters(model, flow):
    """Extracts all parameter settings from a model in OpenML format.

    Parameters
    ----------
    model
        the scikit-learn model (fitted)
    flow
        openml flow object (containing flow ids, i.e., it has to be
        downloaded from the server)

    Returns
    -------
    openml_param_settings : list
        One ``OrderedDict`` per parameter with the keys ``oml:name``,
        ``oml:value`` (string representation of the value) and
        ``oml:component`` (id of the flow the parameter belongs to).
        Parameters of sub-estimators precede the entry of the sub-estimator
        itself.
    """
    openml_param_settings = []

    for param, value in model.get_params().items():
        if "__" in param:
            # Parameter of a subflow; it is emitted by the recursive call
            # on the corresponding sub-estimator below.
            continue
        if isinstance(value, BaseEstimator):
            # extract parameters of the subflow individually
            openml_param_settings += _parse_parameters(
                value, flow.components[param])

        # add parameter setting (also for the subflow itself)
        param_dict = OrderedDict()
        param_dict['oml:name'] = param
        param_dict['oml:value'] = str(value)
        # Use this flow's id directly: a name -> id lookup table over all
        # descendants is redundant and returns the wrong id whenever a
        # descendant flow carries the same name.
        param_dict['oml:component'] = flow.flow_id
        openml_param_settings.append(param_dict)

    return openml_param_settings
219+
121220
################################################################################
122221
# Functions which cannot be in runs/functions due to circular imports
123222

@@ -169,15 +268,7 @@ def _to_dict(taskid, flow_id, setup_string, parameter_settings, tags):
169268
description['oml:run']['@xmlns:oml'] = 'http://openml.org/openml'
170269
description['oml:run']['oml:task_id'] = taskid
171270
description['oml:run']['oml:flow_id'] = flow_id
172-
173-
params = []
174-
for k, v in parameter_settings.items():
175-
param_dict = OrderedDict()
176-
param_dict['oml:name'] = k
177-
param_dict['oml:value'] = ('None' if v is None else v)
178-
params.append(param_dict)
179-
180-
description['oml:run']['oml:parameter_setting'] = params
271+
description['oml:run']['oml:parameter_setting'] = parameter_settings
181272
description['oml:run']['oml:tag'] = tags # Tags describing the run
182273
# description['oml:run']['oml:output_data'] = 0;
183274
# all data that was output of this run, which can be evaluation scores

tests/runs/test_run_functions.py

Lines changed: 58 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,68 @@
11
from sklearn.linear_model import LogisticRegression, SGDClassifier
2+
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
3+
from sklearn.svm import SVC
4+
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
25
import openml
36
import openml.exceptions
47
from openml.testing import TestBase
58

69

710
class TestRun(TestBase):
8-
def test_run_iris(self):
9-
task = openml.tasks.get_task(10107)
10-
clf = LogisticRegression()
11+
12+
def _perform_run(self, task_id, num_instances, clf):
    """Run ``clf`` on an OpenML task, publish the run and check the result.

    Parameters
    ----------
    task_id
        Id of the task passed to ``openml.tasks.get_task``.
    num_instances
        Expected number of rows in ``run.data_content``.
    clf
        Model handed to ``openml.runs.run_task``.

    Returns
    -------
    run
        The run returned by ``openml.runs.run_task``, after publishing.
    """
    run = openml.runs.run_task(openml.tasks.get_task(task_id), clf)
    published = run.publish()
    self.assertEqual(published, run)
    self.assertIsInstance(run.dataset_id, int)

    # check arff output: the prediction content must have the expected size
    n_rows = len(run.data_content)
    self.assertEqual(n_rows, num_instances)
    return run
22+
23+
24+
def test_run_iris(self):
    """Run and publish a default LogisticRegression on task 10107."""
    self._perform_run(task_id=10107, num_instances=150,
                      clf=LogisticRegression())
30+
31+
32+
def test_run_optimize_randomforest_iris(self):
    """Randomized search over a random forest must yield a full trace.

    The trace is expected to contain one row per search iteration for each
    of the task's folds.
    """
    task_id, num_instances = 10107, 150
    num_folds, num_iterations = 10, 5

    search_space = {
        "max_depth": [3, None],
        "max_features": list(range(1, 5)),
        "min_samples_split": list(range(2, 11)),
        "min_samples_leaf": list(range(1, 11)),
        "bootstrap": [True, False],
        "criterion": ["gini", "entropy"],
    }
    random_search = RandomizedSearchCV(
        RandomForestClassifier(n_estimators=10),
        search_space,
        n_iter=num_iterations)

    run = self._perform_run(task_id, num_instances, random_search)
    expected_rows = num_iterations * num_folds
    self.assertEqual(len(run.trace_content), expected_rows)
49+
50+
def test_run_optimize_bagging_iris(self):
    """Grid search over a bagged SVC must yield a full trace.

    Also exercises nested (``base_estimator__*``) parameters. The trace is
    expected to contain one row per grid point for each of the task's folds.
    """
    task_id = 10107
    num_instances = 150
    num_folds = 10
    num_iterations = 36  # (num values for C times gamma)

    # The task itself is fetched inside _perform_run; no need to download
    # it a second time here.
    bag = BaggingClassifier(base_estimator=SVC())
    param_dist = {"base_estimator__C": [0.001, 0.01, 0.1, 1, 10, 100],
                  "base_estimator__gamma": [0.001, 0.01, 0.1, 1, 10, 100]}
    grid_search = GridSearchCV(bag, param_dist)

    run = self._perform_run(task_id, num_instances, grid_search)
    self.assertEqual(len(run.trace_content), num_iterations * num_folds)
64+
65+
1666
def test__run_task_get_arffcontent(self):
1767
task = openml.tasks.get_task(1939)
1868
class_labels = task.class_labels
@@ -24,9 +74,13 @@ def test__run_task_get_arffcontent(self):
2474
clf, task, class_labels)
2575

2676
clf = SGDClassifier(loss='log', random_state=1)
27-
arff_datacontent = openml.runs.functions._run_task_get_arffcontent(
77+
arff_datacontent, arff_tracecontent = openml.runs.functions._run_task_get_arffcontent(
2878
clf, task, class_labels)
79+
# predictions
2980
self.assertIsInstance(arff_datacontent, list)
81+
# trace. SGD does not produce any
82+
self.assertIsInstance(arff_tracecontent, type(None))
83+
3084
# 10 times 10 fold CV of 150 samples
3185
self.assertEqual(len(arff_datacontent), 1500)
3286
for arff_line in arff_datacontent:

0 commit comments

Comments
 (0)