Skip to content

Commit 3e9a80f

Browse files
committed
Base functionalities for uploading runs and optimization traces
1 parent 604d01e commit 3e9a80f

6 files changed

Lines changed: 118 additions & 30 deletions

File tree

openml/flows/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
from .flow import OpenMLFlow, create_flow_from_model
2-
from .functions import get_flow
2+
from .functions import get_flow, get_flow_dict
33

4-
__all__ = ['OpenMLFlow', 'create_flow_from_model', 'get_flow']
4+
__all__ = ['OpenMLFlow', 'create_flow_from_model', 'get_flow', 'get_flow_dict']

openml/flows/flow.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -394,19 +394,16 @@ def _ensure_flow_exists(self):
394394
# TODO add numpy and scipy version!
395395

396396
if int(flow_id) == -1:
397-
return_code, response_xml = self.publish()
397+
flow = self.publish()
398398

399-
response_dict = xmltodict.parse(response_xml)
400-
flow_id = response_dict['oml:upload_flow']['oml:id']
401-
return int(flow_id)
399+
return int(flow.flow_id)
402400

403401
return int(flow_id)
404402

405403
def _get_name(self):
406404
"""Helper function. Can be mocked for testing."""
407405
return self.name
408406

409-
410407
def create_flow_from_model(model, converter, description=None):
411408
"""Use a converter to create an OpenMLFlow from model.
412409

openml/flows/functions.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import xmltodict
22

33
from openml._api_calls import _perform_api_call
4+
from openml.exceptions import PyOpenMLError
45
from . import OpenMLFlow
56
from ..util import URLError
67

@@ -34,3 +35,18 @@ def get_flow(flow_id, converter=None):
3435
flow.model = model
3536

3637
return flow
38+
39+
40+
def get_flow_dict(flow):
    """Map the name of a flow and of all its nested components to flow ids.

    Parameters
    ----------
    flow : OpenMLFlow
        Flow whose ``flow_id`` is set. Every flow reachable through
        ``flow.components`` must have its ``flow_id`` set as well.

    Returns
    -------
    dict
        ``{flow name: flow id}`` for ``flow`` itself and, recursively, for
        every flow stored in ``flow.components``. If two flows share a name,
        the one visited last wins.

    Raises
    ------
    PyOpenMLError
        If ``flow`` or any of its components has no ``flow_id``.
    """
    if flow.flow_id is None:
        raise PyOpenMLError("Can only invoke function 'get_flow_dict' on a server downloaded flow. ")
    flow_map = {flow.name: flow.flow_id}
    # Only the component objects matter here, not the keys they are stored
    # under; a flow without components (None or empty) contributes nothing.
    for subflow in (flow.components or {}).values():
        flow_map.update(get_flow_dict(subflow))
    return flow_map

openml/runs/functions.py

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import io
33
import os
44
import xmltodict
5+
from sklearn.grid_search import BaseSearchCV
56

67
from .. import config
78
from ..flows import create_flow_from_model
@@ -57,19 +58,16 @@ def run_task(task, model):
5758
'only works for tasks with class labels.')
5859

5960
run = OpenMLRun(task_id=task.task_id, flow_id=flow_id,
60-
dataset_id=dataset.dataset_id)
61-
run.data_content = _run_task_get_arffcontent(model, task, class_labels)
61+
dataset_id=dataset.dataset_id, model=model)
62+
run.data_content, run.trace_content = _run_task_get_arffcontent(model, task, class_labels)
6263

63-
# The model will not be uploaded at the moment, but used to get the
64-
# hyperparameter values when uploading the run
65-
X, Y = task.get_X_and_y()
66-
run.model = model.fit(X, Y)
6764
return run
6865

6966

7067
def _run_task_get_arffcontent(model, task, class_labels):
7168
X, Y = task.get_X_and_y()
7269
arff_datacontent = []
70+
arff_tracecontent = []
7371

7472
rep_no = 0
7573
# TODO use different iterator to only provide a single iterator (less
@@ -84,6 +82,16 @@ def _run_task_get_arffcontent(model, task, class_labels):
8482
testY = Y[test_indices]
8583

8684
model.fit(trainX, trainY)
85+
if isinstance(model, BaseSearchCV):
86+
for itt_no in range(0, len(model.grid_scores_)):
87+
current = model.grid_scores_[itt_no]
88+
# we use the string values for True and False, as it is defined in this way by the OpenML server
89+
selected = 'false'
90+
if current.parameters == model.best_params_:
91+
selected = 'true'
92+
arff_line = [rep_no, fold_no, itt_no, current.parameters, current.mean_validation_score, selected]
93+
arff_tracecontent.append(arff_line)
94+
8795
ProbaY = model.predict_proba(testX)
8896
PredY = model.predict(testX)
8997

@@ -97,7 +105,10 @@ def _run_task_get_arffcontent(model, task, class_labels):
97105
fold_no = fold_no + 1
98106
rep_no = rep_no + 1
99107

100-
return arff_datacontent
108+
if not isinstance(model, BaseSearchCV):
109+
arff_tracecontent = None
110+
111+
return arff_datacontent, arff_tracecontent
101112

102113

103114
def get_runs(run_ids):

openml/runs/run.py

Lines changed: 60 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,12 @@
44

55
import arff
66
import xmltodict
7+
from sklearn.base import BaseEstimator
78

9+
import openml
810
from ..tasks import get_task
911
from .._api_calls import _perform_api_call
10-
12+
from ..exceptions import PyOpenMLError
1113

1214
class OpenMLRun(object):
1315
"""OpenML Run: result of running a model on an openml dataset.
@@ -17,10 +19,10 @@ class OpenMLRun(object):
1719
FIXME
1820
1921
"""
20-
def __init__(self, task_id, flow_id, dataset_id, setup_string=None,
22+
def __init__(self, task_id, flow_id, dataset_id, setup_string=None,
2123
files=None, setup_id=None, tags=None, uploader=None, uploader_name=None,
2224
evaluations=None, detailed_evaluations=None,
23-
data_content=None, model=None, task_type=None,
25+
data_content=None, trace_content=None, model=None, task_type=None,
2426
task_evaluation_measure=None, flow_name=None,
2527
parameter_settings=None, predictions_url=None, task=None,
2628
flow=None, run_id=None):
@@ -39,12 +41,14 @@ def __init__(self, task_id, flow_id, dataset_id, setup_string=None,
3941
self.evaluations = evaluations
4042
self.detailed_evaluations = detailed_evaluations
4143
self.data_content = data_content
44+
self.trace_content = trace_content
4245
self.task = task
4346
self.flow = flow
4447
self.run_id = run_id
48+
self.model = model
4549

4650
def _generate_arff_dict(self):
47-
"""Generates the arff dictionary for upload to the server.
51+
"""Generates the arff dictionary for uploading predictions to the server.
4852
4953
Assumes that the run has been executed.
5054
@@ -74,6 +78,20 @@ def _generate_arff_dict(self):
7478
arff_dict['relation'] = 'openml_task_' + str(task.task_id) + '_predictions'
7579
return arff_dict
7680

81+
def _generate_trace_arff_dict(self):
82+
if self.trace_content is None:
83+
raise ValueError('No trace content avaiable. (This should never happen.)')
84+
arff_dict = {}
85+
arff_dict['attributes'] = [('repeat', 'NUMERIC'),
86+
('fold', 'NUMERIC'),
87+
('iteration', 'NUMERIC'),
88+
('setup_string', 'STRING'),
89+
('evaluation', 'NUMERIC'),
90+
('selected', ['true', 'false'])]
91+
arff_dict['data'] = self.trace_content
92+
arff_dict['relation'] = 'openml_task_' + str(self.task_id) + '_predictions'
93+
return arff_dict
94+
7795
def publish(self):
7896
"""Publish a run to the OpenML server.
7997
@@ -84,10 +102,18 @@ def publish(self):
84102
-------
85103
self : OpenMLRun
86104
"""
105+
if self.model is None:
106+
raise PyOpenMLError("OpenMLRun obj does not contain a model. (This should never happen.) ");
107+
87108
predictions = arff.dumps(self._generate_arff_dict())
88109
description_xml = self._create_description_xml()
89-
file_elements = {'predictions': ("predictions.csv", predictions),
110+
111+
file_elements = {'predictions': ("predictions.arff", predictions),
90112
'description': ("description.xml", description_xml)}
113+
if self.trace_content is not None:
114+
trace_arff = arff.dumps(self._generate_trace_arff_dict())
115+
file_elements['trace'] = ("trace.arff", trace_arff)
116+
91117
return_code, return_value = _perform_api_call(
92118
"/run/", file_elements=file_elements)
93119
run_id = int(xmltodict.parse(return_value)['oml:upload_run']['oml:run_id'])
@@ -104,7 +130,11 @@ def _create_description_xml(self):
104130
"""
105131
run_environment = _get_version_information()
106132

107-
parameter_settings = self.model.get_params()
133+
# TODO: don't we have flow object in data structure? Use this one
134+
downloaded_flow = openml.flows.get_flow(self.flow_id)
135+
136+
openml_param_settings = _parse_parameters(self.model, downloaded_flow)
137+
108138
# as a tag, it must be of the form ([a-zA-Z0-9_\-\.])+
109139
# so we format time from 'mm/dd/yy hh:mm:ss' to 'mm-dd-yy_hh.mm.ss'
110140
well_formatted_time = time.strftime("%c").replace(
@@ -113,11 +143,33 @@ def _create_description_xml(self):
113143
[self.model.__module__ + "." + self.model.__class__.__name__]
114144
description = _to_dict(taskid=self.task_id, flow_id=self.flow_id,
115145
setup_string=_create_setup_string(self.model),
116-
parameter_settings=parameter_settings,
146+
parameter_settings=openml_param_settings,
117147
tags=tags)
118148
description_xml = xmltodict.unparse(description, pretty=True)
119149
return description_xml
120150

151+
def _parse_parameters(model, flow):
    """Translate the top-level parameters of a model into OpenML settings.

    Parameters
    ----------
    model : object
        Object providing ``get_params()`` that returns a mapping from
        parameter name to value.
    flow : OpenMLFlow
        Flow belonging to ``model``; it must carry a flow id, because the id
        is resolved through ``openml.flows.get_flow_dict``.

    Returns
    -------
    list of OrderedDict
        One entry per parameter whose name does not contain ``'__'``, with
        the keys 'oml:name', 'oml:value' (``str()`` of the value) and
        'oml:component' (the id of ``flow``).
    """
    python_param_settings = model.get_params()
    # Every setting emitted below belongs to the top-level flow, so its id is
    # resolved once instead of once per parameter.
    component_id = openml.flows.get_flow_dict(flow)[flow.name]

    openml_param_settings = []
    for name, value in python_param_settings.items():
        if "__" in name:
            # Parameter of a subflow: skipped here, not uploaded yet.
            continue

        # Parameters that are themselves estimators are reported like any
        # other parameter, through their str() representation.
        # TODO: extract the parameters of such subflows individually.
        param_dict = OrderedDict()
        param_dict['oml:name'] = name
        param_dict['oml:value'] = str(value)
        param_dict['oml:component'] = component_id
        openml_param_settings.append(param_dict)

    return openml_param_settings
172+
121173
################################################################################
122174
# Functions which cannot be in runs/functions due to circular imports
123175

@@ -169,15 +221,7 @@ def _to_dict(taskid, flow_id, setup_string, parameter_settings, tags):
169221
description['oml:run']['@xmlns:oml'] = 'http://openml.org/openml'
170222
description['oml:run']['oml:task_id'] = taskid
171223
description['oml:run']['oml:flow_id'] = flow_id
172-
173-
params = []
174-
for k, v in parameter_settings.items():
175-
param_dict = OrderedDict()
176-
param_dict['oml:name'] = k
177-
param_dict['oml:value'] = ('None' if v is None else v)
178-
params.append(param_dict)
179-
180-
description['oml:run']['oml:parameter_setting'] = params
224+
description['oml:run']['oml:parameter_setting'] = parameter_settings
181225
description['oml:run']['oml:tag'] = tags # Tags describing the run
182226
# description['oml:run']['oml:output_data'] = 0;
183227
# all data that was output of this run, which can be evaluation scores

tests/runs/test_run_functions.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
from sklearn.linear_model import LogisticRegression, SGDClassifier
2+
from sklearn.ensemble import RandomForestClassifier
3+
from sklearn.grid_search import RandomizedSearchCV
24
import openml
35
from openml.testing import TestBase
46

@@ -12,6 +14,24 @@ def test_run_iris(self):
1214
self.assertEqual(run_, run)
1315
self.assertIsInstance(run.dataset_id, int)
1416

17+
def test_run_optimize_randomforest_iris(self):
    """Run a randomized search over a random forest on task 10107 and publish it."""
    search_space = {
        "max_depth": [3, None],
        "max_features": list(range(1, 5)),
        "min_samples_split": list(range(1, 11)),
        "min_samples_leaf": list(range(1, 11)),
        "bootstrap": [True, False],
        "criterion": ["gini", "entropy"],
    }
    base_forest = RandomForestClassifier(n_estimators=5)
    searcher = RandomizedSearchCV(base_forest,
                                  param_distributions=search_space,
                                  n_iter=20)

    target_task = openml.tasks.get_task(10107)
    run = openml.runs.run_task(target_task, searcher)

    # publish() is expected to hand back the very run it was called on.
    published = run.publish()
    self.assertEqual(published, run)
    self.assertIsInstance(run.dataset_id, int)
34+
1535
def test__run_task_get_arffcontent(self):
1636
task = openml.tasks.get_task(1939)
1737
class_labels = task.class_labels

0 commit comments

Comments
 (0)