Skip to content

Commit ff2fa4b

Browse files
committed
restructured flow mechanism (flow exists is now an independent function, as it never relied on the flow anyway)
bug fix for flow exists and setup exists - added unit tests for flow exists and setup exists (cases where it exists and where it does not exist)
1 parent 35c66b5 commit ff2fa4b

7 files changed

Lines changed: 182 additions & 86 deletions

File tree

openml/flows/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from .flow import OpenMLFlow
22
from .sklearn_converter import sklearn_to_flow, flow_to_sklearn
3-
from .functions import get_flow, list_flows
3+
from .functions import get_flow, list_flows, flow_exists
44

55
__all__ = ['OpenMLFlow', 'create_flow_from_model', 'get_flow', 'list_flows',
6-
'sklearn_to_flow', 'flow_to_sklearn']
6+
'sklearn_to_flow', 'flow_to_sklearn', 'flow_exists']

openml/flows/flow.py

Lines changed: 0 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -341,38 +341,6 @@ def publish(self):
341341
return self
342342

343343

344-
def _check_flow_exists(name, version):
345-
"""Retrieves the flow id of the flow uniquely identified by name+version.
346-
347-
Parameter
348-
---------
349-
name : string
350-
Name of the flow
351-
version : string
352-
Version information associated with flow.
353-
354-
Returns
355-
-------
356-
flow_exist : int
357-
Flow id or -1 if the flow doesn't exist.
358-
359-
Notes
360-
-----
361-
see http://www.openml.org/api_docs/#!/flow/get_flow_exists_name_version
362-
"""
363-
if not (type(name) is str and len(name) > 0):
364-
raise ValueError('Argument \'name\' should be a non-empty string')
365-
if not (type(version) is str and len(version) > 0):
366-
raise ValueError('Argument \'version\' should be a non-empty string')
367-
368-
xml_response = _perform_api_call("flow/exists",
369-
data={'name': name, 'external_version': version})
370-
371-
xml_dict = xmltodict.parse(xml_response)
372-
flow_id = xml_dict['oml:flow_exists']['oml:id']
373-
return xml_response, flow_id
374-
375-
376344
def _add_if_nonempty(dic, key, value):
377345
if value is not None:
378346
dic[key] = value

openml/flows/functions.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,41 @@ def list_flows(offset=None, size=None, tag=None):
6969
return _list_flows(api_call)
7070

7171

72+
def flow_exists(name, version):
    """Retrieve the id of the flow uniquely identified by name + version.

    Parameters
    ----------
    name : string
        Name of the flow.
    version : string
        Version information associated with the flow. It is sent to the
        server as ``external_version``.

    Returns
    -------
    flow_exist : int or bool
        The flow id if the server answers with a positive id,
        ``False`` otherwise.

    Raises
    ------
    ValueError
        If ``name`` or ``version`` is not a non-empty string.

    Notes
    -----
    see http://www.openml.org/api_docs/#!/flow/get_flow_exists_name_version
    """
    # isinstance (instead of an exact type comparison) also accepts
    # subclasses of str; validation happens before any server call is made.
    if not (isinstance(name, str) and len(name) > 0):
        raise ValueError('Argument \'name\' should be a non-empty string')
    if not (isinstance(version, str) and len(version) > 0):
        raise ValueError('Argument \'version\' should be a non-empty string')

    xml_response = _perform_api_call(
        "flow/exists",
        data={'name': name, 'external_version': version})

    result_dict = xmltodict.parse(xml_response)
    flow_id = int(result_dict['oml:flow_exists']['oml:id'])

    # Any non-positive id is treated as "flow not found".
    if flow_id > 0:
        return flow_id
    return False
105+
106+
72107
def _list_flows(api_call):
73108
# TODO add proper error handling here!
74109
xml_string = _perform_api_call(api_call)

openml/runs/functions.py

Lines changed: 34 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
from ..exceptions import PyOpenMLError
1111
from .. import config
12-
from ..flows import sklearn_to_flow, get_flow
12+
from ..flows import sklearn_to_flow, get_flow, flow_exists
1313
from ..setups import setup_exists
1414
from ..exceptions import OpenMLCacheException, OpenMLServerException
1515
from ..util import URLError
@@ -47,11 +47,11 @@ def run_task(task, model):
4747
flow = sklearn_to_flow(model)
4848

4949
# returns flow id if the flow exists on the server, False otherwise
50-
_, flow_id = openml.flows._check_flow_exists(flow.name, flow.external_version)
50+
flow_id = flow_exists(flow.name, flow.external_version)
5151

5252
# skips the run if it already exists and the user opts for this in the config file.
5353
# also, if the flow is not present on the server, the check is not needed.
54-
if config.avoid_duplicate_runs and flow_id > 0:
54+
if config.avoid_duplicate_runs and flow_id:
5555
flow = get_flow(flow_id)
5656
setup_id = setup_exists(flow, model)
5757
ids = _run_exists(task.task_id, setup_id)
@@ -70,13 +70,17 @@ def run_task(task, model):
7070
run = OpenMLRun(task_id=task.task_id, flow_id=None, dataset_id=dataset.dataset_id, model=model)
7171
run.data_content, run.trace_content = _run_task_get_arffcontent(model, task, class_labels)
7272

73-
if flow_id < 0:
74-
flow.publish()
75-
config.logger.info(flow_id)
76-
77-
# attach the flow to the run
78-
run.flow_id = flow_id
73+
if flow_id == False:
74+
# means the flow did not exists.
75+
# As we could run it, publish it now
76+
flow = flow.publish()
77+
else:
78+
# flow already existed, download it from server
79+
# TODO (neccessary? is this a post condition of this function)
80+
flow = get_flow(flow_id)
7981

82+
run.flow_id = flow.flow_id
83+
config.logger.info('Executed Task %d with Flow id: %d' %(task.task_id, run.flow_id))
8084

8185
return run
8286

@@ -311,27 +315,28 @@ def _create_run_from_xml(xml):
311315
evaluations = dict()
312316
detailed_evaluations = defaultdict(lambda: defaultdict(dict))
313317
evaluation_flows = dict()
314-
for evaluation_dict in run['oml:output_data']['oml:evaluation']:
315-
key = evaluation_dict['oml:name']
316-
if 'oml:value' in evaluation_dict:
317-
value = float(evaluation_dict['oml:value'])
318-
elif 'oml:array_data' in evaluation_dict:
319-
value = evaluation_dict['oml:array_data']
320-
else:
321-
raise ValueError('Could not find keys "value" or "array_data" '
322-
'in %s' % str(evaluation_dict.keys()))
323-
324-
if '@repeat' in evaluation_dict and '@fold' in evaluation_dict:
325-
repeat = int(evaluation_dict['@repeat'])
326-
fold = int(evaluation_dict['@fold'])
327-
repeat_dict = detailed_evaluations[key]
328-
fold_dict = repeat_dict[repeat]
329-
fold_dict[fold] = value
330-
else:
331-
evaluations[key] = value
332-
evaluation_flows[key] = flow_id
318+
if 'oml:output_data' in run and 'oml:evaluation' in run['oml:output_data']:
319+
for evaluation_dict in run['oml:output_data']['oml:evaluation']:
320+
key = evaluation_dict['oml:name']
321+
if 'oml:value' in evaluation_dict:
322+
value = float(evaluation_dict['oml:value'])
323+
elif 'oml:array_data' in evaluation_dict:
324+
value = evaluation_dict['oml:array_data']
325+
else:
326+
raise ValueError('Could not find keys "value" or "array_data" '
327+
'in %s' % str(evaluation_dict.keys()))
328+
329+
if '@repeat' in evaluation_dict and '@fold' in evaluation_dict:
330+
repeat = int(evaluation_dict['@repeat'])
331+
fold = int(evaluation_dict['@fold'])
332+
repeat_dict = detailed_evaluations[key]
333+
fold_dict = repeat_dict[repeat]
334+
fold_dict[fold] = value
335+
else:
336+
evaluations[key] = value
337+
evaluation_flows[key] = flow_id
333338

334-
evaluation_flows[key] = flow_id
339+
evaluation_flows[key] = flow_id
335340

336341
return OpenMLRun(run_id=run_id, uploader=uploader,
337342
uploader_name=uploader_name, task_id=task_id,

openml/setups/functions.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,20 @@ def setup_exists(downloaded_flow, sklearn_model):
77
'''
88
Checks whether a flow / hyperparameter configuration already exists on the server
99
10-
:param downloaded_flow:
10+
Parameter
11+
---------
12+
13+
downloaded_flow : flow
1114
the openml flow object (should be downloaded from server.
1215
Otherwise also give flow id parameter)
13-
:param sklearn_model: obvious
14-
:param flow_id: int
15-
:return: int setup id iff exists, False otherwise
16+
sklearn_model : BaseEstimator
17+
The base estimator that was used to create the flow. Will
18+
be used to extract parameter settings from.
19+
20+
Returns
21+
-------
22+
setup_id : int s
23+
setup id iff exists, False otherwise
1624
'''
1725

1826
# sadly, this api call relies on a run object
@@ -23,10 +31,11 @@ def setup_exists(downloaded_flow, sklearn_model):
2331
result = openml._api_calls._perform_api_call('/setup/exists/',
2432
file_elements = file_elements)
2533
result_dict = xmltodict.parse(result)
26-
if 'oml:id' in result_dict['oml:setup_exists']:
27-
return int(result_dict['oml:setup_exists']['oml:id'])
34+
setup_id = int(result_dict['oml:setup_exists']['oml:id'])
35+
if setup_id > 0:
36+
return setup_id
2837
else:
29-
return False
38+
return False;
3039

3140

3241
def _to_dict(flow_id, openml_parameter_settings):

tests/test_flows/test_flow.py

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import hashlib
33
import re
44
import time
5+
import random
56
import unittest
67

78
import xmltodict
@@ -16,6 +17,7 @@
1617
import sklearn.model_selection
1718
import sklearn.pipeline
1819
import sklearn.preprocessing
20+
import sklearn.naive_bayes
1921
import sklearn.tree
2022

2123
from openml.testing import TestBase
@@ -174,24 +176,27 @@ def test_illegal_flow(self):
174176
('classif', sklearn.tree.DecisionTreeClassifier())])
175177
self.assertRaises(ValueError, openml.flows.sklearn_to_flow, illegal)
176178

177-
def test_ensure_flow_exists(self):
178-
sentinel = get_sentinel()
179+
def test_nonexiting_flow_exists(self):
180+
name = get_sentinel() + get_sentinel()
181+
version = get_sentinel()
179182

180-
flow = openml.OpenMLFlow(name='Test',
181-
description="test description",
182-
model=sklearn.dummy.DummyClassifier(),
183-
components=collections.OrderedDict(),
184-
parameters=collections.OrderedDict(),
185-
parameters_meta_info=collections.OrderedDict(),
186-
external_version=_format_external_version(
187-
'sklearn', sklearn.__version__),
188-
tags=[],
189-
language='English',
190-
dependencies='')
183+
flow_id = openml.flows.flow_exists(name, version)
184+
self.assertEquals(flow_id, False)
185+
186+
def test_exiting_flow_exists(self):
187+
# create a flow
188+
sentinel = get_sentinel()
189+
nb = sklearn.naive_bayes.GaussianNB()
190+
flow = openml.flows.sklearn_to_flow(nb)
191191
flow.name = 'TEST%s%s' % (sentinel, flow.name)
192-
flow_id = flow._ensure_flow_exists()
193-
self.assertIsInstance(flow_id, int)
194-
self.assertEqual(flow._ensure_flow_exists(), flow_id)
192+
193+
flow = flow.publish()
194+
195+
# check if flow exists can find it
196+
flow = openml.flows.get_flow(flow.flow_id)
197+
downloaded_flow_id = openml.flows.flow_exists(flow.name, flow.external_version)
198+
self.assertEquals(downloaded_flow_id, flow.flow_id)
199+
195200

196201
def test_sklearn_to_upload_to_flow(self):
197202
iris = sklearn.datasets.load_iris()
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
import sys
2+
import hashlib
3+
import time
4+
5+
import openml
6+
import openml.exceptions
7+
from openml.testing import TestBase
8+
9+
if sys.version_info[0] >= 3:
10+
from unittest import mock
11+
else:
12+
import mock
13+
14+
15+
def get_sentinel():
    """Create a unique prefix for a flow name.

    Necessary because the flow is identified by its name and external
    version online. Having a unique name allows us to publish the same
    flow in each test run.

    Returns
    -------
    sentinel : str
        The literal ``'TEST'`` followed by 10 hexadecimal characters.
    """
    # Local import so this helper does not depend on the module's import block.
    import uuid

    md5 = hashlib.md5()
    md5.update(str(time.time()).encode('utf-8'))
    # The timestamp alone can repeat when the helper is called several times
    # in quick succession; random bytes keep every sentinel distinct.
    md5.update(uuid.uuid4().bytes)
    return 'TEST%s' % md5.hexdigest()[:10]
24+
25+
26+
27+
class TestRun(TestBase):
    """Tests for ``openml.setups.setup_exists``.

    Both tests publish a flow under a sentinel-prefixed name and then ask
    ``setup_exists`` whether a matching setup is known.
    """

    def test_nonexisting_setup_exists(self):
        """A published flow that has never been run must not have a setup."""
        from sklearn.tree import DecisionTreeClassifier
        # first publish a non-existing flow
        sentinel = get_sentinel()
        dectree = DecisionTreeClassifier()
        flow = openml.flows.sklearn_to_flow(dectree)
        flow.name = 'TEST%s%s' % (sentinel, flow.name)
        flow.publish()

        # although the flow exists, we can be sure there are no
        # setups (yet) as it hasn't been run
        setup_id = openml.setups.setup_exists(flow, dectree)
        # assertEqual: the assertEquals alias was removed in Python 3.12
        self.assertEqual(setup_id, False)

    def test_existing_setup_exists(self):
        """After a run is published, ``setup_exists`` returns its setup id."""
        from sklearn.ensemble import BaggingClassifier
        from sklearn.tree import DecisionTreeClassifier
        # first publish a non-existing flow
        bagging = BaggingClassifier(DecisionTreeClassifier(max_depth=5,
                                                           min_samples_split=1),
                                    n_estimators=3,
                                    max_samples=0.5)
        flow = openml.flows.sklearn_to_flow(bagging)
        flow.name = 'TEST%s%s' % (get_sentinel(), flow.name)
        flow = flow.publish()
        flow = openml.flows.get_flow(flow.flow_id)

        # although the flow exists, we can be sure there are no
        # setups (yet) as it hasn't been run
        setup_id = openml.setups.setup_exists(flow, bagging)
        self.assertEqual(setup_id, False)

        # now run the flow on an easy task:
        task = openml.tasks.get_task(115)  # diabetes
        run = openml.runs.run_task(task, bagging)
        # spoof flow id, otherwise the sentinel is ignored
        run.flow_id = flow.flow_id
        run = run.publish()
        # download the run, as it contains the right setup id
        run = openml.runs.get_run(run.run_id)

        # execute the function we are interested in
        setup_id = openml.setups.setup_exists(flow, bagging)
        self.assertEqual(setup_id, run.setup_id)
74+

0 commit comments

Comments
 (0)