Skip to content

Commit 5c4bf4a

Browse files
authored
Merge pull request #216 from openml/check_run_exists
Check run exists
2 parents a17b7aa + eff74f6 commit 5c4bf4a

14 files changed

Lines changed: 352 additions & 156 deletions

File tree

openml/_api_calls.py

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,10 @@
33
import requests
44
import arff
55
import warnings
6+
import xmltodict
67

78
from . import config
8-
from .exceptions import OpenMLServerError
9+
from .exceptions import OpenMLServerError, OpenMLServerException
910

1011

1112
def _perform_api_call(call, data=None, file_dictionary=None,
@@ -80,7 +81,7 @@ def _read_url_files(url, data=None, file_dictionary=None, file_elements=None):
8081
# 'gzip,deflate'
8182
response = requests.post(url, data=data, files=file_elements)
8283
if response.status_code != 200:
83-
raise OpenMLServerError(('Status code: %d\n' % response.status_code) + response.text)
84+
raise _parse_server_exception(response)
8485
if 'Content-Encoding' not in response.headers or \
8586
response.headers['Content-Encoding'] != 'gzip':
8687
warnings.warn('Received uncompressed content from OpenML for %s.' % url)
@@ -97,8 +98,23 @@ def _read_url(url, data=None):
9798
response = requests.post(url, data=data)
9899

99100
if response.status_code != 200:
100-
raise OpenMLServerError(('Status code: %d\n' % response.status_code) + response.text)
101+
raise _parse_server_exception(response)
101102
if 'Content-Encoding' not in response.headers or \
102103
response.headers['Content-Encoding'] != 'gzip':
103104
warnings.warn('Received uncompressed content from OpenML for %s.' % url)
104105
return response.text
106+
107+
def _parse_server_exception(response):
    """Build the exception describing a failed OpenML server response.

    OpenML has a sophisticated error system in which information about
    failures is returned as an ``oml:error`` XML document. This function
    tries to parse that document.

    Parameters
    ----------
    response : requests.Response
        The response whose ``text`` and ``status_code`` are inspected.

    Returns
    -------
    OpenMLServerException
        Exception carrying the parsed error code, message and (if present)
        the additional information. It is returned, not raised, so that
        callers can write ``raise _parse_server_exception(response)``.

    Raises
    ------
    OpenMLServerError
        If the response body cannot be parsed as XML or does not contain
        the expected ``oml:error`` fields. The unparsed body is included
        in the message.
    """
    try:
        server_exception = xmltodict.parse(response.text)
        error = server_exception['oml:error']
        code = int(error['oml:code'])
        message = error['oml:message']
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed. The key lookups
        # are covered too: a parseable document of an unexpected shape
        # should surface as a server error, not as a bare KeyError.
        raise OpenMLServerError(('Status code: %d\n' % response.status_code) + response.text)

    additional = error.get('oml:additional_information')
    return OpenMLServerException(code, message, additional)

openml/config.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ def _setup():
3636
"""
3737
global apikey
3838
global server
39+
global avoid_duplicate_runs
3940
# read config file, create cache directory
4041
try:
4142
os.mkdir(os.path.expanduser('~/.openml'))
@@ -46,6 +47,7 @@ def _setup():
4647
apikey = config.get('FAKE_SECTION', 'apikey')
4748
server = config.get('FAKE_SECTION', 'server')
4849
cache_dir = config.get('FAKE_SECTION', 'cachedir')
50+
avoid_duplicate_runs = config.getboolean('FAKE_SECTION', 'avoid_duplicate_runs')
4951
set_cache_directory(cache_dir)
5052

5153

@@ -84,7 +86,8 @@ def _parse_config():
8486
defaults = {'apikey': apikey,
8587
'server': server,
8688
'verbosity': 0,
87-
'cachedir': os.path.expanduser('~/.openml/cache')}
89+
'cachedir': os.path.expanduser('~/.openml/cache'),
90+
'avoid_duplicate_runs': 'True'}
8891

8992
config_file = os.path.expanduser('~/.openml/config')
9093
config = configparser.RawConfigParser(defaults=defaults)

openml/exceptions.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,25 @@ class PyOpenMLError(Exception):
22
def __init__(self, message):
33
super(PyOpenMLError, self).__init__(message)
44

5-
65
class OpenMLServerError(PyOpenMLError):
    """Raised when something is really wrong on the server.

    Used when the server result did not parse to a dict; the message
    contains the unparsed error.
    """

    def __init__(self, message):
        super(OpenMLServerError, self).__init__("OpenML Server error: " + message)
1112

13+
#
14+
class OpenMLServerException(OpenMLServerError):
    """Raised when the result of the server was not 200.

    An example is a listing call without results. The server-provided
    error ``code`` and the optional ``additional`` information are kept
    as attributes.
    """

    def __init__(self, code, message, additional=None):
        super(OpenMLServerException, self).__init__(
            "OpenML Server exception: " + message)
        self.code = code
        self.additional = additional
23+
1224

1325
class OpenMLCacheException(PyOpenMLError):
1426
"""Dataset / task etc not found in cache"""

openml/flows/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from .flow import OpenMLFlow
22
from .sklearn_converter import sklearn_to_flow, flow_to_sklearn
3-
from .functions import get_flow, list_flows
3+
from .functions import get_flow, list_flows, flow_exists
44

55
__all__ = ['OpenMLFlow', 'create_flow_from_model', 'get_flow', 'list_flows',
6-
'sklearn_to_flow', 'flow_to_sklearn']
6+
'sklearn_to_flow', 'flow_to_sklearn', 'flow_exists']

openml/flows/flow.py

Lines changed: 0 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -340,58 +340,6 @@ def publish(self):
340340
self.flow_id = int(xmltodict.parse(return_value)['oml:upload_flow']['oml:id'])
341341
return self
342342

343-
def _ensure_flow_exists(self):
344-
""" Checks if a flow exists for the given model and possibly creates it.
345-
346-
If the given flow exists on the server, the flow-id will simply
347-
be returned. Otherwise it will be uploaded to the server.
348-
349-
Returns
350-
-------
351-
flow_id : int
352-
Flow id on the server.
353-
"""
354-
_, flow_id = _check_flow_exists(self.name, self.external_version)
355-
# TODO add numpy and scipy version!
356-
357-
if int(flow_id) == -1:
358-
flow = self.publish()
359-
return int(flow.flow_id)
360-
361-
return int(flow_id)
362-
363-
364-
def _check_flow_exists(name, version):
365-
"""Retrieves the flow id of the flow uniquely identified by name+version.
366-
367-
Parameter
368-
---------
369-
name : string
370-
Name of the flow
371-
version : string
372-
Version information associated with flow.
373-
374-
Returns
375-
-------
376-
flow_exist : int
377-
Flow id or -1 if the flow doesn't exist.
378-
379-
Notes
380-
-----
381-
see http://www.openml.org/api_docs/#!/flow/get_flow_exists_name_version
382-
"""
383-
if not (type(name) is str and len(name) > 0):
384-
raise ValueError('Argument \'name\' should be a non-empty string')
385-
if not (type(version) is str and len(version) > 0):
386-
raise ValueError('Argument \'version\' should be a non-empty string')
387-
388-
xml_response = _perform_api_call("flow/exists",
389-
data={'name': name, 'external_version': version})
390-
391-
xml_dict = xmltodict.parse(xml_response)
392-
flow_id = xml_dict['oml:flow_exists']['oml:id']
393-
return xml_response, flow_id
394-
395343

396344
def _add_if_nonempty(dic, key, value):
397345
if value is not None:

openml/flows/functions.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import xmltodict
2+
import six
23

34
from openml._api_calls import _perform_api_call
45
from . import OpenMLFlow, flow_to_sklearn
@@ -69,6 +70,41 @@ def list_flows(offset=None, size=None, tag=None):
6970
return _list_flows(api_call)
7071

7172

73+
def flow_exists(name, external_version):
    """Retrieves the flow id of the flow uniquely identified by name + external_version.

    Parameters
    ----------
    name : string
        Name of the flow
    external_version : string
        Version information associated with flow.

    Returns
    -------
    flow_exist : int or bool
        The flow id if the server reports an id greater than zero,
        False otherwise.

    Raises
    ------
    ValueError
        If ``name`` or ``external_version`` is not a non-empty string.

    Notes
    -----
    see http://www.openml.org/api_docs/#!/flow/get_flow_exists_name_version
    """
    if not (isinstance(name, six.string_types) and len(name) > 0):
        raise ValueError('Argument \'name\' should be a non-empty string')
    # Validate external_version itself; checking ``name`` a second time
    # would let a non-string version reach len() or the API call.
    if not (isinstance(external_version, six.string_types)
            and len(external_version) > 0):
        raise ValueError('Argument \'version\' should be a non-empty string')

    xml_response = _perform_api_call("flow/exists",
                                     data={'name': name, 'external_version': external_version})

    result_dict = xmltodict.parse(xml_response)
    flow_id = int(result_dict['oml:flow_exists']['oml:id'])
    # Only ids greater than zero are treated as an existing flow.
    return flow_id if flow_id > 0 else False
106+
107+
72108
def _list_flows(api_call):
73109
# TODO add proper error handling here!
74110
xml_string = _perform_api_call(api_call)

openml/runs/functions.py

Lines changed: 78 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,14 @@
44
import xmltodict
55
import numpy as np
66
import warnings
7+
import openml
78
from sklearn.model_selection._search import BaseSearchCV
89

910
from ..exceptions import PyOpenMLError
1011
from .. import config
11-
from ..flows import sklearn_to_flow
12-
from ..exceptions import OpenMLCacheException
12+
from ..flows import sklearn_to_flow, get_flow, flow_exists
13+
from ..setups import setup_exists
14+
from ..exceptions import OpenMLCacheException, OpenMLServerException
1315
from ..util import URLError
1416
from ..tasks.functions import _create_task_from_xml
1517
from .._api_calls import _perform_api_call
@@ -21,7 +23,7 @@
2123

2224

2325

24-
def run_task(task, model):
26+
def run_task(task, model, avoid_duplicate_runs=True):
2527
"""Performs a CV run on the dataset of the given task, using the split.
2628
2729
Parameters
@@ -42,6 +44,19 @@ def run_task(task, model):
4244
# TODO move this into its own module. While it somehow belongs here, it
4345
# adds quite a lot of functionality which is better suited in other places!
4446
# TODO why doesn't this accept a flow as input? - this would make this more flexible!
47+
flow = sklearn_to_flow(model)
48+
49+
# returns flow id if the flow exists on the server, False otherwise
50+
flow_id = flow_exists(flow.name, flow.external_version)
51+
52+
# skips the run if it already exists and the user opts for this in the config file.
53+
# also, if the flow is not present on the server, the check is not needed.
54+
if avoid_duplicate_runs and flow_id:
55+
flow = get_flow(flow_id)
56+
setup_id = setup_exists(flow, model)
57+
ids = _run_exists(task.task_id, setup_id)
58+
if ids:
59+
raise PyOpenMLError("Run already exists in server. Run id(s): %s" %str(ids))
4560

4661
dataset = task.get_dataset()
4762
X, Y = dataset.get_data(target=task.target_name)
@@ -55,19 +70,44 @@ def run_task(task, model):
5570
run = OpenMLRun(task_id=task.task_id, flow_id=None, dataset_id=dataset.dataset_id, model=model)
5671
run.data_content, run.trace_content = _run_task_get_arffcontent(model, task, class_labels)
5772

58-
# now generate the flow
59-
flow = sklearn_to_flow(model)
60-
flow_id = flow._ensure_flow_exists()
61-
if flow_id < 0:
62-
print("No flow")
63-
return 0, 2
64-
config.logger.info(flow_id)
73+
if flow_id == False:
74+
# means the flow did not exist.
75+
# As we could run it, publish it now
76+
flow = flow.publish()
77+
else:
78+
# flow already existed, download it from server
79+
# TODO (necessary? is this a post condition of this function)
80+
flow = get_flow(flow_id)
6581

66-
# attach the flow to the run
67-
run.flow_id = flow_id
82+
run.flow_id = flow.flow_id
83+
config.logger.info('Executed Task %d with Flow id: %d' %(task.task_id, run.flow_id))
6884

6985
return run
7086

87+
def _run_exists(task_id, setup_id):
88+
'''
89+
Checks whether a task/setup combination is already present on the server.
90+
91+
:param task_id: int
92+
:param setup_id: int
93+
:return: List of run ids iff these already exists on the server, False otherwise
94+
'''
95+
if setup_id <= 0:
96+
# openml setups are in range 1-inf
97+
return False
98+
99+
try:
100+
result = list_runs(task=[task_id], setup=[setup_id])
101+
if len(result) > 0:
102+
return set(result.keys())
103+
else:
104+
return False
105+
except OpenMLServerException as exception:
106+
# error code 512 implies no results. This means the run does not exist yet
107+
assert(exception.code == 512)
108+
return False
109+
110+
71111

72112
def _prediction_to_row(rep_no, fold_no, row_id, correct_label, predicted_label,
73113
predicted_probabilities, class_labels, model_classes_mapping):
@@ -275,27 +315,28 @@ def _create_run_from_xml(xml):
275315
evaluations = dict()
276316
detailed_evaluations = defaultdict(lambda: defaultdict(dict))
277317
evaluation_flows = dict()
278-
for evaluation_dict in run['oml:output_data']['oml:evaluation']:
279-
key = evaluation_dict['oml:name']
280-
if 'oml:value' in evaluation_dict:
281-
value = float(evaluation_dict['oml:value'])
282-
elif 'oml:array_data' in evaluation_dict:
283-
value = evaluation_dict['oml:array_data']
284-
else:
285-
raise ValueError('Could not find keys "value" or "array_data" '
286-
'in %s' % str(evaluation_dict.keys()))
287-
288-
if '@repeat' in evaluation_dict and '@fold' in evaluation_dict:
289-
repeat = int(evaluation_dict['@repeat'])
290-
fold = int(evaluation_dict['@fold'])
291-
repeat_dict = detailed_evaluations[key]
292-
fold_dict = repeat_dict[repeat]
293-
fold_dict[fold] = value
294-
else:
295-
evaluations[key] = value
296-
evaluation_flows[key] = flow_id
318+
if 'oml:output_data' in run and 'oml:evaluation' in run['oml:output_data']:
319+
for evaluation_dict in run['oml:output_data']['oml:evaluation']:
320+
key = evaluation_dict['oml:name']
321+
if 'oml:value' in evaluation_dict:
322+
value = float(evaluation_dict['oml:value'])
323+
elif 'oml:array_data' in evaluation_dict:
324+
value = evaluation_dict['oml:array_data']
325+
else:
326+
raise ValueError('Could not find keys "value" or "array_data" '
327+
'in %s' % str(evaluation_dict.keys()))
328+
329+
if '@repeat' in evaluation_dict and '@fold' in evaluation_dict:
330+
repeat = int(evaluation_dict['@repeat'])
331+
fold = int(evaluation_dict['@fold'])
332+
repeat_dict = detailed_evaluations[key]
333+
fold_dict = repeat_dict[repeat]
334+
fold_dict[fold] = value
335+
else:
336+
evaluations[key] = value
337+
evaluation_flows[key] = flow_id
297338

298-
evaluation_flows[key] = flow_id
339+
evaluation_flows[key] = flow_id
299340

300341
return OpenMLRun(run_id=run_id, uploader=uploader,
301342
uploader_name=uploader_name, task_id=task_id,
@@ -325,7 +366,7 @@ def _get_cached_run(run_id):
325366
"cached" % run_id)
326367

327368

328-
def list_runs(offset=None, size=None, id=None, task=None,
369+
def list_runs(offset=None, size=None, id=None, task=None, setup=None,
329370
flow=None, uploader=None, tag=None):
330371
"""List all runs matching all of the given filters.
331372
@@ -342,6 +383,8 @@ def list_runs(offset=None, size=None, id=None, task=None,
342383
343384
task : list, optional
344385
386+
setup: list, optional
387+
345388
flow : list, optional
346389
347390
uploader : list, optional
@@ -363,6 +406,8 @@ def list_runs(offset=None, size=None, id=None, task=None,
363406
api_call += "/run/%s" % ','.join([str(int(i)) for i in id])
364407
if task is not None:
365408
api_call += "/task/%s" % ','.join([str(int(i)) for i in task])
409+
if setup is not None:
410+
api_call += "/setup/%s" % ','.join([str(int(i)) for i in setup])
366411
if flow is not None:
367412
api_call += "/flow/%s" % ','.join([str(int(i)) for i in flow])
368413
if uploader is not None:

0 commit comments

Comments
 (0)