Skip to content

Commit c307a19

Browse files
authored
Merge pull request #249 from openml/fix246
Fix246
2 parents bf9d967 + 146f458 commit c307a19

7 files changed

Lines changed: 310 additions & 70 deletions

File tree

openml/_api_calls.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,18 @@ def _read_url(self, url, add_authentication=False, data=None, filePath=None):
5151
return _read_url(url, data)
5252

5353

54+
def _file_id_to_url(file_id, filename=None):
    """Build the download URL for a file stored on the OpenML server.

    Parameters
    ----------
    file_id : int or str
        Identifier of the file; inserted into the URL with ``%s`` formatting.
    filename : str, optional
        When given, appended to the URL as a trailing path segment.

    Returns
    -------
    str
        ``<server root>/data/download/<file_id>[/<filename>]``, where the
        server root is the part of ``config.server`` preceding ``'/api/'``.
    """
    # Text before the first '/api/' (the whole string if it does not occur).
    server_root = config.server.partition('/api/')[0]
    download_url = server_root + '/data/download/%s' % file_id
    if filename is None:
        return download_url
    return download_url + '/' + filename
64+
65+
5466
def _read_url_files(url, data=None, file_dictionary=None, file_elements=None):
5567
"""do a post request to url with data, file content of
5668
file_dictionary and sending file_elements as files"""

openml/runs/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
from .run import OpenMLRun
2-
from .functions import (run_task, get_run, list_runs, get_runs, initialize_model_from_run)
2+
from .trace import OpenMLRunTrace, OpenMLTraceIteration
3+
from .functions import (run_task, get_run, list_runs, get_runs, get_run_trace,
4+
initialize_model_from_run, initialize_model_from_trace)
35

46
# Public API of openml.runs: every name imported above is exported, including
# the trace classes and trace-related functions.
__all__ = ['OpenMLRun', 'OpenMLRunTrace', 'OpenMLTraceIteration', 'run_task',
           'get_run', 'list_runs', 'get_runs', 'get_run_trace',
           'initialize_model_from_run', 'initialize_model_from_trace']

openml/runs/functions.py

Lines changed: 156 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
import warnings
77
import sklearn
88
import time
9+
import six
10+
import json
911

1012
from ..exceptions import PyOpenMLError
1113
from .. import config
@@ -15,8 +17,9 @@
1517

1618
from ..exceptions import OpenMLCacheException, OpenMLServerException
1719
from ..util import URLError, version_complies
18-
from .._api_calls import _perform_api_call
20+
from .._api_calls import _perform_api_call, _file_id_to_url
1921
from .run import OpenMLRun, _get_version_information
22+
from .trace import OpenMLRunTrace, OpenMLTraceIteration
2023

2124

2225
# _get_version_info, _get_dict and _create_setup_string are in run.py to avoid
@@ -94,6 +97,24 @@ def run_task(task, model, avoid_duplicate_runs=True, flow_tags=None, seed=None):
9497

9598
return run
9699

100+
101+
def get_run_trace(run_id):
    """Fetch the optimization trace of a run from the server.

    Parameters
    ----------
    run_id : int
        Id of the run whose trace is requested.

    Returns
    -------
    openml.runs.OpenMLRunTrace
        Trace object built from the XML returned by the ``run/trace`` call.
    """
    api_path = 'run/trace/%d' % run_id
    return _create_trace_from_description(_perform_api_call(api_path))
116+
117+
97118
def initialize_model_from_run(run_id):
98119
'''
99120
Initialized a model based on a run_id (i.e., using the exact
@@ -112,6 +133,54 @@ def initialize_model_from_run(run_id):
112133
run = get_run(run_id)
113134
return initialize_model(run.setup_id)
114135

136+
137+
def initialize_model_from_trace(run_id, repeat, fold, iteration=None):
    """Initialize a model with the parameters of one trace iteration.

    The parameters recorded by an optimization procedure for the given
    (repeat, fold, iteration) are applied to the base estimator of the
    search model that belongs to the run.

    Parameters
    ----------
    run_id : int
        The OpenML run id. The run should contain a trace file; per the
        original documentation an OpenMLServerException is raised otherwise.

    repeat : int
        The repeat nr (column in trace file).

    fold : int
        The fold nr (column in trace file).

    iteration : int, optional
        The iteration nr (column in trace file). If None, the iteration is
        obtained from ``OpenMLRunTrace.get_selected_iteration(repeat, fold)``
        (documented by the original author as slow).

    Returns
    -------
    model : sklearn model
        The base estimator of the search model, with the parameters of the
        requested trace iteration set on it.

    Raises
    ------
    ValueError
        If the (repeat, fold, iteration) combination is not in the trace, or
        if the model deserialized from the run is not a ``BaseSearchCV``.
    """
    # Imported explicitly: a bare ``import sklearn`` does not guarantee that
    # the ``model_selection`` subpackage has been loaded as an attribute.
    from sklearn.model_selection._search import BaseSearchCV

    run_trace = get_run_trace(run_id)

    if iteration is None:
        iteration = run_trace.get_selected_iteration(repeat, fold)

    request = (repeat, fold, iteration)
    if request not in run_trace.trace_iterations:
        raise ValueError('Combination repeat, fold, iteration not available')
    current = run_trace.trace_iterations[request]

    search_model = initialize_model_from_run(run_id)
    if not isinstance(search_model, BaseSearchCV):
        raise ValueError('Deserialized flow not instance of '
                         'sklearn.model_selection._search.BaseSearchCV')

    # Only the wrapped estimator is returned, not the search object itself.
    base_estimator = search_model.estimator
    base_estimator.set_params(**current.get_parameters())
    return base_estimator
183+
115184
def _run_exists(task_id, setup_id):
116185
'''
117186
Checks whether a task/setup combination is already present on the server.
@@ -305,8 +374,9 @@ def _extract_arfftrace(model, rep_no, fold_no):
305374
test_score = model.cv_results_['mean_test_score'][itt_no]
306375
arff_line = [rep_no, fold_no, itt_no, test_score, selected]
307376
for key in model.cv_results_:
308-
if key.startswith("param_"):
309-
arff_line.append(str(model.cv_results_[key][itt_no]))
377+
if key.startswith('param_'):
378+
serialized_value = json.dumps(model.cv_results_[key][itt_no])
379+
arff_line.append(serialized_value)
310380
arff_tracecontent.append(arff_line)
311381
return arff_tracecontent
312382

@@ -326,15 +396,16 @@ def _extract_arfftrace_attributes(model):
326396

327397
# model dependent attributes for trace arff
328398
for key in model.cv_results_:
329-
if key.startswith("param_"):
330-
if all(isinstance(i, (bool)) for i in model.cv_results_[key]):
331-
type = ['True', 'False']
332-
elif all(isinstance(i, (int, float)) for i in model.cv_results_[key]):
333-
type = 'NUMERIC'
399+
if key.startswith('param_'):
400+
# supported types should include all types, including bool, int, float
401+
supported_types = (bool, int, float, six.string_types)
402+
if all(isinstance(i, supported_types) or i is None for i in model.cv_results_[key]):
403+
type = 'STRING'
334404
else:
335-
values = list(set(model.cv_results_[key])) # unique values
336-
type = [str(i) for i in values]
405+
raise TypeError('Unsupported param type in param grid')
337406

407+
# we renamed the attribute param to parameter, as this is a required
408+
# OpenML convention
338409
attribute = ("parameter_" + key[6:], type)
339410
trace_attributes.append(attribute)
340411
return trace_attributes
@@ -439,45 +510,52 @@ def _create_run_from_xml(xml):
439510

440511
dataset_id = int(run['oml:input_data']['oml:dataset']['oml:did'])
441512

442-
predictions_url = None
443-
if isinstance(run['oml:output_data']['oml:file'], dict):
444-
# only one result.. probably due to an upload error
445-
file_dict = run['oml:output_data']['oml:file']
446-
if file_dict['oml:name'] == 'predictions':
447-
predictions_url = file_dict['oml:url']
448-
else:
449-
# multiple files, the normal case
450-
for file_dict in run['oml:output_data']['oml:file']:
451-
if file_dict['oml:name'] == 'predictions':
452-
predictions_url = file_dict['oml:url']
453-
if predictions_url is None:
454-
raise ValueError('No URL to download predictions for run %d in run '
455-
'description XML' % run_id)
513+
files = dict()
456514
evaluations = dict()
457515
detailed_evaluations = defaultdict(lambda: defaultdict(dict))
458-
evaluation_flows = dict()
459-
if 'oml:output_data' in run and 'oml:evaluation' in run['oml:output_data']:
460-
for evaluation_dict in run['oml:output_data']['oml:evaluation']:
461-
key = evaluation_dict['oml:name']
462-
if 'oml:value' in evaluation_dict:
463-
value = float(evaluation_dict['oml:value'])
464-
elif 'oml:array_data' in evaluation_dict:
465-
value = evaluation_dict['oml:array_data']
466-
else:
467-
raise ValueError('Could not find keys "value" or "array_data" '
468-
'in %s' % str(evaluation_dict.keys()))
469-
470-
if '@repeat' in evaluation_dict and '@fold' in evaluation_dict:
471-
repeat = int(evaluation_dict['@repeat'])
472-
fold = int(evaluation_dict['@fold'])
473-
repeat_dict = detailed_evaluations[key]
474-
fold_dict = repeat_dict[repeat]
475-
fold_dict[fold] = value
476-
else:
477-
evaluations[key] = value
478-
evaluation_flows[key] = flow_id
516+
if 'oml:output_data' not in run:
517+
raise ValueError('Run does not contain output_data (OpenML server error?)')
518+
else:
519+
if isinstance(run['oml:output_data']['oml:file'], dict):
520+
# only one result.. probably due to an upload error
521+
file_dict = run['oml:output_data']['oml:file']
522+
files[file_dict['oml:name']] = int(file_dict['oml:file_id'])
523+
else:
524+
# multiple files, the normal case
525+
for file_dict in run['oml:output_data']['oml:file']:
526+
files[file_dict['oml:name']] = int(file_dict['oml:file_id'])
527+
if 'oml:evaluation' in run['oml:output_data']:
528+
# in normal cases there should be evaluations, but in case there
529+
# was an error these could be absent
530+
for evaluation_dict in run['oml:output_data']['oml:evaluation']:
531+
key = evaluation_dict['oml:name']
532+
if 'oml:value' in evaluation_dict:
533+
value = float(evaluation_dict['oml:value'])
534+
elif 'oml:array_data' in evaluation_dict:
535+
value = evaluation_dict['oml:array_data']
536+
else:
537+
raise ValueError('Could not find keys "value" or "array_data" '
538+
'in %s' % str(evaluation_dict.keys()))
539+
540+
if '@repeat' in evaluation_dict and '@fold' in evaluation_dict:
541+
repeat = int(evaluation_dict['@repeat'])
542+
fold = int(evaluation_dict['@fold'])
543+
repeat_dict = detailed_evaluations[key]
544+
fold_dict = repeat_dict[repeat]
545+
fold_dict[fold] = value
546+
else:
547+
evaluations[key] = value
548+
549+
if 'description' not in files:
550+
raise ValueError('No description file for run %d in run '
551+
'description XML' % run_id)
552+
553+
if 'predictions' not in files:
554+
# JvR: actually, I am not sure whether this error should be raised.
555+
# a run can exist without predictions. But for now let's keep it
556+
raise ValueError('No prediction files for run %d in run '
557+
'description XML' % run_id)
479558

480-
evaluation_flows[key] = flow_id
481559
tags = None
482560
if 'oml:tag' in run:
483561
if isinstance(run['oml:tag'], str):
@@ -487,18 +565,48 @@ def _create_run_from_xml(xml):
487565
else:
488566
raise ValueError('Received not string and non list as tag item')
489567

490-
491568
return OpenMLRun(run_id=run_id, uploader=uploader,
492569
uploader_name=uploader_name, task_id=task_id,
493570
task_type=task_type,
494571
task_evaluation_measure=task_evaluation_measure,
495572
flow_id=flow_id, flow_name=flow_name,
496573
setup_id=setup_id, setup_string=setup_string,
497574
parameter_settings=parameters,
498-
dataset_id=dataset_id, predictions_url=predictions_url,
575+
dataset_id=dataset_id, output_files=files,
499576
evaluations=evaluations,
500577
detailed_evaluations=detailed_evaluations, tags=tags)
501578

579+
def _create_trace_from_description(xml):
    """Build an OpenMLRunTrace from the XML of a ``run/trace`` response.

    Parameters
    ----------
    xml : str
        XML document whose root element is ``oml:trace``.

    Returns
    -------
    OpenMLRunTrace
        Trace holding one OpenMLTraceIteration per ``oml:trace_iteration``
        element, keyed by the tuple ``(repeat, fold, iteration)``.

    Raises
    ------
    ValueError
        If the document has no ``oml:trace_iteration`` element, or if an
        ``oml:selected`` field is neither ``'true'`` nor ``'false'``.
    """
    result_dict = xmltodict.parse(xml)['oml:trace']

    run_id = result_dict['oml:run_id']
    trace = dict()

    if 'oml:trace_iteration' not in result_dict:
        raise ValueError('Run does not contain valid trace. ')

    iterations = result_dict['oml:trace_iteration']
    if isinstance(iterations, dict):
        # xmltodict yields a single mapping instead of a list when the
        # element occurs only once; normalise so the loop sees a list.
        iterations = [iterations]

    for itt in iterations:
        repeat = int(itt['oml:repeat'])
        fold = int(itt['oml:fold'])
        iteration = int(itt['oml:iteration'])
        # The setup string is parsed as JSON.
        setup_string = json.loads(itt['oml:setup_string'])
        evaluation = float(itt['oml:evaluation'])

        selected_value = itt['oml:selected']
        if selected_value == 'true':
            selected = True
        elif selected_value == 'false':
            selected = False
        else:
            raise ValueError('expected {"true", "false"} value for '
                             'selected field, received: %s' % selected_value)

        current = OpenMLTraceIteration(repeat, fold, iteration,
                                       setup_string, evaluation,
                                       selected)
        trace[(repeat, fold, iteration)] = current

    return OpenMLRunTrace(run_id, trace)
502610

503611
def _get_cached_run(run_id):
504612
"""Load a run from the cache."""

openml/runs/run.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44

55
import arff
66
import xmltodict
7-
from sklearn.base import BaseEstimator
87

98
import openml
109
from ..tasks import get_task
@@ -20,7 +19,7 @@ class OpenMLRun(object):
2019
2120
"""
2221
def __init__(self, task_id, flow_id, dataset_id, setup_string=None,
23-
files=None, setup_id=None, tags=None, uploader=None, uploader_name=None,
22+
output_files=None, setup_id=None, tags=None, uploader=None, uploader_name=None,
2423
evaluations=None, detailed_evaluations=None,
2524
data_content=None, trace_attributes=None, trace_content=None,
2625
model=None, task_type=None, task_evaluation_measure=None, flow_name=None,
@@ -37,10 +36,10 @@ def __init__(self, task_id, flow_id, dataset_id, setup_string=None,
3736
self.setup_string = setup_string
3837
self.parameter_settings = parameter_settings
3938
self.dataset_id = dataset_id
40-
self.predictions_url = predictions_url
4139
self.evaluations = evaluations
4240
self.detailed_evaluations = detailed_evaluations
4341
self.data_content = data_content
42+
self.output_files = output_files
4443
self.trace_attributes = trace_attributes
4544
self.trace_content = trace_content
4645
self.error_message = None

0 commit comments

Comments
 (0)