66import warnings
77import sklearn
88import time
9+ import six
10+ import json
911
1012from ..exceptions import PyOpenMLError
1113from .. import config
1517
1618from ..exceptions import OpenMLCacheException , OpenMLServerException
1719from ..util import URLError , version_complies
18- from .._api_calls import _perform_api_call
20+ from .._api_calls import _perform_api_call , _file_id_to_url
1921from .run import OpenMLRun , _get_version_information
22+ from .trace import OpenMLRunTrace , OpenMLTraceIteration
2023
2124
2225# _get_version_info, _get_dict and _create_setup_string are in run.py to avoid
@@ -94,6 +97,24 @@ def run_task(task, model, avoid_duplicate_runs=True, flow_tags=None, seed=None):
9497
9598 return run
9699
100+
def get_run_trace(run_id):
    """Fetch the optimization trace that belongs to a run.

    Parameters
    ----------
    run_id : int
        Id of the run whose trace is requested; it is formatted into the
        API path with ``%d``.

    Returns
    -------
    OpenMLRunTrace
        The trace object built by ``_create_trace_from_description`` from
        the response of the ``run/trace/<run_id>`` API call.
    """
    response = _perform_api_call('run/trace/%d' % run_id)
    return _create_trace_from_description(response)
116+
117+
97118def initialize_model_from_run (run_id ):
98119 '''
99120 Initialized a model based on a run_id (i.e., using the exact
@@ -112,6 +133,54 @@ def initialize_model_from_run(run_id):
112133 run = get_run (run_id )
113134 return initialize_model (run .setup_id )
114135
136+
def initialize_model_from_trace(run_id, repeat, fold, iteration=None):
    """Initialize a model with the parameters recorded in a run's trace.

    The run's flow is deserialized (it must be a scikit-learn search
    object), and its base estimator is configured with the parameter
    settings stored for the requested (repeat, fold, iteration) entry of
    the optimization trace.

    Parameters
    ----------
    run_id : int
        The OpenML run id. The run should have a trace file; per the
        original documentation an OpenMLServerException is expected
        otherwise.

    repeat : int
        The repeat nr (column in trace file).

    fold : int
        The fold nr (column in trace file).

    iteration : int, optional
        The iteration nr (column in trace file). If None, the selected
        iteration is looked up through
        ``OpenMLRunTrace.get_selected_iteration(repeat, fold)`` (noted as
        slow in the original documentation).

    Returns
    -------
    model : sklearn model
        The base estimator of the search object, with the traced
        parameters applied through ``set_params``.

    Raises
    ------
    ValueError
        If the (repeat, fold, iteration) combination is not part of the
        trace, or if the deserialized flow is not a ``BaseSearchCV``.
    """
    # Import the class explicitly: a bare ``import sklearn`` does not
    # guarantee that the ``sklearn.model_selection`` subpackage is loaded,
    # in which case the attribute chain would raise AttributeError.
    from sklearn.model_selection._search import BaseSearchCV

    run_trace = get_run_trace(run_id)

    if iteration is None:
        iteration = run_trace.get_selected_iteration(repeat, fold)

    request = (repeat, fold, iteration)
    if request not in run_trace.trace_iterations:
        raise ValueError('Combination repeat, fold, iteration not available')
    current = run_trace.trace_iterations[request]

    search_model = initialize_model_from_run(run_id)
    if not isinstance(search_model, BaseSearchCV):
        raise ValueError('Deserialized flow not instance of '
                         'sklearn.model_selection._search.BaseSearchCV')

    # Only the wrapped estimator is returned; the search object itself is
    # discarded after its estimator has been re-parameterized.
    base_estimator = search_model.estimator
    base_estimator.set_params(**current.get_parameters())
    return base_estimator
183+
115184def _run_exists (task_id , setup_id ):
116185 '''
117186 Checks whether a task/setup combination is already present on the server.
@@ -305,8 +374,9 @@ def _extract_arfftrace(model, rep_no, fold_no):
305374 test_score = model .cv_results_ ['mean_test_score' ][itt_no ]
306375 arff_line = [rep_no , fold_no , itt_no , test_score , selected ]
307376 for key in model .cv_results_ :
308- if key .startswith ("param_" ):
309- arff_line .append (str (model .cv_results_ [key ][itt_no ]))
377+ if key .startswith ('param_' ):
378+ serialized_value = json .dumps (model .cv_results_ [key ][itt_no ])
379+ arff_line .append (serialized_value )
310380 arff_tracecontent .append (arff_line )
311381 return arff_tracecontent
312382
@@ -326,15 +396,16 @@ def _extract_arfftrace_attributes(model):
326396
327397 # model dependent attributes for trace arff
328398 for key in model .cv_results_ :
329- if key .startswith (" param_" ):
330- if all ( isinstance ( i , ( bool )) for i in model . cv_results_ [ key ]):
331- type = [ 'True' , 'False' ]
332- elif all (isinstance (i , ( int , float )) for i in model .cv_results_ [key ]):
333- type = 'NUMERIC '
399+ if key .startswith (' param_' ):
400+ # supported types should include all types, including bool, int, float
401+ supported_types = ( bool , int , float , six . string_types )
402+ if all (isinstance (i , supported_types ) or i is None for i in model .cv_results_ [key ]):
403+ type = 'STRING '
334404 else :
335- values = list (set (model .cv_results_ [key ])) # unique values
336- type = [str (i ) for i in values ]
405+ raise TypeError ('Unsupported param type in param grid' )
337406
407+ # we renamed the attribute param to parameter, as this is a required
408+ # OpenML convention
338409 attribute = ("parameter_" + key [6 :], type )
339410 trace_attributes .append (attribute )
340411 return trace_attributes
@@ -439,45 +510,52 @@ def _create_run_from_xml(xml):
439510
440511 dataset_id = int (run ['oml:input_data' ]['oml:dataset' ]['oml:did' ])
441512
442- predictions_url = None
443- if isinstance (run ['oml:output_data' ]['oml:file' ], dict ):
444- # only one result.. probably due to an upload error
445- file_dict = run ['oml:output_data' ]['oml:file' ]
446- if file_dict ['oml:name' ] == 'predictions' :
447- predictions_url = file_dict ['oml:url' ]
448- else :
449- # multiple files, the normal case
450- for file_dict in run ['oml:output_data' ]['oml:file' ]:
451- if file_dict ['oml:name' ] == 'predictions' :
452- predictions_url = file_dict ['oml:url' ]
453- if predictions_url is None :
454- raise ValueError ('No URL to download predictions for run %d in run '
455- 'description XML' % run_id )
513+ files = dict ()
456514 evaluations = dict ()
457515 detailed_evaluations = defaultdict (lambda : defaultdict (dict ))
458- evaluation_flows = dict ()
459- if 'oml:output_data' in run and 'oml:evaluation' in run ['oml:output_data' ]:
460- for evaluation_dict in run ['oml:output_data' ]['oml:evaluation' ]:
461- key = evaluation_dict ['oml:name' ]
462- if 'oml:value' in evaluation_dict :
463- value = float (evaluation_dict ['oml:value' ])
464- elif 'oml:array_data' in evaluation_dict :
465- value = evaluation_dict ['oml:array_data' ]
466- else :
467- raise ValueError ('Could not find keys "value" or "array_data" '
468- 'in %s' % str (evaluation_dict .keys ()))
469-
470- if '@repeat' in evaluation_dict and '@fold' in evaluation_dict :
471- repeat = int (evaluation_dict ['@repeat' ])
472- fold = int (evaluation_dict ['@fold' ])
473- repeat_dict = detailed_evaluations [key ]
474- fold_dict = repeat_dict [repeat ]
475- fold_dict [fold ] = value
476- else :
477- evaluations [key ] = value
478- evaluation_flows [key ] = flow_id
516+ if 'oml:output_data' not in run :
517+ raise ValueError ('Run does not contain output_data (OpenML server error?)' )
518+ else :
519+ if isinstance (run ['oml:output_data' ]['oml:file' ], dict ):
520+ # only one result.. probably due to an upload error
521+ file_dict = run ['oml:output_data' ]['oml:file' ]
522+ files [file_dict ['oml:name' ]] = int (file_dict ['oml:file_id' ])
523+ else :
524+ # multiple files, the normal case
525+ for file_dict in run ['oml:output_data' ]['oml:file' ]:
526+ files [file_dict ['oml:name' ]] = int (file_dict ['oml:file_id' ])
527+ if 'oml:evaluation' in run ['oml:output_data' ]:
528+ # in normal cases there should be evaluations, but in case there
529+ # was an error these could be absent
530+ for evaluation_dict in run ['oml:output_data' ]['oml:evaluation' ]:
531+ key = evaluation_dict ['oml:name' ]
532+ if 'oml:value' in evaluation_dict :
533+ value = float (evaluation_dict ['oml:value' ])
534+ elif 'oml:array_data' in evaluation_dict :
535+ value = evaluation_dict ['oml:array_data' ]
536+ else :
537+ raise ValueError ('Could not find keys "value" or "array_data" '
538+ 'in %s' % str (evaluation_dict .keys ()))
539+
540+ if '@repeat' in evaluation_dict and '@fold' in evaluation_dict :
541+ repeat = int (evaluation_dict ['@repeat' ])
542+ fold = int (evaluation_dict ['@fold' ])
543+ repeat_dict = detailed_evaluations [key ]
544+ fold_dict = repeat_dict [repeat ]
545+ fold_dict [fold ] = value
546+ else :
547+ evaluations [key ] = value
548+
549+ if 'description' not in files :
550+ raise ValueError ('No description file for run %d in run '
551+ 'description XML' % run_id )
552+
553+ if 'predictions' not in files :
554+ # JvR: actually, I am not sure whether this error should be raised.
555+ # a run can exist without predictions. But for now let's keep it
556+ raise ValueError ('No prediction files for run %d in run '
557+ 'description XML' % run_id )
479558
480- evaluation_flows [key ] = flow_id
481559 tags = None
482560 if 'oml:tag' in run :
483561 if isinstance (run ['oml:tag' ], str ):
@@ -487,18 +565,48 @@ def _create_run_from_xml(xml):
487565 else :
488566 raise ValueError ('Received not string and non list as tag item' )
489567
490-
491568 return OpenMLRun (run_id = run_id , uploader = uploader ,
492569 uploader_name = uploader_name , task_id = task_id ,
493570 task_type = task_type ,
494571 task_evaluation_measure = task_evaluation_measure ,
495572 flow_id = flow_id , flow_name = flow_name ,
496573 setup_id = setup_id , setup_string = setup_string ,
497574 parameter_settings = parameters ,
498- dataset_id = dataset_id , predictions_url = predictions_url ,
575+ dataset_id = dataset_id , output_files = files ,
499576 evaluations = evaluations ,
500577 detailed_evaluations = detailed_evaluations , tags = tags )
501578
def _create_trace_from_description(xml):
    """Build an OpenMLRunTrace from a trace description in XML.

    Parameters
    ----------
    xml : str
        XML document with an ``oml:trace`` root element, as returned by
        the ``run/trace/<run_id>`` API call.

    Returns
    -------
    OpenMLRunTrace
        Trace whose iterations are keyed by ``(repeat, fold, iteration)``
        tuples of ints.

    Raises
    ------
    ValueError
        If the document has no ``oml:trace_iteration`` element, or if an
        iteration's ``oml:selected`` field is neither ``"true"`` nor
        ``"false"``.
    """
    result_dict = xmltodict.parse(xml)['oml:trace']

    # NOTE(review): run_id is passed on exactly as parsed from the XML,
    # without an int() conversion - confirm that callers expect this.
    run_id = result_dict['oml:run_id']
    trace = dict()

    if 'oml:trace_iteration' not in result_dict:
        raise ValueError('Run does not contain valid trace. ')

    iterations = result_dict['oml:trace_iteration']
    if isinstance(iterations, dict):
        # A trace with exactly one iteration is parsed as a single mapping
        # instead of a list; wrap it so the loop below sees one entry
        # rather than iterating over the mapping's keys.
        iterations = [iterations]

    for itt in iterations:
        repeat = int(itt['oml:repeat'])
        fold = int(itt['oml:fold'])
        iteration = int(itt['oml:iteration'])
        # The setup string holds the JSON-serialized parameter settings.
        setup_string = json.loads(itt['oml:setup_string'])
        evaluation = float(itt['oml:evaluation'])

        selected_value = itt['oml:selected']
        if selected_value == 'true':
            selected = True
        elif selected_value == 'false':
            selected = False
        else:
            raise ValueError('expected {"true", "false"} value for '
                             'selected field, received: %s' % selected_value)

        current = OpenMLTraceIteration(repeat, fold, iteration,
                                       setup_string, evaluation,
                                       selected)
        trace[(repeat, fold, iteration)] = current

    return OpenMLRunTrace(run_id, trace)
502610
503611def _get_cached_run (run_id ):
504612 """Load a run from the cache."""
0 commit comments