44import xmltodict
55import numpy as np
66import warnings
7+ import openml
78from sklearn .model_selection ._search import BaseSearchCV
89
910from ..exceptions import PyOpenMLError
1011from .. import config
11- from ..flows import sklearn_to_flow
12- from ..exceptions import OpenMLCacheException
12+ from ..flows import sklearn_to_flow , get_flow , flow_exists
13+ from ..setups import setup_exists
14+ from ..exceptions import OpenMLCacheException , OpenMLServerException
1315from ..util import URLError
1416from ..tasks .functions import _create_task_from_xml
1517from .._api_calls import _perform_api_call
2123
2224
2325
24- def run_task (task , model ):
26+ def run_task (task , model , avoid_duplicate_runs = True ):
2527 """Performs a CV run on the dataset of the given task, using the split.
2628
2729 Parameters
@@ -42,6 +44,19 @@ def run_task(task, model):
4244 # TODO move this into its onwn module. While it somehow belongs here, it
4345 # adds quite a lot of functionality which is better suited in other places!
4446 # TODO why doesn't this accept a flow as input? - this would make this more flexible!
47+ flow = sklearn_to_flow (model )
48+
49+ # returns flow id if the flow exists on the server, False otherwise
50+ flow_id = flow_exists (flow .name , flow .external_version )
51+
52+ # skips the run if it already exists and the user opts for this in the config file.
53+ # also, if the flow is not present on the server, the check is not needed.
54+ if avoid_duplicate_runs and flow_id :
55+ flow = get_flow (flow_id )
56+ setup_id = setup_exists (flow , model )
57+ ids = _run_exists (task .task_id , setup_id )
58+ if ids :
59+ raise PyOpenMLError ("Run already exists in server. Run id(s): %s" % str (ids ))
4560
4661 dataset = task .get_dataset ()
4762 X , Y = dataset .get_data (target = task .target_name )
@@ -55,19 +70,44 @@ def run_task(task, model):
5570 run = OpenMLRun (task_id = task .task_id , flow_id = None , dataset_id = dataset .dataset_id , model = model )
5671 run .data_content , run .trace_content = _run_task_get_arffcontent (model , task , class_labels )
5772
58- # now generate the flow
59- flow = sklearn_to_flow (model )
60- flow_id = flow ._ensure_flow_exists ()
61- if flow_id < 0 :
62- print ("No flow" )
63- return 0 , 2
64- config .logger .info (flow_id )
73+ if flow_id == False :
74+ # means the flow did not exists.
75+ # As we could run it, publish it now
76+ flow = flow .publish ()
77+ else :
78+ # flow already existed, download it from server
79+ # TODO (neccessary? is this a post condition of this function)
80+ flow = get_flow (flow_id )
6581
66- # attach the flow to the run
67- run . flow_id = flow_id
82+ run . flow_id = flow . flow_id
83+ config . logger . info ( 'Executed Task %d with Flow id: %d' % ( task . task_id , run . flow_id ))
6884
6985 return run
7086
87+ def _run_exists (task_id , setup_id ):
88+ '''
89+ Checks whether a task/setup combination is already present on the server.
90+
91+ :param task_id: int
92+ :param setup_id: int
93+ :return: List of run ids iff these already exists on the server, False otherwise
94+ '''
95+ if setup_id <= 0 :
96+ # openml setups are in range 1-inf
97+ return False
98+
99+ try :
100+ result = list_runs (task = [task_id ], setup = [setup_id ])
101+ if len (result ) > 0 :
102+ return set (result .keys ())
103+ else :
104+ return False
105+ except OpenMLServerException as exception :
106+ # error code 512 implies no results. This means the run does not exist yet
107+ assert (exception .code == 512 )
108+ return False
109+
110+
71111
72112def _prediction_to_row (rep_no , fold_no , row_id , correct_label , predicted_label ,
73113 predicted_probabilities , class_labels , model_classes_mapping ):
@@ -275,27 +315,28 @@ def _create_run_from_xml(xml):
275315 evaluations = dict ()
276316 detailed_evaluations = defaultdict (lambda : defaultdict (dict ))
277317 evaluation_flows = dict ()
278- for evaluation_dict in run ['oml:output_data' ]['oml:evaluation' ]:
279- key = evaluation_dict ['oml:name' ]
280- if 'oml:value' in evaluation_dict :
281- value = float (evaluation_dict ['oml:value' ])
282- elif 'oml:array_data' in evaluation_dict :
283- value = evaluation_dict ['oml:array_data' ]
284- else :
285- raise ValueError ('Could not find keys "value" or "array_data" '
286- 'in %s' % str (evaluation_dict .keys ()))
287-
288- if '@repeat' in evaluation_dict and '@fold' in evaluation_dict :
289- repeat = int (evaluation_dict ['@repeat' ])
290- fold = int (evaluation_dict ['@fold' ])
291- repeat_dict = detailed_evaluations [key ]
292- fold_dict = repeat_dict [repeat ]
293- fold_dict [fold ] = value
294- else :
295- evaluations [key ] = value
296- evaluation_flows [key ] = flow_id
318+ if 'oml:output_data' in run and 'oml:evaluation' in run ['oml:output_data' ]:
319+ for evaluation_dict in run ['oml:output_data' ]['oml:evaluation' ]:
320+ key = evaluation_dict ['oml:name' ]
321+ if 'oml:value' in evaluation_dict :
322+ value = float (evaluation_dict ['oml:value' ])
323+ elif 'oml:array_data' in evaluation_dict :
324+ value = evaluation_dict ['oml:array_data' ]
325+ else :
326+ raise ValueError ('Could not find keys "value" or "array_data" '
327+ 'in %s' % str (evaluation_dict .keys ()))
328+
329+ if '@repeat' in evaluation_dict and '@fold' in evaluation_dict :
330+ repeat = int (evaluation_dict ['@repeat' ])
331+ fold = int (evaluation_dict ['@fold' ])
332+ repeat_dict = detailed_evaluations [key ]
333+ fold_dict = repeat_dict [repeat ]
334+ fold_dict [fold ] = value
335+ else :
336+ evaluations [key ] = value
337+ evaluation_flows [key ] = flow_id
297338
298- evaluation_flows [key ] = flow_id
339+ evaluation_flows [key ] = flow_id
299340
300341 return OpenMLRun (run_id = run_id , uploader = uploader ,
301342 uploader_name = uploader_name , task_id = task_id ,
@@ -325,7 +366,7 @@ def _get_cached_run(run_id):
325366 "cached" % run_id )
326367
327368
328- def list_runs (offset = None , size = None , id = None , task = None ,
369+ def list_runs (offset = None , size = None , id = None , task = None , setup = None ,
329370 flow = None , uploader = None , tag = None ):
330371 """List all runs matching all of the given filters.
331372
@@ -342,6 +383,8 @@ def list_runs(offset=None, size=None, id=None, task=None,
342383
343384 task : list, optional
344385
386+ setup: list, optional
387+
345388 flow : list, optional
346389
347390 uploader : list, optional
@@ -363,6 +406,8 @@ def list_runs(offset=None, size=None, id=None, task=None,
363406 api_call += "/run/%s" % ',' .join ([str (int (i )) for i in id ])
364407 if task is not None :
365408 api_call += "/task/%s" % ',' .join ([str (int (i )) for i in task ])
409+ if setup is not None :
410+ api_call += "/setup/%s" % ',' .join ([str (int (i )) for i in setup ])
366411 if flow is not None :
367412 api_call += "/flow/%s" % ',' .join ([str (int (i )) for i in flow ])
368413 if uploader is not None :
0 commit comments