44
55import arff
66import xmltodict
7+ from sklearn .base import BaseEstimator
8+ from sklearn .model_selection ._search import BaseSearchCV
79
10+ import openml
811from ..tasks import get_task
912from .._api_calls import _perform_api_call
10-
13+ from .. exceptions import PyOpenMLError
1114
1215class OpenMLRun (object ):
1316 """OpenML Run: result of running a model on an openml dataset.
@@ -17,10 +20,10 @@ class OpenMLRun(object):
1720 FIXME
1821
1922 """
20- def __init__ (self , task_id , flow_id , dataset_id , setup_string = None ,
23+ def __init__ (self , task_id , flow_id , dataset_id , setup_string = None ,
2124 files = None , setup_id = None , tags = None , uploader = None , uploader_name = None ,
2225 evaluations = None , detailed_evaluations = None ,
23- data_content = None , model = None , task_type = None ,
26+ data_content = None , trace_content = None , model = None , task_type = None ,
2427 task_evaluation_measure = None , flow_name = None ,
2528 parameter_settings = None , predictions_url = None , task = None ,
2629 flow = None , run_id = None ):
@@ -39,12 +42,14 @@ def __init__(self, task_id, flow_id, dataset_id, setup_string=None,
3942 self .evaluations = evaluations
4043 self .detailed_evaluations = detailed_evaluations
4144 self .data_content = data_content
45+ self .trace_content = trace_content
4246 self .task = task
4347 self .flow = flow
4448 self .run_id = run_id
49+ self .model = model
4550
4651 def _generate_arff_dict (self ):
47- """Generates the arff dictionary for upload to the server.
52+ """Generates the arff dictionary for uploading predictions to the server.
4853
4954 Assumes that the run has been executed.
5055
@@ -74,6 +79,48 @@ def _generate_arff_dict(self):
7479 arff_dict ['relation' ] = 'openml_task_' + str (task .task_id ) + '_predictions'
7580 return arff_dict
7681
82+ def _generate_trace_arff_dict (self , model ):
83+ """Generates the arff dictionary for uploading predictions to the server.
84+
85+ Assumes that the run has been executed.
86+
87+ Returns
88+ -------
89+ arf_dict : dict
90+ Dictionary representation of the ARFF file that will be uploaded.
91+ Contains information about the optimization trace.
92+ """
93+ if self .trace_content is None :
94+ raise ValueError ('No trace content avaiable.' )
95+ if not isinstance (model , BaseSearchCV ):
96+ raise PyOpenMLError ('Cannot generate trace on provided classifier. (This should never happen.)' )
97+
98+ arff_dict = {}
99+ arff_dict ['attributes' ] = [('repeat' , 'NUMERIC' ),
100+ ('fold' , 'NUMERIC' ),
101+ ('iteration' , 'NUMERIC' ),
102+ ('evaluation' , 'NUMERIC' ),
103+ ('selected' , ['true' , 'false' ])]
104+ for key in model .cv_results_ :
105+ if key .startswith ("param_" ):
106+ type = 'STRING'
107+ if all (isinstance (i , (bool )) for i in model .cv_results_ [key ]):
108+ type = ['True' , 'False' ]
109+ elif all (isinstance (i , (int , float )) for i in model .cv_results_ [key ]):
110+ type = 'NUMERIC'
111+ else :
112+ values = list (set (model .cv_results_ [key ])) # unique values
113+ type = [str (i ) for i in values ]
114+ print (key + ": " + str (type ))
115+
116+ attribute = ("parameter_" + key [6 :], type )
117+ arff_dict ['attributes' ].append (attribute )
118+
119+ arff_dict ['data' ] = self .trace_content
120+ arff_dict ['relation' ] = 'openml_task_' + str (self .task_id ) + '_predictions'
121+
122+ return arff_dict
123+
77124 def publish (self ):
78125 """Publish a run to the OpenML server.
79126
@@ -84,10 +131,18 @@ def publish(self):
84131 -------
85132 self : OpenMLRun
86133 """
134+ if self .model is None :
135+ raise PyOpenMLError ("OpenMLRun obj does not contain a model. (This should never happen.) " );
136+
87137 predictions = arff .dumps (self ._generate_arff_dict ())
88138 description_xml = self ._create_description_xml ()
89- file_elements = {'predictions' : ("predictions.csv" , predictions ),
139+
140+ file_elements = {'predictions' : ("predictions.arff" , predictions ),
90141 'description' : ("description.xml" , description_xml )}
142+ if self .trace_content is not None :
143+ trace_arff = arff .dumps (self ._generate_trace_arff_dict (self .model ))
144+ file_elements ['trace' ] = ("trace.arff" , trace_arff )
145+
91146 return_code , return_value = _perform_api_call (
92147 "/run/" , file_elements = file_elements )
93148 run_id = int (xmltodict .parse (return_value )['oml:upload_run' ]['oml:run_id' ])
@@ -104,7 +159,11 @@ def _create_description_xml(self):
104159 """
105160 run_environment = _get_version_information ()
106161
107- parameter_settings = self .model .get_params ()
162+ # TODO: don't we have flow object in data structure? Use this one
163+ downloaded_flow = openml .flows .get_flow (self .flow_id )
164+
165+ openml_param_settings = _parse_parameters (self .model , downloaded_flow )
166+
108167 # as a tag, it must be of the form ([a-zA-Z0-9_\-\.])+
109168 # so we format time from 'mm/dd/yy hh:mm:ss' to 'mm-dd-yy_hh.mm.ss'
110169 well_formatted_time = time .strftime ("%c" ).replace (
@@ -113,11 +172,51 @@ def _create_description_xml(self):
113172 [self .model .__module__ + "." + self .model .__class__ .__name__ ]
114173 description = _to_dict (taskid = self .task_id , flow_id = self .flow_id ,
115174 setup_string = _create_setup_string (self .model ),
116- parameter_settings = parameter_settings ,
175+ parameter_settings = openml_param_settings ,
117176 tags = tags )
118177 description_xml = xmltodict .unparse (description , pretty = True )
119178 return description_xml
120179
def _parse_parameters(model, flow):
    """Extract all parameter settings from a model in OpenML format.

    Parameters
    ----------
    model
        The scikit-learn model (fitted).
    flow
        OpenML flow object (containing flow ids, i.e., it has to be
        downloaded from the server).

    Returns
    -------
    openml_param_settings : list of OrderedDict
        One ``oml:name`` / ``oml:value`` / ``oml:component`` mapping per
        parameter, including the parameters of nested sub-estimators.
    """
    python_param_settings = model.get_params()
    openml_param_settings = []

    for name, value in python_param_settings.items():
        if "__" in name:
            # Double-underscore keys belong to a nested sub-estimator; they
            # are collected by the recursive call below instead.
            continue
        if isinstance(value, BaseEstimator):
            # Recurse so the sub-estimator's parameters are attributed to the
            # matching sub-flow id.
            openml_param_settings += _parse_parameters(value, flow.components[name])

        # Record the parameter itself (the sub-estimator as well, since its
        # repr is still a meaningful value). Use this flow's own id directly:
        # the previous name->id lookup table could silently resolve to a
        # sub-flow's id when a sub-flow shared the parent flow's name.
        param_dict = OrderedDict()
        param_dict['oml:name'] = name
        param_dict['oml:value'] = str(value)
        param_dict['oml:component'] = flow.flow_id
        openml_param_settings.append(param_dict)

    return openml_param_settings
219+
121220################################################################################
122221# Functions which cannot be in runs/functions due to circular imports
123222
@@ -169,15 +268,7 @@ def _to_dict(taskid, flow_id, setup_string, parameter_settings, tags):
169268 description ['oml:run' ]['@xmlns:oml' ] = 'http://openml.org/openml'
170269 description ['oml:run' ]['oml:task_id' ] = taskid
171270 description ['oml:run' ]['oml:flow_id' ] = flow_id
172-
173- params = []
174- for k , v in parameter_settings .items ():
175- param_dict = OrderedDict ()
176- param_dict ['oml:name' ] = k
177- param_dict ['oml:value' ] = ('None' if v is None else v )
178- params .append (param_dict )
179-
180- description ['oml:run' ]['oml:parameter_setting' ] = params
271+ description ['oml:run' ]['oml:parameter_setting' ] = parameter_settings
181272 description ['oml:run' ]['oml:tag' ] = tags # Tags describing the run
182273 # description['oml:run']['oml:output_data'] = 0;
183274 # all data that was output of this run, which can be evaluation scores
0 commit comments