44
55import arff
66import xmltodict
7+ from sklearn .base import BaseEstimator
8+ from sklearn .model_selection ._search import BaseSearchCV
79
10+ import openml
811from ..tasks import get_task
912from .._api_calls import _perform_api_call
10-
13+ from .. exceptions import PyOpenMLError
1114
1215class OpenMLRun (object ):
1316 """OpenML Run: result of running a model on an openml dataset.
@@ -17,10 +20,10 @@ class OpenMLRun(object):
1720 FIXME
1821
1922 """
20- def __init__ (self , task_id , flow_id , dataset_id , setup_string = None ,
23+ def __init__ (self , task_id , flow_id , dataset_id , setup_string = None ,
2124 files = None , setup_id = None , tags = None , uploader = None , uploader_name = None ,
2225 evaluations = None , detailed_evaluations = None ,
23- data_content = None , model = None , task_type = None ,
26+ data_content = None , trace_content = None , model = None , task_type = None ,
2427 task_evaluation_measure = None , flow_name = None ,
2528 parameter_settings = None , predictions_url = None , task = None ,
2629 flow = None , run_id = None ):
@@ -39,12 +42,14 @@ def __init__(self, task_id, flow_id, dataset_id, setup_string=None,
3942 self .evaluations = evaluations
4043 self .detailed_evaluations = detailed_evaluations
4144 self .data_content = data_content
45+ self .trace_content = trace_content
4246 self .task = task
4347 self .flow = flow
4448 self .run_id = run_id
49+ self .model = model
4550
4651 def _generate_arff_dict (self ):
47- """Generates the arff dictionary for upload to the server.
52+ """Generates the arff dictionary for uploading predictions to the server.
4853
4954 Assumes that the run has been executed.
5055
@@ -74,6 +79,49 @@ def _generate_arff_dict(self):
7479 arff_dict ['relation' ] = 'openml_task_' + str (task .task_id ) + '_predictions'
7580 return arff_dict
7681
82+ def _generate_trace_arff_dict (self , model ):
83+ """Generates the arff dictionary for uploading predictions to the server.
84+
85+ Assumes that the run has been executed.
86+
87+ Returns
88+ -------
89+ arf_dict : dict
90+ Dictionary representation of the ARFF file that will be uploaded.
91+ Contains information about the optimization trace.
92+ """
93+ if self .trace_content is None :
94+ raise ValueError ('No trace content avaiable.' )
95+ if not isinstance (model , BaseSearchCV ):
96+ raise PyOpenMLError ('Cannot generate trace on provided classifier. (This should never happen.)' )
97+
98+ arff_dict = {}
99+ arff_dict ['attributes' ] = [('repeat' , 'NUMERIC' ),
100+ ('fold' , 'NUMERIC' ),
101+ ('iteration' , 'NUMERIC' ),
102+ ('evaluation' , 'NUMERIC' ),
103+ ('selected' , ['true' , 'false' ])]
104+ for key in model .cv_results_ :
105+ if key .startswith ("param_" ):
106+ type = 'STRING'
107+ if all (isinstance (i , (bool )) for i in model .cv_results_ [key ]):
108+ type = ['True' , 'False' ]
109+ elif all (isinstance (i , (int , float )) for i in model .cv_results_ [key ]):
110+ type = 'NUMERIC'
111+ else :
112+ values = list (set (model .cv_results_ [key ])) # unique values
113+ if len (values ) < 100 : # arbitrary number. make it an option?
114+ type = [str (i ) for i in values ]
115+ print (key + ": " + str (type ))
116+
117+ attribute = ("parameter_" + key [6 :], type )
118+ arff_dict ['attributes' ].append (attribute )
119+
120+ arff_dict ['data' ] = self .trace_content
121+ arff_dict ['relation' ] = 'openml_task_' + str (self .task_id ) + '_predictions'
122+
123+ return arff_dict
124+
77125 def publish (self ):
78126 """Publish a run to the OpenML server.
79127
@@ -84,10 +132,18 @@ def publish(self):
84132 -------
85133 self : OpenMLRun
86134 """
135+ if self .model is None :
136+ raise PyOpenMLError ("OpenMLRun obj does not contain a model. (This should never happen.) " );
137+
87138 predictions = arff .dumps (self ._generate_arff_dict ())
88139 description_xml = self ._create_description_xml ()
89- file_elements = {'predictions' : ("predictions.csv" , predictions ),
140+
141+ file_elements = {'predictions' : ("predictions.arff" , predictions ),
90142 'description' : ("description.xml" , description_xml )}
143+ if self .trace_content is not None :
144+ trace_arff = arff .dumps (self ._generate_trace_arff_dict (self .model ))
145+ file_elements ['trace' ] = ("trace.arff" , trace_arff )
146+
91147 return_code , return_value = _perform_api_call (
92148 "/run/" , file_elements = file_elements )
93149 run_id = int (xmltodict .parse (return_value )['oml:upload_run' ]['oml:run_id' ])
@@ -104,7 +160,11 @@ def _create_description_xml(self):
104160 """
105161 run_environment = _get_version_information ()
106162
107- parameter_settings = self .model .get_params ()
163+ # TODO: don't we have flow object in data structure? Use this one
164+ downloaded_flow = openml .flows .get_flow (self .flow_id )
165+
166+ openml_param_settings = _parse_parameters (self .model , downloaded_flow )
167+
108168 # as a tag, it must be of the form ([a-zA-Z0-9_\-\.])+
109169 # so we format time from 'mm/dd/yy hh:mm:ss' to 'mm-dd-yy_hh.mm.ss'
110170 well_formatted_time = time .strftime ("%c" ).replace (
@@ -113,11 +173,44 @@ def _create_description_xml(self):
113173 [self .model .__module__ + "." + self .model .__class__ .__name__ ]
114174 description = _to_dict (taskid = self .task_id , flow_id = self .flow_id ,
115175 setup_string = _create_setup_string (self .model ),
116- parameter_settings = parameter_settings ,
176+ parameter_settings = openml_param_settings ,
117177 tags = tags )
118178 description_xml = xmltodict .unparse (description , pretty = True )
119179 return description_xml
120180
def _parse_parameters(model, flow):
    """Extracts all parameter settings from a model in OpenML format.

    Recurses into sub-estimators (components of the flow) so that their
    parameters are reported against the id of the corresponding sub-flow.

    Parameters
    ----------
    model
        the sci-kit learn model (fitted)
    flow
        openml flow object (containing flow ids, i.e., it has to be
        downloaded from the server)

    Returns
    -------
    openml_param_settings : list of OrderedDict
        One dict per parameter, with keys ``oml:name``, ``oml:value`` and
        ``oml:component`` (the id of the flow the parameter belongs to).
    """
    openml_param_settings = []
    # Maps flow names to their server-assigned flow ids.
    flow_dict = openml.flows.get_flow_dict(flow)

    for name, value in model.get_params().items():
        if "__" in name:
            # parameter of subflow; handled when recursing into the
            # corresponding component below
            continue
        if isinstance(value, BaseEstimator):
            # extract parameters of the subflow individually
            subflow = flow.components[name]
            openml_param_settings += _parse_parameters(value, subflow)

        # add parameter setting (also the subflow. Just because we can)
        param_dict = OrderedDict()
        param_dict['oml:name'] = name
        param_dict['oml:value'] = str(value)
        param_dict['oml:component'] = flow_dict[flow.name]
        openml_param_settings.append(param_dict)

    return openml_param_settings
213+
121214################################################################################
122215# Functions which cannot be in runs/functions due to circular imports
123216
@@ -169,15 +262,7 @@ def _to_dict(taskid, flow_id, setup_string, parameter_settings, tags):
169262 description ['oml:run' ]['@xmlns:oml' ] = 'http://openml.org/openml'
170263 description ['oml:run' ]['oml:task_id' ] = taskid
171264 description ['oml:run' ]['oml:flow_id' ] = flow_id
172-
173- params = []
174- for k , v in parameter_settings .items ():
175- param_dict = OrderedDict ()
176- param_dict ['oml:name' ] = k
177- param_dict ['oml:value' ] = ('None' if v is None else v )
178- params .append (param_dict )
179-
180- description ['oml:run' ]['oml:parameter_setting' ] = params
265+ description ['oml:run' ]['oml:parameter_setting' ] = parameter_settings
181266 description ['oml:run' ]['oml:tag' ] = tags # Tags describing the run
182267 # description['oml:run']['oml:output_data'] = 0;
183268 # all data that was output of this run, which can be evaluation scores
0 commit comments