# Made for a course at Eindhoven University of Technology
# Author: Pieter Gijsbers
# Supervisor: Joaquin Vanschoren

from collections import OrderedDict
from openml.apiconnector import APIConnector  # openml 0.0.1.dev0
import sklearn    # scikit-learn 0.16.1
import pickle     # pickleshare 0.5
import arff       # liac-arff 2.1.1.dev0
import xmltodict  # xmltodict 0.9.2
import os
import sys
import time

15+
# This can possibly be done by a package such as pyxb, but I could not get it to work properly.
def construct_description_dictionary(taskid, flow_id, setup_string, parameter_settings, tags):
    """Create a dictionary mirroring the run-description XML desired by OpenML.

    Keyword arguments:
    taskid -- the identifier of the task
    flow_id -- the identifier of the flow used for this run
    setup_string -- a CLI string which can invoke the learning with the correct
                    parameter settings (currently not written into the description)
    parameter_settings -- a dict mapping parameter names to their values,
                          e.g. the result of classifier.get_params()
    tags -- an array of strings describing the run; each must conform to the
            regex ([a-zA-Z0-9_\\-\\.])+
    Returns: an OrderedDict that xmltodict.unparse() can serialize to 'oml:run' XML
    """
    description = OrderedDict()
    description['oml:run'] = OrderedDict()
    description['oml:run']['@xmlns:oml'] = 'http://openml.org/openml'
    description['oml:run']['oml:task_id'] = taskid
    description['oml:run']['oml:flow_id'] = flow_id

    # One <oml:parameter_setting> entry per parameter;
    # a None value is serialized as the string 'None'.
    params = [
        OrderedDict([('oml:name', name),
                     ('oml:value', 'None' if value is None else value)])
        for name, value in parameter_settings.items()
    ]

    description['oml:run']['oml:parameter_setting'] = params
    description['oml:run']['oml:tag'] = tags  # Tags describing the run
    # description['oml:run']['oml:output_data'] = 0  # all data that was output of this run,
    # which can be evaluation scores (though those are also calculated serverside);
    # must be of special data type
    return description
45+
def get_version_information():
    """Return version tags for Python, sklearn, NumPy and SciPy.

    Keyword arguments: -
    Returns: a list of four strings, each of the form '<Package>_<version>.'
    """
    import sklearn
    import scipy
    import numpy

    info = sys.version_info
    python_dotted = ".".join(str(part) for part in (info[0], info[1], info[2]))

    versions = [
        'Python_{}.'.format(python_dotted),
        'Sklearn_{}.'.format(sklearn.__version__),
        'NumPy_{}.'.format(numpy.__version__),
        'SciPy_{}.'.format(scipy.__version__),
    ]
    return versions
62+
def generate_arff(arff_datacontent, task):
    """Build the dictionary describing the predictions ARFF file for a run.

    Keyword arguments:
    - arff_datacontent, a list of lists containing, in order:
       - repeat (int)
       - fold (int)
       - test index (int)
       - predictions per task label (float)
       - predicted class label (string)
       - actual class label (string)
    - task, the OpenML task for which the run is done
    Returns: a dict with 'attributes', 'data', 'description' and 'relation'
             entries, suitable for arff.dump()
    """
    # The description records the environment (versions, timestamp) the run was made in.
    run_environment = get_version_information() + [time.strftime("%c")] + ['Created by openml_run()']
    class_labels = task.class_labels

    arff_dict = {}
    # lowercase 'numeric' gives an error
    arff_dict['attributes'] = [('repeat', 'NUMERIC'),
                               ('fold', 'NUMERIC'),
                               ('row_id', 'NUMERIC')] + \
        [('confidence.' + label, 'NUMERIC') for label in class_labels] + \
        [('prediction', class_labels),
         ('correct', class_labels)]
    arff_dict['data'] = arff_datacontent
    arff_dict['description'] = "\n".join(run_environment)
    arff_dict['relation'] = 'openml_task_' + str(task.task_id) + '_predictions'
    return arff_dict
89+
def create_description_xml(taskid, flow_id, classifier):
    """Generate the run-description XML for uploading a run to OpenML."""
    run_environment = get_version_information()
    setup_string = ''  # " ".join(sys.argv);

    parameter_settings = classifier.get_params()
    # As a tag, it must be of the form ([a-zA-Z0-9_\-\.])+
    # so we format time from 'mm/dd/yy hh:mm:ss' to 'mm-dd-yy_hh.mm.ss'
    well_formatted_time = (time.strftime("%c")
                           .replace(' ', '_')
                           .replace('/', '-')
                           .replace(':', '.'))
    classifier_name = classifier.__module__ + "." + classifier.__class__.__name__
    tags = run_environment + [well_formatted_time] + ['openml_run'] + [classifier_name]

    description = construct_description_dictionary(
        taskid, flow_id, setup_string, parameter_settings, tags)
    return xmltodict.unparse(description, pretty=True)
102+
def generate_flow_xml(classifier):
    """Generate the flow-description XML for the given classifier.

    The flow name is the fully qualified class name, and the external version
    records the sklearn version the flow was generated with.
    Returns: the flow XML as a string, without the leading encoding declaration.
    """
    import sklearn
    flow_dict = OrderedDict()
    flow_dict['oml:flow'] = OrderedDict()
    flow_dict['oml:flow']['@xmlns:oml'] = 'http://openml.org/openml'
    flow_dict['oml:flow']['oml:name'] = classifier.__module__ + "." + classifier.__class__.__name__
    flow_dict['oml:flow']['oml:external_version'] = 'Tsklearn_' + sklearn.__version__
    flow_dict['oml:flow']['oml:description'] = 'Flow generated by openml_run'

    # Only the parameter names are exported; data_type, default_value,
    # description and recommendedRange are omitted.
    # (v.__class__.__name__ is not used because it doesn't conform to the
    # standard, e.g. 'int' instead of 'integer'.)
    flow_parameters = [{'oml:name': name} for name in classifier.get_params()]

    flow_dict['oml:flow']['oml:parameter'] = flow_parameters

    flow_xml = xmltodict.unparse(flow_dict, pretty=True)

    # A flow may not be uploaded with the encoding specification,
    # so drop the first line ('<?xml version=... ?>').
    flow_xml = flow_xml.split('\n', 1)[-1]
    return flow_xml
128+
def ensure_flow_exists(connector, classifier):
    """Return the OpenML flow id for the given classifier, creating the flow if needed.

    First checks if a flow exists for the given classifier.
    If it does, then it will return the corresponding flow id.
    If it does not, then it will create a flow, and return the flow id
    of the newly created flow.

    Keyword arguments:
    connector -- OpenML APIConnector used to check for and upload flows
    classifier -- the (scikit-learn) classifier the flow describes
    Raises: NotImplementedError when the flow-existence check itself failed
    """
    import sklearn
    flow_name = classifier.__module__ + "." + classifier.__class__.__name__
    flow_version = 'Tsklearn_' + sklearn.__version__
    _, _, flow_id = connector.check_flow_exists(flow_name, flow_version)

    if int(flow_id) == -1:
        # Flow does not exist yet: write its XML to disk and upload it.
        flow_xml = generate_flow_xml(classifier)
        file_name = classifier.__class__.__name__ + '_flow.xml'
        abs_file_path = os.path.abspath(file_name)
        with open(abs_file_path, 'w') as fh:
            fh.write(flow_xml)

        # Read back with a context manager so the handle is closed
        # (the original left the file object open until garbage collection).
        with open(abs_file_path, 'rb') as fh:
            flow_binary = fh.read()
        return_code, response_xml = connector.upload_flow(flow_binary)

        response_dict = xmltodict.parse(response_xml)
        flow_id = response_dict['oml:upload_flow']['oml:id']
        return int(flow_id)

    elif int(flow_id) == -2:
        # Something went wrong retrieving the flow
        raise NotImplementedError('Error handling - check_flow_exists fail')

    return int(flow_id)
161+
def openml_run(task, classifier):
    """Performs a CV run on the dataset of the given task, using the split.

    Keyword arguments:
    task -- the OpenML task to run the classifier on; provides the dataset,
            the splits and an api_connector
    classifier -- a classifier which has a function fit(X,Y) and predict(X),
        all supervised estimators of scikit learn follow this definition of a classifier [1]
    [1](http://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html)

    Returns:
    predictions_path -- path of the ARFF file containing the per-fold predictions
    description_path -- path of the XML file describing the run
    (or the pair (0, 2) when no flow could be found or created)

    Side effects: writes '<runname>.arff', '<runname>.xml' and '<runname>.pkl'
    to the working directory and refits the classifier on the whole dataset.
    """
    flow_id = ensure_flow_exists(task.api_connector, classifier)
    if flow_id < 0:
        print("No flow")
        return 0, 2
    print(flow_id)

    split = task.api_connector.download_split(task)
    runname = "t" + str(task.task_id) + "_" + classifier.__class__.__name__
    nr_repeats = len(split.split)
    arff_datacontent = []

    dataset = task.get_dataset()
    class_labels = task.class_labels
    if class_labels is None:
        raise ValueError('The task has no class labels. This method currently only works for tasks with class labels.')

    train_times = []

    for r in range(0, nr_repeats):
        nr_folds = len(split.split[r])

        for f in range(0, nr_folds):
            start_time = time.time()
            TrainX, TrainY, TestX, TestY = task.get_train_and_test_set(f, r)
            # NOTE(review): only the fold is passed here, not the repeat, so the
            # test indices look identical across repeats — confirm against the
            # get_train_test_split_indices API.
            _, test_idx = task.get_train_test_split_indices(f)

            classifier.fit(TrainX, TrainY)
            ProbaY = classifier.predict_proba(TestX)
            PredY = classifier.predict(TestX)
            end_time = time.time()

            train_times.append(end_time - start_time)

            for i in range(0, len(test_idx)):
                arff_line = [r, f, test_idx[i], class_labels[PredY[i]], class_labels[TestY[i]]]
                # Splice the per-class confidences in between row_id and prediction.
                arff_line[3:3] = ProbaY[i]
                arff_datacontent.append(arff_line)

    # Generate a dictionary which represents the arff file (with predictions)
    arff_dict = generate_arff(arff_datacontent, task)
    predictions_path = runname + '.arff'
    with open(predictions_path, 'w') as fh:
        arff.dump(arff_dict, fh)

    description_xml = create_description_xml(task.task_id, flow_id, classifier)
    description_path = runname + '.xml'
    with open(description_path, 'w') as fh:
        fh.write(description_xml)

    # Retrain on all data to save the final model
    X, Y = dataset.get_dataset(target=dataset.default_target_attribute)
    classifier.fit(X, Y)

    # While serializing the model with joblib is often more efficient than pickle[1],
    # for now we use pickle[2].
    # [1] http://scikit-learn.org/stable/modules/model_persistence.html
    # [2] https://github.com/openml/python/issues/21 and correspondence with my supervisor
    # A context manager ensures the file is closed; the original leaked the
    # handle and assigned pickle.dump()'s return value, which is always None.
    with open(runname + '.pkl', "wb") as fh:
        pickle.dump(classifier, fh)

    # TODO (?) Return an OpenML run instead.
    return predictions_path, description_path
237+
def run_all(tasks, classifiers):
    """
    Calls openml_run(task, classifier) with all combinations of tasks and classifiers

    Keyword arguments:
    - tasks, a list of OpenML Task objects
    - classifiers, a list of (scikit learn) classifiers which fit the definition
      specified for function openml_run(task, classifier)

    Note: openml_run() itself writes the predictions ARFF and the description
    XML for each run, so no additional file handling is done here.
    """
    for task in tasks:
        # Getting the split through the task object is not yet possible in the OpenML API (17-12)
        for clf in classifiers:
            # The original body referenced the undefined names 'classifier' and
            # 'description_xml', called create_description_xml() without its
            # required flow_id/classifier arguments, and unpacked openml_run()'s
            # result as (clf, arff_dict) although it returns two file paths —
            # every iteration would have raised. openml_run() already writes all
            # output files, so simply invoking it per combination is sufficient.
            predictions_path, description_path = openml_run(task, clf)