Skip to content

Commit 242ed25

Browse files
committed
Added autorun script which contains the function openml_run, which lets a user automatically complete a task for a given sklearn classifier. For now, some tight restrictions apply. An example of how to use it is contained in openml_run_example.py.
1 parent 011b25b commit 242ed25

2 files changed

Lines changed: 285 additions & 0 deletions

File tree

openml/autorun.py

Lines changed: 255 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,255 @@
1+
# Made for a course at Eindhoven University of Technology
2+
# Author: Pieter Gijsbers
3+
# Supervisor: Joaquin Vanschoren
4+
5+
from collections import OrderedDict
6+
from openml.apiconnector import APIConnector # openml 0.0.1.dev0
7+
import sklearn # scikit-learn 0.16.1
8+
import pickle # pickleshare 0.5
9+
import arff # liac-arff 2.1.1.dev0
10+
import xmltodict # xmltodict 0.9.2
11+
import os
12+
import sys
13+
import time
14+
15+
16+
# This can possibly be done by a package such as pyxb, but I could not get it to work properly.
def construct_description_dictionary(taskid, flow_id, setup_string, parameter_settings, tags):
    """Create a dictionary corresponding to the run-description XML expected by OpenML.

    Keyword arguments:
    taskid -- the identifier of the task
    flow_id -- the identifier of the flow used for this run
    setup_string -- a CLI string which can invoke the learning with the correct
                    parameter settings (currently not included in the description)
    parameter_settings -- a dict mapping parameter names to their values
    tags -- an array of strings that give a description of the run; each must
            conform to the regex "([a-zA-Z0-9_\\-\\.])+"

    Returns: an OrderedDict representing the run description, suitable for
    serialization with xmltodict.unparse
    """
    description = OrderedDict()
    description['oml:run'] = OrderedDict()
    description['oml:run']['@xmlns:oml'] = 'http://openml.org/openml'
    description['oml:run']['oml:task_id'] = taskid
    description['oml:run']['oml:flow_id'] = flow_id

    params = []
    for name, value in parameter_settings.items():
        param_dict = OrderedDict()
        param_dict['oml:name'] = name
        # None cannot be serialized to XML text, so spell it out as 'None'.
        param_dict['oml:value'] = 'None' if value is None else value
        params.append(param_dict)

    description['oml:run']['oml:parameter_setting'] = params
    description['oml:run']['oml:tag'] = tags  # Tags describing the run
    # 'oml:output_data' (evaluation scores etc.) is omitted: it requires a
    # special data type, and evaluations are also calculated server-side.
    return description
45+
46+
def get_version_information():
    """Return version strings for Python, sklearn, numpy and scipy.

    Keyword arguments: -
    Returns: a list of four strings, one per package, each of the form
    '<Package>_<version>.'
    """
    import sklearn
    import scipy
    import numpy

    version_info = sys.version_info
    python_version = 'Python_{}.'.format(
        ".".join(str(part) for part in version_info[:3]))

    return [
        python_version,
        'Sklearn_{}.'.format(sklearn.__version__),
        'NumPy_{}.'.format(numpy.__version__),
        'SciPy_{}.'.format(scipy.__version__),
    ]
62+
63+
def generate_arff(arff_datacontent, task):
    """Build a dictionary representing the predictions ARFF file for a run.

    Keyword arguments:
    - arff_datacontent, a list of lists containing, in order:
        - repeat (int)
        - fold (int)
        - test index (int)
        - predictions per task label (float)
        - predicted class label (string)
        - actual class label (string)
    - task, the OpenML task for which the run is done

    Returns: a dict with 'attributes', 'data', 'description' and 'relation'
    entries, as expected by arff.dump().
    """
    run_environment = (get_version_information()
                       + [time.strftime("%c")]
                       + ['Created by openml_run()'])
    class_labels = task.class_labels

    # Uppercase 'NUMERIC' is required; lowercase 'numeric' gives an error.
    attributes = [('repeat', 'NUMERIC'),
                  ('fold', 'NUMERIC'),
                  ('row_id', 'NUMERIC')]
    # One confidence column per class label, in label order.
    attributes += [('confidence.' + label, 'NUMERIC') for label in class_labels]
    attributes += [('prediction', class_labels),
                   ('correct', class_labels)]

    arff_dict = {}
    arff_dict['attributes'] = attributes
    arff_dict['data'] = arff_datacontent
    arff_dict['description'] = "\n".join(run_environment)
    arff_dict['relation'] = 'openml_task_' + str(task.task_id) + '_predictions'
    return arff_dict
89+
90+
def create_description_xml(taskid, flow_id, classifier):
    """Create the run-description XML for uploading a run to OpenML.

    Keyword arguments:
    taskid -- the identifier of the task
    flow_id -- the identifier of the flow used for this run
    classifier -- the sklearn classifier used for the run

    Returns: a pretty-printed XML string describing the run
    """
    run_environment = get_version_information()
    setup_string = ''  # " ".join(sys.argv)

    parameter_settings = classifier.get_params()
    # As a tag, it must be of the form ([a-zA-Z0-9_\-\.])+,
    # so we format time from 'mm/dd/yy hh:mm:ss' to 'mm-dd-yy_hh.mm.ss'.
    well_formatted_time = (time.strftime("%c")
                           .replace(' ', '_')
                           .replace('/', '-')
                           .replace(':', '.'))
    classifier_name = classifier.__module__ + "." + classifier.__class__.__name__
    tags = run_environment + [well_formatted_time, 'openml_run', classifier_name]

    description = construct_description_dictionary(
        taskid, flow_id, setup_string, parameter_settings, tags)
    return xmltodict.unparse(description, pretty=True)
102+
103+
def generate_flow_xml(classifier):
    """Create the flow XML describing a sklearn classifier for OpenML.

    Keyword arguments:
    classifier -- the sklearn classifier to describe

    Returns: an XML string for the flow, without the XML encoding
    declaration (the server rejects flows uploaded with one)
    """
    import sklearn
    flow_dict = OrderedDict()
    flow_dict['oml:flow'] = OrderedDict()
    flow_dict['oml:flow']['@xmlns:oml'] = 'http://openml.org/openml'
    flow_dict['oml:flow']['oml:name'] = (classifier.__module__ + "."
                                         + classifier.__class__.__name__)
    flow_dict['oml:flow']['oml:external_version'] = 'Tsklearn_' + sklearn.__version__
    flow_dict['oml:flow']['oml:description'] = 'Flow generated by openml_run'

    # Only the parameter names are exported. data_type, default_value,
    # description and recommendedRange are omitted: the sklearn type names
    # (e.g. 'int') do not conform to the OpenML standard (e.g. 'integer').
    flow_dict['oml:flow']['oml:parameter'] = [
        {'oml:name': name} for name in classifier.get_params()
    ]

    flow_xml = xmltodict.unparse(flow_dict, pretty=True)

    # A flow may not be uploaded with the encoding specification,
    # so drop the first line ('<?xml version=... ?>').
    return flow_xml.split('\n', 1)[-1]
128+
129+
def ensure_flow_exists(connector, classifier):
    """Return the flow id for the given classifier, creating the flow if needed.

    First checks if a flow exists for the given classifier.
    If it does, then it will return the corresponding flow id.
    If it does not, then it will create a flow, and return the flow id
    of the newly created flow.

    Keyword arguments:
    connector -- OpenML APIConnector used to check for and upload flows
    classifier -- the sklearn classifier for which a flow must exist

    Returns: the integer flow id

    Raises: NotImplementedError when the server-side existence check fails
    (check_flow_exists returns -2).
    """
    import sklearn
    flow_name = classifier.__module__ + "." + classifier.__class__.__name__
    flow_version = 'Tsklearn_' + sklearn.__version__
    _, _, flow_id = connector.check_flow_exists(flow_name, flow_version)

    if int(flow_id) == -1:
        # Flow does not exist yet: create and upload it. The XML is also
        # written to disk so the user can inspect what was uploaded.
        flow_xml = generate_flow_xml(classifier)
        file_name = classifier.__class__.__name__ + '_flow.xml'
        abs_file_path = os.path.abspath(file_name)
        with open(abs_file_path, 'w') as fh:
            fh.write(flow_xml)

        # Use a context manager here too: the original opened the file for
        # reading without ever closing the handle (resource leak).
        with open(abs_file_path, 'rb') as fh:
            flow_binary = fh.read()
        return_code, response_xml = connector.upload_flow(flow_binary)

        response_dict = xmltodict.parse(response_xml)
        flow_id = response_dict['oml:upload_flow']['oml:id']
        return int(flow_id)

    elif int(flow_id) == -2:
        # Something went wrong retrieving the flow
        raise NotImplementedError('Error handling - check_flow_exists fail')

    return int(flow_id)
161+
162+
def openml_run(task, classifier):
    """Perform a CV run on the dataset of the given task, using its split.

    Keyword arguments:
    task -- the OpenML Task to run the classifier on; its api_connector is
            used to download the split
    classifier -- a classifier which has fit(X, Y), predict(X) and
        predict_proba(X); all supervised estimators of scikit-learn follow
        this definition of a classifier [1]
        [1](http://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html)

    Side effects: writes '<runname>.arff' (fold predictions),
    '<runname>.xml' (run description) and '<runname>.pkl' (the classifier
    retrained on the whole dataset) to the current working directory.

    Returns:
    predictions_path -- path of the ARFF file with the fold predictions
    description_path -- path of the XML run description
    (or (0, 2) when no flow could be obtained)

    Raises: ValueError when the task has no class labels.
    """
    flow_id = ensure_flow_exists(task.api_connector, classifier)
    if flow_id < 0:
        # NOTE(review): returning (0, 2) does not match the documented return
        # values; kept for backward compatibility, but raising would be clearer.
        print("No flow")
        return 0, 2
    print(flow_id)

    split = task.api_connector.download_split(task)
    runname = "t" + str(task.task_id) + "_" + classifier.__class__.__name__
    arff_datacontent = []

    dataset = task.get_dataset()
    class_labels = task.class_labels
    if class_labels is None:
        raise ValueError('The task has no class labels. This method currently only works for tasks with class labels.')

    train_times = []

    for repeat in range(len(split.split)):
        for fold in range(len(split.split[repeat])):
            start_time = time.time()
            train_x, train_y, test_x, test_y = task.get_train_and_test_set(fold, repeat)
            # NOTE(review): only the fold is passed here; presumably the repeat
            # should be forwarded as well -- verify against the task API.
            _, test_idx = task.get_train_test_split_indices(fold)

            classifier.fit(train_x, train_y)
            proba_y = classifier.predict_proba(test_x)
            pred_y = classifier.predict(test_x)
            end_time = time.time()

            train_times.append(end_time - start_time)

            for i in range(len(test_idx)):
                arff_line = [repeat, fold, test_idx[i],
                             class_labels[pred_y[i]], class_labels[test_y[i]]]
                # Splice the per-class confidences in between row_id and prediction.
                arff_line[3:3] = proba_y[i]
                arff_datacontent.append(arff_line)

    # Generate a dictionary which represents the arff file (with predictions)
    arff_dict = generate_arff(arff_datacontent, task)
    predictions_path = runname + '.arff'
    with open(predictions_path, 'w') as fh:
        arff.dump(arff_dict, fh)

    description_xml = create_description_xml(task.task_id, flow_id, classifier)
    description_path = runname + '.xml'
    with open(description_path, 'w') as fh:
        fh.write(description_xml)

    # Retrain on all data to save the final model
    X, Y = dataset.get_dataset(target=dataset.default_target_attribute)
    classifier.fit(X, Y)

    # While serializing the model with joblib is often more efficient than pickle[1],
    # for now we use pickle[2].
    # [1] http://scikit-learn.org/stable/modules/model_persistence.html
    # [2] https://github.com/openml/python/issues/21 and correspondence with my supervisor
    # Use a context manager so the pickle file handle is closed; the original
    # left it open and assigned pickle.dump's None return value to a variable.
    with open(runname + '.pkl', 'wb') as fh:
        pickle.dump(classifier, fh)

    # TODO (?) Return an OpenML run instead.
    return predictions_path, description_path
237+
238+
def run_all(tasks, classifiers):
    """Call openml_run(task, classifier) with all combinations of tasks and classifiers.

    Keyword arguments:
    - tasks, a list of OpenML Task objects
    - classifiers, a list of (scikit learn) classifiers which fit the definition specified for function openml_run(task, classifier)
    """
    for task in tasks:
        # Getting the split through the task object is not yet possible in the OpenML API (17-12)
        for clf in classifiers:
            # openml_run already writes both the predictions ARFF and the
            # run-description XML, so nothing else is needed here.
            # (The original body referenced the undefined names 'classifier'
            # and 'description_xml', called create_description_xml with the
            # wrong number of arguments, and unpacked openml_run's return
            # value (predictions_path, description_path) incorrectly.)
            openml_run(task, clf)

openml_run_example.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
from openml.apiconnector import APIConnector
from openml.autorun import openml_run
from sklearn import ensemble
import xmltodict
import os
"""
An example of an automated machine learning experiment using openml_run
"""

# Read the user's OpenML API key from a local file.
key_file_path = "apikey.txt"
with open(key_file_path, 'r') as fh:
    key = fh.readline()

task_id = 59

# Run a random forest on the chosen task and write the result files.
clf = ensemble.RandomForestClassifier()
connector = APIConnector(apikey=key)
task = connector.download_task(task_id)

prediction_path, description_path = openml_run(task, clf)

# upload_run expects absolute paths to the generated files.
prediction_abspath = os.path.abspath(prediction_path)
description_abspath = os.path.abspath(description_path)

return_code, response = connector.upload_run(prediction_abspath, description_abspath)

if return_code == 200:
    response_dict = xmltodict.parse(response.content)
    run_id = response_dict['oml:upload_run']['oml:run_id']
    print("Uploaded run with id %s" % (run_id))

0 commit comments

Comments
 (0)