Skip to content

Commit 5262272

Browse files
committed
Merge pull request #87 from amueller/flow_refactoring
Flow refactoring
2 parents 17fe0eb + 05bd462 commit 5262272

11 files changed

Lines changed: 154 additions & 152 deletions

File tree

openml/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,10 @@
2020
from . import datasets
2121
from .runs import OpenMLRun
2222
from .tasks import OpenMLTask, OpenMLSplit
23+
from .flows import OpenMLFlow
2324

2425

2526
__version__ = "0.2.1"
2627

2728
__all__ = ['APIConnector', 'OpenMLDataset', 'OpenMLRun', 'OpenMLSplit',
28-
'datasets', 'OpenMLTask']
29+
'datasets', 'OpenMLTask', 'OpenMLFlow']

openml/apiconnector.py

Lines changed: 0 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,8 @@
11
import logging
22
import os
33
import sys
4-
#import tempfile
54
import requests
65
import arff
7-
import xmltodict
86

97
if sys.version_info[0] < 3:
108
import ConfigParser as configparser
@@ -235,41 +233,3 @@ def _read_url(self, url, data=None):
235233

236234
response = requests.post(url, data=data)
237235
return response.status_code, response.text
238-
239-
# -> OpenMLFlow
240-
def upload_flow(self, description, flow):
241-
"""
242-
The 'description' is binary data of an XML file according to the XSD Schema (OUTDATED!):
243-
https://github.com/openml/website/blob/master/openml_OS/views/pages/rest_api/xsd/openml.implementation.upload.xsd
244-
245-
(optional) file_path is the absolute path to the file that is the flow (eg. a script)
246-
"""
247-
data = {'description': description, 'source': flow}
248-
return_code, dataset_xml = self._perform_api_call(
249-
"/flow/", data=data)
250-
return return_code, dataset_xml
251-
252-
# -> OpenMLFlow
253-
def check_flow_exists(self, name, version):
254-
"""Retrieves the flow id of the flow uniquely identified by name+version.
255-
256-
Returns flow id if such a flow exists,
257-
returns -1 if flow does not exists,
258-
returns -2 if there was not a well-formed response from the server
259-
http://www.openml.org/api_docs/#!/flow/get_flow_exists_name_version
260-
"""
261-
# Perhaps returns the -1/-2 business with proper raising of exceptions?
262-
263-
if not (type(name) is str and len(name) > 0):
264-
raise ValueError('Parameter \'name\' should be a non-empty string')
265-
if not (type(version) is str and len(version) > 0):
266-
raise ValueError('Parameter \'version\' should be a non-empty string')
267-
268-
return_code, xml_response = self._perform_api_call(
269-
"/flow/exists/%s/%s" % (name, version))
270-
if return_code != 200:
271-
# fixme raise appropriate error
272-
raise ValueError("api call failed: %s" % xml_response)
273-
xml_dict = xmltodict.parse(xml_response)
274-
flow_id = xml_dict['oml:flow_exists']['oml:id']
275-
return return_code, xml_response, flow_id

openml/datasets/functions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@ def download_dataset(api_connector, did):
207207
208208
Returns
209209
-------
210-
dataset : :class:`pyMetaLearn.entities.dataset.Dataset`
210+
dataset : :class:`openml.OpenMLDataset`
211211
The downloaded dataset."""
212212
try:
213213
did = int(did)

openml/entities/__init__.py

Whitespace-only changes.

openml/entities/flow.py

Lines changed: 0 additions & 68 deletions
This file was deleted.

openml/flows/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from .flow import OpenMLFlow, check_flow_exists
2+
3+
__all__ = ['OpenMLFlow', 'check_flow_exists']

openml/flows/flow.py

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
from collections import OrderedDict
2+
import xmltodict
3+
import sklearn
4+
5+
6+
class OpenMLFlow(object):
    """An OpenML flow wrapping a scikit-learn style model.

    A flow describes an algorithm/implementation on the OpenML server.
    The wrapped ``model`` must provide ``get_params()`` (all sklearn
    estimators do).

    Parameters
    ----------
    model : object
        The (scikit-learn) model this flow describes.
    id : int, optional
        Server-side flow id, if already known.
    uploader : str, optional
        Id/name of the uploader.
    description : str, optional
        Human-readable description sent to the server.
    creator : str, optional
    contributor : str, optional
    tag : str, optional
    """

    def __init__(self, model, id=None, uploader=None,
                 description='Flow generated by openml_run', creator=None,
                 contributor=None, tag=None):
        self.id = id
        # BUG FIX: attribute was misspelled 'self.upoader', silently
        # dropping the uploader argument.
        self.uploader = uploader
        self.description = description
        self.creator = creator
        # BUG FIX: 'contributor' was accepted but never stored.
        self.contributor = contributor
        self.tag = tag
        self.model = model
        self.source = "FIXME DEFINE PYTHON FLOW"
        # Fully qualified class name of the wrapped model, used as the
        # flow name on the server.
        self.name = (model.__module__ + "." +
                     model.__class__.__name__)
        # External version string; the 'T' prefix plus the sklearn
        # version uniquely identifies the implementation.
        self.external_version = 'Tsklearn_' + sklearn.__version__

    def generate_flow_xml(self):
        """Generate the XML description of this flow for upload.

        Returns
        -------
        str
            XML document (without the encoding declaration, which the
            server rejects) describing the flow and its parameters.
        """
        model = self.model
        flow_dict = OrderedDict()
        flow_dict['oml:flow'] = OrderedDict()
        flow_dict['oml:flow']['@xmlns:oml'] = 'http://openml.org/openml'
        flow_dict['oml:flow']['oml:name'] = self.name
        flow_dict['oml:flow']['oml:external_version'] = self.external_version
        flow_dict['oml:flow']['oml:description'] = self.description

        clf_params = model.get_params()
        flow_parameters = []
        for k, v in clf_params.items():
            # Could also send data_type, default_value, description,
            # recommendedRange. Not using v.__class__.__name__ as the
            # data type because it does not conform to the server's
            # standards (e.g. 'int' instead of 'integer').
            param_dict = {'oml:name': k}
            flow_parameters.append(param_dict)

        flow_dict['oml:flow']['oml:parameter'] = flow_parameters

        flow_xml = xmltodict.unparse(flow_dict, pretty=True)

        # A flow may not be uploaded with the encoding specification,
        # so strip the first line ('<?xml version=...?>').
        flow_xml = flow_xml.split('\n', 1)[-1]
        return flow_xml

    def publish(self, api_connector):
        """Upload this flow to the OpenML server.

        The description is an XML document according to the XSD schema
        (OUTDATED!):
        https://github.com/openml/website/blob/master/openml_OS/views/pages/rest_api/xsd/openml.implementation.upload.xsd

        Returns
        -------
        (int, str)
            HTTP return code and the server's XML response.
        """
        xml_description = self.generate_flow_xml()
        data = {'description': xml_description, 'source': self.source}
        return_code, return_value = api_connector._perform_api_call(
            "/flow/", data=data)
        return return_code, return_value

    def ensure_flow_exists(self, connector):
        """Return the server-side flow id, publishing the flow if needed.

        First checks whether a flow exists for the wrapped model; if it
        does, returns the corresponding flow id, otherwise publishes the
        flow and returns the id of the newly created flow.

        Returns
        -------
        int
            The flow id.
        """
        # Use the version computed once in __init__ instead of
        # re-importing sklearn and rebuilding the same string here.
        _, _, flow_id = check_flow_exists(connector, self.name,
                                          self.external_version)

        # check_flow_exists reports -1 when no such flow is registered.
        if int(flow_id) == -1:
            return_code, response_xml = self.publish(connector)
            response_dict = xmltodict.parse(response_xml)
            flow_id = response_dict['oml:upload_flow']['oml:id']

        return int(flow_id)
80+
81+
def check_flow_exists(api_connector, name, version):
    """Retrieve the flow id of the flow uniquely identified by name+version.

    Returns flow id if such a flow exists,
    returns -1 if flow does not exist.
    http://www.openml.org/api_docs/#!/flow/get_flow_exists_name_version

    Parameters
    ----------
    api_connector : APIConnector
        Connector used to perform the REST call.
    name : str
        Flow name (non-empty).
    version : str
        Flow external version string (non-empty).

    Returns
    -------
    (int, str, str)
        HTTP return code, raw XML response, and the flow id ('-1' if
        the flow does not exist).

    Raises
    ------
    ValueError
        If name/version are not non-empty strings, or the API call fails.
    """
    # isinstance instead of 'type(x) is str': idiomatic and accepts
    # str subclasses.
    if not isinstance(name, str) or len(name) == 0:
        raise ValueError('Parameter \'name\' should be a non-empty string')
    if not isinstance(version, str) or len(version) == 0:
        raise ValueError('Parameter \'version\' should be a non-empty string')

    return_code, xml_response = api_connector._perform_api_call(
        "/flow/exists/%s/%s" % (name, version))
    if return_code != 200:
        # fixme raise appropriate error
        raise ValueError("api call failed: %s" % xml_response)
    xml_dict = xmltodict.parse(xml_response)
    flow_id = xml_dict['oml:flow_exists']['oml:id']
    return return_code, xml_response, flow_id

openml/runs/run.py

Lines changed: 23 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import os
77

88

9-
from ..entities.flow import OpenMLFlow
9+
from ..flows import OpenMLFlow
1010
from ..exceptions import OpenMLCacheException
1111
from ..util import URLError
1212
from ..tasks import download_task
@@ -16,7 +16,7 @@ class OpenMLRun(object):
1616
def __init__(self, task_id, flow_id, setup_string, dataset_id, files=None,
1717
setup_id=None, tags=None, run_id=None, uploader=None,
1818
uploader_name=None, evaluations=None, data_content=None,
19-
classifier=None, task_type=None, task_evaluation_measure=None,
19+
model=None, task_type=None, task_evaluation_measure=None,
2020
flow_name=None, parameter_settings=None, predictions_url=None):
2121
self.run_id = run_id
2222
self.uploader = uploader
@@ -33,7 +33,7 @@ def __init__(self, task_id, flow_id, setup_string, dataset_id, files=None,
3333
self.predictions_url = predictions_url
3434
self.evaluations = evaluations
3535
self.data_content = data_content
36-
self.classifier = classifier
36+
self.model = model
3737

3838
def generate_arff(self, api_connector):
3939
"""Generates an arff
@@ -81,48 +81,49 @@ def create_description_xml(self):
8181
run_environment = get_version_information()
8282
setup_string = '' # " ".join(sys.argv);
8383

84-
parameter_settings = self.classifier.get_params()
84+
parameter_settings = self.model.get_params()
8585
# as a tag, it must be of the form ([a-zA-Z0-9_\-\.])+
8686
# so we format time from 'mm/dd/yy hh:mm:ss' to 'mm-dd-yy_hh.mm.ss'
8787
well_formatted_time = time.strftime("%c").replace(
8888
' ', '_').replace('/', '-').replace(':', '.')
8989
tags = run_environment + [well_formatted_time] + ['openml_run'] + \
90-
[self.classifier.__module__ + "." + self.classifier.__class__.__name__]
90+
[self.model.__module__ + "." + self.model.__class__.__name__]
9191
description = construct_description_dictionary(
9292
self.task_id, self.flow_id, setup_string, parameter_settings, tags)
9393
description_xml = xmltodict.unparse(description, pretty=True)
9494
return description_xml
9595

9696

97-
def openml_run(connector, task, classifier):
97+
def openml_run(connector, task, model):
9898
"""Performs a CV run on the dataset of the given task, using the split.
9999
100100
Parameters
101101
----------
102102
connector : APIConnector
103103
Openml APIConnector which is used to download the OpenML Task and Dataset
104104
taskid : int
105-
The integer identifier of the task to run the classifier on
106-
classifier : sklearn classifier
107-
a classifier which has a function fit(X,Y) and predict(X),
108-
all supervised estimators of scikit learn follow this definition of a classifier [1]
105+
The integer identifier of the task to run the model on
106+
model : sklearn model
107+
a model which has a function fit(X,Y) and predict(X),
108+
all supervised estimators of scikit learn follow this definition of a model [1]
109109
[1](http://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html)
110110
111111
112112
Returns
113113
-------
114-
classifier : sklearn classifier
115-
the classifier, trained on the whole dataset
114+
model : sklearn model
115+
the model, trained on the whole dataset
116116
arff-dict : dict
117117
a dictionary with an 'attributes' and 'data' entry for an arff file
118118
"""
119-
flow_id = OpenMLFlow.ensure_flow_exists(task.api_connector, classifier)
119+
flow = OpenMLFlow(model=model)
120+
flow_id = flow.ensure_flow_exists(task.api_connector)
120121
if(flow_id < 0):
121122
print("No flow")
122123
return 0, 2
123124
print(flow_id)
124125

125-
#runname = "t" + str(task.task_id) + "_" + str(classifier)
126+
#runname = "t" + str(task.task_id) + "_" + str(model)
126127
arff_datacontent = []
127128

128129
dataset = task.get_dataset()
@@ -132,7 +133,7 @@ def openml_run(connector, task, classifier):
132133
if class_labels is None:
133134
raise ValueError('The task has no class labels. This method currently '
134135
'only works for tasks with class labels.')
135-
setup_string = create_setup_string(classifier)
136+
setup_string = create_setup_string(model)
136137

137138
run = OpenMLRun(task.task_id, flow_id, setup_string, dataset.id)
138139

@@ -149,9 +150,9 @@ def openml_run(connector, task, classifier):
149150
testY = Y[test_indices]
150151

151152
start_time = time.time()
152-
classifier.fit(trainX, trainY)
153-
ProbaY = classifier.predict_proba(testX)
154-
PredY = classifier.predict(testX)
153+
model.fit(trainX, trainY)
154+
ProbaY = model.predict_proba(testX)
155+
PredY = model.predict(testX)
155156
end_time = time.time()
156157

157158
train_times.append(end_time - start_time)
@@ -166,7 +167,7 @@ def openml_run(connector, task, classifier):
166167
rep_no = rep_no + 1
167168

168169
run.data_content = arff_datacontent
169-
run.classifier = classifier.fit(X, Y)
170+
run.model = model.fit(X, Y)
170171
return run
171172

172173

@@ -213,10 +214,10 @@ def construct_description_dictionary(taskid, flow_id, setup_string,
213214
return description
214215

215216

216-
def create_setup_string(classifier):
217+
def create_setup_string(model):
217218
run_environment = " ".join(get_version_information())
218-
# fixme str(classifier) might contain (...)
219-
return run_environment + " " + str(classifier)
219+
# fixme str(model) might contain (...)
220+
return run_environment + " " + str(model)
220221

221222

222223
# This can possibly be done by a package such as pyxb, but I could not get

openml/tasks/task.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ def download_split(self):
9999
Parameters
100100
----------
101101
task_id : Task
102-
An entity of :class:`pyMetaLearn.entities.task.Task`.
102+
An entity of :class:`openml.OpenMLTask`.
103103
"""
104104
cached_split_file = os.path.join(
105105
_create_task_cache_dir(self.api_connector, self.task_id), "datasplits.arff")

0 commit comments

Comments
 (0)