Skip to content

Commit f6b68d1

Browse files
authored
Merge branch 'develop' into add/#145
2 parents b9d8425 + 9a0d9a8 commit f6b68d1

9 files changed

Lines changed: 168 additions & 12 deletions

File tree

openml/__init__.py

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,52 @@
1818

1919
from .datasets import OpenMLDataset, OpenMLDataFeature
2020
from . import datasets
21+
from . import tasks
2122
from . import runs
2223
from . import flows
2324
from .runs import OpenMLRun
2425
from .tasks import OpenMLTask, OpenMLSplit
2526
from .flows import OpenMLFlow
2627

28+
__version__ = "0.3.0"  # package version string (PEP 396-style module attribute)
29+
30+
31+
def populate_cache(task_ids=None, dataset_ids=None, flow_ids=None,
                   run_ids=None):
    """Populate the local cache for offline and parallel usage.

    Each requested entity is fetched once through its ``get_*`` accessor,
    which stores the downloaded artifact in the OpenML cache directory as
    a side effect. Arguments left as ``None`` are skipped.

    Parameters
    ----------
    task_ids : iterable of int, optional
        Ids of the tasks to download and cache.

    dataset_ids : iterable of int, optional
        Ids of the datasets to download and cache.

    flow_ids : iterable of int, optional
        Ids of the flows to download and cache.

    run_ids : iterable of int, optional
        Ids of the runs to download and cache.

    Returns
    -------
    None
    """
    if task_ids is not None:
        for task_id in task_ids:
            tasks.functions.get_task(task_id)

    if dataset_ids is not None:
        for dataset_id in dataset_ids:
            datasets.functions.get_dataset(dataset_id)

    if flow_ids is not None:
        for flow_id in flow_ids:
            flows.functions.get_flow(flow_id)

    if run_ids is not None:
        for run_id in run_ids:
            runs.functions.get_run(run_id)
2765

28-
__version__ = "0.2.1"
2966

3067
__all__ = ['OpenMLDataset', 'OpenMLDataFeature', 'OpenMLRun',
3168
'OpenMLSplit', 'datasets', 'OpenMLTask', 'OpenMLFlow',
32-
'config', 'runs', 'flows']
69+
'config', 'runs', 'flows', 'tasks']

openml/_api_calls.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,9 @@ def _parse_server_exception(response):
110110
try:
111111
server_exception = xmltodict.parse(response.text)
112112
except:
113-
raise OpenMLServerError(('Status code: %d\n' % response.status_code) + response.text)
113+
raise OpenMLServerError(('Unexpected server error. Please '
114+
'contact the developers!\nStatus code: '
115+
'%d\n' % response.status_code) + response.text)
114116

115117
code = int(server_exception['oml:error']['oml:code'])
116118
message = server_exception['oml:error']['oml:message']

openml/flows/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from .flow import OpenMLFlow
2-
from .sklearn_converter import sklearn_to_flow, flow_to_sklearn
2+
3+
from .sklearn_converter import sklearn_to_flow, flow_to_sklearn, _check_n_jobs
34
from .functions import get_flow, list_flows, flow_exists, assert_flows_equal
45

56
__all__ = ['OpenMLFlow', 'create_flow_from_model', 'get_flow', 'list_flows',

openml/flows/sklearn_converter.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -544,6 +544,44 @@ def _serialize_cross_validator(o):
544544

545545
return ret
546546

547+
def _check_n_jobs(model):
    """Return True iff *model* is configured to run on a single core.

    openml-python can only measure runtimes fairly when the model runs on
    a single core, i.e. when every ``n_jobs`` (hyper)parameter equals 1.

    Parameters
    ----------
    model : sklearn.base.BaseEstimator or sklearn.model_selection._search.BaseSearchCV
        The model whose parameter settings are inspected.

    Returns
    -------
    bool
        True when runtimes can be measured (all ``n_jobs`` == 1).

    Raises
    ------
    ValueError
        If *model* is neither a BaseEstimator nor a BaseSearchCV.
    PyOpenMLError
        If ``n_jobs`` is part of the hyperparameter search space.
    """
    def check(param_dict, disallow_parameter=False):
        for param, value in param_dict.items():
            # n_jobs is the scikit-learn parameter for parallelizing jobs
            if param.split('__')[-1] == 'n_jobs':
                # 0 = illegal value (?), 1 = use one core, n = use n cores,
                # -1 = use all available cores -> this makes it hard to
                # measure runtime in a fair way
                if value != 1 or disallow_parameter:
                    return False
        return True

    if not (isinstance(model, sklearn.base.BaseEstimator) or
            isinstance(model, sklearn.model_selection._search.BaseSearchCV)):
        raise ValueError('model should be BaseEstimator or BaseSearchCV')

    # make sure that n_jobs is not in the parameter grid of the
    # optimization procedure
    if isinstance(model, sklearn.model_selection._search.BaseSearchCV):
        if isinstance(model, sklearn.model_selection.GridSearchCV):
            param_distributions = model.param_grid
        elif isinstance(model, sklearn.model_selection.RandomizedSearchCV):
            param_distributions = model.param_distributions
        else:
            # BUGFIX: the original left param_distributions as None here and
            # then crashed on None.items() inside check(). For an unknown
            # BaseSearchCV subclass we cannot inspect the search space, so
            # warn and treat it as empty instead.
            import warnings  # local import: file's top-level imports are not in view
            warnings.warn('Using a BaseSearchCV subclass other than '
                          'GridSearchCV or RandomizedSearchCV; its parameter '
                          'search space cannot be checked for n_jobs.')
            param_distributions = {}

        if not check(param_distributions, True):
            raise PyOpenMLError('openml-python should not be used to '
                                'optimize the n_jobs parameter.')

    # check the model's own parameters for n_jobs
    return check(model.get_params(), False)
547585

548586
def _deserialize_cross_validator(value, **kwargs):
549587
model_name = value['name']

openml/runs/functions.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
from ..exceptions import PyOpenMLError
1212
from .. import config
13-
from ..flows import sklearn_to_flow, get_flow, flow_exists
13+
from ..flows import sklearn_to_flow, get_flow, flow_exists, _check_n_jobs
1414
from ..setups import setup_exists
1515
from ..exceptions import OpenMLCacheException, OpenMLServerException
1616
from ..util import URLError, version_complies
@@ -160,6 +160,7 @@ def _run_task_get_arffcontent(model, task, class_labels):
160160
user_defined_measures = defaultdict(lambda: defaultdict(dict))
161161

162162
rep_no = 0
163+
can_measure_runtime = version_complies(3, 3) and _check_n_jobs(model)
163164
# TODO use different iterator to only provide a single iterator (less
164165
# methods, less maintenance, less confusion)
165166
for rep in task.iterate_repeats():
@@ -174,11 +175,11 @@ def _run_task_get_arffcontent(model, task, class_labels):
174175

175176
try:
176177
# for measuring runtime. Only available since Python 3.3
177-
if version_complies(3, 3):
178+
if can_measure_runtime:
178179
modelfit_starttime = time.process_time()
179180
model_fold.fit(trainX, trainY)
180181

181-
if version_complies(3, 3):
182+
if can_measure_runtime:
182183
modelfit_duration = (time.process_time() - modelfit_starttime) * 1000
183184
user_defined_measures['usercpu_time_millis_training'][rep_no][fold_no] = modelfit_duration
184185
except AttributeError as e:
@@ -192,12 +193,12 @@ def _run_task_get_arffcontent(model, task, class_labels):
192193
else:
193194
model_classes = model_fold.classes_
194195

195-
if version_complies(3, 3):
196+
if can_measure_runtime:
196197
modelpredict_starttime = time.process_time()
197198

198199
ProbaY = model_fold.predict_proba(testX)
199200
PredY = model_fold.predict(testX)
200-
if version_complies(3, 3):
201+
if can_measure_runtime:
201202
modelpredict_duration = (time.process_time() - modelpredict_starttime) * 1000
202203
user_defined_measures['usercpu_time_millis_testing'][rep_no][fold_no] = modelpredict_duration
203204
user_defined_measures['usercpu_time_millis'][rep_no][fold_no] = modelfit_duration + modelpredict_duration

openml/tasks/functions.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,11 @@ def _create_task_from_xml(xml):
235235
name = input_["@name"]
236236
inputs[name] = input_
237237

238+
evaluation_measures = None
239+
if 'evaluation_measures' in inputs:
240+
evaluation_measures = inputs["evaluation_measures"]["oml:evaluation_measures"]["oml:evaluation_measure"]
241+
242+
238243
# Convert some more parameters
239244
for parameter in \
240245
inputs["estimation_procedure"]["oml:estimation_procedure"][
@@ -251,5 +256,4 @@ def _create_task_from_xml(xml):
251256
"oml:type"],
252257
inputs["estimation_procedure"]["oml:estimation_procedure"][
253258
"oml:data_splits_url"], estimation_parameters,
254-
inputs["evaluation_measures"]["oml:evaluation_measures"][
255-
"oml:evaluation_measure"], None)
259+
evaluation_measures, None)

tests/test_flows/test_sklearn.py

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,10 @@
2525
import sklearn.tree
2626

2727
from openml.flows import OpenMLFlow, sklearn_to_flow, flow_to_sklearn
28+
2829
from openml.flows.functions import assert_flows_equal
29-
from openml.flows.sklearn_converter import _format_external_version, _check_dependencies
30+
from openml.flows.sklearn_converter import _format_external_version, \
31+
_check_dependencies, _check_n_jobs
3032
from openml.exceptions import PyOpenMLError
3133

3234
this_directory = os.path.dirname(os.path.abspath(__file__))
@@ -555,3 +557,36 @@ def test_illegal_parameter_names_featureunion(self):
555557
('OneHotEncoder', sklearn.preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore'))
556558
]
557559
self.assertRaises(ValueError, sklearn.pipeline.FeatureUnion, transformer_list=transformer_list)
560+
561+
def test_paralizable_check(self):
    """Check _check_n_jobs on single-/multi-core models and search spaces."""
    # passes the check as long as the param distribution is legal
    bagging_singlecore = sklearn.ensemble.BaggingClassifier()
    # returns False as long as the param distribution is legal
    bagging_multicore = sklearn.ensemble.BaggingClassifier(n_jobs=5)
    # searching over n_jobs must raise an exception
    illegal_param_dist = {"base__n_jobs": [-1, 0, 1]}
    # searching over other parameters is fine
    legal_param_dist = {"base__max_depth": [2, 3, 4]}

    legal_models = [
        sklearn.ensemble.RandomForestClassifier(),
        sklearn.ensemble.RandomForestClassifier(n_jobs=5),
        sklearn.ensemble.RandomForestClassifier(n_jobs=-1),
        sklearn.pipeline.Pipeline(
            steps=[('bag', sklearn.ensemble.BaggingClassifier(n_jobs=1))]),
        sklearn.pipeline.Pipeline(
            steps=[('bag', sklearn.ensemble.BaggingClassifier(n_jobs=5))]),
        sklearn.pipeline.Pipeline(
            steps=[('bag', sklearn.ensemble.BaggingClassifier(n_jobs=-1))]),
        sklearn.model_selection.GridSearchCV(bagging_singlecore,
                                             legal_param_dist),
        sklearn.model_selection.GridSearchCV(bagging_multicore,
                                             legal_param_dist),
    ]
    illegal_models = [
        sklearn.model_selection.GridSearchCV(bagging_singlecore,
                                             illegal_param_dist),
        sklearn.model_selection.GridSearchCV(bagging_multicore,
                                             illegal_param_dist),
    ]

    expected = [True, False, False, True, False, False, True, False]

    for model, answer in zip(legal_models, expected):
        self.assertEqual(_check_n_jobs(model), answer)

    for model in illegal_models:
        self.assertRaises(PyOpenMLError, _check_n_jobs, model)

tests/test_openml/__init__.py

Whitespace-only changes.

tests/test_openml/test_openml.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import sys
2+
3+
if sys.version_info[0] >= 3:
4+
from unittest import mock
5+
else:
6+
import mock
7+
8+
import six
9+
10+
from openml.testing import TestBase
11+
import openml
12+
13+
14+
class TestInit(TestBase):
    """Tests for the top-level helpers of the openml package."""

    @mock.patch('openml.tasks.functions.get_task')
    @mock.patch('openml.datasets.functions.get_dataset')
    @mock.patch('openml.flows.functions.get_flow')
    @mock.patch('openml.runs.functions.get_run')
    def test_populate_cache(self, run_mock, flow_mock, dataset_mock, task_mock):
        openml.populate_cache(task_ids=[1, 2], dataset_ids=[3, 4],
                              flow_ids=[5, 6], run_ids=[7, 8])

        # each getter must be called once per supplied id, with that id as
        # its sole positional argument
        expectations = [
            (task_mock, [(1,), (2,)]),
            (dataset_mock, [(3,), (4,)]),
            (flow_mock, [(5,), (6,)]),
            (run_mock, [(7,), (8,)]),
        ]
        for mocked, fixtures in expectations:
            self.assertEqual(mocked.call_count, 2)
            for argument, fixture in six.moves.zip(mocked.call_args_list,
                                                   fixtures):
                self.assertEqual(argument[0], fixture)
38+

0 commit comments

Comments
 (0)