Skip to content

Commit ac2d82f

Browse files
authored
Merge pull request #254 from openml/runtests
Runtests
2 parents af39763 + 4f6115f commit ac2d82f

2 files changed

Lines changed: 62 additions & 64 deletions

File tree

openml/runs/functions.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -322,12 +322,23 @@ def _run_task_get_arffcontent(model, task, class_labels):
322322
# typically happens when training a regressor on classification task
323323
raise PyOpenMLError(str(e))
324324

325-
# extract trace
325+
# extract trace, if applicable
326326
if isinstance(model_fold, sklearn.model_selection._search.BaseSearchCV):
327327
arff_tracecontent.extend(_extract_arfftrace(model_fold, rep_no, fold_no))
328-
model_classes = model_fold.best_estimator_.classes_
328+
329+
# search for model classes_ (might differ depending on modeltype)
330+
# first, pipelines are a special case (these don't have a classes_
331+
# object, but rather borrow it from the last step. We do this manually,
332+
# because of the BaseSearch check)
333+
if isinstance(model_fold, sklearn.pipeline.Pipeline):
334+
used_estimator = model_fold.steps[-1][-1]
335+
else:
336+
used_estimator = model_fold
337+
338+
if isinstance(used_estimator, sklearn.model_selection._search.BaseSearchCV):
339+
model_classes = used_estimator.best_estimator_.classes_
329340
else:
330-
model_classes = model_fold.classes_
341+
model_classes = used_estimator.classes_
331342

332343
if can_measure_runtime:
333344
modelpredict_starttime = time.process_time()

tests/test_runs/test_run_functions.py

Lines changed: 48 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,12 @@
99
from openml.testing import TestBase
1010
from openml.runs.functions import _run_task_get_arffcontent
1111

12+
from sklearn.model_selection._search import BaseSearchCV
1213
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
1314
from sklearn.preprocessing.imputation import Imputer
1415
from sklearn.dummy import DummyClassifier
1516
from sklearn.preprocessing import StandardScaler
17+
from sklearn.feature_selection import VarianceThreshold
1618
from sklearn.linear_model import LogisticRegression, SGDClassifier, \
1719
LinearRegression
1820
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
@@ -99,6 +101,9 @@ def _perform_run(self, task_id, num_instances, clf, check_setup=True):
99101
#self.assertEquals(clf.get_params(), clf_prime.get_params())
100102
# self.assertEquals(clf, clf_prime)
101103

104+
downloaded = openml.runs.get_run(run_.run_id)
105+
assert('openml-python' in downloaded.tags)
106+
102107
return run
103108

104109
def test_run_regression_on_classif_task(self):
@@ -120,54 +125,49 @@ def test_check_erronous_sklearn_flow_fails(self, sklearn_to_flow_mock):
120125
self.assertRaisesRegexp(ValueError, "Penalty term must be positive; got \(C='abc'\)",
121126
openml.runs.run_task, task=task, model=clf)
122127

123-
def test_run_diabetes(self):
124-
task_id = 115
125-
num_instances = 768
126-
127-
clf = LogisticRegression()
128-
res = self._perform_run(task_id,num_instances, clf)
129-
130-
downloaded = openml.runs.get_run(res.run_id)
131-
assert('openml-python' in downloaded.tags)
132-
133-
def test_run_optimize_randomforest_diabetes(self):
134-
task_id = 119
135-
num_test_instances = 253
136-
num_folds = 1
137-
num_iterations = 5
138-
139-
clf = RandomForestClassifier(n_estimators=5)
140-
param_dist = {"max_depth": [3, None],
141-
"max_features": [1,2,3,4],
142-
"min_samples_split": [2,3,4,5,6,7,8,9,10],
143-
"min_samples_leaf": [1,2,3,4,5,6,7,8,9,10],
144-
"bootstrap": [True, False],
145-
"criterion": ["gini", "entropy"]}
146-
cv = StratifiedKFold(n_splits=3)
147-
random_search = RandomizedSearchCV(clf, param_dist, cv=cv,
148-
n_iter=num_iterations)
149-
150-
run = self._perform_run(task_id, num_test_instances, random_search)
151-
self.assertEqual(len(run.trace_content), num_iterations * num_folds)
152-
153-
res = self._check_serialized_optimized_run(run.run_id)
154-
self.assertTrue(res)
155-
156-
def test_run_optimize_bagging_diabetes(self):
157-
task_id = 119
158-
num_test_instances = 253
159-
num_folds = 1
160-
num_iterations = 9 # (num values for C times gamma)
161-
162-
bag = BaggingClassifier(base_estimator=SVC())
163-
param_dist = {"base_estimator__C": [0.01, 0.1, 10],
164-
"base_estimator__gamma": [0.01, 0.1, 10]}
165-
grid_search = GridSearchCV(bag, param_dist)
166-
167-
run = self._perform_run(task_id, num_test_instances, grid_search)
168-
self.assertEqual(len(run.trace_content), num_iterations * num_folds)
169-
res = self._check_serialized_optimized_run(run.run_id)
170-
self.assertTrue(res)
128+
def test_run_and_upload(self):
129+
task_id = 119 # diabetes dataset
130+
num_test_instances = 253 # 33% holdout task
131+
num_folds = 1 # because of holdout
132+
num_iterations = 5 # for base search classifiers
133+
134+
clfs = [LogisticRegression(),
135+
Pipeline(steps=(('scaler', StandardScaler(with_mean=False)),
136+
('dummy', DummyClassifier(strategy='prior')))),
137+
Pipeline(steps=[('Imputer', Imputer(strategy='median')),
138+
('VarianceThreshold', VarianceThreshold()),
139+
('Estimator', RandomizedSearchCV(DecisionTreeClassifier(),
140+
{'min_samples_split': [2 ** x for x in
141+
range(1, 7 + 1)],
142+
'min_samples_leaf': [2 ** x for x in
143+
range(0, 6 + 1)]},
144+
cv=3, n_iter=10))]),
145+
GridSearchCV(BaggingClassifier(base_estimator=SVC()),
146+
{"base_estimator__C": [0.01, 0.1, 10],
147+
"base_estimator__gamma": [0.01, 0.1, 10]}),
148+
RandomizedSearchCV(RandomForestClassifier(n_estimators=5),
149+
{"max_depth": [3, None],
150+
"max_features": [1, 2, 3, 4],
151+
"min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10],
152+
"min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
153+
"bootstrap": [True, False],
154+
"criterion": ["gini", "entropy"]},
155+
cv=StratifiedKFold(n_splits=2),
156+
n_iter=num_iterations)
157+
]
158+
159+
for clf in clfs:
160+
run = self._perform_run(task_id, num_test_instances, clf)
161+
if isinstance(clf, BaseSearchCV):
162+
if isinstance(clf, GridSearchCV):
163+
grid_iterations = 1
164+
for param in clf.param_grid:
165+
grid_iterations *= len(clf.param_grid[param])
166+
self.assertEqual(len(run.trace_content), grid_iterations * num_folds)
167+
else:
168+
self.assertEqual(len(run.trace_content), num_iterations * num_folds)
169+
check_res = self._check_serialized_optimized_run(run.run_id)
170+
self.assertTrue(check_res)
171171

172172
def test_run_with_classifiers_in_param_grid(self):
173173
task = openml.tasks.get_task(115)
@@ -180,19 +180,6 @@ def test_run_with_classifiers_in_param_grid(self):
180180
self.assertRaises(TypeError, openml.runs.run_task,
181181
task=task, model=clf, avoid_duplicate_runs=False)
182182

183-
def test_run_pipeline(self):
184-
task_id = 115
185-
num_instances = 768
186-
num_folds = 10
187-
num_iterations = 9 # (num values for C times gamma)
188-
189-
scaler = StandardScaler(with_mean=False)
190-
dummy = DummyClassifier(strategy='prior')
191-
model = Pipeline(steps=(('scaler', scaler), ('dummy', dummy)))
192-
193-
run = self._perform_run(task_id, num_instances, model)
194-
self.assertEqual(run.trace_content, None)
195-
196183
def test__run_task_get_arffcontent(self):
197184
task = openml.tasks.get_task(7)
198185
class_labels = task.class_labels

0 commit comments

Comments
 (0)