99from openml .testing import TestBase
1010from openml .runs .functions import _run_task_get_arffcontent
1111
12+ from sklearn .model_selection ._search import BaseSearchCV
1213from sklearn .tree import DecisionTreeClassifier , ExtraTreeClassifier
1314from sklearn .preprocessing .imputation import Imputer
1415from sklearn .dummy import DummyClassifier
1516from sklearn .preprocessing import StandardScaler
17+ from sklearn .feature_selection import VarianceThreshold
1618from sklearn .linear_model import LogisticRegression , SGDClassifier , \
1719 LinearRegression
1820from sklearn .ensemble import RandomForestClassifier , BaggingClassifier
@@ -99,6 +101,9 @@ def _perform_run(self, task_id, num_instances, clf, check_setup=True):
99101 #self.assertEquals(clf.get_params(), clf_prime.get_params())
100102 # self.assertEquals(clf, clf_prime)
101103
104+ downloaded = openml .runs .get_run (run_ .run_id )
105+ assert ('openml-python' in downloaded .tags )
106+
102107 return run
103108
104109 def test_run_regression_on_classif_task (self ):
@@ -120,54 +125,49 @@ def test_check_erronous_sklearn_flow_fails(self, sklearn_to_flow_mock):
120125 self .assertRaisesRegexp (ValueError , "Penalty term must be positive; got \(C='abc'\)" ,
121126 openml .runs .run_task , task = task , model = clf )
122127
123- def test_run_diabetes (self ):
124- task_id = 115
125- num_instances = 768
126-
127- clf = LogisticRegression ()
128- res = self ._perform_run (task_id ,num_instances , clf )
129-
130- downloaded = openml .runs .get_run (res .run_id )
131- assert ('openml-python' in downloaded .tags )
132-
133- def test_run_optimize_randomforest_diabetes (self ):
134- task_id = 119
135- num_test_instances = 253
136- num_folds = 1
137- num_iterations = 5
138-
139- clf = RandomForestClassifier (n_estimators = 5 )
140- param_dist = {"max_depth" : [3 , None ],
141- "max_features" : [1 ,2 ,3 ,4 ],
142- "min_samples_split" : [2 ,3 ,4 ,5 ,6 ,7 ,8 ,9 ,10 ],
143- "min_samples_leaf" : [1 ,2 ,3 ,4 ,5 ,6 ,7 ,8 ,9 ,10 ],
144- "bootstrap" : [True , False ],
145- "criterion" : ["gini" , "entropy" ]}
146- cv = StratifiedKFold (n_splits = 3 )
147- random_search = RandomizedSearchCV (clf , param_dist , cv = cv ,
148- n_iter = num_iterations )
149-
150- run = self ._perform_run (task_id , num_test_instances , random_search )
151- self .assertEqual (len (run .trace_content ), num_iterations * num_folds )
152-
153- res = self ._check_serialized_optimized_run (run .run_id )
154- self .assertTrue (res )
155-
156- def test_run_optimize_bagging_diabetes (self ):
157- task_id = 119
158- num_test_instances = 253
159- num_folds = 1
160- num_iterations = 9 # (num values for C times gamma)
161-
162- bag = BaggingClassifier (base_estimator = SVC ())
163- param_dist = {"base_estimator__C" : [0.01 , 0.1 , 10 ],
164- "base_estimator__gamma" : [0.01 , 0.1 , 10 ]}
165- grid_search = GridSearchCV (bag , param_dist )
166-
167- run = self ._perform_run (task_id , num_test_instances , grid_search )
168- self .assertEqual (len (run .trace_content ), num_iterations * num_folds )
169- res = self ._check_serialized_optimized_run (run .run_id )
170- self .assertTrue (res )
128+ def test_run_and_upload (self ):  # sends every model in clfs through self._perform_run on one task and checks the trace length of search estimators
129+ task_id = 119 # diabetes dataset
130+ num_test_instances = 253 # 33% holdout task
131+ num_folds = 1 # because of holdout
132+ num_iterations = 5 # n_iter of the top-level RandomizedSearchCV below; also the per-fold trace length expected for it
133+
134+ clfs = [LogisticRegression (),  # plain estimator, no hyperparameter search
135+ Pipeline (steps = (('scaler' , StandardScaler (with_mean = False )),  # pipeline without a search step
136+ ('dummy' , DummyClassifier (strategy = 'prior' )))),
137+ Pipeline (steps = [('Imputer' , Imputer (strategy = 'median' )),  # pipeline whose last step is a randomized search
138+ ('VarianceThreshold' , VarianceThreshold ()),
139+ ('Estimator' , RandomizedSearchCV (DecisionTreeClassifier (),
140+ {'min_samples_split' : [2 ** x for x in
141+ range (1 , 7 + 1 )],  # 2, 4, ..., 128
142+ 'min_samples_leaf' : [2 ** x for x in
143+ range (0 , 6 + 1 )]},  # 1, 2, ..., 64
144+ cv = 3 , n_iter = 10 ))]),  # NOTE(review): n_iter=10 differs from num_iterations; the check below only inspects clf itself (the Pipeline) - confirm intended
145+ GridSearchCV (BaggingClassifier (base_estimator = SVC ()),  # top-level grid search: 3 x 3 = 9 candidates
146+ {"base_estimator__C" : [0.01 , 0.1 , 10 ],
147+ "base_estimator__gamma" : [0.01 , 0.1 , 10 ]}),
148+ RandomizedSearchCV (RandomForestClassifier (n_estimators = 5 ),  # top-level randomized search
149+ {"max_depth" : [3 , None ],
150+ "max_features" : [1 , 2 , 3 , 4 ],
151+ "min_samples_split" : [2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 ],
152+ "min_samples_leaf" : [1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 ],
153+ "bootstrap" : [True , False ],
154+ "criterion" : ["gini" , "entropy" ]},
155+ cv = StratifiedKFold (n_splits = 2 ),
156+ n_iter = num_iterations )  # must stay equal to num_iterations, see the else branch below
157+ ]
158+
159+ for clf in clfs :
160+ run = self ._perform_run (task_id , num_test_instances , clf )
161+ if isinstance (clf , BaseSearchCV ):  # trace checks apply only when clf itself is a search object
162+ if isinstance (clf , GridSearchCV ):
163+ grid_iterations = 1  # accumulates the product of the value counts of all grid parameters
164+ for param in clf .param_grid :  # assumes param_grid is a single dict, not a list of dicts
165+ grid_iterations *= len (clf .param_grid [param ])
166+ self .assertEqual (len (run .trace_content ), grid_iterations * num_folds )  # one trace entry per grid candidate and fold
167+ else :
168+ self .assertEqual (len (run .trace_content ), num_iterations * num_folds )  # NOTE(review): relies on every non-grid search using n_iter=num_iterations; clf.n_iter would be less brittle
169+ check_res = self ._check_serialized_optimized_run (run .run_id )  # helper defined outside this view; its result is only required to be truthy
170+ self .assertTrue (check_res )
171171
172172 def test_run_with_classifiers_in_param_grid (self ):
173173 task = openml .tasks .get_task (115 )
@@ -180,19 +180,6 @@ def test_run_with_classifiers_in_param_grid(self):
180180 self .assertRaises (TypeError , openml .runs .run_task ,
181181 task = task , model = clf , avoid_duplicate_runs = False )
182182
183- def test_run_pipeline (self ):
184- task_id = 115
185- num_instances = 768
186- num_folds = 10
187- num_iterations = 9 # (num values for C times gamma)
188-
189- scaler = StandardScaler (with_mean = False )
190- dummy = DummyClassifier (strategy = 'prior' )
191- model = Pipeline (steps = (('scaler' , scaler ), ('dummy' , dummy )))
192-
193- run = self ._perform_run (task_id , num_instances , model )
194- self .assertEqual (run .trace_content , None )
195-
196183 def test__run_task_get_arffcontent (self ):
197184 task = openml .tasks .get_task (7 )
198185 class_labels = task .class_labels
0 commit comments