improve testing of setup_exists

mfeurer · mfeurer · commit beaa04696fb4 · 2017-05-16T22:18:25.000+02:00
diff --git a/openml/runs/functions.py b/openml/runs/functions.py
@@ -7,7 +7,7 @@
 import warnings
 
 import numpy as np
-import sklearn
+import sklearn.pipeline
 import six
 import xmltodict
 
diff --git a/openml/setups/functions.py b/openml/setups/functions.py
@@ -34,9 +34,7 @@ def setup_exists(flow, model=None):
     if model is None:
         model = flow.model
     else:
-        converted_flow = sklearn_to_flow(model)
-        exists = flow_exists(converted_flow.name,
-                             converted_flow.external_version)
+        exists = flow_exists(flow.name, flow.external_version)
         if exists != flow.flow_id:
             raise ValueError('This should not happen!')
 
diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py
@@ -8,11 +8,9 @@
 
 from sklearn.ensemble import BaggingClassifier
 from sklearn.tree import DecisionTreeClassifier
-
-if sys.version_info[0] >= 3:
-    from unittest import mock
-else:
-    import mock
+from sklearn.linear_model import LogisticRegression
+from sklearn.naive_bayes import GaussianNB
+from sklearn.base import BaseEstimator, ClassifierMixin
 
 
 def get_sentinel():
@@ -26,6 +24,29 @@ def get_sentinel():
     return sentinel
 
 
+class ParameterFreeClassifier(BaseEstimator, ClassifierMixin):
+    def __init__(self):  # test helper: takes no constructor arguments
+        self.estimator = None  # replaced by a fitted tree in fit()
+
+    def fit(self, X, y):  # delegates the fit to a DecisionTreeClassifier
+        self.estimator = DecisionTreeClassifier()
+        self.estimator.fit(X, y)
+        self.classes_ = self.estimator.classes_  # mirror the inner labels
+        return self
+
+    def predict(self, X):  # forwards to the inner fitted tree
+        return self.estimator.predict(X)
+
+    def predict_proba(self, X):  # forwards to the inner fitted tree
+        return self.estimator.predict_proba(X)
+
+    def set_params(self, **params):  # nothing to set; params are ignored
+        return self  # scikit-learn contract: set_params returns the estimator
+
+    def get_params(self, deep=True):  # reports zero hyperparameters
+        return {}
+
+
 
 class TestRun(TestBase):
 
@@ -45,39 +66,40 @@ def test_nonexisting_setup_exists(self):
         self.assertFalse(setup_id)
 
     def test_existing_setup_exists(self):
-        # first publish a nonexiting flow
-
-        # because of the sentinel, we can not use flows that contain subflows
-        classif = DecisionTreeClassifier(max_depth=5,
-                                         min_samples_split=3,
-                                         # Not setting the random state will
-                                         # make this flow fail as running it
-                                         # will add a random random_state.
-                                         random_state=1)
-        flow = openml.flows.sklearn_to_flow(classif)
-        flow.name = 'TEST%s%s' % (get_sentinel(), flow.name)
-
-        # Replace the flow by a flow in which the ID got set up correctly
-        flow = flow.publish()
-        flow = openml.flows.get_flow(flow.flow_id)
-
-        # although the flow exists, we can be sure there are no
-        # setups (yet) as it hasn't been ran
-        setup_id = openml.setups.setup_exists(flow)
-        self.assertFalse(setup_id)
-
-        # now run the flow on an easy task:
-        task = openml.tasks.get_task(115) # diabetes
-        run = openml.runs.run_flow_on_task(task, flow)
-        # spoof flow id, otherwise the sentinel is ignored
-        run.flow_id = flow.flow_id
-        run = run.publish()
-        # download the run, as it contains the right setup id
-        run = openml.runs.get_run(run.run_id)
-
-        # execute the function we are interested in
-        setup_id = openml.setups.setup_exists(flow)
-        self.assertEquals(setup_id, run.setup_id)
+        clfs = [ParameterFreeClassifier(),           # zero hyperparameters
+                GaussianNB(),                        # one hyperparameter
+                DecisionTreeClassifier(max_depth=5,  # many hyperparameters
+                                       min_samples_split=3,
+                                       # Not setting the random state will
+                                       # make this flow fail as running it
+                                       # will add a random random_state.
+                                       random_state=1)]
+
+        for classif in clfs:
+            # first publish a nonexistent flow
+            flow = openml.flows.sklearn_to_flow(classif)
+            flow.name = 'TEST%s%s' % (get_sentinel(), flow.name)
+            flow.publish()
+
+            # although the flow exists, we can be sure there are no
+            # setups (yet) as it hasn't been run
+            setup_id = openml.setups.setup_exists(flow)
+            self.assertFalse(setup_id)
+            setup_id = openml.setups.setup_exists(flow, classif)
+            self.assertFalse(setup_id)
+
+            # now run the flow on an easy task:
+            task = openml.tasks.get_task(115)  # diabetes
+            run = openml.runs.run_flow_on_task(task, flow)
+            # spoof flow id, otherwise the sentinel is ignored
+            run.flow_id = flow.flow_id
+            run.publish()
+            # download the run, as it contains the right setup id
+            run = openml.runs.get_run(run.run_id)
+
+            # execute the function we are interested in
+            setup_id = openml.setups.setup_exists(flow)
+            self.assertEquals(setup_id, run.setup_id)
 
     def test_get_setup(self):
         # no setups in default test server