Merge pull request #195 from openml/issue-177

mfeurer · web-flow · commit 32026b41d2ce · 2017-02-02T13:25:42.000+01:00
fix issue #177 by changing the order of executing the run
diff --git a/openml/runs/functions.py b/openml/runs/functions.py
@@ -39,14 +39,6 @@ def run_task(task, model):
     # TODO move this into its onwn module. While it somehow belongs here, it
     # adds quite a lot of functionality which is better suited in other places!
     # TODO why doesn't this accept a flow as input? - this would make this more flexible!
-    flow = sklearn_to_flow(model)
-    flow_id = flow._ensure_flow_exists()
-    if (flow_id < 0):
-        print("No flow")
-        return 0, 2
-    config.logger.info(flow_id)
-
-    arff_datacontent = []
 
     dataset = task.get_dataset()
     X, Y = dataset.get_data(target=task.target_name)
@@ -56,10 +48,21 @@ def run_task(task, model):
         raise ValueError('The task has no class labels. This method currently '
                          'only works for tasks with class labels.')
 
-    run = OpenMLRun(task_id=task.task_id, flow_id=flow_id,
-                    dataset_id=dataset.dataset_id, model=model)
+    # execute the run
+    run = OpenMLRun(task_id=task.task_id, flow_id=None, dataset_id=dataset.dataset_id, model=model)
     run.data_content, run.trace_content = _run_task_get_arffcontent(model, task, class_labels)
 
+    # now generate the flow
+    flow = sklearn_to_flow(model)
+    flow_id = flow._ensure_flow_exists()
+    if flow_id < 0:
+        print("No flow")
+        return 0, 2
+    config.logger.info(flow_id)
+
+    # attach the flow to the run
+    run.flow_id = flow_id
+
     return run
 
 
diff --git a/openml/runs/run.py b/openml/runs/run.py
@@ -133,6 +133,9 @@ def publish(self):
         """
         if self.model is None:
             raise PyOpenMLError("OpenMLRun obj does not contain a model. (This should never happen.) ");
+        if self.flow_id is None:
+            raise PyOpenMLError("OpenMLRun obj does not contain a flow id. (Should have been added while executing the task.) ");
+
 
         predictions = arff.dumps(self._generate_arff_dict())
         description_xml = self._create_description_xml()
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
@@ -1,11 +1,18 @@
-from sklearn.linear_model import LogisticRegression, SGDClassifier
+import sys
+
+from sklearn.linear_model import LogisticRegression, SGDClassifier, LinearRegression
 from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
 from sklearn.svm import SVC
-from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
+from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold
 import openml
 import openml.exceptions
 from openml.testing import TestBase
 
+if sys.version_info[0] >= 3:
+    from unittest import mock
+else:
+    import mock
+
 
 class TestRun(TestBase):
 
@@ -20,6 +27,23 @@ def _perform_run(self, task_id, num_instances, clf):
         self.assertEqual(len(run.data_content), num_instances)
         return run
 
+    def test_run_regression_on_classif_task(self):
+        task_id = 10107
+
+        clf = LinearRegression()
+        task = openml.tasks.get_task(task_id)
+        self.assertRaises(AttributeError, openml.runs.run_task, task=task, model=clf)
+
+    @mock.patch('openml.flows.sklearn_to_flow')
+    def test_check_erronous_sklearn_flow_fails(self, sklearn_to_flow_mock):
+        task_id = 10107
+        task = openml.tasks.get_task(task_id)
+
+        # Invalid parameter values
+        clf = LogisticRegression(C='abc')
+        self.assertEqual(sklearn_to_flow_mock.call_count, 0)
+        self.assertRaisesRegexp(ValueError, "Penalty term must be positive; got \(C='abc'\)",
+                                openml.runs.run_task, task=task, model=clf)
 
     def test_run_iris(self):
         task_id = 10107
@@ -28,34 +52,35 @@ def test_run_iris(self):
         clf = LogisticRegression()
         self._perform_run(task_id,num_instances, clf)
 
-
     def test_run_optimize_randomforest_iris(self):
         task_id = 10107
         num_instances = 150
         num_folds = 10
         num_iterations = 5
 
-        clf = RandomForestClassifier(n_estimators=10)
+        clf = RandomForestClassifier(n_estimators=5)
         param_dist = {"max_depth": [3, None],
                       "max_features": [1,2,3,4],
                       "min_samples_split": [2,3,4,5,6,7,8,9,10],
                       "min_samples_leaf": [1,2,3,4,5,6,7,8,9,10],
                       "bootstrap": [True, False],
                       "criterion": ["gini", "entropy"]}
-        random_search = RandomizedSearchCV(clf, param_dist,n_iter=num_iterations)
+        cv = StratifiedKFold(n_splits=3)
+        random_search = RandomizedSearchCV(clf, param_dist, cv=cv,
+                                           n_iter=num_iterations)
 
-        run = self._perform_run(task_id,num_instances, random_search)
+        run = self._perform_run(task_id, num_instances, random_search)
         self.assertEqual(len(run.trace_content), num_iterations * num_folds)
 
     def test_run_optimize_bagging_iris(self):
         task_id = 10107
         num_instances = 150
         num_folds = 10
-        num_iterations = 36 # (num values for C times gamma)
+        num_iterations = 9 # (num values for C times gamma)
 
         bag = BaggingClassifier(base_estimator=SVC())
-        param_dist = {"base_estimator__C": [0.001, 0.01, 0.1, 1, 10, 100],
-                      "base_estimator__gamma": [0.001, 0.01, 0.1, 1, 10, 100]}
+        param_dist = {"base_estimator__C": [0.01, 0.1, 10],
+                      "base_estimator__gamma": [0.01, 0.1, 10]}
         grid_search = GridSearchCV(bag, param_dist)
 
         run = self._perform_run(task_id, num_instances, grid_search)