Fix issue #177 by changing the order of execution in run_task (first build the model and generate predictions, then check the OpenML flow id)

janvanrijn · janvanrijn · commit a6d48b3b992c · 2017-02-01T15:10:40.000+01:00
diff --git a/openml/runs/functions.py b/openml/runs/functions.py
@@ -39,14 +39,6 @@ def run_task(task, model):
     # TODO move this into its onwn module. While it somehow belongs here, it
     # adds quite a lot of functionality which is better suited in other places!
     # TODO why doesn't this accept a flow as input? - this would make this more flexible!
-    flow = sklearn_to_flow(model)
-    flow_id = flow._ensure_flow_exists()
-    if (flow_id < 0):
-        print("No flow")
-        return 0, 2
-    config.logger.info(flow_id)
-
-    arff_datacontent = []
 
     dataset = task.get_dataset()
     X, Y = dataset.get_data(target=task.target_name)
@@ -56,10 +48,21 @@ def run_task(task, model):
         raise ValueError('The task has no class labels. This method currently '
                          'only works for tasks with class labels.')
 
-    run = OpenMLRun(task_id=task.task_id, flow_id=flow_id,
-                    dataset_id=dataset.dataset_id, model=model)
+    # execute the run
+    run = OpenMLRun(task_id=task.task_id, flow_id=None, dataset_id=dataset.dataset_id, model=model)
     run.data_content, run.trace_content = _run_task_get_arffcontent(model, task, class_labels)
 
+    # now generate the flow
+    flow = sklearn_to_flow(model)
+    flow_id = flow._ensure_flow_exists()
+    if flow_id < 0:
+        print("No flow")
+        return 0, 2
+    config.logger.info(flow_id)
+
+    # attach the flow to the run
+    run.flow_id = flow_id
+
     return run
 
 
diff --git a/openml/runs/run.py b/openml/runs/run.py
@@ -133,6 +133,9 @@ def publish(self):
         """
         if self.model is None:
             raise PyOpenMLError("OpenMLRun obj does not contain a model. (This should never happen.) ");
+        if self.flow_id is None:
+            raise PyOpenMLError("OpenMLRun obj does not contain a flow id. (Should have been added while executing the task.) ");
+
 
         predictions = arff.dumps(self._generate_arff_dict())
         description_xml = self._create_description_xml()
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
@@ -1,4 +1,4 @@
-from sklearn.linear_model import LogisticRegression, SGDClassifier
+from sklearn.linear_model import LogisticRegression, SGDClassifier, LinearRegression
 from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
 from sklearn.svm import SVC
 from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
@@ -20,6 +20,12 @@ def _perform_run(self, task_id, num_instances, clf):
         self.assertEqual(len(run.data_content), num_instances)
         return run
 
+    def test_run_regression_on_classif_task(self):
+        task_id = 10107
+
+        clf = LinearRegression()
+        task = openml.tasks.get_task(task_id)
+        self.assertRaises(AttributeError, openml.runs.run_task, task=task, model=clf)
 
     def test_run_iris(self):
         task_id = 10107
@@ -44,19 +50,19 @@ def test_run_optimize_randomforest_iris(self):
                       "criterion": ["gini", "entropy"]}
         random_search = RandomizedSearchCV(clf, param_dist,n_iter=num_iterations)
 
-        run = self._perform_run(task_id,num_instances, random_search)
+        run = self._perform_run(task_id, num_instances, random_search)
         self.assertEqual(len(run.trace_content), num_iterations * num_folds)
 
     def test_run_optimize_bagging_iris(self):
         task_id = 10107
         num_instances = 150
         num_folds = 10
-        num_iterations = 36 # (num values for C times gamma)
+        num_iterations = 16 # (num values for C times gamma)
 
         task = openml.tasks.get_task(task_id)
         bag = BaggingClassifier(base_estimator=SVC())
-        param_dist = {"base_estimator__C": [0.001, 0.01, 0.1, 1, 10, 100],
-                      "base_estimator__gamma": [0.001, 0.01, 0.1, 1, 10, 100]}
+        param_dist = {"base_estimator__C": [0.01, 0.1, 1, 10],
+                      "base_estimator__gamma": [0.01, 0.1, 1, 10]}
         grid_search = GridSearchCV(bag, param_dist)
 
         run = self._perform_run(task_id, num_instances, grid_search)