fixed parameter trace to correct openml format

janvanrijn · janvanrijn · commit 0f58378344cb · 2016-09-29T17:17:25.000+02:00
updated optimization library to "model_selection._search"
diff --git a/openml/runs/functions.py b/openml/runs/functions.py
@@ -2,7 +2,7 @@
 import io
 import os
 import xmltodict
-from sklearn.grid_search import BaseSearchCV
+from sklearn.model_selection._search import BaseSearchCV
 
 from .. import config
 from ..flows import create_flow_from_model
@@ -83,13 +83,16 @@ def _run_task_get_arffcontent(model, task, class_labels):
 
             model.fit(trainX, trainY)
             if isinstance(model, BaseSearchCV):
-                for itt_no in range(0, len(model.grid_scores_)):
-                    current = model.grid_scores_[itt_no]
+                for itt_no in range(0, len(model.cv_results_['mean_test_score'])):
                     # we use the string values for True and False, as it is defined in this way by the OpenML server
                     selected = 'false'
-                    if current.parameters == model.best_params_:
+                    if itt_no == model.best_index_:
                        selected = 'true'
-                    arff_line = [rep_no, fold_no, itt_no, current.parameters, current.mean_validation_score, selected]
+                    test_score = model.cv_results_['mean_test_score'][itt_no]
+                    arff_line = [rep_no, fold_no, itt_no, test_score, selected]
+                    for key in model.cv_results_:
+                        if key.startswith("param_"):
+                            arff_line.append(str(model.cv_results_[key][itt_no]))
                     arff_tracecontent.append(arff_line)
 
             ProbaY = model.predict_proba(testX)
diff --git a/openml/runs/run.py b/openml/runs/run.py
@@ -5,6 +5,7 @@
 import arff
 import xmltodict
 from sklearn.base import BaseEstimator
+from sklearn.model_selection._search import BaseSearchCV
 
 import openml
 from ..tasks import get_task
@@ -78,18 +79,48 @@ def _generate_arff_dict(self):
         arff_dict['relation'] = 'openml_task_' + str(task.task_id) + '_predictions'
         return arff_dict
 
-    def _generate_trace_arff_dict(self):
+    def _generate_trace_arff_dict(self, model):
+        """Generates the arff dictionary for uploading the optimization trace to the server.
+
+        Assumes that the run has been executed.
+
+        Returns
+        -------
+        arff_dict : dict
+            Dictionary representation of the ARFF file that will be uploaded.
+            Contains information about the optimization trace.
+        """
         if self.trace_content is None:
-            raise ValueError('No trace content avaiable. (This should never happen.)')
+            raise ValueError('No trace content avaiable.')
+        if not isinstance(model, BaseSearchCV):
+            raise PyOpenMLError('Cannot generate trace on provided classifier. (This should never happen.)')
+
         arff_dict = {}
         arff_dict['attributes'] = [('repeat', 'NUMERIC'),
                                    ('fold', 'NUMERIC'),
                                    ('iteration', 'NUMERIC'),
-                                   ('setup_string', 'STRING'),
                                    ('evaluation', 'NUMERIC'),
                                    ('selected', ['true', 'false'])]
+        for key in model.cv_results_:
+            if key.startswith("param_"):
+                type = 'STRING'
+                if all(isinstance(i, (bool)) for i in model.cv_results_[key]):
+                    type = ['True', 'False']
+                elif all(isinstance(i, (int, float)) for i in model.cv_results_[key]):
+                    type = 'NUMERIC'
+                else:
+                    values = list(set(model.cv_results_[key])) # unique values
+                    if len(values) < 100: # arbitrary number. make it an option?
+                        type = [str(i) for i in values]
+                    print(key + ": " + str(type))
+
+                attribute = ("parameter_" + key[6:], type)
+                arff_dict['attributes'].append(attribute)
+
         arff_dict['data'] = self.trace_content
         arff_dict['relation'] = 'openml_task_' + str(self.task_id) + '_predictions'
+
+        print(arff_dict)
         return arff_dict
 
     def publish(self):
@@ -111,7 +142,7 @@ def publish(self):
         file_elements = {'predictions': ("predictions.arff", predictions),
                          'description': ("description.xml", description_xml)}
         if self.trace_content is not None:
-            trace_arff = arff.dumps(self._generate_trace_arff_dict())
+            trace_arff = arff.dumps(self._generate_trace_arff_dict(self.model))
             file_elements['trace'] = ("trace.arff", trace_arff)
 
         return_code, return_value = _perform_api_call(
@@ -149,6 +180,16 @@ def _create_description_xml(self):
         return description_xml
 
 def _parse_parameters(model, flow):
+    """Extracts all parameter settings from a model in OpenML format.
+
+    Parameters
+    ----------
+    model
+        the scikit-learn model (fitted)
+    flow
+        openml flow object (containing flow ids, i.e., it has to be downloaded from the server)
+
+    """
     python_param_settings = model.get_params()
     openml_param_settings = []
     flow_dict = openml.flows.get_flow_dict(flow)
@@ -160,12 +201,13 @@ def _parse_parameters(model, flow):
         if isinstance(python_param_settings[param], BaseEstimator):
             # extract parameters of the subflow individually
             subflow = flow.components[param]
+            openml_param_settings += _parse_parameters(python_param_settings[param], subflow)
 
         # add parameter setting (also the subflow. Just because we can)
         param_dict = OrderedDict()
-        param_dict['oml:name'] = param;
-        param_dict['oml:value'] = str(python_param_settings[param]);
-        param_dict['oml:component'] = flow_dict[flow.name];
+        param_dict['oml:name'] = param
+        param_dict['oml:value'] = str(python_param_settings[param])
+        param_dict['oml:component'] = flow_dict[flow.name]
         openml_param_settings.append(param_dict)
 
     return openml_param_settings
diff --git a/tests/runs/test_run_functions.py b/tests/runs/test_run_functions.py
@@ -1,6 +1,6 @@
 from sklearn.linear_model import LogisticRegression, SGDClassifier
 from sklearn.ensemble import RandomForestClassifier
-from sklearn.grid_search import RandomizedSearchCV
+from sklearn.model_selection import RandomizedSearchCV
 import openml
 from openml.testing import TestBase
 
@@ -16,16 +16,18 @@ def test_run_iris(self):
 
     def test_run_optimize_randomforest_iris(self):
         task = openml.tasks.get_task(10107)
-        clf = RandomForestClassifier(n_estimators=5)
+        numIterations = 5
+
+
+        clf = RandomForestClassifier(n_estimators=numIterations)
 
         param_dist = {"max_depth": [3, None],
                       "max_features": [1,2,3,4],
                       "min_samples_split": [1,2,3,4,5,6,7,8,9,10],
                       "min_samples_leaf": [1,2,3,4,5,6,7,8,9,10],
                       "bootstrap": [True, False],
                       "criterion": ["gini", "entropy"]}
-        random_search = RandomizedSearchCV(clf, param_distributions=param_dist,n_iter=20)
-
+        random_search = RandomizedSearchCV(clf, param_dist,n_iter=20)
 
         run = openml.runs.run_task(task, random_search)
         run_ = run.publish()