Merge pull request #181 from openml/feature/upload-run

mfeurer · web-flow · commit a2c1e7c103de · 2017-02-01T11:15:45.000+01:00
Feature/upload run
diff --git a/openml/flows/functions.py b/openml/flows/functions.py
@@ -25,4 +25,4 @@ def get_flow(flow_id):
     if 'sklearn' in flow.external_version:
         flow.model = flow_to_sklearn(flow)
 
-    return flow
+    return flow
diff --git a/openml/runs/functions.py b/openml/runs/functions.py
@@ -2,6 +2,7 @@
 import io
 import os
 import xmltodict
+from sklearn.model_selection._search import BaseSearchCV
 
 from .. import config
 from ..flows import sklearn_to_flow
@@ -56,19 +57,16 @@ def run_task(task, model):
                          'only works for tasks with class labels.')
 
     run = OpenMLRun(task_id=task.task_id, flow_id=flow_id,
-                    dataset_id=dataset.dataset_id)
-    run.data_content = _run_task_get_arffcontent(model, task, class_labels)
+                    dataset_id=dataset.dataset_id, model=model)
+    run.data_content, run.trace_content = _run_task_get_arffcontent(model, task, class_labels)
 
-    # The model will not be uploaded at the moment, but used to get the
-    # hyperparameter values when uploading the run
-    X, Y = task.get_X_and_y()
-    run.model = model.fit(X, Y)
     return run
 
 
 def _run_task_get_arffcontent(model, task, class_labels):
     X, Y = task.get_X_and_y()
     arff_datacontent = []
+    arff_tracecontent = []
 
     rep_no = 0
     # TODO use different iterator to only provide a single iterator (less
@@ -83,6 +81,10 @@ def _run_task_get_arffcontent(model, task, class_labels):
             testY = Y[test_indices]
 
             model.fit(trainX, trainY)
+            if isinstance(model, BaseSearchCV):
+                _add_results_to_arfftrace(arff_tracecontent, fold_no, model,
+                                          rep_no)
+
             ProbaY = model.predict_proba(testX)
             PredY = model.predict(testX)
 
@@ -96,7 +98,24 @@ def _run_task_get_arffcontent(model, task, class_labels):
             fold_no = fold_no + 1
         rep_no = rep_no + 1
 
-    return arff_datacontent
+    if not isinstance(model, BaseSearchCV):
+        arff_tracecontent = None
+
+    return arff_datacontent, arff_tracecontent
+
+
+def _add_results_to_arfftrace(arff_tracecontent, fold_no, model, rep_no):
+    for itt_no in range(0, len(model.cv_results_['mean_test_score'])):
+        # use the strings 'true'/'false' instead of booleans, as this is the format defined by the OpenML server
+        selected = 'false'
+        if itt_no == model.best_index_:
+            selected = 'true'
+        test_score = model.cv_results_['mean_test_score'][itt_no]
+        arff_line = [rep_no, fold_no, itt_no, test_score, selected]
+        for key in model.cv_results_:
+            if key.startswith("param_"):
+                arff_line.append(str(model.cv_results_[key][itt_no]))
+        arff_tracecontent.append(arff_line)
 
 
 def get_runs(run_ids):
diff --git a/openml/runs/run.py b/openml/runs/run.py
@@ -4,10 +4,13 @@
 
 import arff
 import xmltodict
+from sklearn.base import BaseEstimator
+from sklearn.model_selection._search import BaseSearchCV
 
+import openml
 from ..tasks import get_task
 from .._api_calls import _perform_api_call
-
+from ..exceptions import PyOpenMLError
 
 class OpenMLRun(object):
     """OpenML Run: result of running a model on an openml dataset.
@@ -17,10 +20,10 @@ class OpenMLRun(object):
     FIXME
 
     """
-    def __init__(self, task_id, flow_id, dataset_id, setup_string=None, 
+    def __init__(self, task_id, flow_id, dataset_id, setup_string=None,
                  files=None, setup_id=None, tags=None, uploader=None, uploader_name=None,
                  evaluations=None, detailed_evaluations=None,
-                 data_content=None, model=None, task_type=None,
+                 data_content=None, trace_content=None, model=None, task_type=None,
                  task_evaluation_measure=None, flow_name=None,
                  parameter_settings=None, predictions_url=None, task=None,
                  flow=None, run_id=None):
@@ -39,12 +42,14 @@ def __init__(self, task_id, flow_id, dataset_id, setup_string=None,
         self.evaluations = evaluations
         self.detailed_evaluations = detailed_evaluations
         self.data_content = data_content
+        self.trace_content = trace_content
         self.task = task
         self.flow = flow
         self.run_id = run_id
+        self.model = model
 
     def _generate_arff_dict(self):
-        """Generates the arff dictionary for upload to the server.
+        """Generates the arff dictionary for uploading predictions to the server.
 
         Assumes that the run has been executed.
 
@@ -74,6 +79,48 @@ def _generate_arff_dict(self):
         arff_dict['relation'] = 'openml_task_' + str(task.task_id) + '_predictions'
         return arff_dict
 
+    def _generate_trace_arff_dict(self, model):
+        """Generates the arff dictionary for uploading the optimization trace to the server.
+
+        Assumes that the run has been executed.
+
+        Returns
+        -------
+        arff_dict : dict
+            Dictionary representation of the ARFF file that will be uploaded.
+            Contains information about the optimization trace.
+        """
+        if self.trace_content is None:
+            raise ValueError('No trace content avaiable.')
+        if not isinstance(model, BaseSearchCV):
+            raise PyOpenMLError('Cannot generate trace on provided classifier. (This should never happen.)')
+
+        arff_dict = {}
+        arff_dict['attributes'] = [('repeat', 'NUMERIC'),
+                                   ('fold', 'NUMERIC'),
+                                   ('iteration', 'NUMERIC'),
+                                   ('evaluation', 'NUMERIC'),
+                                   ('selected', ['true', 'false'])]
+        for key in model.cv_results_:
+            if key.startswith("param_"):
+                type = 'STRING'
+                if all(isinstance(i, (bool)) for i in model.cv_results_[key]):
+                    type = ['True', 'False']
+                elif all(isinstance(i, (int, float)) for i in model.cv_results_[key]):
+                    type = 'NUMERIC'
+                else:
+                    values = list(set(model.cv_results_[key])) # unique values
+                    type = [str(i) for i in values]
+                    print(key + ": " + str(type))
+
+                attribute = ("parameter_" + key[6:], type)
+                arff_dict['attributes'].append(attribute)
+
+        arff_dict['data'] = self.trace_content
+        arff_dict['relation'] = 'openml_task_' + str(self.task_id) + '_predictions'
+
+        return arff_dict
+
     def publish(self):
         """Publish a run to the OpenML server.
 
@@ -84,10 +131,18 @@ def publish(self):
         -------
         self : OpenMLRun
         """
+        if self.model is None:
+            raise PyOpenMLError("OpenMLRun obj does not contain a model. (This should never happen.) ");
+
         predictions = arff.dumps(self._generate_arff_dict())
         description_xml = self._create_description_xml()
-        file_elements = {'predictions': ("predictions.csv", predictions),
+
+        file_elements = {'predictions': ("predictions.arff", predictions),
                          'description': ("description.xml", description_xml)}
+        if self.trace_content is not None:
+            trace_arff = arff.dumps(self._generate_trace_arff_dict(self.model))
+            file_elements['trace'] = ("trace.arff", trace_arff)
+
         return_code, return_value = _perform_api_call(
             "/run/", file_elements=file_elements)
         run_id = int(xmltodict.parse(return_value)['oml:upload_run']['oml:run_id'])
@@ -104,7 +159,11 @@ def _create_description_xml(self):
         """
         run_environment = _get_version_information()
 
-        parameter_settings = self.model.get_params()
+        # TODO: don't we have flow object in data structure? Use this one
+        downloaded_flow = openml.flows.get_flow(self.flow_id)
+
+        openml_param_settings = _parse_parameters(self.model, downloaded_flow)
+
         # as a tag, it must be of the form ([a-zA-Z0-9_\-\.])+
         # so we format time from 'mm/dd/yy hh:mm:ss' to 'mm-dd-yy_hh.mm.ss'
         well_formatted_time = time.strftime("%c").replace(
@@ -113,11 +172,51 @@ def _create_description_xml(self):
             [self.model.__module__ + "." + self.model.__class__.__name__]
         description = _to_dict(taskid=self.task_id, flow_id=self.flow_id,
                                setup_string=_create_setup_string(self.model),
-                               parameter_settings=parameter_settings,
+                               parameter_settings=openml_param_settings,
                                tags=tags)
         description_xml = xmltodict.unparse(description, pretty=True)
         return description_xml
 
+def _parse_parameters(model, flow):
+    """Extracts all parameter settings from a model in OpenML format.
+
+    Parameters
+    ----------
+    model
+        the scikit-learn model (fitted)
+    flow
+        openml flow object (containing flow ids, i.e., it has to be downloaded from the server)
+
+    """
+    python_param_settings = model.get_params()
+    openml_param_settings = []
+
+    def get_flow_dict(_flow):
+        flow_map = {_flow.name: _flow.flow_id}
+        for subflow in _flow.components:
+            flow_map.update(get_flow_dict(_flow.components[subflow]))
+        return flow_map
+
+    flow_dict = get_flow_dict(flow)
+
+    for param in python_param_settings:
+        if "__" in param:
+            # parameter of subflow. will be handled later
+            continue
+        if isinstance(python_param_settings[param], BaseEstimator):
+            # extract parameters of the subflow individually
+            subflow = flow.components[param]
+            openml_param_settings += _parse_parameters(python_param_settings[param], subflow)
+
+        # add the parameter setting itself (a sub-estimator is also recorded here, in addition to its own parameters)
+        param_dict = OrderedDict()
+        param_dict['oml:name'] = param
+        param_dict['oml:value'] = str(python_param_settings[param])
+        param_dict['oml:component'] = flow_dict[flow.name]
+        openml_param_settings.append(param_dict)
+
+    return openml_param_settings
+
 ################################################################################
 # Functions which cannot be in runs/functions due to circular imports
 
@@ -169,15 +268,7 @@ def _to_dict(taskid, flow_id, setup_string, parameter_settings, tags):
     description['oml:run']['@xmlns:oml'] = 'http://openml.org/openml'
     description['oml:run']['oml:task_id'] = taskid
     description['oml:run']['oml:flow_id'] = flow_id
-
-    params = []
-    for k, v in parameter_settings.items():
-        param_dict = OrderedDict()
-        param_dict['oml:name'] = k
-        param_dict['oml:value'] = ('None' if v is None else v)
-        params.append(param_dict)
-
-    description['oml:run']['oml:parameter_setting'] = params
+    description['oml:run']['oml:parameter_setting'] = parameter_settings
     description['oml:run']['oml:tag'] = tags  # Tags describing the run
     # description['oml:run']['oml:output_data'] = 0;
     # all data that was output of this run, which can be evaluation scores
diff --git a/tests/runs/test_run_functions.py b/tests/runs/test_run_functions.py
@@ -1,18 +1,68 @@
 from sklearn.linear_model import LogisticRegression, SGDClassifier
+from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
+from sklearn.svm import SVC
+from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
 import openml
 import openml.exceptions
 from openml.testing import TestBase
 
 
 class TestRun(TestBase):
-    def test_run_iris(self):
-        task = openml.tasks.get_task(10107)
-        clf = LogisticRegression()
+
+    def _perform_run(self, task_id, num_instances, clf):
+        task = openml.tasks.get_task(task_id)
         run = openml.runs.run_task(task, clf)
         run_ = run.publish()
         self.assertEqual(run_, run)
         self.assertIsInstance(run.dataset_id, int)
 
+        # check arff output
+        self.assertEqual(len(run.data_content), num_instances)
+        return run
+
+
+    def test_run_iris(self):
+        task_id = 10107
+        num_instances = 150
+
+        clf = LogisticRegression()
+        self._perform_run(task_id,num_instances, clf)
+
+
+    def test_run_optimize_randomforest_iris(self):
+        task_id = 10107
+        num_instances = 150
+        num_folds = 10
+        num_iterations = 5
+
+        clf = RandomForestClassifier(n_estimators=10)
+        param_dist = {"max_depth": [3, None],
+                      "max_features": [1,2,3,4],
+                      "min_samples_split": [2,3,4,5,6,7,8,9,10],
+                      "min_samples_leaf": [1,2,3,4,5,6,7,8,9,10],
+                      "bootstrap": [True, False],
+                      "criterion": ["gini", "entropy"]}
+        random_search = RandomizedSearchCV(clf, param_dist,n_iter=num_iterations)
+
+        run = self._perform_run(task_id,num_instances, random_search)
+        self.assertEqual(len(run.trace_content), num_iterations * num_folds)
+
+    def test_run_optimize_bagging_iris(self):
+        task_id = 10107
+        num_instances = 150
+        num_folds = 10
+        num_iterations = 36 # (num values for C times gamma)
+
+        task = openml.tasks.get_task(task_id)
+        bag = BaggingClassifier(base_estimator=SVC())
+        param_dist = {"base_estimator__C": [0.001, 0.01, 0.1, 1, 10, 100],
+                      "base_estimator__gamma": [0.001, 0.01, 0.1, 1, 10, 100]}
+        grid_search = GridSearchCV(bag, param_dist)
+
+        run = self._perform_run(task_id, num_instances, grid_search)
+        self.assertEqual(len(run.trace_content), num_iterations * num_folds)
+
+
     def test__run_task_get_arffcontent(self):
         task = openml.tasks.get_task(1939)
         class_labels = task.class_labels
@@ -24,9 +74,13 @@ def test__run_task_get_arffcontent(self):
                                 clf, task, class_labels)
 
         clf = SGDClassifier(loss='log', random_state=1)
-        arff_datacontent = openml.runs.functions._run_task_get_arffcontent(
+        arff_datacontent, arff_tracecontent = openml.runs.functions._run_task_get_arffcontent(
             clf, task, class_labels)
+        # predictions
         self.assertIsInstance(arff_datacontent, list)
+        # trace. SGD does not produce any
+        self.assertIsInstance(arff_tracecontent, type(None))
+
         # 10 times 10 fold CV of 150 samples
         self.assertEqual(len(arff_datacontent), 1500)
         for arff_line in arff_datacontent: