Restructured the flow mechanism (flow_exists is now an independent function, as it never relied on a flow object anyway)

janvanrijn · janvanrijn · commit ff2fa4b45914 · 2017-03-27T20:13:19.000+02:00
Bug fix for flow_exists and setup_exists

- added unit tests for flow_exists and setup_exists (cases where the entity exists and where it does not exist)
diff --git a/openml/flows/__init__.py b/openml/flows/__init__.py
@@ -1,6 +1,6 @@
 from .flow import OpenMLFlow
 from .sklearn_converter import sklearn_to_flow, flow_to_sklearn
-from .functions import get_flow, list_flows
+from .functions import get_flow, list_flows, flow_exists
 
 __all__ = ['OpenMLFlow', 'create_flow_from_model', 'get_flow', 'list_flows',
-           'sklearn_to_flow', 'flow_to_sklearn']
+           'sklearn_to_flow', 'flow_to_sklearn', 'flow_exists']
diff --git a/openml/flows/flow.py b/openml/flows/flow.py
@@ -341,38 +341,6 @@ def publish(self):
         return self
 
 
-def _check_flow_exists(name, version):
-    """Retrieves the flow id of the flow uniquely identified by name+version.
-
-    Parameter
-    ---------
-    name : string
-        Name of the flow
-    version : string
-        Version information associated with flow.
-
-    Returns
-    -------
-    flow_exist : int
-        Flow id or -1 if the flow doesn't exist.
-
-    Notes
-    -----
-    see http://www.openml.org/api_docs/#!/flow/get_flow_exists_name_version
-    """
-    if not (type(name) is str and len(name) > 0):
-        raise ValueError('Argument \'name\' should be a non-empty string')
-    if not (type(version) is str and len(version) > 0):
-        raise ValueError('Argument \'version\' should be a non-empty string')
-
-    xml_response = _perform_api_call("flow/exists",
-                                     data={'name': name, 'external_version': version})
-
-    xml_dict = xmltodict.parse(xml_response)
-    flow_id = xml_dict['oml:flow_exists']['oml:id']
-    return xml_response, flow_id
-
-
 def _add_if_nonempty(dic, key, value):
     if value is not None:
         dic[key] = value
diff --git a/openml/flows/functions.py b/openml/flows/functions.py
@@ -69,6 +69,41 @@ def list_flows(offset=None, size=None, tag=None):
     return _list_flows(api_call)
 
 
+def flow_exists(name, version):
+    """Retrieves the flow id of the flow uniquely identified by name+version.
+
+    Parameter
+    ---------
+    name : string
+        Name of the flow
+    version : string
+        Version information associated with flow.
+
+    Returns
+    -------
+    flow_exist : int
+        flow id iff exists, False otherwise
+
+    Notes
+    -----
+    see http://www.openml.org/api_docs/#!/flow/get_flow_exists_name_version
+    """
+    if not (type(name) is str and len(name) > 0):
+        raise ValueError('Argument \'name\' should be a non-empty string')
+    if not (type(version) is str and len(version) > 0):
+        raise ValueError('Argument \'version\' should be a non-empty string')
+
+    xml_response = _perform_api_call("flow/exists",
+                                     data={'name': name, 'external_version': version})
+
+    result_dict = xmltodict.parse(xml_response)
+    flow_id = int(result_dict['oml:flow_exists']['oml:id'])
+    if flow_id > 0:
+        return flow_id
+    else:
+        return False;
+
+
 def _list_flows(api_call):
     # TODO add proper error handling here!
     xml_string = _perform_api_call(api_call)
diff --git a/openml/runs/functions.py b/openml/runs/functions.py
@@ -9,7 +9,7 @@
 
 from ..exceptions import PyOpenMLError
 from .. import config
-from ..flows import sklearn_to_flow, get_flow
+from ..flows import sklearn_to_flow, get_flow, flow_exists
 from ..setups import setup_exists
 from ..exceptions import OpenMLCacheException, OpenMLServerException
 from ..util import URLError
@@ -47,11 +47,11 @@ def run_task(task, model):
     flow = sklearn_to_flow(model)
 
     # returns flow id if the flow exists on the server, -1 otherwise
-    _, flow_id = openml.flows._check_flow_exists(flow.name, flow.external_version)
+    flow_id = flow_exists(flow.name, flow.external_version)
 
     # skips the run if it already exists and the user opts for this in the config file.
     # also, if the flow is not present on the server, the check is not needed.
-    if config.avoid_duplicate_runs and flow_id > 0:
+    if config.avoid_duplicate_runs and flow_id:
         flow = get_flow(flow_id)
         setup_id = setup_exists(flow, model)
         ids = _run_exists(task.task_id, setup_id)
@@ -70,13 +70,17 @@ def run_task(task, model):
     run = OpenMLRun(task_id=task.task_id, flow_id=None, dataset_id=dataset.dataset_id, model=model)
     run.data_content, run.trace_content = _run_task_get_arffcontent(model, task, class_labels)
 
-    if flow_id < 0:
-        flow.publish()
-    config.logger.info(flow_id)
-
-    # attach the flow to the run
-    run.flow_id = flow_id
+    if flow_id == False:
+        # means the flow did not exists.
+        # As we could run it, publish it now
+        flow = flow.publish()
+    else:
+        # flow already existed, download it from server
+        # TODO (neccessary? is this a post condition of this function)
+        flow = get_flow(flow_id)
 
+    run.flow_id = flow.flow_id
+    config.logger.info('Executed Task %d with Flow id: %d' %(task.task_id, run.flow_id))
 
     return run
 
@@ -311,27 +315,28 @@ def _create_run_from_xml(xml):
     evaluations = dict()
     detailed_evaluations = defaultdict(lambda: defaultdict(dict))
     evaluation_flows = dict()
-    for evaluation_dict in run['oml:output_data']['oml:evaluation']:
-        key = evaluation_dict['oml:name']
-        if 'oml:value' in evaluation_dict:
-            value = float(evaluation_dict['oml:value'])
-        elif 'oml:array_data' in evaluation_dict:
-            value = evaluation_dict['oml:array_data']
-        else:
-            raise ValueError('Could not find keys "value" or "array_data" '
-                             'in %s' % str(evaluation_dict.keys()))
-
-        if '@repeat' in evaluation_dict and '@fold' in evaluation_dict:
-            repeat = int(evaluation_dict['@repeat'])
-            fold = int(evaluation_dict['@fold'])
-            repeat_dict = detailed_evaluations[key]
-            fold_dict = repeat_dict[repeat]
-            fold_dict[fold] = value
-        else:
-            evaluations[key] = value
-            evaluation_flows[key] = flow_id
+    if 'oml:output_data' in run and 'oml:evaluation' in run['oml:output_data']:
+        for evaluation_dict in run['oml:output_data']['oml:evaluation']:
+            key = evaluation_dict['oml:name']
+            if 'oml:value' in evaluation_dict:
+                value = float(evaluation_dict['oml:value'])
+            elif 'oml:array_data' in evaluation_dict:
+                value = evaluation_dict['oml:array_data']
+            else:
+                raise ValueError('Could not find keys "value" or "array_data" '
+                                 'in %s' % str(evaluation_dict.keys()))
+
+            if '@repeat' in evaluation_dict and '@fold' in evaluation_dict:
+                repeat = int(evaluation_dict['@repeat'])
+                fold = int(evaluation_dict['@fold'])
+                repeat_dict = detailed_evaluations[key]
+                fold_dict = repeat_dict[repeat]
+                fold_dict[fold] = value
+            else:
+                evaluations[key] = value
+                evaluation_flows[key] = flow_id
 
-        evaluation_flows[key] = flow_id
+            evaluation_flows[key] = flow_id
 
     return OpenMLRun(run_id=run_id, uploader=uploader,
                      uploader_name=uploader_name, task_id=task_id,
diff --git a/openml/setups/functions.py b/openml/setups/functions.py
@@ -7,12 +7,20 @@ def setup_exists(downloaded_flow, sklearn_model):
     '''
     Checks whether a flow / hyperparameter configuration already exists on the server
 
-    :param downloaded_flow:
+    Parameter
+    ---------
+
+    downloaded_flow : flow
         the openml flow object (should be downloaded from server.
         Otherwise also give flow id parameter)
-    :param sklearn_model: obvious
-    :param flow_id: int
-    :return: int setup id iff exists, False otherwise
+    sklearn_model : BaseEstimator
+        The base estimator that was used to create the flow. Will
+         be used to extract parameter settings from.
+
+    Returns
+    -------
+    setup_id : int s
+        setup id iff exists, False otherwise
     '''
 
     # sadly, this api call relies on a run object
@@ -23,10 +31,11 @@ def setup_exists(downloaded_flow, sklearn_model):
     result = openml._api_calls._perform_api_call('/setup/exists/',
                                                  file_elements = file_elements)
     result_dict = xmltodict.parse(result)
-    if 'oml:id' in result_dict['oml:setup_exists']:
-        return int(result_dict['oml:setup_exists']['oml:id'])
+    setup_id = int(result_dict['oml:setup_exists']['oml:id'])
+    if setup_id > 0:
+        return setup_id
     else:
-        return False
+        return False;
 
 
 def _to_dict(flow_id, openml_parameter_settings):
diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py
@@ -2,6 +2,7 @@
 import hashlib
 import re
 import time
+import random
 import unittest
 
 import xmltodict
@@ -16,6 +17,7 @@
 import sklearn.model_selection
 import sklearn.pipeline
 import sklearn.preprocessing
+import sklearn.naive_bayes
 import sklearn.tree
 
 from openml.testing import TestBase
@@ -174,24 +176,27 @@ def test_illegal_flow(self):
                                                    ('classif', sklearn.tree.DecisionTreeClassifier())])
         self.assertRaises(ValueError, openml.flows.sklearn_to_flow, illegal)
 
-    def test_ensure_flow_exists(self):
-        sentinel = get_sentinel()
+    def test_nonexiting_flow_exists(self):
+        name = get_sentinel() + get_sentinel()
+        version = get_sentinel()
 
-        flow = openml.OpenMLFlow(name='Test',
-                                 description="test description",
-                                 model=sklearn.dummy.DummyClassifier(),
-                                 components=collections.OrderedDict(),
-                                 parameters=collections.OrderedDict(),
-                                 parameters_meta_info=collections.OrderedDict(),
-                                 external_version=_format_external_version(
-                                     'sklearn', sklearn.__version__),
-                                 tags=[],
-                                 language='English',
-                                 dependencies='')
+        flow_id = openml.flows.flow_exists(name, version)
+        self.assertEquals(flow_id, False)
+
+    def test_exiting_flow_exists(self):
+        # create a flow
+        sentinel = get_sentinel()
+        nb = sklearn.naive_bayes.GaussianNB()
+        flow = openml.flows.sklearn_to_flow(nb)
         flow.name = 'TEST%s%s' % (sentinel, flow.name)
-        flow_id = flow._ensure_flow_exists()
-        self.assertIsInstance(flow_id, int)
-        self.assertEqual(flow._ensure_flow_exists(), flow_id)
+
+        flow = flow.publish()
+
+        # check if flow exists can find it
+        flow = openml.flows.get_flow(flow.flow_id)
+        downloaded_flow_id = openml.flows.flow_exists(flow.name, flow.external_version)
+        self.assertEquals(downloaded_flow_id, flow.flow_id)
+
 
     def test_sklearn_to_upload_to_flow(self):
         iris = sklearn.datasets.load_iris()
diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py
@@ -0,0 +1,74 @@
+import sys
+import hashlib
+import time
+
+import openml
+import openml.exceptions
+from openml.testing import TestBase
+
+if sys.version_info[0] >= 3:
+    from unittest import mock
+else:
+    import mock
+
+
+def get_sentinel():
+    # Create a unique prefix for the flow. Necessary because the flow is
+    # identified by its name and external version online. Having a unique
+    #  name allows us to publish the same flow in each test run
+    md5 = hashlib.md5()
+    md5.update(str(time.time()).encode('utf-8'))
+    sentinel = md5.hexdigest()[:10]
+    sentinel = 'TEST%s' % sentinel
+    return sentinel
+
+
+
+class TestRun(TestBase):
+
+    def test_nonexisting_setup_exists(self):
+        from sklearn.tree import DecisionTreeClassifier
+        # first publish a nonexiting flow
+        sentinel = get_sentinel()
+        dectree = DecisionTreeClassifier()
+        flow = openml.flows.sklearn_to_flow(dectree)
+        flow.name = 'TEST%s%s' % (sentinel, flow.name)
+        flow.publish()
+
+        # although the flow exists, we can be sure there are no
+        # setups (yet) as it hasn't been ran
+        setup_id = openml.setups.setup_exists(flow, dectree)
+        self.assertEquals(setup_id, False)
+
+
+    def test_existing_setup_exists(self):
+        from sklearn.ensemble import BaggingClassifier
+        from sklearn.tree import DecisionTreeClassifier
+        # first publish a nonexiting flow
+        bagging = BaggingClassifier(DecisionTreeClassifier(max_depth=5,
+                                                           min_samples_split=1),
+                                    n_estimators=3,
+                                    max_samples=0.5)
+        flow = openml.flows.sklearn_to_flow(bagging)
+        flow.name = 'TEST%s%s' % (get_sentinel(), flow.name)
+        flow = flow.publish()
+        flow = openml.flows.get_flow(flow.flow_id)
+
+        # although the flow exists, we can be sure there are no
+        # setups (yet) as it hasn't been ran
+        setup_id = openml.setups.setup_exists(flow, bagging)
+        self.assertEquals(setup_id, False)
+
+        # now run the flow on an easy task:
+        task = openml.tasks.get_task(115) #diabetes
+        run = openml.runs.run_task(task, bagging)
+        # spoof flow id, otherwise the sentinel is ignored
+        run.flow_id = flow.flow_id
+        run = run.publish()
+        # download the run, as it contains the right setup id
+        run = openml.runs.get_run(run.run_id)
+
+        # execute the function we are interested in
+        setup_id = openml.setups.setup_exists(flow, bagging)
+        self.assertEquals(setup_id, run.setup_id)
+