Merge pull request #236 from openml/add/#145

mfeurer · web-flow · commit 512c07a09765 · 2017-04-26T21:30:47.000+02:00
WIP Add a check that flows are correctly stored on the server
diff --git a/openml/flows/__init__.py b/openml/flows/__init__.py
@@ -1,6 +1,7 @@
 from .flow import OpenMLFlow
+
 from .sklearn_converter import sklearn_to_flow, flow_to_sklearn, _check_n_jobs
-from .functions import get_flow, list_flows, flow_exists
+from .functions import get_flow, list_flows, flow_exists, assert_flows_equal
 
 __all__ = ['OpenMLFlow', 'create_flow_from_model', 'get_flow', 'list_flows',
            'sklearn_to_flow', 'flow_to_sklearn', 'flow_exists']
diff --git a/openml/flows/flow.py b/openml/flows/flow.py
@@ -338,9 +338,26 @@ def publish(self):
         file_elements = {'description': xml_description}
         return_value = _perform_api_call("flow/", file_elements=file_elements)
         self.flow_id = int(xmltodict.parse(return_value)['oml:upload_flow']['oml:id'])
+        try:
+            _check_flow(self)
+        except ValueError as e:
+            message = e.args[0]
+            raise ValueError("Flow was not stored correctly on the server. "
+                             "New flow ID is %d. Please check manually and "
+                             "remove the flow if necessary! Error is:\n'%s'" %
+                             (self.flow_id, message))
         return self
 
 
 def _add_if_nonempty(dic, key, value):
     if value is not None:
         dic[key] = value
+
+
+def _check_flow(flow):
+    """Fetch ``flow`` again by its ID and compare it with the local object."""
+    # Deferred import: importing at the top of this file would cause an
+    # ImportError due to an import cycle.
+    from openml.flows import functions
+
+    functions.assert_flows_equal(flow, functions.get_flow(flow.flow_id))
diff --git a/openml/flows/functions.py b/openml/flows/functions.py
@@ -127,4 +127,45 @@ def _list_flows(api_call):
                 'uploader': flow_['oml:uploader']}
         flows[fid] = flow
 
-    return flows
+    return flows
+
+
+def assert_flows_equal(flow1, flow2):
+    """Raise an error unless ``flow1`` and ``flow2`` are equal.
+
+    Attributes that are neither set by the server nor ignored by the Python
+    API must be equal; components are compared recursively by name.
+    Raises TypeError for non-OpenMLFlow arguments, ValueError on a mismatch.
+    """
+    if not isinstance(flow1, OpenMLFlow):
+        raise TypeError('Argument 1 must be of type OpenMLFlow, but is %s' %
+                        type(flow1))
+
+    if not isinstance(flow2, OpenMLFlow):
+        raise TypeError('Argument 2 must be of type OpenMLFlow, but is %s' %
+                        type(flow2))
+
+    generated_by_the_server = ['flow_id', 'uploader', 'version',
+                               'upload_date', ]
+    ignored_by_python_API = ['binary_url', 'binary_format', 'binary_md5',
+                             'model']
+
+    for key in set(flow1.__dict__.keys()).union(flow2.__dict__.keys()):
+        if key in generated_by_the_server + ignored_by_python_API:
+            continue
+        attr1 = getattr(flow1, key, None)
+        attr2 = getattr(flow2, key, None)
+        if key == 'components':
+            for name in set(attr1.keys()).union(attr2.keys()):
+                if name not in attr1:
+                    raise ValueError('Component %s only available in '
+                                     'argument2, but not in argument1.' % name)
+                if name not in attr2:
+                    raise ValueError('Component %s only available in '
+                                     'argument1, but not in argument2.' % name)
+                assert_flows_equal(attr1[name], attr2[name])
+        else:
+            if attr1 != attr2:
+                raise ValueError("Flow %s: values for attribute '%s' differ: "
+                                 "'%s' vs '%s'." %
+                                 (str(flow1.name), str(key), str(attr1), str(attr2)))
diff --git a/openml/flows/sklearn_converter.py b/openml/flows/sklearn_converter.py
@@ -131,7 +131,6 @@ def flow_to_sklearn(o, **kwargs):
                 raise ValueError('Cannot flow_to_sklearn %s' % serialized_type)
 
         else:
-            # Regular dictionary
             rval = OrderedDict((flow_to_sklearn(key, **kwargs),
                                 flow_to_sklearn(value, **kwargs))
                                for key, value in o.items())
@@ -303,8 +302,10 @@ def _extract_information_from_model(model):
                     component_reference = OrderedDict()
                     component_reference[
                         'oml-python:serialized_object'] = 'component_reference'
-                    component_reference['value'] = OrderedDict(
-                        key=identifier, step_name=identifier)
+                    cr_value = OrderedDict()
+                    cr_value['key'] = identifier
+                    cr_value['step_name'] = identifier
+                    component_reference['value'] = cr_value
                     parameter_value.append(component_reference)
 
             if isinstance(rval, tuple):
@@ -326,7 +327,10 @@ def _extract_information_from_model(model):
             component_reference = OrderedDict()
             component_reference[
                 'oml-python:serialized_object'] = 'component_reference'
-            component_reference['value'] = OrderedDict(key=k, step_name=None)
+            cr_value = OrderedDict()
+            cr_value['key'] = k
+            cr_value['step_name'] = None
+            component_reference['value'] = cr_value
             component_reference = sklearn_to_flow(component_reference, model)
             parameters[k] = json.dumps(component_reference)
 
@@ -387,6 +391,9 @@ def _deserialize_model(flow, **kwargs):
 
 
 def _check_dependencies(dependencies):
+    if not dependencies:
+        return
+
     dependencies = dependencies.split('\n')
     for dependency_string in dependencies:
         match = DEPENDENCIES_PATTERN.match(dependency_string)
@@ -448,7 +455,8 @@ def serialize_rv_frozen(o):
     dist = o.dist.__class__.__module__ + '.' + o.dist.__class__.__name__
     ret = OrderedDict()
     ret['oml-python:serialized_object'] = 'rv_frozen'
-    ret['value'] = OrderedDict(dist=dist, a=a, b=b, args=args, kwds=kwds)
+    ret['value'] = OrderedDict((('dist', dist), ('a', a), ('b', b),
+                                ('args', args), ('kwds', kwds)))
     return ret
 
 def deserialize_rv_frozen(o, **kwargs):
diff --git a/openml/testing.py b/openml/testing.py
@@ -1,5 +1,7 @@
+import hashlib
 import inspect
 import os
+import time
 import shutil
 import unittest
 import openml
@@ -54,4 +56,25 @@ def tearDown(self):
         shutil.rmtree(self.workdir)
         openml.config.server = self.production_server
 
+    def _add_sentinel_to_flow_name(self, flow, sentinel=None):
+        """Prefix the name of ``flow`` and of every subflow with a sentinel.
+
+        Flows are identified online by name and external version, so a unique
+        prefix allows publishing the same flow in each test run. If no
+        ``sentinel`` is passed, one is derived from the current time.
+        Returns the (modified in place) flow and the sentinel used.
+        """
+        if sentinel is None:
+            timestamp = str(time.time()).encode('utf-8')
+            sentinel = 'TEST%s' % hashlib.md5(timestamp).hexdigest()[:10]
+
+        pending = [flow]
+        while pending:
+            visited = pending.pop()
+            visited.name = '%s%s' % (sentinel, visited.name)
+            pending.extend(visited.components.values())
+
+        return flow, sentinel
+
+
 __all__ = ['TestBase']
diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py
@@ -1,11 +1,14 @@
 import collections
+import copy
 import hashlib
 import re
+import sys
 import time
-import random
-import unittest
 
-import xmltodict
+if sys.version_info[0] >= 3:
+    from unittest import mock
+else:
+    import mock
 
 import scipy.stats
 import sklearn
@@ -19,58 +22,17 @@
 import sklearn.preprocessing
 import sklearn.naive_bayes
 import sklearn.tree
+import xmltodict
 
 from openml.testing import TestBase
 from openml._api_calls import _perform_api_call
 import openml
 from openml.flows.sklearn_converter import _format_external_version
 
 
-def are_flows_equal(flow1, flow2):
-    """Check equality of two flows.
-
-    Two flows are equal if their all keys which are not set by the server
-    are equal, as well as all their parameters and components.
-    """
-    if not isinstance(flow2, flow1.__class__):
-        return False
-
-    # Name is actually not generated by the server, but it will be
-    # tested further down with a getter (allows mocking in the tests)
-    generated_by_the_server = ['name', 'flow_id', 'uploader', 'version',
-                               'upload_date', 'source_url',
-                               'binary_url', 'source_format',
-                               'binary_format', 'source_md5',
-                               'binary_md5', 'model']
-
-    for key in set(flow1.__dict__.keys()).union(flow2.__dict__.keys()):
-        if key in generated_by_the_server:
-            continue
-        attr1 = getattr(flow1, key, None)
-        attr2 = getattr(flow2, key, None)
-        if key == 'components':
-            for name in set(attr1.keys()).union(attr2.keys()):
-                if not are_flows_equal(attr1[name], attr2[name]):
-                    return False
-        else:
-            if attr1 != attr2:
-                return False
-    return True
-
-
-def get_sentinel():
-    # Create a unique prefix for the flow. Necessary because the flow is
-    # identified by its name and external version online. Having a unique
-    #  name allows us to publish the same flow in each test run
-    md5 = hashlib.md5()
-    md5.update(str(time.time()).encode('utf-8'))
-    sentinel = md5.hexdigest()[:10]
-    sentinel = 'TEST%s' % sentinel
-    return sentinel
-
-
 class TestFlow(TestBase):
 
+
     def test_get_flow(self):
         # We need to use the production server here because 4024 is not the test
         # server
@@ -134,13 +96,14 @@ def test_to_xml_from_xml(self):
         xml = flow._to_xml()
         xml_dict = xmltodict.parse(xml)
         new_flow = openml.flows.OpenMLFlow._from_dict(xml_dict)
-        self.assertTrue(are_flows_equal(new_flow, flow))
+
+        # Would raise an exception if the flows are not equal
+        openml.flows.functions.assert_flows_equal(new_flow, flow)
         self.assertIsNot(new_flow, flow)
 
     def test_publish_flow(self):
-        sentinel = get_sentinel()
-
-        flow = openml.OpenMLFlow(name='Test',
+        flow = openml.OpenMLFlow(name='sklearn.dummy.DummyClassifier',
+                                 class_name='sklearn.dummy.DummyClassifier',
                                  description="test description",
                                  model=sklearn.dummy.DummyClassifier(),
                                  components=collections.OrderedDict(),
@@ -150,8 +113,9 @@ def test_publish_flow(self):
                                      'sklearn', sklearn.__version__),
                                  tags=[],
                                  language='English',
-                                 dependencies='')
-        flow.name = 'TEST%s%s' % (sentinel, flow.name)
+                                 dependencies=None)
+
+        flow, _ = self._add_sentinel_to_flow_name(flow, None)
 
         flow.publish()
         self.assertIsInstance(flow.flow_id, int)
@@ -160,14 +124,44 @@ def test_semi_legal_flow(self):
         # TODO: Test if parameters are set correctly!
         # should not throw error as it contains two differentiable forms of Bagging
         # i.e., Bagging(Bagging(J48)) and Bagging(J48)
-        sentinel = get_sentinel()
         semi_legal = sklearn.ensemble.BaggingClassifier(
             base_estimator=sklearn.ensemble.BaggingClassifier(
                 base_estimator=sklearn.tree.DecisionTreeClassifier()))
         flow = openml.flows.sklearn_to_flow(semi_legal)
-        flow.name = 'TEST%s%s' % (sentinel, flow.name)
+        flow, _ = self._add_sentinel_to_flow_name(flow, None)
+
+        flow.publish()
+
+    @mock.patch('openml.flows.functions.get_flow')
+    @mock.patch('openml.flows.flow._perform_api_call')
+    def test_publish_error(self, api_call_mock, get_flow_mock):
+        model = sklearn.ensemble.RandomForestClassifier()
+        flow = openml.flows.sklearn_to_flow(model)
+        api_call_mock.return_value = "<oml:upload_flow>\n" \
+                                     "    <oml:id>1</oml:id>\n" \
+                                     "</oml:upload_flow>"
+        get_flow_mock.return_value = flow  # mocked download equals the upload
 
         flow.publish()
+        self.assertEqual(api_call_mock.call_count, 1)
+        self.assertEqual(get_flow_mock.call_count, 1)
+        # Now let the mocked download return a copy with a different name.
+        flow_copy = copy.deepcopy(flow)
+        flow_copy.name = flow_copy.name[:-1]  # drop the last character
+        get_flow_mock.return_value = flow_copy
+
+        with self.assertRaises(ValueError) as context_manager:
+            flow.publish()
+
+        fixture = "Flow was not stored correctly on the server. " \
+                  "New flow ID is 1. Please check manually and remove " \
+                  "the flow if necessary! Error is:\n" \
+                  "'Flow sklearn.ensemble.forest.RandomForestClassifier: values for attribute 'name' differ: " \
+                  "'sklearn.ensemble.forest.RandomForestClassifier' vs 'sklearn.ensemble.forest.RandomForestClassifie'.'"
+
+        self.assertEqual(context_manager.exception.args[0], fixture)
+        self.assertEqual(api_call_mock.call_count, 2)
+        self.assertEqual(get_flow_mock.call_count, 2)
 
     def test_illegal_flow(self):
         # should throw error as it contains two imputers
@@ -177,6 +171,16 @@ def test_illegal_flow(self):
         self.assertRaises(ValueError, openml.flows.sklearn_to_flow, illegal)
 
     def test_nonexisting_flow_exists(self):
+        def get_sentinel():
+            """Return a 'TEST'-prefixed token derived from the current time.
+
+            A flow is identified online by its name and external version;
+            a unique name allows publishing the same flow in each test run.
+            """
+            timestamp = str(time.time()).encode('utf-8')
+            digest = hashlib.md5(timestamp).hexdigest()
+            return 'TEST%s' % digest[:10]
+
         name = get_sentinel() + get_sentinel()
         version = get_sentinel()
 
@@ -185,10 +189,9 @@ def test_nonexisting_flow_exists(self):
 
     def test_existing_flow_exists(self):
         # create a flow
-        sentinel = get_sentinel()
         nb = sklearn.naive_bayes.GaussianNB()
         flow = openml.flows.sklearn_to_flow(nb)
-        flow.name = 'TEST%s%s' % (sentinel, flow.name)
+        flow, _ = self._add_sentinel_to_flow_name(flow, None)
         #publish the flow
         flow = flow.publish()
         #redownload the flow
@@ -204,7 +207,6 @@ def test_sklearn_to_upload_to_flow(self):
         iris = sklearn.datasets.load_iris()
         X = iris.data
         y = iris.target
-        sentinel = get_sentinel()
 
         # Test a more complicated flow
         ohe = sklearn.preprocessing.OneHotEncoder(categorical_features=[1])
@@ -227,17 +229,7 @@ def test_sklearn_to_upload_to_flow(self):
         rs.fit(X, y)
         flow = openml.flows.sklearn_to_flow(rs)
         flow.tags.extend(['openml-python', 'unittest'])
-
-        # Add the sentinel to all name strings in all subflows. Adds it to
-        # name to make it easier in the web gui to see that the flow is only
-        # a test flow
-        to_visit = collections.deque()
-        to_visit.appendleft(flow)
-        while len(to_visit) > 0:
-            current_flow = to_visit.pop()
-            for sub_flow in current_flow.components.values():
-                to_visit.appendleft(sub_flow)
-            current_flow.name = sentinel + current_flow.name
+        flow, sentinel = self._add_sentinel_to_flow_name(flow, None)
 
         flow.publish()
         self.assertIsInstance(flow.flow_id, int)
@@ -267,7 +259,8 @@ def test_sklearn_to_upload_to_flow(self):
 
         self.assertEqual(server_xml, local_xml)
 
-        self.assertTrue(are_flows_equal(new_flow, flow))
+        # Would raise exception if they are not equal!
+        openml.flows.functions.assert_flows_equal(new_flow, flow)
         self.assertIsNot(new_flow, flow)
 
         fixture_name = '%ssklearn.model_selection._search.RandomizedSearchCV(' \
diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py
diff --git a/tests/test_flows/test_sklearn.py b/tests/test_flows/test_sklearn.py
diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py