Put shared logic of Publish into OpenMLBase (#849)

PGijsbers · mfeurer · commit f74b73a95006 · 2019-10-18T15:25:25.000+02:00
* Reworked Task publish and Dataset publish

* Use OpenMLBase publish method.

* Remove unused import. Add study as legal API entity.

* Use shared logic in Flow and fix resolving Study alias.

* Further extract shared logic.

* Fix flake8 and mypy errors
diff --git a/examples/30_extended/create_upload_tutorial.py b/examples/30_extended/create_upload_tutorial.py
@@ -119,8 +119,8 @@
 
 ############################################################################
 
-upload_did = diabetes_dataset.publish()
-print(f"URL for dataset: {openml.config.server}/data/{upload_did}")
+diabetes_dataset.publish()
+print(f"URL for dataset: {diabetes_dataset.openml_url}")
 
 ############################################################################
 # Dataset is a list
@@ -192,8 +192,8 @@
 
 ############################################################################
 
-upload_did = weather_dataset.publish()
-print(f"URL for dataset: {openml.config.server}/data/{upload_did}")
+weather_dataset.publish()
+print(f"URL for dataset: {weather_dataset.openml_url}")
 
 ############################################################################
 # Dataset is a pandas DataFrame
@@ -238,8 +238,8 @@
 
 ############################################################################
 
-upload_did = weather_dataset.publish()
-print(f"URL for dataset: {openml.config.server}/data/{upload_did}")
+weather_dataset.publish()
+print(f"URL for dataset: {weather_dataset.openml_url}")
 
 ############################################################################
 # Dataset is a sparse matrix
@@ -275,8 +275,8 @@
 
 ############################################################################
 
-upload_did = xor_dataset.publish()
-print(f"URL for dataset: {openml.config.server}/data/{upload_did}")
+xor_dataset.publish()
+print(f"URL for dataset: {xor_dataset.openml_url}")
 
 
 ############################################################################
@@ -310,8 +310,8 @@
 
 ############################################################################
 
-upload_did = xor_dataset.publish()
-print(f"URL for dataset: {openml.config.server}/data/{upload_did}")
+xor_dataset.publish()
+print(f"URL for dataset: {xor_dataset.openml_url}")
 
 
 ############################################################################
diff --git a/openml/base.py b/openml/base.py
@@ -1,13 +1,13 @@
 from abc import ABC, abstractmethod
 from collections import OrderedDict
 import re
-from typing import Optional, List, Tuple, Union
+from typing import Optional, List, Tuple, Union, Dict
 import webbrowser
 
 import xmltodict
 
 import openml.config
-from .utils import _tag_openml_base
+from .utils import _tag_openml_base, _get_rest_api_type_alias
 
 
 class OpenMLBase(ABC):
@@ -104,6 +104,34 @@ def _to_xml(self) -> str:
         encoding_specification, xml_body = xml_representation.split('\n', 1)
         return xml_body
 
+    def _get_file_elements(self) -> Dict:
+        """ Get the file_elements to upload to the server; called by `publish`.
+
+        Subclasses should override this method when they upload more than a description.
+        `publish` adds the 'description' element itself when it is not provided here.
+        """
+        return {}
+
+    @abstractmethod
+    def _parse_publish_response(self, xml_response: Dict):
+        """ Parse the id from the xmltodict-parsed upload response and assign it to self. """
+        pass
+
+    def publish(self) -> 'OpenMLBase':
+        """ Upload this entity to the OpenML server, store the assigned id and return self. """
+        elements = self._get_file_elements()
+        if 'description' not in elements:
+            # Subclasses may supply their own description; default to the XML form.
+            elements['description'] = self._to_xml()
+
+        endpoint = _get_rest_api_type_alias(self) + '/'
+        reply = openml._api_calls._perform_api_call(endpoint, 'post', file_elements=elements)
+        parsed = xmltodict.parse(reply)
+
+        # Each subclass knows where the new id lives in the response and stores it on self.
+        self._parse_publish_response(parsed)
+        return self
+
     def open_in_browser(self):
         """ Opens the OpenML web page corresponding to this object in your default browser. """
         webbrowser.open(self.openml_url)
diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
@@ -11,10 +11,8 @@
 import numpy as np
 import pandas as pd
 import scipy.sparse
-import xmltodict
 from warnings import warn
 
-import openml._api_calls
 from openml.base import OpenMLBase
 from .data_feature import OpenMLDataFeature
 from ..exceptions import PyOpenMLError
@@ -728,49 +726,28 @@ def get_features_by_type(self, data_type, exclude=None,
                     result.append(idx - offset)
         return result
 
-    def publish(self):
-        """Publish the dataset on the OpenML server.
+    def _get_file_elements(self) -> Dict:
+        """ Return the 'dataset' file element, validating ARFF files read from disk. """
+        file_elements = {}
+        path = None if self.data_file is None else os.path.abspath(self.data_file)
 
-        Upload the dataset description and dataset content to openml.
-
-        Returns
-        -------
-        dataset_id: int
-            Id of the dataset uploaded to the server.
-        """
-        file_elements = {'description': self._to_xml()}
-
-        # the arff dataset string is available
         if self._dataset is not None:
             file_elements['dataset'] = self._dataset
-        else:
-            # the path to the arff dataset is given
-            if self.data_file is not None:
-                path = os.path.abspath(self.data_file)
-                if os.path.exists(path):
-                    try:
-
-                        with io.open(path, encoding='utf8') as fh:
-                            # check if arff is valid
-                            decoder = arff.ArffDecoder()
-                            decoder.decode(fh, encode_nominal=True)
-                    except arff.ArffException:
-                        raise ValueError("The file you have provided is not "
-                                         "a valid arff file.")
-
-                    with open(path, 'rb') as fp:
-                        file_elements['dataset'] = fp.read()
-            else:
-                if self.url is None:
-                    raise ValueError("No url/path to the data file was given")
-
-        return_value = openml._api_calls._perform_api_call(
-            "data/", 'post',
-            file_elements=file_elements,
-        )
-        response = xmltodict.parse(return_value)
-        self.dataset_id = int(response['oml:upload_data_set']['oml:id'])
-        return self.dataset_id
+        elif path is not None and os.path.exists(path):
+            with open(path, 'rb') as fp:
+                file_elements['dataset'] = fp.read()
+            try:
+                dataset_utf8 = str(file_elements['dataset'], 'utf8')
+                arff.ArffDecoder().decode(dataset_utf8, encode_nominal=True)
+            except arff.ArffException as err:
+                raise ValueError("The file you have provided is not a valid arff file.") from err
+        elif self.url is None:
+            raise ValueError("No valid url/path to the data file was given.")
+        return file_elements
+
+    def _parse_publish_response(self, xml_response: Dict):
+        """ Store the id found in the server's upload response as self.dataset_id. """
+        self.dataset_id = int(xml_response['oml:upload_data_set']['oml:id'])
 
     def _to_dict(self) -> 'OrderedDict[str, OrderedDict]':
         """ Creates a dictionary representation of self. """
diff --git a/openml/flows/flow.py b/openml/flows/flow.py
@@ -351,6 +351,10 @@ def from_filesystem(cls, input_directory) -> 'OpenMLFlow':
             xml_string = f.read()
         return OpenMLFlow._from_dict(xmltodict.parse(xml_string))
 
+    def _parse_publish_response(self, xml_response: Dict):
+        """ Store the id found in the server's upload response as self.flow_id. """
+        self.flow_id = int(xml_response['oml:upload_flow']['oml:id'])
+
     def publish(self, raise_error_if_exists: bool = False) -> 'OpenMLFlow':
         """ Publish this flow to OpenML server.
 
@@ -379,15 +383,8 @@ def publish(self, raise_error_if_exists: bool = False) -> 'OpenMLFlow':
             if self.flow_id:
                 raise openml.exceptions.PyOpenMLError("Flow does not exist on the server, "
                                                       "but 'flow.flow_id' is not None.")
-            xml_description = self._to_xml()
-            file_elements = {'description': xml_description}
-            return_value = openml._api_calls._perform_api_call(
-                "flow/",
-                'post',
-                file_elements=file_elements,
-            )
-            server_response = xmltodict.parse(return_value)
-            flow_id = int(server_response['oml:upload_flow']['oml:id'])
+            super().publish()
+            flow_id = self.flow_id
         elif raise_error_if_exists:
             error_message = "This OpenMLFlow already exists with id: {}.".format(flow_id)
             raise openml.exceptions.PyOpenMLError(error_message)
diff --git a/openml/runs/run.py b/openml/runs/run.py
@@ -1,12 +1,11 @@
 from collections import OrderedDict
 import pickle
 import time
-from typing import Any, IO, TextIO, List, Union, Tuple, Optional  # noqa F401
+from typing import Any, IO, TextIO, List, Union, Tuple, Optional, Dict  # noqa F401
 import os
 
 import arff
 import numpy as np
-import xmltodict
 
 import openml
 import openml._api_calls
@@ -428,16 +427,15 @@ def _attribute_list_to_dict(attribute_list):
                 scores.append(sklearn_fn(y_true, y_pred, **kwargs))
         return np.array(scores)
 
-    def publish(self) -> 'OpenMLRun':
-        """ Publish a run (and if necessary, its flow) to the OpenML server.
+    def _parse_publish_response(self, xml_response: Dict):
+        """ Store the run id found in the server's upload response as self.run_id. """
+        self.run_id = int(xml_response['oml:upload_run']['oml:run_id'])
 
-        Uploads the results of a run to OpenML.
-        If the run is of an unpublished OpenMLFlow, the flow will be uploaded too.
-        Sets the run_id on self.
+    def _get_file_elements(self) -> Dict:
+        """ Get file_elements to upload to the server.
 
-        Returns
-        -------
-        self : OpenMLRun
+        Raises a PyOpenMLError when the run has no model. The returned elements hold
+        the description XML plus, when they exist, the predictions and trace ARFF files.
         """
         if self.model is None:
             raise PyOpenMLError(
@@ -463,8 +461,7 @@ def publish(self) -> 'OpenMLRun':
                 self.model,
             )
 
-        description_xml = self._to_xml()
-        file_elements = {'description': ("description.xml", description_xml)}
+        file_elements = {'description': ("description.xml", self._to_xml())}
 
         if self.error_message is None:
             predictions = arff.dumps(self._generate_arff_dict())
@@ -473,13 +470,7 @@ def publish(self) -> 'OpenMLRun':
         if self.trace is not None:
             trace_arff = arff.dumps(self.trace.trace_to_arff())
             file_elements['trace'] = ("trace.arff", trace_arff)
-
-        return_value = openml._api_calls._perform_api_call(
-            "/run/", 'post', file_elements=file_elements
-        )
-        result = xmltodict.parse(return_value)
-        self.run_id = int(result['oml:upload_run']['oml:run_id'])
-        return self
+        return file_elements
 
     def _to_dict(self) -> 'OrderedDict[str, OrderedDict]':
         """ Creates a dictionary representation of self. """
diff --git a/openml/study/study.py b/openml/study/study.py
@@ -1,8 +1,6 @@
 from collections import OrderedDict
 from typing import Dict, List, Optional, Tuple, Union, Any
 
-import xmltodict
-
 import openml
 from openml.base import OpenMLBase
 
@@ -124,26 +122,9 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]:
                  "Creator", "Upload Time"]
         return [(key, fields[key]) for key in order if key in fields]
 
-    def publish(self) -> int:
-        """
-        Publish the study on the OpenML server.
-
-        Returns
-        -------
-        study_id: int
-            Id of the study uploaded to the server.
-        """
-        file_elements = {
-            'description': self._to_xml()
-        }
-        return_value = openml._api_calls._perform_api_call(
-            "study/",
-            'post',
-            file_elements=file_elements,
-        )
-        study_res = xmltodict.parse(return_value)
-        self.study_id = int(study_res['oml:study_upload']['oml:id'])
-        return self.study_id
+    def _parse_publish_response(self, xml_response: Dict):
+        """ Store the id found in the server's upload response as self.study_id. """
+        self.study_id = int(xml_response['oml:study_upload']['oml:id'])
 
     def _to_dict(self) -> 'OrderedDict[str, OrderedDict]':
         """ Creates a dictionary representation of self. """
diff --git a/openml/tasks/task.py b/openml/tasks/task.py
@@ -8,7 +8,6 @@
 import numpy as np
 import pandas as pd
 import scipy.sparse
-import xmltodict
 
 import openml._api_calls
 from openml.base import OpenMLBase
@@ -181,30 +180,9 @@ def _to_dict(self) -> 'OrderedDict[str, OrderedDict]':
 
         return task_container
 
-    def publish(self) -> int:
-        """Publish task to OpenML server.
-
-        Returns
-        -------
-        task_id: int
-            Returns the id of the uploaded task
-            if successful.
-
-        """
-
-        xml_description = self._to_xml()
-
-        file_elements = {'description': xml_description}
-
-        return_value = openml._api_calls._perform_api_call(
-            "task/",
-            'post',
-            file_elements=file_elements,
-        )
-
-        task_id = int(xmltodict.parse(return_value)['oml:upload_task']['oml:id'])
-
-        return task_id
+    def _parse_publish_response(self, xml_response: Dict):
+        """ Store the id found in the server's upload response as self.task_id. """
+        self.task_id = int(xml_response['oml:upload_task']['oml:id'])
 
 
 class OpenMLSupervisedTask(OpenMLTask, ABC):
diff --git a/openml/utils.py b/openml/utils.py
@@ -2,7 +2,7 @@
 import hashlib
 import xmltodict
 import shutil
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, List, Tuple, Union, Type
 import warnings
 import pandas as pd
 from functools import wraps
@@ -68,16 +68,32 @@ def extract_xml_tags(xml_tag_name, node, allow_none=True):
                              (xml_tag_name, str(node)))
 
 
-def _tag_openml_base(oml_object: 'OpenMLBase', tag: str, untag: bool = False):
+def _get_rest_api_type_alias(oml_object: 'OpenMLBase') -> str:
+    """ Return the alias of the openml entity as it is defined for the REST API.
+
+    Raises
+    ------
+    TypeError
+        If ``oml_object`` is not an instance of a type known to the REST API.
+    """
     rest_api_mapping = [
         (openml.datasets.OpenMLDataset, 'data'),
         (openml.flows.OpenMLFlow, 'flow'),
         (openml.tasks.OpenMLTask, 'task'),
-        (openml.runs.OpenMLRun, 'run')
-    ]
+        (openml.runs.OpenMLRun, 'run'),
+        ((openml.study.OpenMLStudy, openml.study.OpenMLBenchmarkSuite), 'study')
+    ]  # type: List[Tuple[Union[Type, Tuple], str]]
-    _, api_type_alias = [(python_type, api_alias)
-                         for (python_type, api_alias) in rest_api_mapping
-                         if isinstance(oml_object, python_type)][0]
+    for python_type, api_alias in rest_api_mapping:
+        if isinstance(oml_object, python_type):
+            return api_alias
+    raise TypeError("No REST API alias is known for objects of type '{}'."
+                    .format(type(oml_object).__name__))
+
+
+def _tag_openml_base(oml_object: 'OpenMLBase', tag: str, untag: bool = False):
+    """ Forward the tag request to `_tag_entity` using the object's REST API alias and id. """
+    api_type_alias = _get_rest_api_type_alias(oml_object)
     _tag_entity(api_type_alias, oml_object.id, tag, untag)
 
 
diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
diff --git a/tests/test_study/test_study_functions.py b/tests/test_study/test_study_functions.py
diff --git a/tests/test_tasks/test_clustering_task.py b/tests/test_tasks/test_clustering_task.py
diff --git a/tests/test_tasks/test_task.py b/tests/test_tasks/test_task.py