Skip to content

Commit e4e385b

Browse files
PGijsbers authored and mfeurer committed
Fix59 (#683)
* Start method description. * Include version in listing. Refactor number parsing. * Towards retrieving by name. * Finalize _name_to_id. * Adapt get_dataset(s). * Address feedback. * Add two unit tests for retrieving by name. Extract shared code to new function. * Unit tests name to id. * Add test get_dataset_by_name * flake8
1 parent c7db122 commit e4e385b

2 files changed

Lines changed: 185 additions & 84 deletions

File tree

openml/datasets/functions.py

Lines changed: 79 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import io
22
import os
33
import re
4-
from typing import List, Dict, Union
4+
from typing import List, Dict, Union, Optional
55

66
import numpy as np
77
import arff
@@ -247,19 +247,20 @@ def __list_datasets(api_call):
247247

248248
datasets = dict()
249249
for dataset_ in datasets_dict['oml:data']['oml:dataset']:
250-
did = int(dataset_['oml:did'])
251-
dataset = {'did': did,
252-
'name': dataset_['oml:name'],
253-
'format': dataset_['oml:format'],
254-
'status': dataset_['oml:status']}
250+
ignore_attributes = ['oml:file_id', 'oml:quality']
251+
dataset = {k.replace('oml:', ''): v
252+
for (k, v) in dataset_.items()
253+
if k not in ignore_attributes}
254+
dataset['did'] = int(dataset['did'])
255+
dataset['version'] = int(dataset['version'])
255256

256257
# The number of qualities can range from 0 to infinity
257258
for quality in dataset_.get('oml:quality', list()):
258-
quality['#text'] = float(quality['#text'])
259-
if abs(int(quality['#text']) - quality['#text']) < 0.0000001:
260-
quality['#text'] = int(quality['#text'])
261-
dataset[quality['@name']] = quality['#text']
262-
datasets[did] = dataset
259+
try:
260+
dataset[quality['@name']] = int(quality['#text'])
261+
except ValueError:
262+
dataset[quality['@name']] = float(quality['#text'])
263+
datasets[dataset['did']] = dataset
263264

264265
return datasets
265266

@@ -298,6 +299,47 @@ def check_datasets_active(dataset_ids: List[int]) -> Dict[int, bool]:
298299
return active
299300

300301

302+
def _name_to_id(
303+
dataset_name: str,
304+
version: Optional[int] = None,
305+
error_if_multiple: bool = False
306+
) -> int:
307+
""" Attempt to find the dataset id of the dataset with the given name.
308+
309+
If multiple datasets with the name exist, and ``error_if_multiple`` is ``False``,
310+
then return the least recent still active dataset.
311+
312+
Raises an error if no dataset with the name is found.
313+
Raises an error if a version is specified but it could not be found.
314+
315+
Parameters
316+
----------
317+
dataset_name : str
318+
The name of the dataset for which to find its id.
319+
version : int
320+
Version to retrieve. If not specified, the oldest active version is returned.
321+
error_if_multiple : bool (default=False)
322+
If `False`, if multiple datasets match, return the least recent active dataset.
323+
If `True`, if multiple datasets match, raise an error.
324+
325+
Returns
326+
-------
327+
int
328+
The id of the dataset.
329+
"""
330+
status = None if version is not None else 'active'
331+
candidates = list_datasets(data_name=dataset_name, status=status, data_version=version)
332+
if error_if_multiple and len(candidates) > 1:
333+
raise ValueError("Multiple active datasets exist with name {}".format(dataset_name))
334+
if len(candidates) == 0:
335+
no_dataset_for_name = "No active datasets exist with name {}".format(dataset_name)
336+
and_version = " and version {}".format(version) if version is not None else ""
337+
raise RuntimeError(no_dataset_for_name + and_version)
338+
339+
# Dataset ids are chronological so we can just sort based on ids (instead of version)
340+
return sorted(candidates)[0]
341+
342+
301343
def get_datasets(
302344
dataset_ids: List[Union[str, int]],
303345
download_data: bool = True,
@@ -309,7 +351,8 @@ def get_datasets(
309351
Parameters
310352
----------
311353
dataset_ids : iterable
312-
Integers or strings representing dataset ids.
354+
Integers or strings representing dataset ids or dataset names.
355+
If dataset names are specified, the least recent still active dataset version is returned.
313356
download_data : bool, optional
314357
If True, also download the data file. Beware that some datasets are large and it might
315358
make the operation noticeably slower. Metadata is also still retrieved.
@@ -328,13 +371,23 @@ def get_datasets(
328371

329372

330373
@openml.utils.thread_safe_if_oslo_installed
331-
def get_dataset(dataset_id: Union[int, str], download_data: bool = True) -> OpenMLDataset:
374+
def get_dataset(
375+
dataset_id: Union[int, str],
376+
download_data: bool = True,
377+
version: int = None,
378+
error_if_multiple: bool = False
379+
) -> OpenMLDataset:
332380
""" Download the OpenML dataset representation, optionally also download actual data file.
333381
334382
This function is thread/multiprocessing safe.
335383
This function uses caching. A check will be performed to determine if the information has
336384
previously been downloaded, and if so be loaded from disk instead of retrieved from the server.
337385
386+
If dataset is retrieved by name, a version may be specified.
387+
If no version is specified and multiple versions of the dataset exist,
388+
the earliest version of the dataset that is still active will be returned.
389+
This scenario will raise an error instead if `error_if_multiple` is `True`.
390+
338391
Parameters
339392
----------
340393
dataset_id : int or str
@@ -344,16 +397,24 @@ def get_dataset(dataset_id: Union[int, str], download_data: bool = True) -> Open
344397
make the operation noticeably slower. Metadata is also still retrieved.
345398
If False, create the OpenMLDataset and only populate it with the metadata.
346399
The data may later be retrieved through the `OpenMLDataset.get_data` method.
400+
version : int, optional (default=None)
401+
Specifies the version if `dataset_id` is specified by name.
402+
If no version is specified, retrieve the least recent still active version.
403+
error_if_multiple : bool, optional (default=False)
404+
If `True` raise an error if multiple datasets are found with matching criteria.
347405
348406
Returns
349407
-------
350408
dataset : :class:`openml.OpenMLDataset`
351409
The downloaded dataset."""
352-
try:
353-
dataset_id = int(dataset_id)
354-
except (ValueError, TypeError):
355-
raise ValueError("Dataset ID is neither an Integer nor can be "
356-
"cast to an Integer.")
410+
if isinstance(dataset_id, str):
411+
try:
412+
dataset_id = int(dataset_id)
413+
except ValueError:
414+
dataset_id = _name_to_id(dataset_id, version, error_if_multiple) # type: ignore
415+
elif not isinstance(dataset_id, int):
416+
raise TypeError("`dataset_id` must be one of `str` or `int`, not {}."
417+
.format(type(dataset_id)))
357418

358419
did_cache_dir = _create_cache_directory_for_id(
359420
DATASETS_CACHE_DIR_NAME, dataset_id,

tests/test_datasets/test_dataset_functions.py

Lines changed: 106 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -219,70 +219,120 @@ def test_check_datasets_active(self):
219219
)
220220
openml.config.server = self.test_server
221221

222+
def _datasets_retrieved_successfully(self, dids, metadata_only=True):
223+
""" Checks that all files for the given dids have been downloaded.
224+
225+
This includes:
226+
- description
227+
- qualities
228+
- features
229+
- absence of data arff if metadata_only, else it must be present too.
230+
"""
231+
for did in dids:
232+
self.assertTrue(os.path.exists(os.path.join(
233+
openml.config.get_cache_directory(), "datasets", str(did), "description.xml")))
234+
self.assertTrue(os.path.exists(os.path.join(
235+
openml.config.get_cache_directory(), "datasets", str(did), "qualities.xml")))
236+
self.assertTrue(os.path.exists(os.path.join(
237+
openml.config.get_cache_directory(), "datasets", str(did), "features.xml")))
238+
239+
data_assert = self.assertFalse if metadata_only else self.assertTrue
240+
data_assert(os.path.exists(os.path.join(
241+
openml.config.get_cache_directory(), "datasets", str(did), "dataset.arff")))
242+
243+
def test__name_to_id_with_deactivated(self):
244+
""" Check that an activated dataset is returned if an earlier deactivated one exists. """
245+
openml.config.server = self.production_server
246+
# /d/1 was deactivated
247+
self.assertEqual(openml.datasets.functions._name_to_id('anneal'), 2)
248+
openml.config.server = self.test_server
249+
250+
def test__name_to_id_with_multiple_active(self):
251+
""" With multiple active datasets, retrieve the least recent active. """
252+
self.assertEqual(openml.datasets.functions._name_to_id('iris'), 128)
253+
254+
def test__name_to_id_with_version(self):
255+
""" With multiple active datasets, retrieve the least recent active. """
256+
self.assertEqual(openml.datasets.functions._name_to_id('iris', version=3), 151)
257+
258+
def test__name_to_id_with_multiple_active_error(self):
259+
""" With multiple active datasets, retrieve the least recent active. """
260+
self.assertRaisesRegex(
261+
ValueError,
262+
"Multiple active datasets exist with name iris",
263+
openml.datasets.functions._name_to_id,
264+
dataset_name='iris',
265+
error_if_multiple=True
266+
)
267+
268+
def test__name_to_id_name_does_not_exist(self):
269+
""" With multiple active datasets, retrieve the least recent active. """
270+
self.assertRaisesRegex(
271+
RuntimeError,
272+
"No active datasets exist with name does_not_exist",
273+
openml.datasets.functions._name_to_id,
274+
dataset_name='does_not_exist'
275+
)
276+
277+
def test__name_to_id_version_does_not_exist(self):
278+
""" With multiple active datasets, retrieve the least recent active. """
279+
self.assertRaisesRegex(
280+
RuntimeError,
281+
"No active datasets exist with name iris and version 100000",
282+
openml.datasets.functions._name_to_id,
283+
dataset_name='iris',
284+
version=100000
285+
)
286+
287+
def test_get_datasets_by_name(self):
288+
# did 1 and 2 on the test server:
289+
dids = ['anneal', 'kr-vs-kp']
290+
datasets = openml.datasets.get_datasets(dids, download_data=False)
291+
self.assertEqual(len(datasets), 2)
292+
self._datasets_retrieved_successfully([1, 2])
293+
294+
def test_get_datasets_by_mixed(self):
295+
# did 1 and 2 on the test server:
296+
dids = ['anneal', 2]
297+
datasets = openml.datasets.get_datasets(dids, download_data=False)
298+
self.assertEqual(len(datasets), 2)
299+
self._datasets_retrieved_successfully([1, 2])
300+
222301
def test_get_datasets(self):
223302
dids = [1, 2]
224303
datasets = openml.datasets.get_datasets(dids)
225304
self.assertEqual(len(datasets), 2)
226-
self.assertTrue(os.path.exists(os.path.join(
227-
openml.config.get_cache_directory(), "datasets", "1", "description.xml")))
228-
self.assertTrue(os.path.exists(os.path.join(
229-
openml.config.get_cache_directory(), "datasets", "2", "description.xml")))
230-
self.assertTrue(os.path.exists(os.path.join(
231-
openml.config.get_cache_directory(), "datasets", "1", "dataset.arff")))
232-
self.assertTrue(os.path.exists(os.path.join(
233-
openml.config.get_cache_directory(), "datasets", "2", "dataset.arff")))
234-
self.assertTrue(os.path.exists(os.path.join(
235-
openml.config.get_cache_directory(), "datasets", "1", "features.xml")))
236-
self.assertTrue(os.path.exists(os.path.join(
237-
openml.config.get_cache_directory(), "datasets", "2", "features.xml")))
238-
self.assertTrue(os.path.exists(os.path.join(
239-
openml.config.get_cache_directory(), "datasets", "1", "qualities.xml")))
240-
self.assertTrue(os.path.exists(os.path.join(
241-
openml.config.get_cache_directory(), "datasets", "2", "qualities.xml")))
305+
self._datasets_retrieved_successfully([1, 2], metadata_only=False)
242306

243307
def test_get_datasets_lazy(self):
244308
dids = [1, 2]
245309
datasets = openml.datasets.get_datasets(dids, download_data=False)
246310
self.assertEqual(len(datasets), 2)
247-
self.assertTrue(os.path.exists(os.path.join(
248-
openml.config.get_cache_directory(), "datasets", "1", "description.xml")))
249-
self.assertTrue(os.path.exists(os.path.join(
250-
openml.config.get_cache_directory(), "datasets", "2", "description.xml")))
251-
self.assertTrue(os.path.exists(os.path.join(
252-
openml.config.get_cache_directory(), "datasets", "1", "features.xml")))
253-
self.assertTrue(os.path.exists(os.path.join(
254-
openml.config.get_cache_directory(), "datasets", "2", "features.xml")))
255-
self.assertTrue(os.path.exists(os.path.join(
256-
openml.config.get_cache_directory(), "datasets", "1", "qualities.xml")))
257-
self.assertTrue(os.path.exists(os.path.join(
258-
openml.config.get_cache_directory(), "datasets", "2", "qualities.xml")))
259-
260-
self.assertFalse(os.path.exists(os.path.join(
261-
openml.config.get_cache_directory(), "datasets", "1", "dataset.arff")))
262-
self.assertFalse(os.path.exists(os.path.join(
263-
openml.config.get_cache_directory(), "datasets", "2", "dataset.arff")))
311+
self._datasets_retrieved_successfully([1, 2], metadata_only=True)
264312

265313
datasets[0].get_data()
266-
self.assertTrue(os.path.exists(os.path.join(
267-
openml.config.get_cache_directory(), "datasets", "1", "dataset.arff")))
268-
269314
datasets[1].get_data()
270-
self.assertTrue(os.path.exists(os.path.join(
271-
openml.config.get_cache_directory(), "datasets", "2", "dataset.arff")))
315+
self._datasets_retrieved_successfully([1, 2], metadata_only=False)
316+
317+
def test_get_dataset_by_name(self):
318+
dataset = openml.datasets.get_dataset('anneal')
319+
self.assertEqual(type(dataset), OpenMLDataset)
320+
self.assertEqual(dataset.dataset_id, 1)
321+
self._datasets_retrieved_successfully([1], metadata_only=False)
322+
323+
self.assertGreater(len(dataset.features), 1)
324+
self.assertGreater(len(dataset.qualities), 4)
325+
326+
# Issue324 Properly handle private datasets when trying to access them
327+
openml.config.server = self.production_server
328+
self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, 45)
272329

273330
def test_get_dataset(self):
274331
# This is the only non-lazy load to ensure default behaviour works.
275332
dataset = openml.datasets.get_dataset(1)
276333
self.assertEqual(type(dataset), OpenMLDataset)
277334
self.assertEqual(dataset.name, 'anneal')
278-
self.assertTrue(os.path.exists(os.path.join(
279-
openml.config.get_cache_directory(), "datasets", "1", "description.xml")))
280-
self.assertTrue(os.path.exists(os.path.join(
281-
openml.config.get_cache_directory(), "datasets", "1", "dataset.arff")))
282-
self.assertTrue(os.path.exists(os.path.join(
283-
openml.config.get_cache_directory(), "datasets", "1", "features.xml")))
284-
self.assertTrue(os.path.exists(os.path.join(
285-
openml.config.get_cache_directory(), "datasets", "1", "qualities.xml")))
335+
self._datasets_retrieved_successfully([1], metadata_only=False)
286336

287337
self.assertGreater(len(dataset.features), 1)
288338
self.assertGreater(len(dataset.qualities), 4)
@@ -295,22 +345,13 @@ def test_get_dataset_lazy(self):
295345
dataset = openml.datasets.get_dataset(1, download_data=False)
296346
self.assertEqual(type(dataset), OpenMLDataset)
297347
self.assertEqual(dataset.name, 'anneal')
298-
self.assertTrue(os.path.exists(os.path.join(
299-
openml.config.get_cache_directory(), "datasets", "1", "description.xml")))
300-
self.assertTrue(os.path.exists(os.path.join(
301-
openml.config.get_cache_directory(), "datasets", "1", "features.xml")))
302-
self.assertTrue(os.path.exists(os.path.join(
303-
openml.config.get_cache_directory(), "datasets", "1", "qualities.xml")))
304-
305-
self.assertFalse(os.path.exists(os.path.join(
306-
openml.config.get_cache_directory(), "datasets", "1", "dataset.arff")))
348+
self._datasets_retrieved_successfully([1], metadata_only=True)
307349

308350
self.assertGreater(len(dataset.features), 1)
309351
self.assertGreater(len(dataset.qualities), 4)
310352

311353
dataset.get_data()
312-
self.assertTrue(os.path.exists(os.path.join(
313-
openml.config.get_cache_directory(), "datasets", "1", "dataset.arff")))
354+
self._datasets_retrieved_successfully([1], metadata_only=False)
314355

315356
# Issue324 Properly handle private datasets when trying to access them
316357
openml.config.server = self.production_server
@@ -321,27 +362,26 @@ def test_get_dataset_lazy_all_functions(self):
321362
dataset = openml.datasets.get_dataset(1, download_data=False)
322363
# We only tests functions as general integrity is tested by test_get_dataset_lazy
323364

365+
def ensure_absence_of_real_data():
366+
self.assertFalse(os.path.exists(os.path.join(
367+
openml.config.get_cache_directory(), "datasets", "1", "dataset.arff")))
368+
324369
tag = 'test_lazy_tag_%d' % random.randint(1, 1000000)
325370
dataset.push_tag(tag)
326-
self.assertFalse(os.path.exists(os.path.join(
327-
openml.config.get_cache_directory(), "datasets", "1", "dataset.arff")))
371+
ensure_absence_of_real_data()
328372

329373
dataset.remove_tag(tag)
330-
self.assertFalse(os.path.exists(os.path.join(
331-
openml.config.get_cache_directory(), "datasets", "1", "dataset.arff")))
374+
ensure_absence_of_real_data()
332375

333376
nominal_indices = dataset.get_features_by_type('nominal')
334-
self.assertFalse(os.path.exists(os.path.join(
335-
openml.config.get_cache_directory(), "datasets", "1", "dataset.arff")))
336377
correct = [0, 1, 2, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
337378
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 35, 36, 37, 38]
338379
self.assertEqual(nominal_indices, correct)
380+
ensure_absence_of_real_data()
339381

340382
classes = dataset.retrieve_class_labels()
341383
self.assertEqual(classes, ['1', '2', '3', '4', '5', 'U'])
342-
343-
self.assertFalse(os.path.exists(os.path.join(
344-
openml.config.get_cache_directory(), "datasets", "1", "dataset.arff")))
384+
ensure_absence_of_real_data()
345385

346386
def test_get_dataset_sparse(self):
347387
dataset = openml.datasets.get_dataset(102, download_data=False)

0 commit comments

Comments
 (0)