Skip to content

Commit 397f94d

Browse files
committed
Make oslo a test-only dependency.
1 parent 80dff77 commit 397f94d

5 files changed

Lines changed: 88 additions & 71 deletions

File tree

openml/datasets/functions.py

Lines changed: 28 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import io
22
import os
33
import re
4-
import warnings
54
from typing import List, Dict, Union
65

76
import numpy as np
@@ -10,11 +9,6 @@
109

1110
import xmltodict
1211
from scipy.sparse import coo_matrix
13-
# Currently, importing oslo raises a lot of warning that it will stop working
14-
# under python3.8; remove this once they disappear
15-
with warnings.catch_warnings():
16-
warnings.simplefilter("ignore")
17-
from oslo_concurrency import lockutils
1812
from collections import OrderedDict
1913

2014
import openml.utils
@@ -334,6 +328,7 @@ def get_datasets(
334328
return datasets
335329

336330

331+
@openml.utils.thread_safe_if_oslo_installed
337332
def get_dataset(dataset_id: Union[int, str], download_data: bool = True) -> OpenMLDataset:
338333
""" Download the OpenML dataset representation, optionally also download actual data file.
339334
@@ -361,38 +356,34 @@ def get_dataset(dataset_id: Union[int, str], download_data: bool = True) -> Open
361356
raise ValueError("Dataset ID is neither an Integer nor can be "
362357
"cast to an Integer.")
363358

364-
with lockutils.external_lock(
365-
name='datasets.functions.get_dataset:%d' % dataset_id,
366-
lock_path=_create_lockfiles_dir(),
367-
):
368-
did_cache_dir = _create_cache_directory_for_id(
369-
DATASETS_CACHE_DIR_NAME, dataset_id,
370-
)
359+
did_cache_dir = _create_cache_directory_for_id(
360+
DATASETS_CACHE_DIR_NAME, dataset_id,
361+
)
371362

372-
try:
373-
remove_dataset_cache = True
374-
description = _get_dataset_description(did_cache_dir, dataset_id)
375-
features = _get_dataset_features(did_cache_dir, dataset_id)
376-
qualities = _get_dataset_qualities(did_cache_dir, dataset_id)
377-
378-
arff_file = _get_dataset_arff(description) if download_data else None
379-
380-
remove_dataset_cache = False
381-
except OpenMLServerException as e:
382-
# if there was an exception,
383-
# check if the user had access to the dataset
384-
if e.code == 112:
385-
raise OpenMLPrivateDatasetError(e.message) from None
386-
else:
387-
raise e
388-
finally:
389-
if remove_dataset_cache:
390-
_remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME,
391-
did_cache_dir)
392-
393-
dataset = _create_dataset_from_description(
394-
description, features, qualities, arff_file
395-
)
363+
try:
364+
remove_dataset_cache = True
365+
description = _get_dataset_description(did_cache_dir, dataset_id)
366+
features = _get_dataset_features(did_cache_dir, dataset_id)
367+
qualities = _get_dataset_qualities(did_cache_dir, dataset_id)
368+
369+
arff_file = _get_dataset_arff(description) if download_data else None
370+
371+
remove_dataset_cache = False
372+
except OpenMLServerException as e:
373+
# if there was an exception,
374+
# check if the user had access to the dataset
375+
if e.code == 112:
376+
raise OpenMLPrivateDatasetError(e.message) from None
377+
else:
378+
raise e
379+
finally:
380+
if remove_dataset_cache:
381+
_remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME,
382+
did_cache_dir)
383+
384+
dataset = _create_dataset_from_description(
385+
description, features, qualities, arff_file
386+
)
396387
return dataset
397388

398389

openml/flows/functions.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ def _get_cached_flow(fid: int) -> OpenMLFlow:
7070
"cached" % fid)
7171

7272

73+
@openml.utils.thread_safe_if_oslo_installed
7374
def get_flow(flow_id: int, reinstantiate: bool = False) -> OpenMLFlow:
7475
"""Download the OpenML flow for a given flow ID.
7576
@@ -87,11 +88,7 @@ def get_flow(flow_id: int, reinstantiate: bool = False) -> OpenMLFlow:
8788
the flow
8889
"""
8990
flow_id = int(flow_id)
90-
with lockutils.external_lock(
91-
name='flows.functions.get_flow:%d' % flow_id,
92-
lock_path=openml.utils._create_lockfiles_dir(),
93-
):
94-
flow = _get_flow_description(flow_id)
91+
flow = _get_flow_description(flow_id)
9592

9693
if reinstantiate:
9794
flow.model = flow.extension.flow_to_model(flow)

openml/tasks/functions.py

Lines changed: 24 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,7 @@ def get_tasks(task_ids, download_data=True):
300300
return tasks
301301

302302

303+
@openml.utils.thread_safe_if_oslo_installed
303304
def get_task(task_id: int, download_data: bool = True) -> OpenMLTask:
304305
"""Download OpenML task for a given task ID.
305306
@@ -324,34 +325,30 @@ def get_task(task_id: int, download_data: bool = True) -> OpenMLTask:
324325
raise ValueError("Dataset ID is neither an Integer nor can be "
325326
"cast to an Integer.")
326327

327-
with lockutils.external_lock(
328-
name='task.functions.get_task:%d' % task_id,
329-
lock_path=openml.utils._create_lockfiles_dir(),
330-
):
331-
tid_cache_dir = openml.utils._create_cache_directory_for_id(
332-
TASKS_CACHE_DIR_NAME, task_id,
333-
)
328+
tid_cache_dir = openml.utils._create_cache_directory_for_id(
329+
TASKS_CACHE_DIR_NAME, task_id,
330+
)
334331

335-
try:
336-
task = _get_task_description(task_id)
337-
dataset = get_dataset(task.dataset_id, download_data)
338-
# List of class labels availaible in dataset description
339-
# Including class labels as part of task meta data handles
340-
# the case where data download was initially disabled
341-
if isinstance(task, OpenMLClassificationTask):
342-
task.class_labels = \
343-
dataset.retrieve_class_labels(task.target_name)
344-
# Clustering tasks do not have class labels
345-
# and do not offer download_split
346-
if download_data:
347-
if isinstance(task, OpenMLSupervisedTask):
348-
task.download_split()
349-
except Exception as e:
350-
openml.utils._remove_cache_dir_for_id(
351-
TASKS_CACHE_DIR_NAME,
352-
tid_cache_dir,
353-
)
354-
raise e
332+
try:
333+
task = _get_task_description(task_id)
334+
dataset = get_dataset(task.dataset_id, download_data)
335+
# List of class labels available in dataset description
336+
# Including class labels as part of task meta data handles
337+
# the case where data download was initially disabled
338+
if isinstance(task, OpenMLClassificationTask):
339+
task.class_labels = \
340+
dataset.retrieve_class_labels(task.target_name)
341+
# Clustering tasks do not have class labels
342+
# and do not offer download_split
343+
if download_data:
344+
if isinstance(task, OpenMLSupervisedTask):
345+
task.download_split()
346+
except Exception as e:
347+
openml.utils._remove_cache_dir_for_id(
348+
TASKS_CACHE_DIR_NAME,
349+
tid_cache_dir,
350+
)
351+
raise e
355352

356353
return task
357354

openml/utils.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,23 @@
22
import hashlib
33
import xmltodict
44
import shutil
5+
import warnings
56

67
import openml._api_calls
78
import openml.exceptions
89
from . import config
910

11+
oslo_installed = False
12+
try:
13+
# Currently, importing oslo raises a lot of warning that it will stop working
14+
# under python3.8; remove this once they disappear
15+
with warnings.catch_warnings():
16+
warnings.simplefilter("ignore")
17+
from oslo_concurrency import lockutils
18+
oslo_installed = True
19+
except ImportError:
20+
pass
21+
1022

1123
def extract_xml_tags(xml_tag_name, node, allow_none=True):
1224
"""Helper to extract xml tags from xmltodict.
@@ -279,6 +291,26 @@ def _remove_cache_dir_for_id(key, cache_dir):
279291
'Please do this manually!' % (key, cache_dir))
280292

281293

294+
def thread_safe_if_oslo_installed(func, *args, **kwargs):
295+
if oslo_installed:
296+
# Lock directories use the id that is passed as either a first argument, or as a keyword.
297+
id_parameters = ['_id' in parameter_name for parameter_name in kwargs]
298+
if len(id_parameters) == 1:
299+
id_ = kwargs[id_parameters[0]]
300+
elif len(args) > 0:
301+
id_ = args[0]
302+
else:
303+
raise RuntimeError("An id must be specified for {}, was passed: ({}, {}).".format(
304+
func.__name__, args, kwargs
305+
))
306+
# The [7:] gets rid of the 'openml.' prefix
307+
lock_name = "{}.{}:{}".format(func.__module__[7:], func.__name__, id_)
308+
with lockutils.external_lock(name=lock_name, lock_path=_create_lockfiles_dir()):
309+
return func(*args, **kwargs)
310+
else:
311+
return func(*args, **kwargs)
312+
313+
282314
def _create_lockfiles_dir():
283315
dir = os.path.join(config.get_cache_directory(), 'locks')
284316
try:

setup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@
4141
'requests',
4242
'scikit-learn>=0.18',
4343
'python-dateutil', # Installed through pandas anyway.
44-
'oslo.concurrency',
4544
'pandas>=0.19.2',
4645
'scipy>=0.13.3',
4746
'numpy>=1.6.2'
@@ -54,7 +53,8 @@
5453
'pytest',
5554
'pytest-xdist',
5655
'pytest-timeout',
57-
'nbformat'
56+
'nbformat',
57+
'oslo.concurrency'
5858
],
5959
'examples': [
6060
'matplotlib',

0 commit comments

Comments
 (0)