Allow parallel testing of datasets.functions by guarding get_dataset with a per-dataset external lock

mfeurer · mfeurer · commit cd71051357b3 · 2017-10-05T14:57:41.000+02:00
diff --git a/openml/config.py b/openml/config.py
@@ -18,9 +18,6 @@
 cachedir = ""
 
 
-
-
-
 def _setup():
     """Setup openml package. Called on first import.
 
@@ -71,7 +68,7 @@ def set_cache_directory(cachedir):
     dataset_cache_dir = os.path.join(cachedir, "datasets")
     task_cache_dir = os.path.join(cachedir, "tasks")
     run_cache_dir = os.path.join(cachedir, 'runs')
-
+    lock_dir = os.path.join(cachedir, 'locks')
 
     for dir_ in [cachedir, dataset_cache_dir, task_cache_dir, run_cache_dir]:
         if not os.path.exists(dir_) and not os.path.isdir(dir_):
diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
@@ -4,6 +4,7 @@
 import re
 import shutil
 
+from oslo_concurrency import lockutils
 import xmltodict
 
 from .dataset import OpenMLDataset
@@ -259,6 +260,8 @@ def get_dataset(dataset_id):
 
     TODO: explain caching!
 
+    This function is thread/multiprocessing safe.
+
     Parameters
     ----------
     ddataset_id : int
@@ -274,24 +277,32 @@ def get_dataset(dataset_id):
         raise ValueError("Dataset ID is neither an Integer nor can be "
                          "cast to an Integer.")
 
-    did_cache_dir = _create_dataset_cache_directory(dataset_id)
-
-    try:
-        description = _get_dataset_description(did_cache_dir, dataset_id)
-        arff_file = _get_dataset_arff(did_cache_dir, description)
-        features = _get_dataset_features(did_cache_dir, dataset_id)
-        # TODO not used yet, figure out what to do with this...
-        qualities = _get_dataset_qualities(did_cache_dir, dataset_id)
-    except Exception as e:
-        _remove_dataset_cache_dir(did_cache_dir)
-        raise e
+    with lockutils.external_lock(
+        name='datasets.functions.get_dataset:%d' % dataset_id,
+        lock_path=os.path.join(config.get_cache_directory(), 'locks'),
+    ):
+        did_cache_dir = _create_dataset_cache_directory(dataset_id)
 
-    dataset = _create_dataset_from_description(description, features, qualities, arff_file)
+        try:
+            description = _get_dataset_description(did_cache_dir, dataset_id)
+            arff_file = _get_dataset_arff(did_cache_dir, description)
+            features = _get_dataset_features(did_cache_dir, dataset_id)
+            # TODO not used yet, figure out what to do with this...
+            qualities = _get_dataset_qualities(did_cache_dir, dataset_id)
+        except Exception as e:
+            _remove_dataset_cache_dir(did_cache_dir)
+            raise e
+
+        dataset = _create_dataset_from_description(
+            description, features, qualities, arff_file
+        )
     return dataset
 
 
 def _get_dataset_description(did_cache_dir, dataset_id):
-    """Get the dataset description as xml dictionary
+    """Get the dataset description as xml dictionary.
+
+    This function is NOT thread/multiprocessing safe.
 
     Parameters
     ----------
@@ -337,6 +348,8 @@ def _get_dataset_arff(did_cache_dir, description):
     Checks if the file is in the cache, if yes, return the path to the file. If
     not, downloads the file and caches it, then returns the file path.
 
+    This function is NOT thread/multiprocessing safe.
+
     Parameters
     ----------
     did_cache_dir : str
@@ -377,6 +390,8 @@ def _get_dataset_features(did_cache_dir, dataset_id):
     Features are feature descriptions for each column.
     (name, index, categorical, ...)
 
+    This function is NOT thread/multiprocessing safe.
+
     Parameters
     ----------
     did_cache_dir : str
@@ -412,6 +427,8 @@ def _get_dataset_qualities(did_cache_dir, dataset_id):
 
     Features are metafeatures (number of features, number of classes, ...)
 
+    This function is NOT thread/multiprocessing safe.
+
     Parameters
     ----------
     did_cache_dir : str
@@ -449,6 +466,8 @@ def _create_dataset_cache_directory(dataset_id):
     is a directory for each dataset witch the dataset ID being the directory
     name. This function creates this cache directory.
 
+    This function is NOT thread/multiprocessing safe.
+
     Parameters
     ----------
     did : int
@@ -471,6 +490,8 @@ def _create_dataset_cache_directory(dataset_id):
 def _remove_dataset_cache_dir(did_cache_dir):
     """Remove the dataset cache directory
 
+    This function is NOT thread/multiprocessing safe.
+
     Parameters
     ----------
     """
diff --git a/openml/testing.py b/openml/testing.py
@@ -19,7 +19,7 @@ class TestBase(unittest.TestCase):
     Hopefully soon allows using a test server, not the production server.
     """
 
-    def setUp(self):
+    def setUp(self, tmp_dir_name=None):
         # This cache directory is checked in to git to simulate a populated
         # cache
         self.maxDiff = None
@@ -36,7 +36,9 @@ def setUp(self):
 
         self.cwd = os.getcwd()
         workdir = os.path.dirname(os.path.abspath(__file__))
-        self.workdir = os.path.join(workdir, "tmp")
+        if tmp_dir_name is None:
+            tmp_dir_name = 'tmp'
+        self.workdir = os.path.join(workdir, tmp_dir_name)
         try:
             shutil.rmtree(self.workdir)
         except:
diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
@@ -27,9 +27,12 @@
 
 
 class TestOpenMLDataset(TestBase):
+    _multiprocess_can_split_ = True
 
-    def setUp(self):
-        super(TestOpenMLDataset, self).setUp()
+    def setUp(self, tmp_dir_name=None):
+        tmp_dir_name = self.id()
+        print(tmp_dir_name)
+        super(TestOpenMLDataset, self).setUp(tmp_dir_name=tmp_dir_name)
         self._remove_did1()
 
     def tearDown(self):