
Commit a5ef4a5

Merge pull request #319 from openml/parallel_unit_tests
Parallel unit tests
2 parents c86b9e5 + ab73182 commit a5ef4a5

24 files changed

Lines changed: 214 additions & 173 deletions

ci_scripts/install.sh

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ conda create -n testenv --yes python=$PYTHON_VERSION pip
 source activate testenv
 pip install nose numpy scipy cython scikit-learn==$SKLEARN_VERSION pandas \
     matplotlib jupyter notebook nbconvert nbformat jupyter_client ipython \
-    ipykernel
+    ipykernel oslo.concurrency
 
 if [[ "$COVERAGE" == "true" ]]; then
     pip install codecov

ci_scripts/test.sh

Lines changed: 2 additions & 2 deletions
@@ -10,7 +10,7 @@ test_dir=$cwd/tests
 cd $TEST_DIR
 
 if [[ "$COVERAGE" == "true" ]]; then
-    nosetests -sv --with-coverage --cover-package=$MODULE $test_dir
+    nosetests --processes=4 --process-timeout=600 -sv --with-coverage --cover-package=$MODULE $test_dir
 else
-    nosetests -sv $test_dir
+    nosetests --processes=4 --process-timeout=600 -sv $test_dir
fi
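
The --processes=4 flag hands the suite to nose's multiprocess plugin, which distributes tests across four worker processes, and --process-timeout=600 treats a worker as hung if it takes more than 600 seconds to report results, so a stuck lock cannot hang CI forever. A minimal sketch of the same invocation driven from Python rather than the shell, assuming only that nose (whose multiprocess plugin ships with it) is installed; the 'tests' directory name is illustrative:

# A minimal sketch: nose.run() accepts the same argv that the
# nosetests console script does, multiprocess plugin included.
import nose

nose.run(argv=[
    'nosetests',
    '--processes=4',          # spread tests over 4 worker processes
    '--process-timeout=600',  # treat a worker as hung after 600s
    '-sv',                    # don't capture stdout, verbose output
    'tests',                  # hypothetical test directory
])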

openml/config.py

Lines changed: 1 addition & 4 deletions
@@ -18,9 +18,6 @@
 cachedir = ""
 
 
-
-
-
 def _setup():
     """Setup openml package. Called on first import.

@@ -71,7 +68,7 @@ def set_cache_directory(cachedir):
    dataset_cache_dir = os.path.join(cachedir, "datasets")
    task_cache_dir = os.path.join(cachedir, "tasks")
    run_cache_dir = os.path.join(cachedir, 'runs')
-
+    lock_dir = os.path.join(cachedir, 'locks')
 
    for dir_ in [cachedir, dataset_cache_dir, task_cache_dir, run_cache_dir]:
        if not os.path.exists(dir_) and not os.path.isdir(dir_):
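
This is the same '<cache>/locks' path the lock calls below use. Note that the hunk defines lock_dir but the directory-creation loop in the visible context still iterates only over the other cache directories; whether lock_dir is added to that list elsewhere is not shown in this diff. A hedged sketch, not the package's code, of ensuring the directory exists (the cache root below is hypothetical):

import os

# Make sure the lock directory exists before the first external_lock()
# call tries to write a lock file into it.
cachedir = os.path.expanduser('~/.openml/cache')  # hypothetical cache root
lock_dir = os.path.join(cachedir, 'locks')
if not os.path.exists(lock_dir):
    os.makedirs(lock_dir)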

openml/datasets/dataset.py

Lines changed: 1 addition & 1 deletion
@@ -205,7 +205,7 @@ def get_data(self, target=None, target_dtype=int, include_row_id=False,
 
         path = self.data_pickle_file
         if not os.path.exists(path):
-            raise ValueError("Cannot find a ndarray file for dataset %s at"
+            raise ValueError("Cannot find a ndarray file for dataset %s at "
                              "location %s " % (self.name, path))
         else:
             with open(path, "rb") as fh:
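
The one-character fix is easy to miss: Python joins adjacent string literals with no implicit separator, so the original message ran "at" and "location" together as "atlocation". A quick illustration with hypothetical values:

# Adjacent string literals concatenate with no implicit space:
broken = ("Cannot find a ndarray file for dataset %s at"
          "location %s " % ('iris', '/tmp/cache'))
fixed = ("Cannot find a ndarray file for dataset %s at "
         "location %s " % ('iris', '/tmp/cache'))
print(broken)  # ... dataset iris atlocation /tmp/cache
print(fixed)   # ... dataset iris at location /tmp/cache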

openml/datasets/functions.py

Lines changed: 34 additions & 13 deletions
@@ -4,6 +4,7 @@
 import re
 import shutil
 
+from oslo_concurrency import lockutils
 import xmltodict
 
 from .dataset import OpenMLDataset

@@ -259,6 +260,8 @@ def get_dataset(dataset_id):
 
     TODO: explain caching!
 
+    This function is thread/multiprocessing safe.
+
     Parameters
     ----------
     ddataset_id : int

@@ -274,24 +277,32 @@ def get_dataset(dataset_id):
         raise ValueError("Dataset ID is neither an Integer nor can be "
                          "cast to an Integer.")
 
-    did_cache_dir = _create_dataset_cache_directory(dataset_id)
-
-    try:
-        description = _get_dataset_description(did_cache_dir, dataset_id)
-        arff_file = _get_dataset_arff(did_cache_dir, description)
-        features = _get_dataset_features(did_cache_dir, dataset_id)
-        # TODO not used yet, figure out what to do with this...
-        qualities = _get_dataset_qualities(did_cache_dir, dataset_id)
-    except Exception as e:
-        _remove_dataset_cache_dir(did_cache_dir)
-        raise e
+    with lockutils.external_lock(
+            name='datasets.functions.get_dataset:%d' % dataset_id,
+            lock_path=os.path.join(config.get_cache_directory(), 'locks'),
+    ):
+        did_cache_dir = _create_dataset_cache_directory(dataset_id)
 
-    dataset = _create_dataset_from_description(description, features, qualities, arff_file)
+        try:
+            description = _get_dataset_description(did_cache_dir, dataset_id)
+            arff_file = _get_dataset_arff(did_cache_dir, description)
+            features = _get_dataset_features(did_cache_dir, dataset_id)
+            # TODO not used yet, figure out what to do with this...
+            qualities = _get_dataset_qualities(did_cache_dir, dataset_id)
+        except Exception as e:
+            _remove_dataset_cache_dir(did_cache_dir)
+            raise e
+
+        dataset = _create_dataset_from_description(
+            description, features, qualities, arff_file
+        )
     return dataset
 
 
 def _get_dataset_description(did_cache_dir, dataset_id):
-    """Get the dataset description as xml dictionary
+    """Get the dataset description as xml dictionary.
+
+    This function is NOT thread/multiprocessing safe.
 
     Parameters
     ----------

@@ -337,6 +348,8 @@ def _get_dataset_arff(did_cache_dir, description):
     Checks if the file is in the cache, if yes, return the path to the file. If
     not, downloads the file and caches it, then returns the file path.
 
+    This function is NOT thread/multiprocessing safe.
+
     Parameters
     ----------
     did_cache_dir : str

@@ -377,6 +390,8 @@ def _get_dataset_features(did_cache_dir, dataset_id):
     Features are feature descriptions for each column.
     (name, index, categorical, ...)
 
+    This function is NOT thread/multiprocessing safe.
+
     Parameters
     ----------
     did_cache_dir : str

@@ -412,6 +427,8 @@ def _get_dataset_qualities(did_cache_dir, dataset_id):
 
     Features are metafeatures (number of features, number of classes, ...)
 
+    This function is NOT thread/multiprocessing safe.
+
     Parameters
     ----------
     did_cache_dir : str

@@ -449,6 +466,8 @@ def _create_dataset_cache_directory(dataset_id):
     is a directory for each dataset witch the dataset ID being the directory
     name. This function creates this cache directory.
 
+    This function is NOT thread/multiprocessing safe.
+
     Parameters
     ----------
     did : int

@@ -471,6 +490,8 @@ def _create_dataset_cache_directory(dataset_id):
 def _remove_dataset_cache_dir(did_cache_dir):
     """Remove the dataset cache directory
 
+    This function is NOT thread/multiprocessing safe.
+
     Parameters
     ----------
     """

openml/evaluations/functions.py

Lines changed: 1 addition & 2 deletions
@@ -32,8 +32,7 @@ def list_evaluations(function, offset=None, size=None, id=None, task=None, setup
 
     Returns
     -------
-    list
-        List of found evaluations.
+    dict
     """
 
     api_call = "evaluation/list/function/%s" %function

openml/setups/functions.py

Lines changed: 1 addition & 2 deletions
@@ -93,8 +93,7 @@ def list_setups(flow=None, tag=None, setup=None, offset=None, size=None):
 
     Returns
     -------
-    list
-        List of found setups.
+    dict
     """
 
     api_call = "setup/list"

openml/tasks/functions.py

Lines changed: 20 additions & 15 deletions
@@ -3,6 +3,7 @@
 import re
 import os
 
+from oslo_concurrency import lockutils
 import xmltodict
 
 from ..exceptions import OpenMLCacheException

@@ -195,26 +196,30 @@ def get_task(task_id):
     xml_file = os.path.join(_create_task_cache_dir(task_id),
                             "task.xml")
 
-    try:
-        with io.open(xml_file, encoding='utf8') as fh:
-            task = _create_task_from_xml(fh.read())
+    with lockutils.external_lock(
+            name='datasets.functions.get_dataset:%d' % task_id,
+            lock_path=os.path.join(config.get_cache_directory(), 'locks'),
+    ):
+        try:
+            with io.open(xml_file, encoding='utf8') as fh:
+                task = _create_task_from_xml(fh.read())
 
-    except (OSError, IOError):
-        task_xml = _perform_api_call("task/%d" % task_id)
+        except (OSError, IOError):
+            task_xml = _perform_api_call("task/%d" % task_id)
 
-        with io.open(xml_file, "w", encoding='utf8') as fh:
-            fh.write(task_xml)
+            with io.open(xml_file, "w", encoding='utf8') as fh:
+                fh.write(task_xml)
 
-        task = _create_task_from_xml(task_xml)
+            task = _create_task_from_xml(task_xml)
 
-    # TODO extract this to a function
-    task.download_split()
-    dataset = datasets.get_dataset(task.dataset_id)
+        # TODO extract this to a function
+        task.download_split()
+        dataset = datasets.get_dataset(task.dataset_id)
 
-    # TODO look into either adding the class labels to task xml, or other
-    # way of reading it.
-    class_labels = dataset.retrieve_class_labels(task.target_name)
-    task.class_labels = class_labels
+        # TODO look into either adding the class labels to task xml, or other
+        # way of reading it.
+        class_labels = dataset.retrieve_class_labels(task.target_name)
+        task.class_labels = class_labels
     return task
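
Two details here are worth noting. First, the lock wraps the whole read-or-download sequence, so a cache miss and the follow-up API call happen atomically with respect to other workers. Second, the lock name is still 'datasets.functions.get_dataset:%d', seemingly copied from get_dataset; since the name keys the lock file, task 42 and dataset 42 would contend for the same lock. A hedged sketch of a per-resource naming scheme (this helper and the 'tasks.functions.get_task' name are hypothetical, not what the commit uses):

from oslo_concurrency import lockutils

def resource_lock(kind, resource_id, lock_path):
    """Build a lock scoped to one resource.

    Distinct names map to distinct lock files, so locking
    kind='tasks.functions.get_task' with id 42 does not serialize
    with locking a dataset that happens to share the same id.
    """
    return lockutils.external_lock(
        name='%s:%d' % (kind, resource_id),
        lock_path=lock_path,
    )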

openml/testing.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@ def setUp(self):
3636

3737
self.cwd = os.getcwd()
3838
workdir = os.path.dirname(os.path.abspath(__file__))
39-
self.workdir = os.path.join(workdir, "tmp")
39+
tmp_dir_name = self.id()
40+
self.workdir = os.path.join(workdir, tmp_dir_name)
4041
try:
4142
shutil.rmtree(self.workdir)
4243
except:
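
TestCase.id() returns the fully qualified dotted name of the running test (module.Class.method), which is unique per test, so each test now gets its own scratch directory instead of everyone sharing "tmp"; that is what lets parallel nose workers run setUp concurrently without deleting each other's working files. A quick self-contained illustration:

import unittest

class WorkdirDemo(unittest.TestCase):
    def test_id_is_unique(self):
        # id() yields e.g. '__main__.WorkdirDemo.test_id_is_unique',
        # a name no other test shares, hence a safe directory name.
        self.assertIn('test_id_is_unique', self.id())

if __name__ == '__main__':
    unittest.main()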

requirements.txt

Lines changed: 2 additions & 1 deletion
@@ -7,4 +7,5 @@ nose
 requests
 scikit-learn>=0.18
 nbformat
-python-dateutil
+python-dateutil
+oslo.concurrency
