
Commit a5ef4a5

Merge pull request #319 from openml/parallel_unit_tests
Parallel unit tests
2 parents c86b9e5 + ab73182 commit a5ef4a5

24 files changed

Lines changed: 214 additions & 173 deletions

ci_scripts/install.sh

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ conda create -n testenv --yes python=$PYTHON_VERSION pip
 source activate testenv
 pip install nose numpy scipy cython scikit-learn==$SKLEARN_VERSION pandas \
     matplotlib jupyter notebook nbconvert nbformat jupyter_client ipython \
-    ipykernel
+    ipykernel oslo.concurrency
 
 if [[ "$COVERAGE" == "true" ]]; then
     pip install codecov

ci_scripts/test.sh

Lines changed: 2 additions & 2 deletions
@@ -10,7 +10,7 @@ test_dir=$cwd/tests
 cd $TEST_DIR
 
 if [[ "$COVERAGE" == "true" ]]; then
-    nosetests -sv --with-coverage --cover-package=$MODULE $test_dir
+    nosetests --processes=4 --process-timeout=600 -sv --with-coverage --cover-package=$MODULE $test_dir
 else
-    nosetests -sv $test_dir
+    nosetests --processes=4 --process-timeout=600 -sv $test_dir
fi
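
The --processes=4 flag hands the suite to nose's multiprocess plugin, which distributes tests across four worker processes, and --process-timeout=600 treats a worker as hung if it takes more than 600 seconds to report results, so a stuck lock cannot hang CI forever. A minimal sketch of the same invocation driven from Python rather than the shell, assuming only that nose (whose multiprocess plugin ships with it) is installed; the 'tests' directory name is illustrative:

# A minimal sketch: nose.run() accepts the same argv that the
# nosetests console script does, multiprocess plugin included.
import nose

nose.run(argv=[
    'nosetests',
    '--processes=4',          # spread tests over 4 worker processes
    '--process-timeout=600',  # treat a worker as hung after 600s
    '-sv',                    # don't capture stdout, verbose output
    'tests',                  # hypothetical test directory
])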

openml/config.py

Lines changed: 1 addition & 4 deletions
@@ -18,9 +18,6 @@
 cachedir = ""
 
 
-
-
-
 def _setup():
     """Setup openml package. Called on first import.

@@ -71,7 +68,7 @@ def set_cache_directory(cachedir):
    dataset_cache_dir = os.path.join(cachedir, "datasets")
    task_cache_dir = os.path.join(cachedir, "tasks")
    run_cache_dir = os.path.join(cachedir, 'runs')
-
+    lock_dir = os.path.join(cachedir, 'locks')
 
    for dir_ in [cachedir, dataset_cache_dir, task_cache_dir, run_cache_dir]:
        if not os.path.exists(dir_) and not os.path.isdir(dir_):
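
This is the same '<cache>/locks' path the lock calls below use. Note that the hunk defines lock_dir but the directory-creation loop in the visible context still iterates only over the other cache directories; whether lock_dir is added to that list elsewhere is not shown in this diff. A hedged sketch, not the package's code, of ensuring the directory exists (the cache root below is hypothetical):

import os

# Make sure the lock directory exists before the first external_lock()
# call tries to write a lock file into it.
cachedir = os.path.expanduser('~/.openml/cache')  # hypothetical cache root
lock_dir = os.path.join(cachedir, 'locks')
if not os.path.exists(lock_dir):
    os.makedirs(lock_dir)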

openml/datasets/dataset.py

Lines changed: 1 addition & 1 deletion
@@ -205,7 +205,7 @@ def get_data(self, target=None, target_dtype=int, include_row_id=False,
 
         path = self.data_pickle_file
         if not os.path.exists(path):
-            raise ValueError("Cannot find a ndarray file for dataset %s at"
+            raise ValueError("Cannot find a ndarray file for dataset %s at "
                              "location %s " % (self.name, path))
         else:
             with open(path, "rb") as fh:
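
The one-character fix is easy to miss: Python joins adjacent string literals with no implicit separator, so the original message ran "at" and "location" together as "atlocation". A quick illustration with hypothetical values:

# Adjacent string literals concatenate with no implicit space:
broken = ("Cannot find a ndarray file for dataset %s at"
          "location %s " % ('iris', '/tmp/cache'))
fixed = ("Cannot find a ndarray file for dataset %s at "
         "location %s " % ('iris', '/tmp/cache'))
print(broken)  # ... dataset iris atlocation /tmp/cache
print(fixed)   # ... dataset iris at location /tmp/cache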

openml/datasets/functions.py

Lines changed: 34 additions & 13 deletions
@@ -4,6 +4,7 @@
 import re
 import shutil
 
+from oslo_concurrency import lockutils
 import xmltodict
 
 from .dataset import OpenMLDataset

@@ -259,6 +260,8 @@ def get_dataset(dataset_id):
 
     TODO: explain caching!
 
+    This function is thread/multiprocessing safe.
+
     Parameters
     ----------
     ddataset_id : int

@@ -274,24 +277,32 @@ def get_dataset(dataset_id):
         raise ValueError("Dataset ID is neither an Integer nor can be "
                          "cast to an Integer.")
 
-    did_cache_dir = _create_dataset_cache_directory(dataset_id)
-
-    try:
-        description = _get_dataset_description(did_cache_dir, dataset_id)
-        arff_file = _get_dataset_arff(did_cache_dir, description)
-        features = _get_dataset_features(did_cache_dir, dataset_id)
-        # TODO not used yet, figure out what to do with this...
-        qualities = _get_dataset_qualities(did_cache_dir, dataset_id)
-    except Exception as e:
-        _remove_dataset_cache_dir(did_cache_dir)
-        raise e
+    with lockutils.external_lock(
+            name='datasets.functions.get_dataset:%d' % dataset_id,
+            lock_path=os.path.join(config.get_cache_directory(), 'locks'),
+    ):
+        did_cache_dir = _create_dataset_cache_directory(dataset_id)
 
-    dataset = _create_dataset_from_description(description, features, qualities, arff_file)
+        try:
+            description = _get_dataset_description(did_cache_dir, dataset_id)
+            arff_file = _get_dataset_arff(did_cache_dir, description)
+            features = _get_dataset_features(did_cache_dir, dataset_id)
+            # TODO not used yet, figure out what to do with this...
+            qualities = _get_dataset_qualities(did_cache_dir, dataset_id)
+        except Exception as e:
+            _remove_dataset_cache_dir(did_cache_dir)
+            raise e
+
+        dataset = _create_dataset_from_description(
+            description, features, qualities, arff_file
+        )
     return dataset
 
 
 def _get_dataset_description(did_cache_dir, dataset_id):
-    """Get the dataset description as xml dictionary
+    """Get the dataset description as xml dictionary.
+
+    This function is NOT thread/multiprocessing safe.
 
     Parameters
     ----------

@@ -337,6 +348,8 @@ def _get_dataset_arff(did_cache_dir, description):
     Checks if the file is in the cache, if yes, return the path to the file. If
     not, downloads the file and caches it, then returns the file path.
 
+    This function is NOT thread/multiprocessing safe.
+
     Parameters
     ----------
     did_cache_dir : str

@@ -377,6 +390,8 @@ def _get_dataset_features(did_cache_dir, dataset_id):
     Features are feature descriptions for each column.
     (name, index, categorical, ...)
 
+    This function is NOT thread/multiprocessing safe.
+
     Parameters
     ----------
     did_cache_dir : str

@@ -412,6 +427,8 @@ def _get_dataset_qualities(did_cache_dir, dataset_id):
 
     Features are metafeatures (number of features, number of classes, ...)
 
+    This function is NOT thread/multiprocessing safe.
+
     Parameters
     ----------
     did_cache_dir : str

@@ -449,6 +466,8 @@ def _create_dataset_cache_directory(dataset_id):
     is a directory for each dataset witch the dataset ID being the directory
     name. This function creates this cache directory.
 
+    This function is NOT thread/multiprocessing safe.
+
     Parameters
     ----------
     did : int

@@ -471,6 +490,8 @@ def _create_dataset_cache_directory(dataset_id):
 def _remove_dataset_cache_dir(did_cache_dir):
     """Remove the dataset cache directory
 
+    This function is NOT thread/multiprocessing safe.
+
     Parameters
     ----------
     """

openml/evaluations/functions.py

Lines changed: 1 addition & 2 deletions
@@ -32,8 +32,7 @@ def list_evaluations(function, offset=None, size=None, id=None, task=None, setup
 
     Returns
     -------
-    list
-        List of found evaluations.
+    dict
     """
 
     api_call = "evaluation/list/function/%s" %function

openml/setups/functions.py

Lines changed: 1 addition & 2 deletions
@@ -93,8 +93,7 @@ def list_setups(flow=None, tag=None, setup=None, offset=None, size=None):
 
     Returns
     -------
-    list
-        List of found setups.
+    dict
     """
 
     api_call = "setup/list"

openml/tasks/functions.py

Lines changed: 20 additions & 15 deletions
@@ -3,6 +3,7 @@
 import re
 import os
 
+from oslo_concurrency import lockutils
 import xmltodict
 
 from ..exceptions import OpenMLCacheException

@@ -195,26 +196,30 @@ def get_task(task_id):
     xml_file = os.path.join(_create_task_cache_dir(task_id),
                             "task.xml")
 
-    try:
-        with io.open(xml_file, encoding='utf8') as fh:
-            task = _create_task_from_xml(fh.read())
+    with lockutils.external_lock(
+            name='datasets.functions.get_dataset:%d' % task_id,
+            lock_path=os.path.join(config.get_cache_directory(), 'locks'),
+    ):
+        try:
+            with io.open(xml_file, encoding='utf8') as fh:
+                task = _create_task_from_xml(fh.read())
 
-    except (OSError, IOError):
-        task_xml = _perform_api_call("task/%d" % task_id)
+        except (OSError, IOError):
+            task_xml = _perform_api_call("task/%d" % task_id)
 
-        with io.open(xml_file, "w", encoding='utf8') as fh:
-            fh.write(task_xml)
+            with io.open(xml_file, "w", encoding='utf8') as fh:
+                fh.write(task_xml)
 
-        task = _create_task_from_xml(task_xml)
+            task = _create_task_from_xml(task_xml)
 
-    # TODO extract this to a function
-    task.download_split()
-    dataset = datasets.get_dataset(task.dataset_id)
+        # TODO extract this to a function
+        task.download_split()
+        dataset = datasets.get_dataset(task.dataset_id)
 
-    # TODO look into either adding the class labels to task xml, or other
-    # way of reading it.
-    class_labels = dataset.retrieve_class_labels(task.target_name)
-    task.class_labels = class_labels
+        # TODO look into either adding the class labels to task xml, or other
+        # way of reading it.
+        class_labels = dataset.retrieve_class_labels(task.target_name)
+        task.class_labels = class_labels
     return task
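
Two details here are worth noting. First, the lock wraps the whole read-or-download sequence, so a cache miss and the follow-up API call happen atomically with respect to other workers. Second, the lock name is still 'datasets.functions.get_dataset:%d', seemingly copied from get_dataset; since the name keys the lock file, task 42 and dataset 42 would contend for the same lock. A hedged sketch of a per-resource naming scheme (this helper and the 'tasks.functions.get_task' name are hypothetical, not what the commit uses):

from oslo_concurrency import lockutils

def resource_lock(kind, resource_id, lock_path):
    """Build a lock scoped to one resource.

    Distinct names map to distinct lock files, so locking
    kind='tasks.functions.get_task' with id 42 does not serialize
    with locking a dataset that happens to share the same id.
    """
    return lockutils.external_lock(
        name='%s:%d' % (kind, resource_id),
        lock_path=lock_path,
    )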

openml/testing.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@ def setUp(self):
3636

3737
self.cwd = os.getcwd()
3838
workdir = os.path.dirname(os.path.abspath(__file__))
39-
self.workdir = os.path.join(workdir, "tmp")
39+
tmp_dir_name = self.id()
40+
self.workdir = os.path.join(workdir, tmp_dir_name)
4041
try:
4142
shutil.rmtree(self.workdir)
4243
except:
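
TestCase.id() returns the fully qualified dotted name of the running test (module.Class.method), which is unique per test, so each test now gets its own scratch directory instead of everyone sharing "tmp"; that is what lets parallel nose workers run setUp concurrently without deleting each other's working files. A quick self-contained illustration:

import unittest

class WorkdirDemo(unittest.TestCase):
    def test_id_is_unique(self):
        # id() yields e.g. '__main__.WorkdirDemo.test_id_is_unique',
        # a name no other test shares, hence a safe directory name.
        self.assertIn('test_id_is_unique', self.id())

if __name__ == '__main__':
    unittest.main()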

requirements.txt

Lines changed: 2 additions & 1 deletion
@@ -7,4 +7,5 @@ nose
 requests
 scikit-learn>=0.18
 nbformat
-python-dateutil
+python-dateutil
+oslo.concurrency
