Skip to content

Commit 40510b9

Browse files
authored
Merge pull request #336 from amueller/remove_dataset_pickles
Remove dataset pickles
2 parents 6c53531 + af1de06 commit 40510b9

3 files changed

Lines changed: 119 additions & 148 deletions

File tree

openml/datasets/dataset.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,7 @@ def get_data(self, target=None, target_dtype=int, include_row_id=False,
205205

206206
path = self.data_pickle_file
207207
if not os.path.exists(path):
208-
raise ValueError("Cannot find a ndarray file for dataset %s at "
208+
raise ValueError("Cannot find a pickle file for dataset %s at "
209209
"location %s " % (self.name, path))
210210
else:
211211
with open(path, "rb") as fh:
@@ -425,4 +425,4 @@ def _data_features_supported(self):
425425
if self.features[idx].data_type not in ['numeric', 'nominal']:
426426
return False
427427
return True
428-
return True
428+
return True
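tests/test_datasets/test_dataset.py [file path missing from the extracted page; inferred from the test class name and the sibling test module — confirm]

Lines changed: 101 additions & 136 deletions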
Original file line numberDiff line numberDiff line change
@@ -1,57 +1,21 @@
1-
import inspect
2-
import unittest
3-
import os
4-
51
import numpy as np
2+
from scipy import sparse
63
import six
74

8-
from openml import OpenMLDataset
5+
from openml.testing import TestBase
6+
import openml
97

108

11-
class OpenMLDatasetTest(unittest.TestCase):
12-
# Splitting not helpful, these test's don't rely on the server and take less
13-
# than 5 seconds + rebuilding the test would potentially be costly
9+
class OpenMLDatasetTest(TestBase):
10+
_multiprocess_can_split_ = True
1411

1512
def setUp(self):
16-
# Load dataset id 1
17-
__file__ = inspect.getfile(OpenMLDatasetTest)
18-
self.directory = os.path.dirname(__file__)
19-
self.arff_filename = os.path.join(self.directory, "..", "files",
20-
"datasets", "2", "dataset.arff")
21-
self.pickle_filename = os.path.join(self.directory, "..", "files",
22-
"datasets", "2", "dataset.pkl")
23-
self.dataset = OpenMLDataset(
24-
1, "anneal", 2, "Lorem ipsum.", "arff", None, None, None,
25-
"2014-04-06 23:19:24", None, "Public",
26-
"http://openml.liacs.nl/files/download/2/dataset_2_anneal.ORIG.arff",
27-
"class", None, None, None, None, None, None, None, None, None,
28-
"939966a711925e333bf4aaadeaa71135", data_file=self.arff_filename)
29-
30-
self.sparse_arff_filename = os.path.join(
31-
self.directory, "..", "files", "datasets", "-1", "dataset.arff")
32-
self.sparse_pickle_filename = os.path.join(
33-
self.directory, "..", "files", "datasets", "-1", "dataset.pkl")
34-
self.sparse_dataset = OpenMLDataset(
35-
-1, "dexter", -1, "Lorem ipsum.", "arff", None, None, None, None,
36-
None, "Public",
37-
"http://www.cs.ubc.ca/labs/beta/Projects/autoweka/datasets/dexter.zip",
38-
"class", None, None, None, None, None, None, None, None, None,
39-
None, data_file=self.sparse_arff_filename)
40-
41-
def tearDown(self):
42-
for file_ in [self.pickle_filename, self.sparse_pickle_filename]:
43-
os.remove(file_)
44-
45-
##########################################################################
46-
# Pandas
47-
48-
@unittest.skip("Does not work right now")
49-
def test_get_arff(self):
50-
rval = self.dataset.get_arff()
51-
self.assertIsInstance(rval, tuple)
52-
self.assertIsInstance(rval[0], np.ndarray)
53-
self.assertTrue(hasattr(rval[1], '__dict__'))
54-
self.assertEqual(rval[0].shape, (898, ))
13+
super(OpenMLDatasetTest, self).setUp()
14+
openml.config.server = self.production_server
15+
16+
# Load dataset id 2 - dataset 2 is interesting because it contains
17+
# missing values, categorical features etc.
18+
self.dataset = openml.datasets.get_dataset(2)
5519

5620
def test_get_data(self):
5721
# Basic usage
@@ -69,22 +33,18 @@ def test_get_data(self):
6933
self.assertTrue(all([isinstance(att, six.string_types)
7034
for att in attribute_names]))
7135

72-
def test_get_sparse_dataset(self):
73-
rval = self.sparse_dataset.get_data()
74-
self.assertIsInstance(rval, np.ndarray)
36+
def test_get_data_with_rowid(self):
37+
self.dataset.row_id_attribute = "condition"
38+
rval, categorical = self.dataset.get_data(
39+
include_row_id=True, return_categorical_indicator=True)
7540
self.assertEqual(rval.dtype, np.float32)
76-
self.assertEqual((2, 20001), rval.shape)
77-
rval, categorical = self.sparse_dataset.get_data(
78-
return_categorical_indicator=True)
79-
self.assertIsInstance(rval, np.ndarray)
80-
self.assertEqual(len(categorical), 20001)
81-
self.assertTrue(all([isinstance(cat, bool) for cat in categorical]))
82-
rval, attribute_names = self.sparse_dataset.get_data(
83-
return_attribute_names=True)
84-
self.assertIsInstance(rval, np.ndarray)
85-
self.assertEqual(len(attribute_names), 20001)
86-
self.assertTrue(all([isinstance(att, six.string_types)
87-
for att in attribute_names]))
41+
self.assertEqual(rval.shape, (898, 39))
42+
self.assertEqual(len(categorical), 39)
43+
rval, categorical = self.dataset.get_data(
44+
include_row_id=False, return_categorical_indicator=True)
45+
self.assertEqual(rval.dtype, np.float32)
46+
self.assertEqual(rval.shape, (898, 38))
47+
self.assertEqual(len(categorical), 38)
8848

8949
def test_get_data_with_target(self):
9050
X, y = self.dataset.get_data(target="class")
@@ -98,122 +58,127 @@ def test_get_data_with_target(self):
9858
self.assertNotIn("class", attribute_names)
9959
self.assertEqual(y.shape, (898, ))
10060

61+
def test_get_data_rowid_and_ignore_and_target(self):
62+
self.dataset.ignore_attributes = ["condition"]
63+
self.dataset.row_id_attribute = ["hardness"]
64+
X, y = self.dataset.get_data(target="class", include_row_id=False,
65+
include_ignore_attributes=False)
66+
self.assertEqual(X.dtype, np.float32)
67+
self.assertIn(y.dtype, [np.int32, np.int64])
68+
self.assertEqual(X.shape, (898, 36))
69+
X, y, categorical = self.dataset.get_data(
70+
target="class", return_categorical_indicator=True)
71+
self.assertEqual(len(categorical), 36)
72+
self.assertListEqual(categorical, [True] * 3 + [False] + [True] * 2 + [
73+
False] + [True] * 23 + [False] * 3 + [True] * 3)
74+
self.assertEqual(y.shape, (898, ))
75+
76+
def test_get_data_with_ignore_attributes(self):
77+
self.dataset.ignore_attributes = ["condition"]
78+
rval = self.dataset.get_data(include_ignore_attributes=True)
79+
self.assertEqual(rval.dtype, np.float32)
80+
self.assertEqual(rval.shape, (898, 39))
81+
rval, categorical = self.dataset.get_data(
82+
include_ignore_attributes=True, return_categorical_indicator=True)
83+
self.assertEqual(len(categorical), 39)
84+
rval = self.dataset.get_data(include_ignore_attributes=False)
85+
self.assertEqual(rval.dtype, np.float32)
86+
self.assertEqual(rval.shape, (898, 38))
87+
rval, categorical = self.dataset.get_data(
88+
include_ignore_attributes=False, return_categorical_indicator=True)
89+
self.assertEqual(len(categorical), 38)
90+
# TODO test multiple ignore attributes!
91+
92+
93+
class OpenMLDatasetTestSparse(TestBase):
94+
_multiprocess_can_split_ = True
95+
96+
def setUp(self):
97+
super(OpenMLDatasetTestSparse, self).setUp()
98+
openml.config.server = self.production_server
99+
100+
self.sparse_dataset = openml.datasets.get_dataset(4136)
101+
101102
def test_get_sparse_dataset_with_target(self):
102103
X, y = self.sparse_dataset.get_data(target="class")
103-
self.assertIsInstance(X, np.ndarray)
104+
self.assertTrue(sparse.issparse(X))
104105
self.assertEqual(X.dtype, np.float32)
105106
self.assertIsInstance(y, np.ndarray)
106107
self.assertIn(y.dtype, [np.int32, np.int64])
107-
self.assertEqual(X.shape, (2, 20000))
108+
self.assertEqual(X.shape, (600, 20000))
108109
X, y, attribute_names = self.sparse_dataset.get_data(
109110
target="class", return_attribute_names=True)
110-
self.assertIsInstance(X, np.ndarray)
111+
self.assertTrue(sparse.issparse(X))
111112
self.assertEqual(len(attribute_names), 20000)
112113
self.assertNotIn("class", attribute_names)
113-
self.assertEqual(y.shape, (2, ))
114+
self.assertEqual(y.shape, (600, ))
114115

115-
def test_get_data_with_rowid(self):
116-
self.dataset.row_id_attribute = "condition"
117-
rval, categorical = self.dataset.get_data(
118-
include_row_id=True, return_categorical_indicator=True)
119-
self.assertEqual(rval.dtype, np.float32)
120-
self.assertEqual(rval.shape, (898, 39))
121-
self.assertEqual(len(categorical), 39)
122-
rval, categorical = self.dataset.get_data(
123-
include_row_id=False, return_categorical_indicator=True)
116+
def test_get_sparse_dataset(self):
117+
rval = self.sparse_dataset.get_data()
118+
self.assertTrue(sparse.issparse(rval))
124119
self.assertEqual(rval.dtype, np.float32)
125-
self.assertEqual(rval.shape, (898, 38))
126-
self.assertEqual(len(categorical), 38)
127-
128-
# TODO this is not yet supported!
129-
#rowid = ["condition", "formability"]
130-
#self.dataset.row_id_attribute = rowid
131-
#rval = self.dataset.get_pandas(include_row_id=False)
120+
self.assertEqual((600, 20001), rval.shape)
121+
rval, categorical = self.sparse_dataset.get_data(
122+
return_categorical_indicator=True)
123+
self.assertTrue(sparse.issparse(rval))
124+
self.assertEqual(len(categorical), 20001)
125+
self.assertTrue(all([isinstance(cat, bool) for cat in categorical]))
126+
rval, attribute_names = self.sparse_dataset.get_data(
127+
return_attribute_names=True)
128+
self.assertTrue(sparse.issparse(rval))
129+
self.assertEqual(len(attribute_names), 20001)
130+
self.assertTrue(all([isinstance(att, six.string_types)
131+
for att in attribute_names]))
132132

133133
def test_get_sparse_dataset_with_rowid(self):
134-
self.sparse_dataset.row_id_attribute = ["a_0"]
134+
self.sparse_dataset.row_id_attribute = ["V256"]
135135
rval, categorical = self.sparse_dataset.get_data(
136136
include_row_id=True, return_categorical_indicator=True)
137-
self.assertIsInstance(rval, np.ndarray)
137+
self.assertTrue(sparse.issparse(rval))
138138
self.assertEqual(rval.dtype, np.float32)
139-
self.assertEqual(rval.shape, (2, 20001))
139+
self.assertEqual(rval.shape, (600, 20001))
140140
self.assertEqual(len(categorical), 20001)
141141
rval, categorical = self.sparse_dataset.get_data(
142142
include_row_id=False, return_categorical_indicator=True)
143-
self.assertIsInstance(rval, np.ndarray)
143+
self.assertTrue(sparse.issparse(rval))
144144
self.assertEqual(rval.dtype, np.float32)
145-
self.assertEqual(rval.shape, (2, 20000))
145+
self.assertEqual(rval.shape, (600, 20000))
146146
self.assertEqual(len(categorical), 20000)
147147

148-
# TODO this is not yet supported!
149-
# rowid = ["condition", "formability"]
150-
#self.dataset.row_id_attribute = rowid
151-
#rval = self.dataset.get_pandas(include_row_id=False)
152-
153-
def test_get_data_with_ignore_attributes(self):
154-
self.dataset.ignore_attributes = ["condition"]
155-
rval = self.dataset.get_data(include_ignore_attributes=True)
156-
self.assertEqual(rval.dtype, np.float32)
157-
self.assertEqual(rval.shape, (898, 39))
158-
rval, categorical = self.dataset.get_data(
159-
include_ignore_attributes=True, return_categorical_indicator=True)
160-
self.assertEqual(len(categorical), 39)
161-
rval = self.dataset.get_data(include_ignore_attributes=False)
162-
self.assertEqual(rval.dtype, np.float32)
163-
self.assertEqual(rval.shape, (898, 38))
164-
rval, categorical = self.dataset.get_data(
165-
include_ignore_attributes=False, return_categorical_indicator=True)
166-
self.assertEqual(len(categorical), 38)
167-
# TODO test multiple ignore attributes!
168-
169148
def test_get_sparse_dataset_with_ignore_attributes(self):
170-
self.sparse_dataset.ignore_attributes = ["a_0"]
149+
self.sparse_dataset.ignore_attributes = ["V256"]
171150
rval = self.sparse_dataset.get_data(include_ignore_attributes=True)
172-
self.assertIsInstance(rval, np.ndarray)
151+
self.assertTrue(sparse.issparse(rval))
173152
self.assertEqual(rval.dtype, np.float32)
174-
self.assertEqual(rval.shape, (2, 20001))
153+
self.assertEqual(rval.shape, (600, 20001))
175154
rval, categorical = self.sparse_dataset.get_data(
176155
include_ignore_attributes=True, return_categorical_indicator=True)
177-
self.assertIsInstance(rval, np.ndarray)
156+
self.assertTrue(sparse.issparse(rval))
178157
self.assertEqual(len(categorical), 20001)
179158
rval = self.sparse_dataset.get_data(include_ignore_attributes=False)
180-
self.assertIsInstance(rval, np.ndarray)
159+
self.assertTrue(sparse.issparse(rval))
181160
self.assertEqual(rval.dtype, np.float32)
182-
self.assertEqual(rval.shape, (2, 20000))
161+
self.assertEqual(rval.shape, (600, 20000))
183162
rval, categorical = self.sparse_dataset.get_data(
184163
include_ignore_attributes=False, return_categorical_indicator=True)
185-
self.assertIsInstance(rval, np.ndarray)
164+
self.assertTrue(sparse.issparse(rval))
186165
self.assertEqual(len(categorical), 20000)
187166
# TODO test multiple ignore attributes!
188167

189-
def test_get_data_rowid_and_ignore_and_target(self):
190-
self.dataset.ignore_attributes = ["condition"]
191-
self.dataset.row_id_attribute = ["hardness"]
192-
X, y = self.dataset.get_data(target="class", include_row_id=False,
193-
include_ignore_attributes=False)
194-
self.assertEqual(X.dtype, np.float32)
195-
self.assertIn(y.dtype, [np.int32, np.int64])
196-
self.assertEqual(X.shape, (898, 36))
197-
X, y, categorical = self.dataset.get_data(
198-
target="class", return_categorical_indicator=True)
199-
self.assertEqual(len(categorical), 36)
200-
self.assertListEqual(categorical, [True] * 3 + [False] + [True] * 2 + [
201-
False] + [True] * 23 + [False] * 3 + [True] * 3)
202-
self.assertEqual(y.shape, (898, ))
203-
204168
def test_get_sparse_dataset_rowid_and_ignore_and_target(self):
205-
self.sparse_dataset.ignore_attributes = ["a_0"]
206-
self.sparse_dataset.row_id_attribute = ["a_1"]
169+
# TODO: re-add row_id and ignore attributes
170+
self.sparse_dataset.ignore_attributes = ["V256"]
171+
self.sparse_dataset.row_id_attribute = ["V512"]
207172
X, y = self.sparse_dataset.get_data(
208173
target="class", include_row_id=False,
209174
include_ignore_attributes=False)
210-
self.assertIsInstance(X, np.ndarray)
175+
self.assertTrue(sparse.issparse(X))
211176
self.assertEqual(X.dtype, np.float32)
212177
self.assertIn(y.dtype, [np.int32, np.int64])
213-
self.assertEqual(X.shape, (2, 19998))
178+
self.assertEqual(X.shape, (600, 19998))
214179
X, y, categorical = self.sparse_dataset.get_data(
215180
target="class", return_categorical_indicator=True)
216-
self.assertIsInstance(X, np.ndarray)
181+
self.assertTrue(sparse.issparse(X))
217182
self.assertEqual(len(categorical), 19998)
218183
self.assertListEqual(categorical, [False] * 19998)
219-
self.assertEqual(y.shape, (2, ))
184+
self.assertEqual(y.shape, (600, ))

tests/test_datasets/test_dataset_functions.py

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
import unittest
22
import os
3-
import shutil
3+
import os
44
import sys
55

66
if sys.version_info[0] >= 3:
77
from unittest import mock
88
else:
99
import mock
1010

11+
from oslo_concurrency import lockutils
1112
import scipy.sparse
1213

1314
import openml
@@ -22,27 +23,32 @@
2223
_get_dataset_description,
2324
_get_dataset_arff,
2425
_get_dataset_features,
25-
_get_dataset_qualities, get_dataset)
26+
_get_dataset_qualities)
2627

2728

2829
class TestOpenMLDataset(TestBase):
2930
_multiprocess_can_split_ = True
3031

3132
def setUp(self):
3233
super(TestOpenMLDataset, self).setUp()
33-
self._remove_did1()
3434

3535
def tearDown(self):
36+
self._remove_pickle_files()
3637
super(TestOpenMLDataset, self).tearDown()
37-
self._remove_did1()
3838

39-
def _remove_did1(self):
39+
def _remove_pickle_files(self):
4040
cache_dir = self.static_cache_dir
41-
did_1_dir = os.path.join(cache_dir, 'datasets', '1')
42-
try:
43-
shutil.rmtree(did_1_dir)
44-
except:
45-
pass
41+
for did in ['-1', '2']:
42+
with lockutils.external_lock(
43+
name='datasets.functions.get_dataset:%s' % did,
44+
lock_path=os.path.join(openml.config.get_cache_directory(), 'locks'),
45+
):
46+
pickle_path = os.path.join(cache_dir, 'datasets', did,
47+
'dataset.pkl')
48+
try:
49+
os.remove(pickle_path)
50+
except:
51+
pass
4652

4753
def test__list_cached_datasets(self):
4854
openml.config.set_cache_directory(self.static_cache_dir)

0 commit comments

Comments
 (0)