Skip to content

Commit 91e88d5

Browse files
committed
Remove pandas as dataset container, use numpy instead
1 parent a98ad8a commit 91e88d5

4 files changed

Lines changed: 95 additions & 9082 deletions

File tree

openml/entities/dataset.py

Lines changed: 45 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
logger = logging.getLogger(__name__)
1717

1818
import numpy as np
19-
import pandas as pd
2019

2120
from ..util import is_string
2221

@@ -54,10 +53,10 @@ def __init__(self, id, name, version, description, format, creator,
5453
self.md5_cheksum = md5_checksum
5554
self.data_file = data_file
5655

57-
self.pandas_file = data_file.replace('.arff', '.pd')
56+
self.data_pickle_file = data_file.replace('.arff', '.pkl')
5857

59-
if os.path.exists(self.pandas_file):
60-
logger.debug("Pandas file already exists.")
58+
if os.path.exists(self.data_pickle_file):
59+
logger.debug("Data pickle file already exists.")
6160
else:
6261
try:
6362
data = self.get_arff()
@@ -69,12 +68,13 @@ def __init__(self, id, name, version, description, format, creator,
6968
categorical = [False if type(type_) != list else True
7069
for name, type_ in data['attributes']]
7170
attribute_names = [name for name, type_ in data['attributes']]
72-
X = pd.DataFrame(data=data['data'], columns=attribute_names)
71+
# X = pd.DataFrame(data=data['data'], columns=attribute_names)
72+
X = np.array(data['data'], dtype=np.float32)
7373

74-
with open(self.pandas_file, "w") as fh:
75-
pickle.dump((X, categorical), fh, -1)
74+
with open(self.data_pickle_file, "w") as fh:
75+
pickle.dump((X, categorical, attribute_names), fh, -1)
7676
logger.debug("Saved dataset %d: %s to file %s" %
77-
(self.id, self.name, self.pandas_file))
77+
(self.id, self.name, self.data_pickle_file))
7878

7979
def __eq__(self, other):
8080
if type(other) != OpenMLDataset:
@@ -113,16 +113,19 @@ def decode_arff(fh):
113113

114114
############################################################################
115115
# pandas related stuff...
116-
def get_pandas(self, target=None, include_row_id=False,
117-
include_ignore_attributes=False):
116+
def get_dataset(self, target=None, include_row_id=False,
117+
include_ignore_attributes=False,
118+
return_categorical_indicator=False,
119+
return_attribute_names=False):
120+
rval = []
118121

119-
path = self.pandas_file
122+
path = self.data_pickle_file
120123
if not os.path.exists(path):
121124
raise ValueError("Cannot find a ndarray file for dataset %s at"
122125
"location %s " % (self.name, path))
123126
else:
124127
with open(path) as fh:
125-
data, categorical = pickle.load(fh)
128+
data, categorical, attribute_names = pickle.load(fh)
126129

127130
to_exclude = []
128131
if include_row_id == False:
@@ -143,33 +146,48 @@ def get_pandas(self, target=None, include_row_id=False,
143146
else:
144147
to_exclude.extend(self.ignore_attributes)
145148

146-
logger.info("Going to remove the following row_id_attributes:"
147-
" %s" % self.row_id_attribute)
148-
keep = [True if column not in to_exclude else False
149-
for column in data.columns]
150-
data = data.loc[:,keep]
151-
categorical = [cat for cat, k in zip(categorical, keep) if k]
149+
if len(to_exclude) > 0:
150+
logger.info("Going to remove the following row_id_attributes:"
151+
" %s" % self.row_id_attribute)
152+
keep = np.array([True if column not in to_exclude else False
153+
for column in attribute_names])
154+
data = data[:,keep]
155+
categorical = [cat for cat, k in zip(categorical, keep) if k]
156+
attribute_names = [att for att, k in
157+
zip(attribute_names, keep) if k]
152158

153159
if target is None:
154-
return data, categorical
160+
rval.append(data)
155161
else:
156162
if is_string(target):
157163
target = [target]
158164
targets = np.array([True if column in target else False
159-
for column in data.columns])
165+
for column in attribute_names])
160166

161167
try:
162-
x = data.loc[:,~targets]
163-
y = data.loc[:,targets]
168+
x = data[:,~targets]
169+
y = data[:,targets].astype(np.int32)
164170

165-
# Convert to series if possible
166171
if len(y.shape) == 2 and y.shape[1] == 1:
167-
y = y.iloc[:,0]
172+
y = y[:,0]
168173

169-
categorical = [cat for cat, t in zip(categorical, targets)
170-
if not t]
174+
categorical = [cat for cat, t in
175+
zip(categorical, targets) if not t]
176+
attribute_names = [att for att, k in
177+
zip(attribute_names, targets) if not k]
171178
except KeyError as e:
172179
import sys
173180
sys.stdout.flush()
174181
raise e
175-
return x, y, categorical
182+
rval.append(x)
183+
rval.append(y)
184+
185+
if return_categorical_indicator:
186+
rval.append(categorical)
187+
if return_attribute_names:
188+
rval.append(attribute_names)
189+
190+
if len(rval) == 1:
191+
return rval[0]
192+
else:
193+
return rval

setup.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
install_requires=["liac-arff>=2.0.2",
1515
"numpy>1.6.2",
1616
"scipy>0.9",
17-
"pandas>0.13.1",
1817
"xmltodict",
1918
"nose"],
2019
test_suite="nose.collector",

tests/entities/test_dataset.py

Lines changed: 50 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import pandas as pd
77

88
from openml.entities.dataset import OpenMLDataset
9+
from openml.util import is_string
910

1011
class OpenMLDatasetTest(unittest.TestCase):
1112
def setUp(self):
@@ -14,8 +15,8 @@ def setUp(self):
1415
self.directory = os.path.dirname(__file__)
1516
self.arff_filename = os.path.join(self.directory, "..",
1617
"files", "datasets", "2", "dataset.arff")
17-
self.pandas_filename = os.path.join(self.directory, "..",
18-
"files", "datasets", "2", "dataset.pd")
18+
self.pickle_filename = os.path.join(self.directory, "..",
19+
"files", "datasets", "2", "dataset.pkl")
1920
self.dataset = OpenMLDataset(1, "anneal", 1, "Lorem ipsum.",
2021
"arff", None, None, None,
2122
"2014-04-06 23:19:24", None, "Public",
@@ -26,7 +27,7 @@ def setUp(self):
2627
data_file=self.arff_filename)
2728

2829
def tearDown(self):
29-
for file_ in [self.pandas_filename]:
30+
for file_ in [self.pickle_filename]:
3031
os.remove(file_)
3132

3233
############################################################################
@@ -40,80 +41,83 @@ def test_get_arff(self):
4041
self.assertTrue(hasattr(rval[1], '__dict__'))
4142
self.assertEqual(rval[0].shape, (898, ))
4243

43-
def test_get_pandas(self):
44+
def test_get_dataset(self):
4445
# Basic usage
45-
rval, categorical = self.dataset.get_pandas()
46-
self.assertIsInstance(rval, pd.DataFrame)
47-
self.assertEqual(rval.values.dtype, np.float64)
46+
rval = self.dataset.get_dataset()
47+
self.assertIsInstance(rval, np.ndarray)
48+
self.assertEqual(rval.dtype, np.float32)
4849
self.assertEqual((898, 39), rval.shape)
50+
rval, categorical = self.dataset.get_dataset(
51+
return_categorical_indicator=True)
4952
self.assertEqual(len(categorical), 39)
53+
self.assertTrue(all([isinstance(cat, bool) for cat in categorical]))
54+
rval, attribute_names = self.dataset.get_dataset(
55+
return_attribute_names=True)
56+
self.assertEqual(len(attribute_names), 39)
57+
self.assertTrue(all([is_string(att) for att in attribute_names]))
5058

51-
def test_get_pandas_with_target(self):
52-
X, y, categorical = self.dataset.get_pandas(target="class")
53-
self.assertEqual(X.values.dtype, np.float64)
54-
self.assertEqual(y.values.dtype, np.int64)
59+
def test_get_dataset_with_target(self):
60+
X, y = self.dataset.get_dataset(target="class")
61+
self.assertEqual(X.dtype, np.float32)
62+
self.assertEqual(y.dtype, np.int32)
5563
self.assertEqual(X.shape, (898, 38))
56-
self.assertEqual(len(categorical), 38)
57-
self.assertNotIn("class", X)
64+
X, y, attribute_names = self.dataset.get_dataset(
65+
target="class", return_attribute_names=True)
66+
self.assertEqual(len(attribute_names), 38)
67+
self.assertNotIn("class", attribute_names)
5868
self.assertEqual(y.shape, (898, ))
59-
self.assertEqual(y.name, "class")
6069

61-
def test_get_pandas_with_rowid(self):
70+
def test_get_dataset_with_rowid(self):
6271
self.dataset.row_id_attribute = "condition"
63-
rval, categorical = self.dataset.get_pandas(include_row_id=True)
64-
self.assertEqual(rval.values.dtype, np.float64)
72+
rval, categorical = self.dataset.get_dataset(
73+
include_row_id=True, return_categorical_indicator=True)
74+
self.assertEqual(rval.dtype, np.float32)
6575
self.assertEqual(rval.shape, (898, 39))
6676
self.assertEqual(len(categorical), 39)
67-
self.assertIn("condition", rval)
68-
rval, categorical = self.dataset.get_pandas(include_row_id=False)
69-
self.assertEqual(rval.values.dtype, np.float64)
77+
rval, categorical = self.dataset.get_dataset(
78+
include_row_id=False, return_categorical_indicator=True)
79+
self.assertEqual(rval.dtype, np.float32)
7080
self.assertEqual(rval.shape, (898, 38))
7181
self.assertEqual(len(categorical), 38)
72-
self.assertNotIn("condition", rval)
7382

7483
# TODO this is not yet supported!
7584
#rowid = ["condition", "formability"]
7685
#self.dataset.row_id_attribute = rowid
7786
#rval = self.dataset.get_pandas(include_row_id=False)
7887

79-
def test_get_pandas_with_ignore_attributes(self):
88+
def test_get_dataset_with_ignore_attributes(self):
8089
self.dataset.ignore_attributes = "condition"
81-
rval, categorical = self.dataset.get_pandas(include_ignore_attributes=True)
82-
self.assertEqual(rval.values.dtype, np.float64)
90+
rval = self.dataset.get_dataset(include_ignore_attributes=True)
91+
self.assertEqual(rval.dtype, np.float32)
8392
self.assertEqual(rval.shape, (898, 39))
93+
rval, categorical = self.dataset.get_dataset(
94+
include_ignore_attributes=True, return_categorical_indicator=True)
8495
self.assertEqual(len(categorical), 39)
85-
self.assertIn("condition", rval)
86-
rval, categorical = self.dataset.get_pandas(include_ignore_attributes=False)
87-
self.assertEqual(rval.values.dtype, np.float64)
96+
rval = self.dataset.get_dataset(include_ignore_attributes=False)
97+
self.assertEqual(rval.dtype, np.float32)
8898
self.assertEqual(rval.shape, (898, 38))
99+
rval, categorical = self.dataset.get_dataset(
100+
include_ignore_attributes=False, return_categorical_indicator=True)
89101
self.assertEqual(len(categorical), 38)
90-
self.assertNotIn("condition", rval)
91102
# TODO test multiple ignore attributes!
92103

93-
def test_get_pandas_rowid_and_ignore(self):
104+
def test_get_dataset_rowid_and_ignore(self):
94105
self.dataset.ignore_attributes = "condition"
95106
self.dataset.row_id_attribute = "condition"
96-
rval, categorical = self.dataset.get_pandas(include_ignore_attributes=False,
97-
include_row_id=False)
98-
self.assertEqual(rval.values.dtype, np.float64)
99-
self.assertEqual(rval.shape, (898, 38))
100-
self.assertEqual(len(categorical), 38)
101-
self.dataset.ignore_attributes = "hardness"
102-
rval, categorical = self.dataset.get_pandas(include_ignore_attributes=False,
103-
include_row_id=False)
104-
self.assertEqual(rval.values.dtype, np.float64)
105-
self.assertEqual(rval.shape, (898, 37))
106-
self.assertEqual(len(categorical), 37)
107+
rval = self.dataset.get_dataset(include_ignore_attributes=False,
108+
include_row_id=False)
109+
self.assertEqual(rval.dtype, np.float32)
107110

108-
def test_get_pandas_rowid_and_ignore_and_target(self):
111+
def test_get_dataset_rowid_and_ignore_and_target(self):
109112
self.dataset.ignore_attributes = "condition"
110113
self.dataset.row_id_attribute = "hardness"
111-
X, y, categorical = self.dataset.get_pandas(target="class",
112-
include_row_id=False,
113-
include_ignore_attributes=False)
114-
self.assertEqual(X.values.dtype, np.float64)
115-
self.assertEqual(y.values.dtype, np.int64)
114+
X, y = self.dataset.get_dataset(target="class", include_row_id=False,
115+
include_ignore_attributes=False)
116+
self.assertEqual(X.dtype, np.float32)
117+
self.assertEqual(y.dtype, np.int32)
116118
self.assertEqual(X.shape, (898, 36))
119+
X, y , categorical = self.dataset.get_dataset(
120+
target="class", return_categorical_indicator=True)
117121
self.assertEqual(len(categorical), 36)
118122
self.assertListEqual(categorical, [True]*3 + [False] + [True]*2 + [
119123
False] + [True]*23 + [False]*3 + [True]*3)

0 commit comments

Comments (0)