1- import inspect
2- import unittest
3- import os
4-
51import numpy as np
2+ from scipy import sparse
63import six
74
8- from openml import OpenMLDataset
5+ from openml .testing import TestBase
6+ import openml
97
108
11- class OpenMLDatasetTest (unittest .TestCase ):
12- # Splitting not helpful, these test's don't rely on the server and take less
13- # than 5 seconds + rebuilding the test would potentially be costly
9+ class OpenMLDatasetTest (TestBase ):
10+ _multiprocess_can_split_ = True
1411
1512 def setUp (self ):
16- # Load dataset id 1
17- __file__ = inspect .getfile (OpenMLDatasetTest )
18- self .directory = os .path .dirname (__file__ )
19- self .arff_filename = os .path .join (self .directory , ".." , "files" ,
20- "datasets" , "2" , "dataset.arff" )
21- self .pickle_filename = os .path .join (self .directory , ".." , "files" ,
22- "datasets" , "2" , "dataset.pkl" )
23- self .dataset = OpenMLDataset (
24- 1 , "anneal" , 2 , "Lorem ipsum." , "arff" , None , None , None ,
25- "2014-04-06 23:19:24" , None , "Public" ,
26- "http://openml.liacs.nl/files/download/2/dataset_2_anneal.ORIG.arff" ,
27- "class" , None , None , None , None , None , None , None , None , None ,
28- "939966a711925e333bf4aaadeaa71135" , data_file = self .arff_filename )
29-
30- self .sparse_arff_filename = os .path .join (
31- self .directory , ".." , "files" , "datasets" , "-1" , "dataset.arff" )
32- self .sparse_pickle_filename = os .path .join (
33- self .directory , ".." , "files" , "datasets" , "-1" , "dataset.pkl" )
34- self .sparse_dataset = OpenMLDataset (
35- - 1 , "dexter" , - 1 , "Lorem ipsum." , "arff" , None , None , None , None ,
36- None , "Public" ,
37- "http://www.cs.ubc.ca/labs/beta/Projects/autoweka/datasets/dexter.zip" ,
38- "class" , None , None , None , None , None , None , None , None , None ,
39- None , data_file = self .sparse_arff_filename )
40-
41- def tearDown (self ):
42- for file_ in [self .pickle_filename , self .sparse_pickle_filename ]:
43- os .remove (file_ )
44-
45- ##########################################################################
46- # Pandas
47-
48- @unittest .skip ("Does not work right now" )
49- def test_get_arff (self ):
50- rval = self .dataset .get_arff ()
51- self .assertIsInstance (rval , tuple )
52- self .assertIsInstance (rval [0 ], np .ndarray )
53- self .assertTrue (hasattr (rval [1 ], '__dict__' ))
54- self .assertEqual (rval [0 ].shape , (898 , ))
13+ super (OpenMLDatasetTest , self ).setUp ()
14+ openml .config .server = self .production_server
15+
16+ # Load dataset id 2 - dataset 2 is interesting because it contains
17+ # missing values, categorical features etc.
18+ self .dataset = openml .datasets .get_dataset (2 )
5519
5620 def test_get_data (self ):
5721 # Basic usage
@@ -69,22 +33,18 @@ def test_get_data(self):
6933 self .assertTrue (all ([isinstance (att , six .string_types )
7034 for att in attribute_names ]))
7135
72- def test_get_sparse_dataset (self ):
73- rval = self .sparse_dataset .get_data ()
74- self .assertIsInstance (rval , np .ndarray )
36+ def test_get_data_with_rowid (self ):
37+ self .dataset .row_id_attribute = "condition"
38+ rval , categorical = self .dataset .get_data (
39+ include_row_id = True , return_categorical_indicator = True )
7540 self .assertEqual (rval .dtype , np .float32 )
76- self .assertEqual ((2 , 20001 ), rval .shape )
77- rval , categorical = self .sparse_dataset .get_data (
78- return_categorical_indicator = True )
79- self .assertIsInstance (rval , np .ndarray )
80- self .assertEqual (len (categorical ), 20001 )
81- self .assertTrue (all ([isinstance (cat , bool ) for cat in categorical ]))
82- rval , attribute_names = self .sparse_dataset .get_data (
83- return_attribute_names = True )
84- self .assertIsInstance (rval , np .ndarray )
85- self .assertEqual (len (attribute_names ), 20001 )
86- self .assertTrue (all ([isinstance (att , six .string_types )
87- for att in attribute_names ]))
41+ self .assertEqual (rval .shape , (898 , 39 ))
42+ self .assertEqual (len (categorical ), 39 )
43+ rval , categorical = self .dataset .get_data (
44+ include_row_id = False , return_categorical_indicator = True )
45+ self .assertEqual (rval .dtype , np .float32 )
46+ self .assertEqual (rval .shape , (898 , 38 ))
47+ self .assertEqual (len (categorical ), 38 )
8848
8949 def test_get_data_with_target (self ):
9050 X , y = self .dataset .get_data (target = "class" )
@@ -98,122 +58,127 @@ def test_get_data_with_target(self):
9858 self .assertNotIn ("class" , attribute_names )
9959 self .assertEqual (y .shape , (898 , ))
10060
61+ def test_get_data_rowid_and_ignore_and_target (self ):
62+ self .dataset .ignore_attributes = ["condition" ]
63+ self .dataset .row_id_attribute = ["hardness" ]
64+ X , y = self .dataset .get_data (target = "class" , include_row_id = False ,
65+ include_ignore_attributes = False )
66+ self .assertEqual (X .dtype , np .float32 )
67+ self .assertIn (y .dtype , [np .int32 , np .int64 ])
68+ self .assertEqual (X .shape , (898 , 36 ))
69+ X , y , categorical = self .dataset .get_data (
70+ target = "class" , return_categorical_indicator = True )
71+ self .assertEqual (len (categorical ), 36 )
72+ self .assertListEqual (categorical , [True ] * 3 + [False ] + [True ] * 2 + [
73+ False ] + [True ] * 23 + [False ] * 3 + [True ] * 3 )
74+ self .assertEqual (y .shape , (898 , ))
75+
76+ def test_get_data_with_ignore_attributes (self ):
77+ self .dataset .ignore_attributes = ["condition" ]
78+ rval = self .dataset .get_data (include_ignore_attributes = True )
79+ self .assertEqual (rval .dtype , np .float32 )
80+ self .assertEqual (rval .shape , (898 , 39 ))
81+ rval , categorical = self .dataset .get_data (
82+ include_ignore_attributes = True , return_categorical_indicator = True )
83+ self .assertEqual (len (categorical ), 39 )
84+ rval = self .dataset .get_data (include_ignore_attributes = False )
85+ self .assertEqual (rval .dtype , np .float32 )
86+ self .assertEqual (rval .shape , (898 , 38 ))
87+ rval , categorical = self .dataset .get_data (
88+ include_ignore_attributes = False , return_categorical_indicator = True )
89+ self .assertEqual (len (categorical ), 38 )
90+ # TODO test multiple ignore attributes!
91+
92+
93+ class OpenMLDatasetTestSparse (TestBase ):
94+ _multiprocess_can_split_ = True
95+
96+ def setUp (self ):
97+ super (OpenMLDatasetTestSparse , self ).setUp ()
98+ openml .config .server = self .production_server
99+
100+ self .sparse_dataset = openml .datasets .get_dataset (4136 )
101+
101102 def test_get_sparse_dataset_with_target (self ):
102103 X , y = self .sparse_dataset .get_data (target = "class" )
103- self .assertIsInstance ( X , np . ndarray )
104+ self .assertTrue ( sparse . issparse ( X ) )
104105 self .assertEqual (X .dtype , np .float32 )
105106 self .assertIsInstance (y , np .ndarray )
106107 self .assertIn (y .dtype , [np .int32 , np .int64 ])
107- self .assertEqual (X .shape , (2 , 20000 ))
108+ self .assertEqual (X .shape , (600 , 20000 ))
108109 X , y , attribute_names = self .sparse_dataset .get_data (
109110 target = "class" , return_attribute_names = True )
110- self .assertIsInstance ( X , np . ndarray )
111+ self .assertTrue ( sparse . issparse ( X ) )
111112 self .assertEqual (len (attribute_names ), 20000 )
112113 self .assertNotIn ("class" , attribute_names )
113- self .assertEqual (y .shape , (2 , ))
114+ self .assertEqual (y .shape , (600 , ))
114115
115- def test_get_data_with_rowid (self ):
116- self .dataset .row_id_attribute = "condition"
117- rval , categorical = self .dataset .get_data (
118- include_row_id = True , return_categorical_indicator = True )
119- self .assertEqual (rval .dtype , np .float32 )
120- self .assertEqual (rval .shape , (898 , 39 ))
121- self .assertEqual (len (categorical ), 39 )
122- rval , categorical = self .dataset .get_data (
123- include_row_id = False , return_categorical_indicator = True )
116+ def test_get_sparse_dataset (self ):
117+ rval = self .sparse_dataset .get_data ()
118+ self .assertTrue (sparse .issparse (rval ))
124119 self .assertEqual (rval .dtype , np .float32 )
125- self .assertEqual (rval .shape , (898 , 38 ))
126- self .assertEqual (len (categorical ), 38 )
127-
128- # TODO this is not yet supported!
129- #rowid = ["condition", "formability"]
130- #self.dataset.row_id_attribute = rowid
131- #rval = self.dataset.get_pandas(include_row_id=False)
120+ self .assertEqual ((600 , 20001 ), rval .shape )
121+ rval , categorical = self .sparse_dataset .get_data (
122+ return_categorical_indicator = True )
123+ self .assertTrue (sparse .issparse (rval ))
124+ self .assertEqual (len (categorical ), 20001 )
125+ self .assertTrue (all ([isinstance (cat , bool ) for cat in categorical ]))
126+ rval , attribute_names = self .sparse_dataset .get_data (
127+ return_attribute_names = True )
128+ self .assertTrue (sparse .issparse (rval ))
129+ self .assertEqual (len (attribute_names ), 20001 )
130+ self .assertTrue (all ([isinstance (att , six .string_types )
131+ for att in attribute_names ]))
132132
133133 def test_get_sparse_dataset_with_rowid (self ):
134- self .sparse_dataset .row_id_attribute = ["a_0 " ]
134+ self .sparse_dataset .row_id_attribute = ["V256 " ]
135135 rval , categorical = self .sparse_dataset .get_data (
136136 include_row_id = True , return_categorical_indicator = True )
137- self .assertIsInstance ( rval , np . ndarray )
137+ self .assertTrue ( sparse . issparse ( rval ) )
138138 self .assertEqual (rval .dtype , np .float32 )
139- self .assertEqual (rval .shape , (2 , 20001 ))
139+ self .assertEqual (rval .shape , (600 , 20001 ))
140140 self .assertEqual (len (categorical ), 20001 )
141141 rval , categorical = self .sparse_dataset .get_data (
142142 include_row_id = False , return_categorical_indicator = True )
143- self .assertIsInstance ( rval , np . ndarray )
143+ self .assertTrue ( sparse . issparse ( rval ) )
144144 self .assertEqual (rval .dtype , np .float32 )
145- self .assertEqual (rval .shape , (2 , 20000 ))
145+ self .assertEqual (rval .shape , (600 , 20000 ))
146146 self .assertEqual (len (categorical ), 20000 )
147147
148- # TODO this is not yet supported!
149- # rowid = ["condition", "formability"]
150- #self.dataset.row_id_attribute = rowid
151- #rval = self.dataset.get_pandas(include_row_id=False)
152-
153- def test_get_data_with_ignore_attributes (self ):
154- self .dataset .ignore_attributes = ["condition" ]
155- rval = self .dataset .get_data (include_ignore_attributes = True )
156- self .assertEqual (rval .dtype , np .float32 )
157- self .assertEqual (rval .shape , (898 , 39 ))
158- rval , categorical = self .dataset .get_data (
159- include_ignore_attributes = True , return_categorical_indicator = True )
160- self .assertEqual (len (categorical ), 39 )
161- rval = self .dataset .get_data (include_ignore_attributes = False )
162- self .assertEqual (rval .dtype , np .float32 )
163- self .assertEqual (rval .shape , (898 , 38 ))
164- rval , categorical = self .dataset .get_data (
165- include_ignore_attributes = False , return_categorical_indicator = True )
166- self .assertEqual (len (categorical ), 38 )
167- # TODO test multiple ignore attributes!
168-
169148 def test_get_sparse_dataset_with_ignore_attributes (self ):
170- self .sparse_dataset .ignore_attributes = ["a_0 " ]
149+ self .sparse_dataset .ignore_attributes = ["V256 " ]
171150 rval = self .sparse_dataset .get_data (include_ignore_attributes = True )
172- self .assertIsInstance ( rval , np . ndarray )
151+ self .assertTrue ( sparse . issparse ( rval ) )
173152 self .assertEqual (rval .dtype , np .float32 )
174- self .assertEqual (rval .shape , (2 , 20001 ))
153+ self .assertEqual (rval .shape , (600 , 20001 ))
175154 rval , categorical = self .sparse_dataset .get_data (
176155 include_ignore_attributes = True , return_categorical_indicator = True )
177- self .assertIsInstance ( rval , np . ndarray )
156+ self .assertTrue ( sparse . issparse ( rval ) )
178157 self .assertEqual (len (categorical ), 20001 )
179158 rval = self .sparse_dataset .get_data (include_ignore_attributes = False )
180- self .assertIsInstance ( rval , np . ndarray )
159+ self .assertTrue ( sparse . issparse ( rval ) )
181160 self .assertEqual (rval .dtype , np .float32 )
182- self .assertEqual (rval .shape , (2 , 20000 ))
161+ self .assertEqual (rval .shape , (600 , 20000 ))
183162 rval , categorical = self .sparse_dataset .get_data (
184163 include_ignore_attributes = False , return_categorical_indicator = True )
185- self .assertIsInstance ( rval , np . ndarray )
164+ self .assertTrue ( sparse . issparse ( rval ) )
186165 self .assertEqual (len (categorical ), 20000 )
187166 # TODO test multiple ignore attributes!
188167
189- def test_get_data_rowid_and_ignore_and_target (self ):
190- self .dataset .ignore_attributes = ["condition" ]
191- self .dataset .row_id_attribute = ["hardness" ]
192- X , y = self .dataset .get_data (target = "class" , include_row_id = False ,
193- include_ignore_attributes = False )
194- self .assertEqual (X .dtype , np .float32 )
195- self .assertIn (y .dtype , [np .int32 , np .int64 ])
196- self .assertEqual (X .shape , (898 , 36 ))
197- X , y , categorical = self .dataset .get_data (
198- target = "class" , return_categorical_indicator = True )
199- self .assertEqual (len (categorical ), 36 )
200- self .assertListEqual (categorical , [True ] * 3 + [False ] + [True ] * 2 + [
201- False ] + [True ] * 23 + [False ] * 3 + [True ] * 3 )
202- self .assertEqual (y .shape , (898 , ))
203-
204168 def test_get_sparse_dataset_rowid_and_ignore_and_target (self ):
205- self .sparse_dataset .ignore_attributes = ["a_0" ]
206- self .sparse_dataset .row_id_attribute = ["a_1" ]
169+ # TODO: re-add row_id and ignore attributes
170+ self .sparse_dataset .ignore_attributes = ["V256" ]
171+ self .sparse_dataset .row_id_attribute = ["V512" ]
207172 X , y = self .sparse_dataset .get_data (
208173 target = "class" , include_row_id = False ,
209174 include_ignore_attributes = False )
210- self .assertIsInstance ( X , np . ndarray )
175+ self .assertTrue ( sparse . issparse ( X ) )
211176 self .assertEqual (X .dtype , np .float32 )
212177 self .assertIn (y .dtype , [np .int32 , np .int64 ])
213- self .assertEqual (X .shape , (2 , 19998 ))
178+ self .assertEqual (X .shape , (600 , 19998 ))
214179 X , y , categorical = self .sparse_dataset .get_data (
215180 target = "class" , return_categorical_indicator = True )
216- self .assertIsInstance ( X , np . ndarray )
181+ self .assertTrue ( sparse . issparse ( X ) )
217182 self .assertEqual (len (categorical ), 19998 )
218183 self .assertListEqual (categorical , [False ] * 19998 )
219- self .assertEqual (y .shape , (2 , ))
184+ self .assertEqual (y .shape , (600 , ))
0 commit comments