1010import scipy .sparse
1111import xmltodict
1212
13+ from ..exceptions import PyOpenMLError
14+
1315if sys .version_info [0 ] >= 3 :
1416 import pickle
1517else :
@@ -45,7 +47,7 @@ def __init__(self, dataset_id=None, name=None, version=None, description=None,
4547 row_id_attribute = None , ignore_attribute = None ,
4648 version_label = None , citation = None , tag = None , visibility = None ,
4749 original_data_url = None , paper_url = None , update_comment = None ,
48- md5_checksum = None , data_file = None ):
50+ md5_checksum = None , data_file = None , features = None ):
4951 # Attributes received by querying the RESTful API
5052 self .dataset_id = int (dataset_id ) if dataset_id is not None else None
5153 self .name = name
@@ -71,38 +73,41 @@ def __init__(self, dataset_id=None, name=None, version=None, description=None,
7173 self .update_comment = update_comment
7274 self .md5_cheksum = md5_checksum
7375 self .data_file = data_file
76+ self .features = features
77+
7478 if data_file is not None :
75- self .data_pickle_file = data_file .replace ('.arff' , '.pkl' )
79+ if self ._data_features_supported ():
80+ self .data_pickle_file = data_file .replace ('.arff' , '.pkl' )
7681
77- if os .path .exists (self .data_pickle_file ):
78- logger .debug ("Data pickle file already exists." )
79- else :
80- try :
81- data = self ._get_arff (self .format )
82- except OSError as e :
83- logger .critical ("Please check that the data file %s is there "
84- "and can be read." , self .data_file )
85- raise e
86-
87- categorical = [False if type (type_ ) != list else True
88- for name , type_ in data ['attributes' ]]
89- attribute_names = [name for name , type_ in data ['attributes' ]]
90-
91- if isinstance (data ['data' ], tuple ):
92- X = data ['data' ]
93- X_shape = (max (X [1 ]) + 1 , max (X [2 ]) + 1 )
94- X = scipy .sparse .coo_matrix (
95- (X [0 ], (X [1 ], X [2 ])), shape = X_shape , dtype = np .float32 )
96- X = X .tocsr ()
97- elif isinstance (data ['data' ], list ):
98- X = np .array (data ['data' ], dtype = np .float32 )
82+ if os .path .exists (self .data_pickle_file ):
83+ logger .debug ("Data pickle file already exists." )
9984 else :
100- raise Exception ()
101-
102- with open (self .data_pickle_file , "wb" ) as fh :
103- pickle .dump ((X , categorical , attribute_names ), fh , - 1 )
104- logger .debug ("Saved dataset %d: %s to file %s" %
105- (self .dataset_id , self .name , self .data_pickle_file ))
85+ try :
86+ data = self ._get_arff (self .format )
87+ except OSError as e :
88+ logger .critical ("Please check that the data file %s is there "
89+ "and can be read." , self .data_file )
90+ raise e
91+
92+ categorical = [False if type (type_ ) != list else True
93+ for name , type_ in data ['attributes' ]]
94+ attribute_names = [name for name , type_ in data ['attributes' ]]
95+
96+ if isinstance (data ['data' ], tuple ):
97+ X = data ['data' ]
98+ X_shape = (max (X [1 ]) + 1 , max (X [2 ]) + 1 )
99+ X = scipy .sparse .coo_matrix (
100+ (X [0 ], (X [1 ], X [2 ])), shape = X_shape , dtype = np .float32 )
101+ X = X .tocsr ()
102+ elif isinstance (data ['data' ], list ):
103+ X = np .array (data ['data' ], dtype = np .float32 )
104+ else :
105+ raise Exception ()
106+
107+ with open (self .data_pickle_file , "wb" ) as fh :
108+ pickle .dump ((X , categorical , attribute_names ), fh , - 1 )
109+ logger .debug ("Saved dataset %d: %s to file %s" %
110+ (self .dataset_id , self .name , self .data_pickle_file ))
106111
107112 def __eq__ (self , other ):
108113 if type (other ) != OpenMLDataset :
@@ -132,6 +137,9 @@ def _get_arff(self, format):
132137 # 32 bit system...currently 120mb (just a little bit more than covtype)
133138 import struct
134139
140+ if not self ._data_features_supported ():
141+ raise PyOpenMLError ('Dataset not compatible, PyOpenML cannot handle string features' )
142+
135143 filename = self .data_file
136144 bits = (8 * struct .calcsize ("P" ))
137145 if bits != 64 and os .path .getsize (filename ) > 120000000 :
@@ -172,6 +180,9 @@ def get_data(self, target=None, target_dtype=int, include_row_id=False,
172180 """
173181 rval = []
174182
183+ if not self ._data_features_supported ():
184+ raise PyOpenMLError ('Dataset not compatible, PyOpenML cannot handle string features' )
185+
175186 path = self .data_pickle_file
176187 if not os .path .exists (path ):
177188 raise ValueError ("Cannot find a ndarray file for dataset %s at"
@@ -336,3 +347,11 @@ def _to_xml(self):
336347 xml_dataset += "<oml:{0}>{1}</oml:{0}>\n " .format (prop , content )
337348 xml_dataset += "</oml:data_set_description>"
338349 return xml_dataset
350+
def _data_features_supported(self):
    """Tell whether every declared feature has a supported data type.

    Only the ``'numeric'`` and ``'nominal'`` data types are accepted; any
    other ``oml:data_type`` value makes the dataset unsupported.

    Returns
    -------
    bool
        ``True`` when ``self.features`` is ``None`` (nothing is known about
        the features, so nothing is rejected) or when every entry under
        ``self.features['oml:feature']`` is numeric or nominal; ``False``
        as soon as one entry has any other data type.

    Raises
    ------
    KeyError
        If ``self.features`` is set but lacks the ``'oml:feature'`` key, or
        an entry lacks ``'oml:data_type'``.
    """
    if self.features is None:
        return True

    declared = self.features['oml:feature']
    # NOTE(review): the features description is presumably produced by
    # xmltodict, which yields one mapping instead of a one-element list when
    # an element occurs only once -- confirm against the caller. Iterating
    # that mapping directly would walk its keys and fail on the subscript
    # below, so a lone mapping is wrapped in a list first.
    if isinstance(declared, dict):
        declared = [declared]

    return all(feature['oml:data_type'] in ('numeric', 'nominal')
               for feature in declared)
0 commit comments