@@ -157,7 +157,7 @@ def __init__(self, name, description, format=None,
157157 feature = OpenMLDataFeature (int (xmlfeature ['oml:index' ]),
158158 xmlfeature ['oml:name' ],
159159 xmlfeature ['oml:data_type' ],
160- None ,
160+ xmlfeature . get ( 'oml:nominal_value' ) ,
161161 int (nr_missing ))
162162 if idx != feature .index :
163163 raise ValueError ('Data features not provided '
@@ -167,96 +167,104 @@ def __init__(self, name, description, format=None,
167167 self .qualities = _check_qualities (qualities )
168168
169169 if data_file is not None :
170- self .data_pickle_file = data_file .replace ('.arff' , '.pkl.py3' )
170+ self .data_pickle_file = self ._data_arff_to_pickle (data_file )
171+ else :
172+ self .data_pickle_file = None
171173
172- if os .path .exists (self .data_pickle_file ):
173- logger .debug ("Data pickle file already exists." )
174- else :
175- try :
176- data = self ._get_arff (self .format )
177- except OSError as e :
178- logger .critical ("Please check that the data file %s is "
179- "there and can be read." , self .data_file )
180- raise e
181-
182- ARFF_DTYPES_TO_PD_DTYPE = {
183- 'INTEGER' : 'integer' ,
184- 'REAL' : 'floating' ,
185- 'NUMERIC' : 'floating' ,
186- 'STRING' : 'string'
187- }
188- attribute_dtype = {}
189- attribute_names = []
190- categories_names = {}
191- categorical = []
192- for name , type_ in data ['attributes' ]:
193- # if the feature is nominal and the a sparse matrix is
194- # requested, the categories need to be numeric
195- if (isinstance (type_ , list )
196- and self .format .lower () == 'sparse_arff' ):
197- try :
198- np .array (type_ , dtype = np .float32 )
199- except ValueError :
200- raise ValueError (
201- "Categorical data needs to be numeric when "
202- "using sparse ARFF."
203- )
204- # string can only be supported with pandas DataFrame
205- elif (type_ == 'STRING'
206- and self .format .lower () == 'sparse_arff' ):
def _data_arff_to_pickle(self, data_file: str) -> str:
    """Convert the dataset's ARFF file to a pickle cache and return its path.

    The pickle stores the tuple ``(X, categorical, attribute_names)``.
    ``X`` is a pandas DataFrame for dense ``arff`` datasets and a SciPy CSR
    matrix for ``sparse_arff`` datasets. If the pickle file already exists
    it is reused and nothing is parsed.

    Parameters
    ----------
    data_file : str
        Path of the ARFF file. The pickle path is derived from it by
        replacing ``.arff`` with ``.pkl.py3``.

    Returns
    -------
    str
        Path of the pickle file.

    Raises
    ------
    OSError
        If reading the ARFF file fails.
    ValueError
        If ``self.format`` is neither ``arff`` nor ``sparse_arff``, or if a
        sparse dataset contains string attributes or non-numeric
        categories.
    """
    data_pickle_file = data_file.replace('.arff', '.pkl.py3')
    if data_pickle_file == data_file:
        # The path contains no '.arff': without this guard the ARFF file
        # itself would be mistaken for (or overwritten by) the pickle.
        data_pickle_file = data_file + '.pkl.py3'

    if os.path.exists(data_pickle_file):
        logger.debug("Data pickle file already exists.")
        return data_pickle_file

    try:
        data = self._get_arff(self.format)
    except OSError:
        logger.critical("Please check that the data file %s is "
                        "there and can be read.", data_file)
        raise

    data_format = self.format.lower()
    if data_format not in ('arff', 'sparse_arff'):
        # Fail before the pickle file is created; otherwise ``X`` would be
        # unbound below and an empty pickle file would be left behind.
        raise ValueError('Unknown data format %s' % self.format)
    is_sparse = data_format == 'sparse_arff'

    ARFF_DTYPES_TO_PD_DTYPE = {
        'INTEGER': 'integer',
        'REAL': 'floating',
        'NUMERIC': 'floating',
        'STRING': 'string',
    }
    attribute_dtype = {}
    attribute_names = []
    categories_names = {}
    categorical = []
    for name, type_ in data['attributes']:
        # A list-valued type is a nominal attribute; the list holds its
        # categories.
        is_nominal = isinstance(type_, list)

        if is_sparse and is_nominal:
            # if the feature is nominal and a sparse matrix is
            # requested, the categories need to be numeric
            try:
                np.array(type_, dtype=np.float32)
            except ValueError:
                raise ValueError(
                    "Categorical data needs to be numeric when "
                    "using sparse ARFF."
                )
        elif is_sparse and type_ == 'STRING':
            # string can only be supported with pandas DataFrame
            raise ValueError(
                "Dataset containing strings is not supported "
                "with sparse ARFF."
            )

        # infer the dtype from the ARFF header
        if is_nominal:
            categorical.append(True)
            categories_names[name] = type_
            attribute_dtype[name] = 'categorical'
            if len(type_) == 2:
                # Two categories spelling true/false (any casing) are
                # treated as a boolean attribute.
                type_norm = [cat.lower().capitalize() for cat in type_]
                if set(type_norm) == {'True', 'False'}:
                    categories_names[name] = [
                        cat == 'True' for cat in type_norm
                    ]
                    attribute_dtype[name] = 'boolean'
        else:
            categorical.append(False)
            attribute_dtype[name] = ARFF_DTYPES_TO_PD_DTYPE[type_]
        attribute_names.append(name)

    if is_sparse:
        # data['data'] is indexed as (values, row indices, column indices).
        X = data['data']
        # NOTE(review): the shape is taken from the largest index present,
        # so an all-zero trailing column would be dropped - confirm this
        # is intended.
        X_shape = (max(X[1]) + 1, max(X[2]) + 1)
        X = scipy.sparse.coo_matrix(
            (X[0], (X[1], X[2])), shape=X_shape, dtype=np.float32)
        X = X.tocsr()
    else:
        X = pd.DataFrame(data['data'], columns=attribute_names)

        col = []
        for column_name in X.columns:
            if attribute_dtype[column_name] in ('categorical', 'boolean'):
                col.append(self._unpack_categories(
                    X[column_name], categories_names[column_name]))
            else:
                col.append(X[column_name])
        X = pd.concat(col, axis=1)

    # Pickle the dataframe or the sparse matrix.
    with open(data_pickle_file, "wb") as fh:
        pickle.dump((X, categorical, attribute_names), fh, -1)
    logger.debug("Saved dataset %d: %s to file %s",
                 int(self.dataset_id or -1), self.name, data_pickle_file)
    return data_pickle_file
260268
261269 def push_tag (self , tag ):
262270 """Annotates this data set with a tag on the server.
@@ -394,13 +402,19 @@ def _unpack_categories(series, categories):
394402 return pd .Series (col , index = series .index , dtype = 'category' ,
395403 name = series .name )
396404
397- def get_data (self , target = None ,
398- include_row_id = False ,
399- include_ignore_attributes = False ,
400- return_categorical_indicator = False ,
401- return_attribute_names = False ,
402- dataset_format = None ):
403- """Returns dataset content as dataframes or sparse matrices.
def _download_data(self) -> None:
    """Download the ARFF data file to the standard cache directory.

    The path returned by ``_get_dataset_arff`` is stored in
    ``self.data_file``.
    """
    # Imported inside the method because a module-level import would be
    # circular.
    from .functions import _get_dataset_arff as fetch_arff

    arff_path = fetch_arff(self)
    self.data_file = arff_path
410+
411+ def get_data (self , target : str = None ,
412+ include_row_id : bool = False ,
413+ include_ignore_attributes : bool = False ,
414+ return_categorical_indicator : bool = False ,
415+ return_attribute_names : bool = False ,
416+ dataset_format : str = None ):
417+ """ Returns dataset content as dataframes or sparse matrices.
404418
405419 Parameters
406420 ----------
@@ -416,10 +430,10 @@ def get_data(self, target=None,
416430 categorical.
417431 return_attribute_names : boolean (default=False)
418432 Whether to return attribute names.
419- dataset_format : string
420- The format of returned dataset. If ``array``, the returned dataset
421- will be a NumPy array or a SciPy sparse matrix. If ``dataframe``,
422- the returned dataset will be a Pandas DataFrame or SparseDataFrame.
433+ dataset_format : string, optional
434+ The format of returned dataset.
435+ If ``array``, the returned dataset will be a NumPy array or a SciPy sparse matrix.
436+ If ``dataframe``, the returned dataset will be a Pandas DataFrame or SparseDataFrame.
423437
424438 Returns
425439 -------
@@ -428,12 +442,11 @@ def get_data(self, target=None,
428442 y : ndarray or series, shape (n_samples,)
429443 Target column(s). Only returned if target is not None.
430444 categorical_indicator : boolean ndarray
431- Mask that indicate categorical features. Only returned if
432- return_categorical_indicator is True.
445+ Mask that indicates categorical features.
446+ Only returned if return_categorical_indicator is True.
433447 return_attribute_names : list of strings
434- List of attribute names. Returned only if return_attribute_names is
435- True.
436-
448+ List of attribute names.
449+ Only returned if return_attribute_names is True.
437450 """
438451 if dataset_format is None :
439452 warn ('The default of "dataset_format" will change from "array" to'
@@ -442,6 +455,11 @@ def get_data(self, target=None,
442455
443456 rval = []
444457
458+ if self .data_pickle_file is None :
459+ if self .data_file is None :
460+ self ._download_data ()
461+ self .data_pickle_file = self ._data_arff_to_pickle (self .data_file )
462+
445463 path = self .data_pickle_file
446464 if not os .path .exists (path ):
447465 raise ValueError ("Cannot find a pickle file for dataset %s at "
@@ -554,26 +572,10 @@ def retrieve_class_labels(self, target_name='class'):
554572 -------
555573 list
556574 """
557-
558- # TODO improve performance, currently reads the whole file
559- # Should make a method that only reads the attributes
560- arffFileName = self .data_file
561-
562- if self .format .lower () == 'arff' :
563- return_type = arff .DENSE
564- elif self .format .lower () == 'sparse_arff' :
565- return_type = arff .COO
566- else :
567- raise ValueError ('Unknown data format %s' % self .format )
568-
569- with io .open (arffFileName , encoding = 'utf8' ) as fh :
570- arffData = arff .ArffDecoder ().decode (fh , return_type = return_type )
571-
572- dataAttributes = dict (arffData ['attributes' ])
573- if target_name in dataAttributes :
574- return dataAttributes [target_name ]
575- else :
576- return None
575+ for feature in self .features .values ():
576+ if (feature .name == target_name ) and (feature .data_type == 'nominal' ):
577+ return feature .nominal_values
578+ return None
577579
578580 def get_features_by_type (self , data_type , exclude = None ,
579581 exclude_ignore_attributes = True ,
0 commit comments