@@ -174,97 +174,106 @@ def __init__(self, name, description, format=None,
def _data_arff_to_pickle(self, data_file):
    """Convert an ARFF data file into a pickled cache blob next to it.

    Parses ``data_file`` (``.arff``) into either a ``pd.DataFrame``
    (dense ARFF) or a ``scipy.sparse.csr_matrix`` (sparse ARFF) and
    caches the result as ``<data_file>.pkl.py3`` so later loads can
    skip ARFF parsing. An existing cache in the current (DataFrame /
    sparse-matrix) format is reused as-is.

    Parameters
    ----------
    data_file : str
        Path to the ``.arff`` file on disk.

    Returns
    -------
    str
        Path of the pickle file holding the tuple
        ``(X, categorical, attribute_names)``.

    Raises
    ------
    OSError
        If the ARFF file cannot be read.
    ValueError
        If sparse ARFF data contains non-numeric categories or string
        attributes, or if ``self.format`` is not a recognized format.
    """
    data_pickle_file = data_file.replace('.arff', '.pkl.py3')
    if os.path.exists(data_pickle_file):
        with open(data_pickle_file, "rb") as fh:
            # NOTE: the cache file is written by this very method, so
            # unpickling this local data is treated as trusted.
            data, categorical, attribute_names = pickle.load(fh)

        # Between v0.8 and v0.9 the format of pickled data changed from
        # np.ndarray to pd.DataFrame. This breaks some backwards compatibility,
        # e.g. for `run_model_on_task`. If a local file still exists with
        # np.ndarray data, we reprocess the data file to store a pickled
        # pd.DataFrame blob. See also #646.
        if isinstance(data, pd.DataFrame) or scipy.sparse.issparse(data):
            logger.debug("Data pickle file already exists.")
            return data_pickle_file

    try:
        data = self._get_arff(self.format)
    except OSError:
        logger.critical("Please check that the data file %s is "
                        "there and can be read.", data_file)
        # Bare `raise` re-raises the active exception with its original
        # traceback; no need to capture and re-raise the bound name.
        raise

    ARFF_DTYPES_TO_PD_DTYPE = {
        'INTEGER': 'integer',
        'REAL': 'floating',
        'NUMERIC': 'floating',
        'STRING': 'string'
    }
    # Normalize the format string once instead of on every comparison.
    fmt = self.format.lower()
    attribute_dtype = {}
    attribute_names = []
    categories_names = {}
    categorical = []
    for name, type_ in data['attributes']:
        # If the feature is nominal and a sparse matrix is requested,
        # the categories need to be numeric.
        if isinstance(type_, list) and fmt == 'sparse_arff':
            try:
                np.array(type_, dtype=np.float32)
            except ValueError:
                raise ValueError(
                    "Categorical data needs to be numeric when "
                    "using sparse ARFF."
                )
        # Strings can only be supported with a pandas DataFrame.
        elif type_ == 'STRING' and fmt == 'sparse_arff':
            raise ValueError(
                "Dataset containing strings is not supported "
                "with sparse ARFF."
            )

        # Infer the dtype from the ARFF header.
        if isinstance(type_, list):
            categorical.append(True)
            categories_names[name] = type_
            if len(type_) == 2:
                type_norm = [cat.lower().capitalize()
                             for cat in type_]
                if set(['True', 'False']) == set(type_norm):
                    # Two-category True/False attributes become real
                    # booleans rather than generic categoricals.
                    categories_names[name] = [
                        True if cat == 'True' else False
                        for cat in type_norm
                    ]
                    attribute_dtype[name] = 'boolean'
                else:
                    attribute_dtype[name] = 'categorical'
            else:
                attribute_dtype[name] = 'categorical'
        else:
            categorical.append(False)
            attribute_dtype[name] = ARFF_DTYPES_TO_PD_DTYPE[type_]
        attribute_names.append(name)

    if fmt == 'sparse_arff':
        # liac-arff returns sparse data as (values, row_idx, col_idx);
        # infer the shape from the maximum observed indices.
        X = data['data']
        X_shape = (max(X[1]) + 1, max(X[2]) + 1)
        X = scipy.sparse.coo_matrix(
            (X[0], (X[1], X[2])), shape=X_shape, dtype=np.float32)
        # CSR supports the efficient row slicing downstream code needs.
        X = X.tocsr()
    elif fmt == 'arff':
        X = pd.DataFrame(data['data'], columns=attribute_names)

        col = []
        for column_name in X.columns:
            if attribute_dtype[column_name] in ('categorical',
                                                'boolean'):
                col.append(self._unpack_categories(
                    X[column_name], categories_names[column_name]))
            else:
                col.append(X[column_name])
        X = pd.concat(col, axis=1)
    else:
        # Previously an unrecognized format fell through with `X`
        # unbound and crashed below with an opaque NameError;
        # fail loudly with a clear message instead.
        raise ValueError("Unknown data format: %s" % self.format)

    # Pickle the dataframe or the sparse matrix.
    with open(data_pickle_file, "wb") as fh:
        pickle.dump((X, categorical, attribute_names), fh, -1)
    logger.debug("Saved dataset {did}: {name} to file {path}"
                 .format(did=int(self.dataset_id or -1),
                         name=self.name,
                         path=data_pickle_file)
                 )
    return data_pickle_file
268277
269278 def push_tag (self , tag ):
270279 """Annotates this data set with a tag on the server.
0 commit comments