Skip to content

Commit 7ec429e

Browse files
PGijsbers and mfeurer
authored and committed
Fix backwards compatibility #646. (#654)
* Fix backwards compatibility #646. Reprocess ARFF file if outdated datatype was used in pickle. * Skip test for which API is currently not working.
1 parent 3984a64 commit 7ec429e

1 file changed

Lines changed: 94 additions & 85 deletions

File tree

openml/datasets/dataset.py

Lines changed: 94 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -174,97 +174,106 @@ def __init__(self, name, description, format=None,
174174
def _data_arff_to_pickle(self, data_file):
175175
data_pickle_file = data_file.replace('.arff', '.pkl.py3')
176176
if os.path.exists(data_pickle_file):
177-
logger.debug("Data pickle file already exists.")
178-
return data_pickle_file
179-
else:
180-
try:
181-
data = self._get_arff(self.format)
182-
except OSError as e:
183-
logger.critical("Please check that the data file %s is "
184-
"there and can be read.", data_file)
185-
raise e
186-
187-
ARFF_DTYPES_TO_PD_DTYPE = {
188-
'INTEGER': 'integer',
189-
'REAL': 'floating',
190-
'NUMERIC': 'floating',
191-
'STRING': 'string'
192-
}
193-
attribute_dtype = {}
194-
attribute_names = []
195-
categories_names = {}
196-
categorical = []
197-
for name, type_ in data['attributes']:
198-
# if the feature is nominal and the a sparse matrix is
199-
# requested, the categories need to be numeric
200-
if (isinstance(type_, list)
201-
and self.format.lower() == 'sparse_arff'):
202-
try:
203-
np.array(type_, dtype=np.float32)
204-
except ValueError:
205-
raise ValueError(
206-
"Categorical data needs to be numeric when "
207-
"using sparse ARFF."
208-
)
209-
# string can only be supported with pandas DataFrame
210-
elif (type_ == 'STRING'
211-
and self.format.lower() == 'sparse_arff'):
177+
with open(data_pickle_file, "rb") as fh:
178+
data, categorical, attribute_names = pickle.load(fh)
179+
180+
# Between v0.8 and v0.9 the format of pickled data changed from
181+
# np.ndarray to pd.DataFrame. This breaks some backwards compatibility,
182+
# e.g. for `run_model_on_task`. If a local file still exists with
183+
# np.ndarray data, we reprocess the data file to store a pickled
184+
# pd.DataFrame blob. See also #646.
185+
if isinstance(data, pd.DataFrame) or scipy.sparse.issparse(data):
186+
logger.debug("Data pickle file already exists.")
187+
return data_pickle_file
188+
189+
try:
190+
data = self._get_arff(self.format)
191+
except OSError as e:
192+
logger.critical("Please check that the data file %s is "
193+
"there and can be read.", data_file)
194+
raise e
195+
196+
ARFF_DTYPES_TO_PD_DTYPE = {
197+
'INTEGER': 'integer',
198+
'REAL': 'floating',
199+
'NUMERIC': 'floating',
200+
'STRING': 'string'
201+
}
202+
attribute_dtype = {}
203+
attribute_names = []
204+
categories_names = {}
205+
categorical = []
206+
for name, type_ in data['attributes']:
207+
# if the feature is nominal and the a sparse matrix is
208+
# requested, the categories need to be numeric
209+
if (isinstance(type_, list)
210+
and self.format.lower() == 'sparse_arff'):
211+
try:
212+
np.array(type_, dtype=np.float32)
213+
except ValueError:
212214
raise ValueError(
213-
"Dataset containing strings is not supported "
214-
"with sparse ARFF."
215+
"Categorical data needs to be numeric when "
216+
"using sparse ARFF."
215217
)
218+
# string can only be supported with pandas DataFrame
219+
elif (type_ == 'STRING'
220+
and self.format.lower() == 'sparse_arff'):
221+
raise ValueError(
222+
"Dataset containing strings is not supported "
223+
"with sparse ARFF."
224+
)
216225

217-
# infer the dtype from the ARFF header
218-
if isinstance(type_, list):
219-
categorical.append(True)
220-
categories_names[name] = type_
221-
if len(type_) == 2:
222-
type_norm = [cat.lower().capitalize()
223-
for cat in type_]
224-
if set(['True', 'False']) == set(type_norm):
225-
categories_names[name] = [
226-
True if cat == 'True' else False
227-
for cat in type_norm
228-
]
229-
attribute_dtype[name] = 'boolean'
230-
else:
231-
attribute_dtype[name] = 'categorical'
226+
# infer the dtype from the ARFF header
227+
if isinstance(type_, list):
228+
categorical.append(True)
229+
categories_names[name] = type_
230+
if len(type_) == 2:
231+
type_norm = [cat.lower().capitalize()
232+
for cat in type_]
233+
if set(['True', 'False']) == set(type_norm):
234+
categories_names[name] = [
235+
True if cat == 'True' else False
236+
for cat in type_norm
237+
]
238+
attribute_dtype[name] = 'boolean'
232239
else:
233240
attribute_dtype[name] = 'categorical'
234241
else:
235-
categorical.append(False)
236-
attribute_dtype[name] = ARFF_DTYPES_TO_PD_DTYPE[type_]
237-
attribute_names.append(name)
238-
239-
if self.format.lower() == 'sparse_arff':
240-
X = data['data']
241-
X_shape = (max(X[1]) + 1, max(X[2]) + 1)
242-
X = scipy.sparse.coo_matrix(
243-
(X[0], (X[1], X[2])), shape=X_shape, dtype=np.float32)
244-
X = X.tocsr()
245-
246-
elif self.format.lower() == 'arff':
247-
X = pd.DataFrame(data['data'], columns=attribute_names)
248-
249-
col = []
250-
for column_name in X.columns:
251-
if attribute_dtype[column_name] in ('categorical',
252-
'boolean'):
253-
col.append(self._unpack_categories(
254-
X[column_name], categories_names[column_name]))
255-
else:
256-
col.append(X[column_name])
257-
X = pd.concat(col, axis=1)
258-
259-
# Pickle the dataframe or the sparse matrix.
260-
with open(data_pickle_file, "wb") as fh:
261-
pickle.dump((X, categorical, attribute_names), fh, -1)
262-
logger.debug("Saved dataset {did}: {name} to file {path}"
263-
.format(did=int(self.dataset_id or -1),
264-
name=self.name,
265-
path=data_pickle_file)
266-
)
267-
return data_pickle_file
242+
attribute_dtype[name] = 'categorical'
243+
else:
244+
categorical.append(False)
245+
attribute_dtype[name] = ARFF_DTYPES_TO_PD_DTYPE[type_]
246+
attribute_names.append(name)
247+
248+
if self.format.lower() == 'sparse_arff':
249+
X = data['data']
250+
X_shape = (max(X[1]) + 1, max(X[2]) + 1)
251+
X = scipy.sparse.coo_matrix(
252+
(X[0], (X[1], X[2])), shape=X_shape, dtype=np.float32)
253+
X = X.tocsr()
254+
255+
elif self.format.lower() == 'arff':
256+
X = pd.DataFrame(data['data'], columns=attribute_names)
257+
258+
col = []
259+
for column_name in X.columns:
260+
if attribute_dtype[column_name] in ('categorical',
261+
'boolean'):
262+
col.append(self._unpack_categories(
263+
X[column_name], categories_names[column_name]))
264+
else:
265+
col.append(X[column_name])
266+
X = pd.concat(col, axis=1)
267+
268+
# Pickle the dataframe or the sparse matrix.
269+
with open(data_pickle_file, "wb") as fh:
270+
pickle.dump((X, categorical, attribute_names), fh, -1)
271+
logger.debug("Saved dataset {did}: {name} to file {path}"
272+
.format(did=int(self.dataset_id or -1),
273+
name=self.name,
274+
path=data_pickle_file)
275+
)
276+
return data_pickle_file
268277

269278
def push_tag(self, tag):
270279
"""Annotates this data set with a tag on the server.

0 commit comments

Comments (0)