Skip to content

Commit b3262b6

Browse files
authored
Merge pull request #213 from openml/smartimputer
Smartimputer
2 parents 34efa1b + b11d5a5 commit b3262b6

16 files changed

Lines changed: 20346 additions & 90 deletions

File tree

openml/__init__.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
"""
1717
from . import config
1818

19-
from .datasets import OpenMLDataset
19+
from .datasets import OpenMLDataset, OpenMLDataFeature
2020
from . import datasets
2121
from . import runs
2222
from . import flows
@@ -27,5 +27,6 @@
2727

2828
__version__ = "0.2.1"
2929

30-
__all__ = ['OpenMLDataset', 'OpenMLRun', 'OpenMLSplit', 'datasets',
31-
'OpenMLTask', 'OpenMLFlow', 'config', 'runs', 'flows']
30+
__all__ = ['OpenMLDataset', 'OpenMLDataFeature', 'OpenMLRun',
31+
'OpenMLSplit', 'datasets', 'OpenMLTask', 'OpenMLFlow',
32+
'config', 'runs', 'flows']

openml/datasets/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from .functions import (list_datasets, check_datasets_active,
22
get_datasets, get_dataset)
33
from .dataset import OpenMLDataset
4+
from .data_feature import OpenMLDataFeature
45

56
__all__ = ['check_datasets_active', 'get_dataset', 'get_datasets',
6-
'OpenMLDataset', 'list_datasets']
7+
'OpenMLDataset', 'OpenMLDataFeature', 'list_datasets']

openml/datasets/data_feature.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
2+
class OpenMLDataFeature(object):
    """Data Feature (a.k.a. Attribute) object.

    Parameters
    ----------
    index : int
        The index of this feature
    name : str
        Name of the feature
    data_type : str
        can be nominal, numeric, string, date (corresponds to arff)
    nominal_values : list(str)
        list of the possible values, in case of nominal attribute
    number_missing_values : int
        number of instances in which this feature has a missing value

    Raises
    ------
    ValueError
        If ``index`` or ``number_missing_values`` is not an integer, if
        ``data_type`` is not one of ``LEGAL_DATA_TYPES``, or if
        ``nominal_values`` is neither ``None`` nor a list.
    """
    LEGAL_DATA_TYPES = ['nominal', 'numeric', 'string', 'date']

    def __init__(self, index, name, data_type, nominal_values,
                 number_missing_values):
        # isinstance (rather than a strict type() comparison) also accepts
        # subclasses of int/list, which a caller could legitimately pass.
        if not isinstance(index, int):
            raise ValueError('Index is of wrong datatype')
        if data_type not in self.LEGAL_DATA_TYPES:
            raise ValueError('data type should be in %s, found: %s'
                             % (str(self.LEGAL_DATA_TYPES), data_type))
        if nominal_values is not None and not isinstance(nominal_values, list):
            raise ValueError('Nominal_values is of wrong datatype')
        if not isinstance(number_missing_values, int):
            raise ValueError('number_missing_values is of wrong datatype')

        self.index = index
        self.name = str(name)
        self.data_type = str(data_type)
        self.nominal_values = nominal_values
        self.number_missing_values = number_missing_values

    def __str__(self):
        return "[%d - %s (%s)]" % (self.index, self.name, self.data_type)

openml/datasets/dataset.py

Lines changed: 83 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import io
33
import logging
44
import os
5+
import six
56
import sys
67

78
import arff
@@ -10,6 +11,7 @@
1011
import scipy.sparse
1112
import xmltodict
1213

14+
from .data_feature import OpenMLDataFeature
1315
from ..exceptions import PyOpenMLError
1416

1517
if sys.version_info[0] >= 3:
@@ -63,7 +65,15 @@ def __init__(self, dataset_id=None, name=None, version=None, description=None,
6365
self.url = url
6466
self.default_target_attribute = default_target_attribute
6567
self.row_id_attribute = row_id_attribute
66-
self.ignore_attributes = ignore_attribute
68+
self.ignore_attributes = None
69+
if isinstance(ignore_attribute, six.string_types):
70+
self.ignore_attributes = [ignore_attribute]
71+
elif isinstance(ignore_attribute, list):
72+
self.ignore_attributes = ignore_attribute
73+
elif ignore_attribute is None:
74+
pass
75+
else:
76+
raise ValueError('wrong data type for ignore_attribute. Should be list. ')
6777
self.version_label = version_label
6878
self.citation = citation
6979
self.tag = tag
@@ -73,7 +83,20 @@ def __init__(self, dataset_id=None, name=None, version=None, description=None,
7383
self.update_comment = update_comment
7484
self.md5_cheksum = md5_checksum
7585
self.data_file = data_file
76-
self.features = features
86+
self.features = None
87+
88+
if features is not None:
89+
self.features = {}
90+
for idx, xmlfeature in enumerate(features['oml:feature']):
91+
feature = OpenMLDataFeature(int(xmlfeature['oml:index']),
92+
xmlfeature['oml:name'],
93+
xmlfeature['oml:data_type'],
94+
None, #todo add nominal values (currently not in database)
95+
int(xmlfeature['oml:number_of_missing_values']))
96+
if idx != feature.index:
97+
raise ValueError('Data features not provided in right order')
98+
self.features[feature.index] = feature
99+
77100

78101
if data_file is not None:
79102
if self._data_features_supported():
@@ -205,10 +228,7 @@ def get_data(self, target=None, target_dtype=int, include_row_id=False,
205228
if not self.ignore_attributes:
206229
pass
207230
else:
208-
if is_string(self.ignore_attributes):
209-
to_exclude.append(self.ignore_attributes)
210-
else:
211-
to_exclude.extend(self.ignore_attributes)
231+
to_exclude.extend(self.ignore_attributes)
212232

213233
if len(to_exclude) > 0:
214234
logger.info("Going to remove the following attributes:"
@@ -298,6 +318,61 @@ def retrieve_class_labels(self, target_name='class'):
298318
else:
299319
return None
300320

321+
322+
def get_features_by_type(self, data_type, exclude=None,
                         exclude_ignore_attributes=True,
                         exclude_row_id_attribute=True):
    '''
    Return the indices of all features that have the requested data type.

    Excluded features (by name, by the dataset's ignore attributes, or by
    the row-id attribute) are treated as if they were removed from the
    dataset, so the returned indices are shifted accordingly.

    Parameters
    ----------
    data_type : str
        The data type to return (e.g., nominal, numeric, date, string)
    exclude : list(int)
        Indices to exclude (and adapt the return values as if these indices
        are not present)
    exclude_ignore_attributes : bool
        Whether to exclude the defined ignore attributes (and adapt the
        return values as if these indices are not present)
    exclude_row_id_attribute : bool
        Whether to exclude the defined row id attributes (and adapt the
        return values as if these indices are not present)

    Returns
    -------
    result : list
        a list of indices that have the specified data type
    '''
    assert data_type in OpenMLDataFeature.LEGAL_DATA_TYPES, "Illegal feature type requested"
    if self.ignore_attributes is not None:
        assert type(self.ignore_attributes) is list, "ignore_attributes should be a list"
    if self.row_id_attribute is not None:
        assert type(self.row_id_attribute) is str, "row id attribute should be a str"
    if exclude is not None:
        assert type(exclude) is list, "Exclude should be a list"

    # Collect every feature name that should be treated as absent.
    excluded_names = set()
    if exclude is not None:
        excluded_names.update(exclude)
    if exclude_ignore_attributes and self.ignore_attributes is not None:
        excluded_names.update(self.ignore_attributes)
    if exclude_row_id_attribute and self.row_id_attribute is not None:
        excluded_names.add(self.row_id_attribute)

    # Excluded features shift every later index down by one, as if they
    # had been dropped from the dataset before counting.
    result = []
    removed_so_far = 0
    for idx in self.features:
        feature = self.features[idx]
        if feature.name in excluded_names:
            removed_so_far += 1
        elif feature.data_type == data_type:
            result.append(idx - removed_so_far)
    return result
375+
301376
def publish(self):
302377
"""Publish the dataset on the OpenML server.
303378
@@ -349,8 +424,8 @@ def _to_xml(self):
349424

350425
def _data_features_supported(self):
    # When no feature metadata is available we optimistically assume the
    # data can be handled.
    if self.features is None:
        return True
    # Only numeric and nominal attributes are supported downstream.
    return all(self.features[idx].data_type in ('numeric', 'nominal')
               for idx in self.features)

openml/flows/functions.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,10 +66,10 @@ def list_flows(offset=None, size=None, tag=None):
6666
if tag is not None:
6767
api_call += "/tag/%s" % tag
6868

69-
return _list_datasets(api_call)
69+
return _list_flows(api_call)
7070

7171

72-
def _list_datasets(api_call):
72+
def _list_flows(api_call):
7373
# TODO add proper error handling here!
7474
xml_string = _perform_api_call(api_call)
7575
flows_dict = xmltodict.parse(xml_string)

openml/runs/functions.py

Lines changed: 52 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,11 @@
22
import io
33
import os
44
import xmltodict
5+
import numpy as np
6+
import warnings
57
from sklearn.model_selection._search import BaseSearchCV
68

9+
from ..exceptions import PyOpenMLError
710
from .. import config
811
from ..flows import sklearn_to_flow
912
from ..exceptions import OpenMLCacheException
@@ -50,11 +53,7 @@ def run_task(task, model):
5053

5154
# execute the run
5255
run = OpenMLRun(task_id=task.task_id, flow_id=None, dataset_id=dataset.dataset_id, model=model)
53-
54-
try:
55-
run.data_content, run.trace_content = _run_task_get_arffcontent(model, task, class_labels)
56-
except AttributeError as message:
57-
run.error_message = str(message)
56+
run.data_content, run.trace_content = _run_task_get_arffcontent(model, task, class_labels)
5857

5958
# now generate the flow
6059
flow = sklearn_to_flow(model)
@@ -70,6 +69,46 @@ def run_task(task, model):
7069
return run
7170

7271

72+
def _prediction_to_row(rep_no, fold_no, row_id, correct_label, predicted_label,
73+
predicted_probabilities, class_labels, model_classes_mapping):
74+
"""Util function that turns probability estimates of a classifier for a given
75+
instance into the right arff format to upload to openml.
76+
77+
Parameters
78+
----------
79+
rep_no : int
80+
fold_no : int
81+
row_id : int
82+
row id in the initial dataset
83+
correct_label : str
84+
original label of the instance
85+
predicted_label : str
86+
the label that was predicted
87+
predicted_probabilities : array (size=num_classes)
88+
probabilities per class
89+
class_labels : array (size=num_classes)
90+
model_classes_mapping : list
91+
A list of classes the model produced.
92+
Obtained by BaseEstimator.classes_
93+
94+
Returns
95+
-------
96+
arff_line : list
97+
representation of the current prediction in OpenML format
98+
"""
99+
arff_line = [rep_no, fold_no, row_id]
100+
for class_label_idx in range(len(class_labels)):
101+
if class_label_idx in model_classes_mapping:
102+
index = np.where(model_classes_mapping == class_label_idx)[0][0] # TODO: WHY IS THIS 2D???
103+
arff_line.append(predicted_probabilities[index])
104+
else:
105+
arff_line.append(0.0)
106+
107+
arff_line.append(class_labels[predicted_label])
108+
arff_line.append(correct_label)
109+
return arff_line
110+
111+
# JvR: why is class labels a parameter? could be removed and taken from task object, right?
73112
def _run_task_get_arffcontent(model, task, class_labels):
74113
X, Y = task.get_X_and_y()
75114
arff_datacontent = []
@@ -88,19 +127,20 @@ def _run_task_get_arffcontent(model, task, class_labels):
88127
testY = Y[test_indices]
89128

90129
model.fit(trainX, trainY)
130+
91131
if isinstance(model, BaseSearchCV):
92-
_add_results_to_arfftrace(arff_tracecontent, fold_no, model,
93-
rep_no)
132+
_add_results_to_arfftrace(arff_tracecontent, fold_no, model, rep_no)
133+
model_classes = model.best_estimator_.classes_
134+
else:
135+
model_classes = model.classes_
94136

95137
ProbaY = model.predict_proba(testX)
96138
PredY = model.predict(testX)
139+
if ProbaY.shape[1] != len(class_labels):
140+
warnings.warn("Repeat %d Fold %d: estimator only predicted for %d/%d classes!" %(rep_no, fold_no, ProbaY.shape[1], len(class_labels)))
97141

98142
for i in range(0, len(test_indices)):
99-
assert(len(ProbaY[i]) == len(class_labels)), 'Predicted probabilities and available classes do not match. (sklearn bug?) '
100-
arff_line = [rep_no, fold_no, test_indices[i]]
101-
arff_line.extend(ProbaY[i])
102-
arff_line.append(class_labels[PredY[i]])
103-
arff_line.append(class_labels[testY[i]])
143+
arff_line = _prediction_to_row(rep_no, fold_no, test_indices[i], class_labels[testY[i]], PredY[i], ProbaY[i], class_labels, model_classes)
104144
arff_datacontent.append(arff_line)
105145

106146
fold_no = fold_no + 1

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,5 @@ liac-arff>=2.1.1dev
55
xmltodict
66
nose
77
requests
8-
scikit-learn
8+
scikit-learn>=0.18
99
nbformat

0 commit comments

Comments
 (0)