Skip to content

Commit e5b23ed

Browse files
committed
fixed problem with 'empty' result sets (e.g., dataset anneal misses class '3', which can now be handled)
1 parent 6138c52 commit e5b23ed

2 files changed

Lines changed: 86 additions & 7 deletions

File tree

openml/runs/functions.py

Lines changed: 41 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
import io
33
import os
44
import xmltodict
5+
import numpy as np
6+
import warnings
57
from sklearn.model_selection._search import BaseSearchCV
68

79
from .. import config
@@ -70,6 +72,41 @@ def run_task(task, model):
7072
return run
7173

7274

75+
def _prediction_to_row(rep_no, fold_no, row_id, correct_label, predicted_label, predicted_probabilities, class_labels, model_classes_mapping):
76+
"""Complicated util function that turns probability estimates of a classifier for a given instance into the right arff format to upload to openml.
77+
78+
Parameters
79+
----------
80+
rep_no : int
81+
fold_no : int
82+
row_id : int
83+
row id in the initial dataset
84+
correct_label : str
85+
original label of the instance
86+
predicted_label : str
87+
the label that was predicted
88+
predicted_probabilities : array (size=num_classes)
89+
probabilities per class
90+
class_labels : array (size=num_classes)
91+
92+
Returns
93+
-------
94+
arff_line : list
95+
representation of the current prediction in OpenML format
96+
"""
97+
arff_line = [rep_no, fold_no, row_id]
98+
for class_label_idx in range(len(class_labels)):
99+
if class_label_idx in model_classes_mapping:
100+
index = np.where(model_classes_mapping == class_label_idx)[0][0] # TODO: WHY IS THIS 2D???
101+
arff_line.append(predicted_probabilities[index])
102+
else:
103+
arff_line.append(0.0)
104+
105+
arff_line.append(class_labels[predicted_label])
106+
arff_line.append(correct_label)
107+
return arff_line
108+
109+
# JvR: why is class labels a parameter? could be removed and taken from task object, right?
73110
def _run_task_get_arffcontent(model, task, class_labels):
74111
X, Y = task.get_X_and_y()
75112
arff_datacontent = []
@@ -89,18 +126,15 @@ def _run_task_get_arffcontent(model, task, class_labels):
89126

90127
model.fit(trainX, trainY)
91128
if isinstance(model, BaseSearchCV):
92-
_add_results_to_arfftrace(arff_tracecontent, fold_no, model,
93-
rep_no)
129+
_add_results_to_arfftrace(arff_tracecontent, fold_no, model, rep_no)
94130

95131
ProbaY = model.predict_proba(testX)
96132
PredY = model.predict(testX)
133+
if ProbaY.shape[1] != len(class_labels):
134+
warnings.warn("Repeat %d Fold %d: estimator only predicted for %d/%d classes!" %(rep_no, fold_no, ProbaY.shape[1], len(class_labels)))
97135

98136
for i in range(0, len(test_indices)):
99-
assert(len(ProbaY[i]) == len(class_labels)), 'Predicted probabilities and available classes do not match. (sklearn bug?) '
100-
arff_line = [rep_no, fold_no, test_indices[i]]
101-
arff_line.extend(ProbaY[i])
102-
arff_line.append(class_labels[PredY[i]])
103-
arff_line.append(class_labels[testY[i]])
137+
arff_line = _prediction_to_row(rep_no, fold_no, test_indices[i], class_labels[testY[i]], PredY[i], ProbaY[i], class_labels, model.classes_)
104138
arff_datacontent.append(arff_line)
105139

106140
fold_no = fold_no + 1

tests/test_runs/test_run_functions.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -253,3 +253,48 @@ def test_get_runs_list_by_filters(self):
253253
def test_get_runs_list_by_tag(self):
254254
runs = openml.runs.list_runs(tag='curves')
255255
self.assertGreaterEqual(len(runs), 1)
256+
257+
def test_run_on_dataset_with_missing_labels(self):
    """Check that _prediction_to_row keeps probabilities aligned with labels.

    For every repeat/fold of task 2 a pipeline is fitted and each test
    prediction is converted to an arff row. Every probability column of the
    row, except the one at the missing label index, must equal the matching
    entry of the model's predict_proba output.
    """
    from openml.runs.functions import _prediction_to_row
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.preprocessing.imputation import Imputer

    task = openml.tasks.get_task(2)
    class_labels = task.class_labels

    model = Pipeline(steps=[('Imputer', Imputer(strategy='median')),
                            ('Estimator', DecisionTreeClassifier())])

    X, Y = task.get_X_and_y()

    # Class index for which the model is expected to give no probability.
    # NOTE(review): presumably the class the dataset lacks - confirm for task 2.
    missing_label_idx = [3]

    # TODO use different iterator to only provide a single iterator (less
    # methods, less maintenance, less confusion)
    for rep_no, rep in enumerate(task.iterate_repeats()):
        for fold_no, (train_indices, test_indices) in enumerate(rep):
            trainX, trainY = X[train_indices], Y[train_indices]
            testX, testY = X[test_indices], Y[test_indices]

            model.fit(trainX, trainY)

            ProbaY = model.predict_proba(testX)
            PredY = model.predict(testX)

            for i, row_id in enumerate(test_indices):
                arff_line = _prediction_to_row(
                    rep_no, fold_no, row_id, class_labels[testY[i]],
                    PredY[i], ProbaY[i], class_labels, model.classes_)

                # Columns 3..-2 are the per-class probabilities; the model's
                # own output has no column for the missing label, so its
                # index lags behind by one for every missing label passed.
                offset = 0
                for idx, proba in enumerate(arff_line[3:-2]):
                    if idx in missing_label_idx:
                        offset += 1
                        continue
                    assert proba == ProbaY[i][idx-offset]
300+

0 commit comments

Comments
 (0)