Skip to content

Commit 2d2d3ed

Browse files
committed
incorporate pieter's feedback
1 parent 2f2c555 commit 2d2d3ed

6 files changed

Lines changed: 39 additions & 24 deletions

File tree

openml/extensions/extension_interface.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,6 @@ def _run_model_on_fold(
155155
fold_no: int,
156156
y_train: Optional[np.ndarray] = None,
157157
X_test: Optional[Union[np.ndarray, scipy.sparse.spmatrix]] = None,
158-
classes: Optional[List] = None,
159158
) -> Tuple[np.ndarray, np.ndarray, 'OrderedDict[str, float]', Optional['OpenMLRunTrace']]:
160159
"""Run a model on a repeat,fold,subsample triplet of the task and return prediction information.
161160
@@ -179,9 +178,6 @@ def _run_model_on_fold(
179178
indices to the potential classes specified by dataset.
180179
X_test : Optional, array-like (default=None)
181180
Test attributes to test for generalization in supervised tasks.
182-
classes : List
183-
List of classes for supervised classification tasks (and supervised data stream
184-
classification).
185181
186182
Returns
187183
-------

openml/extensions/sklearn/extension.py

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1103,7 +1103,6 @@ def _run_model_on_fold(
11031103
fold_no: int,
11041104
y_train: Optional[np.ndarray] = None,
11051105
X_test: Optional[Union[np.ndarray, scipy.sparse.spmatrix, pd.DataFrame]] = None,
1106-
classes: Optional[List] = None,
11071106
) -> Tuple[np.ndarray, np.ndarray, 'OrderedDict[str, float]', Optional[OpenMLRunTrace]]:
11081107
"""Run a model on a repeat,fold,subsample triplet of the task and return prediction
11091108
information.
@@ -1134,9 +1133,6 @@ def _run_model_on_fold(
11341133
indices to the potential classes specified by dataset.
11351134
X_test : Optional, array-like (default=None)
11361135
Test attributes to test for generalization in supervised tasks.
1137-
classes : List
1138-
List of classes for supervised classification tasks (and supervised data stream
1139-
classification).
11401136
11411137
Returns
11421138
-------
@@ -1183,6 +1179,12 @@ def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarra
11831179
result[obs][prediction_idx] = 1.0
11841180
return result
11851181

1182+
if isinstance(task, OpenMLSupervisedTask):
1183+
if y_train is None:
1184+
raise TypeError('argument y_train must not be of type None')
1185+
if X_test is None:
1186+
raise TypeError('argument X_test must not be of type None')
1187+
11861188
# TODO: if possible, give a warning if model is already fitted (acceptable
11871189
# in case of custom experimentation,
11881190
# but not desirable if we want to upload to OpenML).
@@ -1259,21 +1261,18 @@ def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarra
12591261

12601262
if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
12611263

1262-
if classes is None:
1263-
raise TypeError("Argument classes must not be of type 'None'")
1264-
12651264
try:
12661265
proba_y = model_copy.predict_proba(X_test)
12671266
except AttributeError:
1268-
proba_y = _prediction_to_probabilities(pred_y, list(classes))
1267+
proba_y = _prediction_to_probabilities(pred_y, list(task.class_labels))
12691268

1270-
if proba_y.shape[1] != len(classes):
1269+
if proba_y.shape[1] != len(task.class_labels):
12711270
# Remap the probabilities in case there was a class missing at training time
12721271
# By default, the classification targets are mapped to be zero-based indices to the
12731272
# actual classes. Therefore, the model_classes contain the correct indices to the
12741273
# correct probability array (the actual array might be incorrect if there are
12751274
# some classes not present during train time).
1276-
proba_y_new = np.zeros((proba_y.shape[0], len(classes)))
1275+
proba_y_new = np.zeros((proba_y.shape[0], len(task.class_labels)))
12771276
for idx, model_class in enumerate(model_classes):
12781277
proba_y_new[:, model_class] = proba_y[:, idx]
12791278
proba_y = proba_y_new

openml/runs/functions.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -440,7 +440,6 @@ def _run_task_get_arffcontent(
440440
rep_no=rep_no,
441441
fold_no=fold_no,
442442
X_test=test_x,
443-
classes=classes,
444443
)
445444

446445
arff_datacontent_fold = [] # type: List[List]
@@ -516,7 +515,7 @@ def _calculate_local_measure(sklearn_fn, openml_name):
516515
if len(traces) > 0:
517516
if len(traces) != n_fit:
518517
raise ValueError(
519-
'Did not find enough traces (expected %d, found %d)' % (n_fit, len(traces))
518+
'Did not find enough traces (expected {}, found {})'.format(n_fit, len(traces))
520519
)
521520
else:
522521
trace = OpenMLRunTrace.merge_traces(traces)

openml/runs/trace.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -349,7 +349,7 @@ def trace_from_xml(cls, xml):
349349
return cls(run_id, trace)
350350

351351
@classmethod
352-
def merge_traces(cls, traces: List['OpenMLRunTrace']):
352+
def merge_traces(cls, traces: List['OpenMLRunTrace']) -> 'OpenMLRunTrace':
353353
for i in range(1, len(traces)):
354354
if traces[i] != traces[i - 1]:
355355
raise ValueError('Cannot merge traces!')
@@ -363,7 +363,7 @@ def merge_traces(cls, traces: List['OpenMLRunTrace']):
363363
return cls(None, merged_trace)
364364

365365
def __str__(self):
    """Return a short human-readable summary of the trace.

    The summary has the form ``[Run id: <id>, <n> trace iterations]``.
    When ``self.run_id`` is ``None`` the run id is reported as ``-1``.

    Returns
    -------
    str
        The formatted summary string.
    """
    # ``str.format`` only substitutes ``{}`` replacement fields; printf-style
    # ``%d`` placeholders would be returned verbatim and the arguments
    # silently ignored.
    return '[Run id: {}, {} trace iterations]'.format(
        -1 if self.run_id is None else self.run_id,
        len(self.trace_iterations),
    )

openml/tasks/task.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
import io
22
import os
3+
from typing import Union
4+
5+
import numpy as np
6+
import pandas as pd
7+
import scipy.sparse
38

49
from .. import datasets
510
from .split import OpenMLSplit
@@ -108,7 +113,10 @@ def __init__(self, task_id, task_type_id, task_type, data_set_id,
108113
self.target_name = target_name
109114
self.split = None
110115

111-
def get_X_and_y(self, dataset_format='array'):
116+
def get_X_and_y(
117+
self,
118+
dataset_format: str = 'array',
119+
) -> Union[np.ndarray, pd.DataFrame, scipy.sparse.spmatrix]:
112120
"""Get data associated with the current task.
113121
114122
Returns
@@ -177,7 +185,10 @@ def __init__(self, task_id, task_type_id, task_type, data_set_id,
177185
)
178186
self.number_of_clusters = number_of_clusters
179187

180-
def get_X(self, dataset_format='array'):
188+
def get_X(
189+
self,
190+
dataset_format: str = 'array',
191+
) -> Union[np.ndarray, pd.DataFrame, scipy.sparse.spmatrix]:
181192
"""Get data associated with the current task.
182193
183194
Returns

tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1299,7 +1299,6 @@ def test_run_model_on_fold_classification_1(self):
12991299
X_train=X_train,
13001300
y_train=y_train,
13011301
X_test=X_test,
1302-
classes=task.class_labels,
13031302
)
13041303

13051304
y_hat, y_hat_proba, user_defined_measures, trace = res
@@ -1355,7 +1354,6 @@ def test_run_model_on_fold_classification_2(self):
13551354
X_train=X_train,
13561355
y_train=y_train,
13571356
X_test=X_test,
1358-
classes=task.class_labels,
13591357
)
13601358

13611359
y_hat, y_hat_proba, user_defined_measures, trace = res
@@ -1423,7 +1421,6 @@ def predict_proba(*args, **kwargs):
14231421
X_test=X_test,
14241422
fold_no=0,
14251423
rep_no=0,
1426-
classes=task.class_labels,
14271424
)
14281425
pred_2, proba_2, _, _ = self.extension._run_model_on_fold(
14291426
model=clf2,
@@ -1433,11 +1430,24 @@ def predict_proba(*args, **kwargs):
14331430
X_test=X_test,
14341431
fold_no=0,
14351432
rep_no=0,
1436-
classes=task.class_labels,
14371433
)
14381434

14391435
# verifies that the predictions are identical
14401436
np.testing.assert_array_equal(pred_1, pred_2)
1437+
np.testing.assert_array_almost_equal(np.sum(proba_1, axis=1), np.ones(X_test.shape[0]))
1438+
# Test that there are predictions other than ones and zeros
1439+
print(proba_1, proba_2)
1440+
self.assertLess(
1441+
np.sum(proba_1 == 0) + np.sum(proba_1 == 1),
1442+
X_test.shape[0] * len(task.class_labels),
1443+
)
1444+
1445+
np.testing.assert_array_almost_equal(np.sum(proba_2, axis=1), np.ones(X_test.shape[0]))
1446+
# Test that there are only ones and zeros predicted
1447+
self.assertEqual(
1448+
np.sum(proba_2 == 0) + np.sum(proba_2 == 1),
1449+
X_test.shape[0] * len(task.class_labels),
1450+
)
14411451

14421452
def test_run_model_on_fold_regression(self):
14431453
# There aren't any regression tasks on the test server

0 commit comments

Comments
 (0)