Skip to content

Commit deda557

Browse files
committed
add extra tests, minor refactoring
1 parent 2228059 commit deda557

7 files changed

Lines changed: 319 additions & 176 deletions

File tree

openml/_api_calls.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ def _read_url_files(url, data=None, file_elements=None):
8080
files=file_elements,
8181
)
8282
if response.status_code != 200:
83-
raise _parse_server_exception(response, url=url)
83+
raise _parse_server_exception(response, url)
8484
if 'Content-Encoding' not in response.headers or \
8585
response.headers['Content-Encoding'] != 'gzip':
8686
warnings.warn('Received uncompressed content from OpenML for {}.'
@@ -95,7 +95,7 @@ def _read_url(url, request_method, data=None):
9595

9696
response = send_request(request_method=request_method, url=url, data=data)
9797
if response.status_code != 200:
98-
raise _parse_server_exception(response, url=url)
98+
raise _parse_server_exception(response, url)
9999
if 'Content-Encoding' not in response.headers or \
100100
response.headers['Content-Encoding'] != 'gzip':
101101
warnings.warn('Received uncompressed content from OpenML for {}.'
@@ -137,15 +137,15 @@ def send_request(
137137
return response
138138

139139

140-
def _parse_server_exception(response, url=None):
140+
def _parse_server_exception(response, url):
141141
# OpenML has a sophisticated error system
142142
# where information about failures is provided. Try to parse this
143143
try:
144144
server_exception = xmltodict.parse(response.text)
145145
except Exception:
146146
raise OpenMLServerError(
147-
'Unexpected server error. Please contact the developers!\n'
148-
'Status code: {}\n{}'.format(response.status_code, response.text))
147+
'Unexpected server error when calling {}. Please contact the developers!\n'
148+
'Status code: {}\n{}'.format(url, response.status_code, response.text))
149149

150150
server_error = server_exception['oml:error']
151151
code = int(server_error['oml:code'])

openml/extensions/sklearn/extension.py

Lines changed: 24 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1099,11 +1099,11 @@ def _run_model_on_fold(
10991099
model: Any,
11001100
task: 'OpenMLTask',
11011101
X_train: Union[np.ndarray, scipy.sparse.spmatrix, pd.DataFrame],
1102-
y_train: np.ndarray,
11031102
rep_no: int,
11041103
fold_no: int,
1104+
y_train: Optional[np.ndarray] = None,
11051105
X_test: Optional[Union[np.ndarray, scipy.sparse.spmatrix, pd.DataFrame]] = None,
1106-
n_classes: Optional[int] = None,
1106+
classes: Optional[int] = None,
11071107
) -> Tuple[np.ndarray, np.ndarray, 'OrderedDict[str, float]', Any]:
11081108
"""Run a model on a repeat, fold, subsample triplet of the task and return prediction
11091109
information.
@@ -1156,7 +1156,7 @@ def _run_model_on_fold(
11561156

11571157
def _prediction_to_probabilities(
11581158
y: np.ndarray,
1159-
model_classes: List,
1159+
classes: List,
11601160
) -> np.ndarray:
11611161
"""Transforms predicted probabilities to match with OpenML class indices.
11621162
@@ -1175,13 +1175,12 @@ def _prediction_to_probabilities(
11751175
# y: list or numpy array of predictions
11761176
# classes: sklearn classifier mapping from original array id to
11771177
# prediction index id
1178-
if not isinstance(model_classes, list):
1178+
if not isinstance(classes, list):
11791179
raise ValueError('please convert model classes to list prior to '
11801180
'calling this fn')
1181-
result = np.zeros((len(y), len(model_classes)), dtype=np.float32)
1181+
result = np.zeros((len(y), len(classes)), dtype=np.float32)
11821182
for obs, prediction_idx in enumerate(y):
1183-
array_idx = model_classes.index(prediction_idx)
1184-
result[obs][array_idx] = 1.0
1183+
result[obs][prediction_idx] = 1.0
11851184
return result
11861185

11871186
# TODO: if possible, give a warning if model is already fitted (acceptable
@@ -1239,7 +1238,12 @@ def _prediction_to_probabilities(
12391238

12401239
# In supervised learning this returns the predictions for Y, in clustering
12411240
# it returns the clusters
1242-
pred_y = model_copy.predict(X_test)
1241+
if isinstance(task, OpenMLSupervisedTask):
1242+
pred_y = model_copy.predict(X_test)
1243+
elif isinstance(task, OpenMLClusteringTask):
1244+
pred_y = model_copy.predict(X_train)
1245+
else:
1246+
raise ValueError(task)
12431247

12441248
if can_measure_cputime:
12451249
modelpredict_duration_cputime = (time.process_time()
@@ -1258,13 +1262,18 @@ def _prediction_to_probabilities(
12581262
try:
12591263
proba_y = model_copy.predict_proba(X_test)
12601264
except AttributeError:
1261-
proba_y = _prediction_to_probabilities(pred_y, list(model_classes))
1262-
1263-
pred_y = np.array([model_classes[label] for label in pred_y], dtype=pred_y.dtype)
1264-
proba_y_new = np.zeros((proba_y.shape[0], n_classes))
1265-
for idx, class_idx in enumerate(model_classes):
1266-
proba_y_new[:, class_idx] = proba_y[:, idx]
1267-
proba_y = proba_y_new
1265+
proba_y = _prediction_to_probabilities(pred_y, list(classes))
1266+
1267+
if proba_y.shape[1] != len(classes):
1268+
# Remap the probabilities in case there was a class missing at training time.
1269+
# By default, the classification targets are mapped to be zero-based indices to the
1270+
# actual classes. Therefore, the model_classes contain the correct indices to the
1271+
# correct probability array (the actual array might be incorrect if there are some
1272+
# classes not present during train time).
1273+
proba_y_new = np.zeros((proba_y.shape[0], len(classes)))
1274+
for idx, model_class in enumerate(model_classes):
1275+
proba_y_new[:, model_class] = proba_y[:, idx]
1276+
proba_y = proba_y_new
12681277

12691278
if proba_y.shape[1] != len(task.class_labels):
12701279
message = "Estimator only predicted for {}/{} classes!".format(

openml/runs/functions.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -395,7 +395,7 @@ def _run_task_get_arffcontent(
395395
# TODO use different iterator to only provide a single iterator (less
396396
# methods, less maintenance, less confusion)
397397
num_reps, num_folds, num_samples = task.get_split_dimensions()
398-
n_classes = None
398+
classes = None
399399

400400
n_fit = 0
401401
for rep_no in range(num_reps):
@@ -406,14 +406,15 @@ def _run_task_get_arffcontent(
406406
train_indices, test_indices = task.get_train_test_split_indices(
407407
repeat=rep_no, fold=fold_no, sample=sample_no)
408408
if isinstance(task, OpenMLSupervisedTask):
409-
x, y = task.get_X_and_y()
409+
x, y = task.get_X_and_y(dataset_format='array')
410410
train_x = x[train_indices]
411411
train_y = y[train_indices]
412412
test_x = x[test_indices]
413413
test_y = y[test_indices]
414414
if isinstance(task, (OpenMLClassificationTask, OpenMLClassificationTask)):
415-
n_classes = len(task.class_labels)
415+
classes = task.class_labels
416416
elif isinstance(task, OpenMLClusteringTask):
417+
x = task.get_X(dataset_format='array')
417418
train_x = train_indices
418419
train_y = None
419420
test_x = test_indices
@@ -439,7 +440,7 @@ def _run_task_get_arffcontent(
439440
rep_no=rep_no,
440441
fold_no=fold_no,
441442
X_test=test_x,
442-
n_classes=n_classes,
443+
classes=classes,
443444
)
444445

445446
arff_datacontent_fold = [] # type: List[List]

openml/tasks/task.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ def __init__(self, task_id, task_type_id, task_type, data_set_id,
108108
self.target_name = target_name
109109
self.split = None
110110

111-
def get_X_and_y(self):
111+
def get_X_and_y(self, dataset_format='array'):
112112
"""Get data associated with the current task.
113113
114114
Returns
@@ -120,7 +120,7 @@ def get_X_and_y(self):
120120
if self.task_type_id not in (1, 2, 3):
121121
raise NotImplementedError(self.task_type)
122122
X_and_y = dataset.get_data(
123-
dataset_format='array', target=self.target_name
123+
dataset_format=dataset_format, target=self.target_name,
124124
)
125125
return X_and_y
126126

@@ -177,6 +177,20 @@ def __init__(self, task_id, task_type_id, task_type, data_set_id,
177177
)
178178
self.number_of_clusters = number_of_clusters
179179

180+
def get_X(self, dataset_format='array'):
181+
"""Get data associated with the current task.
182+
183+
Returns
184+
-------
185+
X - the data returned by the dataset when no target is requested (target=None)
186+
187+
"""
188+
dataset = self.get_dataset()
189+
X_and_y = dataset.get_data(
190+
dataset_format=dataset_format, target=None,
191+
)
192+
return X_and_y
193+
180194

181195
class OpenMLLearningCurveTask(OpenMLClassificationTask):
182196
def __init__(self, task_id, task_type_id, task_type, data_set_id,

openml/testing.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@ def _check_fold_timing_evaluations(
144144
num_folds: int,
145145
max_time_allowed: float = 60000.0,
146146
task_type: int = TaskTypeEnum.SUPERVISED_CLASSIFICATION,
147+
check_scores: bool = True,
147148
):
148149
"""
149150
Checks whether the right timing measures are attached to the run
@@ -167,10 +168,11 @@ def _check_fold_timing_evaluations(
167168
'wall_clock_time_millis': (0, max_time_allowed),
168169
}
169170

170-
if task_type in (TaskTypeEnum.SUPERVISED_CLASSIFICATION, TaskTypeEnum.LEARNING_CURVE):
171-
check_measures['predictive_accuracy'] = (0, 1.)
172-
elif task_type == TaskTypeEnum.SUPERVISED_REGRESSION:
173-
check_measures['mean_absolute_error'] = (0, float("inf"))
171+
if check_scores:
172+
if task_type in (TaskTypeEnum.SUPERVISED_CLASSIFICATION, TaskTypeEnum.LEARNING_CURVE):
173+
check_measures['predictive_accuracy'] = (0, 1.)
174+
elif task_type == TaskTypeEnum.SUPERVISED_REGRESSION:
175+
check_measures['mean_absolute_error'] = (0, float("inf"))
174176

175177
self.assertIsInstance(fold_evaluations, dict)
176178
if sys.version_info[:2] >= (3, 3):

0 commit comments

Comments
 (0)