Skip to content

Commit e354b04

Browse files
committed
incorporate pieter's feedback
1 parent 2d2d3ed commit e354b04

5 files changed

Lines changed: 159 additions & 147 deletions

File tree

openml/extensions/sklearn/extension.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1270,8 +1270,12 @@ def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarra
12701270
# Remap the probabilities in case there was a class missing at training time
12711271
# By default, the classification targets are mapped to be zero-based indices to the
12721272
# actual classes. Therefore, the model_classes contain the correct indices to the
1273-
# correct probability array (the actually array might be incorrect if there are
1274-
# some classes not present during train time).
1273+
# correct probability array. Example:
1274+
# classes in the dataset: 0, 1, 2, 3, 4, 5
1275+
# classes in the training set: 0, 1, 2, 4, 5
1276+
# then we need to add a column full of zeros into the probabilities for class 3
1277+
# (because the rest of the library expects that the probabilities are ordered the
1278+
# same way as the classes are ordered).
12751279
proba_y_new = np.zeros((proba_y.shape[0], len(task.class_labels)))
12761280
for idx, model_class in enumerate(model_classes):
12771281
proba_y_new[:, model_class] = proba_y[:, idx]

openml/runs/functions.py

Lines changed: 101 additions & 106 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from collections import OrderedDict
22
import io
3+
import itertools
34
import os
45
from typing import Any, List, Optional, Set, Tuple, Union, TYPE_CHECKING # noqa F401
56
import warnings
@@ -395,125 +396,119 @@ def _run_task_get_arffcontent(
395396
# TODO use different iterator to only provide a single iterator (fewer
396397
# methods, less maintenance, less confusion)
397398
num_reps, num_folds, num_samples = task.get_split_dimensions()
398-
classes = None
399-
400-
n_fit = 0
401-
for rep_no in range(num_reps):
402-
for fold_no in range(num_folds):
403-
for sample_no in range(num_samples):
404-
n_fit += 1
405-
406-
train_indices, test_indices = task.get_train_test_split_indices(
407-
repeat=rep_no, fold=fold_no, sample=sample_no)
408-
if isinstance(task, OpenMLSupervisedTask):
409-
x, y = task.get_X_and_y(dataset_format='array')
410-
train_x = x[train_indices]
411-
train_y = y[train_indices]
412-
test_x = x[test_indices]
413-
test_y = y[test_indices]
414-
if isinstance(task, (OpenMLClassificationTask, OpenMLClassificationTask)):
415-
classes = task.class_labels
416-
elif isinstance(task, OpenMLClusteringTask):
417-
x = task.get_X(dataset_format='array')
418-
train_x = train_indices
419-
train_y = None
420-
test_x = test_indices
421-
test_y = None
422-
else:
423-
raise NotImplementedError(task.task_type)
424-
425-
config.logger.info(
426-
"Going to execute flow '%s' on task %d for repeat %d fold %d sample %d.",
427-
flow.name, task.task_id, rep_no, fold_no, sample_no,
428-
)
429399

430-
(
431-
pred_y,
432-
proba_y,
433-
user_defined_measures_fold,
434-
trace,
435-
) = extension._run_model_on_fold(
436-
model=model,
437-
task=task,
438-
X_train=train_x,
439-
y_train=train_y,
440-
rep_no=rep_no,
441-
fold_no=fold_no,
442-
X_test=test_x,
400+
for n_fit, (rep_no, fold_no, sample_no) in enumerate(itertools.product(
401+
range(num_reps),
402+
range(num_folds),
403+
range(num_samples),
404+
)):
405+
406+
train_indices, test_indices = task.get_train_test_split_indices(
407+
repeat=rep_no, fold=fold_no, sample=sample_no)
408+
if isinstance(task, OpenMLSupervisedTask):
409+
x, y = task.get_X_and_y(dataset_format='array')
410+
train_x = x[train_indices]
411+
train_y = y[train_indices]
412+
test_x = x[test_indices]
413+
test_y = y[test_indices]
414+
elif isinstance(task, OpenMLClusteringTask):
415+
x = task.get_X(dataset_format='array')
416+
train_x = x[train_indices]
417+
train_y = None
418+
test_x = None
419+
test_y = None
420+
else:
421+
raise NotImplementedError(task.task_type)
422+
423+
config.logger.info(
424+
"Going to execute flow '%s' on task %d for repeat %d fold %d sample %d.",
425+
flow.name, task.task_id, rep_no, fold_no, sample_no,
426+
)
427+
428+
(
429+
pred_y,
430+
proba_y,
431+
user_defined_measures_fold,
432+
trace,
433+
) = extension._run_model_on_fold(
434+
model=model,
435+
task=task,
436+
X_train=train_x,
437+
y_train=train_y,
438+
rep_no=rep_no,
439+
fold_no=fold_no,
440+
X_test=test_x,
441+
)
442+
if trace is not None:
443+
traces.append(trace)
444+
445+
# add client-side calculated metrics. These are used on the server as
446+
# consistency check, only useful for supervised tasks
447+
def _calculate_local_measure(sklearn_fn, openml_name):
448+
user_defined_measures_fold[openml_name] = sklearn_fn(test_y, pred_y)
449+
450+
if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
451+
452+
for i in range(0, len(test_indices)):
453+
454+
arff_line = [rep_no, fold_no, sample_no, i] # type: List[Any]
455+
for j, class_label in enumerate(task.class_labels):
456+
arff_line.append(proba_y[i][j])
457+
458+
arff_line.append(task.class_labels[pred_y[i]])
459+
arff_line.append(task.class_labels[test_y[i]])
460+
461+
arff_datacontent.append(arff_line)
462+
463+
if add_local_measures:
464+
_calculate_local_measure(
465+
sklearn.metrics.accuracy_score,
466+
'predictive_accuracy',
443467
)
444468

445-
arff_datacontent_fold = [] # type: List[List]
446-
if trace is not None:
447-
traces.append(trace)
448-
449-
# add client-side calculated metrics. These is used on the server as
450-
# consistency check, only useful for supervised tasks
451-
def _calculate_local_measure(sklearn_fn, openml_name):
452-
user_defined_measures_fold[openml_name] = sklearn_fn(test_y, pred_y)
453-
454-
if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
455-
456-
for i in range(0, len(test_indices)):
457-
458-
arff_line = [rep_no, fold_no, sample_no, i] # type: List[Any]
459-
for j, class_label in enumerate(task.class_labels):
460-
arff_line.append(proba_y[i][j])
461-
462-
arff_line.append(task.class_labels[pred_y[i]])
463-
arff_line.append(task.class_labels[test_y[i]])
469+
elif isinstance(task, OpenMLRegressionTask):
464470

465-
arff_datacontent.append(arff_line)
471+
for i in range(0, len(test_indices)):
472+
arff_line = [rep_no, fold_no, test_indices[i], pred_y[i], test_y[i]]
473+
arff_datacontent.append(arff_line)
466474

467-
if add_local_measures:
468-
_calculate_local_measure(
469-
sklearn.metrics.accuracy_score,
470-
'predictive_accuracy',
471-
)
472-
473-
elif isinstance(task, OpenMLRegressionTask):
474-
475-
for i in range(0, len(test_indices)):
476-
arff_line = [rep_no, fold_no, test_indices[i], pred_y[i], test_y[i]]
477-
arff_datacontent.append(arff_line)
478-
479-
if add_local_measures:
480-
_calculate_local_measure(
481-
sklearn.metrics.mean_absolute_error,
482-
'mean_absolute_error',
483-
)
475+
if add_local_measures:
476+
_calculate_local_measure(
477+
sklearn.metrics.mean_absolute_error,
478+
'mean_absolute_error',
479+
)
484480

485-
elif isinstance(task, OpenMLClusteringTask):
486-
for i in range(0, len(test_indices)):
487-
arff_line = [test_indices[i], pred_y[i]] # row_id, cluster ID
488-
arff_datacontent.append(arff_line)
481+
elif isinstance(task, OpenMLClusteringTask):
482+
for i in range(0, len(test_indices)):
483+
arff_line = [test_indices[i], pred_y[i]] # row_id, cluster ID
484+
arff_datacontent.append(arff_line)
489485

490-
else:
491-
raise TypeError(type(task))
492-
493-
arff_datacontent.extend(arff_datacontent_fold)
486+
else:
487+
raise TypeError(type(task))
494488

495-
for measure in user_defined_measures_fold:
489+
for measure in user_defined_measures_fold:
496490

497-
if measure not in user_defined_measures_per_fold:
498-
user_defined_measures_per_fold[measure] = OrderedDict()
499-
if rep_no not in user_defined_measures_per_fold[measure]:
500-
user_defined_measures_per_fold[measure][rep_no] = OrderedDict()
491+
if measure not in user_defined_measures_per_fold:
492+
user_defined_measures_per_fold[measure] = OrderedDict()
493+
if rep_no not in user_defined_measures_per_fold[measure]:
494+
user_defined_measures_per_fold[measure][rep_no] = OrderedDict()
501495

502-
if measure not in user_defined_measures_per_sample:
503-
user_defined_measures_per_sample[measure] = OrderedDict()
504-
if rep_no not in user_defined_measures_per_sample[measure]:
505-
user_defined_measures_per_sample[measure][rep_no] = OrderedDict()
506-
if fold_no not in user_defined_measures_per_sample[
507-
measure][rep_no]:
508-
user_defined_measures_per_sample[measure][rep_no][fold_no] = OrderedDict()
496+
if measure not in user_defined_measures_per_sample:
497+
user_defined_measures_per_sample[measure] = OrderedDict()
498+
if rep_no not in user_defined_measures_per_sample[measure]:
499+
user_defined_measures_per_sample[measure][rep_no] = OrderedDict()
500+
if fold_no not in user_defined_measures_per_sample[measure][rep_no]:
501+
user_defined_measures_per_sample[measure][rep_no][fold_no] = OrderedDict()
509502

510-
user_defined_measures_per_fold[measure][rep_no][
511-
fold_no] = user_defined_measures_fold[measure]
512-
user_defined_measures_per_sample[measure][rep_no][fold_no][
513-
sample_no] = user_defined_measures_fold[measure]
503+
user_defined_measures_per_fold[measure][rep_no][fold_no] = (
504+
user_defined_measures_fold[measure]
505+
)
506+
user_defined_measures_per_sample[measure][rep_no][fold_no][sample_no] = (
507+
user_defined_measures_fold[measure]
508+
)
514509

515510
if len(traces) > 0:
516-
if len(traces) != n_fit:
511+
if len(traces) != n_fit + 1:
517512
raise ValueError(
518513
'Did not find enough traces (expected {}, found {})'.format(n_fit, len(traces))
519514
)

openml/runs/trace.py

Lines changed: 23 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,7 @@ def _trace_from_arff_struct(cls, attributes, content, error_message):
283283
setup_string=None,
284284
evaluation=evaluation,
285285
selected=selected,
286-
paramaters=parameters,
286+
parameters=parameters,
287287
)
288288
trace[(repeat, fold, iteration)] = current
289289

@@ -350,15 +350,27 @@ def trace_from_xml(cls, xml):
350350

351351
@classmethod
352352
def merge_traces(cls, traces: List['OpenMLRunTrace']) -> 'OpenMLRunTrace':
353-
for i in range(1, len(traces)):
354-
if traces[i] != traces[i - 1]:
355-
raise ValueError('Cannot merge traces!')
356353

357354
merged_trace = OrderedDict() # type: OrderedDict[Tuple[int, int, int], OpenMLTraceIteration] # noqa E501
358355

356+
previous_iteration = None
359357
for trace in traces:
360358
for iteration in trace:
361-
merged_trace[(iteration.repeat, iteration.fold, iteration.iteration)] = iteration
359+
key = (iteration.repeat, iteration.fold, iteration.iteration)
360+
if previous_iteration is not None:
361+
if (
362+
list(merged_trace[previous_iteration].parameters.keys())
363+
!= list(iteration.parameters.keys())
364+
):
365+
raise ValueError(
366+
'Cannot merge traces because the parameters are not equal: {} vs {}'.
367+
format(
368+
list(merged_trace[previous_iteration].parameters.keys()),
369+
list(iteration.parameters.keys()),
370+
)
371+
)
372+
merged_trace[key] = iteration
373+
previous_iteration = key
362374

363375
return cls(None, merged_trace)
364376

@@ -410,25 +422,25 @@ def __init__(
410422
setup_string,
411423
evaluation,
412424
selected,
413-
paramaters=None,
425+
parameters=None,
414426
):
415427

416428
if not isinstance(selected, bool):
417429
raise TypeError(type(selected))
418-
if setup_string and paramaters:
430+
if setup_string and parameters:
419431
raise ValueError(
420432
'Can only be instantiated with either '
421433
'setup_string or parameters argument.'
422434
)
423-
elif not setup_string and not paramaters:
435+
elif not setup_string and not parameters:
424436
raise ValueError(
425437
'Either setup_string or parameters needs to be passed as '
426438
'argument.'
427439
)
428-
if paramaters is not None and not isinstance(paramaters, OrderedDict):
440+
if parameters is not None and not isinstance(parameters, OrderedDict):
429441
raise TypeError(
430442
'argument parameters is not an instance of OrderedDict, but %s'
431-
% str(type(paramaters))
443+
% str(type(parameters))
432444
)
433445

434446
self.repeat = repeat
@@ -437,7 +449,7 @@ def __init__(
437449
self.setup_string = setup_string
438450
self.evaluation = evaluation
439451
self.selected = selected
440-
self.parameters = paramaters
452+
self.parameters = parameters
441453

442454
def get_parameters(self):
443455
result = {}
@@ -464,14 +476,3 @@ def __str__(self):
464476
self.evaluation,
465477
self.selected,
466478
)
467-
468-
def __eq__(self, other):
469-
if not isinstance(other, OpenMLTraceIteration):
470-
return False
471-
attributes = [
472-
'repeat', 'fold', 'iteration', 'setup_string', 'evaluation', 'selected', 'paramaters',
473-
]
474-
for attr in attributes:
475-
if getattr(self, attr) != getattr(other, attr):
476-
return False
477-
return True

0 commit comments

Comments
 (0)