Skip to content

Commit 38e02ef

Browse files
committed
simplify extension interface
1 parent 4152f91 commit 38e02ef

3 files changed

Lines changed: 113 additions & 137 deletions

File tree

openml/extensions/extension_interface.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
from abc import ABC, abstractmethod
22
from collections import OrderedDict # noqa: F401
3-
from typing import Any, Dict, List, Optional, Tuple, TYPE_CHECKING
3+
from typing import Any, Dict, List, Optional, Tuple, TYPE_CHECKING, Union
4+
5+
import numpy as np
6+
import scipy.sparse
7+
import pandas as pd
48

59
# Avoid import cycles: https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles
610
if TYPE_CHECKING:
@@ -147,10 +151,14 @@ def _run_model_on_fold(
147151
self,
148152
model: Any,
149153
task: 'OpenMLTask',
154+
X_train: Union[np.ndarray, scipy.sparse.spmatrix, pd.DataFrame],
155+
y_train: np.ndarray,
150156
rep_no: int,
151157
fold_no: int,
152158
sample_no: int,
153159
add_local_measures: bool,
160+
X_test: Optional[Union[np.ndarray, scipy.sparse.spmatrix, pd.DataFrame]] = None,
161+
n_classes: Optional[int] = None,
154162
) -> Tuple[List[List], List[List], 'OrderedDict[str, float]', Any]:
155163
"""Run a model on a repeat,fold,subsample triplet of the task and return prediction information.
156164

openml/extensions/sklearn/extension.py

Lines changed: 22 additions & 133 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,9 @@
1212
import warnings
1313

1414
import numpy as np
15+
import pandas as pd
1516
import scipy.stats
17+
import scipy.sparse
1618
import sklearn.base
1719
import sklearn.model_selection
1820
import sklearn.pipeline
@@ -1096,11 +1098,15 @@ def _run_model_on_fold(
10961098
self,
10971099
model: Any,
10981100
task: 'OpenMLTask',
1101+
X_train: Union[np.ndarray, scipy.sparse.spmatrix, pd.DataFrame],
1102+
y_train: np.ndarray,
10991103
rep_no: int,
11001104
fold_no: int,
11011105
sample_no: int,
11021106
add_local_measures: bool,
1103-
) -> Tuple[List[List], List[List], 'OrderedDict[str, float]', Any]:
1107+
X_test: Optional[Union[np.ndarray, scipy.sparse.spmatrix, pd.DataFrame]] = None,
1108+
n_classes: Optional[int] = None,
1109+
) -> Tuple[np.ndarray, np.ndarray, 'OrderedDict[str, float]', Any]:
11041110
"""Run a model on a repeat,fold,subsample triplet of the task and return prediction
11051111
information.
11061112
@@ -1191,20 +1197,6 @@ def _prediction_to_probabilities(
11911197
can_measure_cputime = self._can_measure_cputime(model_copy)
11921198
can_measure_wallclocktime = self._can_measure_wallclocktime(model_copy)
11931199

1194-
train_indices, test_indices = task.get_train_test_split_indices(
1195-
repeat=rep_no, fold=fold_no, sample=sample_no)
1196-
if isinstance(task, OpenMLSupervisedTask):
1197-
x, y = task.get_X_and_y()
1198-
train_x = x[train_indices]
1199-
train_y = y[train_indices]
1200-
test_x = x[test_indices]
1201-
test_y = y[test_indices]
1202-
elif isinstance(task, OpenMLClusteringTask):
1203-
train_x = train_indices
1204-
test_x = test_indices
1205-
else:
1206-
raise NotImplementedError(task.task_type)
1207-
12081200
user_defined_measures = OrderedDict() # type: 'OrderedDict[str, float]'
12091201

12101202
try:
@@ -1213,9 +1205,9 @@ def _prediction_to_probabilities(
12131205
modelfit_start_walltime = time.time()
12141206

12151207
if isinstance(task, OpenMLSupervisedTask):
1216-
model_copy.fit(train_x, train_y)
1208+
model_copy.fit(X_train, y_train)
12171209
elif isinstance(task, OpenMLClusteringTask):
1218-
model_copy.fit(train_x)
1210+
model_copy.fit(X_train)
12191211

12201212
modelfit_dur_cputime = (time.process_time() - modelfit_start_cputime) * 1000
12211213
if can_measure_cputime:
@@ -1229,11 +1221,6 @@ def _prediction_to_probabilities(
12291221
# typically happens when training a regressor on classification task
12301222
raise PyOpenMLError(str(e))
12311223

1232-
# extract trace, if applicable
1233-
arff_tracecontent = [] # type: List[List]
1234-
if self.is_hpo_class(model_copy):
1235-
arff_tracecontent.extend(self._extract_trace_data(model_copy, rep_no, fold_no))
1236-
12371224
if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
12381225
# search for model classes_ (might differ depending on modeltype)
12391226
# first, pipelines are a special case (these don't have a classes_
@@ -1254,7 +1241,7 @@ def _prediction_to_probabilities(
12541241

12551242
# In supervised learning this returns the predictions for Y, in clustering
12561243
# it returns the clusters
1257-
pred_y = model_copy.predict(test_x)
1244+
pred_y = model_copy.predict(X_test)
12581245

12591246
if can_measure_cputime:
12601247
modelpredict_duration_cputime = (time.process_time()
@@ -1268,133 +1255,35 @@ def _prediction_to_probabilities(
12681255
user_defined_measures['wall_clock_time_millis'] = (modelfit_dur_walltime
12691256
+ modelpredict_duration_walltime)
12701257

1271-
# add client-side calculated metrics. These are used on the server as a
1272-
# consistency check, only useful for supervised tasks
1273-
def _calculate_local_measure(sklearn_fn, openml_name):
1274-
user_defined_measures[openml_name] = sklearn_fn(test_y, pred_y)
1275-
1276-
# Task type specific outputs
1277-
arff_datacontent = []
1278-
12791258
if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
12801259

12811260
try:
1282-
proba_y = model_copy.predict_proba(test_x)
1261+
proba_y = model_copy.predict_proba(X_test)
12831262
except AttributeError:
12841263
proba_y = _prediction_to_probabilities(pred_y, list(model_classes))
12851264

1265+
pred_y = np.array([model_classes[label] for label in pred_y], dtype=pred_y.dtype)
1266+
proba_y_new = np.zeros((proba_y.shape[0], n_classes))
1267+
for idx, class_idx in enumerate(model_classes):
1268+
proba_y_new[:, class_idx] = proba_y[:, idx]
1269+
proba_y = proba_y_new
1270+
12861271
if proba_y.shape[1] != len(task.class_labels):
12871272
warnings.warn(
1288-
"Repeat %d Fold %d: estimator only predicted for %d/%d classes!"
1289-
% (rep_no, fold_no, proba_y.shape[1], len(task.class_labels))
1273+
"Repeat %d fold %d sample %d: estimator only predicted for %d/%d classes!"
1274+
% (rep_no, fold_no, sample_no, proba_y.shape[1], len(task.class_labels))
12901275
)
12911276

1292-
if add_local_measures:
1293-
_calculate_local_measure(sklearn.metrics.accuracy_score,
1294-
'predictive_accuracy')
1295-
1296-
for i in range(0, len(test_indices)):
1297-
arff_line = self._prediction_to_row(
1298-
rep_no=rep_no,
1299-
fold_no=fold_no,
1300-
sample_no=sample_no,
1301-
row_id=test_indices[i],
1302-
correct_label=task.class_labels[test_y[i]],
1303-
predicted_label=pred_y[i],
1304-
predicted_probabilities=proba_y[i],
1305-
class_labels=task.class_labels,
1306-
model_classes_mapping=model_classes,
1307-
)
1308-
arff_datacontent.append(arff_line)
1309-
13101277
elif isinstance(task, OpenMLRegressionTask):
1311-
if add_local_measures:
1312-
_calculate_local_measure(
1313-
sklearn.metrics.mean_absolute_error,
1314-
'mean_absolute_error',
1315-
)
1316-
1317-
for i in range(0, len(test_indices)):
1318-
arff_line = [rep_no, fold_no, test_indices[i], pred_y[i], test_y[i]]
1319-
arff_datacontent.append(arff_line)
1278+
proba_y = None
13201279

13211280
elif isinstance(task, OpenMLClusteringTask):
1322-
for i in range(0, len(test_indices)):
1323-
arff_line = [test_indices[i], pred_y[i]] # row_id, cluster ID
1324-
arff_datacontent.append(arff_line)
1281+
proba_y = None
13251282

13261283
else:
13271284
raise TypeError(type(task))
13281285

1329-
return arff_datacontent, arff_tracecontent, user_defined_measures, model_copy
1330-
1331-
def _prediction_to_row(
1332-
self,
1333-
rep_no: int,
1334-
fold_no: int,
1335-
sample_no: int,
1336-
row_id: int,
1337-
correct_label: str,
1338-
predicted_label: int,
1339-
predicted_probabilities: np.ndarray,
1340-
class_labels: List,
1341-
model_classes_mapping: List,
1342-
) -> List:
1343-
"""Util function that turns probability estimates of a classifier for a
1344-
given instance into the right arff format to upload to openml.
1345-
1346-
Parameters
1347-
----------
1348-
rep_no : int
1349-
The repeat of the experiment (0-based; in case of 1 time CV,
1350-
always 0)
1351-
fold_no : int
1352-
The fold nr of the experiment (0-based; in case of holdout,
1353-
always 0)
1354-
sample_no : int
1355-
In case of learning curves, the index of the subsample (0-based;
1356-
in case of no learning curve, always 0)
1357-
row_id : int
1358-
row id in the initial dataset
1359-
correct_label : str
1360-
original label of the instance
1361-
predicted_label : str
1362-
the label that was predicted
1363-
predicted_probabilities : array (size=num_classes)
1364-
probabilities per class
1365-
class_labels : array (size=num_classes)
1366-
model_classes_mapping : list
1367-
A list of classes the model produced.
1368-
Obtained by BaseEstimator.classes_
1369-
1370-
Returns
1371-
-------
1372-
arff_line : list
1373-
representation of the current prediction in OpenML format
1374-
"""
1375-
if not isinstance(rep_no, (int, np.integer)):
1376-
raise ValueError('rep_no should be int')
1377-
if not isinstance(fold_no, (int, np.integer)):
1378-
raise ValueError('fold_no should be int')
1379-
if not isinstance(sample_no, (int, np.integer)):
1380-
raise ValueError('sample_no should be int')
1381-
if not isinstance(row_id, (int, np.integer)):
1382-
raise ValueError('row_id should be int')
1383-
if not len(predicted_probabilities) == len(model_classes_mapping):
1384-
raise ValueError('len(predicted_probabilities) != len(class_labels)')
1385-
1386-
arff_line = [rep_no, fold_no, sample_no, row_id] # type: List[Any]
1387-
for class_label_idx in range(len(class_labels)):
1388-
if class_label_idx in model_classes_mapping:
1389-
index = np.where(model_classes_mapping == class_label_idx)[0][0]
1390-
# TODO: WHY IS THIS 2D???
1391-
arff_line.append(predicted_probabilities[index])
1392-
else:
1393-
arff_line.append(0.0)
1394-
1395-
arff_line.append(class_labels[predicted_label])
1396-
arff_line.append(correct_label)
1397-
return arff_line
1286+
return pred_y, proba_y, user_defined_measures, model_copy
13981287

13991288
def _extract_trace_data(self, model, rep_no, fold_no):
14001289
arff_tracecontent = []

openml/runs/functions.py

Lines changed: 82 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
from typing import Any, List, Optional, Set, Tuple, Union, TYPE_CHECKING # noqa F401
55
import warnings
66

7+
import numpy as np
8+
import sklearn.metrics
79
import xmltodict
810

911
import openml
@@ -16,7 +18,8 @@
1618
from ..flows import get_flow, flow_exists, OpenMLFlow
1719
from ..setups import setup_exists, initialize_model
1820
from ..exceptions import OpenMLCacheException, OpenMLServerException, OpenMLRunsExistError
19-
from ..tasks import OpenMLTask
21+
from ..tasks import OpenMLTask, OpenMLClassificationTask, OpenMLClusteringTask, \
22+
OpenMLRegressionTask, OpenMLSupervisedTask, OpenMLLearningCurveTask
2023
from .run import OpenMLRun
2124
from .trace import OpenMLRunTrace
2225
from ..tasks import TaskTypeEnum
@@ -391,24 +394,100 @@ def _run_task_get_arffcontent(
391394
# TODO use different iterator to only provide a single iterator (less
392395
# methods, less maintenance, less confusion)
393396
num_reps, num_folds, num_samples = task.get_split_dimensions()
397+
n_classes = None
394398

395399
for rep_no in range(num_reps):
396400
for fold_no in range(num_folds):
397401
for sample_no in range(num_samples):
402+
403+
train_indices, test_indices = task.get_train_test_split_indices(
404+
repeat=rep_no, fold=fold_no, sample=sample_no)
405+
if isinstance(task, OpenMLSupervisedTask):
406+
x, y = task.get_X_and_y()
407+
train_x = x[train_indices]
408+
train_y = y[train_indices]
409+
test_x = x[test_indices]
410+
test_y = y[test_indices]
411+
if isinstance(task, (OpenMLClassificationTask, OpenMLClassificationTask)):
412+
n_classes = len(task.class_labels)
413+
elif isinstance(task, OpenMLClusteringTask):
414+
train_x = train_indices
415+
train_y = None
416+
test_x = test_indices
417+
test_y = None
418+
else:
419+
raise NotImplementedError(task.task_type)
420+
398421
(
399-
arff_datacontent_fold,
400-
arff_tracecontent_fold,
422+
pred_y,
423+
proba_y,
401424
user_defined_measures_fold,
402425
model_fold,
403426
) = extension._run_model_on_fold(
404427
model=model,
405428
task=task,
429+
X_train=train_x,
430+
y_train=train_y,
406431
rep_no=rep_no,
407432
fold_no=fold_no,
408433
sample_no=sample_no,
409434
add_local_measures=add_local_measures,
435+
X_test=test_x,
436+
n_classes=n_classes,
410437
)
411438

439+
arff_datacontent_fold = [] # type: List[List]
440+
# extract trace, if applicable
441+
arff_tracecontent_fold = [] # type: List[List]
442+
if extension.is_hpo_class(model_fold):
443+
arff_tracecontent_fold.extend(
444+
extension._extract_trace_data(model_fold, rep_no, fold_no)
445+
)
446+
447+
# add client-side calculated metrics. These are used on the server as a
448+
# consistency check, only useful for supervised tasks
449+
def _calculate_local_measure(sklearn_fn, openml_name):
450+
user_defined_measures_fold[openml_name] = sklearn_fn(test_y, pred_y)
451+
452+
if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
453+
454+
for i in range(0, len(test_indices)):
455+
456+
arff_line = [rep_no, fold_no, sample_no, i] # type: List[Any]
457+
for j, class_label in enumerate(task.class_labels):
458+
arff_line.append(proba_y[i][j])
459+
460+
arff_line.append(task.class_labels[pred_y[i]])
461+
arff_line.append(task.class_labels[test_y[i]])
462+
463+
arff_datacontent.append(arff_line)
464+
465+
if add_local_measures:
466+
_calculate_local_measure(
467+
sklearn.metrics.accuracy_score,
468+
'predictive_accuracy',
469+
)
470+
471+
elif isinstance(task, OpenMLRegressionTask):
472+
473+
for i in range(0, len(test_indices)):
474+
arff_line = [rep_no, fold_no, test_indices[i], pred_y[i], test_y[i]]
475+
arff_datacontent.append(arff_line)
476+
477+
if add_local_measures:
478+
_calculate_local_measure(
479+
sklearn.metrics.mean_absolute_error,
480+
'mean_absolute_error',
481+
)
482+
483+
elif isinstance(task, OpenMLClusteringTask):
484+
for i in range(0, len(test_indices)):
485+
arff_line = [test_indices[i], pred_y[i]] # row_id, cluster ID
486+
arff_datacontent.append(arff_line)
487+
488+
else:
489+
raise TypeError(type(task))
490+
412491
arff_datacontent.extend(arff_datacontent_fold)
413492
arff_tracecontent.extend(arff_tracecontent_fold)
414493

0 commit comments

Comments
 (0)