Skip to content

Commit 813daeb

Browse files
ArlindKadra authored and mfeurer committed
[WIP] Task upload (#607)
* Initial implementation * Further progress on task upload * changes to pr * Code refactor, implementation changed * pep8 fix * Fix * Update for the errors only on travis-ci * Fix for failing builds * Fixes in accordance with openml * Refactor and changes * Changes considering the suggestions from Matthias * Updating clustering tasks to bypass the issue * Refactoring and bug fixes * Flake fix and considering another task for classification * Changing the ClassificationTask to the test server * Testing simple solution * Addressing the comments from Matthias * Fixing unused imports * Addressing #656 * Addressing #657 * Addressing the comments from Matthias, refactoring the task classes * Update pr * Trying fix for task upload * Fix bug introduced from previous changes on perform_api_call, increase max_wait_time for task upload * Update code, increase max time for task upload * Increasing wait time for task upload * Further increase in max wait time * Added create_task function, changed the implementation for the unit tests regarding task upload * Overcoming different feature types bug * Type annotations errors * Fixing pep8 spacing * Update 1 * Update 2 * Fixing type annotations * Another try at fixing type annotations for tasks * Fixing bug with unit tests of clustering tasks, changing order for type annotations * Fix for type annotations * Update for type annotations and failing clustering tasks * Further refactoring * Important refactor * Pep8 fix * Trying change * Trying fix for overload of setUp function * Update induced bug * Trying solution for unittest inheritance * Partially addressing the comments from Matthias, pep8 fix * Addressing the comments from Matthias and a first try at the pep8 run issue * Fixing pep8 errors * Enforcing pep8 * Another try at pep8 solution * Pep8 Fix * Address type annotation warnings * pep8 fix * addressing type annotations v2 * Addressing the comments from Matthias * Minor refactor * Testing 2 possible cases of uploading a 
clustering task
1 parent f656062 commit 813daeb

15 files changed

Lines changed: 734 additions & 174 deletions

openml/datasets/dataset.py

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -669,15 +669,17 @@ def publish(self):
669669
path = os.path.abspath(self.data_file)
670670
if os.path.exists(path):
671671
try:
672-
# check if arff is valid
673-
decoder = arff.ArffDecoder()
672+
674673
with io.open(path, encoding='utf8') as fh:
674+
# check if arff is valid
675+
decoder = arff.ArffDecoder()
675676
decoder.decode(fh, encode_nominal=True)
676677
except arff.ArffException:
677678
raise ValueError("The file you have provided is not "
678679
"a valid arff file.")
679680

680-
file_elements['dataset'] = open(path, 'rb')
681+
with open(path, 'rb') as fp:
682+
file_elements['dataset'] = fp.read()
681683
else:
682684
if self.url is None:
683685
raise ValueError("No url/path to the data file was given")

openml/extensions/sklearn/extension.py

Lines changed: 30 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -1264,29 +1264,36 @@ def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarra
12641264
try:
12651265
proba_y = model_copy.predict_proba(X_test)
12661266
except AttributeError:
1267-
proba_y = _prediction_to_probabilities(pred_y, list(task.class_labels))
1268-
1269-
if proba_y.shape[1] != len(task.class_labels):
1270-
# Remap the probabilities in case there was a class missing at training time
1271-
# By default, the classification targets are mapped to be zero-based indices to the
1272-
# actual classes. Therefore, the model_classes contain the correct indices to the
1273-
# correct probability array. Example:
1274-
# classes in the dataset: 0, 1, 2, 3, 4, 5
1275-
# classes in the training set: 0, 1, 2, 4, 5
1276-
# then we need to add a column full of zeros into the probabilities for class 3
1277-
# (because the rest of the library expects that the probabilities are ordered the
1278-
# same way as the classes are ordered).
1279-
proba_y_new = np.zeros((proba_y.shape[0], len(task.class_labels)))
1280-
for idx, model_class in enumerate(model_classes):
1281-
proba_y_new[:, model_class] = proba_y[:, idx]
1282-
proba_y = proba_y_new
1283-
1284-
if proba_y.shape[1] != len(task.class_labels):
1285-
message = "Estimator only predicted for {}/{} classes!".format(
1286-
proba_y.shape[1], len(task.class_labels),
1287-
)
1288-
warnings.warn(message)
1289-
openml.config.logger.warn(message)
1267+
if task.class_labels is not None:
1268+
proba_y = _prediction_to_probabilities(pred_y, list(task.class_labels))
1269+
else:
1270+
raise ValueError('The task has no class labels')
1271+
1272+
if task.class_labels is not None:
1273+
if proba_y.shape[1] != len(task.class_labels):
1274+
# Remap the probabilities in case there was a class missing
1275+
# at training time. By default, the classification targets
1276+
# are mapped to be zero-based indices to the actual classes.
1277+
# Therefore, the model_classes contain the correct indices to
1278+
# the correct probability array. Example:
1279+
# classes in the dataset: 0, 1, 2, 3, 4, 5
1280+
# classes in the training set: 0, 1, 2, 4, 5
1281+
# then we need to add a column full of zeros into the probabilities
1282+
# for class 3 because the rest of the library expects that the
1283+
# probabilities are ordered the same way as the classes are ordered).
1284+
proba_y_new = np.zeros((proba_y.shape[0], len(task.class_labels)))
1285+
for idx, model_class in enumerate(model_classes):
1286+
proba_y_new[:, model_class] = proba_y[:, idx]
1287+
proba_y = proba_y_new
1288+
1289+
if proba_y.shape[1] != len(task.class_labels):
1290+
message = "Estimator only predicted for {}/{} classes!".format(
1291+
proba_y.shape[1], len(task.class_labels),
1292+
)
1293+
warnings.warn(message)
1294+
openml.config.logger.warn(message)
1295+
else:
1296+
raise ValueError('The task has no class labels')
12901297

12911298
elif isinstance(task, OpenMLRegressionTask):
12921299
proba_y = None

openml/runs/functions.py

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -158,6 +158,9 @@ def run_flow_on_task(
158158
if flow_tags is not None and not isinstance(flow_tags, list):
159159
raise ValueError("flow_tags should be a list")
160160

161+
if task.task_id is None:
162+
raise ValueError("The task should be published at OpenML")
163+
161164
# TODO: At some point in the future do not allow for arguments in old order (changed 6-2018).
162165
# Flexibility currently still allowed due to code-snippet in OpenML100 paper (3-2019).
163166
if isinstance(flow, OpenMLTask) and isinstance(task, OpenMLFlow):
@@ -452,11 +455,14 @@ def _calculate_local_measure(sklearn_fn, openml_name):
452455
for i, tst_idx in enumerate(test_indices):
453456

454457
arff_line = [rep_no, fold_no, sample_no, tst_idx] # type: List[Any]
455-
for j, class_label in enumerate(task.class_labels):
456-
arff_line.append(proba_y[i][j])
458+
if task.class_labels is not None:
459+
for j, class_label in enumerate(task.class_labels):
460+
arff_line.append(proba_y[i][j])
457461

458-
arff_line.append(task.class_labels[pred_y[i]])
459-
arff_line.append(task.class_labels[test_y[i]])
462+
arff_line.append(task.class_labels[pred_y[i]])
463+
arff_line.append(task.class_labels[test_y[i]])
464+
else:
465+
raise ValueError('The task has no class labels')
460466

461467
arff_datacontent.append(arff_line)
462468

openml/runs/run.py

Lines changed: 33 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -216,30 +216,45 @@ def _generate_arff_dict(self) -> 'OrderedDict[str, Any]':
216216
'openml_task_{}_predictions'.format(task.task_id)
217217

218218
if isinstance(task, OpenMLLearningCurveTask):
219-
class_labels = task.class_labels # type: ignore
220-
arff_dict['attributes'] = [('repeat', 'NUMERIC'),
221-
('fold', 'NUMERIC'),
222-
('sample', 'NUMERIC'),
223-
('row_id', 'NUMERIC')] + \
224-
[('confidence.' + class_labels[i],
225-
'NUMERIC') for i in
226-
range(len(class_labels))] + \
227-
[('prediction', class_labels),
228-
('correct', class_labels)]
219+
class_labels = task.class_labels
220+
instance_specifications = [
221+
('repeat', 'NUMERIC'),
222+
('fold', 'NUMERIC'),
223+
('sample', 'NUMERIC'),
224+
('row_id', 'NUMERIC')
225+
]
226+
227+
arff_dict['attributes'] = instance_specifications
228+
if class_labels is not None:
229+
arff_dict['attributes'] = arff_dict['attributes'] + \
230+
[('confidence.' + class_labels[i],
231+
'NUMERIC')
232+
for i in range(len(class_labels))] + \
233+
[('prediction', class_labels),
234+
('correct', class_labels)]
235+
else:
236+
raise ValueError('The task has no class labels')
237+
229238
elif isinstance(task, OpenMLClassificationTask):
230239
class_labels = task.class_labels
231240
instance_specifications = [('repeat', 'NUMERIC'),
232241
('fold', 'NUMERIC'),
233242
('sample', 'NUMERIC'), # Legacy
234243
('row_id', 'NUMERIC')]
235-
prediction_confidences = [('confidence.' + class_labels[i],
236-
'NUMERIC')
237-
for i in range(len(class_labels))]
238-
prediction_and_true = [('prediction', class_labels),
239-
('correct', class_labels)]
240-
arff_dict['attributes'] = (instance_specifications
241-
+ prediction_confidences
242-
+ prediction_and_true)
244+
245+
arff_dict['attributes'] = instance_specifications
246+
if class_labels is not None:
247+
prediction_confidences = [('confidence.' + class_labels[i],
248+
'NUMERIC')
249+
for i in range(len(class_labels))]
250+
prediction_and_true = [('prediction', class_labels),
251+
('correct', class_labels)]
252+
arff_dict['attributes'] = arff_dict['attributes'] + \
253+
prediction_confidences + \
254+
prediction_and_true
255+
else:
256+
raise ValueError('The task has no class labels')
257+
243258
elif isinstance(task, OpenMLRegressionTask):
244259
arff_dict['attributes'] = [('repeat', 'NUMERIC'),
245260
('fold', 'NUMERIC'),

openml/tasks/__init__.py

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,12 @@
88
TaskTypeEnum,
99
)
1010
from .split import OpenMLSplit
11-
from .functions import (get_task, get_tasks, list_tasks)
11+
from .functions import (
12+
create_task,
13+
get_task,
14+
get_tasks,
15+
list_tasks,
16+
)
1217

1318
__all__ = [
1419
'OpenMLTask',
@@ -17,6 +22,7 @@
1722
'OpenMLRegressionTask',
1823
'OpenMLClassificationTask',
1924
'OpenMLLearningCurveTask',
25+
'create_task',
2026
'get_task',
2127
'get_tasks',
2228
'list_tasks',

openml/tasks/functions.py

Lines changed: 67 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
import io
33
import re
44
import os
5+
from typing import Union, Optional
56
import xmltodict
67

78
from ..exceptions import OpenMLCacheException
@@ -441,3 +442,69 @@ def _create_task_from_xml(xml):
441442
raise NotImplementedError('Task type %s not supported.' %
442443
common_kwargs['task_type'])
443444
return cls(**common_kwargs)
445+
446+
447+
def create_task(
448+
task_type_id: int,
449+
dataset_id: int,
450+
estimation_procedure_id: int,
451+
target_name: Optional[str] = None,
452+
evaluation_measure: Optional[str] = None,
453+
**kwargs
454+
) -> Union[
455+
OpenMLClassificationTask, OpenMLRegressionTask,
456+
OpenMLLearningCurveTask, OpenMLClusteringTask
457+
]:
458+
"""Create a task based on different given attributes.
459+
460+
Builds a task object with the function arguments as
461+
attributes. The type of the task object built is
462+
determined from the task type id.
463+
More information on how the arguments (task attributes),
464+
relate to the different possible tasks can be found in
465+
the individual task objects at the openml.tasks.task
466+
module.
467+
468+
Parameters
469+
----------
470+
task_type_id : int
471+
Id of the task type.
472+
dataset_id : int
473+
The id of the dataset for the task.
474+
target_name : str, optional
475+
The name of the feature used as a target.
476+
At the moment, only optional for the clustering tasks.
477+
estimation_procedure_id : int
478+
The id of the estimation procedure.
479+
evaluation_measure : str, optional
480+
The name of the evaluation measure.
481+
kwargs : dict, optional
482+
Other task attributes that are not mandatory
483+
for task upload.
484+
485+
Returns
486+
-------
487+
OpenMLClassificationTask, OpenMLRegressionTask,
488+
OpenMLLearningCurveTask, OpenMLClusteringTask
489+
"""
490+
task_cls = {
491+
TaskTypeEnum.SUPERVISED_CLASSIFICATION: OpenMLClassificationTask,
492+
TaskTypeEnum.SUPERVISED_REGRESSION: OpenMLRegressionTask,
493+
TaskTypeEnum.CLUSTERING: OpenMLClusteringTask,
494+
TaskTypeEnum.LEARNING_CURVE: OpenMLLearningCurveTask,
495+
}.get(task_type_id)
496+
497+
if task_cls is None:
498+
raise NotImplementedError(
499+
'Task type {0:d} not supported.'.format(task_type_id)
500+
)
501+
else:
502+
return task_cls(
503+
task_type_id=task_type_id,
504+
task_type=None,
505+
data_set_id=dataset_id,
506+
target_name=target_name,
507+
estimation_procedure_id=estimation_procedure_id,
508+
evaluation_measure=evaluation_measure,
509+
**kwargs
510+
)

0 commit comments

Comments (0)