Skip to content

Commit 98aede1

Browse files
authored
Merge branch 'develop' into parallel_unit_tests
2 parents ef53267 + c53a731 commit 98aede1

5 files changed

Lines changed: 65 additions & 18 deletions

File tree

.travis.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@ env:
1515
- TEST_DIR=/tmp/test_dir/
1616
- MODULE=openml
1717
matrix:
18-
- DISTRIB="conda" PYTHON_VERSION="2.7" NUMPY_VERSION="1.11" SCIPY_VERSION="0.17.0" CYTHON_VERSION="0.21" SKLEARN_VERSION="0.18.1"
19-
- DISTRIB="conda" PYTHON_VERSION="3.4" NUMPY_VERSION="1.11" SCIPY_VERSION="0.17.0" CYTHON_VERSION="0.23.4" SKLEARN_VERSION="0.18.1"
20-
- DISTRIB="conda" PYTHON_VERSION="3.5" NUMPY_VERSION="1.11" SCIPY_VERSION="0.17.0" CYTHON_VERSION="0.23.4" SKLEARN_VERSION="0.18.1"
21-
- DISTRIB="conda" PYTHON_VERSION="3.6" COVERAGE="true" NUMPY_VERSION="1.12.1" SCIPY_VERSION="0.19.0" CYTHON_VERSION="0.25.2" SKLEARN_VERSION="0.18.1"
18+
- DISTRIB="conda" PYTHON_VERSION="2.7" SKLEARN_VERSION="0.18.2"
19+
- DISTRIB="conda" PYTHON_VERSION="3.4" SKLEARN_VERSION="0.18.2"
20+
- DISTRIB="conda" PYTHON_VERSION="3.5" SKLEARN_VERSION="0.18.2"
21+
- DISTRIB="conda" PYTHON_VERSION="3.6" COVERAGE="true" SKLEARN_VERSION="0.18.2"
2222

2323
install: source ci_scripts/install.sh
2424
script: bash ci_scripts/test.sh

ci_scripts/install.sh

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,14 @@ popd
2424

2525
# Configure the conda environment and put it in the path using the
2626
# provided versions
27-
conda create -n testenv --yes python=$PYTHON_VERSION pip nose \
28-
numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION cython=$CYTHON_VERSION \
29-
scikit-learn=$SKLEARN_VERSION pandas
27+
conda create -n testenv --yes python=$PYTHON_VERSION pip
3028
source activate testenv
29+
pip install nose numpy scipy cython scikit-learn==$SKLEARN_VERSION pandas \
30+
matplotlib jupyter notebook nbconvert nbformat jupyter_client ipython \
31+
ipykernel
3132

3233
pip install oslo.concurrency
34+
3335
pip install matplotlib jupyter notebook nbconvert nbformat jupyter_client ipython ipykernel
3436
if [[ "$COVERAGE" == "true" ]]; then
3537
pip install codecov

openml/runs/functions.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,18 @@ def _prediction_to_row(rep_no, fold_no, sample_no, row_id, correct_label,
361361

362362
# JvR: why is class labels a parameter? could be removed and taken from task object, right?
363363
def _run_task_get_arffcontent(model, task, class_labels):
364+
365+
def _prediction_to_probabilities(y, model_classes):
366+
# y: list or numpy array of predictions
367+
# model_classes: sklearn classifier mapping from original array id to prediction index id
368+
if not isinstance(model_classes, list):
369+
raise ValueError('please convert model classes to list prior to calling this fn')
370+
result = np.zeros((len(y), len(model_classes)), dtype=np.float32)
371+
for obs, prediction_idx in enumerate(y):
372+
array_idx = model_classes.index(prediction_idx)
373+
result[obs][array_idx] = 1.0
374+
return result
375+
364376
X, Y = task.get_X_and_y()
365377
arff_datacontent = []
366378
arff_tracecontent = []
@@ -428,8 +440,11 @@ def _run_task_get_arffcontent(model, task, class_labels):
428440
if can_measure_runtime:
429441
modelpredict_starttime = time.process_time()
430442

431-
ProbaY = model_fold.predict_proba(testX)
432443
PredY = model_fold.predict(testX)
444+
try:
445+
ProbaY = model_fold.predict_proba(testX)
446+
except AttributeError:
447+
ProbaY = _prediction_to_probabilities(PredY, list(model_classes))
433448

434449
# add client-side calculated metrics. These might be used on the server as consistency check
435450
def _calculate_local_measure(sklearn_fn, openml_name):
@@ -781,7 +796,7 @@ def _get_cached_run(run_id):
781796

782797

783798
def list_runs(offset=None, size=None, id=None, task=None, setup=None,
784-
flow=None, uploader=None, tag=None):
799+
flow=None, uploader=None, tag=None, display_errors=False):
785800
"""List all runs matching all of the given filters.
786801
787802
Perform API call `/run/list/{filters} <https://www.openml.org/api_docs/#!/run/get_run_list_filters>`_
@@ -805,6 +820,9 @@ def list_runs(offset=None, size=None, id=None, task=None, setup=None,
805820
806821
tag : str, optional
807822
823+
display_errors : bool, optional (default=False)
824+
Whether to list runs which have an error (for example a missing
825+
prediction file).
808826
Returns
809827
-------
810828
list
@@ -828,6 +846,8 @@ def list_runs(offset=None, size=None, id=None, task=None, setup=None,
828846
api_call += "/uploader/%s" % ','.join([str(int(i)) for i in uploader])
829847
if tag is not None:
830848
api_call += "/tag/%s" % tag
849+
if display_errors:
850+
api_call += "/show_errors/true"
831851

832852
return _list_runs(api_call)
833853

openml/runs/run.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,14 +125,14 @@ def get_metric_fn(self, sklearn_fn, kwargs={}):
125125
scores : list
126126
a list of floats, of length num_folds * num_repeats
127127
'''
128-
if self.data_content is not None:
128+
if self.data_content is not None and self.task_id is not None:
129129
predictions_arff = self._generate_arff_dict()
130130
elif 'predictions' in self.output_files:
131131
predictions_file_url = _file_id_to_url(self.output_files['predictions'], 'predictions.arff')
132132
predictions_arff = arff.loads(openml._api_calls._read_url(predictions_file_url))
133133
# TODO: make this a stream reader
134134
else:
135-
raise ValueError('Run should have been locally executed.')
135+
raise ValueError('Run should have been locally executed or contain outputfile reference.')
136136

137137
attribute_names = [att[0] for att in predictions_arff['attributes']]
138138
if 'correct' not in attribute_names:

tests/test_runs/test_run_functions.py

Lines changed: 32 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,21 @@
2727
from sklearn.linear_model import LogisticRegression, SGDClassifier, \
2828
LinearRegression
2929
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
30-
from sklearn.svm import SVC
30+
from sklearn.svm import SVC, LinearSVC
3131
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, \
3232
StratifiedKFold
3333
from sklearn.pipeline import Pipeline
3434

3535

36+
class HardNaiveBayes(GaussianNB):
37+
# class for testing a naive bayes classifier that does not allow soft predictions
38+
def __init__(self, priors=None):
39+
super(HardNaiveBayes, self).__init__(priors)
40+
41+
def predict_proba(*args, **kwargs):
42+
raise AttributeError('predict_proba is not available when probability=False')
43+
44+
3645
class TestRun(TestBase):
3746
_multiprocess_can_split_ = True
3847

@@ -710,12 +719,6 @@ def test__run_task_get_arffcontent(self):
710719
num_folds = 10
711720
num_repeats = 1
712721

713-
clf = SGDClassifier(loss='hinge', random_state=1)
714-
self.assertRaisesRegexp(AttributeError,
715-
"probability estimates are not available for loss='hinge'",
716-
openml.runs.functions._run_task_get_arffcontent,
717-
clf, task, class_labels)
718-
719722
clf = SGDClassifier(loss='log', random_state=1)
720723
res = openml.runs.functions._run_task_get_arffcontent(clf, task, class_labels)
721724
arff_datacontent, arff_tracecontent, _, fold_evaluations, sample_evaluations = res
@@ -901,3 +904,25 @@ def test_run_on_dataset_with_missing_labels(self):
901904
# repeat, fold, row_id, 6 confidences, prediction and correct label
902905
self.assertEqual(len(row), 12)
903906

907+
def test_predict_proba_hardclassifier(self):
908+
# task 1 (test server) is important, as it is a task with an unused class
909+
tasks = [1, 3, 115]
910+
911+
for task_id in tasks:
912+
task = openml.tasks.get_task(task_id)
913+
clf1 = sklearn.pipeline.Pipeline(steps=[
914+
('imputer', sklearn.preprocessing.Imputer()), ('estimator', GaussianNB())
915+
])
916+
clf2 = sklearn.pipeline.Pipeline(steps=[
917+
('imputer', sklearn.preprocessing.Imputer()), ('estimator', HardNaiveBayes())
918+
])
919+
920+
arff_content1, arff_header1, _, _, _ = _run_task_get_arffcontent(clf1, task, task.class_labels)
921+
arff_content2, arff_header2, _, _, _ = _run_task_get_arffcontent(clf2, task, task.class_labels)
922+
923+
# verifies last two arff indices (predict and correct)
924+
# TODO: programmatically check whether these are indeed features (predict, correct)
925+
predictionsA = np.array(arff_content1)[:, -2:]
926+
predictionsB = np.array(arff_content2)[:, -2:]
927+
928+
np.testing.assert_array_equal(predictionsA, predictionsB)

0 commit comments

Comments
 (0)