Skip to content

Commit 0806855

Browse files
committed
Merge branch 'origin/MAIN/improve_coverage' into develop
2 parents 7f1c0eb + 070374f commit 0806855

2 files changed

Lines changed: 61 additions & 23 deletions

File tree

openml/runs/run.py

Lines changed: 30 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -23,8 +23,8 @@ class OpenMLRun(object):
2323
FIXME
2424
2525
"""
26-
def __init__(self, task_id, flow_id, setup_string, dataset_id, files=None,
27-
setup_id=None, tags=None, uploader=None, uploader_name=None,
26+
def __init__(self, task_id, flow_id, dataset_id, setup_string=None,
27+
files=None, setup_id=None, tags=None, uploader=None, uploader_name=None,
2828
evaluations=None, detailed_evaluations=None,
2929
data_content=None, model=None, task_type=None,
3030
task_evaluation_measure=None, flow_name=None,
@@ -49,15 +49,20 @@ def __init__(self, task_id, flow_id, setup_string, dataset_id, files=None,
4949
self.flow = flow
5050
self.run_id = run_id
5151

52-
def _generate_arff(self):
53-
"""Generates an arff for upload to server.
52+
def _generate_arff_dict(self):
53+
"""Generates the arff dictionary for upload to the server.
54+
55+
Assumes that the run has been executed.
5456
5557
Returns
5658
-------
57-
arf_dict : dictionary
58-
Dictionary representation of an ARFF data format containing
59-
predictions and confidences.
59+
arf_dict : dict
60+
Dictionary representation of the ARFF file that will be uploaded.
61+
Contains predictions and information about the run environment.
6062
"""
63+
if self.data_content is None:
64+
raise ValueError('Run has not been executed.')
65+
6166
run_environment = (_get_version_information() +
6267
[time.strftime("%c")] + ['Created by run_task()'])
6368
task = get_task(self.task_id)
@@ -85,7 +90,7 @@ def publish(self):
8590
-------
8691
self : OpenMLRun
8792
"""
88-
predictions = arff.dumps(self._generate_arff())
93+
predictions = arff.dumps(self._generate_arff_dict())
8994
description_xml = self._create_description_xml()
9095
file_elements = {'predictions': ("predictions.csv", predictions),
9196
'description': ("description.xml", description_xml)}
@@ -159,11 +164,19 @@ def run_task(task, model):
159164
'only works for tasks with class labels.')
160165
setup_string = _create_setup_string(model)
161166

162-
run = OpenMLRun(task_id=task.task_id, flow_id=flow_id,
163-
setup_string=setup_string, dataset_id=dataset.dataset_id,
164-
task=task, flow=flow)
167+
run = OpenMLRun(task.task_id, flow_id, setup_string, dataset.id)
168+
run.data_content = _run_task_get_arffcontent(model, task, class_labels)
169+
170+
# The model will not be uploaded at the moment, but used to get the
171+
# hyperparameter values when uploading the run
172+
X, Y = task.get_X_and_y()
173+
run.model = model.fit(X, Y)
174+
return run
175+
165176

166-
train_times = []
177+
def _run_task_get_arffcontent(model, task, class_labels):
178+
X, Y = task.get_X_and_y()
179+
arff_datacontent = []
167180

168181
rep_no = 0
169182
# TODO use different iterator to only provide a single iterator (less
@@ -177,26 +190,21 @@ def run_task(task, model):
177190
testX = X[test_indices]
178191
testY = Y[test_indices]
179192

180-
start_time = time.time()
181193
model.fit(trainX, trainY)
182194
ProbaY = model.predict_proba(testX)
183195
PredY = model.predict(testX)
184-
end_time = time.time()
185-
186-
train_times.append(end_time - start_time)
187196

188197
for i in range(0, len(test_indices)):
189-
arff_line = [rep_no, fold_no, test_indices[i],
190-
class_labels[PredY[i]], class_labels[testY[i]]]
191-
arff_line[3:3] = ProbaY[i]
198+
arff_line = [rep_no, fold_no, test_indices[i]]
199+
arff_line.extend(ProbaY[i])
200+
arff_line.append(class_labels[PredY[i]])
201+
arff_line.append(class_labels[testY[i]])
192202
arff_datacontent.append(arff_line)
193203

194204
fold_no = fold_no + 1
195205
rep_no = rep_no + 1
196206

197-
run.data_content = arff_datacontent
198-
run.model = model.fit(X, Y)
199-
return run
207+
return arff_datacontent
200208

201209

202210
def _to_dict(taskid, flow_id, setup_string, parameter_settings, tags):

tests/runs/test_runs.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from sklearn.linear_model import LogisticRegression
1+
from sklearn.linear_model import LogisticRegression, SGDClassifier
22
import openml
33
from openml.testing import TestBase
44

@@ -12,6 +12,36 @@ def test_run_iris(self):
1212
self.assertEqual(run_, run)
1313
self.assertIsInstance(run.dataset_id, int)
1414

15+
def test__run_task_get_arffcontent(self):
16+
task = openml.tasks.get_task(1939)
17+
class_labels = task.class_labels
18+
19+
clf = SGDClassifier(loss='hinge', random_state=1)
20+
self.assertRaisesRegexp(AttributeError,
21+
"probability estimates are not available for loss='hinge'",
22+
openml.runs.run._run_task_get_arffcontent,
23+
clf, task, class_labels)
24+
25+
clf = SGDClassifier(loss='log', random_state=1)
26+
arff_datacontent = openml.runs.run._run_task_get_arffcontent(
27+
clf, task, class_labels)
28+
self.assertIsInstance(arff_datacontent, list)
29+
# 10 times 10 fold CV of 150 samples
30+
self.assertEqual(len(arff_datacontent), 1500)
31+
for arff_line in arff_datacontent:
32+
self.assertEqual(len(arff_line), 8)
33+
self.assertGreaterEqual(arff_line[0], 0)
34+
self.assertLessEqual(arff_line[0], 9)
35+
self.assertGreaterEqual(arff_line[1], 0)
36+
self.assertLessEqual(arff_line[1], 9)
37+
self.assertGreaterEqual(arff_line[2], 0)
38+
self.assertLessEqual(arff_line[2], 149)
39+
self.assertAlmostEqual(sum(arff_line[3:6]), 1.0)
40+
self.assertIn(arff_line[6], ['Iris-setosa', 'Iris-versicolor',
41+
'Iris-virginica'])
42+
self.assertIn(arff_line[7], ['Iris-setosa', 'Iris-versicolor',
43+
'Iris-virginica'])
44+
1545
def test_get_run(self):
1646
run = openml.runs.get_run(473350)
1747
self.assertEqual(run.dataset_id, 1167)

0 commit comments

Comments
 (0)