Skip to content

Commit 0f58378

Browse files
committed
fixed parameter trace to correct openml format
updated optimization library to "model_selection._search"
1 parent 7337d2a commit 0f58378

3 files changed

Lines changed: 63 additions & 16 deletions

File tree

openml/runs/functions.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import io
33
import os
44
import xmltodict
5-
from sklearn.grid_search import BaseSearchCV
5+
from sklearn.model_selection._search import BaseSearchCV
66

77
from .. import config
88
from ..flows import create_flow_from_model
@@ -83,13 +83,16 @@ def _run_task_get_arffcontent(model, task, class_labels):
8383

8484
model.fit(trainX, trainY)
8585
if isinstance(model, BaseSearchCV):
86-
for itt_no in range(0, len(model.grid_scores_)):
87-
current = model.grid_scores_[itt_no]
86+
for itt_no in range(0, len(model.cv_results_['mean_test_score'])):
8887
# we use the string values for True and False, as it is defined in this way by the OpenML server
8988
selected = 'false'
90-
if current.parameters == model.best_params_:
89+
if itt_no == model.best_index_:
9190
selected = 'true'
92-
arff_line = [rep_no, fold_no, itt_no, current.parameters, current.mean_validation_score, selected]
91+
test_score = model.cv_results_['mean_test_score'][itt_no]
92+
arff_line = [rep_no, fold_no, itt_no, test_score, selected]
93+
for key in model.cv_results_:
94+
if key.startswith("param_"):
95+
arff_line.append(str(model.cv_results_[key][itt_no]))
9396
arff_tracecontent.append(arff_line)
9497

9598
ProbaY = model.predict_proba(testX)

openml/runs/run.py

Lines changed: 49 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import arff
66
import xmltodict
77
from sklearn.base import BaseEstimator
8+
from sklearn.model_selection._search import BaseSearchCV
89

910
import openml
1011
from ..tasks import get_task
@@ -78,18 +79,48 @@ def _generate_arff_dict(self):
7879
arff_dict['relation'] = 'openml_task_' + str(task.task_id) + '_predictions'
7980
return arff_dict
8081

81-
def _generate_trace_arff_dict(self):
82+
def _generate_trace_arff_dict(self, model):
83+
"""Generates the arff dictionary for uploading the optimization trace to the server.
84+
85+
Assumes that the run has been executed.
86+
87+
Returns
88+
-------
89+
arff_dict : dict
90+
Dictionary representation of the ARFF file that will be uploaded.
91+
Contains information about the optimization trace.
92+
"""
8293
if self.trace_content is None:
83-
raise ValueError('No trace content avaiable. (This should never happen.)')
94+
raise ValueError('No trace content avaiable.')
95+
if not isinstance(model, BaseSearchCV):
96+
raise PyOpenMLError('Cannot generate trace on provided classifier. (This should never happen.)')
97+
8498
arff_dict = {}
8599
arff_dict['attributes'] = [('repeat', 'NUMERIC'),
86100
('fold', 'NUMERIC'),
87101
('iteration', 'NUMERIC'),
88-
('setup_string', 'STRING'),
89102
('evaluation', 'NUMERIC'),
90103
('selected', ['true', 'false'])]
104+
for key in model.cv_results_:
105+
if key.startswith("param_"):
106+
type = 'STRING'
107+
if all(isinstance(i, (bool)) for i in model.cv_results_[key]):
108+
type = ['True', 'False']
109+
elif all(isinstance(i, (int, float)) for i in model.cv_results_[key]):
110+
type = 'NUMERIC'
111+
else:
112+
values = list(set(model.cv_results_[key])) # unique values
113+
if len(values) < 100: # arbitrary number. make it an option?
114+
type = [str(i) for i in values]
115+
print(key + ": " + str(type))
116+
117+
attribute = ("parameter_" + key[6:], type)
118+
arff_dict['attributes'].append(attribute)
119+
91120
arff_dict['data'] = self.trace_content
92121
arff_dict['relation'] = 'openml_task_' + str(self.task_id) + '_predictions'
122+
123+
print(arff_dict)
93124
return arff_dict
94125

95126
def publish(self):
@@ -111,7 +142,7 @@ def publish(self):
111142
file_elements = {'predictions': ("predictions.arff", predictions),
112143
'description': ("description.xml", description_xml)}
113144
if self.trace_content is not None:
114-
trace_arff = arff.dumps(self._generate_trace_arff_dict())
145+
trace_arff = arff.dumps(self._generate_trace_arff_dict(self.model))
115146
file_elements['trace'] = ("trace.arff", trace_arff)
116147

117148
return_code, return_value = _perform_api_call(
@@ -149,6 +180,16 @@ def _create_description_xml(self):
149180
return description_xml
150181

151182
def _parse_parameters(model, flow):
183+
"""Extracts all parameter settings from a model in OpenML format.
184+
185+
Parameters
186+
----------
187+
model
188+
the scikit-learn model (fitted)
189+
flow
190+
openml flow object (containing flow ids, i.e., it has to be downloaded from the server)
191+
192+
"""
152193
python_param_settings = model.get_params()
153194
openml_param_settings = []
154195
flow_dict = openml.flows.get_flow_dict(flow)
@@ -160,12 +201,13 @@ def _parse_parameters(model, flow):
160201
if isinstance(python_param_settings[param], BaseEstimator):
161202
# extract parameters of the subflow individually
162203
subflow = flow.components[param]
204+
openml_param_settings += _parse_parameters(python_param_settings[param], subflow)
163205

164206
# add parameter setting (also the subflow. Just because we can)
165207
param_dict = OrderedDict()
166-
param_dict['oml:name'] = param;
167-
param_dict['oml:value'] = str(python_param_settings[param]);
168-
param_dict['oml:component'] = flow_dict[flow.name];
208+
param_dict['oml:name'] = param
209+
param_dict['oml:value'] = str(python_param_settings[param])
210+
param_dict['oml:component'] = flow_dict[flow.name]
169211
openml_param_settings.append(param_dict)
170212

171213
return openml_param_settings

tests/runs/test_run_functions.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from sklearn.linear_model import LogisticRegression, SGDClassifier
22
from sklearn.ensemble import RandomForestClassifier
3-
from sklearn.grid_search import RandomizedSearchCV
3+
from sklearn.model_selection import RandomizedSearchCV
44
import openml
55
from openml.testing import TestBase
66

@@ -16,16 +16,18 @@ def test_run_iris(self):
1616

1717
def test_run_optimize_randomforest_iris(self):
1818
task = openml.tasks.get_task(10107)
19-
clf = RandomForestClassifier(n_estimators=5)
19+
numIterations = 5
20+
21+
22+
clf = RandomForestClassifier(n_estimators=numIterations)
2023

2124
param_dist = {"max_depth": [3, None],
2225
"max_features": [1,2,3,4],
2326
"min_samples_split": [1,2,3,4,5,6,7,8,9,10],
2427
"min_samples_leaf": [1,2,3,4,5,6,7,8,9,10],
2528
"bootstrap": [True, False],
2629
"criterion": ["gini", "entropy"]}
27-
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,n_iter=20)
28-
30+
random_search = RandomizedSearchCV(clf, param_dist,n_iter=20)
2931

3032
run = openml.runs.run_task(task, random_search)
3133
run_ = run.publish()

0 commit comments

Comments
 (0)