Skip to content

Commit 2796b9a

Browse files
committed
Adding Perrone example for building surrogate
1 parent 8eac076 commit 2796b9a

1 file changed

Lines changed: 168 additions & 1 deletion

File tree

examples/40_paper/2018_neurips_perrone_example.py

Lines changed: 168 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,5 +13,172 @@
1313
| In *Advances in Neural Information Processing Systems 31*, 2018
1414
| Available at http://papers.nips.cc/paper/7917-scalable-hyperparameter-transfer-learning.pdf
1515
16-
This is currently a placeholder.
16+
This example demonstrates how OpenML runs can be used to construct a surrogate model.
17+
18+
In the following sections, we will:
19+
20+
* Retrieve tasks and flows as used in the experiments by Perrone et al.
21+
* Build a table of data by fetching the evaluations uploaded to OpenML
22+
* Impute missing values and handle categorical data before building a Random Forest model that
23+
maps hyperparameter values to the area under the ROC curve score
24+
"""
25+
26+
############################################################################
27+
import openml
28+
import numpy as np
29+
import pandas as pd
30+
from sklearn.impute import SimpleImputer
31+
from sklearn.preprocessing import OneHotEncoder
32+
from sklearn.ensemble import RandomForestRegressor
33+
34+
user_id = 2702
35+
############################################################################
36+
37+
"""
38+
The subsequent functions are defined to fetch tasks, flows, evaluations and preprocess them into
39+
a tabular format that can be used to build models.
1740
"""
41+
42+
def fetch_evaluations(run_full=False, flow_type='svm', metric='area_under_roc_curve'):
    """Fetch a list of evaluations based on the flows and tasks used in the experiments.

    Parameters
    ----------
    run_full : boolean
        If True, use the full list of tasks used in the paper
        If False, use 5 tasks with the smallest number of evaluations available
    flow_type : str, {'svm', 'xgboost'}
        To select whether svm or xgboost experiments are to be run.
        Any value other than 'svm' selects the xgboost tasks and flow.
    metric : str
        The evaluation measure that is passed to openml.evaluations.list_evaluations

    Returns
    -------
    eval_df : dataframe
        The evaluations returned by openml.evaluations.list_evaluations
    task_ids : list
        The task IDs that were queried
    flow_id : int
        The flow ID that was queried
    """
    is_svm = flow_type == 'svm'

    # Collecting task IDs as used by the experiments from the paper
    if is_svm and run_full:
        task_ids = [10101, 145878, 146064, 14951, 34537, 3485, 3492, 3493, 3494, 37, 3889, 3891,
                    3899, 3902, 3903, 3913, 3918, 3950, 9889, 9914, 9946, 9952, 9967, 9971, 9976,
                    9978, 9980, 9983]
    elif is_svm:
        task_ids = [9983, 3485, 3902, 3903, 145878]
    elif run_full:
        task_ids = [10093, 10101, 125923, 145847, 145857, 145862, 145872, 145878, 145953, 145972,
                    145976, 145979, 146064, 14951, 31, 3485, 3492, 3493, 37, 3896, 3903, 3913,
                    3917, 3918, 3, 49, 9914, 9946, 9952, 9967]
    else:
        task_ids = [3903, 37, 3485, 49, 3913]

    # Fetching the relevant flow
    flow_id = 5891 if is_svm else 6767

    # Fetching evaluations; the uploader is the module-level ``user_id`` so that the
    # ID is defined in a single place instead of being repeated as a literal here.
    eval_df = openml.evaluations.list_evaluations(function=metric, task=task_ids, flow=[flow_id],
                                                  uploader=[user_id], output_format='dataframe')
    return eval_df, task_ids, flow_id
83+
84+
85+
def create_table_from_evaluations(eval_df, flow_type='svm', run_count=np.iinfo(np.int64).max,
                                  metric='area_under_roc_curve', task_ids=None):
    """Create a table of hyperparameter settings and its ground truth from evaluations.

    Optionally, can filter out records based on task ids.

    Parameters
    ----------
    eval_df : dataframe
        Containing list of runs as obtained from list_evaluations()
    flow_type : str, {'svm', 'xgboost'}
        To select whether svm or xgboost experiments are to be run.
        Any value other than 'svm' selects the xgboost hyperparameter columns.
    run_count : int
        Maximum size of the table created, or number of runs included in the table
    metric : str
        The evaluation measure that is read from each run's evaluations
    task_ids : list, (optional)
        List of integers specifying the tasks to be retained from the evaluations dataframe

    Returns
    -------
    eval_table : dataframe
        One row per run (indexed by run ID), one column per hyperparameter.
        Hyperparameters that a run does not report are left as NaN.
    values : list
        The value of ``metric`` for each run, in the same order as the rows of eval_table
    """
    if task_ids is not None:
        eval_df = eval_df.loc[eval_df.task_id.isin(task_ids)]

    # The column names are the hyperparameters collected for the chosen flow
    if flow_type == 'svm':
        colnames = ['cost', 'degree', 'gamma', 'kernel']
    else:
        colnames = ['alpha', 'booster', 'colsample_bylevel', 'colsample_bytree', 'eta', 'lambda',
                    'max_depth', 'min_child_weight', 'nrounds', 'subsample']

    eval_df = eval_df.sample(frac=1)  # shuffling rows
    run_ids = eval_df.run_id[:run_count]

    # dtype=object: the cells are filled with the raw parameter values of each run,
    # which include strings; writing those into a float column is deprecated in pandas.
    eval_table = pd.DataFrame(np.nan, index=run_ids, columns=colnames, dtype=object)
    values = []
    for run_id in run_ids:
        run = openml.runs.get_run(run_id)
        for setting in run.parameter_settings:
            name, value = setting['oml:name'], setting['oml:value']
            if name in colnames:
                eval_table.loc[run_id, name] = value
        values.append(run.evaluations[metric])
    return eval_table, values
132+
133+
134+
def impute_missing_values(eval_table, flow_type='svm'):
    """Replace the missing values of a hyperparameter table with fixed placeholders.

    Parameters
    ----------
    eval_table : dataframe
        Table of hyperparameter settings; it is modified in place
    flow_type : str, {'svm', 'xgboost'}
        Selects the categorical column: 'kernel' for 'svm', 'booster' for any other value

    Returns
    -------
    eval_table : dataframe
        The same object that was passed in, with no NaNs left
    """
    # Replacing NaNs with fixed values outside the range of the parameters
    # given in the supplement material of the paper
    categorical = 'kernel' if flow_type == 'svm' else 'booster'

    # Assign the filled column back explicitly: an in-place fillna on a column
    # pulled out of the frame is a chained assignment, which is not guaranteed
    # to write through to the frame.
    eval_table[categorical] = eval_table[categorical].fillna("None")
    eval_table.fillna(-1, inplace=True)
    return eval_table
144+
145+
146+
def preprocess(eval_table, flow_type='svm'):
    """Impute missing values and one-hot encode the categorical hyperparameter.

    Parameters
    ----------
    eval_table : dataframe
        Table of hyperparameter settings; it is passed to impute_missing_values()
    flow_type : str, {'svm', 'xgboost'}
        Selects the categorical column: 'kernel' for 'svm', 'booster' for any other value

    Returns
    -------
    eval_table : numpy array of floats
        The remaining columns followed by the one-hot columns of the categorical one
    """
    eval_table = impute_missing_values(eval_table, flow_type)

    # The categorical column depends on the flow: the xgboost table has a 'booster'
    # column and no 'kernel' column, so the column must not be hard-coded.
    categorical = 'kernel' if flow_type == 'svm' else 'booster'

    # Encode categorical variables as one-hot vectors
    enc = OneHotEncoder(handle_unknown='ignore')
    categories = eval_table[categorical].to_numpy().reshape(-1, 1)
    one_hots = enc.fit_transform(categories).toarray()

    # ``columns=`` instead of a positional axis, which newer pandas no longer accepts
    remaining = eval_table.drop(columns=categorical)
    return np.hstack((remaining, one_hots)).astype(float)
157+
158+
159+
#############################################################################
# Fetching the tasks and evaluations
# ==================================
# Read the tasks and their evaluations for the SVM flow, collate them into a table and
# preprocess all the retrieved evaluations.

eval_df, task_ids, flow_id = fetch_evaluations(run_full=False, flow_type='svm')
X, y = create_table_from_evaluations(eval_df, flow_type='svm', run_count=1000)
X = preprocess(X, flow_type='svm')


#############################################################################
# Building a surrogate model on a task's evaluation
# =================================================
# The same set of functions can be used for a single task to retrieve a single table, which can
# then be used to construct the surrogate model. We use the SVM flow here to keep the
# execution time short.

# Selecting a task
task_id = task_ids[-1]
X, y = create_table_from_evaluations(eval_df, flow_type='svm', run_count=1000,
                                     task_ids=[task_id])
X = preprocess(X, flow_type='svm')

# Surrogate model
clf = RandomForestRegressor(max_depth=3, n_estimators=50)
clf.fit(X, y)

0 commit comments

Comments
 (0)