Skip to content

Commit 56fa7f9

Browse files
authored
Merge pull request #832 from openml/transfer_learning_example
Adding Perrone example for building surrogate
2 parents a5b35e6 + f6a2a95 commit 56fa7f9

1 file changed

Lines changed: 242 additions & 1 deletion

File tree

examples/40_paper/2018_neurips_perrone_example.py

Lines changed: 242 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,5 +13,246 @@
1313
| In *Advances in Neural Information Processing Systems 31*, 2018
1414
| Available at http://papers.nips.cc/paper/7917-scalable-hyperparameter-transfer-learning.pdf
1515
16-
This is currently a placeholder.
16+
This example demonstrates how OpenML runs can be used to construct a surrogate model.
17+
18+
In the following section, we shall do the following:
19+
20+
* Retrieve tasks and flows as used in the experiments by Perrone et al. (2018).
21+
* Build tabular data by fetching the evaluations uploaded to OpenML.
22+
* Impute missing values and handle categorical data before building a Random Forest model that
23+
maps hyperparameter values to the area under curve score.
1724
"""
25+
26+
############################################################################
27+
import openml
28+
import numpy as np
29+
import pandas as pd
30+
from matplotlib import pyplot as plt
31+
from sklearn.pipeline import Pipeline
32+
from sklearn.impute import SimpleImputer
33+
from sklearn.compose import ColumnTransformer
34+
from sklearn.metrics import mean_squared_error
35+
from sklearn.preprocessing import OneHotEncoder
36+
from sklearn.ensemble import RandomForestRegressor
37+
38+
flow_type = 'svm' # this example will use the smaller svm flow evaluations
39+
############################################################################
40+
# The subsequent functions are defined to fetch tasks, flows, evaluations and preprocess them into
41+
# a tabular format that can be used to build models.
42+
#
43+
44+
def fetch_evaluations(run_full=False,
                      flow_type='svm',
                      metric='area_under_roc_curve'):
    """
    Fetch a list of evaluations based on the flows and tasks used in the experiments.

    Parameters
    ----------
    run_full : boolean
        If True, use the full list of tasks used in the paper
        If False, use 5 tasks with the smallest number of evaluations available
    flow_type : str, {'svm', 'xgboost'}
        To select whether svm or xgboost experiments are to be run
    metric : str
        The evaluation measure that is passed to openml.evaluations.list_evaluations

    Returns
    -------
    eval_df : dataframe
        All evaluations listed on OpenML for the selected flow, tasks and metric
    task_ids : list
        The OpenML task IDs that were queried
    flow_id : int
        The OpenML flow ID corresponding to flow_type

    Raises
    ------
    ValueError
        If flow_type is not one of 'svm' or 'xgboost'.
    """
    # Fail loudly on an unknown flow_type instead of silently falling through
    # to the xgboost task list (the previous behavior).
    if flow_type not in ('svm', 'xgboost'):
        raise ValueError("flow_type must be 'svm' or 'xgboost', got {!r}".format(flow_type))

    # Collecting task IDs as used by the experiments from the paper
    if flow_type == 'svm' and run_full:
        task_ids = [
            10101, 145878, 146064, 14951, 34537, 3485, 3492, 3493, 3494,
            37, 3889, 3891, 3899, 3902, 3903, 3913, 3918, 3950, 9889,
            9914, 9946, 9952, 9967, 9971, 9976, 9978, 9980, 9983,
        ]
    elif flow_type == 'svm':
        task_ids = [9983, 3485, 3902, 3903, 145878]
    elif run_full:  # flow_type == 'xgboost'
        task_ids = [
            10093, 10101, 125923, 145847, 145857, 145862, 145872, 145878,
            145953, 145972, 145976, 145979, 146064, 14951, 31, 3485,
            3492, 3493, 37, 3896, 3903, 3913, 3917, 3918, 3, 49, 9914,
            9946, 9952, 9967,
        ]
    else:  # flow_type == 'xgboost' and not run_full
        task_ids = [3903, 37, 3485, 49, 3913]

    # Fetching the relevant flow
    flow_id = 5891 if flow_type == 'svm' else 6767

    # Fetching evaluations; restricted to runs from uploader 2702
    # (presumably the paper's experiment uploads — verify against OpenML)
    eval_df = openml.evaluations.list_evaluations(function=metric,
                                                  task=task_ids,
                                                  flow=[flow_id],
                                                  uploader=[2702],
                                                  output_format='dataframe')
    return eval_df, task_ids, flow_id
95+
96+
97+
def create_table_from_evaluations(eval_df,
                                  flow_type='svm',
                                  run_count=np.iinfo(np.int64).max,
                                  metric='area_under_roc_curve',
                                  task_ids=None):
    """
    Create tabular data with its ground truth from a dataframe of evaluations.
    Optionally, can filter out records based on task ids.

    Parameters
    ----------
    eval_df : dataframe
        Containing list of runs as obtained from list_evaluations()
    flow_type : str, {'svm', 'xgboost'}
        To select whether svm or xgboost experiments are to be run
    run_count : int
        Maximum size of the table created, or number of runs included in the table
    metric : str
        The evaluation measure that is passed to openml.evaluations.list_evaluations
    task_ids : list, (optional)
        List of integers specifying the tasks to be retained from the evaluations dataframe

    Returns
    -------
    eval_table : dataframe
    values : list
    """
    svm_cols = ['cost', 'degree', 'gamma', 'kernel']
    xgboost_cols = [
        'alpha', 'booster', 'colsample_bylevel', 'colsample_bytree',
        'eta', 'lambda', 'max_depth', 'min_child_weight', 'nrounds',
        'subsample',
    ]
    colnames = svm_cols if flow_type == 'svm' else xgboost_cols

    if task_ids is not None:
        eval_df = eval_df[eval_df['task_id'].isin(task_ids)]

    # Shuffle rows so that truncating to run_count picks a random subset
    eval_df = eval_df.sample(frac=1)
    run_ids = eval_df['run_id'][:run_count]

    eval_table = pd.DataFrame(np.nan, index=run_ids, columns=colnames)
    values = []
    for run in openml.runs.get_runs(run_ids):
        for setting in run.parameter_settings:
            param_name = setting['oml:name']
            if param_name in colnames:
                eval_table.loc[run.run_id, param_name] = setting['oml:value']
        values.append(run.evaluations[metric])
    return eval_table, values
147+
148+
149+
def list_categorical_attributes(flow_type='svm'):
    """Return the names of the categorical hyperparameters for the given flow type."""
    categorical_by_flow = {'svm': ['kernel']}
    # Any flow type other than 'svm' uses the xgboost categorical column
    return categorical_by_flow.get(flow_type, ['booster'])
153+
154+
155+
#############################################################################
# Fetching the data from OpenML
# *****************************
# Now, we read all the tasks and evaluations for them and collate into a table.
# Here, we are reading all the tasks and evaluations for the SVM flow and
# pre-processing all retrieved evaluations.

eval_df, task_ids, flow_id = fetch_evaluations(run_full=False, flow_type=flow_type)

# run_count can not be passed if all the results are required
# it is set to 500 here arbitrarily to get results quickly
# X: one row of hyperparameter settings per run; y: the corresponding metric values
X, y = create_table_from_evaluations(eval_df, run_count=500, flow_type=flow_type)
print(X.head())
print("Y : ", y[:5])
168+
169+
#############################################################################
# Creating pre-processing and modelling pipelines
# ***********************************************
# The two primary tasks are to impute the missing values, that is, account for the hyperparameters
# that are not available with the runs from OpenML. And secondly, to handle categorical variables
# using One-hot encoding prior to modelling.

# Separating data into categorical and non-categorical (numeric for this example) columns
cat_cols = list_categorical_attributes(flow_type=flow_type)
num_cols = list(set(X.columns) - set(cat_cols))
# NOTE(review): X_cat and X_num are not referenced again below — apparently
# kept only for illustration of the split
X_cat = X.loc[:, cat_cols]
X_num = X.loc[:, num_cols]

# Missing value imputers: categorical gaps become the constant string 'None'
# (a new category), numeric gaps become the sentinel -1
cat_imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='None')
num_imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=-1)

# Creating the one-hot encoder; categories unseen during fit are encoded as
# all-zero vectors instead of raising an error
enc = OneHotEncoder(handle_unknown='ignore')

# Pipeline to handle categorical column transformations: impute, then one-hot encode
cat_transforms = Pipeline([('impute', cat_imputer), ('encode', enc)])

# Combining column transformers so each column group gets its own treatment
ct = ColumnTransformer([('cat', cat_transforms, cat_cols), ('num', num_imputer, num_cols)])

# Creating the full pipeline with the surrogate model
clf = RandomForestRegressor(n_estimators=50)
model = Pipeline(steps=[('preprocess', ct), ('surrogate', clf)])
198+
199+
200+
#############################################################################
# Building a surrogate model on a task's evaluation
# *************************************************
# The same set of functions can be used for a single task to retrieve a singular table which can
# be used for the surrogate model construction. We shall use the SVM flow here to keep execution
# time short and quick.

# Selecting a task for the surrogate
task_id = task_ids[-1]
print("Task ID : ", task_id)
X, y = create_table_from_evaluations(eval_df, run_count=1000, task_ids=[task_id], flow_type='svm')

model.fit(X, y)
y_pred = model.predict(X)

# mean_squared_error() returns the MSE; take its square root so the printed
# value actually is the RMSE the label claims.
print("Training RMSE : {:.5}".format(np.sqrt(mean_squared_error(y, y_pred))))
216+
217+
218+
#############################################################################
219+
# Evaluating the surrogate model
220+
# ******************************
221+
# The surrogate model built from a task's evaluations fetched from OpenML is now
# put to a simple use: we randomly sample configurations and observe the trajectory
# of the area under curve (auc) obtained from the surrogate we've built.
224+
#
225+
# NOTE: This section is written exclusively for the SVM flow
226+
227+
# Sampling random configurations
def random_sample_configurations(num_samples=100):
    """Draw random SVM configurations uniformly from the hyperparameter search space."""
    hyperparams = ['cost', 'degree', 'gamma', 'kernel']
    search_space = [
        (0.000986, 998.492437),                         # cost
        (2.0, 5.0),                                     # degree
        (0.000988, 913.373845),                         # gamma
        ['linear', 'polynomial', 'radial', 'sigmoid'],  # kernel
    ]
    configs = pd.DataFrame(np.nan, index=range(num_samples), columns=hyperparams)
    for idx, bounds in enumerate(search_space):
        if len(bounds) == 2:
            # numeric hyperparameter: sample uniformly within its bounds
            samples = np.random.uniform(low=bounds[0], high=bounds[1], size=num_samples)
        else:
            # categorical hyperparameter: pick uniformly from its levels
            samples = np.random.choice(bounds, size=num_samples)
        configs.iloc[:, idx] = samples
    return configs
242+
243+
configs = random_sample_configurations(num_samples=1000)
print(configs)

#############################################################################
# Predict the surrogate's AUC estimate for every sampled configuration
preds = model.predict(configs)

# tracking the maximum AUC obtained over the function evaluations
preds = np.maximum.accumulate(preds)
# computing regret (1 - predicted_auc)
regret = 1 - preds

# plotting the regret curve
plt.plot(regret)
plt.title('AUC regret for Random Search on surrogate')
# fixed typo in the axis label: 'Numbe' -> 'Number'
plt.xlabel('Number of function evaluations')
plt.ylabel('Regret')

0 commit comments

Comments
 (0)