Skip to content

Commit cfba39d

Browse files
committed
Finishing the whole example design
1 parent 1a3f456 commit cfba39d

1 file changed

Lines changed: 99 additions & 14 deletions

File tree

examples/40_paper/2018_neurips_perrone_example.py

Lines changed: 99 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,17 @@
2727
import openml
2828
import numpy as np
2929
import pandas as pd
30+
from matplotlib import pyplot as plt
31+
from sklearn.pipeline import Pipeline
3032
from sklearn.impute import SimpleImputer
33+
from sklearn.compose import ColumnTransformer
34+
from sklearn.metrics import mean_squared_error
3135
from sklearn.preprocessing import OneHotEncoder
3236
from sklearn.ensemble import RandomForestRegressor
3337

38+
3439
user_id = 2702
40+
flow_type = 'svm' # this example will use the smaller svm flow evaluations
3541
############################################################################
3642

3743
"""
@@ -138,6 +144,12 @@ def create_table_from_evaluations(eval_df,
138144
return eval_table, values
139145

140146

147+
def list_categorical_attributes(flow_type='svm'):
    """Return the names of the categorical hyperparameters for a flow.

    Parameters
    ----------
    flow_type : str
        'svm' selects the SVM flow; any other value is treated as the
        xgboost flow.

    Returns
    -------
    list of str
        The categorical hyperparameter column names.
    """
    # Only the SVM flow exposes 'kernel'; every other flow uses 'booster'.
    return ['kernel'] if flow_type == 'svm' else ['booster']
151+
152+
141153
def impute_missing_values(eval_table, flow_type='svm'):
142154
# Replacing NaNs with fixed values outside the range of the parameters
143155
# given in the supplement material of the paper
@@ -164,30 +176,103 @@ def preprocess(eval_table, flow_type='svm'):
164176

165177

166178
#############################################################################
167-
# Fetching the tasks and evaluations
168-
# ==================================
179+
# Fetching the data from OpenML
180+
# *****************************
169181
# To read all the tasks and evaluations for them and collate into a table. Here, we are reading
170-
# all the tasks and evaluations for the SVM flow and preprocessing all retrieved evaluations.
182+
# all the tasks and evaluations for the SVM flow and pre-processing all retrieved evaluations.
183+
184+
# Fetch all runs of the chosen flow and collate them into a table of
# hyperparameter settings (X) and the corresponding evaluation metric (y).
# run_count can not be passed if all the results are required;
# it is set to 1000 here arbitrarily to get results quickly.
eval_df, task_ids, flow_id = fetch_evaluations(run_full=False, flow_type=flow_type)
X, y = create_table_from_evaluations(eval_df, run_count=1000, flow_type=flow_type)
print(X.head())
print("Y : ", y[:5])

#############################################################################
# Creating pre-processing and modelling pipelines
# ***********************************************
# The two primary tasks are to impute the missing values, that is, account for the hyperparameters
# that are not available with the runs from OpenML. And secondly, to handle categorical variables
# using One-hot encoding prior to modelling.

# Separating column names into categorical and non-categorical (numeric for
# this example); the ColumnTransformer below routes each group to its own
# imputation strategy, so the frame itself is never split.
cat_cols = list_categorical_attributes(flow_type=flow_type)
num_cols = list(set(X.columns) - set(cat_cols))

# Missing value imputers: constants chosen outside the valid parameter
# ranges so "missing" is distinguishable from any real setting.
cat_imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='None')
num_imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=-1)

# Creating the one-hot encoder; handle_unknown='ignore' keeps prediction
# from failing on category values unseen during fit.
enc = OneHotEncoder(handle_unknown='ignore')

# Pipeline to handle categorical column transformations
cat_transforms = Pipeline([('impute', cat_imputer), ('encode', enc)])

# Combining column transformers
ct = ColumnTransformer([('cat', cat_transforms, cat_cols), ('num', num_imputer, num_cols)])

# Creating the full pipeline with the surrogate model
clf = RandomForestRegressor(n_estimators=50)
model = Pipeline(steps=[('preprocess', ct), ('surrogate', clf)])
177220

178221

179222
#############################################################################
180223
# Building a surrogate model on a task's evaluation
181-
# =================================================
224+
# *************************************************
182225
# The same set of functions can be used for a single task to retrieve a singular table which can
183226
# be used for the surrogate model construction. We shall use the SVM flow here to keep execution
184227
# time simple and quick.
185228

186-
# Selecting a task
229+
# Selecting a task for the surrogate
187230
# Selecting a single task for the surrogate; any id from task_ids works.
task_id = task_ids[-1]
print("Task ID : ", task_id)
X, y = create_table_from_evaluations(eval_df, run_count=1000, task_ids=[task_id], flow_type='svm')

# Fit the full pipeline (imputation + encoding + random forest) and
# predict on the training set to gauge how well the surrogate fits.
model.fit(X, y)
y_pred = model.predict(X)

# BUG FIX: the label says RMSE but mean_squared_error returns the MSE;
# take the square root so the printed value matches the label.
print("Training RMSE : {:.5}".format(np.sqrt(mean_squared_error(y, y_pred))))
238+
239+
240+
#############################################################################
241+
# Evaluating the surrogate model
242+
# ******************************
243+
# The surrogate model built from a task's evaluations fetched from OpenML will be put into
244+
# trivial action here, where we shall randomly sample configurations and observe the trajectory
245+
# of the area under curve (auc) we can obtain from the surrogate we've built.
246+
# NOTE: This section is written exclusively for the SVM flow
247+
248+
# Sampling random configurations
249+
def random_sample_configurations(num_samples=100):
    """Randomly sample SVM hyperparameter configurations.

    Numeric hyperparameters are drawn uniformly from the ranges given in
    the paper's supplementary material; the categorical ``kernel`` is drawn
    uniformly from its allowed values.

    Parameters
    ----------
    num_samples : int
        Number of configurations (rows) to sample.

    Returns
    -------
    pd.DataFrame
        Frame with columns ['cost', 'degree', 'gamma', 'kernel'].
    """
    colnames = ['cost', 'degree', 'gamma', 'kernel']
    ranges = [
        (0.000986, 998.492437),
        (2.0, 5.0),
        (0.000988, 913.373845),
        ['linear', 'polynomial', 'radial', 'sigmoid'],
    ]
    X = pd.DataFrame(np.nan, index=range(num_samples), columns=colnames)
    # zip over (column, range) pairs instead of indexing by position.
    for col, rng in zip(colnames, ranges):
        if len(rng) == 2:
            # a (low, high) pair -> continuous uniform sample
            X[col] = np.random.uniform(low=rng[0], high=rng[1], size=num_samples)
        else:
            # a list of categories -> uniform choice among them
            X[col] = np.random.choice(rng, size=num_samples)
    return X
263+
264+
# Score 1000 random configurations with the surrogate model.
configs = random_sample_configurations(num_samples=1000)
preds = model.predict(configs)

# tracking the maximum AUC obtained over the function evaluations
preds = np.maximum.accumulate(preds)
# computing regret (1 - predicted_auc)
regret = 1 - preds

# plotting the regret curve
plt.plot(regret)
# plt.yscale('log')
plt.title('AUC regret for Random Search on surrogate')
# BUG FIX: corrected typo in the axis label ('Numbe' -> 'Number').
plt.xlabel('Number of function evaluations')
plt.ylabel('Regret')
plt.show()

0 commit comments

Comments
 (0)