import openml
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor


user_id = 2702
flow_type = 'svm'  # this example will use the smaller SVM flow evaluations
############################################################################

"""
# ... (docstring, fetch_evaluations, and most of create_table_from_evaluations elided) ...
    return eval_table, values


def list_categorical_attributes(flow_type='svm'):
    if flow_type == 'svm':
        return ['kernel']
    return ['booster']


def impute_missing_values(eval_table, flow_type='svm'):
    # Replacing NaNs with fixed values outside the range of the parameters
    # given in the supplementary material of the paper
# ... (rest of impute_missing_values and the preprocess helper elided) ...


#############################################################################
# Fetching the data from OpenML
# *****************************
# Read all the tasks and their evaluations, and collate them into a table. Here, we read
# all the tasks and evaluations for the SVM flow and pre-process all retrieved evaluations.

eval_df, task_ids, flow_id = fetch_evaluations(run_full=False, flow_type=flow_type)
# run_count cannot be passed if all the results are required;
# it is arbitrarily set to 1000 here to get results quickly
X, y = create_table_from_evaluations(eval_df, run_count=1000, flow_type=flow_type)
print(X.head())
print("Y : ", y[:5])
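
# A quick sanity check (an illustrative addition, not part of the original example):
# counting the NaNs per hyperparameter column shows why the imputation step below is needed.
print(X.isnull().sum())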
#############################################################################
# Creating pre-processing and modelling pipelines
# ***********************************************
# There are two primary tasks here: imputing the missing values, that is, accounting for the
# hyperparameters that are not available with the runs fetched from OpenML, and one-hot encoding
# the categorical variables prior to modelling.

# Separating data into categorical and non-categorical (numeric for this example) columns
cat_cols = list_categorical_attributes(flow_type=flow_type)
num_cols = list(set(X.columns) - set(cat_cols))
X_cat = X.loc[:, cat_cols]
X_num = X.loc[:, num_cols]

# Missing value imputers
cat_imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='None')
num_imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=-1)

# Creating the one-hot encoder
enc = OneHotEncoder(handle_unknown='ignore')

# Pipeline to handle categorical column transformations
cat_transforms = Pipeline([('impute', cat_imputer), ('encode', enc)])

# Combining column transformers
ct = ColumnTransformer([('cat', cat_transforms, cat_cols), ('num', num_imputer, num_cols)])

# Creating the full pipeline with the surrogate model
clf = RandomForestRegressor(n_estimators=50)
model = Pipeline(steps=[('preprocess', ct), ('surrogate', clf)])
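
# Illustrative check (an addition, not part of the original example): fitting the column
# transformer alone shows the expanded one-hot feature space the surrogate will see.
X_trans = ct.fit_transform(X)
print("Transformed feature matrix shape: ", X_trans.shape)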


#############################################################################
# Building a surrogate model on a task's evaluation
# *************************************************
# The same set of functions can be used for a single task to retrieve a single table which can
# be used for the surrogate model construction. We use the SVM flow here to keep the execution
# time short.

# Selecting a task for the surrogate
task_id = task_ids[-1]
print("Task ID : ", task_id)
X, y = create_table_from_evaluations(eval_df, run_count=1000, task_ids=[task_id], flow_type='svm')

model.fit(X, y)
y_pred = model.predict(X)

# mean_squared_error returns the MSE, so take the square root to report the RMSE
print("Training RMSE : {:.5}".format(np.sqrt(mean_squared_error(y, y_pred))))
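
# The training error above is optimistic, since the model is evaluated on the data it was
# fit on. A minimal hold-out sketch (assuming scikit-learn's train_test_split; an
# illustrative addition, not part of the original example):
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)
print("Hold-out RMSE : {:.5}".format(np.sqrt(mean_squared_error(y_test, model.predict(X_test)))))
model.fit(X, y)  # refit on the full table so the next section uses all evaluations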


#############################################################################
# Evaluating the surrogate model
# ******************************
# The surrogate model built from a task's evaluations fetched from OpenML is now put to
# trivial use: we randomly sample configurations and observe the trajectory of the
# area under the curve (AUC) that the surrogate we have built can reach.
# NOTE: This section is written exclusively for the SVM flow.

# Sampling random configurations
def random_sample_configurations(num_samples=100):
    colnames = ['cost', 'degree', 'gamma', 'kernel']
    ranges = [(0.000986, 998.492437),
              (2.0, 5.0),
              (0.000988, 913.373845),
              ['linear', 'polynomial', 'radial', 'sigmoid']]
    X = pd.DataFrame(np.nan, index=range(num_samples), columns=colnames)
    for i in range(len(colnames)):
        if len(ranges[i]) == 2:
            # numeric hyperparameters are sampled uniformly within their observed range
            col_val = np.random.uniform(low=ranges[i][0], high=ranges[i][1], size=num_samples)
        else:
            # the categorical kernel is sampled uniformly from its possible values
            col_val = np.random.choice(ranges[i], size=num_samples)
        X.iloc[:, i] = col_val
    return X

configs = random_sample_configurations(num_samples=1000)
preds = model.predict(configs)

# tracking the maximum AUC obtained over the function evaluations
preds = np.maximum.accumulate(preds)
# computing regret (1 - predicted_auc)
regret = 1 - preds

# plotting the regret curve
plt.plot(regret)
# plt.yscale('log')
plt.title('AUC regret for Random Search on surrogate')
plt.xlabel('Number of function evaluations')
plt.ylabel('Regret')
plt.show()
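
# A single random-search trajectory is noisy. A minimal sketch (an illustrative addition,
# not part of the original example) averaging the regret over several repetitions:
n_repeats = 10
curves = []
for _ in range(n_repeats):
    repeat_preds = model.predict(random_sample_configurations(num_samples=1000))
    curves.append(1 - np.maximum.accumulate(repeat_preds))
plt.plot(np.mean(curves, axis=0))
plt.title('Mean AUC regret over {} random search repetitions'.format(n_repeats))
plt.xlabel('Number of function evaluations')
plt.ylabel('Regret')
plt.show()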