1717
1818In the following section, we shall do the following:
1919
20- * Retrieve tasks and flows as used in the experiments by Perrone et al.
21- * Build a tabular data by fetching the evaluations uploaded to OpenML
20+ * Retrieve tasks and flows as used in the experiments by Perrone et al. (2018).
21+ * Build a tabular dataset by fetching the evaluations uploaded to OpenML.
2222* Impute missing values and handle categorical data before building a Random Forest model that
23- maps hyperparameter values to the area under curve score
23+ maps hyperparameter values to the area under curve score.
2424"""
2525
2626############################################################################
3535from sklearn .preprocessing import OneHotEncoder
3636from sklearn .ensemble import RandomForestRegressor
3737
38-
39- user_id = 2702
4038flow_type = 'svm' # this example will use the smaller svm flow evaluations
4139############################################################################
42-
43- """
44- The subsequent functions are defined to fetch tasks, flows, evaluations and preprocess them into
45- a tabular format that can be used to build models.
46- """
40+ # The subsequent functions are defined to fetch tasks, flows, evaluations and preprocess them into
41+ # a tabular format that can be used to build models.
42+ #
4743
4844def fetch_evaluations (run_full = False ,
4945 flow_type = 'svm' ,
@@ -69,15 +65,20 @@ def fetch_evaluations(run_full=False,
6965 '''
7066 # Collecting task IDs as used by the experiments from the paper
7167 if flow_type == 'svm' and run_full :
72- task_ids = [10101 , 145878 , 146064 , 14951 , 34537 , 3485 , 3492 , 3493 , 3494 , 37 , 3889 , 3891 ,
73- 3899 , 3902 , 3903 , 3913 , 3918 , 3950 , 9889 , 9914 , 9946 , 9952 , 9967 , 9971 , 9976 ,
74- 9978 , 9980 , 9983 ]
68+ task_ids = [
69+ 10101 , 145878 , 146064 , 14951 , 34537 , 3485 , 3492 , 3493 , 3494 ,
70+ 37 , 3889 , 3891 , 3899 , 3902 , 3903 , 3913 , 3918 , 3950 , 9889 ,
71+ 9914 , 9946 , 9952 , 9967 , 9971 , 9976 , 9978 , 9980 , 9983 ,
72+ ]
7573 elif flow_type == 'svm' and not run_full :
7674 task_ids = [9983 , 3485 , 3902 , 3903 , 145878 ]
7775 elif flow_type == 'xgboost' and run_full :
78- task_ids = [10093 , 10101 , 125923 , 145847 , 145857 , 145862 , 145872 , 145878 , 145953 , 145972 ,
79- 145976 , 145979 , 146064 , 14951 , 31 , 3485 , 3492 , 3493 , 37 , 3896 , 3903 , 3913 ,
80- 3917 , 3918 , 3 , 49 , 9914 , 9946 , 9952 , 9967 ]
76+ task_ids = [
77+ 10093 , 10101 , 125923 , 145847 , 145857 , 145862 , 145872 , 145878 ,
78+ 145953 , 145972 , 145976 , 145979 , 146064 , 14951 , 31 , 3485 ,
79+ 3492 , 3493 , 37 , 3896 , 3903 , 3913 , 3917 , 3918 , 3 , 49 , 9914 ,
80+ 9946 , 9952 , 9967 ,
81+ ]
8182 else : #flow_type == 'xgboost' and not run_full:
8283 task_ids = [3903 , 37 , 3485 , 49 , 3913 ]
8384
@@ -123,23 +124,24 @@ def create_table_from_evaluations(eval_df,
123124 if task_ids is not None :
124125 eval_df = eval_df [eval_df ['task_id' ].isin (task_ids )]
125126 if flow_type == 'svm' :
126- ncols = 4
127127 colnames = ['cost' , 'degree' , 'gamma' , 'kernel' ]
128128 else :
129- ncols = 10
130- colnames = ['alpha' , 'booster' , 'colsample_bylevel' , 'colsample_bytree' , 'eta' , 'lambda' ,
131- 'max_depth' , 'min_child_weight' , 'nrounds' , 'subsample' ]
129+ colnames = [
130+ 'alpha' , 'booster' , 'colsample_bylevel' , 'colsample_bytree' ,
131+ 'eta' , 'lambda' , 'max_depth' , 'min_child_weight' , 'nrounds' ,
132+ 'subsample' ,
133+ ]
132134 eval_df = eval_df .sample (frac = 1 ) # shuffling rows
133- run_ids = eval_df . loc [:, "run_id" ][:run_count ]
135+ run_ids = eval_df [ "run_id" ][:run_count ]
134136 eval_table = pd .DataFrame (np .nan , index = run_ids , columns = colnames )
135137 values = []
136- for run_id in run_ids :
137- r = openml . runs . get_run ( run_id )
138+ runs = openml . runs . get_runs ( run_ids )
139+ for r in runs :
138140 params = r .parameter_settings
139141 for p in params :
140142 name , value = p ['oml:name' ], p ['oml:value' ]
141143 if name in colnames :
142- eval_table .loc [run_id , name ] = value
144+ eval_table .loc [r . run_id , name ] = value
143145 values .append (r .evaluations [metric ])
144146 return eval_table , values
145147
@@ -153,13 +155,14 @@ def list_categorical_attributes(flow_type='svm'):
153155#############################################################################
154156# Fetching the data from OpenML
155157# *****************************
156- # To read all the tasks and evaluations for them and collate into a table. Here, we are reading
157- # all the tasks and evaluations for the SVM flow and pre-processing all retrieved evaluations.
158+ # Now, we read all the tasks and their evaluations and collate them into a table.
159+ # Here, we are reading all the tasks and evaluations for the SVM flow and
160+ # pre-processing all retrieved evaluations.
158161
159162eval_df , task_ids , flow_id = fetch_evaluations (run_full = False , flow_type = flow_type )
160163# run_count need not be passed if all the results are required
161- # it is set to 1000 here arbitrarily to get results quickly
162- X , y = create_table_from_evaluations (eval_df , run_count = 1000 , flow_type = flow_type )
164+ # it is set to 500 here arbitrarily to get results quickly
165+ X , y = create_table_from_evaluations (eval_df , run_count = 500 , flow_type = flow_type )
163166print (X .head ())
164167print ("Y : " , y [:5 ])
165168
@@ -218,6 +221,7 @@ def list_categorical_attributes(flow_type='svm'):
218221# The surrogate model built from a task's evaluations fetched from OpenML will be put into
219222# trivial action here, where we shall randomly sample configurations and observe the trajectory
220223# of the area under curve (auc) we can obtain from the surrogate we've built.
224+ #
221225# NOTE: This section is written exclusively for the SVM flow
222226
223227# Sampling random configurations
@@ -246,8 +250,6 @@ def random_sample_configurations(num_samples=100):
246250
247251# plotting the regret curve
248252plt .plot (regret )
249- # plt.yscale('log')
250253plt .title ('AUC regret for Random Search on surrogate' )
251254plt .xlabel ('Numbe of function evaluations' )
252255plt .ylabel ('Regret' )
253- plt .show ()
0 commit comments