3939############################################################################
4040# The following functions fetch tasks, flows, and evaluations, and preprocess them into
4141# a tabular format that can be used to build models.
42- #
42+
4343
4444def fetch_evaluations (run_full = False ,
4545 flow_type = 'svm' ,
@@ -79,25 +79,25 @@ def fetch_evaluations(run_full=False,
7979 3492 , 3493 , 37 , 3896 , 3903 , 3913 , 3917 , 3918 , 3 , 49 , 9914 ,
8080 9946 , 9952 , 9967 ,
8181 ]
82- else : #flow_type == 'xgboost' and not run_full:
82+ else : # flow_type == 'xgboost' and not run_full:
8383 task_ids = [3903 , 37 , 3485 , 49 , 3913 ]
8484
8585 # Fetching the relevant flow
8686 flow_id = 5891 if flow_type == 'svm' else 6767
8787
8888 # Fetching evaluations
89- eval_df = openml .evaluations .list_evaluations (function = metric ,
90- task = task_ids ,
91- flow = [flow_id ],
92- uploader = [2702 ],
93- output_format = 'dataframe' )
89+ eval_df = openml .evaluations .list_evaluations_setups (function = metric ,
90+ task = task_ids ,
91+ flow = [flow_id ],
92+ uploader = [2702 ],
93+ output_format = 'dataframe' ,
94+ parameters_in_separate_columns = True )
9495 return eval_df , task_ids , flow_id
9596
9697
9798def create_table_from_evaluations (eval_df ,
9899 flow_type = 'svm' ,
99100 run_count = np .iinfo (np .int64 ).max ,
100- metric = 'area_under_roc_curve' ,
101101 task_ids = None ):
102102 '''
103103 Create tabular data with its ground truth from a dataframe of evaluations.
@@ -111,8 +111,6 @@ def create_table_from_evaluations(eval_df,
111111 To select whether svm or xgboost experiments are to be run
112112 run_count : int
113113 Maximum size of the table created, or number of runs included in the table
114- metric : str
115- The evaluation measure that is passed to openml.evaluations.list_evaluations
116114 task_ids : list, (optional)
117115 List of integers specifying the tasks to be retained from the evaluations dataframe
118116
@@ -132,18 +130,11 @@ def create_table_from_evaluations(eval_df,
132130 'subsample' ,
133131 ]
134132 eval_df = eval_df .sample (frac = 1 ) # shuffling rows
135- run_ids = eval_df ["run_id" ][:run_count ]
136- eval_table = pd .DataFrame (np .nan , index = run_ids , columns = colnames )
137- values = []
138- runs = openml .runs .get_runs (run_ids )
139- for r in runs :
140- params = r .parameter_settings
141- for p in params :
142- name , value = p ['oml:name' ], p ['oml:value' ]
143- if name in colnames :
144- eval_table .loc [r .run_id , name ] = value
145- values .append (r .evaluations [metric ])
146- return eval_table , values
133+ eval_df = eval_df .iloc [:run_count , :]
134+ eval_df .columns = [column .split ('_' )[- 1 ] for column in eval_df .columns ]
135+ eval_table = eval_df .loc [:, colnames ]
136+ value = eval_df .loc [:, 'value' ]
137+ return eval_table , value
147138
148139
149140def list_categorical_attributes (flow_type = 'svm' ):
@@ -160,9 +151,7 @@ def list_categorical_attributes(flow_type='svm'):
160151# pre-processing all retrieved evaluations.
161152
162153eval_df , task_ids , flow_id = fetch_evaluations (run_full = False , flow_type = flow_type )
163- # run_count can not be passed if all the results are required
164- # it is set to 500 here arbitrarily to get results quickly
165- X , y = create_table_from_evaluations (eval_df , run_count = 500 , flow_type = flow_type )
154+ X , y = create_table_from_evaluations (eval_df , flow_type = flow_type )
166155print (X .head ())
167156print ("Y : " , y [:5 ])
168157
@@ -176,8 +165,6 @@ def list_categorical_attributes(flow_type='svm'):
176165# Separating data into categorical and non-categorical (numeric for this example) columns
177166cat_cols = list_categorical_attributes (flow_type = flow_type )
178167num_cols = list (set (X .columns ) - set (cat_cols ))
179- X_cat = X .loc [:, cat_cols ]
180- X_num = X .loc [:, num_cols ]
181168
182169# Missing value imputers
183170cat_imputer = SimpleImputer (missing_values = np .nan , strategy = 'constant' , fill_value = 'None' )
@@ -187,7 +174,7 @@ def list_categorical_attributes(flow_type='svm'):
187174enc = OneHotEncoder (handle_unknown = 'ignore' )
188175
189176# Pipeline to handle categorical column transformations
190- cat_transforms = Pipeline ([('impute' , cat_imputer ), ('encode' , enc )])
177+ cat_transforms = Pipeline (steps = [('impute' , cat_imputer ), ('encode' , enc )])
191178
192179# Combining column transformers
193180ct = ColumnTransformer ([('cat' , cat_transforms , cat_cols ), ('num' , num_imputer , num_cols )])
@@ -207,7 +194,7 @@ def list_categorical_attributes(flow_type='svm'):
207194# Selecting a task for the surrogate
208195task_id = task_ids [- 1 ]
209196print ("Task ID : " , task_id )
210- X , y = create_table_from_evaluations (eval_df , run_count = 1000 , task_ids = [task_id ], flow_type = 'svm' )
197+ X , y = create_table_from_evaluations (eval_df , task_ids = [task_id ], flow_type = 'svm' )
211198
212199model .fit (X , y )
213200y_pred = model .predict (X )
@@ -224,6 +211,7 @@ def list_categorical_attributes(flow_type='svm'):
224211#
225212# NOTE: This section is written exclusively for the SVM flow
226213
214+
227215# Sampling random configurations
228216def random_sample_configurations (num_samples = 100 ):
229217 colnames = ['cost' , 'degree' , 'gamma' , 'kernel' ]
@@ -240,6 +228,7 @@ def random_sample_configurations(num_samples=100):
240228 X .iloc [:, i ] = col_val
241229 return X
242230
231+
243232configs = random_sample_configurations (num_samples = 1000 )
244233print (configs )
245234
0 commit comments