Skip to content

Commit 433f1e7

Browse files
Neeratyoy authored and mfeurer committed
Optimizing Perrone example (#853)
* Making example faster and adding unit test for it
* Fixing server for the unit test
* Fixing sklearn version issues in unit test
* Removing redundant unit test
1 parent f74b73a commit 433f1e7

1 file changed

Lines changed: 18 additions & 29 deletions

File tree

examples/40_paper/2018_neurips_perrone_example.py

Lines changed: 18 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
############################################################################
4040
# The subsequent functions are defined to fetch tasks, flows, evaluations and preprocess them into
4141
# a tabular format that can be used to build models.
42-
#
42+
4343

4444
def fetch_evaluations(run_full=False,
4545
flow_type='svm',
@@ -79,25 +79,25 @@ def fetch_evaluations(run_full=False,
7979
3492, 3493, 37, 3896, 3903, 3913, 3917, 3918, 3, 49, 9914,
8080
9946, 9952, 9967,
8181
]
82-
else: #flow_type == 'xgboost' and not run_full:
82+
else: # flow_type == 'xgboost' and not run_full:
8383
task_ids = [3903, 37, 3485, 49, 3913]
8484

8585
# Fetching the relevant flow
8686
flow_id = 5891 if flow_type == 'svm' else 6767
8787

8888
# Fetching evaluations
89-
eval_df = openml.evaluations.list_evaluations(function=metric,
90-
task=task_ids,
91-
flow=[flow_id],
92-
uploader=[2702],
93-
output_format='dataframe')
89+
eval_df = openml.evaluations.list_evaluations_setups(function=metric,
90+
task=task_ids,
91+
flow=[flow_id],
92+
uploader=[2702],
93+
output_format='dataframe',
94+
parameters_in_separate_columns=True)
9495
return eval_df, task_ids, flow_id
9596

9697

9798
def create_table_from_evaluations(eval_df,
9899
flow_type='svm',
99100
run_count=np.iinfo(np.int64).max,
100-
metric = 'area_under_roc_curve',
101101
task_ids=None):
102102
'''
103103
Create a tabular data with its ground truth from a dataframe of evaluations.
@@ -111,8 +111,6 @@ def create_table_from_evaluations(eval_df,
111111
To select whether svm or xgboost experiments are to be run
112112
run_count : int
113113
Maximum size of the table created, or number of runs included in the table
114-
metric : str
115-
The evaluation measure that is passed to openml.evaluations.list_evaluations
116114
task_ids : list, (optional)
117115
List of integers specifying the tasks to be retained from the evaluations dataframe
118116
@@ -132,18 +130,11 @@ def create_table_from_evaluations(eval_df,
132130
'subsample',
133131
]
134132
eval_df = eval_df.sample(frac=1) # shuffling rows
135-
run_ids = eval_df["run_id"][:run_count]
136-
eval_table = pd.DataFrame(np.nan, index=run_ids, columns=colnames)
137-
values = []
138-
runs = openml.runs.get_runs(run_ids)
139-
for r in runs:
140-
params = r.parameter_settings
141-
for p in params:
142-
name, value = p['oml:name'], p['oml:value']
143-
if name in colnames:
144-
eval_table.loc[r.run_id, name] = value
145-
values.append(r.evaluations[metric])
146-
return eval_table, values
133+
eval_df = eval_df.iloc[:run_count, :]
134+
eval_df.columns = [column.split('_')[-1] for column in eval_df.columns]
135+
eval_table = eval_df.loc[:, colnames]
136+
value = eval_df.loc[:, 'value']
137+
return eval_table, value
147138

148139

149140
def list_categorical_attributes(flow_type='svm'):
@@ -160,9 +151,7 @@ def list_categorical_attributes(flow_type='svm'):
160151
# pre-processing all retrieved evaluations.
161152

162153
eval_df, task_ids, flow_id = fetch_evaluations(run_full=False, flow_type=flow_type)
163-
# run_count can not be passed if all the results are required
164-
# it is set to 500 here arbitrarily to get results quickly
165-
X, y = create_table_from_evaluations(eval_df, run_count=500, flow_type=flow_type)
154+
X, y = create_table_from_evaluations(eval_df, flow_type=flow_type)
166155
print(X.head())
167156
print("Y : ", y[:5])
168157

@@ -176,8 +165,6 @@ def list_categorical_attributes(flow_type='svm'):
176165
# Separating data into categorical and non-categorical (numeric for this example) columns
177166
cat_cols = list_categorical_attributes(flow_type=flow_type)
178167
num_cols = list(set(X.columns) - set(cat_cols))
179-
X_cat = X.loc[:, cat_cols]
180-
X_num = X.loc[:, num_cols]
181168

182169
# Missing value imputers
183170
cat_imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='None')
@@ -187,7 +174,7 @@ def list_categorical_attributes(flow_type='svm'):
187174
enc = OneHotEncoder(handle_unknown='ignore')
188175

189176
# Pipeline to handle categorical column transformations
190-
cat_transforms = Pipeline([('impute', cat_imputer), ('encode', enc)])
177+
cat_transforms = Pipeline(steps=[('impute', cat_imputer), ('encode', enc)])
191178

192179
# Combining column transformers
193180
ct = ColumnTransformer([('cat', cat_transforms, cat_cols), ('num', num_imputer, num_cols)])
@@ -207,7 +194,7 @@ def list_categorical_attributes(flow_type='svm'):
207194
# Selecting a task for the surrogate
208195
task_id = task_ids[-1]
209196
print("Task ID : ", task_id)
210-
X, y = create_table_from_evaluations(eval_df, run_count=1000, task_ids=[task_id], flow_type='svm')
197+
X, y = create_table_from_evaluations(eval_df, task_ids=[task_id], flow_type='svm')
211198

212199
model.fit(X, y)
213200
y_pred = model.predict(X)
@@ -224,6 +211,7 @@ def list_categorical_attributes(flow_type='svm'):
224211
#
225212
# NOTE: This section is written exclusively for the SVM flow
226213

214+
227215
# Sampling random configurations
228216
def random_sample_configurations(num_samples=100):
229217
colnames = ['cost', 'degree', 'gamma', 'kernel']
@@ -240,6 +228,7 @@ def random_sample_configurations(num_samples=100):
240228
X.iloc[:, i] = col_val
241229
return X
242230

231+
243232
configs = random_sample_configurations(num_samples=1000)
244233
print(configs)
245234

0 commit comments

Comments (0)