1212import warnings
1313
1414import numpy as np
15+ import pandas as pd
1516import scipy .stats
17+ import scipy .sparse
1618import sklearn .base
1719import sklearn .model_selection
1820import sklearn .pipeline
@@ -1096,11 +1098,15 @@ def _run_model_on_fold(
10961098 self ,
10971099 model : Any ,
10981100 task : 'OpenMLTask' ,
1101+ X_train : Union [np .ndarray , scipy .sparse .spmatrix , pd .DataFrame ],
1102+ y_train : np .ndarray ,
10991103 rep_no : int ,
11001104 fold_no : int ,
11011105 sample_no : int ,
11021106 add_local_measures : bool ,
1103- ) -> Tuple [List [List ], List [List ], 'OrderedDict[str, float]' , Any ]:
1107+ X_test : Optional [Union [np .ndarray , scipy .sparse .spmatrix , pd .DataFrame ]] = None ,
1108+ n_classes : Optional [int ] = None ,
1109+ ) -> Tuple [np .ndarray , np .ndarray , 'OrderedDict[str, float]' , Any ]:
11041110 """Run a model on a repeat,fold,subsample triplet of the task and return prediction
11051111 information.
11061112
@@ -1191,20 +1197,6 @@ def _prediction_to_probabilities(
11911197 can_measure_cputime = self ._can_measure_cputime (model_copy )
11921198 can_measure_wallclocktime = self ._can_measure_wallclocktime (model_copy )
11931199
1194- train_indices , test_indices = task .get_train_test_split_indices (
1195- repeat = rep_no , fold = fold_no , sample = sample_no )
1196- if isinstance (task , OpenMLSupervisedTask ):
1197- x , y = task .get_X_and_y ()
1198- train_x = x [train_indices ]
1199- train_y = y [train_indices ]
1200- test_x = x [test_indices ]
1201- test_y = y [test_indices ]
1202- elif isinstance (task , OpenMLClusteringTask ):
1203- train_x = train_indices
1204- test_x = test_indices
1205- else :
1206- raise NotImplementedError (task .task_type )
1207-
12081200 user_defined_measures = OrderedDict () # type: 'OrderedDict[str, float]'
12091201
12101202 try :
@@ -1213,9 +1205,9 @@ def _prediction_to_probabilities(
12131205 modelfit_start_walltime = time .time ()
12141206
12151207 if isinstance (task , OpenMLSupervisedTask ):
1216- model_copy .fit (train_x , train_y )
1208+ model_copy .fit (X_train , y_train )
12171209 elif isinstance (task , OpenMLClusteringTask ):
1218- model_copy .fit (train_x )
1210+ model_copy .fit (X_train )
12191211
12201212 modelfit_dur_cputime = (time .process_time () - modelfit_start_cputime ) * 1000
12211213 if can_measure_cputime :
@@ -1229,11 +1221,6 @@ def _prediction_to_probabilities(
12291221 # typically happens when training a regressor on classification task
12301222 raise PyOpenMLError (str (e ))
12311223
1232- # extract trace, if applicable
1233- arff_tracecontent = [] # type: List[List]
1234- if self .is_hpo_class (model_copy ):
1235- arff_tracecontent .extend (self ._extract_trace_data (model_copy , rep_no , fold_no ))
1236-
12371224 if isinstance (task , (OpenMLClassificationTask , OpenMLLearningCurveTask )):
12381225 # search for model classes_ (might differ depending on modeltype)
12391226 # first, pipelines are a special case (these don't have a classes_
@@ -1254,7 +1241,7 @@ def _prediction_to_probabilities(
12541241
12551242 # In supervised learning this returns the predictions for Y, in clustering
12561243 # it returns the clusters
1257- pred_y = model_copy .predict (test_x )
1244+ pred_y = model_copy .predict (X_test )
12581245
12591246 if can_measure_cputime :
12601247 modelpredict_duration_cputime = (time .process_time ()
@@ -1268,133 +1255,35 @@ def _prediction_to_probabilities(
12681255 user_defined_measures ['wall_clock_time_millis' ] = (modelfit_dur_walltime
12691256 + modelpredict_duration_walltime )
12701257
1271- # add client-side calculated metrics. These is used on the server as
1272- # consistency check, only useful for supervised tasks
1273- def _calculate_local_measure (sklearn_fn , openml_name ):
1274- user_defined_measures [openml_name ] = sklearn_fn (test_y , pred_y )
1275-
1276- # Task type specific outputs
1277- arff_datacontent = []
1278-
12791258 if isinstance (task , (OpenMLClassificationTask , OpenMLLearningCurveTask )):
12801259
12811260 try :
1282- proba_y = model_copy .predict_proba (test_x )
1261+ proba_y = model_copy .predict_proba (X_test )
12831262 except AttributeError :
12841263 proba_y = _prediction_to_probabilities (pred_y , list (model_classes ))
12851264
1265+ pred_y = np .array ([model_classes [label ] for label in pred_y ], dtype = pred_y .dtype )
1266+ proba_y_new = np .zeros ((proba_y .shape [0 ], n_classes ))
1267+ for idx , class_idx in enumerate (model_classes ):
1268+ proba_y_new [:, class_idx ] = proba_y [:, idx ]
1269+ proba_y = proba_y_new
1270+
12861271 if proba_y .shape [1 ] != len (task .class_labels ):
12871272 warnings .warn (
1288- "Repeat %d Fold %d: estimator only predicted for %d/%d classes!"
1289- % (rep_no , fold_no , proba_y .shape [1 ], len (task .class_labels ))
1273+ "Repeat %d fold %d sample %d: estimator only predicted for %d/%d classes!"
1274+ % (rep_no , fold_no , sample_no , proba_y .shape [1 ], len (task .class_labels ))
12901275 )
12911276
1292- if add_local_measures :
1293- _calculate_local_measure (sklearn .metrics .accuracy_score ,
1294- 'predictive_accuracy' )
1295-
1296- for i in range (0 , len (test_indices )):
1297- arff_line = self ._prediction_to_row (
1298- rep_no = rep_no ,
1299- fold_no = fold_no ,
1300- sample_no = sample_no ,
1301- row_id = test_indices [i ],
1302- correct_label = task .class_labels [test_y [i ]],
1303- predicted_label = pred_y [i ],
1304- predicted_probabilities = proba_y [i ],
1305- class_labels = task .class_labels ,
1306- model_classes_mapping = model_classes ,
1307- )
1308- arff_datacontent .append (arff_line )
1309-
13101277 elif isinstance (task , OpenMLRegressionTask ):
1311- if add_local_measures :
1312- _calculate_local_measure (
1313- sklearn .metrics .mean_absolute_error ,
1314- 'mean_absolute_error' ,
1315- )
1316-
1317- for i in range (0 , len (test_indices )):
1318- arff_line = [rep_no , fold_no , test_indices [i ], pred_y [i ], test_y [i ]]
1319- arff_datacontent .append (arff_line )
1278+ proba_y = None
13201279
13211280 elif isinstance (task , OpenMLClusteringTask ):
1322- for i in range (0 , len (test_indices )):
1323- arff_line = [test_indices [i ], pred_y [i ]] # row_id, cluster ID
1324- arff_datacontent .append (arff_line )
1281+ proba_y = None
13251282
13261283 else :
13271284 raise TypeError (type (task ))
13281285
1329- return arff_datacontent , arff_tracecontent , user_defined_measures , model_copy
1330-
1331- def _prediction_to_row (
1332- self ,
1333- rep_no : int ,
1334- fold_no : int ,
1335- sample_no : int ,
1336- row_id : int ,
1337- correct_label : str ,
1338- predicted_label : int ,
1339- predicted_probabilities : np .ndarray ,
1340- class_labels : List ,
1341- model_classes_mapping : List ,
1342- ) -> List :
1343- """Util function that turns probability estimates of a classifier for a
1344- given instance into the right arff format to upload to openml.
1345-
1346- Parameters
1347- ----------
1348- rep_no : int
1349- The repeat of the experiment (0-based; in case of 1 time CV,
1350- always 0)
1351- fold_no : int
1352- The fold nr of the experiment (0-based; in case of holdout,
1353- always 0)
1354- sample_no : int
1355- In case of learning curves, the index of the subsample (0-based;
1356- in case of no learning curve, always 0)
1357- row_id : int
1358- row id in the initial dataset
1359- correct_label : str
1360- original label of the instance
1361- predicted_label : str
1362- the label that was predicted
1363- predicted_probabilities : array (size=num_classes)
1364- probabilities per class
1365- class_labels : array (size=num_classes)
1366- model_classes_mapping : list
1367- A list of classes the model produced.
1368- Obtained by BaseEstimator.classes_
1369-
1370- Returns
1371- -------
1372- arff_line : list
1373- representation of the current prediction in OpenML format
1374- """
1375- if not isinstance (rep_no , (int , np .integer )):
1376- raise ValueError ('rep_no should be int' )
1377- if not isinstance (fold_no , (int , np .integer )):
1378- raise ValueError ('fold_no should be int' )
1379- if not isinstance (sample_no , (int , np .integer )):
1380- raise ValueError ('sample_no should be int' )
1381- if not isinstance (row_id , (int , np .integer )):
1382- raise ValueError ('row_id should be int' )
1383- if not len (predicted_probabilities ) == len (model_classes_mapping ):
1384- raise ValueError ('len(predicted_probabilities) != len(class_labels)' )
1385-
1386- arff_line = [rep_no , fold_no , sample_no , row_id ] # type: List[Any]
1387- for class_label_idx in range (len (class_labels )):
1388- if class_label_idx in model_classes_mapping :
1389- index = np .where (model_classes_mapping == class_label_idx )[0 ][0 ]
1390- # TODO: WHY IS THIS 2D???
1391- arff_line .append (predicted_probabilities [index ])
1392- else :
1393- arff_line .append (0.0 )
1394-
1395- arff_line .append (class_labels [predicted_label ])
1396- arff_line .append (correct_label )
1397- return arff_line
1286+ return pred_y , proba_y , user_defined_measures , model_copy
13981287
13991288 def _extract_trace_data (self , model , rep_no , fold_no ):
14001289 arff_tracecontent = []
0 commit comments