@@ -1099,11 +1099,11 @@ def _run_model_on_fold(
10991099 model : Any ,
11001100 task : 'OpenMLTask' ,
11011101 X_train : Union [np .ndarray , scipy .sparse .spmatrix , pd .DataFrame ],
1102- y_train : np .ndarray ,
11031102 rep_no : int ,
11041103 fold_no : int ,
1104+ y_train : Optional [np .ndarray ] = None ,
11051105 X_test : Optional [Union [np .ndarray , scipy .sparse .spmatrix , pd .DataFrame ]] = None ,
1106- n_classes : Optional [int ] = None ,
1106+ classes : Optional [int ] = None ,
11071107 ) -> Tuple [np .ndarray , np .ndarray , 'OrderedDict[str, float]' , Any ]:
11081108 """Run a model on a repeat,fold,subsample triplet of the task and return prediction
11091109 information.
@@ -1156,7 +1156,7 @@ def _run_model_on_fold(
11561156
11571157 def _prediction_to_probabilities (
11581158 y : np .ndarray ,
1159- model_classes : List ,
1159+ classes : List ,
11601160 ) -> np .ndarray :
11611161 """Transforms predicted probabilities to match with OpenML class indices.
11621162
@@ -1175,13 +1175,12 @@ def _prediction_to_probabilities(
11751175 # y: list or numpy array of predictions
11761176 # model_classes: sklearn classifier mapping from original array id to
11771177 # prediction index id
1178- if not isinstance (model_classes , list ):
1178+ if not isinstance (classes , list ):
11791179 raise ValueError ('please convert model classes to list prior to '
11801180 'calling this fn' )
1181- result = np .zeros ((len (y ), len (model_classes )), dtype = np .float32 )
1181+ result = np .zeros ((len (y ), len (classes )), dtype = np .float32 )
11821182 for obs , prediction_idx in enumerate (y ):
1183- array_idx = model_classes .index (prediction_idx )
1184- result [obs ][array_idx ] = 1.0
1183+ result [obs ][prediction_idx ] = 1.0
11851184 return result
11861185
11871186 # TODO: if possible, give a warning if model is already fitted (acceptable
@@ -1239,7 +1238,12 @@ def _prediction_to_probabilities(
12391238
12401239 # In supervised learning this returns the predictions for Y, in clustering
12411240 # it returns the clusters
1242- pred_y = model_copy .predict (X_test )
1241+ if isinstance (task , OpenMLSupervisedTask ):
1242+ pred_y = model_copy .predict (X_test )
1243+ elif isinstance (task , OpenMLClusteringTask ):
1244+ pred_y = model_copy .predict (X_train )
1245+ else :
1246+ raise ValueError (task )
12431247
12441248 if can_measure_cputime :
12451249 modelpredict_duration_cputime = (time .process_time ()
@@ -1258,13 +1262,18 @@ def _prediction_to_probabilities(
12581262 try :
12591263 proba_y = model_copy .predict_proba (X_test )
12601264 except AttributeError :
1261- proba_y = _prediction_to_probabilities (pred_y , list (model_classes ))
1262-
1263- pred_y = np .array ([model_classes [label ] for label in pred_y ], dtype = pred_y .dtype )
1264- proba_y_new = np .zeros ((proba_y .shape [0 ], n_classes ))
1265- for idx , class_idx in enumerate (model_classes ):
1266- proba_y_new [:, class_idx ] = proba_y [:, idx ]
1267- proba_y = proba_y_new
1265+ proba_y = _prediction_to_probabilities (pred_y , list (classes ))
1266+
1267+ if proba_y .shape [1 ] != len (classes ):
1268+ # Remap the probabilities in case there was a class missing at training time
1269+ # By default, the classification targets are mapped to be zero-based indices to the
1270+ # actual classes. Therefore, the model_classes contain the correct indices to the
1271+ # correct probability array (the actual array might be incorrect if there are some
1272+ # classes not present during train time).
1273+ proba_y_new = np .zeros ((proba_y .shape [0 ], len (classes )))
1274+ for idx , model_class in enumerate (model_classes ):
1275+ proba_y_new [:, model_class ] = proba_y [:, idx ]
1276+ proba_y = proba_y_new
12681277
12691278 if proba_y .shape [1 ] != len (task .class_labels ):
12701279 message = "Estimator only predicted for {}/{} classes!" .format (
0 commit comments