Skip to content

Commit c4a23d1

Browse files
committed
Fix #210, preventing data leakage: before every cross-validation fold, the model is cloned from an untrained version.
1 parent f4df535 commit c4a23d1

1 file changed

Lines changed: 4 additions & 3 deletions

File tree

openml/runs/functions.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import numpy as np
66
import warnings
77
import openml
8+
import sklearn
89
from sklearn.model_selection._search import BaseSearchCV
910

1011
from ..exceptions import PyOpenMLError
@@ -59,7 +60,6 @@ def run_task(task, model, avoid_duplicate_runs=True):
5960
raise PyOpenMLError("Run already exists in server. Run id(s): %s" %str(ids))
6061

6162
dataset = task.get_dataset()
62-
X, Y = dataset.get_data(target=task.target_name)
6363

6464
class_labels = task.class_labels
6565
if class_labels is None:
@@ -160,6 +160,7 @@ def _run_task_get_arffcontent(model, task, class_labels):
160160
for rep in task.iterate_repeats():
161161
fold_no = 0
162162
for fold in rep:
163+
model_fold = sklearn.base.clone(model, safe=True)
163164
train_indices, test_indices = fold
164165
trainX = X[train_indices]
165166
trainY = Y[train_indices]
@@ -174,8 +175,8 @@ def _run_task_get_arffcontent(model, task, class_labels):
174175
else:
175176
model_classes = model.classes_
176177

177-
ProbaY = model.predict_proba(testX)
178-
PredY = model.predict(testX)
178+
ProbaY = model_fold.predict_proba(testX)
179+
PredY = model_fold.predict(testX)
179180
if ProbaY.shape[1] != len(class_labels):
180181
warnings.warn("Repeat %d Fold %d: estimator only predicted for %d/%d classes!" %(rep_no, fold_no, ProbaY.shape[1], len(class_labels)))
181182

0 commit comments

Comments
 (0)