Skip to content

Commit 548dede

Browse files
authored
Fix #182: Supporting private data for sklearn models (#312)
* refactoring pytorch model to work with any other method * setting up dice-genetic for private data * updated pytorch explainer and model to work with all methods Signed-off-by: Amit Sharma <amit_sharma@live.com> * updated tf methods to work with transformers Signed-off-by: Amit Sharma <amit_sharma@live.com> * fixed some bugs with private data * added private data support for random and genetic Signed-off-by: Amit Sharma <amit_sharma@live.com> * added tests for private data Signed-off-by: Amit Sharma <amit_sharma@live.com> * updated lint error Signed-off-by: Amit Sharma <amit_sharma@live.com> Signed-off-by: Amit Sharma <amit_sharma@live.com>
1 parent 1651751 commit 548dede

11 files changed

Lines changed: 241 additions & 75 deletions

File tree

dice_ml/data_interfaces/private_data_interface.py

Lines changed: 58 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
import numpy as np
99
import pandas as pd
10+
from sklearn.preprocessing import LabelEncoder
1011

1112
from dice_ml.data_interfaces.base_data_interface import _BaseData
1213

@@ -60,6 +61,7 @@ def __init__(self, params):
6061
self._validate_and_set_mad(params=params)
6162
self._validate_and_set_permitted_range(params=params, features_dict=features_dict)
6263
self.feature_names = list(features_dict.keys())
64+
self.number_of_features = len(self.feature_names)
6365

6466
self.continuous_feature_indexes = [list(features_dict.keys()).index(
6567
name) for name in self.continuous_feature_names if name in features_dict]
@@ -91,20 +93,34 @@ def one_hot_encode_data(self, data):
9193
"""One-hot-encodes the data."""
9294
return pd.get_dummies(data, drop_first=False, columns=self.categorical_feature_names)
9395

94-
def normalize_data(self, df):
    """Normalizes continuous features to make them fall in the range [0,1].

    :param df: data to normalize. Either name-addressable (a pandas
        DataFrame or a dict keyed by feature name) or position-addressable
        (an array-like: 1-D for a single instance, 2-D for a batch).
    :returns: a copy of ``df`` with each continuous feature min-max scaled
        using ``self.permitted_range``; when a feature's permitted min and
        max coincide the result is set to 0 to avoid division by zero.
    """
    result = df.copy()
    # Use a single isinstance call with a tuple instead of an `or` chain.
    if isinstance(df, (pd.DataFrame, dict)):
        # Name-based access: scale each continuous feature by name.
        for feature_name in self.continuous_feature_names:
            max_value = self.permitted_range[feature_name][1]
            min_value = self.permitted_range[feature_name][0]
            if min_value == max_value:
                result[feature_name] = 0
            else:
                result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    else:
        # Position-based access for array-like input; cast to float so the
        # scaled fractions are not truncated by an integer dtype.
        result = result.astype('float')
        for feature_index in self.continuous_feature_indexes:
            feature_name = self.feature_names[feature_index]
            max_value = self.permitted_range[feature_name][1]
            min_value = self.permitted_range[feature_name][0]
            if len(df.shape) == 1:
                # Single instance: one scalar per feature.
                if min_value == max_value:
                    value = 0
                else:
                    value = (df[feature_index] - min_value) / (max_value - min_value)
                result[feature_index] = value
            else:
                # Batch of instances: scale the whole column at once.
                if min_value == max_value:
                    result[:, feature_index] = np.zeros(len(df[:, feature_index]))
                else:
                    result[:, feature_index] = (df[:, feature_index] - min_value) / (max_value - min_value)
    return result
109125

110126
def de_normalize_data(self, df):
@@ -161,6 +177,9 @@ def get_valid_mads(self, normalized=False, display_warnings=False, return_mads=T
161177
return mads
162178

163179
def get_features_range(self, permitted_range_input=None, features_dict=None):
180+
if features_dict is None:
181+
features_dict = self.permitted_range
182+
164183
ranges = {}
165184
# Getting default ranges based on the dataset
166185
for feature in features_dict:
@@ -241,6 +260,13 @@ def get_indexes_of_features_to_vary(self, features_to_vary='all'):
241260
ixs.append(colidx)
242261
return ixs
243262

263+
def fit_label_encoders(self):
    """Fits one LabelEncoder per categorical feature.

    Each encoder is fit on the feature's permitted levels, since the
    private data interface has no training dataframe to fit against.
    Returns a dict mapping feature name to its fitted encoder.
    """
    # LabelEncoder.fit returns the encoder itself, so the comprehension
    # stores the fitted encoder directly.
    return {
        column: LabelEncoder().fit(self.permitted_range[column])
        for column in self.categorical_feature_names
    }
269+
244270
def from_label(self, data):
245271
"""Transforms label encoded data back to categorical values"""
246272
out = data.copy()
@@ -267,7 +293,7 @@ def from_dummies(self, data, prefix_sep='_'):
267293
def get_decimal_precisions(self, output_type="list"):
268294
""""Gets the precision of continuous features in the data."""
269295
precisions_dict = defaultdict(int)
270-
precisions = [0]*len(self.continuous_feature_names)
296+
precisions = [0]*len(self.feature_names)
271297
for ix, feature_name in enumerate(self.continuous_feature_names):
272298
type_prec = self.type_and_precision[feature_name]
273299
if type_prec == 'int':
@@ -357,7 +383,7 @@ def get_ohe_min_max_normalized_data(self, query_instance):
357383
temp = self.one_hot_encode_data(temp)
358384
temp = temp.tail(query_instance.shape[0]).reset_index(drop=True)
359385
# returns a pandas dataframe
360-
return self.normalize_data(temp)
386+
return self.normalize_data(temp).apply(pd.to_numeric)
361387

362388
def get_inverse_ohe_min_max_normalized_data(self, transformed_data):
363389
"""Transforms one-hot-encoded and min-max normalized data into raw user-fed data format. transformed_data
@@ -370,3 +396,21 @@ def get_inverse_ohe_min_max_normalized_data(self, transformed_data):
370396
raw_data = raw_data[self.feature_names]
371397
# returns a pandas dataframe
372398
return raw_data
399+
400+
def get_all_dummy_colnames(self):
    """Returns the column names produced by one-hot encoding all features.

    Builds a small synthetic dataframe from ``self.permitted_range`` —
    every column padded (by cyclic repetition) to the length of the
    longest level list so the dataframe is rectangular — and returns the
    columns of its one-hot encoding.
    """
    # max over a generator of values: no throwaway list, and the unused
    # column names are not iterated.
    max_vals = max(len(val) for val in self.permitted_range.values())
    sample_data_dict = {col: np.resize(val, max_vals) for col, val in self.permitted_range.items()}
    sample_df = pd.DataFrame(sample_data_dict)
    return pd.get_dummies(sample_df).columns
405+
406+
def get_valid_feature_range(self, feature_range_input, normalized=True):
    """Gets the min/max value of features in normalized or de-normalized
    form. Assumes that all features are already encoded to numerical form
    such that the number of features remains the same.

    # TODO needs work to adhere to the label-encoded max and to support
    permitted_range for both continuous and discrete features when
    provided in _generate_counterfactuals.
    """
    # Guard clause: the de-normalized form is passed through unchanged.
    if not normalized:
        return feature_range_input
    raise NotImplementedError("Normalized feature range not supported for private data interface")

dice_ml/data_interfaces/public_data_interface.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
import numpy as np
88
import pandas as pd
9+
from sklearn.preprocessing import LabelEncoder
910

1011
from dice_ml.data_interfaces.base_data_interface import _BaseData
1112
from dice_ml.utils.exception import (SystemException,
@@ -338,6 +339,13 @@ def get_indexes_of_features_to_vary(self, features_to_vary='all'):
338339
ixs.append(colidx)
339340
return ixs
340341

342+
def fit_label_encoders(self):
    """Fits one LabelEncoder per categorical feature on the training data.

    Returns a dict mapping each categorical feature name to an encoder
    fit on that feature's column of ``self.data_df``.
    """
    # LabelEncoder.fit returns the encoder itself, so the comprehension
    # stores the fitted encoder directly.
    return {
        column: LabelEncoder().fit(self.data_df[column])
        for column in self.categorical_feature_names
    }
348+
341349
def from_label(self, data):
342350
"""Transforms label encoded data back to categorical values"""
343351
out = data.copy()
@@ -488,3 +496,6 @@ def get_inverse_ohe_min_max_normalized_data(self, transformed_data):
488496
raw_data = raw_data[self.feature_names]
489497
# returns a pandas dataframe
490498
return raw_data
499+
500+
def get_all_dummy_colnames(self):
    """Returns the column names after one-hot encoding all input features."""
    feature_df = self.data_df[self.feature_names]
    return pd.get_dummies(feature_df).columns

dice_ml/dice.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def decide_implementation_type(self, data_interface, model_interface, method, **
2525
if model_interface.backend == BackEndTypes.Sklearn:
2626
if method == SamplingStrategy.KdTree and isinstance(data_interface, PrivateData):
2727
raise UserConfigValidationException(
28-
'Private data interface is not supported with sklearn kdtree explainer'
28+
'Private data interface is not supported with kdtree explainer'
2929
' since kdtree explainer needs access to entire training data')
3030
self.__class__ = decide(model_interface, method)
3131
self.__init__(data_interface, model_interface, **kwargs)

dice_ml/explainer_interfaces/dice_genetic.py

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,11 @@
88

99
import numpy as np
1010
import pandas as pd
11-
from sklearn.preprocessing import LabelEncoder
1211

1312
from dice_ml import diverse_counterfactuals as exp
1413
from dice_ml.constants import ModelTypes
1514
from dice_ml.explainer_interfaces.explainer_base import ExplainerBase
15+
from dice_ml.utils.exception import UserConfigValidationException
1616

1717

1818
class DiceGenetic(ExplainerBase):
@@ -24,6 +24,7 @@ def __init__(self, data_interface, model_interface):
2424
:param model_interface: an interface class to access trained ML model.
2525
"""
2626
super().__init__(data_interface, model_interface) # initiating data related parameters
27+
self.num_output_nodes = None
2728

2829
# variables required to generate CFs - see generate_counterfactuals() for more info
2930
self.cfs = []
@@ -33,15 +34,7 @@ def __init__(self, data_interface, model_interface):
3334
self.feature_weights_input = ''
3435

3536
# Initializing a label encoder to obtain label-encoded values for categorical variables
36-
self.labelencoder = {}
37-
38-
self.label_encoded_data = self.data_interface.data_df.copy()
39-
40-
for column in self.data_interface.categorical_feature_names:
41-
self.labelencoder[column] = LabelEncoder()
42-
self.label_encoded_data[column] = self.labelencoder[column].fit_transform(
43-
self.data_interface.data_df[column])
44-
37+
self.labelencoder = self.data_interface.fit_label_encoders()
4538
self.predicted_outcome_name = self.data_interface.outcome_name + '_pred'
4639

4740
def update_hyperparameters(self, proximity_weight, sparsity_weight,
@@ -61,7 +54,6 @@ def do_loss_initializations(self, yloss_type, diversity_loss_type, feature_weigh
6154
# define the loss parts
6255
self.yloss_type = yloss_type
6356
self.diversity_loss_type = diversity_loss_type
64-
6557
# define feature weights
6658
if feature_weights != self.feature_weights_input:
6759
self.feature_weights_input = feature_weights
@@ -83,8 +75,8 @@ def do_loss_initializations(self, yloss_type, diversity_loss_type, feature_weigh
8375
if feature in feature_weights:
8476
feature_weights_list.append(feature_weights[feature])
8577
else:
86-
# TODO: why is the weight the max value of the encoded feature
87-
feature_weights_list.append(self.label_encoded_data[feature].max())
78+
# the weight is inversely proportional to max value
79+
feature_weights_list.append(round(1 / self.feature_range[feature].max(), 2))
8880
self.feature_weights_list = [feature_weights_list]
8981

9082
def do_random_init(self, num_inits, features_to_vary, query_instance, desired_class, desired_range):
@@ -255,6 +247,11 @@ def _generate_counterfactuals(self, query_instance, total_CFs, initialization="k
255247
(see diverse_counterfactuals.py).
256248
"""
257249

250+
if not hasattr(self.data_interface, 'data_df') and initialization == "kdtree":
251+
raise UserConfigValidationException(
252+
"kd-tree initialization is not supported for private data"
253+
" interface because training data to build kd-tree is not available.")
254+
258255
self.population_size = 10 * total_CFs
259256

260257
self.start_time = timeit.default_timer()
@@ -284,7 +281,7 @@ def _generate_counterfactuals(self, query_instance, total_CFs, initialization="k
284281
desired_class = self.misc_init(stopping_threshold, desired_class, desired_range, test_pred)
285282

286283
query_instance_df_dummies = pd.get_dummies(query_instance_orig)
287-
for col in pd.get_dummies(self.data_interface.data_df[self.data_interface.feature_names]).columns:
284+
for col in self.data_interface.get_all_dummy_colnames():
288285
if col not in query_instance_df_dummies.columns:
289286
query_instance_df_dummies[col] = 0
290287

dice_ml/explainer_interfaces/dice_random.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,8 @@ class of query_instance for binary classification.
173173
posthoc_sparsity_param,
174174
posthoc_sparsity_algorithm,
175175
limit_steps_ls)
176+
elif self.final_cfs is not None:
177+
final_cfs_df_sparse = final_cfs_df.copy()
176178
else:
177179
final_cfs_df_sparse = None
178180

@@ -205,10 +207,6 @@ def get_samples(self, fixed_features_values, feature_range, sampling_random_seed
205207
# first get required parameters
206208
precisions = self.data_interface.get_decimal_precisions(output_type="dict")
207209

208-
categorical_features_frequencies = {}
209-
for feature in self.data_interface.categorical_feature_names:
210-
categorical_features_frequencies[feature] = len(self.data_interface.data_df[feature].value_counts())
211-
212210
if sampling_random_seed is not None:
213211
random.seed(sampling_random_seed)
214212

dice_ml/utils/helpers.py

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,16 @@
22
This module contains helper functions to load data and get metadata.
33
"""
44
import os
5+
import pickle
56
import shutil
67

78
import numpy as np
89
import pandas as pd
10+
from sklearn.compose import ColumnTransformer
11+
from sklearn.ensemble import RandomForestClassifier
912
from sklearn.model_selection import train_test_split
10-
from sklearn.preprocessing import FunctionTransformer
13+
from sklearn.pipeline import Pipeline
14+
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
1115

1216
import dice_ml
1317

@@ -84,6 +88,31 @@ def load_adult_income_dataset(only_train=True):
8488
return adult_data
8589

8690

91+
def save_adult_income_model(modelpath, test_fraction=0.2, random_state=0):
    """Trains a RandomForest pipeline on the adult income dataset and pickles it.

    :param modelpath: file path where the fitted sklearn Pipeline is written.
    :param test_fraction: fraction of the data held out from training.
    :param random_state: seed for the stratified train/test split.
    """
    dataset = load_adult_income_dataset()
    target = dataset["income"]
    # Stratified split; only the training portion is used to fit the model,
    # so the held-out pieces are discarded.
    train_dataset, _, y_train, _ = train_test_split(dataset,
                                                    target,
                                                    test_size=test_fraction,
                                                    random_state=random_state,
                                                    stratify=target)
    x_train = train_dataset.drop('income', axis=1)
    numerical = ["age", "hours_per_week"]
    categorical = x_train.columns.difference(numerical)

    # One-hot encode categoricals; categories unseen at fit time are ignored
    # at predict time instead of raising.
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    transformations = ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, categorical)])

    clf = Pipeline(steps=[('preprocessor', transformations),
                          ('classifier', RandomForestClassifier())])
    model = clf.fit(x_train, y_train)
    # Context manager guarantees the file handle is closed even if dump
    # fails (the original left the handle from open() unclosed).
    with open(modelpath, 'wb') as f:
        pickle.dump(model, f)
114+
115+
87116
def load_custom_testing_dataset():
88117
data = [['a', 10, 0], ['b', 10000, 0], ['c', 14, 0], ['a', 88, 0], ['c', 14, 0]]
89118
return pd.DataFrame(data, columns=['Categorical', 'Numerical', 'Outcome'])
@@ -116,7 +145,7 @@ def load_custom_testing_dataset_regression():
116145

117146
def get_adult_income_modelpath(backend='TF1'):
    """Returns the path of the bundled pre-trained adult-income model for `backend`."""
    # Pick the file extension matching the backend's serialization format:
    # TensorFlow -> .h5, PyTorch -> .pth, anything else (sklearn) -> .pkl.
    if 'TF' in backend:
        model_ext = '.h5'
    elif backend == 'PYT':
        model_ext = '.pth'
    else:
        model_ext = '.pkl'
    pkg_path = dice_ml.__path__[0]
    return os.path.join(pkg_path, 'utils', 'sample_trained_models', 'adult' + model_ext)
122151

11.3 MB
Binary file not shown.

dice_ml/utils/serialize.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
class DummyDataInterface:
    """Minimal stand-in data interface holding an outcome name and an optional dataframe."""

    def __init__(self, outcome_name, data_df=None):
        self.outcome_name = outcome_name
        # The default already yields None when no dataframe is supplied, so
        # the original's separate `self.data_df = None` pre-assignment
        # followed by a conditional re-assignment was redundant.
        self.data_df = data_df
67

0 commit comments

Comments
 (0)