Skip to content

Commit 548dede

Browse files
authored
Fix #182: Supporting private data for sklearn models (#312)
* refactoring pytorch model to work with any other method * setting up dice-genetic for private data * updated pytorch explainer and model to work with all methods Signed-off-by: Amit Sharma <amit_sharma@live.com> * updated tf methods to work with transformers Signed-off-by: Amit Sharma <amit_sharma@live.com> * fixed some bugs with private data * added private data support for random and genetic Signed-off-by: Amit Sharma <amit_sharma@live.com> * added tests for private data Signed-off-by: Amit Sharma <amit_sharma@live.com> * updated lint error Signed-off-by: Amit Sharma <amit_sharma@live.com> Signed-off-by: Amit Sharma <amit_sharma@live.com>
1 parent 1651751 commit 548dede

11 files changed

Lines changed: 241 additions & 75 deletions

File tree

dice_ml/data_interfaces/private_data_interface.py

Lines changed: 58 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
import numpy as np
99
import pandas as pd
10+
from sklearn.preprocessing import LabelEncoder
1011

1112
from dice_ml.data_interfaces.base_data_interface import _BaseData
1213

@@ -60,6 +61,7 @@ def __init__(self, params):
6061
self._validate_and_set_mad(params=params)
6162
self._validate_and_set_permitted_range(params=params, features_dict=features_dict)
6263
self.feature_names = list(features_dict.keys())
64+
self.number_of_features = len(self.feature_names)
6365

6466
self.continuous_feature_indexes = [list(features_dict.keys()).index(
6567
name) for name in self.continuous_feature_names if name in features_dict]
@@ -91,20 +93,34 @@ def one_hot_encode_data(self, data):
9193
"""One-hot-encodes the data."""
9294
return pd.get_dummies(data, drop_first=False, columns=self.categorical_feature_names)
9395

94-
def normalize_data(self, df):
    """Normalizes continuous features to make them fall in the range [0,1].

    :param df: data to normalize. Either name-addressable (a pandas
        DataFrame or a dict keyed by feature name) or position-addressable
        (an array-like: 1-D for a single instance, 2-D for a batch).
    :returns: a copy of ``df`` with each continuous feature min-max scaled
        using ``self.permitted_range``; when a feature's permitted min and
        max coincide the result is set to 0 to avoid division by zero.
    """
    result = df.copy()
    # Use a single isinstance call with a tuple instead of an `or` chain.
    if isinstance(df, (pd.DataFrame, dict)):
        # Name-based access: scale each continuous feature by name.
        for feature_name in self.continuous_feature_names:
            max_value = self.permitted_range[feature_name][1]
            min_value = self.permitted_range[feature_name][0]
            if min_value == max_value:
                result[feature_name] = 0
            else:
                result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    else:
        # Position-based access for array-like input; cast to float so the
        # scaled fractions are not truncated by an integer dtype.
        result = result.astype('float')
        for feature_index in self.continuous_feature_indexes:
            feature_name = self.feature_names[feature_index]
            max_value = self.permitted_range[feature_name][1]
            min_value = self.permitted_range[feature_name][0]
            if len(df.shape) == 1:
                # Single instance: one scalar per feature.
                if min_value == max_value:
                    value = 0
                else:
                    value = (df[feature_index] - min_value) / (max_value - min_value)
                result[feature_index] = value
            else:
                # Batch of instances: scale the whole column at once.
                if min_value == max_value:
                    result[:, feature_index] = np.zeros(len(df[:, feature_index]))
                else:
                    result[:, feature_index] = (df[:, feature_index] - min_value) / (max_value - min_value)
    return result
109125

110126
def de_normalize_data(self, df):
@@ -161,6 +177,9 @@ def get_valid_mads(self, normalized=False, display_warnings=False, return_mads=T
161177
return mads
162178

163179
def get_features_range(self, permitted_range_input=None, features_dict=None):
180+
if features_dict is None:
181+
features_dict = self.permitted_range
182+
164183
ranges = {}
165184
# Getting default ranges based on the dataset
166185
for feature in features_dict:
@@ -241,6 +260,13 @@ def get_indexes_of_features_to_vary(self, features_to_vary='all'):
241260
ixs.append(colidx)
242261
return ixs
243262

263+
def fit_label_encoders(self):
    """Fits one LabelEncoder per categorical feature.

    Each encoder is fit on the feature's permitted levels, since the
    private data interface has no training dataframe to fit against.
    Returns a dict mapping feature name to its fitted encoder.
    """
    # LabelEncoder.fit returns the encoder itself, so the comprehension
    # stores the fitted encoder directly.
    return {
        column: LabelEncoder().fit(self.permitted_range[column])
        for column in self.categorical_feature_names
    }
269+
244270
def from_label(self, data):
245271
"""Transforms label encoded data back to categorical values"""
246272
out = data.copy()
@@ -267,7 +293,7 @@ def from_dummies(self, data, prefix_sep='_'):
267293
def get_decimal_precisions(self, output_type="list"):
268294
""""Gets the precision of continuous features in the data."""
269295
precisions_dict = defaultdict(int)
270-
precisions = [0]*len(self.continuous_feature_names)
296+
precisions = [0]*len(self.feature_names)
271297
for ix, feature_name in enumerate(self.continuous_feature_names):
272298
type_prec = self.type_and_precision[feature_name]
273299
if type_prec == 'int':
@@ -357,7 +383,7 @@ def get_ohe_min_max_normalized_data(self, query_instance):
357383
temp = self.one_hot_encode_data(temp)
358384
temp = temp.tail(query_instance.shape[0]).reset_index(drop=True)
359385
# returns a pandas dataframe
360-
return self.normalize_data(temp)
386+
return self.normalize_data(temp).apply(pd.to_numeric)
361387

362388
def get_inverse_ohe_min_max_normalized_data(self, transformed_data):
363389
"""Transforms one-hot-encoded and min-max normalized data into raw user-fed data format. transformed_data
@@ -370,3 +396,21 @@ def get_inverse_ohe_min_max_normalized_data(self, transformed_data):
370396
raw_data = raw_data[self.feature_names]
371397
# returns a pandas dataframe
372398
return raw_data
399+
400+
def get_all_dummy_colnames(self):
    """Returns the column names produced by one-hot encoding all features.

    Builds a small synthetic dataframe from ``self.permitted_range`` —
    every column padded (by cyclic repetition) to the length of the
    longest level list so the dataframe is rectangular — and returns the
    columns of its one-hot encoding.
    """
    # max over a generator of values: no throwaway list, and the unused
    # column names are not iterated.
    max_vals = max(len(val) for val in self.permitted_range.values())
    sample_data_dict = {col: np.resize(val, max_vals) for col, val in self.permitted_range.items()}
    sample_df = pd.DataFrame(sample_data_dict)
    return pd.get_dummies(sample_df).columns
405+
406+
def get_valid_feature_range(self, feature_range_input, normalized=True):
    """Gets the min/max value of features in normalized or de-normalized
    form. Assumes that all features are already encoded to numerical form
    such that the number of features remains the same.

    # TODO needs work to adhere to the label-encoded max and to support
    permitted_range for both continuous and discrete features when
    provided in _generate_counterfactuals.
    """
    # Guard clause: the de-normalized form is passed through unchanged.
    if not normalized:
        return feature_range_input
    raise NotImplementedError("Normalized feature range not supported for private data interface")

dice_ml/data_interfaces/public_data_interface.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
import numpy as np
88
import pandas as pd
9+
from sklearn.preprocessing import LabelEncoder
910

1011
from dice_ml.data_interfaces.base_data_interface import _BaseData
1112
from dice_ml.utils.exception import (SystemException,
@@ -338,6 +339,13 @@ def get_indexes_of_features_to_vary(self, features_to_vary='all'):
338339
ixs.append(colidx)
339340
return ixs
340341

342+
def fit_label_encoders(self):
    """Fits one LabelEncoder per categorical feature on the training data.

    Returns a dict mapping each categorical feature name to an encoder
    fit on that feature's column of ``self.data_df``.
    """
    # LabelEncoder.fit returns the encoder itself, so the comprehension
    # stores the fitted encoder directly.
    return {
        column: LabelEncoder().fit(self.data_df[column])
        for column in self.categorical_feature_names
    }
348+
341349
def from_label(self, data):
342350
"""Transforms label encoded data back to categorical values"""
343351
out = data.copy()
@@ -488,3 +496,6 @@ def get_inverse_ohe_min_max_normalized_data(self, transformed_data):
488496
raw_data = raw_data[self.feature_names]
489497
# returns a pandas dataframe
490498
return raw_data
499+
500+
def get_all_dummy_colnames(self):
    """Returns the column names after one-hot encoding all input features."""
    feature_df = self.data_df[self.feature_names]
    return pd.get_dummies(feature_df).columns

dice_ml/dice.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def decide_implementation_type(self, data_interface, model_interface, method, **
2525
if model_interface.backend == BackEndTypes.Sklearn:
2626
if method == SamplingStrategy.KdTree and isinstance(data_interface, PrivateData):
2727
raise UserConfigValidationException(
28-
'Private data interface is not supported with sklearn kdtree explainer'
28+
'Private data interface is not supported with kdtree explainer'
2929
' since kdtree explainer needs access to entire training data')
3030
self.__class__ = decide(model_interface, method)
3131
self.__init__(data_interface, model_interface, **kwargs)

dice_ml/explainer_interfaces/dice_genetic.py

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,11 @@
88

99
import numpy as np
1010
import pandas as pd
11-
from sklearn.preprocessing import LabelEncoder
1211

1312
from dice_ml import diverse_counterfactuals as exp
1413
from dice_ml.constants import ModelTypes
1514
from dice_ml.explainer_interfaces.explainer_base import ExplainerBase
15+
from dice_ml.utils.exception import UserConfigValidationException
1616

1717

1818
class DiceGenetic(ExplainerBase):
@@ -24,6 +24,7 @@ def __init__(self, data_interface, model_interface):
2424
:param model_interface: an interface class to access trained ML model.
2525
"""
2626
super().__init__(data_interface, model_interface) # initiating data related parameters
27+
self.num_output_nodes = None
2728

2829
# variables required to generate CFs - see generate_counterfactuals() for more info
2930
self.cfs = []
@@ -33,15 +34,7 @@ def __init__(self, data_interface, model_interface):
3334
self.feature_weights_input = ''
3435

3536
# Initializing a label encoder to obtain label-encoded values for categorical variables
36-
self.labelencoder = {}
37-
38-
self.label_encoded_data = self.data_interface.data_df.copy()
39-
40-
for column in self.data_interface.categorical_feature_names:
41-
self.labelencoder[column] = LabelEncoder()
42-
self.label_encoded_data[column] = self.labelencoder[column].fit_transform(
43-
self.data_interface.data_df[column])
44-
37+
self.labelencoder = self.data_interface.fit_label_encoders()
4538
self.predicted_outcome_name = self.data_interface.outcome_name + '_pred'
4639

4740
def update_hyperparameters(self, proximity_weight, sparsity_weight,
@@ -61,7 +54,6 @@ def do_loss_initializations(self, yloss_type, diversity_loss_type, feature_weigh
6154
# define the loss parts
6255
self.yloss_type = yloss_type
6356
self.diversity_loss_type = diversity_loss_type
64-
6557
# define feature weights
6658
if feature_weights != self.feature_weights_input:
6759
self.feature_weights_input = feature_weights
@@ -83,8 +75,8 @@ def do_loss_initializations(self, yloss_type, diversity_loss_type, feature_weigh
8375
if feature in feature_weights:
8476
feature_weights_list.append(feature_weights[feature])
8577
else:
86-
# TODO: why is the weight the max value of the encoded feature
87-
feature_weights_list.append(self.label_encoded_data[feature].max())
78+
# the weight is inversely proportional to max value
79+
feature_weights_list.append(round(1 / self.feature_range[feature].max(), 2))
8880
self.feature_weights_list = [feature_weights_list]
8981

9082
def do_random_init(self, num_inits, features_to_vary, query_instance, desired_class, desired_range):
@@ -255,6 +247,11 @@ def _generate_counterfactuals(self, query_instance, total_CFs, initialization="k
255247
(see diverse_counterfactuals.py).
256248
"""
257249

250+
if not hasattr(self.data_interface, 'data_df') and initialization == "kdtree":
251+
raise UserConfigValidationException(
252+
"kd-tree initialization is not supported for private data"
253+
" interface because training data to build kd-tree is not available.")
254+
258255
self.population_size = 10 * total_CFs
259256

260257
self.start_time = timeit.default_timer()
@@ -284,7 +281,7 @@ def _generate_counterfactuals(self, query_instance, total_CFs, initialization="k
284281
desired_class = self.misc_init(stopping_threshold, desired_class, desired_range, test_pred)
285282

286283
query_instance_df_dummies = pd.get_dummies(query_instance_orig)
287-
for col in pd.get_dummies(self.data_interface.data_df[self.data_interface.feature_names]).columns:
284+
for col in self.data_interface.get_all_dummy_colnames():
288285
if col not in query_instance_df_dummies.columns:
289286
query_instance_df_dummies[col] = 0
290287

dice_ml/explainer_interfaces/dice_random.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,8 @@ class of query_instance for binary classification.
173173
posthoc_sparsity_param,
174174
posthoc_sparsity_algorithm,
175175
limit_steps_ls)
176+
elif self.final_cfs is not None:
177+
final_cfs_df_sparse = final_cfs_df.copy()
176178
else:
177179
final_cfs_df_sparse = None
178180

@@ -205,10 +207,6 @@ def get_samples(self, fixed_features_values, feature_range, sampling_random_seed
205207
# first get required parameters
206208
precisions = self.data_interface.get_decimal_precisions(output_type="dict")
207209

208-
categorical_features_frequencies = {}
209-
for feature in self.data_interface.categorical_feature_names:
210-
categorical_features_frequencies[feature] = len(self.data_interface.data_df[feature].value_counts())
211-
212210
if sampling_random_seed is not None:
213211
random.seed(sampling_random_seed)
214212

dice_ml/utils/helpers.py

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,16 @@
22
This module contains helper functions to load data and get metadata.
33
"""
44
import os
5+
import pickle
56
import shutil
67

78
import numpy as np
89
import pandas as pd
10+
from sklearn.compose import ColumnTransformer
11+
from sklearn.ensemble import RandomForestClassifier
912
from sklearn.model_selection import train_test_split
10-
from sklearn.preprocessing import FunctionTransformer
13+
from sklearn.pipeline import Pipeline
14+
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
1115

1216
import dice_ml
1317

@@ -84,6 +88,31 @@ def load_adult_income_dataset(only_train=True):
8488
return adult_data
8589

8690

91+
def save_adult_income_model(modelpath, test_fraction=0.2, random_state=0):
    """Trains a RandomForest pipeline on the adult income dataset and pickles it.

    :param modelpath: file path where the fitted sklearn Pipeline is written.
    :param test_fraction: fraction of the data held out from training.
    :param random_state: seed for the stratified train/test split.
    """
    dataset = load_adult_income_dataset()
    target = dataset["income"]
    # Stratified split; only the training portion is used to fit the model,
    # so the held-out pieces are discarded.
    train_dataset, _, y_train, _ = train_test_split(dataset,
                                                    target,
                                                    test_size=test_fraction,
                                                    random_state=random_state,
                                                    stratify=target)
    x_train = train_dataset.drop('income', axis=1)
    numerical = ["age", "hours_per_week"]
    categorical = x_train.columns.difference(numerical)

    # One-hot encode categoricals; categories unseen at fit time are ignored
    # at predict time instead of raising.
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    transformations = ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, categorical)])

    clf = Pipeline(steps=[('preprocessor', transformations),
                          ('classifier', RandomForestClassifier())])
    model = clf.fit(x_train, y_train)
    # Context manager guarantees the file handle is closed even if dump
    # fails (the original left the handle from open() unclosed).
    with open(modelpath, 'wb') as f:
        pickle.dump(model, f)
114+
115+
87116
def load_custom_testing_dataset():
88117
data = [['a', 10, 0], ['b', 10000, 0], ['c', 14, 0], ['a', 88, 0], ['c', 14, 0]]
89118
return pd.DataFrame(data, columns=['Categorical', 'Numerical', 'Outcome'])
@@ -116,7 +145,7 @@ def load_custom_testing_dataset_regression():
116145

117146
def get_adult_income_modelpath(backend='TF1'):
    """Returns the path of the bundled pre-trained adult-income model for `backend`."""
    # Pick the file extension matching the backend's serialization format:
    # TensorFlow -> .h5, PyTorch -> .pth, anything else (sklearn) -> .pkl.
    if 'TF' in backend:
        model_ext = '.h5'
    elif backend == 'PYT':
        model_ext = '.pth'
    else:
        model_ext = '.pkl'
    pkg_path = dice_ml.__path__[0]
    return os.path.join(pkg_path, 'utils', 'sample_trained_models', 'adult' + model_ext)
122151

11.3 MB
Binary file not shown.

dice_ml/utils/serialize.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
class DummyDataInterface:
    """Minimal stand-in data interface holding an outcome name and an optional dataframe."""

    def __init__(self, outcome_name, data_df=None):
        self.outcome_name = outcome_name
        # The default already yields None when no dataframe is supplied, so
        # the original's separate `self.data_df = None` pre-assignment
        # followed by a conditional re-assignment was redundant.
        self.data_df = data_df
67

0 commit comments

Comments
 (0)