77
88import numpy as np
99import pandas as pd
10+ from sklearn .preprocessing import LabelEncoder
1011
1112from dice_ml .data_interfaces .base_data_interface import _BaseData
1213
@@ -60,6 +61,7 @@ def __init__(self, params):
6061 self ._validate_and_set_mad (params = params )
6162 self ._validate_and_set_permitted_range (params = params , features_dict = features_dict )
6263 self .feature_names = list (features_dict .keys ())
64+ self .number_of_features = len (self .feature_names )
6365
6466 self .continuous_feature_indexes = [list (features_dict .keys ()).index (
6567 name ) for name in self .continuous_feature_names if name in features_dict ]
@@ -91,20 +93,34 @@ def one_hot_encode_data(self, data):
9193 """One-hot-encodes the data."""
9294 return pd .get_dummies (data , drop_first = False , columns = self .categorical_feature_names )
9395
def normalize_data(self, df):
    """Scale continuous features into [0, 1] using each feature's permitted range.

    Accepts either a pandas DataFrame / dict keyed by feature name, or a raw
    numpy array (1-D single instance or 2-D batch) indexed by column position.
    A degenerate range (min == max) maps the feature to 0 to avoid dividing
    by zero. Categorical features are left untouched.
    """
    result = df.copy()
    if isinstance(df, (pd.DataFrame, dict)):
        # Name-based access for tabular / dict inputs.
        for name in self.continuous_feature_names:
            bounds = self.permitted_range[name]
            low, high = bounds[0], bounds[1]
            if low == high:
                result[name] = 0
            else:
                result[name] = (df[name] - low) / (high - low)
        return result

    # Positional access for numpy arrays; cast so scaled values fit.
    result = result.astype('float')
    for idx in self.continuous_feature_indexes:
        bounds = self.permitted_range[self.feature_names[idx]]
        low, high = bounds[0], bounds[1]
        if df.ndim == 1:
            # Single instance: one scalar per feature slot.
            result[idx] = 0 if low == high else (df[idx] - low) / (high - low)
        else:
            # Batch: scale the whole column at once.
            column = df[:, idx]
            if low == high:
                result[:, idx] = np.zeros(len(column))
            else:
                result[:, idx] = (column - low) / (high - low)
    return result
109125
110126 def de_normalize_data (self , df ):
@@ -161,6 +177,9 @@ def get_valid_mads(self, normalized=False, display_warnings=False, return_mads=T
161177 return mads
162178
163179 def get_features_range (self , permitted_range_input = None , features_dict = None ):
180+ if features_dict is None :
181+ features_dict = self .permitted_range
182+
164183 ranges = {}
165184 # Getting default ranges based on the dataset
166185 for feature in features_dict :
@@ -241,6 +260,13 @@ def get_indexes_of_features_to_vary(self, features_to_vary='all'):
241260 ixs .append (colidx )
242261 return ixs
243262
def fit_label_encoders(self):
    """Fit one sklearn LabelEncoder per categorical feature.

    Each encoder is trained on that feature's permitted (allowed) category
    values; the fitted encoders are returned in a dict keyed by feature name.
    """
    encoders = {}
    for feature in self.categorical_feature_names:
        encoder = LabelEncoder()
        encoder.fit(self.permitted_range[feature])
        encoders[feature] = encoder
    return encoders
269+
244270 def from_label (self , data ):
245271 """Transforms label encoded data back to categorical values"""
246272 out = data .copy ()
@@ -267,7 +293,7 @@ def from_dummies(self, data, prefix_sep='_'):
267293 def get_decimal_precisions (self , output_type = "list" ):
268294 """"Gets the precision of continuous features in the data."""
269295 precisions_dict = defaultdict (int )
270- precisions = [0 ]* len (self .continuous_feature_names )
296+ precisions = [0 ]* len (self .feature_names )
271297 for ix , feature_name in enumerate (self .continuous_feature_names ):
272298 type_prec = self .type_and_precision [feature_name ]
273299 if type_prec == 'int' :
@@ -357,7 +383,7 @@ def get_ohe_min_max_normalized_data(self, query_instance):
357383 temp = self .one_hot_encode_data (temp )
358384 temp = temp .tail (query_instance .shape [0 ]).reset_index (drop = True )
359385 # returns a pandas dataframe
360- return self .normalize_data (temp )
386+ return self .normalize_data (temp ). apply ( pd . to_numeric )
361387
362388 def get_inverse_ohe_min_max_normalized_data (self , transformed_data ):
363389 """Transforms one-hot-encoded and min-max normalized data into raw user-fed data format. transformed_data
@@ -370,3 +396,21 @@ def get_inverse_ohe_min_max_normalized_data(self, transformed_data):
370396 raw_data = raw_data [self .feature_names ]
371397 # returns a pandas dataframe
372398 return raw_data
399+
def get_all_dummy_colnames(self):
    """Return the one-hot column names implied by every feature's permitted values.

    Builds a small sample frame where each column holds the feature's
    permitted-value list, repeated/truncated to a common length, then reads
    the column index produced by pandas' get_dummies on that frame.
    """
    longest = max(len(values) for values in self.permitted_range.values())
    sample = pd.DataFrame(
        {feature: np.resize(values, longest)
         for feature, values in self.permitted_range.items()})
    return pd.get_dummies(sample).columns
405+
def get_valid_feature_range(self, feature_range_input, normalized=True):
    """Gets the min/max value of features in normalized or de-normalized
    form. Assumes that all features are already encoded to numerical form
    such that the number of features remains the same.

    # TODO needs work adhere to label encoded max and to support permitted_range for
    both continuous and discrete when provided in _generate_counterfactuals.
    """
    if not normalized:
        # De-normalized ranges are passed through unchanged.
        return feature_range_input
    raise NotImplementedError("Normalized feature range not supported for private data interface")
0 commit comments