1-
2- from sklearn .preprocessing .imputation import Imputer , check_array , _get_mask , _most_frequent
1+ from scipy . integrate . tests . test_bvp import emden_bc
2+ from sklearn .preprocessing .imputation import Imputer , _get_mask
33
44import warnings
5-
5+ import math
66import numpy as np
7- import numpy .ma as ma
87from scipy import sparse
8+
9+ from sklearn .utils import check_array
910from sklearn .utils .fixes import astype
1011from sklearn .utils .sparsefuncs import _get_median
1112from sklearn .utils .validation import check_is_fitted
1213from sklearn .utils .validation import FLOAT_DTYPES
1314
1415
16+
1517class ConditionalImputer (Imputer ):
1618 """Imputation transformer for completing missing values.
1719
@@ -34,6 +36,13 @@ class ConditionalImputer(Imputer):
3436 - If "most_frequent", then replace missing using the most frequent
3537 value along the axis.
3638
39+ strategy_nominal : string, optional (default="most_frequent")
40+ The imputation strategy for nominal attributes. For values, see "strategy"
41+
42+ categorical_features : list (int), optional (default=None)
43+ Indices of the attributes that are treated as nominal. If None,
44+ fit treats every attribute whose observed values are all integers as nominal.
45+
3746 axis : integer, optional (default=0)
3847 The axis along which to impute.
3948
@@ -68,12 +77,15 @@ class ConditionalImputer(Imputer):
6877 """
def __init__(self, missing_values="NaN", strategy="mean",
             strategy_nominal="most_frequent",
             categorical_features=None,
             empty_attribute_constant=None,
             axis=0, verbose=0, copy=True):
    """Store the imputation settings on the instance.

    Every argument is kept verbatim under an attribute of the same name;
    nothing is validated or computed here.
    """
    # What counts as missing, and which statistics replace it.
    self.missing_values = missing_values
    self.strategy = strategy
    self.strategy_nominal = strategy_nominal

    # Nominal-attribute handling. ``categorical_features_implied`` starts
    # out as None; fit fills it with the attribute indices it inferred
    # when ``categorical_features`` is None.
    self.categorical_features = categorical_features
    self.categorical_features_implied = None
    self.empty_attribute_constant = empty_attribute_constant

    # Remaining options.
    self.axis = axis
    self.verbose = verbose
    self.copy = copy
@@ -125,8 +137,78 @@ def fit(self, X, y=None):
125137
126138 # here the indexes of nominal values get set
127139 self .statistics_ = statistics_general
128- if self .indeces_nominal is not None :
129- for i in self .indeces_nominal :
140+ if self .categorical_features is not None :
141+ for i in self .categorical_features :
130142 self .statistics_ [i ] = statistics_nominal [i ]
143+ else :
144+ # iterate over all attributes
145+ self .categorical_features_implied = []
146+ for iAtt in range (len (statistics_general )):
147+ isNominal = True
148+ for iInst in range (len (X )):
149+ if not np .isnan (X [iInst ][iAtt ]) and math .floor (X [iInst ][iAtt ]) != X [iInst ][iAtt ]:
150+ isNominal = False
151+ break
152+ if isNominal :
153+ # book keeping, for testing purposes
154+ self .categorical_features_implied .append (iAtt )
155+ self .statistics_ [iAtt ] = statistics_nominal [iAtt ]
156+
157+ return self
158+
159+
def transform(self, X):
    """Impute all missing values in X.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        The input data to complete.

    Returns
    -------
    X : {ndarray, sparse matrix}
        The data with missing entries replaced by the fitted statistics.
        Columns whose fitted statistic is NaN are filled with
        ``empty_attribute_constant`` when it is set; otherwise they are
        removed from the result. A sparse input stays sparse unless
        ``missing_values`` is 0, in which case it is densified.

    Raises
    ------
    ValueError
        If the number of columns of X differs from the number of fitted
        statistics.
    """
    check_is_fitted(self, 'statistics_')
    X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES,
                    force_all_finite=False, copy=self.copy)

    # Work on a private copy: filling in the constant below must not
    # alter the fitted ``statistics_`` of the estimator.
    statistics = self.statistics_.copy()
    if X.shape[1] != statistics.shape[0]:
        raise ValueError("X has %d features per sample, expected %d"
                         % (X.shape[1], self.statistics_.shape[0]))

    # Impute completely empty columns with the constant
    if self.empty_attribute_constant is not None:
        invalid_mask = np.isnan(statistics)
        X[:, invalid_mask] = self.empty_attribute_constant
        statistics[invalid_mask] = self.empty_attribute_constant

    # Delete the columns that still have no usable statistic
    invalid_mask = np.isnan(statistics)
    valid_mask = np.logical_not(invalid_mask)
    valid_statistics = statistics[valid_mask]
    valid_statistics_indexes = np.where(valid_mask)[0]
    missing = np.arange(X.shape[not self.axis])[invalid_mask]

    if invalid_mask.any():
        if self.verbose:
            warnings.warn("Deleting features without "
                          "observed values: %s" % missing)
        X = X[:, valid_statistics_indexes]

    # Do actual imputation
    if sparse.issparse(X) and self.missing_values != 0:
        mask = _get_mask(X.data, self.missing_values)
        # Column index of every stored entry, restricted to the missing
        # ones. ``int`` replaces the ``np.int`` alias removed from NumPy.
        indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=int),
                            np.diff(X.indptr))[mask]

        X.data[mask] = astype(valid_statistics[indexes], X.dtype,
                              copy=False)
    else:
        if sparse.issparse(X):
            X = X.toarray()

        mask = _get_mask(X, self.missing_values)
        n_missing = np.sum(mask, axis=self.axis)
        # One copy of each column statistic per missing entry in it
        values = np.repeat(valid_statistics, n_missing)

        # Transposing before np.where orders the coordinates column by
        # column, matching the order of ``values``; [::-1] swaps the
        # index pair back to (row, column).
        coordinates = np.where(mask.transpose())[::-1]

        X[coordinates] = values

    return X
0 commit comments