1+
2+ from sklearn .preprocessing .imputation import Imputer , check_array , _get_mask , _most_frequent
3+
4+ import warnings
5+
6+ import numpy as np
7+ import numpy .ma as ma
8+ from scipy import sparse
9+ from sklearn .utils .fixes import astype
10+ from sklearn .utils .sparsefuncs import _get_median
11+ from sklearn .utils .validation import check_is_fitted
12+ from sklearn .utils .validation import FLOAT_DTYPES
13+
14+
class ConditionalImputer(Imputer):
    """Imputation transformer with a separate strategy for nominal columns.

    Behaves like the base ``Imputer``, except that the columns listed in
    `indexes_nominal` receive their fill value from `strategy_nominal`
    while every other column receives it from `strategy`.

    Read more in the :ref:`User Guide <imputation>`.

    Parameters
    ----------
    missing_values : integer or "NaN", optional (default="NaN")
        The placeholder for the missing values. All occurrences of
        `missing_values` will be imputed. For missing values encoded as
        np.nan, use the string value "NaN".

    strategy : string, optional (default="mean")
        The imputation strategy for columns that are not listed in
        `indexes_nominal`.

        - If "mean", then replace missing values using the mean along
          the axis.
        - If "median", then replace missing values using the median along
          the axis.
        - If "most_frequent", then replace missing using the most frequent
          value along the axis.

    strategy_nominal : string, optional (default="most_frequent")
        The imputation strategy for the columns listed in
        `indexes_nominal`. Accepts the same values as `strategy`. It is
        only validated and used when `indexes_nominal` is non-empty.

    indexes_nominal : iterable of int or None, optional (default=None)
        Positions of the columns to impute with `strategy_nominal`. If
        None (or empty), every column is imputed with `strategy`.

    axis : integer, optional (default=0)
        The axis along which to impute. Only `axis=0` (impute along
        columns) is supported; `fit` raises ``ValueError`` otherwise.

    verbose : integer, optional (default=0)
        Controls the verbosity of the imputer.

    copy : boolean, optional (default=True)
        If True, a copy of X will be created. If False, imputation will
        be done in-place whenever possible. Note that, in the following
        cases, a new copy will always be made, even if `copy=False`:

        - If X is not an array of floating values;
        - If X is sparse and `missing_values=0`;
        - If X is encoded as a CSR matrix.

    Attributes
    ----------
    statistics_ : array of shape (n_features,)
        The imputation fill value for each feature. Entries at the
        positions in `indexes_nominal` come from `strategy_nominal`; all
        other entries come from `strategy`.

    Notes
    -----
    - Columns which only contained missing values at `fit` are discarded
      upon `transform` (behaviour of the inherited ``Imputer.transform``).
    """

    def __init__(self, missing_values="NaN", strategy="mean",
                 strategy_nominal="most_frequent",
                 indexes_nominal=None,
                 axis=0, verbose=0, copy=True):
        # Parameters are stored untouched; all validation happens in `fit`.
        self.missing_values = missing_values
        self.strategy = strategy
        self.strategy_nominal = strategy_nominal
        self.indexes_nominal = indexes_nominal
        self.axis = axis
        self.verbose = verbose
        self.copy = copy

    def fit(self, X, y=None):
        """Fit the imputer on X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features.

        y : ignored
            Present for API compatibility only.

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        ValueError
            If `strategy` is not a supported strategy, if `axis` is not 0,
            or if `indexes_nominal` is non-empty and `strategy_nominal` is
            not a supported strategy.
        """
        # Check parameters
        allowed_strategies = ["mean", "median", "most_frequent"]
        if self.strategy not in allowed_strategies:
            raise ValueError("Can only use these strategies: {0} "
                             " got strategy={1}".format(allowed_strategies,
                                                        self.strategy))

        if self.axis != 0:
            raise ValueError("Can only impute missing values on axis 0 (axis 1 not supported), "
                             " got axis={0}".format(self.axis))

        # Materialise the nominal column positions once so they can be
        # tested for emptiness and iterated afterwards.
        if self.indexes_nominal is None:
            nominal_columns = []
        else:
            nominal_columns = list(self.indexes_nominal)

        # The nominal strategy only matters when nominal columns exist.
        # Checking it here gives an explicit ValueError instead of a
        # failure deeper in the fitting code.
        if nominal_columns and self.strategy_nominal not in allowed_strategies:
            raise ValueError("Can only use these strategies: {0} "
                             " got strategy_nominal={1}".format(
                                 allowed_strategies, self.strategy_nominal))

        X = check_array(X, accept_sparse='csc', dtype=np.float64,
                        force_all_finite=False)

        # Pick the inherited fitting routine matching the data layout.
        if sparse.issparse(X):
            compute_statistics = self._sparse_fit
        else:
            compute_statistics = self._dense_fit

        statistics = compute_statistics(X,
                                        self.strategy,
                                        self.missing_values,
                                        self.axis)

        # Overwrite the fill values of the nominal columns; the second
        # statistics pass is skipped when there are none.
        if nominal_columns:
            statistics_nominal = compute_statistics(X,
                                                    self.strategy_nominal,
                                                    self.missing_values,
                                                    self.axis)
            for column in nominal_columns:
                statistics[column] = statistics_nominal[column]

        self.statistics_ = statistics

        return self
0 commit comments