|
19 | 19 | "import dice_ml\n", |
20 | 20 | "from dice_ml import Dice\n", |
21 | 21 | "\n", |
22 | | - "from sklearn.datasets import load_iris, load_boston\n", |
| 22 | + "from sklearn.datasets import load_iris, fetch_california_housing\n", |
23 | 23 | "from sklearn.pipeline import Pipeline\n", |
24 | 24 | "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", |
25 | 25 | "from sklearn.model_selection import train_test_split\n", |
|
162 | 162 | "outputs": [], |
163 | 163 | "source": [ |
164 | 164 | "# Single input\n", |
165 | | - "query_instances_iris = x_train[2:3]\n", |
| 165 | + "query_instances_iris = x_test[2:3]\n", |
166 | 166 | "genetic_iris = exp_genetic_iris.generate_counterfactuals(query_instances_iris, total_CFs=7, desired_class=2)\n", |
167 | 167 | "genetic_iris.visualize_as_dataframe()" |
168 | 168 | ] |
|
174 | 174 | "outputs": [], |
175 | 175 | "source": [ |
176 | 176 | "# Multiple queries can be given as input at once\n", |
177 | | - "query_instances_iris = x_train[17:19]\n", |
| 177 | + "query_instances_iris = x_test[17:19]\n", |
178 | 178 | "genetic_iris = exp_genetic_iris.generate_counterfactuals(query_instances_iris, total_CFs=7, desired_class=2)\n", |
179 | 179 | "genetic_iris.visualize_as_dataframe(show_only_changes=True)" |
180 | 180 | ] |
|
190 | 190 | "cell_type": "markdown", |
191 | 191 | "metadata": {}, |
192 | 192 | "source": [ |
193 | | - "For regression, we will use sklearn's boston dataset. This dataset contains boston house-prices. More information at https://scikit-learn.org/stable/datasets/toy_dataset.html#boston-house-prices-dataset" |
| 193 | + "For regression, we will use sklearn's California Housing dataset. This dataset contains California house prices; the target is the median house value of a district, expressed in hundreds of thousands of dollars. More information at https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html" |
194 | 194 | ] |
195 | 195 | }, |
196 | 196 | { |
|
199 | 199 | "metadata": {}, |
200 | 200 | "outputs": [], |
201 | 201 | "source": [ |
202 | | - "boston_data = load_boston()\n", |
203 | | - "df_boston = pd.DataFrame(boston_data.data, columns=boston_data.feature_names)\n", |
204 | | - "df_boston[outcome_name] = pd.Series(boston_data.target)\n", |
205 | | - "df_boston.head()" |
| 202 | + "housing_data = fetch_california_housing()\n", |
| 203 | + "df_housing = pd.DataFrame(housing_data.data, columns=housing_data.feature_names)\n", |
| 204 | + "df_housing[outcome_name] = pd.Series(housing_data.target)\n", |
| 205 | + "df_housing.head()" |
206 | 206 | ] |
207 | 207 | }, |
208 | 208 | { |
|
211 | 211 | "metadata": {}, |
212 | 212 | "outputs": [], |
213 | 213 | "source": [ |
214 | | - "df_boston.info()" |
| 214 | + "df_housing.info()" |
215 | 215 | ] |
216 | 216 | }, |
217 | 217 | { |
|
220 | 220 | "metadata": {}, |
221 | 221 | "outputs": [], |
222 | 222 | "source": [ |
223 | | - "continuous_features_boston = df_boston.drop(outcome_name, axis=1).columns.tolist()\n", |
224 | | - "target = df_boston[outcome_name]" |
| 223 | + "continuous_features_housing = df_housing.drop(outcome_name, axis=1).columns.tolist()\n", |
| 224 | + "target = df_housing[outcome_name]" |
225 | 225 | ] |
226 | 226 | }, |
227 | 227 | { |
|
231 | 231 | "outputs": [], |
232 | 232 | "source": [ |
233 | 233 | "# Split data into train and test\n", |
234 | | - "datasetX = df_boston.drop(outcome_name, axis=1)\n", |
| 234 | + "datasetX = df_housing.drop(outcome_name, axis=1)\n", |
235 | 235 | "x_train, x_test, y_train, y_test = train_test_split(datasetX,\n", |
236 | 236 | " target,\n", |
237 | 237 | " test_size=0.2,\n", |
238 | 238 | " random_state=0)\n", |
239 | 239 | "\n", |
240 | | - "categorical_features = x_train.columns.difference(continuous_features_boston)\n", |
| 240 | + "categorical_features = x_train.columns.difference(continuous_features_housing)\n", |
241 | 241 | "\n", |
242 | 242 | "# We create the preprocessing pipelines for both numeric and categorical data.\n", |
243 | 243 | "numeric_transformer = Pipeline(steps=[\n", |
|
248 | 248 | "\n", |
249 | 249 | "transformations = ColumnTransformer(\n", |
250 | 250 | " transformers=[\n", |
251 | | - " ('num', numeric_transformer, continuous_features_boston),\n", |
| 251 | + " ('num', numeric_transformer, continuous_features_housing),\n", |
252 | 252 | " ('cat', categorical_transformer, categorical_features)])\n", |
253 | 253 | "\n", |
254 | 254 | "# Append regressor to preprocessing pipeline.\n", |
255 | 255 | "# Now we have a full prediction pipeline.\n", |
256 | | - "regr_boston = Pipeline(steps=[('preprocessor', transformations),\n", |
257 | | - " ('regressor', RandomForestRegressor())])\n", |
258 | | - "model_boston = regr_boston.fit(x_train, y_train)" |
| 256 | + "regr_housing = Pipeline(steps=[('preprocessor', transformations),\n", |
| 257 | + " ('regressor', RandomForestRegressor())])\n", |
| 258 | + "model_housing = regr_housing.fit(x_train, y_train)" |
259 | 259 | ] |
260 | 260 | }, |
261 | 261 | { |
|
264 | 264 | "metadata": {}, |
265 | 265 | "outputs": [], |
266 | 266 | "source": [ |
267 | | - "d_boston = dice_ml.Data(dataframe=df_boston, continuous_features=continuous_features_boston, outcome_name=outcome_name)\n", |
| 267 | + "d_housing = dice_ml.Data(dataframe=df_housing, continuous_features=continuous_features_housing, outcome_name=outcome_name)\n", |
268 | 268 | "# We provide the type of model as a parameter (model_type)\n", |
269 | | - "m_boston = dice_ml.Model(model=model_boston, backend=\"sklearn\", model_type='regressor')" |
| 269 | + "m_housing = dice_ml.Model(model=model_housing, backend=\"sklearn\", model_type='regressor')" |
270 | 270 | ] |
271 | 271 | }, |
272 | 272 | { |
|
275 | 275 | "metadata": {}, |
276 | 276 | "outputs": [], |
277 | 277 | "source": [ |
278 | | - "exp_genetic_boston = Dice(d_boston, m_boston, method=\"genetic\")" |
| 278 | + "exp_genetic_housing = Dice(d_housing, m_housing, method=\"genetic\")" |
279 | 279 | ] |
280 | 280 | }, |
281 | 281 | { |
|
292 | 292 | "outputs": [], |
293 | 293 | "source": [ |
294 | 294 | "# Multiple queries can be given as input at once\n", |
295 | | - "query_instances_boston = x_train[2:3]\n", |
296 | | - "genetic_boston = exp_genetic_boston.generate_counterfactuals(query_instances_boston,\n", |
297 | | - " total_CFs=2,\n", |
298 | | - " desired_range=[30, 45])\n", |
299 | | - "genetic_boston.visualize_as_dataframe(show_only_changes=True)" |
300 | | - ] |
301 | | - }, |
302 | | - { |
303 | | - "cell_type": "code", |
304 | | - "execution_count": null, |
305 | | - "metadata": {}, |
306 | | - "outputs": [], |
307 | | - "source": [ |
308 | | - "# Multiple queries can be given as input at once\n", |
309 | | - "query_instances_boston = x_train[17:19]\n", |
310 | | - "genetic_boston = exp_genetic_boston.generate_counterfactuals(query_instances_boston, total_CFs=4, desired_range=[40, 50])\n", |
311 | | - "genetic_boston.visualize_as_dataframe(show_only_changes=True)" |
| 295 | + "query_instances_housing = x_test[2:4]\n", |
| 296 | + "genetic_housing = exp_genetic_housing.generate_counterfactuals(query_instances_housing,\n", |
| 297 | + " total_CFs=2,\n", |
| 298 | + " desired_range=[3.0, 5.0])\n", |
| 299 | + "genetic_housing.visualize_as_dataframe(show_only_changes=True)" |
312 | 300 | ] |
313 | 301 | } |
314 | 302 | ], |
|
0 commit comments