Skip to content

Commit bb9df91

Browse files
authored
Merge pull request #266 from interpretml/gaugup/ReplaceBostonHousingDataset
Replace load_boston() with fetch_california_housing()
2 parents 428d125 + 604676a commit bb9df91

4 files changed

Lines changed: 37 additions & 50 deletions

File tree

docs/source/notebooks/DiCE_multiclass_classification_and_regression.ipynb

Lines changed: 25 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@
1919
"import dice_ml\n",
2020
"from dice_ml import Dice\n",
2121
"\n",
22-
"from sklearn.datasets import load_iris, load_boston\n",
22+
"from sklearn.datasets import load_iris, fetch_california_housing\n",
2323
"from sklearn.pipeline import Pipeline\n",
2424
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
2525
"from sklearn.model_selection import train_test_split\n",
@@ -162,7 +162,7 @@
162162
"outputs": [],
163163
"source": [
164164
"# Single input\n",
165-
"query_instances_iris = x_train[2:3]\n",
165+
"query_instances_iris = x_test[2:3]\n",
166166
"genetic_iris = exp_genetic_iris.generate_counterfactuals(query_instances_iris, total_CFs=7, desired_class=2)\n",
167167
"genetic_iris.visualize_as_dataframe()"
168168
]
@@ -174,7 +174,7 @@
174174
"outputs": [],
175175
"source": [
176176
"# Multiple queries can be given as input at once\n",
177-
"query_instances_iris = x_train[17:19]\n",
177+
"query_instances_iris = x_test[17:19]\n",
178178
"genetic_iris = exp_genetic_iris.generate_counterfactuals(query_instances_iris, total_CFs=7, desired_class=2)\n",
179179
"genetic_iris.visualize_as_dataframe(show_only_changes=True)"
180180
]
@@ -190,7 +190,7 @@
190190
"cell_type": "markdown",
191191
"metadata": {},
192192
"source": [
193-
"For regression, we will use sklearn's boston dataset. This dataset contains boston house-prices. More information at https://scikit-learn.org/stable/datasets/toy_dataset.html#boston-house-prices-dataset"
193+
"For regression, we will use sklearn's California Housing dataset. This dataset contains California house prices. More information at https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html"
194194
]
195195
},
196196
{
@@ -199,10 +199,10 @@
199199
"metadata": {},
200200
"outputs": [],
201201
"source": [
202-
"boston_data = load_boston()\n",
203-
"df_boston = pd.DataFrame(boston_data.data, columns=boston_data.feature_names)\n",
204-
"df_boston[outcome_name] = pd.Series(boston_data.target)\n",
205-
"df_boston.head()"
202+
"housing_data = fetch_california_housing()\n",
203+
"df_housing = pd.DataFrame(housing_data.data, columns=housing_data.feature_names)\n",
204+
"df_housing[outcome_name] = pd.Series(housing_data.target)\n",
205+
"df_housing.head()"
206206
]
207207
},
208208
{
@@ -211,7 +211,7 @@
211211
"metadata": {},
212212
"outputs": [],
213213
"source": [
214-
"df_boston.info()"
214+
"df_housing.info()"
215215
]
216216
},
217217
{
@@ -220,8 +220,8 @@
220220
"metadata": {},
221221
"outputs": [],
222222
"source": [
223-
"continuous_features_boston = df_boston.drop(outcome_name, axis=1).columns.tolist()\n",
224-
"target = df_boston[outcome_name]"
223+
"continuous_features_housing = df_housing.drop(outcome_name, axis=1).columns.tolist()\n",
224+
"target = df_housing[outcome_name]"
225225
]
226226
},
227227
{
@@ -231,13 +231,13 @@
231231
"outputs": [],
232232
"source": [
233233
"# Split data into train and test\n",
234-
"datasetX = df_boston.drop(outcome_name, axis=1)\n",
234+
"datasetX = df_housing.drop(outcome_name, axis=1)\n",
235235
"x_train, x_test, y_train, y_test = train_test_split(datasetX,\n",
236236
" target,\n",
237237
" test_size=0.2,\n",
238238
" random_state=0)\n",
239239
"\n",
240-
"categorical_features = x_train.columns.difference(continuous_features_boston)\n",
240+
"categorical_features = x_train.columns.difference(continuous_features_housing)\n",
241241
"\n",
242242
"# We create the preprocessing pipelines for both numeric and categorical data.\n",
243243
"numeric_transformer = Pipeline(steps=[\n",
@@ -248,14 +248,14 @@
248248
"\n",
249249
"transformations = ColumnTransformer(\n",
250250
" transformers=[\n",
251-
" ('num', numeric_transformer, continuous_features_boston),\n",
251+
" ('num', numeric_transformer, continuous_features_housing),\n",
252252
" ('cat', categorical_transformer, categorical_features)])\n",
253253
"\n",
254254
"# Append classifier to preprocessing pipeline.\n",
255255
"# Now we have a full prediction pipeline.\n",
256-
"regr_boston = Pipeline(steps=[('preprocessor', transformations),\n",
257-
" ('regressor', RandomForestRegressor())])\n",
258-
"model_boston = regr_boston.fit(x_train, y_train)"
256+
"regr_housing = Pipeline(steps=[('preprocessor', transformations),\n",
257+
" ('regressor', RandomForestRegressor())])\n",
258+
"model_housing = regr_housing.fit(x_train, y_train)"
259259
]
260260
},
261261
{
@@ -264,9 +264,9 @@
264264
"metadata": {},
265265
"outputs": [],
266266
"source": [
267-
"d_boston = dice_ml.Data(dataframe=df_boston, continuous_features=continuous_features_boston, outcome_name=outcome_name)\n",
267+
"d_housing = dice_ml.Data(dataframe=df_housing, continuous_features=continuous_features_housing, outcome_name=outcome_name)\n",
268268
"# We provide the type of model as a parameter (model_type)\n",
269-
"m_boston = dice_ml.Model(model=model_boston, backend=\"sklearn\", model_type='regressor')"
269+
"m_housing = dice_ml.Model(model=model_housing, backend=\"sklearn\", model_type='regressor')"
270270
]
271271
},
272272
{
@@ -275,7 +275,7 @@
275275
"metadata": {},
276276
"outputs": [],
277277
"source": [
278-
"exp_genetic_boston = Dice(d_boston, m_boston, method=\"genetic\")"
278+
"exp_genetic_housing = Dice(d_housing, m_housing, method=\"genetic\")"
279279
]
280280
},
281281
{
@@ -292,23 +292,11 @@
292292
"outputs": [],
293293
"source": [
294294
"# Multiple queries can be given as input at once\n",
295-
"query_instances_boston = x_train[2:3]\n",
296-
"genetic_boston = exp_genetic_boston.generate_counterfactuals(query_instances_boston,\n",
297-
" total_CFs=2,\n",
298-
" desired_range=[30, 45])\n",
299-
"genetic_boston.visualize_as_dataframe(show_only_changes=True)"
300-
]
301-
},
302-
{
303-
"cell_type": "code",
304-
"execution_count": null,
305-
"metadata": {},
306-
"outputs": [],
307-
"source": [
308-
"# Multiple queries can be given as input at once\n",
309-
"query_instances_boston = x_train[17:19]\n",
310-
"genetic_boston = exp_genetic_boston.generate_counterfactuals(query_instances_boston, total_CFs=4, desired_range=[40, 50])\n",
311-
"genetic_boston.visualize_as_dataframe(show_only_changes=True)"
295+
"query_instances_housing = x_test[2:4]\n",
296+
"genetic_housing = exp_genetic_housing.generate_counterfactuals(query_instances_housing,\n",
297+
" total_CFs=2,\n",
298+
" desired_range=[3.0, 5.0])\n",
299+
"genetic_housing.visualize_as_dataframe(show_only_changes=True)"
312300
]
313301
}
314302
],

tests/conftest.py

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22

33
import pandas as pd
44
import pytest
5-
from sklearn.datasets import load_boston, load_iris
5+
from sklearn.datasets import fetch_california_housing, load_iris
66
from sklearn.model_selection import train_test_split
77

88
import dice_ml
@@ -219,11 +219,11 @@ def create_iris_data():
219219

220220

221221
@pytest.fixture
222-
def create_boston_data():
223-
boston = load_boston()
222+
def create_housing_data():
223+
housing = fetch_california_housing()
224224
x_train, x_test, y_train, y_test = train_test_split(
225-
boston.data, boston.target,
225+
housing.data, housing.target,
226226
test_size=0.2, random_state=7)
227-
x_train = pd.DataFrame(data=x_train, columns=boston.feature_names)
228-
x_test = pd.DataFrame(data=x_test, columns=boston.feature_names)
229-
return x_train, x_test, y_train, y_test, boston.feature_names.tolist()
227+
x_train = pd.DataFrame(data=x_train, columns=housing.feature_names)
228+
x_test = pd.DataFrame(data=x_test, columns=housing.feature_names)
229+
return x_train, x_test, y_train, y_test, housing.feature_names

tests/test_dice_interface/test_explainer_base.py

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -398,18 +398,17 @@ def test_zero_totalcfs(self, desired_range, regression_exp_object, sample_custom
398398
desired_range=desired_range)
399399

400400
@pytest.mark.parametrize("desired_range, method",
401-
[([10, 100], 'random')])
402-
def test_numeric_categories(self, desired_range, method, create_boston_data):
401+
[([3, 5], 'random')])
402+
def test_numeric_categories(self, desired_range, method, create_housing_data):
403403
x_train, x_test, y_train, y_test, feature_names = \
404-
create_boston_data
404+
create_housing_data
405405

406406
rfc = RandomForestRegressor(n_estimators=10, max_depth=4,
407407
random_state=777)
408408
model = rfc.fit(x_train, y_train)
409409

410410
dataset_train = x_train.copy()
411411
dataset_train['Outcome'] = y_train
412-
feature_names.remove('CHAS')
413412

414413
d = dice_ml.Data(dataframe=dataset_train, continuous_features=feature_names, outcome_name='Outcome')
415414
m = dice_ml.Model(model=model, backend='sklearn', model_type='regressor')

tests/test_model_interface/test_base_model.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -47,9 +47,9 @@ def create_sklearn_random_forest_regressor(self, X, y):
4747
model = rfc.fit(X, y)
4848
return model
4949

50-
def test_base_model_regression(self, create_boston_data):
50+
def test_base_model_regression(self, create_housing_data):
5151
x_train, x_test, y_train, y_test, feature_names = \
52-
create_boston_data
52+
create_housing_data
5353
trained_model = self.create_sklearn_random_forest_regressor(x_train, y_train)
5454

5555
diceml_model = dice_ml.Model(model=trained_model, model_type='regressor', backend='sklearn')

0 commit comments

Comments
 (0)