openml
diff --git a/‎ci_scripts/test.sh‎
Lines changed: 1 addition & 1 deletion b/‎ci_scripts/test.sh‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎examples/datasets_tutorial.py‎
Lines changed: 14 additions & 2 deletions b/‎examples/datasets_tutorial.py‎
Lines changed: 14 additions & 2 deletions
diff --git a/‎examples/flows_and_runs_tutorial.py‎
Lines changed: 2 additions & 0 deletions b/‎examples/flows_and_runs_tutorial.py‎
Lines changed: 2 additions & 0 deletions
@@ -22,7 +22,7 @@ run_tests() {
         PYTEST_ARGS=''
     fi
 
-    pytest -n 4 --timeout=600 --timeout-method=thread -sv --ignore='test_OpenMLDemo.py' $PYTEST_ARGS $test_dir
+    pytest -n 4 --duration=20 --timeout=600 --timeout-method=thread -sv --ignore='test_OpenMLDemo.py' $PYTEST_ARGS $test_dir
 }
 
 if [[ "$RUN_FLAKE8" == "true" ]]; then
 
@@ -55,16 +55,28 @@
 ############################################################################
 # Get the actual data.
 #
-# Returned as numpy array, with meta-info
-# (e.g. target feature, feature names, ...)
+# The dataset can be returned in 2 possible formats: as a NumPy array (or a
+# SciPy sparse matrix), or as a Pandas DataFrame (or SparseDataFrame). The
+# format is controlled with the parameter ``dataset_format``, which can be
+# either 'array' (default) or 'dataframe'. Let's first request the data as a
+# NumPy array and manually create a dataframe from it.
 X, y, attribute_names = dataset.get_data(
+    dataset_format='array',
     target=dataset.default_target_attribute,
     return_attribute_names=True,
 )
 eeg = pd.DataFrame(X, columns=attribute_names)
 eeg['class'] = y
 print(eeg[:10])
 
+############################################################################
+# Instead of manually creating the dataframe, you can directly request a
+# dataframe with the correct dtypes.
+X, y = dataset.get_data(target=dataset.default_target_attribute,
+                        dataset_format='dataframe')
+print(X.head())
+print(X.info())
+
 ############################################################################
 # Exercise 2
 # **********
 
@@ -17,6 +17,7 @@
 
 dataset = openml.datasets.get_dataset(68)
 X, y = dataset.get_data(
+    dataset_format='array',
     target=dataset.default_target_attribute
 )
 clf = neighbors.KNeighborsClassifier(n_neighbors=1)
@@ -28,6 +29,7 @@
 # * e.g. categorical features -> do feature encoding
 dataset = openml.datasets.get_dataset(17)
 X, y, categorical = dataset.get_data(
+    dataset_format='array',
     target=dataset.default_target_attribute,
     return_categorical_indicator=True,
 )
Original file line number	Diff line number	Diff line change
`@@ -22,7 +22,7 @@ run_tests() {`
`22`	`22`	`PYTEST_ARGS=''`
`23`	`23`	`fi`
`24`	`24`
`25`		`- pytest -n 4 --timeout=600 --timeout-method=thread -sv --ignore='test_OpenMLDemo.py' $PYTEST_ARGS $test_dir`
	`25`	`+ pytest -n 4 --duration=20 --timeout=600 --timeout-method=thread -sv --ignore='test_OpenMLDemo.py' $PYTEST_ARGS $test_dir`
`26`	`26`	`}`
`27`	`27`
`28`	`28`	`if [[ "$RUN_FLAKE8" == "true" ]]; then`
Original file line number	Diff line number	Diff line change
`@@ -17,6 +17,7 @@`
`17`	`17`
`18`	`18`	`dataset = openml.datasets.get_dataset(68)`
`19`	`19`	`X, y = dataset.get_data(`
	`20`	`+ dataset_format='array',`
`20`	`21`	`target=dataset.default_target_attribute`
`21`	`22`	`)`
`22`	`23`	`clf = neighbors.KNeighborsClassifier(n_neighbors=1)`
`@@ -28,6 +29,7 @@`
`28`	`29`	`# * e.g. categorical features -> do feature encoding`
`29`	`30`	`dataset = openml.datasets.get_dataset(17)`
`30`	`31`	`X, y, categorical = dataset.get_data(`
	`32`	`+ dataset_format='array',`
`31`	`33`	`target=dataset.default_target_attribute,`
`32`	`34`	`return_categorical_indicator=True,`
`33`	`35`	`)`