|
64 | 64 | ############################################################################ |
65 | 65 | # Get the actual data. |
66 | 66 | # |
67 | | -# The dataset can be returned in 3 possible formats: as a NumPy array, a SciPy |
68 | | -# sparse matrix, or as a Pandas DataFrame. The format is |
69 | | -# controlled with the parameter ``dataset_format`` which can be either 'array' |
70 | | -# (default) or 'dataframe'. Let's first build our dataset from a NumPy array |
71 | | -# and manually create a dataframe. |
72 | | -X, y, categorical_indicator, attribute_names = dataset.get_data( |
73 | | - dataset_format="array", target=dataset.default_target_attribute |
74 | | -) |
75 | | -eeg = pd.DataFrame(X, columns=attribute_names) |
76 | | -eeg["class"] = y |
77 | | -print(eeg[:10]) |
| 67 | +# openml-python returns data as pandas dataframes (stored in the `eeg` variable below), |
| 68 | +# and also some additional metadata that we don't care about right now. |
| 69 | +eeg, *_ = dataset.get_data() |
78 | 70 |
|
79 | 71 | ############################################################################ |
80 | | -# Instead of manually creating the dataframe, you can already request a |
81 | | -# dataframe with the correct dtypes. |
| 72 | +# You can optionally choose to have openml separate out a column from the |
| 73 | +# dataset. In particular, many datasets for supervised problems have a set |
| 74 | +# `default_target_attribute` which may help identify the target variable. |
82 | 75 | X, y, categorical_indicator, attribute_names = dataset.get_data( |
83 | | - target=dataset.default_target_attribute, dataset_format="dataframe" |
| 76 | + target=dataset.default_target_attribute |
84 | 77 | ) |
85 | 78 | print(X.head()) |
86 | 79 | print(X.info()) |
|
91 | 84 | # data file. The dataset object can be used as normal. |
92 | 85 | # Whenever you use any functionality that requires the data, |
93 | 86 | # such as `get_data`, the data will be downloaded. |
| 87 | +# Starting from 0.15, not downloading data will be the default behavior instead. |
| 88 | +# The data will be downloaded automatically when you try to access it through
| 89 | +# openml objects, e.g., using `dataset.features`. |
94 | 90 | dataset = openml.datasets.get_dataset(1471, download_data=False) |
95 | 91 |
|
96 | 92 | ############################################################################ |
|
99 | 95 | # * Explore the data visually. |
100 | 96 | eegs = eeg.sample(n=1000) |
101 | 97 | _ = pd.plotting.scatter_matrix( |
102 | | - eegs.iloc[:100, :4], |
103 | | - c=eegs[:100]["class"], |
| 98 | + X.iloc[:100, :4], |
| 99 | + c=y[:100], |
104 | 100 | figsize=(10, 10), |
105 | 101 | marker="o", |
106 | 102 | hist_kwds={"bins": 20}, |
|
0 commit comments