|
19 | 19 | set, comparing the model predictions with the corresponding ground-truth fMRI |
20 | 20 | responses. |
21 | 21 |
|
22 | | -The banded ridge model is fitted with the package |
| 22 | +The ridge model is fitted with the package |
23 | 23 | `himalaya <https://github.com/gallantlab/himalaya>`_. |
24 | 24 | """ |
25 | 25 | ############################################################################### |
|
95 | 95 | # ---------------- |
96 | 96 | # |
97 | 97 | # Now, let's define the model pipeline. |
98 | | - |
99 | | -from sklearn.pipeline import make_pipeline |
| 98 | +# |
| 99 | +# We first center the features, since we will not use an intercept. |
100 | 100 | from sklearn.preprocessing import StandardScaler |
101 | 101 |
|
102 | | -# display the scikit-learn pipeline with an HTML diagram |
103 | | -from sklearn import set_config |
104 | | -set_config(display='diagram') |
| 102 | +scaler = StandardScaler(with_mean=True, with_std=False) |
| 103 | + |
| 104 | +############################################################################### |
| 105 | +# Then we concatenate the features with multiple delays, to account for the |
| 106 | +# hemodynamic response. The linear regression model will then weight each |
| 107 | +# delayed feature with a different weight, to build a predictive model. |
| 108 | +# |
| 109 | +# With a sample every 2 seconds, we use 4 delays [1, 2, 3, 4] to cover the |
| 110 | +# major part of the hemodynamic response peak.
| 111 | +from voxelwise.delayer import Delayer |
| 112 | + |
| 113 | +delayer = Delayer(delays=[1, 2, 3, 4]) |
105 | 114 |
|
106 | 115 | ############################################################################### |
| 116 | +# The model we use is a ridge regression. When the number of features is |
| 117 | +# larger than the number of samples, it is more efficient to solve a ridge |
| 118 | +# regression using the (equivalent) dual formulation, kernel ridge regression |
| 119 | +# with a linear kernel. |
| 120 | +# Here, we have 3600 training samples, and 1705 * 4 = 6820 features (we |
| 121 | +# multiply by 4 since we use 4 time delays), so we use kernel ridge
| 122 | +# regression. |
| 123 | + |
107 | 124 | # With one target, we could directly use the pipeline in scikit-learn's |
108 | 125 | # GridSearchCV, to select the optimal hyperparameters over cross-validation. |
109 | 126 | # However, GridSearchCV can only optimize one score. Thus, in the multiple |
|
113 | 130 | from himalaya.kernel_ridge import KernelRidgeCV |
114 | 131 |
|
115 | 132 | ############################################################################### |
116 | | -# We first concatenate the features with multiple delays, to account for the |
117 | | -# hemodynamic response. The linear regression model will then weight each |
118 | | -# delayed feature with a different weight, to build a predictive model. |
119 | | -# |
120 | | -# With a sample every 2 seconds, we use 4 delays [1, 2, 3, 4] to cover the |
121 | | -# most part of the hemodynamic response peak. |
122 | | -from voxelwise.delayer import Delayer |
| 133 | +# The scale of the regularization hyperparameter alpha is unknown, so we use |
| 134 | +# a large logarithmic range, and we will check after the fit that the best
| 135 | +# hyperparameters are not all on one edge of the range.
| 136 | +alphas = np.logspace(1, 20, 20) |
| 137 | + |
| 138 | +kernel_ridge_cv = KernelRidgeCV( |
| 139 | + alphas=alphas, cv=cv, |
| 140 | + solver_params=dict(n_targets_batch=500, n_alphas_batch=5, |
| 141 | + n_targets_batch_refit=100)) |
123 | 142 |
|
124 | 143 | ############################################################################### |
125 | 144 | # We set himalaya's backend to "torch_cuda" to fit the model using GPU. |
|
133 | 152 | backend = set_backend("torch_cuda") |
134 | 153 |
|
135 | 154 | ############################################################################### |
136 | | -# The scale of the regularization hyperparameter alpha is unknown, so we use |
137 | | -# a large logarithmic range, and we will check after the fit that best |
138 | | -# hyperparameters are not all on one range edge. |
139 | | -alphas = np.logspace(1, 20, 20) |
140 | | - |
141 | | -############################################################################### |
| 155 | +# We use scikit-learn Pipeline to link the different steps together. |
142 | 156 | # The scikit-learn Pipeline can be used as a regular estimator, calling |
143 | | -# pipeline.fit, pipeline.predict, etc. |
| 157 | +# `pipeline.fit`, `pipeline.predict`, etc. |
144 | 158 | # Using a pipeline can be useful to clarify the different steps, avoid |
145 | 159 | # cross-validation mistakes, or automatically cache intermediate results. |
| 160 | +from sklearn.pipeline import make_pipeline |
| 161 | + |
| 162 | +# define the pipeline |
146 | 163 | pipeline = make_pipeline( |
147 | | - StandardScaler(with_mean=True, with_std=False), |
148 | | - Delayer(delays=[1, 2, 3, 4]), |
149 | | - KernelRidgeCV( |
150 | | - alphas=alphas, cv=cv, |
151 | | - solver_params=dict(n_targets_batch=500, n_alphas_batch=5, |
152 | | - n_targets_batch_refit=100)), |
| 164 | + scaler, |
| 165 | + delayer, |
| 166 | + kernel_ridge_cv, |
153 | 167 | ) |
| 168 | + |
| 169 | +############################################################################### |
| 170 | +# We can display the scikit-learn pipeline with an HTML diagram. |
| 171 | +from sklearn import set_config |
| 172 | +set_config(display='diagram') |
| 173 | + |
154 | 174 | pipeline |
155 | 175 |
|
156 | 176 | ############################################################################### |
|
164 | 184 | pipeline.fit(X_train, Y_train) |
165 | 185 |
|
166 | 186 | scores = pipeline.score(X_test, Y_test) |
| 187 | + |
167 | 188 | ############################################################################### |
168 | 189 | # Since we performed the KernelRidgeCV on GPU, scores are returned as |
169 | 190 | # torch.Tensor on GPU. Thus, we need to move them into numpy arrays on CPU, to |
|
232 | 253 | # ----------------------------------- |
233 | 254 | # |
234 | 255 | # To present an example of model comparison, we define here another model, |
235 | | -# without feature delays (i.e. no Delayer). This model is unlikely to perform |
| 256 | +# without feature delays (i.e. no `Delayer`). This model is unlikely to perform |
236 | 257 | # well, since fMRI responses are delayed in time with respect to the stimulus. |
237 | 258 |
|
238 | 259 | pipeline_nodelay = make_pipeline( |
|
261 | 282 | ax.set(title='Generalization R2 scores', xlabel='model without delays', |
262 | 283 | ylabel='model with delays') |
263 | 284 | plt.show() |
| 285 | + |
| 286 | +############################################################################### |
| 287 | +# Visualize the HRF |
| 288 | +# ----------------- |
| 289 | +# |
| 290 | +# Here we will visualize the hemodynamic response function (HRF), as captured |
| 291 | +# in the ridge regression weights. |
| 292 | +# |
| 293 | +# Fitting a kernel ridge regression results in a set of coefficients called the |
| 294 | +# "dual" coefficients. These coefficients are different from the "primal" |
| 295 | +# coefficients obtained with a ridge regression, but the primal coefficients |
| 296 | +# can be computed from the dual coefficients using the training features. |
| 297 | +# |
| 298 | +# To better visualize the HRF, we will refit a model with more delays, but only |
| 299 | +# on a selection of voxels to speed up the computations. |
| 300 | + |
| 301 | +# pick the 10 best voxels |
| 302 | +voxel_selection = np.argsort(scores)[-10:] |
| 303 | + |
| 304 | +# define a pipeline with more delays |
| 305 | +pipeline_many_delays = make_pipeline( |
| 306 | + StandardScaler(with_mean=True, with_std=False), |
| 307 | + Delayer(delays=np.arange(7)), |
| 308 | + KernelRidgeCV( |
| 309 | + alphas=alphas, cv=cv, |
| 310 | + solver_params=dict(n_targets_batch=500, n_alphas_batch=5, |
| 311 | + n_targets_batch_refit=100)), |
| 312 | +) |
| 313 | + |
| 314 | +pipeline_many_delays.fit(X_train, Y_train[:, voxel_selection]) |
| 315 | + |
| 316 | +# get the (primal) ridge regression coefficients |
| 317 | +primal_coef = pipeline_many_delays[-1].get_primal_coef() |
| 318 | +primal_coef = backend.to_numpy(primal_coef) |
| 319 | + |
| 320 | +# get the delays |
| 321 | +delays = pipeline_many_delays.named_steps['delayer'].delays |
| 322 | +# split the ridge coefficients per delays |
| 323 | +primal_coef_per_delay = np.stack(np.split(primal_coef, len(delays), axis=0)) |
| 324 | + |
| 325 | +# select the feature with the largest coefficients for each voxel |
| 326 | +feature_selection = np.argmax(np.sum(np.abs(primal_coef_per_delay), axis=0), |
| 327 | + axis=0) |
| 328 | +primal_coef_selection = primal_coef_per_delay[:, feature_selection, |
| 329 | + np.arange(len(voxel_selection))] |
| 330 | + |
| 331 | +plt.plot(delays, primal_coef_selection) |
| 332 | +plt.xlabel('Delays') |
| 333 | +plt.xticks(delays) |
| 334 | +plt.ylabel('Ridge coefficients') |
| 335 | +plt.title(f'Largest feature for the {len(voxel_selection)} best voxels') |
| 336 | +plt.axhline(0, color='k', linewidth=0.5) |
| 337 | +plt.show() |
| 338 | + |
| 339 | +############################################################################### |
| 340 | +# We see that the hemodynamic response function (HRF) is captured in the model |
| 341 | +# weights. In practice, we can limit the number of features by using only |
| 342 | +# the most informative delays, for example [1, 2, 3, 4]. |
0 commit comments