Fit a banded ridge model with both wordnet and motion energy features
=====================================================================

In this example, we model the fMRI responses with a banded ridge regression
model, with two different feature spaces. The two feature spaces used will be
motion energy and wordnet categories.

Since the relative scaling of both feature spaces is unknown, we use two
regularization hyperparameters (one per feature space). Just like with ridge
regression, we need to optimize the hyperparameters over cross-validation.

The banded ridge model is fitted with the package
`himalaya <https://github.com/gallantlab/himalaya>`_.
"""
| 17 | + |
# Path of the directory containing the dataset.
directory = '/data1/tutorials/vim-4/'

# Subject to analyze; edit this value to run another subject.
subject = "S01"
| 23 | + |
###############################################################################
# Load the data
# -------------
#
# The fMRI responses are our regression targets; they are loaded exactly as
# in the previous examples.
import os.path as op
import numpy as np

from voxelwise.io import load_hdf5_array

file_name = op.join(directory, "responses", f"{subject}_responses.hdf")
Y_train = load_hdf5_array(file_name, key="Y_train")
Y_test = load_hdf5_array(file_name, key="Y_test")
run_onsets = load_hdf5_array(file_name, key="run_onsets")

# The test set was recorded several times; average the repeats, since only
# the repeatable part of the fMRI response can be modeled.
Y_test = Y_test.mean(0)

# Replace NaNs (mostly found in non-cortical voxels) with zeros.
Y_train = np.nan_to_num(Y_train)
Y_test = np.nan_to_num(Y_test)
| 47 | + |
###############################################################################
# Then we load both feature spaces, which are the inputs of the linear
# regression model.

print("loading features..")

feature_names = ["motion_energy", "wordnet"]

Xs_train = []
Xs_test = []
n_features_list = []
for feature_space in feature_names:
    file_name = op.join(directory, "features", f"{feature_space}.hdf")
    space_train = load_hdf5_array(file_name, key="X_train").astype("float32")
    space_test = load_hdf5_array(file_name, key="X_test").astype("float32")

    Xs_train.append(space_train)
    Xs_test.append(space_test)
    # Remember each space's width, to locate it again after concatenation.
    n_features_list.append(space_train.shape[1])

# Stack both feature spaces side by side along the feature axis.
X_train = np.concatenate(Xs_train, 1)
X_test = np.concatenate(Xs_test, 1)
| 71 | + |
###############################################################################
# Define the cross-validation scheme
# ----------------------------------
#
# Once again we use a leave-one-run-out cross-validation split scheme.

from sklearn.model_selection import check_cv
from voxelwise.utils import generate_leave_one_run_out

n_samples_train = X_train.shape[0]

# Build a scikit-learn-compatible splitter, then wrap it with check_cv so the
# splits can be reused (the raw generator could only be consumed once).
cv = generate_leave_one_run_out(n_samples_train, run_onsets)
cv = check_cv(cv)
| 86 | + |
###############################################################################
# Define the model
# ----------------

from sklearn import set_config
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Render scikit-learn pipelines as HTML diagrams in the generated docs.
set_config(display='diagram')
| 97 | + |
###############################################################################
# To account for the hemodynamic response, the features are concatenated with
# multiple temporal delays; the linear regression then learns a separate
# weight for each delayed copy of each feature.
#
# With one sample every 2 seconds, delays [1, 2, 3, 4] cover most of the
# hemodynamic response peak.

from voxelwise.delayer import Delayer

###############################################################################
# Set himalaya's backend to "torch_cuda" to fit the model on GPU.
# The available backends are:
#
# - "numpy" (CPU) (default)
# - "torch" (CPU)
# - "torch_cuda" (GPU)
# - "cupy" (GPU)
from himalaya.backend import set_backend
backend = set_backend("torch_cuda")
| 118 | + |
###############################################################################
# The banded ridge model is fitted with ``himalaya``'s MultipleKernelRidgeCV
# estimator, which follows the scikit-learn API. Common parameters such as
# `kernels` or `solver` are given at initialization, while solver-specific
# options go into the separate `solver_params` parameter.

from himalaya.kernel_ridge import MultipleKernelRidgeCV

# We rely on the "random_search" solver; its specific parameters are
# documented in the solver function's docstring, printed below.
solver_function = MultipleKernelRidgeCV.ALL_SOLVERS["random_search"]
print("Docstring of the function %s:" % solver_function.__name__)
print(solver_function.__doc__)
| 133 | + |
###############################################################################
# We use 50 random-search iterations to keep the example reasonably fast;
# more iterations would give better convergence. Note that this method
# currently has no stopping criterion.
n_iter = 50

###############################################################################
# The scale of the regularization hyperparameter alpha is unknown a priori,
# so we search over a very large grid, checking after the fit that the best
# hyperparameters do not all sit on one edge of the grid.
alphas = np.logspace(1, 20, 20)

###############################################################################
# Batch sizes trade GPU memory for speed: larger values run a bit faster, but
# the solver might crash if it runs out of memory. Optimal values depend on
# the size of your dataset.
n_targets_batch = 200
n_alphas_batch = 5
n_targets_batch_refit = 200
| 153 | + |
###############################################################################
# All solver options are gathered in a single ``solver_params`` dictionary,
# which is handed to the main ``MultipleKernelRidgeCV`` estimator.

solver_params = dict(
    n_iter=n_iter,
    alphas=alphas,
    n_targets_batch=n_targets_batch,
    n_alphas_batch=n_alphas_batch,
    n_targets_batch_refit=n_targets_batch_refit,
)

# The kernels are precomputed upstream in the pipeline, one per feature space.
mkr_model = MultipleKernelRidgeCV(kernels="precomputed",
                                  solver="random_search",
                                  solver_params=solver_params, cv=cv)
| 166 | + |
###############################################################################
# Compared with previous examples, the banded ridge pipeline needs one
# precomputed kernel per feature space. The ColumnKernelizer computes them,
# and integrates conveniently into a scikit-learn Pipeline.

from himalaya.kernel_ridge import ColumnKernelizer
from himalaya.kernel_ridge import Kernelizer

# Recover where each feature space starts and ends inside X_train.
boundaries = np.concatenate([[0], np.cumsum(n_features_list)])
slices = [
    slice(left, right)
    for left, right in zip(boundaries[:-1], boundaries[1:])
]
| 182 | + |
###############################################################################
# Each feature space gets its own Kernelizer. A linear kernel is used for all
# spaces here, but ColumnKernelizer accepts any Kernelizer, or any
# scikit-learn Pipeline that ends with a Kernelizer.
preprocess_pipeline = make_pipeline(
    StandardScaler(with_mean=True, with_std=False),
    Delayer(delays=[1, 2, 3, 4]),
    Kernelizer(kernel="linear"),
)

# The column kernelizer applies the same preprocessing pipeline to each
# selection of features, as defined by ``slices``.
column_kernelizer = ColumnKernelizer([
    (space_name, preprocess_pipeline, columns)
    for space_name, columns in zip(feature_names, slices)
])

# ColumnKernelizer exposes an ``n_jobs`` parameter to parallelize the
# kernelizers, but such parallelism does not work with GPU arrays.

###############################################################################
# Finally, chain the kernelizer and the estimator into the full pipeline.

pipeline = make_pipeline(
    column_kernelizer,
    mkr_model,
)
| 210 | + |
###############################################################################
# Fit the model
# -------------
#
# Fit on the training set, then score on the held-out test set.

pipeline.fit(X_train, Y_train)

scores = pipeline.score(X_test, Y_test)

###############################################################################
# MultipleKernelRidgeCV ran on GPU, so the scores come back as a torch.Tensor
# on GPU. Move them to numpy arrays on CPU so they can be used e.g. in a
# matplotlib figure.
scores = backend.to_numpy(scores)
| 226 | + |
###############################################################################
# Compare with a ridge model
# --------------------------
#
# As a baseline, fit a model that shares a single regularization
# hyperparameter across all feature spaces, instead of one per space.

from himalaya.kernel_ridge import KernelRidgeCV

# Same batching options as the banded ridge solver above.
baseline_solver_params = dict(
    n_targets_batch=n_targets_batch,
    n_alphas_batch=n_alphas_batch,
    n_targets_batch_refit=n_targets_batch_refit,
)

pipeline_baseline = make_pipeline(
    StandardScaler(with_mean=True, with_std=False),
    Delayer(delays=[1, 2, 3, 4]),
    KernelRidgeCV(alphas=alphas, cv=cv,
                  solver_params=baseline_solver_params),
)

print("model fitting..")
pipeline_baseline.fit(X_train, Y_train)
scores_baseline = pipeline_baseline.score(X_test, Y_test)
scores_baseline = backend.to_numpy(scores_baseline)
| 250 | + |
###############################################################################
# The comparison is plotted as a 2D histogram over all 70k voxels. The
# diagonal corresponds to identical performance for both models; a
# distribution deviating from the diagonal means one model has better
# predictive performance than the other.
import matplotlib.pyplot as plt
from voxelwise.viz import plot_hist2d

ax = plot_hist2d(scores_baseline, scores)
ax.set_title('Generalization R2 scores')
ax.set_xlabel('KernelRidgeCV')
ax.set_ylabel('MultipleKernelRidgeCV')
plt.show()
0 commit comments