|
289 | 289 | # the solver over cross-validation. This plot is helpful to refine the alpha |
290 | 290 | # grid if the range is too small or too large. |
291 | 291 | # |
292 | | -# Note that some voxels might be at the maximum regularization value in the grid |
293 | | -# search. These are voxels where the model has no predictive power, thus |
294 | | -# the optimal regularization parameter is large to lead to a prediction equal |
| 292 | +# Note that some voxels might be at the maximum regularization value in the |
|  293 | +# grid search. These are voxels where the model has no predictive power, so
|  294 | +# the optimal regularization parameter is large, shrinking the prediction
295 | 295 | # to zero. We do not need to extend the alpha range for these voxels. |
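|  | +# If, instead, many best alphas pile up at the edge of the grid for voxels
|  | +# with good predictions, the grid can be widened and the model refit. A
|  | +# hypothetical sketch, assuming the ``alphas`` array and the fitted
|  | +# ``pipeline`` defined earlier in this tutorial::
|  | +#
|  | +#     alphas_wider = np.logspace(-4, 24, 29)  # wider log-spaced grid
|  | +#     pipeline[-1].set_params(alphas=alphas_wider)
|  | +#     pipeline.fit(X_train, Y_train)  # refit with the wider grid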
296 | 296 |
|
297 | 297 | from himalaya.viz import plot_alphas_diagnostic |
|
305 | 305 | # |
306 | 306 | # To present an example of model comparison, we define here another model, |
307 | 307 | # without feature delays (i.e. no ``Delayer``). Because the BOLD signal is |
308 | | -# inherently slow due to the dynamics of neuro-vascular coupling, this model |
309 | | -# is unlikely to perform well. |
| 308 | +# inherently slow due to the dynamics of neuro-vascular coupling, this model is |
| 309 | +# unlikely to perform well. |
310 | 310 |
|
311 | 311 | pipeline_nodelay = make_pipeline( |
312 | 312 | StandardScaler(with_mean=True, with_std=False), |
|
351 | 351 | # Visualize the HRF |
352 | 352 | # ----------------- |
353 | 353 | # |
354 | | -# We just saw that delays are necessary to model BOLD responses. |
355 | | -# Here we show how the fitted ridge regression weights follow the hemodynamic response |
| 354 | +# We just saw that delays are necessary to model BOLD responses. Here we show |
| 355 | +# how the fitted ridge regression weights follow the hemodynamic response |
356 | 356 | # function (HRF). |
357 | 357 | # |
358 | 358 | # Fitting a kernel ridge regression results in a set of coefficients called the |
359 | | -# "dual" coefficients :math:`w`. These coefficients differ from the |
360 | | -# "primal" coefficients :math:`\beta` obtained with a ridge regression, |
361 | | -# but the primal coefficients can be computed from the dual coefficients |
362 | | -# using the training features :math:`X`: |
| 359 | +# "dual" coefficients :math:`w`. These coefficients differ from the "primal" |
| 360 | +# coefficients :math:`\beta` obtained with a ridge regression, but the primal |
| 361 | +# coefficients can be computed from the dual coefficients using the training |
| 362 | +# features :math:`X`: |
363 | 363 | # |
364 | 364 | # .. math:: |
365 | 365 | # |
|
413 | 413 | # weights. In this dataset, we can limit the number of features by using only |
414 | 414 | # the most informative delays, for example [1, 2, 3, 4]. |
415 | 415 |
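|  | +###############################################################################
|  | +# As a sketch, a pipeline restricted to these delays could look as follows.
|  | +# This is hypothetical and not fit here; it assumes the ``Delayer``,
|  | +# ``alphas``, and ``cv`` objects defined earlier in this tutorial.
|  | +
|  | +pipeline_fewer_delays = make_pipeline(
|  | +    StandardScaler(with_mean=True, with_std=False),
|  | +    Delayer(delays=[1, 2, 3, 4]),  # keep only the most informative delays
|  | +    KernelRidgeCV(alphas=alphas, cv=cv),
|  | +)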
|
| 416 | +############################################################################### |
| 417 | +# Visualize the regression coefficients |
| 418 | +# ------------------------------------- |
| 419 | +# |
| 420 | +# Here, we go back to the main model on all voxels. Since our model is linear, |
| 421 | +# we can use the (primal) regression coefficients to interpret the model. The |
| 422 | +# basic intuition is that the model will use larger coefficients on features |
| 423 | +# that have more predictive power. |
| 424 | +# |
| 425 | +# Since we know the meaning of each feature, we can interpret the large |
| 426 | +# regression coefficients. In the case of wordnet features, we can even build |
| 427 | +# a graph that represents the features linked by a semantic relationship. |
| 428 | + |
| 429 | +############################################################################### |
| 430 | +# We first get the (primal) ridge regression coefficients from the fitted |
| 431 | +# model. |
| 432 | +primal_coef = pipeline[-1].get_primal_coef() |
| 433 | +primal_coef = backend.to_numpy(primal_coef) |
| 434 | +print("(n_delays * n_features, n_voxels) =", primal_coef.shape) |
| 435 | + |
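|  | +###############################################################################
|  | +# As a toy check of the primal/dual relationship described above (with random
|  | +# arrays, not the tutorial data), the primal coefficients are recovered by
|  | +# multiplying the training features with the dual coefficients.
|  | +rng = np.random.RandomState(0)
|  | +X_toy = rng.randn(50, 10)  # toy training features (n_samples, n_features)
|  | +w_toy = rng.randn(50, 3)  # toy dual coefficients (n_samples, n_targets)
|  | +beta_toy = X_toy.T @ w_toy  # primal coefficients (n_features, n_targets)
|  | +print("(n_features, n_targets) =", beta_toy.shape)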
| 436 | +############################################################################### |
| 437 | +# Here, we are only interested in the voxels with good generalization |
|  438 | +# performance. We select an arbitrary threshold of 0.05 (:math:`R^2` score).
| 439 | +primal_coef = primal_coef[:, scores > 0.05] |
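|  | +# count the selected voxels (``scores`` was computed earlier in this tutorial)
|  | +print("selected voxels =", primal_coef.shape[1], "/", scores.size)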
| 440 | + |
| 441 | +############################################################################### |
| 442 | +# Then, we aggregate the coefficients across the different delays. |
| 443 | + |
| 444 | +# get the delays |
| 445 | +delays = pipeline.named_steps['delayer'].delays |
| 446 | +print("delays =", delays) |
| 447 | + |
| 448 | +# split the ridge coefficients per delays |
| 449 | +primal_coef_per_delay = np.stack(np.split(primal_coef, len(delays), axis=0)) |
| 450 | +print("(n_delays, n_features, n_voxels) =", primal_coef_per_delay.shape) |
| 451 | + |
| 452 | +# average over delays |
| 453 | +average_coef = np.mean(primal_coef_per_delay, axis=0) |
| 454 | +print("(n_features, n_voxels) =", average_coef.shape) |
| 455 | + |
| 456 | +############################################################################### |
| 457 | +# Even after averaging over delays, the coefficient matrix is still too large |
|  458 | +# to interpret directly. Therefore, we use principal component analysis (PCA) to
| 459 | +# reduce the dimensionality of the matrix. |
| 460 | +from sklearn.decomposition import PCA |
| 461 | + |
| 462 | +pca = PCA(n_components=4) |
| 463 | +pca.fit(average_coef.T) |
| 464 | +components = pca.components_ |
| 465 | +print("(n_components, n_features) =", components.shape) |
| 466 | + |
| 467 | +############################################################################### |
|  468 | +# We can check the ratio of variance explained by each principal component.
|  469 | +# We see that the first four components already explain a large part of the
|  470 | +# variance in the coefficients.
| 471 | +print("PCA explained variance =", pca.explained_variance_ratio_) |
| 472 | + |
| 473 | +############################################################################### |
| 474 | +# Similarly to [1]_, we correct the coefficients of features linked by a |
| 475 | +# semantic relationship. Indeed, in the wordnet features, if a clip was labeled |
| 476 | +# with `wolf`, the authors automatically added the categories `canine`, |
|  477 | +# `carnivore`, `placental mammal`, `mammal`, `vertebrate`, `chordate`,
|  478 | +# `organism`, and `whole`. The authors thus argue that a similar correction
|  479 | +# needs to be applied to the coefficients.
| 480 | + |
| 481 | +from voxelwise_tutorials.wordnet import load_wordnet |
| 482 | +from voxelwise_tutorials.wordnet import correct_coefficients |
| 483 | +_, wordnet_categories = load_wordnet() |
| 484 | +components = correct_coefficients(components.T, wordnet_categories).T |
|  485 | +components -= components.mean(axis=1)[:, None]  # center each component
|  486 | +components /= components.std(axis=1)[:, None]  # scale each to unit std
| 487 | + |
| 488 | +############################################################################### |
|  489 | +# Finally, we plot the first principal component on the wordnet graph. In such a
| 490 | +# graph, links indicate "is a" relationships (e.g. an `athlete` "is a" |
| 491 | +# `person`). Each marker represents a single noun (circle) or verb (square). |
| 492 | +# The area of each marker indicates the principal component magnitude, and the |
| 493 | +# color indicates the sign (red is positive, blue is negative). |
| 494 | + |
| 495 | +from voxelwise_tutorials.wordnet import plot_wordnet_graph |
| 496 | +from voxelwise_tutorials.wordnet import apply_cmap |
| 497 | + |
| 498 | +first_component = components[0] |
| 499 | +node_sizes = np.abs(first_component) |
| 500 | +node_colors = apply_cmap(first_component, vmin=-2, vmax=2, cmap='coolwarm', |
| 501 | + n_colors=2) |
| 502 | + |
| 503 | +plot_wordnet_graph(node_colors=node_colors, node_sizes=node_sizes) |
| 504 | +plt.show() |
| 505 | + |
| 506 | +############################################################################### |
| 507 | +# According to the authors of [1]_, "this principal component distinguishes |
| 508 | +# between categories with high stimulus energy (e.g. moving objects like |
| 509 | +# `person` and `vehicle`) and those with low stimulus energy (e.g. stationary |
| 510 | +# objects like `sky` and `city`)". |
| 511 | +# |
|  512 | +# Our result is slightly different from the one in [1]_, since we use only one
|  513 | +# subject, and the voxel selection is slightly different. We also use a
|  514 | +# different regularization parameter in each voxel, while in [1]_ all voxels
|  515 | +# use the same regularization parameter.
| 516 | + |
| 517 | +############################################################################### |
| 518 | +# Following [1]_, we also plot the next three principal components on the |
| 519 | +# wordnet graph, mapping the three vectors to RGB colors. |
| 520 | + |
| 521 | +from voxelwise_tutorials.wordnet import scale_to_rgb_cube |
| 522 | + |
| 523 | +next_three_components = components[1:4].T |
| 524 | +node_sizes = np.linalg.norm(next_three_components, axis=1) |
| 525 | +node_colors = scale_to_rgb_cube(next_three_components) |
| 526 | + |
| 527 | +plot_wordnet_graph(node_colors=node_colors, node_sizes=node_sizes) |
| 528 | +plt.show() |
| 529 | + |
| 530 | +############################################################################### |
| 531 | +# According to the authors of [1]_, "this graph shows that categories thought |
| 532 | +# to be semantically related (e.g. athletes and walking) are represented |
| 533 | +# similarly in the brain". |
| 534 | +# |
|  535 | +# Again, our results are slightly different from those in [1]_, for the same reasons
| 536 | +# mentioned earlier. |
| 537 | + |
416 | 538 | ############################################################################### |
417 | 539 | # References |
418 | 540 | # ---------- |
|