diff --git a/README.md b/README.md index 103a0301..f543ff43 100644 --- a/README.md +++ b/README.md @@ -199,6 +199,7 @@ The table below lists the recommendation models/algorithms featured in Cornac. E | 2013 | [Hidden Factors and Hidden Topics (HFT)](cornac/models/hft), [docs](https://cornac.readthedocs.io/en/stable/api_ref/models.html#module-cornac.models.hft.recom_hft), [paper](https://cs.stanford.edu/people/jure/pubs/reviews-recsys13.pdf) | Content-Based / Text | CPU | [quick-start](examples/hft_example.py) | 2012 | [Weighted Bayesian Personalized Ranking (WBPR)](cornac/models/bpr), [docs](https://cornac.readthedocs.io/en/stable/api_ref/models.html#weighted-bayesian-personalized-ranking-wbpr), [paper](http://proceedings.mlr.press/v18/gantner12a/gantner12a.pdf) | Collaborative Filtering | CPU | [quick-start](examples/bpr_netflix.py) | 2011 | [Collaborative Topic Regression (CTR)](cornac/models/ctr), [docs](https://cornac.readthedocs.io/en/stable/api_ref/models.html#module-cornac.models.ctr.recom_ctr), [paper](http://www.cs.columbia.edu/~blei/papers/WangBlei2011.pdf) | Content-Based / Text | CPU | [quick-start](examples/ctr_example_citeulike.py), [deep-dive](https://github.com/PreferredAI/tutorials/blob/master/recommender-systems/05_multimodality.ipynb) +| 2010 | [Factorizing Personalized Markov Chains (FPMC)](cornac/models/fpmc), [docs](https://cornac.readthedocs.io/en/stable/api_ref/models.html#module-cornac.models.fpmc.recom_fpmc), [paper](https://www.ismll.uni-hildesheim.de/pub/pdfs/RendleFreudenthaler2010-FPMC.pdf) | Next-Item | [requirements](cornac/models/fpmc/requirements.txt), CPU / GPU | [quick-start](examples/fpmc_diginetica.py) | Earlier | [Baseline Only](cornac/models/baseline_only), [docs](https://cornac.readthedocs.io/en/stable/api_ref/models.html#baseline-only), [paper](http://courses.ischool.berkeley.edu/i290-dm/s11/SECURE/a1-koren.pdf) | Baseline | CPU | [quick-start](examples/svd_example.py) | | [Bayesian Personalized Ranking 
(BPR)](cornac/models/bpr), [docs](https://cornac.readthedocs.io/en/stable/api_ref/models.html#bayesian-personalized-ranking-bpr) [paper](https://arxiv.org/ftp/arxiv/papers/1205/1205.2618.pdf) | Collaborative Filtering | CPU | [quick-start](examples/bpr_netflix.py), [deep-dive](https://github.com/recommenders-team/recommenders/blob/main/examples/02_model_collaborative_filtering/cornac_bpr_deep_dive.ipynb) | | [Factorization Machines (FM)](cornac/models/fm), [docs](https://cornac.readthedocs.io/en/stable/api_ref/models.html#factorization-machines-fm), [paper](https://www.csie.ntu.edu.tw/~b97053/paper/Factorization%20Machines%20with%20libFM.pdf) | Collaborative Filtering / Content-Based | Linux, CPU | [quick-start](examples/fm_example.py), [deep-dive](https://github.com/PreferredAI/tutorials/blob/master/recommender-systems/06_contextual_awareness.ipynb) diff --git a/cornac/datasets/README.md b/cornac/datasets/README.md index 438fa689..3dd614df 100644 --- a/cornac/datasets/README.md +++ b/cornac/datasets/README.md @@ -279,3 +279,5 @@ Session-aware recommendation extends next-item (session-based) recommendation by | [Diginetica](./diginetica.py) | 571 | 6,008 | 2,670 | 12,146 | 4.68 | 2.02 | 4.55 | 0.354% | | [RetailRocket](./retailrocket.py) | 4,249 | 36,658 | 24,732 | 230,817 | 5.82 | 6.30 | 9.33 | 0.148% | | [Cosmetics](./cosmetics.py) | 17,268 | 42,367 | 172,242 | 2,533,262 | 9.97 | 59.79 | 14.71 | 0.346% | + +For session-based (next-item) evaluation, [Diginetica](./diginetica.py)'s `load_val()` and `load_test()` default to `mode="session-based"`, returning each user's single held-out session (`val_sbr`/`test_sbr`) with no training transitions repeated — the clean evaluation set used by session-based models such as [FPMC](../models/fpmc/) and [GRU4Rec](../models/gru4rec/). Pass `mode="session-aware"` to load the cumulative files (`val`/`test`) instead, where each user's prior sessions precede their held-out one for cross-session models. 
def load_val(fmt="USIT", reader: Reader = None, mode="session-based") -> List:
    """Load the Diginetica validation split.

    Parameters
    ----------
    fmt: str, default: 'USIT'
        Format string forwarded unchanged to ``reader.read``.

    reader: `obj:cornac.data.Reader`, default: None
        Reader object used to read the data. A fresh ``Reader()`` is created
        when omitted.

    mode: str, default: 'session-based'
        Selects which cached file is read.

        - ``'session-based'``: the ``val_sbr`` file, i.e. each user's single
          held-out session only, with no training transitions repeated.
        - ``'session-aware'``: the cumulative ``val`` file, where each user's
          prior sessions precede the held-out session.

    Returns
    -------
    data: array-like
        Whatever ``reader.read`` returns for the selected file; with the
        default format this is a list of tuples
        (user, session, item, timestamp).

    Raises
    ------
    ValueError
        If ``mode`` is neither ``'session-based'`` nor ``'session-aware'``.
    """
    # Resolve the file stem first so an invalid mode fails before any download.
    if mode == "session-based":
        name = "val_sbr"
    elif mode == "session-aware":
        name = "val"
    else:
        raise ValueError(f"mode='{mode}' not supported; choose 'session-based' or 'session-aware'")

    fpath = cache(
        url=f"https://static.preferred.ai/cornac/datasets/diginetica/{name}.zip",
        unzip=True,
        relative_path=f"diginetica/{name}.csv",
    )
    if reader is None:
        reader = Reader()
    return reader.read(fpath, fmt=fmt, sep=",")


def load_test(fmt="USIT", reader: Reader = None, mode="session-based") -> List:
    """Load the Diginetica test split.

    Parameters
    ----------
    fmt: str, default: 'USIT'
        Format string forwarded unchanged to ``reader.read``.

    reader: `obj:cornac.data.Reader`, default: None
        Reader object used to read the data. A fresh ``Reader()`` is created
        when omitted.

    mode: str, default: 'session-based'
        Selects which cached file is read.

        - ``'session-based'``: the ``test_sbr`` file, i.e. each user's single
          held-out session only, with no training transitions repeated.
        - ``'session-aware'``: the cumulative ``test`` file, where each user's
          prior sessions precede the held-out session.

    Returns
    -------
    data: array-like
        Whatever ``reader.read`` returns for the selected file; with the
        default format this is a list of tuples
        (user, session, item, timestamp).

    Raises
    ------
    ValueError
        If ``mode`` is neither ``'session-based'`` nor ``'session-aware'``.
    """
    # Resolve the file stem first so an invalid mode fails before any download.
    if mode == "session-based":
        name = "test_sbr"
    elif mode == "session-aware":
        name = "test"
    else:
        raise ValueError(f"mode='{mode}' not supported; choose 'session-based' or 'session-aware'")

    fpath = cache(
        url=f"https://static.preferred.ai/cornac/datasets/diginetica/{name}.zip",
        unzip=True,
        relative_path=f"diginetica/{name}.csv",
    )
    if reader is None:
        reader = Reader()
    return reader.read(fpath, fmt=fmt, sep=",")
class FPMC_Model(nn.Module):
    """Factorizing Personalized Markov Chains (Rendle et al., 2010).

    Score for user ``u``, last item ``i``, candidate ``j``::

        s(u, i, j) = dot(UI[u], IU[j]) + dot(LI[i], IL[j]) + b[j]

    ``UI``/``IU`` are the user and candidate-item factors of the
    matrix-factorization term, ``LI``/``IL`` are the last-item and
    candidate-item factors of the Markov-chain term, and ``b`` is the
    per-item bias.

    ``forward`` scores every row of the batch against every candidate, so it
    returns a ``(B, B+N)`` matrix when ``out_iids`` contains the in-batch
    positives followed by ``N`` shared negatives.

    Parameters
    ----------
    user_num: int
        Number of users. One extra row (index ``user_num``) is allocated as
        the user padding row.

    item_num: int
        Number of items. One extra row is allocated for the padding item.

    factor_num: int
        Dimension of every latent factor.

    pad_idx: int, optional, default: None
        Index of the padding item; ``item_num`` is used when omitted.

    device: str, optional, default: 'cpu'
        Stored on the instance for callers that read it. Tensors created by
        ``predict`` follow the device the parameters actually live on.
    """

    def __init__(self, user_num, item_num, factor_num, pad_idx=None, device="cpu"):
        super().__init__()
        self.user_num = user_num
        self.item_num = item_num
        self.factor_num = factor_num
        self.device = device
        self.pad_idx = pad_idx if pad_idx is not None else item_num

        # User-Item (target) embeddings
        self.UI_emb = nn.Embedding(user_num + 1, factor_num, padding_idx=user_num)
        # Item-User factor: scoring with UI
        self.IU_emb = nn.Embedding(item_num + 1, factor_num, padding_idx=self.pad_idx)
        # Last-Item factor (input)
        self.LI_emb = nn.Embedding(item_num + 1, factor_num, padding_idx=self.pad_idx)
        # Item-Last factor: scoring with LI
        self.IL_emb = nn.Embedding(item_num + 1, factor_num, padding_idx=self.pad_idx)
        self.item_biases = nn.Embedding(item_num + 1, 1, padding_idx=self.pad_idx)

        factor_tables = (self.UI_emb, self.IU_emb, self.LI_emb, self.IL_emb)
        for emb in factor_tables:
            nn.init.normal_(emb.weight, std=0.01)
        nn.init.constant_(self.item_biases.weight, 0)

        # The initializers above overwrite the whole weight matrix, including
        # the padding rows, so those rows are reset to zero explicitly.
        with torch.no_grad():
            for emb in factor_tables + (self.item_biases,):
                if emb.padding_idx is not None:
                    emb.weight[emb.padding_idx].zero_()

    def forward(self, in_uids, in_iids, out_iids):
        """Score each (user, last item) row against each candidate item.

        Parameters
        ----------
        in_uids: LongTensor of shape (B,)
            User indices.

        in_iids: LongTensor of shape (B,)
            Last-item indices, aligned with ``in_uids``.

        out_iids: LongTensor of shape (M,)
            Candidate item indices shared by all rows.

        Returns
        -------
        Tensor of shape (B, M)
            ``s(u, i, j)`` for every row/candidate combination.
        """
        last_item_emb = self.LI_emb(in_iids)  # (B, D)
        user_emb = self.UI_emb(in_uids)  # (B, D)
        iu_emb = self.IU_emb(out_iids)  # (M, D)
        il_emb = self.IL_emb(out_iids)  # (M, D)
        bias = self.item_biases(out_iids)  # (M, 1)

        mf = torch.einsum("be,ne->bn", user_emb, iu_emb)
        fmc = torch.einsum("ne,be->bn", il_emb, last_item_emb)
        # (M, 1) -> (M,) so the bias broadcasts over the batch dimension.
        return mf + fmc + bias.squeeze(-1)

    @torch.no_grad()
    def predict(self, user_idx, last_iid, item_indices=None):
        """Return candidate scores as a NumPy array.

        Parameters
        ----------
        user_idx: int, sequence of int, or LongTensor
            User index (or a batch of them).

        last_iid: int, sequence of int, or LongTensor
            Last-item index (or a batch), aligned with ``user_idx``.

        item_indices: array-like of int, optional, default: None
            Candidate items to score; all ``item_num`` items when omitted.

        Returns
        -------
        numpy.ndarray
            Shape ``(M,)`` for a single (user, last item) pair, including the
            ``M == 1`` case, and ``(B, M)`` for a batch of ``B > 1`` pairs.
        """
        # Follow the parameters rather than the constructor's ``device``
        # string, which goes stale if the module is moved with ``.to()``.
        device = self.IU_emb.weight.device
        if item_indices is None:
            item_indices = torch.arange(self.item_num, device=device)
        else:
            item_indices = torch.as_tensor(
                item_indices, dtype=torch.long, device=device
            )
        # reshape(-1) lets plain ints (0-d after conversion) act as a batch of one.
        user_idx = torch.as_tensor(user_idx, dtype=torch.long, device=device).reshape(-1)
        last_iid = torch.as_tensor(last_iid, dtype=torch.long, device=device).reshape(-1)

        # Drop only the batch axis; a bare squeeze() would also collapse a
        # single-candidate result into a 0-d array.
        scores = self.forward(user_idx, last_iid, item_indices).squeeze(0)
        return scores.detach().cpu().numpy()
class FPMC(NextItemRecommender):
    """Factorizing Personalized Markov Chains for next-item recommendation.

    Operates on ``(user, last_item) -> next_item`` triples. Training reuses
    the shared session iterator with ``max_len=1`` so each example contributes
    a single ``(last_item, target)`` pair; the last item only comes from the
    current session (session-based).

    Parameters
    ----------
    name: string, default: 'FPMC'

    embedding_dim: int, optional, default: 100
        Latent factor dimension.

    loss: str, optional, default: 'bpr'
        Loss function. Supported: 'bpr', 'bce', 'ce', 'bpr-max', 'softmax',
        'cross-entropy', 'xe_softmax', 'top1'.

    batch_size: int, optional, default: 512
        Batch size handed to the session iterator.

    learning_rate: float, optional, default: 0.05
        Learning rate of the IndexedAdagradM optimizer.

    momentum: float, optional, default: 0.0
        Momentum for the IndexedAdagradM optimizer.

    n_sample: int, optional, default: 2048
        Number of sampled negatives, handed to the session iterator and to
        the loss function.

    sample_alpha: float, optional, default: 0.5
        Sampling parameter handed to the session iterator.

    n_epochs: int, optional, default: 10
        Number of training epochs.

    bpreg, elu_param: only used when ``loss="bpr-max"``.

    device: str, optional, default: 'cpu'
        Set to 'cuda' for GPU support.

    trainable: bool, optional, default: True
        When False, the model will not be re-trained.

    verbose: bool, optional, default: False
        When True, running logs are displayed.

    seed: int, optional, default: None
        Seed for the sampling RNG and for ``torch.manual_seed``; the torch
        seed is 0 when ``seed`` is None.

    model_selection: str, optional, default: 'last'
        One of 'last' or 'best'. When 'best', the model with the highest
        validation score (evaluated every ``val_eval_every`` epochs) is
        restored at the end of ``fit``.

    val_eval_every: int, optional, default: 5
        Validation interval in epochs. Must be at least 1 when
        ``model_selection='best'``.

    val_k: int, optional, default: 20
    val_metric: str, optional, default: 'recall'
        Cutoff and metric used for best-on-val selection. See
        :func:`cornac.models.seq_utils.val_score`.

    References
    ----------
    Rendle, S., Freudenthaler, C., & Schmidt-Thieme, L. (2010).
    Factorizing personalized Markov chains for next-basket recommendation. WWW.
    """

    def __init__(
        self,
        name="FPMC",
        embedding_dim=100,
        loss="bpr",
        batch_size=512,
        learning_rate=0.05,
        momentum=0.0,
        n_sample=2048,
        sample_alpha=0.5,
        n_epochs=10,
        bpreg=1.0,
        elu_param=0.5,
        device="cpu",
        trainable=True,
        verbose=False,
        seed=None,
        model_selection="last",
        val_eval_every=5,
        val_k=20,
        val_metric="recall",
    ):
        super().__init__(name, trainable=trainable, verbose=verbose)
        if loss not in SUPPORTED_LOSSES:
            raise ValueError(
                f"loss='{loss}' not supported; choose from {SUPPORTED_LOSSES}"
            )
        if model_selection not in ("last", "best"):
            raise ValueError(
                f"model_selection='{model_selection}' not supported; choose 'last' or 'best'"
            )
        # fit() computes ``epoch_id % val_eval_every``; reject a zero or
        # negative interval here rather than failing with a
        # ZeroDivisionError after the first training epoch.
        if model_selection == "best" and val_eval_every < 1:
            raise ValueError(
                f"val_eval_every={val_eval_every} not supported; must be >= 1 "
                "when model_selection='best'"
            )
        self.embedding_dim = embedding_dim
        self.loss = loss
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.n_sample = n_sample
        self.sample_alpha = sample_alpha
        self.n_epochs = n_epochs
        self.bpreg = bpreg
        self.elu_param = elu_param
        self.device = device
        self.seed = seed
        self.rng = get_rng(seed)
        self.model_selection = model_selection
        self.val_eval_every = val_eval_every
        self.val_k = val_k
        self.val_metric = val_metric

    def fit(self, train_set, val_set=None):
        """Train the model on ``train_set``.

        Parameters
        ----------
        train_set: training data, forwarded to the parent ``fit``.

        val_set: validation data, optional, default: None
            Only used when ``model_selection='best'``.

        Returns
        -------
        self

        Notes
        -----
        With ``model_selection='best'`` the weights with the highest
        validation score are restored after the last epoch. If no validation
        ran (``val_set`` is None, or ``n_epochs < val_eval_every``), the
        last-epoch weights are kept.
        """
        super().fit(train_set, val_set)
        if not self.trainable:
            return self

        import torch

        from .fpmc import FPMC_Model
        from ..seq_utils.losses import get_loss_function
        from ..seq_utils.optim import IndexedAdagradM

        # NOTE(review): an unset seed still seeds torch with 0, so weight
        # initialization is deterministic either way -- confirm this is intended.
        torch.manual_seed(self.seed if self.seed is not None else 0)

        self.pad_idx = self.total_items
        self.model = FPMC_Model(
            user_num=self.total_users,
            item_num=self.total_items,
            factor_num=self.embedding_dim,
            pad_idx=self.pad_idx,
            device=self.device,
        ).to(self.device)

        loss_fn = get_loss_function(self.loss)
        loss_kwargs = dict(
            bpreg=self.bpreg, elu_param=self.elu_param, n_sample=self.n_sample
        )
        opt = IndexedAdagradM(
            self.model.parameters(), lr=self.learning_rate, momentum=self.momentum
        )

        best_val = -float("inf")
        best_state = None
        progress_bar = trange(1, self.n_epochs + 1, disable=not self.verbose)
        for epoch_id in progress_bar:
            self.model.train()
            total_loss = 0.0
            cnt = 0
            for inc, (in_uids, hist_iids, out_iids) in enumerate(
                session_seq_iter(
                    self.train_set,
                    pad_index=self.pad_idx,
                    batch_size=self.batch_size,
                    max_len=1,
                    n_sample=self.n_sample,
                    sample_alpha=self.sample_alpha,
                    rng=self.rng,
                    shuffle=True,
                )
            ):
                # Batches with fewer than two rows are skipped.
                # NOTE(review): presumably the in-batch-negative losses need
                # at least two rows -- confirm.
                if len(hist_iids) < 2:
                    continue
                in_uids_t = torch.tensor(in_uids, dtype=torch.long, device=self.device)
                # FPMC uses just the most recent item (hist_iids shape (B, 1)).
                last_iid_t = torch.tensor(
                    hist_iids[:, -1], dtype=torch.long, device=self.device
                )
                out_iids_t = torch.tensor(
                    out_iids, dtype=torch.long, device=self.device
                )

                self.model.zero_grad()
                item_scores = self.model(in_uids_t, last_iid_t, out_iids_t)
                L = loss_fn(
                    item_scores,
                    out_iids=out_iids_t,
                    batch_size=len(in_uids),
                    **loss_kwargs,
                )
                L.backward()
                opt.step()

                # .item() yields a plain Python float for the running average.
                total_loss += L.item() * len(in_uids)
                cnt += len(in_uids)
                if inc % 10 == 0 and cnt > 0:
                    progress_bar.set_postfix(loss=(total_loss / cnt))

            if (
                self.model_selection == "best"
                and val_set is not None
                and epoch_id % self.val_eval_every == 0
            ):
                val = val_score(
                    self, self.train_set, val_set, metric=self.val_metric, k=self.val_k
                )
                if val is not None and val > best_val:
                    best_val = val
                    # Clone so later optimizer steps cannot mutate the snapshot.
                    best_state = {
                        n: p.detach().clone()
                        for n, p in self.model.state_dict().items()
                    }

        if self.model_selection == "best" and best_state is not None:
            self.model.load_state_dict(best_state)
        return self

    def score(self, user_idx, history_items, **kwargs):
        """Score all items as the next item after ``history_items``.

        Parameters
        ----------
        user_idx: int
            User index. Values outside ``[0, total_users)`` fall back to the
            user padding row.

        history_items: sequence of int
            Items seen so far; only the last one is used. A last item outside
            ``[0, total_items)`` falls back to the item padding row.

        Returns
        -------
        numpy.ndarray of shape (total_items,)
            Scores for every item; all ones when ``history_items`` is empty.
        """
        import torch

        if len(history_items) == 0:
            return np.ones(self.total_items, dtype="float")
        last = int(history_items[-1])
        # Unknown last item: use the padding row instead of letting the
        # embedding lookup raise on an out-of-range index.
        if not 0 <= last < self.total_items:
            last = self.model.pad_idx
        # Cap user index to known users (cold-start fallback to padding row).
        u_idx = user_idx if 0 <= user_idx < self.total_users else self.total_users
        self.model.eval()
        with torch.no_grad():
            u_t = torch.tensor([u_idx], dtype=torch.long, device=self.device)
            i_t = torch.tensor([last], dtype=torch.long, device=self.device)
            cdds = torch.arange(self.total_items, dtype=torch.long, device=self.device)
            return self.model.predict(u_t, i_t, cdds)
@@ -129,6 +140,10 @@ def __init__( elu_param=0.5, logq=0.0, device="cpu", + model_selection="last", + val_eval_every=5, + val_k=20, + val_metric="recall", trainable=True, verbose=False, seed=None, @@ -136,6 +151,10 @@ def __init__( super().__init__(name, trainable=trainable, verbose=verbose) if loss not in SUPPORTED_LOSSES: raise ValueError(f"loss='{loss}' not supported; choose from {SUPPORTED_LOSSES}") + if model_selection not in ("last", "best"): + raise ValueError( + f"model_selection='{model_selection}' not supported; choose 'last' or 'best'" + ) self.layers = layers self.loss = loss self.batch_size = batch_size @@ -152,6 +171,10 @@ def __init__( self.elu_param = elu_param self.logq = logq self.device = device + self.model_selection = model_selection + self.val_eval_every = val_eval_every + self.val_k = val_k + self.val_metric = val_metric self.seed = seed self.rng = get_rng(seed) @@ -206,8 +229,11 @@ def fit(self, train_set, val_set=None): opt = IndexedAdagradM(self.model.parameters(), self.learning_rate, self.momentum) + best_val = -float("inf") + best_state = None progress_bar = trange(1, self.n_epochs + 1, disable=not self.verbose) - for _ in progress_bar: + for epoch_id in progress_bar: + self.model.train() H = [ torch.zeros( (self.batch_size, self.layers[i]), @@ -247,6 +273,24 @@ def fit(self, train_set, val_set=None): cnt += len(in_iids) if inc % 10 == 0 and cnt > 0: progress_bar.set_postfix(loss=(total_loss / cnt)) + + if ( + self.model_selection == "best" + and val_set is not None + and epoch_id % self.val_eval_every == 0 + ): + score = val_score( + self, self.train_set, val_set, metric=self.val_metric, k=self.val_k + ) + if score is not None and score > best_val: + best_val = score + best_state = { + n: p.detach().clone() + for n, p in self.model.state_dict().items() + } + + if self.model_selection == "best" and best_state is not None: + self.model.load_state_dict(best_state) return self def score(self, user_idx, history_items, **kwargs): diff --git 
def val_score(model, train_set, val_set, metric="recall", k=20):
    """Evaluate ``model`` on ``val_set`` with one next-item ranking metric.

    The work is delegated to
    :func:`cornac.eval_methods.next_item_evaluation.ranking_eval`, called with
    ``exclude_unknowns=True`` and ``verbose=False``, so a model can track its
    best-on-val checkpoint across epochs.

    Parameters
    ----------
    model: :obj:`cornac.models.NextItemRecommender`
        The model being trained; forwarded to ``ranking_eval``.

    train_set, val_set: :obj:`cornac.data.SequentialDataset`
        Forwarded to ``ranking_eval`` as ``train_set`` and ``test_set``.

    metric: str, optional, default: 'recall'
        One of ``'recall'``, ``'ndcg'``, ``'auc'``, ``'mrr'``
        (case-insensitive).

    k: int, optional, default: 20
        Cutoff for ``'recall'`` and ``'ndcg'``; ignored for ``'auc'`` and
        ``'mrr'``.

    Returns
    -------
    float or None
        The averaged metric value, ``0.0`` when ``ranking_eval`` reports no
        averages, or ``None`` if ``val_set`` is ``None``.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported names.
    """
    if val_set is None:
        return None

    from ...eval_methods.next_item_evaluation import ranking_eval
    from ...metrics import AUC, MRR, NDCG, Recall

    # Factories rather than instances, so only the requested metric is built.
    factories = {
        "recall": lambda: Recall(k=k),
        "ndcg": lambda: NDCG(k=k),
        "auc": lambda: AUC(),
        "mrr": lambda: MRR(),
    }
    make_metric = factories.get(metric.lower())
    if make_metric is None:
        raise ValueError(
            f"val_metric='{metric}' not supported; choose from recall/ndcg/auc/mrr"
        )

    averages, _ = ranking_eval(
        model=model,
        metrics=[make_metric()],
        train_set=train_set,
        test_set=val_set,
        exclude_unknowns=True,
        verbose=False,
    )
    if not averages:
        return 0.0
    return float(averages[0])
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Example of Factorizing Personalized Markov Chains (FPMC) with Diginetica data""" + +import cornac +from cornac.datasets import diginetica +from cornac.eval_methods import NextItemEvaluation +from cornac.metrics import MRR, NDCG, Recall +from cornac.models import FPMC + +train_data = diginetica.load_train() +val_data = diginetica.load_val() +test_data = diginetica.load_test() +print("data loaded") + +next_item_eval = NextItemEvaluation.from_splits( + train_data=train_data, + val_data=val_data, + test_data=test_data, + exclude_unknowns=True, + verbose=True, + fmt="USIT", +) + +model = FPMC( + embedding_dim=64, + loss="cross-entropy", + n_sample=512, + batch_size=128, + learning_rate=0.1, + n_epochs=100, + model_selection="best", + val_eval_every=5, + val_metric="ndcg", + val_k=10, + device="cpu", + verbose=True, + seed=123, +) + +metrics = [ + NDCG(k=10), + NDCG(k=50), + Recall(k=10), + Recall(k=50), + MRR(), +] + +cornac.Experiment( + eval_method=next_item_eval, + models=[model], + metrics=metrics, +).run() diff --git a/tests/cornac/datasets/test_diginetica.py b/tests/cornac/datasets/test_diginetica.py index 14225590..2448157c 100644 --- a/tests/cornac/datasets/test_diginetica.py +++ b/tests/cornac/datasets/test_diginetica.py @@ -26,12 +26,17 @@ def test_load_train_val_test(self): random.seed(time.time()) if random.random() > 0.8: train = diginetica.load_train() + # default mode is 'session-based': each user's single held-out session val = diginetica.load_val() test = 
diginetica.load_test() self.assertEqual(len(train), 7273) - self.assertEqual(len(val), 9733) - self.assertEqual(len(test), 9686) + self.assertEqual(len(val), 2460) + self.assertEqual(len(test), 2413) + + # 'session-aware' returns the full cumulative file (held-out + history) + self.assertEqual(len(diginetica.load_val(mode="session-aware")), 9733) + self.assertEqual(len(diginetica.load_test(mode="session-aware")), 9686) if __name__ == "__main__":