Summary
The local reranker can fail when initializing SentenceTransformer for the configured Jina embedding model.
Error
TypeError: transformers.models.auto.tokenization_auto.AutoTokenizer.from_pretrained()
got multiple values for keyword argument 'trust_remote_code'
Relevant Stack
File "src/zotero_arxiv_daily/executor.py", line 112, in run
reranked_papers = self.reranker.rerank(all_papers, corpus)
File "src/zotero_arxiv_daily/reranker/local.py", line 22, in get_similarity_score
encoder = SentenceTransformer(self.config.reranker.local.model, trust_remote_code=True)
...
File ".../custom_st.py", line 50, in __init__
self.tokenizer = AutoTokenizer.from_pretrained(..., trust_remote_code=True, ...)
TypeError: ... got multiple values for keyword argument 'trust_remote_code'
Possible Cause
This looks like a compatibility issue between sentence-transformers and the Jina model's custom remote code path.
A likely explanation is that the model's remote code changed upstream. The same project code had been working before, and local reproduction now fetches updated files from the Hugging Face model repo, including custom_st.py, before hitting this error.
The failure seems to happen because:
- SentenceTransformer(..., trust_remote_code=True) passes trust_remote_code through its init kwargs
- the Jina remote custom_st.py also passes trust_remote_code=True explicitly to AutoTokenizer.from_pretrained(...)
- the tokenizer load path ends up receiving the same keyword twice
Local Reproduction
This reproduces locally with the current locked environment:
from sentence_transformers import SentenceTransformer
SentenceTransformer(
"jinaai/jina-embeddings-v5-text-nano",
trust_remote_code=True,
)
Current Package Versions
The local reproduction used the current locked environment with:
python==3.13.13
sentence-transformers==5.3.0
transformers==5.3.0
huggingface-hub==1.7.2
tokenizers==0.22.2
torch==2.11.0+cpu
Workaround
As a local workaround, a small patch to src/zotero_arxiv_daily/reranker/local.py can avoid passing trust_remote_code twice on the tokenizer path.
The .patch below was generated from the actual fix commit with git show --patch 3b4c649.
commit 3b4c649e0ecc913064a7c9b2f18c778422c60466
Author: Wenrui Huang <rijuyuezhuhwr@gmail.com>
Date: Fri Apr 10 09:32:31 2026 +0800
fix: monkey patch reranker
diff --git a/src/zotero_arxiv_daily/reranker/local.py b/src/zotero_arxiv_daily/reranker/local.py
index 411c666..19a4b83 100644
--- a/src/zotero_arxiv_daily/reranker/local.py
+++ b/src/zotero_arxiv_daily/reranker/local.py
@@ -1,7 +1,32 @@
from .base import BaseReranker, register_reranker
+from contextlib import contextmanager
import logging
import warnings
import numpy as np
+
+
+@contextmanager
+def _dedupe_trust_remote_code_for_tokenizer():
+ from sentence_transformers.models import Transformer
+
+ original = Transformer._load_init_kwargs.__func__
+
+ def patched(cls, *args, **kwargs):
+ init_kwargs = original(cls, *args, **kwargs)
+ tokenizer_args = init_kwargs.get("tokenizer_args")
+ if tokenizer_args:
+ tokenizer_args = dict(tokenizer_args)
+ tokenizer_args.pop("trust_remote_code", None)
+ init_kwargs["tokenizer_args"] = tokenizer_args
+ return init_kwargs
+
+ setattr(Transformer, "_load_init_kwargs", classmethod(patched))
+ try:
+ yield
+ finally:
+ setattr(Transformer, "_load_init_kwargs", classmethod(original))
+
+
@register_reranker("local")
class LocalReranker(BaseReranker):
def get_similarity_score(self, s1: list[str], s2: list[str]) -> np.ndarray:
@@ -19,7 +44,10 @@ class LocalReranker(BaseReranker):
logging.getLogger("huggingface_hub.utils._http").setLevel(logging.ERROR)
warnings.filterwarnings("ignore", category=FutureWarning)
- encoder = SentenceTransformer(self.config.reranker.local.model, trust_remote_code=True)
+ with _dedupe_trust_remote_code_for_tokenizer():
+ encoder = SentenceTransformer(
+ self.config.reranker.local.model, trust_remote_code=True
+ )
if self.config.reranker.local.encode_kwargs:
encode_kwargs = self.config.reranker.local.encode_kwargs
else:
This workaround was enough to make the local reproduction succeed again.
Summary
The local reranker can fail when initializing SentenceTransformer for the configured Jina embedding model.
Error
Relevant Stack
Possible Cause
This looks like a compatibility issue between sentence-transformers and the Jina model's custom remote code path.
A likely explanation is that the model's remote code changed upstream. The same project code had been working before, and local reproduction now fetches updated files from the Hugging Face model repo, including custom_st.py, before hitting this error.
The failure seems to happen because:
- SentenceTransformer(..., trust_remote_code=True) passes trust_remote_code through its init kwargs
- custom_st.py also passes trust_remote_code=True explicitly to AutoTokenizer.from_pretrained(...)
Local Reproduction
This reproduces locally with the current locked environment:
Current Package Versions
The local reproduction used the current locked environment with:
python==3.13.13
sentence-transformers==5.3.0
transformers==5.3.0
huggingface-hub==1.7.2
tokenizers==0.22.2
torch==2.11.0+cpu
Workaround
As a local workaround, a small patch to src/zotero_arxiv_daily/reranker/local.py can avoid passing trust_remote_code twice on the tokenizer path.
The .patch below was generated from the actual fix commit with git show --patch 3b4c649.
commit 3b4c649e0ecc913064a7c9b2f18c778422c60466
Author: Wenrui Huang <rijuyuezhuhwr@gmail.com>
Date: Fri Apr 10 09:32:31 2026 +0800
fix: monkey patch reranker
diff --git a/src/zotero_arxiv_daily/reranker/local.py b/src/zotero_arxiv_daily/reranker/local.py
index 411c666..19a4b83 100644
--- a/src/zotero_arxiv_daily/reranker/local.py
+++ b/src/zotero_arxiv_daily/reranker/local.py
@@ -1,7 +1,32 @@
from .base import BaseReranker, register_reranker
+from contextlib import contextmanager
import logging
import warnings
import numpy as np
+
+
+@contextmanager
+def _dedupe_trust_remote_code_for_tokenizer():
+ from sentence_transformers.models import Transformer
+
+ original = Transformer._load_init_kwargs.__func__
+
+ def patched(cls, *args, **kwargs):
+ init_kwargs = original(cls, *args, **kwargs)
+ tokenizer_args = init_kwargs.get("tokenizer_args")
+ if tokenizer_args:
+ tokenizer_args = dict(tokenizer_args)
+ tokenizer_args.pop("trust_remote_code", None)
+ init_kwargs["tokenizer_args"] = tokenizer_args
+ return init_kwargs
+
+ setattr(Transformer, "_load_init_kwargs", classmethod(patched))
+ try:
+ yield
+ finally:
+ setattr(Transformer, "_load_init_kwargs", classmethod(original))
+
+
@register_reranker("local")
class LocalReranker(BaseReranker):
def get_similarity_score(self, s1: list[str], s2: list[str]) -> np.ndarray:
@@ -19,7 +44,10 @@ class LocalReranker(BaseReranker):
logging.getLogger("huggingface_hub.utils._http").setLevel(logging.ERROR)
warnings.filterwarnings("ignore", category=FutureWarning)
- encoder = SentenceTransformer(self.config.reranker.local.model, trust_remote_code=True)
+ with _dedupe_trust_remote_code_for_tokenizer():
+ encoder = SentenceTransformer(
+ self.config.reranker.local.model, trust_remote_code=True
+ )
if self.config.reranker.local.encode_kwargs:
encode_kwargs = self.config.reranker.local.encode_kwargs
else:
This workaround was enough to make the local reproduction succeed again.