Skip to content

Commit dd29e35

Browse files
committed
Added Transformers.rb examples [skip ci]
1 parent 1562765 commit dd29e35

5 files changed

Lines changed: 130 additions & 0 deletions

File tree

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ Or check out some examples:
2525

2626
- [Embeddings](examples/openai_embeddings.rb) with OpenAI
2727
- [Binary embeddings](examples/cohere_embeddings.rb) with Cohere
28+
- [Sentence embeddings](examples/sentence_embeddings.rb) with Transformers.rb
29+
- [Hybrid search](examples/hybrid_search.rb) with Transformers.rb (Reciprocal Rank Fusion)
30+
- [Sparse search](examples/sparse_search.rb) with Transformers.rb
2831
- [Morgan fingerprints](examples/morgan_fingerprints.rb) with RDKit.rb
2932
- [Topic modeling](examples/topic_modeling.rb) with tomoto.rb
3033
- [User-based recommendations](examples/disco_user_recs.rb) with Disco

examples/Gemfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@ gem "pg"
88
gem "rdkit-rb"
99
gem "sequel"
1010
gem "tomoto"
11+
gem "transformers-rb"

examples/hybrid_search.rb

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
# frozen_string_literal: true

# Hybrid search example: combines semantic (vector) search and keyword
# (full-text) search over the same table, merging the two result lists
# with Reciprocal Rank Fusion (RRF).
require "pg"
require "pgvector"
require "transformers-rb"

conn = PG.connect(dbname: "pgvector_example")
conn.exec("CREATE EXTENSION IF NOT EXISTS vector")

conn.exec("DROP TABLE IF EXISTS documents")
conn.exec("CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding vector(384))")
# GIN index supports the keyword (tsvector) half of the hybrid query
conn.exec("CREATE INDEX ON documents USING GIN (to_tsvector('english', content))")

# 384-dimensional sentence embeddings (matches vector(384) above)
model = Transformers::SentenceTransformer.new("sentence-transformers/multi-qa-MiniLM-L6-cos-v1")

input = [
  "The dog is barking",
  "The cat is purring",
  "The bear is growling"
]
embeddings = model.encode(input)
input.zip(embeddings) do |content, embedding|
  conn.exec_params("INSERT INTO documents (content, embedding) VALUES ($1, $2)", [content, embedding])
end

# $1 = query text, $2 = query embedding, $3 = RRF constant k.
# Each CTE ranks its top 20 hits; RRF scores them as 1 / (k + rank) and
# the FULL OUTER JOIN keeps documents found by either method.
sql = <<~SQL
  WITH semantic_search AS (
    SELECT id, RANK () OVER (ORDER BY embedding <=> $2) AS rank
    FROM documents
    ORDER BY embedding <=> $2
    LIMIT 20
  ),
  keyword_search AS (
    SELECT id, RANK () OVER (ORDER BY ts_rank_cd(to_tsvector('english', content), query) DESC) AS rank
    FROM documents, plainto_tsquery('english', $1) query
    WHERE to_tsvector('english', content) @@ query
    ORDER BY ts_rank_cd(to_tsvector('english', content), query) DESC
    LIMIT 20
  )
  SELECT
    COALESCE(semantic_search.id, keyword_search.id) AS id,
    COALESCE(1.0 / ($3 + semantic_search.rank), 0.0) +
    COALESCE(1.0 / ($3 + keyword_search.rank), 0.0) AS score
  FROM semantic_search
  FULL OUTER JOIN keyword_search ON semantic_search.id = keyword_search.id
  ORDER BY score DESC
  LIMIT 5
SQL
query = "growling bear"
query_embedding = model.encode(query)
k = 60 # standard RRF smoothing constant
result = conn.exec_params(sql, [query, query_embedding, k])
result.each do |row|
  puts "document: #{row["id"]}, RRF score: #{row["score"]}"
end

examples/sentence_embeddings.rb

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
# Sentence-embeddings example: store Transformers.rb sentence embeddings in
# pgvector and find the documents nearest to a given one by cosine distance.
require "pg"
require "pgvector"
require "transformers-rb"

conn = PG.connect(dbname: "pgvector_example")
conn.exec("CREATE EXTENSION IF NOT EXISTS vector")

conn.exec("DROP TABLE IF EXISTS documents")
conn.exec("CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding vector(384))")

# all-MiniLM-L6-v2 produces 384-dimensional embeddings, matching vector(384)
model = Transformers::SentenceTransformer.new("sentence-transformers/all-MiniLM-L6-v2")

input = [
  "The dog is barking",
  "The cat is purring",
  "The bear is growling"
]
embeddings = model.encode(input)

# Insert each sentence together with its embedding
embeddings.each_with_index do |embedding, i|
  conn.exec_params("INSERT INTO documents (content, embedding) VALUES ($1, $2)", [input[i], embedding])
end

# Nearest neighbors of document 1 (excluding itself), by cosine distance (<=>)
document_id = 1
neighbors = conn.exec_params("SELECT content FROM documents WHERE id != $1 ORDER BY embedding <=> (SELECT embedding FROM documents WHERE id = $1) LIMIT 5", [document_id])
neighbors.each { |row| puts row["content"] }

examples/sparse_search.rb

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
# Sparse-vector search example: encode documents into sparse lexical
# embeddings with an OpenSearch neural sparse encoder and search them with
# pgvector's sparsevec type.
#
# good resources
# https://opensearch.org/blog/improving-document-retrieval-with-sparse-semantic-encoders/
# https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-v1

require "pg"
require "pgvector"
require "transformers-rb"

conn = PG.connect(dbname: "pgvector_example")
conn.exec("CREATE EXTENSION IF NOT EXISTS vector")

conn.exec("DROP TABLE IF EXISTS documents")
# 30522 dimensions — one per entry of the encoder's (BERT-style) vocabulary
conn.exec("CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding sparsevec(30522))")

model_id = "opensearch-project/opensearch-neural-sparse-encoding-v1"
model = Transformers::AutoModelForMaskedLM.from_pretrained(model_id)
tokenizer = Transformers::AutoTokenizer.from_pretrained(model_id)
# Vocabulary ids of special tokens ([CLS], [SEP], ...); zeroed out below so
# they never contribute to the sparse embedding
special_token_ids = tokenizer.special_tokens_map.map { |_, token| tokenizer.vocab[token] }

# Returns one sparse embedding (array of vocab-sized weights) per input string.
fetch_embeddings = lambda do |input|
  feature = tokenizer.(input, padding: true, truncation: true, return_tensors: "pt", return_token_type_ids: false)
  # [0] takes the masked-LM logits tensor from the model output
  output = model.(**feature)[0]

  # Max over the sequence dimension, with padding positions masked out via
  # the attention mask (padded logits multiply to 0 before the max)
  values, _ = Torch.max(output * feature[:attention_mask].unsqueeze(-1), dim: 1)
  # relu keeps only positive activations; log(1 + x) dampens large values
  values = Torch.log(1 + Torch.relu(values))
  # Zero the special-token columns for every row in the batch
  values[0.., special_token_ids] = 0
  values.to_a
end

input = [
  "The dog is barking",
  "The cat is purring",
  "The bear is growling"
]
embeddings = fetch_embeddings.(input)
input.zip(embeddings) do |content, embedding|
  conn.exec_params("INSERT INTO documents (content, embedding) VALUES ($1, $2)", [content, Pgvector::SparseVector.new(embedding)])
end

query = "forest"
# Encoder expects an array; take the single embedding back out
query_embedding = fetch_embeddings.([query])[0]
# <#> is pgvector's (negative) inner-product operator, so ascending order
# returns the highest-inner-product matches first
result = conn.exec_params("SELECT content FROM documents ORDER BY embedding <#> $1 LIMIT 5", [Pgvector::SparseVector.new(query_embedding)])
result.each do |row|
  puts row["content"]
end

0 commit comments

Comments
 (0)