Skip to content

Commit dd29e35

Browse files
committed
Added Transformers.rb examples [skip ci]
1 parent 1562765 commit dd29e35

5 files changed

Lines changed: 130 additions & 0 deletions

File tree

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ Or check out some examples:
2525

2626
- [Embeddings](examples/openai_embeddings.rb) with OpenAI
2727
- [Binary embeddings](examples/cohere_embeddings.rb) with Cohere
28+
- [Sentence embeddings](examples/sentence_embeddings.rb) with Transformers.rb
29+
- [Hybrid search](examples/hybrid_search.rb) with Transformers.rb (Reciprocal Rank Fusion)
30+
- [Sparse search](examples/sparse_search.rb) with Transformers.rb
2831
- [Morgan fingerprints](examples/morgan_fingerprints.rb) with RDKit.rb
2932
- [Topic modeling](examples/topic_modeling.rb) with tomoto.rb
3033
- [User-based recommendations](examples/disco_user_recs.rb) with Disco

examples/Gemfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@ gem "pg"
88
gem "rdkit-rb"
99
gem "sequel"
1010
gem "tomoto"
11+
gem "transformers-rb"

examples/hybrid_search.rb

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
# frozen_string_literal: true

# Hybrid search example: combines semantic (vector) search and keyword
# (full-text) search over the same table, merging the two result lists
# with Reciprocal Rank Fusion (RRF).
require "pg"
require "pgvector"
require "transformers-rb"

conn = PG.connect(dbname: "pgvector_example")
conn.exec("CREATE EXTENSION IF NOT EXISTS vector")

conn.exec("DROP TABLE IF EXISTS documents")
conn.exec("CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding vector(384))")
# GIN index supports the keyword (tsvector) half of the hybrid query
conn.exec("CREATE INDEX ON documents USING GIN (to_tsvector('english', content))")

# 384-dimensional sentence embeddings (matches vector(384) above)
model = Transformers::SentenceTransformer.new("sentence-transformers/multi-qa-MiniLM-L6-cos-v1")

input = [
  "The dog is barking",
  "The cat is purring",
  "The bear is growling"
]
embeddings = model.encode(input)
input.zip(embeddings) do |content, embedding|
  conn.exec_params("INSERT INTO documents (content, embedding) VALUES ($1, $2)", [content, embedding])
end

# $1 = query text, $2 = query embedding, $3 = RRF constant k.
# Each CTE ranks its top 20 hits; RRF scores them as 1 / (k + rank) and
# the FULL OUTER JOIN keeps documents found by either method.
sql = <<~SQL
  WITH semantic_search AS (
    SELECT id, RANK () OVER (ORDER BY embedding <=> $2) AS rank
    FROM documents
    ORDER BY embedding <=> $2
    LIMIT 20
  ),
  keyword_search AS (
    SELECT id, RANK () OVER (ORDER BY ts_rank_cd(to_tsvector('english', content), query) DESC) AS rank
    FROM documents, plainto_tsquery('english', $1) query
    WHERE to_tsvector('english', content) @@ query
    ORDER BY ts_rank_cd(to_tsvector('english', content), query) DESC
    LIMIT 20
  )
  SELECT
    COALESCE(semantic_search.id, keyword_search.id) AS id,
    COALESCE(1.0 / ($3 + semantic_search.rank), 0.0) +
    COALESCE(1.0 / ($3 + keyword_search.rank), 0.0) AS score
  FROM semantic_search
  FULL OUTER JOIN keyword_search ON semantic_search.id = keyword_search.id
  ORDER BY score DESC
  LIMIT 5
SQL
query = "growling bear"
query_embedding = model.encode(query)
k = 60 # standard RRF smoothing constant
result = conn.exec_params(sql, [query, query_embedding, k])
result.each do |row|
  puts "document: #{row["id"]}, RRF score: #{row["score"]}"
end

examples/sentence_embeddings.rb

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
# Sentence-embeddings example: store Transformers.rb sentence embeddings in
# pgvector and find the documents nearest to a given one by cosine distance.
require "pg"
require "pgvector"
require "transformers-rb"

conn = PG.connect(dbname: "pgvector_example")
conn.exec("CREATE EXTENSION IF NOT EXISTS vector")

conn.exec("DROP TABLE IF EXISTS documents")
conn.exec("CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding vector(384))")

# all-MiniLM-L6-v2 produces 384-dimensional embeddings, matching vector(384)
model = Transformers::SentenceTransformer.new("sentence-transformers/all-MiniLM-L6-v2")

input = [
  "The dog is barking",
  "The cat is purring",
  "The bear is growling"
]
embeddings = model.encode(input)

# Insert each sentence together with its embedding
embeddings.each_with_index do |embedding, i|
  conn.exec_params("INSERT INTO documents (content, embedding) VALUES ($1, $2)", [input[i], embedding])
end

# Nearest neighbors of document 1 (excluding itself), by cosine distance (<=>)
document_id = 1
neighbors = conn.exec_params("SELECT content FROM documents WHERE id != $1 ORDER BY embedding <=> (SELECT embedding FROM documents WHERE id = $1) LIMIT 5", [document_id])
neighbors.each { |row| puts row["content"] }

examples/sparse_search.rb

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
# Sparse-vector search example: encode documents into sparse lexical
# embeddings with an OpenSearch neural sparse encoder and search them with
# pgvector's sparsevec type.
#
# good resources
# https://opensearch.org/blog/improving-document-retrieval-with-sparse-semantic-encoders/
# https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-v1

require "pg"
require "pgvector"
require "transformers-rb"

conn = PG.connect(dbname: "pgvector_example")
conn.exec("CREATE EXTENSION IF NOT EXISTS vector")

conn.exec("DROP TABLE IF EXISTS documents")
# 30522 dimensions — one per entry of the encoder's (BERT-style) vocabulary
conn.exec("CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding sparsevec(30522))")

model_id = "opensearch-project/opensearch-neural-sparse-encoding-v1"
model = Transformers::AutoModelForMaskedLM.from_pretrained(model_id)
tokenizer = Transformers::AutoTokenizer.from_pretrained(model_id)
# Vocabulary ids of special tokens ([CLS], [SEP], ...); zeroed out below so
# they never contribute to the sparse embedding
special_token_ids = tokenizer.special_tokens_map.map { |_, token| tokenizer.vocab[token] }

# Returns one sparse embedding (array of vocab-sized weights) per input string.
fetch_embeddings = lambda do |input|
  feature = tokenizer.(input, padding: true, truncation: true, return_tensors: "pt", return_token_type_ids: false)
  # [0] takes the masked-LM logits tensor from the model output
  output = model.(**feature)[0]

  # Max over the sequence dimension, with padding positions masked out via
  # the attention mask (padded logits multiply to 0 before the max)
  values, _ = Torch.max(output * feature[:attention_mask].unsqueeze(-1), dim: 1)
  # relu keeps only positive activations; log(1 + x) dampens large values
  values = Torch.log(1 + Torch.relu(values))
  # Zero the special-token columns for every row in the batch
  values[0.., special_token_ids] = 0
  values.to_a
end

input = [
  "The dog is barking",
  "The cat is purring",
  "The bear is growling"
]
embeddings = fetch_embeddings.(input)
input.zip(embeddings) do |content, embedding|
  conn.exec_params("INSERT INTO documents (content, embedding) VALUES ($1, $2)", [content, Pgvector::SparseVector.new(embedding)])
end

query = "forest"
# Encoder expects an array; take the single embedding back out
query_embedding = fetch_embeddings.([query])[0]
# <#> is pgvector's (negative) inner-product operator, so ascending order
# returns the highest-inner-product matches first
result = conn.exec_params("SELECT content FROM documents ORDER BY embedding <#> $1 LIMIT 5", [Pgvector::SparseVector.new(query_embedding)])
result.each do |row|
  puts row["content"]
end

0 commit comments

Comments
 (0)