Skip to content

Commit d2baae3

Browse files
authored
Merge pull request #279 from ncbo/feature/precompute-ancestors-indexing
Precompute ancestor hierarchy to speed up term indexing
2 parents 23e0a3e + 5ffb7b6 commit d2baae3

3 files changed

Lines changed: 289 additions & 6 deletions

File tree

lib/ontologies_linked_data/models/class.rb

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,14 @@ class Class < LinkedData::Models::Base
1515
include LinkedData::Concerns::Concept::InScheme
1616
include LinkedData::Concerns::Concept::InCollection
1717

18+
# Optional bulk-indexing accelerator. When non-nil, index_doc reads
# ancestors from this precomputed map (class URI string => Set of
# ancestor URI strings) instead of issuing a per-class SPARQL
# traversal. OntologySubmissionIndexer populates it before indexing a
# submission and resets it to nil when the run finishes.
class << self
  attr_accessor :ancestors_cache
end
25+
1826
model :class, name_with: :id, collection: :submission,
1927
namespace: :owl, :schemaless => :true,
2028
rdf_type: lambda { |*x| self.class_rdf_type(x) }
@@ -254,15 +262,17 @@ def index_doc(to_set=nil)
254262
end
255263

256264
begin
  # Prefer the precomputed ancestors_cache when the bulk indexer has
  # populated it; otherwise fall back to the per-class SPARQL
  # traversal (e.g. a single class reindexed outside a bulk run).
  # Without the fallback, a nil cache raises NoMethodError here and
  # the rescue silently indexes the class with no parents.
  if self.class.ancestors_cache
    path_ids = (self.class.ancestors_cache[class_id] || Set.new).dup
  else
    path_ids = retrieve_hierarchy_ids(:ancestors)
  end
  # owl:Thing is the universal root — not a meaningful parent facet.
  path_ids.select! { |x| !x["owl#Thing"] }
  doc[:parents] = path_ids
rescue StandardError => e
  # Best effort: a class with no parents is still indexable.
  doc[:parents] = []
  puts "Exception getting paths to root for search for #{class_id}: #{e.class}: #{e.message}\n#{e.backtrace.join("\n")}"
end

lib/ontologies_linked_data/services/submission_process/operations/submission_indexer.rb

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ def index(logger, commit = true, optimize = true)
3636
csv_writer = LinkedData::Utils::OntologyCSVWriter.new
3737
csv_writer.open(@submission.ontology, @submission.csv_path)
3838

39+
LinkedData::Models::Class.ancestors_cache = compute_ancestors_map(logger)
40+
3941
begin
4042
logger.info("Indexing ontology terms: #{@submission.ontology.acronym}...")
4143
t0 = Time.now
@@ -144,6 +146,9 @@ def index(logger, commit = true, optimize = true)
144146
end
145147
end
146148

149+
# TODO: Remove once precomputed ancestors are validated against production data
150+
validate_class_ancestors(c, logger) if ENV['OP_VALIDATE_ANCESTORS']
151+
147152
@submission.synchronize do
148153
csv_writer.write_class(c)
149154
end
@@ -180,6 +185,8 @@ def index(logger, commit = true, optimize = true)
180185
logger.error("\n\n#{e.class}: #{e.message}\n")
181186
logger.error(e.backtrace)
182187
raise e
188+
ensure
189+
LinkedData::Models::Class.ancestors_cache = nil
183190
end
184191
end
185192
logger.info("Completed indexing ontology terms: #{@submission.ontology.acronym} in #{time} sec. #{count_classes} classes.")
@@ -194,6 +201,106 @@ def index(logger, commit = true, optimize = true)
194201
end
195202
end
196203

204+
# Builds the class-URI => Set-of-ancestor-URIs map for the current
# submission: pulls every parent edge from the triple store once, then
# expands transitive closures entirely in memory, replacing the
# per-class SPARQL traversal previously done during term indexing.
def compute_ancestors_map(logger)
  @submission.bring(:hasOntologyLanguage) unless @submission.loaded_attributes.include?(:hasOntologyLanguage)
  tree_property = LinkedData::Models::Class.tree_view_property(@submission)
  graph = @submission.id.to_s

  logger.info("Precomputing ancestor hierarchy for indexing...")
  started_at = Time.now

  direct_parents = fetch_all_parent_edges(graph, tree_property)
  edge_count = direct_parents.each_value.sum(&:length)
  logger.info("Fetched #{edge_count} parent-child edges for #{direct_parents.size} classes in #{Time.now - started_at}s")

  ancestors_map = {}
  direct_parents.each_key { |cls| compute_ancestors_for(cls, direct_parents, ancestors_map) }

  logger.info("Computed ancestor map for #{ancestors_map.size} classes in #{Time.now - started_at}s")
  ancestors_map
end
224+
225+
# Pages through every <child, tree_property, parent> edge in the
# submission graph and returns a Hash mapping child URI string =>
# Array of parent URI strings.
#
# Bug fix: the page-exhaustion check previously counted only rows that
# survived the "http"-prefix filter, so a full page containing any
# filtered row (blank node, non-http IRI) ended pagination early and
# silently dropped the remaining edges. Every returned solution is now
# counted, and only accepted rows go into the map.
#
# NOTE(review): LIMIT/OFFSET paging without ORDER BY is not guaranteed
# stable by the SPARQL spec; this relies on the endpoint returning a
# deterministic order — confirm for the store in use.
def fetch_all_parent_edges(graph, tree_property)
  direct_parents = {}
  page_size = 50_000
  offset = 0

  loop do
    query = "SELECT ?child ?parent WHERE { " \
            "GRAPH <#{graph}> { " \
            "?child <#{tree_property}> ?parent . " \
            "FILTER(isIRI(?parent)) " \
            "} } LIMIT #{page_size} OFFSET #{offset}"

    rows_returned = 0
    Goo.sparql_query_client.query(query, query_options: { rules: :NONE }, graphs: [graph]).each do |sol|
      rows_returned += 1
      child = sol[:child].to_s
      parent = sol[:parent].to_s
      # Skip blank nodes / malformed terms that slip past the FILTER.
      next unless child.start_with?("http") && parent.start_with?("http")

      (direct_parents[child] ||= []) << parent
    end

    break if rows_returned < page_size

    offset += page_size
  end

  direct_parents
end
252+
253+
# Collects the full transitive ancestor closure of +cls+ by breadth-
# first search over the direct-parent edge map, memoizing the result
# into +ancestors_map+ so later calls (and descendants of +cls+) reuse
# already-finished closures instead of re-walking them.
# Cycle-safe: a node is never expanded twice once seen.
# Returns the Set of ancestor URI strings for +cls+.
def compute_ancestors_for(cls, direct_parents, ancestors_map)
  return ancestors_map[cls] if ancestors_map.key?(cls)

  seen = Set.new
  frontier = (direct_parents[cls] || []).dup

  until frontier.empty?
    node = frontier.shift
    next if seen.include?(node)

    seen.add(node)

    if ancestors_map.key?(node)
      # This node's closure is already memoized — fold it in whole;
      # nothing beyond it needs expansion.
      seen.merge(ancestors_map[node])
    else
      (direct_parents[node] || []).each do |up|
        frontier << up unless seen.include?(up)
      end
    end
  end

  ancestors_map[cls] = seen
end
275+
276+
# TODO: Remove once precomputed ancestors are validated against production data
#
# Compares the per-class SPARQL ancestor traversal against the
# precomputed cache for one class, logging OK/MISMATCH with timings.
# Best-effort: never raises — any failure is logged and swallowed.
#
# Bug fix: retrieve_hierarchy_ids may yield an Array while the cache
# side is a Set; Array == Set is always false (spurious MISMATCH
# logs) and Array#- does not accept a Set (TypeError in the mismatch
# branch). Both sides are now normalized to Sets before comparing.
def validate_class_ancestors(cls, logger)
  cls_id = cls.id.to_s
  ancestors_cache = LinkedData::Models::Class.ancestors_cache
  return unless ancestors_cache

  sparql_ancestors = nil
  sparql_time = Benchmark.realtime do
    sparql_ancestors = cls.retrieve_hierarchy_ids(:ancestors)
    sparql_ancestors.select! { |x| !x["owl#Thing"] }
  end
  sparql_ancestors = sparql_ancestors.to_set

  cached_ancestors = nil
  cache_time = Benchmark.realtime do
    cached_ancestors = (ancestors_cache[cls_id] || Set.new).reject { |x| x["owl#Thing"] }.to_set
  end

  if sparql_ancestors == cached_ancestors
    logger.info("Ancestor OK for #{cls_id}: #{sparql_ancestors.size} ancestors, sparql=#{sparql_time.round(4)}s cache=#{cache_time.round(4)}s")
  else
    only_sparql = sparql_ancestors - cached_ancestors
    only_cached = cached_ancestors - sparql_ancestors
    logger.warn("Ancestor MISMATCH for #{cls_id}: sparql=#{sparql_ancestors.size} (#{sparql_time.round(4)}s) cache=#{cached_ancestors.size} (#{cache_time.round(4)}s) only_in_sparql=#{only_sparql.to_a.first(5)} only_in_cache=#{only_cached.to_a.first(5)}")
  end
rescue StandardError => e
  logger.warn("Ancestor validation failed for #{cls_id}: #{e.class}: #{e.message}")
end
303+
197304
end
198305
end
199306
end
Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
require_relative '../test_case'
2+
3+
# Unit tests for the in-memory ancestor precomputation used by bulk
# term indexing. compute_ancestors_for is exercised directly (via
# #send, since it is private) against hand-built parent-edge maps, so
# no triple store is required.
class TestAncestorsPrecompute < LinkedData::TestCase

  def setup
    @indexer = LinkedData::Services::OntologySubmissionIndexer.new(nil)
  end

  # A -> B -> C (linear chain)
  def test_linear_chain
    edges = {
      ex('C') => [ex('B')],
      ex('B') => [ex('A')]
    }
    result = expand(edges)

    assert_equal Set.new([ex('B'), ex('A')]), result[ex('C')]
    assert_equal Set.new([ex('A')]), result[ex('B')]
  end

  # Root node with no parents
  def test_root_node
    result = expand(ex('A') => [])

    assert_equal Set.new, result[ex('A')]
  end

  # A
  # / \
  # B C
  # \ /
  # D
  def test_diamond_inheritance
    edges = {
      ex('D') => [ex('B'), ex('C')],
      ex('B') => [ex('A')],
      ex('C') => [ex('A')]
    }
    result = expand(edges)

    assert_equal Set.new([ex('B'), ex('C'), ex('A')]), result[ex('D')]
    assert_equal Set.new([ex('A')]), result[ex('B')]
    assert_equal Set.new([ex('A')]), result[ex('C')]
  end

  # A B
  # | |
  # C D
  def test_multiple_roots
    result = expand(ex('C') => [ex('A')], ex('D') => [ex('B')])

    assert_equal Set.new([ex('A')]), result[ex('C')]
    assert_equal Set.new([ex('B')]), result[ex('D')]
  end

  # A -> B -> A (cycle)
  def test_cycle
    result = expand(ex('A') => [ex('B')], ex('B') => [ex('A')])

    assert_includes result[ex('A')], ex('B')
    assert_includes result[ex('B')], ex('A')
  end

  # Class not in direct_parents (leaf with no edges)
  def test_class_not_in_map
    result = {}
    @indexer.send(:compute_ancestors_for, ex('X'), {}, result)

    assert_equal Set.new, result[ex('X')]
  end

  # Memoization: computing ancestors for a child reuses already-computed parent ancestors
  def test_memoization
    edges = {
      ex('C') => [ex('B')],
      ex('B') => [ex('A')]
    }
    result = {}

    # Compute B first
    @indexer.send(:compute_ancestors_for, ex('B'), edges, result)
    assert result.key?(ex('B'))
    refute result.key?(ex('C'))

    # Now compute C — should reuse B's cached result
    @indexer.send(:compute_ancestors_for, ex('C'), edges, result)
    assert_equal Set.new([ex('B'), ex('A')]), result[ex('C')]
  end

  # A
  # / \
  # B C
  # / \ \
  # D E F
  # \ /
  # G
  def test_complex_dag
    edges = {
      ex('D') => [ex('B')],
      ex('E') => [ex('B')],
      ex('F') => [ex('C')],
      ex('G') => [ex('E'), ex('F')],
      ex('B') => [ex('A')],
      ex('C') => [ex('A')]
    }
    result = expand(edges)

    assert_equal Set.new([ex('E'), ex('F'), ex('B'), ex('C'), ex('A')]),
                 result[ex('G')]
    assert_equal Set.new([ex('B'), ex('A')]), result[ex('D')]
  end

  def test_empty_ontology
    assert_equal({}, expand({}))
  end

  private

  # Shorthand for the example-namespace URI of a single-letter class name.
  def ex(local)
    "http://example.org/#{local}"
  end

  # Runs compute_ancestors_for over every class in the edge map and
  # returns the resulting ancestors map.
  def expand(direct_parents)
    direct_parents.each_key.each_with_object({}) do |cls, ancestors_map|
      @indexer.send(:compute_ancestors_for, cls, direct_parents, ancestors_map)
    end
  end
end

0 commit comments

Comments
 (0)