Skip to content

Commit 389ae3a

Browse files
committed
Refactor BLAST::Report to lazily generate reports
In some cases (e.g. when used by some extensions) it is beneficial not to generate the report files upfront, and to do so only when they are required. Performance gains are significant with large result sets (i.e. hundreds of MBs or several GBs), as the data does not need to be loaded into the process memory. Lazy method evaluation with memoization achieves just that without adding cognitive overhead for the developer: operations are now executed only when the corresponding methods are invoked, rather than upfront by default. The implementation logic was not changed, and the regression test prepared beforehand passes and produces identical results.
1 parent 430c3da commit 389ae3a

2 files changed

Lines changed: 75 additions & 87 deletions

File tree

lib/sequenceserver/blast/report.rb

Lines changed: 74 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -24,20 +24,11 @@ module BLAST
2424
class Report < Report
2525
def initialize(job)
2626
super do
27-
@queries = []
2827
@querydb = job.databases
2928
end
3029
end
3130

32-
# Attributes parsed out from BLAST output.
33-
attr_reader :program, :program_version, :stats, :queries
34-
35-
# Attributes parsed from job metadata and BLAST output.
36-
attr_reader :querydb, :dbtype, :params
37-
3831
def to_json(*_args)
39-
generate
40-
4132
%i[querydb program program_version params stats
4233
queries].inject({}) do |h, k|
4334
h[k] = send(k)
@@ -53,83 +44,89 @@ def to_json(*_args)
5344
def xml_file_size
5445
return File.size(job.imported_xml_file) if job.imported_xml_file
5546

56-
generate
57-
5847
xml_formatter.size
5948
end
6049

61-
# Generate report.
62-
def generate
63-
return self if @_generated
64-
65-
job.raise!
66-
xml_ir = nil
67-
tsv_ir = nil
68-
if job.imported_xml_file
69-
xml_ir = parse_xml File.read(job.imported_xml_file)
70-
tsv_ir = Hash.new do |h1, k1|
71-
h1[k1] = Hash.new do |h2, k2|
72-
h2[k2] = ['', '', []]
73-
end
74-
end
75-
else
76-
xml_ir = parse_xml(xml_formatter.read_file)
77-
tsv_ir = parse_tsv(tsv_formatter.read_file)
50+
def done?
51+
return true if job.imported_xml_file
52+
53+
File.exist?(xml_formatter.filepath) && File.exist?(tsv_formatter.filepath)
54+
end
55+
56+
def program
57+
@program ||= xml_ir[0]
58+
end
59+
60+
def program_version
61+
@program_version ||= xml_ir[1]
62+
end
63+
64+
def querydb
65+
@querydb ||= xml_ir[3].split.map do |path|
66+
{ title: File.basename(path) }
7867
end
79-
extract_program_info xml_ir
80-
extract_db_info xml_ir
81-
extract_params xml_ir
82-
extract_stats xml_ir
83-
extract_queries xml_ir, tsv_ir
68+
end
69+
70+
def dbtype
71+
@dbtype ||= querydb&.first&.type || dbtype_from_program
72+
end
8473

85-
@_generated = true
74+
def params
75+
@params ||= extract_params
76+
end
8677

87-
self
78+
def stats
79+
@stats ||= extract_stats
8880
end
8981

90-
def done?
91-
return true if job.imported_xml_file
82+
def queries
83+
@queries ||= xml_ir[8].map do |n|
84+
query = Query.new(self, n[0], n[2], n[3], [])
85+
query.hits = query_hits(n[4], tsv_ir[query.id], query)
9286

93-
File.exist?(xml_formatter.filepath) && File.exist?(tsv_formatter.filepath)
87+
query
88+
end
9489
end
9590

9691
private
9792

98-
def xml_formatter
99-
@xml_formatter ||= Formatter.run(job, 'xml')
93+
def xml_ir
94+
@xml_ir ||=
95+
if job.imported_xml_file
96+
parse_xml File.read(job.imported_xml_file)
97+
else
98+
job.raise!
99+
parse_xml(xml_formatter.read_file)
100+
end
100101
end
101102

102-
def tsv_formatter
103-
@tsv_formatter ||= Formatter.run(job, 'custom_tsv')
103+
def tsv_ir
104+
@tsv_ir ||=
105+
if job.imported_xml_file
106+
Hash.new do |h1, k1|
107+
h1[k1] = Hash.new do |h2, k2|
108+
h2[k2] = ['', '', []]
109+
end
110+
end
111+
else
112+
job.raise!
113+
parse_tsv(tsv_formatter.read_file)
114+
end
104115
end
105116

106-
# Make program name and program name + version available via `program`
107-
# and `program_version` attributes.
108-
def extract_program_info(ir)
109-
@program = ir[0]
110-
@program_version = ir[1]
117+
def xml_formatter
118+
@xml_formatter ||= Formatter.run(job, 'xml')
111119
end
112120

113-
# Get database information (title and type) from job yaml or from XML.
114-
# Sets `querydb` and `dbtype` attributes.
115-
def extract_db_info(ir)
116-
if @querydb.empty?
117-
@querydb = ir[3].split.map do |path|
118-
{ title: File.basename(path) }
119-
end
120-
@dbtype = dbtype_from_program
121-
else
122-
@dbtype = @querydb.first.type
123-
end
121+
def tsv_formatter
122+
@tsv_formatter ||= Formatter.run(job, 'custom_tsv')
124123
end
125124

126-
# Make search params available via `params` attribute.
127-
#
128125
# Search params tweak the results. Like evalue cutoff or penalty to open
129126
# a gap. BLAST+ doesn't list all input params in the XML output. Only
130127
# matrix, evalue, gapopen, gapextend, and filters are available from XML
131128
# output.
132-
def extract_params(ir)
129+
def extract_params
133130
# Parse/get params from the job first.
134131
job_params = parse_advanced(job.advanced)
135132
# Old jobs from beta releases may not have the advanced key but they
@@ -138,21 +135,19 @@ def extract_params(ir)
138135

139136
# Parse params from BLAST XML.
140137
@params = Hash[
141-
*ir[7].first.map { |k, v| [k.gsub('Parameters_', ''), v] }.flatten
138+
*xml_ir[7].first.map { |k, v| [k.gsub('Parameters_', ''), v] }.flatten
142139
]
143140
@params['evalue'] = @params.delete('expect')
144141

145142
# Merge into job_params.
146143
@params = job_params.merge(@params)
147144
end
148145

149-
# Make search stats available via `stats` attribute.
150-
#
151146
# Search stats are computed metrics. Like total number of sequences or
152147
# effective search space.
153-
def extract_stats(ir)
154-
stats = ir[8].first[5][0]
155-
@stats = {
148+
def extract_stats
149+
stats = xml_ir[8].first[5][0]
150+
{
156151
nsequences: stats[0],
157152
ncharacters: stats[1],
158153
hsp_length: stats[2],
@@ -163,20 +158,11 @@ def extract_stats(ir)
163158
}
164159
end
165160

166-
# Create query objects for the given report from the given ir.
167-
def extract_queries(xml_ir, tsv_ir)
168-
xml_ir[8].each do |n|
169-
query = Query.new(self, n[0], n[2], n[3], [])
170-
extract_hits(n[4], tsv_ir[query.id], query)
171-
queries << query
172-
end
173-
end
174-
175161
# Create Hit objects for the given query from the given ir.
176-
def extract_hits(xml_ir, tsv_ir, query)
177-
return if xml_ir == ["\n"] # => No hits.
162+
def query_hits(xml_ir, tsv_ir, query)
163+
return [] if xml_ir == ["\n"] # => No hits.
178164

179-
xml_ir.each do |n|
165+
xml_ir.map do |n|
180166
# If hit comes from a non -parse_seqids database, then id (n[1]) is a
181167
# BLAST assigned internal id of the format 'gnl|BL_ORD_ID|serial'. We
182168
# assign the id to accession (because we use accession for sequence
@@ -190,19 +176,21 @@ def extract_hits(xml_ir, tsv_ir, query)
190176
n[1] = defline.shift
191177
n[2] = defline.join(' ')
192178
end
179+
193180
hit = Hit.new(query, n[0], n[1], n[3], n[2], n[4],
194181
tsv_ir[n[1]][0], tsv_ir[n[1]][1], [])
195-
extract_hsps(n[5], tsv_ir[n[1]][2], hit)
196-
query.hits << hit
182+
183+
hit.hsps = hsps(n[5], tsv_ir[n[1]][2], hit)
184+
185+
hit
197186
end
198187
end
199188

200-
# Create HSP objects for the given hit from the given ir.
201-
def extract_hsps(xml_ir, tsv_ir, hit)
202-
xml_ir.each_with_index do |n, i|
189+
def hsps(xml_ir, tsv_ir, hit)
190+
xml_ir.map.with_index do |n, i|
203191
n.insert(14, tsv_ir[i])
204-
hsp = HSP.new(hit, *n)
205-
hit.hsps << hsp
192+
193+
HSP.new(hit, *n)
206194
end
207195
end
208196

lib/sequenceserver/report.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ module SequenceServer
88
class Report
99
class << self
1010
def generate(job)
11-
BLAST::Report.new(job).generate
11+
BLAST::Report.new(job)
1212
end
1313
end
1414

0 commit comments

Comments
 (0)