Skip to content

Commit 8bdd2fd

Browse files
committed
Ignore unknown bases ('N's) when detecting DB type
Some FASTA files have a long run of unknown bases ('N's) before any meaningful data starts, e.g. https://www.ncbi.nlm.nih.gov/nuccore/CM000994.3. Instead of guessing from the first sample alone, keep scanning the file until we reach meaningful bases and run the sequence-type detection against those. Closes #727
1 parent a1d4dca commit 8bdd2fd

2 files changed

Lines changed: 37 additions & 2 deletions

File tree

lib/sequenceserver/makeblastdb.rb

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ module SequenceServer
1313
#
1414
class MAKEBLASTDB
1515
extend Forwardable
16+
GUESS_SAMPLE_SIZE = 1_048_576
1617

1718
def_delegators SequenceServer, :config, :sys
1819

@@ -333,8 +334,21 @@ def guess_sequence_type_in_fasta(file)
333334
# If the given file is FASTA, returns Array of as many different
334335
# sequences in the portion of the file read. Returns the portion
335336
# of the file read wrapped in an Array otherwise.
336-
def sample_sequences(file)
337-
File.read(file, 1_048_576).split(/^>.+$/).delete_if(&:empty?)
337+
# Samples sequence data from +file+ for sequence-type guessing, reading
# GUESS_SAMPLE_SIZE bytes at a time starting at byte +offset+.
#
# Header lines (">...") are used only to split a window into sequences.
# Unknown residues ('N') are removed from the sequence data, and a window
# left with nothing but blank lines is skipped, so scanning continues until
# a window containing other residues is found.
#
# Returns an Array of the sequence strings found in the first such window
# (a one-element Array if that window has no header line), or [] if the end
# of the file is reached first.
def sample_sequences(file, offset = 0)
  # Iterate rather than recurse: with a very long run of 'N's, recursion
  # would add one stack frame (each holding its window) per window read.
  loop do
    sample = File.read(file, GUESS_SAMPLE_SIZE, offset)
    return [] if sample.nil?

    # Split on headers first so that 'N's inside a header cannot stop the
    # header from being recognised; then drop 'N's and the blank-line runs
    # they leave behind from the sequence data only.
    meaningful_samples = sample.split(/^>.+$/)
                               .map { |seq| seq.delete('N').gsub(/^\n+$/, '') }
                               .reject(&:empty?)
    return meaningful_samples unless meaningful_samples.empty?

    offset += GUESS_SAMPLE_SIZE
  end
end
339353
end
340354
end

spec/makeblastdb_spec.rb

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,23 @@ module SequenceServer
4242
'Sinvicta2-2-3.prot.subset.fasta')
4343
end
4444

45+
let 'fasta_file_prot_with_unknown_bases_seq' do
46+
dir_path = File.join(root_database_dir, 'with_unknown_bases')
47+
FileUtils.mkdir_p(dir_path)
48+
file_path = File.join(dir_path, 'with_unknown_bases.fasta')
49+
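# Write a FASTA whose first record begins with slightly more than GUESS_SAMPLE_SIZE bytes of 'N'-only lines, so the first sample read from the file contains no meaningful residues.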
50+
File.open(file_path, 'w') do |f|
51+
f.puts ">seq1"
52+
f.puts (("N" * 70) + "\n") * (SequenceServer::MAKEBLASTDB::GUESS_SAMPLE_SIZE / 70).to_i
53+
f.puts "VSDTAKVLVTEVLEKVSVNRVATFTIEADASLGTPVVEVLSPTRESLSVHVKQNSQGTYTV"
54+
f.puts ">seq2"
55+
f.puts "VSDTAKVLVTEVLEKVSVNRVATFTIEADASLGTPVVEVLSPTRESLSVHVKQNSQGTYTV"
56+
f.puts "VSDTAKVLVTEVLEKVSVNRVATFTIEADASLGTPVVEVLSPTRESLSVHVKQNSQGTYTV"
57+
end
58+
59+
file_path
60+
end
61+
4562
let 'fasta_file_nucl_seqs' do
4663
File.join(database_dir_v5, 'transcripts', 'Solenopsis_invicta',
4764
'Sinvicta2-2-3.cdna.subset.fasta')
@@ -72,6 +89,10 @@ module SequenceServer
7289
expect(makeblastdb.send(:guess_sequence_type_in_fasta, fasta_file_nucl_seqs)).to eq :nucleotide
7390
end
7491

92+
it 'can ignore unknown bases when detecting the sequence type' do
93+
expect(makeblastdb.send(:guess_sequence_type_in_fasta, fasta_file_prot_with_unknown_bases_seq)).to eq :protein
94+
end
95+
7596
it 'can tell FASTA files that are yet to be made into a BLAST+ database' do
7697
makeblastdb = SequenceServer::MAKEBLASTDB.new(database_dir_unformatted)
7798
expect(makeblastdb.any_to_format_or_reformat?).to be_truthy

0 commit comments

Comments
 (0)