class SequenceServer::Database
Model Database's eigenclass as a collection of Database
objects.
Public Class Methods
# File lib/sequenceserver/database.rb, line 59
# Adds +database+ to the collection, keyed by +database.id+. An existing
# entry stored under the same id is replaced.
#
# Returns the database that was stored.
def <<(database)
  collection.store(database.id, database)
end
# File lib/sequenceserver/database.rb, line 63
# Looks up one or more databases by id.
#
# +ids+ may be a single id or an Array of ids; it is coerced with Array().
# Always returns an Array, holding nil in the position of any id that is
# not present in the collection.
def [](ids)
  wanted = Array(ids)
  collection.values_at(*wanted)
end
# File lib/sequenceserver/database.rb, line 166
# Runs +makeblastdb+ on the given FASTA file.
#
# file::  path of the FASTA file to format.
# type::  sequence type; only the first four characters of its string form
#         are passed as -dbtype (e.g. :nucleotide -> "nucl").
# title:: title to store in the database.
# taxid:: taxonomy id to store in the database.
# quiet:: when true, stdout and stderr of makeblastdb are discarded.
#
# Returns whatever Kernel#system returns: true on success, false on a
# non-zero exit status, nil if the command could not be executed.
def _make_blast_database(file, type, title, taxid, quiet = false)
  # The command is passed as an argument vector so that no shell is involved:
  # titles containing quotes, spaces or shell metacharacters reach makeblastdb
  # verbatim instead of breaking (or injecting into) a shell command line.
  argv = ['makeblastdb', '-parse_seqids', '-hash_index',
          '-in', file.to_s,
          '-dbtype', type.to_s.slice(0, 4),
          '-title', title.to_s,
          '-taxid', taxid.to_s]

  # Redirect through spawn options rather than the bash-only `&>` operator.
  return system(*argv, out: File::NULL, err: File::NULL) if quiet

  system(*argv)
end
# File lib/sequenceserver/database.rb, line 72
# Returns an Array of every database currently held in the collection.
def all
  databases = collection.values
  databases
end
Intended to be used only for testing.
# File lib/sequenceserver/database.rb, line 94
# Removes every database from the collection and returns the (now empty)
# collection Hash.
def clear
  registry = collection
  registry.clear
end
# File lib/sequenceserver/database.rb, line 76
# Iterates over every database in the collection, passing each to +block+.
# The block is forwarded unchanged to Array#each on the result of +all+.
def each(&block)
  databases = all
  databases.each(&block)
end
Get taxid from the user. Returns user input or 0.
Using 0 as taxid is equivalent to not setting taxid for the database that will be created.
# File lib/sequenceserver/database.rb, line 203
# Prompts on stdout for a taxid and reads one line from STDIN.
#
# Returns the stripped input as a String, or the Integer 0 when the input
# is blank (or STDIN is at end of file).
def fetch_tax_id
  print 'Enter taxid (optional): '
  answer = STDIN.gets.to_s.strip
  answer.empty? ? 0 : answer
end
Intended to be used only for testing.
# File lib/sequenceserver/database.rb, line 89
# Returns the first database in the collection, or nil when the collection
# is empty.
def first
  databases = all
  databases.first
end
Generate a title for the given database and show it to the user for confirmation.
Returns user input if any. Auto-generated title otherwise.
# File lib/sequenceserver/database.rb, line 192
# Asks the user for a database title, offering a suggestion derived from the
# file name of +path+ via make_db_title.
#
# Returns the stripped line read from STDIN, or the suggested title when
# that line is blank.
def get_database_title(path)
  suggested = make_db_title(File.basename(path))
  print "Enter a database title or will use '#{suggested}': "
  typed = STDIN.gets.to_s.strip
  return typed unless typed.empty?

  suggested
end
# File lib/sequenceserver/database.rb, line 84
# Groups every database in the collection by the value +block+ returns for
# it. The block is forwarded unchanged to Array#group_by on the result of
# +all+.
def group_by(&block)
  databases = all
  databases.group_by(&block)
end
Guess whether FASTA file contains protein or nucleotide sequences based on first 32768 characters.
NOTE: 2^15 == 32768. Approximately 546 lines, assuming 60 characters on each line.
# File lib/sequenceserver/database.rb, line 249
# Guesses the sequence type of a FASTA file from a sample of its sequences.
#
# Each sampled sequence is classified with Sequence.guess_type; nil guesses
# and duplicates are discarded. Returns the single remaining type when
# exactly one is left, and false otherwise.
def guess_sequence_type_in_fasta(file)
  guesses = sample_sequences(file).map { |seq| Sequence.guess_type(seq) }
  distinct = guesses.uniq.compact
  distinct.length == 1 && distinct.first
end
# File lib/sequenceserver/database.rb, line 68
# Returns an Array of the ids (the collection's keys) of all databases.
def ids
  registry = collection
  registry.keys
end
# File lib/sequenceserver/database.rb, line 80
# Returns true if the collection has an entry keyed by the MD5 hex digest
# of +path+.
#
# NOTE(review): this presumes database ids are MD5 digests of their paths;
# confirm against Database#id.
def include?(path)
  digest = Digest::MD5.hexdigest(path)
  collection.key?(digest)
end
Create BLAST
database, given FASTA file and sequence type in FASTA file.
# File lib/sequenceserver/database.rb, line 159
# Interactively creates a BLAST database from a FASTA file.
#
# Asks the user for confirmation first and returns nil if they decline.
# Otherwise obtains a title and then a taxid from the user (in that order)
# and returns the result of _make_blast_database.
def make_blast_database(file, type)
  return unless make_blast_database?(file, type)

  _make_blast_database(file, type, get_database_title(file), fetch_tax_id)
end
Show file path and guessed sequence type to the user and obtain a y/n response.
Returns true if the user entered anything but 'n' or 'N'.
# File lib/sequenceserver/database.rb, line 178
# Shows the file path and guessed sequence type, then asks the user whether
# to proceed.
#
# Returns false only when the answer begins with 'n' or 'N' (e.g. "n",
# "No"). Any other answer, including a blank line or end of input, counts
# as yes and returns true.
def make_blast_database?(file, type)
  puts
  puts
  puts "FASTA file: #{file}"
  puts "FASTA type: #{type}"
  print 'Proceed? [y/n] (Default: y): '
  answer = STDIN.gets.to_s.strip
  # Anchor at the start of the answer: an unanchored /n/ would also treat
  # replies such as "yes, continue" as a refusal.
  answer !~ /\An/i
end
Recursively scan `database_dir` for un-formatted FASTA and format them for use with BLAST+.
# File lib/sequenceserver/database.rb, line 132
# Offers to format every FASTA file reported by unformatted_fastas.
#
# Returns the [file, sequence_type] pairs for which make_blast_database
# returned a truthy value.
def make_blast_databases
  candidates = unformatted_fastas
  candidates.select { |fasta, type| make_blast_database(fasta, type) }
end
Suggests improved titles when generating database names from files, for better appearance and readability in the web interface. For example: Cobs1.4.proteins.fasta -> Cobs 1.4 proteins; S_invicta.xx.2.5.small.nucl.fa -> S invicta xx 2.5 small nucl
# File lib/sequenceserver/database.rb, line 231
# Derives a human-readable title from a database file name.
#
#   Cobs1.4.proteins.fasta         -> Cobs 1.4 proteins
#   S_invicta.xx.2.5.small.nucl.fa -> S invicta xx 2.5 small nucl
#
# +db_name+ is not modified (so frozen strings are accepted); a new String
# is returned.
def make_db_title(db_name)
  title = db_name.tr('"', "'")

  # Drop only the trailing extension (".fasta", ".fa", ...). Removing every
  # occurrence of the extension text would also eat matching text in the
  # middle of the name.
  ext = File.extname(title)
  title = title.sub(/#{Regexp.escape(ext)}\z/, '') unless ext.empty?

  # Underscores become spaces.
  title = title.tr('_', ' ')
  # A '.' with a non-digit on both sides becomes a space.
  title = title.gsub(/(\D)\.(?=\D)/, '\1 ')
  # Version-like numbers (1.4, 2-5-1) are kept intact and set off by spaces.
  title.gsub(/\W*(\d+([.-]\d+)+)\W*/, ' \1 ')
end
Returns true if the database name appears to be a multi-part database name.
e.g. /home/ben/pd.ben/sequenceserver/db/nr.00 => yes /home/ben/pd.ben/sequenceserver/db/nr => no /home/ben/pd.ben/sequenceserver/db/img3.5.finished.faa.01 => yes
# File lib/sequenceserver/database.rb, line 217
# Returns true if +db_name+ looks like one volume of a multi-part database:
# a path whose last component ends in a dot followed by two or three digits.
#
#   /home/ben/pd.ben/sequenceserver/db/nr.00                   => true
#   /home/ben/pd.ben/sequenceserver/db/nr                      => false
#   /home/ben/pd.ben/sequenceserver/db/img3.5.finished.faa.01  => true
#   /mnt/blast-db/refseq_genomic.100                           => true
#
# Three-digit suffixes are accepted so that volumes numbered 100 and above
# are recognised as well.
def multipart_database_name?(db_name)
  !(db_name =~ %r{.+/\S+\.\d{2,3}$}).nil?
end
Returns true if first character of the file is '>'.
# File lib/sequenceserver/database.rb, line 222
# Cheap FASTA check: reads a single byte from +file+ and returns true if it
# is '>'. Returns false for any other first byte and for an empty file.
def probably_fasta?(file)
  first_char = File.open(file) { |io| io.read(1) }
  first_char == '>'
end
Read first 32768 characters of the file. Split on fasta def line pattern and return.
If the given file is FASTA, returns Array of as many different sequences in the portion of the file read. Returns the portion of the file read wrapped in an Array otherwise.
# File lib/sequenceserver/database.rb, line 262
# Reads at most the first 32768 bytes of +file+ and splits them on FASTA
# definition lines (lines starting with '>').
#
# Returns an Array of the non-empty pieces between definition lines. For a
# file without definition lines this is the portion read, wrapped in an
# Array. Returns an empty Array for an empty file.
def sample_sequences(file)
  # File.read with a length returns nil (not "") for an empty file; to_s
  # keeps the split below from raising on it.
  sample = File.read(file, 32_768).to_s
  sample.split(/^>.+$/).reject(&:empty?)
end
Recursively scan `database_dir` for BLAST databases.
rubocop:disable Metrics/AbcSize, Metrics/MethodLength
# File lib/sequenceserver/database.rb, line 101
# Lists the BLAST databases under config[:database_dir] with +blastdbcmd+
# and adds a Database to the collection for each one, skipping entries whose
# name looks like a volume of a multi-part database.
#
# Raises (through throw_scan_error) when blastdbcmd fails, when building a
# Database raised NoMethodError for some line, or when nothing was listed.
def scan_databases_dir
  # Fields are tab separated: titles routinely contain spaces, so splitting
  # on spaces would spread one title over several fields.
  cmd = "blastdbcmd -recursive -list #{config[:database_dir]}" \
        " -list_outfmt \"%f\t%t\t%p\t%n\t%l\t%d\""

  # capture3 drains stdout and stderr concurrently and returns the child's
  # exit status. Open3.popen3 does not set $?, so the status has to come
  # from Open3 itself.
  out, err, status = Open3.capture3(cmd)
  throw_scan_error(cmd, out, err, status)

  out.each_line do |line|
    fields = line.chomp.split("\t")
    next if fields.empty?
    next if multipart_database_name?(fields.first)

    begin
      self << Database.new(*fields)
    rescue NoMethodError => e
      # Collected here and reported by the second throw_scan_error call.
      err << "BLAST Database error:\n#{e}\n#{line}"
    end
  end

  throw_scan_error(cmd, out, err, status)
end
rubocop:enable Metrics/AbcSize, Metrics/MethodLength
# File lib/sequenceserver/database.rb, line 122
# Raises if a database scan went wrong; returns nil otherwise.
#
# cmd::          the command that was run (passed on to the error).
# out::          captured standard output of the command.
# err::          captured standard error of the command.
# child_status:: exit status object of the command; must respond to success?.
#
# Raises BLAST_DATABASE_ERROR when the command was unsuccessful or +err+
# contains "BLAST Database error", and NO_BLAST_DATABASE_FOUND when +out+
# is empty.
def throw_scan_error(cmd, out, err, child_status)
  scan_failed = !child_status.success? || err.match(/BLAST Database error/)
  raise BLAST_DATABASE_ERROR.new(cmd, err) if scan_failed
  return unless out.empty?

  raise NO_BLAST_DATABASE_FOUND, config[:database_dir]
end
Returns an Array of FASTA files that may require formatting, and the type of sequence contained in each FASTA.
> unformatted_fastas => [['/foo/bar.fasta', :nucleotide], ...]
# File lib/sequenceserver/database.rb, line 143
# Walks config[:database_dir] recursively and collects files that look like
# FASTA but are not yet in the collection.
#
# Directories, files for which Database.include? is true, and files that do
# not start with '>' are skipped. A file is reported only when its guessed
# sequence type is :protein or :nucleotide.
#
# Returns an Array of [path, sequence_type] pairs.
def unformatted_fastas
  wanted_types = [:protein, :nucleotide]
  Find.find(config[:database_dir]).each_with_object([]) do |path, found|
    next if File.directory?(path) || Database.include?(path)
    next unless probably_fasta?(path)

    type = guess_sequence_type_in_fasta(path)
    found << [path, type] if wanted_types.include?(type)
  end
end
Private Class Methods
# File lib/sequenceserver/database.rb, line 53
# Returns the Hash backing the collection, creating an empty one on first
# use. The same Hash object is returned on every subsequent call.
def collection
  @collection = {} unless @collection
  @collection
end