class SequenceServer::Database

Model Database's eigenclass as a collection of Database objects.

Public Class Methods

<<(database) click to toggle source
# File lib/sequenceserver/database.rb, line 59
# Registers +database+ in the collection under its +id+.
#
# Returns the database that was just stored.
def <<(database)
  collection.store(database.id, database)
end
[](ids) click to toggle source
# File lib/sequenceserver/database.rb, line 63
# Looks up databases by id.
#
# +ids+ may be a single id, an Array of ids, or nil; it is coerced with
# Kernel#Array. Returns an Array with one entry per requested id, in the
# same order, with nil in place of ids that are not in the collection.
def [](ids)
  collection.values_at(*Array(ids))
end
_make_blast_database(file, type, title, taxid, quiet = false) click to toggle source
# File lib/sequenceserver/database.rb, line 166
# Runs `makeblastdb` on +file+ to create a BLAST database.
#
# file  - path of the FASTA file to format.
# type  - sequence type; only the first four characters of its string form
#         are passed to `-dbtype` (e.g. :nucleotide -> "nucl").
# title - title given to the database.
# taxid - taxonomy id passed to `-taxid` (converted with to_s).
# quiet - when true, standard output and standard error of makeblastdb are
#         discarded.
#
# The command is executed as an argument vector rather than as a single
# shell string, so quotes, spaces or shell metacharacters in +file+,
# +title+ or +taxid+ are handed to makeblastdb verbatim instead of being
# interpreted by a shell. Output is silenced through Kernel#system's
# redirection options rather than the bash-only `&>` operator.
#
# Returns whatever Kernel#system returns: true on success, false on a
# non-zero exit status, nil if the command could not be executed.
def _make_blast_database(file, type, title, taxid, quiet = false)
  argv = ['makeblastdb', '-parse_seqids', '-hash_index',
          '-in', file.to_s,
          '-dbtype', type.to_s.slice(0, 4),
          '-title', title.to_s,
          '-taxid', taxid.to_s]
  return system(*argv) unless quiet

  silence = { out: File::NULL, err: File::NULL }
  system(*argv, silence)
end
all() click to toggle source
# File lib/sequenceserver/database.rb, line 72
# Returns an Array of every database currently held in the collection.
def all
  collection.each_value.to_a
end
clear() click to toggle source

Intended to be used only for testing.

# File lib/sequenceserver/database.rb, line 94
# Empties the collection and returns the (now empty) collection Hash.
def clear
  collection.tap(&:clear)
end
each(&block) click to toggle source
# File lib/sequenceserver/database.rb, line 76
# Iterates over every database returned by +all+, forwarding +block+ to
# Array#each. Returns what Array#each returns (the array itself when a
# block is given, an Enumerator otherwise).
def each(&block)
  databases = all
  databases.each(&block)
end
fetch_tax_id() click to toggle source

Get taxid from the user. Returns user input or 0.

Using 0 as taxid is equivalent to not setting taxid for the database that will be created.

# File lib/sequenceserver/database.rb, line 203
# Prompts for a taxid on standard output and reads the reply from STDIN.
#
# Returns the stripped reply as a String, or the Integer 0 when the reply
# is blank or STDIN is at end of input.
def fetch_tax_id
  print 'Enter taxid (optional): '
  answer = STDIN.gets.to_s.strip
  answer.empty? ? 0 : answer
end
first() click to toggle source

Intended to be used only for testing.

# File lib/sequenceserver/database.rb, line 89
# Returns the first database from +all+, or nil when there is none.
def first
  all[0]
end
get_database_title(path) click to toggle source

Generate a title for the given database and show it to the user for confirmation.

Returns user input if any. Auto-generated title otherwise.

# File lib/sequenceserver/database.rb, line 192
# Suggests a title derived from the basename of +path+ (via make_db_title),
# shows it in a prompt and reads the user's reply from STDIN.
#
# Returns the suggested title when the stripped reply is empty, otherwise
# the stripped reply.
def get_database_title(path)
  suggestion = make_db_title(File.basename(path))
  print "Enter a database title or will use '#{suggestion}': "
  answer = STDIN.gets.to_s.strip
  return suggestion if answer.empty? && suggestion
  answer
end
group_by(&block) click to toggle source
# File lib/sequenceserver/database.rb, line 84
# Groups the databases returned by +all+ using +block+, forwarding it to
# Array#group_by. Returns the resulting Hash (or an Enumerator when no
# block is given).
def group_by(&block)
  databases = all
  databases.group_by(&block)
end
guess_sequence_type_in_fasta(file) click to toggle source

Guess whether FASTA file contains protein or nucleotide sequences based on first 32768 characters.

NOTE: 2^15 == 32768. Approximately 546 lines, assuming 60 characters on each line.

# File lib/sequenceserver/database.rb, line 249
# Guesses the sequence type of +file+ from the sample returned by
# sample_sequences.
#
# Each sampled sequence is classified with Sequence.guess_type; nil
# guesses are ignored. Returns the type when all remaining guesses agree
# on exactly one type, and false otherwise (no usable guess, or a mix).
def guess_sequence_type_in_fasta(file)
  guesses = sample_sequences(file).map { |seq| Sequence.guess_type(seq) }
  distinct = guesses.compact.uniq
  distinct.size == 1 ? distinct.first : false
end
ids() click to toggle source
# File lib/sequenceserver/database.rb, line 68
# Returns an Array of the ids (the collection's keys) of all databases.
def ids
  collection.each_key.to_a
end
include?(path) click to toggle source
# File lib/sequenceserver/database.rb, line 80
# Returns true if the collection has a key equal to the MD5 hex digest of
# +path+, false otherwise.
def include?(path)
  digest = Digest::MD5.hexdigest(path)
  collection.key?(digest)
end
make_blast_database(file, type) click to toggle source

Create BLAST database, given FASTA file and sequence type in FASTA file.

# File lib/sequenceserver/database.rb, line 159
# Interactively creates a BLAST database from +file+ of sequence +type+.
#
# Asks for confirmation first (make_blast_database?); when confirmed, asks
# for a title and then a taxid, and hands everything to
# _make_blast_database. Returns nil when the user declines, otherwise the
# result of _make_blast_database.
def make_blast_database(file, type)
  if make_blast_database?(file, type)
    title = get_database_title(file)
    _make_blast_database(file, type, title, fetch_tax_id)
  end
end
make_blast_database?(file, type) click to toggle source

Show file path and guessed sequence type to the user and obtain a y/n response.

Returns true if the user entered anything but 'n' or 'N'.

# File lib/sequenceserver/database.rb, line 178
# Shows the FASTA path and guessed sequence type, then asks the user
# whether to proceed and reads the reply from STDIN.
#
# Returns false only when the stripped reply begins with 'n' or 'N'
# (e.g. "n", "No"). An empty reply, end of input, or any other text counts
# as consent. The pattern is anchored to the start of the reply so that
# answers which merely contain the letter "n" (such as "fine") are not
# mistaken for a refusal.
def make_blast_database?(file, type)
  puts
  puts
  puts "FASTA file: #{file}"
  puts "FASTA type: #{type}"
  print 'Proceed? [y/n] (Default: y): '
  response = STDIN.gets.to_s.strip
  response !~ /\An/i
end
make_blast_databases() click to toggle source

Recursively scan `database_dir` for un-formatted FASTA and format them for use with BLAST+.

# File lib/sequenceserver/database.rb, line 132
# Offers to format every FASTA file reported by unformatted_fastas.
#
# Returns the [file, sequence_type] pairs for which make_blast_database
# returned a truthy value.
def make_blast_databases
  unformatted_fastas.select { |fasta, type| make_blast_database(fasta, type) }
end
make_db_title(db_name) click to toggle source

Suggests improved titles when generating database names from files, for improved appearance and readability in the web interface. For example: Cobs1.4.proteins.fasta -> Cobs 1.4 proteins; S_invicta.xx.2.5.small.nucl.fa -> S invicta xx 2.5 small nucl.

# File lib/sequenceserver/database.rb, line 231
# Derives a human-friendly database title from a file name.
#
# db_name - file name to turn into a title (a String; it is not modified,
#           so frozen strings are accepted).
#
# Steps, in order:
#   1. double quotes become single quotes;
#   2. the trailing file extension (as reported by File.extname) is dropped
#      - only at the end of the name, not wherever the same text occurs;
#   3. underscores become spaces;
#   4. a '.' with a non-digit on both sides becomes a space;
#   5. version-like numbers (e.g. 1.4, 2-5-1) are kept intact and set off
#      by single spaces.
#
# Returns a new String. Examples:
#   Cobs1.4.proteins.fasta         -> "Cobs 1.4 proteins"
#   S_invicta.xx.2.5.small.nucl.fa -> "S invicta xx 2.5 small nucl"
def make_db_title(db_name)
  title = db_name.tr('"', "'")
  extension = File.extname(title)
  # File.extname always returns a suffix of the name, so slicing by length
  # removes exactly the trailing extension (and nothing when it is empty).
  title = title[0, title.length - extension.length]
  title
    .tr('_', ' ')
    .gsub(/(\D)\.(?=\D)/, '\1 ')
    .gsub(/\W*(\d+([.-]\d+)+)\W*/, ' \1 ')
end
multipart_database_name?(db_name) click to toggle source

Returns true if the database name appears to be a multi-part database name.

e.g. /home/ben/pd.ben/sequenceserver/db/nr.00 => yes; /home/ben/pd.ben/sequenceserver/db/nr => no; /home/ben/pd.ben/sequenceserver/db/img3.5.finished.faa.01 => yes

# File lib/sequenceserver/database.rb, line 217
# Returns true when +db_name+ looks like one volume of a multi-part
# database: a path containing a '/', whose last component has no
# whitespace and ends in a dot followed by exactly two digits (e.g.
# ".../nr.00"). Returns false otherwise.
def multipart_database_name?(db_name)
  volume = db_name.match(%r{.+/\S+\.\d{2}$})
  volume ? true : false
end
probably_fasta?(file) click to toggle source

Returns true if first character of the file is '>'.

# File lib/sequenceserver/database.rb, line 222
# Returns true when the first byte of +file+ is '>', false otherwise
# (including when the file is empty).
def probably_fasta?(file)
  leading = File.open(file) { |io| io.read(1) }
  leading == '>'
end
sample_sequences(file) click to toggle source

Read first 32768 characters of the file. Split on fasta def line pattern and return.

If the given file is FASTA, returns an Array of the sequences found in the portion of the file read. Otherwise, returns the portion of the file read, wrapped in an Array.

# File lib/sequenceserver/database.rb, line 262
# Reads up to the first 32768 bytes of +file+ and splits them on FASTA
# definition lines (lines starting with '>').
#
# Returns an Array of the non-empty pieces between definition lines; for
# text without any definition line this is the text read, wrapped in an
# Array.
def sample_sequences(file)
  head = File.read(file, 32_768)
  pieces = head.split(/^>.+$/)
  pieces.reject(&:empty?)
end
scan_databases_dir() click to toggle source

Recursively scan `database_dir` for blast databases.

rubocop:disable Metrics/AbcSize, Metrics/MethodLength

# File lib/sequenceserver/database.rb, line 101
# Lists the BLAST databases under config[:database_dir] with `blastdbcmd`
# and adds each one to the collection.
#
# blastdbcmd is asked for six tab-separated columns per database
# (%f %t %p %n %l %d); each output line is split on tabs and the fields
# are passed, in that order, to Database.new. The tab delimiter is written
# explicitly ("\t") in both the format string and the split so the two can
# never drift apart. Lines whose first field looks like a volume of a
# multi-part database (see multipart_database_name?) and blank lines are
# skipped.
#
# The exit status comes from Open3.capture3: the status must be taken from
# the Process::Status that Open3 returns rather than from $CHILD_STATUS,
# which Open3 does not set in the calling thread.
#
# Errors are reported through throw_scan_error, which is called once
# before parsing (command failure / no output) and once after, so that a
# NoMethodError raised while building a Database - recorded in +err+ with
# the "BLAST Database error" marker - is surfaced too.
#
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
def scan_databases_dir
  cmd = "blastdbcmd -recursive -list #{config[:database_dir]}" \
        " -list_outfmt \"%f\t%t\t%p\t%n\t%l\t%d\""
  out, err, status = Open3.capture3(cmd)
  throw_scan_error(cmd, out, err, status)
  out.each_line do |line|
    next if line.strip.empty?
    fields = line.split("\t")
    next if multipart_database_name?(fields.first)
    begin
      self << Database.new(*fields)
    rescue NoMethodError => e
      err << "BLAST Database error:\n#{e}\n#{line}"
    end
  end
  throw_scan_error(cmd, out, err, status)
end
throw_scan_error(cmd, out, err, child_status) click to toggle source

rubocop:enable Metrics/AbcSize, Metrics/MethodLength

# File lib/sequenceserver/database.rb, line 122
# Raises if a database scan failed or found nothing.
#
# cmd          - the command that was run (passed on to the error).
# out          - the command's standard output.
# err          - the command's standard error (plus any appended notes).
# child_status - object responding to success? for the finished command.
#
# Raises BLAST_DATABASE_ERROR when the command did not succeed or +err+
# contains "BLAST Database error"; raises NO_BLAST_DATABASE_FOUND (with
# config[:database_dir] as its message) when +out+ is empty. Returns nil
# otherwise.
def throw_scan_error(cmd, out, err, child_status)
  scan_failed = !child_status.success? || err.match(/BLAST Database error/)
  raise BLAST_DATABASE_ERROR.new(cmd, err) if scan_failed
  raise NO_BLAST_DATABASE_FOUND, config[:database_dir] if out.empty?
end
unformatted_fastas() click to toggle source

Returns an Array of FASTA files that may require formatting, and the type of sequence contained in each FASTA.

> unformatted_fastas
=> [['/foo/bar.fasta', :nucleotide], ...]
# File lib/sequenceserver/database.rb, line 143
# Walks config[:database_dir] recursively and collects files that look
# like FASTA but are not yet known as databases.
#
# A file is kept when it is not a directory, Database.include? is false
# for it, probably_fasta? is true, and guess_sequence_type_in_fasta yields
# :protein or :nucleotide.
#
# Returns an Array of [path, sequence_type] pairs.
def unformatted_fastas
  fastas = []
  Find.find(config[:database_dir]) do |path|
    next if File.directory?(path) || Database.include?(path)
    next unless probably_fasta?(path)
    type = guess_sequence_type_in_fasta(path)
    fastas << [path, type] if [:protein, :nucleotide].include?(type)
  end
  fastas
end

Private Class Methods

collection() click to toggle source
# File lib/sequenceserver/database.rb, line 53
# Returns the Hash that backs the collection, creating it on first use and
# returning the same Hash on every later call.
def collection
  @collection = {} unless @collection
  @collection
end