class SequenceServer::MAKEBLASTDB

Smart ‘makeblastdb` wrapper: recursively scans database directory determining which files need to be formatted or re-formatted.

Example usage:

makeblastdb = MAKEBLASTDB.new(database_dir)
makeblastdb.run # formats and re-formats databases in database_dir
makeblastdb.formatted_fastas # lists formatted databases

Constants

GUESS_SAMPLE_SIZE

Attributes

database_dir[R]

Public Class Methods

new(database_dir) click to toggle source
# File lib/sequenceserver/makeblastdb.rb, line 20
def initialize(database_dir)
  @database_dir = database_dir
end

Public Instance Methods

any_formatted?() click to toggle source
# File lib/sequenceserver/makeblastdb.rb, line 26
def any_formatted?
  formatted_fastas.any?
end
any_to_format?() click to toggle source
# File lib/sequenceserver/makeblastdb.rb, line 87
def any_to_format?
  fastas_to_format.any?
end
any_to_format_or_reformat?() click to toggle source
# File lib/sequenceserver/makeblastdb.rb, line 30
def any_to_format_or_reformat?
  any_to_format? || any_to_reformat?
end
format() click to toggle source

Format any unformatted FASTA files in database directory. Returns Array of files that were formatted.

# File lib/sequenceserver/makeblastdb.rb, line 47
def format
  # Make the intent clear as well as ensure the program won't crash if we
  # accidentally call format before calling scan.
  return unless any_to_format?

  fastas_to_format.select do |path, title, type|
    make_blast_database('format', path, title, type)
  end
end
formatted_fastas() click to toggle source

Determines which FASTA files in the database directory are already formatted.

# File lib/sequenceserver/makeblastdb.rb, line 71
def formatted_fastas
  return @formatted_fastas if defined?(@formatted_fastas)

  @formatted_fastas = []

  blastdbcmd.each_line do |line|
    path, *rest = line.chomp.split("\t")
    next if multipart_database_name?(path)

    rest << get_categories(path)
    @formatted_fastas << Database.new(path, *rest)
  end

  @formatted_fastas
end
no_fastas?() click to toggle source
# File lib/sequenceserver/makeblastdb.rb, line 34
def no_fastas?
  probably_fastas.empty?
end
reformat() click to toggle source

Re-format databases that require reformatting. Returns Array of files that were reformatted.

# File lib/sequenceserver/makeblastdb.rb, line 59
def reformat
  # Make the intent clear as well as ensure the program won't crash if
  # we accidentally call reformat before calling scan.
  return unless any_to_reformat?

  fastas_to_reformat.select do |path, title, type, non_parse_seqids|
    make_blast_database('reformat', path, title, type, non_parse_seqids)
  end
end
run() click to toggle source

Runs makeblastdb on each file in ‘fastas_to_format` and `fastas_to_reformat`.

# File lib/sequenceserver/makeblastdb.rb, line 40
def run
  format
  reformat
end

Private Instance Methods

_make_blast_database(file, type, title, taxonomy) click to toggle source
# File lib/sequenceserver/makeblastdb.rb, line 216
    def _make_blast_database(file, type, title, taxonomy)
      cmd = "makeblastdb -parse_seqids -hash_index -in '#{file}'" \
            " -dbtype #{type.to_s.slice(0, 4)} -title '#{title}'" \
            " #{taxonomy}"

      output = if File.directory?(file)
                 File.join(file, 'makeblastdb')
               else
                 "#{file}.makeblastdb"
               end

      out, err = sys(
        cmd,
        path: config[:bin],
        stderr: [output, 'stderr'].join,
        stdout: [output, 'stdout'].join
      )

      puts out.strip
      puts err.strip

      true
    rescue CommandFailed => e
      puts <<~MSG
        Could not create BLAST database for: #{file}
        Tried: #{cmd}
        stdout: #{e.stdout}
        stderr: #{e.stderr}
      MSG
      exit!
    end
any_to_reformat?() click to toggle source
# File lib/sequenceserver/makeblastdb.rb, line 93
def any_to_reformat?
  fastas_to_reformat.any?
end
blastdbcmd() click to toggle source

Runs ‘blastdbcmd` to determine formatted FASTA files in the database directory. Returns the output of `blastdbcmd`. This method is called by `determine_formatted_fastas`.

# File lib/sequenceserver/makeblastdb.rb, line 146
def blastdbcmd
  cmd = "blastdbcmd -recursive -list #{config[:database_dir]}" \
        ' -list_outfmt "%f  %t       %p    %n %l      %d   %v"'
  out, err = sys(cmd, path: config[:bin])
  errpat = /BLAST Database error/
  fail BLAST_DATABASE_ERROR.new(cmd, err) if err.match(errpat)

  out
rescue CommandFailed => e
  raise BLAST_DATABASE_ERROR.new(cmd, e.stderr)
end
confirm_database_title(default) click to toggle source

Show the database title that we are going to use to the user for confirmation.

Returns user input if any. Auto-determined title otherwise.

# File lib/sequenceserver/makeblastdb.rb, line 186
def confirm_database_title(default)
  print "Enter a database title or will use '#{default}': "
  from_user = STDIN.gets.to_s.strip
  from_user.empty? && default || from_user
end
extract_fasta(db) click to toggle source

Extract FASTA file from BLAST database.

Invoked while reformatting a BLAST database if the corresponding FASTA file does not exist.

# File lib/sequenceserver/makeblastdb.rb, line 252
    def extract_fasta(db)
      puts
      puts 'Extracting sequences ...'
      cmd = "blastdbcmd -entry all -db #{db}"
      sys(cmd, stdout: db, path: config[:bin])
    rescue CommandFailed => e
      puts <<~MSG
        Could not extract sequences from: #{db}
        Tried: #{cmd}
        stdout: #{e.stdout}
        stderr: #{e.stderr}
      MSG
      exit!
    end
extract_taxid_map(db, taxmap_file) click to toggle source
# File lib/sequenceserver/makeblastdb.rb, line 267
def extract_taxid_map(db, taxmap_file)
  cmd = "blastdbcmd -entry all -db #{db} -outfmt '%i %T'"
  sys(cmd, stdout: taxmap_file, path: config[:bin])
rescue CommandFailed => e
  # silence error
end
fastas_to_format() click to toggle source

Determines which FASTA files in the database directory are unformatted.

# File lib/sequenceserver/makeblastdb.rb, line 112
def fastas_to_format
  return @fastas_to_format if defined?(@fastas_to_format)

  formatted_fasta_paths = formatted_fastas.map { |f| f[0] }
  fasta_paths_to_format = probably_fastas - formatted_fasta_paths

  @fastas_to_format = fasta_paths_to_format.map do |path|
    [
      path,
      make_db_title(path),
      guess_sequence_type_in_fasta(path)
    ]
  end

  @fastas_to_format
end
fastas_to_reformat() click to toggle source

Determines which FASTA files in the database directory require reformatting.

# File lib/sequenceserver/makeblastdb.rb, line 99
def fastas_to_reformat
  return @fastas_to_reformat if defined?(@fastas_to_reformat)

  @fastas_to_reformat = []
  formatted_fastas.each do |ff|
    @fastas_to_reformat << [ff.path, ff.title, ff.type, ff.non_parse_seqids?] if ff.v4? || ff.non_parse_seqids?
  end

  @fastas_to_reformat
end
get_categories(path) click to toggle source
# File lib/sequenceserver/makeblastdb.rb, line 287
def get_categories(path)
  path
    .gsub(config[:database_dir], '') # remove database_dir from path
    .split('/')
    .reject(&:empty?)[0..-2] # the first entry might be '' if database_dir does not end with /
end
guess_sequence_type_in_fasta(file) click to toggle source

Guess whether FASTA file contains protein or nucleotide sequences by sampling a few few characters of the file.

# File lib/sequenceserver/makeblastdb.rb, line 324
def guess_sequence_type_in_fasta(file)
  sequences = sample_sequences(file)
  sequence_types = sequences.map { |seq| Sequence.guess_type(seq) }
  sequence_types = sequence_types.uniq.compact
  (sequence_types.length == 1) && sequence_types.first
end
make_blast_database(action, file, title, type, non_parse_seqids = false) click to toggle source

Create BLAST database, given FASTA file and sequence type in FASTA file.

# File lib/sequenceserver/makeblastdb.rb, line 159
def make_blast_database(action, file, title, type, non_parse_seqids = false)
  return unless make_blast_database?(action, file, type)

  title = confirm_database_title(title)
  extract_fasta(file) unless File.exist?(file)
  taxonomy = taxid_map(file, non_parse_seqids) || taxid
  _make_blast_database(file, type, title, taxonomy)
end
make_blast_database?(action, file, type) click to toggle source

Show file path and guessed sequence type to the user and obtain a y/n response.

Returns true if the user entered anything but ‘n’ or ‘N’.

# File lib/sequenceserver/makeblastdb.rb, line 172
def make_blast_database?(action, file, type)
  puts
  puts
  puts "FASTA file to #{action}: #{file}"
  puts "FASTA type: #{type}"
  print 'Proceed? [y/n] (Default: y): '
  response = STDIN.gets.to_s.strip
  !response.match(/n/i)
end
make_db_title(path) click to toggle source

Suggests improved titles when generating database names from files for improved apperance and readability in web interface. For example: Cobs1.4.proteins.fasta -> Cobs 1.4 proteins S_invicta.xx.2.5.small.nucl.fa -> S invicta xx 2.5 small nucl

# File lib/sequenceserver/makeblastdb.rb, line 308
def make_db_title(path)
  db_name = File.basename(path)
  db_name.tr!('"', "'")
  # removes .fasta like extension names
  db_name.gsub!(File.extname(db_name), '')
  # replaces _ with ' ',
  db_name.gsub!(/(_)/, ' ')
  # replaces '.' with ' ' when no numbers are on either side,
  db_name.gsub!(/(\D)\.(?=\D)/, '\1 ')
  # preserves version numbers
  db_name.gsub!(/\W*(\d+([.-]\d+)+)\W*/, ' \1 ')
  db_name
end
multipart_database_name?(db_name) click to toggle source

Returns true if the database name appears to be a multi-part database name.

e.g. /home/ben/pd.ben/sequenceserver/db/nr.00 => yes /home/ben/pd.ben/sequenceserver/db/nr => no /home/ben/pd.ben/sequenceserver/db/img3.5.finished.faa.01 => yes /home/ben/pd.ben/sequenceserver/db/nr00 => no /mnt/blast-db/refseq_genomic.100 => yes

# File lib/sequenceserver/makeblastdb.rb, line 283
def multipart_database_name?(db_name)
  !db_name.match(%r{.+/\S+\.\d{2,3}$}).nil?
end
probably_fasta?(file) click to toggle source

Returns true if first character of the file is ‘>’.

# File lib/sequenceserver/makeblastdb.rb, line 295
def probably_fasta?(file)
  unless file.match(/((cdna)|(cds)|(dna)|(fa)|(faa)|(fas)|(fasta)|(fna)|(genome)|(nt)|(nuc)|(pep)|(prot))$/i)
    return false
  end

  File.read(file, 1) == '>'
end
probably_fastas() click to toggle source
# File lib/sequenceserver/makeblastdb.rb, line 129
def probably_fastas
  return @probably_fastas if defined?(@probably_fastas)

  @probably_fastas = []

  Find.find(database_dir + '/') do |path|
    next if File.directory?(path)

    @probably_fastas << path if probably_fasta?(path)
  end

  @probably_fastas
end
sample_sequences(file, offset = 0) click to toggle source

Read first 1,048,576 characters of the file, split the read text on fasta def line pattern and return.

If the given file is FASTA, returns Array of as many different sequences in the portion of the file read. Returns the portion of the file read wrapped in an Array otherwise.

# File lib/sequenceserver/makeblastdb.rb, line 337
def sample_sequences(file, offset = 0)
  sample = File.read(file, GUESS_SAMPLE_SIZE, offset)

  return [] if sample.nil?

  # remove all unknown bases (indicated by 'N') before sampling
  sample.gsub!(/N/, '')
  meaningful_samples = sample.split(/^>.+$/).map { |line| line.gsub(/^\n+$/, '') }.delete_if(&:empty?)

  if meaningful_samples.empty?
    offset += GUESS_SAMPLE_SIZE
    sample_sequences(file, offset)
  else
    meaningful_samples
  end
end
taxid() click to toggle source

Get taxid from the user. Returns user input or 0.

Using 0 as taxid is equivalent to not setting taxid for the database that will be created.

# File lib/sequenceserver/makeblastdb.rb, line 206
def taxid
  default = 0
  print 'Enter taxid (optional): '
  user_response = STDIN.gets.strip
  "-taxid #{user_response.empty? && default || Integer(user_response)}"
rescue ArgumentError # presumably from call to Interger()
  puts 'taxid should be a number'
  retry
end
taxid_map(db, non_parse_seqids) click to toggle source

Check if a ‘.taxid_map.txt’ file exists. If not, try getting it using blastdbcmd.

# File lib/sequenceserver/makeblastdb.rb, line 194
def taxid_map(db, non_parse_seqids)
  return if non_parse_seqids

  taxid_map = db.sub(/#{File.extname(db)}$/, '.taxid_map.txt')
  extract_taxid_map(db, taxid_map) unless File.exist?(taxid_map)
  "-taxid_map #{taxid_map}" unless File.zero?(taxid_map)
end