class SequenceServer::BLAST::Report

Captures results of a BLAST search.

A report is constructed from a search id. Search id is simply the basename of the temporary file that holds BLAST results in binary BLAST archive format.

For a given search id, result is obtained in XML format using the Formatter class, parsed into a simple intermediate representation (Array of values and Arrays) and information extracted from the intermediate representation (ir).

Constants

PARSEABLE_AS_ARRAY
PARSEABLE_AS_HASH

Public Class Methods

new(job) click to toggle source
Calls superclass method
# File lib/sequenceserver/blast/report.rb, line 25
def initialize(job)
  super do
    @querydb = job.databases
  end
end

Public Instance Methods

dbtype() click to toggle source
# File lib/sequenceserver/blast/report.rb, line 70
def dbtype
  @dbtype ||= querydb&.first&.type || dbtype_from_program
end
done?() click to toggle source
# File lib/sequenceserver/blast/report.rb, line 50
def done?
  return true if job.imported_xml_file

  File.exist?(xml_formatter.filepath) && File.exist?(tsv_formatter.filepath)
end
params() click to toggle source
# File lib/sequenceserver/blast/report.rb, line 74
def params
  @params ||= extract_params
end
program() click to toggle source
# File lib/sequenceserver/blast/report.rb, line 56
def program
  @program ||= xml_ir[0]
end
program_version() click to toggle source
# File lib/sequenceserver/blast/report.rb, line 60
def program_version
  @program_version ||= xml_ir[1]
end
queries() click to toggle source
# File lib/sequenceserver/blast/report.rb, line 82
def queries
  @queries ||= xml_ir[8].map do |n|
    query = Query.new(self, n[0], n[2], n[3], [])
    query.hits = query_hits(n[4], tsv_ir[query.id], query)

    query
  end
end
querydb() click to toggle source
# File lib/sequenceserver/blast/report.rb, line 64
def querydb
  @querydb ||= xml_ir[3].split.map do |path|
    { title: File.basename(path) }
  end
end
stats() click to toggle source
# File lib/sequenceserver/blast/report.rb, line 78
def stats
  @stats ||= extract_stats
end
to_json(*_args) click to toggle source
# File lib/sequenceserver/blast/report.rb, line 31
def to_json(*_args)
  %i[querydb program program_version params stats
     queries].inject({}) do |h, k|
    h[k] = send(k)
    h
  end.update(search_id: job.id,
             submitted_at: job.submitted_at.utc,
             imported_xml: !job.imported_xml_file.nil?,
             seqserv_version: SequenceServer::VERSION,
             cloud_sharing_enabled: SequenceServer.config[:cloud_share_url].start_with?('http'),
             non_parse_seqids: !!job.databases&.any?(&:non_parse_seqids?)).to_json
end
xml_file_size() click to toggle source
# File lib/sequenceserver/blast/report.rb, line 44
def xml_file_size
  return File.size(job.imported_xml_file) if job.imported_xml_file

  xml_formatter.size
end

Private Instance Methods

dbtype_from_program() click to toggle source

Returns database type (nucleotide or protein) inferred from Report#program (i.e., the BLAST algorithm)

# File lib/sequenceserver/blast/report.rb, line 287
def dbtype_from_program
  case program
  when /blastn|tblastn|tblastx/
    'nucleotide'
  when /blastp|blastx/
    'protein'
  end
end
extract_params() click to toggle source

Search params tweak the results. Like evalue cutoff or penalty to open a gap. BLAST+ doesn’t list all input params in the XML output. Only matrix, evalue, gapopen, gapextend, and filters are available from XML output.

# File lib/sequenceserver/blast/report.rb, line 129
def extract_params
  # Parse/get params from the job first.
  job_params = parse_advanced(job.advanced)
  # Old jobs from beta releases may not have the advanced key but they
  # will have the deprecated advanced_params key.
  job_params.update(job.advanced_params) if job.advanced_params

  # Parse params from BLAST XML.
  @params = Hash[
    *xml_ir[7].first.map { |k, v| [k.gsub('Parameters_', ''), v] }.flatten
  ]
  @params['evalue'] = @params.delete('expect')

  # Merge into job_params.
  @params = job_params.merge(@params)
end
extract_stats() click to toggle source

Search stats are computed metrics. Like total number of sequences or effective search space.

# File lib/sequenceserver/blast/report.rb, line 148
def extract_stats
  stats = xml_ir[8].first[5][0]
  {
    nsequences: stats[0],
    ncharacters: stats[1],
    hsp_length: stats[2],
    search_space: stats[3],
    kappa: stats[4],
    labmda: stats[5],
    entropy: stats[6]
  }
end
first_text(node) click to toggle source
# File lib/sequenceserver/blast/report.rb, line 241
def first_text(node)
  node.nodes.find { |n| n.is_a? String }
end
hsps(xml_ir, tsv_ir, hit) click to toggle source
# File lib/sequenceserver/blast/report.rb, line 189
def hsps(xml_ir, tsv_ir, hit)
  xml_ir.map.with_index do |n, i|
    n.insert(14, tsv_ir[i])

    HSP.new(hit, *n)
  end
end
node_to_array(element) click to toggle source
# File lib/sequenceserver/blast/report.rb, line 224
def node_to_array(element)
  element.nodes.map { |n| node_to_value n }
end
node_to_hash(element) click to toggle source
# File lib/sequenceserver/blast/report.rb, line 220
def node_to_hash(element)
  Hash[*element.nodes.map { |n| [n.name, node_to_value(n)] }.flatten]
end
node_to_value(node) click to toggle source
# File lib/sequenceserver/blast/report.rb, line 228
def node_to_value(node)
  # Ensure that the recursion doesn't fails when String value is received.
  return node if node.is_a?(String)

  if PARSEABLE_AS_HASH.include? node.name
    node_to_hash(node)
  elsif PARSEABLE_AS_ARRAY.include? node.name
    node_to_array(node)
  else
    first_text(node)
  end
end
parse_advanced(param_line) click to toggle source

Parse BLAST CLI string from job.advanced.

# File lib/sequenceserver/blast/report.rb, line 267
def parse_advanced(param_line)
  param_list = (param_line || '').split(' ')
  res = {}

  param_list.each_with_index do |word, i|
    nxt = param_list[i + 1]
    next unless word.start_with? '-'

    word.sub!('-', '')
    res[word] = unless nxt.nil? || nxt.start_with?('-')
                  nxt
                else
                  'True'
                end
  end
  res
end
parse_tsv(tsv) click to toggle source

Parses the given TSV string as:

{

qseqid: {
  sseqid: [sciname, qcovs, [qcovhsp]],
  ...
},
...

}

# File lib/sequenceserver/blast/report.rb, line 254
def parse_tsv(tsv)
  ir = Hash.new { |h, k| h[k] = {} }
  tsv.each_line do |line|
    next if line.start_with? '#'

    row = line.chomp.split("\t")

    (ir[row[0]][row[1]] ||= [row[2], row[3], []])[2] << row[4]
  end
  ir
end
parse_xml(xml) click to toggle source
# File lib/sequenceserver/blast/report.rb, line 197
      def parse_xml(xml)
        node_to_array Ox.parse(xml).root
      rescue Ox::ParseError
        raise 'Error parsing XML file' if job.imported_xml_file

        raise InputError, <<~MSG
          BLAST generated incorrect XML output. This can happen if sequence ids in your
          databases are not unique across all files. As a temporary workaround, you can
          repeat the search with one database at a time. Proper fix is to recreate the
          following databases with unique sequence ids:

              #{querydb.map(&:title).join(', ')}

          If you are not the one managing this server, try to let the manager know
          about this.
        MSG
      end
query_hits(xml_ir, tsv_ir, query) click to toggle source

Create Hit objects for the given query from the given ir.

# File lib/sequenceserver/blast/report.rb, line 162
def query_hits(xml_ir, tsv_ir, query)
  return [] if xml_ir == ["\n"] # => No hits.

  xml_ir.map do |n|
    # If hit comes from a non -parse_seqids database, then id (n[1]) is a
    # BLAST assigned internal id of the format 'gnl|BL_ORD_ID|serial'. We
    # assign the id to accession (because we use accession for sequence
    # retrieval and this id is what blastdbcmd expects for non
    # -parse_seqids databases) and parse the hit defline to
    # obtain id and title ourselves (we use id and title
    # for display purposes).
    if n[1] =~ /^gnl\|BL_ORD_ID\|\d+/
      n[3] = n[1]
      defline = n[2].split
      n[1] = defline.shift
      n[2] = defline.join(' ')
    end

    hit = Hit.new(query, n[0], n[1], n[3], n[2], n[4],
                  tsv_ir[n[1]][0], tsv_ir[n[1]][1], [])

    hit.hsps = hsps(n[5], tsv_ir[n[1]][2], hit)

    hit
  end
end
tsv_formatter() click to toggle source
# File lib/sequenceserver/blast/report.rb, line 121
def tsv_formatter
  @tsv_formatter ||= Formatter.run(job, 'custom_tsv')
end
tsv_ir() click to toggle source
# File lib/sequenceserver/blast/report.rb, line 103
def tsv_ir
  @tsv_ir ||=
    if job.imported_xml_file
      Hash.new do |h1, k1|
        h1[k1] = Hash.new do |h2, k2|
          h2[k2] = ['', '', []]
        end
      end
    else
      job.raise!
      parse_tsv(tsv_formatter.read_file)
    end
end
xml_formatter() click to toggle source
# File lib/sequenceserver/blast/report.rb, line 117
def xml_formatter
  @xml_formatter ||= Formatter.run(job, 'xml')
end
xml_ir() click to toggle source
# File lib/sequenceserver/blast/report.rb, line 93
def xml_ir
  @xml_ir ||=
    if job.imported_xml_file
      parse_xml File.read(job.imported_xml_file)
    else
      job.raise!
      parse_xml(xml_formatter.read_file)
    end
end