class SequenceServer::BLAST::Report
Captures results of a BLAST
search.
A report is constructed from a search id. Search id is simply the basename of the temporary file that holds BLAST
results in binary BLAST
archive format.
For a given search id, result is obtained in XML format using the Formatter
class, parsed into a simple intermediate representation (Array of values and Arrays) and information extracted from the intermediate representation (ir).
Constants
- PARSEABLE_AS_ARRAY
- PARSEABLE_AS_HASH
Public Class Methods
# File lib/sequenceserver/blast/report.rb, line 25 def initialize(job) super do @querydb = job.databases end end
Public Instance Methods
# File lib/sequenceserver/blast/report.rb, line 70 def dbtype @dbtype ||= querydb&.first&.type || dbtype_from_program end
# File lib/sequenceserver/blast/report.rb, line 50 def done? return true if job.imported_xml_file File.exist?(xml_formatter.filepath) && File.exist?(tsv_formatter.filepath) end
# File lib/sequenceserver/blast/report.rb, line 74 def params @params ||= extract_params end
# File lib/sequenceserver/blast/report.rb, line 56 def program @program ||= xml_ir[0] end
# File lib/sequenceserver/blast/report.rb, line 60 def program_version @program_version ||= xml_ir[1] end
# File lib/sequenceserver/blast/report.rb, line 82 def queries @queries ||= xml_ir[8].map do |n| query = Query.new(self, n[0], n[2], n[3], []) query.hits = query_hits(n[4], tsv_ir[query.id], query) query end end
# File lib/sequenceserver/blast/report.rb, line 64 def querydb @querydb ||= xml_ir[3].split.map do |path| { title: File.basename(path) } end end
# File lib/sequenceserver/blast/report.rb, line 78 def stats @stats ||= extract_stats end
# File lib/sequenceserver/blast/report.rb, line 31 def to_json(*_args) %i[querydb program program_version params stats queries].inject({}) do |h, k| h[k] = send(k) h end.update(search_id: job.id, submitted_at: job.submitted_at.utc, imported_xml: !job.imported_xml_file.nil?, seqserv_version: SequenceServer::VERSION, cloud_sharing_enabled: SequenceServer.config[:cloud_share_url].start_with?('http'), non_parse_seqids: !!job.databases&.any?(&:non_parse_seqids?)).to_json end
# File lib/sequenceserver/blast/report.rb, line 44 def xml_file_size return File.size(job.imported_xml_file) if job.imported_xml_file xml_formatter.size end
Private Instance Methods
Returns database type (nucleotide or protein) inferred from Report#program
(i.e., the BLAST
algorithm)
# File lib/sequenceserver/blast/report.rb, line 287 def dbtype_from_program case program when /blastn|tblastn|tblastx/ 'nucleotide' when /blastp|blastx/ 'protein' end end
Search params tweak the results. Like evalue cutoff or penalty to open a gap. BLAST+ doesn’t list all input params in the XML output. Only matrix, evalue, gapopen, gapextend, and filters are available from XML output.
# File lib/sequenceserver/blast/report.rb, line 129 def extract_params # Parse/get params from the job first. job_params = parse_advanced(job.advanced) # Old jobs from beta releases may not have the advanced key but they # will have the deprecated advanced_params key. job_params.update(job.advanced_params) if job.advanced_params # Parse params from BLAST XML. @params = Hash[ *xml_ir[7].first.map { |k, v| [k.gsub('Parameters_', ''), v] }.flatten ] @params['evalue'] = @params.delete('expect') # Merge into job_params. @params = job_params.merge(@params) end
Search stats are computed metrics. Like total number of sequences or effective search space.
# File lib/sequenceserver/blast/report.rb, line 148 def extract_stats stats = xml_ir[8].first[5][0] { nsequences: stats[0], ncharacters: stats[1], hsp_length: stats[2], search_space: stats[3], kappa: stats[4], labmda: stats[5], entropy: stats[6] } end
# File lib/sequenceserver/blast/report.rb, line 241 def first_text(node) node.nodes.find { |n| n.is_a? String } end
# File lib/sequenceserver/blast/report.rb, line 189 def hsps(xml_ir, tsv_ir, hit) xml_ir.map.with_index do |n, i| n.insert(14, tsv_ir[i]) HSP.new(hit, *n) end end
# File lib/sequenceserver/blast/report.rb, line 224 def node_to_array(element) element.nodes.map { |n| node_to_value n } end
# File lib/sequenceserver/blast/report.rb, line 220 def node_to_hash(element) Hash[*element.nodes.map { |n| [n.name, node_to_value(n)] }.flatten] end
# File lib/sequenceserver/blast/report.rb, line 228 def node_to_value(node) # Ensure that the recursion doesn't fails when String value is received. return node if node.is_a?(String) if PARSEABLE_AS_HASH.include? node.name node_to_hash(node) elsif PARSEABLE_AS_ARRAY.include? node.name node_to_array(node) else first_text(node) end end
Parse BLAST
CLI string from job.advanced.
# File lib/sequenceserver/blast/report.rb, line 267 def parse_advanced(param_line) param_list = (param_line || '').split(' ') res = {} param_list.each_with_index do |word, i| nxt = param_list[i + 1] next unless word.start_with? '-' word.sub!('-', '') res[word] = unless nxt.nil? || nxt.start_with?('-') nxt else 'True' end end res end
Parses the given TSV string as:
{
qseqid: { sseqid: [sciname, qcovs, [qcovhsp]], ... }, ...
}
# File lib/sequenceserver/blast/report.rb, line 254 def parse_tsv(tsv) ir = Hash.new { |h, k| h[k] = {} } tsv.each_line do |line| next if line.start_with? '#' row = line.chomp.split("\t") (ir[row[0]][row[1]] ||= [row[2], row[3], []])[2] << row[4] end ir end
# File lib/sequenceserver/blast/report.rb, line 197 def parse_xml(xml) node_to_array Ox.parse(xml).root rescue Ox::ParseError raise 'Error parsing XML file' if job.imported_xml_file raise InputError, <<~MSG BLAST generated incorrect XML output. This can happen if sequence ids in your databases are not unique across all files. As a temporary workaround, you can repeat the search with one database at a time. Proper fix is to recreate the following databases with unique sequence ids: #{querydb.map(&:title).join(', ')} If you are not the one managing this server, try to let the manager know about this. MSG end
Create Hit objects for the given query from the given ir.
# File lib/sequenceserver/blast/report.rb, line 162 def query_hits(xml_ir, tsv_ir, query) return [] if xml_ir == ["\n"] # => No hits. xml_ir.map do |n| # If hit comes from a non -parse_seqids database, then id (n[1]) is a # BLAST assigned internal id of the format 'gnl|BL_ORD_ID|serial'. We # assign the id to accession (because we use accession for sequence # retrieval and this id is what blastdbcmd expects for non # -parse_seqids databases) and parse the hit defline to # obtain id and title ourselves (we use id and title # for display purposes). if n[1] =~ /^gnl\|BL_ORD_ID\|\d+/ n[3] = n[1] defline = n[2].split n[1] = defline.shift n[2] = defline.join(' ') end hit = Hit.new(query, n[0], n[1], n[3], n[2], n[4], tsv_ir[n[1]][0], tsv_ir[n[1]][1], []) hit.hsps = hsps(n[5], tsv_ir[n[1]][2], hit) hit end end
# File lib/sequenceserver/blast/report.rb, line 121 def tsv_formatter @tsv_formatter ||= Formatter.run(job, 'custom_tsv') end
# File lib/sequenceserver/blast/report.rb, line 103 def tsv_ir @tsv_ir ||= if job.imported_xml_file Hash.new do |h1, k1| h1[k1] = Hash.new do |h2, k2| h2[k2] = ['', '', []] end end else job.raise! parse_tsv(tsv_formatter.read_file) end end
# File lib/sequenceserver/blast/report.rb, line 117 def xml_formatter @xml_formatter ||= Formatter.run(job, 'xml') end
# File lib/sequenceserver/blast/report.rb, line 93 def xml_ir @xml_ir ||= if job.imported_xml_file parse_xml File.read(job.imported_xml_file) else job.raise! parse_xml(xml_formatter.read_file) end end