class MgNu::Parser::Blast::Format0

Attributes

blast_type[RW]
queries[RW]

Public Class Methods

new(file) click to toggle source

create a new Format0 parser

# File lib/mgnu/parser/blast/format0.rb, line 14
def initialize(file)
  @query = nil
  @sbjct = nil
  @sbjct_number = 0
  @blast_type = nil
  @queries = []

  @file = file
end

Public Instance Methods

extract_all_hsps(buffer) click to toggle source

create Hsp objects from the complete alignment section

@param [Array] buffer containing all the lines from the

alignment section
# File lib/mgnu/parser/blast/format0.rb, line 158
def extract_all_hsps(buffer)
  unless buffer[0] =~ /^\s+Score =/
    $stderr.puts "can't process HSP buffer - missing Score = line!"
    exit(1)
  end

  hsp_buffer = Array.new
  while line = buffer.shift
    if line =~ /^\s+Score =/ and hsp_buffer.length > 0
      process_hsp(hsp_buffer)              
      hsp_buffer.clear
      buffer.unshift(line)
    else
      hsp_buffer << line
    end
  end
  process_hsp(hsp_buffer) if hsp_buffer.length > 0
end
extract_dbinfo(buffer) click to toggle source
# File lib/mgnu/parser/blast/format0.rb, line 87
def extract_dbinfo(buffer)
  str = ""
  while line = buffer.shift
    break if line =~ /^\s*$/ # empty line, break
    str += line.chomp
  end
  
  str.gsub!(/\s+/," ")
  if str =~ /Database:\s+(.+)\.?\s+([0-9,]+)\s+sequences;\s+([0-9,]+)\s+total\s+letters/
    db_name, db_seq_count, db_total_letters = $1, $2, $3
    @query.database = db_name
    @query.database_sequence_count = db_seq_count.gsub(",","").to_i
    @query.database_total_letters = db_total_letters.gsub(",","").to_i
  else
    $stderr.puts "extract_dbinfo: Database line mismatch!"
    $stderr.puts "database, database_sequence_count and database_total_letters are not set"
    $stderr.puts str
  end

  # eat up single-line summary cruft until beginning of subjects
  while line = buffer.shift
    if line =~ /^>/ # first sbjct, break
      buffer.unshift(line)
      break
    end
  end
end
extract_query(buffer) click to toggle source
# File lib/mgnu/parser/blast/format0.rb, line 47
def extract_query(buffer)
  str = ""
  while line = buffer.shift
    break if line =~ /^\s*$/ # empty line, break
    str += line.chomp
  end
  str.gsub!(/\s+/, " ")
  @query = Query.new
  if str =~ /^Query= (.+?) (.*) ?\(([0-9,]+) letters\)/
    @query.query_id = $1
    @query.definition = $2
    @query.length = $3.gsub(",","").to_i
  else
    # Blast+ output has query id/definition and length on separate lines
    while line = buffer.shift
      str += line.chomp
      break if line =~ /Length=/
    end
    if str =~ /^Query=\s([^\s]+)\s?(.*)\s?Length=([,\d]+)/
      @query.query_id = $1
      @query.definition = $2
      @query.length = $3.gsub(",","").to_i
    end
  end
  extract_dbinfo(buffer)

  sbjct_buffer = Array.new
  while line = buffer.shift
    if line =~ /^>/ and sbjct_buffer.length > 0
      extract_sbjct(sbjct_buffer)
      sbjct_buffer.clear
      buffer.unshift(line)
    else
      sbjct_buffer << line
    end
  end
  extract_sbjct(sbjct_buffer) if sbjct_buffer.length > 0

end
extract_sbjct(buffer) click to toggle source
# File lib/mgnu/parser/blast/format0.rb, line 115
def extract_sbjct(buffer)
  if buffer[0] !~ /^>/
    $stderr.puts "can't process subject buffer - missing fasta header line!"
    exit(1)
  end

  str = ""
  # read until blank line to get header, but ensure that we already have Length= line
  while line = buffer.shift
    break if line =~ /^\s*$/ && str =~ /Length\s*=/
    str += line.chomp
  end
  str.gsub!(/\s+/," ") # shrink spaces
  @sbjct = Sbjct.new
  if str =~ />(.+?)\s+(.*)\s*Length\s+=\s+(\d+)/m or
     str =~ />\s(.+?)\s+(.*)\s*Length=\s?(\d+)/m
    @sbjct.number = @query.sbjcts.length + 1
    @sbjct.sbjct_id = $1
    @sbjct.definition = $2.rstrip
    @sbjct.length = $3.to_i
    @sbjct.query = @query
  end
 
  hsp_buffer = Array.new
  while line = buffer.shift
    if line =~ /^>/ and hsp_buffer.length > 0
      extract_all_hsps(hsp_buffer)
      hsp_buffer.clear
      buffer.unshift(line)
      break
    else
      hsp_buffer << line
    end
  end
  extract_all_hsps(hsp_buffer) if hsp_buffer.length > 0
  
  @query.sbjcts << @sbjct
end
parse() click to toggle source

parse the input blast file

# File lib/mgnu/parser/blast/format0.rb, line 25
def parse
  line = @file.readline
  @blast_type = line.split[0]
  buffer = parse_until(@file,/^Query=/) # get the 1st chunk of the blast report
  while buffer.length > 0
    if buffer[0] =~ /^Query=/
      process_buffer(buffer)
    end
    buffer = parse_until(@file,/^Query=/) 
  end # end while
end
process_buffer(buffer) click to toggle source

filter a blast query entry for important parts @param [Array] buffer containing a “Query=” block from

the blast output file

@return [Bool] success or failure of the processing

# File lib/mgnu/parser/blast/format0.rb, line 41
def process_buffer(buffer)
  return false if buffer.length == 0
  extract_query(buffer)
  @queries << @query
end
process_hsp(buffer) click to toggle source
# File lib/mgnu/parser/blast/format0.rb, line 177
def process_hsp(buffer)
  unless buffer[0] =~ /^\s+Score =/
    $stderr.puts "can't process HSP buffer - missing Score = line!"
    exit(1)
  end

  str = ""
  # read until blank line to get header
  while line = buffer.shift
    break if line =~ /^\s*$/
    str += line.chomp
  end

  hsp = Hsp.new
  if str =~ / Score =\s+(\d+(?:\.\d+)?)\s+bits\s+\((\d+)\)/
    hsp.bit_score = $1.to_f
    hsp.score = $2.to_i
  end

  if str =~ /Expect.*\s+=\s+(\d+\.\d+)/ or
     str =~ /Expect.*\s+=\s+(\d+e-\d+)/ or
     str =~ /Expect.*\s+=\s+(e-\d+)/
    hsp.evalue = $1.to_f 
  end

  if str =~ /Identities\s+=\s+(\d+)\/(\d+)\s+\((\d+%)\)/
    hsp.length = $2.to_i
    hsp.identity = $3.to_i
  end

  if str =~ /Positives\s+=\s+(\d+)\/(\d+)\s+\((\d+)%\)/
    hsp.positive = $1.to_i
  end

  if str =~ /Gaps\s+=\s+(\d+)\/(\d+)\s+\((\d+%)\)/
    hsp.gap_count = $1.to_i
  end

  if str =~ /Frame\s+=\s+([+-]\d)/
    hsp.query_frame = $1
  elsif str =~ /Frame\s+=\s+([+-]\d)\s+\/\s+([+-]\d)/
    hsp.query_frame = $1
    hsp.sbjct_frame = $2
  end
  
  if str =~ /Strand\s+=\s+(Plus|Minus)\s+\/\s+(Plus|Minus)/
    hsp.query_frame = $1 == "Plus" ? 1 : -1
    hsp.sbjct_frame = $2 == "Plus" ? 1 : -1
  end

  # read remaining buffer lines for the alignment
  # buffer.delete_if {|x| x =~ /^\s*$/} # drop empty lines

  query_to = nil
  sbjct_to = nil
  while buffer.length > 0
    line = buffer.shift
    if line =~ /Query/
      q_line = line
      m_line = buffer.shift
      s_line = buffer.shift
      leader = 0
     
      break if q_line =~ /#{@blast_type}/ # end of hsps so exit
      if q_line =~ /\s+Database:\s+#{@query.database}/ # end of report so exit
        break
      else
      end

      # process query line
      unless q_line =~ /^Query/
        $stderr.puts "Query line is malformed - skipping alignment"
        $stderr.puts q_line
        break
      end

      q_line =~ /^Query:?\s+(\d+)\s*(.+?)\s+(\d+)$/
      if hsp.query_from.nil?
        hsp.query_from = $1.to_i
      end
      hsp.query_sequence += $2
      query_to = $3.to_i

      if leader == 0
        q_line =~ /^(Query:?\s+\d+\s*)/
        leader = $1.length
      end
      
      # process mid line
      hsp.midline += m_line[leader,m_line.length]
      
      # process sbjct line
      unless s_line =~ /^Sbjct/
        $stderr.puts "Sbjct line is malformed - skipping alignment"
        $stderr.puts s_line
        break
      end

      s_line =~ /^Sbjct:?\s+(\d+)\s+(.+?)\s+(\d+)$/
      if hsp.sbjct_from.nil?
        hsp.sbjct_from = $1.to_i
      end
      hsp.sbjct_sequence += $2
      sbjct_to = $3.to_i
    end # end of if line =~ /Query/
  end # end while buffer.length > 0
  hsp.query_to = query_to
  hsp.sbjct_to = sbjct_to
  @sbjct.hsps << hsp # add this hsp to the sbjct
end