module Bio::MAF::MAFParsing

MAF parsing code useful for sequential and random-access parsing.

Constants

BLOCK_START
BLOCK_START_OR_EOS
COMMENT
E
EOL_OR_EOF
I
JRUBY_P
Q
S
STRAND_SYM

Public Instance Methods

_parse_block() click to toggle source

Parse the block at the current position, joining fragments across chunk boundaries if necessary.

@return [Block] alignment block @api public

# File lib/bio/maf/parser.rb, line 196
# Parse the block at the current scanner position.
#
# Returns nil once the end of input has been flagged. Otherwise the
# scanner position is compared with the last recorded block position:
# when they coincide the block is handled as a trailing fragment, and
# in every other case it is parsed directly from the current chunk.
#
# @return [Block, nil] the alignment block, or nil at end of input
# @api public
def _parse_block
  return nil if at_end

  in_trailing_fragment = (s.pos == last_block_pos)
  in_trailing_fragment ? parse_trailing_fragment : parse_block_data
end
gather_leading_fragment() click to toggle source

Read chunks and accumulate a leading fragment until we encounter a block start or EOF.

# File lib/bio/maf/parser.rb, line 209
# Read chunks from the chunk reader, accumulating text until the start
# of the next alignment block is found or the reader is exhausted.
#
# Sets @at_end when the reader returns no further chunk.
#
# @return [Array] three values: the accumulated leading fragment
#   (String), a StringScanner over the last chunk that was read (nil if
#   no chunk could be read at all), and the reader position recorded
#   just before the final read attempt.
def gather_leading_fragment
  leading_frag = ''.dup
  next_scanner = nil
  next_chunk_start = nil
  while true
    next_chunk_start = cr.pos
    next_chunk = cr.read_chunk
    unless next_chunk
      # EOF
      @at_end = true
      break
    end
    next_scanner = StringScanner.new(next_chunk)
    # An 'a' at the very beginning of this chunk starts a new block only
    # if the text immediately before the chunk ended with a newline. That
    # text is the fragment accumulated so far or, when nothing has been
    # accumulated yet, the current scanner's string.
    preceding = leading_frag.empty? ? s.string : leading_frag
    if preceding.end_with?("\n")
      # NOTE(review): BLOCK_START is defined elsewhere; presumably it
      # matches at a line start followed by 'a' -- confirm.
      pat = BLOCK_START
    else
      # The chunk begins mid-line, so a block start needs its own newline.
      pat = /(?:\n(?=a))/
    end
    frag = next_scanner.scan_until(pat)
    if frag
      # got block start
      leading_frag << frag
      break
    end
    # no block start in this chunk; keep all of it and read another
    leading_frag << next_chunk
  end
  return leading_frag, next_scanner, next_chunk_start
end
parse_block_data() click to toggle source

Parse a {Block} from the current position. Requires that {#s} and {#chunk_start} be set correctly.

@return [Block] the alignment block.

# File lib/bio/maf/parser.rb, line 298
# Parse a {Block} from the current scanner position. Requires that
# {#s} and {#chunk_start} be set correctly.
#
# The block's lines are everything from the 'a' line up to the next
# line beginning with 'a', or the rest of the scanner's string when no
# such line follows. 's' lines (and 'e' lines when parse_empty is set)
# are parsed through the sequence filter; 'i' and 'q' lines are attached
# to the most recently kept sequence when parse_extended is set.
#
# 'i'/'q' lines that follow a sequence removed by the filter are
# skipped, since they describe the removed sequence rather than the
# last kept one.
#
# @return [Block] the alignment block.
# @raise [ParseError] (via parse_error) on a malformed 'a' line, on an
#   'i'/'q' line whose source does not match the preceding sequence, or
#   on an unexpected line when opts[:strict] is set.
def parse_block_data
  block_start_pos = s.pos
  block_offset = chunk_start + block_start_pos
  s.scan(/^a\s*/) || parse_error("bad a line")
  block_vars = parse_maf_vars()
  seqs = []
  payload = s.scan_until(/^(?=a)/)
  unless payload
    # No further block in this string: consume everything that is left.
    payload = s.rest
    s.terminate
  end
  filtered = false
  # True while the most recent 's'/'e' line was removed by the filter.
  prev_filtered = false
  payload.split("\n").each do |line|
    first = line.getbyte(0)
    if first == S
      seq = parse_seq_line(line, sequence_filter)
      if seq
        seqs << seq
        prev_filtered = false
      else
        filtered = true
        prev_filtered = true
      end
    elsif first == E && parse_empty
      e_seq = parse_empty_line(line, sequence_filter)
      if e_seq
        seqs << e_seq
        prev_filtered = false
      else
        filtered = true
        prev_filtered = true
      end
    elsif first == I && parse_extended
      unless prev_filtered
        parts = line.split
        target = seqs.last
        parse_error("wrong i source #{parts[1]}!") unless target && target.source == parts[1]
        target.i_data = parts.slice(2..6)
      end
    elsif first == Q && parse_extended
      unless prev_filtered
        _, src, quality = line.split
        target = seqs.last
        parse_error("wrong q source #{src}!") unless target && target.source == src
        target.quality = quality
      end
    elsif [I, E, Q, COMMENT, nil].include? first
      # Recognised line type that is not being parsed, or a blank line.
      next
    else
      if opts[:strict]
        parse_error "unexpected line: '#{line}'"
      else
        LOG.warn "Ignoring invalid MAF line: '#{line}'"
      end
    end
  end
  b = Block.new(block_vars,
                seqs,
                block_offset,
                s.pos - block_start_pos,
                filtered)
  if opts[:retain_text]
    b.orig_text = s.string.slice(block_start_pos...(s.pos))
  end
  return b
end
parse_empty_line(line, filter) click to toggle source

Parse an ‘e’ line. @return [EmptySequence]

# File lib/bio/maf/parser.rb, line 376
# Parse an 'e' line into an EmptySequence.
#
# The line is split on whitespace; fields after the leading tag are
# source, start, size, strand, source size and status.
#
# @param [String] line the 'e' line
# @param [Object] filter sequence filter; when truthy, sources rejected
#   by seq_filter_ok? yield nil
# @return [EmptySequence, nil] nil when the source is filtered out
# @raise [ParseError] (via parse_error) when a KeyError is raised while
#   building the entry, e.g. the strand is not a key of STRAND_SYM
def parse_empty_line(line, filter)
  fields = line.split
  source = fields[1]
  return nil if filter && !seq_filter_ok?(source, filter)

  begin
    start_pos = fields[2].to_i
    length = fields[3].to_i
    strand_sym = STRAND_SYM.fetch(fields[4])
    source_length = fields[5].to_i
    EmptySequence.new(source, start_pos, length, strand_sym, source_length, fields[6])
  rescue KeyError
    parse_error "invalid empty sequence line: #{line}"
  end
end
parse_error(msg) click to toggle source

Raise a {ParseError}, indicating position within the MAF file and the chunk as well as the text surrounding the current scanner position.

@param [String] msg the error message

# File lib/bio/maf/parser.rb, line 274
# Raise a {ParseError} describing where parsing failed.
#
# The message contains up to 10 characters of text before the scanner
# position, the text from the position up to 10 characters past it, the
# position within the chunk, the position offset by chunk_start, and
# the last recorded block position.
#
# @param [String] msg the error message
# @raise [ParseError] always
def parse_error(msg)
  pos = s.pos
  text = s.string
  s_start = [pos - 10, 0].max
  s_end = [pos + 10, text.length].min
  # Exclusive range: yields '' at position 0 and keeps the left context
  # when fewer than 10 characters precede the position.
  left = text[s_start...pos]
  right = text[pos..s_end]
  extra = "pos #{pos} [#{chunk_start + pos}], last #{last_block_pos}"

  raise ParseError, "#{msg} at: '#{left}>><<#{right}' (#{extra})"
end
parse_maf_vars() click to toggle source

Parse key-value pairs from the MAF header or an ‘a’ line. @return [Hash]

# File lib/bio/maf/parser.rb, line 405
# Consume consecutive key=value pairs at the scanner position.
#
# Each pair must be followed by whitespace; scanning stops at the first
# position where no such pair matches. Keys become symbols and values
# stay strings.
#
# @return [Hash] the parsed pairs, empty if none matched
def parse_maf_vars
  pairs = {}
  until s.scan(/(\w+)=(\S*)\s+/).nil?
    key = s[1].to_sym
    pairs[key] = s[2]
  end
  pairs
end
parse_seq_line(line, filter) click to toggle source

Parse an ‘s’ line. @return [Sequence]

# File lib/bio/maf/parser.rb, line 359
# Parse an 's' line into a Sequence.
#
# The line is split on whitespace; fields after the leading tag are
# source, start, size, strand, source size and alignment text.
#
# @param [String] line the 's' line
# @param [Object] filter sequence filter; when truthy, sources rejected
#   by seq_filter_ok? yield nil
# @return [Sequence, nil] nil when the source is filtered out
# @raise [ParseError] (via parse_error) when a KeyError is raised while
#   building the entry, e.g. the strand is not a key of STRAND_SYM
def parse_seq_line(line, filter)
  fields = line.split
  source = fields[1]
  return nil if filter && !seq_filter_ok?(source, filter)

  begin
    start_pos = fields[2].to_i
    length = fields[3].to_i
    strand_sym = STRAND_SYM.fetch(fields[4])
    source_length = fields[5].to_i
    Sequence.new(source, start_pos, length, strand_sym, source_length, fields[6])
  rescue KeyError
    parse_error "invalid sequence line: #{line}"
  end
end
parse_trailing_fragment() click to toggle source

Join the trailing fragment of the current chunk with the leading fragment of the next chunk and parse the resulting block.

@return [Block] the alignment block.

# File lib/bio/maf/parser.rb, line 248
# Join the unparsed remainder of the current chunk with the leading
# fragment gathered from the following chunk(s) and parse the result
# as a single block.
#
# While the joined text is parsed, @s and @chunk_start are pointed at
# it; afterwards they are switched to the scanner and start position
# returned by gather_leading_fragment. The last block position is
# recomputed unless @at_end has been set.
#
# @return [Block] the alignment block.
# @raise [ParseError] (via parse_error) if the joined text cannot be
#   parsed; the message includes both fragments
def parse_trailing_fragment
  # Must run first: it advances the chunk reader and may set @at_end.
  head_of_next, upcoming_scanner, upcoming_start = gather_leading_fragment

  tail_of_current = s.rest
  combined = tail_of_current + head_of_next
  @chunk_start = chunk_start + s.pos
  @s = StringScanner.new(combined)

  block = nil
  begin
    block = parse_block_data
  rescue ParseError => pe
    parse_error "Could not parse joined fragments: #{pe}\nTRAILING: #{tail_of_current}\nLEADING: #{head_of_next}"
  end

  # Hand over to the next chunk for subsequent parsing.
  @s = upcoming_scanner
  @chunk_start = upcoming_start
  set_last_block_pos! unless @at_end
  return block
end
seq_filter_ok?(src, filter) click to toggle source

Indicates whether the given sequence source should be parsed, given the current sequence filters.

# File lib/bio/maf/parser.rb, line 393
# Decide whether a sequence source passes the given filter.
#
# Without an :only_species entry in the filter every source passes.
# Otherwise the part of +src+ before the first '.' is looked up among
# the listed species.
#
# @param [String] src sequence source, e.g. "species.chrom"
# @param [Hash] filter filter options; only :only_species is consulted
# @return [true, Object, nil] true when no species restriction exists;
#   otherwise the matching entry of :only_species, or nil if none match
def seq_filter_ok?(src, filter)
  return true unless filter[:only_species]

  species = src.split('.', 2).first
  filter[:only_species].find { |candidate| species == candidate }
end
set_last_block_pos!() click to toggle source
# File lib/bio/maf/parser.rb, line 176
# Record, in @last_block_pos, the index of the last BLOCK_START match
# within the current scanner's string (nil when there is no match).
#
# @return [Integer, nil] the recorded position
def set_last_block_pos!
  chunk_text = s.string
  final_start = chunk_text.rindex(BLOCK_START)
  @last_block_pos = final_start
end
trailing_nl?(string) click to toggle source

Does `string` have a trailing newline?

# File lib/bio/maf/parser.rb, line 414
# Does +string+ end with a newline?
#
# @param [String] string the text to inspect
# @return [Boolean] true if the last character is "\n"; false
#   otherwise, including for an empty string
def trailing_nl?(string)
  string.end_with?("\n")
end