class Bio::Newick

Public Instance Methods

__parse_newick_tokenize(str, options) click to toggle source

Splits a Newick-format string into tokens.

# File lib/iroki/main/main.rb, line 48
# Splits a Newick-format string into a flat array of tokens.
#
# The structural characters "(", ")", ",", ":", "[" and "]" are returned as
# Symbols; labels and any other text are returned as Strings.
#
# The tokenizer is chosen by __get_option(:parser, options):
# * :naive -- split on the structural characters only; quotes get no special
#   treatment and underscores are left untouched.
# * :iroki -- full tokenizer; underscores in unquoted labels are preserved.
# * any other value -- full tokenizer; underscores in unquoted labels are
#   converted to blanks.
#
# Quoted labels follow
# http://evolution.genetics.washington.edu/phylip/newick_doc.html :
#   quoted_label ==> ' string_of_printing_characters '
# and a single quote inside a quoted_label is written as '' (two single
# quotes).
#
# str::     the Newick text; one terminating ";" (and any whitespace that
#           follows it) is dropped before tokenizing
# options:: option hash handed to __get_option
# Returns:: Array of Symbol and String tokens
def __parse_newick_tokenize(str, options)
  # Drop the terminating semicolon. Whitespace or a newline after it is
  # tolerated so that the ";" cannot leak into the token list as a label.
  str = str.sub(/;\s*\z/, '')

  # Looked up once; the value is needed again for every unquoted label.
  parser = __get_option(:parser, options)

  if parser == :naive
    pieces = str.split(/([\(\)\,\:\[\]])/).map { |piece| piece.strip }
    pieces.reject! { |piece| piece.empty? }
    return pieces.map do |piece|
      /\A[\(\)\,\:\[\]]\z/ =~ piece ? piece.intern : piece
    end
  end

  tokens = []
  ss = StringScanner.new(str)

  until ss.eos?
    if ss.scan(/\s+/)
      # whitespace between tokens is skipped

    elsif ss.scan(/[\(\)\,\:\[\]]/)
      # '(' or ')' or ',' or ':' or '[' or ']'
      tokens.push ss.matched.intern

    elsif ss.scan(/\'/)
      # quoted_label
      # ''.dup keeps the accumulator mutable even under frozen string literals
      label = ''.dup
      # Each pass consumes text up to the next quote; the loop also ends when
      # no further quote exists (incomplete quoted_label).
      while ss.scan(/([^\']*)\'/)
        label.concat ss[1]
        # A second quote right away is an escaped quote; otherwise the
        # quote just consumed closed the label.
        break unless ss.scan(/\'/)
        label.concat ss.matched
      end
      unless ss.match?(/\s*[\(\)\,\:\[\]]/) || ss.match?(/\s*\z/)
        # label continues? (illegal, but try to rescue)
        label.concat ss.matched.lstrip if ss.scan(/[^\(\)\,\:\[\]]+/)
      end
      tokens.push label

    else
      # unquoted_label
      raw = ss.scan(/[^\(\)\,\:\[\]]+/)
      unless raw
        # Defensive fallback: take whatever is left so the loop always ends.
        raw = ss.rest
        ss.terminate
      end
      label = raw.strip.delete("\r\n")
      # unquoted underscore should be converted to blank
      label = label.tr('_', ' ') unless parser == :iroki
      tokens.push label unless label.empty?
    end
  end

  tokens
end