class Bio::Newick

Newick standard phylogenetic tree parser class.

This is an alpha version. Incompatible changes may be made frequently.

Constants

DELIMITER

delimiter of the entry

Edge

same as Bio::Tree::Edge

Node

same as Bio::Tree::Node

Attributes

entry_overrun[R]

string after this entry

options[R]

parser options (in some cases, options can be automatically set by the parser)

original_string[R]

original string before parsing

Public Class Methods

new(str, options = nil) click to toggle source

Creates a new Newick object. options for parsing can be set.

Available options:

:bootstrap_style

:traditional for traditional bootstrap style, :molphy for molphy style, :disabled to ignore bootstrap strings. For details of default actions, please read the notes below.

:parser

:naive for using the naive parser, compatible with BioRuby 1.1.0, which ignores quoted strings and does not convert underscores to spaces.

Notes for bootstrap style: Molphy-style bootstrap values may always be parsed, even if the options[:bootstrap_style] is set to :traditional or :disabled.

Note for default or traditional bootstrap style: By default, if all of the internal nodes' names are numeric and there are no NHX and no molphy-style bootstrap values, the names of internal nodes are regarded as bootstrap values. Set options[:bootstrap_style] to :disabled or :molphy to disable this feature (it is also disabled when at least one NHX tag exists).

   # File lib/bio/db/newick.rb
# Creates a new Newick object from +str+.
#
# Everything following the first ';' is cut off the stored string and
# kept separately as the entry overrun (nil when +str+ has no ';').
# +options+ is stored as given; an empty Hash is used when it is
# nil or false.
def initialize(str, options = nil)
  @original_string = str.sub(/\;(.*)/m, ';')
  # text captured after the first ';' by the substitution above
  @entry_overrun = Regexp.last_match(1)
  @options = options || {}
end

Public Instance Methods

reparse() click to toggle source

Re-parses the tree from the original string. Returns self. This method is useful after changing parser options.

    # File lib/bio/db/newick.rb
# Discards any cached tree, parses the original string again and
# returns self.
def reparse
  remove_instance_variable(:@tree) if instance_variable_defined?(:@tree)
  # calling the reader rebuilds and caches the tree
  self.tree
  self
end
tree() click to toggle source

Gets the tree. Returns a Bio::Tree object.

   # File lib/bio/db/newick.rb
# Returns the parsed tree, parsing the original string with the stored
# options on first use. The result is cached in @tree (even when it is
# nil), so later calls do not parse again.
def tree
  return @tree if defined?(@tree)
  @tree = __parse_newick(@original_string, @options)
end

Private Instance Methods

__get_option(key, options) click to toggle source

Gets an option.

    # File lib/bio/db/newick.rb
# Looks up the option +key+.
#
# The per-call +options+ Hash is consulted first; when it holds no
# truthy value for +key+ (or is nil), the value stored in @options is
# returned instead. Returns nil when neither source is available.
def __get_option(key, options)
  # a nil options argument is tolerated, just like a nil @options
  local = options ? options[key] : nil
  local || (@options ? @options[key] : nil)
end
__parse_newick(str, options = {}) click to toggle source

Parses newick formatted string.

    # File lib/bio/db/newick.rb
# Parses a Newick formatted string and builds a tree from it.
#
# str     :: string handed to __parse_newick_tokenize
# options :: per-call options, read through __get_option
#
# Returns a newly created Bio::Tree whose root is the first remaining node
# and whose options have been updated with @options.
#
# Raises ParseError ('unmatched parentheses') when a ')' has no open
# internal node to close, or when internal nodes are still open once
# all tokens are consumed.
#
# Side effect: when internal node names are taken as bootstrap values,
# @options[:bootstrap_style] and @options[:original_format] are both
# set to :traditional.
301 def __parse_newick(str, options = {})
302   # initializing: an implicit root node is always created first
303   root = Node.new
304   cur_node = root
305   edges = []
306   nodes = [ root ]
307   internal_nodes = []
308   node_stack = [] # parents of the node currently being filled
309   # preparation of tokens
310   ary = __parse_newick_tokenize(str, options)
311   previous_token = nil
312   # main loop
313   while token = ary.shift
314     #p token
315     case token
316     when :','
317       if previous_token == :',' or previous_token == :'(' then
318         # there is a leaf whose name is empty.
319         # the delimiter is re-queued behind an empty name, so the next
320         # iteration builds the leaf in the else branch below.
319         ary.unshift(token)
320         ary.unshift('')
321         token = nil
322       end
323     when :'('
324       node = Node.new
325       nodes << node
326       internal_nodes << node
327       node_stack.push(cur_node)
328       cur_node = node
329     when :')'
330       if previous_token == :',' or previous_token == :'(' then
331         # there is a leaf whose name is empty.
332         ary.unshift(token)
333         ary.unshift('')
334         token = nil
335       else
336         edge = Edge.new
337         leaf_tokens = __parse_newick_get_tokens_for_leaf(ary) # tokens after ')' describe the internal node being closed
338         token = nil
339         if leaf_tokens.size > 0 then
340           __parse_newick_leaf(leaf_tokens, cur_node, edge, options)
341         end
342         parent = node_stack.pop
343         raise ParseError, 'unmatched parentheses' unless parent
344         edges << Bio::Relation.new(parent, cur_node, edge)
345         cur_node = parent
346       end
347     else
348       leaf = Node.new
349       edge = Edge.new
350       ary.unshift(token)
351       leaf_tokens = __parse_newick_get_tokens_for_leaf(ary)
352       token = nil
353       __parse_newick_leaf(leaf_tokens, leaf, edge, options)
354       nodes << leaf
355       edges << Bio::Relation.new(cur_node, leaf, edge)
356     end #case
357     previous_token = token # nil whenever the branch above cleared token
358   end #while
359   raise ParseError, 'unmatched parentheses' unless node_stack.empty?
360   bsopt = __get_option(:bootstrap_style, options)
361   ofmt  = __get_option(:original_format, options)
362   unless bsopt == :disabled or bsopt == :molphy or 
363       ofmt == :nhx or ofmt == :molphy then
364     # If all of the internal node's names are numeric,
365     # the names are regarded as bootstrap values.
366     flag = false
367     internal_nodes.each do |inode|
368       if inode.name and !inode.name.to_s.strip.empty? then
369         if /\A[\+\-]?\d*\.?\d*\z/ =~ inode.name
370           flag = true
371         else
372           flag = false
373           break
374         end
375       end
376     end
377     if flag then
378       @options[:bootstrap_style] = :traditional
379       @options[:original_format] = :traditional
380       internal_nodes.each do |inode|
381         if inode.name then
382           inode.bootstrap_string = inode.name
383           inode.name = nil
384         end
385       end
386     end
387   end
388   # Sets nodes order numbers
389   nodes.each_with_index do |xnode, i|
390     xnode.order_number = i
391   end
392   # If the root implicitly prepared by the program is a leaf and
393   # there are no additional information for the edge from the root to
394   # the first internal node, the root is removed.
395   if rel = edges[-1] and rel.node == [ root, internal_nodes[0] ] and
396       rel.relation.instance_eval {
397       !defined?(@distance) and !defined?(@log_likelihood) and
398       !defined?(@width) and !defined?(@nhx_parameters) } and
399       edges.find_all { |x| x.node.include?(root) }.size == 1
400     nodes.shift
401     edges.pop
402   end
403   # Let the tree into instance variables
404   tree = Bio::Tree.new
405   tree.instance_eval {
406     @pathway.relations.concat(edges)
407     @pathway.to_list
408   }
409   tree.root = nodes[0]
410   tree.options.update(@options)
411   tree
412 end
__parse_newick_get_tokens_for_leaf(ary) click to toggle source

get tokens for a leaf

    # File lib/bio/db/newick.rb
# Removes and returns the leading tokens of +ary+ that belong to one
# leaf (or internal node) description.
#
# Tokens are shifted off +ary+ until it is exhausted or its first
# element is one of the structural symbols ',', ')' or '(' ; that
# symbol is left in +ary+.
def __parse_newick_get_tokens_for_leaf(ary)
  stoppers = [ :',', :')', :'(' ]
  collected = []
  loop do
    head = ary[0]
    break if !head or stoppers.include?(head)
    collected << ary.shift
  end
  collected
end
__parse_newick_leaf(leaf_tokens, node, edge, options) click to toggle source

Parses newick formatted leaf (or internal node) name.

    # File lib/bio/db/newick.rb
# Fills +node+ and +edge+ from the tokens describing one leaf or
# internal node. +leaf_tokens+ is consumed while parsing.
#
# Recognized layout: an optional name, an optional ':' followed by a
# distance string, and an optional '[' ... ']' section. The bracket
# section is handled according to the :original_format option:
# :nhx         :: passed to __parse_nhx
# :traditional :: ignored
# otherwise    :: content starting with "&&NHX" switches
#                 @options[:original_format] to :nhx and is passed to
#                 __parse_nhx; other non-empty content is stored as the
#                 node's bootstrap string and :original_format becomes
#                 :molphy unless already set.
#
# node.name is set to '' when no name was found. Always returns true.
def __parse_newick_leaf(leaf_tokens, node, edge, options)
  current = leaf_tokens.shift

  # a leading non-symbol token (possibly nil) is the node name
  unless current.kind_of?(Symbol)
    node.name = current
    current = leaf_tokens.shift
  end

  # ':' introduces the distance string
  if current == :':'
    current = leaf_tokens.shift
    unless current.kind_of?(Symbol)
      if current && !current.strip.empty?
        edge.distance_string = current
      end
      current = leaf_tokens.shift
    end
  end

  # '[' introduces NHX or molphy-style bootstrap information
  if current == :'['
    format = __get_option(:original_format, options)
    if format == :nhx
      # regarded as NHX string which might be broken
      __parse_nhx(leaf_tokens, node, edge)
    elsif format != :traditional
      head = leaf_tokens[0].to_s.strip
      if head.empty?
        # format cannot be determined from an empty first token
      elsif /\A\&\&NHX/ =~ head
        # NHX string: force NHX mode from now on
        @options[:original_format] = :nhx
        __parse_nhx(leaf_tokens, node, edge)
      else
        # Molphy-style bootstrap value; molphy mode only if undecided
        @options[:original_format] ||= :molphy
        collected = ''
        while (piece = leaf_tokens.shift) && piece != :']'
          collected.concat piece.to_s
        end
        node.bootstrap_string = collected
      end
    end
  end

  node.name ||= '' # compatibility for older BioRuby

  true
end
__parse_newick_tokenize(str, options) click to toggle source

splits string to tokens

    # File lib/bio/db/newick.rb
# Splits a Newick string into tokens.
#
# A trailing ';' is dropped first. The structural characters
# ( ) , : [ ] become Symbols; labels become Strings.
#
# With the :parser option set to :naive, the string is simply split at
# the structural characters and each piece is stripped; quotes and
# underscores are left untouched.
#
# Otherwise, following
# http://evolution.genetics.washington.edu/phylip/newick_doc.html :
# * a quoted label is ' string_of_printing_characters ', where two
#   single quotes stand for one single quote inside the label;
# * in an unquoted label, CR/LF are removed and underscores are
#   converted to blanks.
#
# Returns the Array of tokens.
def __parse_newick_tokenize(str, options)
  str = str.chop if str[-1..-1] == ';'

  if __get_option(:parser, options) == :naive
    pieces = str.split(/([\(\)\,\:\[\]])/)
    pieces.each { |piece| piece.strip! }
    pieces.reject! { |piece| piece.empty? }
    return pieces.collect { |piece|
      /\A([\(\)\,\:\[\]])\z/ =~ piece ? piece.intern : piece
    }
  end

  tokens = []
  scanner = StringScanner.new(str)

  until scanner.eos?
    # whitespace between tokens is skipped
    next if scanner.scan(/\s+/)

    if scanner.scan(/[\(\)\,\:\[\]]/)
      # '(' or ')' or ',' or ':' or '[' or ']'
      tokens.push scanner.matched.intern

    elsif scanner.scan(/\'/)
      # quoted_label
      label = ''
      loop do
        # stop on an incomplete quoted_label
        break unless scanner.scan(/([^\']*)\'/)
        label.concat scanner[1]
        # a second quote right after the closing one is a literal quote
        break unless scanner.scan(/\'/)
        label.concat scanner.matched
      end
      unless scanner.match?(/\s*[\(\)\,\:\[\]]/) or scanner.match?(/\s*\z/)
        # label continues? (illegal, but try to rescue)
        if scanner.scan(/[^\(\)\,\:\[\]]+/)
          label.concat scanner.matched.lstrip
        end
      end
      tokens.push label

    else
      # unquoted_label; falls back to the rest of the string when the
      # label pattern does not match
      raw = scanner.scan(/[^\(\)\,\:\[\]]+/)
      unless raw
        raw = scanner.rest
        scanner.terminate
      end
      text = raw.strip
      text.gsub!(/[\r\n]/, '')
      # unquoted underscore should be converted to blank
      text.gsub!(/\_/, ' ')
      tokens.push text unless text.empty?
    end
  end

  tokens
end
__parse_nhx(btokens, node, edge) click to toggle source

Parses NHX (New Hampshire eXtended) string

    # File lib/bio/db/newick.rb
# Parses the tokens of an NHX (New Hampshire eXtended) section and
# stores the values in +node+ and +edge+.
#
# btokens :: tokens following '['; a leading '&&NHX' token is removed
#            from the Array. Parsing stops at the first ']' symbol,
#            other Symbols (such as ':') are skipped.
# node    :: receives B (bootstrap_string), D (events), E (ec_number),
#            S (scientific_name), T (taxonomy_id); O and SO are stored
#            as integers and any other tag as a string in
#            node.nhx_parameters.
# edge    :: receives L (log_likelihood, as float), W (width, as
#            integer) and XB (edge.nhx_parameters[:XB]).
#
# Each string token has the form "TAG=value". Empty tokens carry no
# tag and are skipped. Always returns true.
def __parse_nhx(btokens, node, edge)
  btokens.shift if btokens[0] == '&&NHX'
  btokens.each do |str|
    break if str == :']'
    next if str.kind_of?(Symbol)
    tag, val = str.split(/\=/, 2)
    # an empty token (e.g. a quoted '') splits into nothing; without
    # this guard the fallback branch would call to_sym on nil
    next unless tag
    case tag
    when 'B'
      node.bootstrap_string = val
    when 'D'
      case val
      when 'Y'
        node.events.push :gene_duplication
      when 'N'
        node.events.push :speciation
      end
    when 'E'
      node.ec_number = val
    when 'L'
      edge.log_likelihood = val.to_f
    when 'S'
      node.scientific_name = val
    when 'T'
      node.taxonomy_id = val
    when 'W'
      edge.width = val.to_i
    when 'XB'
      edge.nhx_parameters[:XB] = val
    when 'O', 'SO'
      node.nhx_parameters[tag.to_sym] = val.to_i
    else # :Co, :SN, :Sw, :XN, and others
      node.nhx_parameters[tag.to_sym] = val
    end
  end #each
  true
end