class Bio::Newick
Newick
standard phylogenetic tree parser class.
This is an alpha version. Incompatible changes may be made frequently.
Constants
- DELIMITER
delimiter of the entry
- Edge
same as
Bio::Tree::Edge
- Node
same as
Bio::Tree::Node
Attributes
string after this entry
parser options (in some cases, options can be automatically set by the parser)
original string before parsing
Public Class Methods
Creates a new Newick
object. options for parsing can be set.
Available options:
:bootstrap_style
-
:traditional
for traditional bootstrap style, :molphy
for molphy style, :disabled
to ignore bootstrap strings. For details of the default behavior, please read the notes below. :parser
-
:naive
for using the naive parser, compatible with BioRuby 1.1.0, which ignores quoted strings and does not convert underscores to spaces.
Notes for bootstrap style: Molphy-style bootstrap values may always be parsed, even if the options[:bootstrap_style]
is set to :traditional
or :disabled
.
Note for default or traditional bootstrap style: By default, if all of the internal nodes' names are numeric and there are no NHX and no molphy-style bootstrap values, the names of internal nodes are regarded as bootstrap values. Set options[:bootstrap_style] = :disabled
or :molphy
to disable this feature (it is also disabled when at least one NHX tag exists).
# File lib/bio/db/newick.rb, line 71
#
# Creates a new Newick object from a Newick formatted string.
#
# Only the text up to and including the first semicolon is kept as the
# entry (@original_string). Whatever follows that semicolon is stored in
# @entry_overrun; it is nil when +str+ contains no semicolon.
#
# str::     Newick formatted string.
# options:: Hash of parser options, or nil for no options.
#
# The options hash is copied: the parser writes automatically detected
# settings (e.g. :original_format) into @options, and those writes must not
# leak into the caller's hash or into other objects built from the same hash.
def initialize(str, options = nil)
  @original_string = str.sub(/\;(.*)/m, ';')
  # nil when the substitution above did not match (no semicolon in str)
  @entry_overrun = Regexp.last_match(1)
  @options = options ? options.dup : {}
end
Public Instance Methods
Re-parses the tree from the original string. Returns self. This method is useful after changing parser options.
# File lib/bio/db/newick.rb, line 101
#
# Throws away the cached tree (if one exists), parses the original string
# again by calling #tree, and returns self.
def reparse
  remove_instance_variable(:@tree) if defined?(@tree)
  self.tree
  self
end
Gets the tree. Returns a Bio::Tree
object.
# File lib/bio/db/newick.rb, line 90
#
# Returns the parsed tree. The string is parsed on the first call and the
# result is cached in @tree; later calls return the cached object.
def tree
  return @tree if defined?(@tree)

  @tree = __parse_newick(@original_string, @options)
end
Private Instance Methods
Gets an option.
# File lib/bio/db/newick.rb, line 112
#
# Looks up a parser option.
#
# key::     option name.
# options:: Hash consulted first; may be nil.
#
# Returns options[key] when it is set to a truthy value, otherwise the
# value stored under the same key in @options (nil when @options is unset
# or has no such key).
def __get_option(key, options)
  # Tolerate a nil +options+ the same way a nil @options is tolerated.
  local_value = options && options[key]
  local_value || (@options ? @options[key] : nil)
end
Parses newick formatted string.
# File lib/bio/db/newick.rb, line 301
#
# Parses a Newick formatted string and returns a Bio::Tree.
#
# str::     Newick string to parse.
# options:: parser options; looked up through __get_option.
#
# Raises ParseError ('unmatched parentheses') when a closing parenthesis has
# no matching opening one, or when opening parentheses remain unclosed.
#
# Side effect: when internal node names are recognised as bootstrap values,
# @options[:bootstrap_style] and @options[:original_format] are set to
# :traditional.
def __parse_newick(str, options = {})
  # initializing
  root = Node.new
  cur_node = root
  edges = []
  nodes = [root]
  internal_nodes = []
  node_stack = []

  # preparation of tokens
  ary = __parse_newick_tokenize(str, options)
  previous_token = nil

  # main loop
  while (token = ary.shift)
    # true when the previous token opened a group or separated siblings,
    # i.e. a leaf is expected at this position
    leaf_expected = (previous_token == :',' || previous_token == :'(')

    case token
    when :','
      if leaf_expected
        # there is a leaf whose name is empty.
        ary.unshift('', token)
        token = nil
      end
    when :'('
      node = Node.new
      nodes << node
      internal_nodes << node
      node_stack.push(cur_node)
      cur_node = node
    when :')'
      if leaf_expected
        # there is a leaf whose name is empty.
        ary.unshift('', token)
        token = nil
      else
        edge = Edge.new
        leaf_tokens = __parse_newick_get_tokens_for_leaf(ary)
        token = nil
        unless leaf_tokens.empty?
          __parse_newick_leaf(leaf_tokens, cur_node, edge, options)
        end
        parent = node_stack.pop
        raise ParseError, 'unmatched parentheses' unless parent
        edges << Bio::Relation.new(parent, cur_node, edge)
        cur_node = parent
      end
    else
      leaf = Node.new
      edge = Edge.new
      # put the token back so that it is collected with the rest of the leaf
      ary.unshift(token)
      leaf_tokens = __parse_newick_get_tokens_for_leaf(ary)
      token = nil
      __parse_newick_leaf(leaf_tokens, leaf, edge, options)
      nodes << leaf
      edges << Bio::Relation.new(cur_node, leaf, edge)
    end
    previous_token = token
  end
  raise ParseError, 'unmatched parentheses' unless node_stack.empty?

  bsopt = __get_option(:bootstrap_style, options)
  ofmt = __get_option(:original_format, options)
  unless bsopt == :disabled || bsopt == :molphy ||
         ofmt == :nhx || ofmt == :molphy
    # If all of the (non-blank) internal node names are numeric,
    # the names are regarded as bootstrap values.
    named = internal_nodes.select do |inode|
      inode.name && !inode.name.to_s.strip.empty?
    end
    all_numeric = !named.empty? &&
                  named.all? { |inode| /\A[\+\-]?\d*\.?\d*\z/ =~ inode.name }
    if all_numeric
      @options[:bootstrap_style] = :traditional
      @options[:original_format] = :traditional
      internal_nodes.each do |inode|
        next unless inode.name
        inode.bootstrap_string = inode.name
        inode.name = nil
      end
    end
  end

  # Sets nodes order numbers
  nodes.each_with_index { |xnode, i| xnode.order_number = i }

  # If the root implicitly prepared by the program is a leaf and
  # there are no additional information for the edge from the root to
  # the first internal node, the root is removed.
  last_rel = edges[-1]
  if last_rel &&
     last_rel.node == [root, internal_nodes[0]] &&
     last_rel.relation.instance_eval {
       !defined?(@distance) and !defined?(@log_likelihood) and
         !defined?(@width) and !defined?(@nhx_parameters)
     } &&
     edges.count { |x| x.node.include?(root) } == 1
    nodes.shift
    edges.pop
  end

  # Let the tree into instance variables
  parsed_tree = Bio::Tree.new
  parsed_tree.instance_eval {
    @pathway.relations.concat(edges)
    @pathway.to_list
  }
  parsed_tree.root = nodes[0]
  parsed_tree.options.update(@options)
  parsed_tree
end
Gets tokens for a leaf.
# File lib/bio/db/newick.rb, line 292
#
# Removes tokens from the front of +ary+ and returns them as a new array.
# Collection stops, without consuming the token, at the first ',' ')' or
# '(' symbol, or when +ary+ has no more (truthy) tokens.
def __parse_newick_get_tokens_for_leaf(ary)
  collected = []
  loop do
    head = ary[0]
    break if !head || head == :',' || head == :')' || head == :'('
    collected.push(ary.shift)
  end
  collected
end
Parses newick formatted leaf (or internal node) name.
# File lib/bio/db/newick.rb, line 117
#
# Parses the tokens of one leaf (or internal node): an optional name, an
# optional ':' followed by a distance, and an optional '[' ... ']' comment.
#
# leaf_tokens:: token array for the node; consumed (shifted) while parsing.
# node::        receives name and, from the comment, bootstrap/NHX data.
# edge::        receives distance_string and, from the comment, NHX data.
# options::     parser options; looked up through __get_option.
#
# May set @options[:original_format] to :nhx or :molphy when the comment
# style is detected automatically. Always returns true.
def __parse_newick_leaf(leaf_tokens, node, edge, options)
  token = leaf_tokens.shift

  # name (any non-symbol token, including nil when there are no tokens)
  unless token.kind_of?(Symbol)
    node.name = token
    token = leaf_tokens.shift
  end

  # distance
  if token == :':'
    token = leaf_tokens.shift
    unless token.kind_of?(Symbol)
      edge.distance_string = token if token && !token.strip.empty?
      token = leaf_tokens.shift
    end
  end

  # bracketed comment
  if token == :'['
    btokens = leaf_tokens
    format = __get_option(:original_format, options)
    if format == :nhx
      # regarded as NHX string which might be broken
      __parse_nhx(btokens, node, edge)
    elsif format != :traditional
      # (:traditional simply ignores the comment)
      case btokens[0].to_s.strip
      when ''
        # not automatically determined
      when /\A\&\&NHX/
        # NHX string: force NHX mode
        @options[:original_format] = :nhx
        __parse_nhx(btokens, node, edge)
      else
        # Molphy-style bootstrap values;
        # switch to molphy mode if nothing has been determined yet
        @options[:original_format] ||= :molphy
        bstr = ''
        while (piece = btokens.shift) && piece != :']'
          bstr << piece.to_s
        end
        node.bootstrap_string = bstr
      end
    end
  end

  # Tokens left over without a bracketed comment are possibly a syntax
  # error; they are silently ignored.

  node.name ||= '' # compatibility for older BioRuby

  true
end
Splits a string into tokens.
# File lib/bio/db/newick.rb, line 210
#
# Splits a Newick string into tokens. The delimiters ( ) , : [ ] become
# Symbols; labels become Strings. A single trailing ';' is dropped first.
#
# With the :parser option set to :naive, the string is simply split on the
# delimiters and each piece is stripped; quotes and underscores are left
# untouched. Otherwise quoted labels are honoured ('' inside quotes is a
# literal single quote) and, in unquoted labels, CR/LF are removed and
# underscores are converted to blanks.
#
# Reference: http://evolution.genetics.washington.edu/phylip/newick_doc.html
#   quoted_label ==> ' string_of_printing_characters '
def __parse_newick_tokenize(str, options)
  str = str.chop if str[-1..-1] == ';'

  if __get_option(:parser, options) == :naive
    pieces = str.split(/([\(\)\,\:\[\]])/)
    pieces.each(&:strip!)
    return pieces.reject(&:empty?).map do |piece|
      /\A([\(\)\,\:\[\]])\z/ =~ piece ? piece.intern : piece
    end
  end

  # Normalizes an unquoted label: trims it, removes line breaks and
  # converts underscores to blanks.
  clean_unquoted = lambda do |raw|
    label = raw.strip
    label.gsub!(/[\r\n]/, '')
    label.gsub!(/\_/, ' ')
    label
  end

  tokens = []
  ss = StringScanner.new(str)

  until ss.eos?
    if ss.scan(/\s+/)
      # whitespace between tokens: do nothing

    elsif ss.scan(/[\(\)\,\:\[\]]/)
      # '(' or ')' or ',' or ':' or '[' or ']'
      tokens.push ss.matched.intern

    elsif ss.scan(/\'/)
      # quoted_label
      label = ''
      # Each pass reads up to the next quote; a doubled quote is a literal
      # quote and keeps the label going. An unterminated label ends the loop.
      while ss.scan(/([^\']*)\'/)
        label << ss[1]
        break unless ss.scan(/\'/)
        label << ss.matched
      end
      at_boundary = ss.match?(/\s*[\(\)\,\:\[\]]/) || ss.match?(/\s*\z/)
      # label continues? (illegal, but try to rescue)
      if !at_boundary && ss.scan(/[^\(\)\,\:\[\]]+/)
        label << ss.matched.lstrip
      end
      tokens.push label

    elsif ss.scan(/[^\(\)\,\:\[\]]+/)
      # unquoted_label
      label = clean_unquoted.call(ss.matched)
      tokens.push label unless label.empty?

    else
      # unquoted_label in end of string
      label = clean_unquoted.call(ss.rest)
      tokens.push label unless label.empty?
      ss.terminate
    end
  end

  tokens
end
Parses NHX (New Hampshire eXtended) string
# File lib/bio/db/newick.rb, line 172
#
# Parses the tokens of an NHX (New Hampshire eXtended) comment and stores
# the tag values into +node+ and +edge+.
#
# btokens:: tokens following '['. A leading '&&NHX' token is removed;
#           the remaining tokens are read but not consumed.
# node::    receives B, D, E, S, T, O, SO and all unrecognised tags.
# edge::    receives L, W and XB.
#
# Reading stops at the first ']' symbol. Other symbols (e.g. ':') and empty
# string tokens are skipped. Always returns true.
def __parse_nhx(btokens, node, edge)
  btokens.shift if btokens[0] == '&&NHX'
  btokens.each do |str|
    break if str == :']'
    next if str.kind_of?(Symbol)
    tag, val = str.split(/\=/, 2)
    # An empty token splits into [], leaving tag nil; there is nothing to
    # record, and tag.to_sym below would raise NoMethodError.
    next if tag.nil?
    case tag
    when 'B'
      node.bootstrap_string = val
    when 'D'
      case val
      when 'Y'
        node.events.push :gene_duplication
      when 'N'
        node.events.push :speciation
      end
    when 'E'
      node.ec_number = val
    when 'L'
      edge.log_likelihood = val.to_f
    when 'S'
      node.scientific_name = val
    when 'T'
      node.taxonomy_id = val
    when 'W'
      edge.width = val.to_i
    when 'XB'
      edge.nhx_parameters[:XB] = val
    when 'O', 'SO'
      node.nhx_parameters[tag.to_sym] = val.to_i
    else # :Co, :SN, :Sw, :XN, and others
      node.nhx_parameters[tag.to_sym] = val
    end
  end
  true
end