class TreeBunchStarter

Attributes

cluster_batch[R]

Given an initial alignment, it creates an initial bunch of ML trees in the bunch_0 directory, and it should log its results.

Public Class Methods

new(opts) click to toggle source
# File lib/starter.rb, line 54
# Stores the caller's settings and derives every path used by the other
# methods. Nothing is created on disk here; only path strings are computed.
#
# opts keys read: :phylip, :base_dir, :prev_dir, :update_id (defaults to 0),
# :num_threads (defaults to 0) and :cluster_batch.
def initialize(opts)
  # settings taken straight from the caller
  @phylip      = opts[:phylip]
  @base_dir    = opts[:base_dir]
  @prev_dir    = opts[:prev_dir]
  @update_id   = opts[:update_id] || 0
  @num_threads = opts[:num_threads] || 0

  # working directories, all located below base_dir
  @alignment_dir           = File.join(@base_dir, "alignments")
  @parsimony_trees_dir     = File.join(@base_dir, "parsimony_trees")
  @parsimony_trees_out_dir = File.join(@parsimony_trees_dir, "output")
  @ml_trees_dir            = File.join(@base_dir, "ml_trees")
  @bestML_trees_dir        = File.join(@base_dir, "best_ml_trees")

  # where the alignment of this update is stored
  @phylip_updated = File.join(@alignment_dir, "phy_#{@update_id}")

  # default tree counts: keep half of the parsimony trees as best ML trees
  @num_parsi_trees  = 4
  @num_bestML_trees = @num_parsi_trees / 2

  # result files
  @CAT_topology_bunch       = File.join(@ml_trees_dir, "CAT_topology_bunch.nw")
  @CAT_topology_bunch_order = File.join(@ml_trees_dir, "CAT_topology_bunch_order.txt")
  @bestML_bunch             = File.join(@bestML_trees_dir, "best_bunch.nw")

  # the best bunch of the previous round is only known when prev_dir was given
  unless @prev_dir.nil?
    @prev_bestML_bunch = File.join(@prev_dir, "best_ml_trees", "best_bunch.nw")
  end

  @cluster_batch = opts[:cluster_batch]
  @logpath = File.join(@base_dir, "starter.log")
end

Public Instance Methods

add_update(opts) click to toggle source
# File lib/starter.rb, line 107
# Runs one update round that starts from the best ML trees of the previous
# round.
#
# opts may carry :num_parsi_trees and :num_bestML_trees (falling back to the
# instance defaults) and :exp_name, which is passed on to the cluster job.
#
# Returns the string "cluster" when the work is handed over to the cluster
# batch script, otherwise whatever score_ML_trees returns.
#
# Raises a RuntimeError when the previous best bunch is not available or when
# more best ML trees are requested than parsimony trees will be produced.
# Every exception is logged through logput and then re-raised.
def add_update(opts)
  check_options(opts)
  begin
    num_parsi_trees = opts[:num_parsi_trees] || @num_parsi_trees
    num_bestML_trees = opts[:num_bestML_trees] || @num_bestML_trees
    # prepare the parsimony starting trees
    # @prev_bestML_bunch is nil when no prev_dir was configured; report that as
    # "not ready" too instead of letting File.exist?(nil) raise a TypeError
    unless @prev_bestML_bunch && File.exist?(@prev_bestML_bunch)
      raise "prev bunch not ready #{@prev_bestML_bunch}"
    end
    last_best_bunch = NewickFile.new(@prev_bestML_bunch)
    # TODO this save_each_newick_as should be wrapped around sth. that considers a list of outliers and
    # prunes out the outliers before saving as "prev_parsi_tree" to make it compatible with the next round
    last_best_bunch.save_each_newick_as(File.join(@parsimony_trees_dir, 'prev_parsi_tree'), "nw")
    prev_trees = Dir.entries(@parsimony_trees_dir).select { |f| f =~ /^prev_parsi_tree/ }
    if num_bestML_trees > num_parsi_trees * prev_trees.size
      raise "#bestML trees (#{num_bestML_trees}) is too higher than trees from previous round"
    end
    # a missing (nil) cluster batch script means "run locally", same as an empty one
    if @cluster_batch.nil? || @cluster_batch.empty?
      logput "****** Start update no #{@update_id} ********"
      logput "step 1 of 3 : Parsimony starting trees #{num_parsi_trees} each\n----"
      update_parsimony_trees(num_parsi_trees, prev_trees)
      # raxml light phase (2/3): use threads here? / each tree could be computed in parallel
      logput "step 2 of 3 : ML trees\n----"
      generate_ML_trees(@parsimony_trees_out_dir, @phylip_updated)
      # raxml scoring of initial bunch (needs to be done after step 2, or not?)
      logput "step 3 of 3 : Score bunch of initial ML trees and select best #{num_bestML_trees}\n----"
      best_lh = score_ML_trees(num_bestML_trees, @phylip_updated)
      logput "Bunch of initial ML trees #{num_bestML_trees}, ready at #{@bestML_bunch}\n----"
      best_lh
    else
      logput "Exp #{opts[:exp_name]}, your cluster will take care of this update no #{@update_id}. stay tuned"
      c = CycleController.new(:iter => @update_id,
                              :phy => @phylip_updated,
                              :num_parsi_trees => num_parsi_trees,
                              :num_ptrees => num_parsi_trees * prev_trees.size,
                              :num_bestML_trees => num_bestML_trees,
                              :base_dir => @base_dir,
                              :exp_name => opts[:exp_name]
                             )
      c.run_as_batch(@cluster_batch, @logpath)
      "cluster"
    end
  rescue Exception => e
    # Exception (not only StandardError) is caught so that every failure ends
    # up in the log; it is always re-raised unchanged
    logput(e, true)
    raise
  end
end
generate_initial_bunch(opts) click to toggle source
# File lib/starter.rb, line 170
# Produces the initial bunch of ML trees from the starting alignment.
#
# opts may carry :num_parsi_trees and :num_bestML_trees (falling back to the
# instance defaults) and :exp_name, which is passed on to the cluster job.
#
# Returns the string "cluster" when the work is handed over to the cluster
# batch script, otherwise whatever score_ML_trees returns.
#
# Raises a RuntimeError when more best ML trees than parsimony trees are
# requested. Every exception is logged through logput and then re-raised.
def generate_initial_bunch(opts)
  check_options(opts)
  begin
    num_parsi_trees = opts[:num_parsi_trees] || @num_parsi_trees
    num_bestML_trees = opts[:num_bestML_trees] || @num_bestML_trees
    if num_bestML_trees > num_parsi_trees
      raise "#bestML trees (#{num_bestML_trees}) cant be higher than #parsi trees(#{num_parsi_trees})"
    end
    # phases 2 and 3 can be done in parallel
    # a missing (nil) cluster batch script means "run locally", same as an empty one
    if @cluster_batch.nil? || @cluster_batch.empty?
      logput "Start generating initial bunch"
      logput "step 1 of 3 : Parsimony starting trees #{num_parsi_trees}\n----"
      generate_parsimony_trees(num_parsi_trees)
      # raxml light phase (2/3): use threads here? / each tree could be computed in parallel
      logput "step 2 of 3 : ML trees\n----"
      generate_ML_trees(@parsimony_trees_dir, @phylip)
      # raxml scoring of initial bunch (needs to be done after step 2, or not?)
      logput "step 3 of 3 : Score bunch of initial ML trees and select best #{num_bestML_trees}\n----"
      best_lh = score_ML_trees(num_bestML_trees, @phylip)
      logput "Bunch of initial ML trees #{num_bestML_trees}, ready at #{@bestML_bunch}\n----"
      best_lh
    else
      logput "your cluster will take care of this start #{@update_id}. Stay tuned"
      c = CycleController.new(:iter => 0,
                              :phy => @phylip,
                              :num_parsi_trees => num_parsi_trees,
                              :num_bestML_trees => num_bestML_trees,
                              :base_dir => @base_dir,
                              :exp_name => opts[:exp_name]
                             )
      c.run_as_batch(@cluster_batch, @logpath)
      "cluster"
    end
  rescue Exception => e
    # Exception (not only StandardError) is caught so that every failure ends
    # up in the log; it is always re-raised unchanged
    logput(e, true)
    raise
  end
end
logput(msg, error = false) click to toggle source
# File lib/starter.rb, line 78
# Writes msg to the log file at @logpath and echoes it on standard output.
# The Logger is created on first use and kept in @logger.
#
# msg   - the message (or exception) to record
# error - when truthy the entry is logged at error level, otherwise at info
def logput(msg, error = false)
  @logger ||= Logger.new(@logpath)
  level = error ? :error : :info
  @logger.public_send(level, msg)
  puts msg
end
ready?() click to toggle source
# File lib/starter.rb, line 87
# Makes sure all working directories exist and copies the alignment in place.
#
# Each missing directory is created; each directory that already exists is
# logged and makes the result false. For update 0 the alignment is copied into
# the alignment directory, for later updates it is copied to @phylip_updated.
#
# Returns true only when none of the directories existed beforehand.
def ready?
  fresh = true
  [@alignment_dir, @parsimony_trees_dir, @parsimony_trees_out_dir, @ml_trees_dir, @bestML_trees_dir].each do |dir|
    if File.exist?(dir)
      logput "Exists #{dir}"
      fresh = false
    else
      FileUtils.mkdir_p dir
      logput "Created #{dir}"
    end
  end

  first_round = (@update_id == 0)
  destination = first_round ? @alignment_dir : @phylip_updated
  unless first_round
    logput "Copying new update alignment (not expanding) from #{@phylip} to #{@phylip_updated}"
  end
  FileUtils.cp @phylip, destination
  fresh
end
search_std(num_gamma_trees = nil) click to toggle source
# File lib/starter.rb, line 153
# Runs a standard GAMMA ML search from scratch on @phylip, writing into
# @ml_trees_dir.
#
# num_gamma_trees - number of trees to search with; 1 is used when nil
#
# Returns the text that follows the last "tree" in the
# "Final GAMMA-based Score of best ..." line of the search's stdout file
# (leading whitespace is kept as found).
#
# Raises a RuntimeError when that line is not present in the output.
def search_std(num_gamma_trees = nil)
  # resolve the default once so the search options and the log lines agree
  num_trees = num_gamma_trees || 1
  search_opts = {
    :phylip => @phylip,
    :outdir => @ml_trees_dir,
    :num_gamma_trees => num_trees,
    :stderr => File.join(@ml_trees_dir, "err"),
    :stdout => File.join(@ml_trees_dir, "info"),
    :name => "std_GAMMA_search"
  }
  search_opts[:num_threads] = @num_threads if @num_threads.to_i > 0
  r = RaxmlGammaSearch.new(search_opts)
  logput "Start ML search from scratch with #{num_trees} trees"
  r.run
  # File.readlines closes the file again, unlike File.open(...).readlines
  score_line = File.readlines(r.stdout).find { |l| l =~ /^Final GAMMA-based Score of best/ }
  raise "no final GAMMA-based score found in #{r.stdout}" if score_line.nil?
  best_lh = score_line.chomp.split("tree").last
  logput "Done ML search from scratch with #{num_trees} trees"
  best_lh
end

Private Instance Methods

append_to_file(file, str) click to toggle source
# File lib/starter.rb, line 304
# Appends str as a line at the end of file; the file is opened in "a+" mode
# and closed again when the block finishes.
def append_to_file(file, str)
  File.open(file, "a+") { |io| io.puts(str) }
end
check_options(opts) click to toggle source
# File lib/starter.rb, line 209
# Logs a notice for every key of opts that is not one of the supported
# options. Unknown keys are only reported, never rejected.
def check_options(opts)
  known = [:num_parsi_trees, :num_bestML_trees, :exp_name, :cycle_batch_script]
  opts.keys.each do |name|
    next if known.include?(name)
    logput "Option #{name} is unknwon"
  end
end
generate_ML_trees(starting_trees_dir, phylip) click to toggle source
# File lib/starter.rb, line 251
# Runs one RaxmlLight search per parsimony starting tree found in
# starting_trees_dir (entries whose name starts with "RAxML_parsimonyTree").
# After each search the first newick string of its result file is appended to
# @CAT_topology_bunch and the search name to @CAT_topology_bunch_order.
#
# starting_trees_dir - directory holding the starting trees
# phylip             - alignment passed to every search
#
# Raises a RuntimeError when the directory holds no starting tree.
def generate_ML_trees(starting_trees_dir, phylip)
  candidates = Dir.entries(starting_trees_dir).grep(/^RAxML_parsimonyTree/)
  raise "no starting trees available" if candidates.empty?
  total = candidates.size
  candidates.each.with_index(1) do |tree_file, position|
    # ideally we just submit here to the cluster...and start phase 3 when all are done
    suffix = tree_file.split("parsimonyTree.").last
    run_opts = {
      :phylip => phylip,
      :outdir => @ml_trees_dir,
      :flags => " -D ", # default to a RF convergence criterion
      :starting_newick => File.join(starting_trees_dir, tree_file),
      :stderr => File.join(@ml_trees_dir, "err#{suffix}"),
      :stdout => File.join(@ml_trees_dir, "info#{suffix}"),
      :name => "starting_parsimony_tree_#{suffix}"
    }
    run_opts[:num_threads] = @num_threads if @num_threads.to_i > 0
    search = RaxmlLight.new(run_opts)
    progress = "#{tree_file} (#{position} of #{total})"
    logput "Start ML search for #{progress}"
    search.run
    logput "Done ML search for #{progress}"
    # add the result to the bunch
    result_path = File.join(search.outdir, "RAxML_result.#{search.name}")
    append_to_file(@CAT_topology_bunch, NewickFile.new(result_path).newickStrings[0].str)
    append_to_file(@CAT_topology_bunch_order, search.name)
  end
end
generate_parsimony_trees(num_parsi_trees) click to toggle source
# File lib/starter.rb, line 217
# Runs Parsimonator on @phylip to build num_parsi_trees parsimony trees for
# the initial bunch; output, stderr and stdout all go below
# @parsimony_trees_dir. Progress is reported through logput.
def generate_parsimony_trees(num_parsi_trees)
  logput "Starting parsimony with #{num_parsi_trees} trees"
  target_dir = @parsimony_trees_dir
  runner = Parsimonator.new(
    :phylip => @phylip,
    :num_trees => num_parsi_trees,
    :outdir => target_dir,
    :stderr => File.join(target_dir, "err"),
    :stdout => File.join(target_dir, "info"),
    :name => "parsimony_initial"
  )
  logput "Start computing parsimony trees of initial bunch"
  runner.run
  logput "Done with parsimony trees of initial bunch"
end
score_ML_trees(num_bestML_trees, phylip) click to toggle source
# File lib/starter.rb, line 277
# Scores the trees collected in @CAT_topology_bunch with GammaScorer and
# writes the selected ones to @bestML_bunch.
#
# The score lines ("<tree id> -<likelihood>") are read from the
# "RAxML_info.<scorer name>" file in @ml_trees_dir and used in file order; the
# first num_bestML_trees ids are selected and the first line supplies the
# returned likelihood.
# NOTE(review): this relies on the scorer listing the trees best-first - confirm.
#
# num_bestML_trees - how many trees to keep
# phylip           - alignment used for scoring
#
# Returns the likelihood (as a String) found on the first score line.
# Raises a RuntimeError when the info file holds no score line.
def score_ML_trees(num_bestML_trees, phylip)
  logput "Starting scoring of ML trees"
  scorer_opts = {
    :phylip => phylip,
    :outdir => @ml_trees_dir,
    :starting_newick => @CAT_topology_bunch,
    :stderr => File.join(@ml_trees_dir, "err_scores"),
    :stdout => File.join(@ml_trees_dir, "info_scores"),
    :name => "SCORES"
  }
  scorer_opts[:num_threads] = @num_threads if @num_threads.to_i > 0
  scorer = GammaScorer.new(scorer_opts)
  scorer.run
  logput "Done scoring of ML trees, selecting the best #{num_bestML_trees}..."
  rank_file = File.join(@ml_trees_dir, "RAxML_info.#{scorer.name}")
  # the decimal point is escaped so that only real "<id> -<float>" lines match;
  # File.readlines closes the file again, unlike File.open(...).readlines
  lh_lines = File.readlines(rank_file).select { |l| l =~ /^[0-9]+ -[0-9]+\.[0-9]+$/ }
  raise "no likelihood scores found in #{rank_file}" if lh_lines.empty?
  rank_id = lh_lines.map { |l| l.split.first }
  best_lh = lh_lines.first.split.last
  # the tree id is the (0-based) line index into the topology bunch
  newick_bunch = File.readlines(@CAT_topology_bunch)
  File.open(@bestML_bunch, "w") do |f|
    rank_id[0...num_bestML_trees].each_with_index do |newick_id, i|
      f.puts newick_bunch[newick_id.to_i]
      logput "#{i+1}: Selected tree with id #{newick_id}"
    end
  end
  best_lh
end
update_parsimony_trees(num_parsi_trees, trees) click to toggle source
# File lib/starter.rb, line 232
# Runs Parsimonator once for every tree file name in trees, using that file
# (looked up in @parsimony_trees_dir) as starting newick on the updated
# alignment. Results, stderr and stdout go to @parsimony_trees_out_dir.
#
# num_parsi_trees - number of trees requested from each Parsimonator run
# trees           - file names of the starting trees
def update_parsimony_trees(num_parsi_trees, trees)
  total = trees.size
  trees.each.with_index(1) do |start_tree, position|
    logput "Starting new parsimony tree with #{start_tree} trees"
    out_dir = @parsimony_trees_out_dir
    runner = Parsimonator.new(
      :phylip => @phylip_updated,
      :num_trees => num_parsi_trees,
      :newick => File.join(@parsimony_trees_dir, start_tree),
      :outdir => out_dir,
      :stderr => File.join(out_dir, "err_#{start_tree}"),
      :stdout => File.join(out_dir, "info_#{start_tree}"),
      :name => "u#{@update_id}_#{start_tree}"
    )
    logput "Start computing parsimony trees of #{start_tree}, #{position} of #{total}"
    runner.run
    logput "run with options #{runner.ops}"
    logput "Done with parsimony trees of #{start_tree}, #{position} of #{total}"
  end
end