class ROCker

@author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com> @author Luis (Coto) Orellana @license artistic license 2.0 @update Sep-07-2015

@author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com> @author Luis (Coto) Orellana @license artistic license 2.0 @update Jun-08-2015

@author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com> @author Luis (Coto) Orellana @license artistic license 2.0 @update Sep-09-2015

Constants

R_BIN

Attributes

o[R]
[ Instance ]

Public Class Methods

CITATION(j = ' ') click to toggle source
# File lib/rocker.rb, line 38
def self.CITATION(j = ' ') @@CITATION.join(j) ; end
DATE() click to toggle source
# File lib/rocker.rb, line 37
def self.DATE; @@DATE ; end
VERSION() click to toggle source
# File lib/rocker.rb, line 36
def self.VERSION; @@VERSION ; end
default(k) click to toggle source
# File lib/rocker.rb, line 35
def self.default(k) @@DEFAULTS[k] ; end
defaults() click to toggle source
# File lib/rocker.rb, line 34
def self.defaults() @@DEFAULTS ; end
ebirest() click to toggle source
# File lib/rocker/step/build.rb, line 27
def self.ebirest() @@EBIREST ; end
has_build_gems?() click to toggle source
# File lib/rocker/step/build.rb, line 28
def self.has_build_gems?
   return @@HAS_BUILD_GEMS unless @@HAS_BUILD_GEMS.nil?
   @@HAS_BUILD_GEMS = true
   begin
      require "rubygems"
      require "restclient"
   rescue LoadError
      @@HAS_BUILD_GEMS = false
   end
   @@HAS_BUILD_GEMS
end
new(opts) click to toggle source
# File lib/rocker.rb, line 42
def initialize(opts)
  @o = ROCker.defaults
  opts.each{ |k,v| @o[k] = v }
  RInterface.R_BIN = opts[:r] unless opts[:r].nil?
end

Public Instance Methods

bash(cmd, err_msg=nil) click to toggle source
# File lib/rocker.rb, line 59
def bash(cmd, err_msg=nil)
  o = `#{cmd} 2>&1 && echo '{'`
  raise (err_msg.nil? ? "Error executing: #{cmd}\n\n#{o}" : err_msg) unless
    o[-2]=="{"
  true
end
blast2table(blast_f, table_f, aln, minscore) click to toggle source
[ Utilities ]
# File lib/rocker.rb, line 49
def blast2table(blast_f, table_f, aln, minscore)
  ifh = File.open(blast_f, 'r')
  ofh = File.open(table_f, 'w')
  while ln = ifh.gets
    bh = BlastHit.new(ln, aln)
    ofh.print bh.to_s if bh.bits >= minscore
  end
  ifh.close
  ofh.close
end
build!() click to toggle source
[ Build ]
# File lib/rocker/step/build.rb, line 109
def build!
   # Check requirements
   puts "Testing environment." unless @o[:q]
   {  searchcmd: :search, makedbcmd: :search,
      alignercmd: :aligner, alignerbin: :aligner,
      simulatorcmd: :simulator, simulatorbin: :simulator
   }.each_pair { |k,v| @o[k] = @o[k][@o[v]] if @o[k].is_a? Hash }
   @o[:nosearch]=true if @o[:nosimulate]
   raise "Unsatisfied requirements, please see the help message (-h)." unless
      ROCker.has_build_gems?
   protein_set = {}
   protein_set[:+] = ProteinSet.new(self,@o[:positive],@o[:posfile],@o[:aln])
   protein_set[:-] = ProteinSet.new(self,@o[:negative],@o[:negfile])
   raise "-p, -P, or -a are mandatory." if protein_set[:+].empty?
   raise "-o/--baseout is mandatory." if @o[:baseout].nil?
   if protein_set[:+].size==1 and not @o[:noaln]
      warn "\nWARNING: Positive set contains only one sequence, turning " +
         "off alignment.\n\n"
      @o[:noaln] = true
   end
   unless @o[:nosimulate]
      self.bash("#{@o[:simulatorbin]} --version",
         "--simulator-bin must be executable. Is Grinder installed?") if
         @o[:simulator]==:grinder
   end
   unless @o[:noaln]
      self.bash("#{@o[:alignerbin]} -version",
         "--aligner-bin must be executable. Is Muscle installed?") if
         @o[:aligner]==:muscle
      self.bash("#{@o[:alignerbin]} --version",
         "--aligner-bin must be executable. Is ClustalOmega installed?") if
         @o[:aligner]==:clustalo
   end
   unless @o[:nosearch]
      self.bash("#{@o[:searchbins]}makeblastdb -version",
         "--search-bins must contain executables. Is BLAST+ installed?") if
         @o[:search]==:blast
      self.bash("#{@o[:searchbins]}diamond --help",
         "--search-bins must contain executables. Is DIAMOND installed?") if
         @o[:search]==:diamond
   end

   # Download genes
   puts "Downloading gene data." unless @o[:q]
   ref_file = @o[:baseout] + ".ref.fasta"
   if not protein_set[:+].aln.nil?
      puts "  * reusing aligned sequences as positive set." unless @o[:q]
      protein_set[:+].get_from_aln(ref_file, protein_set[:+].aln)
      @o[:noaln] = true
   elsif @o[:reuse] and File.size? ref_file
      puts "  * reusing positive set: #{ref_file}." unless @o[:q]
   else
      puts "  * downloading #{protein_set[:+].size} sequence(s) in " +
         "positive set." unless @o[:q]
      $stderr.puts "   # #{protein_set[:+].ids}" if @o[:debug]
      protein_set[:+].download(ref_file)
   end
   [:+, :-].each do |set|
      unless protein_set[set].empty?
         puts "  * linking genomes from #{protein_set[set].size} " +
            "[#{set.to_s}] sequence(s)." unless @o[:q]
         $stderr.puts "   # #{protein_set[set].ids}" if @o[:debug]
         protein_set[set].get_genomes!
      end
   end
   raise "No genomes associated with the positive set." if
      protein_set[:+].genomes.empty?
   genome_set = {:+ => GenomeSet.new(self, protein_set[:+].genomes),
      :- => GenomeSet.new(self, protein_set[:-].genomes)}
   
   # Locate genes
   puts "Analyzing genome data." unless @o[:q]
   coords_file = @o[:baseout] + ".src.coords"
   if @o[:reuse] and File.size? coords_file
      puts "  * reusing coordinates: #{coords_file}." unless @o[:q]
      c = JSON.parse File.read(coords_file), {symbolize_names:true}
      positive_coords = c[:positive_coords]
      negative_coords = c[:negative_coords]
      genome_set[:+].taxa = c[:taxa_pos]
      genome_set[:-].taxa = c[:taxa_neg]
   else
      all_coords = {}
      [:+, :-].each do |set_type|
         all_coords[set_type] = {}
         next if genome_set[set_type].empty?
         thrs = [@o[:thr], genome_set[set_type].size].min
         puts "  * downloading and parsing #{genome_set[set_type].size} " +
            "GFF3 document(s) in #{thrs} threads." unless @o[:q]
         $stderr.puts "   # Looking for translations: " +
            "#{protein_set[set_type].tranids_dump}" if @o[:debug]
         $stderr.puts "   # Looking into: #{genome_set[set_type].ids}" if
            @o[:debug]
         # Launch threads
         thr_obj = []
         (0 .. (thrs-1)).each do |thr_i|
            ids_to_parse = []
            (0 .. (genome_set[set_type].size-1)).each do |i|
               ids_to_parse << protein_set[set_type].genomes[i] if
                  (i % thrs) == thr_i
            end
            json_file = @o[:baseout] + ".src.coords." + thr_i.to_s + ".tmp"
            thr_obj << json_file
            fork do
               get_coords_from_gff3(ids_to_parse, protein_set[set_type],
                  thr_i, json_file)
            end
         end
         # Combine results
         Process.waitall
         thr_obj.each do |t|
            raise "Thread failed without error trace: #{t}" unless
               File.exist? t
            o = JSON.parse(File.read(t), {symbolize_names:true})
            o[:coords].each_pair do |k,v|
               all_coords[set_type][ k ] ||= []
               all_coords[set_type][ k ] += v
            end
            File.unlink t
         end
      end # [:+, :-].each
      positive_coords = all_coords[:+]
      negative_coords = all_coords[:-]
      # Select one genome per taxon
      unless @o[:pertaxon].nil?
         puts "  Selecting genomes by #{@o[:pertaxon]}." unless @o[:q]
         [:+,:-].each{ |set| genome_set[set].choose_genomes! @o[:pertaxon] }
      end
      # Save coordinates and taxa
      ofh = File.open(coords_file, "w")
      ofh.print JSON.pretty_generate({
         positive_coords:positive_coords,
         negative_coords:negative_coords,
         taxa_pos:genome_set[:+].taxa,
         taxa_neg:genome_set[:-].taxa})
      ofh.close
   end # if @o[:reuse] and File.size? coords_file ... else
   unless @o[:pertaxon].nil?
      puts "  Using " +
         [:+,:-].map{ |set| genome_set[set].size }.reduce(:+).to_s +
         " genome(s) after filtering by #{@o[:pertaxon]}." unless @o[:q]
   end
   found = protein_set[:+].in_coords(positive_coords)
   raise "Cannot find the genomic location of any provided sequence." if
      found.nil?
   missing = protein_set[:+].ids - found
   unless missing.empty?
      warn "\nWARNING: Cannot find genomic location of #{missing.size} " +
         "sequence(s) #{missing.join(",")}.\n\n"
      unless @o[:keep_unlinked]
         del_genomes = protein_set[:+].remove_genomes_by_prot_id!(missing)
         warn "WARNING: Ignoring #{del_genomes.size} genomes with missing " +
            "coordinates #{del_genomes.join(",")}.\n\n"
         genome_set[ :+ ].delete!(del_genomes)
      end
   end
   
   # Download genomes
   genome_set[:all] = GenomeSet.new(self,
      genome_set[ :+ ].ids + genome_set[ :- ].ids)
   genomes_file = @o[:baseout] + ".src.fasta"
   if @o[:reuse] and File.size? genomes_file
      puts "  * reusing existing file: #{genomes_file}." unless @o[:q]
   else
      raise "Something went wrong: Nothing left to download." if
         genome_set[:all].empty?
      puts "  * downloading " + genome_set[:all].size.to_s +
         " genome(s) in FastA." unless @o[:q]
      $stderr.puts "   # #{genome_set[:all].ids}" if @o[:debug]
      genome_set[:all].download genomes_file
   end
   
   # Generate metagenome
   unless @o[:nosimulate]
      puts "Generating in silico metagenome" unless @o[:q]
      if @o[:reuse] and File.size? @o[:baseout] + ".mg.fasta"
         puts "  * reusing existing file: #{@o[:baseout]}.mg.fasta." unless
            @o[:q]
      else
         all_src = File.readlines("#{@o[:baseout]}.src.fasta"
            ).select{ |l| l =~ /^>/ }.size
         thrs = [@o[:thr], all_src].min
         thr_obj = []
         seqs_per_thr = (all_src.to_f/thrs).ceil
         thrs = (all_src.to_f/seqs_per_thr).ceil
         puts "  * simulating metagenomes and tagging positive reads in " +
            thrs.to_s + " threads." unless @o[:q]
         $stderr.puts "   # #{positive_coords}" if @o[:debug]
         (0 .. (thrs-1)).each do |thr_i|
            output = @o[:baseout] + ".mg.fasta.#{thr_i.to_s}"
            thr_obj << output
            fork do
               seqs_a = thr_i*seqs_per_thr + 1
               seqs_b = [seqs_a + seqs_per_thr - 1, all_src].min
               # Create sub-fasta
               ofh = File.open("#{@o[:baseout]}.src.fasta.#{thr_i.to_s}","w")
               ifh = File.open("#{@o[:baseout]}.src.fasta","r")
               seq_i = 0
               while l = ifh.gets
                  seq_i+=1 if l =~ /^>/
                          break if seq_i > seqs_b
                  ofh.print l if seq_i >= seqs_a
               end
               ifh.close
               ofh.close

               # Run simulator (except if the temporal file is already
               # there and can be reused)
               ofile = "#{@o[:baseout]}.mg.tmp.#{thr_i.to_s}"
               bash sprintf(@o[:simulatorcmd], @o[:simulatorbin],
                  "#{@o[:baseout]}.src.fasta.#{thr_i.to_s}",
                  @o[:seqdepth]*@o[:readlen].to_f, @o[:readlen],
                  File.basename(ofile), File.dirname(ofile)) unless
                     @o[:reuse] and
                     File.size? @o[:baseout] +
                     ".mg.tmp.#{thr_i.to_s}-reads.fa"

               # Tag positive and negative reads
               puts "  * tagging reads [thread #{thr_i}]." unless
                  @o[:q]
               ifh = File.open(@o[:baseout] + ".mg.tmp.#{thr_i}-reads.fa",
                  "r")
               ofh = File.open(@o[:baseout] + ".mg.fasta.#{thr_i}", "w")
               while l = ifh.gets
                  if l =~ /^>/
                     rd = %r{
                        ^>(?<id>\d+)\s
                        reference=[A-Za-z_]+[\|:]
                        (?<genome_id>[A-Za-z0-9_]+)(?:\|.*)?\s
                        position=(?<comp>complement\()?(?<from>\d+)\.\.
                        (?<to>\d+)\)?\s
                     }x.match(l)
                     raise "Cannot parse simulated read's defline, are " +
                        "you using Grinder?: #{l}" if rd.nil?
                     positive = false
                     positive_coords[rd[:genome_id].to_sym] ||= []
                     positive_coords[rd[:genome_id].to_sym].each do |gn|
                        left  = rd[:to].to_i - gn[:from]
                        right = gn[:to] - rd[:from].to_i
                        if (left*right >= 0) and
                              ([left, right].min >= @o[:minovl])
                           positive = true
                           break
                        end
                     end
                     negative = false
                     negative_coords[rd[:genome_id].to_sym] ||= []
                     negative_coords[rd[:genome_id].to_sym].each do |gn|
                        left  = rd[:to].to_i - gn[:from]
                        right = gn[:to] - rd[:from].to_i
                        if (left*right >= 0) and
                              ([left, right].min >= @o[:minovl])
                           negative = true
                           break
                        end
                     end
                     l = ">#{thr_i.to_s}_#{rd[:id]}" +
                        "#{positive ? "@%" : (negative ? "@$" : "")} " +
                        "ref=#{rd[:genome_id]}:#{rd[:from]}..#{rd[:to]}" +
                        "#{(rd[:comp]=="complement(") ? "-" : "+"}\n"
                  end
                  ofh.print l
               end
               ofh.close
               ifh.close
            end # fork
         end # (1 .. thrs).each
         Process.waitall
         # Concatenate results
         ofh = File.open(@o[:baseout] + ".mg.fasta", "w")
         thr_obj.each do |t|
            raise "Thread failed without error trace: #{t}" unless
               File.exist? t
            ifh = File.open(t, "r")
            while l = ifh.gets
               ofh.print l
            end
            ifh.close
            File.unlink t
         end
         ofh.close
      end
   end # unless @o[:nosimulate]
   
   # Align references
   unless @o[:noaln]
      puts "Aligning reference set." unless @o[:q]
      if @o[:reuse] and File.size? "#{@o[:baseout]}.ref.aln"
         puts "  * reusing existing file: #{@o[:baseout]}.ref.aln." unless
            @o[:q]
      else
         bash(sprintf(@o[:alignercmd],
            @o[:alignerbin], "#{@o[:baseout]}.ref.fasta",
            "#{@o[:baseout]}.ref.aln", @o[:thr]))
         puts "  +--\n  | IMPORTANT NOTE: Manually checking the alignment " +
            "before\n  | the 'compile' step is *strongly* encouraged.\n  " +
            "+--\n" unless @o[:q]
      end
   end
   
   # Run similarity search
   unless @o[:nosearch]
      puts "Running similarity search." unless @o[:q]
      if @o[:reuse] and File.size? "#{@o[:baseout]}.ref.blast"
         puts "  * reusing existing file: #{@o[:baseout]}.ref.blast." unless
            @o[:q]
      else
         puts "  * preparing database." unless @o[:q]
         bash(sprintf(@o[:makedbcmd],
            @o[:searchbins], "prot", "#{@o[:baseout]}.ref.fasta",
            "#{@o[:baseout]}.ref"))
         puts "  * running similarity search." unless @o[:q]
         bash(sprintf(@o[:searchcmd],
            @o[:searchbins], "blastx", "#{@o[:baseout]}.mg.fasta",
            "#{@o[:baseout]}.ref", "#{@o[:baseout]}.ref.blast", @o[:thr]))
      end
   end
   
   # Clean
   unless @o[:noclean]
      puts "Cleaning." unless @o[:q]
      sff  = %w{.src.xml .src.fasta}
      sff += %w{.mg.tmp-reads.fa .mg.tmp-ranks.txt} unless @o[:nosimulate]
      sff += %w{.ref.phr .ref.pin .ref.psq} unless @o[:nosearch]
      sff.each { |sf| File.unlink @o[:baseout] + sf if
         File.exist? @o[:baseout] + sf }
   end
end
compile!() click to toggle source
[ Compile ]
# File lib/rocker/step/compile.rb, line 13
def compile!
   raise "-a/--alignment is mandatory." if @o[:aln].nil?
   raise "-a/--alignment must exist." unless File.exist? @o[:aln]
   raise "-l/--readlen is mandatory." if @o[:readlen].nil?
   if @o[:table].nil?
      raise "-T/--table is mandatory unless -b is provided." if
         @o[:blast].nil? or not File.exist? @o[:blast]
      @o[:table] = "#{@o[:blast]}.table"
   else
      @o[:reuse] = true
   end
   raise "-b/--blast is mandatory unless -t exists." if
      @o[:blast].nil? and not File.exist? @o[:table]
   raise "-k/--rocker is mandatory." if @o[:rocker].nil?

   puts "Testing environment." unless @o[:q]
   bash("echo '' | #{@o[:r]} --vanilla",
      "-r/--path-to-r must be executable. Is R installed?")
   bash("echo \"library('pROC')\" | #{@o[:r]} --vanilla",
      "Please install the 'pROC' library for R first.")

   puts "Reading files." unless @o[:q]
   puts "  * loading alignment: #{@o[:aln]}." unless @o[:q]
   aln = Alignment.new
   aln.read_fasta @o[:aln]
   
   if @o[:reuse] and File.exist? @o[:table]
      puts "  * reusing existing file: #{@o[:table]}." unless @o[:q]
   else
      puts "  * generating table: #{@o[:table]}." unless @o[:q]
      blast2table(@o[:blast], @o[:table], aln, @o[:minscore])
   end

   puts "Analyzing data." unless @o[:q]
   puts "  * computing windows." unless @o[:q]
   data = ROCData.new(@o[:table], aln, @o[:win])
   data.nucl = @o[:nucl]
   if @o[:refine]
      puts "  * refining windows." unless @o[:q]
      warn "Insufficient hits to refine results." unless
         data.refine! @o[:table]
   end
   puts "  * saving ROCker file: #{@o[:rocker]}." unless @o[:q]
   data.save(@o[:rocker], l: @o[:readlen])
end
correct_bs(bh, readlen, exp_readlen, max_corr, penalty) click to toggle source
# File lib/rocker/step/filter.rb, line 60
def correct_bs(bh, readlen, exp_readlen, max_corr, penalty)
  bs = bh.bits
  return bs if @o[:lencorr].nil? or readlen.nil? or readlen >= exp_readlen
  bits_per_aa = bs.to_f / readlen
  if penalty.nil? or penalty > 1.0
    extra = [0.0, readlen * (max_corr + 1.0) - exp_readlen].max
    max_tri = max_corr * readlen * bits_per_aa / 2
    tanTheta = max_corr > 0.0 ? bits_per_aa / (max_corr * readlen) : 0.0
    extra_tri = extra * extra * tanTheta / 2
    return bs + (max_tri - extra_tri)
  else
    miss = [exp_readlen - readlen, max_corr * readlen].min
    return bs + (bits_per_aa * miss * (1.0 - penalty))
  end
end
ebiFetch(db, ids, format, outfile=nil) click to toggle source
# File lib/rocker/step/build.rb, line 54
def ebiFetch(db, ids, format, outfile=nil)
   url = "#{ROCker.ebirest}/dbfetch/dbfetch/" +
      "#{db.to_s}/#{ids.join(",")}/#{format.to_s}"
   $stderr.puts url
   self.restcall url, outfile
end
filter!(data=nil) click to toggle source
[ Filter ]
# File lib/rocker/step/filter.rb, line 13
def filter!(data=nil)
  raise "-k/--rocker is mandatory." if @o[:rocker].nil?
  raise "-x/--query-blast is mandatory." if @o[:qblast].nil?
  raise "-o/--out-blast is mandatory." if @o[:oblast].nil?
  
  # Read ROCker file
  if data.nil?
    puts "Loading ROCker file: #{@o[:rocker]}." unless @o[:q]
    data = ROCData.new @o[:rocker]
  end
  readlengths = {}
  exp_readlen = 0
  unless @o[:lencorr].nil?
    @o[:lencorr_max] ||= 0.4
    raise "Unsigned length in model, please re-compile model to use -L" if
      data.signatures[:l].nil?
    exp_readlen = data.signatures[:l].to_i
    File.open(@o[:lencorr], 'r') do |fh|
      k = nil
      fh.each_line do |ln|
        if ln =~ /^>(\S+)/
          k = $1
          readlengths[k] = 0
        else
          readlengths[k] += ln.chomp.size
        end
      end
    end
  end

  # Filter similarity search
  puts "Filtering similarity search: #{@o[:qblast]}." unless @o[:q]
  oh = File.open(@o[:oblast], 'w')
  File.open(@o[:qblast], 'r') do |ih|
    ih.each_line do |ln|
      bh = BlastHit.new(ln, data.aln)
      next if bh.sbj.nil? # <- When the hit is not against a known target
      bs = @o[:lencorr].nil? ? bh.bits :
        correct_bs(bh, readlengths[bh.qry], exp_readlen,
          @o[:lencorr_max], @o[:lencorr_pen])
      oh.print ln if not(bh.sfrom.nil?) and
        bs >= data.win_at_col(bh.midpoint).thr
    end
  end
  oh.close
end
get_coords_from_gff3(genome_ids, pset, thread_id, json_file) click to toggle source
# File lib/rocker/step/build.rb, line 60
def get_coords_from_gff3(genome_ids, pset, thread_id, json_file)
   coords = {}
   i = 0
   genome_ids.each do |genome_id|
      print "  * scanning #{(i+=1).ordinalize} genome out of " +
         "#{genome_ids.size} in first thread.  \r" if
         thread_id==0 and not @o[:q]
      genome_file = @o[:baseout] + ".src." + genome_id + ".gff3"
      if @o[:reuse] and File.size? genome_file
         ifh = File.open(genome_file, "r")
         doc = ifh.readlines.grep(/^[^#]/)
         ifh.close
      else
         genome_file=nil unless @o[:noclean]
         doc = ebiFetch(:embl, [genome_id], :gff3,
            genome_file).split("\n").grep(/^[^#]/)
         if doc.first =~ /ERROR 12 No entries found/
            doc = ebiFetch(:emblconexp, [genome_id], :gff3,
               genome_file).split("\n").grep(/^[^#]/)
         end
      end
      doc.each do |ln|
         next if ln =~ /^#/
         r = ln.chomp.split /\t/
         next if r.size < 9
         prots = r[8].split(/;/).grep(
            /^db_xref=UniProtKB[\/A-Za-z-]*:/){ |xref| xref.split(/:/)[1] }
         p = prots.compact.select{ |id| pset.ids.include? id }.first
         trans = r[8].split(/;/).grep(
            /^protein_id=/){ |pid| pid.split(/=/)[1] }
         t = trans.select{ |id| pset.tranids.include? id }.first
         next if p.nil? and t.nil?
         coords[ r[0].to_sym ] ||= []
         coords[ r[0].to_sym ] << {
            prot_id:        p,
            tran_id:        t,
            from:   r[3].to_i,
            to:     r[4].to_i,
            strand: r[6]
         }
      end
   end
   print "\n" if thread_id==0 and not @o[:q]
   ofh = File.open(json_file, "w")
   ofh.print({coords:coords}.to_json)
   ofh.close
end
plot!() click to toggle source
[ Search ]
# File lib/rocker/step/plot.rb, line 16
def plot!
   raise "-k/--rocker is mandatory." if o[:rocker].nil?
   if @o[:table].nil?
      raise "-t/--table is mandatory unless -b is provided." if
         @o[:blast].nil?
      @o[:table] = "#{@o[:blast]}.table"
   end
   raise "-b/--blast is mandatory unless -t exists." if
      @o[:blast].nil? and not File.exist? @o[:table]

   puts "Testing environment." unless @o[:q]
   bash "echo '' | #{@o[:r]} --vanilla", "-r/--path-to-r must be " +
      "executable. Is R installed?"

   # Source files
   puts "Reading files." unless @o[:q]
   puts "  * loding ROCker file: #{@o[:rocker]}." unless @o[:q]
   data = ROCData.new @o[:rocker]
   if File.exist? @o[:table]
      puts "  * reusing existing file: #{@o[:table]}." unless @o[:q]
   else
      puts "  * generating table: #{@o[:table]}." unless @o[:q]
      blast2table(@o[:blast], @o[:table], data.aln, @o[:minscore])
   end

   # Matches (middle panel)
   puts "Plotting matches." unless @o[:q]
   extra = @o[:gformat]=="pdf" ? "" : ", units='in', res=300"
   @o[:gout] ||= "#{@o[:rocker]}.#{@o[:gformat]}"
   data.rrun "#{@o[:gformat]}('#{@o[:gout]}', #{@o[:width]}, " +
      "#{@o[:height]}#{extra});"
   data.rrun "layout(c(2,1,3), heights=c(2-1/#{data.aln.size},3,1));"
   some_thr = data.load_table! @o[:table], @o[:sbj], @o[:minscore]
   data.rrun "par(mar=c(0,4,0,0.5)+.1);"
   data.rrun "plot(1, t='n', xlim=c(0.5,#{data.aln.cols}+0.5), " +
      "ylim=range(x$V4)+c(-0.04,0.04)*diff(range(x$V4)), xlab='', " +
      "ylab='Bit score', xaxs='i', xaxt='n');"
   data.rrun "noise <- runif(ncol(x),-.2,.2)"
   data.rrun "hit.col <- ifelse(x$V5==1, " +
      "rgb(0,0,.5,#{@o[:transparency] ? ".2" : "1"}), " +
      "rgb(.5,0,0,#{@o[:transparency] ? ".2" : "1"}))"
   data.rrun "hit.col[ x$V5==-1 ] <- " +
      "rgb(0.722,0.722,0,#{@o[:transparency] ? ".2" : "1"})" if
      @o[:tag_negatives]
   data.rrun "arrows(x0=x$V2, x1=x$V3, y0=x$V4+noise, lty=1, col=hit.col, " +
      "length=0);"
   data.rrun "points(x$V6, x$V4+noise, col=hit.col, pch=19, cex=1/4);"
   
   # Windows (middle panel)
   puts "Plotting windows." unless @o[:q]
   if some_thr
      data.rrun "arrows(x0=w$V1, x1=w$V2, y0=w$V5, lwd=2, length=0)"
      data.rrun "arrows(x0=w$V2[-nrow(w)], x1=w$V1[-1], " +
         "y0=w$V5[-nrow(w)], y1=w$V5[-1], lwd=2, length=0)"
   end
   data.rrun "legend('bottomright', legend=c('Match span'," +
      "'Match mid-point','Reference (+)'," +
      "#{"'Reference (-)'," if @o[:tag_negatives]}'Non-reference'), " +
      "lwd=c(1,NA,1,1,1), pch=c(NA,19,19,19,19), ncol=5, bty='n', " +
      "col=c('black','black','darkblue'," +
      "#{"rgb(.722,.722,0)," if @o[:tag_negatives]}'darkred'))"

   # Alignment (top panel)
   puts "Plotting alignment." unless @o[:q]
   data.rrun "par(mar=c(0,4,0.5,0.5)+0.1);"
   data.rrun "plot(1, t='n', xlim=c(0,#{data.aln.cols}), " +
      "ylim=c(1,#{data.aln.seqs.size}), xlab='', ylab='Alignment', " +
      "xaxs='i', xaxt='n', yaxs='i', yaxt='n', bty='n');"
   i = 0
   data.rrun "clr <- rainbow(26, v=1/2, s=3/4);" if @o[:color]
   data.aln.seqs.values.each do |s|
      color = (s.aln.split(//).map do |c|
         c=="-" ? "'grey80'" :
            (@o[:sbj].include?(s.id) ? "'red'" :
            (@o[:color] ? "clr[#{c.ord-64}]" :
            "'black'"))
      end.join(","))
      data.rrun "rect((1:#{data.aln.cols-1})-0.5, " +
         "rep(#{i}, #{data.aln.cols-1}), (1:#{data.aln.cols-1})+0.5, " +
         "rep(#{i+1}, #{data.aln.cols-1}), col=c(#{color}), border=NA);"
      i += 1
   end

   # Statistics (bottom panel)
   puts "Plotting statistics." unless @o[:q]
   data.rrun "par(mar=c(5,4,0,0.5)+.1);"
   data.rrun "plot(1, t='n', xlim=c(0,#{data.aln.cols}), " +
      "ylim=c(#{@o[:ylim].nil? ? (@o[:impact] ? "-2,.1" : "50,100") :
         @o[:ylim]}), xlab='Alignment position (amino acids)', " +
      "ylab='Precision',xaxs='i');"
   if some_thr
      sn = data.rrun "100*sum(w$tp)/(sum(w$tp)+sum(w$fn))", :float
      sp = data.rrun "100*sum(w$tn)/(sum(w$fp)+sum(w$tn))", :float
      ac = data.rrun "100*(sum(w$tp)+sum(w$tn))/(sum(w$p)+sum(w$n))", :float
      unless @o[:q]
         puts "  * sensitivity: #{sn}%"
         puts "  * specificity: #{sp}%"
         puts "  * accuracy: #{ac}%"
      end
      data.rrun "pos <- (w$V1+w$V2)/2"
      if @o[:impact]
         data.rrun "lines(pos[!is.na(w$specificity)], " +
            "(w$specificity[!is.na(w$specificity)]-#{sp})*" +
               "w$tp[!is.na(w$specificity)]/sum(w$tp), " +
            "col='darkred', lwd=2, t='o', cex=1/3, pch=19);"
         data.rrun "lines(pos[!is.na(w$sensitivity)], " +
            "(w$sensitivity[!is.na(w$sensitivity)]-#{sn})*" +
               "w$tn[!is.na(w$sensitivity)]/sum(w$tn), " +
            "col='darkgreen', lwd=2, t='o', cex=1/3, pch=19);"
         data.rrun "lines(pos[!is.na(w$accuracy)], " +
            "(w$accuracy[!is.na(w$accuracy)]-#{ac})*" +
               "(w$tp+w$tn)[!is.na(w$accuracy)]/sum(c(w$tp, w$tn)), " +
            "col='darkblue', lwd=2, t='o', cex=1/3, pch=19);"
      else
         data.rrun "lines(pos[!is.na(w$specificity)], " +
            "w$specificity[!is.na(w$specificity)], col='darkred', " +
            "lwd=2, t='o', cex=1/3, pch=19);"
         data.rrun "lines(pos[!is.na(w$sensitivity)], " +
            "w$sensitivity[!is.na(w$sensitivity)], col='darkgreen', " +
            "lwd=2, t='o', cex=1/3, pch=19);"
         data.rrun "lines(pos[!is.na(w$accuracy)], " +
            "w$accuracy[!is.na(w$accuracy)], col='darkblue', lwd=2, " +
            "t='o', cex=1/3, pch=19);"
      end
   end
   data.rrun "legend('bottomright', " +
      "legend=c('Specificity','Sensitivity','Accuracy'), lwd=2, " +
      "col=c('darkred','darkgreen','darkblue'), ncol=3, bty='n')"
   data.rrun "dev.off();"
end
restcall(url, outfile=nil) click to toggle source
[ Utilities ]
# File lib/rocker/step/build.rb, line 41
def restcall(url, outfile=nil)
   $stderr.puts "   # Calling: #{url}" if @o[:debug]
   response = RestClient::Request.execute(method: :get, url: url,
      timeout: 600)
   raise "Unable to reach EBI REST client, error code " +
      response.code.to_s + "." unless response.code == 200
   unless outfile.nil?
      ohf = File.open(outfile, "w")
      ohf.print response.to_s
      ohf.close
   end
   response.to_s
end
search!() click to toggle source
[ Search ]
# File lib/rocker/step/search.rb, line 15
def search!
   raise "-k/--rocker is mandatory." if @o[:rocker].nil?
   raise "-i/--query is mandatory." if @o[:query].nil?
   @o[:lencorr] = @o[:query] unless @o[:lencorr].nil?

   # Check requirements
   puts "Testing environment." unless @o[:q]
   @o[:searchcmd] = @o[:searchcmd][@o[:search]] if @o[:searchcmd].is_a? Hash
   @o[:makedbcmd] = @o[:makedbcmd][@o[:search]] if @o[:makedbcmd].is_a? Hash
   self.bash "#{@o[:searchbins]}makeblastdb -version", "--search-bins must contain executables. Is BLAST+ installed?" if @o[:search]==:blast
   self.bash "#{@o[:searchbins]}diamond --help", "--search-bins must contain executables. Is DIAMOND installed?" if @o[:search]==:diamond

   # Run similarity search
   Dir.mktmpdir do |dir|
      @o[:qblast] ||= "#{dir}/blast"
      puts "Loading ROCker file: #{@o[:rocker]}." unless @o[:q]
      data = ROCData.new @o[:rocker]
      puts "Running similarity search." unless @o[:q]
      puts "  * preparing database." unless @o[:q]
      ofh = File.new("#{dir}/ref.fasta", "w")
      ofh.print data.aln.to_seq_s
      ofh.close
      bash sprintf(@o[:makedbcmd], @o[:searchbins], 'prot', "#{dir}/ref.fasta", "#{dir}/ref")
      puts "  * running similarity search." unless @o[:q]
      bash sprintf(@o[:searchcmd], @o[:searchbins], 'blastx', @o[:query], "#{dir}/ref", @o[:qblast], @o[:thr])
      self.filter! data
   end
end