class CRB_Blast::CRB_Blast

Attributes

missed[RW]
query_is_prot[RW]
query_name[RW]
query_results[RW]
reciprocal_hits[R]
reciprocals[RW]
target_is_prot[RW]
target_name[RW]
target_results[RW]
working_dir[RW]

Public Class Methods

new(query, target, output=nil) click to toggle source
# File lib/crb-blast/crb-blast.rb, line 29
def initialize query, target, output=nil
  raise IOError.new("File not found #{query}") if !File.exist?(query)
  raise IOError.new("File not found #{target}") if !File.exist?(target)
  @query = File.expand_path(query)
  @target = File.expand_path(target)
  if output.nil?
    #@working_dir = File.expand_path(File.dirname(query)) # no trailing /
    @working_dir = "."
  else
    @working_dir = File.expand_path(output)
    mkcmd = "mkdir #{@working_dir}"
    if !Dir.exist?(@working_dir)
      puts mkcmd
      mkdir = Cmd.new(mkcmd)
      mkdir.run
      if !mkdir.status.success?
        raise RuntimeError.new("Unable to create output directory")
      end
    end
  end
  name = "#{File.basename(query)}_#{File.basename(target)}_reciprocals.txt"
  @reciprocal_hits = File.join(@working_dir, name)
  @makedb_path = which('makeblastdb')
  raise 'makeblastdb was not in the PATH' if @makedb_path.empty?
  @blastn_path = which('blastn')
  raise 'blastn was not in the PATH' if @blastn_path.empty?
  @tblastn_path = which('tblastn')
  raise 'tblastn was not in the PATH' if @tblastn_path.empty?
  @blastx_path = which('blastx')
  raise 'blastx was not in the PATH' if @blastx_path.empty?
  @blastp_path = which('blastp')
  raise 'blastp was not in the PATH' if @blastp_path.empty?
  @makedb_path = @makedb_path.first
  @blastn_path = @blastn_path.first
  @tblastn_path = @tblastn_path.first
  @blastx_path = @blastx_path.first
  @blastp_path = @blastp_path.first
end

Public Instance Methods

clear_memory() click to toggle source
# File lib/crb-blast/crb-blast.rb, line 519
def clear_memory
  # running lots of jobs at the same time was keeping a lot of stuff in
  # memory that you might not want so this empties out those big hashes.
  @query_results = nil
  @target_results = nil
end
find_reciprocals() click to toggle source

fills @reciprocals with strict reciprocal hits from the blast results

# File lib/crb-blast/crb-blast.rb, line 399
def find_reciprocals
  if File.exist?(@reciprocal_hits)
    # puts "reciprocal output already exists"
  else
    @reciprocals = Hash.new
    @missed = Hash.new
    @evalues = []
    @longest = 0
    hits = 0
    @query_results.each_pair do |query_id, list_of_hits|
      list_of_hits.each_with_index do |target_hit, query_index|
        if @target_results.has_key?(target_hit.target)
          list_of_hits_2 = @target_results[target_hit.target]
          list_of_hits_2.each_with_index do |query_hit2, target_index|
            if query_index == 0 && target_index == 0 &&
               query_id == query_hit2.target
              e = target_hit.evalue.to_f
              e = 1e-200 if e==0
              e = -Math.log10(e)
              @reciprocals[query_id] ||= []
              @reciprocals[query_id] << target_hit
              hits += 1
              @longest = target_hit.alnlen  if target_hit.alnlen > @longest
              @evalues << {:e => e, :length => target_hit.alnlen}
            elsif query_id == query_hit2.target
              if !@missed.key?(query_id)
                @missed[query_id] = []
              end
              @missed[query_id] << target_hit
            end
          end
        end
      end
    end
  end
  return hits
end
find_secondaries() click to toggle source

Learns the evalue cutoff based on the length of the sequence Finds hits that have a lower evalue than this cutoff

# File lib/crb-blast/crb-blast.rb, line 439
def find_secondaries

  if File.exist?(@reciprocal_hits)
    # puts "reciprocal output already exists"
  else
    length_hash = Hash.new
    fitting = Hash.new
    File.open("evalues_data", "w") do |io|
      @evalues.each do |h|
        length_hash[h[:length]] = [] if !length_hash.key?(h[:length])
        length_hash[h[:length]] << h
        io.write "#{h[:length]}\t#{h[:e]}\n"
      end
    end

    (10..@longest).each do |centre|
      e = 0
      count = 0
      s = centre*0.1
      s = s.to_i
      s = 5 if s < 5
      (-s..s).each do |side|
        if length_hash.has_key?(centre+side)
          length_hash[centre+side].each do |point|
            e += point[:e]
            count += 1
          end
        end
      end
      if count>0
        mean = e/count
        if fitting[centre-1]
          if fitting[centre-1] > mean # monotonic fitting
            fitting[centre] = fitting[centre-1]
          else
            fitting[centre] = mean
          end
        else
          fitting[centre] = mean
        end
      end
    end
    # output fitting data
    File.open("fitting_data", "w") do |io|
      fitting.each do |centre, mean|
        io.write "#{centre}\t#{mean}\n"
      end
    end
    #
    hits = 0
    @missed.each_pair do |id, list|
      list.each do |hit|
        l = hit.alnlen.to_i
        e = hit.evalue
        e = 1e-200 if e==0
        e = -Math.log10(e)
        if fitting.has_key?(l)
          if e >= fitting[l]
            if !@reciprocals.key?(id)
              @reciprocals[id] = []
              found = false
              @reciprocals[id].each do |existing_hit|
                if existing_hit.query == hit.query &&
                  existing_hit.target == hit.target
                 found = true
                end
              end
              if !found
                @reciprocals[id] << hit
                hits += 1
              end
            end
          end
        end
      end
    end
  end
  return hits
end
has_reciprocal?(contig) click to toggle source
# File lib/crb-blast/crb-blast.rb, line 574
def has_reciprocal? contig
  return true if @reciprocals.has_key?(contig)
  return false
end
load_outputs() click to toggle source

Load the two BLAST output files and store the hits in a hash

# File lib/crb-blast/crb-blast.rb, line 362
def load_outputs
  if File.exist?(@reciprocal_hits)
    # puts "reciprocal output already exists"
  else
    @query_results = Hash.new
    @target_results = Hash.new
    q_count=0
    t_count=0
    if !File.exists?("#{@output1}")
      raise RuntimeError.new("can't find #{@output1}")
    end
    if !File.exists?("#{@output2}")
      raise RuntimeError.new("can't find #{@output2}")
    end
    if File.exists?("#{@output1}") and File.exists?("#{@output2}")
      File.open("#{@output1}").each_line do |line|
        cols = line.chomp.split("\t")
        hit = Hit.new(cols, @query_is_prot, @target_is_prot)
        @query_results[hit.query] = [] if !@query_results.has_key?(hit.query)
        @query_results[hit.query] << hit
        q_count += 1
      end
      File.open("#{@output2}").each_line do |line|
        cols = line.chomp.split("\t")
        hit = Hit.new(cols, @target_is_prot, @query_is_prot)
        @target_results[hit.query] = [] if !@target_results.has_key?(hit.query)
        @target_results[hit.query] << hit
        t_count += 1
      end
    else
      raise "need to run blast first"
    end
  end
  [q_count, t_count]
end
makedb() click to toggle source

makes a blast database from the query and the target

# File lib/crb-blast/crb-blast.rb, line 71
def makedb
  # only scan the first few hundred entries
  n = 100
  # check if the query is a nucl or prot seq
  query_file = Bio::FastaFormat.open(@query)
  count_p=0
  count=0
  query_file.take(n).each do |entry|
    count_p += 1 if entry.isProt?
    count += 1
  end
  if count_p > count*0.9
    @query_is_prot = true
  else
    @query_is_prot = false
  end

  # check if the target is a nucl or prot seq
  target_file = Bio::FastaFormat.open(@target)
  count_p=0
  count=0
  target_file.take(n).each do |entry|
    count_p += 1 if entry.isProt?
    count += 1
  end
  if count_p > count*0.9
    @target_is_prot = true
  else
    @target_is_prot = false
  end
  # construct the output database names
  @query_name = File.basename(@query).split('.')[0..-2].join('.')
  @target_name = File.basename(@target).split('.')[0..-2].join('.')

  # check if the databases already exist in @working_dir
  make_query_db_cmd = "#{@makedb_path} -in #{@query}"
  make_query_db_cmd << " -dbtype nucl " if !@query_is_prot
  make_query_db_cmd << " -dbtype prot " if @query_is_prot
  make_query_db_cmd << " -title #{query_name} "
  make_query_db_cmd << " -out #{@working_dir}/#{query_name}"
  db_query = "#{query_name}.nsq" if !@query_is_prot
  db_query = "#{query_name}.psq" if @query_is_prot
  if !File.exists?("#{@working_dir}/#{db_query}")
    make_db = Cmd.new(make_query_db_cmd)
    make_db.run
    if !make_db.status.success?
      msg = "BLAST Error creating database:\n" +
            make_db.stdout + "\n" +
            make_db.stderr
      raise RuntimeError.new(msg)
    end
  end

  make_target_db_cmd = "#{@makedb_path} -in #{@target}"
  make_target_db_cmd << " -dbtype nucl " if !@target_is_prot
  make_target_db_cmd << " -dbtype prot " if @target_is_prot
  make_target_db_cmd << " -title #{target_name} "
  make_target_db_cmd << " -out #{@working_dir}/#{target_name}"

  db_target = "#{target_name}.nsq" if !@target_is_prot
  db_target = "#{target_name}.psq" if @target_is_prot
  if !File.exists?("#{@working_dir}/#{db_target}")
    make_db = Cmd.new(make_target_db_cmd)
    make_db.run
    if !make_db.status.success?
      raise RuntimeError.new("BLAST Error creating database")
    end
  end
  @databases = true
  [@query_name, @target_name]
end
run(evalue=1e-5, threads=1, split=true) click to toggle source
# File lib/crb-blast/crb-blast.rb, line 542
def run evalue=1e-5, threads=1, split=true
  makedb
  run_blast evalue, threads, split
  load_outputs
  find_reciprocals
  find_secondaries
end
run_blast(evalue, threads, split) click to toggle source

Construct BLAST output file name and run blast with multiple chunks or with multiple threads

@param [Float] evalue The evalue cutoff to use with BLAST @param [Integer] threads The number of threads to run @param [Boolean] split If the fasta files should be split into chunks

# File lib/crb-blast/crb-blast.rb, line 149
def run_blast(evalue, threads, split)
  if @databases
    @output1 = "#{@working_dir}/#{query_name}_into_#{target_name}.1.blast"
    @output2 = "#{@working_dir}/#{target_name}_into_#{query_name}.2.blast"
    if @query_is_prot
      if @target_is_prot
        bin1 = "#{@blastp_path} "
        bin2 = "#{@blastp_path} "
      else
        bin1 = "#{@tblastn_path} "
        bin2 = "#{@blastx_path} "
      end
    else
      if @target_is_prot
        bin1 = "#{@blastx_path} "
        bin2 = "#{@tblastn_path} "
      else
        bin1 = "#{@blastn_path} "
        bin2 = "#{@blastn_path} "
      end
    end
    if split and threads > 1
      run_blast_with_splitting evalue, threads, bin1, bin2
    else
      run_blast_with_threads evalue, threads, bin1, bin2
    end
    return true
  else
    return false
  end
end
run_blast_with_splitting(evalue, threads, bin1, bin2) click to toggle source

Run BLAST by splitting the input into multiple chunks and using 1 thread for each chunk

@param [Float] evalue The evalue cutoff to use with BLAST @param [Integer] threads The number of threads to run @param [String] bin1 @param [String] bin2

# File lib/crb-blast/crb-blast.rb, line 234
def run_blast_with_splitting evalue, threads, bin1, bin2
  # puts "running blast by splitting input into #{threads} pieces"
  if !File.exist?(@output1)
    blasts=[]
    files = split_input(@query, threads)
    threads = [threads, files.length].min
    files.threach(threads) do |thread|
      cmd1 = "#{bin1} -query #{thread} -db #{@working_dir}/#{@target_name} "
      cmd1 << " -out #{thread}.blast -evalue #{evalue} "
      cmd1 << " -outfmt \"6 std qlen slen\" "
      if bin1=="blastn"
        cmd1 << " -dust no "
      elsif bin1=="blastx" or bin1=="blastp" or bin1=="tblastn"
        cmd1 << " -seg no "
      end
      cmd1 << " -soft_masking false "
      cmd1 << " -max_target_seqs 50 "
      cmd1 << " -num_threads 1"
      if !File.exists?("#{thread}.blast")
        blast1 = Cmd.new(cmd1)
        blast1.run
        if !blast1.status.success?
          raise RuntimeError.new("BLAST Error:\n#{blast1.stderr}")
        end
      end
      blasts << "#{thread}.blast"
    end
    cat_cmd = "cat "
    cat_cmd << blasts.join(" ")
    cat_cmd << " > #{@output1}"
    catting = Cmd.new(cat_cmd)
    catting.run
    if !catting.status.success?
      raise RuntimeError.new("Problem catting files:\n#{catting.stderr}")
    end
    files.each do |file|
      File.delete(file) if File.exist?(file)
    end
    blasts.each do |b|
      File.delete(b) # delete intermediate blast output files
    end
  end

  if !File.exist?(@output2)
    blasts=[]
    files = split_input(@target, threads)
    threads = [threads, files.length].min
    files.threach(threads) do |thread|
      cmd2 = "#{bin2} -query #{thread} -db #{@working_dir}/#{@query_name} "
      cmd2 << " -out #{thread}.blast -evalue #{evalue} "
      cmd2 << " -outfmt \"6 std qlen slen\" "
      cmd2 << " -max_target_seqs 50 "
      cmd2 << " -num_threads 1"
      if !File.exists?("#{thread}.blast")
        blast2 = Cmd.new(cmd2)
        blast2.run
        if !blast2.status.success?
          raise RuntimeError.new("BLAST Error:\n#{blast2.stderr}")
        end
      end
      blasts << "#{thread}.blast"
    end
    cat_cmd = "cat "
    cat_cmd << blasts.join(" ")
    cat_cmd << " > #{@output2}"
    catting = Cmd.new(cat_cmd)
    catting.run
    if !catting.status.success?
      raise RuntimeError.new("Problem catting files:\n#{catting.stderr}")
    end
    files.each do |file|
      File.delete(file) if File.exist?(file)
    end
    blasts.each do |b|
      File.delete(b) # delete intermediate blast output files
    end
  end

end
run_blast_with_threads(evalue, threads, bin1, bin2) click to toggle source

Run BLAST using its own multithreading

@param [Float] evalue The evalue cutoff to use with BLAST @param [Integer] threads The number of threads to run @param [String] bin1 @param [String] bin2

# File lib/crb-blast/crb-blast.rb, line 187
def run_blast_with_threads evalue, threads, bin1, bin2
  # puts "running blast with #{threads} threads"
  cmd1 = "#{bin1} -query #{@query} -db #{@working_dir}/#{@target_name} "
  cmd1 << " -out #{@output1} -evalue #{evalue} "
  cmd1 << " -outfmt \"6 std qlen slen\" "
  cmd1 << " -max_target_seqs 50 "
  if bin1=="blastn"
    cmd1 << " -dust no "
  elsif bin1=~/blastx/ or bin1=~/blastp/ or bin1=~/tblastn/
    cmd1 << " -seg no "
  end
  cmd1 << " -num_threads #{threads}"

  cmd2 = "#{bin2} -query #{@target} -db #{@working_dir}/#{@query_name} "
  cmd2 << " -out #{@output2} -evalue #{evalue} "
  cmd2 << " -outfmt \"6 std qlen slen\" "
  cmd2 << " -max_target_seqs 50 "
  if bin2=="blastn"
    cmd2 << " -dust no "
  elsif bin1=="blastx" or bin1=="blastp" or bin1=="tblastn"
    cmd2 << " -seg no "
  end
  cmd2 << " -num_threads #{threads}"
  if !File.exist?("#{@output1}")
    blast1 = Cmd.new(cmd1)
    blast1.run
    if !blast1.status.success?
      raise RuntimeError.new("BLAST Error:\n#{blast1.stderr}")
    end
  end

  if !File.exist?("#{@output2}")
    blast2 = Cmd.new(cmd2)
    blast2.run
    if !blast2.status.success?
      raise RuntimeError.new("BLAST Error:\n#{blast2.stderr}")
    end
  end
end
size() click to toggle source
# File lib/crb-blast/crb-blast.rb, line 550
def size
  hits=0
  @reciprocals.each_pair do |key, list|
    list.each do |hit|
      hits += 1
    end
  end
  hits
end
split_input(filename, pieces) click to toggle source

Split a fasta file in pieces

@param [String] filename @param [Integer] pieces

# File lib/crb-blast/crb-blast.rb, line 318
def split_input filename, pieces
  input = {}
  name = nil
  seq=""
  sequences=0
  File.open(filename).each_line do |line|
    if line =~ /^>(.*)$/
      sequences+=1
      if name
        input[name]=seq
        seq=""
      end
      name = $1
    else
      seq << line.chomp
    end
  end
  input[name]=seq
  # construct list of output file handles
  outputs=[]
  output_files=[]
  pieces = [pieces, sequences].min
  pieces.times do |n|
    outfile = File.basename("#{filename}_chunk_#{n}.fasta")
    outfile = "#{@working_dir}/#{outfile}"
    outputs[n] = File.open("#{outfile}", "w")
    output_files[n] = "#{outfile}"
  end
  # write sequences
  count=0
  input.each_pair do |name, seq|
    outputs[count].write(">#{name}\n")
    outputs[count].write("#{seq}\n")
    count += 1
    count %= pieces
  end
  outputs.each do |out|
    out.close
  end
  output_files
end
tidy_up() click to toggle source

delete blast database files

# File lib/crb-blast/crb-blast.rb, line 527
def tidy_up
  Dir["*nin"].each do |file|
    File.delete(file)
  end
  Dir["*nhr"].each do |file|
    File.delete(file)
  end
  Dir["*nsq"].each do |file|
    File.delete(file)
  end
  Dir["*blast"].each do |file|
    File.delete(file)
  end
end
write_output() click to toggle source
# File lib/crb-blast/crb-blast.rb, line 560
def write_output
  s=""
  unless @reciprocals.nil?
  s << "query\ttarget\tid\talnlen\tevalue\tbitscore\t"
  s << "qstart..qend\ttstart..tend\tqlen\ttlen\n"
    @reciprocals.each_pair do |query_id, hits|
      hits.each do |hit|
        s << "#{hit}\n"
      end
    end
    File.open(@reciprocal_hits, "w") { |f| f.write s }
  end
end