class BioDSL::ClusterOtus

Create OTUs from sequences in the stream.

Use the usearch program cluster_otus to cluster sequences in the stream and output a representative sequence from each cluster. Sequences must be dereplicated and sorted according to SEQ_COUNT in decreasing order.

Please refer to the manual:

drive5.com/usearch/manual/cluster_otus.html

Usearch 7.0 must be installed for usearch to work. Read more here:

www.drive5.com/usearch/

Usage

cluster_otus([identity: <float>])

Options

* identity: <float> - OTU cluster identity between 0.0 and 1.0
                      (Default 0.97).

Examples

To create OTU clusters do:

BD.new.
read_fasta(input: "in.fna").
dereplicate_seq.
sort(key: :SEQ_COUNT, reverse: true).
cluster_otus.
run

Constants

STATS

Public Class Methods

new(options) click to toggle source

Constructor for ClusterOtu.

@param options [Hash] Options hash. @option options [Float] :identity Cluster identity.

@return [ClusterOtu] Instance of ClusterOtu.

# File lib/BioDSL/commands/cluster_otus.rb, line 76
def initialize(options)
  @options = options

  aux_exist('usearch')
  check_options
  defaults
end

Public Instance Methods

lmb() click to toggle source
# File lib/BioDSL/commands/cluster_otus.rb, line 84
def lmb
  lambda do |input, output, status|
    status_init(status, STATS)

    TmpDir.create('tmp.fa', 'tmp.uc') do |tmp_in, tmp_out|
      process_input(input, output, tmp_in)

      BioDSL::Usearch.cluster_otus(input: tmp_in, output: tmp_out,
                                   identity: @options[:identity],
                                   verbose: @options[:verbose])

      process_output(output, tmp_out)
    end
  end
end

Private Instance Methods

check_options() click to toggle source

Check options.

# File lib/BioDSL/commands/cluster_otus.rb, line 103
def check_options
  options_allowed(@options, :identity)
  options_assert(@options, ':identity >= 0.0')
  options_assert(@options, ':identity <= 1.0')
end
defaults() click to toggle source

Set default options.

# File lib/BioDSL/commands/cluster_otus.rb, line 110
def defaults
  @options[:identity] ||= 0.97
end
process_input(input, output, tmp_in) click to toggle source

Process input records and save sequence data to a temporary FASTA file for use with +usearch cluster_otus+.

@param input [Enumerator] Input stream. @param output [Enumerator::Yielder] Output stream. @param tmp_in [String] Path to temporary FASTA file.

# File lib/BioDSL/commands/cluster_otus.rb, line 120
def process_input(input, output, tmp_in)
  BioDSL::Fasta.open(tmp_in, 'w') do |ios|
    input.each_with_index do |record, i|
      @status[:records_in] += 1

      if record.key? :SEQ
        @status[:sequences_in] += 1
        @status[:residues_in] += record[:SEQ].length
        ios.puts record2entry(record, i).to_fasta
      else
        output << record
        @status[:records_out] += 1
      end
    end
  end
end
process_output(output, tmp_out) click to toggle source

Process the cluster output and emit otus to the output stream.

@param output [Enumerator::Yielder] Output stream. @param tmp_out [String] Path to temporary OTU file.

@raise [UsearchError] if size info is missing from SEQ_NAME.

# File lib/BioDSL/commands/cluster_otus.rb, line 160
def process_output(output, tmp_out)
  BioDSL::Fasta.open(tmp_out) do |ios|
    ios.each do |entry|
      record = entry.to_bp

      if record[:SEQ_NAME] =~ /;size=(\d+)$/
        record[:SEQ_COUNT] = Regexp.last_match(1).to_i
        record[:SEQ_NAME].sub!(/;size=\d+$/, '')
      else
        fail BioDSL::UsearchError, 'Missing size in SEQ_NAME: ' \
          "#{record[:SEQ_NAME]}"
      end

      output << record
      @status[:sequences_out] += 1
      @status[:residues_out] += record[:SEQ].length
      @status[:records_out] += 1
    end
  end
end
record2entry(record, i) click to toggle source

Create a Sequence entry from a record using the record index as sequence name if no such is found.

@param record [Hash] BioDSL record. @param i [Integer] Record index

# File lib/BioDSL/commands/cluster_otus.rb, line 142
def record2entry(record, i)
  seq_name = record[:SEQ_NAME] || i.to_s

  if record.key? :SEQ_COUNT
    seq_name << ";size=#{record[:SEQ_COUNT]}"
  else
    fail BioDSL::SeqError, 'Missing SEQ_COUNT'
  end

  BioDSL::Seq.new(seq_name: seq_name, seq: record[:SEQ])
end