class BioDSL::Uclust

Run uclust on sequences in the stream.

This is a wrapper for the usearch tool to run the program uclust. Basically sequence type records are clustered de-novo and records containing sequence and cluster information is output. If the align option is given the sequnces will be aligned.

Please refer to the manual:

www.drive5.com/usearch/manual/cmd_cluster_smallmem.html

Usearch 7.0 must be installed for usearch to work. Read more here:

www.drive5.com/usearch/

Usage

uclust(<identity: float>, <strand: "plus|both">[, align: <bool>
       [, cpus: <uint>]])

Options

Examples

rubocop: disable ClassLength

Constants

STATS

Public Class Methods

new(options) click to toggle source

Constructor for Uclust.

@param options [Hash] Options hash. @option options [Float] :identity @option options [String,Symbol] :strand @option options [Boolean] :align @option options [Integer] :cpus

@return [Uclust] Class instance.

# File lib/BioDSL/commands/uclust.rb, line 78
def initialize(options)
  @options = options
  @options[:cpus] ||= 1

  aux_exist('usearch')
  check_options
end

Public Instance Methods

lmb() click to toggle source

Return command lambda for uclust.

@return [Proc] Command lambda.

# File lib/BioDSL/commands/uclust.rb, line 89
def lmb
  lambda do |input, output, status|
    status_init(status, STATS)

    TmpDir.create('rec', 'in', 'out') do |tmp_rec, tmp_in, tmp_out|
      process_input(input, output, tmp_rec, tmp_in)

      run_uclust(tmp_in, tmp_out)

      if @options[:align]
        process_output_align(output, tmp_out)
      else
        process_output(output, tmp_rec, tmp_out)
      end
    end
  end
end

Private Instance Methods

check_options() click to toggle source

Check options.

# File lib/BioDSL/commands/uclust.rb, line 110
def check_options
  options_allowed(@options, :identity, :strand, :align, :cpus)
  options_required(@options, :identity, :strand)
  options_allowed_values(@options, strand: ['plus', 'both', :plus, :both])
  options_allowed_values(@options, align:  [nil, false, true])
  options_assert(@options, ':identity >  0.0')
  options_assert(@options, ':identity <= 1.0')
  options_assert(@options, ':cpus >= 1')
  options_assert(@options, ":cpus <= #{BioDSL::Config::CORES_MAX}")
end
output_entry(ios, record, i) click to toggle source

Save a BioDSL record to a FASTA file.

@param ios [Fasta::IO] Output stream to a FASTA file @param record [Hash] BioDSL record. @param i [Integer] Record index.

# File lib/BioDSL/commands/uclust.rb, line 166
def output_entry(ios, record, i)
  @status[:sequences_in] += 1

  record[:SEQ_NAME] ||= i.to_s

  entry = BioDSL::Seq.new(seq_name: record[:SEQ_NAME], seq: record[:SEQ])

  @status[:residues_in] += entry.length

  ios.puts entry.to_fasta
end
parse_output(tmp_out) click to toggle source

Parse uclust output file and return a hash with Q_ID as key and the uclust record as value.

@param tmp_out [String] Path to output file.

@return [Hash] Q_ID as keys and Uclust records.

# File lib/BioDSL/commands/uclust.rb, line 206
def parse_output(tmp_out)
  results = {}

  BioDSL::Usearch.open(tmp_out) do |ios|
    ios.each(:uc) do |record|
      record[:RECORD_TYPE] = 'uclust'

      results[record[:Q_ID]] = record
    end
  end

  results
end
process_input(input, output, tmp_rec, tmp_in) click to toggle source

Process input data and serialize all records into a temporary file and all records containing sequence to a temporary FASTA file.

@param input [Enumerator] Input stream @param output [Enumerator::Yeilder] Output stream. @param tmp_rec [String] Path to serialized records file. @param tmp_in [String] Path to input file.

# File lib/BioDSL/commands/uclust.rb, line 128
def process_input(input, output, tmp_rec, tmp_in)
  File.open(tmp_rec, 'wb') do |ios_rec|
    BioDSL::Serializer.new(ios_rec) do |s|
      BioDSL::Fasta.open(tmp_in, 'w') do |ios|
        process_input_records(input, output, ios, s)
      end
    end
  end
end
process_input_records(input, output, ios, serializer) click to toggle source

Iterate over records in the input stream and serialize all records. Also, records with sequence are saved in a FASTA file or emitted to the output stream if the record contains no sequence.

@param input [Enumerator] Input stream @param output [Enumerator::Yeilder] Output stream. @param ios [Fasta::IO] Output stream to a FASTA file @param serializer [BioDSL::Serializer] Serializer IO.

# File lib/BioDSL/commands/uclust.rb, line 146
def process_input_records(input, output, ios, serializer)
  input.each_with_index do |record, i|
    @status[:records_in] += 1

    if record[:SEQ]
      output_entry(ios, record, i)
    else
      @status[:records_out] += 1
      output << record
    end

    serializer << record
  end
end
process_output(output, tmp_rec, tmp_out) click to toggle source

Parse results form uclust and merge with serialized data and output to the output stream.

@param output [Enumerator::Yeilder] Output stream. @param tmp_rec [String] Path to serialized records file. @param tmp_out [String] Path to uclust output file.

# File lib/BioDSL/commands/uclust.rb, line 249
def process_output(output, tmp_rec, tmp_out)
  results = parse_output(tmp_out)

  File.open(tmp_rec, 'rb') do |ios_rec|
    BioDSL::Serializer.new(ios_rec) do |s|
      process_output_serial(s, results, output)
    end
  end
end
process_output_align(output, tmp_out) click to toggle source

Parse MSA alignment data from uclust output file and emit to the output stream.

@param output [Enumerator::Yeilder] Output stream. @param tmp_out [String] Path to uclust output file.

# File lib/BioDSL/commands/uclust.rb, line 225
def process_output_align(output, tmp_out)
  BioDSL::Fasta.open(tmp_out) do |ios|
    ios.each do |entry|
      if entry.seq_name == 'consensus'
        @status[:clusters_out] += 1
      else
        record = {RECORD_TYPE: 'uclust', CLUSTER: @status[:clusters_out]}
        record.merge!(entry.to_bp)

        output << record
        @status[:records_out] += 1
        @status[:sequences_out] += 1
        @status[:residues_out] += entry.length
      end
    end
  end
end
process_output_serial(serializer, results, output) click to toggle source

Deserialize records from temporary file, merge these with cluster data and emit to the output stream.

@param serializer [BioDSL::Serializer]

Serializer IO.

@param results [Hash]

Results from uclust with Q_ID as key and uclust record as value

@param output [Enumerator::Yeilder]

Output stream.
# File lib/BioDSL/commands/uclust.rb, line 270
def process_output_serial(serializer, results, output)
  serializer.each do |record|
    next unless record[:SEQ_NAME]

    if (r = results[record[:SEQ_NAME]])
      output << record.merge(r)
      @status[:records_out] += 1
      @status[:sequences_out] += 1
      @status[:residues_out] += record[:SEQ].length
    else
      fail BioDSL::UsearchError, 'Sequence name: ' \
        "#{record[:SEQ_NAME]} not found in uclust results"
    end
  end
end
run_uclust(tmp_in, tmp_out) click to toggle source

Run the uclust command.

@param tmp_in [String] Path to input file. @param tmp_out [String] Path to output file.

@raise [BioDSL::UsearchError] if command fails.

# File lib/BioDSL/commands/uclust.rb, line 184
def run_uclust(tmp_in, tmp_out)
  uclust_opts = {
    input:    tmp_in,
    output:   tmp_out,
    strand:   @options[:strand],
    identity: @options[:identity],
    align:    @options[:align],
    cpus:     @options[:cpus],
    verbose:  @options[:verbose]
  }

  BioDSL::Usearch.cluster_smallmem(uclust_opts)
rescue BioDSL::UsearchError => e
  raise unless e.message =~ /Empty input file/
end