class BioDSL::AssembleSeqIdba

Assemble sequences in the stream using IDBA_UD.

assemble_seq_idba is a wrapper around the prokaryotic metagenome assembler IDBA_UD:

i.cs.hku.hk/~alse/hkubrg/projects/idba_ud/

Any records containing sequence information will be included in the assembly, but only the assembled contig sequences will be output to the stream.

The sequence records may contain quality scores, and if the sequence names indicate that the sequences are interleaved, paired-end assembly will be performed.

Usage

assemble_seq_idba([kmer_min: <uint>[, kmer_max: <uint>[, cpus: <uint>]]])

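Options

kmer_min: <uint> - Minimum kmer value (default: 24).
kmer_max: <uint> - Maximum kmer value (default: 48).
cpus: <uint> - Number of CPUs to use (default: 1).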

Examples

If you have two paired-end sequence files with Illumina data, you can assemble them using assemble_seq_idba like this:

BD.new.
read_fastq(input: "file1.fq", input2: "file2.fq").
assemble_seq_idba.
write_fasta(output: "contigs.fna").
run

Constants

STATS

Public Class Methods

new(options) click to toggle source

Constructor for the AssembleSeqIdba class.

@param [Hash] options Options hash. @option options [Integer] :kmer_min Minimum kmer value. @option options [Integer] :kmer_max Maximum kmer value. @option options [Integer] :cpus CPUs to use.

@return [AssembleSeqIdba] Returns an instance of the class.

# File lib/BioDSL/commands/assemble_seq_idba.rb, line 81
# Constructor for the AssembleSeqIdba class.
#
# Stores the caller's options and prepares the list that later collects the
# lengths of the assembled contigs. The options are checked as supplied by
# the caller, before any defaults are filled in.
#
# @param options [Hash] Options hash.
# @option options [Integer] :kmer_min Minimum kmer value.
# @option options [Integer] :kmer_max Maximum kmer value.
# @option options [Integer] :cpus     CPUs to use.
#
# @return [AssembleSeqIdba] Returns an instance of the class.
def initialize(options)
  @lengths = []
  @options = options

  # NOTE(review): aux_exist is defined elsewhere; presumably it verifies
  # that the external idba_ud program is available - confirm.
  aux_exist('idba_ud')
  check_options
  defaults
end

Public Instance Methods

lmb() click to toggle source

Return a lambda for the AssembleSeqIdba command.

@return [Proc] Returns the command lambda.

# File lib/BioDSL/commands/assemble_seq_idba.rb, line 93
# Return a lambda for the AssembleSeqIdba command.
#
# When called, the lambda initializes the status, writes the sequence records
# of the input to a temporary FASTA file, runs IDBA on that file, emits the
# resulting contigs to the output and finally adds the contig statistics
# (max, min and N50) to the status.
#
# @return [Proc] Returns the command lambda.
def lmb
  ->(input, output, status) do
    status_init(status, STATS)

    # TmpDir.create yields one path per requested file name followed by the
    # temporary directory itself.
    TmpDir.create('reads.fna', 'contig.fa') do |reads_path, contigs_path, work_dir|
      process_input(input, output, reads_path)
      execute_idba(reads_path, work_dir)
      process_output(output, contigs_path)
    end

    calc_n50(status)
  end
end

Private Instance Methods

calc_n50(status) click to toggle source

Calculate the n50 and add to the status.

See https://en.wikipedia.org/wiki/N50_statistic

@param status [Hash] Status hash.

# File lib/BioDSL/commands/assemble_seq_idba.rb, line 208
# Calculate the contig statistics and add them to the status.
#
# Sets :contig_max and :contig_min to the longest and shortest collected
# contig length (0 when no contigs were collected) and :contig_n50 to the
# length of the contig at which the running total of lengths, taken from
# longest to shortest, first reaches half of status[:residues_out].
#
# @param status [Hash] Status hash.
def calc_n50(status)
  # Longest contig first; sorted in place.
  @lengths.sort! { |a, b| b <=> a }

  status[:contig_max] = @lengths.first || 0
  status[:contig_min] = @lengths.last || 0
  status[:contig_n50] = 0

  running_total = 0

  @lengths.each do |contig_length|
    running_total += contig_length
    next if running_total < status[:residues_out] * 0.50

    status[:contig_n50] = contig_length
    break
  end
end
check_options() click to toggle source

Check the options.

# File lib/BioDSL/commands/assemble_seq_idba.rb, line 110
# Check the options.
#
# Only :kmer_min, :kmer_max and :cpus are passed as allowed keys, and each
# range expression below is handed to options_assert in order.
def check_options
  options_allowed(@options, :kmer_min, :kmer_max, :cpus)

  [
    ':kmer_min >= 16',
    ':kmer_min <= 256',
    ':kmer_max >= 16',
    ':kmer_max <= 512',
    ':cpus >= 1',
    ":cpus <= #{BioDSL::Config::CORES_MAX}"
  ].each { |expression| options_assert(@options, expression) }
end
compile_cmd_line(fa_in, tmp_dir) click to toggle source

Compile the command and options for executing IDBA.

@param fa_in [String] Path to input FASTA file. @param tmp_dir [String] Temporary directory path.

@return [String] The command line for the IDBA system call.

# File lib/BioDSL/commands/assemble_seq_idba.rb, line 173
# Compile the command and options for executing IDBA.
#
# @param fa_in   [String] Path to input FASTA file.
# @param tmp_dir [String] Temporary directory path.
#
# @return [String] The command line for the IDBA system call.
def compile_cmd_line(fa_in, tmp_dir)
  parts = [
    'idba_ud',
    "--read #{fa_in}",
    "--out #{tmp_dir}",
    "--mink #{@options[:kmer_min]}",
    "--maxk #{@options[:kmer_max]}",
    "--num_threads #{@options[:cpus]}"
  ]

  # Discard the program's output unless running verbosely.
  parts.push('> /dev/null 2>&1') unless BioDSL.verbose

  parts.join(' ')
end
defaults() click to toggle source

Set the default option values.

# File lib/BioDSL/commands/assemble_seq_idba.rb, line 121
# Set the default option values.
#
# An option keeps the caller's value when one was given; otherwise it falls
# back to the value listed here.
def defaults
  { kmer_min: 24, kmer_max: 48, cpus: 1 }.each do |option, fallback|
    @options[option] ||= fallback
  end
end
execute_idba(fa_in, tmp_dir) click to toggle source

Execute IDBA.

@param fa_in [String] Path to input FASTA file. @param tmp_dir [String] Temporary directory path.

@raise If execution fails.

# File lib/BioDSL/commands/assemble_seq_idba.rb, line 159
# Execute IDBA.
#
# The command line is echoed to stderr first when running verbosely.
#
# @param fa_in   [String] Path to input FASTA file.
# @param tmp_dir [String] Temporary directory path.
#
# @raise [RuntimeError] With the command line as message if execution fails.
def execute_idba(fa_in, tmp_dir)
  command = compile_cmd_line(fa_in, tmp_dir)

  $stderr.puts "Running: #{command}" if BioDSL.verbose

  system(command)
  return if $CHILD_STATUS.success?

  raise command
end
process_input(input, output, fa_in) click to toggle source

Read all records from input and emit non-sequence records to the output stream. Sequence records are saved to a temporary file.

@param input [Enumerator] input stream. @param output [Enumerator::Yielder] Output stream. @param fa_in [String] Path to temporary FASTA file.

# File lib/BioDSL/commands/assemble_seq_idba.rb, line 133
# Read all records from input and emit non-sequence records to the output
# stream. Sequence records (those with a :SEQ key) are saved as FASTA entries
# to a temporary file.
#
# @param input  [Enumerator]          Input stream.
# @param output [Enumerator::Yielder] Output stream.
# @param fa_in  [String]              Path to temporary FASTA file.
def process_input(input, output, fa_in)
  BioDSL::Fasta.open(fa_in, 'w') do |fasta_io|
    input.each do |record|
      @status[:records_in] += 1

      if record.key? :SEQ
        entry = BioDSL::Seq.new_bp(record)

        @status[:sequences_in] += 1
        @status[:residues_in]  += entry.length

        fasta_io.puts entry.to_fasta
      else
        @status[:records_out] += 1

        # Emit with << (the same call process_output uses): the documented
        # output type, Enumerator::Yielder, has no public #puts, so
        # output.puts would raise NoMethodError for pass-through records.
        output << record
      end
    end
  end
end
process_output(output, fa_out) click to toggle source

Read the IDBA assembled contigs and output to the stream.

@param output [Enumerator::Yielder] Output stream. @param fa_out [String] Path to contig FASTA file.

# File lib/BioDSL/commands/assemble_seq_idba.rb, line 190
# Read the IDBA assembled contigs and output to the stream.
#
# Every contig is emitted as a record, counted in the status, and its length
# is remembered in @lengths for the contig statistics.
#
# @param output [Enumerator::Yielder] Output stream.
# @param fa_out [String]              Path to contig FASTA file.
def process_output(output, fa_out)
  BioDSL::Fasta.open(fa_out, 'r') do |contigs|
    contigs.each do |contig|
      output << contig.to_bp

      contig_length = contig.length

      @status[:records_out]   += 1
      @status[:sequences_out] += 1
      @status[:residues_out]  += contig_length

      @lengths.push(contig_length)
    end
  end
end